Revert "Extract high quality images from PDF only when they are needed for OCR."
This reverts commit 5208eafb23.
Reason for revert: LUCI Bisection identified this CL as the culprit of a build failure. See the analysis: https://luci-bisection.appspot.com/analysis/b/8773075837739418257 Sample failed build: https://ci.chromium.org/b/8773075837739418257 If this is a false positive, please report it at https://bugs.chromium.org/p/chromium/issues/entry?comment=Analysis%3A+https%3A%2F%2Fluci-bisection.appspot.com%2Fanalysis%2Fb%2F8773075837739418257&components=Tools%3ETest%3EFindit&labels=LUCI-Bisection-Wrong%2CPri-3%2CType-Bug&status=Available&summary=Wrongly+blamed+https%3A%2F%2Fchromium-review.googlesource.com%2Fc%2Fchromium%2Fsrc%2F%2B%2F4754282 Original change's description: > Extract high quality images from PDF only when they are needed for OCR. > > 32-bit bitmaps with highest available quality are extracted from PDF > files when they are loaded, so that they would be sent later to OCR > service. To avoid the memory overhead of this process, this CL > postpones image extraction to the time they are sent to OCR service, > and destroys the extracted images immediately after that. > > AX-Relnotes: n/a > Bug: 1471392
> Change-Id: Id337edf693d8d4a4ddd1a56d814a0d1f0e1ac5e4 > Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4754282 > Auto-Submit: Ramin Halavati <rhalavati@chromium.org> > Reviewed-by: Kyungjun Lee <kyungjunlee@google.com> > Commit-Queue: Ramin Halavati <rhalavati@chromium.org> > Commit-Queue: Lei Zhang <thestig@chromium.org> > Reviewed-by: Lei Zhang <thestig@chromium.org> > Cr-Commit-Position: refs/heads/main@{#1182463} > Bug: 1471392
Change-Id: If5d194c454e0cced5a2b7f4041973870b427e5cf No-Presubmit: true No-Tree-Checks: true No-Try: true Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4772383 Owners-Override: luci-bisection@appspot.gserviceaccount.com <luci-bisection@appspot.gserviceaccount.com> Commit-Queue: luci-bisection@appspot.gserviceaccount.com <luci-bisection@appspot.gserviceaccount.com> Bot-Commit: luci-bisection@appspot.gserviceaccount.com <luci-bisection@appspot.gserviceaccount.com> Cr-Commit-Position: refs/heads/main@{#1182468}
This commit is contained in:

committed by
Chromium LUCI CQ

parent
f95b9ba3f8
commit
edf0d3d1ed
@ -12,7 +12,6 @@ include_rules = [
|
||||
"+pdf/mojom/pdf.mojom.h",
|
||||
"+pdf/pdf_accessibility_action_handler.h",
|
||||
"+pdf/pdf_accessibility_data_handler.h",
|
||||
"+pdf/pdf_accessibility_image_fetcher.h",
|
||||
"+pdf/pdf_features.h",
|
||||
"+pdf/pdf_view_web_plugin.h",
|
||||
"+printing/buildflags/buildflags.h",
|
||||
|
@ -25,7 +25,6 @@
|
||||
#include "content/public/renderer/render_frame.h"
|
||||
#include "content/public/renderer/render_thread.h"
|
||||
#include "pdf/pdf_accessibility_action_handler.h"
|
||||
#include "pdf/pdf_accessibility_image_fetcher.h"
|
||||
#include "pdf/pdf_features.h"
|
||||
#include "third_party/blink/public/strings/grit/blink_accessibility_strings.h"
|
||||
#include "ui/accessibility/ax_enums.mojom.h"
|
||||
@ -58,12 +57,10 @@ using PdfOcrRequest = PdfAccessibilityTree::PdfOcrRequest;
|
||||
|
||||
PdfOcrRequest::PdfOcrRequest(const ui::AXNodeID& image_node_id,
|
||||
const chrome_pdf::AccessibilityImageInfo& image,
|
||||
const ui::AXNodeID& parent_node_id,
|
||||
uint32_t page_index)
|
||||
const ui::AXNodeID& parent_node_id)
|
||||
: image_node_id(image_node_id),
|
||||
image(image),
|
||||
parent_node_id(parent_node_id),
|
||||
page_index(page_index) {}
|
||||
parent_node_id(parent_node_id) {}
|
||||
|
||||
//
|
||||
// PdfOcrService
|
||||
@ -71,13 +68,10 @@ PdfOcrRequest::PdfOcrRequest(const ui::AXNodeID& image_node_id,
|
||||
|
||||
using PdfOcrService = PdfAccessibilityTree::PdfOcrService;
|
||||
|
||||
PdfOcrService::PdfOcrService(
|
||||
chrome_pdf::PdfAccessibilityImageFetcher* image_fetcher,
|
||||
content::RenderFrame& render_frame,
|
||||
uint32_t page_count,
|
||||
OnOcrDataReceivedCallback callback)
|
||||
: image_fetcher_(image_fetcher),
|
||||
remaining_page_count_(page_count),
|
||||
PdfOcrService::PdfOcrService(content::RenderFrame& render_frame,
|
||||
uint32_t page_count,
|
||||
OnOcrDataReceivedCallback callback)
|
||||
: remaining_page_count_(page_count),
|
||||
on_ocr_data_received_callback_(std::move(callback)) {
|
||||
CHECK(features::IsPdfOcrEnabled());
|
||||
render_frame.GetBrowserInterfaceBroker()->GetInterface(
|
||||
@ -130,21 +124,12 @@ void PdfOcrService::OcrNextImage() {
|
||||
if (all_requests_.empty()) {
|
||||
return;
|
||||
}
|
||||
PdfOcrRequest request = all_requests_.front();
|
||||
const PdfOcrRequest request = all_requests_.front();
|
||||
all_requests_.pop();
|
||||
|
||||
SkBitmap bitmap = image_fetcher_->GetImageForOcr(
|
||||
request.page_index, request.image.page_object_index);
|
||||
request.image_pixel_size = gfx::SizeF(bitmap.width(), bitmap.height());
|
||||
if (bitmap.drawsNothing()) {
|
||||
ReceiveOcrResultsForImage(std::move(request), ui::AXTreeUpdate());
|
||||
return;
|
||||
}
|
||||
|
||||
screen_ai_annotator_->PerformOcrAndReturnAXTreeUpdate(
|
||||
std::move(bitmap),
|
||||
request.image.image_data,
|
||||
base::BindOnce(&PdfOcrService::ReceiveOcrResultsForImage,
|
||||
weak_ptr_factory_.GetWeakPtr(), std::move(request)));
|
||||
weak_ptr_factory_.GetWeakPtr(), request));
|
||||
|
||||
base::UmaHistogramEnumeration("Accessibility.PdfOcr.PDFImages",
|
||||
PdfOcrRequestStatus::kRequested);
|
||||
@ -521,11 +506,14 @@ std::unique_ptr<ui::AXNodeData> CreateStatusNodeWrapper(
|
||||
return node_wrapper;
|
||||
}
|
||||
|
||||
gfx::Transform MakeTransformForImage(const gfx::RectF image_screen_size,
|
||||
const gfx::SizeF image_pixel_size) {
|
||||
gfx::Transform MakeTransformForImage(
|
||||
const chrome_pdf::AccessibilityImageInfo& image) {
|
||||
// Nodes created with OCR results from the image will be misaligned on screen
|
||||
// if `image_screen_size` is different from `image_pixel_size`. To address
|
||||
// this misalignment issue, an additional transform needs to be created.
|
||||
const gfx::RectF& image_screen_size = image.bounds;
|
||||
const gfx::RectF image_pixel_size =
|
||||
gfx::RectF(image.image_data.width(), image.image_data.height());
|
||||
CHECK(!image_pixel_size.IsEmpty());
|
||||
|
||||
gfx::Transform transform;
|
||||
@ -1343,9 +1331,9 @@ class PdfAccessibilityTreeBuilder {
|
||||
ui::AXNodeData* image_node = CreateImageNode(images_[i]);
|
||||
para_node->child_ids.push_back(image_node->id);
|
||||
#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
|
||||
if (!has_accessible_text_ && ocr_available) {
|
||||
ocr_requests.emplace(image_node->id, images_[i], para_node->id,
|
||||
page_index_);
|
||||
if (!has_accessible_text_ && ocr_available &&
|
||||
!images_[i].image_data.drawsNothing()) {
|
||||
ocr_requests.emplace(image_node->id, images_[i], para_node->id);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -1421,15 +1409,12 @@ class PdfAccessibilityTreeBuilder {
|
||||
|
||||
PdfAccessibilityTree::PdfAccessibilityTree(
|
||||
content::RenderFrame* render_frame,
|
||||
chrome_pdf::PdfAccessibilityActionHandler* action_handler,
|
||||
chrome_pdf::PdfAccessibilityImageFetcher* image_fetcher)
|
||||
chrome_pdf::PdfAccessibilityActionHandler* action_handler)
|
||||
: content::RenderFrameObserver(render_frame),
|
||||
render_frame_(render_frame),
|
||||
action_handler_(action_handler),
|
||||
image_fetcher_(image_fetcher) {
|
||||
action_handler_(action_handler) {
|
||||
DCHECK(render_frame);
|
||||
DCHECK(action_handler_);
|
||||
DCHECK(image_fetcher_);
|
||||
MaybeHandleAccessibilityChange(/*always_load_or_reload_accessibility=*/false);
|
||||
}
|
||||
|
||||
@ -2122,17 +2107,14 @@ void PdfAccessibilityTree::OnOcrDataReceived(
|
||||
// would be more convenient and less complex if an `ui::AXTree` was never
|
||||
// constructed and if the `ui::AXTreeSource` was able to use the collection
|
||||
// of `nodes_` directly.
|
||||
base::UmaHistogramEnumeration("Accessibility.PdfOcr.PDFImages",
|
||||
PdfOcrRequestStatus::kPerformed);
|
||||
|
||||
if (tree_update.nodes.empty()) {
|
||||
VLOG(1) << "Empty OCR data received.";
|
||||
// TODO(crbug.com/1471392): Create an empty update and continue. This can
|
||||
// happen if OCR returns an empty result, or the image draws nothing.
|
||||
return;
|
||||
}
|
||||
|
||||
base::UmaHistogramEnumeration("Accessibility.PdfOcr.PDFImages",
|
||||
PdfOcrRequestStatus::kPerformed);
|
||||
|
||||
// Update the flag if OCR extracted text from any images. This flag will be
|
||||
// used to update the status node to notify users of it.
|
||||
was_text_converted_from_image_ = true;
|
||||
@ -2163,8 +2145,7 @@ void PdfAccessibilityTree::OnOcrDataReceived(
|
||||
// transform, nodes created from OCR results will have misaligned bounding
|
||||
// boxes. This transform will be applied to all nodes from OCR results
|
||||
// below.
|
||||
gfx::Transform transform = MakeTransformForImage(
|
||||
ocr_request.image.bounds, ocr_request.image_pixel_size);
|
||||
gfx::Transform transform = MakeTransformForImage(ocr_request.image);
|
||||
|
||||
// Count each detected language and find out the most detected language in
|
||||
// OCR result. Then record the most detected language in UMA.
|
||||
@ -2250,7 +2231,7 @@ void PdfAccessibilityTree::OnOcrDataReceived(
|
||||
void PdfAccessibilityTree::CreateOcrService() {
|
||||
VLOG(2) << "Creating OCR service.";
|
||||
ocr_service_ = std::make_unique<PdfOcrService>(
|
||||
image_fetcher_, *render_frame_, page_count_,
|
||||
*render_frame_, page_count_,
|
||||
base::BindRepeating(&PdfAccessibilityTree::OnOcrDataReceived,
|
||||
weak_ptr_factory_.GetWeakPtr()));
|
||||
}
|
||||
|
@ -35,7 +35,6 @@
|
||||
namespace chrome_pdf {
|
||||
|
||||
class PdfAccessibilityActionHandler;
|
||||
class PdfAccessibilityImageFetcher;
|
||||
|
||||
} // namespace chrome_pdf
|
||||
|
||||
@ -73,19 +72,14 @@ class PdfAccessibilityTree : public content::PluginAXTreeSource,
|
||||
struct PdfOcrRequest {
|
||||
PdfOcrRequest(const ui::AXNodeID& image_node_id,
|
||||
const chrome_pdf::AccessibilityImageInfo& image,
|
||||
const ui::AXNodeID& parent_node_id,
|
||||
uint32_t page_index);
|
||||
const ui::AXNodeID& parent_node_id);
|
||||
|
||||
const ui::AXNodeID image_node_id;
|
||||
const chrome_pdf::AccessibilityImageInfo image;
|
||||
const ui::AXNodeID parent_node_id;
|
||||
const uint32_t page_index;
|
||||
// This boolean indicates which request corresponds to the last image on
|
||||
// each page.
|
||||
bool is_last_on_page = false;
|
||||
|
||||
// This field is set after the image is extracted from PDF.
|
||||
gfx::SizeF image_pixel_size;
|
||||
};
|
||||
|
||||
// Manages the connection to the OCR Service via Mojo, and ensures that
|
||||
@ -96,8 +90,7 @@ class PdfAccessibilityTree : public content::PluginAXTreeSource,
|
||||
std::vector<PdfOcrRequest> ocr_requests,
|
||||
std::vector<ui::AXTreeUpdate> tree_updates)>;
|
||||
|
||||
PdfOcrService(chrome_pdf::PdfAccessibilityImageFetcher* image_fetcher,
|
||||
content::RenderFrame& render_frame,
|
||||
PdfOcrService(content::RenderFrame& render_frame,
|
||||
uint32_t page_count,
|
||||
OnOcrDataReceivedCallback callback);
|
||||
|
||||
@ -131,9 +124,6 @@ class PdfAccessibilityTree : public content::PluginAXTreeSource,
|
||||
void ReceiveOcrResultsForImage(PdfOcrRequest request,
|
||||
const ui::AXTreeUpdate& tree_update);
|
||||
|
||||
// `image_fetcher_` owns `this`.
|
||||
chrome_pdf::PdfAccessibilityImageFetcher* const image_fetcher_;
|
||||
|
||||
uint32_t remaining_page_count_;
|
||||
// True if there are pending OCR requests. Used to determine if `OcrPage`
|
||||
// should call `OcrNextImage` or if the next call to
|
||||
@ -159,8 +149,7 @@ class PdfAccessibilityTree : public content::PluginAXTreeSource,
|
||||
|
||||
PdfAccessibilityTree(
|
||||
content::RenderFrame* render_frame,
|
||||
chrome_pdf::PdfAccessibilityActionHandler* action_handler,
|
||||
chrome_pdf::PdfAccessibilityImageFetcher* image_fetcher);
|
||||
chrome_pdf::PdfAccessibilityActionHandler* action_handler);
|
||||
~PdfAccessibilityTree() override;
|
||||
|
||||
static bool IsDataFromPluginValid(
|
||||
@ -316,7 +305,6 @@ class PdfAccessibilityTree : public content::PluginAXTreeSource,
|
||||
|
||||
// Unowned. Must outlive `this`.
|
||||
chrome_pdf::PdfAccessibilityActionHandler* const action_handler_;
|
||||
chrome_pdf::PdfAccessibilityImageFetcher* const image_fetcher_;
|
||||
|
||||
// `zoom_` signifies the zoom level set in for the browser content.
|
||||
// `scale_` signifies the scale level set by user. Scale is applied
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -255,10 +255,8 @@ void PdfViewWebPluginClient::RecordComputedAction(const std::string& action) {
|
||||
|
||||
std::unique_ptr<chrome_pdf::PdfAccessibilityDataHandler>
|
||||
PdfViewWebPluginClient::CreateAccessibilityDataHandler(
|
||||
chrome_pdf::PdfAccessibilityActionHandler* action_handler,
|
||||
chrome_pdf::PdfAccessibilityImageFetcher* image_fetcher) {
|
||||
return std::make_unique<PdfAccessibilityTree>(render_frame_, action_handler,
|
||||
image_fetcher);
|
||||
chrome_pdf::PdfAccessibilityActionHandler* action_handler) {
|
||||
return std::make_unique<PdfAccessibilityTree>(render_frame_, action_handler);
|
||||
}
|
||||
|
||||
} // namespace pdf
|
||||
|
@ -78,8 +78,7 @@ class PdfViewWebPluginClient : public chrome_pdf::PdfViewWebPlugin::Client {
|
||||
void RecordComputedAction(const std::string& action) override;
|
||||
std::unique_ptr<chrome_pdf::PdfAccessibilityDataHandler>
|
||||
CreateAccessibilityDataHandler(
|
||||
chrome_pdf::PdfAccessibilityActionHandler* action_handler,
|
||||
chrome_pdf::PdfAccessibilityImageFetcher* image_fetcher) override;
|
||||
chrome_pdf::PdfAccessibilityActionHandler* action_handler) override;
|
||||
|
||||
private:
|
||||
blink::WebLocalFrame* GetFrame() const;
|
||||
|
@ -217,7 +217,6 @@ if (enable_pdf) {
|
||||
"accessibility_structs.h",
|
||||
"pdf_accessibility_action_handler.h",
|
||||
"pdf_accessibility_data_handler.h",
|
||||
"pdf_accessibility_image_fetcher.h",
|
||||
]
|
||||
|
||||
configs += [ ":strict" ]
|
||||
|
@ -77,11 +77,11 @@ AccessibilityImageInfo::AccessibilityImageInfo() = default;
|
||||
AccessibilityImageInfo::AccessibilityImageInfo(const std::string& alt_text,
|
||||
uint32_t text_run_index,
|
||||
const gfx::RectF& bounds,
|
||||
int32_t page_object_index)
|
||||
const SkBitmap& image_data)
|
||||
: alt_text(alt_text),
|
||||
text_run_index(text_run_index),
|
||||
bounds(bounds),
|
||||
page_object_index(page_object_index) {}
|
||||
image_data(image_data) {}
|
||||
|
||||
AccessibilityImageInfo::AccessibilityImageInfo(
|
||||
const AccessibilityImageInfo& other) = default;
|
||||
|
@ -132,7 +132,7 @@ struct AccessibilityImageInfo {
|
||||
AccessibilityImageInfo(const std::string& alt_text,
|
||||
uint32_t text_run_index,
|
||||
const gfx::RectF& bounds,
|
||||
int32_t page_object_index);
|
||||
const SkBitmap& image_data);
|
||||
AccessibilityImageInfo(const AccessibilityImageInfo& other);
|
||||
~AccessibilityImageInfo();
|
||||
|
||||
@ -147,8 +147,9 @@ struct AccessibilityImageInfo {
|
||||
// Bounding box of the image.
|
||||
gfx::RectF bounds;
|
||||
|
||||
// Index of the image object in its page.
|
||||
int32_t page_object_index;
|
||||
// Only populated if `alt_text` is empty or unavailable, and if the user has
|
||||
// requested that the OCR service tag the PDF so that it is made accessible.
|
||||
SkBitmap image_data;
|
||||
};
|
||||
|
||||
struct AccessibilityHighlightInfo {
|
||||
|
@ -1,22 +0,0 @@
|
||||
// Copyright 2023 The Chromium Authors
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef PDF_PDF_ACCESSIBILITY_IMAGE_FETCHER_H_
|
||||
#define PDF_PDF_ACCESSIBILITY_IMAGE_FETCHER_H_
|
||||
|
||||
class SkBitmap;
|
||||
|
||||
namespace chrome_pdf {
|
||||
|
||||
class PdfAccessibilityImageFetcher {
|
||||
public:
|
||||
virtual ~PdfAccessibilityImageFetcher() = default;
|
||||
// Fetches the image as a 32-bit bitmap for OCR.
|
||||
virtual SkBitmap GetImageForOcr(int32_t page_index,
|
||||
int32_t page_object_index) = 0;
|
||||
};
|
||||
|
||||
} // namespace chrome_pdf
|
||||
|
||||
#endif // PDF_PDF_ACCESSIBILITY_IMAGE_FETCHER_H_
|
@ -408,8 +408,6 @@ class PDFEngine {
|
||||
virtual std::vector<AccessibilityImageInfo> GetImageInfo(
|
||||
int page_index,
|
||||
uint32_t text_run_count) = 0;
|
||||
// Returns the image as a 32-bit bitmap format for OCR.
|
||||
virtual SkBitmap GetImageForOcr(int page_index, int image_index) = 0;
|
||||
// For all the highlights in page `page_index`, get their underlying text
|
||||
// ranges and bounding boxes.
|
||||
virtual std::vector<AccessibilityHighlightInfo> GetHighlightInfo(
|
||||
|
@ -274,8 +274,7 @@ std::unique_ptr<PDFiumEngine> PdfViewWebPlugin::Client::CreateEngine(
|
||||
|
||||
std::unique_ptr<PdfAccessibilityDataHandler>
|
||||
PdfViewWebPlugin::Client::CreateAccessibilityDataHandler(
|
||||
PdfAccessibilityActionHandler* action_handler,
|
||||
PdfAccessibilityImageFetcher* image_fetcher) {
|
||||
PdfAccessibilityActionHandler* action_handler) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@ -287,7 +286,7 @@ PdfViewWebPlugin::PdfViewWebPlugin(
|
||||
pdf_service_(std::move(pdf_service)),
|
||||
initial_params_(params),
|
||||
pdf_accessibility_data_handler_(
|
||||
client_->CreateAccessibilityDataHandler(this, this)) {
|
||||
client_->CreateAccessibilityDataHandler(this)) {
|
||||
DCHECK(pdf_service_);
|
||||
pdf_service_->SetListener(listener_receiver_.BindNewPipeAndPassRemote());
|
||||
}
|
||||
@ -1930,11 +1929,6 @@ void PdfViewWebPlugin::EnableAccessibility() {
|
||||
LoadOrReloadAccessibility();
|
||||
}
|
||||
|
||||
SkBitmap PdfViewWebPlugin::GetImageForOcr(int32_t page_index,
|
||||
int32_t page_object_index) {
|
||||
return engine_->GetImageForOcr(page_index, page_object_index);
|
||||
}
|
||||
|
||||
void PdfViewWebPlugin::HandleAccessibilityAction(
|
||||
const AccessibilityActionData& action_data) {
|
||||
engine_->HandleAccessibilityAction(action_data);
|
||||
|
@ -27,7 +27,6 @@
|
||||
#include "pdf/mojom/pdf.mojom.h"
|
||||
#include "pdf/paint_manager.h"
|
||||
#include "pdf/pdf_accessibility_action_handler.h"
|
||||
#include "pdf/pdf_accessibility_image_fetcher.h"
|
||||
#include "pdf/pdf_engine.h"
|
||||
#include "pdf/pdfium/pdfium_form_filler.h"
|
||||
#include "pdf/post_message_receiver.h"
|
||||
@ -83,7 +82,6 @@ class PdfViewWebPlugin final : public PDFEngine::Client,
|
||||
public PostMessageReceiver::Client,
|
||||
public PaintManager::Client,
|
||||
public PdfAccessibilityActionHandler,
|
||||
public PdfAccessibilityImageFetcher,
|
||||
public PreviewModeClient::Client {
|
||||
public:
|
||||
// Do not save files larger than 100 MB. This cap should be kept in sync with
|
||||
@ -221,8 +219,7 @@ class PdfViewWebPlugin final : public PDFEngine::Client,
|
||||
// client.
|
||||
virtual std::unique_ptr<PdfAccessibilityDataHandler>
|
||||
CreateAccessibilityDataHandler(
|
||||
PdfAccessibilityActionHandler* action_handler,
|
||||
PdfAccessibilityImageFetcher* image_fetcher);
|
||||
PdfAccessibilityActionHandler* action_handler);
|
||||
};
|
||||
|
||||
PdfViewWebPlugin(std::unique_ptr<Client> client,
|
||||
@ -384,10 +381,6 @@ class PdfViewWebPlugin final : public PDFEngine::Client,
|
||||
const AccessibilityActionData& action_data) override;
|
||||
void LoadOrReloadAccessibility() override;
|
||||
|
||||
// PdfAccessibilityImageFetcher:
|
||||
SkBitmap GetImageForOcr(int32_t page_index,
|
||||
int32_t page_object_index) override;
|
||||
|
||||
// PreviewModeClient::Client:
|
||||
void PreviewDocumentLoadComplete() override;
|
||||
void PreviewDocumentLoadFailed() override;
|
||||
|
@ -37,7 +37,6 @@
|
||||
#include "pdf/mojom/pdf.mojom.h"
|
||||
#include "pdf/paint_ready_rect.h"
|
||||
#include "pdf/pdf_accessibility_data_handler.h"
|
||||
#include "pdf/pdf_accessibility_image_fetcher.h"
|
||||
#include "pdf/pdf_features.h"
|
||||
#include "pdf/test/mock_web_associated_url_loader.h"
|
||||
#include "pdf/test/test_helpers.h"
|
||||
@ -309,7 +308,7 @@ class FakePdfViewWebPluginClient : public PdfViewWebPlugin::Client {
|
||||
|
||||
MOCK_METHOD(std::unique_ptr<PdfAccessibilityDataHandler>,
|
||||
CreateAccessibilityDataHandler,
|
||||
(PdfAccessibilityActionHandler*, PdfAccessibilityImageFetcher*),
|
||||
(PdfAccessibilityActionHandler*),
|
||||
(override));
|
||||
};
|
||||
|
||||
|
@ -2618,11 +2618,6 @@ std::vector<AccessibilityImageInfo> PDFiumEngine::GetImageInfo(
|
||||
return pages_[page_index]->GetImageInfo(text_run_count);
|
||||
}
|
||||
|
||||
SkBitmap PDFiumEngine::GetImageForOcr(int page_index, int image_index) {
|
||||
DCHECK(PageIndexInBounds(page_index));
|
||||
return pages_[page_index]->GetImageForOcr(image_index);
|
||||
}
|
||||
|
||||
std::vector<AccessibilityHighlightInfo> PDFiumEngine::GetHighlightInfo(
|
||||
int page_index,
|
||||
const std::vector<AccessibilityTextRunInfo>& text_runs) {
|
||||
|
@ -158,7 +158,6 @@ class PDFiumEngine : public PDFEngine,
|
||||
std::vector<AccessibilityImageInfo> GetImageInfo(
|
||||
int page_index,
|
||||
uint32_t text_run_count) override;
|
||||
SkBitmap GetImageForOcr(int page_index, int image_index) override;
|
||||
std::vector<AccessibilityHighlightInfo> GetHighlightInfo(
|
||||
int page_index,
|
||||
const std::vector<AccessibilityTextRunInfo>& text_runs) override;
|
||||
|
@ -770,84 +770,12 @@ std::vector<AccessibilityImageInfo> PDFiumPage::GetImageInfo(
|
||||
cur_info.bounds =
|
||||
gfx::RectF(image.bounding_rect.x(), image.bounding_rect.y(),
|
||||
image.bounding_rect.width(), image.bounding_rect.height());
|
||||
cur_info.page_object_index = image.page_object_index;
|
||||
cur_info.image_data = image.image_data;
|
||||
image_info.push_back(std::move(cur_info));
|
||||
}
|
||||
return image_info;
|
||||
}
|
||||
|
||||
SkBitmap PDFiumPage::GetImageForOcr(int page_object_index) {
|
||||
SkBitmap bitmap;
|
||||
|
||||
FPDF_PAGE page = GetPage();
|
||||
FPDF_PAGEOBJECT page_object = FPDFPage_GetObject(page, page_object_index);
|
||||
|
||||
if (FPDFPageObj_GetType(page_object) != FPDF_PAGEOBJ_IMAGE) {
|
||||
return bitmap;
|
||||
}
|
||||
|
||||
// OCR needs the image with the highest available quality. To get it, the
|
||||
// image transform matrix is reset to no-scale, the bitmap is extracted,
|
||||
// and then the original matrix is restored.
|
||||
FS_MATRIX original_matrix;
|
||||
if (!FPDFPageObj_GetMatrix(page_object, &original_matrix)) {
|
||||
return bitmap;
|
||||
}
|
||||
|
||||
// Get the actual image size.
|
||||
unsigned int width;
|
||||
unsigned int height;
|
||||
if (!FPDFImageObj_GetImagePixelSize(page_object, &width, &height)) {
|
||||
return bitmap;
|
||||
}
|
||||
|
||||
// Resize the matrix to actual size.
|
||||
FS_MATRIX new_matrix = {static_cast<float>(width), 0, 0,
|
||||
static_cast<float>(height), 0, 0};
|
||||
if (!FPDFPageObj_SetMatrix(page_object, &new_matrix)) {
|
||||
return bitmap;
|
||||
}
|
||||
|
||||
ScopedFPDFBitmap raw_bitmap(
|
||||
FPDFImageObj_GetRenderedBitmap(engine_->doc(), page, page_object));
|
||||
|
||||
// Restore the original matrix.
|
||||
CHECK(FPDFPageObj_SetMatrix(page_object, &original_matrix));
|
||||
|
||||
if (!raw_bitmap) {
|
||||
return SkBitmap();
|
||||
}
|
||||
|
||||
CHECK_EQ(FPDFBitmap_GetFormat(raw_bitmap.get()), FPDFBitmap_BGRA);
|
||||
SkImageInfo info =
|
||||
SkImageInfo::Make(FPDFBitmap_GetWidth(raw_bitmap.get()),
|
||||
FPDFBitmap_GetHeight(raw_bitmap.get()),
|
||||
kBGRA_8888_SkColorType, kOpaque_SkAlphaType);
|
||||
const size_t row_bytes = FPDFBitmap_GetStride(raw_bitmap.get());
|
||||
SkPixmap pixels(info, FPDFBitmap_GetBuffer(raw_bitmap.get()), row_bytes);
|
||||
if (!bitmap.tryAllocPixels(info, row_bytes)) {
|
||||
return bitmap;
|
||||
}
|
||||
bitmap.writePixels(pixels);
|
||||
|
||||
SkBitmapOperations::RotationAmount rotation;
|
||||
switch (FPDFPage_GetRotation(page)) {
|
||||
case 0:
|
||||
return bitmap;
|
||||
case 1:
|
||||
rotation = SkBitmapOperations::RotationAmount::ROTATION_90_CW;
|
||||
break;
|
||||
case 2:
|
||||
rotation = SkBitmapOperations::RotationAmount::ROTATION_180_CW;
|
||||
break;
|
||||
case 3:
|
||||
rotation = SkBitmapOperations::RotationAmount::ROTATION_270_CW;
|
||||
break;
|
||||
}
|
||||
|
||||
return SkBitmapOperations::Rotate(bitmap, rotation);
|
||||
}
|
||||
|
||||
std::vector<AccessibilityHighlightInfo> PDFiumPage::GetHighlightInfo(
|
||||
const std::vector<AccessibilityTextRunInfo>& text_runs) {
|
||||
std::vector<AccessibilityHighlightInfo> highlight_info;
|
||||
@ -1343,6 +1271,80 @@ void PDFiumPage::CalculateImages() {
|
||||
|
||||
if (!marked_content_id_image_map.empty())
|
||||
PopulateImageAltText(marked_content_id_image_map);
|
||||
|
||||
if (!features::IsPdfOcrEnabled())
|
||||
return;
|
||||
|
||||
// If requested by the user, we store the raw image data so that the OCR
|
||||
// service can try and retrieve textual and layout information from the image.
|
||||
// This is because alt text might be empty, or the PDF might simply be
|
||||
// untagged for accessibility.
|
||||
for (Image& image : images_) {
|
||||
if (!image.alt_text.empty())
|
||||
continue;
|
||||
|
||||
FPDF_PAGEOBJECT page_object =
|
||||
FPDFPage_GetObject(page, image.page_object_index);
|
||||
|
||||
// OCR needs the image with the highest available quality. To get it, the
|
||||
// image transform matrix is reset to no-scale, the bitmap is extracted,
|
||||
// and then the original matrix is restored.
|
||||
FS_MATRIX original_matrix;
|
||||
if (!FPDFPageObj_GetMatrix(page_object, &original_matrix)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Get the actual image size.
|
||||
unsigned int width;
|
||||
unsigned int height;
|
||||
if (!FPDFImageObj_GetImagePixelSize(page_object, &width, &height)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Resize the matrix to actual size.
|
||||
FS_MATRIX new_matrix = {static_cast<float>(width), 0, 0,
|
||||
static_cast<float>(height), 0, 0};
|
||||
if (!FPDFPageObj_SetMatrix(page_object, &new_matrix)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ScopedFPDFBitmap bitmap(
|
||||
FPDFImageObj_GetRenderedBitmap(engine_->doc(), page, page_object));
|
||||
|
||||
// Restore the original matrix.
|
||||
CHECK(FPDFPageObj_SetMatrix(page_object, &original_matrix));
|
||||
|
||||
if (!bitmap)
|
||||
continue;
|
||||
|
||||
CHECK_EQ(FPDFBitmap_GetFormat(bitmap.get()), FPDFBitmap_BGRA);
|
||||
SkImageInfo info = SkImageInfo::Make(
|
||||
FPDFBitmap_GetWidth(bitmap.get()), FPDFBitmap_GetHeight(bitmap.get()),
|
||||
kBGRA_8888_SkColorType, kOpaque_SkAlphaType);
|
||||
const size_t row_bytes = FPDFBitmap_GetStride(bitmap.get());
|
||||
SkPixmap pixels(info, FPDFBitmap_GetBuffer(bitmap.get()), row_bytes);
|
||||
if (!image.image_data.tryAllocPixels(info, row_bytes)) {
|
||||
continue;
|
||||
}
|
||||
image.image_data.writePixels(pixels);
|
||||
|
||||
SkBitmapOperations::RotationAmount rotation;
|
||||
switch (FPDFPage_GetRotation(page)) {
|
||||
case 0:
|
||||
continue;
|
||||
case 1:
|
||||
rotation = SkBitmapOperations::RotationAmount::ROTATION_90_CW;
|
||||
break;
|
||||
case 2:
|
||||
rotation = SkBitmapOperations::RotationAmount::ROTATION_180_CW;
|
||||
break;
|
||||
case 3:
|
||||
rotation = SkBitmapOperations::RotationAmount::ROTATION_270_CW;
|
||||
break;
|
||||
}
|
||||
|
||||
image.image_data = SkBitmapOperations::Rotate(image.image_data, rotation);
|
||||
}
|
||||
}
|
||||
|
||||
void PDFiumPage::PopulateImageAltText(
|
||||
|
@ -89,9 +89,6 @@ class PDFiumPage {
|
||||
// `image_data` field.
|
||||
std::vector<AccessibilityImageInfo> GetImageInfo(uint32_t text_run_count);
|
||||
|
||||
// Returns the image as a 32-bit bitmap format for OCR.
|
||||
SkBitmap GetImageForOcr(int page_object_index);
|
||||
|
||||
// For all the highlights on the page, get their underlying text ranges and
|
||||
// bounding boxes.
|
||||
std::vector<AccessibilityHighlightInfo> GetHighlightInfo(
|
||||
@ -234,7 +231,6 @@ class PDFiumPage {
|
||||
FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageTest, CalculateImages);
|
||||
FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageTest, ImageAltText);
|
||||
FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageDataTest, ImageData);
|
||||
FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageDataTest, ImageDataForNonImage);
|
||||
FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageDataTest, RotatedPageImageData);
|
||||
FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, AnnotLinkGeneration);
|
||||
FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, GetLinkTarget);
|
||||
@ -273,12 +269,15 @@ class PDFiumPage {
|
||||
Image(const Image& other);
|
||||
~Image();
|
||||
|
||||
// Index of the object in its page.
|
||||
int page_object_index;
|
||||
|
||||
// Alt text is available only for PDFs that are tagged for accessibility.
|
||||
std::string alt_text;
|
||||
gfx::Rect bounding_rect;
|
||||
// Image data is only stored if the user has requested that the OCR service
|
||||
// try to retrieve textual and layout information from this image. The
|
||||
// bitmap will have the same size as the image in the PDF file, and will
|
||||
// not be scaled.
|
||||
SkBitmap image_data;
|
||||
};
|
||||
|
||||
// Represents a highlight within the page.
|
||||
|
@ -541,20 +541,16 @@ TEST_P(PDFiumPageImageDataTest, ImageData) {
|
||||
ASSERT_EQ(3u, page.images_.size());
|
||||
|
||||
ASSERT_FALSE(page.images_[0].alt_text.empty());
|
||||
SkBitmap image_bitmap = engine->GetImageForOcr(
|
||||
/*page_index=*/0, page.images_[0].page_object_index);
|
||||
EXPECT_FALSE(image_bitmap.drawsNothing());
|
||||
EXPECT_EQ(image_bitmap.width(), 50);
|
||||
EXPECT_EQ(image_bitmap.height(), 50);
|
||||
EXPECT_TRUE(page.images_[0].image_data.drawsNothing());
|
||||
EXPECT_EQ(page.images_[0].image_data.width(), 0);
|
||||
EXPECT_EQ(page.images_[0].image_data.height(), 0);
|
||||
|
||||
ASSERT_TRUE(page.images_[2].alt_text.empty());
|
||||
|
||||
ASSERT_TRUE(page.images_[1].alt_text.empty());
|
||||
image_bitmap = engine->GetImageForOcr(/*page_index=*/0,
|
||||
page.images_[1].page_object_index);
|
||||
EXPECT_FALSE(image_bitmap.drawsNothing());
|
||||
// While the scaled image size is 20x20, `image_data` has the same size as
|
||||
// the image in the PDF file, which is 50x50, and is not scaled.
|
||||
EXPECT_EQ(image_bitmap.width(), 50);
|
||||
EXPECT_EQ(image_bitmap.height(), 50);
|
||||
EXPECT_EQ(page.images_[1].image_data.width(), 50);
|
||||
EXPECT_EQ(page.images_[1].image_data.height(), 50);
|
||||
}
|
||||
|
||||
TEST_P(PDFiumPageImageDataTest, RotatedPageImageData) {
|
||||
@ -570,37 +566,8 @@ TEST_P(PDFiumPageImageDataTest, RotatedPageImageData) {
|
||||
|
||||
// This page is rotated, therefore the extracted image size is 25x100 while
|
||||
// the stored image is 100x25.
|
||||
SkBitmap image_bitmap = engine->GetImageForOcr(
|
||||
/*page_index=*/0, page.images_[0].page_object_index);
|
||||
EXPECT_EQ(image_bitmap.width(), 25);
|
||||
EXPECT_EQ(image_bitmap.height(), 100);
|
||||
}
|
||||
|
||||
TEST_P(PDFiumPageImageDataTest, ImageDataForNonImage) {
|
||||
TestClient client;
|
||||
std::unique_ptr<PDFiumEngine> engine =
|
||||
InitializeEngine(&client, FILE_PATH_LITERAL("text_with_image.pdf"));
|
||||
ASSERT_TRUE(engine);
|
||||
ASSERT_EQ(1, engine->GetNumberOfPages());
|
||||
|
||||
PDFiumPage& page = GetPDFiumPageForTest(*engine, 0);
|
||||
page.CalculateImages();
|
||||
ASSERT_EQ(3u, page.images_.size());
|
||||
ASSERT_EQ(1, page.images_[0].page_object_index);
|
||||
|
||||
// Existing non-image object.
|
||||
SkBitmap image_bitmap = engine->GetImageForOcr(
|
||||
/*page_index=*/0, /*image_index=*/0);
|
||||
EXPECT_TRUE(image_bitmap.drawsNothing());
|
||||
EXPECT_EQ(image_bitmap.width(), 0);
|
||||
EXPECT_EQ(image_bitmap.height(), 0);
|
||||
|
||||
// Out of range.
|
||||
image_bitmap = engine->GetImageForOcr(
|
||||
/*page_index=*/0, /*image_index=*/1000);
|
||||
EXPECT_TRUE(image_bitmap.drawsNothing());
|
||||
EXPECT_EQ(image_bitmap.width(), 0);
|
||||
EXPECT_EQ(image_bitmap.height(), 0);
|
||||
EXPECT_EQ(page.images_[0].image_data.width(), 25);
|
||||
EXPECT_EQ(page.images_[0].image_data.height(), 100);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(All, PDFiumPageImageDataTest, testing::Bool());
|
||||
|
Reference in New Issue
Block a user