Revert "Extract high quality images from PDF only when they are needed for OCR."

This reverts commit 5208eafb23.

Reason for revert:
LUCI Bisection identified this CL as the culprit of a build failure. See the analysis: https://luci-bisection.appspot.com/analysis/b/8773075837739418257

Sample failed build: https://ci.chromium.org/b/8773075837739418257

If this is a false positive, please report it at https://bugs.chromium.org/p/chromium/issues/entry?comment=Analysis%3A+https%3A%2F%2Fluci-bisection.appspot.com%2Fanalysis%2Fb%2F8773075837739418257&components=Tools%3ETest%3EFindit&labels=LUCI-Bisection-Wrong%2CPri-3%2CType-Bug&status=Available&summary=Wrongly+blamed+https%3A%2F%2Fchromium-review.googlesource.com%2Fc%2Fchromium%2Fsrc%2F%2B%2F4754282

Original change's description:
> Extract high quality images from PDF only when they are needed for OCR.
>
> 32-bit bitmaps with highest available quality are extracted from PDF
> files when they are loaded, so that they would be sent later to OCR
> service. To avoid the memory overhead of this process, this CL
> postpones image extraction to the time they are sent to OCR service,
> and destroys the extracted images immediately after that.
>
> AX-Relnotes: n/a
> Bug: 1471392
> Change-Id: Id337edf693d8d4a4ddd1a56d814a0d1f0e1ac5e4
> Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4754282
> Auto-Submit: Ramin Halavati <rhalavati@chromium.org>
> Reviewed-by: Kyungjun Lee <kyungjunlee@google.com>
> Commit-Queue: Ramin Halavati <rhalavati@chromium.org>
> Commit-Queue: Lei Zhang <thestig@chromium.org>
> Reviewed-by: Lei Zhang <thestig@chromium.org>
> Cr-Commit-Position: refs/heads/main@{#1182463}
>

Bug: 1471392
Change-Id: If5d194c454e0cced5a2b7f4041973870b427e5cf
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4772383
Owners-Override: luci-bisection@appspot.gserviceaccount.com <luci-bisection@appspot.gserviceaccount.com>
Commit-Queue: luci-bisection@appspot.gserviceaccount.com <luci-bisection@appspot.gserviceaccount.com>
Bot-Commit: luci-bisection@appspot.gserviceaccount.com <luci-bisection@appspot.gserviceaccount.com>
Cr-Commit-Position: refs/heads/main@{#1182468}

This commit is contained in:

luci-bisection@appspot.gserviceaccount.com

2023-08-11 08:10:23 +00:00

committed by

Chromium LUCI CQ

parent f95b9ba3f8

commit edf0d3d1ed

19 changed files with 479 additions and 527 deletions

components/pdf/renderer

DEPS pdf_accessibility_tree.cc pdf_accessibility_tree.h pdf_accessibility_tree_browsertest.cc pdf_view_web_plugin_client.cc pdf_view_web_plugin_client.h

pdf

BUILD.gn accessibility_structs.cc accessibility_structs.h pdf_accessibility_image_fetcher.h pdf_engine.h pdf_view_web_plugin.cc pdf_view_web_plugin.h pdf_view_web_plugin_unittest.cc

pdfium

pdfium_engine.cc pdfium_engine.h pdfium_page.cc pdfium_page.h pdfium_page_unittest.cc

1

components/pdf/renderer/DEPS

 @ -12,7 +12,6 @@ include_rules = [
   "+pdf/mojom/pdf.mojom.h",
   "+pdf/pdf_accessibility_action_handler.h",
   "+pdf/pdf_accessibility_data_handler.h",
   "+pdf/pdf_accessibility_image_fetcher.h",
   "+pdf/pdf_features.h",
   "+pdf/pdf_view_web_plugin.h",
   "+printing/buildflags/buildflags.h",

									
										65

components/pdf/renderer/pdf_accessibility_tree.cc
									
				@ -25,7 +25,6 @@

				#include "content/public/renderer/render_frame.h"

				#include "content/public/renderer/render_thread.h"

				#include "pdf/pdf_accessibility_action_handler.h"

				#include "pdf/pdf_accessibility_image_fetcher.h"

				#include "pdf/pdf_features.h"

				#include "third_party/blink/public/strings/grit/blink_accessibility_strings.h"

				#include "ui/accessibility/ax_enums.mojom.h"

				@ -58,12 +57,10 @@ using PdfOcrRequest = PdfAccessibilityTree::PdfOcrRequest;

				PdfOcrRequest::PdfOcrRequest(const ui::AXNodeID& image_node_id,

				                             const chrome_pdf::AccessibilityImageInfo& image,

				                             const ui::AXNodeID& parent_node_id,

				                             uint32_t page_index)

				                             const ui::AXNodeID& parent_node_id)

				    : image_node_id(image_node_id),

				      image(image),

				      parent_node_id(parent_node_id),

				      page_index(page_index) {}

				      parent_node_id(parent_node_id) {}

				//

				// PdfOcrService

				@ -71,13 +68,10 @@ PdfOcrRequest::PdfOcrRequest(const ui::AXNodeID& image_node_id,

				using PdfOcrService = PdfAccessibilityTree::PdfOcrService;

				PdfOcrService::PdfOcrService(

				    chrome_pdf::PdfAccessibilityImageFetcher* image_fetcher,

				    content::RenderFrame& render_frame,

				    uint32_t page_count,

				    OnOcrDataReceivedCallback callback)

				    : image_fetcher_(image_fetcher),

				      remaining_page_count_(page_count),

				PdfOcrService::PdfOcrService(content::RenderFrame& render_frame,

				                             uint32_t page_count,

				                             OnOcrDataReceivedCallback callback)

				    : remaining_page_count_(page_count),

				      on_ocr_data_received_callback_(std::move(callback)) {

				  CHECK(features::IsPdfOcrEnabled());

				  render_frame.GetBrowserInterfaceBroker()->GetInterface(

				@ -130,21 +124,12 @@ void PdfOcrService::OcrNextImage() {

				  if (all_requests_.empty()) {

				    return;

				  }

				  PdfOcrRequest request = all_requests_.front();

				  const PdfOcrRequest request = all_requests_.front();

				  all_requests_.pop();

				  SkBitmap bitmap = image_fetcher_->GetImageForOcr(

				      request.page_index, request.image.page_object_index);

				  request.image_pixel_size = gfx::SizeF(bitmap.width(), bitmap.height());

				  if (bitmap.drawsNothing()) {

				    ReceiveOcrResultsForImage(std::move(request), ui::AXTreeUpdate());

				    return;

				  }

				  screen_ai_annotator_->PerformOcrAndReturnAXTreeUpdate(

				      std::move(bitmap),

				      request.image.image_data,

				      base::BindOnce(&PdfOcrService::ReceiveOcrResultsForImage,

				                     weak_ptr_factory_.GetWeakPtr(), std::move(request)));

				                     weak_ptr_factory_.GetWeakPtr(), request));

				  base::UmaHistogramEnumeration("Accessibility.PdfOcr.PDFImages",

				                                PdfOcrRequestStatus::kRequested);

				@ -521,11 +506,14 @@ std::unique_ptr<ui::AXNodeData> CreateStatusNodeWrapper(

				  return node_wrapper;

				}

				gfx::Transform MakeTransformForImage(const gfx::RectF image_screen_size,

				                                     const gfx::SizeF image_pixel_size) {

				gfx::Transform MakeTransformForImage(

				    const chrome_pdf::AccessibilityImageInfo& image) {

				  // Nodes created with OCR results from the image will be misaligned on screen

				  // if `image_screen_size` is different from `image_pixel_size`. To address

				  // this misalignment issue, an additional transform needs to be created.

				  const gfx::RectF& image_screen_size = image.bounds;

				  const gfx::RectF image_pixel_size =

				      gfx::RectF(image.image_data.width(), image.image_data.height());

				  CHECK(!image_pixel_size.IsEmpty());

				  gfx::Transform transform;

				@ -1343,9 +1331,9 @@ class PdfAccessibilityTreeBuilder {

				      ui::AXNodeData* image_node = CreateImageNode(images_[i]);

				      para_node->child_ids.push_back(image_node->id);

				#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)

				      if (!has_accessible_text_ && ocr_available) {

				        ocr_requests.emplace(image_node->id, images_[i], para_node->id,

				                             page_index_);

				      if (!has_accessible_text_ && ocr_available &&

				          !images_[i].image_data.drawsNothing()) {

				        ocr_requests.emplace(image_node->id, images_[i], para_node->id);

				      }

				#endif

				    }

				@ -1421,15 +1409,12 @@ class PdfAccessibilityTreeBuilder {

				PdfAccessibilityTree::PdfAccessibilityTree(

				    content::RenderFrame* render_frame,

				    chrome_pdf::PdfAccessibilityActionHandler* action_handler,

				    chrome_pdf::PdfAccessibilityImageFetcher* image_fetcher)

				    chrome_pdf::PdfAccessibilityActionHandler* action_handler)

				    : content::RenderFrameObserver(render_frame),

				      render_frame_(render_frame),

				      action_handler_(action_handler),

				      image_fetcher_(image_fetcher) {

				      action_handler_(action_handler) {

				  DCHECK(render_frame);

				  DCHECK(action_handler_);

				  DCHECK(image_fetcher_);

				  MaybeHandleAccessibilityChange(/*always_load_or_reload_accessibility=*/false);

				}

				@ -2122,17 +2107,14 @@ void PdfAccessibilityTree::OnOcrDataReceived(

				    // would be more convenient and less complex if an `ui::AXTree` was never

				    // constructed and if the `ui::AXTreeSource` was able to use the collection

				    // of `nodes_` directly.

				    base::UmaHistogramEnumeration("Accessibility.PdfOcr.PDFImages",

				                                  PdfOcrRequestStatus::kPerformed);

				    if (tree_update.nodes.empty()) {

				      VLOG(1) << "Empty OCR data received.";

				      // TODO(crbug.com/1471392): Create an empty update and continue. This can

				      // happen if OCR returns an empty result, or the image draws nothing.

				      return;

				    }

				    base::UmaHistogramEnumeration("Accessibility.PdfOcr.PDFImages",

				                                  PdfOcrRequestStatus::kPerformed);

				    // Update the flag if OCR extracted text from any images. This flag will be

				    // used to update the status node to notify users of it.

				    was_text_converted_from_image_ = true;

				@ -2163,8 +2145,7 @@ void PdfAccessibilityTree::OnOcrDataReceived(

				    // transform, nodes created from OCR results will have misaligned bounding

				    // boxes. This transform will be applied to all nodes from OCR results

				    // below.

				    gfx::Transform transform = MakeTransformForImage(

				        ocr_request.image.bounds, ocr_request.image_pixel_size);

				    gfx::Transform transform = MakeTransformForImage(ocr_request.image);

				    // Count each detected language and find out the most detected language in

				    // OCR result. Then record the most detected language in UMA.

				@ -2250,7 +2231,7 @@ void PdfAccessibilityTree::OnOcrDataReceived(

				void PdfAccessibilityTree::CreateOcrService() {

				  VLOG(2) << "Creating OCR service.";

				  ocr_service_ = std::make_unique<PdfOcrService>(

				      image_fetcher_, *render_frame_, page_count_,

				      *render_frame_, page_count_,

				      base::BindRepeating(&PdfAccessibilityTree::OnOcrDataReceived,

				                          weak_ptr_factory_.GetWeakPtr()));

				}

									
										18

components/pdf/renderer/pdf_accessibility_tree.h
									
				@ -35,7 +35,6 @@

				namespace chrome_pdf {

				class PdfAccessibilityActionHandler;

				class PdfAccessibilityImageFetcher;

				}  // namespace chrome_pdf

				@ -73,19 +72,14 @@ class PdfAccessibilityTree : public content::PluginAXTreeSource,

				  struct PdfOcrRequest {

				    PdfOcrRequest(const ui::AXNodeID& image_node_id,

				                  const chrome_pdf::AccessibilityImageInfo& image,

				                  const ui::AXNodeID& parent_node_id,

				                  uint32_t page_index);

				                  const ui::AXNodeID& parent_node_id);

				    const ui::AXNodeID image_node_id;

				    const chrome_pdf::AccessibilityImageInfo image;

				    const ui::AXNodeID parent_node_id;

				    const uint32_t page_index;

				    // This boolean indicates which request corresponds to the last image on

				    // each page.

				    bool is_last_on_page = false;

				    // This field is set after the image is extracted from PDF.

				    gfx::SizeF image_pixel_size;

				  };

				  // Manages the connection to the OCR Service via Mojo, and ensures that

				@ -96,8 +90,7 @@ class PdfAccessibilityTree : public content::PluginAXTreeSource,

				        std::vector<PdfOcrRequest> ocr_requests,

				        std::vector<ui::AXTreeUpdate> tree_updates)>;

				    PdfOcrService(chrome_pdf::PdfAccessibilityImageFetcher* image_fetcher,

				                  content::RenderFrame& render_frame,

				    PdfOcrService(content::RenderFrame& render_frame,

				                  uint32_t page_count,

				                  OnOcrDataReceivedCallback callback);

				@ -131,9 +124,6 @@ class PdfAccessibilityTree : public content::PluginAXTreeSource,

				    void ReceiveOcrResultsForImage(PdfOcrRequest request,

				                                   const ui::AXTreeUpdate& tree_update);

				    // `image_fetcher_` owns `this`.

				    chrome_pdf::PdfAccessibilityImageFetcher* const image_fetcher_;

				    uint32_t remaining_page_count_;

				    // True if there are pending OCR requests. Used to determine if `OcrPage`

				    // should call `OcrNextImage` or if the next call to

				@ -159,8 +149,7 @@ class PdfAccessibilityTree : public content::PluginAXTreeSource,

				  PdfAccessibilityTree(

				      content::RenderFrame* render_frame,

				      chrome_pdf::PdfAccessibilityActionHandler* action_handler,

				      chrome_pdf::PdfAccessibilityImageFetcher* image_fetcher);

				      chrome_pdf::PdfAccessibilityActionHandler* action_handler);

				  ~PdfAccessibilityTree() override;

				  static bool IsDataFromPluginValid(

				@ -316,7 +305,6 @@ class PdfAccessibilityTree : public content::PluginAXTreeSource,

				  // Unowned. Must outlive `this`.

				  chrome_pdf::PdfAccessibilityActionHandler* const action_handler_;

				  chrome_pdf::PdfAccessibilityImageFetcher* const image_fetcher_;

				  // `zoom_` signifies the zoom level set for the browser content.

				  // `scale_` signifies the scale level set by user. Scale is applied

639

components/pdf/renderer/pdf_accessibility_tree_browsertest.cc

File diff suppressed because it is too large Load Diff

									
										6

components/pdf/renderer/pdf_view_web_plugin_client.cc
									
				@ -255,10 +255,8 @@ void PdfViewWebPluginClient::RecordComputedAction(const std::string& action) {

				std::unique_ptr<chrome_pdf::PdfAccessibilityDataHandler>

				PdfViewWebPluginClient::CreateAccessibilityDataHandler(

				    chrome_pdf::PdfAccessibilityActionHandler* action_handler,

				    chrome_pdf::PdfAccessibilityImageFetcher* image_fetcher) {

				  return std::make_unique<PdfAccessibilityTree>(render_frame_, action_handler,

				                                                image_fetcher);

				    chrome_pdf::PdfAccessibilityActionHandler* action_handler) {

				  return std::make_unique<PdfAccessibilityTree>(render_frame_, action_handler);

				}

				}  // namespace pdf

									
										3

components/pdf/renderer/pdf_view_web_plugin_client.h
									
				@ -78,8 +78,7 @@ class PdfViewWebPluginClient : public chrome_pdf::PdfViewWebPlugin::Client {

				  void RecordComputedAction(const std::string& action) override;

				  std::unique_ptr<chrome_pdf::PdfAccessibilityDataHandler>

				  CreateAccessibilityDataHandler(

				      chrome_pdf::PdfAccessibilityActionHandler* action_handler,

				      chrome_pdf::PdfAccessibilityImageFetcher* image_fetcher) override;

				      chrome_pdf::PdfAccessibilityActionHandler* action_handler) override;

				 private:

				  blink::WebLocalFrame* GetFrame() const;

1

pdf/BUILD.gn

 @ -217,7 +217,6 @@ if (enable_pdf) {
       "accessibility_structs.h",
       "pdf_accessibility_action_handler.h",
       "pdf_accessibility_data_handler.h",
       "pdf_accessibility_image_fetcher.h",
     ]
     configs += [ ":strict" ]

									
										4

pdf/accessibility_structs.cc
									
				@ -77,11 +77,11 @@ AccessibilityImageInfo::AccessibilityImageInfo() = default;

				AccessibilityImageInfo::AccessibilityImageInfo(const std::string& alt_text,

				                                               uint32_t text_run_index,

				                                               const gfx::RectF& bounds,

				                                               int32_t page_object_index)

				                                               const SkBitmap& image_data)

				    : alt_text(alt_text),

				      text_run_index(text_run_index),

				      bounds(bounds),

				      page_object_index(page_object_index) {}

				      image_data(image_data) {}

				AccessibilityImageInfo::AccessibilityImageInfo(

				    const AccessibilityImageInfo& other) = default;

									
										7

pdf/accessibility_structs.h
									
				@ -132,7 +132,7 @@ struct AccessibilityImageInfo {

				  AccessibilityImageInfo(const std::string& alt_text,

				                         uint32_t text_run_index,

				                         const gfx::RectF& bounds,

				                         int32_t page_object_index);

				                         const SkBitmap& image_data);

				  AccessibilityImageInfo(const AccessibilityImageInfo& other);

				  ~AccessibilityImageInfo();

				@ -147,8 +147,9 @@ struct AccessibilityImageInfo {

				  // Bounding box of the image.

				  gfx::RectF bounds;

				  // Index of the image object in its page.

				  int32_t page_object_index;

				  // Only populated if `alt_text` is empty or unavailable, and if the user has

				  // requested that the OCR service tag the PDF so that it is made accessible.

				  SkBitmap image_data;

				};

				struct AccessibilityHighlightInfo {

									
										22

pdf/pdf_accessibility_image_fetcher.h
									
				@ -1,22 +0,0 @@

				// Copyright 2023 The Chromium Authors

				// Use of this source code is governed by a BSD-style license that can be

				// found in the LICENSE file.

				#ifndef PDF_PDF_ACCESSIBILITY_IMAGE_FETCHER_H_

				#define PDF_PDF_ACCESSIBILITY_IMAGE_FETCHER_H_

				class SkBitmap;

				namespace chrome_pdf {

				class PdfAccessibilityImageFetcher {

				 public:

				  virtual ~PdfAccessibilityImageFetcher() = default;

				  // Fetches the image as a 32-bit bitmap for OCR.

				  virtual SkBitmap GetImageForOcr(int32_t page_index,

				                                  int32_t page_object_index) = 0;

				};

				}  // namespace chrome_pdf

				#endif  // PDF_PDF_ACCESSIBILITY_IMAGE_FETCHER_H_

									
										2

pdf/pdf_engine.h
									
				@ -408,8 +408,6 @@ class PDFEngine {

				  virtual std::vector<AccessibilityImageInfo> GetImageInfo(

				      int page_index,

				      uint32_t text_run_count) = 0;

				  // Returns the image as a 32-bit bitmap format for OCR.

				  virtual SkBitmap GetImageForOcr(int page_index, int image_index) = 0;

				  // For all the highlights in page `page_index`, get their underlying text

				  // ranges and bounding boxes.

				  virtual std::vector<AccessibilityHighlightInfo> GetHighlightInfo(

									
										10

pdf/pdf_view_web_plugin.cc
									
				@ -274,8 +274,7 @@ std::unique_ptr<PDFiumEngine> PdfViewWebPlugin::Client::CreateEngine(

				std::unique_ptr<PdfAccessibilityDataHandler>

				PdfViewWebPlugin::Client::CreateAccessibilityDataHandler(

				    PdfAccessibilityActionHandler* action_handler,

				    PdfAccessibilityImageFetcher* image_fetcher) {

				    PdfAccessibilityActionHandler* action_handler) {

				  return nullptr;

				}

				@ -287,7 +286,7 @@ PdfViewWebPlugin::PdfViewWebPlugin(

				      pdf_service_(std::move(pdf_service)),

				      initial_params_(params),

				      pdf_accessibility_data_handler_(

				          client_->CreateAccessibilityDataHandler(this, this)) {

				          client_->CreateAccessibilityDataHandler(this)) {

				  DCHECK(pdf_service_);

				  pdf_service_->SetListener(listener_receiver_.BindNewPipeAndPassRemote());

				}

				@ -1930,11 +1929,6 @@ void PdfViewWebPlugin::EnableAccessibility() {

				  LoadOrReloadAccessibility();

				}

				SkBitmap PdfViewWebPlugin::GetImageForOcr(int32_t page_index,

				                                          int32_t page_object_index) {

				  return engine_->GetImageForOcr(page_index, page_object_index);

				}

				void PdfViewWebPlugin::HandleAccessibilityAction(

				    const AccessibilityActionData& action_data) {

				  engine_->HandleAccessibilityAction(action_data);

									
										9

pdf/pdf_view_web_plugin.h
									
				@ -27,7 +27,6 @@

				#include "pdf/mojom/pdf.mojom.h"

				#include "pdf/paint_manager.h"

				#include "pdf/pdf_accessibility_action_handler.h"

				#include "pdf/pdf_accessibility_image_fetcher.h"

				#include "pdf/pdf_engine.h"

				#include "pdf/pdfium/pdfium_form_filler.h"

				#include "pdf/post_message_receiver.h"

				@ -83,7 +82,6 @@ class PdfViewWebPlugin final : public PDFEngine::Client,

				                               public PostMessageReceiver::Client,

				                               public PaintManager::Client,

				                               public PdfAccessibilityActionHandler,

				                               public PdfAccessibilityImageFetcher,

				                               public PreviewModeClient::Client {

				 public:

				  // Do not save files larger than 100 MB. This cap should be kept in sync with

				@ -221,8 +219,7 @@ class PdfViewWebPlugin final : public PDFEngine::Client,

				    // client.

				    virtual std::unique_ptr<PdfAccessibilityDataHandler>

				    CreateAccessibilityDataHandler(

				        PdfAccessibilityActionHandler* action_handler,

				        PdfAccessibilityImageFetcher* image_fetcher);

				        PdfAccessibilityActionHandler* action_handler);

				  };

				  PdfViewWebPlugin(std::unique_ptr<Client> client,

				@ -384,10 +381,6 @@ class PdfViewWebPlugin final : public PDFEngine::Client,

				      const AccessibilityActionData& action_data) override;

				  void LoadOrReloadAccessibility() override;

				  // PdfAccessibilityImageFetcher:

				  SkBitmap GetImageForOcr(int32_t page_index,

				                          int32_t page_object_index) override;

				  // PreviewModeClient::Client:

				  void PreviewDocumentLoadComplete() override;

				  void PreviewDocumentLoadFailed() override;

									
										3

pdf/pdf_view_web_plugin_unittest.cc
									
				@ -37,7 +37,6 @@

				#include "pdf/mojom/pdf.mojom.h"

				#include "pdf/paint_ready_rect.h"

				#include "pdf/pdf_accessibility_data_handler.h"

				#include "pdf/pdf_accessibility_image_fetcher.h"

				#include "pdf/pdf_features.h"

				#include "pdf/test/mock_web_associated_url_loader.h"

				#include "pdf/test/test_helpers.h"

				@ -309,7 +308,7 @@ class FakePdfViewWebPluginClient : public PdfViewWebPlugin::Client {

				  MOCK_METHOD(std::unique_ptr<PdfAccessibilityDataHandler>,

				              CreateAccessibilityDataHandler,

				              (PdfAccessibilityActionHandler*, PdfAccessibilityImageFetcher*),

				              (PdfAccessibilityActionHandler*),

				              (override));

				};

									
										5

pdf/pdfium/pdfium_engine.cc
									
				@ -2618,11 +2618,6 @@ std::vector<AccessibilityImageInfo> PDFiumEngine::GetImageInfo(

				  return pages_[page_index]->GetImageInfo(text_run_count);

				}

				SkBitmap PDFiumEngine::GetImageForOcr(int page_index, int image_index) {

				  DCHECK(PageIndexInBounds(page_index));

				  return pages_[page_index]->GetImageForOcr(image_index);

				}

				std::vector<AccessibilityHighlightInfo> PDFiumEngine::GetHighlightInfo(

				    int page_index,

				    const std::vector<AccessibilityTextRunInfo>& text_runs) {

									
										1

pdf/pdfium/pdfium_engine.h
									
				@ -158,7 +158,6 @@ class PDFiumEngine : public PDFEngine,

				  std::vector<AccessibilityImageInfo> GetImageInfo(

				      int page_index,

				      uint32_t text_run_count) override;

				  SkBitmap GetImageForOcr(int page_index, int image_index) override;

				  std::vector<AccessibilityHighlightInfo> GetHighlightInfo(

				      int page_index,

				      const std::vector<AccessibilityTextRunInfo>& text_runs) override;

									
										148

pdf/pdfium/pdfium_page.cc
									
				@ -770,84 +770,12 @@ std::vector<AccessibilityImageInfo> PDFiumPage::GetImageInfo(

				    cur_info.bounds =

				        gfx::RectF(image.bounding_rect.x(), image.bounding_rect.y(),

				                   image.bounding_rect.width(), image.bounding_rect.height());

				    cur_info.page_object_index = image.page_object_index;

				    cur_info.image_data = image.image_data;

				    image_info.push_back(std::move(cur_info));

				  }

				  return image_info;

				}

				SkBitmap PDFiumPage::GetImageForOcr(int page_object_index) {

				  SkBitmap bitmap;

				  FPDF_PAGE page = GetPage();

				  FPDF_PAGEOBJECT page_object = FPDFPage_GetObject(page, page_object_index);

				  if (FPDFPageObj_GetType(page_object) != FPDF_PAGEOBJ_IMAGE) {

				    return bitmap;

				  }

				  // OCR needs the image with the highest available quality. To get it, the

				  // image transform matrix is reset to no-scale, the bitmap is extracted,

				  // and then the original matrix is restored.

				  FS_MATRIX original_matrix;

				  if (!FPDFPageObj_GetMatrix(page_object, &original_matrix)) {

				    return bitmap;

				  }

				  // Get the actual image size.

				  unsigned int width;

				  unsigned int height;

				  if (!FPDFImageObj_GetImagePixelSize(page_object, &width, &height)) {

				    return bitmap;

				  }

				  // Resize the matrix to actual size.

				  FS_MATRIX new_matrix = {static_cast<float>(width),  0, 0,

				                          static_cast<float>(height), 0, 0};

				  if (!FPDFPageObj_SetMatrix(page_object, &new_matrix)) {

				    return bitmap;

				  }

				  ScopedFPDFBitmap raw_bitmap(

				      FPDFImageObj_GetRenderedBitmap(engine_->doc(), page, page_object));

				  // Restore the original matrix.

				  CHECK(FPDFPageObj_SetMatrix(page_object, &original_matrix));

				  if (!raw_bitmap) {

				    return SkBitmap();

				  }

				  CHECK_EQ(FPDFBitmap_GetFormat(raw_bitmap.get()), FPDFBitmap_BGRA);

				  SkImageInfo info =

				      SkImageInfo::Make(FPDFBitmap_GetWidth(raw_bitmap.get()),

				                        FPDFBitmap_GetHeight(raw_bitmap.get()),

				                        kBGRA_8888_SkColorType, kOpaque_SkAlphaType);

				  const size_t row_bytes = FPDFBitmap_GetStride(raw_bitmap.get());

				  SkPixmap pixels(info, FPDFBitmap_GetBuffer(raw_bitmap.get()), row_bytes);

				  if (!bitmap.tryAllocPixels(info, row_bytes)) {

				    return bitmap;

				  }

				  bitmap.writePixels(pixels);

				  SkBitmapOperations::RotationAmount rotation;

				  switch (FPDFPage_GetRotation(page)) {

				    case 0:

				      return bitmap;

				    case 1:

				      rotation = SkBitmapOperations::RotationAmount::ROTATION_90_CW;

				      break;

				    case 2:

				      rotation = SkBitmapOperations::RotationAmount::ROTATION_180_CW;

				      break;

				    case 3:

				      rotation = SkBitmapOperations::RotationAmount::ROTATION_270_CW;

				      break;

				  }

				  return SkBitmapOperations::Rotate(bitmap, rotation);

				}

				std::vector<AccessibilityHighlightInfo> PDFiumPage::GetHighlightInfo(

				    const std::vector<AccessibilityTextRunInfo>& text_runs) {

				  std::vector<AccessibilityHighlightInfo> highlight_info;

				@ -1343,6 +1271,80 @@ void PDFiumPage::CalculateImages() {

				  if (!marked_content_id_image_map.empty())

				    PopulateImageAltText(marked_content_id_image_map);

				  if (!features::IsPdfOcrEnabled())

				    return;

				  // If requested by the user, we store the raw image data so that the OCR

				  // service can try and retrieve textual and layout information from the image.

				  // This is because alt text might be empty, or the PDF might simply be

				  // untagged for accessibility.

				  for (Image& image : images_) {

				    if (!image.alt_text.empty())

				      continue;

				    FPDF_PAGEOBJECT page_object =

				        FPDFPage_GetObject(page, image.page_object_index);

				    // OCR needs the image with the highest available quality. To get it, the

				    // image transform matrix is reset to no-scale, the bitmap is extracted,

				    // and then the original matrix is restored.

				    FS_MATRIX original_matrix;

				    if (!FPDFPageObj_GetMatrix(page_object, &original_matrix)) {

				      continue;

				    }

				    // Get the actual image size.

				    unsigned int width;

				    unsigned int height;

				    if (!FPDFImageObj_GetImagePixelSize(page_object, &width, &height)) {

				      continue;

				    }

				    // Resize the matrix to actual size.

				    FS_MATRIX new_matrix = {static_cast<float>(width),  0, 0,

				                            static_cast<float>(height), 0, 0};

				    if (!FPDFPageObj_SetMatrix(page_object, &new_matrix)) {

				      continue;

				    }

				    ScopedFPDFBitmap bitmap(

				        FPDFImageObj_GetRenderedBitmap(engine_->doc(), page, page_object));

				    // Restore the original matrix.

				    CHECK(FPDFPageObj_SetMatrix(page_object, &original_matrix));

				    if (!bitmap)

				      continue;

				    CHECK_EQ(FPDFBitmap_GetFormat(bitmap.get()), FPDFBitmap_BGRA);

				    SkImageInfo info = SkImageInfo::Make(

				        FPDFBitmap_GetWidth(bitmap.get()), FPDFBitmap_GetHeight(bitmap.get()),

				        kBGRA_8888_SkColorType, kOpaque_SkAlphaType);

				    const size_t row_bytes = FPDFBitmap_GetStride(bitmap.get());

				    SkPixmap pixels(info, FPDFBitmap_GetBuffer(bitmap.get()), row_bytes);

				    if (!image.image_data.tryAllocPixels(info, row_bytes)) {

				      continue;

				    }

				    image.image_data.writePixels(pixels);

				    SkBitmapOperations::RotationAmount rotation;

				    switch (FPDFPage_GetRotation(page)) {

				      case 0:

				        continue;

				      case 1:

				        rotation = SkBitmapOperations::RotationAmount::ROTATION_90_CW;

				        break;

				      case 2:

				        rotation = SkBitmapOperations::RotationAmount::ROTATION_180_CW;

				        break;

				      case 3:

				        rotation = SkBitmapOperations::RotationAmount::ROTATION_270_CW;

				        break;

				    }

				    image.image_data = SkBitmapOperations::Rotate(image.image_data, rotation);

				  }

				}

				void PDFiumPage::PopulateImageAltText(

									
										11

pdf/pdfium/pdfium_page.h
									
				@ -89,9 +89,6 @@ class PDFiumPage {

				  // `image_data` field.

				  std::vector<AccessibilityImageInfo> GetImageInfo(uint32_t text_run_count);

				  // Returns the image as a 32-bit bitmap format for OCR.

				  SkBitmap GetImageForOcr(int page_object_index);

				  // For all the highlights on the page, get their underlying text ranges and

				  // bounding boxes.

				  std::vector<AccessibilityHighlightInfo> GetHighlightInfo(

				@ -234,7 +231,6 @@ class PDFiumPage {

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageTest, CalculateImages);

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageTest, ImageAltText);

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageDataTest, ImageData);

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageDataTest, ImageDataForNonImage);

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageDataTest, RotatedPageImageData);

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, AnnotLinkGeneration);

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, GetLinkTarget);

				@ -273,12 +269,15 @@ class PDFiumPage {

				    Image(const Image& other);

				    ~Image();

				    // Index of the object in its page.

				    int page_object_index;

				    // Alt text is available only for PDFs that are tagged for accessibility.

				    std::string alt_text;

				    gfx::Rect bounding_rect;

				    // Image data is only stored if the user has requested that the OCR service

				    // try to retrieve textual and layout information from this image. The

				    // bitmap will have the same size as the image in the PDF file, and will

				    // not be scaled.

				    SkBitmap image_data;

				  };

				  // Represents a highlight within the page.

									
										51

pdf/pdfium/pdfium_page_unittest.cc
									
				@ -541,20 +541,16 @@ TEST_P(PDFiumPageImageDataTest, ImageData) {

				  ASSERT_EQ(3u, page.images_.size());

				  ASSERT_FALSE(page.images_[0].alt_text.empty());

				  SkBitmap image_bitmap = engine->GetImageForOcr(

				      /*page_index=*/0, page.images_[0].page_object_index);

				  EXPECT_FALSE(image_bitmap.drawsNothing());

				  EXPECT_EQ(image_bitmap.width(), 50);

				  EXPECT_EQ(image_bitmap.height(), 50);

				  EXPECT_TRUE(page.images_[0].image_data.drawsNothing());

				  EXPECT_EQ(page.images_[0].image_data.width(), 0);

				  EXPECT_EQ(page.images_[0].image_data.height(), 0);

				  ASSERT_TRUE(page.images_[2].alt_text.empty());

				  ASSERT_TRUE(page.images_[1].alt_text.empty());

				  image_bitmap = engine->GetImageForOcr(/*page_index=*/0,

				                                        page.images_[1].page_object_index);

				  EXPECT_FALSE(image_bitmap.drawsNothing());

				  // While the scaled image size is 20x20, `image_data` has the same size as

				  // the image in the PDF file, which is 50x50, and is not scaled.

				  EXPECT_EQ(image_bitmap.width(), 50);

				  EXPECT_EQ(image_bitmap.height(), 50);

				  EXPECT_EQ(page.images_[1].image_data.width(), 50);

				  EXPECT_EQ(page.images_[1].image_data.height(), 50);

				}

				TEST_P(PDFiumPageImageDataTest, RotatedPageImageData) {

				@ -570,37 +566,8 @@ TEST_P(PDFiumPageImageDataTest, RotatedPageImageData) {

				  // This page is rotated, therefore the extracted image size is 25x100 while

				  // the stored image is 100x25.

				  SkBitmap image_bitmap = engine->GetImageForOcr(

				      /*page_index=*/0, page.images_[0].page_object_index);

				  EXPECT_EQ(image_bitmap.width(), 25);

				  EXPECT_EQ(image_bitmap.height(), 100);

				}

				TEST_P(PDFiumPageImageDataTest, ImageDataForNonImage) {

				  TestClient client;

				  std::unique_ptr<PDFiumEngine> engine =

				      InitializeEngine(&client, FILE_PATH_LITERAL("text_with_image.pdf"));

				  ASSERT_TRUE(engine);

				  ASSERT_EQ(1, engine->GetNumberOfPages());

				  PDFiumPage& page = GetPDFiumPageForTest(*engine, 0);

				  page.CalculateImages();

				  ASSERT_EQ(3u, page.images_.size());

				  ASSERT_EQ(1, page.images_[0].page_object_index);

				  // Existing non-image object.

				  SkBitmap image_bitmap = engine->GetImageForOcr(

				      /*page_index=*/0, /*image_index=*/0);

				  EXPECT_TRUE(image_bitmap.drawsNothing());

				  EXPECT_EQ(image_bitmap.width(), 0);

				  EXPECT_EQ(image_bitmap.height(), 0);

				  // Out of range.

				  image_bitmap = engine->GetImageForOcr(

				      /*page_index=*/0, /*image_index=*/1000);

				  EXPECT_TRUE(image_bitmap.drawsNothing());

				  EXPECT_EQ(image_bitmap.width(), 0);

				  EXPECT_EQ(image_bitmap.height(), 0);

				  EXPECT_EQ(page.images_[0].image_data.width(), 25);

				  EXPECT_EQ(page.images_[0].image_data.height(), 100);

				}

				INSTANTIATE_TEST_SUITE_P(All, PDFiumPageImageDataTest, testing::Bool());

Revert "Extract high quality images from PDF only when they are needed for OCR."

1 components/pdf/renderer/DEPS

65 components/pdf/renderer/pdf_accessibility_tree.cc

18 components/pdf/renderer/pdf_accessibility_tree.h

639 components/pdf/renderer/pdf_accessibility_tree_browsertest.cc

6 components/pdf/renderer/pdf_view_web_plugin_client.cc

3 components/pdf/renderer/pdf_view_web_plugin_client.h

1 pdf/BUILD.gn

4 pdf/accessibility_structs.cc

7 pdf/accessibility_structs.h

22 pdf/pdf_accessibility_image_fetcher.h

2 pdf/pdf_engine.h

10 pdf/pdf_view_web_plugin.cc

9 pdf/pdf_view_web_plugin.h

3 pdf/pdf_view_web_plugin_unittest.cc

5 pdf/pdfium/pdfium_engine.cc

1 pdf/pdfium/pdfium_engine.h

148 pdf/pdfium/pdfium_page.cc

11 pdf/pdfium/pdfium_page.h

51 pdf/pdfium/pdfium_page_unittest.cc

1

components/pdf/renderer/DEPS

65

components/pdf/renderer/pdf_accessibility_tree.cc

18

components/pdf/renderer/pdf_accessibility_tree.h

639

components/pdf/renderer/pdf_accessibility_tree_browsertest.cc

6

components/pdf/renderer/pdf_view_web_plugin_client.cc

3

components/pdf/renderer/pdf_view_web_plugin_client.h

1

pdf/BUILD.gn

4

pdf/accessibility_structs.cc

7

pdf/accessibility_structs.h

22

pdf/pdf_accessibility_image_fetcher.h

2

pdf/pdf_engine.h

10

pdf/pdf_view_web_plugin.cc

9

pdf/pdf_view_web_plugin.h

3

pdf/pdf_view_web_plugin_unittest.cc

5

pdf/pdfium/pdfium_engine.cc

1

pdf/pdfium/pdfium_engine.h

148

pdf/pdfium/pdfium_page.cc

11

pdf/pdfium/pdfium_page.h

51

pdf/pdfium/pdfium_page_unittest.cc