Transfers image pixels from PDFs to the renderer if OCR feature is enabled

Inaccessible PDFs will be made accessible via optical character recognition (OCR).
This patch simply collects the raw pixels from every image in a PDF that has no alt text
(for example, because the PDF is untagged), so that the OCR service can extract textual and layout information from it.
More info in:
https://docs.google.com/document/d/1Xe1BBncEddGz8DYYnGdTHI0cwhJ_3BvfELCQY6T1rBw/edit?usp=sharing

R=dtseng@chromium.org

AX-Relnotes: n/a.
Bug: 1248380
Change-Id: I938acf62339fb823b462640da8db3fc13e015905
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3516332
Reviewed-by: Ramin Halavati <rhalavati@chromium.org>
Auto-Submit: Nektarios Paisios <nektar@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: John Abd-El-Malek <jam@chromium.org>
Commit-Queue: John Abd-El-Malek <jam@chromium.org>
Cr-Commit-Position: refs/heads/main@{#980985}

This commit is contained in:

Nektarios Paisios

2022-03-15 06:10:53 +00:00

committed by

Chromium LUCI CQ

parent 5fbd681585

commit 874297171d

11 changed files with 106 additions and 11 deletions

components/pdf/renderer

BUILD.gn DEPS

pdf

BUILD.gn accessibility_structs.cc accessibility_structs.h pdf_engine.h

pdfium

DEPS accessibility_unittest.cc pdfium_page.cc pdfium_page.h pdfium_page_unittest.cc

1

components/pdf/renderer/BUILD.gn

 @ -39,6 +39,7 @@ static_library("renderer") {
     "//pdf:features",
     "//pdf:pdf_view_web_plugin",
     "//printing/buildflags",
     "//skia",
     "//third_party/blink/public:blink",
     "//third_party/blink/public/strings:accessibility_strings",
     "//third_party/icu",

1

components/pdf/renderer/DEPS

 @ -13,6 +13,7 @@ include_rules = [
   "+pdf/pdf_view_web_plugin.h",
   "+printing/buildflags/buildflags.h",
   "+third_party/blink/public",
   "+third_party/skia/include/core",
   "+ui/accessibility",
   "+ui/base",
 ]

1

pdf/BUILD.gn

 @ -228,6 +228,7 @@ if (enable_pdf) {
     deps = [
       "//base",
       "//skia",
       "//ui/gfx/geometry",
     ]
   }

									
										8

pdf/accessibility_structs.cc
									
				@ -76,8 +76,12 @@ AccessibilityImageInfo::AccessibilityImageInfo() = default;

				AccessibilityImageInfo::AccessibilityImageInfo(const std::string& alt_text,

				                                               uint32_t text_run_index,

				                                               const gfx::RectF& bounds)

				    : alt_text(alt_text), text_run_index(text_run_index), bounds(bounds) {}

				                                               const gfx::RectF& bounds,

				                                               const SkBitmap& image_data)

				    : alt_text(alt_text),

				      text_run_index(text_run_index),

				      bounds(bounds),

				      image_data(image_data) {}

				AccessibilityImageInfo::AccessibilityImageInfo(

				    const AccessibilityImageInfo& other) = default;

									
										10

pdf/accessibility_structs.h
									
				@ -10,6 +10,7 @@

				#include <string>

				#include <vector>

				#include "third_party/skia/include/core/SkBitmap.h"

				#include "ui/gfx/geometry/point.h"

				#include "ui/gfx/geometry/rect.h"

				#include "ui/gfx/geometry/rect_f.h"

				@ -130,18 +131,25 @@ struct AccessibilityImageInfo {

				  AccessibilityImageInfo();

				  AccessibilityImageInfo(const std::string& alt_text,

				                         uint32_t text_run_index,

				                         const gfx::RectF& bounds);

				                         const gfx::RectF& bounds,

				                         const SkBitmap& image_data);

				  AccessibilityImageInfo(const AccessibilityImageInfo& other);

				  ~AccessibilityImageInfo();

				  // Alternate text for the image provided by PDF.

				  std::string alt_text;

				  // We anchor the image to a char index, this denotes the text run before

				  // which the image should be inserted in the accessibility tree. The text run

				  // at this index should contain the anchor char index.

				  uint32_t text_run_index = 0;

				  // Bounding box of the image.

				  gfx::RectF bounds;

				  // Only populated if `alt_text` is empty or unavailable, and if the user has

				  // requested that the OCR service tag the PDF so that it is made accessible.

				  SkBitmap image_data;

				};

				struct AccessibilityHighlightInfo {

									
										5

pdf/pdf_engine.h
									
				@ -391,7 +391,10 @@ class PDFEngine {

				      int page_index,

				      const std::vector<AccessibilityTextRunInfo>& text_runs) = 0;

				  // For all the images in page `page_index`, get their alt texts and bounding

				  // boxes.

				  // boxes. If the alt text is empty or unavailable, and if the user has

				  // requested that the OCR service tag the PDF so that it is made accessible,

				  // transfer the raw image pixels in the `image_data` field. Otherwise do not

				  // populate the `image_data` field.

				  virtual std::vector<AccessibilityImageInfo> GetImageInfo(

				      int page_index,

				      uint32_t text_run_count) = 0;

1

pdf/pdfium/DEPS

 @ -1,5 +1,6 @@
 include_rules = [
   "+components/services/font/public/cpp",
   "+third_party/pdfium/public",
   "+ui/accessibility",
   "+ui/gfx/codec",
 ]

									
										6

pdf/pdfium/accessibility_unittest.cc
									
				@ -114,9 +114,9 @@ TEST_F(AccessibilityTest, GetAccessibilityPage) {

				TEST_F(AccessibilityTest, GetAccessibilityImageInfo) {

				  static const AccessibilityImageInfo kExpectedImageInfo[] = {

				      {"Image 1", 0, {380, 78, 67, 68}},

				      {"Image 2", 0, {380, 385, 27, 28}},

				      {"Image 3", 0, {380, 678, 1, 1}}};

				      {"Image 1", 0, {380, 78, 67, 68}, {}},

				      {"Image 2", 0, {380, 385, 27, 28}, {}},

				      {"Image 3", 0, {380, 678, 1, 1}, {}}};

				  TestClient client;

				  std::unique_ptr<PDFiumEngine> engine =

									
										32

pdf/pdfium/pdfium_page.cc
									
				@ -31,6 +31,9 @@

				#include "third_party/pdfium/public/cpp/fpdf_scopers.h"

				#include "third_party/pdfium/public/fpdf_annot.h"

				#include "third_party/pdfium/public/fpdf_catalog.h"

				#include "third_party/skia/include/core/SkImageInfo.h"

				#include "third_party/skia/include/core/SkPixmap.h"

				#include "ui/accessibility/accessibility_features.h"

				#include "ui/gfx/geometry/point.h"

				#include "ui/gfx/geometry/point_f.h"

				#include "ui/gfx/geometry/rect.h"

				@ -683,6 +686,7 @@ std::vector<AccessibilityImageInfo> PDFiumPage::GetImageInfo(

				    cur_info.bounds =

				        gfx::RectF(image.bounding_rect.x(), image.bounding_rect.y(),

				                   image.bounding_rect.width(), image.bounding_rect.height());

				    cur_info.image_data = image.image_data;

				    image_info.push_back(std::move(cur_info));

				  }

				  return image_info;

				@ -1160,6 +1164,7 @@ void PDFiumPage::CalculateImages() {

				      continue;

				    Image image;

				    image.page_object_index = i;

				    image.bounding_rect = PageToScreen(gfx::Point(), 1.0, left, top, right,

				                                       bottom, PageOrientation::kOriginal);

				@ -1182,6 +1187,33 @@ void PDFiumPage::CalculateImages() {

				  if (!marked_content_id_image_map.empty())

				    PopulateImageAltText(marked_content_id_image_map);

				  if (!features::IsPdfOcrEnabled())

				    return;

				  // If requested by the user, we store the raw image data so that the OCR

				  // service can try and retrieve textual and layout information from the image.

				  // This is because alt text might be empty, or the PDF might simply be

				  // untagged for accessibility.

				  for (Image& image : images_) {

				    if (!image.alt_text.empty())

				      continue;

				    FPDF_PAGEOBJECT page_object =

				        FPDFPage_GetObject(page, image.page_object_index);

				    ScopedFPDFBitmap bitmap(

				        FPDFImageObj_GetRenderedBitmap(engine_->doc(), page, page_object));

				    if (!bitmap)

				      continue;

				    SkImageInfo info = SkImageInfo::Make(

				        FPDFBitmap_GetWidth(bitmap.get()), FPDFBitmap_GetHeight(bitmap.get()),

				        kBGRA_8888_SkColorType, kOpaque_SkAlphaType);

				    const size_t row_bytes = FPDFBitmap_GetStride(bitmap.get());

				    SkPixmap pixels(info, FPDFBitmap_GetBuffer(bitmap.get()), row_bytes);

				    if (image.image_data.tryAllocPixels(info, row_bytes))

				      image.image_data.writePixels(pixels);

				  }

				}

				void PDFiumPage::PopulateImageAltText(

									
										17

pdf/pdfium/pdfium_page.h
									
				@ -21,6 +21,7 @@

				#include "third_party/pdfium/public/fpdf_doc.h"

				#include "third_party/pdfium/public/fpdf_formfill.h"

				#include "third_party/pdfium/public/fpdf_text.h"

				#include "third_party/skia/include/core/SkBitmap.h"

				#include "ui/gfx/geometry/point_f.h"

				#include "ui/gfx/geometry/rect.h"

				@ -75,8 +76,11 @@ class PDFiumPage {

				  // bounding boxes.

				  std::vector<AccessibilityLinkInfo> GetLinkInfo(

				      const std::vector<AccessibilityTextRunInfo>& text_runs);

				  // For all the images on the page, get their alt texts and bounding boxes.

				  // For all the images on the page, get their alt texts and bounding boxes. If

				  // the alt text is empty or unavailable, and if the user has requested that

				  // the OCR service tag the PDF so that it is made accessible, transfer the raw

				  // image pixels in the `image_data` field. Otherwise do not populate the

				  // `image_data` field.

				  std::vector<AccessibilityImageInfo> GetImageInfo(uint32_t text_run_count);

				  // For all the highlights on the page, get their underlying text ranges and

				@ -220,6 +224,7 @@ class PDFiumPage {

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageHighlightTest, PopulateHighlights);

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageTest, CalculateImages);

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageTest, ImageAltText);

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageDataTest, ImageData);

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, AnnotLinkGeneration);

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, GetLinkTarget);

				  FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, LinkGeneration);

				@ -256,9 +261,13 @@ class PDFiumPage {

				    Image(const Image& other);

				    ~Image();

				    gfx::Rect bounding_rect;

				    // Alt text is available only for tagged PDFs.

				    int page_object_index;

				    // Alt text is available only for PDFs that are tagged for accessibility.

				    std::string alt_text;

				    gfx::Rect bounding_rect;

				    // Image data is only stored if the user has requested that the OCR service

				    // try to retrieve textual and layout information from this image.

				    SkBitmap image_data;

				  };

				  // Represents a highlight within the page.

									
										35

pdf/pdfium/pdfium_page_unittest.cc
									
				@ -11,6 +11,7 @@

				#include "base/files/file_path.h"

				#include "base/strings/string_util.h"

				#include "base/strings/stringprintf.h"

				#include "base/test/scoped_feature_list.h"

				#include "cc/test/pixel_comparator.h"

				#include "cc/test/pixel_test_utils.h"

				#include "pdf/accessibility_structs.h"

				@ -23,6 +24,7 @@

				#include "third_party/abseil-cpp/absl/types/optional.h"

				#include "third_party/pdfium/public/fpdf_formfill.h"

				#include "third_party/skia/include/core/SkBitmap.h"

				#include "ui/accessibility/accessibility_features.h"

				#include "ui/gfx/geometry/rect.h"

				#include "ui/gfx/geometry/rect_f.h"

				#include "ui/gfx/geometry/size_f.h"

				@ -283,6 +285,39 @@ TEST_F(PDFiumPageImageTest, ImageAltText) {

				  EXPECT_EQ("", page.images_[2].alt_text);

				}

				// Test fixture for image-data tests. It extends PDFiumPageImageTest and

				// constructs a ScopedFeatureList member from features::kPdfOcr -- presumably

				// enabling the PDF OCR feature for the lifetime of each test (confirm against

				// base::test::ScopedFeatureList).

				class PDFiumPageImageDataTest : public PDFiumPageImageTest {

				 public:

				  PDFiumPageImageDataTest() : enable_pdf_ocr_({features::kPdfOcr}) {}

				  // The fixture is neither copyable nor assignable.

				  PDFiumPageImageDataTest(const PDFiumPageImageDataTest&) = delete;

				  PDFiumPageImageDataTest& operator=(const PDFiumPageImageDataTest&) = delete;

				  ~PDFiumPageImageDataTest() override = default;

				 private:

				  // Feature-list override built from features::kPdfOcr in the constructor.

				  base::test::ScopedFeatureList enable_pdf_ocr_;

				};

				// Checks which images get raw pixel data after CalculateImages() runs under

				// the PDFiumPageImageDataTest fixture: an image that has alt text must keep

				// an empty bitmap, while an image without alt text must carry pixels.

				TEST_F(PDFiumPageImageDataTest, ImageData) {

				  TestClient client;

				  std::unique_ptr<PDFiumEngine> engine =

				      InitializeEngine(&client, FILE_PATH_LITERAL("text_with_image.pdf"));

				  ASSERT_TRUE(engine);

				  ASSERT_EQ(1, engine->GetNumberOfPages());

				  PDFiumPage& page = GetPDFiumPageForTest(*engine, 0);

				  page.CalculateImages();

				  ASSERT_EQ(3u, page.images_.size());

				  // Image 0 has alt text, so no pixels should have been stored for it.

				  ASSERT_FALSE(page.images_[0].alt_text.empty());

				  EXPECT_TRUE(page.images_[0].image_data.drawsNothing());

				  EXPECT_EQ(page.images_[0].image_data.width(), 0);

				  EXPECT_EQ(page.images_[0].image_data.height(), 0);

				  // The precondition must be checked on the same image whose pixels are

				  // inspected below (index 1); it previously asserted on index 2.

				  ASSERT_TRUE(page.images_[1].alt_text.empty());

				  EXPECT_EQ(page.images_[1].image_data.width(), 20);

				  EXPECT_EQ(page.images_[1].image_data.height(), 20);

				}

				using PDFiumPageTextTest = PDFiumTestBase;

				TEST_F(PDFiumPageTextTest, TextRunBounds) {

Transfers image pixels from PDFs to the renderer if OCR feature is enabled

1 components/pdf/renderer/BUILD.gn

1 components/pdf/renderer/DEPS

1 pdf/BUILD.gn

8 pdf/accessibility_structs.cc

10 pdf/accessibility_structs.h

5 pdf/pdf_engine.h

1 pdf/pdfium/DEPS

6 pdf/pdfium/accessibility_unittest.cc

32 pdf/pdfium/pdfium_page.cc

17 pdf/pdfium/pdfium_page.h

35 pdf/pdfium/pdfium_page_unittest.cc

1

components/pdf/renderer/BUILD.gn

1

components/pdf/renderer/DEPS

1

pdf/BUILD.gn

8

pdf/accessibility_structs.cc

10

pdf/accessibility_structs.h

5

pdf/pdf_engine.h

1

pdf/pdfium/DEPS

6

pdf/pdfium/accessibility_unittest.cc

32

pdf/pdfium/pdfium_page.cc

17

pdf/pdfium/pdfium_page.h

35

pdf/pdfium/pdfium_page_unittest.cc