Tagged PDFs: Consolidate code for retrieving text & image info into PDFiumPage class

Follow up to https://crrev.com/c/5980169

AX-Relnotes: n/a.
Bug: 40707542
Change-Id: Ia0dc2d3223a8d8b6fff3a3c36d472d8554868caa
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6053970
Reviewed-by: Lei Zhang <thestig@chromium.org>
Auto-Submit: Nektarios Paisios <nektar@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1425200}

This commit is contained in:

Nektarios Paisios

2025-02-26 09:17:51 -08:00

committed by

Chromium LUCI CQ

parent a60402427a

commit 98cf8b5d75

3 changed files with 130 additions and 113 deletions

pdf

accessibility.cc

pdfium

pdfium_page.cc pdfium_page.h

									
										64

pdf/accessibility.cc
									
				@ -39,7 +39,6 @@ void GetAccessibilityInfo(PDFiumEngine* engine,

				  CHECK(page);

				  const int raw_char_count = page->GetCharCount();

				  // Treat a char count of -1 (error) as 0 (an empty page), since

				  // other pages might have valid content.

				  const uint32_t char_count = std::max<uint32_t>(raw_char_count, 0);

				@ -48,70 +47,9 @@ void GetAccessibilityInfo(PDFiumEngine* engine,

				  page_info.bounds = page->rect();

				  page_info.char_count = char_count;

				  page_info.is_searchified = page->IsPageSearchified();

				  chars.resize(page_info.char_count);

				  for (uint32_t i = 0; i < char_count; ++i) {

				    chars[i].unicode_character = page->GetCharUnicode(i);

				  }

				  // TODO(crbug.com/40707542): Move the entire logic present in the following

				  // while loop to `PDFiumPage` class.

				  uint32_t char_index = 0;

				  while (char_index < char_count) {

				    std::optional<AccessibilityTextRunInfo> text_run_info_result =

				        page->GetTextRunInfo(char_index);

				    CHECK(text_run_info_result.has_value());

				    const auto& text_run_info = text_run_info_result.value();

				    uint32_t text_run_end = char_index + text_run_info.len;

				    CHECK_LE(text_run_end, char_count);

				    text_runs.push_back(text_run_info);

				    // We need to provide enough information to draw a bounding box

				    // around any arbitrary text range, but the bounding boxes of characters

				    // we get from PDFium don't necessarily "line up".

				    // Example for LTR text direction: walk through the

				    // characters in each text run and let the width of each character be

				    // the difference between the x coordinate of one character and the

				    // x coordinate of the next. The rest of the bounds of each character

				    // can be computed from the bounds of the text run.

				    // The same idea is used for RTL, TTB and BTT text direction.

				    gfx::RectF char_bounds = page->GetCharBounds(char_index);

				    for (uint32_t i = char_index; i < text_run_end - 1; i++) {

				      CHECK_LT(i + 1, char_count);

				      gfx::RectF next_char_bounds = page->GetCharBounds(i + 1);

				      double& char_width = chars[i].char_width;

				      switch (text_run_info.direction) {

				        case AccessibilityTextDirection::kNone:

				        case AccessibilityTextDirection::kLeftToRight:

				          char_width = next_char_bounds.x() - char_bounds.x();

				          break;

				        case AccessibilityTextDirection::kTopToBottom:

				          char_width = next_char_bounds.y() - char_bounds.y();

				          break;

				        case AccessibilityTextDirection::kRightToLeft:

				          char_width = char_bounds.right() - next_char_bounds.right();

				          break;

				        case AccessibilityTextDirection::kBottomToTop:

				          char_width = char_bounds.bottom() - next_char_bounds.bottom();

				          break;

				      }

				      char_bounds = next_char_bounds;

				    }

				    double& char_width = chars[text_run_end - 1].char_width;

				    if (text_run_info.direction == AccessibilityTextDirection::kBottomToTop ||

				        text_run_info.direction == AccessibilityTextDirection::kTopToBottom) {

				      char_width = char_bounds.height();

				    } else {

				      char_width = char_bounds.width();

				    }

				    char_index += text_run_info.len;

				  }

				  page->PopulateTextRunTypeAndImageAltText(text_runs);

				  page->GetTextAndImageInfo(text_runs, chars, page_objects.images);

				  page_info.text_run_count = text_runs.size();

				  page_objects.links = page->GetLinkInfo(text_runs);

				  page_objects.images = page->GetImageInfo(page_info.text_run_count);

				  page_objects.highlights = page->GetHighlightInfo(text_runs);

				  page_objects.form_fields =

				      GetAccessibilityFormFieldInfo(page, page_info.text_run_count);

									
										157

pdf/pdfium/pdfium_page.cc
									
				@ -541,6 +541,76 @@ int PDFiumPage::GetCharCount() {

				  return FPDFText_CountChars(GetTextPage());

				}

				// Collects the accessibility data for this page in a single pass.
				//
				// `chars` is resized to the page's character count and filled with each
				// character's Unicode value and width. One entry per text run is appended to
				// `text_runs`; existing entries are left in place. `images` is overwritten with
				// the result of GetImageInfo().
				void PDFiumPage::GetTextAndImageInfo(
				    std::vector<AccessibilityTextRunInfo>& text_runs,
				    std::vector<AccessibilityCharInfo>& chars,
				    std::vector<AccessibilityImageInfo>& images) {
				  const int raw_char_count = GetCharCount();
				  // Treat a char count of -1 (error) as 0 (an empty page), since
				  // other pages might have valid content. The clamp has to happen on the
				  // signed value: std::max<uint32_t>(-1, 0) would first convert -1 to
				  // 4294967295 and return that instead of 0.
				  const uint32_t char_count =
				      static_cast<uint32_t>(std::max(raw_char_count, 0));
				  chars.resize(char_count);
				  for (uint32_t i = 0; i < char_count; ++i) {
				    chars[i].unicode_character = GetCharUnicode(i);
				  }

				  uint32_t char_index = 0;
				  while (char_index < char_count) {
				    std::optional<AccessibilityTextRunInfo> text_run_info_result =
				        GetTextRunInfo(char_index);
				    CHECK(text_run_info_result.has_value());
				    AccessibilityTextRunInfo& text_run_info = *text_run_info_result;
				    uint32_t text_run_end = char_index + text_run_info.len;
				    // A run must cover at least one character. Otherwise `text_run_end - 1`
				    // below would underflow and `char_index` would never advance.
				    CHECK_LT(char_index, text_run_end);
				    CHECK_LE(text_run_end, char_count);
				    text_runs.push_back(text_run_info);

				    // We need to provide enough information to draw a bounding box
				    // around any arbitrary text range, but the bounding boxes of characters
				    // we get from PDFium don't necessarily "line up".
				    // Example for LTR text direction: walk through the
				    // characters in each text run and let the width of each character be
				    // the difference between the x coordinate of one character and the
				    // x coordinate of the next. The rest of the bounds of each character
				    // can be computed from the bounds of the text run.
				    // The same idea is used for RTL, TTB and BTT text direction.
				    gfx::RectF char_bounds = GetCharBounds(char_index);
				    for (uint32_t i = char_index; i < text_run_end - 1; i++) {
				      CHECK_LT(i + 1, char_count);
				      gfx::RectF next_char_bounds = GetCharBounds(i + 1);
				      double& char_width = chars[i].char_width;
				      switch (text_run_info.direction) {
				        case AccessibilityTextDirection::kNone:
				        case AccessibilityTextDirection::kLeftToRight:
				          char_width = next_char_bounds.x() - char_bounds.x();
				          break;
				        case AccessibilityTextDirection::kTopToBottom:
				          char_width = next_char_bounds.y() - char_bounds.y();
				          break;
				        case AccessibilityTextDirection::kRightToLeft:
				          char_width = char_bounds.right() - next_char_bounds.right();
				          break;
				        case AccessibilityTextDirection::kBottomToTop:
				          char_width = char_bounds.bottom() - next_char_bounds.bottom();
				          break;
				      }
				      char_bounds = next_char_bounds;
				    }

				    // The last character of the run has no successor inside the run, so its
				    // extent comes from its own bounding box.
				    double& char_width = chars[text_run_end - 1].char_width;
				    if (text_run_info.direction == AccessibilityTextDirection::kBottomToTop ||
				        text_run_info.direction == AccessibilityTextDirection::kTopToBottom) {
				      char_width = char_bounds.height();
				    } else {
				      char_width = char_bounds.width();
				    }

				    char_index = text_run_end;
				  }

				  // Text run types and image alt texts are populated before the image info
				  // is read out.
				  PopulateTextRunTypeAndImageAltText(text_runs);
				  images = GetImageInfo(text_runs.size());
				}

				std::optional<AccessibilityTextRunInfo> PDFiumPage::GetTextRunInfo(

				    int start_char_index) {

				  FPDF_PAGE page = GetPage();

				@ -999,49 +1069,6 @@ std::vector<AccessibilityTextFieldInfo> PDFiumPage::GetTextFieldInfo(

				  return text_field_info;

				}

				// Walks the page's struct tree and fills in the text run type for entries of
				// `text_runs`, and the alt text for images tracked in
				// `marked_content_id_image_map_`, by matching marked content IDs. Returns
				// early when the page has no struct tree or when there is nothing to match.
				void PDFiumPage::PopulateTextRunTypeAndImageAltText(
				    std::vector<AccessibilityTextRunInfo>& text_runs) {
				  CalculateImages();

				  ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(GetPage()));
				  if (!struct_tree) {
				    return;
				  }

				  // TODO(crbug.com/40707542): Consolidate `AccessibilityTextRunInfo` building
				  // logic into this class and remove the following block.
				  MarkedContentIdToTextRunInfoMap marked_content_id_text_run_info_map;
				  if (base::FeatureList::IsEnabled(chrome_pdf::features::kPdfTags)) {
				    FPDF_TEXTPAGE text_page = GetTextPage();
				    uint32_t char_index = 0;
				    for (auto& text_run : text_runs) {
				      FPDF_PAGEOBJECT text_object =
				          FPDFText_GetTextObject(text_page, char_index);
				      // `char_index` must move to the start of the next run on every
				      // iteration, including for runs without a marked content ID.
				      // Skipping the increment would look up every later run at a stale
				      // character index.
				      char_index += text_run.len;

				      int marked_content_id = FPDFPageObj_GetMarkedContentID(text_object);
				      if (marked_content_id == -1) {
				        continue;
				      }
				      auto [iter, _] = marked_content_id_text_run_info_map.emplace(
				          marked_content_id, std::vector<raw_ptr<AccessibilityTextRunInfo>>());
				      iter->second.push_back(&text_run);
				    }
				  }

				  if (marked_content_id_text_run_info_map.empty() &&
				      marked_content_id_image_map_.empty()) {
				    return;
				  }

				  // `visited_elements` is shared across all top-level children of the tree.
				  std::set<FPDF_STRUCTELEMENT> visited_elements;
				  int tree_children_count = FPDF_StructTree_CountChildren(struct_tree.get());
				  for (int i = 0; i < tree_children_count; ++i) {
				    FPDF_STRUCTELEMENT current_element =
				        FPDF_StructTree_GetChildAtIndex(struct_tree.get(), i);
				    PopulateTextRunTypeAndImageAltTextForStructElement(
				        current_element, visited_elements, marked_content_id_text_run_info_map);
				  }
				}

				PDFiumPage::Area PDFiumPage::GetLinkTargetAtIndex(int link_index,

				                                                  LinkTarget* target) {

				  if (!available_ || link_index < 0)

				@ -1465,6 +1492,50 @@ void PDFiumPage::CalculateImages() {

				  }

				}

				// Walks the page's struct tree and fills in the text run type for entries of
				// `text_runs`, and the alt text for images tracked in
				// `marked_content_id_image_map_`, by matching marked content IDs. Returns
				// early when the page has no struct tree or when there is nothing to match.
				void PDFiumPage::PopulateTextRunTypeAndImageAltText(
				    std::vector<AccessibilityTextRunInfo>& text_runs) {
				  CalculateImages();

				  ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(GetPage()));
				  if (!struct_tree) {
				    return;
				  }

				  // TODO(crbug.com/40707542): Consolidate `AccessibilityTextRunInfo` building
				  // logic into this class and remove the following block.
				  MarkedContentIdToTextRunInfoMap marked_content_id_text_run_info_map;
				  if (base::FeatureList::IsEnabled(chrome_pdf::features::kPdfTags)) {
				    FPDF_TEXTPAGE text_page = GetTextPage();
				    uint32_t char_index = 0;
				    for (auto& text_run : text_runs) {
				      FPDF_PAGEOBJECT text_object =
				          FPDFText_GetTextObject(text_page, char_index);
				      // `char_index` must move to the start of the next run on every
				      // iteration, including for runs without a marked content ID.
				      // Skipping the increment would look up every later run at a stale
				      // character index.
				      char_index += text_run.len;

				      int marked_content_id = FPDFPageObj_GetMarkedContentID(text_object);
				      if (marked_content_id == -1) {
				        continue;
				      }
				      auto [iter, _] = marked_content_id_text_run_info_map.emplace(
				          marked_content_id, std::vector<raw_ptr<AccessibilityTextRunInfo>>());
				      iter->second.push_back(&text_run);
				    }
				  }

				  if (marked_content_id_text_run_info_map.empty() &&
				      marked_content_id_image_map_.empty()) {
				    return;
				  }

				  // `visited_elements` is shared across all top-level children of the tree.
				  std::set<FPDF_STRUCTELEMENT> visited_elements;
				  int tree_children_count = FPDF_StructTree_CountChildren(struct_tree.get());
				  for (int i = 0; i < tree_children_count; ++i) {
				    FPDF_STRUCTELEMENT current_element =
				        FPDF_StructTree_GetChildAtIndex(struct_tree.get(), i);
				    PopulateTextRunTypeAndImageAltTextForStructElement(
				        current_element, visited_elements, marked_content_id_text_run_info_map);
				  }
				}

				void PDFiumPage::PopulateTextRunTypeAndImageAltTextForStructElement(

				    FPDF_STRUCTELEMENT current_element,

				    std::set<FPDF_STRUCTELEMENT>& visited_elements,

									
										22

pdf/pdfium/pdfium_page.h
									
				@ -41,6 +41,7 @@ namespace chrome_pdf {

				class PDFiumEngine;

				class Thumbnail;

				struct AccessibilityCharInfo;

				struct AccessibilityHighlightInfo;

				struct AccessibilityImageInfo;

				struct AccessibilityLinkInfo;

				@ -69,6 +70,7 @@ class PDFiumPage {

				  // Unloads the PDFium data for this page from memory.

				  void Unload();

				  // Gets the FPDF_PAGE for this page, loading and parsing it if necessary.

				  FPDF_PAGE GetPage();

				@ -81,6 +83,11 @@ class PDFiumPage {

				  // Resets loaded text and loads it again.

				  void ReloadTextPage();

				  // Get all the chars, text runs and images from the page.

				  // `chars` is resized to the page's char count and filled in, text runs are

				  // appended to `text_runs`, and `images` is overwritten. Text run types and

				  // image alt texts are populated as part of this call.

				  void GetTextAndImageInfo(std::vector<AccessibilityTextRunInfo>& text_runs,

				                           std::vector<AccessibilityCharInfo>& chars,

				                           std::vector<AccessibilityImageInfo>& images);

				  // Given a start char index, find the longest continuous run of text that's

				  // in a single direction and with the same text style. Return a filled out

				  // AccessibilityTextRunInfo on success or std::nullopt on failure. e.g. When

				@ -109,6 +116,7 @@ class PDFiumPage {

				  // bounding boxes.

				  std::vector<AccessibilityLinkInfo> GetLinkInfo(

				      const std::vector<AccessibilityTextRunInfo>& text_runs);

				  // For all the images on the page, get their alt texts and bounding boxes. If

				  // the alt text is empty or unavailable, and if the user has requested that

				  // the OCR service tag the PDF so that it is made accessible, transfer the raw

				@ -143,13 +151,6 @@ class PDFiumPage {

				  std::vector<AccessibilityTextFieldInfo> GetTextFieldInfo(

				      uint32_t text_run_count);

				  // Traverses the entire struct tree of the page recursively and extracts the

				  // text run type or the alt text from struct tree elements corresponding to

				  // the marked content IDs associated with `text_runs` or present in

				  // `marked_content_id_image_map_` respectively.

				  void PopulateTextRunTypeAndImageAltText(

				      std::vector<AccessibilityTextRunInfo>& text_runs);

				  enum Area {

				    NONSELECTABLE_AREA,

				    TEXT_AREA,       // Area contains regular, selectable text not

				@ -447,6 +448,13 @@ class PDFiumPage {

				  // Value  :  Index of the image in the `images_` vector.

				  using MarkedContentIdToImageMap = std::map<int, size_t>;

				  // Traverses the entire struct tree of the page recursively and extracts the

				  // text run type or the alt text from struct tree elements corresponding to

				  // the marked content IDs associated with `text_runs` or present in

				  // `marked_content_id_image_map_` respectively.

				  void PopulateTextRunTypeAndImageAltText(

				      std::vector<AccessibilityTextRunInfo>& text_runs);

				  // Traverses a struct element and its sub-tree recursively and extracts the

				  // text run type or the alt text from struct elements corresponding to the

				  // marked content IDs present in `marked_content_id_text_run_info_map` or

Tagged PDFs: Consolidate code for retrieving text & image info into PDFiumPage class

64 pdf/accessibility.cc

157 pdf/pdfium/pdfium_page.cc

22 pdf/pdfium/pdfium_page.h

64

pdf/accessibility.cc

157

pdf/pdfium/pdfium_page.cc

22

pdf/pdfium/pdfium_page.h