0

Tagged PDFs: Consolidate code for retrieving text & image info into PDFiumPage class

Follow up to https://crrev.com/c/5980169

AX-Relnotes: n/a.
Bug: 40707542
Change-Id: Ia0dc2d3223a8d8b6fff3a3c36d472d8554868caa
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6053970
Reviewed-by: Lei Zhang <thestig@chromium.org>
Auto-Submit: Nektarios Paisios <nektar@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1425200}
This commit is contained in:
Nektarios Paisios
2025-02-26 09:17:51 -08:00
committed by Chromium LUCI CQ
parent a60402427a
commit 98cf8b5d75
3 changed files with 130 additions and 113 deletions

@ -39,7 +39,6 @@ void GetAccessibilityInfo(PDFiumEngine* engine,
CHECK(page);
const int raw_char_count = page->GetCharCount();
// Treat a char count of -1 (error) as 0 (an empty page), since
// other pages might have valid content.
const uint32_t char_count = std::max<uint32_t>(raw_char_count, 0);
@ -48,70 +47,9 @@ void GetAccessibilityInfo(PDFiumEngine* engine,
page_info.bounds = page->rect();
page_info.char_count = char_count;
page_info.is_searchified = page->IsPageSearchified();
chars.resize(page_info.char_count);
for (uint32_t i = 0; i < char_count; ++i) {
chars[i].unicode_character = page->GetCharUnicode(i);
}
// TODO(crbug.com/40707542): Move the entire logic present in the following
// while loop to `PDFiumPage` class.
uint32_t char_index = 0;
while (char_index < char_count) {
std::optional<AccessibilityTextRunInfo> text_run_info_result =
page->GetTextRunInfo(char_index);
CHECK(text_run_info_result.has_value());
const auto& text_run_info = text_run_info_result.value();
uint32_t text_run_end = char_index + text_run_info.len;
CHECK_LE(text_run_end, char_count);
text_runs.push_back(text_run_info);
// We need to provide enough information to draw a bounding box
// around any arbitrary text range, but the bounding boxes of characters
// we get from PDFium don't necessarily "line up".
// Example for LTR text direction: walk through the
// characters in each text run and let the width of each character be
// the difference between the x coordinate of one character and the
// x coordinate of the next. The rest of the bounds of each character
// can be computed from the bounds of the text run.
// The same idea is used for RTL, TTB and BTT text direction.
gfx::RectF char_bounds = page->GetCharBounds(char_index);
for (uint32_t i = char_index; i < text_run_end - 1; i++) {
CHECK_LT(i + 1, char_count);
gfx::RectF next_char_bounds = page->GetCharBounds(i + 1);
double& char_width = chars[i].char_width;
switch (text_run_info.direction) {
case AccessibilityTextDirection::kNone:
case AccessibilityTextDirection::kLeftToRight:
char_width = next_char_bounds.x() - char_bounds.x();
break;
case AccessibilityTextDirection::kTopToBottom:
char_width = next_char_bounds.y() - char_bounds.y();
break;
case AccessibilityTextDirection::kRightToLeft:
char_width = char_bounds.right() - next_char_bounds.right();
break;
case AccessibilityTextDirection::kBottomToTop:
char_width = char_bounds.bottom() - next_char_bounds.bottom();
break;
}
char_bounds = next_char_bounds;
}
double& char_width = chars[text_run_end - 1].char_width;
if (text_run_info.direction == AccessibilityTextDirection::kBottomToTop ||
text_run_info.direction == AccessibilityTextDirection::kTopToBottom) {
char_width = char_bounds.height();
} else {
char_width = char_bounds.width();
}
char_index += text_run_info.len;
}
page->PopulateTextRunTypeAndImageAltText(text_runs);
page->GetTextAndImageInfo(text_runs, chars, page_objects.images);
page_info.text_run_count = text_runs.size();
page_objects.links = page->GetLinkInfo(text_runs);
page_objects.images = page->GetImageInfo(page_info.text_run_count);
page_objects.highlights = page->GetHighlightInfo(text_runs);
page_objects.form_fields =
GetAccessibilityFormFieldInfo(page, page_info.text_run_count);

@ -541,6 +541,76 @@ int PDFiumPage::GetCharCount() {
return FPDFText_CountChars(GetTextPage());
}
void PDFiumPage::GetTextAndImageInfo(
std::vector<AccessibilityTextRunInfo>& text_runs,
std::vector<AccessibilityCharInfo>& chars,
std::vector<AccessibilityImageInfo>& images) {
const int raw_char_count = GetCharCount();
// Treat a char count of -1 (error) as 0 (an empty page), since
// other pages might have valid content.
const uint32_t char_count = std::max<uint32_t>(raw_char_count, 0);
chars.resize(char_count);
for (uint32_t i = 0; i < char_count; ++i) {
chars[i].unicode_character = GetCharUnicode(i);
}
uint32_t char_index = 0;
while (char_index < char_count) {
std::optional<AccessibilityTextRunInfo> text_run_info_result =
GetTextRunInfo(char_index);
CHECK(text_run_info_result.has_value());
AccessibilityTextRunInfo& text_run_info = *text_run_info_result;
uint32_t text_run_end = char_index + text_run_info.len;
CHECK_LE(text_run_end, char_count);
text_runs.push_back(text_run_info);
// We need to provide enough information to draw a bounding box
// around any arbitrary text range, but the bounding boxes of characters
// we get from PDFium don't necessarily "line up".
// Example for LTR text direction: walk through the
// characters in each text run and let the width of each character be
// the difference between the x coordinate of one character and the
// x coordinate of the next. The rest of the bounds of each character
// can be computed from the bounds of the text run.
// The same idea is used for RTL, TTB and BTT text direction.
gfx::RectF char_bounds = GetCharBounds(char_index);
for (uint32_t i = char_index; i < text_run_end - 1; i++) {
CHECK_LT(i + 1, char_count);
gfx::RectF next_char_bounds = GetCharBounds(i + 1);
double& char_width = chars[i].char_width;
switch (text_run_info.direction) {
case AccessibilityTextDirection::kNone:
case AccessibilityTextDirection::kLeftToRight:
char_width = next_char_bounds.x() - char_bounds.x();
break;
case AccessibilityTextDirection::kTopToBottom:
char_width = next_char_bounds.y() - char_bounds.y();
break;
case AccessibilityTextDirection::kRightToLeft:
char_width = char_bounds.right() - next_char_bounds.right();
break;
case AccessibilityTextDirection::kBottomToTop:
char_width = char_bounds.bottom() - next_char_bounds.bottom();
break;
}
char_bounds = next_char_bounds;
}
double& char_width = chars[text_run_end - 1].char_width;
if (text_run_info.direction == AccessibilityTextDirection::kBottomToTop ||
text_run_info.direction == AccessibilityTextDirection::kTopToBottom) {
char_width = char_bounds.height();
} else {
char_width = char_bounds.width();
}
char_index += text_run_info.len;
}
PopulateTextRunTypeAndImageAltText(text_runs);
images = GetImageInfo(text_runs.size());
}
std::optional<AccessibilityTextRunInfo> PDFiumPage::GetTextRunInfo(
int start_char_index) {
FPDF_PAGE page = GetPage();
@ -999,49 +1069,6 @@ std::vector<AccessibilityTextFieldInfo> PDFiumPage::GetTextFieldInfo(
return text_field_info;
}
// Walks the struct tree of this page and fills in the text run type of the
// entries in `text_runs` as well as the alt text of the page's images.
//
// Does nothing beyond `CalculateImages()` when the page has no struct tree, or
// when neither a text run nor an image is associated with a marked content ID.
// Text runs are only mapped to marked content IDs when the `kPdfTags` feature
// is enabled.
void PDFiumPage::PopulateTextRunTypeAndImageAltText(
    std::vector<AccessibilityTextRunInfo>& text_runs) {
  CalculateImages();

  ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(GetPage()));
  if (!struct_tree) {
    return;
  }

  // TODO(crbug.com/40707542): Consolidate `AccessibilityTextRunInfo` building
  // logic into this class and remove the following block.
  MarkedContentIdToTextRunInfoMap marked_content_id_text_run_info_map;
  if (base::FeatureList::IsEnabled(chrome_pdf::features::kPdfTags)) {
    FPDF_TEXTPAGE text_page = GetTextPage();
    // Index of the first char of the text run currently being visited.
    uint32_t char_index = 0;
    for (auto& text_run : text_runs) {
      FPDF_PAGEOBJECT text_object =
          FPDFText_GetTextObject(text_page, char_index);
      int marked_content_id = FPDFPageObj_GetMarkedContentID(text_object);
      // -1 means the text object carries no marked content ID; such a run is
      // simply not recorded in the map.
      if (marked_content_id != -1) {
        auto [iter, _] = marked_content_id_text_run_info_map.emplace(
            marked_content_id,
            std::vector<raw_ptr<AccessibilityTextRunInfo>>());
        iter->second.push_back(&text_run);
      }
      // Always advance to the start of the next run, including for runs
      // without a marked content ID; otherwise every following run would be
      // looked up at a stale char index.
      char_index += text_run.len;
    }
  }

  if (marked_content_id_text_run_info_map.empty() &&
      marked_content_id_image_map_.empty()) {
    return;
  }

  // Tracks the elements already traversed across all top-level children.
  std::set<FPDF_STRUCTELEMENT> visited_elements;
  int tree_children_count = FPDF_StructTree_CountChildren(struct_tree.get());
  for (int i = 0; i < tree_children_count; ++i) {
    FPDF_STRUCTELEMENT current_element =
        FPDF_StructTree_GetChildAtIndex(struct_tree.get(), i);
    PopulateTextRunTypeAndImageAltTextForStructElement(
        current_element, visited_elements, marked_content_id_text_run_info_map);
  }
}
PDFiumPage::Area PDFiumPage::GetLinkTargetAtIndex(int link_index,
LinkTarget* target) {
if (!available_ || link_index < 0)
@ -1465,6 +1492,50 @@ void PDFiumPage::CalculateImages() {
}
}
// Walks the struct tree of this page and fills in the text run type of the
// entries in `text_runs` as well as the alt text of the page's images.
//
// Does nothing beyond `CalculateImages()` when the page has no struct tree, or
// when neither a text run nor an image is associated with a marked content ID.
// Text runs are only mapped to marked content IDs when the `kPdfTags` feature
// is enabled.
void PDFiumPage::PopulateTextRunTypeAndImageAltText(
    std::vector<AccessibilityTextRunInfo>& text_runs) {
  CalculateImages();

  ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(GetPage()));
  if (!struct_tree) {
    return;
  }

  // TODO(crbug.com/40707542): Consolidate `AccessibilityTextRunInfo` building
  // logic into this class and remove the following block.
  MarkedContentIdToTextRunInfoMap marked_content_id_text_run_info_map;
  if (base::FeatureList::IsEnabled(chrome_pdf::features::kPdfTags)) {
    FPDF_TEXTPAGE text_page = GetTextPage();
    // Index of the first char of the text run currently being visited.
    uint32_t char_index = 0;
    for (auto& text_run : text_runs) {
      FPDF_PAGEOBJECT text_object =
          FPDFText_GetTextObject(text_page, char_index);
      int marked_content_id = FPDFPageObj_GetMarkedContentID(text_object);
      // -1 means the text object carries no marked content ID; such a run is
      // simply not recorded in the map.
      if (marked_content_id != -1) {
        auto [iter, _] = marked_content_id_text_run_info_map.emplace(
            marked_content_id,
            std::vector<raw_ptr<AccessibilityTextRunInfo>>());
        iter->second.push_back(&text_run);
      }
      // Always advance to the start of the next run, including for runs
      // without a marked content ID; otherwise every following run would be
      // looked up at a stale char index.
      char_index += text_run.len;
    }
  }

  if (marked_content_id_text_run_info_map.empty() &&
      marked_content_id_image_map_.empty()) {
    return;
  }

  // Tracks the elements already traversed across all top-level children.
  std::set<FPDF_STRUCTELEMENT> visited_elements;
  int tree_children_count = FPDF_StructTree_CountChildren(struct_tree.get());
  for (int i = 0; i < tree_children_count; ++i) {
    FPDF_STRUCTELEMENT current_element =
        FPDF_StructTree_GetChildAtIndex(struct_tree.get(), i);
    PopulateTextRunTypeAndImageAltTextForStructElement(
        current_element, visited_elements, marked_content_id_text_run_info_map);
  }
}
void PDFiumPage::PopulateTextRunTypeAndImageAltTextForStructElement(
FPDF_STRUCTELEMENT current_element,
std::set<FPDF_STRUCTELEMENT>& visited_elements,

@ -41,6 +41,7 @@ namespace chrome_pdf {
class PDFiumEngine;
class Thumbnail;
struct AccessibilityCharInfo;
struct AccessibilityHighlightInfo;
struct AccessibilityImageInfo;
struct AccessibilityLinkInfo;
@ -69,6 +70,7 @@ class PDFiumPage {
// Unloads the PDFium data for this page from memory.
void Unload();
// Gets the FPDF_PAGE for this page, loading and parsing it if necessary.
FPDF_PAGE GetPage();
@ -81,6 +83,11 @@ class PDFiumPage {
// Resets loaded text and loads it again.
void ReloadTextPage();
// Gets all the chars, text runs and images from the page.
// - `chars` is resized to the page's char count; each entry receives its
//   Unicode value and its width along the direction of its text run.
// - One entry per text run is appended to `text_runs`; the vector is not
//   cleared first.
// - `images` is overwritten with the result of `GetImageInfo()`.
// Text run types and image alt texts are populated before `images` is filled
// in.
void GetTextAndImageInfo(std::vector<AccessibilityTextRunInfo>& text_runs,
std::vector<AccessibilityCharInfo>& chars,
std::vector<AccessibilityImageInfo>& images);
// Given a start char index, find the longest continuous run of text that's
// in a single direction and with the same text style. Return a filled out
// AccessibilityTextRunInfo on success or std::nullopt on failure. e.g. When
@ -109,6 +116,7 @@ class PDFiumPage {
// bounding boxes.
std::vector<AccessibilityLinkInfo> GetLinkInfo(
const std::vector<AccessibilityTextRunInfo>& text_runs);
// For all the images on the page, get their alt texts and bounding boxes. If
// the alt text is empty or unavailable, and if the user has requested that
// the OCR service tag the PDF so that it is made accessible, transfer the raw
@ -143,13 +151,6 @@ class PDFiumPage {
std::vector<AccessibilityTextFieldInfo> GetTextFieldInfo(
uint32_t text_run_count);
// Traverses the entire struct tree of the page recursively and extracts the
// text run type or the alt text from struct tree elements corresponding to
// the marked content IDs associated with `text_runs` or present in
// `marked_content_id_image_map_` respectively.
// Calls `CalculateImages()` first. Returns without traversing when the page
// has no struct tree, or when no text run and no image is associated with a
// marked content ID. Text runs are only matched against marked content IDs
// when the `kPdfTags` feature is enabled.
void PopulateTextRunTypeAndImageAltText(
std::vector<AccessibilityTextRunInfo>& text_runs);
enum Area {
NONSELECTABLE_AREA,
TEXT_AREA, // Area contains regular, selectable text not
@ -447,6 +448,13 @@ class PDFiumPage {
// Value : Index of the image in the `images_` vector.
using MarkedContentIdToImageMap = std::map<int, size_t>;
// Traverses the entire struct tree of the page recursively and extracts the
// text run type or the alt text from struct tree elements corresponding to
// the marked content IDs associated with `text_runs` or present in
// `marked_content_id_image_map_` respectively.
// Calls `CalculateImages()` first. Returns without traversing when the page
// has no struct tree, or when no text run and no image is associated with a
// marked content ID. Text runs are only matched against marked content IDs
// when the `kPdfTags` feature is enabled.
void PopulateTextRunTypeAndImageAltText(
std::vector<AccessibilityTextRunInfo>& text_runs);
// Traverses a struct element and its sub-tree recursively and extracts the
// text run type or the alt text from struct elements corresponding to the
// marked content IDs present in `marked_content_id_text_run_info_map` or