Tagged PDFs: Consolidate code for retrieving text & image info into PDFiumPage class
Follow up to https://crrev.com/c/5980169

AX-Relnotes: n/a.
Bug: 40707542
Change-Id: Ia0dc2d3223a8d8b6fff3a3c36d472d8554868caa
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6053970
Reviewed-by: Lei Zhang <thestig@chromium.org>
Auto-Submit: Nektarios Paisios <nektar@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1425200}
This commit is contained in:

committed by
Chromium LUCI CQ

parent
a60402427a
commit
98cf8b5d75
@ -39,7 +39,6 @@ void GetAccessibilityInfo(PDFiumEngine* engine,
|
||||
CHECK(page);
|
||||
|
||||
const int raw_char_count = page->GetCharCount();
|
||||
|
||||
// Treat a char count of -1 (error) as 0 (an empty page), since
|
||||
// other pages might have valid content.
|
||||
const uint32_t char_count = std::max<uint32_t>(raw_char_count, 0);
|
||||
@ -48,70 +47,9 @@ void GetAccessibilityInfo(PDFiumEngine* engine,
|
||||
page_info.bounds = page->rect();
|
||||
page_info.char_count = char_count;
|
||||
page_info.is_searchified = page->IsPageSearchified();
|
||||
|
||||
chars.resize(page_info.char_count);
|
||||
for (uint32_t i = 0; i < char_count; ++i) {
|
||||
chars[i].unicode_character = page->GetCharUnicode(i);
|
||||
}
|
||||
|
||||
// TODO(crbug.com/40707542): Move the entire logic present in the following
|
||||
// while loop to `PDFiumPage` class.
|
||||
uint32_t char_index = 0;
|
||||
while (char_index < char_count) {
|
||||
std::optional<AccessibilityTextRunInfo> text_run_info_result =
|
||||
page->GetTextRunInfo(char_index);
|
||||
CHECK(text_run_info_result.has_value());
|
||||
const auto& text_run_info = text_run_info_result.value();
|
||||
uint32_t text_run_end = char_index + text_run_info.len;
|
||||
CHECK_LE(text_run_end, char_count);
|
||||
text_runs.push_back(text_run_info);
|
||||
|
||||
// We need to provide enough information to draw a bounding box
|
||||
// around any arbitrary text range, but the bounding boxes of characters
|
||||
// we get from PDFium don't necessarily "line up".
|
||||
// Example for LTR text direction: walk through the
|
||||
// characters in each text run and let the width of each character be
|
||||
// the difference between the x coordinate of one character and the
|
||||
// x coordinate of the next. The rest of the bounds of each character
|
||||
// can be computed from the bounds of the text run.
|
||||
// The same idea is used for RTL, TTB and BTT text direction.
|
||||
gfx::RectF char_bounds = page->GetCharBounds(char_index);
|
||||
for (uint32_t i = char_index; i < text_run_end - 1; i++) {
|
||||
CHECK_LT(i + 1, char_count);
|
||||
gfx::RectF next_char_bounds = page->GetCharBounds(i + 1);
|
||||
double& char_width = chars[i].char_width;
|
||||
switch (text_run_info.direction) {
|
||||
case AccessibilityTextDirection::kNone:
|
||||
case AccessibilityTextDirection::kLeftToRight:
|
||||
char_width = next_char_bounds.x() - char_bounds.x();
|
||||
break;
|
||||
case AccessibilityTextDirection::kTopToBottom:
|
||||
char_width = next_char_bounds.y() - char_bounds.y();
|
||||
break;
|
||||
case AccessibilityTextDirection::kRightToLeft:
|
||||
char_width = char_bounds.right() - next_char_bounds.right();
|
||||
break;
|
||||
case AccessibilityTextDirection::kBottomToTop:
|
||||
char_width = char_bounds.bottom() - next_char_bounds.bottom();
|
||||
break;
|
||||
}
|
||||
char_bounds = next_char_bounds;
|
||||
}
|
||||
double& char_width = chars[text_run_end - 1].char_width;
|
||||
if (text_run_info.direction == AccessibilityTextDirection::kBottomToTop ||
|
||||
text_run_info.direction == AccessibilityTextDirection::kTopToBottom) {
|
||||
char_width = char_bounds.height();
|
||||
} else {
|
||||
char_width = char_bounds.width();
|
||||
}
|
||||
|
||||
char_index += text_run_info.len;
|
||||
}
|
||||
|
||||
page->PopulateTextRunTypeAndImageAltText(text_runs);
|
||||
page->GetTextAndImageInfo(text_runs, chars, page_objects.images);
|
||||
page_info.text_run_count = text_runs.size();
|
||||
page_objects.links = page->GetLinkInfo(text_runs);
|
||||
page_objects.images = page->GetImageInfo(page_info.text_run_count);
|
||||
page_objects.highlights = page->GetHighlightInfo(text_runs);
|
||||
page_objects.form_fields =
|
||||
GetAccessibilityFormFieldInfo(page, page_info.text_run_count);
|
||||
|
@ -541,6 +541,76 @@ int PDFiumPage::GetCharCount() {
|
||||
return FPDFText_CountChars(GetTextPage());
|
||||
}
|
||||
|
||||
void PDFiumPage::GetTextAndImageInfo(
|
||||
std::vector<AccessibilityTextRunInfo>& text_runs,
|
||||
std::vector<AccessibilityCharInfo>& chars,
|
||||
std::vector<AccessibilityImageInfo>& images) {
|
||||
const int raw_char_count = GetCharCount();
|
||||
// Treat a char count of -1 (error) as 0 (an empty page), since
|
||||
// other pages might have valid content.
|
||||
const uint32_t char_count = std::max<uint32_t>(raw_char_count, 0);
|
||||
|
||||
chars.resize(char_count);
|
||||
for (uint32_t i = 0; i < char_count; ++i) {
|
||||
chars[i].unicode_character = GetCharUnicode(i);
|
||||
}
|
||||
|
||||
uint32_t char_index = 0;
|
||||
while (char_index < char_count) {
|
||||
std::optional<AccessibilityTextRunInfo> text_run_info_result =
|
||||
GetTextRunInfo(char_index);
|
||||
CHECK(text_run_info_result.has_value());
|
||||
AccessibilityTextRunInfo& text_run_info = *text_run_info_result;
|
||||
uint32_t text_run_end = char_index + text_run_info.len;
|
||||
CHECK_LE(text_run_end, char_count);
|
||||
text_runs.push_back(text_run_info);
|
||||
|
||||
// We need to provide enough information to draw a bounding box
|
||||
// around any arbitrary text range, but the bounding boxes of characters
|
||||
// we get from PDFium don't necessarily "line up".
|
||||
// Example for LTR text direction: walk through the
|
||||
// characters in each text run and let the width of each character be
|
||||
// the difference between the x coordinate of one character and the
|
||||
// x coordinate of the next. The rest of the bounds of each character
|
||||
// can be computed from the bounds of the text run.
|
||||
// The same idea is used for RTL, TTB and BTT text direction.
|
||||
gfx::RectF char_bounds = GetCharBounds(char_index);
|
||||
for (uint32_t i = char_index; i < text_run_end - 1; i++) {
|
||||
CHECK_LT(i + 1, char_count);
|
||||
gfx::RectF next_char_bounds = GetCharBounds(i + 1);
|
||||
double& char_width = chars[i].char_width;
|
||||
switch (text_run_info.direction) {
|
||||
case AccessibilityTextDirection::kNone:
|
||||
case AccessibilityTextDirection::kLeftToRight:
|
||||
char_width = next_char_bounds.x() - char_bounds.x();
|
||||
break;
|
||||
case AccessibilityTextDirection::kTopToBottom:
|
||||
char_width = next_char_bounds.y() - char_bounds.y();
|
||||
break;
|
||||
case AccessibilityTextDirection::kRightToLeft:
|
||||
char_width = char_bounds.right() - next_char_bounds.right();
|
||||
break;
|
||||
case AccessibilityTextDirection::kBottomToTop:
|
||||
char_width = char_bounds.bottom() - next_char_bounds.bottom();
|
||||
break;
|
||||
}
|
||||
char_bounds = next_char_bounds;
|
||||
}
|
||||
double& char_width = chars[text_run_end - 1].char_width;
|
||||
if (text_run_info.direction == AccessibilityTextDirection::kBottomToTop ||
|
||||
text_run_info.direction == AccessibilityTextDirection::kTopToBottom) {
|
||||
char_width = char_bounds.height();
|
||||
} else {
|
||||
char_width = char_bounds.width();
|
||||
}
|
||||
|
||||
char_index += text_run_info.len;
|
||||
}
|
||||
|
||||
PopulateTextRunTypeAndImageAltText(text_runs);
|
||||
images = GetImageInfo(text_runs.size());
|
||||
}
|
||||
|
||||
std::optional<AccessibilityTextRunInfo> PDFiumPage::GetTextRunInfo(
|
||||
int start_char_index) {
|
||||
FPDF_PAGE page = GetPage();
|
||||
@ -999,49 +1069,6 @@ std::vector<AccessibilityTextFieldInfo> PDFiumPage::GetTextFieldInfo(
|
||||
return text_field_info;
|
||||
}
|
||||
|
||||
void PDFiumPage::PopulateTextRunTypeAndImageAltText(
|
||||
std::vector<AccessibilityTextRunInfo>& text_runs) {
|
||||
CalculateImages();
|
||||
ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(GetPage()));
|
||||
if (!struct_tree) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO(crbug.com/40707542): Consolidate `Accessibility"TextRunInfo` building
|
||||
// logic into this class and remove the following block.
|
||||
MarkedContentIdToTextRunInfoMap marked_content_id_text_run_info_map;
|
||||
if (base::FeatureList::IsEnabled(chrome_pdf::features::kPdfTags)) {
|
||||
FPDF_TEXTPAGE text_page = GetTextPage();
|
||||
uint32_t char_index = 0;
|
||||
for (auto& text_run : text_runs) {
|
||||
FPDF_PAGEOBJECT text_object =
|
||||
FPDFText_GetTextObject(text_page, char_index);
|
||||
int marked_content_id = FPDFPageObj_GetMarkedContentID(text_object);
|
||||
if (marked_content_id == -1) {
|
||||
continue;
|
||||
}
|
||||
auto [iter, _] = marked_content_id_text_run_info_map.emplace(
|
||||
marked_content_id, std::vector<raw_ptr<AccessibilityTextRunInfo>>());
|
||||
iter->second.push_back(&text_run);
|
||||
char_index += text_run.len;
|
||||
}
|
||||
}
|
||||
|
||||
if (marked_content_id_text_run_info_map.empty() &&
|
||||
marked_content_id_image_map_.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::set<FPDF_STRUCTELEMENT> visited_elements;
|
||||
int tree_children_count = FPDF_StructTree_CountChildren(struct_tree.get());
|
||||
for (int i = 0; i < tree_children_count; ++i) {
|
||||
FPDF_STRUCTELEMENT current_element =
|
||||
FPDF_StructTree_GetChildAtIndex(struct_tree.get(), i);
|
||||
PopulateTextRunTypeAndImageAltTextForStructElement(
|
||||
current_element, visited_elements, marked_content_id_text_run_info_map);
|
||||
}
|
||||
}
|
||||
|
||||
PDFiumPage::Area PDFiumPage::GetLinkTargetAtIndex(int link_index,
|
||||
LinkTarget* target) {
|
||||
if (!available_ || link_index < 0)
|
||||
@ -1465,6 +1492,50 @@ void PDFiumPage::CalculateImages() {
|
||||
}
|
||||
}
|
||||
|
||||
void PDFiumPage::PopulateTextRunTypeAndImageAltText(
|
||||
std::vector<AccessibilityTextRunInfo>& text_runs) {
|
||||
CalculateImages();
|
||||
|
||||
ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(GetPage()));
|
||||
if (!struct_tree) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO(crbug.com/40707542): Consolidate `Accessibility"TextRunInfo` building
|
||||
// logic into this class and remove the following block.
|
||||
MarkedContentIdToTextRunInfoMap marked_content_id_text_run_info_map;
|
||||
if (base::FeatureList::IsEnabled(chrome_pdf::features::kPdfTags)) {
|
||||
FPDF_TEXTPAGE text_page = GetTextPage();
|
||||
uint32_t char_index = 0;
|
||||
for (auto& text_run : text_runs) {
|
||||
FPDF_PAGEOBJECT text_object =
|
||||
FPDFText_GetTextObject(text_page, char_index);
|
||||
int marked_content_id = FPDFPageObj_GetMarkedContentID(text_object);
|
||||
if (marked_content_id == -1) {
|
||||
continue;
|
||||
}
|
||||
auto [iter, _] = marked_content_id_text_run_info_map.emplace(
|
||||
marked_content_id, std::vector<raw_ptr<AccessibilityTextRunInfo>>());
|
||||
iter->second.push_back(&text_run);
|
||||
char_index += text_run.len;
|
||||
}
|
||||
}
|
||||
|
||||
if (marked_content_id_text_run_info_map.empty() &&
|
||||
marked_content_id_image_map_.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::set<FPDF_STRUCTELEMENT> visited_elements;
|
||||
int tree_children_count = FPDF_StructTree_CountChildren(struct_tree.get());
|
||||
for (int i = 0; i < tree_children_count; ++i) {
|
||||
FPDF_STRUCTELEMENT current_element =
|
||||
FPDF_StructTree_GetChildAtIndex(struct_tree.get(), i);
|
||||
PopulateTextRunTypeAndImageAltTextForStructElement(
|
||||
current_element, visited_elements, marked_content_id_text_run_info_map);
|
||||
}
|
||||
}
|
||||
|
||||
void PDFiumPage::PopulateTextRunTypeAndImageAltTextForStructElement(
|
||||
FPDF_STRUCTELEMENT current_element,
|
||||
std::set<FPDF_STRUCTELEMENT>& visited_elements,
|
||||
|
@ -41,6 +41,7 @@ namespace chrome_pdf {
|
||||
|
||||
class PDFiumEngine;
|
||||
class Thumbnail;
|
||||
struct AccessibilityCharInfo;
|
||||
struct AccessibilityHighlightInfo;
|
||||
struct AccessibilityImageInfo;
|
||||
struct AccessibilityLinkInfo;
|
||||
@ -69,6 +70,7 @@ class PDFiumPage {
|
||||
|
||||
// Unloads the PDFium data for this page from memory.
|
||||
void Unload();
|
||||
|
||||
// Gets the FPDF_PAGE for this page, loading and parsing it if necessary.
|
||||
FPDF_PAGE GetPage();
|
||||
|
||||
@ -81,6 +83,11 @@ class PDFiumPage {
|
||||
// Resets loaded text and loads it again.
|
||||
void ReloadTextPage();
|
||||
|
||||
// Get all the chars, text runs and images from the page.
//  - `chars` is resized to the page's character count and filled in.
//  - One entry per text run is appended to `text_runs`; existing entries are
//    left in place.
//  - `images` is overwritten with the page's image info.
|
||||
void GetTextAndImageInfo(std::vector<AccessibilityTextRunInfo>& text_runs,
|
||||
std::vector<AccessibilityCharInfo>& chars,
|
||||
std::vector<AccessibilityImageInfo>& images);
|
||||
|
||||
// Given a start char index, find the longest continuous run of text that's
|
||||
// in a single direction and with the same text style. Return a filled out
|
||||
// AccessibilityTextRunInfo on success or std::nullopt on failure. e.g. When
|
||||
@ -109,6 +116,7 @@ class PDFiumPage {
|
||||
// bounding boxes.
|
||||
std::vector<AccessibilityLinkInfo> GetLinkInfo(
|
||||
const std::vector<AccessibilityTextRunInfo>& text_runs);
|
||||
|
||||
// For all the images on the page, get their alt texts and bounding boxes. If
|
||||
// the alt text is empty or unavailable, and if the user has requested that
|
||||
// the OCR service tag the PDF so that it is made accessible, transfer the raw
|
||||
@ -143,13 +151,6 @@ class PDFiumPage {
|
||||
std::vector<AccessibilityTextFieldInfo> GetTextFieldInfo(
|
||||
uint32_t text_run_count);
|
||||
|
||||
// Traverses the entire struct tree of the page recursively and extracts the
|
||||
// text run type or the alt text from struct tree elements corresponding to
|
||||
// the marked content IDs associated with `text_runs` or present in
|
||||
// `marked_content_id_image_map_` respectively.
|
||||
void PopulateTextRunTypeAndImageAltText(
|
||||
std::vector<AccessibilityTextRunInfo>& text_runs);
|
||||
|
||||
enum Area {
|
||||
NONSELECTABLE_AREA,
|
||||
TEXT_AREA, // Area contains regular, selectable text not
|
||||
@ -447,6 +448,13 @@ class PDFiumPage {
|
||||
// Value : Index of the image in the `images_` vector.
|
||||
using MarkedContentIdToImageMap = std::map<int, size_t>;
|
||||
|
||||
// Traverses the entire struct tree of the page recursively and extracts the
|
||||
// text run type or the alt text from struct tree elements corresponding to
|
||||
// the marked content IDs associated with `text_runs` or present in
|
||||
// `marked_content_id_image_map_` respectively.
|
||||
void PopulateTextRunTypeAndImageAltText(
|
||||
std::vector<AccessibilityTextRunInfo>& text_runs);
|
||||
|
||||
// Traverses a struct element and its sub-tree recursively and extracts the
|
||||
// text run type or the alt text from struct elements corresponding to the
|
||||
// marked content IDs present in `marked_content_id_text_run_info_map` or
|
||||
|
Reference in New Issue
Block a user