Tagged PDFs: Consolidate code for retrieving text & image info into PDFiumPage class
Follow up to https://crrev.com/c/5980169

AX-Relnotes: n/a.
Bug: 40707542
Change-Id: Ia0dc2d3223a8d8b6fff3a3c36d472d8554868caa
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6053970
Reviewed-by: Lei Zhang <thestig@chromium.org>
Auto-Submit: Nektarios Paisios <nektar@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1425200}
This commit is contained in:

committed by
Chromium LUCI CQ

parent
a60402427a
commit
98cf8b5d75
@@ -39,7 +39,6 @@ void GetAccessibilityInfo(PDFiumEngine* engine,
|
|||||||
CHECK(page);
|
CHECK(page);
|
||||||
|
|
||||||
const int raw_char_count = page->GetCharCount();
|
const int raw_char_count = page->GetCharCount();
|
||||||
|
|
||||||
// Treat a char count of -1 (error) as 0 (an empty page), since
|
// Treat a char count of -1 (error) as 0 (an empty page), since
|
||||||
// other pages might have valid content.
|
// other pages might have valid content.
|
||||||
const uint32_t char_count = std::max<uint32_t>(raw_char_count, 0);
|
const uint32_t char_count = std::max<uint32_t>(raw_char_count, 0);
|
||||||
@@ -48,70 +47,9 @@ void GetAccessibilityInfo(PDFiumEngine* engine,
|
|||||||
page_info.bounds = page->rect();
|
page_info.bounds = page->rect();
|
||||||
page_info.char_count = char_count;
|
page_info.char_count = char_count;
|
||||||
page_info.is_searchified = page->IsPageSearchified();
|
page_info.is_searchified = page->IsPageSearchified();
|
||||||
|
page->GetTextAndImageInfo(text_runs, chars, page_objects.images);
|
||||||
chars.resize(page_info.char_count);
|
|
||||||
for (uint32_t i = 0; i < char_count; ++i) {
|
|
||||||
chars[i].unicode_character = page->GetCharUnicode(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO(crbug.com/40707542): Move the entire logic present in the following
|
|
||||||
// while loop to `PDFiumPage` class.
|
|
||||||
uint32_t char_index = 0;
|
|
||||||
while (char_index < char_count) {
|
|
||||||
std::optional<AccessibilityTextRunInfo> text_run_info_result =
|
|
||||||
page->GetTextRunInfo(char_index);
|
|
||||||
CHECK(text_run_info_result.has_value());
|
|
||||||
const auto& text_run_info = text_run_info_result.value();
|
|
||||||
uint32_t text_run_end = char_index + text_run_info.len;
|
|
||||||
CHECK_LE(text_run_end, char_count);
|
|
||||||
text_runs.push_back(text_run_info);
|
|
||||||
|
|
||||||
// We need to provide enough information to draw a bounding box
|
|
||||||
// around any arbitrary text range, but the bounding boxes of characters
|
|
||||||
// we get from PDFium don't necessarily "line up".
|
|
||||||
// Example for LTR text direction: walk through the
|
|
||||||
// characters in each text run and let the width of each character be
|
|
||||||
// the difference between the x coordinate of one character and the
|
|
||||||
// x coordinate of the next. The rest of the bounds of each character
|
|
||||||
// can be computed from the bounds of the text run.
|
|
||||||
// The same idea is used for RTL, TTB and BTT text direction.
|
|
||||||
gfx::RectF char_bounds = page->GetCharBounds(char_index);
|
|
||||||
for (uint32_t i = char_index; i < text_run_end - 1; i++) {
|
|
||||||
CHECK_LT(i + 1, char_count);
|
|
||||||
gfx::RectF next_char_bounds = page->GetCharBounds(i + 1);
|
|
||||||
double& char_width = chars[i].char_width;
|
|
||||||
switch (text_run_info.direction) {
|
|
||||||
case AccessibilityTextDirection::kNone:
|
|
||||||
case AccessibilityTextDirection::kLeftToRight:
|
|
||||||
char_width = next_char_bounds.x() - char_bounds.x();
|
|
||||||
break;
|
|
||||||
case AccessibilityTextDirection::kTopToBottom:
|
|
||||||
char_width = next_char_bounds.y() - char_bounds.y();
|
|
||||||
break;
|
|
||||||
case AccessibilityTextDirection::kRightToLeft:
|
|
||||||
char_width = char_bounds.right() - next_char_bounds.right();
|
|
||||||
break;
|
|
||||||
case AccessibilityTextDirection::kBottomToTop:
|
|
||||||
char_width = char_bounds.bottom() - next_char_bounds.bottom();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
char_bounds = next_char_bounds;
|
|
||||||
}
|
|
||||||
double& char_width = chars[text_run_end - 1].char_width;
|
|
||||||
if (text_run_info.direction == AccessibilityTextDirection::kBottomToTop ||
|
|
||||||
text_run_info.direction == AccessibilityTextDirection::kTopToBottom) {
|
|
||||||
char_width = char_bounds.height();
|
|
||||||
} else {
|
|
||||||
char_width = char_bounds.width();
|
|
||||||
}
|
|
||||||
|
|
||||||
char_index += text_run_info.len;
|
|
||||||
}
|
|
||||||
|
|
||||||
page->PopulateTextRunTypeAndImageAltText(text_runs);
|
|
||||||
page_info.text_run_count = text_runs.size();
|
page_info.text_run_count = text_runs.size();
|
||||||
page_objects.links = page->GetLinkInfo(text_runs);
|
page_objects.links = page->GetLinkInfo(text_runs);
|
||||||
page_objects.images = page->GetImageInfo(page_info.text_run_count);
|
|
||||||
page_objects.highlights = page->GetHighlightInfo(text_runs);
|
page_objects.highlights = page->GetHighlightInfo(text_runs);
|
||||||
page_objects.form_fields =
|
page_objects.form_fields =
|
||||||
GetAccessibilityFormFieldInfo(page, page_info.text_run_count);
|
GetAccessibilityFormFieldInfo(page, page_info.text_run_count);
|
||||||
|
@@ -541,6 +541,76 @@ int PDFiumPage::GetCharCount() {
|
|||||||
return FPDFText_CountChars(GetTextPage());
|
return FPDFText_CountChars(GetTextPage());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PDFiumPage::GetTextAndImageInfo(
|
||||||
|
std::vector<AccessibilityTextRunInfo>& text_runs,
|
||||||
|
std::vector<AccessibilityCharInfo>& chars,
|
||||||
|
std::vector<AccessibilityImageInfo>& images) {
|
||||||
|
const int raw_char_count = GetCharCount();
|
||||||
|
// Treat a char count of -1 (error) as 0 (an empty page), since
|
||||||
|
// other pages might have valid content.
|
||||||
|
const uint32_t char_count = std::max<uint32_t>(raw_char_count, 0);
|
||||||
|
|
||||||
|
chars.resize(char_count);
|
||||||
|
for (uint32_t i = 0; i < char_count; ++i) {
|
||||||
|
chars[i].unicode_character = GetCharUnicode(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t char_index = 0;
|
||||||
|
while (char_index < char_count) {
|
||||||
|
std::optional<AccessibilityTextRunInfo> text_run_info_result =
|
||||||
|
GetTextRunInfo(char_index);
|
||||||
|
CHECK(text_run_info_result.has_value());
|
||||||
|
AccessibilityTextRunInfo& text_run_info = *text_run_info_result;
|
||||||
|
uint32_t text_run_end = char_index + text_run_info.len;
|
||||||
|
CHECK_LE(text_run_end, char_count);
|
||||||
|
text_runs.push_back(text_run_info);
|
||||||
|
|
||||||
|
// We need to provide enough information to draw a bounding box
|
||||||
|
// around any arbitrary text range, but the bounding boxes of characters
|
||||||
|
// we get from PDFium don't necessarily "line up".
|
||||||
|
// Example for LTR text direction: walk through the
|
||||||
|
// characters in each text run and let the width of each character be
|
||||||
|
// the difference between the x coordinate of one character and the
|
||||||
|
// x coordinate of the next. The rest of the bounds of each character
|
||||||
|
// can be computed from the bounds of the text run.
|
||||||
|
// The same idea is used for RTL, TTB and BTT text direction.
|
||||||
|
gfx::RectF char_bounds = GetCharBounds(char_index);
|
||||||
|
for (uint32_t i = char_index; i < text_run_end - 1; i++) {
|
||||||
|
CHECK_LT(i + 1, char_count);
|
||||||
|
gfx::RectF next_char_bounds = GetCharBounds(i + 1);
|
||||||
|
double& char_width = chars[i].char_width;
|
||||||
|
switch (text_run_info.direction) {
|
||||||
|
case AccessibilityTextDirection::kNone:
|
||||||
|
case AccessibilityTextDirection::kLeftToRight:
|
||||||
|
char_width = next_char_bounds.x() - char_bounds.x();
|
||||||
|
break;
|
||||||
|
case AccessibilityTextDirection::kTopToBottom:
|
||||||
|
char_width = next_char_bounds.y() - char_bounds.y();
|
||||||
|
break;
|
||||||
|
case AccessibilityTextDirection::kRightToLeft:
|
||||||
|
char_width = char_bounds.right() - next_char_bounds.right();
|
||||||
|
break;
|
||||||
|
case AccessibilityTextDirection::kBottomToTop:
|
||||||
|
char_width = char_bounds.bottom() - next_char_bounds.bottom();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
char_bounds = next_char_bounds;
|
||||||
|
}
|
||||||
|
double& char_width = chars[text_run_end - 1].char_width;
|
||||||
|
if (text_run_info.direction == AccessibilityTextDirection::kBottomToTop ||
|
||||||
|
text_run_info.direction == AccessibilityTextDirection::kTopToBottom) {
|
||||||
|
char_width = char_bounds.height();
|
||||||
|
} else {
|
||||||
|
char_width = char_bounds.width();
|
||||||
|
}
|
||||||
|
|
||||||
|
char_index += text_run_info.len;
|
||||||
|
}
|
||||||
|
|
||||||
|
PopulateTextRunTypeAndImageAltText(text_runs);
|
||||||
|
images = GetImageInfo(text_runs.size());
|
||||||
|
}
|
||||||
|
|
||||||
std::optional<AccessibilityTextRunInfo> PDFiumPage::GetTextRunInfo(
|
std::optional<AccessibilityTextRunInfo> PDFiumPage::GetTextRunInfo(
|
||||||
int start_char_index) {
|
int start_char_index) {
|
||||||
FPDF_PAGE page = GetPage();
|
FPDF_PAGE page = GetPage();
|
||||||
@@ -999,49 +1069,6 @@ std::vector<AccessibilityTextFieldInfo> PDFiumPage::GetTextFieldInfo(
|
|||||||
return text_field_info;
|
return text_field_info;
|
||||||
}
|
}
|
||||||
|
|
||||||
void PDFiumPage::PopulateTextRunTypeAndImageAltText(
|
|
||||||
std::vector<AccessibilityTextRunInfo>& text_runs) {
|
|
||||||
CalculateImages();
|
|
||||||
ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(GetPage()));
|
|
||||||
if (!struct_tree) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO(crbug.com/40707542): Consolidate `Accessibility"TextRunInfo` building
|
|
||||||
// logic into this class and remove the following block.
|
|
||||||
MarkedContentIdToTextRunInfoMap marked_content_id_text_run_info_map;
|
|
||||||
if (base::FeatureList::IsEnabled(chrome_pdf::features::kPdfTags)) {
|
|
||||||
FPDF_TEXTPAGE text_page = GetTextPage();
|
|
||||||
uint32_t char_index = 0;
|
|
||||||
for (auto& text_run : text_runs) {
|
|
||||||
FPDF_PAGEOBJECT text_object =
|
|
||||||
FPDFText_GetTextObject(text_page, char_index);
|
|
||||||
int marked_content_id = FPDFPageObj_GetMarkedContentID(text_object);
|
|
||||||
if (marked_content_id == -1) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
auto [iter, _] = marked_content_id_text_run_info_map.emplace(
|
|
||||||
marked_content_id, std::vector<raw_ptr<AccessibilityTextRunInfo>>());
|
|
||||||
iter->second.push_back(&text_run);
|
|
||||||
char_index += text_run.len;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (marked_content_id_text_run_info_map.empty() &&
|
|
||||||
marked_content_id_image_map_.empty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::set<FPDF_STRUCTELEMENT> visited_elements;
|
|
||||||
int tree_children_count = FPDF_StructTree_CountChildren(struct_tree.get());
|
|
||||||
for (int i = 0; i < tree_children_count; ++i) {
|
|
||||||
FPDF_STRUCTELEMENT current_element =
|
|
||||||
FPDF_StructTree_GetChildAtIndex(struct_tree.get(), i);
|
|
||||||
PopulateTextRunTypeAndImageAltTextForStructElement(
|
|
||||||
current_element, visited_elements, marked_content_id_text_run_info_map);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
PDFiumPage::Area PDFiumPage::GetLinkTargetAtIndex(int link_index,
|
PDFiumPage::Area PDFiumPage::GetLinkTargetAtIndex(int link_index,
|
||||||
LinkTarget* target) {
|
LinkTarget* target) {
|
||||||
if (!available_ || link_index < 0)
|
if (!available_ || link_index < 0)
|
||||||
@@ -1465,6 +1492,50 @@ void PDFiumPage::CalculateImages() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PDFiumPage::PopulateTextRunTypeAndImageAltText(
|
||||||
|
std::vector<AccessibilityTextRunInfo>& text_runs) {
|
||||||
|
CalculateImages();
|
||||||
|
|
||||||
|
ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(GetPage()));
|
||||||
|
if (!struct_tree) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO(crbug.com/40707542): Consolidate `Accessibility"TextRunInfo` building
|
||||||
|
// logic into this class and remove the following block.
|
||||||
|
MarkedContentIdToTextRunInfoMap marked_content_id_text_run_info_map;
|
||||||
|
if (base::FeatureList::IsEnabled(chrome_pdf::features::kPdfTags)) {
|
||||||
|
FPDF_TEXTPAGE text_page = GetTextPage();
|
||||||
|
uint32_t char_index = 0;
|
||||||
|
for (auto& text_run : text_runs) {
|
||||||
|
FPDF_PAGEOBJECT text_object =
|
||||||
|
FPDFText_GetTextObject(text_page, char_index);
|
||||||
|
int marked_content_id = FPDFPageObj_GetMarkedContentID(text_object);
|
||||||
|
if (marked_content_id == -1) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto [iter, _] = marked_content_id_text_run_info_map.emplace(
|
||||||
|
marked_content_id, std::vector<raw_ptr<AccessibilityTextRunInfo>>());
|
||||||
|
iter->second.push_back(&text_run);
|
||||||
|
char_index += text_run.len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (marked_content_id_text_run_info_map.empty() &&
|
||||||
|
marked_content_id_image_map_.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::set<FPDF_STRUCTELEMENT> visited_elements;
|
||||||
|
int tree_children_count = FPDF_StructTree_CountChildren(struct_tree.get());
|
||||||
|
for (int i = 0; i < tree_children_count; ++i) {
|
||||||
|
FPDF_STRUCTELEMENT current_element =
|
||||||
|
FPDF_StructTree_GetChildAtIndex(struct_tree.get(), i);
|
||||||
|
PopulateTextRunTypeAndImageAltTextForStructElement(
|
||||||
|
current_element, visited_elements, marked_content_id_text_run_info_map);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void PDFiumPage::PopulateTextRunTypeAndImageAltTextForStructElement(
|
void PDFiumPage::PopulateTextRunTypeAndImageAltTextForStructElement(
|
||||||
FPDF_STRUCTELEMENT current_element,
|
FPDF_STRUCTELEMENT current_element,
|
||||||
std::set<FPDF_STRUCTELEMENT>& visited_elements,
|
std::set<FPDF_STRUCTELEMENT>& visited_elements,
|
||||||
|
@@ -41,6 +41,7 @@ namespace chrome_pdf {
|
|||||||
|
|
||||||
class PDFiumEngine;
|
class PDFiumEngine;
|
||||||
class Thumbnail;
|
class Thumbnail;
|
||||||
|
struct AccessibilityCharInfo;
|
||||||
struct AccessibilityHighlightInfo;
|
struct AccessibilityHighlightInfo;
|
||||||
struct AccessibilityImageInfo;
|
struct AccessibilityImageInfo;
|
||||||
struct AccessibilityLinkInfo;
|
struct AccessibilityLinkInfo;
|
||||||
@@ -69,6 +70,7 @@ class PDFiumPage {
|
|||||||
|
|
||||||
// Unloads the PDFium data for this page from memory.
|
// Unloads the PDFium data for this page from memory.
|
||||||
void Unload();
|
void Unload();
|
||||||
|
|
||||||
// Gets the FPDF_PAGE for this page, loading and parsing it if necessary.
|
// Gets the FPDF_PAGE for this page, loading and parsing it if necessary.
|
||||||
FPDF_PAGE GetPage();
|
FPDF_PAGE GetPage();
|
||||||
|
|
||||||
@@ -81,6 +83,11 @@ class PDFiumPage {
|
|||||||
// Resets loaded text and loads it again.
|
// Resets loaded text and loads it again.
|
||||||
void ReloadTextPage();
|
void ReloadTextPage();
|
||||||
|
|
||||||
|
// Get all the chars, text runs and images from the page.
|
||||||
|
void GetTextAndImageInfo(std::vector<AccessibilityTextRunInfo>& text_runs,
|
||||||
|
std::vector<AccessibilityCharInfo>& chars,
|
||||||
|
std::vector<AccessibilityImageInfo>& images);
|
||||||
|
|
||||||
// Given a start char index, find the longest continuous run of text that's
|
// Given a start char index, find the longest continuous run of text that's
|
||||||
// in a single direction and with the same text style. Return a filled out
|
// in a single direction and with the same text style. Return a filled out
|
||||||
// AccessibilityTextRunInfo on success or std::nullopt on failure. e.g. When
|
// AccessibilityTextRunInfo on success or std::nullopt on failure. e.g. When
|
||||||
@@ -109,6 +116,7 @@ class PDFiumPage {
|
|||||||
// bounding boxes.
|
// bounding boxes.
|
||||||
std::vector<AccessibilityLinkInfo> GetLinkInfo(
|
std::vector<AccessibilityLinkInfo> GetLinkInfo(
|
||||||
const std::vector<AccessibilityTextRunInfo>& text_runs);
|
const std::vector<AccessibilityTextRunInfo>& text_runs);
|
||||||
|
|
||||||
// For all the images on the page, get their alt texts and bounding boxes. If
|
// For all the images on the page, get their alt texts and bounding boxes. If
|
||||||
// the alt text is empty or unavailable, and if the user has requested that
|
// the alt text is empty or unavailable, and if the user has requested that
|
||||||
// the OCR service tag the PDF so that it is made accessible, transfer the raw
|
// the OCR service tag the PDF so that it is made accessible, transfer the raw
|
||||||
@@ -143,13 +151,6 @@ class PDFiumPage {
|
|||||||
std::vector<AccessibilityTextFieldInfo> GetTextFieldInfo(
|
std::vector<AccessibilityTextFieldInfo> GetTextFieldInfo(
|
||||||
uint32_t text_run_count);
|
uint32_t text_run_count);
|
||||||
|
|
||||||
// Traverses the entire struct tree of the page recursively and extracts the
|
|
||||||
// text run type or the alt text from struct tree elements corresponding to
|
|
||||||
// the marked content IDs associated with `text_runs` or present in
|
|
||||||
// `marked_content_id_image_map_` respectively.
|
|
||||||
void PopulateTextRunTypeAndImageAltText(
|
|
||||||
std::vector<AccessibilityTextRunInfo>& text_runs);
|
|
||||||
|
|
||||||
enum Area {
|
enum Area {
|
||||||
NONSELECTABLE_AREA,
|
NONSELECTABLE_AREA,
|
||||||
TEXT_AREA, // Area contains regular, selectable text not
|
TEXT_AREA, // Area contains regular, selectable text not
|
||||||
@@ -447,6 +448,13 @@ class PDFiumPage {
|
|||||||
// Value : Index of the image in the `images_` vector.
|
// Value : Index of the image in the `images_` vector.
|
||||||
using MarkedContentIdToImageMap = std::map<int, size_t>;
|
using MarkedContentIdToImageMap = std::map<int, size_t>;
|
||||||
|
|
||||||
|
// Traverses the entire struct tree of the page recursively and extracts the
|
||||||
|
// text run type or the alt text from struct tree elements corresponding to
|
||||||
|
// the marked content IDs associated with `text_runs` or present in
|
||||||
|
// `marked_content_id_image_map_` respectively.
|
||||||
|
void PopulateTextRunTypeAndImageAltText(
|
||||||
|
std::vector<AccessibilityTextRunInfo>& text_runs);
|
||||||
|
|
||||||
// Traverses a struct element and its sub-tree recursively and extracts the
|
// Traverses a struct element and its sub-tree recursively and extracts the
|
||||||
// text run type or the alt text from struct elements corresponding to the
|
// text run type or the alt text from struct elements corresponding to the
|
||||||
// marked content IDs present in `marked_content_id_text_run_info_map` or
|
// marked content IDs present in `marked_content_id_text_run_info_map` or
|
||||||
|
Reference in New Issue
Block a user