0

Populate Highlights in PDFiumPage

This CL introduces a method, PDFiumPage::PopulateHighlights, which reads
highlight annotations from the PDF document and stores the relevant
accessibility info in a vector within PDFiumPage. Also included are a
test PDF file containing simple highlights and a unit test that
validates the new method.

Bug: 1008775
Change-Id: I1c04f2cc7dc2885aa3a5f49ec66fcccc5a2c1311
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1862872
Commit-Queue: Kalpak Tapas <katapas@microsoft.com>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Kevin Babbitt <kbabbitt@microsoft.com>
Cr-Commit-Position: refs/heads/master@{#718088}
This commit is contained in:
Kalpak Tapas
2019-11-22 10:49:59 +00:00
committed by Commit Bot
parent ed920a42dc
commit 6eb2e8d72c
5 changed files with 257 additions and 0 deletions

@ -1009,6 +1009,43 @@ void PDFiumPage::PopulateImageAltTextForStructElement(
}
}
void PDFiumPage::PopulateHighlights() {
if (calculated_highlights_)
return;
FPDF_PAGE page = GetPage();
if (!page)
return;
calculated_highlights_ = true;
// Populate highlights from within the pdf page into data structures ready
// to be passed to mimehandler. Currently scoped to highlights only.
int annotation_count = FPDFPage_GetAnnotCount(page);
for (int i = 0; i < annotation_count; ++i) {
ScopedFPDFAnnotation annot(FPDFPage_GetAnnot(page, i));
DCHECK(annot);
FPDF_ANNOTATION_SUBTYPE subtype = FPDFAnnot_GetSubtype(annot.get());
if (subtype != FPDF_ANNOT_HIGHLIGHT)
continue;
FS_RECTF rect;
if (!FPDFAnnot_GetRect(annot.get(), &rect))
continue;
Highlight highlight;
// We use the bounding box of the highlight as the bounding rect.
highlight.bounding_rect =
PageToScreen(pp::Point(), 1.0, rect.left, rect.top, rect.right,
rect.bottom, PageOrientation::kOriginal);
GetUnderlyingTextRangeForRect(
pp::FloatRect(rect.left, rect.bottom, std::abs(rect.right - rect.left),
std::abs(rect.bottom - rect.top)),
&highlight.start_char_index, &highlight.char_count);
highlights_.push_back(std::move(highlight));
}
}
bool PDFiumPage::GetUnderlyingTextRangeForRect(const pp::FloatRect& rect,
int* start_index,
int* char_len) {
@ -1153,6 +1190,12 @@ PDFiumPage::Image::Image(const Image& that) = default;
PDFiumPage::Image::~Image() = default;
// Special members of PDFiumPage::Highlight, defined out of line and defaulted
// in the same way as the PDFiumPage::Image members just above.
PDFiumPage::Highlight::Highlight() = default;
PDFiumPage::Highlight::Highlight(const Highlight& that) = default;
PDFiumPage::Highlight::~Highlight() = default;
int ToPDFiumRotation(PageOrientation orientation) {
// Could static_cast<int>(orientation), but using an exhaustive switch will
// trigger an error if we ever change the definition of PageOrientation.

@ -162,6 +162,7 @@ class PDFiumPage {
FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, TestAnnotLinkGeneration);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageTest, TestImageAltText);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, TestLinkGeneration);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageHighlightTest, TestPopulateHighlights);
// Returns a link index if the given character index is over a link, or -1
// otherwise.
@ -174,6 +175,8 @@ class PDFiumPage {
void PopulateAnnotationLinks();
// Calculate the locations of images on the page.
void CalculateImages();
// Populate highlights on the page. Reads the page's highlight annotations and
// appends one Highlight per annotation to |highlights_|. Does nothing when
// |calculated_highlights_| is already set or when the page is unavailable.
void PopulateHighlights();
// Returns link type and fills target associated with a link. Returns
// NONSELECTABLE_AREA if link detection failed.
Area GetLinkTarget(FPDF_LINK link, LinkTarget* target);
@ -248,6 +251,19 @@ class PDFiumPage {
std::string alt_text;
};
// Represents a highlight within the page. Instances are filled in by
// PopulateHighlights(), one per highlight annotation.
struct Highlight {
  Highlight();
  // Parameter name matches the out-of-line definition.
  Highlight(const Highlight& that);
  ~Highlight();

  // Start index of underlying text range. -1 indicates invalid value.
  int32_t start_char_index = -1;
  // Number of characters encompassed by this highlight.
  int32_t char_count = 0;
  // Bounding rect of the highlight: the annotation's rectangle after being
  // passed through PageToScreen() in PopulateHighlights().
  pp::Rect bounding_rect;
};
PDFiumEngine* engine_;
ScopedFPDFPage page_;
ScopedFPDFTextPage text_page_;
@ -258,6 +274,8 @@ class PDFiumPage {
std::vector<Link> links_;
bool calculated_images_ = false;
std::vector<Image> images_;
// Set to true by PopulateHighlights() once it has a page to scan; later calls
// then return without rescanning.
bool calculated_highlights_ = false;
// Highlights collected by PopulateHighlights(), in annotation order.
std::vector<Highlight> highlights_;
bool calculated_page_object_text_run_breaks_ = false;
// The set of character indices on which text runs need to be broken for page
// objects.

@ -291,4 +291,39 @@ TEST_F(PDFiumPageTextTest, GetTextRunInfo) {
ASSERT_FALSE(text_run_info_result.has_value());
}
using PDFiumPageHighlightTest = PDFiumTestBase;

// Loads highlights.pdf, runs PopulateHighlights() on its single page and
// checks the text range and bounding rect recorded for each highlight.
TEST_F(PDFiumPageHighlightTest, TestPopulateHighlights) {
  // One expected entry per highlight annotation in the test document.
  struct ExpectedHighlight {
    int32_t start_char_index;
    int32_t char_count;
    pp::Rect bounding_rect;
  };

  static const ExpectedHighlight kExpectedHighlights[] = {
      {0, 5, {5, 196, 49, 26}},
      {12, 7, {110, 196, 77, 26}},
      {20, 1, {192, 196, 13, 26}}};

  TestClient client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("highlights.pdf"));
  ASSERT_TRUE(engine);
  ASSERT_EQ(1, engine->GetNumberOfPages());

  PDFiumPage* page = GetPDFiumPageForTest(engine.get(), 0);
  ASSERT_TRUE(page);
  page->PopulateHighlights();

  const auto& actual_highlights = page->highlights_;
  ASSERT_EQ(base::size(kExpectedHighlights), actual_highlights.size());

  // Entries are compared position by position against the expected table.
  for (size_t i = 0; i < actual_highlights.size(); ++i) {
    const ExpectedHighlight& want = kExpectedHighlights[i];
    const auto& got = actual_highlights[i];

    ASSERT_EQ(want.start_char_index, got.start_char_index);
    ASSERT_EQ(want.char_count, got.char_count);
    CompareRect(want.bounding_rect, got.bounding_rect);
  }
}
} // namespace chrome_pdf

@ -0,0 +1,73 @@
{{header}}
{{object 1 0}} <<
/Type /Catalog
/Pages 2 0 R
>>
endobj
{{object 2 0}} <<
/Type /Pages
/MediaBox [0 0 400 200]
/Count 1
/Kids [3 0 R]
>>
endobj
{{object 3 0}} <<
/Type /Page
/Parent 2 0 R
/Resources <<
/Font <<
/F1 4 0 R
>>
>>
/Contents 5 0 R
/Annots [6 0 R 7 0 R 8 0 R]
>>
endobj
{{object 4 0}} <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
endobj
{{object 5 0}} <<
{{streamlen}}
>>
stream
BT
0 40 Td
/F1 16 Tf
(Hello, nice meeting you) Tj
ET
endstream
endobj
{{object 6 0}} <<
/Type /Annot
/Subtype /Highlight
/QuadPoints [0 55 36 59 0 36 36 36]
/Rect [0 36 36 55]
/C [0.15 0 0.9 0]
/P 3 0 R
>>
endobj
{{object 7 0}} <<
/Type /Annot
/Subtype /Highlight
/QuadPoints [79 55 136 55 79 36 136 36]
/Rect [79 36 136 55]
/C [0.15 0 0.9 0]
/P 3 0 R
>>
endobj
{{object 8 0}} <<
/Type /Annot
/Subtype /Highlight
/QuadPoints [140 55 149 55 140 36 149 36]
/Rect [140 36 149 55]
/C [0.15 0 0.9 0]
/P 3 0 R
>>
endobj
{{xref}}
{{trailer}}
{{startxref}}
%%EOF

@ -0,0 +1,88 @@
%PDF-1.7
%<25><><EFBFBD><EFBFBD>
1 0 obj <<
/Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj <<
/Type /Pages
/MediaBox [0 0 400 200]
/Count 1
/Kids [3 0 R]
>>
endobj
3 0 obj <<
/Type /Page
/Parent 2 0 R
/Resources <<
/Font <<
/F1 4 0 R
>>
>>
/Contents 5 0 R
/Annots [6 0 R 7 0 R 8 0 R]
>>
endobj
4 0 obj <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
endobj
5 0 obj <<
/Length 53
>>
stream
BT
0 40 Td
/F1 16 Tf
(Hello, nice meeting you) Tj
ET
endstream
endobj
6 0 obj <<
/Type /Annot
/Subtype /Highlight
/QuadPoints [0 55 36 59 0 36 36 36]
/Rect [0 36 36 55]
/C [0.15 0 0.9 0]
/P 3 0 R
>>
endobj
7 0 obj <<
/Type /Annot
/Subtype /Highlight
/QuadPoints [79 55 136 55 79 36 136 36]
/Rect [79 36 136 55]
/C [0.15 0 0.9 0]
/P 3 0 R
>>
endobj
8 0 obj <<
/Type /Annot
/Subtype /Highlight
/QuadPoints [140 55 149 55 140 36 149 36]
/Rect [140 36 149 55]
/C [0.15 0 0.9 0]
/P 3 0 R
>>
endobj
xref
0 9
0000000000 65535 f
0000000015 00000 n
0000000068 00000 n
0000000157 00000 n
0000000313 00000 n
0000000389 00000 n
0000000493 00000 n
0000000641 00000 n
0000000795 00000 n
trailer <<
/Root 1 0 R
/Size 9
>>
startxref
952
%%EOF