0

Populate Text Fields in PDFiumPage

This CL introduces a method PDFiumPage::PopulateTextField(), called from
PDFiumPage::PopulateAnnotations() (renamed from PopulateHighlights()), which
reads text form fields from the PDF document and stores relevant
information in a vector within PDFiumPage.

The CL also includes a new test file with sample text fields and a unit
test to validate the new method.

Bug: 1030242
Change-Id: I98a13e237e443f1703ac7b699cc4952cf21c5e10
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2024637
Commit-Queue: Mansi Awasthi <maawas@microsoft.com>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Kevin Babbitt <kbabbitt@microsoft.com>
Cr-Commit-Position: refs/heads/master@{#745022}
This commit is contained in:
Mansi Awasthi
2020-02-27 13:02:39 +00:00
committed by Commit Bot
parent 7d09cf61d7
commit d6afb93c52
5 changed files with 392 additions and 43 deletions

@ -183,6 +183,24 @@ bool FloatEquals(float f1, float f2) {
kEpsilonScale * fmaxf(fmaxf(fabsf(f1), fabsf(f2)), kEpsilonScale);
}
using GetFormFieldPropertyFunction =
base::RepeatingCallback<unsigned long(unsigned short* buffer,
unsigned long buflen)>;
// Fetches a string property of a form field through |function|.
//
// |function| is invoked twice: first with a null buffer and zero length to
// learn the required buffer size, then with a buffer of that size to receive
// the data. The UTF-16 data is converted to UTF-8 before being returned. When
// the first call reports a size of zero, the second call is skipped and the
// conversion of an empty string is returned.
std::string GetFormFieldProperty(GetFormFieldPropertyFunction function) {
  base::string16 property;
  const size_t required_size = function.Run(nullptr, 0);
  if (required_size > 0) {
    // The adapter is kept inside this scope so that it is closed and destroyed
    // before |property| is converted below.
    PDFiumAPIStringBufferSizeInBytesAdapter<base::string16> adapter(
        &property, required_size, true);
    unsigned short* raw_buffer =
        reinterpret_cast<unsigned short*>(adapter.GetData());
    const unsigned long written = function.Run(raw_buffer, required_size);
    adapter.Close(written);
  }
  return base::UTF16ToUTF8(property);
}
} // namespace
PDFiumPage::LinkTarget::LinkTarget() : page(-1) {}
@ -266,7 +284,7 @@ void PDFiumPage::CalculatePageObjectTextRunBreaks() {
}
}
PopulateHighlights();
PopulateAnnotations();
for (const auto& highlight : highlights_) {
if (highlight.start_char_index >= 0 &&
highlight.start_char_index < chars_count) {
@ -568,7 +586,7 @@ PDFiumPage::GetHighlightInfo() {
if (!available_)
return highlight_info;
PopulateHighlights();
PopulateAnnotations();
highlight_info.reserve(highlights_.size());
for (const Highlight& highlight : highlights_) {
@ -1041,57 +1059,98 @@ void PDFiumPage::PopulateImageAltTextForStructElement(
}
}
void PDFiumPage::PopulateHighlights() {
if (calculated_highlights_)
void PDFiumPage::PopulateAnnotations() {
if (calculated_annotations_)
return;
FPDF_PAGE page = GetPage();
if (!page)
return;
calculated_highlights_ = true;
// Populate highlights from within the pdf page into data structures ready
// to be passed to mimehandler. Currently scoped to highlights only.
int annotation_count = FPDFPage_GetAnnotCount(page);
for (int i = 0; i < annotation_count; ++i) {
ScopedFPDFAnnotation annot(FPDFPage_GetAnnot(page, i));
DCHECK(annot);
FPDF_ANNOTATION_SUBTYPE subtype = FPDFAnnot_GetSubtype(annot.get());
if (subtype != FPDF_ANNOT_HIGHLIGHT)
continue;
FS_RECTF rect;
if (!FPDFAnnot_GetRect(annot.get(), &rect))
continue;
Highlight highlight;
// We use the bounding box of the highlight as the bounding rect.
highlight.bounding_rect =
PageToScreen(pp::Point(), 1.0, rect.left, rect.top, rect.right,
rect.bottom, PageOrientation::kOriginal);
GetUnderlyingTextRangeForRect(
pp::FloatRect(rect.left, rect.bottom, std::abs(rect.right - rect.left),
std::abs(rect.bottom - rect.top)),
&highlight.start_char_index, &highlight.char_count);
// Retrieve the color of the highlight.
unsigned int color_r;
unsigned int color_g;
unsigned int color_b;
unsigned int color_a;
FPDF_PAGEOBJECT page_object = FPDFAnnot_GetObject(annot.get(), 0);
if (FPDFPageObj_GetFillColor(page_object, &color_r, &color_g, &color_b,
&color_a)) {
highlight.color = MakeARGB(color_a, color_r, color_g, color_b);
} else {
// Set the same default color as in pdfium. See calls to
// GetColorStringWithDefault() in CPVT_GenerateAP::Generate*AP() in
// pdfium.
highlight.color = MakeARGB(255, 255, 255, 0);
switch (subtype) {
case FPDF_ANNOT_HIGHLIGHT: {
PopulateHighlight(annot.get());
break;
}
case FPDF_ANNOT_WIDGET: {
// TODO(crbug.com/1030242): Populate other types of form fields too.
if (FPDFAnnot_GetFormFieldType(engine_->form(), annot.get()) ==
FPDF_FORMFIELD_TEXTFIELD) {
PopulateTextField(annot.get());
}
break;
}
default:
break;
}
highlights_.push_back(std::move(highlight));
}
calculated_annotations_ = true;
}
// Appends a Highlight built from |annot| to |highlights_|.
//
// |annot| must be a non-null highlight annotation (DCHECKed). If the
// annotation's rect cannot be read, nothing is appended.
void PDFiumPage::PopulateHighlight(FPDF_ANNOTATION annot) {
  DCHECK(annot);
  DCHECK_EQ(FPDFAnnot_GetSubtype(annot), FPDF_ANNOT_HIGHLIGHT);

  FS_RECTF annot_rect;
  if (!FPDFAnnot_GetRect(annot, &annot_rect))
    return;

  Highlight entry;

  // The annotation's bounding box doubles as the highlight's bounding rect.
  entry.bounding_rect = PageToScreen(
      pp::Point(), 1.0, annot_rect.left, annot_rect.top, annot_rect.right,
      annot_rect.bottom, PageOrientation::kOriginal);

  // Find the run of characters lying underneath the annotation.
  const pp::FloatRect text_area(annot_rect.left, annot_rect.bottom,
                                std::abs(annot_rect.right - annot_rect.left),
                                std::abs(annot_rect.bottom - annot_rect.top));
  GetUnderlyingTextRangeForRect(text_area, &entry.start_char_index,
                                &entry.char_count);

  // The highlight color is the fill color of the annotation's first object.
  unsigned int red;
  unsigned int green;
  unsigned int blue;
  unsigned int alpha;
  FPDF_PAGEOBJECT first_object = FPDFAnnot_GetObject(annot, 0);
  const bool has_fill_color =
      FPDFPageObj_GetFillColor(first_object, &red, &green, &blue, &alpha);

  // When no fill color is available, fall back to the same default color as
  // pdfium. See calls to GetColorStringWithDefault() in
  // CPVT_GenerateAP::Generate*AP() in pdfium.
  entry.color = has_fill_color ? MakeARGB(alpha, red, green, blue)
                               : MakeARGB(255, 255, 255, 0);

  highlights_.push_back(std::move(entry));
}
// Appends a TextField built from |annot| to |text_fields_|.
//
// |annot| must be a non-null annotation whose form field type is
// FPDF_FORMFIELD_TEXTFIELD (DCHECKed). The field's bounding rect, value, name
// and flags are recorded. If the annotation's rect cannot be read, nothing is
// appended.
void PDFiumPage::PopulateTextField(FPDF_ANNOTATION annot) {
  DCHECK(annot);
  FPDF_FORMHANDLE form = engine_->form();
  DCHECK_EQ(FPDFAnnot_GetFormFieldType(form, annot),
            FPDF_FORMFIELD_TEXTFIELD);

  FS_RECTF field_rect;
  if (!FPDFAnnot_GetRect(annot, &field_rect))
    return;

  TextField field;

  // The annotation's bounding box doubles as the text field's bounding rect.
  field.bounding_rect = PageToScreen(
      pp::Point(), 1.0, field_rect.left, field_rect.top, field_rect.right,
      field_rect.bottom, PageOrientation::kOriginal);

  GetFormFieldPropertyFunction value_getter =
      base::BindRepeating(FPDFAnnot_GetFormFieldValue, form, annot);
  field.value = GetFormFieldProperty(std::move(value_getter));

  GetFormFieldPropertyFunction name_getter =
      base::BindRepeating(FPDFAnnot_GetFormFieldName, form, annot);
  field.name = GetFormFieldProperty(std::move(name_getter));

  field.flags = FPDFAnnot_GetFormFieldFlags(form, annot);

  text_fields_.push_back(std::move(field));
}
bool PDFiumPage::GetUnderlyingTextRangeForRect(const pp::FloatRect& rect,
@ -1244,6 +1303,12 @@ PDFiumPage::Highlight::Highlight(const Highlight& that) = default;
PDFiumPage::Highlight::~Highlight() = default;
// Special member functions of PDFiumPage::TextField. All three use the
// compiler-generated behavior and are defined out of line here.
PDFiumPage::TextField::TextField() = default;
PDFiumPage::TextField::TextField(const TextField& that) = default;
PDFiumPage::TextField::~TextField() = default;
int ToPDFiumRotation(PageOrientation orientation) {
// Could static_cast<int>(orientation), but using an exhaustive switch will
// trigger an error if we ever change the definition of PageOrientation.

@ -170,6 +170,7 @@ class PDFiumPage {
FRIEND_TEST_ALL_PREFIXES(PDFiumPageImageTest, TestImageAltText);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageLinkTest, TestLinkGeneration);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageHighlightTest, TestPopulateHighlights);
FRIEND_TEST_ALL_PREFIXES(PDFiumPageTextFieldTest, TestPopulateTextFields);
// Returns a link index if the given character index is over a link, or -1
// otherwise.
@ -182,8 +183,12 @@ class PDFiumPage {
void PopulateAnnotationLinks();
// Calculate the locations of images on the page.
void CalculateImages();
// Populate highlights on the page.
void PopulateHighlights();
// Populate annotations like highlight and text field on the page.
void PopulateAnnotations();
// Populate |highlights_| with |annot|.
void PopulateHighlight(FPDF_ANNOTATION annot);
// Populate |text_fields_| with |annot|.
void PopulateTextField(FPDF_ANNOTATION annot);
// Returns link type and fills target associated with a link. Returns
// NONSELECTABLE_AREA if link detection failed.
Area GetLinkTarget(FPDF_LINK link, LinkTarget* target);
@ -275,6 +280,20 @@ class PDFiumPage {
uint32_t color;
};
// Represents a text field within the page.
struct TextField {
  TextField();
  TextField(const TextField& other);
  ~TextField();

  // Represents the name of form field as defined in the field dictionary.
  std::string name;
  // Represents the value of the form field.
  std::string value;
  // Bounding rect of the text field.
  pp::Rect bounding_rect;
  // Represents the flags of form field as defined in the field dictionary.
  // Initialized to 0 so that a default-constructed TextField never carries an
  // indeterminate value.
  int flags = 0;
};
PDFiumEngine* engine_;
ScopedFPDFPage page_;
ScopedFPDFTextPage text_page_;
@ -285,8 +304,9 @@ class PDFiumPage {
std::vector<Link> links_;
bool calculated_images_ = false;
std::vector<Image> images_;
bool calculated_highlights_ = false;
bool calculated_annotations_ = false;
std::vector<Highlight> highlights_;
std::vector<TextField> text_fields_;
bool calculated_page_object_text_run_breaks_ = false;
// The set of character indices on which text runs need to be broken for page
// objects.

@ -372,7 +372,7 @@ TEST_F(PDFiumPageHighlightTest, TestPopulateHighlights) {
PDFiumPage* page = GetPDFiumPageForTest(engine.get(), 0);
ASSERT_TRUE(page);
page->PopulateHighlights();
page->PopulateAnnotations();
ASSERT_EQ(base::size(kExpectedHighlights), page->highlights_.size());
for (size_t i = 0; i < page->highlights_.size(); ++i) {
@ -386,4 +386,41 @@ TEST_F(PDFiumPageHighlightTest, TestPopulateHighlights) {
}
}
using PDFiumPageTextFieldTest = PDFiumTestBase;

// Loads form_text_fields.pdf and verifies that PopulateAnnotations() fills
// |text_fields_| with the expected name, value, bounding rect and flags for
// every text field on the single page, in order.
TEST_F(PDFiumPageTextFieldTest, TestPopulateTextFields) {
  struct ExpectedTextField {
    const char* name;
    const char* value;
    pp::Rect bounding_rect;
    int flags;
  };

  static const ExpectedTextField kExpectedTextFields[] = {
      {"Text Box", "Text", {138, 230, 135, 41}, 0},
      {"ReadOnly", "Elephant", {138, 163, 135, 41}, 1},
      {"Required", "Required Field", {138, 303, 135, 34}, 2},
      {"Password", "", {138, 356, 135, 35}, 8192}};

  TestClient client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("form_text_fields.pdf"));
  ASSERT_TRUE(engine);
  ASSERT_EQ(1, engine->GetNumberOfPages());

  PDFiumPage* page = GetPDFiumPageForTest(engine.get(), 0);
  ASSERT_TRUE(page);

  page->PopulateAnnotations();

  const auto& actual_fields = page->text_fields_;
  ASSERT_EQ(base::size(kExpectedTextFields), actual_fields.size());

  // The size assertion above guarantees |index| stays within the expected
  // table while walking the populated fields.
  size_t index = 0;
  for (const auto& actual : actual_fields) {
    const ExpectedTextField& expected = kExpectedTextFields[index];
    EXPECT_EQ(expected.name, actual.name);
    EXPECT_EQ(expected.value, actual.value);
    CompareRect(expected.bounding_rect, actual.bounding_rect);
    EXPECT_EQ(expected.flags, actual.flags);
    ++index;
  }
}
} // namespace chrome_pdf

@ -0,0 +1,105 @@
{{header}}
{{object 1 0}} <<
/Type /Catalog
/Pages 2 0 R
/AcroForm <<
/Fields [ 7 0 R 8 0 R 9 0 R 10 0 R ]
/DR 4 0 R
>>
>>
endobj
{{object 2 0}} <<
/Count 1
/Kids [ 3 0 R ]
/Type /Pages
>>
endobj
{{object 3 0}} <<
/Type /Page
/Parent 2 0 R
/Resources 4 0 R
/MediaBox [ 0 0 300 300 ]
/Contents 6 0 R
/Annots [ 7 0 R 8 0 R 9 0 R 10 0 R ]
>>
endobj
{{object 4 0}} <<
/Font <<
/F1 5 0 R
>>
>>
endobj
{{object 5 0}} <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
endobj
{{object 6 0}} <<
{{streamlen}}
>>
stream
BT
/F1 12 Tf
100 200 Td
(Test Form) Tj
/F1 12 Tf
-80 -40 Td
(Read Only:) Tj
/F1 12 Tf
0 -50 Td
(Sample Text) Tj
/F1 12 Tf
200 -55 Td
(*required field) Tj
/F1 12 Tf
-200 -35 Td
(Password:) Tj
ET
endstream
endobj
{{object 7 0}} <<
/Type /Annot
/Subtype /Widget
/FT /Tx
/T (Text Box)
/V (Text)
/DA (0 0 0 rg /F1 12 Tf)
/Rect [ 100 100 200 130 ]
>>
endobj
{{object 8 0}} <<
/Type /Annot
/Subtype /Widget
/FT /Tx
/Ff 1
/T (ReadOnly)
/V (Elephant)
/DA (0 0 0 rg /F1 12 Tf)
/Rect [ 100 150 200 180 ]
>>
endobj
{{object 9 0}} <<
/Type /Annot
/Subtype /Widget
/FT /Tx
/Ff 2
/T (Required)
/V (Required Field)
/DA (0 0 0 rg /F1 12 Tf)
/Rect [ 100 50 200 75 ]
>>
endobj
{{object 10 0}} <<
/Type /Annot
/Subtype /Widget
/FT /Tx
/Ff 8192
/T (Password)
/DA (0 0 0 rg /F1 12 Tf)
/Rect [ 100 10 200 35 ]
>>
endobj
{{xref}}
{{trailer}}
{{startxref}}
%%EOF

@ -0,0 +1,122 @@
%PDF-1.7
%<25><><EFBFBD><EFBFBD>
1 0 obj <<
/Type /Catalog
/Pages 2 0 R
/AcroForm <<
/Fields [ 7 0 R 8 0 R 9 0 R 10 0 R ]
/DR 4 0 R
>>
>>
endobj
2 0 obj <<
/Count 1
/Kids [ 3 0 R ]
/Type /Pages
>>
endobj
3 0 obj <<
/Type /Page
/Parent 2 0 R
/Resources 4 0 R
/MediaBox [ 0 0 300 300 ]
/Contents 6 0 R
/Annots [ 7 0 R 8 0 R 9 0 R 10 0 R ]
>>
endobj
4 0 obj <<
/Font <<
/F1 5 0 R
>>
>>
endobj
5 0 obj <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
endobj
6 0 obj <<
/Length 194
>>
stream
BT
/F1 12 Tf
100 200 Td
(Test Form) Tj
/F1 12 Tf
-80 -40 Td
(Read Only:) Tj
/F1 12 Tf
0 -50 Td
(Sample Text) Tj
/F1 12 Tf
200 -55 Td
(*required field) Tj
/F1 12 Tf
-200 -35 Td
(Password:) Tj
ET
endstream
endobj
7 0 obj <<
/Type /Annot
/Subtype /Widget
/FT /Tx
/T (Text Box)
/V (Text)
/DA (0 0 0 rg /F1 12 Tf)
/Rect [ 100 100 200 130 ]
>>
endobj
8 0 obj <<
/Type /Annot
/Subtype /Widget
/FT /Tx
/Ff 1
/T (ReadOnly)
/V (Elephant)
/DA (0 0 0 rg /F1 12 Tf)
/Rect [ 100 150 200 180 ]
>>
endobj
9 0 obj <<
/Type /Annot
/Subtype /Widget
/FT /Tx
/Ff 2
/T (Required)
/V (Required Field)
/DA (0 0 0 rg /F1 12 Tf)
/Rect [ 100 50 200 75 ]
>>
endobj
10 0 obj <<
/Type /Annot
/Subtype /Widget
/FT /Tx
/Ff 8192
/T (Password)
/DA (0 0 0 rg /F1 12 Tf)
/Rect [ 100 10 200 35 ]
>>
endobj
xref
0 11
0000000000 65535 f
0000000015 00000 n
0000000143 00000 n
0000000208 00000 n
0000000363 00000 n
0000000414 00000 n
0000000490 00000 n
0000000736 00000 n
0000000884 00000 n
0000001044 00000 n
0000001208 00000 n
trailer <<
/Root 1 0 R
/Size 11
>>
startxref
1354
%%EOF