0

Demonstrate PDF Searchify() can put text in the wrong place

Check in a cleaned-up copy of PDFium's bug_718762.in as big_image.in
and big_image.pdf. This PDF has a single 5000x5000 JPEG image that is
displayed as 64x64. The drastic difference between the actual image size
and rendered image size makes it easier to see when Searchify() is using
the wrong size value.

Then add a PDFiumEngineExportsTest to:

1) Load big_image.pdf
2) Confirm it has no text positions.
3) Searchify it.
4) Check the searchified PDF's text positions.

Note that the OCR results are on the page, yet the text positions have
negative origins.

Bug: 352482331
Change-Id: I2061e604f15c3c31d1ce333fff8789c7820ab95e
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5697870
Reviewed-by: Chu-Hsuan Yang <chuhsuan@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1329337}
This commit is contained in:
Lei Zhang
2024-07-18 06:19:11 +00:00
committed by Chromium LUCI CQ
parent f16995881b
commit db5c3d1f36
3 changed files with 230 additions and 0 deletions

@ -20,6 +20,7 @@
#include "ui/gfx/geometry/rect.h"
#include "ui/gfx/geometry/size.h"
#include "ui/gfx/geometry/size_f.h"
#include "ui/gfx/geometry/test/geometry_util.h"
#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
#include <memory>
@ -87,6 +88,31 @@ std::string GetText(base::span<const uint8_t> pdf, int page_index) {
return base::UTF16ToUTF8(text);
}
// Loads `pdf` from memory, opens the page at `page_index`, and returns one
// rectangle per character reported by PDFium's text page, in character-index
// order. Each rectangle has its origin at the character box's (left, bottom)
// and is sized (right - left) x (top - bottom).
//
// CHECK-fails if the document, page, or text page cannot be loaded, or if any
// character box cannot be retrieved.
std::vector<gfx::RectF> GetTextPositions(base::span<const uint8_t> pdf,
                                         int page_index) {
  ScopedFPDFDocument doc(
      FPDF_LoadMemDocument64(pdf.data(), pdf.size(), nullptr));
  CHECK(doc);

  ScopedFPDFPage pdf_page(FPDF_LoadPage(doc.get(), page_index));
  CHECK(pdf_page);

  ScopedFPDFTextPage pdf_text_page(FPDFText_LoadPage(pdf_page.get()));
  CHECK(pdf_text_page);

  const int num_chars = FPDFText_CountChars(pdf_text_page.get());
  CHECK_GE(num_chars, 0);

  std::vector<gfx::RectF> char_boxes;
  char_boxes.reserve(num_chars);
  for (int char_index = 0; char_index < num_chars; ++char_index) {
    // PDFium hands back the four edges of the box; they are only meaningful
    // when the call succeeds, which the CHECK below enforces.
    double left_edge = 0;
    double right_edge = 0;
    double bottom_edge = 0;
    double top_edge = 0;
    CHECK(FPDFText_GetCharBox(pdf_text_page.get(), char_index, &left_edge,
                              &right_edge, &bottom_edge, &top_edge));

    // Convert from edges to origin + size.
    const gfx::RectF char_box(left_edge, bottom_edge, right_edge - left_edge,
                              top_edge - bottom_edge);
    char_boxes.push_back(char_box);
  }
  return char_boxes;
}
} // namespace
TEST_F(PDFiumEngineExportsTest, GetPDFDocInfo) {
@ -262,6 +288,72 @@ TEST_F(PDFiumEngineExportsTest, SearchifyBroken) {
EXPECT_TRUE(output_pdf_buffer.empty());
}
// Searchifies big_image.pdf with a mocked OCR callback that reports two
// single-character lines, then checks the character boxes PDFium reports for
// the resulting PDF. The input PDF must start with no text positions.
TEST_F(PDFiumEngineExportsTest, SearchifyBigImage) {
  const base::FilePath input_path =
      pdf_data_dir().Append(FILE_PATH_LITERAL("big_image.pdf"));
  std::optional<std::vector<uint8_t>> pdf_buffer =
      base::ReadFileToBytes(input_path);
  ASSERT_TRUE(pdf_buffer.has_value());

  base::MockCallback<base::RepeatingCallback<
      screen_ai::mojom::VisualAnnotationPtr(const SkBitmap&)>>
      perform_ocr_callback;
  EXPECT_CALL(perform_ocr_callback, Run).WillOnce([](const SkBitmap& bitmap) {
    // OCR is expected to receive the image at 5000x5000.
    EXPECT_EQ(5000, bitmap.width());
    EXPECT_EQ(5000, bitmap.height());

    // Builds an unrotated line holding exactly one word, where the line's
    // baseline box, the line's bounding box and the word's bounding box are
    // all `bounds`.
    auto make_single_word_line = [](const gfx::Rect& bounds,
                                    const char* text) {
      auto word = screen_ai::mojom::WordBox::New();
      word->word = text;
      word->bounding_box = bounds;
      word->bounding_box_angle = 0;

      auto line = screen_ai::mojom::LineBox::New();
      line->baseline_box = bounds;
      line->baseline_box_angle = 0;
      line->bounding_box = bounds;
      line->bounding_box_angle = 0;
      line->words.push_back(std::move(word));
      return line;
    };

    auto ocr_result = screen_ai::mojom::VisualAnnotation::New();
    ocr_result->lines.push_back(
        make_single_word_line(gfx::Rect(0, 500, 50, 100), "a"));
    ocr_result->lines.push_back(
        make_single_word_line(gfx::Rect(4800, 4900, 200, 100), "b"));
    return ocr_result;
  });

  // Before searchifying, the PDF has no text at all.
  {
    ScopedLibraryInitializer initializer;
    const std::vector<gfx::RectF> original_positions =
        GetTextPositions(*pdf_buffer, 0);
    EXPECT_TRUE(original_positions.empty());
  }

  std::vector<uint8_t> output_pdf_buffer =
      Searchify(*pdf_buffer, perform_ocr_callback.Get());
  ASSERT_FALSE(output_pdf_buffer.empty());

  {
    ScopedLibraryInitializer initializer;
    constexpr float kFloatTolerance = 0.00001f;

    // The middle 2 positions are for auto-generated "\r\n".
    // TODO(crbug.com/352482331): The positions here are wrong.
    const gfx::RectF expected_first(0, -6.8608f, 0.64f, 1.28f);
    const gfx::RectF expected_last(61.44f, -63.1808f, 2.56f, 1.28f);

    const std::vector<gfx::RectF> positions =
        GetTextPositions(output_pdf_buffer, 0);
    ASSERT_EQ(4u, positions.size());
    EXPECT_RECTF_NEAR(expected_first, positions[0], kFloatTolerance);
    EXPECT_TRUE(positions[1].IsEmpty());
    EXPECT_TRUE(positions[2].IsEmpty());
    EXPECT_RECTF_NEAR(expected_last, positions[3], kFloatTolerance);
  }
}
TEST_F(PDFiumEngineExportsTest, PdfProgressiveSearchifier) {
std::unique_ptr<PdfProgressiveSearchifier> progressive_searchifier =
CreateProgressiveSearchifier();

@ -0,0 +1,63 @@
{{header}}
{{object 1 0}} <<
/Type /Catalog
/Pages 2 0 R
>>
endobj
{{object 2 0}} <<
/Type /Pages
/Count 1
/Kids [3 0 R]
>>
endobj
{{object 3 0}} <<
/Type /Page
/Parent 2 0 R
/Resources <<
/XObject <<
/I1 4 0 R
>>
>>
/MediaBox [0 0 64 64]
/Contents [5 0 R]
>>
endobj
{{object 4 0}} <<
/Type /XObject
/Subtype /Image
/Width 5000
/Height 5000
/BitsPerComponent 8
/ColorSpace /DeviceCMYK
/Decode [1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0]
/Filter [/ASCIIHexDecode /FlateDecode /DCTDecode]
/Name /I1
{{streamlen}}
>>
stream
789cedcd3d4ec2001806e00f680529d4165a643671f112c6901095b09838b01983899bd7e02c9ec
2c143f8b37802af50cba81770799eef9ddebcc9d7bc355f515c2faf96d1e974e2b2bd68be23bfd8
3edd3fc436f69acf5844afdbdda795b44907699a24e9b0df3f188c86a35136ccb2713e29c679996
759312bca6955d7f5e8e8783eabe693aaae9a97a806d3dd74972c4e625dc6a68c5519cd6bd4ede3
f8edb0adabe26f1bcd47e4c922d6b18955c4793cdf9ddedc9e3dfe9d01000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000fc835ef
3fe03ce2e1eba
endstream
endobj
{{object 5 0}} <<
{{streamlen}}
>>
stream
q
64 0 0 64 0 0 cm
/I1 Do
Q
endstream
endobj
{{xref}}
{{trailer}}
{{startxref}}
%%EOF

@ -0,0 +1,75 @@
%PDF-1.7
%<25><><EFBFBD><EFBFBD>
1 0 obj <<
/Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj <<
/Type /Pages
/Count 1
/Kids [3 0 R]
>>
endobj
3 0 obj <<
/Type /Page
/Parent 2 0 R
/Resources <<
/XObject <<
/I1 4 0 R
>>
>>
/MediaBox [0 0 64 64]
/Contents [5 0 R]
>>
endobj
4 0 obj <<
/Type /XObject
/Subtype /Image
/Width 5000
/Height 5000
/BitsPerComponent 8
/ColorSpace /DeviceCMYK
/Decode [1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0]
/Filter [/ASCIIHexDecode /FlateDecode /DCTDecode]
/Name /I1
/Length 734
>>
stream
789cedcd3d4ec2001806e00f680529d4165a643671f112c6901095b09838b01983899bd7e02c9ec
2c143f8b37802af50cba81770799eef9ddebcc9d7bc355f515c2faf96d1e974e2b2bd68be23bfd8
3edd3fc436f69acf5844afdbdda795b44907699a24e9b0df3f188c86a35136ccb2713e29c679996
759312bca6955d7f5e8e8783eabe693aaae9a97a806d3dd74972c4e625dc6a68c5519cd6bd4ede3
f8edb0adabe26f1bcd47e4c922d6b18955c4793cdf9ddedc9e3dfe9d01000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000fc835ef
3fe03ce2e1eba
endstream
endobj
5 0 obj <<
/Length 28
>>
stream
q
64 0 0 64 0 0 cm
/I1 Do
Q
endstream
endobj
xref
0 6
0000000000 65535 f
0000000015 00000 n
0000000068 00000 n
0000000131 00000 n
0000000286 00000 n
0000001292 00000 n
trailer <<
/Root 1 0 R
/Size 6
>>
startxref
1371
%%EOF