Demonstrate PDF Searchify() can put text in the wrong place
Check in a cleaned-up copy of PDFium's bug_718762.in as big_image.in and big_image.pdf. This PDF has a single 5000x5000 JPEG image that is displayed as 64x64. The drastic difference between the actual image size and the rendered image size makes it easier to see when Searchify() is using the wrong size value. Then add a PDFiumEngineExportsTest to: 1) Load big_image.pdf. 2) Confirm it has no text positions. 3) Searchify it. 4) Check the searchified PDF's text positions. Note that the OCR results are on the page, yet the text positions have negative origins. Bug: 352482331 Change-Id: I2061e604f15c3c31d1ce333fff8789c7820ab95e Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5697870 Reviewed-by: Chu-Hsuan Yang <chuhsuan@chromium.org> Commit-Queue: Lei Zhang <thestig@chromium.org> Cr-Commit-Position: refs/heads/main@{#1329337}
This commit is contained in:

committed by
Chromium LUCI CQ

parent
f16995881b
commit
db5c3d1f36
pdf
@ -20,6 +20,7 @@
|
||||
#include "ui/gfx/geometry/rect.h"
|
||||
#include "ui/gfx/geometry/size.h"
|
||||
#include "ui/gfx/geometry/size_f.h"
|
||||
#include "ui/gfx/geometry/test/geometry_util.h"
|
||||
|
||||
#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
|
||||
#include <memory>
|
||||
@ -87,6 +88,31 @@ std::string GetText(base::span<const uint8_t> pdf, int page_index) {
|
||||
return base::UTF16ToUTF8(text);
|
||||
}
|
||||
|
||||
// Returns one bounding box per character on page `page_index` of `pdf`, in
// character-index order. Each box is assembled from the left/right/bottom/top
// values that FPDFText_GetCharBox() reports, stored as
// (x = left, y = bottom, width = right - left, height = top - bottom).
// CHECK-fails if the document, the page, or the text page cannot be loaded,
// if the character count is negative, or if any character box lookup fails.
std::vector<gfx::RectF> GetTextPositions(base::span<const uint8_t> pdf,
                                         int page_index) {
  ScopedFPDFDocument doc(
      FPDF_LoadMemDocument64(pdf.data(), pdf.size(), nullptr));
  CHECK(doc);

  ScopedFPDFPage pdf_page(FPDF_LoadPage(doc.get(), page_index));
  CHECK(pdf_page);

  ScopedFPDFTextPage pdf_text_page(FPDFText_LoadPage(pdf_page.get()));
  CHECK(pdf_text_page);

  const int num_chars = FPDFText_CountChars(pdf_text_page.get());
  CHECK_GE(num_chars, 0);

  std::vector<gfx::RectF> char_boxes;
  char_boxes.reserve(num_chars);
  for (int char_index = 0; char_index < num_chars; ++char_index) {
    // Out-parameters filled in by FPDFText_GetCharBox() below.
    double left;
    double right;
    double bottom;
    double top;
    const bool has_box = FPDFText_GetCharBox(pdf_text_page.get(), char_index,
                                             &left, &right, &bottom, &top);
    CHECK(has_box);

    const double width = right - left;
    const double height = top - bottom;
    char_boxes.push_back(gfx::RectF(left, bottom, width, height));
  }
  return char_boxes;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
TEST_F(PDFiumEngineExportsTest, GetPDFDocInfo) {
|
||||
@ -262,6 +288,72 @@ TEST_F(PDFiumEngineExportsTest, SearchifyBroken) {
|
||||
EXPECT_TRUE(output_pdf_buffer.empty());
|
||||
}
|
||||
|
||||
// Runs Searchify() on big_image.pdf with a mocked OCR callback and records
// the text positions found in the output. The mock expects a 5000x5000
// bitmap and reports two single-word lines, "a" and "b".
TEST_F(PDFiumEngineExportsTest, SearchifyBigImage) {
  std::optional<std::vector<uint8_t>> pdf_buffer = base::ReadFileToBytes(
      pdf_data_dir().Append(FILE_PATH_LITERAL("big_image.pdf")));
  ASSERT_TRUE(pdf_buffer.has_value());
  const std::vector<uint8_t>& input_pdf = *pdf_buffer;

  base::MockCallback<base::RepeatingCallback<
      screen_ai::mojom::VisualAnnotationPtr(const SkBitmap&)>>
      perform_ocr_callback;
  EXPECT_CALL(perform_ocr_callback, Run).WillOnce([](const SkBitmap& bitmap) {
    // The OCR callback must be handed a bitmap of 5000x5000.
    EXPECT_EQ(5000, bitmap.width());
    EXPECT_EQ(5000, bitmap.height());

    // Builds a line containing exactly one word. The line's baseline box,
    // the line's bounding box and the word's bounding box are all `rect`,
    // and every angle is 0.
    auto make_single_word_line = [](const gfx::Rect& rect, const char* text) {
      auto word = screen_ai::mojom::WordBox::New();
      word->word = text;
      word->bounding_box = rect;
      word->bounding_box_angle = 0;

      auto line = screen_ai::mojom::LineBox::New();
      line->baseline_box = rect;
      line->baseline_box_angle = 0;
      line->bounding_box = rect;
      line->bounding_box_angle = 0;
      line->words.push_back(std::move(word));
      return line;
    };

    constexpr gfx::Rect kRectA(0, 500, 50, 100);
    constexpr gfx::Rect kRectB(4800, 4900, 200, 100);

    auto annotation = screen_ai::mojom::VisualAnnotation::New();
    annotation->lines.push_back(make_single_word_line(kRectA, "a"));
    annotation->lines.push_back(make_single_word_line(kRectB, "b"));
    return annotation;
  });

  // Before searchifying, the input PDF has no text positions at all.
  // NOTE(review): the initializer is confined to this scope so it is gone
  // before Searchify() runs - presumably Searchify() handles library
  // initialization itself; confirm.
  {
    ScopedLibraryInitializer initializer;
    EXPECT_TRUE(GetTextPositions(input_pdf, 0).empty());
  }

  std::vector<uint8_t> output_pdf_buffer =
      Searchify(input_pdf, perform_ocr_callback.Get());
  ASSERT_FALSE(output_pdf_buffer.empty());

  {
    ScopedLibraryInitializer initializer;
    constexpr float kFloatTolerance = 0.00001f;

    // Four positions are expected: one for "a", two for the auto-generated
    // "\r\n" in the middle, and one for "b".
    // TODO(crbug.com/352482331): The positions here are wrong.
    const std::vector<gfx::RectF> positions =
        GetTextPositions(output_pdf_buffer, 0);
    ASSERT_EQ(4u, positions.size());

    const gfx::RectF kExpectedFirst(0, -6.8608f, 0.64f, 1.28f);
    const gfx::RectF kExpectedLast(61.44f, -63.1808f, 2.56f, 1.28f);
    EXPECT_RECTF_NEAR(kExpectedFirst, positions[0], kFloatTolerance);
    EXPECT_TRUE(positions[1].IsEmpty());
    EXPECT_TRUE(positions[2].IsEmpty());
    EXPECT_RECTF_NEAR(kExpectedLast, positions[3], kFloatTolerance);
  }
}
|
||||
|
||||
TEST_F(PDFiumEngineExportsTest, PdfProgressiveSearchifier) {
|
||||
std::unique_ptr<PdfProgressiveSearchifier> progressive_searchifier =
|
||||
CreateProgressiveSearchifier();
|
||||
|
63
pdf/test/data/big_image.in
Normal file
63
pdf/test/data/big_image.in
Normal file
@ -0,0 +1,63 @@
|
||||
{{header}}
|
||||
{{object 1 0}} <<
|
||||
/Type /Catalog
|
||||
/Pages 2 0 R
|
||||
>>
|
||||
endobj
|
||||
{{object 2 0}} <<
|
||||
/Type /Pages
|
||||
/Count 1
|
||||
/Kids [3 0 R]
|
||||
>>
|
||||
endobj
|
||||
{{object 3 0}} <<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Resources <<
|
||||
/XObject <<
|
||||
/I1 4 0 R
|
||||
>>
|
||||
>>
|
||||
/MediaBox [0 0 64 64]
|
||||
/Contents [5 0 R]
|
||||
>>
|
||||
endobj
|
||||
{{object 4 0}} <<
|
||||
/Type /XObject
|
||||
/Subtype /Image
|
||||
/Width 5000
|
||||
/Height 5000
|
||||
/BitsPerComponent 8
|
||||
/ColorSpace /DeviceCMYK
|
||||
/Decode [1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0]
|
||||
/Filter [/ASCIIHexDecode /FlateDecode /DCTDecode]
|
||||
/Name /I1
|
||||
{{streamlen}}
|
||||
>>
|
||||
stream
|
||||
789cedcd3d4ec2001806e00f680529d4165a643671f112c6901095b09838b01983899bd7e02c9ec
|
||||
2c143f8b37802af50cba81770799eef9ddebcc9d7bc355f515c2faf96d1e974e2b2bd68be23bfd8
|
||||
3edd3fc436f69acf5844afdbdda795b44907699a24e9b0df3f188c86a35136ccb2713e29c679996
|
||||
759312bca6955d7f5e8e8783eabe693aaae9a97a806d3dd74972c4e625dc6a68c5519cd6bd4ede3
|
||||
f8edb0adabe26f1bcd47e4c922d6b18955c4793cdf9ddedc9e3dfe9d01000000000000000000000
|
||||
0000000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
0000000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
0000000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
000000000000000000000000000000000000000000000000000000000000000000000000fc835ef
|
||||
3fe03ce2e1eba
|
||||
endstream
|
||||
endobj
|
||||
{{object 5 0}} <<
|
||||
{{streamlen}}
|
||||
>>
|
||||
stream
|
||||
q
|
||||
64 0 0 64 0 0 cm
|
||||
/I1 Do
|
||||
Q
|
||||
endstream
|
||||
endobj
|
||||
{{xref}}
|
||||
{{trailer}}
|
||||
{{startxref}}
|
||||
%%EOF
|
75
pdf/test/data/big_image.pdf
Normal file
75
pdf/test/data/big_image.pdf
Normal file
@ -0,0 +1,75 @@
|
||||
%PDF-1.7
|
||||
%<25><><EFBFBD><EFBFBD>
|
||||
1 0 obj <<
|
||||
/Type /Catalog
|
||||
/Pages 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj <<
|
||||
/Type /Pages
|
||||
/Count 1
|
||||
/Kids [3 0 R]
|
||||
>>
|
||||
endobj
|
||||
3 0 obj <<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Resources <<
|
||||
/XObject <<
|
||||
/I1 4 0 R
|
||||
>>
|
||||
>>
|
||||
/MediaBox [0 0 64 64]
|
||||
/Contents [5 0 R]
|
||||
>>
|
||||
endobj
|
||||
4 0 obj <<
|
||||
/Type /XObject
|
||||
/Subtype /Image
|
||||
/Width 5000
|
||||
/Height 5000
|
||||
/BitsPerComponent 8
|
||||
/ColorSpace /DeviceCMYK
|
||||
/Decode [1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0]
|
||||
/Filter [/ASCIIHexDecode /FlateDecode /DCTDecode]
|
||||
/Name /I1
|
||||
/Length 734
|
||||
>>
|
||||
stream
|
||||
789cedcd3d4ec2001806e00f680529d4165a643671f112c6901095b09838b01983899bd7e02c9ec
|
||||
2c143f8b37802af50cba81770799eef9ddebcc9d7bc355f515c2faf96d1e974e2b2bd68be23bfd8
|
||||
3edd3fc436f69acf5844afdbdda795b44907699a24e9b0df3f188c86a35136ccb2713e29c679996
|
||||
759312bca6955d7f5e8e8783eabe693aaae9a97a806d3dd74972c4e625dc6a68c5519cd6bd4ede3
|
||||
f8edb0adabe26f1bcd47e4c922d6b18955c4793cdf9ddedc9e3dfe9d01000000000000000000000
|
||||
0000000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
0000000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
0000000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
000000000000000000000000000000000000000000000000000000000000000000000000fc835ef
|
||||
3fe03ce2e1eba
|
||||
endstream
|
||||
endobj
|
||||
5 0 obj <<
|
||||
/Length 28
|
||||
>>
|
||||
stream
|
||||
q
|
||||
64 0 0 64 0 0 cm
|
||||
/I1 Do
|
||||
Q
|
||||
endstream
|
||||
endobj
|
||||
xref
|
||||
0 6
|
||||
0000000000 65535 f
|
||||
0000000015 00000 n
|
||||
0000000068 00000 n
|
||||
0000000131 00000 n
|
||||
0000000286 00000 n
|
||||
0000001292 00000 n
|
||||
trailer <<
|
||||
/Root 1 0 R
|
||||
/Size 6
|
||||
>>
|
||||
startxref
|
||||
1371
|
||||
%%EOF
|
Reference in New Issue
Block a user