0

Simplify ConvertToPdfOrigin() inside pdfium_searchify.cc and add tests

Expose the function as ConvertToPdfOriginForTesting() to add some unit
tests. Also remove the unused width parameter.

Change-Id: Iaa7242f138fbe1a132c58643652daac80cc03aac
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5691718
Commit-Queue: Lei Zhang <thestig@chromium.org>
Reviewed-by: Chu-Hsuan Yang <chuhsuan@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1325951}
This commit is contained in:
Lei Zhang
2024-07-11 07:10:57 +00:00
committed by Chromium LUCI CQ
parent c3e6dac201
commit ece448f559
4 changed files with 114 additions and 20 deletions

@ -477,6 +477,10 @@ if (enable_pdf) {
]
}
if (enable_screen_ai_service) {
sources += [ "pdfium/pdfium_searchify_unittest.cc" ]
}
if (v8_use_external_startup_data) {
deps += [
"//tools/v8_context_snapshot",

@ -47,20 +47,13 @@ std::vector<uint32_t> Utf8ToCharcodes(const std::string& string) {
return charcodes;
}
// Position and rotation of a bounding box's origin, as produced by
// ConvertToPdfOrigin() and consumed by ProjectToBaseline().
struct BoundingBoxOrigin {
// Horizontal coordinate of the origin.
double x;
// Vertical coordinate of the origin.
double y;
// Rotation angle in radians; callers feed it directly to cos()/sin().
double theta;
};
// The coordinate systems between OCR and PDF are different. OCR's origin is at
// top-left, so we need to convert them to PDF's bottom-left.
BoundingBoxOrigin ConvertToPdfOrigin(int x,
int y,
int width,
int height,
double angle,
double coordinate_system_height) {
SearchifyBoundingBoxOrigin ConvertToPdfOrigin(int x,
int y,
int height,
double angle,
double coordinate_system_height) {
const double theta = base::DegToRad(angle);
return {.x = x - (sin(theta) * height),
.y = coordinate_system_height - (y + cos(theta) * height),
@ -68,8 +61,9 @@ BoundingBoxOrigin ConvertToPdfOrigin(int x,
}
// Project the text object's origin to the baseline's origin.
BoundingBoxOrigin ProjectToBaseline(const BoundingBoxOrigin& origin,
const BoundingBoxOrigin& baseline_origin) {
SearchifyBoundingBoxOrigin ProjectToBaseline(
const SearchifyBoundingBoxOrigin& origin,
const SearchifyBoundingBoxOrigin& baseline_origin) {
// The length between `origin` and `baseline_origin`.
double length = (origin.x - baseline_origin.x) * cos(baseline_origin.theta) +
(origin.y - baseline_origin.y) * sin(baseline_origin.theta);
@ -107,10 +101,10 @@ void AddTextOnImage(FPDF_DOCUMENT document,
}
for (const auto& line : annotation->lines) {
BoundingBoxOrigin baseline_origin = ConvertToPdfOrigin(
line->baseline_box.x(), line->baseline_box.y(),
line->baseline_box.width(), line->baseline_box.height(),
line->baseline_box_angle, image_rendered_height);
SearchifyBoundingBoxOrigin baseline_origin =
ConvertToPdfOrigin(line->baseline_box.x(), line->baseline_box.y(),
line->baseline_box.height(),
line->baseline_box_angle, image_rendered_height);
for (const auto& word : line->words) {
double width = word->bounding_box.width();
@ -167,8 +161,8 @@ void AddTextOnImage(FPDF_DOCUMENT document,
FPDFPageObj_Transform(text.get(), width_scale, 0, 0, height_scale, 0, 0);
// Move text object to the corresponding text position on the full image.
BoundingBoxOrigin origin = ConvertToPdfOrigin(
word->bounding_box.x(), word->bounding_box.y(), width, height,
SearchifyBoundingBoxOrigin origin = ConvertToPdfOrigin(
word->bounding_box.x(), word->bounding_box.y(), height,
word->bounding_box_angle, image_rendered_height);
origin = ProjectToBaseline(origin, baseline_origin);
double a = cos(origin.theta);
@ -280,6 +274,15 @@ std::vector<uint8_t> PDFiumSearchify(
return output_file_write.TakeBuffer();
}
// Test-only entry point: forwards every argument unchanged to
// ConvertToPdfOrigin() and returns its result, so unit tests can exercise the
// conversion directly.
SearchifyBoundingBoxOrigin ConvertToPdfOriginForTesting(
int x,
int y,
int height,
double angle,
double coordinate_system_height) {
return ConvertToPdfOrigin(x, y, height, angle, coordinate_system_height);
}
PdfiumProgressiveSearchifier::ScopedSdkInitializer::ScopedSdkInitializer() {
// TODO(thestig): Check the default value of `use_skia`.
InitializeSDK(false, false, FontMappingMode::kNoMapping);

@ -18,11 +18,25 @@
namespace chrome_pdf {
// Position and rotation of a bounding box's origin. This is the value returned
// by ConvertToPdfOriginForTesting().
struct SearchifyBoundingBoxOrigin {
// Horizontal coordinate of the origin.
double x;
// Vertical coordinate of the origin.
double y;
// Rotation angle in radians (the unit test compares it against multiples of
// pi).
double theta;
};
// Takes the bytes of a PDF in `pdf_buffer` and a repeating callback that maps
// an SkBitmap to a screen_ai::mojom::VisualAnnotationPtr, and returns a byte
// buffer.
// NOTE(review): presumably the result is the PDF with the OCR text added so
// that it becomes searchable -- confirm against the definition.
std::vector<uint8_t> PDFiumSearchify(
base::span<const uint8_t> pdf_buffer,
base::RepeatingCallback<screen_ai::mojom::VisualAnnotationPtr(
const SkBitmap& bitmap)> perform_ocr_callback);
// Internal function exposed for testing.
//
// Converts the origin of a box given in a top-left-origin coordinate system
// (as used by OCR) into a bottom-left-origin coordinate system (as used by
// PDF).
//   `x`, `y`: position of the box in the top-left-origin system.
//   `height`: height of the box.
//   `angle`: rotation of the box, in degrees.
//   `coordinate_system_height`: total height of the coordinate system, used to
//       flip the vertical axis.
// In the unit-tested cases the returned `theta` equals `-angle` expressed in
// radians.
SearchifyBoundingBoxOrigin ConvertToPdfOriginForTesting(
int x,
int y,
int height,
double angle,
double coordinate_system_height);
class PdfiumProgressiveSearchifier : public PdfProgressiveSearchifier {
public:
PdfiumProgressiveSearchifier();

@ -0,0 +1,73 @@
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "pdf/pdfium/pdfium_searchify.h"
#include <numbers>
#include "testing/gtest/include/gtest/gtest.h"
namespace chrome_pdf {
// Exercises ConvertToPdfOriginForTesting() with one fixed box (x=100, y=50,
// height=30) inside a coordinate system of height 792, varying only the
// rotation angle.
TEST(PdfiumSearchifyTest, ConvertToPdfOrigin) {
  // Inputs shared by every case below.
  constexpr int kX = 100;
  constexpr int kY = 50;
  constexpr int kHeight = 30;
  constexpr double kCoordinateSystemHeight = 792;

  struct TestCase {
    double angle;  // Input rotation, in degrees.
    double expected_x;
    double expected_y;
    double expected_theta;  // Expected rotation, in radians.
  };
  const TestCase kTestCases[] = {
      // No rotation: only the vertical axis is flipped.
      {0, 100, 712, 0},
      // 45 degrees: the height contributes equally to both axes.
      {45, 78.786796564403573, 720.78679656440363, -std::numbers::pi / 4},
      // 90 degrees: the height shifts x only.
      {90, 70, 742, -std::numbers::pi / 2},
      // 180 degrees: the height shifts y in the opposite direction.
      {180, 100, 772, -std::numbers::pi},
      // -90 degrees: mirror image of the 90 degree case.
      {-90, 130, 742, std::numbers::pi / 2},
  };

  for (const TestCase& test_case : kTestCases) {
    // Identify the failing row, since all rows share the same source lines.
    SCOPED_TRACE(testing::Message() << "angle=" << test_case.angle);

    const SearchifyBoundingBoxOrigin origin = ConvertToPdfOriginForTesting(
        kX, kY, kHeight, test_case.angle, kCoordinateSystemHeight);

    EXPECT_DOUBLE_EQ(test_case.expected_x, origin.x);
    EXPECT_DOUBLE_EQ(test_case.expected_y, origin.y);
    EXPECT_DOUBLE_EQ(test_case.expected_theta, origin.theta);
  }
}
} // namespace chrome_pdf