0

Add a word object for each space between words in PDF Searchify.

To better show word positions for searchify outputs on PDFs, a new word
object is added for spaces between consecutive words. The length of the
space is calculated based on the relative position of the two words.

AX-Relnotes: n/a
Bug: 372139880,360803943
Change-Id: I2ae17d58adde3c1ca319a8960bc16d843ff81eca
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5951924
Auto-Submit: Ramin Halavati <rhalavati@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Chu-Hsuan Yang <chuhsuan@chromium.org>
Commit-Queue: Ramin Halavati <rhalavati@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1373736}
This commit is contained in:
Ramin Halavati
2024-10-25 05:44:02 +00:00
committed by Chromium LUCI CQ
parent 4bce22f7ed
commit cfd27804d5
3 changed files with 151 additions and 18 deletions

@ -124,20 +124,13 @@ FS_MATRIX CalculateWordMoveMatrix(const SearchifyBoundingBoxOrigin& word_origin,
FPDF_PAGEOBJECT AddWordOnImage(FPDF_DOCUMENT document,
FPDF_PAGE page,
FPDF_FONT font,
const screen_ai::mojom::WordBoxPtr& word,
const screen_ai::mojom::WordBox& word,
base::span<const FS_MATRIX> transform_matrices) {
ScopedFPDFPageObject text(
FPDFPageObj_CreateTextObj(document, font, word->bounding_box.height()));
FPDFPageObj_CreateTextObj(document, font, word.bounding_box.height()));
CHECK(text);
std::string word_string = word->word;
// TODO(crbug.com/41487613): A more accurate width would be the distance
// from current word's origin to next word's origin.
if (word->has_space_after) {
word_string.push_back(' ');
}
std::vector<uint32_t> charcodes = Utf8ToCharcodes(word_string);
std::vector<uint32_t> charcodes = Utf8ToCharcodes(word.word);
if (charcodes.empty()) {
DLOG(ERROR) << "Got empty word";
return nullptr;
@ -155,8 +148,8 @@ FPDF_PAGEOBJECT AddWordOnImage(FPDF_DOCUMENT document,
CHECK_GT(text_object_size.width(), 0);
CHECK_GT(text_object_size.height(), 0);
const FS_MATRIX text_scale_matrix(
word->bounding_box.width() / text_object_size.width(), 0, 0,
word->bounding_box.height() / text_object_size.height(), 0, 0);
word.bounding_box.width() / text_object_size.width(), 0, 0,
word.bounding_box.height() / text_object_size.height(), 0, 0);
CHECK(FPDFPageObj_TransformF(text.get(), &text_scale_matrix));
for (const auto& matrix : transform_matrices) {
@ -168,6 +161,66 @@ FPDF_PAGEOBJECT AddWordOnImage(FPDF_DOCUMENT document,
return text_ptr;
}
// Returns true if `v` lies within the closed interval
// [`min_value`, `max_value`]; both bounds are inclusive.
// `min_value` must be strictly less than `max_value`.
bool IsInRange(double v, double min_value, double max_value) {
  CHECK_LT(min_value, max_value);
  return v >= min_value && v <= max_value;
}
// Computes the rectangle that fills the gap between the bounding rectangles of
// two consecutive words, where `rect1` belongs to the earlier word and `rect2`
// to the later one. Returns an empty rectangle if either input is empty;
// otherwise the result is at least one unit wide and one unit tall.
gfx::Rect GetSpaceRect(const gfx::Rect& rect1, const gfx::Rect& rect2) {
  if (rect1.IsEmpty() || rect2.IsEmpty()) {
    return gfx::Rect();
  }

  // The direction of the vector joining the two centers, in degrees, tells on
  // which side of `rect1` the gap is located.
  const gfx::Point center1 = rect1.CenterPoint();
  const gfx::Point center2 = rect2.CenterPoint();
  const gfx::Vector2dF flow(center2.x() - center1.x(),
                            center2.y() - center1.y());
  const double angle_degrees = base::RadToDeg(flow.SlopeAngleRadians());

  // Builds the result while forcing both dimensions to be at least one, so
  // that a non-empty rectangle is always returned.
  const auto make_rect = [](int x, int y, int width, int height) {
    return gfx::Rect(x, y, std::max(1, width), std::max(1, height));
  };

  // Gap on the horizontal axis: starts at the right edge of `left_rect`, ends
  // at the left edge of `right_rect`, and spans the combined vertical extent of
  // both input rectangles.
  const auto horizontal_gap = [&](const gfx::Rect& left_rect,
                                  const gfx::Rect& right_rect) {
    const int top = std::min(rect1.y(), rect2.y());
    const int bottom = std::max(rect1.bottom(), rect2.bottom());
    return make_rect(left_rect.right(), top,
                     right_rect.x() - left_rect.right(), bottom - top);
  };

  // Gap on the vertical axis: starts at the bottom edge of `upper_rect`, ends
  // at the top edge of `lower_rect`, and spans the combined horizontal extent
  // of both input rectangles.
  const auto vertical_gap = [&](const gfx::Rect& upper_rect,
                                const gfx::Rect& lower_rect) {
    const int left = std::min(rect1.x(), rect2.x());
    const int right = std::max(rect1.right(), rect2.right());
    return make_rect(left, upper_rect.bottom(), right - left,
                     lower_rect.y() - upper_rect.bottom());
  };

  // Left to Right.
  if (IsInRange(angle_degrees, -45, 45)) {
    return horizontal_gap(rect1, rect2);
  }

  // Top to Bottom.
  if (IsInRange(angle_degrees, 45, 135)) {
    return vertical_gap(rect1, rect2);
  }

  // Right to Left.
  if (IsInRange(angle_degrees, 135, 180) ||
      IsInRange(angle_degrees, -180, -135)) {
    return horizontal_gap(rect2, rect1);
  }

  // Bottom to Top is the only remaining possibility.
  CHECK(IsInRange(angle_degrees, -135, -45));
  return vertical_gap(rect2, rect1);
}
} // namespace
std::vector<uint8_t> PDFiumSearchify(
@ -258,27 +311,53 @@ std::vector<FPDF_PAGEOBJECT> AddTextOnImage(
size_t estimated_word_count = 0;
for (const auto& line : annotation->lines) {
estimated_word_count += line->words.size();
// Assume there are spaces between each two words.
if (line->words.size()) {
estimated_word_count += line->words.size() * 2 - 1;
}
}
std::vector<FPDF_PAGEOBJECT> added_text_objects;
added_text_objects.reserve(estimated_word_count);
for (const auto& line : annotation->lines) {
SearchifyBoundingBoxOrigin baseline_origin =
ConvertToPdfOrigin(line->baseline_box, line->baseline_box_angle,
image_pixel_size.height());
for (const auto& word : line->words) {
if (word->bounding_box.IsEmpty()) {
// If OCR has recognized a space character between two consecutive words,
// insert a new word between them to represent it.
size_t original_word_count = line->words.size();
std::vector<screen_ai::mojom::WordBox> words_and_spaces;
words_and_spaces.reserve(original_word_count * 2 + 1);
for (size_t i = 0; i < original_word_count; i++) {
auto& current_word = line->words[i];
words_and_spaces.push_back(*current_word);
if (current_word->has_space_after && i + 1 < original_word_count) {
gfx::Rect space_rect = GetSpaceRect(current_word->bounding_box,
line->words[i + 1]->bounding_box);
if (!space_rect.IsEmpty()) {
words_and_spaces.push_back(screen_ai::mojom::WordBox(
/*word=*/" ", /*dictionary_word=*/false, current_word->language,
/*has_space_after=*/false, space_rect,
current_word->bounding_box_angle, current_word->direction,
/*confidence=*/1));
}
}
}
for (const auto& word : words_and_spaces) {
if (word.bounding_box.IsEmpty()) {
continue;
}
SearchifyBoundingBoxOrigin origin =
ConvertToPdfOrigin(word->bounding_box, word->bounding_box_angle,
ConvertToPdfOrigin(word.bounding_box, word.bounding_box_angle,
image_pixel_size.height());
move_matrix = CalculateWordMoveMatrix(
ProjectToBaseline(origin.point, baseline_origin),
word->bounding_box.width(),
word->direction ==
word.bounding_box.width(),
word.direction ==
screen_ai::mojom::Direction::DIRECTION_RIGHT_TO_LEFT);
added_text_objects.push_back(
AddWordOnImage(document, page, font, word, transform_matrices));
@ -301,6 +380,11 @@ FS_MATRIX CalculateWordMoveMatrixForTesting(
return CalculateWordMoveMatrix(origin, word_bounding_box_width, word_is_rtl);
}
// Test-only entry point: forwards `rect1` and `rect2` to GetSpaceRect() and
// returns its result unchanged.
gfx::Rect GetSpaceRectForTesting(const gfx::Rect& rect1, // IN-TEST
const gfx::Rect& rect2) {
return GetSpaceRect(rect1, rect2);
}
ScopedFPDFFont CreateFont(FPDF_DOCUMENT document) {
std::vector<uint8_t> cid_to_gid_map(CreateCidToGidMap());
return ScopedFPDFFont(

@ -62,6 +62,8 @@ FS_MATRIX CalculateWordMoveMatrixForTesting(
const SearchifyBoundingBoxOrigin& origin,
int word_bounding_box_width,
bool word_is_rtl);
// Test-only accessor that returns the rectangle covering the space between
// the bounding rectangles `rect1` and `rect2` of two consecutive words.
gfx::Rect GetSpaceRectForTesting(const gfx::Rect& rect1,
const gfx::Rect& rect2);
} // namespace chrome_pdf

@ -143,4 +143,51 @@ TEST(PdfiumSearchifyTest, CalculateWordMoveMatrix) {
}
}
// Checks the rectangle returned for the space between two word rectangles:
// one case with empty inputs and one case per text flow direction.
TEST(PdfiumSearchifyTest, GetSpaceRect) {
  struct TestCase {
    gfx::Rect before;
    gfx::Rect after;
    gfx::Rect expected;
  };

  const TestCase kTestCases[] = {
      // Empty rects.
      {gfx::Rect(), gfx::Rect(), gfx::Rect()},
      // Horizontal, Left to Right.
      {gfx::Rect(10, 10, 100, 10), gfx::Rect(120, 10, 50, 10),
       gfx::Rect(110, 10, 10, 10)},
      // Horizontal, Right To Left.
      {gfx::Rect(120, 10, 50, 10), gfx::Rect(10, 10, 100, 10),
       gfx::Rect(110, 10, 10, 10)},
      // Vertical, Top to Bottom.
      {gfx::Rect(10, 10, 10, 100), gfx::Rect(10, 120, 10, 50),
       gfx::Rect(10, 110, 10, 10)},
      // Vertical, Bottom to Top.
      {gfx::Rect(10, 120, 10, 50), gfx::Rect(10, 10, 10, 100),
       gfx::Rect(10, 110, 10, 10)},
  };

  for (const TestCase& test_case : kTestCases) {
    EXPECT_EQ(GetSpaceRectForTesting(test_case.before, test_case.after),
              test_case.expected);
  }
}
} // namespace chrome_pdf