0

Add test for PDF Searchify spaces between words.

Additional changes:
1. If the bounding rectangles of two consecutive words collide, a
   space rect is not added between them.
2. The reserved size of the `words_and_spaces` vector is corrected to
   "2 * word_count - 1" instead of "2 * word_count + 1", as the number
   of spaces is at most one less than the number of words.

Bug: 360803943
Change-Id: I3e459f4320bc439d0d4532847ec24a44af9c5600
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5964956
Commit-Queue: Ramin Halavati <rhalavati@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Chu-Hsuan Yang <chuhsuan@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1375056}
This commit is contained in:
Ramin Halavati
2024-10-29 05:29:55 +00:00
committed by Chromium LUCI CQ
parent 2674df58cd
commit a9d95bfc55
3 changed files with 218 additions and 30 deletions

@ -173,6 +173,12 @@ gfx::Rect GetSpaceRect(const gfx::Rect& rect1, const gfx::Rect& rect2) {
return gfx::Rect();
}
// Return empty if the two rects intersect.
gfx::Rect r1 = rect1;
if (r1.InclusiveIntersect(rect2)) {
return gfx::Rect();
}
// Compute the angle of text flow from `rect1` to `rect2`, to decide where the
// space rectangle should be.
gfx::Vector2dF vec(rect2.CenterPoint() - rect1.CenterPoint());
@ -219,6 +225,37 @@ gfx::Rect GetSpaceRect(const gfx::Rect& rect1, const gfx::Rect& rect2) {
return gfx::Rect(x, y, width, height);
}
// Returns a copy of `words`, in order, with a " " word inserted after every
// word that has `has_space_after` set and is followed by another word. The
// space is omitted when `GetSpaceRect()` yields an empty rectangle for the two
// neighbouring bounding boxes.
std::vector<screen_ai::mojom::WordBox> GetWordsAndSpaces(
    const std::vector<screen_ai::mojom::WordBoxPtr>& words) {
  std::vector<screen_ai::mojom::WordBox> result;
  const size_t word_count = words.size();
  if (word_count == 0) {
    return result;
  }

  // At most one space is added after each word except the last one.
  result.reserve(2 * word_count - 1);

  for (size_t index = 0; index < word_count; ++index) {
    const auto& word = words[index];
    result.push_back(*word);

    // Nothing to insert after the last word, or when no space was recognized.
    const bool has_next_word = index + 1 < word_count;
    if (!word->has_space_after || !has_next_word) {
      continue;
    }

    const gfx::Rect gap_rect =
        GetSpaceRect(word->bounding_box, words[index + 1]->bounding_box);
    if (gap_rect.IsEmpty()) {
      continue;
    }

    // The space inherits language, angle and direction from the word before it.
    result.push_back(screen_ai::mojom::WordBox(
        /*word=*/" ", /*dictionary_word=*/false, word->language,
        /*has_space_after=*/false, gap_rect, word->bounding_box_angle,
        word->direction,
        /*confidence=*/1));
  }
  return result;
}
} // namespace
std::vector<uint8_t> PDFiumSearchify(
@ -322,27 +359,8 @@ std::vector<FPDF_PAGEOBJECT> AddTextOnImage(
ConvertToPdfOrigin(line->baseline_box, line->baseline_box_angle,
image_pixel_size.height());
// If OCR has recognized a space character between two consecutive words,
// insert a new word between them to represent it.
size_t original_word_count = line->words.size();
std::vector<screen_ai::mojom::WordBox> words_and_spaces;
words_and_spaces.reserve(original_word_count * 2 + 1);
for (size_t i = 0; i < original_word_count; i++) {
auto& current_word = line->words[i];
words_and_spaces.push_back(*current_word);
if (current_word->has_space_after && i + 1 < original_word_count) {
gfx::Rect space_rect = GetSpaceRect(current_word->bounding_box,
line->words[i + 1]->bounding_box);
if (!space_rect.IsEmpty()) {
words_and_spaces.push_back(screen_ai::mojom::WordBox(
/*word=*/" ", /*dictionary_word=*/false, current_word->language,
/*has_space_after=*/false, space_rect,
current_word->bounding_box_angle, current_word->direction,
/*confidence=*/1));
}
}
}
std::vector<screen_ai::mojom::WordBox> words_and_spaces =
GetWordsAndSpaces(line->words);
for (const auto& word : words_and_spaces) {
if (word.bounding_box.IsEmpty()) {
@ -383,6 +401,11 @@ gfx::Rect GetSpaceRectForTesting(const gfx::Rect& rect1, // IN-TEST
return GetSpaceRect(rect1, rect2);
}
// Test-only entry point: forwards `words` to GetWordsAndSpaces() and returns
// its result unchanged.
std::vector<screen_ai::mojom::WordBox> GetWordsAndSpacesForTesting( // IN-TEST
const std::vector<screen_ai::mojom::WordBoxPtr>& words) {
return GetWordsAndSpaces(words);
}
ScopedFPDFFont CreateFont(FPDF_DOCUMENT document) {
std::vector<uint8_t> cid_to_gid_map(CreateCidToGidMap());
return ScopedFPDFFont(

@ -64,6 +64,8 @@ FS_MATRIX CalculateWordMoveMatrixForTesting(
bool word_is_rtl);
gfx::Rect GetSpaceRectForTesting(const gfx::Rect& rect1,
const gfx::Rect& rect2);
// Exposes GetWordsAndSpaces() to tests: returns the entries of `words` together
// with the space words inserted between them.
std::vector<screen_ai::mojom::WordBox> GetWordsAndSpacesForTesting(
const std::vector<screen_ai::mojom::WordBoxPtr>& words);
} // namespace chrome_pdf

@ -9,6 +9,7 @@
#include "base/numerics/angle_conversions.h"
#include "base/strings/stringprintf.h"
#include "services/screen_ai/public/mojom/screen_ai_service.mojom.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "ui/gfx/geometry/point_f.h"
#include "ui/gfx/geometry/rect.h"
@ -144,15 +145,6 @@ TEST(PdfiumSearchifyTest, CalculateWordMoveMatrix) {
}
TEST(PdfiumSearchifyTest, GetSpaceRect) {
// Empty rects.
{
gfx::Rect before;
gfx::Rect after;
gfx::Rect expected;
EXPECT_EQ(GetSpaceRectForTesting(before, after), expected);
}
// Horizontal, Left to Right.
{
gfx::Rect before(10, 10, 100, 10);
@ -188,6 +180,177 @@ TEST(PdfiumSearchifyTest, GetSpaceRect) {
EXPECT_EQ(GetSpaceRectForTesting(before, after), expected);
}
// Empty rect.
{
gfx::Rect before(10, 10, 10, 10);
gfx::Rect after;
EXPECT_TRUE(GetSpaceRectForTesting(before, after).IsEmpty());
}
// Touching rects.
{
gfx::Rect before(10, 10, 10, 10);
gfx::Rect after(20, 10, 10, 10);
EXPECT_TRUE(GetSpaceRectForTesting(before, after).IsEmpty());
}
// Colliding rects.
{
gfx::Rect before(10, 10, 10, 10);
gfx::Rect after(15, 10, 10, 10);
EXPECT_TRUE(GetSpaceRectForTesting(before, after).IsEmpty());
}
// Covering rects, first in second.
{
gfx::Rect before(10, 10, 10, 10);
gfx::Rect after(0, 0, 20, 20);
EXPECT_TRUE(GetSpaceRectForTesting(before, after).IsEmpty());
}
// Covering rects, second in first.
{
gfx::Rect before(0, 0, 20, 20);
gfx::Rect after(10, 10, 10, 10);
EXPECT_TRUE(GetSpaceRectForTesting(before, after).IsEmpty());
}
}
// Checks which inputs make GetWordsAndSpacesForTesting() add a " " entry
// between two words, and which leave the word list unchanged.
TEST(PdfiumSearchifyTest, AddingSpaceRects) {
  using screen_ai::mojom::WordBox;
  using screen_ai::mojom::WordBoxPtr;

  // Creates a word with the given text, bounding box and trailing-space flag.
  auto make_word = [](const char* text, const gfx::Rect& box,
                      bool space_after) {
    WordBoxPtr created = WordBox::New();
    created->word = text;
    created->bounding_box = box;
    created->has_space_after = space_after;
    return created;
  };

  // No words.
  {
    std::vector<WordBoxPtr> input;
    std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);
    EXPECT_TRUE(output.empty());
  }

  // One word with space after it.
  {
    std::vector<WordBoxPtr> input;
    input.push_back(make_word("word1", gfx::Rect(0, 0, 50, 10), true));

    std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);
    ASSERT_EQ(output.size(), 1u);
    EXPECT_EQ(output[0].word, input[0]->word);
  }

  // Two words with space between them.
  {
    std::vector<WordBoxPtr> input;
    input.push_back(make_word("word1", gfx::Rect(0, 0, 50, 10), true));
    input.push_back(make_word("word2", gfx::Rect(60, 0, 50, 10), false));

    std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);
    ASSERT_EQ(output.size(), 3u);
    EXPECT_EQ(output[0].word, input[0]->word);
    EXPECT_EQ(output[1].word, " ");
    EXPECT_EQ(output[2].word, input[1]->word);
  }

  // Two words with no space between them.
  {
    std::vector<WordBoxPtr> input;
    input.push_back(make_word("word1", gfx::Rect(0, 0, 50, 10), false));
    input.push_back(make_word("word2", gfx::Rect(60, 0, 50, 10), true));

    std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);
    ASSERT_EQ(output.size(), 2u);
    EXPECT_EQ(output[0].word, input[0]->word);
    EXPECT_EQ(output[1].word, input[1]->word);
  }

  // Two words with space between them and touching boundaries.
  {
    std::vector<WordBoxPtr> input;
    input.push_back(make_word("word1", gfx::Rect(0, 0, 50, 10), true));
    input.push_back(make_word("word2", gfx::Rect(50, 0, 50, 10), true));

    std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);
    ASSERT_EQ(output.size(), 2u);
    EXPECT_EQ(output[0].word, input[0]->word);
    EXPECT_EQ(output[1].word, input[1]->word);
  }

  // Two words with space between them, first one with empty bounding box.
  {
    std::vector<WordBoxPtr> input;
    input.push_back(make_word("word1", gfx::Rect(0, 0, 0, 0), true));
    input.push_back(make_word("word2", gfx::Rect(60, 0, 50, 10), false));

    std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);
    ASSERT_EQ(output.size(), 2u);
    EXPECT_EQ(output[0].word, input[0]->word);
    EXPECT_EQ(output[1].word, input[1]->word);
  }

  // Two words with space between them, second one with empty bounding box.
  {
    std::vector<WordBoxPtr> input;
    input.push_back(make_word("word1", gfx::Rect(0, 0, 50, 10), true));
    input.push_back(make_word("word2", gfx::Rect(0, 0, 0, 0), false));

    std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);
    ASSERT_EQ(output.size(), 2u);
    EXPECT_EQ(output[0].word, input[0]->word);
    EXPECT_EQ(output[1].word, input[1]->word);
  }
}
} // namespace chrome_pdf