Add test for PDF Searchify spaces between words.
Additional changes: 1. If the bounding rectangles of two consecutive words collide, no space rect is added between them. 2. Fixes a typo in the size reserved for the `words_and_spaces` vector: it now reserves "2 * word_count - 1" entries instead of "2 * word_count + 1", because at most one space is inserted between each pair of adjacent words, i.e. the number of spaces is at most one less than the number of words. Bug: 360803943 Change-Id: I3e459f4320bc439d0d4532847ec24a44af9c5600 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5964956 Commit-Queue: Ramin Halavati <rhalavati@chromium.org> Reviewed-by: Lei Zhang <thestig@chromium.org> Reviewed-by: Chu-Hsuan Yang <chuhsuan@chromium.org> Cr-Commit-Position: refs/heads/main@{#1375056}
This commit is contained in:

committed by
Chromium LUCI CQ

parent
2674df58cd
commit
a9d95bfc55
@ -173,6 +173,12 @@ gfx::Rect GetSpaceRect(const gfx::Rect& rect1, const gfx::Rect& rect2) {
|
||||
return gfx::Rect();
|
||||
}
|
||||
|
||||
// Return empty if the two rects overlap or touch; there is no room for a space.
|
||||
gfx::Rect r1 = rect1;
|
||||
if (r1.InclusiveIntersect(rect2)) {
|
||||
return gfx::Rect();
|
||||
}
|
||||
|
||||
// Compute the angle of text flow from `rect1` to `rect2`, to decide where the
|
||||
// space rectangle should be.
|
||||
gfx::Vector2dF vec(rect2.CenterPoint() - rect1.CenterPoint());
|
||||
@ -219,6 +225,37 @@ gfx::Rect GetSpaceRect(const gfx::Rect& rect1, const gfx::Rect& rect2) {
|
||||
return gfx::Rect(x, y, width, height);
|
||||
}
|
||||
|
||||
// If OCR has recognized a space character between two consecutive words,
|
||||
// inserts a new word between them to represent it, and returns the vector of
|
||||
// words and spaces.
|
||||
std::vector<screen_ai::mojom::WordBox> GetWordsAndSpaces(
|
||||
const std::vector<screen_ai::mojom::WordBoxPtr>& words) {
|
||||
std::vector<screen_ai::mojom::WordBox> words_and_spaces;
|
||||
|
||||
size_t original_word_count = words.size();
|
||||
if (original_word_count) {
|
||||
words_and_spaces.reserve(original_word_count * 2 - 1);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < original_word_count; i++) {
|
||||
auto& current_word = words[i];
|
||||
words_and_spaces.push_back(*current_word);
|
||||
if (current_word->has_space_after && i + 1 < original_word_count) {
|
||||
gfx::Rect space_rect =
|
||||
GetSpaceRect(current_word->bounding_box, words[i + 1]->bounding_box);
|
||||
if (!space_rect.IsEmpty()) {
|
||||
words_and_spaces.push_back(screen_ai::mojom::WordBox(
|
||||
/*word=*/" ", /*dictionary_word=*/false, current_word->language,
|
||||
/*has_space_after=*/false, space_rect,
|
||||
current_word->bounding_box_angle, current_word->direction,
|
||||
/*confidence=*/1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return words_and_spaces;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
std::vector<uint8_t> PDFiumSearchify(
|
||||
@ -322,27 +359,8 @@ std::vector<FPDF_PAGEOBJECT> AddTextOnImage(
|
||||
ConvertToPdfOrigin(line->baseline_box, line->baseline_box_angle,
|
||||
image_pixel_size.height());
|
||||
|
||||
// If OCR has recognized a space character between two consecutive words,
|
||||
// insert a new word between them to represent it.
|
||||
size_t original_word_count = line->words.size();
|
||||
std::vector<screen_ai::mojom::WordBox> words_and_spaces;
|
||||
words_and_spaces.reserve(original_word_count * 2 + 1);
|
||||
|
||||
for (size_t i = 0; i < original_word_count; i++) {
|
||||
auto& current_word = line->words[i];
|
||||
words_and_spaces.push_back(*current_word);
|
||||
if (current_word->has_space_after && i + 1 < original_word_count) {
|
||||
gfx::Rect space_rect = GetSpaceRect(current_word->bounding_box,
|
||||
line->words[i + 1]->bounding_box);
|
||||
if (!space_rect.IsEmpty()) {
|
||||
words_and_spaces.push_back(screen_ai::mojom::WordBox(
|
||||
/*word=*/" ", /*dictionary_word=*/false, current_word->language,
|
||||
/*has_space_after=*/false, space_rect,
|
||||
current_word->bounding_box_angle, current_word->direction,
|
||||
/*confidence=*/1));
|
||||
}
|
||||
}
|
||||
}
|
||||
std::vector<screen_ai::mojom::WordBox> words_and_spaces =
|
||||
GetWordsAndSpaces(line->words);
|
||||
|
||||
for (const auto& word : words_and_spaces) {
|
||||
if (word.bounding_box.IsEmpty()) {
|
||||
@ -383,6 +401,11 @@ gfx::Rect GetSpaceRectForTesting(const gfx::Rect& rect1, // IN-TEST
|
||||
return GetSpaceRect(rect1, rect2);
|
||||
}
|
||||
|
||||
std::vector<screen_ai::mojom::WordBox> GetWordsAndSpacesForTesting( // IN-TEST
|
||||
const std::vector<screen_ai::mojom::WordBoxPtr>& words) {
|
||||
return GetWordsAndSpaces(words);
|
||||
}
|
||||
|
||||
ScopedFPDFFont CreateFont(FPDF_DOCUMENT document) {
|
||||
std::vector<uint8_t> cid_to_gid_map(CreateCidToGidMap());
|
||||
return ScopedFPDFFont(
|
||||
|
@ -64,6 +64,8 @@ FS_MATRIX CalculateWordMoveMatrixForTesting(
|
||||
bool word_is_rtl);
|
||||
gfx::Rect GetSpaceRectForTesting(const gfx::Rect& rect1,
|
||||
const gfx::Rect& rect2);
|
||||
// Test-only wrapper around the file-local GetWordsAndSpaces() helper in the
// implementation file. Returns copies of `words` with " " word boxes inserted
// between consecutive words where a space was recognized.
std::vector<screen_ai::mojom::WordBox> GetWordsAndSpacesForTesting(
|
||||
const std::vector<screen_ai::mojom::WordBoxPtr>& words);
|
||||
|
||||
} // namespace chrome_pdf
|
||||
|
||||
|
@ -9,6 +9,7 @@
|
||||
|
||||
#include "base/numerics/angle_conversions.h"
|
||||
#include "base/strings/stringprintf.h"
|
||||
#include "services/screen_ai/public/mojom/screen_ai_service.mojom.h"
|
||||
#include "testing/gtest/include/gtest/gtest.h"
|
||||
#include "ui/gfx/geometry/point_f.h"
|
||||
#include "ui/gfx/geometry/rect.h"
|
||||
@ -144,15 +145,6 @@ TEST(PdfiumSearchifyTest, CalculateWordMoveMatrix) {
|
||||
}
|
||||
|
||||
TEST(PdfiumSearchifyTest, GetSpaceRect) {
|
||||
// Empty rects.
|
||||
{
|
||||
gfx::Rect before;
|
||||
gfx::Rect after;
|
||||
gfx::Rect expected;
|
||||
|
||||
EXPECT_EQ(GetSpaceRectForTesting(before, after), expected);
|
||||
}
|
||||
|
||||
// Horizontal, Left to Right.
|
||||
{
|
||||
gfx::Rect before(10, 10, 100, 10);
|
||||
@ -188,6 +180,177 @@ TEST(PdfiumSearchifyTest, GetSpaceRect) {
|
||||
|
||||
EXPECT_EQ(GetSpaceRectForTesting(before, after), expected);
|
||||
}
|
||||
|
||||
// Empty rect.
|
||||
{
|
||||
gfx::Rect before(10, 10, 10, 10);
|
||||
gfx::Rect after;
|
||||
|
||||
EXPECT_TRUE(GetSpaceRectForTesting(before, after).IsEmpty());
|
||||
}
|
||||
|
||||
// Touching rects.
|
||||
{
|
||||
gfx::Rect before(10, 10, 10, 10);
|
||||
gfx::Rect after(20, 10, 10, 10);
|
||||
|
||||
EXPECT_TRUE(GetSpaceRectForTesting(before, after).IsEmpty());
|
||||
}
|
||||
|
||||
// Colliding rects.
|
||||
{
|
||||
gfx::Rect before(10, 10, 10, 10);
|
||||
gfx::Rect after(15, 10, 10, 10);
|
||||
|
||||
EXPECT_TRUE(GetSpaceRectForTesting(before, after).IsEmpty());
|
||||
}
|
||||
|
||||
// Covering rects, first in second.
|
||||
{
|
||||
gfx::Rect before(10, 10, 10, 10);
|
||||
gfx::Rect after(0, 0, 20, 20);
|
||||
|
||||
EXPECT_TRUE(GetSpaceRectForTesting(before, after).IsEmpty());
|
||||
}
|
||||
|
||||
// Covering rects, second in first.
|
||||
{
|
||||
gfx::Rect before(0, 0, 20, 20);
|
||||
gfx::Rect after(10, 10, 10, 10);
|
||||
|
||||
EXPECT_TRUE(GetSpaceRectForTesting(before, after).IsEmpty());
|
||||
}
|
||||
}
|
||||
|
||||
// Checks how GetWordsAndSpacesForTesting() interleaves " " boxes with the
// input words for empty input, a single word, separated words, touching
// words, and words with empty bounding boxes.
TEST(PdfiumSearchifyTest, AddingSpaceRects) {
  using screen_ai::mojom::WordBox;
  using screen_ai::mojom::WordBoxPtr;

  // Builds a word box with only the fields this test cares about populated.
  auto make_word = [](const char* text, const gfx::Rect& box,
                      bool space_after) {
    WordBoxPtr created = WordBox::New();
    created->word = text;
    created->bounding_box = box;
    created->has_space_after = space_after;
    return created;
  };

  // No words.
  {
    std::vector<WordBoxPtr> input;

    const std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);

    EXPECT_TRUE(output.empty());
  }

  // One word with space after it.
  {
    std::vector<WordBoxPtr> input;
    input.emplace_back(make_word("word1", gfx::Rect(0, 0, 50, 10), true));

    const std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);

    ASSERT_EQ(output.size(), 1u);
    EXPECT_EQ(output[0].word, input[0]->word);
  }

  // Two words with space between them.
  {
    std::vector<WordBoxPtr> input;
    input.emplace_back(make_word("word1", gfx::Rect(0, 0, 50, 10), true));
    input.emplace_back(make_word("word2", gfx::Rect(60, 0, 50, 10), false));

    const std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);

    ASSERT_EQ(output.size(), 3u);
    EXPECT_EQ(output[0].word, input[0]->word);
    EXPECT_EQ(output[1].word, " ");
    EXPECT_EQ(output[2].word, input[1]->word);
  }

  // Two words with no space between them.
  {
    std::vector<WordBoxPtr> input;
    input.emplace_back(make_word("word1", gfx::Rect(0, 0, 50, 10), false));
    input.emplace_back(make_word("word2", gfx::Rect(60, 0, 50, 10), true));

    const std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);

    ASSERT_EQ(output.size(), 2u);
    EXPECT_EQ(output[0].word, input[0]->word);
    EXPECT_EQ(output[1].word, input[1]->word);
  }

  // Two words with space between them and touching boundaries.
  {
    std::vector<WordBoxPtr> input;
    input.emplace_back(make_word("word1", gfx::Rect(0, 0, 50, 10), true));
    input.emplace_back(make_word("word2", gfx::Rect(50, 0, 50, 10), true));

    const std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);

    ASSERT_EQ(output.size(), 2u);
    EXPECT_EQ(output[0].word, input[0]->word);
    EXPECT_EQ(output[1].word, input[1]->word);
  }

  // Two words with space between them, first one with empty bounding box.
  {
    std::vector<WordBoxPtr> input;
    input.emplace_back(make_word("word1", gfx::Rect(0, 0, 0, 0), true));
    input.emplace_back(make_word("word2", gfx::Rect(60, 0, 50, 10), false));

    const std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);

    ASSERT_EQ(output.size(), 2u);
    EXPECT_EQ(output[0].word, input[0]->word);
    EXPECT_EQ(output[1].word, input[1]->word);
  }

  // Two words with space between them, second one with empty bounding box.
  {
    std::vector<WordBoxPtr> input;
    input.emplace_back(make_word("word1", gfx::Rect(0, 0, 50, 10), true));
    input.emplace_back(make_word("word2", gfx::Rect(0, 0, 0, 0), false));

    const std::vector<WordBox> output = GetWordsAndSpacesForTesting(input);

    ASSERT_EQ(output.size(), 2u);
    EXPECT_EQ(output[0].word, input[0]->word);
    EXPECT_EQ(output[1].word, input[1]->word);
  }
}
|
||||
|
||||
} // namespace chrome_pdf
|
||||
|
Reference in New Issue
Block a user