0

[lensoverlay] Support parsing and searching for text fragments in PDF.

Rendering the highlights will come in a follow-up CL.

Change-Id: I76ca388bd11cce8a3ddd6cd04474d0157fecd909
Bug: 383575917
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6173296
Commit-Queue: Juan Mojica <juanmojica@google.com>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1413612}
This commit is contained in:
Juan Mojica
2025-01-30 10:55:44 -08:00
committed by Chromium LUCI CQ
parent cc36da063d
commit 6c765ca6f1
9 changed files with 826 additions and 18 deletions

@ -139,6 +139,8 @@ if (enable_pdf) {
"pdfium/pdfium_print.h",
"pdfium/pdfium_range.cc",
"pdfium/pdfium_range.h",
"pdfium/pdfium_text_fragment_finder.cc",
"pdfium/pdfium_text_fragment_finder.h",
"pdfium/pdfium_unsupported_features.cc",
"pdfium/pdfium_unsupported_features.h",
"preview_mode_client.cc",
@ -444,6 +446,7 @@ if (enable_pdf) {
"pdfium/pdfium_test_base.h",
"pdfium/pdfium_test_helpers.cc",
"pdfium/pdfium_test_helpers.h",
"pdfium/pdfium_text_fragment_finder_unittest.cc",
"test/run_all_unittests.cc",
"ui/document_properties_unittest.cc",
"ui/file_name_unittest.cc",

@ -1,5 +1,6 @@
include_rules = [
"+components/services/font/public/cpp",
"+components/shared_highlighting/core",
"+skia/ext/font_utils.h",
"+third_party/pdfium/public",
"+third_party/re2/src/re2/re2.h",

@ -58,6 +58,7 @@
#include "pdf/pdfium/pdfium_document_metadata.h"
#include "pdf/pdfium/pdfium_mem_buffer_file_write.h"
#include "pdf/pdfium/pdfium_permissions.h"
#include "pdf/pdfium/pdfium_text_fragment_finder.h"
#include "pdf/pdfium/pdfium_unsupported_features.h"
#include "printing/mojom/print.mojom-shared.h"
#include "printing/units.h"
@ -999,8 +1000,22 @@ void PDFiumEngine::SetFormHighlight(bool enable_form) {
void PDFiumEngine::HighlightTextFragments(
base::span<const std::string> text_fragments) {
// TODO(crbug.com/383575917): Implement parsing, searching, and rendering of
// text fragments.
// TODO(crbug.com/383575917): Add support for rendering text fragment
// highlights.
PDFiumTextFragmentFinder text_fragment_finder(this);
text_fragment_finder.FindTextFragments(text_fragments);
}
// Runs a text search for `term` that is confined to the single page
// `page_to_search`, handing every match to `add_result_callback`. The search
// is forwarded to SearchUsingICU() as a case-insensitive, non-first search
// whose current page and last page are both `page_to_search`.
void PDFiumEngine::SearchForFragment(
    const std::u16string& term,
    int character_to_start_searching_from,
    int last_character_index_to_search,
    int page_to_search,
    AddSearchResultCallback add_result_callback) {
  constexpr bool kCaseSensitive = false;
  constexpr bool kFirstSearch = false;
  SearchUsingICU(term, kCaseSensitive, kFirstSearch,
                 character_to_start_searching_from,
                 last_character_index_to_search,
                 /*current_page=*/page_to_search,
                 /*last_page_to_search=*/page_to_search,
                 std::move(add_result_callback));
}
void PDFiumEngine::ClearTextSelection() {
@ -1888,8 +1903,11 @@ void PDFiumEngine::StartFind(const std::u16string& text, bool case_sensitive) {
SearchUsingPDFium(text, case_sensitive, first_search,
character_to_start_searching_from, current_page);
} else {
SearchUsingICU(text, case_sensitive, first_search,
character_to_start_searching_from, current_page);
SearchUsingICU(
text, case_sensitive, first_search, character_to_start_searching_from,
last_character_index_to_search_, current_page, last_page_to_search_,
base::BindRepeating(&PDFiumEngine::AddFindResult,
weak_factory_.GetWeakPtr()));
}
if (!IsPageVisible(current_page))
@ -1970,7 +1988,10 @@ void PDFiumEngine::SearchUsingICU(const std::u16string& term,
bool case_sensitive,
bool first_search,
int character_to_start_searching_from,
int current_page) {
int last_character_index_to_search,
int current_page,
int last_page_to_search,
AddSearchResultCallback add_result_callback) {
DCHECK(!term.empty());
// Various types of quotation marks need to be converted to the simple ASCII
@ -1983,9 +2004,9 @@ void PDFiumEngine::SearchUsingICU(const std::u16string& term,
int text_length = original_text_length;
if (character_to_start_searching_from) {
text_length -= character_to_start_searching_from;
} else if (!first_search && last_character_index_to_search_ != -1 &&
current_page == last_page_to_search_) {
text_length = last_character_index_to_search_;
} else if (!first_search && last_character_index_to_search != -1 &&
current_page == last_page_to_search) {
text_length = last_character_index_to_search;
}
if (text_length <= 0)
return;
@ -2093,7 +2114,8 @@ void PDFiumEngine::SearchUsingICU(const std::u16string& term,
DCHECK_LT(start, end);
DCHECK_EQ(term.size() + term_removed_count,
static_cast<size_t>(end - start));
AddFindResult(PDFiumRange(pages_[current_page].get(), start, end - start));
add_result_callback.Run(
PDFiumRange(pages_[current_page].get(), start, end - start));
}
}

@ -130,6 +130,7 @@ void InitializeSDK(bool enable_v8,
void ShutdownSDK();
using SendThumbnailCallback = base::OnceCallback<void(Thumbnail)>;
using AddSearchResultCallback = base::RepeatingCallback<void(PDFiumRange)>;
// This class implements a PDF rendering engine using the PDFium library.
//
@ -502,6 +503,18 @@ class PDFiumEngine : public DocumentLoader::Client, public IFSDK_PAUSE {
virtual void HighlightTextFragments(
base::span<const std::string> text_fragments);
// Searches for a text fragment within the text of the PDF.
void SearchForFragment(const std::u16string& term,
int character_to_start_searching_from,
int last_character_index_to_search,
int page_to_search,
AddSearchResultCallback add_result_callback);
// Returns all the pages in the PDF. The result is a const reference to the
// engine's own `pages_` vector rather than a copy.
const std::vector<std::unique_ptr<PDFiumPage>>& pages() const {
return pages_;
}
private:
// This helper class is used to detect the difference in selection between
// construction and destruction. At destruction, it invalidates all the
@ -687,7 +700,10 @@ class PDFiumEngine : public DocumentLoader::Client, public IFSDK_PAUSE {
bool case_sensitive,
bool first_search,
int character_to_start_searching_from,
int current_page);
int last_character_index_to_search,
int current_page,
int last_page_to_search,
AddSearchResultCallback add_result_callback);
// Input event handlers.
bool OnMouseDown(const blink::WebMouseEvent& event);

@ -0,0 +1,336 @@
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "pdf/pdfium/pdfium_text_fragment_finder.h"
#include <optional>
#include <string>
#include <vector>
#include "base/containers/span.h"
#include "base/functional/bind.h"
#include "base/memory/raw_ptr.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "components/shared_highlighting/core/common/text_fragment.h"
#include "pdf/pdfium/pdfium_engine.h"
#include "pdf/pdfium/pdfium_range.h"
namespace chrome_pdf {
namespace {
// Adds the `prefix_result` to the list of prefixes found in the PDF.
void AddTextFragmentPrefixResult(
std::vector<PDFiumRange>& text_fragment_prefixes,
PDFiumRange prefix_result) {
text_fragment_prefixes.emplace_back(std::move(prefix_result));
}
// Search callback for suffix candidates. Stores `suffix_result` in
// `text_fragment_suffix` when only whitespace (or nothing at all) lies between
// the end of `before_suffix_range` and the start of the candidate. Once a
// suffix has been stored, every later candidate is ignored.
void AddTextFragmentSuffixResult(
    PDFiumEngine* engine,
    std::optional<PDFiumRange>& text_fragment_suffix,
    const PDFiumRange& before_suffix_range,
    PDFiumRange suffix_result) {
  // TODO(crbug.com/393166468): Modify TextSearch() to return one result rather
  // than all of them.
  if (text_fragment_suffix.has_value()) {
    return;
  }

  // The gap runs from the first character after `before_suffix_range` up to,
  // but not including, the first character of the candidate suffix.
  const int gap_start =
      before_suffix_range.char_index() + before_suffix_range.char_count();
  const int gap_length = suffix_result.char_index() - gap_start;
  const PDFiumRange gap(engine->pages()[before_suffix_range.page_index()].get(),
                        gap_start, gap_length);

  // Any non-whitespace character in the gap disqualifies this candidate.
  const auto gap_text = gap.GetText();
  for (const auto gap_char : gap_text) {
    if (!base::IsUnicodeWhitespace(gap_char)) {
      return;
    }
  }

  text_fragment_suffix = std::move(suffix_result);
}
// Searches for the fragment's suffix on the page of `end_range`, starting at
// the first character after `end_range`. Returns the accepted suffix range, or
// a null optional when no candidate was accepted by
// AddTextFragmentSuffixResult().
std::optional<PDFiumRange> FindTextFragmentSuffix(
    PDFiumEngine* engine,
    const shared_highlighting::TextFragment& fragment,
    const PDFiumRange& end_range) {
  std::optional<PDFiumRange> found_suffix;

  const int first_char_after_range =
      end_range.char_index() + end_range.char_count();
  // The callback writes into `found_suffix` by reference; the result is read
  // right after SearchForFragment() returns.
  auto on_suffix_candidate =
      base::BindRepeating(&AddTextFragmentSuffixResult, engine,
                          std::ref(found_suffix), std::ref(end_range));

  engine->SearchForFragment(
      base::UTF8ToUTF16(fragment.suffix()),
      /*character_to_start_searching_from=*/first_char_after_range,
      /*last_character_index_to_search=*/-1,
      /*page_to_search=*/end_range.page_index(),
      std::move(on_suffix_candidate));

  return found_suffix;
}
// Search callback for `text_start` candidates. Appends `start_result` to
// `text_fragment_starts` when it satisfies the parts of the fragment that
// surround it:
//  - If `prefix_range` is set, only whitespace (or nothing) may lie between
//    the end of the prefix and the start of the candidate.
//  - If the fragment has a suffix but no `text_end`, the suffix must be found
//    after the candidate; the suffix found is stored in
//    `text_fragment_suffix`.
// A candidate that fails either check is dropped.
void AddTextFragmentStartResult(
    PDFiumEngine* engine,
    std::vector<PDFiumRange>& text_fragment_starts,
    std::optional<PDFiumRange>& text_fragment_suffix,
    const shared_highlighting::TextFragment& fragment,
    std::optional<const PDFiumRange> prefix_range,
    PDFiumRange start_result) {
  if (prefix_range.has_value()) {
    // Inspect the characters between the prefix and the candidate start.
    const int gap_start =
        prefix_range->char_index() + prefix_range->char_count();
    const int gap_length = start_result.char_index() - gap_start;
    const PDFiumRange gap(engine->pages()[start_result.page_index()].get(),
                          gap_start, gap_length);
    const auto gap_text = gap.GetText();
    for (const auto gap_char : gap_text) {
      if (!base::IsUnicodeWhitespace(gap_char)) {
        return;
      }
    }
  }

  // When a `text_end` exists the suffix is checked against the end instead, so
  // it is only checked here when the start is also the end of the highlight.
  const bool suffix_follows_start =
      fragment.text_end().empty() && !fragment.suffix().empty();
  if (suffix_follows_start) {
    text_fragment_suffix =
        FindTextFragmentSuffix(engine, fragment, start_result);
    if (!text_fragment_suffix.has_value()) {
      return;
    }
  }

  text_fragment_starts.push_back(std::move(start_result));
}
// Search callback for `text_end` candidates. Stores `end_result` in
// `text_fragment_end` unless an end has already been stored. When the fragment
// has a suffix, the candidate is only accepted if the suffix is found after
// it; that suffix is stored in `text_fragment_suffix`.
void AddTextFragmentEndResult(PDFiumEngine* engine,
                              std::optional<PDFiumRange>& text_fragment_end,
                              std::optional<PDFiumRange>& text_fragment_suffix,
                              const shared_highlighting::TextFragment& fragment,
                              PDFiumRange end_result) {
  // The first accepted end wins; later candidates are ignored.
  if (text_fragment_end.has_value()) {
    return;
  }

  const bool has_suffix = !fragment.suffix().empty();
  if (has_suffix) {
    text_fragment_suffix = FindTextFragmentSuffix(engine, fragment, end_result);
    if (!text_fragment_suffix.has_value()) {
      return;
    }
  }

  text_fragment_end = std::move(end_result);
}
} // namespace
// Stores `engine` as the engine whose pages will be searched. The pointer is
// kept as-is; no search is started here.
PDFiumTextFragmentFinder::PDFiumTextFragmentFinder(PDFiumEngine* engine)
: engine_(engine) {}
// Nothing beyond member destruction is required.
PDFiumTextFragmentFinder::~PDFiumTextFragmentFinder() = default;
// Parses each entry of `text_fragments` with
// shared_highlighting::TextFragment::FromEscapedString() and searches the PDF
// for it. Returns the ranges to highlight, in the order the fragments were
// given; a fragment that is not found contributes no range.
//
// Entries that fail to parse are skipped instead of terminating the process:
// a malformed fragment string must not take the PDF viewer down with it.
std::vector<PDFiumRange> PDFiumTextFragmentFinder::FindTextFragments(
    base::span<const std::string> text_fragments) {
  // Drop anything left over from a previous call.
  text_fragment_highlights_.clear();

  for (const std::string& fragment : text_fragments) {
    const auto text_fragment =
        shared_highlighting::TextFragment::FromEscapedString(fragment);
    if (!text_fragment.has_value()) {
      continue;
    }
    StartTextFragmentSearch(text_fragment.value());
  }

  // `text_fragment_highlights_` is a member, so it must be moved explicitly;
  // it is cleared again at the start of the next call.
  return std::move(text_fragment_highlights_);
}
// Resets all per-fragment search state and starts searching for `fragment`.
// The search begins with the prefix when the fragment has one, and with the
// `text_start` value otherwise.
void PDFiumTextFragmentFinder::StartTextFragmentSearch(
    const shared_highlighting::TextFragment& fragment) {
  // Forget everything learned while searching for the previous fragment.
  last_unsearched_page_ = 0;
  text_fragment_prefixes_.clear();
  text_fragment_starts_.clear();
  text_fragment_end_.reset();
  text_fragment_suffix_.reset();

  // `engine_` has no page information yet (i.e. this was called before the
  // first call to LoadDocument happened), so there is nothing to search.
  if (engine_->pages().empty()) {
    return;
  }

  // Without a prefix, begin at the text start value, which is a required part
  // of the fragment.
  if (fragment.prefix().empty()) {
    FindTextFragmentStart(fragment);
    return;
  }

  FindTextFragmentPrefix(fragment, /*page_to_start_search_from=*/0);
}
// Searches page by page, beginning at `page_to_start_search_from`, for the
// fragment's prefix. The scan stops at the first page that contains at least
// one occurrence; the search then continues with FindTextFragmentStart().
// `last_unsearched_page_` is advanced so that a later retry resumes on the
// page after the one scanned last.
void PDFiumTextFragmentFinder::FindTextFragmentPrefix(
    const shared_highlighting::TextFragment& fragment,
    int page_to_start_search_from) {
  text_fragment_prefixes_.clear();
  const std::u16string prefix_term = base::UTF8ToUTF16(fragment.prefix());

  int page = page_to_start_search_from;
  while (page < static_cast<int>(engine_->pages().size())) {
    last_unsearched_page_ = page + 1;
    engine_->SearchForFragment(
        prefix_term,
        /*character_to_start_searching_from=*/0,
        /*last_character_index_to_search=*/-1, /*page_to_search=*/page,
        base::BindRepeating(&AddTextFragmentPrefixResult,
                            std::ref(text_fragment_prefixes_)));

    if (!text_fragment_prefixes_.empty()) {
      FindTextFragmentStart(fragment);
      return;
    }
    ++page;
  }
}
// Searches for the fragment's `text_start` value and, when at least one
// acceptable occurrence is found, continues with FindTextFragmentEnd().
//
// With no recorded prefixes, every page is searched. With recorded prefixes,
// the search runs once per prefix on that prefix's page, and the candidates
// are filtered by AddTextFragmentStartResult(). If no prefix leads to a start,
// the prefix search is retried from `last_unsearched_page_`.
void PDFiumTextFragmentFinder::FindTextFragmentStart(
    const shared_highlighting::TextFragment& fragment) {
  text_fragment_starts_.clear();
  const std::u16string start_term = base::UTF8ToUTF16(fragment.text_start());

  // An empty prefix list means the fragment did not ask for a prefix, so the
  // start term alone decides which occurrences are candidates.
  if (text_fragment_prefixes_.empty()) {
    int page = 0;
    while (page < static_cast<int>(engine_->pages().size())) {
      engine_->SearchForFragment(
          start_term,
          /*character_to_start_searching_from=*/0,
          /*last_character_index_to_search=*/-1, /*page_to_search=*/page,
          base::BindRepeating(&AddTextFragmentStartResult, engine_,
                              std::ref(text_fragment_starts_),
                              std::ref(text_fragment_suffix_), fragment,
                              std::nullopt));
      ++page;
    }

    // With no candidates the text fragment does not exist in this PDF and the
    // search is over. Otherwise move on to the text end; when the fragment has
    // no text end value, FindTextFragmentEnd() concludes the search.
    if (!text_fragment_starts_.empty()) {
      FindTextFragmentEnd(fragment);
    }
    return;
  }

  // Try each recorded prefix in turn. The search for the start term begins at
  // the prefix's first character, on the prefix's page.
  for (const PDFiumRange& prefix_range : text_fragment_prefixes_) {
    engine_->SearchForFragment(
        start_term,
        /*character_to_start_searching_from=*/prefix_range.char_index(),
        /*last_character_index_to_search=*/-1,
        /*page_to_search=*/prefix_range.page_index(),
        base::BindRepeating(&AddTextFragmentStartResult, engine_,
                            std::ref(text_fragment_starts_),
                            std::ref(text_fragment_suffix_), fragment,
                            prefix_range));

    // The first prefix that yields a start hands over to the text end search,
    // which concludes the search itself when there is no text end value.
    if (!text_fragment_starts_.empty()) {
      FindTextFragmentEnd(fragment);
      return;
    }
  }

  // None of the prefixes on this page was followed by the start term. The
  // fragment may still be on a page that has not been searched yet, so look
  // for the prefix again from there.
  if (text_fragment_starts_.empty() && !fragment.prefix().empty() &&
      last_unsearched_page_ < static_cast<int>(engine_->pages().size())) {
    FindTextFragmentPrefix(fragment, last_unsearched_page_);
  }
}
// Searches for the fragment's `text_end` value after each recorded start, in
// order, and finishes the search as soon as one start yields an accepted end.
// A fragment without a `text_end` value finishes immediately. When no end is
// found and the fragment has a prefix, the prefix search is retried from
// `last_unsearched_page_`.
void PDFiumTextFragmentFinder::FindTextFragmentEnd(
    const shared_highlighting::TextFragment& fragment) {
  if (fragment.text_end().empty()) {
    FinishTextFragmentSearch();
    return;
  }

  text_fragment_end_.reset();
  const std::u16string end_term = base::UTF8ToUTF16(fragment.text_end());

  for (const PDFiumRange& start_range : text_fragment_starts_) {
    // The end is looked for on the start's page, beginning with the first
    // character after the start.
    const int first_char_after_start =
        start_range.char_index() + start_range.char_count();
    engine_->SearchForFragment(
        end_term,
        /*character_to_start_searching_from=*/first_char_after_start,
        /*last_character_index_to_search=*/-1,
        /*page_to_search=*/start_range.page_index(),
        base::BindRepeating(&AddTextFragmentEndResult, engine_,
                            std::ref(text_fragment_end_),
                            std::ref(text_fragment_suffix_), fragment));

    if (text_fragment_end_.has_value()) {
      FinishTextFragmentSearch();
      return;
    }
  }

  // No start was followed by the text end. If the fragment has a prefix, the
  // fragment may be on a page that has not been searched yet.
  if (!text_fragment_end_ && !fragment.prefix().empty() &&
      last_unsearched_page_ < static_cast<int>(engine_->pages().size())) {
    FindTextFragmentPrefix(fragment, last_unsearched_page_);
  }
}
void PDFiumTextFragmentFinder::FinishTextFragmentSearch() {
if (text_fragment_starts_.empty()) {
return;
}
PDFiumRange highlight = text_fragment_starts_[0];
if (text_fragment_end_) {
// The search for `text_fragment_end_` always starts at the character
// index that would be represented by `highlight.char_index() +
// highlight.char_count()`. Because of this
// `text_fragment_end_->char_index()` should always be greater than
// `highlight.char_index()`.
CHECK_GT(text_fragment_end_->char_index(), highlight.char_index());
base::CheckedNumeric<int> new_char_count = text_fragment_end_->char_index();
new_char_count -= highlight.char_index();
new_char_count += text_fragment_end_->char_count();
highlight.SetCharCount(new_char_count.ValueOrDie());
}
text_fragment_highlights_.emplace_back(std::move(highlight));
}
} // namespace chrome_pdf

@ -0,0 +1,84 @@
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef PDF_PDFIUM_PDFIUM_TEXT_FRAGMENT_FINDER_H_
#define PDF_PDFIUM_PDFIUM_TEXT_FRAGMENT_FINDER_H_
#include <optional>
#include <string>
#include <vector>
#include "base/containers/span.h"
#include "base/memory/raw_ptr.h"
#include "pdf/pdfium/pdfium_range.h"
namespace shared_highlighting {
class TextFragment;
} // namespace shared_highlighting
namespace chrome_pdf {
class PDFiumEngine;
// Locates URL text fragments (prefix, text start, text end and suffix parts)
// within the text of a PDF loaded in a PDFiumEngine and reports the ranges
// that should be highlighted.
class PDFiumTextFragmentFinder {
public:
// `engine` is the engine whose pages are searched.
explicit PDFiumTextFragmentFinder(PDFiumEngine* engine);
PDFiumTextFragmentFinder(const PDFiumTextFragmentFinder&) = delete;
PDFiumTextFragmentFinder& operator=(const PDFiumTextFragmentFinder&) = delete;
~PDFiumTextFragmentFinder();
// Finds the text fragments ranges to highlight. The strings passed to this
// function must still be in their percent-escaped form (i.e. they must not
// have been unescaped yet), otherwise some specific text fragments will fail
// to parse correctly. For example, `foo%2C-,bar` will parse, whereas its
// unescaped form `foo,-,bar` will fail to parse. When finished searching, the
// highlights will be returned as a list of ranges.
std::vector<PDFiumRange> FindTextFragments(
base::span<const std::string> text_fragments);
private:
// Starts a text fragment search with the values provided in `fragment`.
// Clears the state left behind by the search for any previous fragment.
void StartTextFragmentSearch(
const shared_highlighting::TextFragment& fragment);
// Executes the search of the fragment prefix starting from
// `page_to_start_search_from`. Should not be called if the prefix does not
// exist in `fragment`.
void FindTextFragmentPrefix(const shared_highlighting::TextFragment& fragment,
int page_to_start_search_from);
// Executes the search of the fragment text start value. Takes into
// consideration any prefixes or suffixes that should come before or after.
void FindTextFragmentStart(const shared_highlighting::TextFragment& fragment);
// Executes the search of the fragment text end value. Takes into
// consideration the suffix if it exists.
void FindTextFragmentEnd(const shared_highlighting::TextFragment& fragment);
// Finishes a text fragment search by adding the range to highlight to
// `text_fragment_highlights_`.
void FinishTextFragmentSearch();
// The last unsearched page during a text fragment search, i.e. the page a
// retried prefix search resumes from.
int last_unsearched_page_ = 0;
// The list of text fragment prefixes found within the PDF if the prefix
// exists.
std::vector<PDFiumRange> text_fragment_prefixes_;
// The text fragment start values found within the PDF.
std::vector<PDFiumRange> text_fragment_starts_;
// The text fragment end value if it exists and is found within the
// PDF.
std::optional<PDFiumRange> text_fragment_end_;
// The suffix of the text fragment if it exists and is found within the PDF.
std::optional<PDFiumRange> text_fragment_suffix_;
// The ranges to highlight, accumulated across the fragments of one
// FindTextFragments() call and handed to the caller when it returns.
// NOTE(review): the original comment said this does not include text within
// form text areas -- confirm against the search implementation.
std::vector<PDFiumRange> text_fragment_highlights_;
// The engine whose pages are searched. Stored as a non-owning pointer.
// NOTE(review): presumably the engine must outlive this object -- confirm
// against callers.
const raw_ptr<PDFiumEngine> engine_;
};
} // namespace chrome_pdf
#endif // PDF_PDFIUM_PDFIUM_TEXT_FRAGMENT_FINDER_H_

@ -0,0 +1,346 @@
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "pdf/pdfium/pdfium_text_fragment_finder.h"
#include <string>
#include "base/containers/span.h"
#include "pdf/pdfium/pdfium_engine.h"
#include "pdf/pdfium/pdfium_test_base.h"
#include "pdf/test/test_client.h"
#include "pdf/text_search.h"
#include "testing/gmock/include/gmock/gmock.h"
namespace chrome_pdf {
namespace {
using ::testing::NiceMock;
class SearchStringTestClient : public TestClient {
public:
std::vector<SearchStringResult> SearchString(const std::u16string& needle,
const std::u16string& haystack,
bool case_sensitive) override {
EXPECT_FALSE(needle.empty());
EXPECT_FALSE(haystack.empty());
return TextSearch(/*needle=*/needle, /*haystack=*/haystack, case_sensitive);
}
};
} // namespace
using PDFiumTextFragmentFinderTest = PDFiumTestBase;
// A fragment made of nothing but a text start value yields one highlight.
TEST_P(PDFiumTextFragmentFinderTest, OnlyTextStart) {
  static constexpr char16_t kWantText[] = u"Google";
  const int want_length =
      static_cast<int>(base::span_from_cstring(kWantText).size());

  NiceMock<SearchStringTestClient> client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("spanner.pdf"));
  ASSERT_TRUE(engine);

  PDFiumTextFragmentFinder finder(engine.get());
  const std::vector<PDFiumRange> matches = finder.FindTextFragments({"Google"});
  ASSERT_EQ(matches.size(), 1u);

  const PDFiumRange& match = matches.front();
  EXPECT_EQ(match.page_index(), 0);
  EXPECT_EQ(match.char_index(), 9);
  EXPECT_EQ(match.char_count(), want_length);
  EXPECT_EQ(match.GetText(), kWantText);
}
// A text start plus a text end yields one highlight that spans from the start
// through the end.
TEST_P(PDFiumTextFragmentFinderTest, TextStartAndEnd) {
  static constexpr char16_t kWantText[] =
      u"Spanner: Google\x2019s Globally-Distributed Database\r";
  const int want_length =
      static_cast<int>(base::span_from_cstring(kWantText).size());

  NiceMock<SearchStringTestClient> client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("spanner.pdf"));
  ASSERT_TRUE(engine);

  PDFiumTextFragmentFinder finder(engine.get());
  const std::vector<PDFiumRange> matches =
      finder.FindTextFragments({"spanner,database"});
  ASSERT_EQ(matches.size(), 1u);

  const PDFiumRange& match = matches.front();
  EXPECT_EQ(match.page_index(), 0);
  EXPECT_EQ(match.char_index(), 0);
  EXPECT_EQ(match.char_count(), want_length);
  EXPECT_EQ(match.GetText(), kWantText);
}
// A text start plus a suffix: only the start is highlighted, and the
// highlighted occurrence is one that is followed by the suffix.
TEST_P(PDFiumTextFragmentFinderTest, TextStartAndTextSuffix) {
  static constexpr char16_t kWantText[] = u"how";
  const int want_length =
      static_cast<int>(base::span_from_cstring(kWantText).size());

  NiceMock<SearchStringTestClient> client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("spanner.pdf"));
  ASSERT_TRUE(engine);

  PDFiumTextFragmentFinder finder(engine.get());
  const std::vector<PDFiumRange> matches =
      finder.FindTextFragments({"how,-many"});
  ASSERT_EQ(matches.size(), 1u);

  const PDFiumRange& match = matches.front();
  EXPECT_EQ(match.page_index(), 0);
  EXPECT_EQ(match.char_index(), 4141);
  EXPECT_EQ(match.char_count(), want_length);
  EXPECT_EQ(match.GetText(), kWantText);
}
// A text start, a text end and a suffix: the highlight spans start through
// end, where the end is an occurrence that is followed by the suffix.
TEST_P(PDFiumTextFragmentFinderTest, TextStartEndAndSuffix) {
  static constexpr char16_t kWantText[] =
      u"This\r\npaper describes how Spanner is structured, its feature "
      u"set,\r\nthe rationale underlying various design decisions, and "
      u"a\r\nnovel time API that exposes clock uncertainty. This API\r";
  const int want_length =
      static_cast<int>(base::span_from_cstring(kWantText).size());

  NiceMock<SearchStringTestClient> client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("spanner.pdf"));
  ASSERT_TRUE(engine);

  PDFiumTextFragmentFinder finder(engine.get());
  const std::vector<PDFiumRange> matches =
      finder.FindTextFragments({"this,api,-and"});
  ASSERT_EQ(matches.size(), 1u);

  const PDFiumRange& match = matches.front();
  EXPECT_EQ(match.page_index(), 0);
  EXPECT_EQ(match.char_index(), 704);
  EXPECT_EQ(match.char_count(), want_length);
  EXPECT_EQ(match.GetText(), kWantText);
}
// A prefix plus a text start: only the start is highlighted, and the
// highlighted occurrence is one that comes right after the prefix.
TEST_P(PDFiumTextFragmentFinderTest, TextPrefixAndTextStart) {
  static constexpr char16_t kWantText[] = u"Google";
  const int want_length =
      static_cast<int>(base::span_from_cstring(kWantText).size());

  NiceMock<SearchStringTestClient> client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("spanner.pdf"));
  ASSERT_TRUE(engine);

  PDFiumTextFragmentFinder finder(engine.get());
  const std::vector<PDFiumRange> matches =
      finder.FindTextFragments({"is-,Google"});
  ASSERT_EQ(matches.size(), 1u);

  const PDFiumRange& match = matches.front();
  EXPECT_EQ(match.page_index(), 0);
  EXPECT_EQ(match.char_index(), 489);
  EXPECT_EQ(match.char_count(), want_length);
  EXPECT_EQ(match.GetText(), kWantText);
}
// A prefix, a text start and a suffix: only the start is highlighted, and it
// has to sit between the prefix and the suffix.
TEST_P(PDFiumTextFragmentFinderTest, TextPrefixStartAndSuffix) {
  static constexpr char16_t kWantText[] = u"Google";
  const int want_length =
      static_cast<int>(base::span_from_cstring(kWantText).size());

  NiceMock<SearchStringTestClient> client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("spanner.pdf"));
  ASSERT_TRUE(engine);

  PDFiumTextFragmentFinder finder(engine.get());
  const std::vector<PDFiumRange> matches =
      finder.FindTextFragments({"of-,Google,-'s"});
  ASSERT_EQ(matches.size(), 1u);

  const PDFiumRange& match = matches.front();
  EXPECT_EQ(match.page_index(), 0);
  EXPECT_EQ(match.char_index(), 2072);
  EXPECT_EQ(match.char_count(), want_length);
  EXPECT_EQ(match.GetText(), kWantText);
}
// A prefix, a text start and a text end: the highlight spans start through
// end, beginning at the start occurrence that follows the prefix.
TEST_P(PDFiumTextFragmentFinderTest, TextPrefixStartAndEnd) {
  static constexpr char16_t kWantText[] =
      u"API that exposes clock uncertainty. This API\r\nand its "
      u"implementation";
  const int want_length =
      static_cast<int>(base::span_from_cstring(kWantText).size());

  NiceMock<SearchStringTestClient> client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("spanner.pdf"));
  ASSERT_TRUE(engine);

  PDFiumTextFragmentFinder finder(engine.get());
  const std::vector<PDFiumRange> matches =
      finder.FindTextFragments({"time-,api,implementation"});
  ASSERT_EQ(matches.size(), 1u);

  const PDFiumRange& match = matches.front();
  EXPECT_EQ(match.page_index(), 0);
  EXPECT_EQ(match.char_index(), 840);
  EXPECT_EQ(match.char_count(), want_length);
  EXPECT_EQ(match.GetText(), kWantText);
}
// All four parts present: the highlight spans start through end, with the
// prefix before the start and the suffix after the end.
TEST_P(PDFiumTextFragmentFinderTest, TextPrefixStartEndAndSuffix) {
  static constexpr char16_t kWantText[] =
      u"applications can read data at old";
  const int want_length =
      static_cast<int>(base::span_from_cstring(kWantText).size());

  NiceMock<SearchStringTestClient> client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("spanner.pdf"));
  ASSERT_TRUE(engine);

  PDFiumTextFragmentFinder finder(engine.get());
  const std::vector<PDFiumRange> matches =
      finder.FindTextFragments({"and-,applications,old,-timestamps"});
  ASSERT_EQ(matches.size(), 1u);

  const PDFiumRange& match = matches.front();
  EXPECT_EQ(match.page_index(), 0);
  EXPECT_EQ(match.char_index(), 3591);
  EXPECT_EQ(match.char_count(), want_length);
  EXPECT_EQ(match.GetText(), kWantText);
}
// Several fragments passed in one call yield one highlight each, in the order
// the fragments were given.
TEST_P(PDFiumTextFragmentFinderTest, MultipleTextFragments) {
  struct ExpectedHighlight {
    const char16_t* text;
    int char_index;
    int page_index;
  };
  static constexpr ExpectedHighlight kExpectedHighlights[] = {
      {u"Google", 9, 0},
      {u"Google", 489, 0},
      {u"Google", 2072, 0},
      {u"applications can read data at old", 3591, 0},
  };

  NiceMock<SearchStringTestClient> client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("spanner.pdf"));
  ASSERT_TRUE(engine);

  PDFiumTextFragmentFinder finder(engine.get());
  const std::vector<PDFiumRange> matches =
      finder.FindTextFragments({"Google", "is-,Google", "of-,Google,-'s",
                                "and-,applications,old,-timestamps"});
  ASSERT_EQ(matches.size(), std::size(kExpectedHighlights));

  for (size_t i = 0; i < std::size(kExpectedHighlights); ++i) {
    SCOPED_TRACE(testing::Message() << "highlight #" << i);
    const ExpectedHighlight& want = kExpectedHighlights[i];
    const std::u16string want_text(want.text);
    const PDFiumRange& match = matches[i];
    EXPECT_EQ(match.GetText(), want_text);
    EXPECT_EQ(match.char_count(), static_cast<int>(want_text.size()));
    EXPECT_EQ(match.char_index(), want.char_index);
    EXPECT_EQ(match.page_index(), want.page_index);
  }
}
// Fragments are resolved across the pages of a multi-page document; each
// expected highlight records the page it is found on.
TEST_P(PDFiumTextFragmentFinderTest, MultiPage) {
  struct ExpectedHighlight {
    const char16_t* text;
    int char_index;
    int page_index;
  };
  static constexpr ExpectedHighlight kExpectedHighlights[] = {
      {u"Link", 0, 0},      // "link"
      {u"Link", 27, 0},     // "1-,link,-with"
      {u"Page", 7, 1},      // "page,-in"
      {u"page\r", 59, 0},   // "second-,page"
      {u"Page in", 7, 1},   // "second-,page,in,-document"
  };

  NiceMock<SearchStringTestClient> client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("link_annots.pdf"));
  ASSERT_TRUE(engine);

  PDFiumTextFragmentFinder finder(engine.get());
  const std::vector<PDFiumRange> matches =
      finder.FindTextFragments({"link", "1-,link,-with", "page,-in",
                                "second-,page", "second-,page,in,-document"});
  ASSERT_EQ(matches.size(), std::size(kExpectedHighlights));

  for (size_t i = 0; i < std::size(kExpectedHighlights); ++i) {
    SCOPED_TRACE(testing::Message() << "highlight #" << i);
    const ExpectedHighlight& want = kExpectedHighlights[i];
    const std::u16string want_text(want.text);
    const PDFiumRange& match = matches[i];
    EXPECT_EQ(match.GetText(), want_text);
    EXPECT_EQ(match.char_count(), static_cast<int>(want_text.size()));
    EXPECT_EQ(match.char_index(), want.char_index);
    EXPECT_EQ(match.page_index(), want.page_index);
  }
}
// A fragment yields no highlight when any one of its parts cannot be found.
// The same finder is reused for every fragment, in the order listed.
TEST_P(PDFiumTextFragmentFinderTest, FragmentNotInPDF) {
  const std::string kAbsentFragments[] = {
      // Start is not present in PDF.
      "apples",
      // Prefix + start present, but suffix is not.
      "of-,Google,-random",
      // Prefix + start present, but text end is not.
      "of-,Google,random",
      // Prefix + start + end present, but suffix is not.
      "and-,applications,old,-random",
      // Start is present, but prefix is not.
      "apples-,Google",
      // Start is present, but suffix is not.
      "Google,-random",
      // Start is present but end is not.
      "applications,random",
      // Start and end are present, but suffix is not.
      "applications,old,-random",
  };

  NiceMock<SearchStringTestClient> client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("spanner.pdf"));
  ASSERT_TRUE(engine);

  PDFiumTextFragmentFinder finder(engine.get());
  for (const std::string& fragment : kAbsentFragments) {
    const std::vector<PDFiumRange> matches =
        finder.FindTextFragments({fragment});
    EXPECT_TRUE(matches.empty()) << fragment;
  }
}
// An empty list of fragments yields an empty list of highlights.
TEST_P(PDFiumTextFragmentFinderTest, EmptyList) {
  NiceMock<SearchStringTestClient> client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("spanner.pdf"));
  ASSERT_TRUE(engine);

  PDFiumTextFragmentFinder finder(engine.get());
  const base::span<const std::string> no_fragments;
  const std::vector<PDFiumRange> matches =
      finder.FindTextFragments(no_fragments);
  EXPECT_TRUE(matches.empty());
}
INSTANTIATE_TEST_SUITE_P(All, PDFiumTextFragmentFinderTest, testing::Bool());
} // namespace chrome_pdf

@ -73,7 +73,7 @@ stream
BT
70 700 Td
/F1 18 Tf
(Second Page) Tj
(Second Page in Document) Tj
ET
endstream
endobj

@ -68,13 +68,13 @@ ET
endstream
endobj
8 0 obj <<
/Length 43
/Length 55
>>
stream
BT
70 700 Td
/F1 18 Tf
(Second Page) Tj
(Second Page in Document) Tj
ET
endstream
endobj
@ -146,14 +146,14 @@ xref
0000000516 00000 n
0000000592 00000 n
0000001035 00000 n
0000001129 00000 n
0000001271 00000 n
0000001456 00000 n
0000001719 00000 n
0000001141 00000 n
0000001283 00000 n
0000001468 00000 n
0000001731 00000 n
trailer <<
/Root 1 0 R
/Size 13
>>
startxref
1903
1915
%%EOF