0

Find strings in PDFs that have been broken by a soft hyphen

Currently if a search term in the PDF text has been broken over two
lines by a soft hyphen, find will not correctly identify it as a
match. This is rooted in the fact that the result of FPDF_GetText
includes a marker for soft-hyphens, 0xFFFE, which causes the match to
fail.

This CL adds in filtering this character from the text being
searched over, so that these matches can pass. This requires changes
in the SearchUsingICU method to strip ignorable characters from the
string before searching, and correctly converting the results back
into the non-stripped index space. Ranges also have had filtering for
0xFFFE added in, so that the highlights created by searching are
properly placed.

BUG=chromium:788799

Change-Id: I06c8181358cdebe6454c36437065592820637808
Reviewed-on: https://chromium-review.googlesource.com/1234998
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Henrique Nakashima <hnakashima@chromium.org>
Cr-Commit-Position: refs/heads/master@{#593993}
This commit is contained in:
Ryan Harrison
2018-09-25 17:47:57 +00:00
committed by Commit Bot
parent b312df8d9b
commit 189eca26a7
6 changed files with 91 additions and 16 deletions

@ -53,10 +53,6 @@ class VarDictionary;
namespace chrome_pdf {
// Zero Width Whitespace character that requires special handling in multiple
// locations.
constexpr base::char16 kZeroWidthSpace = 0x200B;
// Do one time initialization of the SDK.
bool InitializeSDK();
// Tells the SDK that we're shutting down.

@ -20,13 +20,14 @@ namespace {
class TestDocumentLoader : public DocumentLoader {
public:
explicit TestDocumentLoader(Client* client) : client_(client) {
TestDocumentLoader(Client* client, const base::FilePath::StringType& pdf_name)
: client_(client) {
base::FilePath pdf_path;
CHECK(base::PathService::Get(base::DIR_SOURCE_ROOT, &pdf_path));
pdf_path = pdf_path.Append(FILE_PATH_LITERAL("pdf"))
.Append(FILE_PATH_LITERAL("test"))
.Append(FILE_PATH_LITERAL("data"))
.Append(FILE_PATH_LITERAL("hello_world2.pdf"));
.Append(pdf_name);
CHECK(base::ReadFileToString(pdf_path, &pdf_data_));
}
~TestDocumentLoader() override = default;
@ -66,9 +67,11 @@ class TestDocumentLoader : public DocumentLoader {
std::string pdf_data_;
};
const base::FilePath::CharType* g_test_pdf_name;
std::unique_ptr<DocumentLoader> CreateTestDocumentLoader(
DocumentLoader::Client* client) {
return std::make_unique<TestDocumentLoader>(client);
return std::make_unique<TestDocumentLoader>(client, g_test_pdf_name);
}
class TestClient : public PDFEngine::Client {
@ -137,11 +140,11 @@ class FindTextTest : public testing::Test {
protected:
void SetUp() override {
InitializePDFium();
PDFiumEngine::SetCreateDocumentLoaderFunctionForTesting(
&CreateTestDocumentLoader);
}
void TearDown() override {
PDFiumEngine::SetCreateDocumentLoaderFunctionForTesting(nullptr);
g_test_pdf_name = nullptr;
FPDF_DestroyLibrary();
}
@ -154,10 +157,18 @@ class FindTextTest : public testing::Test {
FPDF_InitLibraryWithConfig(&config);
}
void SetDocumentForTest(const base::FilePath::CharType* test_pdf_name) {
g_test_pdf_name = test_pdf_name;
PDFiumEngine::SetCreateDocumentLoaderFunctionForTesting(
&CreateTestDocumentLoader);
}
private:
DISALLOW_COPY_AND_ASSIGN(FindTextTest);
};
TEST_F(FindTextTest, FindText) {
SetDocumentForTest(FILE_PATH_LITERAL("hello_world2.pdf"));
pp::URLLoader dummy_loader;
TestClient client;
PDFiumEngine engine(&client, true);
@ -177,4 +188,24 @@ TEST_F(FindTextTest, FindText) {
engine.StartFind("o", /*case_sensitive=*/true);
}
TEST_F(FindTextTest, FindHyphenatedText) {
SetDocumentForTest(FILE_PATH_LITERAL("spanner.pdf"));
pp::URLLoader dummy_loader;
TestClient client;
PDFiumEngine engine(&client, true);
ASSERT_TRUE(engine.New("https://chromium.org/dummy.pdf", ""));
ASSERT_TRUE(engine.HandleDocumentLoad(dummy_loader));
{
InSequence sequence;
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(1, false));
EXPECT_CALL(client, NotifySelectedFindResultChanged(0));
for (int i = 1; i < 6; ++i)
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(i + 1, false));
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(6, true));
}
engine.StartFind("application", /*case_sensitive=*/true);
}
} // namespace chrome_pdf

@ -1918,28 +1918,65 @@ void PDFiumEngine::SearchUsingICU(const base::string16& term,
character_to_start_searching_from, text_length, data);
api_string_adapter.Close(written);
base::string16 stripped_page_text;
stripped_page_text.reserve(page_text.size());
// Values in |stripped_indices| are in the stripped text index space and
// indicate a character was removed from the page text before the given
// index. If multiple characters are removed in a row then there will be
// multiple entries with the same value.
std::vector<size_t> stripped_indices;
for (size_t i = 0; i < page_text.size(); i++) {
base::char16 c = page_text[i];
if (IsIgnorableCharacter(c))
stripped_indices.push_back(stripped_page_text.size());
else
stripped_page_text.push_back(c);
}
std::vector<PDFEngine::Client::SearchStringResult> results =
client_->SearchString(page_text.c_str(), term.c_str(), case_sensitive);
client_->SearchString(stripped_page_text.c_str(), term.c_str(),
case_sensitive);
for (const auto& result : results) {
// Need to convert from stripped page text start to page text start, by
// incrementing for all the characters stripped before it in the string.
auto stripped_indices_begin = std::upper_bound(
stripped_indices.begin(), stripped_indices.end(), result.start_index);
size_t page_text_result_start_index =
result.start_index +
std::distance(stripped_indices.begin(), stripped_indices_begin);
// Need to convert the stripped page length into a page text length, since
// the matching range may have stripped characters within it. This
// conversion only cares about skipped characters in the result interval.
auto stripped_indices_end =
std::upper_bound(stripped_indices_begin, stripped_indices.end(),
result.start_index + result.length);
int term_stripped_count =
std::distance(stripped_indices_begin, stripped_indices_end);
int page_text_result_length = result.length + term_stripped_count;
// Need to map the indexes from the page text, which may have generated
// characters like space etc, to character indices from the page.
int text_to_start_searching_from = FPDFText_GetTextIndexFromCharIndex(
pages_[current_page]->GetTextPage(), character_to_start_searching_from);
int temp_start = result.start_index + text_to_start_searching_from;
int temp_start =
page_text_result_start_index + text_to_start_searching_from;
int start = FPDFText_GetCharIndexFromTextIndex(
pages_[current_page]->GetTextPage(), temp_start);
int end = FPDFText_GetCharIndexFromTextIndex(
pages_[current_page]->GetTextPage(), temp_start + result.length);
pages_[current_page]->GetTextPage(),
temp_start + page_text_result_length);
// If |term| occurs at the end of a page, then |end| will be -1 due to the
// index being out of bounds. Compensate for this case so the range
// character count calculation below works out.
if (temp_start + result.length == original_text_length) {
if (temp_start + page_text_result_length == original_text_length) {
DCHECK_EQ(-1, end);
end = original_text_length;
}
DCHECK_LT(start, end);
DCHECK_EQ(term.size(), static_cast<size_t>(end - start));
DCHECK_EQ(term.size() + term_stripped_count,
static_cast<size_t>(end - start));
AddFindResult(PDFiumRange(pages_[current_page].get(), start, end - start));
}
}

@ -23,6 +23,10 @@ void AdjustForBackwardsRange(int* index, int* count) {
} // namespace
bool IsIgnorableCharacter(base::char16 c) {
return (c == kZeroWidthSpace) || (c == kPDFSoftHyphenMarker);
}
PDFiumRange::PDFiumRange(PDFiumPage* page, int char_index, int char_count)
: page_(page), char_index_(char_index), char_count_(char_count) {
#if DCHECK_IS_ON()
@ -105,8 +109,7 @@ base::string16 PDFiumRange::GetText() const {
api_string_adapter.Close(written);
}
// Strip ignorable non-displaying whitespace
rv.erase(std::remove(rv.begin(), rv.end(), kZeroWidthSpace), rv.end());
base::EraseIf(rv, [](base::char16 c) { return IsIgnorableCharacter(c); });
return rv;
}

@ -14,6 +14,14 @@
namespace chrome_pdf {
constexpr base::char16 kZeroWidthSpace = 0x200B;
constexpr base::char16 kPDFSoftHyphenMarker = 0xFFFE;
// Helper for identifying characters that PDFium outputs, via FPDFText_GetText,
// that have special meaning, but should not be included in things like copied
// text or when running find.
bool IsIgnorableCharacter(base::char16 c);
// Describes location of a string of characters.
class PDFiumRange {
public:

BIN
pdf/test/data/spanner.pdf Normal file

Binary file not shown.