0

Improve matching of similar characters in PDF search

This CL changes the ICU collator being used for text searching in the
PDF viewer from the default to the 'search' one. This is the same
collator used in Blink, and it folds together similar looking
characters in a locale-specific way, so that things like accented and
unaccented vowels will match or not match depending on the
expectations of the user locale.

Additionally, when searching, single and double quotation characters
are simplified to their ASCII forms so that matching between
different types of quotes works better. Quotation marks are a
known limitation of the search collator, and this workaround is based
on a similar solution that exists in Blink to improve matching.

BUG=chromium:142627

Change-Id: I399ea1f0bacb58434b468866127a91ccb6213e99
Reviewed-on: https://chromium-review.googlesource.com/c/1281178
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Jochen Eisinger <jochen@chromium.org>
Cr-Commit-Position: refs/heads/master@{#601157}
This commit is contained in:
Ryan Harrison
2018-10-19 15:18:55 +00:00
committed by Commit Bot
parent 17df4207bb
commit c6082cfb51
5 changed files with 77 additions and 6 deletions

@ -3,6 +3,7 @@
// found in the LICENSE file.
#include "base/optional.h"
#include "base/strings/utf_string_conversions.h"
#include "pdf/pdfium/pdfium_engine.h"
#include "pdf/pdfium/pdfium_test_base.h"
#include "pdf/test/test_client.h"
@ -111,4 +112,42 @@ TEST_F(FindTextTest, FindLineBreakText) {
engine->StartFind("is the first system", /*case_sensitive=*/true);
}
// Runs a case-sensitive find for "don't", written with a plain ASCII
// apostrophe, against bug_142627.pdf and checks the notifications the client
// receives.
TEST_F(FindTextTest, FindSimpleQuotationMarkText) {
FindTextTestClient client;
std::unique_ptr<PDFiumEngine> engine =
InitializeEngine(&client, FILE_PATH_LITERAL("bug_142627.pdf"));
ASSERT_TRUE(engine);
{
// The expectations below must be satisfied in exactly this order: a result
// count of 1, selection of result 0, then a result count of 2 reported twice.
// NOTE(review): the bool argument presumably flags the final count -- confirm
// against the client interface.
InSequence sequence;
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(1, false));
EXPECT_CALL(client, NotifySelectedFindResultChanged(0));
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(2, false));
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(2, true));
}
engine->StartFind("don't", /*case_sensitive=*/true);
}
// Runs a case-sensitive find for "don't", written with U+2019 instead of the
// ASCII apostrophe, against bug_142627.pdf and checks the notifications the
// client receives.
TEST_F(FindTextTest, FindFancyQuotationMarkText) {
FindTextTestClient client;
std::unique_ptr<PDFiumEngine> engine =
InitializeEngine(&client, FILE_PATH_LITERAL("bug_142627.pdf"));
ASSERT_TRUE(engine);
{
// The expectations below must be satisfied in exactly this order: a result
// count of 1, selection of result 0, then a result count of 2 reported twice.
// NOTE(review): the bool argument presumably flags the final count -- confirm
// against the client interface.
InSequence sequence;
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(1, false));
EXPECT_CALL(client, NotifySelectedFindResultChanged(0));
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(2, false));
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(2, true));
}
// "don't" spelled with U+2019 (RIGHT SINGLE QUOTATION MARK) in place of the
// ASCII apostrophe.
base::string16 term = {'d', 'o', 'n', 0x2019, 't'};
// StartFind() is given a UTF-8 string here, so the UTF-16 term is converted.
engine->StartFind(base::UTF16ToUTF8(term), /*case_sensitive=*/true);
}
} // namespace chrome_pdf

@ -625,6 +625,31 @@ std::string ConvertViewIntToViewString(unsigned long view_int) {
}
}
// Characters that are simplified to the ASCII double quote (") for searching.
constexpr wchar_t kHebrewPunctuationGershayimCharacter = 0x05F4;
constexpr wchar_t kLeftDoubleQuotationMarkCharacter = 0x201C;
constexpr wchar_t kRightDoubleQuotationMarkCharacter = 0x201D;

// Characters that are simplified to the ASCII single quote (') for searching.
constexpr wchar_t kHebrewPunctuationGereshCharacter = 0x05F3;
constexpr wchar_t kLeftSingleQuotationMarkCharacter = 0x2018;
constexpr wchar_t kRightSingleQuotationMarkCharacter = 0x2019;

// Returns the character to use in place of |c| when searching.
//
// The double-quote-like characters listed above map to the ASCII double quote,
// the single-quote-like characters listed above map to the ASCII single quote,
// and every other character is returned unchanged.
wchar_t SimplifyForSearch(wchar_t c) {
  switch (c) {
    case kHebrewPunctuationGershayimCharacter:
    case kLeftDoubleQuotationMarkCharacter:
    case kRightDoubleQuotationMarkCharacter:
      return L'\"';
    case kHebrewPunctuationGereshCharacter:
    case kLeftSingleQuotationMarkCharacter:
    case kRightSingleQuotationMarkCharacter:
      return L'\'';
    default:
      return c;
  }
}
} // namespace
bool InitializeSDK() {
@ -1918,6 +1943,12 @@ void PDFiumEngine::SearchUsingICU(const base::string16& term,
int current_page) {
DCHECK(!term.empty());
// Various types of quotation marks need to be converted to the simple ASCII
// version for searching to get better matching.
base::string16 adjusted_term = term;
for (base::char16& c : adjusted_term)
c = SimplifyForSearch(c);
const int original_text_length = pages_[current_page]->GetCharCount();
int text_length = original_text_length;
if (character_to_start_searching_from) {
@ -1974,11 +2005,11 @@ void PDFiumEngine::SearchUsingICU(const base::string16& term,
if (IsIgnorableCharacter(c))
removed_indices.push_back(adjusted_page_text.size());
else
adjusted_page_text.push_back(c);
adjusted_page_text.push_back(SimplifyForSearch(c));
}
std::vector<PDFEngine::Client::SearchStringResult> results =
client_->SearchString(adjusted_page_text.c_str(), term.c_str(),
client_->SearchString(adjusted_page_text.c_str(), adjusted_term.c_str(),
case_sensitive);
for (const auto& result : results) {
// Need to convert from adjusted page text start to page text start, by

Binary file not shown.

@ -58,15 +58,16 @@ void PDFResource::SearchString(const unsigned short* input_string,
PP_PrivateFindResult** results,
uint32_t* count) {
if (locale_.empty())
locale_ = GetLocale();
locale_ = GetLocale() + "@collation=search";
const base::char16* string =
reinterpret_cast<const base::char16*>(input_string);
const base::char16* term =
reinterpret_cast<const base::char16*>(input_term);
UErrorCode status = U_ZERO_ERROR;
UStringSearch* searcher = usearch_open(term, -1, string, -1, locale_.c_str(),
0, &status);
UStringSearch* searcher =
usearch_open(term, -1, string, -1, locale_.c_str(), nullptr, &status);
DCHECK(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING ||
status == U_USING_DEFAULT_WARNING)
<< status;

@ -32,7 +32,7 @@ TEST_F(PDFResourceTest, SearchString) {
// Instantiate a resource explicitly so we can specify the locale.
auto pdf_resource = base::MakeRefCounted<PDFResource>(
Connection(&sink(), &sink(), 0), pp_instance());
pdf_resource->SetLocaleForTest("en-US");
pdf_resource->SetLocaleForTest("en-US@collation=search");
base::string16 input;
base::string16 term;