Improve matching of similar characters in PDF search
This CL changes the ICU collator being used for text searching in the PDF viewer from the default to the 'search' one. This is the same collator used in Blink, and it folds together similar-looking characters in a locale-specific way, so that things like accented and unaccented vowels will match or not match depending on the expectations of the user's locale. Additionally, when searching, single and double quotation characters are simplified to their ASCII forms to make matching between different types of quotes work better. Quotation marks are a known limitation of the search collator, and this workaround is based on a similar solution that exists in Blink to improve matching. BUG=chromium:142627 Change-Id: I399ea1f0bacb58434b468866127a91ccb6213e99 Reviewed-on: https://chromium-review.googlesource.com/c/1281178 Commit-Queue: Ryan Harrison <rharrison@chromium.org> Reviewed-by: Lei Zhang <thestig@chromium.org> Reviewed-by: Jochen Eisinger <jochen@chromium.org> Cr-Commit-Position: refs/heads/master@{#601157}
This commit is contained in:

committed by
Commit Bot

parent
17df4207bb
commit
c6082cfb51
pdf
ppapi/proxy
@ -3,6 +3,7 @@
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "base/optional.h"
|
||||
#include "base/strings/utf_string_conversions.h"
|
||||
#include "pdf/pdfium/pdfium_engine.h"
|
||||
#include "pdf/pdfium/pdfium_test_base.h"
|
||||
#include "pdf/test/test_client.h"
|
||||
@ -111,4 +112,42 @@ TEST_F(FindTextTest, FindLineBreakText) {
|
||||
engine->StartFind("is the first system", /*case_sensitive=*/true);
|
||||
}
|
||||
|
||||
// Starts a case-sensitive find for "don't" (typed with the plain ASCII
// apostrophe) in bug_142627.pdf and checks the notifications the client
// receives. The expectations are wrapped in an InSequence scope, so they must
// arrive in exactly this order: result count 1, selected result 0, result
// count 2, result count 2 again with the second argument set to true.
// NOTE(review): the boolean argument presumably marks the final result of the
// search -- confirm against the client interface.
TEST_F(FindTextTest, FindSimpleQuotationMarkText) {
|
||||
FindTextTestClient client;
|
||||
std::unique_ptr<PDFiumEngine> engine =
|
||||
InitializeEngine(&client, FILE_PATH_LITERAL("bug_142627.pdf"));
|
||||
ASSERT_TRUE(engine);
|
||||
|
||||
{
|
||||
InSequence sequence;
|
||||
|
||||
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(1, false));
|
||||
EXPECT_CALL(client, NotifySelectedFindResultChanged(0));
|
||||
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(2, false));
|
||||
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(2, true));
|
||||
}
|
||||
|
||||
engine->StartFind("don't", /*case_sensitive=*/true);
|
||||
}
|
||||
|
||||
// Same document and same expected notification sequence as the ASCII
// apostrophe search, but the search term spells "don't" with U+2019 in place
// of the ASCII apostrophe. The term is built as UTF-16 and converted to UTF-8
// before being handed to StartFind().
TEST_F(FindTextTest, FindFancyQuotationMarkText) {
|
||||
FindTextTestClient client;
|
||||
std::unique_ptr<PDFiumEngine> engine =
|
||||
InitializeEngine(&client, FILE_PATH_LITERAL("bug_142627.pdf"));
|
||||
ASSERT_TRUE(engine);
|
||||
|
||||
{
|
||||
InSequence sequence;
|
||||
|
||||
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(1, false));
|
||||
EXPECT_CALL(client, NotifySelectedFindResultChanged(0));
|
||||
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(2, false));
|
||||
EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(2, true));
|
||||
}
|
||||
|
||||
// "don't" written with U+2019 (right single quotation mark) in place of the
// ASCII apostrophe.
|
||||
base::string16 term = {'d', 'o', 'n', 0x2019, 't'};
|
||||
engine->StartFind(base::UTF16ToUTF8(term), /*case_sensitive=*/true);
|
||||
}
|
||||
|
||||
} // namespace chrome_pdf
|
||||
|
@ -625,6 +625,31 @@ std::string ConvertViewIntToViewString(unsigned long view_int) {
|
||||
}
|
||||
}
|
||||
|
||||
// Characters that SimplifyForSearch() replaces with the ASCII double quote (").
constexpr wchar_t kHebrewPunctuationGershayimCharacter = 0x05F4;
constexpr wchar_t kLeftDoubleQuotationMarkCharacter = 0x201C;
constexpr wchar_t kRightDoubleQuotationMarkCharacter = 0x201D;

// Characters that SimplifyForSearch() replaces with the ASCII single quote (').
constexpr wchar_t kHebrewPunctuationGereshCharacter = 0x05F3;
constexpr wchar_t kLeftSingleQuotationMarkCharacter = 0x2018;
constexpr wchar_t kRightSingleQuotationMarkCharacter = 0x2019;

// Returns the character that should stand in for |c| when comparing text
// during a search. The double-quote variants listed above map to '"', the
// single-quote variants map to '\'', and every other character is returned
// unchanged. Applying the same mapping to both the search term and the page
// text lets different styles of quotation mark match each other.
wchar_t SimplifyForSearch(wchar_t c) {
  switch (c) {
    case kHebrewPunctuationGershayimCharacter:
    case kLeftDoubleQuotationMarkCharacter:
    case kRightDoubleQuotationMarkCharacter:
      return L'"';
    case kHebrewPunctuationGereshCharacter:
    case kLeftSingleQuotationMarkCharacter:
    case kRightSingleQuotationMarkCharacter:
      return L'\'';
    default:
      return c;
  }
}
|
||||
|
||||
} // namespace
|
||||
|
||||
bool InitializeSDK() {
|
||||
@ -1918,6 +1943,12 @@ void PDFiumEngine::SearchUsingICU(const base::string16& term,
|
||||
int current_page) {
|
||||
DCHECK(!term.empty());
|
||||
|
||||
// Various types of quotation marks need to be converted to the simple ASCII
|
||||
// version for searching to get better matching.
|
||||
base::string16 adjusted_term = term;
|
||||
for (base::char16& c : adjusted_term)
|
||||
c = SimplifyForSearch(c);
|
||||
|
||||
const int original_text_length = pages_[current_page]->GetCharCount();
|
||||
int text_length = original_text_length;
|
||||
if (character_to_start_searching_from) {
|
||||
@ -1974,11 +2005,11 @@ void PDFiumEngine::SearchUsingICU(const base::string16& term,
|
||||
if (IsIgnorableCharacter(c))
|
||||
removed_indices.push_back(adjusted_page_text.size());
|
||||
else
|
||||
adjusted_page_text.push_back(c);
|
||||
adjusted_page_text.push_back(SimplifyForSearch(c));
|
||||
}
|
||||
|
||||
std::vector<PDFEngine::Client::SearchStringResult> results =
|
||||
client_->SearchString(adjusted_page_text.c_str(), term.c_str(),
|
||||
client_->SearchString(adjusted_page_text.c_str(), adjusted_term.c_str(),
|
||||
case_sensitive);
|
||||
for (const auto& result : results) {
|
||||
// Need to convert from adjusted page text start to page text start, by
|
||||
|
BIN
pdf/test/data/bug_142627.pdf
Normal file
BIN
pdf/test/data/bug_142627.pdf
Normal file
Binary file not shown.
@ -58,15 +58,16 @@ void PDFResource::SearchString(const unsigned short* input_string,
|
||||
PP_PrivateFindResult** results,
|
||||
uint32_t* count) {
|
||||
if (locale_.empty())
|
||||
locale_ = GetLocale();
|
||||
locale_ = GetLocale() + "@collation=search";
|
||||
|
||||
const base::char16* string =
|
||||
reinterpret_cast<const base::char16*>(input_string);
|
||||
const base::char16* term =
|
||||
reinterpret_cast<const base::char16*>(input_term);
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UStringSearch* searcher = usearch_open(term, -1, string, -1, locale_.c_str(),
|
||||
0, &status);
|
||||
UStringSearch* searcher =
|
||||
usearch_open(term, -1, string, -1, locale_.c_str(), nullptr, &status);
|
||||
DCHECK(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING ||
|
||||
status == U_USING_DEFAULT_WARNING)
|
||||
<< status;
|
||||
|
@ -32,7 +32,7 @@ TEST_F(PDFResourceTest, SearchString) {
|
||||
// Instantiate a resource explicitly so we can specify the locale.
|
||||
auto pdf_resource = base::MakeRefCounted<PDFResource>(
|
||||
Connection(&sink(), &sink(), 0), pp_instance());
|
||||
pdf_resource->SetLocaleForTest("en-US");
|
||||
pdf_resource->SetLocaleForTest("en-US@collation=search");
|
||||
|
||||
base::string16 input;
|
||||
base::string16 term;
|
||||
|
Reference in New Issue
Block a user