diff --git a/pdf/pdfium/findtext_unittest.cc b/pdf/pdfium/findtext_unittest.cc
index 895fc39d66a6c..c77e7f2b7ae11 100644
--- a/pdf/pdfium/findtext_unittest.cc
+++ b/pdf/pdfium/findtext_unittest.cc
@@ -3,6 +3,7 @@
 // found in the LICENSE file.
 
 #include "base/optional.h"
+#include "base/strings/utf_string_conversions.h"
 #include "pdf/pdfium/pdfium_engine.h"
 #include "pdf/pdfium/pdfium_test_base.h"
 #include "pdf/test/test_client.h"
@@ -111,4 +112,44 @@ TEST_F(FindTextTest, FindLineBreakText) {
   engine->StartFind("is the first system", /*case_sensitive=*/true);
 }
 
+// An ASCII apostrophe in the search term should yield two find results.
+TEST_F(FindTextTest, FindSimpleQuotationMarkText) {
+  FindTextTestClient client;
+  std::unique_ptr<PDFiumEngine> engine =
+      InitializeEngine(&client, FILE_PATH_LITERAL("bug_142627.pdf"));
+  ASSERT_TRUE(engine);
+
+  {
+    InSequence sequence;
+
+    EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(1, false));
+    EXPECT_CALL(client, NotifySelectedFindResultChanged(0));
+    EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(2, false));
+    EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(2, true));
+  }
+
+  engine->StartFind("don't", /*case_sensitive=*/true);
+}
+
+// A U+2019 apostrophe in the search term should also yield two find results.
+TEST_F(FindTextTest, FindFancyQuotationMarkText) {
+  FindTextTestClient client;
+  std::unique_ptr<PDFiumEngine> engine =
+      InitializeEngine(&client, FILE_PATH_LITERAL("bug_142627.pdf"));
+  ASSERT_TRUE(engine);
+
+  {
+    InSequence sequence;
+
+    EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(1, false));
+    EXPECT_CALL(client, NotifySelectedFindResultChanged(0));
+    EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(2, false));
+    EXPECT_CALL(client, NotifyNumberOfFindResultsChanged(2, true));
+  }
+
+  // "don't" with U+2019 (right single quotation mark), not an ASCII apostrophe.
+  base::string16 term = {'d', 'o', 'n', 0x2019, 't'};
+  engine->StartFind(base::UTF16ToUTF8(term), /*case_sensitive=*/true);
+}
+
 }  // namespace chrome_pdf
diff --git a/pdf/pdfium/pdfium_engine.cc b/pdf/pdfium/pdfium_engine.cc
index 22ee59fee90c5..fa13753e50943 100644
--- a/pdf/pdfium/pdfium_engine.cc
+++ b/pdf/pdfium/pdfium_engine.cc
@@ -625,6 +625,33 @@ std::string ConvertViewIntToViewString(unsigned long view_int) {
   }
 }
 
+// Simplify to \" for searching
+constexpr wchar_t kHebrewPunctuationGershayimCharacter = 0x05F4;
+constexpr wchar_t kLeftDoubleQuotationMarkCharacter = 0x201C;
+constexpr wchar_t kRightDoubleQuotationMarkCharacter = 0x201D;
+
+// Simplify to \' for searching
+constexpr wchar_t kHebrewPunctuationGereshCharacter = 0x05F3;
+constexpr wchar_t kLeftSingleQuotationMarkCharacter = 0x2018;
+constexpr wchar_t kRightSingleQuotationMarkCharacter = 0x2019;
+
+// Maps the double/single quotation mark variants listed above to ASCII \" or
+// \' respectively; every other character is returned unchanged.
+wchar_t SimplifyForSearch(wchar_t c) {
+  switch (c) {
+    case kHebrewPunctuationGershayimCharacter:
+    case kLeftDoubleQuotationMarkCharacter:
+    case kRightDoubleQuotationMarkCharacter:
+      return L'\"';
+    case kHebrewPunctuationGereshCharacter:
+    case kLeftSingleQuotationMarkCharacter:
+    case kRightSingleQuotationMarkCharacter:
+      return L'\'';
+    default:
+      return c;
+  }
+}
+
 }  // namespace
 
 bool InitializeSDK() {
@@ -1918,6 +1945,12 @@ void PDFiumEngine::SearchUsingICU(const base::string16& term,
                                   int current_page) {
   DCHECK(!term.empty());
 
+  // Various types of quotation marks need to be converted to the simple ASCII
+  // version for searching to get better matching.
+  base::string16 adjusted_term = term;
+  for (base::char16& c : adjusted_term)
+    c = SimplifyForSearch(c);
+
   const int original_text_length = pages_[current_page]->GetCharCount();
   int text_length = original_text_length;
   if (character_to_start_searching_from) {
@@ -1974,11 +2007,11 @@ void PDFiumEngine::SearchUsingICU(const base::string16& term,
     if (IsIgnorableCharacter(c))
       removed_indices.push_back(adjusted_page_text.size());
     else
-      adjusted_page_text.push_back(c);
+      adjusted_page_text.push_back(SimplifyForSearch(c));
   }
 
   std::vector<PDFEngine::Client::SearchStringResult> results =
-      client_->SearchString(adjusted_page_text.c_str(), term.c_str(),
+      client_->SearchString(adjusted_page_text.c_str(), adjusted_term.c_str(),
                             case_sensitive);
   for (const auto& result : results) {
     // Need to convert from adjusted page text start to page text start, by
diff --git a/pdf/test/data/bug_142627.pdf b/pdf/test/data/bug_142627.pdf
new file mode 100644
index 0000000000000..f44e363f9a5c6
Binary files /dev/null and b/pdf/test/data/bug_142627.pdf differ
diff --git a/ppapi/proxy/pdf_resource.cc b/ppapi/proxy/pdf_resource.cc
index 75e0f3fe75e9a..90aaee28d1cb1 100644
--- a/ppapi/proxy/pdf_resource.cc
+++ b/ppapi/proxy/pdf_resource.cc
@@ -58,15 +58,16 @@ void PDFResource::SearchString(const unsigned short* input_string,
                                PP_PrivateFindResult** results,
                                uint32_t* count) {
   if (locale_.empty())
-    locale_ = GetLocale();
+    locale_ = GetLocale() + "@collation=search";
+
   const base::char16* string =
       reinterpret_cast<const base::char16*>(input_string);
   const base::char16* term =
       reinterpret_cast<const base::char16*>(input_term);
 
   UErrorCode status = U_ZERO_ERROR;
-  UStringSearch* searcher = usearch_open(term, -1, string, -1, locale_.c_str(),
-                                         0, &status);
+  UStringSearch* searcher =
+      usearch_open(term, -1, string, -1, locale_.c_str(), nullptr, &status);
   DCHECK(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING ||
          status == U_USING_DEFAULT_WARNING)
       << status;
diff --git a/ppapi/proxy/pdf_resource_unittest.cc b/ppapi/proxy/pdf_resource_unittest.cc
index 2787d68f6c2af..507a2ca27972e 100644
--- a/ppapi/proxy/pdf_resource_unittest.cc
+++ b/ppapi/proxy/pdf_resource_unittest.cc
@@ -32,7 +32,7 @@ TEST_F(PDFResourceTest, SearchString) {
   // Instantiate a resource explicitly so we can specify the locale.
   auto pdf_resource = base::MakeRefCounted<PDFResource>(
       Connection(&sink(), &sink(), 0), pp_instance());
-  pdf_resource->SetLocaleForTest("en-US");
+  pdf_resource->SetLocaleForTest("en-US@collation=search");
 
   base::string16 input;
   base::string16 term;