0

[PDF Searchify] Avoid Searchifying the same page twice

Extend PDFiumOnDemandSearchifierTest.PageWithImagesNoRecognizableText to
perform a page unload followed by a reload. This triggers a CHECK()
failure that can be seen in crash reports in the wild. Fix this crash by
modifying the PDFiumEngine::ScheduleSearchifyIfNeeded() logic to not
schedule Searchify for pages that have been Searchified, regardless of
whether Searchify added text to the page or not.

Bug: 395485272
Change-Id: I58772f8f2c71d271c61898b6b3364c4df089027d
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6250975
Reviewed-by: Ramin Halavati <rhalavati@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1419360}
This commit is contained in:
Lei Zhang
2025-02-12 09:16:19 -08:00
committed by Chromium LUCI CQ
parent 0ad42055ae
commit cf482574c9
3 changed files with 34 additions and 8 deletions

@ -4389,14 +4389,20 @@ bool PDFiumEngine::PageNeedsSearchify(int page_index) const {
}
void PDFiumEngine::ScheduleSearchifyIfNeeded(PDFiumPage* page) {
if (!page->available()) {
CHECK(page);
CHECK(page->available());
if (page->IsPageSearchified()) {
return;
}
// TODO(crbug.com/40066441): Explore heuristics to run OCR on pages with large
// images and a little text.
bool page_has_text = page->GetCharCount() != 0;
// Report metric only once for each page.
// Report metric only once for each page. Note that it is possible to reach
// here multiple times for a given page, if the page was scheduled to be
// Searchified, but the Searchify operation did not complete for some reason.
bool not_reported =
page_has_text_metric_reported_.insert(page->index()).second;
if (not_reported) {

@ -466,7 +466,8 @@ class PDFiumEngine : public DocumentLoader::Client, public IFSDK_PAUSE {
// Tells if the page is waiting to be searchified.
bool PageNeedsSearchify(int page_index) const;
// Schedules searchify for the page if it has no text.
// Schedules searchify for the page if it has no text. `page` must be non-null
// and in an available state.
void ScheduleSearchifyIfNeeded(PDFiumPage* page);
// Cancels a pending searchify if it has not started yet. Ignores the request

@ -228,14 +228,33 @@ TEST_P(PDFiumOnDemandSearchifierTest, PageWithImagesNoRecognizableText) {
StartSearchify(/*empty_results=*/true);
base::test::TestFuture<void> future;
WaitUntilIdle(searchifier, future.GetCallback());
ASSERT_TRUE(future.Wait());
ASSERT_EQ(performed_ocrs(), 2);
EXPECT_TRUE(page.IsPageSearchified());
{
base::test::TestFuture<void> future;
WaitUntilIdle(searchifier, future.GetCallback());
ASSERT_TRUE(future.Wait());
ASSERT_EQ(performed_ocrs(), 2);
EXPECT_TRUE(page.IsPageSearchified());
}
// The page has two images, but no recognizable text.
EXPECT_TRUE(GetPageText(page).empty());
// Unload the page where Searchify did not add any text.
page.Unload();
// Get the text from the page, which reloads the page.
EXPECT_EQ(GetPageText(page), "");
{
// Wait for idle. This should not crash.
base::test::TestFuture<void> future;
WaitUntilIdle(searchifier, future.GetCallback());
ASSERT_TRUE(future.Wait());
// The number of performed OCRs has not changed.
ASSERT_EQ(performed_ocrs(), 2);
EXPECT_TRUE(page.IsPageSearchified());
}
}
TEST_P(PDFiumOnDemandSearchifierTest, MultiplePagesWithImages) {