[PDF Searchify] Avoid Searchifying the same page twice
Extend PDFiumOnDemandSearchifierTest.PageWithImagesNoRecognizableText to perform a page unload followed by a reload. This triggers a CHECK() failure that can be seen in crash reports in the wild. Fix this crash by modifying the PDFiumEngine::ScheduleSearchifyIfNeeded() logic to not schedule Searchify for pages that have been Searchified, regardless of whether Searchify added text to the page or not. Bug: 395485272 Change-Id: I58772f8f2c71d271c61898b6b3364c4df089027d Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6250975 Reviewed-by: Ramin Halavati <rhalavati@chromium.org> Commit-Queue: Lei Zhang <thestig@chromium.org> Cr-Commit-Position: refs/heads/main@{#1419360}
This commit is contained in:

committed by
Chromium LUCI CQ

parent
0ad42055ae
commit
cf482574c9
@ -4389,14 +4389,20 @@ bool PDFiumEngine::PageNeedsSearchify(int page_index) const {
|
||||
}
|
||||
|
||||
void PDFiumEngine::ScheduleSearchifyIfNeeded(PDFiumPage* page) {
|
||||
if (!page->available()) {
|
||||
CHECK(page);
|
||||
CHECK(page->available());
|
||||
|
||||
if (page->IsPageSearchified()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO(crbug.com/40066441): Explore heuristics to run OCR on pages with large
|
||||
// images and a little text.
|
||||
bool page_has_text = page->GetCharCount() != 0;
|
||||
|
||||
// Report metric only once for each page.
|
||||
// Report metric only once for each page. Note that it is possible to reach
|
||||
// here multiple times for a given page, if the page was scheduled to be
|
||||
// Searchified, but the Searchify operation did not complete for some reason.
|
||||
bool not_reported =
|
||||
page_has_text_metric_reported_.insert(page->index()).second;
|
||||
if (not_reported) {
|
||||
|
@ -466,7 +466,8 @@ class PDFiumEngine : public DocumentLoader::Client, public IFSDK_PAUSE {
|
||||
// Tells if the page is waiting to be searchified.
|
||||
bool PageNeedsSearchify(int page_index) const;
|
||||
|
||||
// Schedules searchify for the page if it has no text.
|
||||
// Schedules searchify for the page if it has no text. `page` must be non-null
|
||||
// and in an available state.
|
||||
void ScheduleSearchifyIfNeeded(PDFiumPage* page);
|
||||
|
||||
// Cancels a pending searchify if it has not started yet. Ignores the request
|
||||
|
@ -228,14 +228,33 @@ TEST_P(PDFiumOnDemandSearchifierTest, PageWithImagesNoRecognizableText) {
|
||||
|
||||
StartSearchify(/*empty_results=*/true);
|
||||
|
||||
base::test::TestFuture<void> future;
|
||||
WaitUntilIdle(searchifier, future.GetCallback());
|
||||
ASSERT_TRUE(future.Wait());
|
||||
ASSERT_EQ(performed_ocrs(), 2);
|
||||
EXPECT_TRUE(page.IsPageSearchified());
|
||||
{
|
||||
base::test::TestFuture<void> future;
|
||||
WaitUntilIdle(searchifier, future.GetCallback());
|
||||
ASSERT_TRUE(future.Wait());
|
||||
ASSERT_EQ(performed_ocrs(), 2);
|
||||
EXPECT_TRUE(page.IsPageSearchified());
|
||||
}
|
||||
|
||||
// The page has two images, but no recognizable text.
|
||||
EXPECT_TRUE(GetPageText(page).empty());
|
||||
|
||||
// Unload the page where Searchify did not add any text.
|
||||
page.Unload();
|
||||
|
||||
// Get the text from the page, which reloads the page.
|
||||
EXPECT_EQ(GetPageText(page), "");
|
||||
|
||||
{
|
||||
// Wait for idle. This should not crash.
|
||||
base::test::TestFuture<void> future;
|
||||
WaitUntilIdle(searchifier, future.GetCallback());
|
||||
ASSERT_TRUE(future.Wait());
|
||||
|
||||
// The number of performed OCRs has not changed.
|
||||
ASSERT_EQ(performed_ocrs(), 2);
|
||||
EXPECT_TRUE(page.IsPageSearchified());
|
||||
}
|
||||
}
|
||||
|
||||
TEST_P(PDFiumOnDemandSearchifierTest, MultiplePagesWithImages) {
|
||||
|
Reference in New Issue
Block a user