Add a PDF searchifier that runs on pages on demand.
A new PDF searchifier class is added that runs on pages on demand. All images in the pages without text are sent to the OCR module and the recognized text is added to the PDF page. This CL is partly based on a draft by thestig@chromium.org. Bug: 360803943 Change-Id: Idda631e35ca320a8f3c4c60fb2b72cb68e5f59a3 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5864550 Reviewed-by: Lei Zhang <thestig@chromium.org> Commit-Queue: Ramin Halavati <rhalavati@chromium.org> Cr-Commit-Position: refs/heads/main@{#1360964}
This commit is contained in:

committed by
Chromium LUCI CQ

parent
a960e66e20
commit
4ef56a216e
@ -236,6 +236,8 @@ if (enable_pdf) {
|
||||
|
||||
if (enable_screen_ai_service) {
|
||||
sources += [
|
||||
"pdfium/pdfium_on_demand_searchifier.cc",
|
||||
"pdfium/pdfium_on_demand_searchifier.h",
|
||||
"pdfium/pdfium_progressive_searchifier.cc",
|
||||
"pdfium/pdfium_progressive_searchifier.h",
|
||||
"pdfium/pdfium_searchify.cc",
|
||||
@ -493,7 +495,10 @@ if (enable_pdf) {
|
||||
}
|
||||
|
||||
if (enable_screen_ai_service) {
|
||||
sources += [ "pdfium/pdfium_searchify_unittest.cc" ]
|
||||
sources += [
|
||||
"pdfium/pdfium_on_demand_searchifier_unittest.cc",
|
||||
"pdfium/pdfium_searchify_unittest.cc",
|
||||
]
|
||||
}
|
||||
|
||||
if (v8_use_external_startup_data) {
|
||||
|
@ -94,6 +94,11 @@
|
||||
#include "gin/public/cppgc.h"
|
||||
#endif
|
||||
|
||||
#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
|
||||
#include "pdf/pdfium/pdfium_on_demand_searchifier.h"
|
||||
#include "ui/accessibility/ax_features.mojom-features.h"
|
||||
#endif
|
||||
|
||||
#if BUILDFLAG(IS_LINUX) || BUILDFLAG(IS_CHROMEOS)
|
||||
#include "pdf/pdfium/pdfium_font_linux.h"
|
||||
#endif
|
||||
@ -567,6 +572,10 @@ PDFiumEngine::~PDFiumEngine() {
|
||||
// Clear all the containers that can prevent unloading.
|
||||
find_results_.clear();
|
||||
selection_.clear();
|
||||
#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
|
||||
// Should be reset before document is unloaded.
|
||||
searchifier_.reset();
|
||||
#endif
|
||||
|
||||
for (auto& page : pages_)
|
||||
page->Unload();
|
||||
@ -4246,6 +4255,66 @@ void PDFiumEngine::UpdatePageCount() {
|
||||
}
|
||||
#endif // defined(PDF_ENABLE_XFA)
|
||||
|
||||
#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
|
||||
// Supplies the OCR callback to the on-demand searchifier and starts it.
void PDFiumEngine::StartSearchify(
    PerformOcrCallbackAsync perform_ocr_callback) {
  // `ScheduleSearchifyIfNeeded()` may already have created the searchifier
  // while pages were loading; in that case it is only waiting for `Start()`.
  // Otherwise create it here.
  if (searchifier_ == nullptr) {
    searchifier_ = std::make_unique<PDFiumOnDemandSearchifier>(this);
  }

  PDFiumOnDemandSearchifier& searchifier = *searchifier_;
  searchifier.Start(std::move(perform_ocr_callback));
}
|
||||
|
||||
// Builds a closure that forwards OCR disconnection events to
// `OnOcrDisconnected()`. The receiver is bound through `weak_factory_`.
base::RepeatingClosure PDFiumEngine::GetOcrDisconnectHandler() {
  auto weak_engine = weak_factory_.GetWeakPtr();
  return base::BindRepeating(&PDFiumEngine::OnOcrDisconnected, weak_engine);
}
|
||||
|
||||
void PDFiumEngine::OnOcrDisconnected() {
|
||||
if (searchifier_) {
|
||||
searchifier_->OnOcrDisconnected();
|
||||
}
|
||||
}
|
||||
|
||||
bool PDFiumEngine::PageNeedsSearchify(int page_index) const {
|
||||
CHECK(PageIndexInBounds(page_index));
|
||||
return searchifier_ && searchifier_->IsPageScheduled(page_index);
|
||||
}
|
||||
|
||||
// Queues `page` for searchify when both searchify features are enabled, the
// page is available, and it contains no characters.
void PDFiumEngine::ScheduleSearchifyIfNeeded(PDFiumPage* page) {
  const bool features_enabled =
      base::FeatureList::IsEnabled(chrome_pdf::features::kPdfSearchify) &&
      base::FeatureList::IsEnabled(ax::mojom::features::kScreenAIOCREnabled);
  if (!features_enabled) {
    return;
  }

  // TODO(crbug.com/40066441): Explore heuristics to run OCR on pages with large
  // images and a little text.
  const bool is_textless_page =
      page->available() && page->GetCharCount() == 0;
  if (!is_textless_page) {
    return;
  }

  // This runs during page load, which may happen before the client calls
  // `StartSearchify()` (so the searchifier has to be created here), or after
  // the searchifier gave up because OCR failed (so nothing more is scheduled).
  if (searchifier_) {
    if (searchifier_->HasFailed()) {
      return;
    }
  } else {
    searchifier_ = std::make_unique<PDFiumOnDemandSearchifier>(this);
  }

  searchifier_->SchedulePage(page->index());
}
|
||||
|
||||
void PDFiumEngine::CancelPendingSearchify(int page_index) {
|
||||
if (searchifier_) {
|
||||
searchifier_->RemovePageFromQueue(page_index);
|
||||
}
|
||||
}
|
||||
#endif // BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
|
||||
|
||||
void PDFiumEngine::UpdateLinkUnderCursor(const std::string& target_url) {
|
||||
client_->SetLinkUnderCursor(target_url);
|
||||
}
|
||||
|
@ -59,6 +59,7 @@
|
||||
#endif
|
||||
|
||||
#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
|
||||
#include "pdf/pdfium/pdfium_searchify.h"
|
||||
#include "services/screen_ai/public/mojom/screen_ai_service.mojom-forward.h"
|
||||
#endif
|
||||
|
||||
@ -87,6 +88,10 @@ struct DocumentAttachmentInfo;
|
||||
struct DocumentMetadata;
|
||||
struct PageCharacterIndex;
|
||||
|
||||
#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
|
||||
class PDFiumOnDemandSearchifier;
|
||||
#endif
|
||||
|
||||
namespace draw_utils {
|
||||
class ShadowMatrix;
|
||||
} // namespace draw_utils
|
||||
@ -418,6 +423,29 @@ class PDFiumEngine : public DocumentLoader::Client, public IFSDK_PAUSE {
|
||||
void UpdatePageCount();
|
||||
#endif // defined(PDF_ENABLE_XFA)
|
||||
|
||||
#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
|
||||
// Starts the searchify process and passes a callback to a function that
|
||||
// performs OCR. This function is expected to be called only once.
|
||||
void StartSearchify(PerformOcrCallbackAsync perform_ocr_callback);
|
||||
|
||||
// Returns a function to pass OCR disconnection events to the searchifier.
|
||||
base::RepeatingClosure GetOcrDisconnectHandler();
|
||||
|
||||
// Tells if the page is waiting to be searchified.
|
||||
bool PageNeedsSearchify(int page_index) const;
|
||||
|
||||
// Schedules searchify for the page if it has no text.
|
||||
void ScheduleSearchifyIfNeeded(PDFiumPage* page);
|
||||
|
||||
// Cancels a pending searchify if it has not started yet. Ignores the request
|
||||
// if the page is not scheduled for searchify.
|
||||
void CancelPendingSearchify(int page_index);
|
||||
|
||||
// Test-only accessor for the on-demand searchifier. Returns null when no
// searchifier has been created.
PDFiumOnDemandSearchifier* GetSearchifierForTesting() {
  return searchifier_.get();
}
|
||||
#endif
|
||||
|
||||
void UnsupportedFeature(const std::string& feature);
|
||||
|
||||
FPDF_AVAIL fpdf_availability() const;
|
||||
@ -851,6 +879,11 @@ class PDFiumEngine : public DocumentLoader::Client, public IFSDK_PAUSE {
|
||||
// requests the thumbnail for that page.
|
||||
void MaybeRequestPendingThumbnail(int page_index);
|
||||
|
||||
#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
|
||||
// Called if OCR service gets disconnected.
|
||||
void OnOcrDisconnected();
|
||||
#endif
|
||||
|
||||
const raw_ptr<PDFiumEngineClient> client_;
|
||||
|
||||
// The current document layout.
|
||||
@ -881,6 +914,10 @@ class PDFiumEngine : public DocumentLoader::Client, public IFSDK_PAUSE {
|
||||
// form filler.
|
||||
PDFiumFormFiller form_filler_;
|
||||
|
||||
#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
|
||||
std::unique_ptr<PDFiumOnDemandSearchifier> searchifier_;
|
||||
#endif
|
||||
|
||||
std::unique_ptr<PDFiumDocument> document_;
|
||||
bool document_pending_ = false;
|
||||
bool document_loaded_ = false;
|
||||
|
173
pdf/pdfium/pdfium_on_demand_searchifier.cc
Normal file
173
pdf/pdfium/pdfium_on_demand_searchifier.cc
Normal file
@ -0,0 +1,173 @@
|
||||
// Copyright 2024 The Chromium Authors
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "pdf/pdfium/pdfium_on_demand_searchifier.h"
|
||||
|
||||
#include "base/check.h"
|
||||
#include "base/containers/contains.h"
|
||||
#include "base/task/single_thread_task_runner.h"
|
||||
#include "pdf/pdfium/pdfium_searchify.h"
|
||||
#include "services/screen_ai/public/mojom/screen_ai_service.mojom.h"
|
||||
|
||||
namespace {
|
||||
|
||||
// A delay to wait between page searchify tasks to give more priority to other
|
||||
// PDF tasks.
|
||||
constexpr base::TimeDelta kSearchifyPageDelay = base::Milliseconds(100);
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace chrome_pdf {
|
||||
|
||||
// Binds the searchifier to `engine`; the reference is created from the raw
// pointer with `raw_ref::from_ptr()`.
PDFiumOnDemandSearchifier::PDFiumOnDemandSearchifier(PDFiumEngine* engine)
    : engine_(raw_ref<PDFiumEngine>::from_ptr(engine)) {}

PDFiumOnDemandSearchifier::~PDFiumOnDemandSearchifier() = default;
|
||||
|
||||
// Stores the OCR callback, creates the font used for the added text, and
// begins working through whatever pages are already queued.
void PDFiumOnDemandSearchifier::Start(PerformOcrCallbackAsync callback) {
  CHECK(callback);
  CHECK_EQ(state_, State::kIdle);

  // `Start()` may only be called once, so no callback can be stored yet.
  CHECK(!perform_ocr_callback_);

  font_ = CreateFont(engine_->doc());
  perform_ocr_callback_ = std::move(callback);

  SearchifyNextPage();
}
|
||||
|
||||
void PDFiumOnDemandSearchifier::OnOcrDisconnected() {
|
||||
switch (state_) {
|
||||
case State::kIdle:
|
||||
// No need to change state, if another request comes up, the OCR provider
|
||||
// will try to connect to the service again.
|
||||
return;
|
||||
|
||||
case State::kWaitingForResults:
|
||||
// Assume OCR cannot be used anymore if it gets disconnected while
|
||||
// waiting for results. Therefore cancel all pending requests and move
|
||||
// to failed state.
|
||||
current_page_ = nullptr;
|
||||
pages_queue_.clear();
|
||||
state_ = State::kFailed;
|
||||
return;
|
||||
|
||||
case State::kFailed:
|
||||
// `kFailed` is the end state and searchifier does not accept any requests
|
||||
// after it. So no need to react to OCR disconnection.
|
||||
return;
|
||||
}
|
||||
NOTREACHED();
|
||||
}
|
||||
|
||||
bool PDFiumOnDemandSearchifier::IsPageScheduled(int page_index) const {
|
||||
if (current_page_ && current_page_->index() == page_index) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return base::Contains(pages_queue_, page_index);
|
||||
}
|
||||
|
||||
void PDFiumOnDemandSearchifier::SchedulePage(int page_index) {
|
||||
CHECK_GE(page_index, 0);
|
||||
CHECK_NE(state_, State::kFailed);
|
||||
if (IsPageScheduled(page_index)) {
|
||||
return;
|
||||
}
|
||||
pages_queue_.push_back(page_index);
|
||||
if (state_ == State::kWaitingForResults || !perform_ocr_callback_) {
|
||||
return;
|
||||
}
|
||||
|
||||
CHECK_EQ(state_, State::kIdle);
|
||||
|
||||
base::SingleThreadTaskRunner::GetCurrentDefault()->PostDelayedTask(
|
||||
FROM_HERE,
|
||||
base::BindOnce(&PDFiumOnDemandSearchifier::SearchifyNextPage,
|
||||
weak_factory_.GetWeakPtr()),
|
||||
kSearchifyPageDelay);
|
||||
|
||||
// Avoid posting `SearchifyNextPage` more than once.
|
||||
state_ = State::kWaitingForResults;
|
||||
}
|
||||
|
||||
// Erases `page_index` from the pending queue. The page currently being
// processed (`current_page_`) is not affected.
void PDFiumOnDemandSearchifier::RemovePageFromQueue(int page_index) {
  base::Erase(pages_queue_, page_index);
}
|
||||
|
||||
// Takes the next page off the queue and starts OCR on its images. Goes idle
// when the queue is empty and does nothing once the searchifier has failed.
void PDFiumOnDemandSearchifier::SearchifyNextPage() {
  if (state_ == State::kFailed) {
    // OCR got disconnected; no further work is accepted.
    return;
  }

  if (pages_queue_.empty()) {
    state_ = State::kIdle;
    return;
  }

  state_ = State::kWaitingForResults;

  // Resolve the page before removing its index from the queue.
  const int next_page_index = pages_queue_.front();
  current_page_ = engine_->GetPage(next_page_index);
  CHECK(current_page_);
  pages_queue_.pop_front();

  current_page_image_object_indices_ = current_page_->GetImageObjectIndices();
  SearchifyNextImage();
}
|
||||
|
||||
void PDFiumOnDemandSearchifier::SearchifyNextImage() {
|
||||
std::optional<BitmapResult> result = GetNextBitmap();
|
||||
if (!result.has_value()) {
|
||||
current_page_->ReloadTextPage();
|
||||
if (!FPDFPage_GenerateContent(current_page_->GetPage())) {
|
||||
LOG(ERROR) << "Failed to generate content";
|
||||
}
|
||||
current_page_ = nullptr;
|
||||
|
||||
// Searchify next page.
|
||||
base::SingleThreadTaskRunner::GetCurrentDefault()->PostDelayedTask(
|
||||
FROM_HERE,
|
||||
base::BindOnce(&PDFiumOnDemandSearchifier::SearchifyNextPage,
|
||||
weak_factory_.GetWeakPtr()),
|
||||
kSearchifyPageDelay);
|
||||
return;
|
||||
}
|
||||
|
||||
const auto& bitmap = result.value().bitmap;
|
||||
perform_ocr_callback_.Run(
|
||||
bitmap,
|
||||
base::BindOnce(&PDFiumOnDemandSearchifier::OnGotOcrResult,
|
||||
weak_factory_.GetWeakPtr(), result.value().image_index,
|
||||
gfx::Size(bitmap.width(), bitmap.height())));
|
||||
}
|
||||
|
||||
std::optional<PDFiumOnDemandSearchifier::BitmapResult>
|
||||
PDFiumOnDemandSearchifier::GetNextBitmap() {
|
||||
while (!current_page_image_object_indices_.empty()) {
|
||||
int image_index = current_page_image_object_indices_.back();
|
||||
current_page_image_object_indices_.pop_back();
|
||||
SkBitmap bitmap = current_page_->GetImageForOcr(image_index);
|
||||
if (!bitmap.drawsNothing()) {
|
||||
return BitmapResult{bitmap, image_index};
|
||||
}
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
void PDFiumOnDemandSearchifier::OnGotOcrResult(
|
||||
int image_index,
|
||||
const gfx::Size& image_size,
|
||||
screen_ai::mojom::VisualAnnotationPtr annotation) {
|
||||
CHECK_EQ(state_, State::kWaitingForResults);
|
||||
if (annotation) {
|
||||
FPDF_PAGEOBJECT image =
|
||||
FPDFPage_GetObject(current_page_->GetPage(), image_index);
|
||||
AddTextOnImage(engine_->doc(), current_page_->GetPage(), font_.get(), image,
|
||||
std::move(annotation), image_size);
|
||||
}
|
||||
SearchifyNextImage();
|
||||
}
|
||||
|
||||
} // namespace chrome_pdf
|
87
pdf/pdfium/pdfium_on_demand_searchifier.h
Normal file
87
pdf/pdfium/pdfium_on_demand_searchifier.h
Normal file
@ -0,0 +1,87 @@
|
||||
// Copyright 2024 The Chromium Authors
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef PDF_PDFIUM_PDFIUM_ON_DEMAND_SEARCHIFIER_H_
|
||||
#define PDF_PDFIUM_PDFIUM_ON_DEMAND_SEARCHIFIER_H_
|
||||
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
#include "base/containers/circular_deque.h"
|
||||
#include "base/functional/callback_forward.h"
|
||||
#include "base/memory/raw_ref.h"
|
||||
#include "base/memory/weak_ptr.h"
|
||||
#include "pdf/pdfium/pdfium_engine.h"
|
||||
#include "services/screen_ai/public/mojom/screen_ai_service.mojom-forward.h"
|
||||
#include "third_party/pdfium/public/cpp/fpdf_scopers.h"
|
||||
#include "third_party/skia/include/core/SkBitmap.h"
|
||||
namespace chrome_pdf {
|
||||
|
||||
class PDFiumOnDemandSearchifier {
|
||||
public:
|
||||
explicit PDFiumOnDemandSearchifier(PDFiumEngine* engine);
|
||||
~PDFiumOnDemandSearchifier();
|
||||
|
||||
// Starts performing searchify on the scheduled pages. The function should be
|
||||
// called only once. If pages are added for searchifying later, they are
|
||||
// automatically picked up from the queue.
|
||||
void Start(PerformOcrCallbackAsync callback);
|
||||
|
||||
// Called when OCR service is disconnected and is not available anymore.
|
||||
void OnOcrDisconnected();
|
||||
|
||||
// Checks if the page is queued to be searchified or the searchifying process
|
||||
// has started for it but not finished yet.
|
||||
bool IsPageScheduled(int page_index) const;
|
||||
|
||||
// Puts a page in the queue to be searchified. This function can be called
|
||||
// before `Start` and if so, the page stays in the queue until searchifier
|
||||
// starts.
|
||||
void SchedulePage(int page_index);
|
||||
|
||||
// Removes the page form the searchifying queue if it's there.
|
||||
void RemovePageFromQueue(int page_index);
|
||||
|
||||
bool HasFailed() const { return state_ == State::kFailed; }
|
||||
bool IsIdleForTesting() const { return state_ == State::kIdle; }
|
||||
|
||||
private:
|
||||
enum class State { kIdle, kWaitingForResults, kFailed };
|
||||
|
||||
void SearchifyNextPage();
|
||||
void SearchifyNextImage();
|
||||
|
||||
struct BitmapResult {
|
||||
SkBitmap bitmap;
|
||||
int image_index;
|
||||
};
|
||||
|
||||
std::optional<BitmapResult> GetNextBitmap();
|
||||
void OnGotOcrResult(int image_index,
|
||||
const gfx::Size& image_size,
|
||||
screen_ai::mojom::VisualAnnotationPtr annotation);
|
||||
|
||||
// Owns this class.
|
||||
const raw_ref<PDFiumEngine> engine_;
|
||||
|
||||
ScopedFPDFFont font_;
|
||||
|
||||
// Callback function to perform OCR.
|
||||
PerformOcrCallbackAsync perform_ocr_callback_;
|
||||
|
||||
// The page that is currently OCRed.
|
||||
raw_ptr<PDFiumPage> current_page_ = nullptr;
|
||||
std::vector<int> current_page_image_object_indices_;
|
||||
|
||||
// Scheduled pages to be searchified.
|
||||
base::circular_deque<int> pages_queue_;
|
||||
|
||||
State state_ = State::kIdle;
|
||||
|
||||
base::WeakPtrFactory<PDFiumOnDemandSearchifier> weak_factory_{this};
|
||||
};
|
||||
|
||||
} // namespace chrome_pdf
|
||||
|
||||
#endif // PDF_PDFIUM_PDFIUM_ON_DEMAND_SEARCHIFIER_H_
|
252
pdf/pdfium/pdfium_on_demand_searchifier_unittest.cc
Normal file
252
pdf/pdfium/pdfium_on_demand_searchifier_unittest.cc
Normal file
@ -0,0 +1,252 @@
|
||||
// Copyright 2024 The Chromium Authors
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "pdf/pdfium/pdfium_on_demand_searchifier.h"
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "base/functional/callback.h"
|
||||
#include "base/strings/stringprintf.h"
|
||||
#include "base/strings/utf_string_conversions.h"
|
||||
#include "base/task/single_thread_task_runner.h"
|
||||
#include "base/test/scoped_feature_list.h"
|
||||
#include "base/test/test_future.h"
|
||||
#include "base/time/time.h"
|
||||
#include "pdf/pdf_features.h"
|
||||
#include "pdf/pdfium/pdfium_range.h"
|
||||
#include "pdf/pdfium/pdfium_test_base.h"
|
||||
#include "pdf/test/test_client.h"
|
||||
#include "services/screen_ai/public/mojom/screen_ai_service.mojom.h"
|
||||
|
||||
namespace {
|
||||
|
||||
// Runs `callback` as soon as `searchifier` reports idle, polling every 100 ms
// until then.
void WaitUntilIdle(chrome_pdf::PDFiumOnDemandSearchifier* searchifier,
                   base::OnceClosure callback) {
  if (!searchifier->IsIdleForTesting()) {
    // Still busy: check again later.
    base::SingleThreadTaskRunner::GetCurrentDefault()->PostDelayedTask(
        FROM_HERE,
        base::BindOnce(&WaitUntilIdle, searchifier, std::move(callback)),
        base::Milliseconds(100));
    return;
  }

  std::move(callback).Run();
}
|
||||
|
||||
// Runs `callback` as soon as `searchifier` reports failure, polling every
// 100 ms until then.
void WaitUntilFailure(chrome_pdf::PDFiumOnDemandSearchifier* searchifier,
                      base::OnceClosure callback) {
  if (!searchifier->HasFailed()) {
    // Not failed yet: check again later.
    base::SingleThreadTaskRunner::GetCurrentDefault()->PostDelayedTask(
        FROM_HERE,
        base::BindOnce(&WaitUntilFailure, searchifier, std::move(callback)),
        base::Milliseconds(100));
    return;
  }

  std::move(callback).Run();
}
|
||||
|
||||
// Builds a fake OCR result holding one line with one word, "OCR Text <N>",
// where <N> is `call_number`. All boxes are 100x100 at the origin with a zero
// angle.
screen_ai::mojom::VisualAnnotationPtr CreateDummyAnnotation(int call_number) {
  const gfx::Rect box(0, 0, 100, 100);

  auto word = screen_ai::mojom::WordBox::New();
  word->word = base::StringPrintf("OCR Text %i", call_number);
  word->bounding_box = box;
  word->bounding_box_angle = 0;

  auto line = screen_ai::mojom::LineBox::New();
  line->baseline_box = box;
  line->baseline_box_angle = 0;
  line->bounding_box = box;
  line->bounding_box_angle = 0;
  line->words.push_back(std::move(word));

  auto result = screen_ai::mojom::VisualAnnotation::New();
  result->lines.push_back(std::move(line));
  return result;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace chrome_pdf {
|
||||
|
||||
class PDFiumOnDemandSearchifierTest : public PDFiumTestBase {
|
||||
public:
|
||||
PDFiumOnDemandSearchifierTest() {
|
||||
scoped_feature_list_.InitAndEnableFeature(
|
||||
chrome_pdf::features::kPdfSearchify);
|
||||
}
|
||||
|
||||
void CreateEngine(const base::FilePath::CharType* test_filename) {
|
||||
engine_ = InitializeEngine(&client_, test_filename);
|
||||
ASSERT_TRUE(engine_) << test_filename;
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
// PDFium gets uninitialized via `FPDF_DestroyLibrary`. If `engine_` is not
|
||||
// destroyed here, its destruction results in a crash later.
|
||||
engine_.reset();
|
||||
PDFiumTestBase::TearDown();
|
||||
}
|
||||
|
||||
void StartSearchify() {
|
||||
// `engine_` is owned by this class, safe to use as unretained.
|
||||
engine_->StartSearchify(
|
||||
base::BindRepeating(&PDFiumOnDemandSearchifierTest::MockPerformOcr,
|
||||
base::Unretained(this)));
|
||||
}
|
||||
|
||||
void MockPerformOcr(
|
||||
const SkBitmap& image,
|
||||
base::OnceCallback<void(screen_ai::mojom::VisualAnnotationPtr)>
|
||||
callback) {
|
||||
// Reply with delay, as done through mojo connection to the OCR service.
|
||||
base::SingleThreadTaskRunner::GetCurrentDefault()->PostDelayedTask(
|
||||
FROM_HERE,
|
||||
base::BindOnce(
|
||||
[](base::OnceCallback<void(screen_ai::mojom::VisualAnnotationPtr)>
|
||||
callback,
|
||||
int call_number) {
|
||||
std::move(callback).Run(CreateDummyAnnotation(call_number));
|
||||
},
|
||||
std::move(callback), performed_ocrs_),
|
||||
base::Milliseconds(100));
|
||||
|
||||
performed_ocrs_++;
|
||||
}
|
||||
|
||||
// Returns all characters in the page.
|
||||
std::string GetPageText(chrome_pdf::PDFiumPage* page) {
|
||||
return base::UTF16ToUTF8(
|
||||
chrome_pdf::PDFiumRange::AllTextOnPage(page).GetText());
|
||||
}
|
||||
|
||||
int performed_ocrs() const { return performed_ocrs_; }
|
||||
PDFiumEngine* engine() { return engine_.get(); }
|
||||
|
||||
private:
|
||||
base::test::ScopedFeatureList scoped_feature_list_;
|
||||
std::unique_ptr<PDFiumEngine> engine_;
|
||||
TestClient client_;
|
||||
int performed_ocrs_ = 0;
|
||||
};
|
||||
|
||||
// Loading a page of "hello_world2.pdf" must neither schedule searchify nor
// create a searchifier.
TEST_P(PDFiumOnDemandSearchifierTest, NoImage) {
  CreateEngine(FILE_PATH_LITERAL("hello_world2.pdf"));

  // Loading the page is what triggers the searchify check.
  PDFiumPage* first_page = engine()->GetPage(0);
  first_page->GetPage();
  ASSERT_FALSE(engine()->PageNeedsSearchify(0));

  // Nothing needed searchify, so no searchifier should exist yet.
  ASSERT_FALSE(engine()->GetSearchifierForTesting());
}
|
||||
|
||||
// A single page with two images gets scheduled on load and, once searchify
// starts, ends up with the text of both OCR results.
TEST_P(PDFiumOnDemandSearchifierTest, OnePageWithImages) {
  CreateEngine(FILE_PATH_LITERAL("image_alt_text.pdf"));

  // Loading the page is what triggers the searchify check.
  engine()->GetPage(0)->GetPage();
  ASSERT_TRUE(engine()->PageNeedsSearchify(0));

  PDFiumOnDemandSearchifier* searchifier = engine()->GetSearchifierForTesting();
  ASSERT_TRUE(searchifier);
  ASSERT_TRUE(searchifier->IsPageScheduled(0));

  StartSearchify();

  // Block until the searchifier has drained its queue.
  base::test::TestFuture<void> idle_future;
  WaitUntilIdle(searchifier, idle_future.GetCallback());
  ASSERT_TRUE(idle_future.Wait());
  ASSERT_EQ(performed_ocrs(), 2);

  // The page has two images.
  const std::string text = GetPageText(engine()->GetPage(0));
  ASSERT_EQ(text, "OCR Text 0\r\nOCR Text 1");
}
|
||||
|
||||
// All four pages of the document are scheduled on load and each one receives
// its own OCR text once searchify runs.
TEST_P(PDFiumOnDemandSearchifierTest, MultiplePagesWithImages) {
  constexpr int kPageCount = 4;
  CreateEngine(FILE_PATH_LITERAL("multi_page_no_text.pdf"));

  // Load every page and verify that it needs searchify.
  for (int i = 0; i < kPageCount; ++i) {
    engine()->GetPage(i)->GetPage();
    ASSERT_TRUE(engine()->PageNeedsSearchify(i));
  }

  PDFiumOnDemandSearchifier* searchifier = engine()->GetSearchifierForTesting();
  ASSERT_TRUE(searchifier);

  // Every page should be scheduled in the searchifier.
  for (int i = 0; i < kPageCount; ++i) {
    ASSERT_TRUE(searchifier->IsPageScheduled(i)) << i;
  }

  StartSearchify();

  // Block until the searchifier has drained its queue.
  base::test::TestFuture<void> idle_future;
  WaitUntilIdle(searchifier, idle_future.GetCallback());
  ASSERT_TRUE(idle_future.Wait());
  ASSERT_EQ(performed_ocrs(), 4);

  EXPECT_EQ(GetPageText(engine()->GetPage(0)), "OCR Text 0");
  EXPECT_EQ(GetPageText(engine()->GetPage(1)), "OCR Text 1");
  EXPECT_EQ(GetPageText(engine()->GetPage(2)), "OCR Text 2");
  EXPECT_EQ(GetPageText(engine()->GetPage(3)), "OCR Text 3");
}
|
||||
|
||||
// Unloading a page before searchify starts removes it from the queue, so only
// the remaining pages are OCRed.
TEST_P(PDFiumOnDemandSearchifierTest, MultiplePagesWithUnload) {
  constexpr int kPageCount = 4;
  CreateEngine(FILE_PATH_LITERAL("multi_page_no_text.pdf"));

  // Load every page.
  for (int i = 0; i < kPageCount; ++i) {
    ASSERT_TRUE(engine()->GetPage(i)->GetPage());
  }

  // Unload the first page again before anything has been processed.
  engine()->GetPage(0)->Unload();

  PDFiumOnDemandSearchifier* searchifier = engine()->GetSearchifierForTesting();
  ASSERT_TRUE(searchifier);
  ASSERT_FALSE(searchifier->IsPageScheduled(0));

  StartSearchify();

  // Block until the searchifier has drained its queue.
  base::test::TestFuture<void> idle_future;
  WaitUntilIdle(searchifier, idle_future.GetCallback());
  ASSERT_TRUE(idle_future.Wait());
  ASSERT_EQ(performed_ocrs(), kPageCount - 1);

  // First page is not searchified.
  const std::string first_page_text = GetPageText(engine()->GetPage(0));
  EXPECT_TRUE(first_page_text.empty());

  // Other pages are searchified.
  EXPECT_EQ(GetPageText(engine()->GetPage(1)), "OCR Text 0");
  EXPECT_EQ(GetPageText(engine()->GetPage(2)), "OCR Text 1");
  EXPECT_EQ(GetPageText(engine()->GetPage(3)), "OCR Text 2");
}
|
||||
|
||||
// Disconnecting OCR right after starting moves the searchifier to the failed
// state before all pages have been OCRed.
TEST_P(PDFiumOnDemandSearchifierTest, OcrCancellation) {
  constexpr int kPageCount = 4;
  CreateEngine(FILE_PATH_LITERAL("multi_page_no_text.pdf"));

  // Load every page.
  for (int i = 0; i < kPageCount; ++i) {
    ASSERT_TRUE(engine()->GetPage(i)->GetPage());
  }

  StartSearchify();

  // Simulate the OCR service disconnecting.
  base::RepeatingClosure disconnect_handler =
      engine()->GetOcrDisconnectHandler();
  disconnect_handler.Run();

  base::test::TestFuture<void> failure_future;
  WaitUntilFailure(engine()->GetSearchifierForTesting(),
                   failure_future.GetCallback());
  ASSERT_TRUE(failure_future.Wait());

  // Performing OCR is async and has some delay. It is expected that
  // cancellation takes effect before all pages are OCRed.
  ASSERT_LT(performed_ocrs(), kPageCount);
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(All, PDFiumOnDemandSearchifierTest, testing::Bool());
|
||||
|
||||
} // namespace chrome_pdf
|
@ -12,6 +12,7 @@
|
||||
#include <utility>
|
||||
|
||||
#include "base/check_op.h"
|
||||
#include "base/containers/to_vector.h"
|
||||
#include "base/functional/bind.h"
|
||||
#include "base/functional/callback.h"
|
||||
#include "base/metrics/histogram_functions.h"
|
||||
@ -403,6 +404,10 @@ void PDFiumPage::Unload() {
|
||||
text_page_.reset();
|
||||
|
||||
if (page_) {
|
||||
#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
|
||||
// TODO(crbug.com/360803943): Keep previously generated OCR results.
|
||||
engine_->CancelPendingSearchify(index_);
|
||||
#endif
|
||||
if (engine_->form()) {
|
||||
FORM_OnBeforeClosePage(page(), engine_->form());
|
||||
}
|
||||
@ -417,8 +422,13 @@ FPDF_PAGE PDFiumPage::GetPage() {
|
||||
if (!page_) {
|
||||
ScopedUnloadPreventer scoped_unload_preventer(this);
|
||||
page_.reset(FPDF_LoadPage(engine_->doc(), index_));
|
||||
if (page_ && engine_->form()) {
|
||||
FORM_OnAfterLoadPage(page(), engine_->form());
|
||||
if (page_) {
|
||||
#if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE)
|
||||
engine_->ScheduleSearchifyIfNeeded(this);
|
||||
#endif
|
||||
if (engine_->form()) {
|
||||
FORM_OnAfterLoadPage(page(), engine_->form());
|
||||
}
|
||||
}
|
||||
}
|
||||
return page();
|
||||
@ -434,6 +444,12 @@ FPDF_TEXTPAGE PDFiumPage::GetTextPage() {
|
||||
return text_page();
|
||||
}
|
||||
|
||||
// Discards the loaded text page and requests it again through
// `GetTextPage()`. Must not be called while unloading is being prevented.
void PDFiumPage::ReloadTextPage() {
  CHECK_EQ(preventing_unload_count_, 0);

  // Drop the current text page first so that the call below cannot reuse it.
  text_page_.reset();
  GetTextPage();
}
|
||||
|
||||
void PDFiumPage::CalculatePageObjectTextRunBreaks() {
|
||||
if (calculated_page_object_text_run_breaks_)
|
||||
return;
|
||||
@ -809,6 +825,16 @@ std::vector<AccessibilityImageInfo> PDFiumPage::GetImageInfo(
|
||||
return image_info;
|
||||
}
|
||||
|
||||
std::vector<int> PDFiumPage::GetImageObjectIndices() {
|
||||
if (!available_) {
|
||||
return {};
|
||||
}
|
||||
|
||||
CalculateImages();
|
||||
return base::ToVector(
|
||||
images_, [](const Image& image) { return image.page_object_index; });
|
||||
}
|
||||
|
||||
SkBitmap PDFiumPage::GetImageForOcr(int page_object_index) {
|
||||
FPDF_PAGE page = GetPage();
|
||||
FPDF_PAGEOBJECT page_object = FPDFPage_GetObject(page, page_object_index);
|
||||
|
@ -77,6 +77,9 @@ class PDFiumPage {
|
||||
// Gets the number of characters in the page.
|
||||
int GetCharCount();
|
||||
|
||||
// Resets loaded text and loads it again.
|
||||
void ReloadTextPage();
|
||||
|
||||
// See definition of PDFiumEngine::GetTextRunInfo().
|
||||
std::optional<AccessibilityTextRunInfo> GetTextRunInfo(int start_char_index);
|
||||
|
||||
@ -109,6 +112,9 @@ class PDFiumPage {
|
||||
// `image_data` field.
|
||||
std::vector<AccessibilityImageInfo> GetImageInfo(uint32_t text_run_count);
|
||||
|
||||
// Returns the indices of image objects.
|
||||
std::vector<int> GetImageObjectIndices();
|
||||
|
||||
// Returns the image as a 32-bit bitmap format for OCR.
|
||||
SkBitmap GetImageForOcr(int page_object_index);
|
||||
|
||||
|
@ -23,6 +23,11 @@ class Size;
|
||||
|
||||
namespace chrome_pdf {
|
||||
|
||||
// Asynchronous OCR entry point: called with the bitmap to recognize and a
// one-shot reply callback that receives the resulting annotation.
using PerformOcrCallbackAsync = base::RepeatingCallback<void(
    const SkBitmap& bitmap,
    base::OnceCallback<void(
        screen_ai::mojom::VisualAnnotationPtr annotation)>)>;
|
||||
|
||||
struct SearchifyBoundingBoxOrigin {
|
||||
gfx::PointF point;
|
||||
float theta;
|
||||
|
81
pdf/test/data/multi_page_no_text.in
Normal file
81
pdf/test/data/multi_page_no_text.in
Normal file
@ -0,0 +1,81 @@
|
||||
{{header}}
|
||||
{{object 1 0}} <<
|
||||
/Type /Catalog
|
||||
/Pages 2 0 R
|
||||
>>
|
||||
endobj
|
||||
{{object 2 0}} <<
|
||||
/Type /Pages
|
||||
/Count 4
|
||||
/Kids [3 0 R 4 0 R 5 0 R 6 0 R]
|
||||
/Resources <<
|
||||
/XObject <<
|
||||
/I1 7 0 R
|
||||
>>
|
||||
>>
|
||||
/MediaBox [0 0 600 800]
|
||||
>>
|
||||
endobj
|
||||
{{object 3 0}} <<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Contents [8 0 R]
|
||||
>>
|
||||
endobj
|
||||
{{object 4 0}} <<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Contents [8 0 R]
|
||||
>>
|
||||
endobj
|
||||
{{object 5 0}} <<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Contents [8 0 R]
|
||||
>>
|
||||
endobj
|
||||
{{object 6 0}} <<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Contents [8 0 R]
|
||||
>>
|
||||
endobj
|
||||
{{object 7 0}} <<
|
||||
/Type /XObject
|
||||
/Subtype /Image
|
||||
/Width 600
|
||||
/Height 800
|
||||
/BitsPerComponent 8
|
||||
/ColorSpace /DeviceCMYK
|
||||
/Decode [1 0 1 0 1 0 1 0]
|
||||
/Filter [/ASCIIHexDecode /FlateDecode /DCTDecode]
|
||||
/Name /I1
|
||||
{{streamlen}}
|
||||
>>
|
||||
stream
|
||||
789cedcd3d4ec2001806e00f680529d4165a643671f112c6901095b09838b01983899bd7e02c9ec
|
||||
2c143f8b37802af50cba81770799eef9ddebcc9d7bc355f515c2faf96d1e974e2b2bd68be23bfd8
|
||||
3edd3fc436f69acf5844afdbdda795b44907699a24e9b0df3f188c86a35136ccb2713e29c679996
|
||||
759312bca6955d7f5e8e8783eabe693aaae9a97a806d3dd74972c4e625dc6a68c5519cd6bd4ede3
|
||||
f8edb0adabe26f1bcd47e4c922d6b18955c4793cdf9ddedc9e3dfe9d01000000000000000000000
|
||||
0000000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
0000000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
0000000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
000000000000000000000000000000000000000000000000000000000000000000000000fc835ef
|
||||
3fe03ce2e1eba
|
||||
endstream
|
||||
endobj
|
||||
{{object 8 0}} <<
|
||||
{{streamlen}}
|
||||
>>
|
||||
stream
|
||||
q
|
||||
600 0 0 800 0 0 cm
|
||||
/I1 Do
|
||||
Q
|
||||
endstream
|
||||
endobj
|
||||
{{xref}}
|
||||
{{trailer}}
|
||||
{{startxref}}
|
||||
%%EOF
|
96
pdf/test/data/multi_page_no_text.pdf
Normal file
96
pdf/test/data/multi_page_no_text.pdf
Normal file
@ -0,0 +1,96 @@
|
||||
%PDF-1.7
|
||||
%<25><><EFBFBD><EFBFBD>
|
||||
1 0 obj <<
|
||||
/Type /Catalog
|
||||
/Pages 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj <<
|
||||
/Type /Pages
|
||||
/Count 4
|
||||
/Kids [3 0 R 4 0 R 5 0 R 6 0 R]
|
||||
/Resources <<
|
||||
/XObject <<
|
||||
/I1 7 0 R
|
||||
>>
|
||||
>>
|
||||
/MediaBox [0 0 600 800]
|
||||
>>
|
||||
endobj
|
||||
3 0 obj <<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Contents [8 0 R]
|
||||
>>
|
||||
endobj
|
||||
4 0 obj <<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Contents [8 0 R]
|
||||
>>
|
||||
endobj
|
||||
5 0 obj <<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Contents [8 0 R]
|
||||
>>
|
||||
endobj
|
||||
6 0 obj <<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Contents [8 0 R]
|
||||
>>
|
||||
endobj
|
||||
7 0 obj <<
|
||||
/Type /XObject
|
||||
/Subtype /Image
|
||||
/Width 600
|
||||
/Height 800
|
||||
/BitsPerComponent 8
|
||||
/ColorSpace /DeviceCMYK
|
||||
/Decode [1 0 1 0 1 0 1 0]
|
||||
/Filter [/ASCIIHexDecode /FlateDecode /DCTDecode]
|
||||
/Name /I1
|
||||
/Length 734
|
||||
>>
|
||||
stream
|
||||
789cedcd3d4ec2001806e00f680529d4165a643671f112c6901095b09838b01983899bd7e02c9ec
|
||||
2c143f8b37802af50cba81770799eef9ddebcc9d7bc355f515c2faf96d1e974e2b2bd68be23bfd8
|
||||
3edd3fc436f69acf5844afdbdda795b44907699a24e9b0df3f188c86a35136ccb2713e29c679996
|
||||
759312bca6955d7f5e8e8783eabe693aaae9a97a806d3dd74972c4e625dc6a68c5519cd6bd4ede3
|
||||
f8edb0adabe26f1bcd47e4c922d6b18955c4793cdf9ddedc9e3dfe9d01000000000000000000000
|
||||
0000000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
0000000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
0000000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
000000000000000000000000000000000000000000000000000000000000000000000000fc835ef
|
||||
3fe03ce2e1eba
|
||||
endstream
|
||||
endobj
|
||||
8 0 obj <<
|
||||
/Length 30
|
||||
>>
|
||||
stream
|
||||
q
|
||||
600 0 0 800 0 0 cm
|
||||
/I1 Do
|
||||
Q
|
||||
endstream
|
||||
endobj
|
||||
xref
|
||||
0 9
|
||||
0000000000 65535 f
|
||||
0000000015 00000 n
|
||||
0000000068 00000 n
|
||||
0000000235 00000 n
|
||||
0000000306 00000 n
|
||||
0000000377 00000 n
|
||||
0000000448 00000 n
|
||||
0000000519 00000 n
|
||||
0000001506 00000 n
|
||||
trailer <<
|
||||
/Root 1 0 R
|
||||
/Size 9
|
||||
>>
|
||||
startxref
|
||||
1587
|
||||
%%EOF
|
Reference in New Issue
Block a user