0

Reland "Add PDF engine method to get all text in the document"

This is a reland of commit 8d44e4899d

The original CL added a method to grab all the text because that was
the format needed by the LensOverlayController that was going to use
this method. The required format has since changed, so this CL modifies
the original method from getting all text at once to getting the text on
a given page instead.

Original change's description:
> Add PDF engine method to get all text in the document
>
> This will be used by the Lens Overlay to pass a fraction of the document
> to be used for suggest signals while the full PDF is being processed.
> Retrieving the text in the Lens overlay controller will come in a
> followup CL.
>
> Bug: 379344946
> Change-Id: Ie52df82022916a3bb367150207d1b70e03fbce8a
> Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6032353
> Reviewed-by: Lei Zhang <thestig@chromium.org>
> Commit-Queue: Duncan Mercer <mercerd@google.com>
> Cr-Commit-Position: refs/heads/main@{#1385831}

Bug: 379344946
Change-Id: I91d09c610fb64b050ae12e29ed519804dd1dfe38
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6042085
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Duncan Mercer <mercerd@google.com>
Cr-Commit-Position: refs/heads/main@{#1386411}
This commit is contained in:
Duncan Mercer
2024-11-21 20:17:37 +00:00
committed by Chromium LUCI CQ
parent 20b8655eba
commit fba49a40ba
3 changed files with 22 additions and 0 deletions

@ -2248,6 +2248,12 @@ void PDFiumEngine::DisplayAnnotations(bool display) {
InvalidateAllPages();
}
// Returns the text of the page at `page_index` as a UTF-16 string.
// `page_index` must pass PageIndexInBounds(); this is enforced with a CHECK
// before `pages_` is indexed.
std::u16string PDFiumEngine::GetPageText(int page_index) {
  CHECK(PageIndexInBounds(page_index));
  return PDFiumRange::AllTextOnPage(pages_[page_index].get()).GetText();
}
void PDFiumEngine::InvalidateAllPages() {
CancelPaints();
StopFind();

@ -222,6 +222,10 @@ class PDFiumEngine : public DocumentLoader::Client, public IFSDK_PAUSE {
void SetDocumentLayout(DocumentLayout::PageSpread page_spread);
void DisplayAnnotations(bool display);
// Returns the text contained on the page at `page_index` as a UTF-16 string.
// The caller is responsible for passing a valid `page_index`; the
// implementation CHECKs that the index is in bounds, so it is not a
// recoverable error to pass an invalid one.
std::u16string GetPageText(int page_index);
// Applies the document layout options proposed by a call to
// PDFiumEngineClient::ProposeDocumentLayout(), returning the overall size of
// the new effective layout.

@ -1097,6 +1097,18 @@ TEST_P(PDFiumEngineTest, DrawTextSelectionsBigtableMicro) {
*engine, /*page_index=*/0, "bigtable_micro_selection.png");
}
// Verifies that GetPageText() returns the expected text for page indices 0
// and 1 of hello_world2.pdf.
TEST_P(PDFiumEngineTest, GetPageText) {
  NiceMock<MockTestClient> client;
  std::unique_ptr<PDFiumEngine> engine =
      InitializeEngine(&client, FILE_PATH_LITERAL("hello_world2.pdf"));
  ASSERT_TRUE(engine);

  // The same text is expected for both page indices checked below.
  static constexpr char16_t kWantText[] = u"Hello, world!\r\nGoodbye, world!";
  for (int page_index = 0; page_index < 2; ++page_index) {
    EXPECT_EQ(kWantText, engine->GetPageText(page_index));
  }
}
TEST_P(PDFiumEngineTest, LinkNavigates) {
NiceMock<MockTestClient> client;
std::unique_ptr<PDFiumEngine> engine =