0

Add PDF engine method to get all text in the document

This will be used by the Lens Overlay to pass a fraction of the document
to be used for suggest signals while the full PDF is being processed.
Retrieving the text in the Lens Overlay controller will come in a
follow-up CL.

Bug: 379344946
Change-Id: Ie52df82022916a3bb367150207d1b70e03fbce8a
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6032353
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Duncan Mercer <mercerd@google.com>
Cr-Commit-Position: refs/heads/main@{#1385831}
This commit is contained in:
Duncan Mercer
2024-11-20 19:50:44 +00:00
committed by Chromium LUCI CQ
parent 738dff33b7
commit 8d44e4899d
3 changed files with 49 additions and 0 deletions

@ -2248,6 +2248,24 @@ void PDFiumEngine::DisplayAnnotations(bool display) {
InvalidateAllPages();
}
// Returns the text of the document, built by appending the text of each entry
// in `pages_` in turn with no separator between pages. The result never holds
// more than `size_limit` elements; anything beyond that is dropped.
//
// NOTE(review): the cap counts UTF-16 code units, so a surrogate pair that
// straddles the limit would be split - confirm callers tolerate this.
std::u16string PDFiumEngine::GetAllText(uint32_t size_limit) {
  std::u16string all_pages_text;
  for (const auto& page : pages_) {
    // Check the limit before touching the page so that no text is extracted
    // only to be thrown away (covers `size_limit == 0` and the case where the
    // previous page filled the output exactly).
    if (all_pages_text.size() >= size_limit) {
      break;
    }

    auto range = PDFiumRange::AllTextOnPage(page.get());
    const std::u16string page_text = range.GetText();

    // Append at most the remaining allowance. The count is clamped to the
    // length of `page_text`, and the guard above guarantees the subtraction
    // cannot underflow.
    all_pages_text.append(page_text, 0, size_limit - all_pages_text.size());
  }
  return all_pages_text;
}
void PDFiumEngine::InvalidateAllPages() {
CancelPaints();
StopFind();

@ -222,6 +222,11 @@ class PDFiumEngine : public DocumentLoader::Client, public IFSDK_PAUSE {
void SetDocumentLayout(DocumentLayout::PageSpread page_spread);
void DisplayAnnotations(bool display);
// Returns the text contained in the PDF, built by appending the text of each
// page in turn with no separator inserted between pages. If the size of the
// text is larger than `size_limit`, the remaining text will be truncated, and
// a string with the first `size_limit` characters will be returned. The limit
// is measured in elements of the returned `std::u16string`, i.e. UTF-16 code
// units.
std::u16string GetAllText(uint32_t size_limit);
// Applies the document layout options proposed by a call to
// PDFiumEngineClient::ProposeDocumentLayout(), returning the overall size of
// the new effective layout.

@ -1097,6 +1097,32 @@ TEST_P(PDFiumEngineTest, DrawTextSelectionsBigtableMicro) {
*engine, /*page_index=*/0, "bigtable_micro_selection.png");
}
// Checks that GetAllText() returns the complete document text when the limit
// (100) is larger than the 60 code units the test document contains.
TEST_P(PDFiumEngineTest, GetAllText) {
  constexpr char16_t kWholeDocumentText[] =
      u"Hello, world!\r\nGoodbye, world!Hello, world!\r\nGoodbye, world!";

  NiceMock<MockTestClient> mock_client;
  std::unique_ptr<PDFiumEngine> pdf_engine =
      InitializeEngine(&mock_client, FILE_PATH_LITERAL("hello_world2.pdf"));
  ASSERT_TRUE(pdf_engine);

  const std::u16string extracted_text =
      pdf_engine->GetAllText(/*size_limit=*/100);
  EXPECT_EQ(kWholeDocumentText, extracted_text);
}
// Checks that GetAllText() cuts its result off at `size_limit`: with a limit
// of 30 only the first 30 code units of the document text come back.
TEST_P(PDFiumEngineTest, GetAllTextSizeLimit) {
  constexpr char16_t kTruncatedText[] = u"Hello, world!\r\nGoodbye, world!";

  NiceMock<MockTestClient> mock_client;
  std::unique_ptr<PDFiumEngine> pdf_engine =
      InitializeEngine(&mock_client, FILE_PATH_LITERAL("hello_world2.pdf"));
  ASSERT_TRUE(pdf_engine);

  const std::u16string extracted_text =
      pdf_engine->GetAllText(/*size_limit=*/30);

  // Bail out on a length mismatch before comparing the contents.
  ASSERT_EQ(30u, extracted_text.size());
  EXPECT_EQ(kTruncatedText, extracted_text);
}
TEST_P(PDFiumEngineTest, LinkNavigates) {
NiceMock<MockTestClient> client;
std::unique_ptr<PDFiumEngine> engine =