0

Fix PDFiumRange's handling of FPDFText_GetText() results.

Use the count returned by FPDFText_GetText() as the source of truth for
how many characters are in the output string, instead of trusting that it
is consistent with the FPDFText_CountChars() return value. In some cases,
the two values differ.

Bug: 1357385
Change-Id: I355a573c036e15f96972901cd562e3bcee1be74a
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3908491
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1050686}
This commit is contained in:
Lei Zhang
2022-09-23 16:37:37 +00:00
committed by Chromium LUCI CQ
parent f949c8de94
commit 161d189731
4 changed files with 146 additions and 3 deletions

@ -864,6 +864,18 @@ TEST_F(PDFiumEngineTest, SelectLinkAreaWithNoText) {
EXPECT_EQ(kExpectedText, engine->GetSelectedText());
}
// Loads bug_1357385.pdf, selects everything, and expects the selected text to
// be exactly "Hello, world!".
TEST_F(PDFiumEngineTest, SelectTextWithNonPrintableCharacter) {
  NiceMock<MockTestClient> mock_client;
  std::unique_ptr<PDFiumEngine> pdf_engine = InitializeEngine(
      &mock_client, FILE_PATH_LITERAL("bug_1357385.pdf"));
  // Bail out early: every check below dereferences the engine.
  ASSERT_TRUE(pdf_engine);

  // Before any selection is made, there must be no selected text.
  EXPECT_THAT(pdf_engine->GetSelectedText(), IsEmpty());

  // After selecting the whole document, only the expected text is reported.
  pdf_engine->SelectAll();
  EXPECT_EQ("Hello, world!", pdf_engine->GetSelectedText());
}
using PDFiumEngineDeathTest = PDFiumEngineTest;
TEST_F(PDFiumEngineDeathTest, RequestThumbnailRedundant) {

@ -8,6 +8,7 @@
#include "base/containers/cxx20_erase.h"
#include "base/strings/string_util.h"
#include "pdf/pdfium/pdfium_api_string_buffer_adapter.h"
#include "third_party/pdfium/public/fpdf_searchex.h"
#include "ui/gfx/geometry/point.h"
#include "ui/gfx/geometry/rect.h"
#include "ui/gfx/geometry/rect_f.h"
@ -105,20 +106,46 @@ std::u16string PDFiumRange::GetText() const {
AdjustForBackwardsRange(index, count);
if (count > 0) {
// Note that the `expected_size` value includes the NUL terminator.
//
// Cannot set `check_expected_size` to true here because the fix to
// https://crbug.com/pdfium/1139 made it such that FPDFText_GetText() is
// not always consistent with FPDFText_CountChars() and may trim characters.
//
// Instead, treat `count` as the requested count, but use the size of
// `result` as the source of truth for how many characters
// FPDFText_GetText() actually wrote out.
PDFiumAPIStringBufferAdapter<std::u16string> api_string_adapter(
&result, /*expected_size=*/count + 1, /*check_expected_size=*/true);
&result, /*expected_size=*/count + 1, /*check_expected_size=*/false);
unsigned short* data =
reinterpret_cast<unsigned short*>(api_string_adapter.GetData());
int written = FPDFText_GetText(page_->GetTextPage(), index, count, data);
// FPDFText_GetText() returns 0 on failure. Never negative value.
DCHECK_GE(written, 0);
api_string_adapter.Close(written);
const gfx::RectF page_bounds = page_->GetCroppedRect();
std::u16string in_bound_text;
in_bound_text.reserve(result.size());
for (int i = 0; i < count; ++i) {
// If FPDFText_GetText() trimmed off characters, figure out how many were
// trimmed from the front. Store the result in `index_offset`, so the
// IsCharInPageBounds() calls below can have the correct index.
CHECK_GE(static_cast<size_t>(count), result.size());
size_t trimmed_count = static_cast<size_t>(count) - result.size();
int index_offset = 0;
while (trimmed_count) {
if (FPDFText_GetTextIndexFromCharIndex(page_->GetTextPage(),
index + index_offset) >= 0) {
break;
}
--trimmed_count;
++index_offset;
}
for (size_t i = 0; i < result.size(); ++i) {
// Filter out characters outside the page bounds, which are semantically
// not part of the page.
if (page_->IsCharInPageBounds(index + i, page_bounds))
if (page_->IsCharInPageBounds(index + index_offset + i, page_bounds))
in_bound_text += result[i];
}
result = in_bound_text;

@ -0,0 +1,46 @@
{{header}}
{{object 1 0}} <<
/Type /Catalog
/Pages 2 0 R
>>
endobj
{{object 2 0}} <<
/Type /Pages
/MediaBox [0 0 200 200]
/Count 1
/Kids [3 0 R]
>>
endobj
{{object 3 0}} <<
/Type /Page
/Parent 2 0 R
/Resources <<
/Font <<
/F1 4 0 R
>>
>>
/Contents 5 0 R
>>
endobj
{{object 4 0}} <<
/Type /Font
/Subtype /Type1
/BaseFont /Times-Roman
>>
endobj
{{object 5 0}} <<
{{streamlen}}
>>
stream
BT
20 50 Td
/F1 12 Tf
% Modified in a way similar to PDFium's bug_1139.in.
(\003\003\003Hello, world!\003) Tj
ET
endstream
endobj
{{xref}}
{{trailer}}
{{startxref}}
%%EOF

@ -0,0 +1,58 @@
%PDF-1.7
%<25><><EFBFBD><EFBFBD>
1 0 obj <<
/Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj <<
/Type /Pages
/MediaBox [0 0 200 200]
/Count 1
/Kids [3 0 R]
>>
endobj
3 0 obj <<
/Type /Page
/Parent 2 0 R
/Resources <<
/Font <<
/F1 4 0 R
>>
>>
/Contents 5 0 R
>>
endobj
4 0 obj <<
/Type /Font
/Subtype /Type1
/BaseFont /Times-Roman
>>
endobj
5 0 obj <<
/Length 113
>>
stream
BT
20 50 Td
/F1 12 Tf
% Modified in a way similar to PDFium's bug_1139.in.
(\003\003\003Hello, world!\003) Tj
ET
endstream
endobj
xref
0 6
0000000000 65535 f
0000000015 00000 n
0000000068 00000 n
0000000157 00000 n
0000000283 00000 n
0000000361 00000 n
trailer <<
/Root 1 0 R
/Size 6
>>
startxref
526
%%EOF