0

Fix text run calculation for PDFs starting with a Unicode space

For presenting the text model of a PDF for accessibility we break the
PDF content into text runs. While breaking the text into text runs we
always start with the first non-space (Unicode) character. This
assumes that PDF content will not have a preceding space. However, it
is not always the case. We can see that this particular file
http://africau.edu/images/default/sample.pdf starts with a space.

The logic to calculate text run further goes on to create a text run
with a bound which has not taken care of the preceding space character.
When this information is passed on to PdfAccessibilityTree we report
that the text run starts from the space character but we report text run
bounds which is short by a space character. When we start arranging
characters in PdfAccessibilityTree we run short by the width of space
character. As a result we see an offset from the actual position of
characters.

The fix is to add the bounding box of the preceding space character in
the bounds of text run.

Bug: 1076752

Change-Id: Ic9b06307daf5dc3681a42ae8176288f5b3012886
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2167749
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Ankit Kumar 🌪️ <ankk@microsoft.com>
Reviewed-by: Kevin Babbitt <kbabbitt@microsoft.com>
Commit-Queue: Virender Singh <virens@microsoft.com>
Cr-Commit-Position: refs/heads/master@{#769180}
This commit is contained in:
Virender Singh
2020-05-15 06:53:37 +00:00
committed by Commit Bot
parent 52734adf2e
commit c31c7baafe
4 changed files with 205 additions and 3 deletions

@ -472,6 +472,18 @@ PDFiumPage::GetTextRunInfo(int start_char_index) {
info.direction = PP_PRIVATEDIRECTION_NONE;
return info;
}
// If the first character in a text run is a space, we need to start
// |text_run_bounds| from the space character instead of the first
// non-space unicode character.
pp::FloatRect text_run_bounds =
actual_start_char_index > start_char_index
? GetFloatCharRectInPixels(page, text_page, start_char_index)
: pp::FloatRect();
// Pdfium trims more than 1 consecutive spaces to 1 space.
DCHECK_LE(actual_start_char_index - start_char_index, 1);
int char_index = actual_start_char_index;
// Set text run's style info from the first character of the text run.
@ -494,8 +506,8 @@ PDFiumPage::GetTextRunInfo(int start_char_index) {
AddCharSizeToAverageCharSize(start_char_rect.Floatsize(), &avg_char_size,
&non_whitespace_chars_count);
// Add first char to text run.
pp::FloatRect text_run_bounds = start_char_rect;
// Add first non-space char to text run.
text_run_bounds = text_run_bounds.Union(start_char_rect);
PP_PrivateDirection char_direction =
GetDirectionFromAngle(FPDFText_GetCharAngle(text_page, char_index));
if (char_index < chars_count)

@ -8,6 +8,8 @@
#include <vector>
#include "base/check.h"
#include "base/optional.h"
#include "base/strings/string_util.h"
#include "base/test/gtest_util.h"
#include "pdf/pdfium/pdfium_engine.h"
#include "pdf/pdfium/pdfium_test_base.h"
@ -234,6 +236,80 @@ TEST_F(PDFiumPageImageTest, TestImageAltText) {
using PDFiumPageTextTest = PDFiumTestBase;
TEST_F(PDFiumPageTextTest, TestTextRunBounds) {
TestClient client;
std::unique_ptr<PDFiumEngine> engine = InitializeEngine(
&client, FILE_PATH_LITERAL("leading_trailing_spaces_per_text_run.pdf"));
ASSERT_TRUE(engine);
constexpr int kFirstRunStartIndex = 0;
constexpr int kFirstRunEndIndex = 20;
constexpr int kPageIndex = 0;
base::Optional<pp::PDF::PrivateAccessibilityTextRunInfo> text_run_info_1 =
engine->GetTextRunInfo(kPageIndex, kFirstRunStartIndex);
ASSERT_TRUE(text_run_info_1.has_value());
const auto& actual_text_run_1 = text_run_info_1.value();
EXPECT_EQ(21u, actual_text_run_1.len);
EXPECT_TRUE(base::IsUnicodeWhitespace(
engine->GetCharUnicode(kPageIndex, kFirstRunStartIndex)));
pp::FloatRect text_run_bounds = actual_text_run_1.bounds;
EXPECT_TRUE(text_run_bounds.Contains(
engine->GetCharBounds(kPageIndex, kFirstRunStartIndex)));
// Last non-space character should fall in the bounding box of the text run.
// Text run looks like this:
// " Hello, world! \r\n "<17 characters><first Tj>
// " \r\n "<4 characters><second Tj>
// " "<1 character><third Tj starting spaces>
// Finally generated text run: " Hello, world! \r\n \r\n "
constexpr int kFirstRunLastNonSpaceCharIndex = 13;
EXPECT_FALSE(base::IsUnicodeWhitespace(
engine->GetCharUnicode(kPageIndex, kFirstRunLastNonSpaceCharIndex)));
EXPECT_TRUE(text_run_bounds.Contains(
engine->GetCharBounds(kPageIndex, kFirstRunLastNonSpaceCharIndex)));
EXPECT_TRUE(base::IsUnicodeWhitespace(
engine->GetCharUnicode(kPageIndex, kFirstRunEndIndex)));
pp::FloatRect end_char_rect =
engine->GetCharBounds(kPageIndex, kFirstRunEndIndex);
EXPECT_FALSE(text_run_bounds.Contains(end_char_rect));
// Equals to the length of the previous text run.
constexpr int kSecondRunStartIndex = 21;
constexpr int kSecondRunEndIndex = 36;
// Test the properties of second text run.
// Note: The leading spaces in second text run are accounted for in the end
// of first text run. Hence we won't see a space leading the second text run.
base::Optional<pp::PDF::PrivateAccessibilityTextRunInfo> text_run_info_2 =
engine->GetTextRunInfo(kPageIndex, kSecondRunStartIndex);
ASSERT_TRUE(text_run_info_2.has_value());
const auto& actual_text_run_2 = text_run_info_2.value();
EXPECT_EQ(16u, actual_text_run_2.len);
EXPECT_FALSE(base::IsUnicodeWhitespace(
engine->GetCharUnicode(kPageIndex, kSecondRunStartIndex)));
text_run_bounds = actual_text_run_2.bounds;
EXPECT_TRUE(text_run_bounds.Contains(
engine->GetCharBounds(kPageIndex, kSecondRunStartIndex)));
// Last non-space character should fall in the bounding box of the text run.
// Text run looks like this:
// "Goodbye, world! "<19 characters><first Tj>
// Finally generated text run: "Goodbye, world! "
constexpr int kSecondRunLastNonSpaceCharIndex = 35;
EXPECT_FALSE(base::IsUnicodeWhitespace(
engine->GetCharUnicode(kPageIndex, kSecondRunLastNonSpaceCharIndex)));
EXPECT_TRUE(text_run_bounds.Contains(
engine->GetCharBounds(kPageIndex, kSecondRunLastNonSpaceCharIndex)));
EXPECT_TRUE(base::IsUnicodeWhitespace(
engine->GetCharUnicode(kPageIndex, kSecondRunEndIndex)));
EXPECT_FALSE(text_run_bounds.Contains(
engine->GetCharBounds(kPageIndex, kSecondRunEndIndex)));
}
TEST_F(PDFiumPageTextTest, GetTextRunInfo) {
TestClient client;
std::unique_ptr<PDFiumEngine> engine =
@ -333,7 +409,7 @@ TEST_F(PDFiumPageTextTest, TestHighlightTextRunInfo) {
{7,
PP_MakeFloatRectFromXYWH(106.66666f, 198.66667f, 73.333336f, 18.666672f),
PP_PrivateDirection::PP_PRIVATEDIRECTION_LTR, kExpectedStyle},
{2, PP_MakeFloatRectFromXYWH(188.0f, 202.66667f, 9.333333f, 14.666667f),
{2, PP_MakeFloatRectFromXYWH(181.33333f, 192.0f, 16.0f, 25.333344f),
PP_PrivateDirection::PP_PRIVATEDIRECTION_NONE, kExpectedStyle},
{2,
PP_MakeFloatRectFromXYWH(198.66667f, 202.66667f, 21.333328f, 10.666672f),

@ -0,0 +1,51 @@
{{header}}
{{object 1 0}} <<
/Type /Catalog
/Pages 2 0 R
>>
endobj
{{object 2 0}} <<
/Type /Pages
/MediaBox [0 0 400 200]
/Count 1
/Kids [3 0 R]
>>
endobj
{{object 3 0}} <<
/Type /Page
/Parent 2 0 R
/Resources <<
/Font <<
/F1 4 0 R
>>
>>
/Contents 5 0 R
>>
endobj
{{object 4 0}} <<
/Type /Font
/Subtype /Type1
/BaseFont /Times-Roman
>>
endobj
{{object 5 0}} <<
{{streamlen}}
>>
stream
BT
20 50 Td
/F1 12 Tf
( Hello, world! ) Tj
20 50 Td
/F1 12 Tf
( ) Tj
0 50 Td
/F1 16 Tf
( Goodbye, world! ) Tj
ET
endstream
endobj
{{xref}}
{{trailer}}
{{startxref}}
%%EOF

@ -0,0 +1,63 @@
%PDF-1.7
%<25><><EFBFBD><EFBFBD>
1 0 obj <<
/Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj <<
/Type /Pages
/MediaBox [0 0 400 200]
/Count 1
/Kids [3 0 R]
>>
endobj
3 0 obj <<
/Type /Page
/Parent 2 0 R
/Resources <<
/Font <<
/F1 4 0 R
>>
>>
/Contents 5 0 R
>>
endobj
4 0 obj <<
/Type /Font
/Subtype /Type1
/BaseFont /Times-Roman
>>
endobj
5 0 obj <<
/Length 121
>>
stream
BT
20 50 Td
/F1 12 Tf
( Hello, world! ) Tj
20 50 Td
/F1 12 Tf
( ) Tj
0 50 Td
/F1 16 Tf
( Goodbye, world! ) Tj
ET
endstream
endobj
xref
0 6
0000000000 65535 f
0000000015 00000 n
0000000068 00000 n
0000000157 00000 n
0000000283 00000 n
0000000361 00000 n
trailer <<
/Root 1 0 R
/Size 6
>>
startxref
534
%%EOF