Fix text run calculation for PDFs starting with a Unicode space
For presenting the text model of a PDF for accessibility we break the PDF content into text runs. While breaking the text into text runs we always start with the first non-space (Unicode) character. This assumes that PDF content will not have a preceding space. However, it is not always the case. We can see that this particular file http://africau.edu/images/default/sample.pdf starts with a space. The logic to calculate text run further goes on to create a text run with a bound which has not taken care of the preceding space character. When this information is passed on to PdfAccessibilityTree we report that the text run starts from the space character but we report text run bounds which is short by a space character. When we start arranging characters in PdfAccessibilityTree we run short by the width of space character. As a result we see an offset from the actual position of characters. The fix is to add the bounding box of the preceding space character in the bounds of text run. Bug: 1076752 Change-Id: Ic9b06307daf5dc3681a42ae8176288f5b3012886 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2167749 Reviewed-by: Lei Zhang <thestig@chromium.org> Reviewed-by: Ankit Kumar 🌪️ <ankk@microsoft.com> Reviewed-by: Kevin Babbitt <kbabbitt@microsoft.com> Commit-Queue: Virender Singh <virens@microsoft.com> Cr-Commit-Position: refs/heads/master@{#769180}
This commit is contained in:

committed by
Commit Bot

parent
52734adf2e
commit
c31c7baafe
@ -472,6 +472,18 @@ PDFiumPage::GetTextRunInfo(int start_char_index) {
|
||||
info.direction = PP_PRIVATEDIRECTION_NONE;
|
||||
return info;
|
||||
}
|
||||
|
||||
// If the first character in a text run is a space, we need to start
|
||||
// |text_run_bounds| from the space character instead of the first
|
||||
// non-space unicode character.
|
||||
pp::FloatRect text_run_bounds =
|
||||
actual_start_char_index > start_char_index
|
||||
? GetFloatCharRectInPixels(page, text_page, start_char_index)
|
||||
: pp::FloatRect();
|
||||
|
||||
// Pdfium trims more than 1 consecutive spaces to 1 space.
|
||||
DCHECK_LE(actual_start_char_index - start_char_index, 1);
|
||||
|
||||
int char_index = actual_start_char_index;
|
||||
|
||||
// Set text run's style info from the first character of the text run.
|
||||
@ -494,8 +506,8 @@ PDFiumPage::GetTextRunInfo(int start_char_index) {
|
||||
AddCharSizeToAverageCharSize(start_char_rect.Floatsize(), &avg_char_size,
|
||||
&non_whitespace_chars_count);
|
||||
|
||||
// Add first char to text run.
|
||||
pp::FloatRect text_run_bounds = start_char_rect;
|
||||
// Add first non-space char to text run.
|
||||
text_run_bounds = text_run_bounds.Union(start_char_rect);
|
||||
PP_PrivateDirection char_direction =
|
||||
GetDirectionFromAngle(FPDFText_GetCharAngle(text_page, char_index));
|
||||
if (char_index < chars_count)
|
||||
|
@ -8,6 +8,8 @@
|
||||
#include <vector>
|
||||
|
||||
#include "base/check.h"
|
||||
#include "base/optional.h"
|
||||
#include "base/strings/string_util.h"
|
||||
#include "base/test/gtest_util.h"
|
||||
#include "pdf/pdfium/pdfium_engine.h"
|
||||
#include "pdf/pdfium/pdfium_test_base.h"
|
||||
@ -234,6 +236,80 @@ TEST_F(PDFiumPageImageTest, TestImageAltText) {
|
||||
|
||||
using PDFiumPageTextTest = PDFiumTestBase;
|
||||
|
||||
TEST_F(PDFiumPageTextTest, TestTextRunBounds) {
|
||||
TestClient client;
|
||||
std::unique_ptr<PDFiumEngine> engine = InitializeEngine(
|
||||
&client, FILE_PATH_LITERAL("leading_trailing_spaces_per_text_run.pdf"));
|
||||
ASSERT_TRUE(engine);
|
||||
|
||||
constexpr int kFirstRunStartIndex = 0;
|
||||
constexpr int kFirstRunEndIndex = 20;
|
||||
constexpr int kPageIndex = 0;
|
||||
base::Optional<pp::PDF::PrivateAccessibilityTextRunInfo> text_run_info_1 =
|
||||
engine->GetTextRunInfo(kPageIndex, kFirstRunStartIndex);
|
||||
ASSERT_TRUE(text_run_info_1.has_value());
|
||||
|
||||
const auto& actual_text_run_1 = text_run_info_1.value();
|
||||
EXPECT_EQ(21u, actual_text_run_1.len);
|
||||
|
||||
EXPECT_TRUE(base::IsUnicodeWhitespace(
|
||||
engine->GetCharUnicode(kPageIndex, kFirstRunStartIndex)));
|
||||
pp::FloatRect text_run_bounds = actual_text_run_1.bounds;
|
||||
EXPECT_TRUE(text_run_bounds.Contains(
|
||||
engine->GetCharBounds(kPageIndex, kFirstRunStartIndex)));
|
||||
|
||||
// Last non-space character should fall in the bounding box of the text run.
|
||||
// Text run looks like this:
|
||||
// " Hello, world! \r\n "<17 characters><first Tj>
|
||||
// " \r\n "<4 characters><second Tj>
|
||||
// " "<1 character><third Tj starting spaces>
|
||||
// Finally generated text run: " Hello, world! \r\n \r\n "
|
||||
constexpr int kFirstRunLastNonSpaceCharIndex = 13;
|
||||
EXPECT_FALSE(base::IsUnicodeWhitespace(
|
||||
engine->GetCharUnicode(kPageIndex, kFirstRunLastNonSpaceCharIndex)));
|
||||
EXPECT_TRUE(text_run_bounds.Contains(
|
||||
engine->GetCharBounds(kPageIndex, kFirstRunLastNonSpaceCharIndex)));
|
||||
|
||||
EXPECT_TRUE(base::IsUnicodeWhitespace(
|
||||
engine->GetCharUnicode(kPageIndex, kFirstRunEndIndex)));
|
||||
pp::FloatRect end_char_rect =
|
||||
engine->GetCharBounds(kPageIndex, kFirstRunEndIndex);
|
||||
EXPECT_FALSE(text_run_bounds.Contains(end_char_rect));
|
||||
// Equals to the length of the previous text run.
|
||||
constexpr int kSecondRunStartIndex = 21;
|
||||
constexpr int kSecondRunEndIndex = 36;
|
||||
// Test the properties of second text run.
|
||||
// Note: The leading spaces in second text run are accounted for in the end
|
||||
// of first text run. Hence we won't see a space leading the second text run.
|
||||
base::Optional<pp::PDF::PrivateAccessibilityTextRunInfo> text_run_info_2 =
|
||||
engine->GetTextRunInfo(kPageIndex, kSecondRunStartIndex);
|
||||
ASSERT_TRUE(text_run_info_2.has_value());
|
||||
|
||||
const auto& actual_text_run_2 = text_run_info_2.value();
|
||||
EXPECT_EQ(16u, actual_text_run_2.len);
|
||||
|
||||
EXPECT_FALSE(base::IsUnicodeWhitespace(
|
||||
engine->GetCharUnicode(kPageIndex, kSecondRunStartIndex)));
|
||||
text_run_bounds = actual_text_run_2.bounds;
|
||||
EXPECT_TRUE(text_run_bounds.Contains(
|
||||
engine->GetCharBounds(kPageIndex, kSecondRunStartIndex)));
|
||||
|
||||
// Last non-space character should fall in the bounding box of the text run.
|
||||
// Text run looks like this:
|
||||
// "Goodbye, world! "<19 characters><first Tj>
|
||||
// Finally generated text run: "Goodbye, world! "
|
||||
constexpr int kSecondRunLastNonSpaceCharIndex = 35;
|
||||
EXPECT_FALSE(base::IsUnicodeWhitespace(
|
||||
engine->GetCharUnicode(kPageIndex, kSecondRunLastNonSpaceCharIndex)));
|
||||
EXPECT_TRUE(text_run_bounds.Contains(
|
||||
engine->GetCharBounds(kPageIndex, kSecondRunLastNonSpaceCharIndex)));
|
||||
|
||||
EXPECT_TRUE(base::IsUnicodeWhitespace(
|
||||
engine->GetCharUnicode(kPageIndex, kSecondRunEndIndex)));
|
||||
EXPECT_FALSE(text_run_bounds.Contains(
|
||||
engine->GetCharBounds(kPageIndex, kSecondRunEndIndex)));
|
||||
}
|
||||
|
||||
TEST_F(PDFiumPageTextTest, GetTextRunInfo) {
|
||||
TestClient client;
|
||||
std::unique_ptr<PDFiumEngine> engine =
|
||||
@ -333,7 +409,7 @@ TEST_F(PDFiumPageTextTest, TestHighlightTextRunInfo) {
|
||||
{7,
|
||||
PP_MakeFloatRectFromXYWH(106.66666f, 198.66667f, 73.333336f, 18.666672f),
|
||||
PP_PrivateDirection::PP_PRIVATEDIRECTION_LTR, kExpectedStyle},
|
||||
{2, PP_MakeFloatRectFromXYWH(188.0f, 202.66667f, 9.333333f, 14.666667f),
|
||||
{2, PP_MakeFloatRectFromXYWH(181.33333f, 192.0f, 16.0f, 25.333344f),
|
||||
PP_PrivateDirection::PP_PRIVATEDIRECTION_NONE, kExpectedStyle},
|
||||
{2,
|
||||
PP_MakeFloatRectFromXYWH(198.66667f, 202.66667f, 21.333328f, 10.666672f),
|
||||
|
51
pdf/test/data/leading_trailing_spaces_per_text_run.in
Normal file
51
pdf/test/data/leading_trailing_spaces_per_text_run.in
Normal file
@ -0,0 +1,51 @@
|
||||
{{header}}
|
||||
{{object 1 0}} <<
|
||||
/Type /Catalog
|
||||
/Pages 2 0 R
|
||||
>>
|
||||
endobj
|
||||
{{object 2 0}} <<
|
||||
/Type /Pages
|
||||
/MediaBox [0 0 400 200]
|
||||
/Count 1
|
||||
/Kids [3 0 R]
|
||||
>>
|
||||
endobj
|
||||
{{object 3 0}} <<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Resources <<
|
||||
/Font <<
|
||||
/F1 4 0 R
|
||||
>>
|
||||
>>
|
||||
/Contents 5 0 R
|
||||
>>
|
||||
endobj
|
||||
{{object 4 0}} <<
|
||||
/Type /Font
|
||||
/Subtype /Type1
|
||||
/BaseFont /Times-Roman
|
||||
>>
|
||||
endobj
|
||||
{{object 5 0}} <<
|
||||
{{streamlen}}
|
||||
>>
|
||||
stream
|
||||
BT
|
||||
20 50 Td
|
||||
/F1 12 Tf
|
||||
( Hello, world! ) Tj
|
||||
20 50 Td
|
||||
/F1 12 Tf
|
||||
( ) Tj
|
||||
0 50 Td
|
||||
/F1 16 Tf
|
||||
( Goodbye, world! ) Tj
|
||||
ET
|
||||
endstream
|
||||
endobj
|
||||
{{xref}}
|
||||
{{trailer}}
|
||||
{{startxref}}
|
||||
%%EOF
|
63
pdf/test/data/leading_trailing_spaces_per_text_run.pdf
Normal file
63
pdf/test/data/leading_trailing_spaces_per_text_run.pdf
Normal file
@ -0,0 +1,63 @@
|
||||
%PDF-1.7
|
||||
%<25><><EFBFBD><EFBFBD>
|
||||
1 0 obj <<
|
||||
/Type /Catalog
|
||||
/Pages 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj <<
|
||||
/Type /Pages
|
||||
/MediaBox [0 0 400 200]
|
||||
/Count 1
|
||||
/Kids [3 0 R]
|
||||
>>
|
||||
endobj
|
||||
3 0 obj <<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/Resources <<
|
||||
/Font <<
|
||||
/F1 4 0 R
|
||||
>>
|
||||
>>
|
||||
/Contents 5 0 R
|
||||
>>
|
||||
endobj
|
||||
4 0 obj <<
|
||||
/Type /Font
|
||||
/Subtype /Type1
|
||||
/BaseFont /Times-Roman
|
||||
>>
|
||||
endobj
|
||||
5 0 obj <<
|
||||
/Length 121
|
||||
>>
|
||||
stream
|
||||
BT
|
||||
20 50 Td
|
||||
/F1 12 Tf
|
||||
( Hello, world! ) Tj
|
||||
20 50 Td
|
||||
/F1 12 Tf
|
||||
( ) Tj
|
||||
0 50 Td
|
||||
/F1 16 Tf
|
||||
( Goodbye, world! ) Tj
|
||||
ET
|
||||
endstream
|
||||
endobj
|
||||
xref
|
||||
0 6
|
||||
0000000000 65535 f
|
||||
0000000015 00000 n
|
||||
0000000068 00000 n
|
||||
0000000157 00000 n
|
||||
0000000283 00000 n
|
||||
0000000361 00000 n
|
||||
trailer <<
|
||||
/Root 1 0 R
|
||||
/Size 6
|
||||
>>
|
||||
startxref
|
||||
534
|
||||
%%EOF
|
Reference in New Issue
Block a user