Fix text run calculation for PDFs starting with a Unicode space

For presenting the text model of a PDF for accessibility we break the
PDF content into text runs. While breaking the text into text runs we
always start with the first non-space (Unicode) character. This
assumes that PDF content will not have a preceding space. However, it
is not always the case. We can see that this particular file
http://africau.edu/images/default/sample.pdf starts with a space.

The logic to calculate text run further goes on to create a text run
with a bound which has not taken care of the preceding space character.
When this information is passed on to PdfAccessibilityTree we report
that the text run starts from the space character but we report text run
bounds which is short by a space character. When we start arranging
characters in PdfAccessibilityTree we run short by the width of space
character. As a result we see an offset from the actual position of
characters.

The fix is to add the bounding box of the preceding space character in
the bounds of text run.

Bug: 1076752

Change-Id: Ic9b06307daf5dc3681a42ae8176288f5b3012886
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2167749
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Ankit Kumar 🌪️ <ankk@microsoft.com>
Reviewed-by: Kevin Babbitt <kbabbitt@microsoft.com>
Commit-Queue: Virender Singh <virens@microsoft.com>
Cr-Commit-Position: refs/heads/master@{#769180}

This commit is contained in:

Virender Singh

2020-05-15 06:53:37 +00:00

committed by

Commit Bot

parent 52734adf2e

commit c31c7baafe

4 changed files with 205 additions and 3 deletions

pdf

pdfium

pdfium_page.cc pdfium_page_unittest.cc

test

data

leading_trailing_spaces_per_text_run.in leading_trailing_spaces_per_text_run.pdf

									
										16

pdf/pdfium/pdfium_page.cc
									
				@ -472,6 +472,18 @@ PDFiumPage::GetTextRunInfo(int start_char_index) {

				    info.direction = PP_PRIVATEDIRECTION_NONE;

				    return info;

				  }

				  // If the first character in a text run is a space, we need to start

				  // |text_run_bounds| from the space character instead of the first

				  // non-space unicode character.

				  pp::FloatRect text_run_bounds =

				      actual_start_char_index > start_char_index

				          ? GetFloatCharRectInPixels(page, text_page, start_char_index)

				          : pp::FloatRect();

				  // Pdfium trims more than 1 consecutive spaces to 1 space.

				  DCHECK_LE(actual_start_char_index - start_char_index, 1);

				  int char_index = actual_start_char_index;

				  // Set text run's style info from the first character of the text run.

				@ -494,8 +506,8 @@ PDFiumPage::GetTextRunInfo(int start_char_index) {

				  AddCharSizeToAverageCharSize(start_char_rect.Floatsize(), &avg_char_size,

				                               &non_whitespace_chars_count);

				  // Add first char to text run.

				  pp::FloatRect text_run_bounds = start_char_rect;

				  // Add first non-space char to text run.

				  text_run_bounds = text_run_bounds.Union(start_char_rect);

				  PP_PrivateDirection char_direction =

				      GetDirectionFromAngle(FPDFText_GetCharAngle(text_page, char_index));

				  if (char_index < chars_count)

									
										78

pdf/pdfium/pdfium_page_unittest.cc
									
				@ -8,6 +8,8 @@

				#include <vector>

				#include "base/check.h"

				#include "base/optional.h"

				#include "base/strings/string_util.h"

				#include "base/test/gtest_util.h"

				#include "pdf/pdfium/pdfium_engine.h"

				#include "pdf/pdfium/pdfium_test_base.h"

				@ -234,6 +236,80 @@ TEST_F(PDFiumPageImageTest, TestImageAltText) {

				using PDFiumPageTextTest = PDFiumTestBase;

				TEST_F(PDFiumPageTextTest, TestTextRunBounds) {

				  TestClient client;

				  std::unique_ptr<PDFiumEngine> engine = InitializeEngine(

				      &client, FILE_PATH_LITERAL("leading_trailing_spaces_per_text_run.pdf"));

				  ASSERT_TRUE(engine);

				  constexpr int kFirstRunStartIndex = 0;

				  constexpr int kFirstRunEndIndex = 20;

				  constexpr int kPageIndex = 0;

				  base::Optional<pp::PDF::PrivateAccessibilityTextRunInfo> text_run_info_1 =

				      engine->GetTextRunInfo(kPageIndex, kFirstRunStartIndex);

				  ASSERT_TRUE(text_run_info_1.has_value());

				  const auto& actual_text_run_1 = text_run_info_1.value();

				  EXPECT_EQ(21u, actual_text_run_1.len);

				  EXPECT_TRUE(base::IsUnicodeWhitespace(

				      engine->GetCharUnicode(kPageIndex, kFirstRunStartIndex)));

				  pp::FloatRect text_run_bounds = actual_text_run_1.bounds;

				  EXPECT_TRUE(text_run_bounds.Contains(

				      engine->GetCharBounds(kPageIndex, kFirstRunStartIndex)));

				  // Last non-space character should fall in the bounding box of the text run.

				  // Text run looks like this:

				  // " Hello, world! \r\n "<17 characters><first Tj>

				  // " \r\n "<4 characters><second Tj>

				  // " "<1 character><third Tj starting spaces>

				  // Finally generated text run: " Hello, world! \r\n \r\n "

				  constexpr int kFirstRunLastNonSpaceCharIndex = 13;

				  EXPECT_FALSE(base::IsUnicodeWhitespace(

				      engine->GetCharUnicode(kPageIndex, kFirstRunLastNonSpaceCharIndex)));

				  EXPECT_TRUE(text_run_bounds.Contains(

				      engine->GetCharBounds(kPageIndex, kFirstRunLastNonSpaceCharIndex)));

				  EXPECT_TRUE(base::IsUnicodeWhitespace(

				      engine->GetCharUnicode(kPageIndex, kFirstRunEndIndex)));

				  pp::FloatRect end_char_rect =

				      engine->GetCharBounds(kPageIndex, kFirstRunEndIndex);

				  EXPECT_FALSE(text_run_bounds.Contains(end_char_rect));

				  // Equals to the length of the previous text run.

				  constexpr int kSecondRunStartIndex = 21;

				  constexpr int kSecondRunEndIndex = 36;

				  // Test the properties of second text run.

				  // Note: The leading spaces in second text run are accounted for in the end

				  // of first text run. Hence we won't see a space leading the second text run.

				  base::Optional<pp::PDF::PrivateAccessibilityTextRunInfo> text_run_info_2 =

				      engine->GetTextRunInfo(kPageIndex, kSecondRunStartIndex);

				  ASSERT_TRUE(text_run_info_2.has_value());

				  const auto& actual_text_run_2 = text_run_info_2.value();

				  EXPECT_EQ(16u, actual_text_run_2.len);

				  EXPECT_FALSE(base::IsUnicodeWhitespace(

				      engine->GetCharUnicode(kPageIndex, kSecondRunStartIndex)));

				  text_run_bounds = actual_text_run_2.bounds;

				  EXPECT_TRUE(text_run_bounds.Contains(

				      engine->GetCharBounds(kPageIndex, kSecondRunStartIndex)));

				  // Last non-space character should fall in the bounding box of the text run.

				  // Text run looks like this:

				  // "Goodbye, world! "<19 characters><first Tj>

				  // Finally generated text run: "Goodbye, world! "

				  constexpr int kSecondRunLastNonSpaceCharIndex = 35;

				  EXPECT_FALSE(base::IsUnicodeWhitespace(

				      engine->GetCharUnicode(kPageIndex, kSecondRunLastNonSpaceCharIndex)));

				  EXPECT_TRUE(text_run_bounds.Contains(

				      engine->GetCharBounds(kPageIndex, kSecondRunLastNonSpaceCharIndex)));

				  EXPECT_TRUE(base::IsUnicodeWhitespace(

				      engine->GetCharUnicode(kPageIndex, kSecondRunEndIndex)));

				  EXPECT_FALSE(text_run_bounds.Contains(

				      engine->GetCharBounds(kPageIndex, kSecondRunEndIndex)));

				}

				TEST_F(PDFiumPageTextTest, GetTextRunInfo) {

				  TestClient client;

				  std::unique_ptr<PDFiumEngine> engine =

				@ -333,7 +409,7 @@ TEST_F(PDFiumPageTextTest, TestHighlightTextRunInfo) {

				      {7,

				       PP_MakeFloatRectFromXYWH(106.66666f, 198.66667f, 73.333336f, 18.666672f),

				       PP_PrivateDirection::PP_PRIVATEDIRECTION_LTR, kExpectedStyle},

				      {2, PP_MakeFloatRectFromXYWH(188.0f, 202.66667f, 9.333333f, 14.666667f),

				      {2, PP_MakeFloatRectFromXYWH(181.33333f, 192.0f, 16.0f, 25.333344f),

				       PP_PrivateDirection::PP_PRIVATEDIRECTION_NONE, kExpectedStyle},

				      {2,

				       PP_MakeFloatRectFromXYWH(198.66667f, 202.66667f, 21.333328f, 10.666672f),

51

pdf/test/data/leading_trailing_spaces_per_text_run.in Normal file

 @ -0,0 +1,51 @@
 {{header}}
 {{object 1 0}} <<
   /Type /Catalog
   /Pages 2 0 R
 >>
 endobj
 {{object 2 0}} <<
   /Type /Pages
   /MediaBox [0 0 400 200]
   /Count 1
   /Kids [3 0 R]
 >>
 endobj
 {{object 3 0}} <<
   /Type /Page
   /Parent 2 0 R
   /Resources <<
     /Font <<
       /F1 4 0 R
     >>
   >>
   /Contents 5 0 R
 >>
 endobj
 {{object 4 0}} <<
   /Type /Font
   /Subtype /Type1
   /BaseFont /Times-Roman
 >>
 endobj
 {{object 5 0}} <<
   {{streamlen}}
 >>
 stream
 BT
 50 Td
 /F1 12 Tf
 (  Hello, world!  ) Tj
 50 Td
 /F1 12 Tf
 (   ) Tj
 50 Td
 /F1 16 Tf
 (  Goodbye,   world!  ) Tj
 ET
 endstream
 endobj
 {{xref}}
 {{trailer}}
 {{startxref}}
 %%EOF

63

pdf/test/data/leading_trailing_spaces_per_text_run.pdf Normal file

 @ -0,0 +1,63 @@
 %PDF-1.7
 %<25><><EFBFBD><EFBFBD>
 0 obj <<
   /Type /Catalog
   /Pages 2 0 R
 >>
 endobj
 0 obj <<
   /Type /Pages
   /MediaBox [0 0 400 200]
   /Count 1
   /Kids [3 0 R]
 >>
 endobj
 0 obj <<
   /Type /Page
   /Parent 2 0 R
   /Resources <<
     /Font <<
       /F1 4 0 R
     >>
   >>
   /Contents 5 0 R
 >>
 endobj
 0 obj <<
   /Type /Font
   /Subtype /Type1
   /BaseFont /Times-Roman
 >>
 endobj
 0 obj <<
   /Length 121
 >>
 stream
 BT
 50 Td
 /F1 12 Tf
 (  Hello, world!  ) Tj
 50 Td
 /F1 12 Tf
 (   ) Tj
 50 Td
 /F1 16 Tf
 (  Goodbye,   world!  ) Tj
 ET
 endstream
 endobj
 xref
 6
 0000000000 65535 f
 0000000015 00000 n
 0000000068 00000 n
 0000000157 00000 n
 0000000283 00000 n
 0000000361 00000 n
 trailer <<
   /Root 1 0 R
   /Size 6
 >>
 startxref
 
 %%EOF

Fix text run calculation for PDFs starting with a Unicode space

16 pdf/pdfium/pdfium_page.cc

78 pdf/pdfium/pdfium_page_unittest.cc

51 pdf/test/data/leading_trailing_spaces_per_text_run.in Normal file

63 pdf/test/data/leading_trailing_spaces_per_text_run.pdf Normal file

16

pdf/pdfium/pdfium_page.cc

78

pdf/pdfium/pdfium_page_unittest.cc

51

pdf/test/data/leading_trailing_spaces_per_text_run.in Normal file

63

pdf/test/data/leading_trailing_spaces_per_text_run.pdf Normal file