0

PDF a11y: Use style info in the text-runs break heuristic

The text-runs break heuristic now considers the text style.
With this change, a text run will break if it encounters a difference
in the following text style properties:
* font-family
* font-weight
* rendering mode
* font-size
* fill color
* stroke color
* italic flag
* bold flag

Before this change, it used to break only if a font-size difference was
observed.

I added an "ultimate" dump test that tests for all of those properties.

Bug: 985604
Change-Id: Ice4b4a2dcdd8cc9642ca0e9d069a415c5b2700fd
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1771987
Commit-Queue: Benjamin Beaudry <benjamin.beaudry@microsoft.com>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Ian Prest <iapres@microsoft.com>
Cr-Commit-Position: refs/heads/master@{#710522}
This commit is contained in:
Benjamin Beaudry
2019-10-30 00:09:08 +00:00
committed by Commit Bot
parent f6f8dbb311
commit 8feb637ac1
12 changed files with 383 additions and 8 deletions

@ -2617,3 +2617,8 @@ IN_PROC_BROWSER_TEST_P(PDFExtensionAccessibilityTreeDumpTest,
LinksImagesAndText) {
RunPDFTest(FILE_PATH_LITERAL("text-image-link.pdf"));
}
// Accessibility tree dump test for text-run-style-heuristic.pdf. That fixture
// draws one sentence in which consecutive words differ in font, font size,
// font weight, italic flag, render mode, or fill/stroke color, so the text-run
// break heuristic is expected to split the sentence on those style changes.
// NOTE(review): RunPDFTest() is defined elsewhere; presumably it compares the
// dumped tree against the per-platform expectation files -- confirm.
IN_PROC_BROWSER_TEST_P(PDFExtensionAccessibilityTreeDumpTest,
TextRunStyleHeuristic) {
RunPDFTest(FILE_PATH_LITERAL("text-run-style-heuristic.pdf"));
}

@ -0,0 +1,15 @@
embeddedObject fontFamily='Roboto' color=-16777216 fontSize=13.00 fontWeight=400.00
++document fontFamily='Roboto' restriction=readOnly
++++region fontFamily='Roboto' name='Page 1' restriction=readOnly isPageBreakingObject=true
++++++paragraph fontFamily='Roboto' restriction=readOnly
++++++++staticText fontFamily='Roboto' name='Case 1<newline>' restriction=readOnly
++++++++++inlineTextBox fontFamily='Helvetica-Bold-Italic' name='Case 1<newline>' restriction=readOnly color=-65536 fontSize=38.00 fontWeight=600.00
++++++paragraph fontFamily='Roboto' restriction=readOnly
++++++++staticText fontFamily='Roboto' name='Case 2<newline>' restriction=readOnly
++++++++++inlineTextBox fontFamily='Helvetica-Bold-Italic' name='Case 2<newline>' restriction=readOnly color=-16711936 fontSize=38.00 fontWeight=600.00
++++++paragraph fontFamily='Roboto' restriction=readOnly
++++++++staticText fontFamily='Roboto' name='Case 3<newline>' restriction=readOnly
++++++++++inlineTextBox fontFamily='Helvetica-Bold-Italic' name='Case 3<newline>' restriction=readOnly color=-65536 fontSize=38.00 fontWeight=600.00
++++++paragraph fontFamily='Roboto' restriction=readOnly
++++++++staticText fontFamily='Roboto' name='Case 4' restriction=readOnly
++++++++++inlineTextBox fontFamily='Helvetica-Bold-Italic' name='Case 4' restriction=readOnly fontSize=38.00 fontWeight=600.00

@ -0,0 +1,5 @@
[embedded component]
++[document frame]
++++[landmark] name='Page 1'
++++++[paragraph]
++++++++[text] name='Hello world! One word equals to one text run.'

@ -0,0 +1,14 @@
embeddedObject
++document restriction=readOnly
++++region name='Page 1' restriction=readOnly isPageBreakingObject=true
++++++paragraph restriction=readOnly
++++++++staticText name='Hello world! One word equals to one text run.' restriction=readOnly
++++++++++inlineTextBox name='Hello ' restriction=readOnly
++++++++++inlineTextBox name='world! ' restriction=readOnly
++++++++++inlineTextBox name='One ' restriction=readOnly
++++++++++inlineTextBox name='word ' restriction=readOnly
++++++++++inlineTextBox name='equals ' restriction=readOnly
++++++++++inlineTextBox name='to ' restriction=readOnly
++++++++++inlineTextBox name='one ' restriction=readOnly
++++++++++inlineTextBox name='text ' restriction=readOnly
++++++++++inlineTextBox name='run.' restriction=readOnly

@ -0,0 +1,5 @@
AXGroup AXDescription='Page 1'
++AXGroup
++++AXGroup AXDescription='Page 1'
++++++AXGroup
++++++++AXStaticText AXValue='Hello world! One word equals to one text run.'

@ -0,0 +1,5 @@
group
++document
++++region Name='Page 1'
++++++group
++++++++description Name='Hello world! One word equals to one text run.'

@ -0,0 +1,5 @@
ROLE_SYSTEM_GROUPING FOCUSABLE
++ROLE_SYSTEM_DOCUMENT READONLY FOCUSABLE
++++IA2_ROLE_LANDMARK name='Page 1' READONLY
++++++IA2_ROLE_PARAGRAPH READONLY
++++++++ROLE_SYSTEM_STATICTEXT name='Hello world! One word equals to one text run.' READONLY

@ -0,0 +1,143 @@
{{header}}
{{object 1 0}} <<
/Type /Catalog
/Pages 2 0 R
>>
endobj
{{object 2 0}} <<
/Type /Pages
/MediaBox [ 0 0 200 200 ]
/Count 1
/Kids [ 3 0 R ]
>>
endobj
{{object 3 0}} <<
/Type /Page
/Parent 2 0 R
/Resources <<
/Font <<
/F1 4 0 R %Basic font. Times-Roman, no style.
/F2 5 0 R %Basic font. Helvetica, no style.
/F3 6 0 R %F2 + font-weight (/StemV and /FontWeight)
/F4 8 0 R %F3 + italic flag
>>
>>
/Contents 10 0 R
>>
endobj
{{object 4 0}} <<
/Type /Font
/Subtype /Type1
/BaseFont /Times-Roman
>>
endobj
{{object 5 0}} <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
endobj
{{object 6 0}} <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica-Bold
/FontDescriptor 7 0 R
>>
endobj
{{object 7 0}} <<
/Type /FontDescriptor
/FontName /Helvetica-Bold
/FontWeight 600
/StemV 80
/ItalicAngle 0
/Ascent 776
/Flags 0
/FontBBox [-250 -236 2827 1000]
/CapHeight 763
/Descent -223
>>
endobj
{{object 8 0}} <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica-Bold-Italic
/FontDescriptor 9 0 R
>>
endobj
{{object 9 0}} <<
/Type /FontDescriptor
/FontName /Helvetica-Bold-Italic
/FontWeight 600
/StemV 80
/ItalicAngle 45
/Ascent 776
/Flags 64
/FontBBox [-250 -236 2827 1000]
/CapHeight 763
/Descent -223
>>
endobj
{{object 10 0}} <<
{{streamlen}}
>>
stream
BT
0 0 Td
/F1 9 Tf
1 0 0 1 2 30 Tm
(Hello) Tj
0 0 Td
/F2 9 Tf
1 0 0 1 22 30 Tm
( world!) Tj
0 0 Td
/F2 8 Tf
1 0 0 1 48 30 Tm
( One) Tj
0 0 Td
/F3 8 Tf
1 0 0 1 65 30 Tm
( word) Tj
0 0 Td
/F4 8 Tf
1 0 0 1 85 30 Tm
( equals) Tj
0 0 Td
/F4 8 Tf
1 0 0 1 121 30 Tm
3 Tr
( to) Tj
0 0 Td
/F4 8 Tf
1 0 0 1 137 30 Tm
1 Tr
( one) Tj
0 0 Td
/F4 8 Tf
1 0 0 1 154 30 Tm
1 0 0 rg
0 1 0 RG
2 Tr
( text) Tj
0 0 Td
/F4 8 Tf
1 0 0 1 170 30 Tm
0 0 1 rg
1 0 0 RG
1 Tr
( run.) Tj
ET
endstream
endobj
{{xref}}
{{trailer}}
{{startxref}}
%%EOF

@ -0,0 +1,160 @@
%PDF-1.7
%<25><><EFBFBD><EFBFBD>
1 0 obj <<
/Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj <<
/Type /Pages
/MediaBox [ 0 0 200 200 ]
/Count 1
/Kids [ 3 0 R ]
>>
endobj
3 0 obj <<
/Type /Page
/Parent 2 0 R
/Resources <<
/Font <<
/F1 4 0 R %Basic font. Times-Roman, no style.
/F2 5 0 R %Basic font. Helvetica, no style.
/F3 6 0 R %F2 + font-weight (/stemV and /Weight)
/F4 8 0 R %F3 + italic flag
>>
>>
/Contents 10 0 R
>>
endobj
4 0 obj <<
/Type /Font
/Subtype /Type1
/BaseFont /Times-Roman
>>
endobj
5 0 obj <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
endobj
6 0 obj <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica-Bold
/FontDescriptor 7 0 R
>>
endobj
7 0 obj <<
/Type /FontDescriptor
/FontName /Helvetica-Bold
/FontWeight 600
/StemV 80
/ItalicAngle 0
/Ascent 776
/Flags 0
/FontBBox [-250 -236 2827 1000]
/CapHeight 763
/Descent -223
>>
endobj
8 0 obj <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica-Bold-Italic
/FontDescriptor 9 0 R
>>
endobj
9 0 obj <<
/Type /FontDescriptor
/FontName /Helvetica-Bold-Italic
/FontWeight 600
/StemV 80
/ItalicAngle 45
/Ascent 776
/Flags 64
/FontBBox [-250 -236 2827 1000]
/CapHeight 763
/Descent -223
>>
endobj
10 0 obj <<
/Length 469
>>
stream
BT
0 0 Td
/F1 9 Tf
1 0 0 1 2 30 Tm
(Hello) Tj
0 0 Td
/F2 9 Tf
1 0 0 1 22 30 Tm
( world!) Tj
0 0 Td
/F2 8 Tf
1 0 0 1 48 30 Tm
( One) Tj
0 0 Td
/F3 8 Tf
1 0 0 1 65 30 Tm
( word) Tj
0 0 Td
/F4 8 Tf
1 0 0 1 85 30 Tm
( equals) Tj
0 0 Td
/F4 8 Tf
1 0 0 1 121 30 Tm
3 Tr
( to) Tj
0 0 Td
/F4 8 Tf
1 0 0 1 137 30 Tm
1 Tr
( one) Tj
0 0 Td
/F4 8 Tf
1 0 0 1 154 30 Tm
1 0 0 rg
0 1 0 RG
2 Tr
( text) Tj
0 0 Td
/F4 8 Tf
1 0 0 1 170 30 Tm
0 0 1 rg
1 0 0 RG
1 Tr
( run.) Tj
ET
endstream
endobj
xref
0 11
0000000000 65535 f
0000000015 00000 n
0000000068 00000 n
0000000161 00000 n
0000000463 00000 n
0000000541 00000 n
0000000617 00000 n
0000000722 00000 n
0000000934 00000 n
0000001046 00000 n
0000001267 00000 n
trailer <<
/Root 1 0 R
/Size 11
>>
startxref
1789
%%EOF

@ -376,9 +376,9 @@ class PDFEngine {
// Get a given unicode character on a given page.
virtual uint32_t GetCharUnicode(int page_index, int char_index) = 0;
// Given a start char index, find the longest continuous run of text that's
// in a single direction and with the same style and font size. Return a
// filled out pp::PDF::PrivateAccessibilityTextRunInfo on success or
// base::nullopt on failure. e.g. When |start_char_index| is out of bounds.
// in a single direction and with the same text style. Return a filled out
// pp::PDF::PrivateAccessibilityTextRunInfo on success or base::nullopt on
// failure. e.g. When |start_char_index| is out of bounds.
virtual base::Optional<pp::PDF::PrivateAccessibilityTextRunInfo>
GetTextRunInfo(int page_index, int start_char_index) = 0;
// Gets the number of links on a given page.

@ -326,6 +326,21 @@ void PDFiumPage::CalculateTextRunStyleInfo(
style_info->render_mode = FPDFText_GetTextRenderMode(text_page, char_index);
}
// Reports whether the character at |char_index| carries the same text style as
// |style|. The character's style is obtained through
// CalculateTextRunStyleInfo() and then compared property by property: font
// name, font weight, font size, italic flag, bold flag, render mode, fill
// color and stroke color. Any single mismatch yields false.
bool PDFiumPage::AreTextStyleEqual(
    int char_index,
    const pp::PDF::PrivateAccessibilityTextStyleInfo& style) {
  pp::PDF::PrivateAccessibilityTextStyleInfo current;
  CalculateTextRunStyleInfo(char_index, &current);

  // Properties that describe the font itself. The font size is a double, so
  // it goes through DoubleEquals() instead of operator==.
  const bool same_font = current.font_name == style.font_name &&
                         current.font_weight == style.font_weight &&
                         DoubleEquals(current.font_size, style.font_size) &&
                         current.is_italic == style.is_italic &&
                         current.is_bold == style.is_bold;
  if (!same_font)
    return false;

  // Properties that describe how the glyphs are painted.
  return current.render_mode == style.render_mode &&
         current.fill_color == style.fill_color &&
         current.stroke_color == style.stroke_color;
}
base::Optional<pp::PDF::PrivateAccessibilityTextRunInfo>
PDFiumPage::GetTextRunInfo(int start_char_index) {
FPDF_PAGE page = GetPage();
@ -406,11 +421,9 @@ PDFiumPage::GetTextRunInfo(int start_char_index) {
GetFloatCharRectInPixels(page, text_page, char_index);
if (!base::IsUnicodeWhitespace(character)) {
// TODO (bebeaudr): add text run break heuristic to break on style.
// Heuristic: End the text run if different font size is encountered.
double font_size = FPDFText_GetFontSize(text_page, char_index);
if (!DoubleEquals(font_size, text_run_font_size))
// Heuristic: End the text run if the text style of the current character
// is different from the text run's style.
if (!AreTextStyleEqual(char_index, info.style))
break;
// Heuristic: End text run if character isn't going in the same direction.

@ -201,6 +201,11 @@ class PDFiumPage {
void CalculateTextRunStyleInfo(
int char_index,
pp::PDF::PrivateAccessibilityTextStyleInfo* style_info);
// Returns true if the character at index |char_index| has the same text style
// as |style|, and false otherwise. The properties compared are font name,
// font weight, render mode, font size, fill color, stroke color, italic flag
// and bold flag. GetTextRunInfo() passes the current text run's style here to
// decide whether the run must end at this character.
bool AreTextStyleEqual(
int char_index,
const pp::PDF::PrivateAccessibilityTextStyleInfo& style);
// Key : Marked content id for the image element as specified in the
// struct tree.