Add a test for exporting tagged PDFs

This is a follow-up to http://crrev.com/c/1970742 - it adds a new
interface to read the structure tree from PDFium, then generates a
tagged PDF using the new command-line flag and tests that the
resulting PDF file has the correct structure tree.

Bug: 607777

Change-Id: I796b15ea477ae4e20099e95982430fe770166577
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1970744
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Eric Seckler <eseckler@chromium.org>
Commit-Queue: Dominic Mazzoni <dmazzoni@chromium.org>
Cr-Commit-Position: refs/heads/master@{#736103}

This commit is contained in:

Dominic Mazzoni

2020-01-28 22:50:53 +00:00

committed by

Commit Bot

parent 1db56e05ed

commit 3895ab0377

7 changed files with 237 additions and 3 deletions

headless

lib

headless_web_contents_browsertest.cc

test

data

structured_doc.html

pdf

pdf.cc pdf.h pdf_engine.h

pdfium

pdfium_engine_exports.cc pdfium_engine_exports.h

									
										135

headless/lib/headless_web_contents_browsertest.cc
									
				@ -11,6 +11,7 @@

				#include "base/json/json_writer.h"

				#include "base/logging.h"

				#include "base/run_loop.h"

				#include "base/strings/string_util.h"

				#include "base/strings/stringprintf.h"

				#include "build/build_config.h"

				#include "cc/base/switches.h"

				@ -19,6 +20,7 @@

				#include "content/public/browser/web_contents.h"

				#include "content/public/common/content_switches.h"

				#include "content/public/test/browser_test.h"

				#include "headless/app/headless_shell_switches.h"

				#include "headless/lib/browser/headless_web_contents_impl.h"

				#include "headless/public/devtools/domains/browser.h"

				#include "headless/public/devtools/domains/dom_snapshot.h"

				@ -510,6 +512,133 @@ class HeadlessWebContentsPDFPageSizeRoundingTest

				HEADLESS_ASYNC_DEVTOOLED_TEST_F(HeadlessWebContentsPDFPageSizeRoundingTest);

				// Expected pretty-printed JSON for the structure tree of the first page of

				// the PDF printed from structured_doc.html. OnPDFCreated() in

				// HeadlessWebContentsTaggedPDFTest compares the exported tree against this

				// string verbatim, so whitespace inside the literal is significant.

				// Each node has a "type" and, when it has children, a "~children" list.

				const char kExpectedStructTreeJSON[] = R"({

				   "type": "Document",

				   "~children": [ {

				      "type": "H",

				      "~children": [ {

				         "type": "NonStruct"

				      } ]

				   }, {

				      "type": "P",

				      "~children": [ {

				         "type": "NonStruct"

				      } ]

				   }, {

				      "type": "L",

				      "~children": [ {

				         "type": "LI",

				         "~children": [ {

				            "type": "NonStruct"

				         } ]

				      }, {

				         "type": "LI",

				         "~children": [ {

				            "type": "NonStruct"

				         } ]

				      } ]

				   }, {

				      "type": "Table",

				      "~children": [ {

				         "type": "TR",

				         "~children": [ {

				            "type": "TH",

				            "~children": [ {

				               "type": "NonStruct"

				            } ]

				         }, {

				            "type": "TH",

				            "~children": [ {

				               "type": "NonStruct"

				            } ]

				         } ]

				      }, {

				         "type": "TR",

				         "~children": [ {

				            "type": "TD",

				            "~children": [ {

				               "type": "NonStruct"

				            } ]

				         }, {

				            "type": "TD",

				            "~children": [ {

				               "type": "NonStruct"

				            } ]

				         } ]

				      } ]

				   } ]

				}

				)";

				// Browser test that loads structured_doc.html with the kExportTaggedPDF
				// switch set, prints it to PDF through the DevTools Page domain, and checks
				// that the result is a one-page tagged PDF whose structure tree serializes
				// to kExpectedStructTreeJSON.
				class HeadlessWebContentsTaggedPDFTest
				    : public HeadlessAsyncDevTooledBrowserTest,
				      public page::Observer {
				 public:
				  void SetUpCommandLine(base::CommandLine* command_line) override {
				    // Specifically request a tagged (accessible) PDF. Maybe someday
				    // we can enable this by default.
				    HeadlessAsyncDevTooledBrowserTest::SetUpCommandLine(command_line);
				    command_line->AppendSwitch(switches::kExportTaggedPDF);
				  }

				  // Starts the embedded test server, registers this object as a page
				  // observer, waits for Page.enable to complete and then navigates to the
				  // structured test document.
				  void RunDevTooledTest() override {
				    EXPECT_TRUE(embedded_test_server()->Start());

				    devtools_client_->GetPage()->AddObserver(this);

				    // Block until Enable() has run its callback so that the load event of
				    // the navigation below is delivered to OnLoadEventFired().
				    base::RunLoop run_loop(base::RunLoop::Type::kNestableTasksAllowed);
				    devtools_client_->GetPage()->Enable(run_loop.QuitClosure());
				    run_loop.Run();

				    devtools_client_->GetPage()->Navigate(
				        embedded_test_server()->GetURL("/structured_doc.html").spec());
				  }

				  // page::Observer: once the document has loaded, request a PDF of it. The
				  // result is delivered to OnPDFCreated().
				  void OnLoadEventFired(const page::LoadEventFiredParams&) override {
				    devtools_client_->GetPage()->GetExperimental()->PrintToPDF(
				        page::PrintToPDFParams::Builder()
				            .SetPrintBackground(true)
				            .SetPaperHeight(41)
				            .SetPaperWidth(41)
				            .SetMarginTop(0)
				            .SetMarginBottom(0)
				            .SetMarginLeft(0)
				            .SetMarginRight(0)
				            .Build(),
				        base::BindOnce(&HeadlessWebContentsTaggedPDFTest::OnPDFCreated,
				                       base::Unretained(this)));
				  }

				  // Verifies the printed PDF: non-empty data, exactly one page, tagged, and
				  // a first-page structure tree equal to kExpectedStructTreeJSON.
				  void OnPDFCreated(std::unique_ptr<page::PrintToPDFResult> result) {
				    ASSERT_TRUE(result);
				    protocol::Binary pdf_data = result->GetData();
				    EXPECT_GT(pdf_data.size(), 0U);
				    auto pdf_span = base::make_span(pdf_data.data(), pdf_data.size());
				    // Initialized so that a failing GetPDFDocInfo() cannot make the
				    // EXPECT_EQ below read an indeterminate value.
				    int num_pages = 0;
				    EXPECT_TRUE(chrome_pdf::GetPDFDocInfo(pdf_span, &num_pages, nullptr));
				    EXPECT_EQ(1, num_pages);

				    base::Optional<bool> tagged = chrome_pdf::IsPDFDocTagged(pdf_span);
				    ASSERT_TRUE(tagged.has_value());
				    EXPECT_TRUE(tagged.value());

				    constexpr int kFirstPage = 0;
				    base::Value struct_tree =
				        chrome_pdf::GetPDFStructTreeForPage(pdf_span, kFirstPage);
				    std::string json;
				    base::JSONWriter::WriteWithOptions(
				        struct_tree, base::JSONWriter::OPTIONS_PRETTY_PRINT, &json);
				    // Map Windows line endings to Unix by removing '\r'.
				    base::RemoveChars(json, "\r", &json);

				    EXPECT_EQ(kExpectedStructTreeJSON, json);

				    FinishAsynchronousTest();
				  }
				};

				HEADLESS_ASYNC_DEVTOOLED_TEST_F(HeadlessWebContentsTaggedPDFTest);

				#endif  // BUILDFLAG(ENABLE_PRINTING)

				class HeadlessWebContentsSecurityTest

				@ -671,11 +800,11 @@ class HeadlessWebContentsBeginFrameControlTest

				  void SetUpCommandLine(base::CommandLine* command_line) override {

				    HeadlessBrowserTest::SetUpCommandLine(command_line);

				    // See bit.ly/headless-rendering for why we use these flags.

				    command_line->AppendSwitch(switches::kRunAllCompositorStagesBeforeDraw);

				    command_line->AppendSwitch(switches::kDisableNewContentRenderingTimeout);

				    command_line->AppendSwitch(::switches::kRunAllCompositorStagesBeforeDraw);

				    command_line->AppendSwitch(::switches::kDisableNewContentRenderingTimeout);

				    command_line->AppendSwitch(cc::switches::kDisableCheckerImaging);

				    command_line->AppendSwitch(cc::switches::kDisableThreadedAnimation);

				    command_line->AppendSwitch(switches::kDisableThreadedScrolling);

				    command_line->AppendSwitch(::switches::kDisableThreadedScrolling);

				  }

				  void OnCreateTargetResult(

									
										19

headless/test/data/structured_doc.html
									
										Normal file
									
				@ -0,0 +1,19 @@

				<!DOCTYPE html>

				<body>

				  <h1>Title</h1>

				  <p>Para</p>

				  <ul>

				    <li>Item 1</li>

				    <li>Item 2</li>

				  </ul>

				  <table>

				    <tr>

				      <th>Header 1</th>

				      <th>Header 2</th>

				    </tr>

				    <tr>

				      <td>Cell 1</td>

				      <td>Cell 2</td>

				    </tr>

				  </table>

				</body>

									
										7

pdf/pdf.cc
									
				@ -97,6 +97,13 @@ base::Optional<bool> IsPDFDocTagged(base::span<const uint8_t> pdf_buffer) {

				  return engine_exports->IsPDFDocTagged(pdf_buffer);

				}

				// Returns the structure tree of page |page_index| of the PDF in
				// |pdf_buffer| by forwarding to the PDFEngineExports singleton.
				base::Value GetPDFStructTreeForPage(base::span<const uint8_t> pdf_buffer,
				                                    int page_index) {
				  // The scoped initializer stays alive until the engine call has returned.
				  ScopedSdkInitializer sdk_scope(/*enable_v8=*/true);
				  return PDFEngineExports::Get()->GetPDFStructTreeForPage(pdf_buffer,
				                                                          page_index);
				}

				bool GetPDFPageSizeByIndex(base::span<const uint8_t> pdf_buffer,

				                           int page_number,

				                           double* width,

									
										6

pdf/pdf.h
									
				@ -9,6 +9,7 @@

				#include "base/containers/span.h"

				#include "base/optional.h"

				#include "base/values.h"

				#include "build/build_config.h"

				#if defined(OS_WIN)

				@ -105,6 +106,11 @@ bool GetPDFDocInfo(base::span<const uint8_t> pdf_buffer,

				// PDF but untagged, and nullopt if the PDF can't be parsed.

				base::Optional<bool> IsPDFDocTagged(base::span<const uint8_t> pdf_buffer);

				// Given a tagged PDF (see IsPDFDocTagged, above), return the portion of

				// the structure tree for a given page as a hierarchical tree of base::Values.

				// |pdf_buffer| is the buffer that contains the entire PDF document.

				// |page_index| selects the page; callers pass 0 for the first page.

				// The PDFium-backed implementation returns a base::Value of type NONE when

				// the document, the page or its structure tree cannot be loaded.

				base::Value GetPDFStructTreeForPage(base::span<const uint8_t> pdf_buffer,

				                                    int page_index);

				// Gets the dimensions of a specific page in a document.

				// |pdf_buffer| is the buffer that contains the entire PDF document to be

				//     rendered.

									
										7

pdf/pdf_engine.h
									
				@ -16,6 +16,7 @@

				#include "base/optional.h"

				#include "base/strings/string16.h"

				#include "base/time/time.h"

				#include "base/values.h"

				#include "build/build_config.h"

				#include "pdf/document_layout.h"

				#include "ppapi/c/dev/pp_cursor_type_dev.h"

				@ -536,6 +537,12 @@ class PDFEngineExports {

				  virtual base::Optional<bool> IsPDFDocTagged(

				      base::span<const uint8_t> pdf_buffer) = 0;

				  // Given a tagged PDF (see IsPDFDocTagged, above), return the portion of

				  // the structure tree for a given page as a hierarchical tree of base::Values.

				  // |pdf_buffer| holds the entire PDF document and |page_index| selects the

				  // page (0 is the first page). The PDFium implementation returns a

				  // base::Value of type NONE on failure.

				  virtual base::Value GetPDFStructTreeForPage(

				      base::span<const uint8_t> pdf_buffer,

				      int page_index) = 0;

				  // See the definition of GetPDFPageSizeByIndex in pdf.cc for details.

				  virtual bool GetPDFPageSizeByIndex(base::span<const uint8_t> pdf_buffer,

				                                     int page_number,

									
										64

pdf/pdfium/pdfium_engine_exports.cc
									
				@ -16,6 +16,7 @@

				#include "third_party/pdfium/public/cpp/fpdf_scopers.h"

				#include "third_party/pdfium/public/fpdf_catalog.h"

				#include "third_party/pdfium/public/fpdf_ppo.h"

				#include "third_party/pdfium/public/fpdf_structtree.h"

				#include "third_party/pdfium/public/fpdfview.h"

				#include "ui/gfx/geometry/rect.h"

				#include "ui/gfx/geometry/size.h"

				@ -129,6 +130,41 @@ bool IsValidPrintableArea(const gfx::Size& page_size,

				         printable_area.bottom() <= page_size.height();

				}

				// Converts |struct_elem| and its descendants into a base::Value dictionary.
				// Every dictionary has a "type" key holding the element's structure type
				// and, when at least one child could be converted, a "~children" list with
				// the children's dictionaries. Returns a NONE value if the element's type
				// cannot be read.
				base::Value RecursiveGetStructTree(FPDF_STRUCTELEMENT struct_elem) {
				  constexpr int kBufLen = 64;
				  base::char16 elem_type_buffer[kBufLen];
				  // FPDF_StructElement_GetType() returns the number of bytes the type
				  // needs (0 on failure) and does not write to a buffer that is too small.
				  // Reject both cases so an unwritten buffer is never read as a string.
				  const unsigned long type_bytes = FPDF_StructElement_GetType(
				      struct_elem, elem_type_buffer, sizeof(elem_type_buffer));
				  if (type_bytes == 0 || type_bytes > sizeof(elem_type_buffer))
				    return base::Value(base::Value::Type::NONE);

				  base::Value result(base::Value::Type::DICTIONARY);
				  base::string16 elem_type(elem_type_buffer);
				  result.SetStringKey("type", elem_type);

				  // A leaf element is still a valid node: report it with just its type
				  // rather than discarding it.
				  int children_count = FPDF_StructElement_CountChildren(struct_elem);
				  if (children_count <= 0)
				    return result;

				  base::Value children(base::Value::Type::LIST);
				  for (int i = 0; i < children_count; i++) {
				    FPDF_STRUCTELEMENT child_elem =
				        FPDF_StructElement_GetChildAtIndex(struct_elem, i);
				    base::Value child = RecursiveGetStructTree(child_elem);
				    // Children that could not be converted come back as NONE; skip them.
				    if (child.is_dict())
				      children.Append(std::move(child));
				  }

				  // use "~children" instead of "children" because we pretty-print the
				  // result of this as JSON and the keys are sorted; it's much easier to
				  // understand when the children are the last key.
				  if (!children.GetList().empty())
				    result.SetKey("~children", std::move(children));

				  return result;
				}

				}  // namespace

				PDFEngineExports::RenderingSettings::RenderingSettings(int dpi_x,

				@ -362,6 +398,34 @@ base::Optional<bool> PDFiumEngineExports::IsPDFDocTagged(

				  return FPDFCatalog_IsTagged(doc.get());

				}

				// Loads |pdf_buffer|, opens the page at |page_index| and converts that
				// page's structure tree into a base::Value. A failure at any step, or a
				// tree that does not have exactly one root element, yields a NONE value.
				base::Value PDFiumEngineExports::GetPDFStructTreeForPage(
				    base::span<const uint8_t> pdf_buffer,
				    int page_index) {
				  ScopedFPDFDocument document = LoadPdfData(pdf_buffer);
				  if (!document)
				    return base::Value(base::Value::Type::NONE);

				  ScopedFPDFPage pdf_page(FPDF_LoadPage(document.get(), page_index));
				  if (!pdf_page)
				    return base::Value(base::Value::Type::NONE);

				  ScopedFPDFStructTree tree(FPDF_StructTree_GetForPage(pdf_page.get()));
				  if (!tree)
				    return base::Value(base::Value::Type::NONE);

				  // The tree is expected to hang off a single root node; bail out otherwise.
				  if (FPDF_StructTree_CountChildren(tree.get()) != 1)
				    return base::Value(base::Value::Type::NONE);

				  FPDF_STRUCTELEMENT root = FPDF_StructTree_GetChildAtIndex(tree.get(), 0);
				  return root ? RecursiveGetStructTree(root)
				              : base::Value(base::Value::Type::NONE);
				}

				bool PDFiumEngineExports::GetPDFPageSizeByIndex(

				    base::span<const uint8_t> pdf_buffer,

				    int page_number,

									
										2

pdf/pdfium/pdfium_engine_exports.h
									
				@ -53,6 +53,8 @@ class PDFiumEngineExports : public PDFEngineExports {

				                     double* max_page_width) override;

				  base::Optional<bool> IsPDFDocTagged(

				      base::span<const uint8_t> pdf_buffer) override;

				  // Returns the structure tree of the page at |page_index| as nested

				  // base::Value dictionaries, or a NONE value on failure.

				  base::Value GetPDFStructTreeForPage(base::span<const uint8_t> pdf_buffer,

				                                      int page_index) override;

				  bool GetPDFPageSizeByIndex(base::span<const uint8_t> pdf_buffer,

				                             int page_number,

				                             double* width,

Add a test for exporting tagged PDFs

135 headless/lib/headless_web_contents_browsertest.cc

19 headless/test/data/structured_doc.html Normal file

7 pdf/pdf.cc

6 pdf/pdf.h

7 pdf/pdf_engine.h

64 pdf/pdfium/pdfium_engine_exports.cc

2 pdf/pdfium/pdfium_engine_exports.h

135

headless/lib/headless_web_contents_browsertest.cc

19

headless/test/data/structured_doc.html Normal file

7

pdf/pdf.cc

6

pdf/pdf.h

7

pdf/pdf_engine.h

64

pdf/pdfium/pdfium_engine_exports.cc

2

pdf/pdfium/pdfium_engine_exports.h