0

Add a test for exporting tagged PDFs

This is a follow-up to http://crrev.com/c/1970742 - it adds a new
interface to read the structure tree from PDFium, then generates a
tagged PDF using the new command-line flag and tests that the
resulting PDF file has the correct structure tree.

Bug: 607777

Change-Id: I796b15ea477ae4e20099e95982430fe770166577
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1970744
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Eric Seckler <eseckler@chromium.org>
Commit-Queue: Dominic Mazzoni <dmazzoni@chromium.org>
Cr-Commit-Position: refs/heads/master@{#736103}
This commit is contained in:
Dominic Mazzoni
2020-01-28 22:50:53 +00:00
committed by Commit Bot
parent 1db56e05ed
commit 3895ab0377
7 changed files with 237 additions and 3 deletions

@ -11,6 +11,7 @@
#include "base/json/json_writer.h"
#include "base/logging.h"
#include "base/run_loop.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "build/build_config.h"
#include "cc/base/switches.h"
@ -19,6 +20,7 @@
#include "content/public/browser/web_contents.h"
#include "content/public/common/content_switches.h"
#include "content/public/test/browser_test.h"
#include "headless/app/headless_shell_switches.h"
#include "headless/lib/browser/headless_web_contents_impl.h"
#include "headless/public/devtools/domains/browser.h"
#include "headless/public/devtools/domains/dom_snapshot.h"
@ -510,6 +512,133 @@ class HeadlessWebContentsPDFPageSizeRoundingTest
HEADLESS_ASYNC_DEVTOOLED_TEST_F(HeadlessWebContentsPDFPageSizeRoundingTest);
// Expected structure tree of the first page of the tagged PDF produced by
// HeadlessWebContentsTaggedPDFTest, serialized as pretty-printed JSON.
// OnPDFCreated() compares this string for exact equality with the output of
// base::JSONWriter (after stripping '\r'), so whitespace inside the literal
// is significant. Each node carries a "type" and, when it has children, a
// "~children" list.
const char kExpectedStructTreeJSON[] = R"({
"type": "Document",
"~children": [ {
"type": "H",
"~children": [ {
"type": "NonStruct"
} ]
}, {
"type": "P",
"~children": [ {
"type": "NonStruct"
} ]
}, {
"type": "L",
"~children": [ {
"type": "LI",
"~children": [ {
"type": "NonStruct"
} ]
}, {
"type": "LI",
"~children": [ {
"type": "NonStruct"
} ]
} ]
}, {
"type": "Table",
"~children": [ {
"type": "TR",
"~children": [ {
"type": "TH",
"~children": [ {
"type": "NonStruct"
} ]
}, {
"type": "TH",
"~children": [ {
"type": "NonStruct"
} ]
} ]
}, {
"type": "TR",
"~children": [ {
"type": "TD",
"~children": [ {
"type": "NonStruct"
} ]
}, {
"type": "TD",
"~children": [ {
"type": "NonStruct"
} ]
} ]
} ]
} ]
}
)";
class HeadlessWebContentsTaggedPDFTest
: public HeadlessAsyncDevTooledBrowserTest,
public page::Observer {
public:
void SetUpCommandLine(base::CommandLine* command_line) override {
// Specifically request a tagged (accessible) PDF. Maybe someday
// we can enable this by default.
HeadlessAsyncDevTooledBrowserTest::SetUpCommandLine(command_line);
command_line->AppendSwitch(switches::kExportTaggedPDF);
}
void RunDevTooledTest() override {
EXPECT_TRUE(embedded_test_server()->Start());
devtools_client_->GetPage()->AddObserver(this);
base::RunLoop run_loop(base::RunLoop::Type::kNestableTasksAllowed);
devtools_client_->GetPage()->Enable(run_loop.QuitClosure());
run_loop.Run();
devtools_client_->GetPage()->Navigate(
embedded_test_server()->GetURL("/structured_doc.html").spec());
}
void OnLoadEventFired(const page::LoadEventFiredParams&) override {
devtools_client_->GetPage()->GetExperimental()->PrintToPDF(
page::PrintToPDFParams::Builder()
.SetPrintBackground(true)
.SetPaperHeight(41)
.SetPaperWidth(41)
.SetMarginTop(0)
.SetMarginBottom(0)
.SetMarginLeft(0)
.SetMarginRight(0)
.Build(),
base::BindOnce(&HeadlessWebContentsTaggedPDFTest::OnPDFCreated,
base::Unretained(this)));
}
void OnPDFCreated(std::unique_ptr<page::PrintToPDFResult> result) {
ASSERT_TRUE(result);
protocol::Binary pdf_data = result->GetData();
EXPECT_GT(pdf_data.size(), 0U);
auto pdf_span = base::make_span(pdf_data.data(), pdf_data.size());
int num_pages;
EXPECT_TRUE(chrome_pdf::GetPDFDocInfo(pdf_span, &num_pages, nullptr));
EXPECT_EQ(1, num_pages);
base::Optional<bool> tagged = chrome_pdf::IsPDFDocTagged(pdf_span);
ASSERT_TRUE(tagged.has_value());
EXPECT_TRUE(tagged.value());
constexpr int kFirstPage = 0;
base::Value struct_tree =
chrome_pdf::GetPDFStructTreeForPage(pdf_span, kFirstPage);
std::string json;
base::JSONWriter::WriteWithOptions(
struct_tree, base::JSONWriter::OPTIONS_PRETTY_PRINT, &json);
// Map Windows line endings to Unix by removing '\r'.
base::RemoveChars(json, "\r", &json);
EXPECT_EQ(kExpectedStructTreeJSON, json);
FinishAsynchronousTest();
}
};
HEADLESS_ASYNC_DEVTOOLED_TEST_F(HeadlessWebContentsTaggedPDFTest);
#endif // BUILDFLAG(ENABLE_PRINTING)
class HeadlessWebContentsSecurityTest
@ -671,11 +800,11 @@ class HeadlessWebContentsBeginFrameControlTest
// Adds the rendering-related switches this begin-frame-control test relies
// on, after letting HeadlessBrowserTest add its own.
void SetUpCommandLine(base::CommandLine* command_line) override {
  HeadlessBrowserTest::SetUpCommandLine(command_line);
  // See bit.ly/headless-rendering for why we use these flags.
  // Each switch is appended exactly once. The content switches are spelled
  // with a leading `::` because this file also includes
  // headless/app/headless_shell_switches.h; NOTE(review): presumably an
  // unqualified `switches::` would resolve to headless::switches here.
  command_line->AppendSwitch(::switches::kRunAllCompositorStagesBeforeDraw);
  command_line->AppendSwitch(::switches::kDisableNewContentRenderingTimeout);
  command_line->AppendSwitch(cc::switches::kDisableCheckerImaging);
  command_line->AppendSwitch(cc::switches::kDisableThreadedAnimation);
  command_line->AppendSwitch(::switches::kDisableThreadedScrolling);
}
void OnCreateTargetResult(

@ -0,0 +1,19 @@
<!DOCTYPE html>
<!--
  Minimal structured document: one heading, one paragraph, a two-item
  unordered list and a 2x2 table (one header row, one data row).
  The element order (h1, p, ul, table) lines up with the H, P, L, Table
  order in kExpectedStructTreeJSON.
  NOTE(review): presumably this is the structured_doc.html page loaded by
  HeadlessWebContentsTaggedPDFTest; if so, keep the two in sync.
-->
<body>
<h1>Title</h1>
<p>Para</p>
<ul>
<li>Item 1</li>
<li>Item 2</li>
</ul>
<table>
<tr>
<th>Header 1</th>
<th>Header 2</th>
</tr>
<tr>
<td>Cell 1</td>
<td>Cell 2</td>
</tr>
</table>
</body>

@ -97,6 +97,13 @@ base::Optional<bool> IsPDFDocTagged(base::span<const uint8_t> pdf_buffer) {
return engine_exports->IsPDFDocTagged(pdf_buffer);
}
// Returns the structure tree for page |page_index| of the PDF held in
// |pdf_buffer| by forwarding to the object returned by
// PDFEngineExports::Get(). A ScopedSdkInitializer (with V8 enabled) is kept
// alive for the duration of the call.
base::Value GetPDFStructTreeForPage(base::span<const uint8_t> pdf_buffer,
                                    int page_index) {
  ScopedSdkInitializer sdk_scope(/*enable_v8=*/true);
  return PDFEngineExports::Get()->GetPDFStructTreeForPage(pdf_buffer,
                                                          page_index);
}
bool GetPDFPageSizeByIndex(base::span<const uint8_t> pdf_buffer,
int page_number,
double* width,

@ -9,6 +9,7 @@
#include "base/containers/span.h"
#include "base/optional.h"
#include "base/values.h"
#include "build/build_config.h"
#if defined(OS_WIN)
@ -105,6 +106,11 @@ bool GetPDFDocInfo(base::span<const uint8_t> pdf_buffer,
// PDF but untagged, and nullopt if the PDF can't be parsed.
base::Optional<bool> IsPDFDocTagged(base::span<const uint8_t> pdf_buffer);
// Given a tagged PDF (see IsPDFDocTagged, above), return the portion of
// the structure tree for a given page as a hierarchical tree of base::Values.
// |pdf_buffer| is the buffer that contains the entire PDF document.
// |page_index| selects the page; the first page is index 0.
// With the PDFium-backed implementation, each node is a dictionary with a
// "type" string and, if it has children, a "~children" list, and a value of
// type NONE is returned when the document, page or structure tree cannot be
// loaded or the tree does not have exactly one root element.
base::Value GetPDFStructTreeForPage(base::span<const uint8_t> pdf_buffer,
int page_index);
// Gets the dimensions of a specific page in a document.
// |pdf_buffer| is the buffer that contains the entire PDF document to be
// rendered.

@ -16,6 +16,7 @@
#include "base/optional.h"
#include "base/strings/string16.h"
#include "base/time/time.h"
#include "base/values.h"
#include "build/build_config.h"
#include "pdf/document_layout.h"
#include "ppapi/c/dev/pp_cursor_type_dev.h"
@ -536,6 +537,12 @@ class PDFEngineExports {
virtual base::Optional<bool> IsPDFDocTagged(
base::span<const uint8_t> pdf_buffer) = 0;
// Given a tagged PDF (see IsPDFDocTagged, above), return the portion of
// the structure tree for a given page as a hierarchical tree of base::Values.
// |pdf_buffer| holds the entire PDF document and |page_index| selects the
// page (the first page is index 0).
// NOTE(review): the PDFium implementation returns a value of type NONE on
// failure; confirm other implementations follow the same convention.
virtual base::Value GetPDFStructTreeForPage(
base::span<const uint8_t> pdf_buffer,
int page_index) = 0;
// See the definition of GetPDFPageSizeByIndex in pdf.cc for details.
virtual bool GetPDFPageSizeByIndex(base::span<const uint8_t> pdf_buffer,
int page_number,

@ -16,6 +16,7 @@
#include "third_party/pdfium/public/cpp/fpdf_scopers.h"
#include "third_party/pdfium/public/fpdf_catalog.h"
#include "third_party/pdfium/public/fpdf_ppo.h"
#include "third_party/pdfium/public/fpdf_structtree.h"
#include "third_party/pdfium/public/fpdfview.h"
#include "ui/gfx/geometry/rect.h"
#include "ui/gfx/geometry/size.h"
@ -129,6 +130,41 @@ bool IsValidPrintableArea(const gfx::Size& page_size,
printable_area.bottom() <= page_size.height();
}
// Converts |struct_elem| and all of its descendants into a tree of
// base::Value dictionaries.
//
// Each returned dictionary has a "type" key holding the element's structure
// type. Elements with at least one convertible child also get a "~children"
// list with the children in document order. Leaf elements are returned as a
// dictionary containing only "type".
//
// Returns a value of type NONE if the element's type cannot be retrieved;
// such children are skipped by the caller's is_dict() check.
base::Value RecursiveGetStructTree(FPDF_STRUCTELEMENT struct_elem) {
  constexpr int kBufLen = 64;
  base::char16 elem_type_buffer[kBufLen];

  // Per the PDFium header, FPDF_StructElement_GetType() returns the number of
  // bytes the type needs (including the terminating NUL) and does not modify
  // the buffer when it is too small. Treat both "no type" and "does not fit"
  // as failure so that an unwritten buffer is never read below.
  const unsigned long type_len_bytes = FPDF_StructElement_GetType(
      struct_elem, elem_type_buffer, sizeof(elem_type_buffer));
  if (type_len_bytes == 0 || type_len_bytes > sizeof(elem_type_buffer))
    return base::Value(base::Value::Type::NONE);

  base::Value result(base::Value::Type::DICTIONARY);
  base::string16 elem_type(elem_type_buffer);
  result.SetStringKey("type", elem_type);

  // A leaf is still a valid node: return it with just its "type" rather than
  // dropping it from the tree.
  int children_count = FPDF_StructElement_CountChildren(struct_elem);
  if (children_count <= 0)
    return result;

  base::Value children(base::Value::Type::LIST);
  for (int i = 0; i < children_count; i++) {
    FPDF_STRUCTELEMENT child_elem =
        FPDF_StructElement_GetChildAtIndex(struct_elem, i);
    base::Value child = RecursiveGetStructTree(child_elem);
    // Children that could not be converted come back as NONE; skip them.
    if (child.is_dict())
      children.Append(std::move(child));
  }

  // use "~children" instead of "children" because we pretty-print the
  // result of this as JSON and the keys are sorted; it's much easier to
  // understand when the children are the last key.
  if (!children.GetList().empty())
    result.SetKey("~children", std::move(children));

  return result;
}
} // namespace
PDFEngineExports::RenderingSettings::RenderingSettings(int dpi_x,
@ -362,6 +398,34 @@ base::Optional<bool> PDFiumEngineExports::IsPDFDocTagged(
return FPDFCatalog_IsTagged(doc.get());
}
// Loads the PDF in |pdf_buffer|, opens page |page_index| and converts that
// page's structure tree into a base::Value tree via RecursiveGetStructTree().
// Any failure along the way (document, page or structure tree cannot be
// loaded, the tree does not have exactly one child, or that child is null)
// produces a value of type NONE.
base::Value PDFiumEngineExports::GetPDFStructTreeForPage(
    base::span<const uint8_t> pdf_buffer,
    int page_index) {
  // Single spelling of the failure result shared by every bail-out below.
  const auto make_none = [] { return base::Value(base::Value::Type::NONE); };

  ScopedFPDFDocument document = LoadPdfData(pdf_buffer);
  if (!document)
    return make_none();

  ScopedFPDFPage pdf_page(FPDF_LoadPage(document.get(), page_index));
  if (!pdf_page)
    return make_none();

  ScopedFPDFStructTree tree(FPDF_StructTree_GetForPage(pdf_page.get()));
  if (!tree)
    return make_none();

  // The tree is only expected to have a single child, i.e. one root node.
  if (FPDF_StructTree_CountChildren(tree.get()) != 1)
    return make_none();

  FPDF_STRUCTELEMENT root = FPDF_StructTree_GetChildAtIndex(tree.get(), 0);
  return root ? RecursiveGetStructTree(root) : make_none();
}
bool PDFiumEngineExports::GetPDFPageSizeByIndex(
base::span<const uint8_t> pdf_buffer,
int page_number,

@ -53,6 +53,8 @@ class PDFiumEngineExports : public PDFEngineExports {
double* max_page_width) override;
base::Optional<bool> IsPDFDocTagged(
base::span<const uint8_t> pdf_buffer) override;
// PDFium-backed implementation; yields a NONE-typed value when the document,
// page or structure tree cannot be loaded.
base::Value GetPDFStructTreeForPage(base::span<const uint8_t> pdf_buffer,
int page_index) override;
bool GetPDFPageSizeByIndex(base::span<const uint8_t> pdf_buffer,
int page_number,
double* width,