Add a test for exporting tagged PDFs
This is a follow-up to http://crrev.com/c/1970742 - it adds a new interface to read the structure tree from PDFium, then generates a tagged PDF using the new command-line flag and tests that the resulting PDF file has the correct structure tree. Bug: 607777 Change-Id: I796b15ea477ae4e20099e95982430fe770166577 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1970744 Reviewed-by: Lei Zhang <thestig@chromium.org> Reviewed-by: Eric Seckler <eseckler@chromium.org> Commit-Queue: Dominic Mazzoni <dmazzoni@chromium.org> Cr-Commit-Position: refs/heads/master@{#736103}
This commit is contained in:

committed by
Commit Bot

parent
1db56e05ed
commit
3895ab0377
@ -11,6 +11,7 @@
|
||||
#include "base/json/json_writer.h"
|
||||
#include "base/logging.h"
|
||||
#include "base/run_loop.h"
|
||||
#include "base/strings/string_util.h"
|
||||
#include "base/strings/stringprintf.h"
|
||||
#include "build/build_config.h"
|
||||
#include "cc/base/switches.h"
|
||||
@ -19,6 +20,7 @@
|
||||
#include "content/public/browser/web_contents.h"
|
||||
#include "content/public/common/content_switches.h"
|
||||
#include "content/public/test/browser_test.h"
|
||||
#include "headless/app/headless_shell_switches.h"
|
||||
#include "headless/lib/browser/headless_web_contents_impl.h"
|
||||
#include "headless/public/devtools/domains/browser.h"
|
||||
#include "headless/public/devtools/domains/dom_snapshot.h"
|
||||
@ -510,6 +512,133 @@ class HeadlessWebContentsPDFPageSizeRoundingTest
|
||||
|
||||
HEADLESS_ASYNC_DEVTOOLED_TEST_F(HeadlessWebContentsPDFPageSizeRoundingTest);
|
||||
|
||||
const char kExpectedStructTreeJSON[] = R"({
|
||||
"type": "Document",
|
||||
"~children": [ {
|
||||
"type": "H",
|
||||
"~children": [ {
|
||||
"type": "NonStruct"
|
||||
} ]
|
||||
}, {
|
||||
"type": "P",
|
||||
"~children": [ {
|
||||
"type": "NonStruct"
|
||||
} ]
|
||||
}, {
|
||||
"type": "L",
|
||||
"~children": [ {
|
||||
"type": "LI",
|
||||
"~children": [ {
|
||||
"type": "NonStruct"
|
||||
} ]
|
||||
}, {
|
||||
"type": "LI",
|
||||
"~children": [ {
|
||||
"type": "NonStruct"
|
||||
} ]
|
||||
} ]
|
||||
}, {
|
||||
"type": "Table",
|
||||
"~children": [ {
|
||||
"type": "TR",
|
||||
"~children": [ {
|
||||
"type": "TH",
|
||||
"~children": [ {
|
||||
"type": "NonStruct"
|
||||
} ]
|
||||
}, {
|
||||
"type": "TH",
|
||||
"~children": [ {
|
||||
"type": "NonStruct"
|
||||
} ]
|
||||
} ]
|
||||
}, {
|
||||
"type": "TR",
|
||||
"~children": [ {
|
||||
"type": "TD",
|
||||
"~children": [ {
|
||||
"type": "NonStruct"
|
||||
} ]
|
||||
}, {
|
||||
"type": "TD",
|
||||
"~children": [ {
|
||||
"type": "NonStruct"
|
||||
} ]
|
||||
} ]
|
||||
} ]
|
||||
} ]
|
||||
}
|
||||
)";
|
||||
|
||||
class HeadlessWebContentsTaggedPDFTest
|
||||
: public HeadlessAsyncDevTooledBrowserTest,
|
||||
public page::Observer {
|
||||
public:
|
||||
void SetUpCommandLine(base::CommandLine* command_line) override {
|
||||
// Specifically request a tagged (accessible) PDF. Maybe someday
|
||||
// we can enable this by default.
|
||||
HeadlessAsyncDevTooledBrowserTest::SetUpCommandLine(command_line);
|
||||
command_line->AppendSwitch(switches::kExportTaggedPDF);
|
||||
}
|
||||
|
||||
void RunDevTooledTest() override {
|
||||
EXPECT_TRUE(embedded_test_server()->Start());
|
||||
|
||||
devtools_client_->GetPage()->AddObserver(this);
|
||||
|
||||
base::RunLoop run_loop(base::RunLoop::Type::kNestableTasksAllowed);
|
||||
devtools_client_->GetPage()->Enable(run_loop.QuitClosure());
|
||||
run_loop.Run();
|
||||
|
||||
devtools_client_->GetPage()->Navigate(
|
||||
embedded_test_server()->GetURL("/structured_doc.html").spec());
|
||||
}
|
||||
|
||||
void OnLoadEventFired(const page::LoadEventFiredParams&) override {
|
||||
devtools_client_->GetPage()->GetExperimental()->PrintToPDF(
|
||||
page::PrintToPDFParams::Builder()
|
||||
.SetPrintBackground(true)
|
||||
.SetPaperHeight(41)
|
||||
.SetPaperWidth(41)
|
||||
.SetMarginTop(0)
|
||||
.SetMarginBottom(0)
|
||||
.SetMarginLeft(0)
|
||||
.SetMarginRight(0)
|
||||
.Build(),
|
||||
base::BindOnce(&HeadlessWebContentsTaggedPDFTest::OnPDFCreated,
|
||||
base::Unretained(this)));
|
||||
}
|
||||
|
||||
void OnPDFCreated(std::unique_ptr<page::PrintToPDFResult> result) {
|
||||
ASSERT_TRUE(result);
|
||||
protocol::Binary pdf_data = result->GetData();
|
||||
EXPECT_GT(pdf_data.size(), 0U);
|
||||
auto pdf_span = base::make_span(pdf_data.data(), pdf_data.size());
|
||||
int num_pages;
|
||||
EXPECT_TRUE(chrome_pdf::GetPDFDocInfo(pdf_span, &num_pages, nullptr));
|
||||
EXPECT_EQ(1, num_pages);
|
||||
|
||||
base::Optional<bool> tagged = chrome_pdf::IsPDFDocTagged(pdf_span);
|
||||
ASSERT_TRUE(tagged.has_value());
|
||||
EXPECT_TRUE(tagged.value());
|
||||
|
||||
constexpr int kFirstPage = 0;
|
||||
base::Value struct_tree =
|
||||
chrome_pdf::GetPDFStructTreeForPage(pdf_span, kFirstPage);
|
||||
std::string json;
|
||||
base::JSONWriter::WriteWithOptions(
|
||||
struct_tree, base::JSONWriter::OPTIONS_PRETTY_PRINT, &json);
|
||||
// Map Windows line endings to Unix by removing '\r'.
|
||||
base::RemoveChars(json, "\r", &json);
|
||||
|
||||
EXPECT_EQ(kExpectedStructTreeJSON, json);
|
||||
|
||||
FinishAsynchronousTest();
|
||||
}
|
||||
};
|
||||
|
||||
HEADLESS_ASYNC_DEVTOOLED_TEST_F(HeadlessWebContentsTaggedPDFTest);
|
||||
|
||||
#endif // BUILDFLAG(ENABLE_PRINTING)
|
||||
|
||||
class HeadlessWebContentsSecurityTest
|
||||
@ -671,11 +800,11 @@ class HeadlessWebContentsBeginFrameControlTest
|
||||
void SetUpCommandLine(base::CommandLine* command_line) override {
|
||||
HeadlessBrowserTest::SetUpCommandLine(command_line);
|
||||
// See bit.ly/headless-rendering for why we use these flags.
|
||||
command_line->AppendSwitch(switches::kRunAllCompositorStagesBeforeDraw);
|
||||
command_line->AppendSwitch(switches::kDisableNewContentRenderingTimeout);
|
||||
command_line->AppendSwitch(::switches::kRunAllCompositorStagesBeforeDraw);
|
||||
command_line->AppendSwitch(::switches::kDisableNewContentRenderingTimeout);
|
||||
command_line->AppendSwitch(cc::switches::kDisableCheckerImaging);
|
||||
command_line->AppendSwitch(cc::switches::kDisableThreadedAnimation);
|
||||
command_line->AppendSwitch(switches::kDisableThreadedScrolling);
|
||||
command_line->AppendSwitch(::switches::kDisableThreadedScrolling);
|
||||
}
|
||||
|
||||
void OnCreateTargetResult(
|
||||
|
19
headless/test/data/structured_doc.html
Normal file
19
headless/test/data/structured_doc.html
Normal file
@ -0,0 +1,19 @@
|
||||
<!DOCTYPE html>
|
||||
<body>
|
||||
<h1>Title</h1>
|
||||
<p>Para</p>
|
||||
<ul>
|
||||
<li>Item 1</li>
|
||||
<li>Item 2</li>
|
||||
</ul>
|
||||
<table>
|
||||
<tr>
|
||||
<th>Header 1</th>
|
||||
<th>Header 2</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Cell 1</td>
|
||||
<td>Cell 2</td>
|
||||
</tr>
|
||||
</table>
|
||||
</body>
|
@ -97,6 +97,13 @@ base::Optional<bool> IsPDFDocTagged(base::span<const uint8_t> pdf_buffer) {
|
||||
return engine_exports->IsPDFDocTagged(pdf_buffer);
|
||||
}
|
||||
|
||||
// Forwards to the PDFEngineExports implementation to obtain the structure
// tree of page |page_index| of the document held in |pdf_buffer|.
base::Value GetPDFStructTreeForPage(base::span<const uint8_t> pdf_buffer,
                                    int page_index) {
  // The initializer object stays alive until the call below has returned.
  ScopedSdkInitializer scoped_sdk_initializer(/*enable_v8=*/true);
  return PDFEngineExports::Get()->GetPDFStructTreeForPage(pdf_buffer,
                                                          page_index);
}
|
||||
|
||||
bool GetPDFPageSizeByIndex(base::span<const uint8_t> pdf_buffer,
|
||||
int page_number,
|
||||
double* width,
|
||||
|
@ -9,6 +9,7 @@
|
||||
|
||||
#include "base/containers/span.h"
|
||||
#include "base/optional.h"
|
||||
#include "base/values.h"
|
||||
#include "build/build_config.h"
|
||||
|
||||
#if defined(OS_WIN)
|
||||
@ -105,6 +106,11 @@ bool GetPDFDocInfo(base::span<const uint8_t> pdf_buffer,
|
||||
// PDF but untagged, and nullopt if the PDF can't be parsed.
|
||||
base::Optional<bool> IsPDFDocTagged(base::span<const uint8_t> pdf_buffer);
|
||||
|
||||
// Given a tagged PDF (see IsPDFDocTagged, above), return the portion of
|
||||
// the structure tree for a given page as a hierarchical tree of base::Values.
|
||||
base::Value GetPDFStructTreeForPage(base::span<const uint8_t> pdf_buffer,
|
||||
int page_index);
|
||||
|
||||
// Gets the dimensions of a specific page in a document.
|
||||
// |pdf_buffer| is the buffer that contains the entire PDF document to be
|
||||
// rendered.
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include "base/optional.h"
|
||||
#include "base/strings/string16.h"
|
||||
#include "base/time/time.h"
|
||||
#include "base/values.h"
|
||||
#include "build/build_config.h"
|
||||
#include "pdf/document_layout.h"
|
||||
#include "ppapi/c/dev/pp_cursor_type_dev.h"
|
||||
@ -536,6 +537,12 @@ class PDFEngineExports {
|
||||
virtual base::Optional<bool> IsPDFDocTagged(
|
||||
base::span<const uint8_t> pdf_buffer) = 0;
|
||||
|
||||
// Given a tagged PDF (see IsPDFDocTagged, above), return the portion of
|
||||
// the structure tree for a given page as a hierarchical tree of base::Values.
|
||||
virtual base::Value GetPDFStructTreeForPage(
|
||||
base::span<const uint8_t> pdf_buffer,
|
||||
int page_index) = 0;
|
||||
|
||||
// See the definition of GetPDFPageSizeByIndex in pdf.cc for details.
|
||||
virtual bool GetPDFPageSizeByIndex(base::span<const uint8_t> pdf_buffer,
|
||||
int page_number,
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include "third_party/pdfium/public/cpp/fpdf_scopers.h"
|
||||
#include "third_party/pdfium/public/fpdf_catalog.h"
|
||||
#include "third_party/pdfium/public/fpdf_ppo.h"
|
||||
#include "third_party/pdfium/public/fpdf_structtree.h"
|
||||
#include "third_party/pdfium/public/fpdfview.h"
|
||||
#include "ui/gfx/geometry/rect.h"
|
||||
#include "ui/gfx/geometry/size.h"
|
||||
@ -129,6 +130,41 @@ bool IsValidPrintableArea(const gfx::Size& page_size,
|
||||
printable_area.bottom() <= page_size.height();
|
||||
}
|
||||
|
||||
// Converts |struct_elem| and its descendants into a base::Value dictionary of
// the form {"type": <element type>, "~children": [<child dictionaries>]}.
// The "~children" key is omitted when no child could be converted. Returns a
// NONE value when the type of |struct_elem| cannot be read.
base::Value RecursiveGetStructTree(FPDF_STRUCTELEMENT struct_elem) {
  constexpr int kBufLen = 64;
  base::char16 elem_type_buffer[kBufLen];
  // FPDF_StructElement_GetType() returns the number of bytes the type string
  // requires (terminator included) regardless of the buffer size, and leaves
  // the buffer unmodified when it is too small. Reject both "no type" and
  // "type does not fit" so the buffer is only read after it has been written.
  const unsigned long elem_type_bytes = FPDF_StructElement_GetType(
      struct_elem, elem_type_buffer, sizeof(elem_type_buffer));
  if (elem_type_bytes == 0 || elem_type_bytes > sizeof(elem_type_buffer))
    return base::Value(base::Value::Type::NONE);

  base::Value result(base::Value::Type::DICTIONARY);
  base::string16 elem_type(elem_type_buffer);
  result.SetStringKey("type", elem_type);

  // A leaf element is still a valid node: return its dictionary (type only)
  // rather than NONE, otherwise the caller would drop it from the tree.
  int children_count = FPDF_StructElement_CountChildren(struct_elem);
  if (children_count == 0)
    return result;

  base::Value children(base::Value::Type::LIST);
  for (int i = 0; i < children_count; i++) {
    FPDF_STRUCTELEMENT child_elem =
        FPDF_StructElement_GetChildAtIndex(struct_elem, i);

    // Children that could not be converted come back as NONE and are skipped.
    base::Value child = RecursiveGetStructTree(child_elem);
    if (child.is_dict())
      children.Append(std::move(child));
  }

  // use "~children" instead of "children" because we pretty-print the
  // result of this as JSON and the keys are sorted; it's much easier to
  // understand when the children are the last key.
  if (!children.GetList().empty())
    result.SetKey("~children", std::move(children));

  return result;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
PDFEngineExports::RenderingSettings::RenderingSettings(int dpi_x,
|
||||
@ -362,6 +398,34 @@ base::Optional<bool> PDFiumEngineExports::IsPDFDocTagged(
|
||||
return FPDFCatalog_IsTagged(doc.get());
|
||||
}
|
||||
|
||||
// Loads the document in |pdf_buffer| and converts the structure tree of the
// page at |page_index| into nested base::Value dictionaries. A NONE value is
// returned when the document, the page or the structure tree cannot be
// obtained, or when the tree does not have exactly one root element.
base::Value PDFiumEngineExports::GetPDFStructTreeForPage(
    base::span<const uint8_t> pdf_buffer,
    int page_index) {
  ScopedFPDFDocument document = LoadPdfData(pdf_buffer);
  if (!document)
    return base::Value(base::Value::Type::NONE);

  ScopedFPDFPage pdf_page(FPDF_LoadPage(document.get(), page_index));
  if (!pdf_page)
    return base::Value(base::Value::Type::NONE);

  ScopedFPDFStructTree tree(FPDF_StructTree_GetForPage(pdf_page.get()));
  if (!tree)
    return base::Value(base::Value::Type::NONE);

  // The tree is expected to hang off a single root node; anything else is
  // treated as a failure.
  if (FPDF_StructTree_CountChildren(tree.get()) != 1)
    return base::Value(base::Value::Type::NONE);

  FPDF_STRUCTELEMENT root_elem =
      FPDF_StructTree_GetChildAtIndex(tree.get(), 0);
  return root_elem ? RecursiveGetStructTree(root_elem)
                   : base::Value(base::Value::Type::NONE);
}
|
||||
|
||||
bool PDFiumEngineExports::GetPDFPageSizeByIndex(
|
||||
base::span<const uint8_t> pdf_buffer,
|
||||
int page_number,
|
||||
|
@ -53,6 +53,8 @@ class PDFiumEngineExports : public PDFEngineExports {
|
||||
double* max_page_width) override;
|
||||
base::Optional<bool> IsPDFDocTagged(
|
||||
base::span<const uint8_t> pdf_buffer) override;
|
||||
base::Value GetPDFStructTreeForPage(base::span<const uint8_t> pdf_buffer,
|
||||
int page_index) override;
|
||||
bool GetPDFPageSizeByIndex(base::span<const uint8_t> pdf_buffer,
|
||||
int page_number,
|
||||
double* width,
|
||||
|
Reference in New Issue
Block a user