0

Improve how PaidContent checks for microdata annotations.

Check for <meta> elements while checking each element instead of using
QuerySelectorAll which can be expensive and may invalidate style and
layout.

Also add more cases to unit tests to increase paid_content code coverage.

Change-Id: I76ae9ed65aa7e9647c650ee99bb8d51444e03e04
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6431533
Reviewed-by: Abigail Klein <abigailbklein@google.com>
Reviewed-by: Khushal Sagar <khushalsagar@chromium.org>
Commit-Queue: Gary Klassen <gklassen@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1443880}
This commit is contained in:
Gary Klassen
2025-04-07 18:35:27 -07:00
committed by Chromium LUCI CQ
parent 7b491de370
commit 16288d346e
3 changed files with 165 additions and 11 deletions
third_party/blink/renderer/modules/content_extraction

@ -2437,10 +2437,15 @@ bool ContainsRole(const Vector<mojom::blink::AIPageContentAnnotatedRole>& roles,
TEST_F(AIPageContentAgentTest, PaidContent) {
frame_test_helpers::LoadHTMLString(
helper_.LocalMainFrame(), R"HTML(
<head>
<script></script>
<script type='unrelated'></script>
<script type="application/ld+json">{this: "will fail parsing",}</script>
<script type="application/ld+json">"not": "an object"</script>
<script type="application/ld+json">{
"@context": "http://schema.org",
"@type": "NewsArticle",
"mainEntityOfPage": "https://www.evergreengazette.com/dailyplanet.com/world/world-news/",
"mainEntityOfPage": "https://www.evergreengazette.com/world/world-news/",
"headline": "City Council Debates Future of Automated Transit System",
"alternativeHeadline": "City Council Debates Future of Automated Transit System",
"dateModified": "2025-03-25T19:17:05.541Z",
@ -2567,7 +2572,10 @@ TEST_F(AIPageContentAgentTest, PaidContentRootOnly) {
<script type="application/ld+json">{
"@context": "http://schema.org",
"@type": "NewsArticle",
"isAccessibleForFree": "False"
"isAccessibleForFree": "False",
"hasPart": {
"@type": "unrelated"
}
}</script>
<body>
Content
@ -2598,7 +2606,7 @@ TEST_F(AIPageContentAgentTest, PaidContentRootOnly) {
mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
}
TEST_F(AIPageContentAgentTest, DISABLED_PaidContentMicrodata) {
TEST_F(AIPageContentAgentTest, PaidContentMicrodata) {
frame_test_helpers::LoadHTMLString(
helper_.LocalMainFrame(), R"HTML(
<script type="application/ld+json">{
@ -2609,8 +2617,12 @@ TEST_F(AIPageContentAgentTest, DISABLED_PaidContentMicrodata) {
<body>
Content
<div class="paidContent">
<meta itemprop="isAccessibleForFree" content="false">
Paid Content
<meta itemprop="isAccessibleForFree" content="false">
Paid Content
</div>
<div class="paidContent">
<meta itemprop="unrelated">
Content
</div>
</body>
)HTML",
@ -2796,6 +2808,119 @@ TEST_F(AIPageContentAgentTest, PaidContentSubframe) {
mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
}
// Exercises microdata-based paid content detection across several frames.
//
// The fixture contains:
//   * a main frame whose ld+json sets "isAccessibleForFree": true and which
//     also carries a <meta itemprop="isAccessibleForFree" content="false">;
//   * iframe 1 and iframe 3, whose ld+json sets "isAccessibleForFree": false
//     and which carry the same <meta> annotation;
//   * iframe 2, which has the <meta> annotation but no ld+json at all.
//
// Expected outcome: the <meta> annotation only produces a kPaidContent role
// in the two iframes whose own ld+json declares isAccessibleForFree=false;
// the main frame and iframe 2 report no paid content.
TEST_F(AIPageContentAgentTest, PaidContentSubframeMicrodata) {
  frame_test_helpers::LoadHTMLString(
      helper_.LocalMainFrame(), R"HTML(
<script type="application/ld+json">{
"@context": "https://schema.org",
"@type": "NewsArticle",
"isAccessibleForFree": true
}</script>
<body>
Free Content
<div class="paidContent">
<meta itemprop="isAccessibleForFree" content="false">
Microdata not checked
</div>
<iframe srcdoc='
<script type="application/ld+json">{
"@context": "http://schema.org",
"@type": "NewsArticle",
"isAccessibleForFree": false
}</script>
<body>
Content
<div class="paidContent">
<meta itemprop="isAccessibleForFree" content="false">
Paid Content
</div>
</body>
'></iframe>
<iframe srcdoc='
<body>
Content
<div class="paidContent">
<meta itemprop="isAccessibleForFree" content="false">
Microdata not checked
</div>
</body>
'></iframe>
<iframe srcdoc='
<script type="application/ld+json">{
"@context": "http://schema.org",
"@type": "NewsArticle",
"isAccessibleForFree": false
}</script>
<body>
Content
<div class="paidContent">
<meta itemprop="isAccessibleForFree" content="false">
Paid Content
</div>
</body>
'></iframe>
</body>
)HTML",
      url_test_helpers::ToKURL("http://foobar.com"));

  auto content = GetAIPageContent();
  ASSERT_TRUE(content);
  ASSERT_TRUE(content->root_node);

  // The root node does not contain paid content.
  EXPECT_FALSE(content->frame_data->contains_paid_content);

  const auto& root = *content->root_node;
  const auto& nodes = root.children_nodes;
  // Two main-frame children followed by the three iframes are indexed below;
  // bail out cleanly instead of indexing out of bounds if extraction changes.
  ASSERT_GE(nodes.size(), 5u);

  // Main-frame content ahead of the iframes is never marked as paid.
  EXPECT_FALSE(ContainsRole(nodes[0]->content_attributes->annotated_roles,
                            mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
  EXPECT_FALSE(ContainsRole(nodes[1]->content_attributes->annotated_roles,
                            mojom::blink::AIPageContentAnnotatedRole::kPaidContent));

  // First iframe: ld+json says not free, so the microdata-annotated child is
  // reported as paid content.
  const auto& iframe1 = nodes[2];
  EXPECT_EQ(iframe1->content_attributes->attribute_type,
            mojom::blink::AIPageContentAttributeType::kIframe);
  EXPECT_TRUE(iframe1->content_attributes->iframe_data->local_frame_data
                  ->contains_paid_content);
  ASSERT_FALSE(iframe1->children_nodes.empty());
  const auto& children1 = iframe1->children_nodes[0]->children_nodes;
  ASSERT_GE(children1.size(), 2u);
  EXPECT_FALSE(
      ContainsRole(children1[0]->content_attributes->annotated_roles,
                   mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
  EXPECT_TRUE(
      ContainsRole(children1[1]->content_attributes->annotated_roles,
                   mojom::blink::AIPageContentAnnotatedRole::kPaidContent));

  // Second iframe: no ld+json, so the <meta> annotation is not honored.
  const auto& iframe2 = nodes[3];
  EXPECT_EQ(iframe2->content_attributes->attribute_type,
            mojom::blink::AIPageContentAttributeType::kIframe);
  EXPECT_FALSE(iframe2->content_attributes->iframe_data->local_frame_data
                   ->contains_paid_content);
  ASSERT_FALSE(iframe2->children_nodes.empty());
  const auto& children2 = iframe2->children_nodes[0]->children_nodes;
  ASSERT_GE(children2.size(), 2u);
  EXPECT_FALSE(
      ContainsRole(children2[0]->content_attributes->annotated_roles,
                   mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
  EXPECT_FALSE(
      ContainsRole(children2[1]->content_attributes->annotated_roles,
                   mojom::blink::AIPageContentAnnotatedRole::kPaidContent));

  // Third iframe: same markup as the first one; it must still be detected
  // after a frame without paid content was processed in between.
  const auto& iframe3 = nodes[4];
  EXPECT_EQ(iframe3->content_attributes->attribute_type,
            mojom::blink::AIPageContentAttributeType::kIframe);
  EXPECT_TRUE(iframe3->content_attributes->iframe_data->local_frame_data
                  ->contains_paid_content);
  ASSERT_FALSE(iframe3->children_nodes.empty());
  const auto& children3 = iframe3->children_nodes[0]->children_nodes;
  ASSERT_GE(children3.size(), 2u);
  EXPECT_FALSE(
      ContainsRole(children3[0]->content_attributes->annotated_roles,
                   mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
  EXPECT_TRUE(
      ContainsRole(children3[1]->content_attributes->annotated_roles,
                   mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
}
void CheckMatchesNode(
const mojom::blink::AIPageContentHitTestNode& hit_test_node,
const mojom::blink::AIPageContentNode& node) {

@ -8,6 +8,7 @@
#include "third_party/blink/renderer/core/dom/document.h"
#include "third_party/blink/renderer/core/dom/element.h"
#include "third_party/blink/renderer/core/dom/static_node_list.h"
#include "third_party/blink/renderer/core/html/html_meta_element.h"
#include "third_party/blink/renderer/core/html/html_head_element.h"
#include "third_party/blink/renderer/core/html/html_script_element.h"
#include "third_party/blink/renderer/platform/json/json_parser.h"
@ -16,6 +17,9 @@
namespace blink {
namespace {
const char kIsAccessibleForFree[] = "isAccessibleForFree";
bool ObjectValuePresentAndEquals(const JSONObject* object,
const String& key,
const String& value) {
@ -70,6 +74,17 @@ bool ObjectValuePresentAndFalse(const JSONObject* object, const String& key) {
} // namespace
bool PaidContent::IsPaidElement(const Element* element) const {
auto* document = &element->GetDocument();
if (check_microdata_.Contains(document) && check_microdata_.at(document)) {
for (HTMLMetaElement& meta_element :
Traversal<HTMLMetaElement>::ChildrenOf(*element)) {
auto itemprop = meta_element.FastGetAttribute(html_names::kItempropAttr);
if (itemprop.GetString() != kIsAccessibleForFree) {
continue;
}
return meta_element.Content() == "false";
}
}
for (const auto& paid_element : paid_elements_) {
if (element == paid_element) {
return true;
@ -115,11 +130,13 @@ bool PaidContent::QueryPaidElements(Document& document) {
// and WebPage. Multiple types are supported.
// check for isAccessibleForFree=false
if (!ObjectValuePresentAndFalse(script_obj, "isAccessibleForFree")) {
if (!ObjectValuePresentAndFalse(script_obj, kIsAccessibleForFree)) {
continue;
}
paid_content_present = true;
bool has_part_found = false;
// Check for hasPart with isAccessibleForFree=false and a cssSelector
JSONValue* hasPart_val = script_obj->Get("hasPart");
if (hasPart_val) {
@ -130,23 +147,30 @@ bool PaidContent::QueryPaidElements(Document& document) {
JSONValue* hasPart_obj_val = hasPart_array->at(j);
if (hasPart_obj_val->GetType() == JSONValue::kTypeObject) {
JSONObject* hasPart_obj = JSONObject::Cast(hasPart_obj_val);
AppendHasPartElements(document, *hasPart_obj);
has_part_found |= AppendHasPartElements(document, *hasPart_obj);
}
}
} else if (hasPart_type == JSONValue::kTypeObject) {
JSONObject* hasPart_obj = JSONObject::Cast(hasPart_val);
AppendHasPartElements(document, *hasPart_obj);
has_part_found |= AppendHasPartElements(document, *hasPart_obj);
}
}
// Assume that pages will only use either ld+json or microdata.
// If ld+json hasPart exists, don't check for microdata to save
// the cost of checking each element.
if (!has_part_found) {
check_microdata_.Set(&document, true);
}
return paid_content_present;
}
return paid_content_present;
}
void PaidContent::AppendHasPartElements(Document& document,
bool PaidContent::AppendHasPartElements(Document& document,
JSONObject& hasPart_obj) {
if (ObjectValuePresentAndEquals(&hasPart_obj, "@type", "WebPageElement") &&
ObjectValuePresentAndFalse(&hasPart_obj, "isAccessibleForFree")) {
ObjectValuePresentAndFalse(&hasPart_obj, kIsAccessibleForFree)) {
JSONValue* selector_val = hasPart_obj.Get("cssSelector");
if (selector_val && selector_val->GetType() == JSONValue::kTypeString) {
String selector;
@ -158,8 +182,10 @@ void PaidContent::AppendHasPartElements(Document& document,
paid_elements_.push_back(elements->item(j));
}
}
return true;
}
}
return false;
}
} // namespace blink

@ -25,8 +25,11 @@ class PaidContent final {
bool IsPaidElement(const Element* element) const;
private:
// Whether to check for microdata annotations while walking.
HeapHashMap<WeakMember<Document>, bool> check_microdata_;
// Appends elements found by the cssSelector in the hasPart object.
void AppendHasPartElements(Document& document, JSONObject& hasPart_obj);
bool AppendHasPartElements(Document& document, JSONObject& hasPart_obj);
// List of nodes marked as isAccessibleForFree=false.
HeapVector<Member<Element>> paid_elements_;