Improve how PaidContent checks for microdata annotations.
Check for <meta> elements while checking each element instead of using QuerySelectorAll, which can be expensive and may invalidate style and layout. Also add more cases to the unit tests to increase paid_content code coverage. Change-Id: I76ae9ed65aa7e9647c650ee99bb8d51444e03e04 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6431533 Reviewed-by: Abigail Klein <abigailbklein@google.com> Reviewed-by: Khushal Sagar <khushalsagar@chromium.org> Commit-Queue: Gary Klassen <gklassen@chromium.org> Cr-Commit-Position: refs/heads/main@{#1443880}
This commit is contained in:

committed by
Chromium LUCI CQ

parent
7b491de370
commit
16288d346e
third_party/blink/renderer/modules/content_extraction
135
third_party/blink/renderer/modules/content_extraction/ai_page_content_agent_unittest.cc
vendored
135
third_party/blink/renderer/modules/content_extraction/ai_page_content_agent_unittest.cc
vendored
@ -2437,10 +2437,15 @@ bool ContainsRole(const Vector<mojom::blink::AIPageContentAnnotatedRole>& roles,
|
||||
TEST_F(AIPageContentAgentTest, PaidContent) {
|
||||
frame_test_helpers::LoadHTMLString(
|
||||
helper_.LocalMainFrame(), R"HTML(
|
||||
<head>
|
||||
<script></script>
|
||||
<script type='unrelated'></script>
|
||||
<script type="application/ld+json">{this: "will fail parsing",}</script>
|
||||
<script type="application/ld+json">"not": "an object"</script>
|
||||
<script type="application/ld+json">{
|
||||
"@context": "http://schema.org",
|
||||
"@type": "NewsArticle",
|
||||
"mainEntityOfPage": "https://www.evergreengazette.com/dailyplanet.com/world/world-news/",
|
||||
"mainEntityOfPage": "https://www.evergreengazette.com/world/world-news/",
|
||||
"headline": "City Council Debates Future of Automated Transit System",
|
||||
"alternativeHeadline": "City Council Debates Future of Automated Transit System",
|
||||
"dateModified": "2025-03-25T19:17:05.541Z",
|
||||
@ -2567,7 +2572,10 @@ TEST_F(AIPageContentAgentTest, PaidContentRootOnly) {
|
||||
<script type="application/ld+json">{
|
||||
"@context": "http://schema.org",
|
||||
"@type": "NewsArticle",
|
||||
"isAccessibleForFree": "False"
|
||||
"isAccessibleForFree": "False",
|
||||
"hasPart": {
|
||||
"@type": "unrelated"
|
||||
}
|
||||
}</script>
|
||||
<body>
|
||||
Content
|
||||
@ -2598,7 +2606,7 @@ TEST_F(AIPageContentAgentTest, PaidContentRootOnly) {
|
||||
mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
|
||||
}
|
||||
|
||||
TEST_F(AIPageContentAgentTest, DISABLED_PaidContentMicrodata) {
|
||||
TEST_F(AIPageContentAgentTest, PaidContentMicrodata) {
|
||||
frame_test_helpers::LoadHTMLString(
|
||||
helper_.LocalMainFrame(), R"HTML(
|
||||
<script type="application/ld+json">{
|
||||
@ -2609,8 +2617,12 @@ TEST_F(AIPageContentAgentTest, DISABLED_PaidContentMicrodata) {
|
||||
<body>
|
||||
Content
|
||||
<div class="paidContent">
|
||||
<meta itemprop="isAccessibleForFree" content="false">
|
||||
Paid Content
|
||||
<meta itemprop="isAccessibleForFree" content="false">
|
||||
Paid Content
|
||||
</div>
|
||||
<div class="paidContent">
|
||||
<meta itemprop="unrelated">
|
||||
Content
|
||||
</div>
|
||||
</body>
|
||||
)HTML",
|
||||
@ -2796,6 +2808,119 @@ TEST_F(AIPageContentAgentTest, PaidContentSubframe) {
|
||||
mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
|
||||
}
|
||||
|
||||
// Exercises paid-content detection when microdata meta annotations
// (itemprop isAccessibleForFree with content false) appear in the main frame
// and in three srcdoc iframes. The expectations below accept the microdata
// annotation as paid content only inside the iframes whose own ld+json block
// sets isAccessibleForFree to false. The main frame (ld+json value true) and
// the iframe without any ld+json block are expected to report no paid content.
TEST_F(AIPageContentAgentTest, PaidContentSubframeMicrodata) {
|
||||
frame_test_helpers::LoadHTMLString(
|
||||
helper_.LocalMainFrame(), R"HTML(
|
||||
<script type="application/ld+json">{
|
||||
"@context": "https://schema.org",
|
||||
"@type": "NewsArticle",
|
||||
"isAccessibleForFree": true
|
||||
}</script>
|
||||
<body>
|
||||
Free Content
|
||||
<div class="paidContent">
|
||||
<meta itemprop="isAccessibleForFree" content="false">
|
||||
Microdata not checked
|
||||
</div>
|
||||
<iframe srcdoc='
|
||||
<script type="application/ld+json">{
|
||||
"@context": "http://schema.org",
|
||||
"@type": "NewsArticle",
|
||||
"isAccessibleForFree": false
|
||||
}</script>
|
||||
<body>
|
||||
Content
|
||||
<div class="paidContent">
|
||||
<meta itemprop="isAccessibleForFree" content="false">
|
||||
Paid Content
|
||||
</div>
|
||||
</body>
|
||||
'></iframe>
|
||||
<iframe srcdoc='
|
||||
<body>
|
||||
Content
|
||||
<div class="paidContent">
|
||||
<meta itemprop="isAccessibleForFree" content="false">
|
||||
Microdata not checked
|
||||
</div>
|
||||
</body>
|
||||
'></iframe>
|
||||
<iframe srcdoc='
|
||||
<script type="application/ld+json">{
|
||||
"@context": "http://schema.org",
|
||||
"@type": "NewsArticle",
|
||||
"isAccessibleForFree": false
|
||||
}</script>
|
||||
<body>
|
||||
Content
|
||||
<div class="paidContent">
|
||||
<meta itemprop="isAccessibleForFree" content="false">
|
||||
Paid Content
|
||||
</div>
|
||||
</body>
|
||||
'></iframe>
|
||||
</body>
|
||||
)HTML",
|
||||
url_test_helpers::ToKURL("http://foobar.com"));
|
||||
|
||||
// Extraction must succeed and yield a root node before the tree is inspected.
auto content = GetAIPageContent();
|
||||
ASSERT_TRUE(content);
|
||||
ASSERT_TRUE(content->root_node);
|
||||
|
||||
// The root node does not contain paid content.
|
||||
EXPECT_FALSE(content->frame_data->contains_paid_content);
|
||||
|
||||
const auto& root = *content->root_node;
|
||||
auto& nodes = root.children_nodes;
|
||||
|
||||
// The first two root children must not carry the paid-content role.
// NOTE(review): presumably these are the Free Content text and the main-frame
// paidContent div; confirm against the extracted tree shape.
EXPECT_FALSE(ContainsRole(nodes[0]->content_attributes->annotated_roles,
|
||||
mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
|
||||
EXPECT_FALSE(ContainsRole(nodes[1]->content_attributes->annotated_roles,
|
||||
mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
|
||||
|
||||
// First iframe: its ld+json sets isAccessibleForFree to false and its div has
// the microdata meta, so the frame is flagged and its second child is paid.
const auto& iframe1 = nodes[2];
|
||||
EXPECT_EQ(iframe1->content_attributes->attribute_type,
|
||||
mojom::blink::AIPageContentAttributeType::kIframe);
|
||||
EXPECT_TRUE(iframe1->content_attributes->iframe_data->local_frame_data
|
||||
->contains_paid_content);
|
||||
|
||||
const auto& children1 = iframe1->children_nodes[0]->children_nodes;
|
||||
EXPECT_FALSE(
|
||||
ContainsRole(children1[0]->content_attributes->annotated_roles,
|
||||
mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
|
||||
EXPECT_TRUE(
|
||||
ContainsRole(children1[1]->content_attributes->annotated_roles,
|
||||
mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
|
||||
|
||||
// Second iframe: no ld+json block at all, so the microdata meta on its own
// must not flag the frame or either of the two checked children.
const auto& iframe2 = nodes[3];
|
||||
EXPECT_EQ(iframe2->content_attributes->attribute_type,
|
||||
mojom::blink::AIPageContentAttributeType::kIframe);
|
||||
EXPECT_FALSE(iframe2->content_attributes->iframe_data->local_frame_data
|
||||
->contains_paid_content);
|
||||
|
||||
const auto& children2 = iframe2->children_nodes[0]->children_nodes;
|
||||
EXPECT_FALSE(
|
||||
ContainsRole(children2[0]->content_attributes->annotated_roles,
|
||||
mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
|
||||
EXPECT_FALSE(
|
||||
ContainsRole(children2[1]->content_attributes->annotated_roles,
|
||||
mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
|
||||
|
||||
// Third iframe: same markup as the first iframe, with the same expectations.
const auto& iframe3 = nodes[4];
|
||||
EXPECT_EQ(iframe3->content_attributes->attribute_type,
|
||||
mojom::blink::AIPageContentAttributeType::kIframe);
|
||||
EXPECT_TRUE(iframe3->content_attributes->iframe_data->local_frame_data
|
||||
->contains_paid_content);
|
||||
|
||||
const auto& children3 = iframe3->children_nodes[0]->children_nodes;
|
||||
EXPECT_FALSE(
|
||||
ContainsRole(children3[0]->content_attributes->annotated_roles,
|
||||
mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
|
||||
EXPECT_TRUE(
|
||||
ContainsRole(children3[1]->content_attributes->annotated_roles,
|
||||
mojom::blink::AIPageContentAnnotatedRole::kPaidContent));
|
||||
}
|
||||
|
||||
void CheckMatchesNode(
|
||||
const mojom::blink::AIPageContentHitTestNode& hit_test_node,
|
||||
const mojom::blink::AIPageContentNode& node) {
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include "third_party/blink/renderer/core/dom/document.h"
|
||||
#include "third_party/blink/renderer/core/dom/element.h"
|
||||
#include "third_party/blink/renderer/core/dom/static_node_list.h"
|
||||
#include "third_party/blink/renderer/core/html/html_meta_element.h"
|
||||
#include "third_party/blink/renderer/core/html/html_head_element.h"
|
||||
#include "third_party/blink/renderer/core/html/html_script_element.h"
|
||||
#include "third_party/blink/renderer/platform/json/json_parser.h"
|
||||
@ -16,6 +17,9 @@
|
||||
|
||||
namespace blink {
|
||||
namespace {
|
||||
|
||||
const char kIsAccessibleForFree[] = "isAccessibleForFree";
|
||||
|
||||
bool ObjectValuePresentAndEquals(const JSONObject* object,
|
||||
const String& key,
|
||||
const String& value) {
|
||||
@ -70,6 +74,17 @@ bool ObjectValuePresentAndFalse(const JSONObject* object, const String& key) {
|
||||
} // namespace
|
||||
|
||||
bool PaidContent::IsPaidElement(const Element* element) const {
|
||||
auto* document = &element->GetDocument();
|
||||
if (check_microdata_.Contains(document) && check_microdata_.at(document)) {
|
||||
for (HTMLMetaElement& meta_element :
|
||||
Traversal<HTMLMetaElement>::ChildrenOf(*element)) {
|
||||
auto itemprop = meta_element.FastGetAttribute(html_names::kItempropAttr);
|
||||
if (itemprop.GetString() != kIsAccessibleForFree) {
|
||||
continue;
|
||||
}
|
||||
return meta_element.Content() == "false";
|
||||
}
|
||||
}
|
||||
for (const auto& paid_element : paid_elements_) {
|
||||
if (element == paid_element) {
|
||||
return true;
|
||||
@ -115,11 +130,13 @@ bool PaidContent::QueryPaidElements(Document& document) {
|
||||
// and WebPage. Multiple types are supported.
|
||||
|
||||
// check for isAccessibleForFree=false
|
||||
if (!ObjectValuePresentAndFalse(script_obj, "isAccessibleForFree")) {
|
||||
if (!ObjectValuePresentAndFalse(script_obj, kIsAccessibleForFree)) {
|
||||
continue;
|
||||
}
|
||||
paid_content_present = true;
|
||||
|
||||
bool has_part_found = false;
|
||||
|
||||
// Check for hasPart with isAccessibleForFree=false and a cssSelector
|
||||
JSONValue* hasPart_val = script_obj->Get("hasPart");
|
||||
if (hasPart_val) {
|
||||
@ -130,23 +147,30 @@ bool PaidContent::QueryPaidElements(Document& document) {
|
||||
JSONValue* hasPart_obj_val = hasPart_array->at(j);
|
||||
if (hasPart_obj_val->GetType() == JSONValue::kTypeObject) {
|
||||
JSONObject* hasPart_obj = JSONObject::Cast(hasPart_obj_val);
|
||||
AppendHasPartElements(document, *hasPart_obj);
|
||||
has_part_found |= AppendHasPartElements(document, *hasPart_obj);
|
||||
}
|
||||
}
|
||||
} else if (hasPart_type == JSONValue::kTypeObject) {
|
||||
JSONObject* hasPart_obj = JSONObject::Cast(hasPart_val);
|
||||
AppendHasPartElements(document, *hasPart_obj);
|
||||
has_part_found |= AppendHasPartElements(document, *hasPart_obj);
|
||||
}
|
||||
}
|
||||
|
||||
// Assume that pages will only use either ld+json or microdata.
|
||||
// If ld+json hasPart exists, don't check for microdata to save
|
||||
// the cost of checking each element.
|
||||
if (!has_part_found) {
|
||||
check_microdata_.Set(&document, true);
|
||||
}
|
||||
return paid_content_present;
|
||||
}
|
||||
return paid_content_present;
|
||||
}
|
||||
|
||||
void PaidContent::AppendHasPartElements(Document& document,
|
||||
bool PaidContent::AppendHasPartElements(Document& document,
|
||||
JSONObject& hasPart_obj) {
|
||||
if (ObjectValuePresentAndEquals(&hasPart_obj, "@type", "WebPageElement") &&
|
||||
ObjectValuePresentAndFalse(&hasPart_obj, "isAccessibleForFree")) {
|
||||
ObjectValuePresentAndFalse(&hasPart_obj, kIsAccessibleForFree)) {
|
||||
JSONValue* selector_val = hasPart_obj.Get("cssSelector");
|
||||
if (selector_val && selector_val->GetType() == JSONValue::kTypeString) {
|
||||
String selector;
|
||||
@ -158,8 +182,10 @@ void PaidContent::AppendHasPartElements(Document& document,
|
||||
paid_elements_.push_back(elements->item(j));
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace blink
|
||||
|
@ -25,8 +25,11 @@ class PaidContent final {
|
||||
bool IsPaidElement(const Element* element) const;
|
||||
|
||||
private:
|
||||
// Whether to check for microdata annotations while walking.
|
||||
HeapHashMap<WeakMember<Document>, bool> check_microdata_;
|
||||
|
||||
// Appends elements found by the cssSelector in the hasPart object.
|
||||
void AppendHasPartElements(Document& document, JSONObject& hasPart_obj);
|
||||
bool AppendHasPartElements(Document& document, JSONObject& hasPart_obj);
|
||||
|
||||
// List of nodes marked as isAccessibleForFree=false.
|
||||
HeapVector<Member<Element>> paid_elements_;
|
||||
|
Reference in New Issue
Block a user