
Done entirely with `git grep` and `sed` + `git cl format`, no hand-editing. Bug: 386918226 Change-Id: I7377af2f9c3758c68a249b421d98bd3fd5c2c1fd Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6201377 Auto-Submit: Peter Kasting <pkasting@chromium.org> Reviewed-by: Ted Choc <tedchoc@chromium.org> Commit-Queue: Peter Kasting <pkasting@chromium.org> Cr-Commit-Position: refs/heads/main@{#1411730}
1015 lines
37 KiB
C++
1015 lines
37 KiB
C++
// Copyright 2017 The Chromium Authors
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file.
|
|
|
|
#include "components/url_pattern_index/url_pattern_index.h"
|
|
|
|
#include <algorithm>
|
|
#include <limits>
|
|
#include <string>
|
|
#include <string_view>
|
|
#include <utility>
|
|
|
|
#include "base/check_op.h"
|
|
#include "base/containers/contains.h"
|
|
#include "base/containers/flat_map.h"
|
|
#include "base/functional/callback.h"
|
|
#include "base/memory/raw_ref.h"
|
|
#include "base/no_destructor.h"
|
|
#include "base/not_fatal_until.h"
|
|
#include "base/notreached.h"
|
|
#include "base/numerics/safe_conversions.h"
|
|
#include "base/strings/string_util.h"
|
|
#include "base/trace_event/trace_event.h"
|
|
#include "components/url_pattern_index/ngram_extractor.h"
|
|
#include "components/url_pattern_index/url_pattern.h"
|
|
#include "components/url_pattern_index/url_rule_util.h"
|
|
#include "url/gurl.h"
|
|
#include "url/origin.h"
|
|
#include "url/url_constants.h"
|
|
#include "url/url_util.h"
|
|
|
|
namespace url_pattern_index {
|
|
|
|
namespace {
|
|
|
|
using FlatUrlRuleList = flatbuffers::Vector<flatbuffers::Offset<flat::UrlRule>>;
|
|
|
|
using ActivationTypeMap =
|
|
base::flat_map<proto::ActivationType, flat::ActivationType>;
|
|
using ElementTypeMap = base::flat_map<proto::ElementType, flat::ElementType>;
|
|
|
|
// Maps proto::ActivationType to flat::ActivationType.
|
|
const ActivationTypeMap& GetActivationTypeMap() {
|
|
static base::NoDestructor<ActivationTypeMap> activation_type_map(
|
|
std::initializer_list<ActivationTypeMap::value_type>{
|
|
{proto::ACTIVATION_TYPE_UNSPECIFIED, flat::ActivationType_NONE},
|
|
{proto::ACTIVATION_TYPE_DOCUMENT, flat::ActivationType_DOCUMENT},
|
|
// ELEMHIDE is not supported.
|
|
{proto::ACTIVATION_TYPE_ELEMHIDE, flat::ActivationType_NONE},
|
|
// GENERICHIDE is not supported.
|
|
{proto::ACTIVATION_TYPE_GENERICHIDE, flat::ActivationType_NONE},
|
|
{proto::ACTIVATION_TYPE_GENERICBLOCK,
|
|
flat::ActivationType_GENERIC_BLOCK},
|
|
});
|
|
return *activation_type_map;
|
|
}
|
|
|
|
// Maps proto::ElementType to flat::ElementType.
|
|
const ElementTypeMap& GetElementTypeMap() {
|
|
static base::NoDestructor<ElementTypeMap> element_type_map(
|
|
std::initializer_list<ElementTypeMap::value_type>{
|
|
{proto::ELEMENT_TYPE_UNSPECIFIED, flat::ElementType_NONE},
|
|
{proto::ELEMENT_TYPE_OTHER, flat::ElementType_OTHER},
|
|
{proto::ELEMENT_TYPE_SCRIPT, flat::ElementType_SCRIPT},
|
|
{proto::ELEMENT_TYPE_IMAGE, flat::ElementType_IMAGE},
|
|
{proto::ELEMENT_TYPE_STYLESHEET, flat::ElementType_STYLESHEET},
|
|
{proto::ELEMENT_TYPE_OBJECT, flat::ElementType_OBJECT},
|
|
{proto::ELEMENT_TYPE_XMLHTTPREQUEST,
|
|
flat::ElementType_XMLHTTPREQUEST},
|
|
{proto::ELEMENT_TYPE_OBJECT_SUBREQUEST,
|
|
flat::ElementType_OBJECT_SUBREQUEST},
|
|
{proto::ELEMENT_TYPE_SUBDOCUMENT, flat::ElementType_SUBDOCUMENT},
|
|
{proto::ELEMENT_TYPE_PING, flat::ElementType_PING},
|
|
{proto::ELEMENT_TYPE_MEDIA, flat::ElementType_MEDIA},
|
|
{proto::ELEMENT_TYPE_FONT, flat::ElementType_FONT},
|
|
// Filtering popups is not supported.
|
|
{proto::ELEMENT_TYPE_POPUP, flat::ElementType_NONE},
|
|
{proto::ELEMENT_TYPE_WEBSOCKET, flat::ElementType_WEBSOCKET},
|
|
{proto::ELEMENT_TYPE_WEBTRANSPORT, flat::ElementType_WEBTRANSPORT},
|
|
{proto::ELEMENT_TYPE_WEBBUNDLE, flat::ElementType_WEBBUNDLE},
|
|
});
|
|
return *element_type_map;
|
|
}
|
|
|
|
// Translates a single proto::ActivationType into its flat:: equivalent via
// GetActivationTypeMap(). The CHECK asserts that |type| has a table entry.
flat::ActivationType ProtoToFlatActivationType(proto::ActivationType type) {
  const ActivationTypeMap& type_map = GetActivationTypeMap();
  const auto entry = type_map.find(type);
  CHECK(entry != type_map.end(), base::NotFatalUntil::M130);
  return entry->second;
}
|
|
|
|
// Translates a single proto::ElementType into its flat:: equivalent via
// GetElementTypeMap(). The CHECK asserts that |type| has a table entry.
flat::ElementType ProtoToFlatElementType(proto::ElementType type) {
  const ElementTypeMap& type_map = GetElementTypeMap();
  const auto entry = type_map.find(type);
  CHECK(entry != type_map.end(), base::NotFatalUntil::M130);
  return entry->second;
}
|
|
|
|
// Wraps the characters of a FlatBuffers |string| in a non-owning
// std::string_view. |string| must not be null.
std::string_view ToStringView(const flatbuffers::String* string) {
  DCHECK(string);
  const char* const characters = string->c_str();
  const auto length = string->size();
  return std::string_view(characters, length);
}
|
|
|
|
bool HasNoUpperAscii(std::string_view string) {
|
|
return std::ranges::none_of(string, base::IsAsciiUpper<char>);
|
|
}
|
|
|
|
// Comparator to sort UrlRule. Sorts rules by descending order of rule priority.
|
|
bool UrlRuleDescendingPriorityComparator(const flat::UrlRule* lhs,
|
|
const flat::UrlRule* rhs) {
|
|
DCHECK(lhs);
|
|
DCHECK(rhs);
|
|
return lhs->priority() > rhs->priority();
|
|
}
|
|
|
|
// Returns the bitwise OR of every key stored in |map|. An empty |map| yields
// 0. The keys must be implicitly convertible to int.
template <typename T>
int GetKeysMask(const T& map) {
  int combined = 0;
  for (auto it = map.begin(); it != map.end(); ++it) {
    combined |= it->first;
  }
  return combined;
}
|
|
|
|
// Checks whether a URL |rule| can be converted to its FlatBuffers equivalent,
|
|
// and performs the actual conversion.
|
|
class UrlRuleFlatBufferConverter {
|
|
public:
|
|
// Creates the converter, and initializes |is_convertible| bit. If
|
|
// |is_convertible| == true, then all the fields, needed for serializing the
|
|
// |rule| to FlatBuffer, are initialized (|options|, |anchor_right|, etc.).
|
|
explicit UrlRuleFlatBufferConverter(const proto::UrlRule& rule)
|
|
: rule_(rule) {
|
|
is_convertible_ = InitializeOptions() && InitializeElementTypes() &&
|
|
InitializeActivationTypes() && InitializeUrlPattern() &&
|
|
IsMeaningful();
|
|
}
|
|
|
|
// Writes the URL |rule| to the FlatBuffer using the |builder|, and returns
|
|
// the offset to the serialized rule. Returns an empty offset in case the rule
|
|
// can't be converted. The conversion is not possible if the rule has
|
|
// attributes not supported by this client version.
|
|
//
|
|
// |domain_map| Should point to a non-nullptr map of domain vectors to their
|
|
// existing offsets. It is used to de-dupe domain vectors in the serialized
|
|
// rules.
|
|
UrlRuleOffset SerializeConvertedRule(flatbuffers::FlatBufferBuilder* builder,
|
|
FlatDomainMap* domain_map) const {
|
|
if (!is_convertible_)
|
|
return UrlRuleOffset();
|
|
|
|
DCHECK_NE(rule_->url_pattern_type(), proto::URL_PATTERN_TYPE_REGEXP);
|
|
|
|
FlatDomainsOffset initiator_domains_included_offset;
|
|
FlatDomainsOffset initiator_domains_excluded_offset;
|
|
FlatDomainsOffset request_domains_included_offset;
|
|
FlatDomainsOffset request_domains_excluded_offset;
|
|
|
|
if (!PopulateIncludedAndExcludedDomains(
|
|
rule_->initiator_domains_size(), rule_->initiator_domains(),
|
|
builder, domain_map, &initiator_domains_included_offset,
|
|
&initiator_domains_excluded_offset)) {
|
|
return UrlRuleOffset();
|
|
}
|
|
|
|
if (!PopulateIncludedAndExcludedDomains(
|
|
rule_->request_domains_size(), rule_->request_domains(), builder,
|
|
domain_map, &request_domains_included_offset,
|
|
&request_domains_excluded_offset)) {
|
|
return UrlRuleOffset();
|
|
}
|
|
|
|
// Non-ascii characters in patterns are unsupported.
|
|
if (!base::IsStringASCII(rule_->url_pattern()))
|
|
return UrlRuleOffset();
|
|
|
|
// TODO(crbug.com/41413799): Lower case case-insensitive patterns here if we
|
|
// want to support case-insensitive rules for subresource filter.
|
|
auto url_pattern_offset = builder->CreateSharedString(rule_->url_pattern());
|
|
|
|
return flat::CreateUrlRule(
|
|
*builder, options_, element_types_, flat::RequestMethod_ANY,
|
|
activation_types_, url_pattern_type_, anchor_left_, anchor_right_,
|
|
initiator_domains_included_offset, initiator_domains_excluded_offset,
|
|
request_domains_included_offset, request_domains_excluded_offset,
|
|
url_pattern_offset);
|
|
}
|
|
|
|
private:
|
|
FlatDomainsOffset SerializeDomainList(std::vector<FlatStringOffset> domains,
|
|
flatbuffers::FlatBufferBuilder* builder,
|
|
FlatDomainMap* domain_map) const {
|
|
// The comparator ensuring the domains order necessary for fast matching.
|
|
auto precedes = [&builder](FlatStringOffset lhs, FlatStringOffset rhs) {
|
|
return CompareDomains(
|
|
ToStringView(flatbuffers::GetTemporaryPointer(*builder, lhs)),
|
|
ToStringView(
|
|
flatbuffers::GetTemporaryPointer(*builder, rhs))) < 0;
|
|
};
|
|
if (domains.empty())
|
|
return FlatDomainsOffset();
|
|
std::sort(domains.begin(), domains.end(), precedes);
|
|
|
|
// Share domain lists if we've already serialized an exact duplicate. Note
|
|
// that this can share excluded and included domain lists.
|
|
DCHECK(domain_map);
|
|
auto it = domain_map->find(domains);
|
|
if (it == domain_map->end()) {
|
|
auto offset = builder->CreateVector(domains);
|
|
(*domain_map)[domains] = offset;
|
|
return offset;
|
|
}
|
|
return it->second;
|
|
}
|
|
|
|
// Returns true on success, false on an invalid domain entry.
|
|
bool PopulateIncludedAndExcludedDomains(
|
|
int domains_size,
|
|
google::protobuf::RepeatedPtrField<
|
|
::url_pattern_index::proto::DomainListItem> domain_list_items,
|
|
flatbuffers::FlatBufferBuilder* builder,
|
|
FlatDomainMap* domain_map,
|
|
FlatDomainsOffset* domains_included_offset,
|
|
FlatDomainsOffset* domains_excluded_offset) const {
|
|
if (domains_size == 0)
|
|
return true;
|
|
|
|
std::vector<FlatStringOffset> domains_included;
|
|
std::vector<FlatStringOffset> domains_excluded;
|
|
// Reserve only for `domains_included` because it is expected to
|
|
// be the one used more frequently.
|
|
domains_included.reserve(domains_size);
|
|
|
|
for (const auto& domain_list_item : domain_list_items) {
|
|
const std::string& domain = domain_list_item.domain();
|
|
|
|
// Non-ascii characters in domains are unsupported.
|
|
if (!base::IsStringASCII(domain))
|
|
return false;
|
|
|
|
// Note: This is not always correct. Chrome's URL parser uses upper-case
|
|
// for percent encoded hosts. E.g. https://,.com is encoded as
|
|
// https://%2C.com.
|
|
auto offset = builder->CreateSharedString(
|
|
HasNoUpperAscii(domain) ? domain : base::ToLowerASCII(domain));
|
|
|
|
if (domain_list_item.exclude())
|
|
domains_excluded.push_back(offset);
|
|
else
|
|
domains_included.push_back(offset);
|
|
}
|
|
// The domains are stored in sorted order to support fast matching.
|
|
*domains_included_offset =
|
|
SerializeDomainList(std::move(domains_included), builder, domain_map);
|
|
*domains_excluded_offset =
|
|
SerializeDomainList(std::move(domains_excluded), builder, domain_map);
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool ConvertAnchorType(proto::AnchorType anchor_type,
|
|
flat::AnchorType* result) {
|
|
switch (anchor_type) {
|
|
case proto::ANCHOR_TYPE_NONE:
|
|
*result = flat::AnchorType_NONE;
|
|
break;
|
|
case proto::ANCHOR_TYPE_BOUNDARY:
|
|
*result = flat::AnchorType_BOUNDARY;
|
|
break;
|
|
case proto::ANCHOR_TYPE_SUBDOMAIN:
|
|
*result = flat::AnchorType_SUBDOMAIN;
|
|
break;
|
|
default:
|
|
return false; // Unsupported anchor type.
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool InitializeOptions() {
|
|
static_assert(flat::OptionFlag_ANY <= std::numeric_limits<uint8_t>::max(),
|
|
"Option flags can not be stored in uint8_t.");
|
|
static_assert(
|
|
flat::RequestMethod_ANY <= std::numeric_limits<uint16_t>::max(),
|
|
"Request methods can not be stored in uint16_t.");
|
|
|
|
if (rule_->semantics() == proto::RULE_SEMANTICS_ALLOWLIST) {
|
|
options_ |= flat::OptionFlag_IS_ALLOWLIST;
|
|
} else if (rule_->semantics() != proto::RULE_SEMANTICS_BLOCKLIST) {
|
|
return false; // Unsupported semantics.
|
|
}
|
|
|
|
switch (rule_->source_type()) {
|
|
case proto::SOURCE_TYPE_ANY:
|
|
options_ |= flat::OptionFlag_APPLIES_TO_THIRD_PARTY;
|
|
[[fallthrough]];
|
|
case proto::SOURCE_TYPE_FIRST_PARTY:
|
|
options_ |= flat::OptionFlag_APPLIES_TO_FIRST_PARTY;
|
|
break;
|
|
case proto::SOURCE_TYPE_THIRD_PARTY:
|
|
options_ |= flat::OptionFlag_APPLIES_TO_THIRD_PARTY;
|
|
break;
|
|
|
|
default:
|
|
return false; // Unsupported source type.
|
|
}
|
|
|
|
// TODO(crbug.com/41413799): Consider setting IS_CASE_INSENSITIVE here if we
|
|
// want to support case insensitive rules for subresource_filter.
|
|
return true;
|
|
}
|
|
|
|
bool InitializeElementTypes() {
|
|
static_assert(flat::ElementType_ANY <= std::numeric_limits<uint16_t>::max(),
|
|
"Element types can not be stored in uint16_t.");
|
|
|
|
// Handle the default case. Note this means we end up adding
|
|
// flat::ElementType_CSP_REPORT as an element type when there is no
|
|
// corresponding proto::ElementType for it. However this should not matter
|
|
// in practice since subresource_filter does not do matching on CSP reports
|
|
// currently. If subresource_filter started to do so, add support for CSP
|
|
// reports in proto::ElementType.
|
|
if (rule_->element_types() == kDefaultProtoElementTypesMask) {
|
|
element_types_ = kDefaultFlatElementTypesMask;
|
|
return true;
|
|
}
|
|
|
|
const ElementTypeMap& element_type_map = GetElementTypeMap();
|
|
// Ensure all proto::ElementType(s) are mapped in |element_type_map|.
|
|
DCHECK_EQ(proto::ELEMENT_TYPE_ALL, GetKeysMask(element_type_map));
|
|
|
|
element_types_ = flat::ElementType_NONE;
|
|
|
|
for (const auto& pair : element_type_map)
|
|
if (rule_->element_types() & pair.first)
|
|
element_types_ |= pair.second;
|
|
|
|
// Normally we can not distinguish between the main plugin resource and any
|
|
// other loads it makes. We treat them both as OBJECT requests. Hence an
|
|
// OBJECT request would also match OBJECT_SUBREQUEST rules, but not the
|
|
// the other way round.
|
|
if (element_types_ & flat::ElementType_OBJECT_SUBREQUEST)
|
|
element_types_ |= flat::ElementType_OBJECT;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool InitializeActivationTypes() {
|
|
static_assert(
|
|
flat::ActivationType_ANY <= std::numeric_limits<uint8_t>::max(),
|
|
"Activation types can not be stored in uint8_t.");
|
|
|
|
const ActivationTypeMap& activation_type_map = GetActivationTypeMap();
|
|
// Ensure all proto::ActivationType(s) are mapped in |activation_type_map|.
|
|
DCHECK_EQ(proto::ACTIVATION_TYPE_ALL, GetKeysMask(activation_type_map));
|
|
|
|
activation_types_ = flat::ActivationType_NONE;
|
|
|
|
for (const auto& pair : activation_type_map)
|
|
if (rule_->activation_types() & pair.first)
|
|
activation_types_ |= pair.second;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool InitializeUrlPattern() {
|
|
switch (rule_->url_pattern_type()) {
|
|
case proto::URL_PATTERN_TYPE_SUBSTRING:
|
|
url_pattern_type_ = flat::UrlPatternType_SUBSTRING;
|
|
break;
|
|
case proto::URL_PATTERN_TYPE_WILDCARDED:
|
|
url_pattern_type_ = flat::UrlPatternType_WILDCARDED;
|
|
break;
|
|
|
|
// TODO(pkalinnikov): Implement REGEXP rules matching.
|
|
case proto::URL_PATTERN_TYPE_REGEXP:
|
|
default:
|
|
return false; // Unsupported URL pattern type.
|
|
}
|
|
|
|
if (!ConvertAnchorType(rule_->anchor_left(), &anchor_left_) ||
|
|
!ConvertAnchorType(rule_->anchor_right(), &anchor_right_)) {
|
|
return false;
|
|
}
|
|
if (anchor_right_ == flat::AnchorType_SUBDOMAIN)
|
|
return false; // Unsupported right anchor.
|
|
|
|
// We disallow patterns like "||*xyz" because it isn't clear how to match
|
|
// them.
|
|
if (anchor_left_ == flat::AnchorType_SUBDOMAIN &&
|
|
(!rule_->url_pattern().empty() &&
|
|
rule_->url_pattern().front() == '*')) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// Returns whether the rule is not a no-op after all the modifications above.
|
|
bool IsMeaningful() const { return element_types_ || activation_types_; }
|
|
|
|
const raw_ref<const proto::UrlRule> rule_;
|
|
|
|
uint8_t options_ = 0;
|
|
uint16_t element_types_ = 0;
|
|
uint8_t activation_types_ = 0;
|
|
flat::UrlPatternType url_pattern_type_ = flat::UrlPatternType_WILDCARDED;
|
|
flat::AnchorType anchor_left_ = flat::AnchorType_NONE;
|
|
flat::AnchorType anchor_right_ = flat::AnchorType_NONE;
|
|
|
|
bool is_convertible_ = true;
|
|
};
|
|
|
|
} // namespace
|
|
|
|
// Helpers. --------------------------------------------------------------------
|
|
|
|
bool OffsetVectorCompare::operator()(
|
|
const std::vector<FlatStringOffset>& a,
|
|
const std::vector<FlatStringOffset>& b) const {
|
|
auto compare = [](const FlatStringOffset a_offset,
|
|
const FlatStringOffset b_offset) {
|
|
DCHECK(!a_offset.IsNull());
|
|
DCHECK(!b_offset.IsNull());
|
|
return a_offset.o < b_offset.o;
|
|
};
|
|
// |lexicographical_compare| is how vector::operator< is implemented.
|
|
return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end(),
|
|
compare);
|
|
}
|
|
|
|
// Serializes |rule| into |builder| and returns the offset of the serialized
// rule, or an empty offset if the rule cannot be converted. |builder| must
// not be null. |domain_map| is forwarded to the converter, which uses it to
// share identical domain lists between rules.
UrlRuleOffset SerializeUrlRule(const proto::UrlRule& rule,
                               flatbuffers::FlatBufferBuilder* builder,
                               FlatDomainMap* domain_map) {
  DCHECK(builder);
  return UrlRuleFlatBufferConverter(rule).SerializeConvertedRule(builder,
                                                                 domain_map);
}
|
|
|
|
// Three-way comparison defining the order in which domains are stored: longer
// domains come first (-1 if |lhs_domain| is longer, 1 if it is shorter), and
// domains of equal length are ordered by std::string_view::compare().
int CompareDomains(std::string_view lhs_domain, std::string_view rhs_domain) {
  const size_t lhs_length = lhs_domain.size();
  const size_t rhs_length = rhs_domain.size();
  if (lhs_length == rhs_length) {
    return lhs_domain.compare(rhs_domain);
  }
  return lhs_length > rhs_length ? -1 : 1;
}
|
|
|
|
// UrlPatternIndexBuilder ------------------------------------------------------
|
|
|
|
// Creates a builder that serializes the index into |flat_builder|, which must
// not be null.
UrlPatternIndexBuilder::UrlPatternIndexBuilder(
    flatbuffers::FlatBufferBuilder* flat_builder)
    : flat_builder_(flat_builder) {
  DCHECK(flat_builder_);
}
|
|
|
|
// Defaulted out of line; the builder needs no custom teardown here.
UrlPatternIndexBuilder::~UrlPatternIndexBuilder() = default;
|
|
|
|
// Adds the already-serialized rule at |offset| to the index under
// construction. The rule is filed under the n-gram chosen by
// GetMostDistinctiveNGram() for its pattern, or appended to the fallback list
// when no n-gram is available. |offset| must be non-zero.
void UrlPatternIndexBuilder::IndexUrlRule(UrlRuleOffset offset) {
  DCHECK(offset.o);

  const auto* rule = flatbuffers::GetTemporaryPointer(*flat_builder_, offset);
  DCHECK(rule);

#if DCHECK_IS_ON()
  // Sanity check that the rule does not have fields with non-ascii characters.
  const auto dcheck_domains_are_ascii = [](const auto* domains) {
    if (!domains) {
      return;
    }
    for (auto* domain : *domains) {
      DCHECK(base::IsStringASCII(ToStringView(domain)));
    }
  };
  DCHECK(base::IsStringASCII(ToStringView(rule->url_pattern())));
  dcheck_domains_are_ascii(rule->initiator_domains_included());
  dcheck_domains_are_ascii(rule->initiator_domains_excluded());
  dcheck_domains_are_ascii(rule->request_domains_included());
  dcheck_domains_are_ascii(rule->request_domains_excluded());

  // Case-insensitive patterns should be lower-cased.
  if (rule->options() & flat::OptionFlag_IS_CASE_INSENSITIVE) {
    DCHECK(HasNoUpperAscii(ToStringView(rule->url_pattern())));
  }
#endif

  const NGram ngram =
      GetMostDistinctiveNGram(ToStringView(rule->url_pattern()));

  if (!ngram) {
    // TODO(pkalinnikov): Index fallback rules as well.
    fallback_rules_.push_back(offset);
    return;
  }
  ngram_index_[ngram].push_back(offset);
}
|
|
|
|
// Serializes the accumulated n-gram hash table and fallback rules into the
// FlatBuffer and returns the offset of the resulting UrlPatternIndex. Every
// rule list is sorted by descending priority before it is written.
UrlPatternIndexOffset UrlPatternIndexBuilder::Finish() {
  const size_t table_size = ngram_index_.table_size();
  std::vector<flatbuffers::Offset<flat::NGramToRules>> flat_hash_table(
      table_size);

  // A single shared entry marks every unused slot of the hash table.
  const flatbuffers::Offset<flat::NGramToRules> empty_slot_offset =
      flat::CreateNGramToRules(*flat_builder_);

  const auto by_descending_priority = [this](const UrlRuleOffset& lhs,
                                             const UrlRuleOffset& rhs) {
    return UrlRuleDescendingPriorityComparator(
        flatbuffers::GetTemporaryPointer(*flat_builder_, lhs),
        flatbuffers::GetTemporaryPointer(*flat_builder_, rhs));
  };

  for (size_t slot = 0; slot != table_size; ++slot) {
    const uint32_t entry_index = ngram_index_.hash_table()[slot];
    if (entry_index < ngram_index_.size()) {
      const MutableNGramIndex::EntryType& entry =
          ngram_index_.entries()[entry_index];
      // Look the list up again through operator[] to obtain a mutable
      // reference to |entry.second|, then sort it by descending priority.
      MutableUrlRuleList& rule_list = ngram_index_[entry.first];
      std::sort(rule_list.begin(), rule_list.end(), by_descending_priority);

      auto rules_offset = flat_builder_->CreateVector(rule_list);
      flat_hash_table[slot] =
          flat::CreateNGramToRules(*flat_builder_, entry.first, rules_offset);
    } else {
      // An out-of-range entry index denotes an unused slot.
      flat_hash_table[slot] = empty_slot_offset;
    }
  }
  auto ngram_index_offset = flat_builder_->CreateVector(flat_hash_table);

  // The fallback rules get the same descending-priority order.
  std::sort(fallback_rules_.begin(), fallback_rules_.end(),
            by_descending_priority);
  auto fallback_rules_offset = flat_builder_->CreateVector(fallback_rules_);

  return flat::CreateUrlPatternIndex(*flat_builder_, kNGramSize,
                                     ngram_index_offset, empty_slot_offset,
                                     fallback_rules_offset);
}
|
|
|
|
// Picks, among the n-grams of |pattern|, the one whose rule list in the index
// is currently the shortest; the first such n-gram wins ties. Returns 0 when
// |pattern| yields no n-gram. |pattern| must be ASCII; '*' and '^' act as
// separators during n-gram extraction.
NGram UrlPatternIndexBuilder::GetMostDistinctiveNGram(
    std::string_view pattern) {
  // To support case-insensitive matching, make sure the n-grams for |pattern|
  // are lower-cased.
  DCHECK(base::IsStringASCII(pattern));
  auto ngrams =
      CreateNGramExtractor<kNGramSize, NGram, NGramCaseExtraction::kLowerCase>(
          pattern, [](char c) { return c == '*' || c == '^'; });

  NGram rarest_ngram = 0;
  size_t rarest_list_size = std::numeric_limits<size_t>::max();

  for (uint64_t ngram : ngrams) {
    const MutableUrlRuleList* rules = ngram_index_.Get(ngram);
    const size_t list_size = rules ? rules->size() : 0;
    if (list_size >= rarest_list_size) {
      continue;
    }

    // TODO(pkalinnikov): Pick random of the same-sized lists.
    rarest_list_size = list_size;
    rarest_ngram = ngram;
    if (list_size == 0) {
      // Nothing can beat an n-gram that has no rules yet.
      break;
    }
  }

  return rarest_ngram;
}
|
|
|
|
// UrlPatternIndex -------------------------------------------------------------
|
|
|
|
namespace {
|
|
|
|
using FlatNGramIndex =
|
|
flatbuffers::Vector<flatbuffers::Offset<flat::NGramToRules>>;
|
|
|
|
// Returns the size of the longest (sub-)domain of `host` matching one of the
|
|
// `domains` in the list.
|
|
//
|
|
// The `domains` should be sorted in descending order of their length, and
|
|
// ascending alphabetical order within the groups of same-length domains.
|
|
size_t GetLongestMatchingSubdomain(std::string_view host,
|
|
const FlatDomains& domains) {
|
|
if (host.empty())
|
|
return 0;
|
|
|
|
// If the |domains| list is short, then the simple strategy is usually faster.
|
|
if (domains.size() <= 5) {
|
|
for (auto* domain : domains) {
|
|
const std::string_view domain_piece = ToStringView(domain);
|
|
if (url::DomainIs(host, domain_piece))
|
|
return domain_piece.size();
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// Otherwise look for each subdomain of the `host` using binary search.
|
|
|
|
// If the host name ends with a dot, then ignore it.
|
|
if (host.back() == '.')
|
|
host.remove_suffix(1);
|
|
|
|
// The |left| bound of the search is shared between iterations, because
|
|
// subdomains are considered in decreasing order of their lengths, therefore
|
|
// each consecutive lower_bound will be at least as far as the previous.
|
|
flatbuffers::uoffset_t left = 0;
|
|
for (size_t position = 0;; ++position) {
|
|
const std::string_view subdomain = host.substr(position);
|
|
|
|
flatbuffers::uoffset_t right = domains.size();
|
|
while (left + 1 < right) {
|
|
auto middle = left + (right - left) / 2;
|
|
DCHECK_LT(middle, domains.size());
|
|
if (CompareDomains(ToStringView(domains[middle]), subdomain) <= 0) {
|
|
left = middle;
|
|
} else
|
|
right = middle;
|
|
}
|
|
|
|
DCHECK_LT(left, domains.size());
|
|
if (ToStringView(domains[left]) == subdomain) {
|
|
return subdomain.size();
|
|
}
|
|
|
|
position = host.find('.', position);
|
|
if (position == std::string_view::npos) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
// |sorted_candidates| is sorted in descending order by priority. If
|
|
// |matched_rules| is specified, then all rule matches in |sorted_candidates|
|
|
// will be added to |matched_rules| and null is returned. If |matched_rules| is
|
|
// not specified, then this returns the first matching rule i.e. the rule with
|
|
// the highest priority in |sorted_candidates| or null if no rule matches.
|
|
const flat::UrlRule* FindMatchAmongCandidates(
|
|
const FlatUrlRuleList* sorted_candidates,
|
|
const UrlPattern::UrlInfo& url,
|
|
const url::Origin& document_origin,
|
|
flat::ElementType element_type,
|
|
flat::ActivationType activation_type,
|
|
flat::RequestMethod request_method,
|
|
bool is_third_party,
|
|
bool disable_generic_rules,
|
|
const UrlPatternIndexMatcher::EmbedderConditionsMatcher&
|
|
embedder_conditions_matcher,
|
|
std::vector<const flat::UrlRule*>* matched_rules,
|
|
const base::flat_set<int>& disabled_rule_ids) {
|
|
if (!sorted_candidates)
|
|
return nullptr;
|
|
|
|
DCHECK(std::is_sorted(sorted_candidates->begin(), sorted_candidates->end(),
|
|
&UrlRuleDescendingPriorityComparator));
|
|
|
|
for (const flat::UrlRule* rule : *sorted_candidates) {
|
|
DCHECK_NE(rule, nullptr);
|
|
DCHECK_NE(rule->url_pattern_type(), flat::UrlPatternType_REGEXP);
|
|
if (!DoesRuleFlagsMatch(*rule, element_type, activation_type,
|
|
request_method, is_third_party,
|
|
embedder_conditions_matcher)) {
|
|
continue;
|
|
}
|
|
|
|
if (disable_generic_rules && IsRuleGeneric(*rule))
|
|
continue;
|
|
|
|
if (!UrlPattern(*rule).MatchesUrl(url))
|
|
continue;
|
|
|
|
if (!DoesOriginMatchInitiatorDomainList(document_origin, *rule))
|
|
continue;
|
|
|
|
if (!DoesURLMatchRequestDomainList(url, *rule))
|
|
continue;
|
|
|
|
if (base::Contains(disabled_rule_ids, rule->id()))
|
|
continue;
|
|
|
|
if (matched_rules)
|
|
matched_rules->push_back(rule);
|
|
else
|
|
return rule;
|
|
}
|
|
|
|
return nullptr;
|
|
}
|
|
|
|
// Returns whether the network request matches a UrlPattern |index| represented
|
|
// in its FlatBuffers format. |is_third_party| should reflect the relation
|
|
// between |url| and |document_origin|. If |strategy| is kAll, then
|
|
// |matched_rules| will be populated with all matching UrlRules and nullptr is
|
|
// returned.
|
|
const flat::UrlRule* FindMatchInFlatUrlPatternIndex(
|
|
const flat::UrlPatternIndex& index,
|
|
const UrlPattern::UrlInfo& url,
|
|
const url::Origin& document_origin,
|
|
flat::ElementType element_type,
|
|
flat::ActivationType activation_type,
|
|
flat::RequestMethod request_method,
|
|
bool is_third_party,
|
|
bool disable_generic_rules,
|
|
const UrlPatternIndexMatcher::EmbedderConditionsMatcher&
|
|
embedder_conditions_matcher,
|
|
UrlPatternIndexMatcher::FindRuleStrategy strategy,
|
|
std::vector<const flat::UrlRule*>* matched_rules,
|
|
const base::flat_set<int>& disabled_rule_ids) {
|
|
using FindRuleStrategy = UrlPatternIndexMatcher::FindRuleStrategy;
|
|
|
|
// Check that the outparam |matched_rules| is specified if and only if
|
|
// |strategy| is kAll.
|
|
DCHECK_EQ(strategy == FindRuleStrategy::kAll, !!matched_rules);
|
|
|
|
const FlatNGramIndex* hash_table = index.ngram_index();
|
|
const flat::NGramToRules* empty_slot = index.ngram_index_empty_slot();
|
|
DCHECK_NE(hash_table, nullptr);
|
|
|
|
NGramHashTableProber prober;
|
|
|
|
// |hash_table| contains lower-cased n-grams. Use lower-cased extraction to
|
|
// find prospective matches.
|
|
auto ngrams = CreateNGramExtractor<kNGramSize, uint64_t,
|
|
NGramCaseExtraction::kLowerCase>(
|
|
url.spec(), [](char) { return false; });
|
|
|
|
auto get_max_priority_rule = [](const flat::UrlRule* lhs,
|
|
const flat::UrlRule* rhs) {
|
|
if (!lhs)
|
|
return rhs;
|
|
if (!rhs)
|
|
return lhs;
|
|
return lhs->priority() > rhs->priority() ? lhs : rhs;
|
|
};
|
|
const flat::UrlRule* max_priority_rule = nullptr;
|
|
|
|
for (uint64_t ngram : ngrams) {
|
|
const uint32_t slot_index = prober.FindSlot(
|
|
ngram, hash_table->size(),
|
|
[hash_table, empty_slot](NGram ngram, uint32_t slot_index) {
|
|
const flat::NGramToRules* entry = hash_table->Get(slot_index);
|
|
DCHECK_NE(entry, nullptr);
|
|
return entry == empty_slot || entry->ngram() == ngram;
|
|
});
|
|
DCHECK_LT(slot_index, hash_table->size());
|
|
|
|
const flat::NGramToRules* entry = hash_table->Get(slot_index);
|
|
if (entry == empty_slot)
|
|
continue;
|
|
const flat::UrlRule* rule = FindMatchAmongCandidates(
|
|
entry->rule_list(), url, document_origin, element_type, activation_type,
|
|
request_method, is_third_party, disable_generic_rules,
|
|
embedder_conditions_matcher, matched_rules, disabled_rule_ids);
|
|
if (!rule)
|
|
continue;
|
|
|
|
// |rule| is a matching rule with the highest priority amongst
|
|
// |entry->rule_list()|.
|
|
switch (strategy) {
|
|
case FindRuleStrategy::kAny:
|
|
return rule;
|
|
case FindRuleStrategy::kHighestPriority:
|
|
max_priority_rule = get_max_priority_rule(max_priority_rule, rule);
|
|
break;
|
|
case FindRuleStrategy::kAll:
|
|
continue;
|
|
}
|
|
}
|
|
|
|
const flat::UrlRule* rule = FindMatchAmongCandidates(
|
|
index.fallback_rules(), url, document_origin, element_type,
|
|
activation_type, request_method, is_third_party, disable_generic_rules,
|
|
embedder_conditions_matcher, matched_rules, disabled_rule_ids);
|
|
|
|
switch (strategy) {
|
|
case FindRuleStrategy::kAny:
|
|
return rule;
|
|
case FindRuleStrategy::kHighestPriority:
|
|
return get_max_priority_rule(max_priority_rule, rule);
|
|
case FindRuleStrategy::kAll:
|
|
return nullptr;
|
|
}
|
|
|
|
NOTREACHED();
|
|
}
|
|
|
|
} // namespace
|
|
|
|
// A rule is generic when it has no list of included initiator domains.
bool IsRuleGeneric(const flat::UrlRule& rule) {
  return rule.initiator_domains_included() == nullptr;
}
|
|
|
|
// Returns whether the `host` matches the domain conditions. It's considered a
|
|
// match if both:
|
|
// 1. An included domain matches the `host`, or `domains_included` is omitted
|
|
// entirely (since rules match all domains by default).
|
|
// 2. No excluded domain match the `host`, or the longest matching excluded
|
|
// domain is shorter than the longest matching included domain (since
|
|
// longer, more specific domain matches take precedence).
|
|
bool DoesHostMatchDomainLists(
|
|
std::string_view host,
|
|
const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*
|
|
domains_included,
|
|
const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*
|
|
domains_excluded) {
|
|
DCHECK(!domains_included || domains_included->size());
|
|
|
|
size_t longest_matching_included_domain_length = 1;
|
|
if (domains_included) {
|
|
longest_matching_included_domain_length =
|
|
GetLongestMatchingSubdomain(host, *domains_included);
|
|
}
|
|
if (longest_matching_included_domain_length && domains_excluded) {
|
|
return GetLongestMatchingSubdomain(host, *domains_excluded) <
|
|
longest_matching_included_domain_length;
|
|
}
|
|
return !!longest_matching_included_domain_length;
|
|
}
|
|
|
|
// Returns whether the host of |url| satisfies the request-domain include and
// exclude lists of |rule|.
bool DoesURLMatchRequestDomainList(const UrlPattern::UrlInfo& url,
                                   const flat::UrlRule& rule) {
  const auto* included = rule.request_domains_included();
  const auto* excluded = rule.request_domains_excluded();
  return DoesHostMatchDomainLists(url.GetStringHost(), included, excluded);
}
|
|
|
|
// Returns whether |origin| satisfies the initiator-domain include and exclude
// lists of |rule|. An opaque |origin| matches only generic rules, i.e. rules
// without included initiator domains.
bool DoesOriginMatchInitiatorDomainList(const url::Origin& origin,
                                        const flat::UrlRule& rule) {
  if (!origin.opaque()) {
    return DoesHostMatchDomainLists(origin.host(),
                                    rule.initiator_domains_included(),
                                    rule.initiator_domains_excluded());
  }
  // Unique `origin` matches lists of exception domains only.
  return IsRuleGeneric(rule);
}
|
|
|
|
// Returns whether |rule| applies to a request with the given attributes.
// Exactly one of |element_type| and |activation_type| must be different from
// NONE. A |request_method| of NONE skips the request method check. The
// |embedder_conditions_matcher| is consulted last, and only if it is non-null
// and |rule| carries embedder conditions.
bool DoesRuleFlagsMatch(const flat::UrlRule& rule,
                        flat::ElementType element_type,
                        flat::ActivationType activation_type,
                        flat::RequestMethod request_method,
                        bool is_third_party,
                        const UrlPatternIndexMatcher::EmbedderConditionsMatcher&
                            embedder_conditions_matcher) {
  DCHECK((element_type == flat::ElementType_NONE) !=
         (activation_type == flat::ActivationType_NONE));

  const bool checks_element_type = element_type != flat::ElementType_NONE;
  if (checks_element_type && !(rule.element_types() & element_type)) {
    return false;
  }

  const bool checks_activation_type =
      activation_type != flat::ActivationType_NONE;
  if (checks_activation_type && !(rule.activation_types() & activation_type)) {
    return false;
  }

  const bool checks_request_method = request_method != flat::RequestMethod_NONE;
  if (checks_request_method && !(rule.request_methods() & request_method)) {
    return false;
  }

  // The rule must apply to the party (first or third) issuing the request.
  const auto required_party_flag =
      is_third_party ? flat::OptionFlag_APPLIES_TO_THIRD_PARTY
                     : flat::OptionFlag_APPLIES_TO_FIRST_PARTY;
  if (!(rule.options() & required_party_flag)) {
    return false;
  }

  const auto* embedder_conditions = rule.embedder_conditions();
  const bool rejected_by_embedder =
      embedder_conditions && !embedder_conditions_matcher.is_null() &&
      !embedder_conditions_matcher.Run(*embedder_conditions);
  return !rejected_by_embedder;
}
|
|
|
|
// Wraps `flat_index`, which may be null. A non-null index is sanity-checked on
// construction.
UrlPatternIndexMatcher::UrlPatternIndexMatcher(
    const flat::UrlPatternIndex* flat_index)
    : flat_index_(flat_index) {
  // Nothing to validate for a null index.
  if (!flat_index) {
    return;
  }

  DCHECK(flat_index->n() == kNGramSize);

  // Speculative investigation for crash (see crbug.com/1286207): verify that
  // the ngram_index of every UrlPatternIndexMatcher can be accessed without
  // failure.
  CHECK_GT(flat_index->ngram_index()->size(), 0u);
}
|
|
|
|
// The destructor and both move operations use the compiler-generated
// implementations.
UrlPatternIndexMatcher::~UrlPatternIndexMatcher() = default;

UrlPatternIndexMatcher::UrlPatternIndexMatcher(UrlPatternIndexMatcher&&) =
    default;

UrlPatternIndexMatcher& UrlPatternIndexMatcher::operator=(
    UrlPatternIndexMatcher&&) = default;
|
|
|
|
// Returns the number of rules referenced by the index: the fallback rules plus
// the rule lists of every occupied n-gram slot. The result is computed on the
// first call and cached in `rules_count_`; a null index yields 0.
size_t UrlPatternIndexMatcher::GetRulesCount() const {
  if (!rules_count_) {
    size_t total = 0;

    if (flat_index_) {
      total = flat_index_->fallback_rules()->size();

      // Walk every n-gram slot, skipping the sentinel used for empty slots.
      const auto* empty_slot = flat_index_->ngram_index_empty_slot();
      for (const auto* ngram_to_rules : *flat_index_->ngram_index()) {
        if (ngram_to_rules != empty_slot) {
          total += ngram_to_rules->rule_list()->size();
        }
      }
    }

    rules_count_ = total;
  }

  return *rules_count_;
}
|
|
|
|
// Proto-typed convenience overload: converts `element_type` and
// `activation_type` to their flat equivalents and forwards to the flat-typed
// FindMatch() with no request-method restriction.
const flat::UrlRule* UrlPatternIndexMatcher::FindMatch(
    const GURL& url,
    const url::Origin& first_party_origin,
    proto::ElementType element_type,
    proto::ActivationType activation_type,
    bool is_third_party,
    bool disable_generic_rules,
    const EmbedderConditionsMatcher& embedder_conditions_matcher,
    FindRuleStrategy strategy,
    const base::flat_set<int>& disabled_rule_ids) const {
  const auto flat_element_type = ProtoToFlatElementType(element_type);
  const auto flat_activation_type = ProtoToFlatActivationType(activation_type);

  return FindMatch(url, first_party_origin, flat_element_type,
                   flat_activation_type, flat::RequestMethod_NONE,
                   is_third_party, disable_generic_rules,
                   embedder_conditions_matcher, strategy, disabled_rule_ids);
}
|
|
|
|
// Looks up a single rule matching the request, or returns nullptr when there
// is no index, the URL is invalid or too long, the type arguments are
// inconsistent, or no rule matches. `strategy` must not be
// FindRuleStrategy::kAll; FindAllMatches() serves that purpose.
const flat::UrlRule* UrlPatternIndexMatcher::FindMatch(
    const GURL& url,
    const url::Origin& first_party_origin,
    flat::ElementType element_type,
    flat::ActivationType activation_type,
    flat::RequestMethod request_method,
    bool is_third_party,
    bool disable_generic_rules,
    const EmbedderConditionsMatcher& embedder_conditions_matcher,
    FindRuleStrategy strategy,
    const base::flat_set<int>& disabled_rule_ids) const {
  if (!flat_index_) {
    return nullptr;
  }

  // URLs longer than the maximum URL length are skipped: they will be
  // disallowed elsewhere in the loading stack, so matching them here would
  // only waste compute time.
  if (!url.is_valid() || url.spec().length() > url::kMaxURLChars) {
    return nullptr;
  }

  // Exactly one of the element type and the activation type must be set.
  const bool has_element_type = element_type != flat::ElementType_NONE;
  const bool has_activation_type =
      activation_type != flat::ActivationType_NONE;
  if (has_element_type == has_activation_type) {
    return nullptr;
  }

  // FindAllMatches should be used instead to find all matches.
  DCHECK_NE(strategy, FindRuleStrategy::kAll);

  const flat::UrlRule* matched_rule = FindMatchInFlatUrlPatternIndex(
      *flat_index_, UrlPattern::UrlInfo(url), first_party_origin, element_type,
      activation_type, request_method, is_third_party, disable_generic_rules,
      embedder_conditions_matcher, strategy, nullptr /* matched_rules */,
      disabled_rule_ids);

  // Record the matched pattern in the (disabled-by-default) "loading" trace
  // category.
  if (matched_rule) {
    TRACE_EVENT1(TRACE_DISABLED_BY_DEFAULT("loading"),
                 "UrlPatternIndexMatcher::FindMatch", "pattern",
                 FlatUrlRuleToFilterlistString(matched_rule));
  }

  return matched_rule;
}
|
|
|
|
std::vector<const flat::UrlRule*> UrlPatternIndexMatcher::FindAllMatches(
|
|
const GURL& url,
|
|
const url::Origin& first_party_origin,
|
|
proto::ElementType element_type,
|
|
proto::ActivationType activation_type,
|
|
bool is_third_party,
|
|
bool disable_generic_rules,
|
|
const EmbedderConditionsMatcher& embedder_conditions_matcher,
|
|
const base::flat_set<int>& disabled_rule_ids) const {
|
|
return FindAllMatches(
|
|
url, first_party_origin, ProtoToFlatElementType(element_type),
|
|
ProtoToFlatActivationType(activation_type), flat::RequestMethod_NONE,
|
|
is_third_party, disable_generic_rules, embedder_conditions_matcher,
|
|
disabled_rule_ids);
|
|
}
|
|
|
|
std::vector<const flat::UrlRule*> UrlPatternIndexMatcher::FindAllMatches(
|
|
const GURL& url,
|
|
const url::Origin& first_party_origin,
|
|
flat::ElementType element_type,
|
|
flat::ActivationType activation_type,
|
|
flat::RequestMethod request_method,
|
|
bool is_third_party,
|
|
bool disable_generic_rules,
|
|
const EmbedderConditionsMatcher& embedder_conditions_matcher,
|
|
const base::flat_set<int>& disabled_rule_ids) const {
|
|
// Ignore URLs that are greater than the max URL length. Since those will be
|
|
// disallowed elsewhere in the loading stack, we can save compute time by
|
|
// avoiding matching here.
|
|
if (!flat_index_ || !url.is_valid() ||
|
|
url.spec().length() > url::kMaxURLChars) {
|
|
return std::vector<const flat::UrlRule*>();
|
|
}
|
|
if ((element_type == flat::ElementType_NONE) ==
|
|
(activation_type == flat::ActivationType_NONE)) {
|
|
return std::vector<const flat::UrlRule*>();
|
|
}
|
|
|
|
std::vector<const flat::UrlRule*> rules;
|
|
FindMatchInFlatUrlPatternIndex(
|
|
*flat_index_, UrlPattern::UrlInfo(url), first_party_origin, element_type,
|
|
activation_type, request_method, is_third_party, disable_generic_rules,
|
|
embedder_conditions_matcher, FindRuleStrategy::kAll, &rules,
|
|
disabled_rule_ids);
|
|
|
|
return rules;
|
|
}
|
|
|
|
} // namespace url_pattern_index
|