// Snapshot metadata: commit 910e62b5, created on January 15 (commit history).
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/ui/lens/lens_url_matcher.h"

#include <utility>

#include "base/json/json_reader.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_split.h"
#include "third_party/farmhash/src/src/farmhash.h"

namespace lens {

namespace {

// Converts a JSON string array to a vector.
std::vector<std::string> JSONArrayToVector(const std::string& json_array) {
  std::optional<base::Value> json_value =
      base::JSONReader::Read(json_array, base::JSON_PARSE_CHROMIUM_EXTENSIONS);

  if (!json_value) {
    return {};
  }

  base::Value::List* entries = json_value->GetIfList();
  if (!entries) {
    return {};
  }

  std::vector<std::string> result;
  result.reserve(entries->size());
  for (const base::Value& entry : *entries) {
    const std::string* filter = entry.GetIfString();
    if (filter) {
      result.emplace_back(*filter);
    }
  }
  return result;
}

}  // namespace

// Builds all matchers from their serialized filter lists.
//
// `url_allow_filters`, `url_block_filters`, `path_match_allow_filters`,
// `path_match_block_filters` and `url_forced_allowed_match_patterns` are each
// parsed as a JSON array of strings by the Initialize*() helpers.
// `hashed_domain_block_filters_list` is a comma-separated list of unsigned
// 32-bit integers.
LensUrlMatcher::LensUrlMatcher(std::string url_allow_filters,
                               std::string url_block_filters,
                               std::string path_match_allow_filters,
                               std::string path_match_block_filters,
                               std::string url_forced_allowed_match_patterns,
                               std::string hashed_domain_block_filters_list) {
  // A single counter is threaded through every helper that assigns pattern
  // IDs, so IDs are never reused across the different matchers.
  base::MatcherStringPattern::ID id(0);

  // The helpers take their strings by value and none of the arguments is used
  // again here, so move them instead of making a second copy of each list.
  InitializeUrlMatcher(std::move(url_allow_filters),
                       std::move(url_block_filters), &id);
  InitializeForceAllowUrlPatterns(std::move(url_forced_allowed_match_patterns),
                                  &id);
  InitializePathAllowMatcher(std::move(path_match_allow_filters), &id);
  InitializePathBlockMatcher(std::move(path_match_block_filters), &id);
  InitializeHashedDomainBlockFilters(
      std::move(hashed_domain_block_filters_list));
}

LensUrlMatcher::~LensUrlMatcher() = default;

void LensUrlMatcher::InitializeUrlMatcher(std::string url_allow_filters,
                                          std::string url_block_filters,
                                          base::MatcherStringPattern::ID* id) {
  url_matcher_ = std::make_unique<url_matcher::URLMatcher>();
  url_matcher::util::AddFiltersWithLimit(url_matcher_.get(), true, id,
                                         JSONArrayToVector(url_allow_filters),
                                         &url_filters_);
  url_matcher::util::AddFiltersWithLimit(url_matcher_.get(), false, id,
                                         JSONArrayToVector(url_block_filters),
                                         &url_filters_);
}

void LensUrlMatcher::InitializeForceAllowUrlPatterns(
    std::string url_path_forced_allowed_match_patterns,
    base::MatcherStringPattern::ID* id) {
  auto force_allow_url_strings =
      JSONArrayToVector(url_path_forced_allowed_match_patterns);
  std::vector<base::MatcherStringPattern> force_allow_url_patterns;
  std::vector<const base::MatcherStringPattern*> force_allow_url_pointers;
  force_allow_url_patterns.reserve(force_allow_url_strings.size());
  force_allow_url_pointers.reserve(force_allow_url_strings.size());
  for (const std::string& entry : force_allow_url_strings) {
    (*id)++;
    force_allow_url_patterns.emplace_back(entry, *id);
    force_allow_url_pointers.push_back(&force_allow_url_patterns.back());
  }
  url_forced_allow_matcher = std::make_unique<url_matcher::RegexSetMatcher>();
  // Pointers will not be referenced after AddPatterns() completes.
  url_forced_allow_matcher->AddPatterns(force_allow_url_pointers);
}

void LensUrlMatcher::InitializePathAllowMatcher(
    std::string path_match_allow_filters,
    base::MatcherStringPattern::ID* id) {
  const auto allow_strings = JSONArrayToVector(path_match_allow_filters);
  std::vector<base::MatcherStringPattern> allow_patterns;
  std::vector<const base::MatcherStringPattern*> allow_pointers;
  allow_patterns.reserve(allow_strings.size());
  allow_pointers.reserve(allow_strings.size());
  for (const std::string& entry : allow_strings) {
    (*id)++;
    allow_patterns.emplace_back(entry, *id);
    allow_pointers.push_back(&allow_patterns.back());
  }
  path_allow_matcher_ = std::make_unique<url_matcher::RegexSetMatcher>();
  // Pointers will not be referenced after AddPatterns() completes.
  path_allow_matcher_->AddPatterns(allow_pointers);
}

void LensUrlMatcher::InitializePathBlockMatcher(
    std::string path_match_block_filters,
    base::MatcherStringPattern::ID* id) {
  const auto block_strings = JSONArrayToVector(path_match_block_filters);
  std::vector<base::MatcherStringPattern> block_patterns;
  std::vector<const base::MatcherStringPattern*> block_pointers;
  block_patterns.reserve(block_strings.size());
  block_pointers.reserve(block_strings.size());
  for (const std::string& entry : block_strings) {
    (*id)++;
    block_patterns.emplace_back(entry, *id);
    block_pointers.push_back(&block_patterns.back());
  }
  path_block_matcher_ = std::make_unique<url_matcher::RegexSetMatcher>();
  // Pointers will not be referenced after AddPatterns() completes.
  path_block_matcher_->AddPatterns(block_pointers);
}

// Fills `hashed_domain_block_filters_` from a comma-separated list of unsigned
// integers. Whitespace around each entry is trimmed and empty entries are
// dropped. Entries that do not parse as an unsigned integer are ignored.
void LensUrlMatcher::InitializeHashedDomainBlockFilters(
    std::string hashed_domain_block_filters_list) {
  const auto tokens =
      base::SplitStringPiece(hashed_domain_block_filters_list, ",",
                             base::WhitespaceHandling::TRIM_WHITESPACE,
                             base::SplitResult::SPLIT_WANT_NONEMPTY);
  for (std::string_view token : tokens) {
    uint32_t parsed_hash;
    if (!base::StringToUint(token, &parsed_hash)) {
      // Not a valid unsigned integer; skip this entry.
      continue;
    }
    hashed_domain_block_filters_.insert(parsed_hash);
  }
}

// Returns true when `url` passes every check below, evaluated in this order:
//   1. The URL matches at least one registered URL filter.
//   2. None of the matched URL filters is a block filter.
//   3. Neither the host nor any of its parent domains is in the hashed block
//      list.
//   4. The path does not match any path block pattern.
//   5. The full URL matches a force-allow pattern, or the path matches a path
//      allow pattern. An absent or empty path allow matcher imposes no
//      restriction.
bool LensUrlMatcher::IsMatch(const GURL& url) {
  // (1) No URL filter matched at all: block.
  auto matched_ids = url_matcher_->MatchURL(url);
  if (matched_ids.empty()) {
    return false;
  }

  // (2) Blocks take precedence over allows, so a single matching block filter
  // rejects the URL.
  for (auto filter_id : matched_ids) {
    if (url_filters_[filter_id].allow) {
      continue;
    }
    return false;
  }

  // (3) Hashed domain block list.
  if (SubdomainsMatchHash(url.GetHost())) {
    return false;
  }

  // NOTE(review): `matched_ids` is intentionally reused as the out-parameter
  // for the regex matchers below, exactly as before. Presumably this relies on
  // pattern IDs being unique across all matchers (the constructor shares one
  // ID counter) -- confirm against RegexSetMatcher::Match().

  // (4) Path block patterns.
  const bool has_path_blocklist =
      path_block_matcher_ && !path_block_matcher_->IsEmpty();
  if (has_path_blocklist &&
      path_block_matcher_->Match(url.GetPath(), &matched_ids)) {
    return false;
  }

  // (5a) A force-allowed URL is shown even if its path does not contain an
  // allowlisted pattern.
  const bool has_forced_allowlist =
      url_forced_allow_matcher && !url_forced_allow_matcher->IsEmpty();
  if (has_forced_allowlist &&
      url_forced_allow_matcher->Match(url.spec(), &matched_ids)) {
    return true;
  }

  // (5b) With no path allow patterns configured there is nothing left to
  // check; otherwise the path must match one of them.
  const bool has_path_allowlist =
      path_allow_matcher_ && !path_allow_matcher_->IsEmpty();
  if (!has_path_allowlist) {
    return true;
  }
  return path_allow_matcher_->Match(url.GetPath(), &matched_ids);
}

// Returns true if `str`, or any suffix of it obtained by repeatedly dropping
// the leading dot-separated label, passes MatchesHash(). Leading and trailing
// periods are stripped first. For "a.b.com" the candidates tested are
// "a.b.com", "b.com" and "com". Returns false when `str` is empty or consists
// only of periods.
bool LensUrlMatcher::SubdomainsMatchHash(std::string_view str) {
  const size_t first = str.find_first_not_of('.');
  if (first == std::string_view::npos) {
    return false;
  }
  const size_t last = str.find_last_not_of('.');

  std::string_view candidate = str.substr(first, last - first + 1);
  for (;;) {
    if (MatchesHash(candidate)) {
      return true;
    }
    const size_t dot = candidate.find('.');
    if (dot == std::string_view::npos) {
      // `candidate` was the last remaining label (the top-level domain).
      return false;
    }
    // Drop the leading label together with its separating period.
    candidate.remove_prefix(dot + 1);
  }
}

// Returns true if the 32-bit fingerprint of `str`, computed with
// util::Fingerprint32(), is present in `hashed_domain_block_filters_`.
bool LensUrlMatcher::MatchesHash(std::string_view str) {
  const uint32_t fingerprint = util::Fingerprint32(str);
  return hashed_domain_block_filters_.contains(fingerprint);
}

}  // namespace lens