| // Copyright 2013 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "net/tools/tld_cleanup/tld_cleanup_util.h" |
| |
| #include <sstream> |
| #include <string> |
| |
| #include "base/containers/contains.h" |
| #include "base/files/file_util.h" |
| #include "base/logging.h" |
| #include "base/ranges/algorithm.h" |
| #include "base/strings/strcat.h" |
| #include "base/strings/string_number_conversions.h" |
| #include "base/strings/string_util.h" |
| #include "url/gurl.h" |
| #include "url/third_party/mozilla/url_parse.h" |
| |
namespace {

// Marker comment lines that open and close the private-domains section of the
// input rule list.
constexpr char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
constexpr char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Values combined into the "type" column of the generated gperf table.
constexpr int kExceptionRule = 1;
constexpr int kWildcardRule = 2;
constexpr int kPrivateRule = 4;

}  // namespace
| |
| namespace net::tld_cleanup { |
| |
| std::string RulesToGperf(const RuleMap& rules) { |
| std::string data; |
| data.append("%{\n" |
| "// Copyright 2012 The Chromium Authors\n" |
| "// Use of this source code is governed by a BSD-style license " |
| "that can be\n" |
| "// found in the LICENSE file.\n\n" |
| "// This file is generated by net/tools/tld_cleanup/.\n" |
| "// DO NOT MANUALLY EDIT!\n" |
| "%}\n" |
| "struct DomainRule {\n" |
| " int name_offset;\n" |
| " int type; // flags: 1: exception, 2: wildcard, 4: private\n" |
| "};\n" |
| "%%\n"); |
| |
| for (const auto& [domain, rule] : rules) { |
| data.append(domain); |
| data.append(", "); |
| int type = 0; |
| if (rule.exception) { |
| type = kExceptionRule; |
| } else if (rule.wildcard) { |
| type = kWildcardRule; |
| } |
| if (rule.is_private) { |
| type += kPrivateRule; |
| } |
| data.append(base::NumberToString(type)); |
| data.append("\n"); |
| } |
| |
| data.append("%%\n"); |
| |
| return data; |
| } |
| |
| // Adjusts the rule to a standard form: removes single extraneous dots and |
| // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as |
| // valid; logs a warning and returns kWarning if it is probably invalid; and |
| // logs an error and returns kError if the rule is (almost) certainly invalid. |
| NormalizeResult NormalizeRule(std::string& domain, Rule& rule) { |
| NormalizeResult result = NormalizeResult::kSuccess; |
| |
| // Strip single leading and trailing dots. |
| if (base::StartsWith(domain, ".")) |
| domain.erase(0, 1); |
| if (base::EndsWith(domain, ".")) |
| domain.pop_back(); |
| |
| // Allow single leading '*.' or '!', saved here so it's not canonicalized. |
| if (base::StartsWith(domain, "!")) { |
| domain.erase(0, 1); |
| rule.exception = true; |
| } else if (base::StartsWith(domain, "*.")) { |
| domain.erase(0, 2); |
| rule.wildcard = true; |
| } |
| if (domain.empty()) { |
| LOG(WARNING) << "Ignoring empty rule"; |
| return NormalizeResult::kWarning; |
| } |
| |
| // Warn about additional '*.' or '!'. |
| if (base::Contains(domain, "*.") || base::Contains(domain, '!')) { |
| LOG(WARNING) << "Keeping probably invalid rule: " << domain; |
| result = NormalizeResult::kWarning; |
| } |
| |
| // Make a GURL and normalize it, then get the host back out. |
| GURL gurl(base::StrCat({"http://", domain})); |
| const std::string& spec = gurl.possibly_invalid_spec(); |
| url::Component host = gurl.parsed_for_possibly_invalid_spec().host; |
| if (!host.is_valid()) { |
| LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << domain; |
| return NormalizeResult::kError; |
| } |
| if (!gurl.is_valid()) { |
| LOG(WARNING) << "Keeping rule that GURL says is invalid: " << domain; |
| result = NormalizeResult::kWarning; |
| } |
| domain.assign(spec.substr(host.begin, host.len)); |
| |
| return result; |
| } |
| |
| NormalizeResult NormalizeDataToRuleMap(const std::string& data, |
| RuleMap& rules) { |
| // We do a lot of string assignment during parsing, but simplicity is more |
| // important than performance here. |
| NormalizeResult result = NormalizeResult::kSuccess; |
| std::istringstream data_stream(data); |
| |
| bool in_private_section = false; |
| RuleMap extra_rules; |
| |
| for (std::string line; std::getline(data_stream, line, '\n');) { |
| if (base::StartsWith(line, kBeginPrivateDomainsComment)) { |
| in_private_section = true; |
| continue; |
| } |
| if (base::StartsWith(line, kEndPrivateDomainsComment)) { |
| in_private_section = false; |
| continue; |
| } |
| if (base::StartsWith(line, "//")) { |
| // Skip comments. |
| continue; |
| } |
| if (line.empty()) { |
| continue; |
| } |
| |
| // Truncate at first whitespace. |
| if (size_t first_whitespace = line.find_first_of("\r\n \t"); |
| first_whitespace != std::string::npos) { |
| line.erase(first_whitespace); |
| } |
| std::string domain = line; |
| |
| Rule rule{/*exception=*/false, /*wildcard=*/false, |
| /*is_private=*/in_private_section}; |
| NormalizeResult new_result = NormalizeRule(domain, rule); |
| result = std::max(result, new_result); |
| if (new_result == NormalizeResult::kError) { |
| continue; |
| } |
| |
| // Check the existing rules to make sure we don't have an exception and |
| // wildcard for the same rule, or that the same domain is listed as both |
| // private and not private. If we did, we'd have to update our |
| // parsing code to handle this case. |
| CHECK(!base::Contains(rules, domain)) |
| << "Duplicate rule found for " << domain; |
| |
| rules[domain] = rule; |
| // Add true TLD for multi-level rules. We don't add them right now, in |
| // case there's an exception or wild card that either exists or might be |
| // added in a later iteration. In those cases, there's no need to add |
| // it and it would just slow down parsing the data. |
| size_t tld_start = domain.find_last_of('.'); |
| if (tld_start != std::string::npos && tld_start + 1 < domain.size()) { |
| std::string extra_rule_domain = domain.substr(tld_start + 1); |
| RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain); |
| // If a rule already exists, we ensure that if any of the entries is not |
| // private the result should be that the entry is not private. An example |
| // is .au which is not listed as a real TLD, but only lists second-level |
| // domains such as com.au. Subdomains of .au (eg. blogspot.com.au) are |
| // also listed in the private section, which is processed later, so this |
| // ensures that the real TLD (eg. .au) is listed as public. |
| bool is_private = in_private_section && |
| (iter == extra_rules.end() || iter->second.is_private); |
| extra_rules[extra_rule_domain] = |
| Rule{/*exception=*/false, /*wildcard=*/false, is_private}; |
| } |
| } |
| |
| base::ranges::copy_if(extra_rules, std::inserter(rules, rules.end()), |
| [&](const auto& extra_rule) { |
| return !base::Contains(rules, extra_rule.first); |
| }); |
| |
| return result; |
| } |
| |
| NormalizeResult NormalizeFile(const base::FilePath& in_filename, |
| const base::FilePath& out_filename) { |
| RuleMap rules; |
| std::string data; |
| if (!base::ReadFileToString(in_filename, &data)) { |
| LOG(ERROR) << "Unable to read file"; |
| // We return success since we've already reported the error. |
| return NormalizeResult::kSuccess; |
| } |
| |
| NormalizeResult result = NormalizeDataToRuleMap(data, rules); |
| |
| if (!base::WriteFile(out_filename, RulesToGperf(rules))) { |
| LOG(ERROR) << "Error(s) writing output file"; |
| result = NormalizeResult::kError; |
| } |
| |
| return result; |
| } |
| |
| } // namespace net::tld_cleanup |