// net/tools/tld_cleanup/tld_cleanup_util.cc - platform/external/cronet - Git at Google

 // Copyright 2013 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "net/tools/tld_cleanup/tld_cleanup_util.h"

 #include <sstream>
 #include <string>

 #include "base/containers/contains.h"
 #include "base/files/file_util.h"
 #include "base/logging.h"
 #include "base/ranges/algorithm.h"
 #include "base/strings/strcat.h"
 #include "base/strings/string_number_conversions.h"
 #include "base/strings/string_util.h"
 #include "url/gurl.h"
 #include "url/third_party/mozilla/url_parse.h"

 namespace {

 // Marker comment lines that open and close the private-domains section of the
 // rule data being parsed.
 constexpr char kBeginPrivateDomainsComment[] =
     "// ===BEGIN PRIVATE DOMAINS===";
 constexpr char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

 // Flag values written into the "type" column of the generated gperf table.
 // RulesToGperf() emits at most one of kExceptionRule / kWildcardRule and adds
 // kPrivateRule on top for private rules.
 constexpr int kExceptionRule = 1;
 constexpr int kWildcardRule = 2;
 constexpr int kPrivateRule = 4;

 }  // namespace

 namespace net::tld_cleanup {

 // Renders |rules| as the text of a gperf input file. The output begins with a
 // fixed prologue (license banner and the DomainRule struct declaration),
 // continues with one "<domain>, <type>" line per rule, and ends with a "%%"
 // line.
 std::string RulesToGperf(const RuleMap& rules) {
   static constexpr char kPrologue[] =
       "%{\n"
       "// Copyright 2012 The Chromium Authors\n"
       "// Use of this source code is governed by a BSD-style license "
       "that can be\n"
       "// found in the LICENSE file.\n\n"
       "// This file is generated by net/tools/tld_cleanup/.\n"
       "// DO NOT MANUALLY EDIT!\n"
       "%}\n"
       "struct DomainRule {\n"
       "  int name_offset;\n"
       "  int type;  // flags: 1: exception, 2: wildcard, 4: private\n"
       "};\n"
       "%%\n";

   std::string output(kPrologue);
   for (const auto& entry : rules) {
     const auto& flags = entry.second;

     // An exception rule wins over a wildcard; the private bit is independent
     // of both and is combined on top of whichever was chosen.
     int type_bits = 0;
     if (flags.exception) {
       type_bits = kExceptionRule;
     } else if (flags.wildcard) {
       type_bits = kWildcardRule;
     }
     if (flags.is_private) {
       type_bits |= kPrivateRule;
     }

     output += entry.first;
     output += ", ";
     output += base::NumberToString(type_bits);
     output += "\n";
   }
   output += "%%\n";
   return output;
 }

 // Rewrites |domain| in place into canonical form and records on |rule| whether
 // it carried a leading exception ("!") or wildcard ("*.") marker.
 //
 // At most one dot is removed from each end, then at most one leading marker,
 // and the remainder is canonicalized by parsing it as the host of an http URL
 // with GURL.
 //
 // Returns kSuccess when nothing suspicious was seen. Returns kWarning (after
 // logging a warning) when the rule is empty, still contains a marker, or GURL
 // reports the URL as invalid. Returns kError (after logging an error) when no
 // valid host component could be extracted.
 NormalizeResult NormalizeRule(std::string& domain, Rule& rule) {
   // Drop at most one dot from each end.
   if (base::StartsWith(domain, ".")) {
     domain.erase(domain.begin());
   }
   if (base::EndsWith(domain, ".")) {
     domain.erase(domain.size() - 1);
   }

   // A single leading marker is remembered on |rule| and removed so that it is
   // not fed through canonicalization. '!' is tested before "*.".
   const bool is_exception = base::StartsWith(domain, "!");
   const bool is_wildcard = !is_exception && base::StartsWith(domain, "*.");
   if (is_exception) {
     rule.exception = true;
     domain.erase(0, 1);
   } else if (is_wildcard) {
     rule.wildcard = true;
     domain.erase(0, 2);
   }

   if (domain.empty()) {
     LOG(WARNING) << "Ignoring empty rule";
     return NormalizeResult::kWarning;
   }

   NormalizeResult outcome = NormalizeResult::kSuccess;

   // Any marker that is still present at this point is unexpected.
   const bool has_stray_marker =
       base::Contains(domain, "*.") || base::Contains(domain, '!');
   if (has_stray_marker) {
     LOG(WARNING) << "Keeping probably invalid rule: " << domain;
     outcome = NormalizeResult::kWarning;
   }

   // Canonicalize through GURL and read the host component back out of the
   // (possibly invalid) spec.
   const GURL canonical_url(base::StrCat({"http://", domain}));
   const url::Component host_part =
       canonical_url.parsed_for_possibly_invalid_spec().host;
   if (!host_part.is_valid()) {
     LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << domain;
     return NormalizeResult::kError;
   }
   if (!canonical_url.is_valid()) {
     LOG(WARNING) << "Keeping rule that GURL says is invalid: " << domain;
     outcome = NormalizeResult::kWarning;
   }

   domain = canonical_url.possibly_invalid_spec().substr(host_part.begin,
                                                         host_part.len);
   return outcome;
 }

 // Parses |data| one line at a time and adds the normalized rules to |rules|.
 //
 // Lines starting with the begin/end private-domains markers switch the
 // "private" flag applied to the rules that follow. Other lines starting with
 // "//" and empty lines are skipped. Every remaining line is truncated at its
 // first whitespace character and passed to NormalizeRule(). Rules that fail
 // with kError, or that normalize to an empty domain, are not added.
 //
 // For every rule containing a dot, the label after the last dot is also
 // remembered and, once all lines are processed, added as a plain rule unless
 // |rules| already has an entry for it.
 //
 // CHECK-fails if the same normalized domain is seen twice. Returns the largest
 // NormalizeResult produced by any rule.
 NormalizeResult NormalizeDataToRuleMap(const std::string& data,
                                        RuleMap& rules) {
   // We do a lot of string assignment during parsing, but simplicity is more
   // important than performance here.
   NormalizeResult result = NormalizeResult::kSuccess;
   std::istringstream data_stream(data);

   bool in_private_section = false;
   RuleMap extra_rules;

   for (std::string line; std::getline(data_stream, line, '\n');) {
     if (base::StartsWith(line, kBeginPrivateDomainsComment)) {
       in_private_section = true;
       continue;
     }
     if (base::StartsWith(line, kEndPrivateDomainsComment)) {
       in_private_section = false;
       continue;
     }
     if (base::StartsWith(line, "//")) {
       // Skip comments.
       continue;
     }
     if (line.empty()) {
       continue;
     }

     // Truncate at first whitespace.
     if (size_t first_whitespace = line.find_first_of("\r\n \t");
         first_whitespace != std::string::npos) {
       line.erase(first_whitespace);
     }
     std::string domain = line;

     Rule rule{/*exception=*/false, /*wildcard=*/false,
               /*is_private=*/in_private_section};
     NormalizeResult new_result = NormalizeRule(domain, rule);
     result = std::max(result, new_result);
     if (new_result == NormalizeResult::kError) {
       continue;
     }
     // NormalizeRule() reports an empty rule (for example a line that held
     // only whitespace, such as the "\r" left on blank lines of CRLF input)
     // with a warning rather than an error. Such a rule must really be
     // ignored: storing it would create an entry with an empty domain, and a
     // second one would trip the duplicate CHECK below.
     if (domain.empty()) {
       continue;
     }

     // Check the existing rules to make sure we don't have an exception and
     // wildcard for the same rule, or that the same domain is listed as both
     // private and not private. If we did, we'd have to update our
     // parsing code to handle this case.
     CHECK(!base::Contains(rules, domain))
         << "Duplicate rule found for " << domain;

     rules[domain] = rule;
     // Add true TLD for multi-level rules.  We don't add them right now, in
     // case there's an exception or wild card that either exists or might be
     // added in a later iteration.  In those cases, there's no need to add
     // it and it would just slow down parsing the data.
     size_t tld_start = domain.find_last_of('.');
     if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
       std::string extra_rule_domain = domain.substr(tld_start + 1);
       RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
       // If a rule already exists, we ensure that if any of the entries is not
       // private the result should be that the entry is not private.  An example
       // is .au which is not listed as a real TLD, but only lists second-level
       // domains such as com.au. Subdomains of .au (eg. blogspot.com.au) are
       // also listed in the private section, which is processed later, so this
       // ensures that the real TLD (eg. .au) is listed as public.
       bool is_private = in_private_section &&
                         (iter == extra_rules.end() || iter->second.is_private);
       extra_rules[extra_rule_domain] =
           Rule{/*exception=*/false, /*wildcard=*/false, is_private};
     }
   }

   // Merge in the implied TLD rules, keeping any explicit rule that already
   // exists for the same domain.
   base::ranges::copy_if(extra_rules, std::inserter(rules, rules.end()),
                         [&](const auto& extra_rule) {
                           return !base::Contains(rules, extra_rule.first);
                         });

   return result;
 }

 // Reads the rule data in |in_filename|, normalizes it, and writes the
 // resulting gperf input to |out_filename|.
 //
 // A read failure is logged and reported as kSuccess. A write failure is
 // logged and reported as kError. Otherwise the result of
 // NormalizeDataToRuleMap() is returned.
 NormalizeResult NormalizeFile(const base::FilePath& in_filename,
                               const base::FilePath& out_filename) {
   std::string contents;
   const bool read_ok = base::ReadFileToString(in_filename, &contents);
   if (!read_ok) {
     LOG(ERROR) << "Unable to read file";
     // The failure has already been reported through the log, so it is
     // deliberately not reflected in the return value.
     return NormalizeResult::kSuccess;
   }

   RuleMap parsed_rules;
   const NormalizeResult parse_result =
       NormalizeDataToRuleMap(contents, parsed_rules);

   const bool write_ok =
       base::WriteFile(out_filename, RulesToGperf(parsed_rules));
   if (write_ok) {
     return parse_result;
   }

   LOG(ERROR) << "Error(s) writing output file";
   return NormalizeResult::kError;
 }

 }  // namespace net::tld_cleanup
	// Copyright 2013 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "net/tools/tld_cleanup/tld_cleanup_util.h"

	#include <sstream>
	#include <string>

	#include "base/containers/contains.h"
	#include "base/files/file_util.h"
	#include "base/logging.h"
	#include "base/ranges/algorithm.h"
	#include "base/strings/strcat.h"
	#include "base/strings/string_number_conversions.h"
	#include "base/strings/string_util.h"
	#include "url/gurl.h"
	#include "url/third_party/mozilla/url_parse.h"

	namespace {

	// Marker comment lines that open and close the private-domains section of the
	// rule data being parsed.
	constexpr char kBeginPrivateDomainsComment[] =
	    "// ===BEGIN PRIVATE DOMAINS===";
	constexpr char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

	// Flag values written into the "type" column of the generated gperf table.
	// RulesToGperf() emits at most one of kExceptionRule / kWildcardRule and adds
	// kPrivateRule on top for private rules.
	constexpr int kExceptionRule = 1;
	constexpr int kWildcardRule = 2;
	constexpr int kPrivateRule = 4;

	}  // namespace

	namespace net::tld_cleanup {

	// Renders |rules| as the text of a gperf input file. The output begins with a
	// fixed prologue (license banner and the DomainRule struct declaration),
	// continues with one "<domain>, <type>" line per rule, and ends with a "%%"
	// line.
	std::string RulesToGperf(const RuleMap& rules) {
	  static constexpr char kPrologue[] =
	      "%{\n"
	      "// Copyright 2012 The Chromium Authors\n"
	      "// Use of this source code is governed by a BSD-style license "
	      "that can be\n"
	      "// found in the LICENSE file.\n\n"
	      "// This file is generated by net/tools/tld_cleanup/.\n"
	      "// DO NOT MANUALLY EDIT!\n"
	      "%}\n"
	      "struct DomainRule {\n"
	      " int name_offset;\n"
	      " int type; // flags: 1: exception, 2: wildcard, 4: private\n"
	      "};\n"
	      "%%\n";

	  std::string output(kPrologue);
	  for (const auto& entry : rules) {
	    const auto& flags = entry.second;

	    // An exception rule wins over a wildcard; the private bit is independent
	    // of both and is combined on top of whichever was chosen.
	    int type_bits = 0;
	    if (flags.exception) {
	      type_bits = kExceptionRule;
	    } else if (flags.wildcard) {
	      type_bits = kWildcardRule;
	    }
	    if (flags.is_private) {
	      type_bits |= kPrivateRule;
	    }

	    output += entry.first;
	    output += ", ";
	    output += base::NumberToString(type_bits);
	    output += "\n";
	  }
	  output += "%%\n";
	  return output;
	}

	// Adjusts the rule to a standard form: removes single extraneous dots and
	// canonicalizes it using GURL. A single leading '!' or '*.' marker is removed
	// from |domain| and recorded on |rule| as exception / wildcard.
	//
	// Returns kSuccess if the rule is interpreted as valid; logs a warning and
	// returns kWarning if it is empty or probably invalid; and logs an error and
	// returns kError if no valid host could be extracted from it.
	NormalizeResult NormalizeRule(std::string& domain, Rule& rule) {
	  NormalizeResult result = NormalizeResult::kSuccess;

	  // Strip single leading and trailing dots.
	  if (base::StartsWith(domain, ".")) {
	    domain.erase(0, 1);
	  }
	  if (base::EndsWith(domain, ".")) {
	    domain.pop_back();
	  }

	  // Allow single leading '*.' or '!', saved here so it's not canonicalized.
	  if (base::StartsWith(domain, "!")) {
	    domain.erase(0, 1);
	    rule.exception = true;
	  } else if (base::StartsWith(domain, "*.")) {
	    domain.erase(0, 2);
	    rule.wildcard = true;
	  }
	  if (domain.empty()) {
	    LOG(WARNING) << "Ignoring empty rule";
	    return NormalizeResult::kWarning;
	  }

	  // Warn about additional '*.' or '!'.
	  if (base::Contains(domain, "*.") || base::Contains(domain, '!')) {
	    LOG(WARNING) << "Keeping probably invalid rule: " << domain;
	    result = NormalizeResult::kWarning;
	  }

	  // Make a GURL and normalize it, then get the host back out.
	  GURL gurl(base::StrCat({"http://", domain}));
	  const std::string& spec = gurl.possibly_invalid_spec();
	  url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
	  if (!host.is_valid()) {
	    LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << domain;
	    return NormalizeResult::kError;
	  }
	  if (!gurl.is_valid()) {
	    LOG(WARNING) << "Keeping rule that GURL says is invalid: " << domain;
	    result = NormalizeResult::kWarning;
	  }
	  domain.assign(spec.substr(host.begin, host.len));

	  return result;
	}

	// Parses |data| one line at a time and adds the normalized rules to |rules|.
	//
	// Lines starting with the begin/end private-domains markers switch the
	// "private" flag applied to the rules that follow. Other lines starting with
	// "//" and empty lines are skipped. Every remaining line is truncated at its
	// first whitespace character and passed to NormalizeRule(). Rules that fail
	// with kError, or that normalize to an empty domain, are not added.
	//
	// For every rule containing a dot, the label after the last dot is also
	// remembered and, once all lines are processed, added as a plain rule unless
	// |rules| already has an entry for it.
	//
	// CHECK-fails if the same normalized domain is seen twice. Returns the largest
	// NormalizeResult produced by any rule.
	NormalizeResult NormalizeDataToRuleMap(const std::string& data,
	                                       RuleMap& rules) {
	  // We do a lot of string assignment during parsing, but simplicity is more
	  // important than performance here.
	  NormalizeResult result = NormalizeResult::kSuccess;
	  std::istringstream data_stream(data);

	  bool in_private_section = false;
	  RuleMap extra_rules;

	  for (std::string line; std::getline(data_stream, line, '\n');) {
	    if (base::StartsWith(line, kBeginPrivateDomainsComment)) {
	      in_private_section = true;
	      continue;
	    }
	    if (base::StartsWith(line, kEndPrivateDomainsComment)) {
	      in_private_section = false;
	      continue;
	    }
	    if (base::StartsWith(line, "//")) {
	      // Skip comments.
	      continue;
	    }
	    if (line.empty()) {
	      continue;
	    }

	    // Truncate at first whitespace.
	    if (size_t first_whitespace = line.find_first_of("\r\n \t");
	        first_whitespace != std::string::npos) {
	      line.erase(first_whitespace);
	    }
	    std::string domain = line;

	    Rule rule{/*exception=*/false, /*wildcard=*/false,
	              /*is_private=*/in_private_section};
	    NormalizeResult new_result = NormalizeRule(domain, rule);
	    result = std::max(result, new_result);
	    if (new_result == NormalizeResult::kError) {
	      continue;
	    }
	    // NormalizeRule() reports an empty rule (for example a line that held
	    // only whitespace, such as the "\r" left on blank lines of CRLF input)
	    // with a warning rather than an error. Such a rule must really be
	    // ignored: storing it would create an entry with an empty domain, and a
	    // second one would trip the duplicate CHECK below.
	    if (domain.empty()) {
	      continue;
	    }

	    // Check the existing rules to make sure we don't have an exception and
	    // wildcard for the same rule, or that the same domain is listed as both
	    // private and not private. If we did, we'd have to update our
	    // parsing code to handle this case.
	    CHECK(!base::Contains(rules, domain))
	        << "Duplicate rule found for " << domain;

	    rules[domain] = rule;
	    // Add true TLD for multi-level rules. We don't add them right now, in
	    // case there's an exception or wild card that either exists or might be
	    // added in a later iteration. In those cases, there's no need to add
	    // it and it would just slow down parsing the data.
	    size_t tld_start = domain.find_last_of('.');
	    if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
	      std::string extra_rule_domain = domain.substr(tld_start + 1);
	      RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
	      // If a rule already exists, we ensure that if any of the entries is not
	      // private the result should be that the entry is not private. An example
	      // is .au which is not listed as a real TLD, but only lists second-level
	      // domains such as com.au. Subdomains of .au (eg. blogspot.com.au) are
	      // also listed in the private section, which is processed later, so this
	      // ensures that the real TLD (eg. .au) is listed as public.
	      bool is_private = in_private_section &&
	                        (iter == extra_rules.end() || iter->second.is_private);
	      extra_rules[extra_rule_domain] =
	          Rule{/*exception=*/false, /*wildcard=*/false, is_private};
	    }
	  }

	  // Merge in the implied TLD rules, keeping any explicit rule that already
	  // exists for the same domain.
	  base::ranges::copy_if(extra_rules, std::inserter(rules, rules.end()),
	                        [&](const auto& extra_rule) {
	                          return !base::Contains(rules, extra_rule.first);
	                        });

	  return result;
	}

	// Reads the rule data in |in_filename|, normalizes it, and writes the
	// resulting gperf input to |out_filename|.
	//
	// A read failure is logged and reported as kSuccess. A write failure is
	// logged and reported as kError. Otherwise the result of
	// NormalizeDataToRuleMap() is returned.
	NormalizeResult NormalizeFile(const base::FilePath& in_filename,
	                              const base::FilePath& out_filename) {
	  std::string contents;
	  const bool read_ok = base::ReadFileToString(in_filename, &contents);
	  if (!read_ok) {
	    LOG(ERROR) << "Unable to read file";
	    // The failure has already been reported through the log, so it is
	    // deliberately not reflected in the return value.
	    return NormalizeResult::kSuccess;
	  }

	  RuleMap parsed_rules;
	  const NormalizeResult parse_result =
	      NormalizeDataToRuleMap(contents, parsed_rules);

	  const bool write_ok =
	      base::WriteFile(out_filename, RulesToGperf(parsed_rules));
	  if (write_ok) {
	    return parse_result;
	  }

	  LOG(ERROR) << "Error(s) writing output file";
	  return NormalizeResult::kError;
	}

	} // namespace net::tld_cleanup