vendor/regex-0.2.11/scripts/frequencies.py - toolchain/rustc - Git at Google

 #!/usr/bin/env python

 # This does simple normalized frequency analysis on UTF-8 encoded text. The
 # result of the analysis is translated to a ranked list, where every byte is
 # assigned a rank. This list is printed to stdout as Rust source; redirect
 # the output to src/freqs.rs.
 #
 # Currently, the frequencies are generated from the following corpuses:
 #
 #   * The CIA world fact book
 #   * The source code of rustc
 #   * Septuaginta

 from __future__ import absolute_import, division, print_function

 import argparse
 from collections import Counter
 import sys

 # Header text that main() prints ahead of the generated table: the license
 # notice followed by a note that the output is machine-generated.
 preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

 // NOTE: The following code was generated by "scripts/frequencies.py", do not
 // edit directly
 '''


 def eprint(*args, **kwargs):
     """Print to standard error, taking the same arguments as print().

     Any ``file`` keyword supplied by the caller is overridden, so the
     output always goes to ``sys.stderr``.
     """
     # Build a fresh keyword mapping with ``file`` forced to stderr.
     print(*args, **dict(kwargs, file=sys.stderr))


 def main():
     """Rank every byte value by corpus frequency and print a Rust table.

     Each FILE named on the command line is read as raw bytes. For every
     file, each byte occurrence contributes ``1 / len(file)`` to that byte
     value's frequency, so every non-empty file carries equal total weight.
     The frequencies are turned into ranks in the range 0-255 and printed to
     stdout as a ``BYTE_FREQUENCIES`` array, preceded by ``preamble``.
     Progress messages are sent to stderr through ``eprint``.
     """
     p = argparse.ArgumentParser()
     p.add_argument('corpus', metavar='FILE', nargs='+')
     args = p.parse_args()

     # Seed every byte value with a zero count so that bytes which never
     # occur in the corpus still take part in the ranking.
     freqs = Counter()
     for i in range(256):
         freqs[i] = 0

     eprint('reading entire corpus into memory')
     corpus = []
     for fpath in args.corpus:
         # Use a context manager so each file handle is closed promptly.
         with open(fpath, 'rb') as f:
             # bytearray iterates as ints on both Python 2 and Python 3,
             # whereas a Python 2 byte string iterates as 1-char strings,
             # which cannot be used to index the rank list below.
             corpus.append(bytearray(f.read()))

     eprint('computing byte frequencies')
     for c in corpus:
         if not c:
             # An empty file contributes nothing (and has no valid weight).
             continue
         # Per-occurrence weight is constant for a file; compute it once.
         weight = 1.0 / float(len(c))
         for byte in c:
             freqs[byte] += weight

     eprint('writing Rust code')
     # Get the rank of each byte. A lower rank => lower relative frequency.
     rank = [0] * 256
     for i, (byte, _) in enumerate(freqs.most_common()):
         rank[byte] = 255 - i

     # Forcefully set the highest rank possible for bytes that start multi-byte
     # UTF-8 sequences. The idea here is that a continuation byte will be more
     # discerning in a homogeneous haystack.
     for byte in range(0xC0, 0xFF + 1):
         rank[byte] = 255

     # Now write Rust: one array entry per byte value, annotated with the
     # repr of the corresponding character.
     olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
     for byte in range(256):
         olines.append('    %3d, // %r' % (rank[byte], chr(byte)))
     olines.append('];')

     print(preamble)
     print('\n'.join(olines))

 # Run the analysis only when this file is executed as a script, not when it
 # is imported as a module.
 if __name__ == '__main__':
     main()
	#!/usr/bin/env python

	# This does simple normalized frequency analysis on UTF-8 encoded text. The
	# result of the analysis is translated to a ranked list, where every byte is
	# assigned a rank. This list is printed to stdout as Rust source; redirect
	# the output to src/freqs.rs.
	#
	# Currently, the frequencies are generated from the following corpuses:
	#
	# * The CIA world fact book
	# * The source code of rustc
	# * Septuaginta

	from __future__ import absolute_import, division, print_function

	import argparse
	from collections import Counter
	import sys

	# Header text that main() prints ahead of the generated table: the license
	# notice followed by a note that the output is machine-generated.
	preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
	// file at the top-level directory of this distribution and at
	// http://rust-lang.org/COPYRIGHT.
	//
	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	// option. This file may not be copied, modified, or distributed
	// except according to those terms.

	// NOTE: The following code was generated by "scripts/frequencies.py", do not
	// edit directly
	'''


	def eprint(*args, **kwargs):
	    """Print to standard error, taking the same arguments as print().

	    Any ``file`` keyword supplied by the caller is overridden, so the
	    output always goes to ``sys.stderr``.
	    """
	    # kwargs must be a real keyword dict (``**kwargs``) for this
	    # assignment and the forwarding call below to work.
	    kwargs['file'] = sys.stderr
	    print(*args, **kwargs)


	def main():
	    """Rank every byte value by corpus frequency and print a Rust table.

	    Each FILE named on the command line is read as raw bytes. For every
	    file, each byte occurrence contributes ``1 / len(file)`` to that byte
	    value's frequency, so every non-empty file carries equal total weight.
	    The frequencies are turned into ranks in the range 0-255 and printed
	    to stdout as a ``BYTE_FREQUENCIES`` array, preceded by ``preamble``.
	    Progress messages are sent to stderr through ``eprint``.
	    """
	    p = argparse.ArgumentParser()
	    p.add_argument('corpus', metavar='FILE', nargs='+')
	    args = p.parse_args()

	    # Seed every byte value with a zero count so that bytes which never
	    # occur in the corpus still take part in the ranking.
	    freqs = Counter()
	    for i in range(256):
	        freqs[i] = 0

	    eprint('reading entire corpus into memory')
	    corpus = []
	    for fpath in args.corpus:
	        # Use a context manager so each file handle is closed promptly.
	        with open(fpath, 'rb') as f:
	            # bytearray iterates as ints on both Python 2 and Python 3,
	            # whereas a Python 2 byte string iterates as 1-char strings,
	            # which cannot be used to index the rank list below.
	            corpus.append(bytearray(f.read()))

	    eprint('computing byte frequencies')
	    for c in corpus:
	        if not c:
	            # An empty file contributes nothing (and has no valid weight).
	            continue
	        # Per-occurrence weight is constant for a file; compute it once.
	        weight = 1.0 / float(len(c))
	        for byte in c:
	            freqs[byte] += weight

	    eprint('writing Rust code')
	    # Get the rank of each byte. A lower rank => lower relative frequency.
	    rank = [0] * 256
	    for i, (byte, _) in enumerate(freqs.most_common()):
	        rank[byte] = 255 - i

	    # Forcefully set the highest rank possible for bytes that start
	    # multi-byte UTF-8 sequences. The idea here is that a continuation byte
	    # will be more discerning in a homogeneous haystack.
	    for byte in range(0xC0, 0xFF + 1):
	        rank[byte] = 255

	    # Now write Rust: one array entry per byte value, annotated with the
	    # repr of the corresponding character.
	    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
	    for byte in range(256):
	        olines.append(' %3d, // %r' % (rank[byte], chr(byte)))
	    olines.append('];')

	    print(preamble)
	    print('\n'.join(olines))

	# Run the analysis only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == '__main__':
	    main()