| #!/usr/bin/env python |
| |
# This does simple normalized frequency analysis on UTF-8 encoded text. The
# result of the analysis is translated to a ranked list, where every byte is
# assigned a rank. The list is printed to stdout as Rust source code, which is
# meant to be redirected to src/freqs.rs.
| # |
| # Currently, the frequencies are generated from the following corpuses: |
| # |
| # * The CIA world fact book |
| # * The source code of rustc |
| # * Septuaginta |
| |
| from __future__ import absolute_import, division, print_function |
| |
| import argparse |
| from collections import Counter |
| import sys |
| |
# Text printed ahead of the generated table: the Rust project license notice
# followed by a note that the output is generated and must not be edited by
# hand. It is emitted verbatim, so every character here ends up in the output.
preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/frequencies.py", do not
// edit directly
'''
| |
| |
def eprint(*args, **kwargs):
    """Print to standard error, forwarding every argument to ``print``.

    Accepts the same positional and keyword arguments as ``print``. A
    ``file`` keyword supplied by the caller is ignored: the destination is
    always ``sys.stderr``.
    """
    # Build the final keyword set in one step; ``file`` always wins.
    print(*args, **dict(kwargs, file=sys.stderr))
| |
| |
def _byte_frequencies(corpus):
    """Return a Counter mapping each byte value to its summed frequency.

    ``corpus`` is an iterable of byte strings. For every document, each byte
    value contributes ``occurrences / len(document)``, so documents are
    weighted equally regardless of their size. All 256 byte values are
    present in the result; values that never occur keep a frequency of 0.
    Empty documents contribute nothing.
    """
    freqs = Counter()
    # Seed every byte value so that unseen bytes still receive a rank.
    for value in range(256):
        freqs[value] = 0

    for data in corpus:
        total = float(len(data))
        # bytearray yields integers on both Python 2 and Python 3, whereas
        # iterating a Python 2 byte string yields 1-character strings.
        # Counting with integers first and dividing once per byte value is
        # exact, unlike adding 1/len repeatedly.
        for value, count in Counter(bytearray(data)).items():
            freqs[value] += count / total
    return freqs


def _rank_bytes(freqs):
    """Translate byte frequencies into a list of 256 ranks in ``0..255``.

    A lower rank => lower relative frequency.
    """
    rank = [0] * 256
    for position, (value, _) in enumerate(freqs.most_common()):
        rank[value] = 255 - position

    # Forcefully set the highest rank possible for bytes that start multi-byte
    # UTF-8 sequences. The idea here is that a continuation byte will be more
    # discerning in a homogeneous haystack.
    for lead in range(0xC0, 0xFF + 1):
        rank[lead] = 255
    return rank


def _byte_comment(value):
    """Return an ASCII-only quoted rendering of byte ``value``.

    ASCII bytes use the interpreter's ``repr``; bytes >= 0x80 are written as
    a ``'\\xNN'`` escape so the generated file never contains non-ASCII text
    and looks the same under Python 2 and Python 3.
    """
    if value < 0x80:
        return repr(chr(value))
    return "'\\x%02x'" % value


def main():
    """Read the corpus files named on the command line and print Rust code.

    Each positional argument is a path to a corpus file, read as raw bytes.
    Progress messages go to stderr; the preamble and the
    ``BYTE_FREQUENCIES`` table go to stdout.
    """
    p = argparse.ArgumentParser()
    p.add_argument('corpus', metavar='FILE', nargs='+')
    args = p.parse_args()

    eprint('reading entire corpus into memory')
    corpus = []
    for fpath in args.corpus:
        # Close each file deterministically instead of leaking the handle.
        with open(fpath, 'rb') as f:
            corpus.append(f.read())

    eprint('computing byte frequencies')
    freqs = _byte_frequencies(corpus)

    eprint('writing Rust code')
    rank = _rank_bytes(freqs)

    # Now write Rust.
    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
    for byte in range(256):
        olines.append(' %3d, // %s' % (rank[byte], _byte_comment(byte)))
    olines.append('];')

    print(preamble)
    print('\n'.join(olines))
| |
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()