blob: b08b133a846cdd4771d8aa8ded0467dec82217ef [file] [log] [blame] [edit]
#!/usr/bin/env python
from __future__ import absolute_import, division, print_function
import codecs
from operator import itemgetter
import sys
def count_byte_freqs(fpaths):
    """Count how often each byte value occurs in the given UTF-8 text files.

    Each file is read line by line; every line is stripped of leading and
    trailing whitespace, re-encoded as UTF-8, and its bytes are tallied.

    Args:
        fpaths: iterable of file paths to read.

    Returns:
        A list of 256 ints where index ``b`` holds the number of times byte
        value ``b`` was seen.
    """
    freqs = [0] * 256  # byte |--> frequency
    for fpath in fpaths:
        with codecs.open(fpath, 'r', 'utf-8') as fin:
            for line in fin:
                # bytearray iterates as ints on both Python 2 and Python 3,
                # whereas a Python 2 byte string iterates as 1-char strings
                # that cannot be used as list indices.
                for byte in bytearray(line.strip().encode('utf-8')):
                    freqs[byte] += 1
    return freqs


def rank_bytes(freqs):
    """Rank byte values by descending frequency.

    The sort is stable, so bytes with equal frequency keep ascending byte
    order relative to each other.

    Args:
        freqs: list mapping byte value to its frequency.

    Returns:
        A pair ``(orders, by_rank)``: ``orders[byte]`` is the rank of
        ``byte`` (0 = most frequent) and ``by_rank[rank]`` is the byte that
        holds that rank. The two lists are inverse permutations.
    """
    ranked = sorted(enumerate(freqs), key=itemgetter(1), reverse=True)
    by_rank = [byte for byte, _ in ranked]
    orders = [0] * len(freqs)  # byte |--> sort index, descending
    for sort_idx, byte in enumerate(by_rank):
        orders[byte] = sort_idx
    return orders, by_rank


def rust_byte_literal(byte):
    """Return Rust byte-literal source text (``b'..'``) for a byte value.

    Args:
        byte: int in ``range(256)``.

    Returns:
        A string such as ``b'a'``, ``b'\\n'`` or ``b'\\xc8'``.
    """
    if byte == 0x27:
        # repr("'") is "'" wrapped in double quotes, which would render as a
        # Rust byte *string* (b"'") instead of a u8; escape it explicitly.
        return "b'\\''"
    if byte <= 127:
        return 'b%r' % chr(byte)
    return "b'\\x%x'" % byte


def render_rust(orders, by_rank):
    """Render the forward and inverse tables as Rust source text.

    Args:
        orders: list of 256 ranks indexed by byte value.
        by_rank: list of 256 byte values indexed by rank.

    Returns:
        The Rust source for ``COMMON_INPUTS`` and ``COMMON_INPUTS_INV`` as a
        single newline-joined string (no trailing newline).
    """
    olines = ['pub const COMMON_INPUTS: [u8; 256] = [']
    for byte in range(256):
        olines.append(' %3d, // %r' % (orders[byte], chr(byte)))
    olines.append('];')
    olines.append('')
    olines.append('pub const COMMON_INPUTS_INV: [u8; 256] = [')
    for sort_idx in range(256):
        olines.append(' %s,' % rust_byte_literal(by_rank[sort_idx]))
    olines.append('];')
    return '\n'.join(olines)


def main():
    """Read the files named on the command line and print the Rust tables."""
    # Get frequency counts of each byte.
    freqs = count_byte_freqs(sys.argv[1:])
    # Create the inverse mapping.
    orders, by_rank = rank_bytes(freqs)
    # Now write Rust.
    print(render_rust(orders, by_rank))


if __name__ == '__main__':
    main()