| #!/usr/bin/python |
| |
| # Copyright Mozilla Foundation. See the COPYRIGHT |
| # file at the top-level directory of this distribution. |
| # |
| # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| # https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| # <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
| # option. This file may not be copied, modified, or distributed |
| # except according to those terms. |
| |
| import json |
| import subprocess |
| import sys |
| import os.path |
| |
| if (not os.path.isfile("../encoding/encodings.json")) or (not os.path.isfile("../encoding/indexes.json")): |
    sys.stderr.write("This script needs a clone of https://github.com/whatwg/encoding/ (preferably at revision f381389) next to the encoding_rs directory.\n")
    sys.exit(-1)
| |
| if not os.path.isfile("../encoding_c/src/lib.rs"): |
    sys.stderr.write("This script also writes the generated parts of the encoding_c crate and needs a clone of https://github.com/hsivonen/encoding_c next to the encoding_rs directory.\n")
    sys.exit(-1)
| |
| if not os.path.isfile("../codepage/src/lib.rs"): |
    sys.stderr.write("This script also writes the generated parts of the codepage crate and needs a clone of https://github.com/hsivonen/codepage next to the encoding_rs directory.\n")
    sys.exit(-1)
| |
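# Comparator that sorts primarily by length and secondarily by comparing
# characters from the end of the string, presumably because encoding labels
# and names most often differ near the end (e.g. trailing digits).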
| def cmp_from_end(one, other): |
| c = cmp(len(one), len(other)) |
| if c != 0: |
| return c |
| i = len(one) - 1 |
| while i >= 0: |
| c = cmp(one[i], other[i]) |
| if c != 0: |
| return c |
| i -= 1 |
| return 0 |
| |
| |
| class Label: |
| def __init__(self, label, preferred): |
| self.label = label |
| self.preferred = preferred |
| def __cmp__(self, other): |
| return cmp_from_end(self.label, other.label) |
| |
| class CodePage: |
| def __init__(self, code_page, preferred): |
| self.code_page = code_page |
| self.preferred = preferred |
| def __cmp__(self, other): |
        return cmp(self.code_page, other.code_page)
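
# The sanity checks below use `raise Error()`; define the class here so a
# failed check raises a proper exception instead of a NameError.
class Error(Exception):
    pass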
| |
| def static_u16_table(name, data): |
| data_file.write('''pub static %s: [u16; %d] = [ |
| ''' % (name, len(data))) |
| |
| for i in xrange(len(data)): |
| data_file.write('0x%04X,\n' % data[i]) |
| |
| data_file.write(''']; |
| |
| ''') |
| |
| def static_u16_table_from_indexable(name, data, item, feature): |
| data_file.write('''#[cfg(all( |
| feature = "less-slow-%s", |
| not(feature = "fast-%s") |
| ))] |
| static %s: [u16; %d] = [ |
| ''' % (feature, feature, name, len(data))) |
| |
| for i in xrange(len(data)): |
| data_file.write('0x%04X,\n' % data[i][item]) |
| |
| data_file.write(''']; |
| |
| ''') |
| |
| def static_u8_pair_table_from_indexable(name, data, item, feature): |
| data_file.write('''#[cfg(all( |
| feature = "less-slow-%s", |
| not(feature = "fast-%s") |
| ))] |
| static %s: [[u8; 2]; %d] = [ |
| ''' % (feature, feature, name, len(data))) |
| |
| for i in xrange(len(data)): |
| data_file.write('[0x%02X, 0x%02X],\n' % data[i][item]) |
| |
| data_file.write(''']; |
| |
| ''') |
| |
| def static_u8_pair_table(name, data, feature): |
| data_file.write('''#[cfg(feature = "%s")] |
| static %s: [[u8; 2]; %d] = [ |
| ''' % (feature, name, len(data))) |
| |
| for i in xrange(len(data)): |
| pair = data[i] |
| if not pair: |
| pair = (0, 0) |
| data_file.write('[0x%02X, 0x%02X],\n' % pair) |
| |
| data_file.write(''']; |
| |
| ''') |
| |
| preferred = [] |
| |
| dom = [] |
| |
| labels = [] |
| |
| data = json.load(open("../encoding/encodings.json", "r")) |
| |
| indexes = json.load(open("../encoding/indexes.json", "r")) |
| |
| single_byte = [] |
| |
| multi_byte = [] |
| |
| def to_camel_name(name): |
| if name == u"iso-8859-8-i": |
| return u"Iso8I" |
| if name.startswith(u"iso-8859-"): |
| return name.replace(u"iso-8859-", u"Iso") |
| return name.title().replace(u"X-", u"").replace(u"-", u"").replace(u"_", u"") |
| |
| def to_constant_name(name): |
| return name.replace(u"-", u"_").upper() |
| |
| def to_snake_name(name): |
| return name.replace(u"-", u"_").lower() |
| |
| def to_dom_name(name): |
| return name |
| |
# Guesstimate based on
| # https://w3techs.com/technologies/overview/character_encoding/all |
| # whose methodology is known to be bogus, but the results are credible for |
| # this purpose. UTF-16LE lifted up due to prevalence on Windows and |
| # "ANSI codepages" prioritized. |
| encodings_by_code_page_frequency = [ |
| "UTF-8", |
| "UTF-16LE", |
| "windows-1252", |
| "windows-1251", |
| "GBK", |
| "Shift_JIS", |
| "EUC-KR", |
| "windows-1250", |
| "windows-1256", |
| "windows-1254", |
| "Big5", |
| "windows-874", |
| "windows-1255", |
| "windows-1253", |
| "windows-1257", |
| "windows-1258", |
| "EUC-JP", |
| "ISO-8859-2", |
| "ISO-8859-15", |
| "ISO-8859-7", |
| "KOI8-R", |
| "gb18030", |
| "ISO-8859-5", |
| "ISO-8859-8-I", |
| "ISO-8859-4", |
| "ISO-8859-6", |
| "ISO-2022-JP", |
| "KOI8-U", |
| "ISO-8859-13", |
| "ISO-8859-3", |
| "UTF-16BE", |
| "IBM866", |
| "ISO-8859-10", |
| "ISO-8859-8", |
| "macintosh", |
| "x-mac-cyrillic", |
| "ISO-8859-14", |
| "ISO-8859-16", |
| ] |
| |
| encodings_by_code_page = { |
| 932: "Shift_JIS", |
| 936: "GBK", |
| 949: "EUC-KR", |
| 950: "Big5", |
| 866: "IBM866", |
| 874: "windows-874", |
| 1200: "UTF-16LE", |
| 1201: "UTF-16BE", |
| 1250: "windows-1250", |
| 1251: "windows-1251", |
| 1252: "windows-1252", |
| 1253: "windows-1253", |
| 1254: "windows-1254", |
| 1255: "windows-1255", |
| 1256: "windows-1256", |
| 1257: "windows-1257", |
| 1258: "windows-1258", |
| 10000: "macintosh", |
| 10017: "x-mac-cyrillic", |
| 20866: "KOI8-R", |
| 20932: "EUC-JP", |
| 21866: "KOI8-U", |
| 28592: "ISO-8859-2", |
| 28593: "ISO-8859-3", |
| 28594: "ISO-8859-4", |
| 28595: "ISO-8859-5", |
| 28596: "ISO-8859-6", |
| 28597: "ISO-8859-7", |
| 28598: "ISO-8859-8", |
| 28600: "ISO-8859-10", |
| 28603: "ISO-8859-13", |
| 28604: "ISO-8859-14", |
| 28605: "ISO-8859-15", |
| 28606: "ISO-8859-16", |
| 38598: "ISO-8859-8-I", |
| 50221: "ISO-2022-JP", |
| 54936: "gb18030", |
| 65001: "UTF-8", |
| } |
| |
| code_pages_by_encoding = {} |
| |
| for code_page, encoding in encodings_by_code_page.iteritems(): |
| code_pages_by_encoding[encoding] = code_page |
| |
| encoding_by_alias_code_page = { |
| 951: "Big5", |
| 10007: "x-mac-cyrillic", |
| 20936: "GBK", |
| 20949: "EUC-KR", |
| 21010: "UTF-16LE", # Undocumented; needed by calamine for Excel compat |
| 28591: "windows-1252", |
| 28599: "windows-1254", |
| 28601: "windows-874", |
| 50220: "ISO-2022-JP", |
| 50222: "ISO-2022-JP", |
| 50225: "replacement", # ISO-2022-KR |
| 50227: "replacement", # ISO-2022-CN |
| 51949: "EUC-JP", |
| 51936: "GBK", |
| 51949: "EUC-KR", |
| 52936: "replacement", # HZ |
| } |
| |
| code_pages = [] |
| |
| for name in encodings_by_code_page_frequency: |
| code_pages.append(code_pages_by_encoding[name]) |
| |
| encodings_by_code_page.update(encoding_by_alias_code_page) |
| |
| temp_keys = encodings_by_code_page.keys() |
| temp_keys.sort() |
| for code_page in temp_keys: |
| if not code_page in code_pages: |
| code_pages.append(code_page) |
| |
| # The position in the index (0 is the first index entry, |
| # i.e. byte value 0x80) that starts the longest run of |
| # consecutive code points. Must not be in the first |
| # quadrant. If the character to be encoded is not in this |
| # run, the part of the index after the run is searched |
| # forward. Then the part of the index from 32 to the start |
| # of the run. The first quadrant is searched last. |
| # |
| # If there is no obviously most useful longest run, |
| # the index here is just used to affect the search order. |
| start_of_longest_run_in_single_byte = { |
| "IBM866": 96, # 0 would be longest, but we don't want to start in the first quadrant |
| "windows-874": 33, |
| "windows-1250": 92, |
| "windows-1251": 64, |
| "windows-1252": 32, |
| "windows-1253": 83, |
| "windows-1254": 95, |
| "windows-1255": 96, |
| "windows-1256": 65, |
| "windows-1257": 95, # not actually longest |
| "windows-1258": 95, # not actually longest |
| "macintosh": 106, # useless |
| "x-mac-cyrillic": 96, |
| "KOI8-R": 64, # not actually longest |
| "KOI8-U": 64, # not actually longest |
| "ISO-8859-2": 95, # not actually longest |
| "ISO-8859-3": 95, # not actually longest |
| "ISO-8859-4": 95, # not actually longest |
| "ISO-8859-5": 46, |
| "ISO-8859-6": 65, |
| "ISO-8859-7": 83, |
| "ISO-8859-8": 96, |
| "ISO-8859-10": 90, # not actually longest |
| "ISO-8859-13": 95, # not actually longest |
| "ISO-8859-14": 95, |
| "ISO-8859-15": 63, |
| "ISO-8859-16": 95, # not actually longest |
| } |
| |
| # |
| |
| for group in data: |
| if group["heading"] == "Legacy single-byte encodings": |
| single_byte = group["encodings"] |
| else: |
| multi_byte.extend(group["encodings"]) |
| for encoding in group["encodings"]: |
| preferred.append(encoding["name"]) |
| for label in encoding["labels"]: |
| labels.append(Label(label, encoding["name"])) |
| |
| for name in preferred: |
| dom.append(to_dom_name(name)) |
| |
| preferred.sort() |
| labels.sort() |
| dom.sort(cmp=cmp_from_end) |
| |
| longest_label_length = 0 |
| longest_name_length = 0 |
| longest_label = None |
| longest_name = None |
| |
| for name in preferred: |
| if len(name) > longest_name_length: |
| longest_name_length = len(name) |
| longest_name = name |
| |
| for label in labels: |
| if len(label.label) > longest_label_length: |
| longest_label_length = len(label.label) |
| longest_label = label.label |
| |
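# For windows-1252, for example, this returns (0x00A0, 32, 96): index
# entries 32..127 (bytes 0xA0..0xFF) map to the consecutive code points
# U+00A0..U+00FF.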
| def longest_run_for_single_byte(name): |
| if name == u"ISO-8859-8-I": |
| name = u"ISO-8859-8" |
| index = indexes[name.lower()] |
| run_byte_offset = start_of_longest_run_in_single_byte[name] |
| run_bmp_offset = index[run_byte_offset] |
| previous_code_point = run_bmp_offset |
| run_length = 1 |
| while True: |
| i = run_byte_offset + run_length |
| if i == len(index): |
| break |
| code_point = index[i] |
| if previous_code_point + 1 != code_point: |
| break |
| previous_code_point = code_point |
| run_length += 1 |
| return (run_bmp_offset, run_byte_offset, run_length) |
| |
| def is_single_byte(name): |
| for encoding in single_byte: |
| if name == encoding["name"]: |
| return True |
| return False |
| |
| def read_non_generated(path): |
| partially_generated_file = open(path, "r") |
| full = partially_generated_file.read() |
| partially_generated_file.close() |
| |
| generated_begin = "// BEGIN GENERATED CODE. PLEASE DO NOT EDIT." |
| generated_end = "// END GENERATED CODE" |
| |
| generated_begin_index = full.find(generated_begin) |
| if generated_begin_index < 0: |
| sys.stderr.write("Can't find generated code start marker in %s. Exiting.\n" % path) |
| sys.exit(-1) |
| generated_end_index = full.find(generated_end) |
| if generated_end_index < 0: |
| sys.stderr.write("Can't find generated code end marker in %s. Exiting.\n" % path) |
| sys.exit(-1) |
| |
| return (full[0:generated_begin_index + len(generated_begin)], |
| full[generated_end_index:]) |
| |
| (lib_rs_begin, lib_rs_end) = read_non_generated("src/lib.rs") |
| |
| label_file = open("src/lib.rs", "w") |
| |
| label_file.write(lib_rs_begin) |
| label_file.write(""" |
| // Instead, please regenerate using generate-encoding-data.py |
| |
| const LONGEST_LABEL_LENGTH: usize = %d; // %s |
| |
| """ % (longest_label_length, longest_label)) |
| |
| for name in preferred: |
| variant = None |
| if is_single_byte(name): |
| (run_bmp_offset, run_byte_offset, run_length) = longest_run_for_single_byte(name) |
| variant = "SingleByte(&data::SINGLE_BYTE_DATA.%s, 0x%04X, %d, %d)" % (to_snake_name(u"iso-8859-8" if name == u"ISO-8859-8-I" else name), run_bmp_offset, run_byte_offset, run_length) |
| else: |
| variant = to_camel_name(name) |
| |
| docfile = open("doc/%s.txt" % name, "r") |
| doctext = docfile.read() |
| docfile.close() |
| |
| label_file.write('''/// The initializer for the [%s](static.%s.html) encoding. |
| /// |
| /// For use only for taking the address of this form when |
| /// Rust prohibits the use of the non-`_INIT` form directly, |
| /// such as in initializers of other `static`s. If in doubt, |
| /// use the corresponding non-`_INIT` reference-typed `static`. |
| /// |
| /// This part of the public API will go away if Rust changes |
| /// to make the referent of `pub const FOO: &'static Encoding` |
| /// unique cross-crate or if Rust starts allowing static arrays |
| /// to be initialized with `pub static FOO: &'static Encoding` |
| /// items. |
| pub static %s_INIT: Encoding = Encoding { |
| name: "%s", |
| variant: VariantEncoding::%s, |
| }; |
| |
| /// The %s encoding. |
| /// |
| %s/// |
| /// This will change from `static` to `const` if Rust changes |
| /// to make the referent of `pub const FOO: &'static Encoding` |
| /// unique cross-crate, so don't take the address of this |
| /// `static`. |
| pub static %s: &'static Encoding = &%s_INIT; |
| |
| ''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name), to_dom_name(name), variant, to_dom_name(name), doctext, to_constant_name(name), to_constant_name(name))) |
| |
| label_file.write("""static LABELS_SORTED: [&'static str; %d] = [ |
| """ % len(labels)) |
| |
| for label in labels: |
| label_file.write('''"%s",\n''' % label.label) |
| |
| label_file.write("""]; |
| |
| static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; %d] = [ |
| """ % len(labels)) |
| |
| for label in labels: |
| label_file.write('''&%s_INIT,\n''' % to_constant_name(label.preferred)) |
| |
| label_file.write(''']; |
| |
| ''') |
| label_file.write(lib_rs_end) |
| label_file.close() |
| |
| label_test_file = open("src/test_labels_names.rs", "w") |
| label_test_file.write('''// Any copyright to the test code below this comment is dedicated to the |
| // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
| |
| // THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. |
| // Instead, please regenerate using generate-encoding-data.py |
| |
| use super::*; |
| |
| #[test] |
| fn test_all_labels() { |
| ''') |
| |
| for label in labels: |
| label_test_file.write('''assert_eq!(Encoding::for_label(b"%s"), Some(%s));\n''' % (label.label, to_constant_name(label.preferred))) |
| |
| label_test_file.write('''} |
| ''') |
| label_test_file.close() |
| |
| def null_to_zero(code_point): |
| if not code_point: |
| code_point = 0 |
| return code_point |
| |
| (data_rs_begin, data_rs_end) = read_non_generated("src/data.rs") |
| |
| data_file = open("src/data.rs", "w") |
| data_file.write(data_rs_begin) |
| data_file.write(''' |
| // Instead, please regenerate using generate-encoding-data.py |
| |
| #[repr(align(64))] // Align to cache lines |
| pub struct SingleByteData { |
| ''') |
| |
| # Single-byte |
| |
| for encoding in single_byte: |
| name = encoding["name"] |
| if name == u"ISO-8859-8-I": |
| continue |
| |
| data_file.write(''' pub %s: [u16; 128], |
| ''' % to_snake_name(name)) |
| |
| data_file.write('''} |
| |
| pub static SINGLE_BYTE_DATA: SingleByteData = SingleByteData { |
| ''') |
| |
| for encoding in single_byte: |
| name = encoding["name"] |
| if name == u"ISO-8859-8-I": |
| continue |
| |
| data_file.write(''' %s: [ |
| ''' % to_snake_name(name)) |
| |
| for code_point in indexes[name.lower()]: |
| data_file.write('0x%04X,\n' % null_to_zero(code_point)) |
| |
| data_file.write('''], |
| ''') |
| |
| data_file.write('''}; |
| |
| ''') |
| |
| # Big5 |
| |
| index = indexes["big5"] |
| |
| astralness = [] |
| low_bits = [] |
| |
| for code_point in index[942:19782]: |
| if code_point: |
| astralness.append(1 if code_point > 0xFFFF else 0) |
| low_bits.append(code_point & 0xFFFF) |
| else: |
| astralness.append(0) |
| low_bits.append(0) |
| |
| # pad length to multiple of 32 |
| for j in xrange(32 - (len(astralness) % 32)): |
| astralness.append(0) |
| |
| data_file.write('''#[cfg_attr(feature = "cargo-clippy", allow(unreadable_literal))] |
| static BIG5_ASTRALNESS: [u32; %d] = [ |
| ''' % (len(astralness) / 32)) |
| |
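# Pack the flags 32 per u32 word, least-significant bit first: flag i lands
# in bit (i % 32) of word (i / 32).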
| i = 0 |
| while i < len(astralness): |
| accu = 0 |
| for j in xrange(32): |
| accu |= astralness[i + j] << j |
| data_file.write('0x%08X,\n' % accu) |
| i += 32 |
| |
| data_file.write(''']; |
| |
| ''') |
| |
| static_u16_table("BIG5_LOW_BITS", low_bits) |
| |
| # Encoder table for Level 1 Hanzi |
| # Note: If we were OK with doubling this table, we |
| # could use a directly-indexable table instead... |
| level1_hanzi_index = index[5495:10896] |
| level1_hanzi_pairs = [] |
| for i in xrange(len(level1_hanzi_index)): |
| hanzi_lead = (i / 157) + 0xA4 |
| hanzi_trail = (i % 157) |
| hanzi_trail += 0x40 if hanzi_trail < 0x3F else 0x62 |
| level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail))) |
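# Append by hand a few code points whose byte pairs (lead 0xC8) fall outside
# the Level 1 rows covered by the slice above.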
| level1_hanzi_pairs.append((0x4E5A, (0xC8, 0x7B))) |
| level1_hanzi_pairs.append((0x5202, (0xC8, 0x7D))) |
| level1_hanzi_pairs.append((0x9FB0, (0xC8, 0xA1))) |
| level1_hanzi_pairs.append((0x5188, (0xC8, 0xA2))) |
| level1_hanzi_pairs.append((0x9FB1, (0xC8, 0xA3))) |
| level1_hanzi_pairs.sort(key=lambda x: x[0]) |
| |
| static_u16_table_from_indexable("BIG5_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "big5-hanzi-encode") |
| static_u8_pair_table_from_indexable("BIG5_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "big5-hanzi-encode") |
| |
| # Fast Unified Ideograph encode |
| big5_unified_ideograph_bytes = [None] * (0x9FCC - 0x4E00) |
| for row in xrange(0x7E - 0x20): |
| for column in xrange(157): |
| pointer = 5024 + column + (row * 157) |
| code_point = index[pointer] |
| if code_point and code_point >= 0x4E00 and code_point <= 0x9FCB: |
| unified_offset = code_point - 0x4E00 |
| unified_lead = 0xA1 + row |
| unified_trail = (0x40 if column < 0x3F else 0x62) + column |
| if code_point == 0x5341 or code_point == 0x5345 or not big5_unified_ideograph_bytes[unified_offset]: |
| big5_unified_ideograph_bytes[unified_offset] = (unified_lead, unified_trail) |
| |
| static_u8_pair_table("BIG5_UNIFIED_IDEOGRAPH_BYTES", big5_unified_ideograph_bytes, "fast-big5-hanzi-encode") |
| |
| # JIS0208 |
| |
| index = indexes["jis0208"] |
| |
| # JIS 0208 Level 1 Kanji |
| static_u16_table("JIS0208_LEVEL1_KANJI", index[1410:4375]) |
| |
| # JIS 0208 Level 2 Kanji and Additional Kanji |
| static_u16_table("JIS0208_LEVEL2_AND_ADDITIONAL_KANJI", index[4418:7808]) |
| |
| # IBM Kanji |
| static_u16_table("IBM_KANJI", index[8272:8632]) |
| |
| # Check that the other instance is the same |
| if index[8272:8632] != index[10744:11104]: |
| raise Error() |
| |
| # JIS 0208 symbols (all non-Kanji, non-range items) |
| symbol_index = [] |
| symbol_triples = [] |
| pointers_to_scan = [ |
| (0, 188), |
| (658, 691), |
| (1159, 1221), |
| ] |
| in_run = False |
| run_start_pointer = 0 |
| run_start_array_index = 0 |
| for (start, end) in pointers_to_scan: |
| for i in range(start, end): |
| code_point = index[i] |
| if in_run: |
| if code_point: |
| symbol_index.append(code_point) |
| else: |
| symbol_triples.append(run_start_pointer) |
| symbol_triples.append(i - run_start_pointer) |
| symbol_triples.append(run_start_array_index) |
| in_run = False |
| else: |
| if code_point: |
| in_run = True |
| run_start_pointer = i |
| run_start_array_index = len(symbol_index) |
| symbol_index.append(code_point) |
| if in_run: |
| symbol_triples.append(run_start_pointer) |
| symbol_triples.append(end - run_start_pointer) |
| symbol_triples.append(run_start_array_index) |
| in_run = False |
| if in_run: |
| raise Error() |
| |
| # Now add manually the two overlapping slices of |
| # index from the NEC/IBM extensions. |
| run_start_array_index = len(symbol_index) |
| symbol_index.extend(index[10736:10744]) |
| # Later |
| symbol_triples.append(10736) |
| symbol_triples.append(8) |
| symbol_triples.append(run_start_array_index) |
| # Earlier |
| symbol_triples.append(8644) |
| symbol_triples.append(4) |
| symbol_triples.append(run_start_array_index) |
| |
| static_u16_table("JIS0208_SYMBOLS", symbol_index) |
| static_u16_table("JIS0208_SYMBOL_TRIPLES", symbol_triples) |
| |
| # Write down the magic numbers needed when preferring the earlier case |
| data_file.write('''const IBM_SYMBOL_START: usize = %d;''' % (run_start_array_index + 1)) |
| data_file.write('''const IBM_SYMBOL_END: usize = %d;''' % (run_start_array_index + 4)) |
| data_file.write('''const IBM_SYMBOL_POINTER_START: usize = %d;''' % 8645) |
| |
| # JIS 0208 ranges (excluding kana) |
| range_triples = [] |
| pointers_to_scan = [ |
| (188, 281), |
| (470, 657), |
| (1128, 1159), |
| (8634, 8644), |
| (10716, 10736), |
| ] |
| in_run = False |
| run_start_pointer = 0 |
| run_start_code_point = 0 |
| previous_code_point = 0 |
| for (start, end) in pointers_to_scan: |
| for i in range(start, end): |
| code_point = index[i] |
| if in_run: |
| if code_point: |
| if previous_code_point + 1 != code_point: |
| range_triples.append(run_start_pointer) |
| range_triples.append(i - run_start_pointer) |
| range_triples.append(run_start_code_point) |
| run_start_pointer = i |
| run_start_code_point = code_point |
| previous_code_point = code_point |
| else: |
| range_triples.append(run_start_pointer) |
| range_triples.append(i - run_start_pointer) |
| range_triples.append(run_start_code_point) |
| run_start_pointer = 0 |
| run_start_code_point = 0 |
| previous_code_point = 0 |
| in_run = False |
| else: |
| if code_point: |
| in_run = True |
| run_start_pointer = i |
| run_start_code_point = code_point |
| previous_code_point = code_point |
| if in_run: |
| range_triples.append(run_start_pointer) |
| range_triples.append(end - run_start_pointer) |
| range_triples.append(run_start_code_point) |
| run_start_pointer = 0 |
| run_start_code_point = 0 |
| previous_code_point = 0 |
| in_run = False |
| if in_run: |
| raise Error() |
| |
| static_u16_table("JIS0208_RANGE_TRIPLES", range_triples) |
| |
| # Encoder table for Level 1 Kanji |
| # Note: If we were OK with 30 KB more footprint, we |
| # could use a directly-indexable table instead... |
| level1_kanji_index = index[1410:4375] |
| level1_kanji_pairs = [] |
| for i in xrange(len(level1_kanji_index)): |
| pointer = 1410 + i |
| (lead, trail) = divmod(pointer, 188) |
| lead += 0x81 if lead < 0x1F else 0xC1 |
| trail += 0x40 if trail < 0x3F else 0x41 |
| level1_kanji_pairs.append((level1_kanji_index[i], (lead, trail))) |
| level1_kanji_pairs.sort(key=lambda x: x[0]) |
| |
| static_u16_table_from_indexable("JIS0208_LEVEL1_KANJI_CODE_POINTS", level1_kanji_pairs, 0, "kanji-encode") |
| static_u8_pair_table_from_indexable("JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES", level1_kanji_pairs, 1, "kanji-encode") |
| |
| # Fast encoder table for Kanji |
| kanji_bytes = [None] * (0x9FA1 - 0x4E00) |
| for pointer in xrange(len(index)): |
| code_point = index[pointer] |
| if code_point and code_point >= 0x4E00 and code_point <= 0x9FA0: |
| (lead, trail) = divmod(pointer, 188) |
| lead += 0x81 if lead < 0x1F else 0xC1 |
| trail += 0x40 if trail < 0x3F else 0x41 |
| # unset the high bit of lead if IBM Kanji |
| if pointer >= 8272: |
| lead = lead & 0x7F |
| kanji_bytes[code_point - 0x4E00] = (lead, trail) |
| |
| static_u8_pair_table("JIS0208_KANJI_BYTES", kanji_bytes, "fast-kanji-encode") |
| |
| # ISO-2022-JP half-width katakana |
| |
| # index is still jis0208 |
| half_width_index = indexes["iso-2022-jp-katakana"] |
| |
| data_file.write('''pub static ISO_2022_JP_HALF_WIDTH_TRAIL: [u8; %d] = [ |
| ''' % len(half_width_index)) |
| |
| for i in xrange(len(half_width_index)): |
| code_point = half_width_index[i] |
| pointer = index.index(code_point) |
| trail = pointer % 94 + 0x21 |
| data_file.write('0x%02X,\n' % trail) |
| |
| data_file.write(''']; |
| |
| ''') |
| |
| # EUC-KR |
| |
| index = indexes["euc-kr"] |
| |
| # Unicode 1.1 Hangul above the old KS X 1001 block |
| # Compressed form takes 35% of uncompressed form |
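# Only run starts are stored: a (pointer, code point) pair is recorded
# wherever the code point sequence breaks, and a lookup adds the pointer
# delta to the recorded code point.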
| pointers = [] |
| offsets = [] |
| previous_code_point = 0 |
| for row in xrange(0x20): |
| for column in xrange(190): |
| i = column + (row * 190) |
| # Skip the gaps |
| if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40): |
| continue |
| code_point = index[i] |
| if previous_code_point > code_point: |
| raise Error() |
| if code_point - previous_code_point != 1: |
| adjustment = 0 |
| if column >= 0x40: |
| adjustment = 12 |
| elif column >= 0x20: |
| adjustment = 6 |
| pointers.append(column - adjustment + (row * (190 - 12))) |
| offsets.append(code_point) |
| previous_code_point = code_point |
| |
| static_u16_table("CP949_TOP_HANGUL_POINTERS", pointers) |
| static_u16_table("CP949_TOP_HANGUL_OFFSETS", offsets) |
| |
| # Unicode 1.1 Hangul to the left of the old KS X 1001 block |
| pointers = [] |
| offsets = [] |
| previous_code_point = 0 |
| for row in xrange(0x46 - 0x20): |
| for column in xrange(190 - 94): |
| i = 6080 + column + (row * 190) |
| # Skip the gaps |
| if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40): |
| continue |
| if i > 13127: |
| # Exclude unassigned on partial last row |
| break |
| code_point = index[i] |
| if previous_code_point > code_point: |
| raise Error() |
| if code_point - previous_code_point != 1: |
| adjustment = 0 |
| if column >= 0x40: |
| adjustment = 12 |
| elif column >= 0x20: |
| adjustment = 6 |
| pointers.append(column - adjustment + (row * (190 - 94 - 12))) |
| offsets.append(code_point) |
| previous_code_point = code_point |
| |
| static_u16_table("CP949_LEFT_HANGUL_POINTERS", pointers) |
| static_u16_table("CP949_LEFT_HANGUL_OFFSETS", offsets) |
| |
| # KS X 1001 Hangul |
| hangul_index = [] |
| previous_code_point = 0 |
| for row in xrange(0x48 - 0x2F): |
| for column in xrange(94): |
| code_point = index[9026 + column + (row * 190)] |
| if previous_code_point >= code_point: |
| raise Error() |
| hangul_index.append(code_point) |
| previous_code_point = code_point |
| |
| static_u16_table("KSX1001_HANGUL", hangul_index) |
| |
| # KS X 1001 Hanja |
| hanja_index = [] |
| for row in xrange(0x7D - 0x49): |
| for column in xrange(94): |
| hanja_index.append(index[13966 + column + (row * 190)]) |
| |
| static_u16_table("KSX1001_HANJA", hanja_index) |
| |
| # KS X 1001 symbols |
| symbol_index = [] |
| for i in range(6176, 6270): |
| symbol_index.append(index[i]) |
| for i in range(6366, 6437): |
| symbol_index.append(index[i]) |
| |
| static_u16_table("KSX1001_SYMBOLS", symbol_index) |
| |
| # KS X 1001 Uppercase Latin |
| subindex = [] |
| for i in range(7506, 7521): |
| subindex.append(null_to_zero(index[i])) |
| |
| static_u16_table("KSX1001_UPPERCASE", subindex) |
| |
| # KS X 1001 Lowercase Latin |
| subindex = [] |
| for i in range(7696, 7712): |
| subindex.append(index[i]) |
| |
| static_u16_table("KSX1001_LOWERCASE", subindex) |
| |
| # KS X 1001 Box drawing |
| subindex = [] |
| for i in range(7126, 7194): |
| subindex.append(index[i]) |
| |
| static_u16_table("KSX1001_BOX", subindex) |
| |
| # KS X 1001 other |
| pointers = [] |
| offsets = [] |
| previous_code_point = 0 |
| for row in xrange(10): |
| for column in xrange(94): |
| i = 6556 + column + (row * 190) |
| code_point = index[i] |
| # Exclude ranges that were processed as lookup tables |
| # or that contain unmapped cells by filling them with |
| # ASCII. Upon encode, ASCII code points will |
| # never appear as the search key. |
| if (i >= 6946 and i <= 6950): |
| code_point = i - 6946 |
| elif (i >= 6961 and i <= 6967): |
| code_point = i - 6961 |
| elif (i >= 6992 and i <= 6999): |
| code_point = i - 6992 |
| elif (i >= 7024 and i <= 7029): |
| code_point = i - 7024 |
| elif (i >= 7126 and i <= 7219): |
| code_point = i - 7126 |
| elif (i >= 7395 and i <= 7409): |
| code_point = i - 7395 |
| elif (i >= 7506 and i <= 7521): |
| code_point = i - 7506 |
| elif (i >= 7696 and i <= 7711): |
| code_point = i - 7696 |
| elif (i >= 7969 and i <= 7979): |
| code_point = i - 7969 |
| elif (i >= 8162 and i <= 8169): |
| code_point = i - 8162 |
| elif (i >= 8299 and i <= 8313): |
| code_point = i - 8299 |
| elif (i >= 8347 and i <= 8359): |
| code_point = i - 8347 |
| if code_point - previous_code_point != 1: |
| pointers.append(column + (row * 94)) |
| offsets.append(code_point) |
| previous_code_point = code_point |
| |
| static_u16_table("KSX1001_OTHER_POINTERS", pointers) |
| # Omit the last offset, because the end of the last line |
| # is unmapped, so we don't want to look at it. |
| static_u16_table("KSX1001_OTHER_UNSORTED_OFFSETS", offsets[:-1]) |
| |
| # Fast Hangul and Hanja encode |
| hangul_bytes = [None] * (0xD7A4 - 0xAC00) |
| hanja_unified_bytes = [None] * (0x9F9D - 0x4E00) |
| hanja_compatibility_bytes = [None] * (0xFA0C - 0xF900) |
| for row in xrange(0x7D): |
| for column in xrange(190): |
| pointer = column + (row * 190) |
| code_point = index[pointer] |
| if code_point: |
| lead = 0x81 + row |
| trail = 0x41 + column |
| if code_point >= 0xAC00 and code_point < 0xD7A4: |
| hangul_bytes[code_point - 0xAC00] = (lead, trail) |
| elif code_point >= 0x4E00 and code_point < 0x9F9D: |
| hanja_unified_bytes[code_point - 0x4E00] = (lead, trail) |
| elif code_point >= 0xF900 and code_point < 0xFA0C: |
| hanja_compatibility_bytes[code_point - 0xF900] = (lead, trail) |
| |
| static_u8_pair_table("CP949_HANGUL_BYTES", hangul_bytes, "fast-hangul-encode") |
| static_u8_pair_table("KSX1001_UNIFIED_HANJA_BYTES", hanja_unified_bytes, "fast-hanja-encode") |
| static_u8_pair_table("KSX1001_COMPATIBILITY_HANJA_BYTES", hanja_compatibility_bytes, "fast-hanja-encode") |
| |
| # JIS 0212 |
| |
| index = indexes["jis0212"] |
| |
| # JIS 0212 Kanji |
| static_u16_table("JIS0212_KANJI", index[1410:7211]) |
| |
| # JIS 0212 accented (all non-Kanji, non-range items) |
| symbol_index = [] |
| symbol_triples = [] |
| pointers_to_scan = [ |
| (0, 596), |
| (608, 644), |
| (656, 1409), |
| ] |
| in_run = False |
| run_start_pointer = 0 |
| run_start_array_index = 0 |
| for (start, end) in pointers_to_scan: |
| for i in range(start, end): |
| code_point = index[i] |
| if in_run: |
| if code_point: |
| symbol_index.append(code_point) |
| elif index[i + 1]: |
| symbol_index.append(0) |
| else: |
| symbol_triples.append(run_start_pointer) |
| symbol_triples.append(i - run_start_pointer) |
| symbol_triples.append(run_start_array_index) |
| in_run = False |
| else: |
| if code_point: |
| in_run = True |
| run_start_pointer = i |
| run_start_array_index = len(symbol_index) |
| symbol_index.append(code_point) |
| if in_run: |
| symbol_triples.append(run_start_pointer) |
| symbol_triples.append(end - run_start_pointer) |
| symbol_triples.append(run_start_array_index) |
| in_run = False |
| if in_run: |
| raise Error() |
| |
| static_u16_table("JIS0212_ACCENTED", symbol_index) |
| static_u16_table("JIS0212_ACCENTED_TRIPLES", symbol_triples) |
| |
| # gb18030 |
| |
| index = indexes["gb18030"] |
| |
| # Unicode 1.1 ideographs above the old GB2312 block |
| # Compressed form takes 63% of uncompressed form |
| pointers = [] |
| offsets = [] |
| previous_code_point = 0 |
| for i in xrange(6080): |
| code_point = index[i] |
| if previous_code_point > code_point: |
| raise Error() |
| if code_point - previous_code_point != 1: |
| pointers.append(i) |
| offsets.append(code_point) |
| previous_code_point = code_point |
| |
| static_u16_table("GBK_TOP_IDEOGRAPH_POINTERS", pointers) |
| static_u16_table("GBK_TOP_IDEOGRAPH_OFFSETS", offsets) |
| |
| # Unicode 1.1 ideographs to the left of the old GB2312 block |
| # Compressed form takes 40% of uncompressed form |
| pointers = [] |
| offsets = [] |
| previous_code_point = 0 |
| for row in xrange(0x7D - 0x29): |
| for column in xrange(190 - 94): |
| i = 7790 + column + (row * 190) |
| if i > 23650: |
| # Exclude compatibility ideographs at the end |
| break |
| code_point = index[i] |
| if previous_code_point > code_point: |
| raise Error() |
| if code_point - previous_code_point != 1: |
| pointers.append(column + (row * (190 - 94))) |
| offsets.append(code_point) |
| previous_code_point = code_point |
| |
| static_u16_table("GBK_LEFT_IDEOGRAPH_POINTERS", pointers) |
| static_u16_table("GBK_LEFT_IDEOGRAPH_OFFSETS", offsets) |
| |
| # GBK other (excl. Ext A, Compat & PUA at the bottom) |
| pointers = [] |
| offsets = [] |
| previous_code_point = 0 |
| for row in xrange(0x29 - 0x20): |
| for column in xrange(190 - 94): |
| i = 6080 + column + (row * 190) |
| code_point = index[i] |
| if code_point - previous_code_point != 1: |
| pointers.append(column + (row * (190 - 94))) |
| offsets.append(code_point) |
| previous_code_point = code_point |
| |
| pointers.append((190 - 94) * (0x29 - 0x20)) |
| static_u16_table("GBK_OTHER_POINTERS", pointers) |
| static_u16_table("GBK_OTHER_UNSORTED_OFFSETS", offsets) |
| |
# GBK bottom: Compatibility ideographs, Ext A and PUA
| bottom_index = [] |
| # 5 compat following Unified Ideographs |
| for i in range(23651, 23656): |
| bottom_index.append(index[i]) |
| # Last row |
| for i in range(23750, 23846): |
| bottom_index.append(index[i]) |
| |
| static_u16_table("GBK_BOTTOM", bottom_index) |
| |
| # GB2312 Hanzi |
| # (and the 5 PUA code points in between Level 1 and Level 2) |
| hanzi_index = [] |
| for row in xrange(0x77 - 0x2F): |
| for column in xrange(94): |
| hanzi_index.append(index[9026 + column + (row * 190)]) |
| |
| static_u16_table("GB2312_HANZI", hanzi_index) |
| |
| # GB2312 symbols |
| symbol_index = [] |
| for i in xrange(94): |
| symbol_index.append(index[6176 + i]) |
| |
| static_u16_table("GB2312_SYMBOLS", symbol_index) |
| |
| # GB2312 symbols on Greek row (incl. PUA) |
| symbol_index = [] |
| for i in xrange(22): |
| symbol_index.append(index[7189 + i]) |
| |
| static_u16_table("GB2312_SYMBOLS_AFTER_GREEK", symbol_index) |
| |
| # GB2312 Pinyin |
| pinyin_index = [] |
| for i in xrange(32): |
| pinyin_index.append(index[7506 + i]) |
| |
| static_u16_table("GB2312_PINYIN", pinyin_index) |
| |
| # GB2312 other (excl. bottom PUA) |
| pointers = [] |
| offsets = [] |
| previous_code_point = 0 |
| for row in xrange(14): |
| for column in xrange(94): |
| i = 6366 + column + (row * 190) |
| code_point = index[i] |
| # Exclude the two ranges that were processed as |
| # lookup tables above by filling them with |
| # ASCII. Upon encode, ASCII code points will |
| # never appear as the search key. |
| if (i >= 7189 and i < 7189 + 22): |
| code_point = i - 7189 |
| elif (i >= 7506 and i < 7506 + 32): |
| code_point = i - 7506 |
| if code_point - previous_code_point != 1: |
| pointers.append(column + (row * 94)) |
| offsets.append(code_point) |
| previous_code_point = code_point |
| |
| pointers.append(14 * 94) |
| static_u16_table("GB2312_OTHER_POINTERS", pointers) |
| static_u16_table("GB2312_OTHER_UNSORTED_OFFSETS", offsets) |
| |
| # Non-gbk code points |
| pointers = [] |
| offsets = [] |
| for pair in indexes["gb18030-ranges"]: |
| if pair[1] == 0x10000: |
| break # the last entry doesn't fit in u16 |
| pointers.append(pair[0]) |
| offsets.append(pair[1]) |
| |
| static_u16_table("GB18030_RANGE_POINTERS", pointers) |
| static_u16_table("GB18030_RANGE_OFFSETS", offsets) |
| |
| # Encoder table for Level 1 Hanzi |
| # The units here really fit into 12 bits, but since we're |
| # looking for speed here, let's use 16 bits per unit. |
| # Once we use 16 bits per unit, we might as well precompute |
| # the output bytes. |
| level1_hanzi_index = hanzi_index[:(94 * (0xD8 - 0xB0) - 5)] |
| level1_hanzi_pairs = [] |
| for i in xrange(len(level1_hanzi_index)): |
| hanzi_lead = (i / 94) + 0xB0 |
| hanzi_trail = (i % 94) + 0xA1 |
| level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail))) |
| level1_hanzi_pairs.sort(key=lambda x: x[0]) |
| |
| static_u16_table_from_indexable("GB2312_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "gb-hanzi-encode") |
| static_u8_pair_table_from_indexable("GB2312_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "gb-hanzi-encode") |
| |
| # Fast Hanzi encoder table |
| hanzi_bytes = [None] * (0x9FA7 - 0x4E00) |
| for row in xrange(126): |
| for column in xrange(190): |
| pointer = column + (row * 190) |
| code_point = index[pointer] |
| if code_point and code_point >= 0x4E00 and code_point <= 0x9FA6: |
| hanzi_lead = 0x81 + row |
| hanzi_trail = column + (0x40 if column < 0x3F else 0x41) |
| hanzi_bytes[code_point - 0x4E00] = (hanzi_lead, hanzi_trail) |
| |
| static_u8_pair_table("GBK_HANZI_BYTES", hanzi_bytes, "fast-gb-hanzi-encode") |
| |
| data_file.write(data_rs_end) |
| |
| data_file.close() |
| |
| # Variant |
| |
| variant_file = open("src/variant.rs", "w") |
| variant_file.write('''// Copyright Mozilla Foundation. See the COPYRIGHT |
| // file at the top-level directory of this distribution. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| // THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. |
| // Instead, please regenerate using generate-encoding-data.py |
| |
| //! This module provides enums that wrap the various decoders and encoders. |
| //! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the |
| //! dispatch explicitly for a finite set of specialized decoders and encoders. |
| //! Unfortunately, this means the compiler doesn't generate the dispatch code |
| //! and it has to be written here instead. |
| //! |
| //! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack |
| //! allocation in Rust code, including the convenience methods on `Encoding`. |
| |
| ''') |
| |
| encoding_variants = [u"single-byte",] |
| for encoding in multi_byte: |
| if encoding["name"] in [u"UTF-16LE", u"UTF-16BE"]: |
| continue |
| else: |
| encoding_variants.append(encoding["name"]) |
| encoding_variants.append(u"UTF-16") |
| |
| decoder_variants = [] |
| for variant in encoding_variants: |
| if variant == u"GBK": |
| continue |
| decoder_variants.append(variant) |
| |
| encoder_variants = [] |
| for variant in encoding_variants: |
| if variant in [u"replacement", u"GBK", u"UTF-16"]: |
| continue |
| encoder_variants.append(variant) |
| |
| for variant in decoder_variants: |
| variant_file.write("use %s::*;\n" % to_snake_name(variant)) |
| |
| variant_file.write('''use super::*; |
| |
| pub enum VariantDecoder { |
| ''') |
| |
| for variant in decoder_variants: |
| variant_file.write(" %s(%sDecoder),\n" % (to_camel_name(variant), to_camel_name(variant))) |
| |
| variant_file.write('''} |
| |
| impl VariantDecoder { |
| ''') |
| |
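# Emits a method on the generated Variant* enum that dispatches to the
# wrapped concrete decoder or encoder via match; variants listed in
# `excludes` get a no-op arm.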
| def write_variant_method(name, mut, arg_list, ret, variants, excludes, kind): |
| variant_file.write('''pub fn %s(&''' % name) |
| if mut: |
| variant_file.write('''mut ''') |
| variant_file.write('''self''') |
| for arg in arg_list: |
| variant_file.write(''', %s: %s''' % (arg[0], arg[1])) |
| variant_file.write(''')''') |
| if ret: |
| variant_file.write(''' -> %s''' % ret) |
| variant_file.write(''' {\nmatch *self {\n''') |
| for variant in variants: |
| variant_file.write('''Variant%s::%s(ref ''' % (kind, to_camel_name(variant))) |
| if mut: |
| variant_file.write('''mut ''') |
| if variant in excludes: |
| variant_file.write('''v) => (),''') |
| continue |
| variant_file.write('''v) => v.%s(''' % name) |
| first = True |
| for arg in arg_list: |
| if not first: |
| variant_file.write(''', ''') |
| first = False |
| variant_file.write(arg[0]) |
| variant_file.write('''),\n''') |
| variant_file.write('''}\n}\n\n''') |
| |
| write_variant_method("max_utf16_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder") |
| |
| write_variant_method("max_utf8_buffer_length_without_replacement", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder") |
| |
| write_variant_method("max_utf8_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder") |
| |
| write_variant_method("decode_to_utf16_raw", True, [("src", "&[u8]"), |
| ("dst", "&mut [u16]"), |
| ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder") |
| |
| write_variant_method("decode_to_utf8_raw", True, [("src", "&[u8]"), |
| ("dst", "&mut [u8]"), |
| ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder") |
| |
| variant_file.write(''' |
| |
| pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> Option<usize> { |
| match *self { |
| VariantDecoder::SingleByte(ref v) => { |
| return Some(v.latin1_byte_compatible_up_to(buffer)); |
| } |
| VariantDecoder::Utf8(ref v) => { |
| if !v.in_neutral_state() { |
| return None; |
| } |
| } |
| VariantDecoder::Gb18030(ref v) => { |
| if !v.in_neutral_state() { |
| return None; |
| } |
| } |
| VariantDecoder::Big5(ref v) => { |
| if !v.in_neutral_state() { |
| return None; |
| } |
| } |
| VariantDecoder::EucJp(ref v) => { |
| if !v.in_neutral_state() { |
| return None; |
| } |
| } |
| VariantDecoder::Iso2022Jp(ref v) => { |
| if v.in_neutral_state() { |
| return Some(Encoding::iso_2022_jp_ascii_valid_up_to(buffer)); |
| } |
| return None; |
| } |
| VariantDecoder::ShiftJis(ref v) => { |
| if !v.in_neutral_state() { |
| return None; |
| } |
| } |
| VariantDecoder::EucKr(ref v) => { |
| if !v.in_neutral_state() { |
| return None; |
| } |
| } |
| VariantDecoder::UserDefined(_) => {} |
| VariantDecoder::Replacement(_) | VariantDecoder::Utf16(_) => { |
| return None; |
| } |
| }; |
| Some(Encoding::ascii_valid_up_to(buffer)) |
| } |
| } |
| |
| pub enum VariantEncoder { |
| ''') |
| |
| for variant in encoder_variants: |
| variant_file.write(" %s(%sEncoder),\n" % (to_camel_name(variant), to_camel_name(variant))) |
| |
| variant_file.write('''} |
| |
| impl VariantEncoder { |
| pub fn has_pending_state(&self) -> bool { |
| match *self { |
| VariantEncoder::Iso2022Jp(ref v) => { |
| v.has_pending_state() |
| } |
| _ => false, |
| } |
| } |
| ''') |
| |
| write_variant_method("max_buffer_length_from_utf16_without_replacement", False, [("u16_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder") |
| |
| write_variant_method("max_buffer_length_from_utf8_without_replacement", False, [("byte_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder") |
| |
| write_variant_method("encode_from_utf16_raw", True, [("src", "&[u16]"), |
| ("dst", "&mut [u8]"), |
| ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder") |
| |
| write_variant_method("encode_from_utf8_raw", True, [("src", "&str"), |
| ("dst", "&mut [u8]"), |
| ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder") |
| |
| |
| variant_file.write('''} |
| |
| pub enum VariantEncoding { |
| SingleByte(&'static [u16; 128], u16, u8, u8),''') |
| |
| for encoding in multi_byte: |
| variant_file.write("%s,\n" % to_camel_name(encoding["name"])) |
| |
| variant_file.write('''} |
| |
| impl VariantEncoding { |
| pub fn new_variant_decoder(&self) -> VariantDecoder { |
| match *self { |
| VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table), |
| VariantEncoding::Utf8 => Utf8Decoder::new(), |
| VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(), |
| VariantEncoding::Big5 => Big5Decoder::new(), |
| VariantEncoding::EucJp => EucJpDecoder::new(), |
| VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(), |
| VariantEncoding::ShiftJis => ShiftJisDecoder::new(), |
| VariantEncoding::EucKr => EucKrDecoder::new(), |
| VariantEncoding::Replacement => ReplacementDecoder::new(), |
| VariantEncoding::UserDefined => UserDefinedDecoder::new(), |
| VariantEncoding::Utf16Be => Utf16Decoder::new(true), |
| VariantEncoding::Utf16Le => Utf16Decoder::new(false), |
| } |
| } |
| |
| pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder { |
| match *self { |
| VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length), |
| VariantEncoding::Utf8 => Utf8Encoder::new(encoding), |
| VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false), |
| VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true), |
| VariantEncoding::Big5 => Big5Encoder::new(encoding), |
| VariantEncoding::EucJp => EucJpEncoder::new(encoding), |
| VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding), |
| VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding), |
| VariantEncoding::EucKr => EucKrEncoder::new(encoding), |
| VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding), |
| VariantEncoding::Utf16Be | VariantEncoding::Replacement | |
| VariantEncoding::Utf16Le => unreachable!(), |
| } |
| } |
| |
| pub fn is_single_byte(&self) -> bool { |
| match *self { |
| VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true, |
| _ => false, |
| } |
| } |
| } |
| ''') |
| |
| variant_file.close() |
| |
| (ffi_rs_begin, ffi_rs_end) = read_non_generated("../encoding_c/src/lib.rs") |
| |
| ffi_file = open("../encoding_c/src/lib.rs", "w") |
| |
| ffi_file.write(ffi_rs_begin) |
| ffi_file.write(""" |
| // Instead, please regenerate using generate-encoding-data.py |
| |
| /// The minimum length of buffers that may be passed to `encoding_name()`. |
| pub const ENCODING_NAME_MAX_LENGTH: usize = %d; // %s |
| |
| """ % (longest_name_length, longest_name)) |
| |
| for name in preferred: |
| ffi_file.write('''/// The %s encoding. |
| #[no_mangle] |
| pub static %s_ENCODING: ConstEncoding = ConstEncoding(&%s_INIT); |
| |
| ''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name))) |
| |
| ffi_file.write(ffi_rs_end) |
| ffi_file.close() |
| |
| (single_byte_rs_begin, single_byte_rs_end) = read_non_generated("src/single_byte.rs") |
| |
| single_byte_file = open("src/single_byte.rs", "w") |
| |
| single_byte_file.write(single_byte_rs_begin) |
| single_byte_file.write(""" |
| // Instead, please regenerate using generate-encoding-data.py |
| |
| #[test] |
| fn test_single_byte_decode() {""") |
| |
| idx = 0 # for Miri, return after 2nd test |
| for name in preferred: |
| if name == u"ISO-8859-8-I": |
        continue
| if is_single_byte(name): |
| single_byte_file.write(""" |
| decode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name))) |
| idx += 1 |
| if idx == 2: |
| single_byte_file.write(""" |
| if cfg!(miri) { |
| // Miri is too slow |
| return; |
| }""") |
| |
| single_byte_file.write(""" |
| } |
| |
| #[test] |
| fn test_single_byte_encode() {""") |
| |
| |
| idx = 0 # for Miri, return after 2nd test |
| for name in preferred: |
| if name == u"ISO-8859-8-I": |
        continue
| if is_single_byte(name): |
| single_byte_file.write(""" |
| encode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name))) |
| idx += 1 |
| if idx == 2: |
| single_byte_file.write(""" |
| if cfg!(miri) { |
| // Miri is too slow |
| return; |
| }""") |
| |
| |
| single_byte_file.write(""" |
| } |
| """) |
| |
| single_byte_file.write(single_byte_rs_end) |
| single_byte_file.close() |
| |
| static_file = open("../encoding_c/include/encoding_rs_statics.h", "w") |
| |
| static_file.write("""// Copyright Mozilla Foundation. See the COPYRIGHT |
| // file at the top-level directory of this distribution. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| // THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. |
| // Instead, please regenerate using generate-encoding-data.py |
| |
| // This file is not meant to be included directly. Instead, encoding_rs.h |
| // includes this file. |
| |
| #ifndef encoding_rs_statics_h_ |
| #define encoding_rs_statics_h_ |
| |
| #ifndef ENCODING_RS_ENCODING |
| #define ENCODING_RS_ENCODING Encoding |
| #ifndef __cplusplus |
| typedef struct Encoding_ Encoding; |
| #endif |
| #endif |
| |
| #ifndef ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR |
| #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ENCODING_RS_ENCODING* |
| #endif |
| |
| #ifndef ENCODING_RS_ENCODER |
| #define ENCODING_RS_ENCODER Encoder |
| #ifndef __cplusplus |
| typedef struct Encoder_ Encoder; |
| #endif |
| #endif |
| |
| #ifndef ENCODING_RS_DECODER |
| #define ENCODING_RS_DECODER Decoder |
| #ifndef __cplusplus |
| typedef struct Decoder_ Decoder; |
| #endif |
| #endif |
| |
| #define INPUT_EMPTY 0 |
| |
| #define OUTPUT_FULL 0xFFFFFFFF |
| |
| // %s |
| #define ENCODING_NAME_MAX_LENGTH %d |
| |
| """ % (longest_name, longest_name_length)) |
| |
| for name in preferred: |
| static_file.write('''/// The %s encoding. |
| extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const %s_ENCODING; |
| |
| ''' % (to_dom_name(name), to_constant_name(name))) |
| |
| static_file.write("""#endif // encoding_rs_statics_h_ |
| """) |
| static_file.close() |
| |
| (utf_8_rs_begin, utf_8_rs_end) = read_non_generated("src/utf_8.rs") |
| |
| utf_8_file = open("src/utf_8.rs", "w") |
| |
| utf_8_file.write(utf_8_rs_begin) |
| utf_8_file.write(""" |
| // Instead, please regenerate using generate-encoding-data.py |
| |
| pub static UTF8_DATA: Utf8Data = Utf8Data { |
| table: [ |
| """) |
| |
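# First 256 entries, indexed by the second byte of a sequence: each entry
# has a bit set for every lead class that the byte is NOT a valid
# continuation of.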
| for i in range(256): |
| combined = (1 << 2) # invalid lead |
| if i < 0x80 or i > 0xBF: |
| combined |= (1 << 3) # normal trail |
| if i < 0xA0 or i > 0xBF: |
| combined |= (1 << 4) # three-byte special lower bound |
| if i < 0x80 or i > 0x9F: |
| combined |= (1 << 5) # three-byte special upper bound |
| if i < 0x90 or i > 0xBF: |
| combined |= (1 << 6) # four-byte special lower bound |
| if i < 0x80 or i > 0x8F: |
| combined |= (1 << 7) # four-byte special upper bound |
| utf_8_file.write("%d," % combined) |
| |
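# Next 128 entries, indexed by lead byte minus 0x80: each valid lead
# carries its single class bit, so ANDing the two table halves yields
# non-zero exactly for invalid lead/trail pairs.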
| for i in range(128, 256): |
| lane = (1 << 2) # invalid lead |
| if i >= 0xC2 and i <= 0xDF: |
| lane = (1 << 3) # normal trail |
| elif i == 0xE0: |
| lane = (1 << 4) # three-byte special lower bound |
| elif i >= 0xE1 and i <= 0xEC: |
| lane = (1 << 3) # normal trail |
| elif i == 0xED: |
| lane = (1 << 5) # three-byte special upper bound |
| elif i >= 0xEE and i <= 0xEF: |
| lane = (1 << 3) # normal trail |
| elif i == 0xF0: |
| lane = (1 << 6) # four-byte special lower bound |
| elif i >= 0xF1 and i <= 0xF3: |
| lane = (1 << 3) # normal trail |
| elif i == 0xF4: |
| lane = (1 << 7) # four-byte special upper bound |
| utf_8_file.write("%d," % lane) |
| |
| utf_8_file.write(""" |
| ], |
| }; |
| |
| """) |
| |
| utf_8_file.write(utf_8_rs_end) |
| utf_8_file.close() |
| |
| # Unit tests |
| |
| TEST_HEADER = '''Any copyright to the test code below this comment is dedicated to the |
| Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
| |
| This is a generated file. Please do not edit. |
| Instead, please regenerate using generate-encoding-data.py |
| ''' |
| |
| index = indexes["jis0208"] |
| |
| jis0208_in_file = open("src/test_data/jis0208_in.txt", "w") |
| jis0208_in_file.write(TEST_HEADER) |
| for pointer in range(0, 94 * 94): |
| (lead, trail) = divmod(pointer, 94) |
| lead += 0xA1 |
| trail += 0xA1 |
| jis0208_in_file.write("%s%s\n" % (chr(lead), chr(trail))) |
| jis0208_in_file.close() |
| |
| jis0208_in_ref_file = open("src/test_data/jis0208_in_ref.txt", "w") |
| jis0208_in_ref_file.write(TEST_HEADER) |
| for pointer in range(0, 94 * 94): |
| code_point = index[pointer] |
| if code_point: |
| jis0208_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| else: |
| jis0208_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) |
| jis0208_in_ref_file.close() |
| |
| jis0208_out_file = open("src/test_data/jis0208_out.txt", "w") |
| jis0208_out_ref_file = open("src/test_data/jis0208_out_ref.txt", "w") |
| jis0208_out_file.write(TEST_HEADER) |
| jis0208_out_ref_file.write(TEST_HEADER) |
| for pointer in range(0, 94 * 94): |
| code_point = index[pointer] |
| if code_point: |
| revised_pointer = pointer |
| if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220): |
| revised_pointer = index.index(code_point) |
| (lead, trail) = divmod(revised_pointer, 94) |
| lead += 0xA1 |
| trail += 0xA1 |
| jis0208_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail))) |
| jis0208_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| jis0208_out_file.close() |
| jis0208_out_ref_file.close() |
| |
| shift_jis_in_file = open("src/test_data/shift_jis_in.txt", "w") |
| shift_jis_in_file.write(TEST_HEADER) |
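# Decompose each pointer into a Shift_JIS byte pair: leads run 0x81..0x9F
# and then 0xE0 upwards (skipping the 0xA0..0xDF half-width katakana
# range), trails run 0x40..0x7E and 0x80 upwards (skipping 0x7F).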
| for pointer in range(0, len(index)): |
| (lead, trail) = divmod(pointer, 188) |
| lead += 0x81 if lead < 0x1F else 0xC1 |
| trail += 0x40 if trail < 0x3F else 0x41 |
| shift_jis_in_file.write("%s%s\n" % (chr(lead), chr(trail))) |
| shift_jis_in_file.close() |
| |
| shift_jis_in_ref_file = open("src/test_data/shift_jis_in_ref.txt", "w") |
| shift_jis_in_ref_file.write(TEST_HEADER) |
| for pointer in range(0, len(index)): |
| code_point = 0xE000 - 8836 + pointer if pointer >= 8836 and pointer <= 10715 else index[pointer] |
| if code_point: |
| shift_jis_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| else: |
| trail = pointer % 188 |
| trail += 0x40 if trail < 0x3F else 0x41 |
| if trail < 0x80: |
| shift_jis_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8")) |
| else: |
| shift_jis_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) |
| shift_jis_in_ref_file.close() |
| |
| shift_jis_out_file = open("src/test_data/shift_jis_out.txt", "w") |
| shift_jis_out_ref_file = open("src/test_data/shift_jis_out_ref.txt", "w") |
| shift_jis_out_file.write(TEST_HEADER) |
| shift_jis_out_ref_file.write(TEST_HEADER) |
| for pointer in range(0, 8272): |
| code_point = index[pointer] |
| if code_point: |
| revised_pointer = pointer |
| if revised_pointer >= 1207 and revised_pointer < 1220: |
| revised_pointer = index.index(code_point) |
| (lead, trail) = divmod(revised_pointer, 188) |
| lead += 0x81 if lead < 0x1F else 0xC1 |
| trail += 0x40 if trail < 0x3F else 0x41 |
| shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail))) |
| shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| for pointer in range(8836, len(index)): |
| code_point = index[pointer] |
| if code_point: |
| revised_pointer = index.index(code_point) |
| if revised_pointer >= 8272 and revised_pointer < 8836: |
| revised_pointer = pointer |
| (lead, trail) = divmod(revised_pointer, 188) |
| lead += 0x81 if lead < 0x1F else 0xC1 |
| trail += 0x40 if trail < 0x3F else 0x41 |
| shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail))) |
| shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| shift_jis_out_file.close() |
| shift_jis_out_ref_file.close() |
| |
| iso_2022_jp_in_file = open("src/test_data/iso_2022_jp_in.txt", "w") |
| iso_2022_jp_in_file.write(TEST_HEADER) |
| for pointer in range(0, 94 * 94): |
| (lead, trail) = divmod(pointer, 94) |
| lead += 0x21 |
| trail += 0x21 |
| iso_2022_jp_in_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail))) |
| iso_2022_jp_in_file.close() |
| |
| iso_2022_jp_in_ref_file = open("src/test_data/iso_2022_jp_in_ref.txt", "w") |
| iso_2022_jp_in_ref_file.write(TEST_HEADER) |
| for pointer in range(0, 94 * 94): |
| code_point = index[pointer] |
| if code_point: |
| iso_2022_jp_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| else: |
| iso_2022_jp_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) |
| iso_2022_jp_in_ref_file.close() |
| |
| iso_2022_jp_out_file = open("src/test_data/iso_2022_jp_out.txt", "w") |
| iso_2022_jp_out_ref_file = open("src/test_data/iso_2022_jp_out_ref.txt", "w") |
| iso_2022_jp_out_file.write(TEST_HEADER) |
| iso_2022_jp_out_ref_file.write(TEST_HEADER) |
| for pointer in range(0, 94 * 94): |
| code_point = index[pointer] |
| if code_point: |
| revised_pointer = pointer |
| if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220): |
| revised_pointer = index.index(code_point) |
| (lead, trail) = divmod(revised_pointer, 94) |
| lead += 0x21 |
| trail += 0x21 |
| iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail))) |
| iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| for i in xrange(len(half_width_index)): |
| code_point = i + 0xFF61 |
| normalized_code_point = half_width_index[i] |
| pointer = index.index(normalized_code_point) |
| (lead, trail) = divmod(pointer, 94) |
| lead += 0x21 |
| trail += 0x21 |
| iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail))) |
| iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| iso_2022_jp_out_file.close() |
| iso_2022_jp_out_ref_file.close() |
| |
| index = indexes["euc-kr"] |
| |
| euc_kr_in_file = open("src/test_data/euc_kr_in.txt", "w") |
| euc_kr_in_file.write(TEST_HEADER) |
| for pointer in range(0, len(index)): |
| (lead, trail) = divmod(pointer, 190) |
| lead += 0x81 |
| trail += 0x41 |
| euc_kr_in_file.write("%s%s\n" % (chr(lead), chr(trail))) |
| euc_kr_in_file.close() |
| |
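# Expected decoder output. For an unmapped pointer whose trail byte is
# in the ASCII range (below 0x80), the decoder does not consume the
# trail byte as part of the two-byte sequence, so the reference shows
# U+FFFD followed by that byte decoded as ASCII.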
| euc_kr_in_ref_file = open("src/test_data/euc_kr_in_ref.txt", "w") |
| euc_kr_in_ref_file.write(TEST_HEADER) |
| for pointer in range(0, len(index)): |
| code_point = index[pointer] |
| if code_point: |
| euc_kr_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| else: |
| trail = pointer % 190 |
| trail += 0x41 |
| if trail < 0x80: |
| euc_kr_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8")) |
| else: |
| euc_kr_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) |
| euc_kr_in_ref_file.close() |
| |
| euc_kr_out_file = open("src/test_data/euc_kr_out.txt", "w") |
| euc_kr_out_ref_file = open("src/test_data/euc_kr_out_ref.txt", "w") |
| euc_kr_out_file.write(TEST_HEADER) |
| euc_kr_out_ref_file.write(TEST_HEADER) |
| for pointer in range(0, len(index)): |
| code_point = index[pointer] |
| if code_point: |
| (lead, trail) = divmod(pointer, 190) |
| lead += 0x81 |
| trail += 0x41 |
| euc_kr_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail))) |
| euc_kr_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| euc_kr_out_file.close() |
| euc_kr_out_ref_file.close() |
| |
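# gb18030 (two-byte sequences only): 190 trail positions per lead,
# with lead bytes starting at 0x81 and the trail byte skipping 0x7F
# (offset 0x40 below trail index 0x3F, 0x41 at or above it).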
| index = indexes["gb18030"] |
| |
| gb18030_in_file = open("src/test_data/gb18030_in.txt", "w") |
| gb18030_in_file.write(TEST_HEADER) |
| for pointer in range(0, len(index)): |
| (lead, trail) = divmod(pointer, 190) |
| lead += 0x81 |
| trail += 0x40 if trail < 0x3F else 0x41 |
| gb18030_in_file.write("%s%s\n" % (chr(lead), chr(trail))) |
| gb18030_in_file.close() |
| |
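# Expected decoder output; ASCII-range trail bytes of unmapped
# pointers are handled the same way as in the EUC-KR reference above.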
| gb18030_in_ref_file = open("src/test_data/gb18030_in_ref.txt", "w") |
| gb18030_in_ref_file.write(TEST_HEADER) |
| for pointer in range(0, len(index)): |
| code_point = index[pointer] |
| if code_point: |
| gb18030_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| else: |
| trail = pointer % 190 |
| trail += 0x40 if trail < 0x3F else 0x41 |
| if trail < 0x80: |
| gb18030_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8")) |
| else: |
| gb18030_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) |
| gb18030_in_ref_file.close() |
| |
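# Generate gb18030 encoder test data. Pointer 6555 is skipped because,
# per the WHATWG Encoding Standard, the encoder excludes it: it is a
# duplicate mapping whose code point also appears at a lower pointer.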
| gb18030_out_file = open("src/test_data/gb18030_out.txt", "w") |
| gb18030_out_ref_file = open("src/test_data/gb18030_out_ref.txt", "w") |
| gb18030_out_file.write(TEST_HEADER) |
| gb18030_out_ref_file.write(TEST_HEADER) |
| for pointer in range(0, len(index)): |
| if pointer == 6555: |
| continue |
| code_point = index[pointer] |
| if code_point: |
| (lead, trail) = divmod(pointer, 190) |
| lead += 0x81 |
| trail += 0x40 if trail < 0x3F else 0x41 |
| gb18030_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail))) |
| gb18030_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| gb18030_out_file.close() |
| gb18030_out_ref_file.close() |
| |
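# Big5: 157 trail positions per lead; trail indices below 0x3F map to
# bytes 0x40...0x7E and the rest to 0xA1...0xFE (offset 0x62).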
| index = indexes["big5"] |
| |
| big5_in_file = open("src/test_data/big5_in.txt", "w") |
| big5_in_file.write(TEST_HEADER) |
| for pointer in range(0, len(index)): |
| (lead, trail) = divmod(pointer, 157) |
| lead += 0x81 |
| trail += 0x40 if trail < 0x3F else 0x62 |
| big5_in_file.write("%s%s\n" % (chr(lead), chr(trail))) |
| big5_in_file.close() |
| |
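# Per the WHATWG Encoding Standard, these four Big5 pointers decode to
# a sequence of two code points (a base letter plus a combining mark)
# instead of a single code point.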
| big5_two_characters = { |
| 1133: u"\u00CA\u0304", |
| 1135: u"\u00CA\u030C", |
| 1164: u"\u00EA\u0304", |
| 1166: u"\u00EA\u030C", |
| } |
| |
| big5_in_ref_file = open("src/test_data/big5_in_ref.txt", "w") |
| big5_in_ref_file.write(TEST_HEADER) |
| for pointer in range(0, len(index)): |
    if pointer in big5_two_characters:
| big5_in_ref_file.write((u"%s\n" % big5_two_characters[pointer]).encode("utf-8")) |
| continue |
| code_point = index[pointer] |
| if code_point: |
| big5_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| else: |
| trail = pointer % 157 |
| trail += 0x40 if trail < 0x3F else 0x62 |
| if trail < 0x80: |
| big5_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8")) |
| else: |
| big5_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) |
| big5_in_ref_file.close() |
| |
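# Per the WHATWG Encoding Standard, the Big5 encoder uses the last
# matching pointer in the index for these code points instead of the
# first one.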
| prefer_last = [ |
| 0x2550, |
| 0x255E, |
| 0x2561, |
| 0x256A, |
| 0x5341, |
| 0x5345, |
| ] |
| |
| pointer_for_prefer_last = [] |
| |
| for code_point in prefer_last: |
| # Python lists don't have .rindex() :-( |
| for i in xrange(len(index) - 1, -1, -1): |
| candidate = index[i] |
| if candidate == code_point: |
| pointer_for_prefer_last.append(i) |
| break |
| |
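# Generate Big5 encoder test data. Encoding starts at pointer
# (0xA1 - 0x81) * 157 so that Hong Kong Supplementary Character Set
# extensions (lead bytes below 0xA1) are never produced. Duplicate
# code points are emitted only at their first pointer in the index,
# except for the prefer_last set above, which is emitted at the last.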
| big5_out_file = open("src/test_data/big5_out.txt", "w") |
| big5_out_ref_file = open("src/test_data/big5_out_ref.txt", "w") |
| big5_out_file.write(TEST_HEADER) |
| big5_out_ref_file.write(TEST_HEADER) |
| for pointer in range(((0xA1 - 0x81) * 157), len(index)): |
| code_point = index[pointer] |
| if code_point: |
| if code_point in prefer_last: |
| if pointer != pointer_for_prefer_last[prefer_last.index(code_point)]: |
| continue |
| else: |
| if pointer != index.index(code_point): |
| continue |
| (lead, trail) = divmod(pointer, 157) |
| lead += 0x81 |
| trail += 0x40 if trail < 0x3F else 0x62 |
| big5_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail))) |
| big5_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| big5_out_file.close() |
| big5_out_ref_file.close() |
| |
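# JIS X 0212 is decode-only: it is reachable via EUC-JP three-byte
# sequences of the form 0x8F lead trail, with lead and trail bytes
# both starting at 0xA1, so only decoder test data is generated.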
| index = indexes["jis0212"] |
| |
| jis0212_in_file = open("src/test_data/jis0212_in.txt", "w") |
| jis0212_in_file.write(TEST_HEADER) |
| for pointer in range(0, len(index)): |
| (lead, trail) = divmod(pointer, 94) |
| lead += 0xA1 |
| trail += 0xA1 |
| jis0212_in_file.write("\x8F%s%s\n" % (chr(lead), chr(trail))) |
| jis0212_in_file.close() |
| |
| jis0212_in_ref_file = open("src/test_data/jis0212_in_ref.txt", "w") |
| jis0212_in_ref_file.write(TEST_HEADER) |
| for pointer in range(0, len(index)): |
| code_point = index[pointer] |
| if code_point: |
| jis0212_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) |
| else: |
| jis0212_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) |
| jis0212_in_ref_file.close() |
| |
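# Write the generated part of the codepage crate's lib.rs, keeping the
# hand-written begin and end portions that read_non_generated returns.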
| (codepage_begin, codepage_end) = read_non_generated("../codepage/src/lib.rs") |
| |
| codepage_file = open("../codepage/src/lib.rs", "w") |
| |
| codepage_file.write(codepage_begin) |
| codepage_file.write(""" |
| // Instead, please regenerate using generate-encoding-data.py |
| |
| /// Supported code page numbers in estimated order of usage frequency |
| static CODE_PAGES: [u16; %d] = [ |
| """ % len(code_pages)) |
| |
| for code_page in code_pages: |
| codepage_file.write(" %d,\n" % code_page) |
| |
| codepage_file.write("""]; |
| |
| /// Encodings corresponding to the code page numbers in the same order |
| static ENCODINGS: [&'static Encoding; %d] = [ |
| """ % len(code_pages)) |
| |
| for code_page in code_pages: |
| name = encodings_by_code_page[code_page] |
| codepage_file.write(" &%s_INIT,\n" % to_constant_name(name)) |
| |
| codepage_file.write("""]; |
| |
| """) |
| |
| codepage_file.write(codepage_end) |
| codepage_file.close() |
| |
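# Write the generated tests for the codepage crate: every known code
# page number must map to its encoding, and each encoding must map
# back to its code page number (or to None if it has none).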
| (codepage_test_begin, codepage_test_end) = read_non_generated("../codepage/src/tests.rs") |
| |
| codepage_test_file = open("../codepage/src/tests.rs", "w") |
| |
| codepage_test_file.write(codepage_test_begin) |
| codepage_test_file.write(""" |
| // Instead, please regenerate using generate-encoding-data.py |
| |
| #[test] |
| fn test_to_encoding() { |
| assert_eq!(to_encoding(0), None); |
| |
| """) |
| |
| for code_page in code_pages: |
| codepage_test_file.write(" assert_eq!(to_encoding(%d), Some(%s));\n" % (code_page, to_constant_name(encodings_by_code_page[code_page]))) |
| |
| codepage_test_file.write("""} |
| |
| #[test] |
| fn test_from_encoding() { |
| """) |
| |
| for name in preferred: |
    if name in code_pages_by_encoding:
| codepage_test_file.write(" assert_eq!(from_encoding(%s), Some(%d));\n" % (to_constant_name(name), code_pages_by_encoding[name])) |
| else: |
| codepage_test_file.write(" assert_eq!(from_encoding(%s), None);\n" % to_constant_name(name)) |
| |
| codepage_test_file.write("""} |
| """) |
| |
| codepage_test_file.write(codepage_test_end) |
| codepage_test_file.close() |
| |
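# Normalize the formatting of the generated Rust code.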
| subprocess.call(["cargo", "fmt"]) |