vendor/unicode-width-0.1.10/scripts/unicode.py - toolchain/rustc - Git at Google

 #!/usr/bin/env python3
 #
 # Copyright 2011-2022 The Rust Project Developers. See the COPYRIGHT
 # file at the top-level directory of this distribution and at
 # http://rust-lang.org/COPYRIGHT.
 #
 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 # option. This file may not be copied, modified, or distributed
 # except according to those terms.

 # This script uses the following Unicode tables:
 # - EastAsianWidth.txt
 # - ReadMe.txt
 # - UnicodeData.txt
 #
 # Since this should not require frequent updates, we just store this
 # out-of-line and check the generated module into git.

 import enum
 import math
 import os
 import re
 import sys

 NUM_CODEPOINTS = 0x110000
 """An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""

 MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1))
 """The maximum number of bits required to represent a Unicode codepoint."""


 class OffsetType(enum.IntEnum):
     """Represents the data type of a lookup table's offsets. Each variant's value represents the
     number of bits required to represent that variant's type."""

     U2 = 2
     """Offsets are 2-bit unsigned integers, packed four-per-byte."""
     U4 = 4
     """Offsets are 4-bit unsigned integers, packed two-per-byte."""
     U8 = 8
     """Each offset is a single byte (u8)."""


 TABLE_CFGS = [
     (13, MAX_CODEPOINT_BITS, OffsetType.U8),
     (6, 13, OffsetType.U8),
     (0, 6, OffsetType.U2),
 ]
 """Represents the format of each level of the multi-level lookup table.
 A level's entry is of the form `(low_bit, cap_bit, offset_type)`.
 This means that every sub-table in that level is indexed by bits `low_bit..cap_bit` of the
 codepoint and those tables offsets are stored according to `offset_type`.

 If this is edited, you must ensure that `emit_module` reflects your changes."""

 MODULE_FILENAME = "tables.rs"
 """The filename of the emitted Rust module (will be created in the working directory)"""

 Codepoint = int
 BitPos = int


 def fetch_open(filename: str):
     """Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
     fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure."""
     if not os.path.exists(os.path.basename(filename)):
         os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
     try:
         return open(filename, encoding="utf-8")
     except OSError:
         sys.stderr.write(f"cannot load {filename}")
         sys.exit(1)


 def load_unicode_version() -> "tuple[int, int, int]":
     """Returns the current Unicode version by fetching and processing `ReadMe.txt`."""
     with fetch_open("ReadMe.txt") as readme:
         pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
         return tuple(map(int, re.search(pattern, readme.read()).groups()))


 class EffectiveWidth(enum.IntEnum):
     """Represents the width of a Unicode character. All East Asian Width classes resolve into
     either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`."""

     ZERO = 0
     """ Zero columns wide. """
     NARROW = 1
     """ One column wide. """
     WIDE = 2
     """ Two columns wide. """
     AMBIGUOUS = 3
     """ Two columns wide in a CJK context. One column wide in all other contexts. """


 def load_east_asian_widths() -> "list[EffectiveWidth]":
     """Return a list of effective widths, indexed by codepoint.
     Widths are determined by fetching and parsing `EastAsianWidth.txt`.

     `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`.

     `Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.

     `Ambiguous` chracters are assigned `EffectiveWidth.AMBIGUOUS`."""
     with fetch_open("EastAsianWidth.txt") as eaw:
         # matches a width assignment for a single codepoint, i.e. "1F336;N  # ..."
         single = re.compile(r"^([0-9A-F]+);(\w+) +# (\w+)")
         # matches a width assignment for a range of codepoints, i.e. "3001..3003;W  # ..."
         multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)")
         # map between width category code and condensed width
         width_codes = {
             **{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]},
             **{c: EffectiveWidth.WIDE for c in ["W", "F"]},
             "A": EffectiveWidth.AMBIGUOUS,
         }

         width_map = []
         current = 0
         for line in eaw.readlines():
             raw_data = None  # (low, high, width)
             if match := single.match(line):
                 raw_data = (match.group(1), match.group(1), match.group(2))
             elif match := multiple.match(line):
                 raw_data = (match.group(1), match.group(2), match.group(3))
             else:
                 continue
             low = int(raw_data[0], 16)
             high = int(raw_data[1], 16)
             width = width_codes[raw_data[2]]

             assert current <= high
             while current <= high:
                 # Some codepoints don't fall into any of the ranges in EastAsianWidth.txt.
                 # All such codepoints are implicitly given Neural width (resolves to narrow)
                 width_map.append(EffectiveWidth.NARROW if current < low else width)
                 current += 1

         while len(width_map) < NUM_CODEPOINTS:
             # Catch any leftover codepoints and assign them implicit Neutral/narrow width.
             width_map.append(EffectiveWidth.NARROW)

         return width_map


 def load_zero_widths() -> "list[bool]":
     """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
     character. `c` is considered a zero-width character if `c` is in general categories
     `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
     with fetch_open("UnicodeData.txt") as categories:
         zw_map = []
         current = 0
         for line in categories.readlines():
             if len(raw_data := line.split(";")) != 15:
                 continue
             [codepoint, name, cat_code] = [
                 int(raw_data[0], 16),
                 raw_data[1],
                 raw_data[2],
             ]
             zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]

             assert current <= codepoint
             while current <= codepoint:
                 if name.endswith(", Last>") or current == codepoint:
                     # if name ends with Last, we backfill the width value to all codepoints since
                     # the previous codepoint (aka the start of the range)
                     zw_map.append(zero_width)
                 else:
                     # unassigned characters are implicitly given Neutral width, which is nonzero
                     zw_map.append(False)
                 current += 1

         while len(zw_map) < NUM_CODEPOINTS:
             # Catch any leftover codepoints. They must be unassigned (so nonzero width)
             zw_map.append(False)

         return zw_map


 class Bucket:
     """A bucket contains a group of codepoints and an ordered width list. If one bucket's width
     list overlaps with another's width list, those buckets can be merged via `try_extend`."""

     def __init__(self):
         """Creates an empty bucket."""
         self.entry_set = set()
         self.widths = []

     def append(self, codepoint: Codepoint, width: EffectiveWidth):
         """Adds a codepoint/width pair to the bucket, and appends `width` to the width list."""
         self.entry_set.add((codepoint, width))
         self.widths.append(width)

     def try_extend(self, attempt: "Bucket") -> bool:
         """If either `self` or `attempt`'s width list starts with the other bucket's width list,
         set `self`'s width list to the longer of the two, add all of `attempt`'s codepoints
         into `self`, and return `True`. Otherwise, return `False`."""
         (less, more) = (self.widths, attempt.widths)
         if len(self.widths) > len(attempt.widths):
             (less, more) = (attempt.widths, self.widths)
         if less != more[: len(less)]:
             return False
         self.entry_set |= attempt.entry_set
         self.widths = more
         return True

     def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]":
         """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint."""
         result = list(self.entry_set)
         result.sort()
         return result

     def width(self) -> "EffectiveWidth":
         """If all codepoints in this bucket have the same width, return that width; otherwise,
         return `None`."""
         if len(self.widths) == 0:
             return None
         potential_width = self.widths[0]
         for width in self.widths[1:]:
             if potential_width != width:
                 return None
         return potential_width


 def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
     """Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All
     codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the
     same bucket. Returns a list of the buckets in increasing order of those bits."""
     num_bits = cap_bit - low_bit
     assert num_bits > 0
     buckets = [Bucket() for _ in range(0, 2 ** num_bits)]
     mask = (1 << num_bits) - 1
     for (codepoint, width) in entries:
         buckets[(codepoint >> low_bit) & mask].append(codepoint, width)
     return buckets


 class Table:
     """Represents a lookup table. Each table contains a certain number of subtables; each
     subtable is indexed by a contiguous bit range of the codepoint and contains a list
     of `2**(number of bits in bit range)` entries. (The bit range is the same for all subtables.)

     Typically, tables contain a list of buckets of codepoints. Bucket `i`'s codepoints should
     be indexed by sub-table `i` in the next-level lookup table. The entries of this table are
     indexes into the bucket list (~= indexes into the sub-tables of the next-level table.) The
     key to compression is that two different buckets in two different sub-tables may have the
     same width list, which means that they can be merged into the same bucket.

     If no bucket contains two codepoints with different widths, calling `indices_to_widths` will
     discard the buckets and convert the entries into `EffectiveWidth` values."""

     def __init__(
         self, entry_groups, low_bit: BitPos, cap_bit: BitPos, offset_type: OffsetType
     ):
         """Create a lookup table with a sub-table for each `(Codepoint, EffectiveWidth)` iterator
         in `entry_groups`. Each sub-table is indexed by codepoint bits in `low_bit..cap_bit`,
         and each table entry is represented in the format specified by  `offset_type`. Asserts
         that this table is actually representable with `offset_type`."""
         self.low_bit = low_bit
         self.cap_bit = cap_bit
         self.offset_type = offset_type
         self.entries = []
         self.indexed = []

         buckets = []
         for entries in entry_groups:
             buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit))

         for bucket in buckets:
             for (i, existing) in enumerate(self.indexed):
                 if existing.try_extend(bucket):
                     self.entries.append(i)
                     break
             else:
                 self.entries.append(len(self.indexed))
                 self.indexed.append(bucket)

         # Validate offset type
         for index in self.entries:
             assert index < (1 << int(self.offset_type))

     def indices_to_widths(self):
         """Destructively converts the indices in this table to the `EffectiveWidth` values of
         their buckets. Assumes that no bucket contains codepoints with different widths."""
         self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries))
         del self.indexed

     def buckets(self):
         """Returns an iterator over this table's buckets."""
         return self.indexed

     def to_bytes(self) -> "list[int]":
         """Returns this table's entries as a list of bytes. The bytes are formatted according to
         the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries
         to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will
         contain four packed 2-bit entries."""
         entries_per_byte = 8 // int(self.offset_type)
         byte_array = []
         for i in range(0, len(self.entries), entries_per_byte):
             byte = 0
             for j in range(0, entries_per_byte):
                 byte |= self.entries[i + j] << (j * int(self.offset_type))
             byte_array.append(byte)
         return byte_array


 def make_tables(
     table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries
 ) -> "list[Table]":
     """Creates a table for each configuration in `table_cfgs`, with the first config corresponding
     to the top-level lookup table, the second config corresponding to the second-level lookup
     table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs
     to include in the top-level table."""
     tables = []
     entry_groups = [entries]
     for (low_bit, cap_bit, offset_type) in table_cfgs:
         table = Table(entry_groups, low_bit, cap_bit, offset_type)
         entry_groups = map(lambda bucket: bucket.entries(), table.buckets())
         tables.append(table)
     return tables


 def emit_module(
     out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
 ):
     """Outputs a Rust module to `out_name` using table data from `tables`.
     If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`."""
     if os.path.exists(out_name):
         os.remove(out_name)
     with open(out_name, "w", newline="\n", encoding="utf-8") as module:
         module.write(
             """// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

 // NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
 """
         )
         module.write(
             f"""
 /// The version of [Unicode](http://www.unicode.org/)
 /// that this version of unicode-width is based on.
 pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version};
 """
         )

         module.write(
             """
 pub mod charwidth {
     use core::option::Option::{self, None, Some};

     /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by
     /// consulting a multi-level lookup table.
     /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
     /// they're treated as single width.
     ///
     /// # Maintenance
     /// The tables themselves are autogenerated but this function is hardcoded. You should have
     /// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.)
     /// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
     /// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes.
     #[inline]
     fn lookup_width(c: char, is_cjk: bool) -> usize {
         let cp = c as usize;

         let t1_offset = TABLES_0[cp >> 13 & 0xFF];

         // Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte,
         // so each sub-table is 128 bytes in size.
         // (Sub-tables are selected using the computed offset from the previous table.)
         let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];

         // Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits.
         // This is accomplished by packing four stored entries into one byte.
         // So each sub-table is 2**(6-2) == 16 bytes in size.
         // Since this is the last table, each entry represents an encoded width.
         let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];

         // Extract the packed width
         let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;

         // A width of 3 signifies that the codepoint is ambiguous width.
         if width == 3 {
             if is_cjk {
                 2
             } else {
                 1
             }
         } else {
             width.into()
         }
     }
 """
         )

         module.write(
             """
     /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
     /// `None` if `c` is a control character other than `'\\x00'`.
     /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
     /// they're treated as single width.
     #[inline]
     pub fn width(c: char, is_cjk: bool) -> Option<usize> {
         if c < '\\u{7F}' {
             if c >= '\\u{20}' {
                 // U+0020 to U+007F (exclusive) are single-width ASCII codepoints
                 Some(1)
             } else if c == '\\0' {
                 // U+0000 *is* a control code, but it's special-cased
                 Some(0)
             } else {
                 // U+0001 to U+0020 (exclusive) are control codes
                 None
             }
         } else if c >= '\\u{A0}' {
             // No characters >= U+00A0 are control codes, so we can consult the lookup tables
             Some(lookup_width(c, is_cjk))
         } else {
             // U+007F to U+00A0 (exclusive) are control codes
             None
         }
     }
 """
         )

         subtable_count = 1
         for (i, table) in enumerate(tables):
             new_subtable_count = len(table.buckets())
             if i == len(tables) - 1:
                 table.indices_to_widths()  # for the last table, indices == widths
             byte_array = table.to_bytes()
             module.write(
                 f"""
     /// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
     static TABLES_{i}: [u8; {len(byte_array)}] = ["""
             )
             for (j, byte) in enumerate(byte_array):
                 # Add line breaks for every 15th entry (chosen to match what rustfmt does)
                 if j % 15 == 0:
                     module.write("\n       ")
                 module.write(f" 0x{byte:02X},")
             module.write("\n    ];\n")
             subtable_count = new_subtable_count
         module.write("}\n")


 def main(module_filename: str):
     """Obtain character data from the latest version of Unicode, transform it into a multi-level
     lookup table for character width, and write a Rust module utilizing that table to
     `module_filename`.

     We obey the following rules in decreasing order of importance:
     - The soft hyphen (`U+00AD`) is single-width.
     - Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width.
     - All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width.
     - All codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
     - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
     - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
     of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.

     These rules are based off of Markus Kuhn's free `wcwidth()` implementation:
     http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c"""
     version = load_unicode_version()
     print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")

     eaw_map = load_east_asian_widths()
     zw_map = load_zero_widths()

     # Characters marked as zero-width in zw_map should be zero-width in the final map
     width_map = list(
         map(lambda x: EffectiveWidth.ZERO if x[1] else x[0], zip(eaw_map, zw_map))
     )

     # Override for soft hyphen
     width_map[0x00AD] = EffectiveWidth.NARROW

     # Override for Hangul Jamo medial vowels & final consonants
     for i in range(0x1160, 0x11FF + 1):
         width_map[i] = EffectiveWidth.ZERO

     tables = make_tables(TABLE_CFGS, enumerate(width_map))

     print("------------------------")
     total_size = 0
     for (i, table) in enumerate(tables):
         size_bytes = len(table.to_bytes())
         print(f"Table {i} Size: {size_bytes} bytes")
         total_size += size_bytes
     print("------------------------")
     print(f"  Total Size: {total_size} bytes")

     emit_module(module_filename, version, tables)
     print(f'Wrote to "{module_filename}"')


 if __name__ == "__main__":
     main(MODULE_FILENAME)
	#!/usr/bin/env python3
	#
	# Copyright 2011-2022 The Rust Project Developers. See the COPYRIGHT
	# file at the top-level directory of this distribution and at
	# http://rust-lang.org/COPYRIGHT.
	#
	# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	# option. This file may not be copied, modified, or distributed
	# except according to those terms.

	# This script uses the following Unicode tables:
	# - EastAsianWidth.txt
	# - ReadMe.txt
	# - UnicodeData.txt
	#
	# Since this should not require frequent updates, we just store this
	# out-of-line and check the generated module into git.

	import enum
	import math
	import os
	import re
	import sys

	NUM_CODEPOINTS = 0x110000
	"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""

	MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1))
	"""The maximum number of bits required to represent a Unicode codepoint."""


	class OffsetType(enum.IntEnum):
	"""Represents the data type of a lookup table's offsets. Each variant's value represents the
	number of bits required to represent that variant's type."""

	U2 = 2
	"""Offsets are 2-bit unsigned integers, packed four-per-byte."""
	U4 = 4
	"""Offsets are 4-bit unsigned integers, packed two-per-byte."""
	U8 = 8
	"""Each offset is a single byte (u8)."""


	TABLE_CFGS = [
	(13, MAX_CODEPOINT_BITS, OffsetType.U8),
	(6, 13, OffsetType.U8),
	(0, 6, OffsetType.U2),
	]
	"""Represents the format of each level of the multi-level lookup table.
	A level's entry is of the form `(low_bit, cap_bit, offset_type)`.
	This means that every sub-table in that level is indexed by bits `low_bit..cap_bit` of the
	codepoint and those tables offsets are stored according to `offset_type`.

	If this is edited, you must ensure that `emit_module` reflects your changes."""

	MODULE_FILENAME = "tables.rs"
	"""The filename of the emitted Rust module (will be created in the working directory)"""

	Codepoint = int
	BitPos = int


	def fetch_open(filename: str):
	"""Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
	fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure."""
	if not os.path.exists(os.path.basename(filename)):
	os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
	try:
	return open(filename, encoding="utf-8")
	except OSError:
	sys.stderr.write(f"cannot load {filename}")
	sys.exit(1)


	def load_unicode_version() -> "tuple[int, int, int]":
	"""Returns the current Unicode version by fetching and processing `ReadMe.txt`."""
	with fetch_open("ReadMe.txt") as readme:
	pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
	return tuple(map(int, re.search(pattern, readme.read()).groups()))


	class EffectiveWidth(enum.IntEnum):
	"""Represents the width of a Unicode character. All East Asian Width classes resolve into
	either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`."""

	ZERO = 0
	""" Zero columns wide. """
	NARROW = 1
	""" One column wide. """
	WIDE = 2
	""" Two columns wide. """
	AMBIGUOUS = 3
	""" Two columns wide in a CJK context. One column wide in all other contexts. """


	def load_east_asian_widths() -> "list[EffectiveWidth]":
	"""Return a list of effective widths, indexed by codepoint.
	Widths are determined by fetching and parsing `EastAsianWidth.txt`.

	`Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`.

	`Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.

	`Ambiguous` chracters are assigned `EffectiveWidth.AMBIGUOUS`."""
	with fetch_open("EastAsianWidth.txt") as eaw:
	# matches a width assignment for a single codepoint, i.e. "1F336;N # ..."
	single = re.compile(r"^([0-9A-F]+);(\w+) +# (\w+)")
	# matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..."
	multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)")
	# map between width category code and condensed width
	width_codes = {
	**{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]},
	**{c: EffectiveWidth.WIDE for c in ["W", "F"]},
	"A": EffectiveWidth.AMBIGUOUS,
	}

	width_map = []
	current = 0
	for line in eaw.readlines():
	raw_data = None # (low, high, width)
	if match := single.match(line):
	raw_data = (match.group(1), match.group(1), match.group(2))
	elif match := multiple.match(line):
	raw_data = (match.group(1), match.group(2), match.group(3))
	else:
	continue
	low = int(raw_data[0], 16)
	high = int(raw_data[1], 16)
	width = width_codes[raw_data[2]]

	assert current <= high
	while current <= high:
	# Some codepoints don't fall into any of the ranges in EastAsianWidth.txt.
	# All such codepoints are implicitly given Neural width (resolves to narrow)
	width_map.append(EffectiveWidth.NARROW if current < low else width)
	current += 1

	while len(width_map) < NUM_CODEPOINTS:
	# Catch any leftover codepoints and assign them implicit Neutral/narrow width.
	width_map.append(EffectiveWidth.NARROW)

	return width_map


	def load_zero_widths() -> "list[bool]":
	"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
	character. `c` is considered a zero-width character if `c` is in general categories
	`Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
	with fetch_open("UnicodeData.txt") as categories:
	zw_map = []
	current = 0
	for line in categories.readlines():
	if len(raw_data := line.split(";")) != 15:
	continue
	[codepoint, name, cat_code] = [
	int(raw_data[0], 16),
	raw_data[1],
	raw_data[2],
	]
	zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]

	assert current <= codepoint
	while current <= codepoint:
	if name.endswith(", Last>") or current == codepoint:
	# if name ends with Last, we backfill the width value to all codepoints since
	# the previous codepoint (aka the start of the range)
	zw_map.append(zero_width)
	else:
	# unassigned characters are implicitly given Neutral width, which is nonzero
	zw_map.append(False)
	current += 1

	while len(zw_map) < NUM_CODEPOINTS:
	# Catch any leftover codepoints. They must be unassigned (so nonzero width)
	zw_map.append(False)

	return zw_map


	class Bucket:
	"""A bucket contains a group of codepoints and an ordered width list. If one bucket's width
	list overlaps with another's width list, those buckets can be merged via `try_extend`."""

	def __init__(self):
	"""Creates an empty bucket."""
	self.entry_set = set()
	self.widths = []

	def append(self, codepoint: Codepoint, width: EffectiveWidth):
	"""Adds a codepoint/width pair to the bucket, and appends `width` to the width list."""
	self.entry_set.add((codepoint, width))
	self.widths.append(width)

	def try_extend(self, attempt: "Bucket") -> bool:
	"""If either `self` or `attempt`'s width list starts with the other bucket's width list,
	set `self`'s width list to the longer of the two, add all of `attempt`'s codepoints
	into `self`, and return `True`. Otherwise, return `False`."""
	(less, more) = (self.widths, attempt.widths)
	if len(self.widths) > len(attempt.widths):
	(less, more) = (attempt.widths, self.widths)
	if less != more[: len(less)]:
	return False
	self.entry_set \|= attempt.entry_set
	self.widths = more
	return True

	def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]":
	"""Return a list of the codepoint/width pairs in this bucket, sorted by codepoint."""
	result = list(self.entry_set)
	result.sort()
	return result

	def width(self) -> "EffectiveWidth":
	"""If all codepoints in this bucket have the same width, return that width; otherwise,
	return `None`."""
	if len(self.widths) == 0:
	return None
	potential_width = self.widths[0]
	for width in self.widths[1:]:
	if potential_width != width:
	return None
	return potential_width


	def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
	"""Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All
	codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the
	same bucket. Returns a list of the buckets in increasing order of those bits."""
	num_bits = cap_bit - low_bit
	assert num_bits > 0
	buckets = [Bucket() for _ in range(0, 2 ** num_bits)]
	mask = (1 << num_bits) - 1
	for (codepoint, width) in entries:
	buckets[(codepoint >> low_bit) & mask].append(codepoint, width)
	return buckets


	class Table:
	"""Represents a lookup table. Each table contains a certain number of subtables; each
	subtable is indexed by a contiguous bit range of the codepoint and contains a list
	of `2**(number of bits in bit range)` entries. (The bit range is the same for all subtables.)

	Typically, tables contain a list of buckets of codepoints. Bucket `i`'s codepoints should
	be indexed by sub-table `i` in the next-level lookup table. The entries of this table are
	indexes into the bucket list (~= indexes into the sub-tables of the next-level table.) The
	key to compression is that two different buckets in two different sub-tables may have the
	same width list, which means that they can be merged into the same bucket.

	If no bucket contains two codepoints with different widths, calling `indices_to_widths` will
	discard the buckets and convert the entries into `EffectiveWidth` values."""

	def __init__(
	self, entry_groups, low_bit: BitPos, cap_bit: BitPos, offset_type: OffsetType
	):
	"""Create a lookup table with a sub-table for each `(Codepoint, EffectiveWidth)` iterator
	in `entry_groups`. Each sub-table is indexed by codepoint bits in `low_bit..cap_bit`,
	and each table entry is represented in the format specified by `offset_type`. Asserts
	that this table is actually representable with `offset_type`."""
	self.low_bit = low_bit
	self.cap_bit = cap_bit
	self.offset_type = offset_type
	self.entries = []
	self.indexed = []

	buckets = []
	for entries in entry_groups:
	buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit))

	for bucket in buckets:
	for (i, existing) in enumerate(self.indexed):
	if existing.try_extend(bucket):
	self.entries.append(i)
	break
	else:
	self.entries.append(len(self.indexed))
	self.indexed.append(bucket)

	# Validate offset type
	for index in self.entries:
	assert index < (1 << int(self.offset_type))

	def indices_to_widths(self):
	"""Destructively converts the indices in this table to the `EffectiveWidth` values of
	their buckets. Assumes that no bucket contains codepoints with different widths."""
	self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries))
	del self.indexed

	def buckets(self):
	"""Returns an iterator over this table's buckets."""
	return self.indexed

	def to_bytes(self) -> "list[int]":
	"""Returns this table's entries as a list of bytes. The bytes are formatted according to
	the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries
	to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will
	contain four packed 2-bit entries."""
	entries_per_byte = 8 // int(self.offset_type)
	byte_array = []
	for i in range(0, len(self.entries), entries_per_byte):
	byte = 0
	for j in range(0, entries_per_byte):
	byte \|= self.entries[i + j] << (j * int(self.offset_type))
	byte_array.append(byte)
	return byte_array


	def make_tables(
	table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries
	) -> "list[Table]":
	"""Creates a table for each configuration in `table_cfgs`, with the first config corresponding
	to the top-level lookup table, the second config corresponding to the second-level lookup
	table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs
	to include in the top-level table."""
	tables = []
	entry_groups = [entries]
	for (low_bit, cap_bit, offset_type) in table_cfgs:
	table = Table(entry_groups, low_bit, cap_bit, offset_type)
	entry_groups = map(lambda bucket: bucket.entries(), table.buckets())
	tables.append(table)
	return tables


	def emit_module(
	out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
	):
	"""Outputs a Rust module to `out_name` using table data from `tables`.
	If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`."""
	if os.path.exists(out_name):
	os.remove(out_name)
	with open(out_name, "w", newline="\n", encoding="utf-8") as module:
	module.write(
	"""// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT
	// file at the top-level directory of this distribution and at
	// http://rust-lang.org/COPYRIGHT.
	//
	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	// option. This file may not be copied, modified, or distributed
	// except according to those terms.

	// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
	"""
	)
	module.write(
	f"""
	/// The version of [Unicode](http://www.unicode.org/)
	/// that this version of unicode-width is based on.
	pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version};
	"""
	)

	module.write(
	"""
	pub mod charwidth {
	use core::option::Option::{self, None, Some};

	/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by
	/// consulting a multi-level lookup table.
	/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
	/// they're treated as single width.
	///
	/// # Maintenance
	/// The tables themselves are autogenerated but this function is hardcoded. You should have
	/// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.)
	/// However, if you change the actual structure of the lookup tables (perhaps by editing the
	/// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes.
	#[inline]
	fn lookup_width(c: char, is_cjk: bool) -> usize {
	let cp = c as usize;

	let t1_offset = TABLES_0[cp >> 13 & 0xFF];

	// Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte,
	// so each sub-table is 128 bytes in size.
	// (Sub-tables are selected using the computed offset from the previous table.)
	let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];

	// Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits.
	// This is accomplished by packing four stored entries into one byte.
	// So each sub-table is 2**(6-2) == 16 bytes in size.
	// Since this is the last table, each entry represents an encoded width.
	let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];

	// Extract the packed width
	let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;

	// A width of 3 signifies that the codepoint is ambiguous width.
	if width == 3 {
	if is_cjk {
	2
	} else {
	1
	}
	} else {
	width.into()
	}
	}
	"""
	)

	module.write(
	"""
	/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
	/// `None` if `c` is a control character other than `'\\x00'`.
	/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
	/// they're treated as single width.
	#[inline]
	pub fn width(c: char, is_cjk: bool) -> Option<usize> {
	if c < '\\u{7F}' {
	if c >= '\\u{20}' {
	// U+0020 to U+007F (exclusive) are single-width ASCII codepoints
	Some(1)
	} else if c == '\\0' {
	// U+0000 is a control code, but it's special-cased
	Some(0)
	} else {
	// U+0001 to U+0020 (exclusive) are control codes
	None
	}
	} else if c >= '\\u{A0}' {
	// No characters >= U+00A0 are control codes, so we can consult the lookup tables
	Some(lookup_width(c, is_cjk))
	} else {
	// U+007F to U+00A0 (exclusive) are control codes
	None
	}
	}
	"""
	)

	subtable_count = 1
	for (i, table) in enumerate(tables):
	new_subtable_count = len(table.buckets())
	if i == len(tables) - 1:
	table.indices_to_widths() # for the last table, indices == widths
	byte_array = table.to_bytes()
	module.write(
	f"""
	/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
	static TABLES_{i}: [u8; {len(byte_array)}] = ["""
	)
	for (j, byte) in enumerate(byte_array):
	# Add line breaks for every 15th entry (chosen to match what rustfmt does)
	if j % 15 == 0:
	module.write("\n ")
	module.write(f" 0x{byte:02X},")
	module.write("\n ];\n")
	subtable_count = new_subtable_count
	module.write("}\n")


	def main(module_filename: str):
	"""Obtain character data from the latest version of Unicode, transform it into a multi-level
	lookup table for character width, and write a Rust module utilizing that table to
	`module_filename`.

	We obey the following rules in decreasing order of importance:
	- The soft hyphen (`U+00AD`) is single-width.
	- Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width.
	- All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width.
	- All codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
	- All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
	- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
	of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.

	These rules are based off of Markus Kuhn's free `wcwidth()` implementation:
	http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c"""
	version = load_unicode_version()
	print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")

	eaw_map = load_east_asian_widths()
	zw_map = load_zero_widths()

	# Characters marked as zero-width in zw_map should be zero-width in the final map
	width_map = list(
	map(lambda x: EffectiveWidth.ZERO if x[1] else x[0], zip(eaw_map, zw_map))
	)

	# Override for soft hyphen
	width_map[0x00AD] = EffectiveWidth.NARROW

	# Override for Hangul Jamo medial vowels & final consonants
	for i in range(0x1160, 0x11FF + 1):
	width_map[i] = EffectiveWidth.ZERO

	tables = make_tables(TABLE_CFGS, enumerate(width_map))

	print("------------------------")
	total_size = 0
	for (i, table) in enumerate(tables):
	size_bytes = len(table.to_bytes())
	print(f"Table {i} Size: {size_bytes} bytes")
	total_size += size_bytes
	print("------------------------")
	print(f" Total Size: {total_size} bytes")

	emit_module(module_filename, version, tables)
	print(f'Wrote to "{module_filename}"')


	if __name__ == "__main__":
	main(MODULE_FILENAME)