| #! /usr/bin/python |
| |
| # PCRE2 UNICODE PROPERTY SUPPORT |
| # ------------------------------ |
| # |
| # This script generates the pcre2_ucd.c file from Unicode data files. This is |
| # the compressed Unicode property data used by PCRE2. The script was created in |
| # December 2021 as part of the Unicode data generation refactoring. It is |
| # basically a re-working of the MultiStage2.py script that was submitted to the |
| # PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of |
| # Unicode property support. A number of extensions have since been added. The |
| # main difference in the 2021 upgrade (apart from comments and layout) is that |
| # the data tables (e.g. list of script names) are now listed in or generated by |
| # a separate Python module that is shared with the other Generate scripts. |
| # |
| # This script must be run in the "maint" directory. It requires the following |
# Unicode data tables: BidiMirroring.txt, CaseFolding.txt,
| # DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt, |
| # GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt, |
| # PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and |
| # emoji-data.txt. These must be in the Unicode.tables subdirectory. |
| # |
# The emoji-data.txt file is found in the "emoji" subdirectory even though it
# is technically part of a different (but coordinated) standard, Unicode
# Technical Standard #51 ("Unicode Emoji"), as shown in that standard's
# associated files, for example:
| # |
| # http://unicode.org/Public/emoji/13.0/ReadMe.txt |
| # |
| # DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted" |
| # subdirectory of the Unicode database (UCD) on the Unicode web site; |
| # GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files |
| # are in the top-level UCD directory. |
| # |
| # ----------------------------------------------------------------------------- |
| # Minor modifications made to the original script: |
| # Added #! line at start |
| # Removed tabs |
| # Made it work with Python 2.4 by rewriting two statements that needed 2.5 |
| # Consequent code tidy |
| # Adjusted data file names to take from the Unicode.tables directory |
| # Adjusted global table names by prefixing _pcre_. |
| # Commented out stuff relating to the casefolding table, which isn't used; |
| # removed completely in 2012. |
| # Corrected size calculation |
| # Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed. |
| # Update for PCRE2: name changes, and SUPPORT_UCP is abolished. |
| # |
| # Major modifications made to the original script: |
| # Added code to add a grapheme break property field to records. |
| # |
| # Added code to search for sets of more than two characters that must match |
| # each other caselessly. A new table is output containing these sets, and |
| # offsets into the table are added to the main output records. This new |
| # code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer |
| # used. |
| # |
| # Update for Python3: |
| # . Processed with 2to3, but that didn't fix everything |
| # . Changed string.strip to str.strip |
| # . Added encoding='utf-8' to the open() call |
| # . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is |
| # required and the result of the division is a float |
| # |
| # Added code to scan the emoji-data.txt file to find the Extended Pictographic |
| # property, which is used by PCRE2 as a grapheme breaking property. This was |
| # done when updating to Unicode 11.0.0 (July 2018). |
| # |
| # Added code to add a Script Extensions field to records. This has increased |
| # their size from 8 to 12 bytes, only 10 of which are currently used. |
| # |
| # Added code to add a bidi class field to records by scanning the |
| # DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare |
| # bytes, so now 11 out of 12 are in use. |
| # |
| # 01-March-2010: Updated list of scripts for Unicode 5.2.0 |
| # 30-April-2011: Updated list of scripts for Unicode 6.0.0 |
| # July-2012: Updated list of scripts for Unicode 6.1.0 |
| # 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new |
| # field in the record to hold the value. Luckily, the |
| # structure had a hole in it, so the resulting table is |
| # not much bigger than before. |
| # 18-September-2012: Added code for multiple caseless sets. This uses the |
| # final hole in the structure. |
| # 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0 |
| # 13-May-2014: Updated for PCRE2 |
| # 03-June-2014: Updated for Python 3 |
| # 20-June-2014: Updated for Unicode 7.0.0 |
| # 12-August-2014: Updated to put Unicode version into the file |
| # 19-June-2015: Updated for Unicode 8.0.0 |
| # 02-July-2017: Updated for Unicode 10.0.0 |
| # 03-July-2018: Updated for Unicode 11.0.0 |
| # 07-July-2018: Added code to scan emoji-data.txt for the Extended |
| # Pictographic property. |
| # 01-October-2018: Added the 'Unknown' script name |
| # 03-October-2018: Added new field for Script Extensions |
| # 27-July-2019: Updated for Unicode 12.1.0 |
| # 10-March-2020: Updated for Unicode 13.0.0 |
| # PCRE2-10.39: Updated for Unicode 14.0.0 |
| # 05-December-2021: Added code to scan DerivedBidiClass.txt for bidi class, |
| # and also PropList.txt for the Bidi_Control property |
| # 19-December-2021: Reworked script extensions lists to be bit maps instead |
| # of zero-terminated lists of script numbers. |
| # ---------------------------------------------------------------------------- |
| # |
| # Changes to the refactored script: |
| # |
| # 26-December-2021: Refactoring completed |
| # 10-January-2022: Addition of general Boolean property support |
| # 12-January-2022: Merge scriptx and bidiclass fields |
| # 14-January-2022: Enlarge Boolean property offset to 12 bits |
# 28-January-2023: Remove ASCII "other case" from non-ASCII characters that
#                    are present in caseless sets.
| # |
| # ---------------------------------------------------------------------------- |
| # |
| # |
| # The main tables generated by this script are used by macros defined in |
| # pcre2_internal.h. They look up Unicode character properties using short |
# sequences of code that contain no branches, which makes for greater speed.
| # |
| # Conceptually, there is a table of records (of type ucd_record), one for each |
| # Unicode character. Each record contains the script number, script extension |
| # value, character type, grapheme break type, offset to caseless matching set, |
| # offset to the character's other case, the bidi class, and offset to bitmap of |
| # Boolean properties. |
| # |
| # A real table covering all Unicode characters would be far too big. It can be |
| # efficiently compressed by observing that many characters have the same |
| # record, and many blocks of characters (taking 128 characters in a block) have |
| # the same set of records as other blocks. This leads to a 2-stage lookup |
| # process. |
| # |
| # This script constructs seven tables. The ucd_caseless_sets table contains |
| # lists of characters that all match each other caselessly. Each list is |
| # in order, and is terminated by NOTACHAR (0xffffffff), which is larger than |
| # any valid character. The first list is empty; this is used for characters |
| # that are not part of any list. |
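#
# A sketch (using a hypothetical helper; the library does this in C) of how
# a set can be walked, given a record's offset into the table:
#
#   def caseless_set_members(offset):
#     members = []
#     while ucd_caseless_sets[offset] != NOTACHAR:
#       members.append(ucd_caseless_sets[offset])
#       offset += 1
#     return members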
| # |
| # The ucd_digit_sets table contains the code points of the '9' characters in |
| # each set of 10 decimal digits in Unicode. This is used to ensure that digits |
| # in script runs all come from the same set. The first element in the vector |
| # contains the number of subsequent elements, which are in ascending order. |
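#
# A sketch of how the table might be used (a hypothetical helper built on the
# bisect module; the library does an equivalent binary chop in C): find the
# smallest '9' that is >= a digit's code point; two digits belong to the same
# set if both lie within the ten characters ending at that '9':
#
#   def same_digit_set(d1, d2):
#     n = ucd_digit_sets[0]
#     i = bisect.bisect_left(ucd_digit_sets, d1, 1, n + 1)
#     nine = ucd_digit_sets[i]
#     return nine - 9 <= d2 <= nine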
| # |
| # Scripts are partitioned into two groups. Scripts that appear in at least one |
| # character's script extension list come first, followed by "Unknown" and then |
| # all the rest. This sorting is done automatically in the GenerateCommon.py |
| # script. A script's number is its index in the script_names list. |
| # |
| # The ucd_script_sets table contains bitmaps that represent lists of scripts |
| # for Script Extensions properties. Each bitmap consists of a fixed number of |
| # unsigned 32-bit numbers, enough to allocate a bit for every script that is |
| # used in any character's extension list, that is, enough for every script |
| # whose number is less than ucp_Unknown. A character's script extension value |
| # in its ucd record is an offset into the ucd_script_sets vector. The first |
| # bitmap has no bits set; characters that have no script extensions have zero |
| # as their script extensions value so that they use this map. |
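#
# A sketch (hypothetical helper) of testing whether script number n appears
# in the bitmap at a given offset, matching the packing used by the
# write_bitsets() function below:
#
#   def scriptx_contains(offset, n):
#     word = ucd_script_sets[offset + n // 32]
#     return (word & (1 << (n % 32))) != 0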
| # |
| # The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean |
| # properties. Each bitmap consists of a fixed number of unsigned 32-bit |
| # numbers, enough to allocate a bit for each supported Boolean property. |
| # |
| # The ucd_records table contains one instance of every unique character record |
| # that is required. The ucd_stage1 table is indexed by a character's block |
| # number, which is the character's code point divided by 128, since 128 is the |
# size of each block. The result of a lookup in ucd_stage1 is a "virtual"
# block number.
| # |
| # The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by |
| # the offset of a character within its own block, and the result is the index |
| # number of the required record in the ucd_records vector. |
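#
# In Python terms, a full property lookup is conceptually as follows (a
# sketch only; the real lookup is done by branch-free C macros in
# pcre2_internal.h, assuming the 128-character block size described above):
#
#   block  = ucd_stage1[c // 128]                # virtual block number
#   index  = ucd_stage2[block * 128 + c % 128]   # index into ucd_records
#   record = ucd_records[index]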
| # |
| # The following examples are correct for the Unicode 14.0.0 database. Future |
# updates may change the actual lookup values.
| # |
| # Example: lowercase "a" (U+0061) is in block 0 |
| # lookup 0 in stage1 table yields 0 |
| # lookup 97 (0x61) in the first table in stage2 yields 35 |
| # record 35 is { 0, 5, 12, 0, -32, 18432, 44 } |
| # 0 = ucp_Latin => Latin script |
| # 5 = ucp_Ll => Lower case letter |
| # 12 = ucp_gbOther => Grapheme break property "Other" |
| # 0 => Not part of a caseless set |
| # -32 (-0x20) => Other case is U+0041 |
| # 18432 = 0x4800 => Combined Bidi class + script extension values |
| # 44 => Offset to Boolean properties |
| # |
| # The top 5 bits of the sixth field are the Bidi class, with the rest being the |
| # script extension value, giving: |
| # |
| # 9 = ucp_bidiL => Bidi class left-to-right |
| # 0 => No special script extension property |
| # |
# Almost all lowercase Latin characters resolve to the same record. One or two
| # are different because they are part of a multi-character caseless set (for |
| # example, k, K and the Kelvin symbol are such a set). |
| # |
| # Example: hiragana letter A (U+3042) is in block 96 (0x60) |
| # lookup 96 in stage1 table yields 93 |
| # lookup 66 (0x42) in table 93 in stage2 yields 819 |
| # record 819 is { 20, 7, 12, 0, 0, 18432, 82 } |
| # 20 = ucp_Hiragana => Hiragana script |
| # 7 = ucp_Lo => Other letter |
| # 12 = ucp_gbOther => Grapheme break property "Other" |
| # 0 => Not part of a caseless set |
| # 0 => No other case |
| # 18432 = 0x4800 => Combined Bidi class + script extension values |
| # 82 => Offset to Boolean properties |
| # |
| # The top 5 bits of the sixth field are the Bidi class, with the rest being the |
| # script extension value, giving: |
| # |
| # 9 = ucp_bidiL => Bidi class left-to-right |
| # 0 => No special script extension property |
| # |
| # Example: vedic tone karshana (U+1CD0) is in block 57 (0x39) |
| # lookup 57 in stage1 table yields 55 |
| # lookup 80 (0x50) in table 55 in stage2 yields 621 |
| # record 621 is { 84, 12, 3, 0, 0, 26762, 96 } |
| # 84 = ucp_Inherited => Script inherited from predecessor |
| # 12 = ucp_Mn => Non-spacing mark |
| # 3 = ucp_gbExtend => Grapheme break property "Extend" |
| # 0 => Not part of a caseless set |
| # 0 => No other case |
| # 26762 = 0x688A => Combined Bidi class + script extension values |
| # 96 => Offset to Boolean properties |
| # |
| # The top 5 bits of the sixth field are the Bidi class, with the rest being the |
| # script extension value, giving: |
| # |
| # 13 = ucp_bidiNSM => Bidi class non-spacing mark |
| # 138 => Script Extension list offset = 138 |
| # |
| # At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8, |
| # 18, and 47 set. This means that this character is expected to be used with |
| # any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha. |
| # |
| # Philip Hazel, last updated 14 January 2022. |
| ############################################################################## |
| |
| |
| # Import standard modules |
| |
| import re |
| import string |
| import sys |
| |
| # Import common data lists and functions |
| |
| from GenerateCommon import \ |
| bidi_classes, \ |
| bool_properties, \ |
| bool_propsfiles, \ |
| bool_props_list_item_size, \ |
| break_properties, \ |
| category_names, \ |
| general_category_names, \ |
| script_abbrevs, \ |
| script_list_item_size, \ |
| script_names, \ |
| open_output |
| |
| # Some general parameters |
| |
| MAX_UNICODE = 0x110000 |
| NOTACHAR = 0xffffffff |
| |
| |
| # --------------------------------------------------------------------------- |
| # DEFINE FUNCTIONS |
| # --------------------------------------------------------------------------- |
| |
| |
| # Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt |
| |
| def make_get_names(enum): |
| return lambda chardata: enum.index(chardata[1]) |
| |
| |
| # Parse a line of DerivedBidiClass.txt |
| |
| def get_bidi(chardata): |
| if len(chardata[1]) > 3: |
| return bidi_classes_long.index(chardata[1]) |
| else: |
| return bidi_classes_short.index(chardata[1]) |
| |
| |
| # Parse a line of CaseFolding.txt |
| |
| def get_other_case(chardata): |
| if chardata[1] == 'C' or chardata[1] == 'S': |
| return int(chardata[2], 16) - int(chardata[0], 16) |
| return None |
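
# For example, the CaseFolding.txt line
#   0041; C; 0061; # LATIN CAPITAL LETTER A
# reaches this function (after comment stripping and splitting on ';') as
# chardata = ['0041', 'C', '0061', ''], for which it returns
# 0x61 - 0x41 = 32.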
| |
| |
| # Parse a line of ScriptExtensions.txt |
| |
| def get_script_extension(chardata): |
| global last_script_extension |
| |
| offset = len(script_lists) * script_list_item_size |
| if last_script_extension == chardata[1]: |
| return offset - script_list_item_size |
| |
| last_script_extension = chardata[1] |
| script_lists.append(tuple(script_abbrevs.index(abbrev) for abbrev in last_script_extension.split(' '))) |
| return offset |
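
# For example, a ScriptExtensions.txt line such as
#   1CD0..1CD2 ; Beng Deva Gran Knda
# arrives with chardata[1] == 'Beng Deva Gran Knda'; a tuple of those four
# script numbers is appended to script_lists (unless it repeats the previous
# line's list) and the offset of the entry, in 32-bit words, is returned.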
| |
| |
# Read a whole table into memory, setting/checking the Unicode version
| |
| def read_table(file_name, get_value, default_value): |
| global unicode_version |
| |
| f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name) |
| file_base = f.group(1) |
| version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$" |
| file = open(file_name, 'r', encoding='utf-8') |
| f = re.match(version_pat, file.readline()) |
| version = f.group(1) |
| if unicode_version == "": |
| unicode_version = version |
| elif unicode_version != version: |
| print("WARNING: Unicode version differs in %s", file_name, file=sys.stderr) |
| |
| table = [default_value] * MAX_UNICODE |
| for line in file: |
| if file_base == 'DerivedBidiClass': |
| line = re.sub(r'# @missing: ', '', line) |
| |
| line = re.sub(r'#.*', '', line) |
| chardata = list(map(str.strip, line.split(';'))) |
| if len(chardata) <= 1: |
| continue |
| value = get_value(chardata) |
| if value is None: |
| continue |
| m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0]) |
| char = int(m.group(1), 16) |
| if m.group(3) is None: |
| last = char |
| else: |
| last = int(m.group(3), 16) |
| for i in range(char, last + 1): |
| table[i] = value |
| |
| file.close() |
| return table |
| |
| |
| # Get the smallest possible C language type for the values in a table |
| |
| def get_type_size(table): |
| type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4), |
| ("signed char", 1), ("int16_t", 2), ("int32_t", 4)] |
| limits = [(0, 255), (0, 65535), (0, 4294967295), (-128, 127), |
| (-32768, 32767), (-2147483648, 2147483647)] |
| minval = min(table) |
| maxval = max(table) |
| for num, (minlimit, maxlimit) in enumerate(limits): |
| if minlimit <= minval and maxval <= maxlimit: |
| return type_size[num] |
| raise OverflowError("Too large to fit into C types") |
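
# For example, get_type_size([0, 300]) yields ("uint16_t", 2), and
# get_type_size([-32, 100]) yields ("signed char", 1).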
| |
| |
| # Get the total size of a list of tables |
| |
| def get_tables_size(*tables): |
| total_size = 0 |
| for table in tables: |
| type, size = get_type_size(table) |
| total_size += size * len(table) |
| return total_size |
| |
| |
| # Compress a table into the two stages |
| |
| def compress_table(table, block_size): |
| blocks = {} # Dictionary for finding identical blocks |
| stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table) |
| stage2 = [] # Stage 2 table contains the blocks with property values |
| table = tuple(table) |
| for i in range(0, len(table), block_size): |
| block = table[i:i+block_size] |
| start = blocks.get(block) |
| if start is None: |
| # Allocate a new block |
      start = len(stage2) // block_size
| stage2 += block |
| blocks[block] = start |
| stage1.append(start) |
| return stage1, stage2 |
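
# For example, with a block size of 4,
#   compress_table((7,7,7,7, 7,7,7,7, 5,5,5,5), 4)
# gives stage1 = [0, 0, 1] and stage2 = [7, 7, 7, 7, 5, 5, 5, 5]: the second
# input block reuses virtual block 0, so only two distinct blocks are stored.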
| |
| |
| # Output a table |
| |
| def write_table(table, table_name, block_size = None): |
| type, size = get_type_size(table) |
| ELEMS_PER_LINE = 16 |
| |
| s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table)) |
| if block_size: |
| s += ", block = %d" % block_size |
| f.write(s + " */\n") |
| table = tuple(table) |
| if block_size is None: |
| fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n" |
| mult = MAX_UNICODE / len(table) |
| for i in range(0, len(table), ELEMS_PER_LINE): |
| f.write(fmt % (table[i:i+ELEMS_PER_LINE] + (int(i * mult),))) |
| else: |
| if block_size > ELEMS_PER_LINE: |
| el = ELEMS_PER_LINE |
| else: |
| el = block_size |
| fmt = "%3d," * el + "\n" |
| if block_size > ELEMS_PER_LINE: |
| fmt = fmt * int(block_size / ELEMS_PER_LINE) |
| for i in range(0, len(table), block_size): |
| f.write(("\n/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])) |
| f.write("};\n\n") |
| |
| |
| # Extract the unique combinations of properties into records |
| |
| def combine_tables(*tables): |
| records = {} |
| index = [] |
| for t in zip(*tables): |
| i = records.get(t) |
| if i is None: |
| i = records[t] = len(records) |
| index.append(i) |
| return index, records |
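
# For example, combine_tables([1, 1, 2], [5, 5, 6]) gives the index
# [0, 0, 1] and the records {(1, 5): 0, (2, 6): 1}.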
| |
| |
| # Create a record struct |
| |
| def get_record_size_struct(records): |
| size = 0 |
| structure = 'typedef struct {\n' |
| for i in range(len(records[0])): |
| record_slice = [record[i] for record in records] |
| slice_type, slice_size = get_type_size(record_slice) |
    # add padding: round up to the nearest multiple of slice_size
| size = (size + slice_size - 1) & -slice_size |
| size += slice_size |
| structure += '%s property_%d;\n' % (slice_type, i) |
| |
| # round up to the first item of the next structure in array |
| record_slice = [record[0] for record in records] |
| slice_type, slice_size = get_type_size(record_slice) |
| size = (size + slice_size - 1) & -slice_size |
| |
| structure += '} ucd_record;\n*/\n' |
| return size, structure |
| |
| |
| # Write records |
| |
| def write_records(records, record_size): |
| f.write('const ucd_record PRIV(ucd_records)[] = { ' + \ |
| '/* %d bytes, record size %d */\n' % (len(records) * record_size, record_size)) |
| records = list(zip(list(records.keys()), list(records.values()))) |
| records.sort(key = lambda x: x[1]) |
| for i, record in enumerate(records): |
| f.write((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */\n') % (record[0] + (i,))) |
| f.write('};\n\n') |
| |
| |
# Write a list of bit sets

def write_bitsets(bitlists, item_size):
  for d in bitlists:
| bitwords = [0] * item_size |
| for idx in d: |
| bitwords[idx // 32] |= 1 << (idx & 31) |
| s = " " |
| for x in bitwords: |
| f.write("%s" % s) |
| s = ", " |
| f.write("0x%08xu" % x) |
| f.write(",\n") |
| f.write("};\n\n") |
| |
| |
| # --------------------------------------------------------------------------- |
| # This bit of code must have been useful when the original script was being |
| # developed. Retain it just in case it is ever needed again. |
| |
| # def test_record_size(): |
| # tests = [ \ |
| # ( [(3,), (6,), (6,), (1,)], 1 ), \ |
| # ( [(300,), (600,), (600,), (100,)], 2 ), \ |
| # ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \ |
| # ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \ |
| # ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \ |
| # ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \ |
| # ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \ |
| # ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \ |
| # ] |
| # for test in tests: |
| # size, struct = get_record_size_struct(test[0]) |
| # assert(size == test[1]) |
| # test_record_size() |
| # --------------------------------------------------------------------------- |
| |
| |
| |
| # --------------------------------------------------------------------------- |
| # MAIN CODE FOR CREATING TABLES |
| # --------------------------------------------------------------------------- |
| |
| unicode_version = "" |
| |
| # Some of the tables imported from GenerateCommon.py have alternate comment |
| # strings for use by GenerateUcpHeader. The comments are not wanted here, so |
| # remove them. |
| |
| bidi_classes_short = bidi_classes[::2] |
| bidi_classes_long = bidi_classes[1::2] |
| break_properties = break_properties[::2] |
| category_names = category_names[::2] |
| |
| # Create the various tables from Unicode data files |
| |
| script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown')) |
| category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn')) |
| break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other')) |
| other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0) |
| bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', get_bidi, bidi_classes_short.index('L')) |
| |
| # The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now |
| # we need to find the Extended_Pictographic property for emoji characters. This |
| # can be set as an additional grapheme break property, because the default for |
| # all the emojis is "other". We scan the emoji-data.txt file and modify the |
| # break-props table. |
| |
| file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8') |
| for line in file: |
| line = re.sub(r'#.*', '', line) |
| chardata = list(map(str.strip, line.split(';'))) |
| if len(chardata) <= 1: |
| continue |
| if chardata[1] != "Extended_Pictographic": |
| continue |
| m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0]) |
| char = int(m.group(1), 16) |
| if m.group(3) is None: |
| last = char |
| else: |
| last = int(m.group(3), 16) |
| for i in range(char, last + 1): |
| if break_props[i] != break_properties.index('Other'): |
| print("WARNING: Emoji 0x%x has break property %s, not 'Other'", |
| i, break_properties[break_props[i]], file=sys.stderr) |
| break_props[i] = break_properties.index('Extended_Pictographic') |
| file.close() |
| |
# Handle script extensions. The get_script_extension() function maintains a
# list of unique script lists (written out later as bitmaps), returning the
# offset in that list. Initialize the list with an empty entry, which is used
# for characters that have no script extensions.
| |
| script_lists = [[]] |
| last_script_extension = "" |
| scriptx_bidi_class = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0) |
| |
| for idx in range(len(scriptx_bidi_class)): |
| scriptx_bidi_class[idx] = scriptx_bidi_class[idx] | (bidi_class[idx] << 11) |
| bidi_class = None |
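
# For example, a character with bidi class ucp_bidiL (9) and no script
# extensions (value 0) gets 0 | (9 << 11) = 0x4800 = 18432, the value that
# appears in the worked examples in the heading comment.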
| |
| # Find the Boolean properties of each character. This next bit of magic creates |
| # a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to |
| # the *same* list, which is not what we want. |
| |
| bprops = [[] for _ in range(MAX_UNICODE)] |
| |
| # Collect the properties from the various files |
| |
| for filename in bool_propsfiles: |
| try: |
| file = open('Unicode.tables/' + filename, 'r') |
| except IOError: |
| print(f"** Couldn't open {'Unicode.tables/' + filename}\n") |
| sys.exit(1) |
| |
| for line in file: |
| line = re.sub(r'#.*', '', line) |
| data = list(map(str.strip, line.split(';'))) |
| if len(data) <= 1: |
| continue |
| |
| try: |
| ix = bool_properties.index(data[1]) |
| except ValueError: |
| continue |
| |
| m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', data[0]) |
| char = int(m.group(1), 16) |
| if m.group(3) is None: |
| last = char |
| else: |
| last = int(m.group(3), 16) |
| |
| for i in range(char, last + 1): |
| bprops[i].append(ix) |
| |
| file.close() |
| |
| # The ASCII property isn't listed in any files, but it is easy enough to add |
| # it manually. |
| |
| ix = bool_properties.index("ASCII") |
| for i in range(128): |
| bprops[i].append(ix) |
| |
| # The Bidi_Mirrored property isn't listed in any property files. We have to |
| # deduce it from the file that lists the mirrored characters. |
| |
| ix = bool_properties.index("Bidi_Mirrored") |
| |
| try: |
| file = open('Unicode.tables/BidiMirroring.txt', 'r') |
| except IOError: |
| print(f"** Couldn't open {'Unicode.tables/BidiMirroring.txt'}\n") |
| sys.exit(1) |
| |
| for line in file: |
| line = re.sub(r'#.*', '', line) |
| data = list(map(str.strip, line.split(';'))) |
| if len(data) <= 1: |
| continue |
| c = int(data[0], 16) |
| bprops[c].append(ix) |
| |
| file.close() |
| |
# Scan each character's Boolean property list and create a list of unique
# lists, at the same time setting each character's offset into that list in
# the bool_props vector.
| |
| bool_props = [0] * MAX_UNICODE |
| bool_props_lists = [[]] |
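
# Note that the "else" clause in the loop below belongs to the "for", not the
# "if": it runs only when the loop finishes without a break, that is, when no
# existing list matches. In that case the new list is appended, and i is
# advanced so that it indexes the new entry.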
| |
| for c in range(MAX_UNICODE): |
| s = set(bprops[c]) |
| for i in range(len(bool_props_lists)): |
| if s == set(bool_props_lists[i]): |
      break
| else: |
| bool_props_lists.append(bprops[c]) |
| i += 1 |
| |
| bool_props[c] = i * bool_props_list_item_size |
| |
| # This block of code was added by PH in September 2012. It scans the other_case |
| # table to find sets of more than two characters that must all match each other |
| # caselessly. Later in this script a table of these sets is written out. |
| # However, we have to do this work here in order to compute the offsets in the |
| # table that are inserted into the main table. |
| |
| # The CaseFolding.txt file lists pairs, but the common logic for reading data |
| # sets only one value, so first we go through the table and set "return" |
| # offsets for those that are not already set. |
| |
| for c in range(MAX_UNICODE): |
| if other_case[c] != 0 and other_case[c + other_case[c]] == 0: |
| other_case[c + other_case[c]] = -other_case[c] |
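
# For example, CaseFolding.txt maps A (U+0041) to a (U+0061), giving
# other_case[0x41] = 32, but it has no entry for U+0061 itself, so the loop
# above sets other_case[0x61] = -32.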
| |
| # Now scan again and create equivalence sets. |
| |
| caseless_sets = [] |
| |
| for c in range(MAX_UNICODE): |
| o = c + other_case[c] |
| |
| # Trigger when this character's other case does not point back here. We |
| # now have three characters that are case-equivalent. |
| |
| if other_case[o] != -other_case[c]: |
| t = o + other_case[o] |
| |
| # Scan the existing sets to see if any of the three characters are already |
| # part of a set. If so, unite the existing set with the new set. |
| |
| appended = 0 |
| for s in caseless_sets: |
| found = 0 |
| for x in s: |
| if x == c or x == o or x == t: |
| found = 1 |
| |
| # Add new characters to an existing set |
| |
| if found: |
| found = 0 |
| for y in [c, o, t]: |
| for x in s: |
| if x == y: |
| found = 1 |
| if not found: |
| s.append(y) |
| appended = 1 |
| |
| # If we have not added to an existing set, create a new one. |
| |
| if not appended: |
| caseless_sets.append([c, o, t]) |
| |
| # End of loop looking for caseless sets. |
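
# As a worked example: both K (U+004B) and KELVIN SIGN (U+212A) fold to k
# (U+006B). The "return offset" loop above set other_case[0x6B] to -0x20,
# pointing back at K, so when c = 0x212A is reached here its other case
# points to U+006B but U+006B does not point back; the trigger fires with
# o = 0x6B and t = 0x4B, creating the set [0x212A, 0x6B, 0x4B].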
| |
| # Now scan the sets and set appropriate offsets for the characters. |
| |
| caseless_offsets = [0] * MAX_UNICODE |
| |
offset = 1
| for s in caseless_sets: |
| for x in s: |
| caseless_offsets[x] = offset |
| offset += len(s) + 1 |
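
# The offsets start at 1 because entry 0 of the written table is a lone
# NOTACHAR, used for characters that are not in any set; each set then
# occupies its length plus one slot for its terminating NOTACHAR.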
| |
| # End of block of code for creating offsets for caseless matching sets. |
| |
| # Scan the caseless sets, and for any non-ASCII character that has an ASCII |
| # character as its "base" other case, remove the other case. This makes it |
| # easier to handle those characters when the PCRE2 option for not mixing ASCII |
| # and non-ASCII is enabled. In principle one should perhaps scan for a |
| # non-ASCII alternative, but in practice these don't exist. |
| |
| for s in caseless_sets: |
| for x in s: |
| if x > 127 and x + other_case[x] < 128: |
| other_case[x] = 0 |
| |
| # Combine all the tables |
| |
| table, records = combine_tables(script, category, break_props, |
| caseless_offsets, other_case, scriptx_bidi_class, bool_props) |
| |
| # Find the record size and create a string definition of the structure for |
| # outputting as a comment. |
| |
| record_size, record_struct = get_record_size_struct(list(records.keys())) |
| |
| # Find the optimum block size for the two-stage table |
| |
| min_size = sys.maxsize |
| for block_size in [2 ** i for i in range(5,10)]: |
| size = len(records) * record_size |
| stage1, stage2 = compress_table(table, block_size) |
| size += get_tables_size(stage1, stage2) |
| #print("/* block size {:3d} => {:5d} bytes */".format(block_size, size)) |
| if size < min_size: |
| min_size = size |
| min_stage1, min_stage2 = stage1, stage2 |
| min_block_size = block_size |
| |
| |
| # --------------------------------------------------------------------------- |
| # MAIN CODE FOR WRITING THE OUTPUT FILE |
| # --------------------------------------------------------------------------- |
| |
| # Open the output file (no return on failure). This call also writes standard |
| # header boilerplate. |
| |
| f = open_output("pcre2_ucd.c") |
| |
| # Output this file's heading text |
| |
| f.write("""\ |
| /* This file contains tables of Unicode properties that are extracted from |
| Unicode data files. See the comments at the start of maint/GenerateUcd.py for |
| details. |
| |
| As well as being part of the PCRE2 library, this file is #included by the |
| pcre2test program, which redefines the PRIV macro to change table names from |
| _pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present, |
| just one of these tables is actually needed. When compiling the library, some |
| headers are needed. */ |
| |
| #ifndef PCRE2_PCRE2TEST |
| #ifdef HAVE_CONFIG_H |
| #include "config.h" |
| #endif |
| #include "pcre2_internal.h" |
| #endif /* PCRE2_PCRE2TEST */ |
| |
| /* The tables herein are needed only when UCP support is built, and in PCRE2 |
| that happens automatically with UTF support. This module should not be |
| referenced otherwise, so it should not matter whether it is compiled or not. |
| However a comment was received about space saving - maybe the guy linked all |
| the modules rather than using a library - so we include a condition to cut out |
| the tables when not needed. But don't leave a totally empty module because some |
| compilers barf at that. Instead, just supply some small dummy tables. */ |
| |
| #ifndef SUPPORT_UNICODE |
| const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0}}; |
| const uint16_t PRIV(ucd_stage1)[] = {0}; |
| const uint16_t PRIV(ucd_stage2)[] = {0}; |
| const uint32_t PRIV(ucd_caseless_sets)[] = {0}; |
| #else |
| \n""") |
| |
| # --- Output some variable heading stuff --- |
| |
| f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size)) |
| f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version)) |
| |
| f.write("""\ |
| /* When recompiling tables with a new Unicode version, please check the types |
| in this structure definition with those in pcre2_internal.h (the actual field |
| names will be different). |
| \n""") |
| |
| f.write(record_struct) |
| |
| f.write(""" |
| /* If the 32-bit library is run in non-32-bit mode, character values greater |
| than 0x10ffff may be encountered. For these we set up a special record. */ |
| |
| #if PCRE2_CODE_UNIT_WIDTH == 32 |
| const ucd_record PRIV(dummy_ucd_record)[] = {{ |
| ucp_Unknown, /* script */ |
| ucp_Cn, /* type unassigned */ |
| ucp_gbOther, /* grapheme break property */ |
| 0, /* case set */ |
| 0, /* other case */ |
| 0 | (ucp_bidiL << UCD_BIDICLASS_SHIFT), /* script extension and bidi class */ |
| 0, /* bool properties offset */ |
| }}; |
| #endif |
| \n""") |
| |
| # --- Output the table of caseless character sets --- |
| |
| f.write("""\ |
| /* This table contains lists of characters that are caseless sets of |
| more than one character. Each list is terminated by NOTACHAR. */ |
| |
| const uint32_t PRIV(ucd_caseless_sets)[] = { |
| NOTACHAR, |
| """) |
| |
| for s in caseless_sets: |
| s = sorted(s) |
| for x in s: |
| f.write(' 0x%04x,' % x) |
| f.write(' NOTACHAR,\n') |
| f.write('};\n\n') |
| |
| # --- Other tables are not needed by pcre2test --- |
| |
| f.write("""\ |
/* When #included in pcre2test, we don't need the table of digit sets, nor the
large main UCD tables. */
| |
| #ifndef PCRE2_PCRE2TEST |
| \n""") |
| |
| # --- Read Scripts.txt again for the sets of 10 digits. --- |
| |
| digitsets = [] |
| file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8') |
| |
| for line in file: |
| m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line) |
| if m is None: |
| continue |
| first = int(m.group(1),16) |
| last = int(m.group(2),16) |
| if ((last - first + 1) % 10) != 0: |
| f.write("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last), |
| file=sys.stderr) |
| while first < last: |
| digitsets.append(first + 9) |
| first += 10 |
| file.close() |
| digitsets.sort() |
| |
| f.write("""\ |
| /* This table lists the code points for the '9' characters in each set of |
| decimal digits. It is used to ensure that all the digits in a script run come |
| from the same set. */ |
| |
| const uint32_t PRIV(ucd_digit_sets)[] = { |
| """) |
| |
| f.write(" %d, /* Number of subsequent values */" % len(digitsets)) |
| count = 8 |
| for d in digitsets: |
| if count == 8: |
| f.write("\n ") |
| count = 0 |
| f.write(" 0x%05x," % d) |
| count += 1 |
| f.write("\n};\n\n") |
| |
| f.write("""\ |
| /* This vector is a list of script bitsets for the Script Extension property. |
| The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as |
| ucd_script_sets_item_size. */ |
| |
| const uint32_t PRIV(ucd_script_sets)[] = { |
| """) |
| write_bitsets(script_lists, script_list_item_size) |
| |
| f.write("""\ |
| /* This vector is a list of bitsets for Boolean properties. The number of |
32-bit words in each bitset is #defined as ucd_boolprop_sets_item_size in
| pcre2_ucp.h. */ |
| |
| const uint32_t PRIV(ucd_boolprop_sets)[] = { |
| """) |
| write_bitsets(bool_props_lists, bool_props_list_item_size) |
| |
| |
| # Output the main UCD tables. |
| |
| f.write("""\ |
| /* These are the main two-stage UCD tables. The fields in each record are: |
| script (8 bits), character type (8 bits), grapheme break property (8 bits), |
| offset to multichar other cases or zero (8 bits), offset to other case or zero |
| (32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed |
| into a 16-bit field, and offset in binary properties table (16 bits). */ |
| \n""") |
| |
| write_records(records, record_size) |
| write_table(min_stage1, 'PRIV(ucd_stage1)') |
| write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size) |
| |
| f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size) |
| f.write("""\ |
| #error Please correct UCD_BLOCK_SIZE in pcre2_internal.h |
| #endif |
| #endif /* SUPPORT_UNICODE */ |
| |
| #endif /* PCRE2_PCRE2TEST */ |
| |
| /* End of pcre2_ucd.c */ |
| """) |
| |
f.close()
| |
| # End |