#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# created on: 2013jun05
# created by: Markus W. Scherer

"""Converts CLDR collation files from XML syntax to ICU syntax.

Handles the CLDR collation data in the post-CLDR 23 trunk in 2013 June.
Preserves indentation (except where it joins lines) and text vs. NCR etc.
Does not handle arbitrary LDML XML collation syntax."""

# Invoke with two arguments:
# - the source folder path
# - the destination folder path
# For example:
#   ~/svn.cldr$ collicu/tools/scripts/coll2icu.py trunk/common/collation collicu/common/collation
| |
| import codecs |
| import glob |
| import os.path |
| import sys |
| |
def GetIndent(s):
    """Return the leading run of spaces and tabs of s.

    If s is empty or consists only of spaces and tabs, s itself is returned.
    Other white space (such as a newline) ends the indentation.
    """
    # Whatever lstrip() removes from the front is exactly the indentation.
    indent_length = len(s) - len(s.lstrip(" \t"))
    return s[:indent_length]
| |
| |
# Substring replacements, applied to each rule line in the order listed.
# The patterns are matched against the raw XML text of a line (plain
# str.replace, no XML parsing), so characters that XML escapes must be spelled
# as entity references here: &quot; &amp; &apos; &lt; &gt;
replacements = (
    # White space and syntax characters must be quoted.
    # Using '\\u0020' rather than just ' ' for clarity.
    ("<reset> </reset>", "&'\\u0020'"),  # can't just replace all "> <"
    (">!<", ">'!'<"),
    ('>"<', ">'\\\"'<"),
    (">&quot;<", ">'\\\"'<"),
    (">#<", ">'\\u0023'<"),
    (">$<", ">'$'<"),
    (">%<", ">'%'<"),
    (">&<", ">'&'<"),
    (">&amp;<", ">'&'<"),
    (">'<", ">''<"),
    (">&apos;<", ">''<"),
    (">(<", ">'('<"),
    (">)<", ">')'<"),
    (">*<", ">'*'<"),
    (">+<", ">'+'<"),
    (">,<", ">','<"),
    (">-<", ">'-'<"),
    (">.<", ">'.'<"),
    (">/<", ">'/'<"),
    (">:<", ">':'<"),
    (">;<", ">';'<"),
    # A literal "<" cannot occur in XML character data, only its entity.
    (">&lt;<", ">'<'<"),
    (">=<", ">'='<"),
    # ">" may appear literally or escaped.
    (">><", ">'>'<"),
    (">&gt;<", ">'>'<"),
    (">?<", ">'?'<"),
    (">@<", ">'@'<"),
    (">[<", ">'['<"),
    (">\\<", ">'\\\\'<"),
    (">]<", ">']'<"),
    (">^<", ">'^'<"),
    (">_<", ">'_'<"),
    (">`<", ">'`'<"),
    (">{<", ">'{'<"),
    (">|<", ">'|'<"),
    (">}<", ">'}'<"),
    (">~<", ">'~'<"),
    # ha.xml has the following
    ("'y", "''y"),
    ("'Y", "''Y"),
    # kl.xml has the following
    ("K'", "K''"),
    # not Pattern_White_Space, just obscure
    (u"\u00A0", u"\\u00A0"),
    (u"\u200C", u"\\u200C"),
    (u"\u200D", u"\\u200D"),
    (u"\u3000", u"\\u3000"),
    # obscure, and some tools do not handle noncharacters well
    (u"\uFDD0", u"'\\uFDD0'"),
    # The old ICU collation rule parser seems to need more escaping than it should.
    (u"≠", u"'≠'"),
    # fi.xml resets contain a space
    (u" ̵</reset>", u"'\\u0020'̵"),
    # fa.xml <sc> with non-NFD_Inert chars
    (u"<sc>\u0650\u064f\u064b\u064d\u064c</sc>", u"<<\u0650<<\u064f<<\u064b<<\u064d<<\u064c"),
    # ml.xml strings contain spaces
    (u" </s>", u"'\\u0020'"),
    (u" </reset>", u"'\\u0020'"),
    # vi.xml <sc> with non-NFD_Inert chars
    (u"<sc>\u0309\u0303\u0301\u0323</sc>", u"<<\u0309<<\u0303<<\u0301<<\u0323"),
    # en_US_POSIX needs a lot of quoting.
    # "&" and "<" are always escaped in XML text; the quote characters and ">"
    # may or may not be, so every well-formed spelling is listed.
    ("<pc> !&quot;#$%&amp;'()*+,-./</pc>", "<*'\\u0020'-'/'"),
    ("<pc> !&quot;#$%&amp;&apos;()*+,-./</pc>", "<*'\\u0020'-'/'"),
    ('<pc> !"#$%&amp;\'()*+,-./</pc>', "<*'\\u0020'-'/'"),
    ('<pc> !"#$%&amp;&apos;()*+,-./</pc>', "<*'\\u0020'-'/'"),
    ("<pc>0123456789:;&lt;=&gt;?@</pc>", "<*0-'@'"),
    ("<pc>0123456789:;&lt;=>?@</pc>", "<*0-'@'"),
    ("<pc>[\\]^_`</pc>", "<*'['-'`'"),
    # The backslash is doubled so that the ICU escape \u007F is emitted as
    # text on every Python version, rather than a raw DEL character.
    ("<pc>{|}~</pc>", "<*'{'-'\\u007F'"),
    # CJK parenthesized resets
    ("<reset>(", "&'('"),
    (")</reset>", "')'"),
    # Convert XML elements into ICU syntax.
    ("><!--", "> #"),  # add a space before an inline comment
    ("<!--", "#"),
    (" -->", ""),
    ("-->", ""),
    ("<reset>", "&"),
    ('<reset before="primary">', "&[before 1]"),
    ('<reset before="secondary">', "&[before 2]"),
    ('<reset before="tertiary">', "&[before 3]"),
    ("</reset>", ""),
    ("<p>", "<"),
    ("</p>", ""),
    ("<s>", "<<"),
    ("</s>", ""),
    ("<t>", "<<<"),
    ("</t>", ""),
    ("<i>", "="),
    ("</i>", ""),
    ("<pc>", "<*"),
    ("</pc>", ""),
    ("<sc>", "<<*"),
    ("</sc>", ""),
    ("<tc>", "<<<*"),
    ("</tc>", ""),
    ("<ic>", "=*"),
    ("</ic>", ""),
    ("<x>", ""),
    ("</x>", ""),
    ("<extend>", "/"),
    ("</extend>", ""),
    ("</context>", "|"),
    ("<first_tertiary_ignorable/>", "[first tertiary ignorable]"),
    ("<last_tertiary_ignorable/>", "[last tertiary ignorable]"),
    ("<first_secondary_ignorable/>", "[first secondary ignorable]"),
    ("<last_secondary_ignorable/>", "[last secondary ignorable]"),
    ("<first_primary_ignorable/>", "[first primary ignorable]"),
    ("<last_primary_ignorable/>", "[last primary ignorable]"),
    ("<first_variable/>", "[first variable]"),
    ("<last_variable/>", "[last variable]"),
    ("<first_non_ignorable/>", "[first regular]"),
    ("<last_non_ignorable/>", "[last regular]"),
    ("<last_non_ignorable />", "[last regular]"),
    ("<first_trailing/>", "[first trailing]"),
    ("<last_trailing/>", "[last trailing]"),
)
| |
| |
# Import-only <rules> lines, mapped to the <import> element that replaces them.
_IMPORT_ONLY_RULES = {
    '<rules><import source="sr"/></rules>': '<import source="sr"/>',
    '<rules><import source="hr" type="search"/></rules>':
        '<import source="hr" type="search"/>',
    '<rules><import source="hr"/></rules>': '<import source="hr"/>',
    '<rules><import source="ps"/></rules>': '<import source="ps"/>',
}


def _ConvertNumericCharRefs(line):
    """Replace each hexadecimal NCR (&#xh...;) in line with \\uhhhh.

    Raises ValueError if an NCR has no terminating semicolon, and
    AssertionError for code points with more than four hex digits.
    """
    while True:
        start = line.find("&#x")
        if start < 0:
            return line
        limit = line.find(";", start + 3)
        if limit < 0:
            # With limit == -1 the slices below would re-append the whole
            # line, "&#x" included, and the loop would never terminate.
            raise ValueError(
                "unterminated numeric character reference in %r" % line)
        cp = line[start + 3:limit].rjust(4, "0")
        assert len(cp) == 4  # not handling supplementary code points
        line = line[:start] + "\\u" + cp + line[limit + 1:]


def _ConvertRuleSyntax(line):
    """Convert one line of XML rule syntax into ICU rule syntax."""
    if "<context>" in line:
        # Swap context & relation:
        #   <x><context>カ</context><i>ー</i></x>
        # turns into
        #   =カ|ー
        if "<i>" in line:
            line = line.replace("<i>", "").replace("<context>", "<i>")
        elif "<t>" in line:
            line = line.replace("<t>", "").replace("<context>", "<t>")

    for (xml, icu) in replacements:
        line = line.replace(xml, icu)

    return _ConvertNumericCharRefs(line)


def _ContinueComment(line, comment_indent):
    """Turn a continuation line of a multi-line comment into a # line."""
    if line.startswith(comment_indent):
        rest = line[len(comment_indent):]
        # Slicing (rather than indexing) tolerates a line that is exactly
        # the indentation, without even a newline.
        if rest[:1] in (" ", "\t"):
            # Preserve further indentation.
            return comment_indent + "#" + rest
        # Add a space after the #.
        return comment_indent + "# " + rest
    # Indent at least as much as the first line.
    rest = line.lstrip()
    if rest:
        return comment_indent + "# " + rest
    return comment_indent + "#\n"


def ConvertFile(src, dest):
    """Copy src to dest, converting XML <rules> sections to ICU syntax.

    src is iterated line by line; dest only needs a write() method.
    Lines outside of <rules>...</rules> are copied unchanged. A <rules>
    section is written as <cr><![CDATA[ ... ]]></cr> with its contents
    converted to ICU rule syntax; consecutive rule lines are joined until a
    reset, a comment, or more than 80 collected characters ends the run.
    Four known import-only <rules> lines are replaced by their <import>
    element.

    Raises AssertionError for input shapes that this converter does not
    handle, and ValueError for a numeric character reference without a
    terminating semicolon.
    """
    in_rules = False
    partial = ""
    in_ml_comment = False
    comment_indent = ""
    for line in src:
        if "<rules>" in line:
            indent = GetIndent(line)
            stripped = line.strip()
            import_element = _IMPORT_ONLY_RULES.get(stripped)
            if import_element is not None:
                # Replace import-only rules with import elements.
                dest.write(indent + import_element + "\n")
            else:
                # Replace the XML <rules> section with ICU syntax rules
                # in <cr>.
                assert stripped == "<rules>"
                dest.write(indent + "<cr><![CDATA[\n")
                in_rules = True
        elif "</rules>" in line:
            # Flush, and go back to just copying lines until the next <rules>.
            if partial:
                dest.write(partial + "\n")
                partial = ""
            in_ml_comment = False
            dest.write(GetIndent(line) + "]]></cr>\n")
            in_rules = False
        elif not in_rules:
            # Outside of rules nothing is ever collected in partial,
            # so the line is simply copied.
            dest.write(line)
        else:
            # Find out whether we want to concatenate the current line
            # with the previous and/or next one. These decisions look at
            # the XML form of the line, before it is converted.
            # Finish collected, partial input?
            finish_partial = (
                "<reset" in line or
                line.lstrip().startswith("<!--") or
                len(partial.strip()) > 80)
            # Start of a multi-line comment?
            start_ml_comment = "<!--" in line and "-->" not in line
            # End of a comment, must terminate the line?
            stop_comment = "-->" in line
            if stop_comment:
                assert line.rstrip().endswith("-->")

            # Convert XML syntax to ICU syntax.
            line = _ConvertRuleSyntax(line)

            # Start/continue/finish concatenation, and output.
            if partial and finish_partial:
                # Write collected input.
                dest.write(partial + "\n")
                partial = ""

            if start_ml_comment:
                # Start a multi-line comment.
                assert not partial
                comment_indent = GetIndent(line)  # can be the empty string
                in_ml_comment = True
            elif in_ml_comment:
                # Continue a multi-line comment.
                assert not partial
                line = _ContinueComment(line, comment_indent)
            elif stop_comment:
                # Just output the line, do not start collecting input.
                # ICU-syntax comments end with the end of the line,
                # do not append rules to them.
                if partial:
                    line = partial + line.lstrip() + "\n"
                    partial = ""
            elif not partial:
                # Start collecting input.
                partial = line.rstrip()
            else:
                # Continue collecting input.
                partial += line.strip()

            if stop_comment:
                in_ml_comment = False
            if not partial:
                dest.write(line)
| |
| |
def main():
    """Convert every *.xml file of the source folder into the destination.

    The source and destination folder paths are the first two command-line
    arguments; further arguments are ignored. Each source file is converted
    with ConvertFile() and written under the same base name in the
    destination folder, reading and writing UTF-8.

    Exits with status 2 and a usage message if an argument is missing.
    """
    if len(sys.argv) < 3:
        # Without this check the unpacking below fails with an opaque
        # tuple-unpacking ValueError.
        sys.stderr.write(
            "Usage: %s <source folder> <destination folder>\n" % sys.argv[0])
        sys.exit(2)
    (src_root, dest_root) = sys.argv[1:3]
    src_pattern = os.path.join(src_root, "*.xml")
    for src_path in glob.iglob(src_pattern):
        basename = os.path.basename(src_path)
        dest_path = os.path.join(dest_root, basename)
        with codecs.open(src_path, "r", "UTF-8") as src:
            with codecs.open(dest_path, "w", "UTF-8") as dest:
                ConvertFile(src, dest)
| |
| |
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()