| from bisect import bisect_left |
| import operator |
| import java.lang.Character |
| |
| # XXX - this is intended as a stopgap measure until 2.5.1, which will have a Java implementation |
| # requires java 6 for `normalize` function |
| # only has one version of the database |
| # does not normalized ideographs |
| |
| _codepoints = {} |
| _eaw = {} |
| _names = {} |
| _segments = [] |
| _eaw_segments = [] |
| Nonesuch = object() |
| |
| def get_int(col): |
| try: |
| return int(col) |
| except ValueError: |
| return None |
| |
| def get_yn(col): |
| if col == 'Y': return 1 |
| else: return 0 |
| |
| def get_numeric(col): |
| try: |
| return float(col) |
| except ValueError: |
| try: |
| a, b = col.split('/') |
| return float(a)/float(b) |
| except: |
| return None |
| |
| def init_unicodedata(data): |
| for row in data: |
| cols = row.split(';') |
| codepoint = int(cols[0], 16) |
| name = cols[1] |
| if name == '<CJK Ideograph, Last>': |
| lookup_name = 'CJK UNIFIED IDEOGRAPH' |
| else: |
| lookup_name = name |
| data = ( |
| cols[2], |
| get_int(cols[3]), |
| cols[4], |
| cols[5], |
| get_int(cols[6]), |
| get_int(cols[7]), |
| get_numeric(cols[8]), |
| get_yn(cols[9]), |
| lookup_name, |
| ) |
| |
| if name.find('First') >= 0: |
| start = codepoint |
| elif name.find('Last') >= 0: |
| _segments.append((start, (start, codepoint), data)) |
| else: |
| _names[name] = unichr(codepoint) |
| _codepoints[codepoint] = data |
| |
| def init_east_asian_width(data): |
| for row in data: |
| if row.startswith('#'): |
| continue |
| row = row.partition('#')[0] |
| cols = row.split(';') |
| if len(cols) < 2: |
| continue |
| cr = cols[0].split('..') |
| width = cols[1].rstrip() |
| if len(cr) == 1: |
| codepoint = int(cr[0], 16) |
| _eaw[codepoint] = width |
| else: |
| start = int(cr[0], 16) |
| end = int(cr[1], 16) |
| _eaw_segments.append((start, (start, end), width)) |
| |
| # xxx - need to normalize the segments, so |
| # <CJK Ideograph, Last> ==> CJK UNIFIED IDEOGRAPH; |
| # may need to do some sort of analysis against CPython for the normalization! |
| |
| def name(unichr, default=None): |
| codepoint = get_codepoint(unichr, "name") |
| v = _codepoints.get(codepoint, None) |
| if v is None: |
| v = check_segments(codepoint, _segments) |
| if v is not None: |
| return "%s-%X" % (v[8], codepoint) |
| |
| if v is None: |
| if default is not Nonesuch: |
| return default |
| raise ValueError() |
| return v[8] |
| |
| # xxx - also need to add logic here so that if it's CJK UNIFIED |
| # IDEOGRAPH-8000, we go against the segment to verify the prefix |
| |
| def lookup(name): |
| return _names[name] |
| |
| def check_segments(codepoint, segments): |
| i = bisect_left(segments, (codepoint,)) |
| if i < len(segments): |
| segment = segments[i - 1] |
| if codepoint <= segment[1][1]: |
| return segment[2] |
| return None |
| |
| |
| def get_codepoint(unichr, fn=None): |
| if not(isinstance(unichr, unicode)): |
| raise TypeError(fn, "() argument 1 must be unicode, not " + type(unichr)) |
| if len(unichr) > 1 or len(unichr) == 0: |
| raise TypeError("need a single Unicode character as parameter") |
| return ord(unichr) |
| |
| def get_eaw(unichr, default, fn): |
| codepoint = get_codepoint(unichr, fn) |
| v = _eaw.get(codepoint, None) |
| if v is None: |
| v = check_segments(codepoint, _eaw_segments) |
| |
| if v is None: |
| if default is not Nonesuch: |
| return default |
| raise ValueError() |
| return v |
| |
| def get(unichr, default, fn, getter): |
| codepoint = get_codepoint(unichr, fn) |
| data = _codepoints.get(codepoint, None) |
| if data is None: |
| data = check_segments(codepoint, _segments) |
| if data is None: |
| if default is not Nonesuch: |
| return default |
| raise ValueError() |
| v = getter(data) |
| if v is None: |
| if default is not Nonesuch: |
| return default |
| raise ValueError() |
| else: |
| return v |
| |
| category_getter = operator.itemgetter(0) |
| combining_getter = operator.itemgetter(1) |
| bidirectional_getter = operator.itemgetter(2) |
| decomposition_getter = operator.itemgetter(3) |
| decimal_getter = operator.itemgetter(4) |
| digit_getter = operator.itemgetter(5) |
| numeric_getter = operator.itemgetter(6) |
| mirrored_getter = operator.itemgetter(7) |
| |
| def decimal(unichr, default=Nonesuch): |
| return get(unichr, default, 'decimal', decimal_getter) |
| |
| def decomposition(unichr, default=''): |
| return get(unichr, default, 'decomposition', decomposition_getter) |
| |
| def digit(unichr, default=Nonesuch): |
| return get(unichr, default, 'digit', digit_getter) |
| |
| def numeric(unichr, default=Nonesuch): |
| return get(unichr, default, 'numeric', numeric_getter) |
| |
| def category(unichr): |
| return get(unichr, 'Cn', 'catgegory', category_getter) |
| |
| def bidirectional(unichr): |
| return get(unichr, '', 'bidirectional', bidirectional_getter) |
| |
| def combining(unichr): |
| return get(unichr, 0, 'combining', combining_getter) |
| |
| def mirrored(unichr): |
| return get(unichr, 0, 'mirrored', mirrored_getter) |
| |
| def east_asian_width(unichr): |
| return get_eaw(unichr, 'N', 'east_asian_width') |
| |
| def jymirrored(unichr): |
| return java.lang.Character.isMirrored(get_codepoint(unichr, 'mirrored')) |
| |
| try: |
| from java.text import Normalizer |
| |
| _forms = { |
| 'NFC': Normalizer.Form.NFC, |
| 'NFKC': Normalizer.Form.NFKC, |
| 'NFD': Normalizer.Form.NFD, |
| 'NFKD': Normalizer.Form.NFKD |
| } |
| |
| def normalize(form, unistr): |
| """ |
| Return the normal form 'form' for the Unicode string unistr. Valid |
| values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. |
| """ |
| |
| try: |
| normalizer_form = _forms[form] |
| except KeyError: |
| raise ValueError('invalid normalization form') |
| return Normalizer.normalize(unistr, normalizer_form) |
| |
| except ImportError: |
| pass |
| |
| |
| def init(): |
| import pkgutil |
| import os.path |
| import StringIO |
| import sys |
| |
| my_path = os.path.dirname(__file__) |
| loader = pkgutil.get_loader('unicodedata') |
| init_unicodedata(StringIO.StringIO(loader.get_data(os.path.join(my_path, 'UnicodeData.txt')))) |
| init_east_asian_width(StringIO.StringIO(loader.get_data(os.path.join(my_path, 'EastAsianWidth.txt')))) |
| |
| init() |