#!/usr/bin/env python # Copyright 2015 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. """Generate a compact character composition table. Normal use: ./generate_character_composer_data.py \ --output character_composer_data.h \ character_composer_sequences.txt Run this script with --help for a description of arguments. Input file format: Comments begin with '#' and extend to the end of the line. Each non-comment line is a sequence of two or more keys, separated by space, the last of which is the result of composing the initial part. A sequence must begin with a dead key or Compose, and the result must be a character key. Each key can either be a character key, a dead key, or the compose key. A character key is written as one of the following inside matched delimiters: - a single printable ASCII character; - a Unicode character name; - 'U+' followed by one or more hexadecimal digits. Delimiter pairs are any of '' "" () <> [] or {}. A dead key is written as the word 'Dead' followed (immediately, without space) by the combining character written in the same form as a character key. A compose key is written as the word 'Compose'. Output file format: The output file is a C++ header containing a small header structure |ui::TreeComposeChecker::CompositionData| and a tree of composition sequences. For space efficiency, the tree is stored in a single array of 16-bit values, which represent either characters (printable or dead-key) or subtree array indices. Each tree level consists for four tables: two key kinds (character or dead) by two node types (internal or leaf), in the order: - character internal - character leaf - dead-key internal - dead-key leaf This order is used because character key entries are more common than dead-key entries, and internal entries are more common than leaf entries. Each table consists of a word containing the number of table entries |n|, followed immediately by |n| key-value pairs of words, ordered by key. For internal edges, the value is the array index of the resulting subtree. For leaf edges, the value is the unicode character code of the composition result. """ import argparse import codecs import collections import sys import unicodedata # Global error counter. g_fail = 0 class Key(str): """Represents an element of a composition sequence. Supports only Compose, dead keys, and BMP unicode characters. Based on |str| for easy comparison and sorting. The representation is 'C' (for unicode characters) or 'D' (for dead keys) followed by 4 hex digits for the character value. The Compose key is represented as dead key with combining character 0. """ _kinds = ['character', 'dead_key'] def __new__(cls, key, character, location=None): """Construct a Key. Call as: - Key(None, character_code) - Key('Dead', combining_character_code) - Key('Compose', 0) """ global g_fail if character > 0xFFFF: print '{}: unsupported non-BMP character {}'.format(location, character) g_fail += 1 s = 'ERROR' elif key is None: s = 'C{:04X}'.format(character) elif key.lower() == 'dead': s = 'D{:04X}'.format(character) elif key.lower() == 'compose': s = 'D0000' else: print '{}: unexpected combination {}<{}>'.format(location, key, character) g_fail += 1 s = 'ERROR' return str.__new__(cls, s) def Kind(self): return {'C': 'character', 'D': 'dead_key'}[self[0]] def CharacterCode(self): return int(self[1:], 16) def UnicodeName(self): v = self.CharacterCode() try: return unicodedata.name(unichr(v)).lower() except ValueError: return 'U+{:04X}'.format(v) def ShorterUnicodeName(self): s = self.UnicodeName() if s.startswith('latin ') or s.startswith('greek '): s = s[6:] if s.startswith('small '): s = s[6:] return s.replace(' letter ', ' ') def Pretty(self): if self == 'D0000': return 'Compose' return ('Dead' if self[0] == 'D' else '') + '<' + self.UnicodeName() + '>' class Input: """ Takes a sequences of file names and presents them as a single input stream, with location reporting for error messages. """ def __init__(self, filenames): self._pending = filenames self._filename = None self._file = None self._line = None self._lineno = 0 self._column = 0 def Where(self): """Report the current input location, for error messages.""" if self._file: return '{}:{}:{}'.format(self._filename, self._lineno, self._column) if self._pending: return '<before>' return '<eof>' def Get(self): """Return the next input character, or None when inputs are exhausted.""" if self._line is None: if self._file is None: if not self._pending: return None self._filename = self._pending[0] self._pending = self._pending[1:] self._file = codecs.open(self._filename, mode='rb', encoding='utf-8') self._lineno = 0 self._lineno += 1 self._line = self._file.readline() if not self._line: self._file = None self._filename = None return self.Get() self._column = 0 if self._column >= len(self._line): self._line = None return self.Get() c = self._line[self._column] self._column += 1 return c class Lexer: """ Breaks the input stream into a sequence of tokens, each of which is either a Key or the string 'EOL'. """ def __init__(self, compose_input): self._input = compose_input _delimiters = { '"': '"', "'": "'", '<': '>', '(': ')', '[': ']', '{': '}', } def GetUntilDelimiter(self, e): text = '' c = self._input.Get() while c and c != e: text += c c = self._input.Get() return text def Get(self): global g_fail c = ' ' while c and c.isspace() and c != '\n': c = self._input.Get() if not c: return None location = self._input.Where() if c == '\n': return 'EOL' if c == '#': self.GetUntilDelimiter('\n') return 'EOL' if c == '\\': self.GetUntilDelimiter('\n') return self.Get() key = None character = None if c.isalnum(): key = '' while c and c.isalnum(): key += c c = self._input.Get() if c in Lexer._delimiters: s = self.GetUntilDelimiter(Lexer._delimiters[c]) if len(s) == 1: character = ord(s) elif s.startswith('U+'): character = int(s[2:], 16) else: try: character = ord(unicodedata.lookup(s.upper())) except KeyError: g_fail += 1 character = None print '{}: unknown character name "{}"'.format(location, s.encode('utf-8')) return Key(key, character, location) def Where(self): return self._input.Where() class Parser: """ Takes a sequence of tokens from a Lexer and returns a tree of composition sequences, represented as nested dictionaries where each composition source element key is a dictionary key, and the final composition result has a dictionary key of |None|. """ def __init__(self, lexer): self._lexer = lexer self._trie = {} def Parse(self): global g_fail self._trie = {} while True: seq = [] t = self._lexer.Get() if not t: break if t and t != 'EOL' and t.Kind() != 'dead_key': g_fail += 1 print ('{}: sequence does not begin with Compose or Dead key' .format(self._lexer.Where())) break while t and t != 'EOL': seq.append(t) t = self._lexer.Get() if not seq: continue self.AddToSimpleTree(self._trie, seq) return self._trie def AddToSimpleTree(self, tree, seq): first = seq[0] rest = seq[1:] if first not in tree: tree[first] = {} if len(rest) == 1: # Leaf tree[first][None] = rest[0] else: self.AddToSimpleTree(tree[first], rest) class GroupedTree: """ Represents composition sequences in a manner close to the output format. The core of the representation is the |_tables| dictionary, which has an entry for each kind of |Key|, each of which is a dictionary with two entries, 'internal' and 'leaf', for the output tables, each being a dictionary indexed by a composition sequence |Key|. For 'internal' tables the dictionary values are |GroupedTree|s at the next level; for 'leaf' tables the dictionary values are |Key| composition results. """ _key_kinds = Key._kinds _sub_parts = ['internal', 'leaf'] def __init__(self, simple_trie, path=None): if path is None: path = [] self.path = path self.depth = len(path) self.height = 0 self.empty = True self.location = -1 # Initialize |_tables|. self._tables = {} for k in self._key_kinds: self._tables[k] = {} for p in self._sub_parts: self._tables[k][p] = {} # Build the tables. for key in simple_trie: if key is not None: # Leaf table entry. if None in simple_trie[key]: self.empty = False self._tables[key.Kind()]['leaf'][key] = simple_trie[key][None] # Internal subtree entries. v = GroupedTree(simple_trie[key], path + [key]) if not v.empty: self.empty = False self._tables[key.Kind()]['internal'][key] = v if self.height < v.height: self.height = v.height self.height += 1 def SubTrees(self): """Returns a list of all sub-|GroupedTree|s of the current GroupTree.""" r = [] for k in self._key_kinds: for key in sorted(self._tables[k]['internal']): r.append(self._tables[k]['internal'][key]) return r class Assembler: """Convert a parse tree via a GroupedTree to a C++ header.""" def __init__(self, args, dtree): self._name = args.data_name self._type = args.type_name self._gtree = GroupedTree(dtree) def Write(self, out): # First pass: determine table sizes and locations. self.Pass(None, self._gtree) # Second pass: write the array. out.write('\nstatic const uint16_t {}Tree[] = {{\n'.format(self._name)) end = self.Pass(out, self._gtree) out.write('};\n\n') # Write the description structure. out.write('static const {} {} = {{\n' .format(self._type, self._name)) out.write(' {}, // maximum sequence length\n'.format(self._gtree.height)) out.write(' {}, // tree array entries\n'.format(end)) out.write(' {}Tree\n'.format(self._name)) out.write('};\n\n') def Pass(self, out, gtree, location=0): gtree.location = location # Write tree header. if out: out.write('\n // offset 0x{:04X}:\n'.format(location)) if gtree.path: out.write(' // prefix:\n') for key in gtree.path: out.write(' // {}\n'.format(key.Pretty())) # Write tables. for k in gtree._key_kinds: for p in gtree._sub_parts: # Write table size. location += 1 if out: out.write(' // {} {} table\n'.format(p, k)) out.write(' 0x{:04X}, // number of entries\n' .format(len(gtree._tables[k][p]))) # Write table body. for key in sorted(gtree._tables[k][p]): location += 2 if out: out.write(' 0x{:04X}, // {}\n' .format(key.CharacterCode(), key.ShorterUnicodeName())) result = gtree._tables[k][p][key] if p == 'internal': out.write(' 0x{:04X},\n'.format(result.location)) else: out.write(' 0x{:04X}, // -> {}\n' .format(result.CharacterCode(), result.ShorterUnicodeName())) # Assemble subtrees of the current tree. for t in gtree.SubTrees(): location = self.Pass(out, t, location) return location def main(argv): parser = argparse.ArgumentParser() parser.add_argument('--type_name', default='ui::TreeComposeChecker::CompositionData') parser.add_argument('--data_name', default='kCompositions') parser.add_argument('--output', default='character_composer_data.h') parser.add_argument('--guard', default=None) parser.add_argument('inputs', nargs='*') args = parser.parse_args(argv[1:]) parse_tree = Parser(Lexer(Input(args.inputs))).Parse() with (sys.stdout if args.output == '-' else open(args.output, 'wb')) as out: out.write('// Copyright 2015 The Chromium Authors. All rights reserved.\n') out.write('// Use of this source code is governed by a BSD-style license') out.write(' that can be\n// found in the LICENSE file.\n\n') out.write('// DO NOT EDIT.\n') out.write('// GENERATED BY {}\n'.format(sys.argv[0])) out.write('// FROM {}\n\n'.format(' '.join(args.inputs))) guard = args.guard if args.guard else args.output guard = ''.join([c.upper() if c.isalpha() else '_' for c in guard]) out.write('#ifndef {0}_\n#define {0}_\n'.format(guard)) Assembler(args, parse_tree).Write(out) out.write('#endif // {}_\n'.format(guard)) return g_fail if __name__ == '__main__': sys.exit(main(sys.argv))