diff options
author | Allan Sandfeld Jensen <allan.jensen@theqtcompany.com> | 2016-01-25 11:39:07 +0100 |
---|---|---|
committer | Oswald Buddenhagen <oswald.buddenhagen@theqtcompany.com> | 2016-01-25 15:20:42 +0000 |
commit | 6c91641271e536ffaa88a1dff5127e42ee99a91e (patch) | |
tree | 703d9dd49602377ddc90cbf886aad37913f2496b /chromium/net/tools/tld_cleanup | |
parent | b145b7fafd36f0c260d6a768c81fc14e32578099 (diff) | |
download | qtwebengine-chromium-6c91641271e536ffaa88a1dff5127e42ee99a91e.tar.gz |
BASELINE: Update Chromium to 49.0.2623.23
Also adds missing printing sources.
Change-Id: I3726b8f0c7d6751c9fc846096c571fadca7108cd
Reviewed-by: Oswald Buddenhagen <oswald.buddenhagen@theqtcompany.com>
Diffstat (limited to 'chromium/net/tools/tld_cleanup')
-rw-r--r-- | chromium/net/tools/tld_cleanup/PRESUBMIT.py | 32 | ||||
-rwxr-xr-x | chromium/net/tools/tld_cleanup/make_dafsa.py | 469 | ||||
-rwxr-xr-x | chromium/net/tools/tld_cleanup/make_dafsa_unittest.py | 757 |
3 files changed, 0 insertions, 1258 deletions
diff --git a/chromium/net/tools/tld_cleanup/PRESUBMIT.py b/chromium/net/tools/tld_cleanup/PRESUBMIT.py deleted file mode 100644 index 8391e0e61a1..00000000000 --- a/chromium/net/tools/tld_cleanup/PRESUBMIT.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2014 The Chromium Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - - -"""Chromium presubmit script for src/net/tools/tld_cleanup.""" - - -def _RunMakeDafsaTests(input_api, output_api): - """Runs unittest for make_dafsa if any related file has been modified.""" - files = ('net/tools/tld_cleanup/make_dafsa.py', - 'net/tools/tld_cleanup/make_dafsa_unittest.py') - if not any(f in input_api.LocalPaths() for f in files): - return [] - test_path = input_api.os_path.join(input_api.PresubmitLocalPath(), - 'make_dafsa_unittest.py') - cmd_name = 'make_dafsa_unittest' - cmd = [input_api.python_executable, test_path] - test_cmd = input_api.Command( - name=cmd_name, - cmd=cmd, - kwargs={}, - message=output_api.PresubmitPromptWarning) - return input_api.RunTests([test_cmd]) - - -def CheckChangeOnUpload(input_api, output_api): - return _RunMakeDafsaTests(input_api, output_api) - - -def CheckChangeOnCommit(input_api, output_api): - return _RunMakeDafsaTests(input_api, output_api) diff --git a/chromium/net/tools/tld_cleanup/make_dafsa.py b/chromium/net/tools/tld_cleanup/make_dafsa.py deleted file mode 100755 index 78358effa84..00000000000 --- a/chromium/net/tools/tld_cleanup/make_dafsa.py +++ /dev/null @@ -1,469 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 The Chromium Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -""" -A Deterministic acyclic finite state automaton (DAFSA) is a compact -representation of an unordered word list (dictionary). 
- -http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton - -This python program converts a list of strings to a byte array in C++. -This python program fetches strings and return values from a gperf file -and generates a C++ file with a byte array representing graph that can be -used as a memory efficient replacement for the perfect hash table. - -The input strings are assumed to consist of printable 7-bit ASCII characters -and the return values are assumed to be one digit integers. - -In this program a DAFSA is a diamond shaped graph starting at a common -source node and ending at a common sink node. All internal nodes contain -a label and each word is represented by the labels in one path from -the source node to the sink node. - -The following python represention is used for nodes: - - Source node: [ children ] - Internal node: (label, [ children ]) - Sink node: None - -The graph is first compressed by prefixes like a trie. In the next step -suffixes are compressed so that the graph gets diamond shaped. Finally -one to one linked nodes are replaced by nodes with the labels joined. - -The order of the operations is crucial since lookups will be performed -starting from the source with no backtracking. Thus a node must have at -most one child with a label starting by the same character. The output -is also arranged so that all jumps are to increasing addresses, thus forward -in memory. - -The generated output has suffix free decoding so that the sign of leading -bits in a link (a reference to a child node) indicate if it has a size of one, -two or three bytes and if it is the last outgoing link from the actual node. -A node label is terminated by a byte with the leading bit set. 
- -The generated byte array can described by the following BNF: - -<byte> ::= < 8-bit value in range [0x00-0xFF] > - -<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] > -<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] > -<return value> ::= < value + 0x80, byte in range [0x80-0x8F] > - -<offset1> ::= < byte in range [0x00-0x3F] > -<offset2> ::= < byte in range [0x40-0x5F] > -<offset3> ::= < byte in range [0x60-0x7F] > - -<end_offset1> ::= < byte in range [0x80-0xBF] > -<end_offset2> ::= < byte in range [0xC0-0xDF] > -<end_offset3> ::= < byte in range [0xE0-0xFF] > - -<prefix> ::= <char> - -<label> ::= <end_char> - | <char> <label> - -<end_label> ::= <return_value> - | <char> <end_label> - -<offset> ::= <offset1> - | <offset2> <byte> - | <offset3> <byte> <byte> - -<end_offset> ::= <end_offset1> - | <end_offset2> <byte> - | <end_offset3> <byte> <byte> - -<offsets> ::= <end_offset> - | <offset> <offsets> - -<source> ::= <offsets> - -<node> ::= <label> <offsets> - | <prefix> <node> - | <end_label> - -<dafsa> ::= <source> - | <dafsa> <node> - -Decoding: - -<char> -> printable 7-bit ASCII character -<end_char> & 0x7F -> printable 7-bit ASCII character -<return value> & 0x0F -> integer -<offset1 & 0x3F> -> integer -((<offset2> & 0x1F>) << 8) + <byte> -> integer -((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer - -end_offset1, end_offset2 and and_offset3 are decoded same as offset1, -offset2 and offset3 respectively. - -The first offset in a list of offsets is the distance in bytes between the -offset itself and the first child node. Subsequent offsets are the distance -between previous child node and next child node. Thus each offset links a node -to a child node. The distance is always counted between start addresses, i.e. -first byte in decoded offset or first byte in child node. 
- -Example 1: - -%% -aa, 1 -a, 2 -%% - -The input is first parsed to a list of words: -["aa1", "a2"] - -A fully expanded graph is created from the words: -source = [node1, node4] -node1 = ("a", [node2]) -node2 = ("a", [node3]) -node3 = ("\x01", [sink]) -node4 = ("a", [node5]) -node5 = ("\x02", [sink]) -sink = None - -Compression results in the following graph: -source = [node1] -node1 = ("a", [node2, node3]) -node2 = ("\x02", [sink]) -node3 = ("a\x01", [sink]) -sink = None - -A C++ representation of the compressed graph is generated: - -const unsigned char dafsa[7] = { - 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81, -}; - -The bytes in the generated array has the following meaning: - - 0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1 - - 1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a" - 2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4 - - 3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5 - 4: 0x82 <return_value> 0x82 & 0x0F -> return 2 - - 5: 0x61 <char> label character 0x61 -> match "a" - 6: 0x81 <return_value> 0x81 & 0x0F -> return 1 - -Example 2: - -%% -aa, 1 -bbb, 2 -baa, 1 -%% - -The input is first parsed to a list of words: -["aa1", "bbb2", "baa1"] - -Compression results in the following graph: -source = [node1, node2] -node1 = ("b", [node2, node3]) -node2 = ("aa\x01", [sink]) -node3 = ("bb\x02", [sink]) -sink = None - -A C++ representation of the compressed graph is generated: - -const unsigned char dafsa[11] = { - 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82, -}; - -The bytes in the generated array has the following meaning: - - 0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2 - 1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5 - - 2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b" - 3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5 - 4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 
8 - - 5: 0x61 <char> label character 0x61 -> match "a" - 6: 0x61 <char> label character 0x61 -> match "a" - 7: 0x81 <return_value> 0x81 & 0x0F -> return 1 - - 8: 0x62 <char> label character 0x62 -> match "b" - 9: 0x62 <char> label character 0x62 -> match "b" -10: 0x82 <return_value> 0x82 & 0x0F -> return 2 -""" - -import sys - -class InputError(Exception): - """Exception raised for errors in the input file.""" - - -def to_dafsa(words): - """Generates a DAFSA from a word list and returns the source node. - - Each word is split into characters so that each character is represented by - a unique node. It is assumed the word list is not empty. - """ - if not words: - raise InputError('The domain list must not be empty') - def ToNodes(word): - """Split words into characters""" - if not 0x1F < ord(word[0]) < 0x80: - raise InputError('Domain names must be printable 7-bit ASCII') - if len(word) == 1: - return chr(ord(word[0]) & 0x0F), [None] - return word[0], [ToNodes(word[1:])] - return [ToNodes(word) for word in words] - - -def to_words(node): - """Generates a word list from all paths starting from an internal node.""" - if not node: - return [''] - return [(node[0] + word) for child in node[1] for word in to_words(child)] - - -def reverse(dafsa): - """Generates a new DAFSA that is reversed, so that the old sink node becomes - the new source node. - """ - sink = [] - nodemap = {} - - def dfs(node, parent): - """Creates reverse nodes. - - A new reverse node will be created for each old node. The new node will - get a reversed label and the parents of the old node as children. 
- """ - if not node: - sink.append(parent) - elif id(node) not in nodemap: - nodemap[id(node)] = (node[0][::-1], [parent]) - for child in node[1]: - dfs(child, nodemap[id(node)]) - else: - nodemap[id(node)][1].append(parent) - - for node in dafsa: - dfs(node, None) - return sink - - -def join_labels(dafsa): - """Generates a new DAFSA where internal nodes are merged if there is a one to - one connection. - """ - parentcount = { id(None): 2 } - nodemap = { id(None): None } - - def count_parents(node): - """Count incoming references""" - if id(node) in parentcount: - parentcount[id(node)] += 1 - else: - parentcount[id(node)] = 1 - for child in node[1]: - count_parents(child) - - def join(node): - """Create new nodes""" - if id(node) not in nodemap: - children = [join(child) for child in node[1]] - if len(children) == 1 and parentcount[id(node[1][0])] == 1: - child = children[0] - nodemap[id(node)] = (node[0] + child[0], child[1]) - else: - nodemap[id(node)] = (node[0], children) - return nodemap[id(node)] - - for node in dafsa: - count_parents(node) - return [join(node) for node in dafsa] - - -def join_suffixes(dafsa): - """Generates a new DAFSA where nodes that represent the same word lists - towards the sink are merged. - """ - nodemap = { frozenset(('',)): None } - - def join(node): - """Returns a macthing node. A new node is created if no matching node - exists. The graph is accessed in dfs order. 
- """ - suffixes = frozenset(to_words(node)) - if suffixes not in nodemap: - nodemap[suffixes] = (node[0], [join(child) for child in node[1]]) - return nodemap[suffixes] - - return [join(node) for node in dafsa] - - -def top_sort(dafsa): - """Generates list of nodes in topological sort order.""" - incoming = {} - - def count_incoming(node): - """Counts incoming references.""" - if node: - if id(node) not in incoming: - incoming[id(node)] = 1 - for child in node[1]: - count_incoming(child) - else: - incoming[id(node)] += 1 - - for node in dafsa: - count_incoming(node) - - for node in dafsa: - incoming[id(node)] -= 1 - - waiting = [node for node in dafsa if incoming[id(node)] == 0] - nodes = [] - - while waiting: - node = waiting.pop() - assert incoming[id(node)] == 0 - nodes.append(node) - for child in node[1]: - if child: - incoming[id(child)] -= 1 - if incoming[id(child)] == 0: - waiting.append(child) - return nodes - - -def encode_links(children, offsets, current): - """Encodes a list of children as one, two or three byte offsets.""" - if not children[0]: - # This is an <end_label> node and no links follow such nodes - assert len(children) == 1 - return [] - guess = 3 * len(children) - assert children - children = sorted(children, key = lambda x: -offsets[id(x)]) - while True: - offset = current + guess - buf = [] - for child in children: - last = len(buf) - distance = offset - offsets[id(child)] - assert distance > 0 and distance < (1 << 21) - - if distance < (1 << 6): - # A 6-bit offset: "s0xxxxxx" - buf.append(distance) - elif distance < (1 << 13): - # A 13-bit offset: "s10xxxxxxxxxxxxx" - buf.append(0x40 | (distance >> 8)) - buf.append(distance & 0xFF) - else: - # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx" - buf.append(0x60 | (distance >> 16)) - buf.append((distance >> 8) & 0xFF) - buf.append(distance & 0xFF) - # Distance in first link is relative to following record. - # Distance in other links are relative to previous link. 
- offset -= distance - if len(buf) == guess: - break - guess = len(buf) - # Set most significant bit to mark end of links in this node. - buf[last] |= (1 << 7) - buf.reverse() - return buf - - -def encode_prefix(label): - """Encodes a node label as a list of bytes without a trailing high byte. - - This method encodes a node if there is exactly one child and the - child follows immidiately after so that no jump is needed. This label - will then be a prefix to the label in the child node. - """ - assert label - return [ord(c) for c in reversed(label)] - - -def encode_label(label): - """Encodes a node label as a list of bytes with a trailing high byte >0x80. - """ - buf = encode_prefix(label) - # Set most significant bit to mark end of label in this node. - buf[0] |= (1 << 7) - return buf - - -def encode(dafsa): - """Encodes a DAFSA to a list of bytes""" - output = [] - offsets = {} - - for node in reversed(top_sort(dafsa)): - if (len(node[1]) == 1 and node[1][0] and - (offsets[id(node[1][0])] == len(output))): - output.extend(encode_prefix(node[0])) - else: - output.extend(encode_links(node[1], offsets, len(output))) - output.extend(encode_label(node[0])) - offsets[id(node)] = len(output) - - output.extend(encode_links(dafsa, offsets, len(output))) - output.reverse() - return output - - -def to_cxx(data): - """Generates C++ code from a list of encoded bytes.""" - text = '/* This file is generated. DO NOT EDIT!\n\n' - text += 'The byte array encodes effective tld names. See make_dafsa.py for' - text += ' documentation.' 
- text += '*/\n\n' - text += 'const unsigned char kDafsa[%s] = {\n' % len(data) - for i in range(0, len(data), 12): - text += ' ' - text += ', '.join('0x%02x' % byte for byte in data[i:i + 12]) - text += ',\n' - text += '};\n' - return text - - -def words_to_cxx(words): - """Generates C++ code from a word list""" - dafsa = to_dafsa(words) - for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels): - dafsa = fun(dafsa) - return to_cxx(encode(dafsa)) - - -def parse_gperf(infile): - """Parses gperf file and extract strings and return code""" - lines = [line.strip() for line in infile] - # Extract strings after the first '%%' and before the second '%%'. - begin = lines.index('%%') + 1 - end = lines.index('%%', begin) - lines = lines[begin:end] - for line in lines: - if line[-3:-1] != ', ': - raise InputError('Expected "domainname, <digit>", found "%s"' % line) - # Technically the DAFSA format could support return values in range [0-31], - # but the values below are the only with a defined meaning. - if line[-1] not in '0124': - raise InputError('Expected value to be one of {0,1,2,4}, found "%s"' % - line[-1]) - return [line[:-3] + line[-1] for line in lines] - - -def main(): - if len(sys.argv) != 3: - print('usage: %s infile outfile' % sys.argv[0]) - return 1 - with open(sys.argv[1], 'r') as infile, open(sys.argv[2], 'w') as outfile: - outfile.write(words_to_cxx(parse_gperf(infile))) - return 0 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/chromium/net/tools/tld_cleanup/make_dafsa_unittest.py b/chromium/net/tools/tld_cleanup/make_dafsa_unittest.py deleted file mode 100755 index 5ff92e62292..00000000000 --- a/chromium/net/tools/tld_cleanup/make_dafsa_unittest.py +++ /dev/null @@ -1,757 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 The Chromium Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. 
- - -import sys -import unittest -import make_dafsa - - -class ParseGperfTest(unittest.TestCase): - def testMalformedKey(self): - """Tests exception is thrown at bad format.""" - infile1 = [ '%%', '', '%%' ] - self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile1) - - infile2 = [ '%%', 'apa,1', '%%' ] - self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile2) - - infile3 = [ '%%', 'apa, 1', '%%' ] - self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile3) - - def testBadValues(self): - """Tests exception is thrown when value is out of range.""" - infile1 = [ '%%', 'a, -1', '%%' ] - self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile1) - - infile2 = [ '%%', 'a, x', '%%' ] - self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile2) - - infile3 = [ '%%', 'a, 3', '%%' ] - self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile3) - - infile4 = [ '%%', 'a, 6', '%%' ] - self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile4) - - infile5 = [ '%%', 'a, 12', '%%' ] - self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile5) - - def testValues(self): - """Tests legal values are accepted.""" - infile1 = [ '%%', 'a, 0', '%%' ] - words1 = [ 'a0' ] - self.assertEqual(make_dafsa.parse_gperf(infile1), words1) - - infile2 = [ '%%', 'a, 1', '%%' ] - words2 = [ 'a1' ] - self.assertEqual(make_dafsa.parse_gperf(infile2), words2) - - infile3 = [ '%%', 'a, 2', '%%' ] - words3 = [ 'a2' ] - self.assertEqual(make_dafsa.parse_gperf(infile3), words3) - - infile4 = [ '%%', 'a, 4', '%%' ] - words4 = [ 'a4' ] - self.assertEqual(make_dafsa.parse_gperf(infile4), words4) - - def testOneWord(self): - """Tests a single key can be parsed.""" - infile = [ '%%', 'apa, 1', '%%' ] - words = [ 'apa1' ] - self.assertEqual(make_dafsa.parse_gperf(infile), words) - - def testTwoWords(self): - """Tests a sequence of keys can be parsed.""" - infile = [ '%%', 'apa, 1', 
'bepa.com, 2', '%%' ] - words = [ 'apa1', 'bepa.com2' ] - self.assertEqual(make_dafsa.parse_gperf(infile), words) - - -class ToDafsaTest(unittest.TestCase): - def testEmptyInput(self): - """Tests exception is thrown at empty input.""" - words = () - self.assertRaises(make_dafsa.InputError, make_dafsa.to_dafsa, words) - - def testNonASCII(self): - """Tests exception is thrown if illegal characters are used.""" - words1 = ( chr(0x1F) + 'a1', ) - self.assertRaises(make_dafsa.InputError, make_dafsa.to_dafsa, words1) - - words2 = ( 'a' + chr(0x1F) + '1', ) - self.assertRaises(make_dafsa.InputError, make_dafsa.to_dafsa, words2) - - words3 = ( chr(0x80) + 'a1', ) - self.assertRaises(make_dafsa.InputError, make_dafsa.to_dafsa, words3) - - words4 = ( 'a' + chr(0x80) + '1', ) - self.assertRaises(make_dafsa.InputError, make_dafsa.to_dafsa, words4) - - def testChar(self): - """Tests a DAFSA can be created from a single character domain name.""" - words = [ 'a0' ] - node2 = ( chr(0), [ None ] ) - node1 = ( 'a', [ node2 ] ) - source = [ node1 ] - self.assertEqual(make_dafsa.to_dafsa(words), source) - - def testChars(self): - """Tests a DAFSA can be created from a multi character domain name.""" - words = [ 'ab0' ] - node3 = ( chr(0), [ None ] ) - node2 = ( 'b', [ node3 ] ) - node1 = ( 'a', [ node2 ] ) - source = [ node1 ] - self.assertEqual(make_dafsa.to_dafsa(words), source) - - def testWords(self): - """Tests a DAFSA can be created from a sequence of domain names.""" - words = [ 'a0', 'b1' ] - node4 = ( chr(1), [ None ] ) - node3 = ( 'b', [ node4 ] ) - node2 = ( chr(0), [ None ] ) - node1 = ( 'a', [ node2 ] ) - source = [ node1, node3 ] - self.assertEqual(make_dafsa.to_dafsa(words), source) - - -class ToWordsTest(unittest.TestCase): - def testSink(self): - """Tests the sink is exapnded to a list with an empty string.""" - node1 = None - words = [ '' ] - self.assertEqual(make_dafsa.to_words(node1), words) - - def testSingleNode(self): - """Tests a single node is expanded to a 
list with the label string.""" - - # 'ab' -> [ 'ab' ] - - node1 = ( 'ab', [ None ] ) - words = [ 'ab' ] - self.assertEqual(make_dafsa.to_words(node1), words) - - def testChain(self): - """Tests a sequence of nodes are preoperly expanded.""" - - # 'ab' -> 'cd' => [ 'abcd' ] - - node2 = ( 'cd', [ None ] ) - node1 = ( 'ab', [ node2 ] ) - words = [ 'abcd' ] - self.assertEqual(make_dafsa.to_words(node1), words) - - def testInnerTerminator(self): - """Tests a sequence with an inner terminator is expanded to two strings.""" - - # 'a' -> 'b' - # \ => [ 'ab', 'a' ] - # {sink} - - node2 = ( 'b', [ None ] ) - node1 = ( 'a', [ node2, None ] ) - words = [ 'ab', 'a' ] - self.assertEqual(make_dafsa.to_words(node1), words) - - def testDiamond(self): - """Tests a diamond can be expanded to a word list.""" - - # 'cd' - # / \ - # 'ab' 'gh' - # \ / - # 'ef' - - node4 = ( 'gh', [ None ] ) - node3 = ( 'ef', [ node4 ] ) - node2 = ( 'cd', [ node4 ] ) - node1 = ( 'ab', [ node2, node3 ] ) - words = [ 'abcdgh', 'abefgh' ] - self.assertEqual(make_dafsa.to_words(node1), words) - - -class JoinLabelsTest(unittest.TestCase): - def testLabel(self): - """Tests a single label passes unchanged.""" - - # 'a' => 'a' - - node1 = ( 'a', [ None ] ) - source = [ node1 ] - self.assertEqual(make_dafsa.join_labels(source), source) - - def testInnerTerminator(self): - """Tests a sequence with an inner terminator passes unchanged.""" - - # 'a' -> 'b' 'a' -> 'b' - # \ => \ - # {sink} {sink} - - node2 = ( 'b', [ None ] ) - node1 = ( 'a', [ node2, None ] ) - source = [ node1 ] - self.assertEqual(make_dafsa.join_labels(source), source) - - def testLabels(self): - """Tests a sequence of labels can be joined.""" - - # 'a' -> 'b' => 'ab' - - node2 = ( 'b', [ None ] ) - node1 = ( 'a', [ node2 ] ) - source1 = [ node1 ] - node3 = ( 'ab', [ None ] ) - source2 = [ node3 ] - self.assertEqual(make_dafsa.join_labels(source1), source2) - - def testCompositeLabels(self): - """Tests a sequence of multi character labels can be 
joined.""" - - # 'ab' -> 'cd' => 'abcd' - - node2 = ( 'cd', [ None ] ) - node1 = ( 'ab', [ node2 ] ) - source1 = [ node1 ] - node3 = ( 'abcd', [ None ] ) - source2 = [ node3 ] - self.assertEqual(make_dafsa.join_labels(source1), source2) - - def testAtomicTrie(self): - """Tests a trie formed DAFSA with atomic labels passes unchanged.""" - - # 'b' 'b' - # / / - # 'a' => 'a' - # \ \ - # 'c' 'c' - - node3 = ( 'c', [ None ] ) - node2 = ( 'b', [ None ] ) - node1 = ( 'a', [ node2, node3 ] ) - source = [ node1 ] - self.assertEqual(make_dafsa.join_labels(source), source) - - def testReverseAtomicTrie(self): - """Tests a reverse trie formed DAFSA with atomic labels passes unchanged.""" - - # 'a' 'a' - # \ \ - # 'c' => 'c' - # / / - # 'b' 'b' - - node3 = ( 'c', [ None ] ) - node2 = ( 'b', [ node3 ] ) - node1 = ( 'a', [ node3 ] ) - source = [ node1, node2 ] - self.assertEqual(make_dafsa.join_labels(source), source) - - def testChainedTrie(self): - """Tests a trie formed DAFSA with chained labels can be joined.""" - - # 'c' -> 'd' 'cd' - # / / - # 'a' -> 'b' => 'ab' - # \ \ - # 'e' -> 'f' 'ef' - - node6 = ( 'f', [ None ] ) - node5 = ( 'e', [ node6 ] ) - node4 = ( 'd', [ None ] ) - node3 = ( 'c', [ node4 ] ) - node2 = ( 'b', [ node3, node5 ] ) - node1 = ( 'a', [ node2 ] ) - source1 = [ node1 ] - node9 = ( 'ef', [ None ] ) - node8 = ( 'cd', [ None ] ) - node7 = ( 'ab', [ node8, node9 ] ) - source2 = [ node7 ] - self.assertEqual(make_dafsa.join_labels(source1), source2) - - def testReverseChainedTrie(self): - """Tests a reverse trie formed DAFSA with chained labels can be joined.""" - - # 'a' -> 'b' 'ab' - # \ \ - # 'e' -> 'f' => 'ef' - # / / - # 'c' -> 'd' 'cd' - - node6 = ( 'f', [ None ] ) - node5 = ( 'e', [ node6 ] ) - node4 = ( 'd', [ node5 ] ) - node3 = ( 'c', [ node4 ] ) - node2 = ( 'b', [ node5 ] ) - node1 = ( 'a', [ node2 ] ) - source1 = [ node1, node3 ] - node9 = ( 'ef', [ None ] ) - node8 = ( 'cd', [ node9 ] ) - node7 = ( 'ab', [ node9 ] ) - source2 = [ node7, node8 ] - 
self.assertEqual(make_dafsa.join_labels(source1), source2) - - -class JoinSuffixesTest(unittest.TestCase): - def testSingleLabel(self): - """Tests a single label passes unchanged.""" - - # 'a' => 'a' - - node1 = ( 'a', [ None ] ) - source = [ node1 ] - self.assertEqual(make_dafsa.join_suffixes(source), source) - - def testInnerTerminator(self): - """Tests a sequence with an inner terminator passes unchanged.""" - - # 'a' -> 'b' 'a' -> 'b' - # \ => \ - # {sink} {sink} - - node2 = ( 'b', [ None ] ) - node1 = ( 'a', [ node2, None ] ) - source = [ node1 ] - self.assertEqual(make_dafsa.join_suffixes(source), source) - - def testDistinctTrie(self): - """Tests a trie formed DAFSA with distinct labels passes unchanged.""" - - # 'b' 'b' - # / / - # 'a' => 'a' - # \ \ - # 'c' 'c' - - node3 = ( 'c', [ None ] ) - node2 = ( 'b', [ None ] ) - node1 = ( 'a', [ node2, node3 ] ) - source = [ node1 ] - self.assertEqual(make_dafsa.join_suffixes(source), source) - - def testReverseDistinctTrie(self): - """Tests a reverse trie formed DAFSA with distinct labels passes unchanged. - """ - - # 'a' 'a' - # \ \ - # 'c' => 'c' - # / / - # 'b' 'b' - - node3 = ( 'c', [ None ] ) - node2 = ( 'b', [ node3 ] ) - node1 = ( 'a', [ node3 ] ) - source = [ node1, node2 ] - self.assertEqual(make_dafsa.join_suffixes(source), source) - - def testJoinTwoHeads(self): - """Tests two heads can be joined even if there is something else between.""" - - # 'a' ------'a' - # / - # 'b' => 'b' / - # / - # 'a' --- - # - # The picture above should shows that the new version should have just one - # instance of the node with label 'a'. - - node3 = ( 'a', [ None ] ) - node2 = ( 'b', [ None ] ) - node1 = ( 'a', [ None ] ) - source1 = [ node1, node2, node3 ] - source2 = make_dafsa.join_suffixes(source1) - - # Both versions should expand to the same content. - self.assertEqual(source1, source2) - # But the new version should have just one instance of 'a'. 
- self.assertIs(source2[0], source2[2]) - - def testJoinTails(self): - """Tests tails can be joined.""" - - # 'a' -> 'c' 'a' - # \ - # => 'c' - # / - # 'b' -> 'c' 'b' - - node4 = ( 'c', [ None ] ) - node3 = ( 'b', [ node4 ] ) - node2 = ( 'c', [ None ] ) - node1 = ( 'a', [ node2 ] ) - source1 = [ node1, node3 ] - source2 = make_dafsa.join_suffixes(source1) - - # Both versions should expand to the same content. - self.assertEqual(source1, source2) - # But the new version should have just one tail. - self.assertIs(source2[0][1][0], source2[1][1][0]) - - def testMakeRecursiveTrie(self): - """Tests recursive suffix join.""" - - # 'a' -> 'e' -> 'g' 'a' - # \ - # 'e' - # / \ - # 'b' -> 'e' -> 'g' 'b' \ - # \ - # => 'g' - # / - # 'c' -> 'f' -> 'g' 'c' / - # \ / - # 'f' - # / - # 'd' -> 'f' -> 'g' 'd' - - node7 = ( 'g', [ None ] ) - node6 = ( 'f', [ node7 ] ) - node5 = ( 'e', [ node7 ] ) - node4 = ( 'd', [ node6 ] ) - node3 = ( 'c', [ node6 ] ) - node2 = ( 'b', [ node5 ] ) - node1 = ( 'a', [ node5 ] ) - source1 = [ node1, node2, node3, node4 ] - source2 = make_dafsa.join_suffixes(source1) - - # Both versions should expand to the same content. - self.assertEqual(source1, source2) - # But the new version should have just one 'e'. - self.assertIs(source2[0][1][0], source2[1][1][0]) - # And one 'f'. - self.assertIs(source2[2][1][0], source2[3][1][0]) - # And one 'g'. - self.assertIs(source2[0][1][0][1][0], source2[2][1][0][1][0]) - - def testMakeDiamond(self): - """Test we can join suffixes of a trie.""" - - # 'b' -> 'd' 'b' - # / / \ - # 'a' => 'a' 'd' - # \ \ / - # 'c' -> 'd' 'c' - - node5 = ( 'd', [ None ] ) - node4 = ( 'c', [ node5 ] ) - node3 = ( 'd', [ None ] ) - node2 = ( 'b', [ node3 ] ) - node1 = ( 'a', [ node2, node4 ] ) - source1 = [ node1 ] - source2 = make_dafsa.join_suffixes(source1) - - # Both versions should expand to the same content. - self.assertEqual(source1, source2) - # But the new version should have just one 'd'. 
# NOTE(review): this region was a collapsed diff dump of the deleted file
# net/tools/tld_cleanup/make_dafsa_unittest.py (leading '-' diff markers,
# lost newlines).  Reconstructed below as well-formed Python.  The very
# first visible statement belonged to a test method whose `def` line lies
# before this chunk, so that fragment is omitted rather than guessed at.
# `make_dafsa` and `unittest` are imported at the top of the original
# file, outside this view.


class JoinSuffixesTest(unittest.TestCase):
  # NOTE(review): the class header is not visible in this chunk; the name
  # is inferred from the make_dafsa.join_suffixes() calls — confirm
  # against the full file.

  def testJoinOneChild(self):
    """Tests that we can join some children but not all."""

    #   'c'             ----'c'
    #  /               /    /
    # 'a'            'a'   /
    #  \              \   /
    #   'd'            'd'/
    #          =>        /
    #   'c'             /
    #  /               /
    # 'b'            'b'
    #  \              \
    #   'e'            'e'

    # Nodes are (label, [child, ...]) tuples; None marks the sink.
    node6 = ( 'e', [ None ] )
    node5 = ( 'c', [ None ] )
    node4 = ( 'b', [ node5, node6 ] )
    node3 = ( 'd', [ None ] )
    node2 = ( 'c', [ None ] )
    node1 = ( 'a', [ node2, node3 ] )
    source1 = [ node1, node4 ]
    source2 = make_dafsa.join_suffixes(source1)

    # Both versions should expand to the same content.
    self.assertEqual(source1, source2)
    # But the new version should have just one 'c' (shared by identity).
    self.assertIs(source2[0][1][0], source2[1][1][0])


class ReverseTest(unittest.TestCase):
  """Tests make_dafsa.reverse(), which reverses labels and edges."""

  def testAtomicLabel(self):
    """Tests an atomic label passes unchanged."""

    # 'a' => 'a'

    node1 = ( 'a', [ None ] )
    source = [ node1 ]
    self.assertEqual(make_dafsa.reverse(source), source)

  def testLabel(self):
    """Tests that labels are reversed."""

    # 'ab' => 'ba'

    node1 = ( 'ab', [ None ] )
    source1 = [ node1 ]
    node2 = ( 'ba', [ None ] )
    source2 = [ node2 ]
    self.assertEqual(make_dafsa.reverse(source1), source2)

  def testChain(self):
    """Tests that edges are reversed."""

    # 'a' -> 'b' => 'b' -> 'a'

    node2 = ( 'b', [ None ] )
    node1 = ( 'a', [ node2 ] )
    source1 = [ node1 ]
    node4 = ( 'a', [ None ] )
    node3 = ( 'b', [ node4 ] )
    source2 = [ node3 ]
    self.assertEqual(make_dafsa.reverse(source1), source2)

  def testInnerTerminator(self):
    """Tests a sequence with an inner terminator can be reversed."""

    # 'a' -> 'b'      'b' -> 'a'
    #  \          =>        /
    #   {sink}        ------

    node2 = ( 'b', [ None ] )
    node1 = ( 'a', [ node2, None ] )
    source1 = [ node1 ]
    node4 = ( 'a', [ None ] )
    node3 = ( 'b', [ node4 ] )
    # The inner terminator turns into a second source node.
    source2 = [ node3, node4 ]
    self.assertEqual(make_dafsa.reverse(source1), source2)

  def testAtomicTrie(self):
    """Tests a trie formed DAFSA can be reversed."""

    #    'b'       'b'
    #   /             \
    # 'a'      =>     'a'
    #   \             /
    #    'c'       'c'

    node3 = ( 'c', [ None ] )
    node2 = ( 'b', [ None ] )
    node1 = ( 'a', [ node2, node3 ] )
    source1 = [ node1 ]
    node6 = ( 'a', [ None ] )
    node5 = ( 'c', [ node6 ] )
    node4 = ( 'b', [ node6 ] )
    source2 = [ node4, node5 ]
    self.assertEqual(make_dafsa.reverse(source1), source2)

  def testReverseAtomicTrie(self):
    """Tests a reverse trie formed DAFSA can be reversed."""

    # 'a'          'a'
    #   \          /
    #   'c'  =>  'c'
    #   /          \
    # 'b'          'b'

    node3 = ( 'c', [ None ] )
    node2 = ( 'b', [ node3 ] )
    node1 = ( 'a', [ node3 ] )
    source1 = [ node1, node2 ]
    node6 = ( 'b', [ None ] )
    node5 = ( 'a', [ None ] )
    node4 = ( 'c', [ node5, node6 ] )
    source2 = [ node4 ]
    self.assertEqual(make_dafsa.reverse(source1), source2)

  def testDiamond(self):
    """Tests we can reverse both edges and nodes in a diamond."""

    #      'cd'            'dc'
    #     /    \          /    \
    # 'ab'      'gh' => 'hg'    'ba'
    #     \    /          \    /
    #      'ef'            'fe'

    node4 = ( 'gh', [ None ] )
    node3 = ( 'ef', [ node4 ] )
    node2 = ( 'cd', [ node4 ] )
    node1 = ( 'ab', [ node2, node3 ] )
    source1 = [ node1 ]
    node8 = ( 'ba', [ None ] )
    node7 = ( 'fe', [ node8 ] )
    node6 = ( 'dc', [ node8 ] )
    node5 = ( 'hg', [ node6, node7 ] )
    source2 = [ node5 ]
    self.assertEqual(make_dafsa.reverse(source1), source2)


class TopSortTest(unittest.TestCase):
  """Tests make_dafsa.top_sort(), a topological ordering of the graph."""

  def testNode(self):
    """Tests a DAFSA with one node can be sorted."""

    # 'a' => [ 'a' ]

    node1 = ( 'a', [ None ] )
    source = [ node1 ]
    nodes = [ node1 ]
    self.assertEqual(make_dafsa.top_sort(source), nodes)

  def testDiamond(self):
    """Tests nodes in a diamond can be sorted."""

    #    'b'
    #   /   \
    # 'a'    'd'
    #   \   /
    #    'c'

    node4 = ( 'd', [ None ] )
    node3 = ( 'c', [ node4 ] )
    node2 = ( 'b', [ node4 ] )
    node1 = ( 'a', [ node2, node3 ] )
    source = [ node1 ]
    nodes = make_dafsa.top_sort(source)
    # Only relative order is guaranteed: parents before children.
    self.assertLess(nodes.index(node1), nodes.index(node2))
    self.assertLess(nodes.index(node2), nodes.index(node4))
    self.assertLess(nodes.index(node3), nodes.index(node4))


class EncodePrefixTest(unittest.TestCase):
  """Tests make_dafsa.encode_prefix(): label bytes emitted in reverse."""

  def testChar(self):
    """Tests to encode a single character prefix."""
    label = 'a'
    # Renamed from `bytes` to avoid shadowing the builtin.
    expected = [ ord('a') ]
    self.assertEqual(make_dafsa.encode_prefix(label), expected)

  def testChars(self):
    """Tests to encode a multi character prefix."""
    label = 'ab'
    expected = [ ord('b'), ord('a') ]
    self.assertEqual(make_dafsa.encode_prefix(label), expected)


class EncodeLabelTest(unittest.TestCase):
  """Tests make_dafsa.encode_label(): like a prefix, but the first
  emitted byte carries 0x80 (per the expected values below)."""

  def testChar(self):
    """Tests to encode a single character label."""
    label = 'a'
    expected = [ ord('a') + 0x80 ]
    self.assertEqual(make_dafsa.encode_label(label), expected)

  def testChars(self):
    """Tests to encode a multi character label."""
    label = 'ab'
    expected = [ ord('b') + 0x80, ord('a') ]
    self.assertEqual(make_dafsa.encode_label(label), expected)


class EncodeLinksTest(unittest.TestCase):
  """Tests make_dafsa.encode_links(children, offsets, byte_count).

  `offsets` maps id(node) -> position; `byte_count` is the current output
  size the link offsets are encoded relative to.  Locals renamed from
  `bytes` to `byte_count` to avoid shadowing the builtin (the call is
  positional, so behavior is unchanged).
  """

  def testEndLabel(self):
    """Tests to encode link to the sink."""
    children = [ None ]
    offsets = {}
    byte_count = 0
    output = []
    self.assertEqual(make_dafsa.encode_links(children, offsets, byte_count),
                     output)

  def testOneByteOffset(self):
    """Tests to encode a single one byte offset."""
    node = ( '', [ None ] )
    children = [ node ]
    offsets = { id(node) : 2 }
    byte_count = 5
    output = [ 132 ]
    self.assertEqual(make_dafsa.encode_links(children, offsets, byte_count),
                     output)

  def testOneByteOffsets(self):
    """Tests to encode a sequence of one byte offsets."""
    node1 = ( '', [ None ] )
    node2 = ( '', [ None ] )
    children = [ node1, node2 ]
    offsets = { id(node1) : 2, id(node2) : 1 }
    byte_count = 5
    output = [ 129, 5 ]
    self.assertEqual(make_dafsa.encode_links(children, offsets, byte_count),
                     output)

  def testTwoBytesOffset(self):
    """Tests to encode a single two byte offset."""
    node = ( '', [ None ] )
    children = [ node ]
    offsets = { id(node) : 2 }
    byte_count = 1005
    output = [ 237, 195 ]
    self.assertEqual(make_dafsa.encode_links(children, offsets, byte_count),
                     output)

  def testTwoBytesOffsets(self):
    """Tests to encode a sequence of two byte offsets."""
    node1 = ( '', [ None ] )
    node2 = ( '', [ None ] )
    node3 = ( '', [ None ] )
    children = [ node1, node2, node3 ]
    offsets = { id(node1) : 1002, id(node2) : 2, id(node3) : 2002 }
    byte_count = 3005
    output = [ 232, 195, 232, 67, 241, 67 ]
    self.assertEqual(make_dafsa.encode_links(children, offsets, byte_count),
                     output)

  def testThreeBytesOffset(self):
    """Tests to encode a single three byte offset."""
    node = ( '', [ None ] )
    children = [ node ]
    offsets = { id(node) : 2 }
    byte_count = 100005
    output = [ 166, 134, 225 ]
    self.assertEqual(make_dafsa.encode_links(children, offsets, byte_count),
                     output)

  def testThreeBytesOffsets(self):
    """Tests to encode a sequence of three byte offsets."""
    node1 = ( '', [ None ] )
    node2 = ( '', [ None ] )
    node3 = ( '', [ None ] )
    children = [ node1, node2, node3 ]
    offsets = { id(node1) : 100002, id(node2) : 2, id(node3) : 200002 }
    byte_count = 300005
    output = [ 160, 134, 225, 160, 134, 97, 172, 134, 97 ]
    self.assertEqual(make_dafsa.encode_links(children, offsets, byte_count),
                     output)

  def testOneTwoThreeBytesOffsets(self):
    """Tests to encode offsets of different sizes."""
    node1 = ( '', [ None ] )
    node2 = ( '', [ None ] )
    node3 = ( '', [ None ] )
    children = [ node1, node2, node3 ]
    offsets = { id(node1) : 10003, id(node2) : 10002, id(node3) : 100002 }
    byte_count = 300005
    output = [ 129, 143, 95, 97, 74, 13, 99 ]
    self.assertEqual(make_dafsa.encode_links(children, offsets, byte_count),
                     output)


class ExamplesTest(unittest.TestCase):
  """End-to-end tests over the worked examples in make_dafsa.py."""

  def testExample1(self):
    """Tests Example 1 from make_dafsa.py."""
    infile = [ '%%', 'aa, 1', 'a, 2', '%%' ]
    # Renamed from `bytes` to avoid shadowing the builtin.
    encoded = [ 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81 ]
    outfile = make_dafsa.to_cxx(encoded)
    self.assertEqual(make_dafsa.words_to_cxx(make_dafsa.parse_gperf(infile)),
                     outfile)

  def testExample2(self):
    """Tests Example 2 from make_dafsa.py."""
    infile = [ '%%', 'aa, 1', 'bbb, 2', 'baa, 1', '%%' ]
    encoded = [ 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62,
                0x82 ]
    outfile = make_dafsa.to_cxx(encoded)
    self.assertEqual(make_dafsa.words_to_cxx(make_dafsa.parse_gperf(infile)),
                     outfile)


if __name__ == '__main__':
  unittest.main()