author    Allan Sandfeld Jensen <allan.jensen@theqtcompany.com>    2016-01-25 11:39:07 +0100
committer Oswald Buddenhagen <oswald.buddenhagen@theqtcompany.com>    2016-01-25 15:20:42 +0000
commit    6c91641271e536ffaa88a1dff5127e42ee99a91e (patch)
tree      703d9dd49602377ddc90cbf886aad37913f2496b /chromium/net/tools/tld_cleanup
parent    b145b7fafd36f0c260d6a768c81fc14e32578099 (diff)
download  qtwebengine-chromium-6c91641271e536ffaa88a1dff5127e42ee99a91e.tar.gz
BASELINE: Update Chromium to 49.0.2623.23
Also adds missing printing sources.
Change-Id: I3726b8f0c7d6751c9fc846096c571fadca7108cd
Reviewed-by: Oswald Buddenhagen <oswald.buddenhagen@theqtcompany.com>
Diffstat (limited to 'chromium/net/tools/tld_cleanup')
-rw-r--r--  chromium/net/tools/tld_cleanup/PRESUBMIT.py            |  32
-rwxr-xr-x  chromium/net/tools/tld_cleanup/make_dafsa.py           | 469
-rwxr-xr-x  chromium/net/tools/tld_cleanup/make_dafsa_unittest.py  | 757
3 files changed, 0 insertions, 1258 deletions
diff --git a/chromium/net/tools/tld_cleanup/PRESUBMIT.py b/chromium/net/tools/tld_cleanup/PRESUBMIT.py
deleted file mode 100644
index 8391e0e61a1..00000000000
--- a/chromium/net/tools/tld_cleanup/PRESUBMIT.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright 2014 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-
-"""Chromium presubmit script for src/net/tools/tld_cleanup."""
-
-
-def _RunMakeDafsaTests(input_api, output_api):
- """Runs unittest for make_dafsa if any related file has been modified."""
- files = ('net/tools/tld_cleanup/make_dafsa.py',
- 'net/tools/tld_cleanup/make_dafsa_unittest.py')
- if not any(f in input_api.LocalPaths() for f in files):
- return []
- test_path = input_api.os_path.join(input_api.PresubmitLocalPath(),
- 'make_dafsa_unittest.py')
- cmd_name = 'make_dafsa_unittest'
- cmd = [input_api.python_executable, test_path]
- test_cmd = input_api.Command(
- name=cmd_name,
- cmd=cmd,
- kwargs={},
- message=output_api.PresubmitPromptWarning)
- return input_api.RunTests([test_cmd])
-
-
-def CheckChangeOnUpload(input_api, output_api):
- return _RunMakeDafsaTests(input_api, output_api)
-
-
-def CheckChangeOnCommit(input_api, output_api):
- return _RunMakeDafsaTests(input_api, output_api)
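As an aside, the presubmit above simply shells out to the unit test module. A minimal, illustrative sketch of running the same tests directly, outside the presubmit framework (assuming make_dafsa_unittest.py is in the current working directory), could look like this:

import subprocess
import sys

# Equivalent of the presubmit's command: <python executable> make_dafsa_unittest.py
sys.exit(subprocess.call([sys.executable, 'make_dafsa_unittest.py']))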
diff --git a/chromium/net/tools/tld_cleanup/make_dafsa.py b/chromium/net/tools/tld_cleanup/make_dafsa.py
deleted file mode 100755
index 78358effa84..00000000000
--- a/chromium/net/tools/tld_cleanup/make_dafsa.py
+++ /dev/null
@@ -1,469 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2014 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-"""
-A Deterministic acyclic finite state automaton (DAFSA) is a compact
-representation of an unordered word list (dictionary).
-
-http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
-
-This Python program converts a list of strings into a C++ byte array. It
-fetches the strings and return values from a gperf file and generates a C++
-file with a byte array representing a graph that can be used as a
-memory-efficient replacement for the perfect hash table.
-
-The input strings are assumed to consist of printable 7-bit ASCII characters
-and the return values are assumed to be one digit integers.
-
-In this program a DAFSA is a diamond shaped graph starting at a common
-source node and ending at a common sink node. All internal nodes contain
-a label and each word is represented by the labels in one path from
-the source node to the sink node.
-
-The following Python representation is used for nodes:
-
- Source node: [ children ]
- Internal node: (label, [ children ])
- Sink node: None
-
-The graph is first compressed along prefixes, like a trie. In the next step
-suffixes are compressed so that the graph becomes diamond shaped. Finally,
-nodes linked one-to-one are replaced by single nodes with the labels joined.
-
-The order of the operations is crucial since lookups will be performed
-starting from the source with no backtracking. Thus a node must have at
-most one child whose label starts with the same character. The output
-is also arranged so that all jumps are to increasing addresses, thus forward
-in memory.
-
-The generated output has suffix free decoding so that the leading bits in a
-link (a reference to a child node) indicate whether it has a size of one,
-two or three bytes and whether it is the last outgoing link from the current
-node. A node label is terminated by a byte with the leading bit set.
-
-The generated byte array can be described by the following BNF:
-
-<byte> ::= < 8-bit value in range [0x00-0xFF] >
-
-<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
-<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
-<return_value> ::= < value + 0x80, byte in range [0x80-0x8F] >
-
-<offset1> ::= < byte in range [0x00-0x3F] >
-<offset2> ::= < byte in range [0x40-0x5F] >
-<offset3> ::= < byte in range [0x60-0x7F] >
-
-<end_offset1> ::= < byte in range [0x80-0xBF] >
-<end_offset2> ::= < byte in range [0xC0-0xDF] >
-<end_offset3> ::= < byte in range [0xE0-0xFF] >
-
-<prefix> ::= <char>
-
-<label> ::= <end_char>
- | <char> <label>
-
-<end_label> ::= <return_value>
- | <char> <end_label>
-
-<offset> ::= <offset1>
- | <offset2> <byte>
- | <offset3> <byte> <byte>
-
-<end_offset> ::= <end_offset1>
- | <end_offset2> <byte>
- | <end_offset3> <byte> <byte>
-
-<offsets> ::= <end_offset>
- | <offset> <offsets>
-
-<source> ::= <offsets>
-
-<node> ::= <label> <offsets>
- | <prefix> <node>
- | <end_label>
-
-<dafsa> ::= <source>
- | <dafsa> <node>
-
-Decoding:
-
-<char> -> printable 7-bit ASCII character
-<end_char> & 0x7F -> printable 7-bit ASCII character
-<return_value> & 0x0F -> integer
-<offset1> & 0x3F -> integer
-((<offset2> & 0x1F) << 8) + <byte> -> integer
-((<offset3> & 0x1F) << 16) + (<byte> << 8) + <byte> -> integer
-
-end_offset1, end_offset2 and end_offset3 are decoded the same way as offset1,
-offset2 and offset3 respectively.
-
-The first offset in a list of offsets is the distance in bytes between the
-offset itself and the first child node. Subsequent offsets are the distance
-between the previous child node and the next child node. Thus each offset
-links a node to a child node. The distance is always counted between start
-addresses, i.e. the first byte of the decoded offset or the first byte of
-the child node.
-
-Example 1:
-
-%%
-aa, 1
-a, 2
-%%
-
-The input is first parsed to a list of words:
-["aa1", "a2"]
-
-A fully expanded graph is created from the words:
-source = [node1, node4]
-node1 = ("a", [node2])
-node2 = ("a", [node3])
-node3 = ("\x01", [sink])
-node4 = ("a", [node5])
-node5 = ("\x02", [sink])
-sink = None
-
-Compression results in the following graph:
-source = [node1]
-node1 = ("a", [node2, node3])
-node2 = ("\x02", [sink])
-node3 = ("a\x01", [sink])
-sink = None
-
-A C++ representation of the compressed graph is generated:
-
-const unsigned char dafsa[7] = {
- 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
-};
-
-The bytes in the generated array have the following meaning:
-
- 0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1
-
- 1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
- 2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4
-
- 3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
- 4: 0x82 <return_value> 0x82 & 0x0F -> return 2
-
- 5: 0x61 <char> label character 0x61 -> match "a"
- 6: 0x81 <return_value> 0x81 & 0x0F -> return 1
-
-Example 2:
-
-%%
-aa, 1
-bbb, 2
-baa, 1
-%%
-
-The input is first parsed to a list of words:
-["aa1", "bbb2", "baa1"]
-
-Compression results in the following graph:
-source = [node1, node2]
-node1 = ("b", [node2, node3])
-node2 = ("aa\x01", [sink])
-node3 = ("bb\x02", [sink])
-sink = None
-
-A C++ representation of the compressed graph is generated:
-
-const unsigned char dafsa[11] = {
- 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
-};
-
-The bytes in the generated array have the following meaning:
-
- 0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
- 1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5
-
- 2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
- 3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
- 4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8
-
- 5: 0x61 <char> label character 0x61 -> match "a"
- 6: 0x61 <char> label character 0x61 -> match "a"
- 7: 0x81 <return_value> 0x81 & 0x0F -> return 1
-
- 8: 0x62 <char> label character 0x62 -> match "b"
- 9: 0x62 <char> label character 0x62 -> match "b"
-10: 0x82 <return_value> 0x82 & 0x0F -> return 2
-"""
-
-import sys
-
-class InputError(Exception):
- """Exception raised for errors in the input file."""
-
-
-def to_dafsa(words):
- """Generates a DAFSA from a word list and returns the source node.
-
- Each word is split into characters so that each character is represented by
- a unique node. It is assumed the word list is not empty.
- """
- if not words:
- raise InputError('The domain list must not be empty')
- def ToNodes(word):
- """Split words into characters"""
- if not 0x1F < ord(word[0]) < 0x80:
- raise InputError('Domain names must be printable 7-bit ASCII')
- if len(word) == 1:
- return chr(ord(word[0]) & 0x0F), [None]
- return word[0], [ToNodes(word[1:])]
- return [ToNodes(word) for word in words]
-
-
-def to_words(node):
- """Generates a word list from all paths starting from an internal node."""
- if not node:
- return ['']
- return [(node[0] + word) for child in node[1] for word in to_words(child)]
-
-
-def reverse(dafsa):
- """Generates a new DAFSA that is reversed, so that the old sink node becomes
- the new source node.
- """
- sink = []
- nodemap = {}
-
- def dfs(node, parent):
- """Creates reverse nodes.
-
- A new reverse node will be created for each old node. The new node will
- get a reversed label and the parents of the old node as children.
- """
- if not node:
- sink.append(parent)
- elif id(node) not in nodemap:
- nodemap[id(node)] = (node[0][::-1], [parent])
- for child in node[1]:
- dfs(child, nodemap[id(node)])
- else:
- nodemap[id(node)][1].append(parent)
-
- for node in dafsa:
- dfs(node, None)
- return sink
-
-
-def join_labels(dafsa):
- """Generates a new DAFSA where internal nodes are merged if there is a one to
- one connection.
- """
- parentcount = { id(None): 2 }
- nodemap = { id(None): None }
-
- def count_parents(node):
- """Count incoming references"""
- if id(node) in parentcount:
- parentcount[id(node)] += 1
- else:
- parentcount[id(node)] = 1
- for child in node[1]:
- count_parents(child)
-
- def join(node):
- """Create new nodes"""
- if id(node) not in nodemap:
- children = [join(child) for child in node[1]]
- if len(children) == 1 and parentcount[id(node[1][0])] == 1:
- child = children[0]
- nodemap[id(node)] = (node[0] + child[0], child[1])
- else:
- nodemap[id(node)] = (node[0], children)
- return nodemap[id(node)]
-
- for node in dafsa:
- count_parents(node)
- return [join(node) for node in dafsa]
-
-
-def join_suffixes(dafsa):
- """Generates a new DAFSA where nodes that represent the same word lists
- towards the sink are merged.
- """
- nodemap = { frozenset(('',)): None }
-
- def join(node):
- """Returns a macthing node. A new node is created if no matching node
- exists. The graph is accessed in dfs order.
- """
- suffixes = frozenset(to_words(node))
- if suffixes not in nodemap:
- nodemap[suffixes] = (node[0], [join(child) for child in node[1]])
- return nodemap[suffixes]
-
- return [join(node) for node in dafsa]
-
-
-def top_sort(dafsa):
- """Generates list of nodes in topological sort order."""
- incoming = {}
-
- def count_incoming(node):
- """Counts incoming references."""
- if node:
- if id(node) not in incoming:
- incoming[id(node)] = 1
- for child in node[1]:
- count_incoming(child)
- else:
- incoming[id(node)] += 1
-
- for node in dafsa:
- count_incoming(node)
-
- for node in dafsa:
- incoming[id(node)] -= 1
-
- waiting = [node for node in dafsa if incoming[id(node)] == 0]
- nodes = []
-
- while waiting:
- node = waiting.pop()
- assert incoming[id(node)] == 0
- nodes.append(node)
- for child in node[1]:
- if child:
- incoming[id(child)] -= 1
- if incoming[id(child)] == 0:
- waiting.append(child)
- return nodes
-
-
-def encode_links(children, offsets, current):
- """Encodes a list of children as one, two or three byte offsets."""
- if not children[0]:
- # This is an <end_label> node and no links follow such nodes
- assert len(children) == 1
- return []
- guess = 3 * len(children)
- assert children
- children = sorted(children, key = lambda x: -offsets[id(x)])
- while True:
- offset = current + guess
- buf = []
- for child in children:
- last = len(buf)
- distance = offset - offsets[id(child)]
- assert distance > 0 and distance < (1 << 21)
-
- if distance < (1 << 6):
- # A 6-bit offset: "s0xxxxxx"
- buf.append(distance)
- elif distance < (1 << 13):
- # A 13-bit offset: "s10xxxxxxxxxxxxx"
- buf.append(0x40 | (distance >> 8))
- buf.append(distance & 0xFF)
- else:
- # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
- buf.append(0x60 | (distance >> 16))
- buf.append((distance >> 8) & 0xFF)
- buf.append(distance & 0xFF)
- # Distance in first link is relative to following record.
- # Distance in other links are relative to previous link.
- offset -= distance
- if len(buf) == guess:
- break
- guess = len(buf)
- # Set most significant bit to mark end of links in this node.
- buf[last] |= (1 << 7)
- buf.reverse()
- return buf
-
-
-def encode_prefix(label):
- """Encodes a node label as a list of bytes without a trailing high byte.
-
- This method encodes a node when there is exactly one child and the
- child follows immediately after it, so that no jump is needed. This label
- will then be a prefix to the label in the child node.
- """
- assert label
- return [ord(c) for c in reversed(label)]
-
-
-def encode_label(label):
- """Encodes a node label as a list of bytes with a trailing high byte >0x80.
- """
- buf = encode_prefix(label)
- # Set most significant bit to mark end of label in this node.
- buf[0] |= (1 << 7)
- return buf
-
-
-def encode(dafsa):
- """Encodes a DAFSA to a list of bytes"""
- output = []
- offsets = {}
-
- for node in reversed(top_sort(dafsa)):
- if (len(node[1]) == 1 and node[1][0] and
- (offsets[id(node[1][0])] == len(output))):
- output.extend(encode_prefix(node[0]))
- else:
- output.extend(encode_links(node[1], offsets, len(output)))
- output.extend(encode_label(node[0]))
- offsets[id(node)] = len(output)
-
- output.extend(encode_links(dafsa, offsets, len(output)))
- output.reverse()
- return output
-
-
-def to_cxx(data):
- """Generates C++ code from a list of encoded bytes."""
- text = '/* This file is generated. DO NOT EDIT!\n\n'
- text += 'The byte array encodes effective tld names. See make_dafsa.py for'
- text += ' documentation.'
- text += '*/\n\n'
- text += 'const unsigned char kDafsa[%s] = {\n' % len(data)
- for i in range(0, len(data), 12):
- text += ' '
- text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
- text += ',\n'
- text += '};\n'
- return text
-
-
-def words_to_cxx(words):
- """Generates C++ code from a word list"""
- dafsa = to_dafsa(words)
- for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
- dafsa = fun(dafsa)
- return to_cxx(encode(dafsa))
-
-
-def parse_gperf(infile):
- """Parses gperf file and extract strings and return code"""
- lines = [line.strip() for line in infile]
- # Extract strings after the first '%%' and before the second '%%'.
- begin = lines.index('%%') + 1
- end = lines.index('%%', begin)
- lines = lines[begin:end]
- for line in lines:
- if line[-3:-1] != ', ':
- raise InputError('Expected "domainname, <digit>", found "%s"' % line)
- # Technically the DAFSA format could support return values in range [0-31],
- # but the values below are the only ones with a defined meaning.
- if line[-1] not in '0124':
- raise InputError('Expected value to be one of {0,1,2,4}, found "%s"' %
- line[-1])
- return [line[:-3] + line[-1] for line in lines]
-
-
-def main():
- if len(sys.argv) != 3:
- print('usage: %s infile outfile' % sys.argv[0])
- return 1
- with open(sys.argv[1], 'r') as infile, open(sys.argv[2], 'w') as outfile:
- outfile.write(words_to_cxx(parse_gperf(infile)))
- return 0
-
-
-if __name__ == '__main__':
- sys.exit(main())
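For illustration, the byte format documented at the top of make_dafsa.py can be decoded with a small lookup routine. The following is a minimal Python 3 sketch, not Chromium's actual decoder; it assumes data is the byte sequence of a generated kDafsa array (for example the seven bytes of Example 1) and key is a plain ASCII string:

def _children(data, pos):
    """Yields positions of the child nodes linked from the <offsets> list
    starting at pos (see the BNF in the make_dafsa.py docstring)."""
    base = pos
    while True:
        byte = data[pos]
        last = bool(byte & 0x80)        # end_offset variants mark the last link
        low = byte & 0x7F
        if low < 0x40:                  # <offset1>: 6-bit distance
            distance, pos = low, pos + 1
        elif low < 0x60:                # <offset2>: 13-bit distance
            distance, pos = ((low & 0x1F) << 8) | data[pos + 1], pos + 2
        else:                           # <offset3>: 21-bit distance
            distance = ((low & 0x1F) << 16) | (data[pos + 1] << 8) | data[pos + 2]
            pos += 3
        base += distance                # first link: relative to the offset itself;
        yield base                      # later links: relative to the previous child
        if last:
            return

def _match(data, pos, key):
    """Matches key against the node starting at pos; returns the stored
    return value, or None if this node does not match."""
    while True:
        byte = data[pos]
        if byte < 0x80:                 # plain label character
            if not key or ord(key[0]) != byte:
                return None
            key, pos = key[1:], pos + 1
        elif byte < 0x90:               # <return_value>: end of a stored word
            return byte & 0x0F if not key else None
        else:                           # <end_char>: last label character, offsets follow
            if not key or ord(key[0]) != (byte & 0x7F):
                return None
            key = key[1:]
            for child in _children(data, pos + 1):
                value = _match(data, child, key)
                if value is not None:
                    return value
            return None

def lookup(data, key):
    """Returns the value stored for key in the encoded DAFSA, or None."""
    for child in _children(data, 0):    # the array starts with the source offsets
        value = _match(data, child, key)
        if value is not None:
            return value
    return None

# Example 1 from the docstring: 'a' -> 2 and 'aa' -> 1.
example1 = bytes([0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81])
assert lookup(example1, 'a') == 2
assert lookup(example1, 'aa') == 1
assert lookup(example1, 'b') is None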
diff --git a/chromium/net/tools/tld_cleanup/make_dafsa_unittest.py b/chromium/net/tools/tld_cleanup/make_dafsa_unittest.py
deleted file mode 100755
index 5ff92e62292..00000000000
--- a/chromium/net/tools/tld_cleanup/make_dafsa_unittest.py
+++ /dev/null
@@ -1,757 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2014 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-
-import sys
-import unittest
-import make_dafsa
-
-
-class ParseGperfTest(unittest.TestCase):
- def testMalformedKey(self):
- """Tests exception is thrown at bad format."""
- infile1 = [ '%%', '', '%%' ]
- self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile1)
-
- infile2 = [ '%%', 'apa,1', '%%' ]
- self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile2)
-
- infile3 = [ '%%', 'apa, 1', '%%' ]
- self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile3)
-
- def testBadValues(self):
- """Tests exception is thrown when value is out of range."""
- infile1 = [ '%%', 'a, -1', '%%' ]
- self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile1)
-
- infile2 = [ '%%', 'a, x', '%%' ]
- self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile2)
-
- infile3 = [ '%%', 'a, 3', '%%' ]
- self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile3)
-
- infile4 = [ '%%', 'a, 6', '%%' ]
- self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile4)
-
- infile5 = [ '%%', 'a, 12', '%%' ]
- self.assertRaises(make_dafsa.InputError, make_dafsa.parse_gperf, infile5)
-
- def testValues(self):
- """Tests legal values are accepted."""
- infile1 = [ '%%', 'a, 0', '%%' ]
- words1 = [ 'a0' ]
- self.assertEqual(make_dafsa.parse_gperf(infile1), words1)
-
- infile2 = [ '%%', 'a, 1', '%%' ]
- words2 = [ 'a1' ]
- self.assertEqual(make_dafsa.parse_gperf(infile2), words2)
-
- infile3 = [ '%%', 'a, 2', '%%' ]
- words3 = [ 'a2' ]
- self.assertEqual(make_dafsa.parse_gperf(infile3), words3)
-
- infile4 = [ '%%', 'a, 4', '%%' ]
- words4 = [ 'a4' ]
- self.assertEqual(make_dafsa.parse_gperf(infile4), words4)
-
- def testOneWord(self):
- """Tests a single key can be parsed."""
- infile = [ '%%', 'apa, 1', '%%' ]
- words = [ 'apa1' ]
- self.assertEqual(make_dafsa.parse_gperf(infile), words)
-
- def testTwoWords(self):
- """Tests a sequence of keys can be parsed."""
- infile = [ '%%', 'apa, 1', 'bepa.com, 2', '%%' ]
- words = [ 'apa1', 'bepa.com2' ]
- self.assertEqual(make_dafsa.parse_gperf(infile), words)
-
-
-class ToDafsaTest(unittest.TestCase):
- def testEmptyInput(self):
- """Tests exception is thrown at empty input."""
- words = ()
- self.assertRaises(make_dafsa.InputError, make_dafsa.to_dafsa, words)
-
- def testNonASCII(self):
- """Tests exception is thrown if illegal characters are used."""
- words1 = ( chr(0x1F) + 'a1', )
- self.assertRaises(make_dafsa.InputError, make_dafsa.to_dafsa, words1)
-
- words2 = ( 'a' + chr(0x1F) + '1', )
- self.assertRaises(make_dafsa.InputError, make_dafsa.to_dafsa, words2)
-
- words3 = ( chr(0x80) + 'a1', )
- self.assertRaises(make_dafsa.InputError, make_dafsa.to_dafsa, words3)
-
- words4 = ( 'a' + chr(0x80) + '1', )
- self.assertRaises(make_dafsa.InputError, make_dafsa.to_dafsa, words4)
-
- def testChar(self):
- """Tests a DAFSA can be created from a single character domain name."""
- words = [ 'a0' ]
- node2 = ( chr(0), [ None ] )
- node1 = ( 'a', [ node2 ] )
- source = [ node1 ]
- self.assertEqual(make_dafsa.to_dafsa(words), source)
-
- def testChars(self):
- """Tests a DAFSA can be created from a multi character domain name."""
- words = [ 'ab0' ]
- node3 = ( chr(0), [ None ] )
- node2 = ( 'b', [ node3 ] )
- node1 = ( 'a', [ node2 ] )
- source = [ node1 ]
- self.assertEqual(make_dafsa.to_dafsa(words), source)
-
- def testWords(self):
- """Tests a DAFSA can be created from a sequence of domain names."""
- words = [ 'a0', 'b1' ]
- node4 = ( chr(1), [ None ] )
- node3 = ( 'b', [ node4 ] )
- node2 = ( chr(0), [ None ] )
- node1 = ( 'a', [ node2 ] )
- source = [ node1, node3 ]
- self.assertEqual(make_dafsa.to_dafsa(words), source)
-
-
-class ToWordsTest(unittest.TestCase):
- def testSink(self):
- """Tests the sink is exapnded to a list with an empty string."""
- node1 = None
- words = [ '' ]
- self.assertEqual(make_dafsa.to_words(node1), words)
-
- def testSingleNode(self):
- """Tests a single node is expanded to a list with the label string."""
-
- # 'ab' -> [ 'ab' ]
-
- node1 = ( 'ab', [ None ] )
- words = [ 'ab' ]
- self.assertEqual(make_dafsa.to_words(node1), words)
-
- def testChain(self):
- """Tests a sequence of nodes are preoperly expanded."""
-
- # 'ab' -> 'cd' => [ 'abcd' ]
-
- node2 = ( 'cd', [ None ] )
- node1 = ( 'ab', [ node2 ] )
- words = [ 'abcd' ]
- self.assertEqual(make_dafsa.to_words(node1), words)
-
- def testInnerTerminator(self):
- """Tests a sequence with an inner terminator is expanded to two strings."""
-
- # 'a' -> 'b'
- # \ => [ 'ab', 'a' ]
- # {sink}
-
- node2 = ( 'b', [ None ] )
- node1 = ( 'a', [ node2, None ] )
- words = [ 'ab', 'a' ]
- self.assertEqual(make_dafsa.to_words(node1), words)
-
- def testDiamond(self):
- """Tests a diamond can be expanded to a word list."""
-
- # 'cd'
- # / \
- # 'ab' 'gh'
- # \ /
- # 'ef'
-
- node4 = ( 'gh', [ None ] )
- node3 = ( 'ef', [ node4 ] )
- node2 = ( 'cd', [ node4 ] )
- node1 = ( 'ab', [ node2, node3 ] )
- words = [ 'abcdgh', 'abefgh' ]
- self.assertEqual(make_dafsa.to_words(node1), words)
-
-
-class JoinLabelsTest(unittest.TestCase):
- def testLabel(self):
- """Tests a single label passes unchanged."""
-
- # 'a' => 'a'
-
- node1 = ( 'a', [ None ] )
- source = [ node1 ]
- self.assertEqual(make_dafsa.join_labels(source), source)
-
- def testInnerTerminator(self):
- """Tests a sequence with an inner terminator passes unchanged."""
-
- # 'a' -> 'b' 'a' -> 'b'
- # \ => \
- # {sink} {sink}
-
- node2 = ( 'b', [ None ] )
- node1 = ( 'a', [ node2, None ] )
- source = [ node1 ]
- self.assertEqual(make_dafsa.join_labels(source), source)
-
- def testLabels(self):
- """Tests a sequence of labels can be joined."""
-
- # 'a' -> 'b' => 'ab'
-
- node2 = ( 'b', [ None ] )
- node1 = ( 'a', [ node2 ] )
- source1 = [ node1 ]
- node3 = ( 'ab', [ None ] )
- source2 = [ node3 ]
- self.assertEqual(make_dafsa.join_labels(source1), source2)
-
- def testCompositeLabels(self):
- """Tests a sequence of multi character labels can be joined."""
-
- # 'ab' -> 'cd' => 'abcd'
-
- node2 = ( 'cd', [ None ] )
- node1 = ( 'ab', [ node2 ] )
- source1 = [ node1 ]
- node3 = ( 'abcd', [ None ] )
- source2 = [ node3 ]
- self.assertEqual(make_dafsa.join_labels(source1), source2)
-
- def testAtomicTrie(self):
- """Tests a trie formed DAFSA with atomic labels passes unchanged."""
-
- # 'b' 'b'
- # / /
- # 'a' => 'a'
- # \ \
- # 'c' 'c'
-
- node3 = ( 'c', [ None ] )
- node2 = ( 'b', [ None ] )
- node1 = ( 'a', [ node2, node3 ] )
- source = [ node1 ]
- self.assertEqual(make_dafsa.join_labels(source), source)
-
- def testReverseAtomicTrie(self):
- """Tests a reverse trie formed DAFSA with atomic labels passes unchanged."""
-
- # 'a' 'a'
- # \ \
- # 'c' => 'c'
- # / /
- # 'b' 'b'
-
- node3 = ( 'c', [ None ] )
- node2 = ( 'b', [ node3 ] )
- node1 = ( 'a', [ node3 ] )
- source = [ node1, node2 ]
- self.assertEqual(make_dafsa.join_labels(source), source)
-
- def testChainedTrie(self):
- """Tests a trie formed DAFSA with chained labels can be joined."""
-
- # 'c' -> 'd' 'cd'
- # / /
- # 'a' -> 'b' => 'ab'
- # \ \
- # 'e' -> 'f' 'ef'
-
- node6 = ( 'f', [ None ] )
- node5 = ( 'e', [ node6 ] )
- node4 = ( 'd', [ None ] )
- node3 = ( 'c', [ node4 ] )
- node2 = ( 'b', [ node3, node5 ] )
- node1 = ( 'a', [ node2 ] )
- source1 = [ node1 ]
- node9 = ( 'ef', [ None ] )
- node8 = ( 'cd', [ None ] )
- node7 = ( 'ab', [ node8, node9 ] )
- source2 = [ node7 ]
- self.assertEqual(make_dafsa.join_labels(source1), source2)
-
- def testReverseChainedTrie(self):
- """Tests a reverse trie formed DAFSA with chained labels can be joined."""
-
- # 'a' -> 'b' 'ab'
- # \ \
- # 'e' -> 'f' => 'ef'
- # / /
- # 'c' -> 'd' 'cd'
-
- node6 = ( 'f', [ None ] )
- node5 = ( 'e', [ node6 ] )
- node4 = ( 'd', [ node5 ] )
- node3 = ( 'c', [ node4 ] )
- node2 = ( 'b', [ node5 ] )
- node1 = ( 'a', [ node2 ] )
- source1 = [ node1, node3 ]
- node9 = ( 'ef', [ None ] )
- node8 = ( 'cd', [ node9 ] )
- node7 = ( 'ab', [ node9 ] )
- source2 = [ node7, node8 ]
- self.assertEqual(make_dafsa.join_labels(source1), source2)
-
-
-class JoinSuffixesTest(unittest.TestCase):
- def testSingleLabel(self):
- """Tests a single label passes unchanged."""
-
- # 'a' => 'a'
-
- node1 = ( 'a', [ None ] )
- source = [ node1 ]
- self.assertEqual(make_dafsa.join_suffixes(source), source)
-
- def testInnerTerminator(self):
- """Tests a sequence with an inner terminator passes unchanged."""
-
- # 'a' -> 'b' 'a' -> 'b'
- # \ => \
- # {sink} {sink}
-
- node2 = ( 'b', [ None ] )
- node1 = ( 'a', [ node2, None ] )
- source = [ node1 ]
- self.assertEqual(make_dafsa.join_suffixes(source), source)
-
- def testDistinctTrie(self):
- """Tests a trie formed DAFSA with distinct labels passes unchanged."""
-
- # 'b' 'b'
- # / /
- # 'a' => 'a'
- # \ \
- # 'c' 'c'
-
- node3 = ( 'c', [ None ] )
- node2 = ( 'b', [ None ] )
- node1 = ( 'a', [ node2, node3 ] )
- source = [ node1 ]
- self.assertEqual(make_dafsa.join_suffixes(source), source)
-
- def testReverseDistinctTrie(self):
- """Tests a reverse trie formed DAFSA with distinct labels passes unchanged.
- """
-
- # 'a' 'a'
- # \ \
- # 'c' => 'c'
- # / /
- # 'b' 'b'
-
- node3 = ( 'c', [ None ] )
- node2 = ( 'b', [ node3 ] )
- node1 = ( 'a', [ node3 ] )
- source = [ node1, node2 ]
- self.assertEqual(make_dafsa.join_suffixes(source), source)
-
- def testJoinTwoHeads(self):
- """Tests two heads can be joined even if there is something else between."""
-
- # 'a' ------'a'
- # /
- # 'b' => 'b' /
- # /
- # 'a' ---
- #
- # The picture above shows that the new version should have just one
- # instance of the node with label 'a'.
-
- node3 = ( 'a', [ None ] )
- node2 = ( 'b', [ None ] )
- node1 = ( 'a', [ None ] )
- source1 = [ node1, node2, node3 ]
- source2 = make_dafsa.join_suffixes(source1)
-
- # Both versions should expand to the same content.
- self.assertEqual(source1, source2)
- # But the new version should have just one instance of 'a'.
- self.assertIs(source2[0], source2[2])
-
- def testJoinTails(self):
- """Tests tails can be joined."""
-
- # 'a' -> 'c' 'a'
- # \
- # => 'c'
- # /
- # 'b' -> 'c' 'b'
-
- node4 = ( 'c', [ None ] )
- node3 = ( 'b', [ node4 ] )
- node2 = ( 'c', [ None ] )
- node1 = ( 'a', [ node2 ] )
- source1 = [ node1, node3 ]
- source2 = make_dafsa.join_suffixes(source1)
-
- # Both versions should expand to the same content.
- self.assertEqual(source1, source2)
- # But the new version should have just one tail.
- self.assertIs(source2[0][1][0], source2[1][1][0])
-
- def testMakeRecursiveTrie(self):
- """Tests recursive suffix join."""
-
- # 'a' -> 'e' -> 'g' 'a'
- # \
- # 'e'
- # / \
- # 'b' -> 'e' -> 'g' 'b' \
- # \
- # => 'g'
- # /
- # 'c' -> 'f' -> 'g' 'c' /
- # \ /
- # 'f'
- # /
- # 'd' -> 'f' -> 'g' 'd'
-
- node7 = ( 'g', [ None ] )
- node6 = ( 'f', [ node7 ] )
- node5 = ( 'e', [ node7 ] )
- node4 = ( 'd', [ node6 ] )
- node3 = ( 'c', [ node6 ] )
- node2 = ( 'b', [ node5 ] )
- node1 = ( 'a', [ node5 ] )
- source1 = [ node1, node2, node3, node4 ]
- source2 = make_dafsa.join_suffixes(source1)
-
- # Both versions should expand to the same content.
- self.assertEqual(source1, source2)
- # But the new version should have just one 'e'.
- self.assertIs(source2[0][1][0], source2[1][1][0])
- # And one 'f'.
- self.assertIs(source2[2][1][0], source2[3][1][0])
- # And one 'g'.
- self.assertIs(source2[0][1][0][1][0], source2[2][1][0][1][0])
-
- def testMakeDiamond(self):
- """Test we can join suffixes of a trie."""
-
- # 'b' -> 'd' 'b'
- # / / \
- # 'a' => 'a' 'd'
- # \ \ /
- # 'c' -> 'd' 'c'
-
- node5 = ( 'd', [ None ] )
- node4 = ( 'c', [ node5 ] )
- node3 = ( 'd', [ None ] )
- node2 = ( 'b', [ node3 ] )
- node1 = ( 'a', [ node2, node4 ] )
- source1 = [ node1 ]
- source2 = make_dafsa.join_suffixes(source1)
-
- # Both versions should expand to the same content.
- self.assertEqual(source1, source2)
- # But the new version should have just one 'd'.
- self.assertIs(source2[0][1][0][1][0], source2[0][1][1][1][0])
-
- def testJoinOneChild(self):
- """Tests that we can join some children but not all."""
-
- # 'c' ----'c'
- # / / /
- # 'a' 'a' /
- # \ \ /
- # 'd' 'd'/
- # => /
- # 'c' /
- # / /
- # 'b' 'b'
- # \ \
- # 'e' 'e'
-
- node6 = ( 'e', [ None ] )
- node5 = ( 'c', [ None ] )
- node4 = ( 'b', [ node5, node6 ] )
- node3 = ( 'd', [ None ] )
- node2 = ( 'c', [ None ] )
- node1 = ( 'a', [ node2, node3 ] )
- source1 = [ node1, node4 ]
- source2 = make_dafsa.join_suffixes(source1)
-
- # Both versions should expand to the same content.
- self.assertEqual(source1, source2)
- # But the new version should have just one 'c'.
- self.assertIs(source2[0][1][0], source2[1][1][0])
-
-
-class ReverseTest(unittest.TestCase):
- def testAtomicLabel(self):
- """Tests an atomic label passes unchanged."""
-
- # 'a' => 'a'
-
- node1 = ( 'a', [ None ] )
- source = [ node1 ]
- self.assertEqual(make_dafsa.reverse(source), source)
-
- def testLabel(self):
- """Tests that labels are reversed."""
-
- # 'ab' => 'ba'
-
- node1 = ( 'ab', [ None ] )
- source1 = [ node1 ]
- node2 = ( 'ba', [ None ] )
- source2 = [ node2 ]
- self.assertEqual(make_dafsa.reverse(source1), source2)
-
- def testChain(self):
- """Tests that edges are reversed."""
-
- # 'a' -> 'b' => 'b' -> 'a'
-
- node2 = ( 'b', [ None ] )
- node1 = ( 'a', [ node2 ] )
- source1 = [ node1 ]
- node4 = ( 'a', [ None ] )
- node3 = ( 'b', [ node4 ] )
- source2 = [ node3 ]
- self.assertEqual(make_dafsa.reverse(source1), source2)
-
- def testInnerTerminator(self):
- """Tests a sequence with an inner terminator can be reversed."""
-
- # 'a' -> 'b' 'b' -> 'a'
- # \ => /
- # {sink} ------
-
- node2 = ( 'b', [ None ] )
- node1 = ( 'a', [ node2, None ] )
- source1 = [ node1 ]
- node4 = ( 'a', [ None ] )
- node3 = ( 'b', [ node4 ] )
- source2 = [ node3, node4 ]
- self.assertEqual(make_dafsa.reverse(source1), source2)
-
- def testAtomicTrie(self):
- """Tests a trie formed DAFSA can be reversed."""
-
- # 'b' 'b'
- # / \
- # 'a' => 'a'
- # \ /
- # 'c' 'c'
-
- node3 = ( 'c', [ None ] )
- node2 = ( 'b', [ None ] )
- node1 = ( 'a', [ node2, node3 ] )
- source1 = [ node1 ]
- node6 = ( 'a', [ None ] )
- node5 = ( 'c', [ node6 ] )
- node4 = ( 'b', [ node6 ] )
- source2 = [ node4, node5 ]
- self.assertEqual(make_dafsa.reverse(source1), source2)
-
- def testReverseAtomicTrie(self):
- """Tests a reverse trie formed DAFSA can be reversed."""
-
- # 'a' 'a'
- # \ /
- # 'c' => 'c'
- # / \
- # 'b' 'b'
-
- node3 = ( 'c', [ None ] )
- node2 = ( 'b', [ node3 ] )
- node1 = ( 'a', [ node3 ] )
- source1 = [ node1, node2 ]
- node6 = ( 'b', [ None ] )
- node5 = ( 'a', [ None ] )
- node4 = ( 'c', [ node5, node6 ] )
- source2 = [ node4 ]
- self.assertEqual(make_dafsa.reverse(source1), source2)
-
- def testDiamond(self):
- """Tests we can reverse both edges and nodes in a diamond."""
-
- # 'cd' 'dc'
- # / \ / \
- # 'ab' 'gh' => 'hg' 'ba'
- # \ / \ /
- # 'ef' 'fe'
-
- node4 = ( 'gh', [ None ] )
- node3 = ( 'ef', [ node4 ] )
- node2 = ( 'cd', [ node4 ] )
- node1 = ( 'ab', [ node2, node3 ] )
- source1 = [ node1 ]
- node8 = ( 'ba', [ None ] )
- node7 = ( 'fe', [ node8 ] )
- node6 = ( 'dc', [ node8 ] )
- node5 = ( 'hg', [ node6, node7 ] )
- source2 = [ node5 ]
- self.assertEqual(make_dafsa.reverse(source1), source2)
-
-
-class TopSortTest(unittest.TestCase):
- def testNode(self):
- """Tests a DAFSA with one node can be sorted."""
-
- # 'a' => [ 'a' ]
-
- node1 = ( 'a', [ None ] )
- source = [ node1 ]
- nodes = [ node1 ]
- self.assertEqual(make_dafsa.top_sort(source), nodes)
-
- def testDiamond(self):
- """Tests nodes in a diamond can be sorted."""
-
- # 'b'
- # / \
- # 'a' 'd'
- # \ /
- # 'c'
-
- node4 = ( 'd', [ None ] )
- node3 = ( 'c', [ node4 ] )
- node2 = ( 'b', [ node4 ] )
- node1 = ( 'a', [ node2, node3 ] )
- source = [ node1 ]
- nodes = make_dafsa.top_sort(source)
- self.assertLess(nodes.index(node1), nodes.index(node2))
- self.assertLess(nodes.index(node2), nodes.index(node4))
- self.assertLess(nodes.index(node3), nodes.index(node4))
-
-
-class EncodePrefixTest(unittest.TestCase):
- def testChar(self):
- """Tests to encode a single character prefix."""
- label = 'a'
- bytes = [ ord('a') ]
- self.assertEqual(make_dafsa.encode_prefix(label), bytes)
-
- def testChars(self):
- """Tests to encode a multi character prefix."""
- label = 'ab'
- bytes = [ ord('b'), ord('a') ]
- self.assertEqual(make_dafsa.encode_prefix(label), bytes)
-
-
-class EncodeLabelTest(unittest.TestCase):
- def testChar(self):
- """Tests to encode a single character label."""
- label = 'a'
- bytes = [ ord('a') + 0x80 ]
- self.assertEqual(make_dafsa.encode_label(label), bytes)
-
- def testChars(self):
- """Tests to encode a multi character label."""
- label = 'ab'
- bytes = [ ord('b') + 0x80, ord('a') ]
- self.assertEqual(make_dafsa.encode_label(label), bytes)
-
-
-class EncodeLinksTest(unittest.TestCase):
- def testEndLabel(self):
- """Tests to encode link to the sink."""
- children = [ None ]
- offsets = {}
- bytes = 0
- output = []
- self.assertEqual(make_dafsa.encode_links(children, offsets, bytes),
- output)
-
- def testOneByteOffset(self):
- """Tests to encode a single one byte offset."""
- node = ( '', [ None ] )
- children = [ node ]
- offsets = { id(node) : 2 }
- bytes = 5
- output = [ 132 ]
- self.assertEqual(make_dafsa.encode_links(children, offsets, bytes),
- output)
-
- def testOneByteOffsets(self):
- """Tests to encode a sequence of one byte offsets."""
- node1 = ( '', [ None ] )
- node2 = ( '', [ None ] )
- children = [ node1, node2 ]
- offsets = { id(node1) : 2, id(node2) : 1 }
- bytes = 5
- output = [ 129, 5 ]
- self.assertEqual(make_dafsa.encode_links(children, offsets, bytes),
- output)
-
- def testTwoBytesOffset(self):
- """Tests to encode a single two byte offset."""
- node = ( '', [ None ] )
- children = [ node ]
- offsets = { id(node) : 2 }
- bytes = 1005
- output = [ 237, 195]
- self.assertEqual(make_dafsa.encode_links(children, offsets, bytes),
- output)
-
- def testTwoBytesOffsets(self):
- """Tests to encode a sequence of two byte offsets."""
- node1 = ( '', [ None ] )
- node2 = ( '', [ None ] )
- node3 = ( '', [ None ] )
- children = [ node1, node2, node3 ]
- offsets = { id(node1) : 1002, id(node2) : 2, id(node3) : 2002 }
- bytes = 3005
- output = [ 232, 195, 232, 67, 241, 67 ]
- self.assertEqual(make_dafsa.encode_links(children, offsets, bytes),
- output)
-
- def testThreeBytesOffset(self):
- """Tests to encode a single three byte offset."""
- node = ( '', [ None ] )
- children = [ node ]
- offsets = { id(node) : 2 }
- bytes = 100005
- output = [ 166, 134, 225 ]
- self.assertEqual(make_dafsa.encode_links(children, offsets, bytes),
- output)
-
- def testThreeBytesOffsets(self):
- """Tests to encode a sequence of three byte offsets."""
- node1 = ( '', [ None ] )
- node2 = ( '', [ None ] )
- node3 = ( '', [ None ] )
- children = [ node1, node2, node3 ]
- offsets = { id(node1) : 100002, id(node2) : 2, id(node3) : 200002 }
- bytes = 300005
- output = [ 160, 134, 225, 160, 134, 97, 172, 134, 97 ]
- self.assertEqual(make_dafsa.encode_links(children, offsets, bytes),
- output)
-
- def testOneTwoThreeBytesOffsets(self):
- """Tests to encode offsets of different sizes."""
- node1 = ( '', [ None ] )
- node2 = ( '', [ None ] )
- node3 = ( '', [ None ] )
- children = [ node1, node2, node3 ]
- offsets = { id(node1) : 10003, id(node2) : 10002, id(node3) : 100002 }
- bytes = 300005
- output = [ 129, 143, 95, 97, 74, 13, 99 ]
- self.assertEqual(make_dafsa.encode_links(children, offsets, bytes),
- output)
-
-
-class ExamplesTest(unittest.TestCase):
- def testExample1(self):
- """Tests Example 1 from make_dafsa.py."""
- infile = [ '%%', 'aa, 1', 'a, 2', '%%' ]
- bytes = [ 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81 ]
- outfile = make_dafsa.to_cxx(bytes)
- self.assertEqual(make_dafsa.words_to_cxx(make_dafsa.parse_gperf(infile)),
- outfile)
-
- def testExample2(self):
- """Tests Example 2 from make_dafsa.py."""
- infile = [ '%%', 'aa, 1', 'bbb, 2', 'baa, 1', '%%' ]
- bytes = [ 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62,
- 0x82 ]
- outfile = make_dafsa.to_cxx(bytes)
- self.assertEqual(make_dafsa.words_to_cxx(make_dafsa.parse_gperf(infile)),
- outfile)
-
-
-if __name__ == '__main__':
- unittest.main()
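Finally, a short usage sketch (again illustrative, assuming make_dafsa.py is importable from the current directory) tying the pieces together the same way ExamplesTest does, using Example 1's input:

import make_dafsa

# Example 1 input in gperf form, as a list of already-read lines.
gperf_lines = ['%%', 'aa, 1', 'a, 2', '%%']
words = make_dafsa.parse_gperf(gperf_lines)    # -> ['aa1', 'a2']
print(make_dafsa.words_to_cxx(words))          # prints the generated kDafsa C++ array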