summaryrefslogtreecommitdiff
path: root/lib/compression
diff options
context:
space:
mode:
authorDouglas Bagnall <douglas.bagnall@catalyst.net.nz>2022-11-22 08:23:30 +1300
committerJoseph Sutton <jsutton@samba.org>2022-12-01 22:56:39 +0000
commitdadecede544519e4747a8623a1f5e0d0a1450002 (patch)
tree7f7595390dba6d7962374a0dca241ce6a067de7c /lib/compression
parentbce33816ec9160373110f0ccbf3174c301218c9a (diff)
downloadsamba-dadecede544519e4747a8623a1f5e0d0a1450002.tar.gz
lib/compression: helper script to make unbalanced data
Huffman tree re-quantisation and perhaps other code paths are only triggered by pathological data like this. Signed-off-by: Douglas Bagnall <douglas.bagnall@catalyst.net.nz> Reviewed-by: Joseph Sutton <josephsutton@catalyst.net.nz>
Diffstat (limited to 'lib/compression')
-rwxr-xr-xlib/compression/tests/scripts/make-test-vectors185
1 files changed, 185 insertions, 0 deletions
diff --git a/lib/compression/tests/scripts/make-test-vectors b/lib/compression/tests/scripts/make-test-vectors
new file mode 100755
index 00000000000..6f25866bf19
--- /dev/null
+++ b/lib/compression/tests/scripts/make-test-vectors
@@ -0,0 +1,185 @@
+#!/usr/bin/python3
+"""Generate a few strings with unbalanced distributions to test the
+regeneration of the Huffman tree when it gets too deep.
+
+USAGE: make-test-vectors DIR
+
+This will fill up DIR with test files.
+"""
+import sys
+import random
+from collections import defaultdict
+
+
# Minimal CLI handling: print the module docstring as usage for
# -h/--help or a wrong argument count.
if '--help' in sys.argv or '-h' in sys.argv or len(sys.argv) != 2:
    print(__doc__)
    # Exit status is 1 for a wrong argument count, 0 for an explicit
    # --help/-h with the right count.  sys.exit() is preferred over the
    # site-injected exit() builtin, which is not guaranteed in scripts.
    sys.exit(len(sys.argv) != 2)


# Output directory for the generated test files.
DIR = sys.argv[1]

SIZE = (1 << 17) + 23  # two and a bit blocks
SIZE_NAME = "128k+"
# SIZE = (1 << 16)
# SIZE_NAME = "64"


# Fixed seed so the generated vectors are reproducible run to run.
random.seed(1)
+
+
def squares(n):
    """Return n bytes where each value is the scaled product of two
    uniform variates, skewing the distribution toward small values."""
    return bytes(int(random.random() * random.random() * 256)
                 for _ in range(n))
+
+
def skewed_choices(n):
    """Return n bytes drawn from 0-255 with probability proportional to
    the value itself (so 0 never occurs and 255 is most likely)."""
    population = list(range(256))
    return bytes(random.choices(population, weights=population, k=n))
+
+
def fib_shuffle(n):
    """Return n bytes where byte value i occurs with Fibonacci-number
    frequency (value 0 once, 1 twice, 2 three times, 3 five times, ...),
    shuffled into random order."""
    pool = []
    count, prev = 1, 1
    i = 0
    # stop at 100 distinct values or once the pool tops a million entries
    while i < 100 and len(pool) <= 1000000:
        pool.extend([i] * count)
        count, prev = count + prev, count
        i += 1
    random.shuffle(pool)
    return bytes(pool[:n])
+
+
def exp_shuffle(n):
    """Return n bytes where byte value i occurs about 1.04**i times,
    shuffled — a gently exponential frequency skew."""
    pool = []
    for value in range(256):
        pool.extend([value] * int(1.04 ** value))
        # safety valve against an over-large pool
        if len(pool) > 1000000:
            break
    random.shuffle(pool)
    return bytes(pool[:n])
+
+
def and_rand(n):
    """Return n bytes, each the bitwise AND of two uniform random bytes,
    which biases the result toward values with few set bits."""
    return bytes(random.randrange(256) & random.randrange(256)
                 for _ in range(n))
+
+
def betavar(n, a, b):
    """Return n bytes drawn from a beta(a, b) distribution scaled to
    0-255; the 255.999... factor keeps the top value reachable without
    ever rounding up to 256."""
    return bytes(int(random.betavariate(a, b) * 255.999999999999)
                 for _ in range(n))
+
+
def repeated_alphabet(n):
    """Return n bytes cycling through 'abc...z' repeatedly."""
    alphabet = b'abcdefghijklmnopqrstuvwxyz'
    return bytes(alphabet[i % len(alphabet)] for i in range(n))
+
+
def decayed_alphabet(n):
    """Return the repeated alphabet with 256 random positions overwritten
    by the byte values 0-255, one write per value."""
    buf = bytearray(repeated_alphabet(n))
    for value in range(256):
        # positions can collide, so fewer than 256 bytes may end up changed
        buf[random.randrange(n)] = value

    return bytes(buf)
+
+
def trigram_model(n):
    """Return n bytes generated by a trigram model trained on this
    script's own source: each next byte is chosen uniformly from the
    bytes that followed the current two-byte context in the file."""
    with open(__file__, 'rb') as f:
        data = f.read()
    followers = defaultdict(list)
    for a, b, c in zip(data, data[1:], data[2:]):
        followers[bytes([a, b])].append(c)

    context = random.choice(list(followers.keys()))
    out = []
    prev = context[1]
    # generate 10 extra bytes, then drop them as warm-up
    for _ in range(n + 10):
        nxt = random.choice(followers[context])
        out.append(nxt)
        context = bytes([prev, nxt])
        prev = nxt

    return bytes(out[10:])
+
+
def trigram_sum_model(n):
    """Like trigram_model, but the context is the *sum* of the previous
    two bytes (0-510), which conflates many real contexts and makes the
    output noisier."""
    with open(__file__, 'rb') as f:
        data = f.read()
    # seed every bucket with one random byte so no bucket is ever empty
    table = [[random.randrange(256)] for _ in range(512)]
    for a, b, c in zip(data, data[1:], data[2:]):
        table[a + b].append(c)

    out = []
    start = random.randrange(len(data) - 1)
    first = data[start]
    second = data[start + 1]

    # generate 10 warm-up bytes, then n real ones
    for _ in range(n + 10):
        nxt = random.choice(table[first + second])
        out.append(nxt)
        first, second = second, nxt

    return bytes(out[10:])
+
+
def the_classics():
    """Generate the original eight skewed distributions, print a byte
    frequency table for eyeballing, and write each series to a file
    under DIR."""
    # this used to be main()
    named_series = [
        ("square_series", squares(SIZE)),
        ("skewed_choices", skewed_choices(SIZE)),
        ("fib_shuffle", fib_shuffle(SIZE)),
        ("exp_shuffle", exp_shuffle(SIZE)),
        ("and_rand", and_rand(SIZE)),
        ("beta-variate1", betavar(SIZE, 0.1, 1.5)),
        ("beta-variate2", betavar(SIZE, 0.5, 2.0)),
        ("beta-variate3", betavar(SIZE, 0.05, 0.05)),
    ]
    sq, ch, fs, es, ar, bv1, bv2, bv3 = (s for _, s in named_series)

    print("n sq ch fs es")
    for i in range(256):
        print(f"{i:3} {sq.count(i):5} {ch.count(i):5} "
              f"{fs.count(i):5} {es.count(i):5}"
              f"{ar.count(i):5} {bv1.count(i):5}"
              f"{bv2.count(i):5} {bv3.count(i):5}"
              )

    # NOTE(review): these filenames join with '-' while main() joins
    # with '_' — presumably intentional; kept as-is.
    for fn, series in named_series:
        with open(f"{DIR}/{fn}-{SIZE_NAME}", "wb") as f:
            f.write(series)
+
+
def main():
    """Write all the test vectors into DIR: the original set from
    the_classics(), plus the alphabet- and trigram-based series."""
    # (removed a vestigial `if True:` wrapper around this call)
    the_classics()
    for series, fn in ((decayed_alphabet(SIZE), "decayed_alphabet"),
                       (trigram_model(SIZE), "trigram"),
                       (trigram_sum_model(SIZE), "trigram_sum"),
                       ):
        # note: '_' separator here vs '-' in the_classics()
        with open(f"{DIR}/{fn}_{SIZE_NAME}", "wb") as f:
            f.write(series)
+
+
+main()