/*
 * Samba compression library - LGPLv3
 *
 * Copyright © Catalyst IT 2022
 *
 * Written by Douglas Bagnall
 *
 * ** NOTE! The following LGPL license applies to this file.
 * ** It does NOT imply that all of Samba is released under the LGPL
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include <stdarg.h>
#include <stddef.h>
#include <setjmp.h>
#include <cmocka.h>
#include <stdint.h>
#include <sys/stat.h>

#include "replace.h"
#include <talloc.h>
#include "lzxpress_huffman.h"
#include "lib/util/stable_sort.h"
#include "lib/util/data_blob.h"

/* set LZXHUFF_DEBUG_FILES to true to save round-trip files in /tmp. */
#define LZXHUFF_DEBUG_FILES false

/* set LZXHUFF_DEBUG_VERBOSE to true to print more. */
#define LZXHUFF_DEBUG_VERBOSE false

#if LZXHUFF_DEBUG_VERBOSE
#define debug_message(...) print_message(__VA_ARGS__)

#include <time.h>

struct timespec start = {0};
struct timespec end = {0};

static void debug_start_timer(void)
{
        clock_gettime(CLOCK_MONOTONIC, &start);
}

static void debug_end_timer(const char *name, size_t len)
{
        uint64_t ns;
        double secs;
        double rate;
        clock_gettime(CLOCK_MONOTONIC, &end);
        ns = end.tv_nsec;
        ns += end.tv_sec * 1000 * 1000 * 1000;
        ns -= start.tv_nsec;
        ns -= start.tv_sec * 1000 * 1000 * 1000;
        secs = ns / 1e9;
        rate = len / (secs * 1024 * 1024);
        debug_message("%s %zu bytes in %.2g: \033[1;35m%.2f\033[0m MB per second\n",
                      name, len, secs, rate);
}

#else
#define debug_message(...) /* debug_message */
#define debug_start_timer(...) /* debug_start_timer */
#define debug_end_timer(...) /* debug_end_timer */
#endif


struct lzx_pair {
        const char *name;
        DATA_BLOB compressed;
        DATA_BLOB decompressed;
};

struct lzx_file_pair {
        const char *name;
        const char *compressed_file;
        const char *decompressed_file;
};

#define DECOMP_DIR "testdata/compression/decompressed"
#define COMP_DIR "testdata/compression/compressed-huffman"
#define MORE_COMP_DIR "testdata/compression/compressed-more-huffman"

#define VARRGH(...) __VA_ARGS__

#define BLOB_FROM_ARRAY(...)                                    \
        {                                                       \
                .data = (uint8_t[]){__VA_ARGS__},               \
                .length = sizeof((uint8_t[]){__VA_ARGS__})      \
        }

#define BLOB_FROM_STRING(s)                                     \
        {                                                       \
                .data = discard_const_p(uint8_t, s),            \
                .length = (sizeof(s) - 1)                       \
        }
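
/*
 * For illustration only (a hypothetical use, not one of the test vectors):
 * these macros build DATA_BLOB initialisers from compound literals, so
 *
 *     DATA_BLOB b = BLOB_FROM_ARRAY(0x61, 0x62, 0x63);
 *
 * expands to
 *
 *     DATA_BLOB b = {
 *             .data = (uint8_t[]){0x61, 0x62, 0x63},
 *             .length = sizeof((uint8_t[]){0x61, 0x62, 0x63})   -- which is 3
 *     };
 *
 * while BLOB_FROM_STRING("abc") points .data at the string constant itself
 * and sets .length to sizeof("abc") - 1, i.e. 3, excluding the trailing NUL.
 */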

const char * file_names[] = {
        "27826-8.txt",
        "5d049b4cb1bd933f5e8ex19",
        "638e61e96d54279981c3x5",
        "64k-minus-one-zeros",
        "64k-plus-one-zeros",
        "64k-zeros",
        "96f696a4e5ce56c61a3dx10",
        "9e0b6a12febf38e98f13",
        "abc-times-101",
        "abc-times-105",
        "abc-times-200",
        "and_rand",
        "and_rand-128k+",
        "b63289ccc7f218c0d56b",
        "beta-variate1-128k+",
        "beta-variate2-128k+",
        "beta-variate3-128k+",
        "decayed_alphabet_128k+",
        "decayed_alphabet_64k",
        "exp_shuffle",
        "exp_shuffle-128k+",
        "f00842317dc6d5695b02",
        "fib_shuffle",
        "fib_shuffle-128k+",
        "fuzzing-0fc2d461b56cd8103c91",
        "fuzzing-17c961778538cc10ab7c",
        "fuzzing-3591f9dc02bb00a54b60",
        "fuzzing-3ec3bca27bb9eb40c128",
        "fuzzing-80b4fa18ff5f8dd04862",
        "fuzzing-a3115a81d1ac500318f9",
        "generate-windows-test-vectors.c",
        "midsummer-nights-dream.txt",
        "notes-on-the-underground.txt",
        "pg22009.txt",
        "repeating",
        "repeating-exactly-64k",
        "setup.log",
        "skewed_choices",
        "skewed_choices-128k+",
        /* These ones were deathly slow in fuzzing at one point */
        "slow-015ddc36a71412ccc50d",
        "slow-100e9f966a7feb9ca40a",
        "slow-2a671c3cff4f1574cbab",
        "slow-33d90a24e70515b14cd0",
        "slow-49d8c05261e3f412fc72",
        "slow-50a249d2fe56873e56a0",
        "slow-63e9f0b52235fb0129fa",
        "slow-73b7f971d65908ac0095",
        "slow-8b61e3dd267908544531",
        "slow-9d1c5a079b0462986f1f",
        "slow-aa7262a821dabdcf04a6",
        "slow-b8a91d142b0d2af7f5ca",
        "slow-c79142457734bbc8d575",
        "slow-d736544545b90d83fe75",
        "slow-e3b9bdfaed7d1a606fdb",
        "slow-f3f1c02a9d006e5e1703",
        "square_series",
        "square_series-128k+",
        "trigram_128k+",
        "trigram_64k",
        "trigram_sum_128k+",
        "trigram_sum_64k",
        NULL
};


struct lzx_pair bidirectional_pairs[] = {

        {.name = "abc__100_repeats", /* [MS-XCA] 3.2 example 2. */
         .decompressed = BLOB_FROM_STRING(
                 "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
                 "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
                 "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
                 "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
                 "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
                 ),
         .compressed = BLOB_FROM_ARRAY(
                 /*
                  * The 'a', 'b', and 'c' bytes are 0x61, 0x62, 0x63. No other
                  * symbols occur. That means we need 48 0x00 bytes for the
                  * first 96 symbol-nybbles, then some short codes, then zeros
                  * again for the rest of the literals.
                  */
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,
                 0x30, 0x23, /* a_ cb */
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 100 bytes */
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
                 0,0,0, /* here end the 0-255 literals (128 bytes) */
                 0x02, /* 'EOF' symbol 256 (byte 128 low) */
                 0,0,0,0,0, 0,0,0,0,0, 0, /* 140 bytes */
                 0,0,0,
                 0x20, /* codepoint 287 (byte 143 high) */
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0, /* 160 bytes */
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
                 0,0,0,0,0, /* 240 bytes */
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
                 /*
                  * So that's the tree.
                  *
                  * length 2 codes for 'c', EOF, 287
                  * length 3 for 'a', 'b'.
                  *
                  * c:   00
                  * EOF: 01
                  * 287: 10
                  * a:   110
                  * b:   111
                  *
                  * thus the literal string "abc" is 110-111-00.
                  *
                  * Now for the lz77 match definitions for EOF and 287.
                  *
                  * Why 287? It encodes the match distance and offset.
                  *
                  * 287 - 256 = 31
                  *
                  * _length = 31 % 16 = 15
                  * _distance = 31 / 16 = 1
                  *
                  * (it's easier to look at the hex, 0x11f:
                  * 1xx means a match; x1x is _distance; xxf is _length)
                  *
                  * _distance 1 means a two bit distance (10 or 11; 2 or 3).
                  * That means the next bit will be the least significant bit
                  * of distance (1 in this case, meaning distance 3).
                  *
                  * if _length is 15, real length is included inline.
                  *
                  * 'EOF' == 256 means _length = 0, _distance = 0.
                  *
                  * _distance 0 means 1, so no further bits needed.
                  * _length 0 means length 3.
                  *
                  * but when used as EOF, this doesn't matter.
                  */
                 0xa8, 0xdc, 0x00, 0x00, 0xff, 0x26, 0x01
                 /* These remaining bytes are:
                  *
                  * 10101000 11011100 00000000 00000000 11111111
                  * 00100110 00000001
                  *
                  * and we read them as 16 bit chunks (i.e. flipping odd/even
                  * byte order)
                  *
                  * 110-111-00 10-1-01-000
                  *   a   b  c 287 | EOF
                  *                |
                  *                this is the 287 distance low bit.
                  *
                  * The last 3 bits are not used. The 287 length is sort of
                  * out of band, coming up soon (because 287 encodes length
                  * 15; most codes do not do this).
                  *
                  * 00000000 00000000
                  *
                  * This padding is there because the first 32 bits are read
                  * at the beginning of decoding. If there were more things to
                  * be encoded, they would be in here.
                  *
                  * 11111111
                  *
                  * This byte is pulled as the length for the 287 match.
                  * Because it is 0xff, we pull a further 2 bytes for the
                  * actual length, i.e. a 16 bit number greater than 270.
                  *
                  * 00000001 00100110
                  *
                  * that is 0x126 = 294 = the match length - 3 (because we're
                  * encoding ["abc", <a 297 byte match>, EOF]).
                  *
                  */
                 )
        },
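
        /*
         * A worked restatement of the symbol arithmetic above (illustrative
         * only, derived from the explanation, not copied from [MS-XCA]):
         * for a match symbol sym in 256..511,
         *
         *     code = sym - 256              e.g. 287 - 256 = 31 = 0x1f
         *     _length = code & 0x0f         31 % 16 = 15, the escape value,
         *                                   so the real length follows later
         *     _distance = code >> 4         31 / 16 = 1, a two bit distance,
         *                                   so one more stream bit picks
         *                                   between 2 and 3
         *
         * With that extra bit set, this is exactly the distance-3 match used
         * by "abc__100_repeats" above.
         */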

        {.name = "abcdefghijklmnopqrstuvwxyz", /* [MS-XCA] 3.2 example 1. */
         .decompressed = BLOB_FROM_STRING("abcdefghijklmnopqrstuvwxyz"),
         .compressed = BLOB_FROM_ARRAY(
                 /*
                  * In this case there are no matches encoded as there are no
                  * repeated symbols. Including the EOF, there are 27 symbols
                  * all occurring exactly as frequently as each other (once).
                  * From that we would expect the codes to be mostly 5 bits
                  * long, because 27 < 2^5 (32), but greater than 2^4. And
                  * that's what we see.
                  */
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,
                 /* 14 non-zero bytes for 26 letters/nibbles */
                 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
                 0x55, 0x55, 0x55, 0x55, 0x45, 0x44, 0x04,
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0, /* 80 */
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,
                 0x04, /* 0x100 EOF */
                 /* no matches */
                 0,0,0,0,0, 0,0,0,0,0, 0, /* 140 */
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 240 */
                 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
                 0xd8, 0x52, 0x3e, 0xd7, 0x94, 0x11, 0x5b, 0xe9,
                 0x19, 0x5f, 0xf9, 0xd6, 0x7c, 0xdf, 0x8d, 0x04,
                 0x00, 0x00, 0x00, 0x00)
        },
        {0}
};


static void test_lzxpress_huffman_decompress(void **state)
{
        size_t i;
        ssize_t written;
        uint8_t *dest = NULL;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        for (i = 0; bidirectional_pairs[i].name != NULL; i++) {
                struct lzx_pair p = bidirectional_pairs[i];
                dest = talloc_array(mem_ctx, uint8_t, p.decompressed.length);
                debug_message("%s compressed %zu decomp %zu\n", p.name,
                              p.compressed.length,
                              p.decompressed.length);

                written = lzxpress_huffman_decompress(p.compressed.data,
                                                      p.compressed.length,
                                                      dest,
                                                      p.decompressed.length);
                assert_int_not_equal(written, -1);
                assert_int_equal(written, p.decompressed.length);

                assert_memory_equal(dest, p.decompressed.data,
                                    p.decompressed.length);
                talloc_free(dest);
        }
}


static void test_lzxpress_huffman_compress(void **state)
{
        size_t i;
        ssize_t written;
        uint8_t *dest = NULL;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        for (i = 0; bidirectional_pairs[i].name != NULL; i++) {
                struct lzx_pair p = bidirectional_pairs[i];
                debug_message("%s compressed %zu decomp %zu\n", p.name,
                              p.compressed.length,
                              p.decompressed.length);

                written = lzxpress_huffman_compress_talloc(mem_ctx,
                                                           p.decompressed.data,
                                                           p.decompressed.length,
                                                           &dest);
                assert_int_not_equal(written, -1);
                assert_int_equal(written, p.compressed.length);
                assert_memory_equal(dest, p.compressed.data,
                                    p.compressed.length);
                talloc_free(dest);
        }
}
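
/*
 * A minimal usage sketch of the two calls exercised above, for orientation
 * only. It is kept out of the build with #if 0; the real coverage is in the
 * tests. The helper name example_round_trip is hypothetical; the calls are
 * the same ones used throughout this file: lzxpress_huffman_compress_talloc()
 * returns the compressed length or -1, and lzxpress_huffman_decompress()
 * must be told the expected decompressed length up front.
 */
#if 0
static void example_round_trip(TALLOC_CTX *mem_ctx)
{
        const uint8_t original[] = "an example buffer, compressed and restored";
        uint8_t *compressed = NULL;
        uint8_t restored[sizeof(original)];
        ssize_t clen, dlen;

        clen = lzxpress_huffman_compress_talloc(mem_ctx,
                                                original,
                                                sizeof(original),
                                                &compressed);
        if (clen < 0) {
                return; /* compression failed */
        }
        dlen = lzxpress_huffman_decompress(compressed,
                                           clen,
                                           restored,
                                           sizeof(original));
        /* on success, dlen == sizeof(original) and restored matches */
        talloc_free(compressed);
}
#endif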


static DATA_BLOB datablob_from_file(TALLOC_CTX *mem_ctx,
                                    const char *filename)
{
        DATA_BLOB b = {0};
        FILE *fh = fopen(filename, "rb");
        int ret;
        struct stat s;
        size_t len;
        if (fh == NULL) {
                debug_message("could not open '%s'\n", filename);
                return b;
        }
        ret = fstat(fileno(fh), &s);
        if (ret != 0) {
                fclose(fh);
                return b;
        }
        b.data = talloc_array(mem_ctx, uint8_t, s.st_size);
        if (b.data == NULL) {
                fclose(fh);
                return b;
        }
        len = fread(b.data, 1, s.st_size, fh);
        if (ferror(fh) || len != s.st_size) {
                TALLOC_FREE(b.data);
        } else {
                b.length = len;
        }
        fclose(fh);
        return b;
}


static void test_lzxpress_huffman_decompress_files(void **state)
{
        size_t i;
        int score = 0;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        for (i = 0; file_names[i] != NULL; i++) {
                char filename[200];
                uint8_t *dest = NULL;
                ssize_t written;
                TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
                struct lzx_pair p = {
                        .name = file_names[i]
                };
                debug_message("%s\n", p.name);

                snprintf(filename, sizeof(filename),
                         "%s/%s.decomp", DECOMP_DIR, p.name);

                p.decompressed = datablob_from_file(tmp_ctx, filename);
                assert_non_null(p.decompressed.data);

                snprintf(filename, sizeof(filename),
                         "%s/%s.lzhuff", COMP_DIR, p.name);

                p.compressed = datablob_from_file(tmp_ctx, filename);
                assert_non_null(p.compressed.data);

                dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
                debug_start_timer();
                written = lzxpress_huffman_decompress(p.compressed.data,
                                                      p.compressed.length,
                                                      dest,
                                                      p.decompressed.length);
                debug_end_timer("decompress", p.decompressed.length);
                if (written != -1 &&
                    written == p.decompressed.length &&
                    memcmp(dest, p.decompressed.data,
                           p.decompressed.length) == 0) {
                        debug_message("\033[1;32mdecompressed %s!\033[0m\n",
                                      p.name);
                        score++;
                } else {
                        debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
                                      p.name);
                        debug_message("size %zd vs reference %zu\n",
                                      written, p.decompressed.length);
                }
                talloc_free(tmp_ctx);
        }
        debug_message("%d/%zu correct\n", score, i);
        assert_int_equal(score, i);
}


static void test_lzxpress_huffman_decompress_more_compressed_files(void **state)
{
        /*
         * This tests the decompression of files that have been compressed on
         * Windows with the level turned up (to 1, default for MS-XCA is 0).
         *
         * The format is identical, but it will have tried harder to find
         * matches.
         */
        size_t i;
        int score = 0;
        int found = 0;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        for (i = 0; file_names[i] != NULL; i++) {
                char filename[200];
                uint8_t *dest = NULL;
                ssize_t written;
                TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
                struct lzx_pair p = {
                        .name = file_names[i]
                };
                debug_message("%s\n", p.name);

                snprintf(filename, sizeof(filename),
                         "%s/%s.decomp", DECOMP_DIR, p.name);

                p.decompressed = datablob_from_file(tmp_ctx, filename);
                assert_non_null(p.decompressed.data);

                snprintf(filename, sizeof(filename),
                         "%s/%s.lzhuff", MORE_COMP_DIR, p.name);

                p.compressed = datablob_from_file(tmp_ctx, filename);
                if (p.compressed.data == NULL) {
                        /*
                         * We don't have all the vectors in the
                         * more-compressed directory, which is OK, we skip
                         * them.
                         */
                        continue;
                }
                found++;
                dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
                debug_start_timer();
                written = lzxpress_huffman_decompress(p.compressed.data,
                                                      p.compressed.length,
                                                      dest,
                                                      p.decompressed.length);
                debug_end_timer("decompress", p.decompressed.length);
                if (written == p.decompressed.length &&
                    memcmp(dest, p.decompressed.data,
                           p.decompressed.length) == 0) {
                        debug_message("\033[1;32mdecompressed %s!\033[0m\n",
                                      p.name);
                        score++;
                } else {
                        debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
                                      p.name);
                        debug_message("size %zd vs reference %zu\n",
                                      written, p.decompressed.length);
                }
                talloc_free(tmp_ctx);
        }
        debug_message("%d/%d correct\n", score, found);
        assert_int_equal(score, found);
}


/*
 * attempt_round_trip() tests whether a data blob can survive a compression
 * and decompression cycle. If save_name is not NULL and LZXHUFF_DEBUG_FILES
 * evals to true, the various stages are saved in files with that name and
 * the '-original', '-compressed', and '-decompressed' suffixes. If
 * ref_compressed has data, it'll print a message saying whether the
 * compressed data matches that.
 */
static ssize_t attempt_round_trip(TALLOC_CTX *mem_ctx,
                                  DATA_BLOB original,
                                  const char *save_name,
                                  DATA_BLOB ref_compressed)
{
        TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
        DATA_BLOB compressed = data_blob_talloc(tmp_ctx, NULL,
                                                original.length * 4 / 3 + 260);
        DATA_BLOB decompressed = data_blob_talloc(tmp_ctx, NULL,
                                                  original.length);
        ssize_t comp_written, decomp_written;

        debug_start_timer();
        comp_written = lzxpress_huffman_compress_talloc(tmp_ctx,
                                                        original.data,
                                                        original.length,
                                                        &compressed.data);
        debug_end_timer("compress", original.length);
        if (comp_written <= 0) {
                talloc_free(tmp_ctx);
                return -1;
        }

        if (ref_compressed.data != NULL) {
                /*
                 * This is informational, not an assertion; there are
                 * ~infinite legitimate ways to compress the data, many as
                 * good as each other (think of compression as a language,
                 * not a format).
                 */
                debug_message("compressed size %zd vs reference %zu\n",
                              comp_written, ref_compressed.length);

                if (comp_written == ref_compressed.length &&
                    memcmp(compressed.data, ref_compressed.data,
                           comp_written) == 0) {
                        debug_message("\033[1;32mbyte identical!\033[0m\n");
                }
        }
        debug_start_timer();
        decomp_written = lzxpress_huffman_decompress(compressed.data,
                                                     comp_written,
                                                     decompressed.data,
                                                     original.length);
        debug_end_timer("decompress", original.length);
        if (save_name != NULL && LZXHUFF_DEBUG_FILES) {
                char s[300];
                FILE *fh = NULL;

                snprintf(s, sizeof(s), "%s-original", save_name);
                fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
                fh = fopen(s, "w");
                fwrite(original.data, 1, original.length, fh);
                fclose(fh);

                snprintf(s, sizeof(s), "%s-compressed", save_name);
                fprintf(stderr, "Saving %zu bytes to %s\n", comp_written, s);
                fh = fopen(s, "w");
                fwrite(compressed.data, 1, comp_written, fh);
                fclose(fh);
                /*
                 * We save the decompressed file using original.length, not
                 * the returned size. If these differ, the returned size will
                 * be -1. By saving the whole buffer we can see at what point
                 * it went haywire.
                 */
                snprintf(s, sizeof(s), "%s-decompressed", save_name);
                fprintf(stderr, "Saving %zu bytes to %s\n", original.length, s);
                fh = fopen(s, "w");
                fwrite(decompressed.data, 1, original.length, fh);
                fclose(fh);
        }

        if (original.length != decomp_written ||
            memcmp(decompressed.data,
                   original.data,
                   original.length) != 0) {
                debug_message("\033[1;31mgot %zd, expected %zu\033[0m\n",
                              decomp_written,
                              original.length);
                talloc_free(tmp_ctx);
                return -1;
        }
        talloc_free(tmp_ctx);
        return comp_written;
}
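
/*
 * A rough sizing note for the compressed buffer above (a back-of-envelope
 * estimate, not a guarantee from the format): even incompressible input
 * should only grow by roughly one 256 byte Huffman table per 64k block,
 * i.e. about 1/256 (see test_lzxpress_huffman_random_noise_round_trip
 * below), so original.length * 4 / 3 plus a 260 byte floor for very small
 * inputs leaves a wide safety margin.
 */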


static void test_lzxpress_huffman_round_trip(void **state)
{
        size_t i;
        int score = 0;
        ssize_t compressed_total = 0;
        ssize_t reference_total = 0;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        for (i = 0; file_names[i] != NULL; i++) {
                char filename[200];
                char *debug_files = NULL;
                TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
                ssize_t comp_size;
                struct lzx_pair p = {
                        .name = file_names[i]
                };
                debug_message("-------------------\n");
                debug_message("%s\n", p.name);

                snprintf(filename, sizeof(filename),
                         "%s/%s.decomp", DECOMP_DIR, p.name);

                p.decompressed = datablob_from_file(tmp_ctx, filename);
                assert_non_null(p.decompressed.data);

                snprintf(filename, sizeof(filename),
                         "%s/%s.lzhuff", COMP_DIR, p.name);

                p.compressed = datablob_from_file(tmp_ctx, filename);
                if (p.compressed.data == NULL) {
                        debug_message(
                                "Could not load %s reference file %s\n",
                                p.name, filename);
                        debug_message("%s decompressed %zu\n", p.name,
                                      p.decompressed.length);
                } else {
                        debug_message("%s: reference compressed %zu decomp %zu\n",
                                      p.name,
                                      p.compressed.length,
                                      p.decompressed.length);
                }
                if (1) {
                        /*
                         * We're going to save copies in /tmp.
                         */
                        snprintf(filename, sizeof(filename),
                                 "/tmp/lzxhuffman-%s", p.name);
                        debug_files = filename;
                }

                comp_size = attempt_round_trip(mem_ctx, p.decompressed,
                                               debug_files,
                                               p.compressed);
                if (comp_size > 0) {
                        debug_message("\033[1;32mround trip!\033[0m\n");
                        score++;
                        if (p.compressed.length) {
                                compressed_total += comp_size;
                                reference_total += p.compressed.length;
                        }
                }
                talloc_free(tmp_ctx);
        }
        debug_message("%d/%zu correct\n", score, i);
        print_message("\033[1;34mtotal compressed size: %zu\033[0m\n",
                      compressed_total);
        print_message("total reference size: %zd \n", reference_total);
        print_message("diff: %7zd \n", reference_total - compressed_total);
        assert_true(reference_total != 0);
        print_message("ratio: \033[1;3%dm%.2f\033[0m \n",
                      2 + (compressed_total >= reference_total),
                      ((double)compressed_total) / reference_total);
        /*
         * Assert that the compression is *about* as good as Windows. Of
         * course it doesn't matter if we do better, but mysteriously getting
         * better is usually a sign that something is wrong.
         *
         * At the time of writing, compressed_total is 2674004, or 10686 more
         * than the Windows reference total. That's < 0.5% difference, we're
         * asserting at 2%.
         */
        assert_true(labs(compressed_total - reference_total) <
                    compressed_total / 50);
        assert_int_equal(score, i);
        talloc_free(mem_ctx);
}

/*
 * Bob Jenkins' Small Fast RNG.
 *
 * We don't need it to be this good, but we do need it to be reproducible
 * across platforms, which rand() etc aren't.
 *
 * http://burtleburtle.net/bob/rand/smallprng.html
 */
struct jsf_rng {
        uint32_t a;
        uint32_t b;
        uint32_t c;
        uint32_t d;
};

#define ROTATE32(x, k) (((x) << (k)) | ((x) >> (32 - (k))))

static uint32_t jsf32(struct jsf_rng *x)
{
        uint32_t e = x->a - ROTATE32(x->b, 27);
        x->a = x->b ^ ROTATE32(x->c, 17);
        x->b = x->c + x->d;
        x->c = x->d + e;
        x->d = e + x->a;
        return x->d;
}

static void jsf32_init(struct jsf_rng *x, uint32_t seed)
{
        size_t i;
        x->a = 0xf1ea5eed;
        x->b = x->c = x->d = seed;
        for (i = 0; i < 20; ++i) {
                jsf32(x);
        }
}


static void test_lzxpress_huffman_long_gpl_round_trip(void **state)
{
        /*
         * We use a kind of model-free Markov model to generate a massively
         * extended pastiche of the GPLv3 (chosen because it is right there
         * in "COPYING" and won't change often).
         *
         * The point is to check a round trip of a very long message with
         * multiple repetitions on many scales, without having to add a very
         * large file.
         */
        size_t i, j, k;
        uint8_t c;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        DATA_BLOB gpl = datablob_from_file(mem_ctx, "COPYING");
        DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
        DATA_BLOB ref = {0};
        ssize_t comp_size;
        struct jsf_rng rng;

        if (gpl.data == NULL) {
                print_message("could not read COPYING\n");
                fail();
        }

        jsf32_init(&rng, 1);
        j = 1;
        original.data[0] = gpl.data[0];
        for (i = 1; i < original.length; i++) {
                size_t m;
                char p = original.data[i - 1];
                c = gpl.data[j];
                original.data[i] = c;
                j++;
                m = (j + jsf32(&rng)) % (gpl.length - 50);
                for (k = m; k < m + 30; k++) {
                        if (p == gpl.data[k] && c == gpl.data[k + 1]) {
                                j = k + 2;
                                break;
                        }
                }
                if (j == gpl.length) {
                        j = 1;
                }
        }

        comp_size = attempt_round_trip(mem_ctx, original, "/tmp/gpl", ref);
        assert_true(comp_size > 0);
        assert_true(comp_size < original.length);

        talloc_free(mem_ctx);
}
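
/*
 * How the pastiche generator above works, as a toy illustration (the "th"
 * bigram below is hypothetical, derived from the loop, not from the data):
 * after emitting each byte, the loop jumps to a pseudo-random position in
 * COPYING and scans up to 30 bytes for a place where the previous two bytes
 * match the two just emitted; if found, copying resumes from there. So if
 * the last two output bytes were "th", the stream may continue from any
 * other "th" in the licence, replaying a long familiar run until the next
 * jump. That is what produces repetition at many scales.
 */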


static void test_lzxpress_huffman_long_random_graph_round_trip(void **state)
{
        size_t i;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
        DATA_BLOB ref = {0};
        /*
         * There's a random trigram graph, with each pair of sequential bytes
         * pointing to a successor. This would probably fall into a fairly
         * simple loop, but we introduce damage into the system, randomly
         * flipping about 1 bit in 64.
         *
         * The result is semi-structured and compressible.
         */
        uint8_t *d = original.data;
        uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
        uint32_t *table32 = (void*)table;
        ssize_t comp_size;
        struct jsf_rng rng;

        jsf32_init(&rng, 1);
        for (i = 0; i < (65536 / 4); i++) {
                table32[i] = jsf32(&rng);
        }

        d[0] = 'a';
        d[1] = 'b';
        for (i = 2; i < original.length; i++) {
                uint16_t k = (d[i - 2] << 8) | d[i - 1];
                uint32_t damage = jsf32(&rng) & jsf32(&rng) & jsf32(&rng);
                damage &= (damage >> 16);
                k ^= damage & 0xffff;
                d[i] = table[k];
        }

        comp_size = attempt_round_trip(mem_ctx, original,
                                       "/tmp/random-graph", ref);
        assert_true(comp_size > 0);
        assert_true(comp_size < original.length);

        talloc_free(mem_ctx);
}


static void test_lzxpress_huffman_chaos_graph_round_trip(void **state)
{
        size_t i;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
        DATA_BLOB ref = {0};
        /*
         * There's a random trigram graph, with each pair of sequential bytes
         * pointing to a successor. This would probably fall into a fairly
         * simple loop, but we keep changing the graph. The result is long
         * periods of stability separated by bursts of noise.
         */
        uint8_t *d = original.data;
        uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
        uint32_t *table32 = (void*)table;
        ssize_t comp_size;
        struct jsf_rng rng;

        jsf32_init(&rng, 1);
        for (i = 0; i < (65536 / 4); i++) {
                table32[i] = jsf32(&rng);
        }

        d[0] = 'a';
        d[1] = 'b';
        for (i = 2; i < original.length; i++) {
                uint16_t k = (d[i - 2] << 8) | d[i - 1];
                uint32_t damage = jsf32(&rng);
                d[i] = table[k];
                if ((damage >> 29) == 0) {
                        uint16_t index = damage & 0xffff;
                        uint8_t value = (damage >> 16) & 0xff;
                        table[index] = value;
                }
        }

        comp_size = attempt_round_trip(mem_ctx, original,
                                       "/tmp/chaos-graph", ref);
        assert_true(comp_size > 0);
        assert_true(comp_size < original.length);

        talloc_free(mem_ctx);
}


static void test_lzxpress_huffman_sparse_random_graph_round_trip(void **state)
{
        size_t i;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 5 * 1024 * 1024);
        DATA_BLOB ref = {0};
        /*
         * There's a random trigram graph, with each pair of sequential bytes
         * pointing to a successor. This would fall into a fairly simple
         * loop, but we introduce damage into the system, randomly mangling
         * about 1 byte in 65536.
         *
         * The result has very long repetitive runs, which should lead to
         * oversized blocks.
         */
        uint8_t *d = original.data;
        uint8_t *table = talloc_array(mem_ctx, uint8_t, 65536);
        uint32_t *table32 = (void*)table;
        ssize_t comp_size;
        struct jsf_rng rng;

        jsf32_init(&rng, 3);
        for (i = 0; i < (65536 / 4); i++) {
                table32[i] = jsf32(&rng);
        }

        d[0] = 'a';
        d[1] = 'b';
        for (i = 2; i < original.length; i++) {
                uint16_t k = (d[i - 2] << 8) | d[i - 1];
                uint32_t damage = jsf32(&rng);
                if ((damage & 0xffff0000) == 0) {
                        k ^= damage & 0xffff;
                }
                d[i] = table[k];
        }

        comp_size = attempt_round_trip(mem_ctx, original,
                                       "/tmp/sparse-random-graph", ref);
        assert_true(comp_size > 0);
        assert_true(comp_size < original.length);

        talloc_free(mem_ctx);
}
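
/*
 * A worked note on the damage rates used in the three graph tests above
 * (derived from the code here, not from an external spec):
 *
 * - long_random_graph: ANDing three jsf32() draws sets each bit with
 *   probability 1/8, and "damage &= (damage >> 16)" squares that to 1/64,
 *   so each of the 16 index bits is flipped about 1 time in 64.
 *
 * - chaos_graph: "(damage >> 29) == 0" is true when the top three bits are
 *   all zero, i.e. on about 1 step in 8, so the successor table is
 *   rewritten roughly that often.
 *
 * - sparse_random_graph: "(damage & 0xffff0000) == 0" requires 16 zero
 *   bits, i.e. about 1 step in 65536, which gives the very long undamaged
 *   runs the comment asks for.
 */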


static void test_lzxpress_huffman_random_noise_round_trip(void **state)
{
        size_t i;
        size_t len = 1024 * 1024;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
        DATA_BLOB ref = {0};
        ssize_t comp_size;
        /*
         * We are filling this up with incompressible noise, but we can
         * assert quite tight bounds on how badly it will fail to compress.
         *
         * Specifically, with randomly distributed codes, the Huffman table
         * should come out as roughly even, averaging 8 bit codes. Then there
         * will be a 256 byte table every 64k, which is a 1/256 overhead
         * (i.e. the compressed length will be 257/256 the original *on
         * average*).
         *
         * We assert it is less than 1 in 200 but more than 1 in 300.
         */
        uint32_t *d32 = (uint32_t*)((void*)original.data);
        struct jsf_rng rng;
        jsf32_init(&rng, 2);

        for (i = 0; i < (len / 4); i++) {
                d32[i] = jsf32(&rng);
        }

        comp_size = attempt_round_trip(mem_ctx, original,
                                       "/tmp/random-noise", ref);
        assert_true(comp_size > 0);
        assert_true(comp_size > original.length + original.length / 300);
        assert_true(comp_size < original.length + original.length / 200);
        debug_message("original size %zu; compressed size %zd; ratio %.3f\n",
                      len, comp_size, ((double)comp_size) / len);

        talloc_free(mem_ctx);
}
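
/*
 * Worked numbers for the bounds above (arithmetic only, nothing new): for
 * len = 1048576 bytes, a 1/256 table overhead predicts roughly
 * 1048576 / 256 = 4096 extra bytes. The assertions accept anything between
 * len + len / 300 (about 3495 extra bytes) and len + len / 200 (about 5242
 * extra bytes), a window that brackets the 4096 byte estimate.
 */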


static void test_lzxpress_huffman_overlong_matches(void **state)
{
        size_t i, j = 0;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 1024 * 1024);
        DATA_BLOB ref = {0};
        uint8_t *d = original.data;
        char filename[300];
        /*
         * We are testing with something like "aaaaaaaaaaaaaaaaaaaaaaabbbbb"
         * where typically the number of "a"s is > 65536, and the number of
         * "b"s is < 42.
         */
        ssize_t na[] = {65535, 65536, 65537, 65559, 65575, 200000, -1};
        ssize_t nb[] = {1, 2, 20, 39, 40, 41, 42, -1};
        int score = 0;
        ssize_t comp_size;

        for (i = 0; na[i] >= 0; i++) {
                ssize_t a = na[i];
                memset(d, 'a', a);
                for (j = 0; nb[j] >= 0; j++) {
                        ssize_t b = nb[j];
                        memset(d + a, 'b', b);
                        original.length = a + b;
                        snprintf(filename, sizeof(filename),
                                 "/tmp/overlong-%zd-%zd", a, b);
                        comp_size = attempt_round_trip(mem_ctx,
                                                       original,
                                                       filename, ref);
                        if (comp_size > 0) {
                                score++;
                        }
                }
        }
        debug_message("%d/%zu correct\n", score, i * j);
        assert_int_equal(score, i * j);
        talloc_free(mem_ctx);
}


static void test_lzxpress_huffman_overlong_matches_abc(void **state)
{
        size_t i, j = 0, k = 0;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, 1024 * 1024);
        DATA_BLOB ref = {0};
        uint8_t *d = original.data;
        char filename[300];
        /*
         * We are testing with something like "aaaabbbbcc" where typically
         * the number of "a"s + "b"s is around 65536, and the number of "c"s
         * is < 43.
         */
        ssize_t nab[] = {1, 21, 32767, 32768, 32769, -1};
        ssize_t nc[] = {1, 2, 20, 39, 40, 41, 42, -1};
        int score = 0;
        ssize_t comp_size;

        for (i = 0; nab[i] >= 0; i++) {
                ssize_t a = nab[i];
                memset(d, 'a', a);
                for (j = 0; nab[j] >= 0; j++) {
                        ssize_t b = nab[j];
                        memset(d + a, 'b', b);
                        for (k = 0; nc[k] >= 0; k++) {
                                ssize_t c = nc[k];
                                memset(d + a + b, 'c', c);
                                original.length = a + b + c;
                                snprintf(filename, sizeof(filename),
                                         "/tmp/overlong-abc-%zd-%zd-%zd",
                                         a, b, c);
                                comp_size = attempt_round_trip(mem_ctx,
                                                               original,
                                                               filename, ref);
                                if (comp_size > 0) {
                                        score++;
                                }
                        }
                }
        }
        debug_message("%d/%zu correct\n", score, i * j * k);
        assert_int_equal(score, i * j * k);
        talloc_free(mem_ctx);
}


static void test_lzxpress_huffman_extremely_compressible_middle(void **state)
{
        size_t len = 192 * 1024;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
        DATA_BLOB ref = {0};
        ssize_t comp_size;
        /*
         * When a middle block (i.e. not the first and not the last of >= 3)
         * can be entirely expressed as a match starting in the previous
         * block, the Huffman tree would end up with 1 element, which does
         * not work for the code construction. It really wants to use both
         * bits. So we need to ensure we have some way of dealing with this.
         */
        memset(original.data, 'a', 0x10000 - 1);
        memset(original.data + 0x10000 - 1, 'b', 0x10000 + 1);
        memset(original.data + 0x20000, 'a', 0x10000);
        comp_size = attempt_round_trip(mem_ctx, original,
                                       "/tmp/compressible-middle", ref);
        assert_true(comp_size > 0);
        assert_true(comp_size < 1024);
        debug_message("original size %zu; compressed size %zd; ratio %.3f\n",
                      len, comp_size, ((double)comp_size) / len);

        talloc_free(mem_ctx);
}


static void test_lzxpress_huffman_max_length_limit(void **state)
{
        size_t len = 65 * 1024 * 1024;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        DATA_BLOB original = data_blob_talloc_zero(mem_ctx, len);
        DATA_BLOB ref = {0};
        ssize_t comp_size;
        /*
         * Reputedly Windows has a 64MB limit in the maximum match length it
         * will encode. We follow this, and test that here with nearly 65 MB
         * of zeros between two letters; this should be encoded in three
         * blocks:
         *
         * 1. 'a', 64M × '\0'
         * 2. (1M - 2) × '\0' -- finishing off what would have been the same
         *    match
         * 3. 'b' EOF
         *
         * Which we can assert by saying the length is > 768, < 1024.
         */
        original.data[0] = 'a';
        original.data[len - 1] = 'b';
        comp_size = attempt_round_trip(mem_ctx, original,
                                       "/tmp/max-length-limit", ref);
        assert_true(comp_size > 0x300);
        assert_true(comp_size < 0x400);
        debug_message("original size %zu; compressed size %zd; ratio %.3f\n",
                      len, comp_size, ((double)comp_size) / len);

        talloc_free(mem_ctx);
}


static void test_lzxpress_huffman_short_boring_strings(void **state)
{
        size_t len = 64 * 1024;
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        DATA_BLOB original = data_blob_talloc(mem_ctx, NULL, len);
        DATA_BLOB ref = {0};
        ssize_t comp_size;
        ssize_t lengths[] = {
                1, 2, 20, 39, 40, 41, 42, 256, 270, 273, 274,
                1000, 64000, -1};
        char filename[300];
        size_t i;
        /*
         * How do short repetitive strings work? We're poking at the limit
         * around which LZ77 compression is turned on.
         *
         * For this test we don't change the blob memory between runs, just
         * the declared length.
         */
        memset(original.data, 'a', len);

        for (i = 0; lengths[i] >= 0; i++) {
                original.length = lengths[i];
                snprintf(filename, sizeof(filename),
                         "/tmp/short-boring-%zu",
                         original.length);
                comp_size = attempt_round_trip(mem_ctx, original, filename, ref);
                if (original.length < 41) {
                        assert_true(comp_size > 256 + original.length / 8);
                } else if (original.length < 274) {
                        assert_true(comp_size == 261);
                } else {
                        assert_true(comp_size == 263);
                }
                assert_true(comp_size < 261 + original.length / 8);
        }
        /* let's just show we didn't change the original */
        for (i = 0; i < len; i++) {
                if (original.data[i] != 'a') {
                        fail_msg("input data[%zu] was changed! (%2x, expected %2x)\n",
                                 i, original.data[i], 'a');
                }
        }

        talloc_free(mem_ctx);
}


static void test_lzxpress_huffman_compress_empty_or_null(void **state)
{
        /*
         * We expect these to fail with a -1, except the last one, which does
         * the real thing.
         */
        ssize_t ret;
        const uint8_t *input = bidirectional_pairs[0].decompressed.data;
        size_t ilen = bidirectional_pairs[0].decompressed.length;
        size_t olen = bidirectional_pairs[0].compressed.length;
        uint8_t output[olen];
        struct lzxhuff_compressor_mem cmp_mem;

        ret = lzxpress_huffman_compress(&cmp_mem, input, 0, output, olen);
        assert_int_equal(ret, -1LL);
        ret = lzxpress_huffman_compress(&cmp_mem, input, ilen, output, 0);
        assert_int_equal(ret, -1LL);

        ret = lzxpress_huffman_compress(&cmp_mem, NULL, ilen, output, olen);
        assert_int_equal(ret, -1LL);
        ret = lzxpress_huffman_compress(&cmp_mem, input, ilen, NULL, olen);
        assert_int_equal(ret, -1LL);
        ret = lzxpress_huffman_compress(NULL, input, ilen, output, olen);
        assert_int_equal(ret, -1LL);

        ret = lzxpress_huffman_compress(&cmp_mem, input, ilen, output, olen);
        assert_int_equal(ret, olen);
}


static void test_lzxpress_huffman_decompress_empty_or_null(void **state)
{
        /*
         * We expect these to fail with a -1, except the last one.
         */
        ssize_t ret;
        const uint8_t *input = bidirectional_pairs[0].compressed.data;
        size_t ilen = bidirectional_pairs[0].compressed.length;
        size_t olen = bidirectional_pairs[0].decompressed.length;
        uint8_t output[olen];

        ret = lzxpress_huffman_decompress(input, 0, output, olen);
        assert_int_equal(ret, -1LL);
        ret = lzxpress_huffman_decompress(input, ilen, output, 0);
        assert_int_equal(ret, -1LL);

        ret = lzxpress_huffman_decompress(NULL, ilen, output, olen);
        assert_int_equal(ret, -1LL);
        ret = lzxpress_huffman_decompress(input, ilen, NULL, olen);
        assert_int_equal(ret, -1LL);

        ret = lzxpress_huffman_decompress(input, ilen, output, olen);
        assert_int_equal(ret, olen);
}


int main(void)
{
        const struct CMUnitTest tests[] = {
                cmocka_unit_test(test_lzxpress_huffman_short_boring_strings),
                cmocka_unit_test(test_lzxpress_huffman_max_length_limit),
                cmocka_unit_test(test_lzxpress_huffman_extremely_compressible_middle),
                cmocka_unit_test(test_lzxpress_huffman_long_random_graph_round_trip),
                cmocka_unit_test(test_lzxpress_huffman_chaos_graph_round_trip),
                cmocka_unit_test(test_lzxpress_huffman_sparse_random_graph_round_trip),
                cmocka_unit_test(test_lzxpress_huffman_round_trip),
                cmocka_unit_test(test_lzxpress_huffman_decompress_files),
                cmocka_unit_test(test_lzxpress_huffman_decompress_more_compressed_files),
                cmocka_unit_test(test_lzxpress_huffman_compress),
                cmocka_unit_test(test_lzxpress_huffman_decompress),
                cmocka_unit_test(test_lzxpress_huffman_long_gpl_round_trip),
                cmocka_unit_test(test_lzxpress_huffman_long_random_graph_round_trip),
                cmocka_unit_test(test_lzxpress_huffman_random_noise_round_trip),
                cmocka_unit_test(test_lzxpress_huffman_overlong_matches_abc),
                cmocka_unit_test(test_lzxpress_huffman_overlong_matches),
                cmocka_unit_test(test_lzxpress_huffman_decompress_empty_or_null),
                cmocka_unit_test(test_lzxpress_huffman_compress_empty_or_null),
        };
        if (!isatty(1)) {
                cmocka_set_message_output(CM_OUTPUT_SUBUNIT);
        }

        return cmocka_run_group_tests(tests, NULL, NULL);
}