lib/compression: add LZ77 + Huffman decompression

This format is described in [MS-XCA] 2.1 and 2.2, with exegesis in many posts on the cifs-protocol list[1]. The two public functions are: ssize_t lzxpress_huffman_decompress(const uint8_t *input, size_t input_size, uint8_t *output, size_t output_size); uint8_t *lzxpress_huffman_decompress_talloc(TALLOC_CTX *mem_ctx, const uint8_t *input_bytes, size_t input_size, size_t output_size); In both cases the caller needs to know the *exact* decompressed size, which is essential for decompression. The _talloc version allocates the buffer for you, and uses the talloc context to allocate a 128k working buffer. THe non-talloc function will allocate the working buffer on the stack. This compression format gives better compression for messages of several kilobytes than the "plain" LXZPRESS compression, but is probably a bit slower to decompress and is certainly worse for very short messages, having a fixed 256 byte overhead for the first Huffman table. Experiments show decompression rates between 20 and 500 MB per second, depending on the compression ratio and data size, on an i5-1135G7 with no compiler optimisations. This compression format is used in AD claims and in SMB, but that doesn't happen with this commit. I will not try to describe LZ77 or Huffman encoding here. Don't expect an answer in MS-XCA either; instead read the code and/or Wikipedia. [1] Much of that starts here: https://lists.samba.org/archive/cifs-protocol/2022-October/ but there's more earlier, particularly in June/July 2020, when Aurélien Aptel was working on an implementation that ended up in Wireshark. Signed-off-by: Douglas Bagnall <douglas.bagnall@catalyst.net.nz> Pair-programmed-with: Joseph Sutton <josephsutton@catalyst.net.nz> Reviewed-by: Joseph Sutton <josephsutton@catalyst.net.nz>
author: Douglas Bagnall <douglas.bagnall@catalyst.net.nz> 2022-11-17 14:24:52 +1300
committer: Joseph Sutton <jsutton@samba.org> 2022-12-01 22:56:39 +0000
commit: f86035c65bf4ae41a2c210dbff132dbce499f03c (patch)
tree: 357a873b7ffc81b5ce58b44dff99c4dbf2e76625 /lib/compression
parent: bd35feaf7ed649968465a2643b42982d3e6f3d56 (diff)
download: samba-f86035c65bf4ae41a2c210dbff132dbce499f03c.tar.gz
4 files changed, 1218 insertions, 3 deletions
diff --git a/lib/compression/lzxpress_huffman.c b/lib/compression/lzxpress_huffman.c
new file mode 100644
index 00000000000..105b1bda0d4
--- /dev/null
+++ b/lib/compression/lzxpress_huffman.c
@@ -0,0 +1,656 @@
+/*
+ * Samba compression library - LGPLv3
+ *
+ * Copyright © Catalyst IT 2022
+ *
+ * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
+ *        and Joseph Sutton   <josephsutton@catalyst.net.nz>
+ *
+ *  ** NOTE! The following LGPL license applies to this file.
+ *  ** It does NOT imply that all of Samba is released under the LGPL
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 3 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <talloc.h>
+
+#include "replace.h"
+#include "lzxpress_huffman.h"
+#include "lib/util/stable_sort.h"
+#include "lib/util/byteorder.h"
+#include "lib/util/bytearray.h"
+
+
+#define LZXPRESS_ERROR -1LL
+
+
+struct bitstream {
+	const uint8_t *bytes;
+	size_t byte_pos;
+	size_t byte_size;
+	uint32_t bits;
+	int remaining_bits;
+	uint16_t *table;
+};
+
+
+/**
+ * Determines the sort order of one prefix_code_symbol relative to another
+ */
+static int compare_uint16(const uint16_t *a, const uint16_t *b)
+{
+	if (*a < *b) {
+		return -1;
+	}
+	if (*a > *b) {
+		return 1;
+	}
+	return 0;
+}
+
+
+static bool fill_decomp_table(struct bitstream *input)
+{
+	/*
+	 * There are 512 symbols, each encoded in 4 bits, which indicates
+	 * their depth in the Huffman tree. The even numbers get the lower
+	 * nibble of each byte, so that the byte hex values look backwards
+	 * (i.e. 0xab encodes b then a). These are allocated Huffman codes in
+	 * order of appearance, per depth.
+	 *
+	 * For example, if the first two bytes were:
+	 *
+	 * 0x23 0x53
+	 *
+	 * the first four codes have the lengths 3, 2, 3, 5.
+	 * Let's call them A, B, C, D.
+	 *
+	 * Suppose there is no other codeword with length 1 (which is
+	 * necessarily true in this example) or 2, but there might be others
+	 * of length 3 or 4. Then we can say this about the codes:
+	 *
+	 *        _ --*--_
+	 *      /          \
+	 *     0           1
+	 *    / \         / \
+	 *   0   1       0   1
+	 *  B    |\     / \  |\
+	 *       0 1   0   1 0 1
+	 *       A C   |\ /| | |\
+	 *
+	 * pos bits  code
+	 * A    3    010
+	 * B    2    00
+	 * C    3    011
+	 * D    5    1????
+	 *
+	 * B has the shortest code, so takes the leftmost branch, 00. That
+	 * ends the branch -- nothing else can start with 00. There are no
+	 * more 2s, so we look at the 3s, starting as far left as possible. So
+	 * A takes 010 and C takes 011. That means everything else has to
+	 * start with 1xx. We don't know how many codewords of length 3 or 4
+	 * there are; if there are none, D would end up with 10000, the
+	 * leftmost available code of length 5. If the compressor is any good,
+	 * there should be no unused leaf nodes left dangling at the end.
+	 *
+	 * (this is "Canonical Huffman Coding").
+	 *
+	 *
+	 * But what symbols do these codes actually stand for?
+	 * --------------------------------------------------
+	 *
+	 * Good question. The first 256 codes stand for the corresponding
+	 * literal bytes. The codes from 256 to 511 stand for LZ77 matches,
+	 * which have a distance and a length, encoded in a strange way that
+	 * isn't entirely the purview of this function.
+	 *
+	 * What does the value 0 mean?
+	 * ---------------------------
+	 *
+	 * The code does not occur. For example, if the next byte in the
+	 * example above was 0x07, that would give the byte 0x04 a 7-long
+	 * code, and no code to the 0x05 byte, which means we there is no way
+	 * we going to see a 5 in the decoded stream.
+	 *
+	 * Isn't LZ77 + Huffman what zip/gzip/zlib do?
+	 * -------------------------------------------
+	 *
+	 * Yes, DEFLATE is LZ77 + Huffman, but the details are quite different.
+	 */
+	uint16_t symbols[512];
+	uint16_t sort_mem[512];
+	size_t i, n_symbols;
+	ssize_t code;
+	uint16_t len, prev_len;
+	const uint8_t *table_bytes = input->bytes + input->byte_pos;
+
+	if (input->byte_pos + 260 > input->byte_size) {
+		return false;
+	}
+
+	n_symbols = 0;
+	for (i = 0; i < 256; i++) {
+		uint16_t even = table_bytes[i] & 15;
+		uint16_t odd = table_bytes[i] >> 4;
+		if (even != 0) {
+			symbols[n_symbols] = (even << 9) + i * 2;
+			n_symbols++;
+		}
+		if (odd != 0) {
+			symbols[n_symbols] = (odd << 9) + i * 2 + 1;
+			n_symbols++;
+		}
+	}
+	input->byte_pos += 256;
+	if (n_symbols == 0) {
+		return false;
+	}
+
+	stable_sort(symbols, sort_mem, n_symbols, sizeof(uint16_t),
+		    (samba_compare_fn_t)compare_uint16);
+
+	/*
+	 * we're using an implicit binary tree, as you'd see in a heap.
+	 * table[0] = unused
+	 * table[1] = '0'
+	 * table[2] = '1'
+	 * table[3] = '00'     <-- '00' and '01' are children of '0'
+	 * table[4] = '01'     <-- '0' is [0], children are [0 * 2 + {1,2}]
+	 * table[5] = '10'
+	 * table[6] = '11'
+	 * table[7] = '000'
+	 * table[8] = '001'
+	 * table[9] = '010'
+	 * table[10]= '011'
+	 * table[11]= '100
+	 *'
+	 * table[1 << n - 1] = '0' * n
+	 * table[1 << n - 1 + x] = n-bit wide x (left padded with '0')
+	 * table[1 << n - 2] = '1' * (n - 1)
+	 *
+	 * table[i]->left =  table[i*2 + 1]
+	 * table[i]->right = table[i*2 + 2]
+	 * table[0xffff] = unused (16 '0's, max len is 15)
+	 *
+	 * therefore e.g. table[70] = table[64     - 1 + 7]
+	 *                          = table[1 << 6 - 1 + 7]
+	 *                          = '000111' (binary 7, widened to 6 bits)
+	 *
+	 *   and if '000111' is a code,
+	 *   '00011', '0001', '000', '00', '0' are unavailable prefixes.
+	 *       34      16      7     3    1  are their indices
+	 *   and (i - 1) >> 1 is the path back from 70 through these.
+	 *
+	 * the lookup is
+	 *
+	 * 1 start with i = 0
+	 * 2 extract a symbol bit (i = (i << 1) + bit + 1)
+	 * 3 is table[i] == 0xffff?
+	 * 4  yes -- goto 2
+	 * 4  table[i] & 511 is the symbol, stop
+	 *
+	 * and the construction (here) is sort of the reverse.
+	 *
+	 * Most of this table is free space that can never be reached, and
+	 * most of the activity is at the beginning (since all codes start
+	 * there, and by design the shortest codes are the most common).
+	 */
+	for (i = 0; i < 32; i++) {
+		/* prefill the table head */
+		input->table[i] = 0xffff;
+	}
+	code = -1;
+	prev_len = 0;
+	for (i = 0; i < n_symbols; i++) {
+		uint16_t s = symbols[i];
+		uint16_t prefix;
+		len = (s >> 9) & 15;
+		s &= 511;
+		code++;
+		while (len != prev_len) {
+			code <<= 1;
+			code++;
+			prev_len++;
+		}
+
+		if (code >= 65535) {
+			return false;
+		}
+		input->table[code] = s;
+		for(prefix = (code - 1) >> 1;
+		    prefix > 31;
+		    prefix = (prefix - 1) >> 1) {
+			input->table[prefix] = 0xffff;
+		}
+	}
+
+	/*
+	 * check that the last code encodes 11111..., with right number of
+	 * ones, pointing to the right symbol -- otherwise we have a dangling
+	 * uninitialised symbol.
+	 */
+	if (code != (1 << (len + 1)) - 2) {
+		return false;
+	}
+	return true;
+}
+
+
+#define CHECK_READ_32(dest)					  \
+	do {							  \
+		if (input->byte_pos + 4 > input->byte_size) {     \
+			return LZXPRESS_ERROR;			   \
+		}						   \
+		dest = PULL_LE_U32(input->bytes, input->byte_pos); \
+		input->byte_pos += 4;				   \
+	} while (0)
+
+#define CHECK_READ_16(dest)					  \
+	do {							  \
+		if (input->byte_pos + 2 > input->byte_size) {     \
+			return LZXPRESS_ERROR;			   \
+		}						   \
+		dest = PULL_LE_U16(input->bytes, input->byte_pos); \
+		input->byte_pos += 2;				   \
+	} while (0)
+
+#define CHECK_READ_8(dest) \
+	do {								\
+		if (input->byte_pos >= input->byte_size) {		\
+			return LZXPRESS_ERROR;				\
+		}							\
+		dest = PULL_LE_U8(input->bytes, input->byte_pos);	\
+		input->byte_pos++;					\
+	} while(0)
+
+
+static inline ssize_t pull_bits(struct bitstream *input)
+{
+	if (input->byte_pos + 1 < input->byte_size) {
+		uint16_t tmp;
+		CHECK_READ_16(tmp);
+		input->remaining_bits += 16;
+		input->bits <<= 16;
+		input->bits |= tmp;
+	} else if (input->byte_pos < input->byte_size) {
+		uint8_t tmp;
+		CHECK_READ_8(tmp);
+		input->remaining_bits += 8;
+		input->bits <<= 8;
+		input->bits |= tmp;
+	} else {
+		return LZXPRESS_ERROR;
+	}
+	return 0;
+}
+
+
+/*
+ * Decompress a block. The actual decompressed size is returned (or -1 on
+ * error). The putative block length is 64k (or shorter, if the message ends
+ * first), but a match can run over the end, extending the block. That's why
+ * we need the overall output size as well as the block size. A match encoded
+ * in this block can point back to previous blocks, but not before the
+ * beginning of the message, so we also need the previously decoded size.
+ *
+ * The compressed block will have 256 bytes for the Huffman table, and at
+ * least 4 bytes of (possibly padded) encoded values.
+ */
+static ssize_t lzx_huffman_decompress_block(struct bitstream *input,
+					    uint8_t *output,
+					    size_t block_size,
+					    size_t output_size,
+					    size_t previous_size)
+{
+	size_t output_pos = 0;
+	uint16_t symbol;
+	size_t index;
+	uint16_t distance_bits_wanted = 0;
+	size_t distance = 0;
+	size_t length = 0;
+	bool ok;
+	uint32_t tmp;
+	bool seen_eof_marker = false;
+
+	ok = fill_decomp_table(input);
+	if (! ok) {
+		return LZXPRESS_ERROR;
+	}
+
+	/*
+	 * Always read 32 bits at the start, even if we don't need them.
+	 */
+	CHECK_READ_16(tmp);
+	CHECK_READ_16(input->bits);
+	input->bits |= tmp << 16;
+	input->remaining_bits = 32;
+
+	/*
+	 * This loop iterates over individual *bits*. These are read from
+	 * little-endian 16 bit words, most significant bit first.
+	 *
+	 * At points in the bitstream, the following are possible:
+	 *
+	 * # the source word is empty and needs to be refilled from the input
+	 *    stream.
+	 * # an incomplete codeword is being extended.
+	 * # a codeword is resolved, either as a literal or a match.
+	 * # a literal is written.
+	 * # a match is collecting distance bits.
+	 * # the output stream is copied, as specified by a match.
+	 * # input bytes are read for match lengths.
+	 *
+	 * Note that we *don't* specifically check for the EOF marker (symbol
+	 * 256) in this loop, because the a precondition for stopping for the
+	 * EOF marker is that the output buffer is full (otherwise, you
+	 * wouldn't know which 256 is EOF, rather than an actual symbol), and
+	 * we *always* want to stop when the buffer is full. So we work out if
+	 * there is an EOF in in another loop after we stop writing.
+	 */
+
+	index = 0;
+	while (output_pos < block_size) {
+		uint16_t b;
+		if (input->remaining_bits == 16) {
+			ssize_t ret = pull_bits(input);
+			if (ret) {
+				return ret;
+			}
+		}
+		input->remaining_bits--;
+
+		b = (input->bits >> input->remaining_bits) & 1;
+		if (length == 0) {
+			/* not in a match; pulling a codeword */
+			index <<= 1;
+			index += b + 1;
+			if (input->table[index] == 0xffff) {
+				/* incomplete codeword, the common case */
+				continue;
+			}
+			/* found the symbol, reset the code string */
+			symbol = input->table[index] & 511;
+			index = 0;
+			if (symbol < 256) {
+				/* a literal, the easy case */
+				output[output_pos] = symbol;
+				output_pos++;
+				continue;
+			}
+
+			/* the beginning of a match */
+			distance_bits_wanted = (symbol >> 4) & 15;
+			distance = 1 << distance_bits_wanted;
+			length = symbol & 15;
+			if (length == 15) {
+				CHECK_READ_8(tmp);
+				length += tmp;
+				if (length == 255 + 15) {
+					/*
+					 * note, we discard (don't add) the
+					 * length so far.
+					 */
+					CHECK_READ_16(length);
+					if (length == 0) {
+						CHECK_READ_32(length);
+					}
+				}
+			}
+			length += 3;
+		} else {
+			/* we are pulling extra distance bits */
+			distance_bits_wanted--;
+			distance |= b << distance_bits_wanted;
+		}
+
+		if (distance_bits_wanted == 0) {
+			/*
+			 * We have a complete match, and it is time to do the
+			 * copy (byte by byte, because the ranges can overlap,
+			 * and we might need to copy bytes we just copied in).
+			 *
+			 * It is possible that this match will extend beyond
+			 * the end of the expected block. That's fine, so long
+			 * as it doesn't extend past the total output size.
+			 */
+			size_t end = output_pos + length;
+			if (end > output_size ||
+			    previous_size + output_pos < distance ||
+			    unlikely(end < output_pos)) {
+				return LZXPRESS_ERROR;
+			}
+
+			for (; output_pos < end; output_pos++) {
+				output[output_pos] = \
+					output[output_pos - distance];
+			}
+			distance = 0;
+			length = 0;
+		}
+	}
+
+	if (length != 0 || index != 0) {
+		/* it seems like we've hit an early end, mid-code */
+		return LZXPRESS_ERROR;
+	}
+
+	if (input->byte_pos + 256 < input->byte_size) {
+		/*
+		 * This block is over, but it clearly isn't the last block, so
+		 * we don't want to look for the EOF.
+		 */
+		return output_pos;
+	}
+	/*
+	 * We won't write any more, but we try to read some more to make sure
+	 * we're finishing in a good place. That means we want to see a 256
+	 * symbol and then some number of zeroes, possibly zero, but as many
+	 * as 32.
+	 *
+	 * In this we are perhaps a bit stricter than Windows, which
+	 * apparently does not insist on the EOF marker, nor on a lack of
+	 * trailing bytes.
+	 */
+	while (true) {
+		uint16_t b;
+		if (input->remaining_bits == 16) {
+			ssize_t ret;
+			if (input->byte_pos == input->byte_size) {
+				/* FIN */
+				break;
+			}
+			ret = pull_bits(input);
+			if (ret) {
+				return ret;
+			}
+		}
+		input->remaining_bits--;
+		b = (input->bits >> input->remaining_bits) & 1;
+		if (seen_eof_marker) {
+			/*
+			 * we have read an EOF symbols. Now we just want to
+			 * see zeroes.
+			 */
+			if (b != 0) {
+				return LZXPRESS_ERROR;
+			}
+			continue;
+		}
+
+		/* we're pulling in a symbol, which had better be 256 */
+		index <<= 1;
+		index += b + 1;
+		if (input->table[index] == 0xffff) {
+			continue;
+		}
+
+		symbol = input->table[index] & 511;
+		if (symbol != 256) {
+			return LZXPRESS_ERROR;
+		}
+		seen_eof_marker = true;
+		continue;
+	}
+
+	if (! seen_eof_marker) {
+		return LZXPRESS_ERROR;
+	}
+
+	return output_pos;
+}
+
+static ssize_t lzxpress_huffman_decompress_internal(struct bitstream *input,
+						    uint8_t *output,
+						    size_t output_size)
+{
+	size_t output_pos = 0;
+
+	if (input->byte_size < 260) {
+		return LZXPRESS_ERROR;
+	}
+
+	while (input->byte_pos < input->byte_size) {
+		ssize_t block_output_pos;
+		ssize_t block_output_size;
+		size_t remaining_output_size = output_size - output_pos;
+
+		block_output_size = MIN(65536, remaining_output_size);
+
+		block_output_pos = lzx_huffman_decompress_block(
+			input,
+			output + output_pos,
+			block_output_size,
+			remaining_output_size,
+			output_pos);
+
+		if (block_output_pos < block_output_size) {
+			return LZXPRESS_ERROR;
+		}
+		output_pos += block_output_pos;
+		if (output_pos > output_size) {
+			/* not expecting to get here. */
+			return LZXPRESS_ERROR;
+		}
+	}
+
+	if (input->byte_pos != input->byte_size) {
+		return LZXPRESS_ERROR;
+	}
+
+	return output_pos;
+}
+
+
+/*
+ * lzxpress_huffman_decompress()
+ *
+ * output_size must be the expected length of the decompressed data.
+ * input_size and output_size are limited to the minimum of UINT32_MAX and
+ * SSIZE_MAX. On 64 bit machines that will be UINT32_MAX, or 4GB.
+ *
+ * @param input_bytes  memory to be decompressed.
+ * @param input_size   length of the compressed buffer.
+ * @param output       destination for the decompressed data.
+ * @param output_size  exact expected length of the decompressed data.
+ *
+ * @return the number of bytes written or -1 on error.
+ */
+
+ssize_t lzxpress_huffman_decompress(const uint8_t *input_bytes,
+				    size_t input_size,
+				    uint8_t *output,
+				    size_t output_size)
+{
+	uint16_t table[65536];
+	struct bitstream input = {
+		.bytes = input_bytes,
+		.byte_size = input_size,
+		.byte_pos = 0,
+		.bits = 0,
+		.remaining_bits = 0,
+		.table = table
+	};
+
+	if (input_size > SSIZE_MAX ||
+	    input_size > UINT32_MAX ||
+	    output_size > SSIZE_MAX ||
+	    output_size > UINT32_MAX ||
+	    input_size == 0 ||
+	    output_size == 0 ||
+	    input_bytes == NULL ||
+	    output == NULL) {
+		/*
+		 * We use negative ssize_t to return errors, which is limiting
+		 * on 32 bit machines, and the 4GB limit exists on Windows.
+		 */
+		return  LZXPRESS_ERROR;
+	}
+
+	return lzxpress_huffman_decompress_internal(&input,
+						    output,
+						    output_size);
+}
+
+
+/**
+ * lzxpress_huffman_decompress_talloc()
+ *
+ * The caller must provide the exact size of the expected output.
+ *
+ * The input_size is limited to the minimum of UINT32_MAX and SSIZE_MAX, but
+ * output_size is limited to 256MB due to a limit in talloc. This effectively
+ * limits input_size too, as non-crafted compressed data will not exceed the
+ * decompressed size by very much.
+ *
+ * @param mem_ctx      TALLOC_CTX parent for the decompressed buffer.
+ * @param input_bytes  memory to be decompressed.
+ * @param input_size   length of the compressed buffer.
+ * @param output_size  expected decompressed size.
+ *
+ * @return a talloc'ed buffer exactly output_size in length, or NULL.
+ */
+
+uint8_t *lzxpress_huffman_decompress_talloc(TALLOC_CTX *mem_ctx,
+					    const uint8_t *input_bytes,
+					    size_t input_size,
+					    size_t output_size)
+{
+	ssize_t result;
+	uint8_t *output = NULL;
+	struct bitstream input = {
+		.bytes = input_bytes,
+		.byte_size = input_size
+	};
+
+	output = talloc_array(mem_ctx, uint8_t, output_size);
+	if (output == NULL) {
+		return NULL;
+	}
+
+	input.table = talloc_array(mem_ctx, uint16_t, 65536);
+	if (input.table == NULL) {
+		talloc_free(output);
+		return NULL;
+	}
+	result = lzxpress_huffman_decompress_internal(&input,
+						      output,
+						      output_size);
+	talloc_free(input.table);
+
+	if (result != output_size) {
+		talloc_free(output);
+		return NULL;
+	}
+	return output;
+}
diff --git a/lib/compression/lzxpress_huffman.h b/lib/compression/lzxpress_huffman.h
new file mode 100644
index 00000000000..c484181d504
--- /dev/null
+++ b/lib/compression/lzxpress_huffman.h
@@ -0,0 +1,38 @@
+/*
+ * Samba compression library - LGPLv3
+ *
+ * Copyright © Catalyst IT 2022
+ *
+ *  ** NOTE! The following LGPL license applies to this file.
+ *  ** It does NOT imply that all of Samba is released under the LGPL
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 3 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HAVE_LZXPRESS_HUFFMAN_H
+#define HAVE_LZXPRESS_HUFFMAN_H
+
+
+ssize_t lzxpress_huffman_decompress(const uint8_t *input,
+				    size_t input_size,
+				    uint8_t *output,
+				    size_t max_output_size);
+
+uint8_t *lzxpress_huffman_decompress_talloc(TALLOC_CTX *mem_ctx,
+					    const uint8_t *input_bytes,
+					    size_t input_size,
+					    size_t output_size);
+
+
+#endif /* HAVE_LZXPRESS_HUFFMAN_H */
diff --git a/lib/compression/tests/test_lzx_huffman.c b/lib/compression/tests/test_lzx_huffman.c
new file mode 100644
index 00000000000..43275d2d32f
--- /dev/null
+++ b/lib/compression/tests/test_lzx_huffman.c
@@ -0,0 +1,514 @@
+/*
+ * Samba compression library - LGPLv3
+ *
+ * Copyright © Catalyst IT 2022
+ *
+ * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
+ *
+ *  ** NOTE! The following LGPL license applies to this file.
+ *  ** It does NOT imply that all of Samba is released under the LGPL
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 3 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include <cmocka.h>
+#include <stdbool.h>
+#include "replace.h"
+#include <talloc.h>
+#include "lzxpress_huffman.h"
+#include "lib/util/stable_sort.h"
+#include "lib/util/data_blob.h"
+
+/* set LZXHUFF_DEBUG_FILES to true to save round-trip files in /tmp. */
+#define LZXHUFF_DEBUG_FILES false
+
+/* set LZXHUFF_DEBUG_VERBOSE to true to print more. */
+#define LZXHUFF_DEBUG_VERBOSE false
+
+
+#if LZXHUFF_DEBUG_VERBOSE
+#define debug_message(...) print_message(__VA_ARGS__)
+#else
+#define debug_message(...) /* debug_message */
+#endif
+
+
+struct lzx_pair {
+	const char *name;
+	DATA_BLOB compressed;
+	DATA_BLOB decompressed;
+};
+
+struct lzx_file_pair {
+	const char *name;
+	const char *compressed_file;
+	const char *decompressed_file;
+};
+
+
+#define DECOMP_DIR "testdata/compression/lzxpress-huffman/decompressed"
+#define COMP_DIR "testdata/compression/lzxpress-huffman/compressed"
+#define MORE_COMP_DIR "testdata/compression/lzxpress-huffman/more-compressed"
+
+
+#define VARRGH(...) __VA_ARGS__
+
+#define BLOB_FROM_ARRAY(...)                             \
+	{                                                \
+		.data = (uint8_t[]){__VA_ARGS__},          \
+		.length = sizeof((uint8_t[]){__VA_ARGS__}) \
+	}
+
+#define BLOB_FROM_STRING(s)                                      \
+	{                                                            \
+		.data = discard_const_p(uint8_t, s),		     \
+		.length = (sizeof(s) - 1)		     \
+	}
+
+
+const char * file_names[] = {
+	"27826-8.txt",
+	"5d049b4cb1bd933f5e8ex19",
+	"638e61e96d54279981c3x5",
+	"64k-minus-one-zeros",
+	"64k-plus-one-zeros",
+	"64k-zeros",
+	"96f696a4e5ce56c61a3dx10",
+	"9e0b6a12febf38e98f13",
+	"abc-times-101",
+	"abc-times-105",
+	"abc-times-200",
+	"and_rand",
+	"and_rand-128k+",
+	"b63289ccc7f218c0d56b",
+	"beta-variate1-128k+",
+	"beta-variate2-128k+",
+	"beta-variate3-128k+",
+	"decayed_alphabet_128k+",
+	"decayed_alphabet_64k",
+	"exp_shuffle",
+	"exp_shuffle-128k+",
+	"f00842317dc6d5695b02",
+	"fib_shuffle",
+	"fib_shuffle-128k+",
+	"fuzzing-0fc2d461b56cd8103c91",
+	"fuzzing-17c961778538cc10ab7c",
+	"fuzzing-3591f9dc02bb00a54b60",
+	"fuzzing-3ec3bca27bb9eb40c128",
+	"fuzzing-80b4fa18ff5f8dd04862",
+	"fuzzing-a3115a81d1ac500318f9",
+	"generate-windows-test-vectors.c",
+	"midsummer-nights-dream.txt",
+	"notes-on-the-underground.txt",
+	"pg22009.txt",
+	"repeating",
+	"repeating-exactly-64k",
+	"setup.log",
+	"skewed_choices",
+	"skewed_choices-128k+",
+	/* These ones were deathly slow in fuzzing at one point */
+	"slow-015ddc36a71412ccc50d",
+	"slow-100e9f966a7feb9ca40a",
+	"slow-2a671c3cff4f1574cbab",
+	"slow-33d90a24e70515b14cd0",
+	"slow-49d8c05261e3f412fc72",
+	"slow-50a249d2fe56873e56a0",
+	"slow-63e9f0b52235fb0129fa",
+	"slow-73b7f971d65908ac0095",
+	"slow-8b61e3dd267908544531",
+	"slow-9d1c5a079b0462986f1f",
+	"slow-aa7262a821dabdcf04a6",
+	"slow-b8a91d142b0d2af7f5ca",
+	"slow-c79142457734bbc8d575",
+	"slow-d736544545b90d83fe75",
+	"slow-e3b9bdfaed7d1a606fdb",
+	"slow-f3f1c02a9d006e5e1703",
+	"square_series",
+	"square_series-128k+",
+	"trigram_128k+",
+	"trigram_64k",
+	"trigram_sum_128k+",
+	"trigram_sum_64k",
+	NULL
+};
+
+struct lzx_pair bidirectional_pairs[] = {
+
+	{.name = "abc__100_repeats", /* [MS-XCA] 3.2 example 2. */
+	 .decompressed = BLOB_FROM_STRING(
+		 "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+		 "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+		 "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+		 "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+		 "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+		 ),
+	 .compressed = BLOB_FROM_ARRAY(
+		 /*
+		  * The 'a', 'b', and 'c' bytes are 0x61, 0x62, 0x63. No other
+		  * symbols occur. That means we need 48 0x00 bytes for the
+		  * first 96 symbol-nybbles, then some short codes, then zeros
+		  * again for the rest of the literals.
+		  */
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,
+		 0x30, 0x23, /* a_ cb */
+		 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 100 bytes */
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0, /* here end the 0-255 literals (128 bytes) */
+		 0x02, /* 'EOF' symbol 256 (byte 128 low) */
+		 0,0,0,0,0, 0,0,0,0,0, 0,                    /* 140 bytes */
+	         0,0,0,
+		 0x20, /* codepoint 287 (byte 143 high) */
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,         /* 160 bytes */
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 240 bytes */
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
+		 /*
+		  * So that's the tree.
+		  *
+		  * length 2 codes for 'c', EOF, 287
+		  * length 3 for 'a', 'b'.
+		  *
+		  * c:   00
+		  * EOF: 01
+		  * 287: 10
+		  * a:   110
+		  * b:   111
+		  *
+		  * thus the literal string "abc" is 110-111-00.
+		  *
+		  * Now for the lz77 match definitions for EOF and 287.
+		  *
+		  * Why 287? It encodes the match distance and offset.
+		  *
+		  * 287 - 256 = 31
+		  *
+		  * _length = 31 % 16 = 15
+		  * _distance = 31 / 16 = 1
+		  *
+		  * (it's easier to look at the hex, 0x11f:
+		  * 1xx means a match; x1x is _distance; xxf is _length)
+		  *
+		  * _distance 1 means a two bit distance (10 or 11; 2 or 3).
+		  * That means the next bit will be the least significant bit
+		  * of distance (1 in this case, meaning distance 3).
+		  *
+		  * if _length is 15, real length is included inline.
+		  *
+		  * 'EOF' == 256 means _length = 0, _distance = 0.
+		  *
+		  * _distance 0 means 1, so no further bits needed.
+		  * _length 0 means length 3.
+		  *
+		  * but when used as EOF, this doesn't matter.
+		  */
+		 0xa8, 0xdc, 0x00, 0x00, 0xff, 0x26, 0x01
+		  /* These remaining bytes are:
+		  *
+		  * 10101000 11011100 00000000 00000000 11111111
+		  * 00100110 00000001
+		  *
+		  * and we read them as 16 chunks (i.e. flipping odd/even order)
+		  *
+		  * 110-111-00  10-1-01-000
+		  *   a  b   c 287 | EOF
+		  *                |
+		  *                this is the 287 distance low bit.
+		  *
+		  * The last 3 bits are not used. The 287 length is sort of
+		  * out of band, coming up soon (because 287 encodes length
+		  * 15; most codes do not do this).
+		  *
+		  * 00000000 00000000
+		  *
+		  * This padding is there because the first 32 bits are read
+		  * at the beginning of decoding. If there were more things to
+		  * be encoded, they would be in here.
+		  *
+		  * 11111111
+		  *
+		  * This byte is pulled as the length for the 287 match.
+		  * Because it is 0xff, we pull a further 2 bytes for the
+		  * actual length, i.e. a 16 bit number greater than 270.
+		  *
+		  * 00000001 00100110
+		  *
+		  * that is 0x126 = 294 = the match length - 3 (because we're
+		  * encoding ["abc", <copy from 3 back, 297 chars>, EOF]).
+		  *
+		  */
+		 )
+	},
+	{.name = "abcdefghijklmnopqrstuvwxyz", /* [MS-XCA] 3.2 example 1. */
+	 .decompressed = BLOB_FROM_STRING("abcdefghijklmnopqrstuvwxyz"),
+	 .compressed = BLOB_FROM_ARRAY(
+		 /*
+		  * In this case there are no matches encoded as there are no
+		  * repeated symbols. Including the EOF, there are 27 symbols
+		  * all occuring exactly as frequently as each other (once).
+		  * From that we would expect the codes to be mostly 5 bits
+		  * long, because 27 < 2^5 (32), but greater than 2^4. And
+		  * that's what we see.
+		  */
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,
+		 /* 14 non-zero bytes for 26 letters/nibbles */
+		 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		 0x55, 0x55, 0x55, 0x45, 0x44, 0x04,
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,   /* 80 */
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,
+		 0x04,                                 /* 0x100 EOF */
+		 /* no matches */
+		 0,0,0,0,0, 0,0,0,0,0, 0,                    /* 140 */
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 240 */
+		 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
+
+		 0xd8, 0x52, 0x3e, 0xd7, 0x94, 0x11, 0x5b, 0xe9,
+		 0x19, 0x5f, 0xf9, 0xd6, 0x7c, 0xdf, 0x8d, 0x04,
+		 0x00, 0x00, 0x00, 0x00)
+	},
+	{0}
+};
+
+
+static void test_lzxpress_huffman_decompress(void **state)
+{
+	size_t i;
+	ssize_t written;
+	uint8_t *dest = NULL;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	for (i = 0; bidirectional_pairs[i].name != NULL; i++) {
+		struct lzx_pair p = bidirectional_pairs[i];
+		dest = talloc_array(mem_ctx, uint8_t, p.decompressed.length);
+
+		debug_message("%s compressed %zu decomp %zu\n", p.name,
+			      p.compressed.length,
+			      p.decompressed.length);
+
+		written = lzxpress_huffman_decompress(p.compressed.data,
+						      p.compressed.length,
+						      dest,
+						      p.decompressed.length);
+		assert_int_equal(written, p.decompressed.length);
+
+		assert_memory_equal(dest, p.decompressed.data, p.decompressed.length);
+		talloc_free(dest);
+	}
+}
+
+
+static DATA_BLOB datablob_from_file(TALLOC_CTX *mem_ctx,
+				    const char *filename)
+{
+	DATA_BLOB b = {0};
+	FILE *fh = fopen(filename, "rb");
+	int ret;
+	struct stat s;
+	size_t len;
+	if (fh == NULL) {
+		debug_message("could not open '%s'\n", filename);
+		return b;
+	}
+	ret = fstat(fileno(fh), &s);
+	if (ret != 0) {
+		fclose(fh);
+		return b;
+	}
+	b.data = talloc_array(mem_ctx, uint8_t, s.st_size);
+	if (b.data == NULL) {
+		fclose(fh);
+		return b;
+	}
+	len = fread(b.data, 1, s.st_size, fh);
+	if (ferror(fh) || len != s.st_size) {
+		TALLOC_FREE(b.data);
+	} else {
+		b.length = len;
+	}
+	fclose(fh);
+	return b;
+}
+
+
+static void test_lzxpress_huffman_decompress_files(void **state)
+{
+	size_t i;
+	int score = 0;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	for (i = 0; file_names[i] != NULL; i++) {
+		char filename[200];
+		uint8_t *dest = NULL;
+		ssize_t written;
+		TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+		struct lzx_pair p = {
+			.name = file_names[i]
+		};
+
+		debug_message("%s\n", p.name);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.decomp", DECOMP_DIR, p.name);
+
+		p.decompressed = datablob_from_file(tmp_ctx, filename);
+		assert_non_null(p.decompressed.data);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.lzhuff", COMP_DIR, p.name);
+
+		p.compressed = datablob_from_file(tmp_ctx, filename);
+		assert_non_null(p.compressed.data);
+
+		dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+
+		written = lzxpress_huffman_decompress(p.compressed.data,
+						      p.compressed.length,
+						      dest,
+						      p.decompressed.length);
+		if (written == p.decompressed.length &&
+		    memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+			debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+			score++;
+		} else {
+			debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+				      p.name);
+			debug_message("size %zd vs reference %zu\n",
+				      written, p.decompressed.length);
+		}
+		talloc_free(tmp_ctx);
+	}
+	debug_message("%d/%zu correct\n", score, i);
+	assert_int_equal(score, i);
+}
+
+
+static void test_lzxpress_huffman_decompress_more_compressed_files(void **state)
+{
+	/*
+	 * This tests the decompression of files that have been compressed on
+	 * Windows with the level turned up (to 1, default for MS-XCA is 0).
+	 *
+	 * The format is identical, but it will have tried harder to find
+	 * matches.
+	 */
+	size_t i;
+	int score = 0;
+	int found = 0;
+	TALLOC_CTX *mem_ctx = talloc_new(NULL);
+	for (i = 0; file_names[i] != NULL; i++) {
+		char filename[200];
+		uint8_t *dest = NULL;
+		ssize_t written;
+		TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+		struct lzx_pair p = {
+			.name = file_names[i]
+		};
+
+		debug_message("%s\n", p.name);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.decomp", DECOMP_DIR, p.name);
+
+		p.decompressed = datablob_from_file(tmp_ctx, filename);
+		assert_non_null(p.decompressed.data);
+
+		snprintf(filename, sizeof(filename),
+			 "%s/%s.lzhuff", MORE_COMP_DIR, p.name);
+
+		p.compressed = datablob_from_file(tmp_ctx, filename);
+		if (p.compressed.data == NULL) {
+			/*
+			 * We don't have all the vectors in the
+			 * more-compressed directory, which is OK, we skip
+			 * them.
+			 */
+			continue;
+		}
+		found++;
+		dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+
+		written = lzxpress_huffman_decompress(p.compressed.data,
+						      p.compressed.length,
+						      dest,
+						      p.decompressed.length);
+		if (written == p.decompressed.length &&
+		    memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+			debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+			score++;
+		} else {
+			debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+				      p.name);
+			debug_message("size %zd vs reference %zu\n",
+				      written, p.decompressed.length);
+		}
+		talloc_free(tmp_ctx);
+	}
+	debug_message("%d/%d correct\n", score, found);
+	assert_int_equal(score, found);
+}
+
+
+static void test_lzxpress_huffman_decompress_empty_or_null(void **state)
+{
+	/*
+	 * We expect these to fail with a -1, except the last one.
+	 */
+	ssize_t ret;
+	const uint8_t *input = bidirectional_pairs[0].compressed.data;
+	size_t ilen = bidirectional_pairs[0].compressed.length;
+	size_t olen = bidirectional_pairs[0].decompressed.length;
+	uint8_t output[olen];
+
+	ret = lzxpress_huffman_decompress(input, 0, output, olen);
+	assert_int_equal(ret, -1LL);
+	ret = lzxpress_huffman_decompress(input, ilen, output, 0);
+	assert_int_equal(ret, -1LL);
+
+	ret = lzxpress_huffman_decompress(NULL, ilen, output, olen);
+	assert_int_equal(ret, -1LL);
+	ret = lzxpress_huffman_decompress(input, ilen, NULL, olen);
+	assert_int_equal(ret, -1LL);
+
+	ret = lzxpress_huffman_decompress(input, ilen, output, olen);
+	assert_int_equal(ret, olen);
+}
+
+
+int main(void) {
+	const struct CMUnitTest tests[] = {
+		cmocka_unit_test(test_lzxpress_huffman_decompress_files),
+		cmocka_unit_test(test_lzxpress_huffman_decompress_more_compressed_files),
+		cmocka_unit_test(test_lzxpress_huffman_decompress),
+		cmocka_unit_test(test_lzxpress_huffman_decompress_empty_or_null),
+	};
+	if (!isatty(1)) {
+		cmocka_set_message_output(CM_OUTPUT_SUBUNIT);
+	}
+
+	return cmocka_run_group_tests(tests, NULL, NULL);
+}
diff --git a/lib/compression/wscript_build b/lib/compression/wscript_build
index 7ad533340de..df931bef365 100644
--- a/lib/compression/wscript_build
+++ b/lib/compression/wscript_build
@@ -1,6 +1,13 @@
 #!/usr/bin/env python
 
 bld.SAMBA_SUBSYSTEM('LZXPRESS',
-        deps='replace',
-	source='lzxpress.c'
-	)
+                    deps='replace talloc stable_sort samba-debug',
+                    source='lzxpress.c lzxpress_huffman.c'
+                    )
+
+bld.SAMBA_BINARY('test_lzx_huffman',
+                 source='tests/test_lzx_huffman.c',
+                 deps=('cmocka replace LZXPRESS'
+                       ' samba-util'),
+                 local_include=False,
+                 for_selftest=True)
author	Douglas Bagnall <douglas.bagnall@catalyst.net.nz>	2022-11-17 14:24:52 +1300
committer	Joseph Sutton <jsutton@samba.org>	2022-12-01 22:56:39 +0000
commit	f86035c65bf4ae41a2c210dbff132dbce499f03c (patch)
tree	357a873b7ffc81b5ce58b44dff99c4dbf2e76625 /lib/compression
parent	bd35feaf7ed649968465a2643b42982d3e6f3d56 (diff)
download	samba-f86035c65bf4ae41a2c210dbff132dbce499f03c.tar.gz