1 files changed, 452 insertions, 0 deletions
diff --git a/examples/pigz.c b/examples/pigz.c
new file mode 100644
index 0000000..42794d0
--- /dev/null
+++ b/examples/pigz.c
@@ -0,0 +1,452 @@
+/* pigz.c -- parallel implementation of gzip
+ * Copyright (C) 2007 Mark Adler
+ * Version 1.1  28 January 2007  Mark Adler
+ */
+
+/* Version history:
+   1.0  17 Jan 2007  First version
+   1.1  28 Jan 2007  Avoid void * arithmetic (some compilers don't get that)
+                     Add note about requiring zlib 1.2.3
+                     Allow compression level 0 (no compression)
+                     Completely rewrite parallelism -- add a write thread
+                     Use deflateSetDictionary() to make use of history
+                     Tune argument defaults to best performance on four cores
+ */
+
+/*
+   pigz compresses from stdin to stdout using threads to make use of multiple
+   processors and cores.  The input is broken up into 128 KB chunks, and each
+   is compressed separately.  The CRC for each chunk is also calculated
+   separately.  The compressed chunks are written in order to the output,
+   and the overall CRC is calculated from the CRC's of the chunks.
+
+   The compressed data format generated is the gzip format using the deflate
+   compression method.  First a gzip header is written, followed by raw deflate
+   partial streams.  They are partial, in that they do not have a terminating
+   block.  At the end, the deflate stream is terminated with a final empty
+   static block, and lastly a gzip trailer is written with the CRC and the
+   number of input bytes.
+
+   Each raw deflate partial stream is terminated by an empty stored block
+   (using the Z_SYNC_FLUSH option of zlib), in order to end that partial
+   bit stream at a byte boundary.  That allows the partial streams to be
+   concantenated simply as sequences of bytes.  This adds a very small four
+   or five byte overhead to the output for each input chunk.
+
+   zlib's crc32_combine() routine allows the calcuation of the CRC of the
+   entire input using the independent CRC's of the chunks.  pigz requires zlib
+   version 1.2.3 or later, since that is the first version that provides the
+   crc32_combine() function.
+
+   pigz uses the POSIX pthread library for thread control and communication.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include "zlib.h"
+
+#define local static
+
+/* exit with error */
+local void bail(char *msg)
+{
+    fprintf(stderr, "pigz abort: %s\n", msg);
+    exit(1);
+}
+
+/* read up to len bytes into buf, repeating read() calls as needed */
+local size_t readn(int desc, unsigned char *buf, size_t len)
+{
+    ssize_t ret;
+    size_t got;
+
+    got = 0;
+    while (len) {
+        ret = read(desc, buf, len);
+        if (ret < 0)
+            bail("read error");
+        if (ret == 0)
+            break;
+        buf += ret;
+        len -= ret;
+        got += ret;
+    }
+    return got;
+}
+
+/* write len bytes, repeating write() calls as needed */
+local void writen(int desc, unsigned char *buf, size_t len)
+{
+    ssize_t ret;
+
+    while (len) {
+        ret = write(desc, buf, len);
+        if (ret < 1)
+            bail("write error");
+        buf += ret;
+        len -= ret;
+    }
+}
+
+/* a flag variable for communication between two threads */
+struct flag {
+    int value;              /* value of flag */
+    pthread_mutex_t lock;   /* lock for checking and changing flag */
+    pthread_cond_t cond;    /* condition for signaling on flag change */
+};
+
+/* initialize a flag for use, starting with value val */
+local void flag_init(struct flag *me, int val)
+{
+    me->value = val;
+    pthread_mutex_init(&(me->lock), NULL);
+    pthread_cond_init(&(me->cond), NULL);
+}
+
+/* set the flag to val, signal another process that may be waiting for it */
+local void flag_set(struct flag *me, int val)
+{
+    pthread_mutex_lock(&(me->lock));
+    me->value = val;
+    pthread_cond_signal(&(me->cond));
+    pthread_mutex_unlock(&(me->lock));
+}
+
+/* if it isn't already, wait for some other thread to set the flag to val */
+local void flag_wait(struct flag *me, int val)
+{
+    pthread_mutex_lock(&(me->lock));
+    while (me->value != val)
+        pthread_cond_wait(&(me->cond), &(me->lock));
+    pthread_mutex_unlock(&(me->lock));
+}
+
+/* if flag is equal to val, wait for some other thread to change it */
+local void flag_wait_not(struct flag *me, int val)
+{
+    pthread_mutex_lock(&(me->lock));
+    while (me->value == val)
+        pthread_cond_wait(&(me->cond), &(me->lock));
+    pthread_mutex_unlock(&(me->lock));
+}
+
+/* clean up the flag when done with it */
+local void flag_done(struct flag *me)
+{
+    pthread_cond_destroy(&(me->cond));
+    pthread_mutex_destroy(&(me->lock));
+}
+
+/* a unit of work to feed to compress_thread() -- it is assumed that the out
+   buffer is large enough to hold the maximum size len bytes could deflate to,
+   plus five bytes for the final sync marker */
+struct work {
+    size_t len;                 /* length of input */
+    unsigned long crc;          /* crc of input */
+    unsigned char *buf;         /* input */
+    unsigned char *out;         /* space for output (guaranteed big enough) */
+    z_stream strm;              /* pre-initialized z_stream */
+    struct flag busy;           /* busy flag indicating work unit in use */
+    pthread_t comp;             /* this compression thread */
+};
+
+/* busy flag values */
+#define IDLE 0          /* compress and writing done -- can start compress */
+#define COMP 1          /* compress -- input and output buffers in use */
+#define WRITE 2         /* compress done, writing output -- can read input */
+
+/* read-only globals (set by main/read thread before others started) */
+local int ind;              /* input file descriptor */
+local int outd;             /* output file descriptor */
+local int level;            /* compression level */
+local int procs;            /* number of compression threads (>= 2) */
+local size_t size;          /* uncompressed input size per thread (>= 32K) */
+local struct work *jobs;    /* work units: jobs[0..procs-1] */
+
+/* next and previous jobs[] indices */
+#define NEXT(n) ((n) == procs - 1 ? 0 : (n) + 1)
+#define PREV(n) ((n) == 0 ? procs - 1 : (n) - 1)
+
+/* sliding dictionary size for deflate */
+#define DICT 32768U
+
+/* largest power of 2 that fits in an unsigned int -- used to limit requests
+   to zlib functions that use unsigned int lengths */
+#define MAX ((((unsigned)-1) >> 1) + 1)
+
+/* compress thread: compress the input in the provided work unit and compute
+   its crc -- assume that the amount of space at job->out is guaranteed to be
+   enough for the compressed output, as determined by the maximum expansion
+   of deflate compression -- use the input in the previous work unit (if there
+   is one) to set the deflate dictionary for better compression */
+local void *compress_thread(void *arg)
+{
+    size_t len;                     /* input length for this work unit */
+    unsigned long crc;              /* crc of input data */
+    struct work *prev;              /* previous work unit */
+    struct work *job = arg;         /* work unit for this thread */
+    z_stream *strm = &(job->strm);  /* zlib stream for this work unit */
+
+    /* reset state for a new compressed stream */
+    (void)deflateReset(strm);
+
+    /* initialize input, output, and crc */
+    strm->next_in = job->buf;
+    strm->next_out = job->out;
+    len = job->len;
+    crc = crc32(0L, Z_NULL, 0);
+
+    /* set dictionary if this isn't the first work unit, and if we will be
+       compressing something (the read thread assures that the dictionary
+       data in the previous work unit is still there) */
+    prev = jobs + PREV(job - jobs);
+    if (prev->buf != NULL && len != 0)
+        deflateSetDictionary(strm, prev->buf + (size - DICT), DICT);
+
+    /* run MAX-sized amounts of input through deflate and crc32 -- this loop
+       is needed for those cases where the integer type is smaller than the
+       size_t type, or when len is close to the limit of the size_t type */
+    while (len > MAX) {
+        strm->avail_in = MAX;
+        strm->avail_out = (unsigned)-1;
+        crc = crc32(crc, strm->next_in, strm->avail_in);
+        (void)deflate(strm, Z_NO_FLUSH);
+        len -= MAX;
+    }
+
+    /* run last piece through deflate and crc32, follow with a sync marker */
+    if (len) {
+        strm->avail_in = len;
+        strm->avail_out = (unsigned)-1;
+        crc = crc32(crc, strm->next_in, strm->avail_in);
+        (void)deflate(strm, Z_SYNC_FLUSH);
+    }
+
+    /* don't need to Z_FINISH, since we'd delete the last two bytes anyway */
+
+    /* return result */
+    job->crc = crc;
+    return NULL;
+}
+
+/* put a 4-byte integer into a byte array in LSB order */
+#define PUT4(a,b) (*(a)=(b),(a)[1]=(b)>>8,(a)[2]=(b)>>16,(a)[3]=(b)>>24)
+
+/* write thread: wait for compression threads to complete, write output in
+   order, also write gzip header and trailer around the compressed data */
+local void *write_thread(void *arg)
+{
+    int n;                          /* compress thread index */
+    size_t len;                     /* length of input processed */
+    unsigned long tot;              /* total uncompressed size (overflow ok) */
+    unsigned long crc;              /* CRC-32 of uncompressed data */
+    unsigned char wrap[10];         /* gzip header or trailer */
+
+    /* write simple gzip header */
+    memcpy(wrap, "\037\213\10\0\0\0\0\0\0\3", 10);
+    wrap[8] = level == 9 ? 2 : (level == 1 ? 4 : 0);
+    writen(outd, wrap, 10);
+
+    /* process output of compress threads until end of input */    
+    tot = 0;
+    crc = crc32(0L, Z_NULL, 0);
+    n = 0;
+    do {
+        /* wait for compress thread to start, then wait to complete */
+        flag_wait(&(jobs[n].busy), COMP);
+        pthread_join(jobs[n].comp, NULL);
+
+        /* now that compress is done, allow read thread to use input buffer */
+        flag_set(&(jobs[n].busy), WRITE);
+
+        /* write compressed data and update length and crc */
+        writen(outd, jobs[n].out, jobs[n].strm.next_out - jobs[n].out);
+        len = jobs[n].len;
+        tot += len;
+        crc = crc32_combine(crc, jobs[n].crc, len);
+
+        /* release this work unit and go to the next work unit */
+        flag_set(&(jobs[n].busy), IDLE);
+        n = NEXT(n);
+
+        /* an input buffer less than size in length indicates end of input */
+    } while (len == size);
+
+    /* write final static block and gzip trailer (crc and len mod 2^32) */
+    wrap[0] = 3;  wrap[1] = 0;
+    PUT4(wrap + 2, crc);
+    PUT4(wrap + 6, tot);
+    writen(outd, wrap, 10);
+    return NULL;
+}
+
+/* one-time initialization of a work unit -- this is where we set the deflate
+   compression level and request raw deflate, and also where we set the size
+   of the output buffer to guarantee enough space for a worst-case deflate
+   ending with a Z_SYNC_FLUSH */
+local void job_init(struct work *job)
+{
+    int ret;                        /* deflateInit2() return value */
+
+    job->buf = malloc(size);
+    job->out = malloc(size + (size >> 11) + 10);
+    job->strm.zfree = Z_NULL;
+    job->strm.zalloc = Z_NULL;
+    job->strm.opaque = Z_NULL;
+    ret = deflateInit2(&(job->strm), level, Z_DEFLATED, -15, 8,
+                       Z_DEFAULT_STRATEGY);
+    if (job->buf == NULL || job->out == NULL || ret != Z_OK)
+        bail("not enough memory");
+}
+
+/* compress ind to outd in the gzip format, using multiple threads for the
+   compression and crc calculation and another thread for writing the output --
+   the read thread is the main thread */
+local void read_thread(void)
+{
+    int n;                          /* general index */
+    size_t got;                     /* amount read */
+    pthread_attr_t attr;            /* thread attributes (left at defaults) */
+    pthread_t write;                /* write thread */
+
+    /* set defaults (not all pthread implementations default to joinable) */
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+
+    /* allocate and set up work list (individual work units will be initialized
+       as needed, in case the input is short), assure that allocation size
+       arithmetic does not overflow */
+    if (size + (size >> 11) + 10 < (size >> 11) + 10 ||
+        (ssize_t)(size + (size >> 11) + 10) < 0 ||
+        ((size_t)0 - 1) / procs <= sizeof(struct work) ||
+        (jobs = malloc(procs * sizeof(struct work))) == NULL)
+        bail("not enough memory");
+    for (n = 0; n < procs; n++) {
+        jobs[n].buf = NULL;
+        flag_init(&(jobs[n].busy), IDLE);
+    }
+
+    /* start write thread */
+    pthread_create(&write, &attr, write_thread, NULL);
+
+    /* read from input and start compress threads (write thread will pick up
+       the output of the compress threads) */
+    n = 0;
+    do {
+        /* initialize this work unit if it's the first time it's used */
+        if (jobs[n].buf == NULL)
+            job_init(jobs + n);
+
+        /* read input data, but wait for last compress on this work unit to be
+           done, and wait for the dictionary to be used by the last compress on
+           the next work unit */
+        flag_wait_not(&(jobs[n].busy), COMP);
+        flag_wait_not(&(jobs[NEXT(n)].busy), COMP);
+        got = readn(ind, jobs[n].buf, size);
+
+        /* start compress thread, but wait for write to be done first */
+        flag_wait(&(jobs[n].busy), IDLE);
+        jobs[n].len = got;
+        pthread_create(&(jobs[n].comp), &attr, compress_thread, jobs + n);
+
+        /* mark work unit so write thread knows compress was started */
+        flag_set(&(jobs[n].busy), COMP);
+
+        /* go to the next work unit */
+        n = NEXT(n);
+
+        /* do until end of input, indicated by a read less than size */
+    } while (got == size);
+
+    /* wait for the write thread to complete -- the write thread will join with
+       all of the compress threads, so this waits for all of the threads to
+       complete */
+    pthread_join(write, NULL);
+
+    /* free up all requested resources and return */
+    for (n = procs - 1; n >= 0; n--) {
+        flag_done(&(jobs[n].busy));
+        (void)deflateEnd(&(jobs[n].strm));
+        free(jobs[n].out);
+        free(jobs[n].buf);
+    }
+    free(jobs);
+    pthread_attr_destroy(&attr);
+}
+
+/* Process arguments for level, size, and procs, compress from stdin to
+   stdout in the gzip format.  Note that procs must be at least two in
+   order to provide a dictionary in one work unit for the other work
+   unit, and that size must be at least 32K to store a full dictionary. */
+int main(int argc, char **argv)
+{
+    int n;                          /* general index */
+    int get;                        /* command line parameters to get */
+    char *arg;                      /* command line argument */
+
+    /* set defaults -- 32 processes and 128K buffers was found to provide
+       good utilization of four cores (about 97%) and balanced the overall
+       execution time impact of more threads against more dictionary
+       processing for a fixed amount of memory -- the memory usage for these
+       settings and full use of all work units (at least 4 MB of input) is
+       16.2 MB
+       */
+    level = Z_DEFAULT_COMPRESSION;
+    procs = 32;
+    size = 131072UL;
+
+    /* process command-line arguments */
+    get = 0;
+    for (n = 1; n < argc; n++) {
+        arg = argv[n];
+        if (*arg == '-') {
+            while (*++arg)
+                if (*arg >= '0' && *arg <= '9')     /* compression level */
+                    level = *arg - '0';
+                else if (*arg == 'b')               /* chunk size in K */
+                    get |= 1;
+                else if (*arg == 'p')               /* number of processes */
+                    get |= 2;
+                else if (*arg == 'h') {             /* help */
+                    fputs("usage: pigz [-0..9] [-b blocksizeinK]", stderr);
+                    fputs(" [-p processes] < foo > foo.gz\n", stderr);
+                    return 0;
+                }
+                else
+                    bail("invalid option");
+        }
+        else if (get & 1) {
+            if (get & 2)
+                bail("you need to separate the -b and -p options");
+            size = (size_t)(atol(arg)) << 10;       /* chunk size */
+            if (size < DICT)
+                bail("invalid option");
+            get = 0;
+        }
+        else if (get & 2) {
+            procs = atoi(arg);                      /* processes */
+            if (procs < 2)
+                bail("invalid option");
+            get = 0;
+        }
+        else
+            bail("invalid option (you need to pipe input and output)");
+    }
+    if (get)
+        bail("missing option argument");
+
+    /* do parallel compression from stdin to stdout (the read thread starts up
+       the write thread and the compression threads, and they all join before
+       the read thread returns) */
+    ind = 0;
+    outd = 1;
+    read_thread();
+
+    /* done */
+    return 0;
+}