Diffstat (limited to 'src/c/gf-complete/src/gf.c')
-rw-r--r--  src/c/gf-complete/src/gf.c  1076
1 files changed, 1076 insertions, 0 deletions
diff --git a/src/c/gf-complete/src/gf.c b/src/c/gf-complete/src/gf.c
new file mode 100644
index 0000000..ac400ba
--- /dev/null
+++ b/src/c/gf-complete/src/gf.c
@@ -0,0 +1,1076 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf.c
+ *
+ * Generic routines for Galois fields
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+int _gf_errno = GF_E_DEFAULT;
+
+void gf_error()
+{
+ char *s;
+
+ switch(_gf_errno) {
+ case GF_E_DEFAULT: s = "No Error."; break;
+ case GF_E_TWOMULT: s = "Cannot specify two -m's."; break;
+ case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break;
+ case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break;
+ case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break;
+ case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break;
+ case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break;
+ case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break;
+ case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break;
+ case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break;
+ case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break;
+ case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break;
+ case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break;
+ case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break;
+ case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break;
+ case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break;
+ case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. (Prim-poly & 0xfffe000000000000ULL) must equal 0."; break;
+ case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break;
+ case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break;
+ case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
+ case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
+ case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
+ case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break;
+ case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
+ case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
+ case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
+ case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break;
+ case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break;
+ case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break;
+ case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
+ case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
+ case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
+ case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break;
+ case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break;
+ case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
+ case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
+ case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break;
+ case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
+ case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
+ case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
+ case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
+ case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break;
+ case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
+ case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break;
+ case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
+ case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
+ case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break;
+ case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
+ case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break;
+ case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
+ case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
+ case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
+ case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break;
+ case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break;
+ case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break;
+ case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
+ case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
+ case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
+ case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break;
+ case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
+ case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break;
+ case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break;
+ case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
+ case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
+ case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break;
+ case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
+ case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
+ case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break;
+ case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
+ case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break;
+ case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
+ case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break;
+ case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break;
+ case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
+ case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
+ case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break;
+ case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break;
+ case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
+ case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
+ case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break;
+ case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break;
+ case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break;
+ case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break;
+ case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
+ case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break;
+ case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
+ case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
+ case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
+ case GF_E_UNK_REG: s = "Unknown region type."; break;
+ case GF_E_UNK_DIV: s = "Unknown division type."; break;
+ default: s = "Undefined error.";
+ }
+
+ fprintf(stderr, "%s\n", s);
+}
+
+uint64_t gf_composite_get_default_poly(gf_t *base)
+{
+ gf_internal_t *h;
+ int rv;
+
+ h = (gf_internal_t *) base->scratch;
+ if (h->w == 4) {
+ if (h->mult_type == GF_MULT_COMPOSITE) return 0;
+ if (h->prim_poly == 0x13) return 2;
+ return 0;
+ }
+ if (h->w == 8) {
+ if (h->mult_type == GF_MULT_COMPOSITE) return 0;
+ if (h->prim_poly == 0x11d) return 3;
+ return 0;
+ }
+ if (h->w == 16) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ rv = gf_composite_get_default_poly(h->base_gf);
+ if (rv != h->prim_poly) return 0;
+ if (rv == 3) return 0x105;
+ return 0;
+ } else {
+ if (h->prim_poly == 0x1100b) return 2;
+ if (h->prim_poly == 0x1002d) return 7;
+ return 0;
+ }
+ }
+ if (h->w == 32) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ rv = gf_composite_get_default_poly(h->base_gf);
+ if (rv != h->prim_poly) return 0;
+ if (rv == 2) return 0x10005;
+ if (rv == 7) return 0x10008;
+ if (rv == 0x105) return 0x10002;
+ return 0;
+ } else {
+ if (h->prim_poly == 0x400007) return 2;
+ if (h->prim_poly == 0xc5) return 3;
+ return 0;
+ }
+ }
+ if (h->w == 64) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ rv = gf_composite_get_default_poly(h->base_gf);
+ if (rv != h->prim_poly) return 0;
+ if (rv == 3) return 0x100000009ULL;
+ if (rv == 2) return 0x100000004ULL;
+ if (rv == 0x10005) return 0x100000003ULL;
+ if (rv == 0x10002) return 0x100000005ULL;
+ if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x100000003 works too,
+ but I want to differentiate cases). */
+ return 0;
+ } else {
+ if (h->prim_poly == 0x1bULL) return 2;
+ return 0;
+ }
+ }
+ return 0;
+}
+
+int gf_error_check(int w, int mult_type, int region_type, int divide_type,
+ int arg1, int arg2, uint64_t poly, gf_t *base)
+{
+ int sse3 = 0;
+ int sse2 = 0;
+ int pclmul = 0;
+ int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
+ gf_internal_t *sub;
+
+ rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
+ rquad = (region_type & GF_REGION_QUAD_TABLE);
+ rlazy = (region_type & GF_REGION_LAZY);
+ rsimd = (region_type & GF_REGION_SIMD);
+ rnosimd = (region_type & GF_REGION_NOSIMD);
+ raltmap = (region_type & GF_REGION_ALTMAP);
+ rcauchy = (region_type & GF_REGION_CAUCHY);
+
+ if (divide_type != GF_DIVIDE_DEFAULT &&
+ divide_type != GF_DIVIDE_MATRIX &&
+ divide_type != GF_DIVIDE_EUCLID) {
+ _gf_errno = GF_E_UNK_DIV;
+ return 0;
+ }
+
+ tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
+ GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
+ GF_REGION_CAUCHY );
+ if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
+
+#ifdef INTEL_SSE2
+ sse2 = 1;
+#endif
+
+#ifdef INTEL_SSSE3
+ sse3 = 1;
+#endif
+
+#ifdef INTEL_SSE4_PCLMUL
+ pclmul = 1;
+#endif
+
+#ifdef ARM_NEON
+ pclmul = 1;
+ sse3 = 1;
+#endif
+
+
+ if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }
+
+ if (mult_type != GF_MULT_COMPOSITE && w < 64) {
+ if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; }
+ }
+
+ if (mult_type == GF_MULT_DEFAULT) {
+ if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; }
+ if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; }
+ if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; }
+ return 1;
+ }
+
+ if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; }
+ if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; }
+ if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; }
+ if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; }
+
+ if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE &&
+ mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
+ _gf_errno = GF_E_ARG1SET;
+ return 0;
+ }
+
+ if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
+ _gf_errno = GF_E_ARG2SET;
+ return 0;
+ }
+
+ if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; }
+
+ if (rdouble) {
+ if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; }
+ if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
+ if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; }
+ if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
+ if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; }
+ return 1;
+ }
+
+ if (rquad) {
+ if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
+ if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; }
+ if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
+ return 1;
+ }
+
+ if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; }
+
+ if (mult_type == GF_MULT_SHIFT) {
+ if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_CARRY_FREE) {
+ if (w != 4 && w != 8 && w != 16 &&
+ w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
+ if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; }
+ if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; }
+ if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; }
+ if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; }
+ if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
+ if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
+ if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_CARRY_FREE_GK) {
+ if (w != 4 && w != 8 && w != 16 &&
+ w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
+ if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
+ if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
+ if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; }
+ if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
+ || mult_type == GF_MULT_LOG_ZERO_EXT ) {
+ if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; }
+ if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }
+
+ if (mult_type == GF_MULT_LOG_TABLE) return 1;
+
+ if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; }
+
+ if (mult_type == GF_MULT_LOG_ZERO) return 1;
+
+ if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_GROUP) {
+ if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; }
+ if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; }
+ if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; }
+ if (w == 128 && (arg1 != 4 ||
+ (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
+ if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; }
+ if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; }
+ if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_TABLE) {
+ if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; }
+ if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; }
+ if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
+ if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_SPLIT_TABLE) {
+ if (arg1 > arg2) {
+ tmp = arg1;
+ arg1 = arg2;
+ arg2 = tmp;
+ }
+ if (w == 8) {
+ if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; }
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; }
+ } else if (w == 16) {
+ if ((arg1 == 8 && arg2 == 8) ||
+ (arg1 == 8 && arg2 == 16)) {
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; }
+ } else if (arg1 == 4 && arg2 == 16) {
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ } else { _gf_errno = GF_E_SP_16AR; return 0; }
+ } else if (w == 32) {
+ if ((arg1 == 8 && arg2 == 8) ||
+ (arg1 == 8 && arg2 == 32) ||
+ (arg1 == 16 && arg2 == 32)) {
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; }
+ } else if (arg1 == 4 && arg2 == 32) {
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; }
+ if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; }
+ } else { _gf_errno = GF_E_SP_32AR; return 0; }
+ } else if (w == 64) {
+ if ((arg1 == 8 && arg2 == 8) ||
+ (arg1 == 8 && arg2 == 64) ||
+ (arg1 == 16 && arg2 == 64)) {
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; }
+ } else if (arg1 == 4 && arg2 == 64) {
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; }
+ if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; }
+ } else { _gf_errno = GF_E_SP_64AR; return 0; }
+ } else if (w == 128) {
+ if (arg1 == 8 && arg2 == 128) {
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; }
+ } else if (arg1 == 4 && arg2 == 128) {
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; }
+ if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; }
+ } else { _gf_errno = GF_E_SP128AR; return 0; }
+ } else { _gf_errno = GF_E_SPLIT_W; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_COMPOSITE) {
+ if (w != 8 && w != 16 && w != 32
+ && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; }
+ if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; }
+ if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; }
+ if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; }
+ if (base != NULL) {
+ sub = (gf_internal_t *) base->scratch;
+ if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; }
+ if (poly == 0) {
+ if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; }
+ }
+ }
+ return 1;
+ }
+
+ _gf_errno = GF_E_UNKNOWN;
+ return 0;
+}
+
+int gf_scratch_size(int w,
+ int mult_type,
+ int region_type,
+ int divide_type,
+ int arg1,
+ int arg2)
+{
+ if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0;
+
+ switch(w) {
+ case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+ case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+ case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+ case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+ case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+ case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+ default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
+ }
+}
+
+extern int gf_size(gf_t *gf)
+{
+ gf_internal_t *h;
+ int s;
+
+ s = sizeof(gf_t);
+ h = (gf_internal_t *) gf->scratch;
+ s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2);
+ if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf);
+ return s;
+}
+
+
+int gf_init_easy(gf_t *gf, int w)
+{
+ return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
+ 0, 0, 0, NULL, NULL);
+}
+
+/* Allen: This function stores the configuration in gf's scratch memory and
+ then calls the relevant REAL init function for the word size. It is
+ probably done this way to consolidate the aspects of initialization that
+ don't rely on the word size, and to let the per-w routines take care of
+ the word-size-specific stuff. */
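+
+/* A minimal usage sketch of the two init paths (illustrative only; it
+ assumes that prim_poly = 0 selects the library's default polynomial for
+ the chosen w, and that a SPLIT 4/16 table is valid for w = 16, as the
+ error checks above suggest):
+
+ gf_t gf;
+
+ // Defaults for GF(2^16):
+ if (!gf_init_easy(&gf, 16)) { gf_error(); exit(1); }
+ uint32_t p = gf.multiply.w32(&gf, 0x1234, 0x5678);
+ gf_free(&gf, 0);
+
+ // Explicit method (SPLIT 4,16), library-allocated scratch:
+ if (!gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, GF_REGION_DEFAULT,
+ GF_DIVIDE_DEFAULT, 0, 4, 16, NULL, NULL)) { gf_error(); exit(1); }
+ gf_free(&gf, 0);
+*/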
+
+int gf_init_hard(gf_t *gf, int w, int mult_type,
+ int region_type,
+ int divide_type,
+ uint64_t prim_poly,
+ int arg1, int arg2,
+ gf_t *base_gf,
+ void *scratch_memory)
+{
+ int sz;
+ gf_internal_t *h;
+
+ if (gf_error_check(w, mult_type, region_type, divide_type,
+ arg1, arg2, prim_poly, base_gf) == 0) return 0;
+
+ sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
+ if (sz <= 0) return 0; /* This shouldn't happen, as all errors should get caught
+ in gf_error_check() */
+
+ if (scratch_memory == NULL) {
+ h = (gf_internal_t *) malloc(sz);
+ if (h == NULL) return 0; /* malloc failed */
+ h->free_me = 1;
+ } else {
+ h = scratch_memory;
+ h->free_me = 0;
+ }
+ gf->scratch = (void *) h;
+ h->mult_type = mult_type;
+ h->region_type = region_type;
+ h->divide_type = divide_type;
+ h->w = w;
+ h->prim_poly = prim_poly;
+ h->arg1 = arg1;
+ h->arg2 = arg2;
+ h->base_gf = base_gf;
+ h->private = (void *) gf->scratch;
+ h->private = (uint8_t *)h->private + (sizeof(gf_internal_t));
+ gf->extract_word.w32 = NULL;
+
+ switch(w) {
+ case 4: return gf_w4_init(gf);
+ case 8: return gf_w8_init(gf);
+ case 16: return gf_w16_init(gf);
+ case 32: return gf_w32_init(gf);
+ case 64: return gf_w64_init(gf);
+ case 128: return gf_w128_init(gf);
+ default: return gf_wgen_init(gf);
+ }
+}
+
+int gf_free(gf_t *gf, int recursive)
+{
+ gf_internal_t *h;
+
+ h = (gf_internal_t *) gf->scratch;
+ if (recursive && h->base_gf != NULL) {
+ gf_free(h->base_gf, 1);
+ free(h->base_gf);
+ }
+ if (h->free_me) free(h);
+ return 0; /* Making compiler happy */
+}
+
+void gf_alignment_error(char *s, int a)
+{
+ fprintf(stderr, "Alignment error in %s:\n", s);
+ fprintf(stderr, " The source and destination buffers must be aligned to each other,\n");
+ fprintf(stderr, " and they must be aligned to a %d-byte address.\n", a);
+ assert(0);
+}
+
+static
+void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) {
+ int cols, i, j;
+ uint32_t tmp;
+
+ cols = rows;
+
+ for (i = 0; i < rows; i++) inv[i] = (1 << i);
+
+ /* First -- convert into upper triangular */
+
+ for (i = 0; i < cols; i++) {
+
+ /* Swap rows if we have a zero i,i element. If we can't swap, then the
+ matrix was not invertible. */
+
+ if ((mat[i] & (1 << i)) == 0) {
+ for (j = i+1; j < rows && (mat[j] & (1 << i)) == 0; j++) ;
+ if (j == rows) {
+ fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n");
+ assert(0);
+ }
+ tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp;
+ tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp;
+ }
+
+ /* Now for each j>i, add A_ji*Ai to Aj */
+ for (j = i+1; j != rows; j++) {
+ if ((mat[j] & (1 << i)) != 0) {
+ mat[j] ^= mat[i];
+ inv[j] ^= inv[i];
+ }
+ }
+ }
+
+ /* Now the matrix is upper triangular. Back-substitute from the bottom row
+ up, clearing the entries above the diagonal. */
+
+ for (i = rows-1; i >= 0; i--) {
+ for (j = 0; j < i; j++) {
+ if (mat[j] & (1 << i)) {
+ /* mat[j] ^= mat[i]; */
+ inv[j] ^= inv[i];
+ }
+ }
+ }
+}
+
+uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp)
+{
+ uint32_t mat[32], inv[32], mask;
+ int i;
+
+ mask = (w == 32) ? 0xffffffff : (1 << w) - 1;
+ for (i = 0; i < w; i++) {
+ mat[i] = y;
+
+ if (y & (1 << (w-1))) {
+ y = y << 1;
+ y = ((y ^ pp) & mask);
+ } else {
+ y = y << 1;
+ }
+ }
+
+ gf_invert_binary_matrix(mat, inv, w);
+ return inv[0];
+}
+
+void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
+{
+ uint64_t a, prod;
+ int xor;
+ uint64_t *s64, *d64, *top;
+
+ s64 = rd->s_start;
+ d64 = rd->d_start;
+ top = rd->d_top;
+ xor = rd->xor;
+
+ if (xor) {
+ while (d64 != top) {
+ a = *s64;
+ prod = base[a >> 48];
+ a <<= 16;
+ prod <<= 16;
+ prod ^= base[a >> 48];
+ a <<= 16;
+ prod <<= 16;
+ prod ^= base[a >> 48];
+ a <<= 16;
+ prod <<= 16;
+ prod ^= base[a >> 48];
+ prod ^= *d64;
+ *d64 = prod;
+ s64++;
+ d64++;
+ }
+ } else {
+ while (d64 != top) {
+ a = *s64;
+ prod = base[a >> 48];
+ a <<= 16;
+ prod <<= 16;
+ prod ^= base[a >> 48];
+ a <<= 16;
+ prod <<= 16;
+ prod ^= base[a >> 48];
+ a <<= 16;
+ prod <<= 16;
+ prod ^= base[a >> 48];
+ *d64 = prod;
+ s64++;
+ d64++;
+ }
+ }
+}
+
+static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top)
+{
+ uint8_t *s8, *d8;
+ uint16_t *s16, *d16;
+ uint32_t *s32, *d32;
+ uint64_t *s64, *d64;
+ gf_internal_t *h;
+ int wb;
+ uint32_t p, a;
+
+ h = rd->gf->scratch;
+ wb = (h->w)/8;
+ if (wb == 0) wb = 1;
+
+ while (src < s_top) {
+ switch (h->w) {
+ case 8:
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+ *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) :
+ rd->gf->multiply.w32(rd->gf, rd->val, *s8);
+ break;
+ case 4:
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+ a = *s8;
+ p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf);
+ p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4);
+ if (rd->xor) p ^= *d8;
+ *d8 = p;
+ break;
+ case 16:
+ s16 = (uint16_t *) src;
+ d16 = (uint16_t *) dest;
+ *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) :
+ rd->gf->multiply.w32(rd->gf, rd->val, *s16);
+ break;
+ case 32:
+ s32 = (uint32_t *) src;
+ d32 = (uint32_t *) dest;
+ *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) :
+ rd->gf->multiply.w32(rd->gf, rd->val, *s32);
+ break;
+ case 64:
+ s64 = (uint64_t *) src;
+ d64 = (uint64_t *) dest;
+ *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) :
+ rd->gf->multiply.w64(rd->gf, rd->val, *s64);
+ break;
+ default:
+ fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w);
+ exit(1);
+ }
+ src = (uint8_t *)src + wb;
+ dest = (uint8_t *)dest + wb;
+ }
+}
+
+/* JSP - The purpose of this procedure is to error check alignment,
+ and to set up the region operation so that it can best leverage
+ large words.
+
+ It stores its information in rd.
+
+ Assuming you're not doing Cauchy coding, (see below for that),
+ then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably
+ should change that).
+
+ src and dest must then be aligned on ceil(w/8)-byte boundaries.
+ Moreover, bytes must be a multiple of ceil(w/8). If the variable
+ align is equal to ceil(w/8), then we will set s_start = src,
+ d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes).
+ And we return -- the implementation will go ahead and do the
+ multiplication on individual words (e.g. using discrete logs).
+
+ If align is greater than ceil(w/8), then the implementation needs
+ to work on groups of "align" bytes. For example, suppose you are
+ implementing BYTWO, without SSE. Then you will be doing the region
+ multiplication in units of 8 bytes, so align = 8. Or, suppose you
+ are doing a Quad table in GF(2^4). You will be doing the region
+ multiplication in units of 2 bytes, so align = 2. Or, suppose you
+ are doing split multiplication with SSE operations in GF(2^8).
+ Then align = 16. Worse yet, suppose you are doing split
+ multiplication with SSE operations in GF(2^16), with or without
+ ALTMAP. Then, you will be doing the multiplication on 256 bits at
+ a time. So align = 32.
+
+ When align does not equal ceil(w/8), we split the region
+ multiplication into three parts. We are going to make s_start be
+ the first address greater than or equal to src that is a multiple
+ of align. s_top is going to be the largest address <= src+bytes
+ such that (s_top - s_start) is a multiple of align. We do the
+ same with d_start and d_top. When we say that "src and dest must
+ be aligned with respect to each other," we mean that s_start-src
+ must equal d_start-dest.
+
+ Now, the region multiplication is done in three parts -- the part
+ between src and s_start must be done using single words.
+ Similarly, the part between s_top and src+bytes must also be done
+ using single words. The part between s_start and s_top will be
+ done in chunks of "align" bytes.
+
+ One final thing -- if align > 16, then s_start and d_start will be
+ aligned on a 16 byte boundary. Perhaps we should have two
+ variables: align and chunksize. Then we'd have s_start & d_start
+ aligned to "align", and have s_top-s_start be a multiple of
+ chunksize. That may be less confusing, but it would be a big
+ change.
+
+ Finally, if align = -1, then we are doing Cauchy multiplication,
+ using only XOR's. In this case, we're not going to care about
+ alignment because we are just doing XOR's. Instead, the only
+ thing we care about is that bytes must be a multiple of w.
+
+ This is not to say that alignment doesn't matter in performance
+ with XOR's. See that discussion in gf_multby_one().
+
+ After you call gf_set_region_data(), the procedure
+ gf_do_initial_region_alignment() calls gf->multiply.w32() on
+ everything between src and s_start. The procedure
+ gf_do_final_region_alignment() calls gf->multiply.w32() on
+ everything between s_top and src+bytes.
+ */
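+
+/* A worked example of the three-part split described above, with assumed
+ numbers (illustrative only). Suppose w = 16 (so ceil(w/8) = 2),
+ align = 16, src = 0x1002, dest = 0x2002 and bytes = 512. Then
+ src % 16 == dest % 16 == 2, so the pointers are aligned with respect to
+ each other, and:
+
+ s_start = src + 14 = 0x1010 (first 16-byte-aligned address >= src)
+ d_start = dest + 14 = 0x2010
+ 512 - 14 = 498, rounded down to a multiple of 16 -> 496
+ s_top = 0x1010 + 496 = 0x1200
+ d_top = 0x2010 + 496 = 0x2200
+
+ gf_do_initial_region_alignment() then handles the 7 words in
+ [src, s_start) one at a time, the region routine handles the 496 bytes in
+ [s_start, s_top) in 16-byte chunks, and gf_do_final_region_alignment()
+ finishes the single word in [s_top, src + bytes). */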
+
+void gf_set_region_data(gf_region_data *rd,
+ gf_t *gf,
+ void *src,
+ void *dest,
+ int bytes,
+ uint64_t val,
+ int xor,
+ int align)
+{
+ gf_internal_t *h = NULL;
+ int wb;
+ uint32_t a;
+ unsigned long uls, uld;
+
+ if (gf == NULL) { /* JSP - Can be NULL if you're just doing XOR's */
+ wb = 1;
+ } else {
+ h = gf->scratch;
+ wb = (h->w)/8;
+ if (wb == 0) wb = 1;
+ }
+
+ rd->gf = gf;
+ rd->src = src;
+ rd->dest = dest;
+ rd->bytes = bytes;
+ rd->val = val;
+ rd->xor = xor;
+ rd->align = align;
+
+ uls = (unsigned long) src;
+ uld = (unsigned long) dest;
+
+ a = (align <= 16) ? align : 16;
+
+ if (align == -1) { /* JSP: This is cauchy. Error check bytes, then set up the pointers
+ so that there are no alignment regions. */
+ if (h != NULL && bytes % h->w != 0) {
+ fprintf(stderr, "Error in region multiply operation.\n");
+ fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w);
+ assert(0);
+ }
+
+ rd->s_start = src;
+ rd->d_start = dest;
+ rd->s_top = (uint8_t *)src + bytes;
+ rd->d_top = (uint8_t *)dest + bytes;
+ return;
+ }
+
+ if (uls % a != uld % a) {
+ fprintf(stderr, "Error in region multiply operation.\n");
+ fprintf(stderr, "The source & destination pointers must be aligned with respect\n");
+ fprintf(stderr, "to each other along a %d byte boundary.\n", a);
+ fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src,
+ (unsigned long) dest);
+ assert(0);
+ }
+
+ if (uls % wb != 0) {
+ fprintf(stderr, "Error in region multiply operation.\n");
+ fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb);
+ fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src,
+ (unsigned long) dest);
+ assert(0);
+ }
+
+ if (bytes % wb != 0) {
+ fprintf(stderr, "Error in region multiply operation.\n");
+ fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb);
+ assert(0);
+ }
+
+ uls %= a;
+ if (uls != 0) uls = (a-uls);
+ rd->s_start = (uint8_t *)rd->src + uls;
+ rd->d_start = (uint8_t *)rd->dest + uls;
+ bytes -= uls;
+ bytes -= (bytes % align);
+ rd->s_top = (uint8_t *)rd->s_start + bytes;
+ rd->d_top = (uint8_t *)rd->d_start + bytes;
+
+}
+
+void gf_do_initial_region_alignment(gf_region_data *rd)
+{
+ gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start);
+}
+
+void gf_do_final_region_alignment(gf_region_data *rd)
+{
+ gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (uint8_t *)rd->src+rd->bytes);
+}
+
+void gf_multby_zero(void *dest, int bytes, int xor)
+{
+ if (xor) return;
+ bzero(dest, bytes);
+ return;
+}
+
+/* JSP - gf_multby_one tries to do this in the most efficient way
+ possible. If xor = 0, then simply call memcpy() since that
+ should be optimized by the system. Otherwise, try to do the xor
+ in the following order:
+
+ If src and dest are aligned with respect to each other on 16-byte
+ boundaries and you have SSE instructions, then use aligned SSE
+ instructions.
+
+ If they aren't but you still have SSE instructions, use unaligned
+ SSE instructions.
+
+ If there are no SSE instructions, but they are aligned with
+ respect to each other on 8-byte boundaries, then do them with
+ uint64_t's.
+
+ Otherwise, call gf_unaligned_xor(), which does the following:
+ align a destination pointer along an 8-byte boundary, and then
+ memcpy 32 bytes at a time from the src pointer to an array of
+ doubles. I'm not sure if that's the best -- probably needs
+ testing, but this seems like it could be a black hole.
+ */
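+
+/* A minimal usage sketch (illustrative): region routines can use this as
+ the val == 1 fast path, e.g.
+
+ gf_multby_one(src, dest, bytes, 1); // dest[i] ^= src[i] for every byte
+ gf_multby_one(src, dest, bytes, 0); // plain memcpy(dest, src, bytes)
+*/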
+
+static void gf_unaligned_xor(void *src, void *dest, int bytes);
+
+void gf_multby_one(void *src, void *dest, int bytes, int xor)
+{
+#ifdef INTEL_SSE2
+ __m128i ms, md;
+#endif
+ unsigned long uls, uld;
+ uint8_t *s8, *d8;
+ uint64_t *s64, *d64, *dtop64;
+ gf_region_data rd;
+
+ if (!xor) {
+ memcpy(dest, src, bytes);
+ return;
+ }
+ uls = (unsigned long) src;
+ uld = (unsigned long) dest;
+
+#ifdef INTEL_SSE2
+ int abytes;
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+ if (uls % 16 == uld % 16) {
+ gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+ while (s8 != rd.s_start) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ while (s8 < (uint8_t *) rd.s_top) {
+ ms = _mm_load_si128 ((__m128i *)(s8));
+ md = _mm_load_si128 ((__m128i *)(d8));
+ md = _mm_xor_si128(md, ms);
+ _mm_store_si128((__m128i *)(d8), md);
+ s8 += 16;
+ d8 += 16;
+ }
+ while (s8 != (uint8_t *) src + bytes) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ return;
+ }
+
+ abytes = (bytes & 0xfffffff0);
+
+ while (d8 < (uint8_t *) dest + abytes) {
+ ms = _mm_loadu_si128 ((__m128i *)(s8));
+ md = _mm_loadu_si128 ((__m128i *)(d8));
+ md = _mm_xor_si128(md, ms);
+ _mm_storeu_si128((__m128i *)(d8), md);
+ s8 += 16;
+ d8 += 16;
+ }
+ while (d8 != (uint8_t *) dest+bytes) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ return;
+#endif
+#if defined(ARM_NEON)
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+
+ if (uls % 16 == uld % 16) {
+ gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+ while (s8 != rd.s_start) {
+ *d8 ^= *s8;
+ s8++;
+ d8++;
+ }
+ while (s8 < (uint8_t *) rd.s_top) {
+ uint8x16_t vs = vld1q_u8 (s8);
+ uint8x16_t vd = vld1q_u8 (d8);
+ uint8x16_t vr = veorq_u8 (vs, vd);
+ vst1q_u8 (d8, vr);
+ s8 += 16;
+ d8 += 16;
+ }
+ } else {
+ while (s8 + 15 < (uint8_t *) src + bytes) {
+ uint8x16_t vs = vld1q_u8 (s8);
+ uint8x16_t vd = vld1q_u8 (d8);
+ uint8x16_t vr = veorq_u8 (vs, vd);
+ vst1q_u8 (d8, vr);
+ s8 += 16;
+ d8 += 16;
+ }
+ }
+ while (s8 < (uint8_t *) src + bytes) {
+ *d8 ^= *s8;
+ s8++;
+ d8++;
+ }
+ return;
+#endif
+ if (uls % 8 != uld % 8) {
+ gf_unaligned_xor(src, dest, bytes);
+ return;
+ }
+
+ gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8);
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+ while (d8 != rd.d_start) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ dtop64 = (uint64_t *) rd.d_top;
+
+ d64 = (uint64_t *) rd.d_start;
+ s64 = (uint64_t *) rd.s_start;
+
+ while (d64 < dtop64) {
+ *d64 ^= *s64;
+ d64++;
+ s64++;
+ }
+
+ s8 = (uint8_t *) rd.s_top;
+ d8 = (uint8_t *) rd.d_top;
+
+ while (d8 != (uint8_t *) dest+bytes) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ return;
+}
+
+#define UNALIGNED_BUFSIZE (8)
+
+static void gf_unaligned_xor(void *src, void *dest, int bytes)
+{
+ uint64_t scopy[UNALIGNED_BUFSIZE], *d64;
+ int i;
+ gf_region_data rd;
+ uint8_t *s8, *d8;
+
+ /* JSP - call gf_set_region_data(), but use dest in both places. This is
+ because I only want to set up dest. If I used src, gf_set_region_data()
+ would fail because src and dest are not aligned to each other wrt
+ 8-byte pointers. I know this will actually align d_start to 16 bytes.
+ If I change gf_set_region_data() to split alignment & chunksize, then
+ I could do this correctly. */
+
+ gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE);
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+
+ while (d8 < (uint8_t *) rd.d_start) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+
+ d64 = (uint64_t *) d8;
+ while (d64 < (uint64_t *) rd.d_top) {
+ memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE);
+ s8 += 8*UNALIGNED_BUFSIZE;
+ for (i = 0; i < UNALIGNED_BUFSIZE; i++) {
+ *d64 ^= scopy[i];
+ d64++;
+ }
+ }
+
+ d8 = (uint8_t *) d64;
+ while (d8 < (uint8_t *) ((uint8_t *)dest+bytes)) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+}