Diffstat (limited to 'src/c/gf-complete/src/gf.c')
-rw-r--r--  src/c/gf-complete/src/gf.c  1076
1 files changed, 1076 insertions, 0 deletions
diff --git a/src/c/gf-complete/src/gf.c b/src/c/gf-complete/src/gf.c
new file mode 100644
index 0000000..ac400ba
--- /dev/null
+++ b/src/c/gf-complete/src/gf.c
@@ -0,0 +1,1076 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf.c
+ *
+ * Generic routines for Galois fields
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+int _gf_errno = GF_E_DEFAULT;
+
+void gf_error()
+{
+ char *s;
+
+ switch(_gf_errno) {
+ case GF_E_DEFAULT: s = "No Error."; break;
+ case GF_E_TWOMULT: s = "Cannot specify two -m's."; break;
+ case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break;
+ case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break;
+ case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break;
+ case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break;
+ case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break;
+ case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break;
+ case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break;
+ case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break;
+ case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break;
+ case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break;
+ case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break;
+ case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break;
+ case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break;
+ case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break;
+ case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. (Prim-poly & 0xfffe000000000000ULL) must equal 0."; break;
+ case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break;
+ case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break;
+ case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
+ case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
+ case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
+ case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break;
+ case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
+ case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
+ case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
+ case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break;
+ case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break;
+ case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break;
+ case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
+ case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
+ case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
+ case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break;
+ case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break;
+ case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
+ case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
+ case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break;
+ case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
+ case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
+ case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
+ case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
+ case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break;
+ case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
+ case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break;
+ case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
+ case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
+ case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break;
+ case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
+ case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break;
+ case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
+ case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
+ case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
+ case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break;
+ case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break;
+ case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break;
+ case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
+ case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
+ case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
+ case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break;
+ case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
+ case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break;
+ case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break;
+ case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
+ case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
+ case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break;
+ case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
+ case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
+ case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break;
+ case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
+ case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break;
+ case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
+ case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break;
+ case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break;
+ case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
+ case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
+ case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break;
+ case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break;
+ case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
+ case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
+ case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break;
+ case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break;
+ case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break;
+ case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break;
+ case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
+ case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break;
+ case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
+ case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
+ case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
+ case GF_E_UNK_REG: s = "Unknown region type."; break;
+ case GF_E_UNK_DIV: s = "Unknown division type."; break;
+ default: s = "Undefined error.";
+ }
+
+ fprintf(stderr, "%s\n", s);
+}
+
+uint64_t gf_composite_get_default_poly(gf_t *base)
+{
+ gf_internal_t *h;
+ int rv;
+
+ h = (gf_internal_t *) base->scratch;
+ if (h->w == 4) {
+ if (h->mult_type == GF_MULT_COMPOSITE) return 0;
+ if (h->prim_poly == 0x13) return 2;
+ return 0;
+ }
+ if (h->w == 8) {
+ if (h->mult_type == GF_MULT_COMPOSITE) return 0;
+ if (h->prim_poly == 0x11d) return 3;
+ return 0;
+ }
+ if (h->w == 16) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ rv = gf_composite_get_default_poly(h->base_gf);
+ if (rv != h->prim_poly) return 0;
+ if (rv == 3) return 0x105;
+ return 0;
+ } else {
+ if (h->prim_poly == 0x1100b) return 2;
+ if (h->prim_poly == 0x1002d) return 7;
+ return 0;
+ }
+ }
+ if (h->w == 32) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ rv = gf_composite_get_default_poly(h->base_gf);
+ if (rv != h->prim_poly) return 0;
+ if (rv == 2) return 0x10005;
+ if (rv == 7) return 0x10008;
+ if (rv == 0x105) return 0x10002;
+ return 0;
+ } else {
+ if (h->prim_poly == 0x400007) return 2;
+ if (h->prim_poly == 0xc5) return 3;
+ return 0;
+ }
+ }
+ if (h->w == 64) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ rv = gf_composite_get_default_poly(h->base_gf);
+ if (rv != h->prim_poly) return 0;
+ if (rv == 3) return 0x100000009ULL;
+ if (rv == 2) return 0x100000004ULL;
+ if (rv == 0x10005) return 0x100000003ULL;
+ if (rv == 0x10002) return 0x100000005ULL;
+ if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x100000003 works too,
+ but I want to differentiate cases). */
+ return 0;
+ } else {
+ if (h->prim_poly == 0x1bULL) return 2;
+ return 0;
+ }
+ }
+ return 0;
+}
+
+int gf_error_check(int w, int mult_type, int region_type, int divide_type,
+ int arg1, int arg2, uint64_t poly, gf_t *base)
+{
+ int sse3 = 0;
+ int sse2 = 0;
+ int pclmul = 0;
+ int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
+ gf_internal_t *sub;
+
+ rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
+ rquad = (region_type & GF_REGION_QUAD_TABLE);
+ rlazy = (region_type & GF_REGION_LAZY);
+ rsimd = (region_type & GF_REGION_SIMD);
+ rnosimd = (region_type & GF_REGION_NOSIMD);
+ raltmap = (region_type & GF_REGION_ALTMAP);
+ rcauchy = (region_type & GF_REGION_CAUCHY);
+
+ if (divide_type != GF_DIVIDE_DEFAULT &&
+ divide_type != GF_DIVIDE_MATRIX &&
+ divide_type != GF_DIVIDE_EUCLID) {
+ _gf_errno = GF_E_UNK_DIV;
+ return 0;
+ }
+
+ tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
+ GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
+ GF_REGION_CAUCHY );
+ if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
+
+#ifdef INTEL_SSE2
+ sse2 = 1;
+#endif
+
+#ifdef INTEL_SSSE3
+ sse3 = 1;
+#endif
+
+#ifdef INTEL_SSE4_PCLMUL
+ pclmul = 1;
+#endif
+
+#ifdef ARM_NEON
+ pclmul = 1;
+ sse3 = 1;
+#endif
+
+
+ if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }
+
+ if (mult_type != GF_MULT_COMPOSITE && w < 64) {
+ if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; }
+ }
+
+ if (mult_type == GF_MULT_DEFAULT) {
+ if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; }
+ if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; }
+ if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; }
+ return 1;
+ }
+
+ if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; }
+ if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; }
+ if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; }
+ if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; }
+
+ if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE &&
+ mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
+ _gf_errno = GF_E_ARG1SET;
+ return 0;
+ }
+
+ if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
+ _gf_errno = GF_E_ARG2SET;
+ return 0;
+ }
+
+ if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; }
+
+ if (rdouble) {
+ if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; }
+ if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
+ if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; }
+ if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
+ if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; }
+ return 1;
+ }
+
+ if (rquad) {
+ if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
+ if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; }
+ if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
+ return 1;
+ }
+
+ if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; }
+
+ if (mult_type == GF_MULT_SHIFT) {
+ if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_CARRY_FREE) {
+ if (w != 4 && w != 8 && w != 16 &&
+ w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
+ if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; }
+ if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; }
+ if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; }
+ if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; }
+ if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
+ if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
+ if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_CARRY_FREE_GK) {
+ if (w != 4 && w != 8 && w != 16 &&
+ w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
+ if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
+ if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
+ if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; }
+ if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
+ || mult_type == GF_MULT_LOG_ZERO_EXT ) {
+ if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; }
+ if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }
+
+ if (mult_type == GF_MULT_LOG_TABLE) return 1;
+
+ if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; }
+
+ if (mult_type == GF_MULT_LOG_ZERO) return 1;
+
+ if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_GROUP) {
+ if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; }
+ if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; }
+ if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; }
+ if (w == 128 && (arg1 != 4 ||
+ (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
+ if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; }
+ if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; }
+ if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_TABLE) {
+ if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; }
+ if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; }
+ if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
+ if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_SPLIT_TABLE) {
+ if (arg1 > arg2) {
+ tmp = arg1;
+ arg1 = arg2;
+ arg2 = tmp;
+ }
+ if (w == 8) {
+ if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; }
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; }
+ } else if (w == 16) {
+ if ((arg1 == 8 && arg2 == 8) ||
+ (arg1 == 8 && arg2 == 16)) {
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; }
+ } else if (arg1 == 4 && arg2 == 16) {
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ } else { _gf_errno = GF_E_SP_16AR; return 0; }
+ } else if (w == 32) {
+ if ((arg1 == 8 && arg2 == 8) ||
+ (arg1 == 8 && arg2 == 32) ||
+ (arg1 == 16 && arg2 == 32)) {
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; }
+ } else if (arg1 == 4 && arg2 == 32) {
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; }
+ if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; }
+ } else { _gf_errno = GF_E_SP_32AR; return 0; }
+ } else if (w == 64) {
+ if ((arg1 == 8 && arg2 == 8) ||
+ (arg1 == 8 && arg2 == 64) ||
+ (arg1 == 16 && arg2 == 64)) {
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; }
+ } else if (arg1 == 4 && arg2 == 64) {
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; }
+ if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; }
+ } else { _gf_errno = GF_E_SP_64AR; return 0; }
+ } else if (w == 128) {
+ if (arg1 == 8 && arg2 == 128) {
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; }
+ } else if (arg1 == 4 && arg2 == 128) {
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; }
+ if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; }
+ } else { _gf_errno = GF_E_SP128AR; return 0; }
+ } else { _gf_errno = GF_E_SPLIT_W; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_COMPOSITE) {
+ if (w != 8 && w != 16 && w != 32
+ && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; }
+ if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; }
+ if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; }
+ if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; }
+ if (base != NULL) {
+ sub = (gf_internal_t *) base->scratch;
+ if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; }
+ if (poly == 0) {
+ if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; }
+ }
+ }
+ return 1;
+ }
+
+ _gf_errno = GF_E_UNKNOWN;
+ return 0;
+}
+
+int gf_scratch_size(int w,
+ int mult_type,
+ int region_type,
+ int divide_type,
+ int arg1,
+ int arg2)
+{
+ if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0;
+
+ switch(w) {
+ case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+ case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+ case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+ case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+ case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+ case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+ default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
+ }
+}
+
+extern int gf_size(gf_t *gf)
+{
+ gf_internal_t *h;
+ int s;
+
+ s = sizeof(gf_t);
+ h = (gf_internal_t *) gf->scratch;
+ s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2);
+ if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf);
+ return s;
+}
+
+
+int gf_init_easy(gf_t *gf, int w)
+{
+ return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
+ 0, 0, 0, NULL, NULL);
+}
+
+/* Allen: This function stores the configuration in gf's scratch memory and
+ then calls the relevant REAL init function for the word size. It is
+ probably done this way to consolidate the aspects of initialization that
+ don't rely on the word size, and to let the per-w routines take care of
+ the word-size-specific stuff. */
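+
+/* A minimal usage sketch of the two init paths (illustrative only; it
+ assumes that prim_poly = 0 selects the library's default polynomial for
+ the chosen w, and that a SPLIT 4/16 table is valid for w = 16, as the
+ error checks above suggest):
+
+ gf_t gf;
+
+ // Defaults for GF(2^16):
+ if (!gf_init_easy(&gf, 16)) { gf_error(); exit(1); }
+ uint32_t p = gf.multiply.w32(&gf, 0x1234, 0x5678);
+ gf_free(&gf, 0);
+
+ // Explicit method (SPLIT 4,16), library-allocated scratch:
+ if (!gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, GF_REGION_DEFAULT,
+ GF_DIVIDE_DEFAULT, 0, 4, 16, NULL, NULL)) { gf_error(); exit(1); }
+ gf_free(&gf, 0);
+*/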
+
+int gf_init_hard(gf_t *gf, int w, int mult_type,
+ int region_type,
+ int divide_type,
+ uint64_t prim_poly,
+ int arg1, int arg2,
+ gf_t *base_gf,
+ void *scratch_memory)
+{
+ int sz;
+ gf_internal_t *h;
+
+ if (gf_error_check(w, mult_type, region_type, divide_type,
+ arg1, arg2, prim_poly, base_gf) == 0) return 0;
+
+ sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
+ if (sz <= 0) return 0; /* This shouldn't happen, as all errors should get caught
+ in gf_error_check() */
+
+ if (scratch_memory == NULL) {
+ h = (gf_internal_t *) malloc(sz);
+ if (h == NULL) return 0; /* malloc failed */
+ h->free_me = 1;
+ } else {
+ h = scratch_memory;
+ h->free_me = 0;
+ }
+ gf->scratch = (void *) h;
+ h->mult_type = mult_type;
+ h->region_type = region_type;
+ h->divide_type = divide_type;
+ h->w = w;
+ h->prim_poly = prim_poly;
+ h->arg1 = arg1;
+ h->arg2 = arg2;
+ h->base_gf = base_gf;
+ h->private = (void *) gf->scratch;
+ h->private = (uint8_t *)h->private + (sizeof(gf_internal_t));
+ gf->extract_word.w32 = NULL;
+
+ switch(w) {
+ case 4: return gf_w4_init(gf);
+ case 8: return gf_w8_init(gf);
+ case 16: return gf_w16_init(gf);
+ case 32: return gf_w32_init(gf);
+ case 64: return gf_w64_init(gf);
+ case 128: return gf_w128_init(gf);
+ default: return gf_wgen_init(gf);
+ }
+}
+
+int gf_free(gf_t *gf, int recursive)
+{
+ gf_internal_t *h;
+
+ h = (gf_internal_t *) gf->scratch;
+ if (recursive && h->base_gf != NULL) {
+ gf_free(h->base_gf, 1);
+ free(h->base_gf);
+ }
+ if (h->free_me) free(h);
+ return 0; /* Making compiler happy */
+}
+
+void gf_alignment_error(char *s, int a)
+{
+ fprintf(stderr, "Alignment error in %s:\n", s);
+ fprintf(stderr, " The source and destination buffers must be aligned to each other,\n");
+ fprintf(stderr, " and they must be aligned to a %d-byte address.\n", a);
+ assert(0);
+}
+
+static
+void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) {
+ int cols, i, j;
+ uint32_t tmp;
+
+ cols = rows;
+
+ for (i = 0; i < rows; i++) inv[i] = (1 << i);
+
+ /* First -- convert into upper triangular */
+
+ for (i = 0; i < cols; i++) {
+
+ /* Swap rows if we have a zero i,i element. If we can't swap, then the
+ matrix was not invertible. */
+
+ if ((mat[i] & (1 << i)) == 0) {
+ for (j = i+1; j < rows && (mat[j] & (1 << i)) == 0; j++) ;
+ if (j == rows) {
+ fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n");
+ assert(0);
+ }
+ tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp;
+ tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp;
+ }
+
+ /* Now for each j>i, add A_ji*Ai to Aj */
+ for (j = i+1; j != rows; j++) {
+ if ((mat[j] & (1 << i)) != 0) {
+ mat[j] ^= mat[i];
+ inv[j] ^= inv[i];
+ }
+ }
+ }
+
+ /* Now the matrix is upper triangular. Back-substitute from the bottom row
+ up, clearing the entries above the diagonal. */
+
+ for (i = rows-1; i >= 0; i--) {
+ for (j = 0; j < i; j++) {
+ if (mat[j] & (1 << i)) {
+ /* mat[j] ^= mat[i]; */
+ inv[j] ^= inv[i];
+ }
+ }
+ }
+}
+
+uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp)
+{
+ uint32_t mat[32], inv[32], mask;
+ int i;
+
+ mask = (w == 32) ? 0xffffffff : (1 << w) - 1;
+ for (i = 0; i < w; i++) {
+ mat[i] = y;
+
+ if (y & (1 << (w-1))) {
+ y = y << 1;
+ y = ((y ^ pp) & mask);
+ } else {
+ y = y << 1;
+ }
+ }
+
+ gf_invert_binary_matrix(mat, inv, w);
+ return inv[0];
+}
+
+void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
+{
+ uint64_t a, prod;
+ int xor;
+ uint64_t *s64, *d64, *top;
+
+ s64 = rd->s_start;
+ d64 = rd->d_start;
+ top = rd->d_top;
+ xor = rd->xor;
+
+ if (xor) {
+ while (d64 != top) {
+ a = *s64;
+ prod = base[a >> 48];
+ a <<= 16;
+ prod <<= 16;
+ prod ^= base[a >> 48];
+ a <<= 16;
+ prod <<= 16;
+ prod ^= base[a >> 48];
+ a <<= 16;
+ prod <<= 16;
+ prod ^= base[a >> 48];
+ prod ^= *d64;
+ *d64 = prod;
+ s64++;
+ d64++;
+ }
+ } else {
+ while (d64 != top) {
+ a = *s64;
+ prod = base[a >> 48];
+ a <<= 16;
+ prod <<= 16;
+ prod ^= base[a >> 48];
+ a <<= 16;
+ prod <<= 16;
+ prod ^= base[a >> 48];
+ a <<= 16;
+ prod <<= 16;
+ prod ^= base[a >> 48];
+ *d64 = prod;
+ s64++;
+ d64++;
+ }
+ }
+}
+
+static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top)
+{
+ uint8_t *s8, *d8;
+ uint16_t *s16, *d16;
+ uint32_t *s32, *d32;
+ uint64_t *s64, *d64;
+ gf_internal_t *h;
+ int wb;
+ uint32_t p, a;
+
+ h = rd->gf->scratch;
+ wb = (h->w)/8;
+ if (wb == 0) wb = 1;
+
+ while (src < s_top) {
+ switch (h->w) {
+ case 8:
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+ *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) :
+ rd->gf->multiply.w32(rd->gf, rd->val, *s8);
+ break;
+ case 4:
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+ a = *s8;
+ p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf);
+ p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4);
+ if (rd->xor) p ^= *d8;
+ *d8 = p;
+ break;
+ case 16:
+ s16 = (uint16_t *) src;
+ d16 = (uint16_t *) dest;
+ *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) :
+ rd->gf->multiply.w32(rd->gf, rd->val, *s16);
+ break;
+ case 32:
+ s32 = (uint32_t *) src;
+ d32 = (uint32_t *) dest;
+ *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) :
+ rd->gf->multiply.w32(rd->gf, rd->val, *s32);
+ break;
+ case 64:
+ s64 = (uint64_t *) src;
+ d64 = (uint64_t *) dest;
+ *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) :
+ rd->gf->multiply.w64(rd->gf, rd->val, *s64);
+ break;
+ default:
+ fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w);
+ exit(1);
+ }
+ src = (uint8_t *)src + wb;
+ dest = (uint8_t *)dest + wb;
+ }
+}
+
+/* JSP - The purpose of this procedure is to error check alignment,
+ and to set up the region operation so that it can best leverage
+ large words.
+
+ It stores its information in rd.
+
+ Assuming you're not doing Cauchy coding, (see below for that),
+ then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably
+ should change that).
+
+ src and dest must then be aligned on ceil(w/8)-byte boundaries.
+ Moreover, bytes must be a multiple of ceil(w/8). If the variable
+ align is equal to ceil(w/8), then we will set s_start = src,
+ d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes).
+ And we return -- the implementation will go ahead and do the
+ multiplication on individual words (e.g. using discrete logs).
+
+ If align is greater than ceil(w/8), then the implementation needs
+ to work on groups of "align" bytes. For example, suppose you are
+ implementing BYTWO, without SSE. Then you will be doing the region
+ multiplication in units of 8 bytes, so align = 8. Or, suppose you
+ are doing a Quad table in GF(2^4). You will be doing the region
+ multiplication in units of 2 bytes, so align = 2. Or, suppose you
+ are doing split multiplication with SSE operations in GF(2^8).
+ Then align = 16. Worse yet, suppose you are doing split
+ multiplication with SSE operations in GF(2^16), with or without
+ ALTMAP. Then, you will be doing the multiplication on 256 bits at
+ a time. So align = 32.
+
+ When align does not equal ceil(w/8), we split the region
+ multiplication into three parts. We are going to make s_start be
+ the first address greater than or equal to src that is a multiple
+ of align. s_top is going to be the largest address <= src+bytes
+ such that (s_top - s_start) is a multiple of align. We do the
+ same with d_start and d_top. When we say that "src and dest must
+ be aligned with respect to each other," we mean that s_start-src
+ must equal d_start-dest.
+
+ Now, the region multiplication is done in three parts -- the part
+ between src and s_start must be done using single words.
+ Similarly, the part between s_top and src+bytes must also be done
+ using single words. The part between s_start and s_top will be
+ done in chunks of "align" bytes.
+
+ One final thing -- if align > 16, then s_start and d_start will be
+ aligned on a 16 byte boundary. Perhaps we should have two
+ variables: align and chunksize. Then we'd have s_start & d_start
+ aligned to "align", and have s_top-s_start be a multiple of
+ chunksize. That may be less confusing, but it would be a big
+ change.
+
+ Finally, if align = -1, then we are doing Cauchy multiplication,
+ using only XOR's. In this case, we're not going to care about
+ alignment because we are just doing XOR's. Instead, the only
+ thing we care about is that bytes must be a multiple of w.
+
+ This is not to say that alignment doesn't matter in performance
+ with XOR's. See that discussion in gf_multby_one().
+
+ After you call gf_set_region_data(), the procedure
+ gf_do_initial_region_alignment() calls gf->multiply.w32() on
+ everything between src and s_start. The procedure
+ gf_do_final_region_alignment() calls gf->multiply.w32() on
+ everything between s_top and src+bytes.
+ */
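+
+/* A worked example of the three-part split described above, with assumed
+ numbers (illustrative only). Suppose w = 16 (so ceil(w/8) = 2),
+ align = 16, src = 0x1002, dest = 0x2002 and bytes = 512. Then
+ src % 16 == dest % 16 == 2, so the pointers are aligned with respect to
+ each other, and:
+
+ s_start = src + 14 = 0x1010 (first 16-byte-aligned address >= src)
+ d_start = dest + 14 = 0x2010
+ 512 - 14 = 498, rounded down to a multiple of 16 -> 496
+ s_top = 0x1010 + 496 = 0x1200
+ d_top = 0x2010 + 496 = 0x2200
+
+ gf_do_initial_region_alignment() then handles the 7 words in
+ [src, s_start) one at a time, the region routine handles the 496 bytes in
+ [s_start, s_top) in 16-byte chunks, and gf_do_final_region_alignment()
+ finishes the single word in [s_top, src + bytes). */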
+
+void gf_set_region_data(gf_region_data *rd,
+ gf_t *gf,
+ void *src,
+ void *dest,
+ int bytes,
+ uint64_t val,
+ int xor,
+ int align)
+{
+ gf_internal_t *h = NULL;
+ int wb;
+ uint32_t a;
+ unsigned long uls, uld;
+
+ if (gf == NULL) { /* JSP - Can be NULL if you're just doing XOR's */
+ wb = 1;
+ } else {
+ h = gf->scratch;
+ wb = (h->w)/8;
+ if (wb == 0) wb = 1;
+ }
+
+ rd->gf = gf;
+ rd->src = src;
+ rd->dest = dest;
+ rd->bytes = bytes;
+ rd->val = val;
+ rd->xor = xor;
+ rd->align = align;
+
+ uls = (unsigned long) src;
+ uld = (unsigned long) dest;
+
+ a = (align <= 16) ? align : 16;
+
+ if (align == -1) { /* JSP: This is cauchy. Error check bytes, then set up the pointers
+ so that there are no alignment regions. */
+ if (h != NULL && bytes % h->w != 0) {
+ fprintf(stderr, "Error in region multiply operation.\n");
+ fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w);
+ assert(0);
+ }
+
+ rd->s_start = src;
+ rd->d_start = dest;
+ rd->s_top = (uint8_t *)src + bytes;
+ rd->d_top = (uint8_t *)dest + bytes;
+ return;
+ }
+
+ if (uls % a != uld % a) {
+ fprintf(stderr, "Error in region multiply operation.\n");
+ fprintf(stderr, "The source & destination pointers must be aligned with respect\n");
+ fprintf(stderr, "to each other along a %d byte boundary.\n", a);
+ fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src,
+ (unsigned long) dest);
+ assert(0);
+ }
+
+ if (uls % wb != 0) {
+ fprintf(stderr, "Error in region multiply operation.\n");
+ fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb);
+ fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src,
+ (unsigned long) dest);
+ assert(0);
+ }
+
+ if (bytes % wb != 0) {
+ fprintf(stderr, "Error in region multiply operation.\n");
+ fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb);
+ assert(0);
+ }
+
+ uls %= a;
+ if (uls != 0) uls = (a-uls);
+ rd->s_start = (uint8_t *)rd->src + uls;
+ rd->d_start = (uint8_t *)rd->dest + uls;
+ bytes -= uls;
+ bytes -= (bytes % align);
+ rd->s_top = (uint8_t *)rd->s_start + bytes;
+ rd->d_top = (uint8_t *)rd->d_start + bytes;
+
+}
+
+void gf_do_initial_region_alignment(gf_region_data *rd)
+{
+ gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start);
+}
+
+void gf_do_final_region_alignment(gf_region_data *rd)
+{
+ gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (uint8_t *)rd->src+rd->bytes);
+}
+
+void gf_multby_zero(void *dest, int bytes, int xor)
+{
+ if (xor) return;
+ bzero(dest, bytes);
+ return;
+}
+
+/* JSP - gf_multby_one tries to do this in the most efficient way
+ possible. If xor = 0, then simply call memcpy() since that
+ should be optimized by the system. Otherwise, try to do the xor
+ in the following order:
+
+ If src and dest are aligned with respect to each other on 16-byte
+ boundaries and you have SSE instructions, then use aligned SSE
+ instructions.
+
+ If they aren't but you still have SSE instructions, use unaligned
+ SSE instructions.
+
+ If there are no SSE instructions, but they are aligned with
+ respect to each other on 8-byte boundaries, then do them with
+ uint64_t's.
+
+ Otherwise, call gf_unaligned_xor(), which does the following:
+ align a destination pointer along an 8-byte boundary, and then
+ memcpy 32 bytes at a time from the src pointer to an array of
+ doubles. I'm not sure if that's the best -- probably needs
+ testing, but this seems like it could be a black hole.
+ */
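+
+/* A minimal usage sketch (illustrative): region routines can use this as
+ the val == 1 fast path, e.g.
+
+ gf_multby_one(src, dest, bytes, 1); // dest[i] ^= src[i] for every byte
+ gf_multby_one(src, dest, bytes, 0); // plain memcpy(dest, src, bytes)
+*/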
+
+static void gf_unaligned_xor(void *src, void *dest, int bytes);
+
+void gf_multby_one(void *src, void *dest, int bytes, int xor)
+{
+#ifdef INTEL_SSE2
+ __m128i ms, md;
+#endif
+ unsigned long uls, uld;
+ uint8_t *s8, *d8;
+ uint64_t *s64, *d64, *dtop64;
+ gf_region_data rd;
+
+ if (!xor) {
+ memcpy(dest, src, bytes);
+ return;
+ }
+ uls = (unsigned long) src;
+ uld = (unsigned long) dest;
+
+#ifdef INTEL_SSE2
+ int abytes;
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+ if (uls % 16 == uld % 16) {
+ gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+ while (s8 != rd.s_start) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ while (s8 < (uint8_t *) rd.s_top) {
+ ms = _mm_load_si128 ((__m128i *)(s8));
+ md = _mm_load_si128 ((__m128i *)(d8));
+ md = _mm_xor_si128(md, ms);
+ _mm_store_si128((__m128i *)(d8), md);
+ s8 += 16;
+ d8 += 16;
+ }
+ while (s8 != (uint8_t *) src + bytes) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ return;
+ }
+
+ abytes = (bytes & 0xfffffff0);
+
+ while (d8 < (uint8_t *) dest + abytes) {
+ ms = _mm_loadu_si128 ((__m128i *)(s8));
+ md = _mm_loadu_si128 ((__m128i *)(d8));
+ md = _mm_xor_si128(md, ms);
+ _mm_storeu_si128((__m128i *)(d8), md);
+ s8 += 16;
+ d8 += 16;
+ }
+ while (d8 != (uint8_t *) dest+bytes) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ return;
+#endif
+#if defined(ARM_NEON)
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+
+ if (uls % 16 == uld % 16) {
+ gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+ while (s8 != rd.s_start) {
+ *d8 ^= *s8;
+ s8++;
+ d8++;
+ }
+ while (s8 < (uint8_t *) rd.s_top) {
+ uint8x16_t vs = vld1q_u8 (s8);
+ uint8x16_t vd = vld1q_u8 (d8);
+ uint8x16_t vr = veorq_u8 (vs, vd);
+ vst1q_u8 (d8, vr);
+ s8 += 16;
+ d8 += 16;
+ }
+ } else {
+ while (s8 + 15 < (uint8_t *) src + bytes) {
+ uint8x16_t vs = vld1q_u8 (s8);
+ uint8x16_t vd = vld1q_u8 (d8);
+ uint8x16_t vr = veorq_u8 (vs, vd);
+ vst1q_u8 (d8, vr);
+ s8 += 16;
+ d8 += 16;
+ }
+ }
+ while (s8 < (uint8_t *) src + bytes) {
+ *d8 ^= *s8;
+ s8++;
+ d8++;
+ }
+ return;
+#endif
+ if (uls % 8 != uld % 8) {
+ gf_unaligned_xor(src, dest, bytes);
+ return;
+ }
+
+ gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8);
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+ while (d8 != rd.d_start) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ dtop64 = (uint64_t *) rd.d_top;
+
+ d64 = (uint64_t *) rd.d_start;
+ s64 = (uint64_t *) rd.s_start;
+
+ while (d64 < dtop64) {
+ *d64 ^= *s64;
+ d64++;
+ s64++;
+ }
+
+ s8 = (uint8_t *) rd.s_top;
+ d8 = (uint8_t *) rd.d_top;
+
+ while (d8 != (uint8_t *) dest+bytes) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ return;
+}
+
+#define UNALIGNED_BUFSIZE (8)
+
+static void gf_unaligned_xor(void *src, void *dest, int bytes)
+{
+ uint64_t scopy[UNALIGNED_BUFSIZE], *d64;
+ int i;
+ gf_region_data rd;
+ uint8_t *s8, *d8;
+
+ /* JSP - call gf_set_region_data(), but use dest in both places. This is
+ because I only want to set up dest. If I used src, gf_set_region_data()
+ would fail because src and dest are not aligned to each other wrt
+ 8-byte pointers. I know this will actually align d_start to 16 bytes.
+ If I change gf_set_region_data() to split alignment & chunksize, then
+ I could do this correctly. */
+
+ gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE);
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+
+ while (d8 < (uint8_t *) rd.d_start) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+
+ d64 = (uint64_t *) d8;
+ while (d64 < (uint64_t *) rd.d_top) {
+ memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE);
+ s8 += 8*UNALIGNED_BUFSIZE;
+ for (i = 0; i < UNALIGNED_BUFSIZE; i++) {
+ *d64 ^= scopy[i];
+ d64++;
+ }
+ }
+
+ d8 = (uint8_t *) d64;
+ while (d8 < (uint8_t *) ((uint8_t *)dest+bytes)) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+}