author    Kevin Greenan <kmgreen2@gmail.com>  2014-10-24 14:19:31 -0700
committer Kevin Greenan <kmgreen2@gmail.com>  2014-10-24 14:19:31 -0700
commit    70dd94ae38f2d20dd78532a6dfd1310fdfb4a884 (patch)
tree      6d55be6869bcef1882fff82751dfcf10424aaca1
parent    62d4b81a833477e596c3ec264f83c08901eeea5b (diff)
parent    6fdd8bc3d32cb2f7fa55d2de9dc7cc5bb2f885aa (diff)
download  gf-complete-70dd94ae38f2d20dd78532a6dfd1310fdfb4a884.tar.gz
Merged in jannau/gf-complete/neon (pull request #25)
arm neon optimisations
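
The merge adds NEON kernels for w = 4, 8, 16, 32 and 64 under src/neon/, renames the SSE/NOSSE region flags to SIMD/NOSIMD aliases, and gives gf_multby_one in src/gf.c a 128-bit NEON XOR path. As a rough, hedged illustration only (the function name and shape below are hypothetical, not part of this patch), the NEON branch of gf_multby_one is built around a loop of this form:

    #include <arm_neon.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative sketch: XOR src into dest, 16 bytes per NEON iteration,
     * with a scalar tail for the remainder. */
    static void xor_region_neon(const uint8_t *src, uint8_t *dest, size_t bytes)
    {
      size_t i = 0;
      for (; i + 16 <= bytes; i += 16) {
        uint8x16_t vs = vld1q_u8(src + i);       /* load 128 bits of src   */
        uint8x16_t vd = vld1q_u8(dest + i);      /* load 128 bits of dest  */
        vst1q_u8(dest + i, veorq_u8(vs, vd));    /* dest ^= src            */
      }
      for (; i < bytes; i++)                     /* scalar tail            */
        dest[i] ^= src[i];
    }

The actual patch additionally aligns the pointers to 16 bytes via gf_set_region_data before entering the vector loop, as shown in the src/gf.c hunk below.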
-rw-r--r--  configure.ac               36
-rw-r--r--  examples/Makefile.am        4
-rw-r--r--  include/gf_complete.h       6
-rw-r--r--  include/gf_int.h           10
-rw-r--r--  include/gf_w16.h           66
-rw-r--r--  include/gf_w32.h           71
-rw-r--r--  include/gf_w4.h            63
-rw-r--r--  include/gf_w64.h           50
-rw-r--r--  include/gf_w8.h            99
-rw-r--r--  m4/ax_ext.m4               49
-rw-r--r--  src/Makefile.am            15
-rw-r--r--  src/gf.c                  131
-rw-r--r--  src/gf_method.c            10
-rw-r--r--  src/gf_w128.c               6
-rw-r--r--  src/gf_w16.c               75
-rw-r--r--  src/gf_w32.c               84
-rw-r--r--  src/gf_w4.c                80
-rw-r--r--  src/gf_w64.c               67
-rw-r--r--  src/gf_w8.c               120
-rw-r--r--  src/neon/gf_w16_neon.c    356
-rw-r--r--  src/neon/gf_w32_neon.c    269
-rw-r--r--  src/neon/gf_w4_neon.c     247
-rw-r--r--  src/neon/gf_w64_neon.c    333
-rw-r--r--  src/neon/gf_w8_neon.c     302
-rw-r--r--  test/Makefile.am            4
-rw-r--r--  test/gf_unit.c             48
-rw-r--r--  tools/Makefile.am          22
-rw-r--r--  tools/gf_methods.c          2
-rw-r--r--  tools/gf_time.c            24
-rwxr-xr-x  tools/run-tests.sh          9
30 files changed, 2253 insertions(+), 405 deletions(-)
diff --git a/configure.ac b/configure.ac
index 9f33852..ad7bb83 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3,9 +3,12 @@
# FIXME - add project url as the last argument
AC_INIT(gf-complete, 1.0)
+# Override default CFLAGS
+: ${CFLAGS="-Wall -Wpointer-arith -O3 -g"}
+
AC_PREREQ([2.61])
-AM_INIT_AUTOMAKE([no-dependencies foreign])
+AM_INIT_AUTOMAKE([no-dependencies foreign parallel-tests])
LT_INIT # libtool
AC_CONFIG_HEADER(include/config.h)
@@ -16,14 +19,39 @@ AC_CONFIG_MACRO_DIR([m4])
# This prevents './configure; make' from trying to run autotools.
AM_MAINTAINER_MODE([disable])
-# Override default CFLAGS
-CFLAGS="-Wall -Wpointer-arith -O3 -g"
-
dnl Compiling with per-target flags requires AM_PROG_CC_C_O.
AC_PROG_CC
+# Check for functions to provide aligned memory
+#
+AC_CHECK_FUNCS([posix_memalign],
+ [found_memalign=yes; break])
+
+AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
+
AX_EXT()
+AC_ARG_ENABLE([neon],
+ AS_HELP_STRING([--disable-neon], [Build without NEON optimizations]))
+
+AS_IF([test "x$enable_neon" != "xno"],
+ [noneon_CPPFLAGS=$CPPFLAGS
+ CPPFLAGS="$CPPFLAGS $SIMD_FLAGS"
+ AC_CHECK_HEADER([arm_neon.h],
+ [have_neon=yes],
+ [have_neon=no
+ CPPFLAGS=$noneon_CPPFLAGS])],
+ [have_neon=no
+ AS_IF([test "x$ax_cv_have_neon_ext" = "xyes"],
+ [SIMD_FLAGS=""])
+ ])
+
+AS_IF([test "x$have_neon" = "xno"],
+ [AS_IF([test "x$enable_neon" = "xyes"],
+ [AC_MSG_ERROR([neon requested but arm_neon.h not found])])
+ ])
+AM_CONDITIONAL([HAVE_NEON], [test "x$have_neon" = "xyes"])
+
AC_ARG_ENABLE([sse],
AS_HELP_STRING([--disable-sse], [Build without SSE optimizations]),
[if test "x$enableval" = "xno" ; then
diff --git a/examples/Makefile.am b/examples/Makefile.am
index fd547d2..a420bda 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -1,7 +1,7 @@
# GF-Complete 'examples' AM file
-AM_CPPFLAGS=-I./ -I../include
-AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
+AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
+AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
bin_PROGRAMS = gf_example_1 gf_example_2 gf_example_3 gf_example_4 \
gf_example_5 gf_example_6 gf_example_7
diff --git a/include/gf_complete.h b/include/gf_complete.h
index 5806625..c4783e8 100644
--- a/include/gf_complete.h
+++ b/include/gf_complete.h
@@ -33,6 +33,10 @@
#include <wmmintrin.h>
#endif
+#if defined(ARM_NEON)
+ #include <arm_neon.h>
+#endif
+
/* These are the different ways to perform multiplication.
Not all are implemented for all values of w.
@@ -61,7 +65,9 @@ typedef enum {GF_MULT_DEFAULT,
#define GF_REGION_DOUBLE_TABLE (0x1)
#define GF_REGION_QUAD_TABLE (0x2)
#define GF_REGION_LAZY (0x4)
+#define GF_REGION_SIMD (0x8)
#define GF_REGION_SSE (0x8)
+#define GF_REGION_NOSIMD (0x10)
#define GF_REGION_NOSSE (0x10)
#define GF_REGION_ALTMAP (0x20)
#define GF_REGION_CAUCHY (0x40)
diff --git a/include/gf_int.h b/include/gf_int.h
index 98294cc..32866f4 100644
--- a/include/gf_int.h
+++ b/include/gf_int.h
@@ -113,7 +113,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
GF_E_DIVCOMP, /* Mult == Composite && Div != Default */
GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */
GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */
- GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */
+ GF_E_SIMD_NO, /* Reg == SIMD && Reg == NOSIMD */
GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */
GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/
GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */
@@ -129,9 +129,9 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
GF_E_QUAD__J, /* Reg == QUAD && other Reg */
GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/
GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */
- GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */
+ GF_E_SSESHIF, /* Mult == Shift && Reg == SIMD|NOSIMD */
GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */
- GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */
+ GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SIMD|NOSIMD */
GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */
GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */
GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */
@@ -148,7 +148,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */
GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */
GF_E_TABLE_W, /* Mult == TABLE, w too big */
- GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */
+ GF_E_TAB_SSE, /* Mult == TABLE, SIMD|NOSIMD only apply to w == 4 */
GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */
GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */
GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */
@@ -172,7 +172,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */
GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */
GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */
- GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */
+ GF_E_COMP_SS, /* Mult == COMP, SIMD|NOSIMD */
GF_E_COMP__W, /* Mult == COMP, Bad w. */
GF_E_UNKFLAG, /* Unknown flag in create_from.... */
GF_E_UNKNOWN, /* Unknown mult_type. */
diff --git a/include/gf_w16.h b/include/gf_w16.h
new file mode 100644
index 0000000..fb4c0e9
--- /dev/null
+++ b/include/gf_w16.h
@@ -0,0 +1,66 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w16.h
+ *
+ * Defines and data structures for 16-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W16_H
+#define GF_COMPLETE_GF_W16_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (16)
+#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
+#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
+
+#define GF_BASE_FIELD_WIDTH (8)
+#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
+
+struct gf_w16_logtable_data {
+ uint16_t log_tbl[GF_FIELD_SIZE];
+ uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
+ uint16_t inv_tbl[GF_FIELD_SIZE];
+ uint16_t *d_antilog;
+};
+
+struct gf_w16_zero_logtable_data {
+ int log_tbl[GF_FIELD_SIZE];
+ uint16_t _antilog_tbl[GF_FIELD_SIZE * 4];
+ uint16_t *antilog_tbl;
+ uint16_t inv_tbl[GF_FIELD_SIZE];
+};
+
+struct gf_w16_lazytable_data {
+ uint16_t log_tbl[GF_FIELD_SIZE];
+ uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
+ uint16_t inv_tbl[GF_FIELD_SIZE];
+ uint16_t *d_antilog;
+ uint16_t lazytable[GF_FIELD_SIZE];
+};
+
+struct gf_w16_bytwo_data {
+ uint64_t prim_poly;
+ uint64_t mask1;
+ uint64_t mask2;
+};
+
+struct gf_w16_split_8_8_data {
+ uint16_t tables[3][256][256];
+};
+
+struct gf_w16_group_4_4_data {
+ uint16_t reduce[16];
+ uint16_t shift[16];
+};
+
+struct gf_w16_composite_data {
+ uint8_t *mult_table;
+};
+
+void gf_w16_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W16_H */
diff --git a/include/gf_w32.h b/include/gf_w32.h
new file mode 100644
index 0000000..3396402
--- /dev/null
+++ b/include/gf_w32.h
@@ -0,0 +1,71 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w32.h
+ *
+ * Defines and data structures for 32-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W32_H
+#define GF_COMPLETE_GF_W32_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (32)
+#define GF_FIRST_BIT (1 << 31)
+
+#define GF_BASE_FIELD_WIDTH (16)
+#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
+#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
+#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
+
+struct gf_split_2_32_lazy_data {
+ uint32_t tables[16][4];
+ uint32_t last_value;
+};
+
+struct gf_w32_split_8_8_data {
+ uint32_t tables[7][256][256];
+ uint32_t region_tables[4][256];
+ uint32_t last_value;
+};
+
+struct gf_w32_group_data {
+ uint32_t *reduce;
+ uint32_t *shift;
+ int tshift;
+ uint64_t rmask;
+ uint32_t *memory;
+};
+
+struct gf_split_16_32_lazy_data {
+ uint32_t tables[2][(1<<16)];
+ uint32_t last_value;
+};
+
+struct gf_split_8_32_lazy_data {
+ uint32_t tables[4][256];
+ uint32_t last_value;
+};
+
+struct gf_split_4_32_lazy_data {
+ uint32_t tables[8][16];
+ uint32_t last_value;
+};
+
+struct gf_w32_bytwo_data {
+ uint64_t prim_poly;
+ uint64_t mask1;
+ uint64_t mask2;
+};
+
+struct gf_w32_composite_data {
+ uint16_t *log;
+ uint16_t *alog;
+};
+
+void gf_w32_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W32_H */
diff --git a/include/gf_w4.h b/include/gf_w4.h
new file mode 100644
index 0000000..8ee94a3
--- /dev/null
+++ b/include/gf_w4.h
@@ -0,0 +1,63 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w4.h
+ *
+ * Defines and data structures for 4-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W4_H
+#define GF_COMPLETE_GF_W4_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH 4
+#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2)
+#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
+#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1)
+
+/* ------------------------------------------------------------
+ JSP: Each implementation has its own data, which is allocated
+ at one time as part of the handle. For that reason, it
+ shouldn't be hierarchical -- i.e. one should be able to
+ allocate it with one call to malloc. */
+
+struct gf_logtable_data {
+ uint8_t log_tbl[GF_FIELD_SIZE];
+ uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
+ uint8_t *antilog_tbl_div;
+};
+
+struct gf_single_table_data {
+ uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+ uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_double_table_data {
+ uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+ uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+struct gf_quad_table_data {
+ uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+ uint16_t mult[GF_FIELD_SIZE][(1<<16)];
+};
+
+struct gf_quad_table_lazy_data {
+ uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+ uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+ uint16_t mult[(1 << 16)];
+};
+
+struct gf_bytwo_data {
+ uint64_t prim_poly;
+ uint64_t mask1;
+ uint64_t mask2;
+};
+
+// ARM NEON init functions
+int gf_w4_neon_cfm_init(gf_t *gf);
+void gf_w4_neon_single_table_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W4_H */
diff --git a/include/gf_w64.h b/include/gf_w64.h
new file mode 100644
index 0000000..9a74a81
--- /dev/null
+++ b/include/gf_w64.h
@@ -0,0 +1,50 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w64.h
+ *
+ * Defines and data structures for 64-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W64_H
+#define GF_COMPLETE_GF_W64_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (64)
+#define GF_FIRST_BIT (1ULL << 63)
+
+#define GF_BASE_FIELD_WIDTH (32)
+#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH)
+#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
+
+struct gf_w64_group_data {
+ uint64_t *reduce;
+ uint64_t *shift;
+ uint64_t *memory;
+};
+
+struct gf_split_4_64_lazy_data {
+ uint64_t tables[16][16];
+ uint64_t last_value;
+};
+
+struct gf_split_8_64_lazy_data {
+ uint64_t tables[8][(1<<8)];
+ uint64_t last_value;
+};
+
+struct gf_split_16_64_lazy_data {
+ uint64_t tables[4][(1<<16)];
+ uint64_t last_value;
+};
+
+struct gf_split_8_8_data {
+ uint64_t tables[15][256][256];
+};
+
+void gf_w64_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W64_H */
diff --git a/include/gf_w8.h b/include/gf_w8.h
new file mode 100644
index 0000000..938fcfd
--- /dev/null
+++ b/include/gf_w8.h
@@ -0,0 +1,99 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w8.h
+ *
+ * Defines and data structures for 8-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W8_H
+#define GF_COMPLETE_GF_W8_H
+
+#include "gf_int.h"
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (8)
+#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
+#define GF_HALF_SIZE (1 << (GF_FIELD_WIDTH/2))
+#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
+
+#define GF_BASE_FIELD_WIDTH (4)
+#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
+
+struct gf_w8_logtable_data {
+ uint8_t log_tbl[GF_FIELD_SIZE];
+ uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
+ uint8_t inv_tbl[GF_FIELD_SIZE];
+};
+
+struct gf_w8_logzero_table_data {
+ short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
+ uint8_t antilog_tbl[512+512+1];
+ uint8_t *div_tbl;
+ uint8_t *inv_tbl;
+};
+
+struct gf_w8_logzero_small_table_data {
+ short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
+ uint8_t antilog_tbl[255*3];
+ uint8_t inv_tbl[GF_FIELD_SIZE];
+ uint8_t *div_tbl;
+};
+
+struct gf_w8_composite_data {
+ uint8_t *mult_table;
+};
+
+/* Don't change the order of these relative to gf_w8_half_table_data */
+
+struct gf_w8_default_data {
+ uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
+ uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
+ uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+ uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_w8_half_table_data {
+ uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
+ uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
+};
+
+struct gf_w8_single_table_data {
+ uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+ uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_w8_double_table_data {
+ uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+ uint16_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+
+struct gf_w8_double_table_lazy_data {
+ uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+ uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+ uint16_t mult[GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+
+struct gf_w4_logtable_data {
+ uint8_t log_tbl[GF_BASE_FIELD_SIZE];
+ uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2];
+ uint8_t *antilog_tbl_div;
+};
+
+struct gf_w4_single_table_data {
+ uint8_t div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
+ uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
+};
+
+struct gf_w8_bytwo_data {
+ uint64_t prim_poly;
+ uint64_t mask1;
+ uint64_t mask2;
+};
+
+int gf_w8_neon_cfm_init(gf_t *gf);
+void gf_w8_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W8_H */
diff --git a/m4/ax_ext.m4 b/m4/ax_ext.m4
index cfbb797..c03ccef 100644
--- a/m4/ax_ext.m4
+++ b/m4/ax_ext.m4
@@ -41,6 +41,55 @@ AC_DEFUN([AX_EXT],
AC_REQUIRE([AC_CANONICAL_HOST])
case $host_cpu in
+ aarch64*)
+ AC_DEFINE(HAVE_ARCH_AARCH64,,[targeting AArch64])
+ SIMD_FLAGS="$SIMD_FLAGS -DARCH_AARCH64"
+
+ AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext],
+ [
+ # TODO: detect / cross-compile
+ ax_cv_have_neon_ext=yes
+ ])
+ AC_CACHE_CHECK([whether cryptographic extension is supported], [ax_cv_have_arm_crypt_ext],
+ [
+ # TODO: detect / cross-compile
+ ax_cv_have_arm_crypt_ext=yes
+ ])
+
+ if test "$ax_cv_have_arm_crypt_ext" = yes; then
+ AC_DEFINE(HAVE_ARM_CRYPT_EXT,,[Support ARM cryptographic extension])
+ fi
+
+ if test "$ax_cv_have_neon_ext" = yes; then
+ AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
+ fi
+
+ if test "$ax_cv_have_arm_crypt_ext" = yes && test "$ax_cv_have_neon_ext" = yes; then
+ AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd+crypto,
+ SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd+crypto -DARM_CRYPT -DARM_NEON", [])
+ elif test "$ax_cv_have_arm_crypt_ext" = yes; then
+ AX_CHECK_COMPILE_FLAG(-march=armv8-a+crypto,
+ SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+crypto -DARM_CRYPT", [])
+ elif test "$ax_cv_have_neon_ext" = yes; then
+ AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd,
+ SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd -DARM_NEON", [])
+ fi
+ ;;
+
+ arm*)
+ AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext],
+ [
+ # TODO: detect / cross-compile
+ ax_cv_have_neon_ext=yes
+ ])
+
+ if test "$ax_cv_have_neon_ext" = yes; then
+ AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
+ AX_CHECK_COMPILE_FLAG(-mfpu=neon,
+ SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon -DARM_NEON", [])
+ fi
+ ;;
+
powerpc*)
AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext],
[
diff --git a/src/Makefile.am b/src/Makefile.am
index ba3ad5e..240c1fe 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,11 +1,22 @@
# GF-Complete 'core' AM file
# Creates the library
-AM_CPPFLAGS=-I./ -I../include
-AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
+AUTOMAKE_OPTIONS = subdir-objects
+
+AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
+AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
lib_LTLIBRARIES = libgf_complete.la
libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
gf_w64.c gf_w128.c gf_rand.c gf_general.c
+
+if HAVE_NEON
+libgf_complete_la_SOURCES += neon/gf_w4_neon.c \
+ neon/gf_w8_neon.c \
+ neon/gf_w16_neon.c \
+ neon/gf_w32_neon.c \
+ neon/gf_w64_neon.c
+endif
+
libgf_complete_la_LDFLAGS = -version-info 1:0:0
diff --git a/src/gf.c b/src/gf.c
index 10c9b3c..6d34c46 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -41,7 +41,7 @@ void gf_error()
case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
- case GF_E_SSE__NO: s = "Cannot specify -r SSE and -r NOSSE."; break;
+ case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break;
case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
@@ -51,23 +51,23 @@ void gf_error()
case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
- case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SSE|NOSSE."; break;
+ case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break;
case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break;
case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
- case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SSE|NOSSE."; break;
+ case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break;
case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
- case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SSE|NOSSE."; break;
+ case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break;
case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
- case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SSE|NOSSE."; break;
+ case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break;
case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
- case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SSE, but SSE2 is not supported."; break;
+ case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break;
case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
- case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SSE|NOSSE."; break;
+ case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break;
case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
@@ -77,33 +77,33 @@ void gf_error()
case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
- case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SSE|NOSSE."; break;
+ case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break;
case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
- case GF_E_TAB_SSE: s = "With -m TABLE, SSE|NOSSE only applies to w=4."; break;
- case GF_E_TABSSE3: s = "With -m TABLE, -r SSE, you need SSSE3 supported."; break;
+ case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break;
+ case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break;
case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
- case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SSE requires -r ALTMAP."; break;
+ case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break;
case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
- case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break;
+ case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break;
case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break;
case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
- case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SSE|NOSSE only with arg1/arg2 = 4/16."; break;
+ case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break;
case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break;
case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
- case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SSE|NOSSE only with arg1/arg2 = 4/32."; break;
+ case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break;
case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break;
case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
- case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SSE|NOSSE only with arg1/arg2 = 4/64."; break;
+ case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break;
case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break;
case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break;
- case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SSE."; break;
+ case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break;
case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
- case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SSE and -r NOSSE do not apply."; break;
+ case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break;
case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
@@ -182,14 +182,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
int sse3 = 0;
int sse2 = 0;
int pclmul = 0;
- int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp;
+ int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
gf_internal_t *sub;
rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
rquad = (region_type & GF_REGION_QUAD_TABLE);
rlazy = (region_type & GF_REGION_LAZY);
- rsse = (region_type & GF_REGION_SSE);
- rnosse = (region_type & GF_REGION_NOSSE);
+ rsimd = (region_type & GF_REGION_SIMD);
+ rnosimd = (region_type & GF_REGION_NOSIMD);
raltmap = (region_type & GF_REGION_ALTMAP);
rcauchy = (region_type & GF_REGION_CAUCHY);
@@ -201,7 +201,8 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
}
tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
- GF_REGION_SSE | GF_REGION_NOSSE | GF_REGION_ALTMAP | GF_REGION_CAUCHY );
+ GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
+ GF_REGION_CAUCHY );
if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
#ifdef INTEL_SSE2
@@ -216,6 +217,11 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
pclmul = 1;
#endif
+#ifdef ARM_NEON
+ pclmul = 1;
+ sse3 = 1;
+#endif
+
if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }
@@ -230,7 +236,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
return 1;
}
- if (rsse && rnosse) { _gf_errno = GF_E_SSE__NO; return 0; }
+ if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; }
if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; }
if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; }
if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; }
@@ -252,7 +258,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; }
if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; }
- if (rsse || rnosse || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
+ if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; }
return 1;
}
@@ -260,7 +266,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (rquad) {
if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; }
- if (rsse || rnosse || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
+ if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
return 1;
}
@@ -268,7 +274,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (mult_type == GF_MULT_SHIFT) {
if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; }
- if (rsse || rnosse) { _gf_errno = GF_E_SSESHIF; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; }
return 1;
}
@@ -281,7 +287,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; }
if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
- if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
return 1;
}
@@ -290,21 +296,21 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (w != 4 && w != 8 && w != 16 &&
w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
- if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
return 1;
}
if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; }
- if (rsse && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
+ if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
return 1;
}
if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
|| mult_type == GF_MULT_LOG_ZERO_EXT ) {
if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; }
- if (raltmap || rsse || rnosse) { _gf_errno = GF_E_LOG___J; return 0; }
+ if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }
if (mult_type == GF_MULT_LOG_TABLE) return 1;
@@ -324,14 +330,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
(arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; }
if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; }
- if (raltmap || rsse || rnosse) { _gf_errno = GF_E_GR____J; return 0; }
+ if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; }
return 1;
}
if (mult_type == GF_MULT_TABLE) {
if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; }
- if (w != 4 && (rsse || rnosse)) { _gf_errno = GF_E_TAB_SSE; return 0; }
- if (rsse && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
+ if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; }
+ if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; }
return 1;
}
@@ -344,46 +350,46 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
}
if (w == 8) {
if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; }
- if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; }
} else if (w == 16) {
if ((arg1 == 8 && arg2 == 8) ||
(arg1 == 8 && arg2 == 16)) {
- if (rsse || rnosse) { _gf_errno = GF_E_SP_16_S; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; }
if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; }
} else if (arg1 == 4 && arg2 == 16) {
- if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
} else { _gf_errno = GF_E_SP_16AR; return 0; }
} else if (w == 32) {
if ((arg1 == 8 && arg2 == 8) ||
(arg1 == 8 && arg2 == 32) ||
(arg1 == 16 && arg2 == 32)) {
- if (rsse || rnosse) { _gf_errno = GF_E_SP_32_S; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; }
if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; }
} else if (arg1 == 4 && arg2 == 32) {
- if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; }
- if (raltmap && rnosse) { _gf_errno = GF_E_SP_32AS; return 0; }
+ if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; }
} else { _gf_errno = GF_E_SP_32AR; return 0; }
} else if (w == 64) {
if ((arg1 == 8 && arg2 == 8) ||
(arg1 == 8 && arg2 == 64) ||
(arg1 == 16 && arg2 == 64)) {
- if (rsse || rnosse) { _gf_errno = GF_E_SP_64_S; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; }
if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; }
} else if (arg1 == 4 && arg2 == 64) {
- if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; }
- if (raltmap && rnosse) { _gf_errno = GF_E_SP_64AS; return 0; }
+ if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; }
} else { _gf_errno = GF_E_SP_64AR; return 0; }
} else if (w == 128) {
if (arg1 == 8 && arg2 == 128) {
- if (rsse || rnosse) { _gf_errno = GF_E_SP128_S; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; }
if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; }
} else if (arg1 == 4 && arg2 == 128) {
- if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; }
- if (raltmap && rnosse) { _gf_errno = GF_E_SP128AS; return 0; }
+ if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; }
} else { _gf_errno = GF_E_SP128AR; return 0; }
} else { _gf_errno = GF_E_SPLIT_W; return 0; }
return 1;
@@ -395,7 +401,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; }
if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; }
if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; }
- if (rsse || rnosse) { _gf_errno = GF_E_COMP_SS; return 0; }
+ if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; }
if (base != NULL) {
sub = (gf_internal_t *) base->scratch;
if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; }
@@ -953,7 +959,42 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
}
return;
#endif
+#if defined(ARM_NEON)
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+ if (uls % 16 == uld % 16) {
+ gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+ while (s8 != rd.s_start) {
+ *d8 ^= *s8;
+ s8++;
+ d8++;
+ }
+ while (s8 < (uint8_t *) rd.s_top) {
+ uint8x16_t vs = vld1q_u8 (s8);
+ uint8x16_t vd = vld1q_u8 (d8);
+ uint8x16_t vr = veorq_u8 (vs, vd);
+ vst1q_u8 (d8, vr);
+ s8 += 16;
+ d8 += 16;
+ }
+ } else {
+ while (s8 + 15 < (uint8_t *) src + bytes) {
+ uint8x16_t vs = vld1q_u8 (s8);
+ uint8x16_t vd = vld1q_u8 (d8);
+ uint8x16_t vr = veorq_u8 (vs, vd);
+ vst1q_u8 (d8, vr);
+ s8 += 16;
+ d8 += 16;
+ }
+ }
+ while (s8 < (uint8_t *) src + bytes) {
+ *d8 ^= *s8;
+ s8++;
+ d8++;
+ }
+ return;
+#endif
if (uls % 8 != uld % 8) {
gf_unaligned_xor(src, dest, bytes);
return;
diff --git a/src/gf_method.c b/src/gf_method.c
index 2548a63..2210305 100644
--- a/src/gf_method.c
+++ b/src/gf_method.c
@@ -121,11 +121,17 @@ int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting)
} else if (strcmp(argv[starting], "LAZY") == 0) {
region_type |= GF_REGION_LAZY;
starting++;
+ } else if (strcmp(argv[starting], "SIMD") == 0) {
+ region_type |= GF_REGION_SIMD;
+ starting++;
+ } else if (strcmp(argv[starting], "NOSIMD") == 0) {
+ region_type |= GF_REGION_NOSIMD;
+ starting++;
} else if (strcmp(argv[starting], "SSE") == 0) {
- region_type |= GF_REGION_SSE;
+ region_type |= GF_REGION_SIMD;
starting++;
} else if (strcmp(argv[starting], "NOSSE") == 0) {
- region_type |= GF_REGION_NOSSE;
+ region_type |= GF_REGION_NOSIMD;
starting++;
} else if (strcmp(argv[starting], "CAUCHY") == 0) {
region_type |= GF_REGION_CAUCHY;
diff --git a/src/gf_w128.c b/src/gf_w128.c
index 66f9422..190f6b0 100644
--- a/src/gf_w128.c
+++ b/src/gf_w128.c
@@ -1527,7 +1527,7 @@ int gf_w128_split_init(gf_t *gf)
gf->multiply.w128 = gf_w128_bytwo_p_multiply;
#if defined(INTEL_SSE4_PCLMUL)
- if (!(h->region_type & GF_REGION_NOSSE)){
+ if (!(h->region_type & GF_REGION_NOSIMD)){
gf->multiply.w128 = gf_w128_clm_multiply;
}
#endif
@@ -1546,7 +1546,7 @@ int gf_w128_split_init(gf_t *gf)
if((h->region_type & GF_REGION_ALTMAP))
{
#ifdef INTEL_SSE4
- if(!(h->region_type & GF_REGION_NOSSE))
+ if(!(h->region_type & GF_REGION_NOSIMD))
gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region;
else
return 0;
@@ -1556,7 +1556,7 @@ int gf_w128_split_init(gf_t *gf)
}
else {
#ifdef INTEL_SSE4
- if(!(h->region_type & GF_REGION_NOSSE))
+ if(!(h->region_type & GF_REGION_NOSIMD))
gf->multiply_region.w128 = gf_w128_split_4_128_sse_multiply_region;
else
gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region;
diff --git a/src/gf_w16.c b/src/gf_w16.c
index c4cd22d..ce47849 100644
--- a/src/gf_w16.c
+++ b/src/gf_w16.c
@@ -11,54 +11,7 @@
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
-
-#define GF_FIELD_WIDTH (16)
-#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
-#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
-
-#define GF_BASE_FIELD_WIDTH (8)
-#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
-
-struct gf_w16_logtable_data {
- uint16_t log_tbl[GF_FIELD_SIZE];
- uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
- uint16_t inv_tbl[GF_FIELD_SIZE];
- uint16_t *d_antilog;
-};
-
-struct gf_w16_zero_logtable_data {
- int log_tbl[GF_FIELD_SIZE];
- uint16_t _antilog_tbl[GF_FIELD_SIZE * 4];
- uint16_t *antilog_tbl;
- uint16_t inv_tbl[GF_FIELD_SIZE];
-};
-
-struct gf_w16_lazytable_data {
- uint16_t log_tbl[GF_FIELD_SIZE];
- uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
- uint16_t inv_tbl[GF_FIELD_SIZE];
- uint16_t *d_antilog;
- uint16_t lazytable[GF_FIELD_SIZE];
-};
-
-struct gf_w16_bytwo_data {
- uint64_t prim_poly;
- uint64_t mask1;
- uint64_t mask2;
-};
-
-struct gf_w16_split_8_8_data {
- uint16_t tables[3][256][256];
-};
-
-struct gf_w16_group_4_4_data {
- uint16_t reduce[16];
- uint16_t shift[16];
-};
-
-struct gf_w16_composite_data {
- uint8_t *mult_table;
-};
+#include "gf_w16.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@@ -1264,6 +1217,7 @@ int gf_w16_split_init(gf_t *gf)
gf_internal_t *h;
struct gf_w16_split_8_8_data *d8;
int i, j, exp, issse3;
+ int isneon = 0;
uint32_t p, basep;
h = (gf_internal_t *) gf->scratch;
@@ -1273,6 +1227,9 @@ int gf_w16_split_init(gf_t *gf)
#else
issse3 = 0;
#endif
+#ifdef ARM_NEON
+ isneon = 1;
+#endif
if (h->arg1 == 8 && h->arg2 == 8) {
d8 = (struct gf_w16_split_8_8_data *) h->private;
@@ -1317,6 +1274,10 @@ int gf_w16_split_init(gf_t *gf)
if (issse3) {
gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region;
+ } else if (isneon) {
+#ifdef ARM_NEON
+ gf_w16_neon_split_init(gf);
+#endif
} else {
gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
}
@@ -1326,15 +1287,15 @@ int gf_w16_split_init(gf_t *gf)
gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
} else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
- if (issse3) {
- if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSSE)
+ if (issse3 || isneon) {
+ if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
- else if(h->region_type & GF_REGION_NOSSE)
+ else if(h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region;
- else if(h->region_type & GF_REGION_ALTMAP)
+ else if(h->region_type & GF_REGION_ALTMAP && issse3)
gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region;
} else {
- if(h->region_type & GF_REGION_SSE)
+ if(h->region_type & GF_REGION_SIMD)
return 0;
else if(h->region_type & GF_REGION_ALTMAP)
gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
@@ -1884,25 +1845,25 @@ int gf_w16_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w32 = gf_w16_bytwo_p_multiply;
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSSE)
+ if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
- if(h->region_type & GF_REGION_SSE)
+ if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
} else {
gf->multiply.w32 = gf_w16_bytwo_b_multiply;
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSSE)
+ if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
- if(h->region_type & GF_REGION_SSE)
+ if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
}
diff --git a/src/gf_w32.c b/src/gf_w32.c
index 5ec2aa7..2e187fd 100644
--- a/src/gf_w32.c
+++ b/src/gf_w32.c
@@ -12,59 +12,7 @@
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
-
-#define GF_FIELD_WIDTH (32)
-#define GF_FIRST_BIT (1 << 31)
-
-#define GF_BASE_FIELD_WIDTH (16)
-#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
-#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
-#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
-
-struct gf_split_2_32_lazy_data {
- uint32_t tables[16][4];
- uint32_t last_value;
-};
-
-struct gf_w32_split_8_8_data {
- uint32_t tables[7][256][256];
- uint32_t region_tables[4][256];
- uint32_t last_value;
-};
-
-struct gf_w32_group_data {
- uint32_t *reduce;
- uint32_t *shift;
- int tshift;
- uint64_t rmask;
- uint32_t *memory;
-};
-
-struct gf_split_16_32_lazy_data {
- uint32_t tables[2][(1<<16)];
- uint32_t last_value;
-};
-
-struct gf_split_8_32_lazy_data {
- uint32_t tables[4][256];
- uint32_t last_value;
-};
-
-struct gf_split_4_32_lazy_data {
- uint32_t tables[8][16];
- uint32_t last_value;
-};
-
-struct gf_w32_bytwo_data {
- uint64_t prim_poly;
- uint64_t mask1;
- uint64_t mask2;
-};
-
-struct gf_w32_composite_data {
- uint16_t *log;
- uint16_t *alog;
-};
+#include "gf_w32.h"
#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
@@ -1434,25 +1382,25 @@ int gf_w32_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w32 = gf_w32_bytwo_p_multiply;
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSSE)
+ if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region;
- if(h->region_type & GF_REGION_SSE)
+ if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
} else {
gf->multiply.w32 = gf_w32_bytwo_b_multiply;
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSSE)
+ if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region;
- if(h->region_type & GF_REGION_SSE)
+ if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
}
@@ -2283,6 +2231,7 @@ int gf_w32_split_init(gf_t *gf)
struct gf_split_16_32_lazy_data *d16;
uint32_t p, basep;
int i, j, exp, ispclmul, issse3;
+ int isneon = 0;
#if defined(INTEL_SSE4_PCLMUL)
ispclmul = 1;
@@ -2295,6 +2244,9 @@ int gf_w32_split_init(gf_t *gf)
#else
issse3 = 0;
#endif
+#ifdef ARM_NEON
+ isneon = 1;
+#endif
h = (gf_internal_t *) gf->scratch;
@@ -2335,13 +2287,13 @@ int gf_w32_split_init(gf_t *gf)
ld2 = (struct gf_split_2_32_lazy_data *) h->private;
ld2->last_value = 0;
#ifdef INTEL_SSSE3
- if (!(h->region_type & GF_REGION_NOSSE))
+ if (!(h->region_type & GF_REGION_NOSIMD))
gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region;
else
gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
#else
gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
- if(h->region_type & GF_REGION_SSE) return 0;
+ if(h->region_type & GF_REGION_SIMD) return 0;
#endif
return 1;
}
@@ -2349,11 +2301,15 @@ int gf_w32_split_init(gf_t *gf)
/* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */
if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
- (issse3 && h->mult_type == GF_REGION_DEFAULT)) {
+ ((issse3 || isneon) && h->mult_type == GF_REGION_DEFAULT)) {
ld4 = (struct gf_split_4_32_lazy_data *) h->private;
ld4->last_value = 0;
- if ((h->region_type & GF_REGION_NOSSE) || !issse3) {
+ if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) {
gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region;
+ } else if (isneon) {
+#ifdef ARM_NEON
+ gf_w32_neon_split_init(gf);
+#endif
} else if (h->region_type & GF_REGION_ALTMAP) {
gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region;
} else {
@@ -2731,10 +2687,14 @@ int gf_w32_composite_init(gf_t *gf)
int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
int issse3 = 0;
+ int isneon = 0;
#ifdef INTEL_SSSE3
issse3 = 1;
#endif
+#ifdef ARM_NEON
+ isneon = 1;
+#endif
switch(mult_type)
{
@@ -2760,7 +2720,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg
return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
}
if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) ||
- (mult_type == GF_MULT_DEFAULT && !issse3)) {
+ (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) {
return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
}
if ((arg1 == 4 && arg2 == 32) ||
diff --git a/src/gf_w4.c b/src/gf_w4.c
index 6bc79d0..0e86aa8 100644
--- a/src/gf_w4.c
+++ b/src/gf_w4.c
@@ -11,49 +11,7 @@
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
-
-#define GF_FIELD_WIDTH 4
-#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2)
-#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
-#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1)
-
-/* ------------------------------------------------------------
- JSP: Each implementation has its own data, which is allocated
- at one time as part of the handle. For that reason, it
- shouldn't be hierarchical -- i.e. one should be able to
- allocate it with one call to malloc. */
-
-struct gf_logtable_data {
- uint8_t log_tbl[GF_FIELD_SIZE];
- uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
- uint8_t *antilog_tbl_div;
-};
-
-struct gf_single_table_data {
- uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
- uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
-};
-
-struct gf_double_table_data {
- uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
- uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
-};
-struct gf_quad_table_data {
- uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
- uint16_t mult[GF_FIELD_SIZE][(1<<16)];
-};
-
-struct gf_quad_table_lazy_data {
- uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
- uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
- uint16_t mult[(1 << 16)];
-};
-
-struct gf_bytwo_data {
- uint64_t prim_poly;
- uint64_t mask1;
- uint64_t mask2;
-};
+#include "gf_w4.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@@ -489,14 +447,18 @@ int gf_w4_single_table_init(gf_t *gf)
gf->inverse.w32 = NULL;
gf->divide.w32 = gf_w4_single_table_divide;
gf->multiply.w32 = gf_w4_single_table_multiply;
- #ifdef INTEL_SSSE3
- if(h->region_type & (GF_REGION_NOSSE | GF_REGION_CAUCHY))
+ #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))
gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
else
+ #if defined(INTEL_SSSE3)
gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
+ #elif defined(ARM_NEON)
+ gf_w4_neon_single_table_init(gf);
+ #endif
#else
gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
- if (h->region_type & GF_REGION_SSE) return 0;
+ if (h->region_type & GF_REGION_SIMD) return 0;
#endif
return 1;
@@ -774,16 +736,16 @@ int gf_w4_table_init(gf_t *gf)
{
int rt;
gf_internal_t *h;
- int issse3 = 0;
+ int simd = 0;
-#ifdef INTEL_SSSE3
- issse3 = 1;
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ simd = 1;
#endif
h = (gf_internal_t *) gf->scratch;
rt = (h->region_type);
- if (h->mult_type == GF_MULT_DEFAULT && !issse3) rt |= GF_REGION_DOUBLE_TABLE;
+ if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE;
if (rt & GF_REGION_DOUBLE_TABLE) {
return gf_w4_double_table_init(gf);
@@ -1905,25 +1867,25 @@ int gf_w4_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w32 = gf_w4_bytwo_p_multiply;
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSSE)
+ if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
- if (h->region_type & GF_REGION_SSE)
+ if (h->region_type & GF_REGION_SIMD)
return 0;
#endif
} else {
gf->multiply.w32 = gf_w4_bytwo_b_multiply;
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSSE)
+ if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
- if (h->region_type & GF_REGION_SSE)
+ if (h->region_type & GF_REGION_SIMD)
return 0;
#endif
}
@@ -1937,6 +1899,8 @@ int gf_w4_cfm_init(gf_t *gf)
#if defined(INTEL_SSE4_PCLMUL)
gf->multiply.w32 = gf_w4_clm_multiply;
return 1;
+#elif defined(ARM_NEON)
+ return gf_w4_neon_cfm_init(gf);
#endif
return 0;
}
@@ -1953,11 +1917,14 @@ int gf_w4_shift_init(gf_t *gf)
int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
- int issse3 = 0;
+ int issse3 = 0, isneon = 0;
#ifdef INTEL_SSSE3
issse3 = 1;
#endif
+#ifdef ARM_NEON
+ isneon = 1;
+#endif
switch(mult_type)
{
@@ -1971,7 +1938,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1
return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
}
- if (mult_type == GF_MULT_DEFAULT && !issse3) region_type = GF_REGION_DOUBLE_TABLE;
+ if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))
+ region_type = GF_REGION_DOUBLE_TABLE;
if (region_type & GF_REGION_DOUBLE_TABLE) {
return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64;
diff --git a/src/gf_w64.c b/src/gf_w64.c
index fdc4a7c..6e75f5e 100644
--- a/src/gf_w64.c
+++ b/src/gf_w64.c
@@ -11,38 +11,7 @@
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
-
-#define GF_FIELD_WIDTH (64)
-#define GF_FIRST_BIT (1ULL << 63)
-
-#define GF_BASE_FIELD_WIDTH (32)
-#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH)
-#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
-
-struct gf_w64_group_data {
- uint64_t *reduce;
- uint64_t *shift;
- uint64_t *memory;
-};
-
-struct gf_split_4_64_lazy_data {
- uint64_t tables[16][16];
- uint64_t last_value;
-};
-
-struct gf_split_8_64_lazy_data {
- uint64_t tables[8][(1<<8)];
- uint64_t last_value;
-};
-
-struct gf_split_16_64_lazy_data {
- uint64_t tables[4][(1<<16)];
- uint64_t last_value;
-};
-
-struct gf_split_8_8_data {
- uint64_t tables[15][256][256];
-};
+#include "gf_w64.h"
static
inline
@@ -1488,25 +1457,25 @@ int gf_w64_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w64 = gf_w64_bytwo_p_multiply;
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSSE)
+ if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region;
else
gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region;
#else
gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region;
- if(h->region_type & GF_REGION_SSE)
+ if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
} else {
gf->multiply.w64 = gf_w64_bytwo_b_multiply;
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSSE)
+ if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region;
else
gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region;
#else
gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region;
- if(h->region_type & GF_REGION_SSE)
+ if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
}
@@ -2006,7 +1975,7 @@ int gf_w64_split_init(gf_t *gf)
gf->multiply.w64 = gf_w64_bytwo_p_multiply;
#if defined(INTEL_SSE4_PCLMUL)
- if ((!(h->region_type & GF_REGION_NOSSE) &&
+ if ((!(h->region_type & GF_REGION_NOSIMD) &&
(h->arg1 == 64 || h->arg2 == 64)) ||
h->mult_type == GF_MULT_DEFAULT){
@@ -2027,11 +1996,15 @@ int gf_w64_split_init(gf_t *gf)
/* Allen: set region pointers for default mult type. Single pointers are
* taken care of above (explicitly for sse, implicitly for no sse). */
-#ifdef INTEL_SSE4
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
if (h->mult_type == GF_MULT_DEFAULT) {
d4 = (struct gf_split_4_64_lazy_data *) h->private;
d4->last_value = 0;
+#if defined(INTEL_SSE4)
gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
+#elif defined(ARCH_AARCH64)
+ gf_w64_neon_split_init(gf);
+#endif
}
#else
if (h->mult_type == GF_MULT_DEFAULT) {
@@ -2045,25 +2018,31 @@ int gf_w64_split_init(gf_t *gf)
d4 = (struct gf_split_4_64_lazy_data *) h->private;
d4->last_value = 0;
- if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSSE)) return 0;
+ if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSIMD)) return 0;
if(h->region_type & GF_REGION_ALTMAP)
{
#ifdef INTEL_SSSE3
gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region;
+ #elif defined(ARCH_AARCH64)
+ gf_w64_neon_split_init(gf);
#else
return 0;
#endif
}
else //no altmap
{
- #ifdef INTEL_SSE4
- if(h->region_type & GF_REGION_NOSSE)
+ #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ if(h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
else
- gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
+ #if defined(INTEL_SSE4)
+ gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
+ #elif defined(ARCH_AARCH64)
+ gf_w64_neon_split_init(gf);
+ #endif
#else
gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
- if(h->region_type & GF_REGION_SSE)
+ if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
}
@@ -2134,7 +2113,7 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg
/* Allen: set the *local* arg1 and arg2, just for scratch size purposes,
* then fall through to split table scratch size code. */
-#ifdef INTEL_SSE4
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
arg1 = 64;
arg2 = 4;
#else
diff --git a/src/gf_w8.c b/src/gf_w8.c
index 67fd688..8449298 100644
--- a/src/gf_w8.c
+++ b/src/gf_w8.c
@@ -9,88 +9,10 @@
*/
#include "gf_int.h"
+#include "gf_w8.h"
#include <stdio.h>
#include <stdlib.h>
-#define GF_FIELD_WIDTH (8)
-#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
-#define GF_HALF_SIZE (1 << (GF_FIELD_WIDTH/2))
-#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
-
-#define GF_BASE_FIELD_WIDTH (4)
-#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
-
-struct gf_w8_logtable_data {
- uint8_t log_tbl[GF_FIELD_SIZE];
- uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
- uint8_t inv_tbl[GF_FIELD_SIZE];
-};
-
-struct gf_w8_logzero_table_data {
- short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
- uint8_t antilog_tbl[512+512+1];
- uint8_t *div_tbl;
- uint8_t *inv_tbl;
-};
-
-struct gf_w8_logzero_small_table_data {
- short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
- uint8_t antilog_tbl[255*3];
- uint8_t inv_tbl[GF_FIELD_SIZE];
- uint8_t *div_tbl;
-};
-
-struct gf_w8_composite_data {
- uint8_t *mult_table;
-};
-
-/* Don't change the order of these relative to gf_w8_half_table_data */
-
-struct gf_w8_default_data {
- uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
- uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
- uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
- uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
-};
-
-struct gf_w8_half_table_data {
- uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
- uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
-};
-
-struct gf_w8_single_table_data {
- uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
- uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
-};
-
-struct gf_w8_double_table_data {
- uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
- uint16_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
-};
-
-struct gf_w8_double_table_lazy_data {
- uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
- uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
- uint16_t mult[GF_FIELD_SIZE*GF_FIELD_SIZE];
-};
-
-struct gf_w4_logtable_data {
- uint8_t log_tbl[GF_BASE_FIELD_SIZE];
- uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2];
- uint8_t *antilog_tbl_div;
-};
-
-struct gf_w4_single_table_data {
- uint8_t div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
- uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
-};
-
-struct gf_w8_bytwo_data {
- uint64_t prim_poly;
- uint64_t mask1;
- uint64_t mask2;
-};
-
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
t2 = b & am2; \
@@ -603,6 +525,8 @@ int gf_w8_cfm_init(gf_t *gf)
return 0;
}
return 1;
+#elif defined(ARM_NEON)
+ return gf_w8_neon_cfm_init(gf);
#endif
return 0;
@@ -938,7 +862,7 @@ gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
return (ftd->multtable[a][b]);
}
-#ifdef INTEL_SSSE3
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
static
gf_val_32_t
gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
@@ -1179,14 +1103,18 @@ int gf_w8_split_init(gf_t *gf)
gf->multiply.w32 = gf_w8_split_multiply;
- #ifdef INTEL_SSSE3
- if (h->region_type & GF_REGION_NOSSE)
+ #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w8_split_multiply_region;
else
+ #if defined(INTEL_SSSE3)
gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
+ #elif defined(ARM_NEON)
+ gf_w8_neon_split_init(gf);
+ #endif
#else
gf->multiply_region.w32 = gf_w8_split_multiply_region;
- if(h->region_type & GF_REGION_SSE)
+ if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
@@ -1205,17 +1133,17 @@ int gf_w8_table_init(gf_t *gf)
struct gf_w8_double_table_data *dtd = NULL;
struct gf_w8_double_table_lazy_data *ltd = NULL;
struct gf_w8_default_data *dd = NULL;
- int a, b, c, prod, scase, issse;
+ int a, b, c, prod, scase, use_simd;
h = (gf_internal_t *) gf->scratch;
-#ifdef INTEL_SSSE3
- issse = 1;
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ use_simd = 1;
#else
- issse = 0;
+ use_simd = 0;
#endif
- if (h->mult_type == GF_MULT_DEFAULT && issse) {
+ if (h->mult_type == GF_MULT_DEFAULT && use_simd) {
dd = (struct gf_w8_default_data *)h->private;
scase = 3;
bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
@@ -1290,10 +1218,14 @@ int gf_w8_table_init(gf_t *gf)
gf->multiply_region.w32 = gf_w8_double_table_multiply_region;
break;
case 3:
-#ifdef INTEL_SSSE3
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
gf->divide.w32 = gf_w8_default_divide;
gf->multiply.w32 = gf_w8_default_multiply;
+#if defined(INTEL_SSSE3)
gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
+#elif defined(ARM_NEON)
+ gf_w8_neon_split_init(gf);
+#endif
#endif
break;
}
@@ -2259,25 +2191,25 @@ int gf_w8_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w32 = gf_w8_bytwo_p_multiply;
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSSE)
+ if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region;
- if(h->region_type & GF_REGION_SSE)
+ if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
} else {
gf->multiply.w32 = gf_w8_bytwo_b_multiply;
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSSE)
+ if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region;
- if(h->region_type & GF_REGION_SSE)
+ if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
}
@@ -2296,7 +2228,7 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1
switch(mult_type)
{
case GF_MULT_DEFAULT:
-#ifdef INTEL_SSSE3
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
#endif
return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c
new file mode 100644
index 0000000..95bfd80
--- /dev/null
+++ b/src/neon/gf_w16_neon.c
@@ -0,0 +1,356 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the University of Tennessee nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * gf_w16_neon.c
+ *
+ * Neon routines for 16-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w16.h"
+
+#ifdef ARCH_AARCH64
+static
+inline
+void
+neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
+ uint16_t *d_end, uint8_t *tbl,
+ gf_val_32_t val, int xor)
+{
+ unsigned i;
+ uint8_t *high = tbl + 4 * 16;
+ uint16x8_t va0, va1, r0, r1;
+ uint8x16_t loset, rl, rh;
+ uint8x16x2_t va;
+
+ uint8x16_t tbl_h[4], tbl_l[4];
+ for (i = 0; i < 4; i++) {
+ tbl_l[i] = vld1q_u8(tbl + i*16);
+ tbl_h[i] = vld1q_u8(high + i*16);
+ }
+
+ loset = vdupq_n_u8(0xf);
+
+ while (dst < d_end) {
+ va0 = vld1q_u16(src);
+ va1 = vld1q_u16(src + 8);
+
+ va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1));
+
+ rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
+ rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
+ rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
+ rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
+
+ va.val[0] = vshrq_n_u8(va.val[0], 4);
+ va.val[1] = vshrq_n_u8(va.val[1], 4);
+
+ rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
+ rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
+ rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+ rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
+
+ va = vtrnq_u8(rl, rh);
+ r0 = vreinterpretq_u16_u8(va.val[0]);
+ r1 = vreinterpretq_u16_u8(va.val[1]);
+
+ if (xor) {
+ va0 = vld1q_u16(dst);
+ va1 = vld1q_u16(dst + 8);
+ r0 = veorq_u16(r0, va0);
+ r1 = veorq_u16(r1, va1);
+ }
+ vst1q_u16(dst, r0);
+ vst1q_u16(dst + 8, r1);
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static
+inline
+void
+neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
+ uint8_t *dst, uint8_t *d_end,
+ uint8_t *tbl, gf_val_32_t val,
+ int xor)
+{
+ unsigned i;
+ uint8_t *high = tbl + 4 * 16;
+ uint8x16_t vh, vl, rh, rl;
+ uint8x16_t loset;
+
+ uint8x16_t tbl_h[4], tbl_l[4];
+ for (i = 0; i < 4; i++) {
+ tbl_l[i] = vld1q_u8(tbl + i*16);
+ tbl_h[i] = vld1q_u8(high + i*16);
+ }
+
+ loset = vdupq_n_u8(0xf);
+
+ while (dst < d_end) {
+ vh = vld1q_u8(src);
+ vl = vld1q_u8(src + 16);
+
+ rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset));
+ rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset));
+ rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset)));
+ rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh, loset)));
+
+ vl = vshrq_n_u8(vl, 4);
+ vh = vshrq_n_u8(vh, 4);
+
+ rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl));
+ rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl));
+ rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh));
+ rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh));
+
+ if (xor) {
+ vh = vld1q_u8(dst);
+ vl = vld1q_u8(dst + 16);
+ rh = veorq_u8(rh, vh);
+ rl = veorq_u8(rl, vl);
+ }
+ vst1q_u8(dst, rh);
+ vst1q_u8(dst + 16, rl);
+
+ src += 32;
+ dst += 32;
+ }
+}
+
+#else /* ARCH_AARCH64 */
+
+static
+inline
+void
+neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
+ uint16_t *d_end, uint8_t *tbl,
+ gf_val_32_t val, int xor)
+{
+ unsigned i;
+ uint8_t *high = tbl + 4 * 16;
+ uint16x8_t va, r;
+ uint8x8_t loset, vb, vc, rl, rh;
+
+ uint8x8x2_t tbl_h[4], tbl_l[4];
+ for (i = 0; i < 4; i++) {
+ tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+ tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+ tbl_h[i].val[0] = vld1_u8(high + i*16);
+ tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+ }
+
+ loset = vdup_n_u8(0xf);
+
+ while (dst < d_end) {
+ va = vld1q_u16(src);
+
+ vb = vmovn_u16(va);
+ vc = vshrn_n_u16(va, 8);
+
+ rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset));
+ rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset));
+ vb = vshr_n_u8(vb, 4);
+ rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset)));
+ rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset)));
+ vc = vshr_n_u8(vc, 4);
+ rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb));
+ rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb));
+ rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc));
+ rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc));
+
+ r = vmovl_u8(rl);
+ r = vorrq_u16(r, vshll_n_u8(rh, 8));
+
+ if (xor) {
+ va = vld1q_u16(dst);
+ r = veorq_u16(r, va);
+ }
+ vst1q_u16(dst, r);
+
+ src += 8;
+ dst += 8;
+ }
+}
+
+static
+inline
+void
+neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
+ uint8_t *dst, uint8_t *d_end,
+ uint8_t *tbl, gf_val_32_t val,
+ int xor)
+{
+ unsigned i;
+ uint8_t *high = tbl + 4 * 16;
+ uint8x8_t vh0, vh1, vl0, vl1, r0, r1, r2, r3;
+ uint8x8_t loset;
+
+ uint8x8x2_t tbl_h[4], tbl_l[4];
+ for (i = 0; i < 4; i++) {
+ tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+ tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+ tbl_h[i].val[0] = vld1_u8(high + i*16);
+ tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+ }
+
+ loset = vdup_n_u8(0xf);
+
+ while (dst < d_end) {
+ vh0 = vld1_u8(src);
+ vh1 = vld1_u8(src + 8);
+ vl0 = vld1_u8(src + 16);
+ vl1 = vld1_u8(src + 24);
+
+ r0 = vtbl2_u8(tbl_l[0], vand_u8(vh0, loset));
+ r1 = vtbl2_u8(tbl_h[0], vand_u8(vh1, loset));
+ r2 = vtbl2_u8(tbl_l[2], vand_u8(vl0, loset));
+ r3 = vtbl2_u8(tbl_h[2], vand_u8(vl1, loset));
+
+ vh0 = vshr_n_u8(vh0, 4);
+ vh1 = vshr_n_u8(vh1, 4);
+ vl0 = vshr_n_u8(vl0, 4);
+ vl1 = vshr_n_u8(vl1, 4);
+
+ r0 = veor_u8(r0, vtbl2_u8(tbl_l[1], vh0));
+ r1 = veor_u8(r1, vtbl2_u8(tbl_h[1], vh1));
+ r2 = veor_u8(r2, vtbl2_u8(tbl_l[3], vl0));
+ r3 = veor_u8(r3, vtbl2_u8(tbl_h[3], vl1));
+
+ if (xor) {
+ vh0 = vld1_u8(dst);
+ vh1 = vld1_u8(dst + 8);
+ vl0 = vld1_u8(dst + 16);
+ vl1 = vld1_u8(dst + 24);
+ r0 = veor_u8(r0, vh0);
+ r1 = veor_u8(r1, vh1);
+ r2 = veor_u8(r2, vl0);
+ r3 = veor_u8(r3, vl1);
+ }
+ vst1_u8(dst, r0);
+ vst1_u8(dst + 8, r1);
+ vst1_u8(dst + 16, r2);
+ vst1_u8(dst + 24, r3);
+
+ src += 32;
+ dst += 32;
+ }
+}
+#endif /* ARCH_AARCH64 */
+
+static
+inline
+void
+neon_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest,
+ gf_val_32_t val, int bytes, int xor,
+ int altmap)
+{
+ gf_region_data rd;
+ unsigned i, j;
+ uint64_t c, prod;
+ uint8_t tbl[2 * 4 * 16];
+ uint8_t *high = tbl + 4 * 16;
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 16; j++) {
+ c = (j << (i*4));
+ prod = gf->multiply.w32(gf, c, val);
+ tbl[i*16 + j] = prod & 0xff;
+ high[i*16 + j] = prod >> 8;
+ }
+ }
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+ gf_do_initial_region_alignment(&rd);
+
+ if (altmap) {
+ uint8_t *s8 = rd.s_start;
+ uint8_t *d8 = rd.d_start;
+ uint8_t *end8 = rd.d_top;
+ if (xor)
+ neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 1);
+ else
+ neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 0);
+ } else {
+ uint16_t *s16 = rd.s_start;
+ uint16_t *d16 = rd.d_start;
+ uint16_t *end16 = rd.d_top;
+ if (xor)
+ neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 1);
+ else
+ neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 0);
+ }
+
+ gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w16_split_4_16_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
+ gf_val_32_t val, int bytes, int xor)
+{
+ neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
+}
+
+static
+void
+gf_w16_split_4_16_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
+ void *dest,
+ gf_val_32_t val, int bytes,
+ int xor)
+{
+ neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
+}
+
+
+void gf_w16_neon_split_init(gf_t *gf)
+{
+ gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+ if (h->region_type & GF_REGION_ALTMAP)
+ gf->multiply_region.w32 = gf_w16_split_4_16_lazy_altmap_multiply_region_neon;
+ else
+ gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region_neon;
+}
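For reference, a scalar sketch (not part of the patch) of what the split-4 routines above compute for each 16-bit word: the lazy init fills four 16-entry tables with the low and high bytes of (nibble << 4*i) * val, and the product is the xor of one low/high lookup per source nibble. The names below are illustrative only and assume the tbl/high layout built in neon_w16_split_4_16_lazy_multiply_region().

static uint16_t w16_split_4_scalar(uint16_t a, const uint8_t *tbl)
{
  const uint8_t *high = tbl + 4 * 16;
  uint16_t prod = 0;
  int i;

  for (i = 0; i < 4; i++) {
    int nib = (a >> (4 * i)) & 0xf;
    prod ^= tbl[i * 16 + nib];                   /* low byte of (nib << 4*i) * val */
    prod ^= (uint16_t) high[i * 16 + nib] << 8;  /* high byte of the same product */
  }
  return prod;
}

The NEON versions compute the same thing with vqtbl1q_u8/vtbl2_u8 table lookups on whole vectors at a time.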
diff --git a/src/neon/gf_w32_neon.c b/src/neon/gf_w32_neon.c
new file mode 100644
index 0000000..8231eb3
--- /dev/null
+++ b/src/neon/gf_w32_neon.c
@@ -0,0 +1,269 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the University of Tennessee nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w32_neon.c
+ *
+ * Neon routines for 32-bit Galois fields
+ *
+ */
+
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w32.h"
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
+ vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+static
+void
+neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst,
+ uint32_t *d_end, uint8_t btable[8][4][16],
+ uint32_t val, int xor, int altmap)
+{
+ int i, j;
+#ifdef ARCH_AARCH64
+ uint8x16_t tables[8][4];
+#else
+ uint8x8x2_t tables[8][4];
+#endif
+ uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3;
+ uint8x16_t p0, p1, p2, p3, si, mask1;
+ uint16x8x2_t r0, r1;
+ uint8x16x2_t q0, q1;
+
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 4; j++) {
+#ifdef ARCH_AARCH64
+ tables[i][j] = vld1q_u8(btable[i][j]);
+#else
+ tables[i][j].val[0] = vld1_u8(btable[i][j]);
+ tables[i][j].val[1] = vld1_u8(btable[i][j] + 8);
+#endif
+ }
+ }
+
+ mask1 = vdupq_n_u8(0xf);
+
+ while (dst < d_end) {
+
+ v0 = vld1q_u32(src); src += 4;
+ v1 = vld1q_u32(src); src += 4;
+ v2 = vld1q_u32(src); src += 4;
+ v3 = vld1q_u32(src); src += 4;
+
+ if (altmap) {
+ q0.val[0] = vreinterpretq_u8_u32(v0);
+ q0.val[1] = vreinterpretq_u8_u32(v1);
+ q1.val[0] = vreinterpretq_u8_u32(v2);
+ q1.val[1] = vreinterpretq_u8_u32(v3);
+ } else {
+ r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2));
+ r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3));
+
+ q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]),
+ vreinterpretq_u8_u16(r1.val[0]));
+ q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]),
+ vreinterpretq_u8_u16(r1.val[1]));
+ }
+
+ si = vandq_u8(q0.val[0], mask1);
+ p0 = vqtbl1q_u8(tables[0][0], si);
+ p1 = vqtbl1q_u8(tables[0][1], si);
+ p2 = vqtbl1q_u8(tables[0][2], si);
+ p3 = vqtbl1q_u8(tables[0][3], si);
+
+ si = vshrq_n_u8(q0.val[0], 4);
+ p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si));
+ p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si));
+ p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si));
+ p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si));
+
+ si = vandq_u8(q0.val[1], mask1);
+ p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si));
+ p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si));
+ p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si));
+ p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si));
+
+ si = vshrq_n_u8(q0.val[1], 4);
+ p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si));
+ p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si));
+ p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si));
+ p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si));
+
+ si = vandq_u8(q1.val[0], mask1);
+ p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si));
+ p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si));
+ p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si));
+ p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si));
+
+ si = vshrq_n_u8(q1.val[0], 4);
+ p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si));
+ p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si));
+ p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si));
+ p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si));
+
+ si = vandq_u8(q1.val[1], mask1);
+ p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si));
+ p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si));
+ p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si));
+ p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si));
+
+ si = vshrq_n_u8(q1.val[1], 4);
+ p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si));
+ p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si));
+ p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si));
+ p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si));
+
+ if (altmap) {
+ s0 = vreinterpretq_u32_u8(p0);
+ s1 = vreinterpretq_u32_u8(p1);
+ s2 = vreinterpretq_u32_u8(p2);
+ s3 = vreinterpretq_u32_u8(p3);
+ } else {
+ q0 = vtrnq_u8(p0, p1);
+ q1 = vtrnq_u8(p2, p3);
+
+ r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]),
+ vreinterpretq_u16_u8(q1.val[0]));
+ r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]),
+ vreinterpretq_u16_u8(q1.val[1]));
+
+ s0 = vreinterpretq_u32_u16(r0.val[0]);
+ s1 = vreinterpretq_u32_u16(r1.val[0]);
+ s2 = vreinterpretq_u32_u16(r0.val[1]);
+ s3 = vreinterpretq_u32_u16(r1.val[1]);
+ }
+
+ if (xor) {
+ v0 = vld1q_u32(dst);
+ v1 = vld1q_u32(dst + 4);
+ v2 = vld1q_u32(dst + 8);
+ v3 = vld1q_u32(dst + 12);
+ s0 = veorq_u32(s0, v0);
+ s1 = veorq_u32(s1, v1);
+ s2 = veorq_u32(s2, v2);
+ s3 = veorq_u32(s3, v3);
+ }
+
+ vst1q_u32(dst, s0);
+ vst1q_u32(dst + 4, s1);
+ vst1q_u32(dst + 8, s2);
+ vst1q_u32(dst + 12, s3);
+
+ dst += 16;
+ }
+}
+
+static
+inline
+void
+neon_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor, int altmap)
+{
+ gf_internal_t *h;
+ int i, j, k;
+ uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
+ uint8_t btable[8][4][16];
+ gf_region_data rd;
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ h = (gf_internal_t *) gf->scratch;
+ pp = h->prim_poly;
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
+ gf_do_initial_region_alignment(&rd);
+
+ s32 = (uint32_t *) rd.s_start;
+ d32 = (uint32_t *) rd.d_start;
+ top = (uint32_t *) rd.d_top;
+
+ v = val;
+ for (i = 0; i < 8; i++) {
+ tmp_table[0] = 0;
+ for (j = 1; j < 16; j <<= 1) {
+ for (k = 0; k < j; k++) {
+ tmp_table[k^j] = (v ^ tmp_table[k]);
+ }
+ v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+ }
+ for (j = 0; j < 4; j++) {
+ for (k = 0; k < 16; k++) {
+ btable[i][j][k] = (uint8_t) tmp_table[k];
+ tmp_table[k] >>= 8;
+ }
+ }
+ }
+
+ if (xor)
+ neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 1, altmap);
+ else
+ neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 0, altmap);
+
+ gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w32_split_4_32_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
+ gf_val_32_t val, int bytes, int xor)
+{
+ neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
+}
+
+static
+void
+gf_w32_split_4_32_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
+ void *dest, gf_val_32_t val,
+ int bytes, int xor)
+{
+ neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
+}
+
+void gf_w32_neon_split_init(gf_t *gf)
+{
+ gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+ if (h->region_type & GF_REGION_ALTMAP)
+ gf->multiply_region.w32 = gf_w32_split_4_32_lazy_altmap_multiply_region_neon;
+ else
+ gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region_neon;
+
+}
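The 32-bit path uses the same split-4 idea but keeps the product in four separate byte planes: btable[i][j][n] holds byte j of (n << 4*i) * val, and the vtrnq shuffles above only regroup source and result bytes so each plane can be looked up with one table instruction. A hypothetical scalar equivalent, for orientation only:

static uint32_t w32_split_4_scalar(uint32_t a, uint8_t btable[8][4][16])
{
  uint32_t prod = 0;
  int i, j;

  for (i = 0; i < 8; i++) {                 /* 8 source nibbles */
    int nib = (a >> (4 * i)) & 0xf;
    for (j = 0; j < 4; j++)                 /* 4 product byte planes */
      prod ^= (uint32_t) btable[i][j][nib] << (8 * j);
  }
  return prod;
}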
diff --git a/src/neon/gf_w4_neon.c b/src/neon/gf_w4_neon.c
new file mode 100644
index 0000000..3a21432
--- /dev/null
+++ b/src/neon/gf_w4_neon.c
@@ -0,0 +1,247 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the University of Tennessee nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w4_neon.c
+ *
+ * Neon routines for 4-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w4.h"
+
+static
+gf_val_32_t
+gf_w4_neon_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
+{
+ gf_val_32_t rv = 0;
+ poly8x8_t result, prim_poly;
+ poly8x8_t a, b, w;
+ uint8x8_t v;
+ gf_internal_t * h = gf->scratch;
+
+ a = vdup_n_p8 (a4);
+ b = vdup_n_p8 (b4);
+
+ prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1fULL));
+
+ /* Do the initial multiply */
+ result = vmul_p8 (a, b);
+ v = vshr_n_u8 (vreinterpret_u8_p8(result), 4);
+ w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
+ result = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(result), vreinterpret_u8_p8(w)));
+
+ /* Extracts 32 bit value from result. */
+ rv = (gf_val_32_t)vget_lane_u8 (vreinterpret_u8_p8 (result), 0);
+
+ return rv;
+}
+
+static inline void
+neon_clm_multiply_region_from_single (gf_t *gf, uint8_t *s8, uint8_t *d8,
+ gf_val_32_t val, uint8_t *d_end, int xor)
+{
+ gf_internal_t * h = gf->scratch;
+ poly8x8_t prim_poly;
+ poly8x8_t a, w, even, odd;
+ uint8x8_t b, c, v, mask;
+
+ a = vdup_n_p8 (val);
+ mask = vdup_n_u8 (0xf);
+ prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0x1fULL));
+
+ while (d8 < d_end) {
+ b = vld1_u8 (s8);
+
+ even = vreinterpret_p8_u8 (vand_u8 (b, mask));
+ odd = vreinterpret_p8_u8 (vshr_n_u8 (b, 4));
+
+ if (xor)
+ c = vld1_u8 (d8);
+
+ even = vmul_p8 (a, even);
+ odd = vmul_p8 (a, odd);
+
+ v = vshr_n_u8 (vreinterpret_u8_p8(even), 4);
+ w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
+ even = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(even), vreinterpret_u8_p8(w)));
+
+ v = vshr_n_u8 (vreinterpret_u8_p8(odd), 4);
+ w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
+ odd = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(odd), vreinterpret_u8_p8(w)));
+
+ v = veor_u8 (vreinterpret_u8_p8 (even), vshl_n_u8 (vreinterpret_u8_p8 (odd), 4));
+
+ if (xor)
+ v = veor_u8 (c, v);
+
+ vst1_u8 (d8, v);
+
+ d8 += 8;
+ s8 += 8;
+ }
+}
+
+
+static void
+gf_w4_neon_clm_multiply_region_from_single (gf_t *gf, void *src, void *dest,
+ gf_val_32_t val, int bytes, int xor)
+{
+ gf_region_data rd;
+ uint8_t *s8;
+ uint8_t *d8;
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+ gf_do_initial_region_alignment(&rd);
+
+ s8 = (uint8_t *) rd.s_start;
+ d8 = (uint8_t *) rd.d_start;
+
+ if (xor)
+ neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 1);
+ else
+ neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 0);
+
+ gf_do_final_region_alignment(&rd);
+}
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
+ vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+static
+inline
+void
+w4_single_table_multiply_region_neon(gf_t *gf, uint8_t *src, uint8_t *dst,
+ uint8_t * d_end, gf_val_32_t val, int xor)
+{
+ struct gf_single_table_data *std;
+ uint8_t *base;
+ uint8x16_t r, va, vh, vl, loset;
+
+#ifdef ARCH_AARCH64
+ uint8x16_t th, tl;
+#else
+ uint8x8x2_t th, tl;
+#endif
+
+ std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+ base = (uint8_t *) std->mult;
+ base += (val << GF_FIELD_WIDTH);
+
+#ifdef ARCH_AARCH64
+ tl = vld1q_u8 (base);
+ th = vshlq_n_u8 (tl, 4);
+#else
+ tl.val[0] = vld1_u8 (base);
+ tl.val[1] = vld1_u8 (base + 8);
+ th.val[0] = vshl_n_u8 (tl.val[0], 4);
+ th.val[1] = vshl_n_u8 (tl.val[1], 4);
+#endif
+
+ loset = vdupq_n_u8(0xf);
+
+ while (dst < d_end) {
+ va = vld1q_u8 (src);
+
+ vh = vshrq_n_u8 (va, 4);
+ vl = vandq_u8 (va, loset);
+
+ if (xor)
+ va = vld1q_u8 (dst);
+
+ vh = vqtbl1q_u8 (th, vh);
+ vl = vqtbl1q_u8 (tl, vl);
+
+ r = veorq_u8 (vh, vl);
+
+ if (xor)
+ r = veorq_u8 (va, r);
+
+ vst1q_u8 (dst, r);
+
+ dst += 16;
+ src += 16;
+ }
+}
+
+static
+void
+gf_w4_single_table_multiply_region_neon(gf_t *gf, void *src, void *dest,
+ gf_val_32_t val, int bytes, int xor)
+{
+ gf_region_data rd;
+ uint8_t *sptr, *dptr, *top;
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+ gf_do_initial_region_alignment(&rd);
+
+ sptr = rd.s_start;
+ dptr = rd.d_start;
+ top = rd.d_top;
+
+ if (xor)
+ w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 1);
+ else
+ w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 0);
+
+ gf_do_final_region_alignment(&rd);
+
+}
+
+
+int gf_w4_neon_cfm_init(gf_t *gf)
+{
+  // a single clm multiplication gains little from NEON, but wire it up anyway
+ gf->multiply.w32 = gf_w4_neon_clm_multiply;
+ gf->multiply_region.w32 = gf_w4_neon_clm_multiply_region_from_single;
+
+ return 1;
+}
+
+void gf_w4_neon_single_table_init(gf_t *gf)
+{
+ gf->multiply_region.w32 = gf_w4_single_table_multiply_region_neon;
+}
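gf_w4_neon_clm_multiply() exploits the fact that a 4x4-bit carryless product has degree at most 6, so one multiply of its top nibble by the primitive polynomial (the low five bits dup'ed into prim_poly above) folds it back into GF(2^4). A scalar sketch of that single reduction step, illustrative only:

static uint8_t w4_clm_scalar(uint8_t a, uint8_t b, uint8_t prim_poly)
{
  uint8_t r = 0, w = 0;
  int i;

  for (i = 0; i < 4; i++)          /* carryless 4x4 -> 7-bit product */
    if (b & (1 << i)) r ^= a << i;
  for (i = 0; i < 3; i++)          /* fold the top nibble once */
    if ((r >> 4) & (1 << i)) w ^= prim_poly << i;
  return (r ^ w) & 0xf;            /* already reduced; mask only for clarity */
}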
diff --git a/src/neon/gf_w64_neon.c b/src/neon/gf_w64_neon.c
new file mode 100644
index 0000000..0eca9c7
--- /dev/null
+++ b/src/neon/gf_w64_neon.c
@@ -0,0 +1,333 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the University of Tennessee nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w64_neon.c
+ *
+ * Neon routines for 64-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w64.h"
+
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
+ vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+static
+inline
+void
+neon_w64_split_4_lazy_altmap_multiply_region(gf_t *gf, uint64_t *src,
+ uint64_t *dst, uint64_t *d_end,
+ uint64_t val, int xor)
+{
+ unsigned i, j, k;
+ uint8_t btable[16];
+#ifdef ARCH_AARCH64
+ uint8x16_t tables[16][8];
+#else
+ uint8x8x2_t tables[16][8];
+#endif
+ uint8x16_t p[8], mask1, si;
+
+ gf_internal_t *h = (gf_internal_t *) gf->scratch;
+ struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+ for (i = 0; i < 16; i++) {
+ for (j = 0; j < 8; j++) {
+ for (k = 0; k < 16; k++) {
+ btable[k] = (uint8_t) ld->tables[i][k];
+ ld->tables[i][k] >>= 8;
+ }
+#ifdef ARCH_AARCH64
+ tables[i][j] = vld1q_u8(btable);
+#else
+ tables[i][j].val[0] = vld1_u8(btable);
+ tables[i][j].val[1] = vld1_u8(btable + 8);
+#endif
+ }
+ }
+
+ mask1 = vdupq_n_u8(0xf);
+
+ while (dst < d_end) {
+
+ if (xor) {
+ for (i = 0; i < 8; i++)
+ p[i] = vld1q_u8((uint8_t *) (dst + i * 2));
+ } else {
+ for (i = 0; i < 8; i++)
+ p[i] = vdupq_n_u8(0);
+ }
+
+ i = 0;
+ for (k = 0; k < 8; k++) {
+ uint8x16_t v0 = vld1q_u8((uint8_t *) src);
+ src += 2;
+
+ si = vandq_u8(v0, mask1);
+ for (j = 0; j < 8; j++) {
+ p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+ }
+ i++;
+ si = vshrq_n_u8(v0, 4);
+ for (j = 0; j < 8; j++) {
+ p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+ }
+ i++;
+
+ }
+ for (i = 0; i < 8; i++) {
+ vst1q_u8((uint8_t *) dst, p[i]);
+ dst += 2;
+ }
+ }
+}
+
+static
+inline
+void
+neon_w64_split_4_lazy_multiply_region(gf_t *gf, uint64_t *src, uint64_t *dst,
+ uint64_t *d_end, uint64_t val, int xor)
+{
+ unsigned i, j, k;
+ uint8_t btable[16];
+#ifdef ARCH_AARCH64
+ uint8x16_t tables[16][8];
+#else
+ uint8x8x2_t tables[16][8];
+#endif
+ uint8x16_t p[8], mask1, si;
+ uint64x2_t st[8];
+ uint32x4x2_t s32[4];
+ uint16x8x2_t s16[4];
+ uint8x16x2_t s8[4];
+
+ gf_internal_t *h = (gf_internal_t *) gf->scratch;
+ struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+ for (i = 0; i < 16; i++) {
+ for (j = 0; j < 8; j++) {
+ for (k = 0; k < 16; k++) {
+ btable[k] = (uint8_t) ld->tables[i][k];
+ ld->tables[i][k] >>= 8;
+ }
+#ifdef ARCH_AARCH64
+ tables[i][j] = vld1q_u8(btable);
+#else
+ tables[i][j].val[0] = vld1_u8(btable);
+ tables[i][j].val[1] = vld1_u8(btable + 8);
+#endif
+ }
+ }
+
+ mask1 = vdupq_n_u8(0xf);
+
+ while (dst < d_end) {
+
+ for (k = 0; k < 8; k++) {
+ st[k] = vld1q_u64(src);
+ src += 2;
+ p[k] = vdupq_n_u8(0);
+ }
+
+ s32[0] = vuzpq_u32(vreinterpretq_u32_u64(st[0]),
+ vreinterpretq_u32_u64(st[1]));
+ s32[1] = vuzpq_u32(vreinterpretq_u32_u64(st[2]),
+ vreinterpretq_u32_u64(st[3]));
+ s32[2] = vuzpq_u32(vreinterpretq_u32_u64(st[4]),
+ vreinterpretq_u32_u64(st[5]));
+ s32[3] = vuzpq_u32(vreinterpretq_u32_u64(st[6]),
+ vreinterpretq_u32_u64(st[7]));
+
+ s16[0] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[0]),
+ vreinterpretq_u16_u32(s32[1].val[0]));
+ s16[1] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[0]),
+ vreinterpretq_u16_u32(s32[3].val[0]));
+ s16[2] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[1]),
+ vreinterpretq_u16_u32(s32[1].val[1]));
+ s16[3] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[1]),
+ vreinterpretq_u16_u32(s32[3].val[1]));
+
+ s8[0] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[0]),
+ vreinterpretq_u8_u16(s16[1].val[0]));
+ s8[1] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[1]),
+ vreinterpretq_u8_u16(s16[1].val[1]));
+ s8[2] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[0]),
+ vreinterpretq_u8_u16(s16[3].val[0]));
+ s8[3] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[1]),
+ vreinterpretq_u8_u16(s16[3].val[1]));
+
+ i = 0;
+ for (k = 0; k < 8; k++) {
+ si = vandq_u8(s8[k >> 1].val[k & 1], mask1);
+ for (j = 0; j < 8; j++) {
+ p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+ }
+ i++;
+ si = vshrq_n_u8(s8[k >> 1].val[k & 1], 4);
+ for (j = 0; j < 8; j++) {
+ p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+ }
+ i++;
+ }
+
+ s8[0] = vzipq_u8(p[0], p[1]);
+ s8[1] = vzipq_u8(p[2], p[3]);
+ s8[2] = vzipq_u8(p[4], p[5]);
+ s8[3] = vzipq_u8(p[6], p[7]);
+
+ s16[0] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[0]),
+ vreinterpretq_u16_u8(s8[1].val[0]));
+ s16[1] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[0]),
+ vreinterpretq_u16_u8(s8[3].val[0]));
+ s16[2] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[1]),
+ vreinterpretq_u16_u8(s8[1].val[1]));
+ s16[3] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[1]),
+ vreinterpretq_u16_u8(s8[3].val[1]));
+
+ s32[0] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[0]),
+ vreinterpretq_u32_u16(s16[1].val[0]));
+ s32[1] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[1]),
+ vreinterpretq_u32_u16(s16[1].val[1]));
+ s32[2] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[0]),
+ vreinterpretq_u32_u16(s16[3].val[0]));
+ s32[3] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[1]),
+ vreinterpretq_u32_u16(s16[3].val[1]));
+
+ for (k = 0; k < 8; k ++) {
+ st[k] = vreinterpretq_u64_u32(s32[k >> 1].val[k & 1]);
+ }
+
+ if (xor) {
+ for (i = 0; i < 8; i++) {
+ uint64x2_t t1 = vld1q_u64(dst);
+ vst1q_u64(dst, veorq_u64(st[i], t1));
+ dst += 2;
+ }
+ } else {
+ for (i = 0; i < 8; i++) {
+ vst1q_u64(dst, st[i]);
+ dst += 2;
+ }
+ }
+
+ }
+}
+
+static
+void
+gf_w64_neon_split_4_lazy_multiply_region(gf_t *gf, void *src, void *dest,
+ uint64_t val, int bytes, int xor,
+ int altmap)
+{
+ gf_internal_t *h;
+ int i, j, k;
+ uint64_t pp, v, *s64, *d64, *top;
+ struct gf_split_4_64_lazy_data *ld;
+ gf_region_data rd;
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
+ gf_do_initial_region_alignment(&rd);
+
+ s64 = (uint64_t *) rd.s_start;
+ d64 = (uint64_t *) rd.d_start;
+ top = (uint64_t *) rd.d_top;
+
+ h = (gf_internal_t *) gf->scratch;
+ pp = h->prim_poly;
+ ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+ v = val;
+ for (i = 0; i < 16; i++) {
+ ld->tables[i][0] = 0;
+ for (j = 1; j < 16; j <<= 1) {
+ for (k = 0; k < j; k++) {
+ ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+ }
+ v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+ }
+ }
+
+ if (altmap) {
+ if (xor)
+ neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 1);
+ else
+ neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 0);
+ } else {
+ if (xor)
+ neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 1);
+ else
+ neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 0);
+ }
+
+ gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w64_split_4_64_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
+ uint64_t val, int bytes, int xor)
+{
+ gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
+}
+
+static
+void
+gf_w64_split_4_64_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
+ void *dest, uint64_t val,
+ int bytes, int xor)
+{
+ gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
+}
+
+void gf_w64_neon_split_init(gf_t *gf)
+{
+ gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+ if (h->region_type & GF_REGION_ALTMAP)
+ gf->multiply_region.w64 = gf_w64_split_4_64_lazy_altmap_multiply_region_neon;
+ else
+ gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region_neon;
+
+}
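Ignoring the vuzpq/vzipq transposes (which only regroup same-significance bytes so every lookup can use a single vqtbl1q_u8), the arithmetic per 64-bit word is the usual split-4 lazy-table sum. A rough scalar equivalent, assuming the 16x16 uint64_t tables as built at the top of gf_w64_neon_split_4_lazy_multiply_region() before they are sliced into byte tables:

static uint64_t w64_split_4_scalar(uint64_t a, uint64_t tables[16][16])
{
  uint64_t prod = 0;
  int i;

  for (i = 0; i < 16; i++)         /* one 16-entry table per source nibble */
    prod ^= tables[i][(a >> (4 * i)) & 0xf];
  return prod;
}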
diff --git a/src/neon/gf_w8_neon.c b/src/neon/gf_w8_neon.c
new file mode 100644
index 0000000..930a916
--- /dev/null
+++ b/src/neon/gf_w8_neon.c
@@ -0,0 +1,302 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the University of Tennessee nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w8_neon.c
+ *
+ * Neon optimized routines for 8-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include "gf_w8.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+/* ARM NEON reduction macro for the carry-free multiplication.
+ * vmull_p8 is the carryless multiply operation. Here vshrn_n_u16 shifts
+ * the result right by one byte, which lets us multiply the prim_poly
+ * by the leading bits of the result. We then xor that product back
+ * into the result. */
+#define NEON_CFM_REDUCE(v, w, result, prim_poly, initial) \
+ do { \
+ if (initial) \
+ v = vshrn_n_u16 (vreinterpretq_u16_p16(result), 8); \
+ else \
+ v = veor_u8 (v, vshrn_n_u16 (vreinterpretq_u16_p16(result), 8)); \
+ w = vmull_p8 (prim_poly, vreinterpret_p8_u8(v)); \
+ result = vreinterpretq_p16_u16 (veorq_u16 (vreinterpretq_u16_p16(result), vreinterpretq_u16_p16(w))); \
+ } while (0)
+
+static
+inline
+gf_val_32_t
+gf_w8_neon_clm_multiply_x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8, int x)
+{
+ gf_val_32_t rv = 0;
+ poly8x8_t a, b;
+ uint8x8_t v;
+ poly16x8_t result;
+ poly8x8_t prim_poly;
+ poly16x8_t w;
+ gf_internal_t * h = gf->scratch;
+
+ a = vdup_n_p8 (a8);
+ b = vdup_n_p8 (b8);
+
+ prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1ffULL));
+
+ /* Do the initial multiply */
+ result = vmull_p8 (a, b);
+
+  /* Ben: Do the prim_poly reduction twice. We are guaranteed to need the
+     reduction at most twice here, because (w-2)/z == 2, where z is the
+     number of zeros after the leading 1 of the polynomial. */
+ NEON_CFM_REDUCE (v, w, result, prim_poly, 1);
+ NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+ if (x >= 3) {
+ NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+ }
+ if (x >= 4) {
+ NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+ }
+ /* Extracts 32 bit value from result. */
+ rv = (gf_val_32_t)vget_lane_u8 (vmovn_u16 (vreinterpretq_u16_p16 (result)), 0);
+
+ return rv;
+}
+
+#define CLM_MULTIPLY(x) \
+static gf_val_32_t gf_w8_neon_clm_multiply_ ## x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) \
+{\
+ return gf_w8_neon_clm_multiply_x (gf, a8, b8, x);\
+}
+
+CLM_MULTIPLY(2)
+CLM_MULTIPLY(3)
+CLM_MULTIPLY(4)
+
+static inline void
+neon_clm_multiply_region_from_single_x(gf_t *gf, uint8_t *s8, uint8_t *d8,
+ gf_val_32_t val, uint8_t *d_end,
+ int xor, int x)
+{
+ gf_internal_t * h = gf->scratch;
+ poly8x8_t a, b;
+ uint8x8_t c, v;
+ poly16x8_t result;
+ poly8x8_t prim_poly;
+ poly16x8_t w;
+
+ a = vdup_n_p8 (val);
+ prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0xffULL));
+
+ while (d8 < d_end) {
+ b = vld1_p8 ((poly8_t *) s8);
+
+ if (xor)
+ c = vld1_u8 (d8);
+
+ result = vmull_p8 (a, b);
+
+ NEON_CFM_REDUCE(v, w, result, prim_poly, 1);
+ NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+ if (x >= 3) {
+ NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+ }
+ if (x >= 4) {
+ NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+ }
+ v = vmovn_u16 (vreinterpretq_u16_p16 (result));
+ if (xor)
+ v = veor_u8 (c, v);
+
+ vst1_u8 (d8, v);
+
+ d8 += 8;
+ s8 += 8;
+ }
+}
+
+#define CLM_MULT_REGION(x) \
+static void \
+gf_w8_neon_clm_multiply_region_from_single_ ## x (gf_t *gf, void *src, \
+ void *dest, \
+ gf_val_32_t val, int bytes, \
+ int xor) \
+{ \
+ gf_region_data rd; \
+ uint8_t *s8; \
+ uint8_t *d8; \
+ \
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } \
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } \
+ \
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); \
+ gf_do_initial_region_alignment(&rd); \
+ s8 = (uint8_t *) rd.s_start; \
+ d8 = (uint8_t *) rd.d_start; \
+ \
+ if (xor) \
+ neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 1, x); \
+ else \
+ neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 0, x);\
+ gf_do_final_region_alignment(&rd); \
+}
+
+CLM_MULT_REGION(2)
+CLM_MULT_REGION(3)
+CLM_MULT_REGION(4)
+
+
+int gf_w8_neon_cfm_init(gf_t *gf)
+{
+ gf_internal_t *h;
+
+ h = (gf_internal_t *) gf->scratch;
+
+ if ((0xe0 & h->prim_poly) == 0){
+ gf->multiply.w32 = gf_w8_neon_clm_multiply_2;
+ gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_2;
+ }else if ((0xc0 & h->prim_poly) == 0){
+ gf->multiply.w32 = gf_w8_neon_clm_multiply_3;
+ gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_3;
+ }else if ((0x80 & h->prim_poly) == 0){
+ gf->multiply.w32 = gf_w8_neon_clm_multiply_4;
+ gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_4;
+ }else{
+ return 0;
+ }
+ return 1;
+}
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
+ vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+static
+void
+gf_w8_split_multiply_region_neon(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+ uint8_t *bh, *bl, *sptr, *dptr;
+ uint8x16_t r, va, vh, vl, loset;
+#ifdef ARCH_AARCH64
+ uint8x16_t mth, mtl;
+#else
+ uint8x8x2_t mth, mtl;
+#endif
+ struct gf_w8_half_table_data *htd;
+ gf_region_data rd;
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+ gf_do_initial_region_alignment(&rd);
+
+ bh = (uint8_t *) htd->high;
+ bh += (val << 4);
+ bl = (uint8_t *) htd->low;
+ bl += (val << 4);
+
+ sptr = rd.s_start;
+ dptr = rd.d_start;
+
+#ifdef ARCH_AARCH64
+ mth = vld1q_u8 (bh);
+ mtl = vld1q_u8 (bl);
+#else
+ mth.val[0] = vld1_u8 (bh);
+ mtl.val[0] = vld1_u8 (bl);
+ mth.val[1] = vld1_u8 (bh + 8);
+ mtl.val[1] = vld1_u8 (bl + 8);
+#endif
+
+ loset = vdupq_n_u8(0xf);
+
+ if (xor) {
+ while (sptr < (uint8_t *) rd.s_top) {
+ va = vld1q_u8 (sptr);
+
+ vh = vshrq_n_u8 (va, 4);
+ vl = vandq_u8 (va, loset);
+ va = vld1q_u8 (dptr);
+
+ vh = vqtbl1q_u8 (mth, vh);
+ vl = vqtbl1q_u8 (mtl, vl);
+
+ r = veorq_u8 (vh, vl);
+
+ vst1q_u8 (dptr, veorq_u8 (va, r));
+
+ dptr += 16;
+ sptr += 16;
+ }
+ } else {
+ while (sptr < (uint8_t *) rd.s_top) {
+ va = vld1q_u8 (sptr);
+
+ vh = vshrq_n_u8 (va, 4);
+ vl = vandq_u8 (va, loset);
+#ifdef ARCH_AARCH64
+ vh = vqtbl1q_u8 (mth, vh);
+ vl = vqtbl1q_u8 (mtl, vl);
+#else
+ vh = vcombine_u8 (vtbl2_u8 (mth, vget_low_u8 (vh)),
+ vtbl2_u8 (mth, vget_high_u8 (vh)));
+ vl = vcombine_u8 (vtbl2_u8 (mtl, vget_low_u8 (vl)),
+ vtbl2_u8 (mtl, vget_high_u8 (vl)));
+#endif
+
+ r = veorq_u8 (vh, vl);
+
+ vst1q_u8(dptr, r);
+
+ dptr += 16;
+ sptr += 16;
+ }
+ }
+
+ gf_do_final_region_alignment(&rd);
+}
+
+
+void gf_w8_neon_split_init(gf_t *gf)
+{
+ gf->multiply_region.w32 = gf_w8_split_multiply_region_neon;
+}
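gf_w8_neon_cfm_init() above chooses the number of NEON_CFM_REDUCE steps from how many zero bits the polynomial has directly under x^8: the fewer zeros, the further the unreduced bits reach and the more folds are needed. The same 0xe0/0xc0/0x80 checks, expressed as a hypothetical helper for illustration:

static int w8_neon_reduce_steps(uint32_t prim_poly)
{
  uint8_t low = (uint8_t) (prim_poly & 0xff);

  if ((low & 0xe0) == 0) return 2;   /* bits 7..5 clear: two folds suffice */
  if ((low & 0xc0) == 0) return 3;   /* bits 7..6 clear */
  if ((low & 0x80) == 0) return 4;   /* bit 7 clear */
  return -1;                         /* polynomial not handled by the NEON path */
}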
diff --git a/test/Makefile.am b/test/Makefile.am
index 7f39a48..2791528 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -1,7 +1,7 @@
# GF-Complete 'test' AM file
-AM_CPPFLAGS=-I./ -I../include
-AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
+AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
+AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
bin_PROGRAMS = gf_unit
diff --git a/test/gf_unit.c b/test/gf_unit.c
index 98ff98c..db26849 100644
--- a/test/gf_unit.c
+++ b/test/gf_unit.c
@@ -8,6 +8,14 @@
* Performs unit testing for gf arithmetic
*/
+#include "config.h"
+
+#ifdef HAVE_POSIX_MEMALIGN
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600
+#endif
+#endif
+
#include <stdio.h>
#include <getopt.h>
#include <stdint.h>
@@ -82,6 +90,9 @@ int main(int argc, char **argv)
uint32_t mask = 0;
char *ra, *rb, *rc, *rd, *target;
int align;
+#ifndef HAVE_POSIX_MEMALIGN
+ char *malloc_ra, *malloc_rb, *malloc_rc, *malloc_rd;
+#endif
if (argc < 4) usage(NULL);
@@ -116,18 +127,26 @@ int main(int argc, char **argv)
c = (gf_general_t *) malloc(sizeof(gf_general_t));
d = (gf_general_t *) malloc(sizeof(gf_general_t));
+#ifdef HAVE_POSIX_MEMALIGN
+ if (posix_memalign((void **) &ra, 16, sizeof(char)*REGION_SIZE))
+ ra = NULL;
+ if (posix_memalign((void **) &rb, 16, sizeof(char)*REGION_SIZE))
+ rb = NULL;
+ if (posix_memalign((void **) &rc, 16, sizeof(char)*REGION_SIZE))
+ rc = NULL;
+ if (posix_memalign((void **) &rd, 16, sizeof(char)*REGION_SIZE))
+ rd = NULL;
+#else
//15 bytes extra to make sure it's 16byte aligned
- ra = (char *) malloc(sizeof(char)*REGION_SIZE+15);
- rb = (char *) malloc(sizeof(char)*REGION_SIZE+15);
- rc = (char *) malloc(sizeof(char)*REGION_SIZE+15);
- rd = (char *) malloc(sizeof(char)*REGION_SIZE+15);
-
- //this still assumes 8 byte aligned pointer from malloc
- //(which is usual on 32-bit machines)
- ra += (uint64_t)ra & 0xf;
- rb += (uint64_t)rb & 0xf;
- rc += (uint64_t)rc & 0xf;
- rd += (uint64_t)rd & 0xf;
+ malloc_ra = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+ malloc_rb = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+ malloc_rc = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+ malloc_rd = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+  ra = (char *) (((uintptr_t) malloc_ra + 15) & ~((uintptr_t) 0xf));
+  rb = (char *) (((uintptr_t) malloc_rb + 15) & ~((uintptr_t) 0xf));
+  rc = (char *) (((uintptr_t) malloc_rc + 15) & ~((uintptr_t) 0xf));
+  rd = (char *) (((uintptr_t) malloc_rd + 15) & ~((uintptr_t) 0xf));
+#endif
if (w <= 32) {
mask = 0;
@@ -423,10 +442,17 @@ int main(int argc, char **argv)
free(b);
free(c);
free(d);
+#ifdef HAVE_POSIX_MEMALIGN
free(ra);
free(rb);
free(rc);
free(rd);
+#else
+ free(malloc_ra);
+ free(malloc_rb);
+ free(malloc_rc);
+ free(malloc_rd);
+#endif
return 0;
}
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 7c55d65..eb27d4a 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -1,9 +1,7 @@
# GF-Complete 'tools' AM file
-AM_CPPFLAGS=-I./ -I../include
-AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
-
-TESTS=run-tests.sh
+AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
+AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
bin_PROGRAMS = gf_mult gf_div gf_add gf_time gf_methods gf_poly gf_inline_time
@@ -35,3 +33,19 @@ gf_inline_time_SOURCES = gf_inline_time.c
#gf_inline_time_LDFLAGS = -lgf_complete
gf_inline_time_LDADD = ../src/libgf_complete.la
+# gf_unit tests as generated by gf_methods
+gf_unit_w%.sh: gf_methods
+ ./$^ $(@:gf_unit_w%.sh=%) -A -U > $@ || rm $@
+
+TESTS = gf_unit_w128.sh \
+ gf_unit_w64.sh \
+ gf_unit_w32.sh \
+ gf_unit_w16.sh \
+ gf_unit_w8.sh \
+ gf_unit_w4.sh
+
+TEST_EXTENSIONS = .sh
+SH_LOG_COMPILER = $(SHELL)
+AM_SH_LOG_FLAGS = -e
+
+CLEANFILES = $(TESTS)
diff --git a/tools/gf_methods.c b/tools/gf_methods.c
index 43589ac..c7d3d58 100644
--- a/tools/gf_methods.c
+++ b/tools/gf_methods.c
@@ -28,7 +28,7 @@ static char *MULTS[NMULTS] = { "SHIFT", "CARRY_FREE", "CARRY_FREE_GK", "GROUP44"
/* Make sure CAUCHY is last */
#define NREGIONS (7)
-static char *REGIONS[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SSE", "NOSSE",
+static char *REGIONS[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SIMD", "NOSIMD",
"ALTMAP", "CAUCHY" };
#define BNREGIONS (4)
diff --git a/tools/gf_time.c b/tools/gf_time.c
index d17a7c2..7402ab5 100644
--- a/tools/gf_time.c
+++ b/tools/gf_time.c
@@ -8,6 +8,14 @@
* Performs timing for gf arithmetic
*/
+#include "config.h"
+
+#ifdef HAVE_POSIX_MEMALIGN
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600
+#endif
+#endif
+
#include <stdio.h>
#include <getopt.h>
#include <stdint.h>
@@ -95,6 +103,9 @@ int main(int argc, char **argv)
time_t t0;
uint8_t *ra, *rb;
gf_general_t a;
+#ifndef HAVE_POSIX_MEMALIGN
+ uint8_t *malloc_ra, *malloc_rb;
+#endif
if (argc < 6) usage(NULL);
@@ -155,8 +166,17 @@ int main(int argc, char **argv)
printf("Seed: %ld\n", t0);
- ra = (uint8_t *) malloc(size);
- rb = (uint8_t *) malloc(size);
+#ifdef HAVE_POSIX_MEMALIGN
+ if (posix_memalign((void **) &ra, 16, size))
+ ra = NULL;
+ if (posix_memalign((void **) &rb, 16, size))
+ rb = NULL;
+#else
+ malloc_ra = (uint8_t *) malloc(size + 15);
+ malloc_rb = (uint8_t *) malloc(size + 15);
+ ra = (uint8_t *) (((uintptr_t) malloc_ra + 15) & ~((uintptr_t) 0xf));
+ rb = (uint8_t *) (((uintptr_t) malloc_rb + 15) & ~((uintptr_t) 0xf));
+#endif
if (ra == NULL || rb == NULL) { perror("malloc"); exit(1); }
diff --git a/tools/run-tests.sh b/tools/run-tests.sh
deleted file mode 100755
index bd3cc60..0000000
--- a/tools/run-tests.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-for w in 4 8 16 32 64 128 ; do
- ./gf_methods $w -A -U | sh -e
- if [ $? != "0" ] ; then
- echo "Failed unit tests for w=$w"
- break
- fi
-done