summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore8
-rwxr-xr-xcompile347
-rw-r--r--configure.ac17
-rwxr-xr-xdepcomp791
-rw-r--r--include/gf_cpu.h20
-rw-r--r--include/gf_int.h16
-rw-r--r--m4/ax_ext.m4295
-rw-r--r--m4/ax_gcc_x86_avx_xgetbv.m479
-rw-r--r--m4/ax_gcc_x86_cpuid.m479
-rw-r--r--m4/ltoptions.m4384
-rw-r--r--m4/ltsugar.m4123
-rw-r--r--m4/lt~obsolete.m498
-rw-r--r--src/Makefile.am16
-rw-r--r--src/gf.c149
-rw-r--r--src/gf_cpu.c168
-rw-r--r--src/gf_w128.c114
-rw-r--r--src/gf_w16.c231
-rw-r--r--src/gf_w32.c295
-rw-r--r--src/gf_w4.c172
-rw-r--r--src/gf_w64.c244
-rw-r--r--src/gf_w8.c236
-rw-r--r--src/gf_wgen.c64
-rw-r--r--src/neon/gf_w16_neon.c4
-rw-r--r--src/neon/gf_w32_neon.c4
-rw-r--r--src/neon/gf_w4_neon.c6
-rw-r--r--src/neon/gf_w64_neon.c4
-rw-r--r--src/neon/gf_w8_neon.c14
-rw-r--r--test/Makefile.am2
-rw-r--r--tools/Makefile.am2
-rw-r--r--tools/gf_methods.c24
-rwxr-xr-xtools/test_simd.sh367
-rwxr-xr-xtools/test_simd_qemu.sh258
32 files changed, 1699 insertions, 2932 deletions
diff --git a/.gitignore b/.gitignore
index c455d23..bfc1dfc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,6 +50,10 @@ config.sub
ltmain.sh
m4/libtool.m4
m4/ltversion.m4
+m4/ltoptions.m4
+m4/ltsugar.m4
+m4/lt~obsolete.m4
+test-driver
src/.dirstamp
test-driver
@@ -68,3 +72,7 @@ tools/gf_methods
tools/gf_mult
tools/gf_poly
tools/gf_time
+tools/gf_unit_w*
+tools/test-suite.log
+tools/.qemu/
+tools/test_simd*.results*
diff --git a/compile b/compile
deleted file mode 100755
index 531136b..0000000
--- a/compile
+++ /dev/null
@@ -1,347 +0,0 @@
-#! /bin/sh
-# Wrapper for compilers which do not understand '-c -o'.
-
-scriptversion=2012-10-14.11; # UTC
-
-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
-# Written by Tom Tromey <tromey@cygnus.com>.
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# This file is maintained in Automake, please report
-# bugs to <bug-automake@gnu.org> or send patches to
-# <automake-patches@gnu.org>.
-
-nl='
-'
-
-# We need space, tab and new line, in precisely that order. Quoting is
-# there to prevent tools from complaining about whitespace usage.
-IFS=" "" $nl"
-
-file_conv=
-
-# func_file_conv build_file lazy
-# Convert a $build file to $host form and store it in $file
-# Currently only supports Windows hosts. If the determined conversion
-# type is listed in (the comma separated) LAZY, no conversion will
-# take place.
-func_file_conv ()
-{
- file=$1
- case $file in
- / | /[!/]*) # absolute file, and not a UNC file
- if test -z "$file_conv"; then
- # lazily determine how to convert abs files
- case `uname -s` in
- MINGW*)
- file_conv=mingw
- ;;
- CYGWIN*)
- file_conv=cygwin
- ;;
- *)
- file_conv=wine
- ;;
- esac
- fi
- case $file_conv/,$2, in
- *,$file_conv,*)
- ;;
- mingw/*)
- file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'`
- ;;
- cygwin/*)
- file=`cygpath -m "$file" || echo "$file"`
- ;;
- wine/*)
- file=`winepath -w "$file" || echo "$file"`
- ;;
- esac
- ;;
- esac
-}
-
-# func_cl_dashL linkdir
-# Make cl look for libraries in LINKDIR
-func_cl_dashL ()
-{
- func_file_conv "$1"
- if test -z "$lib_path"; then
- lib_path=$file
- else
- lib_path="$lib_path;$file"
- fi
- linker_opts="$linker_opts -LIBPATH:$file"
-}
-
-# func_cl_dashl library
-# Do a library search-path lookup for cl
-func_cl_dashl ()
-{
- lib=$1
- found=no
- save_IFS=$IFS
- IFS=';'
- for dir in $lib_path $LIB
- do
- IFS=$save_IFS
- if $shared && test -f "$dir/$lib.dll.lib"; then
- found=yes
- lib=$dir/$lib.dll.lib
- break
- fi
- if test -f "$dir/$lib.lib"; then
- found=yes
- lib=$dir/$lib.lib
- break
- fi
- if test -f "$dir/lib$lib.a"; then
- found=yes
- lib=$dir/lib$lib.a
- break
- fi
- done
- IFS=$save_IFS
-
- if test "$found" != yes; then
- lib=$lib.lib
- fi
-}
-
-# func_cl_wrapper cl arg...
-# Adjust compile command to suit cl
-func_cl_wrapper ()
-{
- # Assume a capable shell
- lib_path=
- shared=:
- linker_opts=
- for arg
- do
- if test -n "$eat"; then
- eat=
- else
- case $1 in
- -o)
- # configure might choose to run compile as 'compile cc -o foo foo.c'.
- eat=1
- case $2 in
- *.o | *.[oO][bB][jJ])
- func_file_conv "$2"
- set x "$@" -Fo"$file"
- shift
- ;;
- *)
- func_file_conv "$2"
- set x "$@" -Fe"$file"
- shift
- ;;
- esac
- ;;
- -I)
- eat=1
- func_file_conv "$2" mingw
- set x "$@" -I"$file"
- shift
- ;;
- -I*)
- func_file_conv "${1#-I}" mingw
- set x "$@" -I"$file"
- shift
- ;;
- -l)
- eat=1
- func_cl_dashl "$2"
- set x "$@" "$lib"
- shift
- ;;
- -l*)
- func_cl_dashl "${1#-l}"
- set x "$@" "$lib"
- shift
- ;;
- -L)
- eat=1
- func_cl_dashL "$2"
- ;;
- -L*)
- func_cl_dashL "${1#-L}"
- ;;
- -static)
- shared=false
- ;;
- -Wl,*)
- arg=${1#-Wl,}
- save_ifs="$IFS"; IFS=','
- for flag in $arg; do
- IFS="$save_ifs"
- linker_opts="$linker_opts $flag"
- done
- IFS="$save_ifs"
- ;;
- -Xlinker)
- eat=1
- linker_opts="$linker_opts $2"
- ;;
- -*)
- set x "$@" "$1"
- shift
- ;;
- *.cc | *.CC | *.cxx | *.CXX | *.[cC]++)
- func_file_conv "$1"
- set x "$@" -Tp"$file"
- shift
- ;;
- *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO])
- func_file_conv "$1" mingw
- set x "$@" "$file"
- shift
- ;;
- *)
- set x "$@" "$1"
- shift
- ;;
- esac
- fi
- shift
- done
- if test -n "$linker_opts"; then
- linker_opts="-link$linker_opts"
- fi
- exec "$@" $linker_opts
- exit 1
-}
-
-eat=
-
-case $1 in
- '')
- echo "$0: No command. Try '$0 --help' for more information." 1>&2
- exit 1;
- ;;
- -h | --h*)
- cat <<\EOF
-Usage: compile [--help] [--version] PROGRAM [ARGS]
-
-Wrapper for compilers which do not understand '-c -o'.
-Remove '-o dest.o' from ARGS, run PROGRAM with the remaining
-arguments, and rename the output as expected.
-
-If you are trying to build a whole package this is not the
-right script to run: please start by reading the file 'INSTALL'.
-
-Report bugs to <bug-automake@gnu.org>.
-EOF
- exit $?
- ;;
- -v | --v*)
- echo "compile $scriptversion"
- exit $?
- ;;
- cl | *[/\\]cl | cl.exe | *[/\\]cl.exe )
- func_cl_wrapper "$@" # Doesn't return...
- ;;
-esac
-
-ofile=
-cfile=
-
-for arg
-do
- if test -n "$eat"; then
- eat=
- else
- case $1 in
- -o)
- # configure might choose to run compile as 'compile cc -o foo foo.c'.
- # So we strip '-o arg' only if arg is an object.
- eat=1
- case $2 in
- *.o | *.obj)
- ofile=$2
- ;;
- *)
- set x "$@" -o "$2"
- shift
- ;;
- esac
- ;;
- *.c)
- cfile=$1
- set x "$@" "$1"
- shift
- ;;
- *)
- set x "$@" "$1"
- shift
- ;;
- esac
- fi
- shift
-done
-
-if test -z "$ofile" || test -z "$cfile"; then
- # If no '-o' option was seen then we might have been invoked from a
- # pattern rule where we don't need one. That is ok -- this is a
- # normal compilation that the losing compiler can handle. If no
- # '.c' file was seen then we are probably linking. That is also
- # ok.
- exec "$@"
-fi
-
-# Name of file we expect compiler to create.
-cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'`
-
-# Create the lock directory.
-# Note: use '[/\\:.-]' here to ensure that we don't use the same name
-# that we are using for the .o file. Also, base the name on the expected
-# object file name, since that is what matters with a parallel build.
-lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d
-while true; do
- if mkdir "$lockdir" >/dev/null 2>&1; then
- break
- fi
- sleep 1
-done
-# FIXME: race condition here if user kills between mkdir and trap.
-trap "rmdir '$lockdir'; exit 1" 1 2 15
-
-# Run the compile.
-"$@"
-ret=$?
-
-if test -f "$cofile"; then
- test "$cofile" = "$ofile" || mv "$cofile" "$ofile"
-elif test -f "${cofile}bj"; then
- test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile"
-fi
-
-rmdir "$lockdir"
-exit $ret
-
-# Local Variables:
-# mode: shell-script
-# sh-indentation: 2
-# eval: (add-hook 'write-file-hooks 'time-stamp)
-# time-stamp-start: "scriptversion="
-# time-stamp-format: "%:y-%02m-%02d.%02H"
-# time-stamp-time-zone: "UTC"
-# time-stamp-end: "; # UTC"
-# End:
diff --git a/configure.ac b/configure.ac
index 3e8cf18..d696f6e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -29,6 +29,14 @@ AC_CHECK_FUNCS([posix_memalign],
AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
+# The feature name given to AC_ARG_ENABLE determines the shell variable
+# (enable_debug_func); it has to match the advertised switch and the test.
+AC_ARG_ENABLE([debug-func],
+ AS_HELP_STRING([--enable-debug-func], [Enable debugging of functions selected]))
+AS_IF([test "x$enable_debug_func" = "xyes"], [CPPFLAGS="$CPPFLAGS -DDEBUG_FUNCTIONS"])
+
+AC_ARG_ENABLE([debug-cpu],
+ AS_HELP_STRING([--enable-debug-cpu], [Enable debugging of SIMD detection]))
+AS_IF([test "x$enable_debug_cpu" = "xyes"], [CPPFLAGS="$CPPFLAGS -DDEBUG_CPU_DETECTION"])
+
AX_EXT()
AC_ARG_ENABLE([neon],
@@ -66,5 +74,14 @@ AC_ARG_ENABLE([valgrind],
[enable_valgrind=no])
AM_CONDITIONAL(ENABLE_VALGRIND, test "x$enable_valgrind" != xno)
+AC_ARG_ENABLE([avx], AS_HELP_STRING([--enable-avx], [Build with AVX optimizations]))
+# Record an explicit "no" when the compiler rejects -mavx; the check below
+# compares against "xno" and would otherwise never report the error.
+AX_CHECK_COMPILE_FLAG([-mavx], [ax_cv_support_avx=yes], [ax_cv_support_avx=no])
+
+AS_IF([test "x$enable_avx" = "xyes"],
+ [AS_IF([test "x$ax_cv_support_avx" = "xno"],
+ [AC_MSG_ERROR([AVX requested but compiler does not support -mavx])],
+ [SIMD_FLAGS="$SIMD_FLAGS -mavx"])
+ ])
+
AC_CONFIG_FILES([Makefile src/Makefile tools/Makefile test/Makefile examples/Makefile])
AC_OUTPUT
diff --git a/depcomp b/depcomp
deleted file mode 100755
index 4ebd5b3..0000000
--- a/depcomp
+++ /dev/null
@@ -1,791 +0,0 @@
-#! /bin/sh
-# depcomp - compile a program generating dependencies as side-effects
-
-scriptversion=2013-05-30.07; # UTC
-
-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>.
-
-case $1 in
- '')
- echo "$0: No command. Try '$0 --help' for more information." 1>&2
- exit 1;
- ;;
- -h | --h*)
- cat <<\EOF
-Usage: depcomp [--help] [--version] PROGRAM [ARGS]
-
-Run PROGRAMS ARGS to compile a file, generating dependencies
-as side-effects.
-
-Environment variables:
- depmode Dependency tracking mode.
- source Source file read by 'PROGRAMS ARGS'.
- object Object file output by 'PROGRAMS ARGS'.
- DEPDIR directory where to store dependencies.
- depfile Dependency file to output.
- tmpdepfile Temporary file to use when outputting dependencies.
- libtool Whether libtool is used (yes/no).
-
-Report bugs to <bug-automake@gnu.org>.
-EOF
- exit $?
- ;;
- -v | --v*)
- echo "depcomp $scriptversion"
- exit $?
- ;;
-esac
-
-# Get the directory component of the given path, and save it in the
-# global variables '$dir'. Note that this directory component will
-# be either empty or ending with a '/' character. This is deliberate.
-set_dir_from ()
-{
- case $1 in
- */*) dir=`echo "$1" | sed -e 's|/[^/]*$|/|'`;;
- *) dir=;;
- esac
-}
-
-# Get the suffix-stripped basename of the given path, and save it the
-# global variable '$base'.
-set_base_from ()
-{
- base=`echo "$1" | sed -e 's|^.*/||' -e 's/\.[^.]*$//'`
-}
-
-# If no dependency file was actually created by the compiler invocation,
-# we still have to create a dummy depfile, to avoid errors with the
-# Makefile "include basename.Plo" scheme.
-make_dummy_depfile ()
-{
- echo "#dummy" > "$depfile"
-}
-
-# Factor out some common post-processing of the generated depfile.
-# Requires the auxiliary global variable '$tmpdepfile' to be set.
-aix_post_process_depfile ()
-{
- # If the compiler actually managed to produce a dependency file,
- # post-process it.
- if test -f "$tmpdepfile"; then
- # Each line is of the form 'foo.o: dependency.h'.
- # Do two passes, one to just change these to
- # $object: dependency.h
- # and one to simply output
- # dependency.h:
- # which is needed to avoid the deleted-header problem.
- { sed -e "s,^.*\.[$lower]*:,$object:," < "$tmpdepfile"
- sed -e "s,^.*\.[$lower]*:[$tab ]*,," -e 's,$,:,' < "$tmpdepfile"
- } > "$depfile"
- rm -f "$tmpdepfile"
- else
- make_dummy_depfile
- fi
-}
-
-# A tabulation character.
-tab=' '
-# A newline character.
-nl='
-'
-# Character ranges might be problematic outside the C locale.
-# These definitions help.
-upper=ABCDEFGHIJKLMNOPQRSTUVWXYZ
-lower=abcdefghijklmnopqrstuvwxyz
-digits=0123456789
-alpha=${upper}${lower}
-
-if test -z "$depmode" || test -z "$source" || test -z "$object"; then
- echo "depcomp: Variables source, object and depmode must be set" 1>&2
- exit 1
-fi
-
-# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po.
-depfile=${depfile-`echo "$object" |
- sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`}
-tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`}
-
-rm -f "$tmpdepfile"
-
-# Avoid interferences from the environment.
-gccflag= dashmflag=
-
-# Some modes work just like other modes, but use different flags. We
-# parameterize here, but still list the modes in the big case below,
-# to make depend.m4 easier to write. Note that we *cannot* use a case
-# here, because this file can only contain one case statement.
-if test "$depmode" = hp; then
- # HP compiler uses -M and no extra arg.
- gccflag=-M
- depmode=gcc
-fi
-
-if test "$depmode" = dashXmstdout; then
- # This is just like dashmstdout with a different argument.
- dashmflag=-xM
- depmode=dashmstdout
-fi
-
-cygpath_u="cygpath -u -f -"
-if test "$depmode" = msvcmsys; then
- # This is just like msvisualcpp but w/o cygpath translation.
- # Just convert the backslash-escaped backslashes to single forward
- # slashes to satisfy depend.m4
- cygpath_u='sed s,\\\\,/,g'
- depmode=msvisualcpp
-fi
-
-if test "$depmode" = msvc7msys; then
- # This is just like msvc7 but w/o cygpath translation.
- # Just convert the backslash-escaped backslashes to single forward
- # slashes to satisfy depend.m4
- cygpath_u='sed s,\\\\,/,g'
- depmode=msvc7
-fi
-
-if test "$depmode" = xlc; then
- # IBM C/C++ Compilers xlc/xlC can output gcc-like dependency information.
- gccflag=-qmakedep=gcc,-MF
- depmode=gcc
-fi
-
-case "$depmode" in
-gcc3)
-## gcc 3 implements dependency tracking that does exactly what
-## we want. Yay! Note: for some reason libtool 1.4 doesn't like
-## it if -MD -MP comes after the -MF stuff. Hmm.
-## Unfortunately, FreeBSD c89 acceptance of flags depends upon
-## the command line argument order; so add the flags where they
-## appear in depend2.am. Note that the slowdown incurred here
-## affects only configure: in makefiles, %FASTDEP% shortcuts this.
- for arg
- do
- case $arg in
- -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;;
- *) set fnord "$@" "$arg" ;;
- esac
- shift # fnord
- shift # $arg
- done
- "$@"
- stat=$?
- if test $stat -ne 0; then
- rm -f "$tmpdepfile"
- exit $stat
- fi
- mv "$tmpdepfile" "$depfile"
- ;;
-
-gcc)
-## Note that this doesn't just cater to obsosete pre-3.x GCC compilers.
-## but also to in-use compilers like IMB xlc/xlC and the HP C compiler.
-## (see the conditional assignment to $gccflag above).
-## There are various ways to get dependency output from gcc. Here's
-## why we pick this rather obscure method:
-## - Don't want to use -MD because we'd like the dependencies to end
-## up in a subdir. Having to rename by hand is ugly.
-## (We might end up doing this anyway to support other compilers.)
-## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like
-## -MM, not -M (despite what the docs say). Also, it might not be
-## supported by the other compilers which use the 'gcc' depmode.
-## - Using -M directly means running the compiler twice (even worse
-## than renaming).
- if test -z "$gccflag"; then
- gccflag=-MD,
- fi
- "$@" -Wp,"$gccflag$tmpdepfile"
- stat=$?
- if test $stat -ne 0; then
- rm -f "$tmpdepfile"
- exit $stat
- fi
- rm -f "$depfile"
- echo "$object : \\" > "$depfile"
- # The second -e expression handles DOS-style file names with drive
- # letters.
- sed -e 's/^[^:]*: / /' \
- -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile"
-## This next piece of magic avoids the "deleted header file" problem.
-## The problem is that when a header file which appears in a .P file
-## is deleted, the dependency causes make to die (because there is
-## typically no way to rebuild the header). We avoid this by adding
-## dummy dependencies for each header file. Too bad gcc doesn't do
-## this for us directly.
-## Some versions of gcc put a space before the ':'. On the theory
-## that the space means something, we add a space to the output as
-## well. hp depmode also adds that space, but also prefixes the VPATH
-## to the object. Take care to not repeat it in the output.
-## Some versions of the HPUX 10.20 sed can't process this invocation
-## correctly. Breaking it into two sed invocations is a workaround.
- tr ' ' "$nl" < "$tmpdepfile" \
- | sed -e 's/^\\$//' -e '/^$/d' -e "s|.*$object$||" -e '/:$/d' \
- | sed -e 's/$/ :/' >> "$depfile"
- rm -f "$tmpdepfile"
- ;;
-
-hp)
- # This case exists only to let depend.m4 do its work. It works by
- # looking at the text of this script. This case will never be run,
- # since it is checked for above.
- exit 1
- ;;
-
-sgi)
- if test "$libtool" = yes; then
- "$@" "-Wp,-MDupdate,$tmpdepfile"
- else
- "$@" -MDupdate "$tmpdepfile"
- fi
- stat=$?
- if test $stat -ne 0; then
- rm -f "$tmpdepfile"
- exit $stat
- fi
- rm -f "$depfile"
-
- if test -f "$tmpdepfile"; then # yes, the sourcefile depend on other files
- echo "$object : \\" > "$depfile"
- # Clip off the initial element (the dependent). Don't try to be
- # clever and replace this with sed code, as IRIX sed won't handle
- # lines with more than a fixed number of characters (4096 in
- # IRIX 6.2 sed, 8192 in IRIX 6.5). We also remove comment lines;
- # the IRIX cc adds comments like '#:fec' to the end of the
- # dependency line.
- tr ' ' "$nl" < "$tmpdepfile" \
- | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' \
- | tr "$nl" ' ' >> "$depfile"
- echo >> "$depfile"
- # The second pass generates a dummy entry for each header file.
- tr ' ' "$nl" < "$tmpdepfile" \
- | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \
- >> "$depfile"
- else
- make_dummy_depfile
- fi
- rm -f "$tmpdepfile"
- ;;
-
-xlc)
- # This case exists only to let depend.m4 do its work. It works by
- # looking at the text of this script. This case will never be run,
- # since it is checked for above.
- exit 1
- ;;
-
-aix)
- # The C for AIX Compiler uses -M and outputs the dependencies
- # in a .u file. In older versions, this file always lives in the
- # current directory. Also, the AIX compiler puts '$object:' at the
- # start of each line; $object doesn't have directory information.
- # Version 6 uses the directory in both cases.
- set_dir_from "$object"
- set_base_from "$object"
- if test "$libtool" = yes; then
- tmpdepfile1=$dir$base.u
- tmpdepfile2=$base.u
- tmpdepfile3=$dir.libs/$base.u
- "$@" -Wc,-M
- else
- tmpdepfile1=$dir$base.u
- tmpdepfile2=$dir$base.u
- tmpdepfile3=$dir$base.u
- "$@" -M
- fi
- stat=$?
- if test $stat -ne 0; then
- rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
- exit $stat
- fi
-
- for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
- do
- test -f "$tmpdepfile" && break
- done
- aix_post_process_depfile
- ;;
-
-tcc)
- # tcc (Tiny C Compiler) understand '-MD -MF file' since version 0.9.26
- # FIXME: That version still under development at the moment of writing.
- # Make that this statement remains true also for stable, released
- # versions.
- # It will wrap lines (doesn't matter whether long or short) with a
- # trailing '\', as in:
- #
- # foo.o : \
- # foo.c \
- # foo.h \
- #
- # It will put a trailing '\' even on the last line, and will use leading
- # spaces rather than leading tabs (at least since its commit 0394caf7
- # "Emit spaces for -MD").
- "$@" -MD -MF "$tmpdepfile"
- stat=$?
- if test $stat -ne 0; then
- rm -f "$tmpdepfile"
- exit $stat
- fi
- rm -f "$depfile"
- # Each non-empty line is of the form 'foo.o : \' or ' dep.h \'.
- # We have to change lines of the first kind to '$object: \'.
- sed -e "s|.*:|$object :|" < "$tmpdepfile" > "$depfile"
- # And for each line of the second kind, we have to emit a 'dep.h:'
- # dummy dependency, to avoid the deleted-header problem.
- sed -n -e 's|^ *\(.*\) *\\$|\1:|p' < "$tmpdepfile" >> "$depfile"
- rm -f "$tmpdepfile"
- ;;
-
-## The order of this option in the case statement is important, since the
-## shell code in configure will try each of these formats in the order
-## listed in this file. A plain '-MD' option would be understood by many
-## compilers, so we must ensure this comes after the gcc and icc options.
-pgcc)
- # Portland's C compiler understands '-MD'.
- # Will always output deps to 'file.d' where file is the root name of the
- # source file under compilation, even if file resides in a subdirectory.
- # The object file name does not affect the name of the '.d' file.
- # pgcc 10.2 will output
- # foo.o: sub/foo.c sub/foo.h
- # and will wrap long lines using '\' :
- # foo.o: sub/foo.c ... \
- # sub/foo.h ... \
- # ...
- set_dir_from "$object"
- # Use the source, not the object, to determine the base name, since
- # that's sadly what pgcc will do too.
- set_base_from "$source"
- tmpdepfile=$base.d
-
- # For projects that build the same source file twice into different object
- # files, the pgcc approach of using the *source* file root name can cause
- # problems in parallel builds. Use a locking strategy to avoid stomping on
- # the same $tmpdepfile.
- lockdir=$base.d-lock
- trap "
- echo '$0: caught signal, cleaning up...' >&2
- rmdir '$lockdir'
- exit 1
- " 1 2 13 15
- numtries=100
- i=$numtries
- while test $i -gt 0; do
- # mkdir is a portable test-and-set.
- if mkdir "$lockdir" 2>/dev/null; then
- # This process acquired the lock.
- "$@" -MD
- stat=$?
- # Release the lock.
- rmdir "$lockdir"
- break
- else
- # If the lock is being held by a different process, wait
- # until the winning process is done or we timeout.
- while test -d "$lockdir" && test $i -gt 0; do
- sleep 1
- i=`expr $i - 1`
- done
- fi
- i=`expr $i - 1`
- done
- trap - 1 2 13 15
- if test $i -le 0; then
- echo "$0: failed to acquire lock after $numtries attempts" >&2
- echo "$0: check lockdir '$lockdir'" >&2
- exit 1
- fi
-
- if test $stat -ne 0; then
- rm -f "$tmpdepfile"
- exit $stat
- fi
- rm -f "$depfile"
- # Each line is of the form `foo.o: dependent.h',
- # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'.
- # Do two passes, one to just change these to
- # `$object: dependent.h' and one to simply `dependent.h:'.
- sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile"
- # Some versions of the HPUX 10.20 sed can't process this invocation
- # correctly. Breaking it into two sed invocations is a workaround.
- sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" \
- | sed -e 's/$/ :/' >> "$depfile"
- rm -f "$tmpdepfile"
- ;;
-
-hp2)
- # The "hp" stanza above does not work with aCC (C++) and HP's ia64
- # compilers, which have integrated preprocessors. The correct option
- # to use with these is +Maked; it writes dependencies to a file named
- # 'foo.d', which lands next to the object file, wherever that
- # happens to be.
- # Much of this is similar to the tru64 case; see comments there.
- set_dir_from "$object"
- set_base_from "$object"
- if test "$libtool" = yes; then
- tmpdepfile1=$dir$base.d
- tmpdepfile2=$dir.libs/$base.d
- "$@" -Wc,+Maked
- else
- tmpdepfile1=$dir$base.d
- tmpdepfile2=$dir$base.d
- "$@" +Maked
- fi
- stat=$?
- if test $stat -ne 0; then
- rm -f "$tmpdepfile1" "$tmpdepfile2"
- exit $stat
- fi
-
- for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2"
- do
- test -f "$tmpdepfile" && break
- done
- if test -f "$tmpdepfile"; then
- sed -e "s,^.*\.[$lower]*:,$object:," "$tmpdepfile" > "$depfile"
- # Add 'dependent.h:' lines.
- sed -ne '2,${
- s/^ *//
- s/ \\*$//
- s/$/:/
- p
- }' "$tmpdepfile" >> "$depfile"
- else
- make_dummy_depfile
- fi
- rm -f "$tmpdepfile" "$tmpdepfile2"
- ;;
-
-tru64)
- # The Tru64 compiler uses -MD to generate dependencies as a side
- # effect. 'cc -MD -o foo.o ...' puts the dependencies into 'foo.o.d'.
- # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put
- # dependencies in 'foo.d' instead, so we check for that too.
- # Subdirectories are respected.
- set_dir_from "$object"
- set_base_from "$object"
-
- if test "$libtool" = yes; then
- # Libtool generates 2 separate objects for the 2 libraries. These
- # two compilations output dependencies in $dir.libs/$base.o.d and
- # in $dir$base.o.d. We have to check for both files, because
- # one of the two compilations can be disabled. We should prefer
- # $dir$base.o.d over $dir.libs/$base.o.d because the latter is
- # automatically cleaned when .libs/ is deleted, while ignoring
- # the former would cause a distcleancheck panic.
- tmpdepfile1=$dir$base.o.d # libtool 1.5
- tmpdepfile2=$dir.libs/$base.o.d # Likewise.
- tmpdepfile3=$dir.libs/$base.d # Compaq CCC V6.2-504
- "$@" -Wc,-MD
- else
- tmpdepfile1=$dir$base.d
- tmpdepfile2=$dir$base.d
- tmpdepfile3=$dir$base.d
- "$@" -MD
- fi
-
- stat=$?
- if test $stat -ne 0; then
- rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
- exit $stat
- fi
-
- for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
- do
- test -f "$tmpdepfile" && break
- done
- # Same post-processing that is required for AIX mode.
- aix_post_process_depfile
- ;;
-
-msvc7)
- if test "$libtool" = yes; then
- showIncludes=-Wc,-showIncludes
- else
- showIncludes=-showIncludes
- fi
- "$@" $showIncludes > "$tmpdepfile"
- stat=$?
- grep -v '^Note: including file: ' "$tmpdepfile"
- if test $stat -ne 0; then
- rm -f "$tmpdepfile"
- exit $stat
- fi
- rm -f "$depfile"
- echo "$object : \\" > "$depfile"
- # The first sed program below extracts the file names and escapes
- # backslashes for cygpath. The second sed program outputs the file
- # name when reading, but also accumulates all include files in the
- # hold buffer in order to output them again at the end. This only
- # works with sed implementations that can handle large buffers.
- sed < "$tmpdepfile" -n '
-/^Note: including file: *\(.*\)/ {
- s//\1/
- s/\\/\\\\/g
- p
-}' | $cygpath_u | sort -u | sed -n '
-s/ /\\ /g
-s/\(.*\)/'"$tab"'\1 \\/p
-s/.\(.*\) \\/\1:/
-H
-$ {
- s/.*/'"$tab"'/
- G
- p
-}' >> "$depfile"
- echo >> "$depfile" # make sure the fragment doesn't end with a backslash
- rm -f "$tmpdepfile"
- ;;
-
-msvc7msys)
- # This case exists only to let depend.m4 do its work. It works by
- # looking at the text of this script. This case will never be run,
- # since it is checked for above.
- exit 1
- ;;
-
-#nosideeffect)
- # This comment above is used by automake to tell side-effect
- # dependency tracking mechanisms from slower ones.
-
-dashmstdout)
- # Important note: in order to support this mode, a compiler *must*
- # always write the preprocessed file to stdout, regardless of -o.
- "$@" || exit $?
-
- # Remove the call to Libtool.
- if test "$libtool" = yes; then
- while test "X$1" != 'X--mode=compile'; do
- shift
- done
- shift
- fi
-
- # Remove '-o $object'.
- IFS=" "
- for arg
- do
- case $arg in
- -o)
- shift
- ;;
- $object)
- shift
- ;;
- *)
- set fnord "$@" "$arg"
- shift # fnord
- shift # $arg
- ;;
- esac
- done
-
- test -z "$dashmflag" && dashmflag=-M
- # Require at least two characters before searching for ':'
- # in the target name. This is to cope with DOS-style filenames:
- # a dependency such as 'c:/foo/bar' could be seen as target 'c' otherwise.
- "$@" $dashmflag |
- sed "s|^[$tab ]*[^:$tab ][^:][^:]*:[$tab ]*|$object: |" > "$tmpdepfile"
- rm -f "$depfile"
- cat < "$tmpdepfile" > "$depfile"
- # Some versions of the HPUX 10.20 sed can't process this sed invocation
- # correctly. Breaking it into two sed invocations is a workaround.
- tr ' ' "$nl" < "$tmpdepfile" \
- | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \
- | sed -e 's/$/ :/' >> "$depfile"
- rm -f "$tmpdepfile"
- ;;
-
-dashXmstdout)
- # This case only exists to satisfy depend.m4. It is never actually
- # run, as this mode is specially recognized in the preamble.
- exit 1
- ;;
-
-makedepend)
- "$@" || exit $?
- # Remove any Libtool call
- if test "$libtool" = yes; then
- while test "X$1" != 'X--mode=compile'; do
- shift
- done
- shift
- fi
- # X makedepend
- shift
- cleared=no eat=no
- for arg
- do
- case $cleared in
- no)
- set ""; shift
- cleared=yes ;;
- esac
- if test $eat = yes; then
- eat=no
- continue
- fi
- case "$arg" in
- -D*|-I*)
- set fnord "$@" "$arg"; shift ;;
- # Strip any option that makedepend may not understand. Remove
- # the object too, otherwise makedepend will parse it as a source file.
- -arch)
- eat=yes ;;
- -*|$object)
- ;;
- *)
- set fnord "$@" "$arg"; shift ;;
- esac
- done
- obj_suffix=`echo "$object" | sed 's/^.*\././'`
- touch "$tmpdepfile"
- ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@"
- rm -f "$depfile"
- # makedepend may prepend the VPATH from the source file name to the object.
- # No need to regex-escape $object, excess matching of '.' is harmless.
- sed "s|^.*\($object *:\)|\1|" "$tmpdepfile" > "$depfile"
- # Some versions of the HPUX 10.20 sed can't process the last invocation
- # correctly. Breaking it into two sed invocations is a workaround.
- sed '1,2d' "$tmpdepfile" \
- | tr ' ' "$nl" \
- | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \
- | sed -e 's/$/ :/' >> "$depfile"
- rm -f "$tmpdepfile" "$tmpdepfile".bak
- ;;
-
-cpp)
- # Important note: in order to support this mode, a compiler *must*
- # always write the preprocessed file to stdout.
- "$@" || exit $?
-
- # Remove the call to Libtool.
- if test "$libtool" = yes; then
- while test "X$1" != 'X--mode=compile'; do
- shift
- done
- shift
- fi
-
- # Remove '-o $object'.
- IFS=" "
- for arg
- do
- case $arg in
- -o)
- shift
- ;;
- $object)
- shift
- ;;
- *)
- set fnord "$@" "$arg"
- shift # fnord
- shift # $arg
- ;;
- esac
- done
-
- "$@" -E \
- | sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
- -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
- | sed '$ s: \\$::' > "$tmpdepfile"
- rm -f "$depfile"
- echo "$object : \\" > "$depfile"
- cat < "$tmpdepfile" >> "$depfile"
- sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile"
- rm -f "$tmpdepfile"
- ;;
-
-msvisualcpp)
- # Important note: in order to support this mode, a compiler *must*
- # always write the preprocessed file to stdout.
- "$@" || exit $?
-
- # Remove the call to Libtool.
- if test "$libtool" = yes; then
- while test "X$1" != 'X--mode=compile'; do
- shift
- done
- shift
- fi
-
- IFS=" "
- for arg
- do
- case "$arg" in
- -o)
- shift
- ;;
- $object)
- shift
- ;;
- "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI")
- set fnord "$@"
- shift
- shift
- ;;
- *)
- set fnord "$@" "$arg"
- shift
- shift
- ;;
- esac
- done
- "$@" -E 2>/dev/null |
- sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::\1:p' | $cygpath_u | sort -u > "$tmpdepfile"
- rm -f "$depfile"
- echo "$object : \\" > "$depfile"
- sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::'"$tab"'\1 \\:p' >> "$depfile"
- echo "$tab" >> "$depfile"
- sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::\1\::p' >> "$depfile"
- rm -f "$tmpdepfile"
- ;;
-
-msvcmsys)
- # This case exists only to let depend.m4 do its work. It works by
- # looking at the text of this script. This case will never be run,
- # since it is checked for above.
- exit 1
- ;;
-
-none)
- exec "$@"
- ;;
-
-*)
- echo "Unknown depmode $depmode" 1>&2
- exit 1
- ;;
-esac
-
-exit 0
-
-# Local Variables:
-# mode: shell-script
-# sh-indentation: 2
-# eval: (add-hook 'write-file-hooks 'time-stamp)
-# time-stamp-start: "scriptversion="
-# time-stamp-format: "%:y-%02m-%02d.%02H"
-# time-stamp-time-zone: "UTC"
-# time-stamp-end: "; # UTC"
-# End:
diff --git a/include/gf_cpu.h b/include/gf_cpu.h
new file mode 100644
index 0000000..71c7227
--- /dev/null
+++ b/include/gf_cpu.h
@@ -0,0 +1,20 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_cpu.h
+ *
+ * Identifies whether the CPU supports SIMD instructions at runtime.
+ */
+
+#pragma once
+
+extern int gf_cpu_supports_intel_pclmul;
+extern int gf_cpu_supports_intel_sse4;
+extern int gf_cpu_supports_intel_ssse3;
+extern int gf_cpu_supports_intel_sse3;
+extern int gf_cpu_supports_intel_sse2;
+extern int gf_cpu_supports_arm_neon;
+
+void gf_cpu_identify(void);
diff --git a/include/gf_int.h b/include/gf_int.h
index 32866f4..0356920 100644
--- a/include/gf_int.h
+++ b/include/gf_int.h
@@ -30,8 +30,24 @@ typedef struct {
int arg2;
gf_t *base_gf;
void *private;
+#ifdef DEBUG_FUNCTIONS
+ const char *multiply;
+ const char *divide;
+ const char *inverse;
+ const char *multiply_region;
+ const char *extract_word;
+#endif
} gf_internal_t;
+#ifdef DEBUG_FUNCTIONS
+#define SET_FUNCTION(gf,method,size,func) \
+ { (gf)->method.size = (func); \
+ ((gf_internal_t*)(gf)->scratch)->method = #func; }
+#else
+#define SET_FUNCTION(gf,method,size,func) \
+ (gf)->method.size = (func);
+#endif
+
extern int gf_w4_init (gf_t *gf);
extern int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2);
diff --git a/m4/ax_ext.m4 b/m4/ax_ext.m4
index c03ccef..95c4dbe 100644
--- a/m4/ax_ext.m4
+++ b/m4/ax_ext.m4
@@ -1,40 +1,7 @@
#
-# Updated by KMG to support -DINTEL_SSE for GF-Complete
+# This macro is based on http://www.gnu.org/software/autoconf-archive/ax_ext.html
+# but simplified to do compile time SIMD checks only
#
-# ===========================================================================
-# http://www.gnu.org/software/autoconf-archive/ax_ext.html
-# ===========================================================================
-#
-# SYNOPSIS
-#
-# AX_EXT
-#
-# DESCRIPTION
-#
-# Find supported SIMD extensions by requesting cpuid. When an SIMD
-# extension is found, the -m"simdextensionname" is added to SIMD_FLAGS if
-# compiler supports it. For example, if "sse2" is available, then "-msse2"
-# is added to SIMD_FLAGS.
-#
-# This macro calls:
-#
-# AC_SUBST(SIMD_FLAGS)
-#
-# And defines:
-#
-# HAVE_MMX / HAVE_SSE / HAVE_SSE2 / HAVE_SSE3 / HAVE_SSSE3 / HAVE_SSE4.1 / HAVE_SSE4.2 / HAVE_AVX
-#
-# LICENSE
-#
-# Copyright (c) 2007 Christophe Tournayre <turn3r@users.sourceforge.net>
-# Copyright (c) 2013 Michael Petch <mpetch@capp-sysware.com>
-#
-# Copying and distribution of this file, with or without modification, are
-# permitted in any medium without royalty provided the copyright notice
-# and this notice are preserved. This file is offered as-is, without any
-# warranty.
-
-#serial 12
AC_DEFUN([AX_EXT],
[
@@ -45,263 +12,63 @@ AC_DEFUN([AX_EXT],
AC_DEFINE(HAVE_ARCH_AARCH64,,[targeting AArch64])
SIMD_FLAGS="$SIMD_FLAGS -DARCH_AARCH64"
- AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext],
- [
- # TODO: detect / cross-compile
- ax_cv_have_neon_ext=yes
- ])
- AC_CACHE_CHECK([whether cryptographic extension is supported], [ax_cv_have_arm_crypt_ext],
- [
- # TODO: detect / cross-compile
- ax_cv_have_arm_crypt_ext=yes
- ])
-
- if test "$ax_cv_have_arm_crypt_ext" = yes; then
- AC_DEFINE(HAVE_ARM_CRYPT_EXT,,[Support ARM cryptographic extension])
- fi
-
+ AC_CACHE_CHECK([whether NEON is enabled], [ax_cv_have_neon_ext], [ax_cv_have_neon_ext=yes])
if test "$ax_cv_have_neon_ext" = yes; then
- AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
+ AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, [SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd -DARM_NEON"], [ax_cv_have_neon_ext=no])
fi
-
- if test "$ax_cv_have_arm_crypt_ext" = yes && test "$ax_cv_have_neon_ext" = yes; then
- AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd+crypto,
- SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd+crypto -DARM_CRYPT -DARM_NEON", [])
- elif test "$ax_cv_have_arm_crypt_ext" = yes; then
- AX_CHECK_COMPILE_FLAG(-march=armv8-a+crypto,
- SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+crypto -DARM_CRYPT", [])
- elif test "$ax_cv_have_neon_ext" = yes; then
- AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd,
- SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd -DARM_NEON", [])
- fi
- ;;
+ ;;
arm*)
- AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext],
- [
- # TODO: detect / cross-compile
- ax_cv_have_neon_ext=yes
- ])
-
+ AC_CACHE_CHECK([whether NEON is enabled], [ax_cv_have_neon_ext], [ax_cv_have_neon_ext=yes])
if test "$ax_cv_have_neon_ext" = yes; then
- AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
- AX_CHECK_COMPILE_FLAG(-mfpu=neon,
- SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon -DARM_NEON", [])
+ AX_CHECK_COMPILE_FLAG(-mfpu=neon, [SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon -DARM_NEON"], [ax_cv_have_neon_ext=no])
fi
- ;;
+ ;;
powerpc*)
- AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext],
- [
- if test `/usr/sbin/sysctl -a 2>/dev/null| grep -c hw.optional.altivec` != 0; then
- if test `/usr/sbin/sysctl -n hw.optional.altivec` = 1; then
- ax_cv_have_altivec_ext=yes
- fi
- fi
- ])
-
- if test "$ax_cv_have_altivec_ext" = yes; then
- AC_DEFINE(HAVE_ALTIVEC,,[Support Altivec instructions])
- AX_CHECK_COMPILE_FLAG(-faltivec, SIMD_FLAGS="$SIMD_FLAGS -faltivec", [])
- fi
- ;;
-
-
- i[[3456]]86*|x86_64*|amd64*)
-
- AC_REQUIRE([AX_GCC_X86_CPUID])
- AC_REQUIRE([AX_GCC_X86_AVX_XGETBV])
-
- AX_GCC_X86_CPUID(0x00000001)
- ecx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 3`
- edx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 4`
-
- AC_CACHE_CHECK([whether mmx is supported], [ax_cv_have_mmx_ext],
- [
- ax_cv_have_mmx_ext=no
- if test "$((0x$edx>>23&0x01))" = 1; then
- ax_cv_have_mmx_ext=yes
- fi
- ])
-
- AC_CACHE_CHECK([whether sse is supported], [ax_cv_have_sse_ext],
- [
- ax_cv_have_sse_ext=no
- if test "$((0x$edx>>25&0x01))" = 1; then
- ax_cv_have_sse_ext=yes
- fi
- ])
-
- AC_CACHE_CHECK([whether sse2 is supported], [ax_cv_have_sse2_ext],
- [
- ax_cv_have_sse2_ext=no
- if test "$((0x$edx>>26&0x01))" = 1; then
- ax_cv_have_sse2_ext=yes
- fi
- ])
-
- AC_CACHE_CHECK([whether sse3 is supported], [ax_cv_have_sse3_ext],
- [
- ax_cv_have_sse3_ext=no
- if test "$((0x$ecx&0x01))" = 1; then
- ax_cv_have_sse3_ext=yes
- fi
- ])
-
- AC_CACHE_CHECK([whether pclmuldq is supported], [ax_cv_have_pclmuldq_ext],
- [
- ax_cv_have_pclmuldq_ext=no
- if test "$((0x$ecx>>1&0x01))" = 1; then
- ax_cv_have_pclmuldq_ext=yes
- fi
- ])
-
- AC_CACHE_CHECK([whether ssse3 is supported], [ax_cv_have_ssse3_ext],
- [
- ax_cv_have_ssse3_ext=no
- if test "$((0x$ecx>>9&0x01))" = 1; then
- ax_cv_have_ssse3_ext=yes
- fi
- ])
-
- AC_CACHE_CHECK([whether sse4.1 is supported], [ax_cv_have_sse41_ext],
- [
- ax_cv_have_sse41_ext=no
- if test "$((0x$ecx>>19&0x01))" = 1; then
- ax_cv_have_sse41_ext=yes
- fi
- ])
-
- AC_CACHE_CHECK([whether sse4.2 is supported], [ax_cv_have_sse42_ext],
- [
- ax_cv_have_sse42_ext=no
- if test "$((0x$ecx>>20&0x01))" = 1; then
- ax_cv_have_sse42_ext=yes
- fi
- ])
-
- AC_CACHE_CHECK([whether avx is supported by processor], [ax_cv_have_avx_cpu_ext],
- [
- ax_cv_have_avx_cpu_ext=no
- if test "$((0x$ecx>>28&0x01))" = 1; then
- ax_cv_have_avx_cpu_ext=yes
- fi
- ])
-
- if test x"$ax_cv_have_avx_cpu_ext" = x"yes"; then
- AX_GCC_X86_AVX_XGETBV(0x00000000)
-
- xgetbv_eax="0"
- if test x"$ax_cv_gcc_x86_avx_xgetbv_0x00000000" != x"unknown"; then
- xgetbv_eax=`echo $ax_cv_gcc_x86_avx_xgetbv_0x00000000 | cut -d ":" -f 1`
- fi
-
- AC_CACHE_CHECK([whether avx is supported by operating system], [ax_cv_have_avx_ext],
- [
- ax_cv_have_avx_ext=no
-
- if test "$((0x$ecx>>27&0x01))" = 1; then
- if test "$((0x$xgetbv_eax&0x6))" = 6; then
- ax_cv_have_avx_ext=yes
- fi
- fi
- ])
- if test x"$ax_cv_have_avx_ext" = x"no"; then
- AC_MSG_WARN([Your processor supports AVX, but your operating system doesn't])
- fi
+ AC_CACHE_CHECK([whether altivec is enabled], [ax_cv_have_altivec_ext], [ax_cv_have_altivec_ext=yes])
+ if test "$ax_cv_have_altivec_ext" = yes; then
+ AX_CHECK_COMPILE_FLAG(-faltivec, [SIMD_FLAGS="$SIMD_FLAGS -faltivec"], [ax_cv_have_altivec_ext=no])
fi
+ ;;
- if test "$ax_cv_have_mmx_ext" = yes; then
- AX_CHECK_COMPILE_FLAG(-mmmx, ax_cv_support_mmx_ext=yes, [])
- if test x"$ax_cv_support_mmx_ext" = x"yes"; then
- SIMD_FLAGS="$SIMD_FLAGS -mmmx"
- AC_DEFINE(HAVE_MMX,,[Support mmx instructions])
- else
- AC_MSG_WARN([Your processor supports mmx instructions but not your compiler, can you try another compiler?])
- fi
- fi
+ i[[3456]]86*|x86_64*|amd64*)
+ AC_CACHE_CHECK([whether sse is enabled], [ax_cv_have_sse_ext], [ax_cv_have_sse_ext=yes])
if test "$ax_cv_have_sse_ext" = yes; then
- AX_CHECK_COMPILE_FLAG(-msse, ax_cv_support_sse_ext=yes, [])
- if test x"$ax_cv_support_sse_ext" = x"yes"; then
- SIMD_FLAGS="$SIMD_FLAGS -msse -DINTEL_SSE"
- AC_DEFINE(HAVE_SSE,,[Support SSE (Streaming SIMD Extensions) instructions])
- else
- AC_MSG_WARN([Your processor supports sse instructions but not your compiler, can you try another compiler?])
- fi
+ AX_CHECK_COMPILE_FLAG(-msse, [SIMD_FLAGS="$SIMD_FLAGS -msse -DINTEL_SSE"], [ax_cv_have_sse_ext=no])
fi
+ AC_CACHE_CHECK([whether sse2 is enabled], [ax_cv_have_sse2_ext], [ax_cv_have_sse2_ext=yes])
if test "$ax_cv_have_sse2_ext" = yes; then
- AX_CHECK_COMPILE_FLAG(-msse2, ax_cv_support_sse2_ext=yes, [])
- if test x"$ax_cv_support_sse2_ext" = x"yes"; then
- SIMD_FLAGS="$SIMD_FLAGS -msse2 -DINTEL_SSE2"
- AC_DEFINE(HAVE_SSE2,,[Support SSE2 (Streaming SIMD Extensions 2) instructions])
- else
- AC_MSG_WARN([Your processor supports sse2 instructions but not your compiler, can you try another compiler?])
- fi
+ AX_CHECK_COMPILE_FLAG(-msse2, [SIMD_FLAGS="$SIMD_FLAGS -msse2 -DINTEL_SSE2"], [ax_cv_have_sse2_ext=no])
fi
+ AC_CACHE_CHECK([whether sse3 is enabled], [ax_cv_have_sse3_ext], [ax_cv_have_sse3_ext=yes])
if test "$ax_cv_have_sse3_ext" = yes; then
- AX_CHECK_COMPILE_FLAG(-msse3, ax_cv_support_sse3_ext=yes, [])
- if test x"$ax_cv_support_sse3_ext" = x"yes"; then
- SIMD_FLAGS="$SIMD_FLAGS -msse3 -DINTEL_SSE3"
- AC_DEFINE(HAVE_SSE3,,[Support SSE3 (Streaming SIMD Extensions 3) instructions])
- else
- AC_MSG_WARN([Your processor supports sse3 instructions but not your compiler, can you try another compiler?])
- fi
- fi
-
- if test "$ax_cv_have_pclmuldq_ext" = yes; then
- AX_CHECK_COMPILE_FLAG(-mpclmul, ax_cv_support_pclmuldq_ext=yes, [])
- if test x"$ax_cv_support_pclmuldq_ext" = x"yes"; then
- SIMD_FLAGS="$SIMD_FLAGS -mpclmul -DINTEL_SSE4_PCLMUL"
- AC_DEFINE(HAVE_PCLMULDQ,,[Support (PCLMULDQ) Carry-Free Muliplication])
- else
- AC_MSG_WARN([Your processor supports pclmuldq instructions but not your compiler, can you try another compiler?])
- fi
+ AX_CHECK_COMPILE_FLAG(-msse3, [SIMD_FLAGS="$SIMD_FLAGS -msse3 -DINTEL_SSE3"], [ax_cv_have_sse3_ext=no])
fi
+ AC_CACHE_CHECK([whether ssse3 is enabled], [ax_cv_have_ssse3_ext], [ax_cv_have_ssse3_ext=yes])
if test "$ax_cv_have_ssse3_ext" = yes; then
- AX_CHECK_COMPILE_FLAG(-mssse3, ax_cv_support_ssse3_ext=yes, [])
- if test x"$ax_cv_support_ssse3_ext" = x"yes"; then
- SIMD_FLAGS="$SIMD_FLAGS -mssse3 -DINTEL_SSSE3"
- AC_DEFINE(HAVE_SSSE3,,[Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions])
- else
- AC_MSG_WARN([Your processor supports ssse3 instructions but not your compiler, can you try another compiler?])
- fi
+ AX_CHECK_COMPILE_FLAG(-mssse3, [SIMD_FLAGS="$SIMD_FLAGS -mssse3 -DINTEL_SSSE3"], [ax_cv_have_ssse3_ext=no])
fi
- if test "$ax_cv_have_sse41_ext" = yes; then
- AX_CHECK_COMPILE_FLAG(-msse4.1, ax_cv_support_sse41_ext=yes, [])
- if test x"$ax_cv_support_sse41_ext" = x"yes"; then
- SIMD_FLAGS="$SIMD_FLAGS -msse4.1 -DINTEL_SSE4"
- AC_DEFINE(HAVE_SSE4_1,,[Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions])
- else
- AC_MSG_WARN([Your processor supports sse4.1 instructions but not your compiler, can you try another compiler?])
- fi
+ AC_CACHE_CHECK([whether pclmuldq is enabled], [ax_cv_have_pclmuldq_ext], [ax_cv_have_pclmuldq_ext=yes])
+ if test "$ax_cv_have_pclmuldq_ext" = yes; then
+ AX_CHECK_COMPILE_FLAG(-mpclmul, [SIMD_FLAGS="$SIMD_FLAGS -mpclmul -DINTEL_SSE4_PCLMUL"], [ax_cv_have_pclmuldq_ext=no])
fi
- if test "$ax_cv_have_sse42_ext" = yes; then
- AX_CHECK_COMPILE_FLAG(-msse4.2, ax_cv_support_sse42_ext=yes, [])
- if test x"$ax_cv_support_sse42_ext" = x"yes"; then
- SIMD_FLAGS="$SIMD_FLAGS -msse4.2 -DINTEL_SSE4"
- AC_DEFINE(HAVE_SSE4_2,,[Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions])
- else
- AC_MSG_WARN([Your processor supports sse4.2 instructions but not your compiler, can you try another compiler?])
- fi
+ AC_CACHE_CHECK([whether sse4.1 is enabled], [ax_cv_have_sse41_ext], [ax_cv_have_sse41_ext=yes])
+ if test "$ax_cv_have_sse41_ext" = yes; then
+ AX_CHECK_COMPILE_FLAG(-msse4.1, [SIMD_FLAGS="$SIMD_FLAGS -msse4.1 -DINTEL_SSE4"], [ax_cv_have_sse41_ext=no])
fi
- if test "$ax_cv_have_avx_ext" = yes; then
- AX_CHECK_COMPILE_FLAG(-mavx, ax_cv_support_avx_ext=yes, [])
- if test x"$ax_cv_support_avx_ext" = x"yes"; then
- SIMD_FLAGS="$SIMD_FLAGS -mavx"
- AC_DEFINE(HAVE_AVX,,[Support AVX (Advanced Vector Extensions) instructions])
- else
- AC_MSG_WARN([Your processor supports avx instructions but not your compiler, can you try another compiler?])
- fi
+ AC_CACHE_CHECK([whether sse4.2 is enabled], [ax_cv_have_sse42_ext], [ax_cv_have_sse42_ext=yes])
+ if test "$ax_cv_have_sse42_ext" = yes; then
+ AX_CHECK_COMPILE_FLAG(-msse4.2, [SIMD_FLAGS="$SIMD_FLAGS -msse4.2 -DINTEL_SSE4"], [ax_cv_have_sse42_ext=no])
fi
-
- ;;
+ ;;
esac
AC_SUBST(SIMD_FLAGS)
diff --git a/m4/ax_gcc_x86_avx_xgetbv.m4 b/m4/ax_gcc_x86_avx_xgetbv.m4
deleted file mode 100644
index 0624eeb..0000000
--- a/m4/ax_gcc_x86_avx_xgetbv.m4
+++ /dev/null
@@ -1,79 +0,0 @@
-# ===========================================================================
-# http://www.gnu.org/software/autoconf-archive/ax_gcc_x86_avx_xgetbv.html
-# ===========================================================================
-#
-# SYNOPSIS
-#
-# AX_GCC_X86_AVX_XGETBV
-#
-# DESCRIPTION
-#
-# On later x86 processors with AVX SIMD support, with gcc or a compiler
-# that has a compatible syntax for inline assembly instructions, run a
-# small program that executes the xgetbv instruction with input OP. This
-# can be used to detect if the OS supports AVX instruction usage.
-#
-# On output, the values of the eax and edx registers are stored as
-# hexadecimal strings as "eax:edx" in the cache variable
-# ax_cv_gcc_x86_avx_xgetbv.
-#
-# If the xgetbv instruction fails (because you are running a
-# cross-compiler, or because you are not using gcc, or because you are on
-# a processor that doesn't have this instruction),
-# ax_cv_gcc_x86_avx_xgetbv_OP is set to the string "unknown".
-#
-# This macro mainly exists to be used in AX_EXT.
-#
-# LICENSE
-#
-# Copyright (c) 2013 Michael Petch <mpetch@capp-sysware.com>
-#
-# This program is free software: you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by the
-# Free Software Foundation, either version 3 of the License, or (at your
-# option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-# Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-# As a special exception, the respective Autoconf Macro's copyright owner
-# gives unlimited permission to copy, distribute and modify the configure
-# scripts that are the output of Autoconf when processing the Macro. You
-# need not follow the terms of the GNU General Public License when using
-# or distributing such scripts, even though portions of the text of the
-# Macro appear in them. The GNU General Public License (GPL) does govern
-# all other use of the material that constitutes the Autoconf Macro.
-#
-# This special exception to the GPL applies to versions of the Autoconf
-# Macro released by the Autoconf Archive. When you make and distribute a
-# modified version of the Autoconf Macro, you may extend this special
-# exception to the GPL to apply to your modified version as well.
-
-#serial 1
-
-AC_DEFUN([AX_GCC_X86_AVX_XGETBV],
-[AC_REQUIRE([AC_PROG_CC])
-AC_LANG_PUSH([C])
-AC_CACHE_CHECK(for x86-AVX xgetbv $1 output, ax_cv_gcc_x86_avx_xgetbv_$1,
- [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdio.h>], [
- int op = $1, eax, edx;
- FILE *f;
- /* Opcodes for xgetbv */
- __asm__(".byte 0x0f, 0x01, 0xd0"
- : "=a" (eax), "=d" (edx)
- : "c" (op));
- f = fopen("conftest_xgetbv", "w"); if (!f) return 1;
- fprintf(f, "%x:%x\n", eax, edx);
- fclose(f);
- return 0;
-])],
- [ax_cv_gcc_x86_avx_xgetbv_$1=`cat conftest_xgetbv`; rm -f conftest_xgetbv],
- [ax_cv_gcc_x86_avx_xgetbv_$1=unknown; rm -f conftest_xgetbv],
- [ax_cv_gcc_x86_avx_xgetbv_$1=unknown])])
-AC_LANG_POP([C])
-])
diff --git a/m4/ax_gcc_x86_cpuid.m4 b/m4/ax_gcc_x86_cpuid.m4
deleted file mode 100644
index 7d46fee..0000000
--- a/m4/ax_gcc_x86_cpuid.m4
+++ /dev/null
@@ -1,79 +0,0 @@
-# ===========================================================================
-# http://www.gnu.org/software/autoconf-archive/ax_gcc_x86_cpuid.html
-# ===========================================================================
-#
-# SYNOPSIS
-#
-# AX_GCC_X86_CPUID(OP)
-#
-# DESCRIPTION
-#
-# On Pentium and later x86 processors, with gcc or a compiler that has a
-# compatible syntax for inline assembly instructions, run a small program
-# that executes the cpuid instruction with input OP. This can be used to
-# detect the CPU type.
-#
-# On output, the values of the eax, ebx, ecx, and edx registers are stored
-# as hexadecimal strings as "eax:ebx:ecx:edx" in the cache variable
-# ax_cv_gcc_x86_cpuid_OP.
-#
-# If the cpuid instruction fails (because you are running a
-# cross-compiler, or because you are not using gcc, or because you are on
-# a processor that doesn't have this instruction), ax_cv_gcc_x86_cpuid_OP
-# is set to the string "unknown".
-#
-# This macro mainly exists to be used in AX_GCC_ARCHFLAG.
-#
-# LICENSE
-#
-# Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu>
-# Copyright (c) 2008 Matteo Frigo
-#
-# This program is free software: you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by the
-# Free Software Foundation, either version 3 of the License, or (at your
-# option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-# Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-# As a special exception, the respective Autoconf Macro's copyright owner
-# gives unlimited permission to copy, distribute and modify the configure
-# scripts that are the output of Autoconf when processing the Macro. You
-# need not follow the terms of the GNU General Public License when using
-# or distributing such scripts, even though portions of the text of the
-# Macro appear in them. The GNU General Public License (GPL) does govern
-# all other use of the material that constitutes the Autoconf Macro.
-#
-# This special exception to the GPL applies to versions of the Autoconf
-# Macro released by the Autoconf Archive. When you make and distribute a
-# modified version of the Autoconf Macro, you may extend this special
-# exception to the GPL to apply to your modified version as well.
-
-#serial 7
-
-AC_DEFUN([AX_GCC_X86_CPUID],
-[AC_REQUIRE([AC_PROG_CC])
-AC_LANG_PUSH([C])
-AC_CACHE_CHECK(for x86 cpuid $1 output, ax_cv_gcc_x86_cpuid_$1,
- [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdio.h>], [
- int op = $1, eax, ebx, ecx, edx;
- FILE *f;
- __asm__("cpuid"
- : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
- : "a" (op));
- f = fopen("conftest_cpuid", "w"); if (!f) return 1;
- fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx);
- fclose(f);
- return 0;
-])],
- [ax_cv_gcc_x86_cpuid_$1=`cat conftest_cpuid`; rm -f conftest_cpuid],
- [ax_cv_gcc_x86_cpuid_$1=unknown; rm -f conftest_cpuid],
- [ax_cv_gcc_x86_cpuid_$1=unknown])])
-AC_LANG_POP([C])
-])
diff --git a/m4/ltoptions.m4 b/m4/ltoptions.m4
deleted file mode 100644
index 5d9acd8..0000000
--- a/m4/ltoptions.m4
+++ /dev/null
@@ -1,384 +0,0 @@
-# Helper functions for option handling. -*- Autoconf -*-
-#
-# Copyright (C) 2004, 2005, 2007, 2008, 2009 Free Software Foundation,
-# Inc.
-# Written by Gary V. Vaughan, 2004
-#
-# This file is free software; the Free Software Foundation gives
-# unlimited permission to copy and/or distribute it, with or without
-# modifications, as long as this notice is preserved.
-
-# serial 7 ltoptions.m4
-
-# This is to help aclocal find these macros, as it can't see m4_define.
-AC_DEFUN([LTOPTIONS_VERSION], [m4_if([1])])
-
-
-# _LT_MANGLE_OPTION(MACRO-NAME, OPTION-NAME)
-# ------------------------------------------
-m4_define([_LT_MANGLE_OPTION],
-[[_LT_OPTION_]m4_bpatsubst($1__$2, [[^a-zA-Z0-9_]], [_])])
-
-
-# _LT_SET_OPTION(MACRO-NAME, OPTION-NAME)
-# ---------------------------------------
-# Set option OPTION-NAME for macro MACRO-NAME, and if there is a
-# matching handler defined, dispatch to it. Other OPTION-NAMEs are
-# saved as a flag.
-m4_define([_LT_SET_OPTION],
-[m4_define(_LT_MANGLE_OPTION([$1], [$2]))dnl
-m4_ifdef(_LT_MANGLE_DEFUN([$1], [$2]),
- _LT_MANGLE_DEFUN([$1], [$2]),
- [m4_warning([Unknown $1 option `$2'])])[]dnl
-])
-
-
-# _LT_IF_OPTION(MACRO-NAME, OPTION-NAME, IF-SET, [IF-NOT-SET])
-# ------------------------------------------------------------
-# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise.
-m4_define([_LT_IF_OPTION],
-[m4_ifdef(_LT_MANGLE_OPTION([$1], [$2]), [$3], [$4])])
-
-
-# _LT_UNLESS_OPTIONS(MACRO-NAME, OPTION-LIST, IF-NOT-SET)
-# -------------------------------------------------------
-# Execute IF-NOT-SET unless all options in OPTION-LIST for MACRO-NAME
-# are set.
-m4_define([_LT_UNLESS_OPTIONS],
-[m4_foreach([_LT_Option], m4_split(m4_normalize([$2])),
- [m4_ifdef(_LT_MANGLE_OPTION([$1], _LT_Option),
- [m4_define([$0_found])])])[]dnl
-m4_ifdef([$0_found], [m4_undefine([$0_found])], [$3
-])[]dnl
-])
-
-
-# _LT_SET_OPTIONS(MACRO-NAME, OPTION-LIST)
-# ----------------------------------------
-# OPTION-LIST is a space-separated list of Libtool options associated
-# with MACRO-NAME. If any OPTION has a matching handler declared with
-# LT_OPTION_DEFINE, dispatch to that macro; otherwise complain about
-# the unknown option and exit.
-m4_defun([_LT_SET_OPTIONS],
-[# Set options
-m4_foreach([_LT_Option], m4_split(m4_normalize([$2])),
- [_LT_SET_OPTION([$1], _LT_Option)])
-
-m4_if([$1],[LT_INIT],[
- dnl
- dnl Simply set some default values (i.e off) if boolean options were not
- dnl specified:
- _LT_UNLESS_OPTIONS([LT_INIT], [dlopen], [enable_dlopen=no
- ])
- _LT_UNLESS_OPTIONS([LT_INIT], [win32-dll], [enable_win32_dll=no
- ])
- dnl
- dnl If no reference was made to various pairs of opposing options, then
- dnl we run the default mode handler for the pair. For example, if neither
- dnl `shared' nor `disable-shared' was passed, we enable building of shared
- dnl archives by default:
- _LT_UNLESS_OPTIONS([LT_INIT], [shared disable-shared], [_LT_ENABLE_SHARED])
- _LT_UNLESS_OPTIONS([LT_INIT], [static disable-static], [_LT_ENABLE_STATIC])
- _LT_UNLESS_OPTIONS([LT_INIT], [pic-only no-pic], [_LT_WITH_PIC])
- _LT_UNLESS_OPTIONS([LT_INIT], [fast-install disable-fast-install],
- [_LT_ENABLE_FAST_INSTALL])
- ])
-])# _LT_SET_OPTIONS
-
-
-## --------------------------------- ##
-## Macros to handle LT_INIT options. ##
-## --------------------------------- ##
-
-# _LT_MANGLE_DEFUN(MACRO-NAME, OPTION-NAME)
-# -----------------------------------------
-m4_define([_LT_MANGLE_DEFUN],
-[[_LT_OPTION_DEFUN_]m4_bpatsubst(m4_toupper([$1__$2]), [[^A-Z0-9_]], [_])])
-
-
-# LT_OPTION_DEFINE(MACRO-NAME, OPTION-NAME, CODE)
-# -----------------------------------------------
-m4_define([LT_OPTION_DEFINE],
-[m4_define(_LT_MANGLE_DEFUN([$1], [$2]), [$3])[]dnl
-])# LT_OPTION_DEFINE
-
-
-# dlopen
-# ------
-LT_OPTION_DEFINE([LT_INIT], [dlopen], [enable_dlopen=yes
-])
-
-AU_DEFUN([AC_LIBTOOL_DLOPEN],
-[_LT_SET_OPTION([LT_INIT], [dlopen])
-AC_DIAGNOSE([obsolete],
-[$0: Remove this warning and the call to _LT_SET_OPTION when you
-put the `dlopen' option into LT_INIT's first parameter.])
-])
-
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_LIBTOOL_DLOPEN], [])
-
-
-# win32-dll
-# ---------
-# Declare package support for building win32 dll's.
-LT_OPTION_DEFINE([LT_INIT], [win32-dll],
-[enable_win32_dll=yes
-
-case $host in
-*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-cegcc*)
- AC_CHECK_TOOL(AS, as, false)
- AC_CHECK_TOOL(DLLTOOL, dlltool, false)
- AC_CHECK_TOOL(OBJDUMP, objdump, false)
- ;;
-esac
-
-test -z "$AS" && AS=as
-_LT_DECL([], [AS], [1], [Assembler program])dnl
-
-test -z "$DLLTOOL" && DLLTOOL=dlltool
-_LT_DECL([], [DLLTOOL], [1], [DLL creation program])dnl
-
-test -z "$OBJDUMP" && OBJDUMP=objdump
-_LT_DECL([], [OBJDUMP], [1], [Object dumper program])dnl
-])# win32-dll
-
-AU_DEFUN([AC_LIBTOOL_WIN32_DLL],
-[AC_REQUIRE([AC_CANONICAL_HOST])dnl
-_LT_SET_OPTION([LT_INIT], [win32-dll])
-AC_DIAGNOSE([obsolete],
-[$0: Remove this warning and the call to _LT_SET_OPTION when you
-put the `win32-dll' option into LT_INIT's first parameter.])
-])
-
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_LIBTOOL_WIN32_DLL], [])
-
-
-# _LT_ENABLE_SHARED([DEFAULT])
-# ----------------------------
-# implement the --enable-shared flag, and supports the `shared' and
-# `disable-shared' LT_INIT options.
-# DEFAULT is either `yes' or `no'. If omitted, it defaults to `yes'.
-m4_define([_LT_ENABLE_SHARED],
-[m4_define([_LT_ENABLE_SHARED_DEFAULT], [m4_if($1, no, no, yes)])dnl
-AC_ARG_ENABLE([shared],
- [AS_HELP_STRING([--enable-shared@<:@=PKGS@:>@],
- [build shared libraries @<:@default=]_LT_ENABLE_SHARED_DEFAULT[@:>@])],
- [p=${PACKAGE-default}
- case $enableval in
- yes) enable_shared=yes ;;
- no) enable_shared=no ;;
- *)
- enable_shared=no
- # Look at the argument we got. We use all the common list separators.
- lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
- for pkg in $enableval; do
- IFS="$lt_save_ifs"
- if test "X$pkg" = "X$p"; then
- enable_shared=yes
- fi
- done
- IFS="$lt_save_ifs"
- ;;
- esac],
- [enable_shared=]_LT_ENABLE_SHARED_DEFAULT)
-
- _LT_DECL([build_libtool_libs], [enable_shared], [0],
- [Whether or not to build shared libraries])
-])# _LT_ENABLE_SHARED
-
-LT_OPTION_DEFINE([LT_INIT], [shared], [_LT_ENABLE_SHARED([yes])])
-LT_OPTION_DEFINE([LT_INIT], [disable-shared], [_LT_ENABLE_SHARED([no])])
-
-# Old names:
-AC_DEFUN([AC_ENABLE_SHARED],
-[_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[shared])
-])
-
-AC_DEFUN([AC_DISABLE_SHARED],
-[_LT_SET_OPTION([LT_INIT], [disable-shared])
-])
-
-AU_DEFUN([AM_ENABLE_SHARED], [AC_ENABLE_SHARED($@)])
-AU_DEFUN([AM_DISABLE_SHARED], [AC_DISABLE_SHARED($@)])
-
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AM_ENABLE_SHARED], [])
-dnl AC_DEFUN([AM_DISABLE_SHARED], [])
-
-
-
-# _LT_ENABLE_STATIC([DEFAULT])
-# ----------------------------
-# implement the --enable-static flag, and support the `static' and
-# `disable-static' LT_INIT options.
-# DEFAULT is either `yes' or `no'. If omitted, it defaults to `yes'.
-m4_define([_LT_ENABLE_STATIC],
-[m4_define([_LT_ENABLE_STATIC_DEFAULT], [m4_if($1, no, no, yes)])dnl
-AC_ARG_ENABLE([static],
- [AS_HELP_STRING([--enable-static@<:@=PKGS@:>@],
- [build static libraries @<:@default=]_LT_ENABLE_STATIC_DEFAULT[@:>@])],
- [p=${PACKAGE-default}
- case $enableval in
- yes) enable_static=yes ;;
- no) enable_static=no ;;
- *)
- enable_static=no
- # Look at the argument we got. We use all the common list separators.
- lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
- for pkg in $enableval; do
- IFS="$lt_save_ifs"
- if test "X$pkg" = "X$p"; then
- enable_static=yes
- fi
- done
- IFS="$lt_save_ifs"
- ;;
- esac],
- [enable_static=]_LT_ENABLE_STATIC_DEFAULT)
-
- _LT_DECL([build_old_libs], [enable_static], [0],
- [Whether or not to build static libraries])
-])# _LT_ENABLE_STATIC
-
-LT_OPTION_DEFINE([LT_INIT], [static], [_LT_ENABLE_STATIC([yes])])
-LT_OPTION_DEFINE([LT_INIT], [disable-static], [_LT_ENABLE_STATIC([no])])
-
-# Old names:
-AC_DEFUN([AC_ENABLE_STATIC],
-[_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[static])
-])
-
-AC_DEFUN([AC_DISABLE_STATIC],
-[_LT_SET_OPTION([LT_INIT], [disable-static])
-])
-
-AU_DEFUN([AM_ENABLE_STATIC], [AC_ENABLE_STATIC($@)])
-AU_DEFUN([AM_DISABLE_STATIC], [AC_DISABLE_STATIC($@)])
-
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AM_ENABLE_STATIC], [])
-dnl AC_DEFUN([AM_DISABLE_STATIC], [])
-
-
-
-# _LT_ENABLE_FAST_INSTALL([DEFAULT])
-# ----------------------------------
-# implement the --enable-fast-install flag, and support the `fast-install'
-# and `disable-fast-install' LT_INIT options.
-# DEFAULT is either `yes' or `no'. If omitted, it defaults to `yes'.
-m4_define([_LT_ENABLE_FAST_INSTALL],
-[m4_define([_LT_ENABLE_FAST_INSTALL_DEFAULT], [m4_if($1, no, no, yes)])dnl
-AC_ARG_ENABLE([fast-install],
- [AS_HELP_STRING([--enable-fast-install@<:@=PKGS@:>@],
- [optimize for fast installation @<:@default=]_LT_ENABLE_FAST_INSTALL_DEFAULT[@:>@])],
- [p=${PACKAGE-default}
- case $enableval in
- yes) enable_fast_install=yes ;;
- no) enable_fast_install=no ;;
- *)
- enable_fast_install=no
- # Look at the argument we got. We use all the common list separators.
- lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
- for pkg in $enableval; do
- IFS="$lt_save_ifs"
- if test "X$pkg" = "X$p"; then
- enable_fast_install=yes
- fi
- done
- IFS="$lt_save_ifs"
- ;;
- esac],
- [enable_fast_install=]_LT_ENABLE_FAST_INSTALL_DEFAULT)
-
-_LT_DECL([fast_install], [enable_fast_install], [0],
- [Whether or not to optimize for fast installation])dnl
-])# _LT_ENABLE_FAST_INSTALL
-
-LT_OPTION_DEFINE([LT_INIT], [fast-install], [_LT_ENABLE_FAST_INSTALL([yes])])
-LT_OPTION_DEFINE([LT_INIT], [disable-fast-install], [_LT_ENABLE_FAST_INSTALL([no])])
-
-# Old names:
-AU_DEFUN([AC_ENABLE_FAST_INSTALL],
-[_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[fast-install])
-AC_DIAGNOSE([obsolete],
-[$0: Remove this warning and the call to _LT_SET_OPTION when you put
-the `fast-install' option into LT_INIT's first parameter.])
-])
-
-AU_DEFUN([AC_DISABLE_FAST_INSTALL],
-[_LT_SET_OPTION([LT_INIT], [disable-fast-install])
-AC_DIAGNOSE([obsolete],
-[$0: Remove this warning and the call to _LT_SET_OPTION when you put
-the `disable-fast-install' option into LT_INIT's first parameter.])
-])
-
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_ENABLE_FAST_INSTALL], [])
-dnl AC_DEFUN([AM_DISABLE_FAST_INSTALL], [])
-
-
-# _LT_WITH_PIC([MODE])
-# --------------------
-# implement the --with-pic flag, and support the `pic-only' and `no-pic'
-# LT_INIT options.
-# MODE is either `yes' or `no'. If omitted, it defaults to `both'.
-m4_define([_LT_WITH_PIC],
-[AC_ARG_WITH([pic],
- [AS_HELP_STRING([--with-pic@<:@=PKGS@:>@],
- [try to use only PIC/non-PIC objects @<:@default=use both@:>@])],
- [lt_p=${PACKAGE-default}
- case $withval in
- yes|no) pic_mode=$withval ;;
- *)
- pic_mode=default
- # Look at the argument we got. We use all the common list separators.
- lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
- for lt_pkg in $withval; do
- IFS="$lt_save_ifs"
- if test "X$lt_pkg" = "X$lt_p"; then
- pic_mode=yes
- fi
- done
- IFS="$lt_save_ifs"
- ;;
- esac],
- [pic_mode=default])
-
-test -z "$pic_mode" && pic_mode=m4_default([$1], [default])
-
-_LT_DECL([], [pic_mode], [0], [What type of objects to build])dnl
-])# _LT_WITH_PIC
-
-LT_OPTION_DEFINE([LT_INIT], [pic-only], [_LT_WITH_PIC([yes])])
-LT_OPTION_DEFINE([LT_INIT], [no-pic], [_LT_WITH_PIC([no])])
-
-# Old name:
-AU_DEFUN([AC_LIBTOOL_PICMODE],
-[_LT_SET_OPTION([LT_INIT], [pic-only])
-AC_DIAGNOSE([obsolete],
-[$0: Remove this warning and the call to _LT_SET_OPTION when you
-put the `pic-only' option into LT_INIT's first parameter.])
-])
-
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_LIBTOOL_PICMODE], [])
-
-## ----------------- ##
-## LTDL_INIT Options ##
-## ----------------- ##
-
-m4_define([_LTDL_MODE], [])
-LT_OPTION_DEFINE([LTDL_INIT], [nonrecursive],
- [m4_define([_LTDL_MODE], [nonrecursive])])
-LT_OPTION_DEFINE([LTDL_INIT], [recursive],
- [m4_define([_LTDL_MODE], [recursive])])
-LT_OPTION_DEFINE([LTDL_INIT], [subproject],
- [m4_define([_LTDL_MODE], [subproject])])
-
-m4_define([_LTDL_TYPE], [])
-LT_OPTION_DEFINE([LTDL_INIT], [installable],
- [m4_define([_LTDL_TYPE], [installable])])
-LT_OPTION_DEFINE([LTDL_INIT], [convenience],
- [m4_define([_LTDL_TYPE], [convenience])])
diff --git a/m4/ltsugar.m4 b/m4/ltsugar.m4
deleted file mode 100644
index 9000a05..0000000
--- a/m4/ltsugar.m4
+++ /dev/null
@@ -1,123 +0,0 @@
-# ltsugar.m4 -- libtool m4 base layer. -*-Autoconf-*-
-#
-# Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
-# Written by Gary V. Vaughan, 2004
-#
-# This file is free software; the Free Software Foundation gives
-# unlimited permission to copy and/or distribute it, with or without
-# modifications, as long as this notice is preserved.
-
-# serial 6 ltsugar.m4
-
-# This is to help aclocal find these macros, as it can't see m4_define.
-AC_DEFUN([LTSUGAR_VERSION], [m4_if([0.1])])
-
-
-# lt_join(SEP, ARG1, [ARG2...])
-# -----------------------------
-# Produce ARG1SEPARG2...SEPARGn, omitting [] arguments and their
-# associated separator.
-# Needed until we can rely on m4_join from Autoconf 2.62, since all earlier
-# versions in m4sugar had bugs.
-m4_define([lt_join],
-[m4_if([$#], [1], [],
- [$#], [2], [[$2]],
- [m4_if([$2], [], [], [[$2]_])$0([$1], m4_shift(m4_shift($@)))])])
-m4_define([_lt_join],
-[m4_if([$#$2], [2], [],
- [m4_if([$2], [], [], [[$1$2]])$0([$1], m4_shift(m4_shift($@)))])])
-
-
-# lt_car(LIST)
-# lt_cdr(LIST)
-# ------------
-# Manipulate m4 lists.
-# These macros are necessary as long as will still need to support
-# Autoconf-2.59 which quotes differently.
-m4_define([lt_car], [[$1]])
-m4_define([lt_cdr],
-[m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])],
- [$#], 1, [],
- [m4_dquote(m4_shift($@))])])
-m4_define([lt_unquote], $1)
-
-
-# lt_append(MACRO-NAME, STRING, [SEPARATOR])
-# ------------------------------------------
-# Redefine MACRO-NAME to hold its former content plus `SEPARATOR'`STRING'.
-# Note that neither SEPARATOR nor STRING are expanded; they are appended
-# to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked).
-# No SEPARATOR is output if MACRO-NAME was previously undefined (different
-# than defined and empty).
-#
-# This macro is needed until we can rely on Autoconf 2.62, since earlier
-# versions of m4sugar mistakenly expanded SEPARATOR but not STRING.
-m4_define([lt_append],
-[m4_define([$1],
- m4_ifdef([$1], [m4_defn([$1])[$3]])[$2])])
-
-
-
-# lt_combine(SEP, PREFIX-LIST, INFIX, SUFFIX1, [SUFFIX2...])
-# ----------------------------------------------------------
-# Produce a SEP delimited list of all paired combinations of elements of
-# PREFIX-LIST with SUFFIX1 through SUFFIXn. Each element of the list
-# has the form PREFIXmINFIXSUFFIXn.
-# Needed until we can rely on m4_combine added in Autoconf 2.62.
-m4_define([lt_combine],
-[m4_if(m4_eval([$# > 3]), [1],
- [m4_pushdef([_Lt_sep], [m4_define([_Lt_sep], m4_defn([lt_car]))])]]dnl
-[[m4_foreach([_Lt_prefix], [$2],
- [m4_foreach([_Lt_suffix],
- ]m4_dquote(m4_dquote(m4_shift(m4_shift(m4_shift($@)))))[,
- [_Lt_sep([$1])[]m4_defn([_Lt_prefix])[$3]m4_defn([_Lt_suffix])])])])])
-
-
-# lt_if_append_uniq(MACRO-NAME, VARNAME, [SEPARATOR], [UNIQ], [NOT-UNIQ])
-# -----------------------------------------------------------------------
-# Iff MACRO-NAME does not yet contain VARNAME, then append it (delimited
-# by SEPARATOR if supplied) and expand UNIQ, else NOT-UNIQ.
-m4_define([lt_if_append_uniq],
-[m4_ifdef([$1],
- [m4_if(m4_index([$3]m4_defn([$1])[$3], [$3$2$3]), [-1],
- [lt_append([$1], [$2], [$3])$4],
- [$5])],
- [lt_append([$1], [$2], [$3])$4])])
-
-
-# lt_dict_add(DICT, KEY, VALUE)
-# -----------------------------
-m4_define([lt_dict_add],
-[m4_define([$1($2)], [$3])])
-
-
-# lt_dict_add_subkey(DICT, KEY, SUBKEY, VALUE)
-# --------------------------------------------
-m4_define([lt_dict_add_subkey],
-[m4_define([$1($2:$3)], [$4])])
-
-
-# lt_dict_fetch(DICT, KEY, [SUBKEY])
-# ----------------------------------
-m4_define([lt_dict_fetch],
-[m4_ifval([$3],
- m4_ifdef([$1($2:$3)], [m4_defn([$1($2:$3)])]),
- m4_ifdef([$1($2)], [m4_defn([$1($2)])]))])
-
-
-# lt_if_dict_fetch(DICT, KEY, [SUBKEY], VALUE, IF-TRUE, [IF-FALSE])
-# -----------------------------------------------------------------
-m4_define([lt_if_dict_fetch],
-[m4_if(lt_dict_fetch([$1], [$2], [$3]), [$4],
- [$5],
- [$6])])
-
-
-# lt_dict_filter(DICT, [SUBKEY], VALUE, [SEPARATOR], KEY, [...])
-# --------------------------------------------------------------
-m4_define([lt_dict_filter],
-[m4_if([$5], [], [],
- [lt_join(m4_quote(m4_default([$4], [[, ]])),
- lt_unquote(m4_split(m4_normalize(m4_foreach(_Lt_key, lt_car([m4_shiftn(4, $@)]),
- [lt_if_dict_fetch([$1], _Lt_key, [$2], [$3], [_Lt_key ])])))))])[]dnl
-])
diff --git a/m4/lt~obsolete.m4 b/m4/lt~obsolete.m4
deleted file mode 100644
index c573da9..0000000
--- a/m4/lt~obsolete.m4
+++ /dev/null
@@ -1,98 +0,0 @@
-# lt~obsolete.m4 -- aclocal satisfying obsolete definitions. -*-Autoconf-*-
-#
-# Copyright (C) 2004, 2005, 2007, 2009 Free Software Foundation, Inc.
-# Written by Scott James Remnant, 2004.
-#
-# This file is free software; the Free Software Foundation gives
-# unlimited permission to copy and/or distribute it, with or without
-# modifications, as long as this notice is preserved.
-
-# serial 5 lt~obsolete.m4
-
-# These exist entirely to fool aclocal when bootstrapping libtool.
-#
-# In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN)
-# which have later been changed to m4_define as they aren't part of the
-# exported API, or moved to Autoconf or Automake where they belong.
-#
-# The trouble is, aclocal is a bit thick. It'll see the old AC_DEFUN
-# in /usr/share/aclocal/libtool.m4 and remember it, then when it sees us
-# using a macro with the same name in our local m4/libtool.m4 it'll
-# pull the old libtool.m4 in (it doesn't see our shiny new m4_define
-# and doesn't know about Autoconf macros at all.)
-#
-# So we provide this file, which has a silly filename so it's always
-# included after everything else. This provides aclocal with the
-# AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything
-# because those macros already exist, or will be overwritten later.
-# We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6.
-#
-# Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here.
-# Yes, that means every name once taken will need to remain here until
-# we give up compatibility with versions before 1.7, at which point
-# we need to keep only those names which we still refer to.
-
-# This is to help aclocal find these macros, as it can't see m4_define.
-AC_DEFUN([LTOBSOLETE_VERSION], [m4_if([1])])
-
-m4_ifndef([AC_LIBTOOL_LINKER_OPTION], [AC_DEFUN([AC_LIBTOOL_LINKER_OPTION])])
-m4_ifndef([AC_PROG_EGREP], [AC_DEFUN([AC_PROG_EGREP])])
-m4_ifndef([_LT_AC_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_AC_PROG_ECHO_BACKSLASH])])
-m4_ifndef([_LT_AC_SHELL_INIT], [AC_DEFUN([_LT_AC_SHELL_INIT])])
-m4_ifndef([_LT_AC_SYS_LIBPATH_AIX], [AC_DEFUN([_LT_AC_SYS_LIBPATH_AIX])])
-m4_ifndef([_LT_PROG_LTMAIN], [AC_DEFUN([_LT_PROG_LTMAIN])])
-m4_ifndef([_LT_AC_TAGVAR], [AC_DEFUN([_LT_AC_TAGVAR])])
-m4_ifndef([AC_LTDL_ENABLE_INSTALL], [AC_DEFUN([AC_LTDL_ENABLE_INSTALL])])
-m4_ifndef([AC_LTDL_PREOPEN], [AC_DEFUN([AC_LTDL_PREOPEN])])
-m4_ifndef([_LT_AC_SYS_COMPILER], [AC_DEFUN([_LT_AC_SYS_COMPILER])])
-m4_ifndef([_LT_AC_LOCK], [AC_DEFUN([_LT_AC_LOCK])])
-m4_ifndef([AC_LIBTOOL_SYS_OLD_ARCHIVE], [AC_DEFUN([AC_LIBTOOL_SYS_OLD_ARCHIVE])])
-m4_ifndef([_LT_AC_TRY_DLOPEN_SELF], [AC_DEFUN([_LT_AC_TRY_DLOPEN_SELF])])
-m4_ifndef([AC_LIBTOOL_PROG_CC_C_O], [AC_DEFUN([AC_LIBTOOL_PROG_CC_C_O])])
-m4_ifndef([AC_LIBTOOL_SYS_HARD_LINK_LOCKS], [AC_DEFUN([AC_LIBTOOL_SYS_HARD_LINK_LOCKS])])
-m4_ifndef([AC_LIBTOOL_OBJDIR], [AC_DEFUN([AC_LIBTOOL_OBJDIR])])
-m4_ifndef([AC_LTDL_OBJDIR], [AC_DEFUN([AC_LTDL_OBJDIR])])
-m4_ifndef([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH], [AC_DEFUN([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH])])
-m4_ifndef([AC_LIBTOOL_SYS_LIB_STRIP], [AC_DEFUN([AC_LIBTOOL_SYS_LIB_STRIP])])
-m4_ifndef([AC_PATH_MAGIC], [AC_DEFUN([AC_PATH_MAGIC])])
-m4_ifndef([AC_PROG_LD_GNU], [AC_DEFUN([AC_PROG_LD_GNU])])
-m4_ifndef([AC_PROG_LD_RELOAD_FLAG], [AC_DEFUN([AC_PROG_LD_RELOAD_FLAG])])
-m4_ifndef([AC_DEPLIBS_CHECK_METHOD], [AC_DEFUN([AC_DEPLIBS_CHECK_METHOD])])
-m4_ifndef([AC_LIBTOOL_PROG_COMPILER_NO_RTTI], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_NO_RTTI])])
-m4_ifndef([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE], [AC_DEFUN([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE])])
-m4_ifndef([AC_LIBTOOL_PROG_COMPILER_PIC], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_PIC])])
-m4_ifndef([AC_LIBTOOL_PROG_LD_SHLIBS], [AC_DEFUN([AC_LIBTOOL_PROG_LD_SHLIBS])])
-m4_ifndef([AC_LIBTOOL_POSTDEP_PREDEP], [AC_DEFUN([AC_LIBTOOL_POSTDEP_PREDEP])])
-m4_ifndef([LT_AC_PROG_EGREP], [AC_DEFUN([LT_AC_PROG_EGREP])])
-m4_ifndef([LT_AC_PROG_SED], [AC_DEFUN([LT_AC_PROG_SED])])
-m4_ifndef([_LT_CC_BASENAME], [AC_DEFUN([_LT_CC_BASENAME])])
-m4_ifndef([_LT_COMPILER_BOILERPLATE], [AC_DEFUN([_LT_COMPILER_BOILERPLATE])])
-m4_ifndef([_LT_LINKER_BOILERPLATE], [AC_DEFUN([_LT_LINKER_BOILERPLATE])])
-m4_ifndef([_AC_PROG_LIBTOOL], [AC_DEFUN([_AC_PROG_LIBTOOL])])
-m4_ifndef([AC_LIBTOOL_SETUP], [AC_DEFUN([AC_LIBTOOL_SETUP])])
-m4_ifndef([_LT_AC_CHECK_DLFCN], [AC_DEFUN([_LT_AC_CHECK_DLFCN])])
-m4_ifndef([AC_LIBTOOL_SYS_DYNAMIC_LINKER], [AC_DEFUN([AC_LIBTOOL_SYS_DYNAMIC_LINKER])])
-m4_ifndef([_LT_AC_TAGCONFIG], [AC_DEFUN([_LT_AC_TAGCONFIG])])
-m4_ifndef([AC_DISABLE_FAST_INSTALL], [AC_DEFUN([AC_DISABLE_FAST_INSTALL])])
-m4_ifndef([_LT_AC_LANG_CXX], [AC_DEFUN([_LT_AC_LANG_CXX])])
-m4_ifndef([_LT_AC_LANG_F77], [AC_DEFUN([_LT_AC_LANG_F77])])
-m4_ifndef([_LT_AC_LANG_GCJ], [AC_DEFUN([_LT_AC_LANG_GCJ])])
-m4_ifndef([AC_LIBTOOL_LANG_C_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_C_CONFIG])])
-m4_ifndef([_LT_AC_LANG_C_CONFIG], [AC_DEFUN([_LT_AC_LANG_C_CONFIG])])
-m4_ifndef([AC_LIBTOOL_LANG_CXX_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_CXX_CONFIG])])
-m4_ifndef([_LT_AC_LANG_CXX_CONFIG], [AC_DEFUN([_LT_AC_LANG_CXX_CONFIG])])
-m4_ifndef([AC_LIBTOOL_LANG_F77_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_F77_CONFIG])])
-m4_ifndef([_LT_AC_LANG_F77_CONFIG], [AC_DEFUN([_LT_AC_LANG_F77_CONFIG])])
-m4_ifndef([AC_LIBTOOL_LANG_GCJ_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_GCJ_CONFIG])])
-m4_ifndef([_LT_AC_LANG_GCJ_CONFIG], [AC_DEFUN([_LT_AC_LANG_GCJ_CONFIG])])
-m4_ifndef([AC_LIBTOOL_LANG_RC_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_RC_CONFIG])])
-m4_ifndef([_LT_AC_LANG_RC_CONFIG], [AC_DEFUN([_LT_AC_LANG_RC_CONFIG])])
-m4_ifndef([AC_LIBTOOL_CONFIG], [AC_DEFUN([AC_LIBTOOL_CONFIG])])
-m4_ifndef([_LT_AC_FILE_LTDLL_C], [AC_DEFUN([_LT_AC_FILE_LTDLL_C])])
-m4_ifndef([_LT_REQUIRED_DARWIN_CHECKS], [AC_DEFUN([_LT_REQUIRED_DARWIN_CHECKS])])
-m4_ifndef([_LT_AC_PROG_CXXCPP], [AC_DEFUN([_LT_AC_PROG_CXXCPP])])
-m4_ifndef([_LT_PREPARE_SED_QUOTE_VARS], [AC_DEFUN([_LT_PREPARE_SED_QUOTE_VARS])])
-m4_ifndef([_LT_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_PROG_ECHO_BACKSLASH])])
-m4_ifndef([_LT_PROG_F77], [AC_DEFUN([_LT_PROG_F77])])
-m4_ifndef([_LT_PROG_FC], [AC_DEFUN([_LT_PROG_FC])])
-m4_ifndef([_LT_PROG_CXX], [AC_DEFUN([_LT_PROG_CXX])])
diff --git a/src/Makefile.am b/src/Makefile.am
index a3bd37a..cfc2a50 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -4,11 +4,21 @@
AUTOMAKE_OPTIONS = subdir-objects
AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
-AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare
+# Avoid using SIMD_FLAGS for code that calls strcmp, because newer gcc
+# versions will use SIMD for the strcmp implementation. Instead,
+# we create a static library just for gf_method that is not compiled
+# with SIMD_FLAGS; this static library gets linked into gf_complete.so.
+noinst_LTLIBRARIES = libgf_util.la
+libgf_util_la_SOURCES = gf_method.c
+libgf_util_la_CFLAGS = -O3 -fPIC -Wsign-compare
+
+# we narrowly use SIMD_FLAGS for code that needs it
lib_LTLIBRARIES = libgf_complete.la
-libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
- gf_w64.c gf_w128.c gf_rand.c gf_general.c
+libgf_complete_la_SOURCES = gf.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
+ gf_w64.c gf_w128.c gf_rand.c gf_general.c gf_cpu.c
+libgf_complete_la_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare
+libgf_complete_la_LIBADD = libgf_util.la
if HAVE_NEON
libgf_complete_la_SOURCES += neon/gf_w4_neon.c \
diff --git a/src/gf.c b/src/gf.c
index b7a5c01..84d6996 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
+#include "gf_cpu.h"
int _gf_errno = GF_E_DEFAULT;
@@ -207,20 +208,28 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
#ifdef INTEL_SSE2
- sse2 = 1;
+ if (gf_cpu_supports_intel_sse2) {
+ sse2 = 1;
+ }
#endif
#ifdef INTEL_SSSE3
- sse3 = 1;
+ if (gf_cpu_supports_intel_ssse3) {
+ sse3 = 1;
+ }
#endif
#ifdef INTEL_SSE4_PCLMUL
- pclmul = 1;
+ if (gf_cpu_supports_intel_pclmul) {
+ pclmul = 1;
+ }
#endif
#ifdef ARM_NEON
- pclmul = (w == 4 || w == 8);
- sse3 = 1;
+ if (gf_cpu_supports_arm_neon) {
+ pclmul = (w == 4 || w == 8);
+ sse3 = 1;
+ }
#endif
@@ -473,6 +482,8 @@ int gf_init_hard(gf_t *gf, int w, int mult_type,
int sz;
gf_internal_t *h;
+ gf_cpu_identify();
+
if (gf_error_check(w, mult_type, region_type, divide_type,
arg1, arg2, prim_poly, base_gf) == 0) return 0;
@@ -901,9 +912,6 @@ static void gf_unaligned_xor(void *src, void *dest, int bytes);
void gf_multby_one(void *src, void *dest, int bytes, int xor)
{
-#ifdef INTEL_SSE2
- __m128i ms, md;
-#endif
unsigned long uls, uld;
uint8_t *s8, *d8;
uint64_t *s64, *d64, *dtop64;
@@ -918,84 +926,89 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
uld = (unsigned long) dest;
#ifdef INTEL_SSE2
- int abytes;
- s8 = (uint8_t *) src;
- d8 = (uint8_t *) dest;
- if (uls % 16 == uld % 16) {
- gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
- while (s8 != rd.s_start) {
- *d8 ^= *s8;
- d8++;
- s8++;
+ if (gf_cpu_supports_intel_sse2) {
+ __m128i ms, md;
+ int abytes;
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+ if (uls % 16 == uld % 16) {
+ gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+ while (s8 != rd.s_start) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ while (s8 < (uint8_t *) rd.s_top) {
+ ms = _mm_load_si128 ((__m128i *)(s8));
+ md = _mm_load_si128 ((__m128i *)(d8));
+ md = _mm_xor_si128(md, ms);
+ _mm_store_si128((__m128i *)(d8), md);
+ s8 += 16;
+ d8 += 16;
+ }
+ while (s8 != (uint8_t *) src + bytes) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ return;
}
- while (s8 < (uint8_t *) rd.s_top) {
- ms = _mm_load_si128 ((__m128i *)(s8));
- md = _mm_load_si128 ((__m128i *)(d8));
+
+ abytes = (bytes & 0xfffffff0);
+
+ while (d8 < (uint8_t *) dest + abytes) {
+ ms = _mm_loadu_si128 ((__m128i *)(s8));
+ md = _mm_loadu_si128 ((__m128i *)(d8));
md = _mm_xor_si128(md, ms);
- _mm_store_si128((__m128i *)(d8), md);
+ _mm_storeu_si128((__m128i *)(d8), md);
s8 += 16;
d8 += 16;
}
- while (s8 != (uint8_t *) src + bytes) {
+ while (d8 != (uint8_t *) dest+bytes) {
*d8 ^= *s8;
d8++;
s8++;
}
return;
}
-
- abytes = (bytes & 0xfffffff0);
-
- while (d8 < (uint8_t *) dest + abytes) {
- ms = _mm_loadu_si128 ((__m128i *)(s8));
- md = _mm_loadu_si128 ((__m128i *)(d8));
- md = _mm_xor_si128(md, ms);
- _mm_storeu_si128((__m128i *)(d8), md);
- s8 += 16;
- d8 += 16;
- }
- while (d8 != (uint8_t *) dest+bytes) {
- *d8 ^= *s8;
- d8++;
- s8++;
- }
- return;
#endif
#if defined(ARM_NEON)
- s8 = (uint8_t *) src;
- d8 = (uint8_t *) dest;
-
- if (uls % 16 == uld % 16) {
- gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
- while (s8 != rd.s_start) {
+ if (gf_cpu_supports_arm_neon) {
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+
+ if (uls % 16 == uld % 16) {
+ gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+ while (s8 != rd.s_start) {
+ *d8 ^= *s8;
+ s8++;
+ d8++;
+ }
+ while (s8 < (uint8_t *) rd.s_top) {
+ uint8x16_t vs = vld1q_u8 (s8);
+ uint8x16_t vd = vld1q_u8 (d8);
+ uint8x16_t vr = veorq_u8 (vs, vd);
+ vst1q_u8 (d8, vr);
+ s8 += 16;
+ d8 += 16;
+ }
+ } else {
+ while (s8 + 15 < (uint8_t *) src + bytes) {
+ uint8x16_t vs = vld1q_u8 (s8);
+ uint8x16_t vd = vld1q_u8 (d8);
+ uint8x16_t vr = veorq_u8 (vs, vd);
+ vst1q_u8 (d8, vr);
+ s8 += 16;
+ d8 += 16;
+ }
+ }
+ while (s8 < (uint8_t *) src + bytes) {
*d8 ^= *s8;
s8++;
d8++;
}
- while (s8 < (uint8_t *) rd.s_top) {
- uint8x16_t vs = vld1q_u8 (s8);
- uint8x16_t vd = vld1q_u8 (d8);
- uint8x16_t vr = veorq_u8 (vs, vd);
- vst1q_u8 (d8, vr);
- s8 += 16;
- d8 += 16;
- }
- } else {
- while (s8 + 15 < (uint8_t *) src + bytes) {
- uint8x16_t vs = vld1q_u8 (s8);
- uint8x16_t vd = vld1q_u8 (d8);
- uint8x16_t vr = veorq_u8 (vs, vd);
- vst1q_u8 (d8, vr);
- s8 += 16;
- d8 += 16;
- }
- }
- while (s8 < (uint8_t *) src + bytes) {
- *d8 ^= *s8;
- s8++;
- d8++;
+ return;
}
- return;
#endif
if (uls % 8 != uld % 8) {
gf_unaligned_xor(src, dest, bytes);
diff --git a/src/gf_cpu.c b/src/gf_cpu.c
new file mode 100644
index 0000000..fae2cd5
--- /dev/null
+++ b/src/gf_cpu.c
@@ -0,0 +1,168 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_cpu.c
+ *
+ * Identifies whether the CPU supports SIMD instructions at runtime.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Set to 1 at the end of gf_cpu_identify() so that later calls return early. */
+int gf_cpu_identified = 0;
+
+/*
+ * Runtime SIMD feature flags. All start at 0; gf_cpu_identify() sets a flag
+ * to 1 when the feature is detected and has not been disabled through its
+ * GF_COMPLETE_DISABLE_* environment variable.
+ */
+int gf_cpu_supports_intel_pclmul = 0;
+int gf_cpu_supports_intel_sse4 = 0;
+int gf_cpu_supports_intel_ssse3 = 0;
+int gf_cpu_supports_intel_sse3 = 0;
+int gf_cpu_supports_intel_sse2 = 0;
+int gf_cpu_supports_arm_neon = 0;
+
+#if defined(__x86_64__)
+
+#if defined(_MSC_VER)
+
+#define cpuid(info, x) __cpuidex(info, x, 0)
+
+#elif defined(__GNUC__)
+
+#include <cpuid.h>
+/*
+ * GCC-compatible CPUID wrapper: queries leaf InfoType with sub-leaf 0 and
+ * stores EAX, EBX, ECX and EDX into info[0..3] respectively.
+ */
+void cpuid(int info[4], int InfoType)
+{
+  __cpuid_count(InfoType, 0,
+                info[0], info[1], info[2], info[3]);
+}
+
+#else
+
+#error please add a way to detect CPU SIMD support at runtime
+
+#endif
+
+/*
+ * Probe the CPU once through CPUID leaf 1 and record which SIMD extensions
+ * may be used at runtime.
+ *
+ * A gf_cpu_supports_intel_* flag is set only when all of the following hold:
+ *   - the matching INTEL_* macro was defined at compile time,
+ *   - CPUID reports the corresponding feature bit,
+ *   - the matching GF_COMPLETE_DISABLE_* environment variable is not set.
+ *
+ * Feature bits of CPUID leaf 1 (reg[2] holds ECX, reg[3] holds EDX):
+ *   ECX bit 0  SSE3      ECX bit 1  PCLMULQDQ   ECX bit 9  SSSE3
+ *   ECX bit 19 SSE4.1    ECX bit 20 SSE4.2      EDX bit 26 SSE2
+ *
+ * Calls made after the first one return immediately.
+ */
+void gf_cpu_identify(void)
+{
+  int reg[4];
+
+  if (gf_cpu_identified) {
+    return;
+  }
+
+  cpuid(reg, 1);
+
+#if defined(INTEL_SSE4_PCLMUL)
+  /* PCLMULQDQ is ECX bit 1; bit 0 is SSE3 and says nothing about carry-less multiply */
+  if ((reg[2] & (1<<1)) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE4_PCLMUL")) {
+    gf_cpu_supports_intel_pclmul = 1;
+#ifdef DEBUG_CPU_DETECTION
+    printf("#gf_cpu_supports_intel_pclmul\n");
+#endif
+  }
+#endif
+
+#if defined(INTEL_SSE4)
+  /* either SSE4.2 (bit 20) or SSE4.1 (bit 19) enables the SSE4 code paths */
+  if (((reg[2] & (1<<20)) != 0 || (reg[2] & (1<<19)) != 0) && !getenv("GF_COMPLETE_DISABLE_SSE4")) {
+    gf_cpu_supports_intel_sse4 = 1;
+#ifdef DEBUG_CPU_DETECTION
+    printf("#gf_cpu_supports_intel_sse4\n");
+#endif
+  }
+#endif
+
+#if defined(INTEL_SSSE3)
+  if ((reg[2] & (1<<9)) != 0 && !getenv("GF_COMPLETE_DISABLE_SSSE3")) {
+    gf_cpu_supports_intel_ssse3 = 1;
+#ifdef DEBUG_CPU_DETECTION
+    printf("#gf_cpu_supports_intel_ssse3\n");
+#endif
+  }
+#endif
+
+#if defined(INTEL_SSE3)
+  if ((reg[2] & 1) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE3")) {
+    gf_cpu_supports_intel_sse3 = 1;
+#ifdef DEBUG_CPU_DETECTION
+    printf("#gf_cpu_supports_intel_sse3\n");
+#endif
+  }
+#endif
+
+#if defined(INTEL_SSE2)
+  if ((reg[3] & (1<<26)) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE2")) {
+    gf_cpu_supports_intel_sse2 = 1;
+#ifdef DEBUG_CPU_DETECTION
+    printf("#gf_cpu_supports_intel_sse2\n");
+#endif
+  }
+#endif
+
+  gf_cpu_identified = 1;
+}
+
+#elif defined(__arm__) || defined(__aarch64__)
+
+#ifdef __linux__
+
+#include <stdio.h>
+#include <unistd.h>
+#include <elf.h>
+#include <linux/auxvec.h>
+#include <asm/hwcap.h>
+#include <fcntl.h>
+
+/*
+ * Look up one entry of the process's ELF auxiliary vector by reading
+ * /proc/self/auxv.
+ *
+ * type: the auxiliary-vector key to search for (the caller passes AT_HWCAP).
+ * Returns the value stored for that key, or 0 when the file cannot be
+ * opened, a read fails, or the key is not present.
+ *
+ * NOTE(review): records are parsed as Elf32_auxv_t, which presumably only
+ * matches the kernel's layout on 32-bit targets; the only call site in this
+ * file is compiled for __arm__, confirm before using it elsewhere.
+ */
+unsigned long get_hwcap(unsigned long type) {
+  unsigned long hwcap = 0;
+  int fd = open("/proc/self/auxv", O_RDONLY);
+  /* open() signals failure with -1; descriptor 0 is a valid result */
+  if (fd >= 0) {
+    Elf32_auxv_t auxv;
+    /* stop on end of file (0), on error (-1) and on a truncated record */
+    while (read(fd, &auxv, sizeof(Elf32_auxv_t)) == (ssize_t) sizeof(Elf32_auxv_t)) {
+      if (auxv.a_type == type) {
+        hwcap = auxv.a_un.a_val;
+        break;
+      }
+    }
+    close(fd);
+  }
+
+  return hwcap;
+}
+
+#endif // linux
+
+/*
+ * ARM variant: decide once whether the NEON code paths may be used.
+ *
+ * gf_cpu_supports_arm_neon is only touched when ARM_NEON was defined at
+ * compile time and the GF_COMPLETE_DISABLE_NEON environment variable is
+ * not set. Calls made after the first one return immediately.
+ */
+void gf_cpu_identify(void)
+{
+  if (gf_cpu_identified) {
+    return;
+  }
+
+#if defined(ARM_NEON)
+  if (getenv("GF_COMPLETE_DISABLE_NEON") == NULL) {
+#if defined(__linux__) && defined(__arm__)
+    /* 32-bit ARM on Linux: ask the kernel through the HWCAP auxv entry */
+    gf_cpu_supports_arm_neon = (get_hwcap(AT_HWCAP) & HWCAP_NEON) != 0;
+#elif defined(__aarch64__)
+    /* ASIMD is supported on all aarch64 architectures */
+    gf_cpu_supports_arm_neon = 1;
+#else
+    /* no reliable way to detect runtime support here, so we assume NEON */
+    /* is available whenever the compiler supports it */
+    gf_cpu_supports_arm_neon = 1;
+#endif
+
+#ifdef DEBUG_CPU_DETECTION
+    if (gf_cpu_supports_arm_neon) {
+      printf("#gf_cpu_supports_arm_neon\n");
+    }
+#endif
+  }
+#endif /* defined(ARM_NEON) */
+
+  gf_cpu_identified = 1;
+}
+
+#else // defined(__arm__) || defined(__aarch64__)
+
+/*
+ * Fallback for architectures without runtime SIMD detection: no feature
+ * flag is set, detection is only marked as done. Declared void so the
+ * signature matches the x86 and ARM definitions of this function.
+ */
+void gf_cpu_identify(void)
+{
+  gf_cpu_identified = 1;
+}
+
+#endif
diff --git a/src/gf_w128.c b/src/gf_w128.c
index b1e3d92..74f72e8 100644
--- a/src/gf_w128.c
+++ b/src/gf_w128.c
@@ -11,6 +11,7 @@
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
+#include "gf_cpu.h"
#define GF_FIELD_WIDTH (128)
@@ -290,11 +291,11 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12
return;
}
+#if defined(INTEL_SSE4_PCLMUL)
+
void
gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a,b;
__m128i result0,result1;
__m128i prim_poly;
@@ -338,9 +339,8 @@ gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_
c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
-#endif
-return;
}
+#endif
void
gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
@@ -376,10 +376,10 @@ gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_
return;
}
+#if defined(INTEL_SSE4)
void
gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
-#if defined(INTEL_SSE4)
int i;
__m128i a, b, pp, prod, amask, u_middle_one;
/*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
@@ -427,16 +427,16 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
}
c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
-#endif
return;
}
+#endif
/* Ben: This slow function implements sse instrutions for bytwo_b because why not */
+#if defined(INTEL_SSE4)
void
gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
-#if defined(INTEL_SSE4)
__m128i a, b, lmask, hmask, pp, c, middle_one;
gf_internal_t *h;
uint64_t topbit, middlebit;
@@ -471,8 +471,8 @@ gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
if (middlebit) b = _mm_xor_si128(b, middle_one);
if (topbit) b = _mm_xor_si128(b, pp);
}
-#endif
}
+#endif
void
gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
@@ -1146,7 +1146,7 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val,
}
/* a^-1 -> b */
- void
+void
gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
{
uint64_t e_i[2], e_im1[2], e_ip1[2];
@@ -1239,7 +1239,7 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
return;
}
- void
+void
gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
uint64_t d[2];
@@ -1248,7 +1248,7 @@ gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val
return;
}
- void
+void
gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
{
uint64_t one128[2];
@@ -1260,7 +1260,7 @@ gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
static
- void
+void
gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv)
{
gf_internal_t *h = (gf_internal_t *) gf->scratch;
@@ -1405,14 +1405,14 @@ int gf_w128_composite_init(gf_t *gf)
gf_internal_t *h = (gf_internal_t *) gf->scratch;
if (h->region_type & GF_REGION_ALTMAP) {
- gf->multiply_region.w128 = gf_w128_composite_multiply_region_alt;
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region_alt)
} else {
- gf->multiply_region.w128 = gf_w128_composite_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region)
}
- gf->multiply.w128 = gf_w128_composite_multiply;
- gf->divide.w128 = gf_w128_divide_from_inverse;
- gf->inverse.w128 = gf_w128_composite_inverse;
+ SET_FUNCTION(gf,multiply,w128,gf_w128_composite_multiply)
+ SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w128,gf_w128_composite_inverse)
return 1;
}
@@ -1421,10 +1421,12 @@ static
int gf_w128_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
- gf->inverse.w128 = gf_w128_euclid;
- gf->multiply.w128 = gf_w128_clm_multiply;
- gf->multiply_region.w128 = gf_w128_clm_multiply_region_from_single;
- return 1;
+ if (gf_cpu_supports_intel_pclmul) {
+ SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
+ SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)
+ return 1;
+ }
#endif
return 0;
@@ -1433,9 +1435,9 @@ int gf_w128_cfm_init(gf_t *gf)
static
int gf_w128_shift_init(gf_t *gf)
{
- gf->multiply.w128 = gf_w128_shift_multiply;
- gf->inverse.w128 = gf_w128_euclid;
- gf->multiply_region.w128 = gf_w128_multiply_region_from_single;
+ SET_FUNCTION(gf,multiply,w128,gf_w128_shift_multiply)
+ SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_multiply_region_from_single)
return 1;
}
@@ -1446,16 +1448,16 @@ int gf_w128_bytwo_init(gf_t *gf)
h = (gf_internal_t *) gf->scratch;
if (h->mult_type == GF_MULT_BYTWO_p) {
- gf->multiply.w128 = gf_w128_bytwo_p_multiply;
- /*gf->multiply.w128 = gf_w128_sse_bytwo_p_multiply;*/
+ SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
+ /*SET_FUNCTION(gf,multiply,w128,gf_w128_sse_bytwo_p_multiply)*/
/* John: the sse function is slower.*/
} else {
- gf->multiply.w128 = gf_w128_bytwo_b_multiply;
- /*gf->multiply.w128 = gf_w128_sse_bytwo_b_multiply;
+ SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_b_multiply)
+ /*SET_FUNCTION(gf,multiply,w128,gf_w128_sse_bytwo_b_multiply)
Ben: This sse function is also slower. */
}
- gf->inverse.w128 = gf_w128_euclid;
- gf->multiply_region.w128 = gf_w128_bytwo_b_multiply_region;
+ SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_bytwo_b_multiply_region)
return 1;
}
@@ -1525,20 +1527,20 @@ int gf_w128_split_init(gf_t *gf)
h = (gf_internal_t *) gf->scratch;
- gf->multiply.w128 = gf_w128_bytwo_p_multiply;
+ SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
#if defined(INTEL_SSE4_PCLMUL)
- if (!(h->region_type & GF_REGION_NOSIMD)){
- gf->multiply.w128 = gf_w128_clm_multiply;
+ if (gf_cpu_supports_intel_pclmul && !(h->region_type & GF_REGION_NOSIMD)){
+ SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
}
#endif
- gf->inverse.w128 = gf_w128_euclid;
+ SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) {
sd8 = (struct gf_w128_split_8_128_data *) h->private;
sd8->last_value[0] = 0;
sd8->last_value[1] = 0;
- gf->multiply_region.w128 = gf_w128_split_8_128_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_8_128_multiply_region)
} else {
sd4 = (struct gf_w128_split_4_128_data *) h->private;
sd4->last_value[0] = 0;
@@ -1546,23 +1548,19 @@ int gf_w128_split_init(gf_t *gf)
if((h->region_type & GF_REGION_ALTMAP))
{
#ifdef INTEL_SSE4
- if(!(h->region_type & GF_REGION_NOSIMD))
- gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region;
+ if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_altmap_multiply_region)
else
- return 0;
- #else
- return 0;
#endif
+ return 0;
}
else {
#ifdef INTEL_SSE4
- if(!(h->region_type & GF_REGION_NOSIMD))
- gf->multiply_region.w128 = gf_w128_split_4_128_sse_multiply_region;
+ if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_multiply_region)
else
- gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region;
- #else
- gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region;
#endif
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
}
}
return 1;
@@ -1586,9 +1584,9 @@ int gf_w128_group_init(gf_t *gf)
gt->m_table[2] = 0;
gt->m_table[3] = 0;
- gf->multiply.w128 = gf_w128_group_multiply;
- gf->inverse.w128 = gf_w128_euclid;
- gf->multiply_region.w128 = gf_w128_group_multiply_region;
+ SET_FUNCTION(gf,multiply,w128,gf_w128_group_multiply)
+ SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_group_multiply_region)
gf_w128_group_r_init(gf);
@@ -1738,10 +1736,10 @@ int gf_w128_init(gf_t *gf)
}
}
- gf->multiply.w128 = NULL;
- gf->divide.w128 = NULL;
- gf->inverse.w128 = NULL;
- gf->multiply_region.w128 = NULL;
+ SET_FUNCTION(gf,multiply,w128,NULL)
+ SET_FUNCTION(gf,divide,w128,NULL)
+ SET_FUNCTION(gf,inverse,w128,NULL)
+ SET_FUNCTION(gf,multiply_region,w128,NULL)
switch(h->mult_type) {
case GF_MULT_BYTWO_p:
case GF_MULT_BYTWO_b: if (gf_w128_bytwo_init(gf) == 0) return 0; break;
@@ -1757,22 +1755,22 @@ int gf_w128_init(gf_t *gf)
/* Ben: Used to be h->region_type == GF_REGION_ALTMAP, but failed since there
are multiple flags in h->region_type */
if (h->mult_type == GF_MULT_SPLIT_TABLE && (h->region_type & GF_REGION_ALTMAP)) {
- gf->extract_word.w128 = gf_w128_split_extract_word;
+ SET_FUNCTION(gf,extract_word,w128,gf_w128_split_extract_word)
} else if (h->mult_type == GF_MULT_COMPOSITE && h->region_type == GF_REGION_ALTMAP) {
- gf->extract_word.w128 = gf_w128_composite_extract_word;
+ SET_FUNCTION(gf,extract_word,w128,gf_w128_composite_extract_word)
} else {
- gf->extract_word.w128 = gf_w128_extract_word;
+ SET_FUNCTION(gf,extract_word,w128,gf_w128_extract_word)
}
if (h->divide_type == GF_DIVIDE_EUCLID) {
- gf->divide.w128 = gf_w128_divide_from_inverse;
+ SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)
}
if (gf->inverse.w128 != NULL && gf->divide.w128 == NULL) {
- gf->divide.w128 = gf_w128_divide_from_inverse;
+ SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)
}
if (gf->inverse.w128 == NULL && gf->divide.w128 != NULL) {
- gf->inverse.w128 = gf_w128_inverse_from_divide;
+ SET_FUNCTION(gf,inverse,w128,gf_w128_inverse_from_divide)
}
return 1;
}
diff --git a/src/gf_w16.c b/src/gf_w16.c
index 4e026b2..8316892 100644
--- a/src/gf_w16.c
+++ b/src/gf_w16.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w16.h"
+#include "gf_cpu.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@@ -391,6 +392,7 @@ gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b)
extra memory.
*/
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -398,8 +400,6 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -433,11 +433,11 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -445,8 +445,6 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -473,11 +471,11 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -485,8 +483,6 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -515,10 +511,9 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-
-#endif
return rv;
}
+#endif
static
@@ -548,7 +543,7 @@ gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
static
int gf_w16_shift_init(gf_t *gf)
{
- gf->multiply.w32 = gf_w16_shift_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w16_shift_multiply)
return 1;
}
@@ -556,25 +551,27 @@ static
int gf_w16_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
- h = (gf_internal_t *) gf->scratch;
-
- /*Ben: Determining how many reductions to do */
-
- if ((0xfe00 & h->prim_poly) == 0) {
- gf->multiply.w32 = gf_w16_clm_multiply_2;
- gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_2;
- } else if((0xf000 & h->prim_poly) == 0) {
- gf->multiply.w32 = gf_w16_clm_multiply_3;
- gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_3;
- } else if ((0xe000 & h->prim_poly) == 0) {
- gf->multiply.w32 = gf_w16_clm_multiply_4;
- gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_4;
- } else {
- return 0;
- }
- return 1;
+ h = (gf_internal_t *) gf->scratch;
+
+ /*Ben: Determining how many reductions to do */
+
+ if ((0xfe00 & h->prim_poly) == 0) {
+ SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_2)
+ } else if((0xf000 & h->prim_poly) == 0) {
+ SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_3)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_3)
+ } else if ((0xe000 & h->prim_poly) == 0) {
+ SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_4)
+ } else {
+ return 0;
+ }
+ return 1;
+ }
#endif
return 0;
@@ -688,10 +685,9 @@ int gf_w16_log_init(gf_t *gf)
if (check) {
if (h->mult_type != GF_MULT_LOG_TABLE) {
-
-#if defined(INTEL_SSE4_PCLMUL)
- return gf_w16_cfm_init(gf);
-#endif
+ if (gf_cpu_supports_intel_pclmul) {
+ return gf_w16_cfm_init(gf);
+ }
return gf_w16_shift_init(gf);
} else {
_gf_errno = GF_E_LOGPOLY;
@@ -705,10 +701,10 @@ int gf_w16_log_init(gf_t *gf)
ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]];
}
- gf->inverse.w32 = gf_w16_log_inverse;
- gf->divide.w32 = gf_w16_log_divide;
- gf->multiply.w32 = gf_w16_log_multiply;
- gf->multiply_region.w32 = gf_w16_log_multiply_region;
+ SET_FUNCTION(gf,inverse,w32,gf_w16_log_inverse)
+ SET_FUNCTION(gf,divide,w32,gf_w16_log_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w16_log_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_log_multiply_region)
return 1;
}
@@ -948,11 +944,11 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
gf_do_final_region_alignment(&rd);
}
+#ifdef INTEL_SSSE3
static
void
gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSSE3
uint64_t i, j, *s64, *d64, *top64;;
uint64_t c, prod;
uint8_t low[4][16];
@@ -1078,14 +1074,14 @@ gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_v
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
+#ifdef INTEL_SSSE3
static
void
gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSSE3
uint64_t i, j, *s64, *d64, *top64;;
uint64_t c, prod;
uint8_t low[4][16];
@@ -1187,8 +1183,8 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
uint32_t
gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
@@ -1216,21 +1212,11 @@ int gf_w16_split_init(gf_t *gf)
{
gf_internal_t *h;
struct gf_w16_split_8_8_data *d8;
- int i, j, exp, issse3;
- int isneon = 0;
+ int i, j, exp;
uint32_t p, basep, tmp;
h = (gf_internal_t *) gf->scratch;
-#ifdef INTEL_SSSE3
- issse3 = 1;
-#else
- issse3 = 0;
-#endif
-#ifdef ARM_NEON
- isneon = 1;
-#endif
-
if (h->arg1 == 8 && h->arg2 == 8) {
d8 = (struct gf_w16_split_8_8_data *) h->private;
basep = 1;
@@ -1260,8 +1246,8 @@ int gf_w16_split_init(gf_t *gf)
}
for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
}
- gf->multiply.w32 = gf_w16_split_8_8_multiply;
- gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
+ SET_FUNCTION(gf,multiply,w32,gf_w16_split_8_8_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
return 1;
}
@@ -1273,36 +1259,45 @@ int gf_w16_split_init(gf_t *gf)
/* Defaults */
- if (issse3) {
- gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region;
- } else if (isneon) {
-#ifdef ARM_NEON
+#ifdef INTEL_SSSE3
+ if (gf_cpu_supports_intel_ssse3) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_multiply_region)
+ } else {
+#elif ARM_NEON
+ if (gf_cpu_supports_arm_neon) {
gf_w16_neon_split_init(gf);
-#endif
} else {
- gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
+#endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
}
-
+#endif
if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) {
- gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
} else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
- if (issse3 || isneon) {
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
else if(h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region;
- else if(h->region_type & GF_REGION_ALTMAP && issse3)
- gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
+#if defined(INTEL_SSSE3)
+ else if(h->region_type & GF_REGION_ALTMAP && gf_cpu_supports_intel_ssse3)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_altmap_multiply_region)
+#endif
} else {
+#endif
if(h->region_type & GF_REGION_SIMD)
return 0;
else if(h->region_type & GF_REGION_ALTMAP)
- gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
else
- gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
}
+#endif
}
return 1;
@@ -1313,7 +1308,7 @@ int gf_w16_table_init(gf_t *gf)
{
gf_w16_log_init(gf);
- gf->multiply_region.w32 = gf_w16_table_lazy_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_table_lazy_multiply_region)
return 1;
}
@@ -1844,28 +1839,30 @@ int gf_w16_bytwo_init(gf_t *gf)
}
if (h->mult_type == GF_MULT_BYTWO_p) {
- gf->multiply.w32 = gf_w16_bytwo_p_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
- else
- gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region;
- #else
- gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_sse_multiply_region)
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
- gf->multiply.w32 = gf_w16_bytwo_b_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
- else
- gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region;
- #else
- gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_sse_multiply_region)
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
@@ -1904,10 +1901,10 @@ int gf_w16_log_zero_init(gf_t *gf)
ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]];
}
- gf->inverse.w32 = gf_w16_log_zero_inverse;
- gf->divide.w32 = gf_w16_log_zero_divide;
- gf->multiply.w32 = gf_w16_log_zero_multiply;
- gf->multiply_region.w32 = gf_w16_log_zero_multiply_region;
+ SET_FUNCTION(gf,inverse,w32,gf_w16_log_zero_inverse)
+ SET_FUNCTION(gf,divide,w32,gf_w16_log_zero_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w16_log_zero_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_log_zero_multiply_region)
return 1;
}
@@ -2145,18 +2142,18 @@ int gf_w16_composite_init(gf_t *gf)
cd->mult_table = gf_w8_get_mult_table(h->base_gf);
if (h->region_type & GF_REGION_ALTMAP) {
- gf->multiply_region.w32 = gf_w16_composite_multiply_region_alt;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_composite_multiply_region_alt)
} else {
- gf->multiply_region.w32 = gf_w16_composite_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_composite_multiply_region)
}
if (cd->mult_table == NULL) {
- gf->multiply.w32 = gf_w16_composite_multiply_recursive;
+ SET_FUNCTION(gf,multiply,w32,gf_w16_composite_multiply_recursive)
} else {
- gf->multiply.w32 = gf_w16_composite_multiply_inline;
+ SET_FUNCTION(gf,multiply,w32,gf_w16_composite_multiply_inline)
}
- gf->divide.w32 = NULL;
- gf->inverse.w32 = gf_w16_composite_inverse;
+ SET_FUNCTION(gf,divide,w32,NULL)
+ SET_FUNCTION(gf,inverse,w32,gf_w16_composite_inverse)
return 1;
}
@@ -2277,10 +2274,10 @@ int gf_w16_group_init(gf_t *gf)
d44->reduce[p>>16] = (p&0xffff);
}
- gf->multiply.w32 = gf_w16_group_4_4_multiply;
- gf->divide.w32 = NULL;
- gf->inverse.w32 = NULL;
- gf->multiply_region.w32 = gf_w16_group_4_4_region_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w16_group_4_4_multiply)
+ SET_FUNCTION(gf,divide,w32,NULL)
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_group_4_4_region_multiply)
return 1;
}
@@ -2360,10 +2357,10 @@ int gf_w16_init(gf_t *gf)
if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly |= (1 << 16);
- gf->multiply.w32 = NULL;
- gf->divide.w32 = NULL;
- gf->inverse.w32 = NULL;
- gf->multiply_region.w32 = NULL;
+ SET_FUNCTION(gf,multiply,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,NULL)
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,multiply_region,w32,NULL)
switch(h->mult_type) {
case GF_MULT_LOG_ZERO: if (gf_w16_log_zero_init(gf) == 0) return 0; break;
@@ -2380,34 +2377,34 @@ int gf_w16_init(gf_t *gf)
default: return 0;
}
if (h->divide_type == GF_DIVIDE_EUCLID) {
- gf->divide.w32 = gf_w16_divide_from_inverse;
- gf->inverse.w32 = gf_w16_euclid;
+ SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w32,gf_w16_euclid)
} else if (h->divide_type == GF_DIVIDE_MATRIX) {
- gf->divide.w32 = gf_w16_divide_from_inverse;
- gf->inverse.w32 = gf_w16_matrix;
+ SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w32,gf_w16_matrix)
}
if (gf->divide.w32 == NULL) {
- gf->divide.w32 = gf_w16_divide_from_inverse;
- if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_euclid;
+ SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse)
+ if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w16_euclid)
}
- if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_inverse_from_divide;
+ if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w16_inverse_from_divide)
if (h->region_type & GF_REGION_ALTMAP) {
if (h->mult_type == GF_MULT_COMPOSITE) {
- gf->extract_word.w32 = gf_w16_composite_extract_word;
+ SET_FUNCTION(gf,extract_word,w32,gf_w16_composite_extract_word)
} else {
- gf->extract_word.w32 = gf_w16_split_extract_word;
+ SET_FUNCTION(gf,extract_word,w32,gf_w16_split_extract_word)
}
} else if (h->region_type == GF_REGION_CAUCHY) {
- gf->multiply_region.w32 = gf_wgen_cauchy_region;
- gf->extract_word.w32 = gf_wgen_extract_word;
+ SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
+ SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
} else {
- gf->extract_word.w32 = gf_w16_extract_word;
+ SET_FUNCTION(gf,extract_word,w32,gf_w16_extract_word)
}
if (gf->multiply_region.w32 == NULL) {
- gf->multiply_region.w32 = gf_w16_multiply_region_from_single;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_multiply_region_from_single)
}
return 1;
}
diff --git a/src/gf_w32.c b/src/gf_w32.c
index 854a6e4..bb22894 100644
--- a/src/gf_w32.c
+++ b/src/gf_w32.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w32.h"
+#include "gf_cpu.h"
#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
@@ -347,6 +348,8 @@ uint32_t gf_w32_matrix (gf_t *gf, uint32_t b)
extra memory.
*/
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_32_t
@@ -354,8 +357,6 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i w;
@@ -378,9 +379,9 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
#if defined(INTEL_SSE4_PCLMUL)
@@ -435,6 +436,8 @@ gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32
#endif
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_32_t
@@ -442,8 +445,6 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -476,9 +477,11 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
@@ -487,8 +490,6 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -515,9 +516,11 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
@@ -526,8 +529,6 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -556,9 +557,9 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
static
@@ -589,33 +590,35 @@ gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
static
int gf_w32_cfmgk_init(gf_t *gf)
{
- gf->inverse.w32 = gf_w32_euclid;
- gf->multiply_region.w32 = gf_w32_multiply_region_from_single;
+ SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
- h = (gf_internal_t *) gf->scratch;
- gf->multiply.w32 = gf_w32_cfmgk_multiply;
- gf->multiply_region.w32 = gf_w32_cfmgk_multiply_region_from_single;
+ h = (gf_internal_t *) gf->scratch;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single)
- uint64_t *q_plus = (uint64_t *) h->private;
- uint64_t *g_star = (uint64_t *) h->private + 1;
+ uint64_t *q_plus = (uint64_t *) h->private;
+ uint64_t *g_star = (uint64_t *) h->private + 1;
- uint64_t tmp = h->prim_poly << 32;
- *q_plus = 1ULL << 32;
+ uint64_t tmp = h->prim_poly << 32;
+ *q_plus = 1ULL << 32;
- int i;
- for(i = 63; i >= 32; i--)
- if((1ULL << i) & tmp)
- {
- *q_plus |= 1ULL << (i-32);
- tmp ^= h->prim_poly << (i-32);
- }
+ int i;
+ for(i = 63; i >= 32; i--)
+ if((1ULL << i) & tmp)
+ {
+ *q_plus |= 1ULL << (i-32);
+ tmp ^= h->prim_poly << (i-32);
+ }
- *g_star = h->prim_poly & ((1ULL << 32) - 1);
+ *g_star = h->prim_poly & ((1ULL << 32) - 1);
- return 1;
+ return 1;
+ }
#endif
return 0;
@@ -624,30 +627,32 @@ int gf_w32_cfmgk_init(gf_t *gf)
static
int gf_w32_cfm_init(gf_t *gf)
{
- gf->inverse.w32 = gf_w32_euclid;
- gf->multiply_region.w32 = gf_w32_multiply_region_from_single;
+ SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
/*Ben: We also check to see if the prim poly will work for pclmul */
/*Ben: Check to see how many reduction steps it will take*/
#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
-
- h = (gf_internal_t *) gf->scratch;
-
- if ((0xfffe0000 & h->prim_poly) == 0){
- gf->multiply.w32 = gf_w32_clm_multiply_2;
- gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_2;
- }else if ((0xffc00000 & h->prim_poly) == 0){
- gf->multiply.w32 = gf_w32_clm_multiply_3;
- gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_3;
- }else if ((0xfe000000 & h->prim_poly) == 0){
- gf->multiply.w32 = gf_w32_clm_multiply_4;
- gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_4;
- } else {
- return 0;
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
+
+ h = (gf_internal_t *) gf->scratch;
+
+ if ((0xfffe0000 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2)
+ }else if ((0xffc00000 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3)
+ }else if ((0xfe000000 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4)
+ } else {
+ return 0;
+ }
+ return 1;
}
- return 1;
#endif
return 0;
@@ -656,9 +661,9 @@ int gf_w32_cfm_init(gf_t *gf)
static
int gf_w32_shift_init(gf_t *gf)
{
- gf->inverse.w32 = gf_w32_euclid;
- gf->multiply_region.w32 = gf_w32_multiply_region_from_single;
- gf->multiply.w32 = gf_w32_shift_multiply;
+ SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
+ SET_FUNCTION(gf,multiply,w32,gf_w32_shift_multiply)
return 1;
}
@@ -1380,32 +1385,34 @@ int gf_w32_bytwo_init(gf_t *gf)
}
if (h->mult_type == GF_MULT_BYTWO_p) {
- gf->multiply.w32 = gf_w32_bytwo_p_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region;
- else
- gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region;
- #else
- gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region;
- if(h->region_type & GF_REGION_SIMD)
- return 0;
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_sse_multiply_region)
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
+ if(h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
- gf->multiply.w32 = gf_w32_bytwo_b_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region;
- else
- gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region;
- #else
- gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region;
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_sse_multiply_region)
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
- gf->inverse.w32 = gf_w32_euclid;
+ SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
return 1;
}
@@ -1755,11 +1762,11 @@ gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t
gf_do_final_region_alignment(&rd);
}
+#ifdef INTEL_SSSE3
static
void
gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSSE3
gf_internal_t *h;
int i, j, k;
uint32_t pp, v, *s32, *d32, *top;
@@ -1942,16 +1949,15 @@ gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
}
gf_do_final_region_alignment(&rd);
-
-#endif
}
+#endif
+#ifdef INTEL_SSSE3
static
void
gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSSE3
gf_internal_t *h;
int i, j, k;
uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
@@ -2216,9 +2222,8 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint
}
}
gf_do_final_region_alignment(&rd);
-
-#endif
}
+#endif
static
int gf_w32_split_init(gf_t *gf)
@@ -2230,29 +2235,13 @@ int gf_w32_split_init(gf_t *gf)
struct gf_split_8_32_lazy_data *d32;
struct gf_split_16_32_lazy_data *d16;
uint32_t p, basep;
- int i, j, exp, ispclmul, issse3;
- int isneon = 0;
-
-#if defined(INTEL_SSE4_PCLMUL)
- ispclmul = 1;
-#else
- ispclmul = 0;
-#endif
-
-#ifdef INTEL_SSSE3
- issse3 = 1;
-#else
- issse3 = 0;
-#endif
-#ifdef ARM_NEON
- isneon = 1;
-#endif
+ int i, j, exp;
h = (gf_internal_t *) gf->scratch;
/* Defaults */
- gf->inverse.w32 = gf_w32_euclid;
+ SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
/* JSP: First handle single multiplication:
If args == 8, then we're doing split 8 8.
@@ -2261,17 +2250,19 @@ int gf_w32_split_init(gf_t *gf)
*/
if (h->arg1 == 8 && h->arg2 == 8) {
- gf->multiply.w32 = gf_w32_split_8_8_multiply;
- } else if (ispclmul) {
+ SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
+#if defined(INTEL_SSE4_PCLMUL)
+ } else if (gf_cpu_supports_intel_pclmul) {
if ((0xfffe0000 & h->prim_poly) == 0){
- gf->multiply.w32 = gf_w32_clm_multiply_2;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
} else if ((0xffc00000 & h->prim_poly) == 0){
- gf->multiply.w32 = gf_w32_clm_multiply_3;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
} else if ((0xfe000000 & h->prim_poly) == 0){
- gf->multiply.w32 = gf_w32_clm_multiply_4;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
}
+#endif
} else {
- gf->multiply.w32 = gf_w32_bytwo_p_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
}
/* Easy cases: 16/32 and 2/32 */
@@ -2279,7 +2270,7 @@ int gf_w32_split_init(gf_t *gf)
if ((h->arg1 == 16 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 16)) {
d16 = (struct gf_split_16_32_lazy_data *) h->private;
d16->last_value = 0;
- gf->multiply_region.w32 = gf_w32_split_16_32_lazy_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_16_32_lazy_multiply_region)
return 1;
}
@@ -2287,33 +2278,39 @@ int gf_w32_split_init(gf_t *gf)
ld2 = (struct gf_split_2_32_lazy_data *) h->private;
ld2->last_value = 0;
#ifdef INTEL_SSSE3
- if (!(h->region_type & GF_REGION_NOSIMD))
- gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region;
- else
- gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
- #else
- gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
- if(h->region_type & GF_REGION_SIMD) return 0;
+ if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_sse_multiply_region)
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_multiply_region)
+ if(h->region_type & GF_REGION_SIMD) return 0;
+ #ifdef INTEL_SSSE3
+ }
#endif
return 1;
}
/* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */
+
if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
- ((issse3 || isneon) && h->mult_type == GF_REGION_DEFAULT)) {
+ ((gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) && h->mult_type == GF_REGION_DEFAULT)) {
ld4 = (struct gf_split_4_32_lazy_data *) h->private;
ld4->last_value = 0;
- if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) {
- gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region;
- } else if (isneon) {
+ if ((h->region_type & GF_REGION_NOSIMD) || !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region)
+ } else if (gf_cpu_supports_arm_neon) {
#ifdef ARM_NEON
gf_w32_neon_split_init(gf);
#endif
} else if (h->region_type & GF_REGION_ALTMAP) {
- gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region;
+#ifdef INTEL_SSSE3
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_altmap_multiply_region)
+#endif
} else {
- gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region;
+#ifdef INTEL_SSSE3
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_multiply_region)
+#endif
}
return 1;
}
@@ -2324,7 +2321,7 @@ int gf_w32_split_init(gf_t *gf)
h->mult_type == GF_MULT_DEFAULT) {
d32 = (struct gf_split_8_32_lazy_data *) h->private;
d32->last_value = 0;
- gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region)
return 1;
}
@@ -2333,8 +2330,8 @@ int gf_w32_split_init(gf_t *gf)
if (h->arg1 == 8 && h->arg2 == 8) {
d8 = (struct gf_w32_split_8_8_data *) h->private;
d8->last_value = 0;
- gf->multiply.w32 = gf_w32_split_8_8_multiply;
- gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region)
basep = 1;
for (exp = 0; exp < 7; exp++) {
for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0;
@@ -2407,14 +2404,14 @@ int gf_w32_group_init(gf_t *gf)
}
if (g_s == g_r) {
- gf->multiply.w32 = gf_w32_group_s_equals_r_multiply;
- gf->multiply_region.w32 = gf_w32_group_s_equals_r_multiply_region;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_group_s_equals_r_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_s_equals_r_multiply_region)
} else {
- gf->multiply.w32 = gf_w32_group_multiply;
- gf->multiply_region.w32 = gf_w32_group_multiply_region;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_group_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_multiply_region)
}
- gf->divide.w32 = NULL;
- gf->inverse.w32 = gf_w32_euclid;
+ SET_FUNCTION(gf,divide,w32,NULL)
+ SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
return 1;
}
@@ -2666,18 +2663,18 @@ int gf_w32_composite_init(gf_t *gf)
cd->alog = gf_w16_get_mult_alog_table(h->base_gf);
if (h->region_type & GF_REGION_ALTMAP) {
- gf->multiply_region.w32 = gf_w32_composite_multiply_region_alt;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region_alt)
} else {
- gf->multiply_region.w32 = gf_w32_composite_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region)
}
if (cd->log == NULL) {
- gf->multiply.w32 = gf_w32_composite_multiply_recursive;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_recursive)
} else {
- gf->multiply.w32 = gf_w32_composite_multiply_inline;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_inline)
}
- gf->divide.w32 = NULL;
- gf->inverse.w32 = gf_w32_composite_inverse;
+ SET_FUNCTION(gf,divide,w32,NULL)
+ SET_FUNCTION(gf,inverse,w32,gf_w32_composite_inverse)
return 1;
}
@@ -2686,16 +2683,6 @@ int gf_w32_composite_init(gf_t *gf)
int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
- int issse3 = 0;
- int isneon = 0;
-
-#ifdef INTEL_SSSE3
- issse3 = 1;
-#endif
-#ifdef ARM_NEON
- isneon = 1;
-#endif
-
switch(mult_type)
{
case GF_MULT_BYTWO_p:
@@ -2720,7 +2707,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg
return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
}
if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) ||
- (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) {
+ (mult_type == GF_MULT_DEFAULT && !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))) {
return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
}
if ((arg1 == 4 && arg2 == 32) ||
@@ -2776,10 +2763,10 @@ int gf_w32_init(gf_t *gf)
if(h->mult_type != GF_MULT_COMPOSITE) h->prim_poly &= 0xffffffff;
- gf->multiply.w32 = NULL;
- gf->divide.w32 = NULL;
- gf->inverse.w32 = NULL;
- gf->multiply_region.w32 = NULL;
+ SET_FUNCTION(gf,multiply,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,NULL)
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,multiply_region,w32,NULL)
switch(h->mult_type) {
case GF_MULT_CARRY_FREE: if (gf_w32_cfm_init(gf) == 0) return 0; break;
@@ -2794,30 +2781,30 @@ int gf_w32_init(gf_t *gf)
default: return 0;
}
if (h->divide_type == GF_DIVIDE_EUCLID) {
- gf->divide.w32 = gf_w32_divide_from_inverse;
- gf->inverse.w32 = gf_w32_euclid;
+ SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
} else if (h->divide_type == GF_DIVIDE_MATRIX) {
- gf->divide.w32 = gf_w32_divide_from_inverse;
- gf->inverse.w32 = gf_w32_matrix;
+ SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w32,gf_w32_matrix)
}
if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
- gf->divide.w32 = gf_w32_divide_from_inverse;
+ SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
}
if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
- gf->inverse.w32 = gf_w32_inverse_from_divide;
+ SET_FUNCTION(gf,inverse,w32,gf_w32_inverse_from_divide)
}
if (h->region_type == GF_REGION_CAUCHY) {
- gf->extract_word.w32 = gf_wgen_extract_word;
- gf->multiply_region.w32 = gf_wgen_cauchy_region;
+ SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
+ SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
} else if (h->region_type & GF_REGION_ALTMAP) {
if (h->mult_type == GF_MULT_COMPOSITE) {
- gf->extract_word.w32 = gf_w32_composite_extract_word;
+ SET_FUNCTION(gf,extract_word,w32,gf_w32_composite_extract_word)
} else {
- gf->extract_word.w32 = gf_w32_split_extract_word;
+ SET_FUNCTION(gf,extract_word,w32,gf_w32_split_extract_word)
}
} else {
- gf->extract_word.w32 = gf_w32_extract_word;
+ SET_FUNCTION(gf,extract_word,w32,gf_w32_extract_word)
}
return 1;
}
diff --git a/src/gf_w4.c b/src/gf_w4.c
index 0e86aa8..3a7b953 100644
--- a/src/gf_w4.c
+++ b/src/gf_w4.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w4.h"
+#include "gf_cpu.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@@ -134,6 +135,7 @@ gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
/* Ben: This function works, but it is 33% slower than the normal shift mult */
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -141,8 +143,6 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -173,9 +173,9 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
static
void
@@ -311,10 +311,10 @@ int gf_w4_log_init(gf_t *gf)
return 0;
}
- gf->inverse.w32 = gf_w4_inverse_from_divide;
- gf->divide.w32 = gf_w4_log_divide;
- gf->multiply.w32 = gf_w4_log_multiply;
- gf->multiply_region.w32 = gf_w4_log_multiply_region;
+ SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide)
+ SET_FUNCTION(gf,divide,w32,gf_w4_log_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w4_log_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_log_multiply_region)
return 1;
}
@@ -444,21 +444,22 @@ int gf_w4_single_table_init(gf_t *gf)
}
}
- gf->inverse.w32 = NULL;
- gf->divide.w32 = gf_w4_single_table_divide;
- gf->multiply.w32 = gf_w4_single_table_multiply;
- #if defined(INTEL_SSSE3) || defined(ARM_NEON)
- if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))
- gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
- else
- #if defined(INTEL_SSSE3)
- gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
- #elif defined(ARM_NEON)
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,gf_w4_single_table_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w4_single_table_multiply)
+ #if defined(INTEL_SSSE3)
+ if (gf_cpu_supports_intel_ssse3 && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_sse_multiply_region)
+ } else {
+ #elif defined(ARM_NEON)
+ if (gf_cpu_supports_arm_neon && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
gf_w4_neon_single_table_init(gf);
- #endif
- #else
- gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
- if (h->region_type & GF_REGION_SIMD) return 0;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
+ if (h->region_type & GF_REGION_SIMD) return 0;
+ #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ }
#endif
return 1;
@@ -548,10 +549,10 @@ int gf_w4_double_table_init(gf_t *gf)
}
}
- gf->inverse.w32 = NULL;
- gf->divide.w32 = gf_w4_double_table_divide;
- gf->multiply.w32 = gf_w4_double_table_multiply;
- gf->multiply_region.w32 = gf_w4_double_table_multiply_region;
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,gf_w4_double_table_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w4_double_table_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_double_table_multiply_region)
return 1;
}
@@ -682,10 +683,10 @@ int gf_w4_quad_table_init(gf_t *gf)
}
}
- gf->inverse.w32 = NULL;
- gf->divide.w32 = gf_w4_quad_table_divide;
- gf->multiply.w32 = gf_w4_quad_table_multiply;
- gf->multiply_region.w32 = gf_w4_quad_table_multiply_region;
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region)
return 1;
}
static
@@ -724,10 +725,10 @@ int gf_w4_quad_table_lazy_init(gf_t *gf)
}
}
- gf->inverse.w32 = NULL;
- gf->divide.w32 = gf_w4_quad_table_lazy_divide;
- gf->multiply.w32 = gf_w4_quad_table_lazy_multiply;
- gf->multiply_region.w32 = gf_w4_quad_table_multiply_region;
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_lazy_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_lazy_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region)
return 1;
}
@@ -736,16 +737,13 @@ int gf_w4_table_init(gf_t *gf)
{
int rt;
gf_internal_t *h;
- int simd = 0;
-
-#if defined(INTEL_SSSE3) || defined(ARM_NEON)
- simd = 1;
-#endif
h = (gf_internal_t *) gf->scratch;
rt = (h->region_type);
- if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE;
+ if (h->mult_type == GF_MULT_DEFAULT &&
+ !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))
+ rt |= GF_REGION_DOUBLE_TABLE;
if (rt & GF_REGION_DOUBLE_TABLE) {
return gf_w4_double_table_init(gf);
@@ -929,11 +927,11 @@ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
#endif
/*
+#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE2
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
struct gf_bytwo_data *btd;
@@ -990,8 +988,8 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
}
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
*/
#ifdef INTEL_SSE2
@@ -1865,28 +1863,30 @@ int gf_w4_bytwo_init(gf_t *gf)
}
if (h->mult_type == GF_MULT_BYTWO_p) {
- gf->multiply.w32 = gf_w4_bytwo_p_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
- else
- gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region;
- #else
- gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
- if (h->region_type & GF_REGION_SIMD)
- return 0;
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_sse_multiply_region)
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
+ if (h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
- gf->multiply.w32 = gf_w4_bytwo_b_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
- else
- gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region;
- #else
- gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
- if (h->region_type & GF_REGION_SIMD)
- return 0;
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_sse_multiply_region)
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
+ if (h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
return 1;
@@ -1897,10 +1897,14 @@ static
int gf_w4_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
- gf->multiply.w32 = gf_w4_clm_multiply;
- return 1;
+ if (gf_cpu_supports_intel_pclmul) {
+ SET_FUNCTION(gf,multiply,w32,gf_w4_clm_multiply)
+ return 1;
+ }
#elif defined(ARM_NEON)
- return gf_w4_neon_cfm_init(gf);
+ if (gf_cpu_supports_arm_neon) {
+ return gf_w4_neon_cfm_init(gf);
+ }
#endif
return 0;
}
@@ -1908,7 +1912,7 @@ int gf_w4_cfm_init(gf_t *gf)
static
int gf_w4_shift_init(gf_t *gf)
{
- gf->multiply.w32 = gf_w4_shift_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w4_shift_multiply)
return 1;
}
@@ -1917,15 +1921,6 @@ int gf_w4_shift_init(gf_t *gf)
int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
- int issse3 = 0, isneon = 0;
-
-#ifdef INTEL_SSSE3
- issse3 = 1;
-#endif
-#ifdef ARM_NEON
- isneon = 1;
-#endif
-
switch(mult_type)
{
case GF_MULT_BYTWO_p:
@@ -1938,7 +1933,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1
return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
}
- if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))
+ if (mult_type == GF_MULT_DEFAULT &&
+ !(gf_cpu_supports_arm_neon || gf_cpu_supports_intel_ssse3))
region_type = GF_REGION_DOUBLE_TABLE;
if (region_type & GF_REGION_DOUBLE_TABLE) {
@@ -1977,11 +1973,11 @@ gf_w4_init (gf_t *gf)
h = (gf_internal_t *) gf->scratch;
if (h->prim_poly == 0) h->prim_poly = 0x13;
h->prim_poly |= 0x10;
- gf->multiply.w32 = NULL;
- gf->divide.w32 = NULL;
- gf->inverse.w32 = NULL;
- gf->multiply_region.w32 = NULL;
- gf->extract_word.w32 = gf_w4_extract_word;
+ SET_FUNCTION(gf,multiply,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,NULL)
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,multiply_region,w32,NULL)
+ SET_FUNCTION(gf,extract_word,w32,gf_w4_extract_word)
switch(h->mult_type) {
case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break;
@@ -1995,27 +1991,27 @@ gf_w4_init (gf_t *gf)
}
if (h->divide_type == GF_DIVIDE_EUCLID) {
- gf->divide.w32 = gf_w4_divide_from_inverse;
- gf->inverse.w32 = gf_w4_euclid;
+ SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w32,gf_w4_euclid)
} else if (h->divide_type == GF_DIVIDE_MATRIX) {
- gf->divide.w32 = gf_w4_divide_from_inverse;
- gf->inverse.w32 = gf_w4_matrix;
+ SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w32,gf_w4_matrix)
}
if (gf->divide.w32 == NULL) {
- gf->divide.w32 = gf_w4_divide_from_inverse;
- if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_euclid;
+ SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
+ if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w4_euclid)
}
- if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_inverse_from_divide;
+ if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide)
if (h->region_type == GF_REGION_CAUCHY) {
- gf->multiply_region.w32 = gf_wgen_cauchy_region;
- gf->extract_word.w32 = gf_wgen_extract_word;
+ SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
+ SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
}
if (gf->multiply_region.w32 == NULL) {
- gf->multiply_region.w32 = gf_w4_multiply_region_from_single;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_multiply_region_from_single)
}
return 1;
diff --git a/src/gf_w64.c b/src/gf_w64.c
index eae31e6..69e55db 100644
--- a/src/gf_w64.c
+++ b/src/gf_w64.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w64.h"
+#include "gf_cpu.h"
static
inline
@@ -338,6 +339,8 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
* ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
*/
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_64_t
@@ -345,8 +348,6 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
gf_val_64_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -376,10 +377,12 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
result = _mm_xor_si128 (result, w);
rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_64_t
@@ -387,8 +390,6 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
gf_val_64_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -418,15 +419,15 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
result = _mm_xor_si128 (result, w);
rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
void
gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
-#if defined(INTEL_SSE4_PCLMUL)
gf_internal_t *h;
uint8_t *s8, *d8, *dtop;
gf_region_data rd;
@@ -504,8 +505,8 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by
}
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
void
gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
@@ -697,33 +698,35 @@ gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_
static
int gf_w64_shift_init(gf_t *gf)
{
- gf->multiply.w64 = gf_w64_shift_multiply;
- gf->inverse.w64 = gf_w64_euclid;
- gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
+ SET_FUNCTION(gf,multiply,w64,gf_w64_shift_multiply)
+ SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single)
return 1;
}
static
int gf_w64_cfm_init(gf_t *gf)
{
- gf->inverse.w64 = gf_w64_euclid;
- gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
+ SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single)
-#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
+#if defined(INTEL_SSE4_PCLMUL)
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
- h = (gf_internal_t *) gf->scratch;
+ h = (gf_internal_t *) gf->scratch;
- if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
- gf->multiply.w64 = gf_w64_clm_multiply_2;
- gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2;
- }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
- gf->multiply.w64 = gf_w64_clm_multiply_4;
- gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4;
- } else {
- return 0;
+ if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
+ }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
+ } else {
+ return 0;
+ }
+ return 1;
}
- return 1;
#endif
return 0;
@@ -1008,14 +1011,14 @@ int gf_w64_group_init(gf_t *gf)
}
if (g_s == g_r) {
- gf->multiply.w64 = gf_w64_group_s_equals_r_multiply;
- gf->multiply_region.w64 = gf_w64_group_s_equals_r_multiply_region;
+ SET_FUNCTION(gf,multiply,w64,gf_w64_group_s_equals_r_multiply)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_group_s_equals_r_multiply_region)
} else {
- gf->multiply.w64 = gf_w64_group_multiply;
- gf->multiply_region.w64 = gf_w64_group_multiply_region;
+ SET_FUNCTION(gf,multiply,w64,gf_w64_group_multiply)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_group_multiply_region)
}
- gf->divide.w64 = NULL;
- gf->inverse.w64 = gf_w64_euclid;
+ SET_FUNCTION(gf,divide,w64,NULL)
+ SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
return 1;
}
@@ -1261,9 +1264,9 @@ gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_
v = _mm_srli_epi64(v, 1); }
+#ifdef INTEL_SSE2
void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE2
int i;
uint8_t *s8, *d8;
uint64_t vrev, one64;
@@ -1322,8 +1325,8 @@ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_
s8 += 16;
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
#ifdef INTEL_SSE2
static
@@ -1455,31 +1458,33 @@ int gf_w64_bytwo_init(gf_t *gf)
h = (gf_internal_t *) gf->scratch;
if (h->mult_type == GF_MULT_BYTWO_p) {
- gf->multiply.w64 = gf_w64_bytwo_p_multiply;
+ SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region;
- else
- gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region;
- #else
- gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region;
- if(h->region_type & GF_REGION_SIMD)
- return 0;
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_sse_multiply_region)
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
+ if(h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
- gf->multiply.w64 = gf_w64_bytwo_b_multiply;
+ SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region;
- else
- gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region;
- #else
- gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region;
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_sse_multiply_region)
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
- gf->inverse.w64 = gf_w64_euclid;
+ SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
return 1;
}
@@ -1653,14 +1658,14 @@ int gf_w64_composite_init(gf_t *gf)
gf_internal_t *h = (gf_internal_t *) gf->scratch;
if (h->region_type & GF_REGION_ALTMAP) {
- gf->multiply_region.w64 = gf_w64_composite_multiply_region_alt;
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_composite_multiply_region_alt)
} else {
- gf->multiply_region.w64 = gf_w64_composite_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_composite_multiply_region)
}
- gf->multiply.w64 = gf_w64_composite_multiply;
- gf->divide.w64 = NULL;
- gf->inverse.w64 = gf_w64_composite_inverse;
+ SET_FUNCTION(gf,multiply,w64,gf_w64_composite_multiply)
+ SET_FUNCTION(gf,divide,w64,NULL)
+ SET_FUNCTION(gf,inverse,w64,gf_w64_composite_inverse)
return 1;
}
@@ -1970,49 +1975,55 @@ int gf_w64_split_init(gf_t *gf)
/* Defaults */
- gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single)
- gf->multiply.w64 = gf_w64_bytwo_p_multiply;
+ SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply)
#if defined(INTEL_SSE4_PCLMUL)
- if ((!(h->region_type & GF_REGION_NOSIMD) &&
- (h->arg1 == 64 || h->arg2 == 64)) ||
- h->mult_type == GF_MULT_DEFAULT){
-
- if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
- gf->multiply.w64 = gf_w64_clm_multiply_2;
- gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2;
- }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
- gf->multiply.w64 = gf_w64_clm_multiply_4;
- gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4;
- }else{
- return 0;
+ if (gf_cpu_supports_intel_pclmul) {
+ if ((!(h->region_type & GF_REGION_NOSIMD) &&
+ (h->arg1 == 64 || h->arg2 == 64)) ||
+ h->mult_type == GF_MULT_DEFAULT){
+
+ if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
+ }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
+ }else{
+ return 0;
+ }
}
}
#endif
- gf->inverse.w64 = gf_w64_euclid;
+ SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
/* Allen: set region pointers for default mult type. Single pointers are
* taken care of above (explicitly for sse, implicitly for no sse). */
-#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
if (h->mult_type == GF_MULT_DEFAULT) {
- d4 = (struct gf_split_4_64_lazy_data *) h->private;
- d4->last_value = 0;
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
+ d4 = (struct gf_split_4_64_lazy_data *) h->private;
+ d4->last_value = 0;
#if defined(INTEL_SSE4)
- gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
+ if (gf_cpu_supports_intel_sse4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
#elif defined(ARCH_AARCH64)
- gf_w64_neon_split_init(gf);
+ if (gf_cpu_supports_arm_neon)
+ gf_w64_neon_split_init(gf);
#endif
- }
-#else
- if (h->mult_type == GF_MULT_DEFAULT) {
- d8 = (struct gf_split_8_64_lazy_data *) h->private;
- d8->last_value = 0;
- gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region;
- }
+ } else {
#endif
+ d8 = (struct gf_split_8_64_lazy_data *) h->private;
+ d8->last_value = 0;
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ }
+#endif
+ }
if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) {
d4 = (struct gf_split_4_64_lazy_data *) h->private;
@@ -2022,44 +2033,51 @@ int gf_w64_split_init(gf_t *gf)
if(h->region_type & GF_REGION_ALTMAP)
{
#ifdef INTEL_SSSE3
- gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region;
+ if (gf_cpu_supports_intel_ssse3) {
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region)
+ } else
#elif defined(ARCH_AARCH64)
- gf_w64_neon_split_init(gf);
- #else
- return 0;
+ if (gf_cpu_supports_arm_neon) {
+ gf_w64_neon_split_init(gf);
+ } else
#endif
+ return 0;
}
else //no altmap
{
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
- if(h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
- else
- #if defined(INTEL_SSE4)
- gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
- #elif defined(ARCH_AARCH64)
- gf_w64_neon_split_init(gf);
- #endif
- #else
- gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
+ if(gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
+ if (h->region_type & GF_REGION_NOSIMD) {
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
+ } else
+ #if defined(INTEL_SSE4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
+ #elif defined(ARCH_AARCH64)
+ gf_w64_neon_split_init(gf);
+ #endif
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ }
#endif
}
}
if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) {
d8 = (struct gf_split_8_64_lazy_data *) h->private;
d8->last_value = 0;
- gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
}
if ((h->arg1 == 16 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 16)) {
d16 = (struct gf_split_16_64_lazy_data *) h->private;
d16->last_value = 0;
- gf->multiply_region.w64 = gf_w64_split_16_64_lazy_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_16_64_lazy_multiply_region)
}
if ((h->arg1 == 8 && h->arg2 == 8)) {
d88 = (struct gf_split_8_8_data *) h->private;
- gf->multiply.w64 = gf_w64_split_8_8_multiply;
+ SET_FUNCTION(gf,multiply,w64,gf_w64_split_8_8_multiply)
/* The performance of this guy sucks, so don't bother with a region op */
@@ -2114,11 +2132,15 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg
* then fall through to split table scratch size code. */
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
arg1 = 64;
arg2 = 4;
-#else
+ } else {
+#endif
arg1 = 64;
arg2 = 8;
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ }
#endif
case GF_MULT_SPLIT_TABLE:
@@ -2169,10 +2191,10 @@ int gf_w64_init(gf_t *gf)
}
}
- gf->multiply.w64 = NULL;
- gf->divide.w64 = NULL;
- gf->inverse.w64 = NULL;
- gf->multiply_region.w64 = NULL;
+ SET_FUNCTION(gf,multiply,w64,NULL)
+ SET_FUNCTION(gf,divide,w64,NULL)
+ SET_FUNCTION(gf,inverse,w64,NULL)
+ SET_FUNCTION(gf,multiply_region,w64,NULL)
switch(h->mult_type) {
case GF_MULT_CARRY_FREE: if (gf_w64_cfm_init(gf) == 0) return 0; break;
@@ -2186,27 +2208,27 @@ int gf_w64_init(gf_t *gf)
default: return 0;
}
if (h->divide_type == GF_DIVIDE_EUCLID) {
- gf->divide.w64 = gf_w64_divide_from_inverse;
- gf->inverse.w64 = gf_w64_euclid;
+ SET_FUNCTION(gf,divide,w64,gf_w64_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
}
if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) {
- gf->divide.w64 = gf_w64_divide_from_inverse;
+ SET_FUNCTION(gf,divide,w64,gf_w64_divide_from_inverse)
}
if (gf->inverse.w64 == NULL && gf->divide.w64 != NULL) {
- gf->inverse.w64 = gf_w64_inverse_from_divide;
+ SET_FUNCTION(gf,inverse,w64,gf_w64_inverse_from_divide)
}
if (h->region_type == GF_REGION_CAUCHY) return 0;
if (h->region_type & GF_REGION_ALTMAP) {
if (h->mult_type == GF_MULT_COMPOSITE) {
- gf->extract_word.w64 = gf_w64_composite_extract_word;
+ SET_FUNCTION(gf,extract_word,w64,gf_w64_composite_extract_word)
} else if (h->mult_type == GF_MULT_SPLIT_TABLE) {
- gf->extract_word.w64 = gf_w64_split_extract_word;
+ SET_FUNCTION(gf,extract_word,w64,gf_w64_split_extract_word)
}
} else {
- gf->extract_word.w64 = gf_w64_extract_word;
+ SET_FUNCTION(gf,extract_word,w64,gf_w64_extract_word)
}
return 1;
diff --git a/src/gf_w8.c b/src/gf_w8.c
index 276799f..f647a31 100644
--- a/src/gf_w8.c
+++ b/src/gf_w8.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
+#include "gf_cpu.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@@ -127,6 +128,7 @@ uint32_t gf_w8_matrix (gf_t *gf, uint32_t b)
}
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -134,8 +136,6 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -169,10 +169,11 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -180,8 +181,6 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -208,10 +207,11 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -219,8 +219,6 @@ gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -248,9 +246,9 @@ gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
static
@@ -509,25 +507,29 @@ static
int gf_w8_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
-
- h = (gf_internal_t *) gf->scratch;
-
- if ((0xe0 & h->prim_poly) == 0){
- gf->multiply.w32 = gf_w8_clm_multiply_2;
- gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_2;
- }else if ((0xc0 & h->prim_poly) == 0){
- gf->multiply.w32 = gf_w8_clm_multiply_3;
- gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_3;
- }else if ((0x80 & h->prim_poly) == 0){
- gf->multiply.w32 = gf_w8_clm_multiply_4;
- gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_4;
- }else{
- return 0;
- }
- return 1;
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
+
+ h = (gf_internal_t *) gf->scratch;
+
+ if ((0xe0 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_2)
+ }else if ((0xc0 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_3)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_3)
+ }else if ((0x80 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_4)
+ }else{
+ return 0;
+ }
+ return 1;
+ }
#elif defined(ARM_NEON)
- return gf_w8_neon_cfm_init(gf);
+ if (gf_cpu_supports_arm_neon) {
+ return gf_w8_neon_cfm_init(gf);
+ }
#endif
return 0;
@@ -537,7 +539,7 @@ int gf_w8_cfm_init(gf_t *gf)
static
int gf_w8_shift_init(gf_t *gf)
{
- gf->multiply.w32 = gf_w8_shift_multiply; /* The others will be set automatically */
+ SET_FUNCTION(gf,multiply,w32,gf_w8_shift_multiply) /* The others will be set automatically */
return 1;
}
@@ -809,20 +811,20 @@ int gf_w8_log_init(gf_t *gf)
} while (i != 1);
if (h->mult_type == GF_MULT_LOG_TABLE) {
- gf->inverse.w32 = gf_w8_log_inverse;
- gf->divide.w32 = gf_w8_log_divide;
- gf->multiply.w32 = gf_w8_log_multiply;
- gf->multiply_region.w32 = gf_w8_log_multiply_region;
+ SET_FUNCTION(gf,inverse,w32,gf_w8_log_inverse)
+ SET_FUNCTION(gf,divide,w32,gf_w8_log_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w8_log_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_log_multiply_region)
} else if (h->mult_type == GF_MULT_LOG_ZERO) {
- gf->inverse.w32 = gf_w8_logzero_small_inverse;
- gf->divide.w32 = gf_w8_logzero_small_divide;
- gf->multiply.w32 = gf_w8_logzero_small_multiply;
- gf->multiply_region.w32 = gf_w8_logzero_multiply_region;
+ SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_small_inverse)
+ SET_FUNCTION(gf,divide,w32,gf_w8_logzero_small_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_small_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region)
} else {
- gf->inverse.w32 = gf_w8_logzero_inverse;
- gf->divide.w32 = gf_w8_logzero_divide;
- gf->multiply.w32 = gf_w8_logzero_multiply;
- gf->multiply_region.w32 = gf_w8_logzero_multiply_region;
+ SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_inverse)
+ SET_FUNCTION(gf,divide,w32,gf_w8_logzero_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region)
}
return 1;
}
@@ -1102,21 +1104,22 @@ int gf_w8_split_init(gf_t *gf)
}
}
- gf->multiply.w32 = gf_w8_split_multiply;
-
- #if defined(INTEL_SSSE3) || defined(ARM_NEON)
- if (h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w32 = gf_w8_split_multiply_region;
- else
- #if defined(INTEL_SSSE3)
- gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
- #elif defined(ARM_NEON)
+ SET_FUNCTION(gf,multiply,w32,gf_w8_split_multiply)
+
+ #if defined(INTEL_SSSE3)
+ if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
+ } else {
+ #elif defined(ARM_NEON)
+ if (gf_cpu_supports_arm_neon && !(h->region_type & GF_REGION_NOSIMD)) {
gf_w8_neon_split_init(gf);
- #endif
- #else
- gf->multiply_region.w32 = gf_w8_split_multiply_region;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ }
#endif
return 1;
@@ -1134,17 +1137,12 @@ int gf_w8_table_init(gf_t *gf)
struct gf_w8_double_table_data *dtd = NULL;
struct gf_w8_double_table_lazy_data *ltd = NULL;
struct gf_w8_default_data *dd = NULL;
- int a, b, c, prod, scase, use_simd;
+ int a, b, c, prod, scase;
h = (gf_internal_t *) gf->scratch;
-#if defined(INTEL_SSSE3) || defined(ARM_NEON)
- use_simd = 1;
-#else
- use_simd = 0;
-#endif
-
- if (h->mult_type == GF_MULT_DEFAULT && use_simd) {
+ if (h->mult_type == GF_MULT_DEFAULT &&
+ (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
dd = (struct gf_w8_default_data *)h->private;
scase = 3;
bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
@@ -1201,32 +1199,38 @@ int gf_w8_table_init(gf_t *gf)
}
}
- gf->inverse.w32 = NULL; /* Will set from divide */
+ SET_FUNCTION(gf,inverse,w32,NULL) /* Will set from divide */
switch (scase) {
case 0:
- gf->divide.w32 = gf_w8_table_divide;
- gf->multiply.w32 = gf_w8_table_multiply;
- gf->multiply_region.w32 = gf_w8_table_multiply_region;
+ SET_FUNCTION(gf,divide,w32,gf_w8_table_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w8_table_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_table_multiply_region)
break;
case 1:
- gf->divide.w32 = gf_w8_double_table_divide;
- gf->multiply.w32 = gf_w8_double_table_multiply;
- gf->multiply_region.w32 = gf_w8_double_table_multiply_region;
+ SET_FUNCTION(gf,divide,w32,gf_w8_double_table_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region)
break;
case 2:
- gf->divide.w32 = gf_w8_double_table_lazy_divide;
- gf->multiply.w32 = gf_w8_double_table_lazy_multiply;
- gf->multiply_region.w32 = gf_w8_double_table_multiply_region;
+ SET_FUNCTION(gf,divide,w32,gf_w8_double_table_lazy_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_lazy_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region)
break;
case 3:
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
- gf->divide.w32 = gf_w8_default_divide;
- gf->multiply.w32 = gf_w8_default_multiply;
+ if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
+ SET_FUNCTION(gf,divide,w32,gf_w8_default_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w8_default_multiply)
#if defined(INTEL_SSSE3)
- gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
+ if (gf_cpu_supports_intel_ssse3) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
+ }
#elif defined(ARM_NEON)
- gf_w8_neon_split_init(gf);
+ if (gf_cpu_supports_arm_neon) {
+ gf_w8_neon_split_init(gf);
+ }
#endif
+ }
#endif
break;
}
@@ -1472,18 +1476,18 @@ int gf_w8_composite_init(gf_t *gf)
cd->mult_table = gf_w4_get_mult_table(h->base_gf);
if (h->region_type & GF_REGION_ALTMAP) {
- gf->multiply_region.w32 = gf_w8_composite_multiply_region_alt;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region_alt)
} else {
- gf->multiply_region.w32 = gf_w8_composite_multiply_region;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region)
}
if (cd->mult_table == NULL) {
- gf->multiply.w32 = gf_w8_composite_multiply_recursive;
+ SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_recursive)
} else {
- gf->multiply.w32 = gf_w8_composite_multiply_inline;
+ SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_inline)
}
- gf->divide.w32 = NULL;
- gf->inverse.w32 = gf_w8_composite_inverse;
+ SET_FUNCTION(gf,divide,w32,NULL)
+ SET_FUNCTION(gf,inverse,w32,gf_w8_composite_inverse)
return 1;
}
@@ -2190,28 +2194,30 @@ int gf_w8_bytwo_init(gf_t *gf)
}
if (h->mult_type == GF_MULT_BYTWO_p) {
- gf->multiply.w32 = gf_w8_bytwo_p_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region;
- else
- gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region;
-#else
- gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region;
- if(h->region_type & GF_REGION_SIMD)
- return 0;
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_sse_multiply_region)
+ } else {
+#endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
+ if(h->region_type & GF_REGION_SIMD)
+ return 0;
+#ifdef INTEL_SSE2
+ }
#endif
} else {
- gf->multiply.w32 = gf_w8_bytwo_b_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region;
- else
- gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region;
-#else
- gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region;
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_sse_multiply_region)
+ } else {
+#endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+#ifdef INTEL_SSE2
+ }
#endif
}
return 1;
@@ -2229,9 +2235,9 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1
switch(mult_type)
{
case GF_MULT_DEFAULT:
-#if defined(INTEL_SSSE3) || defined(ARM_NEON)
- return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
-#endif
+ if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
+ return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
+ }
return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
case GF_MULT_TABLE:
if (region_type == GF_REGION_CAUCHY) {
@@ -2304,11 +2310,11 @@ int gf_w8_init(gf_t *gf)
h->prim_poly |= 0x100;
}
- gf->multiply.w32 = NULL;
- gf->divide.w32 = NULL;
- gf->inverse.w32 = NULL;
- gf->multiply_region.w32 = NULL;
- gf->extract_word.w32 = gf_w8_extract_word;
+ SET_FUNCTION(gf,multiply,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,NULL)
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,multiply_region,w32,NULL)
+ SET_FUNCTION(gf,extract_word,w32,gf_w8_extract_word)
switch(h->mult_type) {
case GF_MULT_DEFAULT:
@@ -2326,31 +2332,31 @@ int gf_w8_init(gf_t *gf)
}
if (h->divide_type == GF_DIVIDE_EUCLID) {
- gf->divide.w32 = gf_w8_divide_from_inverse;
- gf->inverse.w32 = gf_w8_euclid;
+ SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w32,gf_w8_euclid)
} else if (h->divide_type == GF_DIVIDE_MATRIX) {
- gf->divide.w32 = gf_w8_divide_from_inverse;
- gf->inverse.w32 = gf_w8_matrix;
+ SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w32,gf_w8_matrix)
}
if (gf->divide.w32 == NULL) {
- gf->divide.w32 = gf_w8_divide_from_inverse;
- if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_euclid;
+ SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
+ if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w8_euclid)
}
- if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_inverse_from_divide;
+ if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w8_inverse_from_divide)
if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) {
- gf->extract_word.w32 = gf_w8_composite_extract_word;
+ SET_FUNCTION(gf,extract_word,w32,gf_w8_composite_extract_word)
}
if (h->region_type == GF_REGION_CAUCHY) {
- gf->multiply_region.w32 = gf_wgen_cauchy_region;
- gf->extract_word.w32 = gf_wgen_extract_word;
+ SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
+ SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
}
if (gf->multiply_region.w32 == NULL) {
- gf->multiply_region.w32 = gf_w8_multiply_region_from_single;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_multiply_region_from_single)
}
return 1;
diff --git a/src/gf_wgen.c b/src/gf_wgen.c
index ebc50a5..1e3d2e0 100644
--- a/src/gf_wgen.c
+++ b/src/gf_wgen.c
@@ -178,8 +178,8 @@ gf_wgen_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
static
int gf_wgen_shift_init(gf_t *gf)
{
- gf->multiply.w32 = gf_wgen_shift_multiply;
- gf->inverse.w32 = gf_wgen_euclid;
+ SET_FUNCTION(gf,multiply,w32,gf_wgen_shift_multiply)
+ SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid)
return 1;
}
@@ -211,8 +211,8 @@ gf_wgen_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
static
int gf_wgen_bytwo_b_init(gf_t *gf)
{
- gf->multiply.w32 = gf_wgen_bytwo_b_multiply;
- gf->inverse.w32 = gf_wgen_euclid;
+ SET_FUNCTION(gf,multiply,w32,gf_wgen_bytwo_b_multiply)
+ SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid)
return 1;
}
@@ -247,8 +247,8 @@ gf_wgen_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
static
int gf_wgen_bytwo_p_init(gf_t *gf)
{
- gf->multiply.w32 = gf_wgen_bytwo_p_multiply;
- gf->inverse.w32 = gf_wgen_euclid;
+ SET_FUNCTION(gf,multiply,w32,gf_wgen_bytwo_p_multiply)
+ SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid)
return 1;
}
@@ -453,12 +453,12 @@ int gf_wgen_group_init(gf_t *gf)
}
if (g_s == g_r) {
- gf->multiply.w32 = gf_wgen_group_s_equals_r_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_wgen_group_s_equals_r_multiply)
} else {
- gf->multiply.w32 = gf_wgen_group_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_wgen_group_multiply)
}
- gf->divide.w32 = NULL;
- gf->divide.w32 = NULL;
+ SET_FUNCTION(gf,divide,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,NULL)
return 1;
}
@@ -519,8 +519,8 @@ int gf_wgen_table_8_init(gf_t *gf)
}
}
- gf->multiply.w32 = gf_wgen_table_8_multiply;
- gf->divide.w32 = gf_wgen_table_8_divide;
+ SET_FUNCTION(gf,multiply,w32,gf_wgen_table_8_multiply)
+ SET_FUNCTION(gf,divide,w32,gf_wgen_table_8_divide)
return 1;
}
@@ -580,8 +580,8 @@ int gf_wgen_table_16_init(gf_t *gf)
}
}
- gf->multiply.w32 = gf_wgen_table_16_multiply;
- gf->divide.w32 = gf_wgen_table_16_divide;
+ SET_FUNCTION(gf,multiply,w32,gf_wgen_table_16_multiply)
+ SET_FUNCTION(gf,divide,w32,gf_wgen_table_16_divide)
return 1;
}
@@ -670,8 +670,8 @@ int gf_wgen_log_8_init(gf_t *gf)
return 0;
}
- gf->multiply.w32 = gf_wgen_log_8_multiply;
- gf->divide.w32 = gf_wgen_log_8_divide;
+ SET_FUNCTION(gf,multiply,w32,gf_wgen_log_8_multiply)
+ SET_FUNCTION(gf,divide,w32,gf_wgen_log_8_divide)
return 1;
}
@@ -746,8 +746,8 @@ int gf_wgen_log_16_init(gf_t *gf)
return 0;
}
- gf->multiply.w32 = gf_wgen_log_16_multiply;
- gf->divide.w32 = gf_wgen_log_16_divide;
+ SET_FUNCTION(gf,multiply,w32,gf_wgen_log_16_multiply)
+ SET_FUNCTION(gf,divide,w32,gf_wgen_log_16_divide)
return 1;
}
@@ -821,8 +821,8 @@ int gf_wgen_log_32_init(gf_t *gf)
return 0;
}
- gf->multiply.w32 = gf_wgen_log_32_multiply;
- gf->divide.w32 = gf_wgen_log_32_divide;
+ SET_FUNCTION(gf,multiply,w32,gf_wgen_log_32_multiply)
+ SET_FUNCTION(gf,divide,w32,gf_wgen_log_32_divide)
return 1;
}
@@ -975,11 +975,11 @@ int gf_wgen_init(gf_t *gf)
}
}
- gf->multiply.w32 = NULL;
- gf->divide.w32 = NULL;
- gf->inverse.w32 = NULL;
- gf->multiply_region.w32 = gf_wgen_cauchy_region;
- gf->extract_word.w32 = gf_wgen_extract_word;
+ SET_FUNCTION(gf,multiply,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,NULL)
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
+ SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
switch(h->mult_type) {
case GF_MULT_DEFAULT:
@@ -1000,20 +1000,20 @@ int gf_wgen_init(gf_t *gf)
default: return 0;
}
if (h->divide_type == GF_DIVIDE_EUCLID) {
- gf->divide.w32 = gf_wgen_divide_from_inverse;
- gf->inverse.w32 = gf_wgen_euclid;
+ SET_FUNCTION(gf,divide,w32,gf_wgen_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid)
} else if (h->divide_type == GF_DIVIDE_MATRIX) {
- gf->divide.w32 = gf_wgen_divide_from_inverse;
- gf->inverse.w32 = gf_wgen_matrix;
+ SET_FUNCTION(gf,divide,w32,gf_wgen_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w32,gf_wgen_matrix)
}
- if (gf->inverse.w32== NULL && gf->divide.w32 == NULL) gf->inverse.w32 = gf_wgen_euclid;
+ if (gf->inverse.w32== NULL && gf->divide.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid)
if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
- gf->divide.w32 = gf_wgen_divide_from_inverse;
+ SET_FUNCTION(gf,divide,w32,gf_wgen_divide_from_inverse)
}
if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
- gf->inverse.w32 = gf_wgen_inverse_from_divide;
+ SET_FUNCTION(gf,inverse,w32,gf_wgen_inverse_from_divide)
}
return 1;
}
diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c
index 2bd3f30..477ee63 100644
--- a/src/neon/gf_w16_neon.c
+++ b/src/neon/gf_w16_neon.c
@@ -270,7 +270,7 @@ void gf_w16_neon_split_init(gf_t *gf)
gf_internal_t *h = (gf_internal_t *) gf->scratch;
if (h->region_type & GF_REGION_ALTMAP)
- gf->multiply_region.w32 = gf_w16_split_4_16_lazy_altmap_multiply_region_neon;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_altmap_multiply_region_neon)
else
- gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region_neon;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region_neon)
}
diff --git a/src/neon/gf_w32_neon.c b/src/neon/gf_w32_neon.c
index 8231eb3..7fd1329 100644
--- a/src/neon/gf_w32_neon.c
+++ b/src/neon/gf_w32_neon.c
@@ -262,8 +262,8 @@ void gf_w32_neon_split_init(gf_t *gf)
gf_internal_t *h = (gf_internal_t *) gf->scratch;
if (h->region_type & GF_REGION_ALTMAP)
- gf->multiply_region.w32 = gf_w32_split_4_32_lazy_altmap_multiply_region_neon;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_altmap_multiply_region_neon)
else
- gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region_neon;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region_neon)
}
diff --git a/src/neon/gf_w4_neon.c b/src/neon/gf_w4_neon.c
index 3a21432..5f35c86 100644
--- a/src/neon/gf_w4_neon.c
+++ b/src/neon/gf_w4_neon.c
@@ -235,13 +235,13 @@ gf_w4_single_table_multiply_region_neon(gf_t *gf, void *src, void *dest,
int gf_w4_neon_cfm_init(gf_t *gf)
{
// single clm multiplication probably pointless
- gf->multiply.w32 = gf_w4_neon_clm_multiply;
- gf->multiply_region.w32 = gf_w4_neon_clm_multiply_region_from_single;
+ SET_FUNCTION(gf,multiply,w32,gf_w4_neon_clm_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_neon_clm_multiply_region_from_single)
return 1;
}
void gf_w4_neon_single_table_init(gf_t *gf)
{
- gf->multiply_region.w32 = gf_w4_single_table_multiply_region_neon;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region_neon)
}
diff --git a/src/neon/gf_w64_neon.c b/src/neon/gf_w64_neon.c
index 0eca9c7..2409823 100644
--- a/src/neon/gf_w64_neon.c
+++ b/src/neon/gf_w64_neon.c
@@ -326,8 +326,8 @@ void gf_w64_neon_split_init(gf_t *gf)
gf_internal_t *h = (gf_internal_t *) gf->scratch;
if (h->region_type & GF_REGION_ALTMAP)
- gf->multiply_region.w64 = gf_w64_split_4_64_lazy_altmap_multiply_region_neon;
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_altmap_multiply_region_neon)
else
- gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region_neon;
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region_neon)
}
diff --git a/src/neon/gf_w8_neon.c b/src/neon/gf_w8_neon.c
index 930a916..0cce5ba 100644
--- a/src/neon/gf_w8_neon.c
+++ b/src/neon/gf_w8_neon.c
@@ -188,14 +188,14 @@ int gf_w8_neon_cfm_init(gf_t *gf)
h = (gf_internal_t *) gf->scratch;
if ((0xe0 & h->prim_poly) == 0){
- gf->multiply.w32 = gf_w8_neon_clm_multiply_2;
- gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_2;
+ SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_2)
}else if ((0xc0 & h->prim_poly) == 0){
- gf->multiply.w32 = gf_w8_neon_clm_multiply_3;
- gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_3;
+ SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_3)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_3)
}else if ((0x80 & h->prim_poly) == 0){
- gf->multiply.w32 = gf_w8_neon_clm_multiply_4;
- gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_4;
+ SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_4)
}else{
return 0;
}
@@ -298,5 +298,5 @@ gf_w8_split_multiply_region_neon(gf_t *gf, void *src, void *dest, gf_val_32_t va
void gf_w8_neon_split_init(gf_t *gf)
{
- gf->multiply_region.w32 = gf_w8_split_multiply_region_neon;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_neon)
}
diff --git a/test/Makefile.am b/test/Makefile.am
index 2791528..f590ecc 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -1,7 +1,7 @@
# GF-Complete 'test' AM file
AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
-AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
+AM_CFLAGS = -O3 -fPIC
bin_PROGRAMS = gf_unit
diff --git a/tools/Makefile.am b/tools/Makefile.am
index a9dd8b9..4ca9131 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -1,7 +1,7 @@
# GF-Complete 'tools' AM file
AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
-AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
+AM_CFLAGS = -O3 -fPIC
bin_PROGRAMS = gf_mult gf_div gf_add gf_time gf_methods gf_poly gf_inline_time
diff --git a/tools/gf_methods.c b/tools/gf_methods.c
index c7d3d58..b016c33 100644
--- a/tools/gf_methods.c
+++ b/tools/gf_methods.c
@@ -39,7 +39,7 @@ static char *divides[NDIVS] = { "MATRIX", "EUCLID" };
void usage(char *s)
{
- fprintf(stderr, "usage: gf_methods w -BADC -LUMDRB\n");
+ fprintf(stderr, "usage: gf_methods w -BADC -LXUMDRB\n");
fprintf(stderr, "\n");
fprintf(stderr, " w can be 1-32, 64, 128\n");
fprintf(stderr, "\n");
@@ -50,6 +50,7 @@ void usage(char *s)
fprintf(stderr, " Combinations are fine.\n");
fprintf(stderr, "\n");
fprintf(stderr, " -L Simply lists methods\n");
+ fprintf(stderr, " -X List methods and functions selected (compile with DEBUG_FUNCTIONS)\n");
fprintf(stderr, " -U Produces calls to gf_unit\n");
fprintf(stderr, " -M Produces calls to time_tool.sh for single multiplications\n");
fprintf(stderr, " -D Produces calls to time_tool.sh for single divisions\n");
@@ -63,6 +64,19 @@ void usage(char *s)
exit(1);
}
+/*
+ * Prints one "<operation> = <name>" line for each of multiply, divide,
+ * inverse, multiply_region and extract_word, using the strings stored in
+ * the gf_internal_t found at gf->scratch.
+ * Produces no output unless compiled with DEBUG_FUNCTIONS.
+ */
+void print_methods(gf_t *gf)
+{
+#ifdef DEBUG_FUNCTIONS
+  gf_internal_t *internals = (gf_internal_t *) gf->scratch;
+
+  printf("multiply = %s\n", internals->multiply);
+  printf("divide = %s\n", internals->divide);
+  printf("inverse = %s\n", internals->inverse);
+  printf("multiply_region = %s\n", internals->multiply_region);
+  printf("extract_word = %s\n", internals->extract_word);
+#endif
+}
+
int main(int argc, char *argv[])
{
int m, r, d, w, i, sa, j, k, reset, ok;
@@ -99,12 +113,12 @@ int main(int argc, char *argv[])
}
}
- if (strchr("LUMDRB", argv[3][1]) == NULL) { usage("Bad -LUMDRB"); }
+ if (strchr("LXUMDRB", argv[3][1]) == NULL) { usage("Bad -LXUMDRB"); }
listing = argv[3][1];
if (listing == 'U') {
w_str = "../test/gf_unit %d A -1";
- } else if (listing == 'L') {
+ } else if (listing == 'L' || listing == 'X') {
w_str = "w=%d:";
} else {
w_str = strdup("sh time_tool.sh X %d");
@@ -192,6 +206,8 @@ int main(int argc, char *argv[])
printf(w_str, w);
for (j = 0; j < sa; j++) printf(" %s", gf_argv[j]);
printf("\n");
+ if (listing == 'X')
+ print_methods(&gf);
gf_free(&gf, 1);
} else if (_gf_errno == GF_E_DEFAULT) {
fprintf(stderr, "Unlabeled failed method: w=%d:", w);
@@ -212,6 +228,8 @@ int main(int argc, char *argv[])
printf(w_str, w);
for (j = 0; j < sa; j++) printf(" %s", gf_argv[j]);
printf("\n");
+ if (listing == 'X')
+ print_methods(&gf);
gf_free(&gf, 1);
} else if (_gf_errno == GF_E_DEFAULT) {
fprintf(stderr, "Unlabeled failed method: w=%d:", w);
diff --git a/tools/test_simd.sh b/tools/test_simd.sh
new file mode 100755
index 0000000..e514e4f
--- /dev/null
+++ b/tools/test_simd.sh
@@ -0,0 +1,367 @@
+#!/bin/bash -e
+
+# this scripts has a number of tests for SIMD. It can be invoked
+# on the host or on a QEMU machine.
+
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+host_cpu=`uname -p`
+results=${script_dir}/test_simd.results
+nprocs=$(grep -c ^processor /proc/cpuinfo)
+
+# runs unit tests and save the results
+test_unit(){
+ { ./configure && make clean && make; } || { echo "Compile FAILED" >> ${results}; return 1; }
+ make -j$nprocs check || { echo "gf_methods $i FAILED" >> ${results}; ((++failed)); }
+ cat tools/test-suite.log >> ${results} || true
+}
+
+# build with DEBUG_FUNCTIONS and save all methods selected
+# to a results file
+test_functions() {
+ failed=0
+
+ { ./configure --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${results}; return 1; }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${results}; } || { echo "gf_methods $i FAILED" >> ${results}; ((++failed)); }
+ done
+
+ return ${failed}
+}
+
+# build with DEBUG_CPU_FUNCTIONS and print out CPU detection
+test_detection() {
+ failed=0
+
+ { ./configure --enable-debug-cpu && make clean && make; } || { echo "Compile FAILED" >> ${results}; return 1; }
+ { ${script_dir}/gf_methods 32 -ACD -L | grep '#' >> ${results}; } || { echo "gf_methods $i FAILED" >> ${results}; ((++failed)); }
+
+ return ${failed}
+}
+
+compile_arm() {
+ failed=0
+
+ echo -n "Compiling with NO SIMD support..." >> ${results}
+ { ./configure --disable-neon && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+ echo -n "Compiling with FULL SIMD support..." >> ${results}
+ { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+ return ${failed}
+}
+
+compile_intel() {
+ failed=0
+
+ echo -n "Compiling with NO SIMD support..." >> ${results}
+ { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+ echo -n "Compiling with SSE2 only..." >> ${results}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=no
+ export ax_cv_have_ssse3_ext=no
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+ echo -n "Compiling with SSE2,SSE3 only..." >> ${results}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=no
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+ echo -n "Compiling with SSE2,SSE3,SSSE3 only..." >> ${results}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=yes
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+ echo -n "Compiling with SSE2,SSE3,SSSE3,SSE4_1 only..." >> ${results}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=yes
+ export ax_cv_have_sse41_ext=yes
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+ echo -n "Compiling with SSE2,SSE3,SSSE3,SSE4_2 only..." >> ${results}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=yes
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=yes
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+ echo -n "Compiling with FULL SIMD support..." >> ${results}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=yes
+ export ax_cv_have_sse41_ext=yes
+ export ax_cv_have_sse42_ext=yes
+ export ax_cv_have_pclmuldq_ext=yes
+ { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+ return ${failed}
+}
+
+# Test that we can compile the source code with different
+# SIMD options. We assume that we are running on a processor
+# with full SIMD support.
+# Dispatches on $host_cpu; an unrecognised CPU runs nothing.
+test_compile() {
+    case $host_cpu in
+        aarch64*|arm*) compile_arm ;;
+        # Single brackets are required: "[[3456]]" is m4/autoconf quoting and,
+        # as a shell pattern, never matches i386..i686.
+        i[3456]86*|x86_64*|amd64*) compile_intel ;;
+    esac
+}
+
+# disable through build flags
+runtime_arm_flags() {
+ failed=0
+
+ echo "====NO SIMD support..." >> ${1}
+ { ./configure --disable-neon --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${1}; return 1; }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====FULL SIMD support..." >> ${1}
+ { ./configure --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${1}; return 1; }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ return ${failed}
+}
+
+# build once with FULL SIMD and disable at runtime through environment
+runtime_arm_env() {
+ failed=0
+
+ { ./configure --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${1}; return 1; }
+
+ echo "====NO SIMD support..." >> ${1}
+ export GF_COMPLETE_DISABLE_NEON=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====FULL SIMD support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_NEON
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ return ${failed}
+}
+
+runtime_intel_flags() {
+ failed=0
+
+ echo "====NO SIMD support..." >> ${1}
+ { ./configure --disable-sse --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=no
+ export ax_cv_have_ssse3_ext=no
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=no
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=yes
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3,SSE4_1 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=yes
+ export ax_cv_have_sse41_ext=yes
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3,SSE4_2 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=yes
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=yes
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====FULL SIMD support..." >> ${1}
+ { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ return ${failed}
+}
+
+runtime_intel_env() {
+ failed=0
+
+ # compile a build with full SIMD support
+ { ./configure --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${1}; return 1; }
+
+ echo "====NO SIMD support..." >> ${1}
+ export GF_COMPLETE_DISABLE_SSE2=1
+ export GF_COMPLETE_DISABLE_SSE3=1
+ export GF_COMPLETE_DISABLE_SSSE3=1
+ export GF_COMPLETE_DISABLE_SSE4=1
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ export GF_COMPLETE_DISABLE_SSE3=1
+ export GF_COMPLETE_DISABLE_SSSE3=1
+ export GF_COMPLETE_DISABLE_SSE4=1
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ export GF_COMPLETE_DISABLE_SSSE3=1
+ export GF_COMPLETE_DISABLE_SSE4=1
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ unset GF_COMPLETE_DISABLE_SSSE3
+ export GF_COMPLETE_DISABLE_SSE4=1
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3,SSE4_1 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ unset GF_COMPLETE_DISABLE_SSSE3
+ unset GF_COMPLETE_DISABLE_SSE4
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3,SSE4_2 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ unset GF_COMPLETE_DISABLE_SSSE3
+ unset GF_COMPLETE_DISABLE_SSE4
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====FULL SIMD support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ unset GF_COMPLETE_DISABLE_SSSE3
+ unset GF_COMPLETE_DISABLE_SSE4
+ unset GF_COMPLETE_DISABLE_SSE4_PCLMUL
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ return ${failed}
+}
+
+test_runtime() {
+ rm -f ${results}.left
+ rm -f ${results}.right
+
+ case $host_cpu in
+ aarch64*|arm*)
+ runtime_arm_flags ${results}.left
+ runtime_arm_env ${results}.right
+ ;;
+ i[[3456]]86*|x86_64*|amd64*)
+ runtime_intel_flags ${results}.left
+ runtime_intel_env ${results}.right
+ ;;
+ esac
+
+ echo "======LEFT======" > ${results}
+ cat ${results}.left >> ${results}
+ echo "======RIGHT======" >> ${results}
+ cat ${results}.right >> ${results}
+ echo "======RESULT======" >> ${results}
+ if diff "${results}.left" "${results}.right"; then
+ echo SUCCESS >> ${results}
+ return 0
+ else
+ echo SUCCESS >> ${results}
+ return 1
+ fi
+}
+
+cd ${script_dir}/..
+rm -f ${results}
+
+test_$1
+exit $?
diff --git a/tools/test_simd_qemu.sh b/tools/test_simd_qemu.sh
new file mode 100755
index 0000000..5771874
--- /dev/null
+++ b/tools/test_simd_qemu.sh
@@ -0,0 +1,258 @@
+#!/bin/bash
+
+# Exit on unhandled errors. This is set here instead of on the shebang line
+# because shebang options are not applied when the script is started as
+# "bash test_simd_qemu.sh".
+set -e
+
+# This script uses QEMU to test gf-complete, especially its SIMD support,
+# on different architectures and CPUs. It boots a QEMU machine running an
+# Ubuntu cloud image; all testing happens inside that machine.
+
+# The following packages are required:
+#   qemu-system-aarch64
+#   qemu-system-arm
+#   qemu-system-x86_64
+#   genisoimage
+# The script also invokes wget, qemu-img, ssh, ssh-keygen, scp and rsync.
+
+
+# directory containing this script
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+# working directory for downloaded images, disks, ssh keys and console log
+qemu_dir="${script_dir}/.qemu"
+# host TCP port that is forwarded to port 22 of the guest
+ssh_port=2222
+ssh_pubkey_file="${qemu_dir}/qemu.pub"
+ssh_key_file="${qemu_dir}/qemu"
+
+mkdir -p "${qemu_dir}"
+
+# EXIT handler: kill every background job of this shell that is still
+# listed by "jobs -p" (the QEMU processes started by start_qemu).
+cleanup() {
+    local running
+    running="$(jobs -p)"
+
+    if [[ -z "${running}" ]]; then
+        return 0
+    fi
+
+    echo killing qemu processes "${running}"
+    # intentionally unquoted: one pid per word
+    kill ${running}
+}
+
+trap cleanup EXIT
+
+# Print an error message to stderr and terminate the script with status 1.
+die() {
+    echo "$*" >&2
+    exit 1
+}
+
+# Download the Ubuntu cloud image for the requested architecture (if not
+# already present), prepare a throw-away disk and a cloud-init config ISO,
+# start QEMU in the background and wait until the guest answers on ssh.
+# Globals:   qemu_dir, ssh_port, ssh_key_file, ssh_pubkey_file (read);
+#            vm_name (read, optional; guest hostname, defaults to "qemu")
+# Arguments: $1 - architecture (i?86/x86_64/amd64, aarch64 or arm*)
+#            $2 - CPU model passed to QEMU's -cpu option
+# Returns:   0 once the guest accepts ssh connections, 1 when a preparation
+#            step fails or wait_for_ssh gives up; an unsupported
+#            architecture terminates the script through die
+start_qemu() {
+    local arch=$1
+    local cpu=$2
+
+    local image_version="xenial"
+    local image_url_base="http://cloud-images.ubuntu.com/${image_version}/current"
+    local image_kernel image_initrd image_disk
+
+    case $arch in
+        # plain shell glob; "i[[3456]]86" (m4 quoting) never matches i386..i686
+        i[3456]86*|x86_64*|amd64*)
+            image_kernel="${image_version}-server-cloudimg-amd64-vmlinuz-generic"
+            image_initrd="${image_version}-server-cloudimg-amd64-initrd-generic"
+            image_disk="${image_version}-server-cloudimg-amd64-disk1.img"
+            ;;
+        aarch64*)
+            image_kernel="${image_version}-server-cloudimg-arm64-vmlinuz-generic"
+            image_initrd="${image_version}-server-cloudimg-arm64-initrd-generic"
+            image_disk="${image_version}-server-cloudimg-arm64-disk1.img"
+            ;;
+        arm*)
+            image_kernel="${image_version}-server-cloudimg-armhf-vmlinuz-lpae"
+            image_initrd="${image_version}-server-cloudimg-armhf-initrd-generic-lpae"
+            image_disk="${image_version}-server-cloudimg-armhf-disk1.img"
+            ;;
+        *) die "Unsupported arch" ;;
+    esac
+
+    # Fetch whatever is missing. Each file is downloaded under a temporary
+    # name first so that an interrupted download is not mistaken for a
+    # complete image on the next run.
+    local remote target
+    for remote in "unpacked/${image_kernel}" "unpacked/${image_initrd}" "${image_disk}"; do
+        target="${qemu_dir}/${remote##*/}"
+        [[ -f "${target}" ]] && continue
+        if ! wget -O "${target}.part" "${image_url_base}/${remote}"; then
+            rm -f "${target}.part"
+            return 1
+        fi
+        mv "${target}.part" "${target}" || return 1
+    done
+
+    # create a delta disk to keep the original image clean
+    local delta_disk="${qemu_dir}/disk.img"
+    rm -f "${delta_disk}"
+    qemu-img create -q -f qcow2 -b "${qemu_dir}/${image_disk}" "${delta_disk}" || return 1
+
+    # generate an ssh key pair on first use
+    if [[ ! -f "${ssh_pubkey_file}" ]]; then
+        ssh-keygen -q -N "" -f "${ssh_key_file}" || return 1
+    fi
+
+    # create a config disk (cloud-init "cidata" volume) to set the SSH keys
+    cat > "${qemu_dir}/meta-data" <<EOF
+instance-id: qemu
+local-hostname: qemu
+EOF
+    cat > "${qemu_dir}/user-data" <<EOF
+#cloud-config
+hostname: qemu
+manage_etc_hosts: true
+users:
+  - name: qemu
+    ssh-authorized-keys:
+      - $(cat "${ssh_pubkey_file}")
+    sudo: ['ALL=(ALL) NOPASSWD:ALL']
+    groups: sudo
+    shell: /bin/bash
+EOF
+    genisoimage -quiet -output "${qemu_dir}/cloud.iso" -volid cidata -joliet -rock \
+        "${qemu_dir}/user-data" "${qemu_dir}/meta-data" || return 1
+
+    local common_args=(
+        -name "qemu"
+        -m 1024
+        -nodefaults
+        -nographic
+        -kernel "${qemu_dir}/${image_kernel}"
+        -initrd "${qemu_dir}/${image_initrd}"
+        -cdrom "${qemu_dir}/cloud.iso"
+        -serial "file:${qemu_dir}/console.log"
+    )
+
+    # QEMU runs as a background job; host port ${ssh_port} is forwarded to
+    # port 22 of the guest.
+    case $arch in
+        i[3456]86*|x86_64*|amd64*)
+            qemu-system-x86_64 \
+                "${common_args[@]}" \
+                -machine accel=kvm -cpu "$cpu" \
+                -append "console=ttyS0 root=/dev/sda1" \
+                -hda "${delta_disk}" \
+                -net nic,vlan=0,model=virtio \
+                -net user,vlan=0,hostfwd=tcp::"${ssh_port}"-:22,hostname="${vm_name:-qemu}" \
+                &
+            ;;
+        aarch64*|arm*)
+            "qemu-system-$arch" \
+                "${common_args[@]}" \
+                -machine virt -cpu "$cpu" -smp 1 \
+                -drive if=none,file="${delta_disk}",id=hd0 \
+                -device virtio-blk-device,drive=hd0 \
+                -append "console=ttyAMA0 root=/dev/vda1" \
+                -netdev user,id=eth0,hostfwd=tcp::"${ssh_port}"-:22,hostname="${vm_name:-qemu}" \
+                -device virtio-net-device,netdev=eth0 \
+                &
+            ;;
+        *) die "Unsupported arch" ;;
+    esac
+
+    wait_for_ssh
+}
+
+# Ask the guest to shut down and block until the background jobs of this
+# shell (the QEMU processes) have exited.
+stop_qemu() {
+    local qemu_pids
+
+    # a failing ssh command is deliberately ignored here
+    run_ssh "sudo shutdown now" || :
+
+    qemu_pids="$(jobs -p)"
+    # intentionally unquoted: one pid per word
+    wait ${qemu_pids}
+}
+
+# Options common to ssh and scp: authenticate non-interactively with the
+# generated key only, ignore the user's ssh configuration and do not record
+# or verify the guest's host key.
+shared_args=(
+    -i "${ssh_key_file}"
+    -F /dev/null
+    -o BatchMode=yes
+    -o UserKnownHostsFile=/dev/null
+    -o StrictHostKeyChecking=no
+    -o IdentitiesOnly=yes
+)
+
+# Arguments for ssh itself; the port is added here because ssh expects -p
+# while scp expects -P. The quoted [@] expansion keeps every element of
+# shared_args as one word.
+ssh_args=(
+    "${shared_args[@]}"
+    -p "${ssh_port}"
+)
+
+# Poll the guest until an ssh command succeeds.
+# Globals:  ssh_args, qemu_dir (read)
+# Outputs:  progress messages on stdout
+# Returns:  0 as soon as ssh works; 1 after the initial attempt plus 50
+#           retries (10 seconds apart) have all failed
+wait_for_ssh() {
+    local retries=0
+    local retry_count=50
+
+    echo "waiting for machine to come up."
+    echo "tail -F ${qemu_dir}/console.log for progress."
+
+    while true; do
+        # Testing the command in an "if" keeps a failed attempt from
+        # triggering errexit without switching shell options on and off.
+        if ssh -q "${ssh_args[@]}" -o ConnectTimeout=1 qemu@localhost "echo done"; then
+            return 0
+        fi
+
+        if (( retries >= retry_count )); then
+            echo "timeout"
+            return 1
+        fi
+
+        echo -n "."
+        ((++retries))
+        sleep 10
+    done
+}
+
+# Run a command on the guest as user "qemu".
+# Arguments: the remote command, passed on to ssh unchanged
+# Returns:   exit status of ssh
+run_ssh() {
+    # quoted [@] keeps every ssh option as exactly one word
+    ssh -q "${ssh_args[@]}" qemu@localhost "$@"
+}
+
+# Copy files from or to the guest with scp.
+# Arguments: scp source(s) and destination, passed on unchanged
+# Returns:   exit status of scp
+run_scp() {
+    # scp takes the port as -P, hence shared_args instead of ssh_args
+    scp -q "${shared_args[@]}" -P "${ssh_port}" "$@"
+}
+
+# directories that are not copied to the guest
+rsync_args=(--exclude '.qemu' --exclude '.git')
+
+# rsync over ssh using the same connection options as run_ssh.
+# Arguments: rsync source(s) and destination, passed on unchanged
+run_rsync() {
+    # rsync expects the remote shell as one string
+    local remote_shell="ssh ${ssh_args[*]}"
+
+    rsync -avz -e "${remote_shell}" "${rsync_args[@]}" "$@"
+}
+
+# Install the build tools on the guest through apt-get.
+init_machine() {
+    local packages=(make gcc autoconf libtool automake)
+
+    run_ssh "sudo apt-get -y install --no-install-recommends ${packages[*]}"
+}
+
+# Install the build tools on the guest, replace ~/gf-complete there with a
+# copy of the local source tree and run autogen.sh in it.
+# Returns: 0 on success, non-zero as soon as one step fails
+init_machine_and_copy_source() {
+    # Every step is checked explicitly: errexit is not honoured inside a
+    # function whose caller is part of an && / || list.
+    init_machine || return 1
+    run_ssh "rm -fr ~/gf-complete; mkdir -p ~/gf-complete" || return 1
+    run_rsync "${script_dir}/.." qemu@localhost:gf-complete || return 1
+    run_ssh "cd ~/gf-complete && ./autogen.sh"
+}
+
+# Run one test of test_simd.sh on the guest and copy its results file back
+# to ${script_dir}/test_simd_<test>_<arch>_<cpu>.results.
+# Arguments: $1 - architecture (only used in the results file name)
+#            $2 - CPU model (only used in the results file name)
+#            $3 - test name handed to test_simd.sh
+# Returns:   0 when both the remote test and the copy succeeded, otherwise
+#            the status of the step that failed last
+run_test() {
+    local arch=$1; shift
+    local cpu=$1; shift
+    local test=$1; shift
+    local status=0
+
+    # Callers invoke this function inside an && / || list where errexit is
+    # not honoured, so the status of the remote test is captured explicitly;
+    # otherwise only the status of the copy below would be reported.
+    run_ssh "~/gf-complete/tools/test_simd.sh ${test}" || status=$?
+
+    # fetch the results even after a failure, they are needed for diagnosis
+    run_scp qemu@localhost:gf-complete/tools/test_simd.results \
+        "${script_dir}/test_simd_${test}_${arch}_${cpu}.results" || status=$?
+
+    return ${status}
+}
+
+# Boot a QEMU machine for the given architecture and CPU, copy the source
+# tree to it and run the compile, unit, functions, detection and runtime
+# tests of test_simd.sh inside it.
+# Arguments: $1 - architecture
+#            $2 - CPU model
+# Outputs:   progress and a SUCCESS/FAILED line per test on stdout
+# Returns:   number of failed tests; 1 when the machine could not be set up
+run_test_simd_basic() {
+    local arch=$1; shift
+    local cpu=$1; shift
+
+    # Must be local: run_all_tests counts in a variable of the same name,
+    # which an assignment to a global here would reset for every machine.
+    local failed=0
+    local test_name
+
+    echo "=====starting qemu machine $arch $cpu"
+    # errexit is not honoured here (the caller is part of an && / || list),
+    # so a failed setup has to be detected explicitly.
+    if ! start_qemu "$arch" "$cpu" || ! init_machine_and_copy_source; then
+        echo "FAILED to set up qemu machine"
+        # kill whatever was started and wait for it, so the next machine can
+        # reuse the forwarded ssh port and the disk image
+        cleanup
+        wait
+        return 1
+    fi
+
+    for test_name in compile unit functions detection runtime; do
+        echo "=====running ${test_name} test"
+        if run_test "$arch" "$cpu" "${test_name}"; then
+            echo "SUCCESS"
+        else
+            echo "FAILED"
+            ((++failed))
+        fi
+    done
+    stop_qemu
+
+    return ${failed}
+}
+
+# Run run_test_simd_basic once per supported architecture.
+# Outputs:  progress and a SUCCESS/FAILED line per architecture on stdout
+# Returns:  number of architectures for which at least one test failed
+run_all_tests() {
+    # Deliberately not called "failed": run_test_simd_basic uses a variable
+    # of that name and, with bash's dynamic scoping, could overwrite it.
+    local failed_archs=0
+
+    echo "============================"
+    echo "=====running x86_64 tests"
+    # NOTE: Broadwell has all the supported SIMD instructions
+    if run_test_simd_basic "x86_64" "Broadwell"; then
+        echo "SUCCESS"
+    else
+        echo "FAILED"
+        ((++failed_archs))
+    fi
+
+    echo "============================"
+    echo "=====running aarch64 tests"
+    # NOTE: cortex-a57 has ASIMD support
+    if run_test_simd_basic "aarch64" "cortex-a57"; then
+        echo "SUCCESS"
+    else
+        echo "FAILED"
+        ((++failed_archs))
+    fi
+
+    echo "============================"
+    echo "=====running arm tests"
+    # NOTE: cortex-a15 has NEON support
+    if run_test_simd_basic "arm" "cortex-a15"; then
+        echo "SUCCESS"
+    else
+        echo "FAILED"
+        ((++failed_archs))
+    fi
+
+    return ${failed_archs}
+}
+
+# the exit status of the script is the number of failed architectures
+run_all_tests
+exit $?