diff options
-rw-r--r-- | .gitignore | 8 | ||||
-rwxr-xr-x | compile | 347 | ||||
-rw-r--r-- | configure.ac | 17 | ||||
-rwxr-xr-x | depcomp | 791 | ||||
-rw-r--r-- | include/gf_cpu.h | 20 | ||||
-rw-r--r-- | include/gf_int.h | 16 | ||||
-rw-r--r-- | m4/ax_ext.m4 | 295 | ||||
-rw-r--r-- | m4/ax_gcc_x86_avx_xgetbv.m4 | 79 | ||||
-rw-r--r-- | m4/ax_gcc_x86_cpuid.m4 | 79 | ||||
-rw-r--r-- | m4/ltoptions.m4 | 384 | ||||
-rw-r--r-- | m4/ltsugar.m4 | 123 | ||||
-rw-r--r-- | m4/lt~obsolete.m4 | 98 | ||||
-rw-r--r-- | src/Makefile.am | 16 | ||||
-rw-r--r-- | src/gf.c | 149 | ||||
-rw-r--r-- | src/gf_cpu.c | 168 | ||||
-rw-r--r-- | src/gf_w128.c | 114 | ||||
-rw-r--r-- | src/gf_w16.c | 231 | ||||
-rw-r--r-- | src/gf_w32.c | 295 | ||||
-rw-r--r-- | src/gf_w4.c | 172 | ||||
-rw-r--r-- | src/gf_w64.c | 244 | ||||
-rw-r--r-- | src/gf_w8.c | 236 | ||||
-rw-r--r-- | src/gf_wgen.c | 64 | ||||
-rw-r--r-- | src/neon/gf_w16_neon.c | 4 | ||||
-rw-r--r-- | src/neon/gf_w32_neon.c | 4 | ||||
-rw-r--r-- | src/neon/gf_w4_neon.c | 6 | ||||
-rw-r--r-- | src/neon/gf_w64_neon.c | 4 | ||||
-rw-r--r-- | src/neon/gf_w8_neon.c | 14 | ||||
-rw-r--r-- | test/Makefile.am | 2 | ||||
-rw-r--r-- | tools/Makefile.am | 2 | ||||
-rw-r--r-- | tools/gf_methods.c | 24 | ||||
-rwxr-xr-x | tools/test_simd.sh | 367 | ||||
-rwxr-xr-x | tools/test_simd_qemu.sh | 258 |
32 files changed, 1699 insertions, 2932 deletions
@@ -50,6 +50,10 @@ config.sub ltmain.sh m4/libtool.m4 m4/ltversion.m4 +m4/ltoptions.m4 +m4/ltsugar.m4 +m4/lt~obsolete.m4 +test-driver src/.dirstamp test-driver @@ -68,3 +72,7 @@ tools/gf_methods tools/gf_mult tools/gf_poly tools/gf_time +tools/gf_unit_w* +tools/test-suite.log +tools/.qemu/ +tools/test_simd*.results* diff --git a/compile b/compile deleted file mode 100755 index 531136b..0000000 --- a/compile +++ /dev/null @@ -1,347 +0,0 @@ -#! /bin/sh -# Wrapper for compilers which do not understand '-c -o'. - -scriptversion=2012-10-14.11; # UTC - -# Copyright (C) 1999-2013 Free Software Foundation, Inc. -# Written by Tom Tromey <tromey@cygnus.com>. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -# This file is maintained in Automake, please report -# bugs to <bug-automake@gnu.org> or send patches to -# <automake-patches@gnu.org>. - -nl=' -' - -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent tools from complaining about whitespace usage. -IFS=" "" $nl" - -file_conv= - -# func_file_conv build_file lazy -# Convert a $build file to $host form and store it in $file -# Currently only supports Windows hosts. If the determined conversion -# type is listed in (the comma separated) LAZY, no conversion will -# take place. -func_file_conv () -{ - file=$1 - case $file in - / | /[!/]*) # absolute file, and not a UNC file - if test -z "$file_conv"; then - # lazily determine how to convert abs files - case `uname -s` in - MINGW*) - file_conv=mingw - ;; - CYGWIN*) - file_conv=cygwin - ;; - *) - file_conv=wine - ;; - esac - fi - case $file_conv/,$2, in - *,$file_conv,*) - ;; - mingw/*) - file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` - ;; - cygwin/*) - file=`cygpath -m "$file" || echo "$file"` - ;; - wine/*) - file=`winepath -w "$file" || echo "$file"` - ;; - esac - ;; - esac -} - -# func_cl_dashL linkdir -# Make cl look for libraries in LINKDIR -func_cl_dashL () -{ - func_file_conv "$1" - if test -z "$lib_path"; then - lib_path=$file - else - lib_path="$lib_path;$file" - fi - linker_opts="$linker_opts -LIBPATH:$file" -} - -# func_cl_dashl library -# Do a library search-path lookup for cl -func_cl_dashl () -{ - lib=$1 - found=no - save_IFS=$IFS - IFS=';' - for dir in $lib_path $LIB - do - IFS=$save_IFS - if $shared && test -f "$dir/$lib.dll.lib"; then - found=yes - lib=$dir/$lib.dll.lib - break - fi - if test -f "$dir/$lib.lib"; then - found=yes - lib=$dir/$lib.lib - break - fi - if test -f "$dir/lib$lib.a"; then - found=yes - lib=$dir/lib$lib.a - break - fi - done - IFS=$save_IFS - - if test "$found" != yes; then - lib=$lib.lib - fi -} - -# func_cl_wrapper cl arg... -# Adjust compile command to suit cl -func_cl_wrapper () -{ - # Assume a capable shell - lib_path= - shared=: - linker_opts= - for arg - do - if test -n "$eat"; then - eat= - else - case $1 in - -o) - # configure might choose to run compile as 'compile cc -o foo foo.c'. - eat=1 - case $2 in - *.o | *.[oO][bB][jJ]) - func_file_conv "$2" - set x "$@" -Fo"$file" - shift - ;; - *) - func_file_conv "$2" - set x "$@" -Fe"$file" - shift - ;; - esac - ;; - -I) - eat=1 - func_file_conv "$2" mingw - set x "$@" -I"$file" - shift - ;; - -I*) - func_file_conv "${1#-I}" mingw - set x "$@" -I"$file" - shift - ;; - -l) - eat=1 - func_cl_dashl "$2" - set x "$@" "$lib" - shift - ;; - -l*) - func_cl_dashl "${1#-l}" - set x "$@" "$lib" - shift - ;; - -L) - eat=1 - func_cl_dashL "$2" - ;; - -L*) - func_cl_dashL "${1#-L}" - ;; - -static) - shared=false - ;; - -Wl,*) - arg=${1#-Wl,} - save_ifs="$IFS"; IFS=',' - for flag in $arg; do - IFS="$save_ifs" - linker_opts="$linker_opts $flag" - done - IFS="$save_ifs" - ;; - -Xlinker) - eat=1 - linker_opts="$linker_opts $2" - ;; - -*) - set x "$@" "$1" - shift - ;; - *.cc | *.CC | *.cxx | *.CXX | *.[cC]++) - func_file_conv "$1" - set x "$@" -Tp"$file" - shift - ;; - *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO]) - func_file_conv "$1" mingw - set x "$@" "$file" - shift - ;; - *) - set x "$@" "$1" - shift - ;; - esac - fi - shift - done - if test -n "$linker_opts"; then - linker_opts="-link$linker_opts" - fi - exec "$@" $linker_opts - exit 1 -} - -eat= - -case $1 in - '') - echo "$0: No command. Try '$0 --help' for more information." 1>&2 - exit 1; - ;; - -h | --h*) - cat <<\EOF -Usage: compile [--help] [--version] PROGRAM [ARGS] - -Wrapper for compilers which do not understand '-c -o'. -Remove '-o dest.o' from ARGS, run PROGRAM with the remaining -arguments, and rename the output as expected. - -If you are trying to build a whole package this is not the -right script to run: please start by reading the file 'INSTALL'. - -Report bugs to <bug-automake@gnu.org>. -EOF - exit $? - ;; - -v | --v*) - echo "compile $scriptversion" - exit $? - ;; - cl | *[/\\]cl | cl.exe | *[/\\]cl.exe ) - func_cl_wrapper "$@" # Doesn't return... - ;; -esac - -ofile= -cfile= - -for arg -do - if test -n "$eat"; then - eat= - else - case $1 in - -o) - # configure might choose to run compile as 'compile cc -o foo foo.c'. - # So we strip '-o arg' only if arg is an object. - eat=1 - case $2 in - *.o | *.obj) - ofile=$2 - ;; - *) - set x "$@" -o "$2" - shift - ;; - esac - ;; - *.c) - cfile=$1 - set x "$@" "$1" - shift - ;; - *) - set x "$@" "$1" - shift - ;; - esac - fi - shift -done - -if test -z "$ofile" || test -z "$cfile"; then - # If no '-o' option was seen then we might have been invoked from a - # pattern rule where we don't need one. That is ok -- this is a - # normal compilation that the losing compiler can handle. If no - # '.c' file was seen then we are probably linking. That is also - # ok. - exec "$@" -fi - -# Name of file we expect compiler to create. -cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'` - -# Create the lock directory. -# Note: use '[/\\:.-]' here to ensure that we don't use the same name -# that we are using for the .o file. Also, base the name on the expected -# object file name, since that is what matters with a parallel build. -lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d -while true; do - if mkdir "$lockdir" >/dev/null 2>&1; then - break - fi - sleep 1 -done -# FIXME: race condition here if user kills between mkdir and trap. -trap "rmdir '$lockdir'; exit 1" 1 2 15 - -# Run the compile. -"$@" -ret=$? - -if test -f "$cofile"; then - test "$cofile" = "$ofile" || mv "$cofile" "$ofile" -elif test -f "${cofile}bj"; then - test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile" -fi - -rmdir "$lockdir" -exit $ret - -# Local Variables: -# mode: shell-script -# sh-indentation: 2 -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" -# time-stamp-end: "; # UTC" -# End: diff --git a/configure.ac b/configure.ac index 3e8cf18..d696f6e 100644 --- a/configure.ac +++ b/configure.ac @@ -29,6 +29,14 @@ AC_CHECK_FUNCS([posix_memalign], AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])]) +AC_ARG_ENABLE([debug-functions], + AS_HELP_STRING([--enable-debug-func], [Enable debugging of functions selected])) +AS_IF([test "x$enable_debug_func" = "xyes"], [CPPFLAGS="$CPPFLAGS -DDEBUG_FUNCTIONS"]) + +AC_ARG_ENABLE([debug-cpu], + AS_HELP_STRING([--enable-debug-cpu], [Enable debugging of SIMD detection])) +AS_IF([test "x$enable_debug_cpu" = "xyes"], [CPPFLAGS="$CPPFLAGS -DDEBUG_CPU_DETECTION"]) + AX_EXT() AC_ARG_ENABLE([neon], @@ -66,5 +74,14 @@ AC_ARG_ENABLE([valgrind], [enable_valgrind=no]) AM_CONDITIONAL(ENABLE_VALGRIND, test "x$enable_valgrind" != xno) +AC_ARG_ENABLE([avx], AS_HELP_STRING([--enable-avx], [Build with AVX optimizations])) +AX_CHECK_COMPILE_FLAG(-mavx, [ax_cv_support_avx=yes], []) + +AS_IF([test "x$enable_avx" = "xyes"], + [AS_IF([test "x$ax_cv_support_avx" = "xno"], + [AC_MSG_ERROR([AVX requested but compiler does not support -mavx])], + [SIMD_FLAGS="$SIMD_FLAGS -mavx"]) + ]) + AC_CONFIG_FILES([Makefile src/Makefile tools/Makefile test/Makefile examples/Makefile]) AC_OUTPUT diff --git a/depcomp b/depcomp deleted file mode 100755 index 4ebd5b3..0000000 --- a/depcomp +++ /dev/null @@ -1,791 +0,0 @@ -#! /bin/sh -# depcomp - compile a program generating dependencies as side-effects - -scriptversion=2013-05-30.07; # UTC - -# Copyright (C) 1999-2013 Free Software Foundation, Inc. - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>. - -case $1 in - '') - echo "$0: No command. Try '$0 --help' for more information." 1>&2 - exit 1; - ;; - -h | --h*) - cat <<\EOF -Usage: depcomp [--help] [--version] PROGRAM [ARGS] - -Run PROGRAMS ARGS to compile a file, generating dependencies -as side-effects. - -Environment variables: - depmode Dependency tracking mode. - source Source file read by 'PROGRAMS ARGS'. - object Object file output by 'PROGRAMS ARGS'. - DEPDIR directory where to store dependencies. - depfile Dependency file to output. - tmpdepfile Temporary file to use when outputting dependencies. - libtool Whether libtool is used (yes/no). - -Report bugs to <bug-automake@gnu.org>. -EOF - exit $? - ;; - -v | --v*) - echo "depcomp $scriptversion" - exit $? - ;; -esac - -# Get the directory component of the given path, and save it in the -# global variables '$dir'. Note that this directory component will -# be either empty or ending with a '/' character. This is deliberate. -set_dir_from () -{ - case $1 in - */*) dir=`echo "$1" | sed -e 's|/[^/]*$|/|'`;; - *) dir=;; - esac -} - -# Get the suffix-stripped basename of the given path, and save it the -# global variable '$base'. -set_base_from () -{ - base=`echo "$1" | sed -e 's|^.*/||' -e 's/\.[^.]*$//'` -} - -# If no dependency file was actually created by the compiler invocation, -# we still have to create a dummy depfile, to avoid errors with the -# Makefile "include basename.Plo" scheme. -make_dummy_depfile () -{ - echo "#dummy" > "$depfile" -} - -# Factor out some common post-processing of the generated depfile. -# Requires the auxiliary global variable '$tmpdepfile' to be set. -aix_post_process_depfile () -{ - # If the compiler actually managed to produce a dependency file, - # post-process it. - if test -f "$tmpdepfile"; then - # Each line is of the form 'foo.o: dependency.h'. - # Do two passes, one to just change these to - # $object: dependency.h - # and one to simply output - # dependency.h: - # which is needed to avoid the deleted-header problem. - { sed -e "s,^.*\.[$lower]*:,$object:," < "$tmpdepfile" - sed -e "s,^.*\.[$lower]*:[$tab ]*,," -e 's,$,:,' < "$tmpdepfile" - } > "$depfile" - rm -f "$tmpdepfile" - else - make_dummy_depfile - fi -} - -# A tabulation character. -tab=' ' -# A newline character. -nl=' -' -# Character ranges might be problematic outside the C locale. -# These definitions help. -upper=ABCDEFGHIJKLMNOPQRSTUVWXYZ -lower=abcdefghijklmnopqrstuvwxyz -digits=0123456789 -alpha=${upper}${lower} - -if test -z "$depmode" || test -z "$source" || test -z "$object"; then - echo "depcomp: Variables source, object and depmode must be set" 1>&2 - exit 1 -fi - -# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po. -depfile=${depfile-`echo "$object" | - sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`} -tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`} - -rm -f "$tmpdepfile" - -# Avoid interferences from the environment. -gccflag= dashmflag= - -# Some modes work just like other modes, but use different flags. We -# parameterize here, but still list the modes in the big case below, -# to make depend.m4 easier to write. Note that we *cannot* use a case -# here, because this file can only contain one case statement. -if test "$depmode" = hp; then - # HP compiler uses -M and no extra arg. - gccflag=-M - depmode=gcc -fi - -if test "$depmode" = dashXmstdout; then - # This is just like dashmstdout with a different argument. - dashmflag=-xM - depmode=dashmstdout -fi - -cygpath_u="cygpath -u -f -" -if test "$depmode" = msvcmsys; then - # This is just like msvisualcpp but w/o cygpath translation. - # Just convert the backslash-escaped backslashes to single forward - # slashes to satisfy depend.m4 - cygpath_u='sed s,\\\\,/,g' - depmode=msvisualcpp -fi - -if test "$depmode" = msvc7msys; then - # This is just like msvc7 but w/o cygpath translation. - # Just convert the backslash-escaped backslashes to single forward - # slashes to satisfy depend.m4 - cygpath_u='sed s,\\\\,/,g' - depmode=msvc7 -fi - -if test "$depmode" = xlc; then - # IBM C/C++ Compilers xlc/xlC can output gcc-like dependency information. - gccflag=-qmakedep=gcc,-MF - depmode=gcc -fi - -case "$depmode" in -gcc3) -## gcc 3 implements dependency tracking that does exactly what -## we want. Yay! Note: for some reason libtool 1.4 doesn't like -## it if -MD -MP comes after the -MF stuff. Hmm. -## Unfortunately, FreeBSD c89 acceptance of flags depends upon -## the command line argument order; so add the flags where they -## appear in depend2.am. Note that the slowdown incurred here -## affects only configure: in makefiles, %FASTDEP% shortcuts this. - for arg - do - case $arg in - -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;; - *) set fnord "$@" "$arg" ;; - esac - shift # fnord - shift # $arg - done - "$@" - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - mv "$tmpdepfile" "$depfile" - ;; - -gcc) -## Note that this doesn't just cater to obsosete pre-3.x GCC compilers. -## but also to in-use compilers like IMB xlc/xlC and the HP C compiler. -## (see the conditional assignment to $gccflag above). -## There are various ways to get dependency output from gcc. Here's -## why we pick this rather obscure method: -## - Don't want to use -MD because we'd like the dependencies to end -## up in a subdir. Having to rename by hand is ugly. -## (We might end up doing this anyway to support other compilers.) -## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like -## -MM, not -M (despite what the docs say). Also, it might not be -## supported by the other compilers which use the 'gcc' depmode. -## - Using -M directly means running the compiler twice (even worse -## than renaming). - if test -z "$gccflag"; then - gccflag=-MD, - fi - "$@" -Wp,"$gccflag$tmpdepfile" - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - echo "$object : \\" > "$depfile" - # The second -e expression handles DOS-style file names with drive - # letters. - sed -e 's/^[^:]*: / /' \ - -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile" -## This next piece of magic avoids the "deleted header file" problem. -## The problem is that when a header file which appears in a .P file -## is deleted, the dependency causes make to die (because there is -## typically no way to rebuild the header). We avoid this by adding -## dummy dependencies for each header file. Too bad gcc doesn't do -## this for us directly. -## Some versions of gcc put a space before the ':'. On the theory -## that the space means something, we add a space to the output as -## well. hp depmode also adds that space, but also prefixes the VPATH -## to the object. Take care to not repeat it in the output. -## Some versions of the HPUX 10.20 sed can't process this invocation -## correctly. Breaking it into two sed invocations is a workaround. - tr ' ' "$nl" < "$tmpdepfile" \ - | sed -e 's/^\\$//' -e '/^$/d' -e "s|.*$object$||" -e '/:$/d' \ - | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -hp) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -sgi) - if test "$libtool" = yes; then - "$@" "-Wp,-MDupdate,$tmpdepfile" - else - "$@" -MDupdate "$tmpdepfile" - fi - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - - if test -f "$tmpdepfile"; then # yes, the sourcefile depend on other files - echo "$object : \\" > "$depfile" - # Clip off the initial element (the dependent). Don't try to be - # clever and replace this with sed code, as IRIX sed won't handle - # lines with more than a fixed number of characters (4096 in - # IRIX 6.2 sed, 8192 in IRIX 6.5). We also remove comment lines; - # the IRIX cc adds comments like '#:fec' to the end of the - # dependency line. - tr ' ' "$nl" < "$tmpdepfile" \ - | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' \ - | tr "$nl" ' ' >> "$depfile" - echo >> "$depfile" - # The second pass generates a dummy entry for each header file. - tr ' ' "$nl" < "$tmpdepfile" \ - | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \ - >> "$depfile" - else - make_dummy_depfile - fi - rm -f "$tmpdepfile" - ;; - -xlc) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -aix) - # The C for AIX Compiler uses -M and outputs the dependencies - # in a .u file. In older versions, this file always lives in the - # current directory. Also, the AIX compiler puts '$object:' at the - # start of each line; $object doesn't have directory information. - # Version 6 uses the directory in both cases. - set_dir_from "$object" - set_base_from "$object" - if test "$libtool" = yes; then - tmpdepfile1=$dir$base.u - tmpdepfile2=$base.u - tmpdepfile3=$dir.libs/$base.u - "$@" -Wc,-M - else - tmpdepfile1=$dir$base.u - tmpdepfile2=$dir$base.u - tmpdepfile3=$dir$base.u - "$@" -M - fi - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - do - test -f "$tmpdepfile" && break - done - aix_post_process_depfile - ;; - -tcc) - # tcc (Tiny C Compiler) understand '-MD -MF file' since version 0.9.26 - # FIXME: That version still under development at the moment of writing. - # Make that this statement remains true also for stable, released - # versions. - # It will wrap lines (doesn't matter whether long or short) with a - # trailing '\', as in: - # - # foo.o : \ - # foo.c \ - # foo.h \ - # - # It will put a trailing '\' even on the last line, and will use leading - # spaces rather than leading tabs (at least since its commit 0394caf7 - # "Emit spaces for -MD"). - "$@" -MD -MF "$tmpdepfile" - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - # Each non-empty line is of the form 'foo.o : \' or ' dep.h \'. - # We have to change lines of the first kind to '$object: \'. - sed -e "s|.*:|$object :|" < "$tmpdepfile" > "$depfile" - # And for each line of the second kind, we have to emit a 'dep.h:' - # dummy dependency, to avoid the deleted-header problem. - sed -n -e 's|^ *\(.*\) *\\$|\1:|p' < "$tmpdepfile" >> "$depfile" - rm -f "$tmpdepfile" - ;; - -## The order of this option in the case statement is important, since the -## shell code in configure will try each of these formats in the order -## listed in this file. A plain '-MD' option would be understood by many -## compilers, so we must ensure this comes after the gcc and icc options. -pgcc) - # Portland's C compiler understands '-MD'. - # Will always output deps to 'file.d' where file is the root name of the - # source file under compilation, even if file resides in a subdirectory. - # The object file name does not affect the name of the '.d' file. - # pgcc 10.2 will output - # foo.o: sub/foo.c sub/foo.h - # and will wrap long lines using '\' : - # foo.o: sub/foo.c ... \ - # sub/foo.h ... \ - # ... - set_dir_from "$object" - # Use the source, not the object, to determine the base name, since - # that's sadly what pgcc will do too. - set_base_from "$source" - tmpdepfile=$base.d - - # For projects that build the same source file twice into different object - # files, the pgcc approach of using the *source* file root name can cause - # problems in parallel builds. Use a locking strategy to avoid stomping on - # the same $tmpdepfile. - lockdir=$base.d-lock - trap " - echo '$0: caught signal, cleaning up...' >&2 - rmdir '$lockdir' - exit 1 - " 1 2 13 15 - numtries=100 - i=$numtries - while test $i -gt 0; do - # mkdir is a portable test-and-set. - if mkdir "$lockdir" 2>/dev/null; then - # This process acquired the lock. - "$@" -MD - stat=$? - # Release the lock. - rmdir "$lockdir" - break - else - # If the lock is being held by a different process, wait - # until the winning process is done or we timeout. - while test -d "$lockdir" && test $i -gt 0; do - sleep 1 - i=`expr $i - 1` - done - fi - i=`expr $i - 1` - done - trap - 1 2 13 15 - if test $i -le 0; then - echo "$0: failed to acquire lock after $numtries attempts" >&2 - echo "$0: check lockdir '$lockdir'" >&2 - exit 1 - fi - - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - # Each line is of the form `foo.o: dependent.h', - # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'. - # Do two passes, one to just change these to - # `$object: dependent.h' and one to simply `dependent.h:'. - sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile" - # Some versions of the HPUX 10.20 sed can't process this invocation - # correctly. Breaking it into two sed invocations is a workaround. - sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" \ - | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -hp2) - # The "hp" stanza above does not work with aCC (C++) and HP's ia64 - # compilers, which have integrated preprocessors. The correct option - # to use with these is +Maked; it writes dependencies to a file named - # 'foo.d', which lands next to the object file, wherever that - # happens to be. - # Much of this is similar to the tru64 case; see comments there. - set_dir_from "$object" - set_base_from "$object" - if test "$libtool" = yes; then - tmpdepfile1=$dir$base.d - tmpdepfile2=$dir.libs/$base.d - "$@" -Wc,+Maked - else - tmpdepfile1=$dir$base.d - tmpdepfile2=$dir$base.d - "$@" +Maked - fi - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile1" "$tmpdepfile2" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" - do - test -f "$tmpdepfile" && break - done - if test -f "$tmpdepfile"; then - sed -e "s,^.*\.[$lower]*:,$object:," "$tmpdepfile" > "$depfile" - # Add 'dependent.h:' lines. - sed -ne '2,${ - s/^ *// - s/ \\*$// - s/$/:/ - p - }' "$tmpdepfile" >> "$depfile" - else - make_dummy_depfile - fi - rm -f "$tmpdepfile" "$tmpdepfile2" - ;; - -tru64) - # The Tru64 compiler uses -MD to generate dependencies as a side - # effect. 'cc -MD -o foo.o ...' puts the dependencies into 'foo.o.d'. - # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put - # dependencies in 'foo.d' instead, so we check for that too. - # Subdirectories are respected. - set_dir_from "$object" - set_base_from "$object" - - if test "$libtool" = yes; then - # Libtool generates 2 separate objects for the 2 libraries. These - # two compilations output dependencies in $dir.libs/$base.o.d and - # in $dir$base.o.d. We have to check for both files, because - # one of the two compilations can be disabled. We should prefer - # $dir$base.o.d over $dir.libs/$base.o.d because the latter is - # automatically cleaned when .libs/ is deleted, while ignoring - # the former would cause a distcleancheck panic. - tmpdepfile1=$dir$base.o.d # libtool 1.5 - tmpdepfile2=$dir.libs/$base.o.d # Likewise. - tmpdepfile3=$dir.libs/$base.d # Compaq CCC V6.2-504 - "$@" -Wc,-MD - else - tmpdepfile1=$dir$base.d - tmpdepfile2=$dir$base.d - tmpdepfile3=$dir$base.d - "$@" -MD - fi - - stat=$? - if test $stat -ne 0; then - rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - do - test -f "$tmpdepfile" && break - done - # Same post-processing that is required for AIX mode. - aix_post_process_depfile - ;; - -msvc7) - if test "$libtool" = yes; then - showIncludes=-Wc,-showIncludes - else - showIncludes=-showIncludes - fi - "$@" $showIncludes > "$tmpdepfile" - stat=$? - grep -v '^Note: including file: ' "$tmpdepfile" - if test $stat -ne 0; then - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - echo "$object : \\" > "$depfile" - # The first sed program below extracts the file names and escapes - # backslashes for cygpath. The second sed program outputs the file - # name when reading, but also accumulates all include files in the - # hold buffer in order to output them again at the end. This only - # works with sed implementations that can handle large buffers. - sed < "$tmpdepfile" -n ' -/^Note: including file: *\(.*\)/ { - s//\1/ - s/\\/\\\\/g - p -}' | $cygpath_u | sort -u | sed -n ' -s/ /\\ /g -s/\(.*\)/'"$tab"'\1 \\/p -s/.\(.*\) \\/\1:/ -H -$ { - s/.*/'"$tab"'/ - G - p -}' >> "$depfile" - echo >> "$depfile" # make sure the fragment doesn't end with a backslash - rm -f "$tmpdepfile" - ;; - -msvc7msys) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -#nosideeffect) - # This comment above is used by automake to tell side-effect - # dependency tracking mechanisms from slower ones. - -dashmstdout) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout, regardless of -o. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - - # Remove '-o $object'. - IFS=" " - for arg - do - case $arg in - -o) - shift - ;; - $object) - shift - ;; - *) - set fnord "$@" "$arg" - shift # fnord - shift # $arg - ;; - esac - done - - test -z "$dashmflag" && dashmflag=-M - # Require at least two characters before searching for ':' - # in the target name. This is to cope with DOS-style filenames: - # a dependency such as 'c:/foo/bar' could be seen as target 'c' otherwise. - "$@" $dashmflag | - sed "s|^[$tab ]*[^:$tab ][^:][^:]*:[$tab ]*|$object: |" > "$tmpdepfile" - rm -f "$depfile" - cat < "$tmpdepfile" > "$depfile" - # Some versions of the HPUX 10.20 sed can't process this sed invocation - # correctly. Breaking it into two sed invocations is a workaround. - tr ' ' "$nl" < "$tmpdepfile" \ - | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \ - | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -dashXmstdout) - # This case only exists to satisfy depend.m4. It is never actually - # run, as this mode is specially recognized in the preamble. - exit 1 - ;; - -makedepend) - "$@" || exit $? - # Remove any Libtool call - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - # X makedepend - shift - cleared=no eat=no - for arg - do - case $cleared in - no) - set ""; shift - cleared=yes ;; - esac - if test $eat = yes; then - eat=no - continue - fi - case "$arg" in - -D*|-I*) - set fnord "$@" "$arg"; shift ;; - # Strip any option that makedepend may not understand. Remove - # the object too, otherwise makedepend will parse it as a source file. - -arch) - eat=yes ;; - -*|$object) - ;; - *) - set fnord "$@" "$arg"; shift ;; - esac - done - obj_suffix=`echo "$object" | sed 's/^.*\././'` - touch "$tmpdepfile" - ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@" - rm -f "$depfile" - # makedepend may prepend the VPATH from the source file name to the object. - # No need to regex-escape $object, excess matching of '.' is harmless. - sed "s|^.*\($object *:\)|\1|" "$tmpdepfile" > "$depfile" - # Some versions of the HPUX 10.20 sed can't process the last invocation - # correctly. Breaking it into two sed invocations is a workaround. - sed '1,2d' "$tmpdepfile" \ - | tr ' ' "$nl" \ - | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \ - | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" "$tmpdepfile".bak - ;; - -cpp) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - - # Remove '-o $object'. - IFS=" " - for arg - do - case $arg in - -o) - shift - ;; - $object) - shift - ;; - *) - set fnord "$@" "$arg" - shift # fnord - shift # $arg - ;; - esac - done - - "$@" -E \ - | sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ - -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ - | sed '$ s: \\$::' > "$tmpdepfile" - rm -f "$depfile" - echo "$object : \\" > "$depfile" - cat < "$tmpdepfile" >> "$depfile" - sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -msvisualcpp) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - - IFS=" " - for arg - do - case "$arg" in - -o) - shift - ;; - $object) - shift - ;; - "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI") - set fnord "$@" - shift - shift - ;; - *) - set fnord "$@" "$arg" - shift - shift - ;; - esac - done - "$@" -E 2>/dev/null | - sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::\1:p' | $cygpath_u | sort -u > "$tmpdepfile" - rm -f "$depfile" - echo "$object : \\" > "$depfile" - sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::'"$tab"'\1 \\:p' >> "$depfile" - echo "$tab" >> "$depfile" - sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::\1\::p' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -msvcmsys) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -none) - exec "$@" - ;; - -*) - echo "Unknown depmode $depmode" 1>&2 - exit 1 - ;; -esac - -exit 0 - -# Local Variables: -# mode: shell-script -# sh-indentation: 2 -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" -# time-stamp-end: "; # UTC" -# End: diff --git a/include/gf_cpu.h b/include/gf_cpu.h new file mode 100644 index 0000000..71c7227 --- /dev/null +++ b/include/gf_cpu.h @@ -0,0 +1,20 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_cpu.h + * + * Identifies whether the CPU supports SIMD instructions at runtime. + */ + +#pragma once + +extern int gf_cpu_supports_intel_pclmul; +extern int gf_cpu_supports_intel_sse4; +extern int gf_cpu_supports_intel_ssse3; +extern int gf_cpu_supports_intel_sse3; +extern int gf_cpu_supports_intel_sse2; +extern int gf_cpu_supports_arm_neon; + +void gf_cpu_identify(void); diff --git a/include/gf_int.h b/include/gf_int.h index 32866f4..0356920 100644 --- a/include/gf_int.h +++ b/include/gf_int.h @@ -30,8 +30,24 @@ typedef struct { int arg2; gf_t *base_gf; void *private; +#ifdef DEBUG_FUNCTIONS + const char *multiply; + const char *divide; + const char *inverse; + const char *multiply_region; + const char *extract_word; +#endif } gf_internal_t; +#ifdef DEBUG_FUNCTIONS +#define SET_FUNCTION(gf,method,size,func) \ + { (gf)->method.size = (func); \ + ((gf_internal_t*)(gf)->scratch)->method = #func; } +#else +#define SET_FUNCTION(gf,method,size,func) \ + (gf)->method.size = (func); +#endif + extern int gf_w4_init (gf_t *gf); extern int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); diff --git a/m4/ax_ext.m4 b/m4/ax_ext.m4 index c03ccef..95c4dbe 100644 --- a/m4/ax_ext.m4 +++ b/m4/ax_ext.m4 @@ -1,40 +1,7 @@ # -# Updated by KMG to support -DINTEL_SSE for GF-Complete +# This macro is based on http://www.gnu.org/software/autoconf-archive/ax_ext.html +# but simplified to do compile time SIMD checks only # -# =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_ext.html -# =========================================================================== -# -# SYNOPSIS -# -# AX_EXT -# -# DESCRIPTION -# -# Find supported SIMD extensions by requesting cpuid. When an SIMD -# extension is found, the -m"simdextensionname" is added to SIMD_FLAGS if -# compiler supports it. For example, if "sse2" is available, then "-msse2" -# is added to SIMD_FLAGS. -# -# This macro calls: -# -# AC_SUBST(SIMD_FLAGS) -# -# And defines: -# -# HAVE_MMX / HAVE_SSE / HAVE_SSE2 / HAVE_SSE3 / HAVE_SSSE3 / HAVE_SSE4.1 / HAVE_SSE4.2 / HAVE_AVX -# -# LICENSE -# -# Copyright (c) 2007 Christophe Tournayre <turn3r@users.sourceforge.net> -# Copyright (c) 2013 Michael Petch <mpetch@capp-sysware.com> -# -# Copying and distribution of this file, with or without modification, are -# permitted in any medium without royalty provided the copyright notice -# and this notice are preserved. This file is offered as-is, without any -# warranty. - -#serial 12 AC_DEFUN([AX_EXT], [ @@ -45,263 +12,63 @@ AC_DEFUN([AX_EXT], AC_DEFINE(HAVE_ARCH_AARCH64,,[targeting AArch64]) SIMD_FLAGS="$SIMD_FLAGS -DARCH_AARCH64" - AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext], - [ - # TODO: detect / cross-compile - ax_cv_have_neon_ext=yes - ]) - AC_CACHE_CHECK([whether cryptographic extension is supported], [ax_cv_have_arm_crypt_ext], - [ - # TODO: detect / cross-compile - ax_cv_have_arm_crypt_ext=yes - ]) - - if test "$ax_cv_have_arm_crypt_ext" = yes; then - AC_DEFINE(HAVE_ARM_CRYPT_EXT,,[Support ARM cryptographic extension]) - fi - + AC_CACHE_CHECK([whether NEON is enabled], [ax_cv_have_neon_ext], [ax_cv_have_neon_ext=yes]) if test "$ax_cv_have_neon_ext" = yes; then - AC_DEFINE(HAVE_NEON,,[Support NEON instructions]) + AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, [SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd -DARM_NEON"], [ax_cv_have_neon_ext=no]) fi - - if test "$ax_cv_have_arm_crypt_ext" = yes && test "$ax_cv_have_neon_ext" = yes; then - AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd+crypto, - SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd+crypto -DARM_CRYPT -DARM_NEON", []) - elif test "$ax_cv_have_arm_crypt_ext" = yes; then - AX_CHECK_COMPILE_FLAG(-march=armv8-a+crypto, - SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+crypto -DARM_CRYPT", []) - elif test "$ax_cv_have_neon_ext" = yes; then - AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, - SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd -DARM_NEON", []) - fi - ;; + ;; arm*) - AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext], - [ - # TODO: detect / cross-compile - ax_cv_have_neon_ext=yes - ]) - + AC_CACHE_CHECK([whether NEON is enabled], [ax_cv_have_neon_ext], [ax_cv_have_neon_ext=yes]) if test "$ax_cv_have_neon_ext" = yes; then - AC_DEFINE(HAVE_NEON,,[Support NEON instructions]) - AX_CHECK_COMPILE_FLAG(-mfpu=neon, - SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon -DARM_NEON", []) + AX_CHECK_COMPILE_FLAG(-mfpu=neon, [SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon -DARM_NEON"], [ax_cv_have_neon_ext=no]) fi - ;; + ;; powerpc*) - AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext], - [ - if test `/usr/sbin/sysctl -a 2>/dev/null| grep -c hw.optional.altivec` != 0; then - if test `/usr/sbin/sysctl -n hw.optional.altivec` = 1; then - ax_cv_have_altivec_ext=yes - fi - fi - ]) - - if test "$ax_cv_have_altivec_ext" = yes; then - AC_DEFINE(HAVE_ALTIVEC,,[Support Altivec instructions]) - AX_CHECK_COMPILE_FLAG(-faltivec, SIMD_FLAGS="$SIMD_FLAGS -faltivec", []) - fi - ;; - - - i[[3456]]86*|x86_64*|amd64*) - - AC_REQUIRE([AX_GCC_X86_CPUID]) - AC_REQUIRE([AX_GCC_X86_AVX_XGETBV]) - - AX_GCC_X86_CPUID(0x00000001) - ecx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 3` - edx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 4` - - AC_CACHE_CHECK([whether mmx is supported], [ax_cv_have_mmx_ext], - [ - ax_cv_have_mmx_ext=no - if test "$((0x$edx>>23&0x01))" = 1; then - ax_cv_have_mmx_ext=yes - fi - ]) - - AC_CACHE_CHECK([whether sse is supported], [ax_cv_have_sse_ext], - [ - ax_cv_have_sse_ext=no - if test "$((0x$edx>>25&0x01))" = 1; then - ax_cv_have_sse_ext=yes - fi - ]) - - AC_CACHE_CHECK([whether sse2 is supported], [ax_cv_have_sse2_ext], - [ - ax_cv_have_sse2_ext=no - if test "$((0x$edx>>26&0x01))" = 1; then - ax_cv_have_sse2_ext=yes - fi - ]) - - AC_CACHE_CHECK([whether sse3 is supported], [ax_cv_have_sse3_ext], - [ - ax_cv_have_sse3_ext=no - if test "$((0x$ecx&0x01))" = 1; then - ax_cv_have_sse3_ext=yes - fi - ]) - - AC_CACHE_CHECK([whether pclmuldq is supported], [ax_cv_have_pclmuldq_ext], - [ - ax_cv_have_pclmuldq_ext=no - if test "$((0x$ecx>>1&0x01))" = 1; then - ax_cv_have_pclmuldq_ext=yes - fi - ]) - - AC_CACHE_CHECK([whether ssse3 is supported], [ax_cv_have_ssse3_ext], - [ - ax_cv_have_ssse3_ext=no - if test "$((0x$ecx>>9&0x01))" = 1; then - ax_cv_have_ssse3_ext=yes - fi - ]) - - AC_CACHE_CHECK([whether sse4.1 is supported], [ax_cv_have_sse41_ext], - [ - ax_cv_have_sse41_ext=no - if test "$((0x$ecx>>19&0x01))" = 1; then - ax_cv_have_sse41_ext=yes - fi - ]) - - AC_CACHE_CHECK([whether sse4.2 is supported], [ax_cv_have_sse42_ext], - [ - ax_cv_have_sse42_ext=no - if test "$((0x$ecx>>20&0x01))" = 1; then - ax_cv_have_sse42_ext=yes - fi - ]) - - AC_CACHE_CHECK([whether avx is supported by processor], [ax_cv_have_avx_cpu_ext], - [ - ax_cv_have_avx_cpu_ext=no - if test "$((0x$ecx>>28&0x01))" = 1; then - ax_cv_have_avx_cpu_ext=yes - fi - ]) - - if test x"$ax_cv_have_avx_cpu_ext" = x"yes"; then - AX_GCC_X86_AVX_XGETBV(0x00000000) - - xgetbv_eax="0" - if test x"$ax_cv_gcc_x86_avx_xgetbv_0x00000000" != x"unknown"; then - xgetbv_eax=`echo $ax_cv_gcc_x86_avx_xgetbv_0x00000000 | cut -d ":" -f 1` - fi - - AC_CACHE_CHECK([whether avx is supported by operating system], [ax_cv_have_avx_ext], - [ - ax_cv_have_avx_ext=no - - if test "$((0x$ecx>>27&0x01))" = 1; then - if test "$((0x$xgetbv_eax&0x6))" = 6; then - ax_cv_have_avx_ext=yes - fi - fi - ]) - if test x"$ax_cv_have_avx_ext" = x"no"; then - AC_MSG_WARN([Your processor supports AVX, but your operating system doesn't]) - fi + AC_CACHE_CHECK([whether altivec is enabled], [ax_cv_have_altivec_ext], [ax_cv_have_altivec_ext=yes]) + if test "$ax_cv_have_altivec_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-faltivec, [SIMD_FLAGS="$SIMD_FLAGS -faltivec"], [ax_cv_have_altivec_ext=no]) fi + ;; - if test "$ax_cv_have_mmx_ext" = yes; then - AX_CHECK_COMPILE_FLAG(-mmmx, ax_cv_support_mmx_ext=yes, []) - if test x"$ax_cv_support_mmx_ext" = x"yes"; then - SIMD_FLAGS="$SIMD_FLAGS -mmmx" - AC_DEFINE(HAVE_MMX,,[Support mmx instructions]) - else - AC_MSG_WARN([Your processor supports mmx instructions but not your compiler, can you try another compiler?]) - fi - fi + i[[3456]]86*|x86_64*|amd64*) + AC_CACHE_CHECK([whether sse is enabled], [ax_cv_have_sse_ext], [ax_cv_have_sse_ext=yes]) if test "$ax_cv_have_sse_ext" = yes; then - AX_CHECK_COMPILE_FLAG(-msse, ax_cv_support_sse_ext=yes, []) - if test x"$ax_cv_support_sse_ext" = x"yes"; then - SIMD_FLAGS="$SIMD_FLAGS -msse -DINTEL_SSE" - AC_DEFINE(HAVE_SSE,,[Support SSE (Streaming SIMD Extensions) instructions]) - else - AC_MSG_WARN([Your processor supports sse instructions but not your compiler, can you try another compiler?]) - fi + AX_CHECK_COMPILE_FLAG(-msse, [SIMD_FLAGS="$SIMD_FLAGS -msse -DINTEL_SSE"], [ax_cv_have_sse_ext=no]) fi + AC_CACHE_CHECK([whether sse2 is enabled], [ax_cv_have_sse2_ext], [ax_cv_have_sse2_ext=yes]) if test "$ax_cv_have_sse2_ext" = yes; then - AX_CHECK_COMPILE_FLAG(-msse2, ax_cv_support_sse2_ext=yes, []) - if test x"$ax_cv_support_sse2_ext" = x"yes"; then - SIMD_FLAGS="$SIMD_FLAGS -msse2 -DINTEL_SSE2" - AC_DEFINE(HAVE_SSE2,,[Support SSE2 (Streaming SIMD Extensions 2) instructions]) - else - AC_MSG_WARN([Your processor supports sse2 instructions but not your compiler, can you try another compiler?]) - fi + AX_CHECK_COMPILE_FLAG(-msse2, [SIMD_FLAGS="$SIMD_FLAGS -msse2 -DINTEL_SSE2"], [ax_cv_have_sse2_ext=no]) fi + AC_CACHE_CHECK([whether sse3 is enabled], [ax_cv_have_sse3_ext], [ax_cv_have_sse3_ext=yes]) if test "$ax_cv_have_sse3_ext" = yes; then - AX_CHECK_COMPILE_FLAG(-msse3, ax_cv_support_sse3_ext=yes, []) - if test x"$ax_cv_support_sse3_ext" = x"yes"; then - SIMD_FLAGS="$SIMD_FLAGS -msse3 -DINTEL_SSE3" - AC_DEFINE(HAVE_SSE3,,[Support SSE3 (Streaming SIMD Extensions 3) instructions]) - else - AC_MSG_WARN([Your processor supports sse3 instructions but not your compiler, can you try another compiler?]) - fi - fi - - if test "$ax_cv_have_pclmuldq_ext" = yes; then - AX_CHECK_COMPILE_FLAG(-mpclmul, ax_cv_support_pclmuldq_ext=yes, []) - if test x"$ax_cv_support_pclmuldq_ext" = x"yes"; then - SIMD_FLAGS="$SIMD_FLAGS -mpclmul -DINTEL_SSE4_PCLMUL" - AC_DEFINE(HAVE_PCLMULDQ,,[Support (PCLMULDQ) Carry-Free Muliplication]) - else - AC_MSG_WARN([Your processor supports pclmuldq instructions but not your compiler, can you try another compiler?]) - fi + AX_CHECK_COMPILE_FLAG(-msse3, [SIMD_FLAGS="$SIMD_FLAGS -msse3 -DINTEL_SSE3"], [ax_cv_have_sse3_ext=no]) fi + AC_CACHE_CHECK([whether ssse3 is enabled], [ax_cv_have_ssse3_ext], [ax_cv_have_ssse3_ext=yes]) if test "$ax_cv_have_ssse3_ext" = yes; then - AX_CHECK_COMPILE_FLAG(-mssse3, ax_cv_support_ssse3_ext=yes, []) - if test x"$ax_cv_support_ssse3_ext" = x"yes"; then - SIMD_FLAGS="$SIMD_FLAGS -mssse3 -DINTEL_SSSE3" - AC_DEFINE(HAVE_SSSE3,,[Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions]) - else - AC_MSG_WARN([Your processor supports ssse3 instructions but not your compiler, can you try another compiler?]) - fi + AX_CHECK_COMPILE_FLAG(-mssse3, [SIMD_FLAGS="$SIMD_FLAGS -mssse3 -DINTEL_SSSE3"], [ax_cv_have_ssse3_ext=no]) fi - if test "$ax_cv_have_sse41_ext" = yes; then - AX_CHECK_COMPILE_FLAG(-msse4.1, ax_cv_support_sse41_ext=yes, []) - if test x"$ax_cv_support_sse41_ext" = x"yes"; then - SIMD_FLAGS="$SIMD_FLAGS -msse4.1 -DINTEL_SSE4" - AC_DEFINE(HAVE_SSE4_1,,[Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions]) - else - AC_MSG_WARN([Your processor supports sse4.1 instructions but not your compiler, can you try another compiler?]) - fi + AC_CACHE_CHECK([whether pclmuldq is enabled], [ax_cv_have_pclmuldq_ext], [ax_cv_have_pclmuldq_ext=yes]) + if test "$ax_cv_have_pclmuldq_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-mpclmul, [SIMD_FLAGS="$SIMD_FLAGS -mpclmul -DINTEL_SSE4_PCLMUL"], [ax_cv_have_pclmuldq_ext=no]) fi - if test "$ax_cv_have_sse42_ext" = yes; then - AX_CHECK_COMPILE_FLAG(-msse4.2, ax_cv_support_sse42_ext=yes, []) - if test x"$ax_cv_support_sse42_ext" = x"yes"; then - SIMD_FLAGS="$SIMD_FLAGS -msse4.2 -DINTEL_SSE4" - AC_DEFINE(HAVE_SSE4_2,,[Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions]) - else - AC_MSG_WARN([Your processor supports sse4.2 instructions but not your compiler, can you try another compiler?]) - fi + AC_CACHE_CHECK([whether sse4.1 is enabled], [ax_cv_have_sse41_ext], [ax_cv_have_sse41_ext=yes]) + if test "$ax_cv_have_sse41_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-msse4.1, [SIMD_FLAGS="$SIMD_FLAGS -msse4.1 -DINTEL_SSE4"], [ax_cv_have_sse41_ext=no]) fi - if test "$ax_cv_have_avx_ext" = yes; then - AX_CHECK_COMPILE_FLAG(-mavx, ax_cv_support_avx_ext=yes, []) - if test x"$ax_cv_support_avx_ext" = x"yes"; then - SIMD_FLAGS="$SIMD_FLAGS -mavx" - AC_DEFINE(HAVE_AVX,,[Support AVX (Advanced Vector Extensions) instructions]) - else - AC_MSG_WARN([Your processor supports avx instructions but not your compiler, can you try another compiler?]) - fi + AC_CACHE_CHECK([whether sse4.2 is enabled], [ax_cv_have_sse42_ext], [ax_cv_have_sse42_ext=yes]) + if test "$ax_cv_have_sse42_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-msse4.2, [SIMD_FLAGS="$SIMD_FLAGS -msse4.2 -DINTEL_SSE4"], [ax_cv_have_sse42_ext=no]) fi - - ;; + ;; esac AC_SUBST(SIMD_FLAGS) diff --git a/m4/ax_gcc_x86_avx_xgetbv.m4 b/m4/ax_gcc_x86_avx_xgetbv.m4 deleted file mode 100644 index 0624eeb..0000000 --- a/m4/ax_gcc_x86_avx_xgetbv.m4 +++ /dev/null @@ -1,79 +0,0 @@ -# =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_gcc_x86_avx_xgetbv.html -# =========================================================================== -# -# SYNOPSIS -# -# AX_GCC_X86_AVX_XGETBV -# -# DESCRIPTION -# -# On later x86 processors with AVX SIMD support, with gcc or a compiler -# that has a compatible syntax for inline assembly instructions, run a -# small program that executes the xgetbv instruction with input OP. This -# can be used to detect if the OS supports AVX instruction usage. -# -# On output, the values of the eax and edx registers are stored as -# hexadecimal strings as "eax:edx" in the cache variable -# ax_cv_gcc_x86_avx_xgetbv. -# -# If the xgetbv instruction fails (because you are running a -# cross-compiler, or because you are not using gcc, or because you are on -# a processor that doesn't have this instruction), -# ax_cv_gcc_x86_avx_xgetbv_OP is set to the string "unknown". -# -# This macro mainly exists to be used in AX_EXT. -# -# LICENSE -# -# Copyright (c) 2013 Michael Petch <mpetch@capp-sysware.com> -# -# This program is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -# Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. If not, see <http://www.gnu.org/licenses/>. -# -# As a special exception, the respective Autoconf Macro's copyright owner -# gives unlimited permission to copy, distribute and modify the configure -# scripts that are the output of Autoconf when processing the Macro. You -# need not follow the terms of the GNU General Public License when using -# or distributing such scripts, even though portions of the text of the -# Macro appear in them. The GNU General Public License (GPL) does govern -# all other use of the material that constitutes the Autoconf Macro. -# -# This special exception to the GPL applies to versions of the Autoconf -# Macro released by the Autoconf Archive. When you make and distribute a -# modified version of the Autoconf Macro, you may extend this special -# exception to the GPL to apply to your modified version as well. - -#serial 1 - -AC_DEFUN([AX_GCC_X86_AVX_XGETBV], -[AC_REQUIRE([AC_PROG_CC]) -AC_LANG_PUSH([C]) -AC_CACHE_CHECK(for x86-AVX xgetbv $1 output, ax_cv_gcc_x86_avx_xgetbv_$1, - [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdio.h>], [ - int op = $1, eax, edx; - FILE *f; - /* Opcodes for xgetbv */ - __asm__(".byte 0x0f, 0x01, 0xd0" - : "=a" (eax), "=d" (edx) - : "c" (op)); - f = fopen("conftest_xgetbv", "w"); if (!f) return 1; - fprintf(f, "%x:%x\n", eax, edx); - fclose(f); - return 0; -])], - [ax_cv_gcc_x86_avx_xgetbv_$1=`cat conftest_xgetbv`; rm -f conftest_xgetbv], - [ax_cv_gcc_x86_avx_xgetbv_$1=unknown; rm -f conftest_xgetbv], - [ax_cv_gcc_x86_avx_xgetbv_$1=unknown])]) -AC_LANG_POP([C]) -]) diff --git a/m4/ax_gcc_x86_cpuid.m4 b/m4/ax_gcc_x86_cpuid.m4 deleted file mode 100644 index 7d46fee..0000000 --- a/m4/ax_gcc_x86_cpuid.m4 +++ /dev/null @@ -1,79 +0,0 @@ -# =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_gcc_x86_cpuid.html -# =========================================================================== -# -# SYNOPSIS -# -# AX_GCC_X86_CPUID(OP) -# -# DESCRIPTION -# -# On Pentium and later x86 processors, with gcc or a compiler that has a -# compatible syntax for inline assembly instructions, run a small program -# that executes the cpuid instruction with input OP. This can be used to -# detect the CPU type. -# -# On output, the values of the eax, ebx, ecx, and edx registers are stored -# as hexadecimal strings as "eax:ebx:ecx:edx" in the cache variable -# ax_cv_gcc_x86_cpuid_OP. -# -# If the cpuid instruction fails (because you are running a -# cross-compiler, or because you are not using gcc, or because you are on -# a processor that doesn't have this instruction), ax_cv_gcc_x86_cpuid_OP -# is set to the string "unknown". -# -# This macro mainly exists to be used in AX_GCC_ARCHFLAG. -# -# LICENSE -# -# Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu> -# Copyright (c) 2008 Matteo Frigo -# -# This program is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -# Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. If not, see <http://www.gnu.org/licenses/>. -# -# As a special exception, the respective Autoconf Macro's copyright owner -# gives unlimited permission to copy, distribute and modify the configure -# scripts that are the output of Autoconf when processing the Macro. You -# need not follow the terms of the GNU General Public License when using -# or distributing such scripts, even though portions of the text of the -# Macro appear in them. The GNU General Public License (GPL) does govern -# all other use of the material that constitutes the Autoconf Macro. -# -# This special exception to the GPL applies to versions of the Autoconf -# Macro released by the Autoconf Archive. When you make and distribute a -# modified version of the Autoconf Macro, you may extend this special -# exception to the GPL to apply to your modified version as well. - -#serial 7 - -AC_DEFUN([AX_GCC_X86_CPUID], -[AC_REQUIRE([AC_PROG_CC]) -AC_LANG_PUSH([C]) -AC_CACHE_CHECK(for x86 cpuid $1 output, ax_cv_gcc_x86_cpuid_$1, - [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdio.h>], [ - int op = $1, eax, ebx, ecx, edx; - FILE *f; - __asm__("cpuid" - : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) - : "a" (op)); - f = fopen("conftest_cpuid", "w"); if (!f) return 1; - fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx); - fclose(f); - return 0; -])], - [ax_cv_gcc_x86_cpuid_$1=`cat conftest_cpuid`; rm -f conftest_cpuid], - [ax_cv_gcc_x86_cpuid_$1=unknown; rm -f conftest_cpuid], - [ax_cv_gcc_x86_cpuid_$1=unknown])]) -AC_LANG_POP([C]) -]) diff --git a/m4/ltoptions.m4 b/m4/ltoptions.m4 deleted file mode 100644 index 5d9acd8..0000000 --- a/m4/ltoptions.m4 +++ /dev/null @@ -1,384 +0,0 @@ -# Helper functions for option handling. -*- Autoconf -*- -# -# Copyright (C) 2004, 2005, 2007, 2008, 2009 Free Software Foundation, -# Inc. -# Written by Gary V. Vaughan, 2004 -# -# This file is free software; the Free Software Foundation gives -# unlimited permission to copy and/or distribute it, with or without -# modifications, as long as this notice is preserved. - -# serial 7 ltoptions.m4 - -# This is to help aclocal find these macros, as it can't see m4_define. -AC_DEFUN([LTOPTIONS_VERSION], [m4_if([1])]) - - -# _LT_MANGLE_OPTION(MACRO-NAME, OPTION-NAME) -# ------------------------------------------ -m4_define([_LT_MANGLE_OPTION], -[[_LT_OPTION_]m4_bpatsubst($1__$2, [[^a-zA-Z0-9_]], [_])]) - - -# _LT_SET_OPTION(MACRO-NAME, OPTION-NAME) -# --------------------------------------- -# Set option OPTION-NAME for macro MACRO-NAME, and if there is a -# matching handler defined, dispatch to it. Other OPTION-NAMEs are -# saved as a flag. -m4_define([_LT_SET_OPTION], -[m4_define(_LT_MANGLE_OPTION([$1], [$2]))dnl -m4_ifdef(_LT_MANGLE_DEFUN([$1], [$2]), - _LT_MANGLE_DEFUN([$1], [$2]), - [m4_warning([Unknown $1 option `$2'])])[]dnl -]) - - -# _LT_IF_OPTION(MACRO-NAME, OPTION-NAME, IF-SET, [IF-NOT-SET]) -# ------------------------------------------------------------ -# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise. -m4_define([_LT_IF_OPTION], -[m4_ifdef(_LT_MANGLE_OPTION([$1], [$2]), [$3], [$4])]) - - -# _LT_UNLESS_OPTIONS(MACRO-NAME, OPTION-LIST, IF-NOT-SET) -# ------------------------------------------------------- -# Execute IF-NOT-SET unless all options in OPTION-LIST for MACRO-NAME -# are set. -m4_define([_LT_UNLESS_OPTIONS], -[m4_foreach([_LT_Option], m4_split(m4_normalize([$2])), - [m4_ifdef(_LT_MANGLE_OPTION([$1], _LT_Option), - [m4_define([$0_found])])])[]dnl -m4_ifdef([$0_found], [m4_undefine([$0_found])], [$3 -])[]dnl -]) - - -# _LT_SET_OPTIONS(MACRO-NAME, OPTION-LIST) -# ---------------------------------------- -# OPTION-LIST is a space-separated list of Libtool options associated -# with MACRO-NAME. If any OPTION has a matching handler declared with -# LT_OPTION_DEFINE, dispatch to that macro; otherwise complain about -# the unknown option and exit. -m4_defun([_LT_SET_OPTIONS], -[# Set options -m4_foreach([_LT_Option], m4_split(m4_normalize([$2])), - [_LT_SET_OPTION([$1], _LT_Option)]) - -m4_if([$1],[LT_INIT],[ - dnl - dnl Simply set some default values (i.e off) if boolean options were not - dnl specified: - _LT_UNLESS_OPTIONS([LT_INIT], [dlopen], [enable_dlopen=no - ]) - _LT_UNLESS_OPTIONS([LT_INIT], [win32-dll], [enable_win32_dll=no - ]) - dnl - dnl If no reference was made to various pairs of opposing options, then - dnl we run the default mode handler for the pair. For example, if neither - dnl `shared' nor `disable-shared' was passed, we enable building of shared - dnl archives by default: - _LT_UNLESS_OPTIONS([LT_INIT], [shared disable-shared], [_LT_ENABLE_SHARED]) - _LT_UNLESS_OPTIONS([LT_INIT], [static disable-static], [_LT_ENABLE_STATIC]) - _LT_UNLESS_OPTIONS([LT_INIT], [pic-only no-pic], [_LT_WITH_PIC]) - _LT_UNLESS_OPTIONS([LT_INIT], [fast-install disable-fast-install], - [_LT_ENABLE_FAST_INSTALL]) - ]) -])# _LT_SET_OPTIONS - - -## --------------------------------- ## -## Macros to handle LT_INIT options. ## -## --------------------------------- ## - -# _LT_MANGLE_DEFUN(MACRO-NAME, OPTION-NAME) -# ----------------------------------------- -m4_define([_LT_MANGLE_DEFUN], -[[_LT_OPTION_DEFUN_]m4_bpatsubst(m4_toupper([$1__$2]), [[^A-Z0-9_]], [_])]) - - -# LT_OPTION_DEFINE(MACRO-NAME, OPTION-NAME, CODE) -# ----------------------------------------------- -m4_define([LT_OPTION_DEFINE], -[m4_define(_LT_MANGLE_DEFUN([$1], [$2]), [$3])[]dnl -])# LT_OPTION_DEFINE - - -# dlopen -# ------ -LT_OPTION_DEFINE([LT_INIT], [dlopen], [enable_dlopen=yes -]) - -AU_DEFUN([AC_LIBTOOL_DLOPEN], -[_LT_SET_OPTION([LT_INIT], [dlopen]) -AC_DIAGNOSE([obsolete], -[$0: Remove this warning and the call to _LT_SET_OPTION when you -put the `dlopen' option into LT_INIT's first parameter.]) -]) - -dnl aclocal-1.4 backwards compatibility: -dnl AC_DEFUN([AC_LIBTOOL_DLOPEN], []) - - -# win32-dll -# --------- -# Declare package support for building win32 dll's. -LT_OPTION_DEFINE([LT_INIT], [win32-dll], -[enable_win32_dll=yes - -case $host in -*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-cegcc*) - AC_CHECK_TOOL(AS, as, false) - AC_CHECK_TOOL(DLLTOOL, dlltool, false) - AC_CHECK_TOOL(OBJDUMP, objdump, false) - ;; -esac - -test -z "$AS" && AS=as -_LT_DECL([], [AS], [1], [Assembler program])dnl - -test -z "$DLLTOOL" && DLLTOOL=dlltool -_LT_DECL([], [DLLTOOL], [1], [DLL creation program])dnl - -test -z "$OBJDUMP" && OBJDUMP=objdump -_LT_DECL([], [OBJDUMP], [1], [Object dumper program])dnl -])# win32-dll - -AU_DEFUN([AC_LIBTOOL_WIN32_DLL], -[AC_REQUIRE([AC_CANONICAL_HOST])dnl -_LT_SET_OPTION([LT_INIT], [win32-dll]) -AC_DIAGNOSE([obsolete], -[$0: Remove this warning and the call to _LT_SET_OPTION when you -put the `win32-dll' option into LT_INIT's first parameter.]) -]) - -dnl aclocal-1.4 backwards compatibility: -dnl AC_DEFUN([AC_LIBTOOL_WIN32_DLL], []) - - -# _LT_ENABLE_SHARED([DEFAULT]) -# ---------------------------- -# implement the --enable-shared flag, and supports the `shared' and -# `disable-shared' LT_INIT options. -# DEFAULT is either `yes' or `no'. If omitted, it defaults to `yes'. -m4_define([_LT_ENABLE_SHARED], -[m4_define([_LT_ENABLE_SHARED_DEFAULT], [m4_if($1, no, no, yes)])dnl -AC_ARG_ENABLE([shared], - [AS_HELP_STRING([--enable-shared@<:@=PKGS@:>@], - [build shared libraries @<:@default=]_LT_ENABLE_SHARED_DEFAULT[@:>@])], - [p=${PACKAGE-default} - case $enableval in - yes) enable_shared=yes ;; - no) enable_shared=no ;; - *) - enable_shared=no - # Look at the argument we got. We use all the common list separators. - lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR," - for pkg in $enableval; do - IFS="$lt_save_ifs" - if test "X$pkg" = "X$p"; then - enable_shared=yes - fi - done - IFS="$lt_save_ifs" - ;; - esac], - [enable_shared=]_LT_ENABLE_SHARED_DEFAULT) - - _LT_DECL([build_libtool_libs], [enable_shared], [0], - [Whether or not to build shared libraries]) -])# _LT_ENABLE_SHARED - -LT_OPTION_DEFINE([LT_INIT], [shared], [_LT_ENABLE_SHARED([yes])]) -LT_OPTION_DEFINE([LT_INIT], [disable-shared], [_LT_ENABLE_SHARED([no])]) - -# Old names: -AC_DEFUN([AC_ENABLE_SHARED], -[_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[shared]) -]) - -AC_DEFUN([AC_DISABLE_SHARED], -[_LT_SET_OPTION([LT_INIT], [disable-shared]) -]) - -AU_DEFUN([AM_ENABLE_SHARED], [AC_ENABLE_SHARED($@)]) -AU_DEFUN([AM_DISABLE_SHARED], [AC_DISABLE_SHARED($@)]) - -dnl aclocal-1.4 backwards compatibility: -dnl AC_DEFUN([AM_ENABLE_SHARED], []) -dnl AC_DEFUN([AM_DISABLE_SHARED], []) - - - -# _LT_ENABLE_STATIC([DEFAULT]) -# ---------------------------- -# implement the --enable-static flag, and support the `static' and -# `disable-static' LT_INIT options. -# DEFAULT is either `yes' or `no'. If omitted, it defaults to `yes'. -m4_define([_LT_ENABLE_STATIC], -[m4_define([_LT_ENABLE_STATIC_DEFAULT], [m4_if($1, no, no, yes)])dnl -AC_ARG_ENABLE([static], - [AS_HELP_STRING([--enable-static@<:@=PKGS@:>@], - [build static libraries @<:@default=]_LT_ENABLE_STATIC_DEFAULT[@:>@])], - [p=${PACKAGE-default} - case $enableval in - yes) enable_static=yes ;; - no) enable_static=no ;; - *) - enable_static=no - # Look at the argument we got. We use all the common list separators. - lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR," - for pkg in $enableval; do - IFS="$lt_save_ifs" - if test "X$pkg" = "X$p"; then - enable_static=yes - fi - done - IFS="$lt_save_ifs" - ;; - esac], - [enable_static=]_LT_ENABLE_STATIC_DEFAULT) - - _LT_DECL([build_old_libs], [enable_static], [0], - [Whether or not to build static libraries]) -])# _LT_ENABLE_STATIC - -LT_OPTION_DEFINE([LT_INIT], [static], [_LT_ENABLE_STATIC([yes])]) -LT_OPTION_DEFINE([LT_INIT], [disable-static], [_LT_ENABLE_STATIC([no])]) - -# Old names: -AC_DEFUN([AC_ENABLE_STATIC], -[_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[static]) -]) - -AC_DEFUN([AC_DISABLE_STATIC], -[_LT_SET_OPTION([LT_INIT], [disable-static]) -]) - -AU_DEFUN([AM_ENABLE_STATIC], [AC_ENABLE_STATIC($@)]) -AU_DEFUN([AM_DISABLE_STATIC], [AC_DISABLE_STATIC($@)]) - -dnl aclocal-1.4 backwards compatibility: -dnl AC_DEFUN([AM_ENABLE_STATIC], []) -dnl AC_DEFUN([AM_DISABLE_STATIC], []) - - - -# _LT_ENABLE_FAST_INSTALL([DEFAULT]) -# ---------------------------------- -# implement the --enable-fast-install flag, and support the `fast-install' -# and `disable-fast-install' LT_INIT options. -# DEFAULT is either `yes' or `no'. If omitted, it defaults to `yes'. -m4_define([_LT_ENABLE_FAST_INSTALL], -[m4_define([_LT_ENABLE_FAST_INSTALL_DEFAULT], [m4_if($1, no, no, yes)])dnl -AC_ARG_ENABLE([fast-install], - [AS_HELP_STRING([--enable-fast-install@<:@=PKGS@:>@], - [optimize for fast installation @<:@default=]_LT_ENABLE_FAST_INSTALL_DEFAULT[@:>@])], - [p=${PACKAGE-default} - case $enableval in - yes) enable_fast_install=yes ;; - no) enable_fast_install=no ;; - *) - enable_fast_install=no - # Look at the argument we got. We use all the common list separators. - lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR," - for pkg in $enableval; do - IFS="$lt_save_ifs" - if test "X$pkg" = "X$p"; then - enable_fast_install=yes - fi - done - IFS="$lt_save_ifs" - ;; - esac], - [enable_fast_install=]_LT_ENABLE_FAST_INSTALL_DEFAULT) - -_LT_DECL([fast_install], [enable_fast_install], [0], - [Whether or not to optimize for fast installation])dnl -])# _LT_ENABLE_FAST_INSTALL - -LT_OPTION_DEFINE([LT_INIT], [fast-install], [_LT_ENABLE_FAST_INSTALL([yes])]) -LT_OPTION_DEFINE([LT_INIT], [disable-fast-install], [_LT_ENABLE_FAST_INSTALL([no])]) - -# Old names: -AU_DEFUN([AC_ENABLE_FAST_INSTALL], -[_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[fast-install]) -AC_DIAGNOSE([obsolete], -[$0: Remove this warning and the call to _LT_SET_OPTION when you put -the `fast-install' option into LT_INIT's first parameter.]) -]) - -AU_DEFUN([AC_DISABLE_FAST_INSTALL], -[_LT_SET_OPTION([LT_INIT], [disable-fast-install]) -AC_DIAGNOSE([obsolete], -[$0: Remove this warning and the call to _LT_SET_OPTION when you put -the `disable-fast-install' option into LT_INIT's first parameter.]) -]) - -dnl aclocal-1.4 backwards compatibility: -dnl AC_DEFUN([AC_ENABLE_FAST_INSTALL], []) -dnl AC_DEFUN([AM_DISABLE_FAST_INSTALL], []) - - -# _LT_WITH_PIC([MODE]) -# -------------------- -# implement the --with-pic flag, and support the `pic-only' and `no-pic' -# LT_INIT options. -# MODE is either `yes' or `no'. If omitted, it defaults to `both'. -m4_define([_LT_WITH_PIC], -[AC_ARG_WITH([pic], - [AS_HELP_STRING([--with-pic@<:@=PKGS@:>@], - [try to use only PIC/non-PIC objects @<:@default=use both@:>@])], - [lt_p=${PACKAGE-default} - case $withval in - yes|no) pic_mode=$withval ;; - *) - pic_mode=default - # Look at the argument we got. We use all the common list separators. - lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR," - for lt_pkg in $withval; do - IFS="$lt_save_ifs" - if test "X$lt_pkg" = "X$lt_p"; then - pic_mode=yes - fi - done - IFS="$lt_save_ifs" - ;; - esac], - [pic_mode=default]) - -test -z "$pic_mode" && pic_mode=m4_default([$1], [default]) - -_LT_DECL([], [pic_mode], [0], [What type of objects to build])dnl -])# _LT_WITH_PIC - -LT_OPTION_DEFINE([LT_INIT], [pic-only], [_LT_WITH_PIC([yes])]) -LT_OPTION_DEFINE([LT_INIT], [no-pic], [_LT_WITH_PIC([no])]) - -# Old name: -AU_DEFUN([AC_LIBTOOL_PICMODE], -[_LT_SET_OPTION([LT_INIT], [pic-only]) -AC_DIAGNOSE([obsolete], -[$0: Remove this warning and the call to _LT_SET_OPTION when you -put the `pic-only' option into LT_INIT's first parameter.]) -]) - -dnl aclocal-1.4 backwards compatibility: -dnl AC_DEFUN([AC_LIBTOOL_PICMODE], []) - -## ----------------- ## -## LTDL_INIT Options ## -## ----------------- ## - -m4_define([_LTDL_MODE], []) -LT_OPTION_DEFINE([LTDL_INIT], [nonrecursive], - [m4_define([_LTDL_MODE], [nonrecursive])]) -LT_OPTION_DEFINE([LTDL_INIT], [recursive], - [m4_define([_LTDL_MODE], [recursive])]) -LT_OPTION_DEFINE([LTDL_INIT], [subproject], - [m4_define([_LTDL_MODE], [subproject])]) - -m4_define([_LTDL_TYPE], []) -LT_OPTION_DEFINE([LTDL_INIT], [installable], - [m4_define([_LTDL_TYPE], [installable])]) -LT_OPTION_DEFINE([LTDL_INIT], [convenience], - [m4_define([_LTDL_TYPE], [convenience])]) diff --git a/m4/ltsugar.m4 b/m4/ltsugar.m4 deleted file mode 100644 index 9000a05..0000000 --- a/m4/ltsugar.m4 +++ /dev/null @@ -1,123 +0,0 @@ -# ltsugar.m4 -- libtool m4 base layer. -*-Autoconf-*- -# -# Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc. -# Written by Gary V. Vaughan, 2004 -# -# This file is free software; the Free Software Foundation gives -# unlimited permission to copy and/or distribute it, with or without -# modifications, as long as this notice is preserved. - -# serial 6 ltsugar.m4 - -# This is to help aclocal find these macros, as it can't see m4_define. -AC_DEFUN([LTSUGAR_VERSION], [m4_if([0.1])]) - - -# lt_join(SEP, ARG1, [ARG2...]) -# ----------------------------- -# Produce ARG1SEPARG2...SEPARGn, omitting [] arguments and their -# associated separator. -# Needed until we can rely on m4_join from Autoconf 2.62, since all earlier -# versions in m4sugar had bugs. -m4_define([lt_join], -[m4_if([$#], [1], [], - [$#], [2], [[$2]], - [m4_if([$2], [], [], [[$2]_])$0([$1], m4_shift(m4_shift($@)))])]) -m4_define([_lt_join], -[m4_if([$#$2], [2], [], - [m4_if([$2], [], [], [[$1$2]])$0([$1], m4_shift(m4_shift($@)))])]) - - -# lt_car(LIST) -# lt_cdr(LIST) -# ------------ -# Manipulate m4 lists. -# These macros are necessary as long as will still need to support -# Autoconf-2.59 which quotes differently. -m4_define([lt_car], [[$1]]) -m4_define([lt_cdr], -[m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])], - [$#], 1, [], - [m4_dquote(m4_shift($@))])]) -m4_define([lt_unquote], $1) - - -# lt_append(MACRO-NAME, STRING, [SEPARATOR]) -# ------------------------------------------ -# Redefine MACRO-NAME to hold its former content plus `SEPARATOR'`STRING'. -# Note that neither SEPARATOR nor STRING are expanded; they are appended -# to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked). -# No SEPARATOR is output if MACRO-NAME was previously undefined (different -# than defined and empty). -# -# This macro is needed until we can rely on Autoconf 2.62, since earlier -# versions of m4sugar mistakenly expanded SEPARATOR but not STRING. -m4_define([lt_append], -[m4_define([$1], - m4_ifdef([$1], [m4_defn([$1])[$3]])[$2])]) - - - -# lt_combine(SEP, PREFIX-LIST, INFIX, SUFFIX1, [SUFFIX2...]) -# ---------------------------------------------------------- -# Produce a SEP delimited list of all paired combinations of elements of -# PREFIX-LIST with SUFFIX1 through SUFFIXn. Each element of the list -# has the form PREFIXmINFIXSUFFIXn. -# Needed until we can rely on m4_combine added in Autoconf 2.62. -m4_define([lt_combine], -[m4_if(m4_eval([$# > 3]), [1], - [m4_pushdef([_Lt_sep], [m4_define([_Lt_sep], m4_defn([lt_car]))])]]dnl -[[m4_foreach([_Lt_prefix], [$2], - [m4_foreach([_Lt_suffix], - ]m4_dquote(m4_dquote(m4_shift(m4_shift(m4_shift($@)))))[, - [_Lt_sep([$1])[]m4_defn([_Lt_prefix])[$3]m4_defn([_Lt_suffix])])])])]) - - -# lt_if_append_uniq(MACRO-NAME, VARNAME, [SEPARATOR], [UNIQ], [NOT-UNIQ]) -# ----------------------------------------------------------------------- -# Iff MACRO-NAME does not yet contain VARNAME, then append it (delimited -# by SEPARATOR if supplied) and expand UNIQ, else NOT-UNIQ. -m4_define([lt_if_append_uniq], -[m4_ifdef([$1], - [m4_if(m4_index([$3]m4_defn([$1])[$3], [$3$2$3]), [-1], - [lt_append([$1], [$2], [$3])$4], - [$5])], - [lt_append([$1], [$2], [$3])$4])]) - - -# lt_dict_add(DICT, KEY, VALUE) -# ----------------------------- -m4_define([lt_dict_add], -[m4_define([$1($2)], [$3])]) - - -# lt_dict_add_subkey(DICT, KEY, SUBKEY, VALUE) -# -------------------------------------------- -m4_define([lt_dict_add_subkey], -[m4_define([$1($2:$3)], [$4])]) - - -# lt_dict_fetch(DICT, KEY, [SUBKEY]) -# ---------------------------------- -m4_define([lt_dict_fetch], -[m4_ifval([$3], - m4_ifdef([$1($2:$3)], [m4_defn([$1($2:$3)])]), - m4_ifdef([$1($2)], [m4_defn([$1($2)])]))]) - - -# lt_if_dict_fetch(DICT, KEY, [SUBKEY], VALUE, IF-TRUE, [IF-FALSE]) -# ----------------------------------------------------------------- -m4_define([lt_if_dict_fetch], -[m4_if(lt_dict_fetch([$1], [$2], [$3]), [$4], - [$5], - [$6])]) - - -# lt_dict_filter(DICT, [SUBKEY], VALUE, [SEPARATOR], KEY, [...]) -# -------------------------------------------------------------- -m4_define([lt_dict_filter], -[m4_if([$5], [], [], - [lt_join(m4_quote(m4_default([$4], [[, ]])), - lt_unquote(m4_split(m4_normalize(m4_foreach(_Lt_key, lt_car([m4_shiftn(4, $@)]), - [lt_if_dict_fetch([$1], _Lt_key, [$2], [$3], [_Lt_key ])])))))])[]dnl -]) diff --git a/m4/lt~obsolete.m4 b/m4/lt~obsolete.m4 deleted file mode 100644 index c573da9..0000000 --- a/m4/lt~obsolete.m4 +++ /dev/null @@ -1,98 +0,0 @@ -# lt~obsolete.m4 -- aclocal satisfying obsolete definitions. -*-Autoconf-*- -# -# Copyright (C) 2004, 2005, 2007, 2009 Free Software Foundation, Inc. -# Written by Scott James Remnant, 2004. -# -# This file is free software; the Free Software Foundation gives -# unlimited permission to copy and/or distribute it, with or without -# modifications, as long as this notice is preserved. - -# serial 5 lt~obsolete.m4 - -# These exist entirely to fool aclocal when bootstrapping libtool. -# -# In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN) -# which have later been changed to m4_define as they aren't part of the -# exported API, or moved to Autoconf or Automake where they belong. -# -# The trouble is, aclocal is a bit thick. It'll see the old AC_DEFUN -# in /usr/share/aclocal/libtool.m4 and remember it, then when it sees us -# using a macro with the same name in our local m4/libtool.m4 it'll -# pull the old libtool.m4 in (it doesn't see our shiny new m4_define -# and doesn't know about Autoconf macros at all.) -# -# So we provide this file, which has a silly filename so it's always -# included after everything else. This provides aclocal with the -# AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything -# because those macros already exist, or will be overwritten later. -# We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6. -# -# Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here. -# Yes, that means every name once taken will need to remain here until -# we give up compatibility with versions before 1.7, at which point -# we need to keep only those names which we still refer to. - -# This is to help aclocal find these macros, as it can't see m4_define. -AC_DEFUN([LTOBSOLETE_VERSION], [m4_if([1])]) - -m4_ifndef([AC_LIBTOOL_LINKER_OPTION], [AC_DEFUN([AC_LIBTOOL_LINKER_OPTION])]) -m4_ifndef([AC_PROG_EGREP], [AC_DEFUN([AC_PROG_EGREP])]) -m4_ifndef([_LT_AC_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_AC_PROG_ECHO_BACKSLASH])]) -m4_ifndef([_LT_AC_SHELL_INIT], [AC_DEFUN([_LT_AC_SHELL_INIT])]) -m4_ifndef([_LT_AC_SYS_LIBPATH_AIX], [AC_DEFUN([_LT_AC_SYS_LIBPATH_AIX])]) -m4_ifndef([_LT_PROG_LTMAIN], [AC_DEFUN([_LT_PROG_LTMAIN])]) -m4_ifndef([_LT_AC_TAGVAR], [AC_DEFUN([_LT_AC_TAGVAR])]) -m4_ifndef([AC_LTDL_ENABLE_INSTALL], [AC_DEFUN([AC_LTDL_ENABLE_INSTALL])]) -m4_ifndef([AC_LTDL_PREOPEN], [AC_DEFUN([AC_LTDL_PREOPEN])]) -m4_ifndef([_LT_AC_SYS_COMPILER], [AC_DEFUN([_LT_AC_SYS_COMPILER])]) -m4_ifndef([_LT_AC_LOCK], [AC_DEFUN([_LT_AC_LOCK])]) -m4_ifndef([AC_LIBTOOL_SYS_OLD_ARCHIVE], [AC_DEFUN([AC_LIBTOOL_SYS_OLD_ARCHIVE])]) -m4_ifndef([_LT_AC_TRY_DLOPEN_SELF], [AC_DEFUN([_LT_AC_TRY_DLOPEN_SELF])]) -m4_ifndef([AC_LIBTOOL_PROG_CC_C_O], [AC_DEFUN([AC_LIBTOOL_PROG_CC_C_O])]) -m4_ifndef([AC_LIBTOOL_SYS_HARD_LINK_LOCKS], [AC_DEFUN([AC_LIBTOOL_SYS_HARD_LINK_LOCKS])]) -m4_ifndef([AC_LIBTOOL_OBJDIR], [AC_DEFUN([AC_LIBTOOL_OBJDIR])]) -m4_ifndef([AC_LTDL_OBJDIR], [AC_DEFUN([AC_LTDL_OBJDIR])]) -m4_ifndef([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH], [AC_DEFUN([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH])]) -m4_ifndef([AC_LIBTOOL_SYS_LIB_STRIP], [AC_DEFUN([AC_LIBTOOL_SYS_LIB_STRIP])]) -m4_ifndef([AC_PATH_MAGIC], [AC_DEFUN([AC_PATH_MAGIC])]) -m4_ifndef([AC_PROG_LD_GNU], [AC_DEFUN([AC_PROG_LD_GNU])]) -m4_ifndef([AC_PROG_LD_RELOAD_FLAG], [AC_DEFUN([AC_PROG_LD_RELOAD_FLAG])]) -m4_ifndef([AC_DEPLIBS_CHECK_METHOD], [AC_DEFUN([AC_DEPLIBS_CHECK_METHOD])]) -m4_ifndef([AC_LIBTOOL_PROG_COMPILER_NO_RTTI], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_NO_RTTI])]) -m4_ifndef([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE], [AC_DEFUN([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE])]) -m4_ifndef([AC_LIBTOOL_PROG_COMPILER_PIC], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_PIC])]) -m4_ifndef([AC_LIBTOOL_PROG_LD_SHLIBS], [AC_DEFUN([AC_LIBTOOL_PROG_LD_SHLIBS])]) -m4_ifndef([AC_LIBTOOL_POSTDEP_PREDEP], [AC_DEFUN([AC_LIBTOOL_POSTDEP_PREDEP])]) -m4_ifndef([LT_AC_PROG_EGREP], [AC_DEFUN([LT_AC_PROG_EGREP])]) -m4_ifndef([LT_AC_PROG_SED], [AC_DEFUN([LT_AC_PROG_SED])]) -m4_ifndef([_LT_CC_BASENAME], [AC_DEFUN([_LT_CC_BASENAME])]) -m4_ifndef([_LT_COMPILER_BOILERPLATE], [AC_DEFUN([_LT_COMPILER_BOILERPLATE])]) -m4_ifndef([_LT_LINKER_BOILERPLATE], [AC_DEFUN([_LT_LINKER_BOILERPLATE])]) -m4_ifndef([_AC_PROG_LIBTOOL], [AC_DEFUN([_AC_PROG_LIBTOOL])]) -m4_ifndef([AC_LIBTOOL_SETUP], [AC_DEFUN([AC_LIBTOOL_SETUP])]) -m4_ifndef([_LT_AC_CHECK_DLFCN], [AC_DEFUN([_LT_AC_CHECK_DLFCN])]) -m4_ifndef([AC_LIBTOOL_SYS_DYNAMIC_LINKER], [AC_DEFUN([AC_LIBTOOL_SYS_DYNAMIC_LINKER])]) -m4_ifndef([_LT_AC_TAGCONFIG], [AC_DEFUN([_LT_AC_TAGCONFIG])]) -m4_ifndef([AC_DISABLE_FAST_INSTALL], [AC_DEFUN([AC_DISABLE_FAST_INSTALL])]) -m4_ifndef([_LT_AC_LANG_CXX], [AC_DEFUN([_LT_AC_LANG_CXX])]) -m4_ifndef([_LT_AC_LANG_F77], [AC_DEFUN([_LT_AC_LANG_F77])]) -m4_ifndef([_LT_AC_LANG_GCJ], [AC_DEFUN([_LT_AC_LANG_GCJ])]) -m4_ifndef([AC_LIBTOOL_LANG_C_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_C_CONFIG])]) -m4_ifndef([_LT_AC_LANG_C_CONFIG], [AC_DEFUN([_LT_AC_LANG_C_CONFIG])]) -m4_ifndef([AC_LIBTOOL_LANG_CXX_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_CXX_CONFIG])]) -m4_ifndef([_LT_AC_LANG_CXX_CONFIG], [AC_DEFUN([_LT_AC_LANG_CXX_CONFIG])]) -m4_ifndef([AC_LIBTOOL_LANG_F77_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_F77_CONFIG])]) -m4_ifndef([_LT_AC_LANG_F77_CONFIG], [AC_DEFUN([_LT_AC_LANG_F77_CONFIG])]) -m4_ifndef([AC_LIBTOOL_LANG_GCJ_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_GCJ_CONFIG])]) -m4_ifndef([_LT_AC_LANG_GCJ_CONFIG], [AC_DEFUN([_LT_AC_LANG_GCJ_CONFIG])]) -m4_ifndef([AC_LIBTOOL_LANG_RC_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_RC_CONFIG])]) -m4_ifndef([_LT_AC_LANG_RC_CONFIG], [AC_DEFUN([_LT_AC_LANG_RC_CONFIG])]) -m4_ifndef([AC_LIBTOOL_CONFIG], [AC_DEFUN([AC_LIBTOOL_CONFIG])]) -m4_ifndef([_LT_AC_FILE_LTDLL_C], [AC_DEFUN([_LT_AC_FILE_LTDLL_C])]) -m4_ifndef([_LT_REQUIRED_DARWIN_CHECKS], [AC_DEFUN([_LT_REQUIRED_DARWIN_CHECKS])]) -m4_ifndef([_LT_AC_PROG_CXXCPP], [AC_DEFUN([_LT_AC_PROG_CXXCPP])]) -m4_ifndef([_LT_PREPARE_SED_QUOTE_VARS], [AC_DEFUN([_LT_PREPARE_SED_QUOTE_VARS])]) -m4_ifndef([_LT_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_PROG_ECHO_BACKSLASH])]) -m4_ifndef([_LT_PROG_F77], [AC_DEFUN([_LT_PROG_F77])]) -m4_ifndef([_LT_PROG_FC], [AC_DEFUN([_LT_PROG_FC])]) -m4_ifndef([_LT_PROG_CXX], [AC_DEFUN([_LT_PROG_CXX])]) diff --git a/src/Makefile.am b/src/Makefile.am index a3bd37a..cfc2a50 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -4,11 +4,21 @@ AUTOMAKE_OPTIONS = subdir-objects AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include -AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare +# avoid using SIMD_FLAGS for code that calls strcmp as new gcc +# versions will use SIMD for the strcmp implementation. Instead +# we create a static library just for gf_method that is not compiled +# with SIMD_FLAGS, this static library will get linked into gf_complete.so +noinst_LTLIBRARIES = libgf_util.la +libgf_util_la_SOURCES = gf_method.c +libgf_util_la_CFLAGS = -O3 -fPIC -Wsign-compare + +# we narrowly use SIMD_FLAGS for code that needs it lib_LTLIBRARIES = libgf_complete.la -libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \ - gf_w64.c gf_w128.c gf_rand.c gf_general.c +libgf_complete_la_SOURCES = gf.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \ + gf_w64.c gf_w128.c gf_rand.c gf_general.c gf_cpu.c +libgf_complete_la_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare +libgf_complete_la_LIBADD = libgf_util.la if HAVE_NEON libgf_complete_la_SOURCES += neon/gf_w4_neon.c \ @@ -12,6 +12,7 @@ #include <stdio.h> #include <stdlib.h> #include <assert.h> +#include "gf_cpu.h" int _gf_errno = GF_E_DEFAULT; @@ -207,20 +208,28 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; } #ifdef INTEL_SSE2 - sse2 = 1; + if (gf_cpu_supports_intel_sse2) { + sse2 = 1; + } #endif #ifdef INTEL_SSSE3 - sse3 = 1; + if (gf_cpu_supports_intel_ssse3) { + sse3 = 1; + } #endif #ifdef INTEL_SSE4_PCLMUL - pclmul = 1; + if (gf_cpu_supports_intel_pclmul) { + pclmul = 1; + } #endif #ifdef ARM_NEON - pclmul = (w == 4 || w == 8); - sse3 = 1; + if (gf_cpu_supports_arm_neon) { + pclmul = (w == 4 || w == 8); + sse3 = 1; + } #endif @@ -473,6 +482,8 @@ int gf_init_hard(gf_t *gf, int w, int mult_type, int sz; gf_internal_t *h; + gf_cpu_identify(); + if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, prim_poly, base_gf) == 0) return 0; @@ -901,9 +912,6 @@ static void gf_unaligned_xor(void *src, void *dest, int bytes); void gf_multby_one(void *src, void *dest, int bytes, int xor) { -#ifdef INTEL_SSE2 - __m128i ms, md; -#endif unsigned long uls, uld; uint8_t *s8, *d8; uint64_t *s64, *d64, *dtop64; @@ -918,84 +926,89 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) uld = (unsigned long) dest; #ifdef INTEL_SSE2 - int abytes; - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - if (uls % 16 == uld % 16) { - gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); - while (s8 != rd.s_start) { - *d8 ^= *s8; - d8++; - s8++; + if (gf_cpu_supports_intel_sse2) { + __m128i ms, md; + int abytes; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + if (uls % 16 == uld % 16) { + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); + while (s8 != rd.s_start) { + *d8 ^= *s8; + d8++; + s8++; + } + while (s8 < (uint8_t *) rd.s_top) { + ms = _mm_load_si128 ((__m128i *)(s8)); + md = _mm_load_si128 ((__m128i *)(d8)); + md = _mm_xor_si128(md, ms); + _mm_store_si128((__m128i *)(d8), md); + s8 += 16; + d8 += 16; + } + while (s8 != (uint8_t *) src + bytes) { + *d8 ^= *s8; + d8++; + s8++; + } + return; } - while (s8 < (uint8_t *) rd.s_top) { - ms = _mm_load_si128 ((__m128i *)(s8)); - md = _mm_load_si128 ((__m128i *)(d8)); + + abytes = (bytes & 0xfffffff0); + + while (d8 < (uint8_t *) dest + abytes) { + ms = _mm_loadu_si128 ((__m128i *)(s8)); + md = _mm_loadu_si128 ((__m128i *)(d8)); md = _mm_xor_si128(md, ms); - _mm_store_si128((__m128i *)(d8), md); + _mm_storeu_si128((__m128i *)(d8), md); s8 += 16; d8 += 16; } - while (s8 != (uint8_t *) src + bytes) { + while (d8 != (uint8_t *) dest+bytes) { *d8 ^= *s8; d8++; s8++; } return; } - - abytes = (bytes & 0xfffffff0); - - while (d8 < (uint8_t *) dest + abytes) { - ms = _mm_loadu_si128 ((__m128i *)(s8)); - md = _mm_loadu_si128 ((__m128i *)(d8)); - md = _mm_xor_si128(md, ms); - _mm_storeu_si128((__m128i *)(d8), md); - s8 += 16; - d8 += 16; - } - while (d8 != (uint8_t *) dest+bytes) { - *d8 ^= *s8; - d8++; - s8++; - } - return; #endif #if defined(ARM_NEON) - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - - if (uls % 16 == uld % 16) { - gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); - while (s8 != rd.s_start) { + if (gf_cpu_supports_arm_neon) { + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + if (uls % 16 == uld % 16) { + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); + while (s8 != rd.s_start) { + *d8 ^= *s8; + s8++; + d8++; + } + while (s8 < (uint8_t *) rd.s_top) { + uint8x16_t vs = vld1q_u8 (s8); + uint8x16_t vd = vld1q_u8 (d8); + uint8x16_t vr = veorq_u8 (vs, vd); + vst1q_u8 (d8, vr); + s8 += 16; + d8 += 16; + } + } else { + while (s8 + 15 < (uint8_t *) src + bytes) { + uint8x16_t vs = vld1q_u8 (s8); + uint8x16_t vd = vld1q_u8 (d8); + uint8x16_t vr = veorq_u8 (vs, vd); + vst1q_u8 (d8, vr); + s8 += 16; + d8 += 16; + } + } + while (s8 < (uint8_t *) src + bytes) { *d8 ^= *s8; s8++; d8++; } - while (s8 < (uint8_t *) rd.s_top) { - uint8x16_t vs = vld1q_u8 (s8); - uint8x16_t vd = vld1q_u8 (d8); - uint8x16_t vr = veorq_u8 (vs, vd); - vst1q_u8 (d8, vr); - s8 += 16; - d8 += 16; - } - } else { - while (s8 + 15 < (uint8_t *) src + bytes) { - uint8x16_t vs = vld1q_u8 (s8); - uint8x16_t vd = vld1q_u8 (d8); - uint8x16_t vr = veorq_u8 (vs, vd); - vst1q_u8 (d8, vr); - s8 += 16; - d8 += 16; - } - } - while (s8 < (uint8_t *) src + bytes) { - *d8 ^= *s8; - s8++; - d8++; + return; } - return; #endif if (uls % 8 != uld % 8) { gf_unaligned_xor(src, dest, bytes); diff --git a/src/gf_cpu.c b/src/gf_cpu.c new file mode 100644 index 0000000..fae2cd5 --- /dev/null +++ b/src/gf_cpu.c @@ -0,0 +1,168 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_cpu.h + * + * Identifies whether the CPU supports SIMD instructions at runtime. + */ + +#include <stdio.h> +#include <stdlib.h> + +int gf_cpu_identified = 0; + +int gf_cpu_supports_intel_pclmul = 0; +int gf_cpu_supports_intel_sse4 = 0; +int gf_cpu_supports_intel_ssse3 = 0; +int gf_cpu_supports_intel_sse3 = 0; +int gf_cpu_supports_intel_sse2 = 0; +int gf_cpu_supports_arm_neon = 0; + +#if defined(__x86_64__) + +#if defined(_MSC_VER) + +#define cpuid(info, x) __cpuidex(info, x, 0) + +#elif defined(__GNUC__) + +#include <cpuid.h> +void cpuid(int info[4], int InfoType){ + __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); +} + +#else + +#error please add a way to detect CPU SIMD support at runtime + +#endif + +void gf_cpu_identify(void) +{ + if (gf_cpu_identified) { + return; + } + + int reg[4]; + + cpuid(reg, 1); + +#if defined(INTEL_SSE4_PCLMUL) + if ((reg[2] & 1) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE4_PCLMUL")) { + gf_cpu_supports_intel_pclmul = 1; +#ifdef DEBUG_CPU_DETECTION + printf("#gf_cpu_supports_intel_pclmul\n"); +#endif + } +#endif + +#if defined(INTEL_SSE4) + if (((reg[2] & (1<<20)) != 0 || (reg[2] & (1<<19)) != 0) && !getenv("GF_COMPLETE_DISABLE_SSE4")) { + gf_cpu_supports_intel_sse4 = 1; +#ifdef DEBUG_CPU_DETECTION + printf("#gf_cpu_supports_intel_sse4\n"); +#endif + } +#endif + +#if defined(INTEL_SSSE3) + if ((reg[2] & (1<<9)) != 0 && !getenv("GF_COMPLETE_DISABLE_SSSE3")) { + gf_cpu_supports_intel_ssse3 = 1; +#ifdef DEBUG_CPU_DETECTION + printf("#gf_cpu_supports_intel_ssse3\n"); +#endif + } +#endif + +#if defined(INTEL_SSE3) + if ((reg[2] & 1) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE3")) { + gf_cpu_supports_intel_sse3 = 1; +#ifdef DEBUG_CPU_DETECTION + printf("#gf_cpu_supports_intel_sse3\n"); +#endif + } +#endif + +#if defined(INTEL_SSE2) + if ((reg[3] & (1<<26)) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE2")) { + gf_cpu_supports_intel_sse2 = 1; +#ifdef DEBUG_CPU_DETECTION + printf("#gf_cpu_supports_intel_sse2\n"); +#endif + } +#endif + + gf_cpu_identified = 1; +} + +#elif defined(__arm__) || defined(__aarch64__) + +#ifdef __linux__ + +#include <stdio.h> +#include <unistd.h> +#include <elf.h> +#include <linux/auxvec.h> +#include <asm/hwcap.h> +#include <fcntl.h> + +unsigned long get_hwcap(unsigned long type) { + unsigned long hwcap = 0; + int fd = open("/proc/self/auxv", O_RDONLY); + if (fd > 0) { + Elf32_auxv_t auxv; + while (read(fd, &auxv, sizeof(Elf32_auxv_t))) { + if (auxv.a_type == type) { + hwcap = auxv.a_un.a_val; + break; + } + } + close(fd); + } + + return hwcap; +} + +#endif // linux + +void gf_cpu_identify(void) +{ + if (gf_cpu_identified) { + return; + } + +#if defined(ARM_NEON) + if (!getenv("GF_COMPLETE_DISABLE_NEON")) { +#if __linux__ && __arm__ + gf_cpu_supports_arm_neon = (get_hwcap(AT_HWCAP) & HWCAP_NEON) > 0; +#elif __aarch64__ + // ASIMD is supported on all aarch64 architectures + gf_cpu_supports_arm_neon = 1; +#else + // we assume that NEON is supported if the compiler supports + // NEON and we dont have a reliable way to detect runtime support. + gf_cpu_supports_arm_neon = 1; +#endif + +#ifdef DEBUG_CPU_DETECTION + if (gf_cpu_supports_arm_neon) { + printf("#gf_cpu_supports_arm_neon\n"); + } +#endif + } +#endif // defined(ARM_NEON) + + gf_cpu_identified = 1; +} + +#else // defined(__arm__) || defined(__aarch64__) + +int gf_cpu_identify(void) +{ + gf_cpu_identified = 1; + return 0; +} + +#endif diff --git a/src/gf_w128.c b/src/gf_w128.c index b1e3d92..74f72e8 100644 --- a/src/gf_w128.c +++ b/src/gf_w128.c @@ -11,6 +11,7 @@ #include "gf_int.h" #include <stdio.h> #include <stdlib.h> +#include "gf_cpu.h" #define GF_FIELD_WIDTH (128) @@ -290,11 +291,11 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12 return; } +#if defined(INTEL_SSE4_PCLMUL) + void gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { -#if defined(INTEL_SSE4_PCLMUL) - __m128i a,b; __m128i result0,result1; __m128i prim_poly; @@ -338,9 +339,8 @@ gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_ c128[0] = (uint64_t)_mm_extract_epi64(result1,1); c128[1] = (uint64_t)_mm_extract_epi64(result1,0); -#endif -return; } +#endif void gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) @@ -376,10 +376,10 @@ gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_ return; } +#if defined(INTEL_SSE4) void gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { -#if defined(INTEL_SSE4) int i; __m128i a, b, pp, prod, amask, u_middle_one; /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/ @@ -427,16 +427,16 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_ } c128[0] = (uint64_t)_mm_extract_epi64(prod, 1); c128[1] = (uint64_t)_mm_extract_epi64(prod, 0); -#endif return; } +#endif /* Ben: This slow function implements sse instrutions for bytwo_b because why not */ +#if defined(INTEL_SSE4) void gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { -#if defined(INTEL_SSE4) __m128i a, b, lmask, hmask, pp, c, middle_one; gf_internal_t *h; uint64_t topbit, middlebit; @@ -471,8 +471,8 @@ gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_ if (middlebit) b = _mm_xor_si128(b, middle_one); if (topbit) b = _mm_xor_si128(b, pp); } -#endif } +#endif void gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) @@ -1146,7 +1146,7 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, } /* a^-1 -> b */ - void +void gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) { uint64_t e_i[2], e_im1[2], e_ip1[2]; @@ -1239,7 +1239,7 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) return; } - void +void gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { uint64_t d[2]; @@ -1248,7 +1248,7 @@ gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val return; } - void +void gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128) { uint64_t one128[2]; @@ -1260,7 +1260,7 @@ gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128) static - void +void gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv) { gf_internal_t *h = (gf_internal_t *) gf->scratch; @@ -1405,14 +1405,14 @@ int gf_w128_composite_init(gf_t *gf) gf_internal_t *h = (gf_internal_t *) gf->scratch; if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w128 = gf_w128_composite_multiply_region_alt; + SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region_alt) } else { - gf->multiply_region.w128 = gf_w128_composite_multiply_region; + SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region) } - gf->multiply.w128 = gf_w128_composite_multiply; - gf->divide.w128 = gf_w128_divide_from_inverse; - gf->inverse.w128 = gf_w128_composite_inverse; + SET_FUNCTION(gf,multiply,w128,gf_w128_composite_multiply) + SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse) + SET_FUNCTION(gf,inverse,w128,gf_w128_composite_inverse) return 1; } @@ -1421,10 +1421,12 @@ static int gf_w128_cfm_init(gf_t *gf) { #if defined(INTEL_SSE4_PCLMUL) - gf->inverse.w128 = gf_w128_euclid; - gf->multiply.w128 = gf_w128_clm_multiply; - gf->multiply_region.w128 = gf_w128_clm_multiply_region_from_single; - return 1; + if (gf_cpu_supports_intel_pclmul) { + SET_FUNCTION(gf,inverse,w128,gf_w128_euclid) + SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply) + SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single) + return 1; + } #endif return 0; @@ -1433,9 +1435,9 @@ int gf_w128_cfm_init(gf_t *gf) static int gf_w128_shift_init(gf_t *gf) { - gf->multiply.w128 = gf_w128_shift_multiply; - gf->inverse.w128 = gf_w128_euclid; - gf->multiply_region.w128 = gf_w128_multiply_region_from_single; + SET_FUNCTION(gf,multiply,w128,gf_w128_shift_multiply) + SET_FUNCTION(gf,inverse,w128,gf_w128_euclid) + SET_FUNCTION(gf,multiply_region,w128,gf_w128_multiply_region_from_single) return 1; } @@ -1446,16 +1448,16 @@ int gf_w128_bytwo_init(gf_t *gf) h = (gf_internal_t *) gf->scratch; if (h->mult_type == GF_MULT_BYTWO_p) { - gf->multiply.w128 = gf_w128_bytwo_p_multiply; - /*gf->multiply.w128 = gf_w128_sse_bytwo_p_multiply;*/ + SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply) + /*SET_FUNCTION(gf,multiply,w128,gf_w128_sse_bytwo_p_multiply)*/ /* John: the sse function is slower.*/ } else { - gf->multiply.w128 = gf_w128_bytwo_b_multiply; - /*gf->multiply.w128 = gf_w128_sse_bytwo_b_multiply; + SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_b_multiply) + /*SET_FUNCTION(gf,multiply,w128,gf_w128_sse_bytwo_b_multiply) Ben: This sse function is also slower. */ } - gf->inverse.w128 = gf_w128_euclid; - gf->multiply_region.w128 = gf_w128_bytwo_b_multiply_region; + SET_FUNCTION(gf,inverse,w128,gf_w128_euclid) + SET_FUNCTION(gf,multiply_region,w128,gf_w128_bytwo_b_multiply_region) return 1; } @@ -1525,20 +1527,20 @@ int gf_w128_split_init(gf_t *gf) h = (gf_internal_t *) gf->scratch; - gf->multiply.w128 = gf_w128_bytwo_p_multiply; + SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply) #if defined(INTEL_SSE4_PCLMUL) - if (!(h->region_type & GF_REGION_NOSIMD)){ - gf->multiply.w128 = gf_w128_clm_multiply; + if (gf_cpu_supports_intel_pclmul && !(h->region_type & GF_REGION_NOSIMD)){ + SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply) } #endif - gf->inverse.w128 = gf_w128_euclid; + SET_FUNCTION(gf,inverse,w128,gf_w128_euclid) if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) { sd8 = (struct gf_w128_split_8_128_data *) h->private; sd8->last_value[0] = 0; sd8->last_value[1] = 0; - gf->multiply_region.w128 = gf_w128_split_8_128_multiply_region; + SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_8_128_multiply_region) } else { sd4 = (struct gf_w128_split_4_128_data *) h->private; sd4->last_value[0] = 0; @@ -1546,23 +1548,19 @@ int gf_w128_split_init(gf_t *gf) if((h->region_type & GF_REGION_ALTMAP)) { #ifdef INTEL_SSE4 - if(!(h->region_type & GF_REGION_NOSIMD)) - gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region; + if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD)) + SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_altmap_multiply_region) else - return 0; - #else - return 0; #endif + return 0; } else { #ifdef INTEL_SSE4 - if(!(h->region_type & GF_REGION_NOSIMD)) - gf->multiply_region.w128 = gf_w128_split_4_128_sse_multiply_region; + if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD)) + SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_multiply_region) else - gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region; - #else - gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region; #endif + SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region) } } return 1; @@ -1586,9 +1584,9 @@ int gf_w128_group_init(gf_t *gf) gt->m_table[2] = 0; gt->m_table[3] = 0; - gf->multiply.w128 = gf_w128_group_multiply; - gf->inverse.w128 = gf_w128_euclid; - gf->multiply_region.w128 = gf_w128_group_multiply_region; + SET_FUNCTION(gf,multiply,w128,gf_w128_group_multiply) + SET_FUNCTION(gf,inverse,w128,gf_w128_euclid) + SET_FUNCTION(gf,multiply_region,w128,gf_w128_group_multiply_region) gf_w128_group_r_init(gf); @@ -1738,10 +1736,10 @@ int gf_w128_init(gf_t *gf) } } - gf->multiply.w128 = NULL; - gf->divide.w128 = NULL; - gf->inverse.w128 = NULL; - gf->multiply_region.w128 = NULL; + SET_FUNCTION(gf,multiply,w128,NULL) + SET_FUNCTION(gf,divide,w128,NULL) + SET_FUNCTION(gf,inverse,w128,NULL) + SET_FUNCTION(gf,multiply_region,w128,NULL) switch(h->mult_type) { case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: if (gf_w128_bytwo_init(gf) == 0) return 0; break; @@ -1757,22 +1755,22 @@ int gf_w128_init(gf_t *gf) /* Ben: Used to be h->region_type == GF_REGION_ALTMAP, but failed since there are multiple flags in h->region_type */ if (h->mult_type == GF_MULT_SPLIT_TABLE && (h->region_type & GF_REGION_ALTMAP)) { - gf->extract_word.w128 = gf_w128_split_extract_word; + SET_FUNCTION(gf,extract_word,w128,gf_w128_split_extract_word) } else if (h->mult_type == GF_MULT_COMPOSITE && h->region_type == GF_REGION_ALTMAP) { - gf->extract_word.w128 = gf_w128_composite_extract_word; + SET_FUNCTION(gf,extract_word,w128,gf_w128_composite_extract_word) } else { - gf->extract_word.w128 = gf_w128_extract_word; + SET_FUNCTION(gf,extract_word,w128,gf_w128_extract_word) } if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w128 = gf_w128_divide_from_inverse; + SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse) } if (gf->inverse.w128 != NULL && gf->divide.w128 == NULL) { - gf->divide.w128 = gf_w128_divide_from_inverse; + SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse) } if (gf->inverse.w128 == NULL && gf->divide.w128 != NULL) { - gf->inverse.w128 = gf_w128_inverse_from_divide; + SET_FUNCTION(gf,inverse,w128,gf_w128_inverse_from_divide) } return 1; } diff --git a/src/gf_w16.c b/src/gf_w16.c index 4e026b2..8316892 100644 --- a/src/gf_w16.c +++ b/src/gf_w16.c @@ -12,6 +12,7 @@ #include <stdio.h> #include <stdlib.h> #include "gf_w16.h" +#include "gf_cpu.h" #define AB2(ip, am1 ,am2, b, t1, t2) {\ t1 = (b << 1) & am1;\ @@ -391,6 +392,7 @@ gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b) extra memory. */ +#if defined(INTEL_SSE4_PCLMUL) static inline gf_val_32_t @@ -398,8 +400,6 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) { gf_val_32_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -433,11 +433,11 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - -#endif return rv; } +#endif +#if defined(INTEL_SSE4_PCLMUL) static inline gf_val_32_t @@ -445,8 +445,6 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) { gf_val_32_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -473,11 +471,11 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - -#endif return rv; } +#endif +#if defined(INTEL_SSE4_PCLMUL) static inline gf_val_32_t @@ -485,8 +483,6 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) { gf_val_32_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -515,10 +511,9 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); - -#endif return rv; } +#endif static @@ -548,7 +543,7 @@ gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) static int gf_w16_shift_init(gf_t *gf) { - gf->multiply.w32 = gf_w16_shift_multiply; + SET_FUNCTION(gf,multiply,w32,gf_w16_shift_multiply) return 1; } @@ -556,25 +551,27 @@ static int gf_w16_cfm_init(gf_t *gf) { #if defined(INTEL_SSE4_PCLMUL) - gf_internal_t *h; + if (gf_cpu_supports_intel_pclmul) { + gf_internal_t *h; - h = (gf_internal_t *) gf->scratch; - - /*Ben: Determining how many reductions to do */ - - if ((0xfe00 & h->prim_poly) == 0) { - gf->multiply.w32 = gf_w16_clm_multiply_2; - gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_2; - } else if((0xf000 & h->prim_poly) == 0) { - gf->multiply.w32 = gf_w16_clm_multiply_3; - gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_3; - } else if ((0xe000 & h->prim_poly) == 0) { - gf->multiply.w32 = gf_w16_clm_multiply_4; - gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_4; - } else { - return 0; - } - return 1; + h = (gf_internal_t *) gf->scratch; + + /*Ben: Determining how many reductions to do */ + + if ((0xfe00 & h->prim_poly) == 0) { + SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_2) + } else if((0xf000 & h->prim_poly) == 0) { + SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_3) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_3) + } else if ((0xe000 & h->prim_poly) == 0) { + SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_4) + } else { + return 0; + } + return 1; + } #endif return 0; @@ -688,10 +685,9 @@ int gf_w16_log_init(gf_t *gf) if (check) { if (h->mult_type != GF_MULT_LOG_TABLE) { - -#if defined(INTEL_SSE4_PCLMUL) - return gf_w16_cfm_init(gf); -#endif + if (gf_cpu_supports_intel_pclmul) { + return gf_w16_cfm_init(gf); + } return gf_w16_shift_init(gf); } else { _gf_errno = GF_E_LOGPOLY; @@ -705,10 +701,10 @@ int gf_w16_log_init(gf_t *gf) ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]]; } - gf->inverse.w32 = gf_w16_log_inverse; - gf->divide.w32 = gf_w16_log_divide; - gf->multiply.w32 = gf_w16_log_multiply; - gf->multiply_region.w32 = gf_w16_log_multiply_region; + SET_FUNCTION(gf,inverse,w32,gf_w16_log_inverse) + SET_FUNCTION(gf,divide,w32,gf_w16_log_divide) + SET_FUNCTION(gf,multiply,w32,gf_w16_log_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_log_multiply_region) return 1; } @@ -948,11 +944,11 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v gf_do_final_region_alignment(&rd); } +#ifdef INTEL_SSSE3 static void gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 uint64_t i, j, *s64, *d64, *top64;; uint64_t c, prod; uint8_t low[4][16]; @@ -1078,14 +1074,14 @@ gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_v } gf_do_final_region_alignment(&rd); -#endif } +#endif +#ifdef INTEL_SSSE3 static void gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 uint64_t i, j, *s64, *d64, *top64;; uint64_t c, prod; uint8_t low[4][16]; @@ -1187,8 +1183,8 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des } gf_do_final_region_alignment(&rd); -#endif } +#endif uint32_t gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) @@ -1216,21 +1212,11 @@ int gf_w16_split_init(gf_t *gf) { gf_internal_t *h; struct gf_w16_split_8_8_data *d8; - int i, j, exp, issse3; - int isneon = 0; + int i, j, exp; uint32_t p, basep, tmp; h = (gf_internal_t *) gf->scratch; -#ifdef INTEL_SSSE3 - issse3 = 1; -#else - issse3 = 0; -#endif -#ifdef ARM_NEON - isneon = 1; -#endif - if (h->arg1 == 8 && h->arg2 == 8) { d8 = (struct gf_w16_split_8_8_data *) h->private; basep = 1; @@ -1260,8 +1246,8 @@ int gf_w16_split_init(gf_t *gf) } for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); } - gf->multiply.w32 = gf_w16_split_8_8_multiply; - gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; + SET_FUNCTION(gf,multiply,w32,gf_w16_split_8_8_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region) return 1; } @@ -1273,36 +1259,45 @@ int gf_w16_split_init(gf_t *gf) /* Defaults */ - if (issse3) { - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region; - } else if (isneon) { -#ifdef ARM_NEON +#ifdef INTEL_SSSE3 + if (gf_cpu_supports_intel_ssse3) { + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_multiply_region) + } else { +#elif ARM_NEON + if (gf_cpu_supports_arm_neon) { gf_w16_neon_split_init(gf); -#endif } else { - gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; +#endif + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region) +#if defined(INTEL_SSSE3) || defined(ARM_NEON) } - +#endif if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) { - gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region) } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) { - if (issse3 || isneon) { +#if defined(INTEL_SSSE3) || defined(ARM_NEON) + if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) { if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region) else if(h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; - else if(h->region_type & GF_REGION_ALTMAP && issse3) - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region; + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region) +#if defined(INTEL_SSSE3) + else if(h->region_type & GF_REGION_ALTMAP && gf_cpu_supports_intel_ssse3) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_altmap_multiply_region) +#endif } else { +#endif if(h->region_type & GF_REGION_SIMD) return 0; else if(h->region_type & GF_REGION_ALTMAP) - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region) else - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region) +#if defined(INTEL_SSSE3) || defined(ARM_NEON) } +#endif } return 1; @@ -1313,7 +1308,7 @@ int gf_w16_table_init(gf_t *gf) { gf_w16_log_init(gf); - gf->multiply_region.w32 = gf_w16_table_lazy_multiply_region; + SET_FUNCTION(gf,multiply_region,w32,gf_w16_table_lazy_multiply_region) return 1; } @@ -1844,28 +1839,30 @@ int gf_w16_bytwo_init(gf_t *gf) } if (h->mult_type == GF_MULT_BYTWO_p) { - gf->multiply.w32 = gf_w16_bytwo_p_multiply; + SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_p_multiply) #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region; - #else - gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_nosse_multiply_region) if(h->region_type & GF_REGION_SIMD) return 0; + #ifdef INTEL_SSE2 + } #endif } else { - gf->multiply.w32 = gf_w16_bytwo_b_multiply; + SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_b_multiply) #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region; - #else - gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_nosse_multiply_region) if(h->region_type & GF_REGION_SIMD) return 0; + #ifdef INTEL_SSE2 + } #endif } @@ -1904,10 +1901,10 @@ int gf_w16_log_zero_init(gf_t *gf) ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]]; } - gf->inverse.w32 = gf_w16_log_zero_inverse; - gf->divide.w32 = gf_w16_log_zero_divide; - gf->multiply.w32 = gf_w16_log_zero_multiply; - gf->multiply_region.w32 = gf_w16_log_zero_multiply_region; + SET_FUNCTION(gf,inverse,w32,gf_w16_log_zero_inverse) + SET_FUNCTION(gf,divide,w32,gf_w16_log_zero_divide) + SET_FUNCTION(gf,multiply,w32,gf_w16_log_zero_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_log_zero_multiply_region) return 1; } @@ -2145,18 +2142,18 @@ int gf_w16_composite_init(gf_t *gf) cd->mult_table = gf_w8_get_mult_table(h->base_gf); if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = gf_w16_composite_multiply_region_alt; + SET_FUNCTION(gf,multiply_region,w32,gf_w16_composite_multiply_region_alt) } else { - gf->multiply_region.w32 = gf_w16_composite_multiply_region; + SET_FUNCTION(gf,multiply_region,w32,gf_w16_composite_multiply_region) } if (cd->mult_table == NULL) { - gf->multiply.w32 = gf_w16_composite_multiply_recursive; + SET_FUNCTION(gf,multiply,w32,gf_w16_composite_multiply_recursive) } else { - gf->multiply.w32 = gf_w16_composite_multiply_inline; + SET_FUNCTION(gf,multiply,w32,gf_w16_composite_multiply_inline) } - gf->divide.w32 = NULL; - gf->inverse.w32 = gf_w16_composite_inverse; + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,gf_w16_composite_inverse) return 1; } @@ -2277,10 +2274,10 @@ int gf_w16_group_init(gf_t *gf) d44->reduce[p>>16] = (p&0xffff); } - gf->multiply.w32 = gf_w16_group_4_4_multiply; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = gf_w16_group_4_4_region_multiply; + SET_FUNCTION(gf,multiply,w32,gf_w16_group_4_4_multiply) + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_group_4_4_region_multiply) return 1; } @@ -2360,10 +2357,10 @@ int gf_w16_init(gf_t *gf) if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly |= (1 << 16); - gf->multiply.w32 = NULL; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = NULL; + SET_FUNCTION(gf,multiply,w32,NULL) + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,multiply_region,w32,NULL) switch(h->mult_type) { case GF_MULT_LOG_ZERO: if (gf_w16_log_zero_init(gf) == 0) return 0; break; @@ -2380,34 +2377,34 @@ int gf_w16_init(gf_t *gf) default: return 0; } if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w32 = gf_w16_divide_from_inverse; - gf->inverse.w32 = gf_w16_euclid; + SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w16_euclid) } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w32 = gf_w16_divide_from_inverse; - gf->inverse.w32 = gf_w16_matrix; + SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w16_matrix) } if (gf->divide.w32 == NULL) { - gf->divide.w32 = gf_w16_divide_from_inverse; - if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_euclid; + SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse) + if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w16_euclid) } - if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_inverse_from_divide; + if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w16_inverse_from_divide) if (h->region_type & GF_REGION_ALTMAP) { if (h->mult_type == GF_MULT_COMPOSITE) { - gf->extract_word.w32 = gf_w16_composite_extract_word; + SET_FUNCTION(gf,extract_word,w32,gf_w16_composite_extract_word) } else { - gf->extract_word.w32 = gf_w16_split_extract_word; + SET_FUNCTION(gf,extract_word,w32,gf_w16_split_extract_word) } } else if (h->region_type == GF_REGION_CAUCHY) { - gf->multiply_region.w32 = gf_wgen_cauchy_region; - gf->extract_word.w32 = gf_wgen_extract_word; + SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region) + SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word) } else { - gf->extract_word.w32 = gf_w16_extract_word; + SET_FUNCTION(gf,extract_word,w32,gf_w16_extract_word) } if (gf->multiply_region.w32 == NULL) { - gf->multiply_region.w32 = gf_w16_multiply_region_from_single; + SET_FUNCTION(gf,multiply_region,w32,gf_w16_multiply_region_from_single) } return 1; } diff --git a/src/gf_w32.c b/src/gf_w32.c index 854a6e4..bb22894 100644 --- a/src/gf_w32.c +++ b/src/gf_w32.c @@ -13,6 +13,7 @@ #include <stdio.h> #include <stdlib.h> #include "gf_w32.h" +#include "gf_cpu.h" #define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); } @@ -347,6 +348,8 @@ uint32_t gf_w32_matrix (gf_t *gf, uint32_t b) extra memory. */ +#if defined(INTEL_SSE4_PCLMUL) + static inline gf_val_32_t @@ -354,8 +357,6 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) { gf_val_32_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i w; @@ -378,9 +379,9 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) /* Extracts 32 bit value from result. */ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); -#endif return rv; } +#endif #if defined(INTEL_SSE4_PCLMUL) @@ -435,6 +436,8 @@ gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32 #endif +#if defined(INTEL_SSE4_PCLMUL) + static inline gf_val_32_t @@ -442,8 +445,6 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) { gf_val_32_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -476,9 +477,11 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) /* Extracts 32 bit value from result. */ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); -#endif return rv; } +#endif + +#if defined(INTEL_SSE4_PCLMUL) static inline @@ -487,8 +490,6 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) { gf_val_32_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -515,9 +516,11 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) /* Extracts 32 bit value from result. */ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); -#endif return rv; } +#endif + +#if defined(INTEL_SSE4_PCLMUL) static inline @@ -526,8 +529,6 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) { gf_val_32_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -556,9 +557,9 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) /* Extracts 32 bit value from result. */ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); -#endif return rv; } +#endif static @@ -589,33 +590,35 @@ gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32) static int gf_w32_cfmgk_init(gf_t *gf) { - gf->inverse.w32 = gf_w32_euclid; - gf->multiply_region.w32 = gf_w32_multiply_region_from_single; + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single) #if defined(INTEL_SSE4_PCLMUL) - gf_internal_t *h; + if (gf_cpu_supports_intel_pclmul) { + gf_internal_t *h; - h = (gf_internal_t *) gf->scratch; - gf->multiply.w32 = gf_w32_cfmgk_multiply; - gf->multiply_region.w32 = gf_w32_cfmgk_multiply_region_from_single; + h = (gf_internal_t *) gf->scratch; + SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single) - uint64_t *q_plus = (uint64_t *) h->private; - uint64_t *g_star = (uint64_t *) h->private + 1; + uint64_t *q_plus = (uint64_t *) h->private; + uint64_t *g_star = (uint64_t *) h->private + 1; - uint64_t tmp = h->prim_poly << 32; - *q_plus = 1ULL << 32; + uint64_t tmp = h->prim_poly << 32; + *q_plus = 1ULL << 32; - int i; - for(i = 63; i >= 32; i--) - if((1ULL << i) & tmp) - { - *q_plus |= 1ULL << (i-32); - tmp ^= h->prim_poly << (i-32); - } + int i; + for(i = 63; i >= 32; i--) + if((1ULL << i) & tmp) + { + *q_plus |= 1ULL << (i-32); + tmp ^= h->prim_poly << (i-32); + } - *g_star = h->prim_poly & ((1ULL << 32) - 1); + *g_star = h->prim_poly & ((1ULL << 32) - 1); - return 1; + return 1; + } #endif return 0; @@ -624,30 +627,32 @@ int gf_w32_cfmgk_init(gf_t *gf) static int gf_w32_cfm_init(gf_t *gf) { - gf->inverse.w32 = gf_w32_euclid; - gf->multiply_region.w32 = gf_w32_multiply_region_from_single; + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single) /*Ben: We also check to see if the prim poly will work for pclmul */ /*Ben: Check to see how many reduction steps it will take*/ #if defined(INTEL_SSE4_PCLMUL) - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - - if ((0xfffe0000 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w32_clm_multiply_2; - gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_2; - }else if ((0xffc00000 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w32_clm_multiply_3; - gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_3; - }else if ((0xfe000000 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w32_clm_multiply_4; - gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_4; - } else { - return 0; + if (gf_cpu_supports_intel_pclmul) { + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if ((0xfffe0000 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2) + }else if ((0xffc00000 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3) + }else if ((0xfe000000 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4) + } else { + return 0; + } + return 1; } - return 1; #endif return 0; @@ -656,9 +661,9 @@ int gf_w32_cfm_init(gf_t *gf) static int gf_w32_shift_init(gf_t *gf) { - gf->inverse.w32 = gf_w32_euclid; - gf->multiply_region.w32 = gf_w32_multiply_region_from_single; - gf->multiply.w32 = gf_w32_shift_multiply; + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single) + SET_FUNCTION(gf,multiply,w32,gf_w32_shift_multiply) return 1; } @@ -1380,32 +1385,34 @@ int gf_w32_bytwo_init(gf_t *gf) } if (h->mult_type == GF_MULT_BYTWO_p) { - gf->multiply.w32 = gf_w32_bytwo_p_multiply; + SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply) #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region; - #else - gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; - if(h->region_type & GF_REGION_SIMD) - return 0; + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region) + if(h->region_type & GF_REGION_SIMD) + return 0; + #ifdef INTEL_SSE2 + } #endif } else { - gf->multiply.w32 = gf_w32_bytwo_b_multiply; + SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_b_multiply) #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region; - #else - gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region) if(h->region_type & GF_REGION_SIMD) return 0; + #ifdef INTEL_SSE2 + } #endif } - gf->inverse.w32 = gf_w32_euclid; + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) return 1; } @@ -1755,11 +1762,11 @@ gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t gf_do_final_region_alignment(&rd); } +#ifdef INTEL_SSSE3 static void gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 gf_internal_t *h; int i, j, k; uint32_t pp, v, *s32, *d32, *top; @@ -1942,16 +1949,15 @@ gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des } gf_do_final_region_alignment(&rd); - -#endif } +#endif +#ifdef INTEL_SSSE3 static void gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 gf_internal_t *h; int i, j, k; uint32_t pp, v, *s32, *d32, *top, tmp_table[16]; @@ -2216,9 +2222,8 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint } } gf_do_final_region_alignment(&rd); - -#endif } +#endif static int gf_w32_split_init(gf_t *gf) @@ -2230,29 +2235,13 @@ int gf_w32_split_init(gf_t *gf) struct gf_split_8_32_lazy_data *d32; struct gf_split_16_32_lazy_data *d16; uint32_t p, basep; - int i, j, exp, ispclmul, issse3; - int isneon = 0; - -#if defined(INTEL_SSE4_PCLMUL) - ispclmul = 1; -#else - ispclmul = 0; -#endif - -#ifdef INTEL_SSSE3 - issse3 = 1; -#else - issse3 = 0; -#endif -#ifdef ARM_NEON - isneon = 1; -#endif + int i, j, exp; h = (gf_internal_t *) gf->scratch; /* Defaults */ - gf->inverse.w32 = gf_w32_euclid; + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) /* JSP: First handle single multiplication: If args == 8, then we're doing split 8 8. @@ -2261,17 +2250,19 @@ int gf_w32_split_init(gf_t *gf) */ if (h->arg1 == 8 && h->arg2 == 8) { - gf->multiply.w32 = gf_w32_split_8_8_multiply; - } else if (ispclmul) { + SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply) +#if defined(INTEL_SSE4_PCLMUL) + } else if (gf_cpu_supports_intel_pclmul) { if ((0xfffe0000 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w32_clm_multiply_2; + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2) } else if ((0xffc00000 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w32_clm_multiply_3; + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3) } else if ((0xfe000000 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w32_clm_multiply_4; + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4) } +#endif } else { - gf->multiply.w32 = gf_w32_bytwo_p_multiply; + SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply) } /* Easy cases: 16/32 and 2/32 */ @@ -2279,7 +2270,7 @@ int gf_w32_split_init(gf_t *gf) if ((h->arg1 == 16 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 16)) { d16 = (struct gf_split_16_32_lazy_data *) h->private; d16->last_value = 0; - gf->multiply_region.w32 = gf_w32_split_16_32_lazy_multiply_region; + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_16_32_lazy_multiply_region) return 1; } @@ -2287,33 +2278,39 @@ int gf_w32_split_init(gf_t *gf) ld2 = (struct gf_split_2_32_lazy_data *) h->private; ld2->last_value = 0; #ifdef INTEL_SSSE3 - if (!(h->region_type & GF_REGION_NOSIMD)) - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region; - else - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; - #else - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; - if(h->region_type & GF_REGION_SIMD) return 0; + if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_multiply_region) + if(h->region_type & GF_REGION_SIMD) return 0; + #ifdef INTEL_SSSE3 + } #endif return 1; } /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */ + if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) || - ((issse3 || isneon) && h->mult_type == GF_REGION_DEFAULT)) { + ((gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) && h->mult_type == GF_REGION_DEFAULT)) { ld4 = (struct gf_split_4_32_lazy_data *) h->private; ld4->last_value = 0; - if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; - } else if (isneon) { + if ((h->region_type & GF_REGION_NOSIMD) || !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region) + } else if (gf_cpu_supports_arm_neon) { #ifdef ARM_NEON gf_w32_neon_split_init(gf); #endif } else if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; +#ifdef INTEL_SSSE3 + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_altmap_multiply_region) +#endif } else { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; +#ifdef INTEL_SSSE3 + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_multiply_region) +#endif } return 1; } @@ -2324,7 +2321,7 @@ int gf_w32_split_init(gf_t *gf) h->mult_type == GF_MULT_DEFAULT) { d32 = (struct gf_split_8_32_lazy_data *) h->private; d32->last_value = 0; - gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region; + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region) return 1; } @@ -2333,8 +2330,8 @@ int gf_w32_split_init(gf_t *gf) if (h->arg1 == 8 && h->arg2 == 8) { d8 = (struct gf_w32_split_8_8_data *) h->private; d8->last_value = 0; - gf->multiply.w32 = gf_w32_split_8_8_multiply; - gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region; + SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region) basep = 1; for (exp = 0; exp < 7; exp++) { for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0; @@ -2407,14 +2404,14 @@ int gf_w32_group_init(gf_t *gf) } if (g_s == g_r) { - gf->multiply.w32 = gf_w32_group_s_equals_r_multiply; - gf->multiply_region.w32 = gf_w32_group_s_equals_r_multiply_region; + SET_FUNCTION(gf,multiply,w32,gf_w32_group_s_equals_r_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_s_equals_r_multiply_region) } else { - gf->multiply.w32 = gf_w32_group_multiply; - gf->multiply_region.w32 = gf_w32_group_multiply_region; + SET_FUNCTION(gf,multiply,w32,gf_w32_group_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_multiply_region) } - gf->divide.w32 = NULL; - gf->inverse.w32 = gf_w32_euclid; + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) return 1; } @@ -2666,18 +2663,18 @@ int gf_w32_composite_init(gf_t *gf) cd->alog = gf_w16_get_mult_alog_table(h->base_gf); if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = gf_w32_composite_multiply_region_alt; + SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region_alt) } else { - gf->multiply_region.w32 = gf_w32_composite_multiply_region; + SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region) } if (cd->log == NULL) { - gf->multiply.w32 = gf_w32_composite_multiply_recursive; + SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_recursive) } else { - gf->multiply.w32 = gf_w32_composite_multiply_inline; + SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_inline) } - gf->divide.w32 = NULL; - gf->inverse.w32 = gf_w32_composite_inverse; + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,gf_w32_composite_inverse) return 1; } @@ -2686,16 +2683,6 @@ int gf_w32_composite_init(gf_t *gf) int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int issse3 = 0; - int isneon = 0; - -#ifdef INTEL_SSSE3 - issse3 = 1; -#endif -#ifdef ARM_NEON - isneon = 1; -#endif - switch(mult_type) { case GF_MULT_BYTWO_p: @@ -2720,7 +2707,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64; } if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) || - (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) { + (mult_type == GF_MULT_DEFAULT && !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))) { return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64; } if ((arg1 == 4 && arg2 == 32) || @@ -2776,10 +2763,10 @@ int gf_w32_init(gf_t *gf) if(h->mult_type != GF_MULT_COMPOSITE) h->prim_poly &= 0xffffffff; - gf->multiply.w32 = NULL; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = NULL; + SET_FUNCTION(gf,multiply,w32,NULL) + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,multiply_region,w32,NULL) switch(h->mult_type) { case GF_MULT_CARRY_FREE: if (gf_w32_cfm_init(gf) == 0) return 0; break; @@ -2794,30 +2781,30 @@ int gf_w32_init(gf_t *gf) default: return 0; } if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w32 = gf_w32_divide_from_inverse; - gf->inverse.w32 = gf_w32_euclid; + SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w32 = gf_w32_divide_from_inverse; - gf->inverse.w32 = gf_w32_matrix; + SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w32_matrix) } if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { - gf->divide.w32 = gf_w32_divide_from_inverse; + SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse) } if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { - gf->inverse.w32 = gf_w32_inverse_from_divide; + SET_FUNCTION(gf,inverse,w32,gf_w32_inverse_from_divide) } if (h->region_type == GF_REGION_CAUCHY) { - gf->extract_word.w32 = gf_wgen_extract_word; - gf->multiply_region.w32 = gf_wgen_cauchy_region; + SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word) + SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region) } else if (h->region_type & GF_REGION_ALTMAP) { if (h->mult_type == GF_MULT_COMPOSITE) { - gf->extract_word.w32 = gf_w32_composite_extract_word; + SET_FUNCTION(gf,extract_word,w32,gf_w32_composite_extract_word) } else { - gf->extract_word.w32 = gf_w32_split_extract_word; + SET_FUNCTION(gf,extract_word,w32,gf_w32_split_extract_word) } } else { - gf->extract_word.w32 = gf_w32_extract_word; + SET_FUNCTION(gf,extract_word,w32,gf_w32_extract_word) } return 1; } diff --git a/src/gf_w4.c b/src/gf_w4.c index 0e86aa8..3a7b953 100644 --- a/src/gf_w4.c +++ b/src/gf_w4.c @@ -12,6 +12,7 @@ #include <stdio.h> #include <stdlib.h> #include "gf_w4.h" +#include "gf_cpu.h" #define AB2(ip, am1 ,am2, b, t1, t2) {\ t1 = (b << 1) & am1;\ @@ -134,6 +135,7 @@ gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) /* Ben: This function works, but it is 33% slower than the normal shift mult */ +#if defined(INTEL_SSE4_PCLMUL) static inline gf_val_32_t @@ -141,8 +143,6 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4) { gf_val_32_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -173,9 +173,9 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4) /* Extracts 32 bit value from result. */ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); -#endif return rv; } +#endif static void @@ -311,10 +311,10 @@ int gf_w4_log_init(gf_t *gf) return 0; } - gf->inverse.w32 = gf_w4_inverse_from_divide; - gf->divide.w32 = gf_w4_log_divide; - gf->multiply.w32 = gf_w4_log_multiply; - gf->multiply_region.w32 = gf_w4_log_multiply_region; + SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide) + SET_FUNCTION(gf,divide,w32,gf_w4_log_divide) + SET_FUNCTION(gf,multiply,w32,gf_w4_log_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w4_log_multiply_region) return 1; } @@ -444,21 +444,22 @@ int gf_w4_single_table_init(gf_t *gf) } } - gf->inverse.w32 = NULL; - gf->divide.w32 = gf_w4_single_table_divide; - gf->multiply.w32 = gf_w4_single_table_multiply; - #if defined(INTEL_SSSE3) || defined(ARM_NEON) - if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY)) - gf->multiply_region.w32 = gf_w4_single_table_multiply_region; - else - #if defined(INTEL_SSSE3) - gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region; - #elif defined(ARM_NEON) + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,divide,w32,gf_w4_single_table_divide) + SET_FUNCTION(gf,multiply,w32,gf_w4_single_table_multiply) + #if defined(INTEL_SSSE3) + if (gf_cpu_supports_intel_ssse3 && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) { + SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_sse_multiply_region) + } else { + #elif defined(ARM_NEON) + if (gf_cpu_supports_arm_neon && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) { gf_w4_neon_single_table_init(gf); - #endif - #else - gf->multiply_region.w32 = gf_w4_single_table_multiply_region; - if (h->region_type & GF_REGION_SIMD) return 0; + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region) + if (h->region_type & GF_REGION_SIMD) return 0; + #if defined(INTEL_SSSE3) || defined(ARM_NEON) + } #endif return 1; @@ -548,10 +549,10 @@ int gf_w4_double_table_init(gf_t *gf) } } - gf->inverse.w32 = NULL; - gf->divide.w32 = gf_w4_double_table_divide; - gf->multiply.w32 = gf_w4_double_table_multiply; - gf->multiply_region.w32 = gf_w4_double_table_multiply_region; + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,divide,w32,gf_w4_double_table_divide) + SET_FUNCTION(gf,multiply,w32,gf_w4_double_table_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w4_double_table_multiply_region) return 1; } @@ -682,10 +683,10 @@ int gf_w4_quad_table_init(gf_t *gf) } } - gf->inverse.w32 = NULL; - gf->divide.w32 = gf_w4_quad_table_divide; - gf->multiply.w32 = gf_w4_quad_table_multiply; - gf->multiply_region.w32 = gf_w4_quad_table_multiply_region; + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_divide) + SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region) return 1; } static @@ -724,10 +725,10 @@ int gf_w4_quad_table_lazy_init(gf_t *gf) } } - gf->inverse.w32 = NULL; - gf->divide.w32 = gf_w4_quad_table_lazy_divide; - gf->multiply.w32 = gf_w4_quad_table_lazy_multiply; - gf->multiply_region.w32 = gf_w4_quad_table_multiply_region; + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_lazy_divide) + SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_lazy_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region) return 1; } @@ -736,16 +737,13 @@ int gf_w4_table_init(gf_t *gf) { int rt; gf_internal_t *h; - int simd = 0; - -#if defined(INTEL_SSSE3) || defined(ARM_NEON) - simd = 1; -#endif h = (gf_internal_t *) gf->scratch; rt = (h->region_type); - if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE; + if (h->mult_type == GF_MULT_DEFAULT && + !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) + rt |= GF_REGION_DOUBLE_TABLE; if (rt & GF_REGION_DOUBLE_TABLE) { return gf_w4_double_table_init(gf); @@ -929,11 +927,11 @@ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v #endif /* +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; struct gf_bytwo_data *btd; @@ -990,8 +988,8 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v } } gf_do_final_region_alignment(&rd); -#endif } +#endif */ #ifdef INTEL_SSE2 @@ -1865,28 +1863,30 @@ int gf_w4_bytwo_init(gf_t *gf) } if (h->mult_type == GF_MULT_BYTWO_p) { - gf->multiply.w32 = gf_w4_bytwo_p_multiply; + SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_p_multiply) #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region; - #else - gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region; - if (h->region_type & GF_REGION_SIMD) - return 0; + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region) + if (h->region_type & GF_REGION_SIMD) + return 0; + #ifdef INTEL_SSE2 + } #endif } else { - gf->multiply.w32 = gf_w4_bytwo_b_multiply; + SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_b_multiply) #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region; - #else - gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region; - if (h->region_type & GF_REGION_SIMD) - return 0; + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region) + if (h->region_type & GF_REGION_SIMD) + return 0; + #ifdef INTEL_SSE2 + } #endif } return 1; @@ -1897,10 +1897,14 @@ static int gf_w4_cfm_init(gf_t *gf) { #if defined(INTEL_SSE4_PCLMUL) - gf->multiply.w32 = gf_w4_clm_multiply; - return 1; + if (gf_cpu_supports_intel_pclmul) { + SET_FUNCTION(gf,multiply,w32,gf_w4_clm_multiply) + return 1; + } #elif defined(ARM_NEON) - return gf_w4_neon_cfm_init(gf); + if (gf_cpu_supports_arm_neon) { + return gf_w4_neon_cfm_init(gf); + } #endif return 0; } @@ -1908,7 +1912,7 @@ int gf_w4_cfm_init(gf_t *gf) static int gf_w4_shift_init(gf_t *gf) { - gf->multiply.w32 = gf_w4_shift_multiply; + SET_FUNCTION(gf,multiply,w32,gf_w4_shift_multiply) return 1; } @@ -1917,15 +1921,6 @@ int gf_w4_shift_init(gf_t *gf) int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int issse3 = 0, isneon = 0; - -#ifdef INTEL_SSSE3 - issse3 = 1; -#endif -#ifdef ARM_NEON - isneon = 1; -#endif - switch(mult_type) { case GF_MULT_BYTWO_p: @@ -1938,7 +1933,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1 return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; } - if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon)) + if (mult_type == GF_MULT_DEFAULT && + !(gf_cpu_supports_arm_neon || gf_cpu_supports_intel_ssse3)) region_type = GF_REGION_DOUBLE_TABLE; if (region_type & GF_REGION_DOUBLE_TABLE) { @@ -1977,11 +1973,11 @@ gf_w4_init (gf_t *gf) h = (gf_internal_t *) gf->scratch; if (h->prim_poly == 0) h->prim_poly = 0x13; h->prim_poly |= 0x10; - gf->multiply.w32 = NULL; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = NULL; - gf->extract_word.w32 = gf_w4_extract_word; + SET_FUNCTION(gf,multiply,w32,NULL) + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,multiply_region,w32,NULL) + SET_FUNCTION(gf,extract_word,w32,gf_w4_extract_word) switch(h->mult_type) { case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break; @@ -1995,27 +1991,27 @@ gf_w4_init (gf_t *gf) } if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w32 = gf_w4_divide_from_inverse; - gf->inverse.w32 = gf_w4_euclid; + SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w4_euclid) } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w32 = gf_w4_divide_from_inverse; - gf->inverse.w32 = gf_w4_matrix; + SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w4_matrix) } if (gf->divide.w32 == NULL) { - gf->divide.w32 = gf_w4_divide_from_inverse; - if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_euclid; + SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse) + if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w4_euclid) } - if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_inverse_from_divide; + if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide) if (h->region_type == GF_REGION_CAUCHY) { - gf->multiply_region.w32 = gf_wgen_cauchy_region; - gf->extract_word.w32 = gf_wgen_extract_word; + SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region) + SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word) } if (gf->multiply_region.w32 == NULL) { - gf->multiply_region.w32 = gf_w4_multiply_region_from_single; + SET_FUNCTION(gf,multiply_region,w32,gf_w4_multiply_region_from_single) } return 1; diff --git a/src/gf_w64.c b/src/gf_w64.c index eae31e6..69e55db 100644 --- a/src/gf_w64.c +++ b/src/gf_w64.c @@ -12,6 +12,7 @@ #include <stdio.h> #include <stdlib.h> #include "gf_w64.h" +#include "gf_cpu.h" static inline @@ -338,6 +339,8 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply. */ +#if defined(INTEL_SSE4_PCLMUL) + static inline gf_val_64_t @@ -345,8 +348,6 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) { gf_val_64_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -376,10 +377,12 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) result = _mm_xor_si128 (result, w); rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); -#endif return rv; } +#endif +#if defined(INTEL_SSE4_PCLMUL) + static inline gf_val_64_t @@ -387,8 +390,6 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) { gf_val_64_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -418,15 +419,15 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) result = _mm_xor_si128 (result, w); rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); -#endif return rv; } +#endif +#if defined(INTEL_SSE4_PCLMUL) void gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { -#if defined(INTEL_SSE4_PCLMUL) gf_internal_t *h; uint8_t *s8, *d8, *dtop; gf_region_data rd; @@ -504,8 +505,8 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by } } gf_do_final_region_alignment(&rd); -#endif } +#endif void gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) @@ -697,33 +698,35 @@ gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_ static int gf_w64_shift_init(gf_t *gf) { - gf->multiply.w64 = gf_w64_shift_multiply; - gf->inverse.w64 = gf_w64_euclid; - gf->multiply_region.w64 = gf_w64_multiply_region_from_single; + SET_FUNCTION(gf,multiply,w64,gf_w64_shift_multiply) + SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single) return 1; } static int gf_w64_cfm_init(gf_t *gf) { - gf->inverse.w64 = gf_w64_euclid; - gf->multiply_region.w64 = gf_w64_multiply_region_from_single; + SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single) -#if defined(INTEL_SSE4_PCLMUL) - gf_internal_t *h; +#if defined(INTEL_SSE4_PCLMUL) + if (gf_cpu_supports_intel_pclmul) { + gf_internal_t *h; - h = (gf_internal_t *) gf->scratch; + h = (gf_internal_t *) gf->scratch; - if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ - gf->multiply.w64 = gf_w64_clm_multiply_2; - gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; - }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ - gf->multiply.w64 = gf_w64_clm_multiply_4; - gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; - } else { - return 0; + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2) + }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4) + } else { + return 0; + } + return 1; } - return 1; #endif return 0; @@ -1008,14 +1011,14 @@ int gf_w64_group_init(gf_t *gf) } if (g_s == g_r) { - gf->multiply.w64 = gf_w64_group_s_equals_r_multiply; - gf->multiply_region.w64 = gf_w64_group_s_equals_r_multiply_region; + SET_FUNCTION(gf,multiply,w64,gf_w64_group_s_equals_r_multiply) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_group_s_equals_r_multiply_region) } else { - gf->multiply.w64 = gf_w64_group_multiply; - gf->multiply_region.w64 = gf_w64_group_multiply_region; + SET_FUNCTION(gf,multiply,w64,gf_w64_group_multiply) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_group_multiply_region) } - gf->divide.w64 = NULL; - gf->inverse.w64 = gf_w64_euclid; + SET_FUNCTION(gf,divide,w64,NULL) + SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) return 1; } @@ -1261,9 +1264,9 @@ gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_ v = _mm_srli_epi64(v, 1); } +#ifdef INTEL_SSE2 void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint64_t vrev, one64; @@ -1322,8 +1325,8 @@ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_ s8 += 16; } gf_do_final_region_alignment(&rd); -#endif } +#endif #ifdef INTEL_SSE2 static @@ -1455,31 +1458,33 @@ int gf_w64_bytwo_init(gf_t *gf) h = (gf_internal_t *) gf->scratch; if (h->mult_type == GF_MULT_BYTWO_p) { - gf->multiply.w64 = gf_w64_bytwo_p_multiply; + SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply) #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; - else - gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region; - #else - gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; - if(h->region_type & GF_REGION_SIMD) - return 0; + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region) + if(h->region_type & GF_REGION_SIMD) + return 0; + #ifdef INTEL_SSE2 + } #endif } else { - gf->multiply.w64 = gf_w64_bytwo_b_multiply; + SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_b_multiply) #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; - else - gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region; - #else - gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region) if(h->region_type & GF_REGION_SIMD) return 0; + #ifdef INTEL_SSE2 + } #endif } - gf->inverse.w64 = gf_w64_euclid; + SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) return 1; } @@ -1653,14 +1658,14 @@ int gf_w64_composite_init(gf_t *gf) gf_internal_t *h = (gf_internal_t *) gf->scratch; if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w64 = gf_w64_composite_multiply_region_alt; + SET_FUNCTION(gf,multiply_region,w64,gf_w64_composite_multiply_region_alt) } else { - gf->multiply_region.w64 = gf_w64_composite_multiply_region; + SET_FUNCTION(gf,multiply_region,w64,gf_w64_composite_multiply_region) } - gf->multiply.w64 = gf_w64_composite_multiply; - gf->divide.w64 = NULL; - gf->inverse.w64 = gf_w64_composite_inverse; + SET_FUNCTION(gf,multiply,w64,gf_w64_composite_multiply) + SET_FUNCTION(gf,divide,w64,NULL) + SET_FUNCTION(gf,inverse,w64,gf_w64_composite_inverse) return 1; } @@ -1970,49 +1975,55 @@ int gf_w64_split_init(gf_t *gf) /* Defaults */ - gf->multiply_region.w64 = gf_w64_multiply_region_from_single; + SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single) - gf->multiply.w64 = gf_w64_bytwo_p_multiply; + SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply) #if defined(INTEL_SSE4_PCLMUL) - if ((!(h->region_type & GF_REGION_NOSIMD) && - (h->arg1 == 64 || h->arg2 == 64)) || - h->mult_type == GF_MULT_DEFAULT){ - - if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ - gf->multiply.w64 = gf_w64_clm_multiply_2; - gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; - }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ - gf->multiply.w64 = gf_w64_clm_multiply_4; - gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; - }else{ - return 0; + if (gf_cpu_supports_intel_pclmul) { + if ((!(h->region_type & GF_REGION_NOSIMD) && + (h->arg1 == 64 || h->arg2 == 64)) || + h->mult_type == GF_MULT_DEFAULT){ + + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2) + }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4) + }else{ + return 0; + } } } #endif - gf->inverse.w64 = gf_w64_euclid; + SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) /* Allen: set region pointers for default mult type. Single pointers are * taken care of above (explicitly for sse, implicitly for no sse). */ -#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) if (h->mult_type == GF_MULT_DEFAULT) { - d4 = (struct gf_split_4_64_lazy_data *) h->private; - d4->last_value = 0; +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) { + d4 = (struct gf_split_4_64_lazy_data *) h->private; + d4->last_value = 0; #if defined(INTEL_SSE4) - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; + if (gf_cpu_supports_intel_sse4) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region) #elif defined(ARCH_AARCH64) - gf_w64_neon_split_init(gf); + if (gf_cpu_supports_arm_neon) + gf_w64_neon_split_init(gf); #endif - } -#else - if (h->mult_type == GF_MULT_DEFAULT) { - d8 = (struct gf_split_8_64_lazy_data *) h->private; - d8->last_value = 0; - gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region; - } + } else { #endif + d8 = (struct gf_split_8_64_lazy_data *) h->private; + d8->last_value = 0; + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region) +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + } +#endif + } if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) { d4 = (struct gf_split_4_64_lazy_data *) h->private; @@ -2022,44 +2033,51 @@ int gf_w64_split_init(gf_t *gf) if(h->region_type & GF_REGION_ALTMAP) { #ifdef INTEL_SSSE3 - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region; + if (gf_cpu_supports_intel_ssse3) { + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region) + } else #elif defined(ARCH_AARCH64) - gf_w64_neon_split_init(gf); - #else - return 0; + if (gf_cpu_supports_arm_neon) { + gf_w64_neon_split_init(gf); + } else #endif + return 0; } else //no altmap { #if defined(INTEL_SSE4) || defined(ARCH_AARCH64) - if(h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; - else - #if defined(INTEL_SSE4) - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; - #elif defined(ARCH_AARCH64) - gf_w64_neon_split_init(gf); - #endif - #else - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; + if(gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) { + if (h->region_type & GF_REGION_NOSIMD) { + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region) + } else + #if defined(INTEL_SSE4) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region) + #elif defined(ARCH_AARCH64) + gf_w64_neon_split_init(gf); + #endif + } else { + #endif + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region) if(h->region_type & GF_REGION_SIMD) return 0; + #if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + } #endif } } if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) { d8 = (struct gf_split_8_64_lazy_data *) h->private; d8->last_value = 0; - gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region; + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region) } if ((h->arg1 == 16 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 16)) { d16 = (struct gf_split_16_64_lazy_data *) h->private; d16->last_value = 0; - gf->multiply_region.w64 = gf_w64_split_16_64_lazy_multiply_region; + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_16_64_lazy_multiply_region) } if ((h->arg1 == 8 && h->arg2 == 8)) { d88 = (struct gf_split_8_8_data *) h->private; - gf->multiply.w64 = gf_w64_split_8_8_multiply; + SET_FUNCTION(gf,multiply,w64,gf_w64_split_8_8_multiply) /* The performance of this guy sucks, so don't bother with a region op */ @@ -2114,11 +2132,15 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg * then fall through to split table scratch size code. */ #if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) { arg1 = 64; arg2 = 4; -#else + } else { +#endif arg1 = 64; arg2 = 8; +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + } #endif case GF_MULT_SPLIT_TABLE: @@ -2169,10 +2191,10 @@ int gf_w64_init(gf_t *gf) } } - gf->multiply.w64 = NULL; - gf->divide.w64 = NULL; - gf->inverse.w64 = NULL; - gf->multiply_region.w64 = NULL; + SET_FUNCTION(gf,multiply,w64,NULL) + SET_FUNCTION(gf,divide,w64,NULL) + SET_FUNCTION(gf,inverse,w64,NULL) + SET_FUNCTION(gf,multiply_region,w64,NULL) switch(h->mult_type) { case GF_MULT_CARRY_FREE: if (gf_w64_cfm_init(gf) == 0) return 0; break; @@ -2186,27 +2208,27 @@ int gf_w64_init(gf_t *gf) default: return 0; } if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w64 = gf_w64_divide_from_inverse; - gf->inverse.w64 = gf_w64_euclid; + SET_FUNCTION(gf,divide,w64,gf_w64_divide_from_inverse) + SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) } if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) { - gf->divide.w64 = gf_w64_divide_from_inverse; + SET_FUNCTION(gf,divide,w64,gf_w64_divide_from_inverse) } if (gf->inverse.w64 == NULL && gf->divide.w64 != NULL) { - gf->inverse.w64 = gf_w64_inverse_from_divide; + SET_FUNCTION(gf,inverse,w64,gf_w64_inverse_from_divide) } if (h->region_type == GF_REGION_CAUCHY) return 0; if (h->region_type & GF_REGION_ALTMAP) { if (h->mult_type == GF_MULT_COMPOSITE) { - gf->extract_word.w64 = gf_w64_composite_extract_word; + SET_FUNCTION(gf,extract_word,w64,gf_w64_composite_extract_word) } else if (h->mult_type == GF_MULT_SPLIT_TABLE) { - gf->extract_word.w64 = gf_w64_split_extract_word; + SET_FUNCTION(gf,extract_word,w64,gf_w64_split_extract_word) } } else { - gf->extract_word.w64 = gf_w64_extract_word; + SET_FUNCTION(gf,extract_word,w64,gf_w64_extract_word) } return 1; diff --git a/src/gf_w8.c b/src/gf_w8.c index 276799f..f647a31 100644 --- a/src/gf_w8.c +++ b/src/gf_w8.c @@ -13,6 +13,7 @@ #include <stdio.h> #include <stdlib.h> #include <assert.h> +#include "gf_cpu.h" #define AB2(ip, am1 ,am2, b, t1, t2) {\ t1 = (b << 1) & am1;\ @@ -127,6 +128,7 @@ uint32_t gf_w8_matrix (gf_t *gf, uint32_t b) } +#if defined(INTEL_SSE4_PCLMUL) static inline gf_val_32_t @@ -134,8 +136,6 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) { gf_val_32_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -169,10 +169,11 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); -#endif return rv; } +#endif +#if defined(INTEL_SSE4_PCLMUL) static inline gf_val_32_t @@ -180,8 +181,6 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) { gf_val_32_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -208,10 +207,11 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); -#endif return rv; } +#endif +#if defined(INTEL_SSE4_PCLMUL) static inline gf_val_32_t @@ -219,8 +219,6 @@ gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) { gf_val_32_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -248,9 +246,9 @@ gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) /* Extracts 32 bit value from result. */ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); -#endif return rv; } +#endif static @@ -509,25 +507,29 @@ static int gf_w8_cfm_init(gf_t *gf) { #if defined(INTEL_SSE4_PCLMUL) - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - - if ((0xe0 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w8_clm_multiply_2; - gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_2; - }else if ((0xc0 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w8_clm_multiply_3; - gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_3; - }else if ((0x80 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w8_clm_multiply_4; - gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_4; - }else{ - return 0; - } - return 1; + if (gf_cpu_supports_intel_pclmul) { + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if ((0xe0 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_2) + }else if ((0xc0 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_3) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_3) + }else if ((0x80 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_4) + }else{ + return 0; + } + return 1; + } #elif defined(ARM_NEON) - return gf_w8_neon_cfm_init(gf); + if (gf_cpu_supports_arm_neon) { + return gf_w8_neon_cfm_init(gf); + } #endif return 0; @@ -537,7 +539,7 @@ int gf_w8_cfm_init(gf_t *gf) static int gf_w8_shift_init(gf_t *gf) { - gf->multiply.w32 = gf_w8_shift_multiply; /* The others will be set automatically */ + SET_FUNCTION(gf,multiply,w32,gf_w8_shift_multiply) /* The others will be set automatically */ return 1; } @@ -809,20 +811,20 @@ int gf_w8_log_init(gf_t *gf) } while (i != 1); if (h->mult_type == GF_MULT_LOG_TABLE) { - gf->inverse.w32 = gf_w8_log_inverse; - gf->divide.w32 = gf_w8_log_divide; - gf->multiply.w32 = gf_w8_log_multiply; - gf->multiply_region.w32 = gf_w8_log_multiply_region; + SET_FUNCTION(gf,inverse,w32,gf_w8_log_inverse) + SET_FUNCTION(gf,divide,w32,gf_w8_log_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_log_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_log_multiply_region) } else if (h->mult_type == GF_MULT_LOG_ZERO) { - gf->inverse.w32 = gf_w8_logzero_small_inverse; - gf->divide.w32 = gf_w8_logzero_small_divide; - gf->multiply.w32 = gf_w8_logzero_small_multiply; - gf->multiply_region.w32 = gf_w8_logzero_multiply_region; + SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_small_inverse) + SET_FUNCTION(gf,divide,w32,gf_w8_logzero_small_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_small_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region) } else { - gf->inverse.w32 = gf_w8_logzero_inverse; - gf->divide.w32 = gf_w8_logzero_divide; - gf->multiply.w32 = gf_w8_logzero_multiply; - gf->multiply_region.w32 = gf_w8_logzero_multiply_region; + SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_inverse) + SET_FUNCTION(gf,divide,w32,gf_w8_logzero_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region) } return 1; } @@ -1102,21 +1104,22 @@ int gf_w8_split_init(gf_t *gf) } } - gf->multiply.w32 = gf_w8_split_multiply; - - #if defined(INTEL_SSSE3) || defined(ARM_NEON) - if (h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w32 = gf_w8_split_multiply_region; - else - #if defined(INTEL_SSSE3) - gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; - #elif defined(ARM_NEON) + SET_FUNCTION(gf,multiply,w32,gf_w8_split_multiply) + + #if defined(INTEL_SSSE3) + if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse) + } else { + #elif defined(ARM_NEON) + if (gf_cpu_supports_arm_neon && !(h->region_type & GF_REGION_NOSIMD)) { gf_w8_neon_split_init(gf); - #endif - #else - gf->multiply_region.w32 = gf_w8_split_multiply_region; + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region) if(h->region_type & GF_REGION_SIMD) return 0; + #if defined(INTEL_SSSE3) || defined(ARM_NEON) + } #endif return 1; @@ -1134,17 +1137,12 @@ int gf_w8_table_init(gf_t *gf) struct gf_w8_double_table_data *dtd = NULL; struct gf_w8_double_table_lazy_data *ltd = NULL; struct gf_w8_default_data *dd = NULL; - int a, b, c, prod, scase, use_simd; + int a, b, c, prod, scase; h = (gf_internal_t *) gf->scratch; -#if defined(INTEL_SSSE3) || defined(ARM_NEON) - use_simd = 1; -#else - use_simd = 0; -#endif - - if (h->mult_type == GF_MULT_DEFAULT && use_simd) { + if (h->mult_type == GF_MULT_DEFAULT && + (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) { dd = (struct gf_w8_default_data *)h->private; scase = 3; bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE); @@ -1201,32 +1199,38 @@ int gf_w8_table_init(gf_t *gf) } } - gf->inverse.w32 = NULL; /* Will set from divide */ + SET_FUNCTION(gf,inverse,w32,NULL) /* Will set from divide */ switch (scase) { case 0: - gf->divide.w32 = gf_w8_table_divide; - gf->multiply.w32 = gf_w8_table_multiply; - gf->multiply_region.w32 = gf_w8_table_multiply_region; + SET_FUNCTION(gf,divide,w32,gf_w8_table_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_table_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_table_multiply_region) break; case 1: - gf->divide.w32 = gf_w8_double_table_divide; - gf->multiply.w32 = gf_w8_double_table_multiply; - gf->multiply_region.w32 = gf_w8_double_table_multiply_region; + SET_FUNCTION(gf,divide,w32,gf_w8_double_table_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region) break; case 2: - gf->divide.w32 = gf_w8_double_table_lazy_divide; - gf->multiply.w32 = gf_w8_double_table_lazy_multiply; - gf->multiply_region.w32 = gf_w8_double_table_multiply_region; + SET_FUNCTION(gf,divide,w32,gf_w8_double_table_lazy_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_lazy_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region) break; case 3: #if defined(INTEL_SSSE3) || defined(ARM_NEON) - gf->divide.w32 = gf_w8_default_divide; - gf->multiply.w32 = gf_w8_default_multiply; + if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) { + SET_FUNCTION(gf,divide,w32,gf_w8_default_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_default_multiply) #if defined(INTEL_SSSE3) - gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; + if (gf_cpu_supports_intel_ssse3) { + SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse) + } #elif defined(ARM_NEON) - gf_w8_neon_split_init(gf); + if (gf_cpu_supports_arm_neon) { + gf_w8_neon_split_init(gf); + } #endif + } #endif break; } @@ -1472,18 +1476,18 @@ int gf_w8_composite_init(gf_t *gf) cd->mult_table = gf_w4_get_mult_table(h->base_gf); if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = gf_w8_composite_multiply_region_alt; + SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region_alt) } else { - gf->multiply_region.w32 = gf_w8_composite_multiply_region; + SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region) } if (cd->mult_table == NULL) { - gf->multiply.w32 = gf_w8_composite_multiply_recursive; + SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_recursive) } else { - gf->multiply.w32 = gf_w8_composite_multiply_inline; + SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_inline) } - gf->divide.w32 = NULL; - gf->inverse.w32 = gf_w8_composite_inverse; + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,gf_w8_composite_inverse) return 1; } @@ -2190,28 +2194,30 @@ int gf_w8_bytwo_init(gf_t *gf) } if (h->mult_type == GF_MULT_BYTWO_p) { - gf->multiply.w32 = gf_w8_bytwo_p_multiply; + SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_p_multiply) #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region; -#else - gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region; - if(h->region_type & GF_REGION_SIMD) - return 0; + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_sse_multiply_region) + } else { +#endif + SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region) + if(h->region_type & GF_REGION_SIMD) + return 0; +#ifdef INTEL_SSE2 + } #endif } else { - gf->multiply.w32 = gf_w8_bytwo_b_multiply; + SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_b_multiply) #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSIMD) - gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region; - else - gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region; -#else - gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region; + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_sse_multiply_region) + } else { +#endif + SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_nosse_multiply_region) if(h->region_type & GF_REGION_SIMD) return 0; +#ifdef INTEL_SSE2 + } #endif } return 1; @@ -2229,9 +2235,9 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1 switch(mult_type) { case GF_MULT_DEFAULT: -#if defined(INTEL_SSSE3) || defined(ARM_NEON) - return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64; -#endif + if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) { + return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64; + } return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; case GF_MULT_TABLE: if (region_type == GF_REGION_CAUCHY) { @@ -2304,11 +2310,11 @@ int gf_w8_init(gf_t *gf) h->prim_poly |= 0x100; } - gf->multiply.w32 = NULL; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = NULL; - gf->extract_word.w32 = gf_w8_extract_word; + SET_FUNCTION(gf,multiply,w32,NULL) + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,multiply_region,w32,NULL) + SET_FUNCTION(gf,extract_word,w32,gf_w8_extract_word) switch(h->mult_type) { case GF_MULT_DEFAULT: @@ -2326,31 +2332,31 @@ int gf_w8_init(gf_t *gf) } if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w32 = gf_w8_divide_from_inverse; - gf->inverse.w32 = gf_w8_euclid; + SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w8_euclid) } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w32 = gf_w8_divide_from_inverse; - gf->inverse.w32 = gf_w8_matrix; + SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w8_matrix) } if (gf->divide.w32 == NULL) { - gf->divide.w32 = gf_w8_divide_from_inverse; - if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_euclid; + SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse) + if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w8_euclid) } - if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_inverse_from_divide; + if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w8_inverse_from_divide) if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) { - gf->extract_word.w32 = gf_w8_composite_extract_word; + SET_FUNCTION(gf,extract_word,w32,gf_w8_composite_extract_word) } if (h->region_type == GF_REGION_CAUCHY) { - gf->multiply_region.w32 = gf_wgen_cauchy_region; - gf->extract_word.w32 = gf_wgen_extract_word; + SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region) + SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word) } if (gf->multiply_region.w32 == NULL) { - gf->multiply_region.w32 = gf_w8_multiply_region_from_single; + SET_FUNCTION(gf,multiply_region,w32,gf_w8_multiply_region_from_single) } return 1; diff --git a/src/gf_wgen.c b/src/gf_wgen.c index ebc50a5..1e3d2e0 100644 --- a/src/gf_wgen.c +++ b/src/gf_wgen.c @@ -178,8 +178,8 @@ gf_wgen_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32) static int gf_wgen_shift_init(gf_t *gf) { - gf->multiply.w32 = gf_wgen_shift_multiply; - gf->inverse.w32 = gf_wgen_euclid; + SET_FUNCTION(gf,multiply,w32,gf_wgen_shift_multiply) + SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid) return 1; } @@ -211,8 +211,8 @@ gf_wgen_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) static int gf_wgen_bytwo_b_init(gf_t *gf) { - gf->multiply.w32 = gf_wgen_bytwo_b_multiply; - gf->inverse.w32 = gf_wgen_euclid; + SET_FUNCTION(gf,multiply,w32,gf_wgen_bytwo_b_multiply) + SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid) return 1; } @@ -247,8 +247,8 @@ gf_wgen_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) static int gf_wgen_bytwo_p_init(gf_t *gf) { - gf->multiply.w32 = gf_wgen_bytwo_p_multiply; - gf->inverse.w32 = gf_wgen_euclid; + SET_FUNCTION(gf,multiply,w32,gf_wgen_bytwo_p_multiply) + SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid) return 1; } @@ -453,12 +453,12 @@ int gf_wgen_group_init(gf_t *gf) } if (g_s == g_r) { - gf->multiply.w32 = gf_wgen_group_s_equals_r_multiply; + SET_FUNCTION(gf,multiply,w32,gf_wgen_group_s_equals_r_multiply) } else { - gf->multiply.w32 = gf_wgen_group_multiply; + SET_FUNCTION(gf,multiply,w32,gf_wgen_group_multiply) } - gf->divide.w32 = NULL; - gf->divide.w32 = NULL; + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,divide,w32,NULL) return 1; } @@ -519,8 +519,8 @@ int gf_wgen_table_8_init(gf_t *gf) } } - gf->multiply.w32 = gf_wgen_table_8_multiply; - gf->divide.w32 = gf_wgen_table_8_divide; + SET_FUNCTION(gf,multiply,w32,gf_wgen_table_8_multiply) + SET_FUNCTION(gf,divide,w32,gf_wgen_table_8_divide) return 1; } @@ -580,8 +580,8 @@ int gf_wgen_table_16_init(gf_t *gf) } } - gf->multiply.w32 = gf_wgen_table_16_multiply; - gf->divide.w32 = gf_wgen_table_16_divide; + SET_FUNCTION(gf,multiply,w32,gf_wgen_table_16_multiply) + SET_FUNCTION(gf,divide,w32,gf_wgen_table_16_divide) return 1; } @@ -670,8 +670,8 @@ int gf_wgen_log_8_init(gf_t *gf) return 0; } - gf->multiply.w32 = gf_wgen_log_8_multiply; - gf->divide.w32 = gf_wgen_log_8_divide; + SET_FUNCTION(gf,multiply,w32,gf_wgen_log_8_multiply) + SET_FUNCTION(gf,divide,w32,gf_wgen_log_8_divide) return 1; } @@ -746,8 +746,8 @@ int gf_wgen_log_16_init(gf_t *gf) return 0; } - gf->multiply.w32 = gf_wgen_log_16_multiply; - gf->divide.w32 = gf_wgen_log_16_divide; + SET_FUNCTION(gf,multiply,w32,gf_wgen_log_16_multiply) + SET_FUNCTION(gf,divide,w32,gf_wgen_log_16_divide) return 1; } @@ -821,8 +821,8 @@ int gf_wgen_log_32_init(gf_t *gf) return 0; } - gf->multiply.w32 = gf_wgen_log_32_multiply; - gf->divide.w32 = gf_wgen_log_32_divide; + SET_FUNCTION(gf,multiply,w32,gf_wgen_log_32_multiply) + SET_FUNCTION(gf,divide,w32,gf_wgen_log_32_divide) return 1; } @@ -975,11 +975,11 @@ int gf_wgen_init(gf_t *gf) } } - gf->multiply.w32 = NULL; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = gf_wgen_cauchy_region; - gf->extract_word.w32 = gf_wgen_extract_word; + SET_FUNCTION(gf,multiply,w32,NULL) + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region) + SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word) switch(h->mult_type) { case GF_MULT_DEFAULT: @@ -1000,20 +1000,20 @@ int gf_wgen_init(gf_t *gf) default: return 0; } if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w32 = gf_wgen_divide_from_inverse; - gf->inverse.w32 = gf_wgen_euclid; + SET_FUNCTION(gf,divide,w32,gf_wgen_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid) } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w32 = gf_wgen_divide_from_inverse; - gf->inverse.w32 = gf_wgen_matrix; + SET_FUNCTION(gf,divide,w32,gf_wgen_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_wgen_matrix) } - if (gf->inverse.w32== NULL && gf->divide.w32 == NULL) gf->inverse.w32 = gf_wgen_euclid; + if (gf->inverse.w32== NULL && gf->divide.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid) if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { - gf->divide.w32 = gf_wgen_divide_from_inverse; + SET_FUNCTION(gf,divide,w32,gf_wgen_divide_from_inverse) } if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { - gf->inverse.w32 = gf_wgen_inverse_from_divide; + SET_FUNCTION(gf,inverse,w32,gf_wgen_inverse_from_divide) } return 1; } diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c index 2bd3f30..477ee63 100644 --- a/src/neon/gf_w16_neon.c +++ b/src/neon/gf_w16_neon.c @@ -270,7 +270,7 @@ void gf_w16_neon_split_init(gf_t *gf) gf_internal_t *h = (gf_internal_t *) gf->scratch; if (h->region_type & GF_REGION_ALTMAP) - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_altmap_multiply_region_neon; + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_altmap_multiply_region_neon) else - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region_neon; + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region_neon) } diff --git a/src/neon/gf_w32_neon.c b/src/neon/gf_w32_neon.c index 8231eb3..7fd1329 100644 --- a/src/neon/gf_w32_neon.c +++ b/src/neon/gf_w32_neon.c @@ -262,8 +262,8 @@ void gf_w32_neon_split_init(gf_t *gf) gf_internal_t *h = (gf_internal_t *) gf->scratch; if (h->region_type & GF_REGION_ALTMAP) - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_altmap_multiply_region_neon; + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_altmap_multiply_region_neon) else - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region_neon; + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region_neon) } diff --git a/src/neon/gf_w4_neon.c b/src/neon/gf_w4_neon.c index 3a21432..5f35c86 100644 --- a/src/neon/gf_w4_neon.c +++ b/src/neon/gf_w4_neon.c @@ -235,13 +235,13 @@ gf_w4_single_table_multiply_region_neon(gf_t *gf, void *src, void *dest, int gf_w4_neon_cfm_init(gf_t *gf) { // single clm multiplication probably pointless - gf->multiply.w32 = gf_w4_neon_clm_multiply; - gf->multiply_region.w32 = gf_w4_neon_clm_multiply_region_from_single; + SET_FUNCTION(gf,multiply,w32,gf_w4_neon_clm_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w4_neon_clm_multiply_region_from_single) return 1; } void gf_w4_neon_single_table_init(gf_t *gf) { - gf->multiply_region.w32 = gf_w4_single_table_multiply_region_neon; + SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region_neon) } diff --git a/src/neon/gf_w64_neon.c b/src/neon/gf_w64_neon.c index 0eca9c7..2409823 100644 --- a/src/neon/gf_w64_neon.c +++ b/src/neon/gf_w64_neon.c @@ -326,8 +326,8 @@ void gf_w64_neon_split_init(gf_t *gf) gf_internal_t *h = (gf_internal_t *) gf->scratch; if (h->region_type & GF_REGION_ALTMAP) - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_altmap_multiply_region_neon; + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_altmap_multiply_region_neon) else - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region_neon; + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region_neon) } diff --git a/src/neon/gf_w8_neon.c b/src/neon/gf_w8_neon.c index 930a916..0cce5ba 100644 --- a/src/neon/gf_w8_neon.c +++ b/src/neon/gf_w8_neon.c @@ -188,14 +188,14 @@ int gf_w8_neon_cfm_init(gf_t *gf) h = (gf_internal_t *) gf->scratch; if ((0xe0 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w8_neon_clm_multiply_2; - gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_2; + SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_2) }else if ((0xc0 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w8_neon_clm_multiply_3; - gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_3; + SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_3) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_3) }else if ((0x80 & h->prim_poly) == 0){ - gf->multiply.w32 = gf_w8_neon_clm_multiply_4; - gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_4; + SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_4) }else{ return 0; } @@ -298,5 +298,5 @@ gf_w8_split_multiply_region_neon(gf_t *gf, void *src, void *dest, gf_val_32_t va void gf_w8_neon_split_init(gf_t *gf) { - gf->multiply_region.w32 = gf_w8_split_multiply_region_neon; + SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_neon) } diff --git a/test/Makefile.am b/test/Makefile.am index 2791528..f590ecc 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -1,7 +1,7 @@ # GF-Complete 'test' AM file AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include -AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC +AM_CFLAGS = -O3 -fPIC bin_PROGRAMS = gf_unit diff --git a/tools/Makefile.am b/tools/Makefile.am index a9dd8b9..4ca9131 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -1,7 +1,7 @@ # GF-Complete 'tools' AM file AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include -AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC +AM_CFLAGS = -O3 -fPIC bin_PROGRAMS = gf_mult gf_div gf_add gf_time gf_methods gf_poly gf_inline_time diff --git a/tools/gf_methods.c b/tools/gf_methods.c index c7d3d58..b016c33 100644 --- a/tools/gf_methods.c +++ b/tools/gf_methods.c @@ -39,7 +39,7 @@ static char *divides[NDIVS] = { "MATRIX", "EUCLID" }; void usage(char *s) { - fprintf(stderr, "usage: gf_methods w -BADC -LUMDRB\n"); + fprintf(stderr, "usage: gf_methods w -BADC -LXUMDRB\n"); fprintf(stderr, "\n"); fprintf(stderr, " w can be 1-32, 64, 128\n"); fprintf(stderr, "\n"); @@ -50,6 +50,7 @@ void usage(char *s) fprintf(stderr, " Combinations are fine.\n"); fprintf(stderr, "\n"); fprintf(stderr, " -L Simply lists methods\n"); + fprintf(stderr, " -X List methods and functions selected (compile with DEBUG_FUNCTIONS)\n"); fprintf(stderr, " -U Produces calls to gf_unit\n"); fprintf(stderr, " -M Produces calls to time_tool.sh for single multiplications\n"); fprintf(stderr, " -D Produces calls to time_tool.sh for single divisions\n"); @@ -63,6 +64,19 @@ void usage(char *s) exit(1); } +void print_methods(gf_t *gf) +{ +#ifdef DEBUG_FUNCTIONS + gf_internal_t *h = (gf_internal_t*) gf->scratch; + + printf("multiply = %s\n", h->multiply); + printf("divide = %s\n", h->divide); + printf("inverse = %s\n", h->inverse); + printf("multiply_region = %s\n", h->multiply_region); + printf("extract_word = %s\n", h->extract_word); +#endif +} + int main(int argc, char *argv[]) { int m, r, d, w, i, sa, j, k, reset, ok; @@ -99,12 +113,12 @@ int main(int argc, char *argv[]) } } - if (strchr("LUMDRB", argv[3][1]) == NULL) { usage("Bad -LUMDRB"); } + if (strchr("LXUMDRB", argv[3][1]) == NULL) { usage("Bad -LXUMDRB"); } listing = argv[3][1]; if (listing == 'U') { w_str = "../test/gf_unit %d A -1"; - } else if (listing == 'L') { + } else if (listing == 'L' || listing == 'X') { w_str = "w=%d:"; } else { w_str = strdup("sh time_tool.sh X %d"); @@ -192,6 +206,8 @@ int main(int argc, char *argv[]) printf(w_str, w); for (j = 0; j < sa; j++) printf(" %s", gf_argv[j]); printf("\n"); + if (listing == 'X') + print_methods(&gf); gf_free(&gf, 1); } else if (_gf_errno == GF_E_DEFAULT) { fprintf(stderr, "Unlabeled failed method: w=%d:", w); @@ -212,6 +228,8 @@ int main(int argc, char *argv[]) printf(w_str, w); for (j = 0; j < sa; j++) printf(" %s", gf_argv[j]); printf("\n"); + if (listing == 'X') + print_methods(&gf); gf_free(&gf, 1); } else if (_gf_errno == GF_E_DEFAULT) { fprintf(stderr, "Unlabeled failed method: w=%d:", w); diff --git a/tools/test_simd.sh b/tools/test_simd.sh new file mode 100755 index 0000000..e514e4f --- /dev/null +++ b/tools/test_simd.sh @@ -0,0 +1,367 @@ +#!/bin/bash -e + +# this scripts has a number of tests for SIMD. It can be invoked +# on the host or on a QEMU machine. + +script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +host_cpu=`uname -p` +results=${script_dir}/test_simd.results +nprocs=$(grep -c ^processor /proc/cpuinfo) + +# runs unit tests and save the results +test_unit(){ + { ./configure && make clean && make; } || { echo "Compile FAILED" >> ${results}; return 1; } + make -j$nprocs check || { echo "gf_methods $i FAILED" >> ${results}; ((++failed)); } + cat tools/test-suite.log >> ${results} || true +} + +# build with DEBUG_FUNCTIONS and save all methods selected +# to a results file +test_functions() { + failed=0 + + { ./configure --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${results}; return 1; } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${results}; } || { echo "gf_methods $i FAILED" >> ${results}; ((++failed)); } + done + + return ${failed} +} + +# build with DEBUG_CPU_FUNCTIONS and print out CPU detection +test_detection() { + failed=0 + + { ./configure --enable-debug-cpu && make clean && make; } || { echo "Compile FAILED" >> ${results}; return 1; } + { ${script_dir}/gf_methods 32 -ACD -L | grep '#' >> ${results}; } || { echo "gf_methods $i FAILED" >> ${results}; ((++failed)); } + + return ${failed} +} + +compile_arm() { + failed=0 + + echo -n "Compiling with NO SIMD support..." >> ${results} + { ./configure --disable-neon && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with FULL SIMD support..." >> ${results} + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + return ${failed} +} + +compile_intel() { + failed=0 + + echo -n "Compiling with NO SIMD support..." >> ${results} + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with SSE2 only..." >> ${results} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=no + export ax_cv_have_ssse3_ext=no + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with SSE2,SSE3 only..." >> ${results} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=no + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with SSE2,SSE3,SSSE3 only..." >> ${results} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with SSE2,SSE3,SSSE3,SSE4_1 only..." >> ${results} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=yes + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with SSE2,SSE3,SSSE3,SSE4_2 only..." >> ${results} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=yes + export ax_cv_have_pclmuldq_ext=no + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with FULL SIMD support..." >> ${results} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=yes + export ax_cv_have_sse42_ext=yes + export ax_cv_have_pclmuldq_ext=yes + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + return ${failed} +} + +# test that we can compile the source code with different +# SIMD options. We assume that we are running on processor +# full SIMD support +test_compile() { + case $host_cpu in + aarch64*|arm*) compile_arm ;; + i[[3456]]86*|x86_64*|amd64*) compile_intel ;; + esac +} + +# disable through build flags +runtime_arm_flags() { + failed=0 + + echo "====NO SIMD support..." >> ${1} + { ./configure --disable-neon --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${1}; return 1; } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====FULL SIMD support..." >> ${1} + { ./configure --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${1}; return 1; } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + return ${failed} +} + +# build once with FULL SIMD and disable at runtime through environment +runtime_arm_env() { + failed=0 + + { ./configure --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${1}; return 1; } + + echo "====NO SIMD support..." >> ${1} + export GF_COMPLETE_DISABLE_NEON=1 + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====FULL SIMD support..." >> ${1} + unset GF_COMPLETE_DISABLE_NEON + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + return ${failed} +} + +runtime_intel_flags() { + failed=0 + + echo "====NO SIMD support..." >> ${1} + { ./configure --disable-sse --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2 support..." >> ${1} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=no + export ax_cv_have_ssse3_ext=no + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2,SSE3 support..." >> ${1} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=no + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2,SSE3,SSSE3 support..." >> ${1} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2,SSE3,SSSE3,SSE4_1 support..." >> ${1} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=yes + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2,SSE3,SSSE3,SSE4_2 support..." >> ${1} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=yes + export ax_cv_have_pclmuldq_ext=no + { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====FULL SIMD support..." >> ${1} + { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + return ${failed} +} + +runtime_intel_env() { + failed=0 + + # compile a build with full SIMD support + { ./configure --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${1}; return 1; } + + echo "====NO SIMD support..." >> ${1} + export GF_COMPLETE_DISABLE_SSE2=1 + export GF_COMPLETE_DISABLE_SSE3=1 + export GF_COMPLETE_DISABLE_SSSE3=1 + export GF_COMPLETE_DISABLE_SSE4=1 + export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1 + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2 support..." >> ${1} + unset GF_COMPLETE_DISABLE_SSE2 + export GF_COMPLETE_DISABLE_SSE3=1 + export GF_COMPLETE_DISABLE_SSSE3=1 + export GF_COMPLETE_DISABLE_SSE4=1 + export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1 + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2,SSE3 support..." >> ${1} + unset GF_COMPLETE_DISABLE_SSE2 + unset GF_COMPLETE_DISABLE_SSE3 + export GF_COMPLETE_DISABLE_SSSE3=1 + export GF_COMPLETE_DISABLE_SSE4=1 + export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1 + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2,SSE3,SSSE3 support..." >> ${1} + unset GF_COMPLETE_DISABLE_SSE2 + unset GF_COMPLETE_DISABLE_SSE3 + unset GF_COMPLETE_DISABLE_SSSE3 + export GF_COMPLETE_DISABLE_SSE4=1 + export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1 + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2,SSE3,SSSE3,SSE4_1 support..." >> ${1} + unset GF_COMPLETE_DISABLE_SSE2 + unset GF_COMPLETE_DISABLE_SSE3 + unset GF_COMPLETE_DISABLE_SSSE3 + unset GF_COMPLETE_DISABLE_SSE4 + export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1 + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2,SSE3,SSSE3,SSE4_2 support..." >> ${1} + unset GF_COMPLETE_DISABLE_SSE2 + unset GF_COMPLETE_DISABLE_SSE3 + unset GF_COMPLETE_DISABLE_SSSE3 + unset GF_COMPLETE_DISABLE_SSE4 + export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1 + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====FULL SIMD support..." >> ${1} + unset GF_COMPLETE_DISABLE_SSE2 + unset GF_COMPLETE_DISABLE_SSE3 + unset GF_COMPLETE_DISABLE_SSSE3 + unset GF_COMPLETE_DISABLE_SSE4 + unset GF_COMPLETE_DISABLE_SSE4_PCLMUL + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + return ${failed} +} + +test_runtime() { + rm -f ${results}.left + rm -f ${results}.right + + case $host_cpu in + aarch64*|arm*) + runtime_arm_flags ${results}.left + runtime_arm_env ${results}.right + ;; + i[[3456]]86*|x86_64*|amd64*) + runtime_intel_flags ${results}.left + runtime_intel_env ${results}.right + ;; + esac + + echo "======LEFT======" > ${results} + cat ${results}.left >> ${results} + echo "======RIGHT======" >> ${results} + cat ${results}.right >> ${results} + echo "======RESULT======" >> ${results} + if diff "${results}.left" "${results}.right"; then + echo SUCCESS >> ${results} + return 0 + else + echo SUCCESS >> ${results} + return 1 + fi +} + +cd ${script_dir}/.. +rm -f ${results} + +test_$1 +exit $? diff --git a/tools/test_simd_qemu.sh b/tools/test_simd_qemu.sh new file mode 100755 index 0000000..5771874 --- /dev/null +++ b/tools/test_simd_qemu.sh @@ -0,0 +1,258 @@ +#!/bin/bash -e + +# This script will use QEMU to test gf-complete especially SIMD support +# on different architectures and cpus. It will boot a qemu machine +# and run an Ubuntu cloud image. All testing will happen inside the +# QEMU machine. + +# The following packages are required: +# qemu-system-aarch64 +# qemu-system-arm +# qemu-system-x86_64 +# genisoimage + + +script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +qemu_dir="${script_dir}/.qemu" +ssh_port=2222 +ssh_pubkey_file="${qemu_dir}/qemu.pub" +ssh_key_file="${qemu_dir}/qemu" + +mkdir -p "${qemu_dir}" + +cleanup() { + if [[ -n "$(jobs -p)" ]]; then + echo killing qemu processes "$(jobs -p)" + kill $(jobs -p) + fi +} + +trap cleanup EXIT + +start_qemu() { + arch=$1 + cpu=$2 + + image_version="xenial" + image_url_base="http://cloud-images.ubuntu.com/${image_version}/current" + + case $arch in + i[[3456]]86*|x86_64*|amd64*) + image_kernel="${image_version}-server-cloudimg-amd64-vmlinuz-generic" + image_initrd="${image_version}-server-cloudimg-amd64-initrd-generic" + image_disk="${image_version}-server-cloudimg-amd64-disk1.img" + ;; + aarch64*) + image_kernel="${image_version}-server-cloudimg-arm64-vmlinuz-generic" + image_initrd="${image_version}-server-cloudimg-arm64-initrd-generic" + image_disk="${image_version}-server-cloudimg-arm64-disk1.img" + ;; + arm*) + image_kernel="${image_version}-server-cloudimg-armhf-vmlinuz-lpae" + image_initrd="${image_version}-server-cloudimg-armhf-initrd-generic-lpae" + image_disk="${image_version}-server-cloudimg-armhf-disk1.img" + ;; + *) die "Unsupported arch" ;; + esac + + [[ -f ${qemu_dir}/${image_kernel} ]] || wget -O ${qemu_dir}/${image_kernel} ${image_url_base}/unpacked/${image_kernel} + [[ -f ${qemu_dir}/${image_initrd} ]] || wget -O ${qemu_dir}/${image_initrd} ${image_url_base}/unpacked/${image_initrd} + [[ -f ${qemu_dir}/${image_disk} ]] || wget -O ${qemu_dir}/${image_disk} ${image_url_base}/${image_disk} + + #create a delta disk to keep the original image clean + delta_disk="${qemu_dir}/disk.img" + rm -f ${delta_disk} + qemu-img create -q -f qcow2 -b "${qemu_dir}/${image_disk}" ${delta_disk} + + # generate an ssh keys + [[ -f ${ssh_pubkey_file} ]] || ssh-keygen -q -N "" -f ${ssh_key_file} + + # create a config disk to set the SSH keys + cat > "${qemu_dir}/meta-data" <<EOF +instance-id: qemu +local-hostname: qemu +EOF + cat > "${qemu_dir}/user-data" <<EOF +#cloud-config +hostname: qemu +manage_etc_hosts: true +users: + - name: qemu + ssh-authorized-keys: + - $(cat "${ssh_pubkey_file}") + sudo: ['ALL=(ALL) NOPASSWD:ALL'] + groups: sudo + shell: /bin/bash +EOF + genisoimage -quiet -output "${qemu_dir}/cloud.iso" -volid cidata -joliet -rock "${qemu_dir}/user-data" "${qemu_dir}/meta-data" + + common_args=( \ + -name "qemu" \ + -m 1024 \ + -nodefaults \ + -nographic \ + -kernel ${qemu_dir}/${image_kernel} \ + -initrd ${qemu_dir}/${image_initrd} \ + -cdrom ${qemu_dir}/cloud.iso \ + -serial file:${qemu_dir}/console.log + ) + + case $arch in + i[[3456]]86*|x86_64*|amd64*) + qemu-system-x86_64 \ + "${common_args[@]}" \ + -machine accel=kvm -cpu $cpu \ + -append "console=ttyS0 root=/dev/sda1" \ + -hda "${delta_disk}" \ + -net nic,vlan=0,model=virtio \ + -net user,vlan=0,hostfwd=tcp::"${ssh_port}"-:22,hostname="${vm_name}" \ + & + ;; + aarch64*|arm*) + qemu-system-$arch \ + "${common_args[@]}" \ + -machine virt -cpu $cpu -machine type=virt -smp 1 \ + -drive if=none,file="${delta_disk}",id=hd0 \ + -device virtio-blk-device,drive=hd0 \ + -append "console=ttyAMA0 root=/dev/vda1" \ + -netdev user,id=eth0,hostfwd=tcp::"${ssh_port}"-:22,hostname="${vm_name}" \ + -device virtio-net-device,netdev=eth0 \ + & + ;; + *) die "Unsupported arch" ;; + esac + + wait_for_ssh +} + +stop_qemu() { + run_ssh "sudo shutdown now" || true + wait $(jobs -p) +} + +shared_args=( + -i ${ssh_key_file} + -F /dev/null + -o BatchMode=yes + -o UserKnownHostsFile=/dev/null + -o StrictHostKeyChecking=no + -o IdentitiesOnly=yes +) + +ssh_args=( + ${shared_args[*]} + -p ${ssh_port} +) + +wait_for_ssh() { + retries=0 + retry_count=50 + + echo "waiting for machine to come up." + echo "tail -F ${qemu_dir}/console.log for progress." + + while true; do + set +e + ssh -q ${ssh_args[*]} -o ConnectTimeout=1 qemu@localhost "echo done" + error=$? + set -e + if [[ $error == 0 ]]; then + return 0 + fi + + if [[ ${retries} == ${retry_count} ]]; then + echo "timeout" + return 1 + fi + + echo -n "." + ((++retries)) + sleep 10 + done +} + +run_ssh() { + ssh -q ${ssh_args[*]} qemu@localhost "$@" +} + +run_scp() { + scp -q ${shared_args[*]} -P ${ssh_port} "$@" +} + +rsync_args=( + --exclude '.qemu' + --exclude '.git' +) + +run_rsync() { + rsync -avz -e "ssh ${ssh_args[*]}" ${rsync_args[*]} "$@" +} + +init_machine() { + run_ssh "sudo apt-get -y install --no-install-recommends make gcc autoconf libtool automake" +} + +init_machine_and_copy_source() { + init_machine + run_ssh "rm -fr ~/gf-complete; mkdir -p ~/gf-complete" + run_rsync ${script_dir}/.. qemu@localhost:gf-complete + run_ssh "cd ~/gf-complete && ./autogen.sh" +} + +run_test() { + arch=$1; shift + cpu=$1; shift + test=$1; shift + + run_ssh "~/gf-complete/tools/test_simd.sh ${test}" + run_scp qemu@localhost:gf-complete/tools/test_simd.results ${script_dir}/test_simd_${test}_${arch}_${cpu}.results +} + +# this test run the unit tests on the machine using "make check" +run_test_simd_basic() { + arch=$1; shift + cpu=$1; shift + + failed=0 + + echo "=====starting qemu machine $arch $cpu" + start_qemu $arch $cpu + init_machine_and_copy_source + echo "=====running compile test" + { run_test $arch $cpu "compile" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); } + echo "=====running unit test" + { run_test $arch $cpu "unit" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); } + echo "=====running functions test" + { run_test $arch $cpu "functions" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); } + echo "=====running detection test" + { run_test $arch $cpu "detection" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); } + echo "=====running runtime test" + { run_test $arch $cpu "runtime" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); } + stop_qemu + + return ${failed} +} + +run_all_tests() { + failed=0 + + echo ============================ + echo =====running x86_64 tests + # NOTE: Broadwell has all the supported SIMD instructions + { run_test_simd_basic "x86_64" "Broadwell" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); } + + echo ============================ + echo =====running aarch64 tests + # NOTE: cortex-a57 has ASIMD support + { run_test_simd_basic "aarch64" "cortex-a57" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); } + + echo ============================ + echo =====running arm tests + # NOTE: cortex-a15 has NEON support + { run_test_simd_basic "arm" "cortex-a15" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); } + + return ${failed} +} + +run_all_tests +exit $? |