BUG: Ugly fix for Apple's cblas_sgemv segfault

SGEMV in Accelerate framework will segfault on MacOS X version 10.9 (aka Mavericks) if arrays are not aligned to 32 byte boundaries and the CPU supports AVX instructions. This can produce segfaults in numpy.dot if we use numpy.float32 as dtype. This patch overshadows the symbols cblas_sgemv, sgemv_ and sgemv exported by Accelerate to produce the correct behavior. The MacOS X version and CPU specs are checked on module import. If Mavericks and AVX are detected the call to SGEMV is emulated with a call to SGEMM if the arrays are not 32 byte aligned. If the exported symbols cannot be overshadowed on module import, a fatal error is produced and the process aborts. All the fixes are in a self-contained C file and do not alter the _dotblas C code. The patch is not applied unless NumPy is configured to link with Apple's Accelerate framework.
author: Sturla Molden <sturla@molden.no> 2014-10-18 09:03:47 +0200
committer: Sturla Molden <sturla@molden.no> 2014-10-23 19:39:32 +0200
commit: e2eaf961dff8d57e3c12c19c39956a40754a1d0e (patch)
tree: 1ff5c5bb995f4fd5099cf404a3a5628fd08dc14f /numpy
parent: 555d803792a9f78db64961d6236eb0b01cd7af45 (diff)
download: numpy-e2eaf961dff8d57e3c12c19c39956a40754a1d0e.tar.gz
4 files changed, 299 insertions, 5 deletions
diff --git a/numpy/core/blasdot/apple_sgemv_patch.c b/numpy/core/blasdot/apple_sgemv_patch.c
new file mode 100644
index 000000000..9941a0731
--- /dev/null
+++ b/numpy/core/blasdot/apple_sgemv_patch.c
@@ -0,0 +1,216 @@
+#ifdef APPLE_ACCELERATE_SGEMV_PATCH  
+
+/* This is an ugly hack to circumvent a bug in Accelerate's cblas_sgemv.
+ *
+ * See: https://github.com/numpy/numpy/issues/4007
+ *
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include "Python.h"
+#include "numpy/arrayobject.h"
+
+#include <string.h>
+#include <dlfcn.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/* ----------------------------------------------------------------- */
+/* Original cblas_sgemv */
+
+#define VECLIB_FILE "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/vecLib"
+
+enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
+enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
+extern void cblas_xerbla(int info, const char *rout, const char *form, ...);
+
+typedef void cblas_sgemv_t(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const float alpha, const float  *A, const int lda,
+                 const float  *X, const int incX, 
+                 const float beta, float  *Y, const int incY);
+
+typedef void cblas_sgemm_t(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
+                 const int M, const int N, const int K,
+                 const float alpha, const float  *A, const int lda,
+                 const float  *B, const int ldb, 
+                 const float beta, float  *C, const int incC); 
+
+typedef void fortran_sgemv_t( const char* trans, const int* m, const int* n,
+             const float* alpha, const float* A, const int* ldA,
+             const float* X, const int* incX,
+             const float* beta, float* Y, const int* incY );
+
+static void *veclib = NULL;
+static cblas_sgemv_t *accelerate_cblas_sgemv = NULL;
+static cblas_sgemm_t *accelerate_cblas_sgemm = NULL;
+static fortran_sgemv_t *accelerate_sgemv = NULL;
+static int AVX_and_10_9 = 0;
+
+/* Dynamic check for AVX support
+ * __builtin_cpu_supports("avx") is available in gcc 4.8,
+ * but clang and icc do not currently support it. */
+#define cpu_supports_avx()\
+(system("sysctl -n machdep.cpu.features | grep -q AVX") == 0)
+
+/* Check if we are using MacOS X version 10.9 */
+#define using_mavericks()\
+(system("sw_vers -productVersion | grep -q 10\\.9\\.") == 0) 
+ 
+__attribute__((destructor))
+static void unloadlib(void)
+{
+   if (veclib) dlclose(veclib);
+}
+
+__attribute__((constructor))
+static void loadlib()
+/* automatically executed on module import */
+{
+    char errormsg[1024];
+    int AVX, MAVERICKS;
+    memset((void*)errormsg, 0, sizeof(errormsg));
+    /* check if the CPU supports AVX */
+    AVX = cpu_supports_avx();
+    /* check if the OS is MacOS X Mavericks */
+    MAVERICKS = using_mavericks();    
+    /* we need the workaround when the CPU supports
+     * AVX and the OS version is Mavericks */
+    AVX_and_10_9 = AVX && MAVERICKS;
+    /* load vecLib */
+    veclib = dlopen(VECLIB_FILE, RTLD_LOCAL | RTLD_FIRST);
+    if (!veclib) {
+        veclib = NULL;
+        sprintf(errormsg,"Failed to open vecLib from location '%s'.", VECLIB_FILE);
+        Py_FatalError(errormsg); /* calls abort() and dumps core */
+    }
+    /* resolve Fortran SGEMV from Accelerate */
+    accelerate_sgemv = (fortran_sgemv_t*) dlsym(veclib, "sgemv_");
+    if (!accelerate_sgemv) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'sgemv_'.");
+        Py_FatalError(errormsg);
+    }
+    /* resolve cblas_sgemv from Accelerate */
+    accelerate_cblas_sgemv = (cblas_sgemv_t*) dlsym(veclib, "cblas_sgemv");
+    if (!accelerate_cblas_sgemv) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemv'.");
+        Py_FatalError(errormsg);
+    }
+    /* resolve cblas_sgemm from Accelerate */
+    accelerate_cblas_sgemm = (cblas_sgemm_t*) dlsym(veclib, "cblas_sgemm");
+    if (!accelerate_cblas_sgemm) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemm'.");
+        Py_FatalError(errormsg);
+    }
+}
+
+/* ----------------------------------------------------------------- */
+/* Fortran SGEMV override */
+
+void sgemv_( const char* trans, const int* m, const int* n,
+             const float* alpha, const float* A, const int* ldA,
+             const float* X, const int* incX,
+             const float* beta, float* Y, const int* incY )
+{    
+    /* It is safe to use the original SGEMV if we are not using AVX on Mavericks
+     * or the input arrays A, X and Y are all aligned on 32 byte boundaries. */
+    #define BADARRAY(x) (((npy_intp)(void*)x) % 32)
+    const int use_sgemm = AVX_and_10_9 && (BADARRAY(A) || BADARRAY(X) || BADARRAY(Y));
+    if (!use_sgemm) {
+        accelerate_sgemv(trans,m,n,alpha,A,ldA,X,incX,beta,Y,incY);
+        return;
+    }
+
+    /* Arrays are misaligned, the CPU supports AVX, and we are running 
+     * Mavericks.
+     *
+     * Emulation of SGEMV with SGEMM: 
+     *
+     * SGEMV allows vectors to be strided. SGEMM requires all arrays to be
+     * contiguous along the leading dimension. To emulate striding in SGEMV
+     * with the leading dimension arguments in SGEMM we compute
+     *
+     *    Y = alpha * op(A) @ X + beta * Y
+     *
+     * as
+     *
+     *    Y.T = alpha * X.T @ op(A).T + beta * Y.T
+     *
+     * Because Fortran uses column major order and X.T and Y.T are row vectors,
+     * the leading dimensions of X.T and Y.T in SGEMM become equal to the 
+     * strides of the the column vectors X and Y in SGEMV. */
+
+    switch (*trans) {
+        case 'T':
+        case 't':
+        case 'C':
+        case 'c':
+            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+                1, *n, *m, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+            break;
+        case 'N':
+        case 'n':
+            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
+                1, *m, *n, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+            break;
+        default:
+            cblas_xerbla(1, "SGEMV", "Illegal transpose setting: %c\n", *trans);
+    }
+}
+
+/* ----------------------------------------------------------------- */
+/* Override for an alias symbol for sgemv_ in Accelerate */
+
+void sgemv (char *trans,
+            const int *m, const int *n,
+            const float *alpha,
+            const float *A, const int *lda,
+            const float *B, const int *incB,
+            const float *beta,
+            float *C, const int *incC)
+{
+    sgemv_(trans,m,n,alpha,A,lda,B,incB,beta,C,incC);
+}
+
+/* ----------------------------------------------------------------- */
+/* cblas_sgemv override, based on Netlib CBLAS code */
+
+void cblas_sgemv(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const float alpha, const float  *A, const int lda,
+                 const float  *X, const int incX, const float beta,
+                 float  *Y, const int incY)
+{
+   char TA;
+   if (order == CblasColMajor)
+   {
+      if (TransA == CblasNoTrans) TA = 'N';
+      else if (TransA == CblasTrans) TA = 'T';
+      else if (TransA == CblasConjTrans) TA = 'C';
+      else 
+      {
+         cblas_xerbla(2, "cblas_sgemv","Illegal TransA setting, %d\n", TransA);
+      }
+      sgemv_(&TA, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+   }
+   else if (order == CblasRowMajor)
+   {
+      if (TransA == CblasNoTrans) TA = 'T';
+      else if (TransA == CblasTrans) TA = 'N';
+      else if (TransA == CblasConjTrans) TA = 'N';
+      else
+      {
+         cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA);
+         return;
+      }
+      sgemv_(&TA, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+   }
+   else
+      cblas_xerbla(1, "cblas_sgemv", "Illegal Order setting, %d\n", order);
+}
+
+#endif
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 5da042413..1cd1ee1c2 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -954,13 +954,14 @@ def configuration(parent_package='',top_path=None):
         if blas_info:
             if ('NO_ATLAS_INFO', 1) in blas_info.get('define_macros', []):
                 return None # dotblas needs ATLAS, Fortran compiled blas will not be sufficient.
-            return ext.depends[:1]
+            return ext.depends[:2]
         return None # no extension module will be built
 
     config.add_extension('_dotblas',
                          sources = [get_dotblas_sources],
                          depends = [join('blasdot', '_dotblas.c'),
-                                  join('blasdot', 'cblas.h'),
+                                    join('blasdot', 'apple_sgemv_patch.c'),
+                                    join('blasdot', 'cblas.h'),
                                   ],
                          include_dirs = ['blasdot'],
                          extra_info = blas_info
diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
index 264663835..6b5afef14 100644
--- a/numpy/core/tests/test_blasdot.py
+++ b/numpy/core/tests/test_blasdot.py
@@ -1,7 +1,9 @@
 from __future__ import division, absolute_import, print_function
 
-import numpy as np
 import sys
+from itertools import product
+
+import numpy as np
 from numpy.core import zeros, float64
 from numpy.testing import dec, TestCase, assert_almost_equal, assert_, \
      assert_raises, assert_array_equal, assert_allclose, assert_equal
@@ -170,3 +172,78 @@ def test_dot_override():
     assert_equal(c.dot(a), "A")
     assert_raises(TypeError, np.dot, b, c)
     assert_raises(TypeError, c.dot, b)
+
+
+def test_npdot_segfault():
+    if sys.platform != 'darwin': return
+    # Test for float32 np.dot segfault
+    # https://github.com/numpy/numpy/issues/4007
+
+    def aligned_array(shape, align, dtype, order='C'):
+        # Make array shape `shape` with aligned at `align` bytes
+        d = dtype()
+        # Make array of correct size with `align` extra bytes
+        N = np.prod(shape)
+        tmp = np.zeros(N * d.nbytes + align, dtype=np.uint8)
+        address = tmp.__array_interface__["data"][0]
+        # Find offset into array giving desired alignment
+        for offset in range(align):
+            if (address + offset) % align == 0: break
+        tmp = tmp[offset:offset+N*d.nbytes].view(dtype=dtype)
+        return tmp.reshape(shape, order=order)
+
+    def as_aligned(arr, align, dtype, order='C'):
+        # Copy `arr` into an aligned array with same shape
+        aligned = aligned_array(arr.shape, align, dtype, order)
+        aligned[:] = arr[:]
+        return aligned
+
+    def assert_dot_close(A, X, desired):
+        assert_allclose(np.dot(A, X), desired, rtol=1e-5, atol=1e-7)
+
+    m = aligned_array(100, 15, np.float32)
+    s = aligned_array((100, 100), 15, np.float32)
+    # This always segfaults when the sgemv alignment bug is present
+    np.dot(s, m)
+    # test the sanity of np.dot after applying patch
+    for align, m, n, a_order in product(
+        (15, 32),
+        (10000,),
+        (200, 89),
+        ('C', 'F')):
+        # Calculation in double precision
+        A_d = np.random.rand(m, n)
+        X_d = np.random.rand(n)
+        desired = np.dot(A_d, X_d)
+        # Calculation with aligned single precision
+        A_f = as_aligned(A_d, align, np.float32, order=a_order)
+        X_f = as_aligned(X_d, align, np.float32)
+        assert_dot_close(A_f, X_f, desired)
+        # Strided A rows
+        A_d_2 = A_d[::2]
+        desired = np.dot(A_d_2, X_d)
+        A_f_2 = A_f[::2]
+        assert_dot_close(A_f_2, X_f, desired)
+        # Strided A columns, strided X vector
+        A_d_22 = A_d_2[:, ::2]
+        X_d_2 = X_d[::2]
+        desired = np.dot(A_d_22, X_d_2)
+        A_f_22 = A_f_2[:, ::2]
+        X_f_2 = X_f[::2]
+        assert_dot_close(A_f_22, X_f_2, desired)
+        # Check the strides are as expected
+        if a_order == 'F':
+            assert_equal(A_f_22.strides, (8, 8 * m))
+        else:
+            assert_equal(A_f_22.strides, (8 * n, 8))
+        assert_equal(X_f_2.strides, (8,))
+        # Strides in A rows + cols only
+        X_f_2c = as_aligned(X_f_2, align, np.float32)
+        assert_dot_close(A_f_22, X_f_2c, desired)
+        # Strides just in A cols
+        A_d_12 = A_d[:, ::2]
+        desired = np.dot(A_d_12, X_d_2)
+        A_f_12 = A_f[:, ::2]
+        assert_dot_close(A_f_12, X_f_2c, desired)
+        # Strides in A cols and X
+        assert_dot_close(A_f_12, X_f_2, desired)
diff --git a/numpy/distutils/system_info.py b/numpy/distutils/system_info.py
index 48c92c548..31cb13b86 100644
--- a/numpy/distutils/system_info.py
+++ b/numpy/distutils/system_info.py
@@ -1400,7 +1400,7 @@ class lapack_opt_info(system_info):
             if os.path.exists('/System/Library/Frameworks'
                               '/Accelerate.framework/'):
                 if intel:
-                    args.extend(['-msse3'])
+                    args.extend(['-msse3', '-DAPPLE_ACCELERATE_SGEMV_PATCH'])
                 else:
                     args.extend(['-faltivec'])
                 link_args.extend(['-Wl,-framework', '-Wl,Accelerate'])
@@ -1497,7 +1497,7 @@ class blas_opt_info(system_info):
             if os.path.exists('/System/Library/Frameworks'
                               '/Accelerate.framework/'):
                 if intel:
-                    args.extend(['-msse3'])
+                    args.extend(['-msse3', '-DAPPLE_ACCELERATE_SGEMV_PATCH'])
                 else:
                     args.extend(['-faltivec'])
                 args.extend([
author	Sturla Molden <sturla@molden.no>	2014-10-18 09:03:47 +0200
committer	Sturla Molden <sturla@molden.no>	2014-10-23 19:39:32 +0200
commit	e2eaf961dff8d57e3c12c19c39956a40754a1d0e (patch)
tree	1ff5c5bb995f4fd5099cf404a3a5628fd08dc14f /numpy
parent	555d803792a9f78db64961d6236eb0b01cd7af45 (diff)
download	numpy-e2eaf961dff8d57e3c12c19c39956a40754a1d0e.tar.gz