From b8cf7f904974294d4e3af43c68ef23f87385f2f6 Mon Sep 17 00:00:00 2001
From: Antony Lee
Date: Wed, 6 Jan 2016 21:04:59 -0800
Subject: Top shuffle speed for machine-sized ints/floats.

Apparently gcc only specializes one branch (the last one) so I went for
another 33% performance increase (matching #6776) in what's likely the
most common use case.
---
 numpy/random/mtrand/mtrand.pyx | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'numpy')

diff --git a/numpy/random/mtrand/mtrand.pyx b/numpy/random/mtrand/mtrand.pyx
index f70f578cc..c8738cf6f 100644
--- a/numpy/random/mtrand/mtrand.pyx
+++ b/numpy/random/mtrand/mtrand.pyx
@@ -4982,8 +4982,7 @@ cdef class RandomState:
         """
 
         cdef:
-            npy_intp i, j, n = len(x)
-            size_t stride, nbytes
+            npy_intp i, j, n = len(x), stride, itemsize
             char* x_ptr
             char* buf_ptr
 
@@ -4993,15 +4992,17 @@ cdef class RandomState:
             # as MaskedArrays may not support this approach).
             x_ptr = <char*><size_t>x.ctypes.data
             stride = x.strides[0]
-            nbytes = x[:1].nbytes
+            itemsize = x.dtype.itemsize
             buf = np.empty_like(x[0])  # GC'd at function exit
             buf_ptr = <char*><size_t>buf.ctypes.data
             with self.lock:
-                for i in reversed(range(1, n)):
-                    j = rk_interval(i, self.internal_state)
-                    string.memcpy(buf_ptr, x_ptr + j * stride, nbytes)
-                    string.memcpy(x_ptr + j * stride, x_ptr + i * stride, nbytes)
-                    string.memcpy(x_ptr + i * stride, buf_ptr, nbytes)
+                # We trick gcc into providing a specialized implementation for
+                # the most common case, yielding a ~33% performance improvement.
+                # Note that apparently, only one branch can ever be specialized.
+                if itemsize == sizeof(npy_intp):
+                    self._shuffle_raw(n, sizeof(npy_intp), stride, x_ptr, buf_ptr)
+                else:
+                    self._shuffle_raw(n, itemsize, stride, x_ptr, buf_ptr)
         elif isinstance(x, np.ndarray) and x.ndim > 1 and x.size:
             # Multidimensional ndarrays require a bounce buffer.
             buf = np.empty_like(x[0])
@@ -5018,6 +5019,15 @@ cdef class RandomState:
                     j = rk_interval(i, self.internal_state)
                     x[i], x[j] = x[j], x[i]
 
+    cdef inline _shuffle_raw(self, npy_intp n, npy_intp itemsize,
+                             npy_intp stride, char* data, char* buf):
+        cdef npy_intp i, j
+        for i in reversed(range(1, n)):
+            j = rk_interval(i, self.internal_state)
+            string.memcpy(buf, data + j * stride, itemsize)
+            string.memcpy(data + j * stride, data + i * stride, itemsize)
+            string.memcpy(data + i * stride, buf, itemsize)
+
     def permutation(self, object x):
         """
         permutation(x)
-- 
cgit v1.2.1