diff options
author | Antony Lee <anntzer.lee@gmail.com> | 2016-01-06 21:04:59 -0800 |
---|---|---|
committer | Antony Lee <anntzer.lee@gmail.com> | 2016-01-16 20:56:41 -0800 |
commit | b8cf7f904974294d4e3af43c68ef23f87385f2f6 (patch) | |
tree | 1b2861b995dcaffd194b27a6bbdba483f9af84c3 /numpy | |
parent | 5be93a2580a232705e897984d0f920bc6346990e (diff) | |
download | numpy-b8cf7f904974294d4e3af43c68ef23f87385f2f6.tar.gz |
Top shuffle speed for machine-sized ints/floats.
Apparently gcc only specializes one branch (the last one), so I went for
another 33% performance increase (matching #6776) in what's likely the
most common use case.
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/random/mtrand/mtrand.pyx | 26 |
1 files changed, 18 insertions, 8 deletions
```diff
diff --git a/numpy/random/mtrand/mtrand.pyx b/numpy/random/mtrand/mtrand.pyx
index f70f578cc..c8738cf6f 100644
--- a/numpy/random/mtrand/mtrand.pyx
+++ b/numpy/random/mtrand/mtrand.pyx
@@ -4982,8 +4982,7 @@ cdef class RandomState:
 
         """
         cdef:
-            npy_intp i, j, n = len(x)
-            size_t stride, nbytes
+            npy_intp i, j, n = len(x), stride, itemsize
             char* x_ptr
             char* buf_ptr
 
@@ -4993,15 +4992,17 @@ cdef class RandomState:
             # as MaskedArrays may not support this approach).
             x_ptr = <char*><size_t>x.ctypes.data
             stride = x.strides[0]
-            nbytes = x[:1].nbytes
+            itemsize = x.dtype.itemsize
             buf = np.empty_like(x[0]) # GC'd at function exit
             buf_ptr = <char*><size_t>buf.ctypes.data
             with self.lock:
-                for i in reversed(range(1, n)):
-                    j = rk_interval(i, self.internal_state)
-                    string.memcpy(buf_ptr, x_ptr + j * stride, nbytes)
-                    string.memcpy(x_ptr + j * stride, x_ptr + i * stride, nbytes)
-                    string.memcpy(x_ptr + i * stride, buf_ptr, nbytes)
+                # We trick gcc into providing a specialized implementation for
+                # the most common case, yielding a ~33% performance improvement.
+                # Note that apparently, only one branch can ever be specialized.
+                if itemsize == sizeof(npy_intp):
+                    self._shuffle_raw(n, sizeof(npy_intp), stride, x_ptr, buf_ptr)
+                else:
+                    self._shuffle_raw(n, itemsize, stride, x_ptr, buf_ptr)
         elif isinstance(x, np.ndarray) and x.ndim > 1 and x.size:
             # Multidimensional ndarrays require a bounce buffer.
             buf = np.empty_like(x[0])
@@ -5018,6 +5019,15 @@ cdef class RandomState:
                     j = rk_interval(i, self.internal_state)
                     x[i], x[j] = x[j], x[i]
 
+    cdef inline _shuffle_raw(self, npy_intp n, npy_intp itemsize,
+                             npy_intp stride, char* data, char* buf):
+        cdef npy_intp i, j
+        for i in reversed(range(1, n)):
+            j = rk_interval(i, self.internal_state)
+            string.memcpy(buf, data + j * stride, itemsize)
+            string.memcpy(data + j * stride, data + i * stride, itemsize)
+            string.memcpy(data + i * stride, buf, itemsize)
+
     def permutation(self, object x):
         """
         permutation(x)
```