summary | refs | log | tree | commit | diff
path: root/numpy
diff options
context:
space:
mode:
author: Antony Lee <anntzer.lee@gmail.com> 2016-01-06 21:04:59 -0800
committer: Antony Lee <anntzer.lee@gmail.com> 2016-01-16 20:56:41 -0800
commit: b8cf7f904974294d4e3af43c68ef23f87385f2f6 (patch)
tree: 1b2861b995dcaffd194b27a6bbdba483f9af84c3 /numpy
parent: 5be93a2580a232705e897984d0f920bc6346990e (diff)
download: numpy-b8cf7f904974294d4e3af43c68ef23f87385f2f6.tar.gz
Top shuffle speed for machine-sized ints/floats.
Apparently gcc only specializes one branch (the last one), so I went for another 33% performance increase (matching #6776) in what's likely the most common use case.
Diffstat (limited to 'numpy')
-rw-r--r-- numpy/random/mtrand/mtrand.pyx 26
1 files changed, 18 insertions, 8 deletions
diff --git a/numpy/random/mtrand/mtrand.pyx b/numpy/random/mtrand/mtrand.pyx
index f70f578cc..c8738cf6f 100644
--- a/numpy/random/mtrand/mtrand.pyx
+++ b/numpy/random/mtrand/mtrand.pyx
@@ -4982,8 +4982,7 @@ cdef class RandomState:
"""
cdef:
- npy_intp i, j, n = len(x)
- size_t stride, nbytes
+ npy_intp i, j, n = len(x), stride, itemsize
char* x_ptr
char* buf_ptr
@@ -4993,15 +4992,17 @@ cdef class RandomState:
# as MaskedArrays may not support this approach).
x_ptr = <char*><size_t>x.ctypes.data
stride = x.strides[0]
- nbytes = x[:1].nbytes
+ itemsize = x.dtype.itemsize
buf = np.empty_like(x[0]) # GC'd at function exit
buf_ptr = <char*><size_t>buf.ctypes.data
with self.lock:
- for i in reversed(range(1, n)):
- j = rk_interval(i, self.internal_state)
- string.memcpy(buf_ptr, x_ptr + j * stride, nbytes)
- string.memcpy(x_ptr + j * stride, x_ptr + i * stride, nbytes)
- string.memcpy(x_ptr + i * stride, buf_ptr, nbytes)
+ # We trick gcc into providing a specialized implementation for
+ # the most common case, yielding a ~33% performance improvement.
+ # Note that apparently, only one branch can ever be specialized.
+ if itemsize == sizeof(npy_intp):
+ self._shuffle_raw(n, sizeof(npy_intp), stride, x_ptr, buf_ptr)
+ else:
+ self._shuffle_raw(n, itemsize, stride, x_ptr, buf_ptr)
elif isinstance(x, np.ndarray) and x.ndim > 1 and x.size:
# Multidimensional ndarrays require a bounce buffer.
buf = np.empty_like(x[0])
@@ -5018,6 +5019,15 @@ cdef class RandomState:
j = rk_interval(i, self.internal_state)
x[i], x[j] = x[j], x[i]
+ cdef inline _shuffle_raw(self, npy_intp n, npy_intp itemsize,
+ npy_intp stride, char* data, char* buf):
+ cdef npy_intp i, j
+ for i in reversed(range(1, n)):
+ j = rk_interval(i, self.internal_state)
+ string.memcpy(buf, data + j * stride, itemsize)
+ string.memcpy(data + j * stride, data + i * stride, itemsize)
+ string.memcpy(data + i * stride, buf, itemsize)
+
def permutation(self, object x):
"""
permutation(x)