diff options
| author | Warren Weckesser <warren.weckesser@gmail.com> | 2019-10-24 22:03:05 -0400 |
|---|---|---|
| committer | Warren Weckesser <warren.weckesser@gmail.com> | 2019-10-24 22:06:31 -0400 |
| commit | 4780c1bc78065154e2817e899a37acdde7225ec5 (patch) | |
| tree | c39d5ff2ad634fe3244dfff358460ba2cbdb1569 /numpy/random | |
| parent | 34e83881a9e377d8cc8fe2644bfb06600d9e77d1 (diff) | |
| download | numpy-4780c1bc78065154e2817e899a37acdde7225ec5.tar.gz | |
BUG: random: biased samples from integers() with 8 or 16 bit dtype.
When an 8 or 16 bit dtype was given to the integers() method of the
Generator class, the resulting sample was biased. The problem was
the lines of the form
const uint8_t threshold = -rng_excl % rng_excl;
in the implementations of Lemire's method, in the C file
distributions.c. The intent was to compute
(UINT8_MAX+1 - rng_excl) % rng_excl
However, when the type of rng_excl has integer conversion rank lower
than a C int (which is almost certainly the case for the 8 and 16
bit types), the terms in the expression -rng_excl % rng_excl are
promoted to int, and the result of the calculation is always 0.
The fix is to make the expression explicit, and write it as
const uint8_t threshold = (UINT8_MAX - rng) % rng_excl;
rng is used, because rng_excl is simply rng + 1; by using rng, we
we only need the constant UINT#_MAX, without the extra +1.
For consistency, I made the same change for all the data types
(8, 16, 32 and 64 bit).
Closes gh-14774.
Diffstat (limited to 'numpy/random')
| -rw-r--r-- | numpy/random/src/distributions/distributions.c | 19 | ||||
| -rw-r--r-- | numpy/random/tests/test_generator_mt19937.py | 22 |
2 files changed, 25 insertions, 16 deletions
diff --git a/numpy/random/src/distributions/distributions.c b/numpy/random/src/distributions/distributions.c index b382ead0b..ab8de8bcb 100644 --- a/numpy/random/src/distributions/distributions.c +++ b/numpy/random/src/distributions/distributions.c @@ -1170,10 +1170,7 @@ static NPY_INLINE uint64_t bounded_lemire_uint64(bitgen_t *bitgen_state, if (leftover < rng_excl) { /* `rng_excl` is a simple upper bound for `threshold`. */ - - const uint64_t threshold = -rng_excl % rng_excl; - /* Same as: threshold=((uint64_t)(0x10000000000000000ULLL - rng_excl)) % - * rng_excl; */ + const uint64_t threshold = (UINT64_MAX - rng) % rng_excl; while (leftover < threshold) { m = ((__uint128_t)next_uint64(bitgen_state)) * rng_excl; @@ -1196,10 +1193,7 @@ static NPY_INLINE uint64_t bounded_lemire_uint64(bitgen_t *bitgen_state, if (leftover < rng_excl) { /* `rng_excl` is a simple upper bound for `threshold`. */ - - const uint64_t threshold = -rng_excl % rng_excl; - /* Same as:threshold=((uint64_t)(0x10000000000000000ULLL - rng_excl)) % - * rng_excl; */ + const uint64_t threshold = (UINT64_MAX - rng) % rng_excl; while (leftover < threshold) { x = next_uint64(bitgen_state); @@ -1260,8 +1254,7 @@ static NPY_INLINE uint32_t buffered_bounded_lemire_uint32( if (leftover < rng_excl) { /* `rng_excl` is a simple upper bound for `threshold`. */ - const uint32_t threshold = -rng_excl % rng_excl; - /* Same as: threshold=((uint64_t)(0x100000000ULL - rng_excl)) % rng_excl; */ + const uint32_t threshold = (UINT32_MAX - rng) % rng_excl; while (leftover < threshold) { m = ((uint64_t)next_uint32(bitgen_state)) * rng_excl; @@ -1295,8 +1288,7 @@ static NPY_INLINE uint16_t buffered_bounded_lemire_uint16( if (leftover < rng_excl) { /* `rng_excl` is a simple upper bound for `threshold`. */ - const uint16_t threshold = -rng_excl % rng_excl; - /* Same as: threshold=((uint32_t)(0x10000ULL - rng_excl)) % rng_excl; */ + const uint16_t threshold = (UINT16_MAX - rng) % rng_excl; while (leftover < threshold) { m = ((uint32_t)buffered_uint16(bitgen_state, bcnt, buf)) * rng_excl; @@ -1331,8 +1323,7 @@ static NPY_INLINE uint8_t buffered_bounded_lemire_uint8(bitgen_t *bitgen_state, if (leftover < rng_excl) { /* `rng_excl` is a simple upper bound for `threshold`. */ - const uint8_t threshold = -rng_excl % rng_excl; - /* Same as: threshold=((uint16_t)(0x100ULL - rng_excl)) % rng_excl; */ + const uint8_t threshold = (UINT8_MAX - rng) % rng_excl; while (leftover < threshold) { m = ((uint16_t)buffered_uint8(bitgen_state, bcnt, buf)) * rng_excl; diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py index 526275dda..d4502d276 100644 --- a/numpy/random/tests/test_generator_mt19937.py +++ b/numpy/random/tests/test_generator_mt19937.py @@ -463,11 +463,11 @@ class TestIntegers(object): 'int16': '39624ead49ad67e37545744024d2648b', 'int32': '5c4810373f979336c6c0c999996e47a1', 'int64': 'ab126c15edff26f55c50d2b7e37391ac', - 'int8': 'd1746364b48a020dab9ef0568e6c0cd2', + 'int8': 'ba71ccaffeeeb9eeb1860f8075020b9c', 'uint16': '39624ead49ad67e37545744024d2648b', 'uint32': '5c4810373f979336c6c0c999996e47a1', 'uint64': 'ab126c15edff26f55c50d2b7e37391ac', - 'uint8': 'd1746364b48a020dab9ef0568e6c0cd2'} + 'uint8': 'ba71ccaffeeeb9eeb1860f8075020b9c'} for dt in self.itype[1:]: random = Generator(MT19937(1234)) @@ -618,6 +618,24 @@ class TestIntegers(object): with pytest.raises(ValueError): random.integers(0, 200, size=10, dtype=other_byteord_dt) + # chi2max is the maximum acceptable chi-squared value. + @pytest.mark.slow + @pytest.mark.parametrize('sample_size,high,dtype,chi2max', + [(5000000, 5, np.int8, 125.0), # p-value ~4.6e-25 + (5000000, 7, np.uint8, 150.0), # p-value ~7.7e-30 + (10000000, 2500, np.int16, 3300.0), # p-value ~3.0e-25 + (50000000, 5000, np.uint16, 6500.0), # p-value ~3.5e-25 + ]) + def test_integers_small_dtype_chisquared(self, sample_size, high, + dtype, chi2max): + # Regression test for gh-14774. + samples = random.integers(high, size=sample_size, dtype=dtype) + + values, counts = np.unique(samples, return_counts=True) + expected = sample_size / high + chi2 = ((counts - expected)**2 / expected).sum() + assert chi2 < chi2max + class TestRandomDist(object): # Make sure the random distribution returns the correct value for a |
