1 files changed, 148 insertions, 75 deletions
diff --git a/Lib/test/test_bigmem.py b/Lib/test/test_bigmem.py
index f3c6ebbf05..0e545950ee 100644
--- a/Lib/test/test_bigmem.py
+++ b/Lib/test/test_bigmem.py
@@ -1,3 +1,13 @@
+"""Bigmem tests - tests for the 32-bit boundary in containers.
+
+These tests try to exercise the 32-bit boundary that is sometimes, if
+rarely, exceeded in practice, but almost never tested.  They are really only
+meaningful on 64-bit builds on machines with a *lot* of memory, but the
+tests are always run, usually with very low memory limits to make sure the
+tests themselves don't suffer from bitrot.  To run them for real, pass a
+high memory limit to regrtest, with the -M option.
+"""
+
 from test import support
 from test.support import bigmemtest, _1G, _2G, _4G
 
@@ -6,20 +16,35 @@ import operator
 import sys
 import functools
 
+# These tests all use one of the bigmemtest decorators to indicate how much
+# memory they use and how much memory they need to be even meaningful.  The
+# decorators take two arguments: a 'memuse' indicator declaring
+# (approximate) bytes per size-unit the test will use (at peak usage), and a
+# 'minsize' indicator declaring a minimum *useful* size.  A test that
+# allocates a bytestring to test various operations near the end will have a
+# minsize of at least 2Gb (or it wouldn't reach the 32-bit limit, so the
+# test wouldn't be very useful) and a memuse of 1 (one byte per size-unit,
+# if it allocates only one big string at a time.)
+#
+# When run with a memory limit set, both decorators skip tests that need
+# more memory than available to be meaningful.  The precisionbigmemtest will
+# always pass minsize as size, even if there is much more memory available.
+# The bigmemtest decorator will scale size upward to fill available memory.
+#
 # Bigmem testing houserules:
 #
 #  - Try not to allocate too many large objects. It's okay to rely on
-#    refcounting semantics, but don't forget that 's = create_largestring()'
+#    refcounting semantics, and don't forget that 's = create_largestring()'
 #    doesn't release the old 's' (if it exists) until well after its new
 #    value has been created. Use 'del s' before the create_largestring call.
 #
-#  - Do *not* compare large objects using assertEqual or similar. It's a
-#    lengthy operation and the errormessage will be utterly useless due to
-#    its size. To make sure whether a result has the right contents, better
-#    to use the strip or count methods, or compare meaningful slices.
+#  - Do *not* compare large objects using assertEqual, assertIn or similar.
+#    It's a lengthy operation and the errormessage will be utterly useless
+#    due to its size.  To make sure whether a result has the right contents,
+#    better to use the strip or count methods, or compare meaningful slices.
 #
 #  - Don't forget to test for large indices, offsets and results and such,
-#    in addition to large sizes.
+#    in addition to large sizes. Anything that probes the 32-bit boundary.
 #
 #  - When repeating an object (say, a substring, or a small list) to create
 #    a large object, make the subobject of a length that is not a power of
@@ -37,13 +62,14 @@ import functools
 # fail as well. I do not know whether it is due to memory fragmentation
 # issues, or other specifics of the platform malloc() routine.
 
-character_size = 4 if sys.maxunicode > 0xFFFF else 2
+ascii_char_size = 1
+ucs2_char_size = 2
+ucs4_char_size = 4
 
 
 class BaseStrTest:
 
-    @bigmemtest(size=_2G, memuse=2)
-    def test_capitalize(self, size):
+    def _test_capitalize(self, size):
         _ = self.from_latin1
         SUBSTR = self.from_latin1(' abc def ghi')
         s = _('-') * size + SUBSTR
@@ -92,7 +118,7 @@ class BaseStrTest:
         _ = self.from_latin1
         s = _('-') * size
         tabsize = 8
-        self.assertEqual(s.expandtabs(), s)
+        self.assertTrue(s.expandtabs() == s)
         del s
         slen, remainder = divmod(size, tabsize)
         s = _('       \t') * slen
@@ -347,7 +373,7 @@ class BaseStrTest:
     # suffer for the list size. (Otherwise, it'd cost another 48 times
     # size in bytes!) Nevertheless, a list of size takes
     # 8*size bytes.
-    @bigmemtest(size=_2G + 5, memuse=10)
+    @bigmemtest(size=_2G + 5, memuse=2 * ascii_char_size + 8)
     def test_split_large(self, size):
         _ = self.from_latin1
         s = _(' a') * size + _(' ')
@@ -366,9 +392,9 @@ class BaseStrTest:
         # take up an inordinate amount of memory
         chunksize = int(size ** 0.5 + 2) // 2
         SUBSTR = _(' ') * chunksize + _('\n') + _(' ') * chunksize + _('\r\n')
-        s = SUBSTR * chunksize
+        s = SUBSTR * (chunksize * 2)
         l = s.splitlines()
-        self.assertEqual(len(l), chunksize * 2)
+        self.assertEqual(len(l), chunksize * 4)
         expected = _(' ') * chunksize
         for item in l:
             self.assertEqual(item, expected)
@@ -394,8 +420,7 @@ class BaseStrTest:
         self.assertEqual(len(s), size)
         self.assertEqual(s.strip(), SUBSTR.strip())
 
-    @bigmemtest(size=_2G, memuse=2)
-    def test_swapcase(self, size):
+    def _test_swapcase(self, size):
         _ = self.from_latin1
         SUBSTR = _("aBcDeFG12.'\xa9\x00")
         sublen = len(SUBSTR)
@@ -406,8 +431,7 @@ class BaseStrTest:
         self.assertEqual(s[:sublen * 3], SUBSTR.swapcase() * 3)
         self.assertEqual(s[-sublen * 3:], SUBSTR.swapcase() * 3)
 
-    @bigmemtest(size=_2G, memuse=2)
-    def test_title(self, size):
+    def _test_title(self, size):
         _ = self.from_latin1
         SUBSTR = _('SpaaHAaaAaham')
         s = SUBSTR * (size // len(SUBSTR) + 2)
@@ -419,14 +443,7 @@ class BaseStrTest:
     def test_translate(self, size):
         _ = self.from_latin1
         SUBSTR = _('aZz.z.Aaz.')
-        if isinstance(SUBSTR, str):
-            trans = {
-                ord(_('.')): _('-'),
-                ord(_('a')): _('!'),
-                ord(_('Z')): _('$'),
-            }
-        else:
-            trans = bytes.maketrans(b'.aZ', b'-!$')
+        trans = bytes.maketrans(b'.aZ', b'-!$')
         sublen = len(SUBSTR)
         repeats = size // sublen + 2
         s = SUBSTR * repeats
@@ -519,19 +536,19 @@ class BaseStrTest:
         edge = _('-') * (size // 2)
         s = _('').join([edge, SUBSTR, edge])
         del edge
-        self.assertIn(SUBSTR, s)
-        self.assertNotIn(SUBSTR * 2, s)
-        self.assertIn(_('-'), s)
-        self.assertNotIn(_('a'), s)
+        self.assertTrue(SUBSTR in s)
+        self.assertFalse(SUBSTR * 2 in s)
+        self.assertTrue(_('-') in s)
+        self.assertFalse(_('a') in s)
         s += _('a')
-        self.assertIn(_('a'), s)
+        self.assertTrue(_('a') in s)
 
     @bigmemtest(size=_2G + 10, memuse=2)
     def test_compare(self, size):
         _ = self.from_latin1
         s1 = _('-') * size
         s2 = _('-') * size
-        self.assertEqual(s1, s2)
+        self.assertTrue(s1 == s2)
         del s2
         s2 = s1 + _('a')
         self.assertFalse(s1 == s2)
@@ -552,7 +569,7 @@ class BaseStrTest:
         h1 = hash(s)
         del s
         s = _('\x00') * (size + 1)
-        self.assertFalse(h1 == hash(s))
+        self.assertNotEqual(h1, hash(s))
 
 
 class StrTest(unittest.TestCase, BaseStrTest):
@@ -563,7 +580,6 @@ class StrTest(unittest.TestCase, BaseStrTest):
     def basic_encode_test(self, size, enc, c='.', expectedsize=None):
         if expectedsize is None:
             expectedsize = size
-
         try:
             s = c * size
             self.assertEqual(len(s.encode(enc)), expectedsize)
@@ -582,48 +598,64 @@ class StrTest(unittest.TestCase, BaseStrTest):
                 memuse = meth.memuse
             except AttributeError:
                 continue
-            meth.memuse = character_size * memuse
+            meth.memuse = ascii_char_size * memuse
             self._adjusted[name] = memuse
 
     def tearDown(self):
         for name, memuse in self._adjusted.items():
             getattr(type(self), name).memuse = memuse
 
-    # the utf8 encoder preallocates big time (4x the number of characters)
-    @bigmemtest(size=_2G + 2, memuse=character_size + 4)
+    @bigmemtest(size=_2G, memuse=ucs4_char_size * 3)
+    def test_capitalize(self, size):
+        self._test_capitalize(size)
+
+    @bigmemtest(size=_2G, memuse=ucs4_char_size * 3)
+    def test_title(self, size):
+        self._test_title(size)
+
+    @bigmemtest(size=_2G, memuse=ucs4_char_size * 3)
+    def test_swapcase(self, size):
+        self._test_swapcase(size)
+
+    # Many codecs convert to the legacy representation first, explaining
+    # why we add 'ucs4_char_size' to the 'memuse' below.
+
+    @bigmemtest(size=_2G + 2, memuse=ascii_char_size + 1)
     def test_encode(self, size):
         return self.basic_encode_test(size, 'utf-8')
 
-    @bigmemtest(size=_4G // 6 + 2, memuse=character_size + 1)
+    @bigmemtest(size=_4G // 6 + 2, memuse=ascii_char_size + ucs4_char_size + 1)
     def test_encode_raw_unicode_escape(self, size):
         try:
             return self.basic_encode_test(size, 'raw_unicode_escape')
         except MemoryError:
             pass # acceptable on 32-bit
 
-    @bigmemtest(size=_4G // 5 + 70, memuse=character_size + 1)
+    @bigmemtest(size=_4G // 5 + 70, memuse=ascii_char_size + ucs4_char_size + 1)
     def test_encode_utf7(self, size):
         try:
             return self.basic_encode_test(size, 'utf7')
         except MemoryError:
             pass # acceptable on 32-bit
 
-    @bigmemtest(size=_4G // 4 + 5, memuse=character_size + 4)
+    @bigmemtest(size=_4G // 4 + 5, memuse=ascii_char_size + ucs4_char_size + 4)
     def test_encode_utf32(self, size):
         try:
-            return self.basic_encode_test(size, 'utf32', expectedsize=4*size+4)
+            return self.basic_encode_test(size, 'utf32', expectedsize=4 * size + 4)
         except MemoryError:
             pass # acceptable on 32-bit
 
-    @bigmemtest(size=_2G - 1, memuse=character_size + 1)
+    @bigmemtest(size=_2G - 1, memuse=ascii_char_size + 1)
     def test_encode_ascii(self, size):
         return self.basic_encode_test(size, 'ascii', c='A')
 
-    @bigmemtest(size=_2G + 10, memuse=character_size * 2)
+    # str % (...) uses a Py_UCS4 intermediate representation
+
+    @bigmemtest(size=_2G + 10, memuse=ascii_char_size * 2 + ucs4_char_size)
     def test_format(self, size):
         s = '-' * size
         sf = '%s' % (s,)
-        self.assertEqual(s, sf)
+        self.assertTrue(s == sf)
         del sf
         sf = '..%s..' % (s,)
         self.assertEqual(len(sf), len(s) + 4)
@@ -640,7 +672,7 @@ class StrTest(unittest.TestCase, BaseStrTest):
         self.assertEqual(s.count('.'), 3)
         self.assertEqual(s.count('-'), size * 2)
 
-    @bigmemtest(size=_2G + 10, memuse=character_size * 2)
+    @bigmemtest(size=_2G + 10, memuse=ascii_char_size * 2)
     def test_repr_small(self, size):
         s = '-' * size
         s = repr(s)
@@ -661,7 +693,7 @@ class StrTest(unittest.TestCase, BaseStrTest):
         self.assertEqual(s.count('\\'), size)
         self.assertEqual(s.count('0'), size * 2)
 
-    @bigmemtest(size=_2G + 10, memuse=character_size * 5)
+    @bigmemtest(size=_2G + 10, memuse=ascii_char_size * 5)
     def test_repr_large(self, size):
         s = '\x00' * size
         s = repr(s)
@@ -671,7 +703,13 @@ class StrTest(unittest.TestCase, BaseStrTest):
         self.assertEqual(s.count('\\'), size)
         self.assertEqual(s.count('0'), size * 2)
 
-    @bigmemtest(size=_2G // 5 + 1, memuse=character_size * 7)
+    # ascii() calls encode('ascii', 'backslashreplace'), which itself
+    # creates a temporary Py_UNICODE representation in addition to the
+    # original (Py_UCS2) one
+    # There's also some overallocation when resizing the ascii() result
+    # that isn't taken into account here.
+    @bigmemtest(size=_2G // 5 + 1, memuse=ucs2_char_size +
+                                          ucs4_char_size + ascii_char_size * 6)
     def test_unicode_repr(self, size):
         # Use an assigned, but not printable code point.
         # It is in the range of the low surrogates \uDC00-\uDFFF.
@@ -686,9 +724,7 @@ class StrTest(unittest.TestCase, BaseStrTest):
         finally:
             r = s = None
 
-    # The character takes 4 bytes even in UCS-2 builds because it will
-    # be decomposed into surrogates.
-    @bigmemtest(size=_2G // 5 + 1, memuse=4 + character_size * 9)
+    @bigmemtest(size=_2G // 5 + 1, memuse=ucs4_char_size * 2 + ascii_char_size * 10)
     def test_unicode_repr_wide(self, size):
         char = "\U0001DCBA"
         s = char * size
@@ -701,39 +737,76 @@ class StrTest(unittest.TestCase, BaseStrTest):
         finally:
             r = s = None
 
-    @bigmemtest(size=_4G // 5, memuse=character_size * (6 + 1))
-    def _test_unicode_repr_overflow(self, size):
-        # XXX not sure what this test is about
-        char = "\uDCBA"
-        s = char * size
-        try:
-            r = repr(s)
-            self.assertTrue(s == eval(r))
-        finally:
-            r = s = None
+    # The original test_translate is overriden here, so as to get the
+    # correct size estimate: str.translate() uses an intermediate Py_UCS4
+    # representation.
+
+    @bigmemtest(size=_2G, memuse=ascii_char_size * 2 + ucs4_char_size)
+    def test_translate(self, size):
+        _ = self.from_latin1
+        SUBSTR = _('aZz.z.Aaz.')
+        trans = {
+            ord(_('.')): _('-'),
+            ord(_('a')): _('!'),
+            ord(_('Z')): _('$'),
+        }
+        sublen = len(SUBSTR)
+        repeats = size // sublen + 2
+        s = SUBSTR * repeats
+        s = s.translate(trans)
+        self.assertEqual(len(s), repeats * sublen)
+        self.assertEqual(s[:sublen], SUBSTR.translate(trans))
+        self.assertEqual(s[-sublen:], SUBSTR.translate(trans))
+        self.assertEqual(s.count(_('.')), 0)
+        self.assertEqual(s.count(_('!')), repeats * 2)
+        self.assertEqual(s.count(_('z')), repeats * 3)
 
 
 class BytesTest(unittest.TestCase, BaseStrTest):
 
     def from_latin1(self, s):
-        return s.encode("latin1")
+        return s.encode("latin-1")
 
-    @bigmemtest(size=_2G + 2, memuse=1 + character_size)
+    @bigmemtest(size=_2G + 2, memuse=1 + ascii_char_size)
     def test_decode(self, size):
         s = self.from_latin1('.') * size
         self.assertEqual(len(s.decode('utf-8')), size)
 
+    @bigmemtest(size=_2G, memuse=2)
+    def test_capitalize(self, size):
+        self._test_capitalize(size)
+
+    @bigmemtest(size=_2G, memuse=2)
+    def test_title(self, size):
+        self._test_title(size)
+
+    @bigmemtest(size=_2G, memuse=2)
+    def test_swapcase(self, size):
+        self._test_swapcase(size)
+
 
 class BytearrayTest(unittest.TestCase, BaseStrTest):
 
     def from_latin1(self, s):
-        return bytearray(s.encode("latin1"))
+        return bytearray(s.encode("latin-1"))
 
-    @bigmemtest(size=_2G + 2, memuse=1 + character_size)
+    @bigmemtest(size=_2G + 2, memuse=1 + ascii_char_size)
     def test_decode(self, size):
         s = self.from_latin1('.') * size
         self.assertEqual(len(s.decode('utf-8')), size)
 
+    @bigmemtest(size=_2G, memuse=2)
+    def test_capitalize(self, size):
+        self._test_capitalize(size)
+
+    @bigmemtest(size=_2G, memuse=2)
+    def test_title(self, size):
+        self._test_title(size)
+
+    @bigmemtest(size=_2G, memuse=2)
+    def test_swapcase(self, size):
+        self._test_swapcase(size)
+
     test_hash = None
     test_split_large = None
 
@@ -752,7 +825,7 @@ class TupleTest(unittest.TestCase):
     def test_compare(self, size):
         t1 = ('',) * size
         t2 = ('',) * size
-        self.assertEqual(t1, t2)
+        self.assertTrue(t1 == t2)
         del t2
         t2 = ('',) * (size + 1)
         self.assertFalse(t1 == t2)
@@ -783,9 +856,9 @@ class TupleTest(unittest.TestCase):
     def test_contains(self, size):
         t = (1, 2, 3, 4, 5) * size
         self.assertEqual(len(t), size * 5)
-        self.assertIn(5, t)
-        self.assertNotIn((1, 2, 3, 4, 5), t)
-        self.assertNotIn(0, t)
+        self.assertTrue(5 in t)
+        self.assertFalse((1, 2, 3, 4, 5) in t)
+        self.assertFalse(0 in t)
 
     @bigmemtest(size=_2G + 10, memuse=8)
     def test_hash(self, size):
@@ -869,11 +942,11 @@ class TupleTest(unittest.TestCase):
         self.assertEqual(s[-5:], '0, 0)')
         self.assertEqual(s.count('0'), size)
 
-    @bigmemtest(size=_2G // 3 + 2, memuse=8 + 3 * character_size)
+    @bigmemtest(size=_2G // 3 + 2, memuse=8 + 3 * ascii_char_size)
     def test_repr_small(self, size):
         return self.basic_test_repr(size)
 
-    @bigmemtest(size=_2G + 2, memuse=8 + 3 * character_size)
+    @bigmemtest(size=_2G + 2, memuse=8 + 3 * ascii_char_size)
     def test_repr_large(self, size):
         return self.basic_test_repr(size)
 
@@ -888,7 +961,7 @@ class ListTest(unittest.TestCase):
     def test_compare(self, size):
         l1 = [''] * size
         l2 = [''] * size
-        self.assertEqual(l1, l2)
+        self.assertTrue(l1 == l2)
         del l2
         l2 = [''] * (size + 1)
         self.assertFalse(l1 == l2)
@@ -934,9 +1007,9 @@ class ListTest(unittest.TestCase):
     def test_contains(self, size):
         l = [1, 2, 3, 4, 5] * size
         self.assertEqual(len(l), size * 5)
-        self.assertIn(5, l)
-        self.assertNotIn([1, 2, 3, 4, 5], l)
-        self.assertNotIn(0, l)
+        self.assertTrue(5 in l)
+        self.assertFalse([1, 2, 3, 4, 5] in l)
+        self.assertFalse(0 in l)
 
     @bigmemtest(size=_2G + 10, memuse=8)
     def test_hash(self, size):
@@ -1044,11 +1117,11 @@ class ListTest(unittest.TestCase):
         self.assertEqual(s[-5:], '0, 0]')
         self.assertEqual(s.count('0'), size)
 
-    @bigmemtest(size=_2G // 3 + 2, memuse=8 + 3 * character_size)
+    @bigmemtest(size=_2G // 3 + 2, memuse=8 + 3 * ascii_char_size)
     def test_repr_small(self, size):
         return self.basic_test_repr(size)
 
-    @bigmemtest(size=_2G + 2, memuse=8 + 3 * character_size)
+    @bigmemtest(size=_2G + 2, memuse=8 + 3 * ascii_char_size)
     def test_repr_large(self, size):
         return self.basic_test_repr(size)