diff options
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 246 |
1 file changed, 128 insertions, 118 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 9058018201..ad32a062d4 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -41,7 +41,9 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #define PY_SSIZE_T_CLEAN #include "Python.h" #include "pycore_abstract.h" // _PyIndex_Check() +#include "pycore_atomic_funcs.h" // _Py_atomic_size_get() #include "pycore_bytes_methods.h" // _Py_bytes_lower() +#include "pycore_format.h" // F_LJUST #include "pycore_initconfig.h" // _PyStatus_OK() #include "pycore_interp.h" // PyInterpreterState.fs_codec #include "pycore_object.h" // _PyObject_GC_TRACK() @@ -204,22 +206,6 @@ extern "C" { # define OVERALLOCATE_FACTOR 4 #endif -/* bpo-40521: Interned strings are shared by all interpreters. */ -#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS -# define INTERNED_STRINGS -#endif - -/* This dictionary holds all interned unicode strings. Note that references - to strings in this dictionary are *not* counted in the string's ob_refcnt. - When the interned string reaches a refcnt of 0 the string deallocation - function will delete the reference from this dictionary. - - Another way to look at this is that to say that the actual reference - count of a string is: s->ob_refcnt + (s->state ? 2 : 0) -*/ -#ifdef INTERNED_STRINGS -static PyObject *interned = NULL; -#endif static struct _Py_unicode_state* get_unicode_state(void) @@ -301,9 +287,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, _Py_error_handler error_handler, const char *errors, Py_ssize_t *consumed); -/* List of static strings. 
*/ -static _Py_Identifier *static_strings = NULL; - /* Fast detection of the most frequent whitespace characters */ const unsigned char _Py_ascii_whitespace[] = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -839,7 +822,11 @@ xmlcharrefreplace(_PyBytesWriter *writer, char *str, /* generate replacement */ for (i = collstart; i < collend; ++i) { - str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); + size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); + if (size < 0) { + return NULL; + } + str += size; } return str; } @@ -1057,7 +1044,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length) new_size = (struct_size + (length + 1) * char_size); if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { - PyObject_DEL(_PyUnicode_UTF8(unicode)); + PyObject_Free(_PyUnicode_UTF8(unicode)); _PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; } @@ -1068,7 +1055,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length) _Py_ForgetReference(unicode); #endif - new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size); + new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size); if (new_unicode == NULL) { _Py_NewReference(unicode); PyErr_NoMemory(); @@ -1084,7 +1071,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length) _PyUnicode_WSTR_LENGTH(unicode) = length; } else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { - PyObject_DEL(_PyUnicode_WSTR(unicode)); + PyObject_Free(_PyUnicode_WSTR(unicode)); _PyUnicode_WSTR(unicode) = NULL; if (!PyUnicode_IS_ASCII(unicode)) _PyUnicode_WSTR_LENGTH(unicode) = 0; @@ -1127,12 +1114,12 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) { - PyObject_DEL(_PyUnicode_UTF8(unicode)); + PyObject_Free(_PyUnicode_UTF8(unicode)); _PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; } - data = (PyObject *)PyObject_REALLOC(data, new_size); + data = (PyObject *)PyObject_Realloc(data, new_size); if (data == NULL) { PyErr_NoMemory(); return -1; @@ -1165,7 +1152,7 @@ 
resize_inplace(PyObject *unicode, Py_ssize_t length) } new_size = sizeof(wchar_t) * (length + 1); wstr = _PyUnicode_WSTR(unicode); - wstr = PyObject_REALLOC(wstr, new_size); + wstr = PyObject_Realloc(wstr, new_size); if (!wstr) { PyErr_NoMemory(); return -1; @@ -1255,7 +1242,7 @@ _PyUnicode_New(Py_ssize_t length) _PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; - _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); + _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size); if (!_PyUnicode_WSTR(unicode)) { Py_DECREF(unicode); PyErr_NoMemory(); @@ -1452,7 +1439,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) * PyObject_New() so we are able to allocate space for the object and * it's data buffer. */ - obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); + obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size); if (obj == NULL) { return PyErr_NoMemory(); } @@ -1834,7 +1821,7 @@ _PyUnicode_Ready(PyObject *unicode) return -1; if (maxchar < 256) { - _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); + _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1); if (!_PyUnicode_DATA_ANY(unicode)) { PyErr_NoMemory(); return -1; @@ -1855,7 +1842,7 @@ _PyUnicode_Ready(PyObject *unicode) _PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; } - PyObject_FREE(_PyUnicode_WSTR(unicode)); + PyObject_Free(_PyUnicode_WSTR(unicode)); _PyUnicode_WSTR(unicode) = NULL; _PyUnicode_WSTR_LENGTH(unicode) = 0; } @@ -1875,7 +1862,7 @@ _PyUnicode_Ready(PyObject *unicode) _PyUnicode_UTF8_LENGTH(unicode) = 0; #else /* sizeof(wchar_t) == 4 */ - _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( + _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc( 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); if (!_PyUnicode_DATA_ANY(unicode)) { PyErr_NoMemory(); @@ -1889,12 +1876,12 @@ _PyUnicode_Ready(PyObject *unicode) _PyUnicode_STATE(unicode).kind = 
PyUnicode_2BYTE_KIND; _PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; - PyObject_FREE(_PyUnicode_WSTR(unicode)); + PyObject_Free(_PyUnicode_WSTR(unicode)); _PyUnicode_WSTR(unicode) = NULL; _PyUnicode_WSTR_LENGTH(unicode) = 0; #endif } - /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ + /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */ else { #if SIZEOF_WCHAR_T == 2 /* in case the native representation is 2-bytes, we need to allocate a @@ -1904,7 +1891,7 @@ _PyUnicode_Ready(PyObject *unicode) PyErr_NoMemory(); return -1; } - _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); + _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1)); if (!_PyUnicode_DATA_ANY(unicode)) { PyErr_NoMemory(); return -1; @@ -1916,7 +1903,7 @@ _PyUnicode_Ready(PyObject *unicode) /* unicode_convert_wchar_to_ucs4() requires a ready string */ _PyUnicode_STATE(unicode).ready = 1; unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); - PyObject_FREE(_PyUnicode_WSTR(unicode)); + PyObject_Free(_PyUnicode_WSTR(unicode)); _PyUnicode_WSTR(unicode) = NULL; _PyUnicode_WSTR_LENGTH(unicode) = 0; #else @@ -1943,7 +1930,8 @@ unicode_dealloc(PyObject *unicode) break; case SSTATE_INTERNED_MORTAL: -#ifdef INTERNED_STRINGS + { + struct _Py_unicode_state *state = get_unicode_state(); /* Revive the dead object temporarily. PyDict_DelItem() removes two references (key and value) which were ignored by PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2 @@ -1951,14 +1939,14 @@ unicode_dealloc(PyObject *unicode) PyDict_DelItem(). 
*/ assert(Py_REFCNT(unicode) == 0); Py_SET_REFCNT(unicode, 3); - if (PyDict_DelItem(interned, unicode) != 0) { + if (PyDict_DelItem(state->interned, unicode) != 0) { _PyErr_WriteUnraisableMsg("deletion of interned string failed", NULL); } assert(Py_REFCNT(unicode) == 1); Py_SET_REFCNT(unicode, 0); -#endif break; + } case SSTATE_INTERNED_IMMORTAL: _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died"); @@ -1969,13 +1957,13 @@ unicode_dealloc(PyObject *unicode) } if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { - PyObject_DEL(_PyUnicode_WSTR(unicode)); + PyObject_Free(_PyUnicode_WSTR(unicode)); } if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { - PyObject_DEL(_PyUnicode_UTF8(unicode)); + PyObject_Free(_PyUnicode_UTF8(unicode)); } if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) { - PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); + PyObject_Free(_PyUnicode_DATA_ANY(unicode)); } Py_TYPE(unicode)->tp_free(unicode); @@ -2307,42 +2295,84 @@ PyUnicode_FromString(const char *u) return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); } + PyObject * _PyUnicode_FromId(_Py_Identifier *id) { - if (id->object) { - return id->object; + PyInterpreterState *interp = _PyInterpreterState_GET(); + struct _Py_unicode_ids *ids = &interp->unicode.ids; + + int index = _Py_atomic_size_get(&id->index); + if (index < 0) { + struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids; + + PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK); + // Check again to detect concurrent access. Another thread can have + // initialized the index while this thread waited for the lock. 
+ index = _Py_atomic_size_get(&id->index); + if (index < 0) { + assert(rt_ids->next_index < PY_SSIZE_T_MAX); + index = rt_ids->next_index; + rt_ids->next_index++; + _Py_atomic_size_set(&id->index, index); + } + PyThread_release_lock(rt_ids->lock); } + assert(index >= 0); PyObject *obj; - obj = PyUnicode_DecodeUTF8Stateful(id->string, - strlen(id->string), + if (index < ids->size) { + obj = ids->array[index]; + if (obj) { + // Return a borrowed reference + return obj; + } + } + + obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string), NULL, NULL); if (!obj) { return NULL; } PyUnicode_InternInPlace(&obj); - assert(!id->next); - id->object = obj; - id->next = static_strings; - static_strings = id; - return id->object; + if (index >= ids->size) { + // Overallocate to reduce the number of realloc + Py_ssize_t new_size = Py_MAX(index * 2, 16); + Py_ssize_t item_size = sizeof(ids->array[0]); + PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size); + if (new_array == NULL) { + PyErr_NoMemory(); + return NULL; + } + memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size); + ids->array = new_array; + ids->size = new_size; + } + + // The array stores a strong reference + ids->array[index] = obj; + + // Return a borrowed reference + return obj; } + static void -unicode_clear_static_strings(void) +unicode_clear_identifiers(struct _Py_unicode_state *state) { - _Py_Identifier *tmp, *s = static_strings; - while (s) { - Py_CLEAR(s->object); - tmp = s->next; - s->next = NULL; - s = tmp; + struct _Py_unicode_ids *ids = &state->ids; + for (Py_ssize_t i=0; i < ids->size; i++) { + Py_XDECREF(ids->array[i]); } - static_strings = NULL; + ids->size = 0; + PyMem_Free(ids->array); + ids->array = NULL; + // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid + // after Py_Finalize(). 
} + /* Internal function, doesn't check maximum character */ PyObject* @@ -3294,7 +3324,7 @@ PyUnicode_AsWideCharString(PyObject *unicode, *size = buflen; } else if (wcslen(buffer) != (size_t)buflen) { - PyMem_FREE(buffer); + PyMem_Free(buffer); PyErr_SetString(PyExc_ValueError, "embedded null character"); return NULL; @@ -4195,7 +4225,7 @@ PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) PyErr_NoMemory(); return NULL; } - w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1)); + w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1)); if (w == NULL) { PyErr_NoMemory(); return NULL; @@ -5623,7 +5653,7 @@ unicode_fill_utf8(PyObject *unicode) PyBytes_AS_STRING(writer.buffer); Py_ssize_t len = end - start; - char *cache = PyObject_MALLOC(len + 1); + char *cache = PyObject_Malloc(len + 1); if (cache == NULL) { _PyBytesWriter_Dealloc(&writer); PyErr_NoMemory(); @@ -8540,7 +8570,7 @@ PyUnicode_BuildEncodingMap(PyObject* string) } /* Create a three-level trie */ - result = PyObject_MALLOC(sizeof(struct encoding_map) + + result = PyObject_Malloc(sizeof(struct encoding_map) + 16*count2 + 128*count3 - 1); if (!result) { return PyErr_NoMemory(); @@ -10207,7 +10237,7 @@ case_operation(PyObject *self, PyErr_SetString(PyExc_OverflowError, "string is too long"); return NULL; } - tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); + tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length); if (tmp == NULL) return PyErr_NoMemory(); newlength = perform(kind, data, length, tmp, &maxchar); @@ -10231,7 +10261,7 @@ case_operation(PyObject *self, Py_UNREACHABLE(); } leave: - PyMem_FREE(tmp); + PyMem_Free(tmp); return res; } @@ -11046,11 +11076,11 @@ replace(PyObject *self, PyObject *str1, assert(release1 == (buf1 != PyUnicode_DATA(str1))); assert(release2 == (buf2 != PyUnicode_DATA(str2))); if (srelease) - PyMem_FREE((void *)sbuf); + PyMem_Free((void *)sbuf); if (release1) - PyMem_FREE((void *)buf1); + PyMem_Free((void *)buf1); if (release2) - PyMem_FREE((void *)buf2); 
+ PyMem_Free((void *)buf2); assert(_PyUnicode_CheckConsistency(u, 1)); return u; @@ -11060,11 +11090,11 @@ replace(PyObject *self, PyObject *str1, assert(release1 == (buf1 != PyUnicode_DATA(str1))); assert(release2 == (buf2 != PyUnicode_DATA(str2))); if (srelease) - PyMem_FREE((void *)sbuf); + PyMem_Free((void *)sbuf); if (release1) - PyMem_FREE((void *)buf1); + PyMem_Free((void *)buf1); if (release2) - PyMem_FREE((void *)buf2); + PyMem_Free((void *)buf2); return unicode_result_unchanged(self); error: @@ -11072,11 +11102,11 @@ replace(PyObject *self, PyObject *str1, assert(release1 == (buf1 != PyUnicode_DATA(str1))); assert(release2 == (buf2 != PyUnicode_DATA(str2))); if (srelease) - PyMem_FREE((void *)sbuf); + PyMem_Free((void *)sbuf); if (release1) - PyMem_FREE((void *)buf1); + PyMem_Free((void *)buf1); if (release2) - PyMem_FREE((void *)buf2); + PyMem_Free((void *)buf2); return NULL; } @@ -11490,12 +11520,11 @@ _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right) if (PyUnicode_CHECK_INTERNED(left)) return 0; -#ifdef INTERNED_STRINGS assert(_PyUnicode_HASH(right_uni) != -1); Py_hash_t hash = _PyUnicode_HASH(left); - if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) + if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) { return 0; -#endif + } return unicode_compare_eq(left, right_uni); } @@ -15563,7 +15592,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode) PyErr_NoMemory(); goto onError; } - data = PyObject_MALLOC((length + 1) * char_size); + data = PyObject_Malloc((length + 1) * char_size); if (data == NULL) { PyErr_NoMemory(); goto onError; @@ -15719,23 +15748,21 @@ PyUnicode_InternInPlace(PyObject **p) return; } -#ifdef INTERNED_STRINGS if (PyUnicode_READY(s) == -1) { PyErr_Clear(); return; } - if (interned == NULL) { - interned = PyDict_New(); - if (interned == NULL) { + struct _Py_unicode_state *state = get_unicode_state(); + if (state->interned == NULL) { + state->interned = PyDict_New(); + if (state->interned == NULL) { 
PyErr_Clear(); /* Don't leave an exception */ return; } } - PyObject *t; - t = PyDict_SetDefault(interned, s, s); - + PyObject *t = PyDict_SetDefault(state->interned, s, s); if (t == NULL) { PyErr_Clear(); return; @@ -15752,13 +15779,9 @@ PyUnicode_InternInPlace(PyObject **p) this. */ Py_SET_REFCNT(s, Py_REFCNT(s) - 2); _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; -#else - // PyDict expects that interned strings have their hash - // (PyASCIIObject.hash) already computed. - (void)unicode_hash(s); -#endif } + void PyUnicode_InternImmortal(PyObject **p) { @@ -15792,35 +15815,25 @@ PyUnicode_InternFromString(const char *cp) void _PyUnicode_ClearInterned(PyThreadState *tstate) { - if (!_Py_IsMainInterpreter(tstate)) { - // interned dict is shared by all interpreters - return; - } - - if (interned == NULL) { - return; - } - assert(PyDict_CheckExact(interned)); - - PyObject *keys = PyDict_Keys(interned); - if (keys == NULL) { - PyErr_Clear(); + struct _Py_unicode_state *state = &tstate->interp->unicode; + if (state->interned == NULL) { return; } - assert(PyList_CheckExact(keys)); + assert(PyDict_CheckExact(state->interned)); /* Interned unicode strings are not forcibly deallocated; rather, we give them their stolen references back, and then clear and DECREF the interned dict. 
*/ - Py_ssize_t n = PyList_GET_SIZE(keys); #ifdef INTERNED_STATS - fprintf(stderr, "releasing %zd interned strings\n", n); + fprintf(stderr, "releasing %zd interned strings\n", + PyDict_GET_SIZE(state->interned)); Py_ssize_t immortal_size = 0, mortal_size = 0; #endif - for (Py_ssize_t i = 0; i < n; i++) { - PyObject *s = PyList_GET_ITEM(keys, i); + Py_ssize_t pos = 0; + PyObject *s, *ignored_value; + while (PyDict_Next(state->interned, &pos, &s, &ignored_value)) { assert(PyUnicode_IS_READY(s)); switch (PyUnicode_CHECK_INTERNED(s)) { @@ -15850,10 +15863,9 @@ _PyUnicode_ClearInterned(PyThreadState *tstate) "total size of all interned strings: %zd/%zd mortal/immortal\n", mortal_size, immortal_size); #endif - Py_DECREF(keys); - PyDict_Clear(interned); - Py_CLEAR(interned); + PyDict_Clear(state->interned); + Py_CLEAR(state->interned); } @@ -16223,21 +16235,19 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void) void _PyUnicode_Fini(PyThreadState *tstate) { + struct _Py_unicode_state *state = &tstate->interp->unicode; + // _PyUnicode_ClearInterned() must be called before + assert(state->interned == NULL); - struct _Py_unicode_state *state = &tstate->interp->unicode; + _PyUnicode_FiniEncodings(&state->fs_codec); - Py_CLEAR(state->empty_string); + unicode_clear_identifiers(state); for (Py_ssize_t i = 0; i < 256; i++) { Py_CLEAR(state->latin1[i]); } - - if (_Py_IsMainInterpreter(tstate)) { - unicode_clear_static_strings(); - } - - _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec); + Py_CLEAR(state->empty_string); } |