summaryrefslogtreecommitdiff
path: root/Modules/unicodedata.c
diff options
context:
space:
mode:
authorAntoine Pitrou <solipsis@pitrou.net>2009-04-27 21:53:26 +0000
committerAntoine Pitrou <solipsis@pitrou.net>2009-04-27 21:53:26 +0000
commite988e286b2831382deb7c69b26c74ed185f51696 (patch)
tree5c6c9d5a61bb107559e469c2c8e4d41af011c94e /Modules/unicodedata.c
parent8b8f8cc1b00900b5af7d79fc56e9c2a343990319 (diff)
downloadcpython-git-e988e286b2831382deb7c69b26c74ed185f51696.tar.gz
Issue #1734234: Massively speed up `unicodedata.normalize()` when the
string is already in normalized form, by performing a quick check beforehand. Original patch by Rauli Ruohonen.
Diffstat (limited to 'Modules/unicodedata.c')
-rw-r--r--Modules/unicodedata.c63
1 files changed, 58 insertions, 5 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index ad77651a26..9c9ad5e517 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -27,6 +27,7 @@ typedef struct {
const unsigned char mirrored; /* true if mirrored in bidir mode */
const unsigned char east_asian_width; /* index into
_PyUnicode_EastAsianWidth */
+ const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
typedef struct change_record {
@@ -720,7 +721,39 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
return result;
}
-
+
+/* Return 1 if the input is certainly normalized, 0 if it might not be.
+
+   This is a conservative pre-check used by unicodedata_normalize(): a 0
+   result does not mean the string is unnormalized, only that the full
+   normalization routine still has to run.
+
+   self  -- when non-NULL the quick check is skipped and 0 is returned.
+   input -- the unicode object to scan; this function only reads it.
+   nfc   -- nonzero for the composed forms (NFC, NFKC), zero for NFD, NFKD.
+   k     -- nonzero for the compatibility forms (NFKC, NFKD).
+
+   An empty input never enters the loop and is reported as normalized. */
+static int
+is_normalized(PyObject *self, PyObject *input, int nfc, int k)
+{
+ Py_UNICODE *i, *end;
+ unsigned char prev_combining = 0, quickcheck_mask; /* prev_combining: combining class of the previous element, 0 at start */
+
+ /* An older version of the database is requested, quickchecks must be
+ disabled. */
+ if (self != NULL)
+ return 0;
+
+ /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
+ as described in http://unicode.org/reports/tr15/#Annex8.
+ The shift selects one two-bit field per form: 0 for NFD, 2 for NFKD,
+ 4 for NFC and 6 for NFKC. */
+ quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
+
+ i = PyUnicode_AS_UNICODE(input);
+ end = i + PyUnicode_GET_SIZE(input);
+ while (i < end) {
+ const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++); /* one lookup per Py_UNICODE element; also advances i */
+ unsigned char combining = record->combining;
+ unsigned char quickcheck = record->normalization_quick_check;
+
+ if (quickcheck & quickcheck_mask) /* any bit set in the selected field: Maybe or No */
+ return 0; /* this string might need normalization */
+ if (combining && prev_combining > combining) /* a zero combining class never counts as out of order */
+ return 0; /* non-canonical sort order, not normalized */
+ prev_combining = combining;
+ }
+ return 1; /* certainly normalized */
+}
+
PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
@@ -744,14 +777,34 @@ unicodedata_normalize(PyObject *self, PyObject *args)
return input;
}
- if (strcmp(form, "NFC") == 0)
+ if (strcmp(form, "NFC") == 0) {
+ if (is_normalized(self, input, 1, 0)) {
+ Py_INCREF(input);
+ return input;
+ }
return nfc_nfkc(self, input, 0);
- if (strcmp(form, "NFKC") == 0)
+ }
+ if (strcmp(form, "NFKC") == 0) {
+ if (is_normalized(self, input, 1, 1)) {
+ Py_INCREF(input);
+ return input;
+ }
return nfc_nfkc(self, input, 1);
- if (strcmp(form, "NFD") == 0)
+ }
+ if (strcmp(form, "NFD") == 0) {
+ if (is_normalized(self, input, 0, 0)) {
+ Py_INCREF(input);
+ return input;
+ }
return nfd_nfkd(self, input, 0);
- if (strcmp(form, "NFKD") == 0)
+ }
+ if (strcmp(form, "NFKD") == 0) {
+ if (is_normalized(self, input, 0, 1)) {
+ Py_INCREF(input);
+ return input;
+ }
return nfd_nfkd(self, input, 1);
+ }
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
return NULL;
}