diff options
-rw-r--r-- | test/unittest_textutils.py | 12 | ||||
-rw-r--r-- | textutils.py | 20 |
2 files changed, 25 insertions, 7 deletions
diff --git a/test/unittest_textutils.py b/test/unittest_textutils.py
index 75b9cbb..d72a4a1 100644
--- a/test/unittest_textutils.py
+++ b/test/unittest_textutils.py
@@ -228,7 +228,7 @@ class ColorizeAnsiTC(TestCase):
 
 
 class UnormalizeTC(TestCase):
-    def test_unormalize(self):
+    def test_unormalize_no_substitute(self):
         data = [(u'\u0153nologie', u'oenologie'),
                 (u'\u0152nologie', u'OEnologie'),
                 (u'l\xf8to', u'loto'),
@@ -236,11 +236,19 @@ class UnormalizeTC(TestCase):
                 (u'àèùéïîôêç', u'aeueiioec'),
                 (u'ÀÈÙÉÏÎÔÊÇ', u'AEUEIIOEC'),
                 (u'\xa0', u' '), # NO-BREAK SPACE managed by NFKD decomposition
+                (u'\u0154', u'R'),
                 ]
         for input, output in data:
             yield self.assertEqual, tu.unormalize(input), output
-        self.assertRaises(ValueError, tu.unormalize, u"non ascii char is \u0154",
+
+    def test_unormalize_substitute(self):
+        self.assertEqual(tu.unormalize(u'ab \u8000 cd', substitute='_'),
+                         'ab _ cd')
+
+    def test_unormalize_backward_compat(self):
+        self.assertRaises(ValueError, tu.unormalize, u"\u8000",
                           ignorenonascii=False)
+        self.assertEqual(tu.unormalize(u"\u8000", ignorenonascii=True), u'')
 
 
 class ModuleDocTest(DocTest):
diff --git a/textutils.py b/textutils.py
index 4e98e93..64d70d5 100644
--- a/textutils.py
+++ b/textutils.py
@@ -46,6 +46,7 @@ __docformat__ = "restructuredtext en"
 import sys
 import re
 import os.path as osp
+from warnings import warn
 from unicodedata import normalize as _uninormalize
 try:
     from os import linesep
@@ -71,7 +72,7 @@ MANUAL_UNICODE_MAP = {
     u'\xdf': u'ss', # LATIN SMALL LETTER SHARP S
 }
 
-def unormalize(ustring, ignorenonascii=False):
+def unormalize(ustring, ignorenonascii=None, substitute=None):
     """replace diacritical characters with their corresponding ascii characters
 
     Convert the unicode string to its long normalized form (unicode character
@@ -79,19 +80,28 @@ def unormalize(ustring, ignorenonascii=False):
     The normal form KD (NFKD) will apply the compatibility decomposition, i.e.
     replace all compatibility characters with their equivalents.
 
+    :type substitute: str
+    :param substitute: replacement character to use if decomposition fails
+
     :see: Another project about ASCII transliterations of Unicode text
           http://pypi.python.org/pypi/Unidecode
     """
+    # backward compatibility, ignorenonascii was a boolean
+    if ignorenonascii is not None:
+        warn("ignorenonascii is deprecated, use substitute named parameter instead",
+             DeprecationWarning, stacklevel=2)
+        if ignorenonascii:
+            substitute = ''
     res = []
     for letter in ustring[:]:
         try:
             replacement = MANUAL_UNICODE_MAP[letter]
         except KeyError:
-            if ord(letter) >= 2**8:
-                if ignorenonascii:
-                    continue
-                raise ValueError("can't deal with non-ascii based characters")
             replacement = _uninormalize('NFKD', letter)[0]
+            if ord(replacement) >= 2 ** 7:
+                if substitute is None:
+                    raise ValueError("can't deal with non-ascii based characters")
+                replacement = substitute
         res.append(replacement)
     return u''.join(res)
 