diff options
author | Malcolm Tredinnick <malcolm.tredinnick@gmail.com> | 2007-07-04 09:28:29 +0000 |
---|---|---|
committer | Malcolm Tredinnick <malcolm.tredinnick@gmail.com> | 2007-07-04 09:28:29 +0000 |
commit | 5664a678b29ab04cad425c15b2792f4519f43928 (patch) | |
tree | 7e8ef34cb859f3deb73872abc20aa50499dde4df | |
parent | 216fae2352871e04e8edd44045c63bf5379ab229 (diff) | |
download | django-archive/attic/unicode.tar.gz |
unicode: Added unicode-aware slugify filter (in Python) and better non-ASCIIarchive/attic/unicodeattic/unicode
handling for the Javascript slug creator in admin. Can never be perfect here,
but this is more tolerant in many cases. Fixed #4365. Thanks, Bill de hÓra,
Baptiste, orestis@orestis.gr, Ahmet and Jonas for contributions to this.
git-svn-id: http://code.djangoproject.com/svn/django/branches/unicode@5608 bcc190cf-cafb-0310-a4f2-bffc1f526a37
-rw-r--r-- | django/contrib/admin/media/js/urlify.js | 95 | ||||
-rw-r--r-- | django/template/defaultfilters.py | 11 | ||||
-rw-r--r-- | tests/regressiontests/defaultfilters/tests.py | 3 |
3 files changed, 105 insertions, 4 deletions
diff --git a/django/contrib/admin/media/js/urlify.js b/django/contrib/admin/media/js/urlify.js index 9b87113628..cd382751e2 100644 --- a/django/contrib/admin/media/js/urlify.js +++ b/django/contrib/admin/media/js/urlify.js @@ -1,15 +1,110 @@ +var LATIN_MAP = +{ + 'À': 'A', 'Á': 'A', 'Â': 'A', 'Ã': 'A', 'Ä': 'A', 'Å': 'A', 'Æ': 'AE', 'Ç': + 'C', 'È': 'E', 'É': 'E', 'Ê': 'E', 'Ë': 'E', 'Ì': 'I', 'Í': 'I', 'Î': 'I', + 'Ï': 'I', 'Ð': 'D', 'Ñ': 'N', 'Ò': 'O', 'Ó': 'O', 'Ô': 'O', 'Õ': 'O', 'Ö': + 'O', 'Ø': 'O', 'Ù': 'U', 'Ú': 'U', 'Û': 'U', 'Ü': 'U', 'Ý': 'Y', 'Þ': 'TH', + 'ß': 'ss', 'à':'a', 'á':'a', 'â': 'a', 'ã': 'a', 'ä': 'a', 'å': 'a', 'æ': + 'ae', 'ç': 'c', 'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e', 'ì': 'i', 'í': 'i', + 'î': 'i', 'ï': 'i', 'ð': 'o', 'ñ': 'n', 'ò': 'o', 'ó': 'o', 'ô': 'o', 'õ': + 'o', 'ö': 'o', 'ø': 'o', 'ù': 'u', 'ú': 'u', 'û': 'u', 'ü': 'u', 'ý': 'y', + 'þ': 'th', 'ÿ': 'y', +} +var LATIN_SYMBOLS_MAP = +{ + '©':'(c)', +} +var GREEK_MAP = +{ + 'α':'a', 'β':'b', 'γ':'g', 'δ':'d', 'ε':'e', 'ζ':'z', 'η':'h', 'θ':'8', + 'ι':'i', 'κ':'k', 'λ':'l', 'μ':'m', 'ν':'n', 'ξ':'3', 'ο':'o', 'π':'p', + 'ρ':'r', 'σ':'s', 'τ':'t', 'υ':'y', 'φ':'f', 'χ':'x', 'ψ':'ps', 'ω':'w', + 'ά':'a', 'έ':'e', 'ί':'i', 'ό':'o', 'ύ':'y', 'ή':'h', 'ώ':'w', 'ς':'s', + 'ϊ':'i', 'ΰ':'y', 'ϋ':'y', 'ΐ':'i', + 'Α':'A', 'Β':'B', 'Γ':'G', 'Δ':'D', 'Ε':'E', 'Ζ':'Z', 'Η':'H', 'Θ':'8', + 'Ι':'I', 'Κ':'K', 'Λ':'L', 'Μ':'M', 'Ν':'N', 'Ξ':'3', 'Ο':'O', 'Π':'P', + 'Ρ':'R', 'Σ':'S', 'Τ':'T', 'Υ':'Y', 'Φ':'F', 'Χ':'X', 'Ψ':'PS', 'Ω':'W', + 'Ά':'A', 'Έ':'E', 'Ί':'I', 'Ό':'O', 'Ύ':'Y', 'Ή':'H', 'Ώ':'W', 'Ϊ':'I', + 'Ϋ':'Y' +} +var TURKISH_MAP = { + 'ş':'s', 'Ş':'S', 'ı':'i', 'İ':'I', 'ç':'c', 'Ç':'C', 'ü':'u', 'Ü':'U', + 'ö':'o', 'Ö':'O', 'ğ':'g', 'Ğ':'G', +} +// var RUSSIAN_MAP = +// { +// } + +var ALL_DOWNCODE_MAPS=new Array() +ALL_DOWNCODE_MAPS[0]=LATIN_MAP +ALL_DOWNCODE_MAPS[1]=LATIN_SYMBOLS_MAP +ALL_DOWNCODE_MAPS[2]=GREEK_MAP +ALL_DOWNCODE_MAPS[3]=TURKISH_MAP +//ALL_DOWNCODE_MAPS[4]=RUSSIAN_MAP + +var Downcoder = new Object(); +Downcoder.Initialize = function() +{ + if (Downcoder.map) // already made + return ; + Downcoder.map ={} + Downcoder.chars = '' ; + for(var i in ALL_DOWNCODE_MAPS) + { + var lookup = ALL_DOWNCODE_MAPS[i] + for (var c in lookup) + { + Downcoder.map[c] = lookup[c] ; + Downcoder.chars += c ; + } + } + Downcoder.regex = new RegExp('[' + Downcoder.chars + ']|[^' + Downcoder.chars + ']+','g') ; +} + +downcode= function( slug ) +{ + Downcoder.Initialize() ; + var downcoded ="" + var pieces = slug.match(Downcoder.regex); + if(pieces) + { + for (var i = 0 ; i < pieces.length ; i++) + { + if (pieces[i].length == 1) + { + var mapped = Downcoder.map[pieces[i]] ; + if (mapped != null) + { + downcoded+=mapped; + continue ; + } + } + downcoded+=pieces[i]; + } + } + else + { + downcoded = slug; + } + return downcoded; +} + + function URLify(s, num_chars) { // changes, e.g., "Petty theft" to "petty_theft" // remove all these words from the string before urlifying + s = downcode(s); removelist = ["a", "an", "as", "at", "before", "but", "by", "for", "from", "is", "in", "into", "like", "of", "off", "on", "onto", "per", "since", "than", "the", "this", "that", "to", "up", "via", "with"]; r = new RegExp('\\b(' + removelist.join('|') + ')\\b', 'gi'); s = s.replace(r, ''); + // if downcode doesn't hit, the char will be stripped here s = s.replace(/[^-\w\s]/g, ''); // remove unneeded chars s = s.replace(/^\s+|\s+$/g, ''); // trim leading/trailing spaces s = s.replace(/[-\s]+/g, '-'); // convert spaces to hyphens s = s.toLowerCase(); // convert to lowercase return s.substring(0, num_chars);// trim to first num_chars chars } + diff --git a/django/template/defaultfilters.py b/django/template/defaultfilters.py index 6effac184f..6aadbf2381 100644 --- a/django/template/defaultfilters.py +++ b/django/template/defaultfilters.py @@ -115,10 +115,13 @@ def make_list(value): make_list = stringfilter(make_list) def slugify(value): - "Converts to lowercase, removes non-alpha chars and converts spaces to hyphens" - # Don't compile patterns as unicode because \w then would mean any letter. - # Slugify is effectively a conversion to ASCII. - value = re.sub('[^\w\s-]', '', value).strip().lower() + """ + Normalizes string, converts to lowercase, removes non-alpha chars and + converts spaces to hyphens. + """ + import unicodedata + value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore') + value = unicode(re.sub('[^\w\s-]', '', value).strip().lower()) return re.sub('[-\s]+', '-', value) slugify = stringfilter(slugify) diff --git a/tests/regressiontests/defaultfilters/tests.py b/tests/regressiontests/defaultfilters/tests.py index caa58e1257..a1efae66f6 100644 --- a/tests/regressiontests/defaultfilters/tests.py +++ b/tests/regressiontests/defaultfilters/tests.py @@ -67,6 +67,9 @@ u'\xeb' >>> slugify(' Jack & Jill like numbers 1,2,3 and 4 and silly characters ?%.$!/') u'jack-jill-like-numbers-123-and-4-and-silly-characters' +>>> slugify(u"Un \xe9l\xe9phant \xe0 l'or\xe9e du bois") +u'un-elephant-a-loree-du-bois' + >>> stringformat(1, u'03d') u'001' |