diff options
author | eikeon <devnull@localhost> | 2010-02-13 01:27:55 +0000 |
---|---|---|
committer | eikeon <devnull@localhost> | 2010-02-13 01:27:55 +0000 |
commit | 02b60fa0002fd2efd8a245265f0a8fce1122affd (patch) | |
tree | a993977327f23bd83c8e2eeb495b5df8bfd135eb /rdflib/namespace.py | |
parent | d8166002dd19346ffffae7cc2194c4c197c9f1b5 (diff) | |
download | rdflib-02b60fa0002fd2efd8a245265f0a8fce1122affd.tar.gz |
Update issue 120
Diffstat (limited to 'rdflib/namespace.py')
-rw-r--r-- | rdflib/namespace.py | 84 |
1 files changed, 83 insertions, 1 deletions
diff --git a/rdflib/namespace.py b/rdflib/namespace.py index 8ee0bee6..148f46e9 100644 --- a/rdflib/namespace.py +++ b/rdflib/namespace.py @@ -10,7 +10,6 @@ from urlparse import urljoin, urldefrag from urllib import pathname2url from rdflib.term import URIRef, Variable, _XSD_PFX -from rdflib.syntax.xml_names import split_uri class Namespace(URIRef): @@ -241,3 +240,86 @@ class NamespaceManager(object): if uri and uri[-1]=="#" and result[-1]!="#": result = "%s#" % result return URIRef(result) + +# From: http://www.w3.org/TR/REC-xml#NT-CombiningChar +# +# * Name start characters must have one of the categories Ll, Lu, Lo, +# Lt, Nl. +# +# * Name characters other than Name-start characters must have one of +# the categories Mc, Me, Mn, Lm, or Nd. +# +# * Characters in the compatibility area (i.e. with character code +# greater than #xF900 and less than #xFFFE) are not allowed in XML +# names. +# +# * Characters which have a font or compatibility decomposition +# (i.e. those with a "compatibility formatting tag" in field 5 of the +# database -- marked by field 5 beginning with a "<") are not allowed. +# +# * The following characters are treated as name-start characters rather +# than name characters, because the property file classifies them as +# Alphabetic: [#x02BB-#x02C1], #x0559, #x06E5, #x06E6. +# +# * Characters #x20DD-#x20E0 are excluded (in accordance with Unicode +# 2.0, section 5.14). +# +# * Character #x00B7 is classified as an extender, because the property +# list so identifies it. +# +# * Character #x0387 is added as a name character, because #x00B7 is its +# canonical equivalent. +# +# * Characters ':' and '_' are allowed as name-start characters. +# +# * Characters '-' and '.' are allowed as name characters. 
from unicodedata import category, decomposition

# Unicode general categories permitted for the first character of an XML
# name, and for any subsequent character of an XML name.
NAME_START_CATEGORIES = ["Ll", "Lu", "Lo", "Lt", "Nl"]
NAME_CATEGORIES = NAME_START_CATEGORIES + ["Mc", "Me", "Mn", "Lm", "Nd"]
# Individual characters accepted inside a name regardless of their category.
ALLOWED_NAME_CHARS = [u"\u00B7", u"\u0387", u"-", u".", u"_"]

# http://www.w3.org/TR/REC-xml-names/#NT-NCName
#  [4] NCName ::= (Letter | '_') (NCNameChar)* /* An XML Name, minus
#      the ":" */
#  [5] NCNameChar ::= Letter | Digit | '.' | '-' | '_' | CombiningChar
#      | Extender


def is_ncname(name):
    """Return 1 if *name* looks like an XML NCName, otherwise 0.

    The first character must be ``_`` or belong to one of
    NAME_START_CATEGORIES; every following character must belong to one
    of NAME_CATEGORIES or be listed in ALLOWED_NAME_CHARS.  An empty
    *name* is not an NCName and yields 0.

    The result is the integer 1 or 0 (not a bool), as callers have
    always received.
    """
    if not name:
        # An NCName needs at least one character; indexing name[0] on an
        # empty string would raise IndexError instead of answering "no".
        return 0
    first = name[0]
    if first != "_" and category(first) not in NAME_START_CATEGORIES:
        return 0
    for c in name[1:]:
        if category(c) not in NAME_CATEGORIES and c not in ALLOWED_NAME_CHARS:
            return 0
    # NOTE: characters in the compatibility area and characters with a
    # compatibility decomposition are not rejected by this check.
    return 1


XMLNS = "http://www.w3.org/XML/1998/namespace"


def split_uri(uri):
    """Split *uri* into a ``(namespace, local_name)`` tuple.

    A URI that starts with XMLNS is split directly after that prefix.
    Otherwise the URI is scanned from its end for the last character
    that cannot appear in a name; the local name starts at the first
    name-start character (or ``_``) found after it, and the namespace
    is everything before the local name.

    Raises Exception when no such split exists: for example when the
    URI is empty, consists only of name characters, or has no name-start
    character after its last delimiter.
    """
    if uri.startswith(XMLNS):
        # Slice off the prefix rather than str.split, which would drop
        # part of the local name if XMLNS occurred in the URI again.
        return (XMLNS, uri[len(XMLNS):])
    length = len(uri)
    for i in range(length):
        c = uri[-i - 1]
        if category(c) in NAME_CATEGORIES or c in ALLOWED_NAME_CHARS:
            continue
        # c is the last character that cannot be part of a name, so only
        # the i characters after it may form the local name.  The search
        # must stay inside that trailing run: letting j run past -1 wraps
        # around to the start of the URI and yields a "local name" that
        # contains the delimiter itself.
        for j in range(-i, 0):
            if category(uri[j]) in NAME_START_CATEGORIES or uri[j] == "_":
                return (uri[:j], uri[j:])
        break
    raise Exception("Can't split '%s'" % uri)