summary | refs | log | tree | commit | diff
path: root/rdflib/namespace.py
diff options
context:
space:
mode:
author eikeon <devnull@localhost> 2010-02-13 01:27:55 +0000
committer eikeon <devnull@localhost> 2010-02-13 01:27:55 +0000
commit 02b60fa0002fd2efd8a245265f0a8fce1122affd (patch)
tree a993977327f23bd83c8e2eeb495b5df8bfd135eb /rdflib/namespace.py
parent d8166002dd19346ffffae7cc2194c4c197c9f1b5 (diff)
download rdflib-02b60fa0002fd2efd8a245265f0a8fce1122affd.tar.gz
Update issue 120
Diffstat (limited to 'rdflib/namespace.py')
-rw-r--r-- rdflib/namespace.py 84
1 files changed, 83 insertions, 1 deletions
diff --git a/rdflib/namespace.py b/rdflib/namespace.py
index 8ee0bee6..148f46e9 100644
--- a/rdflib/namespace.py
+++ b/rdflib/namespace.py
@@ -10,7 +10,6 @@ from urlparse import urljoin, urldefrag
from urllib import pathname2url
from rdflib.term import URIRef, Variable, _XSD_PFX
-from rdflib.syntax.xml_names import split_uri
class Namespace(URIRef):
@@ -241,3 +240,86 @@ class NamespaceManager(object):
if uri and uri[-1]=="#" and result[-1]!="#":
result = "%s#" % result
return URIRef(result)
+
+# From: http://www.w3.org/TR/REC-xml#NT-CombiningChar
+#
+# * Name start characters must have one of the categories Ll, Lu, Lo,
+# Lt, Nl.
+#
+# * Name characters other than Name-start characters must have one of
+# the categories Mc, Me, Mn, Lm, or Nd.
+#
+# * Characters in the compatibility area (i.e. with character code
+# greater than #xF900 and less than #xFFFE) are not allowed in XML
+# names.
+#
+# * Characters which have a font or compatibility decomposition
+# (i.e. those with a "compatibility formatting tag" in field 5 of the
+# database -- marked by field 5 beginning with a "<") are not allowed.
+#
+# * The following characters are treated as name-start characters rather
+# than name characters, because the property file classifies them as
+# Alphabetic: [#x02BB-#x02C1], #x0559, #x06E5, #x06E6.
+#
+# * Characters #x20DD-#x20E0 are excluded (in accordance with Unicode
+# 2.0, section 5.14).
+#
+# * Character #x00B7 is classified as an extender, because the property
+# list so identifies it.
+#
+# * Character #x0387 is added as a name character, because #x00B7 is its
+# canonical equivalent.
+#
+# * Characters ':' and '_' are allowed as name-start characters.
+#
+# * Characters '-' and '.' are allowed as name characters.
+
+from unicodedata import category, decomposition
+
# Unicode general categories accepted for the first character of a name.
NAME_START_CATEGORIES = ["Ll", "Lu", "Lo", "Lt", "Nl"]

# Categories accepted for any later character: every name-start category
# plus the combining-mark, modifier-letter and decimal-digit categories.
NAME_CATEGORIES = list(NAME_START_CATEGORIES)
NAME_CATEGORIES.extend(["Mc", "Me", "Mn", "Lm", "Nd"])

# Individual characters accepted inside a name even though their Unicode
# category is not listed in NAME_CATEGORIES.
ALLOWED_NAME_CHARS = [
    u"\u00B7",
    u"\u0387",
    u"-",
    u".",
    u"_",
]
+
+# http://www.w3.org/TR/REC-xml-names/#NT-NCName
+# [4] NCName ::= (Letter | '_') (NCNameChar)* /* An XML Name, minus
+# the ":" */
+# [5] NCNameChar ::= Letter | Digit | '.' | '-' | '_' | CombiningChar
+# | Extender
+
def is_ncname(name):
    """Return 1 if *name* is acceptable as an XML NCName, otherwise 0.

    The first character must be ``_`` or have a Unicode category listed in
    NAME_START_CATEGORIES.  Every following character must either have a
    category listed in NAME_CATEGORIES or be one of ALLOWED_NAME_CHARS.
    A colon is therefore rejected, and so is an empty (or otherwise falsy)
    *name*.

    NOTE: the XML specification's exclusions for compatibility-area
    characters and characters with a compatibility decomposition are not
    enforced here.
    """
    if not name:
        # An NCName needs at least one character; bail out before indexing.
        return 0

    first = name[0]
    if first != "_" and category(first) not in NAME_START_CATEGORIES:
        return 0

    for c in name[1:]:
        if category(c) in NAME_CATEGORIES:
            continue
        if c in ALLOWED_NAME_CHARS:
            continue
        return 0
    return 1
+
XMLNS = "http://www.w3.org/XML/1998/namespace"


def split_uri(uri):
    """Split *uri* into a ``(namespace, local_name)`` tuple.

    A URI that starts with XMLNS is split directly after that prefix.
    Otherwise the URI is scanned from its end for the last character that
    cannot be part of a name (its category is not in NAME_CATEGORIES and it
    is not one of ALLOWED_NAME_CHARS).  The local name then starts at the
    first character after it that may start a name (``_`` or a category in
    NAME_START_CATEGORIES); everything before that is the namespace.

    Raises:
        Exception: if no such split point exists, e.g. the URI is empty,
            consists only of name characters, or its trailing run of name
            characters contains no valid name-start character.
    """
    if uri.startswith(XMLNS):
        # Slice instead of str.split so a later repeat of the namespace
        # string inside the URI cannot truncate the local part.
        return (XMLNS, uri[len(XMLNS):])

    length = len(uri)
    for pos in range(length - 1, -1, -1):
        c = uri[pos]
        if category(c) in NAME_CATEGORIES or c in ALLOWED_NAME_CHARS:
            continue
        # uri[pos] is the last non-name character.  Look for a name start
        # only in the run that follows it; searching past the end and back
        # around to the front of the URI would produce a "local name" that
        # still contains non-name characters.
        for j in range(pos + 1, length):
            if uri[j] == "_" or category(uri[j]) in NAME_START_CATEGORIES:
                return (uri[:j], uri[j:])
        break
    raise Exception("Can't split '%s'" % uri)