Restrict normalization to unicode-compatible values (#674)

This allows us to avoid accidentally overriding our initial lexical value with one that is not unicode-compatible after normalization. This is specifically relevant for arbitrary binary data with bytes outside of the defined unicode range.
author: Nate Prewitt <nateprewitt@users.noreply.github.com> 2017-01-24 07:12:44 -0700
committer: Gunnar Aastrand Grimnes <gromgull@users.noreply.github.com> 2017-01-24 15:12:44 +0100
commit: 695a670bbde8304a7cd209c2ea629ccf7cf1d52a (patch)
tree: 47825d4b6435cd4e6bda43701e747f0f07f8db1d
parent: 93c74261c4a04049026738ace46cb22902357219 (diff)
download: rdflib-695a670bbde8304a7cd209c2ea629ccf7cf1d52a.tar.gz
2 files changed, 52 insertions, 12 deletions
diff --git a/rdflib/term.py b/rdflib/term.py
index 37c0e84a..37fa4ac7 100644
--- a/rdflib/term.py
+++ b/rdflib/term.py
@@ -50,7 +50,6 @@ from collections import defaultdict
 
 from isodate import parse_time, parse_date, parse_datetime
 
-
 try:
     from hashlib import md5
     assert md5
@@ -63,7 +62,6 @@ from . import py3compat
 from rdflib.compat import numeric_greater
 
 
-
 b = py3compat.b
 
 skolem_genid = "/.well-known/genid/"
@@ -83,6 +81,24 @@ _lang_tag_regex = compile('^[a-zA-Z]+(?:-[a-zA-Z0-9]+)*$')
 def _is_valid_langtag(tag):
     return bool(_lang_tag_regex.match(tag))
 
+def _is_valid_unicode(value):
+    """
+    Verify that the provided value can be converted into a Python
+    unicode object.
+    """
+    if isinstance(value, bytes):
+        coding_func, param = getattr(value, 'decode'), 'utf-8'
+    elif py3compat.PY3:
+        coding_func, param = str, value
+    else:
+        coding_func, param = unicode, value
+
+    # try to convert value into unicode
+    try:
+        coding_func(param)
+    except UnicodeError:
+        return False
+    return True
 
 class Node(object):
     """
@@ -571,7 +587,7 @@ class Literal(Identifier):
 
                 if value is not None and normalize:
                     _value, _datatype = _castPythonToLiteral(value)
-                    if _value is not None:
+                    if _value is not None and _is_valid_unicode(_value):
                         lexical_or_value = _value
 
         else:
@@ -585,7 +601,6 @@ class Literal(Identifier):
             if datatype:
                 lang = None
 
-
         if py3compat.PY3 and isinstance(lexical_or_value, bytes):
             lexical_or_value = lexical_or_value.decode('utf-8')
 
@@ -1495,8 +1510,7 @@ XSDToPython = {
     URIRef(_XSD_PFX + 'unsignedByte'): int,
     URIRef(_XSD_PFX + 'float'): float,
     URIRef(_XSD_PFX + 'double'): float,
-    URIRef(
-        _XSD_PFX + 'base64Binary'): lambda s: base64.b64decode(py3compat.b(s)),
+    URIRef(_XSD_PFX + 'base64Binary'): lambda s: base64.b64decode(s),
     URIRef(_XSD_PFX + 'anyURI'): None,
     _RDF_XMLLITERAL: _parseXML,
     _RDF_HTMLLITERAL: _parseHTML
diff --git a/test/test_term.py b/test/test_term.py
index fdbea4d5..78fa7a03 100644
--- a/test/test_term.py
+++ b/test/test_term.py
@@ -3,9 +3,12 @@ some more specific Literal tests are in test_literal.py
 """
 
 import unittest
+import base64
+
 from rdflib.py3compat import format_doctest_out as uformat
-from rdflib.term import URIRef, BNode
+from rdflib.term import URIRef, BNode, Literal, _is_valid_unicode
 from rdflib.graph import QuotedGraph, Graph
+from rdflib.namespace import XSD
 
 class TestURIRefRepr(unittest.TestCase):
     """
@@ -25,15 +28,38 @@ class TestURIRefRepr(unittest.TestCase):
         a = u>BNode()
         a = u>QuotedGraph(g.store, u)
         a = u>g
-        
-        
-        
-        
+
 
 class TestBNodeRepr(unittest.TestCase):
-   
+
     def testSubclassNameAppearsInRepr(self):
         class MyBNode(BNode):
             pass
         x = MyBNode()
         self.assertTrue(repr(x).startswith("MyBNode("))
+
+
+class TestLiteral(unittest.TestCase):
+
+    def test_base64_values(self):
+        b64msg = 'cmRmbGliIGlzIGNvb2whIGFsc28gaGVyZSdzIHNvbWUgYmluYXJ5IAAR83UC'
+        decoded_b64msg = base64.b64decode(b64msg)
+        lit = Literal(b64msg, datatype=XSD.base64Binary)
+        self.assertEqual(lit.value, decoded_b64msg)
+        self.assertEqual(str(lit), b64msg)
+
+
+class TestValidityFunctions(unittest.TestCase):
+
+    def test_is_valid_unicode(self):
+        testcase_list = (
+            (None, True),
+            (1, True),
+            (['foo'], True),
+            ({'foo': b'bar'}, True),
+            ('foo', True),
+            (b'foo\x00', True),
+            (b'foo\xf3\x02', False)
+        )
+        for val, expected in testcase_list:
+            self.assertEqual(_is_valid_unicode(val), expected)
author	Nate Prewitt <nateprewitt@users.noreply.github.com>	2017-01-24 07:12:44 -0700
committer	Gunnar Aastrand Grimnes <gromgull@users.noreply.github.com>	2017-01-24 15:12:44 +0100
commit	695a670bbde8304a7cd209c2ea629ccf7cf1d52a (patch)
tree	47825d4b6435cd4e6bda43701e747f0f07f8db1d
parent	93c74261c4a04049026738ace46cb22902357219 (diff)
download	rdflib-695a670bbde8304a7cd209c2ea629ccf7cf1d52a.tar.gz