Implement a dedicated int/float parser for XML (schema) values in lxml.objectify. [branch: xml_int_float_parsing]

This disables support for "_" in numbers, which is allowed by Python but not by XMLSchema. We keep a few additional literals, such as "+NaN", simply because they shouldn't hurt. See https://mail.python.org/archives/list/lxml@python.org/thread/6F7VIDKWZTJ6LB6VOX6IJNNWICYHFPNR/
author: Stefan Behnel <stefan_ml@behnel.de> 2021-08-12 16:58:41 +0200
committer: Stefan Behnel <stefan_ml@behnel.de> 2021-08-12 16:59:26 +0200
commit: 83e6c031994d553b74991501c6cd85e3517fadd8 (patch)
tree: f455881a911562a009f0de2f4dcc783d181af8aa
parent: e23a807e816373e9eae9d45b5cecdd85ed2fa76a (diff)
download: python-lxml-xml_int_float_parsing.tar.gz
2 files changed, 179 insertions, 9 deletions
diff --git a/src/lxml/objectify.pyx b/src/lxml/objectify.pyx
index e587e4f2..cacbe806 100644
--- a/src/lxml/objectify.pyx
+++ b/src/lxml/objectify.pyx
@@ -943,6 +943,121 @@ cdef object _parseNumber(NumberElement element):
     return element._parse_value(textOf(element._c_node))
 
 
+cdef enum NumberParserState:
+    NPS_SPACE_PRE = 0
+    NPS_SIGN = 1
+    NPS_DIGITS = 2
+    NPS_POINT_LEAD = 3
+    NPS_POINT = 4
+    NPS_FRACTION = 5
+    NPS_EXP = 6
+    NPS_EXP_SIGN = 7
+    NPS_DIGITS_EXP = 8
+    NPS_SPACE_TAIL = 9
+    NPS_INF1 = 20
+    NPS_INF2 = 21
+    NPS_INF3 = 22
+    NPS_NAN1 = 23
+    NPS_NAN2 = 24
+    NPS_NAN3 = 25
+    NPS_ERROR = 99
+
+
+ctypedef fused bytes_unicode:
+    bytes
+    unicode
+
+
+cdef _checkNumber(bytes_unicode s, bint allow_float):
+    cdef Py_UCS4 c
+    cdef NumberParserState state = NPS_SPACE_PRE
+
+    for c in s:
+        if c.isdigit() if (bytes_unicode is unicode) else c in b'0123456789':
+            if state in (NPS_DIGITS, NPS_FRACTION, NPS_DIGITS_EXP):
+                pass
+            elif state in (NPS_SPACE_PRE, NPS_SIGN):
+                state = NPS_DIGITS
+            elif state in (NPS_POINT_LEAD, NPS_POINT):
+                state = NPS_FRACTION
+            elif state in (NPS_EXP, NPS_EXP_SIGN):
+                state = NPS_DIGITS_EXP
+            else:
+                state = NPS_ERROR
+        else:
+            if c == u'.':
+                if state in (NPS_SPACE_PRE, NPS_SIGN):
+                    state = NPS_POINT_LEAD
+                elif state == NPS_DIGITS:
+                    state = NPS_POINT
+                else:
+                    state = NPS_ERROR
+                if not allow_float:
+                    state = NPS_ERROR
+            elif c in u'-+':
+                if state == NPS_SPACE_PRE:
+                    state = NPS_SIGN
+                elif state == NPS_EXP:
+                    state = NPS_EXP_SIGN
+                else:
+                    state = NPS_ERROR
+            elif c == u'E':
+                if state in (NPS_DIGITS, NPS_POINT, NPS_FRACTION):
+                    state = NPS_EXP
+                else:
+                    state = NPS_ERROR
+                if not allow_float:
+                    state = NPS_ERROR
+            # Allow INF and NaN. XMLSchema requires case, we don't, like Python.
+            elif c in u'iI':
+                state = NPS_INF1 if allow_float and state in (NPS_SPACE_PRE, NPS_SIGN) else NPS_ERROR
+            elif c in u'fF':
+                state = NPS_INF3 if state == NPS_INF2 else NPS_ERROR
+            elif c in u'aA':
+                state = NPS_NAN2 if state == NPS_NAN1 else NPS_ERROR
+            elif c in u'nN':
+                # Python also allows [+-]NaN, so let's accept that.
+                if state in (NPS_SPACE_PRE, NPS_SIGN):
+                    state = NPS_NAN1 if allow_float else NPS_ERROR
+                elif state == NPS_NAN2:
+                    state = NPS_NAN3
+                elif state == NPS_INF1:
+                    state = NPS_INF2
+                else:
+                    state = NPS_ERROR
+            # Allow spaces around text values.
+            else:
+                if c.isspace() if (bytes_unicode is unicode) else c in b'\x09\x0a\x0b\x0c\x0d\x20':
+                    if state in (NPS_SPACE_PRE, NPS_SPACE_TAIL):
+                        pass
+                    elif state in (NPS_DIGITS, NPS_POINT, NPS_FRACTION, NPS_DIGITS_EXP, NPS_INF3, NPS_NAN3):
+                        state = NPS_SPACE_TAIL
+                    else:
+                        state = NPS_ERROR
+                else:
+                    state = NPS_ERROR
+
+            if state == NPS_ERROR:
+                break
+
+    if state not in (NPS_DIGITS, NPS_FRACTION, NPS_POINT, NPS_DIGITS_EXP, NPS_INF3, NPS_NAN3, NPS_SPACE_TAIL):
+        raise ValueError
+
+
+cdef _checkInt(s):
+    if python.IS_PYTHON2 and type(s) is bytes:
+        return _checkNumber(<bytes>s, allow_float=False)
+    else:
+        return _checkNumber(<unicode>s, allow_float=False)
+
+
+cdef _checkFloat(s):
+    if python.IS_PYTHON2 and type(s) is bytes:
+        return _checkNumber(<bytes>s, allow_float=True)
+    else:
+        return _checkNumber(<unicode>s, allow_float=True)
+
+
 cdef object _strValueOf(obj):
     if python._isString(obj):
         return obj
@@ -1104,7 +1219,7 @@ def pytypename(obj):
     return _pytypename(obj)
 
 cdef _registerPyTypes():
-    pytype = PyType(u'int', int, IntElement)
+    pytype = PyType(u'int', _checkInt, IntElement)  # wraps functions for Python
     pytype.xmlSchemaTypes = (u"integer", u"int", u"short", u"byte", u"unsignedShort",
                              u"unsignedByte", u"nonPositiveInteger",
                              u"negativeInteger", u"long", u"nonNegativeInteger",
@@ -1115,7 +1230,7 @@ cdef _registerPyTypes():
     pytype = PyType(u'long', None, IntElement)
     pytype.register()
 
-    pytype = PyType(u'float', float, FloatElement, repr)
+    pytype = PyType(u'float', _checkFloat, FloatElement, repr)  # wraps _parseFloat for Python
     pytype.xmlSchemaTypes = (u"double", u"float")
     pytype.register()
 
diff --git a/src/lxml/tests/test_objectify.py b/src/lxml/tests/test_objectify.py
index a12ae7e1..178ba256 100644
--- a/src/lxml/tests/test_objectify.py
+++ b/src/lxml/tests/test_objectify.py
@@ -6,7 +6,9 @@ Tests specific to the lxml.objectify API
 
 from __future__ import absolute_import
 
-import unittest, operator
+import operator
+import random
+import unittest
 
 from .common_imports import (
     etree, HelperTestCase, fileInTestDir, doctest, make_doctest, _bytes, _str, BytesIO
@@ -2641,6 +2643,9 @@ class ObjectifyTestCase(HelperTestCase):
           <l>4294967296</l>
           <l>-4294967296</l>
           <f>1.1</f>
+          <f>.1</f>
+          <f>.1E23</f>
+          <f>.1E-23</f>
           <b>true</b>
           <b>false</b>
           <s>Strange things happen, where strings collide</s>
@@ -2649,6 +2654,11 @@ class ObjectifyTestCase(HelperTestCase):
           <s>t</s>
           <s>f</s>
           <s></s>
+          <s>12_34</s>
+          <s>1.2_34</s>
+          <s>34E</s>
+          <s>.E</s>
+          <s>.</s>
           <s>None</s>
           <n xsi:nil="true" />
         </root>
@@ -2656,20 +2666,65 @@ class ObjectifyTestCase(HelperTestCase):
         root = XML(xml)
 
         for i in root.i:
-            self.assertTrue(isinstance(i, objectify.IntElement))
+            self.assertTrue(isinstance(i, objectify.IntElement), (i.text, type(i)))
         for l in root.l:
-            self.assertTrue(isinstance(l, objectify.IntElement))
+            self.assertTrue(isinstance(l, objectify.IntElement), (l.text, type(l)))
         for f in root.f:
-            self.assertTrue(isinstance(f, objectify.FloatElement))  
+            self.assertTrue(isinstance(f, objectify.FloatElement), (f.text, type(f)))
         for b in root.b:
-            self.assertTrue(isinstance(b, objectify.BoolElement))
+            self.assertTrue(isinstance(b, objectify.BoolElement), (b.text, type(b)))
         self.assertEqual(True,  root.b[0])
         self.assertEqual(False, root.b[1])
         for s in root.s:
-            self.assertTrue(isinstance(s, objectify.StringElement))
-        self.assertTrue(isinstance(root.n, objectify.NoneElement))
+            self.assertTrue(isinstance(s, objectify.StringElement), (s.text, type(s)))
+        self.assertTrue(isinstance(root.n, objectify.NoneElement), root.n)
         self.assertEqual(None, root.n)
 
+    def test_standard_lookup_fuzz(self):
+        SPACES = ('',) * 10 + ('\t', 'x', '\n', '\r\n', u'\xA0', u'\x0A', u'\u200A', u'\u200B')
+        DIGITS = ('', '0', '1', '11', '21', '345678', '9'*20)
+
+        def space(_choice=random.choice):
+            return _choice(SPACES)
+
+        fuzz = [
+            '<t>%s</t>\n' % (space() + sign + digits + point + fraction + exp + exp_sign + exp_digits + special + space())
+            for sign in ('', '+', '-')
+            for digits in DIGITS
+            for point in ('', '.')
+            for fraction in DIGITS
+            for exp in ('', 'E')
+            for exp_sign in ('', '+', '-')
+            for exp_digits in DIGITS
+            for special in ('', 'INF', 'inf', 'NaN', 'nan', 'an', 'na', 'ana', 'nf')
+        ]
+
+        root = self.XML(_bytes('''\
+        <root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+        ''' + ''.join(fuzz) + '''
+        </root>
+        '''))
+
+        test_count = 0
+        for el in root.iterchildren():
+            text = el.text
+            expected_type = objectify.ObjectifiedElement
+            if text:
+                try:
+                    int(text)
+                    expected_type = objectify.IntElement
+                except ValueError:
+                    try:
+                        float(text)
+                        expected_type = objectify.FloatElement
+                    except ValueError:
+                        expected_type = objectify.StringElement
+
+            self.assertTrue(isinstance(el, expected_type), (text, expected_type, type(el)))
+            test_count += 1
+        self.assertEqual(len(fuzz), test_count)
+
+
 def test_suite():
     suite = unittest.TestSuite()
     suite.addTests([unittest.makeSuite(ObjectifyTestCase)])
author	Stefan Behnel <stefan_ml@behnel.de>	2021-08-12 16:58:41 +0200
committer	Stefan Behnel <stefan_ml@behnel.de>	2021-08-12 16:59:26 +0200
commit	83e6c031994d553b74991501c6cd85e3517fadd8 (patch)
tree	f455881a911562a009f0de2f4dcc783d181af8aa
parent	e23a807e816373e9eae9d45b5cecdd85ed2fa76a (diff)
download	python-lxml-xml_int_float_parsing.tar.gz