summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStefan Behnel <stefan_ml@behnel.de>2021-08-12 16:58:41 +0200
committerStefan Behnel <stefan_ml@behnel.de>2021-08-12 16:59:26 +0200
commit83e6c031994d553b74991501c6cd85e3517fadd8 (patch)
treef455881a911562a009f0de2f4dcc783d181af8aa
parente23a807e816373e9eae9d45b5cecdd85ed2fa76a (diff)
downloadpython-lxml-xml_int_float_parsing.tar.gz
Implement a dedicated int/float parser for XML (schema) values in lxml.objectify.xml_int_float_parsing
This disables support for "_" in numbers, which is allowed by Python but not by XMLSchema. We keep a few additional literals, such as "+NaN", simply because they shouldn't hurt. See https://mail.python.org/archives/list/lxml@python.org/thread/6F7VIDKWZTJ6LB6VOX6IJNNWICYHFPNR/
-rw-r--r--src/lxml/objectify.pyx119
-rw-r--r--src/lxml/tests/test_objectify.py69
2 files changed, 179 insertions, 9 deletions
diff --git a/src/lxml/objectify.pyx b/src/lxml/objectify.pyx
index e587e4f2..cacbe806 100644
--- a/src/lxml/objectify.pyx
+++ b/src/lxml/objectify.pyx
@@ -943,6 +943,121 @@ cdef object _parseNumber(NumberElement element):
return element._parse_value(textOf(element._c_node))
+# States of the character-by-character number syntax check in _checkNumber().
+# Values 0-9 track a plain number from left to right, 20-25 track the
+# special literals INF and NaN, and 99 marks invalid input.
+cdef enum NumberParserState:
+ NPS_SPACE_PRE = 0  # initial state; only leading whitespace seen so far
+ NPS_SIGN = 1  # leading '+' or '-' seen
+ NPS_DIGITS = 2  # inside the digits before any decimal point
+ NPS_POINT_LEAD = 3  # '.' seen before any digit (e.g. ".1")
+ NPS_POINT = 4  # '.' seen directly after the integer digits (e.g. "1.")
+ NPS_FRACTION = 5  # inside the digits after the decimal point
+ NPS_EXP = 6  # exponent marker seen
+ NPS_EXP_SIGN = 7  # '+' or '-' seen right after the exponent marker
+ NPS_DIGITS_EXP = 8  # inside the exponent digits
+ NPS_SPACE_TAIL = 9  # whitespace after a complete value
+ NPS_INF1 = 20  # 'I' of INF seen (any case)
+ NPS_INF2 = 21  # 'IN' seen
+ NPS_INF3 = 22  # complete 'INF' seen
+ NPS_NAN1 = 23  # 'N' of NaN seen (any case)
+ NPS_NAN2 = 24  # 'NA' seen
+ NPS_NAN3 = 25  # complete 'NAN' seen
+ NPS_ERROR = 99  # invalid input; _checkNumber() stops scanning and raises ValueError
+
+
+# Fused string type: allows _checkNumber() to be specialised for both
+# byte strings and unicode strings.
+ctypedef fused bytes_unicode:
+ bytes
+ unicode
+
+
+cdef _checkNumber(bytes_unicode s, bint allow_float):
+    """Validate that 's' spells an integer or, with 'allow_float', a float value.
+
+    Runs a small state machine (see NumberParserState) over the characters
+    of 's'.  Whitespace is allowed before and after the value.  With
+    'allow_float', a decimal point, an exponent (marked by "E" or "e") and the
+    case-insensitive literals INF and NaN (optionally signed) are accepted
+    as well.  Anything else, including "_" inside of numbers, is rejected.
+
+    Returns None for valid input and raises ValueError otherwise.
+    """
+    cdef Py_UCS4 c
+    cdef NumberParserState state = NPS_SPACE_PRE
+
+    for c in s:
+        # Use isdecimal() rather than isdigit(): the latter also matches characters
+        # like superscript digits, which Python's int() and float() reject.
+        if c.isdecimal() if (bytes_unicode is unicode) else c in b'0123456789':
+            if state in (NPS_DIGITS, NPS_FRACTION, NPS_DIGITS_EXP):
+                pass
+            elif state in (NPS_SPACE_PRE, NPS_SIGN):
+                state = NPS_DIGITS
+            elif state in (NPS_POINT_LEAD, NPS_POINT):
+                state = NPS_FRACTION
+            elif state in (NPS_EXP, NPS_EXP_SIGN):
+                state = NPS_DIGITS_EXP
+            else:
+                state = NPS_ERROR
+        else:
+            if c == u'.':
+                if state in (NPS_SPACE_PRE, NPS_SIGN):
+                    state = NPS_POINT_LEAD
+                elif state == NPS_DIGITS:
+                    state = NPS_POINT
+                else:
+                    state = NPS_ERROR
+                if not allow_float:
+                    state = NPS_ERROR
+            elif c in u'-+':
+                if state == NPS_SPACE_PRE:
+                    state = NPS_SIGN
+                elif state == NPS_EXP:
+                    state = NPS_EXP_SIGN
+                else:
+                    state = NPS_ERROR
+            # Accept both cases of the exponent marker, as XMLSchema and Python do.
+            # Note that repr() of a Python float uses the lowercase "e".
+            elif c in u'Ee':
+                # An exponent requires at least one mantissa digit before it.
+                if state in (NPS_DIGITS, NPS_POINT, NPS_FRACTION):
+                    state = NPS_EXP
+                else:
+                    state = NPS_ERROR
+                if not allow_float:
+                    state = NPS_ERROR
+            # Allow INF and NaN. XMLSchema requires case, we don't, like Python.
+            elif c in u'iI':
+                state = NPS_INF1 if allow_float and state in (NPS_SPACE_PRE, NPS_SIGN) else NPS_ERROR
+            elif c in u'fF':
+                state = NPS_INF3 if state == NPS_INF2 else NPS_ERROR
+            elif c in u'aA':
+                state = NPS_NAN2 if state == NPS_NAN1 else NPS_ERROR
+            elif c in u'nN':
+                # Python also allows [+-]NaN, so let's accept that.
+                if state in (NPS_SPACE_PRE, NPS_SIGN):
+                    state = NPS_NAN1 if allow_float else NPS_ERROR
+                elif state == NPS_NAN2:
+                    state = NPS_NAN3
+                elif state == NPS_INF1:
+                    state = NPS_INF2
+                else:
+                    state = NPS_ERROR
+            # Allow spaces around text values.
+            else:
+                if c.isspace() if (bytes_unicode is unicode) else c in b'\x09\x0a\x0b\x0c\x0d\x20':
+                    if state in (NPS_SPACE_PRE, NPS_SPACE_TAIL):
+                        pass
+                    elif state in (NPS_DIGITS, NPS_POINT, NPS_FRACTION, NPS_DIGITS_EXP, NPS_INF3, NPS_NAN3):
+                        state = NPS_SPACE_TAIL
+                    else:
+                        state = NPS_ERROR
+                else:
+                    state = NPS_ERROR
+
+        # The error state can never be left again, so stop scanning early.
+        if state == NPS_ERROR:
+            break
+
+    # Only states that represent a complete value are acceptable at the end,
+    # e.g. not a lone sign, a lone ".", or a dangling exponent marker.
+    if state not in (NPS_DIGITS, NPS_FRACTION, NPS_POINT, NPS_DIGITS_EXP, NPS_INF3, NPS_NAN3, NPS_SPACE_TAIL):
+        raise ValueError
+
+
+cdef _checkInt(s):
+    """Check that 's' spells an integer value; raises ValueError if it does not."""
+    # The bytes code path is only taken on Python 2.
+    # Any other input is cast to unicode.
+    if not python.IS_PYTHON2 or type(s) is not bytes:
+        return _checkNumber(<unicode>s, allow_float=False)
+    return _checkNumber(<bytes>s, allow_float=False)
+
+
+cdef _checkFloat(s):
+    """Check that 's' spells a float (or integer) value; raises ValueError if it does not."""
+    # The bytes code path is only taken on Python 2.
+    # Any other input is cast to unicode.
+    if not python.IS_PYTHON2 or type(s) is not bytes:
+        return _checkNumber(<unicode>s, allow_float=True)
+    return _checkNumber(<bytes>s, allow_float=True)
+
+
cdef object _strValueOf(obj):
if python._isString(obj):
return obj
@@ -1104,7 +1219,7 @@ def pytypename(obj):
return _pytypename(obj)
cdef _registerPyTypes():
- pytype = PyType(u'int', int, IntElement)
+ pytype = PyType(u'int', _checkInt, IntElement) # wraps functions for Python
pytype.xmlSchemaTypes = (u"integer", u"int", u"short", u"byte", u"unsignedShort",
u"unsignedByte", u"nonPositiveInteger",
u"negativeInteger", u"long", u"nonNegativeInteger",
@@ -1115,7 +1230,7 @@ cdef _registerPyTypes():
pytype = PyType(u'long', None, IntElement)
pytype.register()
- pytype = PyType(u'float', float, FloatElement, repr)
+ pytype = PyType(u'float', _checkFloat, FloatElement, repr) # wraps _parseFloat for Python
pytype.xmlSchemaTypes = (u"double", u"float")
pytype.register()
diff --git a/src/lxml/tests/test_objectify.py b/src/lxml/tests/test_objectify.py
index a12ae7e1..178ba256 100644
--- a/src/lxml/tests/test_objectify.py
+++ b/src/lxml/tests/test_objectify.py
@@ -6,7 +6,9 @@ Tests specific to the lxml.objectify API
from __future__ import absolute_import
-import unittest, operator
+import operator
+import random
+import unittest
from .common_imports import (
etree, HelperTestCase, fileInTestDir, doctest, make_doctest, _bytes, _str, BytesIO
@@ -2641,6 +2643,9 @@ class ObjectifyTestCase(HelperTestCase):
<l>4294967296</l>
<l>-4294967296</l>
<f>1.1</f>
+ <f>.1</f>
+ <f>.1E23</f>
+ <f>.1E-23</f>
<b>true</b>
<b>false</b>
<s>Strange things happen, where strings collide</s>
@@ -2649,6 +2654,11 @@ class ObjectifyTestCase(HelperTestCase):
<s>t</s>
<s>f</s>
<s></s>
+ <s>12_34</s>
+ <s>1.2_34</s>
+ <s>34E</s>
+ <s>.E</s>
+ <s>.</s>
<s>None</s>
<n xsi:nil="true" />
</root>
@@ -2656,20 +2666,65 @@ class ObjectifyTestCase(HelperTestCase):
root = XML(xml)
for i in root.i:
- self.assertTrue(isinstance(i, objectify.IntElement))
+ self.assertTrue(isinstance(i, objectify.IntElement), (i.text, type(i)))
for l in root.l:
- self.assertTrue(isinstance(l, objectify.IntElement))
+ self.assertTrue(isinstance(l, objectify.IntElement), (l.text, type(l)))
for f in root.f:
- self.assertTrue(isinstance(f, objectify.FloatElement))
+ self.assertTrue(isinstance(f, objectify.FloatElement), (f.text, type(f)))
for b in root.b:
- self.assertTrue(isinstance(b, objectify.BoolElement))
+ self.assertTrue(isinstance(b, objectify.BoolElement), (b.text, type(b)))
self.assertEqual(True, root.b[0])
self.assertEqual(False, root.b[1])
for s in root.s:
- self.assertTrue(isinstance(s, objectify.StringElement))
- self.assertTrue(isinstance(root.n, objectify.NoneElement))
+ self.assertTrue(isinstance(s, objectify.StringElement), (s.text, type(s)))
+ self.assertTrue(isinstance(root.n, objectify.NoneElement), root.n)
self.assertEqual(None, root.n)
+    def test_standard_lookup_fuzz(self):
+        # Build a large set of number-like (and deliberately malformed) element texts
+        # and check that the element class chosen by the lookup matches what
+        # Python's own int() / float() accept for the same text.
+        # NOTE: 'x' is not whitespace - presumably included to generate invalid values.
+        padding_choices = ('',) * 10 + ('\t', 'x', '\n', '\r\n', u'\xA0', u'\x0A', u'\u200A', u'\u200B')
+        digit_runs = ('', '0', '1', '11', '21', '345678', '9'*20)
+        sign_choices = ('', '+', '-')
+
+        def padding(_pick=random.choice):
+            return _pick(padding_choices)
+
+        def expected_type_for(text):
+            # Empty elements are not type-inferred at all.
+            if not text:
+                return objectify.ObjectifiedElement
+            # Try the integer interpretation first, then fall back to float.
+            for convert, element_type in ((int, objectify.IntElement), (float, objectify.FloatElement)):
+                try:
+                    convert(text)
+                except ValueError:
+                    continue
+                return element_type
+            return objectify.StringElement
+
+        fuzz = [
+            '<t>%s</t>\n' % (padding() + sign + digits + point + fraction + exp + exp_sign + exp_digits + special + padding())
+            for sign in sign_choices
+            for digits in digit_runs
+            for point in ('', '.')
+            for fraction in digit_runs
+            for exp in ('', 'E')
+            for exp_sign in sign_choices
+            for exp_digits in digit_runs
+            for special in ('', 'INF', 'inf', 'NaN', 'nan', 'an', 'na', 'ana', 'nf')
+        ]
+
+        root = self.XML(_bytes('''\
+ <root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ ''' + ''.join(fuzz) + '''
+ </root>
+ '''))
+
+        test_count = 0
+        for el in root.iterchildren():
+            text = el.text
+            expected_type = expected_type_for(text)
+            self.assertTrue(isinstance(el, expected_type), (text, expected_type, type(el)))
+            test_count += 1
+
+        # Every generated element must have been checked.
+        self.assertEqual(len(fuzz), test_count)
+
+
def test_suite():
suite = unittest.TestSuite()
suite.addTests([unittest.makeSuite(ObjectifyTestCase)])