summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Bob Ippolito <bob@redivi.com> 2014-07-21 17:18:08 +0200
committer Bob Ippolito <bob@redivi.com> 2014-07-21 17:18:08 +0200
commit 925cae72acd6a0d2a789cdc363736ed826f8ffa6 (patch)
tree 201cfe69b7fcceef6e9f92887c9b431beb5ba970
parent 3eaa8d54dfed8cd64c9f439451f5514f45cd4dd4 (diff)
download simplejson-925cae72acd6a0d2a789cdc363736ed826f8ffa6.tar.gz
generalize BOM stripping to any use of raw_decode
-rw-r--r-- CHANGES.txt 6
-rw-r--r-- conf.py 4
-rw-r--r-- simplejson/__init__.py 13
-rw-r--r-- simplejson/decoder.py 7
-rw-r--r-- simplejson/tests/test_unicode.py 15
-rw-r--r-- simplejson/tests/utf-8-bom.json 3
6 files changed, 24 insertions, 24 deletions
diff --git a/CHANGES.txt b/CHANGES.txt
index 23c705b..8714c1a 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Version 3.6.0 released 2014-07-21
+
+* Automatically strip any UTF-8 BOM from input to more closely
+ follow the latest specs
+ https://github.com/simplejson/simplejson/pull/101
+
Version 3.5.3 released 2014-06-24
* Fix lower bound checking in scan_once / raw_decode API
diff --git a/conf.py b/conf.py
index c222349..35cf327 100644
--- a/conf.py
+++ b/conf.py
@@ -42,9 +42,9 @@ copyright = '2014, Bob Ippolito'
# other places throughout the built documents.
#
# The short X.Y version.
-version = '3.5'
+version = '3.6'
# The full version, including alpha/beta/rc tags.
-release = '3.5.3'
+release = '3.6.0'
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
diff --git a/simplejson/__init__.py b/simplejson/__init__.py
index bc5c93a..0a92914 100644
--- a/simplejson/__init__.py
+++ b/simplejson/__init__.py
@@ -98,7 +98,7 @@ Using simplejson.tool from the shell to validate and pretty-print::
Expecting property name: line 1 column 3 (char 2)
"""
from __future__ import absolute_import
-__version__ = '3.5.3'
+__version__ = '3.6.0'
__all__ = [
'dump', 'dumps', 'load', 'loads',
'JSONDecoder', 'JSONDecodeError', 'JSONEncoder',
@@ -437,16 +437,7 @@ def load(fp, encoding=None, cls=None, object_hook=None, parse_float=None,
of subclassing whenever possible.
"""
- # Strip the UTF-8 BOM
- contents = fp.read()
- ord0 = ord(contents[0])
- if ord0 in (0xef, 0xfeff):
- if ord0 == 0xfeff:
- contents = contents[1:]
- elif contents[:3] == '\xef\xbb\xbf':
- contents = contents[3:]
-
- return loads(contents,
+ return loads(fp.read(),
encoding=encoding, cls=cls, object_hook=object_hook,
parse_float=parse_float, parse_int=parse_int,
parse_constant=parse_constant, object_pairs_hook=object_pairs_hook,
diff --git a/simplejson/decoder.py b/simplejson/decoder.py
index 1a6c5d9..545e658 100644
--- a/simplejson/decoder.py
+++ b/simplejson/decoder.py
@@ -390,4 +390,11 @@ class JSONDecoder(object):
raise JSONDecodeError('Expecting value', s, idx)
if _PY3 and not isinstance(s, text_type):
raise TypeError("Input string must be text, not bytes")
+ # strip UTF-8 bom
+ if len(s) > idx:
+ ord0 = ord(s[idx])
+ if ord0 == 0xfeff:
+ idx += 1
+ elif ord0 == 0xef and s[idx:idx + 3] == '\xef\xbb\xbf':
+ idx += 3
return self.scan_once(s, idx=_w(s, idx).end())
diff --git a/simplejson/tests/test_unicode.py b/simplejson/tests/test_unicode.py
index 60492f7..3b37f65 100644
--- a/simplejson/tests/test_unicode.py
+++ b/simplejson/tests/test_unicode.py
@@ -1,9 +1,9 @@
import sys
-import os.path
+import codecs
from unittest import TestCase
import simplejson as json
-from simplejson.compat import unichr, text_type, b, u
+from simplejson.compat import unichr, text_type, b, u, BytesIO
class TestUnicode(TestCase):
def test_encoding1(self):
@@ -146,9 +146,8 @@ class TestUnicode(TestCase):
'"' + c + '"')
def test_strip_bom(self):
- thisdir = os.path.dirname(__file__)
- json_file = os.path.join(thisdir, "utf-8-bom.json")
- doc_ascii = {
- u"content": u"\u3053\u3093\u306b\u3061\u308f"
- }
- self.assertEqual(json.load(open(json_file)), doc_ascii)
+ content = u"\u3053\u3093\u306b\u3061\u308f"
+ json_doc = codecs.BOM_UTF8 + b(json.dumps(content))
+ self.assertEqual(json.load(BytesIO(json_doc)), content)
+ for doc in json_doc, json_doc.decode('utf8'):
+ self.assertEqual(json.loads(doc), content)
diff --git a/simplejson/tests/utf-8-bom.json b/simplejson/tests/utf-8-bom.json
deleted file mode 100644
index 1791beb..0000000
--- a/simplejson/tests/utf-8-bom.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
- "content": "こんにちわ"
-}