From a6d372d52469304860902a3eba1bafa412d420f0 Mon Sep 17 00:00:00 2001 From: Tao Wang Date: Wed, 11 Jan 2017 11:44:40 +1100 Subject: Fix #315 support utf-8 by default Signed-off-by: Tao Wang --- sqlparse/lexer.py | 2 +- tests/test_regressions.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 914b520..82d4380 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -44,7 +44,7 @@ class Lexer(object): pass elif isinstance(text, bytes_type): try: - text = text.decode() + text = text.decode('utf-8') except UnicodeDecodeError: if not encoding: encoding = 'unicode-escape' diff --git a/tests/test_regressions.py b/tests/test_regressions.py index d646325..b9a73a2 100644 --- a/tests/test_regressions.py +++ b/tests/test_regressions.py @@ -323,3 +323,22 @@ def test_token_next_doesnt_ignore_skip_cm(): def test_issue284_as_grouping(s): p = sqlparse.parse(s)[0] assert s == str(p) + + +def test_issue315_utf8_by_default(): + # Make sure the lexer can handle utf-8 string by default correctly + # digest = '齐天大圣.カラフルな雲.사랑해요' + # The digest contains Chinese, Japanese and Korean characters + # All in 'utf-8' encoding. + digest = ( + '\xe9\xbd\x90\xe5\xa4\xa9\xe5\xa4\xa7\xe5\x9c\xa3.' + '\xe3\x82\xab\xe3\x83\xa9\xe3\x83\x95\xe3\x83\xab\xe3\x81\xaa\xe9' + '\x9b\xb2.' + '\xec\x82\xac\xeb\x9e\x91\xed\x95\xb4\xec\x9a\x94' + ) + sql = "select * from foo where bar = '{0}'".format(digest) + formatted = sqlparse.format(sql, reindent=True) + tformatted = "select *\nfrom foo\nwhere bar = '{0}'".format(digest) + if PY2: + tformatted = tformatted.decode('utf-8') + assert formatted == tformatted -- cgit v1.2.1 From 66b36af84fbe6d546b73a207e687234f28bb00a0 Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Tue, 10 Jan 2017 18:52:12 -0700 Subject: Fix encoding logic/order - If user provides an encoding value, use it instead of trying to _guess_ first. 
- If no value is provided, try decoding as utf-8 first and fall back to unicode-escape only if that fails --- sqlparse/lexer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 82d4380..60e43da 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -43,12 +43,13 @@ class Lexer(object): if isinstance(text, text_type): pass elif isinstance(text, bytes_type): - try: - text = text.decode('utf-8') - except UnicodeDecodeError: - if not encoding: - encoding = 'unicode-escape' + if encoding: text = text.decode(encoding) + else: + try: + text = text.decode('utf-8') + except UnicodeDecodeError: + text = text.decode('unicode-escape') else: raise TypeError(u"Expected text or file-like object, got {!r}". format(type(text))) -- cgit v1.2.1