From ed32e9167dc06efa20210a41b45340d86d80ba7f Mon Sep 17 00:00:00 2001
From: ptmcg <ptmcg@9bf210a0-9d2d-494c-87cf-cfb32e7dff7b>
Date: Wed, 25 Nov 2015 19:53:19 +0000
Subject: Cleaned up additional issues from enhancing the error messages for Or
 and MatchFirst, handling Unicode values in expressions. Fixes Unicode
 encoding issues in Python 2.

git-svn-id: svn://svn.code.sf.net/p/pyparsing/code/trunk@303 9bf210a0-9d2d-494c-87cf-cfb32e7dff7b
---
 src/CHANGES      |  8 +++++++-
 src/pyparsing.py | 19 ++++++-------------
 src/unitTests.py | 13 +++++++++++++
 3 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/CHANGES b/src/CHANGES
index 6fd73dd..ce95f13 100644
--- a/src/CHANGES
+++ b/src/CHANGES
@@ -5,7 +5,13 @@ Change Log
 Version 2.0.7 - 
 ---------------------------
 - Simplified string representation of Forward class, to avoid memory
-  and performance errors while building ParseException messages.
+  and performance errors while building ParseException messages. Thanks,
+  Will McGugan, Andrea Censi, and Martijn Vermaat for the bug reports and
+  test code.
+
+- Cleaned up additional issues from enhancing the error messages for
+  Or and MatchFirst, handling Unicode values in expressions. Fixes Unicode
+  encoding issues in Python 2, thanks to Evan Hubinger for the bug report.
 
 
 Version 2.0.6 - 
diff --git a/src/pyparsing.py b/src/pyparsing.py
index 186bc45..f30feb9 100644
--- a/src/pyparsing.py
+++ b/src/pyparsing.py
@@ -123,18 +123,11 @@ else:
             return str(obj)
 
         except UnicodeEncodeError:
-            # The Python docs (http://docs.python.org/ref/customization.html#l2h-182)
-            # state that "The return value must be a string object". However, does a
-            # unicode object (being a subclass of basestring) count as a "string
-            # object"?
-            # If so, then return a unicode object:
-            return unicode(obj)
-            # Else encode it... but how? There are many choices... :)
-            # Replace unprintables with escape codes?
-            #return unicode(obj).encode(sys.getdefaultencoding(), 'backslashreplace_errors')
-            # Replace unprintables with question marks?
-            #return unicode(obj).encode(sys.getdefaultencoding(), 'replace')
-            # ...
+            # str(obj) raised UnicodeEncodeError: render non-ASCII characters as \u<hex> escapes
+            ret = unicode(obj).encode(sys.getdefaultencoding(), 'xmlcharrefreplace')
+            xmlcharref = Regex('&#\d+;')
+            xmlcharref.setParseAction(lambda t: '\\u' + hex(int(t[0][2:-1]))[2:])
+            return xmlcharref.transformString(ret)
 
     # build list of single arg builtins, tolerant of Python version, that can be used as parse actions
     singleArgBuiltins = []
@@ -2351,7 +2344,7 @@ class ParseExpression(ParserElement):
                 self.mayReturnEmpty |= other.mayReturnEmpty
                 self.mayIndexError  |= other.mayIndexError
 
-        self.errmsg = "Expected " + str(self)
+        self.errmsg = "Expected " + _ustr(self)
         
         return self
 
diff --git a/src/unitTests.py b/src/unitTests.py
index f496be1..22ef6d6 100644
--- a/src/unitTests.py
+++ b/src/unitTests.py
@@ -2297,6 +2297,18 @@ class EachWithOptionalWithResultsNameTest(ParseTestCase):
         print_(result.dump())
         assert sorted(result.keys()) == ['one','two']
 
+class UnicodeExpressionTest(ParseTestCase):
+    def runTest(self):
+        from pyparsing import Literal, ParseException
+        
+        z = 'a' | Literal(u'\u1111')
+        z.streamline()
+        try:
+            z.parseString('b')
+        except ParseException as pe:
+            if not PY_3:
+                assert pe.msg == r'''Expected {"a" | "\u1111"}''', "Invalid error message raised, got %r" % pe.msg
+
 class MiscellaneousParserTests(ParseTestCase):
     def runTest(self):
         import pyparsing
@@ -2508,6 +2520,7 @@ def makeTestSuite():
     suite.addTest( AddConditionTest() )
     suite.addTest( PatientOrTest() )
     suite.addTest( EachWithOptionalWithResultsNameTest() )
+    suite.addTest( UnicodeExpressionTest() )
     suite.addTest( MiscellaneousParserTests() )
     if TEST_USING_PACKRAT:
         # retest using packrat parsing (disable those tests that aren't compatible)
-- 
cgit v1.2.1