Diffstat (limited to 'mercurial/encoding.py')
-rw-r--r--  mercurial/encoding.py  150
1 file changed, 14 insertions, 136 deletions
diff --git a/mercurial/encoding.py b/mercurial/encoding.py
index 781d03b..3005752 100644
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -92,32 +92,24 @@ def tolocal(s):
     'foo: \\xc3\\xa4'
     """

-    try:
+    for e in ('UTF-8', fallbackencoding):
         try:
-            # make sure string is actually stored in UTF-8
-            u = s.decode('UTF-8')
-            if encoding == 'UTF-8':
-                # fast path
-                return s
+            u = s.decode(e) # attempt strict decoding
             r = u.encode(encoding, "replace")
             if u == r.decode(encoding):
                 # r is a safe, non-lossy encoding of s
                 return r
-            return localstr(s, r)
-        except UnicodeDecodeError:
-            # we should only get here if we're looking at an ancient changeset
-            try:
-                u = s.decode(fallbackencoding)
-                r = u.encode(encoding, "replace")
-                if u == r.decode(encoding):
-                    # r is a safe, non-lossy encoding of s
-                    return r
+            elif e == 'UTF-8':
+                return localstr(s, r)
+            else:
                 return localstr(u.encode('UTF-8'), r)
-            except UnicodeDecodeError:
-                u = s.decode("utf-8", "replace") # last ditch
-                return u.encode(encoding, "replace") # can't round-trip
-    except LookupError, k:
-        raise error.Abort(k, hint="please check your locale settings")
+
+        except LookupError, k:
+            raise error.Abort("%s, please check your locale settings" % k)
+        except UnicodeDecodeError:
+            pass
+    u = s.decode("utf-8", "replace") # last ditch
+    return u.encode(encoding, "replace") # can't round-trip

 def fromlocal(s):
     """
@@ -140,14 +132,14 @@ def fromlocal(s):
         sub = s[max(0, inst.start - 10):inst.start + 10]
         raise error.Abort("decoding near '%s': %s!" % (sub, inst))
     except LookupError, k:
-        raise error.Abort(k, hint="please check your locale settings")
+        raise error.Abort("%s, please check your locale settings" % k)

 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
 wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
         and "WFA" or "WF")

 def colwidth(s):
-    "Find the column width of a string for display in the local encoding"
+    "Find the column width of a UTF-8 string for display"
     return ucolwidth(s.decode(encoding, 'replace'))

 def ucolwidth(d):
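For reference, ucolwidth() computes display width from unicodedata.east_asian_width: a character whose width-class letter appears in the wide string ("WF", or "WFA" when HGENCODINGAMBIGUOUS=wide) counts as two columns, everything else as one. A rough standalone equivalent (demo_colwidth is an illustrative name, not part of encoding.py):

import unicodedata

def demo_colwidth(u, ambiguous_wide=False):
    # "W" = wide, "F" = fullwidth, "A" = ambiguous (optionally treated as wide)
    wide = ambiguous_wide and "WFA" or "WF"
    eaw = unicodedata.east_asian_width
    return sum([eaw(c) in wide and 2 or 1 for c in u])

# demo_colwidth(u'abc') == 3, while fullwidth characters take two columns:
# demo_colwidth(u'\uff21\uff22') == 4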
@@ -157,22 +149,9 @@ def ucolwidth(d):
         return sum([eaw(c) in wide and 2 or 1 for c in d])
     return len(d)

-def getcols(s, start, c):
-    '''Use colwidth to find a c-column substring of s starting at byte
-    index start'''
-    for x in xrange(start + c, len(s)):
-        t = s[start:x]
-        if colwidth(t) == c:
-            return t
-
 def lower(s):
     "best-effort encoding-aware case-folding of local string s"
     try:
-        s.decode('ascii') # throw exception for non-ASCII character
-        return s.lower()
-    except UnicodeDecodeError:
-        pass
-    try:
         if isinstance(s, localstr):
             u = s._utf8.decode("utf-8")
         else:
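The lines removed from lower() here were its ASCII fast path: probe with s.decode('ascii'), and when that succeeds a plain bytewise lower() is already correct, so the decode/casefold/re-encode step can be skipped. A compressed sketch of that two-tier pattern (demo_lower is a hypothetical helper; UTF-8 stands in for the configured local encoding and the localstr branch is omitted):

def demo_lower(s, enc='utf-8'):
    try:
        s.decode('ascii')        # pure ASCII: bytewise lower() is already right
        return s.lower()
    except UnicodeDecodeError:
        pass
    try:
        u = s.decode(enc)        # otherwise case-fold in Unicode space
        lu = u.lower()
        return s if u == lu else lu.encode(enc)
    except UnicodeError:
        return s.lower()         # undecodable bytes: ASCII-only folding as a last resort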
@@ -184,104 +163,3 @@ def lower(s):
         return lu.encode(encoding)
     except UnicodeError:
         return s.lower() # we don't know how to fold this except in ASCII
-    except LookupError, k:
-        raise error.Abort(k, hint="please check your locale settings")
-
-def upper(s):
-    "best-effort encoding-aware case-folding of local string s"
-    try:
-        s.decode('ascii') # throw exception for non-ASCII character
-        return s.upper()
-    except UnicodeDecodeError:
-        pass
-    try:
-        if isinstance(s, localstr):
-            u = s._utf8.decode("utf-8")
-        else:
-            u = s.decode(encoding, encodingmode)
-
-        uu = u.upper()
-        if u == uu:
-            return s # preserve localstring
-        return uu.encode(encoding)
-    except UnicodeError:
-        return s.upper() # we don't know how to fold this except in ASCII
-    except LookupError, k:
-        raise error.Abort(k, hint="please check your locale settings")
-
-def toutf8b(s):
-    '''convert a local, possibly-binary string into UTF-8b
-
-    This is intended as a generic method to preserve data when working
-    with schemes like JSON and XML that have no provision for
-    arbitrary byte strings. As Mercurial often doesn't know
-    what encoding data is in, we use so-called UTF-8b.
-
-    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
-    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
-    uDC00-uDCFF.
-
-    Principles of operation:
-
-    - ASCII and UTF-8 data successfully round-trips and is understood
-      by Unicode-oriented clients
-    - filenames and file contents in arbitrary other encodings can
-      be round-tripped or recovered by clueful clients
-    - local strings that have a cached known UTF-8 encoding (aka
-      localstr) get sent as UTF-8 so Unicode-oriented clients get the
-      Unicode data they want
-    - because we must preserve UTF-8 bytestrings in places such as
-      filenames, metadata can't be roundtripped without help
-
-    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
-    arbitrary bytes into an internal Unicode format that can be
-    re-encoded back into the original. Here we are exposing the
-    internal surrogate encoding as a UTF-8 string.)
-    '''
-
-    if isinstance(s, localstr):
-        return s._utf8
-
-    try:
-        if s.decode('utf-8'):
-            return s
-    except UnicodeDecodeError:
-        # surrogate-encode any characters that don't round-trip
-        s2 = s.decode('utf-8', 'ignore').encode('utf-8')
-        r = ""
-        pos = 0
-        for c in s:
-            if s2[pos:pos + 1] == c:
-                r += c
-                pos += 1
-            else:
-                r += unichr(0xdc00 + ord(c)).encode('utf-8')
-        return r
-
-def fromutf8b(s):
-    '''Given a UTF-8b string, return a local, possibly-binary string.
-
-    This returns the original binary string. It is a round-trip
-    process for strings like filenames, but metadata that was
-    passed through tolocal will remain in UTF-8.
-
-    >>> m = "\\xc3\\xa9\\x99abcd"
-    >>> n = toutf8b(m)
-    >>> n
-    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
-    >>> fromutf8b(n) == m
-    True
-    '''
-
-    # fast path - look for uDxxx prefixes in s
-    if "\xed" not in s:
-        return s
-
-    u = s.decode("utf-8")
-    r = ""
-    for c in u:
-        if ord(c) & 0xff00 == 0xdc00:
-            r += chr(ord(c) & 0xff)
-        else:
-            r += c.encode("utf-8")
-    return r
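Together, toutf8b() and fromutf8b() above implement a surrogate-escape round trip: bytes that do not survive strict UTF-8 decoding are smuggled through as code points in the U+DC00-U+DCFF surrogate range and unpacked again on the way back. A small standalone Python 2 demonstration of the same idea (Python 2 is assumed because its UTF-8 codec tolerates lone surrogates; the demo_ names are illustrative and the localstr fast path is left out):

def demo_toutf8b(s):
    s2 = s.decode('utf-8', 'ignore').encode('utf-8')       # keep only the valid UTF-8 runs
    r, pos = "", 0
    for c in s:
        if s2[pos:pos + 1] == c:
            r += c                                          # byte is part of a valid UTF-8 run: copy it
            pos += 1
        else:
            r += unichr(0xdc00 + ord(c)).encode('utf-8')    # escape the stray byte into U+DCxx
    return r

def demo_fromutf8b(s):
    r = ""
    for c in s.decode('utf-8'):
        if ord(c) & 0xff00 == 0xdc00:
            r += chr(ord(c) & 0xff)                         # unpack an escaped byte
        else:
            r += c.encode('utf-8')
    return r

# demo_fromutf8b(demo_toutf8b('\xc3\xa9\x99abcd')) == '\xc3\xa9\x99abcd'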