Diffstat (limited to 'mercurial/encoding.py')
-rw-r--r--  mercurial/encoding.py  150
1 file changed, 14 insertions, 136 deletions
diff --git a/mercurial/encoding.py b/mercurial/encoding.py
index 781d03b..3005752 100644
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -92,32 +92,24 @@ def tolocal(s):
     'foo: \\xc3\\xa4'
     """

-    try:
+    for e in ('UTF-8', fallbackencoding):
         try:
-            # make sure string is actually stored in UTF-8
-            u = s.decode('UTF-8')
-            if encoding == 'UTF-8':
-                # fast path
-                return s
+            u = s.decode(e) # attempt strict decoding
             r = u.encode(encoding, "replace")
             if u == r.decode(encoding):
                 # r is a safe, non-lossy encoding of s
                 return r
-            return localstr(s, r)
-        except UnicodeDecodeError:
-            # we should only get here if we're looking at an ancient changeset
-            try:
-                u = s.decode(fallbackencoding)
-                r = u.encode(encoding, "replace")
-                if u == r.decode(encoding):
-                    # r is a safe, non-lossy encoding of s
-                    return r
+            elif e == 'UTF-8':
+                return localstr(s, r)
+            else:
                 return localstr(u.encode('UTF-8'), r)
-            except UnicodeDecodeError:
-                u = s.decode("utf-8", "replace") # last ditch
-                return u.encode(encoding, "replace") # can't round-trip
-    except LookupError, k:
-        raise error.Abort(k, hint="please check your locale settings")
+
+        except LookupError, k:
+            raise error.Abort("%s, please check your locale settings" % k)
+        except UnicodeDecodeError:
+            pass
+    u = s.decode("utf-8", "replace") # last ditch
+    return u.encode(encoding, "replace") # can't round-trip

 def fromlocal(s):
     """
@@ -140,14 +132,14 @@ def fromlocal(s):
         sub = s[max(0, inst.start - 10):inst.start + 10]
         raise error.Abort("decoding near '%s': %s!" % (sub, inst))
     except LookupError, k:
-        raise error.Abort(k, hint="please check your locale settings")
+        raise error.Abort("%s, please check your locale settings" % k)

 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
 wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
         and "WFA" or "WF")

 def colwidth(s):
-    "Find the column width of a string for display in the local encoding"
+    "Find the column width of a UTF-8 string for display"
     return ucolwidth(s.decode(encoding, 'replace'))

 def ucolwidth(d):
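For reference, ucolwidth() computes display width from unicodedata.east_asian_width: a character whose width-class letter appears in the wide string ("WF", or "WFA" when HGENCODINGAMBIGUOUS=wide) counts as two columns, everything else as one. A rough standalone equivalent (demo_colwidth is an illustrative name, not part of encoding.py):

import unicodedata

def demo_colwidth(u, ambiguous_wide=False):
    # "W" = wide, "F" = fullwidth, "A" = ambiguous (optionally treated as wide)
    wide = ambiguous_wide and "WFA" or "WF"
    eaw = unicodedata.east_asian_width
    return sum([eaw(c) in wide and 2 or 1 for c in u])

# demo_colwidth(u'abc') == 3, while fullwidth characters take two columns:
# demo_colwidth(u'\uff21\uff22') == 4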
@@ -157,22 +149,9 @@ def ucolwidth(d):
         return sum([eaw(c) in wide and 2 or 1 for c in d])
     return len(d)

-def getcols(s, start, c):
-    '''Use colwidth to find a c-column substring of s starting at byte
-    index start'''
-    for x in xrange(start + c, len(s)):
-        t = s[start:x]
-        if colwidth(t) == c:
-            return t
-
 def lower(s):
     "best-effort encoding-aware case-folding of local string s"
     try:
-        s.decode('ascii') # throw exception for non-ASCII character
-        return s.lower()
-    except UnicodeDecodeError:
-        pass
-    try:
         if isinstance(s, localstr):
             u = s._utf8.decode("utf-8")
         else:
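The lines removed from lower() here were its ASCII fast path: probe with s.decode('ascii'), and when that succeeds a plain bytewise lower() is already correct, so the decode/casefold/re-encode step can be skipped. A compressed sketch of that two-tier pattern (demo_lower is a hypothetical helper; UTF-8 stands in for the configured local encoding and the localstr branch is omitted):

def demo_lower(s, enc='utf-8'):
    try:
        s.decode('ascii')        # pure ASCII: bytewise lower() is already right
        return s.lower()
    except UnicodeDecodeError:
        pass
    try:
        u = s.decode(enc)        # otherwise case-fold in Unicode space
        lu = u.lower()
        return s if u == lu else lu.encode(enc)
    except UnicodeError:
        return s.lower()         # undecodable bytes: ASCII-only folding as a last resort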
@@ -184,104 +163,3 @@ def lower(s):
         return lu.encode(encoding)
     except UnicodeError:
         return s.lower() # we don't know how to fold this except in ASCII
-    except LookupError, k:
-        raise error.Abort(k, hint="please check your locale settings")
-
-def upper(s):
-    "best-effort encoding-aware case-folding of local string s"
-    try:
-        s.decode('ascii') # throw exception for non-ASCII character
-        return s.upper()
-    except UnicodeDecodeError:
-        pass
-    try:
-        if isinstance(s, localstr):
-            u = s._utf8.decode("utf-8")
-        else:
-            u = s.decode(encoding, encodingmode)
-
-        uu = u.upper()
-        if u == uu:
-            return s # preserve localstring
-        return uu.encode(encoding)
-    except UnicodeError:
-        return s.upper() # we don't know how to fold this except in ASCII
-    except LookupError, k:
-        raise error.Abort(k, hint="please check your locale settings")
-
-def toutf8b(s):
-    '''convert a local, possibly-binary string into UTF-8b
-
-    This is intended as a generic method to preserve data when working
-    with schemes like JSON and XML that have no provision for
-    arbitrary byte strings. As Mercurial often doesn't know
-    what encoding data is in, we use so-called UTF-8b.
-
-    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
-    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
-    uDC00-uDCFF.
-
-    Principles of operation:
-
-    - ASCII and UTF-8 data successfully round-trips and is understood
-      by Unicode-oriented clients
-    - filenames and file contents in arbitrary other encodings can
-      be round-tripped or recovered by clueful clients
-    - local strings that have a cached known UTF-8 encoding (aka
-      localstr) get sent as UTF-8 so Unicode-oriented clients get the
-      Unicode data they want
-    - because we must preserve UTF-8 bytestrings in places such as
-      filenames, metadata can't be roundtripped without help
-
-    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
-    arbitrary bytes into an internal Unicode format that can be
-    re-encoded back into the original. Here we are exposing the
-    internal surrogate encoding as a UTF-8 string.)
-    '''
-
-    if isinstance(s, localstr):
-        return s._utf8
-
-    try:
-        if s.decode('utf-8'):
-            return s
-    except UnicodeDecodeError:
-        # surrogate-encode any characters that don't round-trip
-        s2 = s.decode('utf-8', 'ignore').encode('utf-8')
-        r = ""
-        pos = 0
-        for c in s:
-            if s2[pos:pos + 1] == c:
-                r += c
-                pos += 1
-            else:
-                r += unichr(0xdc00 + ord(c)).encode('utf-8')
-        return r
-
-def fromutf8b(s):
-    '''Given a UTF-8b string, return a local, possibly-binary string.
-
-    This returns the original binary string. It is a round-trip
-    process for strings like filenames, but metadata that was
-    passed through tolocal will remain in UTF-8.
-
-    >>> m = "\\xc3\\xa9\\x99abcd"
-    >>> n = toutf8b(m)
-    >>> n
-    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
-    >>> fromutf8b(n) == m
-    True
-    '''
-
-    # fast path - look for uDxxx prefixes in s
-    if "\xed" not in s:
-        return s
-
-    u = s.decode("utf-8")
-    r = ""
-    for c in u:
-        if ord(c) & 0xff00 == 0xdc00:
-            r += chr(ord(c) & 0xff)
-        else:
-            r += c.encode("utf-8")
-    return r
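Together, toutf8b() and fromutf8b() above implement a surrogate-escape round trip: bytes that do not survive strict UTF-8 decoding are smuggled through as code points in the U+DC00-U+DCFF surrogate range and unpacked again on the way back. A small standalone Python 2 demonstration of the same idea (Python 2 is assumed because its UTF-8 codec tolerates lone surrogates; the demo_ names are illustrative and the localstr fast path is left out):

def demo_toutf8b(s):
    s2 = s.decode('utf-8', 'ignore').encode('utf-8')       # keep only the valid UTF-8 runs
    r, pos = "", 0
    for c in s:
        if s2[pos:pos + 1] == c:
            r += c                                          # byte is part of a valid UTF-8 run: copy it
            pos += 1
        else:
            r += unichr(0xdc00 + ord(c)).encode('utf-8')    # escape the stray byte into U+DCxx
    return r

def demo_fromutf8b(s):
    r = ""
    for c in s.decode('utf-8'):
        if ord(c) & 0xff00 == 0xdc00:
            r += chr(ord(c) & 0xff)                         # unpack an escaped byte
        else:
            r += c.encode('utf-8')
    return r

# demo_fromutf8b(demo_toutf8b('\xc3\xa9\x99abcd')) == '\xc3\xa9\x99abcd'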