3 files changed, 29 insertions, 46 deletions
diff --git a/libraries/base/GHC/Foreign.hs b/libraries/base/GHC/Foreign.hs
index f6f0272164..4eef2ebc2f 100644
--- a/libraries/base/GHC/Foreign.hs
+++ b/libraries/base/GHC/Foreign.hs
@@ -63,7 +63,6 @@ import GHC.Base
 import GHC.IO
 import GHC.IO.Exception
 import GHC.IO.Buffer
-import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter)
 import GHC.IO.Encoding.Types
 
 
@@ -173,7 +172,7 @@ peekEncodedCString (TextEncoding { mkTextDecoder = mk_decoder }) (p, sz_bytes)
             if isEmptyBuffer from'
              then
               -- No input remaining: @why@ will be InputUnderflow, but we don't care
-              fmap (map desurrogatifyRoundtripCharacter) $ withBuffer to' $ peekArray (bufferElems to')
+              withBuffer to' $ peekArray (bufferElems to')
              else do
               -- Input remaining: what went wrong?
               putDebugMsg ("peekEncodedCString: " ++ show iteration ++ " " ++ show why)
@@ -183,7 +182,7 @@ peekEncodedCString (TextEncoding { mkTextDecoder = mk_decoder }) (p, sz_bytes)
               putDebugMsg ("peekEncodedCString: from " ++ summaryBuffer from ++ " " ++ summaryBuffer from' ++ " " ++ summaryBuffer from'')
               putDebugMsg ("peekEncodedCString: to " ++ summaryBuffer to ++ " " ++ summaryBuffer to' ++ " " ++ summaryBuffer to'')
               to_chars <- withBuffer to'' $ peekArray (bufferElems to'')
-              fmap (map desurrogatifyRoundtripCharacter to_chars++) $ go (iteration + 1) from''
+              fmap (to_chars++) $ go (iteration + 1) from''
 
       go (0 :: Int) from0
 
@@ -194,7 +193,7 @@ withEncodedCString :: TextEncoding         -- ^ Encoding of CString to create
                    -> (CStringLen -> IO a) -- ^ Worker that can safely use the allocated memory
                    -> IO a
 withEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s act
-  = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
+  = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do
       from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
 
       let go iteration to_sz_bytes = do
@@ -214,7 +213,7 @@ newEncodedCString :: TextEncoding  -- ^ Encoding of CString to create
                   -> String        -- ^ String to encode
                   -> IO CStringLen
 newEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s
-  = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
+  = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do
       from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
 
       let go iteration to_p to_sz_bytes = do
diff --git a/libraries/base/GHC/IO/Encoding/Failure.hs b/libraries/base/GHC/IO/Encoding/Failure.hs
index 6cd475aca3..862de1fb4e 100644
--- a/libraries/base/GHC/IO/Encoding/Failure.hs
+++ b/libraries/base/GHC/IO/Encoding/Failure.hs
@@ -18,7 +18,6 @@
 module GHC.IO.Encoding.Failure (
     CodingFailureMode(..), codingFailureModeSuffix,
     isSurrogate,
-    surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter,
     recoverDecode, recoverEncode
   ) where
 
@@ -61,26 +60,35 @@ data CodingFailureMode
 -- Note [Roundtripping]
 -- ~~~~~~~~~~~~~~~~~~~~
 --
--- Roundtripping is based on the ideas of PEP383. However, unlike
--- PEP383 we do not wish to use lone surrogate codepoints to escape
--- undecodable bytes, because that may confuse Unicode processing
--- software written in Haskell. Instead, we use the range of
--- private-use characters from 0xEF80 to 0xEFFF designated for
--- "encoding hacks" by the ConScript Unicode Registery.
+-- Roundtripping is based on the ideas of PEP383.
 --
--- This introduces a technical problem when it comes to encoding back
--- to bytes using iconv. The iconv code will not fail when it tries to
--- encode a private-use character (as it would if trying to encode a
--- surrogate), which means that we won't get a chance to replace it
+-- We used to use the range of private-use characters from 0xEF80 to
+-- 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registery
+-- to encode these characters.
+--
+-- However, people didn't like this because it means we don't get
+-- guaranteed roundtripping for byte sequences that look like a UTF-8
+-- encoded codepoint 0xEFxx.
+--
+-- So now like PEP383 we use lone surrogate codepoints 0xDCxx to escape
+-- undecodable bytes, even though that may confuse Unicode processing
+-- software written in Haskell. This guarantees roundtripping because
+-- unicode input that includes lone surrogate codepoints is invalid by
+-- definition.
+--
+-- When we used private-use characters there was a technical problem when it
+-- came to encoding back to bytes using iconv. The iconv code will not fail when
+-- it tries to encode a private-use character (as it would if trying to encode
+-- a surrogate), which means that we won't get a chance to replace it
 -- with the byte we originally escaped.
 --
 -- To work around this, when filling the buffer to be encoded (in
--- writeBlocks/withEncodedCString/newEncodedCString), we replace the
+-- writeBlocks/withEncodedCString/newEncodedCString), we replaced the
 -- private-use characters with lone surrogates again! Likewise, when
 -- reading from a buffer (unpack/unpack_nl/peekEncodedCString) we have
 -- to do the inverse process.
 --
--- The user of String should never see these lone surrogates, but it
+-- The user of String would never see these lone surrogates, but it
 -- ensures that iconv will throw an error when encountering them.  We
 -- use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.
 
@@ -118,26 +126,6 @@ isSurrogate c = (0xD800 <= x && x <= 0xDBFF)
              || (0xDC00 <= x && x <= 0xDFFF)
   where x = ord c
 
--- | Private use characters (in Strings) --> lone surrogates (in
--- Buffer CharBufElem) (We use some private-use characters for
--- roundtripping unknown bytes through a String)
-{-# INLINE surrogatifyRoundtripCharacter #-}
-surrogatifyRoundtripCharacter :: Char -> Char
-surrogatifyRoundtripCharacter c
-  | 0xEF00 <= x && x < 0xF000 = chr (x - (0xEF00 - 0xDC00))
-  | otherwise                 = c
-  where x = ord c
-
--- | Lone surrogates (in Buffer CharBufElem) --> private use
--- characters (in Strings) (We use some surrogate characters for
--- roundtripping unknown bytes through a String)
-{-# INLINE desurrogatifyRoundtripCharacter #-}
-desurrogatifyRoundtripCharacter :: Char -> Char
-desurrogatifyRoundtripCharacter c
-  | 0xDC00 <= x && x < 0xDD00 = chr (x - (0xDC00 - 0xEF00))
-  | otherwise                 = c
-  where x = ord c
-
 -- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
 {-# INLINE escapeToRoundtripCharacterSurrogate #-}
 escapeToRoundtripCharacterSurrogate :: Word8 -> Char
diff --git a/libraries/base/GHC/IO/Handle/Text.hs b/libraries/base/GHC/IO/Handle/Text.hs
index 7162dc28c5..280cebdd03 100644
--- a/libraries/base/GHC/IO/Handle/Text.hs
+++ b/libraries/base/GHC/IO/Handle/Text.hs
@@ -39,7 +39,6 @@ import GHC.IO.FD
 import GHC.IO.Buffer
 import qualified GHC.IO.BufferedIO as Buffered
 import GHC.IO.Exception
-import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter)
 import GHC.Exception
 import GHC.IO.Handle.Types
 import GHC.IO.Handle.Internals
@@ -275,8 +274,7 @@ unpack !buf !r !w acc0
          | otherwise = do
               -- Here, we are rather careful to only put an *evaluated* character
               -- in the output string. Due to pointer tagging, this allows the consumer
-              -- to avoid ping-ponging between the actual consumer code and the
-              -- code of desurrogatifyRoundtripCharacter
+              -- to avoid ping-ponging between the actual consumer code and the thunk code
 #ifdef CHARBUF_UTF16
               -- reverse-order decoding of UTF-16
               c2 <- peekElemOff pbuf i
@@ -289,8 +287,7 @@ unpack !buf !r !w acc0
                            { C# c# -> unpackRB (C# c# : acc) (i-2) }
 #else
               c <- peekElemOff pbuf i
-              case desurrogatifyRoundtripCharacter c of { C# c# ->
-              unpackRB (C# c# : acc) (i-1) } -- Note [#5536]
+              unpackRB (c : acc) (i-1)
 #endif
      in
      unpackRB acc0 (w-1)
@@ -313,8 +310,7 @@ unpack_nl !buf !r !w acc0
                             then unpackRB ('\n':acc) (i-2)
                             else unpackRB ('\n':acc) (i-1)
                  else do
-                         case desurrogatifyRoundtripCharacter c of { C# c# ->
-                         unpackRB (C# c# : acc) (i-1) } -- Note [#5536]
+                         unpackRB (c : acc) (i-1)
      in do
      c <- peekElemOff pbuf (w-1)
      if (c == '\r')
@@ -612,7 +608,7 @@ writeBlocks hdl line_buffered add_nl nl
            else do
                shoveString n' cs rest
      | otherwise = do
-        n' <- writeCharBuf raw n (surrogatifyRoundtripCharacter c)
+        n' <- writeCharBuf raw n c
         shoveString n' cs rest
   in
   shoveString 0 s (if add_nl then "\n" else "")