summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libraries/base/GHC/Foreign.hs9
-rw-r--r--libraries/base/GHC/IO/Encoding/Failure.hs54
-rw-r--r--libraries/base/GHC/IO/Handle/Text.hs12
3 files changed, 29 insertions, 46 deletions
diff --git a/libraries/base/GHC/Foreign.hs b/libraries/base/GHC/Foreign.hs
index f6f0272164..4eef2ebc2f 100644
--- a/libraries/base/GHC/Foreign.hs
+++ b/libraries/base/GHC/Foreign.hs
@@ -63,7 +63,6 @@ import GHC.Base
import GHC.IO
import GHC.IO.Exception
import GHC.IO.Buffer
-import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter)
import GHC.IO.Encoding.Types
@@ -173,7 +172,7 @@ peekEncodedCString (TextEncoding { mkTextDecoder = mk_decoder }) (p, sz_bytes)
if isEmptyBuffer from'
then
-- No input remaining: @why@ will be InputUnderflow, but we don't care
- fmap (map desurrogatifyRoundtripCharacter) $ withBuffer to' $ peekArray (bufferElems to')
+ withBuffer to' $ peekArray (bufferElems to')
else do
-- Input remaining: what went wrong?
putDebugMsg ("peekEncodedCString: " ++ show iteration ++ " " ++ show why)
@@ -183,7 +182,7 @@ peekEncodedCString (TextEncoding { mkTextDecoder = mk_decoder }) (p, sz_bytes)
putDebugMsg ("peekEncodedCString: from " ++ summaryBuffer from ++ " " ++ summaryBuffer from' ++ " " ++ summaryBuffer from'')
putDebugMsg ("peekEncodedCString: to " ++ summaryBuffer to ++ " " ++ summaryBuffer to' ++ " " ++ summaryBuffer to'')
to_chars <- withBuffer to'' $ peekArray (bufferElems to'')
- fmap (map desurrogatifyRoundtripCharacter to_chars++) $ go (iteration + 1) from''
+ fmap (to_chars++) $ go (iteration + 1) from''
go (0 :: Int) from0
@@ -194,7 +193,7 @@ withEncodedCString :: TextEncoding -- ^ Encoding of CString to create
-> (CStringLen -> IO a) -- ^ Worker that can safely use the allocated memory
-> IO a
withEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s act
- = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
+ = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do
from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
let go iteration to_sz_bytes = do
@@ -214,7 +213,7 @@ newEncodedCString :: TextEncoding -- ^ Encoding of CString to create
-> String -- ^ String to encode
-> IO CStringLen
newEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s
- = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do
+ = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do
from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p
let go iteration to_p to_sz_bytes = do
diff --git a/libraries/base/GHC/IO/Encoding/Failure.hs b/libraries/base/GHC/IO/Encoding/Failure.hs
index 6cd475aca3..862de1fb4e 100644
--- a/libraries/base/GHC/IO/Encoding/Failure.hs
+++ b/libraries/base/GHC/IO/Encoding/Failure.hs
@@ -18,7 +18,6 @@
module GHC.IO.Encoding.Failure (
CodingFailureMode(..), codingFailureModeSuffix,
isSurrogate,
- surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter,
recoverDecode, recoverEncode
) where
@@ -61,26 +60,35 @@ data CodingFailureMode
-- Note [Roundtripping]
-- ~~~~~~~~~~~~~~~~~~~~
--
--- Roundtripping is based on the ideas of PEP383. However, unlike
--- PEP383 we do not wish to use lone surrogate codepoints to escape
--- undecodable bytes, because that may confuse Unicode processing
--- software written in Haskell. Instead, we use the range of
--- private-use characters from 0xEF80 to 0xEFFF designated for
--- "encoding hacks" by the ConScript Unicode Registery.
+-- Roundtripping is based on the ideas of PEP383.
--
--- This introduces a technical problem when it comes to encoding back
--- to bytes using iconv. The iconv code will not fail when it tries to
--- encode a private-use character (as it would if trying to encode a
--- surrogate), which means that we won't get a chance to replace it
+-- We used to use the range of private-use characters from 0xEF80 to
+-- 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registery
+-- to encode these characters.
+--
+-- However, people didn't like this because it means we don't get
+-- guaranteed roundtripping for byte sequences that look like a UTF-8
+-- encoded codepoint 0xEFxx.
+--
+-- So now like PEP383 we use lone surrogate codepoints 0xDCxx to escape
+-- undecodable bytes, even though that may confuse Unicode processing
+-- software written in Haskell. This guarantees roundtripping because
+-- unicode input that includes lone surrogate codepoints is invalid by
+-- definition.
+--
+-- When we used private-use characters there was a technical problem when it
+-- came to encoding back to bytes using iconv. The iconv code will not fail when
+-- it tries to encode a private-use character (as it would if trying to encode
+-- a surrogate), which means that we won't get a chance to replace it
-- with the byte we originally escaped.
--
-- To work around this, when filling the buffer to be encoded (in
--- writeBlocks/withEncodedCString/newEncodedCString), we replace the
+-- writeBlocks/withEncodedCString/newEncodedCString), we replaced the
-- private-use characters with lone surrogates again! Likewise, when
-- reading from a buffer (unpack/unpack_nl/peekEncodedCString) we have
-- to do the inverse process.
--
--- The user of String should never see these lone surrogates, but it
+-- The user of String would never see these lone surrogates, but it
-- ensures that iconv will throw an error when encountering them. We
-- use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.
@@ -118,26 +126,6 @@ isSurrogate c = (0xD800 <= x && x <= 0xDBFF)
|| (0xDC00 <= x && x <= 0xDFFF)
where x = ord c
--- | Private use characters (in Strings) --> lone surrogates (in
--- Buffer CharBufElem) (We use some private-use characters for
--- roundtripping unknown bytes through a String)
-{-# INLINE surrogatifyRoundtripCharacter #-}
-surrogatifyRoundtripCharacter :: Char -> Char
-surrogatifyRoundtripCharacter c
- | 0xEF00 <= x && x < 0xF000 = chr (x - (0xEF00 - 0xDC00))
- | otherwise = c
- where x = ord c
-
--- | Lone surrogates (in Buffer CharBufElem) --> private use
--- characters (in Strings) (We use some surrogate characters for
--- roundtripping unknown bytes through a String)
-{-# INLINE desurrogatifyRoundtripCharacter #-}
-desurrogatifyRoundtripCharacter :: Char -> Char
-desurrogatifyRoundtripCharacter c
- | 0xDC00 <= x && x < 0xDD00 = chr (x - (0xDC00 - 0xEF00))
- | otherwise = c
- where x = ord c
-
-- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
{-# INLINE escapeToRoundtripCharacterSurrogate #-}
escapeToRoundtripCharacterSurrogate :: Word8 -> Char
diff --git a/libraries/base/GHC/IO/Handle/Text.hs b/libraries/base/GHC/IO/Handle/Text.hs
index 7162dc28c5..280cebdd03 100644
--- a/libraries/base/GHC/IO/Handle/Text.hs
+++ b/libraries/base/GHC/IO/Handle/Text.hs
@@ -39,7 +39,6 @@ import GHC.IO.FD
import GHC.IO.Buffer
import qualified GHC.IO.BufferedIO as Buffered
import GHC.IO.Exception
-import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter)
import GHC.Exception
import GHC.IO.Handle.Types
import GHC.IO.Handle.Internals
@@ -275,8 +274,7 @@ unpack !buf !r !w acc0
| otherwise = do
-- Here, we are rather careful to only put an *evaluated* character
-- in the output string. Due to pointer tagging, this allows the consumer
- -- to avoid ping-ponging between the actual consumer code and the
- -- code of desurrogatifyRoundtripCharacter
+ -- to avoid ping-ponging between the actual consumer code and the thunk code
#ifdef CHARBUF_UTF16
-- reverse-order decoding of UTF-16
c2 <- peekElemOff pbuf i
@@ -289,8 +287,7 @@ unpack !buf !r !w acc0
{ C# c# -> unpackRB (C# c# : acc) (i-2) }
#else
c <- peekElemOff pbuf i
- case desurrogatifyRoundtripCharacter c of { C# c# ->
- unpackRB (C# c# : acc) (i-1) } -- Note [#5536]
+ unpackRB (c : acc) (i-1)
#endif
in
unpackRB acc0 (w-1)
@@ -313,8 +310,7 @@ unpack_nl !buf !r !w acc0
then unpackRB ('\n':acc) (i-2)
else unpackRB ('\n':acc) (i-1)
else do
- case desurrogatifyRoundtripCharacter c of { C# c# ->
- unpackRB (C# c# : acc) (i-1) } -- Note [#5536]
+ unpackRB (c : acc) (i-1)
in do
c <- peekElemOff pbuf (w-1)
if (c == '\r')
@@ -612,7 +608,7 @@ writeBlocks hdl line_buffered add_nl nl
else do
shoveString n' cs rest
| otherwise = do
- n' <- writeCharBuf raw n (surrogatifyRoundtripCharacter c)
+ n' <- writeCharBuf raw n c
shoveString n' cs rest
in
shoveString 0 s (if add_nl then "\n" else "")