diff options
-rw-r--r-- | libraries/base/GHC/Foreign.hs | 9 | ||||
-rw-r--r-- | libraries/base/GHC/IO/Encoding/Failure.hs | 54 | ||||
-rw-r--r-- | libraries/base/GHC/IO/Handle/Text.hs | 12 |
3 files changed, 29 insertions, 46 deletions
diff --git a/libraries/base/GHC/Foreign.hs b/libraries/base/GHC/Foreign.hs index f6f0272164..4eef2ebc2f 100644 --- a/libraries/base/GHC/Foreign.hs +++ b/libraries/base/GHC/Foreign.hs @@ -63,7 +63,6 @@ import GHC.Base import GHC.IO import GHC.IO.Exception import GHC.IO.Buffer -import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter) import GHC.IO.Encoding.Types @@ -173,7 +172,7 @@ peekEncodedCString (TextEncoding { mkTextDecoder = mk_decoder }) (p, sz_bytes) if isEmptyBuffer from' then -- No input remaining: @why@ will be InputUnderflow, but we don't care - fmap (map desurrogatifyRoundtripCharacter) $ withBuffer to' $ peekArray (bufferElems to') + withBuffer to' $ peekArray (bufferElems to') else do -- Input remaining: what went wrong? putDebugMsg ("peekEncodedCString: " ++ show iteration ++ " " ++ show why) @@ -183,7 +182,7 @@ peekEncodedCString (TextEncoding { mkTextDecoder = mk_decoder }) (p, sz_bytes) putDebugMsg ("peekEncodedCString: from " ++ summaryBuffer from ++ " " ++ summaryBuffer from' ++ " " ++ summaryBuffer from'') putDebugMsg ("peekEncodedCString: to " ++ summaryBuffer to ++ " " ++ summaryBuffer to' ++ " " ++ summaryBuffer to'') to_chars <- withBuffer to'' $ peekArray (bufferElems to'') - fmap (map desurrogatifyRoundtripCharacter to_chars++) $ go (iteration + 1) from'' + fmap (to_chars++) $ go (iteration + 1) from'' go (0 :: Int) from0 @@ -194,7 +193,7 @@ withEncodedCString :: TextEncoding -- ^ Encoding of CString to create -> (CStringLen -> IO a) -- ^ Worker that can safely use the allocated memory -> IO a withEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s act - = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do + = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p let go iteration to_sz_bytes = do @@ -214,7 +213,7 @@ newEncodedCString :: TextEncoding -- ^ Encoding of CString to create -> String -- ^ String to encode -> IO CStringLen newEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s - = bracket mk_encoder close $ \encoder -> withArrayLen (map surrogatifyRoundtripCharacter s) $ \sz p -> do + = bracket mk_encoder close $ \encoder -> withArrayLen s $ \sz p -> do from <- fmap (\fp -> bufferAdd sz (emptyBuffer fp sz ReadBuffer)) $ newForeignPtr_ p let go iteration to_p to_sz_bytes = do diff --git a/libraries/base/GHC/IO/Encoding/Failure.hs b/libraries/base/GHC/IO/Encoding/Failure.hs index 6cd475aca3..862de1fb4e 100644 --- a/libraries/base/GHC/IO/Encoding/Failure.hs +++ b/libraries/base/GHC/IO/Encoding/Failure.hs @@ -18,7 +18,6 @@ module GHC.IO.Encoding.Failure ( CodingFailureMode(..), codingFailureModeSuffix, isSurrogate, - surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter, recoverDecode, recoverEncode ) where @@ -61,26 +60,35 @@ data CodingFailureMode -- Note [Roundtripping] -- ~~~~~~~~~~~~~~~~~~~~ -- --- Roundtripping is based on the ideas of PEP383. However, unlike --- PEP383 we do not wish to use lone surrogate codepoints to escape --- undecodable bytes, because that may confuse Unicode processing --- software written in Haskell. Instead, we use the range of --- private-use characters from 0xEF80 to 0xEFFF designated for --- "encoding hacks" by the ConScript Unicode Registery. +-- Roundtripping is based on the ideas of PEP383. -- --- This introduces a technical problem when it comes to encoding back --- to bytes using iconv. The iconv code will not fail when it tries to --- encode a private-use character (as it would if trying to encode a --- surrogate), which means that we won't get a chance to replace it +-- We used to use the range of private-use characters from 0xEF80 to +-- 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registery +-- to encode these characters. +-- +-- However, people didn't like this because it means we don't get +-- guaranteed roundtripping for byte sequences that look like a UTF-8 +-- encoded codepoint 0xEFxx. +-- +-- So now like PEP383 we use lone surrogate codepoints 0xDCxx to escape +-- undecodable bytes, even though that may confuse Unicode processing +-- software written in Haskell. This guarantees roundtripping because +-- unicode input that includes lone surrogate codepoints is invalid by +-- definition. +-- +-- When we used private-use characters there was a technical problem when it +-- came to encoding back to bytes using iconv. The iconv code will not fail when +-- it tries to encode a private-use character (as it would if trying to encode +-- a surrogate), which means that we won't get a chance to replace it -- with the byte we originally escaped. -- -- To work around this, when filling the buffer to be encoded (in --- writeBlocks/withEncodedCString/newEncodedCString), we replace the +-- writeBlocks/withEncodedCString/newEncodedCString), we replaced the -- private-use characters with lone surrogates again! Likewise, when -- reading from a buffer (unpack/unpack_nl/peekEncodedCString) we have -- to do the inverse process. -- --- The user of String should never see these lone surrogates, but it +-- The user of String would never see these lone surrogates, but it -- ensures that iconv will throw an error when encountering them. We -- use lone surrogates in the range 0xDC00 to 0xDCFF for this purpose. @@ -118,26 +126,6 @@ isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF) where x = ord c --- | Private use characters (in Strings) --> lone surrogates (in --- Buffer CharBufElem) (We use some private-use characters for --- roundtripping unknown bytes through a String) -{-# INLINE surrogatifyRoundtripCharacter #-} -surrogatifyRoundtripCharacter :: Char -> Char -surrogatifyRoundtripCharacter c - | 0xEF00 <= x && x < 0xF000 = chr (x - (0xEF00 - 0xDC00)) - | otherwise = c - where x = ord c - --- | Lone surrogates (in Buffer CharBufElem) --> private use --- characters (in Strings) (We use some surrogate characters for --- roundtripping unknown bytes through a String) -{-# INLINE desurrogatifyRoundtripCharacter #-} -desurrogatifyRoundtripCharacter :: Char -> Char -desurrogatifyRoundtripCharacter c - | 0xDC00 <= x && x < 0xDD00 = chr (x - (0xDC00 - 0xEF00)) - | otherwise = c - where x = ord c - -- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem) {-# INLINE escapeToRoundtripCharacterSurrogate #-} escapeToRoundtripCharacterSurrogate :: Word8 -> Char diff --git a/libraries/base/GHC/IO/Handle/Text.hs b/libraries/base/GHC/IO/Handle/Text.hs index 7162dc28c5..280cebdd03 100644 --- a/libraries/base/GHC/IO/Handle/Text.hs +++ b/libraries/base/GHC/IO/Handle/Text.hs @@ -39,7 +39,6 @@ import GHC.IO.FD import GHC.IO.Buffer import qualified GHC.IO.BufferedIO as Buffered import GHC.IO.Exception -import GHC.IO.Encoding.Failure (surrogatifyRoundtripCharacter, desurrogatifyRoundtripCharacter) import GHC.Exception import GHC.IO.Handle.Types import GHC.IO.Handle.Internals @@ -275,8 +274,7 @@ unpack !buf !r !w acc0 | otherwise = do -- Here, we are rather careful to only put an *evaluated* character -- in the output string. Due to pointer tagging, this allows the consumer - -- to avoid ping-ponging between the actual consumer code and the - -- code of desurrogatifyRoundtripCharacter + -- to avoid ping-ponging between the actual consumer code and the thunk code #ifdef CHARBUF_UTF16 -- reverse-order decoding of UTF-16 c2 <- peekElemOff pbuf i @@ -289,8 +287,7 @@ unpack !buf !r !w acc0 { C# c# -> unpackRB (C# c# : acc) (i-2) } #else c <- peekElemOff pbuf i - case desurrogatifyRoundtripCharacter c of { C# c# -> - unpackRB (C# c# : acc) (i-1) } -- Note [#5536] + unpackRB (c : acc) (i-1) #endif in unpackRB acc0 (w-1) @@ -313,8 +310,7 @@ unpack_nl !buf !r !w acc0 then unpackRB ('\n':acc) (i-2) else unpackRB ('\n':acc) (i-1) else do - case desurrogatifyRoundtripCharacter c of { C# c# -> - unpackRB (C# c# : acc) (i-1) } -- Note [#5536] + unpackRB (c : acc) (i-1) in do c <- peekElemOff pbuf (w-1) if (c == '\r') @@ -612,7 +608,7 @@ writeBlocks hdl line_buffered add_nl nl else do shoveString n' cs rest | otherwise = do - n' <- writeCharBuf raw n (surrogatifyRoundtripCharacter c) + n' <- writeCharBuf raw n c shoveString n' cs rest in shoveString 0 s (if add_nl then "\n" else "") |