diff options
author | Max Bolingbroke <batterseapower@hotmail.com> | 2011-11-02 13:44:17 +0000 |
---|---|---|
committer | Max Bolingbroke <batterseapower@hotmail.com> | 2011-11-02 13:44:17 +0000 |
commit | 2043afcb0b938bc89257ab64d72a75b44a475c82 (patch) | |
tree | f65b5eac36ec56507b30867756e10208a63be933 | |
parent | 201a47c891d0451c6acb88a8c163d9e74b501e00 (diff) | |
download | haskell-2043afcb0b938bc89257ab64d72a75b44a475c82.tar.gz |
Avoid using iconv for the locale TextEncoding if we can help it
-rw-r--r-- | libraries/base/GHC/IO/Encoding.hs | 54 | ||||
-rw-r--r-- | libraries/base/GHC/IO/Encoding/Iconv.hs | 8 |
2 files changed, 35 insertions, 27 deletions
diff --git a/libraries/base/GHC/IO/Encoding.hs b/libraries/base/GHC/IO/Encoding.hs index 8d98d94336..df14c0014c 100644 --- a/libraries/base/GHC/IO/Encoding.hs +++ b/libraries/base/GHC/IO/Encoding.hs @@ -35,6 +35,7 @@ import GHC.IO.Encoding.Types import GHC.Word #if !defined(mingw32_HOST_OS) import qualified GHC.IO.Encoding.Iconv as Iconv +import System.IO.Unsafe (unsafePerformIO) #else import qualified GHC.IO.Encoding.CodePage as CodePage import Text.Read (reads) @@ -117,13 +118,23 @@ fileSystemEncoding :: TextEncoding foreignEncoding :: TextEncoding #if !defined(mingw32_HOST_OS) -localeEncoding = Iconv.localeEncoding -fileSystemEncoding = Iconv.mkLocaleEncoding RoundtripFailure -foreignEncoding = Iconv.mkLocaleEncoding IgnoreCodingFailure +-- It is rather important that we don't just call Iconv.mkIconvEncoding here +-- because some iconvs (in particular GNU iconv) will brokenly UTF-8 encode +-- lone surrogates without complaint. +-- +-- By going through our Haskell implementations of those encodings, we are +-- guaranteed to catch such errors. +-- +-- FIXME: this is not a complete solution because if the locale encoding is one +-- which we don't have a Haskell-side decoder for, iconv might still ignore the +-- lone surrogate in the input. +localeEncoding = unsafePerformIO $ mkTextEncoding' ErrorOnCodingFailure Iconv.localeEncodingName +fileSystemEncoding = unsafePerformIO $ mkTextEncoding' RoundtripFailure Iconv.localeEncodingName +foreignEncoding = unsafePerformIO $ mkTextEncoding' IgnoreCodingFailure Iconv.localeEncodingName #else -localeEncoding = CodePage.localeEncoding +localeEncoding = CodePage.localeEncoding fileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure -foreignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure +foreignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure #endif -- | An encoding in which Unicode code points are translated to bytes @@ -164,21 +175,8 @@ char8 = Latin1.latin1 -- mkTextEncoding :: String -> IO TextEncoding mkTextEncoding e = case mb_coding_failure_mode of - Nothing -> unknown_encoding - Just cfm -> case enc of - "UTF-8" -> return $ UTF8.mkUTF8 cfm - "UTF-16" -> return $ UTF16.mkUTF16 cfm - "UTF-16LE" -> return $ UTF16.mkUTF16le cfm - "UTF-16BE" -> return $ UTF16.mkUTF16be cfm - "UTF-32" -> return $ UTF32.mkUTF32 cfm - "UTF-32LE" -> return $ UTF32.mkUTF32le cfm - "UTF-32BE" -> return $ UTF32.mkUTF32be cfm -#if defined(mingw32_HOST_OS) - 'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp - _ -> unknown_encoding -#else - _ -> Iconv.mkIconvEncoding cfm enc -#endif + Nothing -> unknown_encoding + Just cfm -> mkTextEncoding' cfm enc where -- The only problem with actually documenting //IGNORE and //TRANSLIT as -- supported suffixes is that they are not necessarily supported with non-GNU iconv @@ -193,6 +191,22 @@ mkTextEncoding e = case mb_coding_failure_mode of unknown_encoding = ioException (IOError Nothing NoSuchThing "mkTextEncoding" ("unknown encoding:" ++ e) Nothing Nothing) +mkTextEncoding' :: CodingFailureMode -> String -> IO TextEncoding +mkTextEncoding' cfm enc = case enc of + "UTF-8" -> return $ UTF8.mkUTF8 cfm + "UTF-16" -> return $ UTF16.mkUTF16 cfm + "UTF-16LE" -> return $ UTF16.mkUTF16le cfm + "UTF-16BE" -> return $ UTF16.mkUTF16be cfm + "UTF-32" -> return $ UTF32.mkUTF32 cfm + "UTF-32LE" -> return $ UTF32.mkUTF32le cfm + "UTF-32BE" -> return $ UTF32.mkUTF32be cfm +#if defined(mingw32_HOST_OS) + 'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp + _ -> unknown_encoding +#else + _ -> Iconv.mkIconvEncoding cfm enc +#endif + latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8) latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for char8 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode diff --git a/libraries/base/GHC/IO/Encoding/Iconv.hs b/libraries/base/GHC/IO/Encoding/Iconv.hs index 2c3ad14621..50cdccb865 100644 --- a/libraries/base/GHC/IO/Encoding/Iconv.hs +++ b/libraries/base/GHC/IO/Encoding/Iconv.hs @@ -23,7 +23,7 @@ module GHC.IO.Encoding.Iconv ( #if !defined(mingw32_HOST_OS) iconvEncoding, mkIconvEncoding, - localeEncoding, mkLocaleEncoding + localeEncodingName #endif ) where @@ -65,12 +65,6 @@ localeEncodingName = unsafePerformIO $ do cstr <- c_localeEncoding peekCAString cstr -- Assume charset names are ASCII -localeEncoding :: TextEncoding -localeEncoding = mkLocaleEncoding ErrorOnCodingFailure - -mkLocaleEncoding :: CodingFailureMode -> TextEncoding -mkLocaleEncoding cfm = unsafePerformIO $ mkIconvEncoding cfm localeEncodingName - -- We hope iconv_t is a storable type. It should be, since it has at least the -- value -1, which is a possible return value from iconv_open. type IConv = CLong -- ToDo: (#type iconv_t) |