1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
import Control.Monad
import Data.Word (Word8)
import Foreign.Ptr
import Foreign.Marshal.Array
import GHC.Foreign (peekCStringLen, withCStringLen)
import GHC.IO.Encoding.Failure (CodingFailureMode(..))
import qualified GHC.IO.Encoding.Latin1 as Latin1
import System.IO
import System.IO.Error
-- Tests for single-byte encodings that map directly to Unicode
-- (module GHC.IO.Encoding.Latin1)
eitherToMaybe :: Either a b -> Maybe b
eitherToMaybe (Left _) = Nothing
eitherToMaybe (Right b) = Just b
decode :: TextEncoding -> [Word8] -> IO (Maybe String)
decode enc xs = fmap eitherToMaybe . tryIOError $ withArrayLen xs (\sz p -> peekCStringLen enc (castPtr p, sz))
encode :: TextEncoding -> String -> IO (Maybe [Word8])
encode enc cs = fmap eitherToMaybe . tryIOError $ withCStringLen enc cs (\(p, sz) -> peekArray sz (castPtr p))
testIO :: (Eq a, Show a) => IO a -> a -> IO ()
testIO action expected = do
result <- action
when (result /= expected) $
putStrLn $ "Test failed: expected " ++ show expected ++ ", but got " ++ show result
-- Test char8-like encodings
test_char8 :: TextEncoding -> IO ()
test_char8 enc = do
testIO (decode enc [0..0xff]) $ Just ['\0'..'\xff']
testIO (encode enc ['\0'..'\x200']) $ Just ([0..0xff] ++ [0..0xff] ++ [0])
-- Test latin1-like encodings
test_latin1 :: CodingFailureMode -> TextEncoding -> IO ()
test_latin1 cfm enc = do
testIO (decode enc [0..0xff]) $ Just ['\0'..'\xff']
testIO (encode enc ['\0'..'\xff']) $ Just [0..0xff]
testIO (encode enc "\xfe\xff\x100\x101\x100\xff\xfe") $ case cfm of
ErrorOnCodingFailure -> Nothing
IgnoreCodingFailure -> Just [0xfe,0xff,0xff,0xfe]
TransliterateCodingFailure -> Just [0xfe,0xff,0x3f,0x3f,0x3f,0xff,0xfe]
-- N.B. The argument "latin1//TRANSLIT" to mkTextEncoding does not
-- correspond to "latin1//TRANSLIT" in iconv! Instead GHC asks iconv
-- to encode to "latin1" and uses its own "evil hack" to insert '?'
-- (ASCII 0x3f) in place of failures. See GHC.IO.Encoding.recoverEncode.
--
-- U+0100 is LATIN CAPITAL LETTER A WITH MACRON, which iconv would
-- transliterate to 'A' (ASCII 0x41). Similarly iconv would
-- transliterate U+0101 LATIN SMALL LETTER A WITH MACRON to 'a'
-- (ASCII 0x61).
RoundtripFailure -> Nothing
test_ascii :: CodingFailureMode -> TextEncoding -> IO ()
test_ascii cfm enc = do
testIO (decode enc [0..0x7f]) $ Just ['\0'..'\x7f']
testIO (decode enc [0x7e,0x7f,0x80,0x81,0x80,0x7f,0x7e]) $ case cfm of
ErrorOnCodingFailure -> Nothing
IgnoreCodingFailure -> Just "\x7e\x7f\x7f\x7e"
TransliterateCodingFailure -> Just "\x7e\x7f\xfffd\xfffd\xfffd\x7f\x7e"
-- Another GHC special: decode invalid input to the Char U+FFFD
-- REPLACEMENT CHARACTER.
RoundtripFailure -> Just "\x7e\x7f\xdc80\xdc81\xdc80\x7f\x7e"
-- GHC's PEP383-style String-encoding of invalid input,
-- see Note [Roundtripping]
testIO (encode enc ['\0'..'\x7f']) $ Just [0..0x7f]
testIO (encode enc "\x7e\x7f\x80\x81\x80\x7f\xe9") $ case cfm of
ErrorOnCodingFailure -> Nothing
IgnoreCodingFailure -> Just [0x7e,0x7f,0x7f]
TransliterateCodingFailure -> Just [0x7e,0x7f,0x3f,0x3f,0x3f,0x7f,0x3f]
-- See comment in test_latin1. iconv -t ASCII//TRANSLIT would encode
-- U+00E9 LATIN SMALL LETTER E WITH ACUTE as 'e' (ASCII 0x65).
RoundtripFailure -> Nothing
-- Test roundtripping for good measure
case cfm of
RoundtripFailure -> do
Just s <- decode enc [0..0xff]
testIO (encode enc s) $ Just [0..0xff]
_ -> return ()
main = do
putStrLn "char8 tests"
test_char8 char8 -- char8 never fails in either direction
-- These use GHC's own implementation
putStrLn "Latin1.ascii tests"
test_ascii ErrorOnCodingFailure (Latin1.ascii)
test_ascii IgnoreCodingFailure (Latin1.mkAscii IgnoreCodingFailure)
test_ascii TransliterateCodingFailure (Latin1.mkAscii TransliterateCodingFailure)
test_ascii RoundtripFailure (Latin1.mkAscii RoundtripFailure)
putStrLn "Latin1.latin1_checked tests"
test_latin1 ErrorOnCodingFailure (Latin1.latin1_checked)
test_latin1 IgnoreCodingFailure (Latin1.mkLatin1_checked IgnoreCodingFailure)
test_latin1 TransliterateCodingFailure (Latin1.mkLatin1_checked TransliterateCodingFailure)
test_latin1 RoundtripFailure (Latin1.mkLatin1_checked RoundtripFailure)
-- These use iconv (normally, unless it is broken)
putStrLn "mkTextEncoding ASCII tests"
test_ascii ErrorOnCodingFailure =<< mkTextEncoding "ASCII"
test_ascii IgnoreCodingFailure =<< mkTextEncoding "ASCII//IGNORE"
test_ascii TransliterateCodingFailure =<< mkTextEncoding "ASCII//TRANSLIT"
test_ascii RoundtripFailure =<< mkTextEncoding "ASCII//ROUNDTRIP"
putStrLn "mkTextEncoding latin1 tests"
test_latin1 ErrorOnCodingFailure =<< mkTextEncoding "latin1"
test_latin1 IgnoreCodingFailure =<< mkTextEncoding "latin1//IGNORE"
test_latin1 TransliterateCodingFailure =<< mkTextEncoding "latin1//TRANSLIT"
test_latin1 RoundtripFailure =<< mkTextEncoding "latin1//ROUNDTRIP"
|