summaryrefslogtreecommitdiff
path: root/libraries/base/tests/IO/encoding005.hs
blob: b4ee38150777404363430f0cfbcc1d2ae23b52f1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import Control.Monad
import Data.Word (Word8)
import Foreign.Ptr
import Foreign.Marshal.Array
import GHC.Foreign (peekCStringLen, withCStringLen)
import GHC.IO.Encoding.Failure (CodingFailureMode(..))
import qualified GHC.IO.Encoding.Latin1 as Latin1
import System.IO
import System.IO.Error

-- Tests for single-byte encodings that map directly to Unicode
-- (module GHC.IO.Encoding.Latin1)

eitherToMaybe :: Either a b -> Maybe b
eitherToMaybe (Left _) = Nothing
eitherToMaybe (Right b) = Just b

decode :: TextEncoding -> [Word8] -> IO (Maybe String)
decode enc xs = fmap eitherToMaybe . tryIOError $ withArrayLen xs (\sz p -> peekCStringLen enc (castPtr p, sz))

encode :: TextEncoding -> String -> IO (Maybe [Word8])
encode enc cs = fmap eitherToMaybe . tryIOError $ withCStringLen enc cs (\(p, sz) -> peekArray sz (castPtr p))

testIO :: (Eq a, Show a) => IO a -> a -> IO ()
testIO action expected = do
  result <- action
  when (result /= expected) $
    putStrLn $ "Test failed: expected " ++ show expected ++ ", but got " ++ show result

-- Test char8-like encodings
test_char8 :: TextEncoding -> IO ()
test_char8 enc = do
  testIO (decode enc [0..0xff]) $ Just ['\0'..'\xff']

  testIO (encode enc ['\0'..'\x200']) $ Just ([0..0xff] ++ [0..0xff] ++ [0])

-- Test latin1-like encodings
test_latin1 :: CodingFailureMode -> TextEncoding -> IO ()
test_latin1 cfm enc = do
  testIO (decode enc [0..0xff]) $ Just ['\0'..'\xff']

  testIO (encode enc ['\0'..'\xff']) $ Just [0..0xff]
  testIO (encode enc "\xfe\xff\x100\x101\x100\xff\xfe") $ case cfm of
    ErrorOnCodingFailure -> Nothing
    IgnoreCodingFailure -> Just [0xfe,0xff,0xff,0xfe]
    TransliterateCodingFailure -> Just [0xfe,0xff,0x3f,0x3f,0x3f,0xff,0xfe]
    -- N.B. The argument "latin1//TRANSLIT" to mkTextEncoding does not
    -- correspond to "latin1//TRANSLIT" in iconv! Instead GHC asks iconv
    -- to encode to "latin1" and uses its own "evil hack" to insert '?'
    -- (ASCII 0x3f) in place of failures. See GHC.IO.Encoding.recoverEncode.
    --
    -- U+0100 is LATIN CAPITAL LETTER A WITH MACRON, which iconv would
    -- transliterate to 'A' (ASCII 0x41). Similarly iconv would
    -- transliterate U+0101 LATIN SMALL LETTER A WITH MACRON to 'a'
    -- (ASCII 0x61).
    RoundtripFailure -> Nothing

test_ascii :: CodingFailureMode -> TextEncoding -> IO ()
test_ascii cfm enc = do
  testIO (decode enc [0..0x7f]) $ Just ['\0'..'\x7f']
  testIO (decode enc [0x7e,0x7f,0x80,0x81,0x80,0x7f,0x7e]) $ case cfm of
    ErrorOnCodingFailure -> Nothing
    IgnoreCodingFailure -> Just "\x7e\x7f\x7f\x7e"
    TransliterateCodingFailure -> Just "\x7e\x7f\xfffd\xfffd\xfffd\x7f\x7e"
    -- Another GHC special: decode invalid input to the Char U+FFFD
    -- REPLACEMENT CHARACTER.
    RoundtripFailure -> Just "\x7e\x7f\xdc80\xdc81\xdc80\x7f\x7e"
    -- GHC's PEP383-style String-encoding of invalid input,
    -- see Note [Roundtripping]

  testIO (encode enc ['\0'..'\x7f']) $ Just [0..0x7f]
  testIO (encode enc "\x7e\x7f\x80\x81\x80\x7f\xe9") $ case cfm of
    ErrorOnCodingFailure -> Nothing
    IgnoreCodingFailure -> Just [0x7e,0x7f,0x7f]
    TransliterateCodingFailure -> Just [0x7e,0x7f,0x3f,0x3f,0x3f,0x7f,0x3f]
    -- See comment in test_latin1. iconv -t ASCII//TRANSLIT would encode
    -- U+00E9 LATIN SMALL LETTER E WITH ACUTE as 'e' (ASCII 0x65).
    RoundtripFailure -> Nothing

  -- Test roundtripping for good measure
  case cfm of
    RoundtripFailure -> do
      Just s <- decode enc [0..0xff]
      testIO (encode enc s) $ Just [0..0xff]
    _ -> return ()

main = do
  putStrLn "char8 tests"
  test_char8 char8              -- char8 never fails in either direction

  -- These use GHC's own implementation
  putStrLn "Latin1.ascii tests"
  test_ascii ErrorOnCodingFailure (Latin1.ascii)
  test_ascii IgnoreCodingFailure (Latin1.mkAscii IgnoreCodingFailure)
  test_ascii TransliterateCodingFailure (Latin1.mkAscii TransliterateCodingFailure)
  test_ascii RoundtripFailure (Latin1.mkAscii RoundtripFailure)

  putStrLn "Latin1.latin1_checked tests"
  test_latin1 ErrorOnCodingFailure (Latin1.latin1_checked)
  test_latin1 IgnoreCodingFailure (Latin1.mkLatin1_checked IgnoreCodingFailure)
  test_latin1 TransliterateCodingFailure (Latin1.mkLatin1_checked TransliterateCodingFailure)
  test_latin1 RoundtripFailure (Latin1.mkLatin1_checked RoundtripFailure)

  -- These use iconv (normally, unless it is broken)
  putStrLn "mkTextEncoding ASCII tests"
  test_ascii ErrorOnCodingFailure =<< mkTextEncoding "ASCII"
  test_ascii IgnoreCodingFailure =<< mkTextEncoding "ASCII//IGNORE"
  test_ascii TransliterateCodingFailure =<< mkTextEncoding "ASCII//TRANSLIT"
  test_ascii RoundtripFailure =<< mkTextEncoding "ASCII//ROUNDTRIP"

  putStrLn "mkTextEncoding latin1 tests"
  test_latin1 ErrorOnCodingFailure =<< mkTextEncoding "latin1"
  test_latin1 IgnoreCodingFailure =<< mkTextEncoding "latin1//IGNORE"
  test_latin1 TransliterateCodingFailure =<< mkTextEncoding "latin1//TRANSLIT"
  test_latin1 RoundtripFailure =<< mkTextEncoding "latin1//ROUNDTRIP"