summaryrefslogtreecommitdiff
path: root/libraries/base/tests/IO/encoding004.hs
blob: 62ef5d6a93998cf7ddd1d64a956eaaf88c89de8d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import System.IO
import System.Directory
import Data.Char
import System.Process
import Control.Monad
import qualified Data.ByteString as BS
import System.Environment
import System.Exit
import System.FilePath
import Data.Maybe
import qualified Data.Map as M
import GHC.Foreign
import Control.Exception


decode :: TextEncoding -> BS.ByteString -> IO (Either SomeException String)
decode enc bs = try $ BS.useAsCStringLen bs $ peekCStringLen enc

encode :: TextEncoding -> String -> IO (Either SomeException BS.ByteString)
encode enc cs = try $ withCStringLen enc cs $ BS.packCStringLen

decodeEncode :: TextEncoding -> BS.ByteString -> IO (Either SomeException BS.ByteString)
decodeEncode enc bs = decode enc bs `bind` encode enc

encodedecode :: TextEncoding -> String -> IO (Either SomeException String)
encodedecode enc bs = encode enc bs `bind` decode enc

bind mx fxmy = do
    ei_e_cs <- mx
    case ei_e_cs of
        Left e   -> return (Left e)
        Right cs -> fxmy cs


main :: IO ()
main = forM_ [ ("CP936",  2, "CP936",      Just "CP936-UTF8")  -- Representative (roundtrippable) DBCS
             , ("CP1251", 1, "CP1251",     Just "CP1251-UTF8") -- Representative SBCS
             , ("UTF-8",  4, "CP936-UTF8", Nothing)            -- Sanity check
             ] $ \(enc_name, max_byte_length, file, mb_utf8_file) -> do
    putStrLn $ "== " ++ enc_name

    let fp = "encoded-data" </> file <.> "txt"
    enc <- mkTextEncoding enc_name
    bs <- BS.readFile fp

    -- In a DBCS you should never fail to encode truncated input for two consecutive truncation points,
    -- assuming that the input file is actually error free:
    testTruncations enc max_byte_length bs

    -- Should be able to roundtrip arbitrary rubbish, as long as we use the right encoding
    roundtrip_enc <- mkTextEncoding (enc_name ++ "//ROUNDTRIP")
    testRoundtripping roundtrip_enc bs

    -- Just check that we actually decode to the right thing, for good measure
    case mb_utf8_file of
      Nothing -> return ()
      Just utf8_file -> do
        utf8_bs <- BS.readFile ("encoded-data" </> utf8_file <.> "txt")
        Right expected <- decode utf8 utf8_bs
        Right actual   <- decode enc  bs
        unless (expected == actual) $ do
            putStrLn (bsDiff 0 actual expected)

forTruncations :: BS.ByteString -> (BS.ByteString -> IO a) -> IO [a]
forTruncations bs f = forSplits bs $ \before _ -> f before

forSplits :: BS.ByteString -> (BS.ByteString -> BS.ByteString -> IO a) -> IO [a]
forSplits bs f = forM [(800 * block) + ix | block <- [0..len `div` 800], ix <- [0..100]] $ \i -> uncurry f (BS.splitAt i bs)
    where len = BS.length bs

testTruncations :: TextEncoding -> Int -> BS.ByteString -> IO ()
testTruncations enc max_byte_length bs = do
    failures <- fmap catMaybes $ forTruncations bs $ testTruncation enc

    let failure_map = M.fromList failures
    forM_ failures $ \(i, e) -> do
        let js = [i+1..i+(max_byte_length - 1)]
        case sequence (map (`M.lookup` failure_map) js) of
            Nothing -> return ()
            Just es -> putStrLn ("Failed on consecutive truncated byte indexes " ++ show (i:js) ++ " (" ++ show (e:es) ++ ")")

testTruncation :: TextEncoding -> BS.ByteString -> IO (Maybe (Int, SomeException))
testTruncation enc expected = do
        --putStr (show i ++ ": ") >> hFlush stdout
        ei_e_actual <- decodeEncode enc expected
        case ei_e_actual of
            Left  e      -> return (Just (BS.length expected, e))
            Right actual | expected /= actual -> error $ "Mismatch on success when truncating at byte index " ++ show (BS.length expected)
                         | otherwise          -> return Nothing

testRoundtripping :: TextEncoding -> BS.ByteString -> IO ()
testRoundtripping roundtrip_enc bs = void $ forSplits bs $ \before after -> do
    let expected = before `BS.append` (fromIntegral (BS.length before `mod` 256) `BS.cons` after)
    Right actual <- decodeEncode roundtrip_enc expected
    when (actual /= expected) $ do
        let i_str = show (BS.length before)
        putStrLn $ "Failed to roundtrip given mutant byte at index " ++ i_str ++ " (" ++ bsDiff 0 (BS.unpack actual) (BS.unpack expected) ++ ")"
        -- Possibly useful for debugging porpoises:
        --BS.writeFile (i_str ++ ".expected") expected
        --BS.writeFile (i_str ++ ".actual")   actual

bsDiff :: (Show a, Eq a) => Int -> [a] -> [a] -> String
bsDiff _ [] [] = error "bsDiff"
bsDiff _ [] bs = "actual " ++ show (length bs) ++ " elements shorter than expected"
bsDiff _ as [] = "expected " ++ show (length as) ++ " elements shorter than actual"
bsDiff i (a:as) (b:bs) | a == b    = bsDiff (i + 1) as bs
                       | otherwise = show a ++ " /= " ++ show b ++ " at index " ++ show i