summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBen Gamari <ben@smart-cactus.org>2022-07-16 13:54:57 -0400
committerMarge Bot <ben+marge-bot@smart-cactus.org>2022-07-22 08:18:41 -0400
commitae1666353696b5d85938d8a2f5fb11fb66f21678 (patch)
tree7de167e7c2f27ec2acc300ba815677878c1bd738
parent2c5991ccaf45cb7e68e54d59a27ee144a4499edb (diff)
downloadhaskell-ae1666353696b5d85938d8a2f5fb11fb66f21678.tar.gz
ghc-boot: Clean up UTF-8 codecs
In preparation for moving the UTF-8 codecs into `base`: * Move them to GHC.Utils.Encoding.UTF8 * Make names more consistent * Add some Haddocks
-rw-r--r--compiler/GHC/CmmToAsm/Dwarf/Types.hs2
-rw-r--r--compiler/GHC/Data/FastString.hs6
-rw-r--r--compiler/GHC/Data/StringBuffer.hs10
-rw-r--r--compiler/GHC/Hs/DocString.hs2
-rw-r--r--compiler/GHC/StgToCmm/Prof.hs4
-rw-r--r--compiler/GHC/Types/Literal.hs2
-rw-r--r--libraries/ghc-boot/GHC/Data/ShortText.hs5
-rw-r--r--libraries/ghc-boot/GHC/Utils/Encoding.hs302
-rw-r--r--libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs344
-rw-r--r--libraries/ghc-boot/ghc-boot.cabal.in1
10 files changed, 363 insertions, 315 deletions
diff --git a/compiler/GHC/CmmToAsm/Dwarf/Types.hs b/compiler/GHC/CmmToAsm/Dwarf/Types.hs
index e29f03e1d6..caa829db21 100644
--- a/compiler/GHC/CmmToAsm/Dwarf/Types.hs
+++ b/compiler/GHC/CmmToAsm/Dwarf/Types.hs
@@ -599,7 +599,7 @@ pprString str
= pprString' $ hcat $ map escapeChar $
if str `lengthIs` utf8EncodedLength str
then str
- else map (chr . fromIntegral) $ BS.unpack $ utf8EncodeString str
+ else map (chr . fromIntegral) $ BS.unpack $ utf8EncodeByteString str
-- | Escape a single non-unicode character
escapeChar :: Char -> SDoc
diff --git a/compiler/GHC/Data/FastString.hs b/compiler/GHC/Data/FastString.hs
index 131f174c41..98ca34c249 100644
--- a/compiler/GHC/Data/FastString.hs
+++ b/compiler/GHC/Data/FastString.hs
@@ -531,13 +531,13 @@ mkFastString :: String -> FastString
{-# NOINLINE[1] mkFastString #-}
mkFastString str =
inlinePerformIO $ do
- sbs <- utf8EncodeShortByteString str
+ let !sbs = utf8EncodeShortByteString str
mkFastStringWith (mkNewFastStringShortByteString sbs) sbs
-- The following rule is used to avoid polluting the non-reclaimable FastString
-- table with transient strings when we only want their encoding.
{-# RULES
-"bytesFS/mkFastString" forall x. bytesFS (mkFastString x) = utf8EncodeString x #-}
+"bytesFS/mkFastString" forall x. bytesFS (mkFastString x) = utf8EncodeByteString x #-}
-- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@
mkFastStringByteList :: [Word8] -> FastString
@@ -554,7 +554,7 @@ mkNewFastStringShortByteString :: ShortByteString -> Int
-> FastMutInt -> IO FastString
mkNewFastStringShortByteString sbs uid n_zencs = do
let zstr = mkZFastString n_zencs sbs
- chars <- countUTF8Chars sbs
+ chars = utf8CountCharsShortByteString sbs
return (FastString uid chars sbs zstr)
hashStr :: ShortByteString -> Int
diff --git a/compiler/GHC/Data/StringBuffer.hs b/compiler/GHC/Data/StringBuffer.hs
index e6dcb14b6b..1426cf26e3 100644
--- a/compiler/GHC/Data/StringBuffer.hs
+++ b/compiler/GHC/Data/StringBuffer.hs
@@ -199,7 +199,7 @@ stringToStringBuffer str =
let size = utf8EncodedLength str
buf <- mallocForeignPtrArray (size+3)
unsafeWithForeignPtr buf $ \ptr -> do
- utf8EncodeStringPtr ptr str
+ utf8EncodePtr ptr str
pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
-- sentinels for UTF-8 decoding
return (StringBuffer buf size 0)
@@ -297,7 +297,7 @@ prevChar (StringBuffer buf _ cur) _ =
inlinePerformIO $
unsafeWithForeignPtr buf $ \p -> do
p' <- utf8PrevChar (p `plusPtr` cur)
- return (fst (utf8DecodeChar p'))
+ return (fst (utf8DecodeCharPtr p'))
-- -----------------------------------------------------------------------------
-- Moving
@@ -383,7 +383,7 @@ lexemeToString :: StringBuffer
-> String
lexemeToString _ 0 = ""
lexemeToString (StringBuffer buf _ cur) bytes =
- utf8DecodeStringLazy buf cur bytes
+ utf8DecodeForeignPtr buf cur bytes
lexemeToFastString :: StringBuffer
-> Int -- ^ @n@, the number of bytes
@@ -405,7 +405,7 @@ decodePrevNChars n (StringBuffer buf _ cur) =
go buf0 n acc p | n == 0 || buf0 >= p = return acc
go buf0 n acc p = do
p' <- utf8PrevChar p
- let (c,_) = utf8DecodeChar p'
+ let (c,_) = utf8DecodeCharPtr p'
go buf0 (n - 1) (c:acc) p'
-- -----------------------------------------------------------------------------
@@ -414,7 +414,7 @@ parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
= inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
go i x | i == len = x
- | otherwise = case fst (utf8DecodeChar (ptr `plusPtr` (cur + i))) of
+ | otherwise = case fst (utf8DecodeCharPtr (ptr `plusPtr` (cur + i))) of
'_' -> go (i + 1) x -- skip "_" (#14473)
char -> go (i + 1) (x * radix + toInteger (char_to_int char))
in go 0 0
diff --git a/compiler/GHC/Hs/DocString.hs b/compiler/GHC/Hs/DocString.hs
index 3a557ee0e8..7e35004237 100644
--- a/compiler/GHC/Hs/DocString.hs
+++ b/compiler/GHC/Hs/DocString.hs
@@ -137,7 +137,7 @@ instance Outputable HsDocStringChunk where
mkHsDocStringChunk :: String -> HsDocStringChunk
-mkHsDocStringChunk s = HsDocStringChunk (utf8EncodeString s)
+mkHsDocStringChunk s = HsDocStringChunk (utf8EncodeByteString s)
-- | Create a 'HsDocString' from a UTF8-encoded 'ByteString'.
mkHsDocStringChunkUtf8ByteString :: ByteString -> HsDocStringChunk
diff --git a/compiler/GHC/StgToCmm/Prof.hs b/compiler/GHC/StgToCmm/Prof.hs
index 478925122c..8c8f89dbe9 100644
--- a/compiler/GHC/StgToCmm/Prof.hs
+++ b/compiler/GHC/StgToCmm/Prof.hs
@@ -230,7 +230,7 @@ emitCostCentreDecl cc = do
; modl <- newByteStringCLit (bytesFS $ moduleNameFS
$ moduleName
$ cc_mod cc)
- ; loc <- newByteStringCLit $ utf8EncodeString $
+ ; loc <- newByteStringCLit $ utf8EncodeByteString $
renderWithContext ctx (ppr $! costCentreSrcSpan cc)
; let
lits = [ zero platform, -- StgInt ccID,
@@ -297,7 +297,7 @@ emitInfoTableProv ip = do
ctx = stgToCmmContext cfg
platform = stgToCmmPlatform cfg
; let (src, label) = maybe ("", "") (first (renderWithContext ctx . ppr)) (infoTableProv ip)
- mk_string = newByteStringCLit . utf8EncodeString
+ mk_string = newByteStringCLit . utf8EncodeByteString
; label <- mk_string label
; modl <- newByteStringCLit (bytesFS $ moduleNameFS
$ moduleName mod)
diff --git a/compiler/GHC/Types/Literal.hs b/compiler/GHC/Types/Literal.hs
index 5b14ecc78d..b525fc94df 100644
--- a/compiler/GHC/Types/Literal.hs
+++ b/compiler/GHC/Types/Literal.hs
@@ -578,7 +578,7 @@ mkLitChar = LitChar
mkLitString :: String -> Literal
-- stored UTF-8 encoded
mkLitString [] = LitString mempty
-mkLitString s = LitString (utf8EncodeString s)
+mkLitString s = LitString (utf8EncodeByteString s)
mkLitBigNat :: Integer -> Literal
mkLitBigNat x = assertPpr (x >= 0) (integer x)
diff --git a/libraries/ghc-boot/GHC/Data/ShortText.hs b/libraries/ghc-boot/GHC/Data/ShortText.hs
index 2b3038ccfc..929b65b481 100644
--- a/libraries/ghc-boot/GHC/Data/ShortText.hs
+++ b/libraries/ghc-boot/GHC/Data/ShortText.hs
@@ -67,14 +67,15 @@ instance IsString ShortText where
-- | /O(n)/ Returns the length of the 'ShortText' in characters.
codepointLength :: ShortText -> Int
-codepointLength st = unsafeDupablePerformIO $ countUTF8Chars (contents st)
+codepointLength st = utf8CountCharsShortByteString (contents st)
+
-- | /O(1)/ Returns the length of the 'ShortText' in bytes.
byteLength :: ShortText -> Int
byteLength st = SBS.length $ contents st
-- | /O(n)/ Convert a 'String' into a 'ShortText'.
pack :: String -> ShortText
-pack s = unsafeDupablePerformIO $ ShortText <$> utf8EncodeShortByteString s
+pack s = ShortText $ utf8EncodeShortByteString s
-- | /O(n)/ Convert a 'ShortText' into a 'String'.
unpack :: ShortText -> String
diff --git a/libraries/ghc-boot/GHC/Utils/Encoding.hs b/libraries/ghc-boot/GHC/Utils/Encoding.hs
index 16386c69ca..183d29946f 100644
--- a/libraries/ghc-boot/GHC/Utils/Encoding.hs
+++ b/libraries/ghc-boot/GHC/Utils/Encoding.hs
@@ -17,21 +17,7 @@
module GHC.Utils.Encoding (
-- * UTF-8
- utf8DecodeCharAddr#,
- utf8PrevChar,
- utf8CharStart,
- utf8DecodeChar,
- utf8DecodeByteString,
- utf8UnconsByteString,
- utf8DecodeShortByteString,
- utf8CompareShortByteString,
- utf8DecodeStringLazy,
- utf8EncodeChar,
- utf8EncodeString,
- utf8EncodeStringPtr,
- utf8EncodeShortByteString,
- utf8EncodedLength,
- countUTF8Chars,
+ module GHC.Utils.Encoding.UTF8,
-- * Z-encoding
UserString,
@@ -47,295 +33,11 @@ module GHC.Utils.Encoding (
import Prelude
import Foreign
-import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
import Data.Char
import qualified Data.Char as Char
import Numeric
-import GHC.IO
-import GHC.ST
-import Data.ByteString (ByteString)
-import qualified Data.ByteString.Internal as BS
-import Data.ByteString.Short.Internal (ShortByteString(..))
-
-import GHC.Exts
-
--- -----------------------------------------------------------------------------
--- UTF-8
-
--- We can't write the decoder as efficiently as we'd like without
--- resorting to unboxed extensions, unfortunately. I tried to write
--- an IO version of this function, but GHC can't eliminate boxed
--- results from an IO-returning function.
---
--- We assume we can ignore overflow when parsing a multibyte character here.
--- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences
--- before decoding them (see "GHC.Data.StringBuffer").
-
-{-# INLINE utf8DecodeChar# #-}
-utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
-utf8DecodeChar# indexWord8# =
- let !ch0 = word2Int# (indexWord8# 0#) in
- case () of
- _ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #)
-
- | isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) ->
- let !ch1 = word2Int# (indexWord8# 1#) in
- if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
- (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
- (ch1 -# 0x80#)),
- 2# #)
-
- | isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) ->
- let !ch1 = word2Int# (indexWord8# 1#) in
- if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
- let !ch2 = word2Int# (indexWord8# 2#) in
- if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
- (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
- ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +#
- (ch2 -# 0x80#)),
- 3# #)
-
- | isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) ->
- let !ch1 = word2Int# (indexWord8# 1#) in
- if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
- let !ch2 = word2Int# (indexWord8# 2#) in
- if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
- let !ch3 = word2Int# (indexWord8# 3#) in
- if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else
- (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
- ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
- ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +#
- (ch3 -# 0x80#)),
- 4# #)
-
- | otherwise -> fail 1#
- where
- -- all invalid sequences end up here:
- fail :: Int# -> (# Char#, Int# #)
- fail nBytes# = (# '\0'#, nBytes# #)
- -- '\xFFFD' would be the usual replacement character, but
- -- that's a valid symbol in Haskell, so will result in a
- -- confusing parse error later on. Instead we use '\0' which
- -- will signal a lexer error immediately.
-
-utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
-utf8DecodeCharAddr# a# off# =
-#if !MIN_VERSION_base(4,16,0)
- utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#))
-#else
- utf8DecodeChar# (\i# -> word8ToWord# (indexWord8OffAddr# a# (i# +# off#)))
-#endif
-
-utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
-utf8DecodeCharByteArray# ba# off# =
-#if !MIN_VERSION_base(4,16,0)
- utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
-#else
- utf8DecodeChar# (\i# -> word8ToWord# (indexWord8Array# ba# (i# +# off#)))
-#endif
-
-
-utf8DecodeChar :: Ptr Word8 -> (Char, Int)
-utf8DecodeChar !(Ptr a#) =
- case utf8DecodeCharAddr# a# 0# of
- (# c#, nBytes# #) -> ( C# c#, I# nBytes# )
-
--- UTF-8 is cleverly designed so that we can always figure out where
--- the start of the current character is, given any position in a
--- stream. This function finds the start of the previous character,
--- assuming there *is* a previous character.
-utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
-utf8PrevChar p = utf8CharStart (p `plusPtr` (-1))
-
-utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
-utf8CharStart p = go p
- where go p = do w <- peek p
- if w >= 0x80 && w < 0xC0
- then go (p `plusPtr` (-1))
- else return p
-
-{-# INLINE utf8DecodeLazy# #-}
-utf8DecodeLazy# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
-utf8DecodeLazy# retain decodeChar# len#
- = unpack 0#
- where
- unpack i#
- | isTrue# (i# >=# len#) = retain >> return []
- | otherwise =
- case decodeChar# i# of
- (# c#, nBytes# #) -> do
- rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#)
- return (C# c# : rest)
-
-utf8DecodeByteString :: ByteString -> [Char]
-utf8DecodeByteString (BS.PS fptr offset len)
- = utf8DecodeStringLazy fptr offset len
-
-utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString)
-utf8UnconsByteString (BS.PS _ _ 0) = Nothing
-utf8UnconsByteString (BS.PS fptr offset len)
- = unsafeDupablePerformIO $
- withForeignPtr fptr $ \ptr -> do
- let (c,n) = utf8DecodeChar (ptr `plusPtr` offset)
- return $ Just (c, BS.PS fptr (offset + n) (len - n))
-
-utf8DecodeStringLazy :: ForeignPtr Word8 -> Int -> Int -> [Char]
-utf8DecodeStringLazy fp offset (I# len#)
- = unsafeDupablePerformIO $ do
- let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset
- utf8DecodeLazy# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len#
--- Note that since utf8DecodeLazy# returns a thunk the lifetime of the
--- ForeignPtr actually needs to be longer than the lexical lifetime
--- withForeignPtr would provide here. That's why we use touchForeignPtr to
--- keep the fp alive until the last character has actually been decoded.
-
-utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
-utf8CompareShortByteString (SBS a1) (SBS a2) = go 0# 0#
- -- UTF-8 has the property that sorting by bytes values also sorts by
- -- code-points.
- -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property
- -- doesn't hold and we must explicitly check this case here.
- -- Note that decoding every code point would also work but it would be much
- -- more costly.
- where
- !sz1 = sizeofByteArray# a1
- !sz2 = sizeofByteArray# a2
- go off1 off2
- | isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ
- | isTrue# (off1 >=# sz1) = LT
- | isTrue# (off2 >=# sz2) = GT
- | otherwise =
-#if !MIN_VERSION_base(4,16,0)
- let !b1_1 = indexWord8Array# a1 off1
- !b2_1 = indexWord8Array# a2 off2
-#else
- let !b1_1 = word8ToWord# (indexWord8Array# a1 off1)
- !b2_1 = word8ToWord# (indexWord8Array# a2 off2)
-#endif
- in case b1_1 of
- 0xC0## -> case b2_1 of
- 0xC0## -> go (off1 +# 1#) (off2 +# 1#)
-#if !MIN_VERSION_base(4,16,0)
- _ -> case indexWord8Array# a1 (off1 +# 1#) of
-#else
- _ -> case word8ToWord# (indexWord8Array# a1 (off1 +# 1#)) of
-#endif
- 0x80## -> LT
- _ -> go (off1 +# 1#) (off2 +# 1#)
- _ -> case b2_1 of
-#if !MIN_VERSION_base(4,16,0)
- 0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of
-#else
- 0xC0## -> case word8ToWord# (indexWord8Array# a2 (off2 +# 1#)) of
-#endif
- 0x80## -> GT
- _ -> go (off1 +# 1#) (off2 +# 1#)
- _ | isTrue# (b1_1 `gtWord#` b2_1) -> GT
- | isTrue# (b1_1 `ltWord#` b2_1) -> LT
- | otherwise -> go (off1 +# 1#) (off2 +# 1#)
-
-utf8DecodeShortByteString :: ShortByteString -> [Char]
-utf8DecodeShortByteString (SBS ba#)
- = unsafeDupablePerformIO $
- let len# = sizeofByteArray# ba# in
- utf8DecodeLazy# (return ()) (utf8DecodeCharByteArray# ba#) len#
-
-countUTF8Chars :: ShortByteString -> IO Int
-countUTF8Chars (SBS ba) = go 0# 0#
- where
- len# = sizeofByteArray# ba
- go i# n#
- | isTrue# (i# >=# len#) =
- return (I# n#)
- | otherwise = do
- case utf8DecodeCharByteArray# ba i# of
- (# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#)
-
-{-# INLINE utf8EncodeChar #-}
-utf8EncodeChar :: (Int# -> Word8# -> State# s -> State# s)
- -> Char -> ST s Int
-utf8EncodeChar write# c =
- let x = fromIntegral (ord c) in
- case () of
- _ | x > 0 && x <= 0x007f -> do
- write 0 x
- return 1
- -- NB. '\0' is encoded as '\xC0\x80', not '\0'. This is so that we
- -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8).
- | x <= 0x07ff -> do
- write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F))
- write 1 (0x80 .|. (x .&. 0x3F))
- return 2
- | x <= 0xffff -> do
- write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F)
- write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F)
- write 2 (0x80 .|. (x .&. 0x3F))
- return 3
- | otherwise -> do
- write 0 (0xF0 .|. (x `shiftR` 18))
- write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F))
- write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F))
- write 3 (0x80 .|. (x .&. 0x3F))
- return 4
- where
- {-# INLINE write #-}
- write (I# off#) (W# c#) = ST $ \s ->
-#if !MIN_VERSION_base(4,16,0)
- case write# off# (narrowWord8# c#) s of
-#else
- case write# off# (wordToWord8# c#) s of
-#endif
- s -> (# s, () #)
-
-utf8EncodeString :: String -> ByteString
-utf8EncodeString s =
- unsafePerformIO $ do
- let len = utf8EncodedLength s
- buf <- mallocForeignPtrBytes len
- withForeignPtr buf $ \ptr -> do
- utf8EncodeStringPtr ptr s
- pure (BS.fromForeignPtr buf 0 len)
-
-utf8EncodeStringPtr :: Ptr Word8 -> String -> IO ()
-utf8EncodeStringPtr (Ptr a#) str = go a# str
- where go !_ [] = return ()
- go a# (c:cs) = do
-#if !MIN_VERSION_base(4,16,0)
- -- writeWord8OffAddr# was taking a Word#
- I# off# <- stToIO $ utf8EncodeChar (\i w -> writeWord8OffAddr# a# i (extendWord8# w)) c
-#else
- I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c
-#endif
- go (a# `plusAddr#` off#) cs
-
-utf8EncodeShortByteString :: String -> IO ShortByteString
-utf8EncodeShortByteString str = IO $ \s ->
- case utf8EncodedLength str of { I# len# ->
- case newByteArray# len# s of { (# s, mba# #) ->
- case go mba# 0# str of { ST f_go ->
- case f_go s of { (# s, () #) ->
- case unsafeFreezeByteArray# mba# s of { (# s, ba# #) ->
- (# s, SBS ba# #) }}}}}
- where
- go _ _ [] = return ()
- go mba# i# (c:cs) = do
-#if !MIN_VERSION_base(4,16,0)
- -- writeWord8Array# was taking a Word#
- I# off# <- utf8EncodeChar (\j# w -> writeWord8Array# mba# (i# +# j#) (extendWord8# w)) c
-#else
- I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c
-#endif
- go mba# (i# +# off#) cs
-
-utf8EncodedLength :: String -> Int
-utf8EncodedLength str = go 0 str
- where go !n [] = n
- go n (c:cs)
- | ord c > 0 && ord c <= 0x007f = go (n+1) cs
- | ord c <= 0x07ff = go (n+2) cs
- | ord c <= 0xffff = go (n+3) cs
- | otherwise = go (n+4) cs
+import GHC.Utils.Encoding.UTF8
-- -----------------------------------------------------------------------------
-- Note [Z-Encoding]
diff --git a/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs b/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs
new file mode 100644
index 0000000000..d1d470ab25
--- /dev/null
+++ b/libraries/ghc-boot/GHC/Utils/Encoding/UTF8.hs
@@ -0,0 +1,344 @@
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples, MultiWayIf #-}
+{-# OPTIONS_GHC -O2 -fno-warn-name-shadowing #-}
+-- We always optimise this, otherwise performance of a non-optimised
+-- compiler is severely affected. This module used to live in the `ghc`
+-- package but has been moved to `ghc-boot` because the definition
+-- of the package database (needed in both ghc and in ghc-pkg) lives in
+-- `ghc-boot` and uses ShortText, which in turn depends on this module.
+
+-- | Simple, non-streaming UTF-8 codecs.
+module GHC.Utils.Encoding.UTF8
+ ( -- * Decoding single characters
+ utf8DecodeCharAddr#
+ , utf8DecodeCharPtr
+ , utf8DecodeCharByteArray#
+ , utf8PrevChar
+ , utf8CharStart
+ , utf8UnconsByteString
+ -- * Decoding strings
+ , utf8DecodeByteString
+ , utf8DecodeShortByteString
+ , utf8DecodeForeignPtr
+ , utf8DecodeByteArray#
+ -- * Counting characters
+ , utf8CountCharsShortByteString
+ , utf8CountCharsByteArray#
+ -- * Comparison
+ , utf8CompareByteArray#
+ , utf8CompareShortByteString
+ -- * Encoding strings
+ , utf8EncodeByteArray#
+ , utf8EncodePtr
+ , utf8EncodeByteString
+ , utf8EncodeShortByteString
+ , utf8EncodedLength
+ ) where
+
+
+import Prelude
+
+import Foreign
+import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
+import Data.Char
+import GHC.IO
+import GHC.ST
+
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Internal as BS
+import Data.ByteString.Short.Internal (ShortByteString(..))
+
+import GHC.Exts
+
+-- | Find the start of the codepoint preceding the codepoint at the given
+-- 'Ptr'. This is undefined if there is no previous valid codepoint.
+utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
+utf8PrevChar p = utf8CharStart (p `plusPtr` (-1))
+
+-- | Find the start of the codepoint at the given 'Ptr'. This is undefined if
+-- there is no previous valid codepoint.
+utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
+utf8CharStart p = go p
+ where go p = do w <- peek p
+ if w >= 0x80 && w < 0xC0
+ then go (p `plusPtr` (-1))
+ else return p
+
+utf8CountCharsShortByteString :: ShortByteString -> Int
+utf8CountCharsShortByteString (SBS ba) = utf8CountCharsByteArray# ba
+
+utf8DecodeShortByteString :: ShortByteString -> [Char]
+utf8DecodeShortByteString (SBS ba#) = utf8DecodeByteArray# ba#
+
+-- | Decode a 'ByteString' containing a UTF-8 string.
+utf8DecodeByteString :: ByteString -> [Char]
+utf8DecodeByteString (BS.PS fptr offset len)
+ = utf8DecodeForeignPtr fptr offset len
+
+utf8EncodeShortByteString :: String -> ShortByteString
+utf8EncodeShortByteString str = SBS (utf8EncodeByteArray# str)
+
+-- | Encode a 'String' into a 'ByteString'.
+utf8EncodeByteString :: String -> ByteString
+utf8EncodeByteString s =
+ unsafePerformIO $ do
+ let len = utf8EncodedLength s
+ buf <- mallocForeignPtrBytes len
+ withForeignPtr buf $ \ptr -> do
+ utf8EncodePtr ptr s
+ pure (BS.fromForeignPtr buf 0 len)
+
+utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString)
+utf8UnconsByteString (BS.PS _ _ 0) = Nothing
+utf8UnconsByteString (BS.PS fptr offset len)
+ = unsafeDupablePerformIO $
+ withForeignPtr fptr $ \ptr -> do
+ let (c,n) = utf8DecodeCharPtr (ptr `plusPtr` offset)
+ return $ Just (c, BS.PS fptr (offset + n) (len - n))
+
+utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
+utf8CompareShortByteString (SBS a1) (SBS a2) = utf8CompareByteArray# a1 a2
+
+-- We can't write the decoder as efficiently as we'd like without
+-- resorting to unboxed extensions, unfortunately. I tried to write
+-- an IO version of this function, but GHC can't eliminate boxed
+-- results from an IO-returning function.
+--
+-- We assume we can ignore overflow when parsing a multibyte character here.
+-- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences
+-- before decoding them (see "GHC.Data.StringBuffer").
+
+{-# INLINE utf8DecodeChar# #-}
+-- | Decode a single codepoint from a byte buffer indexed by the given indexing
+-- function.
+utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
+utf8DecodeChar# indexWord8# =
+ let !ch0 = word2Int# (indexWord8# 0#) in
+ case () of
+ _ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #)
+
+ | isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) ->
+ let !ch1 = word2Int# (indexWord8# 1#) in
+ if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+ (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
+ (ch1 -# 0x80#)),
+ 2# #)
+
+ | isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) ->
+ let !ch1 = word2Int# (indexWord8# 1#) in
+ if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+ let !ch2 = word2Int# (indexWord8# 2#) in
+ if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
+ (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
+ ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +#
+ (ch2 -# 0x80#)),
+ 3# #)
+
+ | isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) ->
+ let !ch1 = word2Int# (indexWord8# 1#) in
+ if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+ let !ch2 = word2Int# (indexWord8# 2#) in
+ if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
+ let !ch3 = word2Int# (indexWord8# 3#) in
+ if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else
+ (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
+ ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
+ ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +#
+ (ch3 -# 0x80#)),
+ 4# #)
+
+ | otherwise -> fail 1#
+ where
+ -- all invalid sequences end up here:
+ fail :: Int# -> (# Char#, Int# #)
+ fail nBytes# = (# '\0'#, nBytes# #)
+ -- '\xFFFD' would be the usual replacement character, but
+ -- that's a valid symbol in Haskell, so will result in a
+ -- confusing parse error later on. Instead we use '\0' which
+ -- will signal a lexer error immediately.
+
+-- | Decode a single character at the given 'Addr#'.
+utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
+utf8DecodeCharAddr# a# off# =
+#if !MIN_VERSION_base(4,16,0)
+ utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#))
+#else
+ utf8DecodeChar# (\i# -> word8ToWord# (indexWord8OffAddr# a# (i# +# off#)))
+#endif
+
+-- | Decode a single codepoint starting at the given 'Ptr'.
+utf8DecodeCharPtr :: Ptr Word8 -> (Char, Int)
+utf8DecodeCharPtr !(Ptr a#) =
+ case utf8DecodeCharAddr# a# 0# of
+ (# c#, nBytes# #) -> ( C# c#, I# nBytes# )
+
+-- | Decode a single codepoint starting at the given byte offset into a
+-- 'ByteArray#'.
+utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
+utf8DecodeCharByteArray# ba# off# =
+#if !MIN_VERSION_base(4,16,0)
+ utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
+#else
+ utf8DecodeChar# (\i# -> word8ToWord# (indexWord8Array# ba# (i# +# off#)))
+#endif
+
+{-# INLINE utf8Decode# #-}
+utf8Decode# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
+utf8Decode# retain decodeChar# len#
+ = unpack 0#
+ where
+ unpack i#
+ | isTrue# (i# >=# len#) = retain >> return []
+ | otherwise =
+ case decodeChar# i# of
+ (# c#, nBytes# #) -> do
+ rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#)
+ return (C# c# : rest)
+
+utf8DecodeForeignPtr :: ForeignPtr Word8 -> Int -> Int -> [Char]
+utf8DecodeForeignPtr fp offset (I# len#)
+ = unsafeDupablePerformIO $ do
+ let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset
+ utf8Decode# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len#
+-- Note that since utf8Decode# returns a thunk the lifetime of the
+-- ForeignPtr actually needs to be longer than the lexical lifetime
+-- withForeignPtr would provide here. That's why we use touchForeignPtr to
+-- keep the fp alive until the last character has actually been decoded.
+
+utf8DecodeByteArray# :: ByteArray# -> [Char]
+utf8DecodeByteArray# ba#
+ = unsafeDupablePerformIO $
+ let len# = sizeofByteArray# ba# in
+ utf8Decode# (return ()) (utf8DecodeCharByteArray# ba#) len#
+
+utf8CompareByteArray# :: ByteArray# -> ByteArray# -> Ordering
+utf8CompareByteArray# a1 a2 = go 0# 0#
+ -- UTF-8 has the property that sorting by bytes values also sorts by
+ -- code-points.
+ -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property
+ -- doesn't hold and we must explicitly check this case here.
+ -- Note that decoding every code point would also work but it would be much
+ -- more costly.
+ where
+ !sz1 = sizeofByteArray# a1
+ !sz2 = sizeofByteArray# a2
+ go off1 off2
+ | isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ
+ | isTrue# (off1 >=# sz1) = LT
+ | isTrue# (off2 >=# sz2) = GT
+ | otherwise =
+#if !MIN_VERSION_base(4,16,0)
+ let !b1_1 = indexWord8Array# a1 off1
+ !b2_1 = indexWord8Array# a2 off2
+#else
+ let !b1_1 = word8ToWord# (indexWord8Array# a1 off1)
+ !b2_1 = word8ToWord# (indexWord8Array# a2 off2)
+#endif
+ in case b1_1 of
+ 0xC0## -> case b2_1 of
+ 0xC0## -> go (off1 +# 1#) (off2 +# 1#)
+#if !MIN_VERSION_base(4,16,0)
+ _ -> case indexWord8Array# a1 (off1 +# 1#) of
+#else
+ _ -> case word8ToWord# (indexWord8Array# a1 (off1 +# 1#)) of
+#endif
+ 0x80## -> LT
+ _ -> go (off1 +# 1#) (off2 +# 1#)
+ _ -> case b2_1 of
+#if !MIN_VERSION_base(4,16,0)
+ 0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of
+#else
+ 0xC0## -> case word8ToWord# (indexWord8Array# a2 (off2 +# 1#)) of
+#endif
+ 0x80## -> GT
+ _ -> go (off1 +# 1#) (off2 +# 1#)
+ _ | isTrue# (b1_1 `gtWord#` b2_1) -> GT
+ | isTrue# (b1_1 `ltWord#` b2_1) -> LT
+ | otherwise -> go (off1 +# 1#) (off2 +# 1#)
+
+utf8CountCharsByteArray# :: ByteArray# -> Int
+utf8CountCharsByteArray# ba = go 0# 0#
+ where
+ len# = sizeofByteArray# ba
+ go i# n#
+ | isTrue# (i# >=# len#) = I# n#
+ | otherwise =
+ case utf8DecodeCharByteArray# ba i# of
+ (# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#)
+
+{-# INLINE utf8EncodeChar #-}
+utf8EncodeChar :: (Int# -> Word8# -> State# s -> State# s)
+ -> Char -> ST s Int
+utf8EncodeChar write# c =
+ let x = fromIntegral (ord c) in
+ case () of
+ _ | x > 0 && x <= 0x007f -> do
+ write 0 x
+ return 1
+ -- NB. '\0' is encoded as '\xC0\x80', not '\0'. This is so that we
+ -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8).
+ | x <= 0x07ff -> do
+ write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F))
+ write 1 (0x80 .|. (x .&. 0x3F))
+ return 2
+ | x <= 0xffff -> do
+ write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F)
+ write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F)
+ write 2 (0x80 .|. (x .&. 0x3F))
+ return 3
+ | otherwise -> do
+ write 0 (0xF0 .|. (x `shiftR` 18))
+ write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F))
+ write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F))
+ write 3 (0x80 .|. (x .&. 0x3F))
+ return 4
+ where
+ {-# INLINE write #-}
+ write (I# off#) (W# c#) = ST $ \s ->
+#if !MIN_VERSION_base(4,16,0)
+ case write# off# (narrowWord8# c#) s of
+#else
+ case write# off# (wordToWord8# c#) s of
+#endif
+ s -> (# s, () #)
+
+utf8EncodePtr :: Ptr Word8 -> String -> IO ()
+utf8EncodePtr (Ptr a#) str = go a# str
+ where go !_ [] = return ()
+ go a# (c:cs) = do
+#if !MIN_VERSION_base(4,16,0)
+ -- writeWord8OffAddr# was taking a Word#
+ I# off# <- stToIO $ utf8EncodeChar (\i w -> writeWord8OffAddr# a# i (extendWord8# w)) c
+#else
+ I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c
+#endif
+ go (a# `plusAddr#` off#) cs
+
+utf8EncodeByteArray# :: String -> ByteArray#
+utf8EncodeByteArray# str = runRW# $ \s ->
+ case utf8EncodedLength str of { I# len# ->
+ case newByteArray# len# s of { (# s, mba# #) ->
+ case go mba# 0# str of { ST f_go ->
+ case f_go s of { (# s, () #) ->
+ case unsafeFreezeByteArray# mba# s of { (# _, ba# #) ->
+ ba# }}}}}
+ where
+ go _ _ [] = return ()
+ go mba# i# (c:cs) = do
+#if !MIN_VERSION_base(4,16,0)
+ -- writeWord8Array# was taking a Word#
+ I# off# <- utf8EncodeChar (\j# w -> writeWord8Array# mba# (i# +# j#) (extendWord8# w)) c
+#else
+ I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c
+#endif
+ go mba# (i# +# off#) cs
+
+utf8EncodedLength :: String -> Int
+utf8EncodedLength str = go 0 str
+ where go !n [] = n
+ go n (c:cs)
+ | ord c > 0 && ord c <= 0x007f = go (n+1) cs
+ | ord c <= 0x07ff = go (n+2) cs
+ | ord c <= 0xffff = go (n+3) cs
+ | otherwise = go (n+4) cs
+
diff --git a/libraries/ghc-boot/ghc-boot.cabal.in b/libraries/ghc-boot/ghc-boot.cabal.in
index 3ca83a5e1c..531d445b57 100644
--- a/libraries/ghc-boot/ghc-boot.cabal.in
+++ b/libraries/ghc-boot/ghc-boot.cabal.in
@@ -45,6 +45,7 @@ Library
GHC.Data.ShortText
GHC.Data.SizedSeq
GHC.Utils.Encoding
+ GHC.Utils.Encoding.UTF8
GHC.LanguageExtensions
GHC.Unit.Database
GHC.Serialized