diff options
Diffstat (limited to 'compiler/GHC/Utils/Encoding.hs')
-rw-r--r-- | compiler/GHC/Utils/Encoding.hs | 33 |
1 files changed, 33 insertions, 0 deletions
diff --git a/compiler/GHC/Utils/Encoding.hs b/compiler/GHC/Utils/Encoding.hs index 24637a3bff..592b0bd40e 100644 --- a/compiler/GHC/Utils/Encoding.hs +++ b/compiler/GHC/Utils/Encoding.hs @@ -19,6 +19,7 @@ module GHC.Utils.Encoding ( utf8DecodeChar, utf8DecodeByteString, utf8DecodeShortByteString, + utf8CompareShortByteString, utf8DecodeStringLazy, utf8EncodeChar, utf8EncodeString, @@ -164,6 +165,38 @@ utf8DecodeStringLazy fp offset (I# len#) -- withForeignPtr would provide here. That's why we use touchForeignPtr to -- keep the fp alive until the last character has actually been decoded. +utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering +utf8CompareShortByteString (SBS a1) (SBS a2) = go 0# 0# + -- UTF-8 has the property that sorting by bytes values also sorts by + -- code-points. + -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property + -- doesn't hold and we must explicitly check this case here. + -- Note that decoding every code point would also work but it would be much + -- more costly. + where + !sz1 = sizeofByteArray# a1 + !sz2 = sizeofByteArray# a2 + go off1 off2 + | isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ + | isTrue# (off1 >=# sz1) = LT + | isTrue# (off2 >=# sz2) = GT + | otherwise = + let !b1_1 = indexWord8Array# a1 off1 + !b2_1 = indexWord8Array# a2 off2 + in case b1_1 of + 0xC0## -> case b2_1 of + 0xC0## -> go (off1 +# 1#) (off2 +# 1#) + _ -> case indexWord8Array# a1 (off1 +# 1#) of + 0x80## -> LT + _ -> go (off1 +# 1#) (off2 +# 1#) + _ -> case b2_1 of + 0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of + 0x80## -> GT + _ -> go (off1 +# 1#) (off2 +# 1#) + _ | isTrue# (b1_1 `gtWord#` b2_1) -> GT + | isTrue# (b1_1 `ltWord#` b2_1) -> LT + | otherwise -> go (off1 +# 1#) (off2 +# 1#) + utf8DecodeShortByteString :: ShortByteString -> [Char] utf8DecodeShortByteString (SBS ba#) = unsafeDupablePerformIO $ |