summaryrefslogtreecommitdiff
path: root/compiler/GHC/Utils/Encoding.hs
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/GHC/Utils/Encoding.hs')
-rw-r--r--compiler/GHC/Utils/Encoding.hs33
1 files changed, 33 insertions, 0 deletions
diff --git a/compiler/GHC/Utils/Encoding.hs b/compiler/GHC/Utils/Encoding.hs
index 24637a3bff..592b0bd40e 100644
--- a/compiler/GHC/Utils/Encoding.hs
+++ b/compiler/GHC/Utils/Encoding.hs
@@ -19,6 +19,7 @@ module GHC.Utils.Encoding (
utf8DecodeChar,
utf8DecodeByteString,
utf8DecodeShortByteString,
+ utf8CompareShortByteString,
utf8DecodeStringLazy,
utf8EncodeChar,
utf8EncodeString,
@@ -164,6 +165,38 @@ utf8DecodeStringLazy fp offset (I# len#)
-- withForeignPtr would provide here. That's why we use touchForeignPtr to
-- keep the fp alive until the last character has actually been decoded.
+utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
+utf8CompareShortByteString (SBS a1) (SBS a2) = go 0# 0#
+ -- UTF-8 has the property that sorting by bytes values also sorts by
+ -- code-points.
+ -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property
+ -- doesn't hold and we must explicitly check this case here.
+ -- Note that decoding every code point would also work but it would be much
+ -- more costly.
+ where
+ !sz1 = sizeofByteArray# a1
+ !sz2 = sizeofByteArray# a2
+ go off1 off2
+ | isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ
+ | isTrue# (off1 >=# sz1) = LT
+ | isTrue# (off2 >=# sz2) = GT
+ | otherwise =
+ let !b1_1 = indexWord8Array# a1 off1
+ !b2_1 = indexWord8Array# a2 off2
+ in case b1_1 of
+ 0xC0## -> case b2_1 of
+ 0xC0## -> go (off1 +# 1#) (off2 +# 1#)
+ _ -> case indexWord8Array# a1 (off1 +# 1#) of
+ 0x80## -> LT
+ _ -> go (off1 +# 1#) (off2 +# 1#)
+ _ -> case b2_1 of
+ 0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of
+ 0x80## -> GT
+ _ -> go (off1 +# 1#) (off2 +# 1#)
+ _ | isTrue# (b1_1 `gtWord#` b2_1) -> GT
+ | isTrue# (b1_1 `ltWord#` b2_1) -> LT
+ | otherwise -> go (off1 +# 1#) (off2 +# 1#)
+
utf8DecodeShortByteString :: ShortByteString -> [Char]
utf8DecodeShortByteString (SBS ba#)
= unsafeDupablePerformIO $