diff options
author | Sylvain Henry <sylvain@haskus.fr> | 2020-08-11 13:15:41 +0200 |
---|---|---|
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2020-09-01 12:39:36 -0400 |
commit | 4b4fbc58d37d37457144014ef82bdd928de175df (patch) | |
tree | 9b49838986f07b5843e13f33ad2f6fd19d83f987 /compiler/GHC/Data | |
parent | 884245dd29265b7bee12cda8c915da9c916251ce (diff) | |
download | haskell-4b4fbc58d37d37457144014ef82bdd928de175df.tar.gz |
Remove "Ord FastString" instance
FastStrings can be compared in 2 ways: by Unique or lexically. We don't
want to bless one particular way with an "Ord" instance because it leads
to bugs (#18562) or to suboptimal code (e.g. using lexical comparison
while a Unique comparison would suffice).
UTF-8 encoding has the advantage that sorting strings by their encoded
bytes also sorts them by their Unicode code points, without having to
decode the actual code points. BUT GHC uses Modified UTF-8 which
diverges from UTF-8 by encoding \0 as 0xC080 instead of 0x00 (to avoid
null bytes in the middle of a String so that the string can still be
null-terminated). This patch adds a new `utf8CompareShortByteString`
function that performs sorting by bytes but that also takes Modified
UTF-8 into account. It is much more performant than decoding the strings
into [Char] to perform comparisons (which we did in the previous patch).
Bump haddock submodule
Diffstat (limited to 'compiler/GHC/Data')
-rw-r--r-- | compiler/GHC/Data/FastString.hs | 77 |
1 files changed, 59 insertions, 18 deletions
diff --git a/compiler/GHC/Data/FastString.hs b/compiler/GHC/Data/FastString.hs index 1907ef91c9..604c8f3e25 100644 --- a/compiler/GHC/Data/FastString.hs +++ b/compiler/GHC/Data/FastString.hs @@ -2,6 +2,7 @@ {-# LANGUAGE BangPatterns, CPP, MagicHash, UnboxedTuples, GeneralizedNewtypeDeriving #-} +{-# LANGUAGE DeriveDataTypeable #-} {-# OPTIONS_GHC -O2 -funbox-strict-fields #-} -- We always optimise this, otherwise performance of a non-optimised -- compiler is severely affected @@ -12,8 +13,12 @@ -- ['FastString'] -- -- * A compact, hash-consed, representation of character strings. --- * Comparison is O(1), and you can get a 'GHC.Types.Unique.Unique' from them. -- * Generated by 'fsLit'. +-- * You can get a 'GHC.Types.Unique.Unique' from them. +-- * Equality test is O(1) (it uses the Unique). +-- * Comparison is O(1) or O(n): +-- * O(n) but deterministic with lexical comparison (`lexicalCompareFS`) +-- * O(1) but non-deterministic with Unique comparison (`uniqCompareFS`) -- * Turn into 'GHC.Utils.Outputable.SDoc' with 'GHC.Utils.Outputable.ftext'. -- -- ['PtrString'] @@ -50,6 +55,8 @@ module GHC.Data.FastString -- * FastStrings FastString(..), -- not abstract, for now. + NonDetFastString (..), + LexicalFastString (..), -- ** Construction fsLit, @@ -74,6 +81,8 @@ module GHC.Data.FastString consFS, nilFS, isUnderscoreFS, + lexicalCompareFS, + uniqCompareFS, -- ** Outputting hPutFS, @@ -135,7 +144,7 @@ import GHC.Base (unpackCString#,unpackNBytes#) import GHC.Exts import GHC.IO --- | Gives the UTF-8 encoded bytes corresponding to a 'FastString' +-- | Gives the Modified UTF-8 encoded bytes corresponding to a 'FastString' bytesFS, fastStringToByteString :: FastString -> ByteString bytesFS = fastStringToByteString @@ -196,17 +205,10 @@ data FastString = FastString { instance Eq FastString where f1 == f2 = uniq f1 == uniq f2 -instance Ord FastString where - -- Compares lexicographically, not by unique - a <= b = case cmpFS a b of { LT -> True; EQ -> True; GT -> False } - a < b = case cmpFS a b of { LT -> True; EQ -> False; GT -> False } - a >= b = case cmpFS a b of { LT -> False; EQ -> True; GT -> True } - a > b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True } - max x y | x >= y = x - | otherwise = y - min x y | x <= y = x - | otherwise = y - compare a b = cmpFS a b +-- We don't provide any "Ord FastString" instance to force you to think about +-- which ordering you want: +-- * lexical: deterministic, O(n). Cf lexicalCompareFS and LexicalFastString. +-- * by unique: non-deterministic, O(1). Cf uniqCompareFS and NonDetFastString. instance IsString FastString where fromString = fsLit @@ -231,12 +233,51 @@ instance Data FastString where instance NFData FastString where rnf fs = seq fs () --- | Compare FastString lexicographically -cmpFS :: FastString -> FastString -> Ordering -cmpFS fs1 fs2 = +-- | Compare FastString lexically +-- +-- If you don't care about the lexical ordering, use `uniqCompareFS` instead. +lexicalCompareFS :: FastString -> FastString -> Ordering +lexicalCompareFS fs1 fs2 = if uniq fs1 == uniq fs2 then EQ else - compare (unpackFS fs1) (unpackFS fs2) - -- compare as String, not as ShortByteString (cf #18562) + utf8CompareShortByteString (fs_sbs fs1) (fs_sbs fs2) + -- perform a lexical comparison taking into account the Modified UTF-8 + -- encoding we use (cf #18562) + +-- | Compare FastString by their Unique (not lexically). +-- +-- Much cheaper than `lexicalCompareFS` but non-deterministic! +uniqCompareFS :: FastString -> FastString -> Ordering +uniqCompareFS fs1 fs2 = compare (uniq fs1) (uniq fs2) + +-- | Non-deterministic FastString +-- +-- This is a simple FastString wrapper with an Ord instance using +-- `uniqCompareFS` (i.e. which compares FastStrings on their Uniques). Hence it +-- is not deterministic from one run to the other. +newtype NonDetFastString + = NonDetFastString FastString + deriving (Eq,Data) + +instance Ord NonDetFastString where + compare (NonDetFastString fs1) (NonDetFastString fs2) = uniqCompareFS fs1 fs2 + +instance Show NonDetFastString where + show (NonDetFastString fs) = show fs + +-- | Lexical FastString +-- +-- This is a simple FastString wrapper with an Ord instance using +-- `lexicalCompareFS` (i.e. which compares FastStrings on their String +-- representation). Hence it is deterministic from one run to the other. +newtype LexicalFastString + = LexicalFastString FastString + deriving (Eq,Data) + +instance Ord LexicalFastString where + compare (LexicalFastString fs1) (LexicalFastString fs2) = lexicalCompareFS fs1 fs2 + +instance Show LexicalFastString where + show (LexicalFastString fs) = show fs -- ----------------------------------------------------------------------------- -- Construction |