summaryrefslogtreecommitdiff
path: root/compiler/GHC/Data
diff options
context:
space:
mode:
authorSylvain Henry <sylvain@haskus.fr>2020-08-11 13:15:41 +0200
committerMarge Bot <ben+marge-bot@smart-cactus.org>2020-09-01 12:39:36 -0400
commit4b4fbc58d37d37457144014ef82bdd928de175df (patch)
tree9b49838986f07b5843e13f33ad2f6fd19d83f987 /compiler/GHC/Data
parent884245dd29265b7bee12cda8c915da9c916251ce (diff)
downloadhaskell-4b4fbc58d37d37457144014ef82bdd928de175df.tar.gz
Remove "Ord FastString" instance
FastStrings can be compared in 2 ways: by Unique or lexically. We don't want to bless one particular way with an "Ord" instance because it leads to bugs (#18562) or to suboptimal code (e.g. using lexical comparison while a Unique comparison would suffice). UTF-8 encoding has the advantage that sorting strings by their encoded bytes also sorts them by their Unicode code points, without having to decode the actual code points. BUT GHC uses Modified UTF-8 which diverges from UTF-8 by encoding \0 as 0xC080 instead of 0x00 (to avoid null bytes in the middle of a String so that the string can still be null-terminated). This patch adds a new `utf8CompareShortByteString` function that performs sorting by bytes but that also takes Modified UTF-8 into account. It is much more performant than decoding the strings into [Char] to perform comparisons (which we did in the previous patch). Bump haddock submodule
Diffstat (limited to 'compiler/GHC/Data')
-rw-r--r--compiler/GHC/Data/FastString.hs77
1 files changed, 59 insertions, 18 deletions
diff --git a/compiler/GHC/Data/FastString.hs b/compiler/GHC/Data/FastString.hs
index 1907ef91c9..604c8f3e25 100644
--- a/compiler/GHC/Data/FastString.hs
+++ b/compiler/GHC/Data/FastString.hs
@@ -2,6 +2,7 @@
{-# LANGUAGE BangPatterns, CPP, MagicHash, UnboxedTuples,
GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE DeriveDataTypeable #-}
{-# OPTIONS_GHC -O2 -funbox-strict-fields #-}
-- We always optimise this, otherwise performance of a non-optimised
-- compiler is severely affected
@@ -12,8 +13,12 @@
-- ['FastString']
--
-- * A compact, hash-consed, representation of character strings.
--- * Comparison is O(1), and you can get a 'GHC.Types.Unique.Unique' from them.
-- * Generated by 'fsLit'.
+-- * You can get a 'GHC.Types.Unique.Unique' from them.
+-- * Equality test is O(1) (it uses the Unique).
+-- * Comparison is O(1) or O(n):
+-- * O(n) but deterministic with lexical comparison (`lexicalCompareFS`)
+-- * O(1) but non-deterministic with Unique comparison (`uniqCompareFS`)
-- * Turn into 'GHC.Utils.Outputable.SDoc' with 'GHC.Utils.Outputable.ftext'.
--
-- ['PtrString']
@@ -50,6 +55,8 @@ module GHC.Data.FastString
-- * FastStrings
FastString(..), -- not abstract, for now.
+ NonDetFastString (..),
+ LexicalFastString (..),
-- ** Construction
fsLit,
@@ -74,6 +81,8 @@ module GHC.Data.FastString
consFS,
nilFS,
isUnderscoreFS,
+ lexicalCompareFS,
+ uniqCompareFS,
-- ** Outputting
hPutFS,
@@ -135,7 +144,7 @@ import GHC.Base (unpackCString#,unpackNBytes#)
import GHC.Exts
import GHC.IO
--- | Gives the UTF-8 encoded bytes corresponding to a 'FastString'
+-- | Gives the Modified UTF-8 encoded bytes corresponding to a 'FastString'
bytesFS, fastStringToByteString :: FastString -> ByteString
bytesFS = fastStringToByteString
@@ -196,17 +205,10 @@ data FastString = FastString {
instance Eq FastString where
f1 == f2 = uniq f1 == uniq f2
-instance Ord FastString where
- -- Compares lexicographically, not by unique
- a <= b = case cmpFS a b of { LT -> True; EQ -> True; GT -> False }
- a < b = case cmpFS a b of { LT -> True; EQ -> False; GT -> False }
- a >= b = case cmpFS a b of { LT -> False; EQ -> True; GT -> True }
- a > b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True }
- max x y | x >= y = x
- | otherwise = y
- min x y | x <= y = x
- | otherwise = y
- compare a b = cmpFS a b
+-- We don't provide any "Ord FastString" instance to force you to think about
+-- which ordering you want:
+-- * lexical: deterministic, O(n). Cf lexicalCompareFS and LexicalFastString.
+-- * by unique: non-deterministic, O(1). Cf uniqCompareFS and NonDetFastString.
instance IsString FastString where
fromString = fsLit
@@ -231,12 +233,51 @@ instance Data FastString where
instance NFData FastString where
rnf fs = seq fs ()
--- | Compare FastString lexicographically
-cmpFS :: FastString -> FastString -> Ordering
-cmpFS fs1 fs2 =
+-- | Compare FastString lexically
+--
+-- If you don't care about the lexical ordering, use `uniqCompareFS` instead.
+lexicalCompareFS :: FastString -> FastString -> Ordering
+lexicalCompareFS fs1 fs2 =
if uniq fs1 == uniq fs2 then EQ else
- compare (unpackFS fs1) (unpackFS fs2)
- -- compare as String, not as ShortByteString (cf #18562)
+ utf8CompareShortByteString (fs_sbs fs1) (fs_sbs fs2)
+ -- perform a lexical comparison taking into account the Modified UTF-8
+ -- encoding we use (cf #18562)
+
+-- | Compare FastString by their Unique (not lexically).
+--
+-- Much cheaper than `lexicalCompareFS` but non-deterministic!
+uniqCompareFS :: FastString -> FastString -> Ordering
+uniqCompareFS fs1 fs2 = compare (uniq fs1) (uniq fs2)
+
+-- | Non-deterministic FastString
+--
+-- This is a simple FastString wrapper with an Ord instance using
+-- `uniqCompareFS` (i.e. which compares FastStrings on their Uniques). Hence it
+-- is not deterministic from one run to the other.
+newtype NonDetFastString
+ = NonDetFastString FastString
+ deriving (Eq,Data)
+
+instance Ord NonDetFastString where
+ compare (NonDetFastString fs1) (NonDetFastString fs2) = uniqCompareFS fs1 fs2
+
+instance Show NonDetFastString where
+ show (NonDetFastString fs) = show fs
+
+-- | Lexical FastString
+--
+-- This is a simple FastString wrapper with an Ord instance using
+-- `lexicalCompareFS` (i.e. which compares FastStrings on their String
+-- representation). Hence it is deterministic from one run to the other.
+newtype LexicalFastString
+ = LexicalFastString FastString
+ deriving (Eq,Data)
+
+instance Ord LexicalFastString where
+ compare (LexicalFastString fs1) (LexicalFastString fs2) = lexicalCompareFS fs1 fs2
+
+instance Show LexicalFastString where
+ show (LexicalFastString fs) = show fs
-- -----------------------------------------------------------------------------
-- Construction