diff options
author | Wander Hillen <wjw.hillen@gmail.com> | 2020-09-25 11:41:06 +0200 |
---|---|---|
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2020-10-13 00:12:47 -0400 |
commit | 7fdcce6d4d13a10a1b2336c1d40482c64dba664d (patch) | |
tree | 66dfd350e5abef849793060d745d1a1df64e47df /compiler | |
parent | 9bbc84d20d0f50901351246cbe97c45234ca7d95 (diff) | |
download | haskell-7fdcce6d4d13a10a1b2336c1d40482c64dba664d.tar.gz |
Initial ShortText code and conversion of package db code
Metric Decrease:
Naperian
T10421
T10421a
T10547
T12150
T12234
T12425
T13035
T18140
T18304
T5837
T6048
T13253-spj
T18282
T18223
T3064
T9961
Metric Increase
T13701
HFSKJH
Diffstat (limited to 'compiler')
-rw-r--r-- | compiler/GHC/Driver/Backpack.hs | 7 | ||||
-rw-r--r-- | compiler/GHC/Driver/CodeOutput.hs | 3 | ||||
-rw-r--r-- | compiler/GHC/Driver/Finder.hs | 3 | ||||
-rw-r--r-- | compiler/GHC/Runtime/Linker.hs | 22 | ||||
-rw-r--r-- | compiler/GHC/SysTools/ExtraObj.hs | 3 | ||||
-rw-r--r-- | compiler/GHC/Unit/Info.hs | 29 | ||||
-rw-r--r-- | compiler/GHC/Unit/State.hs | 25 | ||||
-rw-r--r-- | compiler/GHC/Utils/Encoding.hs | 526 | ||||
-rw-r--r-- | compiler/ghc.cabal.in | 1 |
9 files changed, 51 insertions, 568 deletions
diff --git a/compiler/GHC/Driver/Backpack.hs b/compiler/GHC/Driver/Backpack.hs index 5b418d9166..cd9cb8672b 100644 --- a/compiler/GHC/Driver/Backpack.hs +++ b/compiler/GHC/Driver/Backpack.hs @@ -1,6 +1,8 @@ {-# LANGUAGE CPP #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE NondecreasingIndentation #-} +{-# LANGUAGE OverloadedStrings #-} + -- | This is the driver for the 'ghc --backpack' mode, which -- is a reimplementation of the "package manager" bits of @@ -38,6 +40,7 @@ import GHC.Unit.State import GHC.Driver.Types import GHC.Data.StringBuffer import GHC.Data.FastString +import qualified GHC.Data.ShortText as ST import GHC.Utils.Error import GHC.Types.SrcLoc import GHC.Driver.Main @@ -340,8 +343,8 @@ buildUnit session cid insts lunit = do unitAbiDepends = [], unitLinkerOptions = case session of TcSession -> [] - _ -> obj_files, - unitImportDirs = [ hi_dir ], + _ -> map ST.pack $ obj_files, + unitImportDirs = [ ST.pack $ hi_dir ], unitIsExposed = False, unitIsIndefinite = case session of TcSession -> True diff --git a/compiler/GHC/Driver/CodeOutput.hs b/compiler/GHC/Driver/CodeOutput.hs index 0e43b64c77..cee81b900e 100644 --- a/compiler/GHC/Driver/CodeOutput.hs +++ b/compiler/GHC/Driver/CodeOutput.hs @@ -32,6 +32,7 @@ import GHC.Cmm.CLabel import GHC.Driver.Types import GHC.Driver.Session import GHC.Driver.Ppr +import qualified GHC.Data.ShortText as ST import GHC.Data.Stream ( Stream ) import qualified GHC.Data.Stream as Stream import GHC.SysTools.FileCleanup @@ -211,7 +212,7 @@ outputForeignStubs dflags mod location stubs let rts_includes = let rts_pkg = unsafeLookupUnitId (unitState dflags) rtsUnitId in concatMap mk_include (unitIncludes rts_pkg) - mk_include i = "#include \"" ++ i ++ "\"\n" + mk_include i = "#include \"" ++ ST.unpack i ++ "\"\n" -- wrapper code mentions the ffi_arg type, which comes from ffi.h ffi_includes diff --git a/compiler/GHC/Driver/Finder.hs b/compiler/GHC/Driver/Finder.hs index ffcd8d9359..57a9551b0f 100644 --- a/compiler/GHC/Driver/Finder.hs +++ b/compiler/GHC/Driver/Finder.hs @@ -43,6 +43,7 @@ import GHC.Unit.State import GHC.Driver.Types import GHC.Data.FastString +import qualified GHC.Data.ShortText as ST import GHC.Utils.Misc import GHC.Builtin.Names ( gHC_PRIM ) import GHC.Driver.Session @@ -380,7 +381,7 @@ findPackageModule_ hsc_env mod pkg_conf = mk_hi_loc = mkHiOnlyModLocation dflags package_hisuf - import_dirs = unitImportDirs pkg_conf + import_dirs = map ST.unpack $ unitImportDirs pkg_conf -- we never look for a .hi-boot file in an external package; -- .hi-boot files only make sense for the home package. in diff --git a/compiler/GHC/Runtime/Linker.hs b/compiler/GHC/Runtime/Linker.hs index aaa74b3625..4477a0ad2f 100644 --- a/compiler/GHC/Runtime/Linker.hs +++ b/compiler/GHC/Runtime/Linker.hs @@ -61,6 +61,7 @@ import GHC.Types.SrcLoc import qualified GHC.Data.Maybe as Maybes import GHC.Types.Unique.DSet import GHC.Data.FastString +import qualified GHC.Data.ShortText as ST import GHC.Platform import GHC.SysTools import GHC.SysTools.FileCleanup @@ -1282,10 +1283,10 @@ linkPackage hsc_env pkg let dflags = hsc_dflags hsc_env platform = targetPlatform dflags is_dyn = interpreterDynamic (hscInterp hsc_env) - dirs | is_dyn = Packages.unitLibraryDynDirs pkg - | otherwise = Packages.unitLibraryDirs pkg + dirs | is_dyn = map ST.unpack $ Packages.unitLibraryDynDirs pkg + | otherwise = map ST.unpack $ Packages.unitLibraryDirs pkg - let hs_libs = Packages.unitLibraries pkg + let hs_libs = map ST.unpack $ Packages.unitLibraries pkg -- The FFI GHCi import lib isn't needed as -- GHC.Runtime.Linker + rts/Linker.c link the -- interpreted references to FFI to the compiled FFI. @@ -1300,11 +1301,12 @@ linkPackage hsc_env pkg -- libs do not exactly match the .so/.dll equivalents. So if the -- package file provides an "extra-ghci-libraries" field then we use -- that instead of the "extra-libraries" field. - extra_libs = - (if null (Packages.unitExtDepLibsGhc pkg) - then Packages.unitExtDepLibsSys pkg - else Packages.unitExtDepLibsGhc pkg) - ++ [ lib | '-':'l':lib <- Packages.unitLinkerOptions pkg ] + extdeplibs = map ST.unpack (if null (Packages.unitExtDepLibsGhc pkg) + then Packages.unitExtDepLibsSys pkg + else Packages.unitExtDepLibsGhc pkg) + linkerlibs = [ lib | '-':'l':lib <- (map ST.unpack $ Packages.unitLinkerOptions pkg) ] + extra_libs = extdeplibs ++ linkerlibs + -- See Note [Fork/Exec Windows] gcc_paths <- getGCCPaths dflags (platformOS platform) dirs_env <- addEnvPaths "LIBRARY_PATH" dirs @@ -1434,8 +1436,8 @@ loadFrameworks :: HscEnv -> Platform -> UnitInfo -> IO () loadFrameworks hsc_env platform pkg = when (platformUsesFrameworks platform) $ mapM_ load frameworks where - fw_dirs = Packages.unitExtDepFrameworkDirs pkg - frameworks = Packages.unitExtDepFrameworks pkg + fw_dirs = map ST.unpack $ Packages.unitExtDepFrameworkDirs pkg + frameworks = map ST.unpack $ Packages.unitExtDepFrameworks pkg load fw = do r <- loadFramework hsc_env fw_dirs fw case r of diff --git a/compiler/GHC/SysTools/ExtraObj.hs b/compiler/GHC/SysTools/ExtraObj.hs index 94443824e3..9208c3870d 100644 --- a/compiler/GHC/SysTools/ExtraObj.hs +++ b/compiler/GHC/SysTools/ExtraObj.hs @@ -25,6 +25,7 @@ import GHC.Unit import GHC.SysTools.Elf import GHC.Utils.Misc import GHC.Prelude +import qualified GHC.Data.ShortText as ST import Control.Monad import Data.Maybe @@ -57,7 +58,7 @@ mkExtraObj dflags extn xs -- we're compiling C or assembler. When compiling C, we pass the usual -- set of include directories and PIC flags. cOpts = map Option (picCCOpts dflags) - ++ map (FileOption "-I") + ++ map (FileOption "-I" . ST.unpack) (unitIncludeDirs $ unsafeLookupUnit pkgs rtsUnit) -- When compiling assembler code, we drop the usual C options, and if the diff --git a/compiler/GHC/Unit/Info.hs b/compiler/GHC/Unit/Info.hs index abb2122ef0..1f2366f292 100644 --- a/compiler/GHC/Unit/Info.hs +++ b/compiler/GHC/Unit/Info.hs @@ -31,6 +31,7 @@ import Data.Version import Data.Bifunctor import GHC.Data.FastString +import qualified GHC.Data.ShortText as ST import GHC.Utils.Outputable import GHC.Unit.Module as Module import GHC.Types.Unique @@ -124,21 +125,21 @@ pprUnitInfo GenericUnitInfo {..} = field "exposed-modules" (ppr unitExposedModules), field "hidden-modules" (fsep (map ppr unitHiddenModules)), field "trusted" (ppr unitIsTrusted), - field "import-dirs" (fsep (map text unitImportDirs)), - field "library-dirs" (fsep (map text unitLibraryDirs)), - field "dynamic-library-dirs" (fsep (map text unitLibraryDynDirs)), - field "hs-libraries" (fsep (map text unitLibraries)), - field "extra-libraries" (fsep (map text unitExtDepLibsSys)), - field "extra-ghci-libraries" (fsep (map text unitExtDepLibsGhc)), - field "include-dirs" (fsep (map text unitIncludeDirs)), - field "includes" (fsep (map text unitIncludes)), + field "import-dirs" (fsep (map (text . ST.unpack) unitImportDirs)), + field "library-dirs" (fsep (map (text . ST.unpack) unitLibraryDirs)), + field "dynamic-library-dirs" (fsep (map (text . ST.unpack) unitLibraryDynDirs)), + field "hs-libraries" (fsep (map (text . ST.unpack) unitLibraries)), + field "extra-libraries" (fsep (map (text . ST.unpack) unitExtDepLibsSys)), + field "extra-ghci-libraries" (fsep (map (text . ST.unpack) unitExtDepLibsGhc)), + field "include-dirs" (fsep (map (text . ST.unpack) unitIncludeDirs)), + field "includes" (fsep (map (text . ST.unpack) unitIncludes)), field "depends" (fsep (map ppr unitDepends)), - field "cc-options" (fsep (map text unitCcOptions)), - field "ld-options" (fsep (map text unitLinkerOptions)), - field "framework-dirs" (fsep (map text unitExtDepFrameworkDirs)), - field "frameworks" (fsep (map text unitExtDepFrameworks)), - field "haddock-interfaces" (fsep (map text unitHaddockInterfaces)), - field "haddock-html" (fsep (map text unitHaddockHTMLs)) + field "cc-options" (fsep (map (text . ST.unpack) unitCcOptions)), + field "ld-options" (fsep (map (text . ST.unpack) unitLinkerOptions)), + field "framework-dirs" (fsep (map (text . ST.unpack) unitExtDepFrameworkDirs)), + field "frameworks" (fsep (map (text . ST.unpack) unitExtDepFrameworks)), + field "haddock-interfaces" (fsep (map (text . ST.unpack) unitHaddockInterfaces)), + field "haddock-html" (fsep (map (text . ST.unpack) unitHaddockHTMLs)) ] where field name body = text name <> colon <+> nest 4 body diff --git a/compiler/GHC/Unit/State.hs b/compiler/GHC/Unit/State.hs index be5b08110d..78f96c90f3 100644 --- a/compiler/GHC/Unit/State.hs +++ b/compiler/GHC/Unit/State.hs @@ -99,6 +99,7 @@ import GHC.Data.Maybe import System.Environment ( getEnv ) import GHC.Data.FastString +import qualified GHC.Data.ShortText as ST import GHC.Utils.Error ( debugTraceMsg, MsgDoc, dumpIfSet_dyn, withTiming, DumpFormat (..) ) import GHC.Utils.Exception @@ -749,7 +750,7 @@ mungeUnitInfo :: FilePath -> FilePath -> UnitInfo -> UnitInfo mungeUnitInfo top_dir pkgroot = mungeDynLibFields - . mungeUnitInfoPaths top_dir pkgroot + . mungeUnitInfoPaths (ST.pack top_dir) (ST.pack pkgroot) mungeDynLibFields :: UnitInfo -> UnitInfo mungeDynLibFields pkg = @@ -1797,7 +1798,7 @@ getUnitIncludePath ctx unit_state home_unit pkgs = collectIncludeDirs `fmap` getPreloadUnitsAnd ctx unit_state home_unit pkgs collectIncludeDirs :: [UnitInfo] -> [FilePath] -collectIncludeDirs ps = ordNub (filter notNull (concatMap unitIncludeDirs ps)) +collectIncludeDirs ps = map ST.unpack $ ordNub (filter (not . ST.null) (concatMap unitIncludeDirs ps)) -- | Find all the library paths in these and the preload packages getUnitLibraryPath :: SDocContext -> UnitState -> HomeUnit -> Ways -> [UnitId] -> IO [String] @@ -1822,8 +1823,8 @@ collectLinkOpts :: DynFlags -> [UnitInfo] -> ([String], [String], [String]) collectLinkOpts dflags ps = ( concatMap (map ("-l" ++) . packageHsLibs dflags) ps, - concatMap (map ("-l" ++) . unitExtDepLibsSys) ps, - concatMap unitLinkerOptions ps + concatMap (map ("-l" ++) . map ST.unpack . unitExtDepLibsSys) ps, + concatMap (map ST.unpack . unitLinkerOptions) ps ) collectArchives :: DynFlags -> UnitInfo -> IO [FilePath] collectArchives dflags pc = @@ -1831,7 +1832,7 @@ collectArchives dflags pc = | searchPath <- searchPaths , lib <- libs ] where searchPaths = ordNub . filter notNull . libraryDirsForWay (ways dflags) $ pc - libs = packageHsLibs dflags pc ++ unitExtDepLibsSys pc + libs = packageHsLibs dflags pc ++ (map ST.unpack $ unitExtDepLibsSys pc) getLibs :: DynFlags -> [UnitId] -> IO [(String,String)] getLibs dflags pkgs = do @@ -1846,7 +1847,7 @@ getLibs dflags pkgs = do filterM (doesFileExist . fst) candidates packageHsLibs :: DynFlags -> UnitInfo -> [String] -packageHsLibs dflags p = map (mkDynName . addSuffix) (unitLibraries p) +packageHsLibs dflags p = map (mkDynName . addSuffix . ST.unpack) (unitLibraries p) where ways0 = ways dflags @@ -1895,27 +1896,27 @@ packageHsLibs dflags p = map (mkDynName . addSuffix) (unitLibraries p) -- | Either the 'unitLibraryDirs' or 'unitLibraryDynDirs' as appropriate for the way. libraryDirsForWay :: Ways -> UnitInfo -> [String] -libraryDirsForWay ws - | WayDyn `elem` ws = unitLibraryDynDirs - | otherwise = unitLibraryDirs +libraryDirsForWay ws ui + | WayDyn `elem` ws = map ST.unpack $ unitLibraryDynDirs ui + | otherwise = map ST.unpack $ unitLibraryDirs ui -- | Find all the C-compiler options in these and the preload packages getUnitExtraCcOpts :: SDocContext -> UnitState -> HomeUnit -> [UnitId] -> IO [String] getUnitExtraCcOpts ctx unit_state home_unit pkgs = do ps <- getPreloadUnitsAnd ctx unit_state home_unit pkgs - return (concatMap unitCcOptions ps) + return $ map ST.unpack (concatMap unitCcOptions ps) -- | Find all the package framework paths in these and the preload packages getUnitFrameworkPath :: SDocContext -> UnitState -> HomeUnit -> [UnitId] -> IO [String] getUnitFrameworkPath ctx unit_state home_unit pkgs = do ps <- getPreloadUnitsAnd ctx unit_state home_unit pkgs - return (ordNub (filter notNull (concatMap unitExtDepFrameworkDirs ps))) + return $ map ST.unpack (ordNub (filter (not . ST.null) (concatMap unitExtDepFrameworkDirs ps))) -- | Find all the package frameworks in these and the preload packages getUnitFrameworks :: SDocContext -> UnitState -> HomeUnit -> [UnitId] -> IO [String] getUnitFrameworks ctx unit_state home_unit pkgs = do ps <- getPreloadUnitsAnd ctx unit_state home_unit pkgs - return (concatMap unitExtDepFrameworks ps) + return $ map ST.unpack (concatMap unitExtDepFrameworks ps) -- ----------------------------------------------------------------------------- -- Package Utils diff --git a/compiler/GHC/Utils/Encoding.hs b/compiler/GHC/Utils/Encoding.hs deleted file mode 100644 index 68ebeedaf7..0000000000 --- a/compiler/GHC/Utils/Encoding.hs +++ /dev/null @@ -1,526 +0,0 @@ -{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples #-} -{-# OPTIONS_GHC -O2 #-} --- We always optimise this, otherwise performance of a non-optimised --- compiler is severely affected - --- ----------------------------------------------------------------------------- --- --- (c) The University of Glasgow, 1997-2006 --- --- Character encodings --- --- ----------------------------------------------------------------------------- - -module GHC.Utils.Encoding ( - -- * UTF-8 - utf8DecodeCharAddr#, - utf8PrevChar, - utf8CharStart, - utf8DecodeChar, - utf8DecodeByteString, - utf8DecodeShortByteString, - utf8CompareShortByteString, - utf8DecodeStringLazy, - utf8EncodeChar, - utf8EncodeString, - utf8EncodeShortByteString, - utf8EncodedLength, - countUTF8Chars, - - -- * Z-encoding - zEncodeString, - zDecodeString, - - -- * Base62-encoding - toBase62, - toBase62Padded - ) where - -import GHC.Prelude - -import Foreign -import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr) -import Data.Char -import qualified Data.Char as Char -import Numeric -import GHC.IO -import GHC.ST - -import Data.ByteString (ByteString) -import qualified Data.ByteString.Internal as BS -import Data.ByteString.Short.Internal (ShortByteString(..)) - -import GHC.Exts - --- ----------------------------------------------------------------------------- --- UTF-8 - --- We can't write the decoder as efficiently as we'd like without --- resorting to unboxed extensions, unfortunately. I tried to write --- an IO version of this function, but GHC can't eliminate boxed --- results from an IO-returning function. --- --- We assume we can ignore overflow when parsing a multibyte character here. --- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences --- before decoding them (see "GHC.Data.StringBuffer"). - -{-# INLINE utf8DecodeChar# #-} -utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #) -utf8DecodeChar# indexWord8# = - let !ch0 = word2Int# (indexWord8# 0#) in - case () of - _ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #) - - | isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) -> - let !ch1 = word2Int# (indexWord8# 1#) in - if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else - (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +# - (ch1 -# 0x80#)), - 2# #) - - | isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) -> - let !ch1 = word2Int# (indexWord8# 1#) in - if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else - let !ch2 = word2Int# (indexWord8# 2#) in - if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else - (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +# - ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +# - (ch2 -# 0x80#)), - 3# #) - - | isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) -> - let !ch1 = word2Int# (indexWord8# 1#) in - if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else - let !ch2 = word2Int# (indexWord8# 2#) in - if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else - let !ch3 = word2Int# (indexWord8# 3#) in - if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else - (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +# - ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +# - ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +# - (ch3 -# 0x80#)), - 4# #) - - | otherwise -> fail 1# - where - -- all invalid sequences end up here: - fail :: Int# -> (# Char#, Int# #) - fail nBytes# = (# '\0'#, nBytes# #) - -- '\xFFFD' would be the usual replacement character, but - -- that's a valid symbol in Haskell, so will result in a - -- confusing parse error later on. Instead we use '\0' which - -- will signal a lexer error immediately. - -utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #) -utf8DecodeCharAddr# a# off# = - utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#)) - -utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #) -utf8DecodeCharByteArray# ba# off# = - utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#)) - -utf8DecodeChar :: Ptr Word8 -> (Char, Int) -utf8DecodeChar !(Ptr a#) = - case utf8DecodeCharAddr# a# 0# of - (# c#, nBytes# #) -> ( C# c#, I# nBytes# ) - --- UTF-8 is cleverly designed so that we can always figure out where --- the start of the current character is, given any position in a --- stream. This function finds the start of the previous character, --- assuming there *is* a previous character. -utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8) -utf8PrevChar p = utf8CharStart (p `plusPtr` (-1)) - -utf8CharStart :: Ptr Word8 -> IO (Ptr Word8) -utf8CharStart p = go p - where go p = do w <- peek p - if w >= 0x80 && w < 0xC0 - then go (p `plusPtr` (-1)) - else return p - -{-# INLINE utf8DecodeLazy# #-} -utf8DecodeLazy# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char] -utf8DecodeLazy# retain decodeChar# len# - = unpack 0# - where - unpack i# - | isTrue# (i# >=# len#) = retain >> return [] - | otherwise = - case decodeChar# i# of - (# c#, nBytes# #) -> do - rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#) - return (C# c# : rest) - -utf8DecodeByteString :: ByteString -> [Char] -utf8DecodeByteString (BS.PS fptr offset len) - = utf8DecodeStringLazy fptr offset len - -utf8DecodeStringLazy :: ForeignPtr Word8 -> Int -> Int -> [Char] -utf8DecodeStringLazy fp offset (I# len#) - = unsafeDupablePerformIO $ do - let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset - utf8DecodeLazy# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len# --- Note that since utf8DecodeLazy# returns a thunk the lifetime of the --- ForeignPtr actually needs to be longer than the lexical lifetime --- withForeignPtr would provide here. That's why we use touchForeignPtr to --- keep the fp alive until the last character has actually been decoded. - -utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering -utf8CompareShortByteString (SBS a1) (SBS a2) = go 0# 0# - -- UTF-8 has the property that sorting by bytes values also sorts by - -- code-points. - -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property - -- doesn't hold and we must explicitly check this case here. - -- Note that decoding every code point would also work but it would be much - -- more costly. - where - !sz1 = sizeofByteArray# a1 - !sz2 = sizeofByteArray# a2 - go off1 off2 - | isTrue# ((off1 >=# sz1) `andI#` (off2 >=# sz2)) = EQ - | isTrue# (off1 >=# sz1) = LT - | isTrue# (off2 >=# sz2) = GT - | otherwise = - let !b1_1 = indexWord8Array# a1 off1 - !b2_1 = indexWord8Array# a2 off2 - in case b1_1 of - 0xC0## -> case b2_1 of - 0xC0## -> go (off1 +# 1#) (off2 +# 1#) - _ -> case indexWord8Array# a1 (off1 +# 1#) of - 0x80## -> LT - _ -> go (off1 +# 1#) (off2 +# 1#) - _ -> case b2_1 of - 0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of - 0x80## -> GT - _ -> go (off1 +# 1#) (off2 +# 1#) - _ | isTrue# (b1_1 `gtWord#` b2_1) -> GT - | isTrue# (b1_1 `ltWord#` b2_1) -> LT - | otherwise -> go (off1 +# 1#) (off2 +# 1#) - -utf8DecodeShortByteString :: ShortByteString -> [Char] -utf8DecodeShortByteString (SBS ba#) - = unsafeDupablePerformIO $ - let len# = sizeofByteArray# ba# in - utf8DecodeLazy# (return ()) (utf8DecodeCharByteArray# ba#) len# - -countUTF8Chars :: ShortByteString -> IO Int -countUTF8Chars (SBS ba) = go 0# 0# - where - len# = sizeofByteArray# ba - go i# n# - | isTrue# (i# >=# len#) = - return (I# n#) - | otherwise = do - case utf8DecodeCharByteArray# ba i# of - (# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#) - -{-# INLINE utf8EncodeChar #-} -utf8EncodeChar :: (Int# -> Word# -> State# s -> State# s) - -> Char -> ST s Int -utf8EncodeChar write# c = - let x = ord c in - case () of - _ | x > 0 && x <= 0x007f -> do - write 0 x - return 1 - -- NB. '\0' is encoded as '\xC0\x80', not '\0'. This is so that we - -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8). - | x <= 0x07ff -> do - write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F)) - write 1 (0x80 .|. (x .&. 0x3F)) - return 2 - | x <= 0xffff -> do - write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F) - write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F) - write 2 (0x80 .|. (x .&. 0x3F)) - return 3 - | otherwise -> do - write 0 (0xF0 .|. (x `shiftR` 18)) - write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F)) - write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F)) - write 3 (0x80 .|. (x .&. 0x3F)) - return 4 - where - {-# INLINE write #-} - write (I# off#) (I# c#) = ST $ \s -> - case write# off# (int2Word# c#) s of - s -> (# s, () #) - -utf8EncodeString :: Ptr Word8 -> String -> IO () -utf8EncodeString (Ptr a#) str = go a# str - where go !_ [] = return () - go a# (c:cs) = do - I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c - go (a# `plusAddr#` off#) cs - -utf8EncodeShortByteString :: String -> IO ShortByteString -utf8EncodeShortByteString str = IO $ \s -> - case utf8EncodedLength str of { I# len# -> - case newByteArray# len# s of { (# s, mba# #) -> - case go mba# 0# str of { ST f_go -> - case f_go s of { (# s, () #) -> - case unsafeFreezeByteArray# mba# s of { (# s, ba# #) -> - (# s, SBS ba# #) }}}}} - where - go _ _ [] = return () - go mba# i# (c:cs) = do - I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c - go mba# (i# +# off#) cs - -utf8EncodedLength :: String -> Int -utf8EncodedLength str = go 0 str - where go !n [] = n - go n (c:cs) - | ord c > 0 && ord c <= 0x007f = go (n+1) cs - | ord c <= 0x07ff = go (n+2) cs - | ord c <= 0xffff = go (n+3) cs - | otherwise = go (n+4) cs - --- ----------------------------------------------------------------------------- --- Note [Z-Encoding] --- ~~~~~~~~~~~~~~~~~ - -{- -This is the main name-encoding and decoding function. It encodes any -string into a string that is acceptable as a C name. This is done -right before we emit a symbol name into the compiled C or asm code. -Z-encoding of strings is cached in the FastString interface, so we -never encode the same string more than once. - -The basic encoding scheme is this. - -* Tuples (,,,) are coded as Z3T - -* Alphabetic characters (upper and lower) and digits - all translate to themselves; - except 'Z', which translates to 'ZZ' - and 'z', which translates to 'zz' - We need both so that we can preserve the variable/tycon distinction - -* Most other printable characters translate to 'zx' or 'Zx' for some - alphabetic character x - -* The others translate as 'znnnU' where 'nnn' is the decimal number - of the character - - Before After - -------------------------- - Trak Trak - foo_wib foozuwib - > zg - >1 zg1 - foo# foozh - foo## foozhzh - foo##1 foozhzh1 - fooZ fooZZ - :+ ZCzp - () Z0T 0-tuple - (,,,,) Z5T 5-tuple - (# #) Z1H unboxed 1-tuple (note the space) - (#,,,,#) Z5H unboxed 5-tuple - (NB: There is no Z1T nor Z0H.) --} - -type UserString = String -- As the user typed it -type EncodedString = String -- Encoded form - - -zEncodeString :: UserString -> EncodedString -zEncodeString cs = case maybe_tuple cs of - Just n -> n -- Tuples go to Z2T etc - Nothing -> go cs - where - go [] = [] - go (c:cs) = encode_digit_ch c ++ go' cs - go' [] = [] - go' (c:cs) = encode_ch c ++ go' cs - -unencodedChar :: Char -> Bool -- True for chars that don't need encoding -unencodedChar 'Z' = False -unencodedChar 'z' = False -unencodedChar c = c >= 'a' && c <= 'z' - || c >= 'A' && c <= 'Z' - || c >= '0' && c <= '9' - --- If a digit is at the start of a symbol then we need to encode it. --- Otherwise package names like 9pH-0.1 give linker errors. -encode_digit_ch :: Char -> EncodedString -encode_digit_ch c | c >= '0' && c <= '9' = encode_as_unicode_char c -encode_digit_ch c | otherwise = encode_ch c - -encode_ch :: Char -> EncodedString -encode_ch c | unencodedChar c = [c] -- Common case first - --- Constructors -encode_ch '(' = "ZL" -- Needed for things like (,), and (->) -encode_ch ')' = "ZR" -- For symmetry with ( -encode_ch '[' = "ZM" -encode_ch ']' = "ZN" -encode_ch ':' = "ZC" -encode_ch 'Z' = "ZZ" - --- Variables -encode_ch 'z' = "zz" -encode_ch '&' = "za" -encode_ch '|' = "zb" -encode_ch '^' = "zc" -encode_ch '$' = "zd" -encode_ch '=' = "ze" -encode_ch '>' = "zg" -encode_ch '#' = "zh" -encode_ch '.' = "zi" -encode_ch '<' = "zl" -encode_ch '-' = "zm" -encode_ch '!' = "zn" -encode_ch '+' = "zp" -encode_ch '\'' = "zq" -encode_ch '\\' = "zr" -encode_ch '/' = "zs" -encode_ch '*' = "zt" -encode_ch '_' = "zu" -encode_ch '%' = "zv" -encode_ch c = encode_as_unicode_char c - -encode_as_unicode_char :: Char -> EncodedString -encode_as_unicode_char c = 'z' : if isDigit (head hex_str) then hex_str - else '0':hex_str - where hex_str = showHex (ord c) "U" - -- ToDo: we could improve the encoding here in various ways. - -- eg. strings of unicode characters come out as 'z1234Uz5678U', we - -- could remove the 'U' in the middle (the 'z' works as a separator). - -zDecodeString :: EncodedString -> UserString -zDecodeString [] = [] -zDecodeString ('Z' : d : rest) - | isDigit d = decode_tuple d rest - | otherwise = decode_upper d : zDecodeString rest -zDecodeString ('z' : d : rest) - | isDigit d = decode_num_esc d rest - | otherwise = decode_lower d : zDecodeString rest -zDecodeString (c : rest) = c : zDecodeString rest - -decode_upper, decode_lower :: Char -> Char - -decode_upper 'L' = '(' -decode_upper 'R' = ')' -decode_upper 'M' = '[' -decode_upper 'N' = ']' -decode_upper 'C' = ':' -decode_upper 'Z' = 'Z' -decode_upper ch = {-pprTrace "decode_upper" (char ch)-} ch - -decode_lower 'z' = 'z' -decode_lower 'a' = '&' -decode_lower 'b' = '|' -decode_lower 'c' = '^' -decode_lower 'd' = '$' -decode_lower 'e' = '=' -decode_lower 'g' = '>' -decode_lower 'h' = '#' -decode_lower 'i' = '.' -decode_lower 'l' = '<' -decode_lower 'm' = '-' -decode_lower 'n' = '!' -decode_lower 'p' = '+' -decode_lower 'q' = '\'' -decode_lower 'r' = '\\' -decode_lower 's' = '/' -decode_lower 't' = '*' -decode_lower 'u' = '_' -decode_lower 'v' = '%' -decode_lower ch = {-pprTrace "decode_lower" (char ch)-} ch - --- Characters not having a specific code are coded as z224U (in hex) -decode_num_esc :: Char -> EncodedString -> UserString -decode_num_esc d rest - = go (digitToInt d) rest - where - go n (c : rest) | isHexDigit c = go (16*n + digitToInt c) rest - go n ('U' : rest) = chr n : zDecodeString rest - go n other = error ("decode_num_esc: " ++ show n ++ ' ':other) - -decode_tuple :: Char -> EncodedString -> UserString -decode_tuple d rest - = go (digitToInt d) rest - where - -- NB. recurse back to zDecodeString after decoding the tuple, because - -- the tuple might be embedded in a longer name. - go n (c : rest) | isDigit c = go (10*n + digitToInt c) rest - go 0 ('T':rest) = "()" ++ zDecodeString rest - go n ('T':rest) = '(' : replicate (n-1) ',' ++ ")" ++ zDecodeString rest - go 1 ('H':rest) = "(# #)" ++ zDecodeString rest - go n ('H':rest) = '(' : '#' : replicate (n-1) ',' ++ "#)" ++ zDecodeString rest - go n other = error ("decode_tuple: " ++ show n ++ ' ':other) - -{- -Tuples are encoded as - Z3T or Z3H -for 3-tuples or unboxed 3-tuples respectively. No other encoding starts - Z<digit> - -* "(# #)" is the tycon for an unboxed 1-tuple (not 0-tuple) - There are no unboxed 0-tuples. - -* "()" is the tycon for a boxed 0-tuple. - There are no boxed 1-tuples. --} - -maybe_tuple :: UserString -> Maybe EncodedString - -maybe_tuple "(# #)" = Just("Z1H") -maybe_tuple ('(' : '#' : cs) = case count_commas (0::Int) cs of - (n, '#' : ')' : _) -> Just ('Z' : shows (n+1) "H") - _ -> Nothing -maybe_tuple "()" = Just("Z0T") -maybe_tuple ('(' : cs) = case count_commas (0::Int) cs of - (n, ')' : _) -> Just ('Z' : shows (n+1) "T") - _ -> Nothing -maybe_tuple _ = Nothing - -count_commas :: Int -> String -> (Int, String) -count_commas n (',' : cs) = count_commas (n+1) cs -count_commas n cs = (n,cs) - - -{- -************************************************************************ -* * - Base 62 -* * -************************************************************************ - -Note [Base 62 encoding 128-bit integers] -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Instead of base-62 encoding a single 128-bit integer -(ceil(21.49) characters), we'll base-62 a pair of 64-bit integers -(2 * ceil(10.75) characters). Luckily for us, it's the same number of -characters! --} - --------------------------------------------------------------------------- --- Base 62 - --- The base-62 code is based off of 'locators' --- ((c) Operational Dynamics Consulting, BSD3 licensed) - --- | Size of a 64-bit word when written as a base-62 string -word64Base62Len :: Int -word64Base62Len = 11 - --- | Converts a 64-bit word into a base-62 string -toBase62Padded :: Word64 -> String -toBase62Padded w = pad ++ str - where - pad = replicate len '0' - len = word64Base62Len - length str -- 11 == ceil(64 / lg 62) - str = toBase62 w - -toBase62 :: Word64 -> String -toBase62 w = showIntAtBase 62 represent w "" - where - represent :: Int -> Char - represent x - | x < 10 = Char.chr (48 + x) - | x < 36 = Char.chr (65 + x - 10) - | x < 62 = Char.chr (97 + x - 36) - | otherwise = error "represent (base 62): impossible!" diff --git a/compiler/ghc.cabal.in b/compiler/ghc.cabal.in index d12c2ca45e..4efae27e97 100644 --- a/compiler/ghc.cabal.in +++ b/compiler/ghc.cabal.in @@ -547,7 +547,6 @@ Library GHC.Data.BooleanFormula GHC.Utils.BufHandle GHC.Data.Graph.Directed - GHC.Utils.Encoding GHC.Utils.IO.Unsafe GHC.Data.FastMutInt GHC.Data.FastString |