summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSylvain Henry <sylvain@haskus.fr>2019-02-11 17:40:00 +0100
committerMarge Bot <ben+marge-bot@smart-cactus.org>2019-02-14 02:29:54 -0500
commit1d9a1d9fb8fe0a1fea2c44c4246f102ff3e1f3a3 (patch)
tree4abf3da5c8a8d5fdd88903613d2ce42346e4943f
parent0f1eb88c93143359fa671bb72aceebc299c87a95 (diff)
downloadhaskell-1d9a1d9fb8fe0a1fea2c44c4246f102ff3e1f3a3.tar.gz
NCG: fast compilation of very large strings (#16190)
This patch adds an optimization into the NCG: for large strings (threshold configurable via -fbinary-blob-threshold=NNN flag), instead of printing `.asciz "..."` in the generated ASM source, we print `.incbin "tmpXXX.dat"` and we dump the contents of the string into a temporary "tmpXXX.dat" file. See the note for more details.
-rw-r--r--compiler/main/DynFlags.hs6
-rw-r--r--compiler/nativeGen/PPC/Ppr.hs3
-rw-r--r--compiler/nativeGen/PprBase.hs48
-rw-r--r--compiler/nativeGen/SPARC/Ppr.hs8
-rw-r--r--compiler/nativeGen/X86/Ppr.hs4
-rw-r--r--docs/users_guide/using-optimisation.rst16
6 files changed, 73 insertions, 12 deletions
diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs
index a9b4a03962..858d174c17 100644
--- a/compiler/main/DynFlags.hs
+++ b/compiler/main/DynFlags.hs
@@ -911,6 +911,9 @@ data DynFlags = DynFlags {
specConstrCount :: Maybe Int, -- ^ Max number of specialisations for any one function
specConstrRecursive :: Int, -- ^ Max number of specialisations for recursive types
-- Not optional; otherwise ForceSpecConstr can diverge.
+ binBlobThreshold :: Word, -- ^ Binary literals (e.g. strings) whose size is above
+ -- this threshold will be dumped in a binary file
+ -- by the assembler code generator (0 to disable)
liberateCaseThreshold :: Maybe Int, -- ^ Threshold for LiberateCase
floatLamArgs :: Maybe Int, -- ^ Arg count for lambda floating
-- See CoreMonad.FloatOutSwitches
@@ -1884,6 +1887,7 @@ defaultDynFlags mySettings (myLlvmTargets, myLlvmPasses) =
maxPmCheckIterations = 2000000,
ruleCheck = Nothing,
inlineCheck = Nothing,
+ binBlobThreshold = 500000, -- 500K is a good default (see #16190)
maxRelevantBinds = Just 6,
maxValidHoleFits = Just 6,
maxRefHoleFits = Just 6,
@@ -3526,6 +3530,8 @@ dynamic_flags_deps = [
setOptLevel (mb_n `orElse` 1)))
-- If the number is missing, use 1
+ , make_ord_flag defFlag "fbinary-blob-threshold"
+ (intSuffix (\n d -> d { binBlobThreshold = fromIntegral n }))
, make_ord_flag defFlag "fmax-relevant-binds"
(intSuffix (\n d -> d { maxRelevantBinds = Just n }))
diff --git a/compiler/nativeGen/PPC/Ppr.hs b/compiler/nativeGen/PPC/Ppr.hs
index 47ab07b633..c54d4430eb 100644
--- a/compiler/nativeGen/PPC/Ppr.hs
+++ b/compiler/nativeGen/PPC/Ppr.hs
@@ -125,8 +125,7 @@ pprDatas :: CmmStatics -> SDoc
pprDatas (Statics lbl dats) = vcat (pprLabel lbl : map pprData dats)
pprData :: CmmStatic -> SDoc
-pprData (CmmString str)
- = text "\t.string" <+> doubleQuotes (pprASCII str)
+pprData (CmmString str) = pprBytes str
pprData (CmmUninitialised bytes) = text ".space " <> int bytes
pprData (CmmStaticLit lit) = pprDataItem lit
diff --git a/compiler/nativeGen/PprBase.hs b/compiler/nativeGen/PprBase.hs
index afd16f8178..1f068c261b 100644
--- a/compiler/nativeGen/PprBase.hs
+++ b/compiler/nativeGen/PprBase.hs
@@ -14,6 +14,7 @@ module PprBase (
floatToBytes,
doubleToBytes,
pprASCII,
+ pprBytes,
pprSectionHeader
)
@@ -28,6 +29,7 @@ import DynFlags
import FastString
import Outputable
import Platform
+import FileCleanup
import qualified Data.Array.Unsafe as U ( castSTUArray )
import Data.Array.ST
@@ -40,6 +42,7 @@ import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import GHC.Exts
import GHC.Word
+import System.IO.Unsafe
@@ -125,6 +128,51 @@ pprASCII str
]
ord0 = 0x30 -- = ord '0'
+-- | Pretty print binary data.
+--
+-- Blobs no larger than 'binBlobThreshold' (or every blob, when the threshold
+-- is 0) are emitted inline with a ".string" directive. Larger blobs are
+-- written to a temporary ".dat" file that is included with a ".incbin"
+-- directive.
+-- See Note [Embedding large binary blobs]
+--
+-- A NULL byte is added after the binary data (in the ".incbin" case via an
+-- explicit ".byte 0").
+--
+pprBytes :: ByteString -> SDoc
+pprBytes bs = sdocWithDynFlags $ \dflags ->
+  if binBlobThreshold dflags == 0
+     || fromIntegral (BS.length bs) <= binBlobThreshold dflags
+    then text "\t.string " <> doubleQuotes (pprASCII bs)
+    else unsafePerformIO $ do
+      -- The blob is written to disk while the document is being built, which
+      -- is why this branch needs unsafePerformIO.
+      bFile <- newTempName dflags TFL_CurrentModule ".dat"
+      BS.writeFile bFile bs
+      return $ text "\t.incbin " <> doubleQuotes (text (escapePath bFile))
+            <> text "\n\t.byte 0"
+  where
+    -- The path ends up inside a double-quoted assembler string, so
+    -- backslashes (e.g. Windows path separators) and double quotes have to be
+    -- escaped; otherwise the assembler would misinterpret the file name.
+    escapePath :: FilePath -> String
+    escapePath = concatMap escapeChar
+
+    escapeChar :: Char -> String
+    escapeChar '\\' = "\\\\"
+    escapeChar '"'  = "\\\""
+    escapeChar c    = [c]
+
+{-
+Note [Embedding large binary blobs]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To embed a blob of binary data (e.g. a UTF-8 encoded string) into the generated
+code object, we have several options:
+
+ 1. Generate a ".byte" directive for each byte. This is what was done in the past
+ (see Note [Pretty print ASCII when AsmCodeGen]).
+
+ 2. Generate a single ".string"/".asciz" directive for the whole sequence of
+ bytes. Bytes in the ASCII printable range are rendered as characters and
+ other values are escaped (e.g., "\t", "\077", etc.).
+
+ 3. Create a temporary file into which we dump the binary data and generate a
+ single ".incbin" directive. The assembler will include the binary file for
+ us in the generated output object.
+
+Now the code generator uses either (2) or (3), depending on the binary blob
+size. Using (3) for small blobs adds too much overhead (see benchmark results
+in #16190), so we only do it when the size is above a threshold (500K at the
+time of writing).
+
+The threshold is configurable via the `-fbinary-blob-threshold` flag.
+
+-}
+
+
{-
Note [Pretty print ASCII when AsmCodeGen]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/compiler/nativeGen/SPARC/Ppr.hs b/compiler/nativeGen/SPARC/Ppr.hs
index 705fc31153..42ba13def4 100644
--- a/compiler/nativeGen/SPARC/Ppr.hs
+++ b/compiler/nativeGen/SPARC/Ppr.hs
@@ -49,8 +49,6 @@ import Unique ( pprUniqueAlways )
import Outputable
import Platform
import FastString
-import Data.Word
-import qualified Data.ByteString as BS
-- -----------------------------------------------------------------------------
-- Printing this stuff out
@@ -110,11 +108,7 @@ pprDatas :: CmmStatics -> SDoc
pprDatas (Statics lbl dats) = vcat (pprLabel lbl : map pprData dats)
pprData :: CmmStatic -> SDoc
-pprData (CmmString str)
- = vcat (map do1 (BS.unpack str)) $$ do1 0
- where
- do1 :: Word8 -> SDoc
- do1 w = text "\t.byte\t" <> int (fromIntegral w)
+pprData (CmmString str) = pprBytes str
pprData (CmmUninitialised bytes) = text ".skip " <> int bytes
pprData (CmmStaticLit lit) = pprDataItem lit
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index 075bb26337..83356758af 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -47,7 +47,6 @@ import FastString
import Outputable
import Data.Word
-
import Data.Bits
-- -----------------------------------------------------------------------------
@@ -154,8 +153,7 @@ pprDatas (align, (Statics lbl dats))
= vcat (pprAlign align : pprLabel lbl : map pprData dats)
pprData :: CmmStatic -> SDoc
-pprData (CmmString str)
- = ptext (sLit "\t.asciz ") <> doubleQuotes (pprASCII str)
+pprData (CmmString str) = pprBytes str
pprData (CmmUninitialised bytes)
= sdocWithPlatform $ \platform ->
diff --git a/docs/users_guide/using-optimisation.rst b/docs/users_guide/using-optimisation.rst
index cacc55325e..d6240bc5cb 100644
--- a/docs/users_guide/using-optimisation.rst
+++ b/docs/users_guide/using-optimisation.rst
@@ -1238,3 +1238,19 @@ by saying ``-fno-wombat``.
if a function definition will be inlined *at a call site*. The other option
determines if a function definition will be kept around at all for
potential inlining.
+
+.. ghc-flag:: -fbinary-blob-threshold=⟨n⟩
+ :shortdesc: *default: 500K.* Tweak assembly generator for binary blobs.
+ :type: dynamic
+ :category: optimization
+
+ :default: 500000
+
+ The native code-generator can either dump binary blobs (e.g. string
+ literals) into the assembly file (by using ".asciz" or ".string" assembler
+ directives) or it can dump them as binary data into a temporary file which
+ is then included by the assembler (using the ".incbin" assembler directive).
+
+ This flag sets the size (in bytes) threshold above which the second approach
+ is used. You can disable the second approach entirely by setting the
+ threshold to 0.