summaryrefslogtreecommitdiff
path: root/compiler/utils
diff options
context:
space:
mode:
authorIan Lynagh <igloo@earth.li>2012-06-30 00:45:42 +0100
committerIan Lynagh <igloo@earth.li>2012-06-30 00:45:42 +0100
commit39d766ee942a5395a103e9bfd60bac7062ceb70d (patch)
tree75ce3b2a91281da2121089ef6fadbef1fa71d96a /compiler/utils
parent18e62c55ad27ce16cedc7e4ffd1d71e2d1d73dc1 (diff)
downloadhaskell-39d766ee942a5395a103e9bfd60bac7062ceb70d.tar.gz
Copy Data.HashTable's hashString into our Util module
Data.HashTable is now deprecated and will soon be removed, but deSugar/Coverage.lhs uses hashString.
Diffstat (limited to 'compiler/utils')
-rw-r--r--compiler/utils/Util.lhs73
1 files changed, 72 insertions, 1 deletions
diff --git a/compiler/utils/Util.lhs b/compiler/utils/Util.lhs
index 0d2f7418b6..9d12946052 100644
--- a/compiler/utils/Util.lhs
+++ b/compiler/utils/Util.lhs
@@ -92,7 +92,10 @@ module Util (
abstractConstr, abstractDataType, mkNoRepType,
-- * Utils for printing C code
- charToC
+ charToC,
+
+ -- * Hashing
+ hashString,
) where
#include "HsVersions.h"
@@ -115,6 +118,7 @@ import System.Directory ( doesDirectoryExist, getModificationTime )
import System.FilePath
import Data.Char ( isUpper, isAlphaNum, isSpace, chr, ord, isDigit )
+import Data.Int
import Data.Ratio ( (%) )
import Data.Ord ( comparing )
import Data.Bits
@@ -1049,3 +1053,70 @@ charToC w =
chr (ord '0' + ord c `mod` 8)]
\end{code}
+%************************************************************************
+%* *
+\subsection[Utils-Hashing]{Utils for hashing}
+%* *
+%************************************************************************
+
+\begin{code}
+-- | A sample hash function for Strings. We keep multiplying by the
+-- golden ratio and adding. The implementation is:
+--
+-- > hashString = foldl' f golden
+-- > where f m c = fromIntegral (ord c) * magic + hashInt32 m
+-- > magic = 0xdeadbeef
+--
+-- Where hashInt32 works just as hashInt shown above.
+--
+-- Knuth argues that repeated multiplication by the golden ratio
+-- will minimize gaps in the hash space, and thus it's a good choice
+-- for combining together multiple keys to form one.
+--
+-- Here we know that individual characters c are often small, and this
+-- produces frequent collisions if we use ord c alone. A
+-- particular problem are the shorter low ASCII and ISO-8859-1
+-- character strings. We pre-multiply by a magic twiddle factor to
+-- obtain a good distribution. In fact, given the following test:
+--
+-- > testp :: Int32 -> Int
+-- > testp k = (n - ) . length . group . sort . map hs . take n $ ls
+-- > where ls = [] : [c : l | l <- ls, c <- ['\0'..'\xff']]
+-- > hs = foldl' f golden
+-- > f m c = fromIntegral (ord c) * k + hashInt32 m
+-- > n = 100000
+--
+-- We discover that testp magic = 0.
+hashString :: String -> Int32
+hashString = foldl' f golden
+ where f m c = fromIntegral (ord c) * magic + hashInt32 m
+ magic = 0xdeadbeef
+
+golden :: Int32
+golden = 1013904242 -- = round ((sqrt 5 - 1) * 2^32) :: Int32
+-- was -1640531527 = round ((sqrt 5 - 1) * 2^31) :: Int32
+-- but that has bad mulHi properties (even adding 2^32 to get its inverse)
+-- Whereas the above works well and contains no hash duplications for
+-- [-32767..65536]
+
+-- | A sample (and useful) hash function for Int32,
+-- implemented by extracting the uppermost 32 bits of the 64-bit
+-- result of multiplying by a 33-bit constant. The constant is from
+-- Knuth, derived from the golden ratio:
+--
+-- > golden = round ((sqrt 5 - 1) * 2^32)
+--
+-- We get good key uniqueness on small inputs
+-- (a problem with previous versions):
+-- (length $ group $ sort $ map hashInt32 [-32767..65536]) == 65536 + 32768
+--
+hashInt32 :: Int32 -> Int32
+hashInt32 x = mulHi x golden + x
+
+-- hi 32 bits of a x-bit * 32 bit -> 64-bit multiply
+mulHi :: Int32 -> Int32 -> Int32
+mulHi a b = fromIntegral (r `shiftR` 32)
+ where r :: Int64
+ r = fromIntegral a * fromIntegral b
+\end{code}
+