diff options
| author | Ian Lynagh <igloo@earth.li> | 2012-06-30 00:45:42 +0100 |
|---|---|---|
| committer | Ian Lynagh <igloo@earth.li> | 2012-06-30 00:45:42 +0100 |
| commit | 39d766ee942a5395a103e9bfd60bac7062ceb70d (patch) | |
| tree | 75ce3b2a91281da2121089ef6fadbef1fa71d96a /compiler/utils | |
| parent | 18e62c55ad27ce16cedc7e4ffd1d71e2d1d73dc1 (diff) | |
| download | haskell-39d766ee942a5395a103e9bfd60bac7062ceb70d.tar.gz | |
Copy Data.HashTable's hashString into our Util module
Data.HashTable is now deprecated and will soon be removed, but
deSugar/Coverage.lhs uses hashString.
Diffstat (limited to 'compiler/utils')
| -rw-r--r-- | compiler/utils/Util.lhs | 73 |
1 files changed, 72 insertions, 1 deletions
diff --git a/compiler/utils/Util.lhs b/compiler/utils/Util.lhs index 0d2f7418b6..9d12946052 100644 --- a/compiler/utils/Util.lhs +++ b/compiler/utils/Util.lhs @@ -92,7 +92,10 @@ module Util ( abstractConstr, abstractDataType, mkNoRepType, -- * Utils for printing C code - charToC + charToC, + + -- * Hashing + hashString, ) where #include "HsVersions.h" @@ -115,6 +118,7 @@ import System.Directory ( doesDirectoryExist, getModificationTime ) import System.FilePath import Data.Char ( isUpper, isAlphaNum, isSpace, chr, ord, isDigit ) +import Data.Int import Data.Ratio ( (%) ) import Data.Ord ( comparing ) import Data.Bits @@ -1049,3 +1053,70 @@ charToC w = chr (ord '0' + ord c `mod` 8)] \end{code} +%************************************************************************ +%* * +\subsection[Utils-Hashing]{Utils for hashing} +%* * +%************************************************************************ + +\begin{code} +-- | A sample hash function for Strings. We keep multiplying by the +-- golden ratio and adding. The implementation is: +-- +-- > hashString = foldl' f golden +-- > where f m c = fromIntegral (ord c) * magic + hashInt32 m +-- > magic = 0xdeadbeef +-- +-- Where hashInt32 works just as hashInt shown above. +-- +-- Knuth argues that repeated multiplication by the golden ratio +-- will minimize gaps in the hash space, and thus it's a good choice +-- for combining together multiple keys to form one. +-- +-- Here we know that individual characters c are often small, and this +-- produces frequent collisions if we use ord c alone. A +-- particular problem are the shorter low ASCII and ISO-8859-1 +-- character strings. We pre-multiply by a magic twiddle factor to +-- obtain a good distribution. In fact, given the following test: +-- +-- > testp :: Int32 -> Int +-- > testp k = (n - ) . length . group . sort . map hs . take n $ ls +-- > where ls = [] : [c : l | l <- ls, c <- ['\0'..'\xff']] +-- > hs = foldl' f golden +-- > f m c = fromIntegral (ord c) * k + hashInt32 m +-- > n = 100000 +-- +-- We discover that testp magic = 0. +hashString :: String -> Int32 +hashString = foldl' f golden + where f m c = fromIntegral (ord c) * magic + hashInt32 m + magic = 0xdeadbeef + +golden :: Int32 +golden = 1013904242 -- = round ((sqrt 5 - 1) * 2^32) :: Int32 +-- was -1640531527 = round ((sqrt 5 - 1) * 2^31) :: Int32 +-- but that has bad mulHi properties (even adding 2^32 to get its inverse) +-- Whereas the above works well and contains no hash duplications for +-- [-32767..65536] + +-- | A sample (and useful) hash function for Int32, +-- implemented by extracting the uppermost 32 bits of the 64-bit +-- result of multiplying by a 33-bit constant. The constant is from +-- Knuth, derived from the golden ratio: +-- +-- > golden = round ((sqrt 5 - 1) * 2^32) +-- +-- We get good key uniqueness on small inputs +-- (a problem with previous versions): +-- (length $ group $ sort $ map hashInt32 [-32767..65536]) == 65536 + 32768 +-- +hashInt32 :: Int32 -> Int32 +hashInt32 x = mulHi x golden + x + +-- hi 32 bits of a x-bit * 32 bit -> 64-bit multiply +mulHi :: Int32 -> Int32 -> Int32 +mulHi a b = fromIntegral (r `shiftR` 32) + where r :: Int64 + r = fromIntegral a * fromIntegral b +\end{code} + |
