src/tools/PerfectHash.pm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380

#----------------------------------------------------------------------
#
# PerfectHash.pm
#    Perl module that constructs minimal perfect hash functions
#
# This code constructs a minimal perfect hash function for the given
# set of keys, using an algorithm described in
# "An optimal algorithm for generating minimal perfect hash functions"
# by Czech, Havas and Majewski in Information Processing Letters,
# 43(5):256-264, October 1992.
# This implementation is loosely based on NetBSD's "nbperf",
# which was written by Joerg Sonnenberger.
#
# The resulting hash function is perfect in the sense that if the presented
# key is one of the original set, it will return the key's index in the set
# (in range 0..N-1).  However, the caller must still verify the match,
# as false positives are possible.  Also, the hash function may return
# values that are out of range (negative or >= N), due to summing unrelated
# hashtable entries.  This indicates that the presented key is definitely
# not in the set.
#
#
# Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
# Portions Copyright (c) 1994, Regents of the University of California
#
# src/tools/PerfectHash.pm
#
#----------------------------------------------------------------------

package PerfectHash;

use strict;
use warnings;


# At runtime, we'll compute two simple hash functions of the input key,
# and use them to index into a mapping table.  The hash functions are just
# multiply-and-add in uint32 arithmetic, with different multipliers and
# initial seeds.  All the complexity in this module is concerned with
# selecting hash parameters that will work and building the mapping table.

# We support making case-insensitive hash functions, though this only
# works for a strict-ASCII interpretation of case insensitivity,
# ie, A-Z maps onto a-z and nothing else.
my $case_fold = 0;


#
# Construct a C function implementing a perfect hash for the given keys.
# The C function definition is returned as a string.
#
# The keys should be passed as an array reference.  They can be any set
# of Perl strings; it is caller's responsibility that there not be any
# duplicates.  (Note that the "strings" can be binary data, but hashing
# e.g. OIDs has endianness hazards that callers must overcome.)
#
# The name to use for the function is specified as the second argument.
# It will be a global function by default, but the caller may prepend
# "static " to the result string if it wants a static function.
#
# Additional options can be specified as keyword-style arguments:
#
# case_fold => bool
# If specified as true, the hash function is case-insensitive, for the
# limited idea of case-insensitivity explained above.
#
# fixed_key_length => N
# If specified, all keys are assumed to have length N bytes, and the
# hash function signature will be just "int f(const void *key)"
# rather than "int f(const void *key, size_t keylen)".
#
sub generate_hash_function
{
	my ($keys_ref, $funcname, %options) = @_;

	# It's not worth passing this around as a parameter; just use a global.
	$case_fold = $options{case_fold} || 0;

	# Try different hash function parameters until we find a set that works
	# for these keys.  The multipliers are chosen to be primes that are cheap
	# to calculate via shift-and-add, so don't change them without care.
	# (Commonly, random seeds are tried, but we want reproducible results
	# from this program so we don't do that.)
	my $hash_mult1 = 257;
	my $hash_mult2;
	my $hash_seed1;
	my $hash_seed2;
	my @subresult;
  FIND_PARAMS:
	for ($hash_seed1 = 0; $hash_seed1 < 10; $hash_seed1++)
	{

		for ($hash_seed2 = 0; $hash_seed2 < 10; $hash_seed2++)
		{
			foreach (17, 31, 127, 8191)
			{
				$hash_mult2 = $_;    # "foreach $hash_mult2" doesn't work
				@subresult = _construct_hash_table(
					$keys_ref,   $hash_mult1, $hash_mult2,
					$hash_seed1, $hash_seed2);
				last FIND_PARAMS if @subresult;
			}
		}
	}

	# Choke if we couldn't find a workable set of parameters.
	die "failed to generate perfect hash" if !@subresult;

	# Extract info from _construct_hash_table's result array.
	my $elemtype = $subresult[0];
	my @hashtab  = @{ $subresult[1] };
	my $nhash    = scalar(@hashtab);

	# OK, construct the hash function definition including the hash table.
	my $f = '';
	$f .= sprintf "int\n";
	if (defined $options{fixed_key_length})
	{
		$f .= sprintf "%s(const void *key)\n{\n", $funcname;
	}
	else
	{
		$f .= sprintf "%s(const void *key, size_t keylen)\n{\n", $funcname;
	}
	$f .= sprintf "\tstatic const %s h[%d] = {\n\t\t", $elemtype, $nhash;
	for (my $i = 0; $i < $nhash; $i++)
	{
		# Hash element.
		$f .= sprintf "%d", $hashtab[$i];
		next if ($i == $nhash - 1);

		# Optional indentation and newline, with eight items per line.
		$f .= sprintf ",%s",
		  ($i % 8 == 7 ? "\n\t\t" : ' ' x (6 - length($hashtab[$i])));
	}
	$f .= sprintf "\n" if ($nhash % 8 != 0);
	$f .= sprintf "\t};\n\n";
	$f .= sprintf "\tconst unsigned char *k = (const unsigned char *) key;\n";
	$f .= sprintf "\tsize_t\t\tkeylen = %d;\n", $options{fixed_key_length}
	  if (defined $options{fixed_key_length});
	$f .= sprintf "\tuint32\t\ta = %d;\n",   $hash_seed1;
	$f .= sprintf "\tuint32\t\tb = %d;\n\n", $hash_seed2;
	$f .= sprintf "\twhile (keylen--)\n\t{\n";
	$f .= sprintf "\t\tunsigned char c = *k++";
	$f .= sprintf " | 0x20" if $case_fold;    # see comment below
	$f .= sprintf ";\n\n";
	$f .= sprintf "\t\ta = a * %d + c;\n", $hash_mult1;
	$f .= sprintf "\t\tb = b * %d + c;\n", $hash_mult2;
	$f .= sprintf "\t}\n";
	$f .= sprintf "\treturn h[a %% %d] + h[b %% %d];\n", $nhash, $nhash;
	$f .= sprintf "}\n";

	return $f;
}


# Calculate a hash function as the run-time code will do.
#
# If we are making a case-insensitive hash function, we implement that
# by OR'ing 0x20 into each byte of the key.  This correctly transforms
# upper-case ASCII into lower-case ASCII, while not changing digits or
# dollar signs.  (It does change '_', as well as other characters not
# likely to appear in keywords; this has little effect on the hash's
# ability to discriminate keywords.)
sub _calc_hash
{
	my ($key, $mult, $seed) = @_;

	my $result = $seed;
	for my $c (split //, $key)
	{
		my $cn = ord($c);
		$cn |= 0x20 if $case_fold;
		$result = ($result * $mult + $cn) % 4294967296;
	}
	return $result;
}


# Attempt to construct a mapping table for a minimal perfect hash function
# for the given keys, using the specified hash parameters.
#
# Returns an array containing the mapping table element type name as the
# first element, and a ref to an array of the table values as the second.
#
# Returns an empty array on failure; then caller should choose different
# hash parameter(s) and try again.
sub _construct_hash_table
{
	my ($keys_ref, $hash_mult1, $hash_mult2, $hash_seed1, $hash_seed2) = @_;
	my @keys = @{$keys_ref};

	# This algorithm is based on a graph whose edges correspond to the
	# keys and whose vertices correspond to entries of the mapping table.
	# A key's edge links the two vertices whose indexes are the outputs of
	# the two hash functions for that key.  For K keys, the mapping
	# table must have at least 2*K+1 entries, guaranteeing that there's at
	# least one unused entry.  (In principle, larger mapping tables make it
	# easier to find a workable hash and increase the number of inputs that
	# can be rejected due to touching unused hashtable entries.  In practice,
	# neither effect seems strong enough to justify using a larger table.)
	my $nedges = scalar @keys;       # number of edges
	my $nverts = 2 * $nedges + 1;    # number of vertices

	# However, it would be very bad if $nverts were exactly equal to either
	# $hash_mult1 or $hash_mult2: effectively, that hash function would be
	# sensitive to only the last byte of each key.  Cases where $nverts is a
	# multiple of either multiplier likewise lose information.  (But $nverts
	# can't actually divide them, if they've been intelligently chosen as
	# primes.)  We can avoid such problems by adjusting the table size.
	while ($nverts % $hash_mult1 == 0
		|| $nverts % $hash_mult2 == 0)
	{
		$nverts++;
	}

	# Initialize the array of edges.
	my @E = ();
	foreach my $kw (@keys)
	{
		# Calculate hashes for this key.
		# The hashes are immediately reduced modulo the mapping table size.
		my $hash1 = _calc_hash($kw, $hash_mult1, $hash_seed1) % $nverts;
		my $hash2 = _calc_hash($kw, $hash_mult2, $hash_seed2) % $nverts;

		# If the two hashes are the same for any key, we have to fail
		# since this edge would itself form a cycle in the graph.
		return () if $hash1 == $hash2;

		# Add the edge for this key.
		push @E, { left => $hash1, right => $hash2 };
	}

	# Initialize the array of vertices, giving them all empty lists
	# of associated edges.  (The lists will be hashes of edge numbers.)
	my @V = ();
	for (my $v = 0; $v < $nverts; $v++)
	{
		push @V, { edges => {} };
	}

	# Insert each edge in the lists of edges connected to its vertices.
	for (my $e = 0; $e < $nedges; $e++)
	{
		my $v = $E[$e]{left};
		$V[$v]{edges}->{$e} = 1;

		$v = $E[$e]{right};
		$V[$v]{edges}->{$e} = 1;
	}

	# Now we attempt to prove the graph acyclic.
	# A cycle-free graph is either empty or has some vertex of degree 1.
	# Removing the edge attached to that vertex doesn't change this property,
	# so doing that repeatedly will reduce the size of the graph.
	# If the graph is empty at the end of the process, it was acyclic.
	# We track the order of edge removal so that the next phase can process
	# them in reverse order of removal.
	my @output_order = ();

	# Consider each vertex as a possible starting point for edge-removal.
	for (my $startv = 0; $startv < $nverts; $startv++)
	{
		my $v = $startv;

		# If vertex v is of degree 1 (i.e. exactly 1 edge connects to it),
		# remove that edge, and then consider the edge's other vertex to see
		# if it is now of degree 1.  The inner loop repeats until reaching a
		# vertex not of degree 1.
		while (scalar(keys(%{ $V[$v]{edges} })) == 1)
		{
			# Unlink its only edge.
			my $e = (keys(%{ $V[$v]{edges} }))[0];
			delete($V[$v]{edges}->{$e});

			# Unlink the edge from its other vertex, too.
			my $v2 = $E[$e]{left};
			$v2 = $E[$e]{right} if ($v2 == $v);
			delete($V[$v2]{edges}->{$e});

			# Push e onto the front of the output-order list.
			unshift @output_order, $e;

			# Consider v2 on next iteration of inner loop.
			$v = $v2;
		}
	}

	# We succeeded only if all edges were removed from the graph.
	return () if (scalar(@output_order) != $nedges);

	# OK, build the hash table of size $nverts.
	my @hashtab = (0) x $nverts;
	# We need a "visited" flag array in this step, too.
	my @visited = (0) x $nverts;

	# The goal is that for any key, the sum of the hash table entries for
	# its first and second hash values is the desired output (i.e., the key
	# number).  By assigning hash table values in the selected edge order,
	# we can guarantee that that's true.  This works because the edge first
	# removed from the graph (and hence last to be visited here) must have
	# at least one vertex it shared with no other edge; hence it will have at
	# least one vertex (hashtable entry) still unvisited when we reach it here,
	# and we can assign that unvisited entry a value that makes the sum come
	# out as we wish.  By induction, the same holds for all the other edges.
	foreach my $e (@output_order)
	{
		my $l = $E[$e]{left};
		my $r = $E[$e]{right};
		if (!$visited[$l])
		{
			# $hashtab[$r] might be zero, or some previously assigned value.
			$hashtab[$l] = $e - $hashtab[$r];
		}
		else
		{
			die "oops, doubly used hashtab entry" if $visited[$r];
			# $hashtab[$l] might be zero, or some previously assigned value.
			$hashtab[$r] = $e - $hashtab[$l];
		}
		# Now freeze both of these hashtab entries.
		$visited[$l] = 1;
		$visited[$r] = 1;
	}

	# Detect range of values needed in hash table.
	my $hmin = $nedges;
	my $hmax = 0;
	for (my $v = 0; $v < $nverts; $v++)
	{
		$hmin = $hashtab[$v] if $hashtab[$v] < $hmin;
		$hmax = $hashtab[$v] if $hashtab[$v] > $hmax;
	}

	# Choose width of hashtable entries.  In addition to the actual values,
	# we need to be able to store a flag for unused entries, and we wish to
	# have the property that adding any other entry value to the flag gives
	# an out-of-range result (>= $nedges).
	my $elemtype;
	my $unused_flag;

	if (   $hmin >= -0x7F
		&& $hmax <= 0x7F
		&& $hmin + 0x7F >= $nedges)
	{
		# int8 will work
		$elemtype    = 'int8';
		$unused_flag = 0x7F;
	}
	elsif ($hmin >= -0x7FFF
		&& $hmax <= 0x7FFF
		&& $hmin + 0x7FFF >= $nedges)
	{
		# int16 will work
		$elemtype    = 'int16';
		$unused_flag = 0x7FFF;
	}
	elsif ($hmin >= -0x7FFFFFFF
		&& $hmax <= 0x7FFFFFFF
		&& $hmin + 0x3FFFFFFF >= $nedges)
	{
		# int32 will work
		$elemtype    = 'int32';
		$unused_flag = 0x3FFFFFFF;
	}
	else
	{
		die "hash table values too wide";
	}

	# Set any unvisited hashtable entries to $unused_flag.
	for (my $v = 0; $v < $nverts; $v++)
	{
		$hashtab[$v] = $unused_flag if !$visited[$v];
	}

	return ($elemtype, \@hashtab);
}

1;