contrib/fuzzystrmatch/daitch_mokotoff_header.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219

#!/usr/bin/perl
#
# Generation of types and lookup tables for Daitch-Mokotoff soundex.
#
# Copyright (c) 2023, PostgreSQL Global Development Group
#
# This module was originally sponsored by Finance Norway /
# Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
#

use strict;
use warnings;

die "Usage: $0 OUTPUT_FILE\n" if @ARGV != 1;
my $output_file = $ARGV[0];

# Open the output file
open my $OUTPUT, '>', $output_file
  or die "Could not open output file $output_file: $!\n";

# Parse code table and generate tree for letter transitions.
my %codes;
my $table = [ {}, [ [ "", "", "" ] ] ];
while (<DATA>)
{
	chomp;
	my ($letters, $codes) = split(/\s+/);
	my @codes = map { [ split(/,/) ] } split(/\|/, $codes);

	my $key = "codes_" . join("_or_", map { join("_", @$_) } @codes);
	my $val = join(
		",\n",
		map {
			"\t{\n\t\t"
			  . join(", ", map { "\"$_\"" } @$_) . "\n\t}"
		} @codes);
	$codes{$key} = $val;

	for my $letter (split(/,/, $letters))
	{
		my $ref = $table->[0];
		# Link each character to the next in the letter combination.
		my @c = split(//, $letter);
		my $last_c = pop(@c);
		for my $c (@c)
		{
			$ref->{$c} //= [ {}, undef ];
			$ref->{$c}[0] //= {};
			$ref = $ref->{$c}[0];
		}
		# The sound code for the letter combination is stored at the last character.
		$ref->{$last_c}[1] = $key;
	}
}
close(DATA);

print $OUTPUT <<EOF;
/*
 * Constants and lookup tables for Daitch-Mokotoff Soundex
 *
 * Copyright (c) 2023, PostgreSQL Global Development Group
 *
 * This file is generated by daitch_mokotoff_header.pl
 */

/* Coding chart table: Soundex codes */
typedef char dm_code[2 + 1];	/* One or two sequential code digits + NUL */
typedef dm_code dm_codes[3];	/* Start of name, before a vowel, any other */

/* Coding chart table: Letter in input sequence */
struct dm_letter
{
	char		letter;			/* Present letter in sequence */
	const struct dm_letter *letters;	/* List of possible successive letters */
	const dm_codes *codes;		/* Code sequence(s) for complete sequence */
};

typedef struct dm_letter dm_letter;

/* Codes for letter sequence at start of name, before a vowel, and any other. */
EOF

for my $key (sort keys %codes)
{
	print $OUTPUT "static const dm_codes $key\[2\] =\n{\n"
	  . $codes{$key}
	  . "\n};\n";
}

print $OUTPUT <<EOF;

/* Coding for alternative following letters in sequence. */
EOF

sub hash2code
{
	my ($ref, $letter) = @_;

	my @letters = ();

	my $h = $ref->[0];
	for my $key (sort keys %$h)
	{
		$ref = $h->{$key};
		my $children = "NULL";
		if (defined $ref->[0])
		{
			$children = "letter_$letter$key";
			hash2code($ref, "$letter$key");
		}
		my $codes = $ref->[1] // "NULL";
		push(@letters, "\t{\n\t\t'$key', $children, $codes\n\t}");
	}

	print $OUTPUT "static const dm_letter letter_$letter\[\] =\n{\n";
	for (@letters)
	{
		print $OUTPUT "$_,\n";
	}
	print $OUTPUT "\t{\n\t\t'\\0'\n\t}\n";
	print $OUTPUT "};\n";
}

hash2code($table, '');

close $OUTPUT;

# Table adapted from https://www.jewishgen.org/InfoFiles/Soundex.html
#
# The conversion from the coding chart to the table should be self
# explanatory, but note the differences stated below.
#
# X = NC (not coded)
#
# The non-ASCII letters in the coding chart are coded with substitute
# lowercase ASCII letters, which sort after the uppercase ASCII letters:
#
# Ą => a (use '[' for table lookup)
# Ę => e (use '\\' for table lookup)
# Ţ => t (use ']' for table lookup)
#
# The rule for "UE" does not correspond to the coding chart, however
# it is used by all other known implementations, including the one at
# https://www.jewishgen.org/jos/jossound.htm (try e.g. "bouey").
#
# Note that the implementation assumes that vowels are assigned code
# 0 or 1. "J" can be either a vowel or a consonant.
#

__DATA__
AI,AJ,AY				0,1,X
AU						0,7,X
a						X,X,6|X,X,X
A						0,X,X
B						7,7,7
CHS						5,54,54
CH						5,5,5|4,4,4
CK						5,5,5|45,45,45
CZ,CS,CSZ,CZS			4,4,4
C						5,5,5|4,4,4
DRZ,DRS					4,4,4
DS,DSH,DSZ				4,4,4
DZ,DZH,DZS				4,4,4
D,DT					3,3,3
EI,EJ,EY				0,1,X
EU						1,1,X
e						X,X,6|X,X,X
E						0,X,X
FB						7,7,7
F						7,7,7
G						5,5,5
H						5,5,X
IA,IE,IO,IU				1,X,X
I						0,X,X
J						1,X,X|4,4,4
KS						5,54,54
KH						5,5,5
K						5,5,5
L						8,8,8
MN						66,66,66
M						6,6,6
NM						66,66,66
N						6,6,6
OI,OJ,OY				0,1,X
O						0,X,X
P,PF,PH					7,7,7
Q						5,5,5
RZ,RS					94,94,94|4,4,4
R						9,9,9
SCHTSCH,SCHTSH,SCHTCH	2,4,4
SCH						4,4,4
SHTCH,SHCH,SHTSH		2,4,4
SHT,SCHT,SCHD			2,43,43
SH						4,4,4
STCH,STSCH,SC			2,4,4
STRZ,STRS,STSH			2,4,4
ST						2,43,43
SZCZ,SZCS				2,4,4
SZT,SHD,SZD,SD			2,43,43
SZ						4,4,4
S						4,4,4
TCH,TTCH,TTSCH			4,4,4
TH						3,3,3
TRZ,TRS					4,4,4
TSCH,TSH				4,4,4
TS,TTS,TTSZ,TC			4,4,4
TZ,TTZ,TZS,TSZ			4,4,4
t						3,3,3|4,4,4
T						3,3,3
UI,UJ,UY,UE				0,1,X
U						0,X,X
V						7,7,7
W						7,7,7
X						5,54,54
Y						1,X,X
ZDZ,ZDZH,ZHDZH			2,4,4
ZD,ZHD					2,43,43
ZH,ZS,ZSCH,ZSH			4,4,4
Z						4,4,4