chromium/third_party/hunspell/google/bdict.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213

// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H_
#define THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H_

#include <stddef.h>
#include <stdint.h>

#include "base/hash/md5.h"

// BDict (binary dictionary) format. All offsets are little endian.
//
// Header (28 bytes).
//   "BDic" Signature (4 bytes)
//   Version (little endian 4 bytes)
//   Absolute offset in file of the aff info. (4 bytes)
//   Absolute offset in file of the dic table. (4 bytes)
//   (Added by v2.0) MD5 checksum of the aff info and the dic table. (16 bytes)
//
// Aff information:
//   Absolute offset in file of the affix group table (4 bytes)
//   Absolute offset in file of the affix rules table (4 bytes)
//   Absolute offset in file of the replacements table (4 bytes)
//   Absolute offset in file of the "other rules" table (4 bytes)
//
//   The data between the aff header and the affix rules table is the comment
//   from the beginning of the .aff file which often contains copyrights, etc.
//
//   Affix group table:
//     Array of NULL terminated strings. It will end in a double-NULL.
//
//   Affix rules table:
//     List of LF termianted lines. NULL terminated.
//
//   Replacements table:
//     List of pairs of NULL teminated words. The end is indicated by a
//     double-NULL. The first word in the pair is the replacement source, the
//     second is what to replace it with. Example:
//       foo\0bar\0a\0b\0\0
//     for replacing ("foo" with "bar") and ("a" with "b").
//
//   Other rules table:
//     List of LF termianted lines. NULL terminated.
//
//
// Dic table. This stores the .dic file which contains the words in the
// dictionary, and indices for each one that indicate a set of suffixes or
// prefixes that can be applied. We store it in a trie to save space. It
// replaces Hunspell's hash manager.
//
//   0abxxxxx xxxxxxxx (in binary) Leaf node:
//     The number stored in the bits represented by x is the affix index.
//
//     If bit <a> is set, the leaf node has an additional string. Following the
//     2 byte header is a NULL-terminated (possibly 0-length) string that should
//     be appended to the node. This allows long unique endings to be handled
//     efficiently.
//
//     If bit <b> is set, the leaf node has a supplimental list of affix IDs
//     following the ordinary data for the leaf node. These affix group IDs are
//     additional rules for the same word. For example, two prefixes may go
//     with distinct sets of suffixes.
//
//     If the affix index is all 1's, then that means that there is only the
//     supplimental list, and the 13-bit of affix built-in to the node don't
//     count. This is used to represent numbers greater than 13 bits, since
//     the supplimentary list has 16 bits per entry. The node must have a
//     supplimenal list if this is set.
//
//     This additional array is an array of 16-bit little-endian values,
//     terminated by 0xFFFF (since 0 is an affix ID meaning "no affix ID".
//
//   0x110000ab: Lookup node.
//     When <a> is set, addresses are 32-bits relative to the beginning of the
//     dictionary data. When unset, addresses are 16-bits relative to the
//     beginning of this node. All values are little endian.
//
//     When <b> is set, there is one additional entry before the table begins.
//     This is the 0th character. 0 is a common addition (meaning no more data)
//     and this prevents us from having to store entries for all the control
//     characters. This magic element is not counted in the table size.
//
//     The ID byte is followeed by two bytes:
//       XX: First character value in the lookup table.
//       XX: Number of characters in the lookup table.
//
//     This is followed optionally by the entry for 0, and then by a table of
//     size indicated by the second charatcer after the ID.
//
//   1110xxxx: List node with 8-bit addresses.
//     The number of items (max 16) in the list is stored in the bits xxxx.
//     Followed by N (character byte, 8-bit offset) pairs. These offsets are
//     relative to the end of the list of pairs.
//   1111xxxx: List node with 16-bit addresses. Same as above but offsets are
//     2-bytes each. LITTLE ENDIAN!

namespace hunspell {

#pragma pack(push, 1)

class BDict {
 public:
  // File header.
  enum { SIGNATURE = 0x63694442 };
  enum {
    MAJOR_VERSION = 2,
    MINOR_VERSION = 0
  };
  struct Header {
    uint32_t signature;

    // Major versions are incompatible with other major versions. Minor versions
    // should be readable by older programs expecting the same major version.
    uint16_t major_version;
    uint16_t minor_version;

    uint32_t aff_offset;  // Offset of the aff data.
    uint32_t dic_offset;  // Offset of the dic data.

    // Added by version 2.0.
    base::MD5Digest digest;  // MD5 digest of the aff data and the dic data.
  };

  // AFF section ===============================================================

  struct AffHeader {
    uint32_t affix_group_offset;
    uint32_t affix_rule_offset;
    uint32_t rep_offset;  // Replacements table.
    uint32_t other_offset;
  };

  // DIC section ===============================================================

  // Leaf ----------------------------------------------------------------------

  // Leaf nodes have the high bit set to 0.
  enum { LEAF_NODE_TYPE_MASK = 0x80 };  // 10000000
  enum { LEAF_NODE_TYPE_VALUE = 0 };    // 00000000

  // Leaf nodes with additional strings have the next-to-high bit set to 1.
  // This mask/value pair also includes the high bit set to 0 which is the leaf
  // indicator.
  enum { LEAF_NODE_ADDITIONAL_MASK = 0xC0 };   // 11000000
  enum { LEAF_NODE_ADDITIONAL_VALUE = 0x40 };  // 01000000

  // Leaf nodes with an additional array of affix rules following it.
  enum { LEAF_NODE_FOLLOWING_MASK = 0xA0 };  // 10100000
  enum { LEAF_NODE_FOLLOWING_VALUE = 0x20 }; // 00100000

  // The low 5 bits of the leaf node ID byte are the first 5 bits of the affix
  // ID. The following byte is used for the low bits of the affix ID (we don't
  // specify as mask for that).
  enum { LEAF_NODE_FIRST_BYTE_AFFIX_MASK = 0x1F };  // 00011111

  // The maximum affix value that can be stored in the first entry (not in the
  // following list). We reserve all 1's to be a magic value (see next entry)
  // so we can store large numbers somewhere else.
  enum { LEAF_NODE_MAX_FIRST_AFFIX_ID = 0x1FFE };  // 00011111 11111110

  // When the affix built-in to the leaf node (the first one) has too many bits
  // for the space reserved for it (13 bits), then we fill it with this value.
  // This means that the affix doesn't count. The affix will instead be stored
  // in the "following list" which allows up to 16 bits per entry.
  enum { FIRST_AFFIX_IS_UNUSED = 0x1FFF };  // 00011111 11111111

  // The maximum number of leaf nodes we'll read that have the same word and
  // follow each other (the FOLLOWING bit is set).
  enum { MAX_AFFIXES_PER_WORD = 32 };

  // The terminator for the list of following affix group IDs.
  enum { LEAF_NODE_FOLLOWING_LIST_TERMINATOR = 0xFFFF };

  // Lookup --------------------------------------------------------------------

  // Lookup nodes have the first 6 bits set to 110000.
  enum { LOOKUP_NODE_TYPE_MASK = 0xFC };   // 11111100
  enum { LOOKUP_NODE_TYPE_VALUE = 0xC0 };  // 11000000

  // Lookup nodes have the low bit meaning it has a 0th entry, and the
  // next-to-lowest bit indicating whether the offsets are 32-bits. Included
  // in these masks are the lookup ID above.
  enum { LOOKUP_NODE_0TH_MASK = 0xFD };    // 11111110
  enum { LOOKUP_NODE_0TH_VALUE = 0xC1 };   // 11000010
  enum { LOOKUP_NODE_32BIT_MASK = 0xFE};   // 11111110
  enum { LOOKUP_NODE_32BIT_VALUE = 0xC2};  // 11000001

  // List ----------------------------------------------------------------------

  // List nodes have the first 3 bits set to 1.
  enum { LIST_NODE_TYPE_MASK = 0xE0 };   // 11100000
  enum { LIST_NODE_TYPE_VALUE = 0xE0 };  // 11100000

  // The 4th from highest bit indicates a 16 bit (as opposed to 8 bit) list.
  // This mask/value also includes the list ID in the high 3 bits.
  enum { LIST_NODE_16BIT_MASK = 0xF0 };   // 11110000
  enum { LIST_NODE_16BIT_VALUE = 0xF0 };  // 11110000

  // The low 4 bits of the list ID byte are the count.
  enum { LIST_NODE_COUNT_MASK = 0xF };  // 00001111

  // Verifies the specified BDICT is sane. This function checks the BDICT header
  // and compares the MD5 digest of the data with the one in the header.
  static bool Verify(const char* bdict_data, size_t bdict_length);
};

#pragma pack(pop)

}  // namespace hunspell

#endif  // THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H_