/* lib/uninorm.h */

/* Normalization forms (composition and decomposition) of Unicode strings.
   Copyright (C) 2001-2002, 2009 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2009.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#ifndef _UNINORM_H
#define _UNINORM_H

/* Get size_t.  */
#include <stddef.h>

#include "unitypes.h"


#ifdef __cplusplus
extern "C" {
#endif


/* Conventions:

   All functions prefixed with u8_ operate on UTF-8 encoded strings.
   Their unit is a uint8_t (1 byte).

   All functions prefixed with u16_ operate on UTF-16 encoded strings.
   Their unit is a uint16_t (a 2-byte word).

   All functions prefixed with u32_ operate on UCS-4 encoded strings.
   Their unit is a uint32_t (a 4-byte word).

   All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
   n units.

   Functions returning a string result take a (resultbuf, lengthp) argument
   pair.  If resultbuf is not NULL and the result fits into *lengthp units,
   it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
   allocated string is returned.  In both cases, *lengthp is set to the
   length (number of units) of the returned string.  In case of error,
   NULL is returned and errno is set.  */


/* Kinds of character decomposition.
   Where a tag in angle brackets is given, it is the tag associated with
   that kind; the canonical kind has no tag.
   The numeric values are spelled out so that they stay fixed if entries
   are ever reordered.  */
enum
{
  /* Canonical decomposition.  */
  UC_DECOMP_CANONICAL = 0,
  /* <font>: A font variant (e.g. a blackletter form).  */
  UC_DECOMP_FONT = 1,
  /* <noBreak>: A no-break version of a space or hyphen.  */
  UC_DECOMP_NOBREAK = 2,
  /* <initial>: An initial presentation form (Arabic).  */
  UC_DECOMP_INITIAL = 3,
  /* <medial>: A medial presentation form (Arabic).  */
  UC_DECOMP_MEDIAL = 4,
  /* <final>: A final presentation form (Arabic).  */
  UC_DECOMP_FINAL = 5,
  /* <isolated>: An isolated presentation form (Arabic).  */
  UC_DECOMP_ISOLATED = 6,
  /* <circle>: An encircled form.  */
  UC_DECOMP_CIRCLE = 7,
  /* <super>: A superscript form.  */
  UC_DECOMP_SUPER = 8,
  /* <sub>: A subscript form.  */
  UC_DECOMP_SUB = 9,
  /* <vertical>: A vertical layout presentation form.  */
  UC_DECOMP_VERTICAL = 10,
  /* <wide>: A wide (or zenkaku) compatibility character.  */
  UC_DECOMP_WIDE = 11,
  /* <narrow>: A narrow (or hankaku) compatibility character.  */
  UC_DECOMP_NARROW = 12,
  /* <small>: A small variant form (CNS compatibility).  */
  UC_DECOMP_SMALL = 13,
  /* <square>: A CJK squared font variant.  */
  UC_DECOMP_SQUARE = 14,
  /* <fraction>: A vulgar fraction form.  */
  UC_DECOMP_FRACTION = 15,
  /* <compat>: Otherwise unspecified compatibility character.  */
  UC_DECOMP_COMPAT = 16
};

/* Maximum size, in ucs4_t elements, of the decomposition of a single Unicode
   character.  The DECOMPOSITION arrays passed to uc_decomposition and
   uc_canonical_decomposition must have room for at least this many
   elements.  */
#define UC_DECOMPOSITION_MAX_LENGTH 32

/* Return the character decomposition mapping of the Unicode character UC.
   DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
   ucs4_t elements.
   When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
   filled and N is returned.  Otherwise -1 is returned.  */
extern int
       uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);

/* Return the canonical character decomposition mapping of the Unicode
   character UC.
   DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
   ucs4_t elements.
   When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
   returned.  Otherwise -1 is returned.  */
extern int
       uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);


/* Attempt to combine the Unicode characters UC1 and UC2, in this order.
   UC1 is known to have canonical combining class 0.
   Return the combination of UC1 and UC2, if it exists.
   Return 0 otherwise; 0 thus serves as the "no combination" indicator.
   Not all decompositions can be recombined using this function.  See the
   Unicode file CompositionExclusions.txt for details.  */
extern ucs4_t
       uc_composition (ucs4_t uc1, ucs4_t uc2);


/* An object of type uninorm_t denotes a Unicode normalization form.
   It is a pointer to a const 'struct unicode_normalization_form'; only the
   structure tag is declared in this header, not its members.  */
struct unicode_normalization_form;
typedef const struct unicode_normalization_form *uninorm_t;

/* UNINORM_NFD: Normalization form D: canonical decomposition.
   The macro expands to the address of the object uninorm_nfd.  */
extern const struct unicode_normalization_form uninorm_nfd;
#define UNINORM_NFD (&uninorm_nfd)

/* UNINORM_NFC: Normalization form C: canonical decomposition, then
   canonical composition.
   The macro expands to the address of the object uninorm_nfc.  */
extern const struct unicode_normalization_form uninorm_nfc;
#define UNINORM_NFC (&uninorm_nfc)

/* UNINORM_NFKD: Normalization form KD: compatibility decomposition.
   The macro expands to the address of the object uninorm_nfkd.  */
extern const struct unicode_normalization_form uninorm_nfkd;
#define UNINORM_NFKD (&uninorm_nfkd)

/* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
   canonical composition.
   The macro expands to the address of the object uninorm_nfkc.  */
extern const struct unicode_normalization_form uninorm_nfkc;
#define UNINORM_NFKC (&uninorm_nfkc)

/* Test whether a normalization form does compatibility decomposition.
   NF is reinterpreted as a pointer to a const unsigned int; the result is
   bit 0 of the word it points to, i.e. 0 or 1.  NF is evaluated once.  */
#define uninorm_is_compat_decomposing(nf) \
  (*(const unsigned int *) (nf) & 1u)

/* Test whether a normalization form includes canonical composition.
   NF is reinterpreted as a pointer to a const unsigned int; the result is
   bit 1 of the word it points to, i.e. 0 or 1.  NF is evaluated once.  */
#define uninorm_is_composing(nf) \
  ((*(const unsigned int *) (nf) & 2u) >> 1)

/* Return the decomposing variant of the normalization form NF.
   This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD.  */
extern uninorm_t uninorm_decomposing_form (uninorm_t nf);


/* Return the specified normalization form of a string.
   NF is the normalization form to produce and (S, N) is the input string.
   (RESULTBUF, LENGTHP) is the result argument pair described under
   "Conventions" above; per those conventions, NULL is returned with errno
   set in case of error.  */
extern uint8_t *
       u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
		     uint8_t *resultbuf, size_t *lengthp);
extern uint16_t *
       u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
		      uint16_t *resultbuf, size_t *lengthp);
extern uint32_t *
       u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
		      uint32_t *resultbuf, size_t *lengthp);


/* Compare S1 and S2, ignoring differences in normalization.
   (S1, N1) and (S2, N2) are the two strings to compare.
   NF must be either UNINORM_NFD or UNINORM_NFKD.
   If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
   return 0.  Upon failure, return -1 with errno set.  */
extern int
       u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
		   uninorm_t nf, int *resultp);
extern int
       u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
		    uninorm_t nf, int *resultp);
extern int
       u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
		    uninorm_t nf, int *resultp);


/* Convert the string S of length N to a NUL-terminated byte sequence, in such
   a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
   equivalent to comparing S1 and S2 with uN_normcoll().
   NF must be either UNINORM_NFC or UNINORM_NFKC.
   (RESULTBUF, LENGTHP) is the result argument pair described under
   "Conventions" above.  */
extern char *
       u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
		    char *resultbuf, size_t *lengthp);
extern char *
       u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
		     char *resultbuf, size_t *lengthp);
extern char *
       u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
		     char *resultbuf, size_t *lengthp);


/* Compare S1 and S2, ignoring differences in normalization, using the
   collation rules of the current locale.
   (S1, N1) and (S2, N2) are the two strings to compare.
   NF must be either UNINORM_NFC or UNINORM_NFKC.
   If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
   return 0.  Upon failure, return -1 with errno set.  */
extern int
       u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
		    uninorm_t nf, int *resultp);
extern int
       u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
		     uninorm_t nf, int *resultp);
extern int
       u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
		     uninorm_t nf, int *resultp);


/* Normalization of a stream of Unicode characters.

   A "stream of Unicode characters" is essentially a function that accepts a
   ucs4_t argument repeatedly, optionally combined with a function that
   "flushes" the stream.  */

/* Data type of a stream of Unicode characters that normalizes its input
   according to a given normalization form and passes the normalized character
   sequence to the encapsulated stream of Unicode characters.
   Only the structure tag is declared in this header, not its members.  */
struct uninorm_filter;

/* Create and return a normalization filter for Unicode characters.
   NF is the normalization form according to which the filter normalizes
   its input.
   The pair (stream_func, stream_data) is the encapsulated stream.
   stream_func (stream_data, uc) receives the Unicode character uc
   and returns 0 if successful, or -1 with errno set upon failure.
   Return the new filter, or NULL with errno set upon failure.  */
extern struct uninorm_filter *
       uninorm_filter_create (uninorm_t nf,
			      int (*stream_func) (void *stream_data, ucs4_t uc),
			      void *stream_data);

/* Stuff the Unicode character UC into the normalizing filter FILTER.
   Return 0 if successful, or -1 with errno set upon failure.  */
extern int
       uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);

/* Bring data buffered in FILTER to its destination, the encapsulated
   stream.
   Return 0 if successful, or -1 with errno set upon failure.
   Note! If after calling this function, additional characters are written
   into the filter, the resulting character sequence in the encapsulated stream
   will not necessarily be normalized.  */
extern int
       uninorm_filter_flush (struct uninorm_filter *filter);

/* Bring data buffered in FILTER to its destination, the encapsulated
   stream, then close and free the filter.
   Return 0 if successful, or -1 with errno set upon failure.  */
extern int
       uninorm_filter_free (struct uninorm_filter *filter);


#ifdef __cplusplus
}
#endif


#endif /* _UNINORM_H */