summaryrefslogtreecommitdiff
path: root/src/myspell/csutil.hxx
blob: 2a16538056596065198c8b2fbaed43fd183f5a7a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#ifndef __CSUTILHXX__
#define __CSUTILHXX__

// First some base level utility routines

#include "w_char.hxx"

#define NOCAP   0
#define INITCAP 1
#define ALLCAP  2
#define HUHCAP  3
#define HUHINITCAP  4

#define MORPH_STEM        "st:"
#define MORPH_ALLOMORPH   "al:"
#define MORPH_POS         "po:"
#define MORPH_DERI_PFX    "dp:"
#define MORPH_INFL_PFX    "ip:"
#define MORPH_TERM_PFX    "tp:"
#define MORPH_DERI_SFX    "ds:"
#define MORPH_INFL_SFX    "is:"
#define MORPH_TERM_SFX    "ts:"
#define MORPH_SURF_PFX    "sp:"
#define MORPH_FREQ        "fr:"
#define MORPH_PHON        "ph:"
#define MORPH_HYPH        "hy:"
#define MORPH_PART        "pa:"
#define MORPH_FLAG        "fl:"
#define MORPH_HENTRY      "_H:"
#define MORPH_TAG_LEN     strlen(MORPH_STEM)

#define MSEP_FLD ' '
#define MSEP_REC '\n'
#define MSEP_ALT '\v'

// default flags
#define DEFAULTFLAGS   65510
#define FORBIDDENWORD  65510
#define ONLYUPCASEFLAG 65511

// hash entry macros
#define HENTRY_DATA(h) (h->var ? ((h->var & H_OPT_ALIASM) ? \
    get_stored_pointer(&(h->word) + h->blen + 1) : &(h->word) + h->blen + 1) : NULL)
#define HENTRY_FIND(h,p) (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL)

#define w_char_eq(a,b) (((a).l == (b).l) && ((a).h == (b).h))

// convert UTF-16 characters to UTF-8
char * u16_u8(char * dest, int size, const w_char * src, int srclen);

// convert UTF-8 characters to UTF-16
int u8_u16(w_char * dest, int size, const char * src);

// sort 2-byte vector
void flag_qsort(unsigned short flags[], int begin, int end);

// binary search in 2-byte vector
int flag_bsearch(unsigned short flags[], unsigned short flag, int right);

// remove end of line char(s)
void   mychomp(char * s);

// duplicate string
char * mystrdup(const char * s);

// duplicate reverse of string
char * myrevstrdup(const char * s);

// parse into tokens with char delimiter
char * mystrsep(char ** sptr, const char delim);
// parse into tokens with char delimiter
char * mystrsep2(char ** sptr, const char delim);

// parse into tokens with char delimiter
char * mystrrep(char *, const char *, const char *);

// append s to ends of every lines in text
void strlinecat(char * lines, const char * s);

// tokenize into lines with new line
   int line_tok(const char * text, char *** lines, char breakchar);

// tokenize into lines with new line and uniq in place
   char * line_uniq(char * text, char breakchar);
   char * line_uniq_app(char ** text, char breakchar);

// change oldchar to newchar in place
   char * tr(char * text, char oldc, char newc);

// reverse word
   int reverseword(char *);

// reverse word
   int reverseword_utf(char *);

// remove duplicates
 int uniqlist(char ** list, int n);

// free character array list
   void freelist(char *** list, int n);

// character encoding information
struct cs_info {
  unsigned char ccase;
  unsigned char clower;
  unsigned char cupper;
};

// Unicode character encoding information
struct unicode_info {
  unsigned short c;
  unsigned short cupper;
  unsigned short clower;
};

struct unicode_info2 {
  char cletter;
  unsigned short cupper;
  unsigned short clower;
};

int initialize_utf_tbl();
void free_utf_tbl();
unsigned short unicodetoupper(unsigned short c, int langnum);
unsigned short unicodetolower(unsigned short c, int langnum);
int unicodeisalpha(unsigned short c);

struct enc_entry {
  const char * enc_name;
  struct cs_info * cs_table;
};

// language to encoding default map

struct lang_map {
  const char * lang;
  const char * def_enc;
  int num;
};

struct cs_info * get_current_cs(const char * es);

const char * get_default_enc(const char * lang);

// get language identifiers of language codes
int get_lang_num(const char * lang);

// get characters of the given 8bit encoding with lower- and uppercase forms
char * get_casechars(const char * enc);

// convert null terminated string to all caps using encoding
void enmkallcap(char * d, const char * p, const char * encoding);

// convert null terminated string to all little using encoding
void enmkallsmall(char * d, const char * p, const char * encoding);

// convert null terminated string to have intial capital using encoding
void enmkinitcap(char * d, const char * p, const char * encoding);

// convert null terminated string to all caps
void mkallcap(char * p, const struct cs_info * csconv);

// convert null terminated string to all little
void mkallsmall(char * p, const struct cs_info * csconv);

// convert null terminated string to have intial capital
void mkinitcap(char * p, const struct cs_info * csconv);

// convert first nc characters of UTF-8 string to little
void mkallsmall_utf(w_char * u, int nc, int langnum);

// convert first nc characters of UTF-8 string to capital
void mkallcap_utf(w_char * u, int nc, int langnum);

// get type of capitalization
int get_captype(char * q, int nl, cs_info *);

// get type of capitalization (UTF-8)
int get_captype_utf8(w_char * q, int nl, int langnum);

// strip all ignored characters in the string
void remove_ignored_chars_utf(char * word, unsigned short ignored_chars[], int ignored_len);

// strip all ignored characters in the string
void remove_ignored_chars(char * word, char * ignored_chars);

int parse_string(char * line, char ** out, const char * name);

int parse_array(char * line, char ** out,
        unsigned short ** out_utf16, int * out_utf16_len, const char * name, int utf8);

int fieldlen(const char * r);
char * copy_field(char * dest, const char * morph, const char * var);

int morphcmp(const char * s, const char * t);

int get_sfxcount(const char * morph);

// conversion function for protected memory
void store_pointer(char * dest, char * source);

// conversion function for protected memory
char * get_stored_pointer(char * s);

#endif