summary | refs | log | tree | commit | diff
path: root/contrib/tsearch2
diff options
context:
space:
mode:
author: Teodor Sigaev <teodor@sigaev.ru> 2007-01-15 15:16:28 +0000
committer: Teodor Sigaev <teodor@sigaev.ru> 2007-01-15 15:16:28 +0000
commit: f2a01b0d5a784a5191faad7f2022383760064f8a (patch)
tree: c05f04522b2e4889d29a1f4fa51da98304f13eab /contrib/tsearch2
parent: 7021d6f6c831132ee351c424df039f054d44de7d (diff)
download: postgresql-f2a01b0d5a784a5191faad7f2022383760064f8a.tar.gz
Fix localization support for multibyte encoding and C locale.
Slightly reworked patch from Tatsuo Ishii
Diffstat (limited to 'contrib/tsearch2')
-rw-r--r-- contrib/tsearch2/ts_locale.c 47
-rw-r--r-- contrib/tsearch2/ts_locale.h 15
-rw-r--r-- contrib/tsearch2/wordparser/parser.c 134
3 files changed, 144 insertions, 52 deletions
diff --git a/contrib/tsearch2/ts_locale.c b/contrib/tsearch2/ts_locale.c
index cac5317a10..cb022d7e2a 100644
--- a/contrib/tsearch2/ts_locale.c
+++ b/contrib/tsearch2/ts_locale.c
@@ -12,13 +12,13 @@
size_t
wchar2char(char *to, const wchar_t *from, size_t len)
{
+ if (len == 0)
+ return 0;
+
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
- if (len == 0)
- return 0;
-
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
NULL, NULL);
@@ -34,17 +34,19 @@ wchar2char(char *to, const wchar_t *from, size_t len)
return wcstombs(to, from, len);
}
+#endif /* WIN32 */
size_t
char2wchar(wchar_t *to, const char *from, size_t len)
{
+ if (len == 0)
+ return 0;
+
+#ifdef WIN32
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
- if (len == 0)
- return 0;
-
r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
if (!r)
@@ -60,29 +62,44 @@ char2wchar(wchar_t *to, const char *from, size_t len)
return r;
}
+ else
+#endif /* WIN32 */
+ if ( lc_ctype_is_c() )
+ {
+ /*
+ * pg_mb2wchar_with_len always adds trailing '\0', so
+ * 'to' should be allocated with sufficient space
+ */
+ return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+ }
return mbstowcs(to, from, len);
}
-#endif /* WIN32 */
int
_t_isalpha(const char *ptr)
{
- wchar_t character;
+ wchar_t character[2];
+
+ if (lc_ctype_is_c())
+ return isalpha(TOUCHAR(ptr));
- char2wchar(&character, ptr, 1);
+ char2wchar(character, ptr, 1);
- return iswalpha((wint_t) character);
+ return iswalpha((wint_t) *character);
}
int
_t_isprint(const char *ptr)
{
- wchar_t character;
+ wchar_t character[2];
+
+ if (lc_ctype_is_c())
+ return isprint(TOUCHAR(ptr));
- char2wchar(&character, ptr, 1);
+ char2wchar(character, ptr, 1);
- return iswprint((wint_t) character);
+ return iswprint((wint_t) *character);
}
#endif /* TS_USE_WIDE */
@@ -126,7 +143,7 @@ lowerstr(char *str)
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
- errmsg("transalation failed from server encoding to wchar_t")));
+ errmsg("translation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
@@ -152,7 +169,7 @@ lowerstr(char *str)
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
- errmsg("transalation failed from wchar_t to server encoding %d", errno)));
+ errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
diff --git a/contrib/tsearch2/ts_locale.h b/contrib/tsearch2/ts_locale.h
index e2e2248137..81d1a16600 100644
--- a/contrib/tsearch2/ts_locale.h
+++ b/contrib/tsearch2/ts_locale.h
@@ -30,16 +30,17 @@
#define TOUCHAR(x) (*((unsigned char*)(x)))
#ifdef TS_USE_WIDE
+size_t char2wchar(wchar_t *to, const char *from, size_t len);
#ifdef WIN32
size_t wchar2char(char *to, const wchar_t *from, size_t len);
-size_t char2wchar(wchar_t *to, const char *from, size_t len);
+
#else /* WIN32 */
-/* correct mbstowcs */
-#define char2wchar mbstowcs
+/* correct wcstombs */
#define wchar2char wcstombs
+
#endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
@@ -55,10 +56,10 @@ extern int _t_isprint(const char *ptr);
*/
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
-#define COPYCHAR(d,s) do { \
- int lll = pg_mblen( s ); \
- \
- while( lll-- ) \
+#define COPYCHAR(d,s) do { \
+ int lll = pg_mblen( s ); \
+ \
+ while( lll-- ) \
TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
} while(0)
diff --git a/contrib/tsearch2/wordparser/parser.c b/contrib/tsearch2/wordparser/parser.c
index fced41ec5e..3706a0efb7 100644
--- a/contrib/tsearch2/wordparser/parser.c
+++ b/contrib/tsearch2/wordparser/parser.c
@@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.11 2006/10/04 00:29:47 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.12 2007/01/15 15:16:28 teodor Exp $ */
#include "postgres.h"
@@ -40,16 +40,13 @@ TParserInit(char *str, int len)
#ifdef TS_USE_WIDE
/*
- * Use wide char code only when max encoding length > 1 and ctype != C.
- * Some operating systems fail with multi-byte encodings and a C locale.
- * Also, for a C locale there is no need to process as multibyte. From
- * backend/utils/adt/oracle_compat.c Teodor
+ * Use wide char code only when max encoding length > 1.
*/
- if (prs->charmaxlen > 1 && !lc_ctype_is_c())
+ if (prs->charmaxlen > 1)
{
prs->usewide = true;
- prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
+ prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
}
else
@@ -83,25 +80,99 @@ TParserClose(TParser * prs)
/*
* defining support function, equvalent is* macroses, but
- * working with any possible encodings and locales
+ * working with any possible encodings and locales. Note,
+ * that with multibyte encoding and C-locale isw* function may fail
+ * or give wrong result. Note 2: multibyte encoding and C-locale
+ * often are used for Asian languages.
*/
#ifdef TS_USE_WIDE
-#define p_iswhat(type) \
-static int \
-p_is##type(TParser *prs) { \
- Assert( prs->state ); \
- return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
- is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
-} \
- \
-static int \
-p_isnot##type(TParser *prs) { \
- return !p_is##type(prs); \
+#define p_iswhat(type) \
+static int \
+p_is##type(TParser *prs) { \
+ Assert( prs->state ); \
+ if ( prs->usewide ) \
+ { \
+ if ( lc_ctype_is_c() ) \
+ return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \
+ \
+ return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) ); \
+ } \
+ \
+ return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
+} \
+ \
+static int \
+p_isnot##type(TParser *prs) { \
+ return !p_is##type(prs); \
}
+static int
+p_isalnum(TParser *prs)
+{
+ Assert( prs->state );
+
+ if (prs->usewide)
+ {
+ if (lc_ctype_is_c())
+ {
+ unsigned int c = *(unsigned int*)(prs->wstr + prs->state->poschar);
+
+ /*
+ * any non-ascii symbol with multibyte encoding
+ * with C-locale is an alpha character
+ */
+ if ( c > 0x7f )
+ return 1;
+
+ return isalnum(0xff & c);
+ }
+
+ return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
+ }
+ return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte ));
+}
+
+static int
+p_isnotalnum(TParser *prs)
+{
+ return !p_isalnum(prs);
+}
+
+static int
+p_isalpha(TParser *prs)
+{
+ Assert( prs->state );
+
+ if (prs->usewide)
+ {
+ if (lc_ctype_is_c())
+ {
+ unsigned int c = *(prs->wstr + prs->state->poschar);
+
+ /*
+ * any non-ascii symbol with multibyte encoding
+ * with C-locale is an alpha character
+ */
+ if ( c > 0x7f )
+ return 1;
+
+ return isalpha(0xff & c);
+ }
+
+ return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
+ }
+
+ return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte ));
+}
+
+static int
+p_isnotalpha(TParser *prs)
+{
+ return !p_isalpha(prs);
+}
/* p_iseq should be used only for ascii symbols */
@@ -111,18 +182,19 @@ p_iseq(TParser * prs, char c)
Assert(prs->state);
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
}
+
#else /* TS_USE_WIDE */
-#define p_iswhat(type) \
-static int \
-p_is##type(TParser *prs) { \
- Assert( prs->state ); \
- return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
-} \
- \
-static int \
-p_isnot##type(TParser *prs) { \
- return !p_is##type(prs); \
+#define p_iswhat(type) \
+static int \
+p_is##type(TParser *prs) { \
+ Assert( prs->state ); \
+ return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
+} \
+ \
+static int \
+p_isnot##type(TParser *prs) { \
+ return !p_is##type(prs); \
}
@@ -132,10 +204,12 @@ p_iseq(TParser * prs, char c)
Assert(prs->state);
return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
}
-#endif /* TS_USE_WIDE */
p_iswhat(alnum)
p_iswhat(alpha)
+
+#endif /* TS_USE_WIDE */
+
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)