diff options
author | Moriyoshi Koizumi <moriyoshi@php.net> | 2010-12-19 16:36:37 +0000 |
---|---|---|
committer | Moriyoshi Koizumi <moriyoshi@php.net> | 2010-12-19 16:36:37 +0000 |
commit | bbf3d43c1ee0ad53b03c3821cd630f0746d5e954 (patch) | |
tree | fd11ea79a69ee445ffde8310a3760603bf3df821 /Zend/zend_multibyte.c | |
parent | c28cac404d2d0590ba2811f41331c60d09adbf1e (diff) | |
download | php-git-bbf3d43c1ee0ad53b03c3821cd630f0746d5e954.tar.gz |
* Refactor zend_multibyte facility.
Now mbstring.script_encoding is superseded by zend.script_encoding.
Diffstat (limited to 'Zend/zend_multibyte.c')
-rw-r--r-- | Zend/zend_multibyte.c | 1252 |
1 files changed, 121 insertions, 1131 deletions
diff --git a/Zend/zend_multibyte.c b/Zend/zend_multibyte.c index 3ca5191c00..dec07ecdcd 100644 --- a/Zend/zend_multibyte.c +++ b/Zend/zend_multibyte.c @@ -23,1219 +23,209 @@ #include "zend_compile.h" #include "zend_operators.h" #include "zend_multibyte.h" +#include "zend_ini.h" -static size_t zend_multibyte_encoding_filter(unsigned char **to, size_t *to_length, const char *to_encoding, const unsigned char *from, size_t from_length, const char *from_encoding TSRMLS_DC); -size_t sjis_input_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC); -size_t sjis_output_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC); -static char* zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, size_t encoding_list_size); -static int zend_multibyte_parse_encoding_list(const char *encoding_list, -size_t encoding_list_size, zend_encoding ***result, size_t *result_size); -static zend_encoding *zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC); -static zend_encoding *zend_multibyte_detect_unicode(TSRMLS_D); -static zend_encoding *zend_multibyte_detect_utf_encoding(const unsigned char *script, size_t script_size TSRMLS_DC); - -/* - * encodings - */ -static const char *ucs2_aliases[] = {"ISO-10646-UCS-2", "UCS2" , "UNICODE", NULL}; -static zend_encoding encoding_ucs2 = { - NULL, - NULL, - "UCS-2", - (const char *(*)[])&ucs2_aliases, - 0 -}; - -static zend_encoding encoding_ucs2be = { - NULL, - NULL, - "UCS-2BE", - NULL, - 0 -}; - -static zend_encoding encoding_ucs2le = { - NULL, - NULL, - "UCS-2LE", - NULL, - 0 -}; - -static const char *ucs4_aliases[] = {"ISO-10646-UCS-4", "UCS4", NULL}; -static zend_encoding encoding_ucs4 = { - NULL, - NULL, - "UCS-4", - (const char *(*)[])&ucs4_aliases, - 0 -}; - -static zend_encoding encoding_ucs4be = { - NULL, - NULL, - "UCS-4BE", - NULL, - 0 -}; - -static zend_encoding encoding_ucs4le = { - NULL, - NULL, - "UCS-4LE", - NULL, - 0 -}; - -static const char *utf32_aliases[] = {"utf32", NULL}; -static zend_encoding encoding_utf32 = { - NULL, - NULL, - "UTF-32", - (const char *(*)[])&utf32_aliases, - 0 -}; - -static zend_encoding encoding_utf32be = { - NULL, - NULL, - "UTF-32BE", - NULL, - 0 -}; - -static zend_encoding encoding_utf32le = { - NULL, - NULL, - "UTF-32LE", - NULL, - 0 -}; - -static const char *utf16_aliases[] = {"utf16", NULL}; -static zend_encoding encoding_utf16 = { - NULL, - NULL, - "UTF-16", - (const char *(*)[])&utf16_aliases, - 0 -}; - -static zend_encoding encoding_utf16be = { - NULL, - NULL, - "UTF-16BE", - NULL, - 0 -}; - -static zend_encoding encoding_utf16le = { - NULL, - NULL, - "UTF-16LE", - NULL, - 0 -}; - -static const char *utf8_aliases[] = {"utf8", NULL}; -static zend_encoding encoding_utf8 = { - NULL, - NULL, - "UTF-8", - (const char *(*)[])&utf8_aliases, - 1 -}; - -static const char *ascii_aliases[] = {"ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991", "US-ASCII", "ISO646-US", "us", "IBM367", "cp367", "csASCII", NULL}; -static zend_encoding encoding_ascii = { - NULL, - NULL, - "ASCII", - (const char *(*)[])&ascii_aliases, - 1 -}; - -static const char *euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL}; -static zend_encoding encoding_euc_jp = { - NULL, - NULL, - "EUC-JP", - (const char *(*)[])&euc_jp_aliases, - 1 -}; - -static const char *sjis_aliases[] = {"x-sjis", "SJIS", "SHIFT-JIS", NULL}; -static zend_encoding encoding_sjis = { - sjis_input_filter, - sjis_output_filter, - "Shift_JIS", - (const char *(*)[])&sjis_aliases, - 0 -}; - -static const char *eucjp_win_aliases[] = {"eucJP-open", NULL}; -static zend_encoding encoding_eucjp_win = { - NULL, - NULL, - "eucJP-win", - (const char *(*)[])&eucjp_win_aliases, - 1 -}; - -static const char *sjis_win_aliases[] = {"SJIS-open", "MS_Kanji", "Windows-31J", "CP932", NULL}; -static zend_encoding encoding_sjis_win = { - /* sjis-filters does not care about diffs of Shift_JIS and CP932 */ - sjis_input_filter, - sjis_output_filter, - "SJIS-win", - (const char *(*)[])&sjis_win_aliases, - 0 -}; - -static const char *jis_aliases[] = {"ISO-2022-JP", NULL}; -static zend_encoding encoding_jis = { - NULL, - NULL, - "JIS", - (const char *(*)[])&jis_aliases, - 0 -}; - -static const char *euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", "gb2312", NULL}; -static zend_encoding encoding_euc_cn = { - NULL, - NULL, - "EUC-CN", - (const char *(*)[])&euc_cn_aliases, - 1 -}; - -static const char *cp936_aliases[] = {"CP-936", NULL}; -static zend_encoding encoding_cp936 = { - NULL, - NULL, - "CP936", - (const char *(*)[])&cp936_aliases, - 0 -}; - -static const char *hz_aliases[] = {"HZ-GB-2312", NULL}; -static zend_encoding encoding_hz = { - NULL, - NULL, - "HZ", - (const char *(*)[])&hz_aliases, - 0 -}; - -static const char *euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL}; -static zend_encoding encoding_euc_tw = { - NULL, - NULL, - "EUC-TW", - (const char *(*)[])&euc_tw_aliases, - 1 -}; - -static const char *big5_aliases[] = {"BIG5", "CN-BIG5", "BIG-FIVE", "BIGFIVE", "CP950", NULL}; -static zend_encoding encoding_big5 = { - NULL, - NULL, - "BIG-5", - (const char *(*)[])&big5_aliases, - 0 -}; - -static const char *euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL}; -static zend_encoding encoding_euc_kr = { - NULL, - NULL, - "EUC-KR", - (const char *(*)[])&euc_kr_aliases, - 1 -}; - -static const char *uhc_aliases[] = {"CP949", NULL}; -static zend_encoding encoding_uhc = { - NULL, - NULL, - "UHC", - (const char *(*)[])&uhc_aliases, - 1 -}; - -static zend_encoding encoding_2022kr = { - NULL, - NULL, - "ISO-2022-KR", - NULL, - 0 -}; - -static const char *cp1252_aliases[] = {"cp1252", NULL}; -static zend_encoding encoding_cp1252 = { - NULL, - NULL, - "Windows-1252", - (const char *(*)[])&cp1252_aliases, - 1 -}; - -static const char *iso_8859_1_aliases[] = {"ISO_8859-1", "latin1", NULL}; -static zend_encoding encoding_8859_1 = { - NULL, - NULL, - "ISO-8859-1", - (const char *(*)[])&iso_8859_1_aliases, - 1 -}; - -static const char *iso_8859_2_aliases[] = {"ISO_8859-2", "latin2", NULL}; -static zend_encoding encoding_8859_2 = { - NULL, - NULL, - "ISO-8859-2", - (const char *(*)[])&iso_8859_2_aliases, - 1 -}; - -static const char *iso_8859_3_aliases[] = {"ISO_8859-3", "latin3", NULL}; -static zend_encoding encoding_8859_3 = { - NULL, - NULL, - "ISO-8859-3", - (const char *(*)[])&iso_8859_3_aliases, - 1 -}; - -static const char *iso_8859_4_aliases[] = {"ISO_8859-4", "latin4", NULL}; -static zend_encoding encoding_8859_4 = { - NULL, - NULL, - "ISO-8859-4", - (const char *(*)[])&iso_8859_4_aliases, - 1 -}; - -static const char *iso_8859_5_aliases[] = {"ISO_8859-5", "cyrillic", NULL}; -static zend_encoding encoding_8859_5 = { - NULL, - NULL, - "ISO-8859-5", - (const char *(*)[])&iso_8859_5_aliases, - 1 -}; - -static const char *iso_8859_6_aliases[] = {"ISO_8859-6", "arabic", NULL}; -static zend_encoding encoding_8859_6 = { - NULL, - NULL, - "ISO-8859-6", - (const char *(*)[])&iso_8859_6_aliases, - 1 -}; - -static const char *iso_8859_7_aliases[] = {"ISO_8859-7", "greek", NULL}; -static zend_encoding encoding_8859_7 = { - NULL, - NULL, - "ISO-8859-7", - (const char *(*)[])&iso_8859_7_aliases, - 1 -}; - -static const char *iso_8859_8_aliases[] = {"ISO_8859-8", "hebrew", NULL}; -static zend_encoding encoding_8859_8 = { - NULL, - NULL, - "ISO-8859-8", - (const char *(*)[])&iso_8859_8_aliases, - 1 -}; - -static const char *iso_8859_9_aliases[] = {"ISO_8859-9", "latin5", NULL}; -static zend_encoding encoding_8859_9 = { - NULL, - NULL, - "ISO-8859-9", - (const char *(*)[])&iso_8859_9_aliases, - 1 -}; - -static const char *iso_8859_10_aliases[] = {"ISO_8859-10", "latin6", NULL}; -static zend_encoding encoding_8859_10 = { - NULL, - NULL, - "ISO-8859-10", - (const char *(*)[])&iso_8859_10_aliases, - 1 -}; - -static const char *iso_8859_13_aliases[] = {"ISO_8859-13", NULL}; -static zend_encoding encoding_8859_13 = { - NULL, - NULL, - "ISO-8859-13", - (const char *(*)[])&iso_8859_13_aliases, - 1 -}; - -static const char *iso_8859_14_aliases[] = {"ISO_8859-14", "latin8", NULL}; -static zend_encoding encoding_8859_14 = { - NULL, - NULL, - "ISO-8859-14", - (const char *(*)[])&iso_8859_14_aliases, - 1 -}; - -static const char *iso_8859_15_aliases[] = {"ISO_8859-15", NULL}; -static zend_encoding encoding_8859_15 = { - NULL, - NULL, - "ISO-8859-15", - (const char *(*)[])&iso_8859_15_aliases, - 1 -}; - -static const char *cp1251_aliases[] = {"CP1251", "CP-1251", "WINDOWS-1251", NULL}; -static zend_encoding encoding_cp1251 = { - NULL, - NULL, - "Windows-1251", - (const char *(*)[])&cp1251_aliases, - 1 -}; - -static const char *cp866_aliases[] = {"CP866", "CP-866", "IBM-866", NULL}; -static zend_encoding encoding_cp866 = { - NULL, - NULL, - "CP866", - (const char *(*)[])&cp866_aliases, - 1 -}; - -static const char *koi8r_aliases[] = {"KOI8-R", "KOI8R", NULL}; -static zend_encoding encoding_koi8r = { - NULL, - NULL, - "KOI8-R", - (const char *(*)[])&koi8r_aliases, - 1 -}; - -static const char *koi8u_aliases[] = {"KOI8-U", "KOI8U", NULL}; -static zend_encoding encoding_koi8u = { - NULL, - NULL, - "KOI8-U", - (const char *(*)[])&koi8u_aliases, - 1 -}; - -static const char *cp1254_aliases[] = {"cp1254", NULL}; -static zend_encoding encoding_cp1254 = { - NULL, - NULL, - "Windows-1254", - (const char *(*)[])&cp1254_aliases, - 1 -}; - -static const char *armscii8_aliases[] = { "ArmSCII8", "ARMSCII-8", "ARMSCII8", NULL}; -static zend_encoding encoding_armscii8 = { - NULL, - NULL, - "ArmSCII-8", - (const char *(*)[])&armscii8_aliases, - 1 -}; - -static const char *cp850_aliases[] = {"IBM850", NULL}; -static zend_encoding encoding_cp850 = { - NULL, - NULL, - "CP850", - (const char *(*)[])&cp850_aliases, - 1 -}; - -static zend_encoding *zend_encoding_table[] = { - &encoding_ucs4, - &encoding_ucs4be, - &encoding_ucs4le, - &encoding_ucs2, - &encoding_ucs2be, - &encoding_ucs2le, - &encoding_utf32, - &encoding_utf32be, - &encoding_utf32le, - &encoding_utf16, - &encoding_utf16be, - &encoding_utf16le, - &encoding_utf8, - &encoding_ascii, - &encoding_euc_jp, - &encoding_sjis, - &encoding_eucjp_win, - &encoding_sjis_win, - &encoding_jis, - &encoding_cp1252, - &encoding_8859_1, - &encoding_8859_2, - &encoding_8859_3, - &encoding_8859_4, - &encoding_8859_5, - &encoding_8859_6, - &encoding_8859_7, - &encoding_8859_8, - &encoding_8859_9, - &encoding_8859_10, - &encoding_8859_13, - &encoding_8859_14, - &encoding_8859_15, - &encoding_euc_cn, - &encoding_cp936, - &encoding_hz, - &encoding_euc_tw, - &encoding_big5, - &encoding_euc_kr, - &encoding_uhc, - &encoding_2022kr, - &encoding_cp1251, - &encoding_cp866, - &encoding_koi8r, - &encoding_koi8u, - &encoding_armscii8, - &encoding_cp1254, - &encoding_cp850, - NULL -}; - -static char* dummy_encoding_detector(const unsigned char *string, size_t length, char *list TSRMLS_DC) +static const zend_encoding *dummy_encoding_fetcher(const char *encoding_name TSRMLS_DC) { return NULL; } -static int dummy_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const char *encoding_to, const char *encoding_from TSRMLS_DC) +static const char *dummy_encoding_name_getter(const zend_encoding *encoding) { - return -1; + return NULL; } -static size_t dummy_encoding_oddlen(const unsigned char *string, size_t length, const char *encoding TSRMLS_DC) +static int dummy_encoding_lexer_compatibility_checker(const zend_encoding *encoding) { return 0; } -static int dummy_encoding_list_checker(const char *encoding_list TSRMLS_DC) +static const zend_encoding *dummy_encoding_detector(const unsigned char *string, size_t length, const zend_encoding **list, size_t list_size TSRMLS_DC) { - /* ignore encoding */ - return 1; + return NULL; } -static const char* dummy_get_internal_encoding(TSRMLS_D) +static size_t dummy_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from TSRMLS_DC) { - return NULL; + return (size_t)-1; } -ZEND_API zend_encoding_detector zend_multibyte_encoding_detector = dummy_encoding_detector; -ZEND_API zend_encoding_converter zend_multibyte_encoding_converter = dummy_encoding_converter; -ZEND_API zend_encoding_oddlen zend_multibyte_encoding_oddlen = dummy_encoding_oddlen; -ZEND_API zend_encoding_list_checker zend_multibyte_check_encoding_list = dummy_encoding_list_checker; -ZEND_API zend_encoding_name_getter zend_multibyte_get_internal_encoding = dummy_get_internal_encoding; - -ZEND_API int zend_multibyte_set_script_encoding(const char *encoding_list, -size_t encoding_list_size TSRMLS_DC) +static int dummy_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, int persistent TSRMLS_DC) { - if (CG(script_encoding_list)) { - efree(CG(script_encoding_list)); - CG(script_encoding_list) = NULL; - } - CG(script_encoding_list_size) = 0; - - if (!encoding_list) { - return 0; - } - - zend_multibyte_parse_encoding_list(encoding_list, encoding_list_size, &(CG(script_encoding_list)), &(CG(script_encoding_list_size))); - - return 0; + return FAILURE; } - -ZEND_API int zend_multibyte_set_internal_encoding(const char *encoding_name TSRMLS_DC) +static const zend_encoding *dummy_internal_encoding_getter(TSRMLS_D) { - CG(internal_encoding) = zend_multibyte_fetch_encoding(encoding_name); - return 0; + return NULL; } -ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen, zend_encoding_list_checker encoding_list_checker, zend_encoding_name_getter get_internal_encoding TSRMLS_DC) +static int dummy_internal_encoding_setter(const zend_encoding *encoding TSRMLS_DC) { - zend_multibyte_encoding_detector = encoding_detector; - zend_multibyte_encoding_converter = encoding_converter; - zend_multibyte_encoding_oddlen = encoding_oddlen; - zend_multibyte_check_encoding_list = encoding_list_checker; - zend_multibyte_get_internal_encoding = get_internal_encoding; - return 0; + return FAILURE; } +static zend_multibyte_functions multibyte_functions = { + NULL, + dummy_encoding_fetcher, + dummy_encoding_name_getter, + dummy_encoding_lexer_compatibility_checker, + dummy_encoding_detector, + dummy_encoding_converter, + dummy_encoding_list_parser, + dummy_internal_encoding_getter, + dummy_internal_encoding_setter +}; -ZEND_API int zend_multibyte_set_filter(zend_encoding *onetime_encoding TSRMLS_DC) -{ - LANG_SCNG(script_encoding) = zend_multibyte_find_script_encoding(onetime_encoding TSRMLS_CC); - LANG_SCNG(internal_encoding) = CG(internal_encoding); - - /* judge input/output filter */ - LANG_SCNG(input_filter) = NULL; - LANG_SCNG(output_filter) = NULL; +ZEND_API const zend_encoding *zend_multibyte_encoding_utf32be; +ZEND_API const zend_encoding *zend_multibyte_encoding_utf32le; +ZEND_API const zend_encoding *zend_multibyte_encoding_utf16be; +ZEND_API const zend_encoding *zend_multibyte_encoding_utf16le; +ZEND_API const zend_encoding *zend_multibyte_encoding_utf8; - if (!LANG_SCNG(script_encoding)) { - return 0; +ZEND_API int zend_multibyte_set_functions(const zend_multibyte_functions *functions TSRMLS_DC) +{ + zend_multibyte_encoding_utf32be = functions->encoding_fetcher("UTF-32BE" TSRMLS_CC); + if (!zend_multibyte_encoding_utf32be) { + return FAILURE; } - - if (!LANG_SCNG(internal_encoding) || LANG_SCNG(script_encoding) == LANG_SCNG(internal_encoding)) { - /* if encoding specfic filters exist, use them */ - if (LANG_SCNG(script_encoding)->input_filter && LANG_SCNG(script_encoding)->output_filter) { - LANG_SCNG(input_filter) = LANG_SCNG(script_encoding)->input_filter; - LANG_SCNG(output_filter) = LANG_SCNG(script_encoding)->output_filter; - return 0; - } - - if (!LANG_SCNG(script_encoding)->compatible) { - /* and if not, work around w/ script_encoding -> utf-8 -> script_encoding conversion */ - LANG_SCNG(internal_encoding) = LANG_SCNG(script_encoding); - LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter; - LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter; - return 0; - } else { - /* nothing to do in this case */ - return 0; - } + zend_multibyte_encoding_utf32le = functions->encoding_fetcher("UTF-32LE" TSRMLS_CC); + if (!zend_multibyte_encoding_utf32le) { + return FAILURE; } - - /* LANG_SCNG(internal_encoding) cannot be NULL here */ - if (LANG_SCNG(internal_encoding)->compatible) { - LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter; - return 0; - } else if (LANG_SCNG(script_encoding)->compatible) { - LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter; - return 0; + zend_multibyte_encoding_utf16be = functions->encoding_fetcher("UTF-16BE" TSRMLS_CC); + if (!zend_multibyte_encoding_utf16be) { + return FAILURE; } - - /* both script and internal encodings are incompatible w/ flex */ - LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter; - LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter; - - return 0; -} - - -ZEND_API zend_encoding* zend_multibyte_fetch_encoding(const char *encoding_name) -{ - int i, j; - zend_encoding *encoding; - - if (!encoding_name) { - return NULL; + zend_multibyte_encoding_utf16le = functions->encoding_fetcher("UTF-16LE" TSRMLS_CC); + if (!zend_multibyte_encoding_utf16le) { + return FAILURE; } - - for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) { - if (zend_binary_strcasecmp(encoding->name, strlen(encoding->name), encoding_name, strlen(encoding_name)) == 0) { - return encoding; - } + zend_multibyte_encoding_utf8 = functions->encoding_fetcher("UTF-8" TSRMLS_CC); + if (!zend_multibyte_encoding_utf8) { + return FAILURE; } - for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) { - if (encoding->aliases != NULL) { - for (j = 0; (*encoding->aliases)[j] != NULL; j++) { - if (zend_binary_strcasecmp((*encoding->aliases)[j], strlen((*encoding->aliases)[j]), encoding_name, strlen(encoding_name)) == 0) { - return encoding; - } - } - } - } + multibyte_functions = *functions; - return NULL; + /* As zend_multibyte_set_functions() gets called after ini settings were + * populated, we need to reinitialize script_encoding here. + */ + { + const char *value = zend_ini_string("zend.script_encoding", sizeof("zend.script_encoding"), 0); + zend_multibyte_set_script_encoding_by_string(value, strlen(value) TSRMLS_CC); + } + return SUCCESS; } - -ZEND_API size_t zend_multibyte_script_encoding_filter(unsigned char **to, size_t -*to_length, const unsigned char *from, size_t from_length TSRMLS_DC) +ZEND_API const zend_multibyte_functions *zend_multibyte_get_functions(TSRMLS_D) { - const char *name; - - if (LANG_SCNG(internal_encoding) == NULL || LANG_SCNG(internal_encoding)->compatible == 0) { - name = "UTF-8"; - } else { - name = LANG_SCNG(internal_encoding)->name; - } - - return zend_multibyte_encoding_filter(to, to_length, name, from, from_length, LANG_SCNG(script_encoding)->name TSRMLS_CC); + return multibyte_functions.provider_name ? &multibyte_functions: NULL; } -ZEND_API size_t zend_multibyte_internal_encoding_filter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length TSRMLS_DC) +ZEND_API const zend_encoding *zend_multibyte_fetch_encoding(const char *name TSRMLS_DC) { - const char *name; - - if (LANG_SCNG(script_encoding)->compatible == 0) { - name = "UTF-8"; - } else { - name = LANG_SCNG(script_encoding)->name; - } - - return zend_multibyte_encoding_filter(to, to_length, LANG_SCNG(internal_encoding)->name, from, from_length, name TSRMLS_CC); + return multibyte_functions.encoding_fetcher(name TSRMLS_CC); } -static size_t zend_multibyte_encoding_filter(unsigned char **to, size_t *to_length, const char *to_encoding, const unsigned char *from, size_t from_length, const char *from_encoding TSRMLS_DC) +ZEND_API const char *zend_multibyte_get_encoding_name(const zend_encoding *encoding) { - size_t oddlen; - - if (zend_multibyte_encoding_converter == dummy_encoding_converter) { - return 0; - } - - oddlen = zend_multibyte_encoding_oddlen(from, from_length, from_encoding TSRMLS_CC); - if (oddlen > 0) { - from_length -= oddlen; - } - - if (zend_multibyte_encoding_converter(to, to_length, from, from_length, to_encoding, from_encoding TSRMLS_CC) != 0) { - return 0; - } - - return from_length; + return multibyte_functions.encoding_name_getter(encoding); } - -/* - * Shift_JIS Input/Output Filter - */ -static const unsigned char table_sjis[] = { /* 0x80-0x9f,0xE0-0xEF */ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 0, 0, 0 -}; - -size_t sjis_input_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC) +ZEND_API int zend_multibyte_check_lexer_compatibility(const zend_encoding *encoding) { - const unsigned char *p; - unsigned char *q; - unsigned char c1, c2; - - *buf = (unsigned char*)emalloc(sjis_length * 3 / 2 + 1); - if (!*buf) - return 0; - *length = 0; - - p = sjis; - q = *buf; - - /* convert [SJIS -> EUC-JP] (for lex scan) -- some other better ways? */ - while (*p && (p - sjis) < sjis_length) { - if (!(*p & 0x80)) { - *q++ = *p++; - continue; - } - - /* handling 8 bit code */ - if (table_sjis[*p] == 1) { - /* 1 byte kana */ - *q++ = 0x8e; - *q++ = *p++; - continue; - } - - if (!*(p+1)) { - *q++ = *p++; - break; - } - - if (table_sjis[*p] == 2) { - /* 2 byte kanji code */ - c1 = *p++; - if (!*p || (p - sjis) >= sjis_length) { - break; - } - c2 = *p++; - c1 -= (c1 <= 0x9f) ? 0x71 : 0xb1; - c1 = (c1 << 1) + 1; - if (c2 >= 0x9e) { - c2 -= 0x7e; - c1++; - } else if (c2 > 0x7f) { - c2 -= 0x20; - } else { - c2 -= 0x1f; - } - - c1 |= 0x80; - c2 |= 0x80; - - *q++ = c1; - *q++ = c2; - } else { - /* - * for user defined chars (ATTENTION) - * - * THESE ARE NOT CODE FOR CONVERSION! :-P - * (using *ILLEGALLY* 3byte EUC-JP space) - * - * we cannot perfectly (== 1 to 1) convert these chars to EUC-JP. - * so, these code are for perfect RESTORING in sjis_output_filter() - */ - c1 = *p++; - if (!*p || (p - sjis) >= sjis_length) { - break; - } - c2 = *p++; - *q++ = 0x8f; - /* - * MAP TO (EUC-JP): - * type A: 0xeba1 - 0xf4fe - * type B: 0xf5a1 - 0xfefe - * type C: 0xa1a1 - 0xa6fe - */ - c1 -= (c1 > 0xf9) ? (0x79+0x71) : (0x0a+0xb1); - c1 = (c1 << 1) + 1; - if (c2 >= 0x9e) { - c2 -= 0x7e; - c1++; - } else if (c2 > 0x7f) { - c2 -= 0x20; - } else { - c2 -= 0x1f; - } - - c1 |= 0x80; - c2 |= 0x80; - - *q++ = c1; - *q++ = c2; - } - } - *q = '\0'; - *length = q - *buf; - - return *length; + return multibyte_functions.lexer_compatibility_checker(encoding); } -static const unsigned char table_eucjp[] = { /* 0xA1-0xFE */ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 -}; - -size_t sjis_output_filter(unsigned char **sjis, size_t *sjis_length, const unsigned char *buf, size_t length TSRMLS_DC) +ZEND_API const zend_encoding *zend_multibyte_encoding_detector(const unsigned char *string, size_t length, const zend_encoding **list, size_t list_size TSRMLS_DC) { - unsigned char c1, c2; - unsigned char *p; - const unsigned char *q; - - if (!sjis || !sjis_length) { - return 0; - } - - /* always Shift_JIS <= EUC-JP */ - *sjis = (unsigned char*)emalloc(length+1); - if (!sjis) { - return 0; - } - p = *sjis; - q = buf; - - /* restore converted strings [EUC-JP -> Shift_JIS] */ - while (*q && (q - buf) < length) { - if (!(*q & 0x80)) { - *p++ = *q++; - continue; - } - - /* hankaku kana */ - if (*q == 0x8e) { - q++; - if (*q) { - *p++ = *q++; - } - continue; - } - - /* 2 byte kanji code */ - if (table_eucjp[*q] == 2) { - c1 = (*q++ & ~0x80) & 0xff; - if (*q) { - c2 = (*q++ & ~0x80) & 0xff; - } else { - q--; - break; - } - - c2 += (c1 & 0x01) ? 0x1f : 0x7d; - if (c2 >= 0x7f) { - c2++; - } - c1 = ((c1 - 0x21) >> 1) + 0x81; - if (c1 > 0x9f) { - c1 += 0x40; - } - - *p++ = c1; - *p++ = c2; - continue; - } - - if (*q == 0x8f) { - q++; - if (*q) { - c1 = (*q++ & ~0x80) & 0xff; - } else { - q--; - break; - } - if (*q) { - c2 = (*q++ & ~0x80) & 0xff; - } else { - q -= 2; - break; - } - - c2 += (c1 & 0x01) ? 0x1f : 0x7d; - if (c2 >= 0x7f) { - c2++; - } - c1 = ((c1 - 0x21) >> 1) + 0x81; - if (c1 > 0x9f) { - c1 += 0x40; - } - - if (c1 >= 0x81 && c1 <= 0x9f) { - c1 += 0x79; - } else { - c1 += 0x0a; - } - - *p++ = c1; - *p++ = c2; - continue; - } - - /* some other chars (may not happen) */ - *p++ = *q++; - } - *p = '\0'; - *sjis_length = p - *sjis; - - return q-buf; /* return length we actually read */ + return multibyte_functions.encoding_detector(string, length, list, list_size TSRMLS_CC); } - -static char *zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, size_t encoding_list_size) +ZEND_API size_t zend_multibyte_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from TSRMLS_DC) { - int i, list_size = 0; - const char *name; - char *list = NULL; - - if (!encoding_list || !encoding_list_size) { - return NULL; - } - - for (i = 0; i < encoding_list_size; i++) { - name = (*(encoding_list+i))->name; - if (name) { - list_size += strlen(name) + 1; - if (!list) { - list = (char*)emalloc(list_size); - if (!list) { - return NULL; - } - *list = '\0'; - } else { - list = (char*)erealloc(list, list_size); - if (!list) { - return NULL; - } - strcat(list, ","); - } - strcat(list, name); - } - } - return list; + return multibyte_functions.encoding_converter(to, to_length, from, from_length, encoding_to, encoding_from TSRMLS_CC); } - -static int zend_multibyte_parse_encoding_list(const char *encoding_list, -size_t encoding_list_size, zend_encoding ***result, size_t *result_size) +ZEND_API int zend_multibyte_parse_encoding_list(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, int persistent TSRMLS_DC) { - int n, size; - char *p, *p1, *p2, *endp, *tmpstr; - zend_encoding **list, **entry, *encoding; - - list = NULL; - if (encoding_list == NULL || encoding_list_size <= 0) { - return -1; - } else { - /* copy the encoding_list string for work */ - tmpstr = (char *)estrndup(encoding_list, encoding_list_size); - if (tmpstr == NULL) { - return -1; - } - /* count the number of listed encoding names */ - endp = tmpstr + encoding_list_size; - n = 1; - p1 = tmpstr; - while ((p2 = zend_memnstr(p1, ",", 1, endp)) != NULL) { - p1 = p2 + 1; - n++; - } - size = n; - /* make list */ - list = (zend_encoding**)ecalloc(size, sizeof(zend_encoding*)); - if (list != NULL) { - entry = list; - n = 0; - p1 = tmpstr; - do { - p2 = p = zend_memnstr(p1, ",", 1, endp); - if (p == NULL) { - p = endp; - } - *p = '\0'; - /* trim spaces */ - while (p1 < p && (*p1 == ' ' || *p1 == '\t')) { - p1++; - } - p--; - while (p > p1 && (*p == ' ' || *p == '\t')) { - *p = '\0'; - p--; - } - /* convert to the encoding number and check encoding */ - encoding = zend_multibyte_fetch_encoding(p1); - if (encoding) - { - *entry++ = encoding; - n++; - } - p1 = p2 + 1; - } while (n < size && p2 != NULL); - *result = list; - *result_size = n; - } - efree(tmpstr); - } - - if (list == NULL) { - return -1; - } - - return 0; + return multibyte_functions.encoding_list_parser(encoding_list, encoding_list_len, return_list, return_size, persistent TSRMLS_CC); } - -static zend_encoding* zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC) +ZEND_API const zend_encoding *zend_multibyte_get_internal_encoding(TSRMLS_D) { - zend_encoding *script_encoding; - char *name, *list; - - /* onetime_encoding is prior to everything */ - if (onetime_encoding != NULL) { - return onetime_encoding; - } - - if (CG(detect_unicode)) { - /* check out bom(byte order mark) and see if containing wchars */ - script_encoding = zend_multibyte_detect_unicode(TSRMLS_C); - if (script_encoding != NULL) { - /* bom or wchar detection is prior to 'script_encoding' option */ - return script_encoding; - } - } + return multibyte_functions.internal_encoding_getter(TSRMLS_C); +} - /* if no script_encoding specified, just leave alone */ - if (!CG(script_encoding_list) || !CG(script_encoding_list_size)) { - return NULL; - } +ZEND_API const zend_encoding *zend_multibyte_get_script_encoding(TSRMLS_D) +{ + return LANG_SCNG(script_encoding); +} - /* if multiple encodings specified, detect automagically */ - if (CG(script_encoding_list_size) > 1 && - zend_multibyte_encoding_detector != dummy_encoding_detector) { - list = zend_multibyte_assemble_encoding_list(CG(script_encoding_list), - CG(script_encoding_list_size)); - name = zend_multibyte_encoding_detector(LANG_SCNG(script_org), - LANG_SCNG(script_org_size), list TSRMLS_CC); - if (list) { - efree(list); - } - if (name) { - script_encoding = zend_multibyte_fetch_encoding(name); - efree(name); - } else { - script_encoding = NULL; - } - return script_encoding; +ZEND_API int zend_multibyte_set_script_encoding(const zend_encoding **encoding_list, size_t encoding_list_size TSRMLS_DC) +{ + if (CG(script_encoding_list)) { + efree(CG(script_encoding_list)); } - - return *(CG(script_encoding_list)); + CG(script_encoding_list) = encoding_list; + CG(script_encoding_list_size) = encoding_list_size; + return SUCCESS; } +ZEND_API int zend_multibyte_set_internal_encoding(const zend_encoding *encoding TSRMLS_DC) +{ + return multibyte_functions.internal_encoding_setter(encoding TSRMLS_CC); +} -static zend_encoding* zend_multibyte_detect_unicode(TSRMLS_D) +ZEND_API int zend_multibyte_set_script_encoding_by_string(const char *new_value, size_t new_value_length TSRMLS_DC) { - zend_encoding *script_encoding = NULL; - int bom_size; - unsigned char *script; - unsigned char *pos1, *pos2; + const zend_encoding **list = 0; + size_t size = 0; - if (LANG_SCNG(script_org_size) < sizeof(BOM_UTF32_LE)-1) { - return NULL; + if (!new_value) { + zend_multibyte_set_script_encoding(NULL, 0 TSRMLS_CC); + return SUCCESS; } - /* check out BOM */ - if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_BE, sizeof(BOM_UTF32_BE)-1)) { - script_encoding = &encoding_utf32be; - bom_size = sizeof(BOM_UTF32_BE)-1; - } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_LE, sizeof(BOM_UTF32_LE)-1)) { - script_encoding = &encoding_utf32le; - bom_size = sizeof(BOM_UTF32_LE)-1; - } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_BE, sizeof(BOM_UTF16_BE)-1)) { - script_encoding = &encoding_utf16be; - bom_size = sizeof(BOM_UTF16_BE)-1; - } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_LE, sizeof(BOM_UTF16_LE)-1)) { - script_encoding = &encoding_utf16le; - bom_size = sizeof(BOM_UTF16_LE)-1; - } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF8, sizeof(BOM_UTF8)-1)) { - script_encoding = &encoding_utf8; - bom_size = sizeof(BOM_UTF8)-1; + if (FAILURE == zend_multibyte_parse_encoding_list(new_value, new_value_length, &list, &size, 1 TSRMLS_CC)) { + return FAILURE; } - if (script_encoding) { - /* remove BOM */ - script = (unsigned char*)emalloc(LANG_SCNG(script_org_size)+1-bom_size); - memcpy(script, LANG_SCNG(script_org)+bom_size, LANG_SCNG(script_org_size)+1-bom_size); - efree(LANG_SCNG(script_org)); - LANG_SCNG(script_org) = script; - LANG_SCNG(script_org_size) -= bom_size; - - return script_encoding; + if (size == 0) { + pefree(list, 1); + return FAILURE; } - /* script contains NULL bytes -> auto-detection */ - if ((pos1 = memchr(LANG_SCNG(script_org), 0, LANG_SCNG(script_org_size)))) { - /* check if the NULL byte is after the __HALT_COMPILER(); */ - pos2 = LANG_SCNG(script_org); - - while (pos1 - pos2 >= sizeof("__HALT_COMPILER();")-1) { - pos2 = memchr(pos2, '_', pos1 - pos2); - if (!pos2) break; - pos2++; - if (strncasecmp((char*)pos2, "_HALT_COMPILER", sizeof("_HALT_COMPILER")-1) == 0) { - pos2 += sizeof("_HALT_COMPILER")-1; - while (*pos2 == ' ' || - *pos2 == '\t' || - *pos2 == '\r' || - *pos2 == '\n') { - pos2++; - } - if (*pos2 == '(') { - pos2++; - while (*pos2 == ' ' || - *pos2 == '\t' || - *pos2 == '\r' || - *pos2 == '\n') { - pos2++; - } - if (*pos2 == ')') { - pos2++; - while (*pos2 == ' ' || - *pos2 == '\t' || - *pos2 == '\r' || - *pos2 == '\n') { - pos2++; - } - if (*pos2 == ';') { - return NULL; - } - } - } - } - } - /* make best effort if BOM is missing */ - return zend_multibyte_detect_utf_encoding(LANG_SCNG(script_org), LANG_SCNG(script_org_size) TSRMLS_CC); + if (FAILURE == zend_multibyte_set_script_encoding(list, size TSRMLS_CC)) { + return FAILURE; } - return NULL; + return SUCCESS; } -static zend_encoding *zend_multibyte_detect_utf_encoding(const unsigned char *script, size_t script_size TSRMLS_DC) +ZEND_API size_t zend_multibyte_script_encoding_filter(unsigned char **to, size_t +*to_length, const unsigned char *from, size_t from_length TSRMLS_DC) { - const unsigned char *p; - int wchar_size = 2; - int le = 0; - - /* utf-16 or utf-32? */ - p = script; - while ((p-script) < script_size) { - p = memchr(p, 0, script_size-(p-script)-2); - if (!p) { - break; - } - if (*(p+1) == '\0' && *(p+2) == '\0') { - wchar_size = 4; - break; - } - - /* searching for UTF-32 specific byte orders, so this will do */ - p += 4; - } - - /* BE or LE? */ - p = script; - while ((p-script) < script_size) { - if (*p == '\0' && *(p+wchar_size-1) != '\0') { - /* BE */ - le = 0; - break; - } else if (*p != '\0' && *(p+wchar_size-1) == '\0') { - /* LE* */ - le = 1; - break; - } - p += wchar_size; + const zend_encoding *internal_encoding = zend_multibyte_get_internal_encoding(TSRMLS_C); + if (!internal_encoding || !zend_multibyte_check_lexer_compatibility(internal_encoding)) { + internal_encoding = zend_multibyte_encoding_utf8; } + return zend_multibyte_encoding_converter(to, to_length, from, from_length, internal_encoding, LANG_SCNG(script_encoding) TSRMLS_CC); +} - if (wchar_size == 2) { - return le ? &encoding_utf16le : &encoding_utf16be; - } else { - return le ? &encoding_utf32le : &encoding_utf32be; +ZEND_API size_t zend_multibyte_internal_encoding_filter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length TSRMLS_DC) +{ + const zend_encoding *internal_encoding = zend_multibyte_get_internal_encoding(TSRMLS_C); + const zend_encoding *script_encoding = LANG_SCNG(script_encoding); + if (!internal_encoding || !zend_multibyte_check_lexer_compatibility(internal_encoding)) { + internal_encoding = zend_multibyte_encoding_utf8; + } + if (!zend_multibyte_check_lexer_compatibility(script_encoding)) { + script_encoding = zend_multibyte_encoding_utf8; } - - return NULL; + return zend_multibyte_encoding_converter(to, to_length, from, from_length, +script_encoding, internal_encoding TSRMLS_CC); } /* |