diff options
-rw-r--r-- | Zend/Zend.m4 | 13 | ||||
-rw-r--r-- | Zend/flex.skl | 11 | ||||
-rw-r--r-- | Zend/zend_compile.c | 52 | ||||
-rw-r--r-- | Zend/zend_globals.h | 32 | ||||
-rw-r--r-- | Zend/zend_highlight.c | 17 | ||||
-rw-r--r-- | Zend/zend_language_scanner.h | 16 | ||||
-rw-r--r-- | Zend/zend_language_scanner.l | 336 | ||||
-rw-r--r-- | Zend/zend_multibyte.c | 1133 | ||||
-rw-r--r-- | Zend/zend_multibyte.h | 79 | ||||
-rw-r--r-- | ext/mbstring/mbstring.c | 20 | ||||
-rw-r--r-- | main/main.c | 7 |
11 files changed, 1680 insertions, 36 deletions
diff --git a/Zend/Zend.m4 b/Zend/Zend.m4 index dc083338ee..fe6004de73 100644 --- a/Zend/Zend.m4 +++ b/Zend/Zend.m4 @@ -129,6 +129,13 @@ AC_ARG_ENABLE(memory-limit, ZEND_MEMORY_LIMIT=no ]) +AC_ARG_ENABLE(zend-multibyte, +[ --enable-zend-multibyte Compile with zend multibyte support. ], [ + ZEND_MULTIBYTE=$enableval +],[ + ZEND_MULTIBYTE=no +]) + AC_MSG_CHECKING(whether to enable thread-safety) AC_MSG_RESULT($ZEND_MAINTAINER_ZTS) @@ -140,6 +147,9 @@ AC_MSG_RESULT($ZEND_MEMORY_LIMIT) AC_MSG_CHECKING(whether to enable Zend debugging) AC_MSG_RESULT($ZEND_DEBUG) + +AC_MSG_CHECKING(whether to enable Zend multibyte) +AC_MSG_RESULT($ZEND_MULTIBYTE) if test "$ZEND_DEBUG" = "yes"; then AC_DEFINE(ZEND_DEBUG,1,[ ]) @@ -168,6 +178,9 @@ else AC_DEFINE(MEMORY_LIMIT, 0, [Memory limit]) fi +if test "$ZEND_MULTIBYTE" = "yes"; then + AC_DEFINE(ZEND_MULTIBYTE, 1, [ ]) +fi changequote({,}) if test -n "$GCC" && test "$ZEND_INLINE_OPTIMIZATION" != "yes"; then diff --git a/Zend/flex.skl b/Zend/flex.skl index 1fac72db1f..9a7c5cf873 100644 --- a/Zend/flex.skl +++ b/Zend/flex.skl @@ -440,12 +440,17 @@ YY_MALLOC_DECL #define ECHO /* There is no output */ #endif -#define YY_INPUT(buf, result, max_size) \ +#ifdef ZEND_MULTIBYTE +# define YY_INPUT(buf, result, max_size) \ + if ( ((result = zend_multibyte_yyinput(yyin, buf, max_size TSRMLS_CC)) == 0) \ + && zend_stream_ferror( yyin TSRMLS_CC) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); +#else +# define YY_INPUT(buf, result, max_size) \ if ( ((result = zend_stream_read(yyin, buf, max_size TSRMLS_CC)) == 0) \ && zend_stream_ferror( yyin TSRMLS_CC) ) \ YY_FATAL_ERROR( "input in flex scanner failed" ); - - +#endif #ifndef ECHO %- Standard (non-C++) definition diff --git a/Zend/zend_compile.c b/Zend/zend_compile.c index 7bbc9914d4..dd977a1f40 100644 --- a/Zend/zend_compile.c +++ b/Zend/zend_compile.c @@ -27,6 +27,10 @@ #include "zend_API.h" #include "zend_fast_cache.h" +#ifdef ZEND_MULTIBYTE +#include "zend_multibyte.h" +#endif /* ZEND_MULTIBYTE */ + ZEND_API zend_op_array *(*zend_compile_file)(zend_file_handle *file_handle, int type TSRMLS_DC); @@ -51,7 +55,14 @@ static void build_runtime_defined_function_key(zval *result, char *name, int nam /* NULL, name length, filename length, line number length */ result->value.str.len = 1+name_length+strlen(filename)+lineno_len; result->value.str.val = (char *) emalloc(result->value.str.len+1); +#ifdef ZEND_MULTIBYTE + /* must be binary safe */ + result->value.str.val[0] = '\0'; + memcpy(result->value.str.val+1, name, name_length); + sprintf(result->value.str.val+1+name_length, "%s%s", filename, lineno_buf); +#else sprintf(result->value.str.val, "%c%s%s%s", '\0', name, filename, lineno_buf); +#endif /* ZEND_MULTIBYTE */ result->type = IS_STRING; result->refcount = 1; } @@ -90,6 +101,15 @@ void zend_init_compiler_data_structures(TSRMLS_D) init_compiler_declarables(TSRMLS_C); CG(throw_list) = NULL; zend_hash_apply(CG(auto_globals), (apply_func_t) zend_auto_global_arm TSRMLS_CC); + +#ifdef ZEND_MULTIBYTE + CG(script_encoding_list) = NULL; + CG(script_encoding_list_size) = 0; + CG(internal_encoding) = NULL; + CG(encoding_detector) = NULL; + CG(encoding_converter) = NULL; + CG(encoding_oddlen) = NULL; +#endif /* ZEND_MULTIBYTE */ } @@ -114,6 +134,12 @@ void shutdown_compiler(TSRMLS_D) zend_stack_destroy(&CG(list_stack)); zend_hash_destroy(&CG(filenames_table)); zend_llist_destroy(&CG(open_files)); + +#ifdef ZEND_MULTIBYTE + if (CG(script_encoding_list)) { + efree(CG(script_encoding_list)); + } +#endif /* ZEND_MULTIBYTE */ } @@ -3064,6 +3090,32 @@ void zend_do_declare_stmt(znode *var, znode *val TSRMLS_DC) if (!zend_binary_strcasecmp(var->u.constant.value.str.val, var->u.constant.value.str.len, "ticks", sizeof("ticks")-1)) { convert_to_long(&val->u.constant); CG(declarables).ticks = val->u.constant; +#ifdef ZEND_MULTIBYTE + } else if (!zend_binary_strcasecmp(var->u.constant.value.str.val, var->u.constant.value.str.len, "encoding", sizeof("encoding")-1)) { + zend_encoding *new_encoding, *old_encoding; + zend_encoding_filter old_input_filter; + + if (val->u.constant.type == IS_CONSTANT) { + zend_error(E_COMPILE_ERROR, "Cannot use constants as encoding"); + } + convert_to_string(&val->u.constant); + new_encoding = zend_multibyte_fetch_encoding(val->u.constant.value.str.val); + if (!new_encoding) { + zend_error(E_COMPILE_WARNING, "Unsupported encoding [%s]", val->u.constant.value.str.val); + } else { + old_input_filter = LANG_SCNG(input_filter); + old_encoding = LANG_SCNG(script_encoding); + zend_multibyte_set_filter(new_encoding TSRMLS_CC); + + /* need to re-scan if input filter changed */ + if (old_input_filter != LANG_SCNG(input_filter) || + ((old_input_filter == zend_multibyte_script_encoding_filter) && + (new_encoding != old_encoding))) { + zend_multibyte_yyinput_again(old_input_filter, old_encoding TSRMLS_CC); + } + } + efree(val->u.constant.value.str.val); +#endif /* ZEND_MULTIBYTE */ } zval_dtor(&var->u.constant); } diff --git a/Zend/zend_globals.h b/Zend/zend_globals.h index 1152290409..95f3c8933d 100644 --- a/Zend/zend_globals.h +++ b/Zend/zend_globals.h @@ -35,6 +35,10 @@ #include "zend_objects.h" #include "zend_objects_API.h" +#ifdef ZEND_MULTIBYTE +#include "zend_multibyte.h" +#endif /* ZEND_MULTIBYTE */ + /* Define ZTS if you want a thread-safe Zend */ /*#undef ZTS*/ @@ -127,6 +131,18 @@ struct _zend_compiler_globals { char *doc_comment; zend_uint doc_comment_len; + +#ifdef ZEND_MULTIBYTE + zend_encoding **script_encoding_list; + int script_encoding_list_size; + + zend_encoding *internal_encoding; + + /* multibyte utility functions */ + zend_encoding_detector encoding_detector; + zend_encoding_converter encoding_converter; + zend_encoding_oddlen encoding_oddlen; +#endif /* ZEND_MULTIBYTE */ }; @@ -271,6 +287,22 @@ struct _zend_scanner_globals { int yy_start_stack_ptr; int yy_start_stack_depth; int *yy_start_stack; + +#ifdef ZEND_MULTIBYTE + /* original (unfiltered) script */ + char *script_org; + int script_org_size; + + /* filtered script */ + char *script_filtered; + int script_filtered_size; + + /* input/ouput filters */ + zend_encoding_filter input_filter; + zend_encoding_filter output_filter; + zend_encoding *script_encoding; + zend_encoding *internal_encoding; +#endif /* ZEND_MULTIBYTE */ }; #endif /* ZEND_GLOBALS_H */ diff --git a/Zend/zend_highlight.c b/Zend/zend_highlight.c index 746dafee89..25c6070ec5 100644 --- a/Zend/zend_highlight.c +++ b/Zend/zend_highlight.c @@ -57,6 +57,17 @@ ZEND_API void zend_html_putc(char c) ZEND_API void zend_html_puts(const char *s, uint len TSRMLS_DC) { const char *ptr=s, *end=s+len; + +#ifdef ZEND_MULTIBYTE + char *filtered; + int filtered_len; + + if (LANG_SCNG(output_filter)) { + LANG_SCNG(output_filter)(&filtered, &filtered_len, s, len TSRMLS_CC); + ptr = filtered; + end = filtered + filtered_len; + } +#endif /* ZEND_MULTIBYTE */ while (ptr<end) { if (*ptr==' ') { @@ -75,6 +86,12 @@ ZEND_API void zend_html_puts(const char *s, uint len TSRMLS_DC) zend_html_putc(*ptr++); } } + +#ifdef ZEND_MULTIBYTE + if (LANG_SCNG(output_filter)) { + efree(filtered); + } +#endif /* ZEND_MULTIBYTE */ } diff --git a/Zend/zend_language_scanner.h b/Zend/zend_language_scanner.h index 332e020882..882f63e3bb 100644 --- a/Zend/zend_language_scanner.h +++ b/Zend/zend_language_scanner.h @@ -28,6 +28,22 @@ typedef struct _zend_lex_state { zend_file_handle *in; uint lineno; char *filename; + +#ifdef ZEND_MULTIBYTE + /* original (unfiltered) script */ + char *script_org; + int script_org_size; + + /* filtered script */ + char *script_filtered; + int script_filtered_size; + + /* input/ouput filters */ + zend_encoding_filter input_filter; + zend_encoding_filter output_filter; + zend_encoding *script_encoding; + zend_encoding *internal_encoding; +#endif /* ZEND_MULTIBYTE */ } zend_lex_state; diff --git a/Zend/zend_language_scanner.l b/Zend/zend_language_scanner.l index a12abdd4e9..7c3e38d1d8 100644 --- a/Zend/zend_language_scanner.l +++ b/Zend/zend_language_scanner.l @@ -127,6 +127,12 @@ void startup_scanner(TSRMLS_D) RESET_DOC_COMMENT(); SCNG(yy_start_stack_ptr) = 0; SCNG(yy_start_stack_depth) = 0; +#ifdef ZEND_MULTIBYTE + SCNG(script_org) = NULL; + SCNG(script_org_size) = 0; + SCNG(script_filtered) = NULL; + SCNG(script_filtered_size) = 0; +#endif /* ZEND_MULTIBYTE */ } @@ -137,6 +143,17 @@ void shutdown_scanner(TSRMLS_D) CG(heredoc_len)=0; } RESET_DOC_COMMENT(); + +#ifdef ZEND_MULTIBYTE + if (SCNG(script_org)) { + efree(SCNG(script_org)); + SCNG(script_org) = NULL; + } + if (SCNG(script_filtered)) { + efree(SCNG(script_filtered)); + SCNG(script_filtered) = NULL; + } +#endif /* ZEND_MULTIBYTE */ } END_EXTERN_C() @@ -148,6 +165,17 @@ ZEND_API void zend_save_lexical_state(zend_lex_state *lex_state TSRMLS_DC) lex_state->state = YYSTATE; lex_state->filename = zend_get_compiled_filename(TSRMLS_C); lex_state->lineno = CG(zend_lineno); + +#ifdef ZEND_MULTIBYTE + lex_state->script_org = SCNG(script_org); + lex_state->script_org_size = SCNG(script_org_size); + lex_state->script_filtered = SCNG(script_filtered); + lex_state->script_filtered_size = SCNG(script_filtered_size); + lex_state->input_filter = SCNG(input_filter); + lex_state->output_filter = SCNG(output_filter); + lex_state->script_encoding = SCNG(script_encoding); + lex_state->internal_encoding = SCNG(internal_encoding); +#endif /* ZEND_MULTIBYTE */ } @@ -166,6 +194,17 @@ ZEND_API void zend_restore_lexical_state(zend_lex_state *lex_state TSRMLS_DC) BEGIN(lex_state->state); CG(zend_lineno) = lex_state->lineno; zend_restore_compiled_filename(lex_state->filename TSRMLS_CC); + +#ifdef ZEND_MULTIBYTE + SCNG(script_org) = lex_state->script_org; + SCNG(script_org_size) = lex_state->script_org_size; + SCNG(script_filtered) = lex_state->script_filtered; + SCNG(script_filtered_size) = lex_state->script_filtered_size; + SCNG(input_filter) = lex_state->input_filter; + SCNG(output_filter) = lex_state->output_filter; + SCNG(script_encoding) = lex_state->script_encoding; + SCNG(internal_encoding) = lex_state->internal_encoding; +#endif /* ZEND_MULTIBYTE */ } @@ -235,7 +274,40 @@ ZEND_API int open_file_for_scanning(zend_file_handle *file_handle TSRMLS_DC) /* Reset the scanner for scanning the new file */ SCNG(yy_in) = file_handle; + +#ifdef ZEND_MULTIBYTE + if (file_handle->handle.stream.interactive == 0) { + if (zend_multibyte_read_script(TSRMLS_C) != 0) { + return FAILURE; + } + + /* force flex to use buffer only */ + SCNG(yy_in) = NULL; + SCNG(init) = 0; + SCNG(start) = 1; + + zend_multibyte_set_filter(NULL TSRMLS_CC); + + if (!SCNG(input_filter)) { + SCNG(script_filtered) = (char*)emalloc(SCNG(script_org_size)+1); + memcpy(SCNG(script_filtered), SCNG(script_org), SCNG(script_org_size)+1); + SCNG(script_filtered_size) = SCNG(script_org_size); + } else { + SCNG(input_filter)(&SCNG(script_filtered), &SCNG(script_filtered_size), SCNG(script_org), SCNG(script_org_size) TSRMLS_CC); + } + + /* flex requires doubled null */ + SCNG(script_filtered) = (char*)erealloc(SCNG(script_filtered), SCNG(script_filtered_size)+2); + *(SCNG(script_filtered)+SCNG(script_filtered_size)) = (char)NULL; + *(SCNG(script_filtered)+SCNG(script_filtered_size)+1) = (char)NULL; + yy_scan_buffer(SCNG(script_filtered), SCNG(script_filtered_size)+2 TSRMLS_CC); + } else { + yy_switch_to_buffer(yy_create_buffer(SCNG(yy_in), YY_BUF_SIZE TSRMLS_CC) TSRMLS_CC); + } +#else /* !ZEND_MULTIBYTE */ yy_switch_to_buffer(yy_create_buffer(SCNG(yy_in), YY_BUF_SIZE TSRMLS_CC) TSRMLS_CC); +#endif /* ZEND_MULTIBYTE */ + BEGIN(INITIAL); if (file_handle->opened_path) { @@ -300,6 +372,17 @@ ZEND_API zend_op_array *compile_file(zend_file_handle *file_handle, int type TSR retval = NULL; } compilation_successful=1; + +#ifdef ZEND_MULTIBYTE + if (SCNG(script_org)) { + efree(SCNG(script_org)); + SCNG(script_org) = NULL; + } + if (SCNG(script_filtered)) { + efree(SCNG(script_filtered)); + SCNG(script_filtered) = NULL; + } +#endif /* ZEND_MULTIBYTE */ } if (retval) { @@ -367,7 +450,29 @@ ZEND_API int zend_prepare_string_for_scanning(zval *str, char *filename TSRMLS_D str->value.str.val[str->value.str.len+1]=0; SCNG(yy_in)=NULL; + +#ifdef ZEND_MULTIBYTE + SCNG(script_org) = estrdup(str->value.str.val); + SCNG(script_org_size) = str->value.str.len; + + zend_multibyte_set_filter(CG(internal_encoding) TSRMLS_CC); + + if (!SCNG(input_filter)) { + SCNG(script_filtered) = (char*)emalloc(SCNG(script_org_size)+1); + memcpy(SCNG(script_filtered), SCNG(script_org), SCNG(script_org_size)+1); + SCNG(script_filtered_size) = SCNG(script_org_size); + } else { + SCNG(input_filter)(&SCNG(script_filtered), &SCNG(script_filtered_size), SCNG(script_org), SCNG(script_org_size) TSRMLS_CC); + } + + /* flex requires doubled null */ + SCNG(script_filtered) = (char*)erealloc(SCNG(script_filtered), SCNG(script_filtered_size)+2); + *(SCNG(script_filtered)+SCNG(script_filtered_size)) = (char)NULL; + *(SCNG(script_filtered)+SCNG(script_filtered_size)+1) = (char)NULL; + yy_scan_buffer(SCNG(script_filtered), SCNG(script_filtered_size)+2 TSRMLS_CC); +#else /* !ZEND_MULTIBYTE */ yy_scan_buffer(str->value.str.val, str->value.str.len+2 TSRMLS_CC); +#endif /* ZEND_MULTIBYTE */ zend_set_compiled_filename(filename TSRMLS_CC); CG(zend_lineno) = 1; @@ -408,6 +513,17 @@ zend_op_array *compile_string(zval *source_string, char *filename TSRMLS_DC) BEGIN(ST_IN_SCRIPTING); compiler_result = zendparse(TSRMLS_C); +#ifdef ZEND_MULTIBYTE + if (SCNG(script_org)) { + efree(SCNG(script_org)); + SCNG(script_org) = NULL; + } + if (SCNG(script_filtered)) { + efree(SCNG(script_filtered)); + SCNG(script_filtered) = NULL; + } +#endif /* ZEND_MULTIBYTE */ + if (compiler_result==1) { CG(active_op_array) = original_active_op_array; CG(unclean_shutdown)=1; @@ -442,6 +558,16 @@ int highlight_file(char *filename, zend_syntax_highlighter_ini *syntax_highlight return FAILURE; } zend_highlight(syntax_highlighter_ini TSRMLS_CC); +#ifdef ZEND_MULTIBYTE + if (SCNG(script_org)) { + efree(SCNG(script_org)); + SCNG(script_org) = NULL; + } + if (SCNG(script_filtered)) { + efree(SCNG(script_filtered)); + SCNG(script_filtered) = NULL; + } +#endif /* ZEND_MULTIBYTE */ zend_destroy_file_handle(&file_handle TSRMLS_CC); zend_restore_lexical_state(&original_lex_state TSRMLS_CC); return SUCCESS; @@ -459,12 +585,166 @@ int highlight_string(zval *str, zend_syntax_highlighter_ini *syntax_highlighter_ return FAILURE; } zend_highlight(syntax_highlighter_ini TSRMLS_CC); +#ifdef ZEND_MULTIBYTE + if (SCNG(script_org)) { + efree(SCNG(script_org)); + SCNG(script_org) = NULL; + } + if (SCNG(script_filtered)) { + efree(SCNG(script_filtered)); + SCNG(script_filtered) = NULL; + } +#endif /* ZEND_MULTIBYTE */ zend_restore_lexical_state(&original_lex_state TSRMLS_CC); zval_dtor(str); return SUCCESS; } END_EXTERN_C() +#ifdef ZEND_MULTIBYTE +BEGIN_EXTERN_C() +ZEND_API void zend_multibyte_yyinput_again(zend_encoding_filter old_input_filter, zend_encoding *old_encoding TSRMLS_DC) +{ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER; + int offset, original_offset, length, free_flag; + char *p; + zend_encoding *new_encoding; + + /* calculate current position */ + offset = original_offset = yy_c_buf_p - b->yy_ch_buf; + if (old_input_filter && original_offset > 0) { + new_encoding = SCNG(script_encoding); + SCNG(script_encoding) = old_encoding; + do { + (old_input_filter)(&p, &length, SCNG(script_org), offset TSRMLS_CC); + if (!p) { + SCNG(script_encoding) = new_encoding; + return; + } + efree(p); + if (length > original_offset) { + offset--; + } else if (length < original_offset) { + offset++; + } + } while (original_offset != length); + SCNG(script_encoding) = new_encoding; + } + + /* convert and set */ + if (!SCNG(input_filter)) { + length = SCNG(script_org_size)-offset-1; + p = SCNG(script_org)+offset+1; + free_flag = 0; + } else { + SCNG(input_filter)(&p, &length, SCNG(script_org)+offset+1, SCNG(script_org_size)-offset-1 TSRMLS_CC); + free_flag = 1; + } + if (original_offset+length+1 > (int)b->yy_buf_size) { + b->yy_buf_size = original_offset+length+1; + b->yy_ch_buf = (char*)erealloc(b->yy_ch_buf, b->yy_buf_size+2); + SCNG(script_filtered) = b->yy_ch_buf; + SCNG(script_filtered_size) = b->yy_buf_size; + } + yy_c_buf_p = b->yy_ch_buf + original_offset; + strncpy(yy_c_buf_p+1, p, length); + b->yy_n_chars = original_offset + length + 1; + SCNG(yy_n_chars) = b->yy_n_chars; + b->yy_ch_buf[SCNG(yy_n_chars)] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[SCNG(yy_n_chars)+1] = YY_END_OF_BUFFER_CHAR; + + if (free_flag) { + efree(p); + } +} + + +ZEND_API int zend_multibyte_yyinput(zend_file_handle *file_handle, char *buf, size_t len TSRMLS_DC) +{ + int c = '*', n; + + if (file_handle->handle.stream.interactive == 0) { + return zend_stream_read(file_handle, buf, len TSRMLS_CC); + } + + /* interactive */ + if (SCNG(script_org)) { + efree(SCNG(script_org)); + } + if (SCNG(script_filtered)) { + efree(SCNG(script_filtered)); + } + SCNG(script_org) = NULL; + SCNG(script_org_size) = 0; + + /* TODO: support widechars */ + + for (n = 0; n < sizeof(buf) && (c = zend_stream_getc(yyin TSRMLS_CC)) != EOF && c != '\n'; ++n) { + buf[n] = (char)c; + } + if (c == '\n') { + buf[n++] = (char) c; + } + + SCNG(script_org_size) = n; + SCNG(script_org) = (char*)emalloc(SCNG(script_org_size)+1); + memcpy(SCNG(script_org)+SCNG(script_org_size)-n, buf, n); + + return n; +} + + +ZEND_API int zend_multibyte_read_script(TSRMLS_D) +{ + char buf[8192]; + int n; + + if (SCNG(script_org)) { + efree(SCNG(script_org)); + } + SCNG(script_org) = NULL; + SCNG(script_org_size) = 0; + + for (;;) { + n = zend_stream_read(yyin, buf, sizeof(buf) TSRMLS_CC); + if (n <= 0) { + break; + } + + SCNG(script_org_size) += n; + if (SCNG(script_org)) { + SCNG(script_org) = (char*)erealloc(SCNG(script_org), SCNG(script_org_size)+1); + } else { + SCNG(script_org) = (char*)emalloc(SCNG(script_org_size)+1); + } + memcpy(SCNG(script_org)+SCNG(script_org_size)-n, buf, n); + } + + if (n < 0) { + return -1; + } + + if (!SCNG(script_org)) { + SCNG(script_org) = emalloc(SCNG(script_org_size)+1); + } + *(SCNG(script_org)+SCNG(script_org_size)) = (char)NULL; + + return 0; +} + + +# define zend_copy_value(zendlval, yytext, yyleng) \ + if (SCNG(output_filter)) { \ + SCNG(output_filter)(&(zendlval->value.str.val), &(zendlval->value.str.len), yytext, yyleng TSRMLS_CC); \ + } else { \ + zendlval->value.str.val = (char *) estrndup(yytext, yyleng); \ + zendlval->value.str.len = yyleng; \ + } +#else /* ZEND_MULTIBYTE */ +# define zend_copy_value(zendlval, yytext, yyleng) \ + zendlval->value.str.val = (char *)estrndup(yytext, yyleng); \ + zendlval->value.str.len = yyleng; +#endif /* ZEND_MULTIBYTE */ %} LNUM [0-9]+ @@ -631,8 +911,7 @@ NEWLINE ("\r"|"\n"|"\r\n") <ST_LOOKING_FOR_PROPERTY>{LABEL} { yy_pop_state(TSRMLS_C); - zendlval->value.str.val = (char *)estrndup(yytext, yyleng); - zendlval->value.str.len = yyleng; + zend_copy_value(zendlval, yytext, yyleng); zendlval->type = IS_STRING; return T_STRING; } @@ -889,8 +1168,7 @@ NEWLINE ("\r"|"\n"|"\r\n") <ST_LOOKING_FOR_VARNAME>{LABEL} { - zendlval->value.str.val = (char *) estrndup(yytext, yyleng); - zendlval->value.str.len = yyleng; + zend_copy_value(zendlval, yytext, yyleng); zendlval->type = IS_STRING; yy_pop_state(TSRMLS_C); yy_push_state(ST_IN_SCRIPTING TSRMLS_CC); @@ -1025,8 +1303,21 @@ NEWLINE ("\r"|"\n"|"\r\n") } <INITIAL>(([^<]|"<"[^?%s<]){1,400})|"<s"|"<" { +#ifdef ZEND_MULTIBYTE + if (SCNG(output_filter)) { + int readsize; + readsize = SCNG(output_filter)(&(zendlval->value.str.val), &(zendlval->value.str.len), yytext, yyleng TSRMLS_CC); + if (readsize < yyleng) { + yyless(readsize); + } + } else { + zendlval->value.str.val = (char *) estrndup(yytext, yyleng); + zendlval->value.str.len = yyleng; + } +#else /* !ZEND_MULTIBYTE */ zendlval->value.str.val = (char *) estrndup(yytext, yyleng); zendlval->value.str.len = yyleng; +#endif /* ZEND_MULTIBYTE */ zendlval->type = IS_STRING; HANDLE_NEWLINES(yytext, yyleng); return T_INLINE_HTML; @@ -1101,22 +1392,19 @@ NEWLINE ("\r"|"\n"|"\r\n") } <ST_IN_SCRIPTING,ST_DOUBLE_QUOTES,ST_HEREDOC,ST_BACKQUOTE>"$"{LABEL} { - zendlval->value.str.val = (char *)estrndup(yytext+1, yyleng-1); - zendlval->value.str.len = yyleng-1; + zend_copy_value(zendlval, (yytext+1), (yyleng-1)); zendlval->type = IS_STRING; return T_VARIABLE; } <ST_IN_SCRIPTING>{LABEL} { - zendlval->value.str.val = (char *)estrndup(yytext, yyleng); - zendlval->value.str.len = yyleng; + zend_copy_value(zendlval, yytext, yyleng); zendlval->type = IS_STRING; return T_STRING; } <ST_DOUBLE_QUOTES,ST_BACKQUOTE,ST_HEREDOC>{LABEL} { - zendlval->value.str.val = (char *)estrndup(yytext, yyleng); - zendlval->value.str.len = yyleng; + zend_copy_value(zendlval, yytext, yyleng); zendlval->type = IS_STRING; return T_STRING; } @@ -1302,6 +1590,14 @@ NEWLINE ("\r"|"\n"|"\r\n") } *t = 0; +#ifdef ZEND_MULTIBYTE + if (SCNG(output_filter)) { + s = zendlval->value.str.val; + SCNG(output_filter)(&(zendlval->value.str.val), &(zendlval->value.str.len), s, zendlval->value.str.len TSRMLS_CC); + efree(s); + } +#endif /* ZEND_MULTIBYTE */ + return T_CONSTANT_ENCAPSED_STRING; } @@ -1342,6 +1638,14 @@ NEWLINE ("\r"|"\n"|"\r\n") } *t = 0; +#ifdef ZEND_MULTIBYTE + if (SCNG(output_filter)) { + s = zendlval->value.str.val; + SCNG(output_filter)(&(zendlval->value.str.val), &(zendlval->value.str.len), s, zendlval->value.str.len TSRMLS_CC); + efree(s); + } +#endif /* ZEND_MULTIBYTE */ + return T_CONSTANT_ENCAPSED_STRING; } @@ -1409,8 +1713,7 @@ NEWLINE ("\r"|"\n"|"\r\n") BEGIN(ST_IN_SCRIPTING); return T_END_HEREDOC; } else { - zendlval->value.str.val = (char *)estrndup(yytext, yyleng); - zendlval->value.str.len = yyleng; + zend_copy_value(zendlval, yytext, yyleng); zendlval->type = IS_STRING; return T_STRING; } @@ -1427,24 +1730,21 @@ NEWLINE ("\r"|"\n"|"\r\n") <ST_SINGLE_QUOTE>([^'\\]|\\[^'\\])+ { HANDLE_NEWLINES(yytext, yyleng); - zendlval->value.str.val = (char *) estrndup(yytext, yyleng); - zendlval->value.str.len = yyleng; + zend_copy_value(zendlval, yytext, yyleng); zendlval->type = IS_STRING; return T_ENCAPSED_AND_WHITESPACE; } <ST_DOUBLE_QUOTES>[`]+ { - zendlval->value.str.val = (char *) estrndup(yytext, yyleng); - zendlval->value.str.len = yyleng; + zend_copy_value(zendlval, yytext, yyleng); zendlval->type = IS_STRING; return T_ENCAPSED_AND_WHITESPACE; } <ST_BACKQUOTE>["]+ { - zendlval->value.str.val = (char *) estrndup(yytext, yyleng); - zendlval->value.str.len = yyleng; + zend_copy_value(zendlval, yytext, yyleng); zendlval->type = IS_STRING; return T_ENCAPSED_AND_WHITESPACE; } diff --git a/Zend/zend_multibyte.c b/Zend/zend_multibyte.c index e69de29bb2..e567174370 100644 --- a/Zend/zend_multibyte.c +++ b/Zend/zend_multibyte.c @@ -0,0 +1,1133 @@ +/* + +----------------------------------------------------------------------+ + | Zend Engine | + +----------------------------------------------------------------------+ + | Copyright (c) 1998-2003 Zend Technologies Ltd. (http://www.zend.com) | + +----------------------------------------------------------------------+ + | This source file is subject to version 2.00 of the Zend license, | + | that is bundled with this package in the file LICENSE, and is | + | available at through the world-wide-web at | + | http://www.zend.com/license/2_00.txt. | + | If you did not receive a copy of the Zend license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@zend.com so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Authors: Masaki Fujimoto <fujimoto@php.net> | + | Rui Hirokawa <hirokawa@php.net> | + +----------------------------------------------------------------------+ +*/ + +/* $Id$ */ + +#include "zend.h" +#include "zend_compile.h" +#include "zend_operators.h" +#include "zend_multibyte.h" + +#ifdef ZEND_MULTIBYTE +static int zend_multibyte_encoding_filter(char **to, int *to_length, const char *to_encoding, const char *from, int from_length, const char *from_encoding TSRMLS_DC); +int sjis_input_filter(char **buf, int *length, const char *sjis, int sjis_length TSRMLS_DC); +int sjis_output_filter(char **buf, int *length, const char *sjis, int sjis_length TSRMLS_DC); +static char* zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, int encoding_list_size); +static int zend_multibyte_parse_encoding_list(const char *encoding_list, int encoding_list_size, zend_encoding ***result, int *result_size); +static zend_encoding* zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC); +static zend_encoding* zend_multibyte_detect_unicode(TSRMLS_D); +static zend_encoding* zend_multibyte_detect_utf_encoding(char *script, int script_size TSRMLS_DC); + +/* + * encodings + */ +const char *ucs2_aliases[] = {"ISO-10646-UCS-2", "UCS2" , "UNICODE", NULL}; +zend_encoding encoding_ucs2 = { + NULL, + NULL, + "UCS-2", + (const char *(*)[])&ucs2_aliases, + 0 +}; + +zend_encoding encoding_ucs2be = { + NULL, + NULL, + "UCS-2BE", + NULL, + 0 +}; + +zend_encoding encoding_ucs2le = { + NULL, + NULL, + "UCS-2LE", + NULL, + 0 +}; + +const char *ucs4_aliases[] = {"ISO-10646-UCS-4", "UCS4", NULL}; +zend_encoding encoding_ucs4 = { + NULL, + NULL, + "UCS-4", + (const char *(*)[])&ucs4_aliases, + 0 +}; + +zend_encoding encoding_ucs4be = { + NULL, + NULL, + "UCS-4BE", + NULL, + 0 +}; + +zend_encoding encoding_ucs4le = { + NULL, + NULL, + "UCS-4LE", + NULL, + 0 +}; + +const char *utf32_aliases[] = {"utf32", NULL}; +zend_encoding encoding_utf32 = { + NULL, + NULL, + "UTF-32", + (const char *(*)[])&utf32_aliases, + 0 +}; + +zend_encoding encoding_utf32be = { + NULL, + NULL, + "UTF-32BE", + NULL, + 0 +}; + +zend_encoding encoding_utf32le = { + NULL, + NULL, + "UTF-32LE", + NULL, + 0 +}; + +const char *utf16_aliases[] = {"utf16", NULL}; +zend_encoding encoding_utf16 = { + NULL, + NULL, + "UTF-16", + (const char *(*)[])&utf16_aliases, + 0 +}; + +zend_encoding encoding_utf16be = { + NULL, + NULL, + "UTF-16BE", + NULL, + 0 +}; + +zend_encoding encoding_utf16le = { + NULL, + NULL, + "UTF-16LE", + NULL, + 0 +}; + +const char *utf8_aliases[] = {"utf8", NULL}; +zend_encoding encoding_utf8 = { + NULL, + NULL, + "UTF-8", + (const char *(*)[])&utf8_aliases, + 1 +}; + +const char *ascii_aliases[] = {"ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991", "US-ASCII", "ISO646-US", "us", "IBM367", "cp367", "csASCII", NULL}; +zend_encoding encoding_ascii = { + NULL, + NULL, + "ASCII", + (const char *(*)[])&ascii_aliases, + 1 +}; + +const char *euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL}; +zend_encoding encoding_euc_jp = { + NULL, + NULL, + "EUC-JP", + (const char *(*)[])&euc_jp_aliases, + 1 +}; + +const char *sjis_aliases[] = {"x-sjis", "SJIS", "SHIFT-JIS", NULL}; +zend_encoding encoding_sjis = { + sjis_input_filter, + sjis_output_filter, + "Shift_JIS", + (const char *(*)[])&sjis_aliases, + 0 +}; + +const char *eucjp_win_aliases[] = {"eucJP-open", NULL}; +zend_encoding encoding_eucjp_win = { + NULL, + NULL, + "eucJP-win", + (const char *(*)[])&eucjp_win_aliases, + 1 +}; + +const char *sjis_win_aliases[] = {"SJIS-open", "MS_Kanji", "Windows-31J", "CP932", NULL}; +zend_encoding encoding_sjis_win = { + /* sjis-filters does not care about diffs of Shift_JIS and CP932 */ + sjis_input_filter, + sjis_output_filter, + "SJIS-win", + (const char *(*)[])&sjis_win_aliases, + 0 +}; + +const char *jis_aliases[] = {"ISO-2022-JP", NULL}; +zend_encoding encoding_jis = { + NULL, + NULL, + "JIS", + (const char *(*)[])&jis_aliases, + 0 +}; + +const char *euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", "gb2312", NULL}; +zend_encoding encoding_euc_cn = { + NULL, + NULL, + "EUC-CN", + (const char *(*)[])&euc_cn_aliases, + 1 +}; + +const char *cp936_aliases[] = {"CP-936", NULL}; +zend_encoding encoding_cp936 = { + NULL, + NULL, + "CP936", + (const char *(*)[])&cp936_aliases, + 0 +}; + +const char *hz_aliases[] = {"HZ-GB-2312", NULL}; +zend_encoding encoding_hz = { + NULL, + NULL, + "HZ", + (const char *(*)[])&hz_aliases, + 0 +}; + +const char *euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL}; +zend_encoding encoding_euc_tw = { + NULL, + NULL, + "EUC-TW", + (const char *(*)[])&euc_tw_aliases, + 1 +}; + +const char *big5_aliases[] = {"BIG5", "CN-BIG5", "BIG-FIVE", "BIGFIVE", "CP950", NULL}; +zend_encoding encoding_big5 = { + NULL, + NULL, + "BIG-5", + (const char *(*)[])&big5_aliases, + 0 +}; + +const char *euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL}; +zend_encoding encoding_euc_kr = { + NULL, + NULL, + "EUC-KR", + (const char *(*)[])&euc_kr_aliases, + 1 +}; + +const char *uhc_aliases[] = {"CP949", NULL}; +zend_encoding encoding_uhc = { + NULL, + NULL, + "UHC", + (const char *(*)[])&uhc_aliases, + 1 +}; + +zend_encoding encoding_2022kr = { + NULL, + NULL, + "ISO-2022-KR", + NULL, + 0 +}; + +const char *cp1252_aliases[] = {"cp1252", NULL}; +zend_encoding encoding_cp1252 = { + NULL, + NULL, + "Windows-1252", + (const char *(*)[])&cp1252_aliases, + 1 +}; + +const char *iso_8859_1_aliases[] = {"ISO_8859-1", "latin1", NULL}; +zend_encoding encoding_8859_1 = { + NULL, + NULL, + "ISO-8859-1", + (const char *(*)[])&iso_8859_1_aliases, + 1 +}; + +const char *iso_8859_2_aliases[] = {"ISO_8859-2", "latin2", NULL}; +zend_encoding encoding_8859_2 = { + NULL, + NULL, + "ISO-8859-2", + (const char *(*)[])&iso_8859_2_aliases, + 1 +}; + +const char *iso_8859_3_aliases[] = {"ISO_8859-3", "latin3", NULL}; +zend_encoding encoding_8859_3 = { + NULL, + NULL, + "ISO-8859-3", + (const char *(*)[])&iso_8859_3_aliases, + 1 +}; + +const char *iso_8859_4_aliases[] = {"ISO_8859-4", "latin4", NULL}; +zend_encoding encoding_8859_4 = { + NULL, + NULL, + "ISO-8859-4", + (const char *(*)[])&iso_8859_4_aliases, + 1 +}; + +const char *iso_8859_5_aliases[] = {"ISO_8859-5", "cyrillic", NULL}; +zend_encoding encoding_8859_5 = { + NULL, + NULL, + "ISO-8859-5", + (const char *(*)[])&iso_8859_5_aliases, + 1 +}; + +const char *iso_8859_6_aliases[] = {"ISO_8859-6", "arabic", NULL}; +zend_encoding encoding_8859_6 = { + NULL, + NULL, + "ISO-8859-6", + (const char *(*)[])&iso_8859_6_aliases, + 1 +}; + +const char *iso_8859_7_aliases[] = {"ISO_8859-7", "greek", NULL}; +zend_encoding encoding_8859_7 = { + NULL, + NULL, + "ISO-8859-7", + (const char *(*)[])&iso_8859_7_aliases, + 1 +}; + +const char *iso_8859_8_aliases[] = {"ISO_8859-8", "hebrew", NULL}; +zend_encoding encoding_8859_8 = { + NULL, + NULL, + "ISO-8859-8", + (const char *(*)[])&iso_8859_8_aliases, + 1 +}; + +const char *iso_8859_9_aliases[] = {"ISO_8859-9", "latin5", NULL}; +zend_encoding encoding_8859_9 = { + NULL, + NULL, + "ISO-8859-9", + (const char *(*)[])&iso_8859_9_aliases, + 1 +}; + +const char *iso_8859_10_aliases[] = {"ISO_8859-10", "latin6", NULL}; +zend_encoding encoding_8859_10 = { + NULL, + NULL, + "ISO-8859-10", + (const char *(*)[])&iso_8859_10_aliases, + 1 +}; + +const char *iso_8859_13_aliases[] = {"ISO_8859-13", NULL}; +zend_encoding encoding_8859_13 = { + NULL, + NULL, + "ISO-8859-13", + (const char *(*)[])&iso_8859_13_aliases, + 1 +}; + +const char *iso_8859_14_aliases[] = {"ISO_8859-14", "latin8", NULL}; +zend_encoding encoding_8859_14 = { + NULL, + NULL, + "ISO-8859-14", + (const char *(*)[])&iso_8859_14_aliases, + 1 +}; + +const char *iso_8859_15_aliases[] = {"ISO_8859-15", NULL}; +zend_encoding encoding_8859_15 = { + NULL, + NULL, + "ISO-8859-15", + (const char *(*)[])&iso_8859_15_aliases, + 1 +}; + +const char *cp1251_aliases[] = {"CP1251", "CP-1251", "WINDOWS-1251", NULL}; +zend_encoding encoding_cp1251 = { + NULL, + NULL, + "Windows-1251", + (const char *(*)[])&cp1251_aliases, + 1 +}; + +const char *cp866_aliases[] = {"CP866", "CP-866", "IBM-866", NULL}; +zend_encoding encoding_cp866 = { + NULL, + NULL, + "CP866", + (const char *(*)[])&cp866_aliases, + 1 +}; + +const char *koi8r_aliases[] = {"KOI8-R", "KOI8R", NULL}; +zend_encoding encoding_koi8r = { + NULL, + NULL, + "KOI8-R", + (const char *(*)[])&koi8r_aliases, + 1 +}; + +zend_encoding *zend_encoding_table[] = { + &encoding_ucs4, + &encoding_ucs4be, + &encoding_ucs4le, + &encoding_ucs2, + &encoding_ucs2be, + &encoding_ucs2le, + &encoding_utf32, + &encoding_utf32be, + &encoding_utf32le, + &encoding_utf16, + &encoding_utf16be, + &encoding_utf16le, + &encoding_utf8, + &encoding_ascii, + &encoding_euc_jp, + &encoding_sjis, + &encoding_eucjp_win, + &encoding_sjis_win, + &encoding_jis, + &encoding_cp1252, + &encoding_8859_1, + &encoding_8859_2, + &encoding_8859_3, + &encoding_8859_4, + &encoding_8859_5, + &encoding_8859_6, + &encoding_8859_7, + &encoding_8859_8, + &encoding_8859_9, + &encoding_8859_10, + &encoding_8859_13, + &encoding_8859_14, + &encoding_8859_15, + &encoding_euc_cn, + &encoding_cp936, + &encoding_hz, + &encoding_euc_tw, + &encoding_big5, + &encoding_euc_kr, + &encoding_uhc, + &encoding_2022kr, + &encoding_cp1251, + &encoding_cp866, + &encoding_koi8r, + NULL +}; + + + +ZEND_API int zend_multibyte_set_script_encoding(char *encoding_list, int encoding_list_size TSRMLS_DC) +{ + if (CG(script_encoding_list)) { + efree(CG(script_encoding_list)); + CG(script_encoding_list) = NULL; + } + CG(script_encoding_list_size) = 0; + + if (!encoding_list) { + return 0; + } + + zend_multibyte_parse_encoding_list(encoding_list, encoding_list_size, &(CG(script_encoding_list)), &(CG(script_encoding_list_size))); + + return 0; +} + + +ZEND_API int zend_multibyte_set_internal_encoding(char *encoding_name, int encoding_name_size TSRMLS_DC) +{ + CG(internal_encoding) = zend_multibyte_fetch_encoding(encoding_name); + return 0; +} + +ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen TSRMLS_DC) +{ + CG(encoding_detector) = encoding_detector; + CG(encoding_converter) = encoding_converter; + CG(encoding_oddlen) = encoding_oddlen; + return 0; +} + + +ZEND_API int zend_multibyte_set_filter(zend_encoding *onetime_encoding TSRMLS_DC) +{ + LANG_SCNG(script_encoding) = zend_multibyte_find_script_encoding(onetime_encoding TSRMLS_CC); + LANG_SCNG(internal_encoding) = CG(internal_encoding); + + /* judge input/output filter */ + LANG_SCNG(input_filter) = NULL; + LANG_SCNG(output_filter) = NULL; + + if (!LANG_SCNG(script_encoding)) { + return 0; + } + + if (!LANG_SCNG(internal_encoding) || LANG_SCNG(script_encoding) == LANG_SCNG(internal_encoding)) { + /* if encoding specfic filters exist, use them */ + if (LANG_SCNG(script_encoding)->input_filter && LANG_SCNG(script_encoding)->output_filter) { + LANG_SCNG(input_filter) = LANG_SCNG(script_encoding)->input_filter; + LANG_SCNG(output_filter) = LANG_SCNG(script_encoding)->output_filter; + return 0; + } + + if (!LANG_SCNG(script_encoding)->compatible) { + /* and if not, work around w/ script_encoding -> utf-8 -> script_encoding conversion */ + LANG_SCNG(internal_encoding) = LANG_SCNG(script_encoding); + LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter; + LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter; + return 0; + } else { + /* nothing to do in this case */ + return 0; + } + } + + /* LANG_SCNG(internal_encoding) cannot be NULL here */ + if (LANG_SCNG(internal_encoding)->compatible) { + LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter; + return 0; + } else if (LANG_SCNG(script_encoding)->compatible) { + LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter; + return 0; + } + + /* both script and internal encodings are incompatible w/ flex */ + LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter; + LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter; + + return 0; +} + + +ZEND_API zend_encoding* zend_multibyte_fetch_encoding(char *encoding_name) +{ + int i, j; + zend_encoding *encoding; + + if (!encoding_name) { + return NULL; + } + + for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) { + if (zend_binary_strcasecmp((char*)encoding->name, strlen(encoding->name), encoding_name, strlen(encoding_name)) == 0) { + return encoding; + } + } + + for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) { + if (encoding->aliases != NULL) { + for (j = 0; (*encoding->aliases)[j] != NULL; j++) { + if (zend_binary_strcasecmp((char*)(*encoding->aliases)[j], strlen((*encoding->aliases)[j]), encoding_name, strlen(encoding_name)) == 0) { + return encoding; + } + } + } + } + + return NULL; +} + + +ZEND_API int zend_multibyte_script_encoding_filter(char **to, int *to_length, const char *from, int from_length TSRMLS_DC) +{ + const char *name; + + if (LANG_SCNG(internal_encoding) == NULL || LANG_SCNG(internal_encoding)->compatible == 0) { + name = "UTF-8"; + } else { + name = LANG_SCNG(internal_encoding)->name; + } + + return zend_multibyte_encoding_filter(to, to_length, name, from, from_length, LANG_SCNG(script_encoding)->name TSRMLS_CC); +} + +ZEND_API int zend_multibyte_internal_encoding_filter(char **to, int *to_length, const char *from, int from_length TSRMLS_DC) +{ + const char *name; + + if (LANG_SCNG(script_encoding)->compatible == 0) { + name = "UTF-8"; + } else { + name = LANG_SCNG(script_encoding)->name; + } + + return zend_multibyte_encoding_filter(to, to_length, LANG_SCNG(internal_encoding)->name, from, from_length, name TSRMLS_CC); +} + +static int zend_multibyte_encoding_filter(char **to, int *to_length, const char *to_encoding, const char *from, int from_length, const char *from_encoding TSRMLS_DC) +{ + int oddlen; + + if (!CG(encoding_converter)) { + return 0; + } + + if (CG(encoding_oddlen)) { + oddlen = CG(encoding_oddlen)(from, from_length, from_encoding TSRMLS_CC); + if (oddlen > 0) { + from_length -= oddlen; + } + } + + if (CG(encoding_converter)(to, to_length, from, from_length, to_encoding, from_encoding TSRMLS_CC) != 0) { + return 0; + } + + return from_length; +} + + +/* + * Shift_JIS Input/Output Filter + */ +static const unsigned char table_sjis[] = { /* 0x80-0x9f,0xE0-0xEF */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 0, 0, 0 +}; + +int sjis_input_filter(char **buf, int *length, const char *sjis, int sjis_length TSRMLS_DC) +{ + unsigned char *p, *q; + unsigned char c1, c2; + + *buf = (char*)emalloc(sjis_length*3/2+1); + if (!*buf) + return 0; + *length = 0; + + p = (unsigned char*)sjis; + q = (unsigned char*)*buf; + + /* convert [SJIS -> EUC-JP] (for lex scan) -- some other better ways? */ + while (*p && (p-(unsigned char*)sjis) < sjis_length) { + if (!(*p & 0x80)) { + *q++ = *p++; + continue; + } + + /* handling 8 bit code */ + if (table_sjis[*p] == 1) { + /* 1 byte kana */ + *q++ = 0x8e; + *q++ = *p++; + continue; + } + + if (!*(p+1)) { + *q++ = *p++; + break; + } + + if (table_sjis[*p] == 2) { + /* 2 byte kanji code */ + c1 = *p++; + if (!*p || (p-(unsigned char*)sjis) >= sjis_length) { + break; + } + c2 = *p++; + c1 -= (c1 <= 0x9f) ? 0x71 : 0xb1; + c1 = (c1 << 1) + 1; + if (c2 >= 0x9e) { + c2 -= 0x7e; + c1++; + } else if (c2 > 0x7f) { + c2 -= 0x20; + } else { + c2 -= 0x1f; + } + + c1 |= 0x80; + c2 |= 0x80; + + *q++ = c1; + *q++ = c2; + } else { + /* + * for user defined chars (ATTENTION) + * + * THESE ARE NOT CODE FOR CONVERSION! :-P + * (using *ILLEGALLY* 3byte EUC-JP space) + * + * we cannot perfectly (== 1 to 1) convert these chars to EUC-JP. + * so, these code are for perfect RESTORING in sjis_output_filter() + */ + c1 = *p++; + if (!*p || (p-(unsigned char*)sjis) >= sjis_length) { + break; + } + c2 = *p++; + *q++ = (char)0x8f; + /* + * MAP TO (EUC-JP): + * type A: 0xeba1 - 0xf4fe + * type B: 0xf5a1 - 0xfefe + * type C: 0xa1a1 - 0xa6fe + */ + c1 -= (c1 > 0xf9) ? (0x79+0x71) : (0x0a+0xb1); + c1 = (c1 << 1) + 1; + if (c2 >= 0x9e) { + c2 -= 0x7e; + c1++; + } else if (c2 > 0x7f) { + c2 -= 0x20; + } else { + c2 -= 0x1f; + } + + c1 |= 0x80; + c2 |= 0x80; + + *q++ = c1; + *q++ = c2; + } + } + *q = (char)NULL; + *length = (char*)q - *buf; + + return *length; +} + +static const unsigned char table_eucjp[] = { /* 0xA1-0xFE */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 +}; + +int sjis_output_filter(char **sjis, int *sjis_length, const char *buf, int length TSRMLS_DC) +{ + unsigned char c1, c2; + char *p; + const char *q; + + if (!sjis || !sjis_length) { + return 0; + } + + /* always Shift_JIS <= EUC-JP */ + *sjis = (char*)emalloc(length+1); + if (!sjis) { + return 0; + } + p = *sjis; + q = buf; + + /* restore converted strings [EUC-JP -> Shift_JIS] */ + while (*q) { + if (!(*q & 0x80)) { + *p++ = *q++; + continue; + } + + /* hankaku kana */ + if (*q == (char)0x8e) { + q++; + if (*q) { + *p++ = *q++; + } + continue; + } + + /* 2 byte kanji code */ + if (table_eucjp[(unsigned char)*q] == 2) { + c1 = (*q++ & ~0x80) & 0xff; + if (*q) { + c2 = (*q++ & ~0x80) & 0xff; + } else { + q--; + break; + } + + c2 += (c1 & 0x01) ? 0x1f : 0x7d; + if (c2 >= 0x7f) { + c2++; + } + c1 = ((c1 - 0x21) >> 1) + 0x81; + if (c1 > 0x9f) { + c1 += 0x40; + } + + *p++ = c1; + *p++ = c2; + continue; + } + + if (*q == (char)0x8f) { + q++; + if (*q) { + c1 = (*q++ & ~0x80) & 0xff; + } else { + q--; + break; + } + if (*q) { + c2 = (*q++ & ~0x80) & 0xff; + } else { + q -= 2; + break; + } + + c2 += (c1 & 0x01) ? 0x1f : 0x7d; + if (c2 >= 0x7f) { + c2++; + } + c1 = ((c1 - 0x21) >> 1) + 0x81; + if (c1 > 0x9f) { + c1 += 0x40; + } + + if (c1 >= 0x81 && c1 <= 0x9f) { + c1 += 0x79; + } else { + c1 += 0x0a; + } + + *p++ = c1; + *p++ = c2; + continue; + } + + /* some other chars (may not happen) */ + *p++ = *q++; + } + *p = '\0'; + *sjis_length = p - *sjis; + + return q-buf; /* return length we actually read */ +} + + +static char* zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, int encoding_list_size) +{ + int i, list_size = 0; + const char *name; + char *list = NULL; + + if (!encoding_list || !encoding_list_size) { + return NULL; + } + + for (i = 0; i < encoding_list_size; i++) { + name = (*(encoding_list+i))->name; + if (name) { + list_size += strlen(name) + 1; + if (!list) { + list = (char*)emalloc(list_size); + if (!list) { + return NULL; + } + *list = (char)NULL; + } else { + list = (char*)erealloc(list, list_size); + if (!list) { + return NULL; + } + strcat(list, ","); + } + strcat(list, name); + } + } + return list; +} + + +static int zend_multibyte_parse_encoding_list(const char *encoding_list, int encoding_list_size, zend_encoding ***result, int *result_size) +{ + int n, size; + char *p, *p1, *p2, *endp, *tmpstr; + zend_encoding **list, **entry, *encoding; + + list = NULL; + if (encoding_list == NULL || encoding_list_size <= 0) { + return -1; + } else { + /* copy the encoding_list string for work */ + tmpstr = (char *)estrndup(encoding_list, encoding_list_size); + if (tmpstr == NULL) { + return -1; + } + /* count the number of listed encoding names */ + endp = tmpstr + encoding_list_size; + n = 1; + p1 = tmpstr; + while ((p2 = zend_memnstr(p1, ",", 1, endp)) != NULL) { + p1 = p2 + 1; + n++; + } + size = n; + /* make list */ + list = (zend_encoding**)ecalloc(size, sizeof(zend_encoding*)); + if (list != NULL) { + entry = list; + n = 0; + p1 = tmpstr; + do { + p2 = p = zend_memnstr(p1, ",", 1, endp); + if (p == NULL) { + p = endp; + } + *p = '\0'; + /* trim spaces */ + while (p1 < p && (*p1 == ' ' || *p1 == '\t')) { + p1++; + } + p--; + while (p > p1 && (*p == ' ' || *p == '\t')) { + *p = '\0'; + p--; + } + /* convert to the encoding number and check encoding */ + encoding = zend_multibyte_fetch_encoding(p1); + if (encoding) + { + *entry++ = encoding; + n++; + } + p1 = p2 + 1; + } while (n < size && p2 != NULL); + *result = list; + *result_size = n; + } + efree(tmpstr); + } + + if (list == NULL) { + return -1; + } + + return 0; +} + + +static zend_encoding* zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC) +{ + zend_encoding *script_encoding; + char *name, *list; + + /* onetime_encoding is prior to everything */ + if (onetime_encoding != NULL) { + return onetime_encoding; + } + + /* check out bom(byte order mark) and see if containing wchars */ + script_encoding = zend_multibyte_detect_unicode(TSRMLS_C); + if (script_encoding != NULL) { + /* bom or wchar detection is prior to 'script_encoding' option */ + return script_encoding; + } + + /* if no script_encoding specified, just leave alone */ + if (!CG(script_encoding_list) || !CG(script_encoding_list_size)) { + return NULL; + } + + /* if multiple encodings specified, detect automagically */ + if (CG(script_encoding_list_size) > 1 && CG(encoding_detector)) { + list = zend_multibyte_assemble_encoding_list(CG(script_encoding_list), + CG(script_encoding_list_size)); + name = CG(encoding_detector)(LANG_SCNG(script_org), + LANG_SCNG(script_org_size), list TSRMLS_CC); + if (list) { + efree(list); + } + if (name) { + script_encoding = zend_multibyte_fetch_encoding(name); + efree(name); + } else { + script_encoding = NULL; + } + return script_encoding; + } + + return *(CG(script_encoding_list)); +} + + +static zend_encoding* zend_multibyte_detect_unicode(TSRMLS_D) +{ + zend_encoding *script_encoding = NULL; + int bom_size; + char *script; + + if (LANG_SCNG(script_org_size) < sizeof(BOM_UTF32_LE)-1) { + return NULL; + } + + /* check out BOM */ + if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_BE, sizeof(BOM_UTF32_BE)-1)) { + script_encoding = &encoding_utf32be; + bom_size = sizeof(BOM_UTF32_BE)-1; + } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_LE, sizeof(BOM_UTF32_LE)-1)) { + script_encoding = &encoding_utf32le; + bom_size = sizeof(BOM_UTF32_LE)-1; + } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_BE, sizeof(BOM_UTF16_BE)-1)) { + script_encoding = &encoding_utf16be; + bom_size = sizeof(BOM_UTF16_BE)-1; + } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_LE, sizeof(BOM_UTF16_LE)-1)) { + script_encoding = &encoding_utf16le; + bom_size = sizeof(BOM_UTF16_LE)-1; + } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF8, sizeof(BOM_UTF8)-1)) { + script_encoding = &encoding_utf8; + bom_size = sizeof(BOM_UTF8)-1; + } + + if (script_encoding) { + /* remove BOM */ + script = (char*)emalloc(LANG_SCNG(script_org_size)+1-bom_size); + memcpy(script, LANG_SCNG(script_org)+bom_size, LANG_SCNG(script_org_size)+1-bom_size); + efree(LANG_SCNG(script_org)); + LANG_SCNG(script_org) = script; + LANG_SCNG(script_org_size) -= bom_size; + + return script_encoding; + } + + /* script contains NULL bytes -> auto-detection */ + if (memchr(LANG_SCNG(script_org), 0, LANG_SCNG(script_org_size))) { + /* make best effort if BOM is missing */ + return zend_multibyte_detect_utf_encoding(LANG_SCNG(script_org), LANG_SCNG(script_org_size) TSRMLS_CC); + } + + return NULL; +} + +static zend_encoding* zend_multibyte_detect_utf_encoding(char *script, int script_size TSRMLS_DC) +{ + char *p; + int wchar_size = 2; + int le = 0; + + /* utf-16 or utf-32? */ + p = script; + while ((p-script) < script_size) { + p = memchr(p, 0, script_size-(p-script)-2); + if (!p) { + break; + } + if (*(p+1) == (char)NULL && *(p+2) == (char)NULL) { + wchar_size = 4; + break; + } + + /* searching for UTF-32 specific byte orders, so this will do */ + p += 4; + } + + /* BE or LE? */ + p = script; + while ((p-script) < script_size) { + if (*p == (char)NULL && *(p+wchar_size-1) != (char)NULL) { + /* BE */ + le = 0; + break; + } else if (*p != (char)NULL && *(p+wchar_size-1) == (char)NULL) { + /* LE* */ + le = 1; + break; + } + p += wchar_size; + } + + if (wchar_size == 2) { + return le ? &encoding_utf16le : &encoding_utf16be; + } else { + return le ? &encoding_utf32le : &encoding_utf32be; + } + + return NULL; +} +#endif /* ZEND_MULTIBYTE */ + +/* + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: sw=4 ts=4 tw=78 + * vim<600: sw=4 ts=4 tw=78 + */ diff --git a/Zend/zend_multibyte.h b/Zend/zend_multibyte.h new file mode 100644 index 0000000000..58e0228dc0 --- /dev/null +++ b/Zend/zend_multibyte.h @@ -0,0 +1,79 @@ +/* + +----------------------------------------------------------------------+ + | Zend Engine | + +----------------------------------------------------------------------+ + | Copyright (c) 1998-2003 Zend Technologies Ltd. (http://www.zend.com) | + +----------------------------------------------------------------------+ + | This source file is subject to version 2.00 of the Zend license, | + | that is bundled with this package in the file LICENSE, and is | + | available at through the world-wide-web at | + | http://www.zend.com/license/2_00.txt. | + | If you did not receive a copy of the Zend license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@zend.com so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Authors: Masaki Fujimoto <fujimoto@php.net> | + | Rui Hirokawa <hirokawa@php.net> | + +----------------------------------------------------------------------+ +*/ + +/* $Id$ */ + +#ifndef ZEND_MULTIBYTE_H +#define ZEND_MULTIBYTE_H + +#ifdef ZEND_MULTIBYTE + +#define BOM_UTF32_BE "\x00\x00\xfe\xff" +#define BOM_UTF32_LE "\xff\xfe\x00\x00" +#define BOM_UTF16_BE "\xfe\xff" +#define BOM_UTF16_LE "\xff\xfe" +#define BOM_UTF8 "\xef\xbb\xbf" + +typedef int (*zend_encoding_filter)(char **str, int *str_length, const char *buf, int length TSRMLS_DC); + +typedef char* (*zend_encoding_detector)(const char *string, int length, char *list TSRMLS_DC); + +typedef int (*zend_encoding_converter)(char **to, int *to_length, const char *from, int from_length, const char *encoding_to, const char *encoding_from TSRMLS_DC); + +typedef int (*zend_encoding_oddlen)(const char *string, int length, const char *encoding TSRMLS_DC); + +typedef struct _zend_encoding { + zend_encoding_filter input_filter; /* escape input filter */ + zend_encoding_filter output_filter; /* escape output filter */ + const char *name; /* encoding name */ + const char *(*aliases)[]; /* encoding name aliases */ + int compatible; /* flex compatible or not */ +} zend_encoding; + + +/* + * zend multibyte APIs + */ +BEGIN_EXTERN_C() +ZEND_API int zend_multibyte_set_script_encoding(char *encoding_list, int encoding_list_size TSRMLS_DC); +ZEND_API int zend_multibyte_set_internal_encoding(char *encoding_name, int encoding_name_size TSRMLS_DC); +ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen TSRMLS_DC); +ZEND_API int zend_multibyte_set_filter(zend_encoding *onetime_encoding TSRMLS_DC); +ZEND_API zend_encoding* zend_multibyte_fetch_encoding(char *encoding_name); +ZEND_API int zend_multibyte_script_encoding_filter(char **to, int *to_length, const char *from, int from_length TSRMLS_DC); +ZEND_API int zend_multibyte_internal_encoding_filter(char **to, int *to_length, const char *from, int from_length TSRMLS_DC); + +/* in zend_language_scanner.l */ +ZEND_API void zend_multibyte_yyinput_again(zend_encoding_filter old_input_filter, zend_encoding *old_encoding TSRMLS_DC); +ZEND_API int zend_multibyte_yyinput(zend_file_handle *file_handle, char *buf, size_t len TSRMLS_DC); +ZEND_API int zend_multibyte_read_script(TSRMLS_D); +END_EXTERN_C() + +#endif /* ZEND_MULTIBYTE */ + +#endif /* ZEND_MULTIBYTE_H */ + +/* + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: sw=4 ts=4 tw=78 + * vim<600: sw=4 ts=4 tw=78 + */ diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index e386afee86..b2e5d1b549 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -848,6 +848,9 @@ PHP_RINIT_FUNCTION(mbstring) #if HAVE_MBREGEX PHP_RINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU); #endif +#ifdef ZEND_MULTIBYTE + php_mb_set_zend_encoding(TSRMLS_C); +#endif /* ZEND_MULTIBYTE */ return SUCCESS; } @@ -982,7 +985,10 @@ PHP_FUNCTION(mb_internal_encoding) } else { MBSTRG(current_internal_encoding) = no_encoding; #ifdef ZEND_MULTIBYTE - zend_multibyte_set_internal_encoding(Z_STRVAL_PP(arg1), Z_STRLEN_PP(arg1) TSRMLS_CC); + /* TODO: make independent from mbstring.encoding_translation? */ + if (MBSTRG(encoding_translation)) { + zend_multibyte_set_internal_encoding(name, name_len TSRMLS_CC); + } #endif /* ZEND_MULTIBYTE */ RETURN_TRUE; } @@ -3366,7 +3372,7 @@ MBSTRING_API int php_mb_set_zend_encoding(TSRMLS_D) int n, *entry, list_size = 0; zend_encoding_detector encoding_detector; zend_encoding_converter encoding_converter; - zend_multibyte_oddlen multibyte_oddlen; + zend_encoding_oddlen encoding_oddlen; /* notify script encoding to Zend Engine */ entry = MBSTRG(script_encoding_list); @@ -3392,19 +3398,17 @@ MBSTRING_API int php_mb_set_zend_encoding(TSRMLS_D) efree(list); } encoding_detector = php_mb_encoding_detector; - encoding_converter = NULL; - multibyte_oddlen = php_mb_oddlen; + encoding_converter = php_mb_encoding_converter; + encoding_oddlen = php_mb_oddlen; + /* TODO: make independent from mbstring.encoding_translation? */ if (MBSTRG(encoding_translation)) { /* notify internal encoding to Zend Engine */ name = (char*)mbfl_no_encoding2name(MBSTRG(current_internal_encoding)); zend_multibyte_set_internal_encoding(name, strlen(name) TSRMLS_CC); - - encoding_converter = php_mb_encoding_converter; } - zend_multibyte_set_functions(encoding_detector, encoding_converter, - multibyte_oddlen TSRMLS_CC); + zend_multibyte_set_functions(encoding_detector, encoding_converter, encoding_oddlen TSRMLS_CC); return 0; } diff --git a/main/main.c b/main/main.c index ba1cef84c7..8131a891d5 100644 --- a/main/main.c +++ b/main/main.c @@ -90,10 +90,6 @@ #include "php_logos.h" #include "php_streams.h" -#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING) -#include "ext/mbstring/mbstring.h" -#endif /* defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING) */ - #include "SAPI.h" #include "rfc1867.h" /* }}} */ @@ -1564,9 +1560,6 @@ PHPAPI int php_execute_script(zend_file_handle *primary_file TSRMLS_DC) } else { append_file_p = NULL; } -#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING) - php_mb_set_zend_encoding(TSRMLS_C); -#endif /* ZEND_MULTIBYTE && HAVE_MBSTRING */ #ifdef PHP_WIN32 zend_unset_timeout(TSRMLS_C); #endif |