summary | refs | log | tree | commit | diff
path: root/ext/standard/html.c
diff options
context:
space:
mode:
author: Gustavo André dos Santos Lopes <cataphract@php.net> 2010-10-12 02:51:11 +0000
committer: Gustavo André dos Santos Lopes <cataphract@php.net> 2010-10-12 02:51:11 +0000
commit: 99b613cbc8f862de0606c226272f7538b63a612a (patch)
tree: c8171db9b950d6ec29029c2946e86347fe445157 /ext/standard/html.c
parent: 40c3aefafbf7553ca8480fa88dad97b2b8b7b552 (diff)
download: php-git-99b613cbc8f862de0606c226272f7538b63a612a.tar.gz
- Added a 3rd parameter to get_html_translation_table. It now takes a charset
  hint, like htmlentities et al.
- Fixed bug #49407 (get_html_translation_table doesn't handle UTF-8).
- Fixed bug #25927 (get_html_translation_table calls the ' &#39; instead of &#039;).
- Fixed tests for get_html_translation_table and unified the Windows and
  non-Windows versions of the tests.
Diffstat (limited to 'ext/standard/html.c')
-rw-r--r-- ext/standard/html.c 90
1 files changed, 61 insertions, 29 deletions
diff --git a/ext/standard/html.c b/ext/standard/html.c
index 3edcaa6dac..9cc17836a1 100644
--- a/ext/standard/html.c
+++ b/ext/standard/html.c
@@ -867,7 +867,7 @@ det_charset:
/* }}} */
/* {{{ php_utf32_utf8 */
-size_t php_utf32_utf8(unsigned char *buf, int k)
+size_t php_utf32_utf8(unsigned char *buf, unsigned k)
{
size_t retval = 0;
@@ -1408,54 +1408,86 @@ PHP_FUNCTION(htmlentities)
}
/* }}} */
-/* {{{ proto array get_html_translation_table([int table [, int quote_style]])
+/* {{{ proto array get_html_translation_table([int table [, int quote_style [, string charset_hint]]])
Returns the internal translation table used by htmlspecialchars and htmlentities */
PHP_FUNCTION(get_html_translation_table)
{
long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
- int i, j;
- char ind[2];
- enum entity_charset charset = determine_charset(NULL TSRMLS_CC);
+ unsigned int i;
+ int j;
+ unsigned char ind[5]; /* max # of 8-bit code units (4; for UTF-8) + 1 for \0 */
+ void *dummy;
+ char *charset_hint = NULL;
+ int charset_hint_len;
+ enum entity_charset charset;
- if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|ll", &which, &quote_style) == FAILURE) {
+ if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|lls",
+ &which, &quote_style, &charset_hint, &charset_hint_len) == FAILURE) {
return;
}
- array_init(return_value);
+ charset = determine_charset(charset_hint TSRMLS_CC);
- ind[1] = 0;
+ array_init(return_value);
switch (which) {
- case HTML_ENTITIES:
- for (j=0; entity_map[j].charset != cs_terminator; j++) {
- if (entity_map[j].charset != charset)
+ case HTML_ENTITIES:
+ for (j = 0; entity_map[j].charset != cs_terminator; j++) {
+ if (entity_map[j].charset != charset)
+ continue;
+ for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
+ char buffer[16];
+ unsigned k;
+ size_t written;
+
+ if (entity_map[j].table[i] == NULL)
continue;
- for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
- char buffer[16];
+
+ k = i + entity_map[j].basechar;
- if (entity_map[j].table[i] == NULL)
- continue;
- /* what about wide chars here ?? */
- ind[0] = i + entity_map[j].basechar;
- snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
- add_assoc_string(return_value, ind, buffer, 1);
+ switch (charset) {
+ case cs_utf_8:
+ written = php_utf32_utf8(ind, k);
+ ind[written] = '\0';
+ break;
+ case cs_big5:
+ case cs_gb2312:
+ case cs_big5hkscs:
+ case cs_sjis:
+ /* we have no mappings for these, but if we had... */
+ /* break through */
+ default: /* one byte */
+ written = 1;
+ ind[0] = (unsigned char)k;
+ ind[1] = '\0';
+ break;
+ }
+ snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
+ if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, written+1, &dummy) == FAILURE) {
+ /* in case of the single quote, which is repeated, the first one wins,
+ * so don't replace the existing mapping */
+ add_assoc_string(return_value, (const char*)ind, buffer, 1);
}
}
- /* break thru */
-
- case HTML_SPECIALCHARS:
- for (j = 0; basic_entities[j].charcode != 0; j++) {
+ }
+ /* break thru */
- if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
- continue;
+ case HTML_SPECIALCHARS:
+ add_assoc_stringl(return_value, "&", "&amp;", sizeof("&amp;") - 1, 1);
+ for (j = 0; basic_entities[j].charcode != 0; j++) {
+ if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
+ continue;
- ind[0] = (unsigned char)basic_entities[j].charcode;
- add_assoc_stringl(return_value, ind, basic_entities[j].entity, basic_entities[j].entitylen, 1);
+ ind[0] = (unsigned char)basic_entities[j].charcode;
+ ind[1] = '\0';
+ if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, 2, &dummy) == FAILURE) {
+ add_assoc_stringl(return_value, ind, basic_entities[j].entity,
+ basic_entities[j].entitylen, 1);
}
- add_assoc_stringl(return_value, "&", "&amp;", sizeof("&amp;") - 1, 1);
+ }
- break;
+ break;
}
}
/* }}} */