diff options
author | Nikita Popov <nikic@php.net> | 2016-12-15 17:22:08 +0100 |
---|---|---|
committer | Nikita Popov <nikic@php.net> | 2017-01-02 23:31:27 +0100 |
commit | 593b6cd0a87401d72aedfcbad8bf9b1b149d94f4 (patch) | |
tree | 7da3fafbf2c1bd4e9435b02505d6f15e0ffc3bf4 | |
parent | b06fb88cf9bcaea72dd87dd4273228243c5a5b0b (diff) | |
download | php-git-593b6cd0a87401d72aedfcbad8bf9b1b149d94f4.tar.gz |
Optimize get_browser() using prefix and contains checks
Avoid expensive regex compilations by checking for prefix
and contained segments beforehand.
-rw-r--r-- | ext/standard/browscap.c | 248 |
1 files changed, 185 insertions, 63 deletions
diff --git a/ext/standard/browscap.c b/ext/standard/browscap.c index 14ca27a95e..71491bcb45 100644 --- a/ext/standard/browscap.c +++ b/ext/standard/browscap.c @@ -27,6 +27,8 @@ #include "zend_ini_scanner.h" #include "zend_globals.h" +#define BROWSCAP_NUM_CONTAINS 5 + typedef struct { zend_string *key; zend_string *value; @@ -34,9 +36,13 @@ typedef struct { typedef struct { zend_string *pattern; - zend_string *regex; + zend_string *parent; uint32_t kv_start; uint32_t kv_end; + /* We ensure that the length fits in 16 bits, so this is fine */ + uint16_t contains_start[BROWSCAP_NUM_CONTAINS]; + uint8_t contains_len[BROWSCAP_NUM_CONTAINS]; + uint8_t prefix_len; } browscap_entry; typedef struct { @@ -67,7 +73,9 @@ static void browscap_entry_dtor(zval *zvalue) { browscap_entry *entry = Z_PTR_P(zvalue); zend_string_release(entry->pattern); - zend_string_release(entry->regex); + if (entry->parent) { + zend_string_release(entry->parent); + } efree(entry); } @@ -75,10 +83,55 @@ static void browscap_entry_dtor_persistent(zval *zvalue) { browscap_entry *entry = Z_PTR_P(zvalue); zend_string_release(entry->pattern); - zend_string_release(entry->regex); + if (entry->parent) { + zend_string_release(entry->parent); + } pefree(entry, 1); } +static inline zend_bool is_placeholder(char c) { + return c == '?' || c == '*'; +} + +/* Length of prefix not containing any wildcards */ +static uint8_t browscap_compute_prefix_len(zend_string *pattern) { + size_t i; + for (i = 0; i < ZSTR_LEN(pattern); i++) { + if (is_placeholder(ZSTR_VAL(pattern)[i])) { + break; + } + } + return MIN(i, UINT8_MAX); +} + +static size_t browscap_compute_contains( + zend_string *pattern, size_t start_pos, + uint16_t *contains_start, uint8_t *contains_len) { + size_t i = start_pos; + /* Find first non-placeholder character after prefix */ + for (; i < ZSTR_LEN(pattern); i++) { + if (!is_placeholder(ZSTR_VAL(pattern)[i])) { + /* Skip the case of a single non-placeholder character. + * Let's try to find something longer instead. */ + if (i + 1 < ZSTR_LEN(pattern) && + !is_placeholder(ZSTR_VAL(pattern)[i + 1])) { + break; + } + } + } + *contains_start = i; + + /* Find first placeholder character after that */ + for (; i < ZSTR_LEN(pattern); i++) { + if (is_placeholder(ZSTR_VAL(pattern)[i])) { + break; + } + } + *contains_len = MIN(i - *contains_start, UINT8_MAX); + return i; +} + +/* Length of regex, including escapes, anchors, etc. */ static size_t browscap_compute_regex_len(zend_string *pattern) { size_t i, len = ZSTR_LEN(pattern); for (i = 0; i < ZSTR_LEN(pattern); i++) { @@ -227,12 +280,17 @@ static HashTable *browscap_entry_to_array(browser_data *bdata, browscap_entry *e ALLOC_HASHTABLE(ht); zend_hash_init(ht, 8, NULL, ZVAL_PTR_DTOR, 0); - ZVAL_STR_COPY(&tmp, entry->regex); + ZVAL_STR(&tmp, browscap_convert_pattern(entry->pattern, 0)); zend_hash_str_add(ht, "browser_name_regex", sizeof("browser_name_regex")-1, &tmp); ZVAL_STR_COPY(&tmp, entry->pattern); zend_hash_str_add(ht, "browser_name_pattern", sizeof("browser_name_pattern")-1, &tmp); + if (entry->parent) { + ZVAL_STR_COPY(&tmp, entry->parent); + zend_hash_str_add(ht, "parent", sizeof("parent")-1, &tmp); + } + for (i = entry->kv_start; i < entry->kv_end; i++) { ZVAL_STR_COPY(&tmp, bdata->kv[i].value); zend_hash_add(ht, bdata->kv[i].key, &tmp); @@ -256,17 +314,6 @@ static void php_browscap_parser_cb(zval *arg1, zval *arg2, zval *arg3, int callb if (ctx->current_entry != NULL && arg2) { zend_string *new_key, *new_value; - /* parent entry can not be same as current section -> causes infinite loop! */ - if (!strcasecmp(Z_STRVAL_P(arg1), "parent") && - ctx->current_section_name != NULL && - !strcasecmp(ZSTR_VAL(ctx->current_section_name), Z_STRVAL_P(arg2)) - ) { - zend_error(E_CORE_ERROR, "Invalid browscap ini file: " - "'Parent' value cannot be same as the section name: %s " - "(in file %s)", ZSTR_VAL(ctx->current_section_name), INI_STR("browscap")); - return; - } - /* Set proper value for true/false settings */ if ((Z_STRLEN_P(arg2) == 2 && !strncasecmp(Z_STRVAL_P(arg2), "on", sizeof("on") - 1)) || (Z_STRLEN_P(arg2) == 3 && !strncasecmp(Z_STRVAL_P(arg2), "yes", sizeof("yes") - 1)) || @@ -284,28 +331,61 @@ static void php_browscap_parser_cb(zval *arg1, zval *arg2, zval *arg3, int callb new_value = browscap_intern_str(ctx, Z_STR_P(arg2)); } - new_key = browscap_intern_str_ci(ctx, Z_STR_P(arg1), persistent); - browscap_add_kv(bdata, new_key, new_value, persistent); - ctx->current_entry->kv_end = bdata->kv_used; + if (!strcasecmp(Z_STRVAL_P(arg1), "parent")) { + /* parent entry can not be same as current section -> causes infinite loop! */ + if (ctx->current_section_name != NULL && + !strcasecmp(ZSTR_VAL(ctx->current_section_name), Z_STRVAL_P(arg2)) + ) { + zend_error(E_CORE_ERROR, "Invalid browscap ini file: " + "'Parent' value cannot be same as the section name: %s " + "(in file %s)", ZSTR_VAL(ctx->current_section_name), INI_STR("browscap")); + return; + } + + if (ctx->current_entry->parent) { + zend_string_release(ctx->current_entry->parent); + } + ctx->current_entry->parent = new_value; + } else { + new_key = browscap_intern_str_ci(ctx, Z_STR_P(arg1), persistent); + browscap_add_kv(bdata, new_key, new_value, persistent); + ctx->current_entry->kv_end = bdata->kv_used; + } } break; - case ZEND_INI_PARSER_SECTION: { - zval processed; - zval unprocessed; + case ZEND_INI_PARSER_SECTION: + { + browscap_entry *entry; + zend_string *pattern = Z_STR_P(arg1); + size_t pos; + int i; + + if (ZSTR_LEN(pattern) > UINT16_MAX) { + php_error_docref(NULL, E_WARNING, + "Skipping excessively long pattern of length %zd", ZSTR_LEN(pattern)); + break; + } + + entry = ctx->current_entry + = pemalloc(sizeof(browscap_entry), persistent); + zend_hash_update_ptr(bdata->htab, pattern, entry); - ctx->current_entry = pemalloc(sizeof(browscap_entry), persistent); - zend_hash_update_ptr(bdata->htab, Z_STR_P(arg1), ctx->current_entry); + if (ctx->current_section_name) { + zend_string_release(ctx->current_section_name); + } + ctx->current_section_name = zend_string_copy(pattern); - if (ctx->current_section_name) { - zend_string_release(ctx->current_section_name); - } - ctx->current_section_name = zend_string_copy(Z_STR_P(arg1)); + entry->pattern = zend_string_copy(pattern); + entry->kv_end = entry->kv_start = bdata->kv_used; + entry->parent = NULL; - ctx->current_entry->regex = browscap_convert_pattern(Z_STR_P(arg1), persistent); - ctx->current_entry->pattern = zend_string_copy(Z_STR_P(arg1)); - ctx->current_entry->kv_end = ctx->current_entry->kv_start = bdata->kv_used; + pos = entry->prefix_len = browscap_compute_prefix_len(pattern); + for (i = 0; i < BROWSCAP_NUM_CONTAINS; i++) { + pos = browscap_compute_contains(pattern, pos, + &entry->contains_start[i], &entry->contains_len[i]); } break; + } } } /* }}} */ @@ -455,32 +535,79 @@ PHP_MSHUTDOWN_FUNCTION(browscap) /* {{{ */ } /* }}} */ +static inline size_t browscap_get_minimum_length(browscap_entry *entry) { + size_t len = entry->prefix_len; + int i; + for (i = 0; i < BROWSCAP_NUM_CONTAINS; i++) { + len += entry->contains_len[i]; + } + return len; +} + static int browser_reg_compare( zval *entry_zv, int num_args, va_list args, zend_hash_key *key) /* {{{ */ { browscap_entry *entry = Z_PTR_P(entry_zv); - char *lookup_browser_name = va_arg(args, char *); - int lookup_browser_length = va_arg(args, int); + zend_string *agent_name = va_arg(args, zend_string *); browscap_entry **found_entry_ptr = va_arg(args, browscap_entry **); browscap_entry *found_entry = *found_entry_ptr; + ALLOCA_FLAG(use_heap); + zend_string *pattern_lc, *regex; + const char *cur; + int i; pcre *re; int re_options; pcre_extra *re_extra; - /* See if we have an exact match, if so, we're done... */ - if (found_entry) { - if (!strcasecmp(ZSTR_VAL(found_entry->pattern), lookup_browser_name)) { - return 0; + /* Agent name too short */ + if (ZSTR_LEN(agent_name) < browscap_get_minimum_length(entry)) { + return 0; + } + + /* Quickly discard patterns where the prefix doesn't match. */ + if (zend_binary_strcasecmp( + ZSTR_VAL(agent_name), entry->prefix_len, + ZSTR_VAL(entry->pattern), entry->prefix_len) != 0) { + return 0; + } + + /* Lowercase the pattern, the agent name is already lowercase */ + ZSTR_ALLOCA_ALLOC(pattern_lc, ZSTR_LEN(entry->pattern), use_heap); + zend_str_tolower_copy(ZSTR_VAL(pattern_lc), ZSTR_VAL(entry->pattern), ZSTR_LEN(entry->pattern)); + + /* Check if the agent contains the "contains" portions */ + cur = ZSTR_VAL(agent_name) + entry->prefix_len; + for (i = 0; i < BROWSCAP_NUM_CONTAINS; i++) { + if (entry->contains_len[i] != 0) { + cur = zend_memnstr(cur, + ZSTR_VAL(pattern_lc) + entry->contains_start[i], + entry->contains_len[i], + ZSTR_VAL(agent_name) + ZSTR_LEN(agent_name)); + if (!cur) { + ZSTR_ALLOCA_FREE(pattern_lc, use_heap); + return 0; + } + cur += entry->contains_len[i]; } } - re = pcre_get_compiled_regex(entry->regex, &re_extra, &re_options); + /* See if we have an exact match, if so, we're done... */ + if (zend_string_equals(agent_name, pattern_lc)) { + *found_entry_ptr = entry; + ZSTR_ALLOCA_FREE(pattern_lc, use_heap); + return ZEND_HASH_APPLY_STOP; + } + + regex = browscap_convert_pattern(entry->pattern, 0); + re = pcre_get_compiled_regex(regex, &re_extra, &re_options); if (re == NULL) { + ZSTR_ALLOCA_FREE(pattern_lc, use_heap); + zend_string_release(regex); return 0; } - if (pcre_exec(re, re_extra, lookup_browser_name, lookup_browser_length, 0, re_options, NULL, 0) == 0) { + if (pcre_exec(re, re_extra, ZSTR_VAL(agent_name), ZSTR_LEN(agent_name), 0, re_options, NULL, 0) == 0) { /* If we've found a possible browser, we need to do a comparison of the number of characters changed in the user agent being checked versus the previous match found and the current match. */ @@ -523,6 +650,8 @@ static int browser_reg_compare( } } + ZSTR_ALLOCA_FREE(pattern_lc, use_heap); + zend_string_release(regex); return 0; } /* }}} */ @@ -537,12 +666,8 @@ static void browscap_zval_copy_ctor(zval *p) /* {{{ */ Get information about the capabilities of a browser. If browser_name is omitted or null, HTTP_USER_AGENT is used. Returns an object by default; if return_array is true, returns an array. */ PHP_FUNCTION(get_browser) { - char *agent_name = NULL; - size_t agent_name_len = 0; + zend_string *agent_name = NULL, *lookup_browser_name; zend_bool return_array = 0; - zval *agent, *parent_agent_name_zv, *http_user_agent; - zend_string *parent_agent_name; - char *lookup_browser_name; browser_data *bdata; browscap_entry *found_entry = NULL; HashTable *agent_ht; @@ -562,27 +687,29 @@ PHP_FUNCTION(get_browser) bdata = &global_bdata; } - if (zend_parse_parameters(ZEND_NUM_ARGS(), "|s!b", &agent_name, &agent_name_len, &return_array) == FAILURE) { + if (zend_parse_parameters(ZEND_NUM_ARGS(), "|S!b", &agent_name, &return_array) == FAILURE) { return; } if (agent_name == NULL) { - if ((Z_TYPE(PG(http_globals)[TRACK_VARS_SERVER]) == IS_ARRAY || zend_is_auto_global_str(ZEND_STRL("_SERVER"))) && - (http_user_agent = zend_hash_str_find(Z_ARRVAL_P(&PG(http_globals)[TRACK_VARS_SERVER]), "HTTP_USER_AGENT", sizeof("HTTP_USER_AGENT")-1)) == NULL - ) { + zval *http_user_agent = NULL; + if (Z_TYPE(PG(http_globals)[TRACK_VARS_SERVER]) == IS_ARRAY + || zend_is_auto_global_str(ZEND_STRL("_SERVER"))) { + http_user_agent = zend_hash_str_find( + Z_ARRVAL_P(&PG(http_globals)[TRACK_VARS_SERVER]), + "HTTP_USER_AGENT", sizeof("HTTP_USER_AGENT")-1); + } + if (http_user_agent == NULL) { php_error_docref(NULL, E_WARNING, "HTTP_USER_AGENT variable is not set, cannot determine user agent name"); RETURN_FALSE; } - agent_name = Z_STRVAL_P(http_user_agent); - agent_name_len = Z_STRLEN_P(http_user_agent); + agent_name = Z_STR_P(http_user_agent); } - lookup_browser_name = estrndup(agent_name, agent_name_len); - php_strtolower(lookup_browser_name, agent_name_len); - - found_entry = zend_hash_str_find_ptr(bdata->htab, lookup_browser_name, agent_name_len); + lookup_browser_name = zend_string_tolower(agent_name); + found_entry = zend_hash_find_ptr(bdata->htab, lookup_browser_name); if (found_entry == NULL) { - zend_hash_apply_with_arguments(bdata->htab, browser_reg_compare, 3, lookup_browser_name, agent_name_len, &found_entry); + zend_hash_apply_with_arguments(bdata->htab, browser_reg_compare, 2, lookup_browser_name, &found_entry); if (found_entry == NULL) { found_entry = zend_hash_str_find_ptr(bdata->htab, @@ -602,11 +729,9 @@ PHP_FUNCTION(get_browser) object_and_properties_init(return_value, zend_standard_class_def, agent_ht); } - parent_agent_name_zv = zend_hash_str_find(agent_ht, "parent", sizeof("parent")-1); - parent_agent_name = parent_agent_name_zv ? Z_STR_P(parent_agent_name_zv) : NULL; - - while (parent_agent_name) { - if ((found_entry = zend_hash_find_ptr(bdata->htab, parent_agent_name)) == NULL) { + while (found_entry->parent) { + found_entry = zend_hash_find_ptr(bdata->htab, found_entry->parent); + if (found_entry == NULL) { break; } @@ -617,14 +742,11 @@ PHP_FUNCTION(get_browser) zend_hash_merge(Z_OBJPROP_P(return_value), agent_ht, (copy_ctor_func_t) browscap_zval_copy_ctor, 0); } - parent_agent_name_zv = zend_hash_str_find(agent_ht, "parent", sizeof("parent")-1); - parent_agent_name = parent_agent_name_zv ? Z_STR_P(parent_agent_name_zv) : NULL; - zend_hash_destroy(agent_ht); efree(agent_ht); } - efree(lookup_browser_name); + zend_string_release(lookup_browser_name); } /* }}} */ |