diff options
Diffstat (limited to 'src/tool_urlglob.c')
-rw-r--r-- | src/tool_urlglob.c | 597 |
1 files changed, 597 insertions, 0 deletions
diff --git a/src/tool_urlglob.c b/src/tool_urlglob.c new file mode 100644 index 000000000..108fc3987 --- /dev/null +++ b/src/tool_urlglob.c @@ -0,0 +1,597 @@ +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2011, Daniel Stenberg, <daniel@haxx.se>, et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ +#include "setup.h" + +#include <curl/curl.h> + +#define _MPRINTF_REPLACE /* we want curl-functions instead of native ones */ +#include <curl/mprintf.h> + +#include "tool_urlglob.h" +#include "tool_vms.h" + +#include "memdebug.h" /* keep this as LAST include */ + +typedef enum { + GLOB_OK, + GLOB_NO_MEM, + GLOB_ERROR +} GlobCode; + +/* + * glob_word() + * + * Input a full globbed string, set the forth argument to the amount of + * strings we get out of this. Return GlobCode. + */ +static GlobCode glob_word(URLGlob *, /* object anchor */ + char *, /* globbed string */ + size_t, /* position */ + int *); /* returned number of strings */ + +static GlobCode glob_set(URLGlob *glob, char *pattern, + size_t pos, int *amount) +{ + /* processes a set expression with the point behind the opening '{' + ','-separated elements are collected until the next closing '}' + */ + URLPattern *pat; + GlobCode res; + bool done = FALSE; + char* buf = glob->glob_buffer; + + pat = &glob->pattern[glob->size / 2]; + /* patterns 0,1,2,... correspond to size=1,3,5,... */ + pat->type = UPTSet; + pat->content.Set.size = 0; + pat->content.Set.ptr_s = 0; + pat->content.Set.elements = NULL; + + ++glob->size; + + while(!done) { + switch (*pattern) { + case '\0': /* URL ended while set was still open */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "unmatched brace at pos %zu\n", pos); + return GLOB_ERROR; + + case '{': + case '[': /* no nested expressions at this time */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "nested braces not supported at pos %zu\n", pos); + return GLOB_ERROR; + + case ',': + case '}': /* set element completed */ + *buf = '\0'; + if(pat->content.Set.elements) { + char **new_arr = realloc(pat->content.Set.elements, + (pat->content.Set.size + 1) * sizeof(char*)); + if(!new_arr) { + short elem; + for(elem = 0; elem < pat->content.Set.size; elem++) + Curl_safefree(pat->content.Set.elements[elem]); + Curl_safefree(pat->content.Set.elements); + pat->content.Set.ptr_s = 0; + pat->content.Set.size = 0; + } + pat->content.Set.elements = new_arr; + } + else + pat->content.Set.elements = malloc(sizeof(char*)); + if(!pat->content.Set.elements) { + snprintf(glob->errormsg, sizeof(glob->errormsg), "out of memory\n"); + return GLOB_NO_MEM; + } + pat->content.Set.elements[pat->content.Set.size] = + strdup(glob->glob_buffer); + if(!pat->content.Set.elements[pat->content.Set.size]) { + short elem; + for(elem = 0; elem < pat->content.Set.size; elem++) + Curl_safefree(pat->content.Set.elements[elem]); + Curl_safefree(pat->content.Set.elements); + pat->content.Set.ptr_s = 0; + pat->content.Set.size = 0; + snprintf(glob->errormsg, sizeof(glob->errormsg), "out of memory\n"); + return GLOB_NO_MEM; + } + ++pat->content.Set.size; + + if(*pattern == '}') { + /* entire set pattern completed */ + int wordamount; + + /* always check for a literal (may be "") between patterns */ + res = glob_word(glob, ++pattern, ++pos, &wordamount); + if(res) { + short elem; + for(elem = 0; elem < pat->content.Set.size; elem++) + Curl_safefree(pat->content.Set.elements[elem]); + Curl_safefree(pat->content.Set.elements); + pat->content.Set.ptr_s = 0; + pat->content.Set.size = 0; + return res; + } + + *amount = pat->content.Set.size * wordamount; + + done = TRUE; + continue; + } + + buf = glob->glob_buffer; + ++pattern; + ++pos; + break; + + case ']': /* illegal closing bracket */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "illegal pattern at pos %zu\n", pos); + return GLOB_ERROR; + + case '\\': /* escaped character, skip '\' */ + if(pattern[1]) { + ++pattern; + ++pos; + } + /* intentional fallthrough */ + default: + *buf++ = *pattern++; /* copy character to set element */ + ++pos; + } + } + return GLOB_OK; +} + +static GlobCode glob_range(URLGlob *glob, char *pattern, + size_t pos, int *amount) +{ + /* processes a range expression with the point behind the opening '[' + - char range: e.g. "a-z]", "B-Q]" + - num range: e.g. "0-9]", "17-2000]" + - num range with leading zeros: e.g. "001-999]" + expression is checked for well-formedness and collected until the next ']' + */ + URLPattern *pat; + char *c; + char sep; + char sep2; + int step; + int rc; + GlobCode res; + int wordamount = 1; + + pat = &glob->pattern[glob->size / 2]; + /* patterns 0,1,2,... correspond to size=1,3,5,... */ + ++glob->size; + + if(ISALPHA(*pattern)) { + /* character range detected */ + char min_c; + char max_c; + + pat->type = UPTCharRange; + + rc = sscanf(pattern, "%c-%c%c%d%c", &min_c, &max_c, &sep, &step, &sep2); + + if((rc < 3) || (min_c >= max_c) || ((max_c - min_c) > ('z' - 'a'))) { + /* the pattern is not well-formed */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "error: bad range specification after pos %zu\n", pos); + return GLOB_ERROR; + } + + /* check the (first) separating character */ + if((sep != ']') && (sep != ':')) { + snprintf(glob->errormsg, sizeof(glob->errormsg), + "error: unsupported character (%c) after range at pos %zu\n", + sep, pos); + return GLOB_ERROR; + } + + /* if there was a ":[num]" thing, use that as step or else use 1 */ + pat->content.CharRange.step = + ((sep == ':') && (rc == 5) && (sep2 == ']')) ? step : 1; + + pat->content.CharRange.ptr_c = pat->content.CharRange.min_c = min_c; + pat->content.CharRange.max_c = max_c; + } + else if(ISDIGIT(*pattern)) { + /* numeric range detected */ + int min_n; + int max_n; + + pat->type = UPTNumRange; + pat->content.NumRange.padlength = 0; + + rc = sscanf(pattern, "%d-%d%c%d%c", &min_n, &max_n, &sep, &step, &sep2); + + if((rc < 2) || (min_n > max_n)) { + /* the pattern is not well-formed */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "error: bad range specification after pos %zu\n", pos); + return GLOB_ERROR; + } + pat->content.NumRange.ptr_n = pat->content.NumRange.min_n = min_n; + pat->content.NumRange.max_n = max_n; + + /* if there was a ":[num]" thing, use that as step or else use 1 */ + pat->content.NumRange.step = + ((sep == ':') && (rc == 5) && (sep2 == ']')) ? step : 1; + + if(*pattern == '0') { + /* leading zero specified */ + c = pattern; + while(ISDIGIT(*c)) { + c++; + ++pat->content.NumRange.padlength; /* padding length is set for all + instances of this pattern */ + } + } + } + else { + snprintf(glob->errormsg, sizeof(glob->errormsg), + "illegal character in range specification at pos %zu\n", pos); + return GLOB_ERROR; + } + + c = (char*)strchr(pattern, ']'); /* continue after next ']' */ + if(c) + c++; + else { + snprintf(glob->errormsg, sizeof(glob->errormsg), "missing ']'"); + return GLOB_ERROR; /* missing ']' */ + } + + /* always check for a literal (may be "") between patterns */ + + res = glob_word(glob, c, pos + (c - pattern), &wordamount); + if(res == GLOB_ERROR) { + wordamount = 1; + res = GLOB_OK; + } + + if(!res) { + if(pat->type == UPTCharRange) + *amount = wordamount * (pat->content.CharRange.max_c - + pat->content.CharRange.min_c + 1); + else + *amount = wordamount * (pat->content.NumRange.max_n - + pat->content.NumRange.min_n + 1); + } + + return res; /* GLOB_OK or GLOB_NO_MEM */ +} + +static GlobCode glob_word(URLGlob *glob, char *pattern, + size_t pos, int *amount) +{ + /* processes a literal string component of a URL + special characters '{' and '[' branch to set/range processing functions + */ + char* buf = glob->glob_buffer; + size_t litindex; + GlobCode res = GLOB_OK; + + *amount = 1; /* default is one single string */ + + while(*pattern != '\0' && *pattern != '{' && *pattern != '[') { + if(*pattern == '}' || *pattern == ']') { + snprintf(glob->errormsg, sizeof(glob->errormsg), + "unmatched close brace/bracket at pos %zu\n", pos); + return GLOB_ERROR; + } + + /* only allow \ to escape known "special letters" */ + if(*pattern == '\\' && + (*(pattern+1) == '{' || *(pattern+1) == '[' || + *(pattern+1) == '}' || *(pattern+1) == ']') ) { + + /* escape character, skip '\' */ + ++pattern; + ++pos; + } + *buf++ = *pattern++; /* copy character to literal */ + ++pos; + } + *buf = '\0'; + litindex = glob->size / 2; + /* literals 0,1,2,... correspond to size=0,2,4,... */ + glob->literal[litindex] = strdup(glob->glob_buffer); + if(!glob->literal[litindex]) { + snprintf(glob->errormsg, sizeof(glob->errormsg), "out of memory\n"); + return GLOB_NO_MEM; + } + ++glob->size; + + switch (*pattern) { + case '\0': + /* singular URL processed */ + break; + + case '{': + /* process set pattern */ + res = glob_set(glob, ++pattern, ++pos, amount); + break; + + case '[': + /* process range pattern */ + res = glob_range(glob, ++pattern, ++pos, amount); + break; + } + + if(res) + Curl_safefree(glob->literal[litindex]); + + return res; +} + +int glob_url(URLGlob** glob, char* url, int *urlnum, FILE *error) +{ + /* + * We can deal with any-size, just make a buffer with the same length + * as the specified URL! + */ + URLGlob *glob_expand; + int amount; + char *glob_buffer; + GlobCode res; + + *glob = NULL; + + glob_buffer = malloc(strlen(url) + 1); + if(!glob_buffer) + return CURLE_OUT_OF_MEMORY; + + glob_expand = calloc(1, sizeof(URLGlob)); + if(!glob_expand) { + Curl_safefree(glob_buffer); + return CURLE_OUT_OF_MEMORY; + } + glob_expand->size = 0; + glob_expand->urllen = strlen(url); + glob_expand->glob_buffer = glob_buffer; + glob_expand->beenhere = 0; + + res = glob_word(glob_expand, url, 1, &amount); + if(!res) + *urlnum = amount; + else { + if(error && glob_expand->errormsg[0]) { + /* send error description to the error-stream */ + fprintf(error, "curl: (%d) [globbing] %s", + (res == GLOB_NO_MEM) ? CURLE_OUT_OF_MEMORY : CURLE_URL_MALFORMAT, + glob_expand->errormsg); + } + /* it failed, we cleanup */ + Curl_safefree(glob_buffer); + Curl_safefree(glob_expand); + *urlnum = 1; + return (res == GLOB_NO_MEM) ? CURLE_OUT_OF_MEMORY : CURLE_URL_MALFORMAT; + } + + *glob = glob_expand; + return CURLE_OK; +} + +void glob_cleanup(URLGlob* glob) +{ + size_t i; + int elem; + + for(i = glob->size - 1; i < glob->size; --i) { + if(!(i & 1)) { /* even indexes contain literals */ + Curl_safefree(glob->literal[i/2]); + } + else { /* odd indexes contain sets or ranges */ + if((glob->pattern[i/2].type == UPTSet) && + (glob->pattern[i/2].content.Set.elements)) { + for(elem = glob->pattern[i/2].content.Set.size - 1; + elem >= 0; + --elem) { + Curl_safefree(glob->pattern[i/2].content.Set.elements[elem]); + } + Curl_safefree(glob->pattern[i/2].content.Set.elements); + } + } + } + Curl_safefree(glob->glob_buffer); + Curl_safefree(glob); +} + +char *glob_next_url(URLGlob *glob) +{ + URLPattern *pat; + char *lit; + size_t i; + size_t j; + size_t len; + size_t buflen = glob->urllen + 1; + char *buf = glob->glob_buffer; + + if(!glob->beenhere) + glob->beenhere = 1; + else { + bool carry = TRUE; + + /* implement a counter over the index ranges of all patterns, + starting with the rightmost pattern */ + for(i = glob->size / 2 - 1; carry && (i < glob->size); --i) { + carry = FALSE; + pat = &glob->pattern[i]; + switch (pat->type) { + case UPTSet: + if((pat->content.Set.elements) && + (++pat->content.Set.ptr_s == pat->content.Set.size)) { + pat->content.Set.ptr_s = 0; + carry = TRUE; + } + break; + case UPTCharRange: + pat->content.CharRange.ptr_c = (char)(pat->content.CharRange.step + + (int)((unsigned char)pat->content.CharRange.ptr_c)); + if(pat->content.CharRange.ptr_c > pat->content.CharRange.max_c) { + pat->content.CharRange.ptr_c = pat->content.CharRange.min_c; + carry = TRUE; + } + break; + case UPTNumRange: + pat->content.NumRange.ptr_n += pat->content.NumRange.step; + if(pat->content.NumRange.ptr_n > pat->content.NumRange.max_n) { + pat->content.NumRange.ptr_n = pat->content.NumRange.min_n; + carry = TRUE; + } + break; + default: + printf("internal error: invalid pattern type (%d)\n", (int)pat->type); + exit (CURLE_FAILED_INIT); + } + } + if(carry) /* first pattern ptr has run into overflow, done! */ + return NULL; + } + + for(j = 0; j < glob->size; ++j) { + if(!(j&1)) { /* every other term (j even) is a literal */ + lit = glob->literal[j/2]; + len = snprintf(buf, buflen, "%s", lit); + buf += len; + buflen -= len; + } + else { /* the rest (i odd) are patterns */ + pat = &glob->pattern[j/2]; + switch(pat->type) { + case UPTSet: + if(pat->content.Set.elements) { + len = strlen(pat->content.Set.elements[pat->content.Set.ptr_s]); + snprintf(buf, buflen, "%s", + pat->content.Set.elements[pat->content.Set.ptr_s]); + buf += len; + buflen -= len; + } + break; + case UPTCharRange: + *buf++ = pat->content.CharRange.ptr_c; + break; + case UPTNumRange: + len = snprintf(buf, buflen, "%0*d", + pat->content.NumRange.padlength, + pat->content.NumRange.ptr_n); + buf += len; + buflen -= len; + break; + default: + printf("internal error: invalid pattern type (%d)\n", (int)pat->type); + exit (CURLE_FAILED_INIT); + } + } + } + *buf = '\0'; + return strdup(glob->glob_buffer); +} + +char *glob_match_url(char *filename, URLGlob *glob) +{ + char *target; + size_t allocsize; + char numbuf[18]; + char *appendthis = NULL; + size_t appendlen = 0; + size_t stringlen = 0; + + /* We cannot use the glob_buffer for storage here since the filename may + * be longer than the URL we use. We allocate a good start size, then + * we need to realloc in case of need. + */ + allocsize = strlen(filename) + 1; /* make it at least one byte to store the + trailing zero */ + target = malloc(allocsize); + if(!target) + return NULL; /* major failure */ + + while(*filename) { + if(*filename == '#' && ISDIGIT(filename[1])) { + unsigned long i; + char *ptr = filename; + unsigned long num = strtoul(&filename[1], &filename, 10); + i = num - 1UL; + + if(num && (i <= glob->size / 2)) { + URLPattern pat = glob->pattern[i]; + switch (pat.type) { + case UPTSet: + if(pat.content.Set.elements) { + appendthis = pat.content.Set.elements[pat.content.Set.ptr_s]; + appendlen = + strlen(pat.content.Set.elements[pat.content.Set.ptr_s]); + } + break; + case UPTCharRange: + numbuf[0] = pat.content.CharRange.ptr_c; + numbuf[1] = 0; + appendthis = numbuf; + appendlen = 1; + break; + case UPTNumRange: + snprintf(numbuf, sizeof(numbuf), "%0*d", + pat.content.NumRange.padlength, + pat.content.NumRange.ptr_n); + appendthis = numbuf; + appendlen = strlen(numbuf); + break; + default: + printf("internal error: invalid pattern type (%d)\n", + (int)pat.type); + Curl_safefree(target); + return NULL; + } + } + else { + /* #[num] out of range, use the #[num] in the output */ + filename = ptr; + appendthis = filename++; + appendlen = 1; + } + } + else { + appendthis = filename++; + appendlen = 1; + } + if(appendlen + stringlen >= allocsize) { + char *newstr; + /* we append a single byte to allow for the trailing byte to be appended + at the end of this function outside the while() loop */ + allocsize = (appendlen + stringlen) * 2; + newstr = realloc(target, allocsize + 1); + if(!newstr) { + Curl_safefree(target); + return NULL; + } + target = newstr; + } + memcpy(&target[stringlen], appendthis, appendlen); + stringlen += appendlen; + } + target[stringlen]= '\0'; + return target; +} |