urlapi: leaner with fewer allocs

Slightly faster with more robust code. Uses fewer and smaller mallocs. - remove two fields from the URL handle struct - reduce copies and allocs - use dynbuf buffers more instead of custom malloc + copies - uses dynbuf to build the host name in reduces serial alloc+free within the same function. - move dedotdotify into urlapi.c and make it static, not strdup the input and optimize it by checking for . and / before using strncmp - remove a few strlen() calls - add Curl_dyn_setlen() that can "trim" an existing dynbuf Closes #9408
author: Daniel Stenberg <daniel@haxx.se> 2022-09-01 10:16:24 +0200
committer: Daniel Stenberg <daniel@haxx.se> 2022-09-07 10:21:45 +0200
commit: f703cf971ccdb93fea05f223cabbb99ca06cdc75 (patch)
tree: fae9b603ebdc5cb79dc6aaed3b49e8744cd57969 /lib/urlapi.c
parent: 2ae81e680b8d6f8b9eb36fd1a2e223783526dbdd (diff)
download: curl-f703cf971ccdb93fea05f223cabbb99ca06cdc75.tar.gz
1 files changed, 487 insertions, 345 deletions
diff --git a/lib/urlapi.c b/lib/urlapi.c
index 0e07d0ba3..baecb84b9 100644
--- a/lib/urlapi.c
+++ b/lib/urlapi.c
@@ -27,12 +27,12 @@
 #include "urldata.h"
 #include "urlapi-int.h"
 #include "strcase.h"
-#include "dotdot.h"
 #include "url.h"
 #include "escape.h"
 #include "curl_ctype.h"
 #include "inet_pton.h"
 #include "inet_ntop.h"
+#include "strdup.h"
 
 /* The last 3 #include files should be in this order */
 #include "curl_printf.h"
@@ -68,9 +68,6 @@ struct Curl_URL {
   char *path;
   char *query;
   char *fragment;
-
-  char *scratch; /* temporary scratch area */
-  char *temppath; /* temporary path pointer */
   long portnum; /* the numerical version */
 };
 
@@ -88,8 +85,6 @@ static void free_urlhandle(struct Curl_URL *u)
   free(u->path);
   free(u->query);
   free(u->fragment);
-  free(u->scratch);
-  free(u->temppath);
 }
 
 /*
@@ -130,64 +125,19 @@ static bool urlchar_needs_escaping(int c)
   return !(ISCNTRL(c) || ISSPACE(c) || ISGRAPH(c));
 }
 
-/*
- * strlen_url() returns the length of the given URL if the spaces within the
- * URL were properly URL encoded.
- * URL encoding should be skipped for host names, otherwise IDN resolution
- * will fail.
- */
-static size_t strlen_url(const char *url, bool relative)
-{
-  const unsigned char *ptr;
-  size_t newlen = 0;
-  bool left = TRUE; /* left side of the ? */
-  const unsigned char *host_sep = (const unsigned char *) url;
-
-  if(!relative)
-    host_sep = (const unsigned char *) find_host_sep(url);
-
-  for(ptr = (unsigned char *)url; *ptr; ptr++) {
-
-    if(ptr < host_sep) {
-      ++newlen;
-      continue;
-    }
-
-    if(*ptr == ' ') {
-      if(left)
-        newlen += 3;
-      else
-        newlen++;
-      continue;
-    }
-
-    if (*ptr == '?')
-      left = FALSE;
-
-    if(urlchar_needs_escaping(*ptr))
-      newlen += 2;
-
-    newlen++;
-  }
-
-  return newlen;
-}
-
-/* strcpy_url() copies a url to a output buffer and URL-encodes the spaces in
- * the source URL accordingly.
+/* strcpy_url() creates a url in an output dynbuf and URL-encodes the spaces
+ * in the source URL accordingly.
+ *
  * URL encoding should be skipped for host names, otherwise IDN resolution
  * will fail.
  *
- * Returns TRUE if something was updated.
  */
-static bool strcpy_url(char *output, const char *url, bool relative)
+static CURLUcode strcpy_url(struct dynbuf *o, const char *url, bool relative)
 {
   /* we must add this with whitespace-replacing */
   bool left = TRUE;
   const unsigned char *iptr;
-  char *optr = output;
   const unsigned char *host_sep = (const unsigned char *) url;
-  bool changed = FALSE;
 
   if(!relative)
     host_sep = (const unsigned char *) find_host_sep(url);
@@ -197,19 +147,20 @@ static bool strcpy_url(char *output, const char *url, bool relative)
       iptr++) {
 
     if(iptr < host_sep) {
-      *optr++ = *iptr;
+      if(Curl_dyn_addn(o, iptr, 1))
+        return CURLUE_OUT_OF_MEMORY;
       continue;
     }
 
     if(*iptr == ' ') {
       if(left) {
-        *optr++='%'; /* add a '%' */
-        *optr++='2'; /* add a '2' */
-        *optr++='0'; /* add a '0' */
+        if(Curl_dyn_addn(o, "%20", 3))
+          return CURLUE_OUT_OF_MEMORY;
+      }
+      else {
+        if(Curl_dyn_addn(o, "+", 1))
+          return CURLUE_OUT_OF_MEMORY;
       }
-      else
-        *optr++='+'; /* add a '+' here */
-      changed = TRUE;
       continue;
     }
 
@@ -217,24 +168,24 @@ static bool strcpy_url(char *output, const char *url, bool relative)
       left = FALSE;
 
     if(urlchar_needs_escaping(*iptr)) {
-      msnprintf(optr, 4, "%%%02x", *iptr);
-      changed = TRUE;
-      optr += 3;
+      if(Curl_dyn_addf(o, "%%%02x", *iptr))
+        return CURLUE_OUT_OF_MEMORY;
+    }
+    else {
+      if(Curl_dyn_addn(o, iptr, 1))
+        return CURLUE_OUT_OF_MEMORY;
     }
-    else
-      *optr++ = *iptr;
   }
-  *optr = 0; /* null-terminate output buffer */
 
-  return changed;
+  return CURLUE_OK;
 }
 
 /*
- * Returns true if the given URL is absolute (as opposed to relative). Returns
- * the scheme in the buffer if TRUE and 'buf' is non-NULL. The buflen must
- * be larger than MAX_SCHEME_LEN if buf is set.
+ * Returns the length of the scheme if the given URL is absolute (as opposed
+ * to relative). Stores the scheme in the buffer if TRUE and 'buf' is
+ * non-NULL. The buflen must be larger than MAX_SCHEME_LEN if buf is set.
  */
-bool Curl_is_absolute_url(const char *url, char *buf, size_t buflen)
+size_t Curl_is_absolute_url(const char *url, char *buf, size_t buflen)
 {
   int i;
   DEBUGASSERT(!buf || (buflen > MAX_SCHEME_LEN));
@@ -243,7 +194,7 @@ bool Curl_is_absolute_url(const char *url, char *buf, size_t buflen)
     buf[0] = 0; /* always leave a defined value in buf */
 #ifdef WIN32
   if(STARTS_WITH_DRIVE_PREFIX(url))
-    return FALSE;
+    return 0;
 #endif
   for(i = 0; i < MAX_SCHEME_LEN; ++i) {
     char s = url[i];
@@ -257,15 +208,17 @@ bool Curl_is_absolute_url(const char *url, char *buf, size_t buflen)
     }
   }
   if(i && (url[i] == ':') && (url[i + 1] == '/')) {
+    /* the length of the scheme is the name part only */
+    size_t len = i;
     if(buf) {
       buf[i] = 0;
       while(i--) {
         buf[i] = Curl_raw_tolower(url[i]);
       }
     }
-    return TRUE;
+    return len;
   }
-  return FALSE;
+  return 0;
 }
 
 /*
@@ -273,34 +226,26 @@ bool Curl_is_absolute_url(const char *url, char *buf, size_t buflen)
  * URL-encodes any spaces.
  * The returned pointer must be freed by the caller unless NULL
  * (returns NULL on out of memory).
+ *
+ * Note that this function destroys the 'base' string.
  */
-static char *concat_url(const char *base, const char *relurl)
+static char *concat_url(char *base, const char *relurl)
 {
   /***
    TRY to append this new path to the old URL
    to the right of the host part. Oh crap, this is doomed to cause
    problems in the future...
   */
-  char *newest;
+  struct dynbuf newest;
   char *protsep;
   char *pathsep;
-  size_t newlen;
   bool host_changed = FALSE;
-
   const char *useurl = relurl;
-  size_t urllen;
-
-  /* we must make our own copy of the URL to play with, as it may
-     point to read-only data */
-  char *url_clone = strdup(base);
-
-  if(!url_clone)
-    return NULL; /* skip out of this NOW */
 
   /* protsep points to the start of the host name */
-  protsep = strstr(url_clone, "//");
+  protsep = strstr(base, "//");
   if(!protsep)
-    protsep = url_clone;
+    protsep = base;
   else
     protsep += 2; /* pass the slashes */
 
@@ -393,38 +338,24 @@ static char *concat_url(const char *base, const char *relurl)
     }
   }
 
-  /* If the new part contains a space, this is a mighty stupid redirect
-     but we still make an effort to do "right". To the left of a '?'
-     letter we replace each space with %20 while it is replaced with '+'
-     on the right side of the '?' letter.
-  */
-  newlen = strlen_url(useurl, !host_changed);
-
-  urllen = strlen(url_clone);
-
-  newest = malloc(urllen + 1 + /* possible slash */
-                  newlen + 1 /* zero byte */);
-
-  if(!newest) {
-    free(url_clone); /* don't leak this */
-    return NULL;
-  }
+  Curl_dyn_init(&newest, CURL_MAX_INPUT_LENGTH);
 
   /* copy over the root url part */
-  memcpy(newest, url_clone, urllen);
+  if(Curl_dyn_add(&newest, base))
+    return NULL;
 
   /* check if we need to append a slash */
   if(('/' == useurl[0]) || (protsep && !*protsep) || ('?' == useurl[0]))
     ;
-  else
-    newest[urllen++]='/';
+  else {
+    if(Curl_dyn_addn(&newest, "/", 1))
+      return NULL;
+  }
 
   /* then append the new piece on the right side */
-  strcpy_url(&newest[urllen], useurl, !host_changed);
+  strcpy_url(&newest, useurl, !host_changed);
 
-  free(url_clone);
-
-  return newest;
+  return Curl_dyn_ptr(&newest);
 }
 
 /* scan for byte values < 31 or 127 */
@@ -458,7 +389,7 @@ static bool junkscan(const char *part, unsigned int flags)
  *
  */
 static CURLUcode parse_hostname_login(struct Curl_URL *u,
-                                      char **hostname,
+                                      struct dynbuf *host,
                                       unsigned int flags)
 {
   CURLUcode result = CURLUE_OK;
@@ -468,23 +399,27 @@ static CURLUcode parse_hostname_login(struct Curl_URL *u,
   char *optionsp = NULL;
   const struct Curl_handler *h = NULL;
 
-  /* At this point, we're hoping all the other special cases have
-   * been taken care of, so conn->host.name is at most
-   *    [user[:password][;options]]@]hostname
+  /* At this point, we assume all the other special cases have been taken
+   * care of, so the host is at most
+   *
+   *   [user[:password][;options]]@]hostname
    *
    * We need somewhere to put the embedded details, so do that first.
    */
 
-  char *ptr = strchr(*hostname, '@');
-  char *login = *hostname;
+  char *login = Curl_dyn_ptr(host);
+  char *ptr;
+
+  DEBUGASSERT(login);
 
+  ptr = strchr(login, '@');
   if(!ptr)
     goto out;
 
   /* We will now try to extract the
    * possible login information in a string like:
    * ftp://user:password@ftp.my.site:8021/README */
-  *hostname = ++ptr;
+  ptr++;
 
   /* if this is a known scheme, get some details */
   if(u->scheme)
@@ -530,6 +465,10 @@ static CURLUcode parse_hostname_login(struct Curl_URL *u,
     u->options = optionsp;
   }
 
+  /* move the name to the start of the host buffer */
+  if(Curl_dyn_tail(host, strlen(ptr)))
+    return CURLUE_OUT_OF_MEMORY;
+
   return CURLUE_OK;
   out:
 
@@ -543,13 +482,13 @@ static CURLUcode parse_hostname_login(struct Curl_URL *u,
   return result;
 }
 
-UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, char *hostname,
+UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, struct dynbuf *host,
                                    bool has_scheme)
 {
   char *portptr = NULL;
   char endbracket;
   int len;
-
+  char *hostname = Curl_dyn_ptr(host);
   /*
    * Find the end of an IPv6 address, either on the ']' ending bracket or
    * a percent-encoded zone index.
@@ -586,6 +525,7 @@ UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, char *hostname,
     char *rest;
     long port;
     char portbuf[7];
+    size_t keep = portptr - hostname;
 
     /* Browser behavior adaptation. If there's a colon with no digits after,
        just cut off the name there which makes us ignore the colon and just
@@ -594,15 +534,15 @@ UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, char *hostname,
        Don't do it if the URL has no scheme, to make something that looks like
        a scheme not work!
     */
-    if(!portptr[1]) {
-      *portptr = '\0';
+    Curl_dyn_setlen(host, keep);
+    portptr++;
+    if(!*portptr)
       return has_scheme ? CURLUE_OK : CURLUE_BAD_PORT_NUMBER;
-    }
 
-    if(!ISDIGIT(portptr[1]))
+    if(!ISDIGIT(*portptr))
       return CURLUE_BAD_PORT_NUMBER;
 
-    port = strtol(portptr + 1, &rest, 10);  /* Port number must be decimal */
+    port = strtol(portptr, &rest, 10);  /* Port number must be decimal */
 
     if(port > 0xffff)
       return CURLUE_BAD_PORT_NUMBER;
@@ -610,7 +550,6 @@ UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, char *hostname,
     if(rest[0])
       return CURLUE_BAD_PORT_NUMBER;
 
-    *portptr++ = '\0'; /* cut off the name there */
     *rest = 0;
     /* generate a new port number string to get rid of leading zeroes etc */
     msnprintf(portbuf, sizeof(portbuf), "%ld", port);
@@ -623,12 +562,15 @@ UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, char *hostname,
   return CURLUE_OK;
 }
 
-static CURLUcode hostname_check(struct Curl_URL *u, char *hostname)
+static CURLUcode hostname_check(struct Curl_URL *u, char *hostname,
+                                size_t hlen) /* length of hostname */
 {
   size_t len;
-  size_t hlen = strlen(hostname);
+  DEBUGASSERT(hostname);
 
-  if(hostname[0] == '[') {
+  if(!hostname[0])
+    return CURLUE_NO_HOST;
+  else if(hostname[0] == '[') {
     const char *l = "0123456789abcdefABCDEF:.";
     if(hlen < 4) /* '[::]' is the shortest possible valid string */
       return CURLUE_BAD_IPV6;
@@ -692,8 +634,6 @@ static CURLUcode hostname_check(struct Curl_URL *u, char *hostname)
       /* hostname with bad content */
       return CURLUE_BAD_HOSTNAME;
   }
-  if(!hostname[0])
-    return CURLUE_NO_HOST;
   return CURLUE_OK;
 }
 
@@ -787,79 +727,231 @@ static bool ipv4_normalize(const char *hostname, char *outp, size_t olen)
   return TRUE;
 }
 
-/* return strdup'ed version in 'outp', possibly percent decoded */
-static CURLUcode decode_host(char *hostname, char **outp)
+/* if necessary, replace the host content with a URL decoded version */
+static CURLUcode decode_host(struct dynbuf *host)
 {
   char *per = NULL;
-  if(hostname[0] != '[')
+  const char *hostname = Curl_dyn_ptr(host);
+  if(hostname[0] == '[')
     /* only decode if not an ipv6 numerical */
-    per = strchr(hostname, '%');
-  if(!per) {
-    *outp = strdup(hostname);
-    if(!*outp)
-      return CURLUE_OUT_OF_MEMORY;
-  }
+    return CURLUE_OK;
+  per = strchr(hostname, '%');
+  if(!per)
+    /* nothing to decode */
+    return CURLUE_OK;
   else {
-    /* might be encoded */
+    /* encoded */
     size_t dlen;
-    CURLcode result = Curl_urldecode(hostname, 0, outp, &dlen, REJECT_CTRL);
+    char *decoded;
+    CURLcode result = Curl_urldecode(hostname, 0, &decoded, &dlen,
+                                     REJECT_CTRL);
     if(result)
       return CURLUE_BAD_HOSTNAME;
+    Curl_dyn_reset(host);
+    result = Curl_dyn_addn(host, decoded, dlen);
+    free(decoded);
+    if(result)
+      return CURLUE_OUT_OF_MEMORY;
   }
 
   return CURLUE_OK;
 }
 
-static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
+/*
+ * "Remove Dot Segments"
+ * https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
+ */
+
+/*
+ * dedotdotify()
+ * @unittest: 1395
+ *
+ * This function gets a null-terminated path with dot and dotdot sequences
+ * passed in and strips them off according to the rules in RFC 3986 section
+ * 5.2.4.
+ *
+ * The function handles a query part ('?' + stuff) appended but it expects
+ * that fragments ('#' + stuff) have already been cut off.
+ *
+ * Note that this funciton *writes* a byte into the source buffer during its
+ * operation.
+ *
+ * RETURNS
+ *
+ * an allocated dedotdotified output string
+ */
+UNITTEST char *dedotdotify(const char *input, size_t clen);
+UNITTEST char *dedotdotify(const char *input, size_t clen)
 {
-  char *path;
-  bool path_alloced = FALSE;
+  char *out = malloc(clen + 1);
+  char *outptr;
+  const char *orginput = input;
+  char *queryp;
+  if(!out)
+    return NULL; /* out of memory */
+
+  *out = 0; /* null-terminates, for inputs like "./" */
+  outptr = out;
+
+  if(!*input)
+    /* zero length input string, return that */
+    return out;
+
+  /*
+   * To handle query-parts properly, we must find it and remove it during the
+   * dotdot-operation and then append it again at the end to the output
+   * string.
+   */
+  queryp = strchr(input, '?');
+
+  do {
+    bool dotdot = TRUE;
+    if(*input == '.') {
+      /*  A.  If the input buffer begins with a prefix of "../" or "./", then
+          remove that prefix from the input buffer; otherwise, */
+
+      if(!strncmp("./", input, 2)) {
+        input += 2;
+        clen -= 2;
+      }
+      else if(!strncmp("../", input, 3)) {
+        input += 3;
+        clen -= 3;
+      }
+      /*  D.  if the input buffer consists only of "." or "..", then remove
+          that from the input buffer; otherwise, */
+
+      else if(!strcmp(".", input) || !strcmp("..", input) ||
+              !strncmp(".?", input, 2) || !strncmp("..?", input, 3)) {
+        *out = 0;
+        break;
+      }
+      else
+        dotdot = FALSE;
+    }
+    else if(*input == '/') {
+      /*  B.  if the input buffer begins with a prefix of "/./" or "/.", where
+          "."  is a complete path segment, then replace that prefix with "/" in
+          the input buffer; otherwise, */
+      if(!strncmp("/./", input, 3)) {
+        input += 2;
+        clen -= 2;
+      }
+      else if(!strcmp("/.", input) || !strncmp("/.?", input, 3)) {
+        *outptr++ = '/';
+        *outptr = 0;
+        break;
+      }
+
+      /*  C.  if the input buffer begins with a prefix of "/../" or "/..",
+          where ".." is a complete path segment, then replace that prefix with
+          "/" in the input buffer and remove the last segment and its
+          preceding "/" (if any) from the output buffer; otherwise, */
+
+      else if(!strncmp("/../", input, 4)) {
+        input += 3;
+        clen -= 3;
+        /* remove the last segment from the output buffer */
+        while(outptr > out) {
+          outptr--;
+          if(*outptr == '/')
+            break;
+        }
+        *outptr = 0; /* null-terminate where it stops */
+      }
+      else if(!strcmp("/..", input) || !strncmp("/..?", input, 4)) {
+        /* remove the last segment from the output buffer */
+        while(outptr > out) {
+          outptr--;
+          if(*outptr == '/')
+            break;
+        }
+        *outptr++ = '/';
+        *outptr = 0; /* null-terminate where it stops */
+        break;
+      }
+      else
+        dotdot = FALSE;
+    }
+    else
+      dotdot = FALSE;
+
+    if(!dotdot) {
+      /*  E.  move the first path segment in the input buffer to the end of
+          the output buffer, including the initial "/" character (if any) and
+          any subsequent characters up to, but not including, the next "/"
+          character or the end of the input buffer. */
+
+      do {
+        *outptr++ = *input++;
+        clen--;
+      } while(*input && (*input != '/') && (*input != '?'));
+      *outptr = 0;
+    }
+
+    /* continue until end of input string OR, if there is a terminating
+       query part, stop there */
+  } while(*input && (!queryp || (input < queryp)));
+
+  if(queryp) {
+    size_t qlen;
+    /* There was a query part, append that to the output. */
+    size_t oindex = queryp - orginput;
+    qlen = strlen(&orginput[oindex]);
+    memcpy(outptr, &orginput[oindex], qlen + 1); /* include zero byte */
+  }
+
+  return out;
+}
+
+static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
+{
+  const char *path;
+  size_t pathlen;
   bool uncpath = FALSE;
-  char *hostname;
   char *query = NULL;
   char *fragment = NULL;
-  CURLUcode result;
-  bool url_has_scheme = FALSE;
   char schemebuf[MAX_SCHEME_LEN + 1];
   const char *schemep = NULL;
   size_t schemelen = 0;
   size_t urllen;
+  CURLUcode result = CURLUE_OK;
+  size_t fraglen = 0;
+  struct dynbuf host;
 
   DEBUGASSERT(url);
 
+  Curl_dyn_init(&host, CURL_MAX_INPUT_LENGTH);
+
   /*************************************************************
    * Parse the URL.
    ************************************************************/
   /* allocate scratch area */
   urllen = strlen(url);
-  if(urllen > CURL_MAX_INPUT_LENGTH)
+  if(urllen > CURL_MAX_INPUT_LENGTH) {
     /* excessive input length */
-    return CURLUE_MALFORMED_INPUT;
-
-  path = u->scratch = malloc(urllen * 2 + 2);
-  if(!path)
-    return CURLUE_OUT_OF_MEMORY;
-
-  hostname = &path[urllen + 1];
-  hostname[0] = 0;
-
-  if(Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf))) {
-    url_has_scheme = TRUE;
-    schemelen = strlen(schemebuf);
+    result = CURLUE_MALFORMED_INPUT;
+    goto fail;
   }
 
+  schemelen = Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf));
+
   /* handle the file: scheme */
-  if(url_has_scheme && !strcmp(schemebuf, "file")) {
-    if(urllen <= 6)
+  if(schemelen && !strcmp(schemebuf, "file")) {
+    if(urllen <= 6) {
       /* file:/ is not enough to actually be a complete file: URL */
-      return CURLUE_BAD_FILE_URL;
+      result = CURLUE_BAD_FILE_URL;
+      goto fail;
+    }
 
     /* path has been allocated large enough to hold this */
-    strcpy(path, &url[5]);
+    path = (char *)&url[5];
 
-    u->scheme = strdup("file");
-    if(!u->scheme)
-      return CURLUE_OUT_OF_MEMORY;
+    schemep = u->scheme = strdup("file");
+    if(!u->scheme) {
+      result = CURLUE_OUT_OF_MEMORY;
+      goto fail;
+    }
 
     /* Extra handling URLs with an authority component (i.e. that start with
      * "file://")
@@ -869,7 +961,7 @@ static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
      */
     if(path[0] == '/' && path[1] == '/') {
       /* swallow the two slashes */
-      char *ptr = &path[2];
+      const char *ptr = &path[2];
 
       /*
        * According to RFC 8089, a file: URL can be reliably dereferenced if:
@@ -905,13 +997,17 @@ static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
              chars, and the delimiting slash character must be appended to the
              host name */
           path = strpbrk(ptr, "/\\:*?\"<>|");
-          if(!path || *path != '/')
-            return CURLUE_BAD_FILE_URL;
+          if(!path || *path != '/') {
+            result = CURLUE_BAD_FILE_URL;
+            goto fail;
+          }
 
           len = path - ptr;
           if(len) {
-            memcpy(hostname, ptr, len);
-            hostname[len] = 0;
+            if(Curl_dyn_addn(&host, ptr, len)) {
+              result = CURLUE_OUT_OF_MEMORY;
+              goto fail;
+            }
             uncpath = TRUE;
           }
 
@@ -919,7 +1015,8 @@ static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
 #else
           /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
              none */
-          return CURLUE_BAD_FILE_URL;
+          result = CURLUE_BAD_FILE_URL;
+          goto fail;
 #endif
         }
       }
@@ -928,7 +1025,8 @@ static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
     }
 
     if(!uncpath)
-        hostname = NULL; /* no host for file: URLs by default */
+      /* no host for file: URLs by default */
+      Curl_dyn_reset(&host);
 
 #if !defined(MSDOS) && !defined(WIN32) && !defined(__CYGWIN__)
     /* Don't allow Windows drive letters when not in Windows.
@@ -936,13 +1034,14 @@ static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
     if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
        STARTS_WITH_URL_DRIVE_PREFIX(path)) {
       /* File drive letters are only accepted in MSDOS/Windows */
-      return CURLUE_BAD_FILE_URL;
+      result = CURLUE_BAD_FILE_URL;
+      goto fail;
     }
 #else
     /* If the path starts with a slash and a drive letter, ditch the slash */
     if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
       /* This cannot be done with strcpy, as the memory chunks overlap! */
-      memmove(path, &path[1], strlen(&path[1]) + 1);
+      path++;
     }
 #endif
 
@@ -952,32 +1051,39 @@ static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
     const char *p;
     const char *hostp;
     size_t len;
-    path[0] = 0;
 
-    if(url_has_scheme) {
+    if(schemelen) {
       int i = 0;
       p = &url[schemelen + 1];
       while(p && (*p == '/') && (i < 4)) {
         p++;
         i++;
       }
-      if((i < 1) || (i>3))
+      if((i < 1) || (i>3)) {
         /* less than one or more than three slashes */
-        return CURLUE_BAD_SLASHES;
+        result = CURLUE_BAD_SLASHES;
+        goto fail;
+      }
 
       schemep = schemebuf;
       if(!Curl_builtin_scheme(schemep) &&
-         !(flags & CURLU_NON_SUPPORT_SCHEME))
-        return CURLUE_UNSUPPORTED_SCHEME;
+         !(flags & CURLU_NON_SUPPORT_SCHEME)) {
+        result = CURLUE_UNSUPPORTED_SCHEME;
+        goto fail;
+      }
 
-      if(junkscan(schemep, flags))
-        return CURLUE_BAD_SCHEME;
+      if(junkscan(schemep, flags)) {
+        result = CURLUE_BAD_SCHEME;
+        goto fail;
+      }
     }
     else {
       /* no scheme! */
 
-      if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME)))
-        return CURLUE_BAD_SCHEME;
+      if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME))) {
+        result = CURLUE_BAD_SCHEME;
+        goto fail;
+      }
       if(flags & CURLU_DEFAULT_SCHEME)
         schemep = DEFAULT_SCHEME;
 
@@ -994,122 +1100,156 @@ static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
 
     len = p - hostp;
     if(len) {
-      memcpy(hostname, hostp, len);
-      hostname[len] = 0;
+      if(Curl_dyn_addn(&host, hostp, len)) {
+        result = CURLUE_OUT_OF_MEMORY;
+        goto fail;
+      }
     }
     else {
-      if(!(flags & CURLU_NO_AUTHORITY))
-        return CURLUE_NO_HOST;
+      if(!(flags & CURLU_NO_AUTHORITY)) {
+        result = CURLUE_NO_HOST;
+        goto fail;
+      }
     }
 
-    strcpy(path, p);
+    path = (char *)p;
 
     if(schemep) {
       u->scheme = strdup(schemep);
-      if(!u->scheme)
-        return CURLUE_OUT_OF_MEMORY;
+      if(!u->scheme) {
+        result = CURLUE_OUT_OF_MEMORY;
+        goto fail;
+      }
     }
   }
 
-  if((flags & CURLU_URLENCODE) && path[0]) {
-    /* worst case output length is 3x the original! */
-    char *newp = malloc(strlen(path) * 3);
-    if(!newp)
-      return CURLUE_OUT_OF_MEMORY;
-    path_alloced = TRUE;
-    strcpy_url(newp, path, TRUE); /* consider it relative */
-    u->temppath = path = newp;
+  if(*path && (flags & CURLU_URLENCODE)) {
+    struct dynbuf enc;
+    Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
+    if(strcpy_url(&enc, path, TRUE)) { /* consider it relative */
+      result = CURLUE_OUT_OF_MEMORY;
+      goto fail;
+    }
+    path = u->path = Curl_dyn_ptr(&enc);
   }
 
   fragment = strchr(path, '#');
   if(fragment) {
-    *fragment++ = 0;
-    if(junkscan(fragment, flags))
-      return CURLUE_BAD_FRAGMENT;
-    if(fragment[0]) {
-      u->fragment = strdup(fragment);
-      if(!u->fragment)
-        return CURLUE_OUT_OF_MEMORY;
+    fraglen = strlen(fragment);
+    if(fraglen > 1) {
+      /* skip the leading '#' in the copy but include the terminating null */
+      u->fragment = Curl_memdup(fragment + 1, fraglen);
+      if(!u->fragment) {
+        result = CURLUE_OUT_OF_MEMORY;
+        goto fail;
+      }
+
+      if(junkscan(u->fragment, flags)) {
+        result = CURLUE_BAD_FRAGMENT;
+        goto fail;
+      }
     }
   }
 
   query = strchr(path, '?');
-  if(query) {
-    *query++ = 0;
-    if(junkscan(query, flags))
-      return CURLUE_BAD_QUERY;
-    /* done even if the query part is a blank string */
-    u->query = strdup(query);
-    if(!u->query)
-      return CURLUE_OUT_OF_MEMORY;
-  }
+  if(query && (!fragment || (query < fragment))) {
+    size_t qlen = strlen(query) - fraglen; /* includes '?' */
+    pathlen = strlen(path) - qlen - fraglen;
+    if(qlen > 1) {
+      u->query = Curl_memdup(query + 1, qlen);
+      if(!u->query) {
+        result = CURLUE_OUT_OF_MEMORY;
+        goto fail;
+      }
+      u->query[qlen - 1] = 0;
 
-  if(junkscan(path, flags))
-    return CURLUE_BAD_PATH;
+      if(junkscan(u->query, flags)) {
+        result = CURLUE_BAD_QUERY;
+        goto fail;
+      }
+    }
+    else {
+      /* single byte query */
+      u->query = strdup("");
+      if(!u->query) {
+        result = CURLUE_OUT_OF_MEMORY;
+        goto fail;
+      }
+    }
+  }
+  else
+    pathlen = strlen(path) - fraglen;
 
-  if(!path[0])
-    /* if there's no path left set, unset */
+  if(!pathlen) {
+    /* there is no path left, unset */
     path = NULL;
+  }
   else {
+    if(!u->path) {
+      u->path = Curl_memdup(path, pathlen + 1);
+      if(!u->path) {
+        result = CURLUE_OUT_OF_MEMORY;
+        goto fail;
+      }
+      u->path[pathlen] = 0;
+      path = u->path;
+    }
+    else if(flags & CURLU_URLENCODE)
+      /* it might have encoded more than just the path so cut it */
+      u->path[pathlen] = 0;
+
+    if(junkscan(u->path, flags)) {
+      result = CURLUE_BAD_PATH;
+      goto fail;
+    }
+
     if(!(flags & CURLU_PATH_AS_IS)) {
       /* remove ../ and ./ sequences according to RFC3986 */
-      char *newp = Curl_dedotdotify(path);
-      if(!newp)
-        return CURLUE_OUT_OF_MEMORY;
-
-      if(strcmp(newp, path)) {
-        /* if we got a new version */
-        if(path_alloced)
-          Curl_safefree(u->temppath);
-        u->temppath = path = newp;
-        path_alloced = TRUE;
+      char *newp = dedotdotify((char *)path, pathlen);
+      if(!newp) {
+        result = CURLUE_OUT_OF_MEMORY;
+        goto fail;
       }
-      else
-        free(newp);
+      free(u->path);
+      u->path = newp;
     }
-
-    u->path = path_alloced?path:strdup(path);
-    if(!u->path)
-      return CURLUE_OUT_OF_MEMORY;
-    u->temppath = NULL; /* used now */
   }
 
-  if(hostname) {
+  if(Curl_dyn_len(&host)) {
     char normalized_ipv4[sizeof("255.255.255.255") + 1];
 
     /*
      * Parse the login details and strip them out of the host name.
      */
-    result = parse_hostname_login(u, &hostname, flags);
-    if(result)
-      return result;
-
-    result = Curl_parse_port(u, hostname, url_has_scheme);
+    result = parse_hostname_login(u, &host, flags);
+    if(!result)
+      result = Curl_parse_port(u, &host, schemelen);
     if(result)
-      return result;
+      goto fail;
 
-    if(junkscan(hostname, flags))
-      return CURLUE_BAD_HOSTNAME;
+    if(junkscan(Curl_dyn_ptr(&host), flags)) {
+      result = CURLUE_BAD_HOSTNAME;
+      goto fail;
+    }
 
-    if(0 == strlen(hostname) && (flags & CURLU_NO_AUTHORITY)) {
-      /* Skip hostname check, it's allowed to be empty. */
-      u->host = strdup("");
+    if(ipv4_normalize(Curl_dyn_ptr(&host),
+                      normalized_ipv4, sizeof(normalized_ipv4))) {
+      Curl_dyn_reset(&host);
+      if(Curl_dyn_add(&host, normalized_ipv4)) {
+        result = CURLUE_OUT_OF_MEMORY;
+        goto fail;
+      }
     }
     else {
-      if(ipv4_normalize(hostname, normalized_ipv4, sizeof(normalized_ipv4)))
-        u->host = strdup(normalized_ipv4);
-      else {
-        result = decode_host(hostname, &u->host);
-        if(result)
-          return result;
-        result = hostname_check(u, u->host);
-        if(result)
-          return result;
-      }
+      result = decode_host(&host);
+      if(!result)
+        result = hostname_check(u, Curl_dyn_ptr(&host), Curl_dyn_len(&host));
+      if(result)
+        goto fail;
     }
-    if(!u->host)
-      return CURLUE_OUT_OF_MEMORY;
+
     if((flags & CURLU_GUESS_SCHEME) && !schemep) {
+      const char *hostname = Curl_dyn_ptr(&host);
       /* legacy curl-style guess based on host name */
       if(checkprefix("ftp.", hostname))
         schemep = "ftp";
@@ -1127,27 +1267,26 @@ static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
         schemep = "http";
 
       u->scheme = strdup(schemep);
-      if(!u->scheme)
-        return CURLUE_OUT_OF_MEMORY;
+      if(!u->scheme) {
+        result = CURLUE_OUT_OF_MEMORY;
+        goto fail;
+      }
+    }
+  }
+  else if(flags & CURLU_NO_AUTHORITY) {
+    /* allowed to be empty. */
+    if(Curl_dyn_add(&host, "")) {
+      result = CURLUE_OUT_OF_MEMORY;
+      goto fail;
     }
   }
 
-  Curl_safefree(u->scratch);
-  Curl_safefree(u->temppath);
+  u->host = Curl_dyn_ptr(&host);
 
-  return CURLUE_OK;
-}
-
-/*
- * Parse the URL and set the relevant members of the Curl_URL struct.
- */
-static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
-{
-  CURLUcode result = seturl(url, u, flags);
-  if(result) {
-    free_urlhandle(u);
-    memset(u, 0, sizeof(struct Curl_URL));
-  }
+  return result;
+  fail:
+  Curl_dyn_free(&host);
+  free_urlhandle(u);
   return result;
 }
 
@@ -1165,8 +1304,6 @@ static CURLUcode parseurl_and_replace(const char *url, CURLU *u,
     free_urlhandle(u);
     *u = tmpurl;
   }
-  else
-    free_urlhandle(&tmpurl);
   return result;
 }
 
@@ -1344,14 +1481,13 @@ CURLUcode curl_url_get(CURLU *u, CURLUPart what,
       if(u->host[0] == '[') {
         if(u->zoneid) {
           /* make it '[ host %25 zoneid ]' */
+          struct dynbuf enc;
           size_t hostlen = strlen(u->host);
-          size_t alen = hostlen + 3 + strlen(u->zoneid) + 1;
-          allochost = malloc(alen);
-          if(!allochost)
+          Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
+          if(Curl_dyn_addf(&enc, "%.*s%%25%s]", (int)hostlen - 1, u->host,
+                           u->zoneid))
             return CURLUE_OUT_OF_MEMORY;
-          memcpy(allochost, u->host, hostlen - 1);
-          msnprintf(&allochost[hostlen - 1], alen - hostlen + 1,
-                    "%%25%s]", u->zoneid);
+          allochost = Curl_dyn_ptr(&enc);
         }
       }
       else if(urlencode) {
@@ -1362,32 +1498,32 @@ CURLUcode curl_url_get(CURLU *u, CURLUPart what,
       else {
         /* only encode '%' in output host name */
         char *host = u->host;
-        size_t pcount = 0;
+        bool percent = FALSE;
         /* first, count number of percents present in the name */
         while(*host) {
-          if(*host == '%')
-            pcount++;
+          if(*host == '%') {
+            percent = TRUE;
+            break;
+          }
           host++;
         }
-        /* if there were percents, encode the host name */
-        if(pcount) {
-          size_t hostlen = strlen(u->host);
-          size_t alen = hostlen + 2 * pcount + 1;
-          char *o = allochost = malloc(alen);
-          if(!allochost)
-            return CURLUE_OUT_OF_MEMORY;
-
+        /* if there were percent(s), encode the host name */
+        if(percent) {
+          struct dynbuf enc;
+          CURLcode result;
+          Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
           host = u->host;
           while(*host) {
-            if(*host == '%') {
-              memcpy(o, "%25", 3);
-              o += 3;
-              host++;
-              continue;
-            }
-            *o++ = *host++;
+            if(*host == '%')
+              result = Curl_dyn_addn(&enc, "%25", 3);
+            else
+              result = Curl_dyn_addn(&enc, host, 1);
+            if(result)
+              return CURLUE_OUT_OF_MEMORY;
+            host++;
           }
-          *o = '\0';
+          free(u->host);
+          u->host = Curl_dyn_ptr(&enc);
         }
       }
 
@@ -1445,16 +1581,12 @@ CURLUcode curl_url_get(CURLU *u, CURLUPart what,
       *part = decoded;
     }
     if(urlencode) {
-      /* worst case output length is 3x the original! */
-      char *newp = malloc(strlen(*part) * 3);
-      if(!newp)
+      struct dynbuf enc;
+      Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
+      if(strcpy_url(&enc, *part, TRUE)) /* consider it relative */
         return CURLUE_OUT_OF_MEMORY;
-      if(strcpy_url(newp, *part, TRUE)) { /* consider it relative */
-        free(*part);
-        *part = newp;
-      }
-      else
-        free(newp);
+      free(*part);
+      *part = Curl_dyn_ptr(&enc);
     }
 
     return CURLUE_OK;
@@ -1628,14 +1760,16 @@ CURLUcode curl_url_set(CURLU *u, CURLUPart what,
 
     if(urlencode) {
       const unsigned char *i;
-      char *o;
-      char *enc = malloc(nalloc * 3 + 1); /* for worst case! */
-      if(!enc)
-        return CURLUE_OUT_OF_MEMORY;
-      for(i = (const unsigned char *)part, o = enc; *i; i++) {
+      struct dynbuf enc;
+
+      Curl_dyn_init(&enc, nalloc * 3 + 1);
+
+      for(i = (const unsigned char *)part; *i; i++) {
+        CURLcode result;
         if((*i == ' ') && plusencode) {
-          *o = '+';
-          o++;
+          result = Curl_dyn_addn(&enc, "+", 1);
+          if(result)
+            return CURLUE_OUT_OF_MEMORY;
         }
         else if(Curl_isunreserved(*i) ||
                 ((*i == '/') && urlskipslash) ||
@@ -1643,16 +1777,17 @@ CURLUcode curl_url_set(CURLU *u, CURLUPart what,
           if((*i == '=') && equalsencode)
             /* only skip the first equals sign */
             equalsencode = FALSE;
-          *o = *i;
-          o++;
+          result = Curl_dyn_addn(&enc, i, 1);
+          if(result)
+            return CURLUE_OUT_OF_MEMORY;
         }
         else {
-          msnprintf(o, 4, "%%%02x", *i);
-          o += 3;
+          result = Curl_dyn_addf(&enc, "%%%02x", *i);
+          if(result)
+            return CURLUE_OUT_OF_MEMORY;
         }
       }
-      *o = 0; /* null-terminate */
-      newp = enc;
+      newp = Curl_dyn_ptr(&enc);
     }
     else {
       char *p;
@@ -1674,34 +1809,41 @@ CURLUcode curl_url_set(CURLU *u, CURLUPart what,
     }
 
     if(appendquery) {
-      /* Append the string onto the old query. Add a '&' separator if none is
-         present at the end of the existing query already */
+      /* Append the 'newp' string onto the old query. Add a '&' separator if
+         none is present at the end of the existing query already */
+
       size_t querylen = u->query ? strlen(u->query) : 0;
       bool addamperand = querylen && (u->query[querylen -1] != '&');
       if(querylen) {
-        size_t newplen = strlen(newp);
-        char *p = malloc(querylen + addamperand + newplen + 1);
-        if(!p) {
-          free((char *)newp);
-          return CURLUE_OUT_OF_MEMORY;
+        struct dynbuf enc;
+        Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
+
+        if(Curl_dyn_addn(&enc, u->query, querylen)) /* add original query */
+          goto nomem;
+
+        if(addamperand) {
+          if(Curl_dyn_addn(&enc, "&", 1))
+            goto nomem;
         }
-        strcpy(p, u->query); /* original query */
-        if(addamperand)
-          p[querylen] = '&'; /* ampersand */
-        strcpy(&p[querylen + addamperand], newp); /* new suffix */
+        if(Curl_dyn_add(&enc, newp))
+          goto nomem;
         free((char *)newp);
         free(*storep);
-        *storep = p;
+        *storep = Curl_dyn_ptr(&enc);
         return CURLUE_OK;
+        nomem:
+        free((char *)newp);
+        return CURLUE_OUT_OF_MEMORY;
       }
     }
 
     if(what == CURLUPART_HOST) {
-      if(0 == strlen(newp) && (flags & CURLU_NO_AUTHORITY)) {
+      size_t n = strlen(newp);
+      if(!n && (flags & CURLU_NO_AUTHORITY)) {
         /* Skip hostname check, it's allowed to be empty. */
       }
       else {
-        if(hostname_check(u, (char *)newp)) {
+        if(hostname_check(u, (char *)newp, n)) {
           free((char *)newp);
           return CURLUE_BAD_HOSTNAME;
         }
author	Daniel Stenberg <daniel@haxx.se>	2022-09-01 10:16:24 +0200
committer	Daniel Stenberg <daniel@haxx.se>	2022-09-07 10:21:45 +0200
commit	f703cf971ccdb93fea05f223cabbb99ca06cdc75 (patch)
tree	fae9b603ebdc5cb79dc6aaed3b49e8744cd57969 /lib/urlapi.c
parent	2ae81e680b8d6f8b9eb36fd1a2e223783526dbdd (diff)
download	curl-f703cf971ccdb93fea05f223cabbb99ca06cdc75.tar.gz