diff options
author | Daniel Stenberg <daniel@haxx.se> | 2022-12-26 10:58:37 +0100 |
---|---|---|
committer | Daniel Stenberg <daniel@haxx.se> | 2022-12-26 23:29:23 +0100 |
commit | 901392cbb7939f43b7e7ed2b39135b45416a2714 (patch) | |
tree | fd74a1a6370ba56fcd76cd15cb052d352fbcb076 | |
parent | cf174810db32c362775349e9afe5543c0e9cc18b (diff) | |
download | curl-901392cbb7939f43b7e7ed2b39135b45416a2714.tar.gz |
urlapi: add CURLU_PUNYCODE
Allows curl_url_get() to get the punycode version of host names for the
host name and URL parts.
Extend test 1560 to verify.
Closes #10109
-rw-r--r-- | .github/scripts/spellcheck.words | 1 | ||||
-rw-r--r-- | docs/libcurl/curl_url_get.3 | 11 | ||||
-rw-r--r-- | docs/libcurl/symbols-in-versions | 2 | ||||
-rw-r--r-- | include/curl/urlapi.h | 2 | ||||
-rw-r--r-- | lib/idn.c | 30 | ||||
-rw-r--r-- | lib/idn.h | 8 | ||||
-rw-r--r-- | lib/strerror.c | 3 | ||||
-rw-r--r-- | lib/urlapi.c | 28 | ||||
-rw-r--r-- | tests/data/test1538 | 3 | ||||
-rw-r--r-- | tests/libtest/lib1560.c | 16 |
10 files changed, 90 insertions, 14 deletions
diff --git a/.github/scripts/spellcheck.words b/.github/scripts/spellcheck.words index 9f3906b80..ea5115be4 100644 --- a/.github/scripts/spellcheck.words +++ b/.github/scripts/spellcheck.words @@ -573,6 +573,7 @@ PSL pthreads PTR ptr +punycode py pycurl QNX diff --git a/docs/libcurl/curl_url_get.3 b/docs/libcurl/curl_url_get.3 index d86d93754..2ae21e4fd 100644 --- a/docs/libcurl/curl_url_get.3 +++ b/docs/libcurl/curl_url_get.3 @@ -76,6 +76,17 @@ typically using non-ASCII bytes that otherwise will be percent-encoded. Note that even when not asking for URL encoding, the '%' (byte 37) will be URL encoded to make sure the host name remains valid. +.IP CURLU_PUNYCODE +If set and \fICURLU_URLENCODE\fP is not set, and asked to retrieve the +\fBCURLUPART_HOST\fP or \fBCURLUPART_URL\fP parts, libcurl returns the host +name in its punycode version if it contains any non-ASCII octets (and is an +IDN name). + +If libcurl is built without IDN capabilities, using this bit will make +\fIcurl_url_get(3)\fP return \fICURLUE_LACKS_IDN\fP if the host name contains +anything outside the ASCII range. 
+ +(Added in curl 7.88.0) .SH PARTS .IP CURLUPART_URL When asked to return the full URL, \fIcurl_url_get(3)\fP will return a diff --git a/docs/libcurl/symbols-in-versions b/docs/libcurl/symbols-in-versions index 5ee245d36..6de01a08c 100644 --- a/docs/libcurl/symbols-in-versions +++ b/docs/libcurl/symbols-in-versions @@ -1055,6 +1055,7 @@ CURLU_NO_AUTHORITY 7.67.0 CURLU_NO_DEFAULT_PORT 7.62.0 CURLU_NON_SUPPORT_SCHEME 7.62.0 CURLU_PATH_AS_IS 7.62.0 +CURLU_PUNYCODE 7.88.0 CURLU_URLDECODE 7.62.0 CURLU_URLENCODE 7.62.0 CURLUE_BAD_FILE_URL 7.81.0 @@ -1071,6 +1072,7 @@ CURLUE_BAD_QUERY 7.81.0 CURLUE_BAD_SCHEME 7.81.0 CURLUE_BAD_SLASHES 7.81.0 CURLUE_BAD_USER 7.81.0 +CURLUE_LACKS_IDN 7.88.0 CURLUE_MALFORMED_INPUT 7.62.0 CURLUE_NO_FRAGMENT 7.62.0 CURLUE_NO_HOST 7.62.0 diff --git a/include/curl/urlapi.h b/include/curl/urlapi.h index e15c213cc..9e21df165 100644 --- a/include/curl/urlapi.h +++ b/include/curl/urlapi.h @@ -62,6 +62,7 @@ typedef enum { CURLUE_BAD_SCHEME, /* 27 */ CURLUE_BAD_SLASHES, /* 28 */ CURLUE_BAD_USER, /* 29 */ + CURLUE_LACKS_IDN, /* 30 */ CURLUE_LAST } CURLUcode; @@ -95,6 +96,7 @@ typedef enum { #define CURLU_NO_AUTHORITY (1<<10) /* Allow empty authority when the scheme is unknown. */ #define CURLU_ALLOW_SPACE (1<<11) /* Allow spaces in the URL */ +#define CURLU_PUNYCODE (1<<12) /* get the host name in pynycode */ typedef struct Curl_URL CURLU; @@ -116,7 +116,7 @@ bool Curl_is_ASCII_name(const char *hostname) * Curl_idn_decode() returns an allocated IDN decoded string if it was * possible. NULL on error. 
*/ -static char *Curl_idn_decode(const char *input) +static char *idn_decode(const char *input) { char *decoded = NULL; #ifdef USE_LIBIDN2 @@ -144,24 +144,29 @@ static char *Curl_idn_decode(const char *input) return decoded; } +char *Curl_idn_decode(const char *input) +{ + char *d = idn_decode(input); +#ifdef USE_LIBIDN2 + if(d) { + char *c = strdup(d); + idn2_free(d); + d = c; + } +#endif + return d; +} + /* * Frees data allocated by idnconvert_hostname() */ void Curl_free_idnconverted_hostname(struct hostname *host) { -#if defined(USE_LIBIDN2) if(host->encalloc) { - idn2_free(host->encalloc); /* must be freed with idn2_free() since this was - allocated by libidn */ + /* must be freed with idn2_free() if allocated by libidn */ + Curl_idn_free(host->encalloc); host->encalloc = NULL; } -#elif defined(USE_WIN32_IDN) - free(host->encalloc); /* must be freed with free() since this was - allocated by Curl_win32_idn_to_ascii */ - host->encalloc = NULL; -#else - (void)host; -#endif } #endif /* USE_IDN */ @@ -177,7 +182,7 @@ CURLcode Curl_idnconvert_hostname(struct hostname *host) #ifdef USE_IDN /* Check name for non-ASCII and convert hostname if we can */ if(!Curl_is_ASCII_name(host->name)) { - char *decoded = Curl_idn_decode(host->name); + char *decoded = idn_decode(host->name); if(decoded) { /* successful */ host->encalloc = decoded; @@ -190,4 +195,3 @@ CURLcode Curl_idnconvert_hostname(struct hostname *host) #endif return CURLE_OK; } - @@ -32,7 +32,15 @@ CURLcode Curl_idnconvert_hostname(struct hostname *host); #if defined(USE_LIBIDN2) || defined(USE_WIN32_IDN) #define USE_IDN void Curl_free_idnconverted_hostname(struct hostname *host); +char *Curl_idn_decode(const char *input); +#ifdef USE_LIBIDN2 +#define Curl_idn_free(x) idn2_free(x) +#else +#define Curl_idn_free(x) free(x) +#endif + #else #define Curl_free_idnconverted_hostname(x) +#define Curl_idn_decode(x) NULL #endif #endif /* HEADER_CURL_IDN_H */ diff --git a/lib/strerror.c b/lib/strerror.c index 
b9a51e26b..3d7193509 100644 --- a/lib/strerror.c +++ b/lib/strerror.c @@ -550,6 +550,9 @@ curl_url_strerror(CURLUcode error) case CURLUE_BAD_USER: return "Bad user"; + case CURLUE_LACKS_IDN: + return "libcurl lacks IDN support"; + case CURLUE_LAST: break; } diff --git a/lib/urlapi.c b/lib/urlapi.c index b96af35ad..29238c7e3 100644 --- a/lib/urlapi.c +++ b/lib/urlapi.c @@ -33,6 +33,7 @@ #include "inet_pton.h" #include "inet_ntop.h" #include "strdup.h" +#include "idn.h" /* The last 3 #include files should be in this order */ #include "curl_printf.h" @@ -1379,6 +1380,7 @@ CURLUcode curl_url_get(CURLU *u, CURLUPart what, char portbuf[7]; bool urldecode = (flags & CURLU_URLDECODE)?1:0; bool urlencode = (flags & CURLU_URLENCODE)?1:0; + bool punycode = FALSE; bool plusdecode = FALSE; (void)flags; if(!u) @@ -1408,6 +1410,7 @@ CURLUcode curl_url_get(CURLU *u, CURLUPart what, case CURLUPART_HOST: ptr = u->host; ifmissing = CURLUE_NO_HOST; + punycode = (flags & CURLU_PUNYCODE)?1:0; break; case CURLUPART_ZONEID: ptr = u->zoneid; @@ -1460,6 +1463,7 @@ CURLUcode curl_url_get(CURLU *u, CURLUPart what, char *options = u->options; char *port = u->port; char *allochost = NULL; + punycode = (flags & CURLU_PUNYCODE)?1:0; if(u->scheme && strcasecompare("file", u->scheme)) { url = aprintf("file://%s%s%s", u->path, @@ -1514,6 +1518,17 @@ CURLUcode curl_url_get(CURLU *u, CURLUPart what, if(!allochost) return CURLUE_OUT_OF_MEMORY; } + else if(punycode) { + if(!Curl_is_ASCII_name(u->host)) { +#ifndef USE_IDN + return CURLUE_LACKS_IDN; +#else + allochost = Curl_idn_decode(u->host); + if(!allochost) + return CURLUE_OUT_OF_MEMORY; +#endif + } + } else { /* only encode '%' in output host name */ char *host = u->host; @@ -1611,6 +1626,19 @@ CURLUcode curl_url_get(CURLU *u, CURLUPart what, free(*part); *part = Curl_dyn_ptr(&enc); } + else if(punycode) { + if(!Curl_is_ASCII_name(u->host)) { +#ifndef USE_IDN + return CURLUE_LACKS_IDN; +#else + char *allochost = Curl_idn_decode(*part); + 
if(!allochost) + return CURLUE_OUT_OF_MEMORY; + free(*part); + *part = allochost; +#endif + } + } return CURLUE_OK; } diff --git a/tests/data/test1538 b/tests/data/test1538 index ba5cf29ce..59cd1628e 100644 --- a/tests/data/test1538 +++ b/tests/data/test1538 @@ -185,7 +185,8 @@ u26: Bad query u27: Bad scheme u28: Unsupported number of slashes following scheme u29: Bad user -u30: CURLUcode unknown +u30: libcurl lacks IDN support +u31: CURLUcode unknown </stdout> </verify> diff --git a/tests/libtest/lib1560.c b/tests/libtest/lib1560.c index 570e5082d..f28bdf111 100644 --- a/tests/libtest/lib1560.c +++ b/tests/libtest/lib1560.c @@ -31,6 +31,9 @@ */ #include "test.h" +#if defined(USE_LIBIDN2) || defined(USE_WIN32_IDN) +#define USE_IDN +#endif #include "testutil.h" #include "warnless.h" @@ -138,6 +141,15 @@ struct clearurlcase { }; static const struct testcase get_parts_list[] ={ +#ifdef USE_IDN + {"https://räksmörgås.se", + "https | [11] | [12] | [13] | xn--rksmrgs-5wao1o.se | " + "[15] | / | [16] | [17]", 0, CURLU_PUNYCODE, CURLUE_OK}, +#else + {"https://räksmörgås.se", + "https | [11] | [12] | [13] | [30] | [15] | / | [16] | [17]", + 0, CURLU_PUNYCODE, CURLUE_OK}, +#endif /* https://ℂᵤⓇℒ。𝐒🄴 */ {"https://" "%e2%84%82%e1%b5%a4%e2%93%87%e2%84%92%e3%80%82%f0%9d%90%92%f0%9f%84%b4", @@ -454,6 +466,10 @@ static const struct testcase get_parts_list[] ={ }; static const struct urltestcase get_url_list[] = { +#ifdef USE_IDN + {"https://räksmörgås.se/path?q#frag", + "https://xn--rksmrgs-5wao1o.se/path?q#frag", 0, CURLU_PUNYCODE, CURLUE_OK}, +#endif /* unsupported schemes with no guessing enabled */ {"data:text/html;charset=utf-8;base64,PCFET0NUWVBFIEhUTUw+PG1ldGEgY", "", 0, 0, CURLUE_UNSUPPORTED_SCHEME}, |