/* * utf-test.c -- test the utf functions * * ==================================================================== * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * ==================================================================== */ #include "../svn_test.h" #include "svn_utf.h" #include "svn_pools.h" #include "private/svn_string_private.h" #include "private/svn_utf_private.h" /* Random number seed. Yes, it's global, just pretend you can't see it. */ static apr_uint32_t diff_diff3_seed; /* Return the value of the current random number seed, initializing it if necessary */ static apr_uint32_t seed_val(void) { static svn_boolean_t first = TRUE; if (first) { diff_diff3_seed = (apr_uint32_t) apr_time_now(); first = FALSE; } return diff_diff3_seed; } /* Return a random number N such that MIN_VAL <= N <= MAX_VAL */ static apr_uint32_t range_rand(apr_uint32_t min_val, apr_uint32_t max_val) { apr_uint64_t diff = max_val - min_val; apr_uint64_t val = diff * svn_test_rand(&diff_diff3_seed); val /= 0xffffffff; return min_val + (apr_uint32_t) val; } /* Explicit tests of various valid/invalid sequences */ static svn_error_t * utf_validate(apr_pool_t *pool) { struct data { svn_boolean_t valid; char string[20]; } tests[] = { {TRUE, {'a', 'b', '\0'}}, {FALSE, {'a', 'b', '\x80', '\0'}}, {FALSE, {'a', 'b', '\xC0', '\0'}}, {FALSE, {'a', 'b', '\xC0', '\x81', 'x', 'y', '\0'}}, {TRUE, {'a', 'b', '\xC5', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xC5', '\xC0', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xE0', '\0'}}, {FALSE, {'a', 'b', '\xE0', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xE0', '\xA0', '\0'}}, {FALSE, {'a', 'b', '\xE0', '\xA0', 'x', 'y', '\0'}}, {TRUE, {'a', 'b', '\xE0', '\xA0', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xE0', '\x9F', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xE0', '\xCF', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xE5', '\0'}}, {FALSE, {'a', 'b', '\xE5', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xE5', '\x81', '\0'}}, {FALSE, {'a', 'b', '\xE5', '\x81', 'x', 'y', '\0'}}, {TRUE, {'a', 'b', '\xE5', '\x81', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xE5', '\xE1', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xE5', '\x81', '\xE1', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xED', '\0'}}, {FALSE, {'a', 'b', '\xED', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xED', '\x81', '\0'}}, {FALSE, {'a', 'b', '\xED', '\x81', 'x', 'y', '\0'}}, {TRUE, {'a', 'b', '\xED', '\x81', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xED', '\xA0', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xED', '\x81', '\xC1', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xEE', '\0'}}, {FALSE, {'a', 'b', '\xEE', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xEE', '\x81', '\0'}}, {FALSE, {'a', 'b', '\xEE', '\x81', 'x', 'y', '\0'}}, {TRUE, {'a', 'b', '\xEE', '\x81', '\x81', 'x', 'y', '\0'}}, {TRUE, {'a', 'b', '\xEE', '\xA0', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xEE', '\xC0', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xEE', '\x81', '\xC1', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF0', '\0'}}, {FALSE, {'a', 'b', '\xF0', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF0', '\x91', '\0'}}, {FALSE, {'a', 'b', '\xF0', '\x91', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF0', '\x91', '\x81', '\0'}}, {FALSE, {'a', 'b', '\xF0', '\x91', '\x81', 'x', 'y', '\0'}}, {TRUE, {'a', 'b', '\xF0', '\x91', '\x81', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF0', '\x81', '\x81', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF0', '\xC1', '\x81', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF0', '\x91', '\xC1', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF0', '\x91', '\x81', '\xC1', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF2', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF2', '\x91', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF2', '\x91', '\x81', 'x', 'y', '\0'}}, {TRUE, {'a', 'b', '\xF2', '\x91', '\x81', '\x81', 'x', 'y', '\0'}}, {TRUE, {'a', 'b', '\xF2', '\x81', '\x81', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF2', '\xC1', '\x81', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF2', '\x91', '\xC1', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF2', '\x91', '\x81', '\xC1', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF4', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF4', '\x91', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF4', '\x91', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF4', '\x91', '\x81', '\x81', 'x', 'y', '\0'}}, {TRUE, {'a', 'b', '\xF4', '\x81', '\x81', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF4', '\xC1', '\x81', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF4', '\x91', '\xC1', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF4', '\x91', '\x81', '\xC1', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF5', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF5', '\x81', 'x', 'y', '\0'}}, {TRUE, {'a', 'b', '\xF4', '\x81', '\x81', '\x81', 'x', 'y', 'a', 'b', '\xF2', '\x91', '\x81', '\x81', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF4', '\x81', '\x81', '\x81', 'x', 'y', 'a', 'b', '\xF2', '\x91', '\x81', '\xC1', 'x', 'y', '\0'}}, {FALSE, {'a', 'b', '\xF4', '\x81', '\x81', '\x81', 'x', 'y', 'a', 'b', '\xF2', '\x91', '\x81', 'x', 'y', '\0'}}, {-1}, }; int i = 0; while (tests[i].valid != -1) { const char *last = svn_utf__last_valid(tests[i].string, strlen(tests[i].string)); apr_size_t len = strlen(tests[i].string); if ((svn_utf__cstring_is_valid(tests[i].string) != tests[i].valid) || (svn_utf__is_valid(tests[i].string, len) != tests[i].valid)) return svn_error_createf (SVN_ERR_TEST_FAILED, NULL, "is_valid test %d failed", i); if (!svn_utf__is_valid(tests[i].string, last - tests[i].string) || (tests[i].valid && *last)) return svn_error_createf (SVN_ERR_TEST_FAILED, NULL, "last_valid test %d failed", i); ++i; } return SVN_NO_ERROR; } /* Compare the two different implementations using random data. */ static svn_error_t * utf_validate2(apr_pool_t *pool) { int i; seed_val(); /* We want enough iterations so that most runs get both valid and invalid strings. We also want enough iterations such that a deliberate error in one of the implementations will trigger a failure. By experiment the second requirement requires a much larger number of iterations that the first. */ for (i = 0; i < 100000; ++i) { unsigned int j; char str[64]; apr_size_t len; /* A random string; experiment shows that it's occasionally (less than 1%) valid but usually invalid. */ for (j = 0; j < sizeof(str) - 1; ++j) str[j] = (char)range_rand(0, 255); str[sizeof(str) - 1] = 0; len = strlen(str); if (svn_utf__last_valid(str, len) != svn_utf__last_valid2(str, len)) { /* Duplicate calls for easy debugging */ svn_utf__last_valid(str, len); svn_utf__last_valid2(str, len); return svn_error_createf (SVN_ERR_TEST_FAILED, NULL, "is_valid2 test %d failed", i); } } return SVN_NO_ERROR; } /* Test conversion from different codepages to utf8. */ static svn_error_t * test_utf_cstring_to_utf8_ex2(apr_pool_t *pool) { apr_size_t i; apr_pool_t *subpool = svn_pool_create(pool); struct data { const char *string; const char *expected_result; const char *from_page; } tests[] = { {"ascii text\n", "ascii text\n", "unexistent-page"}, {"Edelwei\xdf", "Edelwei\xc3\x9f", "ISO-8859-1"} }; for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) { const char *dest; svn_pool_clear(subpool); SVN_ERR(svn_utf_cstring_to_utf8_ex2(&dest, tests[i].string, tests[i].from_page, pool)); if (strcmp(dest, tests[i].expected_result)) { return svn_error_createf (SVN_ERR_TEST_FAILED, NULL, "svn_utf_cstring_to_utf8_ex2 ('%s', '%s') returned ('%s') " "instead of ('%s')", tests[i].string, tests[i].from_page, dest, tests[i].expected_result); } } svn_pool_destroy(subpool); return SVN_NO_ERROR; } /* Test conversion to different codepages from utf8. */ static svn_error_t * test_utf_cstring_from_utf8_ex2(apr_pool_t *pool) { apr_size_t i; apr_pool_t *subpool = svn_pool_create(pool); struct data { const char *string; const char *expected_result; const char *to_page; } tests[] = { {"ascii text\n", "ascii text\n", "unexistent-page"}, {"Edelwei\xc3\x9f", "Edelwei\xdf", "ISO-8859-1"} }; for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) { const char *dest; svn_pool_clear(subpool); SVN_ERR(svn_utf_cstring_from_utf8_ex2(&dest, tests[i].string, tests[i].to_page, pool)); if (strcmp(dest, tests[i].expected_result)) { return svn_error_createf (SVN_ERR_TEST_FAILED, NULL, "svn_utf_cstring_from_utf8_ex2 ('%s', '%s') returned ('%s') " "instead of ('%s')", tests[i].string, tests[i].to_page, dest, tests[i].expected_result); } } svn_pool_destroy(subpool); return SVN_NO_ERROR; } /* Test normalization-independent UTF-8 string comparison */ static svn_error_t * test_utf_collated_compare(apr_pool_t *pool) { /* Normalized: NFC */ static const char nfc[] = "\xe1\xb9\xa8" /* S with dot above and below */ "\xc5\xaf" /* u with ring */ "\xe1\xb8\x87" /* b with macron below */ "\xe1\xb9\xbd" /* v with tilde */ "\xe1\xb8\x9d" /* e with breve and cedilla */ "\xc8\x91" /* r with double grave */ "\xc5\xa1" /* s with caron */ "\xe1\xb8\xaf" /* i with diaeresis and acute */ "\xe1\xbb\x9d" /* o with grave and hook */ "\xe1\xb9\x8b"; /* n with circumflex below */ /* Normalized: NFD */ static const char nfd[] = "S\xcc\xa3\xcc\x87" /* S with dot above and below */ "u\xcc\x8a" /* u with ring */ "b\xcc\xb1" /* b with macron below */ "v\xcc\x83" /* v with tilde */ "e\xcc\xa7\xcc\x86" /* e with breve and cedilla */ "r\xcc\x8f" /* r with double grave */ "s\xcc\x8c" /* s with caron */ "i\xcc\x88\xcc\x81" /* i with diaeresis and acute */ "o\xcc\x9b\xcc\x80" /* o with grave and hook */ "n\xcc\xad"; /* n with circumflex below */ /* Mixed, denormalized */ static const char mixup[] = "S\xcc\x87\xcc\xa3" /* S with dot above and below */ "\xc5\xaf" /* u with ring */ "b\xcc\xb1" /* b with macron below */ "\xe1\xb9\xbd" /* v with tilde */ "e\xcc\xa7\xcc\x86" /* e with breve and cedilla */ "\xc8\x91" /* r with double grave */ "s\xcc\x8c" /* s with caron */ "\xe1\xb8\xaf" /* i with diaeresis and acute */ "o\xcc\x80\xcc\x9b" /* o with grave and hook */ "\xe1\xb9\x8b"; /* n with circumflex below */ static const char longer[] = "\xe1\xb9\xa8" /* S with dot above and below */ "\xc5\xaf" /* u with ring */ "\xe1\xb8\x87" /* b with macron below */ "\xe1\xb9\xbd" /* v with tilde */ "\xe1\xb8\x9d" /* e with breve and cedilla */ "\xc8\x91" /* r with double grave */ "\xc5\xa1" /* s with caron */ "\xe1\xb8\xaf" /* i with diaeresis and acute */ "\xe1\xbb\x9d" /* o with grave and hook */ "\xe1\xb9\x8b" /* n with circumflex below */ "X"; static const char shorter[] = "\xe1\xb9\xa8" /* S with dot above and below */ "\xc5\xaf" /* u with ring */ "\xe1\xb8\x87" /* b with macron below */ "\xe1\xb9\xbd" /* v with tilde */ "\xe1\xb8\x9d" /* e with breve and cedilla */ "\xc8\x91" /* r with double grave */ "\xc5\xa1" /* s with caron */ "\xe1\xb8\xaf" /* i with diaeresis and acute */ "\xe1\xbb\x9d"; /* o with grave and hook */ static const char lowcase[] = "s\xcc\x87\xcc\xa3" /* s with dot above and below */ "\xc5\xaf" /* u with ring */ "b\xcc\xb1" /* b with macron below */ "\xe1\xb9\xbd" /* v with tilde */ "e\xcc\xa7\xcc\x86" /* e with breve and cedilla */ "\xc8\x91" /* r with double grave */ "s\xcc\x8c" /* s with caron */ "\xe1\xb8\xaf" /* i with diaeresis and acute */ "o\xcc\x80\xcc\x9b" /* o with grave and hook */ "\xe1\xb9\x8b"; /* n with circumflex below */ static const struct utfcmp_test_t { const char *stra; char op; const char *strb; const char *taga; const char *tagb; } utfcmp_tests[] = { /* Empty key */ {"", '=', "", "empty", "empty"}, {"", '<', "a", "empty", "nonempty"}, {"a", '>', "", "nonempty", "empty"}, /* Deterministic ordering */ {"a", '<', "b", "a", "b"}, {"b", '<', "c", "b", "c"}, {"a", '<', "c", "a", "c"}, /* Normalized equality */ {nfc, '=', nfd, "nfc", "nfd"}, {nfd, '=', nfc, "nfd", "nfc"}, {nfc, '=', mixup, "nfc", "mixup"}, {nfd, '=', mixup, "nfd", "mixup"}, {mixup, '=', nfd, "mixup", "nfd"}, {mixup, '=', nfc, "mixup", "nfc"}, /* Key length */ {nfc, '<', longer, "nfc", "longer"}, {longer, '>', nfc, "longer", "nfc"}, {nfd, '>', shorter, "nfd", "shorter"}, {shorter, '<', nfd, "shorter", "nfd"}, {mixup, '<', lowcase, "mixup", "lowcase"}, {lowcase, '>', mixup, "lowcase", "mixup"}, {NULL, 0, NULL, NULL, NULL} }; const struct utfcmp_test_t *ut; svn_membuf_t bufa, bufb; svn_membuf__create(&bufa, 0, pool); svn_membuf__create(&bufb, 0, pool); srand(111); for (ut = utfcmp_tests; ut->stra; ++ut) { const svn_boolean_t implicit_size = (rand() % 17) & 1; const apr_size_t lena = (implicit_size ? SVN_UTF__UNKNOWN_LENGTH : strlen(ut->stra)); const apr_size_t lenb = (implicit_size ? SVN_UTF__UNKNOWN_LENGTH : strlen(ut->strb)); int result; SVN_ERR(svn_utf__normcmp(&result, ut->stra, lena, ut->strb, lenb, &bufa, &bufb)); /* UCS-4 debugging dump of the decomposed strings { const apr_int32_t *const ucsbufa = bufa.data; const apr_int32_t *const ucsbufb = bufb.data; apr_size_t i; printf("(%c)%7s %c %s\n", ut->op, ut->taga, (!result ? '=' : (result < 0 ? '<' : '>')), ut->tagb); for (i = 0; i < bufa.size || i < bufb.size; ++i) { if (i < bufa.size && i < bufb.size) printf(" U+%04X U+%04X\n", ucsbufa[i], ucsbufb[i]); else if (i < bufa.size) printf(" U+%04X\n", ucsbufa[i]); else printf(" U+%04X\n", ucsbufb[i]); } } */ if (('=' == ut->op && 0 != result) || ('<' == ut->op && 0 <= result) || ('>' == ut->op && 0 >= result)) { return svn_error_createf (SVN_ERR_TEST_FAILED, NULL, "Ut->Op '%s' %c '%s' but '%s' %c '%s'", ut->taga, ut->op, ut->tagb, ut->taga, (!result ? '=' : (result < 0 ? '<' : '>')), ut->tagb); } } return SVN_NO_ERROR; } static svn_error_t * test_utf_pattern_match(apr_pool_t *pool) { static const struct glob_test_t { svn_boolean_t sql_like; svn_boolean_t matches; const char *pattern; const char *string; const char *escape; } glob_tests[] = { #define LIKE_MATCH TRUE, TRUE #define LIKE_FAIL TRUE, FALSE #define GLOB_MATCH FALSE, TRUE #define GLOB_FAIL FALSE, FALSE {LIKE_FAIL, "", "test", NULL}, {GLOB_FAIL, "", "test", NULL}, {LIKE_FAIL, "", "%", NULL}, {GLOB_FAIL, "", "*", NULL}, {LIKE_FAIL, "test", "%", NULL}, {GLOB_FAIL, "test", "*", NULL}, {LIKE_MATCH, "test", "test", NULL}, {GLOB_MATCH, "test", "test", NULL}, {LIKE_MATCH, "t\xe1\xb8\x9dst", "te\xcc\xa7\xcc\x86st", NULL}, {GLOB_MATCH, "te\xcc\xa7\xcc\x86st", "t\xe1\xb8\x9dst", NULL}, {LIKE_FAIL, "test", "test", "\xe1\xb8\x9d"}, /* escape char not ascii */ {LIKE_FAIL, "test", "test", ""}, /* empty escape string */ {LIKE_MATCH, "te#st", "test", "#"}, {LIKE_FAIL, "te#st", "test", NULL}, {GLOB_MATCH, "te\\st", "test", NULL}, {LIKE_MATCH, "te##st", "te#st", "#"}, {LIKE_FAIL, "te##st", "te#st", NULL}, {GLOB_MATCH, "te\\\\st", "te\\st", NULL}, {GLOB_FAIL, "te\\\\st", "te\\st", "\\"}, /* escape char with glob */ {LIKE_FAIL, "te#%t", "te%t", NULL}, {LIKE_MATCH, "te#%t", "te%t", "#"}, {GLOB_MATCH, "te\\*t", "te*t", NULL}, {LIKE_FAIL, "te#%t", "test", NULL}, {GLOB_FAIL, "te\\*t", "test", NULL}, {LIKE_FAIL, "te#_t", "te_t", NULL}, {LIKE_MATCH, "te#_t", "te_t", "#"}, {GLOB_MATCH, "te\\?t", "te?t", NULL}, {LIKE_FAIL, "te#_t", "test", NULL}, {LIKE_FAIL, "te#_t", "test", "#"}, {GLOB_FAIL, "te\\?t", "test", NULL}, {LIKE_MATCH, "_est", "test", NULL}, {GLOB_MATCH, "?est", "test", NULL}, {LIKE_MATCH, "te_t", "test", NULL}, {GLOB_MATCH, "te?t", "test", NULL}, {LIKE_MATCH, "tes_", "test", NULL}, {GLOB_MATCH, "tes?", "test", NULL}, {LIKE_FAIL, "test_", "test", NULL}, {GLOB_FAIL, "test?", "test", NULL}, {LIKE_MATCH, "[s%n]", "[subversion]", NULL}, {GLOB_FAIL, "[s*n]", "[subversion]", NULL}, {LIKE_MATCH, "#[s%n]", "[subversion]", "#"}, {GLOB_MATCH, "\\[s*n]", "[subversion]", NULL}, {GLOB_MATCH, ".[\\-\\t]", ".t", NULL}, {GLOB_MATCH, "test*?*[a-z]*", "testgoop", NULL}, {GLOB_MATCH, "te[^x]t", "test", NULL}, {GLOB_MATCH, "te[^abc]t", "test", NULL}, {GLOB_MATCH, "te[^x]t", "test", NULL}, {GLOB_MATCH, "te[!x]t", "test", NULL}, {GLOB_FAIL, "te[^x]t", "text", NULL}, {GLOB_FAIL, "te[^\\x]t", "text", NULL}, {GLOB_FAIL, "te[^x\\", "text", NULL}, {GLOB_FAIL, "te[/]t", "text", NULL}, {GLOB_MATCH, "te[r-t]t", "test", NULL}, {GLOB_MATCH, "te[r-Tz]t", "tezt", NULL}, {GLOB_FAIL, "te[R-T]t", "tent", NULL}, /* {GLOB_MATCH, "tes[]t]", "test", NULL}, */ {GLOB_MATCH, "tes[t-]", "test", NULL}, {GLOB_MATCH, "tes[t-]]", "test]", NULL}, {GLOB_FAIL, "tes[t-]]", "test", NULL}, {GLOB_FAIL, "tes[u-]", "test", NULL}, {GLOB_FAIL, "tes[t-]", "tes[t-]", NULL}, {GLOB_MATCH, "test[/-/]", "test/", NULL}, {GLOB_MATCH, "test[\\/-/]", "test/", NULL}, {GLOB_MATCH, "test[/-\\/]", "test/", NULL}, #undef LIKE_MATCH #undef LIKE_FAIL #undef GLOB_MATCH #undef GLOB_FAIL {FALSE, FALSE, NULL, NULL, NULL} }; const struct glob_test_t *gt; svn_membuf_t bufa, bufb, bufc; svn_membuf__create(&bufa, 0, pool); svn_membuf__create(&bufb, 0, pool); svn_membuf__create(&bufc, 0, pool); srand(79); for (gt = glob_tests; gt->pattern; ++gt) { const svn_boolean_t implicit_size = (rand() % 13) & 1; const apr_size_t lenptn = (implicit_size ? SVN_UTF__UNKNOWN_LENGTH : strlen(gt->pattern)); const apr_size_t lenstr = (implicit_size ? SVN_UTF__UNKNOWN_LENGTH : strlen(gt->string)); const apr_size_t lenesc = (implicit_size ? SVN_UTF__UNKNOWN_LENGTH : (gt->escape ? strlen(gt->escape) : 0)); svn_boolean_t match; svn_error_t *err; err = svn_utf__glob(&match, gt->pattern, lenptn, gt->string, lenstr, gt->escape, lenesc, gt->sql_like, &bufa, &bufb, &bufc); if (!gt->sql_like && gt->escape && !err) return svn_error_create (SVN_ERR_TEST_FAILED, err, "Failed to detect GLOB ESCAPE"); if ((err && gt->matches) || (!err && !match != !gt->matches)) { if (gt->sql_like) return svn_error_createf (SVN_ERR_TEST_FAILED, err, "Wrong result: %s'%s' LIKE '%s'%s%s%s%s", (gt->matches ? "NOT " : ""), gt->string, gt->pattern, (gt->escape ? " ESCAPE " : ""), (gt->escape ? "'" : ""), (gt->escape ? gt->escape : ""), (gt->escape ? "'" : "")); else return svn_error_createf (SVN_ERR_TEST_FAILED, err, "Wrong result: %s%s GLOB %s", (gt->matches ? "NOT " : ""), gt->string, gt->pattern); } if (err) svn_error_clear(err); } return SVN_NO_ERROR; } static svn_error_t * test_utf_fuzzy_escape(apr_pool_t *pool) { /* Accented latin, mixed normalization */ static const char mixup[] = "S\xcc\x87\xcc\xa3" /* S with dot above and below */ "\xc5\xaf" /* u with ring */ "b\xcc\xb1" /* b with macron below */ "\xe1\xb9\xbd" /* v with tilde */ "e\xcc\xa7\xcc\x86" /* e with breve and cedilla */ "\xc8\x91" /* r with double grave */ "s\xcc\x8c" /* s with caron */ "\xe1\xb8\xaf" /* i with diaeresis and acute */ "o\xcc\x80\xcc\x9b" /* o with grave and hook */ "\xe1\xb9\x8b"; /* n with circumflex below */ /* As above, but latin lowercase 'o' replaced with Greek 'omicron' */ static const char greekish[] = "S\xcc\x87\xcc\xa3" /* S with dot above and below */ "\xc5\xaf" /* u with ring */ "b\xcc\xb1" /* b with macron below */ "\xe1\xb9\xbd" /* v with tilde */ "e\xcc\xa7\xcc\x86" /* e with breve and cedilla */ "\xc8\x91" /* r with double grave */ "s\xcc\x8c" /* s with caron */ "\xe1\xb8\xaf" /* i with diaeresis and acute */ "\xce\xbf\xcc\x80\xcc\x9b" /* omicron with grave and hook */ "\xe1\xb9\x8b"; /* n with circumflex below */ /* More interesting invalid characters. */ static const char invalid[] = "Not Unicode: \xef\xb7\x91;" /* U+FDD1 */ "Out of range: \xf4\x90\x80\x81;" /* U+110001 */ "Not UTF-8: \xe6;" "Null byte: \0;"; const char *fuzzy; fuzzy = svn_utf__fuzzy_escape(mixup, strlen(mixup), pool); SVN_TEST_ASSERT(0 == strcmp(fuzzy, "Subversion")); fuzzy = svn_utf__fuzzy_escape(greekish, strlen(greekish), pool); SVN_TEST_ASSERT(0 == strcmp(fuzzy, "Subversi{U+03BF}n")); fuzzy = svn_utf__fuzzy_escape(invalid, sizeof(invalid) - 1, pool); /*fprintf(stderr, "%s\n", fuzzy);*/ SVN_TEST_ASSERT(0 == strcmp(fuzzy, "Not Unicode: {U?FDD1};" "Out of range: ?\\F4?\\90?\\80?\\81;" "Not UTF-8: ?\\E6;" "Null byte: \\0;")); return SVN_NO_ERROR; } static svn_error_t * test_utf_is_normalized(apr_pool_t *pool) { /* Normalized: NFC */ static const char nfc[] = "\xe1\xb9\xa8" /* S with dot above and below */ "\xc5\xaf" /* u with ring */ "\xe1\xb8\x87" /* b with macron below */ "\xe1\xb9\xbd" /* v with tilde */ "\xe1\xb8\x9d" /* e with breve and cedilla */ "\xc8\x91" /* r with double grave */ "\xc5\xa1" /* s with caron */ "\xe1\xb8\xaf" /* i with diaeresis and acute */ "\xe1\xbb\x9d" /* o with grave and hook */ "\xe1\xb9\x8b"; /* n with circumflex below */ /* Normalized: NFD */ static const char nfd[] = "S\xcc\xa3\xcc\x87" /* S with dot above and below */ "u\xcc\x8a" /* u with ring */ "b\xcc\xb1" /* b with macron below */ "v\xcc\x83" /* v with tilde */ "e\xcc\xa7\xcc\x86" /* e with breve and cedilla */ "r\xcc\x8f" /* r with double grave */ "s\xcc\x8c" /* s with caron */ "i\xcc\x88\xcc\x81" /* i with diaeresis and acute */ "o\xcc\x9b\xcc\x80" /* o with grave and hook */ "n\xcc\xad"; /* n with circumflex below */ /* Mixed, denormalized */ static const char mixup[] = "S\xcc\x87\xcc\xa3" /* S with dot above and below */ "\xc5\xaf" /* u with ring */ "b\xcc\xb1" /* b with macron below */ "\xe1\xb9\xbd" /* v with tilde */ "e\xcc\xa7\xcc\x86" /* e with breve and cedilla */ "\xc8\x91" /* r with double grave */ "s\xcc\x8c" /* s with caron */ "\xe1\xb8\xaf" /* i with diaeresis and acute */ "o\xcc\x80\xcc\x9b" /* o with grave and hook */ "\xe1\xb9\x8b"; /* n with circumflex below */ /* Invalid UTF-8 */ static const char invalid[] = "\xe1\xb9\xa8" /* S with dot above and below */ "\xc5\xaf" /* u with ring */ "\xe1\xb8\x87" /* b with macron below */ "\xe1\xb9\xbd" /* v with tilde */ "\xe1\xb8\x9d" /* e with breve and cedilla */ "\xc8\x91" /* r with double grave */ "\xc5\xa1" /* s with caron */ "\xe1\xb8\xaf" /* i with diaeresis and acute */ "\xe6" /* Invalid byte */ "\xe1\xb9\x8b"; /* n with circumflex below */ SVN_ERR_ASSERT(svn_utf__is_normalized(nfc, pool)); SVN_ERR_ASSERT(!svn_utf__is_normalized(nfd, pool)); SVN_ERR_ASSERT(!svn_utf__is_normalized(mixup, pool)); SVN_ERR_ASSERT(!svn_utf__is_normalized(invalid, pool)); return SVN_NO_ERROR; } static svn_error_t * test_utf_conversions(apr_pool_t *pool) { static const struct cvt_test_t { svn_boolean_t sixteenbit; svn_boolean_t bigendian; const char *source; const char *result; } tests[] = { #define UTF_32_LE FALSE, FALSE #define UTF_32_BE FALSE, TRUE #define UTF_16_LE TRUE, FALSE #define UTF_16_BE TRUE, TRUE /* Normal character conversion */ { UTF_32_LE, "t\0\0\0" "e\0\0\0" "s\0\0\0" "t\0\0\0" "\0\0\0\0", "test" }, { UTF_32_BE, "\0\0\0t" "\0\0\0e" "\0\0\0s" "\0\0\0t" "\0\0\0\0", "test" }, { UTF_16_LE, "t\0" "e\0" "s\0" "t\0" "\0\0", "test" }, { UTF_16_BE, "\0t" "\0e" "\0s" "\0t" "\0\0", "test" }, /* Valid surrogate pairs */ { UTF_16_LE, "\x00\xD8" "\x00\xDC" "\0\0", "\xf0\x90\x80\x80" }, /* U+010000 */ { UTF_16_LE, "\x34\xD8" "\x1E\xDD" "\0\0", "\xf0\x9d\x84\x9e" }, /* U+01D11E */ { UTF_16_LE, "\xFF\xDB" "\xFD\xDF" "\0\0", "\xf4\x8f\xbf\xbd" }, /* U+10FFFD */ { UTF_16_BE, "\xD8\x00" "\xDC\x00" "\0\0", "\xf0\x90\x80\x80" }, /* U+010000 */ { UTF_16_BE, "\xD8\x34" "\xDD\x1E" "\0\0", "\xf0\x9d\x84\x9e" }, /* U+01D11E */ { UTF_16_BE, "\xDB\xFF" "\xDF\xFD" "\0\0", "\xf4\x8f\xbf\xbd" }, /* U+10FFFD */ /* Swapped, single and trailing surrogate pairs */ { UTF_16_LE, "*\0" "\x00\xDC" "\x00\xD8" "*\0\0\0", "*\xed\xb0\x80" "\xed\xa0\x80*" }, { UTF_16_LE, "*\0" "\x1E\xDD" "*\0\0\0", "*\xed\xb4\x9e*" }, { UTF_16_LE, "*\0" "\xFF\xDB" "*\0\0\0", "*\xed\xaf\xbf*" }, { UTF_16_LE, "\x1E\xDD" "\0\0", "\xed\xb4\x9e" }, { UTF_16_LE, "\xFF\xDB" "\0\0", "\xed\xaf\xbf" }, { UTF_16_BE, "\0*" "\xDC\x00" "\xD8\x00" "\0*\0\0", "*\xed\xb0\x80" "\xed\xa0\x80*" }, { UTF_16_BE, "\0*" "\xDD\x1E" "\0*\0\0", "*\xed\xb4\x9e*" }, { UTF_16_BE, "\0*" "\xDB\xFF" "\0*\0\0", "*\xed\xaf\xbf*" }, { UTF_16_BE, "\xDD\x1E" "\0\0", "\xed\xb4\x9e" }, { UTF_16_BE, "\xDB\xFF" "\0\0", "\xed\xaf\xbf" }, #undef UTF_32_LE #undef UTF_32_BE #undef UTF_16_LE #undef UTF_16_BE { 0 } }; const struct cvt_test_t *tc; const svn_string_t *result; int i; for (i = 1, tc = tests; tc->source; ++tc, ++i) { if (tc->sixteenbit) SVN_ERR(svn_utf__utf16_to_utf8(&result, (const void*)tc->source, SVN_UTF__UNKNOWN_LENGTH, tc->bigendian, pool, pool)); else SVN_ERR(svn_utf__utf32_to_utf8(&result, (const void*)tc->source, SVN_UTF__UNKNOWN_LENGTH, tc->bigendian, pool, pool)); SVN_ERR_ASSERT(0 == strcmp(result->data, tc->result)); } /* Test counted strings with NUL characters */ SVN_ERR(svn_utf__utf16_to_utf8( &result, (void*)("x\0" "\0\0" "y\0" "*\0"), 3, FALSE, pool, pool)); SVN_ERR_ASSERT(0 == memcmp(result->data, "x\0y", 3)); SVN_ERR(svn_utf__utf32_to_utf8( &result, (void*)("\0\0\0x" "\0\0\0\0" "\0\0\0y" "\0\0\0*"), 3, TRUE, pool, pool)); SVN_ERR_ASSERT(0 == memcmp(result->data, "x\0y", 3)); return SVN_NO_ERROR; } /* The test table. */ static int max_threads = 1; static struct svn_test_descriptor_t test_funcs[] = { SVN_TEST_NULL, SVN_TEST_PASS2(utf_validate, "test is_valid/last_valid"), SVN_TEST_PASS2(utf_validate2, "test last_valid/last_valid2"), SVN_TEST_PASS2(test_utf_cstring_to_utf8_ex2, "test svn_utf_cstring_to_utf8_ex2"), SVN_TEST_PASS2(test_utf_cstring_from_utf8_ex2, "test svn_utf_cstring_from_utf8_ex2"), SVN_TEST_PASS2(test_utf_collated_compare, "test svn_utf__normcmp"), SVN_TEST_PASS2(test_utf_pattern_match, "test svn_utf__glob"), SVN_TEST_PASS2(test_utf_fuzzy_escape, "test svn_utf__fuzzy_escape"), SVN_TEST_PASS2(test_utf_is_normalized, "test svn_utf__is_normalized"), SVN_TEST_PASS2(test_utf_conversions, "test svn_utf__utf{16,32}_to_utf8"), SVN_TEST_NULL }; SVN_TEST_MAIN