summaryrefslogtreecommitdiff
path: root/pcretest.c
diff options
context:
space:
mode:
authorchpe <chpe@2f5784b3-3f2a-0410-8824-cb99058d5e15>2012-10-16 15:55:48 +0000
committerchpe <chpe@2f5784b3-3f2a-0410-8824-cb99058d5e15>2012-10-16 15:55:48 +0000
commitbfced143d08671a664b7c2cd5952b0dc693e413e (patch)
tree5660056d40874682548e1ebd5f0cc01801bd6319 /pcretest.c
parent979dbda8ef7124bd03fd6387e8a9707aa6342e83 (diff)
downloadpcre-bfced143d08671a664b7c2cd5952b0dc693e413e.tar.gz
pcre32: pcretest: Don't convert data line to UTF-8 first
While reading the data lines, put them directly into the 8-, 16- or 32-bit buffers instead of first converting them into UTF-8 and only afterwards converting that buffer to 16/32-bit. This is necessary so that in 32-bit mode the \x{} escapes can use the full 32-bit range (whereas the non-standard 5/6-byte UTF-8 sequences can only express characters up to 31 bits). git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1090 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'pcretest.c')
-rw-r--r--pcretest.c234
1 files changed, 145 insertions, 89 deletions
diff --git a/pcretest.c b/pcretest.c
index 982bdd0..92625ed 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -831,7 +831,6 @@ static const unsigned char *last_callout_mark = NULL;
static int buffer_size = 50000;
static pcre_uint8 *buffer = NULL;
-static pcre_uint8 *dbuffer = NULL;
static pcre_uint8 *pbuffer = NULL;
/* Another buffer is needed translation to 16/32-bit character strings. It will
@@ -1666,10 +1665,9 @@ for (;;)
{
int new_buffer_size = 2*buffer_size;
pcre_uint8 *new_buffer = (pcre_uint8 *)malloc(new_buffer_size);
- pcre_uint8 *new_dbuffer = (pcre_uint8 *)malloc(new_buffer_size);
pcre_uint8 *new_pbuffer = (pcre_uint8 *)malloc(new_buffer_size);
- if (new_buffer == NULL || new_dbuffer == NULL || new_pbuffer == NULL)
+ if (new_buffer == NULL || new_pbuffer == NULL)
{
fprintf(stderr, "pcretest: malloc(%d) failed\n", new_buffer_size);
exit(1);
@@ -1684,11 +1682,9 @@ for (;;)
here = new_buffer + (here - buffer);
free(buffer);
- free(dbuffer);
free(pbuffer);
buffer = new_buffer;
- dbuffer = new_dbuffer;
pbuffer = new_pbuffer;
}
}
@@ -2719,6 +2715,8 @@ int all_use_dfa = 0;
int verify_jit = 0;
int yield = 0;
int stack_size;
+pcre_uint8 *dbuffer = NULL;
+size_t dbuffer_size = 1u << 14;
#if !defined NOPOSIX
int posix = 0;
@@ -2762,7 +2760,6 @@ debugging. They grow automatically when very long lines are read. The 16-
and 32-bit buffers (buffer16, buffer32) are obtained only if needed. */
buffer = (pcre_uint8 *)malloc(buffer_size);
-dbuffer = (pcre_uint8 *)malloc(buffer_size);
pbuffer = (pcre_uint8 *)malloc(buffer_size);
/* The outfile variable is static so that new_malloc can use it. */
@@ -4060,7 +4057,15 @@ while (!done)
for (;;)
{
- pcre_uint8 *q;
+#ifdef SUPPORT_PCRE8
+ pcre_uint8 *q8;
+#endif
+#ifdef SUPPORT_PCRE16
+ pcre_uint16 *q16;
+#endif
+#ifdef SUPPORT_PCRE32
+ pcre_uint32 *q32;
+#endif
pcre_uint8 *bptr;
int *use_offsets = offsets;
int use_size_offsets = size_offsets;
@@ -4132,7 +4137,47 @@ while (!done)
p = buffer;
while (isspace(*p)) p++;
- bptr = q = dbuffer;
+#ifndef NOUTF
+ /* Check that the data is well-formed UTF-8 if we're in UTF mode. To create
+ invalid input to pcre_exec, you must use \x?? or \x{} sequences. */
+ if (use_utf)
+ {
+ char *q;
+ pcre_uint32 c;
+ int n = 1;
+
+ for (q = p; n > 0 && *q; q += n) n = utf82ord(q, &c);
+ if (n <= 0)
+ {
+ fprintf(outfile, "**Failed: invalid UTF-8 string cannot be used as input in UTF mode\n");
+ goto NEXT_DATA;
+ }
+ }
+#endif
+
+ /* Allocate a buffer to hold the data line. len+1 is an upper bound on
+ the number of pcre_uchar units that will be needed. */
+ if (dbuffer == NULL || len >= dbuffer_size)
+ {
+ dbuffer_size *= 2;
+ dbuffer = (pcre_uint8 *)realloc(dbuffer, dbuffer_size * CHAR_SIZE);
+ if (dbuffer == NULL)
+ {
+ fprintf(stderr, "pcretest: malloc(%d) failed\n", dbuffer_size);
+ exit(1);
+ }
+ }
+
+#ifdef SUPPORT_PCRE8
+ q8 = (pcre_uint8 *) dbuffer;
+#endif
+#ifdef SUPPORT_PCRE16
+ q16 = (pcre_uint16 *) dbuffer;
+#endif
+#ifdef SUPPORT_PCRE32
+ q32 = (pcre_uint32 *) dbuffer;
+#endif
+
while ((c = *p++) != 0)
{
int i = 0;
@@ -4145,11 +4190,9 @@ while (!done)
if (c != '\\')
{
- if (use_utf)
- {
- *q++ = c;
- continue;
- }
+#ifndef NOUTF
+ if (use_utf && HASUTF8EXTRALEN(c)) { GETUTF8INC(c, p); }
+#endif
}
/* Handle backslash escapes */
@@ -4210,11 +4253,13 @@ while (!done)
c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'a' - 10);
p++;
}
- if (use_utf)
+#if !defined NOUTF && defined SUPPORT_PCRE8
+ if (use_utf && (pcre_mode == PCRE8_MODE))
{
- *q++ = c;
+ *q8++ = c;
continue;
}
+#endif
break;
case 0: /* \ followed by EOF allows for an empty line */
@@ -4427,48 +4472,114 @@ while (!done)
than 127 in UTF mode must have come from \x{...} or octal constructs
because values from \x.. get this far only in non-UTF mode. */
-#if !defined NOUTF || defined SUPPORT_PCRE16 || defined SUPPORT_PCRE32
- if (pcre_mode != PCRE8_MODE || use_utf)
+#ifdef SUPPORT_PCRE8
+ if (pcre_mode == PCRE8_MODE)
{
- pcre_uint8 buff8[8];
- int ii, utn;
- utn = ord2utf8(c, buff8);
- for (ii = 0; ii < utn; ii++) *q++ = buff8[ii];
+#ifndef NOUTF
+ if (use_utf)
+ {
+ q8 += ord2utf8(c, q8);
+ }
+ else
+#endif
+ {
+ if (c > 0xffu)
+ {
+ fprintf(outfile, "** Character \\x{%x} is greater than 255 "
+ "and UTF-8 mode is not enabled.\n", c);
+ fprintf(outfile, "** Truncation will probably give the wrong "
+ "result.\n");
+ }
+
+ *q8++ = c;
+ }
}
- else
#endif
+#ifdef SUPPORT_PCRE16
+ if (pcre_mode == PCRE16_MODE)
{
- if (c > 255)
+#ifndef NOUTF
+ if (use_utf)
+ {
+ if (c > 0x10ffffu)
+ {
+ fprintf(outfile, "**Failed: character value greater than 0x10ffff "
+ "cannot be converted to UTF-16\n");
+ goto NEXT_DATA;
+ }
+ else if (c >= 0x10000u)
+ {
+ c-= 0x10000u;
+ *q16++ = 0xD800 | (c >> 10);
+ *q16++ = 0xDC00 | (c & 0x3ff);
+ }
+ else
+ *q16++ = c;
+ }
+ else
+#endif
{
- fprintf(outfile, "** Character \\x{%x} is greater than 255 "
- "and UTF-8 mode is not enabled.\n", c);
- fprintf(outfile, "** Truncation will probably give the wrong "
- "result.\n");
+ if (c > 0xffffu)
+ {
+ fprintf(outfile, "** Character value is greater than 0xffff "
+ "and UTF-16 mode is not enabled.\n", c);
+ fprintf(outfile, "** Truncation will probably give the wrong "
+ "result.\n");
+ }
+
+ *q16++ = c;
}
- *q++ = c;
}
+#endif
+#ifdef SUPPORT_PCRE32
+ if (pcre_mode == PCRE32_MODE)
+ {
+ *q32++ = c;
+ }
+#endif
+
}
/* Reached end of subject string */
- *q = 0;
- len = (int)(q - dbuffer);
+#ifdef SUPPORT_PCRE8
+ if (pcre_mode == PCRE8_MODE)
+ {
+ *q8 = 0;
+ len = (int)(q8 - (pcre_uint8 *)dbuffer);
+ }
+#endif
+#ifdef SUPPORT_PCRE16
+ if (pcre_mode == PCRE16_MODE)
+ {
+ *q16 = 0;
+ len = (int)(q16 - (pcre_uint16 *)dbuffer);
+ }
+#endif
+#ifdef SUPPORT_PCRE32
+ if (pcre_mode == PCRE32_MODE)
+ {
+ *q32 = 0;
+ len = (int)(q32 - (pcre_uint32 *)dbuffer);
+ }
+#endif
/* Move the data to the end of the buffer so that a read over the end of
the buffer will be seen by valgrind, even if it doesn't cause a crash. If
we are using the POSIX interface, we must include the terminating zero. */
+ bptr = dbuffer;
+
#if !defined NOPOSIX
if (posix || do_posix)
{
- memmove(bptr + buffer_size - len - 1, bptr, len + 1);
- bptr += buffer_size - len - 1;
+ memmove(bptr + dbuffer_size - len - 1, bptr, len + 1);
+ bptr += dbuffer_size - len - 1;
}
else
#endif
{
- memmove(bptr + buffer_size - len, bptr, len);
- bptr += buffer_size - len;
+ bptr = memmove(bptr + (dbuffer_size - len) * CHAR_SIZE, bptr, len * CHAR_SIZE);
}
if ((all_use_dfa || use_dfa) && find_match_limit)
@@ -4532,61 +4643,6 @@ while (!done)
/* Handle matching via the native interface - repeats for /g and /G */
-#ifdef SUPPORT_PCRE16
- if (pcre_mode == PCRE16_MODE)
- {
- len = to16(TRUE, bptr, REAL_PCRE_OPTIONS(re) & PCRE_UTF8, len);
- switch(len)
- {
- case -1:
- fprintf(outfile, "**Failed: invalid UTF-8 string cannot be "
- "converted to UTF-16\n");
- goto NEXT_DATA;
-
- case -2:
- fprintf(outfile, "**Failed: character value greater than 0x10ffff "
- "cannot be converted to UTF-16\n");
- goto NEXT_DATA;
-
- case -3:
- fprintf(outfile, "**Failed: character value greater than 0xffff "
- "cannot be converted to 16-bit in non-UTF mode\n");
- goto NEXT_DATA;
-
- default:
- break;
- }
- bptr = (pcre_uint8 *)buffer16;
- }
-#endif
-
-#ifdef SUPPORT_PCRE32
- if (pcre_mode == PCRE32_MODE)
- {
- len = to32(TRUE, bptr, REAL_PCRE_OPTIONS(re) & PCRE_UTF32, len);
- switch(len)
- {
- case -1:
- fprintf(outfile, "**Failed: invalid UTF-8 string cannot be "
- "converted to UTF-32\n");
- goto NEXT_DATA;
-
- case -2:
- fprintf(outfile, "**Failed: character value greater than 0x10ffff "
- "cannot be converted to UTF-32\n");
- goto NEXT_DATA;
-
- case -3:
- fprintf(outfile, "**Failed: character value is ill-formed UTF-32\n");
- goto NEXT_DATA;
-
- default:
- break;
- }
- bptr = (pcre_uint8 *)buffer32;
- }
-#endif
-
/* Ensure that there is a JIT callback if we want to verify that JIT was
actually used. If jit_stack == NULL, no stack has yet been assigned. */