summary refs log tree commit diff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2013-04-05 15:35:59 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2013-04-05 15:35:59 +0000
commit 6cfdfa01a2fd8749bccf17f030aea0536b5090aa (patch)
tree d7bd3c63a17f04c6aac73e2b27bb82f5a2c66df2
parent 4ecaf0e27e8a2174744c464b97893e0754df9482 (diff)
download pcre-6cfdfa01a2fd8749bccf17f030aea0536b5090aa.tar.gz
Implement PCRE_NEVER_UTF
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1309 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 3
-rw-r--r-- doc/pcreapi.3 13
-rw-r--r-- doc/pcrepattern.3 7
-rw-r--r-- doc/pcretest.1 6
-rw-r--r-- pcre.h.in 21
-rw-r--r-- pcre_compile.c 20
-rw-r--r-- pcre_internal.h 6
-rw-r--r-- pcreposix.c 3
-rw-r--r-- pcretest.c 1
-rw-r--r-- testdata/testinput15 4
-rw-r--r-- testdata/testinput18 4
-rw-r--r-- testdata/testoutput15 6
-rw-r--r-- testdata/testoutput18-16 6
-rw-r--r-- testdata/testoutput18-32 6
14 files changed, 86 insertions, 20 deletions
diff --git a/ChangeLog b/ChangeLog
index 977f3b7..b48abbd 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -132,6 +132,9 @@ Version 8.33 xx-xxxx-201x
34. Auto-detect and optimize limited repetitions in JIT.
+35. Implement PCRE_NEVER_UTF to lock out the use of UTF, in particular,
+ blocking (*UTF) etc.
+
Version 8.32 30-November-2012
-----------------------------
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index 4463470..42364ee 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -1,4 +1,4 @@
-.TH PCREAPI 3 "26 March 2013" "PCRE 8.33"
+.TH PCREAPI 3 "05 April 2013" "PCRE 8.33"
.SH NAME
PCRE - Perl-compatible regular expressions
.sp
@@ -755,6 +755,15 @@ equivalent to Perl's /m option, and it can be changed within a pattern by a
(?m) option setting. If there are no newlines in a subject string, or no
occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
.sp
+ PCRE_NEVER_UTF
+.sp
+This option locks out interpretation of the pattern as UTF-8 (or UTF-16 or
+UTF-32 in the 16-bit and 32-bit libraries). In particular, it prevents the
+creator of the pattern from switching to UTF interpretation by starting the
+pattern with (*UTF). This may be useful in applications that process patterns
+from external sources. The combination of PCRE_UTF8 and PCRE_NEVER_UTF also
+causes an error.
+.sp
PCRE_NEWLINE_CR
PCRE_NEWLINE_LF
PCRE_NEWLINE_CRLF
@@ -2834,6 +2843,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 26 March 2013
+Last updated: 05 April 2013
Copyright (c) 1997-2013 University of Cambridge.
.fi
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index b28dabc..9b124a2 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -1,4 +1,4 @@
-.TH PCREPATTERN 3 "27 March 2013" "PCRE 8.33"
+.TH PCREPATTERN 3 "05 April 2013" "PCRE 8.33"
.SH NAME
PCRE - Perl-compatible regular expressions
.SH "PCRE REGULAR EXPRESSION DETAILS"
@@ -1366,7 +1366,8 @@ above. There are also the (*UTF8), (*UTF16),(*UTF32), and (*UCP) leading
sequences that can be used to set UTF and Unicode property modes; they are
equivalent to setting the PCRE_UTF8, PCRE_UTF16, PCRE_UTF32 and the PCRE_UCP
options, respectively. The (*UTF) sequence is a generic version that can be
-used with any of the libraries.
+used with any of the libraries. However, the application can set the
+PCRE_NEVER_UTF option, which locks out the use of the (*UTF) sequences.
.
.
.\" HTML <a name="subpattern"></a>
@@ -3100,6 +3101,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 27 March 2013
+Last updated: 05 April 2013
Copyright (c) 1997-2013 University of Cambridge.
.fi
diff --git a/doc/pcretest.1 b/doc/pcretest.1
index b1c6f98..cae1522 100644
--- a/doc/pcretest.1
+++ b/doc/pcretest.1
@@ -1,4 +1,4 @@
-.TH PCRETEST 1 "22 February 2013" "PCRE 8.33"
+.TH PCRETEST 1 "05 April 2013" "PCRE 8.33"
.SH NAME
pcretest - a program for testing Perl-compatible regular expressions.
.SH SYNOPSIS
@@ -287,6 +287,7 @@ fall into several groups that are described in detail in the following
sections.
.sp
\fB/8\fP set UTF mode
+ \fB/9\fP set PCRE_NEVER_UTF (locks out UTF mode)
\fB/?\fP disable UTF validity check
\fB/+\fP show remainder of subject after match
\fB/=\fP show all captures (not just those that are set)
@@ -357,6 +358,7 @@ options that do not correspond to anything in Perl:
\fB/8\fP PCRE_UTF32 ) when using the 32-bit
\fB/?\fP PCRE_NO_UTF32_CHECK ) library
.sp
+ \fB/9\fP PCRE_NEVER_UTF
\fB/A\fP PCRE_ANCHORED
\fB/C\fP PCRE_AUTO_CALLOUT
\fB/E\fP PCRE_DOLLAR_ENDONLY
@@ -1081,6 +1083,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 22 February 2013
+Last updated: 05 April 2013
Copyright (c) 1997-2013 University of Cambridge.
.fi
diff --git a/pcre.h.in b/pcre.h.in
index 3e036e3..f86f045 100644
--- a/pcre.h.in
+++ b/pcre.h.in
@@ -5,7 +5,7 @@
/* This is the public header file for the PCRE library, to be #included by
applications that call the PCRE functions.
- Copyright (c) 1997-2012 University of Cambridge
+ Copyright (c) 1997-2013 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -96,11 +96,14 @@ extern "C" {
#endif
/* Public options. Some are compile-time only, some are run-time only, and some
-are both, so we keep them all distinct. However, almost all the bits in the
-options word are now used. In the long run, we may have to re-use some of the
-compile-time only bits for runtime options, or vice versa. Any of the
-compile-time options may be inspected during studying (and therefore JIT
-compiling).
+are both. Most of the compile-time options are saved with the compiled regex so
+that they can be inspected during studying (and therefore JIT compiling). Note
+that pcre_study() has its own set of options. Originally, all the options
+defined here used distinct bits. However, almost all the bits in a 32-bit word
+are now used, so in order to conserve them, option bits that were previously
+only recognized at matching time (i.e. by pcre_exec() or pcre_dfa_exec()) may
+also be used for compile-time options that affect only compiling and are not
+relevant for studying or JIT compiling.
Some options for pcre_compile() change its behaviour but do not affect the
behaviour of the execution functions. Other options are passed through to the
@@ -142,7 +145,11 @@ with J. */
#define PCRE_AUTO_CALLOUT 0x00004000 /* C1 */
#define PCRE_PARTIAL_SOFT 0x00008000 /* E D J ) Synonyms */
#define PCRE_PARTIAL 0x00008000 /* E D J ) */
-#define PCRE_DFA_SHORTEST 0x00010000 /* D */
+
+/* This pair use the same bit. */
+#define PCRE_NEVER_UTF 0x00010000 /* C1 ) Overlaid */
+#define PCRE_DFA_SHORTEST 0x00010000 /* D ) Overlaid */
+
#define PCRE_DFA_RESTART 0x00020000 /* D */
#define PCRE_FIRSTLINE 0x00040000 /* C3 */
#define PCRE_DUPNAMES 0x00080000 /* C1 */
diff --git a/pcre_compile.c b/pcre_compile.c
index 110df2a..c6fb875 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2012 University of Cambridge
+ Copyright (c) 1997-2013 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -508,6 +508,7 @@ static const char error_texts[] =
"name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
"character value in \\u.... sequence is too large\0"
"invalid UTF-32 string\0"
+ "setting UTF is disabled by the application\0"
;
/* Table to identify digits and hex digits. This is used when compiling
@@ -7771,6 +7772,7 @@ int newline;
int errorcode = 0;
int skipatstart = 0;
BOOL utf;
+BOOL never_utf = FALSE;
size_t size;
pcre_uchar *code;
const pcre_uchar *codestart;
@@ -7829,6 +7831,15 @@ if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
errorcode = ERR17;
goto PCRE_EARLY_ERROR_RETURN;
}
+
+/* If PCRE_NEVER_UTF is set, remember it. This option shares its bit with the
+execution option PCRE_DFA_SHORTEST, so clear it from the options word. */
+
+if ((options & PCRE_NEVER_UTF) != 0)
+ {
+ never_utf = TRUE;
+ options &= ~PCRE_NEVER_UTF;
+ }
/* Check for global one-time settings at the start of the pattern, and remember
the offset for later. */
@@ -7885,9 +7896,14 @@ PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
else break;
}
-
+
/* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
utf = (options & PCRE_UTF8) != 0;
+if (utf && never_utf)
+ {
+ errorcode = ERR78;
+ goto PCRE_EARLY_ERROR_RETURN2;
+ }
/* Can't support UTF unless PCRE has been compiled to include the code. The
return of an error code from PRIV(valid_utf)() is a new feature, introduced in
diff --git a/pcre_internal.h b/pcre_internal.h
index d9e0c60..6306eb1 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -7,7 +7,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2012 University of Cambridge
+ Copyright (c) 1997-2013 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -1164,7 +1164,7 @@ time, run time, or study time, respectively. */
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
- PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE)
+ PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE|PCRE_NEVER_UTF)
#define PUBLIC_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
@@ -2271,7 +2271,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69,
- ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERRCOUNT };
+ ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERRCOUNT };
/* JIT compiling modes. The function list is indexed by them. */
enum { JIT_COMPILE, JIT_PARTIAL_SOFT_COMPILE, JIT_PARTIAL_HARD_COMPILE,
diff --git a/pcreposix.c b/pcreposix.c
index 15195c0..97576db 100644
--- a/pcreposix.c
+++ b/pcreposix.c
@@ -162,7 +162,8 @@ static const int eint[] = {
/* 75 */
REG_BADPAT, /* overlong MARK name */
REG_BADPAT, /* character value in \u.... sequence is too large */
- REG_BADPAT /* invalid UTF-32 string (should not occur) */
+ REG_BADPAT, /* invalid UTF-32 string (should not occur) */
+ REG_BADPAT /* setting UTF is disabled by the application */
};
/* Table of texts corresponding to POSIX error codes */
diff --git a/pcretest.c b/pcretest.c
index cd84665..8ebd471 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -3689,6 +3689,7 @@ while (!done)
case 'Y': options |= PCRE_NO_START_OPTIMISE; break;
case 'Z': debug_lengths = 0; break;
case '8': options |= PCRE_UTF8; use_utf = 1; break;
+ case '9': options |= PCRE_NEVER_UTF; break;
case '?': options |= PCRE_NO_UTF8_CHECK; break;
case 'T':
diff --git a/testdata/testinput15 b/testdata/testinput15
index 85a31cf..9aa8f68 100644
--- a/testdata/testinput15
+++ b/testdata/testinput15
@@ -357,4 +357,8 @@ correctly, but that messes up comparisons). --/
\x{ff000041}
\x{7f000041}
+/(*UTF8)abc/9
+
+/abc/89
+
/-- End of testinput15 --/
diff --git a/testdata/testinput18 b/testdata/testinput18
index fe62b8c..42bad1f 100644
--- a/testdata/testinput18
+++ b/testdata/testinput18
@@ -291,4 +291,8 @@ correctly, but that messes up comparisons). --/
/\x{a0}+\s!/8BZT1
\x{a0}\x20!
+/(*UTF)abc/9
+
+/abc/89
+
/-- End of testinput18 --/
diff --git a/testdata/testoutput15 b/testdata/testoutput15
index a5a9b3d..1541637 100644
--- a/testdata/testoutput15
+++ b/testdata/testoutput15
@@ -1128,4 +1128,10 @@ Need char = \x{bf}
\x{7f000041}
Error -10 (bad UTF-8 string) offset=0 reason=12
+/(*UTF8)abc/9
+Failed: setting UTF is disabled by the application at offset 0
+
+/abc/89
+Failed: setting UTF is disabled by the application at offset 0
+
/-- End of testinput15 --/
diff --git a/testdata/testoutput18-16 b/testdata/testoutput18-16
index 135dcf5..e91d841 100644
--- a/testdata/testoutput18-16
+++ b/testdata/testoutput18-16
@@ -1015,4 +1015,10 @@ Failed: invalid UTF-16 string at offset 0
\x{a0}\x20!
0: \x{a0} !
+/(*UTF)abc/9
+Failed: setting UTF is disabled by the application at offset 0
+
+/abc/89
+Failed: setting UTF is disabled by the application at offset 0
+
/-- End of testinput18 --/
diff --git a/testdata/testoutput18-32 b/testdata/testoutput18-32
index cfd3d29..1dba7bc 100644
--- a/testdata/testoutput18-32
+++ b/testdata/testoutput18-32
@@ -1012,4 +1012,10 @@ Error -24 (bad offset value)
\x{a0}\x20!
0: \x{a0} !
+/(*UTF)abc/9
+Failed: setting UTF is disabled by the application at offset 0
+
+/abc/89
+Failed: setting UTF is disabled by the application at offset 0
+
/-- End of testinput18 --/