Implement PCRE_NEVER_UTF

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1309 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2013-04-05 15:35:59 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2013-04-05 15:35:59 +0000
commit: 6cfdfa01a2fd8749bccf17f030aea0536b5090aa (patch)
tree: d7bd3c63a17f04c6aac73e2b27bb82f5a2c66df2
parent: 4ecaf0e27e8a2174744c464b97893e0754df9482 (diff)
download: pcre-6cfdfa01a2fd8749bccf17f030aea0536b5090aa.tar.gz
14 files changed, 86 insertions, 20 deletions
diff --git a/ChangeLog b/ChangeLog
index 977f3b7..b48abbd 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -132,6 +132,9 @@ Version 8.33 xx-xxxx-201x
 
 34. Auto-detect and optimize limited repetitions in JIT.
 
+35. Implement PCRE_NEVER_UTF to lock out the use of UTF; in particular, this
+    blocks (*UTF) etc. at the start of a pattern.
+
 
 Version 8.32 30-November-2012
 -----------------------------
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index 4463470..42364ee 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -1,4 +1,4 @@
-.TH PCREAPI 3 "26 March 2013" "PCRE 8.33"
+.TH PCREAPI 3 "05 April 2013" "PCRE 8.33"
 .SH NAME
 PCRE - Perl-compatible regular expressions
 .sp
@@ -755,6 +755,15 @@ equivalent to Perl's /m option, and it can be changed within a pattern by a
 (?m) option setting. If there are no newlines in a subject string, or no
 occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
 .sp
+  PCRE_NEVER_UTF
+.sp
+This option locks out interpretation of the pattern as UTF-8 (or UTF-16 or
+UTF-32 in the 16-bit and 32-bit libraries). In particular, it prevents the
+creator of the pattern from switching to UTF interpretation by starting the
+pattern with (*UTF); such a pattern fails to compile. This may be useful in
+applications that process patterns from external sources. Setting both
+PCRE_UTF8 and PCRE_NEVER_UTF also causes a compile error.
+.sp
   PCRE_NEWLINE_CR
   PCRE_NEWLINE_LF
   PCRE_NEWLINE_CRLF
@@ -2834,6 +2843,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 26 March 2013
+Last updated: 05 April 2013
 Copyright (c) 1997-2013 University of Cambridge.
 .fi
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index b28dabc..9b124a2 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -1,4 +1,4 @@
-.TH PCREPATTERN 3 "27 March 2013" "PCRE 8.33"
+.TH PCREPATTERN 3 "05 April 2013" "PCRE 8.33"
 .SH NAME
 PCRE - Perl-compatible regular expressions
 .SH "PCRE REGULAR EXPRESSION DETAILS"
@@ -1366,7 +1366,8 @@ above. There are also the (*UTF8), (*UTF16),(*UTF32), and (*UCP) leading
 sequences that can be used to set UTF and Unicode property modes; they are
 equivalent to setting the PCRE_UTF8, PCRE_UTF16, PCRE_UTF32 and the PCRE_UCP
 options, respectively. The (*UTF) sequence is a generic version that can be
-used with any of the libraries.
+used with any of the libraries. However, the application can set the 
+PCRE_NEVER_UTF option, which locks out the use of the (*UTF) sequences.
 .
 .
 .\" HTML <a name="subpattern"></a>
@@ -3100,6 +3101,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 27 March 2013
+Last updated: 05 April 2013
 Copyright (c) 1997-2013 University of Cambridge.
 .fi
diff --git a/doc/pcretest.1 b/doc/pcretest.1
index b1c6f98..cae1522 100644
--- a/doc/pcretest.1
+++ b/doc/pcretest.1
@@ -1,4 +1,4 @@
-.TH PCRETEST 1 "22 February 2013" "PCRE 8.33"
+.TH PCRETEST 1 "05 April 2013" "PCRE 8.33"
 .SH NAME
 pcretest - a program for testing Perl-compatible regular expressions.
 .SH SYNOPSIS
@@ -287,6 +287,7 @@ fall into several groups that are described in detail in the following
 sections.
 .sp
   \fB/8\fP              set UTF mode
+  \fB/9\fP              set PCRE_NEVER_UTF (locks out UTF mode) 
   \fB/?\fP              disable UTF validity check
   \fB/+\fP              show remainder of subject after match
   \fB/=\fP              show all captures (not just those that are set)
@@ -357,6 +358,7 @@ options that do not correspond to anything in Perl:
   \fB/8\fP              PCRE_UTF32          ) when using the 32-bit
   \fB/?\fP              PCRE_NO_UTF32_CHECK )   library
 .sp
+  \fB/9\fP              PCRE_NEVER_UTF
   \fB/A\fP              PCRE_ANCHORED
   \fB/C\fP              PCRE_AUTO_CALLOUT
   \fB/E\fP              PCRE_DOLLAR_ENDONLY
@@ -1081,6 +1083,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 22 February 2013
+Last updated: 05 April 2013
 Copyright (c) 1997-2013 University of Cambridge.
 .fi
diff --git a/pcre.h.in b/pcre.h.in
index 3e036e3..f86f045 100644
--- a/pcre.h.in
+++ b/pcre.h.in
@@ -5,7 +5,7 @@
 /* This is the public header file for the PCRE library, to be #included by
 applications that call the PCRE functions.
 
-           Copyright (c) 1997-2012 University of Cambridge
+           Copyright (c) 1997-2013 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -96,11 +96,14 @@ extern "C" {
 #endif
 
 /* Public options. Some are compile-time only, some are run-time only, and some
-are both, so we keep them all distinct. However, almost all the bits in the
-options word are now used. In the long run, we may have to re-use some of the
-compile-time only bits for runtime options, or vice versa. Any of the
-compile-time options may be inspected during studying (and therefore JIT
-compiling).
+are both. Most of the compile-time options are saved with the compiled regex so
+that they can be inspected during studying (and therefore JIT compiling). Note
+that pcre_study() has its own set of options. Originally, all the options
+defined here used distinct bits. However, almost all the bits in a 32-bit word
+are now used, so in order to conserve them, option bits that were previously
+only recognized at matching time (i.e. by pcre_exec() or pcre_dfa_exec()) may
+also be used for compile-time options that affect only compiling and are not
+relevant for studying or JIT compiling.
 
 Some options for pcre_compile() change its behaviour but do not affect the
 behaviour of the execution functions. Other options are passed through to the
@@ -142,7 +145,11 @@ with J. */
 #define PCRE_AUTO_CALLOUT       0x00004000  /* C1       */
 #define PCRE_PARTIAL_SOFT       0x00008000  /*    E D J  ) Synonyms */
 #define PCRE_PARTIAL            0x00008000  /*    E D J  )          */
-#define PCRE_DFA_SHORTEST       0x00010000  /*      D   */
+
+/* This pair use the same bit. */
+#define PCRE_NEVER_UTF          0x00010000  /* C1        ) Overlaid */
+#define PCRE_DFA_SHORTEST       0x00010000  /*      D    ) Overlaid */
+
 #define PCRE_DFA_RESTART        0x00020000  /*      D   */
 #define PCRE_FIRSTLINE          0x00040000  /* C3       */
 #define PCRE_DUPNAMES           0x00080000  /* C1       */
diff --git a/pcre_compile.c b/pcre_compile.c
index 110df2a..c6fb875 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -6,7 +6,7 @@
 and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
-           Copyright (c) 1997-2012 University of Cambridge
+           Copyright (c) 1997-2013 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -508,6 +508,7 @@ static const char error_texts[] =
   "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
   "character value in \\u.... sequence is too large\0"
   "invalid UTF-32 string\0"
+  "setting UTF is disabled by the application\0"
   ;
 
 /* Table to identify digits and hex digits. This is used when compiling
@@ -7771,6 +7772,7 @@ int newline;
 int errorcode = 0;
 int skipatstart = 0;
 BOOL utf;
+BOOL never_utf = FALSE;
 size_t size;
 pcre_uchar *code;
 const pcre_uchar *codestart;
@@ -7829,6 +7831,15 @@ if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
   errorcode = ERR17;
   goto PCRE_EARLY_ERROR_RETURN;
   }
+  
+/* If PCRE_NEVER_UTF is set, remember it. As this option steals a bit that is
+also used for execution options, clear it from the options just in case. */
+
+if ((options & PCRE_NEVER_UTF) != 0)
+  {
+  never_utf = TRUE; 
+  options &= ~PCRE_NEVER_UTF; 
+  } 
 
 /* Check for global one-time settings at the start of the pattern, and remember
 the offset for later. */
@@ -7885,9 +7896,14 @@ PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
   else break;
   }
-
+  
 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
 utf = (options & PCRE_UTF8) != 0;
+if (utf && never_utf)
+  {
+  errorcode = ERR78;
+  goto PCRE_EARLY_ERROR_RETURN2;
+  } 
 
 /* Can't support UTF unless PCRE has been compiled to include the code. The
 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
diff --git a/pcre_internal.h b/pcre_internal.h
index d9e0c60..6306eb1 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -7,7 +7,7 @@
 and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
-           Copyright (c) 1997-2012 University of Cambridge
+           Copyright (c) 1997-2013 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -1164,7 +1164,7 @@ time, run time, or study time, respectively. */
    PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
    PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
    PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
-   PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE)
+   PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE|PCRE_NEVER_UTF)
 
 #define PUBLIC_EXEC_OPTIONS \
   (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
@@ -2271,7 +2271,7 @@ enum { ERR0,  ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,
        ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
        ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
        ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69,
-       ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERRCOUNT };
+       ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERRCOUNT };
 
 /* JIT compiling modes. The function list is indexed by them. */
 enum { JIT_COMPILE, JIT_PARTIAL_SOFT_COMPILE, JIT_PARTIAL_HARD_COMPILE,
diff --git a/pcreposix.c b/pcreposix.c
index 15195c0..97576db 100644
--- a/pcreposix.c
+++ b/pcreposix.c
@@ -162,7 +162,8 @@ static const int eint[] = {
   /* 75 */
   REG_BADPAT,  /* overlong MARK name */
   REG_BADPAT,  /* character value in \u.... sequence is too large */
-  REG_BADPAT   /* invalid UTF-32 string (should not occur) */
+  REG_BADPAT,  /* invalid UTF-32 string (should not occur) */
+  REG_BADPAT   /* setting UTF is disabled by the application */ 
 };
 
 /* Table of texts corresponding to POSIX error codes */
diff --git a/pcretest.c b/pcretest.c
index cd84665..8ebd471 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -3689,6 +3689,7 @@ while (!done)
       case 'Y': options |= PCRE_NO_START_OPTIMISE; break;
       case 'Z': debug_lengths = 0; break;
       case '8': options |= PCRE_UTF8; use_utf = 1; break;
+      case '9': options |= PCRE_NEVER_UTF; break; 
       case '?': options |= PCRE_NO_UTF8_CHECK; break;
 
       case 'T':
diff --git a/testdata/testinput15 b/testdata/testinput15
index 85a31cf..9aa8f68 100644
--- a/testdata/testinput15
+++ b/testdata/testinput15
@@ -357,4 +357,8 @@ correctly, but that messes up comparisons). --/
   \x{ff000041}
   \x{7f000041} 
 
+/(*UTF8)abc/9
+
+/abc/89
+
 /-- End of testinput15 --/
diff --git a/testdata/testinput18 b/testdata/testinput18
index fe62b8c..42bad1f 100644
--- a/testdata/testinput18
+++ b/testdata/testinput18
@@ -291,4 +291,8 @@ correctly, but that messes up comparisons). --/
 /\x{a0}+\s!/8BZT1
     \x{a0}\x20!
 
+/(*UTF)abc/9
+
+/abc/89
+
 /-- End of testinput18 --/
diff --git a/testdata/testoutput15 b/testdata/testoutput15
index a5a9b3d..1541637 100644
--- a/testdata/testoutput15
+++ b/testdata/testoutput15
@@ -1128,4 +1128,10 @@ Need char = \x{bf}
   \x{7f000041} 
 Error -10 (bad UTF-8 string) offset=0 reason=12
 
+/(*UTF8)abc/9
+Failed: setting UTF is disabled by the application at offset 0
+
+/abc/89
+Failed: setting UTF is disabled by the application at offset 0
+
 /-- End of testinput15 --/
diff --git a/testdata/testoutput18-16 b/testdata/testoutput18-16
index 135dcf5..e91d841 100644
--- a/testdata/testoutput18-16
+++ b/testdata/testoutput18-16
@@ -1015,4 +1015,10 @@ Failed: invalid UTF-16 string at offset 0
     \x{a0}\x20!
  0: \x{a0} !
 
+/(*UTF)abc/9
+Failed: setting UTF is disabled by the application at offset 0
+
+/abc/89
+Failed: setting UTF is disabled by the application at offset 0
+
 /-- End of testinput18 --/
diff --git a/testdata/testoutput18-32 b/testdata/testoutput18-32
index cfd3d29..1dba7bc 100644
--- a/testdata/testoutput18-32
+++ b/testdata/testoutput18-32
@@ -1012,4 +1012,10 @@ Error -24 (bad offset value)
     \x{a0}\x20!
  0: \x{a0} !
 
+/(*UTF)abc/9
+Failed: setting UTF is disabled by the application at offset 0
+
+/abc/89
+Failed: setting UTF is disabled by the application at offset 0
+
 /-- End of testinput18 --/
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2013-04-05 15:35:59 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2013-04-05 15:35:59 +0000
commit	6cfdfa01a2fd8749bccf17f030aea0536b5090aa (patch)
tree	d7bd3c63a17f04c6aac73e2b27bb82f5a2c66df2
parent	4ecaf0e27e8a2174744c464b97893e0754df9482 (diff)
download	pcre-6cfdfa01a2fd8749bccf17f030aea0536b5090aa.tar.gz