diff options
-rw-r--r-- | ChangeLog | 6 | ||||
-rw-r--r-- | NEWS | 2 | ||||
-rwxr-xr-x | RunTest | 94 | ||||
-rw-r--r-- | doc/pcrepattern.3 | 28 | ||||
-rw-r--r-- | doc/pcretest.1 | 8 | ||||
-rw-r--r-- | pcre16_utf16_utils.c | 3 | ||||
-rw-r--r-- | pcre16_valid_utf16.c | 1 | ||||
-rw-r--r-- | pcre_compile.c | 85 | ||||
-rw-r--r-- | pcre_dfa_exec.c | 178 | ||||
-rw-r--r-- | pcre_exec.c | 318 | ||||
-rw-r--r-- | pcre_internal.h | 7 | ||||
-rw-r--r-- | pcre_maketables.c | 2 | ||||
-rw-r--r-- | pcre_newline.c | 78 | ||||
-rw-r--r-- | pcre_study.c | 81 | ||||
-rw-r--r-- | pcregrep.c | 44 | ||||
-rw-r--r-- | pcretest.c | 57 | ||||
-rw-r--r-- | testdata/testinputEBC | 121 | ||||
-rw-r--r-- | testdata/testoutputEBC | 182 |
18 files changed, 909 insertions, 386 deletions
@@ -64,6 +64,12 @@ Version 8.32 14. Applied user-supplied patch to pcrecpp.cc to allow PCRE_NO_UTF8_CHECK to be set. + +15. The EBCDIC support had decayed; later updates to the code had included + explicit references to (e.g.) \x0a instead of CHAR_LF. There has been a + general tidy up of EBCDIC-related issues, and the documentation was also + not quite right. There is now a test that can be run on ASCII systems to + check some of the EBCDIC-related things (but it is not a full test). @@ -12,6 +12,8 @@ Release 8.32 xx-xxxx-2012 . \X now matches a Unicode extended grapheme cluster. +. The EBCDIC support, which had decayed, has had a spring clean. + Release 8.31 06-July-2012 ------------------------- @@ -2,7 +2,7 @@ # Run the PCRE tests using the pcretest program. The appropriate tests are # selected, depending on which build-time options were used. - +# # All tests are now run both with and without -s, to ensure that everything is # tested with and without studying. However, there are some tests that produce # different output after studying, typically when we are tracing the actual @@ -12,23 +12,31 @@ # any difference to their output. There is also one test which compiles invalid # UTF-8 with the UTF-8 check turned off; for this, studying must also be # disabled with /SS. - +# # When JIT support is available, all the tests are also run with -s+ to test # (again, almost) everything with studying and the JIT option. There are also # two tests for JIT-specific features, one to be run when JIT support is # available, and one when it is not. - +# # Whichever of the 8-bit and 16-bit libraries exist are tested. It is also # possible to select which to test by the arguments -8 or -16. 
- +# # Other arguments for this script can be individual test numbers, or the word # "valgrind", or "sim" followed by an argument to run cross-compiled # executables under a simulator, for example: # # RunTest 3 sim "qemu-arm -s 8388608" # -# Finally, if the script is obeyed as "RunTest list", a list of available -# tests is output, but none of them are run. +# +# There are two special cases where only one argument is allowed: +# +# If the first and only argument is "ebcdic", the script runs the special +# EBCDIC test that can be useful for checking certain EBCDIC features, even +# when run in an ASCII environment. +# +# If the script is obeyed as "RunTest list", a list of available tests is +# output, but none of them are run. + # Define test titles in variables so that they can be output as a list. Some # of them are modified (e.g. with -8 or -16) when used in the actual tests. @@ -83,6 +91,56 @@ if [ $# -eq 1 -a "$1" = "list" ]; then exit 0 fi +# Set up a suitable "diff" command for comparison. Some systems +# have a diff that lacks a -u option. Try to deal with this. + +cf="diff" +diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u" + +# Find the test data + +if [ -n "$srcdir" -a -d "$srcdir" ] ; then + testdata="$srcdir/testdata" +elif [ -d "./testdata" ] ; then + testdata=./testdata +elif [ -d "../testdata" ] ; then + testdata=../testdata +else + echo "Cannot find the testdata directory" + exit 1 +fi + + +# ------ Special EBCDIC Test ------- + +if [ $# -eq 1 -a "$1" = "ebcdic" ]; then + ./pcretest -C ebcdic >/dev/null + ebcdic=$? + if [ $ebcdic -ne 1 ] ; then + echo "Cannot run EBCDIC tests: EBCDIC support not compiled" + exit 1 + fi + + for opt in "" "-s" "-dfa" "-s -dfa"; do + ./pcretest -q $opt $testdata/testinputEBC >testtry + if [ $? = 0 ] ; then + $cf $testdata/testoutputEBC testtry + if [ $? 
!= 0 ] ; then exit 1; fi + else exit 1 + fi + if [ "$opt" = "-s" ] ; then echo " OK with study" + elif [ "$opt" = "-dfa" ] ; then echo " OK using DFA" + elif [ "$opt" = "-s -dfa" ] ; then echo " OK using DFA with study" + else echo " OK" + fi + done + +exit 0 +fi + + +# ------ Normal Tests ------ + # Default values valgrind= @@ -152,29 +210,7 @@ while [ $# -gt 0 ] ; do shift done -# Set up a suitable "diff" command for comparison. Some systems -# have a diff that lacks a -u option. Try to deal with this. - -cf="diff" -diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u" - -# Find the test data - -if [ -n "$srcdir" -a -d "$srcdir" ] ; then - testdata="$srcdir/testdata" -elif [ -d "./testdata" ] ; then - testdata=./testdata -elif [ -d "../testdata" ] ; then - testdata=../testdata -else - echo "Cannot find the testdata directory" - exit 1 -fi - -# Find which optional facilities are available. In some Windows environments -# the output of pcretest -C has CRLF at the end of each line, but the shell -# strips only linefeeds from the output of a `backquoted` command. Hence the -# alternative patterns. +# Find which optional facilities are available. $sim ./pcretest -C linksize >/dev/null link_size=$? diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3 index f3f0fb8..1e2c078 100644 --- a/doc/pcrepattern.3 +++ b/doc/pcrepattern.3 @@ -1,4 +1,4 @@ -.TH PCREPATTERN 3 "04 May 2012" "PCRE 8.31" +.TH PCREPATTERN 3 "10 September 2012" "PCRE 8.31" .SH NAME PCRE - Perl-compatible regular expressions .SH "PCRE REGULAR EXPRESSION DETAILS" @@ -69,6 +69,16 @@ functions, are discussed in the page. . . +.SH "EBCDIC CHARACTER CODES" +.rs +.sp +PCRE can be compiled to run in an environment that uses EBCDIC as its character +code rather than ASCII or Unicode (typically a mainframe system). In the +sections below, character code values are ASCII or Unicode; in an EBCDIC +environment these characters may have different code values, and there are no +code points greater than 255. +. +. 
.\" HTML <a name="newlines"></a> .SH "NEWLINE CONVENTIONS" .rs @@ -320,7 +330,7 @@ subsequent digits stand for themselves. The value of the character is constrained in the same way as characters specified in hexadecimal. For example: .sp - \e040 is another way of writing a space + \e040 is another way of writing an ASCII space .\" JOIN \e40 is the same, provided there are fewer than 40 previous capturing subpatterns @@ -478,7 +488,7 @@ release 5.10. In contrast to the other sequences, which match only ASCII characters by default, these always match certain high-valued codepoints, whether or not PCRE_UCP is set. The horizontal space characters are: .sp - U+0009 Horizontal tab + U+0009 Horizontal tab (HT) U+0020 Space U+00A0 Non-break space U+1680 Ogham space mark @@ -500,11 +510,11 @@ whether or not PCRE_UCP is set. The horizontal space characters are: .sp The vertical space characters are: .sp - U+000A Linefeed - U+000B Vertical tab - U+000C Form feed - U+000D Carriage return - U+0085 Next line + U+000A Linefeed (LF) + U+000B Vertical tab (VT) + U+000C Form feed (FF) + U+000D Carriage return (CR) + U+0085 Next line (NEL) U+2028 Line separator U+2029 Paragraph separator .sp @@ -2953,6 +2963,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 25 August 2012 +Last updated: 10 September 2012 Copyright (c) 1997-2012 University of Cambridge. .fi diff --git a/doc/pcretest.1 b/doc/pcretest.1 index bb57b47..163ac63 100644 --- a/doc/pcretest.1 +++ b/doc/pcretest.1 @@ -1,4 +1,4 @@ -.TH PCRETEST 1 "29 August 2012" "PCRE 8.32" +.TH PCRETEST 1 "10 September 2012" "PCRE 8.32" .SH NAME pcretest - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -77,12 +77,16 @@ Output information about a specific build-time option, then exit. This functionality is intended for use in scripts such as \fBRunTest\fP. 
The following options output the value indicated: .sp + ebcdic-nl the code for LF (= NL) in an EBCDIC environment: + 0x15 or 0x25 + 0 if used in an ASCII environment linksize the internal link size (2, 3, or 4) newline the default newline setting: CR, LF, CRLF, ANYCRLF, or ANY .sp The following options output 1 for true or zero for false: .sp + ebcdic compiled for an EBCDIC environment jit just-in-time support is available pcre16 the 16-bit library was built pcre8 the 8-bit library was built @@ -1045,6 +1049,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 29 August 2012 +Last updated: 10 September 2012 Copyright (c) 1997-2012 University of Cambridge. .fi diff --git a/pcre16_utf16_utils.c b/pcre16_utf16_utils.c index 1f97293..49ced0c 100644 --- a/pcre16_utf16_utils.c +++ b/pcre16_utf16_utils.c @@ -118,10 +118,11 @@ while (iptr < end) if (host_byte_order != NULL) *host_byte_order = host_bo; -#else /* SUPPORT_UTF */ +#else /* Not SUPPORT_UTF */ (void)(output); /* Keep picky compilers happy */ (void)(input); (void)(keep_boms); +(void)(host_byte_order); #endif /* SUPPORT_UTF */ return length; } diff --git a/pcre16_valid_utf16.c b/pcre16_valid_utf16.c index 4d97cc6..8047d23 100644 --- a/pcre16_valid_utf16.c +++ b/pcre16_valid_utf16.c @@ -138,6 +138,7 @@ for (p = string; length-- > 0; p++) #else /* SUPPORT_UTF */ (void)(string); /* Keep picky compilers happy */ (void)(length); +(void)(erroroffset); #endif /* SUPPORT_UTF */ return PCRE_UTF16_ERR0; /* This indicates success */ diff --git a/pcre_compile.c b/pcre_compile.c index 1426b9a..e2d081a 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -789,7 +789,7 @@ else if ((i = escapes[c - CHAR_0]) != 0) c = i; #else /* EBCDIC coding */ /* Not alphanumeric */ -else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {} +else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {} else if ((i = escapes[c - 0x48]) != 0) c = i; #endif @@ -3168,8 +3168,9 @@ if (next >= 0) switch(op_code) 
case OP_NOT_HSPACE: switch(next) { - case 0x09: - case 0x20: + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: case 0x1680: case 0x180e: @@ -3187,6 +3188,7 @@ if (next >= 0) switch(op_code) case 0x202f: case 0x205f: case 0x3000: +#endif /* Not EBCDIC */ return op_code == OP_NOT_HSPACE; default: return op_code != OP_NOT_HSPACE; @@ -3197,13 +3199,15 @@ if (next >= 0) switch(op_code) case OP_NOT_VSPACE: switch(next) { - case 0x0a: - case 0x0b: - case 0x0c: - case 0x0d: - case 0x85: + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif return op_code == OP_NOT_VSPACE; default: return op_code != OP_NOT_VSPACE; @@ -3261,8 +3265,9 @@ switch(op_code) case ESC_H: switch(c) { - case 0x09: - case 0x20: + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: case 0x1680: case 0x180e: @@ -3280,6 +3285,7 @@ switch(op_code) case 0x202f: case 0x205f: case 0x3000: +#endif /* Not EBCDIC */ return -next != ESC_h; default: return -next == ESC_h; @@ -3289,13 +3295,15 @@ switch(op_code) case ESC_V: switch(c) { - case 0x0a: - case 0x0b: - case 0x0c: - case 0x0d: - case 0x85: + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ return -next != ESC_v; default: return -next == ESC_v; @@ -4057,7 +4065,8 @@ for (;; ptr++) /* Perl 5.004 onwards omits VT from \s, but we must preserve it if it was previously set by something earlier in the character - class. */ + class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and + EBCDIC, so we lazily just adjust the appropriate bit. 
*/ case ESC_s: classbits[0] |= cbits[cbit_space]; @@ -4072,8 +4081,9 @@ for (;; ptr++) continue; case ESC_h: - SETBIT(classbits, 0x09); /* VT */ - SETBIT(classbits, 0x20); /* SPACE */ + SETBIT(classbits, CHAR_HT); + SETBIT(classbits, CHAR_SPACE); +#ifndef EBCDIC SETBIT(classbits, 0xa0); /* NSBP */ #ifndef COMPILE_PCRE8 xclass = TRUE; @@ -4109,6 +4119,7 @@ for (;; ptr++) class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata); } #endif +#endif /* Not EBCDIC */ continue; case ESC_H: @@ -4117,13 +4128,16 @@ for (;; ptr++) int x = 0xff; switch (c) { - case 0x09/8: x ^= 1 << (0x09%8); break; - case 0x20/8: x ^= 1 << (0x20%8); break; - case 0xa0/8: x ^= 1 << (0xa0%8); break; + case CHAR_HT/8: x ^= 1 << (CHAR_HT%8); break; + case CHAR_SPACE/8: x ^= 1 << (CHAR_SPACE%8); break; +#ifndef EBCDIC + case 0xa0/8: x ^= 1 << (0xa0%8); break; /* NSBSP */ +#endif default: break; } classbits[c] |= x; } +#ifndef EBCDIC #ifndef COMPILE_PCRE8 xclass = TRUE; *class_uchardata++ = XCL_RANGE; @@ -4150,7 +4164,7 @@ for (;; ptr++) if (utf) class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); else -#endif +#endif /* SUPPORT_UTF */ *class_uchardata++ = 0xffff; #elif defined SUPPORT_UTF if (utf) @@ -4179,14 +4193,16 @@ for (;; ptr++) class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); } #endif +#endif /* Not EBCDIC */ continue; case ESC_v: - SETBIT(classbits, 0x0a); /* LF */ - SETBIT(classbits, 0x0b); /* VT */ - SETBIT(classbits, 0x0c); /* FF */ - SETBIT(classbits, 0x0d); /* CR */ - SETBIT(classbits, 0x85); /* NEL */ + SETBIT(classbits, CHAR_LF); + SETBIT(classbits, CHAR_VT); + SETBIT(classbits, CHAR_FF); + SETBIT(classbits, CHAR_CR); + SETBIT(classbits, CHAR_NEL); +#ifndef EBCDIC #ifndef COMPILE_PCRE8 xclass = TRUE; *class_uchardata++ = XCL_RANGE; @@ -4201,6 +4217,7 @@ for (;; ptr++) class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata); } #endif +#endif /* Not EBCDIC */ continue; case ESC_V: @@ -4209,17 +4226,18 @@ for (;; ptr++) int x = 0xff; switch (c) { - case 0x0a/8: 
x ^= 1 << (0x0a%8); - x ^= 1 << (0x0b%8); - x ^= 1 << (0x0c%8); - x ^= 1 << (0x0d%8); - break; - case 0x85/8: x ^= 1 << (0x85%8); break; + case CHAR_LF/8: x ^= 1 << (CHAR_LF%8); + x ^= 1 << (CHAR_VT%8); + x ^= 1 << (CHAR_FF%8); + x ^= 1 << (CHAR_CR%8); + break; + case CHAR_NEL/8: x ^= 1 << (CHAR_NEL%8); break; default: break; } classbits[c] |= x; } +#ifndef EBCDIC #ifndef COMPILE_PCRE8 xclass = TRUE; *class_uchardata++ = XCL_RANGE; @@ -4245,6 +4263,7 @@ for (;; ptr++) class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); } #endif +#endif /* Not EBCDIC */ continue; #ifdef SUPPORT_UCP diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c index a8f2f91..ad0be6c 100644 --- a/pcre_dfa_exec.c +++ b/pcre_dfa_exec.c @@ -1370,7 +1370,7 @@ for (;;) if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0) { - int lgb, rgb; + int lgb, rgb; const pcre_uchar *nptr = ptr + clen; int ncount = 0; if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) @@ -1378,15 +1378,15 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - lgb = UCD_GRAPHBREAK(c); + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { dlen = 1; if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } - rgb = UCD_GRAPHBREAK(d); + rgb = UCD_GRAPHBREAK(d); if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - lgb = rgb; + lgb = rgb; nptr += dlen; } count++; @@ -1406,20 +1406,22 @@ for (;;) int ncount = 0; switch (c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL01; - case 0x000d: - if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + case CHAR_CR: + if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1; /* Fall through */ ANYNL01: - case 0x000a: + case CHAR_LF: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) { active_count--; /* Remove non-match 
possibility */ @@ -1446,13 +1448,15 @@ for (;;) BOOL OK; switch (c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ OK = TRUE; break; @@ -1485,8 +1489,9 @@ for (;;) BOOL OK; switch (c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ @@ -1504,6 +1509,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif /* Not EBCDIC */ OK = TRUE; break; @@ -1629,7 +1635,7 @@ for (;;) ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { - int lgb, rgb; + int lgb, rgb; const pcre_uchar *nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || @@ -1638,15 +1644,15 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - lgb = UCD_GRAPHBREAK(c); + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { dlen = 1; if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } - rgb = UCD_GRAPHBREAK(d); + rgb = UCD_GRAPHBREAK(d); if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - lgb = rgb; + lgb = rgb; nptr += dlen; } ADD_NEW_DATA(-(state_offset + count), 0, ncount); @@ -1673,20 +1679,22 @@ for (;;) int ncount = 0; switch (c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL02; - case 0x000d: - if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + case CHAR_CR: + if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1; /* Fall through */ ANYNL02: - case 0x000a: + case CHAR_LF: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || 
codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) { @@ -1721,13 +1729,15 @@ for (;;) BOOL OK; switch (c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ OK = TRUE; break; @@ -1767,8 +1777,9 @@ for (;;) BOOL OK; switch (c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ @@ -1786,6 +1797,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif /* Not EBCDIC */ OK = TRUE; break; @@ -1899,7 +1911,7 @@ for (;;) count = current_state->count; /* Number already matched */ if (clen > 0) { - int lgb, rgb; + int lgb, rgb; const pcre_uchar *nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) @@ -1907,15 +1919,15 @@ for (;;) active_count--; /* Remove non-match possibility */ next_active_state--; } - lgb = UCD_GRAPHBREAK(c); + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { dlen = 1; if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } - rgb = UCD_GRAPHBREAK(d); + rgb = UCD_GRAPHBREAK(d); if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - lgb = rgb; + lgb = rgb; nptr += dlen; } if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) @@ -1941,20 +1953,22 @@ for (;;) int ncount = 0; switch (c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL03; - case 0x000d: - if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; + case CHAR_CR: + if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1; /* Fall through */ ANYNL03: - case 
0x000a: + case CHAR_LF: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) { active_count--; /* Remove non-match possibility */ @@ -1985,13 +1999,15 @@ for (;;) BOOL OK; switch (c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ OK = TRUE; break; @@ -2027,8 +2043,9 @@ for (;;) BOOL OK; switch (c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ @@ -2046,6 +2063,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif /* Not EBCDIC */ OK = TRUE; break; @@ -2123,18 +2141,18 @@ for (;;) case OP_EXTUNI: if (clen > 0) { - int lgb, rgb; + int lgb, rgb; const pcre_uchar *nptr = ptr + clen; int ncount = 0; - lgb = UCD_GRAPHBREAK(c); + lgb = UCD_GRAPHBREAK(c); while (nptr < end_subject) { dlen = 1; if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } - rgb = UCD_GRAPHBREAK(d); + rgb = UCD_GRAPHBREAK(d); if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; ncount++; - lgb = rgb; + lgb = rgb; nptr += dlen; } if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) @@ -2152,25 +2170,27 @@ for (;;) case OP_ANYNL: if (clen > 0) switch(c) { - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; - case 0x000a: + case CHAR_LF: ADD_NEW(state_offset + 1, 0); break; - case 0x000d: + case CHAR_CR: if (ptr + 1 >= end_subject) { ADD_NEW(state_offset + 1, 0); if ((md->moptions & PCRE_PARTIAL_HARD) != 0) reset_could_continue = TRUE; } - else if (ptr[1] == 0x0a) + else if (ptr[1] == CHAR_LF) { 
ADD_NEW_DATA(-(state_offset + 1), 0, 1); } @@ -2186,13 +2206,15 @@ for (;;) case OP_NOT_VSPACE: if (clen > 0) switch(c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ break; default: @@ -2205,13 +2227,15 @@ for (;;) case OP_VSPACE: if (clen > 0) switch(c) { - case 0x000a: - case 0x000b: - case 0x000c: - case 0x000d: - case 0x0085: + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ ADD_NEW(state_offset + 1, 0); break; @@ -2223,8 +2247,9 @@ for (;;) case OP_NOT_HSPACE: if (clen > 0) switch(c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ @@ -2242,6 +2267,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif /* Not EBCDIC */ break; default: @@ -2254,8 +2280,9 @@ for (;;) case OP_HSPACE: if (clen > 0) switch(c) { - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ @@ -2273,6 +2300,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif /* Not EBCDIC */ ADD_NEW(state_offset + 1, 0); break; } diff --git a/pcre_exec.c b/pcre_exec.c index c297da2..7bb85d0 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -2415,22 +2415,24 @@ for (;;) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: + case CHAR_CR: if (eptr >= md->end_subject) { SCHECK_PARTIAL(); } - else if (*eptr == 0x0a) eptr++; + else if (*eptr == CHAR_LF) eptr++; break; - 
case 0x000a: + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } @@ -2447,8 +2449,9 @@ for (;;) switch(c) { default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ @@ -2466,6 +2469,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif /* Not EBCDIC */ RRETURN(MATCH_NOMATCH); } ecode++; @@ -2481,8 +2485,9 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ @@ -2500,6 +2505,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif /* Not EBCDIC */ break; } ecode++; @@ -2515,13 +2521,15 @@ for (;;) switch(c) { default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ +#endif /* Not EBCDIC */ RRETURN(MATCH_NOMATCH); } ecode++; @@ -2537,13 +2545,15 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ +#endif /* Not EBCDIC */ break; } 
ecode++; @@ -4313,18 +4323,20 @@ for (;;) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + case CHAR_CR: + if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; break; - case 0x000a: + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } @@ -4343,8 +4355,9 @@ for (;;) switch(c) { default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ @@ -4362,6 +4375,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif /* Not EBCDIC */ RRETURN(MATCH_NOMATCH); } } @@ -4379,8 +4393,9 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ @@ -4398,6 +4413,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif break; } } @@ -4415,13 +4431,15 @@ for (;;) switch(c) { default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ +#endif RRETURN(MATCH_NOMATCH); } } @@ -4439,13 +4457,15 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ + case CHAR_LF: + case CHAR_VT: + 
case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ +#endif break; } } @@ -4604,16 +4624,16 @@ for (;;) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + case CHAR_CR: + if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; break; - case 0x000a: + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: #ifdef COMPILE_PCRE16 case 0x2028: case 0x2029: @@ -4635,8 +4655,9 @@ for (;;) switch(*eptr++) { default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ #ifdef COMPILE_PCRE16 case 0x1680: /* OGHAM SPACE MARK */ @@ -4655,7 +4676,8 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ -#endif +#endif /* COMPILE_PCRE16 */ +#endif /* Not EBCDIC */ RRETURN(MATCH_NOMATCH); } } @@ -4672,8 +4694,9 @@ for (;;) switch(*eptr++) { default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ #ifdef COMPILE_PCRE16 case 0x1680: /* OGHAM SPACE MARK */ @@ -4692,7 +4715,8 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ -#endif +#endif /* COMPILE_PCRE16 */ +#endif /* Not EBCDIC */ break; } } @@ -4709,11 +4733,11 @@ for (;;) switch(*eptr++) { default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: #ifdef COMPILE_PCRE16 case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ @@ -4734,11 +4758,11 @@ for (;;) switch(*eptr++) { default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 
0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: #ifdef COMPILE_PCRE16 case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ @@ -5100,17 +5124,20 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + case CHAR_CR: + if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; break; - case 0x000a: + + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: case 0x2029: +#endif /* Not EBCDIC */ if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } @@ -5120,8 +5147,9 @@ for (;;) switch(c) { default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ @@ -5139,6 +5167,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif /* Not EBCDIC */ RRETURN(MATCH_NOMATCH); } break; @@ -5147,8 +5176,9 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ @@ -5166,6 +5196,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif /* Not EBCDIC */ break; } break; @@ -5174,13 +5205,15 @@ for (;;) switch(c) { default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: /* LINE SEPARATOR */ case 
0x2029: /* PARAGRAPH SEPARATOR */ +#endif /* Not EBCDIC */ RRETURN(MATCH_NOMATCH); } break; @@ -5189,13 +5222,15 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ +#endif /* Not EBCDIC */ break; } break; @@ -5274,16 +5309,16 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + case CHAR_CR: + if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++; break; - case 0x000a: + case CHAR_LF: break; - case 0x000b: - case 0x000c: - case 0x0085: + case CHAR_VT: + case CHAR_FF: + case CHAR_NEL: #ifdef COMPILE_PCRE16 case 0x2028: case 0x2029: @@ -5297,8 +5332,9 @@ for (;;) switch(c) { default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ #ifdef COMPILE_PCRE16 case 0x1680: /* OGHAM SPACE MARK */ @@ -5317,7 +5353,8 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ -#endif +#endif /* COMPILE_PCRE16 */ +#endif /* Not EBCDIC */ RRETURN(MATCH_NOMATCH); } break; @@ -5326,8 +5363,9 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ #ifdef COMPILE_PCRE16 case 0x1680: /* OGHAM SPACE MARK */ @@ -5346,7 +5384,8 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ -#endif +#endif /* COMPILE_PCRE16 */ +#endif /* Not EBCDIC */ break; } break; @@ -5355,11 +5394,11 @@ for (;;) switch(c) { default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ 
- case 0x0d: /* CR */ - case 0x85: /* NEL */ + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: #ifdef COMPILE_PCRE16 case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ @@ -5372,11 +5411,11 @@ for (;;) switch(c) { default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: #ifdef COMPILE_PCRE16 case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ @@ -5754,17 +5793,20 @@ for (;;) break; } GETCHARLEN(c, eptr, len); - if (c == 0x000d) + if (c == CHAR_CR) { if (++eptr >= md->end_subject) break; - if (*eptr == 0x000a) eptr++; + if (*eptr == CHAR_LF) eptr++; } else { - if (c != 0x000a && + if (c != CHAR_LF && (md->bsr_anycrlf || - (c != 0x000b && c != 0x000c && - c != 0x0085 && c != 0x2028 && c != 0x2029))) + (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL +#ifndef EBCDIC + && c != 0x2028 && c != 0x2029 +#endif /* Not EBCDIC */ + ))) break; eptr += len; } @@ -5786,8 +5828,9 @@ for (;;) switch(c) { default: gotspace = FALSE; break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ + case CHAR_HT: + case CHAR_SPACE: +#ifndef EBCDIC case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ @@ -5805,6 +5848,7 @@ for (;;) case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif /* Not EBCDIC */ gotspace = TRUE; break; } @@ -5828,13 +5872,15 @@ for (;;) switch(c) { default: gotspace = FALSE; break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: + case CHAR_NEL: +#ifndef EBCDIC case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ +#endif /* Not EBCDIC */ gotspace = TRUE; break; } @@ 
-5950,8 +5996,8 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ BACKCHAR(eptr); - if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' && - eptr[-1] == '\r') eptr--; + if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_NL && + eptr[-1] == CHAR_CR) eptr--; } } else @@ -6002,19 +6048,19 @@ for (;;) break; } c = *eptr; - if (c == 0x000d) + if (c == CHAR_CR) { if (++eptr >= md->end_subject) break; - if (*eptr == 0x000a) eptr++; + if (*eptr == CHAR_LF) eptr++; } else { - if (c != 0x000a && (md->bsr_anycrlf || - (c != 0x000b && c != 0x000c && c != 0x0085 + if (c != CHAR_LF && (md->bsr_anycrlf || + (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL #ifdef COMPILE_PCRE16 - && c != 0x2028 && c != 0x2029 + && c != 0x2028 && c != 0x2029 #endif - ))) break; + ))) break; eptr++; } } @@ -6029,11 +6075,14 @@ for (;;) break; } c = *eptr; - if (c == 0x09 || c == 0x20 || c == 0xa0 + if (c == CHAR_HT || c == CHAR_SPACE +#ifndef EBCDIC + || c == 0xa0 #ifdef COMPILE_PCRE16 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A) || c == 0x202f || c == 0x205f || c == 0x3000 -#endif +#endif /* COMPILE_PCRE16 */ +#endif /* Not EBCDIC */ ) break; eptr++; } @@ -6048,11 +6097,14 @@ for (;;) break; } c = *eptr; - if (c != 0x09 && c != 0x20 && c != 0xa0 + if (c != CHAR_HT && c != CHAR_SPACE +#ifndef EBCDIC + && c != 0xa0 #ifdef COMPILE_PCRE16 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A) && c != 0x202f && c != 0x205f && c != 0x3000 -#endif +#endif /* COMPILE_PCRE16 */ +#endif /* Not EBCDIC */ ) break; eptr++; } @@ -6067,7 +6119,8 @@ for (;;) break; } c = *eptr; - if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85 + if (c == CHAR_LF || c == CHAR_VT || c == CHAR_FF || + c == CHAR_CR || c == CHAR_NEL #ifdef COMPILE_PCRE16 || c == 0x2028 || c == 0x2029 #endif @@ -6085,7 +6138,8 @@ for (;;) break; } c = *eptr; - if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85 + if (c != CHAR_LF && c != 
CHAR_VT && c != CHAR_FF && + c != CHAR_CR && c != CHAR_NEL #ifdef COMPILE_PCRE16 && c != 0x2028 && c != 0x2029 #endif @@ -6188,8 +6242,8 @@ for (;;) RMATCH(eptr, ecode, offset_top, md, eptrb, RM47); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; - if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' && - eptr[-1] == '\r') eptr--; + if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF && + eptr[-1] == CHAR_CR) eptr--; } } diff --git a/pcre_internal.h b/pcre_internal.h index 49530f9..7e94232 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -984,11 +984,12 @@ same code point. */ #else /* Not EBCDIC */ /* In ASCII/Unicode, linefeed is '\n' and we equate this to NL for -compatibility. NEL is the Unicode newline character. */ +compatibility. NEL is the Unicode newline character; make sure it is +a positive value. */ #define CHAR_LF '\n' #define CHAR_NL CHAR_LF -#define CHAR_NEL '\x85' +#define CHAR_NEL ((unsigned char)'\x85') #define CHAR_ESC '\033' #define CHAR_DEL '\177' @@ -1262,7 +1263,7 @@ only. */ #define CHAR_CR '\015' #define CHAR_LF '\012' #define CHAR_NL CHAR_LF -#define CHAR_NEL '\x85' +#define CHAR_NEL ((unsigned char)'\x85') #define CHAR_BS '\010' #define CHAR_BEL '\007' #define CHAR_ESC '\033' diff --git a/pcre_maketables.c b/pcre_maketables.c index 1275cb2..8e466cc 100644 --- a/pcre_maketables.c +++ b/pcre_maketables.c @@ -127,7 +127,7 @@ within regexes. */ for (i = 0; i < 256; i++) { int x = 0; - if (i != 0x0b && isspace(i)) x += ctype_space; + if (i != CHAR_VT && isspace(i)) x += ctype_space; if (isalpha(i)) x += ctype_letter; if (isdigit(i)) x += ctype_digit; if (isxdigit(i)) x += ctype_xdigit; diff --git a/pcre_newline.c b/pcre_newline.c index a0a13c8..5e257d8 100644 --- a/pcre_newline.c +++ b/pcre_newline.c @@ -60,7 +60,7 @@ http://unicode.org/unicode/reports/tr18/. */ *************************************************/ /* It is guaranteed that the initial value of ptr is less than the end of the -string that is being processed. 
+string that is being processed. Arguments: ptr pointer to possible newline @@ -86,12 +86,14 @@ if (utf) else #endif /* SUPPORT_UTF */ c = *ptr; + +/* Note that this function is called only for ANY or ANYCRLF. */ if (type == NLTYPE_ANYCRLF) switch(c) { - case 0x000a: *lenptr = 1; return TRUE; /* LF */ - case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; - return TRUE; /* CR */ + case CHAR_LF: *lenptr = 1; return TRUE; + case CHAR_CR: *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1; + return TRUE; default: return FALSE; } @@ -99,20 +101,29 @@ if (type == NLTYPE_ANYCRLF) switch(c) else switch(c) { - case 0x000a: /* LF */ - case 0x000b: /* VT */ - case 0x000c: *lenptr = 1; return TRUE; /* FF */ - case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; - return TRUE; /* CR */ +#ifdef EBCDIC + case CHAR_NEL: +#endif + case CHAR_LF: + case CHAR_VT: + case CHAR_FF: *lenptr = 1; return TRUE; + + case CHAR_CR: + *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1; + return TRUE; + +#ifndef EBCDIC #ifdef COMPILE_PCRE8 - case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */ + case CHAR_NEL: *lenptr = utf? 2 : 1; return TRUE; case 0x2028: /* LS */ case 0x2029: *lenptr = 3; return TRUE; /* PS */ -#else - case 0x0085: /* NEL */ +#else /* 16-bit (can't be EBCDIC) */ + case CHAR_NEL: case 0x2028: /* LS */ case 0x2029: *lenptr = 1; return TRUE; /* PS */ -#endif /* COMPILE_PCRE8 */ +#endif /* COMPILE_PCRE8 */ +#endif /* Not EBCDIC */ + default: return FALSE; } } @@ -153,30 +164,45 @@ else #endif /* SUPPORT_UTF */ c = *ptr; +/* Note that this function is called only for ANY or ANYCRLF. */ + if (type == NLTYPE_ANYCRLF) switch(c) { - case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1; - return TRUE; /* LF */ - case 0x000d: *lenptr = 1; return TRUE; /* CR */ + case CHAR_LF: + *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 
2 : 1; + return TRUE; + + case CHAR_CR: *lenptr = 1; return TRUE; default: return FALSE; } +/* NLTYPE_ANY */ + else switch(c) { - case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1; - return TRUE; /* LF */ - case 0x000b: /* VT */ - case 0x000c: /* FF */ - case 0x000d: *lenptr = 1; return TRUE; /* CR */ + case CHAR_LF: + *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1; + return TRUE; + +#ifdef EBCDIC + case CHAR_NEL: +#endif + case CHAR_VT: + case CHAR_FF: + case CHAR_CR: *lenptr = 1; return TRUE; + +#ifndef EBCDIC #ifdef COMPILE_PCRE8 - case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */ - case 0x2028: /* LS */ - case 0x2029: *lenptr = 3; return TRUE; /* PS */ + case CHAR_NEL: *lenptr = utf? 2 : 1; return TRUE; + case 0x2028: /* LS */ + case 0x2029: *lenptr = 3; return TRUE; /* PS */ #else - case 0x0085: /* NEL */ + case CHAR_NEL: case 0x2028: /* LS */ case 0x2029: *lenptr = 1; return TRUE; /* PS */ -#endif /* COMPILE_PCRE8 */ +#endif /* COMPILE_PCRE8 */ +#endif /* NotEBCDIC */ + default: return FALSE; } } diff --git a/pcre_study.c b/pcre_study.c index 0359e45..805c28f 100644 --- a/pcre_study.c +++ b/pcre_study.c @@ -567,15 +567,15 @@ if (utf && c > 127) #endif /* Not SUPPORT_UCP */ return p; } -#else /* Not SUPPORT_UTF */ +#else /* Not SUPPORT_UTF */ (void)(utf); /* Stops warning for unused parameter */ -#endif +#endif /* SUPPORT_UTF */ /* Not UTF-8 mode, or character is less than 127. */ if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); return p + 1; -#endif +#endif /* COMPILE_PCRE8 */ #ifdef COMPILE_PCRE16 if (c > 0xff) @@ -597,10 +597,12 @@ if (utf && c > 127) c = 0xff; SET_BIT(c); } -#endif +#endif /* SUPPORT_UCP */ return p; } -#endif +#else /* Not SUPPORT_UTF */ +(void)(utf); /* Stops warning for unused parameter */ +#endif /* SUPPORT_UTF */ if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); return p + 1; @@ -988,8 +990,8 @@ do identical. 
*/ case OP_HSPACE: - SET_BIT(0x09); - SET_BIT(0x20); + SET_BIT(CHAR_HT); + SET_BIT(CHAR_SPACE); #ifdef SUPPORT_UTF if (utf) { @@ -998,45 +1000,47 @@ do SET_BIT(0xE1); /* For U+1680, U+180E */ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ SET_BIT(0xE3); /* For U+3000 */ -#endif +#endif /* COMPILE_PCRE8 */ #ifdef COMPILE_PCRE16 SET_BIT(0xA0); SET_BIT(0xFF); /* For characters > 255 */ -#endif +#endif /* COMPILE_PCRE16 */ } else #endif /* SUPPORT_UTF */ { +#ifndef EBCDIC SET_BIT(0xA0); +#endif /* Not EBCDIC */ #ifdef COMPILE_PCRE16 SET_BIT(0xFF); /* For characters > 255 */ -#endif +#endif /* COMPILE_PCRE16 */ } try_next = FALSE; break; case OP_ANYNL: case OP_VSPACE: - SET_BIT(0x0A); - SET_BIT(0x0B); - SET_BIT(0x0C); - SET_BIT(0x0D); + SET_BIT(CHAR_LF); + SET_BIT(CHAR_VT); + SET_BIT(CHAR_FF); + SET_BIT(CHAR_CR); #ifdef SUPPORT_UTF if (utf) { #ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+0085 */ SET_BIT(0xE2); /* For U+2028, U+2029 */ -#endif +#endif /* COMPILE_PCRE8 */ #ifdef COMPILE_PCRE16 - SET_BIT(0x85); + SET_BIT(CHAR_NEL); SET_BIT(0xFF); /* For characters > 255 */ -#endif +#endif /* COMPILE_PCRE16 */ } else #endif /* SUPPORT_UTF */ { - SET_BIT(0x85); + SET_BIT(CHAR_NEL); #ifdef COMPILE_PCRE16 SET_BIT(0xFF); /* For characters > 255 */ #endif @@ -1060,7 +1064,8 @@ do break; /* The cbit_space table has vertical tab as whitespace; we have to - ensure it is set as not whitespace. */ + ensure it is set as not whitespace. Luckily, the code value is the same + (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */ case OP_NOT_WHITESPACE: set_nottype_bits(start_bits, cbit_space, table_limit, cd); @@ -1068,8 +1073,9 @@ do try_next = FALSE; break; - /* The cbit_space table has vertical tab as whitespace; we have to - not set it from the table. */ + /* The cbit_space table has vertical tab as whitespace; we have to not + set it from the table. 
Luckily, the code value is the same (0x0b) in + ASCII and EBCDIC, so we can just adjust the appropriate bit. */ case OP_WHITESPACE: c = start_bits[1]; /* Save in case it was already set */ @@ -1123,8 +1129,8 @@ do return SSB_FAIL; case OP_HSPACE: - SET_BIT(0x09); - SET_BIT(0x20); + SET_BIT(CHAR_HT); + SET_BIT(CHAR_SPACE); #ifdef SUPPORT_UTF if (utf) { @@ -1133,38 +1139,40 @@ do SET_BIT(0xE1); /* For U+1680, U+180E */ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ SET_BIT(0xE3); /* For U+3000 */ -#endif +#endif /* COMPILE_PCRE8 */ #ifdef COMPILE_PCRE16 SET_BIT(0xA0); SET_BIT(0xFF); /* For characters > 255 */ -#endif +#endif /* COMPILE_PCRE16 */ } else #endif /* SUPPORT_UTF */ +#ifndef EBCDIC SET_BIT(0xA0); +#endif /* Not EBCDIC */ break; case OP_ANYNL: case OP_VSPACE: - SET_BIT(0x0A); - SET_BIT(0x0B); - SET_BIT(0x0C); - SET_BIT(0x0D); + SET_BIT(CHAR_LF); + SET_BIT(CHAR_VT); + SET_BIT(CHAR_FF); + SET_BIT(CHAR_CR); #ifdef SUPPORT_UTF if (utf) { #ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+0085 */ SET_BIT(0xE2); /* For U+2028, U+2029 */ -#endif +#endif /* COMPILE_PCRE8 */ #ifdef COMPILE_PCRE16 - SET_BIT(0x85); + SET_BIT(CHAR_NEL); SET_BIT(0xFF); /* For characters > 255 */ -#endif +#endif /* COMPILE_PCRE16 */ } else #endif /* SUPPORT_UTF */ - SET_BIT(0x85); + SET_BIT(CHAR_NEL); break; case OP_NOT_DIGIT: @@ -1176,7 +1184,9 @@ do break; /* The cbit_space table has vertical tab as whitespace; we have to - ensure it gets set as not whitespace. */ + ensure it gets set as not whitespace. Luckily, the code value is the + same (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate + bit. */ case OP_NOT_WHITESPACE: set_nottype_bits(start_bits, cbit_space, table_limit, cd); @@ -1184,7 +1194,8 @@ do break; /* The cbit_space table has vertical tab as whitespace; we have to - avoid setting it. */ + avoid setting it. Luckily, the code value is the same (0x0b) in ASCII + and EBCDIC, so we can just adjust the appropriate bit. 
*/ case OP_WHITESPACE: c = start_bits[1]; /* Save in case it was already set */ @@ -933,12 +933,12 @@ switch(endlinetype) switch (c) { - case 0x0a: /* LF */ + case '\n': *lenptr = 1; return p; - case 0x0d: /* CR */ - if (p < endptr && *p == 0x0a) + case '\r': + if (p < endptr && *p == '\n') { *lenptr = 2; p++; @@ -977,14 +977,14 @@ switch(endlinetype) switch (c) { - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ + case '\n': /* LF */ + case '\v': /* VT */ + case '\f': /* FF */ *lenptr = 1; return p; - case 0x0d: /* CR */ - if (p < endptr && *p == 0x0a) + case '\r': /* CR */ + if (p < endptr && *p == '\n') { *lenptr = 2; p++; @@ -992,14 +992,16 @@ switch(endlinetype) else *lenptr = 1; return p; - case 0x85: /* NEL */ +#ifndef EBCDIC + case 0x85: /* Unicode NEL */ *lenptr = utf8? 2 : 1; return p; - case 0x2028: /* LS */ - case 0x2029: /* PS */ + case 0x2028: /* Unicode LS */ + case 0x2029: /* Unicode PS */ *lenptr = 3; return p; +#endif /* Not EBCDIC */ default: break; @@ -1083,8 +1085,8 @@ switch(endlinetype) if (endlinetype == EL_ANYCRLF) switch (c) { - case 0x0a: /* LF */ - case 0x0d: /* CR */ + case '\n': /* LF */ + case '\r': /* CR */ return p; default: @@ -1093,13 +1095,15 @@ switch(endlinetype) else switch (c) { - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LS */ - case 0x2029: /* PS */ + case '\n': /* LF */ + case '\v': /* VT */ + case '\f': /* FF */ + case '\r': /* CR */ +#ifndef EBCDIC + case 0x85: /* Unicode NEL */ + case 0x2028: /* Unicode LS */ + case 0x2029: /* Unicode PS */ +#endif /* Not EBCDIC */ return p; default: @@ -1103,15 +1103,17 @@ return sys_errlist[n]; *************************************************/ /* -Argument: the return code from PCRE_CONFIG_NEWLINE -Returns: nothing +Arguments: + rc the return code from PCRE_CONFIG_NEWLINE + isc TRUE if called from "-C newline" +Returns: nothing */ static void -print_newline_config(int rc) 
+print_newline_config(int rc, BOOL isc) { const char *s = NULL; -printf(" Newline sequence is "); +if (!isc) printf(" Newline sequence is "); switch(rc) { case CHAR_CR: s = "CR"; break; @@ -2407,9 +2409,8 @@ while (argc > 1 && argv[op][0] == '-') (void)PCRE_CONFIG(PCRE_CONFIG_LINK_SIZE, &rc); printf("%d\n", rc); yield = rc; - goto EXIT; } - if (strcmp(argv[op + 1], "pcre8") == 0) + else if (strcmp(argv[op + 1], "pcre8") == 0) { #ifdef SUPPORT_PCRE8 printf("1\n"); @@ -2418,9 +2419,8 @@ while (argc > 1 && argv[op][0] == '-') printf("0\n"); yield = 0; #endif - goto EXIT; } - if (strcmp(argv[op + 1], "pcre16") == 0) + else if (strcmp(argv[op + 1], "pcre16") == 0) { #ifdef SUPPORT_PCRE16 printf("1\n"); @@ -2429,9 +2429,8 @@ while (argc > 1 && argv[op][0] == '-') printf("0\n"); yield = 0; #endif - goto EXIT; } - if (strcmp(argv[op + 1], "utf") == 0) + else if (strcmp(argv[op + 1], "utf") == 0) { #ifdef SUPPORT_PCRE8 (void)pcre_config(PCRE_CONFIG_UTF8, &rc); @@ -2442,31 +2441,49 @@ while (argc > 1 && argv[op][0] == '-') printf("%d\n", rc); yield = rc; #endif - goto EXIT; } - if (strcmp(argv[op + 1], "ucp") == 0) + else if (strcmp(argv[op + 1], "ucp") == 0) { (void)PCRE_CONFIG(PCRE_CONFIG_UNICODE_PROPERTIES, &rc); printf("%d\n", rc); yield = rc; - goto EXIT; } - if (strcmp(argv[op + 1], "jit") == 0) + else if (strcmp(argv[op + 1], "jit") == 0) { (void)PCRE_CONFIG(PCRE_CONFIG_JIT, &rc); printf("%d\n", rc); yield = rc; - goto EXIT; } - if (strcmp(argv[op + 1], "newline") == 0) + else if (strcmp(argv[op + 1], "newline") == 0) { (void)PCRE_CONFIG(PCRE_CONFIG_NEWLINE, &rc); - print_newline_config(rc); - goto EXIT; + print_newline_config(rc, TRUE); } - printf("Unknown -C option: %s\n", argv[op + 1]); + else if (strcmp(argv[op + 1], "ebcdic") == 0) + { +#ifdef EBCDIC + printf("1\n"); + yield = 1; +#else + printf("0\n"); +#endif + } + else if (strcmp(argv[op + 1], "ebcdic-nl") == 0) + { +#ifdef EBCDIC + printf("0x%02x\n", CHAR_LF); +#else + printf("0\n"); +#endif + } + else + { + 
printf("Unknown -C option: %s\n", argv[op + 1]); + } goto EXIT; } + + /* No argument for -C: output all configuration information. */ printf("PCRE version %s\n", version); printf("Compiled with\n"); @@ -2507,7 +2524,7 @@ are set, either both UTFs are supported or both are not supported. */ else printf(" No just-in-time compiler support\n"); (void)PCRE_CONFIG(PCRE_CONFIG_NEWLINE, &rc); - print_newline_config(rc); + print_newline_config(rc, FALSE); (void)PCRE_CONFIG(PCRE_CONFIG_BSR, &rc); printf(" \\R matches %s\n", rc? "CR, LF, or CRLF only" : "all Unicode newlines"); diff --git a/testdata/testinputEBC b/testdata/testinputEBC new file mode 100644 index 0000000..56efcd0 --- /dev/null +++ b/testdata/testinputEBC @@ -0,0 +1,121 @@ +/-- This is a specialized test for checking, when PCRE is compiled with the +EBCDIC option but in an ASCII environment, that newline and white space +functionality is working. It catches cases where explicit values such as 0x0a +have been used instead of names like CHAR_LF. Needless to say, it is not a +genuine EBCDIC test! In patterns, alphabetic characters that follow a backslash +must be in EBCDIC code. In data, newlines and other spacing characters must be +in EBCDIC, but can be specified as escapes. --/ + +/-- Test default newline and variations --/ + +/^A/m + ABC + 12\x15ABC + +/^A/m<any> + 12\x15ABC + 12\x0dABC + 12\x0d\x15ABC + 12\x25ABC + +/^A/m<anycrlf> + 12\x15ABC + 12\x0dABC + 12\x0d\x15ABC + ** Fail + 12\x25ABC + +/-- Test \h --/ + +/^A\ˆ/ + A B + +/-- Test \H --/ + +/^A\È/ + AB + ** Fail + A B + +/-- Test \R --/ + +/^A\Ù/ + A\x15B + A\x0dB + A\x25B + A\x0bB + A\x0cB + ** Fail + A B + +/-- Test \v --/ + +/^A\¥/ + A\x15B + A\x0dB + A\x25B + A\x0bB + A\x0cB + ** Fail + A B + +/-- Test \V --/ + +/^A\å/ + A B + ** Fail + A\x15B + A\x0dB + A\x25B + A\x0bB + A\x0cB + +/-- For repeated items, use an atomic group so that the output is the same +for DFA matching (otherwise it may show multiple matches). 
--/ + +/-- Test \h+ --/ + +/^A(?>\ˆ+)/ + A B + +/-- Test \H+ --/ + +/^A(?>\È+)/ + AB + ** Fail + A B + +/-- Test \R+ --/ + +/^A(?>\Ù+)/ + A\x15B + A\x0dB + A\x25B + A\x0bB + A\x0cB + ** Fail + A B + +/-- Test \v+ --/ + +/^A(?>\¥+)/ + A\x15B + A\x0dB + A\x25B + A\x0bB + A\x0cB + ** Fail + A B + +/-- Test \V+ --/ + +/^A(?>\å+)/ + A B + ** Fail + A\x15B + A\x0dB + A\x25B + A\x0bB + A\x0cB + +/-- End --/ diff --git a/testdata/testoutputEBC b/testdata/testoutputEBC new file mode 100644 index 0000000..abbfdc4 --- /dev/null +++ b/testdata/testoutputEBC @@ -0,0 +1,182 @@ +/-- This is a specialized test for checking, when PCRE is compiled with the +EBCDIC option but in an ASCII environment, that newline and white space +functionality is working. It catches cases where explicit values such as 0x0a +have been used instead of names like CHAR_LF. Needless to say, it is not a +genuine EBCDIC test! In patterns, alphabetic characters that follow a backslash +must be in EBCDIC code. In data, newlines and other spacing characters must be +in EBCDIC, but can be specified as escapes. 
--/ + +/-- Test default newline and variations --/ + +/^A/m + ABC + 0: A + 12\x15ABC + 0: A + +/^A/m<any> + 12\x15ABC + 0: A + 12\x0dABC + 0: A + 12\x0d\x15ABC + 0: A + 12\x25ABC + 0: A + +/^A/m<anycrlf> + 12\x15ABC + 0: A + 12\x0dABC + 0: A + 12\x0d\x15ABC + 0: A + ** Fail +No match + 12\x25ABC +No match + +/-- Test \h --/ + +/^A\ˆ/ + A B + 0: A\x20 + +/-- Test \H --/ + +/^A\È/ + AB + 0: AB + ** Fail +No match + A B +No match + +/-- Test \R --/ + +/^A\Ù/ + A\x15B + 0: A\x15 + A\x0dB + 0: A\x0d + A\x25B + 0: A\x25 + A\x0bB + 0: A\x0b + A\x0cB + 0: A\x0c + ** Fail +No match + A B +No match + +/-- Test \v --/ + +/^A\¥/ + A\x15B + 0: A\x15 + A\x0dB + 0: A\x0d + A\x25B + 0: A\x25 + A\x0bB + 0: A\x0b + A\x0cB + 0: A\x0c + ** Fail +No match + A B +No match + +/-- Test \V --/ + +/^A\å/ + A B + 0: A\x20 + ** Fail +No match + A\x15B +No match + A\x0dB +No match + A\x25B +No match + A\x0bB +No match + A\x0cB +No match + +/-- For repeated items, use an atomic group so that the output is the same +for DFA matching (otherwise it may show multiple matches). --/ + +/-- Test \h+ --/ + +/^A(?>\ˆ+)/ + A B + 0: A\x20 + +/-- Test \H+ --/ + +/^A(?>\È+)/ + AB + 0: AB + ** Fail +No match + A B +No match + +/-- Test \R+ --/ + +/^A(?>\Ù+)/ + A\x15B + 0: A\x15 + A\x0dB + 0: A\x0d + A\x25B + 0: A\x25 + A\x0bB + 0: A\x0b + A\x0cB + 0: A\x0c + ** Fail +No match + A B +No match + +/-- Test \v+ --/ + +/^A(?>\¥+)/ + A\x15B + 0: A\x15 + A\x0dB + 0: A\x0d + A\x25B + 0: A\x25 + A\x0bB + 0: A\x0b + A\x0cB + 0: A\x0c + ** Fail +No match + A B +No match + +/-- Test \V+ --/ + +/^A(?>\å+)/ + A B + 0: A\x20B + ** Fail +No match + A\x15B +No match + A\x0dB +No match + A\x25B +No match + A\x0bB +No match + A\x0cB +No match + +/-- End --/ |