summary refs log tree commit diff
path: root/libjava/classpath/gnu/regexp/RE.java
diff options
context:
space:
mode:
Diffstat (limited to 'libjava/classpath/gnu/regexp/RE.java')
-rw-r--r-- libjava/classpath/gnu/regexp/RE.java 190
1 files changed, 161 insertions, 29 deletions
diff --git a/libjava/classpath/gnu/regexp/RE.java b/libjava/classpath/gnu/regexp/RE.java
index ef606a6d8a7..e0665f097c2 100644
--- a/libjava/classpath/gnu/regexp/RE.java
+++ b/libjava/classpath/gnu/regexp/RE.java
@@ -41,6 +41,7 @@ import java.io.Serializable;
import java.util.Locale;
import java.util.PropertyResourceBundle;
import java.util.ResourceBundle;
+import java.util.Stack;
import java.util.Vector;
/**
@@ -78,13 +79,18 @@ import java.util.Vector;
* <P>
*
* These methods all have similar argument lists. The input can be a
- * String, a character array, a StringBuffer, or an
+ * CharIndexed, a String, a character array, a StringBuffer, or an
* InputStream of some sort. Note that when using an
* InputStream, the stream read position cannot be guaranteed after
* attempting a match (this is not a bug, but a consequence of the way
* regular expressions work). Using an REMatchEnumeration can
* eliminate most positioning problems.
*
+ * Although the input object can be of various types, it is recommended
+ * that it be a CharIndexed because {@link CharIndexed#getLastMatch()}
+ * can report the last match found on this input, which lets the \G
+ * operator match at the end of the previous match.
+ *
* <P>
*
* The optional index argument specifies the offset from the beginning
@@ -235,6 +241,17 @@ public class RE extends REToken {
*/
public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
+ /**
+ * Compilation flag. Allow whitespace and comments in pattern.
+ * This is equivalent to the "/x" operator in Perl.
+ */
+ public static final int REG_X_COMMENTS = 0x0400;
+
+ /**
+ * Compilation flag. If set, REG_ICASE is effective only for US-ASCII.
+ */
+ public static final int REG_ICASE_USASCII = 0x0800;
+
/** Returns a string representing the version of the gnu.regexp package. */
public static final String version() {
return VERSION;
@@ -334,6 +351,7 @@ public class RE extends REToken {
// Precalculate these so we don't pay for the math every time we
// need to access them.
boolean insens = ((cflags & REG_ICASE) > 0);
+ boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
// Parse pattern into tokens. Does anyone know if it's more efficient
// to use char[] than a String.charAt()? I'm assuming so.
@@ -372,6 +390,31 @@ public class RE extends REToken {
if (quot)
unit.bk = false;
+ if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot)) {
+ if (Character.isWhitespace(unit.ch)) {
+ continue;
+ }
+ if (unit.ch == '#') {
+ for (int i = index; i < pLength; i++) {
+ if (pattern[i] == '\n') {
+ index = i + 1;
+ continue;
+ }
+ else if (pattern[i] == '\r') {
+ if (i + 1 < pLength && pattern[i + 1] == '\n') {
+ index = i + 2;
+ }
+ else {
+ index = i + 1;
+ }
+ continue;
+ }
+ }
+ index = pLength;
+ continue;
+ }
+ }
+
// ALTERNATION OPERATOR
// \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
// not available if RE_LIMITED_OPS is set
@@ -420,6 +463,7 @@ public class RE extends REToken {
else {
addToken(currentToken);
currentToken = new RETokenChar(subIndex,unit.ch,insens);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
}
@@ -495,8 +539,8 @@ public class RE extends REToken {
case 'd':
case 'm':
case 's':
- // case 'u': not supported
- // case 'x': not supported
+ case 'u':
+ case 'x':
case '-':
if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break;
// Set or reset syntax flags.
@@ -535,8 +579,20 @@ public class RE extends REToken {
newCflags |= REG_DOT_NEWLINE;
flagIndex++;
break;
- // case 'u': not supported
- // case 'x': not supported
+ case 'u':
+ if (negate)
+ newCflags |= REG_ICASE_USASCII;
+ else
+ newCflags &= ~REG_ICASE_USASCII;
+ flagIndex++;
+ break;
+ case 'x':
+ if (negate)
+ newCflags &= ~REG_X_COMMENTS;
+ else
+ newCflags |= REG_X_COMMENTS;
+ flagIndex++;
+ break;
case '-':
negate = true;
flagIndex++;
@@ -553,6 +609,7 @@ public class RE extends REToken {
syntax = newSyntax;
cflags = newCflags;
insens = ((cflags & REG_ICASE) > 0);
+ insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
// This can be treated as though it were a comment.
comment = true;
index = flagIndex - 1;
@@ -565,6 +622,7 @@ public class RE extends REToken {
syntax = newSyntax;
cflags = newCflags;
insens = ((cflags & REG_ICASE) > 0);
+ insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
index = flagIndex -1;
// Fall through to the next case.
}
@@ -673,6 +731,7 @@ public class RE extends REToken {
syntax = savedSyntax;
cflags = savedCflags;
insens = ((cflags & REG_ICASE) > 0);
+ insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
flagsSaved = false;
}
} // not a comment
@@ -785,6 +844,7 @@ public class RE extends REToken {
index = index - 2 + ce.len;
addToken(currentToken);
currentToken = new RETokenChar(subIndex,ce.ch,insens);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// BACKREFERENCE OPERATOR
@@ -812,6 +872,7 @@ public class RE extends REToken {
int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
currentToken = new RETokenBackRef(subIndex,num,insens);
+ if (insensUSASCII) currentToken.unicodeAware = false;
index = numEnd;
}
@@ -860,6 +921,7 @@ public class RE extends REToken {
else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
addToken(currentToken);
currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// NON-DIGIT OPERATOR
@@ -868,6 +930,7 @@ public class RE extends REToken {
else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
addToken(currentToken);
currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// NEWLINE ESCAPE
@@ -892,6 +955,7 @@ public class RE extends REToken {
else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
addToken(currentToken);
currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// NON-WHITESPACE OPERATOR
@@ -900,6 +964,7 @@ public class RE extends REToken {
else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
addToken(currentToken);
currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// TAB ESCAPE
@@ -916,6 +981,7 @@ public class RE extends REToken {
else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
addToken(currentToken);
currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// NON-ALPHANUMERIC OPERATOR
@@ -924,12 +990,19 @@ public class RE extends REToken {
else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
addToken(currentToken);
currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// END OF STRING OPERATOR
- // \Z
+ // \Z, \z
- else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
+ // FIXME: \Z and \z are different in that if the input string
+ // ends with a line terminator, \Z matches the position before
+ // the final terminator. This special behavior of \Z is yet
+ // to be implemented.
+
+ else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') &&
+ syntax.get(RESyntax.RE_STRING_ANCHORS)) {
addToken(currentToken);
currentToken = new RETokenEnd(subIndex,null);
}
@@ -945,6 +1018,7 @@ public class RE extends REToken {
index = index - 2 + ce.len;
addToken(currentToken);
currentToken = new RETokenChar(subIndex,ce.ch,insens);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// NAMED PROPERTY
@@ -958,6 +1032,16 @@ public class RE extends REToken {
index = index - 2 + np.len;
addToken(currentToken);
currentToken = getRETokenNamedProperty(subIndex,np,insens,index);
+ if (insensUSASCII) currentToken.unicodeAware = false;
+ }
+
+ // END OF PREVIOUS MATCH
+ // \G
+
+ else if (unit.bk && (unit.ch == 'G') &&
+ syntax.get(RESyntax.RE_STRING_ANCHORS)) {
+ addToken(currentToken);
+ currentToken = new RETokenEndOfPreviousMatch(subIndex);
}
// NON-SPECIAL CHARACTER (or escape to make literal)
@@ -966,6 +1050,7 @@ public class RE extends REToken {
else { // not a special character
addToken(currentToken);
currentToken = new RETokenChar(subIndex,unit.ch,insens);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
} // end while
@@ -1006,6 +1091,7 @@ public class RE extends REToken {
throws REException {
boolean insens = ((cflags & REG_ICASE) > 0);
+ boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
Vector options = new Vector();
Vector addition = new Vector();
boolean additionAndAppeared = false;
@@ -1035,7 +1121,9 @@ public class RE extends REToken {
if ((ch == '-') && (lastCharIsSet)) {
if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
if ((ch = pattern[index]) == ']') {
- options.addElement(new RETokenChar(subIndex,lastChar,insens));
+ RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
lastChar = '-';
} else {
if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
@@ -1045,7 +1133,9 @@ public class RE extends REToken {
ch = ce.ch;
index = index + ce.len - 1;
}
- options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
+ RETokenRange t = new RETokenRange(subIndex,lastChar,ch,insens);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
lastChar = 0; lastCharIsSet = false;
index++;
}
@@ -1088,12 +1178,20 @@ public class RE extends REToken {
asciiEsc = ce.ch; asciiEscIsSet = true;
index = index - 1 + ce.len - 1;
}
- if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+ if (lastCharIsSet) {
+ RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
+ }
if (posixID != -1) {
- options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
+ RETokenPOSIX t = new RETokenPOSIX(subIndex,posixID,insens,negate);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
} else if (np != null) {
- options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
+ RETokenNamedProperty t = getRETokenNamedProperty(subIndex,np,insens,index);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
} else if (asciiEscIsSet) {
lastChar = asciiEsc; lastCharIsSet = true;
} else {
@@ -1104,8 +1202,11 @@ public class RE extends REToken {
StringBuffer posixSet = new StringBuffer();
index = getPosixSet(pattern,index+1,posixSet);
int posixId = RETokenPOSIX.intValue(posixSet.toString());
- if (posixId != -1)
- options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
+ if (posixId != -1) {
+ RETokenPOSIX t = new RETokenPOSIX(subIndex,posixId,insens,false);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
+ }
} else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) {
ParseCharClassResult result = parseCharClass(
subIndex, pattern, index, pLength, cflags, syntax, 0);
@@ -1158,14 +1259,22 @@ public class RE extends REToken {
result.index: result.index - 1);
}
} else {
- if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+ if (lastCharIsSet) {
+ RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
+ }
lastChar = ch; lastCharIsSet = true;
}
if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
} // while in list
// Out of list, index is one past ']'
- if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+ if (lastCharIsSet) {
+ RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
+ }
ParseCharClassResult result = new ParseCharClassResult();
// Create a new RETokenOneOf
@@ -1396,11 +1505,10 @@ public class RE extends REToken {
return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
REMatch m = new REMatch(numSubs, index, eflags);
if (firstToken.match(input, m)) {
- while (m != null) {
+ if (m != null) {
if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
return true;
}
- m = m.next;
}
}
return false;
@@ -1508,17 +1616,27 @@ public class RE extends REToken {
}
/* Implements abstract method REToken.match() */
- boolean match(CharIndexed input, REMatch mymatch) {
+ boolean match(CharIndexed input, REMatch mymatch) {
if (firstToken == null) {
return next(input, mymatch);
}
// Note the start of this subexpression
- mymatch.start[subIndex] = mymatch.index;
+ mymatch.start1[subIndex] = mymatch.index;
return firstToken.match(input, mymatch);
}
-
+
+ REMatch findMatch(CharIndexed input, REMatch mymatch) {
+ if (mymatch.backtrackStack == null)
+ mymatch.backtrackStack = new BacktrackStack();
+ boolean b = match(input, mymatch);
+ if (b) {
+ return mymatch;
+ }
+ return null;
+ }
+
/**
* Returns the first match found in the input. If no match is found,
* null is returned.
@@ -1602,6 +1720,7 @@ public class RE extends REToken {
*/
best.end[0] = best.index;
best.finish(input);
+ input.setLastMatch(best);
return best;
}
}
@@ -1942,19 +2061,34 @@ public class RE extends REToken {
}
void dump(StringBuffer os) {
- os.append('(');
+ os.append("(?#startRE subIndex=" + subIndex + ")");
if (subIndex == 0)
os.append("?:");
if (firstToken != null)
firstToken.dumpAll(os);
- os.append(')');
+ if (subIndex == 0)
+ os.append(")");
+ os.append("(?#endRE subIndex=" + subIndex + ")");
}
// Cast input appropriately or throw exception
- private static CharIndexed makeCharIndexed(Object input, int index) {
- // We could let a String fall through to final input, but since
- // it's the most likely input type, we check it first.
- if (input instanceof String)
+ // This method was originally a private method, but has been made
+ // public because java.util.regex.Matcher uses this.
+ public static CharIndexed makeCharIndexed(Object input, int index) {
+ // The case where input is already a CharIndexed is supposed
+ // to be the most likely because this is the case with
+ // java.util.regex.Matcher.
+ // We could let a String or a CharSequence fall through
+ // to final input, but since it's a very likely input type,
+ // we check it first.
+ if (input instanceof CharIndexed) {
+ CharIndexed ci = (CharIndexed) input;
+ ci.setAnchor(index);
+ return ci;
+ }
+ else if (input instanceof CharSequence)
+ return new CharIndexedCharSequence((CharSequence) input,index);
+ else if (input instanceof String)
return new CharIndexedString((String) input,index);
else if (input instanceof char[])
return new CharIndexedCharArray((char[]) input,index);
@@ -1962,8 +2096,6 @@ public class RE extends REToken {
return new CharIndexedStringBuffer((StringBuffer) input,index);
else if (input instanceof InputStream)
return new CharIndexedInputStream((InputStream) input,index);
- else if (input instanceof CharIndexed)
- return (CharIndexed) input; // do we lose index info?
else
return new CharIndexedString(input.toString(), index);
}