1 files changed, 161 insertions, 29 deletions
diff --git a/libjava/classpath/gnu/regexp/RE.java b/libjava/classpath/gnu/regexp/RE.java
index ef606a6d8a7..e0665f097c2 100644
--- a/libjava/classpath/gnu/regexp/RE.java
+++ b/libjava/classpath/gnu/regexp/RE.java
@@ -41,6 +41,7 @@ import java.io.Serializable;
 import java.util.Locale;
 import java.util.PropertyResourceBundle;
 import java.util.ResourceBundle;
+import java.util.Stack;
 import java.util.Vector;
 
 /**
@@ -78,13 +79,18 @@ import java.util.Vector;
  * <P>
  *
  * These methods all have similar argument lists.  The input can be a
- * String, a character array, a StringBuffer, or an
+ * CharIndexed, String, a character array, a StringBuffer, or an
  * InputStream of some sort.  Note that when using an
  * InputStream, the stream read position cannot be guaranteed after
  * attempting a match (this is not a bug, but a consequence of the way
  * regular expressions work).  Using an REMatchEnumeration can
  * eliminate most positioning problems.
  *
+ * Although the input object can be of various types, it is recommended
+ * that it should be a CharIndexed because {@link CharIndexed#getLastMatch()}
+ * can show the last match found on this input, which helps the expression
+ * \G work as the end of the previous match.
+ *
  * <P>
  *
  * The optional index argument specifies the offset from the beginning
@@ -235,6 +241,17 @@ public class RE extends REToken {
    */
   public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
 
+  /**
+   * Compilation flag. Allow whitespace and comments in pattern.
+   * This is equivalent to the "/x" operator in Perl.
+   */
+  public static final int REG_X_COMMENTS = 0x0400;
+
+  /**
+   * Compilation flag. If set, REG_ICASE is effective only for US-ASCII.
+   */
+  public static final int REG_ICASE_USASCII = 0x0800;
+
   /** Returns a string representing the version of the gnu.regexp package. */
   public static final String version() {
     return VERSION;
@@ -334,6 +351,7 @@ public class RE extends REToken {
     // Precalculate these so we don't pay for the math every time we
     // need to access them.
     boolean insens = ((cflags & REG_ICASE) > 0);
+    boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 
     // Parse pattern into tokens.  Does anyone know if it's more efficient
     // to use char[] than a String.charAt()?  I'm assuming so.
@@ -372,6 +390,31 @@ public class RE extends REToken {
       if (quot)
       	unit.bk = false;
 
+      if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot)) {
+	if (Character.isWhitespace(unit.ch)) {
+	     continue;
+	}
+	if (unit.ch == '#') {
+	  for (int i = index; i < pLength; i++) {
+	    if (pattern[i] == '\n') {
+	      index = i + 1;
+	      continue;
+	    }
+	    else if (pattern[i] == '\r') {
+	      if (i + 1 < pLength && pattern[i + 1] == '\n') {
+		index = i + 2;
+	      }
+	      else {
+		index = i + 1;
+	      }
+	      continue;
+	    }
+	  }
+	  index = pLength;
+	  continue;
+	}
+      }
+
       // ALTERNATION OPERATOR
       //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
       //  not available if RE_LIMITED_OPS is set
@@ -420,6 +463,7 @@ public class RE extends REToken {
         else {
           addToken(currentToken);
           currentToken = new RETokenChar(subIndex,unit.ch,insens);
+	  if (insensUSASCII) currentToken.unicodeAware = false;
         } 
       }
       
@@ -495,8 +539,8 @@ public class RE extends REToken {
 	  case 'd':
 	  case 'm':
 	  case 's':
-	  // case 'u':  not supported
-	  // case 'x':  not supported
+	  case 'u':
+	  case 'x':
 	  case '-':
             if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break;
 	    // Set or reset syntax flags.
@@ -535,8 +579,20 @@ public class RE extends REToken {
 		    newCflags |= REG_DOT_NEWLINE;
 		  flagIndex++;
 		  break;
-	  	// case 'u': not supported
-	  	// case 'x': not supported
+	  	case 'u':
+		  if (negate)
+		    newCflags |= REG_ICASE_USASCII;
+		  else
+		    newCflags &= ~REG_ICASE_USASCII;
+		  flagIndex++;
+		  break;
+	  	case 'x':
+		  if (negate)
+		    newCflags &= ~REG_X_COMMENTS;
+		  else
+		    newCflags |= REG_X_COMMENTS;
+		  flagIndex++;
+		  break;
 	  	case '-':
 		  negate = true;
 		  flagIndex++;
@@ -553,6 +609,7 @@ public class RE extends REToken {
 		syntax = newSyntax;
 		cflags = newCflags;
 		insens = ((cflags & REG_ICASE) > 0);
+		insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 		// This can be treated as though it were a comment.
 		comment = true;
 		index = flagIndex - 1;
@@ -565,6 +622,7 @@ public class RE extends REToken {
 		syntax = newSyntax;
 		cflags = newCflags;
 		insens = ((cflags & REG_ICASE) > 0);
+		insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 		index = flagIndex -1;
 		// Fall through to the next case.
 	    }
@@ -673,6 +731,7 @@ public class RE extends REToken {
 	      syntax = savedSyntax;
 	      cflags = savedCflags;
 	      insens = ((cflags & REG_ICASE) > 0);
+	      insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 	      flagsSaved = false;
 	  }
 	} // not a comment
@@ -785,6 +844,7 @@ public class RE extends REToken {
 	index = index - 2 + ce.len;
 	addToken(currentToken);
 	currentToken = new RETokenChar(subIndex,ce.ch,insens);
+	if (insensUSASCII) currentToken.unicodeAware = false;
       }
 
       // BACKREFERENCE OPERATOR
@@ -812,6 +872,7 @@ public class RE extends REToken {
 	int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
 
 	currentToken = new RETokenBackRef(subIndex,num,insens);
+	if (insensUSASCII) currentToken.unicodeAware = false;
 	index = numEnd;
       }
 
@@ -860,6 +921,7 @@ public class RE extends REToken {
       else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 	addToken(currentToken);
 	currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
+	if (insensUSASCII) currentToken.unicodeAware = false;
       }
 
       // NON-DIGIT OPERATOR
@@ -868,6 +930,7 @@ public class RE extends REToken {
 	else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 	  addToken(currentToken);
 	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
+	  if (insensUSASCII) currentToken.unicodeAware = false;
 	}
 
 	// NEWLINE ESCAPE
@@ -892,6 +955,7 @@ public class RE extends REToken {
 	else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 	  addToken(currentToken);
 	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
+	  if (insensUSASCII) currentToken.unicodeAware = false;
 	}
 
 	// NON-WHITESPACE OPERATOR
@@ -900,6 +964,7 @@ public class RE extends REToken {
 	else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 	  addToken(currentToken);
 	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
+	  if (insensUSASCII) currentToken.unicodeAware = false;
 	}
 
 	// TAB ESCAPE
@@ -916,6 +981,7 @@ public class RE extends REToken {
 	else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 	  addToken(currentToken);
 	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
+	  if (insensUSASCII) currentToken.unicodeAware = false;
 	}
 
 	// NON-ALPHANUMERIC OPERATOR
@@ -924,12 +990,19 @@ public class RE extends REToken {
 	else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 	  addToken(currentToken);
 	  currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
+	  if (insensUSASCII) currentToken.unicodeAware = false;
 	}
 
 	// END OF STRING OPERATOR
-        //  \Z
+        //  \Z, \z
 
-	else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
+	// FIXME: \Z and \z are different in that if the input string
+	// ends with a line terminator, \Z matches the position before
+	// the final terminator.  This special behavior of \Z is yet
+	// to be implemented.
+
+	else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') &&
+		 syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 	  addToken(currentToken);
 	  currentToken = new RETokenEnd(subIndex,null);
 	}
@@ -945,6 +1018,7 @@ public class RE extends REToken {
 	  index = index - 2 + ce.len;
 	  addToken(currentToken);
 	  currentToken = new RETokenChar(subIndex,ce.ch,insens);
+	  if (insensUSASCII) currentToken.unicodeAware = false;
 	}
 
 	// NAMED PROPERTY
@@ -958,6 +1032,16 @@ public class RE extends REToken {
 	  index = index - 2 + np.len;
 	  addToken(currentToken);
 	  currentToken = getRETokenNamedProperty(subIndex,np,insens,index);
+	  if (insensUSASCII) currentToken.unicodeAware = false;
+	}
+
+	// END OF PREVIOUS MATCH
+        //  \G
+
+	else if (unit.bk && (unit.ch == 'G') &&
+		 syntax.get(RESyntax.RE_STRING_ANCHORS)) {
+	  addToken(currentToken);
+	  currentToken = new RETokenEndOfPreviousMatch(subIndex);
 	}
 
 	// NON-SPECIAL CHARACTER (or escape to make literal)
@@ -966,6 +1050,7 @@ public class RE extends REToken {
 	else {  // not a special character
 	  addToken(currentToken);
 	  currentToken = new RETokenChar(subIndex,unit.ch,insens);
+	  if (insensUSASCII) currentToken.unicodeAware = false;
 	} 
       } // end while
 
@@ -1006,6 +1091,7 @@ public class RE extends REToken {
 		throws REException {
 
 	boolean insens = ((cflags & REG_ICASE) > 0);
+	boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 	Vector options = new Vector();
 	Vector addition = new Vector();
 	boolean additionAndAppeared = false;
@@ -1035,7 +1121,9 @@ public class RE extends REToken {
 	  if ((ch == '-') && (lastCharIsSet)) {
 	    if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 	    if ((ch = pattern[index]) == ']') {
-	      options.addElement(new RETokenChar(subIndex,lastChar,insens));
+	      RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+	      if (insensUSASCII) t.unicodeAware = false;
+	      options.addElement(t);
 	      lastChar = '-';
 	    } else {
 	      if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
@@ -1045,7 +1133,9 @@ public class RE extends REToken {
 		ch = ce.ch;
 		index = index + ce.len - 1;
 	      }
-	      options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
+	      RETokenRange t = new RETokenRange(subIndex,lastChar,ch,insens);
+	      if (insensUSASCII) t.unicodeAware = false;
+	      options.addElement(t);
 	      lastChar = 0; lastCharIsSet = false;
 	      index++;
 	    }
@@ -1088,12 +1178,20 @@ public class RE extends REToken {
 	      asciiEsc = ce.ch; asciiEscIsSet = true;
 	      index = index - 1 + ce.len - 1;
 	    }
-	    if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+	    if (lastCharIsSet) {
+	      RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+	      if (insensUSASCII) t.unicodeAware = false;
+	      options.addElement(t);
+	    }
 	    
 	    if (posixID != -1) {
-	      options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
+	      RETokenPOSIX t = new RETokenPOSIX(subIndex,posixID,insens,negate);
+	      if (insensUSASCII) t.unicodeAware = false;
+	      options.addElement(t);
 	    } else if (np != null) {
-	      options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
+	      RETokenNamedProperty t = getRETokenNamedProperty(subIndex,np,insens,index);
+	      if (insensUSASCII) t.unicodeAware = false;
+	      options.addElement(t);
 	    } else if (asciiEscIsSet) {
 	      lastChar = asciiEsc; lastCharIsSet = true;
 	    } else {
@@ -1104,8 +1202,11 @@ public class RE extends REToken {
 	    StringBuffer posixSet = new StringBuffer();
 	    index = getPosixSet(pattern,index+1,posixSet);
 	    int posixId = RETokenPOSIX.intValue(posixSet.toString());
-	    if (posixId != -1)
-	      options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
+	    if (posixId != -1) {
+	      RETokenPOSIX t = new RETokenPOSIX(subIndex,posixId,insens,false);
+	      if (insensUSASCII) t.unicodeAware = false;
+	      options.addElement(t);
+	    }
 	  } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) {
 		ParseCharClassResult result = parseCharClass(
 		    subIndex, pattern, index, pLength, cflags, syntax, 0);
@@ -1158,14 +1259,22 @@ public class RE extends REToken {
 			result.index: result.index - 1);
 		}
 	  } else {
-	    if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+	    if (lastCharIsSet) {
+	      RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+	      if (insensUSASCII) t.unicodeAware = false;
+	      options.addElement(t);
+	    }
 	    lastChar = ch; lastCharIsSet = true;
 	  }
 	  if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 	} // while in list
 	// Out of list, index is one past ']'
 	    
-	if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+	if (lastCharIsSet) {
+	  RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+	  if (insensUSASCII) t.unicodeAware = false;
+	  options.addElement(t);
+	}
 	   
 	ParseCharClassResult result = new ParseCharClassResult(); 
 	// Create a new RETokenOneOf
@@ -1396,11 +1505,10 @@ public class RE extends REToken {
       return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
     REMatch m = new REMatch(numSubs, index, eflags);
     if (firstToken.match(input, m)) {
-	while (m != null) {
+	if (m != null) {
 	    if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
 		return true;
 	    }
-	    m = m.next;
 	}
     }
     return false;
@@ -1508,17 +1616,27 @@ public class RE extends REToken {
   }
   
     /* Implements abstract method REToken.match() */
-    boolean match(CharIndexed input, REMatch mymatch) { 
+    boolean match(CharIndexed input, REMatch mymatch) {
 	if (firstToken == null) {
 	    return next(input, mymatch);
 	}
 
 	// Note the start of this subexpression
-	mymatch.start[subIndex] = mymatch.index;
+	mymatch.start1[subIndex] = mymatch.index;
 
 	return firstToken.match(input, mymatch);
     }
-  
+
+    REMatch findMatch(CharIndexed input, REMatch mymatch) {
+        if (mymatch.backtrackStack == null)
+	  mymatch.backtrackStack = new BacktrackStack();
+	boolean b = match(input, mymatch);
+	if (b) {
+	    return mymatch;
+	}
+	return null;
+    }
+
   /**
    * Returns the first match found in the input.  If no match is found,
    * null is returned.
@@ -1602,6 +1720,7 @@ public class RE extends REToken {
 		  */
 		  best.end[0] = best.index;
 		  best.finish(input);
+		  input.setLastMatch(best);
 		  return best;
 	      }
 	  }
@@ -1942,19 +2061,34 @@ public class RE extends REToken {
    }
 
   void dump(StringBuffer os) {
-    os.append('(');
+    os.append("(?#startRE subIndex=" + subIndex + ")");
     if (subIndex == 0)
       os.append("?:");
     if (firstToken != null)
       firstToken.dumpAll(os);
-    os.append(')');
+    if (subIndex == 0)
+      os.append(")");
+    os.append("(?#endRE subIndex=" + subIndex + ")");
   }
 
   // Cast input appropriately or throw exception
-  private static CharIndexed makeCharIndexed(Object input, int index) {
-      // We could let a String fall through to final input, but since
-      // it's the most likely input type, we check it first.
-    if (input instanceof String)
+  // This method was originally a private method, but has been made
+  // public because java.util.regex.Matcher uses this.
+  public static CharIndexed makeCharIndexed(Object input, int index) {
+      // The case where input is already a CharIndexed is supposed
+      // be the most likely because this is the case with
+      // java.util.regex.Matcher.
+      // We could let a String or a CharSequence fall through
+      // to final input, but since it'a very likely input type, 
+      // we check it first.
+    if (input instanceof CharIndexed) {
+	CharIndexed ci = (CharIndexed) input;
+	ci.setAnchor(index);
+	return ci;
+    }
+    else if (input instanceof CharSequence)
+      return new CharIndexedCharSequence((CharSequence) input,index);
+    else if (input instanceof String)
       return new CharIndexedString((String) input,index);
     else if (input instanceof char[])
       return new CharIndexedCharArray((char[]) input,index);
@@ -1962,8 +2096,6 @@ public class RE extends REToken {
       return new CharIndexedStringBuffer((StringBuffer) input,index);
     else if (input instanceof InputStream)
       return new CharIndexedInputStream((InputStream) input,index);
-    else if (input instanceof CharIndexed)
-	return (CharIndexed) input; // do we lose index info?
     else 
 	return new CharIndexedString(input.toString(), index);
   }