/* * Regexp.java * * Copyright (c) 1999 Sun Microsystems, Inc. * * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. * * SCCS: %Z% %M% %I% %E% %U% */ // Included in SQLite3 port to C# for use in testharness only; 2008 Noah B Hart //$Header$ using System; using System.Text; namespace sunlabs.brazil.util.regexp { /// The Regexp class can be used to match a pattern against a /// string and optionally replace the matched parts with new strings. ///

/// Regular expressions were implemented by translating Henry Spencer's /// regular expression package for tcl8.0. /// Much of the description below is copied verbatim from the tcl8.0 regsub /// manual entry. ///


/// REGULAR EXPRESSIONS ///

/// A regular expression is zero or more branches, separated by /// "|". It matches anything that matches one of the branches. ///

/// A branch is zero or more pieces, concatenated. /// It matches a match for the first piece, followed by a match for the /// second piece, etc. ///

/// A piece is an atom, possibly followed by "*", "+", or /// "?".

///

/// An atom is

///

/// A range is a sequence of characters enclosed in "[]". /// The range normally matches any single character from the sequence. /// If the sequence begins with "^", the range matches any single character /// not from the rest of the sequence. /// If two characters in the sequence are separated by "-", this is shorthand /// for the full list of characters between them (e.g. "[0-9]" matches any /// decimal digit). To include a literal "]" in the sequence, make it the /// first character (following a possible "^"). To include a literal "-", /// make it the first or last character. ///

/// In general there may be more than one way to match a regular expression /// to an input string. For example, consider the command ///

  /// String[] match = new String[2];
  /// Regexp.match("(a*)b*", "aabaaabb", match);
  /// 
/// Considering only the rules given so far, match[0] and /// match[1] could end up with the values /// or any of several other combinations. To resolve this potential ambiguity, /// Regexp chooses among alternatives using the rule "first then longest". /// In other words, it considers the possible matches in order working /// from left to right across the input string and the pattern, and it /// attempts to match longer pieces of the input string before shorter /// ones. More specifically, the following rules apply in decreasing /// order of priority:
    ///
  1. If a regular expression could match two different parts of an input /// string then it will match the one that begins earliest. ///
  2. If a regular expression contains "|" operators then the /// leftmost matching sub-expression is chosen. ///
  3. In "*", "+", and "?" constructs, longer matches are chosen in /// preference to shorter ones. ///
  4. /// In sequences of expression components the components are considered /// from left to right. ///
///

/// In the example from above, "(a*)b*" therefore matches exactly "aab"; the /// "(a*)" portion of the pattern is matched first and it consumes the leading /// "aa", then the "b*" portion of the pattern consumes the next "b". Or, /// consider the following example: ///

  /// String match = new String[3];
  /// Regexp.match("(ab|a)(b*)c", "abc", match);
  /// 
/// After this command, match[0] will be "abc", /// match[1] will be "ab", and match[2] will be an /// empty string. /// Rule 4 specifies that the "(ab|a)" component gets first shot at the input /// string and Rule 2 specifies that the "ab" sub-expression /// is checked before the "a" sub-expression. /// Thus the "b" has already been claimed before the "(b*)" /// component is checked and therefore "(b*)" must match an empty string. ///
/// /// REGULAR EXPRESSION SUBSTITUTION ///

/// Regular expression substitution matches a string against a regular /// expression, transforming the string by replacing the matched region(s) /// with new substring(s). ///

/// What gets substituted into the result is controlled by a /// subspec. The subspec is a formatting string that specifies /// what portions of the matched region should be substituted into the /// result. ///

/// In the above, strings like "\2" represents the two characters /// backslash and "2", not the Unicode character 0002. ///
/// Here is an example of how to use Regexp ///
  /// 
  /// public static void
  /// main(String[] args)
  /// throws Exception
  /// {
  /// Regexp re;
  /// String[] matches;
  /// String s;
  /// 
  /// /*
  /// * A regular expression to match the first line of a HTTP request.
  /// *
  /// * 1. ^               - starting at the beginning of the line
  /// * 2. ([A-Z]+)        - match and remember some upper case characters
  /// * 3. [ \t]+          - skip blank space
  /// * 4. ([^ \t]*)       - match and remember up to the next blank space
  /// * 5. [ \t]+          - skip more blank space
  /// * 6. (HTTP/1\\.[01]) - match and remember HTTP/1.0 or HTTP/1.1
  /// * 7. $		      - end of string - no chars left.
  /// */
  /// 
  /// s = "GET http://a.b.com:1234/index.html HTTP/1.1";
  /// 
  /// Regexp re = new Regexp("^([A-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/1\\.[01])$");
  /// String[] matches = new String[4];
  /// if (re.match(s, matches)) {
  /// System.out.println("METHOD  " + matches[1]);
  /// System.out.println("URL     " + matches[2]);
  /// System.out.println("VERSION " + matches[3]);
  /// }
  /// 
  /// /*
  /// * A regular expression to extract some simple comma-separated data,
  /// * reorder some of the columns, and discard column 2.
  /// */
  /// 
  /// s = "abc,def,ghi,klm,nop,pqr";
  /// 
  /// Regexp re = new Regexp("^([^,]+),([^,]+),([^,]+),(.*)");
  /// System.out.println(re.sub(s, "\\3,\\1,\\4"));
  /// }
  /// 
/// ///
/// Colin Stevens (colin.stevens@sun.com) /// /// 1.7, 99/10/14 /// /// /// public class Regexp { //[STAThread] //public static void Main(string[] args) //{ // if ((args.Length == 2) && (args[0].Equals("compile"))) // { // System.Diagnostics.Debug.WriteLine(new Regexp(args[1])); // } // else if ((args.Length == 3) && (args[0].Equals("match"))) // { // Regexp r = new Regexp(args[1]); // string[] substrs = new string[r.subspecs()]; // bool match = r.match(args[2], substrs); // System.Diagnostics.Debug.WriteLine("match:\t" + match); // for (int i = 0; i < substrs.Length; i++) // { // System.Diagnostics.Debug.WriteLine((i + 1) + ":\t" + substrs[i]); // } // } // else if ((args.Length == 4) && (args[0].Equals("sub"))) // { // Regexp r = new Regexp(args[1]); // System.Diagnostics.Debug.WriteLine(r.subAll(args[2], args[3])); // } // else // { // System.Diagnostics.Debug.WriteLine("usage:"); // System.Diagnostics.Debug.WriteLine("\tRegexp match "); // System.Diagnostics.Debug.WriteLine("\tRegexp sub "); // System.Diagnostics.Debug.WriteLine("\tRegexp compile "); // } //} /* * Structure for regexp "program". This is essentially a linear encoding * of a nondeterministic finite-state machine (aka syntax charts or * "railroad normal form" in parsing technology). Each node is an opcode * plus a "next" pointer, possibly plus an operand. "Next" pointers of * all nodes except BRANCH implement concatenation; a "next" pointer with * a BRANCH on both ends of it is connecting two alternatives. (Here we * have one of the subtle syntax dependencies: an individual BRANCH (as * opposed to a collection of them) is never concatenated with anything * because of operator precedence.) The operand of some types of node is * a literal string; for others, it is a node leading into a sub-FSM. In * particular, the operand of a BRANCH node is the first node of the branch. * (NB this is *not* a tree structure: the tail of the branch connects * to the thing following the set of BRANCHes.) The opcodes are: */ internal const int NSUBEXP = 100; /* definition number opnd? meaning */ internal const char END = (char)( 0 ); /* no End of program. */ internal const char BOL = (char)( 1 ); /* no Match "" at beginning of line. */ internal const char EOL = (char)( 2 ); /* no Match "" at end of line. */ internal const char ANY = (char)( 3 ); /* no Match any one character. */ internal const char ANYOF = (char)( 4 ); /* str Match any character in this string. */ internal const char ANYBUT = (char)( 5 ); /* str Match any character not in this string. */ internal const char BRANCH = (char)( 6 ); /* node Match this alternative, or the next... */ internal const char BACK = (char)( 7 ); /* no Match "", "next" ptr points backward. */ internal const char EXACTLY = (char)( 8 ); /* str Match this string. */ internal const char NOTHING = (char)( 9 ); /* no Match empty string. */ internal const char STAR = (char)( 10 ); /* node Match this (simple) thing 0 or more times. */ internal const char PLUS = (char)( 11 ); /* node Match this (simple) thing 1 or more times. */ internal const char OPEN = (char)( 20 ); /* no Mark this point in input as start of #n. */ /* OPEN+1 is number 1, etc. */ internal static readonly char CLOSE = (char)( OPEN + NSUBEXP ); /* no Analogous to OPEN. */ internal static readonly string[] opnames = new string[] { "END", "BOL", "EOL", "ANY", "ANYOF", "ANYBUT", "BRANCH", "BACK", "EXACTLY", "NOTHING", "STAR", "PLUS" }; /* * A node is one char of opcode followed by one char of "next" pointer. * The value is a positive offset from the opcode of the node containing * it. An operand, if any, simply follows the node. (Note that much of * the code generation knows about this implicit relationship.) * * Opcode notes: * * BRANCH The set of branches constituting a single choice are hooked * together with their "next" pointers, since precedence prevents * anything being concatenated to any individual branch. The * "next" pointer of the last BRANCH in a choice points to the * thing following the whole choice. This is also where the * final "next" pointer of each individual branch points; each * branch starts with the operand node of a BRANCH node. * * ANYOF, ANYBUT, EXACTLY * The format of a string operand is one char of length * followed by the characters making up the string. * * BACK Normal "next" pointers all implicitly point forward; BACK * exists to make loop structures possible. * * STAR, PLUS * '?', and complex '*' and '+' are implemented as circular * BRANCH structures using BACK. Simple cases (one character * per match) are implemented with STAR and PLUS for speed * and to minimize recursive plunges. * * OPENn, CLOSEn * are numbered at compile time. */ /// The bytecodes making up the regexp program. internal char[] program; /// Whether the regexp matching should be case insensitive. internal bool ignoreCase; /// The number of parenthesized subexpressions in the regexp pattern, /// plus 1 for the match of the whole pattern itself. /// internal int npar; /// true if the pattern must match the beginning of the /// string, so we don't have to waste time matching against all possible /// starting locations in the string. /// internal bool anchored; internal int startChar; internal string must; /// Compiles a new Regexp object from the given regular expression /// pattern. ///

/// It takes a certain amount of time to parse and validate a regular /// expression pattern before it can be used to perform matches /// or substitutions. If the caller caches the new Regexp object, that /// parsing time will be saved because the same Regexp can be used with /// respect to many different strings. /// ///

/// pat /// The string holding the regular expression pattern. /// /// @throws IllegalArgumentException if the pattern is malformed. /// The detail message for the exception will be set to a /// string indicating how the pattern was malformed. /// public Regexp( string pat ) { compile( pat ); } /// Compiles a new Regexp object from the given regular expression /// pattern. /// /// /// pat /// The string holding the regular expression pattern. /// /// /// ignoreCase /// If true then this regular expression will /// do case-insensitive matching. If false, then /// the matches are case-sensitive. Regular expressions /// generated by Regexp(String) are case-sensitive. /// /// @throws IllegalArgumentException if the pattern is malformed. /// The detail message for the exception will be set to a /// string indicating how the pattern was malformed. /// public Regexp( string pat, bool ignoreCase ) { this.ignoreCase = ignoreCase; if ( ignoreCase ) { pat = pat.ToLower(); } compile( pat ); } /// Returns the number of parenthesized subexpressions in this regular /// expression, plus one more for this expression itself. /// /// /// The number. /// public int subspecs() { return npar; } /// Matches the given string against this regular expression. /// /// /// str /// The string to match. /// /// /// The substring of str that matched the entire /// regular expression, or null if the string did not /// match this regular expression. /// public string match( string str ) { Match m = exec( str, 0, 0 ); if ( m == null ) { return null; } return str.Substring( m.indices[0], ( m.indices[1] ) - ( m.indices[0] ) ); } /// Matches the given string against this regular expression, and computes /// the set of substrings that matched the parenthesized subexpressions. ///

/// substrs[0] is set to the range of str /// that matched the entire regular expression. ///

/// substrs[1] is set to the range of str /// that matched the first (leftmost) parenthesized subexpression. /// substrs[n] is set to the range that matched the /// nth subexpression, and so on. ///

/// If subexpression n did not match, then /// substrs[n] is set to null. Not to /// be confused with "", which is a valid value for a /// subexpression that matched 0 characters. ///

/// The length that the caller should use when allocating the /// substr array is the return value of /// Regexp.subspecs. The array /// can be shorter (in which case not all the information will /// be returned), or longer (in which case the remainder of the /// elements are initialized to null), or /// null (to ignore the subexpressions). /// ///

/// str /// The string to match. /// /// /// substrs /// An array of strings allocated by the caller, and filled in /// with information about the portions of str that /// matched the regular expression. May be null. /// /// /// true if str that matched this /// regular expression, false otherwise. /// If false is returned, then the contents of /// substrs are unchanged. /// /// /// /// public bool match( string str, string[] substrs ) { Match m = exec( str, 0, 0 ); if ( m == null ) { return false; } if ( substrs != null ) { int max = System.Math.Min( substrs.Length, npar ); int i; int j = 0; for ( i = 0; i < max; i++ ) { int start = m.indices[j++]; int end = m.indices[j++]; if ( start < 0 ) { substrs[i] = null; } else { substrs[i] = str.Substring( start, ( end ) - ( start ) ); } } for ( ; i < substrs.Length; i++ ) { substrs[i] = null; } } return true; } /// Matches the given string against this regular expression, and computes /// the set of substrings that matched the parenthesized subexpressions. ///

/// For the indices specified below, the range extends from the character /// at the starting index up to, but not including, the character at the /// ending index. ///

/// indices[0] and indices[1] are set to /// starting and ending indices of the range of str /// that matched the entire regular expression. ///

/// indices[2] and indices[3] are set to the /// starting and ending indices of the range of str that /// matched the first (leftmost) parenthesized subexpression. /// indices[n * 2] and indices[n * 2 + 1] /// are set to the range that matched the nth /// subexpression, and so on. ///

/// If subexpression n did not match, then /// indices[n * 2] and indices[n * 2 + 1] /// are both set to -1. ///

/// The length that the caller should use when allocating the /// indices array is twice the return value of /// Regexp.subspecs. The array /// can be shorter (in which case not all the information will /// be returned), or longer (in which case the remainder of the /// elements are initialized to -1), or /// null (to ignore the subexpressions). /// ///

/// str /// The string to match. /// /// /// indices /// An array of integers allocated by the caller, and filled in /// with information about the portions of str that /// matched all the parts of the regular expression. /// May be null. /// /// /// true if the string matched the regular expression, /// false otherwise. If false is /// returned, then the contents of indices are /// unchanged. /// /// /// /// public bool match( string str, int[] indices ) { Match m = exec( str, 0, 0 ); if ( m == null ) { return false; } if ( indices != null ) { int max = System.Math.Min( indices.Length, npar * 2 ); Array.Copy( (System.Array)m.indices, 0, (System.Array)indices, 0, max ); for ( int i = max; i < indices.Length; i++ ) { indices[i] = -1; } } return true; } /// Matches a string against a regular expression and replaces the first /// match with the string generated from the substitution parameter. /// /// /// str /// The string to match against this regular expression. /// /// /// subspec /// The substitution parameter, described in /// REGULAR EXPRESSION SUBSTITUTION. /// /// /// The string formed by replacing the first match in /// str with the string generated from /// subspec. If no matches were found, then /// the return value is null. /// public string sub( string str, string subspec ) { Regsub rs = new Regsub( this, str ); if ( rs.nextMatch() ) { StringBuilder sb = new StringBuilder( rs.skipped() ); applySubspec( rs, subspec, sb ); sb.Append( rs.rest() ); return sb.ToString(); } else { return null; } } /// Matches a string against a regular expression and replaces all /// matches with the string generated from the substitution parameter. /// After each substutition is done, the portions of the string already /// examined, including the newly substituted region, are not checked /// again for new matches -- only the rest of the string is examined. /// /// /// str /// The string to match against this regular expression. /// /// /// subspec /// The substitution parameter, described in /// REGULAR EXPRESSION SUBSTITUTION. /// /// /// The string formed by replacing all the matches in /// str with the strings generated from /// subspec. If no matches were found, then /// the return value is a copy of str. /// public string subAll( string str, string subspec ) { return sub( str, new SubspecFilter( subspec, true ) ); } /// Utility method to give access to the standard substitution algorithm /// used by sub and subAll. Appends to the /// string buffer the string generated by applying the substitution /// parameter to the matched region. /// /// /// rs /// Information about the matched region. /// /// /// subspec /// The substitution parameter. /// /// /// sb /// StringBuffer to which the generated string is appended. /// public static void applySubspec( Regsub rs, string subspec, StringBuilder sb ) { try { int len = subspec.Length; for ( int i = 0; i < len; i++ ) { char ch = subspec[i]; switch ( ch ) { case '&': { sb.Append( rs.matched() ); break; } case '\\': { i++; ch = subspec[i]; if ( ( ch >= '0' ) && ( ch <= '9' ) ) { string match = rs.submatch( ch - '0' ); if ( (System.Object)match != null ) { sb.Append( match ); } break; } // fall through. } goto default; default: { sb.Append( ch ); } break; } } } catch ( System.IndexOutOfRangeException e ) { /* * Ignore malformed substitution pattern. * Return string matched so far. */ } } public string sub( string str, Filter rf ) { Regsub rs = new Regsub( this, str ); if ( rs.nextMatch() == false ) { return str; } StringBuilder sb = new StringBuilder(); do { sb.Append( rs.skipped() ); if ( rf.filter( rs, sb ) == false ) { break; } } while ( rs.nextMatch() ); sb.Append( rs.rest() ); return sb.ToString(); } /// This interface is used by the Regexp class to generate /// the replacement string for each pattern match found in the source /// string. /// /// /// Colin Stevens (colin.stevens@sun.com) /// /// 1.7, 99/10/14 /// public interface Filter { /// Given the current state of the match, generate the replacement /// string. This method will be called for each match found in /// the source string, unless this filter decides not to handle any /// more matches. ///

/// The implementation can use whatever rules it chooses /// to generate the replacement string. For example, here is an /// example of a filter that replaces the first 5 /// occurrences of "%XX" in a string with the ASCII character /// represented by the hex digits "XX": ///

      /// String str = ...;
      /// 
      /// Regexp re = new Regexp("%[a-fA-F0-9][a-fA-F0-9]");
      /// 
      /// Regexp.Filter rf = new Regexp.Filter() {
      /// int count = 5;
      /// public boolean filter(Regsub rs, StringBuffer sb) {
      /// String match = rs.matched();
      /// int hi = Character.digit(match.charAt(1), 16);
      /// int lo = Character.digit(match.charAt(2), 16);
      /// sb.append((char) ((hi << 4) | lo));
      /// return (--count > 0);
      /// }
      /// }
      /// 
      /// String result = re.sub(str, rf);
      /// 
/// ///
/// rs /// Regsub containing the state of the current /// match. /// /// /// sb /// The string buffer that this filter should append the /// generated string to. This string buffer actually /// contains the results the calling Regexp has /// generated up to this point. /// /// /// false if no further matches should be /// considered in this string, true to allow /// Regexp to continue looking for further /// matches. /// bool filter( Regsub rs, StringBuilder sb ); } private class SubspecFilter : Filter { internal string subspec; internal bool all; public SubspecFilter( string subspec, bool all ) { this.subspec = subspec; this.all = all; } public bool filter( Regsub rs, StringBuilder sb ) { sunlabs.brazil.util.regexp.Regexp.applySubspec( rs, subspec, sb ); return all; } } /// Returns a string representation of this compiled regular /// expression. The format of the string representation is a /// symbolic dump of the bytecodes. /// /// /// A string representation of this regular expression. /// public override string ToString() { StringBuilder sb = new StringBuilder(); sb.Append( "# subs: " + npar + "\n" ); sb.Append( "anchor: " + anchored + "\n" ); sb.Append( "start: " + (char)startChar + "\n" ); sb.Append( "must: " + must + "\n" ); for ( int i = 0; i < program.Length; ) { sb.Append( i + ":\t" ); int op = program[i]; if ( op >= CLOSE ) { sb.Append( "CLOSE" + ( op - CLOSE ) ); } else if ( op >= OPEN ) { sb.Append( "OPEN" + ( op - OPEN ) ); } else { sb.Append( opnames[op] ); } int line; int offset = (int)program[i + 1]; if ( offset == 0 ) { sb.Append( '\t' ); } else if ( op == BACK ) { sb.Append( "\t-" + offset + "," + ( i - offset ) ); } else { sb.Append( "\t+" + offset + "," + ( i + offset ) ); } if ( ( op == ANYOF ) || ( op == ANYBUT ) || ( op == EXACTLY ) ) { sb.Append( "\t'" ); sb.Append( program, i + 3, program[i + 2] ); sb.Append( "'" ); i += 3 + program[i + 2]; } else { i += 2; } sb.Append( '\n' ); } return sb.ToString(); } private void compile( string exp ) { Compiler rcstate = new Compiler(); rcstate.parse = exp.ToCharArray(); rcstate.off = 0; rcstate.npar = 1; rcstate.code = new StringBuilder(); rcstate.reg( false ); program = rcstate.code.ToString().ToCharArray(); npar = rcstate.npar; startChar = -1; /* optimize */ if ( program[rcstate.regnext( 0 )] == END ) { if ( program[2] == BOL ) { anchored = true; } else if ( program[2] == EXACTLY ) { startChar = (int)program[5]; } } /* * If there's something expensive in the r.e., find the * longest literal string that must appear and make it the * regmust. Resolve ties in favor of later strings, since * the regstart check works with the beginning of the r.e. * and avoiding duplication strengthens checking. Not a * strong reason, but sufficient in the absence of others. */ /* if ((rcstate.flagp & Compiler.SPSTART) != 0) { int index = -1; int longest = 0; for (scan = 0; scan < program.length; ) { switch (program[scan]) { case EXACTLY: int length = program[scan + 2]; if (length > longest) { index = scan; longest = length; } // fall through; case ANYOF: case ANYBUT: scan += 3 + program[scan + 2]; break; default: scan += 2; break; } } if (longest > 0) { must = new String(program, index + 3, longest); } }*/ } internal Match exec( string str, int start, int off ) { if ( ignoreCase ) { str = str.ToLower(); } Match match = new Match(); match.program = program; /* Mark beginning of line for ^ . */ match.str = str; match.bol = start; match.length = str.Length; match.indices = new int[npar * 2]; if ( anchored ) { /* Simplest case: anchored match need be tried only once. */ if ( match.regtry( off ) ) { return match; } } else if ( startChar >= 0 ) { /* We know what char it must start with. */ while ( off < match.length ) { off = str.IndexOf( (System.Char)startChar, off ); if ( off < 0 ) { break; } if ( match.regtry( off ) ) { return match; } off++; } } else { /* Messy cases: unanchored match. */ do { if ( match.regtry( off ) ) { return match; } } while ( off++ < match.length ); } return null; } internal class Compiler { internal char[] parse; internal int off; internal int npar; internal StringBuilder code; internal int flagp; internal const string META = "^$.[()|?+*\\"; internal const string MULT = "*+?"; internal const int WORST = 0; /* Worst case. */ internal const int HASWIDTH = 1; /* Known never to match null string. */ internal const int SIMPLE = 2; /* Simple enough to be STAR/PLUS operand. */ internal const int SPSTART = 4; /* Starts with * or +. */ /* - reg - regular expression, i.e. main body or parenthesized thing * * Caller must absorb opening parenthesis. * * Combining parenthesis handling with the base level of regular expression * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ internal int reg( bool paren ) { int netFlags = HASWIDTH; int parno = 0; int ret = -1; if ( paren ) { parno = npar++; if ( npar >= sunlabs.brazil.util.regexp.Regexp.NSUBEXP ) { throw new System.ArgumentException( "too many ()" ); } ret = regnode( (char)( sunlabs.brazil.util.regexp.Regexp.OPEN + parno ) ); } /* Pick up the branches, linking them together. */ int br = regbranch(); if ( ret >= 0 ) { regtail( ret, br ); } else { ret = br; } if ( ( flagp & HASWIDTH ) == 0 ) { netFlags &= ~HASWIDTH; } netFlags |= ( flagp & SPSTART ); while ( ( off < parse.Length ) && ( parse[off] == '|' ) ) { off++; br = regbranch(); regtail( ret, br ); if ( ( flagp & HASWIDTH ) == 0 ) { netFlags &= ~HASWIDTH; } netFlags |= ( flagp & SPSTART ); } /* Make a closing node, and hook it on the end. */ int ender = regnode( ( paren ) ? (char)( sunlabs.brazil.util.regexp.Regexp.CLOSE + parno ) : sunlabs.brazil.util.regexp.Regexp.END ); regtail( ret, ender ); /* Hook the tails of the branches to the closing node. */ for ( br = ret; br >= 0; br = regnext( br ) ) { regoptail( br, ender ); } /* Check for proper termination. */ if ( paren && ( ( off >= parse.Length ) || ( parse[off++] != ')' ) ) ) { throw new System.ArgumentException( "missing )" ); } else if ( ( paren == false ) && ( off < parse.Length ) ) { throw new System.ArgumentException( "unexpected )" ); } flagp = netFlags; return ret; } /* - regbranch - one alternative of an | operator * * Implements the concatenation operator. */ internal int regbranch() { int netFlags = WORST; /* Tentatively. */ int ret = regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ); int chain = -1; while ( ( off < parse.Length ) && ( parse[off] != '|' ) && ( parse[off] != ')' ) ) { int latest = regpiece(); netFlags |= flagp & HASWIDTH; if ( chain < 0 ) { /* First piece. */ netFlags |= ( flagp & SPSTART ); } else { regtail( chain, latest ); } chain = latest; } if ( chain < 0 ) { /* Loop ran zero times. */ regnode( sunlabs.brazil.util.regexp.Regexp.NOTHING ); } flagp = netFlags; return ret; } /* - regpiece - something followed by possible [*+?] * * Note that the branching code sequences used for ? and the general cases * of * and + are somewhat optimized: they use the same NOTHING node as * both the endmarker for their branch list and the body of the last branch. * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */ internal int regpiece() { int netFlags; int ret = regatom(); if ( ( off >= parse.Length ) || ( isMult( parse[off] ) == false ) ) { return ret; } char op = parse[off]; if ( ( ( flagp & HASWIDTH ) == 0 ) && ( op != '?' ) ) { throw new System.ArgumentException( "*+ operand could be empty" ); } netFlags = ( op != '+' ) ? ( WORST | SPSTART ) : ( WORST | HASWIDTH ); if ( ( op == '*' ) && ( ( flagp & SIMPLE ) != 0 ) ) { reginsert( sunlabs.brazil.util.regexp.Regexp.STAR, ret ); } else if ( op == '*' ) { /* Emit x* as (x&|), where & means "self". */ reginsert( sunlabs.brazil.util.regexp.Regexp.BRANCH, ret ); /* Either x */ regoptail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.BACK ) ); /* and loop */ regoptail( ret, ret ); /* back */ regtail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ) ); /* or */ regtail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.NOTHING ) ); /* null. */ } else if ( ( op == '+' ) && ( ( flagp & SIMPLE ) != 0 ) ) { reginsert( sunlabs.brazil.util.regexp.Regexp.PLUS, ret ); } else if ( op == '+' ) { /* Emit x+ as x(&|), where & means "self". */ int next = regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ); /* Either */ regtail( ret, next ); regtail( regnode( sunlabs.brazil.util.regexp.Regexp.BACK ), ret ); /* loop back */ regtail( next, regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ) ); /* or */ regtail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.NOTHING ) ); /* null. */ } else if ( op == '?' ) { /* Emit x? as (x|) */ reginsert( sunlabs.brazil.util.regexp.Regexp.BRANCH, ret ); /* Either x */ regtail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ) ); /* or */ int next = regnode( sunlabs.brazil.util.regexp.Regexp.NOTHING ); /* null. */ regtail( ret, next ); regoptail( ret, next ); } off++; if ( ( off < parse.Length ) && isMult( parse[off] ) ) { throw new System.ArgumentException( "nested *?+" ); } flagp = netFlags; return ret; } /* - regatom - the lowest level * * Optimization: gobbles an entire sequence of ordinary characters so that * it can turn them into a single node, which is smaller to store and * faster to run. Backslashed characters are exceptions, each becoming a * separate node; the code is simpler that way and it's not worth fixing. */ internal int regatom() { int netFlags = WORST; /* Tentatively. */ int ret; switch ( parse[off++] ) { case '^': ret = regnode( sunlabs.brazil.util.regexp.Regexp.BOL ); break; case '$': ret = regnode( sunlabs.brazil.util.regexp.Regexp.EOL ); break; case '.': ret = regnode( sunlabs.brazil.util.regexp.Regexp.ANY ); netFlags |= ( HASWIDTH | SIMPLE ); break; case '[': { try { if ( parse[off] == '^' ) { ret = regnode( sunlabs.brazil.util.regexp.Regexp.ANYBUT ); off++; } else { ret = regnode( sunlabs.brazil.util.regexp.Regexp.ANYOF ); } int pos = reglen(); regc( '\x0000' ); if ( ( parse[off] == ']' ) || ( parse[off] == '-' ) ) { regc( parse[off++] ); } while ( parse[off] != ']' ) { if ( parse[off] == '-' ) { off++; if ( parse[off] == ']' ) { regc( '-' ); } else { int start = parse[off - 2]; int end = parse[off++]; if ( start > end ) { throw new System.ArgumentException( "invalid [] range" ); } for ( int i = start + 1; i <= end; i++ ) { regc( (char)i ); } } } else { regc( parse[off++] ); } } regset( pos, (char)( reglen() - pos - 1 ) ); off++; netFlags |= HASWIDTH | SIMPLE; } catch ( System.IndexOutOfRangeException e ) { throw new System.ArgumentException( "missing ]" ); } break; } case '(': ret = reg( true ); netFlags |= ( flagp & ( HASWIDTH | SPSTART ) ); break; case '|': case ')': throw new System.ArgumentException( "internal urp" ); case '?': case '+': case '*': throw new System.ArgumentException( "?+* follows nothing" ); case '\\': if ( off >= parse.Length ) { throw new System.ArgumentException( "trailing \\" ); } ret = regnode( sunlabs.brazil.util.regexp.Regexp.EXACTLY ); regc( (char)1 ); regc( parse[off++] ); netFlags |= HASWIDTH | SIMPLE; break; default: { off--; int end; for ( end = off; end < parse.Length; end++ ) { if ( META.IndexOf( (System.Char)parse[end] ) >= 0 ) { break; } } if ( ( end > off + 1 ) && ( end < parse.Length ) && isMult( parse[end] ) ) { end--; /* Back off clear of ?+* operand. */ } netFlags |= HASWIDTH; if ( end == off + 1 ) { netFlags |= SIMPLE; } ret = regnode( sunlabs.brazil.util.regexp.Regexp.EXACTLY ); regc( (char)( end - off ) ); for ( ; off < end; off++ ) { regc( parse[off] ); } } break; } flagp = netFlags; return ret; } /* - regnode - emit a node */ internal int regnode( char op ) { int ret = code.Length; code.Append( op ); code.Append( '\x0000' ); return ret; } /* - regc - emit (if appropriate) a byte of code */ internal void regc( char b ) { code.Append( b ); } internal int reglen() { return code.Length; } internal void regset( int pos, char ch ) { code[pos] = ch; } /* - reginsert - insert an operator in front of already-emitted operand * * Means relocating the operand. */ internal void reginsert( char op, int pos ) { char[] tmp = new char[] { op, '\x0000' }; code.Insert( pos, tmp ); } /* - regtail - set the next-pointer at the end of a node chain */ internal void regtail( int pos, int val ) { /* Find last node. */ int scan = pos; while ( true ) { int tmp = regnext( scan ); if ( tmp < 0 ) { break; } scan = tmp; } int offset = ( code[scan] == sunlabs.brazil.util.regexp.Regexp.BACK ) ? scan - val : val - scan; code[scan + 1] = (char)offset; } /* - regoptail - regtail on operand of first argument; nop if operandless */ internal void regoptail( int pos, int val ) { if ( ( pos < 0 ) || ( code[pos] != sunlabs.brazil.util.regexp.Regexp.BRANCH ) ) { return; } regtail( pos + 2, val ); } /* - regnext - dig the "next" pointer out of a node */ internal int regnext( int pos ) { int offset = code[pos + 1]; if ( offset == 0 ) { return -1; } if ( code[pos] == sunlabs.brazil.util.regexp.Regexp.BACK ) { return pos - offset; } else { return pos + offset; } } internal static bool isMult( char ch ) { return ( ch == '*' ) || ( ch == '+' ) || ( ch == '?' ); } } internal class Match { internal char[] program; internal string str; internal int bol; internal int input; internal int length; internal int[] indices; internal bool regtry( int off ) { this.input = off; for ( int i = 0; i < indices.Length; i++ ) { indices[i] = -1; } if ( regmatch( 0 ) ) { indices[0] = off; indices[1] = input; return true; } else { return false; } } /* - regmatch - main matching routine * * Conceptually the strategy is simple: check to see whether the current * node matches, call self recursively to see whether the rest matches, * and then act accordingly. In practice we make some effort to avoid * recursion, in particular by going through "ordinary" nodes (that don't * need to know whether the rest of the match failed) by a loop instead of * by recursion. */ internal bool regmatch( int scan ) { while ( true ) { int next = regnext( scan ); int op = program[scan]; switch ( op ) { case sunlabs.brazil.util.regexp.Regexp.BOL: if ( input != bol ) { return false; } break; case sunlabs.brazil.util.regexp.Regexp.EOL: if ( input != length ) { return false; } break; case sunlabs.brazil.util.regexp.Regexp.ANY: if ( input >= length ) { return false; } input++; break; case sunlabs.brazil.util.regexp.Regexp.EXACTLY: { if ( compare( scan ) == false ) { return false; } break; } case sunlabs.brazil.util.regexp.Regexp.ANYOF: if ( input >= length ) { return false; } if ( present( scan ) == false ) { return false; } input++; break; case sunlabs.brazil.util.regexp.Regexp.ANYBUT: if ( input >= length ) { return false; } if ( present( scan ) ) { return false; } input++; break; case sunlabs.brazil.util.regexp.Regexp.NOTHING: case sunlabs.brazil.util.regexp.Regexp.BACK: break; case sunlabs.brazil.util.regexp.Regexp.BRANCH: { if ( program[next] != sunlabs.brazil.util.regexp.Regexp.BRANCH ) { next = scan + 2; } else { do { int save = input; if ( regmatch( scan + 2 ) ) { return true; } input = save; scan = regnext( scan ); } while ( ( scan >= 0 ) && ( program[scan] == sunlabs.brazil.util.regexp.Regexp.BRANCH ) ); return false; } break; } case sunlabs.brazil.util.regexp.Regexp.STAR: case sunlabs.brazil.util.regexp.Regexp.PLUS: { /* * Lookahead to avoid useless match attempts * when we know what character comes next. */ int ch = -1; if ( program[next] == sunlabs.brazil.util.regexp.Regexp.EXACTLY ) { ch = program[next + 3]; } int min = ( op == sunlabs.brazil.util.regexp.Regexp.STAR ) ? 0 : 1; int save = input; int no = regrepeat( scan + 2 ); while ( no >= min ) { /* If it could work, try it. */ if ( ( ch < 0 ) || ( ( input < length ) && ( str[input] == ch ) ) ) { if ( regmatch( next ) ) { return true; } } /* Couldn't or didn't -- back up. */ no--; input = save + no; } return false; } case sunlabs.brazil.util.regexp.Regexp.END: return true; default: if ( op >= sunlabs.brazil.util.regexp.Regexp.CLOSE ) { int no = op - sunlabs.brazil.util.regexp.Regexp.CLOSE; int save = input; if ( regmatch( next ) ) { /* * Don't set endp if some later * invocation of the same parentheses * already has. */ if ( indices[no * 2 + 1] <= 0 ) { indices[no * 2 + 1] = save; } return true; } } else if ( op >= sunlabs.brazil.util.regexp.Regexp.OPEN ) { int no = op - sunlabs.brazil.util.regexp.Regexp.OPEN; int save = input; if ( regmatch( next ) ) { /* * Don't set startp if some later invocation of the * same parentheses already has. */ if ( indices[no * 2] <= 0 ) { indices[no * 2] = save; } return true; } } return false; } scan = next; } } internal bool compare( int scan ) { int count = program[scan + 2]; if ( input + count > length ) { return false; } int start = scan + 3; int end = start + count; for ( int i = start; i < end; i++ ) { if ( str[input++] != program[i] ) { return false; } } return true; } internal bool present( int scan ) { char ch = str[input]; int count = program[scan + 2]; int start = scan + 3; int end = start + count; for ( int i = start; i < end; i++ ) { if ( program[i] == ch ) { return true; } } return false; } /* - regrepeat - repeatedly match something simple, report how many */ internal int regrepeat( int scan ) { int op = program[scan]; int count = 0; switch ( op ) { case sunlabs.brazil.util.regexp.Regexp.ANY: count = length - input; input = length; break; case sunlabs.brazil.util.regexp.Regexp.EXACTLY: { // 'g*' matches all the following 'g' characters. char ch = program[scan + 3]; while ( ( input < length ) && ( str[input] == ch ) ) { input++; count++; } break; } case sunlabs.brazil.util.regexp.Regexp.ANYOF: while ( ( input < length ) && present( scan ) ) { input++; count++; } break; case sunlabs.brazil.util.regexp.Regexp.ANYBUT: while ( ( input < length ) && !present( scan ) ) { input++; count++; } break; } return count; } /* - regnext - dig the "next" pointer out of a node */ internal int regnext( int scan ) { int offset = program[scan + 1]; if ( program[scan] == sunlabs.brazil.util.regexp.Regexp.BACK ) { return scan - offset; } else { return scan + offset; } } } } }