/**
 * <a href='#all'>All</a> variant of {@link #findUTF8Submatch}: collects up to {@code n}
 * successive matches of the expression in {@code b}.
 *
 * <p>
 * Each list element holds one byte array per (start, end) pair reported for that match; a pair
 * whose start offset is negative leaves its slot {@code null}.
 *
 * @param b the UTF-8 encoded input
 * @param n the maximum number of matches to collect (forwarded unchanged to allMatches)
 * @return the list of matches, or null when nothing matched
 */
// This is visible for testing.
public List <byte[][]> findAllUTF8Submatch(byte[] b, int n)
{
    List <byte[][]> found = new List <byte[][]>();

    allMatches(MachineInput.fromUTF8(b), n, (int[] match) =>
    {
        int pairCount = match.Length / 2;
        byte[][] pieces = new byte[pairCount][];
        for (int g = 0; g < pairCount; g++)
        {
            int from = match[2 * g];
            if (from < 0)
            {
                // Group did not participate in the match; keep the slot null.
                continue;
            }
            pieces[g] = Utils.subarray(b, from, match[2 * g + 1]);
        }
        found.Add(pieces);
    });

    return found.Any() ? found : null;
}
/**
 * Matches the regular expression against {@code input}, searching from {@code start} up to
 * {@code end} with the given anchoring, and copies the submatch boundaries into {@code group}
 * as [start, end) offset pairs. It is most efficient not to ask for submatch boundaries.
 *
 * @param input the input string
 * @param start the beginning position in the input
 * @param end the end position in the input
 * @param anchor the anchoring flag (UNANCHORED, ANCHOR_START, ANCHOR_BOTH)
 * @param group the array to fill with submatch positions; may be null to skip the copy
 * @param ngroup the number of array pairs to fill in
 * @return true if a match was found
 */
public bool match(string input, int start, int end, int anchor, int[] group, int ngroup)
{
    // Strings in Java and C# are indexed from zero, but Java doesn't crash when
    // the index runs over while C# does, so an empty or inverted range is rejected.
    if (end <= start)
    {
        return false;
    }

    // TODO(afrozm): We suspect that the correct code should look something
    // like the following:
    // doExecute(MachineInput.fromUTF16(input), start, anchor, 2*ngroup);
    //
    // In Russ' own words:
    // That is, I believe doExecute needs to know the bounds of the whole input
    // as well as the bounds of the subpiece that is being searched.
    MachineInput bounded = MachineInput.fromUTF16(input, 0, end);
    int[] found = doExecute(bounded, start, anchor, 2 * ngroup);

    bool hit = found != null;
    if (hit && group != null)
    {
        System.Array.Copy(found, 0, group, 0, found.Length);
    }
    return hit;
}
/**
 * <a href='#all'>All</a> variant of {@link #findSubmatch}: collects up to {@code n} successive
 * matches of the expression in {@code s}.
 *
 * <p>
 * Each list element holds one string per (start, end) pair reported for that match; a pair
 * whose start offset is negative leaves its slot {@code null}.
 *
 * @param s the input string
 * @param n the maximum number of matches to collect (forwarded unchanged to allMatches)
 * @return the list of matches, or null when nothing matched
 */
// This is visible for testing.
public List <String[]> findAllSubmatch(String s, int n)
{
    List <String[]> found = new List <String[]>();

    allMatches(MachineInput.fromUTF16(s), n, (int[] match) =>
    {
        int pairCount = match.Length / 2;
        String[] pieces = new String[pairCount];
        for (int g = 0; g < pairCount; g++)
        {
            int from = match[2 * g];
            if (from < 0)
            {
                // Group did not participate in the match; keep the slot null.
                continue;
            }
            int to = match[2 * g + 1];
            pieces[g] = s.Substring(from, to - from);
        }
        found.Add(pieces);
    });

    return found.Any() ? found : null;
}
// doExecute() obtains a Machine via get(), initialises it for |ncap| capture
// slots and runs it over |in| from |pos| with the given |anchor|.
// It returns the machine's submatches() when the run reports a match and
// null otherwise; the machine is handed back through put() before returning.
// Derived from exec.go.
public int[] doExecute(MachineInput @in, int pos, int anchor, int ncap)
{
    Machine machine = get();
    machine.init(ncap);

    int[] captures = null;
    if (machine.match(@in, pos, anchor))
    {
        captures = machine.submatches();
    }

    put(machine);
    return captures;
}
/**
 * Locates the leftmost match of this regular expression in {@code s}.
 *
 * <p>
 * The result is whatever doExecute produces for two capture slots: the match itself is at
 * {@code s.substring(loc[0], loc[1])}.
 *
 * @param s the input string
 * @return the match location, or null when there is no match
 */
// This is visible for testing.
public int[] findIndex(String s)
{
    // doExecute already yields null on "no match", so its result is passed straight through.
    return doExecute(MachineInput.fromUTF16(s), 0, UNANCHORED, 2);
}
/**
 * Extracts the text of the leftmost match of this regular expression in {@code s}.
 *
 * <p>
 * An empty string is returned both when nothing matches and when the expression matches an
 * empty string. Use {@link #findIndex} or {@link #findSubmatch} if it is necessary to
 * distinguish these cases.
 *
 * @param s the input string
 * @return the matched text, or "" when there is no match
 */
// This is visible for testing.
public String find(String s)
{
    int[] loc = doExecute(MachineInput.fromUTF16(s), 0, UNANCHORED, 2);
    return loc == null ? "" : s.Substring(loc[0], loc[1] - loc[0]);
}
/**
 * Locates the leftmost match of this regular expression in the UTF-8 bytes {@code b}.
 *
 * <p>
 * The match itself is at {@code b[loc[0]...loc[1]]}. The result is produced by
 * {@code Utils.subarray(a, 0, 2)} applied to the array doExecute returns.
 *
 * @param b the UTF-8 encoded input
 * @return the match location, or null when there is no match
 */
// This is visible for testing.
public int[] findUTF8Index(byte[] b)
{
    int[] loc = doExecute(MachineInput.fromUTF8(b), 0, UNANCHORED, 2);
    return loc != null ? Utils.subarray(loc, 0, 2) : null;
}
// Finds successive matches in |input| and hands each accepted one, after
// pad(), to |deliver|. At most |n| matches are delivered; a negative |n|
// is replaced by input.endPos() + 1.
private void allMatches(MachineInput input, int n, DeliverFunc deliver)
{
    int end = input.endPos();
    int limit = n < 0 ? end + 1 : n;

    int pos = 0;
    int delivered = 0;
    int prevMatchEnd = -1;

    while (delivered < limit && pos <= end)
    {
        int[] matches = doExecute(input, pos, UNANCHORED, prog.numCap);
        if (matches == null || matches.Length == 0)
        {
            break;
        }

        int matchEnd = matches[1];
        bool accept = true;

        if (matchEnd != pos)
        {
            // Non-empty progress: resume right after this match.
            pos = matchEnd;
        }
        else
        {
            // We've found an empty match. We don't allow an empty match
            // right after a previous match, so ignore it in that case.
            if (matches[0] == prevMatchEnd)
            {
                accept = false;
            }

            // Step over one rune so the search always advances.
            int r = input.step(pos);
            pos = r < 0
                ? end + 1            // EOF
                : pos + (r & 0x7);   // low three bits carry the rune width
        }
        prevMatchEnd = matchEnd;

        if (accept)
        {
            deliver(pad(matches));
            delivered++;
        }
    }
}
/**
 * <a href='#all'>All</a> variant of {@link #findUTF8SubmatchIndex}: collects the index arrays of
 * up to {@code n} successive matches of the expression in {@code b}.
 *
 * @param b the UTF-8 encoded input
 * @param n the maximum number of matches to collect (forwarded unchanged to allMatches)
 * @return the list of index arrays exactly as delivered by allMatches, or null when nothing
 *         matched
 */
// This is visible for testing.
public List <int[]> findAllUTF8SubmatchIndex(byte[] b, int n)
{
    List <int[]> spans = new List <int[]>();
    MachineInput input = MachineInput.fromUTF8(b);

    allMatches(input, n, (int[] match) => spans.Add(match));

    return spans.Any() ? spans : null;
}
/**
 * <a href='#all'>All</a> variant of {@link #find}: collects the text of up to {@code n}
 * successive matches of the expression in {@code s}.
 *
 * @param s the input string
 * @param n the maximum number of matches to collect (forwarded unchanged to allMatches)
 * @return the matched substrings in order, or null when nothing matched
 */
// This is visible for testing.
public List <String> findAll(String s, int n)
{
    List <String> texts = new List <String>();
    MachineInput input = MachineInput.fromUTF16(s);

    allMatches(input, n, (int[] match) =>
    {
        int from = match[0];
        int length = match[1] - from;
        texts.Add(s.Substring(from, length));
    });

    return texts.Any() ? texts : null;
}
/**
 * <a href='#all'>All</a> variant of {@link #findUTF8}: collects the bytes of up to {@code n}
 * successive matches of the expression in {@code b}.
 *
 * <p>
 * TODO(adonovan): think about defining a byte slice view class, like a read-only Go slice backed
 * by |b|.
 *
 * @param b the UTF-8 encoded input
 * @param n the maximum number of matches to collect (forwarded unchanged to allMatches)
 * @return one byte array per match, or null when nothing matched
 */
// This is visible for testing.
public List <byte[]> findAllUTF8(byte[] b, int n)
{
    List <byte[]> chunks = new List <byte[]>();
    MachineInput input = MachineInput.fromUTF8(b);

    allMatches(input, n, (int[] match) =>
    {
        byte[] chunk = Utils.subarray(b, match[0], match[1]);
        chunks.Add(chunk);
    });

    return chunks.Any() ? chunks : null;
}
/**
 * <a href='#all'>All</a> variant of {@link #findSubmatchIndex}: collects the index arrays of up
 * to {@code n} successive matches of the expression in {@code s}.
 *
 * @param s the input string
 * @param n the maximum number of matches to collect (forwarded unchanged to allMatches)
 * @return the list of index arrays exactly as delivered by allMatches, or null when nothing
 *         matched
 */
// This is visible for testing.
public List <int[]> findAllSubmatchIndex(String s, int n)
{
    List <int[]> spans = new List <int[]>();
    MachineInput input = MachineInput.fromUTF16(s);

    allMatches(input, n, (int[] match) => spans.Add(match));

    return spans.Any() ? spans : null;
}
/**
 * Extracts the bytes of the leftmost match of the regular expression in {@code b} together with
 * the matches, if any, of its subexpressions, as defined by the <a
 * href='#submatch'>Submatch</a> description above.
 *
 * <p>
 * The result always has {@code 1 + numSubexp} slots; a slot stays {@code null} when its start
 * offset is negative or lies beyond the capture array returned by doExecute.
 *
 * @param b the UTF-8 encoded input
 * @return the match and submatch bytes, or null when there is no match
 */
// This is visible for testing.
public byte[][] findUTF8Submatch(byte[] b)
{
    int[] caps = doExecute(MachineInput.fromUTF8(b), 0, UNANCHORED, prog.numCap);
    if (caps == null)
    {
        return null;
    }

    byte[][] groups = new byte[1 + numSubexp][];
    // |lo| tracks the index of the start offset belonging to group |g|.
    for (int g = 0, lo = 0; g < groups.Length; g++, lo += 2)
    {
        bool captured = lo < caps.Length && caps[lo] >= 0;
        if (captured)
        {
            groups[g] = Utils.subarray(b, caps[lo], caps[lo + 1]);
        }
    }
    return groups;
}
/**
 * Extracts the text of the leftmost match of the regular expression in {@code s} together with
 * the matches, if any, of its subexpressions, as defined by the <a
 * href='#submatch'>Submatch</a> description above.
 *
 * <p>
 * The result always has {@code 1 + numSubexp} slots; a slot stays {@code null} when its start
 * offset is negative or lies beyond the capture array returned by doExecute.
 *
 * @param s the input string
 * @return the match and submatch strings, or null when there is no match
 */
// This is visible for testing.
public string[] findSubmatch(String s)
{
    int[] caps = doExecute(MachineInput.fromUTF16(s), 0, UNANCHORED, prog.numCap);
    if (caps == null)
    {
        return null;
    }

    string[] groups = new string[1 + numSubexp];
    // |lo| tracks the index of the start offset belonging to group |g|.
    for (int g = 0, lo = 0; g < groups.Length; g++, lo += 2)
    {
        bool captured = lo < caps.Length && caps[lo] >= 0;
        if (captured)
        {
            int from = caps[lo];
            groups[g] = s.Substring(from, caps[lo + 1] - from);
        }
    }
    return groups;
}
/**
 * Reports whether this regexp matches the UTF-8 byte array {@code b}.
 *
 * <p>
 * The search is unanchored and requests zero capture slots, since only the yes/no outcome is
 * needed.
 *
 * @param b the UTF-8 encoded input
 * @return true iff doExecute reports a match
 */
// This is visible for testing.
bool matchUTF8(byte[] b)
{
    int[] outcome = doExecute(MachineInput.fromUTF8(b), 0, UNANCHORED, 0);
    return outcome != null;
}
/**
 * Computes the index pairs identifying the leftmost match of this regular expression in
 * {@code s} and the matches, if any, of its subexpressions, as defined by the <a
 * href='#submatch'>Submatch</a> description above.
 *
 * <p>
 * The raw capture array from doExecute is passed through pad() before being returned. The
 * documented contract is that null indicates no match (this relies on pad() handing a null
 * input back as null, which is not visible in this method).
 *
 * @param s the input string
 * @return the padded index pairs
 */
// This is visible for testing.
public int[] findSubmatchIndex(String s)
{
    MachineInput input = MachineInput.fromUTF16(s);
    int[] caps = doExecute(input, 0, UNANCHORED, prog.numCap);
    return pad(caps);
}
/**
 * Reports whether this regexp matches the string {@code s}.
 *
 * <p>
 * The search is unanchored and requests zero capture slots, since only the yes/no outcome is
 * needed.
 *
 * @param s the input string
 * @return true iff doExecute reports a match
 */
public bool match(string s)
{
    int[] outcome = doExecute(MachineInput.fromUTF16(s), 0, UNANCHORED, 0);
    return outcome != null;
}
/**
 * Returns a copy of {@code src} in which at most {@code maxReplaces} matches for this regexp have
 * been replaced by the return value of function {@code repl} (whose first argument is the
 * matched string). No support is provided for expressions (e.g. {@code \1} or {@code $1}) in the
 * replacement string.
 *
 * @param src the input string
 * @param repl callback invoked with each matched substring; its result is inserted instead
 * @param maxReplaces upper bound on the number of replacements performed
 * @return the rewritten string
 */
// This is visible for testing.
public string replaceAllFunc(string src, ReplaceFunc repl, int maxReplaces)
{
    int lastMatchEnd = 0; // end position of the most recent match
    int searchPos = 0;    // position where we next look for a match
    StringBuilder buf = new StringBuilder();
    MachineInput input = MachineInput.fromUTF16(src);
    int numReplaces = 0;

    while (searchPos <= src.Length)
    {
        int[] a = doExecute(input, searchPos, UNANCHORED, 2);
        if (a == null || a.Length == 0)
        {
            break; // no more matches
        }

        // Copy the unmatched characters before this match.
        // NOTE: C#'s Substring takes (startIndex, length), not (begin, end) as
        // Java's substring does, so the length must be computed explicitly.
        buf.Append(src.Substring(lastMatchEnd, a[0] - lastMatchEnd));

        // Now insert a copy of the replacement string, but not for a
        // match of the empty string immediately after another match.
        // (Otherwise, we get double replacement for patterns that
        // match both empty and nonempty strings.)
        // FIXME(adonovan), FIXME(afrozm) - JDK seems to be doing exactly this
        // put a replacement for a pattern that also matches empty and non-empty
        // strings. The fix would not just be a[1] >= lastMatchEnd, there are a
        // few corner cases in that as well, and there are tests which will fail
        // when that case is touched (happens only at the end of the input string
        // though).
        if (a[1] > lastMatchEnd || a[0] == 0)
        {
            // Same (startIndex, length) conversion for the matched text itself.
            buf.Append(repl(src.Substring(a[0], a[1] - a[0])));
            // Increment the replace count.
            ++numReplaces;
        }
        lastMatchEnd = a[1];

        // Advance past this match; always advance at least one character.
        int width = input.step(searchPos) & 0x7;
        if (searchPos + width > a[1])
        {
            searchPos += width;
        }
        else if (searchPos + 1 > a[1])
        {
            // This clause is only needed at the end of the input
            // string. In that case, DecodeRuneInString returns width=0.
            searchPos++;
        }
        else
        {
            searchPos = a[1];
        }

        if (numReplaces >= maxReplaces)
        {
            // Should never be greater though.
            break;
        }
    }

    // Copy the unmatched characters after the last match.
    buf.Append(src.Substring(lastMatchEnd));

    return buf.ToString();
}
// match() runs the machine over the input |in| starting at |pos| with the
// RE2 Anchor |anchor|.
// It reports whether a match was found.
// If so, matchcap holds the submatch information.
//
// Returns false without searching when re2.cond equals Utils.EMPTY_ALL, or
// when |anchor| is ANCHOR_START / ANCHOR_BOTH and |pos| is not 0.
// Every value obtained from in.step() is unpacked the same way here: the
// rune is r >> 3 and its width is r & 7.
public bool match(MachineInput @in, int pos, int anchor)
{
    int startCond = re2.cond;
    if (startCond == Utils.EMPTY_ALL)
    {
        // impossible
        return(false);
    }
    // A start-anchored search is only attempted from position 0.
    if ((anchor == RE2.ANCHOR_START || anchor == RE2.ANCHOR_BOTH) && pos != 0)
    {
        return(false);
    }
    // Clear the match flag and the first prog.numCap capture slots.
    matched = false;
    for (int jj = 0; jj < prog.numCap; ++jj)
    {
        matchcap[jj] = -1;
    }
    Queue runq = q0, nextq = q1;
    // Decode the rune at |pos| ...
    int r = @in.step(pos);
    int rune = r >> 3;
    int width = r & 7;
    // ... and, unless step() reported EOF, the rune following it (lookahead).
    int rune1 = -1;
    int width1 = 0;
    if (r != MachineInput.EOF)
    {
        r = @in.step(pos + width);
        rune1 = r >> 3;
        width1 = r & 7;
    }
    int flag; // bitmask of EMPTY_* flags
    if (pos == 0)
    {
        // At the very start there is no preceding rune, hence -1.
        flag = Utils.emptyOpContext(-1, rune);
    }
    else
    {
        flag = @in.context(pos);
    }
    for (;;)
    {
        if (runq.isEmpty())
        {
            if ((startCond & Utils.EMPTY_BEGIN_TEXT) != 0 && pos != 0)
            {
                // Anchored match, past beginning of text.
                break;
            }
            if (matched)
            {
                // Have match; finished exploring alternatives.
                break;
            }
            if (re2.prefix.Length != 0 && rune1 != re2.prefixRune && @in.canCheckPrefix())
            {
                // Match requires literal prefix; fast search for it.
                int advance = @in.index(re2, pos);
                if (advance < 0)
                {
                    break;
                }
                // Jump ahead by |advance| and re-decode the current and lookahead runes.
                pos += advance;
                r = @in.step(pos);
                rune = r >> 3;
                width = r & 7;
                r = @in.step(pos + width);
                rune1 = r >> 3;
                width1 = r & 7;
            }
        }
        if (!matched && (pos == 0 || anchor == RE2.UNANCHORED))
        {
            // If we are anchoring at begin then only add threads that begin
            // at |pos| = 0.
            if (ncap > 0)
            {
                matchcap[0] = pos;
            }
            add(runq, prog.start, pos, matchcap, flag, null);
        }
        flag = Utils.emptyOpContext(rune, rune1);
        // The last argument tells step() whether |pos| is the end of the input.
        // NOTE(review): |matched| is presumably updated inside step() — confirm there.
        step(runq, nextq, pos, pos + width, rune, flag, anchor, pos == @in.endPos());
        if (width == 0)
        {
            // EOF
            break;
        }
        if (ncap == 0 && matched)
        {
            // Found a match and not paying attention
            // to where it is, so any match will do.
            break;
        }
        // Shift the window: the lookahead rune becomes the current rune.
        pos += width;
        rune = rune1;
        width = width1;
        if (rune != -1)
        {
            r = @in.step(pos + width);
            rune1 = r >> 3;
            width1 = r & 7;
        }
        // Swap the two queues for the next iteration.
        Queue tmpq = runq;
        runq = nextq;
        nextq = tmpq;
    }
    free(nextq);
    return(matched);
}
/**
 * Computes the index pairs identifying the leftmost match of this regular expression in
 * {@code b} and the matches, if any, of its subexpressions, as defined by the
 * <a href='#submatch'>Submatch</a> and <a href='#index'>Index</a> descriptions above.
 *
 * <p>
 * The raw capture array from doExecute is passed through pad() before being returned. The
 * documented contract is that null indicates no match (this relies on pad() handing a null
 * input back as null, which is not visible in this method).
 *
 * @param b the UTF-8 encoded input
 * @return the padded index pairs
 */
// This is visible for testing.
public int[] findUTF8SubmatchIndex(byte[] b)
{
    MachineInput input = MachineInput.fromUTF8(b);
    int[] caps = doExecute(input, 0, UNANCHORED, prog.numCap);
    return pad(caps);
}