/// <summary>
/// Creates an instance whose character class is a newly constructed <c>RegexCharClass</c>.
/// </summary>
/// <param name="nullable">Value stored in <c>_nullable</c>.</param>
internal RegexFC(bool nullable)
{
    _nullable = nullable;
    _cc = new RegexCharClass();
}
/// <summary>
/// Builds a one-line, human-readable description of the instruction stored at
/// <paramref name="offset"/> in <c>_codes</c>.
/// </summary>
/// <remarks>
/// Layout: the offset as six digits, a '*' when <c>OpcodeBacktracks</c> returns true for the masked
/// opcode (a space otherwise), the operator description, then the operands in parentheses.
/// </remarks>
/// <param name="offset">Index into <c>_codes</c> of the opcode to describe.</param>
/// <returns>The formatted description.</returns>
internal string OpcodeDescription(int offset)
{
    int rawOpcode = _codes[offset];
    int opcode = rawOpcode & Mask;

    StringBuilder text = new StringBuilder();
    text.AppendFormat("{0:D6} ", offset);
    text.Append(OpcodeBacktracks(opcode) ? '*' : ' ');
    text.Append(OperatorDescription(rawOpcode));
    text.Append('(');

    // First operand: what it means depends on the opcode family.
    switch (opcode)
    {
        case One:
        case Notone:
        case Onerep:
        case Notonerep:
        case Oneloop:
        case Notoneloop:
        case Onelazy:
        case Notonelazy:
            text.Append("Ch = ").Append(RegexCharClass.CharDescription((char)_codes[offset + 1]));
            break;

        case Set:
        case Setrep:
        case Setloop:
        case Setlazy:
            text.Append("Set = ").Append(RegexCharClass.SetDescription(_strings[_codes[offset + 1]]));
            break;

        case Multi:
            text.Append("String = ").Append(_strings[_codes[offset + 1]]);
            break;

        case Ref:
        case Testref:
            text.Append("Index = ").Append(_codes[offset + 1]);
            break;

        case Capturemark:
            text.Append("Index = ").Append(_codes[offset + 1]);
            int unindex = _codes[offset + 2];
            if (unindex != -1)
            {
                text.Append(", Unindex = ").Append(unindex);
            }
            break;

        case Nullcount:
        case Setcount:
            text.Append("Value = ").Append(_codes[offset + 1]);
            break;

        case Goto:
        case Lazybranch:
        case Branchmark:
        case Lazybranchmark:
        case Branchcount:
        case Lazybranchcount:
            text.Append("Addr = ").Append(_codes[offset + 1]);
            break;
    }

    // Second operand: a repeat count or a loop limit; int.MaxValue is printed as "inf".
    string boundLabel = null;
    switch (opcode)
    {
        case Onerep:
        case Notonerep:
        case Oneloop:
        case Notoneloop:
        case Onelazy:
        case Notonelazy:
        case Setrep:
        case Setloop:
        case Setlazy:
            boundLabel = ", Rep = ";
            break;

        case Branchcount:
        case Lazybranchcount:
            boundLabel = ", Limit = ";
            break;
    }

    if (boundLabel != null)
    {
        text.Append(boundLabel);

        int bound = _codes[offset + 2];
        if (bound == int.MaxValue)
        {
            text.Append("inf");
        }
        else
        {
            text.Append(bound);
        }
    }

    text.Append(')');
    return text.ToString();
}
/// <summary>
/// Reports whether <paramref name="ch"/> is a member of the class described by
/// <paramref name="charClass"/>; the check is delegated to <c>RegexCharClass.CharInClass</c>.
/// </summary>
/// <param name="ch">Character to test.</param>
/// <param name="charClass">Character-class string passed through unchanged.</param>
/// <returns>The result of <c>RegexCharClass.CharInClass</c>.</returns>
protected static bool CharInClass(char ch, string charClass)
{
    bool isMember = RegexCharClass.CharInClass(ch, charClass);
    return isMember;
}
/// <summary>
/// Translates a character-class string into a single solver condition over characters.
/// </summary>
/// <remarks>
/// Every range and every category entry of <paramref name="set"/> becomes one condition.
/// For a non-negated set the conditions are OR-ed together. For a negated set each condition is
/// negated as it is created and the results are AND-ed. A trailing subtractor set, when present,
/// is converted recursively and its negation is AND-ed on last, outside the scope of the set's
/// own negation.
/// </remarks>
/// <param name="ignoreCase">Forwarded to the solver when range constraints are built.</param>
/// <param name="set">Character-class string to convert.</param>
/// <returns>The combined condition.</returns>
internal S CreateConditionFromSet(bool ignoreCase, string set)
{
    bool isNegated = RegexCharClass.IsNegated(set);

    // Member conditions; negation is pushed inward, so each one is already negated if needed.
    List<S> parts = new List<S>();

    // ---- ranges ----
    foreach (var range in ComputeRanges(set))
    {
        S rangeCond = solver.MkRangeConstraint(range.First, range.Second, ignoreCase);
        if (isNegated)
        {
            rangeCond = solver.MkNot(rangeCond);
        }
        parts.Add(rangeCond);
    }

    // ---- categories ----
    int rangeSectionLength = set[SETLENGTH];
    int categorySectionLength = set[CATEGORYLENGTH];
    int categoryStart = rangeSectionLength + SETSTART;
    int pos = categoryStart;

    while (pos < categoryStart + categorySectionLength)
    {
        // A single category is stored as a character whose code, read as a short, is
        // 1 + the Unicode category code; a negative value marks a negated category (\D vs \d).
        short code = (short)set[pos++];

        if (code != 0)
        {
            S single = MapCategoryCodeToCondition(Math.Abs(code) - 1);

            // A negated category inside a negated set cancels out.
            bool flip = (code < 0) ^ isNegated;
            parts.Add(flip ? solver.MkNot(single) : single);
            continue;
        }

        // A 0 opens a group of categories terminated by another 0:
        // 0 C1 C2 ... Cn 0  ==>  G = (C1 | C2 | ... | Cn)
        code = (short)set[pos++];
        if (code == 0)
        {
            continue; // empty group
        }

        // The sign of the first code decides whether the whole group is negated.
        bool groupNegated = code < 0;
        var groupCodes = new HashSet<int>();
        do
        {
            groupCodes.Add(Math.Abs(code) - 1);
            code = (short)set[pos++];
        }
        while (code != 0);

        S groupCond = MapCategoryCodeSetToCondition(groupCodes);
        if (isNegated ^ groupNegated)
        {
            groupCond = solver.MkNot(groupCond);
        }
        parts.Add(groupCond);
    }

    // ---- subtractor ----
    // Anything left after the category section is a subtractor set whose characters are
    // excluded. Subtractors may nest, e.g. [a-z-[b-g-[cd]]], hence the recursion.
    S subtractorCond = default(S);
    if (pos < set.Length)
    {
        subtractorCond = CreateConditionFromSet(ignoreCase, set.Substring(pos));
    }

    // ---- combine ----
    S result;
    if (parts.Count == 0)
    {
        // No ranges and no categories (arises for '.' under the Singleline option).
        result = isNegated ? solver.False : solver.True;
    }
    else if (isNegated)
    {
        result = solver.MkAnd(parts);
    }
    else
    {
        result = solver.MkOr(parts);
    }

    // The subtractor is not inside the scope of the negation, so it is applied afterwards.
    if (!object.Equals(subtractorCond, default(S)))
    {
        result = solver.MkAnd(result, solver.MkNot(subtractorCond));
    }

    return result;
}
/*
 * Scans the characters that follow a '\' (the '\' itself is not counted) and
 * returns the RegexNode for the atom that was scanned.
 *
 * Anchor escapes (\b \B \A \G \Z \z) yield a node whose type comes from TypeFromCode.
 * Class escapes (\w \W \s \S \d \D \p \P) yield a Set node; the ECMA variant of the
 * class is chosen when UseOptionE() is true. Everything else is handed to
 * ScanBasicBackslash().
 */
private RegexNode ScanBackslash()
{
    if (CharsRight() == 0)
    {
        throw MakeException(Strings.IllegalEndEscape);
    }

    char ch = RightChar();
    string setClass;

    switch (ch)
    {
        case 'b':
        case 'B':
        case 'A':
        case 'G':
        case 'Z':
        case 'z':
            MoveRight();
            return new RegexNode(TypeFromCode(ch), _options);

        case 'w':
            MoveRight();
            setClass = UseOptionE() ? RegexCharClass.ECMAWordClass : RegexCharClass.WordClass;
            break;

        case 'W':
            MoveRight();
            setClass = UseOptionE() ? RegexCharClass.NotECMAWordClass : RegexCharClass.NotWordClass;
            break;

        case 's':
            MoveRight();
            setClass = UseOptionE() ? RegexCharClass.ECMASpaceClass : RegexCharClass.SpaceClass;
            break;

        case 'S':
            MoveRight();
            setClass = UseOptionE() ? RegexCharClass.NotECMASpaceClass : RegexCharClass.NotSpaceClass;
            break;

        case 'd':
            MoveRight();
            setClass = UseOptionE() ? RegexCharClass.ECMADigitClass : RegexCharClass.DigitClass;
            break;

        case 'D':
            MoveRight();
            setClass = UseOptionE() ? RegexCharClass.NotECMADigitClass : RegexCharClass.NotDigitClass;
            break;

        case 'p':
        case 'P':
        {
            MoveRight();

            // 'P' (anything other than 'p') requests the inverted category.
            bool invert = ch != 'p';
            RegexCharClass categoryClass = new RegexCharClass();
            categoryClass.AddCategoryFromName(ParseProperty(), invert, UseOptionI(), _pattern);
            if (UseOptionI())
            {
                categoryClass.AddLowercase(_culture);
            }

            setClass = categoryClass.ToStringClass();
            break;
        }

        default:
            return ScanBasicBackslash();
    }

    return new RegexNode(RegexNode.Set, _options, setClass);
}
/// <summary>
/// Tests <paramref name="ch"/> against a class given in the older set/category string form:
/// the two strings are converted with <c>RegexCharClass.ConvertOldStringsToClass</c> and the
/// result is checked with <c>RegexCharClass.CharInClass</c>.
/// </summary>
/// <param name="ch">Character to test.</param>
/// <param name="set">Set string passed to the converter.</param>
/// <param name="category">Category string passed to the converter.</param>
/// <returns>The result of <c>RegexCharClass.CharInClass</c> on the converted class.</returns>
protected static bool CharInSet(char ch, string set, string category)
{
    return RegexCharClass.CharInClass(ch, RegexCharClass.ConvertOldStringsToClass(set, category));
}
/// <summary>
/// Basic optimization. Single-letter alternations can be replaced
/// by faster set specifications, and nested alternations with no
/// intervening operators can be flattened:
///
/// a|b|c|def|g|h -> [a-c]|def|[gh]
/// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
/// </summary>
/// <remarks>
/// The child list is compacted in place: <c>read</c> walks every child while <c>write</c> trails
/// behind at the slot of the last child kept. Children spliced in from a nested alternation are
/// visited by later iterations because they are inserted right after <c>read</c>.
/// </remarks>
/// <returns>
/// A new <c>Nothing</c> node when there are no children; otherwise the result of
/// <c>StripEnation(Nothing)</c>.
/// </returns>
internal RegexNode ReduceAlternation()
{
    if (_children == null)
    {
        return new RegexNode(Nothing, _options);
    }

    // State describing the most recently kept Set/One child.
    bool prevWasSetOrOne = false;
    bool prevBlocksMerge = false;
    RegexOptions prevOptions = 0;

    int read;
    int write;

    for (read = 0, write = 0; read < _children.Count; read++, write++)
    {
        RegexNode at = _children[read];

        if (write < read)
        {
            _children[write] = at;
        }

        if (at._type == Alternate)
        {
            // Flatten: re-parent the nested branches and splice them in after this node.
            // The nested node itself is dropped by stepping the write index back.
            foreach (RegexNode nested in at._children)
            {
                nested._next = this;
            }

            _children.InsertRange(read + 1, at._children);
            write--;
            continue;
        }

        if (at._type == Nothing)
        {
            write--;
            continue;
        }

        if (at._type != Set && at._type != One)
        {
            prevWasSetOrOne = false;
            prevBlocksMerge = false;
            continue;
        }

        // Cannot merge sets if L or I options differ, or if either are negated.
        RegexOptions atOptions = at._options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
        bool atBlocksMerge = at._type == Set && !RegexCharClass.IsMergeable(at._str);

        bool canMerge = prevWasSetOrOne
            && prevOptions == atOptions
            && !prevBlocksMerge
            && !atBlocksMerge;

        if (!canMerge)
        {
            // This node is kept as is and becomes the candidate for the next merge.
            prevWasSetOrOne = true;
            prevBlocksMerge = atBlocksMerge;
            prevOptions = atOptions;
            continue;
        }

        // The last node was a Set or a One, we're a Set or One and our options are the same.
        // Merge the two nodes.
        write--;
        RegexNode prev = _children[write];

        RegexCharClass merged;
        if (prev._type == One)
        {
            merged = new RegexCharClass();
            merged.AddChar(prev._ch);
        }
        else
        {
            merged = RegexCharClass.Parse(prev._str);
        }

        if (at._type == One)
        {
            merged.AddChar(at._ch);
        }
        else
        {
            merged.AddCharClass(RegexCharClass.Parse(at._str));
        }

        prev._type = Set;
        prev._str = merged.ToStringClass();
    }

    // Drop the tail slots left over after compaction.
    if (write < read)
    {
        _children.RemoveRange(write, read - write);
    }

    return StripEnation(Nothing);
}
/// <summary>
/// Main dispatch loop: positions the code pointer with <c>Goto(0)</c>, then on every iteration
/// calls <c>CheckTimeout()</c> and switches on <c>Operator()</c>. The method returns when the
/// operator is <c>RegexCode.Stop</c>.
/// </summary>
/// <remarks>
/// Control-flow convention inside the switch:
/// a handler that succeeds moves the code position (<c>Advance(...)</c> or <c>Goto(...)</c>) and
/// executes <c>continue</c>, so the next operator is dispatched;
/// a handler that fails executes <c>break</c> (or <c>goto BreakBackward</c> from inside a nested
/// loop), which reaches the <c>Backtrack()</c> call at the bottom of the loop body.
/// Case labels combined with <c>RegexCode.Back</c> or <c>RegexCode.Back2</c> handle an operator
/// that is being revisited (presumably after <c>Backtrack()</c> - confirm against that method);
/// they pop the values the forward handler saved with <c>TrackPush</c>/<c>TrackPush2</c>, as the
/// inline "TrackPush:" / "StackPush:" layout comments describe.
/// An operator with no matching case throws <c>NotImplementedException</c>.
/// NOTE(review): in this copy the body is collapsed onto a few physical lines, so every
/// <c>//</c> comment runs to the end of its physical line and hides the code that follows it;
/// the original line breaks must be restored before this text can compile.
/// </remarks>
protected override void Go() { Goto(0); for (; ;) { #if DEBUG if (runmatch.Debug) { DumpState(); } #endif CheckTimeout(); switch (Operator()) { case RegexCode.Stop: return; case RegexCode.Nothing: break; case RegexCode.Goto: Goto(Operand(0)); continue; case RegexCode.Testref: if (!IsMatched(Operand(0))) { break; } Advance(1); continue; case RegexCode.Lazybranch: TrackPush(Textpos()); Advance(1); continue; case RegexCode.Lazybranch | RegexCode.Back: TrackPop(); Textto(TrackPeek()); Goto(Operand(0)); continue; case RegexCode.Setmark: StackPush(Textpos()); TrackPush(); Advance(); continue; case RegexCode.Nullmark: StackPush(-1); TrackPush(); Advance(); continue; case RegexCode.Setmark | RegexCode.Back: case RegexCode.Nullmark | RegexCode.Back: StackPop(); break; case RegexCode.Getmark: StackPop(); TrackPush(StackPeek()); Textto(StackPeek()); Advance(); continue; case RegexCode.Getmark | RegexCode.Back: TrackPop(); StackPush(TrackPeek()); break; case RegexCode.Capturemark: if (Operand(1) != -1 && !IsMatched(Operand(1))) { break; } StackPop(); if (Operand(1) != -1) { TransferCapture(Operand(0), Operand(1), StackPeek(), Textpos()); } else { Capture(Operand(0), StackPeek(), Textpos()); } TrackPush(StackPeek()); Advance(2); continue; case RegexCode.Capturemark | RegexCode.Back: TrackPop(); StackPush(TrackPeek()); Uncapture(); if (Operand(0) != -1 && Operand(1) != -1) { Uncapture(); } break; case RegexCode.Branchmark: { int matched; StackPop(); matched = Textpos() - StackPeek(); if (matched != 0) { // Nonempty match -> loop now TrackPush(StackPeek(), Textpos()); // Save old mark, textpos StackPush(Textpos()); // Make new mark Goto(Operand(0)); // Loop } else { // Empty match -> straight now TrackPush2(StackPeek()); // Save old mark Advance(1); // Straight } continue; } case RegexCode.Branchmark | RegexCode.Back: TrackPop(2); StackPop(); Textto(TrackPeek(1)); // Recall position TrackPush2(TrackPeek()); // Save old mark Advance(1); // Straight continue; case
RegexCode.Branchmark | RegexCode.Back2: TrackPop(); StackPush(TrackPeek()); // Recall old mark break; // Backtrack case RegexCode.Lazybranchmark: { // We hit this the first time through a lazy loop and after each // successful match of the inner expression. It simply continues // on and doesn't loop. StackPop(); int oldMarkPos = StackPeek(); if (Textpos() != oldMarkPos) { // Nonempty match -> try to loop again by going to 'back' state if (oldMarkPos != -1) { TrackPush(oldMarkPos, Textpos()); // Save old mark, textpos } else { TrackPush(Textpos(), Textpos()); } } else { // The inner expression found an empty match, so we'll go directly to 'back2' if we // backtrack. In this case, we need to push something on the stack, since back2 pops. // However, in the case of ()+? or similar, this empty match may be legitimate, so push the text // position associated with that empty match. StackPush(oldMarkPos); TrackPush2(StackPeek()); // Save old mark } Advance(1); continue; } case RegexCode.Lazybranchmark | RegexCode.Back: { // After the first time, Lazybranchmark | RegexCode.Back occurs // with each iteration of the loop, and therefore with every attempted // match of the inner expression. We'll try to match the inner expression, // then go back to Lazybranchmark if successful. If the inner expression // fails, we go to Lazybranchmark | RegexCode.Back2 int pos; TrackPop(2); pos = TrackPeek(1); TrackPush2(TrackPeek()); // Save old mark StackPush(pos); // Make new mark Textto(pos); // Recall position Goto(Operand(0)); // Loop continue; } case RegexCode.Lazybranchmark | RegexCode.Back2: // The lazy loop has failed. We'll do a true backtrack and // start over before the lazy loop.
StackPop(); TrackPop(); StackPush(TrackPeek()); // Recall old mark break; case RegexCode.Setcount: StackPush(Textpos(), Operand(0)); TrackPush(); Advance(1); continue; case RegexCode.Nullcount: StackPush(-1, Operand(0)); TrackPush(); Advance(1); continue; case RegexCode.Setcount | RegexCode.Back: StackPop(2); break; case RegexCode.Nullcount | RegexCode.Back: StackPop(2); break; case RegexCode.Branchcount: // StackPush: // 0: Mark // 1: Count { StackPop(2); int mark = StackPeek(); int count = StackPeek(1); int matched = Textpos() - mark; if (count >= Operand(1) || (matched == 0 && count >= 0)) { // Max loops or empty match -> straight now TrackPush2(mark, count); // Save old mark, count Advance(2); // Straight } else { // Nonempty match -> count+loop now TrackPush(mark); // remember mark StackPush(Textpos(), count + 1); // Make new mark, incr count Goto(Operand(0)); // Loop } continue; } case RegexCode.Branchcount | RegexCode.Back: // TrackPush: // 0: Previous mark // StackPush: // 0: Mark (= current pos, discarded) // 1: Count TrackPop(); StackPop(2); if (StackPeek(1) > 0) { // Positive -> can go straight Textto(StackPeek()); // Zap to mark TrackPush2(TrackPeek(), StackPeek(1) - 1); // Save old mark, old count Advance(2); // Straight continue; } StackPush(TrackPeek(), StackPeek(1) - 1); // recall old mark, old count break; case RegexCode.Branchcount | RegexCode.Back2: // TrackPush: // 0: Previous mark // 1: Previous count TrackPop(2); StackPush(TrackPeek(), TrackPeek(1)); // Recall old mark, old count break; // Backtrack case RegexCode.Lazybranchcount: // StackPush: // 0: Mark // 1: Count { StackPop(2); int mark = StackPeek(); int count = StackPeek(1); if (count < 0) { // Negative count -> loop now TrackPush2(mark); // Save old mark StackPush(Textpos(), count + 1); // Make new mark, incr count Goto(Operand(0)); // Loop } else { // Nonneg count -> straight now TrackPush(mark, count, Textpos()); // Save mark, count, position Advance(2); // Straight } continue; } case
RegexCode.Lazybranchcount | RegexCode.Back: // TrackPush: // 0: Mark // 1: Count // 2: Textpos { TrackPop(3); int mark = TrackPeek(); int textpos = TrackPeek(2); if (TrackPeek(1) < Operand(1) && textpos != mark) { // Under limit and not empty match -> loop Textto(textpos); // Recall position StackPush(textpos, TrackPeek(1) + 1); // Make new mark, incr count TrackPush2(mark); // Save old mark Goto(Operand(0)); // Loop continue; } else { // Max loops or empty match -> backtrack StackPush(TrackPeek(), TrackPeek(1)); // Recall old mark, count break; // backtrack } } case RegexCode.Lazybranchcount | RegexCode.Back2: // TrackPush: // 0: Previous mark // StackPush: // 0: Mark (== current pos, discarded) // 1: Count TrackPop(); StackPop(2); StackPush(TrackPeek(), StackPeek(1) - 1); // Recall old mark, count break; // Backtrack case RegexCode.Setjump: StackPush(Trackpos(), Crawlpos()); TrackPush(); Advance(); continue; case RegexCode.Setjump | RegexCode.Back: StackPop(2); break; case RegexCode.Backjump: // StackPush: // 0: Saved trackpos // 1: Crawlpos StackPop(2); Trackto(StackPeek()); while (Crawlpos() != StackPeek(1)) { Uncapture(); } break; case RegexCode.Forejump: // StackPush: // 0: Saved trackpos // 1: Crawlpos StackPop(2); Trackto(StackPeek()); TrackPush(StackPeek(1)); Advance(); continue; case RegexCode.Forejump | RegexCode.Back: // TrackPush: // 0: Crawlpos TrackPop(); while (Crawlpos() != TrackPeek()) { Uncapture(); } break; case RegexCode.Bol: if (Leftchars() > 0 && CharAt(Textpos() - 1) != '\n') { break; } Advance(); continue; case RegexCode.Eol: if (Rightchars() > 0 && CharAt(Textpos()) != '\n') { break; } Advance(); continue; case RegexCode.Boundary: if (!IsBoundary(Textpos(), runtextbeg, runtextend)) { break; } Advance(); continue; case RegexCode.Nonboundary: if (IsBoundary(Textpos(), runtextbeg, runtextend)) { break; } Advance(); continue; case RegexCode.ECMABoundary: if (!IsECMABoundary(Textpos(), runtextbeg, runtextend)) { break; } Advance(); continue;
case RegexCode.NonECMABoundary: if (IsECMABoundary(Textpos(), runtextbeg, runtextend)) { break; } Advance(); continue; case RegexCode.Beginning: if (Leftchars() > 0) { break; } Advance(); continue; case RegexCode.Start: if (Textpos() != Textstart()) { break; } Advance(); continue; case RegexCode.EndZ: if (Rightchars() > 1 || Rightchars() == 1 && CharAt(Textpos()) != '\n') { break; } Advance(); continue; case RegexCode.End: if (Rightchars() > 0) { break; } Advance(); continue; case RegexCode.One: if (Forwardchars() < 1 || Forwardcharnext() != (char)Operand(0)) { break; } Advance(1); continue; case RegexCode.Notone: if (Forwardchars() < 1 || Forwardcharnext() == (char)Operand(0)) { break; } Advance(1); continue; case RegexCode.Set: if (Forwardchars() < 1 || !RegexCharClass.CharInClass(Forwardcharnext(), _code._strings[Operand(0)])) { break; } Advance(1); continue; case RegexCode.Multi: { if (!Stringmatch(_code._strings[Operand(0)])) { break; } Advance(1); continue; } case RegexCode.Ref: { int capnum = Operand(0); if (IsMatched(capnum)) { if (!Refmatch(MatchIndex(capnum), MatchLength(capnum))) { break; } } else { if ((runregex.roptions & RegexOptions.ECMAScript) == 0) { break; } } Advance(1); continue; } case RegexCode.Onerep: { int c = Operand(1); if (Forwardchars() < c) { break; } char ch = (char)Operand(0); while (c-- > 0) { if (Forwardcharnext() != ch) { goto BreakBackward; } } Advance(2); continue; } case RegexCode.Notonerep: { int c = Operand(1); if (Forwardchars() < c) { break; } char ch = (char)Operand(0); while (c-- > 0) { if (Forwardcharnext() == ch) { goto BreakBackward; } } Advance(2); continue; } case RegexCode.Setrep: { int c = Operand(1); if (Forwardchars() < c) { break; } string set = _code._strings[Operand(0)]; while (c-- > 0) { if (!RegexCharClass.CharInClass(Forwardcharnext(), set)) { goto BreakBackward; } } Advance(2); continue; } case RegexCode.Oneloop: { int c = Operand(1); if (c > Forwardchars()) { c = Forwardchars(); } char ch =
(char)Operand(0); int i; for (i = c; i > 0; i--) { if (Forwardcharnext() != ch) { Backwardnext(); break; } } if (c > i) { TrackPush(c - i - 1, Textpos() - Bump()); } Advance(2); continue; } case RegexCode.Notoneloop: { int c = Operand(1); if (c > Forwardchars()) { c = Forwardchars(); } char ch = (char)Operand(0); int i; for (i = c; i > 0; i--) { if (Forwardcharnext() == ch) { Backwardnext(); break; } } if (c > i) { TrackPush(c - i - 1, Textpos() - Bump()); } Advance(2); continue; } case RegexCode.Setloop: { int c = Operand(1); if (c > Forwardchars()) { c = Forwardchars(); } string set = _code._strings[Operand(0)]; int i; for (i = c; i > 0; i--) { if (!RegexCharClass.CharInClass(Forwardcharnext(), set)) { Backwardnext(); break; } } if (c > i) { TrackPush(c - i - 1, Textpos() - Bump()); } Advance(2); continue; } case RegexCode.Oneloop | RegexCode.Back: case RegexCode.Notoneloop | RegexCode.Back: { TrackPop(2); int i = TrackPeek(); int pos = TrackPeek(1); Textto(pos); if (i > 0) { TrackPush(i - 1, pos - Bump()); } Advance(2); continue; } case RegexCode.Setloop | RegexCode.Back: { TrackPop(2); int i = TrackPeek(); int pos = TrackPeek(1); Textto(pos); if (i > 0) { TrackPush(i - 1, pos - Bump()); } Advance(2); continue; } case RegexCode.Onelazy: case RegexCode.Notonelazy: { int c = Operand(1); if (c > Forwardchars()) { c = Forwardchars(); } if (c > 0) { TrackPush(c - 1, Textpos()); } Advance(2); continue; } case RegexCode.Setlazy: { int c = Operand(1); if (c > Forwardchars()) { c = Forwardchars(); } if (c > 0) { TrackPush(c - 1, Textpos()); } Advance(2); continue; } case RegexCode.Onelazy | RegexCode.Back: { TrackPop(2); int pos = TrackPeek(1); Textto(pos); if (Forwardcharnext() != (char)Operand(0)) { break; } int i = TrackPeek(); if (i > 0) { TrackPush(i - 1, pos + Bump()); } Advance(2); continue; } case RegexCode.Notonelazy | RegexCode.Back: { TrackPop(2); int pos = TrackPeek(1); Textto(pos); if (Forwardcharnext() == (char)Operand(0)) { break; } int i = TrackPeek(); if
(i > 0) { TrackPush(i - 1, pos + Bump()); } Advance(2); continue; } case RegexCode.Setlazy | RegexCode.Back: { TrackPop(2); int pos = TrackPeek(1); Textto(pos); if (!RegexCharClass.CharInClass(Forwardcharnext(), _code._strings[Operand(0)])) { break; } int i = TrackPeek(); if (i > 0) { TrackPush(i - 1, pos + Bump()); } Advance(2); continue; } case RegexCode.ResetMatchStart: TrackPush(MatchStart()); // Enable backtracking, saving the current match start SetMatchStart(Textpos()); // Set the match start to the current position in text Advance(); continue; case RegexCode.ResetMatchStart | RegexCode.Back: TrackPop(); SetMatchStart(TrackPeek()); // Restore the previously saved value as the match start break; // Continue backtracking default: throw new NotImplementedException(SR.UnimplementedState); } BreakBackward: ; // "break Backward" comes here: Backtrack(); } }
/// <summary>
/// Creates an instance whose character class is obtained by parsing
/// <paramref name="charClass"/> with <c>RegexCharClass.Parse</c>.
/// </summary>
/// <param name="charClass">Character-class string to parse.</param>
/// <param name="nullable">Value stored in <c>_nullable</c>.</param>
/// <param name="caseInsensitive">Value stored in <c>_caseInsensitive</c>.</param>
internal RegexFC(string charClass, bool nullable, bool caseInsensitive)
{
    _caseInsensitive = caseInsensitive;
    _nullable = nullable;
    _cc = RegexCharClass.Parse(charClass);
}
/// <summary>
/// Emits the <c>FindFirstChar()</c> override of the generated matcher and, when a Boyer-Moore
/// prefix is used without a position anchor, the prefix-scan code that override relies on.
/// </summary>
/// <remarks>
/// The body that is emitted depends on what is known about the pattern, tested in this order:
/// a Beginning/Start/EndZ/End anchor -> <c>GenerateAnchorChecks</c>;
/// a Boyer-Moore prefix -> <c>GenerateBoyerMoorePrefixScanCheck</c>;
/// no first-character prefix -> <c>return true;</c>;
/// otherwise a loop that walks forward until a character matches the first-character set
/// (with a direct comparison when the set is a singleton), steps back one character and
/// returns true, or returns false when the loop runs out.
/// </remarks>
private void GenerateFindFirstChar()
{
    var boyerMooreCulture = BoyerMoorePrefix != null
        ? Writer.DeclareField($@"private static readonly CultureInfo BoyerMooreCulture = CultureInfo.GetCultureInfo(""{BoyerMoorePrefix._culture.ToString()}"");")
        : null;

    if (!(Anchors.Beginning || Anchors.Start || Anchors.EndZ || Anchors.End) && BoyerMoorePrefix != null)
    {
        GenerateBoyerMoorePrefixScan(boyerMooreCulture);
    }

    using (Writer.Method("protected override bool FindFirstChar()"))
    {
#if DEBUG_OUTPUT
        Writer.Write($@"Debug.WriteLine("""")");
        Writer.Write($@"Debug.WriteLine($""Search range: from {{{runtextbeg}.ToString(CultureInfo.InvariantCulture)}} to {{{runtextend}.ToString(CultureInfo.InvariantCulture)}}"")");
        Writer.Write($@"Debug.WriteLine($""Firstchar search starting at {{{runtextpos}.ToString(CultureInfo.InvariantCulture)}} stopping at {{{(IsRightToLeft ? runtextbeg : runtextend)}.ToString(CultureInfo.InvariantCulture)}}"")");
#endif

        if (Anchors.Beginning || Anchors.Start || Anchors.EndZ || Anchors.End)
        {
            GenerateAnchorChecks(boyerMooreCulture);
        }
        else if (BoyerMoorePrefix != null)
        {
            GenerateBoyerMoorePrefixScanCheck();
        }
        else if (FirstCharacterPrefix == null)
        {
            Writer.Write($"return true;");
        }
        else
        {
            var culture = DeclareCulture();
            var set = FirstCharacterPrefix.GetValueOrDefault().Prefix;

            if (RegexCharClass.IsSingleton(set))
            {
                var ch = RegexCharClass.SingletonChar(set);
                var i = Local.Parse("i");

                // NOTE(review): '{ch}' is pasted into a char literal unescaped; a singleton of
                // ' or \ would presumably produce an invalid literal - confirm and escape if so.
                using (Writer.For($"int {i} = {Forwardchars()}; {i} > 0; {i}--"))
                {
                    using (Writer.If($"'{ch}' == {Forwardcharnext(culture)}"))
                    {
                        Backwardnext();
                        Writer.Write($"return true;");
                    }
                }
            }
            else
            {
                var i = Local.Parse("i");

                // The condition and the decrement must go through {i} as well, so the emitted
                // loop always refers to the same local it declares (previously a literal "i").
                using (Writer.For($"int {i} = {Forwardchars()}; {i} > 0; {i}--"))
                {
                    using (Writer.If($@"{CharInClass(Forwardcharnext(culture), set)}"))
                    {
                        Backwardnext();
                        Writer.Write($"return true;");
                    }
                }
            }

            Writer.Write($"return false;");
        }
    }
}
/// <summary>
/// Creates an instance whose character class is built from a single character: either the
/// character itself or, when <paramref name="not"/> is true, every character except it.
/// </summary>
/// <param name="ch">The character the class is built around.</param>
/// <param name="not">When true, the ranges below and above <paramref name="ch"/> are added instead of <paramref name="ch"/>.</param>
/// <param name="nullable">Value stored in <c>_nullable</c>.</param>
/// <param name="caseInsensitive">Value stored in <c>_caseInsensitive</c>.</param>
internal RegexFC(char ch, bool not, bool nullable, bool caseInsensitive)
{
    _cc = new RegexCharClass();

    if (!not)
    {
        _cc.AddRange(ch, ch);
    }
    else
    {
        // Everything below ch; skipped when ch is already the lowest character.
        if (ch != '\0')
        {
            _cc.AddRange('\0', (char)(ch - 1));
        }

        // Everything above ch; skipped when ch is already the highest character.
        if (ch != '\uFFFF')
        {
            _cc.AddRange((char)(ch + 1), '\uFFFF');
        }
    }

    _nullable = nullable;
    _caseInsensitive = caseInsensitive;
}
/*
 * Scans \-style backreferences and character escapes.
 *
 * Backreferences are recognised only so that they can be rejected:
 *   - \k not followed by < or ' (or with nothing after the delimiter) throws MalformedNameRef;
 *   - a delimited number that is properly closed, a delimited name, and a bare \1..\9
 *     all throw BackRefCaptureGroupNotSupported.
 * Any other input is rewound to the position after the '\' and scanned as a character
 * escape; the character is lower-cased with the parser's culture when UseOptionI() is true.
 *
 * Returns a RegexNode of type One holding the escaped character.
 */
private RegexNode ScanBasicBackslash()
{
    if (CharsRight() == 0)
    {
        throw MakeException(Strings.IllegalEndEscape);
    }

    char ch;
    var angled = false;
    var close = '\0';
    var backpos = Textpos();

    ch = RightChar();

    // allow \k<foo> instead of \<foo>, which is now deprecated
    if (ch == 'k')
    {
        if (CharsRight() >= 2)
        {
            MoveRight();
            ch = MoveRightGetChar();

            if (ch == '<' || ch == '\'')
            {
                angled = true;
                close = (ch == '\'') ? '\'' : '>';
            }
        }

        if (!angled || CharsRight() <= 0)
        {
            throw MakeException(Strings.MalformedNameRef);
        }

        ch = RightChar();
    }
    // Note angle without \g
    else if ((ch == '<' || ch == '\'') && CharsRight() > 1)
    {
        angled = true;
        close = (ch == '\'') ? '\'' : '>';

        MoveRight();
        ch = RightChar();
    }

    // Try to parse backreference: \<1> or \<cap>
    if (angled && ch >= '0' && ch <= '9')
    {
        // The number itself is irrelevant; only a properly closed reference is rejected.
        // An unclosed one falls through and is rescanned as a character escape.
        _ = ScanDecimal();

        if (CharsRight() > 0 && MoveRightGetChar() == close)
        {
            throw MakeException(Strings.BackRefCaptureGroupNotSupported);
        }
    }
    // Try to parse backreference or octal: \1
    else if (!angled && ch >= '1' && ch <= '9')
    {
        // Both branches of the former UseOptionE() test threw this same exception,
        // so the test was dead and has been removed.
        throw MakeException(Strings.BackRefCaptureGroupNotSupported);
    }
    else if (angled && RegexCharClass.IsWordChar(ch))
    {
        throw MakeException(Strings.BackRefCaptureGroupNotSupported);
    }

    // Not backreference: must be char code
    Textto(backpos);
    ch = ScanCharEscape();

    if (UseOptionI())
    {
        ch = _culture.TextInfo.ToLower(ch);
    }

    return new RegexNode(RegexNode.One, _options, ch);
}
/// <summary>
/// Basic optimization. Single-letter alternations can be replaced
/// by faster set specifications, and nested alternations with no
/// intervening operators can be flattened:
///
/// a|b|c|def|g|h -> [a-c]|def|[gh]
/// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
/// </summary>
/// <remarks>
/// Works in place on <c>_children</c>: <c>src</c> visits each child, <c>dst</c> is the slot the
/// child is compacted into. Nested alternation branches are inserted directly after
/// <c>src</c>, so later iterations process them like ordinary children.
/// </remarks>
/// <returns>
/// A new <c>Nothing</c> node when <c>_children</c> is null; otherwise whatever
/// <c>StripEnation(Nothing)</c> returns.
/// </returns>
internal RegexNode ReduceAlternation()
{
    if (_children == null)
    {
        return new RegexNode(Nothing, _options);
    }

    // Description of the last Set/One child that was kept (the merge target, if any).
    bool haveMergeTarget = false;
    bool targetRefusesMerge = false;
    RegexOptions targetOptions = 0;

    int src = 0;
    int dst = 0;

    while (src < _children.Count)
    {
        RegexNode at = _children[src];

        if (dst < src)
        {
            _children[dst] = at;
        }

        bool isSet = at._type == Set;
        bool isOne = at._type == One;

        if (at._type == Alternate)
        {
            // Hoist the nested branches into this node; the nested node's slot is reused.
            for (int k = 0; k < at._children.Count; k++)
            {
                at._children[k]._next = this;
            }

            _children.InsertRange(src + 1, at._children);
            dst--;
        }
        else if (isSet || isOne)
        {
            // Cannot merge sets if L or I options differ, or if either are negated.
            RegexOptions atOptions = at._options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
            bool atRefusesMerge = isSet && !RegexCharClass.IsMergeable(at._str);

            if (haveMergeTarget && targetOptions == atOptions && !targetRefusesMerge && !atRefusesMerge)
            {
                // The last node was a Set or a One, we're a Set or One and our options are the same.
                // Merge the two nodes.
                dst--;
                RegexNode target = _children[dst];

                RegexCharClass combined;
                if (target._type == One)
                {
                    combined = new RegexCharClass();
                    combined.AddChar(target._ch);
                }
                else
                {
                    combined = RegexCharClass.Parse(target._str);
                }

                if (isOne)
                {
                    combined.AddChar(at._ch);
                }
                else
                {
                    RegexCharClass atClass = RegexCharClass.Parse(at._str);
                    combined.AddCharClass(atClass);
                }

                target._type = Set;
                target._str = combined.ToStringClass();
            }
            else
            {
                // Keep this node unchanged; it becomes the merge target for what follows.
                haveMergeTarget = true;
                targetRefusesMerge = atRefusesMerge;
                targetOptions = atOptions;
            }
        }
        else if (at._type == Nothing)
        {
            dst--;
        }
        else
        {
            haveMergeTarget = false;
            targetRefusesMerge = false;
        }

        src++;
        dst++;
    }

    // Remove the slots freed by flattening, merging and dropped Nothing nodes.
    if (dst < src)
    {
        _children.RemoveRange(dst, src - dst);
    }

    return StripEnation(Nothing);
}
/// <summary>
/// Builds a textual description of this node: the type name from <c>TypeStr</c>, one suffix per
/// option flag that is set, then the node's arguments and, for loop-like types, its Min/Max.
/// </summary>
/// <returns>The description string.</returns>
internal string Description()
{
    StringBuilder text = new StringBuilder();
    text.Append(TypeStr[_type]);

    // Option suffixes, appended in this fixed order.
    RegexOptions[] flags =
    {
        RegexOptions.ExplicitCapture,
        RegexOptions.IgnoreCase,
        RegexOptions.RightToLeft,
        RegexOptions.Multiline,
        RegexOptions.Singleline,
        RegexOptions.IgnorePatternWhitespace,
        RegexOptions.ECMAScript,
    };
    string[] suffixes = { "-C", "-I", "-L", "-M", "-S", "-X", "-E" };

    for (int f = 0; f < flags.Length; f++)
    {
        if ((_options & flags[f]) != 0)
        {
            text.Append(suffixes[f]);
        }
    }

    // Type-specific argument.
    switch (_type)
    {
        case Oneloop:
        case Notoneloop:
        case Onelazy:
        case Notonelazy:
        case One:
        case Notone:
            text.Append("(Ch = ").Append(RegexCharClass.CharDescription(_ch)).Append(")");
            break;

        case Capture:
            text.Append("(index = ")
                .Append(_m.ToString(CultureInfo.InvariantCulture))
                .Append(", unindex = ")
                .Append(_n.ToString(CultureInfo.InvariantCulture))
                .Append(")");
            break;

        case Ref:
        case Testref:
            text.Append("(index = ").Append(_m.ToString(CultureInfo.InvariantCulture)).Append(")");
            break;

        case Multi:
            text.Append("(String = ").Append(_str).Append(")");
            break;

        case Set:
        case Setloop:
        case Setlazy:
            text.Append("(Set = ").Append(RegexCharClass.SetDescription(_str)).Append(")");
            break;
    }

    // Repetition bounds; an upper bound of int.MaxValue is shown as "inf".
    switch (_type)
    {
        case Oneloop:
        case Notoneloop:
        case Onelazy:
        case Notonelazy:
        case Setloop:
        case Setlazy:
        case Loop:
        case Lazyloop:
            string max = _n == int.MaxValue
                ? "inf"
                : Convert.ToString(_n, CultureInfo.InvariantCulture);
            text.Append("(Min = ")
                .Append(_m.ToString(CultureInfo.InvariantCulture))
                .Append(", Max = ")
                .Append(max)
                .Append(")");
            break;
    }

    return text.ToString();
}
/// <summary>
/// Moves <c>runtextpos</c> to the next position at which a match attempt is worthwhile.
/// </summary>
/// <remarks>
/// Three strategies are tried in order. If the code has a Beginning/Start/EndZ/End anchor, the
/// position is validated or adjusted against it. Otherwise, if there is a Boyer-Moore prefix,
/// its <c>Scan</c> result is used. Otherwise, if there is a first-character prefix, the text is
/// walked until a character in that prefix set is found.
/// </remarks>
/// <returns>
/// True when a candidate position was found (or no filtering information is available);
/// false when no candidate exists, in which case <c>runtextpos</c> has been moved to the end of
/// the search range in the anchor and Boyer-Moore paths.
/// </returns>
protected override bool FindFirstChar()
{
    var anchors = _code._anchors;

    if ((anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ | RegexFCD.End)) != 0)
    {
        bool hasBeginning = (anchors & RegexFCD.Beginning) != 0;
        bool hasStart = (anchors & RegexFCD.Start) != 0;
        bool hasEndZ = (anchors & RegexFCD.EndZ) != 0;
        bool hasEnd = (anchors & RegexFCD.End) != 0;

        if (_code._rightToLeft)
        {
            bool cannotMatch =
                (hasEnd && runtextpos < runtextend) ||
                (hasEndZ && (runtextpos < runtextend - 1 ||
                             (runtextpos == runtextend - 1 && CharAt(runtextpos) != '\n'))) ||
                (hasStart && runtextpos < runtextstart);

            if (cannotMatch)
            {
                runtextpos = runtextbeg;
                return false;
            }

            if (hasBeginning && runtextpos > runtextbeg)
            {
                runtextpos = runtextbeg;
            }
        }
        else
        {
            bool cannotMatch =
                (hasBeginning && runtextpos > runtextbeg) ||
                (hasStart && runtextpos > runtextstart);

            if (cannotMatch)
            {
                runtextpos = runtextend;
                return false;
            }

            // Jump straight to the only place an end anchor can be satisfied.
            if (hasEndZ && runtextpos < runtextend - 1)
            {
                runtextpos = runtextend - 1;
            }
            else if (hasEnd && runtextpos < runtextend)
            {
                runtextpos = runtextend;
            }
        }

        // Found a valid start or end anchor; a Boyer-Moore prefix, if any, must also match here.
        return _code._bmPrefix == null
            || _code._bmPrefix.IsMatch(runtext, runtextpos, runtextbeg, runtextend);
    }

    if (_code._bmPrefix != null)
    {
        runtextpos = _code._bmPrefix.Scan(runtext, runtextpos, runtextbeg, runtextend);
        if (runtextpos != -1)
        {
            return true;
        }

        runtextpos = _code._rightToLeft ? runtextbeg : runtextend;
        return false;
    }

    if (_code._fcPrefix == null)
    {
        return true;
    }

    _rightToLeft = _code._rightToLeft;
    _caseInsensitive = _code._fcPrefix.CaseInsensitive;
    string set = _code._fcPrefix.Prefix;

    // A one-character set is compared directly instead of going through CharInClass.
    bool isSingleton = RegexCharClass.IsSingleton(set);
    char single = isSingleton ? RegexCharClass.SingletonChar(set) : '\0';

    for (int remaining = Forwardchars(); remaining > 0; remaining--)
    {
        char next = Forwardcharnext();
        bool hit = isSingleton ? single == next : RegexCharClass.CharInClass(next, set);

        if (hit)
        {
            // Step back so the matching character is the next one to be read.
            Backwardnext();
            return true;
        }
    }

    return false;
}
/*
 * Returns true when exactly one of the characters on either side of index
 * (within [startpos, endpos)) is an ECMA word character.
 */
protected bool IsECMABoundary(int index, int startpos, int endpos)
{
    bool wordBefore = index > startpos && RegexCharClass.IsECMAWordChar(runtext[index - 1]);
    bool wordAfter = index < endpos && RegexCharClass.IsECMAWordChar(runtext[index]);

    return wordBefore != wordAfter;
}
/*
 * Scans the escape that follows a '\' (not counting the '\' itself) and
 * returns the RegexNode for the atom it denotes. Anchors and shorthand
 * classes are handled here; everything else is delegated to
 * ScanBasicBackslash().
 */
internal RegexNode ScanBackslash()
{
    if (CharsRight() == 0)
    {
        throw MakeException(SR.IllegalEndEscape);
    }

    char escape = RightChar();
    string setClass;

    switch (escape)
    {
        case 'b':
        case 'B':
        case 'A':
        case 'G':
        case 'Z':
        case 'z':
            MoveRight();
            return new RegexNode(TypeFromCode(escape), _options);

        case 'w':
            MoveRight();
            setClass = UseOptionE() ? RegexCharClass.ECMAWordClass : RegexCharClass.WordClass;
            break;

        case 'W':
            MoveRight();
            setClass = UseOptionE() ? RegexCharClass.NotECMAWordClass : RegexCharClass.NotWordClass;
            break;

        case 's':
            MoveRight();
            setClass = UseOptionE() ? RegexCharClass.ECMASpaceClass : RegexCharClass.SpaceClass;
            break;

        case 'S':
            MoveRight();
            setClass = UseOptionE() ? RegexCharClass.NotECMASpaceClass : RegexCharClass.NotSpaceClass;
            break;

        case 'd':
            MoveRight();
            setClass = UseOptionE() ? RegexCharClass.ECMADigitClass : RegexCharClass.DigitClass;
            break;

        case 'D':
            MoveRight();
            setClass = UseOptionE() ? RegexCharClass.NotECMADigitClass : RegexCharClass.NotDigitClass;
            break;

        case 'p':
        case 'P':
        {
            MoveRight();
            var category = new RegexCharClass();
            // 'P' (anything other than lowercase 'p') inverts the category.
            category.AddCategoryFromName(ParseProperty(), escape != 'p', UseOptionI(), _pattern);
            if (UseOptionI())
            {
                category.AddLowercase(_culture);
            }

            setClass = category.ToStringClass();
            break;
        }

        default:
            return ScanBasicBackslash();
    }

    return new RegexNode(RegexNode.Set, _options, setClass);
}
/*
 * Scans chars following a '(' (not counting the '('), and returns
 * a RegexNode for the type of group scanned, or null if the group
 * simply changed options (?cimsx-cimsx) or was a comment (#...).
 *
 * Group names, group numbers and name/number conditions in an alternation
 * are rejected with BackRefCaptureGroupNotSupported. Anything that cannot
 * be recognised ends at the BreakRecognize label, which throws
 * UnrecognizedGrouping.
 */
private RegexNode ScanGroupOpen()
{
    var ch = '\0';
    int nodeType;
    var close = '>';

    // just return a RegexNode if we have:
    // 1. "(" followed by nothing
    // 2. "(x" where x != ?
    // 3. "(?)"
    if (CharsRight() == 0 || RightChar() != '?' || (RightChar() == '?' && (CharsRight() > 1 && RightChar(1) == ')')))
    {
        if (UseOptionN() || _ignoreNextParen)
        {
            _ignoreNextParen = false;
            return new RegexNode(RegexNode.Group, _options);
        }

        return new RegexNode(RegexNode.Capture, _options, _autocap++, -1);
    }

    MoveRight();

    // Runs at most once; the loop exists only so that 'break' can reach BreakRecognize.
    for (; ; )
    {
        if (CharsRight() == 0)
        {
            break;
        }

        switch (MoveRightGetChar())
        {
            case ':':
                nodeType = RegexNode.Group;
                break;

            case '=':
                _options &= ~(RegexOptions.RightToLeft);
                nodeType = RegexNode.Require;
                break;

            case '!':
                _options &= ~(RegexOptions.RightToLeft);
                nodeType = RegexNode.Prevent;
                break;

            case '>':
                nodeType = RegexNode.Greedy;
                break;

            case '\'':
                close = '\'';
                goto case '<'; // fallthrough

            case '<':
                if (CharsRight() == 0)
                {
                    goto BreakRecognize;
                }

                switch (ch = MoveRightGetChar())
                {
                    case '=':
                        // "(?'=" is not a lookbehind
                        if (close == '\'')
                        {
                            goto BreakRecognize;
                        }

                        _options |= RegexOptions.RightToLeft;
                        nodeType = RegexNode.Require;
                        break;

                    case '!':
                        // "(?'!" is not a lookbehind
                        if (close == '\'')
                        {
                            goto BreakRecognize;
                        }

                        _options |= RegexOptions.RightToLeft;
                        nodeType = RegexNode.Prevent;
                        break;

                    default:
                        MoveLeft();

                        // grab part before -
                        if ((ch >= '0' && ch <= '9') || RegexCharClass.IsWordChar(ch))
                        {
                            throw MakeException(Strings.BackRefCaptureGroupNotSupported);
                        }

                        if (ch != '-')
                        {
                            // bad group name - starts with something other than a word character and isn't a number
                            throw MakeException(Strings.InvalidGroupName);
                        }

                        // grab part after - if any
                        if (CharsRight() > 0 && RightChar() == '-')
                        {
                            MoveRight();

                            // The pattern may end right after the '-'; report it as an
                            // unrecognized grouping instead of reading past the end.
                            if (CharsRight() == 0)
                            {
                                goto BreakRecognize;
                            }

                            ch = RightChar();

                            if ((ch >= '0' && ch <= '9') || RegexCharClass.IsWordChar(ch))
                            {
                                throw MakeException(Strings.BackRefCaptureGroupNotSupported);
                            }

                            // bad group name - starts with something other than a word character and isn't a number
                            throw MakeException(Strings.InvalidGroupName);
                        }

                        // No capture node is ever produced on this path.
                        goto BreakRecognize;
                }

                break;

            case '(':
                // alternation construct (?(...) | )
                var parenPos = Textpos();
                if (CharsRight() > 0)
                {
                    ch = RightChar();

                    // check if the alternation condition is a backref
                    if ((ch >= '0' && ch <= '9') || RegexCharClass.IsWordChar(ch))
                    {
                        throw MakeException(Strings.BackRefCaptureGroupNotSupported);
                    }
                }

                // not a backref
                nodeType = RegexNode.Testgroup;
                Textto(parenPos - 1);    // jump to the start of the parentheses
                _ignoreNextParen = true; // but make sure we don't try to capture the insides

                var charsRight = CharsRight();
                if (charsRight >= 3 && RightChar(1) == '?')
                {
                    var rightchar2 = RightChar(2);

                    // disallow comments in the condition
                    if (rightchar2 == '#')
                    {
                        throw MakeException(Strings.AlternationCantHaveComment);
                    }

                    // disallow named capture group (?<..>..) in the condition
                    if (rightchar2 == '\'')
                    {
                        throw MakeException(Strings.AlternationCantCapture);
                    }
                    else if (charsRight >= 4 && (rightchar2 == '<' && RightChar(3) != '!' && RightChar(3) != '='))
                    {
                        throw MakeException(Strings.AlternationCantCapture);
                    }
                }

                break;

            default:
                MoveLeft();
                nodeType = RegexNode.Group;

                // Disallow options in the children of a testgroup node
                if (_group._type != RegexNode.Testgroup)
                {
                    ScanOptions();
                }

                if (CharsRight() == 0)
                {
                    goto BreakRecognize;
                }

                if ((ch = MoveRightGetChar()) == ')')
                {
                    return null;
                }

                if (ch != ':')
                {
                    goto BreakRecognize;
                }

                break;
        }

        return new RegexNode(nodeType, _options);
    }

BreakRecognize:
    // break Recognize comes here
    throw MakeException(Strings.UnrecognizedGrouping);
}