Regular Expression extension to Automaton.

Regular expressions are built from the following abstract syntax:

regexp ::= unionexp
|
unionexp ::= interexp | unionexp (union)
| interexp
interexp ::= concatexp & interexp (intersection) [OPTIONAL]
| concatexp
concatexp ::= repeatexp concatexp (concatenation)
| repeatexp
repeatexp ::= repeatexp ? (zero or one occurrence)
| repeatexp * (zero or more occurrences)
| repeatexp + (one or more occurrences)
| repeatexp {n} (n occurrences)
| repeatexp {n,} (n or more occurrences)
| repeatexp {n,m} (n to m occurrences, including both)
| complexp
complexp ::= ~ complexp (complement) [OPTIONAL]
| charclassexp
charclassexp ::= [ charclasses ] (character class)
| [^ charclasses ] (negated character class)
| simpleexp
charclasses ::= charclass charclasses
| charclass
charclass ::= charexp - charexp (character range, including end-points)
| charexp
simpleexp ::= charexp
| . (any single character)
| # (the empty language) [OPTIONAL]
| @ (any string) [OPTIONAL]
| " <Unicode string without double-quotes>  " (a string)
| ( ) (the empty string)
| ( unionexp ) (precedence override)
| < <identifier> > (named automaton) [OPTIONAL]
| <n-m> (numerical interval) [OPTIONAL]
charexp ::= <Unicode character> (a single non-reserved character)
| \ <Unicode character>  (a single character)

The productions marked [OPTIONAL] are only allowed if enabled by the syntax flags passed to the RegExp constructor. Reserved characters used in the (enabled) syntax must be escaped with a backslash (\) or enclosed in double-quotes ("..."). (In contrast to other regexp syntaxes, this is also required inside character classes.) Be aware that the dash (-) has a special meaning in charclass expressions. An identifier is a string that contains neither a right angle bracket (>) nor a dash (-). Numerical intervals are specified by non-negative decimal integers and include both end points; if n and m have the same number of digits, then the conforming strings must have that length (i.e. they are prefixed by 0's). This API is experimental (@lucene.experimental).

Example #1
0
        /// <summary>
        /// Creates a <c>REGEXP_REPEAT_MIN</c> node that repeats <paramref name="exp"/>
        /// with the lower bound <paramref name="min"/> (the <c>{n,}</c> form of the grammar).
        /// </summary>
        /// <param name="exp">The expression being repeated.</param>
        /// <param name="min">The minimum number of occurrences.</param>
        /// <returns>The newly created node.</returns>
        internal static RegExp MakeRepeat(RegExp exp, int min)
        {
            return new RegExp
            {
                kind = Kind.REGEXP_REPEAT_MIN,
                Exp1 = exp,
                Min = min
            };
        }
Example #2
0
 /// <summary>
 /// Creates a <c>REGEXP_UNION</c> node whose operands are
 /// <paramref name="exp1"/> and <paramref name="exp2"/>.
 /// </summary>
 /// <param name="exp1">The first operand.</param>
 /// <param name="exp2">The second operand.</param>
 /// <returns>The newly created node.</returns>
 internal static RegExp MakeUnion(RegExp exp1, RegExp exp2)
 {
     RegExp node = new RegExp();

     node.kind = Kind.REGEXP_UNION;
     node.exp1 = exp1;
     node.exp2 = exp2;
     return node;
 }
Example #3
0
        /// <summary>
        /// Creates a <c>REGEXP_INTERSECTION</c> node whose operands are
        /// <paramref name="exp1"/> and <paramref name="exp2"/>.
        /// </summary>
        /// <param name="exp1">The first operand.</param>
        /// <param name="exp2">The second operand.</param>
        /// <returns>The newly created node.</returns>
        internal static RegExp MakeIntersection(RegExp exp1, RegExp exp2)
        {
            return new RegExp
            {
                kind = Kind.REGEXP_INTERSECTION,
                Exp1 = exp1,
                Exp2 = exp2
            };
        }
Example #4
0
        /// <summary>
        /// Creates a <c>REGEXP_UNION</c> node whose operands are
        /// <paramref name="exp1"/> and <paramref name="exp2"/>.
        /// </summary>
        /// <param name="exp1">The first operand.</param>
        /// <param name="exp2">The second operand.</param>
        /// <returns>The newly created node.</returns>
        internal static RegExp MakeUnion(RegExp exp1, RegExp exp2)
        {
            return new RegExp
            {
                kind = Kind.REGEXP_UNION,
                Exp1 = exp1,
                Exp2 = exp2
            };
        }
Example #5
0
 /// <summary>
 /// Creates a <c>REGEXP_REPEAT_MIN</c> node that repeats <paramref name="exp"/>
 /// with the lower bound <paramref name="min"/> (the <c>{n,}</c> form of the grammar).
 /// </summary>
 /// <param name="exp">The expression being repeated.</param>
 /// <param name="min">The minimum number of occurrences.</param>
 /// <returns>The newly created node.</returns>
 internal static RegExp MakeRepeat(RegExp exp, int min)
 {
     RegExp node = new RegExp();

     node.kind = Kind.REGEXP_REPEAT_MIN;
     node.exp1 = exp;
     node.min = min;
     return node;
 }
Example #6
0
 /// <summary>
 /// Creates a <c>REGEXP_INTERSECTION</c> node whose operands are
 /// <paramref name="exp1"/> and <paramref name="exp2"/>.
 /// </summary>
 /// <param name="exp1">The first operand.</param>
 /// <param name="exp2">The second operand.</param>
 /// <returns>The newly created node.</returns>
 internal static RegExp MakeIntersection(RegExp exp1, RegExp exp2)
 {
     RegExp node = new RegExp();

     node.kind = Kind.REGEXP_INTERSECTION;
     node.exp1 = exp1;
     node.exp2 = exp2;
     return node;
 }
Example #7
0
 /// <summary>
 /// Creates a <c>REGEXP_REPEAT_MINMAX</c> node that repeats <paramref name="exp"/>
 /// with the bounds <paramref name="min"/> and <paramref name="max"/>
 /// (the <c>{n,m}</c> form of the grammar).
 /// </summary>
 /// <param name="exp">The expression being repeated.</param>
 /// <param name="min">The minimum number of occurrences.</param>
 /// <param name="max">The maximum number of occurrences.</param>
 /// <returns>The newly created node.</returns>
 internal static RegExp MakeRepeat(RegExp exp, int min, int max)
 {
     RegExp node = new RegExp();

     node.kind = Kind.REGEXP_REPEAT_MINMAX;
     node.exp1 = exp;
     node.min = min;
     node.max = max;
     return node;
 }
Example #8
0
        /// <summary>
        /// Creates a <c>REGEXP_INTERVAL</c> node (the <c>&lt;n-m&gt;</c> form of the grammar).
        /// </summary>
        /// <param name="min">Value stored as the node's lower bound.</param>
        /// <param name="max">Value stored as the node's upper bound.</param>
        /// <param name="digits">Value stored as the node's digit count.</param>
        /// <returns>The newly created node.</returns>
        internal static RegExp MakeInterval(int min, int max, int digits)
        {
            return new RegExp
            {
                kind = Kind.REGEXP_INTERVAL,
                Min = min,
                Max = max,
                Digits = digits
            };
        }
Example #9
0
        /// <summary>
        /// Parses an <c>interexp</c> production: a concatenation expression, optionally
        /// followed by <c>&amp;</c> and another intersection expression. The <c>&amp;</c>
        /// operator is only considered when the INTERSECTION syntax flag is enabled.
        /// </summary>
        /// <returns>The parsed expression.</returns>
        internal RegExp ParseInterExp()
        {
            RegExp left = ParseConcatExp();

            // No intersection operator (or the feature is disabled): the operand stands alone.
            if (!Check(RegExpSyntax.INTERSECTION) || !Match('&'))
            {
                return left;
            }

            // Right-recursive: the remainder is parsed as another interexp.
            return MakeIntersection(left, ParseInterExp());
        }
Example #10
0
        /// <summary>
        /// Parses a <c>concatexp</c> production: a repeat expression, followed by further
        /// concatenated expressions for as long as the input does not end and the next
        /// character does not close or separate the enclosing expression.
        /// </summary>
        /// <returns>The parsed expression.</returns>
        internal RegExp ParseConcatExp()
        {
            RegExp head = ParseRepeatExp();

            // End of input, ')' or '|' terminates the concatenation.
            if (!More() || Peek(")|"))
            {
                return head;
            }

            // '&' also terminates it, but only when intersection syntax is enabled.
            if (Check(RegExpSyntax.INTERSECTION) && Peek("&"))
            {
                return head;
            }

            return MakeConcatenation(head, ParseConcatExp());
        }
Example #11
0
        /// <summary>
        /// Parses a <c>charclasses</c> production: one or more character classes,
        /// combined into a union, up to (but not including) the closing <c>]</c>.
        /// </summary>
        /// <returns>The union of all parsed character classes.</returns>
        internal RegExp ParseCharClasses()
        {
            RegExp classes = ParseCharClass();

            while (true)
            {
                // Stop at end of input or at the bracket that closes the class list.
                if (!More() || Peek("]"))
                {
                    break;
                }
                classes = MakeUnion(classes, ParseCharClass());
            }
            return classes;
        }
Example #12
0
        /// <summary>
        /// Parses a <c>unionexp</c> production: an intersection expression, optionally
        /// followed by <c>|</c> and another union expression.
        /// </summary>
        /// <returns>The parsed expression.</returns>
        internal RegExp ParseUnionExp()
        {
            RegExp left = ParseInterExp();

            if (!Match('|'))
            {
                return left;
            }

            // Right-recursive: everything after '|' is parsed as another unionexp.
            return MakeUnion(left, ParseUnionExp());
        }
Example #13
0
        /// <summary>
        /// Creates a <c>REGEXP_REPEAT_MINMAX</c> node that repeats <paramref name="exp"/>
        /// with the bounds <paramref name="min"/> and <paramref name="max"/>
        /// (the <c>{n,m}</c> form of the grammar).
        /// </summary>
        /// <param name="exp">The expression being repeated.</param>
        /// <param name="min">The minimum number of occurrences.</param>
        /// <param name="max">The maximum number of occurrences.</param>
        /// <returns>The newly created node.</returns>
        internal static RegExp MakeRepeat(RegExp exp, int min, int max)
        {
            return new RegExp
            {
                kind = Kind.REGEXP_REPEAT_MINMAX,
                exp1 = exp,
                min = min,
                max = max
            };
        }
Example #14
0
 /// <summary>
 /// Collects the automata of the operands of a chain of nested nodes that all have
 /// the kind <paramref name="kind"/>. A node of that kind is descended into through
 /// its two sub-expressions (first, then second); any other node is converted to an
 /// automaton and appended to <paramref name="list"/>.
 /// </summary>
 /// <param name="exp">The node to inspect.</param>
 /// <param name="kind">The node kind that is descended into.</param>
 /// <param name="list">Receives the automata of the leaves, in traversal order.</param>
 /// <param name="automata">Passed through to <c>ToAutomaton</c>.</param>
 /// <param name="automaton_provider">Passed through to <c>ToAutomaton</c>.</param>
 private void FindLeaves(RegExp exp, Kind kind, IList <Automaton> list, IDictionary <string, Automaton> automata, IAutomatonProvider automaton_provider)
 {
     // A node of a different kind is a leaf of the chain.
     if (exp.kind != kind)
     {
         list.Add(exp.ToAutomaton(automata, automaton_provider));
         return;
     }

     FindLeaves(exp.exp1, kind, list, automata, automaton_provider);
     FindLeaves(exp.exp2, kind, list, automata, automaton_provider);
 }
Example #15
0
        /// <summary>
        /// Checks that the character-based and the byte-based run automata built from
        /// the pattern <c>.+\u0775</c> both accept a fixed input string.
        /// </summary>
        public void TestSpecialCase2()
        {
            RegExp regExp = new RegExp(".+\u0775");
            Automaton automaton = regExp.ToAutomaton();
            CharacterRunAutomaton charRunner = new CharacterRunAutomaton(automaton);
            ByteRunAutomaton byteRunner = new ByteRunAutomaton(automaton);

            string input = "\ufadc\ufffd\ub80b\uda5a\udc68\uf234\u0056\uda5b\udcc1\ufffd\ufffd\u0775";

            // Character-level match.
            Assert.IsTrue(charRunner.Run(input));

            // Byte-level match on the UTF-8 encoding of the same input.
            sbyte[] utf8 = input.GetBytes(Encoding.UTF8);
            Assert.IsTrue(byteRunner.Run(utf8, 0, utf8.Length)); // this one fails!
        }
Example #16
0
        /// <summary>
        /// Creates a <c>REGEXP_CHAR_RANGE</c> node spanning <paramref name="from"/>
        /// through <paramref name="to"/>.
        /// </summary>
        /// <param name="from">Start of the range.</param>
        /// <param name="to">End of the range; must not be smaller than <paramref name="from"/>.</param>
        /// <returns>The newly created node.</returns>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="from"/> is greater than <paramref name="to"/>.
        /// </exception>
        internal static RegExp MakeCharRange(int from, int to)
        {
            if (from > to)
            {
                string message = "invalid range: from (" + from + ") cannot be > to (" + to + ")";
                throw new System.ArgumentException(message);
            }

            return new RegExp
            {
                kind = Kind.REGEXP_CHAR_RANGE,
                From = from,
                To = to
            };
        }
Example #17
0
        /// <summary>
        /// Checks that the character-based and the byte-based run automata built from
        /// a pattern containing escaped non-ASCII characters both accept a fixed input string.
        /// </summary>
        public void TestSpecialCase3()
        {
            RegExp regExp = new RegExp("(\\鯺)*(.)*\\Ӕ");
            Automaton automaton = regExp.ToAutomaton();
            CharacterRunAutomaton charRunner = new CharacterRunAutomaton(automaton);
            ByteRunAutomaton byteRunner = new ByteRunAutomaton(automaton);

            string input = "\u5cfd\ufffd\ub2f7\u0033\ue304\u51d7\u3692\udb50\udfb3\u0576\udae2\udc62\u0053\u0449\u04d4";

            // Character-level match.
            Assert.IsTrue(charRunner.Run(input));

            // Byte-level match on the UTF-8 encoding of the same input.
            sbyte[] utf8 = input.GetBytes(Encoding.UTF8);
            Assert.IsTrue(byteRunner.Run(utf8, 0, utf8.Length));
        }
Example #18
0
        /// <summary>
        /// Checks that both run automata built from the pattern <c>.?</c> accept the
        /// empty input.
        /// </summary>
        public void TestSpecialCase()
        {
            Automaton automaton = new RegExp(".?").ToAutomaton();
            CharacterRunAutomaton charRunner = new CharacterRunAutomaton(automaton);
            ByteRunAutomaton byteRunner = new ByteRunAutomaton(automaton);

            // The character DFA must accept the empty string: its initial state is
            // accepting, and running it on empty input succeeds.
            Assert.IsTrue(charRunner.IsAccept(charRunner.InitialState));
            Assert.IsTrue(charRunner.Run(""));
            Assert.IsTrue(charRunner.Run(new char[0], 0, 0));

            // The same must hold for the byte DFA.
            Assert.IsTrue(byteRunner.IsAccept(byteRunner.InitialState));
            Assert.IsTrue(byteRunner.Run(new byte[0], 0, 0));
        }
Example #19
0
        /// <summary>
        /// Merges two nodes into a single string node by concatenating their text.
        /// A node of kind <c>REGEXP_STRING</c> contributes its string <c>s</c>; any other
        /// node contributes its code point <c>c</c>.
        /// </summary>
        /// <param name="exp1">The node supplying the leading text.</param>
        /// <param name="exp2">The node supplying the trailing text.</param>
        /// <returns>A string node holding the combined text.</returns>
        private static RegExp MakeString(RegExp exp1, RegExp exp2)
        {
            StringBuilder text = new StringBuilder();

            // Both operands are handled the same way, in order.
            foreach (RegExp part in new RegExp[] { exp1, exp2 })
            {
                if (part.kind != Kind.REGEXP_STRING)
                {
                    text.AppendCodePoint(part.c);
                }
                else
                {
                    text.Append(part.s);
                }
            }
            return MakeString(text.ToString());
        }
Example #20
0
        /// <summary>
        /// Builds automata from random regular expressions and checks that every string
        /// produced by <c>RandomAcceptedStrings</c> is accepted by the automaton it was
        /// generated from. On failure, the regular expression and the code points of the
        /// offending string are written to the console before the exception propagates.
        /// </summary>
        public virtual void TestGetRandomAcceptedString()
        {
            int ITER1 = AtLeast(100);
            int ITER2 = AtLeast(100);

            for (int i = 0; i < ITER1; i++)
            {
                RegExp    re = new RegExp(AutomatonTestUtil.RandomRegexp(Random()), RegExp.NONE);
                Automaton a  = re.ToAutomaton();
                Assert.IsFalse(BasicOperations.IsEmpty(a));

                AutomatonTestUtil.RandomAcceptedStrings rx = new AutomatonTestUtil.RandomAcceptedStrings(a);
                for (int j = 0; j < ITER2; j++)
                {
                    // Declared outside the try so the catch block can dump it.
                    int[] acc = null;
                    try
                    {
                        acc = rx.GetRandomAcceptedString(Random());
                        string s = UnicodeUtil.NewString(acc, 0, acc.Length);
                        Assert.IsTrue(BasicOperations.Run(a, s));
                    }
                    catch (Exception)
                    {
                        Console.WriteLine("regexp: " + re);
                        if (acc != null)
                        {
                            Console.WriteLine("fail acc re=" + re + " count=" + acc.Length);
                            for (int k = 0; k < acc.Length; k++)
                            {
                                Console.WriteLine("  " + acc[k].ToString("x"));
                            }
                        }
                        // Bare rethrow: "throw t;" would reset the stack trace and hide
                        // the location of the original failure (CA2200).
                        throw;
                    }
                }
            }
        }
        /// <summary>
        /// Builds automata from random regular expressions and checks that every string
        /// produced by <c>RandomAcceptedStrings</c> is accepted by the automaton it was
        /// generated from. On failure, the regular expression and the code points of the
        /// offending string are written to the console before the exception propagates.
        /// </summary>
        public virtual void TestGetRandomAcceptedString()
        {
            int regexpCount = AtLeast(100);
            int stringsPerRegexp = AtLeast(100);

            for (int regexpIndex = 0; regexpIndex < regexpCount; regexpIndex++)
            {
                RegExp regExp = new RegExp(AutomatonTestUtil.RandomRegexp(Random), RegExpSyntax.NONE);
                Automaton automaton = regExp.ToAutomaton();
                Assert.IsFalse(BasicOperations.IsEmpty(automaton));

                RandomAcceptedStrings generator = new RandomAcceptedStrings(automaton);
                for (int stringIndex = 0; stringIndex < stringsPerRegexp; stringIndex++)
                {
                    // Declared outside the try so the catch block can dump it.
                    int[] codePoints = null;
                    try
                    {
                        codePoints = generator.GetRandomAcceptedString(Random);
                        string candidate = UnicodeUtil.NewString(codePoints, 0, codePoints.Length);
                        Assert.IsTrue(BasicOperations.Run(automaton, candidate));
                    }
                    catch (Exception)
                    {
                        Console.WriteLine("regexp: " + regExp);
                        if (codePoints != null)
                        {
                            Console.WriteLine("fail acc re=" + regExp + " count=" + codePoints.Length);
                            foreach (int codePoint in codePoints)
                            {
                                Console.WriteLine("  " + codePoint.ToString("x"));
                            }
                        }
                        // Bare rethrow keeps the original stack trace (CA2200).
                        throw;
                    }
                }
            }
        }
Example #22
0
 /// <summary>
 /// Creates a query on the field of <paramref name="term"/> whose automaton is built
 /// by parsing the term's text as a regular expression.
 /// </summary>
 /// <param name="term">Supplies the field name and the regular expression text.</param>
 /// <param name="flags">Syntax flags passed to the <c>RegExp</c> constructor.</param>
 internal DumbRegexpQuery(Term term, RegExpSyntax flags)
     : base(term.Field)
 {
     automaton = new RegExp(term.Text, flags).ToAutomaton();
 }
Example #23
0
        /// <summary>
        /// Parses a <c>repeatexp</c> production: a complement expression followed by any
        /// number of repetition suffixes (<c>?</c>, <c>*</c>, <c>+</c>, <c>{n}</c>,
        /// <c>{n,}</c> or <c>{n,m}</c>), each applied to the result so far.
        /// </summary>
        /// <returns>The parsed expression.</returns>
        /// <exception cref="System.ArgumentException">
        /// A <c>{</c> is not followed by an integer, or the closing <c>}</c> is missing.
        /// </exception>
        internal RegExp ParseRepeatExp()
        {
            RegExp result = ParseComplExp();

            while (Peek("?*+{"))
            {
                if (Match('?'))
                {
                    result = MakeOptional(result);
                    continue;
                }
                if (Match('*'))
                {
                    result = MakeRepeat(result);
                    continue;
                }
                if (Match('+'))
                {
                    result = MakeRepeat(result, 1);
                    continue;
                }
                if (!Match('{'))
                {
                    continue;
                }

                // Mandatory lower bound.
                int digitsStart = Pos;
                while (Peek("0123456789"))
                {
                    Next();
                }
                if (digitsStart == Pos)
                {
                    throw new System.ArgumentException("integer expected at position " + Pos);
                }
                int lower = Convert.ToInt32(b.Substring(digitsStart, Pos - digitsStart));

                // Upper bound: equals the lower bound for "{n}", stays -1 (unbounded)
                // for "{n,}", and is the parsed value for "{n,m}".
                int upper;
                if (!Match(','))
                {
                    upper = lower;
                }
                else
                {
                    upper = -1;
                    digitsStart = Pos;
                    while (Peek("0123456789"))
                    {
                        Next();
                    }
                    if (digitsStart != Pos)
                    {
                        upper = Convert.ToInt32(b.Substring(digitsStart, Pos - digitsStart));
                    }
                }

                if (!Match('}'))
                {
                    throw new System.ArgumentException("expected '}' at position " + Pos);
                }

                result = upper == -1
                    ? MakeRepeat(result, lower)
                    : MakeRepeat(result, lower, upper);
            }
            return result;
        }
Example #24
0
        /// <summary>
        /// Parses a <c>repeatexp</c> production: a complement expression followed by any
        /// number of repetition suffixes (<c>?</c>, <c>*</c>, <c>+</c>, <c>{n}</c>,
        /// <c>{n,}</c> or <c>{n,m}</c>), each applied to the result so far.
        /// </summary>
        /// <returns>The parsed expression.</returns>
        /// <exception cref="ArgumentException">
        /// A <c>{</c> is not followed by an integer, or the closing <c>}</c> is missing.
        /// </exception>
        internal RegExp ParseRepeatExp()
        {
            RegExp result = ParseComplExp();

            while (Peek("?*+{"))
            {
                if (Match('?'))
                {
                    result = MakeOptional(result);
                    continue;
                }
                if (Match('*'))
                {
                    result = MakeRepeat(result);
                    continue;
                }
                if (Match('+'))
                {
                    result = MakeRepeat(result, 1);
                    continue;
                }
                if (!Match('{'))
                {
                    continue;
                }

                // Mandatory lower bound.
                int digitsStart = pos;
                while (Peek("0123456789"))
                {
                    Next();
                }
                if (digitsStart == pos)
                {
                    throw new ArgumentException("integer expected at position " + pos);
                }
                // LUCENENET: parsed straight from the buffer so no substring is allocated
                int lower = Integer.Parse(b, digitsStart, pos - digitsStart, radix: 10);

                // Upper bound: equals the lower bound for "{n}", stays -1 (unbounded)
                // for "{n,}", and is the parsed value for "{n,m}".
                int upper;
                if (!Match(','))
                {
                    upper = lower;
                }
                else
                {
                    upper = -1;
                    digitsStart = pos;
                    while (Peek("0123456789"))
                    {
                        Next();
                    }
                    if (digitsStart != pos)
                    {
                        // LUCENENET: parsed straight from the buffer so no substring is allocated
                        upper = Integer.Parse(b, digitsStart, pos - digitsStart, radix: 10);
                    }
                }

                if (!Match('}'))
                {
                    throw new ArgumentException("expected '}' at position " + pos);
                }

                result = upper == -1
                    ? MakeRepeat(result, lower)
                    : MakeRepeat(result, lower, upper);
            }
            return result;
        }
Example #25
0
 /// <summary>
 /// Parses a <c>simpleexp</c> production: <c>.</c>, <c>#</c>, <c>@</c>, a double-quoted
 /// string, a parenthesized expression, a named automaton <c>&lt;identifier&gt;</c>,
 /// a numerical interval <c>&lt;n-m&gt;</c>, or a single character expression.
 /// The <c>#</c>, <c>@</c> and <c>&lt;...&gt;</c> forms are only recognized when the
 /// corresponding syntax flag is enabled.
 /// </summary>
 /// <returns>The parsed expression.</returns>
 /// <exception cref="System.ArgumentException">
 /// A closing <c>"</c>, <c>)</c> or <c>&gt;</c> is missing, or the content of
 /// <c>&lt;...&gt;</c> is not valid for the enabled syntax. This includes interval
 /// bounds that are not integers or that do not fit in an <see cref="int"/>.
 /// </exception>
 internal RegExp ParseSimpleExp()
 {
     if (Match('.'))
     {
         return(MakeAnyChar());
     }
     if (Check(EMPTY) && Match('#'))
     {
         return(MakeEmpty());
     }
     if (Check(ANYSTRING) && Match('@'))
     {
         return(MakeAnyString());
     }
     if (Match('"'))
     {
         int start = Pos;
         while (More() && !Peek("\""))
         {
             Next();
         }
         if (!Match('"'))
         {
             throw new System.ArgumentException("expected '\"' at position " + Pos);
         }
         // Pos is now one past the closing quote, hence the "- 1".
         return(MakeString(b.Substring(start, Pos - 1 - start)));
     }
     if (Match('('))
     {
         if (Match(')'))
         {
             return(MakeString(""));
         }
         RegExp e = ParseUnionExp();
         if (!Match(')'))
         {
             throw new System.ArgumentException("expected ')' at position " + Pos);
         }
         return(e);
     }
     if ((Check(AUTOMATON) || Check(INTERVAL)) && Match('<'))
     {
         int start = Pos;
         while (More() && !Peek(">"))
         {
             Next();
         }
         if (!Match('>'))
         {
             throw new System.ArgumentException("expected '>' at position " + Pos);
         }
         string s = b.Substring(start, Pos - 1 - start);
         int    i = s.IndexOf('-');
         if (i == -1)
         {
             // No dash: the content is an automaton identifier.
             if (!Check(AUTOMATON))
             {
                 throw new System.ArgumentException("interval syntax error at position " + (Pos - 1));
             }
             return(MakeAutomaton(s));
         }

         // A dash is present: the content must be a numerical interval.
         if (!Check(INTERVAL))
         {
             throw new System.ArgumentException("illegal identifier at position " + (Pos - 1));
         }
         // Exactly one dash is allowed, and it may be neither the first nor the last character.
         if (i == 0 || i == s.Length - 1 || i != s.LastIndexOf('-'))
         {
             throw new System.ArgumentException("interval syntax error at position " + (Pos - 1));
         }
         string smin = s.Substring(0, i);
         string smax = s.Substring(i + 1, s.Length - (i + 1));

         // TryParse reports both malformed and out-of-range bounds as a plain failure,
         // so an oversized number yields the same ArgumentException as any other
         // syntax error instead of leaking an OverflowException. The invariant culture
         // keeps the result independent of the current thread's culture.
         int imin;
         int imax;
         if (!int.TryParse(smin, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out imin) ||
             !int.TryParse(smax, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out imax))
         {
             throw new System.ArgumentException("interval syntax error at position " + (Pos - 1));
         }

         // Bounds written with equal width fix the width of the matched strings;
         // otherwise the digit count is left at 0.
         int digits = smin.Length == smax.Length ? smin.Length : 0;

         // Accept the bounds in either order.
         if (imin > imax)
         {
             int t = imin;
             imin = imax;
             imax = t;
         }
         return(MakeInterval(imin, imax, digits));
     }
     return(MakeChar(ParseCharExp()));
 }
Example #26
0
        /// <summary>
        /// Parses a <c>simpleexp</c> production: <c>.</c>, <c>#</c>, <c>@</c>, a double-quoted
        /// string, a parenthesized expression, a named automaton <c>&lt;identifier&gt;</c>,
        /// a numerical interval <c>&lt;n-m&gt;</c>, or a single character expression.
        /// The <c>#</c>, <c>@</c> and <c>&lt;...&gt;</c> forms are only recognized when the
        /// corresponding syntax flag is enabled.
        /// </summary>
        /// <returns>The parsed expression.</returns>
        /// <exception cref="ArgumentException">
        /// A closing <c>"</c>, <c>)</c> or <c>&gt;</c> is missing, or the content of
        /// <c>&lt;...&gt;</c> is not valid for the enabled syntax.
        /// </exception>
        internal RegExp ParseSimpleExp()
        {
            if (Match('.'))
            {
                return MakeAnyChar();
            }

            if (Check(RegExpSyntax.EMPTY) && Match('#'))
            {
                return MakeEmpty();
            }

            if (Check(RegExpSyntax.ANYSTRING) && Match('@'))
            {
                return MakeAnyString();
            }

            if (Match('"'))
            {
                int contentStart = pos;
                while (More() && !Peek("\""))
                {
                    Next();
                }
                if (!Match('"'))
                {
                    throw new ArgumentException("expected '\"' at position " + pos);
                }
                // pos is now one past the closing quote, hence the "- 1".
                return MakeString(b.Substring(contentStart, pos - 1 - contentStart));
            }

            if (Match('('))
            {
                if (Match(')'))
                {
                    return MakeString("");
                }
                RegExp inner = ParseUnionExp();
                if (!Match(')'))
                {
                    throw new ArgumentException("expected ')' at position " + pos);
                }
                return inner;
            }

            if ((Check(RegExpSyntax.AUTOMATON) || Check(RegExpSyntax.INTERVAL)) && Match('<'))
            {
                int contentStart = pos;
                while (More() && !Peek(">"))
                {
                    Next();
                }
                if (!Match('>'))
                {
                    throw new ArgumentException("expected '>' at position " + pos);
                }

                string content = b.Substring(contentStart, pos - 1 - contentStart);
                int dash = content.IndexOf('-');

                // No dash: the content is an automaton identifier.
                if (dash == -1)
                {
                    if (!Check(RegExpSyntax.AUTOMATON))
                    {
                        throw new ArgumentException("interval syntax error at position " + (pos - 1));
                    }
                    return MakeAutomaton(content);
                }

                // A dash is present: the content must be a numerical interval.
                if (!Check(RegExpSyntax.INTERVAL))
                {
                    throw new ArgumentException("illegal identifier at position " + (pos - 1));
                }

                // Exactly one dash is allowed, and it may be neither the first nor the
                // last character. Validated up front so no exception is needed for
                // ordinary syntax errors.
                bool dashMisplaced = dash == 0 || dash == content.Length - 1 || dash != content.LastIndexOf('-');
                if (dashMisplaced)
                {
                    throw new ArgumentException("interval syntax error at position " + (pos - 1));
                }

                string lowerText = content.Substring(0, dash);
                string upperText = content.Substring(dash + 1, content.Length - (dash + 1));

                if (!int.TryParse(lowerText, NumberStyles.Integer, CultureInfo.InvariantCulture, out int first) ||
                    !int.TryParse(upperText, NumberStyles.Integer, CultureInfo.InvariantCulture, out int second))
                {
                    throw new ArgumentException("interval syntax error at position " + (pos - 1));
                }

                // Bounds written with equal width fix the width of the matched strings;
                // otherwise the digit count is left at 0.
                int digits = lowerText.Length == upperText.Length ? lowerText.Length : 0;

                // Accept the bounds in either order.
                int low = first;
                int high = second;
                if (low > high)
                {
                    low = second;
                    high = first;
                }
                return MakeInterval(low, high, digits);
            }

            return MakeChar(ParseCharExp());
        }