Ejemplo n.º 1
0
        // Regression case for the pattern ".+\u0775": an input that contains
        // replacement characters (U+FFFD) and surrogate pairs and ends in U+0775
        // must be accepted by the char-level automaton and by the byte-level
        // automaton fed with the UTF-8 encoding of the same input.
        public void TestSpecialCase2()
        {
            RegExp pattern = new RegExp(".+\u0775");
            string text = "\ufadc\ufffd\ub80b\uda5a\udc68\uf234\u0056\uda5b\udcc1\ufffd\ufffd\u0775";
            Automaton compiled = pattern.ToAutomaton();

            // Build both runners from the very same automaton instance.
            var charRunner = new CharacterRunAutomaton(compiled);
            var byteRunner = new ByteRunAutomaton(compiled);

            bool acceptedAsChars = charRunner.Run(text);
            Assert.IsTrue(acceptedAsChars);

            sbyte[] utf8 = text.GetBytes(Encoding.UTF8);
            // The original author flagged this byte-level check as the one that failed.
            bool acceptedAsBytes = byteRunner.Run(utf8, 0, utf8.Length);
            Assert.IsTrue(acceptedAsBytes);
        }
Ejemplo n.º 2
0
        // Regression case for the pattern "(\鯺)*(.)*\Ӕ": the fixed input below
        // (which ends in U+04D4 and contains surrogate pairs) must be accepted
        // both as a char sequence and as its UTF-8 byte sequence.
        public void TestSpecialCase3()
        {
            RegExp pattern = new RegExp("(\\鯺)*(.)*\\Ӕ");
            string text = "\u5cfd\ufffd\ub2f7\u0033\ue304\u51d7\u3692\udb50\udfb3\u0576\udae2\udc62\u0053\u0449\u04d4";
            Automaton compiled = pattern.ToAutomaton();

            // Build both runners from the very same automaton instance.
            var charRunner = new CharacterRunAutomaton(compiled);
            var byteRunner = new ByteRunAutomaton(compiled);

            bool acceptedAsChars = charRunner.Run(text);
            Assert.IsTrue(acceptedAsChars);

            sbyte[] utf8 = text.GetBytes(Encoding.UTF8);
            bool acceptedAsBytes = byteRunner.Run(utf8, 0, utf8.Length);
            Assert.IsTrue(acceptedAsBytes);
        }
Ejemplo n.º 3
0
        // The pattern ".?" must accept the empty input, both in the char-level
        // automaton and in the byte-level automaton.
        public void TestSpecialCase()
        {
            Automaton compiled = new RegExp(".?").ToAutomaton();
            var charRunner = new CharacterRunAutomaton(compiled);
            var byteRunner = new ByteRunAutomaton(compiled);

            // Char DFA: the initial state is accepting, and an empty string or an
            // empty char slice is accepted.
            int charStart = charRunner.InitialState;
            Assert.IsTrue(charRunner.IsAccept(charStart));
            Assert.IsTrue(charRunner.Run(""));
            char[] noChars = new char[0];
            Assert.IsTrue(charRunner.Run(noChars, 0, 0));

            // Byte DFA: same expectations for an empty byte slice.
            int byteStart = byteRunner.InitialState;
            Assert.IsTrue(byteRunner.IsAccept(byteStart));
            byte[] noBytes = new byte[0];
            Assert.IsTrue(byteRunner.Run(noBytes, 0, 0));
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Shuffles the given automata, unions and determinizes them, then asserts that
 /// the result is finite and accepts every term, first as a string and then as
 /// UTF-8 bytes through a <c>ByteRunAutomaton</c>.
 /// </summary>
 /// <param name="a">Automata to combine into a single lexicon automaton.</param>
 /// <param name="terms">Terms that the combined automaton must accept.</param>
 public void AssertLexicon(List<Automaton> a, List<string> terms)
 {
     var shuffled = CollectionsHelper.Shuffle(a);
     var lexicon = BasicOperations.Union(shuffled);
     lexicon.Determinize();

     bool isFinite = SpecialOperations.IsFinite(lexicon);
     Assert.IsTrue(isFinite);

     // Pass 1: run each term directly against the automaton.
     for (int i = 0; i < terms.Count; i++)
     {
         Assert.IsTrue(BasicOperations.Run(lexicon, terms[i]));
     }

     // Pass 2: run the UTF-8 encoding of each term against the byte runner.
     var byteRunner = new ByteRunAutomaton(lexicon);
     for (int i = 0; i < terms.Count; i++)
     {
         sbyte[] utf8 = terms[i].GetBytes(Encoding.UTF8);
         Assert.IsTrue(byteRunner.Run(utf8, 0, utf8.Length));
     }
 }
Ejemplo n.º 5
0
 /// <summary>
 /// Builds one deterministic automaton from the union of the shuffled input
 /// automata and asserts that it is finite and accepts every term, both as a
 /// string and as UTF-8 bytes.
 /// </summary>
 /// <param name="a">Automata to union.</param>
 /// <param name="terms">Terms expected to be accepted.</param>
 public void AssertLexicon(List<Automaton> a, List<string> terms)
 {
     var union = BasicOperations.Union(CollectionsHelper.Shuffle(a));
     union.Determinize();
     Assert.IsTrue(SpecialOperations.IsFinite(union));

     // String-level acceptance for every term.
     foreach (string term in terms)
     {
         bool accepted = BasicOperations.Run(union, term);
         Assert.IsTrue(accepted);
     }

     // Byte-level acceptance for every term, using its UTF-8 encoding.
     var byteRunner = new ByteRunAutomaton(union);
     foreach (string term in terms)
     {
         var encoded = term.GetBytes(Encoding.UTF8);
         bool accepted = byteRunner.Run(encoded, 0, encoded.Length);
         Assert.IsTrue(accepted);
     }
 }
Ejemplo n.º 6
0
        // Shuffles the automata held by this fixture, unions and determinizes them,
        // and asserts that the result is finite and accepts every entry of `terms`,
        // first as a string and then as UTF-8 bytes.
        public void AssertLexicon()
        {
            Collections.Shuffle(automata, Random());

            var lexicon = BasicOperations.Union(automata);
            lexicon.Determinize();

            bool isFinite = SpecialOperations.IsFinite(lexicon);
            Assert.IsTrue(isFinite);

            // String-level acceptance.
            foreach (string term in terms)
            {
                bool accepted = BasicOperations.Run(lexicon, term);
                assertTrue(accepted);
            }

            // Byte-level acceptance over the UTF-8 encoding of each term.
            var byteRunner = new ByteRunAutomaton(lexicon);
            foreach (string term in terms)
            {
                var encoded = term.GetBytes(Encoding.UTF8);
                bool accepted = byteRunner.Run(encoded, 0, encoded.Length);
                assertTrue(accepted);
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Cross-checks the char-based and the UTF-8 byte-based run automata built from
        /// <paramref name="automaton"/>: for <c>AtLeast(1000)</c> random strings, both
        /// runners must give the same accept/reject answer.
        /// </summary>
        /// <param name="automaton">Automaton from which both runners are built.</param>
        private static void AssertAutomaton(Automaton automaton)
        {
            var cra = new CharacterRunAutomaton(automaton);
            var bra = new ByteRunAutomaton(automaton);
            var ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);

            int num = AtLeast(1000);

            for (int i = 0; i < num; i++)
            {
                string s;
                if (Random().NextBoolean())
                {
                    // likely not accepted
                    s = TestUtil.RandomUnicodeString(Random());
                }
                else
                {
                    // will be accepted
                    int[] codepoints = ras.GetRandomAcceptedString(Random());
                    try
                    {
                        s = UnicodeUtil.NewString(codepoints, 0, codepoints.Length);
                    }
                    catch (Exception)
                    {
                        // Dump the offending code points (in hex) before propagating the failure.
                        Console.WriteLine(codepoints.Length + " codepoints:");
                        foreach (int codepoint in codepoints)
                        {
                            Console.WriteLine("  " + codepoint.ToString("x"));
                        }
                        // Bare rethrow preserves the original stack trace; "throw e;" would reset it.
                        throw;
                    }
                }

                // Both runners must agree on the same input (chars vs. its UTF-8 bytes).
                var bytes = s.GetBytes(Encoding.UTF8);
                Assert.AreEqual(cra.Run(s), bra.Run(bytes, 0, bytes.Length));
            }
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Verifies that the char-based and the UTF-8 byte-based run automata derived
        /// from <paramref name="automaton"/> agree on <c>AtLeast(1000)</c> random inputs.
        /// </summary>
        /// <param name="automaton">Automaton from which both runners are built.</param>
        private static void AssertAutomaton(Automaton automaton)
        {
            var charRunner = new CharacterRunAutomaton(automaton);
            var byteRunner = new ByteRunAutomaton(automaton);
            var acceptedSource = new RandomAcceptedStrings(automaton);

            int rounds = AtLeast(1000);

            for (int round = 0; round < rounds; round++)
            {
                string candidate;
                if (!Random.NextBoolean())
                {
                    // Draw a string the automaton is expected to accept.
                    int[] accepted = acceptedSource.GetRandomAcceptedString(Random);
                    try
                    {
                        candidate = UnicodeUtil.NewString(accepted, 0, accepted.Length);
                    }
                    catch (Exception)
                    {
                        // Print the code points (hex) that could not be turned into a string.
                        Console.WriteLine(accepted.Length + " codepoints:");
                        foreach (int codepoint in accepted)
                        {
                            Console.WriteLine("  " + codepoint.ToString("x"));
                        }
                        // Bare rethrow so the stack trace is preserved (CA2200).
                        throw;
                    }
                }
                else
                {
                    // Arbitrary Unicode string; most likely not accepted.
                    candidate = TestUtil.RandomUnicodeString(Random);
                }

                var utf8 = candidate.GetBytes(Encoding.UTF8);
                bool charVerdict = charRunner.Run(candidate);
                bool byteVerdict = byteRunner.Run(utf8, 0, utf8.Length);
                Assert.AreEqual(charVerdict, byteVerdict);
            }
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Checks a byte automaton built for the code point range
        /// [<paramref name="startCode"/>, <paramref name="endCode"/>]: random non-surrogate
        /// code points inside the range must match, and random non-surrogate code points
        /// outside the range must not.
        /// </summary>
        /// <param name="r">Random source for picking code points.</param>
        /// <param name="a">Automaton under test.</param>
        /// <param name="startCode">First code point of the range (inclusive).</param>
        /// <param name="endCode">Last code point of the range (inclusive).</param>
        /// <param name="iters">Number of checks to perform in each phase.</param>
        private void TestOne(Random r, ByteRunAutomaton a, int startCode, int endCode, int iters)
        {
            // Phase 1: in-range code points must be accepted.
            int rangeSize = endCode - startCode + 1;

            // How many surrogate code points the range contains, and whether the range
            // begins before the surrogate block (which decides how a surrogate pick is remapped).
            int surrogatesInRange;
            bool ovSurStart;

            if (endCode < UnicodeUtil.UNI_SUR_HIGH_START || startCode > UnicodeUtil.UNI_SUR_LOW_END)
            {
                // Range and surrogate block are disjoint.
                surrogatesInRange = 0;
                ovSurStart = false;
            }
            else if (IsSurrogate(startCode))
            {
                // Range begins inside the surrogate block.
                surrogatesInRange = UnicodeUtil.UNI_SUR_LOW_END - startCode + 1;
                ovSurStart = false;
            }
            else if (IsSurrogate(endCode))
            {
                // Range ends inside the surrogate block.
                surrogatesInRange = endCode - UnicodeUtil.UNI_SUR_HIGH_START + 1;
                ovSurStart = true;
            }
            else
            {
                // Range contains the entire surrogate block.
                surrogatesInRange = UnicodeUtil.UNI_SUR_LOW_END - UnicodeUtil.UNI_SUR_HIGH_START + 1;
                ovSurStart = true;
            }

            int nonSurrogateCount = rangeSize - surrogatesInRange;
            Debug.Assert(nonSurrogateCount > 0);

            for (int pass = 0; pass < iters; pass++)
            {
                int code = startCode + r.Next(nonSurrogateCount);
                if (IsSurrogate(code))
                {
                    // Shift a surrogate pick to just past the surrogate block.
                    int offsetBase = ovSurStart ? UnicodeUtil.UNI_SUR_HIGH_START : startCode;
                    code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - offsetBase);
                }

                Debug.Assert(code >= startCode && code <= endCode, "code=" + code + " start=" + startCode + " end=" + endCode);
                Debug.Assert(!IsSurrogate(code));

                Assert.IsTrue(Matches(a, code), "DFA for range " + startCode + "-" + endCode + " failed to match code=" + code);
            }

            // Phase 2: out-of-range code points must be rejected.
            int invalidRange = MAX_UNICODE - (endCode - startCode + 1);
            if (invalidRange <= 0)
            {
                return;
            }

            int verified = 0;
            while (verified < iters)
            {
                int x = TestUtil.NextInt(r, 0, invalidRange - 1);

                // Values below startCode are used as-is; the rest are shifted past endCode.
                int code = x >= startCode ? endCode + 1 + x - startCode : x;

                bool isHighSurrogate = code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END;
                bool isLowSurrogate = code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END;
                if (isHighSurrogate || isLowSurrogate)
                {
                    // Surrogates are skipped without counting as a completed check.
                    continue;
                }

                Assert.IsFalse(Matches(a, code), "DFA for range " + startCode + "-" + endCode + " matched invalid code=" + code);
                verified++;
            }
        }
Ejemplo n.º 10
0
 /// <summary>
 /// Converts <paramref name="code"/> to UTF-16 chars, passes them through
 /// <c>UnicodeUtil.UTF16toUTF8</c> into the shared buffer <c>b</c>, and runs the
 /// automaton over that buffer's bytes.
 /// </summary>
 /// <param name="a">Automaton to run.</param>
 /// <param name="code">Code point to test.</param>
 /// <returns>The result of <c>a.Run</c> on the buffer contents.</returns>
 private bool Matches(ByteRunAutomaton a, int code)
 {
     char[] utf16 = Character.ToChars(code);
     // b is declared outside this method; presumably a reusable UTF-8 scratch buffer.
     UnicodeUtil.UTF16toUTF8(utf16, 0, utf16.Length, b);
     bool accepted = a.Run(b.Bytes, 0, b.Length);
     return accepted;
 }
Ejemplo n.º 11
0
        // Checks that the automaton for ".?" accepts empty input, for both the
        // char-level and the byte-level runner.
        public void TestSpecialCase()
        {
            RegExp optionalAnyChar = new RegExp(".?");
            Automaton compiled = optionalAnyChar.ToAutomaton();
            var charRunner = new CharacterRunAutomaton(compiled);
            var byteRunner = new ByteRunAutomaton(compiled);

            // Character DFA: accepting initial state, empty string, empty char slice.
            bool charStartAccepts = charRunner.IsAccept(charRunner.InitialState);
            Assert.IsTrue(charStartAccepts);
            Assert.IsTrue(charRunner.Run(""));
            char[] emptyChars = new char[0];
            Assert.IsTrue(charRunner.Run(emptyChars, 0, 0));

            // Byte DFA: accepting initial state, empty byte slice.
            bool byteStartAccepts = byteRunner.IsAccept(byteRunner.InitialState);
            Assert.IsTrue(byteStartAccepts);
            byte[] emptyBytes = new byte[0];
            Assert.IsTrue(byteRunner.Run(emptyBytes, 0, 0));
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Exercises a byte automaton that should recognise the code point range
        /// [<paramref name="startCode"/>, <paramref name="endCode"/>]. First asserts that
        /// randomly chosen non-surrogate code points inside the range match, then that
        /// randomly chosen non-surrogate code points outside the range do not.
        /// </summary>
        /// <param name="r">Random source.</param>
        /// <param name="a">Automaton under test.</param>
        /// <param name="startCode">Inclusive lower bound of the range.</param>
        /// <param name="endCode">Inclusive upper bound of the range.</param>
        /// <param name="iters">How many checks each phase performs.</param>
        private void TestOne(Random r, ByteRunAutomaton a, int startCode, int endCode, int iters)
        {
            int span = endCode - startCode + 1;

            // Number of non-surrogate code points in the range, plus a flag telling
            // whether the range starts below the surrogate block.
            int nonSurrogateCount;
            bool ovSurStart;
            bool disjointFromSurrogates = endCode < UnicodeUtil.UNI_SUR_HIGH_START || startCode > UnicodeUtil.UNI_SUR_LOW_END;
            if (disjointFromSurrogates)
            {
                ovSurStart = false;
                nonSurrogateCount = span;
            }
            else if (IsSurrogate(startCode))
            {
                // The leading part of the range lies in the surrogate block.
                ovSurStart = false;
                nonSurrogateCount = span - (UnicodeUtil.UNI_SUR_LOW_END - startCode + 1);
            }
            else if (IsSurrogate(endCode))
            {
                // The trailing part of the range lies in the surrogate block.
                ovSurStart = true;
                nonSurrogateCount = span - (endCode - UnicodeUtil.UNI_SUR_HIGH_START + 1);
            }
            else
            {
                // The surrogate block sits strictly inside the range.
                ovSurStart = true;
                nonSurrogateCount = span - (UnicodeUtil.UNI_SUR_LOW_END - UnicodeUtil.UNI_SUR_HIGH_START + 1);
            }

            Debug.Assert(nonSurrogateCount > 0);

            // Accepting phase.
            for (int n = 0; n < iters; n++)
            {
                int code = startCode + r.Next(nonSurrogateCount);
                if (IsSurrogate(code))
                {
                    // Remap a surrogate pick to the region right after the surrogate block.
                    int distance = ovSurStart
                        ? code - UnicodeUtil.UNI_SUR_HIGH_START
                        : code - startCode;
                    code = UnicodeUtil.UNI_SUR_LOW_END + 1 + distance;
                }

                Debug.Assert(code >= startCode && code <= endCode, "code=" + code + " start=" + startCode + " end=" + endCode);
                Debug.Assert(!IsSurrogate(code));

                bool matched = Matches(a, code);
                Assert.IsTrue(matched, "DFA for range " + startCode + "-" + endCode + " failed to match code=" + code);
            }

            // Rejecting phase; skipped entirely when there is nothing outside the range.
            int invalidRange = MAX_UNICODE - (endCode - startCode + 1);
            if (invalidRange <= 0)
            {
                return;
            }

            for (int done = 0; done < iters; )
            {
                int x = TestUtil.NextInt(r, 0, invalidRange - 1);

                int code;
                if (x < startCode)
                {
                    code = x;
                }
                else
                {
                    // Skip over the valid range.
                    code = endCode + 1 + x - startCode;
                }

                bool high = code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END;
                bool low = code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END;
                if (high || low)
                {
                    // Draw again; a surrogate does not count as a completed check.
                    continue;
                }

                bool matched = Matches(a, code);
                Assert.IsFalse(matched, "DFA for range " + startCode + "-" + endCode + " matched invalid code=" + code);
                done++;
            }
        }
Ejemplo n.º 13
0
 /// <summary>
 /// Runs the automaton over the bytes produced for a single code point: the code
 /// point is expanded with <c>Character.ToChars</c> and passed through
 /// <c>UnicodeUtil.UTF16toUTF8</c> into the buffer <c>b</c>.
 /// </summary>
 /// <param name="a">Automaton to run.</param>
 /// <param name="code">Code point to test.</param>
 /// <returns>Whatever <c>a.Run</c> reports for the buffer contents.</returns>
 private bool Matches(ByteRunAutomaton a, int code)
 {
     char[] units = Character.ToChars(code);
     int unitCount = units.Length;
     // b lives outside this method; presumably a shared UTF-8 output buffer.
     UnicodeUtil.UTF16toUTF8(units, 0, unitCount, b);
     return a.Run(b.Bytes, 0, b.Length);
 }
Ejemplo n.º 14
0
        /// <summary>
        /// Cross-checks the char-based and the UTF-8 byte-based run automata built from
        /// <paramref name="automaton"/>: for <c>AtLeast(1000)</c> random strings, both
        /// runners must give the same accept/reject answer.
        /// </summary>
        /// <param name="automaton">Automaton from which both runners are built.</param>
        private static void AssertAutomaton(Automaton automaton)
        {
            var cra = new CharacterRunAutomaton(automaton);
            var bra = new ByteRunAutomaton(automaton);
            var ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);

            int num = AtLeast(1000);
            for (int i = 0; i < num; i++)
            {
                string s;
                if (Random().NextBoolean())
                {
                    // likely not accepted
                    s = TestUtil.RandomUnicodeString(Random());
                }
                else
                {
                    // will be accepted
                    int[] codepoints = ras.GetRandomAcceptedString(Random());
                    try
                    {
                        s = UnicodeUtil.NewString(codepoints, 0, codepoints.Length);
                    }
                    catch (Exception)
                    {
                        // Dump the offending code points (in hex) before propagating the failure.
                        Console.WriteLine(codepoints.Length + " codepoints:");
                        foreach (int codepoint in codepoints)
                        {
                            Console.WriteLine("  " + codepoint.ToString("x"));
                        }
                        // Bare rethrow preserves the original stack trace; "throw e;" would reset it.
                        throw;
                    }
                }

                // Both runners must agree on the same input (chars vs. its UTF-8 bytes).
                var bytes = s.GetBytes(Encoding.UTF8);
                Assert.AreEqual(cra.Run(s), bra.Run(bytes, 0, bytes.Length));
            }
        }
Ejemplo n.º 15
0
        // Fixed-input regression check for the pattern "(\鯺)*(.)*\Ӕ": the input must
        // be accepted by the char runner and, once UTF-8 encoded, by the byte runner.
        public void TestSpecialCase3()
        {
            string text = "\u5cfd\ufffd\ub2f7\u0033\ue304\u51d7\u3692\udb50\udfb3\u0576\udae2\udc62\u0053\u0449\u04d4";
            Automaton compiled = new RegExp("(\\鯺)*(.)*\\Ӕ").ToAutomaton();

            // Char runner first, then byte runner, both over the same automaton.
            var charRunner = new CharacterRunAutomaton(compiled);
            var byteRunner = new ByteRunAutomaton(compiled);

            bool charVerdict = charRunner.Run(text);
            Assert.IsTrue(charVerdict);

            var encoded = text.GetBytes(Encoding.UTF8);
            bool byteVerdict = byteRunner.Run(encoded, 0, encoded.Length);
            Assert.IsTrue(byteVerdict);
        }
Ejemplo n.º 16
0
        // Fixed-input regression check for the pattern ".+\u0775": the input must be
        // accepted by the char runner and, once UTF-8 encoded, by the byte runner.
        public void TestSpecialCase2()
        {
            string text = "\ufadc\ufffd\ub80b\uda5a\udc68\uf234\u0056\uda5b\udcc1\ufffd\ufffd\u0775";
            Automaton compiled = new RegExp(".+\u0775").ToAutomaton();

            // Char runner first, then byte runner, both over the same automaton.
            var charRunner = new CharacterRunAutomaton(compiled);
            var byteRunner = new ByteRunAutomaton(compiled);

            bool charVerdict = charRunner.Run(text);
            Assert.IsTrue(charVerdict);

            var encoded = text.GetBytes(Encoding.UTF8);
            // The original author marked this byte-level assertion as the failing one.
            bool byteVerdict = byteRunner.Run(encoded, 0, encoded.Length);
            Assert.IsTrue(byteVerdict);
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Classifies <paramref name="automaton"/> and prepares the state needed to run it.
        /// With <paramref name="simplify"/> set, the automaton is first tested for four
        /// simple forms (NONE, ALL, SINGLE, PREFIX); if one applies, only <c>Type</c> and
        /// <c>Term</c> carry information and no run automaton is built. Otherwise the
        /// automaton is converted with <c>UTF32ToUTF8</c> and a <c>ByteRunAutomaton</c>
        /// is created (type NORMAL).
        /// </summary>
        /// <param name="automaton">Automaton to compile.</param>
        /// <param name="finite">
        /// Whether the automaton is finite, if already known; when <c>null</c> it is
        /// computed with <c>SpecialOperations.IsFinite</c> (NORMAL type only).
        /// </param>
        /// <param name="simplify">
        /// Whether to test for the simple forms first; the original note warns these
        /// tests could be costly on a large automaton.
        /// </param>
        public CompiledAutomaton(Automaton automaton, bool? finite, bool simplify)
        {
            if (simplify)
            {
                // Simple form 1: the automaton matches nothing.
                if (BasicOperations.IsEmpty(automaton))
                {
                    Type = AUTOMATON_TYPE.NONE;
                    Term = null;
                    CommonSuffixRef = null;
                    RunAutomaton = null;
                    sortedTransitions = null;
                    this.Finite = null;
                    return;
                }

                // Simple form 2: the automaton matches all possible strings.
                if (BasicOperations.IsTotal(automaton))
                {
                    Type = AUTOMATON_TYPE.ALL;
                    Term = null;
                    CommonSuffixRef = null;
                    RunAutomaton = null;
                    sortedTransitions = null;
                    this.Finite = null;
                    return;
                }

                // Work out whether the automaton accepts exactly one string, either
                // because it is stored in singleton form or because its language
                // equals its own (non-empty) common prefix.
                string commonPrefix;
                string singleton;
                if (automaton.Singleton != null)
                {
                    commonPrefix = null;
                    singleton = automaton.Singleton;
                }
                else
                {
                    commonPrefix = SpecialOperations.GetCommonPrefix(automaton);
                    bool prefixIsWholeLanguage = commonPrefix.Length > 0 && BasicOperations.SameLanguage(automaton, BasicAutomata.MakeString(commonPrefix));
                    singleton = prefixIsWholeLanguage ? commonPrefix : null;
                }

                // Simple form 3: a fixed string (singleton or expanded representation).
                if (singleton != null)
                {
                    Type = AUTOMATON_TYPE.SINGLE;
                    Term = new BytesRef(singleton);
                    CommonSuffixRef = null;
                    RunAutomaton = null;
                    sortedTransitions = null;
                    this.Finite = null;
                    return;
                }

                // Simple form 4: a constant prefix followed by any string.
                var prefixThenAnything = BasicOperations.Concatenate(BasicAutomata.MakeString(commonPrefix), BasicAutomata.MakeAnyString());
                if (BasicOperations.SameLanguage(automaton, prefixThenAnything))
                {
                    Type = AUTOMATON_TYPE.PREFIX;
                    Term = new BytesRef(commonPrefix);
                    CommonSuffixRef = null;
                    RunAutomaton = null;
                    sortedTransitions = null;
                    this.Finite = null;
                    return;
                }
            }

            // General case: build the byte-level run automaton.
            Type = AUTOMATON_TYPE.NORMAL;
            Term = null;

            // Use the caller's finiteness answer when given, otherwise compute it.
            this.Finite = finite ?? SpecialOperations.IsFinite(automaton);

            Automaton utf8 = (new UTF32ToUTF8()).Convert(automaton);

            // The common suffix is only computed when the automaton is not known to be finite.
            CommonSuffixRef = this.Finite == true
                ? null
                : SpecialOperations.GetCommonSuffixBytesRef(utf8);

            RunAutomaton = new ByteRunAutomaton(utf8, true);
            sortedTransitions = utf8.GetSortedTransitions();
        }
Ejemplo n.º 18
0
        /// <summary>
        /// Compiles <paramref name="automaton"/>. When <paramref name="simplify"/> is set,
        /// the automaton is checked against four simple forms (NONE, ALL, SINGLE, PREFIX)
        /// and, on a hit, no run automaton is created. In every other case the automaton
        /// is converted with <c>UTF32ToUTF8</c> and wrapped in a <c>ByteRunAutomaton</c>
        /// (type NORMAL).
        /// </summary>
        /// <param name="automaton">Automaton to compile.</param>
        /// <param name="finite">
        /// Known finiteness of the automaton, or <c>null</c> to have it computed with
        /// <c>SpecialOperations.IsFinite</c> (only consulted for the NORMAL type).
        /// </param>
        /// <param name="simplify">
        /// Enables the simple-form tests; the original note warns they could be costly
        /// on a large automaton.
        /// </param>
        public CompiledAutomaton(Automaton automaton, bool? finite, bool simplify)
        {
            if (simplify)
            {
                if (BasicOperations.IsEmpty(automaton))
                {
                    // The language is empty: nothing can match.
                    Type = AUTOMATON_TYPE.NONE;
                    Term = null;
                    CommonSuffixRef = null;
                    RunAutomaton = null;
                    SortedTransitions = null;
                    this.Finite = null;
                    return;
                }

                if (BasicOperations.IsTotal(automaton))
                {
                    // The language contains every possible string.
                    Type = AUTOMATON_TYPE.ALL;
                    Term = null;
                    CommonSuffixRef = null;
                    RunAutomaton = null;
                    SortedTransitions = null;
                    this.Finite = null;
                    return;
                }

                // Determine the single accepted string, if there is exactly one.
                string commonPrefix;
                string singleton;
                if (automaton.Singleton != null)
                {
                    // Already held in singleton form; no prefix is computed.
                    commonPrefix = null;
                    singleton = automaton.Singleton;
                }
                else
                {
                    commonPrefix = SpecialOperations.GetCommonPrefix(automaton);
                    if (commonPrefix.Length > 0 && BasicOperations.SameLanguage(automaton, BasicAutomata.MakeString(commonPrefix)))
                    {
                        // Expanded representation that still accepts only the prefix itself.
                        singleton = commonPrefix;
                    }
                    else
                    {
                        singleton = null;
                    }
                }

                if (singleton != null)
                {
                    // Exactly one fixed string matches.
                    Type = AUTOMATON_TYPE.SINGLE;
                    Term = new BytesRef(singleton);
                    CommonSuffixRef = null;
                    RunAutomaton = null;
                    SortedTransitions = null;
                    this.Finite = null;
                    return;
                }

                var prefixAutomaton = BasicAutomata.MakeString(commonPrefix);
                var prefixThenAnything = BasicOperations.Concatenate(prefixAutomaton, BasicAutomata.MakeAnyString());
                if (BasicOperations.SameLanguage(automaton, prefixThenAnything))
                {
                    // Everything starting with a constant prefix matches.
                    Type = AUTOMATON_TYPE.PREFIX;
                    Term = new BytesRef(commonPrefix);
                    CommonSuffixRef = null;
                    RunAutomaton = null;
                    SortedTransitions = null;
                    this.Finite = null;
                    return;
                }
            }

            // No simple form applied (or none was tested): build the general machinery.
            Type = AUTOMATON_TYPE.NORMAL;
            Term = null;

            // Prefer the caller-supplied finiteness; compute it only when absent.
            this.Finite = finite.HasValue ? finite : SpecialOperations.IsFinite(automaton);

            Automaton utf8 = (new UTF32ToUTF8()).Convert(automaton);

            // A common suffix is computed only when the automaton is not known to be finite.
            if (this.Finite != true)
            {
                CommonSuffixRef = SpecialOperations.GetCommonSuffixBytesRef(utf8);
            }
            else
            {
                CommonSuffixRef = null;
            }

            RunAutomaton = new ByteRunAutomaton(utf8, true);
            SortedTransitions = utf8.SortedTransitions;
        }