/// <summary>
/// Builds an automaton from the pattern <c>.+\u0775</c> and asserts that both the
/// character-based and the byte-based run automata accept a fixed sample input.
/// The sample contains surrogate pairs and U+FFFD and ends in U+0775.
/// </summary>
public void TestSpecialCase2()
{
    RegExp re = new RegExp(".+\u0775");
    string input = "\ufadc\ufffd\ub80b\uda5a\udc68\uf234\u0056\uda5b\udcc1\ufffd\ufffd\u0775";
    Automaton automaton = re.ToAutomaton();
    CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
    ByteRunAutomaton bra = new ByteRunAutomaton(automaton);

    // The UTF-16 view of the input must be accepted.
    Assert.IsTrue(cra.Run(input));

    // Let the compiler infer the array type so the declaration always matches
    // whatever GetBytes returns and bra.Run expects.
    var bytes = input.GetBytes(Encoding.UTF8);

    // The original author flagged this assertion with "this one fails!".
    Assert.IsTrue(bra.Run(bytes, 0, bytes.Length));
}
/// <summary>
/// Builds an automaton from the pattern <c>(\鯺)*(.)*\Ӕ</c> and asserts that both the
/// character-based and the byte-based run automata accept a fixed sample input.
/// The sample contains surrogate pairs and U+FFFD and ends in U+04D4.
/// </summary>
public void TestSpecialCase3()
{
    RegExp re = new RegExp("(\\鯺)*(.)*\\Ӕ");
    string input = "\u5cfd\ufffd\ub2f7\u0033\ue304\u51d7\u3692\udb50\udfb3\u0576\udae2\udc62\u0053\u0449\u04d4";
    Automaton automaton = re.ToAutomaton();
    CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
    ByteRunAutomaton bra = new ByteRunAutomaton(automaton);

    // The UTF-16 view of the input must be accepted.
    Assert.IsTrue(cra.Run(input));

    // Let the compiler infer the array type so the declaration always matches
    // whatever GetBytes returns and bra.Run expects.
    var bytes = input.GetBytes(Encoding.UTF8);
    Assert.IsTrue(bra.Run(bytes, 0, bytes.Length));
}
/// <summary>
/// Compiles the pattern <c>.?</c> and checks that both the character-based and the
/// byte-based run automata accept the empty input.
/// </summary>
public void TestSpecialCase()
{
    var pattern = new RegExp(".?");
    var compiled = pattern.ToAutomaton();
    var charRunner = new CharacterRunAutomaton(compiled);
    var byteRunner = new ByteRunAutomaton(compiled);

    // Character automaton: the initial state is accepting, and the empty string
    // is accepted through both the string and the char[] overloads.
    bool charInitialAccepts = charRunner.IsAccept(charRunner.InitialState);
    Assert.IsTrue(charInitialAccepts);
    Assert.IsTrue(charRunner.Run(""));
    Assert.IsTrue(charRunner.Run(new char[0], 0, 0));

    // Byte automaton: the initial state is accepting, and an empty byte range
    // is accepted.
    bool byteInitialAccepts = byteRunner.IsAccept(byteRunner.InitialState);
    Assert.IsTrue(byteInitialAccepts);
    Assert.IsTrue(byteRunner.Run(new byte[0], 0, 0));
}
/// <summary>
/// Unions the given automata (after shuffling them), determinizes the result and
/// asserts that it is finite and accepts every term, both when run directly on the
/// string and when run as a <see cref="ByteRunAutomaton"/> over the term's UTF-8 bytes.
/// </summary>
/// <param name="a">Automata to union into a single lexicon automaton.</param>
/// <param name="terms">Strings the lexicon automaton is expected to accept.</param>
public void AssertLexicon(List<Automaton> a, List<string> terms)
{
    var automata = CollectionsHelper.Shuffle(a);
    var lex = BasicOperations.Union(automata);
    lex.Determinize();
    Assert.IsTrue(SpecialOperations.IsFinite(lex));

    // Every term must be accepted by the string-level automaton.
    foreach (string s in terms)
    {
        Assert.IsTrue(BasicOperations.Run(lex, s));
    }

    // Every term must also be accepted through the byte-level automaton.
    var lexByte = new ByteRunAutomaton(lex);
    foreach (string s in terms)
    {
        // Let the compiler infer the array type so the declaration always matches
        // whatever GetBytes returns and lexByte.Run expects.
        var bytes = s.GetBytes(Encoding.UTF8);
        Assert.IsTrue(lexByte.Run(bytes, 0, bytes.Length));
    }
}
/// <summary>
/// Shuffles and unions the supplied automata, determinizes the union, then asserts
/// that the union is finite and accepts each term, first as a string and then as
/// UTF-8 bytes through a <see cref="ByteRunAutomaton"/>.
/// </summary>
/// <param name="a">Automata combined into the lexicon automaton.</param>
/// <param name="terms">Strings that must all be accepted.</param>
public void AssertLexicon(List<Automaton> a, List<string> terms)
{
    var shuffled = CollectionsHelper.Shuffle(a);
    var lexicon = BasicOperations.Union(shuffled);
    lexicon.Determinize();

    bool lexiconIsFinite = SpecialOperations.IsFinite(lexicon);
    Assert.IsTrue(lexiconIsFinite);

    // Pass 1: string-level acceptance.
    foreach (string term in terms)
    {
        bool accepted = BasicOperations.Run(lexicon, term);
        Assert.IsTrue(accepted);
    }

    // Pass 2: byte-level acceptance. The byte automaton is only built once the
    // whole first pass has succeeded.
    var byteLexicon = new ByteRunAutomaton(lexicon);
    foreach (string term in terms)
    {
        var utf8 = term.GetBytes(Encoding.UTF8);
        bool accepted = byteLexicon.Run(utf8, 0, utf8.Length);
        Assert.IsTrue(accepted);
    }
}
/// <summary>
/// Shuffles the <c>automata</c> collection, unions it, determinizes the result and
/// asserts that it is finite and accepts every entry of <c>terms</c>, both when run
/// directly on the string and when run as a <see cref="ByteRunAutomaton"/> over the
/// term's UTF-8 bytes. Both <c>automata</c> and <c>terms</c> are declared outside
/// this method.
/// </summary>
public void AssertLexicon()
{
    Collections.Shuffle(automata, Random());
    var lex = BasicOperations.Union(automata);
    lex.Determinize();
    Assert.IsTrue(SpecialOperations.IsFinite(lex));

    // Every term must be accepted by the string-level automaton.
    // Assert.IsTrue is used throughout so the method has a single assertion style.
    foreach (string s in terms)
    {
        Assert.IsTrue(BasicOperations.Run(lex, s));
    }

    // Every term must also be accepted through the byte-level automaton.
    var lexByte = new ByteRunAutomaton(lex);
    foreach (string s in terms)
    {
        var bytes = s.GetBytes(Encoding.UTF8);
        Assert.IsTrue(lexByte.Run(bytes, 0, bytes.Length));
    }
}
/// <summary>
/// Checks that a <see cref="CharacterRunAutomaton"/> and a <see cref="ByteRunAutomaton"/>
/// built from the same automaton give the same verdict on <c>AtLeast(1000)</c> sample
/// strings. Each sample is either a random Unicode string or a string produced by
/// <c>RandomAcceptedStrings</c>.
/// </summary>
/// <param name="automaton">Automaton from which both run automata are built.</param>
private static void AssertAutomaton(Automaton automaton)
{
    var cra = new CharacterRunAutomaton(automaton);
    var bra = new ByteRunAutomaton(automaton);
    var ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);

    int num = AtLeast(1000);
    for (int i = 0; i < num; i++)
    {
        string s;
        if (Random().NextBoolean())
        {
            // likely not accepted
            s = TestUtil.RandomUnicodeString(Random());
        }
        else
        {
            // will be accepted
            int[] codepoints = ras.GetRandomAcceptedString(Random());
            try
            {
                s = UnicodeUtil.NewString(codepoints, 0, codepoints.Length);
            }
            catch (Exception)
            {
                // Dump the offending code points (hex) to help diagnose the failure.
                Console.WriteLine(codepoints.Length + " codepoints:");
                for (int j = 0; j < codepoints.Length; j++)
                {
                    Console.WriteLine(" " + codepoints[j].ToString("x"));
                }

                // Bare rethrow keeps the original stack trace; "throw e;" would reset it.
                throw;
            }
        }

        var bytes = s.GetBytes(Encoding.UTF8);
        Assert.AreEqual(cra.Run(s), bra.Run(bytes, 0, bytes.Length));
    }
}
/// <summary>
/// Verifies that the character-based and byte-based run automata derived from
/// <paramref name="automaton"/> agree on <c>AtLeast(1000)</c> sample strings. Samples
/// alternate at random between arbitrary Unicode strings and strings drawn from
/// <c>RandomAcceptedStrings</c>.
/// </summary>
/// <param name="automaton">Automaton from which both run automata are built.</param>
private static void AssertAutomaton(Automaton automaton)
{
    var charRunner = new CharacterRunAutomaton(automaton);
    var byteRunner = new ByteRunAutomaton(automaton);
    var acceptedSource = new RandomAcceptedStrings(automaton);

    int remaining = AtLeast(1000);
    while (remaining-- > 0)
    {
        string sample;
        if (!Random.NextBoolean())
        {
            // Sample that the automaton will accept.
            int[] codepoints = acceptedSource.GetRandomAcceptedString(Random);
            try
            {
                sample = UnicodeUtil.NewString(codepoints, 0, codepoints.Length);
            }
            catch (Exception)
            {
                // Print the code points in hex before propagating the failure.
                Console.WriteLine(codepoints.Length + " codepoints:");
                foreach (int codepoint in codepoints)
                {
                    Console.WriteLine(" " + codepoint.ToString("x"));
                }

                throw; // bare rethrow preserves the stack trace (CA2200)
            }
        }
        else
        {
            // Sample that is likely not accepted.
            sample = TestUtil.RandomUnicodeString(Random);
        }

        var utf8 = sample.GetBytes(Encoding.UTF8);
        Assert.AreEqual(charRunner.Run(sample), byteRunner.Run(utf8, 0, utf8.Length));
    }
}
/// <summary>
/// Exercises a byte automaton built for the code range
/// [<paramref name="startCode"/>, <paramref name="endCode"/>]. It first asserts that
/// <paramref name="iters"/> randomly chosen non-surrogate codes inside the range match,
/// then asserts that <paramref name="iters"/> randomly chosen non-surrogate codes
/// outside the range do not match.
/// </summary>
/// <param name="r">Random source used for every draw.</param>
/// <param name="a">Automaton under test.</param>
/// <param name="startCode">First code of the range (inclusive).</param>
/// <param name="endCode">Last code of the range (inclusive).</param>
/// <param name="iters">Number of checked samples for each of the two phases.</param>
private void TestOne(Random r, ByteRunAutomaton a, int startCode, int endCode, int iters)
{
    int rangeSize = endCode - startCode + 1;

    // Work out how many codes of the range must be skipped because of surrogates,
    // and whether the skipped stretch begins after startCode.
    int skipped;
    bool ovSurStart;
    if (endCode < UnicodeUtil.UNI_SUR_HIGH_START || startCode > UnicodeUtil.UNI_SUR_LOW_END)
    {
        // no overlap w/ surrogates
        skipped = 0;
        ovSurStart = false;
    }
    else if (IsSurrogate(startCode))
    {
        // start of range overlaps surrogates
        skipped = UnicodeUtil.UNI_SUR_LOW_END - startCode + 1;
        ovSurStart = false;
    }
    else if (IsSurrogate(endCode))
    {
        // end of range overlaps surrogates
        skipped = endCode - UnicodeUtil.UNI_SUR_HIGH_START + 1;
        ovSurStart = true;
    }
    else
    {
        // range completely subsumes surrogates
        skipped = UnicodeUtil.UNI_SUR_LOW_END - UnicodeUtil.UNI_SUR_HIGH_START + 1;
        ovSurStart = true;
    }

    int nonSurrogateCount = rangeSize - skipped;
    Debug.Assert(nonSurrogateCount > 0);

    // Phase 1: codes inside the range must be accepted.
    for (int iter = 0; iter < iters; iter++)
    {
        // pick random code point in-range
        int code = startCode + r.Next(nonSurrogateCount);
        if (IsSurrogate(code))
        {
            // Remap a surrogate draw to just past the low-surrogate block, offset by
            // its distance from whichever base applies to this range.
            int remapBase = ovSurStart ? UnicodeUtil.UNI_SUR_HIGH_START : startCode;
            code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - remapBase);
        }

        Debug.Assert(code >= startCode && code <= endCode, "code=" + code + " start=" + startCode + " end=" + endCode);
        Debug.Assert(!IsSurrogate(code));

        Assert.IsTrue(Matches(a, code), "DFA for range " + startCode + "-" + endCode + " failed to match code=" + code);
    }

    // Phase 2: codes outside the range must be rejected.
    int invalidRange = MAX_UNICODE - rangeSize;
    if (invalidRange <= 0)
    {
        return;
    }

    int verified = 0;
    while (verified < iters)
    {
        int x = TestUtil.NextInt(r, 0, invalidRange - 1);

        // Values at or above startCode are shifted past endCode so the draw never
        // lands inside the valid range.
        int code = x >= startCode ? endCode + 1 + x - startCode : x;

        bool inHighBlock = code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END;
        bool inLowBlock = code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END;
        if (inHighBlock || inLowBlock)
        {
            // Surrogate draw: retry without counting this attempt.
            continue;
        }

        Assert.IsFalse(Matches(a, code), "DFA for range " + startCode + "-" + endCode + " matched invalid code=" + code);
        verified++;
    }
}
/// <summary>
/// Converts <paramref name="code"/> to UTF-16 chars, encodes them as UTF-8 into the
/// <c>b</c> buffer declared outside this method, and runs <paramref name="a"/> over
/// the first <c>b.Length</c> bytes of <c>b.Bytes</c>.
/// </summary>
/// <param name="a">Byte automaton to run.</param>
/// <param name="code">Code to encode and test.</param>
/// <returns>The result of <c>a.Run</c> on the encoded bytes.</returns>
private bool Matches(ByteRunAutomaton a, int code)
{
    char[] utf16 = Character.ToChars(code);
    UnicodeUtil.UTF16toUTF8(utf16, 0, utf16.Length, b);

    bool accepted = a.Run(b.Bytes, 0, b.Length);
    return accepted;
}
/// <summary>
/// Tests a single code against a byte automaton. The code is expanded to UTF-16
/// chars, encoded as UTF-8 into the <c>b</c> buffer declared outside this method,
/// and the automaton is run over <c>b.Bytes</c> from offset 0 for <c>b.Length</c> bytes.
/// </summary>
/// <param name="a">Byte automaton to run.</param>
/// <param name="code">Code to encode and test.</param>
/// <returns>Whatever <c>a.Run</c> reports for the encoded bytes.</returns>
private bool Matches(ByteRunAutomaton a, int code)
{
    var utf16Units = Character.ToChars(code);
    int unitCount = utf16Units.Length;

    // Fill the shared buffer with the UTF-8 form of the code.
    UnicodeUtil.UTF16toUTF8(utf16Units, 0, unitCount, b);

    return a.Run(b.Bytes, 0, b.Length);
}
/// <summary>
/// Compiles the pattern <c>(\鯺)*(.)*\Ӕ</c> and asserts that a fixed sample input is
/// accepted both by the character-based run automaton and, as UTF-8 bytes, by the
/// byte-based run automaton. The sample contains surrogate pairs and U+FFFD and
/// ends in U+04D4.
/// </summary>
public void TestSpecialCase3()
{
    const string sample = "\u5cfd\ufffd\ub2f7\u0033\ue304\u51d7\u3692\udb50\udfb3\u0576\udae2\udc62\u0053\u0449\u04d4";

    var pattern = new RegExp("(\\鯺)*(.)*\\Ӕ");
    var compiled = pattern.ToAutomaton();
    var charRunner = new CharacterRunAutomaton(compiled);
    var byteRunner = new ByteRunAutomaton(compiled);

    // UTF-16 view.
    bool acceptedAsChars = charRunner.Run(sample);
    Assert.IsTrue(acceptedAsChars);

    // UTF-8 view.
    var utf8 = sample.GetBytes(Encoding.UTF8);
    bool acceptedAsBytes = byteRunner.Run(utf8, 0, utf8.Length);
    Assert.IsTrue(acceptedAsBytes);
}
/// <summary>
/// Compiles the pattern <c>.+\u0775</c> and asserts that a fixed sample input is
/// accepted both by the character-based run automaton and, as UTF-8 bytes, by the
/// byte-based run automaton. The sample contains surrogate pairs and U+FFFD and
/// ends in U+0775.
/// </summary>
public void TestSpecialCase2()
{
    const string sample = "\ufadc\ufffd\ub80b\uda5a\udc68\uf234\u0056\uda5b\udcc1\ufffd\ufffd\u0775";

    var pattern = new RegExp(".+\u0775");
    var compiled = pattern.ToAutomaton();
    var charRunner = new CharacterRunAutomaton(compiled);
    var byteRunner = new ByteRunAutomaton(compiled);

    // UTF-16 view.
    bool acceptedAsChars = charRunner.Run(sample);
    Assert.IsTrue(acceptedAsChars);

    // UTF-8 view. The original author flagged this check with "this one fails!".
    var utf8 = sample.GetBytes(Encoding.UTF8);
    bool acceptedAsBytes = byteRunner.Run(utf8, 0, utf8.Length);
    Assert.IsTrue(acceptedAsBytes);
}
/// <summary>
/// Compiles <paramref name="automaton"/>. When <paramref name="simplify"/> is set, the
/// automaton is first tested for four simple forms (NONE, ALL, SINGLE, PREFIX); if one
/// matches, only <c>Type</c> and <c>Term</c> are populated and no run automaton is built.
/// Otherwise the automaton is converted with <c>UTF32ToUTF8</c> and a
/// <see cref="ByteRunAutomaton"/> plus its sorted transitions are stored (NORMAL).
/// </summary>
/// <param name="automaton">Automaton to compile.</param>
/// <param name="finite">
/// Finiteness of the automaton if already known; when <c>null</c> it is computed with
/// <c>SpecialOperations.IsFinite</c>. Only consulted on the NORMAL path.
/// </param>
/// <param name="simplify">Whether to test for the simple forms before compiling.</param>
public CompiledAutomaton(Automaton automaton, bool? finite, bool simplify)
{
    if (simplify)
    {
        // Look for a "simple" form first and, if one is found, skip building the
        // run automaton. On a large automaton these checks could be costly.

        if (BasicOperations.IsEmpty(automaton))
        {
            // Accepts nothing.
            Type = AUTOMATON_TYPE.NONE;
            Term = null;
            CommonSuffixRef = null;
            RunAutomaton = null;
            sortedTransitions = null;
            this.Finite = null;
            return;
        }

        if (BasicOperations.IsTotal(automaton))
        {
            // Accepts every possible string.
            Type = AUTOMATON_TYPE.ALL;
            Term = null;
            CommonSuffixRef = null;
            RunAutomaton = null;
            sortedTransitions = null;
            this.Finite = null;
            return;
        }

        string commonPrefix;
        string singleton;
        if (automaton.Singleton != null)
        {
            // Already in singleton representation; no prefix is computed.
            commonPrefix = null;
            singleton = automaton.Singleton;
        }
        else
        {
            // Expanded representation: it is still a single string when its language
            // equals that of its own (non-empty) common prefix.
            commonPrefix = SpecialOperations.GetCommonPrefix(automaton);
            bool prefixIsWholeLanguage = commonPrefix.Length > 0
                && BasicOperations.SameLanguage(automaton, BasicAutomata.MakeString(commonPrefix));
            singleton = prefixIsWholeLanguage ? commonPrefix : null;
        }

        if (singleton != null)
        {
            // Accepts one fixed string (singleton or expanded representation).
            Type = AUTOMATON_TYPE.SINGLE;
            Term = new BytesRef(singleton);
            CommonSuffixRef = null;
            RunAutomaton = null;
            sortedTransitions = null;
            this.Finite = null;
            return;
        }

        var prefixThenAnything = BasicOperations.Concatenate(
            BasicAutomata.MakeString(commonPrefix),
            BasicAutomata.MakeAnyString());
        if (BasicOperations.SameLanguage(automaton, prefixThenAnything))
        {
            // Accepts a constant prefix followed by anything.
            Type = AUTOMATON_TYPE.PREFIX;
            Term = new BytesRef(commonPrefix);
            CommonSuffixRef = null;
            RunAutomaton = null;
            sortedTransitions = null;
            this.Finite = null;
            return;
        }
    }

    // General case: build the byte-level run automaton.
    Type = AUTOMATON_TYPE.NORMAL;
    Term = null;
    this.Finite = finite ?? SpecialOperations.IsFinite(automaton);

    var converter = new UTF32ToUTF8();
    Automaton utf8 = converter.Convert(automaton);

    // The common suffix is only computed when the automaton is not known to be finite.
    CommonSuffixRef = this.Finite == true
        ? null
        : SpecialOperations.GetCommonSuffixBytesRef(utf8);

    RunAutomaton = new ByteRunAutomaton(utf8, true);
    sortedTransitions = utf8.GetSortedTransitions();
}
/// <summary>
/// Compiles <paramref name="automaton"/>. When <paramref name="simplify"/> is set, the
/// automaton is first tested for four simple forms (NONE, ALL, SINGLE, PREFIX); if one
/// matches, only <c>Type</c> and <c>Term</c> are populated and no run automaton is built.
/// Otherwise the automaton is converted with <c>UTF32ToUTF8</c> and a
/// <see cref="ByteRunAutomaton"/> plus its sorted transitions are stored (NORMAL).
/// </summary>
/// <param name="automaton">Automaton to compile.</param>
/// <param name="finite">
/// Finiteness of the automaton if already known; when <c>null</c> it is computed with
/// <c>SpecialOperations.IsFinite</c>. Only consulted on the NORMAL path.
/// </param>
/// <param name="simplify">Whether to test for the simple forms before compiling.</param>
public CompiledAutomaton(Automaton automaton, bool? finite, bool simplify)
{
    if (simplify)
    {
        // Look for a "simple" form first and, if one is found, skip building the
        // run automaton. On a large automaton these checks could be costly.

        if (BasicOperations.IsEmpty(automaton))
        {
            // Accepts nothing.
            Type = AUTOMATON_TYPE.NONE;
            Term = null;
            CommonSuffixRef = null;
            RunAutomaton = null;
            SortedTransitions = null;
            this.Finite = null;
            return;
        }

        if (BasicOperations.IsTotal(automaton))
        {
            // Accepts every possible string.
            Type = AUTOMATON_TYPE.ALL;
            Term = null;
            CommonSuffixRef = null;
            RunAutomaton = null;
            SortedTransitions = null;
            this.Finite = null;
            return;
        }

        string commonPrefix;
        string singleton;
        if (automaton.Singleton != null)
        {
            // Already in singleton representation; no prefix is computed.
            commonPrefix = null;
            singleton = automaton.Singleton;
        }
        else
        {
            // Expanded representation: it is still a single string when its language
            // equals that of its own (non-empty) common prefix.
            commonPrefix = SpecialOperations.GetCommonPrefix(automaton);
            bool prefixIsWholeLanguage = commonPrefix.Length > 0
                && BasicOperations.SameLanguage(automaton, BasicAutomata.MakeString(commonPrefix));
            singleton = prefixIsWholeLanguage ? commonPrefix : null;
        }

        if (singleton != null)
        {
            // Accepts one fixed string (singleton or expanded representation).
            Type = AUTOMATON_TYPE.SINGLE;
            Term = new BytesRef(singleton);
            CommonSuffixRef = null;
            RunAutomaton = null;
            SortedTransitions = null;
            this.Finite = null;
            return;
        }

        var prefixThenAnything = BasicOperations.Concatenate(
            BasicAutomata.MakeString(commonPrefix),
            BasicAutomata.MakeAnyString());
        if (BasicOperations.SameLanguage(automaton, prefixThenAnything))
        {
            // Accepts a constant prefix followed by anything.
            Type = AUTOMATON_TYPE.PREFIX;
            Term = new BytesRef(commonPrefix);
            CommonSuffixRef = null;
            RunAutomaton = null;
            SortedTransitions = null;
            this.Finite = null;
            return;
        }
    }

    // General case: build the byte-level run automaton.
    Type = AUTOMATON_TYPE.NORMAL;
    Term = null;
    this.Finite = finite ?? SpecialOperations.IsFinite(automaton);

    var converter = new UTF32ToUTF8();
    Automaton utf8 = converter.Convert(automaton);

    // The common suffix is only computed when the automaton is not known to be finite.
    CommonSuffixRef = this.Finite == true
        ? null
        : SpecialOperations.GetCommonSuffixBytesRef(utf8);

    RunAutomaton = new ByteRunAutomaton(utf8, true);
    SortedTransitions = utf8.SortedTransitions;
}