/// <summary>
/// Builds Levenshtein automata for <paramref name="s"/> at every edit distance from
/// 0 through <paramref name="maxDistance"/>, both without and with transpositions,
/// and asserts a set of structural and language properties on each of them.
/// </summary>
/// <param name="s">The string the automata are built from.</param>
/// <param name="maxDistance">The largest edit distance to build and check (inclusive).</param>
private void AssertLev(string s, int maxDistance)
{
    LevenshteinAutomata plainBuilder = new LevenshteinAutomata(s, false);
    LevenshteinAutomata transBuilder = new LevenshteinAutomata(s, true);

    // One slot per distance 0..maxDistance; earlier slots are needed for the subset checks.
    Automaton[] lev = new Automaton[maxDistance + 1];
    Automaton[] levT = new Automaton[maxDistance + 1];

    for (int n = 0; n <= maxDistance; n++)
    {
        Automaton current = plainBuilder.ToAutomaton(n);
        Automaton currentT = transBuilder.ToAutomaton(n);
        lev[n] = current;
        levT[n] = currentT;

        // Structural sanity: built, deterministic, finite, no detached states.
        Assert.IsNotNull(current);
        Assert.IsNotNull(currentT);
        Assert.IsTrue(current.Deterministic);
        Assert.IsTrue(currentT.Deterministic);
        Assert.IsTrue(SpecialOperations.IsFinite(current));
        Assert.IsTrue(SpecialOperations.IsFinite(currentT));
        AutomatonTestUtil.AssertNoDetachedStates(current);
        AutomatonTestUtil.AssertNoDetachedStates(currentT);

        // The automata for distance n-1 must accept a subset of those for distance n.
        if (n > 0)
        {
            Automaton previous = lev[n - 1];
            Automaton previousT = levT[n - 1];
            Assert.IsTrue(previous.SubsetOf(current));
            Assert.IsTrue(previous.SubsetOf(currentT));
            Assert.IsTrue(previousT.SubsetOf(current));
            Assert.IsTrue(previousT.SubsetOf(currentT));
            Assert.AreNotSame(previous, current);
        }

        // Lev(n) must be a subset of LevT(n).
        Assert.IsTrue(current.SubsetOf(currentT));

        // Distance-specific language checks.
        if (n == 0)
        {
            // Distance 0 matches exactly the string itself.
            Assert.IsTrue(BasicOperations.SameLanguage(BasicAutomata.MakeString(s), current));
            Assert.IsTrue(BasicOperations.SameLanguage(BasicAutomata.MakeString(s), currentT));
        }
        else if (n == 1)
        {
            // Compare against the naively generated distance-1 automata.
            Assert.IsTrue(BasicOperations.SameLanguage(NaiveLev1(s), current));
            Assert.IsTrue(BasicOperations.SameLanguage(NaiveLev1T(s), currentT));
        }
        else
        {
            AssertBruteForce(s, current, n);
            AssertBruteForceT(s, currentT, n);
        }
    }
}
/// <summary>
/// Converts <paramref name="automaton"/> into an automaton that also accepts fuzzy
/// variants of its accepted strings.
/// <para>
/// Each finite string of the input is handled separately. Strings whose length is
/// at most <c>nonFuzzyPrefix</c>, or less than <c>minFuzzyLength</c>, are kept as
/// exact-match automata. For every other string the first <c>nonFuzzyPrefix</c>
/// symbols stay exact and the remainder is replaced by a Levenshtein automaton
/// built with <c>maxEdits</c> and <c>transpositions</c>.
/// </para>
/// </summary>
/// <param name="automaton">The automaton whose finite strings are expanded.</param>
/// <returns>
/// An empty automaton when the input has no finite strings, the single per-string
/// automaton when there is exactly one, otherwise the union of all per-string
/// automata after <c>BasicOperations.Determinize</c> has been applied to it.
/// </returns>
internal Automaton ToLevenshteinAutomata(Automaton automaton)
{
    // Highest Unicode code point (U+10FFFF). System.Char exposes no MAX_CODE_POINT
    // member, so the value is spelled out here.
    const int MaxCodePoint = 0x10FFFF;

    // NOTE(review): -1 is presumably "no limit" on the number of strings - confirm
    // against SpecialOperations.GetFiniteStrings.
    var @ref = SpecialOperations.GetFiniteStrings(automaton, -1);
    Automaton[] subs = new Automaton[@ref.Count];
    int upto = 0;
    foreach (IntsRef path in @ref)
    {
        if (path.Length <= nonFuzzyPrefix || path.Length < minFuzzyLength)
        {
            // Too short for fuzzy matching: accept this path exactly.
            subs[upto] = BasicAutomata.MakeString(path.Ints, path.Offset, path.Length);
            upto++;
        }
        else
        {
            // Exact-match part: the first nonFuzzyPrefix symbols of the path.
            Automaton prefix = BasicAutomata.MakeString(path.Ints, path.Offset, nonFuzzyPrefix);

            // Fuzzy part: everything after the exact prefix.
            int[] ints = new int[path.Length - nonFuzzyPrefix];
            Array.Copy(path.Ints, path.Offset + nonFuzzyPrefix, ints, 0, ints.Length);

            // TODO: maybe add alphaMin to LevenshteinAutomata,
            // and pass 1 instead of 0? We probably don't want
            // to allow the trailing dedup bytes to be
            // edited... but then 0 byte is "in general" allowed
            // on input (but not in UTF8).
            LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? MaxCodePoint : 255, transpositions);
            Automaton levAutomaton = lev.ToAutomaton(maxEdits);
            Automaton combined = BasicOperations.Concatenate(Arrays.AsList(prefix, levAutomaton));
            combined.Deterministic = true; // its like the special case in concatenate itself, except we cloneExpanded already
            subs[upto] = combined;
            upto++;
        }
    }

    if (subs.Length == 0)
    {
        // automaton is empty, there is no accepted paths through it
        return BasicAutomata.MakeEmpty(); // matches nothing
    }
    else if (subs.Length == 1)
    {
        // no synonyms or anything: just a single path through the tokenstream
        return subs[0];
    }
    else
    {
        // multiple paths: this is really scary! is it slow?
        // maybe we should not do this and throw UOE?
        Automaton a = BasicOperations.Union(Arrays.AsList(subs));
        // TODO: we could call toLevenshteinAutomata() before det?
        // this only happens if you have multiple paths anyway (e.g. synonyms)
        BasicOperations.Determinize(a);
        return a;
    }
}