Esempio n. 1
0
        /// <summary>
        /// Builds a DFA for some string, and checks all Lev automata
        /// up to some maximum distance.
        /// </summary>
        private void AssertLev(string s, int maxDistance)
        {
            LevenshteinAutomata builder  = new LevenshteinAutomata(s, false);
            LevenshteinAutomata tbuilder = new LevenshteinAutomata(s, true);

            Automaton[] automata  = new Automaton[maxDistance + 1];
            Automaton[] tautomata = new Automaton[maxDistance + 1];
            for (int n = 0; n < automata.Length; n++)
            {
                automata[n]  = builder.ToAutomaton(n);
                tautomata[n] = tbuilder.ToAutomaton(n);
                Assert.IsNotNull(automata[n]);
                Assert.IsNotNull(tautomata[n]);
                Assert.IsTrue(automata[n].Deterministic);
                Assert.IsTrue(tautomata[n].Deterministic);
                Assert.IsTrue(SpecialOperations.IsFinite(automata[n]));
                Assert.IsTrue(SpecialOperations.IsFinite(tautomata[n]));
                AutomatonTestUtil.AssertNoDetachedStates(automata[n]);
                AutomatonTestUtil.AssertNoDetachedStates(tautomata[n]);
                // check that the dfa for n-1 accepts a subset of the dfa for n
                if (n > 0)
                {
                    Assert.IsTrue(automata[n - 1].SubsetOf(automata[n]));
                    Assert.IsTrue(automata[n - 1].SubsetOf(tautomata[n]));
                    Assert.IsTrue(tautomata[n - 1].SubsetOf(automata[n]));
                    Assert.IsTrue(tautomata[n - 1].SubsetOf(tautomata[n]));
                    Assert.AreNotSame(automata[n - 1], automata[n]);
                }
                // check that Lev(N) is a subset of LevT(N)
                Assert.IsTrue(automata[n].SubsetOf(tautomata[n]));
                // special checks for specific n
                switch (n)
                {
                case 0:
                    // easy, matches the string itself
                    Assert.IsTrue(BasicOperations.SameLanguage(BasicAutomata.MakeString(s), automata[0]));
                    Assert.IsTrue(BasicOperations.SameLanguage(BasicAutomata.MakeString(s), tautomata[0]));
                    break;

                case 1:
                    // generate a lev1 naively, and check the accepted lang is the same.
                    Assert.IsTrue(BasicOperations.SameLanguage(NaiveLev1(s), automata[1]));
                    Assert.IsTrue(BasicOperations.SameLanguage(NaiveLev1T(s), tautomata[1]));
                    break;

                default:
                    AssertBruteForce(s, automata[n], n);
                    AssertBruteForceT(s, tautomata[n], n);
                    break;
                }
            }
        }
        /// <summary>
        /// Builds a DFA for some string, and checks all Lev automata
        /// up to some maximum distance.
        /// </summary>
        private void AssertLev(string s, int maxDistance)
        {
            LevenshteinAutomata builder = new LevenshteinAutomata(s, false);
            LevenshteinAutomata tbuilder = new LevenshteinAutomata(s, true);
            Automaton[] automata = new Automaton[maxDistance + 1];
            Automaton[] tautomata = new Automaton[maxDistance + 1];
            for (int n = 0; n < automata.Length; n++)
            {
                automata[n] = builder.ToAutomaton(n);
                tautomata[n] = tbuilder.ToAutomaton(n);
                Assert.IsNotNull(automata[n]);
                Assert.IsNotNull(tautomata[n]);
                Assert.IsTrue(automata[n].Deterministic);
                Assert.IsTrue(tautomata[n].Deterministic);
                Assert.IsTrue(SpecialOperations.IsFinite(automata[n]));
                Assert.IsTrue(SpecialOperations.IsFinite(tautomata[n]));
                AutomatonTestUtil.AssertNoDetachedStates(automata[n]);
                AutomatonTestUtil.AssertNoDetachedStates(tautomata[n]);
                // check that the dfa for n-1 accepts a subset of the dfa for n
                if (n > 0)
                {
                    Assert.IsTrue(automata[n - 1].SubsetOf(automata[n]));
                    Assert.IsTrue(automata[n - 1].SubsetOf(tautomata[n]));
                    Assert.IsTrue(tautomata[n - 1].SubsetOf(automata[n]));
                    Assert.IsTrue(tautomata[n - 1].SubsetOf(tautomata[n]));
                    Assert.AreNotSame(automata[n - 1], automata[n]);
                }
                // check that Lev(N) is a subset of LevT(N)
                Assert.IsTrue(automata[n].SubsetOf(tautomata[n]));
                // special checks for specific n
                switch (n)
                {
                    case 0:
                        // easy, matches the string itself
                        Assert.IsTrue(BasicOperations.SameLanguage(BasicAutomata.MakeString(s), automata[0]));
                        Assert.IsTrue(BasicOperations.SameLanguage(BasicAutomata.MakeString(s), tautomata[0]));
                        break;

                    case 1:
                        // generate a lev1 naively, and check the accepted lang is the same.
                        Assert.IsTrue(BasicOperations.SameLanguage(NaiveLev1(s), automata[1]));
                        Assert.IsTrue(BasicOperations.SameLanguage(NaiveLev1T(s), tautomata[1]));
                        break;

                    default:
                        AssertBruteForce(s, automata[n], n);
                        AssertBruteForceT(s, tautomata[n], n);
                        break;
                }
            }
        }
Esempio n. 3
0
        internal Automaton ToLevenshteinAutomata(Automaton automaton)
        {
            var @ref = SpecialOperations.GetFiniteStrings(automaton, -1);
            Automaton[] subs = new Automaton[@ref.Count];
            int upto = 0;
            foreach (IntsRef path in @ref)
            {
                if (path.Length <= nonFuzzyPrefix || path.Length < minFuzzyLength)
                {
                    subs[upto] = BasicAutomata.MakeString(path.Ints, path.Offset, path.Length);
                    upto++;
                }
                else
                {
                    Automaton prefix = BasicAutomata.MakeString(path.Ints, path.Offset, nonFuzzyPrefix);
                    int[] ints = new int[path.Length - nonFuzzyPrefix];
                    Array.Copy(path.Ints, path.Offset + nonFuzzyPrefix, ints, 0, ints.Length);
                    // TODO: maybe add alphaMin to LevenshteinAutomata,
                    // and pass 1 instead of 0?  We probably don't want
                    // to allow the trailing dedup bytes to be
                    // edited... but then 0 byte is "in general" allowed
                    // on input (but not in UTF8).
                    LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? char.MAX_CODE_POINT : 255, transpositions);
                    Automaton levAutomaton = lev.ToAutomaton(maxEdits);
                    Automaton combined = BasicOperations.Concatenate(Arrays.AsList(prefix, levAutomaton));
                    combined.Deterministic = true; // its like the special case in concatenate itself, except we cloneExpanded already
                    subs[upto] = combined;
                    upto++;
                }
            }

            if (subs.Length == 0)
            {
                // automaton is empty, there is no accepted paths through it
                return BasicAutomata.MakeEmpty(); // matches nothing
            }
            else if (subs.Length == 1)
            {
                // no synonyms or anything: just a single path through the tokenstream
                return subs[0];
            }
            else
            {
                // multiple paths: this is really scary! is it slow?
                // maybe we should not do this and throw UOE?
                Automaton a = BasicOperations.Union(Arrays.AsList(subs));
                // TODO: we could call toLevenshteinAutomata() before det?
                // this only happens if you have multiple paths anyway (e.g. synonyms)
                BasicOperations.Determinize(a);

                return a;
            }
        }