Beispiel #1
0
        // FST is pruned
        private void VerifyPruned(int inputMode, FST <T> fst, int prune1, int prune2)
        {
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: now verify pruned " + Pairs.Count + " terms; outputs=" + Outputs);
                foreach (InputOutput <T> pair in Pairs)
                {
                    Console.WriteLine("  " + InputToString(inputMode, pair.Input) + ": " + Outputs.OutputToString(pair.Output));
                }
            }

            // To validate the FST, we brute-force compute all prefixes
            // in the terms, matched to their "common" outputs, prune that
            // set according to the prune thresholds, then assert the FST
            // matches that same set.

            // NOTE: Crazy RAM intensive!!

            //System.out.println("TEST: tally prefixes");

            // build all prefixes
            IDictionary <IntsRef, CountMinOutput <T> > prefixes = new Dictionary <IntsRef, CountMinOutput <T> >();
            IntsRef scratch = new IntsRef(10);

            foreach (InputOutput <T> pair in Pairs)
            {
                scratch.CopyInts(pair.Input);
                for (int idx = 0; idx <= pair.Input.Length; idx++)
                {
                    scratch.Length = idx;
                    CountMinOutput <T> cmo = prefixes[scratch];
                    if (cmo == null)
                    {
                        cmo        = new CountMinOutput <T>();
                        cmo.Count  = 1;
                        cmo.Output = pair.Output;
                        prefixes[IntsRef.DeepCopyOf(scratch)] = cmo;
                    }
                    else
                    {
                        cmo.Count++;
                        T output1 = cmo.Output;
                        if (output1.Equals(Outputs.NoOutput))
                        {
                            output1 = Outputs.NoOutput;
                        }
                        T output2 = pair.Output;
                        if (output2.Equals(Outputs.NoOutput))
                        {
                            output2 = Outputs.NoOutput;
                        }
                        cmo.Output = Outputs.Common(output1, output2);
                    }
                    if (idx == pair.Input.Length)
                    {
                        cmo.IsFinal     = true;
                        cmo.FinalOutput = cmo.Output;
                    }
                }
            }

            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: now prune");
            }

            // prune 'em
            IEnumerator <KeyValuePair <IntsRef, CountMinOutput <T> > > it = prefixes.GetEnumerator();

            while (it.MoveNext())
            {
                KeyValuePair <IntsRef, CountMinOutput <T> > ent = it.Current;
                IntsRef            prefix = ent.Key;
                CountMinOutput <T> cmo    = ent.Value;
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine("  term prefix=" + InputToString(inputMode, prefix, false) + " count=" + cmo.Count + " isLeaf=" + cmo.IsLeaf + " output=" + Outputs.OutputToString(cmo.Output) + " isFinal=" + cmo.IsFinal);
                }
                bool keep;
                if (prune1 > 0)
                {
                    keep = cmo.Count >= prune1;
                }
                else
                {
                    Debug.Assert(prune2 > 0);
                    if (prune2 > 1 && cmo.Count >= prune2)
                    {
                        keep = true;
                    }
                    else if (prefix.Length > 0)
                    {
                        // consult our parent
                        scratch.Length = prefix.Length - 1;
                        Array.Copy(prefix.Ints, prefix.Offset, scratch.Ints, 0, scratch.Length);
                        CountMinOutput <T> cmo2 = prefixes[scratch];
                        //System.out.println("    parent count = " + (cmo2 == null ? -1 : cmo2.count));
                        keep = cmo2 != null && ((prune2 > 1 && cmo2.Count >= prune2) || (prune2 == 1 && (cmo2.Count >= 2 || prefix.Length <= 1)));
                    }
                    else if (cmo.Count >= prune2)
                    {
                        keep = true;
                    }
                    else
                    {
                        keep = false;
                    }
                }

                if (!keep)
                {
                    it.Reset();
                    //System.out.println("    remove");
                }
                else
                {
                    // clear isLeaf for all ancestors
                    //System.out.println("    keep");
                    scratch.CopyInts(prefix);
                    scratch.Length--;
                    while (scratch.Length >= 0)
                    {
                        CountMinOutput <T> cmo2 = prefixes[scratch];
                        if (cmo2 != null)
                        {
                            //System.out.println("    clear isLeaf " + inputToString(inputMode, scratch));
                            cmo2.IsLeaf = false;
                        }
                        scratch.Length--;
                    }
                }
            }

            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: after prune");
                foreach (KeyValuePair <IntsRef, CountMinOutput <T> > ent in prefixes)
                {
                    Console.WriteLine("  " + InputToString(inputMode, ent.Key, false) + ": isLeaf=" + ent.Value.IsLeaf + " isFinal=" + ent.Value.IsFinal);
                    if (ent.Value.IsFinal)
                    {
                        Console.WriteLine("    finalOutput=" + Outputs.OutputToString(ent.Value.FinalOutput));
                    }
                }
            }

            if (prefixes.Count <= 1)
            {
                Assert.IsNull(fst);
                return;
            }

            Assert.IsNotNull(fst);

            // make sure FST only enums valid prefixes
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: check pruned enum");
            }
            IntsRefFSTEnum <T> fstEnum = new IntsRefFSTEnum <T>(fst);

            IntsRefFSTEnum <T> .InputOutput <T> current;
            while ((current = fstEnum.Next()) != null)
            {
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine("  fstEnum.next prefix=" + InputToString(inputMode, current.Input, false) + " output=" + Outputs.OutputToString(current.Output));
                }
                CountMinOutput <T> cmo = prefixes[current.Input];
                Assert.IsNotNull(cmo);
                Assert.IsTrue(cmo.IsLeaf || cmo.IsFinal);
                //if (cmo.isFinal && !cmo.isLeaf) {
                if (cmo.IsFinal)
                {
                    Assert.AreEqual(cmo.FinalOutput, current.Output);
                }
                else
                {
                    Assert.AreEqual(cmo.Output, current.Output);
                }
            }

            // make sure all non-pruned prefixes are present in the FST
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: verify all prefixes");
            }
            int[] stopNode = new int[1];
            foreach (KeyValuePair <IntsRef, CountMinOutput <T> > ent in prefixes)
            {
                if (ent.Key.Length > 0)
                {
                    CountMinOutput <T> cmo = ent.Value;
                    T output = Run(fst, ent.Key, stopNode);
                    if (LuceneTestCase.VERBOSE)
                    {
                        Console.WriteLine("TEST: verify prefix=" + InputToString(inputMode, ent.Key, false) + " output=" + Outputs.OutputToString(cmo.Output));
                    }
                    // if (cmo.isFinal && !cmo.isLeaf) {
                    if (cmo.IsFinal)
                    {
                        Assert.AreEqual(cmo.FinalOutput, output);
                    }
                    else
                    {
                        Assert.AreEqual(cmo.Output, output);
                    }
                    Assert.AreEqual(ent.Key.Length, stopNode[0]);
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>,
        /// accumulating the <see cref="FST"/> end node and output for each path.
        /// </summary>
        public static List <Path <T> > IntersectPrefixPaths <T>(Automaton a, FST <T> fst)
        {
            Debug.Assert(a.Deterministic);
            IList <Path <T> > queue    = new List <Path <T> >();
            List <Path <T> >  endNodes = new List <Path <T> >();

            queue.Add(new Path <T>(a.InitialState, fst.GetFirstArc(new FST.Arc <T>()), fst.Outputs.NoOutput, new IntsRef()));

            FST.Arc <T>     scratchArc = new FST.Arc <T>();
            FST.BytesReader fstReader  = fst.BytesReader;

            while (queue.Count != 0)
            {
                Path <T> path = queue.ElementAt(queue.Count - 1);
                queue.Remove(path);
                if (path.state.Accept)
                {
                    endNodes.Add(path);
                    // we can stop here if we accept this path,
                    // we accept all further paths too
                    continue;
                }

                IntsRef currentInput = path.input;
                foreach (Transition t in path.state.Transitions)
                {
                    int min = t.Min;
                    int max = t.Max;
                    if (min == max)
                    {
                        FST.Arc <T> nextArc = fst.FindTargetArc(t.Min, path.fstNode, scratchArc, fstReader);
                        if (nextArc != null)
                        {
                            IntsRef newInput = new IntsRef(currentInput.Length + 1);
                            newInput.CopyInts(currentInput);
                            newInput.Ints[currentInput.Length] = t.Min;
                            newInput.Length = currentInput.Length + 1;
                            queue.Add(new Path <T>(t.Dest, new FST.Arc <T>()
                                                   .CopyFrom(nextArc), fst.Outputs.Add(path.output, nextArc.Output), newInput));
                        }
                    }
                    else
                    {
                        // TODO: if this transition's TO state is accepting, and
                        // it accepts the entire range possible in the FST (ie. 0 to 255),
                        // we can simply use the prefix as the accepted state instead of
                        // looking up all the ranges and terminate early
                        // here.  This just shifts the work from one queue
                        // (this one) to another (the completion search
                        // done in AnalyzingSuggester).

                        FST.Arc <T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.fstNode, scratchArc, fstReader);
                        while (nextArc != null && nextArc.Label <= max)
                        {
                            Debug.Assert(nextArc.Label <= max);
                            Debug.Assert(nextArc.Label >= min, nextArc.Label + " " + min);
                            IntsRef newInput = new IntsRef(currentInput.Length + 1);
                            newInput.CopyInts(currentInput);
                            newInput.Ints[currentInput.Length] = nextArc.Label;
                            newInput.Length = currentInput.Length + 1;
                            queue.Add(new Path <T>(t.Dest, new FST.Arc <T>()
                                                   .CopyFrom(nextArc), fst.Outputs.Add(path.output, nextArc.Output), newInput));
                            int label = nextArc.Label; // used in assert
                            nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader);
                            Debug.Assert(nextArc == null || label < nextArc.Label, "last: " + label + " next: " + (nextArc == null ? "" : nextArc.Label.ToString()));
                        }
                    }
                }
            }
            return(endNodes);
        }