// FST is pruned
/// <summary>
/// Verifies a pruned <see cref="FST{T}"/> against a brute-force model.
/// <para/>
/// Every prefix of every input in <c>Pairs</c> is tallied (occurrence count plus the
/// common output of all terms sharing the prefix), the tally is pruned using
/// <paramref name="prune1"/> / <paramref name="prune2"/>, and the FST is then asserted
/// to contain exactly the surviving prefixes with matching outputs.
/// </summary>
/// <param name="inputMode">Passed through to <c>InputToString</c>; only affects verbose output.</param>
/// <param name="fst">The FST to verify. Asserted to be <c>null</c> when at most one prefix survives pruning.</param>
/// <param name="prune1">When greater than zero, a prefix is kept only if its count is at least this value.</param>
/// <param name="prune2">Consulted only when <paramref name="prune1"/> is not positive; asserted to be positive in that case.</param>
private void VerifyPruned(int inputMode, FST<T> fst, int prune1, int prune2)
{
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: now verify pruned " + Pairs.Count + " terms; outputs=" + Outputs);
        foreach (InputOutput<T> pair in Pairs)
        {
            Console.WriteLine(" " + InputToString(inputMode, pair.Input) + ": " + Outputs.OutputToString(pair.Output));
        }
    }

    // To validate the FST, we brute-force compute all prefixes
    // in the terms, matched to their "common" outputs, prune that
    // set according to the prune thresholds, then assert the FST
    // matches that same set.

    // NOTE: Crazy RAM intensive!!

    // build all prefixes
    IDictionary<IntsRef, CountMinOutput<T>> prefixes = new Dictionary<IntsRef, CountMinOutput<T>>();
    IntsRef scratch = new IntsRef(10);
    foreach (InputOutput<T> pair in Pairs)
    {
        scratch.CopyInts(pair.Input);
        for (int idx = 0; idx <= pair.Input.Length; idx++)
        {
            scratch.Length = idx;

            // The dictionary indexer throws KeyNotFoundException for an unseen
            // prefix, so the lookup has to go through TryGetValue.
            CountMinOutput<T> cmo;
            if (!prefixes.TryGetValue(scratch, out cmo) || cmo == null)
            {
                cmo = new CountMinOutput<T>();
                cmo.Count = 1;
                cmo.Output = pair.Output;
                // scratch is mutated on every iteration, so store a private copy as the key
                prefixes[IntsRef.DeepCopyOf(scratch)] = cmo;
            }
            else
            {
                cmo.Count++;
                // swap anything equal to NoOutput for the NoOutput instance itself
                T output1 = cmo.Output;
                if (output1.Equals(Outputs.NoOutput))
                {
                    output1 = Outputs.NoOutput;
                }
                T output2 = pair.Output;
                if (output2.Equals(Outputs.NoOutput))
                {
                    output2 = Outputs.NoOutput;
                }
                cmo.Output = Outputs.Common(output1, output2);
            }

            if (idx == pair.Input.Length)
            {
                // the full term ends at this prefix
                cmo.IsFinal = true;
                cmo.FinalOutput = cmo.Output;
            }
        }
    }

    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: now prune");
    }

    // prune 'em
    // A dictionary cannot be modified while it is being enumerated (and
    // IEnumerator.Reset() only rewinds, it removes nothing), so walk a snapshot
    // of the entries and remove pruned prefixes from the live dictionary.
    // Parent/ancestor lookups below still see earlier removals immediately.
    List<KeyValuePair<IntsRef, CountMinOutput<T>>> snapshot = new List<KeyValuePair<IntsRef, CountMinOutput<T>>>(prefixes);
    foreach (KeyValuePair<IntsRef, CountMinOutput<T>> ent in snapshot)
    {
        IntsRef prefix = ent.Key;
        CountMinOutput<T> cmo = ent.Value;
        if (LuceneTestCase.VERBOSE)
        {
            Console.WriteLine(" term prefix=" + InputToString(inputMode, prefix, false) + " count=" + cmo.Count + " isLeaf=" + cmo.IsLeaf + " output=" + Outputs.OutputToString(cmo.Output) + " isFinal=" + cmo.IsFinal);
        }

        bool keep;
        if (prune1 > 0)
        {
            keep = cmo.Count >= prune1;
        }
        else
        {
            Debug.Assert(prune2 > 0);
            if (prune2 > 1 && cmo.Count >= prune2)
            {
                keep = true;
            }
            else if (prefix.Length > 0)
            {
                // consult our parent
                scratch.Length = prefix.Length - 1;
                Array.Copy(prefix.Ints, prefix.Offset, scratch.Ints, 0, scratch.Length);
                CountMinOutput<T> parent;
                prefixes.TryGetValue(scratch, out parent); // leaves parent null when absent
                keep = parent != null && ((prune2 > 1 && parent.Count >= prune2) || (prune2 == 1 && (parent.Count >= 2 || prefix.Length <= 1)));
            }
            else if (cmo.Count >= prune2)
            {
                keep = true;
            }
            else
            {
                keep = false;
            }
        }

        if (!keep)
        {
            prefixes.Remove(prefix);
        }
        else
        {
            // clear isLeaf for all ancestors
            scratch.CopyInts(prefix);
            scratch.Length--;
            while (scratch.Length >= 0)
            {
                CountMinOutput<T> ancestor;
                if (prefixes.TryGetValue(scratch, out ancestor) && ancestor != null)
                {
                    ancestor.IsLeaf = false;
                }
                scratch.Length--;
            }
        }
    }

    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: after prune");
        foreach (KeyValuePair<IntsRef, CountMinOutput<T>> ent in prefixes)
        {
            Console.WriteLine(" " + InputToString(inputMode, ent.Key, false) + ": isLeaf=" + ent.Value.IsLeaf + " isFinal=" + ent.Value.IsFinal);
            if (ent.Value.IsFinal)
            {
                Console.WriteLine(" finalOutput=" + Outputs.OutputToString(ent.Value.FinalOutput));
            }
        }
    }

    if (prefixes.Count <= 1)
    {
        Assert.IsNull(fst);
        return;
    }

    Assert.IsNotNull(fst);

    // make sure FST only enums valid prefixes
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: check pruned enum");
    }
    IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
    IntsRefFSTEnum<T>.InputOutput<T> current;
    while ((current = fstEnum.Next()) != null)
    {
        if (LuceneTestCase.VERBOSE)
        {
            Console.WriteLine(" fstEnum.next prefix=" + InputToString(inputMode, current.Input, false) + " output=" + Outputs.OutputToString(current.Output));
        }

        // TryGetValue so an unexpected prefix fails the assertion below
        // instead of surfacing as a KeyNotFoundException.
        CountMinOutput<T> cmo;
        prefixes.TryGetValue(current.Input, out cmo);
        Assert.IsNotNull(cmo);
        Assert.IsTrue(cmo.IsLeaf || cmo.IsFinal);
        if (cmo.IsFinal)
        {
            Assert.AreEqual(cmo.FinalOutput, current.Output);
        }
        else
        {
            Assert.AreEqual(cmo.Output, current.Output);
        }
    }

    // make sure all non-pruned prefixes are present in the FST
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: verify all prefixes");
    }
    int[] stopNode = new int[1];
    foreach (KeyValuePair<IntsRef, CountMinOutput<T>> ent in prefixes)
    {
        if (ent.Key.Length > 0)
        {
            CountMinOutput<T> cmo = ent.Value;
            T output = Run(fst, ent.Key, stopNode);
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: verify prefix=" + InputToString(inputMode, ent.Key, false) + " output=" + Outputs.OutputToString(cmo.Output));
            }
            if (cmo.IsFinal)
            {
                Assert.AreEqual(cmo.FinalOutput, output);
            }
            else
            {
                Assert.AreEqual(cmo.Output, output);
            }
            // Run reports through stopNode[0] how far it got; it must equal the prefix length
            Assert.AreEqual(ent.Key.Length, stopNode[0]);
        }
    }
}
/// <summary>
/// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>,
/// accumulating the <see cref="FST"/> end node and output for each path.
/// </summary>
/// <typeparam name="T">Output type of the FST.</typeparam>
/// <param name="a">Automaton to walk; asserted to be deterministic.</param>
/// <param name="fst">FST whose arcs are followed in lock-step with the automaton's transitions.</param>
/// <returns>
/// One <see cref="Path{T}"/> per automaton accept state reached while a matching FST arc
/// existed for every step; paths are not extended beyond the first accept state.
/// </returns>
public static List<Path<T>> IntersectPrefixPaths<T>(Automaton a, FST<T> fst)
{
    Debug.Assert(a.Deterministic);
    IList<Path<T>> queue = new List<Path<T>>();
    List<Path<T>> endNodes = new List<Path<T>>();
    queue.Add(new Path<T>(a.InitialState, fst.GetFirstArc(new FST.Arc<T>()), fst.Outputs.NoOutput, new IntsRef()));

    FST.Arc<T> scratchArc = new FST.Arc<T>();
    FST.BytesReader fstReader = fst.BytesReader;

    while (queue.Count != 0)
    {
        // Pop the most recently added path. RemoveAt(index) is a true O(1) pop;
        // Remove(item) would search from the front by equality instead.
        int last = queue.Count - 1;
        Path<T> path = queue[last];
        queue.RemoveAt(last);

        if (path.state.Accept)
        {
            endNodes.Add(path);
            // we can stop here if we accept this path,
            // we accept all further paths too
            continue;
        }

        IntsRef currentInput = path.input;
        foreach (Transition t in path.state.Transitions)
        {
            int min = t.Min;
            int max = t.Max;
            if (min == max)
            {
                // single-label transition: look for exactly that label in the FST
                FST.Arc<T> nextArc = fst.FindTargetArc(t.Min, path.fstNode, scratchArc, fstReader);
                if (nextArc != null)
                {
                    IntsRef newInput = new IntsRef(currentInput.Length + 1);
                    newInput.CopyInts(currentInput);
                    newInput.Ints[currentInput.Length] = t.Min;
                    newInput.Length = currentInput.Length + 1;
                    // scratchArc is reused between lookups, so the queued path gets its own copy
                    queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc), fst.Outputs.Add(path.output, nextArc.Output), newInput));
                }
            }
            else
            {
                // TODO: if this transition's TO state is accepting, and
                // it accepts the entire range possible in the FST (ie. 0 to 255),
                // we can simply use the prefix as the accepted state instead of
                // looking up all the ranges and terminate early
                // here.  This just shifts the work from one queue
                // (this one) to another (the completion search
                // done in AnalyzingSuggester).
                FST.Arc<T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.fstNode, scratchArc, fstReader);
                while (nextArc != null && nextArc.Label <= max)
                {
                    Debug.Assert(nextArc.Label <= max);
                    Debug.Assert(nextArc.Label >= min, nextArc.Label + " " + min);
                    IntsRef newInput = new IntsRef(currentInput.Length + 1);
                    newInput.CopyInts(currentInput);
                    newInput.Ints[currentInput.Length] = nextArc.Label;
                    newInput.Length = currentInput.Length + 1;
                    queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc), fst.Outputs.Add(path.output, nextArc.Output), newInput));
                    int label = nextArc.Label; // used in assert
                    nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader);
                    Debug.Assert(nextArc == null || label < nextArc.Label, "last: " + label + " next: " + (nextArc == null ? "" : nextArc.Label.ToString()));
                }
            }
        }
    }
    return endNodes;
}