// for debugging /* * private String toString(BytesRef b) { * try { * return b.utf8ToString() + " " + b; * } catch (Throwable t) { * return b.toString(); * } * } */ /// <summary> /// It's OK to add the same input twice in a row with /// different outputs, as long as outputs impls the merge /// method. Note that input is fully consumed after this /// method is returned (so caller is free to reuse), but /// output is not. So if your outputs are changeable (eg /// <see cref="ByteSequenceOutputs"/> or /// <see cref="Int32SequenceOutputs"/>) then you cannot reuse across /// calls. /// </summary> public virtual void Add(Int32sRef input, T output) { /* * if (DEBUG) { * BytesRef b = new BytesRef(input.length); * for(int x=0;x<input.length;x++) { * b.bytes[x] = (byte) input.ints[x]; * } * b.length = input.length; * if (output == NO_OUTPUT) { * System.out.println("\nFST ADD: input=" + toString(b) + " " + b); * } else { * System.out.println("\nFST ADD: input=" + toString(b) + " " + b + " output=" + fst.outputs.outputToString(output)); * } * } */ // De-dup NO_OUTPUT since it must be a singleton: if (output.Equals(NO_OUTPUT)) { output = NO_OUTPUT; } if (Debugging.AssertsEnabled) { Debugging.Assert(lastInput.Length == 0 || input.CompareTo(lastInput) >= 0, "inputs are added out of order lastInput={0} vs input={1}", lastInput, input); Debugging.Assert(ValidOutput(output)); } //System.out.println("\nadd: " + input); if (input.Length == 0) { // empty input: only allowed as first input. we have // to special case this because the packed FST // format cannot represent the empty input since // 'finalness' is stored on the incoming arc, not on // the node frontier[0].InputCount++; frontier[0].IsFinal = true; fst.EmptyOutput = output; return; } // compare shared prefix length int pos1 = 0; int pos2 = input.Offset; int pos1Stop = Math.Min(lastInput.Length, input.Length); while (true) { frontier[pos1].InputCount++; //System.out.println(" incr " + pos1 + " ct=" + frontier[pos1].inputCount + " n=" + frontier[pos1]); if (pos1 >= pos1Stop || lastInput.Int32s[pos1] != input.Int32s[pos2]) { break; } pos1++; pos2++; } int prefixLenPlus1 = pos1 + 1; if (frontier.Length < input.Length + 1) { UnCompiledNode <T>[] next = new UnCompiledNode <T> [ArrayUtil.Oversize(input.Length + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; Array.Copy(frontier, 0, next, 0, frontier.Length); for (int idx = frontier.Length; idx < next.Length; idx++) { next[idx] = new UnCompiledNode <T>(this, idx); } frontier = next; } // minimize/compile states from previous input's // orphan'd suffix DoFreezeTail(prefixLenPlus1); // init tail states for current input for (int idx = prefixLenPlus1; idx <= input.Length; idx++) { frontier[idx - 1].AddArc(input.Int32s[input.Offset + idx - 1], frontier[idx]); frontier[idx].InputCount++; } UnCompiledNode <T> lastNode = frontier[input.Length]; if (lastInput.Length != input.Length || prefixLenPlus1 != input.Length + 1) { lastNode.IsFinal = true; lastNode.Output = NO_OUTPUT; } // push conflicting outputs forward, only as far as // needed for (int idx = 1; idx < prefixLenPlus1; idx++) { UnCompiledNode <T> node = frontier[idx]; UnCompiledNode <T> parentNode = frontier[idx - 1]; T lastOutput = parentNode.GetLastOutput(input.Int32s[input.Offset + idx - 1]); if (Debugging.AssertsEnabled) { Debugging.Assert(ValidOutput(lastOutput)); } T commonOutputPrefix; T wordSuffix; if (!lastOutput.Equals(NO_OUTPUT)) { commonOutputPrefix = fst.Outputs.Common(output, lastOutput); if (Debugging.AssertsEnabled) { Debugging.Assert(ValidOutput(commonOutputPrefix)); } wordSuffix = fst.Outputs.Subtract(lastOutput, commonOutputPrefix); if (Debugging.AssertsEnabled) { Debugging.Assert(ValidOutput(wordSuffix)); } parentNode.SetLastOutput(input.Int32s[input.Offset + idx - 1], commonOutputPrefix); node.PrependOutput(wordSuffix); } else { commonOutputPrefix = /*wordSuffix =*/ NO_OUTPUT; // LUCENENET: Removed unnecessary assignment } output = fst.Outputs.Subtract(output, commonOutputPrefix); if (Debugging.AssertsEnabled) { Debugging.Assert(ValidOutput(output)); } } if (lastInput.Length == input.Length && prefixLenPlus1 == 1 + input.Length) { // same input more than 1 time in a row, mapping to // multiple outputs lastNode.Output = fst.Outputs.Merge(lastNode.Output, output); } else { // this new arc is private to this new input; set its // arc output to the leftover output: frontier[prefixLenPlus1 - 1].SetLastOutput(input.Int32s[input.Offset + prefixLenPlus1 - 1], output); } // save last input lastInput.CopyInt32s(input); //System.out.println(" count[0]=" + frontier[0].inputCount); }
/// <summary> /// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>, /// accumulating the <see cref="FST"/> end node and output for each path. /// </summary> public static IList <Path <T> > IntersectPrefixPaths <T>(Automaton a, FST <T> fst) { if (Debugging.AssertsEnabled) { Debugging.Assert(a.IsDeterministic); } IList <Path <T> > queue = new List <Path <T> >(); List <Path <T> > endNodes = new List <Path <T> >(); queue.Add(new Path <T>(a.GetInitialState(), fst.GetFirstArc(new FST.Arc <T>()), fst.Outputs.NoOutput, new Int32sRef())); FST.Arc <T> scratchArc = new FST.Arc <T>(); FST.BytesReader fstReader = fst.GetBytesReader(); while (queue.Count != 0) { Path <T> path = queue[queue.Count - 1]; queue.Remove(path); if (path.State.Accept) { endNodes.Add(path); // we can stop here if we accept this path, // we accept all further paths too continue; } Int32sRef currentInput = path.Input; foreach (Transition t in path.State.GetTransitions()) { int min = t.Min; int max = t.Max; if (min == max) { FST.Arc <T> nextArc = fst.FindTargetArc(t.Min, path.FstNode, scratchArc, fstReader); if (nextArc != null) { Int32sRef newInput = new Int32sRef(currentInput.Length + 1); newInput.CopyInt32s(currentInput); newInput.Int32s[currentInput.Length] = t.Min; newInput.Length = currentInput.Length + 1; queue.Add(new Path <T>(t.Dest, new FST.Arc <T>() .CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput)); } } else { // TODO: if this transition's TO state is accepting, and // it accepts the entire range possible in the FST (ie. 0 to 255), // we can simply use the prefix as the accepted state instead of // looking up all the ranges and terminate early // here. This just shifts the work from one queue // (this one) to another (the completion search // done in AnalyzingSuggester). FST.Arc <T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.FstNode, scratchArc, fstReader); while (nextArc != null && nextArc.Label <= max) { if (Debugging.AssertsEnabled) { Debugging.Assert(nextArc.Label <= max); } if (Debugging.AssertsEnabled) { Debugging.Assert(nextArc.Label >= min, () => nextArc.Label + " " + min); } Int32sRef newInput = new Int32sRef(currentInput.Length + 1); newInput.CopyInt32s(currentInput); newInput.Int32s[currentInput.Length] = nextArc.Label; newInput.Length = currentInput.Length + 1; queue.Add(new Path <T>(t.Dest, new FST.Arc <T>() .CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput)); int label = nextArc.Label; // used in assert nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader); if (Debugging.AssertsEnabled) { Debugging.Assert(nextArc == null || label < nextArc.Label, () => "last: " + label + " next: " + (nextArc == null ? "" : nextArc.Label.ToString())); } } } } } return(endNodes); }
// FST is pruned private void VerifyPruned(int inputMode, FST <T> fst, int prune1, int prune2) { if (LuceneTestCase.VERBOSE) { Console.WriteLine("TEST: now verify pruned " + pairs.Count + " terms; outputs=" + outputs); foreach (InputOutput <T> pair in pairs) { Console.WriteLine(" " + InputToString(inputMode, pair.Input) + ": " + outputs.OutputToString(pair.Output)); } } // To validate the FST, we brute-force compute all prefixes // in the terms, matched to their "common" outputs, prune that // set according to the prune thresholds, then assert the FST // matches that same set. // NOTE: Crazy RAM intensive!! //System.out.println("TEST: tally prefixes"); // build all prefixes IDictionary <Int32sRef, CountMinOutput <T> > prefixes = new HashMap <Int32sRef, CountMinOutput <T> >(); Int32sRef scratch = new Int32sRef(10); foreach (InputOutput <T> pair in pairs) { scratch.CopyInt32s(pair.Input); for (int idx = 0; idx <= pair.Input.Length; idx++) { scratch.Length = idx; CountMinOutput <T> cmo = prefixes.ContainsKey(scratch) ? prefixes[scratch] : null; if (cmo == null) { cmo = new CountMinOutput <T>(); cmo.Count = 1; cmo.Output = pair.Output; prefixes[Int32sRef.DeepCopyOf(scratch)] = cmo; } else { cmo.Count++; T output1 = cmo.Output; if (output1.Equals(outputs.NoOutput)) { output1 = outputs.NoOutput; } T output2 = pair.Output; if (output2.Equals(outputs.NoOutput)) { output2 = outputs.NoOutput; } cmo.Output = outputs.Common(output1, output2); } if (idx == pair.Input.Length) { cmo.IsFinal = true; cmo.FinalOutput = cmo.Output; } } } if (LuceneTestCase.VERBOSE) { Console.WriteLine("TEST: now prune"); } // prune 'em // LUCENENET NOTE: Altered this a bit to go in reverse rather than use an enumerator since // in .NET you cannot delete records while enumerating forward through a dictionary. for (int i = prefixes.Count - 1; i >= 0; i--) { KeyValuePair <Int32sRef, CountMinOutput <T> > ent = prefixes.ElementAt(i); Int32sRef prefix = ent.Key; CountMinOutput <T> cmo = ent.Value; if (LuceneTestCase.VERBOSE) { Console.WriteLine(" term prefix=" + InputToString(inputMode, prefix, false) + " count=" + cmo.Count + " isLeaf=" + cmo.IsLeaf + " output=" + outputs.OutputToString(cmo.Output) + " isFinal=" + cmo.IsFinal); } bool keep; if (prune1 > 0) { keep = cmo.Count >= prune1; } else { Debug.Assert(prune2 > 0); if (prune2 > 1 && cmo.Count >= prune2) { keep = true; } else if (prefix.Length > 0) { // consult our parent scratch.Length = prefix.Length - 1; Array.Copy(prefix.Int32s, prefix.Offset, scratch.Int32s, 0, scratch.Length); CountMinOutput <T> cmo2 = prefixes.ContainsKey(scratch) ? prefixes[scratch] : null; //System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count)); keep = cmo2 != null && ((prune2 > 1 && cmo2.Count >= prune2) || (prune2 == 1 && (cmo2.Count >= 2 || prefix.Length <= 1))); } else if (cmo.Count >= prune2) { keep = true; } else { keep = false; } } if (!keep) { prefixes.Remove(prefix); //System.out.println(" remove"); } else { // clear isLeaf for all ancestors //System.out.println(" keep"); scratch.CopyInt32s(prefix); scratch.Length--; while (scratch.Length >= 0) { CountMinOutput <T> cmo2 = prefixes.ContainsKey(scratch) ? prefixes[scratch] : null; if (cmo2 != null) { //System.out.println(" clear isLeaf " + inputToString(inputMode, scratch)); cmo2.IsLeaf = false; } scratch.Length--; } } } if (LuceneTestCase.VERBOSE) { Console.WriteLine("TEST: after prune"); foreach (KeyValuePair <Int32sRef, CountMinOutput <T> > ent in prefixes) { Console.WriteLine(" " + InputToString(inputMode, ent.Key, false) + ": isLeaf=" + ent.Value.IsLeaf + " isFinal=" + ent.Value.IsFinal); if (ent.Value.IsFinal) { Console.WriteLine(" finalOutput=" + outputs.OutputToString(ent.Value.FinalOutput)); } } } if (prefixes.Count <= 1) { Assert.IsNull(fst); return; } Assert.IsNotNull(fst); // make sure FST only enums valid prefixes if (LuceneTestCase.VERBOSE) { Console.WriteLine("TEST: check pruned enum"); } Int32sRefFSTEnum <T> fstEnum = new Int32sRefFSTEnum <T>(fst); Int32sRefFSTEnum.InputOutput <T> current; while ((current = fstEnum.Next()) != null) { if (LuceneTestCase.VERBOSE) { Console.WriteLine(" fstEnum.next prefix=" + InputToString(inputMode, current.Input, false) + " output=" + outputs.OutputToString(current.Output)); } CountMinOutput <T> cmo = prefixes.ContainsKey(current.Input) ? prefixes[current.Input] : null; Assert.IsNotNull(cmo); Assert.IsTrue(cmo.IsLeaf || cmo.IsFinal); //if (cmo.isFinal && !cmo.isLeaf) { if (cmo.IsFinal) { Assert.AreEqual(cmo.FinalOutput, current.Output); } else { Assert.AreEqual(cmo.Output, current.Output); } } // make sure all non-pruned prefixes are present in the FST if (LuceneTestCase.VERBOSE) { Console.WriteLine("TEST: verify all prefixes"); } int[] stopNode = new int[1]; foreach (KeyValuePair <Int32sRef, CountMinOutput <T> > ent in prefixes) { if (ent.Key.Length > 0) { CountMinOutput <T> cmo = ent.Value; T output = Run(fst, ent.Key, stopNode); if (LuceneTestCase.VERBOSE) { Console.WriteLine("TEST: verify prefix=" + InputToString(inputMode, ent.Key, false) + " output=" + outputs.OutputToString(cmo.Output)); } // if (cmo.isFinal && !cmo.isLeaf) { if (cmo.IsFinal) { Assert.AreEqual(cmo.FinalOutput, output); } else { Assert.AreEqual(cmo.Output, output); } Assert.AreEqual(ent.Key.Length, stopNode[0]); } } }
// FST is pruned private void VerifyPruned(int inputMode, FST <T> fst, int prune1, int prune2) { if (LuceneTestCase.Verbose) { Console.WriteLine("TEST: now verify pruned " + pairs.Count + " terms; outputs=" + outputs); foreach (InputOutput <T> pair in pairs) { Console.WriteLine(" " + InputToString(inputMode, pair.Input) + ": " + outputs.OutputToString(pair.Output)); } } // To validate the FST, we brute-force compute all prefixes // in the terms, matched to their "common" outputs, prune that // set according to the prune thresholds, then assert the FST // matches that same set. // NOTE: Crazy RAM intensive!! //System.out.println("TEST: tally prefixes"); // build all prefixes // LUCENENET: We use ConcurrentDictionary<TKey, TValue> because Dictionary<TKey, TValue> doesn't support // deletion while iterating, but ConcurrentDictionary does. IDictionary <Int32sRef, CountMinOutput <T> > prefixes = new ConcurrentDictionary <Int32sRef, CountMinOutput <T> >(); Int32sRef scratch = new Int32sRef(10); foreach (InputOutput <T> pair in pairs) { scratch.CopyInt32s(pair.Input); for (int idx = 0; idx <= pair.Input.Length; idx++) { scratch.Length = idx; if (!prefixes.TryGetValue(scratch, out CountMinOutput <T> cmo) || cmo == null) { cmo = new CountMinOutput <T>(); cmo.Count = 1; cmo.Output = pair.Output; prefixes[Int32sRef.DeepCopyOf(scratch)] = cmo; } else { cmo.Count++; T output1 = cmo.Output; if (output1.Equals(outputs.NoOutput)) { output1 = outputs.NoOutput; } T output2 = pair.Output; if (output2.Equals(outputs.NoOutput)) { output2 = outputs.NoOutput; } cmo.Output = outputs.Common(output1, output2); } if (idx == pair.Input.Length) { cmo.IsFinal = true; cmo.FinalOutput = cmo.Output; } } } if (LuceneTestCase.Verbose) { Console.WriteLine("TEST: now prune"); } // prune 'em using (var it = prefixes.GetEnumerator()) { while (it.MoveNext()) { var ent = it.Current; Int32sRef prefix = ent.Key; CountMinOutput <T> cmo = ent.Value; if (LuceneTestCase.Verbose) { Console.WriteLine(" term prefix=" + InputToString(inputMode, prefix, false) + " count=" + cmo.Count + " isLeaf=" + cmo.IsLeaf + " output=" + outputs.OutputToString(cmo.Output) + " isFinal=" + cmo.IsFinal); } bool keep; if (prune1 > 0) { keep = cmo.Count >= prune1; } else { if (Debugging.AssertsEnabled) { Debugging.Assert(prune2 > 0); } if (prune2 > 1 && cmo.Count >= prune2) { keep = true; } else if (prefix.Length > 0) { // consult our parent scratch.Length = prefix.Length - 1; Array.Copy(prefix.Int32s, prefix.Offset, scratch.Int32s, 0, scratch.Length); keep = prefixes.TryGetValue(scratch, out CountMinOutput <T> cmo2) && cmo2 != null && ((prune2 > 1 && cmo2.Count >= prune2) || (prune2 == 1 && (cmo2.Count >= 2 || prefix.Length <= 1))); //System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count)); } else if (cmo.Count >= prune2) { keep = true; } else { keep = false; } } if (!keep) { //it.remove(); prefixes.Remove(ent); //System.out.println(" remove"); } else { // clear isLeaf for all ancestors //System.out.println(" keep"); scratch.CopyInt32s(prefix); scratch.Length--; while (scratch.Length >= 0) { if (prefixes.TryGetValue(scratch, out CountMinOutput <T> cmo2) && cmo2 != null) { //System.out.println(" clear isLeaf " + inputToString(inputMode, scratch)); cmo2.IsLeaf = false; } scratch.Length--; } } } } if (LuceneTestCase.Verbose) { Console.WriteLine("TEST: after prune"); foreach (KeyValuePair <Int32sRef, CountMinOutput <T> > ent in prefixes) { Console.WriteLine(" " + InputToString(inputMode, ent.Key, false) + ": isLeaf=" + ent.Value.IsLeaf + " isFinal=" + ent.Value.IsFinal); if (ent.Value.IsFinal) { Console.WriteLine(" finalOutput=" + outputs.OutputToString(ent.Value.FinalOutput)); } } } if (prefixes.Count <= 1) { Assert.IsNull(fst); return; } Assert.IsNotNull(fst); // make sure FST only enums valid prefixes if (LuceneTestCase.Verbose) { Console.WriteLine("TEST: check pruned enum"); } Int32sRefFSTEnum <T> fstEnum = new Int32sRefFSTEnum <T>(fst); Int32sRefFSTEnum.InputOutput <T> current; while ((current = fstEnum.Next()) != null) { if (LuceneTestCase.Verbose) { Console.WriteLine(" fstEnum.next prefix=" + InputToString(inputMode, current.Input, false) + " output=" + outputs.OutputToString(current.Output)); } prefixes.TryGetValue(current.Input, out CountMinOutput <T> cmo); Assert.IsNotNull(cmo); Assert.IsTrue(cmo.IsLeaf || cmo.IsFinal); //if (cmo.isFinal && !cmo.isLeaf) { if (cmo.IsFinal) { Assert.AreEqual(cmo.FinalOutput, current.Output); } else { Assert.AreEqual(cmo.Output, current.Output); } } // make sure all non-pruned prefixes are present in the FST if (LuceneTestCase.Verbose) { Console.WriteLine("TEST: verify all prefixes"); } int[] stopNode = new int[1]; foreach (KeyValuePair <Int32sRef, CountMinOutput <T> > ent in prefixes) { if (ent.Key.Length > 0) { CountMinOutput <T> cmo = ent.Value; T output = Run(fst, ent.Key, stopNode); if (LuceneTestCase.Verbose) { Console.WriteLine("TEST: verify prefix=" + InputToString(inputMode, ent.Key, false) + " output=" + outputs.OutputToString(cmo.Output)); } // if (cmo.isFinal && !cmo.isLeaf) { if (cmo.IsFinal) { Assert.AreEqual(cmo.FinalOutput, output); } else { Assert.AreEqual(cmo.Output, output); } Assert.AreEqual(ent.Key.Length, stopNode[0]); } } }