/// <summary>
/// Looks up the output for this input, or <c>null</c> if the
/// input is not accepted.
/// </summary>
public static T Get<T>(FST<T> fst, Int32sRef input)
    where T : class // LUCENENET specific - added class constraint, since we compare reference equality
{
    // TODO: would be nice not to alloc this on every lookup
    var arc = fst.GetFirstArc(new FST.Arc<T>());

    var fstReader = fst.GetBytesReader();

    // Accumulate output as we go
    T output = fst.Outputs.NoOutput;
    for (int i = 0; i < input.Length; i++)
    {
        if (fst.FindTargetArc(input.Int32s[input.Offset + i], arc, arc, fstReader) == null)
        {
            return default;
        }
        output = fst.Outputs.Add(output, arc.Output);
    }

    if (arc.IsFinal)
    {
        return fst.Outputs.Add(output, arc.NextFinalOutput);
    }
    else
    {
        return default;
    }
}
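// Usage sketch (not from the original source): build a small FST with the
// Lucene.Net.Util.Fst Builder and look a term up with Get. This assumes a
// nullable-long output variant of Get (no class constraint); the two-entry
// map and its weights are illustrative.
PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton;
var builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
var scratchBytes = new BytesRef();
var scratchInts = new Int32sRef();

// Terms must be added in sorted order:
scratchBytes.CopyChars("cat");
builder.Add(Util.ToInt32sRef(scratchBytes, scratchInts), 5L);
scratchBytes.CopyChars("dog");
builder.Add(Util.ToInt32sRef(scratchBytes, scratchInts), 7L);
FST<long?> fst = builder.Finish();

scratchBytes.CopyChars("dog");
long? output = Util.Get(fst, Util.ToInt32sRef(scratchBytes, scratchInts)); // 7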
// runs the term, returning the output, or null if term
// isn't accepted. if prefixLength is non-null it must be
// length 1 int array; prefixLength[0] is set to the length
// of the term prefix that matches
private T Run(FST<T> fst, Int32sRef term, int[] prefixLength)
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(prefixLength == null || prefixLength.Length == 1);
    }
    FST.Arc<T> arc = fst.GetFirstArc(new FST.Arc<T>());
    T NO_OUTPUT = fst.Outputs.NoOutput;
    T output = NO_OUTPUT;
    FST.BytesReader fstReader = fst.GetBytesReader();

    for (int i = 0; i <= term.Length; i++)
    {
        int label;
        if (i == term.Length)
        {
            label = FST.END_LABEL;
        }
        else
        {
            label = term.Int32s[term.Offset + i];
        }
        // System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.Outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
        if (fst.FindTargetArc(label, arc, arc, fstReader) == null)
        {
            // System.out.println(" not found");
            if (prefixLength != null)
            {
                prefixLength[0] = i;
                return output;
            }
            else
            {
                return default(T);
            }
        }
        output = fst.Outputs.Add(output, arc.Output);
    }

    if (prefixLength != null)
    {
        prefixLength[0] = term.Length;
    }

    return output;
}
private T RandomAcceptedWord(FST<T> fst, Int32sRef @in)
{
    FST.Arc<T> arc = fst.GetFirstArc(new FST.Arc<T>());

    IList<FST.Arc<T>> arcs = new List<FST.Arc<T>>();
    @in.Length = 0;
    @in.Offset = 0;
    T NO_OUTPUT = fst.Outputs.NoOutput;
    T output = NO_OUTPUT;
    FST.BytesReader fstReader = fst.GetBytesReader();

    while (true)
    {
        // read all arcs:
        fst.ReadFirstTargetArc(arc, arc, fstReader);
        arcs.Add((new FST.Arc<T>()).CopyFrom(arc));
        while (!arc.IsLast)
        {
            fst.ReadNextArc(arc, fstReader);
            arcs.Add((new FST.Arc<T>()).CopyFrom(arc));
        }

        // pick one
        arc = arcs[random.Next(arcs.Count)];
        arcs.Clear();

        // accumulate output
        output = fst.Outputs.Add(output, arc.Output);

        // append label
        if (arc.Label == FST.END_LABEL)
        {
            break;
        }

        if (@in.Int32s.Length == @in.Length)
        {
            @in.Grow(1 + @in.Length);
        }
        @in.Int32s[@in.Length++] = arc.Label;
    }

    return output;
}
/// <summary>
/// Default constructor that takes a <see cref="TextReader"/>. </summary>
public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
    : base(@in)
{
    //LUCENENET support to reset the reader.
    _input = GetBufferedReader(@in);
    _input.Mark(BufferedCharFilter.DEFAULT_CHAR_BUFFER_SIZE);
    buffer.Reset(_input);
    //buffer.Reset(@in);

    map = normMap.map;
    cachedRootArcs = normMap.cachedRootArcs;

    if (map != null)
    {
        fstReader = map.GetBytesReader();
    }
    else
    {
        fstReader = null;
    }
}
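// Usage sketch (not from the original source): build a NormalizeCharMap with
// its Builder and wrap a reader in a MappingCharFilter. The "ph" -> "f"
// mapping and input text are illustrative.
var mapBuilder = new NormalizeCharMap.Builder();
mapBuilder.Add("ph", "f");
NormalizeCharMap normMap = mapBuilder.Build();

using (TextReader reader = new MappingCharFilter(normMap, new StringReader("phone")))
{
    // Reading from the filter yields "fone".
}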
public override SortedSetDocValues GetSortedSet(FieldInfo field)
{
    FSTEntry entry = fsts[field.Number];
    if (entry.NumOrds == 0)
    {
        return DocValues.EMPTY_SORTED_SET; // empty FST!
    }
    FST<long?> instance;
    UninterruptableMonitor.Enter(this);
    try
    {
        if (!fstInstances.TryGetValue(field.Number, out instance) || instance == null)
        {
            data.Seek(entry.Offset);
            instance = new FST<long?>(data, PositiveInt32Outputs.Singleton);
            ramBytesUsed.AddAndGet(instance.GetSizeInBytes());
            fstInstances[field.Number] = instance;
        }
    }
    finally
    {
        UninterruptableMonitor.Exit(this);
    }
    BinaryDocValues docToOrds = GetBinary(field);
    FST<long?> fst = instance;

    // per-thread resources
    var @in = fst.GetBytesReader();
    var firstArc = new FST.Arc<long?>();
    var scratchArc = new FST.Arc<long?>();
    var scratchInts = new Int32sRef();
    var fstEnum = new BytesRefFSTEnum<long?>(fst);
    var @ref = new BytesRef();
    var input = new ByteArrayDataInput();
    return new SortedSetDocValuesAnonymousClass(entry, docToOrds, fst, @in, firstArc, scratchArc, scratchInts, fstEnum, @ref, input);
}
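// Consumption sketch (not from the original source): iterate the ordinals of
// one document with the Lucene 4.x SortedSetDocValues API. "producer",
// "fieldInfo", and "docId" are assumed to be in scope.
SortedSetDocValues dv = producer.GetSortedSet(fieldInfo);
dv.SetDocument(docId);
var scratch = new BytesRef();
long ord;
while ((ord = dv.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
{
    dv.LookupOrd(ord, scratch); // resolve the ordinal to its term bytes
}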
private static void Walk<T>(FST<T> fst) // LUCENENET NOTE: Not referenced anywhere
{
    var queue = new List<FST.Arc<T>>();

    // Java version was BitSet(), but in .NET we don't have a zero-argument constructor for BitSet.
    // Couldn't find the default size in BitSet, so went with zero here.
    var seen = new BitSet();

    var reader = fst.GetBytesReader();
    var startArc = fst.GetFirstArc(new FST.Arc<T>());
    queue.Add(startArc);
    while (queue.Count > 0)
    {
        //FST.Arc<T> arc = queue.Remove(0);
        var arc = queue[0];
        queue.RemoveAt(0);

        long node = arc.Target;
        //System.out.println(arc);
        if (FST<T>.TargetHasArcs(arc) && !seen.Get((int)node))
        {
            seen.Set((int)node);
            fst.ReadFirstRealTargetArc(node, arc, reader);
            while (true)
            {
                queue.Add((new FST.Arc<T>()).CopyFrom(arc));
                if (arc.IsLast)
                {
                    break;
                }
                else
                {
                    fst.ReadNextRealArc(arc, reader);
                }
            }
        }
    }
}
// Use the builder to create:
private NormalizeCharMap(FST<CharsRef> map)
{
    this.map = map;
    if (map != null)
    {
        try
        {
            // Pre-cache root arcs:
            var scratchArc = new FST.Arc<CharsRef>();
            FST.BytesReader fstReader = map.GetBytesReader();
            map.GetFirstArc(scratchArc);
            if (FST<CharsRef>.TargetHasArcs(scratchArc))
            {
                map.ReadFirstRealTargetArc(scratchArc.Target, scratchArc, fstReader);
                while (true)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(scratchArc.Label != FST.END_LABEL);
                    }
                    cachedRootArcs[Convert.ToChar((char)scratchArc.Label)] = (new FST.Arc<CharsRef>()).CopyFrom(scratchArc);
                    if (scratchArc.IsLast)
                    {
                        break;
                    }
                    map.ReadNextRealArc(scratchArc, fstReader);
                }
            }
            //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
        }
        catch (IOException ioe)
        {
            // Bogus FST IOExceptions!! (will never happen)
            throw new Exception("Should never happen", ioe);
        }
    }
}
internal IntersectTermsEnum(FSTTermsReader.TermsReader outerInstance, CompiledAutomaton compiled, BytesRef startTerm)
    : base(outerInstance)
{
    this.outerInstance = outerInstance;
    //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
    this.fst = outerInstance.dict;
    this.fstReader = fst.GetBytesReader();
    this.fstOutputs = outerInstance.dict.Outputs;
    this.fsa = compiled.RunAutomaton;
    this.level = -1;
    this.stack = new Frame[16];
    for (int i = 0; i < stack.Length; i++)
    {
        this.stack[i] = new Frame(this);
    }

    Frame frame;
    frame = LoadVirtualFrame(NewFrame());
    this.level++;
    frame = LoadFirstFrame(NewFrame());
    PushFrame(frame);

    this.meta = null;
    this.metaUpto = 1;
    this.decoded = false;
    this.pending = false;

    if (startTerm == null)
    {
        pending = IsAccept(TopFrame());
    }
    else
    {
        DoSeekCeil(startTerm);
        pending = !startTerm.Equals(term) && IsValid(TopFrame()) && IsAccept(TopFrame());
    }
}
/// <summary>
/// Dumps an <see cref="FST{T}"/> to a GraphViz's <c>dot</c> language description
/// for visualization. Example of use:
///
/// <code>
/// using (TextWriter sw = new StreamWriter("out.dot"))
/// {
///     Util.ToDot(fst, sw, true, true);
/// }
/// </code>
///
/// and then, from command line:
///
/// <code>
/// dot -Tpng -o out.png out.dot
/// </code>
///
/// <para/>
/// Note: larger FSTs (a few thousand nodes) won't even
/// render, don't bother. If the FST is > 2.1 GB in size
/// then this method will throw strange exceptions.
/// <para/>
/// See also <a href="http://www.graphviz.org/">http://www.graphviz.org/</a>.
/// </summary>
/// <param name="sameRank">
/// If <c>true</c>, the resulting <c>dot</c> file will try
/// to order states in layers of breadth-first traversal. This may
/// mess up arcs, but makes the output FST's structure a bit clearer.
/// </param>
/// <param name="labelStates">
/// If <c>true</c> states will have labels equal to their offsets in their
/// binary format. Expands the graph considerably.
/// </param>
public static void ToDot<T>(FST<T> fst, TextWriter @out, bool sameRank, bool labelStates)
{
    const string expandedNodeColor = "blue";

    // this is the start arc in the automaton (from the epsilon state to the first state
    // with outgoing transitions).
    FST.Arc<T> startArc = fst.GetFirstArc(new FST.Arc<T>());

    // A queue of transitions to consider for the next level.
    IList<FST.Arc<T>> thisLevelQueue = new List<FST.Arc<T>>();

    // A queue of transitions to consider when processing the next level.
    IList<FST.Arc<T>> nextLevelQueue = new List<FST.Arc<T>>();
    nextLevelQueue.Add(startArc);
    //System.out.println("toDot: startArc: " + startArc);

    // A list of states on the same level (for ranking).
    IList<int?> sameLevelStates = new List<int?>();

    // A bitset of already seen states (target offset).
    BitArray seen = new BitArray(32);
    seen.SafeSet((int)startArc.Target, true);

    // Shape for states.
    const string stateShape = "circle";
    const string finalStateShape = "doublecircle";

    // Emit DOT prologue.
    @out.Write("digraph FST {\n");
    @out.Write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");
    if (!labelStates)
    {
        @out.Write(" node [shape=circle, width=.2, height=.2, style=filled]\n");
    }

    EmitDotState(@out, "initial", "point", "white", "");

    T NO_OUTPUT = fst.Outputs.NoOutput;
    var r = fst.GetBytesReader();

    // final FST.Arc<T> scratchArc = new FST.Arc<>();

    {
        string stateColor;
        if (fst.IsExpandedTarget(startArc, r))
        {
            stateColor = expandedNodeColor;
        }
        else
        {
            stateColor = null;
        }

        bool isFinal;
        T finalOutput;
        if (startArc.IsFinal)
        {
            isFinal = true;
            finalOutput = startArc.NextFinalOutput.Equals(NO_OUTPUT) ? default(T) : startArc.NextFinalOutput;
        }
        else
        {
            isFinal = false;
            finalOutput = default(T);
        }

        EmitDotState(@out, Convert.ToString(startArc.Target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.Outputs.OutputToString(finalOutput));
    }

    @out.Write(" initial -> " + startArc.Target + "\n");

    int level = 0;

    while (nextLevelQueue.Count > 0)
    {
        // we could double buffer here, but it doesn't matter probably.
        //System.out.println("next level=" + level);
        thisLevelQueue.AddRange(nextLevelQueue);
        nextLevelQueue.Clear();

        level++;
        @out.Write("\n // Transitions and states at level: " + level + "\n");
        while (thisLevelQueue.Count > 0)
        {
            FST.Arc<T> arc = thisLevelQueue[thisLevelQueue.Count - 1];
            thisLevelQueue.RemoveAt(thisLevelQueue.Count - 1);
            //System.out.println(" pop: " + arc);
            if (FST<T>.TargetHasArcs(arc))
            {
                // scan all target arcs
                //System.out.println(" readFirstTarget...");

                long node = arc.Target;

                fst.ReadFirstRealTargetArc(arc.Target, arc, r);

                //System.out.println(" firstTarget: " + arc);

                while (true)
                {
                    //System.out.println(" cycle arc=" + arc);
                    // Emit the unseen state and add it to the queue for the next level.
                    if (arc.Target >= 0 && !seen.SafeGet((int)arc.Target))
                    {
                        /*
                         * boolean isFinal = false;
                         * T finalOutput = null;
                         * fst.readFirstTargetArc(arc, scratchArc);
                         * if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) {
                         *     // target is final
                         *     isFinal = true;
                         *     finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output;
                         *     System.out.println("dot hit final label=" + (char) scratchArc.label);
                         * }
                         */
                        string stateColor;
                        if (fst.IsExpandedTarget(arc, r))
                        {
                            stateColor = expandedNodeColor;
                        }
                        else
                        {
                            stateColor = null;
                        }

                        string finalOutput;
                        if (arc.NextFinalOutput != null && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                        {
                            finalOutput = fst.Outputs.OutputToString(arc.NextFinalOutput);
                        }
                        else
                        {
                            finalOutput = "";
                        }

                        EmitDotState(@out, Convert.ToString(arc.Target), stateShape, stateColor, finalOutput);
                        // To see the node address, use this instead:
                        //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target));
                        seen.SafeSet((int)arc.Target, true);
                        nextLevelQueue.Add((new FST.Arc<T>()).CopyFrom(arc));
                        sameLevelStates.Add((int)arc.Target);
                    }

                    string outs;
                    if (!arc.Output.Equals(NO_OUTPUT))
                    {
                        outs = "/" + fst.Outputs.OutputToString(arc.Output);
                    }
                    else
                    {
                        outs = "";
                    }

                    if (!FST<T>.TargetHasArcs(arc) && arc.IsFinal && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                    {
                        // Tricky special case: sometimes, due to
                        // pruning, the builder can [sillily] produce
                        // an FST with an arc into the final end state
                        // (-1) but also with a next final output; in
                        // this case we pull that output up onto this
                        // arc
                        outs = outs + "/[" + fst.Outputs.OutputToString(arc.NextFinalOutput) + "]";
                    }

                    string arcColor;
                    if (arc.Flag(FST.BIT_TARGET_NEXT))
                    {
                        arcColor = "red";
                    }
                    else
                    {
                        arcColor = "black";
                    }

                    Debug.Assert(arc.Label != FST.END_LABEL);
                    @out.Write(" " + node + " -> " + arc.Target + " [label=\"" + PrintableLabel(arc.Label) + outs + "\"" + (arc.IsFinal ? " style=\"bold\"" : "") + " color=\"" + arcColor + "\"]\n");

                    // Break the loop if we're on the last arc of this state.
                    if (arc.IsLast)
                    {
                        //System.out.println(" break");
                        break;
                    }
                    fst.ReadNextRealArc(arc, r);
                }
            }
        }

        // Emit state ranking information.
        if (sameRank && sameLevelStates.Count > 1)
        {
            @out.Write(" {rank=same; ");
            foreach (int state in sameLevelStates)
            {
                @out.Write(state + "; ");
            }
            @out.Write(" }\n");
        }
        sameLevelStates.Clear();
    }

    // Emit terminating state (always there anyway).
    @out.Write(" -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n");
    @out.Write(" {rank=sink; -1 }\n");

    @out.Write("}\n");
    @out.Flush();
}
public virtual TopResults<T> Search()
{
    IList<Result<T>> results = new List<Result<T>>();
    //System.out.println("search topN=" + topN);
    var fstReader = fst.GetBytesReader();
    T NO_OUTPUT = fst.Outputs.NoOutput;

    // TODO: we could enable FST to sorting arcs by weight
    // as it freezes... can easily do this on first pass
    // (w/o requiring rewrite)

    // TODO: maybe we should make an FST.INPUT_TYPE.BYTE0.5!?
    // (nibbles)
    int rejectCount = 0;

    // For each top N path:
    while (results.Count < topN)
    {
        //System.out.println("\nfind next path: queue.size=" + queue.size());
        FSTPath<T> path;
        if (queue == null)
        {
            // Ran out of paths
            //System.out.println(" break queue=null");
            break;
        }

        // Remove top path since we are now going to
        // pursue it:

        // LUCENENET NOTE: SortedSet doesn't have atomic operations,
        // so we need to add some thread safety just in case.
        // Perhaps it might make sense to wrap SortedSet into a type
        // that provides thread safety.
        lock (syncLock)
        {
            path = queue.Min;
            if (path != null)
            {
                queue.Remove(path);
            }
        }

        if (path == null)
        {
            // There were less than topN paths available:
            //System.out.println(" break no more paths");
            break;
        }

        if (path.Arc.Label == FST.END_LABEL)
        {
            //System.out.println(" empty string! cost=" + path.cost);
            // Empty string!
            path.Input.Length--;
            results.Add(new Result<T>(path.Input, path.Cost));
            continue;
        }

        if (results.Count == topN - 1 && maxQueueDepth == topN)
        {
            // Last path -- don't bother w/ queue anymore:
            queue = null;
        }

        //System.out.println(" path: " + path);

        // We take path and find its "0 output completion",
        // ie, just keep traversing the first arc with
        // NO_OUTPUT that we can find, since this must lead
        // to the minimum path that completes from
        // path.arc.

        // For each input letter:
        while (true)
        {
            //System.out.println("\n cycle path: " + path);
            fst.ReadFirstTargetArc(path.Arc, path.Arc, fstReader);

            // For each arc leaving this node:
            bool foundZero = false;
            while (true)
            {
                //System.out.println(" arc=" + (char) path.arc.label + " cost=" + path.arc.output);
                // tricky: instead of comparing output == 0, we must
                // express it via the comparer compare(output, 0) == 0
                if (comparer.Compare(NO_OUTPUT, path.Arc.Output) == 0)
                {
                    if (queue == null)
                    {
                        foundZero = true;
                        break;
                    }
                    else if (!foundZero)
                    {
                        scratchArc.CopyFrom(path.Arc);
                        foundZero = true;
                    }
                    else
                    {
                        AddIfCompetitive(path);
                    }
                }
                else if (queue != null)
                {
                    AddIfCompetitive(path);
                }

                if (path.Arc.IsLast)
                {
                    break;
                }
                fst.ReadNextArc(path.Arc, fstReader);
            }

            Debug.Assert(foundZero);

            if (queue != null)
            {
                // TODO: maybe we can save this copyFrom if we
                // are more clever above... eg on finding the
                // first NO_OUTPUT arc we'd switch to using
                // scratchArc
                path.Arc.CopyFrom(scratchArc);
            }

            if (path.Arc.Label == FST.END_LABEL)
            {
                // Add final output:
                //Debug.WriteLine(" done!: " + path);
                T finalOutput = fst.Outputs.Add(path.Cost, path.Arc.Output);
                if (AcceptResult(path.Input, finalOutput))
                {
                    //Debug.WriteLine(" add result: " + path);
                    results.Add(new Result<T>(path.Input, finalOutput));
                }
                else
                {
                    rejectCount++;
                }
                break;
            }
            else
            {
                path.Input.Grow(1 + path.Input.Length);
                path.Input.Int32s[path.Input.Length] = path.Arc.Label;
                path.Input.Length++;
                path.Cost = fst.Outputs.Add(path.Cost, path.Arc.Output);
            }
        }
    }

    return new TopResults<T>(rejectCount + topN <= maxQueueDepth, results);
}
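// Usage sketch (not from the original source): Search() is typically driven
// through the Util.ShortestPaths convenience wrapper, which seeds a
// TopNSearcher with a start arc and returns the topN minimal-cost
// completions. "fst" is assumed to be an FST<long?> in scope; the exact
// wrapper signature may vary by version.
var firstArc = fst.GetFirstArc(new FST.Arc<long?>());
Util.TopResults<long?> top = Util.ShortestPaths(fst, firstArc, fst.Outputs.NoOutput, Comparer<long?>.Default, 3, false); // topN=3, no empty string
foreach (Util.Result<long?> result in top)
{
    // result.Input holds the path labels; result.Output its accumulated cost.
}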
public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool onlyMorePopular, int num)
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(num > 0);
    }

    if (onlyMorePopular)
    {
        throw new ArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (contexts != null)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    //System.out.println("lookup key=" + key + " num=" + num);
    for (var i = 0; i < key.Length; i++)
    {
        if (key[i] == 0x1E)
        {
            throw new ArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (key[i] == 0x1F)
        {
            throw new ArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }

    var utf8Key = new BytesRef(key);
    try
    {
        Automaton lookupAutomaton = ToLookupAutomaton(key);

        var spare = new CharsRef();

        //System.out.println(" now intersect exactFirst=" + exactFirst);

        // Intersect automaton w/ suggest wFST and get all
        // prefix starting nodes & their outputs:
        //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

        //System.out.println(" prefixPaths: " + prefixPaths.size());

        FST.BytesReader bytesReader = fst.GetBytesReader();

        var scratchArc = new FST.Arc<PairOutputs<long?, BytesRef>.Pair>();

        IList<LookupResult> results = new List<LookupResult>();

        IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> prefixPaths = FSTUtil.IntersectPrefixPaths(ConvertAutomaton(lookupAutomaton), fst);

        if (exactFirst)
        {
            int count = 0;
            foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
            {
                if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    count++;
                }
            }

            // Searcher just to find the single exact only
            // match, if present:
            Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair> searcher;
            searcher = new Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparer);

            // NOTE: we could almost get away with only using
            // the first start node. The only catch is if
            // maxSurfaceFormsPerAnalyzedForm had kicked in and
            // pruned our exact match from one of these nodes
            // ...:
            foreach (var path in prefixPaths)
            {
                if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    searcher.AddStartPaths(scratchArc, fst.Outputs.Add(path.Output, scratchArc.Output), false, path.Input);
                }
            }

            var completions = searcher.Search();
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(completions.IsComplete);
            }

            // NOTE: this is rather inefficient: we enumerate
            // every matching "exactly the same analyzed form"
            // path, and then do linear scan to see if one of
            // these exactly matches the input. It should be
            // possible (though hairy) to do something similar
            // to getByOutput, since the surface form is encoded
            // into the FST output, so we more efficiently hone
            // in on the exact surface-form match. Still, I
            // suspect very little time is spent in this linear
            // search: it's bounded by how many prefix start
            // nodes we have and the
            // maxSurfaceFormsPerAnalyzedForm:
            foreach (var completion in completions)
            {
                BytesRef output2 = completion.Output.Output2;
                if (SameSurfaceForm(utf8Key, output2))
                {
                    results.Add(GetLookupResult(completion.Output.Output1, output2, spare));
                    break;
                }
            }

            if (results.Count == num)
            {
                // That was quick:
                return results;
            }
        }

        Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair> searcher2;
        searcher2 = new TopNSearcherAnonymousInnerClassHelper(this, fst, num - results.Count, num * maxAnalyzedPathsForOneInput, weightComparer, utf8Key, results);

        prefixPaths = GetFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

        foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
        {
            searcher2.AddStartPaths(path.FstNode, path.Output, true, path.Input);
        }

        var completions2 = searcher2.Search();
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(completions2.IsComplete);
        }

        foreach (Util.Fst.Util.Result<PairOutputs<long?, BytesRef>.Pair> completion in completions2)
        {
            LookupResult result = GetLookupResult(completion.Output.Output1, completion.Output.Output2, spare);

            // TODO: for fuzzy case would be nice to return
            // how many edits were required
            //System.out.println(" result=" + result);

            results.Add(result);

            if (results.Count == num)
            {
                // In the exactFirst=true case the search may
                // produce one extra path
                break;
            }
        }
        return results;
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }
}
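// Usage sketch (not from the original source): query an already-built
// suggester through the method above. "suggester" is assumed to be in scope;
// contexts must be null and onlyMorePopular false, per the argument checks.
IList<LookupResult> hits = suggester.DoLookup("fo", null, false, 3);
foreach (LookupResult hit in hits)
{
    // hit.Key is the suggested surface form; hit.Value its weight.
}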
internal FSTTermsEnum(FST<long?> fst)
{
    this.fst = fst;
    @in = new BytesRefFSTEnum<long?>(fst);
    bytesReader = fst.GetBytesReader();
}
// TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND /
// SEEK_END)? saves the eq check above?

/// <summary>
/// Seeks to smallest term that's >= target. </summary>
protected virtual void DoSeekCeil()
{
    //System.out.println(" advance len=" + target.length + " curlen=" + current.length);

    // TODO: possibly caller could/should provide common
    // prefix length? ie this work may be redundant if
    // caller is in fact intersecting against its own
    // automaton

    //System.out.println("FE.seekCeil upto=" + upto);

    // Save time by starting at the end of the shared prefix
    // b/w our current term & the target:
    RewindPrefix();
    //System.out.println(" after rewind upto=" + upto);

    FST.Arc<T> arc = GetArc(m_upto);
    int targetLabel = TargetLabel;
    //System.out.println(" init targetLabel=" + targetLabel);

    // Now scan forward, matching the new suffix of the target
    while (true)
    {
        //System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") vs targetLabel=" + targetLabel);

        if (arc.BytesPerArc != 0 && arc.Label != -1)
        {
            // Arcs are fixed array -- use binary search to find
            // the target.

            FST.BytesReader @in = m_fst.GetBytesReader();
            int low = arc.ArcIdx;
            int high = arc.NumArcs - 1;
            int mid = 0;
            //System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel);
            bool found = false;
            while (low <= high)
            {
                mid = (int)((uint)(low + high) >> 1);
                @in.Position = arc.PosArcsStart;
                @in.SkipBytes(arc.BytesPerArc * mid + 1);
                int midLabel = m_fst.ReadLabel(@in);
                int cmp = midLabel - targetLabel;
                //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
                if (cmp < 0)
                {
                    low = mid + 1;
                }
                else if (cmp > 0)
                {
                    high = mid - 1;
                }
                else
                {
                    found = true;
                    break;
                }
            }

            // NOTE: this code is dup'd w/ the code below (in
            // the outer else clause):
            if (found)
            {
                // Match
                arc.ArcIdx = mid - 1;
                m_fst.ReadNextRealArc(arc, @in);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(arc.ArcIdx == mid);
                    Debugging.Assert(arc.Label == targetLabel, "arc.label={0} vs targetLabel={1} mid={2}", arc.Label, targetLabel, mid);
                }
                m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], arc.Output);
                if (targetLabel == FST.END_LABEL)
                {
                    return;
                }
                CurrentLabel = arc.Label;
                Incr();
                arc = m_fst.ReadFirstTargetArc(arc, GetArc(m_upto), m_fstReader);
                targetLabel = TargetLabel;
                continue;
            }
            else if (low == arc.NumArcs)
            {
                // Dead end
                arc.ArcIdx = arc.NumArcs - 2;
                m_fst.ReadNextRealArc(arc, @in);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(arc.IsLast);
                }
                // Dead end (target is after the last arc);
                // rollback to last fork then push
                m_upto--;
                while (true)
                {
                    if (m_upto == 0)
                    {
                        return;
                    }
                    FST.Arc<T> prevArc = GetArc(m_upto);
                    //System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + " isLast?=" + prevArc.isLast());
                    if (!prevArc.IsLast)
                    {
                        m_fst.ReadNextArc(prevArc, m_fstReader);
                        PushFirst();
                        return;
                    }
                    m_upto--;
                }
            }
            else
            {
                arc.ArcIdx = (low > high ? low : high) - 1;
                m_fst.ReadNextRealArc(arc, @in);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(arc.Label > targetLabel);
                }
                PushFirst();
                return;
            }
        }
        else
        {
            // Arcs are not array'd -- must do linear scan:
            if (arc.Label == targetLabel)
            {
                // recurse
                m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], arc.Output);
                if (targetLabel == FST.END_LABEL)
                {
                    return;
                }
                CurrentLabel = arc.Label;
                Incr();
                arc = m_fst.ReadFirstTargetArc(arc, GetArc(m_upto), m_fstReader);
                targetLabel = TargetLabel;
            }
            else if (arc.Label > targetLabel)
            {
                PushFirst();
                return;
            }
            else if (arc.IsLast)
            {
                // Dead end (target is after the last arc);
                // rollback to last fork then push
                m_upto--;
                while (true)
                {
                    if (m_upto == 0)
                    {
                        return;
                    }
                    FST.Arc<T> prevArc = GetArc(m_upto);
                    //System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + " isLast?=" + prevArc.isLast());
                    if (!prevArc.IsLast)
                    {
                        m_fst.ReadNextArc(prevArc, m_fstReader);
                        PushFirst();
                        return;
                    }
                    m_upto--;
                }
            }
            else
            {
                // keep scanning
                //System.out.println(" next scan");
                m_fst.ReadNextArc(arc, m_fstReader);
            }
        }
    }
}
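// Usage sketch (not from the original source): DoSeekCeil backs the public
// SeekCeil on the FST enums; for example, with BytesRefFSTEnum you can seek
// to the smallest term >= a target. "fst" is assumed to be an FST<long?> in
// scope.
var fstEnum = new BytesRefFSTEnum<long?>(fst);
BytesRefFSTEnum.InputOutput<long?> ceil = fstEnum.SeekCeil(new BytesRef("do"));
if (ceil != null)
{
    // ceil.Input is the matched term; ceil.Output its accumulated output.
}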
internal FSTTermsEnum(FST<Int64> fst)
{
    this.fst = fst;
    input = new BytesRefFSTEnum<Int64>(fst);
    bytesReader = fst.GetBytesReader();
}
/// <summary>
/// Retrieve suggestions.
/// </summary>
public virtual IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, int num)
{
    if (contexts != null)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString());
    try
    {
        ITermToBytesRefAttribute termBytesAtt = ts.AddAttribute<ITermToBytesRefAttribute>();
        IOffsetAttribute offsetAtt = ts.AddAttribute<IOffsetAttribute>();
        IPositionLengthAttribute posLenAtt = ts.AddAttribute<IPositionLengthAttribute>();
        IPositionIncrementAttribute posIncAtt = ts.AddAttribute<IPositionIncrementAttribute>();
        ts.Reset();

        var lastTokens = new BytesRef[grams];
        //System.out.println("lookup: key='" + key + "'");

        // Run full analysis, but save only the
        // last 1gram, last 2gram, etc.:
        BytesRef tokenBytes = termBytesAtt.BytesRef;
        int maxEndOffset = -1;
        bool sawRealToken = false;
        while (ts.IncrementToken())
        {
            termBytesAtt.FillBytesRef();
            sawRealToken |= tokenBytes.Length > 0;
            // TODO: this is somewhat iffy; today, ShingleFilter
            // sets posLen to the gram count; maybe we should make
            // a separate dedicated att for this?
            int gramCount = posLenAtt.PositionLength;

            Debug.Assert(gramCount <= grams);

            // Safety: make sure the recalculated count "agrees":
            if (CountGrams(tokenBytes) != gramCount)
            {
                throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
            }
            maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
            lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
        }
        ts.End();

        if (!sawRealToken)
        {
            throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
        }

        // Carefully fill last tokens with _ tokens;
        // ShingleFilter apparently won't emit "only hole"
        // tokens:
        int endPosInc = posIncAtt.PositionIncrement;

        // Note this will also be true if input is the empty
        // string (in which case we saw no tokens and
        // maxEndOffset is still -1), which in fact works out OK
        // because we fill the unigram with an empty BytesRef
        // below:
        bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
        //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);

        if (lastTokenEnded)
        {
            //System.out.println(" lastTokenEnded");
            // If user hit space after the last token, then
            // "upgrade" all tokens. This way "foo " will suggest
            // all bigrams starting w/ foo, and not any unigrams
            // starting with "foo":
            for (int i = grams - 1; i > 0; i--)
            {
                BytesRef token = lastTokens[i - 1];
                if (token == null)
                {
                    continue;
                }
                token.Grow(token.Length + 1);
                token.Bytes[token.Length] = separator;
                token.Length++;
                lastTokens[i] = token;
            }
            lastTokens[0] = new BytesRef();
        }

        var arc = new FST.Arc<long?>();

        var bytesReader = fst.GetBytesReader();

        // Try highest order models first, and if they return
        // results, return that; else, fallback:
        double backoff = 1.0;

        List<LookupResult> results = new List<LookupResult>(num);

        // We only add a given suffix once, from the highest
        // order model that saw it; for subsequent lower order
        // models we skip it:
        var seen = new HashSet<BytesRef>();

        for (int gram = grams - 1; gram >= 0; gram--)
        {
            BytesRef token = lastTokens[gram];
            // Don't make unigram predictions from empty string:
            if (token == null || (token.Length == 0 && key.Length > 0))
            {
                // Input didn't have enough tokens:
                //System.out.println(" gram=" + gram + ": skip: not enough input");
                continue;
            }

            if (endPosInc > 0 && gram <= endPosInc)
            {
                // Skip hole-only predictions; in theory we
                // shouldn't have to do this, but we'd need to fix
                // ShingleFilter to produce only-hole tokens:
                //System.out.println(" break: only holes now");
                break;
            }

            //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

            // TODO: we could add fuzziness here
            // match the prefix portion exactly
            //Pair<Long,BytesRef> prefixOutput = null;
            long? prefixOutput = null;
            try
            {
                prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }
            //System.out.println(" prefixOutput=" + prefixOutput);

            if (prefixOutput == null)
            {
                // This model never saw this prefix, e.g. the
                // trigram model never saw context "purple mushroom"
                backoff *= ALPHA;
                continue;
            }

            // TODO: we could do this division at build time, and
            // bake it into the FST?

            // Denominator for computing scores from current
            // model's predictions:
            long contextCount = totTokens;

            BytesRef lastTokenFragment = null;

            for (int i = token.Length - 1; i >= 0; i--)
            {
                if (token.Bytes[token.Offset + i] == separator)
                {
                    BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                    long? output = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
                    Debug.Assert(output != null);
                    contextCount = DecodeWeight(output);
                    lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                    break;
                }
            }

            BytesRef finalLastToken;

            if (lastTokenFragment == null)
            {
                finalLastToken = BytesRef.DeepCopyOf(token);
            }
            else
            {
                finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
            }
            Debug.Assert(finalLastToken.Offset == 0);

            CharsRef spare = new CharsRef();

            // complete top-N
            Util.Fst.Util.TopResults<long?> completions = null;
            try
            {
                // Because we store multiple models in one FST
                // (1gram, 2gram, 3gram), we must restrict the
                // search so that it only considers the current
                // model. For highest order model, this is not
                // necessary since all completions in the FST
                // must be from this model, but for lower order
                // models we have to filter out the higher order
                // ones:

                // Must do num+seen.size() for queue depth because we may
                // reject up to seen.size() paths in acceptResult():
                Util.Fst.Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);

                // since this search is initialized with a single start node
                // it is okay to start with an empty input path here
                searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());

                completions = searcher.Search();
                Debug.Assert(completions.IsComplete);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }

            int prefixLength = token.Length;

            BytesRef suffix = new BytesRef(8);
            //System.out.println(" " + completions.length + " completions");

            foreach (Util.Fst.Util.Result<long?> completion in completions)
            {
                token.Length = prefixLength;
                // append suffix
                Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                token.Append(suffix);

                //System.out.println(" completion " + token.utf8ToString());

                // Skip this path if a higher-order model already
                // saw/predicted its last token:
                BytesRef lastToken = token;
                for (int i = token.Length - 1; i >= 0; i--)
                {
                    if (token.Bytes[token.Offset + i] == separator)
                    {
                        Debug.Assert(token.Length - i - 1 > 0);
                        lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                        break;
                    }
                }
                if (seen.Contains(lastToken))
                {
                    //System.out.println(" skip dup " + lastToken.utf8ToString());
                    goto nextCompletionContinue;
                }
                seen.Add(BytesRef.DeepCopyOf(lastToken));
                spare.Grow(token.Length);
                UnicodeUtil.UTF8toUTF16(token, spare);

                LookupResult result = new LookupResult(spare.ToString(),
                    // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                    // return numbers that are greater than long.MaxValue, which results in a negative long number.
                    (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                results.Add(result);
                Debug.Assert(results.Count == seen.Count);
                //System.out.println(" add result=" + result);
                nextCompletionContinue: ;
            }
            backoff *= ALPHA;
        }

        results.Sort(new ComparerAnonymousInnerClassHelper(this));

        if (results.Count > num)
        {
            results.SubList(num, results.Count).Clear();
        }

        return results;
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(ts);
    }
}
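// Worked example (not from the original source) of the score computed above:
// after one backoff step with ALPHA = 0.4, a completion with decoded weight
// 1000 in a context seen 50,000 times scores roughly 0.4 * 1000 / 50000 of
// long.MaxValue. All numbers here are illustrative.
decimal backoff = 0.4m;        // one fallback from the trigram to the bigram model
decimal weight = 1000m;        // stands in for DecodeWeight(completion.Output)
decimal contextCount = 50000m; // stands in for the denominator above
long score = (long)(long.MaxValue * backoff * weight / contextCount);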
/// <summary>
/// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>,
/// accumulating the <see cref="FST"/> end node and output for each path.
/// </summary>
public static IList<Path<T>> IntersectPrefixPaths<T>(Automaton a, FST<T> fst)
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(a.IsDeterministic);
    }
    IList<Path<T>> queue = new List<Path<T>>();
    List<Path<T>> endNodes = new List<Path<T>>();
    queue.Add(new Path<T>(a.GetInitialState(), fst.GetFirstArc(new FST.Arc<T>()), fst.Outputs.NoOutput, new Int32sRef()));

    FST.Arc<T> scratchArc = new FST.Arc<T>();
    FST.BytesReader fstReader = fst.GetBytesReader();

    while (queue.Count != 0)
    {
        Path<T> path = queue[queue.Count - 1];
        queue.Remove(path);

        if (path.State.Accept)
        {
            endNodes.Add(path);
            // we can stop here if we accept this path,
            // we accept all further paths too
            continue;
        }

        Int32sRef currentInput = path.Input;
        foreach (Transition t in path.State.GetTransitions())
        {
            int min = t.Min;
            int max = t.Max;
            if (min == max)
            {
                FST.Arc<T> nextArc = fst.FindTargetArc(t.Min, path.FstNode, scratchArc, fstReader);
                if (nextArc != null)
                {
                    Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                    newInput.CopyInt32s(currentInput);
                    newInput.Int32s[currentInput.Length] = t.Min;
                    newInput.Length = currentInput.Length + 1;
                    queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                }
            }
            else
            {
                // TODO: if this transition's TO state is accepting, and
                // it accepts the entire range possible in the FST (ie. 0 to 255),
                // we can simply use the prefix as the accepted state instead of
                // looking up all the ranges and terminate early
                // here. This just shifts the work from one queue
                // (this one) to another (the completion search
                // done in AnalyzingSuggester).
                FST.Arc<T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.FstNode, scratchArc, fstReader);
                while (nextArc != null && nextArc.Label <= max)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(nextArc.Label <= max);
                    }
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(nextArc.Label >= min, () => nextArc.Label + " " + min);
                    }
                    Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                    newInput.CopyInt32s(currentInput);
                    newInput.Int32s[currentInput.Length] = nextArc.Label;
                    newInput.Length = currentInput.Length + 1;
                    queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                    int label = nextArc.Label; // used in assert
                    nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(nextArc == null || label < nextArc.Label, () => "last: " + label + " next: " + (nextArc == null ? "" : nextArc.Label.ToString()));
                    }
                }
            }
        }
    }
    return endNodes;
}
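// Usage sketch (not from the original source): intersect a simple string
// automaton with an FST and inspect where each prefix path landed. Assumes
// Lucene.Net.Util.Automaton.BasicAutomata is available, FSTUtil is accessible
// from the caller, and "fst" is an FST<long?> in scope.
Automaton a = BasicAutomata.MakeString("ca");
IList<FSTUtil.Path<long?>> paths = FSTUtil.IntersectPrefixPaths(a, fst);
foreach (FSTUtil.Path<long?> path in paths)
{
    // path.FstNode is the FST arc where the automaton path ended;
    // path.Output is the accumulated output for that prefix;
    // completions of the prefix continue from path.FstNode.
}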