/// <summary>
/// Produces the prefix paths for the main completion search by intersecting
/// <paramref name="fst"/> with the automaton obtained from running
/// <c>ToLevenshteinAutomata</c> (and then <c>ConvertAutomaton</c>) over
/// <paramref name="lookupAutomaton"/>.
/// </summary>
/// <param name="prefixPaths">Prefix paths already computed by the caller; this override does not read them.</param>
/// <param name="lookupAutomaton">Automaton built from the lookup key.</param>
/// <param name="fst">The suggest FST to intersect against.</param>
/// <returns>Whatever <c>FSTUtil.IntersectPrefixPaths</c> yields for the converted automaton and <paramref name="fst"/>.</returns>
protected internal override IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> GetFullPrefixPaths(
    IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> prefixPaths,
    Automaton lookupAutomaton,
    FST<PairOutputs<long?, BytesRef>.Pair> fst)
{
    // TODO: fuzzy matches are currently not penalized: a completion whose
    // prefix matched exactly what the user typed gets no boost over one that
    // needed an edit, which in turn gets no boost over one needing two edits.
    // A multiplicative factor is probably appropriate (e.g. a fuzzy match
    // must have at least 2X better weight than the non-fuzzy match to
    // "compete") ... in which case the wFST likely needs to hold log weights
    // or something similar.
    var levenshtein = ToLevenshteinAutomata(lookupAutomaton);
    Automaton fuzzyAutomaton = ConvertAutomaton(levenshtein);

    // Debugging aid carried over from the Java original (dumps the automaton as a dot file):
    //   Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
    //   w.write(levA.toDot());
    //   w.close();
    //   System.out.println("Wrote LevA to out.dot");

    return FSTUtil.IntersectPrefixPaths(fuzzyAutomaton, fst);
}
/// <summary>
/// Looks up at most <paramref name="num"/> completions for <paramref name="key"/>.
/// <para/>
/// When <c>exactFirst</c> is set, a first pass searches only the prefix nodes that
/// have an <c>END_BYTE</c> arc and, if a completion with the same surface form as
/// the key is found, puts it at the head of the results. A second pass then fills
/// the remaining slots from the paths returned by <c>GetFullPrefixPaths</c>.
/// </summary>
/// <param name="key">The text to complete. Must not contain U+001E or U+001F.</param>
/// <param name="contexts">Must be <c>null</c>; contexts are rejected.</param>
/// <param name="onlyMorePopular">Must be <c>false</c>.</param>
/// <param name="num">Maximum number of results to return; asserted to be positive when asserts are enabled.</param>
/// <returns>The collected results, or an empty list when no FST is available.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="onlyMorePopular"/> is <c>true</c>, <paramref name="contexts"/> is not <c>null</c>,
/// or <paramref name="key"/> contains a reserved character.
/// </exception>
/// <exception cref="ArgumentNullException">
/// <paramref name="key"/> is <c>null</c> and an FST is available.
/// </exception>
public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool onlyMorePopular, int num)
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(num > 0);
    }

    if (onlyMorePopular)
    {
        throw new ArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (contexts != null)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    // Fail with a descriptive argument exception instead of a NullReferenceException
    // on key.Length below. Deliberately checked after the fst == null shortcut so
    // that calls which used to return an empty list keep doing so.
    if (key == null)
    {
        throw new ArgumentNullException(nameof(key));
    }

    //System.out.println("lookup key=" + key + " num=" + num);
    for (var i = 0; i < key.Length; i++)
    {
        if (key[i] == 0x1E)
        {
            throw new ArgumentException(
                "lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (key[i] == 0x1F)
        {
            throw new ArgumentException(
                "lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }

    var utf8Key = new BytesRef(key);
    try
    {
        Automaton lookupAutomaton = ToLookupAutomaton(key);

        var spare = new CharsRef();

        //System.out.println("  now intersect exactFirst=" + exactFirst);

        // Intersect automaton w/ suggest wFST and get all
        // prefix starting nodes & their outputs:
        //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

        //System.out.println("  prefixPaths: " + prefixPaths.size());

        FST.BytesReader bytesReader = fst.GetBytesReader();

        var scratchArc = new FST.Arc<PairOutputs<long?, BytesRef>.Pair>();

        IList<LookupResult> results = new List<LookupResult>();

        IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> prefixPaths =
            FSTUtil.IntersectPrefixPaths(ConvertAutomaton(lookupAutomaton), fst);

        if (exactFirst)
        {
            // Count the prefix nodes that can end right here; the count sizes the searcher below.
            int count = 0;
            foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
            {
                if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    count++;
                }
            }

            // Searcher just to find the single exact only
            // match, if present:
            Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair> searcher =
                new Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair>(
                    fst,
                    count * maxSurfaceFormsPerAnalyzedForm,
                    count * maxSurfaceFormsPerAnalyzedForm,
                    weightComparer);

            // NOTE: we could almost get away with only using
            // the first start node.  The only catch is if
            // maxSurfaceFormsPerAnalyzedForm had kicked in and
            // pruned our exact match from one of these nodes
            // ...:
            foreach (var path in prefixPaths)
            {
                if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    searcher.AddStartPaths(
                        scratchArc,
                        fst.Outputs.Add(path.Output, scratchArc.Output),
                        false,
                        path.Input);
                }
            }

            var completions = searcher.Search();
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(completions.IsComplete);
            }

            // NOTE: this is rather inefficient: we enumerate
            // every matching "exactly the same analyzed form"
            // path, and then do linear scan to see if one of
            // these exactly matches the input.  It should be
            // possible (though hairy) to do something similar
            // to getByOutput, since the surface form is encoded
            // into the FST output, so we more efficiently hone
            // in on the exact surface-form match.  Still, I
            // suspect very little time is spent in this linear
            // search: it's bounded by how many prefix start
            // nodes we have and the
            // maxSurfaceFormsPerAnalyzedForm:
            foreach (var completion in completions)
            {
                BytesRef output2 = completion.Output.Output2;
                if (SameSurfaceForm(utf8Key, output2))
                {
                    results.Add(GetLookupResult(completion.Output.Output1, output2, spare));
                    break;
                }
            }

            if (results.Count == num)
            {
                // That was quick:
                return results;
            }
        }

        // Main search: only ask for the slots not already taken by the exact match.
        Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair> searcher2 =
            new TopNSearcherAnonymousInnerClassHelper(
                this,
                fst,
                num - results.Count,
                num * maxAnalyzedPathsForOneInput,
                weightComparer,
                utf8Key,
                results);

        prefixPaths = GetFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

        foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
        {
            searcher2.AddStartPaths(path.FstNode, path.Output, true, path.Input);
        }

        var completions2 = searcher2.Search();
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(completions2.IsComplete);
        }

        foreach (Util.Fst.Util.Result<PairOutputs<long?, BytesRef>.Pair> completion in completions2)
        {
            LookupResult result = GetLookupResult(completion.Output.Output1, completion.Output.Output2, spare);

            // TODO: for fuzzy case would be nice to return
            // how many edits were required

            //System.out.println("    result=" + result);
            results.Add(result);

            if (results.Count == num)
            {
                // In the exactFirst=true case the search may
                // produce one extra path
                break;
            }
        }

        return results;
    }
    catch (IOException bogus)
    {
        // Wrapped and rethrown with the original as InnerException; the wrapper
        // type is kept as-is so existing callers observe the same exception type.
        throw new Exception(bogus.ToString(), bogus);
    }
}