public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool onlyMorePopular, int num)
{
    if (Debugging.AssertsEnabled) Debugging.Assert(num > 0);

    if (onlyMorePopular)
    {
        throw new ArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (contexts != null)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    //System.out.println("lookup key=" + key + " num=" + num);
    for (var i = 0; i < key.Length; i++)
    {
        if (key[i] == 0x1E)
        {
            throw new ArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (key[i] == 0x1F)
        {
            throw new ArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }

    var utf8Key = new BytesRef(key);
    try
    {
        Automaton lookupAutomaton = ToLookupAutomaton(key);

        var spare = new CharsRef();

        //System.out.println("  now intersect exactFirst=" + exactFirst);

        // Intersect automaton w/ suggest wFST and get all
        // prefix starting nodes & their outputs:
        //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

        //System.out.println("  prefixPaths: " + prefixPaths.size());

        FST.BytesReader bytesReader = fst.GetBytesReader();

        var scratchArc = new FST.Arc<PairOutputs<long?, BytesRef>.Pair>();

        IList<LookupResult> results = new List<LookupResult>();

        IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> prefixPaths =
            FSTUtil.IntersectPrefixPaths(ConvertAutomaton(lookupAutomaton), fst);

        if (exactFirst)
        {
            int count = 0;
            foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
            {
                if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                {
                    // This node has an END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    count++;
                }
            }

            // Searcher just to find the single exact-only
            // match, if present:
            var searcher = new Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair>(
                fst,
                count * maxSurfaceFormsPerAnalyzedForm,
                count * maxSurfaceFormsPerAnalyzedForm,
                weightComparer);

            // NOTE: we could almost get away with only using
            // the first start node.  The only catch is if
            // maxSurfaceFormsPerAnalyzedForm had kicked in and
            // pruned our exact match from one of these nodes:
            foreach (var path in prefixPaths)
            {
                if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                {
                    // This node has an END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    searcher.AddStartPaths(scratchArc, fst.Outputs.Add(path.Output, scratchArc.Output), false, path.Input);
                }
            }

            var completions = searcher.Search();
            if (Debugging.AssertsEnabled) Debugging.Assert(completions.IsComplete);

            // NOTE: this is rather inefficient: we enumerate
            // every matching "exactly the same analyzed form"
            // path, and then do a linear scan to see if one of
            // these exactly matches the input.  It should be
            // possible (though hairy) to do something similar
            // to getByOutput, since the surface form is encoded
            // into the FST output, so we could more efficiently hone
            // in on the exact surface-form match.  Still, I
            // suspect very little time is spent in this linear
            // search: it's bounded by how many prefix start
            // nodes we have and maxSurfaceFormsPerAnalyzedForm:
            foreach (var completion in completions)
            {
                BytesRef output2 = completion.Output.Output2;
                if (SameSurfaceForm(utf8Key, output2))
                {
                    results.Add(GetLookupResult(completion.Output.Output1, output2, spare));
                    break;
                }
            }

            if (results.Count == num)
            {
                // That was quick:
                return results;
            }
        }

        Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair> searcher2 =
            new TopNSearcherAnonymousInnerClassHelper(this, fst, num - results.Count,
                num * maxAnalyzedPathsForOneInput, weightComparer, utf8Key, results);

        prefixPaths = GetFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

        foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
        {
            searcher2.AddStartPaths(path.FstNode, path.Output, true, path.Input);
        }

        var completions2 = searcher2.Search();
        if (Debugging.AssertsEnabled) Debugging.Assert(completions2.IsComplete);

        foreach (Util.Fst.Util.Result<PairOutputs<long?, BytesRef>.Pair> completion in completions2)
        {
            LookupResult result = GetLookupResult(completion.Output.Output1, completion.Output.Output2, spare);

            // TODO: for the fuzzy case it would be nice to return
            // how many edits were required
            //System.out.println("    result=" + result);

            results.Add(result);

            if (results.Count == num)
            {
                // In the exactFirst=true case the search may
                // produce one extra path:
                break;
            }
        }

        return results;
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }
}
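
// Usage sketch (illustrative only; the suggester instance below is assumed,
// not defined in this file). Per the argument checks above, contexts must be
// null and onlyMorePopular must be false; LookupResult carries the surface
// form in Key and its weight in Value:
//
//   IList<LookupResult> hits = suggester.DoLookup("app", null, false, 5);
//   foreach (LookupResult hit in hits)
//   {
//       Console.WriteLine(hit.Key + " => " + hit.Value);
//   }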
/// <summary>
/// Retrieve suggestions.
/// </summary>
public virtual IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, int num)
{
    if (contexts != null)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }

    TokenStream ts = queryAnalyzer.GetTokenStream("", key);
    try
    {
        ITermToBytesRefAttribute termBytesAtt = ts.AddAttribute<ITermToBytesRefAttribute>();
        IOffsetAttribute offsetAtt = ts.AddAttribute<IOffsetAttribute>();
        IPositionLengthAttribute posLenAtt = ts.AddAttribute<IPositionLengthAttribute>();
        IPositionIncrementAttribute posIncAtt = ts.AddAttribute<IPositionIncrementAttribute>();
        ts.Reset();

        var lastTokens = new BytesRef[grams];
        //System.out.println("lookup: key='" + key + "'");

        // Run full analysis, but save only the
        // last 1gram, last 2gram, etc.:
        BytesRef tokenBytes = termBytesAtt.BytesRef;
        int maxEndOffset = -1;
        bool sawRealToken = false;
        while (ts.IncrementToken())
        {
            termBytesAtt.FillBytesRef();
            sawRealToken |= tokenBytes.Length > 0;
            // TODO: this is somewhat iffy; today, ShingleFilter
            // sets posLen to the gram count; maybe we should make
            // a separate dedicated att for this?
            int gramCount = posLenAtt.PositionLength;
            Debug.Assert(gramCount <= grams);

            // Safety: make sure the recalculated count "agrees":
            if (CountGrams(tokenBytes) != gramCount)
            {
                throw new ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes +
                    " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
            }
            maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
            lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
        }
        ts.End();

        if (!sawRealToken)
        {
            throw new ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
        }

        // Carefully fill last tokens with _ tokens;
        // ShingleFilter apparently won't emit "only hole"
        // tokens:
        int endPosInc = posIncAtt.PositionIncrement;

        // Note this will also be true if input is the empty
        // string (in which case we saw no tokens and
        // maxEndOffset is still -1), which in fact works out OK
        // because we fill the unigram with an empty BytesRef
        // below:
        bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
        //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);

        if (lastTokenEnded)
        {
            //System.out.println("  lastTokenEnded");
            // If user hit space after the last token, then
            // "upgrade" all tokens.  This way "foo " will suggest
            // all bigrams starting w/ foo, and not any unigrams
            // starting with "foo":
            for (int i = grams - 1; i > 0; i--)
            {
                BytesRef token = lastTokens[i - 1];
                if (token == null)
                {
                    continue;
                }
                token.Grow(token.Length + 1);
                token.Bytes[token.Length] = separator;
                token.Length++;
                lastTokens[i] = token;
            }
            lastTokens[0] = new BytesRef();
        }

        var arc = new FST.Arc<long?>();

        var bytesReader = fst.GetBytesReader();

        // Try highest order models first, and if they return
        // results, return that; else, fallback:
        double backoff = 1.0;

        List<LookupResult> results = new List<LookupResult>(num);

        // We only add a given suffix once, from the highest
        // order model that saw it; for subsequent lower order
        // models we skip it:
        var seen = new HashSet<BytesRef>();

        for (int gram = grams - 1; gram >= 0; gram--)
        {
            BytesRef token = lastTokens[gram];
            // Don't make unigram predictions from empty string:
            if (token == null || (token.Length == 0 && key.Length > 0))
            {
                // Input didn't have enough tokens:
                //System.out.println("  gram=" + gram + ": skip: not enough input");
                continue;
            }

            if (endPosInc > 0 && gram <= endPosInc)
            {
                // Skip hole-only predictions; in theory we
                // shouldn't have to do this, but we'd need to fix
                // ShingleFilter to produce only-hole tokens:
                //System.out.println("  break: only holes now");
                break;
            }

            //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

            // TODO: we could add fuzziness here

            // Match the prefix portion exactly:
            //Pair<Long,BytesRef> prefixOutput = null;
            long? prefixOutput = null;
            try
            {
                prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }
            //System.out.println("  prefixOutput=" + prefixOutput);

            if (prefixOutput == null)
            {
                // This model never saw this prefix, e.g. the
                // trigram model never saw context "purple mushroom":
                backoff *= ALPHA;
                continue;
            }

            // TODO: we could do this division at build time, and
            // bake it into the FST?

            // Denominator for computing scores from current
            // model's predictions:
            long contextCount = totTokens;

            BytesRef lastTokenFragment = null;

            for (int i = token.Length - 1; i >= 0; i--)
            {
                if (token.Bytes[token.Offset + i] == separator)
                {
                    BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                    long? output = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
                    Debug.Assert(output != null);
                    contextCount = DecodeWeight(output);
                    lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                    break;
                }
            }

            BytesRef finalLastToken;
            if (lastTokenFragment == null)
            {
                finalLastToken = BytesRef.DeepCopyOf(token);
            }
            else
            {
                finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
            }
            Debug.Assert(finalLastToken.Offset == 0);

            CharsRef spare = new CharsRef();

            // Complete top-N:
            Util.Fst.Util.TopResults<long?> completions = null;
            try
            {
                // Because we store multiple models in one FST
                // (1gram, 2gram, 3gram), we must restrict the
                // search so that it only considers the current
                // model.  For the highest order model, this is not
                // necessary since all completions in the FST
                // must be from this model, but for lower order
                // models we have to filter out the higher order
                // ones:

                // Must do num+seen.size() for queue depth because we may
                // reject up to seen.size() paths in acceptResult():
                Util.Fst.Util.TopNSearcher<long?> searcher =
                    new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);

                // Since this search is initialized with a single start node,
                // it is okay to start with an empty input path here:
                searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());

                completions = searcher.Search();
                Debug.Assert(completions.IsComplete);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }

            int prefixLength = token.Length;

            BytesRef suffix = new BytesRef(8);
            //System.out.println("    " + completions.length + " completions");

            foreach (Util.Fst.Util.Result<long?> completion in completions)
            {
                token.Length = prefixLength;
                // Append suffix:
                Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                token.Append(suffix);

                //System.out.println("    completion " + token.utf8ToString());

                // Skip this path if a higher-order model already
                // saw/predicted its last token:
                BytesRef lastToken = token;
                for (int i = token.Length - 1; i >= 0; i--)
                {
                    if (token.Bytes[token.Offset + i] == separator)
                    {
                        Debug.Assert(token.Length - i - 1 > 0);
                        lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                        break;
                    }
                }
                if (seen.Contains(lastToken))
                {
                    //System.out.println("      skip dup " + lastToken.utf8ToString());
                    goto nextCompletionContinue;
                }
                seen.Add(BytesRef.DeepCopyOf(lastToken));
                spare.Grow(token.Length);
                UnicodeUtil.UTF8toUTF16(token, spare);
                LookupResult result = new LookupResult(spare.ToString(),
                    // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                    // return numbers that are greater than long.MaxValue, which results in a negative long number.
                    (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                results.Add(result);
                Debug.Assert(results.Count == seen.Count);
                //System.out.println("  add result=" + result);

                nextCompletionContinue: ;
            }
            backoff *= ALPHA;
        }

        results.Sort(new ComparerAnonymousInnerClassHelper(this));

        if (results.Count > num)
        {
            results.SubList(num, results.Count).Clear();
        }

        return results;
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(ts);
    }
}
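
// Worked example of the decimal scoring above (illustrative; the concrete
// numbers are assumed): with backoff = 0.4, DecodeWeight(completion.Output) = 3
// and contextCount = 10, the score is
//
//   (long)(long.MaxValue * 0.4m * 3m / 10)  // == 12% of long.MaxValue, truncated
//
// In double arithmetic, long.MaxValue itself rounds up to 2^63 (one above
// long.MaxValue), so e.g. backoff = 1.0 with weight == contextCount would
// overflow the final cast into a negative long; decimal avoids that rounding.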