/// <summary>
/// Creates NGramTokenFilter with given min and max n-grams.
/// </summary>
/// <param name="version"> Lucene version to enable correct position increments.
/// See <a href="#version">above</a> for details. </param>
/// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram)
    : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
{
    this.version = version;
    this.charUtils = version.onOrAfter(Version.LUCENE_44)
        ? CharacterUtils.getInstance(version)
        : CharacterUtils.Java4Instance;
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
    if (version.onOrAfter(Version.LUCENE_44))
    {
        posIncAtt = addAttribute(typeof(PositionIncrementAttribute));
        posLenAtt = addAttribute(typeof(PositionLengthAttribute));
    }
    else
    {
        posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
        posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
    }
}
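// Illustrative usage (a sketch, not part of the original file): with
// minGram=1, maxGram=2 and Version.LUCENE_44 semantics, the single term
// "abc" yields the grams a, ab, b, bc, c. KeywordTokenizer is just one
// convenient way to feed a single term through:
//
//   TokenStream ts = new NGramTokenFilter(Version.LUCENE_44,
//       new KeywordTokenizer(new StringReader("abc")), 1, 2);
//   ts.reset();
//   while (ts.incrementToken()) { /* consume grams */ }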
public virtual void testRandom()
{
    int alphabetSize = TestUtil.Next(random(), 2, 7);

    int docLen = atLeast(3000);
    //int docLen = 50;

    string document = getRandomString('a', alphabetSize, docLen);

    if (VERBOSE)
    {
        Console.WriteLine("TEST: doc=" + document);
    }

    int numSyn = atLeast(5);
    //int numSyn = 2;

    IDictionary<string, OneSyn> synMap = new Dictionary<string, OneSyn>();
    IList<OneSyn> syns = new List<OneSyn>();
    bool dedup = random().nextBoolean();
    if (VERBOSE)
    {
        Console.WriteLine(" dedup=" + dedup);
    }
    b = new SynonymMap.Builder(dedup);
    for (int synIDX = 0; synIDX < numSyn; synIDX++)
    {
        string synIn = getRandomString('a', alphabetSize, TestUtil.Next(random(), 1, 5)).Trim();
        OneSyn s;
        // A plain indexer would throw KeyNotFoundException on a missing key:
        if (!synMap.TryGetValue(synIn, out s))
        {
            s = new OneSyn();
            s.@in = synIn;
            syns.Add(s);
            s.@out = new List<string>();
            synMap[synIn] = s;
            s.keepOrig = random().nextBoolean();
        }
        string synOut = getRandomString('0', 10, TestUtil.Next(random(), 1, 5)).Trim();
        s.@out.Add(synOut);
        add(synIn, synOut, s.keepOrig);
        if (VERBOSE)
        {
            Console.WriteLine(" syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
        }
    }

    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
    posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
    posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
    offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

    if (dedup)
    {
        pruneDups(syns);
    }

    string expected = slowSynMatcher(document, syns, 5);

    if (VERBOSE)
    {
        Console.WriteLine("TEST: expected=" + expected);
    }

    verify(document, expected);
}
public virtual void testOutputHangsOffEnd()
{
    b = new SynonymMap.Builder(true);
    const bool keepOrig = false;
    // b hangs off the end (no input token under it):
    add("a", "a b", keepOrig);
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
    posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
    offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));
    posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));

    // Make sure endOffset inherits from previous input token:
    verify("a", "a b:1");
}
public virtual void testBasic2()
{
    b = new SynonymMap.Builder(true);
    const bool keepOrig = false;
    add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
    add("bbb", "bbbb1 bbbb2", keepOrig);
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
    posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
    posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
    offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

    if (keepOrig)
    {
        verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
        verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
    }
    else
    {
        verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
        verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
    }
}
public virtual void testBasic()
{
    b = new SynonymMap.Builder(true);
    add("a", "foo", true);
    add("a b", "bar fee", true);
    add("b c", "dog collar", true);
    add("c d", "dog harness holder extras", true);
    add("m c e", "dog barks loudly", false);
    add("i j k", "feep", true);
    add("e f", "foo bar", false);
    add("e f", "baz bee", false);
    add("z", "boo", false);
    add("y", "bee", true);

    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
    posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
    posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
    offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

    verify("a b c", "a/bar b/fee c");

    // syn output extends beyond input tokens
    verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");

    verify("a b a", "a/bar b/fee a/foo");

    // outputs that add to one another:
    verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");

    // two outputs for same input
    verify("e f", "foo/baz bar/bee");

    // verify multi-word / single-output offsets:
    verify("g i j k g", "g i/feep:7_3 j k g");

    // mixed keepOrig true/false:
    verify("a m c e x", "a/foo dog barks loudly x");
    verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
    assertTrue(tokensOut.CaptureCount > 0);

    // no captureStates when no syns matched
    verify("p q r s t", "p q r s t");
    assertEquals(0, tokensOut.CaptureCount);

    // no captureStates when only single-input syns, w/ no
    // lookahead needed, matched
    verify("p q z y t", "p q boo y/bee t");
    assertEquals(0, tokensOut.CaptureCount);
}
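// Reading the expected strings above (as far as the format can be
// inferred from these tests): positions are space-separated, tokens
// stacked at the same position are joined with '/', and a trailing
// ":end" or ":end_posLen" annotates a non-default end offset and
// position length. So "g i/feep:7_3 j k g" means feep starts at the
// position of "i", ends at character offset 7, and spans 3 positions;
// "a b:1" means b inherits end offset 1 from the input token "a".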
/// <summary>
/// Retrieve suggestions.
/// </summary>
public virtual IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, int num)
{
    if (contexts != null)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    TokenStream ts = queryAnalyzer.TokenStream("", key);
    try
    {
        TermToBytesRefAttribute termBytesAtt = ts.AddAttribute<TermToBytesRefAttribute>();
        OffsetAttribute offsetAtt = ts.AddAttribute<OffsetAttribute>();
        PositionLengthAttribute posLenAtt = ts.AddAttribute<PositionLengthAttribute>();
        PositionIncrementAttribute posIncAtt = ts.AddAttribute<PositionIncrementAttribute>();
        ts.Reset();

        var lastTokens = new BytesRef[grams];
        //System.out.println("lookup: key='" + key + "'");

        // Run full analysis, but save only the
        // last 1gram, last 2gram, etc.:
        BytesRef tokenBytes = termBytesAtt.BytesRef;
        int maxEndOffset = -1;
        bool sawRealToken = false;
        while (ts.IncrementToken())
        {
            termBytesAtt.FillBytesRef();
            sawRealToken |= tokenBytes.Length > 0;
            // TODO: this is somewhat iffy; today, ShingleFilter
            // sets posLen to the gram count; maybe we should make
            // a separate dedicated att for this?
            int gramCount = posLenAtt.PositionLength;
            Debug.Assert(gramCount <= grams);

            // Safety: make sure the recalculated count "agrees":
            if (CountGrams(tokenBytes) != gramCount)
            {
                throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
            }
            maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
            lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
        }
        ts.End();

        if (!sawRealToken)
        {
            throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
        }

        // Carefully fill last tokens with _ tokens;
        // ShingleFilter apparently won't emit "only hole"
        // tokens:
        int endPosInc = posIncAtt.PositionIncrement;

        // Note this will also be true if input is the empty
        // string (in which case we saw no tokens and
        // maxEndOffset is still -1), which in fact works out OK
        // because we fill the unigram with an empty BytesRef
        // below:
        bool lastTokenEnded = offsetAtt.EndOffset() > maxEndOffset || endPosInc > 0;
        //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());

        if (lastTokenEnded)
        {
            //System.out.println("  lastTokenEnded");
            // If user hit space after the last token, then
            // "upgrade" all tokens.  This way "foo " will suggest
            // all bigrams starting w/ foo, and not any unigrams
            // starting with "foo":
            for (int i = grams - 1; i > 0; i--)
            {
                BytesRef token = lastTokens[i - 1];
                if (token == null)
                {
                    continue;
                }
                token.Grow(token.Length + 1);
                token.Bytes[token.Length] = separator;
                token.Length++;
                lastTokens[i] = token;
            }
            lastTokens[0] = new BytesRef();
        }

        var arc = new FST.Arc<long?>();

        var bytesReader = fst.BytesReader;

        // Try highest order models first, and if they return
        // results, return that; else, fallback:
        double backoff = 1.0;

        // Concrete List<T> (not IList) so we can RemoveRange below:
        List<LookupResult> results = new List<LookupResult>(num);

        // We only add a given suffix once, from the highest
        // order model that saw it; for subsequent lower order
        // models we skip it:
        var seen = new HashSet<BytesRef>();

        for (int gram = grams - 1; gram >= 0; gram--)
        {
            BytesRef token = lastTokens[gram];
            // Don't make unigram predictions from empty string:
            if (token == null || (token.Length == 0 && key.Length > 0))
            {
                // Input didn't have enough tokens:
                //System.out.println("  gram=" + gram + ": skip: not enough input");
                continue;
            }

            if (endPosInc > 0 && gram <= endPosInc)
            {
                // Skip hole-only predictions; in theory we
                // shouldn't have to do this, but we'd need to fix
                // ShingleFilter to produce only-hole tokens:
                //System.out.println("  break: only holes now");
                break;
            }

            //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

            // TODO: we could add fuzziness here
            // match the prefix portion exactly
            //Pair<Long,BytesRef> prefixOutput = null;
            long? prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
            //System.out.println("  prefixOutput=" + prefixOutput);

            if (prefixOutput == null)
            {
                // This model never saw this prefix, e.g. the
                // trigram model never saw context "purple mushroom"
                backoff *= ALPHA;
                continue;
            }

            // TODO: we could do this division at build time, and
            // bake it into the FST?

            // Denominator for computing scores from current
            // model's predictions:
            long contextCount = totTokens;

            BytesRef lastTokenFragment = null;

            for (int i = token.Length - 1; i >= 0; i--)
            {
                if (token.Bytes[token.Offset + i] == separator)
                {
                    BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                    long? output = Util.Get(fst, Lucene.Net.Util.Fst.Util.ToIntsRef(context, new IntsRef()));
                    Debug.Assert(output != null);
                    contextCount = DecodeWeight(output);
                    lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                    break;
                }
            }

            BytesRef finalLastToken;
            if (lastTokenFragment == null)
            {
                finalLastToken = BytesRef.DeepCopyOf(token);
            }
            else
            {
                finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
            }
            Debug.Assert(finalLastToken.Offset == 0);

            CharsRef spare = new CharsRef();

            // complete top-N
            Util.Fst.Util.TopResults<long?> completions = null;
            try
            {
                // Because we store multiple models in one FST
                // (1gram, 2gram, 3gram), we must restrict the
                // search so that it only considers the current
                // model.  For highest order model, this is not
                // necessary since all completions in the FST
                // must be from this model, but for lower order
                // models we have to filter out the higher order
                // ones:

                // Must do num+seen.size() for queue depth because we may
                // reject up to seen.size() paths in acceptResult():
                Util.Fst.Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparator, seen, finalLastToken);

                // since this search is initialized with a single start node
                // it is okay to start with an empty input path here
                searcher.AddStartPaths(arc, prefixOutput, true, new IntsRef());

                completions = searcher.Search();
                Debug.Assert(completions.IsComplete);
            }
            catch (IOException bogus)
            {
                // Exception has no ctor taking another exception alone;
                // wrap the (checked-in-Java) IOException as the cause:
                throw new Exception(bogus.Message, bogus);
            }

            int prefixLength = token.Length;

            BytesRef suffix = new BytesRef(8);
            //System.out.println("  " + completions.length + " completions");

            foreach (Util.Fst.Util.Result<long?> completion in completions)
            {
                token.Length = prefixLength;
                // append suffix
                Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                token.Append(suffix);

                //System.out.println("    completion " + token.utf8ToString());

                // Skip this path if a higher-order model already
                // saw/predicted its last token:
                BytesRef lastToken = token;
                for (int i = token.Length - 1; i >= 0; i--)
                {
                    if (token.Bytes[token.Offset + i] == separator)
                    {
                        Debug.Assert(token.Length - i - 1 > 0);
                        lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                        break;
                    }
                }
                if (seen.Contains(lastToken))
                {
                    //System.out.println("      skip dup " + lastToken.utf8ToString());
                    continue;
                }
                seen.Add(BytesRef.DeepCopyOf(lastToken));
                spare.Grow(token.Length);
                UnicodeUtil.UTF8toUTF16(token, spare);
                LookupResult result = new LookupResult(spare.ToString(), (long)(long.MaxValue * backoff * ((double)DecodeWeight(completion.Output)) / contextCount));
                results.Add(result);
                Debug.Assert(results.Count == seen.Count);
                //System.out.println("  add result=" + result);
            }
            backoff *= ALPHA;
        }

        results.Sort(new ComparatorAnonymousInnerClassHelper(this));

        if (results.Count > num)
        {
            // .NET lists have no SubList view; trim in place instead:
            results.RemoveRange(num, results.Count - num);
        }

        return results;
    }
    finally
    {
        IOUtils.CloseWhileHandlingException(ts);
    }
}
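// Worked example of the score computed above (a sketch; ALPHA and the
// counts are made-up values, not taken from a real index). Each failed
// or exhausted gram order multiplies backoff by ALPHA, so after falling
// back one order, a completion seen 3 times under a context seen 12
// times scores:
//
//   double alpha = 0.4;                          // hypothetical backoff factor
//   double backoff = alpha;                      // fell back one order
//   long completionCount = 3, contextCount = 12;
//   long score = (long)(long.MaxValue * backoff * ((double)completionCount) / contextCount);
//   // == long.MaxValue * 0.4 * 0.25 == long.MaxValue * 0.1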