You must specify the required LuceneVersion compatibility when creating an NGramTokenFilter. As of Lucene 4.4, this token filter:

- handles supplementary characters correctly,
- emits all n-grams for the same token at the same position,
- does not modify offsets,
- sorts n-grams by their offset in the original token first, then by increasing length (so "abc" yields "a", "ab", "abc", "b", "bc", "c").
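As a minimal standalone sketch (not part of the test suite below) of the 4.4+ gram ordering, the following prints the grams of the single token "abc"; the LuceneVersion constant and namespaces assume Lucene.NET 4.8-style APIs:

using System;
using System.IO;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.NGram;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

public static class NGramOrderDemo
{
    public static void Main()
    {
        // Single token "abc", grams of length 1..3.
        var tokenizer = new KeywordTokenizer(new StringReader("abc"));
        var filter = new NGramTokenFilter(LuceneVersion.LUCENE_48, tokenizer, 1, 3);
        var termAtt = filter.AddAttribute<ICharTermAttribute>();

        filter.Reset();
        while (filter.IncrementToken())
        {
            Console.Write(termAtt + " ");
        }
        filter.End();
        filter.Dispose();
        // Prints: a ab abc b bc c
        // (sorted by start offset first, then by increasing length)
    }
}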
You can make this filter use the old behavior by passing a version earlier than LuceneVersion.LUCENE_44 to the constructor, but this is not recommended: it produces broken TokenStreams that will cause highlighting bugs (see the TestLucene43 example below).
If you were using this TokenFilter to perform partial highlighting, that will no longer work, since this filter no longer updates offsets. You should modify your analysis chain to use NGramTokenizer instead, and potentially override NGramTokenizer.IsTokenChar(int) to perform pre-tokenization.
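A hedged sketch of that suggestion follows: a hypothetical subclass (the name LetterNGramTokenizer is not from the library) that overrides IsTokenChar so grams are built only from letters, effectively pre-tokenizing on everything else. It assumes Lucene.NET 4.8-style namespaces and the J2N Character helper used elsewhere in these tests:

using System.IO;
using J2N; // assumption: Character code-point helpers come from J2N
using Lucene.Net.Analysis.NGram;
using Lucene.Net.Util;

// Hypothetical tokenizer: n-grams never span punctuation or whitespace,
// because non-letter code points are treated as token boundaries.
public sealed class LetterNGramTokenizer : NGramTokenizer
{
    public LetterNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
        : base(version, input, minGram, maxGram)
    {
    }

    // chr is a Unicode code point, so supplementary characters are
    // classified correctly rather than being split into surrogates.
    protected override bool IsTokenChar(int chr)
    {
        return Character.IsLetter(chr);
    }
}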
public virtual void TestBigrams()
{
    NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 2);
    AssertTokenStreamContents(filter,
        new string[] { "ab", "bc", "cd", "de" },
        new int[] { 0, 0, 0, 0 },
        new int[] { 5, 5, 5, 5 },
        new int[] { 1, 0, 0, 0 });
}
public virtual void TestNgramsNoIncrement()
{
    // min=1, max=3 over "abcde": grams are sorted by start offset, then by
    // increasing length, and all share the original token's offsets and position.
    NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3);
    AssertTokenStreamContents(filter,
        new string[] { "a", "ab", "abc", "b", "bc", "bcd", "c", "cd", "cde", "d", "de", "e" },
        new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        new int[] { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 },
        null,
        new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        null, null,
        false); // trailing false relaxes offset checks, since this filter does not update offsets
}
public virtual void TestLucene43()
{
#pragma warning disable 612, 618 // suppress obsolete-API warnings for the deprecated pre-4.4 behavior
    NGramTokenFilter filter = new NGramTokenFilter(LuceneVersion.LUCENE_43, input, 2, 3);
#pragma warning restore 612, 618
    // The old behavior emits grams grouped by length and rewrites offsets.
    AssertTokenStreamContents(filter,
        new string[] { "ab", "bc", "cd", "de", "abc", "bcd", "cde" },
        new int[] { 0, 1, 2, 3, 0, 1, 2 },
        new int[] { 2, 3, 4, 5, 3, 4, 5 },
        null,
        new int[] { 1, 1, 1, 1, 1, 1, 1 },
        null, null,
        false);
}
public virtual void TestSupplementaryCharacters()
{
    string s = TestUtil.RandomUnicodeString(Random(), 10);
    int codePointCount = Character.CodePointCount(s, 0, s.Length);
    int minGram = TestUtil.NextInt(Random(), 1, 3);
    int maxGram = TestUtil.NextInt(Random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>();
    IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>();
    tk.Reset();
    // Grams are counted in code points, so supplementary (surrogate-pair)
    // characters are never split; offsets always cover the whole token.
    for (int start = 0; start < codePointCount; ++start)
    {
        for (int end = start + minGram; end <= Math.Min(codePointCount, start + maxGram); ++end)
        {
            assertTrue(tk.IncrementToken());
            assertEquals(0, offsetAtt.StartOffset());
            assertEquals(s.Length, offsetAtt.EndOffset());
            int startIndex = Character.OffsetByCodePoints(s, 0, start);
            int endIndex = Character.OffsetByCodePoints(s, 0, end);
            assertEquals(s.Substring(startIndex, endIndex - startIndex), termAtt.ToString());
        }
    }
    assertFalse(tk.IncrementToken());
}
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
    filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2);
    return new TokenStreamComponents(tokenizer, filters);
}
public virtual void TestReset()
{
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
    NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 1, 1);
    AssertTokenStreamContents(filter,
        new string[] { "a", "b", "c", "d", "e" },
        new int[] { 0, 0, 0, 0, 0 },
        new int[] { 5, 5, 5, 5, 5 },
        new int[] { 1, 0, 0, 0, 0 });

    // The stream must produce the same output again after the reader is replaced.
    tokenizer.Reader = new StringReader("abcde");
    AssertTokenStreamContents(filter,
        new string[] { "a", "b", "c", "d", "e" },
        new int[] { 0, 0, 0, 0, 0 },
        new int[] { 5, 5, 5, 5, 5 },
        new int[] { 1, 0, 0, 0, 0 });
}
public virtual void TestSmallTokenInStream()
{
    input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
    NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 3, 3);
    // "de" is shorter than minGram and yields no grams; its position
    // increment is carried over to "fgh" (hence the increment of 2).
    AssertTokenStreamContents(filter,
        new string[] { "abc", "fgh" },
        new int[] { 0, 7 },
        new int[] { 3, 10 },
        new int[] { 1, 2 });
}
public virtual void TestOversizedNgrams()
{
    // minGram larger than the 5-character input token produces no grams at all.
    NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 6, 7);
    AssertTokenStreamContents(filter, new string[0], new int[0], new int[0]);
}
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
    //TokenStream stream = new SopTokenFilter(tokenizer);
    TokenStream stream = new ShingleFilter(tokenizer, 5);
    //stream = new SopTokenFilter(stream);
    stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
    //stream = new SopTokenFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}