protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
{
    // Wrap the inner chain in a shingle filter producing 2..grams word shingles,
    // joined with the configured separator character.
    var shingleFilter = new ShingleFilter(components.TokenStream, 2, outerInstance.grams);
    shingleFilter.SetTokenSeparator(char.ToString((char)outerInstance.separator));
    return new TokenStreamComponents(components.Tokenizer, shingleFilter);
}
public virtual void TestGraphs()
{
    // Letter tokenizer -> default shingles -> edge n-grams (7..10 chars);
    // verify the surviving tokens of the resulting token graph.
    TokenStream stream = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
    stream = new ShingleFilter(stream);
    stream = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, stream, 7, 10);

    string[] expectedTerms = { "efgh ij", "ij klmn", "ij klmno", "klmno p" };
    int[] expectedStartOffsets = { 6, 11, 11, 14 };
    int[] expectedEndOffsets = { 13, 19, 19, 21 };
    int[] expectedPosIncrements = { 3, 1, 0, 1 };
    int[] expectedPosLengths = { 2, 2, 2, 2 };

    AssertTokenStreamContents(stream, expectedTerms, expectedStartOffsets, expectedEndOffsets,
        expectedPosIncrements, expectedPosLengths, 23);
}
public void Test6GramFilterNoPositions()
{
    // Build 6-gram shingles over the test token stream, then strip position
    // increments with PositionFilter before comparing to the expected output.
    var shingles = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
    var positionless = new PositionFilter(shingles);
    AssertTokenStreamContents(positionless, SIX_GRAM_NO_POSITIONS_TOKENS, SIX_GRAM_NO_POSITIONS_INCREMENTS);
}
// Verifies 6-gram shingle output when position increments are flattened by the
// (deprecated) PositionFilter; the pragmas suppress the obsolete-API warning.
// NOTE: the #pragma directives must stay on their own lines — they are
// preprocessor directives and cannot appear mid-statement on one line.
public virtual void Test6GramFilterNoPositions()
{
    ShingleFilter filter = new ShingleFilter(new TestTokenStream(this, TEST_TOKEN), 6);
    AssertTokenStreamContents
#pragma warning disable 612, 618
        (new PositionFilter(filter),
#pragma warning restore 612, 618
        SIX_GRAM_NO_POSITIONS_TOKENS, SIX_GRAM_NO_POSITIONS_INCREMENTS);
}
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // Edge n-gram tokenizer (2..94 chars) feeding 5-word shingles,
    // re-n-grammed at the character level (55..83 chars).
    Tokenizer source = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
    TokenStream sink = new ShingleFilter(source, 5);
    sink = new NGramTokenFilter(TEST_VERSION_CURRENT, sink, 55, 83);
    return new TokenStreamComponents(source, sink);
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Standard tokenization, standard filtering, lowercasing, then shingles of
    // up to four words (unigrams still emitted). The '|' breaks in
    // relatedcontent need to be accounted for by this chain.
    TokenStream chain = new StandardTokenizer(_version, reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    return new ShingleFilter(chain, 4);
}
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // NOTE(review): a new StandardAnalyzer is allocated on every call; if this
    // method is hot, consider caching the analyzer in a field.
    TokenStream chain = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30)
        .ReusableTokenStream(fieldName, reader);
    chain = new LowerCaseFilter(chain);

    const int shingleSize = 3;
    chain = new ShingleFilter(chain, shingleSize);

    // Uncomment below to use StopFilter that will skip few predefined set of words.
    // chain = new StopFilter(true, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return chain;
}
public virtual void TestUnicodeShinglesAndNgrams()
{
    // Random-data smoke test: edge n-grams -> 5-word shingles -> character
    // n-grams (55..83); the chain must survive arbitrary unicode input.
    Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
        TokenStream sink = new ShingleFilter(source, 5);
        sink = new NGramTokenFilter(TEST_VERSION_CURRENT, sink, 55, 83);
        return new TokenStreamComponents(source, sink);
    });
    CheckRandomData(Random, analyzer, 2000);
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // StandardTokenizer -> word shingles (minGramSize..maxGramSize)
    //  -> lowercase -> English stop-word removal.
    var tokenizer = new StandardTokenizer(version, reader);

    var shingler = new ShingleFilter(tokenizer, minGramSize, maxGramSize);
    // The original if/else merely mirrored the flag into the setter on both
    // branches; pass the flag through directly instead.
    shingler.SetOutputUnigrams(this.ShowUnigrams);

    var filter = new StopFilter(version, new LowerCaseFilter(version, shingler),
        StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return new TokenStreamComponents(tokenizer, filter);
}
public void TestReset()
{
    // The filter must replay identical output after the underlying tokenizer
    // is Reset() with an identical reader, so share one set of expectations.
    string[] expectedTerms =
    {
        "please", "please divide", "divide", "divide this",
        "this", "this sentence", "sentence"
    };
    int[] expectedStartOffsets = { 0, 0, 7, 7, 14, 14, 19 };
    int[] expectedEndOffsets = { 6, 13, 13, 18, 18, 27, 27 };
    string[] expectedTypes =
    {
        TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle",
        TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE
    };
    int[] expectedIncrements = { 1, 0, 1, 0, 1, 0, 1 };

    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    AssertTokenStreamContents(filter, expectedTerms, expectedStartOffsets, expectedEndOffsets,
        expectedTypes, expectedIncrements);

    // Re-feed the same input after a reset and check the output is unchanged.
    wsTokenizer.Reset(new StringReader("please divide this sentence"));
    AssertTokenStreamContents(filter, expectedTerms, expectedStartOffsets, expectedEndOffsets,
        expectedTypes, expectedIncrements);
}
protected void ShingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
    int[] positionIncrements, String[] types, bool outputUnigrams)
{
    // Run a shingle filter over the input tokens and compare every emitted
    // token (term text, offsets, position increment, type) to the expectation.
    var filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
    filter.SetOutputUnigrams(outputUnigrams);

    var termAtt = filter.AddAttribute<ITermAttribute>();
    var offsetAtt = filter.AddAttribute<IOffsetAttribute>();
    var posIncrAtt = filter.AddAttribute<IPositionIncrementAttribute>();
    var typeAtt = filter.AddAttribute<ITypeAttribute>();

    int index = 0;
    while (filter.IncrementToken())
    {
        Assert.IsTrue(index < tokensToCompare.Length, "ShingleFilter outputted more tokens than expected");

        String actualTerm = termAtt.Term;
        String expectedTerm = tokensToCompare[index].Term;
        Assert.AreEqual(expectedTerm, actualTerm, "Wrong termText");
        Assert.AreEqual(tokensToCompare[index].StartOffset, offsetAtt.StartOffset,
            "Wrong startOffset for token \"" + actualTerm + "\"");
        Assert.AreEqual(tokensToCompare[index].EndOffset, offsetAtt.EndOffset,
            "Wrong endOffset for token \"" + actualTerm + "\"");
        Assert.AreEqual(positionIncrements[index], posIncrAtt.PositionIncrement,
            "Wrong positionIncrement for token \"" + actualTerm + "\"");
        Assert.AreEqual(types[index], typeAtt.Type, "Wrong type for token \"" + actualTerm + "\"");
        index++;
    }

    Assert.AreEqual(tokensToCompare.Length, index,
        "ShingleFilter outputted wrong # of tokens. (# output = " + index + "; # expected =" + tokensToCompare.Length + ")");
}
protected internal override TokenStreamComponents wrapComponents(string fieldName, TokenStreamComponents components)
{
    // Wrap the inner chain with a fully configured ShingleFilter.
    // The constructor already receives minShingleSize/maxShingleSize, so the
    // redundant MinShingleSize/MaxShingleSize property assignments that merely
    // re-applied the same values were dropped.
    ShingleFilter filter = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize)
    {
        TokenSeparator = tokenSeparator,
        OutputUnigrams = outputUnigrams,
        OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
        FillerToken = fillerToken
    };
    return new TokenStreamComponents(components.Tokenizer, filter);
}