/// <summary>
/// Runs a ShingleFilter over <paramref name="tokensToShingle"/> with the given
/// min/max shingle sizes and unigram setting, then checks its output against
/// the expected tokens via <c>shingleFilterTestCommon</c>.
/// </summary>
protected internal virtual void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, int[] positionIncrements, string[] types, bool outputUnigrams)
{
    var shingler = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
    shingler.SetOutputUnigrams(outputUnigrams);
    shingleFilterTestCommon(shingler, tokensToCompare, positionIncrements, types);
}
// Verifies filler-token handling when the input ends in two holes (removed
// stopwords), across four configurations: filler "--", filler "", filler null
// (default behavior), and filler null combined with a null token separator.
public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()
{
    // Analyzing "purple wizard of the", where of and the are removed as a
    // stopwords, leaving two trailing holes:
    Token[] inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };

    // Custom filler "--" appears once per trailing hole in the shingles.
    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.SetFillerToken("--");
    AssertTokenStreamContents(filter,
        new string[] { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);

    // Empty filler: holes contribute only the token separator.
    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.SetFillerToken("");
    AssertTokenStreamContents(filter,
        new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard " },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);

    // Null filler behaves the same as the empty filler above.
    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.SetFillerToken(null);
    AssertTokenStreamContents(filter,
        new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard " },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);

    // Null filler AND null separator: tokens are concatenated directly and
    // holes vanish entirely from the shingle text.
    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.SetFillerToken(null);
    filter.SetTokenSeparator(null);
    AssertTokenStreamContents(filter,
        new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);
}
/// <summary>
/// Wraps a captured <see cref="AttributeSource"/>, caching its term and offset
/// attribute views so shingle construction does not look them up repeatedly.
/// </summary>
public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
{
    this.outerInstance = outerInstance;
    this.attSource = attSource;
    // Resolve both attribute views once, up front.
    offsetAtt = attSource.GetAttribute<IOffsetAttribute>();
    termAtt = attSource.GetAttribute<ICharTermAttribute>();
}
public virtual void TestTrailingHole2()
{
    // Input "purple wizard of" with the stopword "of" removed, leaving one
    // trailing hole; the bigram over the hole is padded with the "_" filler.
    var tokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };
    var shingles = new ShingleFilter(new CannedTokenStream(1, 16, tokens), 2, 2);
    AssertTokenStreamContents(shingles,
        new string[] { "purple", "purple wizard", "wizard", "wizard _" },
        new int[] { 0, 0, 7, 7 },
        new int[] { 6, 13, 13, 16 },
        new int[] { 1, 0, 1, 0 },
        16);
}
// Verifies that the shingle chain can be fully reused after the tokenizer is
// re-pointed at a fresh reader: both passes must produce identical output.
public virtual void TestReset()
{
    Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    AssertTokenStreamContents(filter,
        new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new int[] { 0, 0, 7, 7, 14, 14, 19 },
        new int[] { 6, 13, 13, 18, 18, 27, 27 },
        new string[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE },
        new int[] { 1, 0, 1, 0, 1, 0, 1 });
    // Second pass over the same text after swapping in a new reader.
    wsTokenizer.SetReader(new StringReader("please divide this sentence"));
    AssertTokenStreamContents(filter,
        new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new int[] { 0, 0, 7, 7, 14, 14, 19 },
        new int[] { 6, 13, 13, 18, 18, 27, 27 },
        new string[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE },
        new int[] { 1, 0, 1, 0, 1, 0, 1 });
}
/// <summary>
/// Tokenizes <paramref name="reader"/> with a StandardTokenizer, applies
/// standard and lower-case filtering, and emits shingles up to 4 tokens wide.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Need to account for the | breaks in relatedcontent
    var tokenizedInput = new LowerCaseFilter(new StandardFilter(new StandardTokenizer(_version, reader)));
    // Previously assigned to a temp (with a commented-out SetOutputUnigrams
    // toggle); returning directly is equivalent — unigrams stay enabled.
    return new ShingleFilter(tokenizedInput, 4);
}
/// <summary>
/// Factory hook: builds a ShingleFilter over <paramref name="input"/> using
/// the sizes and options captured from the factory's configuration.
/// </summary>
public override TokenStream Create(TokenStream input)
{
    return new ShingleFilter(input, minShingleSize, maxShingleSize)
    {
        OutputUnigrams = outputUnigrams,
        OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
        TokenSeparator = tokenSeparator,
        FillerToken = fillerToken
    };
}
/// <summary>
/// Factory hook: wraps <paramref name="input"/> in a ShingleFilter configured
/// from this factory's stored settings.
/// </summary>
public override TokenStream Create(TokenStream input)
{
    var shingles = new ShingleFilter(input, minShingleSize, maxShingleSize);
    shingles.SetOutputUnigrams(outputUnigrams);
    shingles.SetOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    shingles.SetTokenSeparator(tokenSeparator);
    shingles.SetFillerToken(fillerToken);
    return shingles;
}
/// <summary>
/// Appends a fully configured ShingleFilter to the wrapped analyzer's chain,
/// reusing the original tokenizer.
/// </summary>
protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
{
    var shingler = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize);
    // NOTE: min/max are also passed to the constructor above; the explicit
    // setters are kept to preserve the original configuration sequence.
    shingler.SetMinShingleSize(minShingleSize);
    shingler.SetMaxShingleSize(maxShingleSize);
    shingler.SetTokenSeparator(tokenSeparator);
    shingler.SetOutputUnigrams(outputUnigrams);
    shingler.SetOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    shingler.SetFillerToken(fillerToken);
    return new TokenStreamComponents(components.Tokenizer, shingler);
}
/// <summary>
/// Appends a ShingleFilter — configured via property initializers — to the
/// wrapped analyzer's chain, reusing its tokenizer.
/// </summary>
protected internal override TokenStreamComponents wrapComponents(string fieldName, TokenStreamComponents components)
{
    // min/max also go through the constructor; the initializer re-applies them
    // to preserve the original assignment order.
    var shingler = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize)
    {
        MinShingleSize = minShingleSize,
        MaxShingleSize = maxShingleSize,
        TokenSeparator = tokenSeparator,
        OutputUnigrams = outputUnigrams,
        OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
        FillerToken = fillerToken
    };
    return new TokenStreamComponents(components.Tokenizer, shingler);
}
/// <summary>
/// Unpacks the expected tokens into parallel text/offset arrays and asserts
/// that <paramref name="filter"/> emits exactly that stream.
/// </summary>
protected internal virtual void shingleFilterTestCommon(ShingleFilter filter, Token[] tokensToCompare, int[] positionIncrements, string[] types)
{
    int count = tokensToCompare.Length;
    var expectedText = new string[count];
    var expectedStarts = new int[count];
    var expectedEnds = new int[count];
    for (int idx = 0; idx < count; idx++)
    {
        Token gold = tokensToCompare[idx];
        expectedText[idx] = new string(gold.Buffer, 0, gold.Length);
        expectedStarts[idx] = gold.StartOffset;
        expectedEnds[idx] = gold.EndOffset;
    }
    AssertTokenStreamContents(filter, expectedText, expectedStarts, expectedEnds, types, positionIncrements);
}
/// <summary>
/// Wraps the default analyzer's stream (reusable when available) in a
/// ShingleFilter configured with this analyzer's max size and unigram flag.
/// </summary>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream source;
    try
    {
        source = defaultAnalyzer.ReusableTokenStream(fieldName, reader);
    }
    catch (IOException)
    {
        // Reuse failed; fall back to building a one-off stream.
        source = defaultAnalyzer.TokenStream(fieldName, reader);
    }
    var shingles = new ShingleFilter(source);
    shingles.SetMaxShingleSize(maxShingleSize);
    shingles.SetOutputUnigrams(outputUnigrams);
    return shingles;
}
public virtual void TestGraphs()
{
    // Chain: letter tokenizer -> shingles -> edge n-grams (7..10 chars);
    // asserts the combined token graph, including the final offset of 23.
    TokenStream stream = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
    stream = new ShingleFilter(stream);
    stream = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, stream, 7, 10);
    AssertTokenStreamContents(stream,
        new string[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
        new int[] { 6, 11, 11, 14 },
        new int[] { 13, 19, 19, 21 },
        new int[] { 3, 1, 0, 1 },
        new int[] { 2, 2, 2, 2 },
        23);
}
/// <summary>
/// Builds parallel expected-text/offset arrays from the gold tokens (method
/// accessor API) and asserts <paramref name="filter"/> emits that stream.
/// </summary>
protected internal virtual void shingleFilterTestCommon(ShingleFilter filter, Token[] tokensToCompare, int[] positionIncrements, string[] types)
{
    int count = tokensToCompare.Length;
    var expectedText = new string[count];
    var expectedStarts = new int[count];
    var expectedEnds = new int[count];
    for (int idx = 0; idx < count; idx++)
    {
        Token gold = tokensToCompare[idx];
        expectedText[idx] = new string(gold.Buffer(), 0, gold.Length);
        expectedStarts[idx] = gold.StartOffset();
        expectedEnds[idx] = gold.EndOffset();
    }
    AssertTokenStreamContents(filter, expectedText, expectedStarts, expectedEnds, types, positionIncrements);
}
// Bigram shingling over an input ending in a single hole: the last shingle is
// padded with the default "_" filler token and extends to the final offset 16.
public virtual void TestTrailingHole2()
{
    // Analyzing "purple wizard of", where of is removed as a
    // stopword leaving a trailing hole:
    Token[] inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };
    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 16, inputTokens), 2, 2);
    AssertTokenStreamContents(filter,
        new string[] { "purple", "purple wizard", "wizard", "wizard _" },
        new int[] { 0, 0, 7, 7 },
        new int[] { 6, 13, 13, 16 },
        new int[] { 1, 0, 1, 0 },
        16);
}
/// <summary>
/// Appends a ShingleFilter, configured via an object initializer, to the
/// wrapped analyzer's component chain.
/// </summary>
protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
{
    // min/max also go through the constructor; the initializer re-applies them
    // to preserve the original assignment order.
    var shingler = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize)
    {
        MinShingleSize = minShingleSize,
        MaxShingleSize = maxShingleSize,
        TokenSeparator = tokenSeparator,
        OutputUnigrams = outputUnigrams,
        OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
        FillerToken = fillerToken
    };
    return new TokenStreamComponents(components.Tokenizer, shingler);
}
/// <summary>
/// Initializes the size cycle for the given filter: when unigrams are emitted
/// the cycle starts at 1, otherwise at the configured minimum shingle size.
/// </summary>
public CircularSequence(ShingleFilter shingleFilter)
{
    this.outerInstance = shingleFilter;
    if (shingleFilter.outputUnigrams)
    {
        minValue = 1;
    }
    else
    {
        minValue = shingleFilter.minShingleSize;
    }
    Reset();
}
/// <summary>
/// Initializes the size cycle: starts at 1 when unigrams are emitted,
/// otherwise at the owner's minimum shingle size, then resets the sequence.
/// </summary>
public CircularSequence(ShingleFilter outerInstance)
{
    this.outerInstance = outerInstance;
    bool includeUnigrams = outerInstance.outputUnigrams;
    minValue = includeUnigrams ? 1 : outerInstance.minShingleSize;
    Reset();
}
// Property-API variant of the two-trailing-holes filler test: checks shingle
// text for filler "--", "", null, and null filler with a null separator.
public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()
{
    // Analyzing "purple wizard of the", where of and the are removed as a
    // stopwords, leaving two trailing holes:
    Token[] inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };

    // Custom filler "--" appears once per trailing hole.
    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.FillerToken = "--";
    AssertTokenStreamContents(filter,
        new string[] { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);

    // Empty filler: holes contribute only the separator.
    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.FillerToken = "";
    AssertTokenStreamContents(filter,
        new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard " },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);

    // Null filler behaves the same as the empty filler above.
    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.FillerToken = null;
    AssertTokenStreamContents(filter,
        new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard " },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);

    // Null filler AND null separator: tokens concatenate directly and holes
    // disappear from the shingle text.
    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.FillerToken = null;
    filter.TokenSeparator = null;
    AssertTokenStreamContents(filter,
        new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);
}
/// <summary>
/// Runs a ShingleFilter with a custom token separator plus the given size
/// bounds and unigram setting, then verifies the output stream.
/// </summary>
protected internal virtual void shingleFilterTest(string tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, int[] positionIncrements, string[] types, bool outputUnigrams)
{
    var shingler = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize)
    {
        TokenSeparator = tokenSeparator,
        OutputUnigrams = outputUnigrams
    };
    shingleFilterTestCommon(shingler, tokensToCompare, positionIncrements, types);
}
// Builds 6-gram shingles and strips their position increments with the
// deprecated PositionFilter; the pragma pair suppresses the obsolete-API
// warnings (612/618) around that single constructor call.
public virtual void Test6GramFilterNoPositions()
{
    ShingleFilter filter = new ShingleFilter(new TestTokenStream(this, TEST_TOKEN), 6);
    AssertTokenStreamContents
#pragma warning disable 612, 618
        (new PositionFilter(filter),
#pragma warning restore 612, 618
        SIX_GRAM_NO_POSITIONS_TOKENS, SIX_GRAM_NO_POSITIONS_INCREMENTS);
}
/// <summary>
/// Initializes the size cycle for the owning filter: the smallest emitted
/// "shingle" is 1 when unigrams are on, else the minimum shingle size.
/// </summary>
public CircularSequence(ShingleFilter outerInstance)
{
    this.outerInstance = outerInstance;
    if (outerInstance.outputUnigrams)
    {
        minValue = 1;
    }
    else
    {
        minValue = outerInstance.minShingleSize;
    }
    reset();
}
/// <summary>
/// Stores the captured <see cref="AttributeSource"/> and caches its term and
/// offset attribute views for reuse while assembling shingles.
/// </summary>
public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
{
    this.outerInstance = outerInstance;
    this.attSource = attSource;
    // Look the attribute views up once instead of per access.
    offsetAtt = attSource.GetAttribute<IOffsetAttribute>();
    termAtt = attSource.GetAttribute<ICharTermAttribute>();
}
/// <summary>
/// Obtains the default analyzer's stream (preferring the reusable one) and
/// wraps it in a ShingleFilter with this analyzer's settings.
/// </summary>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream source;
    try
    {
        source = defaultAnalyzer.ReusableTokenStream(fieldName, reader);
    }
    catch (IOException)
    {
        // Reuse failed; build a fresh stream instead.
        source = defaultAnalyzer.TokenStream(fieldName, reader);
    }
    var shingles = new ShingleFilter(source);
    shingles.SetMaxShingleSize(maxShingleSize);
    shingles.SetOutputUnigrams(outputUnigrams);
    return shingles;
}
// Verifies the shingle chain is reusable after re-pointing the tokenizer at a
// new reader via the Reader property: both passes must emit identical output.
public virtual void TestReset()
{
    Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    AssertTokenStreamContents(filter,
        new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new int[] { 0, 0, 7, 7, 14, 14, 19 },
        new int[] { 6, 13, 13, 18, 18, 27, 27 },
        new string[] { TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE },
        new int[] { 1, 0, 1, 0, 1, 0, 1 });
    // Second pass over the same text through a fresh reader.
    wsTokenizer.Reader = new StringReader("please divide this sentence");
    AssertTokenStreamContents(filter,
        new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new int[] { 0, 0, 7, 7, 14, 14, 19 },
        new int[] { 6, 13, 13, 18, 18, 27, 27 },
        new string[] { TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE },
        new int[] { 1, 0, 1, 0, 1, 0, 1 });
}
// Legacy-API variant: verifies the shingle chain produces identical output
// after the tokenizer is reset onto a new reader.
public void TestReset()
{
    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    AssertTokenStreamContents(filter,
        new[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new[] { 0, 0, 7, 7, 14, 14, 19 },
        new[] { 6, 13, 13, 18, 18, 27, 27 },
        new[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE },
        new[] { 1, 0, 1, 0, 1, 0, 1 }
    );
    // Second pass over the same text after resetting the tokenizer.
    wsTokenizer.Reset(new StringReader("please divide this sentence"));
    AssertTokenStreamContents(filter,
        new[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
        new[] { 0, 0, 7, 7, 14, 14, 19 },
        new[] { 6, 13, 13, 18, 18, 27, 27 },
        new[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE },
        new[] { 1, 0, 1, 0, 1, 0, 1 }
    );
}
/// <summary>
/// Drives <see cref="ShingleFilter.IncrementToken"/> manually and compares
/// every emitted token's text, offsets, increment, and type against the gold
/// tokens, failing on any mismatch or count difference.
/// </summary>
protected void ShingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, int[] positionIncrements, String[] types, bool outputUnigrams)
{
    var filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
    filter.SetOutputUnigrams(outputUnigrams);

    // Attribute views read back for each produced token.
    var termAtt = filter.AddAttribute<ITermAttribute>();
    var offsetAtt = filter.AddAttribute<IOffsetAttribute>();
    var posIncrAtt = filter.AddAttribute<IPositionIncrementAttribute>();
    var typeAtt = filter.AddAttribute<ITypeAttribute>();

    int produced = 0;
    while (filter.IncrementToken())
    {
        Assert.IsTrue(produced < tokensToCompare.Length, "ShingleFilter outputted more tokens than expected");
        String termText = termAtt.Term;
        String goldText = tokensToCompare[produced].Term;
        Assert.AreEqual(goldText, termText, "Wrong termText");
        Assert.AreEqual(tokensToCompare[produced].StartOffset, offsetAtt.StartOffset, "Wrong startOffset for token \"" + termText + "\"");
        Assert.AreEqual(tokensToCompare[produced].EndOffset, offsetAtt.EndOffset, "Wrong endOffset for token \"" + termText + "\"");
        Assert.AreEqual(positionIncrements[produced], posIncrAtt.PositionIncrement, "Wrong positionIncrement for token \"" + termText + "\"");
        Assert.AreEqual(types[produced], typeAtt.Type, "Wrong type for token \"" + termText + "\"");
        produced++;
    }
    Assert.AreEqual(tokensToCompare.Length, produced, "ShingleFilter outputted wrong # of tokens. (# output = " + produced + "; # expected =" + tokensToCompare.Length + ")");
}
public virtual void TestTwoTrailingHoles()
{
    // Input "purple wizard of the" with both stopwords removed: two trailing
    // holes remain and the last bigram is padded with the default "_" filler.
    var tokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };
    var shingles = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 2);
    AssertTokenStreamContents(shingles,
        new string[] { "purple", "purple wizard", "wizard", "wizard _" },
        new int[] { 0, 0, 7, 7 },
        new int[] { 6, 13, 13, 20 },
        new int[] { 1, 0, 1, 0 },
        20);
}
public void Test6GramFilterNoPositions()
{
    // PositionFilter flattens the increments, so only token text/order is
    // checked against the 6-gram expectations.
    var sixGrams = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
    AssertTokenStreamContents(new PositionFilter(sixGrams), SIX_GRAM_NO_POSITIONS_TOKENS, SIX_GRAM_NO_POSITIONS_INCREMENTS);
}
/// <summary>
/// Builds the analysis chain under test: an edge n-gram tokenizer (2..94)
/// feeding 5-wide shingles, feeding an n-gram token filter (55..83).
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
    // Commented-out SopTokenFilter debug wrappers removed; re-insert one
    // around any stage to dump tokens while debugging.
    TokenStream stream = new ShingleFilter(tokenizer, 5);
    stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
    return new TokenStreamComponents(tokenizer, stream);
}