/// <summary>
/// Verifies 2- to 3-token shingles over an input that ends in two holes,
/// once for each of four filler/separator configurations: a visible filler
/// ("--"), an empty filler, a null filler, and a null filler combined with a
/// null separator.
/// </summary>
public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()
{
    // Analyzing "purple wizard of the", where "of" and "the" are removed as
    // stopwords, leaving two trailing holes:
    Token[] inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };

    // NOTE(review): the leading (2, 20) arguments of CannedTokenStream are presumably the
    // trailing position increment (the two holes) and the final offset - confirm against
    // CannedTokenStream.

    // Visible filler: each trailing hole shows up as separator + "--".
    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.SetFillerToken("--");
    AssertTokenStreamContents(
        filter,
        new string[] { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);

    // Empty filler: only the separators remain, so a shingle spanning both holes ends
    // with TWO spaces ("wizard" + " " + "" + " " + ""), mirroring "wizard -- --" above.
    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.SetFillerToken("");
    AssertTokenStreamContents(
        filter,
        new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);

    // Null filler: expected to produce the same terms as the empty filler.
    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.SetFillerToken(null);
    AssertTokenStreamContents(
        filter,
        new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);

    // Null filler and null separator: holes contribute no characters at all, so the
    // shingle text collapses to the concatenated real tokens.
    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
    filter.SetFillerToken(null);
    filter.SetTokenSeparator(null);
    AssertTokenStreamContents(
        filter,
        new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" },
        new int[] { 0, 0, 0, 7, 7, 7 },
        new int[] { 6, 13, 20, 13, 20, 20 },
        new int[] { 1, 0, 0, 1, 0, 0 },
        20);
}
/// <summary>
/// Builds a <see cref="ShingleFilter"/> over a canned stream of
/// <paramref name="tokensToShingle"/>, configures it with the given separator and
/// unigram setting, and hands it to <c>shingleFilterTestCommon</c> for comparison.
/// </summary>
/// <param name="tokenSeparator">Separator passed to <c>SetTokenSeparator</c>.</param>
/// <param name="minSize">Minimum shingle size passed to the filter constructor.</param>
/// <param name="maxSize">Maximum shingle size passed to the filter constructor.</param>
/// <param name="tokensToShingle">Input tokens fed to the filter.</param>
/// <param name="tokensToCompare">Expected tokens, forwarded to the common check.</param>
/// <param name="positionIncrements">Expected position increments, forwarded to the common check.</param>
/// <param name="types">Expected token types, forwarded to the common check.</param>
/// <param name="outputUnigrams">Value passed to <c>SetOutputUnigrams</c>.</param>
protected internal virtual void shingleFilterTest(
    string tokenSeparator,
    int minSize,
    int maxSize,
    Token[] tokensToShingle,
    Token[] tokensToCompare,
    int[] positionIncrements,
    string[] types,
    bool outputUnigrams)
{
    // Source stream that replays the supplied tokens.
    var cannedInput = new CannedTokenStream(tokensToShingle);

    var shingler = new ShingleFilter(cannedInput, minSize, maxSize);
    shingler.SetTokenSeparator(tokenSeparator);
    shingler.SetOutputUnigrams(outputUnigrams);

    shingleFilterTestCommon(shingler, tokensToCompare, positionIncrements, types);
}
/// <summary>
/// Wraps <paramref name="input"/> in a <see cref="ShingleFilter"/> configured from this
/// instance's shingle sizes, unigram options, token separator and filler token.
/// </summary>
/// <param name="input">The token stream to wrap.</param>
/// <returns>The configured <see cref="ShingleFilter"/>.</returns>
public override TokenStream Create(TokenStream input)
{
    var shingleFilter = new ShingleFilter(input, minShingleSize, maxShingleSize);

    // Apply the remaining settings in the same order as before.
    shingleFilter.SetOutputUnigrams(outputUnigrams);
    shingleFilter.SetOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    shingleFilter.SetTokenSeparator(tokenSeparator);
    shingleFilter.SetFillerToken(fillerToken);

    return shingleFilter;
}
/// <summary>
/// Appends a <see cref="ShingleFilter"/> to the wrapped components' token stream and
/// returns new components that keep the original tokenizer.
/// </summary>
/// <param name="fieldName">Field name; not consulted by this implementation.</param>
/// <param name="components">The components whose token stream is wrapped.</param>
/// <returns>Components pairing the original tokenizer with the shingle filter.</returns>
protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
{
    var shingler = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize);

    // Sizes are set again after construction, exactly as the original code did.
    shingler.SetMinShingleSize(minShingleSize);
    shingler.SetMaxShingleSize(maxShingleSize);

    shingler.SetTokenSeparator(tokenSeparator);
    shingler.SetOutputUnigrams(outputUnigrams);
    shingler.SetOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    shingler.SetFillerToken(fillerToken);

    var wrapped = new TokenStreamComponents(components.Tokenizer, shingler);
    return wrapped;
}