A ShingleFilter constructs shingles (token n-grams) from a token stream. In other words, it creates combinations of tokens as a single token.

For example, the sentence "please divide this sentence into shingles" might be tokenized into shingles "please divide", "divide this", "this sentence", "sentence into", and "into shingles".

This filter handles position increments > 1 by inserting filler tokens (tokens with termtext "_"). It does not handle a position increment of 0.

Inheritance: Lucene.Net.Analysis.TokenFilter
Beispiel #1
0
        /// <summary>
        /// Builds a <see cref="ShingleFilter"/> of sizes [minSize, maxSize] over the
        /// canned input tokens, applies the unigram setting, and delegates output
        /// verification to <c>shingleFilterTestCommon</c>.
        /// </summary>
        protected internal virtual void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, int[] positionIncrements, string[] types, bool outputUnigrams)
        {
            ShingleFilter shingleFilter = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
            shingleFilter.SetOutputUnigrams(outputUnigrams);

            shingleFilterTestCommon(shingleFilter, tokensToCompare, positionIncrements, types);
        }
Beispiel #2
0
        /// <summary>
        /// Tri-shingles over input with two trailing position holes, exercising four
        /// filler-token configurations: custom ("--"), empty string, null (falls back
        /// to the same output as empty), and null filler with a null token separator
        /// (tokens are concatenated with nothing between them).
        /// </summary>
        public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()
        {
            // Analyzing "purple wizard of the", where of and the are removed as
            // stopwords, leaving two trailing holes:
            Token[]       inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };
            ShingleFilter filter      = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);

            // Case 1: explicit "--" filler appears once per hole.
            filter.SetFillerToken("--");

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);

            // Case 2: empty-string filler leaves only the separator for each hole.
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            filter.SetFillerToken("");

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);


            // Case 3: null filler behaves like the empty string.
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            filter.SetFillerToken(null);

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);


            // Case 4: null filler AND null separator collapse shingles to bare
            // concatenations ("purplewizard"), and hole-only shingles to "wizard".
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            filter.SetFillerToken(null);
            filter.SetTokenSeparator(null);

            AssertTokenStreamContents(filter, new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);
        }
Beispiel #3
0
 /// <summary>
 /// Stores the owning filter and the captured attribute source, and caches the
 /// source's term and offset attribute views for later reads.
 /// </summary>
 public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
 {
     this.outerInstance = outerInstance;
     this.attSource     = attSource;
     this.termAtt       = attSource.GetAttribute <ICharTermAttribute>();
     this.offsetAtt     = attSource.GetAttribute <IOffsetAttribute>();
 }
Beispiel #4
0
        /// <summary>
        /// Bi-shingles over input ending in one position hole; the expected final
        /// shingle "wizard _" shows the hole padded with the default "_" filler.
        /// </summary>
        public virtual void TestTrailingHole2()
        {
            // Analyzing "purple wizard of", where of is removed as a
            // stopword leaving a trailing hole:
            Token[]       inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };
            ShingleFilter filter      = new ShingleFilter(new CannedTokenStream(1, 16, inputTokens), 2, 2);

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "wizard", "wizard _" }, new int[] { 0, 0, 7, 7 }, new int[] { 6, 13, 13, 16 }, new int[] { 1, 0, 1, 0 }, 16);
        }
Beispiel #5
0
        /// <summary>
        /// Verifies the filter chain can be reused: after consuming the stream once,
        /// resetting the tokenizer's reader must reproduce identical output.
        /// </summary>
        public virtual void TestReset()
        {
            Tokenizer   wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
            TokenStream filter      = new ShingleFilter(wsTokenizer, 2);

            AssertTokenStreamContents(filter, new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new string[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
            // Re-point the tokenizer at fresh input and expect the same contents again.
            wsTokenizer.SetReader(new StringReader("please divide this sentence"));
            AssertTokenStreamContents(filter, new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new string[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
        }
        /// <summary>
        /// Builds the analysis chain for a field: standard tokenization, standard
        /// filtering, lowercasing, then shingles up to 4 tokens long.
        /// </summary>
        /// <param name="fieldName">Field being analyzed (unused by this chain).</param>
        /// <param name="reader">Source text to tokenize.</param>
        /// <returns>The fully wrapped token stream.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Need to account for the | breaks in relatedcontent.
            var tokenizedInput = new LowerCaseFilter(new StandardFilter(new StandardTokenizer(_version, reader)));
            // Removed dead commented-out variants; shingle size 4 is the live config.
            return new ShingleFilter(tokenizedInput, 4);
        }
Beispiel #7
0
        /// <summary>
        /// Wraps the incoming stream in a <see cref="ShingleFilter"/> configured
        /// from this factory's stored settings (property-based API).
        /// </summary>
        public override TokenStream Create(TokenStream input)
        {
            var shingles = new ShingleFilter(input, minShingleSize, maxShingleSize)
            {
                OutputUnigrams             = outputUnigrams,
                OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
                TokenSeparator             = tokenSeparator,
                FillerToken                = fillerToken
            };

            return shingles;
        }
Beispiel #8
0
        /// <summary>
        /// Wraps the incoming stream in a <see cref="ShingleFilter"/> configured
        /// from this factory's stored settings (setter-based API).
        /// </summary>
        public override TokenStream Create(TokenStream input)
        {
            var filter = new ShingleFilter(input, minShingleSize, maxShingleSize);

            filter.SetOutputUnigrams(outputUnigrams);
            filter.SetOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
            filter.SetTokenSeparator(tokenSeparator);
            filter.SetFillerToken(fillerToken);

            return filter;
        }
        /// <summary>
        /// Wraps the analyzer components' stream in a <see cref="ShingleFilter"/>
        /// configured from this wrapper's stored settings.
        /// </summary>
        protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
        {
            // The constructor already fixes min/max shingle size; the original
            // redundant SetMinShingleSize/SetMaxShingleSize calls are removed.
            ShingleFilter filter = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize);

            filter.SetTokenSeparator(tokenSeparator);
            filter.SetOutputUnigrams(outputUnigrams);
            filter.SetOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
            filter.SetFillerToken(fillerToken);
            return new TokenStreamComponents(components.Tokenizer, filter);
        }
Beispiel #10
0
        /// <summary>
        /// Wraps the analyzer components' stream in a <see cref="ShingleFilter"/>
        /// configured from this wrapper's stored settings (property-based API).
        /// </summary>
        protected internal override TokenStreamComponents wrapComponents(string fieldName, TokenStreamComponents components)
        {
            ShingleFilter filter = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize);

            // NOTE(review): min/max are set again here even though the constructor
            // already received them — presumably harmless but redundant; confirm.
            filter.MinShingleSize             = minShingleSize;
            filter.MaxShingleSize             = maxShingleSize;
            filter.TokenSeparator             = tokenSeparator;
            filter.OutputUnigrams             = outputUnigrams;
            filter.OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
            filter.FillerToken = fillerToken;
            return(new TokenStreamComponents(components.Tokenizer, filter));
        }
Beispiel #11
0
        /// <summary>
        /// Flattens the expected tokens into parallel text/start-offset/end-offset
        /// arrays and asserts the filter's output matches them.
        /// </summary>
        protected internal virtual void shingleFilterTestCommon(ShingleFilter filter, Token[] tokensToCompare, int[] positionIncrements, string[] types)
        {
            int count = tokensToCompare.Length;
            string[] expectedText = new string[count];
            int[] expectedStarts = new int[count];
            int[] expectedEnds = new int[count];

            for (int i = 0; i < count; i++)
            {
                Token expected = tokensToCompare[i];
                expectedText[i] = new string(expected.Buffer, 0, expected.Length);
                expectedStarts[i] = expected.StartOffset;
                expectedEnds[i] = expected.EndOffset;
            }

            AssertTokenStreamContents(filter, expectedText, expectedStarts, expectedEnds, types, positionIncrements);
        }
        /// <summary>
        /// Builds a token stream for the field by wrapping the default analyzer's
        /// stream in a <see cref="ShingleFilter"/> with the stored max shingle size
        /// and unigram setting.
        /// </summary>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream wrapped;

            try
            {
                // Prefer the reusable stream; fall back to a fresh one on IO failure.
                wrapped = defaultAnalyzer.ReusableTokenStream(fieldName, reader);
            }
            catch (IOException)
            {
                wrapped = defaultAnalyzer.TokenStream(fieldName, reader);
            }
            ShingleFilter filter = new ShingleFilter(wrapped);

            filter.SetMaxShingleSize(maxShingleSize);
            filter.SetOutputUnigrams(outputUnigrams);
            return(filter);
        }
 /// <summary>
 /// Chains LetterTokenizer -> ShingleFilter -> EdgeNGramTokenFilter(7..10) and
 /// checks the surviving tokens' text, offsets, position increments, position
 /// lengths, and final offset.
 /// </summary>
 public virtual void TestGraphs()
 {
     TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
     tk = new ShingleFilter(tk);
     tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
     AssertTokenStreamContents(tk, new string[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" }, new int[] { 6, 11, 11, 14 }, new int[] { 13, 19, 19, 21 }, new int[] { 3, 1, 0, 1 }, new int[] { 2, 2, 2, 2 }, 23);
 }
        /// <summary>
        /// Flattens the expected tokens into parallel text/start-offset/end-offset
        /// arrays (method-call attribute API) and asserts the filter's output.
        /// </summary>
        protected internal virtual void shingleFilterTestCommon(ShingleFilter filter, Token[] tokensToCompare, int[] positionIncrements, string[] types)
        {
            var expectedText = new string[tokensToCompare.Length];
            var expectedStarts = new int[tokensToCompare.Length];
            var expectedEnds = new int[tokensToCompare.Length];

            int i = 0;
            foreach (Token token in tokensToCompare)
            {
                expectedText[i] = new string(token.Buffer(), 0, token.Length);
                expectedStarts[i] = token.StartOffset();
                expectedEnds[i] = token.EndOffset();
                i++;
            }

            AssertTokenStreamContents(filter, expectedText, expectedStarts, expectedEnds, types, positionIncrements);
        }
        /// <summary>
        /// Bi-shingles over input ending in one position hole; the expected final
        /// shingle "wizard _" shows the hole padded with the default "_" filler.
        /// </summary>
        public virtual void TestTrailingHole2()
        {
            // Analyzing "purple wizard of", where of is removed as a
            // stopword leaving a trailing hole:
            Token[] inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };
            ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 16, inputTokens), 2, 2);

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "wizard", "wizard _" }, new int[] { 0, 0, 7, 7 }, new int[] { 6, 13, 13, 16 }, new int[] { 1, 0, 1, 0 }, 16);
        }
 /// <summary>
 /// Wraps the analyzer components' stream in a <see cref="ShingleFilter"/>
 /// configured from this wrapper's stored settings.
 /// </summary>
 protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
 {
     var shingles = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize)
     {
         MinShingleSize = minShingleSize,
         MaxShingleSize = maxShingleSize,
         TokenSeparator = tokenSeparator,
         OutputUnigrams = outputUnigrams,
         OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
         FillerToken = fillerToken
     };
     return new TokenStreamComponents(components.Tokenizer, shingles);
 }
Beispiel #17
0
 /// <summary>
 /// Initializes the circular shingle-size sequence: starts at 1 when the filter
 /// emits unigrams, otherwise at the filter's minimum shingle size.
 /// </summary>
 public CircularSequence(ShingleFilter shingleFilter)
 {
     this.outerInstance = shingleFilter;
     minValue           = shingleFilter.outputUnigrams ? 1 : shingleFilter.minShingleSize;
     Reset();
 }
Beispiel #18
0
 /// <summary>
 /// Initializes the circular shingle-size sequence: starts at 1 when the filter
 /// emits unigrams, otherwise at the filter's minimum shingle size.
 /// </summary>
 public CircularSequence(ShingleFilter outerInstance)
 {
     this.outerInstance = outerInstance;
     minValue           = outerInstance.outputUnigrams ? 1 : outerInstance.minShingleSize;
     Reset();
 }
        /// <summary>
        /// Tri-shingles over input with two trailing position holes, exercising four
        /// filler-token configurations via the property API: custom ("--"), empty
        /// string, null (same output as empty), and null filler with a null token
        /// separator (tokens concatenated with nothing between them).
        /// </summary>
        public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()
        {
            // Analyzing "purple wizard of the", where of and the are removed as
            // stopwords, leaving two trailing holes:
            Token[] inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };
            // Case 1: explicit "--" filler appears once per hole.
            ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            filter.FillerToken = "--";

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);

            // Case 2: empty-string filler leaves only the separator for each hole.
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            filter.FillerToken = "";

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);


            // Case 3: null filler behaves like the empty string.
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            filter.FillerToken = null;

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);


            // Case 4: null filler AND null separator collapse shingles to bare
            // concatenations ("purplewizard"), and hole-only shingles to "wizard".
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            filter.FillerToken = null;
            filter.TokenSeparator = null;

            AssertTokenStreamContents(filter, new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);
        }
 /// <summary>
 /// Builds a <see cref="ShingleFilter"/> of sizes [minSize, maxSize] with an
 /// explicit token separator, applies the unigram setting, and delegates output
 /// verification to <c>shingleFilterTestCommon</c>.
 /// </summary>
 protected internal virtual void shingleFilterTest(string tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, int[] positionIncrements, string[] types, bool outputUnigrams)
 {
     var shingleFilter = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize)
     {
         TokenSeparator = tokenSeparator,
         OutputUnigrams = outputUnigrams
     };

     shingleFilterTestCommon(shingleFilter, tokensToCompare, positionIncrements, types);
 }
        /// <summary>
        /// 6-gram shingles fed through PositionFilter; the pragmas suppress the
        /// obsolete-API warning (612/618) for PositionFilter only.
        /// </summary>
        public virtual void Test6GramFilterNoPositions()
        {

            ShingleFilter filter = new ShingleFilter(new TestTokenStream(this, TEST_TOKEN), 6);
            AssertTokenStreamContents
#pragma warning disable 612, 618
                (new PositionFilter(filter),
#pragma warning restore 612, 618
                SIX_GRAM_NO_POSITIONS_TOKENS, SIX_GRAM_NO_POSITIONS_INCREMENTS);
        }
 /// <summary>
 /// Initializes the circular shingle-size sequence: starts at 1 when the filter
 /// emits unigrams, otherwise at the filter's minimum shingle size.
 /// </summary>
 public CircularSequence(ShingleFilter outerInstance)
 {
     this.outerInstance = outerInstance;
     minValue = outerInstance.outputUnigrams ? 1 : outerInstance.minShingleSize;
     reset();
 }
 /// <summary>
 /// Stores the owning filter and the captured attribute source, and caches the
 /// source's term and offset attribute views for later reads.
 /// </summary>
 public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
 {
     this.outerInstance = outerInstance;
     this.attSource = attSource;
     this.termAtt = attSource.GetAttribute<ICharTermAttribute>();
     this.offsetAtt = attSource.GetAttribute<IOffsetAttribute>();
 }
 /// <summary>
 /// Builds a token stream for the field by wrapping the default analyzer's
 /// stream in a <see cref="ShingleFilter"/> with the stored max shingle size
 /// and unigram setting.
 /// </summary>
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream wrapped;
     try
     {
         // Prefer the reusable stream; fall back to a fresh one on IO failure.
         wrapped = defaultAnalyzer.ReusableTokenStream(fieldName, reader);
     }
     catch (IOException)
     {
         wrapped = defaultAnalyzer.TokenStream(fieldName, reader);
     }
     ShingleFilter filter = new ShingleFilter(wrapped);
     filter.SetMaxShingleSize(maxShingleSize);
     filter.SetOutputUnigrams(outputUnigrams);
     return filter;
 }
 /// <summary>
 /// Verifies the filter chain can be reused: after consuming the stream once,
 /// resetting the tokenizer's Reader must reproduce identical output.
 /// </summary>
 public virtual void TestReset()
 {
     Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
     TokenStream filter = new ShingleFilter(wsTokenizer, 2);
     AssertTokenStreamContents(filter, new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new string[] { TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
     // Re-point the tokenizer at fresh input and expect the same contents again.
     wsTokenizer.Reader = new StringReader("please divide this sentence");
     AssertTokenStreamContents(filter, new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new string[] { TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
 }
        /// <summary>
        /// Verifies the filter chain can be reused (legacy API): after consuming
        /// the stream once, calling Reset with a fresh reader must reproduce
        /// identical output.
        /// </summary>
        public void TestReset()
        {
            Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
            TokenStream filter = new ShingleFilter(wsTokenizer, 2);

            AssertTokenStreamContents(filter,
                                      new[]
                                          {
                                              "please", "please divide", "divide", "divide this", "this",
                                              "this sentence",
                                              "sentence"
                                          },
                                      new[] {0, 0, 7, 7, 14, 14, 19}, new[] {6, 13, 13, 18, 18, 27, 27},
                                      new[]
                                          {
                                              TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE,
                                              "shingle", TypeAttribute.DEFAULT_TYPE, "shingle",
                                              TypeAttribute.DEFAULT_TYPE
                                          },
                                      new[] {1, 0, 1, 0, 1, 0, 1}
                );

            // Reset the tokenizer with fresh input and expect the same contents again.
            wsTokenizer.Reset(new StringReader("please divide this sentence"));

            AssertTokenStreamContents(filter,
                                      new[]
                                          {
                                              "please", "please divide", "divide", "divide this", "this",
                                              "this sentence",
                                              "sentence"
                                          },
                                      new[] {0, 0, 7, 7, 14, 14, 19}, new[] {6, 13, 13, 18, 18, 27, 27},
                                      new[]
                                          {
                                              TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE,
                                              "shingle", TypeAttribute.DEFAULT_TYPE, "shingle",
                                              TypeAttribute.DEFAULT_TYPE
                                          },
                                      new[] {1, 0, 1, 0, 1, 0, 1}
                );
        }
        /// <summary>
        /// Runs a <see cref="ShingleFilter"/> of max size <paramref name="maxSize"/>
        /// over the canned input and compares every emitted token's text, offsets,
        /// position increment, and type against the expected values, attribute by
        /// attribute; finally asserts the total token count matches.
        /// </summary>
        protected void ShingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
                                         int[] positionIncrements, String[] types, bool outputUnigrams)
        {
            var filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
            filter.SetOutputUnigrams(outputUnigrams);

            // Attribute views read once per emitted token inside the loop below.
            var termAtt = filter.AddAttribute<ITermAttribute>();
            var offsetAtt = filter.AddAttribute<IOffsetAttribute>();
            var posIncrAtt = filter.AddAttribute<IPositionIncrementAttribute>();
            var typeAtt = filter.AddAttribute<ITypeAttribute>();

            int i = 0;
            while (filter.IncrementToken())
            {
                Assert.IsTrue(i < tokensToCompare.Length, "ShingleFilter outputted more tokens than expected");

                String termText = termAtt.Term;
                String goldText = tokensToCompare[i].Term;

                Assert.AreEqual(goldText, termText, "Wrong termText");
                Assert.AreEqual(tokensToCompare[i].StartOffset, offsetAtt.StartOffset,
                                "Wrong startOffset for token \"" + termText + "\"");
                Assert.AreEqual(tokensToCompare[i].EndOffset, offsetAtt.EndOffset,
                                "Wrong endOffset for token \"" + termText + "\"");
                Assert.AreEqual(positionIncrements[i], posIncrAtt.PositionIncrement,
                                "Wrong positionIncrement for token \"" + termText + "\"");
                Assert.AreEqual(types[i], typeAtt.Type, "Wrong type for token \"" + termText + "\"");

                i++;
            }

            // Guard against the filter emitting fewer tokens than expected.
            Assert.AreEqual(tokensToCompare.Length, i,
                            "ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" +
                            tokensToCompare.Length + ")");
        }
        /// <summary>
        /// Bi-shingles over input ending in two position holes; the expected final
        /// shingle "wizard _" shows a hole padded with the default "_" filler.
        /// </summary>
        public virtual void TestTwoTrailingHoles()
        {
            // Analyzing "purple wizard of the", where of and the are removed as
            // stopwords, leaving two trailing holes:
            Token[] inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };
            ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 2);

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "wizard", "wizard _" }, new int[] { 0, 0, 7, 7 }, new int[] { 6, 13, 13, 20 }, new int[] { 1, 0, 1, 0 }, 20);
        }
 /// <summary>
 /// 6-gram shingles fed through PositionFilter and compared against the
 /// expected no-positions token/increment fixtures.
 /// </summary>
 public void Test6GramFilterNoPositions()
 {
     ShingleFilter filter = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
     AssertTokenStreamContents(new PositionFilter(filter),
                               SIX_GRAM_NO_POSITIONS_TOKENS,
                               SIX_GRAM_NO_POSITIONS_INCREMENTS);
 }
 /// <summary>
 /// Builds the analyzer components: edge n-gram tokenizer (2..94 chars),
 /// 5-token shingles, then n-gram filtering (55..83 chars).
 /// </summary>
 /// <param name="fieldName">Field being analyzed (unused by this chain).</param>
 /// <param name="reader">Source text to tokenize.</param>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
     // Commented-out SopTokenFilter debug wrappers removed; re-add locally if
     // stream contents need to be dumped while debugging.
     TokenStream stream = new ShingleFilter(tokenizer, 5);
     stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
     return new TokenStreamComponents(tokenizer, stream);
 }