A ShingleFilter constructs shingles (token n-grams) from a token stream. In other words, it merges runs of adjacent tokens into single composite tokens.

For example, the sentence "please divide this sentence into shingles" might be tokenized into shingles "please divide", "divide this", "this sentence", "sentence into", and "into shingles".

This filter handles position increments > 1 by inserting filler tokens (tokens with termtext "_"). It does not handle a position increment of 0.

Inheritance: Lucene.Net.Analysis.TokenFilter
コード例 #1
0
        /// <summary>
        /// Shingles <paramref name="tokensToShingle"/> with the given size bounds
        /// and unigram flag, then delegates output verification to
        /// shingleFilterTestCommon.
        /// </summary>
        protected internal virtual void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, int[] positionIncrements, string[] types, bool outputUnigrams)
        {
            var source   = new CannedTokenStream(tokensToShingle);
            var shingler = new ShingleFilter(source, minSize, maxSize);

            shingler.SetOutputUnigrams(outputUnigrams);
            shingleFilterTestCommon(shingler, tokensToCompare, positionIncrements, types);
        }
コード例 #2
0
        /// <summary>
        /// Verifies filler-token handling for trailing holes: a custom filler,
        /// an empty filler, a null filler, and null filler plus null separator.
        /// </summary>
        public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()
        {
            // Analyzing "purple wizard of the", where of and the are removed as
            // stopwords, leaving two trailing holes:
            Token[]       inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };
            ShingleFilter filter      = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);

            // Scenario 1: custom filler "--" stands in for each removed stopword.
            filter.SetFillerToken("--");

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);

            // Scenario 2: empty filler — separators remain, yielding trailing spaces.
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            filter.SetFillerToken("");

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);

            // Scenario 3: null filler — same expected output as the empty filler.
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            filter.SetFillerToken(null);

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);

            // Scenario 4: null filler and null separator — tokens are concatenated
            // directly and hole positions leave no trace in the term text.
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            filter.SetFillerToken(null);
            filter.SetTokenSeparator(null);

            AssertTokenStreamContents(filter, new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);
        }
コード例 #3
0
 /// <summary>
 /// Wraps one attribute source slot of the shingle input window, caching its
 /// term and offset attributes for repeated access.
 /// </summary>
 public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
 {
     this.outerInstance = outerInstance;
     this.attSource     = attSource;
     this.termAtt       = attSource.GetAttribute <ICharTermAttribute>();
     this.offsetAtt     = attSource.GetAttribute <IOffsetAttribute>();
 }
コード例 #4
0
        /// <summary>
        /// Verifies that a single trailing hole (a removed stopword at the end of
        /// input) is filled with the default "_" filler in bigram shingles.
        /// </summary>
        public virtual void TestTrailingHole2()
        {
            // Analyzing "purple wizard of", where of is removed as a
            // stopword leaving a trailing hole:
            Token[]       inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };
            ShingleFilter filter      = new ShingleFilter(new CannedTokenStream(1, 16, inputTokens), 2, 2);

            // "wizard _" covers the hole; its end offset (16) extends to the end
            // of the original text.
            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "wizard", "wizard _" }, new int[] { 0, 0, 7, 7 }, new int[] { 6, 13, 13, 16 }, new int[] { 1, 0, 1, 0 }, 16);
        }
コード例 #5
0
        /// <summary>
        /// Verifies that a ShingleFilter chain produces identical output after the
        /// underlying tokenizer's reader is replaced and the stream is re-consumed.
        /// </summary>
        public virtual void TestReset()
        {
            Tokenizer   wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
            TokenStream filter      = new ShingleFilter(wsTokenizer, 2);

            AssertTokenStreamContents(filter, new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new string[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
            // Re-point the tokenizer at fresh input and expect the exact same stream.
            wsTokenizer.SetReader(new StringReader("please divide this sentence"));
            AssertTokenStreamContents(filter, new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new string[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
        }
コード例 #6
0
        /// <summary>
        /// Builds the analysis chain for a field: standard tokenization, standard
        /// filtering, lower-casing, then 4-gram shingling.
        /// </summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            //Need to account for the | breaks in relatedcontent
            var tokenizedInput = new LowerCaseFilter(new StandardFilter(new StandardTokenizer(_version, reader)));

            // Shingle size 4; unigram output is left at the filter's default
            // (previously-disabled SetOutputUnigrams(false) was intentionally kept off).
            var output = new ShingleFilter(tokenizedInput, 4);
            return output;
        }
コード例 #7
0
        /// <summary>
        /// Wraps <paramref name="input"/> in a ShingleFilter configured from the
        /// factory's stored shingle settings.
        /// </summary>
        public override TokenStream Create(TokenStream input)
        {
            // Object initializer assigns the properties in the same order as the
            // original sequential assignments.
            return new ShingleFilter(input, minShingleSize, maxShingleSize)
            {
                OutputUnigrams = outputUnigrams,
                OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
                TokenSeparator = tokenSeparator,
                FillerToken = fillerToken
            };
        }
コード例 #8
0
        /// <summary>
        /// Creates a ShingleFilter over <paramref name="input"/>, applying the
        /// factory's stored settings via its setter methods.
        /// </summary>
        public override TokenStream Create(TokenStream input)
        {
            ShingleFilter shingles = new ShingleFilter(input, minShingleSize, maxShingleSize);

            shingles.SetOutputUnigrams(outputUnigrams);
            shingles.SetOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
            shingles.SetTokenSeparator(tokenSeparator);
            shingles.SetFillerToken(fillerToken);

            return shingles;
        }
コード例 #9
0
        /// <summary>
        /// Appends a ShingleFilter, configured from the wrapper's stored settings,
        /// to the existing analysis chain.
        /// </summary>
        protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
        {
            ShingleFilter shingles = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize);

            // Setter order mirrors the original (min before max, then the rest).
            shingles.SetMinShingleSize(minShingleSize);
            shingles.SetMaxShingleSize(maxShingleSize);
            shingles.SetTokenSeparator(tokenSeparator);
            shingles.SetOutputUnigrams(outputUnigrams);
            shingles.SetOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
            shingles.SetFillerToken(fillerToken);

            return new TokenStreamComponents(components.Tokenizer, shingles);
        }
コード例 #10
0
        /// <summary>
        /// Appends a ShingleFilter, configured from the wrapper's stored settings,
        /// to the existing analysis chain.
        /// </summary>
        protected internal override TokenStreamComponents wrapComponents(string fieldName, TokenStreamComponents components)
        {
            // Object initializer preserves the original property-assignment order.
            var shingles = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize)
            {
                MinShingleSize = minShingleSize,
                MaxShingleSize = maxShingleSize,
                TokenSeparator = tokenSeparator,
                OutputUnigrams = outputUnigrams,
                OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
                FillerToken = fillerToken
            };
            return new TokenStreamComponents(components.Tokenizer, shingles);
        }
コード例 #11
0
        /// <summary>
        /// Drains <paramref name="filter"/> and checks its output against the
        /// expected tokens' text, offsets, types and position increments.
        /// </summary>
        protected internal virtual void shingleFilterTestCommon(ShingleFilter filter, Token[] tokensToCompare, int[] positionIncrements, string[] types)
        {
            int count = tokensToCompare.Length;
            var expectedTerms  = new string[count];
            var expectedStarts = new int[count];
            var expectedEnds   = new int[count];

            // Project the expected Token objects into the parallel arrays
            // consumed by AssertTokenStreamContents.
            for (int idx = 0; idx < count; idx++)
            {
                Token expected      = tokensToCompare[idx];
                expectedTerms[idx]  = new string(expected.Buffer, 0, expected.Length);
                expectedStarts[idx] = expected.StartOffset;
                expectedEnds[idx]   = expected.EndOffset;
            }

            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds, types, positionIncrements);
        }
コード例 #12
0
        /// <summary>
        /// Wraps the default analyzer's stream for <paramref name="fieldName"/> in
        /// a ShingleFilter using the stored max size and unigram flag.
        /// </summary>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream wrapped;

            // Prefer the reusable stream; if it fails with an IOException, fall
            // back to creating a fresh stream.
            try
            {
                wrapped = defaultAnalyzer.ReusableTokenStream(fieldName, reader);
            }
            catch (IOException)
            {
                wrapped = defaultAnalyzer.TokenStream(fieldName, reader);
            }
            ShingleFilter filter = new ShingleFilter(wrapped);

            filter.SetMaxShingleSize(maxShingleSize);
            filter.SetOutputUnigrams(outputUnigrams);
            return(filter);
        }
コード例 #13
0
 /// <summary>
 /// Verifies that ShingleFilter composes correctly with a tokenizer upstream
 /// and an EdgeNGramTokenFilter downstream in a token-graph chain.
 /// </summary>
 public virtual void TestGraphs()
 {
     TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
     tk = new ShingleFilter(tk);
     tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
     // Expected terms, start/end offsets, position increments, the fourth int[]
     // (presumably position lengths — confirm against the assert overload), and
     // the final offset 23.
     AssertTokenStreamContents(tk, new string[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" }, new int[] { 6, 11, 11, 14 }, new int[] { 13, 19, 19, 21 }, new int[] { 3, 1, 0, 1 }, new int[] { 2, 2, 2, 2 }, 23);
 }
コード例 #14
0
        /// <summary>
        /// Drains <paramref name="filter"/> and checks its output against the
        /// expected tokens' text, offsets, types and position increments.
        /// </summary>
        protected internal virtual void shingleFilterTestCommon(ShingleFilter filter, Token[] tokensToCompare, int[] positionIncrements, string[] types)
        {
            int count = tokensToCompare.Length;
            var expectedTerms = new string[count];
            var expectedStarts = new int[count];
            var expectedEnds = new int[count];

            // Build the parallel expectation arrays consumed by
            // AssertTokenStreamContents (this API exposes Buffer()/StartOffset()
            // as methods rather than properties).
            for (int idx = 0; idx < count; idx++)
            {
                Token gold = tokensToCompare[idx];
                expectedTerms[idx] = new string(gold.Buffer(), 0, gold.Length);
                expectedStarts[idx] = gold.StartOffset();
                expectedEnds[idx] = gold.EndOffset();
            }

            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds, types, positionIncrements);
        }
コード例 #15
0
        /// <summary>
        /// Verifies that a single trailing hole (a removed stopword at the end of
        /// input) is filled with the default "_" filler in bigram shingles.
        /// </summary>
        public virtual void TestTrailingHole2()
        {
            // Analyzing "purple wizard of", where of is removed as a
            // stopword leaving a trailing hole:
            Token[] inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };
            ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 16, inputTokens), 2, 2);

            // "wizard _" covers the hole; its end offset (16) reaches the end of
            // the original text.
            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "wizard", "wizard _" }, new int[] { 0, 0, 7, 7 }, new int[] { 6, 13, 13, 16 }, new int[] { 1, 0, 1, 0 }, 16);
        }
コード例 #16
0
 /// <summary>
 /// Appends a ShingleFilter, configured from the wrapper's stored settings,
 /// to the existing analysis chain.
 /// </summary>
 protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
 {
     ShingleFilter shingles = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize);

     // Property-assignment order mirrors the original implementation.
     shingles.MinShingleSize = minShingleSize;
     shingles.MaxShingleSize = maxShingleSize;
     shingles.TokenSeparator = tokenSeparator;
     shingles.OutputUnigrams = outputUnigrams;
     shingles.OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
     shingles.FillerToken = fillerToken;

     return new TokenStreamComponents(components.Tokenizer, shingles);
 }
コード例 #17
0
 /// <summary>
 /// Initializes the circular shingle-size sequence for its owning filter; the
 /// smallest emitted size is 1 when unigrams are output, else minShingleSize.
 /// </summary>
 public CircularSequence(ShingleFilter shingleFilter)
 {
     outerInstance = shingleFilter;
     if (shingleFilter.outputUnigrams)
     {
         minValue = 1;
     }
     else
     {
         minValue = shingleFilter.minShingleSize;
     }
     Reset();
 }
コード例 #18
0
 /// <summary>
 /// Initializes the circular shingle-size sequence for its owning filter; the
 /// smallest emitted size is 1 when unigrams are output, else minShingleSize.
 /// </summary>
 public CircularSequence(ShingleFilter outerInstance)
 {
     this.outerInstance = outerInstance;
     if (outerInstance.outputUnigrams)
     {
         minValue = 1;
     }
     else
     {
         minValue = outerInstance.minShingleSize;
     }
     Reset();
 }
コード例 #19
0
        /// <summary>
        /// Verifies filler-token handling for trailing holes (property-style API):
        /// a custom filler, an empty filler, a null filler, and null filler plus
        /// null separator.
        /// </summary>
        public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()
        {
            // Analyzing "purple wizard of the", where of and the are removed as
            // stopwords, leaving two trailing holes:
            Token[] inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };
            ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            // Scenario 1: custom filler "--" stands in for each removed stopword.
            filter.FillerToken = "--";

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);

            // Scenario 2: empty filler — separators remain, yielding trailing spaces.
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            filter.FillerToken = "";

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);

            // Scenario 3: null filler — same expected output as the empty filler.
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            filter.FillerToken = null;

            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);

            // Scenario 4: null filler and null separator — tokens are concatenated
            // directly and holes leave no trace in the term text.
            filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
            filter.FillerToken = null;
            filter.TokenSeparator = null;

            AssertTokenStreamContents(filter, new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);
        }
コード例 #20
0
 /// <summary>
 /// Shingles <paramref name="tokensToShingle"/> with an explicit token
 /// separator and size bounds, then delegates verification to
 /// shingleFilterTestCommon.
 /// </summary>
 protected internal virtual void shingleFilterTest(string tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, int[] positionIncrements, string[] types, bool outputUnigrams)
 {
     var source = new CannedTokenStream(tokensToShingle);
     var shingler = new ShingleFilter(source, minSize, maxSize)
     {
         TokenSeparator = tokenSeparator,
         OutputUnigrams = outputUnigrams
     };
     shingleFilterTestCommon(shingler, tokensToCompare, positionIncrements, types);
 }
コード例 #21
0
        /// <summary>
        /// Builds 6-gram shingles over TEST_TOKEN, wraps them in the deprecated
        /// PositionFilter, and checks the expected terms and increments.
        /// </summary>
        public virtual void Test6GramFilterNoPositions()
        {

            ShingleFilter filter = new ShingleFilter(new TestTokenStream(this, TEST_TOKEN), 6);
            // PositionFilter is obsolete; suppress the deprecation warning only
            // around its construction.
            AssertTokenStreamContents
#pragma warning disable 612, 618
                (new PositionFilter(filter),
#pragma warning restore 612, 618
                SIX_GRAM_NO_POSITIONS_TOKENS, SIX_GRAM_NO_POSITIONS_INCREMENTS);
        }
コード例 #22
0
 /// <summary>
 /// Initializes the circular shingle-size sequence for its owning filter; the
 /// smallest emitted size is 1 when unigrams are output, else minShingleSize.
 /// </summary>
 public CircularSequence(ShingleFilter outerInstance)
 {
     this.outerInstance = outerInstance;
     minValue = outerInstance.outputUnigrams ? 1 : outerInstance.minShingleSize;
     reset();
 }
コード例 #23
0
 /// <summary>
 /// Wraps one attribute source slot of the shingle input window, caching its
 /// term and offset attributes for repeated access.
 /// </summary>
 public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
 {
     this.outerInstance = outerInstance;
     this.attSource = attSource;
     this.termAtt = attSource.GetAttribute<ICharTermAttribute>();
     this.offsetAtt = attSource.GetAttribute<IOffsetAttribute>();
 }
コード例 #24
0
 /// <summary>
 /// Wraps the default analyzer's stream for <paramref name="fieldName"/> in a
 /// ShingleFilter using the stored max size and unigram flag.
 /// </summary>
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream baseStream;

     // Prefer the reusable stream; if it fails with an IOException, fall back
     // to creating a fresh stream.
     try
     {
         baseStream = defaultAnalyzer.ReusableTokenStream(fieldName, reader);
     }
     catch (IOException)
     {
         baseStream = defaultAnalyzer.TokenStream(fieldName, reader);
     }

     ShingleFilter shingles = new ShingleFilter(baseStream);
     shingles.SetMaxShingleSize(maxShingleSize);
     shingles.SetOutputUnigrams(outputUnigrams);
     return shingles;
 }
コード例 #25
0
 /// <summary>
 /// Verifies that a ShingleFilter chain produces identical output after the
 /// underlying tokenizer's reader is replaced and the stream is re-consumed.
 /// </summary>
 public virtual void TestReset()
 {
     Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
     TokenStream filter = new ShingleFilter(wsTokenizer, 2);
     AssertTokenStreamContents(filter, new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new string[] { TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
     // Re-point the tokenizer at fresh input and expect the exact same stream.
     wsTokenizer.Reader = new StringReader("please divide this sentence");
     AssertTokenStreamContents(filter, new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new string[] { TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
 }
コード例 #26
0
        /// <summary>
        /// Verifies (older API) that a ShingleFilter chain produces identical
        /// output after the tokenizer is reset with a fresh reader.
        /// </summary>
        public void TestReset()
        {
            Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
            TokenStream filter = new ShingleFilter(wsTokenizer, 2);

            // First pass over the input.
            AssertTokenStreamContents(filter,
                                      new[]
                                          {
                                              "please", "please divide", "divide", "divide this", "this",
                                              "this sentence",
                                              "sentence"
                                          },
                                      new[] {0, 0, 7, 7, 14, 14, 19}, new[] {6, 13, 13, 18, 18, 27, 27},
                                      new[]
                                          {
                                              TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE,
                                              "shingle", TypeAttribute.DEFAULT_TYPE, "shingle",
                                              TypeAttribute.DEFAULT_TYPE
                                          },
                                      new[] {1, 0, 1, 0, 1, 0, 1}
                );

            // Reset the tokenizer with equivalent input; the second pass must
            // yield exactly the same terms, offsets, types and increments.
            wsTokenizer.Reset(new StringReader("please divide this sentence"));

            AssertTokenStreamContents(filter,
                                      new[]
                                          {
                                              "please", "please divide", "divide", "divide this", "this",
                                              "this sentence",
                                              "sentence"
                                          },
                                      new[] {0, 0, 7, 7, 14, 14, 19}, new[] {6, 13, 13, 18, 18, 27, 27},
                                      new[]
                                          {
                                              TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE,
                                              "shingle", TypeAttribute.DEFAULT_TYPE, "shingle",
                                              TypeAttribute.DEFAULT_TYPE
                                          },
                                      new[] {1, 0, 1, 0, 1, 0, 1}
                );
        }
コード例 #27
0
        /// <summary>
        /// Manually drains a ShingleFilter built over <paramref name="tokensToShingle"/>
        /// and asserts each emitted token's text, offsets, position increment and
        /// type against the expected values, token by token.
        /// </summary>
        protected void ShingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
                                         int[] positionIncrements, String[] types, bool outputUnigrams)
        {
            var filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
            filter.SetOutputUnigrams(outputUnigrams);

            // Attribute handles must be acquired before iterating the stream.
            var termAtt = filter.AddAttribute<ITermAttribute>();
            var offsetAtt = filter.AddAttribute<IOffsetAttribute>();
            var posIncrAtt = filter.AddAttribute<IPositionIncrementAttribute>();
            var typeAtt = filter.AddAttribute<ITypeAttribute>();

            // Walk the stream, comparing token i against tokensToCompare[i].
            int i = 0;
            while (filter.IncrementToken())
            {
                Assert.IsTrue(i < tokensToCompare.Length, "ShingleFilter outputted more tokens than expected");

                String termText = termAtt.Term;
                String goldText = tokensToCompare[i].Term;

                Assert.AreEqual(goldText, termText, "Wrong termText");
                Assert.AreEqual(tokensToCompare[i].StartOffset, offsetAtt.StartOffset,
                                "Wrong startOffset for token \"" + termText + "\"");
                Assert.AreEqual(tokensToCompare[i].EndOffset, offsetAtt.EndOffset,
                                "Wrong endOffset for token \"" + termText + "\"");
                Assert.AreEqual(positionIncrements[i], posIncrAtt.PositionIncrement,
                                "Wrong positionIncrement for token \"" + termText + "\"");
                Assert.AreEqual(types[i], typeAtt.Type, "Wrong type for token \"" + termText + "\"");

                i++;
            }

            // Every expected token must have been consumed — no more, no fewer.
            Assert.AreEqual(tokensToCompare.Length, i,
                            "ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" +
                            tokensToCompare.Length + ")");
        }
コード例 #28
0
        /// <summary>
        /// Verifies that two trailing holes (removed stopwords at the end of
        /// input) are represented by the default "_" filler in bigram shingles.
        /// </summary>
        public virtual void TestTwoTrailingHoles()
        {
            // Analyzing "purple wizard of the", where of and the are removed as
            // stopwords, leaving two trailing holes:
            Token[] inputTokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };
            ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 2);

            // With max shingle size 2, only one filler appears; its end offset
            // (20) extends to the end of the original text.
            AssertTokenStreamContents(filter, new string[] { "purple", "purple wizard", "wizard", "wizard _" }, new int[] { 0, 0, 7, 7 }, new int[] { 6, 13, 13, 20 }, new int[] { 1, 0, 1, 0 }, 20);
        }
コード例 #29
0
 /// <summary>
 /// Builds 6-gram shingles over TEST_TOKEN, wraps them in a PositionFilter,
 /// and checks the expected terms and position increments.
 /// </summary>
 public void Test6GramFilterNoPositions()
 {
     ShingleFilter filter = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
     AssertTokenStreamContents(new PositionFilter(filter),
                               SIX_GRAM_NO_POSITIONS_TOKENS,
                               SIX_GRAM_NO_POSITIONS_INCREMENTS);
 }
コード例 #30
0
 /// <summary>
 /// Builds a stress-test analysis chain: edge n-gram tokenizer (2..94), then
 /// 5-shingle filter, then n-gram filter (55..83).
 /// </summary>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
     TokenStream stream = new ShingleFilter(tokenizer, 5);
     stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
     // NOTE(review): commented-out SopTokenFilter debug wrappers were removed;
     // re-insert between stages when tracing token flow.
     return new TokenStreamComponents(tokenizer, stream);
 }