A ShingleFilter constructs shingles (token n-grams) from a token stream. In other words, it creates combinations of tokens as a single token.

For example, the sentence "please divide this sentence into shingles" might be tokenized into shingles "please divide", "divide this", "this sentence", "sentence into", and "into shingles".

This filter handles position increments > 1 by inserting filler tokens (tokens with termtext "_"). It does not handle a position increment of 0.

Inheritance: Lucene.Net.Analysis.TokenFilter
Пример #1
0
        /// <summary>
        /// Wraps <paramref name="tokensToShingle"/> in a <c>CannedTokenStream</c>, shingles it with the
        /// given minimum and maximum sizes, applies the unigram setting, and hands the filter to
        /// <c>shingleFilterTestCommon</c> for comparison against the expected tokens.
        /// </summary>
        /// <param name="minSize">Minimum shingle size passed to the filter constructor.</param>
        /// <param name="maxSize">Maximum shingle size passed to the filter constructor.</param>
        /// <param name="tokensToShingle">Input tokens fed to the filter.</param>
        /// <param name="tokensToCompare">Expected output tokens.</param>
        /// <param name="positionIncrements">Expected position increments, forwarded unchanged.</param>
        /// <param name="types">Expected token types, forwarded unchanged.</param>
        /// <param name="outputUnigrams">Value passed to <c>SetOutputUnigrams</c>.</param>
        protected internal virtual void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, int[] positionIncrements, string[] types, bool outputUnigrams)
        {
            var source = new CannedTokenStream(tokensToShingle);
            var shingles = new ShingleFilter(source, minSize, maxSize);
            shingles.SetOutputUnigrams(outputUnigrams);

            shingleFilterTestCommon(shingles, tokensToCompare, positionIncrements, types);
        }
Пример #2
0
        /// <summary>
        /// Shingles (sizes 2 to 3) a two-token stream built with <c>CannedTokenStream(2, 20, ...)</c>
        /// and checks the output under four filler-token / token-separator configurations:
        /// filler "--", empty filler, null filler, and null filler with null separator.
        /// </summary>
        public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()
        {
            // Analyzing "purple wizard of the", where "of" and "the" are removed as
            // stopwords, leaving two trailing holes:
            Token[] tokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };

            // Case 1: filler token "--".
            ShingleFilter dashFiller = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 3);
            dashFiller.SetFillerToken("--");
            AssertTokenStreamContents(
                dashFiller,
                new string[] { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" },
                new int[] { 0, 0, 0, 7, 7, 7 },
                new int[] { 6, 13, 20, 13, 20, 20 },
                new int[] { 1, 0, 0, 1, 0, 0 },
                20);

            // Case 2: empty filler token.
            ShingleFilter emptyFiller = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 3);
            emptyFiller.SetFillerToken("");
            AssertTokenStreamContents(
                emptyFiller,
                new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " },
                new int[] { 0, 0, 0, 7, 7, 7 },
                new int[] { 6, 13, 20, 13, 20, 20 },
                new int[] { 1, 0, 0, 1, 0, 0 },
                20);

            // Case 3: null filler token; the expected output is the same as for the empty filler.
            ShingleFilter nullFiller = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 3);
            nullFiller.SetFillerToken(null);
            AssertTokenStreamContents(
                nullFiller,
                new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " },
                new int[] { 0, 0, 0, 7, 7, 7 },
                new int[] { 6, 13, 20, 13, 20, 20 },
                new int[] { 1, 0, 0, 1, 0, 0 },
                20);

            // Case 4: null filler token and null token separator.
            ShingleFilter bare = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 3);
            bare.SetFillerToken(null);
            bare.SetTokenSeparator(null);
            AssertTokenStreamContents(
                bare,
                new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" },
                new int[] { 0, 0, 0, 7, 7, 7 },
                new int[] { 6, 13, 20, 13, 20, 20 },
                new int[] { 1, 0, 0, 1, 0, 0 },
                20);
        }
Пример #3
0
 /// <summary>
 /// Stores the owning filter and the attribute source, and caches the source's
 /// term and offset attributes.
 /// </summary>
 /// <param name="outerInstance">The enclosing <c>ShingleFilter</c>.</param>
 /// <param name="attSource">Attribute source the term and offset attributes are read from.</param>
 public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
 {
     // Look the attributes up first, term before offset, then populate the fields.
     var term   = attSource.GetAttribute<ICharTermAttribute>();
     var offset = attSource.GetAttribute<IOffsetAttribute>();

     this.outerInstance = outerInstance;
     this.attSource = attSource;
     termAtt = term;
     offsetAtt = offset;
 }
Пример #4
0
        /// <summary>
        /// Shingles (size 2) a two-token stream built with <c>CannedTokenStream(1, 16, ...)</c>
        /// and checks that the last shingle is "wizard _".
        /// </summary>
        public virtual void TestTrailingHole2()
        {
            // Analyzing "purple wizard of", where "of" is removed as a
            // stopword, leaving a trailing hole:
            var purple = CreateToken("purple", 0, 6);
            var wizard = CreateToken("wizard", 7, 13);
            Token[] tokens = new Token[] { purple, wizard };

            var source = new CannedTokenStream(1, 16, tokens);
            ShingleFilter shingles = new ShingleFilter(source, 2, 2);

            AssertTokenStreamContents(
                shingles,
                new string[] { "purple", "purple wizard", "wizard", "wizard _" },
                new int[] { 0, 0, 7, 7 },
                new int[] { 6, 13, 13, 16 },
                new int[] { 1, 0, 1, 0 },
                16);
        }
Пример #5
0
        /// <summary>
        /// Checks the bigram output for "please divide this sentence", then gives the tokenizer
        /// a fresh reader over the same text and checks that the same filter produces the
        /// same output again.
        /// </summary>
        public virtual void TestReset()
        {
            string sentence = "please divide this sentence";
            string word = TypeAttribute.DEFAULT_TYPE;

            Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(sentence));
            TokenStream shingles = new ShingleFilter(wsTokenizer, 2);

            // First pass.
            AssertTokenStreamContents(
                shingles,
                new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
                new int[] { 0, 0, 7, 7, 14, 14, 19 },
                new int[] { 6, 13, 13, 18, 18, 27, 27 },
                new string[] { word, "shingle", word, "shingle", word, "shingle", word },
                new int[] { 1, 0, 1, 0, 1, 0, 1 });

            // Second pass over a new reader; expectations are unchanged.
            wsTokenizer.SetReader(new StringReader(sentence));
            AssertTokenStreamContents(
                shingles,
                new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
                new int[] { 0, 0, 7, 7, 14, 14, 19 },
                new int[] { 6, 13, 13, 18, 18, 27, 27 },
                new string[] { word, "shingle", word, "shingle", word, "shingle", word },
                new int[] { 1, 0, 1, 0, 1, 0, 1 });
        }
        /// <summary>
        /// Builds the analysis chain for a field: standard tokenizer, standard filter,
        /// lower-casing, then a <c>ShingleFilter</c> constructed with the value 4.
        /// </summary>
        /// <param name="fieldName">Field name; not used by this implementation.</param>
        /// <param name="reader">Source text to tokenize.</param>
        /// <returns>The shingle filter at the end of the chain.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Need to account for the | breaks in relatedcontent
            var tokenizer = new StandardTokenizer(_version, reader);
            var standard = new StandardFilter(tokenizer);
            var lowered = new LowerCaseFilter(standard);

            // The filter's unigram setting is intentionally left untouched here
            // (a SetOutputUnigrams(false) call was previously tried and disabled).
            return new ShingleFilter(lowered, 4);
        }
Пример #7
0
        /// <summary>
        /// Creates a <c>ShingleFilter</c> over <paramref name="input"/> configured from this
        /// factory's shingle sizes, unigram flags, token separator and filler token.
        /// </summary>
        /// <param name="input">Token stream to wrap.</param>
        /// <returns>The configured shingle filter.</returns>
        public override TokenStream Create(TokenStream input)
        {
            // Initializer members are assigned in the order written.
            return new ShingleFilter(input, minShingleSize, maxShingleSize)
            {
                OutputUnigrams = outputUnigrams,
                OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
                TokenSeparator = tokenSeparator,
                FillerToken = fillerToken
            };
        }
Пример #8
0
        /// <summary>
        /// Creates a <c>ShingleFilter</c> over <paramref name="input"/> and applies this
        /// factory's unigram flags, token separator and filler token through its setter methods.
        /// </summary>
        /// <param name="input">Token stream to wrap.</param>
        /// <returns>The configured shingle filter.</returns>
        public override TokenStream Create(TokenStream input)
        {
            var shingles = new ShingleFilter(input, minShingleSize, maxShingleSize);

            // Unigram behaviour.
            shingles.SetOutputUnigrams(outputUnigrams);
            shingles.SetOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);

            // Shingle text composition.
            shingles.SetTokenSeparator(tokenSeparator);
            shingles.SetFillerToken(fillerToken);

            return shingles;
        }
Пример #9
0
        /// <summary>
        /// Wraps the given components' token stream in a <c>ShingleFilter</c> configured from this
        /// analyzer's settings, keeping the original tokenizer.
        /// </summary>
        /// <param name="fieldName">Field name; not used by this implementation.</param>
        /// <param name="components">Components whose token stream is wrapped.</param>
        /// <returns>New components pairing the original tokenizer with the shingle filter.</returns>
        protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
        {
            TokenStream inner = components.TokenStream;
            var shingles = new ShingleFilter(inner, minShingleSize, maxShingleSize);

            // The sizes are set again after construction, min before max.
            shingles.SetMinShingleSize(minShingleSize);
            shingles.SetMaxShingleSize(maxShingleSize);
            shingles.SetTokenSeparator(tokenSeparator);
            shingles.SetOutputUnigrams(outputUnigrams);
            shingles.SetOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
            shingles.SetFillerToken(fillerToken);

            return new TokenStreamComponents(components.Tokenizer, shingles);
        }
Пример #10
0
        /// <summary>
        /// Wraps the given components' token stream in a <c>ShingleFilter</c> configured from this
        /// analyzer's settings, keeping the original tokenizer.
        /// </summary>
        /// <param name="fieldName">Field name; not used by this implementation.</param>
        /// <param name="components">Components whose token stream is wrapped.</param>
        /// <returns>New components pairing the original tokenizer with the shingle filter.</returns>
        protected internal override TokenStreamComponents wrapComponents(string fieldName, TokenStreamComponents components)
        {
            // Initializer members are assigned in the order written.
            ShingleFilter shingles = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize)
            {
                MinShingleSize = minShingleSize,
                MaxShingleSize = maxShingleSize,
                TokenSeparator = tokenSeparator,
                OutputUnigrams = outputUnigrams,
                OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
                FillerToken = fillerToken
            };

            return new TokenStreamComponents(components.Tokenizer, shingles);
        }
Пример #11
0
        /// <summary>
        /// Unpacks the expected tokens into parallel text / start-offset / end-offset arrays and
        /// asserts the filter's output against them together with the given types and
        /// position increments.
        /// </summary>
        /// <param name="filter">Filter under test.</param>
        /// <param name="tokensToCompare">Expected output tokens.</param>
        /// <param name="positionIncrements">Expected position increments.</param>
        /// <param name="types">Expected token types.</param>
        protected internal virtual void shingleFilterTestCommon(ShingleFilter filter, Token[] tokensToCompare, int[] positionIncrements, string[] types)
        {
            int count = tokensToCompare.Length;
            var expectedText = new string[count];
            var expectedStarts = new int[count];
            var expectedEnds = new int[count];

            for (int idx = 0; idx < count; idx++)
            {
                Token expected = tokensToCompare[idx];

                // Only the first Length characters of the token's buffer form its text.
                expectedText[idx] = new string(expected.Buffer, 0, expected.Length);
                expectedStarts[idx] = expected.StartOffset;
                expectedEnds[idx] = expected.EndOffset;
            }

            AssertTokenStreamContents(filter, expectedText, expectedStarts, expectedEnds, types, positionIncrements);
        }
Пример #12
0
        /// <summary>
        /// Obtains a token stream from the default analyzer and wraps it in a
        /// <c>ShingleFilter</c> configured with this analyzer's maximum shingle size and
        /// unigram setting.
        /// </summary>
        /// <param name="fieldName">Field name forwarded to the default analyzer.</param>
        /// <param name="reader">Source text forwarded to the default analyzer.</param>
        /// <returns>The configured shingle filter.</returns>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream inner;
            try
            {
                inner = defaultAnalyzer.ReusableTokenStream(fieldName, reader);
            }
            catch (IOException)
            {
                // The reusable stream could not be obtained; fall back to a regular one.
                inner = defaultAnalyzer.TokenStream(fieldName, reader);
            }

            var shingles = new ShingleFilter(inner);
            shingles.SetMaxShingleSize(maxShingleSize);
            shingles.SetOutputUnigrams(outputUnigrams);
            return shingles;
        }
 /// <summary>
 /// Chains a letter tokenizer, a default <c>ShingleFilter</c> and an
 /// <c>EdgeNGramTokenFilter</c> (7 to 10) over "abc d efgh ij klmno p q" and checks the
 /// resulting tokens.
 /// </summary>
 public virtual void TestGraphs()
 {
     TokenStream letters = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
     TokenStream shingles = new ShingleFilter(letters);
     TokenStream edgeGrams = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, shingles, 7, 10);

     // NOTE(review): the four int arrays and trailing int are presumably start offsets, end offsets,
     // position increments, position lengths and final offset — confirm against the assert helper.
     AssertTokenStreamContents(
         edgeGrams,
         new string[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
         new int[] { 6, 11, 11, 14 },
         new int[] { 13, 19, 19, 21 },
         new int[] { 3, 1, 0, 1 },
         new int[] { 2, 2, 2, 2 },
         23);
 }
Пример #14
0
        /// <summary>
        /// Unpacks the expected tokens into parallel text / start-offset / end-offset arrays and
        /// asserts the filter's output against them together with the given types and
        /// position increments.
        /// </summary>
        /// <param name="filter">Filter under test.</param>
        /// <param name="tokensToCompare">Expected output tokens.</param>
        /// <param name="positionIncrements">Expected position increments.</param>
        /// <param name="types">Expected token types.</param>
        protected internal virtual void shingleFilterTestCommon(ShingleFilter filter, Token[] tokensToCompare, int[] positionIncrements, string[] types)
        {
            int total = tokensToCompare.Length;
            string[] terms = new string[total];
            int[] starts = new int[total];
            int[] ends = new int[total];

            int pos = 0;
            while (pos < total)
            {
                Token expected = tokensToCompare[pos];

                // Only the first Length characters of the token's buffer form its text.
                terms[pos] = new string(expected.Buffer(), 0, expected.Length);
                starts[pos] = expected.StartOffset();
                ends[pos] = expected.EndOffset();
                pos++;
            }

            AssertTokenStreamContents(filter, terms, starts, ends, types, positionIncrements);
        }
Пример #15
0
        /// <summary>
        /// Shingles (size 2) a two-token stream built with <c>CannedTokenStream(1, 16, ...)</c>
        /// and checks that the last shingle is "wizard _".
        /// </summary>
        public virtual void TestTrailingHole2()
        {
            // Analyzing "purple wizard of", where "of" is removed as a
            // stopword, leaving a trailing hole:
            var tokens = new Token[]
            {
                CreateToken("purple", 0, 6),
                CreateToken("wizard", 7, 13)
            };
            var shingles = new ShingleFilter(new CannedTokenStream(1, 16, tokens), 2, 2);

            AssertTokenStreamContents(
                shingles,
                new string[] { "purple", "purple wizard", "wizard", "wizard _" },
                new int[] { 0, 0, 7, 7 },
                new int[] { 6, 13, 13, 16 },
                new int[] { 1, 0, 1, 0 },
                16);
        }
 /// <summary>
 /// Wraps the given components' token stream in a <c>ShingleFilter</c> configured from this
 /// analyzer's settings, keeping the original tokenizer.
 /// </summary>
 /// <param name="fieldName">Field name; not used by this implementation.</param>
 /// <param name="components">Components whose token stream is wrapped.</param>
 /// <returns>New components pairing the original tokenizer with the shingle filter.</returns>
 protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
 {
     // Initializer members are assigned in the order written.
     var shingles = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize)
     {
         MinShingleSize = minShingleSize,
         MaxShingleSize = maxShingleSize,
         TokenSeparator = tokenSeparator,
         OutputUnigrams = outputUnigrams,
         OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
         FillerToken = fillerToken
     };

     return new TokenStreamComponents(components.Tokenizer, shingles);
 }
Пример #17
0
 /// <summary>
 /// Binds the sequence to its filter, derives the minimum value from the filter's
 /// settings, and resets the sequence.
 /// </summary>
 /// <param name="shingleFilter">The owning <c>ShingleFilter</c>.</param>
 public CircularSequence(ShingleFilter shingleFilter)
 {
     this.outerInstance = shingleFilter;

     // The minimum is 1 when unigrams are output; otherwise it is the minimum shingle size.
     if (shingleFilter.outputUnigrams)
     {
         minValue = 1;
     }
     else
     {
         minValue = shingleFilter.minShingleSize;
     }

     Reset();
 }
Пример #18
0
 /// <summary>
 /// Binds the sequence to its filter, derives the minimum value from the filter's
 /// settings, and resets the sequence.
 /// </summary>
 /// <param name="outerInstance">The owning <c>ShingleFilter</c>.</param>
 public CircularSequence(ShingleFilter outerInstance)
 {
     this.outerInstance = outerInstance;

     // The minimum is 1 when unigrams are output; otherwise it is the minimum shingle size.
     if (outerInstance.outputUnigrams)
     {
         minValue = 1;
     }
     else
     {
         minValue = outerInstance.minShingleSize;
     }

     Reset();
 }
Пример #19
0
        /// <summary>
        /// Shingles (sizes 2 to 3) a two-token stream built with <c>CannedTokenStream(2, 20, ...)</c>
        /// and checks the output under four filler-token / token-separator configurations:
        /// filler "--", empty filler, null filler, and null filler with null separator.
        /// </summary>
        public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()
        {
            // Analyzing "purple wizard of the", where "of" and "the" are removed as
            // stopwords, leaving two trailing holes:
            Token[] tokens = new Token[] { CreateToken("purple", 0, 6), CreateToken("wizard", 7, 13) };

            // Case 1: filler token "--".
            ShingleFilter dashFiller = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 3);
            dashFiller.FillerToken = "--";
            AssertTokenStreamContents(
                dashFiller,
                new string[] { "purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --" },
                new int[] { 0, 0, 0, 7, 7, 7 },
                new int[] { 6, 13, 20, 13, 20, 20 },
                new int[] { 1, 0, 0, 1, 0, 0 },
                20);

            // Case 2: empty filler token.
            ShingleFilter emptyFiller = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 3);
            emptyFiller.FillerToken = "";
            AssertTokenStreamContents(
                emptyFiller,
                new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " },
                new int[] { 0, 0, 0, 7, 7, 7 },
                new int[] { 6, 13, 20, 13, 20, 20 },
                new int[] { 1, 0, 0, 1, 0, 0 },
                20);

            // Case 3: null filler token; the expected output is the same as for the empty filler.
            ShingleFilter nullFiller = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 3);
            nullFiller.FillerToken = null;
            AssertTokenStreamContents(
                nullFiller,
                new string[] { "purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  " },
                new int[] { 0, 0, 0, 7, 7, 7 },
                new int[] { 6, 13, 20, 13, 20, 20 },
                new int[] { 1, 0, 0, 1, 0, 0 },
                20);

            // Case 4: null filler token and null token separator.
            ShingleFilter bare = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 3);
            bare.FillerToken = null;
            bare.TokenSeparator = null;
            AssertTokenStreamContents(
                bare,
                new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" },
                new int[] { 0, 0, 0, 7, 7, 7 },
                new int[] { 6, 13, 20, 13, 20, 20 },
                new int[] { 1, 0, 0, 1, 0, 0 },
                20);
        }
Пример #20
0
 /// <summary>
 /// Wraps <paramref name="tokensToShingle"/> in a <c>CannedTokenStream</c>, shingles it with the
 /// given sizes, token separator and unigram setting, and hands the filter to
 /// <c>shingleFilterTestCommon</c> for comparison against the expected tokens.
 /// </summary>
 /// <param name="tokenSeparator">Value assigned to the filter's <c>TokenSeparator</c>.</param>
 /// <param name="minSize">Minimum shingle size passed to the filter constructor.</param>
 /// <param name="maxSize">Maximum shingle size passed to the filter constructor.</param>
 /// <param name="tokensToShingle">Input tokens fed to the filter.</param>
 /// <param name="tokensToCompare">Expected output tokens.</param>
 /// <param name="positionIncrements">Expected position increments, forwarded unchanged.</param>
 /// <param name="types">Expected token types, forwarded unchanged.</param>
 /// <param name="outputUnigrams">Value assigned to the filter's <c>OutputUnigrams</c>.</param>
 protected internal virtual void shingleFilterTest(string tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, int[] positionIncrements, string[] types, bool outputUnigrams)
 {
     var source = new CannedTokenStream(tokensToShingle);

     // Initializer members are assigned in the order written.
     var shingles = new ShingleFilter(source, minSize, maxSize)
     {
         TokenSeparator = tokenSeparator,
         OutputUnigrams = outputUnigrams
     };

     shingleFilterTestCommon(shingles, tokensToCompare, positionIncrements, types);
 }
        /// <summary>
        /// Runs a <c>ShingleFilter</c> constructed with the value 6 over the shared test tokens,
        /// wraps it in a <c>PositionFilter</c>, and checks terms and position increments against
        /// the six-gram "no positions" expectations.
        /// </summary>
        public virtual void Test6GramFilterNoPositions()
        {
            var shingles = new ShingleFilter(new TestTokenStream(this, TEST_TOKEN), 6);

            // Warnings 612/618 (obsolete member usage) are suppressed only around the PositionFilter construction.
#pragma warning disable 612, 618
            var positionless = new PositionFilter(shingles);
#pragma warning restore 612, 618

            AssertTokenStreamContents(positionless, SIX_GRAM_NO_POSITIONS_TOKENS, SIX_GRAM_NO_POSITIONS_INCREMENTS);
        }
Пример #22
0
 /// <summary>
 /// Binds the sequence to its filter, derives the minimum value from the filter's
 /// settings, and resets the sequence.
 /// </summary>
 /// <param name="outerInstance">The owning <c>ShingleFilter</c>.</param>
 public CircularSequence(ShingleFilter outerInstance)
 {
     this.outerInstance = outerInstance;

     // The minimum is 1 when unigrams are output; otherwise it is the minimum shingle size.
     if (outerInstance.outputUnigrams)
     {
         minValue = 1;
     }
     else
     {
         minValue = outerInstance.minShingleSize;
     }

     reset();
 }
Пример #23
0
 /// <summary>
 /// Stores the owning filter and the attribute source, and caches the source's
 /// term and offset attributes.
 /// </summary>
 /// <param name="outerInstance">The enclosing <c>ShingleFilter</c>.</param>
 /// <param name="attSource">Attribute source the term and offset attributes are read from.</param>
 public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
 {
     // Look the attributes up first, term before offset, then populate the fields.
     var term = attSource.GetAttribute<ICharTermAttribute>();
     var offset = attSource.GetAttribute<IOffsetAttribute>();

     this.outerInstance = outerInstance;
     this.attSource = attSource;
     termAtt = term;
     offsetAtt = offset;
 }
 /// <summary>
 /// Obtains a token stream from the default analyzer and wraps it in a
 /// <c>ShingleFilter</c> configured with this analyzer's maximum shingle size and
 /// unigram setting.
 /// </summary>
 /// <param name="fieldName">Field name forwarded to the default analyzer.</param>
 /// <param name="reader">Source text forwarded to the default analyzer.</param>
 /// <returns>The configured shingle filter.</returns>
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream inner;
     try
     {
         inner = defaultAnalyzer.ReusableTokenStream(fieldName, reader);
     }
     catch (IOException)
     {
         // The reusable stream could not be obtained; fall back to a regular one.
         inner = defaultAnalyzer.TokenStream(fieldName, reader);
     }

     var shingles = new ShingleFilter(inner);
     shingles.SetMaxShingleSize(maxShingleSize);
     shingles.SetOutputUnigrams(outputUnigrams);
     return shingles;
 }
Пример #25
0
 /// <summary>
 /// Checks the bigram output for "please divide this sentence", then gives the tokenizer
 /// a fresh reader over the same text and checks that the same filter produces the
 /// same output again.
 /// </summary>
 public virtual void TestReset()
 {
     string sentence = "please divide this sentence";
     string word = TypeAttribute_Fields.DEFAULT_TYPE;

     Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(sentence));
     TokenStream shingles = new ShingleFilter(wsTokenizer, 2);

     // First pass.
     AssertTokenStreamContents(
         shingles,
         new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
         new int[] { 0, 0, 7, 7, 14, 14, 19 },
         new int[] { 6, 13, 13, 18, 18, 27, 27 },
         new string[] { word, "shingle", word, "shingle", word, "shingle", word },
         new int[] { 1, 0, 1, 0, 1, 0, 1 });

     // Second pass over a new reader; expectations are unchanged.
     wsTokenizer.Reader = new StringReader(sentence);
     AssertTokenStreamContents(
         shingles,
         new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
         new int[] { 0, 0, 7, 7, 14, 14, 19 },
         new int[] { 6, 13, 13, 18, 18, 27, 27 },
         new string[] { word, "shingle", word, "shingle", word, "shingle", word },
         new int[] { 1, 0, 1, 0, 1, 0, 1 });
 }
Пример #26
0
        /// <summary>
        /// Checks the bigram output for "please divide this sentence", then resets the tokenizer
        /// onto a fresh reader over the same text and checks that the same filter produces the
        /// same output again.
        /// </summary>
        public void TestReset()
        {
            string sentence = "please divide this sentence";
            string word = TypeAttribute.DEFAULT_TYPE;

            Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader(sentence));
            TokenStream shingles = new ShingleFilter(wsTokenizer, 2);

            // First pass.
            AssertTokenStreamContents(
                shingles,
                new[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
                new[] { 0, 0, 7, 7, 14, 14, 19 },
                new[] { 6, 13, 13, 18, 18, 27, 27 },
                new[] { word, "shingle", word, "shingle", word, "shingle", word },
                new[] { 1, 0, 1, 0, 1, 0, 1 });

            // Second pass after resetting the tokenizer; expectations are unchanged.
            wsTokenizer.Reset(new StringReader(sentence));

            AssertTokenStreamContents(
                shingles,
                new[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
                new[] { 0, 0, 7, 7, 14, 14, 19 },
                new[] { 6, 13, 13, 18, 18, 27, 27 },
                new[] { word, "shingle", word, "shingle", word, "shingle", word },
                new[] { 1, 0, 1, 0, 1, 0, 1 });
        }
Пример #27
0
        /// <summary>
        /// Runs a <c>ShingleFilter</c> over <paramref name="tokensToShingle"/> and compares every
        /// emitted token's text, offsets, position increment and type with the expectations,
        /// then checks that the number of emitted tokens matches.
        /// </summary>
        /// <param name="maxSize">Size argument passed to the filter constructor.</param>
        /// <param name="tokensToShingle">Input tokens fed to the filter.</param>
        /// <param name="tokensToCompare">Expected output tokens.</param>
        /// <param name="positionIncrements">Expected position increment per output token.</param>
        /// <param name="types">Expected type per output token.</param>
        /// <param name="outputUnigrams">Value passed to <c>SetOutputUnigrams</c>.</param>
        protected void ShingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
                                         int[] positionIncrements, String[] types, bool outputUnigrams)
        {
            var shingles = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
            shingles.SetOutputUnigrams(outputUnigrams);

            var term = shingles.AddAttribute<ITermAttribute>();
            var offsets = shingles.AddAttribute<IOffsetAttribute>();
            var posIncr = shingles.AddAttribute<IPositionIncrementAttribute>();
            var type = shingles.AddAttribute<ITypeAttribute>();

            int seen = 0;
            for (; shingles.IncrementToken(); seen++)
            {
                // Check for surplus output before indexing into the expectation arrays.
                Assert.IsTrue(seen < tokensToCompare.Length, "ShingleFilter outputted more tokens than expected");

                String termText = term.Term;
                Token expected = tokensToCompare[seen];
                String goldText = expected.Term;
                String quoted = "\"" + termText + "\"";

                Assert.AreEqual(goldText, termText, "Wrong termText");
                Assert.AreEqual(expected.StartOffset, offsets.StartOffset, "Wrong startOffset for token " + quoted);
                Assert.AreEqual(expected.EndOffset, offsets.EndOffset, "Wrong endOffset for token " + quoted);
                Assert.AreEqual(positionIncrements[seen], posIncr.PositionIncrement,
                                "Wrong positionIncrement for token " + quoted);
                Assert.AreEqual(types[seen], type.Type, "Wrong type for token " + quoted);
            }

            Assert.AreEqual(tokensToCompare.Length, seen,
                            "ShingleFilter outputted wrong # of tokens. (# output = " + seen + "; # expected =" +
                            tokensToCompare.Length + ")");
        }
Пример #28
0
        /// <summary>
        /// Shingles (size 2) a two-token stream built with <c>CannedTokenStream(2, 20, ...)</c>
        /// and checks that the last shingle is "wizard _" ending at offset 20.
        /// </summary>
        public virtual void TestTwoTrailingHoles()
        {
            // Analyzing "purple wizard of the", where "of" and "the" are removed as
            // stopwords, leaving two trailing holes:
            var tokens = new Token[]
            {
                CreateToken("purple", 0, 6),
                CreateToken("wizard", 7, 13)
            };
            var shingles = new ShingleFilter(new CannedTokenStream(2, 20, tokens), 2, 2);

            AssertTokenStreamContents(
                shingles,
                new string[] { "purple", "purple wizard", "wizard", "wizard _" },
                new int[] { 0, 0, 7, 7 },
                new int[] { 6, 13, 13, 20 },
                new int[] { 1, 0, 1, 0 },
                20);
        }
Пример #29
0
 /// <summary>
 /// Runs a <c>ShingleFilter</c> constructed with the value 6 over the shared test tokens,
 /// wraps it in a <c>PositionFilter</c>, and checks terms and position increments against
 /// the six-gram "no positions" expectations.
 /// </summary>
 public void Test6GramFilterNoPositions()
 {
     var source = new TestTokenStream(TEST_TOKEN);
     var shingles = new ShingleFilter(source, 6);
     var positionless = new PositionFilter(shingles);

     AssertTokenStreamContents(positionless, SIX_GRAM_NO_POSITIONS_TOKENS, SIX_GRAM_NO_POSITIONS_INCREMENTS);
 }
 /// <summary>
 /// Builds an analysis chain of an <c>EdgeNGramTokenizer</c> (2 to 94), a
 /// <c>ShingleFilter</c> constructed with the value 5, and an <c>NGramTokenFilter</c> (55 to 83).
 /// </summary>
 /// <param name="fieldName">Field name; not used by this implementation.</param>
 /// <param name="reader">Source text to tokenize.</param>
 /// <returns>Components pairing the tokenizer with the final n-gram filter.</returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     // SopTokenFilter wrappers were previously inserted between the stages; they are disabled here.
     Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
     TokenStream shingles = new ShingleFilter(tokenizer, 5);
     TokenStream ngrams = new NGramTokenFilter(TEST_VERSION_CURRENT, shingles, 55, 83);

     return new TokenStreamComponents(tokenizer, ngrams);
 }