protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
            {
                // Wrap the incoming stream in a shingle filter emitting 2..grams-sized
                // shingles, joined with the configured separator character.
                var shingleFilter = new ShingleFilter(components.TokenStream, 2, outerInstance.grams);
                string separatorText = char.ToString((char)outerInstance.separator);
                shingleFilter.SetTokenSeparator(separatorText);
                return new TokenStreamComponents(components.Tokenizer, shingleFilter);
            }
        public virtual void TestGraphs()
        {
            // Letter tokenizer -> shingles -> edge n-grams (7..10 chars); verify the
            // surviving tokens together with offsets and position increments/lengths.
            TokenStream stream = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
            stream = new ShingleFilter(stream);
            stream = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, stream, 7, 10);

            string[] expectedTerms = { "efgh ij", "ij klmn", "ij klmno", "klmno p" };
            int[] expectedStartOffsets = { 6, 11, 11, 14 };
            int[] expectedEndOffsets = { 13, 19, 19, 21 };
            int[] expectedPosIncrements = { 3, 1, 0, 1 };
            int[] expectedPosLengths = { 2, 2, 2, 2 };

            AssertTokenStreamContents(stream, expectedTerms, expectedStartOffsets, expectedEndOffsets,
                                      expectedPosIncrements, expectedPosLengths, 23);
        }
Beispiel #3
0
        public void Test6GramFilterNoPositions()
        {
            // Build 6-gram shingles, strip position increments via PositionFilter,
            // and compare against the precomputed expectations.
            var shingleFilter = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
            var positionless = new PositionFilter(shingleFilter);

            AssertTokenStreamContents(positionless, SIX_GRAM_NO_POSITIONS_TOKENS, SIX_GRAM_NO_POSITIONS_INCREMENTS);
        }
Beispiel #4
0
        public virtual void Test6GramFilterNoPositions()
        {
            var shingleFilter = new ShingleFilter(new TestTokenStream(this, TEST_TOKEN), 6);

            // PositionFilter is obsolete; suppress the deprecation warnings only
            // around its construction.
#pragma warning disable 612, 618
            TokenStream positionless = new PositionFilter(shingleFilter);
#pragma warning restore 612, 618

            AssertTokenStreamContents(positionless, SIX_GRAM_NO_POSITIONS_TOKENS, SIX_GRAM_NO_POSITIONS_INCREMENTS);
        }
Beispiel #5
0
            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                // Chain: edge n-gram tokenizer (2..94 chars) -> 5-token shingles
                //        -> n-gram filter (55..83 chars).
                Tokenizer edgeTokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
                TokenStream chain = new ShingleFilter(edgeTokenizer, 5);
                chain = new NGramTokenFilter(TEST_VERSION_CURRENT, chain, 55, 83);
                return new TokenStreamComponents(edgeTokenizer, chain);
            }
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Standard tokenization, standard-filtered and lowercased, then combined
            // into 4-token shingles (accounts for the | breaks in relatedcontent).
            TokenStream normalized = new LowerCaseFilter(new StandardFilter(new StandardTokenizer(_version, reader)));
            return new ShingleFilter(normalized, 4);
        }
Beispiel #7
0
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            const int shingleSize = 3;

            // Reuse the standard analyzer's stream, lowercase it, then emit
            // 3-token shingles.
            TokenStream stream = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30).ReusableTokenStream(fieldName, reader);
            stream = new LowerCaseFilter(stream);
            stream = new ShingleFilter(stream, shingleSize);

            // A StopFilter could be chained here to skip a predefined set of words:
            // stream = new StopFilter(true, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            return stream;
        }
Beispiel #8
0
        public virtual void TestUnicodeShinglesAndNgrams()
        {
            // Random-data smoke test over an edge-ngram -> shingle -> ngram chain.
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer edgeTokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
                TokenStream chain = new ShingleFilter(edgeTokenizer, 5);
                chain = new NGramTokenFilter(TEST_VERSION_CURRENT, chain, 55, 83);
                return new TokenStreamComponents(edgeTokenizer, chain);
            });

            CheckRandomData(Random, analyzer, 2000);
        }
Beispiel #9
0
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            // Pipeline: standard tokenizer -> shingles (minGramSize..maxGramSize)
            //           -> lowercase -> English stop-word removal.
            var tokenizer = new StandardTokenizer(version, reader);
            var shingler  = new ShingleFilter(tokenizer, minGramSize, maxGramSize);

            // The original if/else merely forwarded the flag in both branches;
            // pass it through directly.
            shingler.SetOutputUnigrams(this.ShowUnigrams);

            var filter = new StopFilter(version, new LowerCaseFilter(version, shingler),
                                        StopAnalyzer.ENGLISH_STOP_WORDS_SET);

            return new TokenStreamComponents(tokenizer, filter);
        }
Beispiel #10
0
        public void TestReset()
        {
            const string Input = "please divide this sentence";

            Tokenizer   wsTokenizer = new WhitespaceTokenizer(new StringReader(Input));
            TokenStream filter      = new ShingleFilter(wsTokenizer, 2);

            // Shared expectations for both passes: unigrams interleaved with their
            // 2-token shingles, each shingle at position increment 0.
            string[] expectedTerms =
            {
                "please", "please divide", "divide", "divide this", "this",
                "this sentence",
                "sentence"
            };
            int[] expectedStartOffsets = { 0, 0, 7, 7, 14, 14, 19 };
            int[] expectedEndOffsets   = { 6, 13, 13, 18, 18, 27, 27 };
            string[] expectedTypes =
            {
                TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE,
                "shingle", TypeAttribute.DEFAULT_TYPE, "shingle",
                TypeAttribute.DEFAULT_TYPE
            };
            int[] expectedPosIncrements = { 1, 0, 1, 0, 1, 0, 1 };

            // First pass over the stream.
            AssertTokenStreamContents(filter, expectedTerms, expectedStartOffsets,
                                      expectedEndOffsets, expectedTypes, expectedPosIncrements);

            // After resetting the tokenizer with fresh (identical) input, the
            // shingle filter must reproduce exactly the same token stream.
            wsTokenizer.Reset(new StringReader(Input));

            AssertTokenStreamContents(filter, expectedTerms, expectedStartOffsets,
                                      expectedEndOffsets, expectedTypes, expectedPosIncrements);
        }
Beispiel #11
0
        protected void ShingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
                                         int[] positionIncrements, String[] types, bool outputUnigrams)
        {
            // Run a ShingleFilter over the input tokens and compare each emitted
            // token (term text, offsets, position increment, type) against the
            // expected token at the same index.
            var filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
            filter.SetOutputUnigrams(outputUnigrams);

            var termAtt    = filter.AddAttribute <ITermAttribute>();
            var offsetAtt  = filter.AddAttribute <IOffsetAttribute>();
            var posIncrAtt = filter.AddAttribute <IPositionIncrementAttribute>();
            var typeAtt    = filter.AddAttribute <ITypeAttribute>();

            int emitted = 0;

            while (filter.IncrementToken())
            {
                Assert.IsTrue(emitted < tokensToCompare.Length, "ShingleFilter outputted more tokens than expected");

                String actualTerm   = termAtt.Term;
                String expectedTerm = tokensToCompare[emitted].Term;

                Assert.AreEqual(expectedTerm, actualTerm, "Wrong termText");
                Assert.AreEqual(tokensToCompare[emitted].StartOffset, offsetAtt.StartOffset,
                                "Wrong startOffset for token \"" + actualTerm + "\"");
                Assert.AreEqual(tokensToCompare[emitted].EndOffset, offsetAtt.EndOffset,
                                "Wrong endOffset for token \"" + actualTerm + "\"");
                Assert.AreEqual(positionIncrements[emitted], posIncrAtt.PositionIncrement,
                                "Wrong positionIncrement for token \"" + actualTerm + "\"");
                Assert.AreEqual(types[emitted], typeAtt.Type, "Wrong type for token \"" + actualTerm + "\"");

                emitted++;
            }

            // The filter must emit exactly as many tokens as expected — no fewer.
            Assert.AreEqual(tokensToCompare.Length, emitted,
                            "ShingleFilter outputted wrong # of tokens. (# output = " + emitted + "; # expected =" +
                            tokensToCompare.Length + ")");
        }
 protected internal override TokenStreamComponents wrapComponents(string fieldName, TokenStreamComponents components)
 {
     // Wrap the stream in a ShingleFilter configured from the analyzer's fields.
     // The size bounds are given to the constructor and then re-applied through
     // the properties, preserving the original assignment order.
     var shingleFilter = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize)
     {
         MinShingleSize = minShingleSize,
         MaxShingleSize = maxShingleSize,
         TokenSeparator = tokenSeparator,
         OutputUnigrams = outputUnigrams,
         OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
         FillerToken = fillerToken
     };
     return new TokenStreamComponents(components.Tokenizer, shingleFilter);
 }