public virtual void TestReusableTokenStream()
        {
            Analyzer a = new ShingleAnalyzerWrapper(new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false), 2);

            AssertAnalyzesTo(a, "please divide into shingles", new string[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
            AssertAnalyzesTo(a, "divide me up again", new string[] { "divide", "divide me", "me", "me up", "up", "up again", "again" }, new int[] { 0, 0, 7, 7, 10, 10, 13 }, new int[] { 6, 9, 9, 12, 12, 18, 18 }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
        }
        public virtual void TestAltTokenSeparator()
        {
            ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false), ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, "<SEP>", true, false, ShingleFilter.DEFAULT_FILLER_TOKEN);

            AssertAnalyzesTo(analyzer, "please divide into shingles", new string[] { "please", "please<SEP>divide", "divide", "divide<SEP>into", "into", "into<SEP>shingles", "shingles" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 });

            analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false), ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, "<SEP>", false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
            AssertAnalyzesTo(analyzer, "please divide into shingles", new string[] { "please<SEP>divide", "divide<SEP>into", "into<SEP>shingles" }, new int[] { 0, 7, 14 }, new int[] { 13, 18, 27 }, new int[] { 1, 1, 1 });
        }
        public virtual void TestNonDefaultMinAndSameMaxShingleSize()
        {
            ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false), 3, 3);

            AssertAnalyzesTo(analyzer, "please divide this sentence into shingles", new string[] { "please", "please divide this", "divide", "divide this sentence", "this", "this sentence into", "sentence", "sentence into shingles", "into", "shingles" }, new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 }, new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 }, new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });

            analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
            AssertAnalyzesTo(analyzer, "please divide this sentence into shingles", new string[] { "please divide this", "divide this sentence", "this sentence into", "sentence into shingles" }, new int[] { 0, 7, 14, 19 }, new int[] { 18, 27, 32, 41 }, new int[] { 1, 1, 1, 1 });
        }
        public virtual void TestAltFillerToken()
        {
            Analyzer @delegate = new AnalyzerAnonymousInnerClassHelper(this);

            ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(@delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, true, false, "--");

            AssertAnalyzesTo(analyzer, "please divide into shingles", new string[] { "please", "please divide", "divide", "divide --", "-- shingles", "shingles" }, new int[] { 0, 0, 7, 7, 19, 19 }, new int[] { 6, 13, 13, 19, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 1 });

            analyzer = new ShingleAnalyzerWrapper(@delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, null);
            AssertAnalyzesTo(analyzer, "please divide into shingles", new string[] { "please divide", "divide ", " shingles" }, new int[] { 0, 7, 19 }, new int[] { 13, 19, 27 }, new int[] { 1, 1, 1 });

            analyzer = new ShingleAnalyzerWrapper(@delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, "");
            AssertAnalyzesTo(analyzer, "please divide into shingles", new string[] { "please divide", "divide ", " shingles" }, new int[] { 0, 7, 19 }, new int[] { 13, 19, 27 }, new int[] { 1, 1, 1 });
        }
Exemplo n.º 5
0
        public virtual void TestAltFillerToken()
        {
            Analyzer @delegate = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                CharArraySet stopSet = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "into");
                Tokenizer tokenizer  = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenFilter filter   = new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet);
                return(new TokenStreamComponents(tokenizer, filter));
            });

            ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(@delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, true, false, "--");

            AssertAnalyzesTo(analyzer, "please divide into shingles", new string[] { "please", "please divide", "divide", "divide --", "-- shingles", "shingles" }, new int[] { 0, 0, 7, 7, 19, 19 }, new int[] { 6, 13, 13, 19, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 1 });

            analyzer = new ShingleAnalyzerWrapper(@delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, null);
            AssertAnalyzesTo(analyzer, "please divide into shingles", new string[] { "please divide", "divide ", " shingles" }, new int[] { 0, 7, 19 }, new int[] { 13, 19, 27 }, new int[] { 1, 1, 1 });

            analyzer = new ShingleAnalyzerWrapper(@delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, "");
            AssertAnalyzesTo(analyzer, "please divide into shingles", new string[] { "please divide", "divide ", " shingles" }, new int[] { 0, 7, 19 }, new int[] { 13, 19, 27 }, new int[] { 1, 1, 1 });
        }
        public virtual void TestAltTokenSeparator()
        {
            ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false), ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, "<SEP>", true, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
            AssertAnalyzesTo(analyzer, "please divide into shingles", new string[] { "please", "please<SEP>divide", "divide", "divide<SEP>into", "into", "into<SEP>shingles", "shingles" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 });

            analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false), ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, "<SEP>", false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
            AssertAnalyzesTo(analyzer, "please divide into shingles", new string[] { "please<SEP>divide", "divide<SEP>into", "into<SEP>shingles" }, new int[] { 0, 7, 14 }, new int[] { 13, 18, 27 }, new int[] { 1, 1, 1 });
        }
        public virtual void TestAltFillerToken()
        {
            Analyzer @delegate = new AnalyzerAnonymousInnerClassHelper(this);

            ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(@delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, true, false, "--");
            AssertAnalyzesTo(analyzer, "please divide into shingles", new string[] { "please", "please divide", "divide", "divide --", "-- shingles", "shingles" }, new int[] { 0, 0, 7, 7, 19, 19 }, new int[] { 6, 13, 13, 19, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 1 });

            analyzer = new ShingleAnalyzerWrapper(@delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, null);
            AssertAnalyzesTo(analyzer, "please divide into shingles", new string[] { "please divide", "divide ", " shingles" }, new int[] { 0, 7, 19 }, new int[] { 13, 19, 27 }, new int[] { 1, 1, 1 });

            analyzer = new ShingleAnalyzerWrapper(@delegate, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, "");
            AssertAnalyzesTo(analyzer, "please divide into shingles", new string[] { "please divide", "divide ", " shingles" }, new int[] { 0, 7, 19 }, new int[] { 13, 19, 27 }, new int[] { 1, 1, 1 });
        }
 public virtual void TestReusableTokenStream()
 {
     Analyzer a = new ShingleAnalyzerWrapper(new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false), 2);
     AssertAnalyzesTo(a, "please divide into shingles", new string[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
     AssertAnalyzesTo(a, "divide me up again", new string[] { "divide", "divide me", "me", "me up", "up", "up again", "again" }, new int[] { 0, 0, 7, 7, 10, 10, 13 }, new int[] { 6, 9, 9, 12, 12, 18, 18 }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
 }
 public virtual void TestOutputUnigramsIfNoShinglesSingleToken()
 {
     ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false), ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, "", false, true, ShingleFilter.DEFAULT_FILLER_TOKEN);
     AssertAnalyzesTo(analyzer, "please", new string[] { "please" }, new int[] { 0 }, new int[] { 6 }, new int[] { 1 });
 }
        public virtual void TestNonDefaultMinShingleSize()
        {
            ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false), 3, 4);
            AssertAnalyzesTo(analyzer, "please divide this sentence into shingles", new string[] { "please", "please divide this", "please divide this sentence", "divide", "divide this sentence", "divide this sentence into", "this", "this sentence into", "this sentence into shingles", "sentence", "sentence into shingles", "into", "shingles" }, new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 }, new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });

            analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
            AssertAnalyzesTo(analyzer, "please divide this sentence into shingles", new string[] { "please divide this", "please divide this sentence", "divide this sentence", "divide this sentence into", "this sentence into", "this sentence into shingles", "sentence into shingles" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 18, 27, 27, 32, 32, 41, 41 }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
        }
Exemplo n.º 11
0
 public void TestWrappedAnalyzerDoesNotReuse()
 {
     Analyzer a = new ShingleAnalyzerWrapper(new NonreusableAnalyzer());
     AssertAnalyzesToReuse(a, "please divide into shingles.",
                           new[]
                               {
                                   "please", "please divide", "divide", "divide into", "into", "into shingles",
                                   "shingles"
                               },
                           new[] { 0, 0, 7, 7, 14, 14, 19 },
                           new[] { 6, 13, 13, 18, 18, 27, 27 },
                           new[] { 1, 0, 1, 0, 1, 0, 1 });
     AssertAnalyzesToReuse(a, "please divide into shingles.",
                           new[]
                               {
                                   "please", "please divide", "divide", "divide into", "into", "into shingles.",
                                   "shingles."
                               },
                           new[] { 0, 0, 7, 7, 14, 14, 19 },
                           new[] { 6, 13, 13, 18, 18, 28, 28 },
                           new[] { 1, 0, 1, 0, 1, 0, 1 });
     AssertAnalyzesToReuse(a, "please divide into shingles.",
                           new[]
                               {
                                   "please", "please divide", "divide", "divide into", "into", "into shingles",
                                   "shingles"
                               },
                           new[] { 0, 0, 7, 7, 14, 14, 19 },
                           new[] { 6, 13, 13, 18, 18, 27, 27 },
                           new[] { 1, 0, 1, 0, 1, 0, 1 });
 }
Exemplo n.º 12
0
 public void TestReusableTokenStream()
 {
     Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
     AssertAnalyzesToReuse(a, "please divide into shingles",
                           new[]
                               {
                                   "please", "please divide", "divide", "divide into", "into", "into shingles",
                                   "shingles"
                               },
                           new[] {0, 0, 7, 7, 14, 14, 19},
                           new[] {6, 13, 13, 18, 18, 27, 27},
                           new[] {1, 0, 1, 0, 1, 0, 1});
     AssertAnalyzesToReuse(a, "divide me up again",
                           new[] {"divide", "divide me", "me", "me up", "up", "up again", "again"},
                           new[] {0, 0, 7, 7, 10, 10, 13},
                           new[] {6, 9, 9, 12, 12, 18, 18},
                           new[] {1, 0, 1, 0, 1, 0, 1});
 }
Exemplo n.º 13
0
        public void TestShingleAnalyzerWrapperBooleanQuery()
        {
            Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
            Searcher = SetUpSearcher(analyzer);

            var q = new BooleanQuery();

            var ts = analyzer.TokenStream("content", new StringReader("test sentence"));

            var termAtt = ts.AddAttribute<ITermAttribute>();

            while (ts.IncrementToken())
            {
                var termText = termAtt.Term;
                q.Add(new TermQuery(new Term("content", termText)),
                      Occur.SHOULD);
            }

            var hits = Searcher.Search(q, null, 1000).ScoreDocs;
            var ranks = new[] {1, 2, 0};
            CompareRanks(hits, ranks);
        }
Exemplo n.º 14
0
        public void TestShingleAnalyzerWrapperPhraseQuery()
        {
            Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
            Searcher = SetUpSearcher(analyzer);

            var q = new PhraseQuery();

            var ts = analyzer.TokenStream("content", new StringReader("this sentence"));
            var j = -1;

            var posIncrAtt = ts.AddAttribute<IPositionIncrementAttribute>();
            var termAtt = ts.AddAttribute<ITermAttribute>();

            while (ts.IncrementToken())
            {
                j += posIncrAtt.PositionIncrement;
                var termText = termAtt.Term;
                q.Add(new Term("content", termText), j);
            }

            var hits = Searcher.Search(q, null, 1000).ScoreDocs;
            var ranks = new[] {0};
            CompareRanks(hits, ranks);
        }
        public virtual void TestOutputUnigramsIfNoShinglesSingleToken()
        {
            ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false), ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, "", false, true, ShingleFilter.DEFAULT_FILLER_TOKEN);

            AssertAnalyzesTo(analyzer, "please", new string[] { "please" }, new int[] { 0 }, new int[] { 6 }, new int[] { 1 });
        }