/// <summary>
/// Feeds two different inputs through one <see cref="ShingleAnalyzerWrapper"/> instance
/// (constructed with the argument 2) and checks, for each input, the expected unigrams and
/// two-word shingles together with their start offsets, end offsets and position increments.
/// </summary>
public virtual void TestReusableTokenStream()
{
    Analyzer shingleAnalyzer = new ShingleAnalyzerWrapper(
        new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false),
        2);

    // First input.
    var firstTerms = new[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" };
    var firstStarts = new[] { 0, 0, 7, 7, 14, 14, 19 };
    var firstEnds = new[] { 6, 13, 13, 18, 18, 27, 27 };
    var firstIncrements = new[] { 1, 0, 1, 0, 1, 0, 1 };
    AssertAnalyzesTo(shingleAnalyzer, "please divide into shingles", firstTerms, firstStarts, firstEnds, firstIncrements);

    // Second input, analyzed with the very same analyzer instance.
    var secondTerms = new[] { "divide", "divide me", "me", "me up", "up", "up again", "again" };
    var secondStarts = new[] { 0, 0, 7, 7, 10, 10, 13 };
    var secondEnds = new[] { 6, 9, 9, 12, 12, 18, 18 };
    var secondIncrements = new[] { 1, 0, 1, 0, 1, 0, 1 };
    AssertAnalyzesTo(shingleAnalyzer, "divide me up again", secondTerms, secondStarts, secondEnds, secondIncrements);
}
/// <summary>
/// Checks that the custom token separator "&lt;SEP&gt;" is used when joining shingle terms,
/// once with the fifth constructor argument set to true (unigrams appear in the expected
/// output) and once with it set to false (only shingles are expected).
/// </summary>
public virtual void TestAltTokenSeparator()
{
    const string text = "please divide into shingles";

    // Unigrams and shingles.
    ShingleAnalyzerWrapper withUnigrams = new ShingleAnalyzerWrapper(
        new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false),
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        "<SEP>",
        true,
        false,
        ShingleFilter.DEFAULT_FILLER_TOKEN);
    AssertAnalyzesTo(
        withUnigrams,
        text,
        new[] { "please", "please<SEP>divide", "divide", "divide<SEP>into", "into", "into<SEP>shingles", "shingles" },
        new[] { 0, 0, 7, 7, 14, 14, 19 },
        new[] { 6, 13, 13, 18, 18, 27, 27 },
        new[] { 1, 0, 1, 0, 1, 0, 1 });

    // Shingles only.
    ShingleAnalyzerWrapper shinglesOnly = new ShingleAnalyzerWrapper(
        new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false),
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        "<SEP>",
        false,
        false,
        ShingleFilter.DEFAULT_FILLER_TOKEN);
    AssertAnalyzesTo(
        shinglesOnly,
        text,
        new[] { "please<SEP>divide", "divide<SEP>into", "into<SEP>shingles" },
        new[] { 0, 7, 14 },
        new[] { 13, 18, 27 },
        new[] { 1, 1, 1 });
}
/// <summary>
/// Uses 3 for both shingle-size arguments and checks the expected three-word shingles:
/// first with the short constructor (unigrams are part of the expected output), then with
/// the long constructor passing false for the fifth and sixth arguments (shingles only).
/// </summary>
public virtual void TestNonDefaultMinAndSameMaxShingleSize()
{
    const string sentence = "please divide this sentence into shingles";

    // Short constructor: expected output interleaves unigrams with three-word shingles.
    ShingleAnalyzerWrapper trigramAnalyzer = new ShingleAnalyzerWrapper(
        new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false),
        3,
        3);
    var mixedTerms = new[]
    {
        "please", "please divide this",
        "divide", "divide this sentence",
        "this", "this sentence into",
        "sentence", "sentence into shingles",
        "into",
        "shingles"
    };
    var mixedStarts = new[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 };
    var mixedEnds = new[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 };
    var mixedIncrements = new[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 };
    AssertAnalyzesTo(trigramAnalyzer, sentence, mixedTerms, mixedStarts, mixedEnds, mixedIncrements);

    // Long constructor: only the three-word shingles are expected.
    trigramAnalyzer = new ShingleAnalyzerWrapper(
        new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false),
        3,
        3,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
        false,
        false,
        ShingleFilter.DEFAULT_FILLER_TOKEN);
    var shingleTerms = new[]
    {
        "please divide this",
        "divide this sentence",
        "this sentence into",
        "sentence into shingles"
    };
    var shingleStarts = new[] { 0, 7, 14, 19 };
    var shingleEnds = new[] { 18, 27, 32, 41 };
    var shingleIncrements = new[] { 1, 1, 1, 1 };
    AssertAnalyzesTo(trigramAnalyzer, sentence, shingleTerms, shingleStarts, shingleEnds, shingleIncrements);
}
/// <summary>
/// Checks the shingle output for three filler-token settings ("--", null and the empty
/// string) when wrapping an <c>AnalyzerAnonymousInnerClassHelper</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the helper analyzer is defined outside this method; the expected terms
/// suggest it drops the word "into" and leaves a position gap - confirm against its definition.
/// </remarks>
public virtual void TestAltFillerToken()
{
    const string text = "please divide into shingles";
    Analyzer wrapped = new AnalyzerAnonymousInnerClassHelper(this);

    // Filler "--", unigrams included in the expected output.
    ShingleAnalyzerWrapper shingler = new ShingleAnalyzerWrapper(
        wrapped,
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
        true,
        false,
        "--");
    AssertAnalyzesTo(
        shingler,
        text,
        new[] { "please", "please divide", "divide", "divide --", "-- shingles", "shingles" },
        new[] { 0, 0, 7, 7, 19, 19 },
        new[] { 6, 13, 13, 19, 27, 27 },
        new[] { 1, 0, 1, 0, 1, 1 });

    // Null filler: the expected shingles contain nothing where the filler would be.
    shingler = new ShingleAnalyzerWrapper(
        wrapped,
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
        false,
        false,
        null);
    AssertAnalyzesTo(
        shingler,
        text,
        new[] { "please divide", "divide ", " shingles" },
        new[] { 0, 7, 19 },
        new[] { 13, 19, 27 },
        new[] { 1, 1, 1 });

    // Empty-string filler: same expectations as the null filler.
    shingler = new ShingleAnalyzerWrapper(
        wrapped,
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
        false,
        false,
        "");
    AssertAnalyzesTo(
        shingler,
        text,
        new[] { "please divide", "divide ", " shingles" },
        new[] { 0, 7, 19 },
        new[] { 13, 19, 27 },
        new[] { 1, 1, 1 });
}
/// <summary>
/// Checks the shingle output for three filler-token settings ("--", null and the empty
/// string). The wrapped analyzer is a whitespace <c>MockTokenizer</c> followed by a
/// <c>StopFilter</c> whose stop set contains only "into".
/// </summary>
public virtual void TestAltFillerToken()
{
    const string text = "please divide into shingles";

    // Whitespace tokenizer followed by a stop filter for the single word "into".
    Analyzer stopWordAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        CharArraySet stopWords = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "into");
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter withoutStopWords = new StopFilter(TEST_VERSION_CURRENT, source, stopWords);
        return new TokenStreamComponents(source, withoutStopWords);
    });

    // Filler "--", unigrams included in the expected output.
    ShingleAnalyzerWrapper shingler = new ShingleAnalyzerWrapper(
        stopWordAnalyzer,
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
        true,
        false,
        "--");
    AssertAnalyzesTo(
        shingler,
        text,
        new[] { "please", "please divide", "divide", "divide --", "-- shingles", "shingles" },
        new[] { 0, 0, 7, 7, 19, 19 },
        new[] { 6, 13, 13, 19, 27, 27 },
        new[] { 1, 0, 1, 0, 1, 1 });

    // Null filler: the expected shingles contain nothing where the filler would be.
    shingler = new ShingleAnalyzerWrapper(
        stopWordAnalyzer,
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
        false,
        false,
        null);
    AssertAnalyzesTo(
        shingler,
        text,
        new[] { "please divide", "divide ", " shingles" },
        new[] { 0, 7, 19 },
        new[] { 13, 19, 27 },
        new[] { 1, 1, 1 });

    // Empty-string filler: same expectations as the null filler.
    shingler = new ShingleAnalyzerWrapper(
        stopWordAnalyzer,
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
        false,
        false,
        "");
    AssertAnalyzesTo(
        shingler,
        text,
        new[] { "please divide", "divide ", " shingles" },
        new[] { 0, 7, 19 },
        new[] { 13, 19, 27 },
        new[] { 1, 1, 1 });
}
/// <summary>
/// Verifies that shingle terms are joined with the custom separator "&lt;SEP&gt;". The first
/// analyzer passes true as the fifth constructor argument and expects unigrams alongside the
/// shingles; the second passes false and expects the shingles alone.
/// </summary>
public virtual void TestAltTokenSeparator()
{
    const string input = "please divide into shingles";

    // Case 1: unigrams plus "<SEP>"-joined shingles.
    var expectedMixedTerms = new[] { "please", "please<SEP>divide", "divide", "divide<SEP>into", "into", "into<SEP>shingles", "shingles" };
    var expectedMixedStarts = new[] { 0, 0, 7, 7, 14, 14, 19 };
    var expectedMixedEnds = new[] { 6, 13, 13, 18, 18, 27, 27 };
    var expectedMixedIncrements = new[] { 1, 0, 1, 0, 1, 0, 1 };

    ShingleAnalyzerWrapper separatorAnalyzer = new ShingleAnalyzerWrapper(
        new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false),
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        "<SEP>",
        true,
        false,
        ShingleFilter.DEFAULT_FILLER_TOKEN);
    AssertAnalyzesTo(separatorAnalyzer, input, expectedMixedTerms, expectedMixedStarts, expectedMixedEnds, expectedMixedIncrements);

    // Case 2: "<SEP>"-joined shingles only.
    var expectedShingleTerms = new[] { "please<SEP>divide", "divide<SEP>into", "into<SEP>shingles" };
    var expectedShingleStarts = new[] { 0, 7, 14 };
    var expectedShingleEnds = new[] { 13, 18, 27 };
    var expectedShingleIncrements = new[] { 1, 1, 1 };

    separatorAnalyzer = new ShingleAnalyzerWrapper(
        new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false),
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        "<SEP>",
        false,
        false,
        ShingleFilter.DEFAULT_FILLER_TOKEN);
    AssertAnalyzesTo(separatorAnalyzer, input, expectedShingleTerms, expectedShingleStarts, expectedShingleEnds, expectedShingleIncrements);
}
/// <summary>
/// Analyzes two different strings with a single <see cref="ShingleAnalyzerWrapper"/>
/// (constructed with the argument 2) and asserts the unigram/two-word-shingle terms,
/// offsets and position increments expected for each string.
/// </summary>
public virtual void TestReusableTokenStream()
{
    Analyzer reused = new ShingleAnalyzerWrapper(
        new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false),
        2);

    AssertAnalyzesTo(
        reused,
        "please divide into shingles",
        new[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
        new[] { 0, 0, 7, 7, 14, 14, 19 },
        new[] { 6, 13, 13, 18, 18, 27, 27 },
        new[] { 1, 0, 1, 0, 1, 0, 1 });

    // Same analyzer instance, different text.
    AssertAnalyzesTo(
        reused,
        "divide me up again",
        new[] { "divide", "divide me", "me", "me up", "up", "up again", "again" },
        new[] { 0, 0, 7, 7, 10, 10, 13 },
        new[] { 6, 9, 9, 12, 12, 18, 18 },
        new[] { 1, 0, 1, 0, 1, 0, 1 });
}
/// <summary>
/// Analyzes the single-word input "please" with the fifth constructor argument false and
/// the sixth true, and expects that one word to be emitted unchanged (offsets 0-6,
/// position increment 1).
/// </summary>
public virtual void TestOutputUnigramsIfNoShinglesSingleToken()
{
    var expectedTerms = new[] { "please" };
    var expectedStarts = new[] { 0 };
    var expectedEnds = new[] { 6 };
    var expectedIncrements = new[] { 1 };

    ShingleAnalyzerWrapper singleTokenAnalyzer = new ShingleAnalyzerWrapper(
        new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false),
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        "",
        false,
        true,
        ShingleFilter.DEFAULT_FILLER_TOKEN);

    AssertAnalyzesTo(singleTokenAnalyzer, "please", expectedTerms, expectedStarts, expectedEnds, expectedIncrements);
}
/// <summary>
/// Uses shingle-size arguments 3 and 4 and checks the expected three- and four-word
/// shingles: first with the short constructor (unigrams are part of the expected output),
/// then with the long constructor passing false for the fifth and sixth arguments
/// (shingles only).
/// </summary>
public virtual void TestNonDefaultMinShingleSize()
{
    const string sentence = "please divide this sentence into shingles";

    // Short constructor: unigrams interleaved with three- and four-word shingles.
    ShingleAnalyzerWrapper sizedAnalyzer = new ShingleAnalyzerWrapper(
        new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false),
        3,
        4);
    var mixedTerms = new[]
    {
        "please", "please divide this", "please divide this sentence",
        "divide", "divide this sentence", "divide this sentence into",
        "this", "this sentence into", "this sentence into shingles",
        "sentence", "sentence into shingles",
        "into",
        "shingles"
    };
    var mixedStarts = new[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 };
    var mixedEnds = new[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 };
    var mixedIncrements = new[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 };
    AssertAnalyzesTo(sizedAnalyzer, sentence, mixedTerms, mixedStarts, mixedEnds, mixedIncrements);

    // Long constructor: shingles only.
    sizedAnalyzer = new ShingleAnalyzerWrapper(
        new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false),
        3,
        4,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
        false,
        false,
        ShingleFilter.DEFAULT_FILLER_TOKEN);
    var shingleTerms = new[]
    {
        "please divide this", "please divide this sentence",
        "divide this sentence", "divide this sentence into",
        "this sentence into", "this sentence into shingles",
        "sentence into shingles"
    };
    var shingleStarts = new[] { 0, 0, 7, 7, 14, 14, 19 };
    var shingleEnds = new[] { 18, 27, 27, 32, 32, 41, 41 };
    var shingleIncrements = new[] { 1, 0, 1, 0, 1, 0, 1 };
    AssertAnalyzesTo(sizedAnalyzer, sentence, shingleTerms, shingleStarts, shingleEnds, shingleIncrements);
}
/// <summary>
/// Analyzes the same text three times through a <see cref="ShingleAnalyzerWrapper"/> around
/// a <c>NonreusableAnalyzer</c>. The first and third passes expect the trailing period to be
/// dropped; the second pass expects it to be kept (end offset 28 instead of 27).
/// </summary>
/// <remarks>
/// NOTE(review): <c>NonreusableAnalyzer</c> is defined elsewhere; the alternating expectations
/// presumably reflect it switching tokenizers between calls - confirm against its definition.
/// </remarks>
public void TestWrappedAnalyzerDoesNotReuse()
{
    const string text = "please divide into shingles.";
    Analyzer wrapper = new ShingleAnalyzerWrapper(new NonreusableAnalyzer());

    // Pass 1: trailing period is not part of the last term.
    AssertAnalyzesToReuse(
        wrapper,
        text,
        new[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
        new[] { 0, 0, 7, 7, 14, 14, 19 },
        new[] { 6, 13, 13, 18, 18, 27, 27 },
        new[] { 1, 0, 1, 0, 1, 0, 1 });

    // Pass 2: trailing period is kept, so the last two terms end one character later.
    AssertAnalyzesToReuse(
        wrapper,
        text,
        new[] { "please", "please divide", "divide", "divide into", "into", "into shingles.", "shingles." },
        new[] { 0, 0, 7, 7, 14, 14, 19 },
        new[] { 6, 13, 13, 18, 18, 28, 28 },
        new[] { 1, 0, 1, 0, 1, 0, 1 });

    // Pass 3: same expectations as pass 1.
    AssertAnalyzesToReuse(
        wrapper,
        text,
        new[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
        new[] { 0, 0, 7, 7, 14, 14, 19 },
        new[] { 6, 13, 13, 18, 18, 27, 27 },
        new[] { 1, 0, 1, 0, 1, 0, 1 });
}
/// <summary>
/// Runs two different inputs through one <see cref="ShingleAnalyzerWrapper"/> built over a
/// <c>WhitespaceAnalyzer</c> (constructed with the argument 2) via
/// <c>AssertAnalyzesToReuse</c>, checking terms, offsets and position increments each time.
/// </summary>
public void TestReusableTokenStream()
{
    Analyzer wrapper = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);

    // First input.
    var termsA = new[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" };
    var startsA = new[] { 0, 0, 7, 7, 14, 14, 19 };
    var endsA = new[] { 6, 13, 13, 18, 18, 27, 27 };
    var incrementsA = new[] { 1, 0, 1, 0, 1, 0, 1 };
    AssertAnalyzesToReuse(wrapper, "please divide into shingles", termsA, startsA, endsA, incrementsA);

    // Second input through the same analyzer instance.
    var termsB = new[] { "divide", "divide me", "me", "me up", "up", "up again", "again" };
    var startsB = new[] { 0, 0, 7, 7, 10, 10, 13 };
    var endsB = new[] { 6, 9, 9, 12, 12, 18, 18 };
    var incrementsB = new[] { 1, 0, 1, 0, 1, 0, 1 };
    AssertAnalyzesToReuse(wrapper, "divide me up again", termsB, startsB, endsB, incrementsB);
}
/// <summary>
/// Builds a <c>BooleanQuery</c> containing one SHOULD <c>TermQuery</c> on field "content"
/// for every token the shingle analyzer emits for "test sentence", runs it against the
/// searcher returned by <c>SetUpSearcher</c>, and compares the hits with the expected
/// ranks { 1, 2, 0 }.
/// </summary>
public void TestShingleAnalyzerWrapperBooleanQuery()
{
    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    Searcher = SetUpSearcher(analyzer);

    var q = new BooleanQuery();

    // Dispose the token stream (and with it the underlying reader) once it has been
    // fully consumed instead of leaving it open for the rest of the test.
    using (var ts = analyzer.TokenStream("content", new StringReader("test sentence")))
    {
        var termAtt = ts.AddAttribute<ITermAttribute>();
        while (ts.IncrementToken())
        {
            var termText = termAtt.Term;
            q.Add(new TermQuery(new Term("content", termText)), Occur.SHOULD);
        }
    }

    var hits = Searcher.Search(q, null, 1000).ScoreDocs;
    var ranks = new[] { 1, 2, 0 };
    CompareRanks(hits, ranks);
}
/// <summary>
/// Builds a <c>PhraseQuery</c> on field "content" from the tokens the shingle analyzer emits
/// for "this sentence", placing each term at the position obtained by accumulating the
/// tokens' position increments, then searches and compares the hits with the expected
/// ranks { 0 }.
/// </summary>
public void TestShingleAnalyzerWrapperPhraseQuery()
{
    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    Searcher = SetUpSearcher(analyzer);

    var q = new PhraseQuery();

    // Dispose the token stream (and with it the underlying reader) once it has been
    // fully consumed instead of leaving it open for the rest of the test.
    using (var ts = analyzer.TokenStream("content", new StringReader("this sentence")))
    {
        // Starts at -1 so that a first increment of 1 yields position 0.
        var j = -1;

        var posIncrAtt = ts.AddAttribute<IPositionIncrementAttribute>();
        var termAtt = ts.AddAttribute<ITermAttribute>();

        while (ts.IncrementToken())
        {
            j += posIncrAtt.PositionIncrement;
            var termText = termAtt.Term;
            q.Add(new Term("content", termText), j);
        }
    }

    var hits = Searcher.Search(q, null, 1000).ScoreDocs;
    var ranks = new[] { 0 };
    CompareRanks(hits, ranks);
}
/// <summary>
/// With the fifth constructor argument false and the sixth true, a one-word input
/// ("please") is expected to come back as that single term with offsets 0-6 and a
/// position increment of 1.
/// </summary>
public virtual void TestOutputUnigramsIfNoShinglesSingleToken()
{
    ShingleAnalyzerWrapper loneTokenAnalyzer = new ShingleAnalyzerWrapper(
        new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false),
        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
        "",
        false,
        true,
        ShingleFilter.DEFAULT_FILLER_TOKEN);

    AssertAnalyzesTo(
        loneTokenAnalyzer,
        "please",
        new[] { "please" },
        new[] { 0 },
        new[] { 6 },
        new[] { 1 });
}