/// <summary>
/// Verifies that a <see cref="WildcardQuery"/> ("te*") is highlighted, and that a
/// wildcard on a non-existent field highlights nothing.
/// </summary>
public void TestWildcards()
{
    Directory dir = NewDirectory();
    // use simpleanalyzer for more natural tokenization (else "test." is a token)
    Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);

    // Postings highlighting requires offsets to be indexed on the field.
    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
    offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    Field body = new Field("body", "", offsetsType);
    Document doc = new Document();
    doc.Add(body);

    body.SetStringValue("This is a test.");
    iw.AddDocument(doc);
    body.SetStringValue("Test a one sentence document.");
    iw.AddDocument(doc);

    IndexReader ir = iw.GetReader();
    iw.Dispose();

    IndexSearcher searcher = NewSearcher(ir);
    // Helper subclass replaces the original Java anonymous PostingsHighlighter
    // that overrode getIndexAnalyzer(field) to return our analyzer.
    ICUPostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);

    Query query = new WildcardQuery(new Term("body", "te*"));
    TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
    assertEquals(2, topDocs.TotalHits);
    String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
    assertEquals(2, snippets.Length);
    assertEquals("This is a <b>test</b>.", snippets[0]);
    assertEquals("<b>Test</b> a one sentence document.", snippets[1]);

    // wrong field: the wildcard targets "bogus", so nothing in "body" is highlighted
    BooleanQuery bq = new BooleanQuery();
    bq.Add(new MatchAllDocsQuery(), Occur.SHOULD);
    bq.Add(new WildcardQuery(new Term("bogus", "te*")), Occur.SHOULD);
    topDocs = searcher.Search(bq, null, 10, Sort.INDEXORDER);
    assertEquals(2, topDocs.TotalHits);
    snippets = highlighter.Highlight("body", bq, searcher, topDocs);
    assertEquals(2, snippets.Length);
    assertEquals("This is a test.", snippets[0]);
    assertEquals("Test a one sentence document.", snippets[1]);

    ir.Dispose();
    dir.Dispose();
}
/// <summary>
/// Checks that multi-term-query highlighting reports which rewritten query matched:
/// first with the default formatter (plain bolding), then with a custom formatter
/// that appends the matched term (e.g. "body:te*") after each hit.
/// </summary>
public void TestWhichMTQMatched()
{
    Directory directory = NewDirectory();
    // use simpleanalyzer for more natural tokenization (else "test." is a token)
    Analyzer simpleAnalyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
    IndexWriterConfig writerConfig = NewIndexWriterConfig(TEST_VERSION_CURRENT, simpleAnalyzer);
    writerConfig.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random, directory, writerConfig);

    // Offsets must be indexed for postings-based highlighting.
    FieldType storedWithOffsets = new FieldType(TextField.TYPE_STORED);
    storedWithOffsets.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    Field bodyField = new Field("body", "", storedWithOffsets);
    Document document = new Document();
    document.Add(bodyField);

    bodyField.SetStringValue("Test a one sentence document.");
    writer.AddDocument(document);

    IndexReader reader = writer.GetReader();
    writer.Dispose();

    IndexSearcher indexSearcher = NewSearcher(reader);
    ICUPostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(simpleAnalyzer);

    BooleanQuery booleanQuery = new BooleanQuery();
    booleanQuery.Add(new WildcardQuery(new Term("body", "te*")), Occur.SHOULD);
    booleanQuery.Add(new WildcardQuery(new Term("body", "one")), Occur.SHOULD);
    booleanQuery.Add(new WildcardQuery(new Term("body", "se*")), Occur.SHOULD);

    TopDocs hits = indexSearcher.Search(booleanQuery, null, 10, Sort.INDEXORDER);
    assertEquals(1, hits.TotalHits);
    String[] fragments = highlighter.Highlight("body", booleanQuery, indexSearcher, hits);
    assertEquals(1, fragments.Length);
    // Default formatter just bolds each hit:
    assertEquals("<b>Test</b> a <b>one</b> <b>sentence</b> document.", fragments[0]);

    // Now use our own formatter, that also stuffs the
    // matching term's text into the result:
    highlighter = new PostingsHighlighterAnalyzerAndFormatterHelper(simpleAnalyzer, new PassageFormatterHelper());

    assertEquals(1, hits.TotalHits);
    fragments = highlighter.Highlight("body", booleanQuery, indexSearcher, hits);
    assertEquals(1, fragments.Length);
    assertEquals("<b>Test(body:te*)</b> a <b>one(body:one)</b> <b>sentence(body:se*)</b> document.", fragments[0]);

    reader.Dispose();
    directory.Dispose();
}
/// <summary>
/// Verifies highlighting of a <see cref="SpanNotQuery"/> whose include clause is a
/// wildcard span ("te*") and whose exclude term ("bogus") never occurs.
/// </summary>
public void TestSpanNot()
{
    Directory directory = NewDirectory();
    // use simpleanalyzer for more natural tokenization (else "test." is a token)
    Analyzer simpleAnalyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
    IndexWriterConfig writerConfig = NewIndexWriterConfig(TEST_VERSION_CURRENT, simpleAnalyzer);
    writerConfig.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random, directory, writerConfig);

    // Offsets must be indexed for postings-based highlighting.
    FieldType storedWithOffsets = new FieldType(TextField.TYPE_STORED);
    storedWithOffsets.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    Field bodyField = new Field("body", "", storedWithOffsets);
    Document document = new Document();
    document.Add(bodyField);

    bodyField.SetStringValue("This is a test.");
    writer.AddDocument(document);
    bodyField.SetStringValue("Test a one sentence document.");
    writer.AddDocument(document);

    IndexReader reader = writer.GetReader();
    writer.Dispose();

    IndexSearcher indexSearcher = NewSearcher(reader);
    ICUPostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(simpleAnalyzer);

    SpanQuery include = new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("body", "te*")));
    SpanQuery exclude = new SpanTermQuery(new Term("body", "bogus"));
    Query spanNot = new SpanNotQuery(include, exclude);

    TopDocs hits = indexSearcher.Search(spanNot, null, 10, Sort.INDEXORDER);
    assertEquals(2, hits.TotalHits);
    String[] fragments = highlighter.Highlight("body", spanNot, indexSearcher, hits);
    assertEquals(2, fragments.Length);
    assertEquals("This is a <b>test</b>.", fragments[0]);
    assertEquals("<b>Test</b> a one sentence document.", fragments[1]);

    reader.Dispose();
    directory.Dispose();
}
/// <summary>
/// Exercises highlighting of <see cref="TermRangeQuery"/> across its boundary cases:
/// bounded range, open (null) start/end, inclusive/exclusive endpoints on exact terms,
/// and a range on the wrong field (which highlights nothing).
/// </summary>
public void TestRanges()
{
    Directory directory = NewDirectory();
    // use simpleanalyzer for more natural tokenization (else "test." is a token)
    Analyzer simpleAnalyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
    IndexWriterConfig writerConfig = NewIndexWriterConfig(TEST_VERSION_CURRENT, simpleAnalyzer);
    writerConfig.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random, directory, writerConfig);

    // Offsets must be indexed for postings-based highlighting.
    FieldType storedWithOffsets = new FieldType(TextField.TYPE_STORED);
    storedWithOffsets.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    Field bodyField = new Field("body", "", storedWithOffsets);
    Document document = new Document();
    document.Add(bodyField);

    bodyField.SetStringValue("This is a test.");
    writer.AddDocument(document);
    bodyField.SetStringValue("Test a one sentence document.");
    writer.AddDocument(document);

    IndexReader reader = writer.GetReader();
    writer.Dispose();

    IndexSearcher indexSearcher = NewSearcher(reader);
    ICUPostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(simpleAnalyzer);

    // bounded range ["ta", "tf"]: only "test"/"Test" falls inside
    Query rangeQuery = TermRangeQuery.NewStringRange("body", "ta", "tf", true, true);
    TopDocs hits = indexSearcher.Search(rangeQuery, null, 10, Sort.INDEXORDER);
    assertEquals(2, hits.TotalHits);
    String[] fragments = highlighter.Highlight("body", rangeQuery, indexSearcher, hits);
    assertEquals(2, fragments.Length);
    assertEquals("This is a <b>test</b>.", fragments[0]);
    assertEquals("<b>Test</b> a one sentence document.", fragments[1]);

    // null start
    rangeQuery = TermRangeQuery.NewStringRange("body", null, "tf", true, true);
    hits = indexSearcher.Search(rangeQuery, null, 10, Sort.INDEXORDER);
    assertEquals(2, hits.TotalHits);
    fragments = highlighter.Highlight("body", rangeQuery, indexSearcher, hits);
    assertEquals(2, fragments.Length);
    assertEquals("This <b>is</b> <b>a</b> <b>test</b>.", fragments[0]);
    assertEquals("<b>Test</b> <b>a</b> <b>one</b> <b>sentence</b> <b>document</b>.", fragments[1]);

    // null end
    rangeQuery = TermRangeQuery.NewStringRange("body", "ta", null, true, true);
    hits = indexSearcher.Search(rangeQuery, null, 10, Sort.INDEXORDER);
    assertEquals(2, hits.TotalHits);
    fragments = highlighter.Highlight("body", rangeQuery, indexSearcher, hits);
    assertEquals(2, fragments.Length);
    assertEquals("<b>This</b> is a <b>test</b>.", fragments[0]);
    assertEquals("<b>Test</b> a one sentence document.", fragments[1]);

    // exact start inclusive
    rangeQuery = TermRangeQuery.NewStringRange("body", "test", "tf", true, true);
    hits = indexSearcher.Search(rangeQuery, null, 10, Sort.INDEXORDER);
    assertEquals(2, hits.TotalHits);
    fragments = highlighter.Highlight("body", rangeQuery, indexSearcher, hits);
    assertEquals(2, fragments.Length);
    assertEquals("This is a <b>test</b>.", fragments[0]);
    assertEquals("<b>Test</b> a one sentence document.", fragments[1]);

    // exact end inclusive
    rangeQuery = TermRangeQuery.NewStringRange("body", "ta", "test", true, true);
    hits = indexSearcher.Search(rangeQuery, null, 10, Sort.INDEXORDER);
    assertEquals(2, hits.TotalHits);
    fragments = highlighter.Highlight("body", rangeQuery, indexSearcher, hits);
    assertEquals(2, fragments.Length);
    assertEquals("This is a <b>test</b>.", fragments[0]);
    assertEquals("<b>Test</b> a one sentence document.", fragments[1]);

    // exact start exclusive: the range excludes "test", so nothing is highlighted
    BooleanQuery boolQuery = new BooleanQuery();
    boolQuery.Add(new MatchAllDocsQuery(), Occur.SHOULD);
    boolQuery.Add(TermRangeQuery.NewStringRange("body", "test", "tf", false, true), Occur.SHOULD);
    hits = indexSearcher.Search(boolQuery, null, 10, Sort.INDEXORDER);
    assertEquals(2, hits.TotalHits);
    fragments = highlighter.Highlight("body", boolQuery, indexSearcher, hits);
    assertEquals(2, fragments.Length);
    assertEquals("This is a test.", fragments[0]);
    assertEquals("Test a one sentence document.", fragments[1]);

    // exact end exclusive: likewise excludes "test"
    boolQuery = new BooleanQuery();
    boolQuery.Add(new MatchAllDocsQuery(), Occur.SHOULD);
    boolQuery.Add(TermRangeQuery.NewStringRange("body", "ta", "test", true, false), Occur.SHOULD);
    hits = indexSearcher.Search(boolQuery, null, 10, Sort.INDEXORDER);
    assertEquals(2, hits.TotalHits);
    fragments = highlighter.Highlight("body", boolQuery, indexSearcher, hits);
    assertEquals(2, fragments.Length);
    assertEquals("This is a test.", fragments[0]);
    assertEquals("Test a one sentence document.", fragments[1]);

    // wrong field
    boolQuery = new BooleanQuery();
    boolQuery.Add(new MatchAllDocsQuery(), Occur.SHOULD);
    boolQuery.Add(TermRangeQuery.NewStringRange("bogus", "ta", "tf", true, true), Occur.SHOULD);
    hits = indexSearcher.Search(boolQuery, null, 10, Sort.INDEXORDER);
    assertEquals(2, hits.TotalHits);
    fragments = highlighter.Highlight("body", boolQuery, indexSearcher, hits);
    assertEquals(2, fragments.Length);
    assertEquals("This is a test.", fragments[0]);
    assertEquals("Test a one sentence document.", fragments[1]);

    reader.Dispose();
    directory.Dispose();
}
// NOTE(review): this method has the same name and signature as the TestWhichMTQMatched
// defined earlier in this source — two identical method signatures cannot coexist in
// one class, so one copy must be removed (or this is a chunk/merge artifact — confirm).
// This copy also uses the older Random()/iw.Reader test-framework style, while the
// earlier copy uses the Random property and GetReader().
/// <summary>
/// Checks which multi-term query matched: first with the default formatter (plain
/// bolding), then with a custom formatter that appends the matched term after each hit.
/// </summary>
public void TestWhichMTQMatched()
{
    Directory dir = NewDirectory();
    // use simpleanalyzer for more natural tokenization (else "test." is a token)
    Analyzer analyzer = new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true);
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter iw = new RandomIndexWriter(Random(), dir, iwc);

    // Offsets must be indexed for postings-based highlighting.
    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
    offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    Field body = new Field("body", "", offsetsType);
    Document doc = new Document();
    doc.Add(body);

    body.SetStringValue("Test a one sentence document.");
    iw.AddDocument(doc);

    IndexReader ir = iw.Reader;
    iw.Dispose();

    IndexSearcher searcher = NewSearcher(ir);
    // Helper subclass replaces the original Java anonymous PostingsHighlighter that
    // overrode getIndexAnalyzer(field); the commented-out Java original was removed.
    PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);

    BooleanQuery query = new BooleanQuery();
    query.Add(new WildcardQuery(new Term("body", "te*")), Occur.SHOULD);
    query.Add(new WildcardQuery(new Term("body", "one")), Occur.SHOULD);
    query.Add(new WildcardQuery(new Term("body", "se*")), Occur.SHOULD);

    TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
    assertEquals(1, topDocs.TotalHits);
    String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
    assertEquals(1, snippets.Length);
    // Default formatter just bolds each hit:
    assertEquals("<b>Test</b> a <b>one</b> <b>sentence</b> document.", snippets[0]);

    // Now use our own formatter, that also stuffs the
    // matching term's text into the result. The helper replaces the original Java
    // anonymous PassageFormatter (removed here as dead commented-out code).
    highlighter = new PostingsHighlighterAnalyzerAndFormatterHelper(analyzer, new PassageFormatterHelper());

    assertEquals(1, topDocs.TotalHits);
    snippets = highlighter.Highlight("body", query, searcher, topDocs);
    assertEquals(1, snippets.Length);
    assertEquals("<b>Test(body:te*)</b> a <b>one(body:one)</b> <b>sentence(body:se*)</b> document.", snippets[0]);

    ir.Dispose();
    dir.Dispose();
}