//END
/// <summary>
/// Creates the index document for a single file, replacing any document
/// already indexed under the same Id.
/// </summary>
/// <param name="file">File whose fields are written to the index; must not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
public void BuildIndex(FileToIndex file)
{
    if (file == null)
    {
        throw new ArgumentNullException("file");
    }

    string id = file.Id.ToString();

    using (var analyzer = new Lucene.Net.Analysis.Ru.RussianAnalyzer(Version.LUCENE_30))
    using (IndexWriter idxw = new IndexWriter(_directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        Document doc = new Document();

        // Id is NOT_ANALYZED (kept as one exact term) so the Term lookup below matches it.
        doc.Add(new Field("Id", id, Field.Store.YES, Field.Index.NOT_ANALYZED));

        // The remaining fields are ANALYZED: the analyzer splits them into words.
        // Field rejects null values, so missing metadata is indexed as an empty string.
        doc.Add(new Field("Title", file.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("Description", file.Description ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("Authors", file.Authors ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("Text", file.Text ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("Hashtags", file.Hashtags ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("Discipline", file.Discipline ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));

        // Delete any existing document with this Id and add the new one in a single step.
        idxw.UpdateDocument(new Term("Id", id), doc);

        // Commit, then optimize; the writer is closed when the using block ends.
        // NOTE(review): optimizing after every single document is costly - consider batching.
        idxw.Commit();
        idxw.Optimize();
    }
}
//partially taken from http://www.codeproject.com/Articles/320219/Lucene-Net-ultra-fast-search-for-MVC-or-WebForms
//START
/// <summary>
/// Runs a keyword search against the index and returns up to 100 matching files.
/// </summary>
/// <param name="keywords">Query text. Null, empty, or wildcard-only ("*", "?") input yields no results.</param>
/// <param name="count">Total number of hits reported by the searcher (may exceed the number returned).</param>
/// <param name="field">Single field to search; when null or empty, Title, Authors, Description, Text and Discipline are searched.</param>
/// <returns>The matching files, fully materialized.</returns>
private IEnumerable<FileToIndex> _search(string keywords, out int count, string field = "")
{
    // Nothing to search for: a null query, or one consisting only of wildcard characters.
    if (keywords == null || string.IsNullOrEmpty(keywords.Replace("*", "").Replace("?", "")))
    {
        count = 0;
        return new List<FileToIndex>();
    }

    const int maxHits = 100;

    using (var searcher = new IndexSearcher(_directory))
    using (var analyzer = new Lucene.Net.Analysis.Ru.RussianAnalyzer(Version.LUCENE_30))
    {
        TopDocs docs;
        if (!string.IsNullOrEmpty(field))
        {
            var parser = new QueryParser(Version.LUCENE_30, field, analyzer);
            docs = searcher.Search(parseQuery(keywords, parser), maxHits);
        }
        else
        {
            var parser = new MultiFieldQueryParser(
                Version.LUCENE_30,
                new[] { "Title", "Authors", "Description", "Text", "Discipline" },
                analyzer);
            docs = searcher.Search(parseQuery(keywords, parser), null, maxHits, Sort.RELEVANCE);
        }

        count = docs.TotalHits;

        // _convertDocs is not visible here; copy its result into a list so that, even if it
        // is lazily evaluated, every document is read before the searcher is disposed.
        // The using statement disposes the searcher, so no explicit Dispose() call is needed.
        return new List<FileToIndex>(_convertDocs(docs.ScoreDocs, searcher));
    }
}
/// <summary>
/// Searches the line-text field of the index in <c>index_folder</c> and returns each hit
/// together with an HTML fragment in which the matched terms are highlighted.
/// </summary>
/// <param name="query_str">Lucene query text. Null or blank input yields an empty result.</param>
/// <returns>At most <c>max_search_hits</c> hits, in the order returned by the searcher.</returns>
public IEnumerable<SampleHit> Search(string query_str)
{
    List<SampleHit> result_hits = new List<SampleHit>();

    // A null or blank query cannot be parsed into anything useful; skip opening the index.
    if (string.IsNullOrWhiteSpace(query_str))
    {
        return result_hits;
    }

    // Alternative analyzer left by the original author: Lucene.Net.Analysis.Standard.StandardAnalyzer.
    using (Lucene.Net.Store.Directory luceneIndexDirectory = Lucene.Net.Store.FSDirectory.Open(index_folder))
    using (Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Ru.RussianAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT))
    using (IndexSearcher searcher = new IndexSearcher(luceneIndexDirectory))
    {
        QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_CURRENT, IndexModel.LineText, analyzer);
        Query query = parser.Parse(query_str);
        TopDocs hits = searcher.Search(query, max_search_hits);

        // Highlighter setup: wrap matched terms in a yellow span, using fragments of up to 200 characters.
        var formatter = new Lucene.Net.Search.Highlight.SimpleHTMLFormatter("<span style=\"background:yellow;\">", "</span>");
        var fragmenter = new Lucene.Net.Search.Highlight.SimpleFragmenter(200);
        var scorer = new Lucene.Net.Search.Highlight.QueryScorer(query);
        var highlighter = new Lucene.Net.Search.Highlight.Highlighter(formatter, scorer);
        highlighter.TextFragmenter = fragmenter;

        foreach (ScoreDoc hit in hits.ScoreDocs)
        {
            Document doc = searcher.Doc(hit.Doc);
            Field line_number = doc.GetField(IndexModel.LineNumber);
            Field line_text = doc.GetField(IndexModel.LineText);
            string text = line_text.StringValue;

            // Re-tokenize the stored text so the highlighter can locate the query terms in it.
            Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(text));
            string highlightedText = highlighter.GetBestFragments(stream, text, 1, "...");

            result_hits.Add(new SampleHit
            {
                line_number = line_number.StringValue,
                sample_text = text,
                html_highlighting = highlightedText
            });
        }
    }

    return result_hits;
}
//partially taken from http://www.codeproject.com/Articles/320219/Lucene-Net-ultra-fast-search-for-MVC-or-WebForms
//START
/// <summary>
/// Runs a keyword search against the index and returns up to 100 matching files.
/// </summary>
/// <param name="keywords">Query text. Null, empty, or wildcard-only ("*", "?") input yields no results.</param>
/// <param name="count">Total number of hits reported by the searcher (may exceed the number returned).</param>
/// <param name="field">Single field to search; when null or empty, Title, Authors, Description, Text and Discipline are searched.</param>
/// <returns>The matching files, fully materialized.</returns>
private IEnumerable<FileToIndex> _search(string keywords, out int count, string field = "")
{
    // Nothing to search for: a null query, or one consisting only of wildcard characters.
    if (keywords == null || string.IsNullOrEmpty(keywords.Replace("*", "").Replace("?", "")))
    {
        count = 0;
        return new List<FileToIndex>();
    }

    const int maxHits = 100;

    using (var searcher = new IndexSearcher(_directory))
    using (var analyzer = new Lucene.Net.Analysis.Ru.RussianAnalyzer(Version.LUCENE_30))
    {
        TopDocs docs;
        if (!string.IsNullOrEmpty(field))
        {
            var parser = new QueryParser(Version.LUCENE_30, field, analyzer);
            docs = searcher.Search(parseQuery(keywords, parser), maxHits);
        }
        else
        {
            var parser = new MultiFieldQueryParser(
                Version.LUCENE_30,
                new[] { "Title", "Authors", "Description", "Text", "Discipline" },
                analyzer);
            docs = searcher.Search(parseQuery(keywords, parser), null, maxHits, Sort.RELEVANCE);
        }

        count = docs.TotalHits;

        // _convertDocs is not visible here; copy its result into a list so that, even if it
        // is lazily evaluated, every document is read before the searcher is disposed.
        // The using statement disposes the searcher, so no explicit Dispose() call is needed.
        return new List<FileToIndex>(_convertDocs(docs.ScoreDocs, searcher));
    }
}
/// <summary>
/// Rebuilds the Lucene index in <paramref name="index_folder"/> from the lines of a corpus file.
/// Any existing content of the index folder is deleted first.
/// </summary>
/// <param name="index_folder">Folder that receives the index; must not be null or empty.</param>
/// <param name="corpus_file_path">Path of the corpus file to index; must exist.</param>
/// <param name="corpus_format">Corpus format descriptor (currently not used by this method).</param>
/// <param name="cancellation">Polled once per row; when it reports a pending cancellation, indexing stops. May be null.</param>
/// <param name="progress">Receives a percentage every 100000 rows. May be null.</param>
/// <exception cref="ArgumentException"><paramref name="index_folder"/> or <paramref name="corpus_file_path"/> is null or empty.</exception>
/// <exception cref="ApplicationException">The corpus file does not exist.</exception>
public void BuildIndex(string index_folder, string corpus_file_path, CorpusFormat.CorpusFormatDecriptor corpus_format, ICancelIndexation cancellation, IShowIndexationProgress progress)
{
    if (string.IsNullOrEmpty(index_folder))
    {
        throw new ArgumentException("Index folder path must not be null or empty.", nameof(index_folder));
    }

    if (string.IsNullOrEmpty(corpus_file_path))
    {
        throw new ArgumentException("Corpus file path must not be null or empty.", nameof(corpus_file_path));
    }

    // TODO: take the corpus_format parameters into account, probably via a factory.

    if (!System.IO.File.Exists(corpus_file_path))
    {
        throw new ApplicationException($"File {corpus_file_path} does not exists");
    }

    // Clear the index folder of anything left by a previous indexing run.
    if (System.IO.Directory.Exists(index_folder))
    {
        System.IO.Directory.Delete(index_folder, true);
    }

    // To estimate progress on a large file we need its line count up front,
    // so count the lines the straightforward way.
    // TODO: as an optimization, read bytes in blocks and look for \n.
    int total_lines = 0;
    using (System.IO.StreamReader rdr0 = new System.IO.StreamReader(corpus_file_path))
    {
        while (rdr0.ReadLine() != null)
        {
            total_lines++;
        }
    }

    const int progress_step = 100000;

    // Alternative analyzer left by the original author: Lucene.Net.Analysis.Standard.StandardAnalyzer.
    using (Lucene.Net.Store.Directory luceneIndexDirectory = Lucene.Net.Store.FSDirectory.Open(index_folder))
    using (Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Ru.RussianAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT))
    using (Lucene.Net.Index.IndexWriter writer = new Lucene.Net.Index.IndexWriter(luceneIndexDirectory, analyzer, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED))
    {
        CorpusFileReader rdr = new CorpusFileReader();
        int line_counter = 0;

        foreach (var sampleDataFileRow in rdr.ReadAllLines(corpus_file_path))
        {
            line_counter++;

            if (cancellation != null && cancellation.GetCancellationPending())
            {
                cancellation.Cancelled = true;
                break;
            }

            if (progress != null && (line_counter % progress_step) == 0)
            {
                // The total was counted in a separate pass and may disagree with the rows the
                // corpus reader yields; clamp the denominator so the result stays within 0..100
                // and never divides by zero.
                int denominator = Math.Max(total_lines, line_counter);
                int percentage = (int)Math.Round((100.0 * line_counter) / denominator);
                progress.ShowProgress(percentage);
            }

            Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();

            // The line number is stored for retrieval only (Index.NO); the text is stored and analyzed.
            doc.Add(new Lucene.Net.Documents.Field(IndexModel.LineNumber, sampleDataFileRow.LineNumber.ToString(), Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.NO));
            doc.Add(new Lucene.Net.Documents.Field(IndexModel.LineText, sampleDataFileRow.LineText, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));
            writer.AddDocument(doc);
        }

        writer.Optimize();
        writer.Flush(true, true, true);
    }
}