コード例 #1
0
        //END
        /// <summary>
        /// Writes <paramref name="file"/> to the Lucene index, replacing any document
        /// already indexed under the same <c>Id</c>.
        /// </summary>
        /// <param name="file">
        /// Item to index. <c>Id</c> is stored as an exact-match key; the text properties
        /// are stored and analyzed with the Russian analyzer. A null text property is
        /// indexed as an empty string.
        /// </param>
        public void BuildIndex(FileToIndex file)
        {
            // Build the complete document before touching the index. If anything here
            // throws, no delete has been queued yet, so disposing the writer cannot
            // commit a deletion that has no matching re-add.
            var id = file.Id.ToString();
            var doc = new Document();

            // The analyzer splits strings into words, so the Id is indexed un-analyzed
            // to keep it a single term that can be matched exactly.
            doc.Add(new Field("Id", id, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("Title", file.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Description", file.Description ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Authors", file.Authors ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Text", file.Text ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Hashtags", file.Hashtags ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Discipline", file.Discipline ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));

            using (var analyzer = new Lucene.Net.Analysis.Ru.RussianAnalyzer(Version.LUCENE_30))
            using (var idxw = new IndexWriter(_directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                // UpdateDocument removes every document containing the Id term and adds
                // the new one as a single operation.
                idxw.UpdateDocument(new Term("Id", id), doc);
                idxw.Commit();

                // Merge the index segments; the writer is closed when the using block ends.
                idxw.Optimize();
            }
        }
コード例 #2
0
 //partially taken from http://www.codeproject.com/Articles/320219/Lucene-Net-ultra-fast-search-for-MVC-or-WebForms
 //START
 /// <summary>
 /// Searches the index for <paramref name="keywords"/> and returns the converted hits.
 /// </summary>
 /// <param name="keywords">Query text. Null, empty, or wildcard-only (<c>*</c>/<c>?</c>) input yields no results.</param>
 /// <param name="count">Receives the total number of hits reported by the searcher (0 when the query is rejected).</param>
 /// <param name="field">
 /// Name of the single field to search. When empty, the query runs against
 /// Title, Authors, Description, Text and Discipline, ordered by relevance.
 /// </param>
 /// <returns>The top hits (at most 100 are requested) converted by <c>_convertDocs</c>.</returns>
 private IEnumerable<FileToIndex> _search(string keywords, out int count, string field = "")
 {
     // Reject null up front (Replace would throw on it), then reject queries that
     // contain nothing but wildcard characters.
     if (string.IsNullOrEmpty(keywords) || string.IsNullOrEmpty(keywords.Replace("*", "").Replace("?", "")))
     {
         count = 0;
         return new List<FileToIndex>();
     }

     // Both resources are released by the using statements; no explicit Dispose call is needed.
     // NOTE(review): the searcher is disposed as soon as this method returns, so
     // _convertDocs must read the documents eagerly rather than lazily -- confirm.
     using (var searcher = new IndexSearcher(_directory))
     using (var analyzer = new Lucene.Net.Analysis.Ru.RussianAnalyzer(Version.LUCENE_30))
     {
         if (!string.IsNullOrEmpty(field))
         {
             // Search restricted to the requested field.
             var parser = new QueryParser(Version.LUCENE_30, field, analyzer);
             var queryForField = parseQuery(keywords, parser);
             var docs = searcher.Search(queryForField, 100);
             count = docs.TotalHits;
             return _convertDocs(docs.ScoreDocs, searcher);
         }
         else
         {
             // No field given: search all text fields at once, sorted by relevance.
             var parser = new MultiFieldQueryParser
                              (Version.LUCENE_30, new[] { "Title", "Authors", "Description", "Text", "Discipline" }, analyzer);
             var queryForField = parseQuery(keywords, parser);
             var docs = searcher.Search(queryForField, null, 100, Sort.RELEVANCE);
             count = docs.TotalHits;
             return _convertDocs(docs.ScoreDocs, searcher);
         }
     }
 }
コード例 #3
0
        //END

        /// <summary>
        /// Writes <paramref name="file"/> to the Lucene index, replacing any document
        /// already indexed under the same <c>Id</c>.
        /// </summary>
        /// <param name="file">
        /// Item to index. <c>Id</c> is stored as an exact-match key; the text properties
        /// are stored and analyzed with the Russian analyzer. A null text property is
        /// indexed as an empty string.
        /// </param>
        public void BuildIndex(FileToIndex file)
        {
            // Build the complete document before touching the index. If anything here
            // throws, no delete has been queued yet, so disposing the writer cannot
            // commit a deletion that has no matching re-add.
            var id = file.Id.ToString();
            var doc = new Document();

            // The analyzer splits strings into words, so the Id is indexed un-analyzed
            // to keep it a single term that can be matched exactly.
            doc.Add(new Field("Id", id, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("Title", file.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Description", file.Description ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Authors", file.Authors ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Text", file.Text ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Hashtags", file.Hashtags ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Discipline", file.Discipline ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));

            using (var analyzer = new Lucene.Net.Analysis.Ru.RussianAnalyzer(Version.LUCENE_30))
            using (var idxw = new IndexWriter(_directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                // UpdateDocument removes every document containing the Id term and adds
                // the new one as a single operation.
                idxw.UpdateDocument(new Term("Id", id), doc);
                idxw.Commit();

                // Merge the index segments; the writer is closed when the using block ends.
                idxw.Optimize();
            }
        }
コード例 #4
0
        /// <summary>
        /// Runs <paramref name="query_str"/> against the <c>IndexModel.LineText</c> field of the
        /// index stored in <c>index_folder</c> and returns the matching lines.
        /// </summary>
        /// <param name="query_str">Query in Lucene query-parser syntax. Null or blank input returns no hits.</param>
        /// <returns>
        /// One <c>SampleHit</c> per hit (at most <c>max_search_hits</c> are requested), carrying the stored
        /// line number, the stored line text, and an HTML fragment of that text in which the
        /// query matches are wrapped in a yellow-background <c>span</c>.
        /// </returns>
        public IEnumerable<SampleHit> Search(string query_str)
        {
            List<SampleHit> result_hits = new List<SampleHit>();

            // Nothing to search for: skip opening the index altogether.
            if (string.IsNullOrWhiteSpace(query_str))
            {
                return result_hits;
            }

            using (Lucene.Net.Store.Directory luceneIndexDirectory = Lucene.Net.Store.FSDirectory.Open(index_folder))
            using (Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Ru.RussianAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT))
            using (IndexSearcher searcher = new IndexSearcher(luceneIndexDirectory))
            {
                QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_CURRENT, IndexModel.LineText, analyzer);
                Query query = parser.Parse(query_str);

                TopDocs hits = searcher.Search(query, max_search_hits);

                // Highlighter setup: matched terms are wrapped in a yellow span and the
                // text is cut into fragments by a SimpleFragmenter(200).
                var formatter = new Lucene.Net.Search.Highlight.SimpleHTMLFormatter("<span style=\"background:yellow;\">", "</span>");
                var fragmenter = new Lucene.Net.Search.Highlight.SimpleFragmenter(200);
                var scorer = new Lucene.Net.Search.Highlight.QueryScorer(query);
                var highlighter = new Lucene.Net.Search.Highlight.Highlighter(formatter, scorer);
                highlighter.TextFragmenter = fragmenter;

                foreach (ScoreDoc hit in hits.ScoreDocs)
                {
                    Document doc = searcher.Doc(hit.Doc);

                    // Read each stored value once and reuse it below.
                    string lineNumber = doc.Get(IndexModel.LineNumber);
                    string lineText = doc.Get(IndexModel.LineText);

                    // Re-tokenize the stored text so the highlighter can locate the query terms in it.
                    Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(lineText));
                    string highlightedText = highlighter.GetBestFragments(stream, lineText, 1, "...");

                    result_hits.Add(new SampleHit
                    {
                        line_number = lineNumber,
                        sample_text = lineText,
                        html_highlighting = highlightedText
                    });
                }
            }

            return result_hits;
        }
コード例 #5
0
 //partially taken from http://www.codeproject.com/Articles/320219/Lucene-Net-ultra-fast-search-for-MVC-or-WebForms
 //START
 /// <summary>
 /// Searches the index for <paramref name="keywords"/> and returns the converted hits.
 /// </summary>
 /// <param name="keywords">Query text. Null, empty, or wildcard-only (<c>*</c>/<c>?</c>) input yields no results.</param>
 /// <param name="count">Receives the total number of hits reported by the searcher (0 when the query is rejected).</param>
 /// <param name="field">
 /// Name of the single field to search. When empty, the query runs against
 /// Title, Authors, Description, Text and Discipline, ordered by relevance.
 /// </param>
 /// <returns>The top hits (at most 100 are requested) converted by <c>_convertDocs</c>.</returns>
 private IEnumerable<FileToIndex> _search(string keywords, out int count, string field = "")
 {
     // Reject null up front (Replace would throw on it), then reject queries that
     // contain nothing but wildcard characters.
     if (string.IsNullOrEmpty(keywords) || string.IsNullOrEmpty(keywords.Replace("*", "").Replace("?", "")))
     {
         count = 0;
         return new List<FileToIndex>();
     }

     // Both resources are released by the using statements; no explicit Dispose call is needed.
     // NOTE(review): the searcher is disposed as soon as this method returns, so
     // _convertDocs must read the documents eagerly rather than lazily -- confirm.
     using (var searcher = new IndexSearcher(_directory))
     using (var analyzer = new Lucene.Net.Analysis.Ru.RussianAnalyzer(Version.LUCENE_30))
     {
         if (!string.IsNullOrEmpty(field))
         {
             // Search restricted to the requested field.
             var parser = new QueryParser(Version.LUCENE_30, field, analyzer);
             var queryForField = parseQuery(keywords, parser);
             var docs = searcher.Search(queryForField, 100);
             count = docs.TotalHits;
             return _convertDocs(docs.ScoreDocs, searcher);
         }
         else
         {
             // No field given: search all text fields at once, sorted by relevance.
             var parser = new MultiFieldQueryParser
                 (Version.LUCENE_30, new[] { "Title", "Authors", "Description", "Text", "Discipline" }, analyzer);
             var queryForField = parseQuery(keywords, parser);
             var docs = searcher.Search(queryForField, null, 100, Sort.RELEVANCE);
             count = docs.TotalHits;
             return _convertDocs(docs.ScoreDocs, searcher);
         }
     }
 }
コード例 #6
0
ファイル: CorpusIndexer.cs プロジェクト: Koziev/CorpusSearch
        /// <summary>
        /// Rebuilds the Lucene index in <paramref name="index_folder"/> from the corpus file,
        /// adding one document (line number + line text) per row returned by <c>CorpusFileReader</c>.
        /// Any existing content of <paramref name="index_folder"/> is deleted first.
        /// </summary>
        /// <param name="index_folder">Folder that will hold the index; must be non-empty.</param>
        /// <param name="corpus_file_path">Path of the corpus text file; must exist.</param>
        /// <param name="corpus_format">Corpus format descriptor; currently not used by this method.</param>
        /// <param name="cancellation">Polled once per row; when it reports a pending cancellation, <c>Cancelled</c> is set and indexing stops. May be null.</param>
        /// <param name="progress">Receives a percentage every 100000 rows. May be null.</param>
        /// <exception cref="ArgumentException"><paramref name="index_folder"/> or <paramref name="corpus_file_path"/> is null or empty.</exception>
        /// <exception cref="ApplicationException">The corpus file does not exist.</exception>
        public void BuildIndex(string index_folder,
                               string corpus_file_path,
                               CorpusFormat.CorpusFormatDecriptor corpus_format,
                               ICancelIndexation cancellation, IShowIndexationProgress progress)
        {
            if (string.IsNullOrEmpty(index_folder))
            {
                throw new ArgumentException("Index folder must be specified.", nameof(index_folder));
            }

            if (string.IsNullOrEmpty(corpus_file_path))
            {
                throw new ArgumentException("Corpus file path must be specified.", nameof(corpus_file_path));
            }

            // TODO: take the corpus_format parameters into account, probably through a factory.

            if (!System.IO.File.Exists(corpus_file_path))
            {
                throw new ApplicationException($"File {corpus_file_path} does not exists");
            }

            // Clear the index folder of whatever a previous indexing run left behind.
            if (System.IO.Directory.Exists(index_folder))
            {
                System.IO.Directory.Delete(index_folder, true);
            }

            // Progress is reported as a percentage, so the total line count is needed up front.
            int total_lines = CountCorpusLines(corpus_file_path);

            // Number of rows between two progress notifications.
            const int progress_interval = 100000;

            using (Lucene.Net.Store.Directory luceneIndexDirectory = Lucene.Net.Store.FSDirectory.Open(index_folder))
            using (Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Ru.RussianAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT))
            using (Lucene.Net.Index.IndexWriter writer = new Lucene.Net.Index.IndexWriter(luceneIndexDirectory, analyzer, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED))
            {
                CorpusFileReader rdr = new CorpusFileReader();

                int line_counter = 0;
                foreach (var sampleDataFileRow in rdr.ReadAllLines(corpus_file_path))
                {
                    line_counter++;

                    if (cancellation != null && cancellation.GetCancellationPending())
                    {
                        cancellation.Cancelled = true;
                        break;
                    }

                    if (progress != null && total_lines > 0 && (line_counter % progress_interval) == 0)
                    {
                        // Clamp to 100 in case the corpus reader yields more rows than the raw line count.
                        int percentage = Math.Min(100, (int)Math.Round((100.0 * line_counter) / total_lines));
                        progress.ShowProgress(percentage);
                    }

                    Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();

                    // The line number is stored for retrieval only; it is not searchable.
                    doc.Add(new Lucene.Net.Documents.Field(IndexModel.LineNumber,
                                                           sampleDataFileRow.LineNumber.ToString(),
                                                           Lucene.Net.Documents.Field.Store.YES,
                                                           Lucene.Net.Documents.Field.Index.NO));

                    // The line text is both stored and analyzed, so it can be searched and shown.
                    doc.Add(new Lucene.Net.Documents.Field(IndexModel.LineText,
                                                           sampleDataFileRow.LineText,
                                                           Lucene.Net.Documents.Field.Store.YES,
                                                           Lucene.Net.Documents.Field.Index.ANALYZED));

                    writer.AddDocument(doc);
                }

                // Runs even after a cancellation, so the rows indexed so far are kept.
                writer.Optimize();
                writer.Flush(true, true, true);
            }
        }

        /// <summary>Counts the lines of a text file by reading it through once.</summary>
        // TODO: could be optimized by reading bytes in blocks and scanning for \n.
        private static int CountCorpusLines(string path)
        {
            int count = 0;

            using (System.IO.StreamReader reader = new System.IO.StreamReader(path))
            {
                while (reader.ReadLine() != null)
                {
                    count++;
                }
            }

            return count;
        }