/// <summary>
/// Runs a PhraseQuery over <paramref name="field"/> built from the space-separated
/// words in <paramref name="keyword"/>, reporting progress and timing via SetOutput.
/// </summary>
/// <param name="keyword">Space-separated words forming the phrase.</param>
/// <param name="field">Index field to search.</param>
/// <param name="slop">Maximum allowed word movement within the phrase.</param>
/// <returns>The top hits, or null when the search fails.</returns>
private TopDocs PhraseQuery(string keyword, string field, int slop)
{
    string[] words = keyword.Trim().Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    TopDocs docs = null;
    int n = 10; // maximum number of hits to return
    SetOutput(string.Format("正在检索关键字:{0}", keyword));
    try
    {
        PhraseQuery query = new PhraseQuery();
        query.SetSlop(slop);
        foreach (string word in words)
        {
            Term t = new Term(field, word);
            query.Add(t);
        }
        Stopwatch watch = new Stopwatch();
        watch.Start();
        docs = searcher.Search(query, (Filter)null, n);
        watch.Stop();
        // BUG FIX: the original declared this local as Java's StringBuffer, which does
        // not exist in C# and cannot be assigned from a string; a plain string is correct.
        string sb = "PhraseQuery搜索完成,共用时:" + watch.Elapsed.Hours + "时 " + watch.Elapsed.Minutes + "分 "
                    + watch.Elapsed.Seconds + "秒 " + watch.Elapsed.Milliseconds + "毫秒";
        SetOutput(sb);
    }
    catch (Exception ex)
    {
        // Surface the failure to the UI and signal it with a null result.
        SetOutput(ex.Message);
        docs = null;
    }
    return docs;
}
/// <summary>
/// Adds a standard type clause to this instance.
/// </summary>
/// <param name="term">Term to add to this query.</param>
/// <param name="occurrence">Defines how the term is added to this query.</param>
/// <param name="slop">The amount of allowed slop in a phrase query.</param>
/// <remarks>
/// Slop is the amount of movement each word is allowed in a non-exact phrase query.
/// For instance if you search for "Adobe Systems Incorporated" and the slop is set to 0 then
/// only results with that term is allowed. If you set the slop to 2 then two movements can be
/// made, max, for each word. In the same example with slop set to 2 results would be returned
/// for "Adobe Systems Incorporated", "Adobe Incorporated Systems", "Systems Adobe Incorporated",
/// and "Systems Incorporated Adobe".
/// </remarks>
public void AddBooleanClause(SearchTerm term, ClauseOccurrence occurrence, int slop)
{
    if (term == null)
    {
        throw new ArgumentNullException("term", "term cannot be null");
    }

    IncrementTotalClauses(1);

    // Build either a phrase clause or a single-term clause, then add it once.
    Query clause;
    if (term.IsPhrase)
    {
        PhraseQuery phrase = new PhraseQuery();
        phrase.Add(term.GetLuceneTerm());
        phrase.SetSlop(slop);
        phrase.SetBoost(term.Boost);
        clause = phrase;
    }
    else
    {
        TermQuery single = new TermQuery(term.GetLuceneTerm());
        single.SetBoost(term.Boost);
        clause = single;
    }
    this.luceneQuery.Add(clause, TypeConverter.ConvertToLuceneClauseOccurrence(occurrence));
}
/*
 * Check if src and dest have overlapped part and if it is, create PhraseQueries and add expandQueries.
 *
 * ex1) src="a b", dest="c d"       => no overlap
 * ex2) src="a b", dest="a b c"     => no overlap
 * ex3) src="a b", dest="b c"       => overlap; expandQueries={"a b c"}
 * ex4) src="a b c", dest="b c d"   => overlap; expandQueries={"a b c d"}
 * ex5) src="a b c", dest="b c"     => no overlap
 * ex6) src="a b c", dest="b"       => no overlap
 * ex7) src="a a a a", dest="a a a" => overlap;
 *                                     expandQueries={"a a a a a","a a a a a a"}
 * ex8) src="a b c d", dest="b c"   => no overlap
 */
private void CheckOverlap(Dictionary<Query,Query> expandQueries, Term[] src, Term[] dest, int slop, float boost)
{
    // beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms
    // is guaranteed in flatten() method (if PhraseQuery has only one term, flatten()
    // converts PhraseQuery to TermQuery)
    for (int i = 1; i < src.Length; i++)
    {
        // Does src's suffix starting at i match, term-for-term, a prefix of dest?
        bool overlap = true;
        for (int j = i; j < src.Length; j++)
        {
            if ((j - i) < dest.Length && !src[j].Text().Equals(dest[j - i].Text()))
            {
                overlap = false;
                break;
            }
        }
        // Only expand when dest extends beyond the end of src (rules out ex5/ex6 above,
        // where dest is fully contained in src's suffix and there is nothing to add).
        if (overlap && src.Length - i < dest.Length)
        {
            // Build the merged phrase: every term of src followed by the tail of dest
            // that reaches past src. All terms use src's field.
            PhraseQuery pq = new PhraseQuery();
            foreach (Term srcTerm in src)
                pq.Add(srcTerm);
            for (int k = src.Length - i; k < dest.Length; k++)
            {
                pq.Add(new Term(src[0].Field(), dest[k].Text()));
            }
            pq.SetSlop(slop);
            pq.SetBoost(boost);
            // The dictionary doubles as a set (query keyed by itself) to de-duplicate
            // expansions — ex7 can generate the same phrase via different i values.
            if (!expandQueries.ContainsKey(pq))
                expandQueries.Add(pq,pq);
        }
    }
}
/// <summary>
/// Searches the book index for <paramref name="searchWords"/> and records the
/// query in the search-details table.
/// </summary>
/// <param name="searchWords">Raw user input; tokenized with the PanGu analyzer.</param>
/// <returns>Matching books with highlighted content snippets.</returns>
private List<BookSearchModel> SearchBookContent(string searchWords)
{
    List<BookSearchModel> bookSearchModelList = new List<BookSearchModel>();
    // 1. Tokenize the user's search text.
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(searchWords));
    Lucene.Net.Analysis.Token token = null;
    string indexPath = @"D:\lucenedir"; // NOTE(review): hard-coded index location — consider moving to config
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    try
    {
        // Build a phrase query from the analyzer tokens.
        PhraseQuery query = new PhraseQuery();
        while ((token = tokenStream.Next()) != null)
        {
            query.Add(new Term("body", token.TermText()));
        }
        // Terms more than 100 positions apart are treated as unrelated.
        query.SetSlop(100);
        // TopScoreDocCollector holds the hits; TopDocs(300, 20) would page instead.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        // Hits carry only document ids; full documents are loaded on demand below,
        // which keeps memory pressure low.
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        for (int i = 0; i < docs.Length; i++)
        {
            int docId = docs[i].doc;              // Lucene-internal document id
            Document doc = searcher.Doc(docId);   // load the stored fields for this hit
            BookSearchModel searchModel = new BookSearchModel();
            searchModel.Id = int.Parse(doc.Get("ID"));
            searchModel.Title = doc.Get("title");
            searchModel.ContenDescription = SearchWordHighlight.CreateHightLight(searchWords, doc.Get("body"));
            bookSearchModelList.Add(searchModel);
        }
    }
    finally
    {
        // FIX: the original leaked the searcher/reader/directory on every call.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
    // Record the search term for the suggestion dictionary.
    SearchDetails entity = new SearchDetails()
    {
        Id = Guid.NewGuid(),
        KeyWords = searchWords,
        SearchDateTime = DateTime.Now
    };
    SearchDetailsService.AddEntity(entity);
    return bookSearchModelList;
}
/// <summary>
/// Searches an existing index for a keyword within one field.
/// </summary>
/// <param name="FieldTitle">Field to search; it must already exist in the index.</param>
/// <param name="keyword">Key word, term or sentence to search for.</param>
/// <param name="directoryinfo">Directory that holds the index.</param>
/// <param name="costTime">Time the search took.</param>
/// <param name="CountNum">Number of hits found.</param>
/// <param name="isSuccess">Whether the search succeeded.</param>
/// <returns>The matching documents, or null when the path or field is invalid.</returns>
public static IEnumerable<Document> Query(string FieldTitle, string keyword, DirectoryInfo directoryinfo, out TimeSpan costTime, out int CountNum, out bool isSuccess)
{
    isSuccess = false;
    costTime = TimeSpan.Zero;
    CountNum = 0;
    string indexPath = string.Empty;
    if (directoryinfo.Exists)
    {
        indexPath = directoryinfo.FullName;
    }
    else
    {
        ShowMessageBox(new Page(), "索引路径不正确");
        return null;
    }
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    try
    {
        // Only search when the requested field actually exists in the index.
        if (!reader.GetFieldNames(IndexReader.FieldOption.ALL).Contains(FieldTitle))
        {
            return null;
        }
        IndexSearcher searcher = new IndexSearcher(reader);
        PhraseQuery query = new PhraseQuery();
        string[] KeyWords = WordSegmentation(keyword);
        foreach (string word in KeyWords)
        {
            query.Add(new Term(FieldTitle, word));
        }
        query.SetSlop(100); // terms further apart than this are considered irrelevant
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        // FIX: Stopwatch.StartNew() already starts the watch; the original called
        // Start() a second time, which was redundant.
        Stopwatch stopwatch = Stopwatch.StartNew();
        searcher.Search(query, null, collector);
        stopwatch.Stop();
        costTime = stopwatch.Elapsed;
        CountNum = collector.GetTotalHits();
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        List<Document> list = new List<Document>();
        for (int i = 0; i < docs.Length; i++)
        {
            // Hits carry only the document id; load each Document on demand to
            // keep memory usage low.
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);
            list.Add(doc);
        }
        isSuccess = true;
        return list.ToArray();
    }
    finally
    {
        // FIX: the original never closed the reader/directory (leaked on every call,
        // including the early return when the field is missing).
        reader.Close();
        directory.Close();
    }
}
/// <summary>
/// Adds phrase clauses built from multiple keywords.
/// </summary>
/// <param name="fieldName">Name of the field to search.</param>
/// <param name="phrases">Phrases to search for.</param>
/// <param name="boostLevel">Optional boost level.</param>
/// <param name="asFilter">Whether the combined clause acts as a filter.</param>
/// <returns>This LuceneSearchBuilder, for chaining.</returns>
public LuceneSearchBuilder WithPhrases(string fieldName, IEnumerable<string> phrases, BoostLevel?boostLevel = null, bool asFilter = false)
{
    BooleanQuery query = new BooleanQuery();
    foreach (string rawPhrase in phrases)
    {
        string scrubbed = ClauseScrubber.LuceneKeywordsScrub(rawPhrase);
        if (string.IsNullOrEmpty(scrubbed))
        {
            continue;
        }

        Query clause;
        if (scrubbed.Length == 1)
        {
            // A single character is matched as a prefix rather than a phrase.
            clause = new PrefixQuery(new Term(fieldName, scrubbed));
        }
        else
        {
            PhraseQuery phraseQuery = new PhraseQuery();
            foreach (string segment in ClauseScrubber.SegmentForPhraseQuery(scrubbed))
            {
                phraseQuery.Add(new Term(fieldName, segment));
            }
            phraseQuery.SetSlop(PhraseQuerySlop);
            clause = phraseQuery;
        }

        if (boostLevel.HasValue)
        {
            SetBoost(clause, boostLevel.Value);
        }
        query.Add(clause, BooleanClause.Occur.SHOULD);
    }

    // Any one phrase may match (SHOULD), but the group as a whole is required (MUST).
    BooleanClause combined = new BooleanClause(query, BooleanClause.Occur.MUST);
    if (asFilter)
    {
        filters.Add(combined);
    }
    else
    {
        clauses.Add(combined);
    }
    return this;
}
/// <summary>
/// Searches the index over both Title and Content for the "txtSearch" request
/// parameter, logging each keyword to the search-log table.
/// </summary>
/// <returns>Matching entries with highlighted title and content.</returns>
private List<ViewModelContent> SearchWants()
{
    string indexPath = @"G:\lucenedir"; // NOTE(review): hard-coded index location
    // Tokenize the user's search input.
    List<string> list = Common.WebCommon.PanGuSplitWord(Request["txtSearch"]);
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    List<ViewModelContent> viewModelList = new List<ViewModelContent>();
    try
    {
        // Title clause; each keyword is also logged to the search-details table.
        PhraseQuery queryTitle = new PhraseQuery();
        foreach (string word in list)
        {
            queryTitle.Add(new Term("Title", word));
            T_SearchLogsService.AddEntity(new T_SearchLogs { Id = Guid.NewGuid(), Word = word, SearchDate = DateTime.Now });
        }
        // Terms more than 100 positions apart are treated as unrelated.
        queryTitle.SetSlop(100);

        // Content clause over the same tokens.
        PhraseQuery queryContent = new PhraseQuery();
        foreach (string word in list)
        {
            queryContent.Add(new Term("Content", word));
        }
        queryContent.SetSlop(100);

        // Combined condition: Title OR Content may match.
        BooleanQuery query = new BooleanQuery();
        query.Add(queryTitle, BooleanClause.Occur.SHOULD);
        query.Add(queryContent, BooleanClause.Occur.SHOULD);

        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        // Hits carry only ids; documents are loaded on demand in the loop below.
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        for (int i = 0; i < docs.Length; i++)
        {
            ViewModelContent viewModel = new ViewModelContent();
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);
            viewModel.Id = Convert.ToInt32(doc.Get("Id"));
            viewModel.Content = Common.WebCommon.CreateHightLight(Request["txtSearch"], doc.Get("Content"));
            viewModel.Title = Common.WebCommon.CreateHightLight(Request["txtSearch"], doc.Get("Title"));
            viewModelList.Add(viewModel);
        }
    }
    finally
    {
        // FIX: the original leaked the searcher/reader/directory on every request.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
    return viewModelList;
}
/// <summary>
/// Performs the search for the "kw" request parameter and renders the Index view.
/// </summary>
/// <returns>The Index view with the matching books in ViewBag.books.</returns>
public ActionResult Search()
{
    string kw = Request["kw"];                        // user's search input
    string indexPath = Server.MapPath("~/lucenedir"); // where to search
    // Split the input into tokens with the PanGu analyzer.
    List<string> kws = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(kw));
    Lucene.Net.Analysis.Token token = null;
    while ((token = tokenStream.Next()) != null)
    {
        kws.Add(token.TermText());
    }
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    List<BookVieModel> bookList = new List<BookVieModel>();
    try
    {
        // PhraseQuery targets a single column; multi-column search needs BooleanQuery.
        PhraseQuery query = new PhraseQuery();
        foreach (var word in kws)
        {
            query.Add(new Term("content", word));
        }
        // Terms more than 100 positions apart are treated as unrelated.
        query.SetSlop(100);
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        for (int i = 0; i < docs.Length; i++)
        {
            // Hits carry only ids; fetch each document on demand.
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);
            BookVieModel model = new BookVieModel();
            // Field names must match those used when the index was built.
            model.Id = Convert.ToInt32(doc.Get("Id"));
            model.Title = CreateHightLight(kw, doc.Get("title"));     // highlight matches
            model.Content = CreateHightLight(kw, doc.Get("content"));
            bookList.Add(model);
        }
    }
    finally
    {
        // FIX: the original leaked the searcher/reader/directory on every request.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
    ViewBag.books = bookList;
    ViewBag.kw = kw;
    return View("Index");
}
/// <summary>
/// Pages through index hits for <paramref name="searchKey"/> and materializes each hit
/// into a <typeparamref name="T"/> by copying stored fields onto same-named properties.
/// </summary>
/// <param name="searchKey">User keyword; tokenized via SplitWords.</param>
/// <param name="pageIndex">1-based page number.</param>
/// <param name="pageSize">Hits per page.</param>
/// <param name="totalCount">Total number of hits.</param>
/// <param name="orderByFiled">Field to sort by.</param>
/// <param name="isDesc">Whether the sort is descending.</param>
/// <param name="needSearchField">Field the phrase query runs against.</param>
/// <returns>The requested page of results.</returns>
public List <T> SearchFromIndexData <T>(string searchKey, int pageIndex, int pageSize, out int totalCount, string orderByFiled, bool isDesc, string needSearchField)
{
    List<T> list = new List<T>();
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(IndexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    try
    {
        // Tokenize the keyword; all terms must appear within the slop window.
        PhraseQuery query = new PhraseQuery();
        foreach (string word in SplitWords(searchKey))
        {
            query.Add(new Term(needSearchField, word));
        }
        query.SetSlop(100); // maximum distance allowed between the terms

        Sort sort = new Sort(new SortField(orderByFiled, SortField.DOC, isDesc));
        var docResult = searcher.Search(query, sort);
        totalCount = docResult.Length();

        int startIndex = Math.Max((pageIndex - 1) * pageSize, 0);
        // BUG FIX: the original computed endIndex as startIndex + pageSize (or
        // startIndex + totalCount when totalCount < pageSize), which can run past
        // the end of the result set on a partial last page and throw on Id(i).
        int endIndex = Math.Min(startIndex + pageSize, totalCount);

        for (int i = startIndex; i < endIndex; i++)
        {
            int docId = docResult.Id(i);        // Lucene-internal document id
            Document doc = searcher.Doc(docId); // load the full document on demand
            T t = Activator.CreateInstance<T>();
            Type type = typeof(T);
            var fieldList = type.GetProperties();
            for (int j = 0; j < fieldList.Length; j++)
            {
                // Copy each stored field into the property of the same name.
                var f = type.GetProperty(fieldList[j].Name);
                f.SetValue(t, Utilities.ConvertToT(f.PropertyType.Name, doc.Get(fieldList[j].Name)));
            }
            list.Add(t);
        }
    }
    finally
    {
        // FIX: the original leaked the searcher/reader/directory on every call.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
    return list;
}
/// <summary>
/// Search button handler: tokenizes the text box input with the PanGu analyzer,
/// searches the "body" field and lists the hits in listBox1.
/// </summary>
private void button3_Click(object sender, EventArgs e)
{
    string indexPath = @"C:\Users\杨ShineLon\Desktop\lucenedir"; // where to search
    string kw = textBox1.Text;
    // Split the user's input into search tokens.
    List<string> kws = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(kw));
    Net.Analysis.Token token = null;
    while ((token = tokenStream.Next()) != null)
    {
        kws.Add(token.TermText());
    }
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    try
    {
        PhraseQuery query = new PhraseQuery();
        foreach (var word in kws)
        {
            query.Add(new Term("body", word));
        }
        // Terms more than 100 positions apart are treated as unrelated.
        query.SetSlop(100);
        // TopScoreDocCollector holds the hits; TopDocs(300, 20) would page instead.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        this.listBox1.Items.Clear();
        for (int i = 0; i < docs.Length; i++)
        {
            // Hits expose only ids; load each document for display.
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);
            this.listBox1.Items.Add(doc.Get("number") + "\n");
            this.listBox1.Items.Add(doc.Get("body") + "\n");
            this.listBox1.Items.Add("-----------------------\n");
        }
    }
    finally
    {
        // FIX: the original never closed the searcher/reader/directory (leaked per click).
        searcher.Close();
        reader.Close();
        directory.Close();
    }
}
/// <summary>
/// Builds a PhraseQuery over <paramref name="field"/> from the given words,
/// applying the supplied boost and slop.
/// </summary>
/// <param name="boost">Boost to apply to the query.</param>
/// <param name="slop">Allowed slop between the phrase terms.</param>
/// <param name="field">Field all terms are created against.</param>
/// <param name="texts">Words of the phrase, in order.</param>
/// <returns>The configured PhraseQuery.</returns>
protected Query Pq(float boost, int slop, String field, params String[] texts)
{
    PhraseQuery phrase = new PhraseQuery();
    phrase.SetBoost(boost);
    phrase.SetSlop(slop);
    foreach (String text in texts)
    {
        phrase.Add(new Term(field, text));
    }
    return phrase;
}
// NOTE(review): a large commented-out CreateSearchIndex() method (index creation with
// IndexWriter/PanGuAnalyzer over articles, photos and videos) was removed from here;
// recover it from source control if index (re)building is ever needed again.

/// <summary>
/// Searches the Content field for the "txtSearch" request parameter and logs the query.
/// </summary>
/// <returns>Matching documents with highlighted content.</returns>
private List <SearchContent> SearchIndexContent()
{
    string indexPath = @"C:\lucenedir";
    string k = Request["txtSearch"];
    // Tokenize the user's input with the PanGu word segmenter.
    string[] kw = Common.WebCommon.GetPanGuWord(k).ToArray();
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    List<SearchContent> list = new List<SearchContent>();
    try
    {
        PhraseQuery query = new PhraseQuery();
        foreach (string word in kw)
        {
            query.Add(new Term("Content", word));
        }
        // Terms more than 100 positions apart are treated as unrelated.
        query.SetSlop(100);
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        // Hits carry only ids; documents are loaded on demand below.
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        for (int i = 0; i < docs.Length; i++)
        {
            SearchContent searchContent = new SearchContent();
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);
            searchContent.Id = Convert.ToInt32(doc.Get("Id"));
            searchContent.Flag = Convert.ToInt32(doc.Get("Flag"));
            searchContent.AddDate = Convert.ToDateTime(doc.Get("AddDate"));
            searchContent.Title = doc.Get("Title");
            searchContent.Content = Common.WebCommon.CreateHightLight(k, doc.Get("Content")); // highlight matches
            list.Add(searchContent);
        }
    }
    finally
    {
        // FIX: the original leaked the searcher/reader/directory on every request.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
    // Record the search term in the details table.
    SearchDetails searchDetail = new SearchDetails();
    searchDetail.Id = Guid.NewGuid();
    searchDetail.KeyWords = k;
    searchDetail.SearchDateTime = DateTime.Now;
    SearchDetailsService.AddEntity(searchDetail);
    return list;
}
/// <summary>
/// Gets search results for the given text and logs the query.
/// </summary>
/// <param name="msg">Search text entered by the user.</param>
/// <returns>Matching entries with highlighted abstracts.</returns>
private List <ViewModelContent> GetSearchResult(string msg)
{
    List<ViewModelContent> viewModelList = new List<ViewModelContent>();
    string indexPath = @"D:\stu\DotNet\练习\BlangenOA\lucenedir"; // NOTE(review): hard-coded path
    // Tokenize the user's input.
    List<string> keywordList = Common.WebCommon.PanguSplitWords(msg);
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    try
    {
        PhraseQuery query = new PhraseQuery();
        foreach (string keyword in keywordList)
        {
            // Match articles whose Abstract contains the keyword.
            query.Add(new Term("Abstract", keyword));
        }
        // Terms more than 100 positions apart are treated as unrelated.
        query.SetSlop(100);
        // TopScoreDocCollector holds the hits; TopDocs(300, 20) would page instead.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        for (int i = 0; i < docs.Length; i++)
        {
            // Hits carry only ids; load each Document on demand to limit memory use.
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);
            viewModelList.Add(new ViewModelContent()
            {
                ID = Convert.ToInt32(doc.Get("ID")),
                Title = doc.Get("Title"),
                // Highlight matches in the abstract.
                Abstract = Common.WebCommon.CreateHightLight(msg, doc.Get("Abstract"))
            });
        }
    }
    finally
    {
        // FIX: the original leaked the searcher/reader/directory on every call.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
    // Store the query in the search-details table.
    SearchDetailsBll.AddEntity(new SearchDetails() { Id = Guid.NewGuid(), KeyWords = msg, SearchDateTime = DateTime.Now });
    return viewModelList;
}
/// <summary>
/// Searches the contact index for <paramref name="keyword"/> and returns one page of hits.
/// </summary>
/// <param name="keyword">User keyword; tokenized via SplitHelper.SplitWords.</param>
/// <param name="startIndex">Index of the first hit to return.</param>
/// <param name="pageSize">Number of hits to return.</param>
/// <param name="totalCount">Total number of hits found.</param>
/// <returns>The requested page of contact results.</returns>
public static List <SearchResult> SeartchContact(string keyword, int startIndex, int pageSize, out int totalCount)
{
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(ContactIndexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    List<SearchResult> resultList = new List<SearchResult>();
    try
    {
        IEnumerable<string> keyList = SplitHelper.SplitWords(keyword);
        PhraseQuery queryContact = new PhraseQuery();
        foreach (var key in keyList)
        {
            queryContact.Add(new Term("contactInfo", key));
        }
        queryContact.SetSlop(100);
        BooleanQuery query = new BooleanQuery();
        query.Add(queryContact, BooleanClause.Occur.SHOULD); // SHOULD == OR
        // TopScoreDocCollector holds the hits.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        totalCount = collector.GetTotalHits();
        // Fetch only the requested slice [startIndex, startIndex + pageSize).
        ScoreDoc[] docs = collector.TopDocs(startIndex, pageSize).scoreDocs;
        for (int i = 0; i < docs.Length; i++)
        {
            // Hits contain only ids; load each Document for its stored fields.
            int docId = docs[i].doc;
            Lucene.Net.Documents.Document doc = searcher.Doc(docId);
            SearchResult result = new SearchResult();
            result.UserId = doc.Get("id");
            result.Name = doc.Get("name");
            result.Email = doc.Get("email");
            result.PhoneNumber = doc.Get("phone");
            result.Position = doc.Get("position");
            resultList.Add(result);
        }
    }
    finally
    {
        // FIX: the original leaked the searcher/reader/directory on every call.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
    return resultList;
}
/// <summary>
/// Searches the Content field for the "txtSearch" request parameter and logs the query.
/// </summary>
/// <returns>Matching entries with highlighted content.</returns>
public List <ViewModelContent> ShowSearchContent()
{
    string indexPath = @"E:\GitProgram\HLX.OA.WenApp\lucenedir"; // NOTE(review): hard-coded path
    // Tokenize the user's input.
    List<string> kw = HLX.OA.Common.WebCommon.PanGuSplitWord(Request["txtSearch"]);
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    List<ViewModelContent> vieModelList = new List<ViewModelContent>();
    try
    {
        PhraseQuery query = new PhraseQuery();
        for (int i = 0; i < kw.Count; i++)
        {
            query.Add(new Term("Content", kw[i])); // articles whose Content contains the token
        }
        // Terms more than 100 positions apart are treated as unrelated.
        query.SetSlop(100);
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        // Hits carry only ids; documents are loaded on demand below.
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        for (int i = 0; i < docs.Length; i++)
        {
            ViewModelContent viewModel = new ViewModelContent();
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);
            viewModel.Id = Convert.ToInt32(doc.Get("Id"));
            viewModel.Title = doc.Get("Title");
            // Highlight the search terms inside the content.
            viewModel.Content = Common.WebCommon.CreateHightLight(Request["txtSearch"], doc.Get("Content"));
            vieModelList.Add(viewModel);
        }
    }
    finally
    {
        // FIX: the original leaked the searcher/reader/directory on every request.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
    // Record the search term in the details table.
    Model.SearchDetials searchDetail = new SearchDetials();
    searchDetail.Id = Guid.NewGuid();
    searchDetail.KeyWords = Request["txtSearch"];
    searchDetail.SearchDateTime = DateTime.Now;
    SearchDetialsService.AddEntity(searchDetail);
    return vieModelList;
}
/// <summary>
/// Search button handler: tokenizes the text box input with the PanGu analyzer,
/// searches the "name" field and shows each hit in a message box.
/// </summary>
private void button1_Click_1(object sender, EventArgs e)
{
    string indexPath = "c:/index";
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    try
    {
        PhraseQuery query = new PhraseQuery();
        // Tokenize the user's input into search terms.
        List<String> list = new List<string>();
        Analyzer analyzer = new PanGuAnalyzer();
        TokenStream tokenStream = analyzer.TokenStream("", new StringReader(textBox1.Text));
        Token token = null;
        while ((token = tokenStream.Next()) != null)
        {
            list.Add(token.TermText());
        }
        for (int i = 0; i < list.Count; i++)
        {
            query.Add(new Term("name", list[i]));
        }
        // Terms more than 100 positions apart are treated as unrelated.
        query.SetSlop(100);
        TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
        searcher.Search(query, null, collector);
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        for (int i = 0; i < docs.Length; i++)
        {
            // Hits carry only the Lucene-internal id; load each Document on
            // demand to keep memory usage low.
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);
            string code = doc.Get("code");
            string name = doc.Get("name");
            MessageBox.Show("code:" + code + "name:" + name);
        }
    }
    finally
    {
        // FIX: the original never closed the searcher/reader/directory (leaked per click).
        searcher.Close();
        reader.Close();
        directory.Close();
    }
}
/// <summary>
/// Demo helper: builds a PhraseQuery from <paramref name="keyword"/> split on common
/// separators, prints the query expression and shows the search results.
/// </summary>
/// <param name="analyzer">Analyzer used when printing the query expression.</param>
/// <param name="field">Field the phrase terms are created against.</param>
/// <param name="keyword">Raw keyword text; split on spaces and comma variants.</param>
/// <param name="slop">Allowed slop between the phrase terms.</param>
public static void PhraseQueryTest(Analyzer analyzer, string field, string keyword, int slop)
{
    Console.WriteLine("====PhraseQuery====");
    char[] separators = new char[] { ' ', ',', ',', '、' };
    PhraseQuery query = new PhraseQuery();
    foreach (string part in keyword.Trim().Split(separators, StringSplitOptions.RemoveEmptyEntries))
    {
        query.Add(new Term(field, part));
    }
    query.SetSlop(slop);
    ShowQueryExpression(analyzer, query, keyword);
    SearchToShow(query);
    Console.WriteLine();
}
/// <summary>
/// Searches the user-info index for <paramref name="keyword"/> and returns matching user ids.
/// </summary>
/// <param name="keyword">User keyword; tokenized via SplitHelper.SplitWords.</param>
/// <returns>Ids of all matching users.</returns>
public static List <int> SeartchUser(string keyword)
{
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(UserInfoIndexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    List<int> resultList = new List<int>();
    try
    {
        IEnumerable<string> keyList = SplitHelper.SplitWords(keyword);
        PhraseQuery queryUserInfo = new PhraseQuery();
        foreach (var key in keyList)
        {
            queryUserInfo.Add(new Term("userInfo", key));
        }
        queryUserInfo.SetSlop(100);
        BooleanQuery query = new BooleanQuery();
        query.Add(queryUserInfo, BooleanClause.Occur.SHOULD);
        // TopScoreDocCollector holds the hits.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        int totalCount = collector.GetTotalHits();
        // Fetch every hit so callers can post-filter the ids.
        ScoreDoc[] docs = collector.TopDocs(0, totalCount).scoreDocs;
        for (int i = 0; i < docs.Length; i++)
        {
            // Hits carry only document ids (much cheaper than loading Documents
            // up front); fetch each Document only to read its "id" field.
            int docId = docs[i].doc;
            Lucene.Net.Documents.Document doc = searcher.Doc(docId);
            int uid = Convert.ToInt32(doc.Get("id"));
            resultList.Add(uid);
        }
    }
    finally
    {
        // FIX: the original leaked the searcher/reader/directory on every call.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
    return resultList;
}
/// <summary>
/// Match a multi-word phrase exactly. (This is like how QueryParser handles quoted phrases)
/// </summary>
/// <param name="field">Field the phrase terms are created against.</param>
/// <param name="phrase">Phrase text; tokenized with the builder's analyzer.</param>
/// <param name="slop">Allowed slop between the phrase terms (0 = exact).</param>
/// <returns>This builder, for chaining.</returns>
public QueryBuilder MatchPhrase(string field, string phrase, int slop = 0)
{
    // Blank input adds no clause at all.
    if (string.IsNullOrWhiteSpace(phrase))
    {
        return this;
    }

    var phraseQuery = new PhraseQuery();
    var tokens = _analyzer.TokenListFromString(phrase);
    foreach (var word in tokens)
    {
        phraseQuery.Add(new Term(field, word));
    }
    phraseQuery.SetSlop(slop);
    return AddSubQuery(phraseQuery);
}
/// <summary>
/// Searches the index for the "SearchKey" query-string value and binds the
/// matching books to Repeater1.
/// </summary>
private void SearchFromIndexData()
{
    string indexPath = Context.Server.MapPath("~/IndexData");
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    List<PZYM.Shop.Model.Books> bookResult = new List<PZYM.Shop.Model.Books>();
    try
    {
        // Tokenize the user's keyword; multiple phrase terms are combined.
        PhraseQuery query = new PhraseQuery();
        foreach (string word in Common.SplitContent.SplitWords(Request.QueryString["SearchKey"]))
        {
            query.Add(new Term("content", word));
        }
        query.SetSlop(100); // maximum distance allowed between the terms
        // TopScoreDocCollector holds the hits; TopDocs(20, 10) would page instead.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        for (int i = 0; i < docs.Length; i++)
        {
            int docId = docs[i].doc;            // Lucene-internal document id
            Document doc = searcher.Doc(docId); // load the stored fields for this hit
            PZYM.Shop.Model.Books book = new PZYM.Shop.Model.Books();
            book.Title = doc.Get("title");
            // Highlight the search key using the PanGu highlighter plugin.
            book.ContentDescription = Common.SplitContent.HightLight(Request.QueryString["SearchKey"], doc.Get("content"));
            book.Id = Convert.ToInt32(doc.Get("id"));
            bookResult.Add(book);
        }
    }
    finally
    {
        // FIX: the original leaked the searcher/reader/directory on every request.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
    Repeater1.DataSource = bookResult;
    Repeater1.DataBind();
}
/// <summary>
/// Runs a search for the keyword typed into txtKeyWords and binds the
/// highlighted results to rptSearchResult.
/// </summary>
protected void btnGetSearchResult_Click(object sender, EventArgs e)
{
    string keyword = txtKeyWords.Text;
    string indexPath = Context.Server.MapPath("~/Index"); // where the index documents are stored
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    try
    {
        // Roughly equivalent to: where contains("msg", keyword).
        // NOTE(review): the raw keyword is added as a single term (no word splitting),
        // unlike the sibling search methods — confirm this is intended.
        PhraseQuery query = new PhraseQuery();
        query.Add(new Term("msg", keyword));
        // Terms more than 100 positions apart (empirical value) are excluded:
        // at that distance the relevance is too low.
        query.SetSlop(100);

        // Container holding the top-ranked hits.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        // collector.GetTotalHits() is the total hit count; fetch all rows from 0.
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;

        IList<SearchResult> resultList = new List<SearchResult>();
        for (int i = 0; i < docs.Length; i++)
        {
            // The hit list only carries document ids because a full Document can be
            // memory-heavy (DataSet vs DataReader); do a second lookup per id.
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);
            SearchResult result = new SearchResult();
            result.Id = Convert.ToInt32(doc.Get("id"));
            result.Msg = HighlightHelper.HighLight(keyword, doc.Get("msg"));
            resultList.Add(result);
        }

        rptSearchResult.DataSource = resultList;
        rptSearchResult.DataBind();
    }
    finally
    {
        // Fix: release index resources that the original leaked on every click.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
}
/// <summary>
/// Searches the forum index for the "SearchKey" query-string keyword and
/// builds a list of matching forums with highlighted content.
/// </summary>
public void SearchFromIndexData()
{
    string indexPath = System.Web.HttpContext.Current.Server.MapPath("~/IndexData");
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    try
    {
        // Phrase query over the analyzed tokens of the user's keyword.
        PhraseQuery query = new PhraseQuery();
        foreach (string word in WitKeyDu.Site.Web.SplitContent.SplitWords(Request.QueryString["SearchKey"]))
        {
            query.Add(new Term("ForumContent", word)); // multiple terms are ANDed within the phrase
        }
        query.SetSlop(100); // maximum allowed distance between the keywords

        // Container holding the top-ranked hits.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        // All hits; TopDocs(20, 10) would return hits 20-30 (paging).
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;

        List<Forum> ForumResult = new List<Forum>();
        for (int i = 0; i < docs.Length; i++)
        {
            int docId = docs[i].doc;            // Lucene-internal document id
            Document doc = searcher.Doc(docId); // fetch the stored document by id
            Forum forum = new Forum();
            forum.ForumName = doc.Get("ForumName");
            // Keyword highlighting via the PanGu highlighter plugin.
            forum.ForumContent = WitKeyDu.Site.Web.SplitContent.HightLight(Request.QueryString["SearchKey"], doc.Get("ForumContent"));
            forum.ForumTypeID = Convert.ToInt32(doc.Get("ID"));
            ForumResult.Add(forum);
        }
        // NOTE(review): ForumResult is built but never returned or bound — confirm intended use.
    }
    finally
    {
        // Fix: release index resources that the original leaked.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
}
/// <summary>
/// Splits the search text with the PanGu analyzer, runs a phrase query over the
/// "Title" field and returns the matching view models with highlighted content.
/// </summary>
/// <param name="Request">Current request; used to read the raw "txtSearch" value for highlighting.</param>
/// <param name="msg">User-entered search text.</param>
/// <returns>List of matching view models.</returns>
public static List<ViewModelContent> ShowSearchContent(HttpRequestBase Request, string msg)
{
    string indexPath = ConfigurationManager.AppSettings["lucenedirPath"];
    List<string> list = Common.WebCommon.PanGuSplitWord(msg); // tokenize the user's search text
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    try
    {
        // One phrase term per token; terms are ANDed and order of Add calls does not matter.
        PhraseQuery query = new PhraseQuery();
        foreach (string word in list)
        {
            query.Add(new Term("Title", word));
        }
        // Words that occur too far apart are meaningless as one phrase
        // (e.g. "大学生" and "简历" separated by many words); 100 positions max.
        query.SetSlop(100);

        // Container holding the top-ranked hits.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        // GetTotalHits() = total hit count; TopDocs(300, 20) would page (hits 300-320).
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;

        List<Models.ViewModelContent> viewModelList = new List<Models.ViewModelContent>();
        for (int i = 0; i < docs.Length; i++)
        {
            // ScoreDoc[] only holds document ids, so Documents are not all pulled into
            // memory at once; details are fetched per id via searcher.Doc.
            Models.ViewModelContent viewModel = new Models.ViewModelContent();
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);
            viewModel.Id = Convert.ToInt32(doc.Get("Id"));
            viewModel.Title = doc.Get("Title");
            // Highlight the search keyword in the content.
            viewModel.Content = Common.WebCommon.CreateHightLight(Request["txtSearch"], doc.Get("Content"));
            viewModelList.Add(viewModel);
        }
        return viewModelList;
    }
    finally
    {
        // Fix: release index resources that the original leaked on every call.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
}
/// <summary>
/// Searches the news index for <paramref name="searchkey"/> over the TITLE field
/// and appends highlighted results to the modResult collection.
/// </summary>
/// <param name="searchkey">User-entered search text; tokenized before querying.</param>
private void SearchFromIndexData(string searchkey)
{
    string indexPath = Context.Server.MapPath("~/IndexData");
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    try
    {
        // Phrase query over the analyzed tokens of the keyword.
        PhraseQuery query = new PhraseQuery();
        foreach (string word in SplitContent.SplitWords(searchkey))
        {
            query.Add(new Term("TITLE", word)); // multiple terms are ANDed within the phrase
        }
        query.SetSlop(100); // maximum allowed distance between the keywords

        // Container holding the top-ranked hits.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        // All hits; TopDocs(20, 10) would return hits 20-30 (paging).
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;

        for (int i = 0; i < docs.Length; i++)
        {
            int docID = docs[i].doc;            // Lucene-internal document id
            Document doc = searcher.Doc(docID); // fetch the stored document by id
            SUC_NEWS mod = new SUC_NEWS();
            // Highlight via the PanGu plugin, falling back to the raw field
            // when highlighting yields nothing.
            mod.TITLE = SplitContent.HightLight(searchkey, doc.Get("TITLE"));
            mod.TITLE = string.IsNullOrEmpty(mod.TITLE) ? doc.Get("TITLE") : mod.TITLE;
            mod.CONTENT = SplitContent.HightLight(searchkey, doc.Get("CONTENT"));
            mod.CONTENT = string.IsNullOrEmpty(mod.CONTENT) ? doc.Get("CONTENT") : mod.CONTENT;
            mod.CONTENT = mod.CONTENT.Replace("<b>", "");
            mod.ID = Convert.ToInt32(doc.Get("ID"));
            mod.pandaWebUrl = doc.Get("URL");
            modResult.Add(mod);
        }
    }
    finally
    {
        // Fix: release index resources that the original leaked.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
}
/// <summary>
/// Searches the job index for items whose Title contains the keyword.
/// </summary>
/// <param name="kw">Keyword; lower-cased and split with the PanGu analyzer.</param>
/// <param name="index">1-based start row of the page to fetch.</param>
/// <param name="skipCount">Number of rows in the page.</param>
/// <returns>One page of matching jobs with the keyword highlighted in the title.</returns>
public static List<JobSerach> SearchContent(string kw, int index, int skipCount)
{
    kw = kw.ToLower();
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(lucenePath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    try
    {
        // Phrase query over the analyzed tokens of the keyword.
        PhraseQuery query = new PhraseQuery();
        foreach (string word in SplitWord(kw))
        {
            query.Add(new Term("Title", word));
        }
        // Words too far apart are meaningless as one phrase; 100 positions max.
        query.SetSlop(100);

        // Container holding the top-ranked hits.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        // Page: fetch skipCount hits starting at row (index - 1).
        ScoreDoc[] docs = collector.TopDocs(index - 1, skipCount).scoreDocs;

        List<JobSerach> list = new List<JobSerach>();
        for (int i = 0; i < docs.Length; i++)
        {
            // ScoreDoc[] only carries document ids, keeping memory pressure low;
            // details are fetched per id via searcher.Doc.
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);
            JobSerach result = new JobSerach();
            result.Title = Highlight(kw, doc.Get("Title"));
            result.Id = Convert.ToInt32(doc.Get("Id"));
            result.ImageAddress = doc.Get("ImageAddress");
            result.MaiDian = doc.Get("MaiDian");
            // NOTE(review): parses with the current culture; if the index was written
            // with InvariantCulture this should pass CultureInfo.InvariantCulture.
            result.Price = double.Parse(doc.Get("Price"));
            result.Content = doc.Get("Content");
            list.Add(result);
        }
        return list;
    }
    finally
    {
        // Fix: release index resources that the original leaked.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
}
/// <summary>
/// Adds a PhraseQuery clause (or filter) for the given phrase across several fields.
/// </summary>
/// <param name="fieldNameAndBoosts">Field names together with their boost levels.</param>
/// <param name="phrase">Phrase to search for.</param>
/// <param name="occur">How the per-field sub-queries relate to each other.</param>
/// <param name="asFilter">True to register the clause as a filter condition.</param>
/// <returns>This builder, for chaining.</returns>
public LuceneSearchBuilder WithPhrases(Dictionary<string, BoostLevel> fieldNameAndBoosts, string phrase, BooleanClause.Occur occur, bool asFilter = false)
{
    string scrubbedPhrase = ClauseScrubber.LuceneKeywordsScrub(phrase);
    if (string.IsNullOrEmpty(scrubbedPhrase))
    {
        return this;
    }

    string[] segments = ClauseScrubber.SegmentForPhraseQuery(scrubbedPhrase);
    // A single segment is an ordinary field search, not a phrase.
    if (segments.Length == 1)
    {
        return WithFields(fieldNameAndBoosts, segments[0], false, occur, asFilter);
    }

    // Build one boosted PhraseQuery per field and combine them per `occur`.
    BooleanQuery combined = new BooleanQuery();
    foreach (KeyValuePair<string, BoostLevel> entry in fieldNameAndBoosts)
    {
        PhraseQuery fieldPhrase = new PhraseQuery();
        foreach (string segment in segments)
        {
            fieldPhrase.Add(new Term(entry.Key, segment));
        }
        fieldPhrase.SetSlop(PhraseQuerySlop);
        SetBoost(fieldPhrase, entry.Value);
        combined.Add(fieldPhrase, occur);
    }

    if (asFilter)
    {
        filters.Add(new BooleanClause(combined, BooleanClause.Occur.MUST));
    }
    else
    {
        clauses.Add(new BooleanClause(combined, BooleanClause.Occur.MUST));
    }
    return this;
}
/// <summary>
/// Full-text search over the "body" field.
/// </summary>
/// <param name="keyword">User keyword; tokenized via GetKeyWords.</param>
/// <param name="startRowIndex">0-based row at which the page starts.</param>
/// <param name="pageSize">Number of rows per page.</param>
/// <param name="totalCount">Total number of hits for the query.</param>
/// <returns>One page of search results, each with a body preview.</returns>
public static List<SearchResult> DoSearch(string keyword, int startRowIndex, int pageSize, out int totalCount)
{
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(_indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    // IndexSearcher performs the actual search.
    var searcher = new IndexSearcher(reader);
    try
    {
        var query = new PhraseQuery();
        foreach (string word in GetKeyWords(keyword))
        {
            query.Add(new Term("body", word));
        }
        query.SetSlop(100); // only terms within 100 positions count as a match

        TopScoreDocCollector collector = TopScoreDocCollector.create(1024, true); // at most 1024 hits
        searcher.Search(query, null, collector);
        totalCount = collector.GetTotalHits(); // total hit count

        // One page of hits; the row index is 0-based.
        ScoreDoc[] docs = collector.TopDocs(startRowIndex, pageSize).scoreDocs;
        var list = new List<SearchResult>();
        for (int i = 0; i < docs.Length; i++)
        {
            // Hits only carry the Lucene-assigned id; fetch the stored Document
            // per id to keep memory usage low.
            int docId = docs[i].doc;
            Document doc = searcher.Doc(docId);
            string number = doc.Get("number");
            string title = doc.Get("title");
            string fullPath = doc.Get("fullPath");
            string body = doc.Get("body");
            var searchResult = new SearchResult
            {
                Number = number,
                Title = title,
                FullPath = fullPath,
                BodyPreview = Preview(body, keyword)
            };
            list.Add(searchResult);
        }
        return list;
    }
    finally
    {
        // Fix: release index resources that the original leaked on every call.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
}
/// <summary>
/// Adds a PhraseQuery clause (or filter) for the given phrase on a single field.
/// </summary>
/// <param name="fieldName">Field to search.</param>
/// <param name="phrase">Phrase to search for.</param>
/// <param name="boostLevel">Optional boost level for the clause.</param>
/// <param name="asFilter">True to register the clause as a filter condition.</param>
/// <returns>LuceneSearchBuilder for chaining.</returns>
public LuceneSearchBuilder WithPhrase(string fieldName, string phrase, BoostLevel?boostLevel = null, bool asFilter = false)
{
    string scrubbedPhrase = ClauseScrubber.LuceneKeywordsScrub(phrase);
    if (string.IsNullOrEmpty(scrubbedPhrase))
    {
        return this;
    }

    // A single character cannot form a phrase; fall back to a plain field search.
    if (scrubbedPhrase.Length == 1)
    {
        return WithField(fieldName, scrubbedPhrase, false, boostLevel, asFilter);
    }

    PhraseQuery phraseQuery = new PhraseQuery();
    foreach (string segment in ClauseScrubber.SegmentForPhraseQuery(scrubbedPhrase))
    {
        phraseQuery.Add(new Term(fieldName, segment));
    }
    phraseQuery.SetSlop(PhraseQuerySlop);

    if (boostLevel.HasValue)
    {
        SetBoost(phraseQuery, boostLevel.Value);
    }

    if (asFilter)
    {
        filters.Add(new BooleanClause(phraseQuery, BooleanClause.Occur.MUST));
    }
    else
    {
        clauses.Add(new BooleanClause(phraseQuery, BooleanClause.Occur.MUST));
    }
    return this;
}
/// <summary>
/// Searches the index over the "content" field and materializes each hit into a
/// <typeparamref name="T"/> via the supplied mapping callback.
/// </summary>
/// <param name="kw">Keyword; tokenized before querying.</param>
/// <param name="AddResult">Callback that copies Document fields onto the result entity.</param>
/// <returns>List of mapped result entities.</returns>
public List<T> SearchFromIndexData<T>(string kw, Action<T, Document> AddResult) where T : new()
{
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    try
    {
        PhraseQuery query = new PhraseQuery(); // search condition
        foreach (string word in SplitContent.SplitWords(kw))
        {
            query.Add(new Term("content", word)); // multiple terms are ANDed within the phrase
        }
        query.SetSlop(100); // maximum allowed distance between the keywords

        // Container holding the top-ranked hits.
        TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        searcher.Search(query, null, collector);
        // All hits; TopDocs(20, 10) would return hits 20-30 (paging).
        ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;

        List<T> bookResult = new List<T>();
        for (int i = 0; i < docs.Length; i++)
        {
            int docId = docs[i].doc;            // Lucene-internal document id
            Document doc = searcher.Doc(docId); // fetch the stored document by id
            T entity = new T();
            AddResult(entity, doc);
            bookResult.Add(entity);
        }
        return bookResult;
    }
    finally
    {
        // Fix: release index resources that the original leaked on every call.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
}
/// <summary>
/// Searches the "body" field of the index at E:/Index for the kw field's value.
/// </summary>
/// <param name="startRowIndex">0-based row at which the page starts.</param>
/// <param name="pageSize">Number of rows per page.</param>
/// <param name="totalCount">Total number of hits for the query.</param>
/// <returns>One page of search results.</returns>
private List<SearchResult> DoSearch(int startRowIndex, int pageSize, out int totalCount)
{
    // TODO(review): index path is hard-coded; consider moving it to configuration.
    string indexPath = "E:/Index";
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    // IndexSearcher performs the actual search.
    IndexSearcher searcher = new IndexSearcher(reader);
    try
    {
        PhraseQuery query = new PhraseQuery();
        foreach (string word in CommonHelper.SplitWord(kw))
        {
            query.Add(new Term("body", word));
        }
        query.SetSlop(100); // only terms within 100 positions count as a match

        TopScoreDocCollector collector = TopScoreDocCollector.create(1024, true); // at most 1024 hits
        searcher.Search(query, null, collector);
        totalCount = collector.GetTotalHits(); // total hit count

        // One page of hits; the row index is 0-based.
        ScoreDoc[] docs = collector.TopDocs(startRowIndex, pageSize).scoreDocs;
        List<SearchResult> list = new List<SearchResult>();
        for (int i = 0; i < docs.Length; i++)
        {
            // Hits only carry the Lucene-assigned id; fetch the stored Document
            // per id to keep memory usage low.
            int docID = docs[i].doc;
            Document doc = searcher.Doc(docID);
            string number = doc.Get("number");
            string title = doc.Get("title");
            string body = doc.Get("body");
            // NOTE(review): title is assigned to Score and the preview to Uri —
            // looks like mismatched property names; preserved as-is, confirm intent.
            SearchResult searchResult = new SearchResult()
            {
                Number = number,
                Score = title,
                Uri = Preview(body, kw)
            };
            list.Add(searchResult);
        }
        return list;
    }
    finally
    {
        // Fix: release index resources that the original leaked on every call.
        searcher.Close();
        reader.Close();
        directory.Close();
    }
}
/// <summary>
/// Adds a standard type clause to this instance
/// </summary>
/// <param name="term">Term to add to this query.</param>
/// <param name="occurrence">Defines how the term is added to this query.</param>
/// <param name="slop">The amount of allowed slop in a phrase query.</param>
/// <remarks>
/// Slop is the amount of movement each word is allowed in a non-exact phrase query.
/// For instance if you search for "Adobe Systems Incorporated" and the slop is set to 0 then
/// only results with that term is allowed. If you set the slop to 2 then two movements can be
/// made, max, for each word. In the same example with slop set to 2 results would be returned
/// for "Adobe Systems Incorporated", "Adobe Incorporated Systems", "Systems Adobe Incorporated",
/// and "Systems Incorporated Adobe".
/// </remarks>
public void AddBooleanClause(SearchTerm term, ClauseOccurrence occurrence, int slop)
{
    if (term == null)
    {
        throw new ArgumentNullException("term", "term cannot be null");
    }
    IncrementTotalClauses(1);
    if (term.IsPhrase)
    {
        // Phrase terms go through a PhraseQuery so slop applies.
        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.Add(term.GetLuceneTerm());
        phraseQuery.SetSlop(slop);
        phraseQuery.SetBoost(term.Boost);
        this.luceneQuery.Add(phraseQuery, TypeConverter.ConvertToLuceneClauseOccurrence(occurrence));
    }
    else
    {
        // Single-word terms use a plain TermQuery; slop is irrelevant.
        TermQuery termQuery = new TermQuery(term.GetLuceneTerm());
        termQuery.SetBoost(term.Boost);
        this.luceneQuery.Add(termQuery, TypeConverter.ConvertToLuceneClauseOccurrence(occurrence));
    }
}