/// <summary>
/// Applies this class's standard "more like this" settings to <paramref name="query"/>.
/// </summary>
/// <param name="query">The MoreLikeThis instance that is configured in place.</param>
private void CreateMLTQuery(MoreLikeThis query)
{
    var standardAnalyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    string[] similarityFields = { "title", "description" };

    query.Analyzer = standardAnalyzer;

    // Frequency thresholds of 1 for both the source document and the index,
    // with the generated query capped at 15 terms.
    query.MinTermFreq = 1;
    query.MinDocFreq = 1;
    query.MaxQueryTerms = 15;

    // Terms are taken from the title and description fields; English stop words are skipped.
    query.SetFieldNames(similarityFields);
    query.SetStopWords(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
/// <summary>
/// Finds stories related to the given story by running a Lucene "more like this" query
/// built from that story's indexed document.
/// </summary>
/// <param name="hostId">Host id, used to resolve the document id and to obtain the searcher.</param>
/// <param name="storyId">Id of the story to find related stories for; it is excluded from the results.</param>
/// <returns>
/// The value returned by <c>SearchQuery.LoadStorySearchResults</c> for the collected story ids,
/// or <c>null</c> when <c>ConvertStoryIdtoDocId</c> yields no document id.
/// </returns>
public StoryCollection Find(int hostId, int storyId)
{
    int? sourceDocId = ConvertStoryIdtoDocId(hostId, storyId);
    if (!sourceDocId.HasValue)
    {
        return null;
    }

    IndexSearcher searcher = SearchQuery.GetSearcher(hostId);
    IndexReader reader = searcher.GetIndexReader();

    MoreLikeThis moreLikeThis = new MoreLikeThis(reader);
    moreLikeThis.SetAnalyzer(new DnkAnalyzer());

    // These values control the query used to find related/similar stories:
    //  - only the title and tags fields are used
    //  - a term must appear at least once in the source document
    //  - the query contains at most 5 terms
    //  - words shorter than 3 characters are ignored
    //  - a term must appear in at least 4 documents
    moreLikeThis.SetFieldNames(new string[] { "title", "tags" });
    moreLikeThis.SetMinTermFreq(1);
    moreLikeThis.SetMaxQueryTerms(5);
    moreLikeThis.SetMinWordLen(3);
    moreLikeThis.SetMinDocFreq(4);
    moreLikeThis.SetStopWords(StopWords());
    moreLikeThis.SetBoost(true);

    Query relatedQuery = moreLikeThis.Like(sourceDocId.Value);
    Hits hits = searcher.Search(relatedQuery);

    List<int> relatedStoryIds = new List<int>();
    for (int hitIndex = 0; hitIndex < hits.Length(); hitIndex++)
    {
        Document hitDocument = hits.Doc(hitIndex);
        int hitStoryId = int.Parse(hitDocument.GetField("id").StringValue());

        // The source story matches itself; leave it out of the related list.
        if (hitStoryId == storyId)
        {
            continue;
        }

        relatedStoryIds.Add(hitStoryId);
        if (relatedStoryIds.Count == NUMBER_OF_RELATED_STORIES_TO_RETURN)
        {
            break;
        }
    }

    return SearchQuery.LoadStorySearchResults(relatedStoryIds);
}
/// <summary>
/// Searches the indexed documents for ones similar to the text <paramref name="original"/>
/// using a MoreLikeThis (TF-IDF based) query.
/// </summary>
/// <param name="original">The text to find similar documents for.</param>
/// <param name="stopWords">Stop words handed to the MoreLikeThis query builder.</param>
/// <param name="analyzer">Analyzer assigned to the MoreLikeThis query builder.</param>
/// <returns>
/// The stored documents of the best-scoring hits, in hit order. At most 25 documents are
/// returned, and never more than DEFAULT_DOCUMENT_TO_SEARCH hits are collected.
/// </returns>
public static List<Document> moreLikeThisAnalyzer(String original, ISet<string> stopWords, Lucene.Net.Analysis.Analyzer analyzer)
{
    Trace.WriteLine("Realizando la Búsqueda");
    List<Document> DocumenResult = new List<Document>();

    // The reader and searcher are created per call, so they are disposed here; the
    // original never released them. The documents are loaded into the result list
    // before the reader is closed.
    using (IndexReader indexReader = IndexReader.Open(_directory, true))
    using (IndexSearcher indexSearcher = new IndexSearcher(indexReader))
    {
        MoreLikeThis mlt = new MoreLikeThis(indexReader);
        mlt.SetFieldNames(DEFAULT_FIELD_NAMES);
        mlt.MinDocFreq = DEFALT_MIN_DOC_FREQ;
        mlt.MinTermFreq = DEFAULT_MIN_TERM_FREQ;
        mlt.MaxQueryTerms = MAX_QUERY_TERMS;
        mlt.MinWordLen = DEFAULT_MIN_WORD_LENGTH;
        mlt.Analyzer = analyzer;
        mlt.SetStopWords(stopWords);

        Query query = mlt.Like(new System.IO.StringReader(original));

        int topCount = DEFAULT_DOCUMENT_TO_SEARCH;
        TopScoreDocCollector collector = TopScoreDocCollector.Create(topCount, true);
        indexSearcher.Search(query, collector);
        ScoreDoc[] hits = collector.TopDocs().ScoreDocs;

        int len = hits.Length;
        Trace.WriteLine("Entering");
        Trace.WriteLine("-------------------------------------------");
        Trace.WriteLine("original :" + original);
        Trace.WriteLine("query: " + query);
        Trace.WriteLine("found: " + len + " documents");

        for (int i = 0; i < Math.Min(25, len); i++)
        {
            int d = hits[i].Doc;
            Trace.WriteLine("score : " + hits[i].Score);
            Trace.WriteLine("name : " + d.ToString());

            // Put the hit's stored document into the result list.
            Document doc = indexSearcher.Doc(d);
            DocumenResult.Add(doc);
        }

        Trace.WriteLine("-------------------------------------------");
        Trace.WriteLine("Exiting");
    }

    return DocumenResult;
}
/// <summary>
/// Finds currently published pages similar to the page indexed under <paramref name="key"/>,
/// using a MoreLikeThis query over the title, summary, content and tags fields.
/// </summary>
/// <param name="key">Value of the "key" field identifying the source page.</param>
/// <param name="resultOffset">Zero-based index of the first hit to return; negative values are treated as 0.</param>
/// <param name="resultLength">Maximum number of hits to return; values of 0 or less return no hits.</param>
/// <param name="matchCategory">
/// When true and the source page has a non-empty "category" field, only pages whose
/// category equals its lower-cased value are matched.
/// </param>
/// <returns>
/// A <c>SearchResult</c> whose <c>NumberOfHits</c> is the number of matches retrieved
/// (bounded by MaxHits), whose <c>Hits</c> holds the requested page of results, and whose
/// <c>SecondsTaken</c> is the elapsed search time. An empty <c>SearchResult</c> is returned
/// when no page is indexed under <paramref name="key"/>.
/// </returns>
public SearchResult FindSimular(string key, int resultOffset, int resultLength, bool matchCategory)
{
    var pageQuery = new TermQuery(new Term("key", key));
    var topDocs = _searcher.Search(pageQuery, 1);
    if (topDocs.TotalHits == 0)
    {
        return new SearchResult();
    }

    var doc = topDocs.ScoreDocs[0].Doc;

    var moreLikeThis = new MoreLikeThis(_reader)
    {
        Analyzer = _analyzer,
        MinWordLen = 3
    };
    moreLikeThis.SetFieldNames(new[] { "title", "summary", "content", "tags" });
    moreLikeThis.SetStopWords(StopWords.DefaultEnglish);
    moreLikeThis.MinDocFreq = 2;
    var query = moreLikeThis.Like(doc);

    // Stopwatch is monotonic; wall-clock subtraction is skewed by clock or DST adjustments.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

    // Only pages whose publish window contains the current instant (UTC ticks):
    // publishStart before now and publishStop after now.
    var ticks = DateTime.UtcNow.Ticks;
    Query publishStartQuery = NumericRangeQuery.NewLongRange("publishStart", null, ticks, true, false);
    Query publishStopQuery = NumericRangeQuery.NewLongRange("publishStop", ticks, null, false, true);

    var booleanQuery = new BooleanQuery
    {
        { query, Occur.MUST },
        { pageQuery, Occur.MUST_NOT }, // never return the source page itself
        { publishStartQuery, Occur.MUST },
        { publishStopQuery, Occur.MUST }
    };

    if (matchCategory)
    {
        var sourceDocument = _searcher.Doc(doc);
        var field = sourceDocument.GetField("category");
        if (field != null && !string.IsNullOrEmpty(field.StringValue))
        {
            var categoryQuery = new TermQuery(new Term("category", field.StringValue.ToLowerInvariant()));
            booleanQuery.Add(categoryQuery, Occur.MUST);
        }
    }

    var scoreDocs = _searcher.Search(booleanQuery, null, MaxHits, Sort.RELEVANCE).ScoreDocs;
    var result = new SearchResult { NumberOfHits = scoreDocs.Length };

    // Clamp the requested window to the available hits. A negative offset would index
    // before the start of the array, and the end is computed in long so that
    // offset + length cannot overflow int.
    var firstIndex = Math.Max(resultOffset, 0);
    var endIndex = (int)Math.Min((long)firstIndex + Math.Max(resultLength, 0), (long)scoreDocs.Length);

    for (int i = firstIndex; i < endIndex; i++)
    {
        Document document = _searcher.Doc(scoreDocs[i].Doc);

        // A missing or unparsable "pageId" leaves pageId at whatever TryParseGuid assigns on failure.
        Guid pageId;
        (document.Get("pageId") ?? string.Empty).TryParseGuid(out pageId);

        var hit = new SearchHit
        {
            PageId = pageId,
            Path = document.Get("path"),
            Title = document.Get("title"),
            Excerpt = document.Get("summary")
        };
        result.Hits.Add(hit);
    }

    stopwatch.Stop();
    result.SecondsTaken = stopwatch.Elapsed.TotalSeconds;
    return result;
}