/// <summary>
/// Builds a MoreLikeThis query seeded with the indexed document that belongs to a project.
/// </summary>
/// <param name="projectId">Identifier handed to <c>GetLuceneProjectDocumentNumber</c> to find the seed document.</param>
/// <returns>The generated similarity query, or <c>null</c> when the looked-up document number is 0.</returns>
private static Query CreateMoreProjectsLikeThisQuery(int projectId)
{
    int seedDocNumber = GetLuceneProjectDocumentNumber(projectId);
    if (seedDocNumber == 0)
    {
        // A document number of 0 is treated as "nothing to compare against".
        return null;
    }

    var standardAnalyzer = new StandardAnalyzer(_version);
    using (var indexSearcher = new IndexSearcher(_directory, false))
    {
        // Fields of LuceneSearchModel whose terms feed the similarity query.
        string[] comparedFields =
        {
            StronglyTyped.PropertyName<LuceneSearchModel>(x => x.ProductId),
            StronglyTyped.PropertyName<LuceneSearchModel>(x => x.Title),
            StronglyTyped.PropertyName<LuceneSearchModel>(x => x.Description),
            StronglyTyped.PropertyName<LuceneSearchModel>(x => x.Price),
            StronglyTyped.PropertyName<LuceneSearchModel>(x => x.ProductStatus),
            StronglyTyped.PropertyName<LuceneSearchModel>(x => x.Category)
        };

        var similarity = new MoreLikeThis(indexSearcher.IndexReader);
        similarity.Analyzer = standardAnalyzer;
        similarity.SetFieldNames(comparedFields);
        similarity.MinDocFreq = 1;
        similarity.MinTermFreq = 1;
        similarity.Boost = true;

        return similarity.Like(seedDocNumber);
    }
}
/// <summary>
/// Returns articles similar to the given article, compared on the
/// "Name", "Body", "TagName" and "FeedName" index fields.
/// </summary>
/// <param name="articleId">Value of the "Id" field of the article to find matches for.</param>
/// <param name="count">Count forwarded to <c>ConvertToArticles</c>; <c>count + 1</c> hits are requested from the index.</param>
/// <returns>
/// The converted hits without the article itself; an empty list when no document has the given "Id".
/// </returns>
public List<Article> GetRelatedArticles(int articleId, int count)
{
    // One reader backs both the lookup and MoreLikeThis, so the document number found by
    // the searcher is valid for the reader that MoreLikeThis inspects. The using blocks
    // release both objects on every exit path, including the early return below.
    using (var reader = IndexReader.Open(_directory, true))
    using (var searcher = new IndexSearcher(reader))
    {
        var searchQuery = new TermQuery(new Term("Id", articleId.ToString()));
        var doc = searcher.Search(searchQuery, 1);
        if (doc.TotalHits == 0)
        {
            return new List<Article>();
        }

        var docId = doc.ScoreDocs[0].Doc;

        var mlt = new MoreLikeThis(reader);
        mlt.SetFieldNames(new[] { "Name", "Body", "TagName", "FeedName" });
        Query query = mlt.Like(docId);

        // One extra hit is requested because the article itself is filtered out afterwards.
        var hits = searcher.Search(query, count + 1);

        // Materialize inside the using block: the sequence may read from the searcher
        // lazily and must be fully enumerated before the searcher is disposed.
        return ConvertToArticles(hits, searcher, 1, count)
            .Where(a => a.Id != articleId)
            .ToList();
    }
}
/// <summary>
/// For the configured document and fields, finds documents that are similar to it.
/// Assumes the index is up to date.
/// </summary>
/// <returns>
/// The similar documents found (page name and node id), never including the source
/// document itself; empty when <c>IsInit()</c> is false or the source document number is 0.
/// </returns>
public IEnumerable<SearchResultItem> FindMoreLikeThis()
{
    var results = new List<SearchResultItem>();
    if (!IsInit())
    {
        return results;
    }

    var moreLikeThis = new MoreLikeThis(reader);
    moreLikeThis.SetFieldNames(fieldsToSearch.ToArray());
    moreLikeThis.SetMinTermFreq(1);
    moreLikeThis.SetMinDocFreq(1);

    int currentLuceneDocId = GetLuceneDocNo(docId);
    if (currentLuceneDocId == 0)
    {
        return results;
    }

    var query = moreLikeThis.Like(currentLuceneDocId);
    var docs = searcher.Search(query, maxNo);

    // One slot of the requested hits is reserved for the source document, so at most
    // (hits - 1) items are returned.
    int limit = docs.ScoreDocs.Length - 1;

    foreach (var scoreDoc in docs.ScoreDocs)
    {
        if (results.Count >= limit)
        {
            break;
        }

        // The source document is usually the top hit, but that is not guaranteed;
        // exclude it by document number instead of by position.
        if (scoreDoc.doc == currentLuceneDocId)
        {
            continue;
        }

        var d = reader.Document(scoreDoc.doc);
        results.Add(new SearchResultItem
        {
            PageName = d.GetField("nodeName").StringValue(),
            NodeId = int.Parse(d.GetField("__NodeId").StringValue())
        });
    }

    return results;
}
/// <summary>
/// Collects documents similar to an indexed document, compared on the "Title" and "Content" fields.
/// </summary>
/// <param name="indexName">Not read by this method.</param>
/// <param name="indexDocumentId">Lucene document number of the document to find matches for.</param>
/// <param name="maxDocs">Number of top-scoring hits to collect.</param>
/// <returns>One <c>CorpusDocument</c> (Id and Title) per collected hit.</returns>
public static IList<CorpusDocument> GetMoreLikeThis(string indexName, int indexDocumentId, int maxDocs)
{
    // See: http://lucene.apache.org/java/2_2_0/api/org/apache/lucene/search/similar/MoreLikeThis.html
    var similarity = new MoreLikeThis(Searcher.GetIndexReader());
    similarity.SetAnalyzer(new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29));
    similarity.SetFieldNames(new[] { "Title", "Content" });
    similarity.SetMinWordLen(4); // improve relevancy

    var similarityQuery = similarity.Like(indexDocumentId);
    var collector = TopScoreDocCollector.create(maxDocs, true);
    Searcher.Search(similarityQuery, collector);
    var scoreDocs = collector.TopDocs().ScoreDocs;

    var documents = new List<CorpusDocument>(maxDocs);
    for (int i = 0; i < scoreDocs.Length; i++)
    {
        var stored = Searcher.Doc(scoreDocs[i].doc);
        documents.Add(new CorpusDocument
        {
            Id = stored.Get("Id"),
            Title = stored.Get("Title"),
        });
    }

    return documents;
}
/// <summary>
/// Finds content related to an entry by running a MoreLikeThis query, built from the
/// Title, Body and Tags fields of the entry's indexed document, through <c>PerformQuery</c>.
/// </summary>
/// <param name="entryId">Entry whose indexed document seeds the similarity query.</param>
/// <param name="max">Hit count for the lookup search; <c>max + 1</c> is passed to <c>PerformQuery</c>.</param>
/// <param name="blogId">Forwarded unchanged to <c>PerformQuery</c>.</param>
/// <returns>The result of <c>PerformQuery</c>, or an empty list when the entry is not in the index.</returns>
public IEnumerable<SearchEngineResult> RelatedContents(int entryId, int max, int blogId)
{
    var list = new List<SearchEngineResult>();

    // First look for the original doc
    Query query = GetIdSearchQuery(entryId);
    TopDocs hits = Searcher.Search(query, max);
    if (hits.scoreDocs.Length <= 0)
    {
        return list;
    }

    int docNum = hits.scoreDocs[0].doc;

    // Setup MoreLikeThis searcher.
    // NOTE(review): docNum comes from Searcher while MoreLikeThis reads a reader obtained
    // from the writer; this presumes both see the same document numbering - confirm.
    var reader = DoWriterAction(w => w.GetReader());
    Query moreResultsQuery;
    try
    {
        var mlt = new MoreLikeThis(reader);
        mlt.SetAnalyzer(_analyzer);
        mlt.SetFieldNames(new[] { Title, Body, Tags });
        mlt.SetMinDocFreq(_settings.Parameters.MinimumDocumentFrequency);
        mlt.SetMinTermFreq(_settings.Parameters.MinimumTermFrequency);
        mlt.SetBoost(_settings.Parameters.MoreLikeThisBoost);
        moreResultsQuery = mlt.Like(docNum);
    }
    finally
    {
        // The reader is only needed while the query is being built; it was previously
        // never closed, leaking one reader per call.
        if (reader != null)
        {
            reader.Close();
        }
    }

    return PerformQuery(list, moreResultsQuery, max + 1, blogId, entryId);
}
/// <summary>
/// Finds content related to an entry by running a MoreLikeThis query, built from the
/// Categories, Silouhettes and Tags fields of the entry's indexed document, through <c>PerformQuery</c>.
/// </summary>
/// <param name="entryId">Entry whose indexed document seeds the similarity query.</param>
/// <param name="max">Hit count for the lookup search; <c>max + 1</c> is passed to <c>PerformQuery</c>.</param>
/// <param name="blogId">Forwarded unchanged to <c>PerformQuery</c>.</param>
/// <returns>
/// The result of <c>PerformQuery</c>, or a response with an empty result list and the lookup's
/// total hit count when the entry is not in the index.
/// </returns>
public SearchEngineResponse RelatedContents(int entryId, int max, int blogId)
{
    var list = new List<SearchEngineResult>();

    // First look for the original doc
    Query query = GetIdSearchQuery(entryId);
    TopDocs hits = Searcher.Search(query, max);
    if (hits.scoreDocs.Length <= 0)
    {
        return new SearchEngineResponse { TotalCount = hits.totalHits, Results = list };
    }

    int docNum = hits.scoreDocs[0].doc;

    // Setup MoreLikeThis searcher.
    // NOTE(review): docNum comes from Searcher while MoreLikeThis reads a reader obtained
    // from the writer; this presumes both see the same document numbering - confirm.
    var reader = DoWriterAction(w => w.GetReader());
    Query moreResultsQuery;
    try
    {
        var mlt = new MoreLikeThis(reader);
        mlt.SetAnalyzer(_analyzer);
        mlt.SetFieldNames(new[] { Categories, Silouhettes, Tags });
        moreResultsQuery = mlt.Like(docNum);
    }
    finally
    {
        // The reader is only needed while the query is being built; it was previously
        // never closed, leaking one reader per call.
        if (reader != null)
        {
            reader.Close();
        }
    }

    return PerformQuery(list, moreResultsQuery, 0, max + 1, blogId, entryId);
}
/// <summary>
/// Finds stories related to the given story with a MoreLikeThis query over the
/// "title" and "tags" fields.
/// </summary>
/// <param name="hostId">Host whose searcher is used and in which the story is looked up.</param>
/// <param name="storyId">Story to find related stories for; it is excluded from the result.</param>
/// <returns>
/// The stories loaded for up to <c>NUMBER_OF_RELATED_STORIES_TO_RETURN</c> related ids,
/// or <c>null</c> when the story cannot be mapped to an index document.
/// </returns>
public StoryCollection Find(int hostId, int storyId)
{
    int? seedDocId = ConvertStoryIdtoDocId(hostId, storyId);
    if (!seedDocId.HasValue)
    {
        return null;
    }

    IndexSearcher storySearcher = SearchQuery.GetSearcher(hostId);
    IndexReader storyReader = storySearcher.GetIndexReader();

    var similarity = new MoreLikeThis(storyReader);
    similarity.SetAnalyzer(new DnkAnalyzer());

    // These values control the query used to find related/similar stories:
    //  - only the title and tags fields are used,
    //  - a term must appear 1 or more times in the source document,
    //  - the query will have at most 5 terms,
    //  - words shorter than 3 characters are ignored,
    //  - a term must appear in at least 4 documents.
    similarity.SetFieldNames(new string[] { "title", "tags" });
    similarity.SetMinTermFreq(1);
    similarity.SetMaxQueryTerms(5);
    similarity.SetMinWordLen(3);
    similarity.SetMinDocFreq(4);
    similarity.SetStopWords(StopWords());
    similarity.SetBoost(true);

    Query similarityQuery = similarity.Like(seedDocId.Value);
    Hits relatedHits = storySearcher.Search(similarityQuery);

    var relatedStoryIds = new List<int>();
    int index = 0;
    while (index < relatedHits.Length())
    {
        Document hitDocument = relatedHits.Doc(index);
        int candidateId = int.Parse(hitDocument.GetField("id").StringValue());
        index++;

        // The story itself is not a related story.
        if (candidateId == storyId)
        {
            continue;
        }

        relatedStoryIds.Add(candidateId);
        if (relatedStoryIds.Count == NUMBER_OF_RELATED_STORIES_TO_RETURN)
        {
            break;
        }
    }

    return SearchQuery.LoadStorySearchResults(relatedStoryIds);
}
/// <summary>
/// Creates the MoreLikeThis instance for this query over the context's index reader,
/// configures it for the "Text" field, then runs the base preparation.
/// </summary>
/// <param name="context">Execution context that supplies the searcher (and thus the index reader).</param>
public override void PrepareSearchSettings(IQueryExecutionContext context)
{
    var similarity = new MoreLikeThis(context.Searcher.IndexReader);
    mlt = similarity;

    // A term must occur in at least two documents and at least once in the source.
    similarity.MinDocFreq = 2;
    similarity.MinTermFreq = 1;
    similarity.Analyzer = new StandardAnalyzer(Version.LUCENE_30);
    similarity.SetFieldNames(new string[] { "Text" });

    base.PrepareSearchSettings(context);
}
/// <summary>
/// Applies the similarity settings to a MoreLikeThis instance: standard analyzer,
/// "title" and "description" fields, English stop words, and term limits.
/// </summary>
/// <param name="query">The MoreLikeThis instance to configure in place.</param>
private void CreateMLTQuery(MoreLikeThis query)
{
    var standardAnalyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    string[] comparedFields = { "title", "description" };

    query.Analyzer = standardAnalyzer;

    // Term selection thresholds: every term counts, but at most 15 go into the query.
    query.MinTermFreq = 1;
    query.MinDocFreq = 1;
    query.MaxQueryTerms = 15;

    query.SetFieldNames(comparedFields);
    query.SetStopWords(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
/// <summary>
/// Searches the indexed documents for ones similar to the text <paramref name="original"/>
/// using a MoreLikeThis query, and traces the query and the scores of the hits.
/// </summary>
/// <param name="original">Free text the similarity query is built from.</param>
/// <param name="stopWords">Words ignored when building the query.</param>
/// <param name="analyzer">Analyzer used to tokenize <paramref name="original"/>.</param>
/// <returns>The stored documents of the best hits, at most 25.</returns>
public static List<Document> moreLikeThisAnalyzer(String original, ISet<string> stopWords, Lucene.Net.Analysis.Analyzer analyzer)
{
    Trace.WriteLine("Realizando la Búsqueda");
    List<Document> DocumenResult = new List<Document>();

    // Both the reader and the searcher are disposed here; a searcher built on an existing
    // reader does not own it, so each needs its own using block.
    using (IndexReader indexReader = IndexReader.Open(_directory, true))
    using (IndexSearcher indexSearcher = new IndexSearcher(indexReader))
    {
        MoreLikeThis mlt = new MoreLikeThis(indexReader);
        mlt.SetFieldNames(DEFAULT_FIELD_NAMES);
        mlt.MinDocFreq = DEFALT_MIN_DOC_FREQ;
        mlt.MinTermFreq = DEFAULT_MIN_TERM_FREQ;
        mlt.MaxQueryTerms = MAX_QUERY_TERMS;
        mlt.MinWordLen = DEFAULT_MIN_WORD_LENGTH;
        mlt.Analyzer = analyzer;
        mlt.SetStopWords(stopWords);
        Query query = mlt.Like(new System.IO.StringReader(original));

        int topCount = DEFAULT_DOCUMENT_TO_SEARCH;
        TopScoreDocCollector collector = TopScoreDocCollector.Create(topCount, true);
        indexSearcher.Search(query, collector);
        ScoreDoc[] hits = collector.TopDocs().ScoreDocs;
        int len = hits.Length;

        Trace.WriteLine("Entering");
        Trace.WriteLine("-------------------------------------------");
        Trace.WriteLine("original :" + original);
        Trace.WriteLine("query: " + query);
        Trace.WriteLine("found: " + len + " documents");

        for (int i = 0; i < Math.Min(25, len); i++)
        {
            int d = hits[i].Doc;
            Trace.WriteLine("score : " + hits[i].Score);
            Trace.WriteLine("name : " + d.ToString());

            // Load the stored document while the searcher is still open and add it to the results.
            Document doc = indexSearcher.Doc(d);
            DocumenResult.Add(doc);
        }

        Trace.WriteLine("-------------------------------------------");
        Trace.WriteLine("Exiting");
    }

    return DocumenResult;
}
/// <summary>
/// Finds sources whose "FileData" field is similar to that of the given source.
/// </summary>
/// <param name="sourceId">Value of the "Id" field of the source to find matches for.</param>
/// <param name="numResults">Maximum number of similar sources to return.</param>
/// <returns>
/// The similar sources with their scores, excluding the source itself; empty when the
/// source is not in the index. Also overwrites <c>this.searcher</c> and <c>this.topDocs</c>.
/// </returns>
public IList<LuceneSearchResult> GetSourcesLikeThis(int sourceId, int numResults)
{
    IList<LuceneSearchResult> results = new List<LuceneSearchResult>();

    using (SearcherManager manager = new SearcherManager(SourceIndexWriterSingleton.Instance))
    {
        // NOTE(review): the object returned by Acquire() is never released here - confirm
        // whether SearcherManager requires a matching release call.
        this.searcher = manager.Acquire().Searcher;

        // First search: locate the source document by its Id.
        Query query = NumericRangeQuery.NewIntRange("Id", sourceId, sourceId, true, true);
        this.topDocs = this.searcher.Search(query, null, 1);

        bool sourceFound = this.topDocs != null
            && this.topDocs.ScoreDocs != null
            && this.topDocs.ScoreDocs.Length > 0;
        if (!sourceFound)
        {
            return results;
        }

        // Second search: MoreLikeThis query seeded with the source document.
        using (IndexReader reader = IndexReader.Open(SourceIndexWriterSingleton.Directory, true))
        {
            MoreLikeThis mlt = new MoreLikeThis(reader);
            mlt.SetFieldNames(new string[] { "FileData" });
            mlt.MinTermFreq = 1;
            mlt.MinDocFreq = 1;

            // Similar to the source, but never the source itself.
            BooleanQuery bq = new BooleanQuery();
            bq.Add(mlt.Like(this.topDocs.ScoreDocs[0].Doc), Occur.MUST);
            bq.Add(query, Occur.MUST_NOT);
            log.Info("More like this query: " + bq.ToString());

            TopDocs similarDocs = this.searcher.Search(bq, numResults);
            foreach (ScoreDoc scoreDoc in similarDocs.ScoreDocs)
            {
                results.Add(new LuceneSearchResult(this.searcher.Doc(scoreDoc.Doc), scoreDoc.Score, similarDocs.TotalHits));
            }
        }
    }

    return results;
}
/// <summary>
/// Runs a MoreLikeThis search for the sample text "are the most well known" against the
/// "content" field of the index and loads the top five matching documents.
/// </summary>
/// <param name="path">Directory holding the index to search.</param>
private static void Search(Directory path)
{
    // The reader, the text reader and the searcher are all released when the method exits.
    // The searcher reuses the reader instead of opening a second one.
    using (var ir = IndexReader.Open(path, true))
    using (var reader = new System.IO.StringReader("are the most well known"))
    using (var searcher = new IndexSearcher(ir))
    {
        var mlt = new MoreLikeThis(ir);
        mlt.SetFieldNames(new string[] { "content" });
        mlt.MinTermFreq = 1;
        mlt.MinDocFreq = 1;

        var query = mlt.Like(reader);
        var topDocs = searcher.Search(query, 5);
        foreach (var scoreDoc in topDocs.ScoreDocs)
        {
            // The document is loaded but nothing is done with it.
            Document doc = searcher.Doc(scoreDoc.Doc);
        }
    }
}
/// <summary>
/// Finds items related to the item with the given id using a MoreLikeThis query,
/// optionally restricted to one content type.
/// </summary>
/// <param name="id">Id of the item to find matches for; the item itself is excluded from the result.</param>
/// <param name="context">
/// Supplies the index name, the fields to compare (when any are given), an optional
/// content type restriction and the maximum number of hits.
/// </param>
/// <returns>The related hits with their scores, fully loaded before the index is closed.</returns>
public IEnumerable<ISearchHit> GetRelatedItems(int id, RelatedContentContext context)
{
    // Reader and searcher are disposed on exit; a searcher built on an existing reader
    // does not own it, so each needs its own using block.
    using (IndexReader reader = IndexReader.Open(GetDirectory(context.Index), true))
    using (var indexSearcher = new IndexSearcher(reader))
    {
        var analyzer = _analyzerProvider.GetAnalyzer(context.Index);
        var mlt = new MoreLikeThis(reader)
        {
            Boost = true,
            MinTermFreq = 1,
            Analyzer = analyzer,
            MinDocFreq = 1
        };
        if (context.FieldNames.Length > 0)
        {
            mlt.SetFieldNames(context.FieldNames);
        }

        var docid = GetDocumentId(id, indexSearcher);
        BooleanQuery query = (BooleanQuery)mlt.Like(docid);

        if (!String.IsNullOrWhiteSpace(context.ContentType))
        {
            var contentTypeQuery = new TermQuery(new Term("type", context.ContentType));
            query.Add(new BooleanClause(contentTypeQuery, Occur.MUST));
        }

        // exclude same doc
        var exclude = new TermQuery(new Term("id", id.ToString()));
        query.Add(new BooleanClause(exclude, Occur.MUST_NOT));

        TopDocs simDocs = indexSearcher.Search(query, context.Count);

        // Materialize now: the projection reads from the searcher, which is disposed when
        // this block exits, so a deferred sequence must not escape.
        return simDocs.ScoreDocs
            .Select(scoreDoc => new LuceneSearchHit(indexSearcher.Doc(scoreDoc.Doc), scoreDoc.Score))
            .ToList();
    }
}
/// <summary>
/// Builds a MoreLikeThis query over the "Name" and "Description" fields, seeded with the
/// indexed document that belongs to a product.
/// </summary>
/// <param name="prodcutId">Identifier handed to <c>GetLuceneDocumentNumber</c> to find the seed document.</param>
/// <returns>The generated similarity query, or <c>null</c> when the looked-up document number is 0.</returns>
private static Query CreateMoreLikeThisQuery(long prodcutId)
{
    var seedDocNumber = GetLuceneDocumentNumber(prodcutId);
    if (seedDocNumber == 0)
    {
        // A document number of 0 is treated as "nothing to compare against".
        return null;
    }

    var standardAnalyzer = new StandardAnalyzer(_version);
    var similarity = new MoreLikeThis(Searcher.IndexReader);
    similarity.Analyzer = standardAnalyzer;
    similarity.SetFieldNames(new string[] { "Name", "Description" });
    similarity.MinDocFreq = 1;
    similarity.MinTermFreq = 1;
    similarity.Boost = true;

    return similarity.Like(seedDocNumber);
}
/// <summary>
/// Searches the index for segments whose "fr" field is similar to
/// <paramref name="searchText"/> and returns their translations.
/// </summary>
/// <param name="dir">Directory holding the index.</param>
/// <param name="lang">
/// Currently not used: the compared field is hard-coded to "fr".
/// NOTE(review): this looks like a leftover; confirm whether the field should be <paramref name="lang"/>.
/// </param>
/// <param name="searchText">Text the similarity query is built from.</param>
/// <param name="languages">Names of the stored fields returned as translations for each hit.</param>
/// <returns>Up to 50 results, each carrying the hit's "key", its translations and its score.</returns>
public static IEnumerable<QueryResult> SearchTranslationProjects(Lucene.Net.Store.Directory dir, string lang, string searchText, IEnumerable<string> languages)
{
    var results = new List<QueryResult>();

    // The index reader, the text reader and the searcher are all released on exit.
    // The searcher reuses the reader instead of opening a second one.
    using (var ir = IndexReader.Open(dir, true))
    using (var reader = new System.IO.StringReader(searchText))
    using (var searcher = new IndexSearcher(ir))
    {
        var mlt = new MoreLikeThis(ir);
        mlt.SetFieldNames(new[] { "fr" });
        mlt.MinTermFreq = 1;
        mlt.MinDocFreq = 1;
        mlt.MinWordLen = 4;

        var query = mlt.Like(reader);
        var topDocs = searcher.Search(query, 50);
        foreach (var scoreDoc in topDocs.ScoreDocs)
        {
            Document doc = searcher.Doc(scoreDoc.Doc);
            float score = scoreDoc.Score;

            // One segment per requested language, read from the hit's stored fields.
            var trads = languages.Select(l => new Segment(l, doc.Get(l)));
            results.Add(new QueryResult(doc.Get("key"), trads, score));
        }
    }

    return results;
}
/// <summary>
/// Builds a MoreLikeThis query over the "Title", "Name", "Description", "Publisher" and
/// "Author" fields, seeded with the indexed document that belongs to a post.
/// </summary>
/// <param name="postId">Identifier handed to <c>GetLuceneDocumentNumber</c> to find the seed document.</param>
/// <returns>The generated similarity query, or <c>null</c> when the looked-up document number is 0.</returns>
private static Query CreateMoreLikeThisQuery(int postId)
{
    int seedDocNumber = GetLuceneDocumentNumber(postId);
    if (seedDocNumber == 0)
    {
        // A document number of 0 is treated as "nothing to compare against".
        return null;
    }

    var standardAnalyzer = new StandardAnalyzer(_version);
    using (var indexSearcher = new IndexSearcher(_directory, false))
    {
        string[] comparedFields = { "Title", "Name", "Description", "Publisher", "Author" };

        var similarity = new MoreLikeThis(indexSearcher.IndexReader);
        similarity.Analyzer = standardAnalyzer;
        similarity.SetFieldNames(comparedFields);
        similarity.MinDocFreq = 1;
        similarity.MinTermFreq = 1;
        similarity.Boost = true;

        return similarity.Like(seedDocNumber);
    }
}
/// <summary>
/// Finds posts similar to the given post using a MoreLikeThis query over the title,
/// body and tag fields. The search runs under the reader lock.
/// </summary>
/// <param name="postid">Id of the post to find matches for; it is excluded from the result.</param>
/// <param name="itemsToReturn">Maximum number of similar posts to return.</param>
/// <returns>
/// Up to <paramref name="itemsToReturn"/> similar posts. Empty when the id or the count is
/// not positive, when the post is not indexed, or when the search fails (the failure is traced).
/// </returns>
public List<Post> Similar(int postid, int itemsToReturn)
{
    var list = new List<Post>();
    if (postid <= 0)
    {
        return list;
    }

    EnsureIndexExists();
    var query = GetIdSearchQuery(postid);

    if (itemsToReturn <= 0)
    {
        // Nothing was asked for; skip the lock and the index access.
        return list;
    }

    lck.AcquireReaderLock(ReaderTimeOut);
    try
    {
        using (var searcher = new IndexSearcher(rd))
        {
            // Get the original document; only the first hit is needed.
            TopDocs hits = searcher.Search(query, 1);
            if (hits == null || hits.ScoreDocs.Length <= 0)
            {
                return list;
            }

            int docNum = hits.ScoreDocs[0].Doc;

            // Use the searcher's own reader so docNum refers to the same index snapshot
            // that MoreLikeThis inspects.
            var mlt = new MoreLikeThis(searcher.IndexReader);
            mlt.Analyzer = _analyzer;
            mlt.SetFieldNames(new[] { SearchFields.Title, SearchFields.Body, SearchFields.Tag });
            mlt.MinDocFreq = 5;
            mlt.MinTermFreq = 2;
            mlt.Boost = true;
            var moreResultsQuery = mlt.Like(docNum);

            // Ask for one extra hit because the post itself is filtered out below.
            TopDocs similarhits = searcher.Search(moreResultsQuery, itemsToReturn + 1);
            foreach (ScoreDoc scoreDoc in similarhits.ScoreDocs)
            {
                Document doc = searcher.Doc(scoreDoc.Doc);
                var post = CreatePostFromDocument(doc, null);
                if (postid != post.Id)
                {
                    list.Add(post);
                }

                if (list.Count >= itemsToReturn)
                {
                    break;
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: a failed lookup still returns what was collected so far, but the
        // failure is no longer swallowed silently.
        System.Diagnostics.Trace.TraceError("Similar(" + postid + ") failed: " + ex);
    }
    finally
    {
        lck.ReleaseReaderLock();
    }

    return list;
}
/// <summary>
/// Finds published pages similar to the page stored under <paramref name="key"/>,
/// compared on the "title", "summary", "content" and "tags" fields.
/// </summary>
/// <param name="key">Value of the "key" field of the page to find matches for; that page is excluded.</param>
/// <param name="resultOffset">Index of the first hit to include in the result.</param>
/// <param name="resultLength">Maximum number of hits to include, starting at <paramref name="resultOffset"/>.</param>
/// <param name="matchCategory">
/// When true and the source page has a non-empty "category" field, only pages with the
/// same (lower-cased) category are returned.
/// </param>
/// <returns>
/// The requested page of hits together with the number of hits found and the time taken;
/// an empty <c>SearchResult</c> when no page has the given key.
/// </returns>
public SearchResult FindSimular(string key, int resultOffset, int resultLength, bool matchCategory)
{
    var pageQuery = new TermQuery(new Term("key", key));
    var topDocs = _searcher.Search(pageQuery, 1);
    if (topDocs.TotalHits == 0)
    {
        return new SearchResult();
    }

    var doc = topDocs.ScoreDocs[0].Doc;

    var moreLikeThis = new MoreLikeThis(_reader)
    {
        Analyzer = _analyzer,
        MinWordLen = 3
    };
    moreLikeThis.SetFieldNames(new[] { "title", "summary", "content", "tags" });
    moreLikeThis.SetStopWords(StopWords.DefaultEnglish);
    moreLikeThis.MinDocFreq = 2;
    var query = moreLikeThis.Like(doc);

    // A stopwatch is monotonic, unlike differences of wall-clock timestamps.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

    // Only pages whose publish window contains the current UTC time are eligible.
    var ticks = DateTime.UtcNow.Ticks;
    Query publishStartQuery = NumericRangeQuery.NewLongRange("publishStart", null, ticks, true, false);
    Query publishStopQuery = NumericRangeQuery.NewLongRange("publishStop", ticks, null, false, true);

    var booleanQuery = new BooleanQuery
    {
        { query, Occur.MUST },
        { pageQuery, Occur.MUST_NOT },
        { publishStartQuery, Occur.MUST },
        { publishStopQuery, Occur.MUST }
    };

    if (matchCategory)
    {
        var sourceDocument = _searcher.Doc(doc);
        var field = sourceDocument.GetField("category");
        if (field != null && !string.IsNullOrEmpty(field.StringValue))
        {
            var categoryQuery = new TermQuery(new Term("category", field.StringValue.ToLowerInvariant()));
            booleanQuery.Add(categoryQuery, Occur.MUST);
        }
    }

    var scoreDocs = _searcher.Search(booleanQuery, null, MaxHits, Sort.RELEVANCE).ScoreDocs;
    var result = new SearchResult { NumberOfHits = scoreDocs.Length };

    if (resultOffset < scoreDocs.Length)
    {
        // Clamp the requested window to the hits that actually exist.
        var resultUpperOffset = resultOffset + resultLength;
        if (resultUpperOffset > scoreDocs.Length)
        {
            resultUpperOffset = scoreDocs.Length;
        }

        for (int i = resultOffset; i < resultUpperOffset; i++)
        {
            Document document = _searcher.Doc(scoreDocs[i].Doc);

            Guid pageId;
            (document.Get("pageId") ?? string.Empty).TryParseGuid(out pageId);

            var hit = new SearchHit
            {
                PageId = pageId,
                Path = document.Get("path"),
                Title = document.Get("title"),
                Excerpt = document.Get("summary")
            };
            result.Hits.Add(hit);
        }
    }

    stopwatch.Stop();
    result.SecondsTaken = stopwatch.Elapsed.TotalSeconds;
    return result;
}
/// <summary>
/// Finds indexed posts similar to the given post, with optional filtering and sorting.
/// </summary>
/// <param name="postId">Id of the post to find matches for; it is excluded from the result.</param>
/// <param name="filterByCategory">When set, only documents whose "Categories" field contains this value match.</param>
/// <param name="languageId">When greater than -1, only documents with this "LanguageId" match.</param>
/// <param name="postType">When set, only documents with this "PostType" match.</param>
/// <param name="searchPlace">Which of the Title, Description, Keywords and Tags fields are compared.</param>
/// <param name="maxResult">Maximum number of hits requested from the index.</param>
/// <param name="orderBy">Sort order of the hits; score order unless one of the handled sort types is given.</param>
/// <returns>
/// The matching documents (distinct by document id) and the elapsed time. When an exception
/// occurs it is stored in <c>Error</c> and <c>HasError</c> is set instead of being thrown.
/// </returns>
public virtual SearchResult MoreLikeThis(int postId, int? filterByCategory = null, int languageId = -1, PostType? postType = null, SearchPlace searchPlace = SearchPlace.Title | SearchPlace.Description, int maxResult = 5, SearchResultSortType orderBy = SearchResultSortType.Score)
{
    var result = new SearchResult();
    var watch = System.Diagnostics.Stopwatch.StartNew();

    try
    {
        using (var directory = FSDirectory.Open(new DirectoryInfo(_indexFilesPath)))
        using (var searcher = new IndexSearcher(directory, readOnly: true))
        {
            var docNumber = GetLuceneDocNumber(postId, searcher);
            if (docNumber == -1)
            {
                return result;
            }

            // Fields whose terms feed the similarity query.
            var searchInFields = new List<string>();
            if (searchPlace == SearchPlace.Anywhere)
            {
                searchInFields.AddRange(new string[] { "Title", "Description", "Keywords", "Tags" });
            }
            else
            {
                if (searchPlace.HasFlagFast(SearchPlace.Title))
                {
                    searchInFields.Add("Title");
                }

                if (searchPlace.HasFlagFast(SearchPlace.Description))
                {
                    searchInFields.Add("Description");
                }

                if (searchPlace.HasFlagFast(SearchPlace.Keywords))
                {
                    searchInFields.Add("Keywords");
                }

                if (searchPlace.HasFlagFast(SearchPlace.Tags))
                {
                    searchInFields.Add("Tags");
                }
            }

            var analyzer = new StandardAnalyzer(Version);
            try
            {
                var moreLikeThis = new MoreLikeThis(searcher.IndexReader) { Analyzer = analyzer };
                moreLikeThis.SetFieldNames(searchInFields.ToArray());
                moreLikeThis.MinDocFreq = 1;
                moreLikeThis.MinTermFreq = 1;
                moreLikeThis.Boost = true;
                var query = moreLikeThis.Like(docNumber);

                // The source post itself is always excluded; the other clauses are optional.
                var filter = new BooleanFilter();
                filter.Add(new FilterClause(
                               new QueryWrapperFilter(new TermQuery(new Term("ID", postId.ToString()))),
                               Occur.MUST_NOT));

                if (languageId > -1)
                {
                    filter.Add(new FilterClause(
                                   new QueryWrapperFilter(new TermQuery(new Term("LanguageId", languageId.ToString()))),
                                   Occur.MUST));
                }

                if (filterByCategory != null)
                {
                    filter.Add(new FilterClause(
                                   new QueryWrapperFilter(new TermQuery(new Term("Categories", filterByCategory.Value.ToString()))),
                                   Occur.MUST));
                }

                if (postType != null)
                {
                    filter.Add(new FilterClause(
                                   new QueryWrapperFilter(new TermQuery(new Term("PostType", postType.Value.ToString()))),
                                   Occur.MUST));
                }

                Sort sort = new Sort(SortField.FIELD_SCORE);
                switch (orderBy)
                {
                    case SearchResultSortType.NumberOfVisits:
                        sort = new Sort(new SortField("NumberOfVisit", SortField.INT, true));
                        break;

                    case SearchResultSortType.PublishDate:
                        sort = new Sort(new SortField("PublishDate", SortField.LONG, true));
                        break;

                    case SearchResultSortType.LastUpDate:
                        sort = new Sort(new SortField("LastUpDate", SortField.LONG, true));
                        break;
                }

                var hits = searcher.Search(query, filter, maxResult, sort).ScoreDocs;
                foreach (var scoreDoc in hits)
                {
                    var doc = searcher.Doc(scoreDoc.Doc);
                    result.Documents.Add(new SearchResultDocument()
                    {
                        DocumentId = int.Parse(doc.Get("ID")),
                        LanguageId = int.Parse(doc.Get("LanguageId")),
                        LanguageIsoCode = doc.Get("LanguageCode"),
                        Score = scoreDoc.Score,
                        DocumentTitle = doc.Get("Title"),
                        DocumentBody = doc.Get("Description"),
                        DocumentKeywords = doc.Get("Keywords"),
                        DocumentTags = doc.Get("Tags"),
                    });
                }

                result.Documents = result.Documents.DistinctBy(p => new { p.DocumentId })
                                   .ToList();
            }
            finally
            {
                // Close the analyzer even when building or running the query throws.
                analyzer.Close();
            }
        }
    }
    catch (Exception ex)
    {
        result.Error = ex;
        result.HasError = true;
    }
    finally
    {
        // Record the elapsed time on every exit path, including the early return above.
        watch.Stop();
        result.ElapsedMilliseconds = watch.ElapsedMilliseconds;
    }

    return result;
}