/// <summary>
/// Finds content related to an entry by running a MoreLikeThis query over the
/// Title, Body and Tags fields of the entry's indexed document.
/// </summary>
/// <param name="entryId">Id of the entry whose related content is wanted.</param>
/// <param name="max">Hit limit for the id lookup; the similarity query is run with <c>max + 1</c>.</param>
/// <param name="blogId">Passed through to <c>PerformQuery</c>.</param>
/// <returns>An empty list when the entry is not in the index; otherwise whatever <c>PerformQuery</c> returns.</returns>
public IEnumerable<SearchEngineResult> RelatedContents(int entryId, int max, int blogId)
{
    var results = new List<SearchEngineResult>();

    // Step 1: locate the indexed document for the source entry.
    Query idQuery = GetIdSearchQuery(entryId);
    TopDocs idHits = Searcher.Search(idQuery, max);
    if (idHits.scoreDocs.Length <= 0)
    {
        return results;
    }

    int sourceDocNum = idHits.scoreDocs[0].doc;

    // Step 2: configure MoreLikeThis from the index settings.
    var indexReader = DoWriterAction(w => w.GetReader());
    var similarity = new MoreLikeThis(indexReader);
    similarity.SetAnalyzer(_analyzer);
    similarity.SetFieldNames(new[] { Title, Body, Tags });
    similarity.SetMinDocFreq(_settings.Parameters.MinimumDocumentFrequency);
    similarity.SetMinTermFreq(_settings.Parameters.MinimumTermFrequency);
    similarity.SetBoost(_settings.Parameters.MoreLikeThisBoost);

    // Step 3: build the similarity query and hand it to the shared query runner.
    var similarQuery = similarity.Like(sourceDocNum);
    return PerformQuery(results, similarQuery, max + 1, blogId, entryId);
}
/// <summary>
/// Finds content related to an entry by running a MoreLikeThis query over the
/// Categories, Silouhettes and Tags fields of the entry's indexed document.
/// </summary>
/// <param name="entryId">Id of the entry whose related content is wanted.</param>
/// <param name="max">Hit limit for the id lookup; the similarity query is run with <c>max + 1</c>.</param>
/// <param name="blogId">Passed through to <c>PerformQuery</c>.</param>
/// <returns>
/// A response with an empty result list when the entry is not in the index;
/// otherwise whatever <c>PerformQuery</c> returns.
/// </returns>
public SearchEngineResponse RelatedContents(int entryId, int max, int blogId)
{
    var results = new List<SearchEngineResult>();

    // Locate the indexed document for the source entry.
    Query idQuery = GetIdSearchQuery(entryId);
    TopDocs idHits = Searcher.Search(idQuery, max);
    if (idHits.scoreDocs.Length <= 0)
    {
        var emptyResponse = new SearchEngineResponse();
        emptyResponse.TotalCount = idHits.totalHits;
        emptyResponse.Results = results;
        return emptyResponse;
    }

    int sourceDocNum = idHits.scoreDocs[0].doc;

    // Configure MoreLikeThis against the writer's current reader.
    var indexReader = DoWriterAction(w => w.GetReader());
    var similarity = new MoreLikeThis(indexReader);
    similarity.SetAnalyzer(_analyzer);
    similarity.SetFieldNames(new[] { Categories, Silouhettes, Tags });

    var similarQuery = similarity.Like(sourceDocNum);
    return PerformQuery(results, similarQuery, 0, max + 1, blogId, entryId);
}
/// <summary>
/// On the initial (non-postback) request, binds the results repeater either to
/// items similar to the one named by the "relatedto" request value or to the
/// results of the "query" request value. When both are supplied, the "query"
/// results replace the similar-item results.
/// </summary>
protected void Page_Load(object sender, EventArgs e)
{
    if (Page.IsPostBack)
    {
        return;
    }

    List<SearchResult> results = new List<SearchResult>();

    string relatedTo = Request["relatedto"];
    if (!string.IsNullOrEmpty(relatedTo))
    {
        // Index name is derived from the current context database.
        string indexName = string.Format("sitecore_{0}_index", Sitecore.Context.Database.Name);
        var index = (LuceneIndex)ContentSearchManager.GetIndex(indexName);

        var reader = index.CreateReader(LuceneIndexAccess.ReadOnly);
        var moreLikeThis = new MoreLikeThis(reader);
        CreateMLTQuery(moreLikeThis);

        var searcher = (IndexSearcher)index.CreateSearcher(LuceneIndexAccess.ReadOnly);
        int docId = GetDocumentId(relatedTo, searcher);

        // NOTE(review): this value is handed to ShowSimilarResults as its last argument;
        // confirm whether it is meant as a hit count or a "should match" threshold.
        int minimumNumberShouldMatch = 5;
        results = ShowSimilarResults(searcher, moreLikeThis, docId, minimumNumberShouldMatch);

        // Alternative kept from the original, using MoreLikeThisQuery:
        // string description = SelectedItem["Description"];
        // results = ShowSimilarResultsUsingMLTQuery(searcher, description, new string[] { "title", "description" }, MinimumNumberShouldMatch);
    }

    string queryText = Request["query"];
    if (!string.IsNullOrEmpty(queryText))
    {
        results = SearchResults(queryText);
    }

    repeaterResults.DataSource = results;
    repeaterResults.DataBind();
}
/// <summary>
/// Returns documents similar to the configured document, compared on the
/// configured fields, limited to <c>maxNo</c> hits from the similarity search.
/// Assumes the index is up to date.
/// </summary>
/// <returns>
/// The similar documents found; empty when the instance is not initialised
/// or when the document lookup yields 0.
/// </returns>
public IEnumerable<SearchResultItem> FindMoreLikeThis()
{
    var similarItems = new List<SearchResultItem>();

    if (!IsInit())
    {
        return similarItems;
    }

    var similarity = new MoreLikeThis(reader);
    similarity.SetFieldNames(fieldsToSearch.ToArray());
    similarity.SetMinTermFreq(1);
    similarity.SetMinDocFreq(1);

    // NOTE(review): 0 is treated as "not found" here — confirm GetLuceneDocNo never
    // returns 0 for a real document.
    int sourceDocNo = GetLuceneDocNo(docId);
    if (sourceDocNo == 0)
    {
        return similarItems;
    }

    var similarQuery = similarity.Like(sourceDocNo);
    var topDocs = searcher.Search(similarQuery, maxNo);
    int hitCount = topDocs.ScoreDocs.Length;

    // Index 0 is skipped on the assumption that the top hit is the source document itself.
    for (int hitIndex = 1; hitIndex < hitCount; hitIndex++)
    {
        var document = reader.Document(topDocs.ScoreDocs[hitIndex].doc);

        var item = new SearchResultItem();
        item.PageName = document.GetField("nodeName").StringValue();
        item.NodeId = int.Parse(document.GetField("__NodeId").StringValue());
        similarItems.Add(item);
    }

    return similarItems;
}
/// <summary>
/// Verifies that setting <c>BoostFactor</c> multiplies every term boost of the
/// generated query by that factor, compared against the un-factored boosts
/// supplied by <c>OriginalValues</c>.
/// </summary>
public void TestBoostFactor()
{
    IDictionary<string, float?> originalValues = OriginalValues;

    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.Analyzer = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false);
    mlt.MinDocFreq = 1;
    mlt.MinTermFreq = 1;
    mlt.MinWordLen = 1;
    mlt.FieldNames = new[] { "text" };
    mlt.Boost = true;

    // Every term boost is expected to be multiplied by this number.
    float boostFactor = 5;
    mlt.BoostFactor = boostFactor;

    BooleanQuery query = (BooleanQuery)mlt.Like(new StringReader("lucene release"), "text");
    IList<BooleanClause> clauses = query.Clauses;

    assertEquals("Expected " + originalValues.Count + " clauses.", originalValues.Count, clauses.Count);

    foreach (BooleanClause clause in clauses)
    {
        TermQuery tq = (TermQuery)clause.Query;
        string termText = tq.Term.Text();

        // TryGetValue leaves termBoost null for an unknown term, so the assertion below
        // reports it; the dictionary indexer would throw KeyNotFoundException first.
        float? termBoost;
        originalValues.TryGetValue(termText, out termBoost);
        assertNotNull("Expected term " + termText, termBoost);

        float totalBoost = (float)(termBoost * boostFactor);
        assertEquals("Expected boost of " + totalBoost + " for term '" + termText + "' got " + tq.Boost, totalBoost, tq.Boost, 0.0001);
    }
}
/// <summary>
/// Returns articles similar to the given article, compared on the Name, Body,
/// TagName and FeedName fields.
/// </summary>
/// <param name="articleId">Id of the article to find related articles for.</param>
/// <param name="count">Count passed to <c>ConvertToArticles</c>; the similarity search asks for <c>count + 1</c> hits.</param>
/// <returns>The related articles, never containing <paramref name="articleId"/>; empty when the article is not indexed.</returns>
public List<Article> GetRelatedArticles(int articleId, int count)
{
    // One reader backs both the searcher and MoreLikeThis so the document number found by
    // the id lookup refers to the same index snapshot used to build the similarity query.
    // The using blocks also release both on the early return and on exceptions.
    using (var reader = IndexReader.Open(_directory, true))
    using (var searcher = new IndexSearcher(reader))
    {
        var searchQuery = new TermQuery(new Term("Id", articleId.ToString()));
        var doc = searcher.Search(searchQuery, 1);
        if (doc.TotalHits == 0)
        {
            return new List<Article>();
        }

        var docId = doc.ScoreDocs[0].Doc;

        MoreLikeThis mlt = new MoreLikeThis(reader);
        mlt.SetFieldNames(new[] { "Name", "Body", "TagName", "FeedName" });
        Query query = mlt.Like(docId);

        // One extra hit is requested because the source article normally matches itself.
        var hits = searcher.Search(query, count + 1);

        // Materialise before the searcher is disposed: ConvertToArticles may read
        // documents lazily through the searcher.
        return ConvertToArticles(hits, searcher, 1, count)
            .Where(a => a.Id != articleId)
            .ToList();
    }
}
/// <summary>
/// Builds a MoreLikeThis query for the indexed document that represents a project,
/// comparing on the ProductId, Title, Description, Price, ProductStatus and Category
/// properties of <c>LuceneSearchModel</c>.
/// </summary>
/// <param name="projectId">Id of the project to build the query for.</param>
/// <returns>The similarity query, or <c>null</c> when the document lookup yields 0.</returns>
private static Query CreateMoreProjectsLikeThisQuery(int projectId)
{
    int sourceDocNum = GetLuceneProjectDocumentNumber(projectId);
    if (sourceDocNum == 0)
    {
        return null;
    }

    var standardAnalyzer = new StandardAnalyzer(_version);

    using (var indexSearcher = new IndexSearcher(_directory, false))
    {
        var similarity = new MoreLikeThis(indexSearcher.IndexReader);
        similarity.Analyzer = standardAnalyzer;

        string[] comparedFields =
        {
            StronglyTyped.PropertyName<LuceneSearchModel>(x => x.ProductId),
            StronglyTyped.PropertyName<LuceneSearchModel>(x => x.Title),
            StronglyTyped.PropertyName<LuceneSearchModel>(x => x.Description),
            StronglyTyped.PropertyName<LuceneSearchModel>(x => x.Price),
            StronglyTyped.PropertyName<LuceneSearchModel>(x => x.ProductStatus),
            StronglyTyped.PropertyName<LuceneSearchModel>(x => x.Category)
        };
        similarity.SetFieldNames(comparedFields);

        similarity.MinDocFreq = 1;
        similarity.MinTermFreq = 1;
        similarity.Boost = true;

        // The query is built while the searcher's reader is still open.
        Query similarQuery = similarity.Like(sourceDocNum);
        return similarQuery;
    }
}
/// <summary>
/// Returns up to <paramref name="maxDocs"/> documents similar to the given index
/// document, compared on the Title and Content fields.
/// </summary>
/// <param name="indexName">Not used by this method.</param>
/// <param name="indexDocumentId">Lucene document number of the source document.</param>
/// <param name="maxDocs">Maximum number of hits to collect.</param>
/// <returns>Id and Title of each hit, in collector order.</returns>
public static IList<CorpusDocument> GetMoreLikeThis(string indexName, int indexDocumentId, int maxDocs)
{
    // See: http://lucene.apache.org/java/2_2_0/api/org/apache/lucene/search/similar/MoreLikeThis.html
    var similarity = new MoreLikeThis(Searcher.GetIndexReader());
    similarity.SetAnalyzer(new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29));
    similarity.SetFieldNames(new[] { "Title", "Content" });
    similarity.SetMinWordLen(4); // improve relevancy

    var similarQuery = similarity.Like(indexDocumentId);

    var collector = TopScoreDocCollector.create(maxDocs, true);
    Searcher.Search(similarQuery, collector);
    var scoreDocs = collector.TopDocs().ScoreDocs;

    var documents = new List<CorpusDocument>(maxDocs);
    for (int i = 0; i < scoreDocs.Length; i++)
    {
        var stored = Searcher.Doc(scoreDocs[i].doc);

        var corpusDocument = new CorpusDocument();
        corpusDocument.Id = stored.Get("Id");
        corpusDocument.Title = stored.Get("Title");
        documents.Add(corpusDocument);
    }

    return documents;
}
/// <summary>
/// Verifies that setting <c>BoostFactor</c> multiplies every term boost of the
/// generated query by that factor, compared against the un-factored boosts
/// returned by <c>GetOriginalValues</c>.
/// </summary>
public void TestBoostFactor()
{
    IDictionary<string, float> originalValues = GetOriginalValues();

    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.Analyzer = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false);
    mlt.MinDocFreq = 1;
    mlt.MinTermFreq = 1;
    mlt.MinWordLen = 1;
    mlt.FieldNames = new[] { "text" };
    mlt.ApplyBoost = true;

    // Every term boost is expected to be multiplied by this number.
    float boostFactor = 5;
    mlt.BoostFactor = boostFactor;

    BooleanQuery query = (BooleanQuery)mlt.Like(new StringReader("lucene release"), "text");
    IList<BooleanClause> clauses = query.Clauses;

    assertEquals("Expected " + originalValues.Count + " clauses.", originalValues.Count, clauses.Count);

    foreach (BooleanClause clause in clauses)
    {
        TermQuery tq = (TermQuery)clause.Query;
        string termText = tq.Term.Text;

        // A non-nullable float can never fail assertNotNull, and the indexer would throw
        // KeyNotFoundException first; look the term up explicitly so a missing term is
        // reported through the assertion message.
        float storedBoost;
        float? termBoost = originalValues.TryGetValue(termText, out storedBoost) ? storedBoost : (float?)null;
        assertNotNull("Expected term " + termText, termBoost);

        float totalBoost = termBoost.Value * boostFactor;
        assertEquals("Expected boost of " + totalBoost + " for term '" + termText + "' got " + tq.Boost, totalBoost, tq.Boost, 0.0001);
    }
}
/// <summary>
/// Uses Lucene's MoreLikeThis feature to find items similar to the one passed in.
/// </summary>
/// <param name="item">The item to find similar items for.</param>
/// <param name="pageNo">Page number of the result set.</param>
/// <param name="pageSize">Number of items to return in the result set.</param>
/// <returns>
/// The page produced by <c>Search</c> for the similarity query, or <c>NoResults</c>
/// when the item is not found in the index.
/// </returns>
public IPagedList<T> MoreLikeThis(T item, int pageNo, int pageSize)
{
    using (IndexSearcher indexSearcher = _luceneIndexer.GetSearcher())
    {
        // Look the item up by its primary key to obtain its Lucene document number.
        var primaryKey = _luceneIndexer.GetIdentifier(item);
        var keyQuery = new TermQuery(new Term(_luceneIndexer.PrimaryKeyField, primaryKey));
        var keyHits = indexSearcher.Search(keyQuery, 1);

        if (!keyHits.ScoreDocs.Any())
        {
            return NoResults(pageNo, pageSize);
        }

        // NOTE(review): MaxDocFreq = 0 looks unusual — confirm MinDocFreq was not intended.
        var similarity = new MoreLikeThis(indexSearcher.IndexReader);
        similarity.MaxDocFreq = 0;
        similarity.MinTermFreq = 0;
        //similarity.SetFieldNames(_luceneIndexer.FullTextFields);

        var likeQuery = similarity.Like(keyHits.ScoreDocs[0].Doc);

        var combined = new BooleanQuery();
        combined.Add(likeQuery, Occur.MUST);
        //combined.Add(keyQuery, Occur.MUST_NOT); // Exclude the doc we are basing similar matches on

        return Search(combined, pageNo, pageSize, indexSearcher);
    }
}
/// <summary>
/// Configures a MoreLikeThis instance: standard analyzer, minimum term and document
/// frequency of 1, at most 15 query terms, the "title" and "description" fields,
/// and the English stop-word set.
/// </summary>
/// <param name="query">The MoreLikeThis instance to configure.</param>
private void CreateMLTQuery(MoreLikeThis query)
{
    var standardAnalyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    query.Analyzer = standardAnalyzer;

    // Frequency thresholds and query size.
    query.MinTermFreq = 1;
    query.MinDocFreq = 1;
    query.MaxQueryTerms = 15;

    // Fields compared and words ignored.
    string[] comparedFields = { "title", "description" };
    query.SetFieldNames(comparedFields);
    query.SetStopWords(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
/// <summary>
/// Creates the MoreLikeThis helper for the current execution context (minimum document
/// frequency 2, minimum term frequency 1, standard analyzer, "Text" field) and then
/// runs the base preparation.
/// </summary>
/// <param name="context">Execution context supplying the searcher whose reader is used.</param>
public override void PrepareSearchSettings(IQueryExecutionContext context)
{
    var indexReader = context.Searcher.IndexReader;
    mlt = new MoreLikeThis(indexReader);

    mlt.MinDocFreq = 2;
    mlt.MinTermFreq = 1;
    mlt.Analyzer = new StandardAnalyzer(Version.LUCENE_30);

    string[] comparedFields = { "Text" };
    mlt.SetFieldNames(comparedFields);

    base.PrepareSearchSettings(context);
}
/// <summary>
/// Finds stories related to the given story using a MoreLikeThis query over the
/// "title" and "tags" fields.
/// </summary>
/// <param name="hostId">Host whose index is searched.</param>
/// <param name="storyId">Story to find related stories for; excluded from the results.</param>
/// <returns>
/// The loaded related stories (at most <c>NUMBER_OF_RELATED_STORIES_TO_RETURN</c>),
/// or <c>null</c> when the story has no document id.
/// </returns>
public StoryCollection Find(int hostId, int storyId)
{
    int? sourceDocId = ConvertStoryIdtoDocId(hostId, storyId);
    if (!sourceDocId.HasValue)
    {
        return null;
    }

    IndexSearcher storySearcher = SearchQuery.GetSearcher(hostId);
    IndexReader storyReader = storySearcher.GetIndexReader();

    MoreLikeThis similarity = new MoreLikeThis(storyReader);
    similarity.SetAnalyzer(new DnkAnalyzer());

    // These values control the query used to find related/similar stories:
    //  - only the title and tags fields are compared,
    //  - a term must appear at least once in the source story,
    //  - the query contains at most 5 terms,
    //  - words shorter than 3 characters are ignored,
    //  - a term must appear in at least 4 documents.
    similarity.SetFieldNames(new string[] { "title", "tags" });
    similarity.SetMinTermFreq(1);
    similarity.SetMaxQueryTerms(5);
    similarity.SetMinWordLen(3);
    similarity.SetMinDocFreq(4);
    similarity.SetStopWords(StopWords());
    similarity.SetBoost(true);

    Query similarQuery = similarity.Like(sourceDocId.Value);
    Hits similarHits = storySearcher.Search(similarQuery);

    List<int> relatedStoryIds = new List<int>();
    for (int hitIndex = 0; hitIndex < similarHits.Length(); hitIndex++)
    {
        Document hitDocument = similarHits.Doc(hitIndex);
        int hitStoryId = int.Parse(hitDocument.GetField("id").StringValue());

        // The source story matches itself; leave it out.
        if (hitStoryId == storyId)
        {
            continue;
        }

        relatedStoryIds.Add(hitStoryId);
        if (relatedStoryIds.Count == NUMBER_OF_RELATED_STORIES_TO_RETURN)
        {
            break;
        }
    }

    return SearchQuery.LoadStorySearchResults(relatedStoryIds);
}
/// <summary>
/// Smoke test: builds a query for the "foobar" field while two field names are
/// configured. The test makes no assertions; it passes when <c>Like</c> does not throw.
/// </summary>
public void TestMultiFields()
{
    var whitespaceAnalyzer = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false);

    MoreLikeThis similarity = new MoreLikeThis(reader)
    {
        Analyzer = whitespaceAnalyzer,
        MinDocFreq = 1,
        MinTermFreq = 1,
        MinWordLen = 1,
        FieldNames = new[] { "text", "foobar" }
    };

    var input = new StringReader("this is a test");
    similarity.Like(input, "foobar");
}
/// <summary>
/// Searches the indexed documents for those most similar to the text
/// <paramref name="original"/> using a MoreLikeThis (TF-IDF based) query.
/// </summary>
/// <param name="original">The text to compare the indexed documents against.</param>
/// <param name="stopWords">Words MoreLikeThis should ignore.</param>
/// <param name="analyzer">Analyzer used to tokenize <paramref name="original"/>.</param>
/// <returns>The stored documents of the best hits, at most 25.</returns>
public static List<Document> moreLikeThisAnalyzer(String original, ISet<string> stopWords, Lucene.Net.Analysis.Analyzer analyzer)
{
    Trace.WriteLine("Realizando la Búsqueda");
    List<Document> DocumenResult = new List<Document>();

    // The reader and searcher are disposed once the hits have been loaded; the original
    // never closed them, leaking index file handles on every call.
    using (IndexReader indexReader = IndexReader.Open(_directory, true))
    using (IndexSearcher indexSearcher = new IndexSearcher(indexReader))
    {
        MoreLikeThis mlt = new MoreLikeThis(indexReader);
        mlt.SetFieldNames(DEFAULT_FIELD_NAMES);
        mlt.MinDocFreq = DEFALT_MIN_DOC_FREQ;
        mlt.MinTermFreq = DEFAULT_MIN_TERM_FREQ;
        mlt.MaxQueryTerms = MAX_QUERY_TERMS;
        mlt.MinWordLen = DEFAULT_MIN_WORD_LENGTH;
        mlt.Analyzer = analyzer;
        mlt.SetStopWords(stopWords);

        Query query = mlt.Like(new System.IO.StringReader(original));

        int topCount = DEFAULT_DOCUMENT_TO_SEARCH;
        TopScoreDocCollector collector = TopScoreDocCollector.Create(topCount, true);
        indexSearcher.Search(query, collector);
        ScoreDoc[] hits = collector.TopDocs().ScoreDocs;

        int len = hits.Length;
        Trace.WriteLine("Entering");
        Trace.WriteLine("-------------------------------------------");
        Trace.WriteLine("original :" + original);
        Trace.WriteLine("query: " + query);
        Trace.WriteLine("found: " + len + " documents");

        for (int i = 0; i < Math.Min(25, len); i++)
        {
            int d = hits[i].Doc;
            Trace.WriteLine("score : " + hits[i].Score);
            Trace.WriteLine("name : " + d.ToString());

            // Load the stored document while the searcher is still open and add it to the results.
            Document doc = indexSearcher.Doc(d);
            DocumenResult.Add(doc);
        }
    }

    Trace.WriteLine("-------------------------------------------");
    Trace.WriteLine("Exiting");
    return DocumenResult;
}
/// <summary>
/// Builds a MoreLikeThis query from the text of the given job ads: for each ad its
/// title, bullet points (when present) and description content, one per line.
/// </summary>
/// <param name="mlt">Configured MoreLikeThis instance used to build the query.</param>
/// <param name="jobAdIds">Ids of the job ads whose text feeds the query.</param>
/// <returns>The query produced by <c>mlt.like</c> for the concatenated text.</returns>
private LuceneQuery GetApplicationsMltQuery(MoreLikeThis mlt, IEnumerable<Guid> jobAdIds)
{
    var jobAds = _jobAdsQuery.GetJobAds<JobAd>(jobAdIds);
    var analysisText = new StringBuilder();

    foreach (var jobAd in jobAds)
    {
        analysisText.AppendLine(jobAd.Title);

        // Missing bullet points still contribute an empty line.
        string bulletPoints;
        if (jobAd.Description.BulletPoints == null)
        {
            bulletPoints = string.Empty;
        }
        else
        {
            bulletPoints = jobAd.Description.BulletPoints.ToString();
        }
        analysisText.AppendLine(bulletPoints);

        analysisText.AppendLine(jobAd.Description.Content);
    }

    var textReader = new StringReader(analysisText.ToString());
    return mlt.like(textReader);
}
/// <summary>
/// Rewrites this query into a BooleanQuery built by MoreLikeThis from <c>LikeText</c>,
/// requiring a fraction (<c>PercentTermsToMatch</c>) of the generated clauses to match.
/// </summary>
/// <param name="reader">Index reader MoreLikeThis uses to build the query.</param>
/// <returns>The generated boolean query.</returns>
public override Query Rewrite(IndexReader reader)
{
    var similarity = new MoreLikeThis(reader);
    similarity.FieldNames = MoreLikeFields;
    similarity.Analyzer = Analyzer;
    similarity.MinTermFreq = MinTermFrequency;

    // A negative MinDocFreq leaves the MoreLikeThis default in place.
    if (MinDocFreq >= 0)
    {
        similarity.MinDocFreq = MinDocFreq;
    }

    similarity.MaxQueryTerms = MaxQueryTerms;
    similarity.StopWords = StopWords;

    var likeInput = new StringReader(LikeText);
    var booleanQuery = (BooleanQuery)similarity.Like(likeInput, fieldName);

    // Require PercentTermsToMatch of the generated clauses to match (truncated to an int).
    var generatedClauses = booleanQuery.Clauses;
    booleanQuery.MinimumNumberShouldMatch = (int)(generatedClauses.Length * PercentTermsToMatch);

    return booleanQuery;
}
/// <summary>
/// Runs the MoreLikeThis query for a document and converts the top hits into
/// <c>SearchResult</c> objects built from the "title", "description" and "_group" fields.
/// </summary>
/// <param name="searcher">Searcher used to run the query and load the hit documents.</param>
/// <param name="mlt">Configured MoreLikeThis instance.</param>
/// <param name="docId">Lucene document number of the source document.</param>
/// <param name="topHits">Maximum number of hits to return.</param>
/// <returns>One result per hit, in hit order.</returns>
private List<SearchResult> ShowSimilarResults(IndexSearcher searcher, MoreLikeThis mlt, int docId, int topHits)
{
    BooleanQuery similarQuery = (BooleanQuery)mlt.Like(docId);
    TopDocs topDocs = searcher.Search(similarQuery, topHits);
    ScoreDoc[] hits = topDocs.ScoreDocs;

    List<SearchResult> similarResults = new List<SearchResult>();
    for (int i = 0; i < hits.Length; i++)
    {
        Document hitDocument = searcher.Doc(hits[i].Doc);

        string title = hitDocument.Get("title");
        string description = hitDocument.Get("description");
        string group = hitDocument.Get("_group");

        similarResults.Add(new SearchResult(title, description, group));
    }

    return similarResults;
}
/// <summary>
/// Builds a MoreLikeThis query from a candidate's resume text: the description and
/// title of the first five jobs, then the desired job title, resume summary and skills.
/// </summary>
/// <param name="mlt">Configured MoreLikeThis instance used to build the query.</param>
/// <param name="candidate">Candidate supplying the desired job title.</param>
/// <param name="resume">Resume supplying jobs, summary and skills.</param>
/// <param name="method">Caller name used in trace events.</param>
/// <returns>The query, or <c>null</c> when fewer than 1000 characters of text were gathered.</returns>
private static LuceneQuery GetCandidateMltQuery(MoreLikeThis mlt, ICandidate candidate, Resume resume, string method)
{
    var analysisText = new StringBuilder();

    // Start with the first five jobs listed on the resume.
    if (resume.Jobs != null)
    {
        foreach (var job in resume.Jobs.Take(5))
        {
            analysisText.AppendLine(job.Description);
            analysisText.AppendLine(job.Title);
        }
    }

    // Trace the job text before the remaining resume data is appended.
    if (EventSource.IsEnabled(Event.Trace))
    {
        EventSource.Raise(Event.Trace, method, "Building MLT Query #1", Event.Arg("Analysis Text", analysisText.ToString()));
    }

    // Add the remaining relevant resume data.
    analysisText.AppendLine(candidate.DesiredJobTitle);
    analysisText.AppendLine(resume.Summary);
    analysisText.AppendLine(resume.Skills);

    // Too little text: give up rather than build a query.
    bool hasEnoughText = analysisText.Length >= 1000;
    if (!hasEnoughText)
    {
        if (EventSource.IsEnabled(Event.Trace))
        {
            EventSource.Raise(Event.Trace, method, "MLT Query aborted - insufficient text for analysis", Event.Arg("Analysis Text", analysisText.ToString()));
        }

        return null;
    }

    if (EventSource.IsEnabled(Event.Trace))
    {
        EventSource.Raise(Event.Trace, method, "Building MLT Query #2", Event.Arg("Analysis Text", analysisText.ToString()));
    }

    var textReader = new StringReader(analysisText.ToString());
    return mlt.like(textReader);
}
/// <summary>Train the classifier using the underlying Lucene index.</summary>
/// <param name="atomicReader">The reader to use to access the Lucene index.</param>
/// <param name="textFieldNames">The names of the fields to be used to compare documents.</param>
/// <param name="classFieldName">The name of the field containing the class assigned to documents.</param>
/// <param name="analyzer">The analyzer used to tokenize / filter the unseen text.</param>
/// <param name="query">The query to filter which documents to use for training.</param>
public virtual void Train(AtomicReader atomicReader, string[] textFieldNames, string classFieldName, Analyzer analyzer, Query query)
{
    // Remember the field configuration and the training filter.
    _textFieldNames = textFieldNames;
    _classFieldName = classFieldName;
    _query = query;

    // The searcher and MoreLikeThis both work against the supplied reader.
    _indexSearcher = new IndexSearcher(atomicReader);
    _mlt = new MoreLikeThis(atomicReader);
    _mlt.Analyzer = analyzer;
    _mlt.FieldNames = _textFieldNames;

    // Frequency thresholds are only overridden when a positive value was configured.
    if (_minDocsFreq > 0)
    {
        _mlt.MinDocFreq = _minDocsFreq;
    }

    if (_minTermFreq > 0)
    {
        _mlt.MinTermFreq = _minTermFreq;
    }
}
/// <summary>Train the classifier using the underlying Lucene index.</summary>
/// <param name="atomicReader">The reader to use to access the Lucene index.</param>
/// <param name="textFieldNames">The names of the fields to be used to compare documents.</param>
/// <param name="classFieldName">The name of the field containing the class assigned to documents.</param>
/// <param name="analyzer">The analyzer used to tokenize / filter the unseen text.</param>
/// <param name="query">The query to filter which documents to use for training.</param>
public virtual void Train(AtomicReader atomicReader, string[] textFieldNames, string classFieldName, Analyzer analyzer, Query query)
{
    // Remember the field configuration and the training filter.
    this.textFieldNames = textFieldNames;
    this.classFieldName = classFieldName;
    this.query = query;

    // The searcher and MoreLikeThis both work against the supplied reader.
    this.indexSearcher = new IndexSearcher(atomicReader);
    this.mlt = new MoreLikeThis(atomicReader);
    this.mlt.Analyzer = analyzer;
    this.mlt.FieldNames = this.textFieldNames;

    // Frequency thresholds are only overridden when a positive value was configured.
    if (this.minDocsFreq > 0)
    {
        this.mlt.MinDocFreq = this.minDocsFreq;
    }

    if (this.minTermFreq > 0)
    {
        this.mlt.MinTermFreq = this.minTermFreq;
    }
}
/// <summary>
/// Finds sources whose "FileData" field is similar to that of the given source.
/// The source itself is excluded from the results.
/// </summary>
/// <param name="sourceId">Value of the "Id" field of the source to compare against.</param>
/// <param name="numResults">Maximum number of similar sources to return.</param>
/// <returns>The similar sources with their scores; empty when the source is not indexed.</returns>
public IList<LuceneSearchResult> GetSourcesLikeThis(int sourceId, int numResults)
{
    IList<LuceneSearchResult> similarSources = new List<LuceneSearchResult>();

    using (SearcherManager manager = new SearcherManager(SourceIndexWriterSingleton.Instance))
    {
        this.searcher = manager.Acquire().Searcher;

        // First search: locate the source document by its numeric Id.
        Query idQuery = NumericRangeQuery.NewIntRange("Id", sourceId, sourceId, true, true);
        this.topDocs = this.searcher.Search(idQuery, null, 1);

        bool sourceFound = this.topDocs != null
            && this.topDocs.ScoreDocs != null
            && this.topDocs.ScoreDocs.Length > 0;
        if (!sourceFound)
        {
            return similarSources;
        }

        // Second search: MoreLikeThis query built against a separately opened reader.
        using (IndexReader reader = IndexReader.Open(SourceIndexWriterSingleton.Directory, true))
        {
            MoreLikeThis similarity = new MoreLikeThis(reader);
            similarity.SetFieldNames(new string[] { "FileData" });
            similarity.MinTermFreq = 1;
            similarity.MinDocFreq = 1;

            // Must be similar, must not be the source document itself.
            BooleanQuery combined = new BooleanQuery();
            combined.Add(similarity.Like(this.topDocs.ScoreDocs[0].Doc), Occur.MUST);
            combined.Add(idQuery, Occur.MUST_NOT);

            log.Info("More like this query: " + combined.ToString());

            TopDocs similarDocs = this.searcher.Search(combined, numResults);
            if (similarDocs.TotalHits > 0)
            {
                ScoreDoc[] hits = similarDocs.ScoreDocs;
                for (int i = 0; i < hits.Length; i++)
                {
                    ScoreDoc hit = hits[i];
                    similarSources.Add(new LuceneSearchResult(this.searcher.Doc(hit.Doc), hit.Score, similarDocs.TotalHits));
                }
            }
        }
    }

    return similarSources;
}
/// <summary>
/// Copies every parameter that has a value onto the MoreLikeThis instance;
/// parameters without a value leave the corresponding MoreLikeThis setting untouched.
/// </summary>
/// <param name="mlt">The MoreLikeThis instance to configure.</param>
/// <param name="parameters">Optional overrides supplied with the query.</param>
private static void AssignParameters(MoreLikeThis mlt, MoreLikeThisQueryServerSide parameters)
{
    // Boosting.
    if (parameters.Boost.HasValue)
    {
        mlt.Boost = parameters.Boost.Value;
    }

    if (parameters.BoostFactor.HasValue)
    {
        mlt.BoostFactor = parameters.BoostFactor.Value;
    }

    // Query size limits.
    if (parameters.MaximumNumberOfTokensParsed.HasValue)
    {
        mlt.MaxNumTokensParsed = parameters.MaximumNumberOfTokensParsed.Value;
    }

    if (parameters.MaximumQueryTerms.HasValue)
    {
        mlt.MaxQueryTerms = parameters.MaximumQueryTerms.Value;
    }

    // Word length limits.
    if (parameters.MinimumWordLength.HasValue)
    {
        mlt.MinWordLen = parameters.MinimumWordLength.Value;
    }

    if (parameters.MaximumWordLength.HasValue)
    {
        mlt.MaxWordLen = parameters.MaximumWordLength.Value;
    }

    // Term and document frequency limits.
    if (parameters.MinimumTermFrequency.HasValue)
    {
        mlt.MinTermFreq = parameters.MinimumTermFrequency.Value;
    }

    if (parameters.MinimumDocumentFrequency.HasValue)
    {
        mlt.MinDocFreq = parameters.MinimumDocumentFrequency.Value;
    }

    if (parameters.MaximumDocumentFrequency.HasValue)
    {
        mlt.MaxDocFreq = parameters.MaximumDocumentFrequency.Value;
    }

    if (parameters.MaximumDocumentFrequencyPercentage.HasValue)
    {
        mlt.SetMaxDocFreqPct(parameters.MaximumDocumentFrequencyPercentage.Value);
    }
}
/// <summary>
/// Builds the query for "lucene release" on the "text" field with boosting applied
/// (no explicit boost factor set) and records the boost of each generated term.
/// </summary>
/// <returns>Map from term text to the boost of its term query.</returns>
private IDictionary<string, float> GetOriginalValues()
{
    var whitespaceAnalyzer = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false);

    MoreLikeThis similarity = new MoreLikeThis(reader)
    {
        Analyzer = whitespaceAnalyzer,
        MinDocFreq = 1,
        MinTermFreq = 1,
        MinWordLen = 1,
        FieldNames = new[] { "text" },
        ApplyBoost = true
    };

    BooleanQuery likeQuery = (BooleanQuery)similarity.Like(new StringReader("lucene release"), "text");
    IList<BooleanClause> generatedClauses = likeQuery.Clauses;

    IDictionary<string, float> boostsByTerm = new Dictionary<string, float>();
    for (int i = 0; i < generatedClauses.Count; i++)
    {
        TermQuery termQuery = (TermQuery)generatedClauses[i].Query;
        boostsByTerm[termQuery.Term.Text] = termQuery.Boost;
    }

    return boostsByTerm;
}
/// <summary>
/// Runs a sample MoreLikeThis search over the "content" field of the index at
/// <paramref name="path"/> for the text "are the most well known" and loads the
/// top five hit documents. Nothing is returned or printed.
/// </summary>
/// <param name="path">Directory containing the Lucene index.</param>
private static void Search(Directory path)
{
    // The reader is only needed to build the query; the original never disposed it.
    Query query;
    using (var ir = IndexReader.Open(path, true))
    using (var reader = new System.IO.StringReader("are the most well known"))
    {
        var mlt = new MoreLikeThis(ir);
        mlt.SetFieldNames(new string[] { "content" });
        mlt.MinTermFreq = 1;
        mlt.MinDocFreq = 1;

        query = mlt.Like(reader);
    }

    using (var searcher = new IndexSearcher(path, true))
    {
        var topDocs = searcher.Search(query, 5);
        foreach (var scoreDoc in topDocs.ScoreDocs)
        {
            // The document is loaded but not used further, as in the original.
            Document doc = searcher.Doc(scoreDoc.Doc);
        }
    }
}
/// <summary>
/// Returns index hits similar to the content item with the given id, optionally
/// restricted to a content type. The item itself is excluded from the results.
/// </summary>
/// <param name="id">Id of the content item to find related items for.</param>
/// <param name="context">Supplies the index name, field names, content type and hit count.</param>
/// <returns>The similar hits with their scores, fully loaded before the index is closed.</returns>
public IEnumerable<ISearchHit> GetRelatedItems(int id, RelatedContentContext context)
{
    // The original never disposed the reader or searcher and returned a lazy sequence
    // that read from the searcher; the hits are now materialised inside the using blocks
    // so the index can be released before returning.
    using (IndexReader reader = IndexReader.Open(GetDirectory(context.Index), true))
    using (var indexSearcher = new IndexSearcher(reader))
    {
        var analyzer = _analyzerProvider.GetAnalyzer(context.Index);
        var mlt = new MoreLikeThis(reader) { Boost = true, MinTermFreq = 1, Analyzer = analyzer, MinDocFreq = 1 };

        // Only override the compared fields when the caller supplied some.
        if (context.FieldNames.Length > 0)
        {
            mlt.SetFieldNames(context.FieldNames);
        }

        var docid = GetDocumentId(id, indexSearcher);
        BooleanQuery query = (BooleanQuery)mlt.Like(docid);

        if (!String.IsNullOrWhiteSpace(context.ContentType))
        {
            var contentTypeQuery = new TermQuery(new Term("type", context.ContentType));
            query.Add(new BooleanClause(contentTypeQuery, Occur.MUST));
        }

        // Exclude the source document itself.
        var exclude = new TermQuery(new Term("id", id.ToString()));
        query.Add(new BooleanClause(exclude, Occur.MUST_NOT));

        TopDocs simDocs = indexSearcher.Search(query, context.Count);

        return simDocs.ScoreDocs
            .Select(scoreDoc => new LuceneSearchHit(indexSearcher.Doc(scoreDoc.Doc), scoreDoc.Score))
            .ToList();
    }
}
/// <summary>
/// Builds a MoreLikeThis query for the indexed document that represents a product,
/// comparing on the "Name" and "Description" fields.
/// </summary>
/// <param name="prodcutId">Id of the product to build the query for.</param>
/// <returns>The similarity query, or <c>null</c> when the document lookup yields 0.</returns>
private static Query CreateMoreLikeThisQuery(long prodcutId)
{
    var sourceDocNum = GetLuceneDocumentNumber(prodcutId);
    if (sourceDocNum == 0)
    {
        return null;
    }

    var standardAnalyzer = new StandardAnalyzer(_version);
    var indexReader = Searcher.IndexReader;

    var similarity = new MoreLikeThis(indexReader);
    similarity.Analyzer = standardAnalyzer;

    string[] comparedFields = { "Name", "Description" };
    similarity.SetFieldNames(comparedFields);

    similarity.MinDocFreq = 1;
    similarity.MinTermFreq = 1;
    similarity.Boost = true;

    Query similarQuery = similarity.Like(sourceDocNum);
    return similarQuery;
}
/// <summary>
/// Searches the index in <paramref name="dir"/> for entries whose "fr" field is similar
/// to <paramref name="searchText"/> and returns, for each of the top 50 hits, its key,
/// its text in every requested language, and its score.
/// </summary>
/// <param name="dir">Directory containing the Lucene index.</param>
/// <param name="lang">
/// Not used: the compared field is hard-coded to "fr".
/// NOTE(review): a commented-out call in the original used this value as the field name —
/// confirm which is intended.
/// </param>
/// <param name="searchText">Text to find similar entries for.</param>
/// <param name="languages">Field names (languages) to read from each hit.</param>
/// <returns>One result per hit, in hit order.</returns>
public static IEnumerable<QueryResult> SearchTranslationProjects(Lucene.Net.Store.Directory dir, string lang, string searchText, IEnumerable<string> languages)
{
    // The reader is only needed to build the query; the original never disposed it.
    Query query;
    using (var ir = IndexReader.Open(dir, true))
    using (var reader = new System.IO.StringReader(searchText))
    {
        var mlt = new MoreLikeThis(ir);
        //mlt.SetFieldNames(new string[] { lang });
        mlt.SetFieldNames(new[] { "fr" });
        mlt.MinTermFreq = 1;
        mlt.MinDocFreq = 1;
        mlt.MinWordLen = 4;
        //mlt.Analyzer = new Lucene.Net.Analysis.Fr.FrenchAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
        //mlt.Analyzer = new Lucene.Net.Analysis.Snowball.SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, "French");

        query = mlt.Like(reader);
    }

    var results = new List<QueryResult>();
    using (var searcher = new IndexSearcher(dir, true))
    {
        var topDocs = searcher.Search(query, 50);
        foreach (var scoreDoc in topDocs.ScoreDocs)
        {
            Document doc = searcher.Doc(scoreDoc.Doc);
            float score = scoreDoc.Score;

            // Lazy projection over the already-loaded document; safe to enumerate
            // after the searcher is disposed.
            var trads = languages.Select(l => new Segment(l, doc.Get(l)));

            // NOTE(review): 'set' is never used; kept in case the SegmentsByKey
            // constructor has side effects — confirm and remove.
            var set = new SegmentsByKey(doc.Get("key"), trads);

            results.Add(new QueryResult(doc.Get("key"), trads, score));
        }
    }

    return results;
}
/// <summary>
/// Builds a MoreLikeThis query for the indexed document that represents a post,
/// comparing on the "Title", "Name", "Description", "Publisher" and "Author" fields.
/// </summary>
/// <param name="postId">Id of the post to build the query for.</param>
/// <returns>The similarity query, or <c>null</c> when the document lookup yields 0.</returns>
private static Query CreateMoreLikeThisQuery(int postId)
{
    int sourceDocNum = GetLuceneDocumentNumber(postId);
    if (sourceDocNum == 0)
    {
        return null;
    }

    var standardAnalyzer = new StandardAnalyzer(_version);

    using (var indexSearcher = new IndexSearcher(_directory, false))
    {
        var similarity = new MoreLikeThis(indexSearcher.IndexReader);
        similarity.Analyzer = standardAnalyzer;

        string[] comparedFields = { "Title", "Name", "Description", "Publisher", "Author" };
        similarity.SetFieldNames(comparedFields);

        similarity.MinDocFreq = 1;
        similarity.MinTermFreq = 1;
        similarity.Boost = true;

        // The query is built while the searcher's reader is still open.
        Query similarQuery = similarity.Like(sourceDocNum);
        return similarQuery;
    }
}
/// <summary>
/// Finds currently published pages similar to the page with the given key, compared on
/// the "title", "summary", "content" and "tags" fields, and returns one window of hits.
/// </summary>
/// <param name="key">Value of the "key" field of the source page; the page itself is excluded.</param>
/// <param name="resultOffset">Index of the first hit to return.</param>
/// <param name="resultLength">Maximum number of hits to return.</param>
/// <param name="matchCategory">When true, hits must share the source page's (lower-cased) category, if it has one.</param>
/// <returns>
/// The requested window of hits together with the total hit count and the elapsed
/// search time in seconds; an empty result when the source page is not indexed.
/// </returns>
public SearchResult FindSimular(string key, int resultOffset, int resultLength, bool matchCategory)
{
    // Locate the source page.
    var pageQuery = new TermQuery(new Term("key", key));
    var topDocs = _searcher.Search(pageQuery, 1);
    if (topDocs.TotalHits == 0)
    {
        return new SearchResult();
    }

    var doc = topDocs.ScoreDocs[0].Doc;

    var moreLikeThis = new MoreLikeThis(_reader) { Analyzer = _analyzer, MinWordLen = 3 };
    moreLikeThis.SetFieldNames(new[] { "title", "summary", "content", "tags" });
    moreLikeThis.SetStopWords(StopWords.DefaultEnglish);
    moreLikeThis.MinDocFreq = 2;
    var query = moreLikeThis.Like(doc);

    // Stopwatch is monotonic; subtracting DateTime.Now values is skewed by clock
    // adjustments and DST changes.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

    // Only pages whose publish window contains "now" (UTC ticks) are eligible.
    var ticks = DateTime.UtcNow.Ticks;
    Query publishStartQuery = NumericRangeQuery.NewLongRange("publishStart", null, ticks, true, false);
    Query publishStopQuery = NumericRangeQuery.NewLongRange("publishStop", ticks, null, false, true);

    var booleanQuery = new BooleanQuery
    {
        { query, Occur.MUST },
        { pageQuery, Occur.MUST_NOT },
        { publishStartQuery, Occur.MUST },
        { publishStopQuery, Occur.MUST }
    };

    if (matchCategory)
    {
        var sourceDocument = _searcher.Doc(doc);
        var categoryField = sourceDocument.GetField("category");
        if (categoryField != null && !string.IsNullOrEmpty(categoryField.StringValue))
        {
            var categoryQuery = new TermQuery(new Term("category", categoryField.StringValue.ToLowerInvariant()));
            booleanQuery.Add(categoryQuery, Occur.MUST);
        }
    }

    var scoreDocs = _searcher.Search(booleanQuery, null, MaxHits, Sort.RELEVANCE).ScoreDocs;
    var result = new SearchResult { NumberOfHits = scoreDocs.Length };

    if (resultOffset < scoreDocs.Length)
    {
        // Clamp the window to the number of hits actually available.
        var resultUpperOffset = Math.Min(resultOffset + resultLength, scoreDocs.Length);

        for (int i = resultOffset; i < resultUpperOffset; i++)
        {
            Document hitDocument = _searcher.Doc(scoreDocs[i].Doc);

            // The return value of TryParseGuid is ignored, as in the original.
            Guid pageId;
            (hitDocument.Get("pageId") ?? string.Empty).TryParseGuid(out pageId);

            var hit = new SearchHit
            {
                PageId = pageId,
                Path = hitDocument.Get("path"),
                Title = hitDocument.Get("title"),
                Excerpt = hitDocument.Get("summary")
            };

            result.Hits.Add(hit);
        }
    }

    result.SecondsTaken = stopwatch.Elapsed.TotalSeconds;
    return result;
}
/// <summary>
/// Returns up to <paramref name="itemsToReturn"/> posts similar to the given post,
/// compared on the title, body and tag fields. The post itself is never returned.
/// </summary>
/// <param name="postid">Id of the post to find similar posts for; non-positive ids yield an empty list.</param>
/// <param name="itemsToReturn">Maximum number of similar posts; non-positive values yield an empty list.</param>
/// <returns>
/// The similar posts. This is a best-effort lookup: when the search fails the error is
/// traced and whatever was collected so far (possibly nothing) is returned.
/// </returns>
public List<Post> Similar(int postid, int itemsToReturn)
{
    var list = new List<Post>();
    if (postid <= 0)
    {
        return list;
    }

    IndexSearcher searcher = null;
    IndexReader reader = null;
    EnsureIndexExists();

    // Nothing can be returned for a non-positive limit; skip the search entirely.
    if (itemsToReturn <= 0)
    {
        return list;
    }

    var query = GetIdSearchQuery(postid);
    lck.AcquireReaderLock(ReaderTimeOut);
    try
    {
        searcher = new IndexSearcher(rd);

        // Locate the original document; only the first hit is used.
        TopDocs hits = searcher.Search(query, 1);
        if (hits == null || hits.ScoreDocs.Length <= 0)
        {
            return list;
        }

        int docNum = hits.ScoreDocs[0].Doc;
        if (docNum > -1)
        {
            // NOTE(review): 'parser' is unused; kept in case GetQueryParser has side effects — confirm and remove.
            LQ.QueryParser parser = GetQueryParser();

            reader = IndexReader.Open(rd, true);
            var mlt = new MoreLikeThis(reader);
            mlt.Analyzer = _analyzer;
            mlt.SetFieldNames(new[] { SearchFields.Title, SearchFields.Body, SearchFields.Tag });
            mlt.MinDocFreq = 5;
            mlt.MinTermFreq = 2;
            mlt.Boost = true;
            var moreResultsQuery = mlt.Like(docNum);

            // Ask for one extra hit: the source post normally matches itself and is
            // filtered out below, which would otherwise leave the result one short.
            TopDocs similarhits = searcher.Search(moreResultsQuery, itemsToReturn + 1);

            for (int i = 0; i < similarhits.ScoreDocs.Length && list.Count < itemsToReturn; i++)
            {
                Document doc = searcher.Doc(similarhits.ScoreDocs[i].Doc);
                var post = CreatePostFromDocument(doc, null);
                if (postid != post.Id)
                {
                    list.Add(post);
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Deliberately best-effort, but the failure is recorded instead of being silently dropped.
        System.Diagnostics.Trace.TraceWarning("Similar(" + postid + ") failed: " + ex);
    }
    finally
    {
        if (searcher != null)
        {
            searcher.Dispose();
        }

        if (reader != null)
        {
            reader.Dispose();
        }

        lck.ReleaseReaderLock();
    }

    return list;
}
/// <summary>
/// Searches for job ads similar to the given job ad, compared on the content and title
/// fields, excluding the ad itself and restricted by a filter built from the ad's
/// (adjusted) salary and its location within a distance of 50.
/// </summary>
/// <param name="memberId">Not used by this method.</param>
/// <param name="jobAdId">Id of the job ad to find similar ads for.</param>
/// <param name="searchQuery">Supplies the paging values (<c>Skip</c>, <c>Take</c>).</param>
/// <returns>
/// The search results; empty results when the job ad is not in the index or cannot be loaded.
/// Unexpected exceptions are logged and rethrown.
/// </returns>
JobAdSearchResults IJobAdSearchService.SearchSimilar(Guid? memberId, Guid jobAdId, JobAdSearchQuery searchQuery)
{
    const string method = "GetSimilarJobs";

    try
    {
        var indexReader = GetReader();
        var jobSearcher = new Searcher(indexReader);

        // If the job ad cannot be found in the index then return no results.
        var sourceDocId = jobSearcher.Fetch(jobAdId);
        if (sourceDocId == -1)
        {
            return new JobAdSearchResults();
        }

        var sourceJobAd = _jobAdsQuery.GetJobAd<JobAd>(jobAdId);
        if (sourceJobAd == null)
        {
            return new JobAdSearchResults();
        }

        // Look for more like this.
        var similarity = new MoreLikeThis(indexReader);
        similarity.setAnalyzer(_contentAnalyzer);
        similarity.setFieldNames(new[] { FieldName.Content, FieldName.Title });
        var similarQuery = similarity.like(sourceDocId);

        //similarQuery = new SeniorityIndexHandler().GetQuery(similarQuery, new JobAdSearchQuery {SeniorityIndex = sourceJobAd.SeniorityIndex});

        // Ensure the initial job is not in the results.
        var combinedFilter = new BooleanFilter();
        var excludeSourceFilter = new SpecialsFilter(SearchFieldName.Id, false, new[] { jobAdId.ToFieldValue() });
        combinedFilter.add(new FilterClause(excludeSourceFilter, BooleanClause.Occur.MUST_NOT));

        // Add salary and location restriction.
        var restrictionQuery = new JobAdSearchQuery
        {
            Salary = FudgeSalary(sourceJobAd.Description.Salary),
            ExcludeNoSalary = true,
            Location = sourceJobAd.Description.Location,
            Distance = 50,
        };
        var restrictionFilter = _indexer.GetFilter(restrictionQuery, null, null);
        combinedFilter.add(new FilterClause(restrictionFilter, BooleanClause.Occur.MUST));

        // Without an explicit Take, every document in the index is eligible.
        return jobSearcher.Search(similarQuery, combinedFilter, null, null, searchQuery.Skip, searchQuery.Take ?? indexReader.maxDoc(), false);
    }
    catch (Exception e)
    {
        EventSource.Raise(Event.Error, method, "Unexpected exception.", e);
        throw;
    }
}
/// <summary>
/// Smoke test: builds a query for the "foobar" field while two field names are
/// configured. The test makes no assertions; it passes when <c>Like</c> does not throw.
/// </summary>
public void TestMultiFields()
{
    var whitespaceAnalyzer = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false);

    MoreLikeThis similarity = new MoreLikeThis(reader)
    {
        Analyzer = whitespaceAnalyzer,
        MinDocFreq = 1,
        MinTermFreq = 1,
        MinWordLen = 1,
        FieldNames = new[] { "text", "foobar" }
    };

    var input = new StringReader("this is a test");
    similarity.Like(input, "foobar");
}