/// <summary>
/// Collects documents that Lucene's MoreLikeThis considers similar to an indexed document.
/// </summary>
/// <param name="indexName">Not read by this method.</param>
/// <param name="indexDocumentId">Document number handed to <c>MoreLikeThis.Like</c> as the seed.</param>
/// <param name="maxDocs">Maximum number of hits the collector keeps; also the initial list capacity.</param>
/// <returns>One <c>CorpusDocument</c> per collected hit, with only Id and Title populated.</returns>
public static IList<CorpusDocument> GetMoreLikeThis(string indexName, int indexDocumentId, int maxDocs)
{
    // See: http://lucene.apache.org/java/2_2_0/api/org/apache/lucene/search/similar/MoreLikeThis.html
    var moreLikeThis = new MoreLikeThis(Searcher.GetIndexReader());
    moreLikeThis.SetAnalyzer(new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29));
    moreLikeThis.SetFieldNames(new[] { "Title", "Content" });
    moreLikeThis.SetMinWordLen(4); // improve relevancy

    // Build the similarity query first, then run it through a top-N collector.
    var similarityQuery = moreLikeThis.Like(indexDocumentId);
    var collector = TopScoreDocCollector.create(maxDocs, true);
    Searcher.Search(similarityQuery, collector);
    var scoreDocs = collector.TopDocs().ScoreDocs;

    var similarDocuments = new List<CorpusDocument>(maxDocs);
    foreach (var scoreDoc in scoreDocs)
    {
        // Load the stored fields of the hit and copy over the two we expose.
        var stored = Searcher.Doc(scoreDoc.doc);
        similarDocuments.Add(new CorpusDocument
        {
            Id = stored.Get("Id"),
            Title = stored.Get("Title"),
        });
    }

    return similarDocuments;
}
/// <summary>
/// Rewrites this query into the boolean query that MoreLikeThis generates from <c>likeText</c>,
/// configured from this instance's fields.
/// </summary>
/// <param name="reader">Index reader handed to the MoreLikeThis generator.</param>
/// <returns>The generated <c>BooleanQuery</c> with its minimum-should-match count set.</returns>
public override Query Rewrite(IndexReader reader)
{
    var generator = new MoreLikeThis(reader);
    generator.SetFieldNames(moreLikeFields);
    generator.Analyzer = analyzer;
    generator.MinTermFreq = minTermFrequency;

    // A negative minDocFreq means "leave the generator's own setting untouched".
    if (minDocFreq >= 0)
    {
        generator.MinDocFreq = minDocFreq;
    }

    generator.MaxQueryTerms = maxQueryTerms;
    generator.SetStopWords(stopWords);

    var likeQuery = (BooleanQuery)generator.Like(new System.IO.StringReader(likeText));

    // Require a percentTermsToMatch fraction of the generated clauses to match
    // (the product is truncated toward zero by the int cast).
    int clauseCount = likeQuery.GetClauses().Length;
    likeQuery.MinimumNumberShouldMatch = (int)(clauseCount * percentTermsToMatch);

    return likeQuery;
}
/// <summary>
/// Produces the concrete query for this "more like this" request by letting MoreLikeThis
/// analyze <c>likeText</c> against the given reader.
/// </summary>
/// <param name="reader">Index reader the MoreLikeThis instance is constructed with.</param>
/// <returns>The resulting <c>BooleanQuery</c>, with MinimumNumberShouldMatch applied.</returns>
public override Query Rewrite(IndexReader reader)
{
    var similarity = new MoreLikeThis(reader);
    similarity.SetFieldNames(moreLikeFields);
    similarity.Analyzer = analyzer;
    similarity.MinTermFreq = minTermFrequency;
    if (minDocFreq >= 0)
    {
        // Only override the document-frequency threshold when one was configured (non-negative).
        similarity.MinDocFreq = minDocFreq;
    }
    similarity.MaxQueryTerms = maxQueryTerms;
    similarity.SetStopWords(stopWords);

    var textReader = new System.IO.StringReader(likeText);
    var rewritten = (BooleanQuery)similarity.Like(textReader);

    // Scale the clause count by percentTermsToMatch; the cast truncates the fractional part.
    BooleanClause[] generatedClauses = rewritten.GetClauses();
    int required = (int)(generatedClauses.Length * percentTermsToMatch);
    rewritten.MinimumNumberShouldMatch = required;

    return rewritten;
}
/// <summary>
/// Builds the baseline for the boost-factor test: runs MoreLikeThis over the text
/// "lucene release" with boosting enabled (no boost factor set) and records each
/// generated term's boost.
/// </summary>
/// <returns>A table mapping term text to the boost of its generated <c>TermQuery</c>.</returns>
private Hashtable GetOriginalValues()
{
    MoreLikeThis generator = new MoreLikeThis(reader);
    // Thresholds of 1 so that no term is filtered out by frequency or length.
    generator.SetMinDocFreq(1);
    generator.SetMinTermFreq(1);
    generator.SetMinWordLen(1);
    generator.SetFieldNames(new String[] { "text" });
    generator.SetBoost(true);

    System.IO.StringReader sample = new System.IO.StringReader("lucene release");
    BooleanQuery likeQuery = (BooleanQuery)generator.Like(sample);

    Hashtable boostsByTerm = new Hashtable();
    foreach (BooleanClause clause in likeQuery.Clauses())
    {
        TermQuery termQuery = (TermQuery)clause.GetQuery();
        boostsByTerm.Add(termQuery.GetTerm().Text(), termQuery.GetBoost());
    }

    return boostsByTerm;
}
/// <summary>
/// Verifies that setting a boost factor on MoreLikeThis multiplies the boost of every
/// generated term by that factor, compared with the baseline from <c>GetOriginalValues</c>.
/// </summary>
public void TestBoostFactor()
{
    Hashtable originalValues = GetOriginalValues();

    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.SetMinDocFreq(1);
    mlt.SetMinTermFreq(1);
    mlt.SetMinWordLen(1);
    mlt.SetFieldNames(new String[] { "text" });
    mlt.SetBoost(true);

    // Every term's boost is expected to be multiplied by this number.
    float boostFactor = 5;
    mlt.SetBoostFactor(boostFactor);

    BooleanQuery query = (BooleanQuery)mlt.Like(new System.IO.StringReader("lucene release"));
    IList clauses = query.Clauses();

    Assert.AreEqual(originalValues.Count, clauses.Count, "Expected " + originalValues.Count + " clauses.");

    for (int i = 0; i < clauses.Count; i++)
    {
        BooleanClause clause = (BooleanClause)clauses[i];
        TermQuery tq = (TermQuery)clause.GetQuery();
        String termText = tq.GetTerm().Text();

        // Check for a missing baseline entry BEFORE unboxing: casting a null lookup result
        // to float would throw NullReferenceException instead of failing the assertion,
        // and a float value itself can never be null.
        object baselineBoost = originalValues[termText];
        Assert.IsNotNull(baselineBoost, "Expected term " + termText);
        float termBoost = (float)baselineBoost;

        float totalBoost = termBoost * boostFactor;
        Assert.AreEqual(totalBoost, tq.GetBoost(), 0.0001, "Expected boost of " + totalBoost + " for term '" + termText + "' got " + tq.GetBoost());
    }
}
/// <summary>
/// Finds documents similar to the one whose "key" field equals <paramref name="key"/>.
/// The seed document itself is excluded, and only documents whose publishStart is below
/// and publishStop is above the current UTC tick count are returned.
/// </summary>
/// <param name="key">Value of the "key" field identifying the seed document.</param>
/// <param name="resultOffset">Index of the first hit to return; negative values are treated as 0.</param>
/// <param name="resultLength">Maximum number of hits to return; non-positive values return no hits.</param>
/// <param name="matchCategory">
/// When true and the seed document has a non-empty "category" field, results must share that
/// category (compared in lower-invariant form).
/// </param>
/// <returns>
/// An empty <c>SearchResult</c> when no document has the given key; otherwise the requested
/// page of hits, the number of matching score docs, and the seconds spent searching.
/// </returns>
public SearchResult FindSimular(string key, int resultOffset, int resultLength, bool matchCategory)
{
    var pageQuery = new TermQuery(new Term("key", key));
    var topDocs = _searcher.Search(pageQuery, 1);
    if (topDocs.TotalHits == 0)
    {
        return new SearchResult();
    }

    var doc = topDocs.ScoreDocs[0].Doc;

    var moreLikeThis = new MoreLikeThis(_reader)
    {
        Analyzer = _analyzer,
        MinWordLen = 3
    };
    moreLikeThis.SetFieldNames(new[] { "title", "summary", "content", "tags" });
    moreLikeThis.SetStopWords(StopWords.DefaultEnglish);
    moreLikeThis.MinDocFreq = 2;
    var query = moreLikeThis.Like(doc);

    // Stopwatch is monotonic; wall-clock differences can jump on clock or DST adjustments.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

    var ticks = DateTime.UtcNow.Ticks;
    Query publishStartQuery = NumericRangeQuery.NewLongRange("publishStart", null, ticks, true, false);
    Query publishStopQuery = NumericRangeQuery.NewLongRange("publishStop", ticks, null, false, true);

    var booleanQuery = new BooleanQuery
    {
        { query, Occur.MUST },
        { pageQuery, Occur.MUST_NOT }, // never return the seed document itself
        { publishStartQuery, Occur.MUST },
        { publishStopQuery, Occur.MUST }
    };

    if (matchCategory)
    {
        var seedDocument = _searcher.Doc(doc);
        var field = seedDocument.GetField("category");
        if (field != null && !string.IsNullOrEmpty(field.StringValue))
        {
            var categoryQuery = new TermQuery(new Term("category", field.StringValue.ToLowerInvariant()));
            booleanQuery.Add(categoryQuery, Occur.MUST);
        }
    }

    var scoreDocs = _searcher.Search(booleanQuery, null, MaxHits, Sort.RELEVANCE).ScoreDocs;
    var result = new SearchResult { NumberOfHits = scoreDocs.Length };

    // Clamp the page window to the available hits. Computing the count by subtraction avoids
    // the int overflow that resultOffset + resultLength produces for very large lengths,
    // and clamping the offset avoids indexing scoreDocs with a negative value.
    var firstIndex = Math.Max(0, resultOffset);
    if (firstIndex < scoreDocs.Length && resultLength > 0)
    {
        var pageSize = Math.Min(resultLength, scoreDocs.Length - firstIndex);
        var endIndex = firstIndex + pageSize;
        for (int i = firstIndex; i < endIndex; i++)
        {
            Document document = _searcher.Doc(scoreDocs[i].Doc);

            // A missing or malformed pageId leaves pageId at whatever TryParseGuid assigns on failure.
            Guid pageId;
            (document.Get("pageId") ?? string.Empty).TryParseGuid(out pageId);

            var hit = new SearchHit
            {
                PageId = pageId,
                Path = document.Get("path"),
                Title = document.Get("title"),
                Excerpt = document.Get("summary")
            };
            result.Hits.Add(hit);
        }
    }

    stopwatch.Stop();
    result.SecondsTaken = stopwatch.Elapsed.TotalSeconds;
    return result;
}
/// <summary>
/// Verifies that <c>MoreLikeThis.BoostFactor</c> multiplies the boost of every generated term,
/// using the boosts returned by <c>GetOriginalValues</c> as the baseline.
/// </summary>
public void TestBoostFactor()
{
    Hashtable originalValues = GetOriginalValues();

    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.MinDocFreq = 1;
    mlt.MinTermFreq = 1;
    mlt.MinWordLen = 1;
    mlt.SetFieldNames(new String[] { "text" });
    mlt.Boost = true;

    // Every term's boost is expected to be multiplied by this number.
    float boostFactor = 5;
    mlt.BoostFactor = boostFactor;

    BooleanQuery query = (BooleanQuery)mlt.Like(new System.IO.StringReader("lucene release"));
    IList clauses = query.Clauses;

    Assert.AreEqual(originalValues.Count, clauses.Count, "Expected " + originalValues.Count + " clauses.");

    for (int i = 0; i < clauses.Count; i++)
    {
        BooleanClause clause = (BooleanClause)clauses[i];
        TermQuery tq = (TermQuery)clause.Query;
        String termText = tq.Term.Text;

        // Assert on the boxed lookup result before unboxing it. Unboxing a null entry would
        // throw NullReferenceException rather than report the missing term, and asserting
        // non-null on an already-unboxed float can never fail.
        object baselineBoost = originalValues[termText];
        Assert.IsNotNull(baselineBoost, "Expected term " + termText);
        float termBoost = (float)baselineBoost;

        float totalBoost = termBoost * boostFactor;
        Assert.AreEqual(totalBoost, tq.Boost, 0.0001, "Expected boost of " + totalBoost + " for term '" + termText + "' got " + tq.Boost);
    }
}
/// <summary>
/// Produces the baseline boosts for the boost-factor test by running MoreLikeThis over the
/// text "lucene release" with boosting on and no boost factor assigned.
/// </summary>
/// <returns>A table keyed by term text whose values are the generated term-query boosts.</returns>
private Hashtable GetOriginalValues()
{
    MoreLikeThis generator = new MoreLikeThis(reader);
    // All thresholds are 1 so frequency and length filters do not drop any term.
    generator.MinDocFreq = 1;
    generator.MinTermFreq = 1;
    generator.MinWordLen = 1;
    generator.SetFieldNames(new String[] { "text" });
    generator.Boost = true;

    System.IO.StringReader sample = new System.IO.StringReader("lucene release");
    BooleanQuery likeQuery = (BooleanQuery)generator.Like(sample);

    Hashtable boostsByTerm = new Hashtable();
    foreach (BooleanClause clause in likeQuery.Clauses)
    {
        TermQuery termQuery = (TermQuery)clause.Query;
        boostsByTerm.Add(termQuery.Term.Text, termQuery.Boost);
    }

    return boostsByTerm;
}
/// <summary>
/// Command-line demo: builds a MoreLikeThis query from a URL or a file, runs it against an
/// index and prints up to 25 hits (score, url, title, summary) to standard output.
/// </summary>
/// <param name="a">
/// Options: <c>-i indexName</c>, <c>-f fileName</c>, <c>-url url</c>. Unrecognized arguments
/// are ignored. When both a URL and a file are available, the URL wins.
/// </param>
/// <exception cref="System.ArgumentException">An option is given without a following value.</exception>
public static void Main(System.String[] a)
{
    System.String indexName = "localhost_index";
    System.String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
    System.Uri url = null;

    for (int i = 0; i < a.Length; i++)
    {
        System.String option = a[i];
        if (!option.Equals("-i") && !option.Equals("-f") && !option.Equals("-url"))
        {
            continue;
        }

        // Each option consumes the next argument; report a trailing option clearly instead
        // of reading past the end of the array.
        if (i + 1 >= a.Length)
        {
            throw new System.ArgumentException("Missing value after option " + option);
        }

        System.String value = a[++i];
        if (option.Equals("-i"))
        {
            indexName = value;
        }
        else if (option.Equals("-f"))
        {
            fn = value;
        }
        else
        {
            url = new System.Uri(value);
        }
    }

    // The writer wraps the process's standard output and auto-flushes, so it is left open.
    System.IO.StreamWriter o = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
    o.AutoFlush = true;

    IndexReader r = IndexReader.Open(indexName);
    try
    {
        o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs");

        MoreLikeThis mlt = new MoreLikeThis(r);

        o.WriteLine("Query generation parameters:");
        o.WriteLine(mlt.DescribeParams());
        o.WriteLine();

        Query query = null;
        if (url != null)
        {
            o.WriteLine("Parsing URL: " + url);
            query = mlt.Like(url);
        }
        else if (fn != null)
        {
            o.WriteLine("Parsing file: " + fn);
            query = mlt.Like(new System.IO.FileInfo(fn));
        }

        o.WriteLine("q: " + query);
        o.WriteLine();

        IndexSearcher searcher = new IndexSearcher(indexName);
        try
        {
            Hits hits = searcher.Search(query);
            int len = hits.Length();
            o.WriteLine("found: " + len + " documents matching");
            o.WriteLine();

            // Show at most the first 25 hits.
            int shown = System.Math.Min(25, len);
            for (int i = 0; i < shown; i++)
            {
                Document d = hits.Doc(i);
                System.String summary = d.Get("summary");
                o.WriteLine("score : " + hits.Score(i));
                o.WriteLine("url : " + d.Get("url"));
                o.WriteLine("\ttitle : " + d.Get("title"));
                if (summary != null)
                {
                    o.WriteLine("\tsummary: " + summary);
                }
                o.WriteLine();
            }
        }
        finally
        {
            // Release the searcher's index resources even when searching or printing fails.
            searcher.Close();
        }
    }
    finally
    {
        r.Close();
    }
}
/// <summary>
/// Command-line entry point that generates a MoreLikeThis query from a URL or a file,
/// searches the named index with it, and writes up to 25 hits to standard output.
/// </summary>
/// <param name="a">
/// Recognized options are <c>-i indexName</c>, <c>-f fileName</c> and <c>-url url</c>; anything
/// else is skipped. A URL, when supplied, takes precedence over the file.
/// </param>
/// <exception cref="System.ArgumentException">The last argument is an option with no value.</exception>
public static void Main(System.String[] a)
{
    System.String indexName = "localhost_index";
    System.String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
    System.Uri url = null;

    for (int i = 0; i < a.Length; i++)
    {
        System.String option = a[i];
        bool recognized = option.Equals("-i") || option.Equals("-f") || option.Equals("-url");
        if (!recognized)
        {
            continue;
        }

        // Guard the look-ahead: a[++i] on a trailing option would index past the array.
        if (i + 1 >= a.Length)
        {
            throw new System.ArgumentException("Missing value after option " + option);
        }

        System.String value = a[++i];
        if (option.Equals("-i"))
        {
            indexName = value;
        }
        else if (option.Equals("-f"))
        {
            fn = value;
        }
        else
        {
            url = new System.Uri(value);
        }
    }

    // Auto-flushing writer over standard output; not closed here because that would
    // close the underlying standard output stream.
    System.IO.StreamWriter o = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
    o.AutoFlush = true;

    IndexReader r = IndexReader.Open(indexName);
    try
    {
        o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs");

        MoreLikeThis mlt = new MoreLikeThis(r);

        o.WriteLine("Query generation parameters:");
        o.WriteLine(mlt.DescribeParams());
        o.WriteLine();

        Query query = null;
        if (url != null)
        {
            o.WriteLine("Parsing URL: " + url);
            query = mlt.Like(url);
        }
        else if (fn != null)
        {
            o.WriteLine("Parsing file: " + fn);
            query = mlt.Like(new System.IO.FileInfo(fn));
        }

        o.WriteLine("q: " + query);
        o.WriteLine();

        IndexSearcher searcher = new IndexSearcher(indexName);
        try
        {
            Hits hits = searcher.Search(query);
            int len = hits.Length();
            o.WriteLine("found: " + len + " documents matching");
            o.WriteLine();

            // Cap the listing at 25 entries; the bound is computed once, outside the loop.
            int limit = System.Math.Min(25, len);
            for (int i = 0; i < limit; i++)
            {
                Document d = hits.Doc(i);
                System.String summary = d.Get("summary");
                o.WriteLine("score : " + hits.Score(i));
                o.WriteLine("url : " + d.Get("url"));
                o.WriteLine("\ttitle : " + d.Get("title"));
                if (summary != null)
                {
                    o.WriteLine("\tsummary: " + summary);
                }
                o.WriteLine();
            }
        }
        finally
        {
            // Always close the searcher so its index handles are released.
            searcher.Close();
        }
    }
    finally
    {
        // Always close the reader opened for query generation.
        r.Close();
    }
}
/// <summary>
/// Builds a MoreLikeThis query seeded with the indexed document that corresponds to a post.
/// </summary>
/// <param name="postId">Post identifier, resolved to a document number via <c>GetLuceneDocumentNumber</c>.</param>
/// <returns>The generated query, or <c>null</c> when the resolved document number is 0.</returns>
private static Query CreateMoreLikeThisQuery(int postId)
{
    int docNum = GetLuceneDocumentNumber(postId);

    // NOTE(review): 0 is treated as "no document" here — presumably GetLuceneDocumentNumber
    // returns 0 when the post is not indexed; confirm, since 0 can also be a real document number.
    if (docNum == 0)
    {
        return null;
    }

    var analyzer = new StandardAnalyzer(_version);

    // The searcher is only needed while the query is generated; it is disposed on exit.
    using (var searcher = new IndexSearcher(_directory, false))
    {
        IndexReader indexReader = searcher.IndexReader;

        var similarity = new MoreLikeThis(indexReader) { Analyzer = analyzer };
        similarity.SetFieldNames(new[] { "Title", "Name", "Description", "Publisher", "Author" });
        similarity.MinDocFreq = 1;
        similarity.MinTermFreq = 1;
        similarity.Boost = true;

        Query likeQuery = similarity.Like(docNum);
        return likeQuery;
    }
}