Example #1
0
        /// <summary>
        /// Verifies that <c>BoostFactor</c> scales the boost of every term query that
        /// <c>MoreLikeThis</c> generates for the text "lucene release": each clause must
        /// carry the boost recorded by <c>GetOriginalValues</c> multiplied by the factor.
        /// </summary>
        public void TestBoostFactor()
        {
            Hashtable originalValues = GetOriginalValues();

            MoreLikeThis mlt = new MoreLikeThis(reader);
            mlt.MinDocFreq = 1;
            mlt.MinTermFreq = 1;
            mlt.MinWordLen = 1;
            mlt.SetFieldNames(new String[] { "text" });
            mlt.Boost = true;

            // Every term boost is expected to be multiplied by this number.
            float boostFactor = 5;
            mlt.BoostFactor = boostFactor;

            BooleanQuery query = (BooleanQuery)mlt.Like(new System.IO.StringReader("lucene release"));
            IList clauses = query.Clauses;

            Assert.AreEqual(originalValues.Count, clauses.Count, "Expected " + originalValues.Count + " clauses.");

            for (int i = 0; i < clauses.Count; i++)
            {
                BooleanClause clause = (BooleanClause)clauses[i];
                TermQuery tq = (TermQuery)clause.Query;

                // Check the boxed lookup result before unboxing it. Asserting on an
                // already-unboxed float can never fail (a value type is never null),
                // and a missing term would surface as a NullReferenceException from
                // the cast instead of as the intended assertion message.
                object originalBoost = originalValues[tq.Term.Text];
                Assert.IsNotNull(originalBoost, "Expected term " + tq.Term.Text);
                float termBoost = (float)originalBoost;

                float totalBoost = termBoost * boostFactor;
                Assert.AreEqual(totalBoost, tq.Boost, 0.0001, "Expected boost of " + totalBoost + " for term '"
                                 + tq.Term.Text + "' got " + tq.Boost);
            }
        }
Example #2
0
        /// <summary>
        /// Builds a MoreLikeThis query for the indexed document
        /// <paramref name="indexDocumentId"/> over the "Title" and "Content" fields and
        /// returns the top-scoring matches as <c>CorpusDocument</c> items carrying only
        /// the stored "Id" and "Title" values.
        /// </summary>
        /// <param name="indexName">Not read by this method.</param>
        /// <param name="indexDocumentId">Document number passed to <c>MoreLikeThis.Like</c>.</param>
        /// <param name="maxDocs">Number of top hits to collect; also the initial capacity of the result list.</param>
        /// <returns>One entry per collected hit, in the order the collector reports them.</returns>
        public static IList<CorpusDocument> GetMoreLikeThis(string indexName, int indexDocumentId, int maxDocs)
        {
            // See: http://lucene.apache.org/java/2_2_0/api/org/apache/lucene/search/similar/MoreLikeThis.html

            var moreLikeThis = new MoreLikeThis(Searcher.GetIndexReader());
            moreLikeThis.SetAnalyzer(new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29));
            moreLikeThis.SetFieldNames(new[] { "Title", "Content" });
            // Ignore words shorter than four characters to improve relevancy.
            moreLikeThis.SetMinWordLen(4);

            var similarityQuery = moreLikeThis.Like(indexDocumentId);

            var collector = TopScoreDocCollector.create(maxDocs, true);
            Searcher.Search(similarityQuery, collector);

            var similarDocuments = new List<CorpusDocument>(maxDocs);

            foreach (var scoreDoc in collector.TopDocs().ScoreDocs)
            {
                var stored = Searcher.Doc(scoreDoc.doc);

                var item = new CorpusDocument();
                item.Id = stored.Get("Id");
                item.Title = stored.Get("Title");
                similarDocuments.Add(item);
            }

            return similarDocuments;
        }
        /// <summary>
        /// Rewrites this query into the <c>BooleanQuery</c> that <c>MoreLikeThis</c>
        /// generates from <c>likeText</c>, configured from this instance's fields.
        /// </summary>
        /// <param name="reader">Index reader handed to <c>MoreLikeThis</c>.</param>
        /// <returns>
        /// The generated boolean query, with its minimum-should-match count set to the
        /// clause count multiplied by <c>percentTermsToMatch</c> (cast to <c>int</c>).
        /// </returns>
        public override Query Rewrite(IndexReader reader)
        {
            MoreLikeThis similarity = new MoreLikeThis(reader);

            similarity.SetFieldNames(moreLikeFields);
            similarity.Analyzer = analyzer;
            similarity.MinTermFreq = minTermFrequency;
            // A negative minDocFreq leaves the MoreLikeThis setting untouched.
            if (minDocFreq >= 0)
            {
                similarity.MinDocFreq = minDocFreq;
            }
            similarity.MaxQueryTerms = maxQueryTerms;
            similarity.SetStopWords(stopWords);

            System.IO.StringReader likeReader = new System.IO.StringReader(likeText);
            BooleanQuery rewritten = (BooleanQuery)similarity.Like(likeReader);

            // Require the configured share of the generated terms to match.
            int clauseCount = rewritten.GetClauses().Length;
            rewritten.MinimumNumberShouldMatch = (int)(clauseCount * percentTermsToMatch);

            return rewritten;
        }
Example #4
0
        /// <summary>
        /// Produces the concrete query for this instance: a <c>BooleanQuery</c> built by
        /// <c>MoreLikeThis</c> from <c>likeText</c> using the settings held in this
        /// object's fields.
        /// </summary>
        /// <param name="reader">Index reader used to construct the <c>MoreLikeThis</c> helper.</param>
        /// <returns>
        /// The boolean query, whose minimum-should-match count is the number of
        /// generated clauses times <c>percentTermsToMatch</c>, cast to <c>int</c>.
        /// </returns>
        public override Query Rewrite(IndexReader reader)
        {
            MoreLikeThis generator = new MoreLikeThis(reader);

            generator.SetFieldNames(moreLikeFields);
            generator.Analyzer = analyzer;
            generator.MinTermFreq = minTermFrequency;
            // Only a non-negative minDocFreq overrides the MoreLikeThis setting.
            if (minDocFreq >= 0)
            {
                generator.MinDocFreq = minDocFreq;
            }
            generator.MaxQueryTerms = maxQueryTerms;
            generator.SetStopWords(stopWords);

            Query generated = generator.Like(new System.IO.StringReader(likeText));
            BooleanQuery booleanQuery = (BooleanQuery)generated;

            // Scale the required number of matching clauses by percentTermsToMatch.
            BooleanClause[] generatedClauses = booleanQuery.GetClauses();
            int requiredMatches = (int)(generatedClauses.Length * percentTermsToMatch);
            booleanQuery.MinimumNumberShouldMatch = requiredMatches;

            return booleanQuery;
        }
Example #5
0
        /// <summary>
        /// Runs <c>MoreLikeThis</c> with boosting enabled (and no boost factor set) over
        /// the text "lucene release" and records the boost of each generated term query.
        /// </summary>
        /// <returns>A table mapping term text to the boost of its term query.</returns>
        private Hashtable GetOriginalValues()
        {
            MoreLikeThis baseline = new MoreLikeThis(reader);

            baseline.SetMinDocFreq(1);
            baseline.SetMinTermFreq(1);
            baseline.SetMinWordLen(1);
            baseline.SetFieldNames(new String[] { "text" });
            baseline.SetBoost(true);

            BooleanQuery likeQuery = (BooleanQuery)baseline.Like(new System.IO.StringReader("lucene release"));
            IList generatedClauses = likeQuery.Clauses();

            Hashtable boostsByTerm = new Hashtable();
            foreach (object entry in generatedClauses)
            {
                BooleanClause booleanClause = (BooleanClause)entry;
                TermQuery termQuery = (TermQuery)booleanClause.GetQuery();

                // Add (rather than the indexer) so a repeated term still raises an error.
                boostsByTerm.Add(termQuery.GetTerm().Text(), termQuery.GetBoost());
            }

            return boostsByTerm;
        }
Example #6
0
        /// <summary>
        /// Verifies that the boost factor set through <c>SetBoostFactor</c> scales the
        /// boost of every term query that <c>MoreLikeThis</c> generates for the text
        /// "lucene release": each clause must carry the boost recorded by
        /// <c>GetOriginalValues</c> multiplied by the factor.
        /// </summary>
        public void TestBoostFactor()
        {
            Hashtable originalValues = GetOriginalValues();

            MoreLikeThis mlt = new MoreLikeThis(reader);

            mlt.SetMinDocFreq(1);
            mlt.SetMinTermFreq(1);
            mlt.SetMinWordLen(1);
            mlt.SetFieldNames(new String[] { "text" });
            mlt.SetBoost(true);

            // Every term boost is expected to be multiplied by this number.
            float boostFactor = 5;

            mlt.SetBoostFactor(boostFactor);

            BooleanQuery query   = (BooleanQuery)mlt.Like(new System.IO.StringReader("lucene release"));
            IList        clauses = query.Clauses();

            Assert.AreEqual(originalValues.Count, clauses.Count, "Expected " + originalValues.Count + " clauses.");

            for (int i = 0; i < clauses.Count; i++)
            {
                BooleanClause clause = (BooleanClause)clauses[i];
                TermQuery     tq     = (TermQuery)clause.GetQuery();

                // Check the boxed lookup result before unboxing it. Asserting on an
                // already-unboxed float can never fail (a value type is never null),
                // and a missing term would surface as a NullReferenceException from
                // the cast instead of as the intended assertion message.
                object originalBoost = originalValues[tq.GetTerm().Text()];
                Assert.IsNotNull(originalBoost, "Expected term " + tq.GetTerm().Text());
                float termBoost = (float)originalBoost;

                float totalBoost = termBoost * boostFactor;
                Assert.AreEqual(totalBoost, tq.GetBoost(), 0.0001, "Expected boost of " + totalBoost + " for term '"
                                + tq.GetTerm().Text() + "' got " + tq.GetBoost());
            }
        }
        /// <summary>
        /// Finds documents similar to the one stored under <paramref name="key"/> and
        /// returns one page of the matches.
        /// </summary>
        /// <remarks>
        /// The source document is looked up by its "key" term. A MoreLikeThis query over
        /// the "title", "summary", "content" and "tags" fields is then combined with:
        /// an exclusion of the source document itself, range filters on "publishStart"
        /// and "publishStop" around the current UTC tick count, and optionally a match
        /// on the source document's lower-cased "category" value.
        /// </remarks>
        /// <param name="key">Value of the "key" field identifying the source document.</param>
        /// <param name="resultOffset">Index of the first hit to include in the page.</param>
        /// <param name="resultLength">Maximum number of hits to include in the page.</param>
        /// <param name="matchCategory">
        /// When true and the source document has a non-empty "category" field, results
        /// must share that category.
        /// </param>
        /// <returns>
        /// A result whose <c>NumberOfHits</c> is the number of score docs returned by the
        /// search, whose <c>Hits</c> hold the requested page, and whose
        /// <c>SecondsTaken</c> covers the time from query assembly to completion.
        /// An empty <c>SearchResult</c> is returned when no document has the given key.
        /// </returns>
        public SearchResult FindSimular(string key, int resultOffset, int resultLength, bool matchCategory) {
            var pageQuery = new TermQuery(new Term("key", key));
            var topDocs = _searcher.Search(pageQuery, 1);
            if (topDocs.TotalHits == 0) {
                return new SearchResult();
            }

            var doc = topDocs.ScoreDocs[0].Doc;

            var moreLikeThis = new MoreLikeThis(_reader) {
                Analyzer = _analyzer,
                MinWordLen = 3
            };
            moreLikeThis.SetFieldNames(new[] { "title", "summary", "content", "tags" });
            moreLikeThis.SetStopWords(StopWords.DefaultEnglish);
            moreLikeThis.MinDocFreq = 2;

            var query = moreLikeThis.Like(doc);

            // Use a monotonic stopwatch for the elapsed time: subtracting two
            // DateTime.Now values is skewed (or even negative) when the wall clock
            // or the DST offset changes during the search.
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();
            var ticks = DateTime.UtcNow.Ticks;

            // Only documents whose publish window contains the current instant.
            Query publishStartQuery = NumericRangeQuery.NewLongRange("publishStart", null, ticks, true, false);
            Query publishStopQuery = NumericRangeQuery.NewLongRange("publishStop", ticks, null, false, true);

            var booleanQuery = new BooleanQuery {
                {query, Occur.MUST},
                {pageQuery, Occur.MUST_NOT},
                {publishStartQuery, Occur.MUST},
                {publishStopQuery, Occur.MUST}
            };

            if (matchCategory) {
                var document = _searcher.Doc(doc);
                var field = document.GetField("category");

                if (field != null && !string.IsNullOrEmpty(field.StringValue)) {
                    var categoryQuery = new TermQuery(new Term("category", field.StringValue.ToLowerInvariant()));
                    booleanQuery.Add(categoryQuery, Occur.MUST);
                }
            }

            var scoreDocs = _searcher.Search(booleanQuery, null, MaxHits, Sort.RELEVANCE).ScoreDocs;

            var result = new SearchResult { NumberOfHits = scoreDocs.Length };

            // Compute the end of the page in 64-bit arithmetic: with a very large
            // resultLength (e.g. int.MaxValue) the 32-bit sum wraps to a negative
            // bound and the page would silently come back empty.
            long resultUpperOffset = Math.Min((long)resultOffset + resultLength, scoreDocs.Length);

            for (int i = resultOffset; i < resultUpperOffset; i++) {
                Document document = _searcher.Doc(scoreDocs[i].Doc);

                // The parse result is ignored; pageId keeps whatever TryParseGuid
                // assigns when the stored value is missing or malformed.
                Guid pageId;
                (document.Get("pageId") ?? string.Empty).TryParseGuid(out pageId);

                var hit = new SearchHit {
                    PageId = pageId,
                    Path = document.Get("path"),
                    Title = document.Get("title"),
                    Excerpt = document.Get("summary")
                };

                result.Hits.Add(hit);
            }

            result.SecondsTaken = stopwatch.Elapsed.TotalSeconds;

            return result;
        }
Example #8
0
        /// <summary>
        /// Runs <c>MoreLikeThis</c> with boosting enabled (and no boost factor set) over
        /// the text "lucene release" and records the boost of each generated term query.
        /// </summary>
        /// <returns>A table mapping term text to the boost of its term query.</returns>
        private Hashtable GetOriginalValues()
        {
            MoreLikeThis baseline = new MoreLikeThis(reader);
            baseline.MinDocFreq = 1;
            baseline.MinTermFreq = 1;
            baseline.MinWordLen = 1;
            baseline.SetFieldNames(new String[] { "text" });
            baseline.Boost = true;

            BooleanQuery likeQuery = (BooleanQuery)baseline.Like(new System.IO.StringReader("lucene release"));
            IList generatedClauses = likeQuery.Clauses;

            Hashtable boostsByTerm = new Hashtable();
            foreach (object entry in generatedClauses)
            {
                BooleanClause booleanClause = (BooleanClause)entry;
                TermQuery termQuery = (TermQuery)booleanClause.Query;

                // Add (rather than the indexer) so a repeated term still raises an error.
                boostsByTerm.Add(termQuery.Term.Text, termQuery.Boost);
            }

            return boostsByTerm;
        }
Example #9
0
        /// <summary>
        /// Command-line demo: builds a MoreLikeThis query from a URL or a file, runs it
        /// against an index and prints up to 25 matching documents to standard output.
        /// </summary>
        /// <param name="a">
        /// Optional arguments: <c>-i &lt;index&gt;</c> selects the index,
        /// <c>-f &lt;file&gt;</c> the file to parse and <c>-url &lt;url&gt;</c> a URL to
        /// parse; when a URL is given it takes precedence over the file. Unrecognized
        /// arguments are ignored.
        /// </param>
        public static void Main(System.String[] a)
        {
            System.String indexName = "localhost_index";
            System.String fn        = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
            System.Uri    url       = null;
            for (int i = 0; i < a.Length; i++)
            {
                System.String option = a[i];
                if (!option.Equals("-i") && !option.Equals("-f") && !option.Equals("-url"))
                {
                    continue;
                }

                // An option given as the last argument has no value to consume;
                // report it instead of indexing past the end of the array.
                if (i + 1 >= a.Length)
                {
                    System.Console.Error.WriteLine("Missing value for option " + option);
                    return;
                }

                System.String value = a[++i];
                if (option.Equals("-i"))
                {
                    indexName = value;
                }
                else if (option.Equals("-f"))
                {
                    fn = value;
                }
                else
                {
                    url = new System.Uri(value);
                }
            }

            // Writes straight to the process's standard output stream, so it is
            // deliberately not disposed here; AutoFlush keeps output visible.
            System.IO.StreamWriter o = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
            o.AutoFlush = true;

            IndexReader r = IndexReader.Open(indexName);
            try
            {
                o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs");

                MoreLikeThis mlt = new MoreLikeThis(r);

                o.WriteLine("Query generation parameters:");
                o.WriteLine(mlt.DescribeParams());
                o.WriteLine();

                Query query = null;

                if (url != null)
                {
                    o.WriteLine("Parsing URL: " + url);
                    query = mlt.Like(url);
                }
                else if (fn != null)
                {
                    o.WriteLine("Parsing file: " + fn);
                    query = mlt.Like(new System.IO.FileInfo(fn));
                }

                o.WriteLine("q: " + query);
                o.WriteLine();

                IndexSearcher searcher = new IndexSearcher(indexName);
                try
                {
                    Hits hits = searcher.Search(query);
                    int  len  = hits.Length();

                    o.WriteLine("found: " + len + " documents matching");
                    o.WriteLine();
                    for (int i = 0; i < System.Math.Min(25, len); i++)
                    {
                        Document      d       = hits.Doc(i);
                        System.String summary = d.Get("summary");
                        o.WriteLine("score  : " + hits.Score(i));
                        o.WriteLine("url    : " + d.Get("url"));
                        o.WriteLine("\ttitle  : " + d.Get("title"));
                        if (summary != null)
                        {
                            o.WriteLine("\tsummary: " + summary);
                        }
                        o.WriteLine();
                    }
                }
                finally
                {
                    // Release the searcher's index resources even if searching fails.
                    searcher.Close();
                }
            }
            finally
            {
                // Release the reader even if query generation or searching fails.
                r.Close();
            }
        }
Example #10
0
        /// <summary>
        /// Command-line demo: builds a MoreLikeThis query from a URL or a file, runs it
        /// against an index and prints up to 25 matching documents to standard output.
        /// </summary>
        /// <param name="a">
        /// Optional arguments: <c>-i &lt;index&gt;</c> selects the index,
        /// <c>-f &lt;file&gt;</c> the file to parse and <c>-url &lt;url&gt;</c> a URL to
        /// parse; when a URL is given it takes precedence over the file. Unrecognized
        /// arguments are ignored.
        /// </param>
        public static void Main(System.String[] a)
        {
            System.String indexName = "localhost_index";
            System.String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
            System.Uri url = null;
            for (int i = 0; i < a.Length; i++)
            {
                bool isIndexOption = a[i].Equals("-i");
                bool isFileOption = a[i].Equals("-f");
                bool isUrlOption = a[i].Equals("-url");
                if (!(isIndexOption || isFileOption || isUrlOption))
                {
                    continue;
                }

                // A recognized option must be followed by its value; without this
                // check a trailing option would index past the end of the array.
                if (i == a.Length - 1)
                {
                    System.Console.Error.WriteLine("Missing value for option " + a[i]);
                    return;
                }

                i++;
                if (isIndexOption)
                {
                    indexName = a[i];
                }
                else if (isFileOption)
                {
                    fn = a[i];
                }
                else
                {
                    url = new System.Uri(a[i]);
                }
            }

            // Wraps the process's standard output stream; left undisposed on purpose,
            // with AutoFlush so each line is written out immediately.
            System.IO.StreamWriter o = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
            o.AutoFlush = true;

            IndexReader r = IndexReader.Open(indexName);
            IndexSearcher searcher = null;
            try
            {
                o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs");

                MoreLikeThis mlt = new MoreLikeThis(r);

                o.WriteLine("Query generation parameters:");
                o.WriteLine(mlt.DescribeParams());
                o.WriteLine();

                Query query = null;
                if (url != null)
                {
                    o.WriteLine("Parsing URL: " + url);
                    query = mlt.Like(url);
                }
                else if (fn != null)
                {
                    o.WriteLine("Parsing file: " + fn);
                    query = mlt.Like(new System.IO.FileInfo(fn));
                }

                o.WriteLine("q: " + query);
                o.WriteLine();
                searcher = new IndexSearcher(indexName);

                Hits hits = searcher.Search(query);
                int len = hits.Length();
                o.WriteLine("found: " + len + " documents matching");
                o.WriteLine();
                for (int i = 0; i < System.Math.Min(25, len); i++)
                {
                    Document d = hits.Doc(i);
                    System.String summary = d.Get("summary");
                    o.WriteLine("score  : " + hits.Score(i));
                    o.WriteLine("url    : " + d.Get("url"));
                    o.WriteLine("\ttitle  : " + d.Get("title"));
                    if (summary != null)
                    {
                        o.WriteLine("\tsummary: " + summary);
                    }
                    o.WriteLine();
                }
            }
            finally
            {
                // Release index resources on every exit path, including failures.
                if (searcher != null)
                {
                    searcher.Close();
                }
                r.Close();
            }
        }
Example #11
0
 /// <summary>
 /// Builds a boosted MoreLikeThis query for the indexed document that corresponds to
 /// <paramref name="postId"/>, comparing the "Title", "Name", "Description",
 /// "Publisher" and "Author" fields.
 /// </summary>
 /// <param name="postId">Identifier resolved to a document number via <c>GetLuceneDocumentNumber</c>.</param>
 /// <returns>The generated query, or <c>null</c> when the resolved document number is 0.</returns>
 private static Query CreateMoreLikeThisQuery(int postId)
 {
     int docNum = GetLuceneDocumentNumber(postId);

     // NOTE(review): 0 is treated as "no document" here; confirm that
     // GetLuceneDocumentNumber never returns 0 for a real document.
     if (docNum == 0)
     {
         return null;
     }

     var analyzer = new StandardAnalyzer(_version);
     using (var searcher = new IndexSearcher(_directory, false))
     {
         var moreLikeThis = new MoreLikeThis(searcher.IndexReader);
         moreLikeThis.Analyzer = analyzer;
         moreLikeThis.SetFieldNames(new[] { "Title", "Name", "Description", "Publisher", "Author" });
         moreLikeThis.MinDocFreq = 1;
         moreLikeThis.MinTermFreq = 1;
         moreLikeThis.Boost = true;

         // The query is fully built before the searcher is disposed on exit.
         Query similarityQuery = moreLikeThis.Like(docNum);
         return similarityQuery;
     }
 }