示例#1
0
        /// <summary>
        /// Builds a MoreLikeThis query from the indexed document of an entry and runs it to find related content.
        /// </summary>
        /// <param name="entryId">Id of the entry to find related content for.</param>
        /// <param name="max">Hit limit used when looking up the entry; the related-content query is run with max + 1.</param>
        /// <param name="blogId">Blog id handed on to PerformQuery.</param>
        /// <returns>The empty list when the entry has no indexed document; otherwise the result of PerformQuery.</returns>
        public IEnumerable <SearchEngineResult> RelatedContents(int entryId, int max, int blogId)
        {
            var results = new List<SearchEngineResult>();

            // Step 1: find the Lucene document that represents the entry itself.
            Query   entryQuery = GetIdSearchQuery(entryId);
            TopDocs entryHits  = Searcher.Search(entryQuery, max);

            if (entryHits.scoreDocs.Length <= 0)
            {
                return results;
            }

            int sourceDocNumber = entryHits.scoreDocs[0].doc;

            // Step 2: configure MoreLikeThis over a reader obtained from the index writer.
            var indexReader = DoWriterAction(w => w.GetReader());
            var similarity  = new MoreLikeThis(indexReader);

            similarity.SetAnalyzer(_analyzer);
            similarity.SetFieldNames(new[] { Title, Body, Tags });
            similarity.SetMinDocFreq(_settings.Parameters.MinimumDocumentFrequency);
            similarity.SetMinTermFreq(_settings.Parameters.MinimumTermFrequency);
            similarity.SetBoost(_settings.Parameters.MoreLikeThisBoost);

            // Step 3: run the generated query, asking for one hit more than max.
            var relatedQuery = similarity.Like(sourceDocNumber);

            return PerformQuery(results, relatedQuery, max + 1, blogId, entryId);
        }
示例#2
0
        /// <summary>
        /// Builds a MoreLikeThis query from the indexed document of an entry and runs it to find related content.
        /// </summary>
        /// <param name="entryId">Id of the entry to find related content for.</param>
        /// <param name="max">Hit limit used when looking up the entry; the related-content query is run with max + 1.</param>
        /// <param name="blogId">Blog id handed on to PerformQuery.</param>
        /// <returns>
        /// A response carrying the lookup's total hit count and an empty result list when the entry
        /// has no indexed document; otherwise the result of PerformQuery.
        /// </returns>
        public SearchEngineResponse RelatedContents(int entryId, int max, int blogId)
        {
            var results = new List<SearchEngineResult>();

            // Step 1: find the Lucene document that represents the entry itself.
            Query   entryQuery = GetIdSearchQuery(entryId);
            TopDocs entryHits  = Searcher.Search(entryQuery, max);

            if (entryHits.scoreDocs.Length <= 0)
            {
                var emptyResponse = new SearchEngineResponse();
                emptyResponse.TotalCount = entryHits.totalHits;
                emptyResponse.Results    = results;
                return emptyResponse;
            }

            int sourceDocNumber = entryHits.scoreDocs[0].doc;

            // Step 2: configure MoreLikeThis over a reader obtained from the index writer.
            var indexReader = DoWriterAction(w => w.GetReader());
            var similarity  = new MoreLikeThis(indexReader);

            similarity.SetAnalyzer(_analyzer);
            similarity.SetFieldNames(new[] { Categories, Silouhettes, Tags });

            // Step 3: run the generated query, asking for one hit more than max.
            var relatedQuery = similarity.Like(sourceDocNumber);

            return PerformQuery(results, relatedQuery, 0, max + 1, blogId, entryId);
        }
    /// <summary>
    /// On the initial (non-postback) request, binds the results repeater to either the documents
    /// similar to the item named by the "relatedto" request value or the hits for the "query"
    /// request value. When both values are present, the "query" results replace the similar ones.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (Page.IsPostBack)
        {
            return;
        }

        List<SearchResult> results = new List<SearchResult>();

        if (!string.IsNullOrEmpty(Request["relatedto"]))
        {
            // The index is named after the current context database.
            string indexName    = "sitecore_" + Sitecore.Context.Database.Name + "_index";
            var    luceneIndex  = (LuceneIndex)ContentSearchManager.GetIndex(indexName);
            var    indexReader  = luceneIndex.CreateReader(LuceneIndexAccess.ReadOnly);
            var    moreLikeThis = new MoreLikeThis(indexReader);
            CreateMLTQuery(moreLikeThis);

            string relatedItemId = Request["relatedto"];
            var    indexSearcher = (IndexSearcher)luceneIndex.CreateSearcher(LuceneIndexAccess.ReadOnly);
            int    sourceDocId   = GetDocumentId(relatedItemId, indexSearcher);

            // Passed to ShowSimilarResults as its last argument.
            int similarResultLimit = 5;

            results = ShowSimilarResults(indexSearcher, moreLikeThis, sourceDocId, similarResultLimit);

            // OR using MoreLikeThisQuery
            // string description = SelectedItem["Description"];
            // results = ShowSimilarResultsUsingMLTQuery(searcher, description, new string[] { "title", "description" }, MinimumNumberShouldMatch);
        }

        if (!string.IsNullOrEmpty(Request["query"]))
        {
            results = SearchResults(Request["query"]);
        }

        repeaterResults.DataSource = results;
        repeaterResults.DataBind();
    }
示例#4
0
        /// <summary>
        /// Finds documents similar to the configured document, comparing the configured fields
        /// and returning at most the configured number of hits minus the first one.
        /// Assumes the index is up to date.
        /// </summary>
        /// <returns>
        /// The similar documents found; empty when IsInit() is false or when the document number
        /// lookup returns 0.
        /// </returns>
        public IEnumerable <SearchResultItem> FindMoreLikeThis()
        {
            var similarItems = new List<SearchResultItem>();

            if (!IsInit())
            {
                return similarItems;
            }

            var mlt = new MoreLikeThis(reader);
            mlt.SetFieldNames(fieldsToSearch.ToArray());
            mlt.SetMinTermFreq(1);
            mlt.SetMinDocFreq(1);

            int sourceDocNumber = GetLuceneDocNo(docId);
            if (sourceDocNumber == 0)
            {
                return similarItems;
            }

            var similarityQuery = mlt.Like(sourceDocNumber);
            var hits            = searcher.Search(similarityQuery, maxNo);
            int hitCount        = hits.ScoreDocs.Length;

            // Index 0 is skipped: the first hit is assumed to be the source document itself.
            for (int hitIndex = 1; hitIndex < hitCount; hitIndex++)
            {
                var    document = reader.Document(hits.ScoreDocs[hitIndex].doc);
                string pageName = document.GetField("nodeName").StringValue();
                int    nodeId   = int.Parse(document.GetField("__NodeId").StringValue());

                similarItems.Add(new SearchResultItem { PageName = pageName, NodeId = nodeId });
            }

            return similarItems;
        }
        /// <summary>
        /// Checks that setting BoostFactor multiplies every term boost of the generated query by
        /// that factor, using the boosts from OriginalValues as the baseline.
        /// </summary>
        public void TestBoostFactor()
        {
            IDictionary<string, float?> baselineBoosts = OriginalValues;

            // Every term boost is expected to be multiplied by this number.
            float boostFactor = 5;

            MoreLikeThis mlt = new MoreLikeThis(reader);
            mlt.Analyzer = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false);
            mlt.MinDocFreq = 1;
            mlt.MinTermFreq = 1;
            mlt.MinWordLen = 1;
            mlt.FieldNames = new[] { "text" };
            mlt.Boost = true;
            mlt.BoostFactor = boostFactor;

            BooleanQuery boostedQuery = (BooleanQuery)mlt.Like(new StringReader("lucene release"), "text");
            IList<BooleanClause> boostedClauses = boostedQuery.Clauses;

            assertEquals("Expected " + baselineBoosts.Count + " clauses.", baselineBoosts.Count, boostedClauses.Count);

            foreach (BooleanClause boostedClause in boostedClauses)
            {
                TermQuery termQuery = (TermQuery)boostedClause.Query;
                float? baselineBoost = baselineBoosts[termQuery.Term.Text()];
                assertNotNull("Expected term " + termQuery.Term.Text(), baselineBoost);

                float totalBoost = (float) (baselineBoost * boostFactor);
                assertEquals("Expected boost of " + totalBoost + " for term '" + termQuery.Term.Text() + "' got " + termQuery.Boost, totalBoost, termQuery.Boost, 0.0001);
            }
        }
示例#6
0
        /// <summary>
        /// Returns articles similar to the given article, based on a MoreLikeThis query over the
        /// "Name", "Body", "TagName" and "FeedName" fields.
        /// </summary>
        /// <param name="articleId">Id of the article to find related articles for.</param>
        /// <param name="count">Number of related articles wanted; one extra hit is requested from the index.</param>
        /// <returns>The related articles without the source article; empty when the article is not indexed.</returns>
        public List <Article> GetRelatedArticles(int articleId, int count)
        {
            // One reader serves both the lookup and the similarity query, so the document number
            // resolved below always refers to the same index snapshot MoreLikeThis reads from.
            // The using blocks release reader and searcher on every path, including the early
            // return and exceptions.
            using (var reader = IndexReader.Open(_directory, true))
            using (var searcher = new IndexSearcher(reader))
            {
                var searchQuery = new TermQuery(new Term("Id", articleId.ToString()));
                var doc         = searcher.Search(searchQuery, 1);

                if (doc.TotalHits == 0)
                {
                    return new List<Article>();
                }

                var docId = doc.ScoreDocs[0].Doc;

                MoreLikeThis mlt = new MoreLikeThis(reader);
                mlt.SetFieldNames(new[] { "Name", "Body", "TagName", "FeedName" });

                Query query = mlt.Like(docId);
                var   hits  = searcher.Search(query, count + 1);

                // Materialize while the searcher is still open: Where (and possibly
                // ConvertToArticles) is lazily evaluated.
                return ConvertToArticles(hits, searcher, 1, count)
                       .Where(a => a.Id != articleId)
                       .ToList();
            }
        }
示例#7
0
        /// <summary>
        /// Creates a MoreLikeThis query for the indexed document of a project, comparing the
        /// ProductId, Title, Description, Price, ProductStatus and Category fields.
        /// </summary>
        /// <param name="projectId">Id of the project to find similar projects for.</param>
        /// <returns>The generated query, or null when the document number lookup returns 0.</returns>
        private static Query CreateMoreProjectsLikeThisQuery(int projectId)
        {
            int sourceDocNumber = GetLuceneProjectDocumentNumber(projectId);
            if (sourceDocNumber == 0)
            {
                return null;
            }

            var analyzer = new StandardAnalyzer(_version);

            using (var searcher = new IndexSearcher(_directory, false))
            {
                IndexReader reader = searcher.IndexReader;

                var similarity = new MoreLikeThis(reader);
                similarity.Analyzer = analyzer;

                var comparedFields = new[]
                {
                    StronglyTyped.PropertyName <LuceneSearchModel>(x => x.ProductId),
                    StronglyTyped.PropertyName <LuceneSearchModel>(x => x.Title),
                    StronglyTyped.PropertyName <LuceneSearchModel>(x => x.Description),
                    StronglyTyped.PropertyName <LuceneSearchModel>(x => x.Price),
                    StronglyTyped.PropertyName <LuceneSearchModel>(x => x.ProductStatus),
                    StronglyTyped.PropertyName <LuceneSearchModel>(x => x.Category)
                };
                similarity.SetFieldNames(comparedFields);

                similarity.MinDocFreq  = 1;
                similarity.MinTermFreq = 1;
                similarity.Boost       = true;

                // The query is generated while the searcher (and its reader) is still open.
                return similarity.Like(sourceDocNumber);
            }
        }
示例#8
0
        /// <summary>
        /// Runs a MoreLikeThis query for an indexed document over the "Title" and "Content" fields
        /// and returns the Id and Title of each hit.
        /// </summary>
        /// <param name="indexName">Not referenced by this method.</param>
        /// <param name="indexDocumentId">Lucene document number of the source document.</param>
        /// <param name="maxDocs">Maximum number of hits to collect.</param>
        /// <returns>One CorpusDocument per collected hit.</returns>
        public static IList <CorpusDocument> GetMoreLikeThis(string indexName, int indexDocumentId, int maxDocs)
        {
            // See: http://lucene.apache.org/java/2_2_0/api/org/apache/lucene/search/similar/MoreLikeThis.html
            var similarity = new MoreLikeThis(Searcher.GetIndexReader());

            similarity.SetAnalyzer(new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29));
            similarity.SetFieldNames(new[] { "Title", "Content" });
            similarity.SetMinWordLen(4); // improve relevancy

            var similarityQuery = similarity.Like(indexDocumentId);

            // Collect the top maxDocs hits.
            var collector = TopScoreDocCollector.create(maxDocs, true);
            Searcher.Search(similarityQuery, collector);
            var scoreDocs = collector.TopDocs().ScoreDocs;

            var documents = new List<CorpusDocument>(maxDocs);

            foreach (var scoreDoc in scoreDocs)
            {
                var stored = Searcher.Doc(scoreDoc.doc);

                var corpusDocument = new CorpusDocument();
                corpusDocument.Id    = stored.Get("Id");
                corpusDocument.Title = stored.Get("Title");

                documents.Add(corpusDocument);
            }

            return documents;
        }
示例#9
0
        /// <summary>
        /// Checks that setting BoostFactor multiplies every term boost of the generated query by
        /// that factor, using the boosts from GetOriginalValues() as the baseline.
        /// </summary>
        public void TestBoostFactor()
        {
            IDictionary<string, float> baselineBoosts = GetOriginalValues();

            // Every term boost is expected to be multiplied by this number.
            float boostFactor = 5;

            MoreLikeThis mlt = new MoreLikeThis(reader);
            mlt.Analyzer    = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false);
            mlt.MinDocFreq  = 1;
            mlt.MinTermFreq = 1;
            mlt.MinWordLen  = 1;
            mlt.FieldNames  = new[] { "text" };
            mlt.ApplyBoost  = true;
            mlt.BoostFactor = boostFactor;

            BooleanQuery boostedQuery = (BooleanQuery)mlt.Like(new StringReader("lucene release"), "text");
            IList<BooleanClause> boostedClauses = boostedQuery.Clauses;

            assertEquals("Expected " + baselineBoosts.Count + " clauses.", baselineBoosts.Count, boostedClauses.Count);

            foreach (BooleanClause boostedClause in boostedClauses)
            {
                TermQuery termQuery     = (TermQuery)boostedClause.Query;
                float     baselineBoost = baselineBoosts[termQuery.Term.Text];
                assertNotNull("Expected term " + termQuery.Term.Text, baselineBoost);

                float totalBoost = (float)(baselineBoost * boostFactor);
                assertEquals("Expected boost of " + totalBoost + " for term '"
                             + termQuery.Term.Text + "' got " + termQuery.Boost, totalBoost, termQuery.Boost, 0.0001);
            }
        }
示例#10
0
        /// <summary>
        /// Uses Lucene's MoreLikeThis feature to find items similar to the one passed in.
        /// </summary>
        /// <param name="item">The item to find similar items for.</param>
        /// <param name="pageNo">Page number of the result set.</param>
        /// <param name="pageSize">Number of items to return in the result set.</param>
        /// <returns>
        /// Items similar to the one passed in, or the NoResults page when the item is not in the
        /// index. The item itself is not excluded from the results.
        /// </returns>
        public IPagedList <T> MoreLikeThis(T item, int pageNo, int pageSize)
        {
            using (IndexSearcher indexSearcher = _luceneIndexer.GetSearcher())
            {
                var itemId   = _luceneIndexer.GetIdentifier(item);
                var docQuery = new TermQuery(new Term(_luceneIndexer.PrimaryKeyField, itemId));

                var docHit = indexSearcher.Search(docQuery, 1);

                if (!docHit.ScoreDocs.Any())
                {
                    return NoResults(pageNo, pageSize);
                }

                var moreLikeThis = new MoreLikeThis(indexSearcher.IndexReader)
                {
                    // MinDocFreq, not MaxDocFreq: MoreLikeThis ignores every term that occurs in
                    // more than MaxDocFreq documents, so MaxDocFreq = 0 discarded all terms and the
                    // generated query could never match anything. Zero minimums accept every term.
                    MinDocFreq  = 0,
                    MinTermFreq = 0
                };

                //moreLikeThis.SetFieldNames(_luceneIndexer.FullTextFields);

                var likeQuery = moreLikeThis.Like(docHit.ScoreDocs[0].Doc);

                var query = new BooleanQuery
                {
                    { likeQuery, Occur.MUST },
                    //{docQuery, Occur.MUST_NOT} // Exclude the doc we basing similar matches on
                };

                return Search(query, pageNo, pageSize, indexSearcher);
            }
        }
 /// <summary>
 /// Applies this page's MoreLikeThis settings to the given instance: the "title" and
 /// "description" fields, the English stop-word set, a StandardAnalyzer, minimum term and
 /// document frequencies of 1, and at most 15 query terms.
 /// </summary>
 /// <param name="query">The MoreLikeThis instance to configure.</param>
 private void CreateMLTQuery(MoreLikeThis query)
 {
     // Text analysis.
     query.Analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);

     // Term selection thresholds.
     query.MinTermFreq   = 1;
     query.MinDocFreq    = 1;
     query.MaxQueryTerms = 15;

     // Fields compared and words ignored.
     query.SetFieldNames(new[] { "title", "description" });
     query.SetStopWords(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
 }
示例#12
0
 /// <summary>
 /// Creates the MoreLikeThis instance for the context's index reader (field "Text",
 /// StandardAnalyzer, minimum document frequency 2, minimum term frequency 1) and then
 /// runs the base class preparation.
 /// </summary>
 /// <param name="context">Execution context providing the searcher and forwarded to the base class.</param>
 public override void PrepareSearchSettings(IQueryExecutionContext context)
 {
     mlt = new MoreLikeThis(context.Searcher.IndexReader)
     {
         MinDocFreq  = 2,
         MinTermFreq = 1,
         Analyzer    = new StandardAnalyzer(Version.LUCENE_30)
     };
     mlt.SetFieldNames(new[] { "Text" });

     base.PrepareSearchSettings(context);
 }
示例#13
0
        /// <summary>
        /// Finds stories related to the given story using a MoreLikeThis query over the
        /// "title" and "tags" fields.
        /// </summary>
        /// <param name="hostId">Host whose index is searched.</param>
        /// <param name="storyId">Story to find related stories for; it is left out of the results.</param>
        /// <returns>
        /// The stories loaded for up to NUMBER_OF_RELATED_STORIES_TO_RETURN related ids,
        /// or null when the story cannot be mapped to a document id.
        /// </returns>
        public StoryCollection Find(int hostId, int storyId)
        {
            int? sourceDocId = ConvertStoryIdtoDocId(hostId, storyId);

            if (!sourceDocId.HasValue)
            {
                return null;
            }

            IndexSearcher searcher = SearchQuery.GetSearcher(hostId);
            IndexReader   reader   = searcher.GetIndexReader();

            MoreLikeThis similarity = new MoreLikeThis(reader);
            similarity.SetAnalyzer(new DnkAnalyzer());

            // These values control the query used to find related/similar stories:
            //  - only the title and tags fields are compared,
            //  - a term must appear 1 or more times in the source document,
            //  - the query will have at most 5 terms,
            //  - words shorter than 3 characters are ignored,
            //  - a term must appear in at least 4 documents.
            similarity.SetFieldNames(new string[] { "title", "tags" });
            similarity.SetMinTermFreq(1);
            similarity.SetMaxQueryTerms(5);
            similarity.SetMinWordLen(3);
            similarity.SetMinDocFreq(4);
            similarity.SetStopWords(StopWords());
            similarity.SetBoost(true);

            Query relatedQuery = similarity.Like(sourceDocId.Value);
            Hits  relatedHits  = searcher.Search(relatedQuery);

            List<int> relatedStoryIds = new List<int>();

            for (int hitIndex = 0; hitIndex < relatedHits.Length(); hitIndex++)
            {
                Document hit        = relatedHits.Doc(hitIndex);
                int      hitStoryId = int.Parse(hit.GetField("id").StringValue());

                // Skip the story the search started from.
                if (hitStoryId == storyId)
                {
                    continue;
                }

                relatedStoryIds.Add(hitStoryId);

                if (relatedStoryIds.Count == NUMBER_OF_RELATED_STORIES_TO_RETURN)
                {
                    break;
                }
            }

            return SearchQuery.LoadStorySearchResults(relatedStoryIds);
        }
示例#14
0
        /// <summary>
        /// Generates a query for the "foobar" field on a MoreLikeThis configured with the
        /// "text" and "foobar" fields. The test has no assertions: it passes when Like completes.
        /// </summary>
        public void TestMultiFields()
        {
            var multiFieldMlt = new MoreLikeThis(reader)
            {
                Analyzer    = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false),
                MinDocFreq  = 1,
                MinTermFreq = 1,
                MinWordLen  = 1,
                FieldNames  = new[] { "text", "foobar" }
            };

            multiFieldMlt.Like(new StringReader("this is a test"), "foobar");
        }
        /// <summary>
        /// Searches the indexed documents for ones similar to the text <paramref name="original"/>
        /// using a MoreLikeThis (TF-IDF based) query, tracing the query and each returned hit.
        /// </summary>
        /// <param name="original">Text to compare the indexed documents against.</param>
        /// <param name="stopWords">Words MoreLikeThis should ignore.</param>
        /// <param name="analyzer">Analyzer used to tokenize <paramref name="original"/>.</param>
        /// <returns>The stored documents of the best hits, at most 25.</returns>
        public static List <Document> moreLikeThisAnalyzer(String original, ISet <string> stopWords, Lucene.Net.Analysis.Analyzer analyzer)
        {
            Trace.WriteLine("Realizando la Búsqueda");
            List<Document> DocumenResult = new List<Document>();

            // Reader and searcher are opened for this call only, so they are released here on
            // every path. The returned Document objects are already loaded by then.
            using (IndexReader indexReader = IndexReader.Open(_directory, true))
            using (IndexSearcher indexSearcher = new IndexSearcher(indexReader))
            {
                MoreLikeThis mlt = new MoreLikeThis(indexReader);

                mlt.SetFieldNames(DEFAULT_FIELD_NAMES);
                mlt.MinDocFreq    = DEFALT_MIN_DOC_FREQ;
                mlt.MinTermFreq   = DEFAULT_MIN_TERM_FREQ;
                mlt.MaxQueryTerms = MAX_QUERY_TERMS;
                mlt.MinWordLen    = DEFAULT_MIN_WORD_LENGTH;
                mlt.Analyzer      = analyzer;
                mlt.SetStopWords(stopWords);

                Query query = mlt.Like(new System.IO.StringReader(original));

                int topCount = DEFAULT_DOCUMENT_TO_SEARCH;

                TopScoreDocCollector collector = TopScoreDocCollector.Create(topCount, true);

                indexSearcher.Search(query, collector);
                ScoreDoc[] hits = collector.TopDocs().ScoreDocs;

                int len = hits.Length;

                Trace.WriteLine("Entering");
                Trace.WriteLine("-------------------------------------------");
                Trace.WriteLine("original :" + original);
                Trace.WriteLine("query: " + query);
                Trace.WriteLine("found: " + len + " documents");
                for (int i = 0; i < Math.Min(25, len); i++)
                {
                    int d = hits[i].Doc;
                    Trace.WriteLine("score   : " + hits[i].Score);
                    Trace.WriteLine("name    : " + d.ToString());

                    // Add the stored document to the result list.
                    Document doc = indexSearcher.Doc(d);
                    DocumenResult.Add(doc);
                }
                Trace.WriteLine("-------------------------------------------");
                Trace.WriteLine("Exiting");
            }

            return DocumenResult;
        }
示例#16
0
        /// <summary>
        /// Builds a MoreLikeThis query from the concatenated title, bullet points and content of
        /// the given job ads.
        /// </summary>
        /// <param name="mlt">Configured MoreLikeThis instance used to generate the query.</param>
        /// <param name="jobAdIds">Ids of the job ads whose text feeds the query.</param>
        /// <returns>The query generated from the combined job ad text.</returns>
        private LuceneQuery GetApplicationsMltQuery(MoreLikeThis mlt, IEnumerable <Guid> jobAdIds)
        {
            var analysisText = new StringBuilder();

            foreach (var jobAd in _jobAdsQuery.GetJobAds <JobAd>(jobAdIds))
            {
                analysisText.AppendLine(jobAd.Title);

                // Missing bullet points contribute an empty line.
                if (jobAd.Description.BulletPoints == null)
                {
                    analysisText.AppendLine(string.Empty);
                }
                else
                {
                    analysisText.AppendLine(jobAd.Description.BulletPoints.ToString());
                }

                analysisText.AppendLine(jobAd.Description.Content);
            }

            var textReader = new StringReader(analysisText.ToString());

            return mlt.like(textReader);
        }
示例#17
0
        /// <summary>
        /// Rewrites this query into the boolean query MoreLikeThis generates for LikeText,
        /// requiring a PercentTermsToMatch share of its clauses to match.
        /// </summary>
        /// <param name="reader">Index reader MoreLikeThis works against.</param>
        /// <returns>The generated boolean query.</returns>
        public override Query Rewrite(IndexReader reader)
        {
            var similarity = new MoreLikeThis(reader);
            similarity.FieldNames  = MoreLikeFields;
            similarity.Analyzer    = Analyzer;
            similarity.MinTermFreq = MinTermFrequency;

            // A negative MinDocFreq leaves the MoreLikeThis default in place.
            if (MinDocFreq >= 0)
            {
                similarity.MinDocFreq = MinDocFreq;
            }

            similarity.MaxQueryTerms = MaxQueryTerms;
            similarity.StopWords     = StopWords;

            var likeText     = new StringReader(LikeText);
            var rewritten    = (BooleanQuery)similarity.Like(likeText, fieldName);
            var termClauses  = rewritten.Clauses;

            // Require PercentTermsToMatch of the generated clauses to match (truncated to an int).
            rewritten.MinimumNumberShouldMatch = (int)(termClauses.Length * PercentTermsToMatch);

            return rewritten;
        }
    /// <summary>
    /// Runs the MoreLikeThis query for a document and converts each hit into a SearchResult
    /// built from its "title", "description" and "_group" values.
    /// </summary>
    /// <param name="searcher">Searcher used to run the query and load the hit documents.</param>
    /// <param name="mlt">Configured MoreLikeThis instance.</param>
    /// <param name="docId">Lucene document number of the source document.</param>
    /// <param name="topHits">Maximum number of hits to retrieve.</param>
    /// <returns>One SearchResult per hit.</returns>
    private List <SearchResult> ShowSimilarResults(IndexSearcher searcher, MoreLikeThis mlt, int docId, int topHits)
    {
        BooleanQuery similarityQuery = (BooleanQuery)mlt.Like(docId);
        TopDocs      topDocs         = searcher.Search(similarityQuery, topHits);

        List<SearchResult> similarResults = new List<SearchResult>();

        foreach (var hit in topDocs.ScoreDocs)
        {
            Document stored = searcher.Doc(hit.Doc);

            similarResults.Add(new SearchResult(stored.Get("title"), stored.Get("description"), stored.Get("_group")));
        }

        return similarResults;
    }
示例#19
0
        /// <summary>
        /// Builds a MoreLikeThis query from a candidate's resume text: the description and title
        /// of the first five jobs, the desired job title, the summary and the skills.
        /// </summary>
        /// <param name="mlt">Configured MoreLikeThis instance used to generate the query.</param>
        /// <param name="candidate">Candidate supplying the desired job title.</param>
        /// <param name="resume">Resume supplying jobs, summary and skills.</param>
        /// <param name="method">Method name reported in the trace events.</param>
        /// <returns>The generated query, or null when fewer than 1000 characters of text were collected.</returns>
        private static LuceneQuery GetCandidateMltQuery(MoreLikeThis mlt, ICandidate candidate, Resume resume, string method)
        {
            // Below this many characters the text is considered insufficient for analysis.
            const int minimumAnalysisLength = 1000;

            var analysisText = new StringBuilder();

            // Start with the relevant data of the first five jobs.
            if (resume.Jobs != null)
            {
                foreach (var job in resume.Jobs.Take(5))
                {
                    analysisText.AppendLine(job.Description);
                    analysisText.AppendLine(job.Title);
                }
            }

            if (EventSource.IsEnabled(Event.Trace))
            {
                EventSource.Raise(Event.Trace, method, "Building MLT Query #1", Event.Arg("Analysis Text", analysisText.ToString()));
            }

            // Then add the additional relevant resume data.
            analysisText.AppendLine(candidate.DesiredJobTitle);
            analysisText.AppendLine(resume.Summary);
            analysisText.AppendLine(resume.Skills);

            if (analysisText.Length < minimumAnalysisLength)
            {
                if (EventSource.IsEnabled(Event.Trace))
                {
                    EventSource.Raise(Event.Trace, method, "MLT Query aborted - insufficient text for analysis", Event.Arg("Analysis Text", analysisText.ToString()));
                }

                return null;
            }

            if (EventSource.IsEnabled(Event.Trace))
            {
                EventSource.Raise(Event.Trace, method, "Building MLT Query #2", Event.Arg("Analysis Text", analysisText.ToString()));
            }

            var textReader = new StringReader(analysisText.ToString());

            return mlt.like(textReader);
        }
示例#20
0
 /// <summary>
 /// Prepares the classifier to work against the given Lucene index: stores the field names and
 /// filter query, and creates the MoreLikeThis instance and index searcher over the reader.
 /// </summary>
 /// <param name="atomicReader">the reader to use to access the Lucene index</param>
 /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
 /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
 /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
 /// <param name="query">the query to filter which documents use for training</param>
 public virtual void Train(AtomicReader atomicReader, string[] textFieldNames, string classFieldName, Analyzer analyzer, Query query)
 {
     _textFieldNames = textFieldNames;
     _classFieldName = classFieldName;

     _mlt = new MoreLikeThis(atomicReader)
     {
         Analyzer   = analyzer,
         FieldNames = _textFieldNames
     };
     _indexSearcher = new IndexSearcher(atomicReader);

     // Non-positive thresholds leave the MoreLikeThis defaults untouched.
     if (_minDocsFreq > 0)
     {
         _mlt.MinDocFreq = _minDocsFreq;
     }

     if (_minTermFreq > 0)
     {
         _mlt.MinTermFreq = _minTermFreq;
     }

     _query = query;
 }
示例#21
0
 /// <summary>
 /// Prepares the classifier to work against the given Lucene index: stores the field names and
 /// filter query, and creates the MoreLikeThis instance and index searcher over the reader.
 /// </summary>
 /// <param name="atomicReader">the reader to use to access the Lucene index</param>
 /// <param name="textFieldNames">the names of the fields to be used to compare documents</param>
 /// <param name="classFieldName">the name of the field containing the class assigned to documents</param>
 /// <param name="analyzer">the analyzer used to tokenize / filter the unseen text</param>
 /// <param name="query">the query to filter which documents use for training</param>
 public virtual void Train(AtomicReader atomicReader, string[] textFieldNames, string classFieldName, Analyzer analyzer, Query query)
 {
     this.textFieldNames = textFieldNames;
     this.classFieldName = classFieldName;

     mlt = new MoreLikeThis(atomicReader)
     {
         Analyzer   = analyzer,
         FieldNames = this.textFieldNames
     };
     indexSearcher = new IndexSearcher(atomicReader);

     // Non-positive thresholds leave the MoreLikeThis defaults untouched.
     if (minDocsFreq > 0)
     {
         mlt.MinDocFreq = minDocsFreq;
     }

     if (minTermFreq > 0)
     {
         mlt.MinTermFreq = minTermFreq;
     }

     this.query = query;
 }
示例#22
0
        /// <summary>
        /// Finds sources whose "FileData" field is similar to that of the given source, excluding
        /// the source itself.
        /// </summary>
        /// <param name="sourceId">Value of the "Id" field of the source to compare against.</param>
        /// <param name="numResults">Maximum number of similar sources to return.</param>
        /// <returns>The similar sources found; empty when the source is not in the index.</returns>
        public IList <LuceneSearchResult> GetSourcesLikeThis(int sourceId, int numResults)
        {
            IList<LuceneSearchResult> results = new List<LuceneSearchResult>();

            using (SearcherManager manager = new SearcherManager(SourceIndexWriterSingleton.Instance))
            {
                this.searcher = manager.Acquire().Searcher;

                // First search: locate the indexed document of the source itself.
                Query query = NumericRangeQuery.NewIntRange("Id", sourceId, sourceId, true, true);

                this.topDocs = this.searcher.Search(query, null, 1);

                bool sourceFound = this.topDocs != null
                                   && this.topDocs.ScoreDocs != null
                                   && this.topDocs.ScoreDocs.Length > 0;
                if (!sourceFound)
                {
                    return results;
                }

                // Second search: MoreLikeThis query built from the source document.
                // NOTE(review): the document number comes from this.searcher but is resolved
                // against a separately opened reader - confirm both see the same index state.
                using (IndexReader reader = IndexReader.Open(SourceIndexWriterSingleton.Directory, true))
                {
                    MoreLikeThis mlt = new MoreLikeThis(reader);
                    mlt.SetFieldNames(new string[] { "FileData" });
                    mlt.MinTermFreq = 1;
                    mlt.MinDocFreq  = 1;

                    BooleanQuery bq = new BooleanQuery();
                    bq.Add(mlt.Like(this.topDocs.ScoreDocs[0].Doc), Occur.MUST);
                    bq.Add(query, Occur.MUST_NOT); // leave the source itself out of the results
                    log.Info("More like this query: " + bq.ToString());

                    TopDocs similarDocs = this.searcher.Search(bq, numResults);

                    if (similarDocs.TotalHits > 0)
                    {
                        foreach (ScoreDoc scoreDoc in similarDocs.ScoreDocs)
                        {
                            results.Add(new LuceneSearchResult(this.searcher.Doc(scoreDoc.Doc), scoreDoc.Score, similarDocs.TotalHits));
                        }
                    }
                }
            }

            return results;
        }
示例#23
0
 /// <summary>
 /// Copies every parameter that has a value onto the MoreLikeThis instance; parameters that
 /// are null leave the corresponding MoreLikeThis setting unchanged.
 /// </summary>
 /// <param name="mlt">The MoreLikeThis instance to configure.</param>
 /// <param name="parameters">The optional settings to apply.</param>
 private static void AssignParameters(MoreLikeThis mlt, MoreLikeThisQueryServerSide parameters)
 {
     // Boosting.
     if (parameters.Boost != null)
     {
         mlt.Boost = parameters.Boost.Value;
     }

     if (parameters.BoostFactor != null)
     {
         mlt.BoostFactor = parameters.BoostFactor.Value;
     }

     // Limits on parsing and query size.
     if (parameters.MaximumNumberOfTokensParsed != null)
     {
         mlt.MaxNumTokensParsed = parameters.MaximumNumberOfTokensParsed.Value;
     }

     if (parameters.MaximumQueryTerms != null)
     {
         mlt.MaxQueryTerms = parameters.MaximumQueryTerms.Value;
     }

     // Word length bounds.
     if (parameters.MinimumWordLength != null)
     {
         mlt.MinWordLen = parameters.MinimumWordLength.Value;
     }

     if (parameters.MaximumWordLength != null)
     {
         mlt.MaxWordLen = parameters.MaximumWordLength.Value;
     }

     // Term and document frequency bounds.
     if (parameters.MinimumTermFrequency != null)
     {
         mlt.MinTermFreq = parameters.MinimumTermFrequency.Value;
     }

     if (parameters.MinimumDocumentFrequency != null)
     {
         mlt.MinDocFreq = parameters.MinimumDocumentFrequency.Value;
     }

     if (parameters.MaximumDocumentFrequency != null)
     {
         mlt.MaxDocFreq = parameters.MaximumDocumentFrequency.Value;
     }

     // Applied after MaxDocFreq, as in the original statement order.
     if (parameters.MaximumDocumentFrequencyPercentage != null)
     {
         mlt.SetMaxDocFreqPct(parameters.MaximumDocumentFrequencyPercentage.Value);
     }
 }
示例#24
0
        /// <summary>
        /// Generates the MoreLikeThis query for "lucene release" on the "text" field with boosting
        /// enabled and no explicit boost factor, and records the boost of every term.
        /// </summary>
        /// <returns>A map from term text to the boost of its clause.</returns>
        private IDictionary <string, float> GetOriginalValues()
        {
            var boostsByTerm = new Dictionary<string, float>();

            var mlt = new MoreLikeThis(reader)
            {
                Analyzer    = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false),
                MinDocFreq  = 1,
                MinTermFreq = 1,
                MinWordLen  = 1,
                FieldNames  = new[] { "text" },
                ApplyBoost  = true
            };

            var generatedQuery = (BooleanQuery)mlt.Like(new StringReader("lucene release"), "text");

            foreach (BooleanClause clause in generatedQuery.Clauses)
            {
                var termQuery = (TermQuery)clause.Query;
                boostsByTerm[termQuery.Term.Text] = termQuery.Boost;
            }

            return boostsByTerm;
        }
示例#25
0
        /// <summary>
        /// Runs a MoreLikeThis query for the text "are the most well known" against the "content"
        /// field of the index at <paramref name="path"/> and loads the top five hit documents.
        /// </summary>
        /// <param name="path">Directory holding the Lucene index.</param>
        private static void Search(Directory path)
        {
            // The reader is opened for this call only; dispose it on every path.
            using (var ir = IndexReader.Open(path, true))
            {
                var mlt = new MoreLikeThis(ir);

                mlt.SetFieldNames(new string[] { "content" });
                mlt.MinTermFreq = 1;
                mlt.MinDocFreq  = 1;

                var reader = new System.IO.StringReader("are the most well known");
                var query  = mlt.Like(reader);

                using (var searcher = new IndexSearcher(path, true))
                {
                    var topDocs = searcher.Search(query, 5);
                    foreach (var scoreDoc in topDocs.ScoreDocs)
                    {
                        // The document is loaded but nothing is done with it.
                        Document doc = searcher.Doc(scoreDoc.Doc);
                    }
                }
            }
        }
        /// <summary>
        /// Finds items related to the given item with a MoreLikeThis query, optionally restricted
        /// to a content type, and always excluding the item itself.
        /// </summary>
        /// <param name="id">Id of the item to find related items for.</param>
        /// <param name="context">Index name, field names, content type filter and result count.</param>
        /// <returns>The related hits, fully loaded before the index is closed.</returns>
        public IEnumerable <ISearchHit> GetRelatedItems(int id, RelatedContentContext context)
        {
            // Reader and searcher are opened for this call only, so they are released here.
            using (IndexReader reader = IndexReader.Open(GetDirectory(context.Index), true))
            using (var indexSearcher = new IndexSearcher(reader))
            {
                var analyzer = _analyzerProvider.GetAnalyzer(context.Index);

                var mlt = new MoreLikeThis(reader)
                {
                    Boost = true, MinTermFreq = 1, Analyzer = analyzer, MinDocFreq = 1
                };

                // Without explicit field names MoreLikeThis keeps its default field selection.
                if (context.FieldNames != null && context.FieldNames.Length > 0)
                {
                    mlt.SetFieldNames(context.FieldNames);
                }

                var docid = GetDocumentId(id, indexSearcher);

                BooleanQuery query = (BooleanQuery)mlt.Like(docid);

                if (!String.IsNullOrWhiteSpace(context.ContentType))
                {
                    var contentTypeQuery = new TermQuery(new Term("type", context.ContentType));
                    query.Add(new BooleanClause(contentTypeQuery, Occur.MUST));
                }

                // exclude same doc
                var exclude = new TermQuery(new Term("id", id.ToString()));
                query.Add(new BooleanClause(exclude, Occur.MUST_NOT));

                TopDocs simDocs = indexSearcher.Search(query, context.Count);

                // Materialize now: a lazy sequence would call indexSearcher.Doc only when the
                // caller enumerates it, after the searcher has been disposed.
                return simDocs.ScoreDocs
                       .Select(scoreDoc => new LuceneSearchHit(indexSearcher.Doc(scoreDoc.Doc), scoreDoc.Score))
                       .ToList();
            }
        }
        /// <summary>
        /// Creates a MoreLikeThis query for the indexed document of a product, comparing the
        /// "Name" and "Description" fields.
        /// </summary>
        /// <param name="prodcutId">Id of the product to find similar products for.</param>
        /// <returns>The generated query, or null when the document number lookup returns 0.</returns>
        private static Query CreateMoreLikeThisQuery(long prodcutId)
        {
            var sourceDocNumber = GetLuceneDocumentNumber(prodcutId);
            if (sourceDocNumber == 0)
            {
                return null;
            }

            var analyzer    = new StandardAnalyzer(_version);
            var indexReader = Searcher.IndexReader;

            var similarity = new MoreLikeThis(indexReader);
            similarity.Analyzer = analyzer;
            similarity.SetFieldNames(new[] { "Name", "Description" });

            // Accept terms that occur only once, and boost terms by relevance.
            similarity.MinDocFreq  = 1;
            similarity.MinTermFreq = 1;
            similarity.Boost       = true;

            return similarity.Like(sourceDocNumber);
        }
示例#28
0
        /// <summary>
        /// Searches the index in <paramref name="dir"/> for documents similar to <paramref name="searchText"/>
        /// and returns, for each of the top 50 hits, its key, score and the stored values of the requested languages.
        /// </summary>
        /// <param name="dir">Lucene directory holding the index.</param>
        /// <param name="lang">Currently unused; see the note on the field name below.</param>
        /// <param name="searchText">Free text the similarity query is built from.</param>
        /// <param name="languages">Names of the stored fields to read from every hit.</param>
        /// <returns>One <c>QueryResult</c> per hit, in the order returned by the searcher.</returns>
        public static IEnumerable <QueryResult> SearchTranslationProjects(Lucene.Net.Store.Directory dir, string lang, string searchText, IEnumerable <string> languages)
        {
            var results = new List <QueryResult>();

            // One reader serves both the term statistics and the search, and is released on exit.
            using (var ir = IndexReader.Open(dir, true))
            using (var searcher = new IndexSearcher(ir))
            using (var reader = new System.IO.StringReader(searchText))
            {
                var mlt = new MoreLikeThis(ir);

                // NOTE(review): the compared field is hard-coded to "fr" and `lang` is ignored.
                // This looks like a debugging leftover; confirm with callers before switching to `lang`.
                // No analyzer is assigned here (French/Snowball analyzers were tried and disabled).
                mlt.SetFieldNames(new[] { "fr" });
                mlt.MinTermFreq = 1;
                mlt.MinDocFreq  = 1;
                mlt.MinWordLen  = 4;

                var query   = mlt.Like(reader);
                var topDocs = searcher.Search(query, 50);

                foreach (var scoreDoc in topDocs.ScoreDocs)
                {
                    Document doc   = searcher.Doc(scoreDoc.Doc);
                    float    score = scoreDoc.Score;

                    // Materialize so the result does not re-read the document on every enumeration.
                    var trads = languages.Select(l => new Segment(l, doc.Get(l))).ToList();

                    results.Add(new QueryResult(doc.Get("key"), trads, score));
                }
            }

            return(results);
        }
示例#29
0
        /// <summary>
        /// Builds a MoreLikeThis query for the given post, comparing the
        /// "Title", "Name", "Description", "Publisher" and "Author" fields.
        /// </summary>
        /// <param name="postId">Identifier passed to <c>GetLuceneDocumentNumber</c> to locate the source document.</param>
        /// <returns>The similarity query, or <c>null</c> when the document number lookup yields 0.</returns>
        private static Query CreateMoreLikeThisQuery(int postId)
        {
            int documentNumber = GetLuceneDocumentNumber(postId);
            if (documentNumber == 0)
            {
                // A lookup result of 0 is treated as "nothing to compare against".
                return(null);
            }

            var standardAnalyzer = new StandardAnalyzer(_version);

            // The searcher is only needed while the query is being built; the query itself outlives it.
            using (var indexSearcher = new IndexSearcher(_directory, false))
            {
                var similarity = new MoreLikeThis(indexSearcher.IndexReader)
                {
                    Analyzer    = standardAnalyzer,
                    MinDocFreq  = 1,
                    MinTermFreq = 1,
                    Boost       = true
                };
                similarity.SetFieldNames(new[] { "Title", "Name", "Description", "Publisher", "Author" });

                return(similarity.Like(documentNumber));
            }
        }
示例#30
0
        /// <summary>
        /// Finds pages similar to the page stored under <paramref name="key"/> and returns one window of the hits.
        /// </summary>
        /// <param name="key">Value of the "key" field of the source page; that page is excluded from the results.</param>
        /// <param name="resultOffset">Zero-based index of the first hit to return. Negative values are treated as 0.</param>
        /// <param name="resultLength">Maximum number of hits to return. Non-positive values yield no hits.</param>
        /// <param name="matchCategory">When true, and the source page has a non-empty "category" field, only pages of the same (lower-cased) category match.</param>
        /// <returns>
        /// The requested window of hits, the total number of hits found (limited by <c>MaxHits</c>) and the time taken.
        /// An empty <c>SearchResult</c> when no page has the given key.
        /// </returns>
        public SearchResult FindSimular(string key, int resultOffset, int resultLength, bool matchCategory) {
            var pageQuery = new TermQuery(new Term("key", key));
            var topDocs = _searcher.Search(pageQuery, 1);
            if (topDocs.TotalHits == 0) {
                return new SearchResult();
            }

            var doc = topDocs.ScoreDocs[0].Doc;

            var moreLikeThis = new MoreLikeThis(_reader) {
                Analyzer = _analyzer, 
                MinWordLen = 3
            };
            moreLikeThis.SetFieldNames(new[] { "title", "summary", "content", "tags" });
            moreLikeThis.SetStopWords(StopWords.DefaultEnglish);
            moreLikeThis.MinDocFreq = 2;
            
            var query = moreLikeThis.Like(doc);

            // Stopwatch is monotonic; wall-clock differences can jump on clock adjustments or DST changes.
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();
            var ticks = DateTime.UtcNow.Ticks;

            // Only pages whose publish window contains "now": publishStart < now < publishStop.
            Query publishStartQuery = NumericRangeQuery.NewLongRange("publishStart", null, ticks, true, false);
            Query publishStopQuery = NumericRangeQuery.NewLongRange("publishStop", ticks, null, false, true);

            var booleanQuery = new BooleanQuery {
                {query, Occur.MUST},
                {pageQuery, Occur.MUST_NOT},
                {publishStartQuery, Occur.MUST},
                {publishStopQuery, Occur.MUST}
            };

            if (matchCategory) {
                var document = _searcher.Doc(doc);
                var field = document.GetField("category");

                if (field != null && !string.IsNullOrEmpty(field.StringValue)) {
                    var categoryQuery = new TermQuery(new Term("category", field.StringValue.ToLowerInvariant()));
                    booleanQuery.Add(categoryQuery, Occur.MUST);
                }
            }

            var scoreDocs = _searcher.Search(booleanQuery, null, MaxHits, Sort.RELEVANCE).ScoreDocs;

            var result = new SearchResult { NumberOfHits = scoreDocs.Length };

            // Clamp the window to the available hits without computing offset + length,
            // which could overflow for large arguments.
            var firstIndex = Math.Max(0, resultOffset);
            if (firstIndex < scoreDocs.Length) {
                var available = scoreDocs.Length - firstIndex;
                var endIndex = firstIndex + Math.Min(available, resultLength);

                for (int i = firstIndex; i < endIndex; i++) {
                    Document document = _searcher.Doc(scoreDocs[i].Doc);

                    // A missing or malformed "pageId" leaves pageId at whatever TryParseGuid assigns on failure.
                    Guid pageId;
                    (document.Get("pageId") ?? string.Empty).TryParseGuid(out pageId);

                    var hit = new SearchHit {
                        PageId = pageId,
                        Path = document.Get("path"),
                        Title = document.Get("title"),
                        Excerpt = document.Get("summary")
                    };

                    result.Hits.Add(hit);
                }
            }

            stopwatch.Stop();
            result.SecondsTaken = stopwatch.Elapsed.TotalSeconds;

            return result;
        }
示例#31
0
        /// <summary>
        /// Returns posts similar to the given post, based on title, body and tag terms.
        /// </summary>
        /// <param name="postid">Identifier of the source post. Values &lt;= 0 yield an empty list.</param>
        /// <param name="itemsToReturn">Maximum number of similar posts to return. Values &lt;= 0 yield an empty list.</param>
        /// <returns>
        /// Up to <paramref name="itemsToReturn"/> posts, never including the source post.
        /// The lookup is best-effort: on failure the error is traced and whatever was collected so far is returned.
        /// </returns>
        public List <Post> Similar(int postid, int itemsToReturn)
        {
            var list = new List <Post>();

            if (postid <= 0)
            {
                return(list);
            }

            EnsureIndexExists();

            // The searcher rejects a non-positive hit count; answer explicitly instead of via an exception.
            if (itemsToReturn <= 0)
            {
                return(list);
            }

            IndexSearcher searcher = null;

            var query = GetIdSearchQuery(postid);

            lck.AcquireReaderLock(ReaderTimeOut);
            try
            {
                searcher = new IndexSearcher(rd);

                // Get Original document (only the top hit is used)
                TopDocs hits = searcher.Search(query, 1);
                if (hits == null || hits.ScoreDocs.Length <= 0)
                {
                    return(list);
                }

                int docNum = hits.ScoreDocs[0].Doc;
                if (docNum < 0)
                {
                    return(list);
                }

                // Reuse the searcher's reader so the document number and the term statistics
                // come from the same index snapshot, and no second reader has to be managed.
                var mlt = new MoreLikeThis(searcher.IndexReader);
                mlt.Analyzer = _analyzer;
                mlt.SetFieldNames(new[] { SearchFields.Title, SearchFields.Body, SearchFields.Tag });
                mlt.MinDocFreq  = 5;
                mlt.MinTermFreq = 2;
                mlt.Boost       = true;
                var moreResultsQuery = mlt.Like(docNum);

                // Ask for one extra hit: the source post usually matches itself and is filtered out below.
                int fetchCount = itemsToReturn == int.MaxValue ? itemsToReturn : itemsToReturn + 1;
                TopDocs similarhits = searcher.Search(moreResultsQuery, fetchCount);

                foreach (var scoreDoc in similarhits.ScoreDocs)
                {
                    Document doc  = searcher.Doc(scoreDoc.Doc);
                    var      post = CreatePostFromDocument(doc, null);
                    if (postid != post.Id)
                    {
                        list.Add(post);
                    }

                    if (list.Count >= itemsToReturn)
                    {
                        break;
                    }
                }
            }
            catch (Exception ex)
            {
                // Related posts are optional: keep the best-effort behavior, but leave a trace of the failure.
                System.Diagnostics.Trace.TraceError("Similar post lookup failed for post {0}: {1}", postid, ex);
            }
            finally
            {
                if (searcher != null)
                {
                    searcher.Dispose();
                }

                lck.ReleaseReaderLock();
            }

            return(list);
        }
示例#32
0
        /// <summary>
        /// Searches for job ads similar to the given one, restricted by a salary and location filter
        /// derived from the source ad, and never including the source ad itself.
        /// </summary>
        /// <param name="memberId">Not used by this implementation.</param>
        /// <param name="jobAdId">Identifier of the job ad to find similar ads for.</param>
        /// <param name="searchQuery">Supplies paging: <c>Skip</c> and <c>Take</c> (all documents when <c>Take</c> is null).</param>
        /// <returns>The search results; empty results when the ad is not in the index or cannot be loaded.</returns>
        JobAdSearchResults IJobAdSearchService.SearchSimilar(Guid?memberId, Guid jobAdId, JobAdSearchQuery searchQuery)
        {
            const string method = "GetSimilarJobs";

            try
            {
                var indexReader = GetReader();
                var jobSearcher = new Searcher(indexReader);

                // Nothing to compare against when the ad is not in the index.
                var documentId = jobSearcher.Fetch(jobAdId);
                if (documentId == -1)
                {
                    return(new JobAdSearchResults());
                }

                // The ad itself is needed for the salary/location restriction below.
                var sourceAd = _jobAdsQuery.GetJobAd <JobAd>(jobAdId);
                if (sourceAd == null)
                {
                    return(new JobAdSearchResults());
                }

                // Similarity is computed over the content and title fields.
                var similarity = new MoreLikeThis(indexReader);
                similarity.setAnalyzer(_contentAnalyzer);
                similarity.setFieldNames(new [] { FieldName.Content, FieldName.Title });
                var similarityQuery = similarity.like(documentId);

                // Exclude the source ad from the results.
                var combinedFilter = new BooleanFilter();
                var excludeSelf    = new SpecialsFilter(SearchFieldName.Id, false, new[] { jobAdId.ToFieldValue() });
                combinedFilter.add(new FilterClause(excludeSelf, BooleanClause.Occur.MUST_NOT));

                // Require a comparable salary and a location within distance 50 of the source ad.
                var restrictions = new JobAdSearchQuery
                {
                    Salary          = FudgeSalary(sourceAd.Description.Salary),
                    ExcludeNoSalary = true,
                    Location        = sourceAd.Description.Location,
                    Distance        = 50,
                };
                var restrictionFilter = _indexer.GetFilter(restrictions, null, null);
                combinedFilter.add(new FilterClause(restrictionFilter, BooleanClause.Occur.MUST));

                return(jobSearcher.Search(
                           similarityQuery,
                           combinedFilter,
                           null,
                           null,
                           searchQuery.Skip,
                           searchQuery.Take ?? indexReader.maxDoc(),
                           false));
            }
            catch (Exception e)
            {
                #region Log
                EventSource.Raise(Event.Error, method, "Unexpected exception.", e);
                #endregion
                throw;
            }
        }
 /// <summary>
 /// Smoke test: building a query for the field "foobar" while both "text" and "foobar"
 /// are configured as field names must complete without throwing.
 /// </summary>
 public void TestMultiFields()
 {
     var moreLikeThis = new MoreLikeThis(reader)
     {
         Analyzer    = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false),
         MinDocFreq  = 1,
         MinTermFreq = 1,
         MinWordLen  = 1,
         FieldNames  = new[] { "text", "foobar" }
     };

     // The resulting query is deliberately not inspected; only successful completion is checked.
     moreLikeThis.Like(new StringReader("this is a test"), "foobar");
 }