Пример #1
0
        /// <summary>
        /// Builds a "more like this" query from the indexed document that belongs to the given project.
        /// </summary>
        /// <param name="projectId">Identifier handed to <c>GetLuceneProjectDocumentNumber</c> to locate the document.</param>
        /// <returns>The generated query, or <c>null</c> when the document number lookup yields 0.</returns>
        private static Query CreateMoreProjectsLikeThisQuery(int projectId)
        {
            int documentNumber = GetLuceneProjectDocumentNumber(projectId);

            // NOTE(review): 0 is used as the "not found" sentinel; confirm the lookup helper
            // never returns 0 for a document that really exists.
            if (documentNumber == 0)
            {
                return null;
            }

            var standardAnalyzer = new StandardAnalyzer(_version);

            using (var indexSearcher = new IndexSearcher(_directory, false))
            {
                var similarity = new MoreLikeThis(indexSearcher.IndexReader)
                {
                    Analyzer = standardAnalyzer
                };

                // Fields whose terms are used to build the similarity query.
                var similarityFields = new[]
                {
                    StronglyTyped.PropertyName<LuceneSearchModel>(x => x.ProductId),
                    StronglyTyped.PropertyName<LuceneSearchModel>(x => x.Title),
                    StronglyTyped.PropertyName<LuceneSearchModel>(x => x.Description),
                    StronglyTyped.PropertyName<LuceneSearchModel>(x => x.Price),
                    StronglyTyped.PropertyName<LuceneSearchModel>(x => x.ProductStatus),
                    StronglyTyped.PropertyName<LuceneSearchModel>(x => x.Category)
                };
                similarity.SetFieldNames(similarityFields);

                similarity.MinDocFreq = 1;
                similarity.MinTermFreq = 1;
                similarity.Boost = true;

                return similarity.Like(documentNumber);
            }
        }
Пример #2
0
        /// <summary>
        /// Checks that setting <c>BoostFactor</c> multiplies the boost of every term in the
        /// query generated for the text "lucene release" by that factor.
        /// </summary>
        public void TestBoostFactor()
        {
            IDictionary<string, float?> originalValues = OriginalValues;

            MoreLikeThis mlt = new MoreLikeThis(reader);
            mlt.Analyzer = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false);
            mlt.MinDocFreq = 1;
            mlt.MinTermFreq = 1;
            mlt.MinWordLen = 1;
            mlt.FieldNames = new[] { "text" };
            mlt.Boost = true;

            // Every term boost is expected to be multiplied by this number.
            float boostFactor = 5;
            mlt.BoostFactor = boostFactor;

            BooleanQuery query = (BooleanQuery)mlt.Like(new StringReader("lucene release"), "text");
            IList<BooleanClause> clauses = query.Clauses;

            assertEquals("Expected " + originalValues.Count + " clauses.", originalValues.Count, clauses.Count);

            foreach (BooleanClause clause in clauses)
            {
                TermQuery tq = (TermQuery)clause.Query;
                string termText = tq.Term.Text();

                // The IDictionary indexer throws KeyNotFoundException for an unknown key, which would
                // bypass the assertion below. TryGetValue leaves termBoost null for a missing term so
                // the assertion can report it with its message.
                float? termBoost;
                originalValues.TryGetValue(termText, out termBoost);
                assertNotNull("Expected term " + termText, termBoost);

                float totalBoost = termBoost.Value * boostFactor;
                assertEquals("Expected boost of " + totalBoost + " for term '" + termText + "' got " + tq.Boost, totalBoost, tq.Boost, 0.0001);
            }
        }
Пример #3
0
        /// <summary>
        /// Finds articles similar to the one with the given id, based on the
        /// "Name", "Body", "TagName" and "FeedName" fields.
        /// </summary>
        /// <param name="articleId">Value of the "Id" field of the article to compare against.</param>
        /// <param name="count">Number of related articles requested.</param>
        /// <returns>
        /// The related articles with the source article filtered out; an empty list when the
        /// source article is not found in the index.
        /// </returns>
        public List<Article> GetRelatedArticles(int articleId, int count)
        {
            // Both resources are released on every exit path, including the early return
            // and any exception thrown while searching.
            using (var reader = IndexReader.Open(_directory, true))
            using (var searcher = new IndexSearcher(_directory, true))
            {
                var searchQuery = new TermQuery(new Term("Id", articleId.ToString()));
                var doc = searcher.Search(searchQuery, 1);

                if (doc.TotalHits == 0)
                {
                    return new List<Article>();
                }

                var docId = doc.ScoreDocs[0].Doc;

                MoreLikeThis mlt = new MoreLikeThis(reader);
                mlt.SetFieldNames(new[] { "Name", "Body", "TagName", "FeedName" });
                Query query = mlt.Like(docId);

                // One extra hit is requested because the source article may be among the matches.
                var hits = searcher.Search(query, count + 1);

                // Materialise while the searcher is still open: the filter is deferred and
                // must not be enumerated after disposal.
                return ConvertToArticles(hits, searcher, 1, count)
                    .Where(a => a.Id != articleId)
                    .ToList();
            }
        }
Пример #4
0
        /// <summary>
        /// Checks that setting <c>BoostFactor</c> multiplies the boost of every term in the
        /// query generated for the text "lucene release" by that factor.
        /// </summary>
        public void TestBoostFactor()
        {
            IDictionary<string, float> originalValues = GetOriginalValues();

            MoreLikeThis mlt = new MoreLikeThis(reader);

            mlt.Analyzer    = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false);
            mlt.MinDocFreq  = 1;
            mlt.MinTermFreq = 1;
            mlt.MinWordLen  = 1;
            mlt.FieldNames  = new[] { "text" };
            mlt.ApplyBoost  = true;

            // Every term boost is expected to be multiplied by this number.
            float boostFactor = 5;

            mlt.BoostFactor = boostFactor;

            BooleanQuery query = (BooleanQuery)mlt.Like(new StringReader("lucene release"), "text");
            IList<BooleanClause> clauses = query.Clauses;

            assertEquals("Expected " + originalValues.Count + " clauses.", originalValues.Count, clauses.Count);

            foreach (BooleanClause clause in clauses)
            {
                TermQuery tq = (TermQuery)clause.Query;
                string termText = tq.Term.Text;

                // A plain float can never be null and the dictionary indexer throws for an unknown
                // key, so the null assertion was never able to fail. Look the term up explicitly
                // and surface a missing entry as null.
                float knownBoost;
                float? termBoost = originalValues.TryGetValue(termText, out knownBoost) ? knownBoost : (float?)null;
                assertNotNull("Expected term " + termText, termBoost);

                float totalBoost = termBoost.Value * boostFactor;
                assertEquals("Expected boost of " + totalBoost + " for term '"
                             + termText + "' got " + tq.Boost, totalBoost, tq.Boost, 0.0001);
            }
        }
Пример #5
0
        /// <summary>
        /// Returns up to <c>maxNo</c> search hits that resemble the current document, comparing
        /// on the configured <c>fieldsToSearch</c>. Assumes the index is up to date.
        /// </summary>
        /// <returns>
        /// The similar documents found; empty when the instance is not initialised or the
        /// document number lookup yields 0.
        /// </returns>
        public IEnumerable<SearchResultItem> FindMoreLikeThis()
        {
            var similarItems = new List<SearchResultItem>();

            if (!IsInit())
            {
                return similarItems;
            }

            var similarity = new MoreLikeThis(reader);
            similarity.SetFieldNames(fieldsToSearch.ToArray());
            similarity.SetMinTermFreq(1);
            similarity.SetMinDocFreq(1);

            int luceneDocId = GetLuceneDocNo(docId);
            if (luceneDocId == 0)
            {
                return similarItems;
            }

            var likeQuery = similarity.Like(luceneDocId);
            var scoreDocs = searcher.Search(likeQuery, maxNo).ScoreDocs;

            // Position 0 is skipped: the first hit is expected to be the current document itself.
            for (int position = 1; position < scoreDocs.Length; position++)
            {
                var document = reader.Document(scoreDocs[position].doc);
                similarItems.Add(new SearchResultItem
                {
                    PageName = document.GetField("nodeName").StringValue(),
                    NodeId   = int.Parse(document.GetField("__NodeId").StringValue())
                });
            }

            return similarItems;
        }
Пример #6
0
        /// <summary>
        /// Collects the documents most similar to the given index document, comparing the
        /// "Title" and "Content" fields.
        /// </summary>
        /// <param name="indexName">Not used by this method.</param>
        /// <param name="indexDocumentId">Lucene document number to find similar documents for.</param>
        /// <param name="maxDocs">Maximum number of hits to collect.</param>
        /// <returns>One <c>CorpusDocument</c> (Id and Title only) per collected hit.</returns>
        public static IList<CorpusDocument> GetMoreLikeThis(string indexName, int indexDocumentId, int maxDocs)
        {
            // See: http://lucene.apache.org/java/2_2_0/api/org/apache/lucene/search/similar/MoreLikeThis.html
            var similarity = new MoreLikeThis(Searcher.GetIndexReader());
            similarity.SetAnalyzer(new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29));
            similarity.SetFieldNames(new[] { "Title", "Content" });
            // Ignoring words shorter than four characters is meant to improve relevancy.
            similarity.SetMinWordLen(4);

            var likeQuery = similarity.Like(indexDocumentId);

            var collector = TopScoreDocCollector.create(maxDocs, true);
            Searcher.Search(likeQuery, collector);
            var scoreDocs = collector.TopDocs().ScoreDocs;

            var documents = new List<CorpusDocument>(maxDocs);
            foreach (var scoreDoc in scoreDocs)
            {
                var stored = Searcher.Doc(scoreDoc.doc);
                var corpusDocument = new CorpusDocument
                {
                    Id    = stored.Get("Id"),
                    Title = stored.Get("Title"),
                };
                documents.Add(corpusDocument);
            }

            return documents;
        }
Пример #7
0
        /// <summary>
        /// Finds entries related to the given entry by building a MoreLikeThis query over the
        /// Title, Body and Tags fields.
        /// </summary>
        /// <param name="entryId">Entry whose related content is requested.</param>
        /// <param name="max">Maximum number of results; one extra hit is requested from <c>PerformQuery</c>.</param>
        /// <param name="blogId">Blog identifier forwarded to <c>PerformQuery</c>.</param>
        /// <returns>The result of <c>PerformQuery</c>, or an empty list when the entry is not indexed.</returns>
        public IEnumerable<SearchEngineResult> RelatedContents(int entryId, int max, int blogId)
        {
            var results = new List<SearchEngineResult>();

            // Step 1: locate the document that represents the entry itself.
            Query   idQuery      = GetIdSearchQuery(entryId);
            TopDocs originalHits = Searcher.Search(idQuery, max);
            if (originalHits.scoreDocs.Length == 0)
            {
                return results;
            }

            int sourceDocNum = originalHits.scoreDocs[0].doc;

            // Step 2: configure MoreLikeThis on a reader obtained from the index writer.
            var writerReader = DoWriterAction(w => w.GetReader());
            var similarity   = new MoreLikeThis(writerReader);
            similarity.SetAnalyzer(_analyzer);
            similarity.SetFieldNames(new[] { Title, Body, Tags });
            similarity.SetMinDocFreq(_settings.Parameters.MinimumDocumentFrequency);
            similarity.SetMinTermFreq(_settings.Parameters.MinimumTermFrequency);
            similarity.SetBoost(_settings.Parameters.MoreLikeThisBoost);

            // Step 3: run the similarity query.
            var likeQuery = similarity.Like(sourceDocNum);
            return PerformQuery(results, likeQuery, max + 1, blogId, entryId);
        }
Пример #8
0
        /// <summary>
        /// Finds entries related to the given entry by building a MoreLikeThis query over the
        /// Categories, Silouhettes and Tags fields.
        /// </summary>
        /// <param name="entryId">Entry whose related content is requested.</param>
        /// <param name="max">Maximum number of results; one extra hit is requested from <c>PerformQuery</c>.</param>
        /// <param name="blogId">Blog identifier forwarded to <c>PerformQuery</c>.</param>
        /// <returns>
        /// The response produced by <c>PerformQuery</c>, or a response with an empty result list
        /// when the entry is not indexed.
        /// </returns>
        public SearchEngineResponse RelatedContents(int entryId, int max, int blogId)
        {
            var results = new List<SearchEngineResult>();

            // Step 1: locate the document that represents the entry itself.
            Query   idQuery      = GetIdSearchQuery(entryId);
            TopDocs originalHits = Searcher.Search(idQuery, max);
            if (originalHits.scoreDocs.Length == 0)
            {
                var emptyResponse = new SearchEngineResponse
                {
                    TotalCount = originalHits.totalHits,
                    Results    = results
                };
                return emptyResponse;
            }

            int sourceDocNum = originalHits.scoreDocs[0].doc;

            // Step 2: configure MoreLikeThis on a reader obtained from the index writer.
            var writerReader = DoWriterAction(w => w.GetReader());
            var similarity   = new MoreLikeThis(writerReader);
            similarity.SetAnalyzer(_analyzer);
            similarity.SetFieldNames(new[] { Categories, Silouhettes, Tags });

            // Step 3: run the similarity query starting at offset 0.
            var likeQuery = similarity.Like(sourceDocNum);
            return PerformQuery(results, likeQuery, 0, max + 1, blogId, entryId);
        }
Пример #9
0
        /// <summary>
        /// Uses Lucene's MoreLikeThis feature to find items similar to the one passed in.
        /// </summary>
        /// <param name="item">The item to find similar items for.</param>
        /// <param name="pageNo">Page number of the result set.</param>
        /// <param name="pageSize">Number of items to return in the result set.</param>
        /// <returns>
        /// A page of similar items, or the <c>NoResults</c> page when the item is not found in the index.
        /// </returns>
        public IPagedList<T> MoreLikeThis(T item, int pageNo, int pageSize)
        {
            using (IndexSearcher indexSearcher = _luceneIndexer.GetSearcher())
            {
                // Locate the indexed document for the item through its primary key.
                var identifier  = _luceneIndexer.GetIdentifier(item);
                var keyQuery    = new TermQuery(new Term(_luceneIndexer.PrimaryKeyField, identifier));
                var keyHits     = indexSearcher.Search(keyQuery, 1);

                if (!keyHits.ScoreDocs.Any())
                {
                    return NoResults(pageNo, pageSize);
                }

                var similarity = new MoreLikeThis(indexSearcher.IndexReader)
                {
                    MaxDocFreq  = 0,
                    MinTermFreq = 0
                };

                // Restricting the fields is currently disabled:
                //similarity.SetFieldNames(_luceneIndexer.FullTextFields);

                var likeQuery = similarity.Like(keyHits.ScoreDocs[0].Doc);

                var combined = new BooleanQuery();
                combined.Add(likeQuery, Occur.MUST);
                // Excluding the source document is currently disabled:
                //combined.Add(keyQuery, Occur.MUST_NOT);

                return Search(combined, pageNo, pageSize, indexSearcher);
            }
        }
Пример #10
0
        /// <summary>
        /// Finds stories related to the given story using a MoreLikeThis query on the
        /// "title" and "tags" fields.
        /// </summary>
        /// <param name="hostId">Host whose index is searched.</param>
        /// <param name="storyId">Story to find related stories for; it is excluded from the results.</param>
        /// <returns>
        /// The stories loaded for the matching ids, or <c>null</c> when the story has no document id.
        /// </returns>
        public StoryCollection Find(int hostId, int storyId)
        {
            int? docId = ConvertStoryIdtoDocId(hostId, storyId);
            if (!docId.HasValue)
            {
                return null;
            }

            IndexSearcher storySearcher = SearchQuery.GetSearcher(hostId);
            IndexReader   storyReader   = storySearcher.GetIndexReader();

            MoreLikeThis similarity = new MoreLikeThis(storyReader);
            similarity.SetAnalyzer(new DnkAnalyzer());

            // These settings shape the query used to find related stories:
            //  - only the title and tags fields are compared
            //  - a term must occur at least once in the source document
            //  - the query holds at most 5 terms
            //  - words shorter than 3 characters are ignored
            //  - a term must occur in at least 4 documents
            similarity.SetFieldNames(new string[] { "title", "tags" });
            similarity.SetMinTermFreq(1);
            similarity.SetMaxQueryTerms(5);
            similarity.SetMinWordLen(3);
            similarity.SetMinDocFreq(4);
            similarity.SetStopWords(StopWords());
            similarity.SetBoost(true);

            Query relatedQuery = similarity.Like(docId.Value);
            Hits  relatedHits  = storySearcher.Search(relatedQuery);

            List<int> relatedIds = new List<int>();
            for (int index = 0; index < relatedHits.Length(); index++)
            {
                Document hitDocument = relatedHits.Doc(index);
                int      hitStoryId  = int.Parse(hitDocument.GetField("id").StringValue());

                // The source story is not related to itself.
                if (hitStoryId == storyId)
                {
                    continue;
                }

                relatedIds.Add(hitStoryId);
                if (relatedIds.Count == NUMBER_OF_RELATED_STORIES_TO_RETURN)
                {
                    break;
                }
            }

            return SearchQuery.LoadStorySearchResults(relatedIds);
        }
Пример #11
0
        /// <summary>
        /// Generates a query for the "foobar" field while two field names are configured.
        /// The result is discarded; the test only requires the call to complete.
        /// </summary>
        public void TestMultiFields()
        {
            var similarity = new MoreLikeThis(reader)
            {
                Analyzer    = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false),
                MinDocFreq  = 1,
                MinTermFreq = 1,
                MinWordLen  = 1,
                FieldNames  = new[] { "text", "foobar" }
            };

            var input = new StringReader("this is a test");
            similarity.Like(input, "foobar");
        }
Пример #12
0
        /// <summary>
        /// Searches the indexed documents for those most similar to the <paramref name="original"/>
        /// text using MoreLikeThis (TF-IDF based), tracing the query and the hits found.
        /// </summary>
        /// <param name="original">Text to find similar documents for.</param>
        /// <param name="stopWords">Words excluded from the generated query.</param>
        /// <param name="analyzer">Analyzer used to tokenize <paramref name="original"/>.</param>
        /// <returns>The stored documents of at most the first 25 hits.</returns>
        public static List<Document> moreLikeThisAnalyzer(String original, ISet<string> stopWords, Lucene.Net.Analysis.Analyzer analyzer)
        {
            Trace.WriteLine("Realizando la Búsqueda");
            List<Document> DocumenResult = new List<Document>();

            // The reader and searcher are opened here, so they are released here as well.
            using (IndexReader indexReader = IndexReader.Open(_directory, true))
            using (IndexSearcher indexSearcher = new IndexSearcher(indexReader))
            {
                MoreLikeThis mlt = new MoreLikeThis(indexReader);

                mlt.SetFieldNames(DEFAULT_FIELD_NAMES);
                mlt.MinDocFreq    = DEFALT_MIN_DOC_FREQ;
                mlt.MinTermFreq   = DEFAULT_MIN_TERM_FREQ;
                mlt.MaxQueryTerms = MAX_QUERY_TERMS;
                mlt.MinWordLen    = DEFAULT_MIN_WORD_LENGTH;
                mlt.Analyzer      = analyzer;
                mlt.SetStopWords(stopWords);

                Query query = mlt.Like(new System.IO.StringReader(original));

                int topCount = DEFAULT_DOCUMENT_TO_SEARCH;

                TopScoreDocCollector collector = TopScoreDocCollector.Create(topCount, true);

                indexSearcher.Search(query, collector);
                ScoreDoc[] hits = collector.TopDocs().ScoreDocs;

                int len = hits.Length;

                Trace.WriteLine("Entering");
                Trace.WriteLine("-------------------------------------------");
                Trace.WriteLine("original :" + original);
                Trace.WriteLine("query: " + query);
                Trace.WriteLine("found: " + len + " documents");
                for (int i = 0; i < Math.Min(25, len); i++)
                {
                    int d = hits[i].Doc;
                    Trace.WriteLine("score   : " + hits[i].Score);
                    Trace.WriteLine("name    : " + d.ToString());

                    // Load the stored document while the searcher is still open and add it to the results.
                    Document doc = indexSearcher.Doc(d);
                    DocumenResult.Add(doc);
                }
                Trace.WriteLine("-------------------------------------------");
                Trace.WriteLine("Exiting");
            }

            return DocumenResult;
        }
Пример #13
0
        /// <summary>
        /// Rewrites this query into the boolean query that MoreLikeThis generates for
        /// <c>LikeText</c>, requiring a fraction of its clauses to match.
        /// </summary>
        /// <param name="reader">Index reader the MoreLikeThis instance is created on.</param>
        /// <returns>The generated boolean query with its minimum-should-match count set.</returns>
        public override Query Rewrite(IndexReader reader)
        {
            var similarity = new MoreLikeThis(reader);
            similarity.FieldNames  = MoreLikeFields;
            similarity.Analyzer    = Analyzer;
            similarity.MinTermFreq = MinTermFrequency;

            // A negative MinDocFreq leaves the MoreLikeThis default untouched.
            if (MinDocFreq >= 0)
            {
                similarity.MinDocFreq = MinDocFreq;
            }

            similarity.MaxQueryTerms = MaxQueryTerms;
            similarity.StopWords     = StopWords;

            var rewritten = (BooleanQuery)similarity.Like(new StringReader(LikeText), fieldName);

            // Require PercentTermsToMatch of the generated clauses to match (truncated to an int).
            var generatedClauses = rewritten.Clauses;
            rewritten.MinimumNumberShouldMatch = (int)(generatedClauses.Length * PercentTermsToMatch);

            return rewritten;
        }
    /// <summary>
    /// Runs a MoreLikeThis query for the given document and converts the hits into search results.
    /// </summary>
    /// <param name="searcher">Searcher used to run the query and load the hit documents.</param>
    /// <param name="mlt">Configured MoreLikeThis instance that generates the query.</param>
    /// <param name="docId">Lucene document number to find similar documents for.</param>
    /// <param name="topHits">Maximum number of hits to retrieve.</param>
    /// <returns>One result per hit, built from the "title", "description" and "_group" fields.</returns>
    private List<SearchResult> ShowSimilarResults(IndexSearcher searcher, MoreLikeThis mlt, int docId, int topHits)
    {
        var likeQuery = (BooleanQuery)mlt.Like(docId);
        ScoreDoc[] hits = searcher.Search(likeQuery, topHits).ScoreDocs;

        var similar = new List<SearchResult>();
        for (int index = 0; index < hits.Length; index++)
        {
            Document stored = searcher.Doc(hits[index].Doc);
            similar.Add(new SearchResult(stored.Get("title"), stored.Get("description"), stored.Get("_group")));
        }

        return similar;
    }
Пример #15
0
        /// <summary>
        /// Finds sources whose "FileData" content is similar to that of the source with the given id.
        /// </summary>
        /// <param name="sourceId">Value of the numeric "Id" field of the source to compare against.</param>
        /// <param name="numResults">Maximum number of similar sources to return.</param>
        /// <returns>The similar sources, excluding the source itself; empty when the source is not indexed.</returns>
        public IList<LuceneSearchResult> GetSourcesLikeThis(int sourceId, int numResults)
        {
            IList<LuceneSearchResult> results = new List<LuceneSearchResult>();

            using (SearcherManager manager = new SearcherManager(SourceIndexWriterSingleton.Instance))
            {
                // NOTE(review): the object returned by Acquire() is not explicitly released here;
                // confirm that disposing the manager covers it.
                this.searcher = manager.Acquire().Searcher;

                // First search: locate the document of the source itself.
                Query query = NumericRangeQuery.NewIntRange("Id", sourceId, sourceId, true, true);

                this.topDocs = this.searcher.Search(query, null, 1);

                if (this.topDocs != null && this.topDocs.ScoreDocs != null && this.topDocs.ScoreDocs.Length > 0)
                {
                    // Second search: MoreLikeThis query built from the document found above.
                    // NOTE(review): the document number comes from this.searcher but is resolved
                    // against a freshly opened reader; confirm both see the same index state.
                    using (IndexReader reader = IndexReader.Open(SourceIndexWriterSingleton.Directory, true))
                    {
                        MoreLikeThis mlt = new MoreLikeThis(reader);
                        mlt.SetFieldNames(new string[] { "FileData" });
                        mlt.MinTermFreq = 1;
                        mlt.MinDocFreq  = 1;

                        BooleanQuery bq = new BooleanQuery();
                        bq.Add(mlt.Like(this.topDocs.ScoreDocs[0].Doc), Occur.MUST);
                        // Exclude the source document itself from the results.
                        bq.Add(query, Occur.MUST_NOT);
                        log.Info("More like this query: " + bq.ToString());

                        TopDocs similarDocs = this.searcher.Search(bq, numResults);

                        if (similarDocs.TotalHits > 0)
                        {
                            foreach (ScoreDoc scoreDoc in similarDocs.ScoreDocs)
                            {
                                results.Add(new LuceneSearchResult(this.searcher.Doc(scoreDoc.Doc), scoreDoc.Score, similarDocs.TotalHits));
                            }
                        }
                    }
                }
            }

            return results;
        }
Пример #16
0
        /// <summary>
        /// Runs a sample MoreLikeThis search for the phrase "are the most well known" against
        /// the "content" field and loads the stored document of each of the top five hits.
        /// </summary>
        /// <param name="path">Directory that holds the index to search.</param>
        private static void Search(Directory path)
        {
            // The reader is opened here, so it is released here as well.
            using (var ir = IndexReader.Open(path, true))
            {
                var mlt = new MoreLikeThis(ir);

                mlt.SetFieldNames(new string[] { "content" });
                mlt.MinTermFreq = 1;
                mlt.MinDocFreq  = 1;

                var reader = new System.IO.StringReader("are the most well known");
                var query  = mlt.Like(reader);

                using (var searcher = new IndexSearcher(path, true))
                {
                    var topDocs = searcher.Search(query, 5);
                    foreach (var scoreDoc in topDocs.ScoreDocs)
                    {
                        // The document is loaded but not used any further.
                        Document doc = searcher.Doc(scoreDoc.Doc);
                    }
                }
            }
        }
Пример #17
0
        /// <summary>
        /// Generates the query for the text "lucene release" with boosting enabled and records
        /// the boost of every term in it.
        /// </summary>
        /// <returns>A map from term text to the boost of that term's query.</returns>
        private IDictionary<string, float> GetOriginalValues()
        {
            IDictionary<string, float> boostsByTerm = new Dictionary<string, float>();

            var similarity = new MoreLikeThis(reader)
            {
                Analyzer    = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false),
                MinDocFreq  = 1,
                MinTermFreq = 1,
                MinWordLen  = 1,
                FieldNames  = new[] { "text" },
                ApplyBoost  = true
            };

            var generated = (BooleanQuery)similarity.Like(new StringReader("lucene release"), "text");
            IList<BooleanClause> generatedClauses = generated.Clauses;

            for (int index = 0; index < generatedClauses.Count; index++)
            {
                var termQuery = (TermQuery)generatedClauses[index].Query;
                boostsByTerm[termQuery.Term.Text] = termQuery.Boost;
            }

            return boostsByTerm;
        }
        /// <summary>
        /// Finds content items similar to the item with the given id using a MoreLikeThis query.
        /// </summary>
        /// <param name="id">Id of the item to compare against; it is excluded from the results.</param>
        /// <param name="context">
        /// Supplies the index name, the optional field names, the optional content type filter
        /// and the number of hits to return.
        /// </param>
        /// <returns>The matching hits, fully loaded before the index is closed.</returns>
        public IEnumerable<ISearchHit> GetRelatedItems(int id, RelatedContentContext context)
        {
            // The reader and searcher are opened here, so they are released here as well.
            using (IndexReader reader = IndexReader.Open(GetDirectory(context.Index), true))
            using (var indexSearcher = new IndexSearcher(reader))
            {
                var analyzer = _analyzerProvider.GetAnalyzer(context.Index);

                var mlt = new MoreLikeThis(reader)
                {
                    Boost = true, MinTermFreq = 1, Analyzer = analyzer, MinDocFreq = 1
                };

                // Without explicit field names the MoreLikeThis defaults apply.
                if (context.FieldNames.Length > 0)
                {
                    mlt.SetFieldNames(context.FieldNames);
                }

                var docid = GetDocumentId(id, indexSearcher);

                BooleanQuery query = (BooleanQuery)mlt.Like(docid);

                if (!String.IsNullOrWhiteSpace(context.ContentType))
                {
                    var contentTypeQuery = new TermQuery(new Term("type", context.ContentType));
                    query.Add(new BooleanClause(contentTypeQuery, Occur.MUST));
                }

                // Exclude the source document itself.
                var exclude = new TermQuery(new Term("id", id.ToString()));
                query.Add(new BooleanClause(exclude, Occur.MUST_NOT));

                TopDocs simDocs = indexSearcher.Search(query, context.Count);

                // Materialise the hits now: the projection reads from the searcher, which is
                // disposed when this method returns.
                return simDocs.ScoreDocs
                       .Select(scoreDoc => new LuceneSearchHit(indexSearcher.Doc(scoreDoc.Doc), scoreDoc.Score))
                       .ToList();
            }
        }
Пример #19
0
        /// <summary>
        /// Builds a "more like this" query from the indexed document that belongs to the given post,
        /// comparing the "Title", "Name", "Description", "Publisher" and "Author" fields.
        /// </summary>
        /// <param name="postId">Identifier handed to <c>GetLuceneDocumentNumber</c> to locate the document.</param>
        /// <returns>The generated query, or <c>null</c> when the document number lookup yields 0.</returns>
        private static Query CreateMoreLikeThisQuery(int postId)
        {
            int documentNumber = GetLuceneDocumentNumber(postId);

            // NOTE(review): 0 is used as the "not found" sentinel; confirm the lookup helper
            // never returns 0 for a document that really exists.
            if (documentNumber == 0)
            {
                return null;
            }

            var standardAnalyzer = new StandardAnalyzer(_version);

            using (var indexSearcher = new IndexSearcher(_directory, false))
            {
                var similarity = new MoreLikeThis(indexSearcher.IndexReader)
                {
                    Analyzer = standardAnalyzer
                };

                var similarityFields = new[] { "Title", "Name", "Description", "Publisher", "Author" };
                similarity.SetFieldNames(similarityFields);

                similarity.MinDocFreq = 1;
                similarity.MinTermFreq = 1;
                similarity.Boost = true;

                return similarity.Like(documentNumber);
            }
        }
        /// <summary>
        /// Builds a "more like this" query from the indexed document that belongs to the given
        /// product, comparing the "Name" and "Description" fields.
        /// </summary>
        /// <param name="prodcutId">Identifier handed to <c>GetLuceneDocumentNumber</c> to locate the document.</param>
        /// <returns>The generated query, or <c>null</c> when the document number lookup yields 0.</returns>
        private static Query CreateMoreLikeThisQuery(long prodcutId)
        {
            var documentNumber = GetLuceneDocumentNumber(prodcutId);

            // NOTE(review): 0 is used as the "not found" sentinel; confirm the lookup helper
            // never returns 0 for a document that really exists.
            if (documentNumber == 0)
            {
                return null;
            }

            var standardAnalyzer = new StandardAnalyzer(_version);
            var sharedReader     = Searcher.IndexReader;

            var similarity = new MoreLikeThis(sharedReader)
            {
                Analyzer = standardAnalyzer
            };

            var similarityFields = new[] { "Name", "Description" };
            similarity.SetFieldNames(similarityFields);

            similarity.MinDocFreq = 1;
            similarity.MinTermFreq = 1;
            similarity.Boost = true;

            return similarity.Like(documentNumber);
        }
Пример #21
0
        /// <summary>
        /// Searches the index for segments similar to <paramref name="searchText"/> and returns,
        /// for each of the top 50 hits, its key, score and the stored text in every requested language.
        /// </summary>
        /// <param name="dir">Directory that holds the index to search.</param>
        /// <param name="lang">Currently unused: the similarity field is hard-coded to "fr".</param>
        /// <param name="searchText">Text to find similar segments for.</param>
        /// <param name="languages">Names of the stored fields to read from every hit.</param>
        /// <returns>One <c>QueryResult</c> per hit.</returns>
        public static IEnumerable<QueryResult> SearchTranslationProjects(Lucene.Net.Store.Directory dir, string lang, string searchText, IEnumerable<string> languages)
        {
            var results = new List<QueryResult>();

            // The reader is opened here, so it is released here as well.
            using (var ir = IndexReader.Open(dir, true))
            {
                var mlt = new MoreLikeThis(ir);

                //mlt.SetFieldNames(new string[] { lang });
                mlt.SetFieldNames(new[] { "fr" });
                mlt.MinTermFreq = 1;
                mlt.MinDocFreq  = 1;
                mlt.MinWordLen  = 4;
                //mlt.Analyzer = new Lucene.Net.Analysis.Fr.FrenchAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
                //mlt.Analyzer = new Lucene.Net.Analysis.Snowball.SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, "French");
                var reader = new System.IO.StringReader(searchText);
                var query  = mlt.Like(reader);

                using (var searcher = new IndexSearcher(dir, true))
                {
                    var topDocs = searcher.Search(query, 50);
                    foreach (var scoreDoc in topDocs.ScoreDocs)
                    {
                        Document doc   = searcher.Doc(scoreDoc.Doc);
                        float    score = scoreDoc.Score;

                        // Deferred projection over the already loaded document; it does not
                        // touch the searcher when enumerated later.
                        var trads = languages.Select(l => new Segment(l, doc.Get(l)));

                        results.Add(new QueryResult(doc.Get("key"), trads, score));
                    }
                }
            }

            return results;
        }
Пример #22
0
        /// <summary>
        /// Assigns a class (with score) to the given text string.
        /// </summary>
        /// <param name="text">A string containing the text to be classified.</param>
        /// <returns>
        /// A <see cref="ClassificationResult{BytesRef}"/> holding the assigned class of type
        /// <see cref="BytesRef"/> and its score, as chosen by <c>SelectClassFromNeighbors</c>.
        /// </returns>
        /// <exception cref="IOException">Thrown when <c>mlt</c> has not been set yet.</exception>
        public virtual ClassificationResult<BytesRef> AssignClass(string text)
        {
            if (mlt is null)
            {
                throw new IOException("You must first call Classifier#train");
            }

            var neighborsQuery = new BooleanQuery();

            // One optional "like this text" clause per configured text field.
            foreach (string fieldName in textFieldNames)
            {
                var likeText = mlt.Like(new StringReader(text), fieldName);
                neighborsQuery.Add(new BooleanClause(likeText, Occur.SHOULD));
            }

            // Only documents that carry some value in the class field may match.
            var hasClassValue = new WildcardQuery(new Term(classFieldName, "*"));
            neighborsQuery.Add(new BooleanClause(hasClassValue, Occur.MUST));

            // An additional query, when configured, must match as well.
            if (query != null)
            {
                neighborsQuery.Add(query, Occur.MUST);
            }

            TopDocs nearest = indexSearcher.Search(neighborsQuery, k);
            return SelectClassFromNeighbors(nearest);
        }
Пример #23
0
 /// <summary>
 /// Maps the document onto the target via the base implementation, prints the document
 /// frequency of the term "words" in field "Text", and stores a MoreLikeThis query for the
 /// current hit under the target.
 /// </summary>
 public override void ToObject(Document source, IQueryExecutionContext context, T target)
 {
     base.ToObject(source, context, target);

     var probeTerm = new Term("Text", "words");
     Console.WriteLine(context.Searcher.DocFreq(probeTerm));

     var currentDoc = context.CurrentScoreDoc.Doc;
     queries[target] = mlt.Like(currentDoc);
 }
Пример #24
0
 /// <summary>
 /// Generates a query for the "foobar" field while two field names are configured.
 /// The result is discarded; the test only requires the call to complete.
 /// </summary>
 public void TestMultiFields()
 {
     var similarity = new MoreLikeThis(reader)
     {
         Analyzer = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false),
         MinDocFreq = 1,
         MinTermFreq = 1,
         MinWordLen = 1,
         FieldNames = new[] { "text", "foobar" }
     };

     var input = new StringReader("this is a test");
     similarity.Like(input, "foobar");
 }
Пример #25
0
        /// <summary>
        /// Finds posts similar to the given post, comparing the title, body and tag fields.
        /// </summary>
        /// <param name="postid">Id of the post to compare against; values of 0 or less yield an empty list.</param>
        /// <param name="itemsToReturn">Maximum number of similar posts to return.</param>
        /// <returns>
        /// The similar posts, without the source post. The search is best-effort: when it fails,
        /// the error is traced and whatever was collected so far (possibly nothing) is returned.
        /// </returns>
        public List<Post> Similar(int postid, int itemsToReturn)
        {
            var list = new List<Post>();

            if (postid <= 0)
            {
                return list;
            }

            EnsureIndexExists();

            var query = GetIdSearchQuery(postid);

            lck.AcquireReaderLock(ReaderTimeOut);
            try
            {
                // Disposal happens inside the try block so that a failing Dispose can no longer
                // skip the lock release in the finally block.
                using (var searcher = new IndexSearcher(rd))
                {
                    // Locate the original document; a single hit is all that is needed.
                    TopDocs hits = searcher.Search(query, 1);
                    if (hits == null || hits.ScoreDocs.Length <= 0)
                    {
                        return list;
                    }

                    int docNum = hits.ScoreDocs[0].Doc;
                    if (docNum > -1)
                    {
                        using (var reader = IndexReader.Open(rd, true))
                        {
                            var mlt = new MoreLikeThis(reader);
                            mlt.Analyzer = _analyzer;
                            mlt.SetFieldNames(new[] { SearchFields.Title, SearchFields.Body, SearchFields.Tag });
                            mlt.MinDocFreq  = 5;
                            mlt.MinTermFreq = 2;
                            mlt.Boost       = true;
                            var moreResultsQuery = mlt.Like(docNum);

                            // Ask for one extra hit: the source post may match its own terms and is
                            // filtered out below, which would otherwise cost one result slot.
                            int fetchCount = itemsToReturn == int.MaxValue ? itemsToReturn : itemsToReturn + 1;
                            TopDocs similarhits = searcher.Search(moreResultsQuery, fetchCount);

                            for (int i = 0; i < similarhits.ScoreDocs.Length; i++)
                            {
                                Document doc  = searcher.Doc(similarhits.ScoreDocs[i].Doc);
                                var      post = CreatePostFromDocument(doc, null);
                                if (postid != post.Id)
                                {
                                    list.Add(post);
                                }

                                if (list.Count >= itemsToReturn)
                                {
                                    break;
                                }
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Still best-effort, but no longer silent: record why the search failed.
                System.Diagnostics.Trace.TraceError("Similar(" + postid + ") failed: " + ex);
            }
            finally
            {
                lck.ReleaseReaderLock();
            }

            return list;
        }
Пример #26
0
        /// <summary>
        /// Finds currently published pages similar to the page with the given key, comparing the
        /// "title", "summary", "content" and "tags" fields.
        /// </summary>
        /// <param name="key">Value of the "key" field of the page to compare against; the page itself is excluded.</param>
        /// <param name="resultOffset">Index of the first hit to include in the result.</param>
        /// <param name="resultLength">Maximum number of hits to include in the result.</param>
        /// <param name="matchCategory">
        /// When true and the source page has a non-empty "category" field, only pages of the same
        /// (lower-cased) category are matched.
        /// </param>
        /// <returns>
        /// The requested window of hits with the total hit count and the time taken, or an empty
        /// <c>SearchResult</c> when the source page is not found.
        /// </returns>
        public SearchResult FindSimular(string key, int resultOffset, int resultLength, bool matchCategory) {
            var pageQuery = new TermQuery(new Term("key", key));
            var topDocs = _searcher.Search(pageQuery, 1);
            if (topDocs.TotalHits == 0) {
                return new SearchResult();
            }

            var doc = topDocs.ScoreDocs[0].Doc;

            var moreLikeThis = new MoreLikeThis(_reader) {
                Analyzer = _analyzer,
                MinWordLen = 3
            };
            moreLikeThis.SetFieldNames(new[] { "title", "summary", "content", "tags" });
            moreLikeThis.SetStopWords(StopWords.DefaultEnglish);
            moreLikeThis.MinDocFreq = 2;

            var query = moreLikeThis.Like(doc);

            // A stopwatch is monotonic; wall-clock differences can jump when the system clock
            // or the daylight-saving offset changes.
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();
            var ticks = DateTime.UtcNow.Ticks;

            // Only pages whose publish window contains the current instant.
            Query publishStartQuery = NumericRangeQuery.NewLongRange("publishStart", null, ticks, true, false);
            Query publishStopQuery = NumericRangeQuery.NewLongRange("publishStop", ticks, null, false, true);

            var booleanQuery = new BooleanQuery {
                {query, Occur.MUST},
                {pageQuery, Occur.MUST_NOT},
                {publishStartQuery, Occur.MUST},
                {publishStopQuery, Occur.MUST}
            };

            if (matchCategory) {
                var document = _searcher.Doc(doc);
                var field = document.GetField("category");

                if (field != null && !string.IsNullOrEmpty(field.StringValue)) {
                    var categoryQuery = new TermQuery(new Term("category", field.StringValue.ToLowerInvariant()));
                    booleanQuery.Add(categoryQuery, Occur.MUST);
                }
            }

            var scoreDocs = _searcher.Search(booleanQuery, null, MaxHits, Sort.RELEVANCE).ScoreDocs;

            var result = new SearchResult { NumberOfHits = scoreDocs.Length };

            if (resultOffset < scoreDocs.Length) {
                // Clamp the requested window to the hits that are available.
                var resultUpperOffset = resultOffset + resultLength;
                if (resultUpperOffset > scoreDocs.Length) {
                    resultUpperOffset = scoreDocs.Length;
                }

                for (int i = resultOffset; i < resultUpperOffset; i++) {
                    Document document = _searcher.Doc(scoreDocs[i].Doc);

                    Guid pageId;
                    (document.Get("pageId") ?? string.Empty).TryParseGuid(out pageId);

                    var hit = new SearchHit {
                        PageId = pageId,
                        Path = document.Get("path"),
                        Title = document.Get("title"),
                        Excerpt = document.Get("summary")
                    };

                    result.Hits.Add(hit);
                }
            }

            result.SecondsTaken = stopwatch.Elapsed.TotalSeconds;

            return result;
        }
Пример #27
0
        /// <summary>
        /// Searches the index for documents similar to the post with the given id.
        /// </summary>
        /// <param name="postId">Id of the source post; the post itself is excluded from the results.</param>
        /// <param name="filterByCategory">When set, only documents whose "Categories" field contains this value are returned.</param>
        /// <param name="languageId">When greater than -1, only documents with this "LanguageId" are returned.</param>
        /// <param name="postType">When set, only documents with this "PostType" are returned.</param>
        /// <param name="searchPlace">Fields used to compute similarity (Title, Description, Keywords, Tags, or all of them for Anywhere).</param>
        /// <param name="maxResult">Maximum number of documents requested from the searcher.</param>
        /// <param name="orderBy">Result ordering; anything other than the handled sort types falls back to score ordering.</param>
        /// <returns>
        /// A <see cref="SearchResult"/> with the matching documents (de-duplicated by document id) and the
        /// elapsed time in milliseconds. Exceptions are not propagated: they are stored in
        /// <c>Error</c> and flagged through <c>HasError</c>. When the source post is not found in the
        /// index the result contains no documents.
        /// </returns>
        public virtual SearchResult MoreLikeThis(int postId, int? filterByCategory = null, int languageId = -1, PostType? postType = null, SearchPlace searchPlace = SearchPlace.Title | SearchPlace.Description,
                                                 int maxResult = 5, SearchResultSortType orderBy = SearchResultSortType.Score)
        {
            var result = new SearchResult();
            var watch = System.Diagnostics.Stopwatch.StartNew();

            try
            {
                using (var directory = FSDirectory.Open(new DirectoryInfo(_indexFilesPath)))
                using (var searcher = new IndexSearcher(directory, readOnly: true))
                {
                    var docNumber = GetLuceneDocNumber(postId, searcher);

                    if (docNumber == -1)
                    {
                        // Source post is not indexed; the finally block still records the elapsed time.
                        return result;
                    }

                    // Fields the similarity query is built from.
                    var searchInFields = new List<string>();
                    if (searchPlace == SearchPlace.Anywhere)
                    {
                        searchInFields.AddRange(new string[] { "Title", "Description", "Keywords", "Tags" });
                    }
                    else
                    {
                        if (searchPlace.HasFlagFast(SearchPlace.Title))
                        {
                            searchInFields.Add("Title");
                        }

                        if (searchPlace.HasFlagFast(SearchPlace.Description))
                        {
                            searchInFields.Add("Description");
                        }

                        if (searchPlace.HasFlagFast(SearchPlace.Keywords))
                        {
                            searchInFields.Add("Keywords");
                        }

                        if (searchPlace.HasFlagFast(SearchPlace.Tags))
                        {
                            searchInFields.Add("Tags");
                        }
                    }

                    // Never return the source post itself; the remaining clauses are optional restrictions.
                    var filter = new BooleanFilter();

                    filter.Add(new FilterClause(
                                   new QueryWrapperFilter(new TermQuery(new Term("ID", postId.ToString()))),
                                   Occur.MUST_NOT));

                    if (languageId > -1)
                    {
                        filter.Add(new FilterClause(
                                       new QueryWrapperFilter(new TermQuery(new Term("LanguageId", languageId.ToString()))),
                                       Occur.MUST));
                    }

                    if (filterByCategory != null)
                    {
                        filter.Add(new FilterClause(
                                       new QueryWrapperFilter(new TermQuery(new Term("Categories", filterByCategory.Value.ToString()))),
                                       Occur.MUST));
                    }

                    if (postType != null)
                    {
                        filter.Add(new FilterClause(
                                       new QueryWrapperFilter(new TermQuery(new Term("PostType", postType.Value.ToString()))),
                                       Occur.MUST));
                    }

                    // Score ordering unless one of the explicit (descending) field sorts is requested.
                    Sort sort = new Sort(SortField.FIELD_SCORE);

                    switch (orderBy)
                    {
                        case SearchResultSortType.NumberOfVisits:
                            sort = new Sort(new SortField("NumberOfVisit", SortField.INT, true));
                            break;

                        case SearchResultSortType.PublishDate:
                            sort = new Sort(new SortField("PublishDate", SortField.LONG, true));
                            break;

                        case SearchResultSortType.LastUpDate:
                            sort = new Sort(new SortField("LastUpDate", SortField.LONG, true));
                            break;
                    }

                    var analyzer = new StandardAnalyzer(Version);
                    try
                    {
                        var moreLikeThis = new MoreLikeThis(searcher.IndexReader)
                        {
                            Analyzer = analyzer
                        };
                        moreLikeThis.SetFieldNames(searchInFields.ToArray());
                        moreLikeThis.MinDocFreq  = 1;
                        moreLikeThis.MinTermFreq = 1;
                        moreLikeThis.Boost       = true;

                        var query = moreLikeThis.Like(docNumber);

                        var hits = searcher.Search(query, filter, maxResult, sort).ScoreDocs;

                        foreach (var scoreDoc in hits)
                        {
                            var doc = searcher.Doc(scoreDoc.Doc);
                            result.Documents.Add(new SearchResultDocument()
                            {
                                DocumentId       = int.Parse(doc.Get("ID")),
                                LanguageId       = int.Parse(doc.Get("LanguageId")),
                                LanguageIsoCode  = doc.Get("LanguageCode"),
                                Score            = scoreDoc.Score,
                                DocumentTitle    = doc.Get("Title"),
                                DocumentBody     = doc.Get("Description"),
                                DocumentKeywords = doc.Get("Keywords"),
                                DocumentTags     = doc.Get("Tags"),
                            });
                        }

                        result.Documents = result.Documents.DistinctBy(p => p.DocumentId)
                                           .ToList();
                    }
                    finally
                    {
                        // Close the analyzer even when the search or the document mapping throws.
                        analyzer.Close();
                    }
                }
            }
            catch (Exception ex)
            {
                // Best-effort API: failures are reported through the result instead of being thrown.
                result.Error    = ex;
                result.HasError = true;
            }
            finally
            {
                // Runs for the early return as well, so the elapsed time is always reported.
                watch.Stop();
                result.ElapsedMilliseconds = watch.ElapsedMilliseconds;
            }

            return result;
        }