Beispiel #1
0
        public string Search(string strQuery)
        {
            // Runs strQuery against the article index and renders the hits as an
            // HTML fragment: a hit-count header followed by one linked title and
            // one highlighted snippet per article.
            // Fixes vs. previous version: StringBuilder instead of O(n^2) string
            // concatenation, the reader is released even when parsing/search
            // throws, and a single analyzer is reused instead of one per hit.
            System.Text.StringBuilder result = new System.Text.StringBuilder();

            Lucene.Net.Index.IndexReader reader = Lucene.Net.Index.IndexReader.Open(Server.MapPath(System.Configuration.ConfigurationManager.AppSettings["IndexingArticle"]));
            try
            {
                // One analyzer serves both query parsing and snippet token streams.
                Lucene.Net.Analysis.Standard.StandardAnalyzer analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer();
                Lucene.Net.QueryParsers.QueryParser parser = new Lucene.Net.QueryParsers.QueryParser("ArticleDetail", analyzer);
                Lucene.Net.Search.Query query = parser.Parse(strQuery);
                Lucene.Net.Search.IndexSearcher searcher = new Lucene.Net.Search.IndexSearcher(reader);
                Lucene.Net.Search.Hits hits = searcher.Search(query);
                Lucene.Net.Highlight.QueryScorer score = new Lucene.Net.Highlight.QueryScorer(query);
                Lucene.Net.Highlight.SimpleHTMLFormatter formater = new Lucene.Net.Highlight.SimpleHTMLFormatter("<span class='Highlight'>", "</span>");
                Lucene.Net.Highlight.Highlighter highlighter = new Lucene.Net.Highlight.Highlighter(formater, score);

                result.Append("<div align='right' style='background-color:#F0F7F9; padding-right:15px' height='30px'><font style='FONT-WEIGHT: bold; FONT-SIZE: 10pt; COLOR: #005482; FONT-FAMILY: arial'>Kết quả tìm thấy : " + hits.Length() + "  </font></div>");
                result.Append("<div style='padding: 10px 10px 10px 10px;'>");
                for (int i = 0; i < hits.Length(); i++)
                {
                    string id     = hits.Doc(i).Get("ArticleId");
                    string title  = hits.Doc(i).Get("ArticleTitle");
                    string detail = hits.Doc(i).Get("ArticleDetail");
                    Lucene.Net.Analysis.TokenStream ts = analyzer.TokenStream("ArticleDetail", new System.IO.StringReader(detail));
                    result.AppendFormat("<div align='left'><font style='FONT-WEIGHT: bold; FONT-SIZE: 10pt; COLOR: #5b5b5b; FONT-FAMILY: arial'><a href='/?ArticleId={0}'>{1}</a></font>", id, title);
                    result.AppendFormat("<div align='left'><font style='FONT-SIZE: 9pt' face='Arial' color='#005482'>...{0}...</font></div></div></br>", highlighter.GetBestFragment(ts, detail));
                }
                result.Append("</div>");
            }
            finally
            {
                // Always release the index reader, even when parsing/search throws.
                reader.Close();
            }
            return result.ToString();
        }
Beispiel #2
0
        public string SearchAndPaging(string strQuery, string index)
        {
            // Searches published articles belonging to the current sub-domain for
            // strQuery and returns page `index` (20 hits per page) serialized as
            // JSON: { "Count": total hits, "Data": list of SearchArticle }.
            // Returns an empty string when anything fails (best-effort contract).
            //
            // Fixes vs. previous version:
            //  - the trailing-"OR" trim ran inside the page loop, so ids from a
            //    second page were appended with no OR separator (malformed query);
            //    the clause is now built with string.Join once, after collection.
            //  - hits.Length() is captured before the reader is closed.
            //  - the reader is closed in a finally block.
            //  - dead "result +=" HTML (always overwritten by the serializer, but
            //    leaked as partial output when an exception struck) is removed.
            string result = string.Empty;

            try
            {
                List<SearchArticle> searchArticleList = new List<SearchArticle>();
                PSCPortal.CMS.ArticleCollection ArticleList = ArticleCollection.GetArticleCollectionPublish();
                string nameSub = Libs.Ultility.GetSubDomain() == string.Empty ? "HomePage" : Libs.Ultility.GetSubDomain();
                SubDomain subDomain = PSCPortal.Engine.SubDomain.GetSubByName(nameSub);
                PageCollection pagesBelongTo = subDomain.GetPagesBelongTo();

                // Collect the ids of all published articles on the sub-domain's
                // pages and join them into a single "id1 OR id2 OR ..." clause.
                List<string> ids = new List<string>();
                foreach (var page in pagesBelongTo)
                {
                    foreach (var ar in ArticleList.Where(ar => ar.PageId == page.Id))
                    {
                        ids.Add(ar.Id.ToString());
                    }
                }
                string strId = string.Join(" OR ", ids.ToArray());

                int pageIndex = Int32.Parse(index);
                string strSearch = " ArticleDetail:(" + strQuery + ") AND ArticleId:" + "( " + strId + " )";

                Lucene.Net.Index.IndexReader reader = Lucene.Net.Index.IndexReader.Open(Server.MapPath(System.Configuration.ConfigurationManager.AppSettings["IndexingArticle"]));
                try
                {
                    Lucene.Net.Analysis.Standard.StandardAnalyzer analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer();
                    Lucene.Net.QueryParsers.QueryParser parser = new Lucene.Net.QueryParsers.QueryParser("ArticleDetail", analyzer);
                    Lucene.Net.Search.Query query = parser.Parse(strSearch);
                    Lucene.Net.Search.IndexSearcher searcher = new Lucene.Net.Search.IndexSearcher(reader);
                    Lucene.Net.Search.Hits hits = searcher.Search(query);
                    Lucene.Net.Highlight.QueryScorer score = new Lucene.Net.Highlight.QueryScorer(query);
                    Lucene.Net.Highlight.SimpleHTMLFormatter formater = new Lucene.Net.Highlight.SimpleHTMLFormatter("<span class='Highlight'>", "</span>");
                    Lucene.Net.Highlight.Highlighter highlighter = new Lucene.Net.Highlight.Highlighter(formater, score);

                    // Capture the count before the reader is released below.
                    int hitCount = hits.Length();

                    // Page `pageIndex` is 1-based and covers hits
                    // [pageIndex*20 - 20, pageIndex*20).
                    for (int i = pageIndex * 20 - 20; i < pageIndex * 20 && i < hitCount; i++)
                    {
                        string detail = hits.Doc(i).Get("ArticleDetail");
                        Lucene.Net.Analysis.TokenStream ts = analyzer.TokenStream("ArticleDetail", new System.IO.StringReader(detail));
                        SearchArticle searchArticle = new SearchArticle();
                        searchArticle.Id        = hits.Doc(i).Get("ArticleId");
                        searchArticle.Title     = hits.Doc(i).Get("ArticleTitle");
                        searchArticle.Highligth = highlighter.GetBestFragment(ts, detail);
                        searchArticleList.Add(searchArticle);
                    }

                    JavaScriptSerializer serializer = new JavaScriptSerializer();
                    Dictionary<string, object> resultDic = new Dictionary<string, object>();
                    resultDic["Count"] = hitCount;
                    resultDic["Data"]  = searchArticleList;
                    result = serializer.Serialize(resultDic);
                }
                finally
                {
                    reader.Close(); // release the index even if search/serialization throws
                }
            }
            catch (Exception)
            {
                // Best-effort: swallow and return the empty result, preserving the
                // original contract. TODO(review): log the failure somewhere.
            }
            return(result);
        }
		public virtual void  TestEncoding()
		{
			// Raw content containing characters (&, <, >, quotes) that must be
			// XML-escaped by the SimpleHTMLEncoder.
			String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article";
			// The scorer highlights no tokens but scores one fragment for
			// selection, so the highlighter only has to encode the output.
			Highlighter highlighter = new Highlighter(this, new SimpleHTMLEncoder(), new AnonymousClassScorer());
			highlighter.SetTextFragmenter(new SimpleFragmenter(2000));
			TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new System.IO.StringReader(rawDocContent));
			String encodedSnippet = highlighter.GetBestFragments(tokenStream, rawDocContent, 1, "");

			// Embed the snippet in a minimal XHTML document...
			String xhtml =
				"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
				"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n" +
				"<head>\n" + "<title>My Test HTML Document</title>\n" + "</head>\n" + "<body>\n" + "<h2>" +
				encodedSnippet + "</h2>\n" + "</body>\n" + "</html>";

			// ...then parse it back out to prove the snippet was encoded well
			// enough to survive an XML round trip.
			System.Xml.XmlDocument doc = new System.Xml.XmlDocument();
			doc.LoadXml(xhtml);
			System.Xml.XmlNode body = doc.GetElementsByTagName("body")[0];
			System.Xml.XmlNode h2 = body.ChildNodes[0];
			String decodedSnippet = h2.FirstChild.InnerText;
			Assert.AreEqual(rawDocContent, decodedSnippet, "XHTML Encoding should have worked:");
		}
		public virtual void  TestNoFragments()
		{
			// A query matching nothing must produce no highlight fragment for
			// any of the sample texts.
			DoSearching("AnInvalidQueryWhichShouldYieldNoResults");
			Highlighter highlighter = new Highlighter(this, new QueryScorer(query));

			foreach (System.String text in texts)
			{
				TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new System.IO.StringReader(text));
				System.String fragment = highlighter.GetBestFragment(tokenStream, text);
				Assert.IsNull(fragment, "The highlight result should be null for text with no query terms");
			}
		}
		public virtual void  TestUnRewrittenQuery()
		{
			// Shows that a primitive (un-rewritten) wildcard/prefix query yields
			// zero highlights: QueryScorer only sees real terms after rewrite.
			searcher = new IndexSearcher(ramDir);
			Analyzer stdAnalyzer = new StandardAnalyzer();

			QueryParser parser = new QueryParser(FIELD_NAME, stdAnalyzer);
			Query primitiveQuery = parser.Parse("JF? or Kenned*");
			System.Console.Out.WriteLine("Searching with primitive query");
			// Deliberately NOT calling primitiveQuery.Rewrite(reader) here.
			Hits searchHits = searcher.Search(primitiveQuery);

			// Build the highlighter straight from the primitive query.
			Highlighter highlighter = new Highlighter(this, new QueryScorer(primitiveQuery));
			highlighter.SetTextFragmenter(new SimpleFragmenter(40));

			int maxNumFragmentsRequired = 3;
			for (int i = 0; i < searchHits.Length(); i++)
			{
				System.String text = searchHits.Doc(i).Get(FIELD_NAME);
				TokenStream tokenStream = stdAnalyzer.TokenStream(FIELD_NAME, new System.IO.StringReader(text));
				System.String highlightedText = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
				System.Console.Out.WriteLine(highlightedText);
			}
			// Zero highlights expected because the multi-term query was never rewritten.
			Assert.IsTrue(numHighlights == 0, "Failed to find correct number of highlights " + numHighlights + " found");
		}
		public virtual void  TestMaxSizeHighlightTruncates()
		{
			// One matching token followed by a huge run of stop words; the
			// highlighter must stop analyzing once MaxDocBytesToAnalyze is hit,
			// so the returned match stays short.
			System.String goodWord = "goodtoken";
			System.String[] stopWords = new System.String[]{"stoppedtoken"};

			TermQuery query = new TermQuery(new Term("data", goodWord));
			Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
			hg.SetTextFragmenter(new NullFragmenter());

			System.Text.StringBuilder content = new System.Text.StringBuilder(goodWord);
			for (int i = 0; i < 10000; i++)
			{
				content.Append(" ");
				content.Append(stopWords[0]);
			}

			hg.SetMaxDocBytesToAnalyze(100);
			System.String match = hg.GetBestFragment(new StandardAnalyzer(stopWords), "data", content.ToString());
			Assert.IsTrue(match.Length < hg.GetMaxDocBytesToAnalyze(), "Matched text should be no more than 100 chars in length ");

			// Append one more tokenized word far beyond the analyzed window
			// (after the large slug of stop words + whitespace); it must not
			// affect the truncated match.
			content.Append(" ");
			content.Append(goodWord);
			match = hg.GetBestFragment(new StandardAnalyzer(stopWords), "data", content.ToString());
			Assert.IsTrue(match.Length < hg.GetMaxDocBytesToAnalyze(), "Matched text should be no more than 100 chars in length ");
		}
		public virtual void  TestMaxSizeHighlight()
		{
			// With only 30 bytes analyzed, the match for "meat" falls outside
			// the analyzed window, so no highlights may be produced.
			DoSearching("meat");
			Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
			highlighter.SetMaxDocBytesToAnalyze(30);

			System.String text = texts[0];
			TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new System.IO.StringReader(text));
			highlighter.GetBestFragment(tokenStream, text);

			Assert.IsTrue(numHighlights == 0, "Setting MaxDocBytesToAnalyze should have prevented " + "us from finding matches for this record: " + numHighlights + " found");
		}
		public virtual void  TestOverlapAnalyzer2()
		{
			// Highlighting of overlapping tokens in "Hi-Speed10 foo".
			// Each case pairs a query with the expected highlighted output; the
			// same expectations must hold whichever overlapping token the test
			// token stream emits first (GetTS2 = smaller first, GetTS2a = bigger
			// first). The previous version copy-pasted the same four lines
			// twelve times; the cases are now data-driven.
			System.String s = "Hi-Speed10 foo";
			System.String[][] cases = new System.String[][]
			{
				new System.String[]{"foo", "Hi-Speed10 <B>foo</B>"},
				new System.String[]{"10", "Hi-Speed<B>10</B> foo"},
				new System.String[]{"hi", "<B>Hi</B>-Speed10 foo"},
				new System.String[]{"speed", "Hi-<B>Speed</B>10 foo"},
				new System.String[]{"hispeed", "<B>Hi-Speed</B>10 foo"},
				new System.String[]{"hi speed", "<B>Hi-Speed</B>10 foo"}
			};

			// Smaller overlapping token first.
			foreach (System.String[] c in cases)
			{
				Query query = new QueryParser("text", new WhitespaceAnalyzer()).Parse(c[0]);
				Highlighter highlighter = new Highlighter(new QueryScorer(query));
				System.String result = highlighter.GetBestFragments(GetTS2(), s, 3, "...");
				Assert.AreEqual(result, c[1]);
			}

			// Same tests, just put the bigger overlapping token first.
			foreach (System.String[] c in cases)
			{
				Query query = new QueryParser("text", new WhitespaceAnalyzer()).Parse(c[0]);
				Highlighter highlighter = new Highlighter(new QueryScorer(query));
				System.String result = highlighter.GetBestFragments(GetTS2a(), s, 3, "...");
				Assert.AreEqual(result, c[1]);
			}
		}
        public virtual void  TestOverlapAnalyzer()
		{
			// A synonym analyzer emits several tokens at the same position;
			// the highlighter must mark the original word and its synonyms.
			System.Collections.Hashtable synonyms = new System.Collections.Hashtable();
			synonyms["football"] = "soccer,footie";
			Analyzer analyzer = new SynonymAnalyzer(synonyms);

			System.String srchkey = "football";
			System.String s = "football-soccer in the euro 2004 footie competition";
			Query query = new QueryParser("bookid", analyzer).Parse(srchkey);

			TokenStream tokenStream = analyzer.TokenStream(null, new System.IO.StringReader(s));
			Highlighter highlighter = new Highlighter(new QueryScorer(query));
			// Take the 3 best fragments, joined by "..."
			System.String result = highlighter.GetBestFragments(tokenStream, s, 3, "...");
			System.String expectedResult = "<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition";
			Assert.IsTrue(expectedResult.Equals(result), "overlapping analyzer should handle highlights OK");
		}
        public virtual void  TestGetBestSingleFragmentWithWeights()
		{
			// Explicit term weights steer fragment selection: with "hello"
			// weighted 10:1 the first fragment wins; after re-weighting
			// "kennedy" to 50 the second fragment wins.
			WeightedTerm[] wTerms = new WeightedTerm[]
			{
				new WeightedTerm(10f, "hello"),
				new WeightedTerm(1f, "kennedy")
			};
			Highlighter highlighter = new Highlighter(new QueryScorer(wTerms));
			highlighter.SetTextFragmenter(new SimpleFragmenter(2));
			TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new System.IO.StringReader(texts[0]));

			System.String result = highlighter.GetBestFragment(tokenStream, texts[0]).Trim();
			Assert.IsTrue("<B>Hello</B>".Equals(result), "Failed to find best section using weighted terms. Found: [" + result + "]");

			// Boost the second term so it outweighs the first.
			wTerms[1].SetWeight(50f);
			tokenStream = analyzer.TokenStream(FIELD_NAME, new System.IO.StringReader(texts[0]));
			highlighter = new Highlighter(new QueryScorer(wTerms));
			highlighter.SetTextFragmenter(new SimpleFragmenter(2));

			result = highlighter.GetBestFragment(tokenStream, texts[0]).Trim();
			Assert.IsTrue("<B>kennedy</B>".Equals(result), "Failed to find best section using weighted terms. Found: " + result);
		}
		public virtual void  TestOffByOne()
		{
			// Regression check: token offsets must not drift by one when the
			// text contains bracketed number ranges.
			TermQuery query = new TermQuery(new Term("data", "help"));
			Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
			hg.SetTextFragmenter(new NullFragmenter());

			System.String match = hg.GetBestFragment(new StandardAnalyzer(), "data", "help me [54-65]");
			Assert.AreEqual("<B>help</B> me [54-65]", match);
		}
        public virtual void  TestSimpleHighlighter()
		{
			// Smoke test: highlight every hit for "Kennedy" and print the
			// fragments. There is nothing precise to assert — the test only
			// checks that no exception is thrown.
			DoSearching("Kennedy");
			Highlighter highlighter = new Highlighter(new QueryScorer(query));
			highlighter.SetTextFragmenter(new SimpleFragmenter(40));

			int maxNumFragmentsRequired = 2;
			for (int i = 0; i < hits.Length(); i++)
			{
				System.String text = hits.Doc(i).Get(FIELD_NAME);
				TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new System.IO.StringReader(text));
				System.String fragments = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
				System.Console.Out.WriteLine("\t" + fragments);
			}
		}
        public IEnumerable<Content> Search(string keyword, int page, int pageSize, out int totals)
        {
            // Searches the "content" field for every word in keyword (all words
            // must appear as a phrase term sequence) and returns the requested
            // 1-based page of Content items, each decorated with a highlighted
            // snippet (capped at 500 chars). `totals` receives the overall hit
            // count.
            // Fixes vs. previous version: removed the unused pquery, collector,
            // TermQuery and QueryParser locals — the dead qp.Parse(keyword) call
            // could even throw a ParseException on malformed input for no
            // benefit — and deleted the commented-out experiments.
            lock (locker)
            {
                List<Content> result = new List<Content>();
                IndexReader reader = IndexReader.Open(DBNLConfigurationManager.LuceneElement.IndexingFolder);
                IndexSearcher searcher = new IndexSearcher(reader);

                // Require the search terms to occur (in order) in "content".
                PhraseQuery phrase = new PhraseQuery();
                foreach (string word in keyword.Split(' '))
                {
                    phrase.Add(new Term("content", word.ToLower()));
                }
                BooleanQuery myquery = new BooleanQuery();
                myquery.Add(phrase, BooleanClause.Occur.MUST);

                Hits hits = searcher.Search(myquery);
                totals = hits.Length();

                Lucene.Net.Highlight.Formatter formatter = new Lucene.Net.Highlight.SimpleHTMLFormatter(
            "<span class=\"Highlight\">",
            "</span>");
                Lucene.Net.Highlight.SimpleFragmenter fragmenter = new Lucene.Net.Highlight.SimpleFragmenter(400);
                Lucene.Net.Highlight.QueryScorer scorer = new Lucene.Net.Highlight.QueryScorer(myquery);
                Lucene.Net.Highlight.Highlighter highlighter = new Lucene.Net.Highlight.Highlighter(formatter, scorer);
                highlighter.SetTextFragmenter(fragmenter);

                // Page is 1-based: rows [(page-1)*pageSize, page*pageSize).
                for (int i = (page - 1) * pageSize; i < Math.Min(page * pageSize, hits.Length()); i++)
                {
                    Document doc = hits.Doc(i);
                    string raw_text = doc.Get("content");

                    Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(raw_text));
                    string highlighted_text = highlighter.GetBestFragments(stream, raw_text, 1, "...").Replace("'", "''");

                    if (highlighted_text == "") // sometimes the highlighter fails to emit text...
                    {
                        highlighted_text = raw_text.Replace("'", "''");
                    }
                    if (highlighted_text.Length > 500)
                    {
                        highlighted_text = highlighted_text.Substring(0, 500);
                    }

                    Content content = new ContentService().GetItem(int.Parse(doc.Get("id")));
                    content.HighlightText = highlighted_text;
                    result.Add(content);
                }
                reader.Close();
                searcher.Close();
                return result.AsEnumerable();
            }
        }
        public string Query(string keyword)
        {
            // Parses keyword against the "text" field, runs the search and
            // returns the concatenated highlighted snippets of (at most) the
            // first 100 distinct documents.
            // Fixes vs. previous version: a failed parse or search used to leave
            // `query`/`hits` null and then crash with a NullReferenceException at
            // hits.Length(); both failures now return an empty string. Also
            // removed a do-nothing loop over all hits and an unused guid local.
            Lucene.Net.Search.Query query = null;
            try
            {
                if (string.IsNullOrEmpty(keyword))
                {
                    throw new Exception("keywork is empty");
                }

                Lucene.Net.QueryParsers.QueryParser parser = new Lucene.Net.QueryParsers.QueryParser("text", analyzer);
                query = parser.Parse(keyword);
            }
            catch (Exception)
            {
                // Best-effort: an empty or unparsable keyword yields no results.
                return string.Empty;
            }

            lock (locker)
            {
                Lucene.Net.Search.Hits hits = null;
                try
                {
                    if (searcher == null)
                    {
                        searcher = new Lucene.Net.Search.IndexSearcher(DBNLConfigurationManager.LuceneElement.IndexingFolder);
                    }

                    hits = searcher.Search(query);
                }
                catch (Exception)
                {
                    // Best-effort: a failed search yields no results.
                    return string.Empty;
                }

                Lucene.Net.Highlight.Formatter formatter = new Lucene.Net.Highlight.SimpleHTMLFormatter(
            "<span style=\"background:yellow;color:red;\">",
            "</span>");
                Lucene.Net.Highlight.SimpleFragmenter fragmenter = new Lucene.Net.Highlight.SimpleFragmenter(400);
                Lucene.Net.Highlight.QueryScorer scorer = new Lucene.Net.Highlight.QueryScorer(query);
                Lucene.Net.Highlight.Highlighter highlighter = new Lucene.Net.Highlight.Highlighter(formatter, scorer);
                highlighter.SetTextFragmenter(fragmenter);

                StringBuilder sb = new StringBuilder();
                Dictionary<string, int> dict_already_seen_ids = new Dictionary<string, int>();

                // Collect snippets for up to 100 distinct document ids.
                for (int i = 0; i < hits.Length() && dict_already_seen_ids.Count < 100; i++)
                {
                    Lucene.Net.Documents.Document doc = hits.Doc(i);
                    string id = doc.Get("id");
                    if (dict_already_seen_ids.ContainsKey(id))
                    {
                        continue; // duplicate document id
                    }
                    dict_already_seen_ids[id] = 1;

                    string raw_text = doc.Get("raw_text");

                    Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(raw_text));
                    string highlighted_text = highlighter.GetBestFragments(stream, raw_text, 1, "...").Replace("'", "''");

                    if (highlighted_text == "") // sometimes the highlighter fails to emit text...
                    {
                        highlighted_text = raw_text.Replace("'", "''");
                    }
                    if (highlighted_text.Length > 3000)
                    {
                        highlighted_text = highlighted_text.Substring(0, 3000);
                    }
                    sb.Append(highlighted_text);
                    sb.Append("'");
                    sb.Append(")\n");
                }
                return sb.ToString();
            }
        }
Beispiel #15
0
        public SearchModel Search(string searchText)
        {
            // Full-text search over the site index. Stems the query with the
            // Lithuanian stemmer, appends a trailing wildcard, collects up to
            // 100 distinct documents and returns them with highlighted snippets.
            // On any search failure the error message is returned in
            // result.Message instead of throwing.
            // Idiom fixes vs. previous version: HashSet<string> replaces the
            // Dictionary-used-as-a-set, and the 100-document cap moved into the
            // loop condition instead of an if/else-break.
            var result = new SearchModel();

            if (string.IsNullOrEmpty(searchText))
            {
                result.Message = "Įveskite paieškos užklausą.";
                return(result);
            }

            var stemmedSearchText = new LithuanianStemmer().Stem(searchText.Trim());

            if (string.IsNullOrEmpty(stemmedSearchText))
            {
                result.Message = "Įveskite paieškos užklausą.";
                return(result);
            }

            Lucene.Net.Search.Hits hits = null;
            try
            {
                // Turn "word" into the prefix query "word*" so partial matches hit.
                if (char.IsLetter(stemmedSearchText[stemmedSearchText.Length - 1]))
                {
                    stemmedSearchText += "*";
                }

                query = parser.Parse(stemmedSearchText);

                if (searcher == null)
                {
                    searcher = new Lucene.Net.Search.IndexSearcher(CustomAppSettings.SearchIndexFolder);
                }

                hits = searcher.Search(query);
            }
            catch (Exception e)
            {
                result.Message = "Paieška nepavyko. Pataisykite užklausą. Klaidos pranešimas: " + e.Message;
                return(result);
            }

            Lucene.Net.Highlight.Formatter formatter = new Lucene.Net.Highlight.SimpleHTMLFormatter(
                "<span class=\"highlightResult\">",
                "</span>");

            var fragmenter  = new Lucene.Net.Highlight.SimpleFragmenter(100);
            // Rewrite expands wildcard/prefix queries so the scorer sees concrete terms.
            var scorer      = new Lucene.Net.Highlight.QueryScorer(searcher.Rewrite(query));
            var highlighter = new Lucene.Net.Highlight.Highlighter(formatter, scorer);

            highlighter.SetTextFragmenter(fragmenter);

            var alreadySeenIds = new HashSet<string>();
            var list = new List<SearchIndexModel>();

            // Collect at most 100 distinct documents, best hits first.
            for (int i = 0; i < hits.Length() && alreadySeenIds.Count < 100; i++)
            {
                Lucene.Net.Documents.Document doc = hits.Doc(i);
                string id = doc.Get("id");
                if (!alreadySeenIds.Add(id))
                {
                    continue; // duplicate document id
                }

                var model = new SearchIndexModel();
                model.Id      = id;
                model.Score   = hits.Score(i);
                model.Subject = doc.Get("subject");
                model.Type    = (EntryTypes)Enum.Parse(typeof(EntryTypes), doc.Get("type"));

                // Encode before highlighting so markup in the stored text is inert.
                string raw_text = HttpUtility.HtmlEncode(doc.Get("raw_text"));

                Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream("text",
                                                                              new System.IO.StringReader(
                                                                                  raw_text));
                string highlighted_text = highlighter.GetBestFragments(stream, raw_text, 3, "...").Replace("'",
                                                                                                           "''");

                if (highlighted_text == "") // sometimes the highlighter fails to emit text...
                {
                    highlighted_text = raw_text.Replace("'", "''");
                }
                if (highlighted_text.Length > 3000)
                {
                    highlighted_text = highlighted_text.Substring(0, 3000);
                }

                model.HighlightedText = highlighted_text;

                list.Add(model);
            }

            result.List         = list;
            result.SearchPhrase = searchText;
            if (list.Count == 0)
            {
                result.Message = string.Format("Įrašų pagal užklausą '{0}' nerasta. Patikslinkite paieškos duomenis.", searchText);
            }

            return(result);
        }
		public virtual void  TestMultiSearcher()
		{
			// Build two single-document RAM indexes, one holding "multiOne" and
			// one holding "multiTwo".
			RAMDirectory ramDir1 = new RAMDirectory();
			IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true);
			Document doc1 = new Document();
			doc1.Add(new Field(FIELD_NAME, "multiOne", Field.Store.YES, Field.Index.TOKENIZED));
			writer1.AddDocument(doc1);
			writer1.Optimize();
			writer1.Close();
			IndexReader reader1 = IndexReader.Open(ramDir1);

			RAMDirectory ramDir2 = new RAMDirectory();
			IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true);
			Document doc2 = new Document();
			doc2.Add(new Field(FIELD_NAME, "multiTwo", Field.Store.YES, Field.Index.TOKENIZED));
			writer2.AddDocument(doc2);
			writer2.Optimize();
			writer2.Close();
			IndexReader reader2 = IndexReader.Open(ramDir2);

			// Search both indexes through a MultiSearcher.
			IndexSearcher[] searchers = new IndexSearcher[]
			{
				new IndexSearcher(ramDir1),
				new IndexSearcher(ramDir2)
			};
			MultiSearcher multiSearcher = new MultiSearcher(searchers);
			QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer());
			parser.SetMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
			query = parser.Parse("multi*");
			System.Console.Out.WriteLine("Searching for: " + query.ToString(FIELD_NAME));
			// At this point the multisearcher calls combine(query[]).
			hits = multiSearcher.Search(query);

			// Rewrite against each reader and combine, so the wildcard terms
			// are expanded before highlighting.
			Query[] expandedQueries = new Query[]
			{
				query.Rewrite(reader1),
				query.Rewrite(reader2)
			};
			query = query.Combine(expandedQueries);

			// Highlight every hit; both documents should produce a highlight.
			Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
			for (int i = 0; i < hits.Length(); i++)
			{
				System.String text = hits.Doc(i).Get(FIELD_NAME);
				TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new System.IO.StringReader(text));
				System.String highlightedText = highlighter.GetBestFragment(tokenStream, text);
				System.Console.Out.WriteLine(highlightedText);
			}
			Assert.IsTrue(numHighlights == 2, "Failed to find correct number of highlights " + numHighlights + " found");
		}
		/// <summary>
		/// Verifies that a QueryScorer constructed with an explicit field name only
		/// highlights terms queried against that field, while a field-agnostic
		/// scorer highlights terms from every field in the query.
		/// </summary>
		public virtual void  TestFieldSpecificHighlighting()
		{
			System.String docMainText = "fred is one of the people";
			QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
			Query query = parser.Parse("fred category:people");
			
			//highlighting respects fieldnames used in query
			QueryScorer fieldSpecificScorer = new QueryScorer(query, "contents");
			Highlighter fieldSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(), fieldSpecificScorer);
			fieldSpecificHighlighter.SetTextFragmenter(new NullFragmenter());
			System.String result = fieldSpecificHighlighter.GetBestFragment(analyzer, FIELD_NAME, docMainText);
			// NUnit's Assert.AreEqual takes (expected, actual, message); the original
			// had them reversed, which inverts failure diagnostics.
			Assert.AreEqual("<B>fred</B> is one of the people", result, "Should match");
			
			//highlighting does not respect fieldnames used in query
			QueryScorer fieldInSpecificScorer = new QueryScorer(query);
			Highlighter fieldInSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(), fieldInSpecificScorer);
			fieldInSpecificHighlighter.SetTextFragmenter(new NullFragmenter());
			result = fieldInSpecificHighlighter.GetBestFragment(analyzer, FIELD_NAME, docMainText);
			Assert.AreEqual("<B>fred</B> is one of the <B>people</B>", result, "Should match");
			
			reader.Close();
		}
		/// <summary>
		/// Runs a "Kennedy" search and asks the highlighter for the single best
		/// fragment of each hit; the formatter callback (this) counts highlights,
		/// which must total 4.
		/// </summary>
		public virtual void  TestGetSimpleHighlight()
		{
			DoSearching("Kennedy");
			Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
			
			int hitCount = hits.Length();
			for (int hitIndex = 0; hitIndex < hitCount; hitIndex++)
			{
				System.String fieldText = hits.Doc(hitIndex).Get(FIELD_NAME);
				TokenStream stream = analyzer.TokenStream(FIELD_NAME, new System.IO.StringReader(fieldText));
				System.String bestFragment = highlighter.GetBestFragment(stream, fieldText);
				System.Console.Out.WriteLine("\t" + bestFragment);
			}
			Assert.IsTrue(numHighlights == 4, "Failed to find correct number of highlights " + numHighlights + " found");
		}
		/// <summary>
		/// Shared helper: highlights every current hit using 20-char fragments,
		/// printing at most 2 fragments per document joined by "...".
		/// </summary>
		internal virtual void  DoStandardHighlights()
		{
			Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
			highlighter.SetTextFragmenter(new SimpleFragmenter(20));
			
			int maxNumFragmentsRequired = 2;
			System.String fragmentSeparator = "...";
			int hitCount = hits.Length();
			for (int hitIndex = 0; hitIndex < hitCount; hitIndex++)
			{
				System.String fieldText = hits.Doc(hitIndex).Get(FIELD_NAME);
				TokenStream stream = analyzer.TokenStream(FIELD_NAME, new System.IO.StringReader(fieldText));
				System.String fragments = highlighter.GetBestFragments(stream, fieldText, maxNumFragmentsRequired, fragmentSeparator);
				System.Console.Out.WriteLine("\t" + fragments);
			}
		}
		/// <summary>
		/// Checks that GetBestFragments (string form) and GetBestTextFragments
		/// (TextFragment form) agree: same number of fragments and identical text
		/// for every hit of a "Kennedy" search.
		/// </summary>
		public virtual void  TestGetTextFragments()
		{
			DoSearching("Kennedy");
			Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
			highlighter.SetTextFragmenter(new SimpleFragmenter(20));
			
			for (int i = 0; i < hits.Length(); i++)
			{
				System.String text = hits.Doc(i).Get(FIELD_NAME);
				TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new System.IO.StringReader(text));
				
				System.String[] stringResults = highlighter.GetBestFragments(tokenStream, text, 10);
				
				// TokenStreams are single-use: build a fresh one for the second pass.
				tokenStream = analyzer.TokenStream(FIELD_NAME, new System.IO.StringReader(text));
				TextFragment[] fragmentResults = highlighter.GetBestTextFragments(tokenStream, text, true, 10);
				
				// Assert.AreEqual reports both values on failure, unlike the original
				// hand-rolled Assert.IsTrue(a == b, ...) / .Equals(...) checks.
				Assert.AreEqual(stringResults.Length, fragmentResults.Length, "Failed to find correct number of text Fragments: " + fragmentResults.Length + " vs " + stringResults.Length);
				for (int j = 0; j < stringResults.Length; j++)
				{
					System.Console.Out.WriteLine(fragmentResults[j]);
					Assert.AreEqual(stringResults[j], fragmentResults[j].ToString(), "Failed to find same text Fragments: " + fragmentResults[j] + " found");
				}
			}
		}