Example #1
0
        /// <summary>
        /// Builds the forward index. For every crawled URL whose page can be loaded,
        /// the page content is segmented via <c>ContetnWordSegment</c>, the words that
        /// pass <c>_check</c> are stored (with their integer values, presumably
        /// occurrence counts) as one forward-index item, and the union of all accepted
        /// words is saved as the word dictionary at the end.
        /// </summary>
        static void CreateForwordIndex()
        {
            PanGu.Segment.Init();
            MongodbAccess mongo = new MongodbAccess();
            HashSet<string> vocabulary = new HashSet<string>();

            foreach (string url in mongo.GetCrawledURLs())
            {
                WebPage page = mongo.GetWebPageByURL(url);
                if (page != null)
                {
                    ForwardIndexItem item = new ForwardIndexItem();
                    Dictionary<string, int> segmented = ContetnWordSegment(page.content);

                    item.webpage_id = page._id;
                    item.words = new MongoDB.Bson.BsonDocument();

                    foreach (KeyValuePair<string, int> entry in segmented)
                    {
                        // Only words accepted by _check are indexed.
                        if (_check(entry.Key))
                        {
                            item.words.Add(new MongoDB.Bson.BsonElement(entry.Key, entry.Value));
                            // HashSet.Add ignores words that are already present.
                            vocabulary.Add(entry.Key);
                        }
                    }

                    mongo.InsertForwardIndexItem(item);
                }
            }

            Util.log("total {0} words.", vocabulary.Count);
            mongo.SaveWordDict(vocabulary);
        }
Example #2
0
        /// <summary>
        /// Builds the inverted index and the per-word IDF from the stored forward index.
        /// For every word in the word dictionary, collects the ids of all pages whose
        /// forward-index entry contains the word, stores that list as an inverted-index
        /// item, and stores IDF = log10(total pages / pages containing the word).
        /// </summary>
        /// <remarks>
        /// A word that occurs in no page gets an IDF of 0 instead of the Infinity/NaN
        /// that dividing by a zero document frequency would produce.
        /// </remarks>
        static void CreateInvertedIndexAndIDF()
        {
            MongodbAccess mongo = new MongodbAccess();
            Dictionary <ObjectId, Dictionary <string, int> > forward_index = mongo.GetForwardIndex();

            Util.log("{0} items in forward index.", forward_index.Count);
            HashSet <string> words = mongo.GetWordDict();

            Util.log("{0} items in word dict.", words.Count);

            foreach (string w in words)
            {
                InvertedIndexItem inverteditem = new InvertedIndexItem();
                inverteditem.word        = w;
                inverteditem.webpage_ids = new BsonArray();

                // Iterate key/value pairs so each page's word map is looked up only once.
                foreach (KeyValuePair <ObjectId, Dictionary <string, int> > entry in forward_index)
                {
                    if (entry.Value.ContainsKey(w))
                    {
                        inverteditem.webpage_ids.Add(entry.Key);
                    }
                }
                mongo.InsertInvertedIndexItem(inverteditem);

                // Guard the division: a dictionary word that no page contains would
                // otherwise yield log10(N / 0) = Infinity (or NaN when N is also 0).
                int    docFrequency = inverteditem.webpage_ids.Count;
                double idf          = docFrequency > 0
                                      ? Math.Log10(forward_index.Count * 1.0 / docFrequency)
                                      : 0.0;
                mongo.SetWordIDF(w, idf);
            }
        }
Example #3
0
        /// <summary>
        /// Loads the forward index and the word IDF table from MongoDB into the
        /// <c>forward_index</c> and <c>word_idf</c> members declared outside this method.
        /// </summary>
        static void ForwardIndexAndWordIDFInit()
        {
            MongodbAccess store = new MongodbAccess();

            // Forward index first, then the IDF table (same order as before).
            forward_index = store.GetForwardIndex();
            word_idf = store.GetWordIDF();
        }
Example #4
0
        /// <summary>
        /// Returns a short abstract for the page with the given id: up to 150 characters
        /// of the page content, starting right after the first <c>title.Length</c>
        /// characters (presumably because the stored content begins with the title —
        /// confirm against the code that writes <c>content</c>).
        /// </summary>
        /// <param name="docId">Id of the web page to summarize.</param>
        /// <returns>
        /// The abstract, or an empty string when the page cannot be loaded, has no
        /// content, or its content is not longer than its title.
        /// </returns>
        static string GetAbstract(ObjectId docId)
        {
            const int MaxAbstractLength = 150;

            MongodbAccess mongo = new MongodbAccess();
            WebPage       page  = mongo.GetWebPageById(docId);

            // A missing page or missing content used to raise NullReferenceException.
            if (page == null || page.content == null)
            {
                return "";
            }

            // A null title is treated as an empty one, so the abstract starts at 0.
            int start = page.title == null ? 0 : page.title.Length;
            if (start >= page.content.Length)
            {
                return "";
            }

            int remaining = page.content.Length - start;
            int length    = remaining > MaxAbstractLength ? MaxAbstractLength : remaining;
            return page.content.Substring(start, length);
        }
Example #5
0
 /// <summary>
 /// For every crawled URL whose page can be loaded, runs <c>TextExtract</c> over the
 /// page HTML and stores the page title concatenated with the extracted text as the
 /// page content, logging a running count along the way.
 /// </summary>
 static void Main(string[] args)
 {
     MongodbAccess db = new MongodbAccess();
     int processed = 0;

     foreach (string url in db.GetCrawledURLs())
     {
         WebPage page = db.GetWebPageByURL(url);
         if (page != null)
         {
             TextExtract extractor = new TextExtract(page.html, true);
             string content = page.title + extractor.content;

             db.SetWebPageContent(url, content);
             processed++;
             Util.log("{0} {1}", processed, url);
         }
     }

     Util.log("processed {0} urls.", processed);
 }
Example #6
0
        /// <summary>
        /// Loads the page for each document id and builds its display record
        /// (title, URL and abstract), preserving the order of <paramref name="docIds"/>.
        /// </summary>
        /// <param name="docIds">Ids of the pages to present, in display order.</param>
        /// <returns>
        /// One <c>DocUrlAbstractResult</c> per id whose page could be loaded; ids for
        /// which <c>GetWebPageById</c> returns null are skipped.
        /// </returns>
        static List <DocUrlAbstractResult> GetResult(List <ObjectId> docIds)
        {
            List <DocUrlAbstractResult> ret = new List <DocUrlAbstractResult>();
            MongodbAccess mongo             = new MongodbAccess();

            foreach (ObjectId docId in docIds)
            {
                WebPage page = mongo.GetWebPageById(docId);
                if (page == null)
                {
                    // The id has no loadable page (e.g. a stale index entry); skip it
                    // instead of failing the whole result list with a null dereference.
                    continue;
                }

                DocUrlAbstractResult duar = new DocUrlAbstractResult();
                duar.title = page.title;
                duar.url   = page.url;
                duar.abst  = GetAbstract(docId);
                ret.Add(duar);
            }
            return ret;
        }
Example #7
0
        /// <summary>
        /// Runs a search: segments each query string into words with PanGu, looks up
        /// the matching document ids, orders them with <c>SortResult</c>, and renders
        /// the results as an HTML ordered list.
        /// </summary>
        /// <param name="query">
        /// Raw query strings. A null array and null/empty entries are ignored.
        /// </param>
        /// <returns>
        /// An <c>&lt;ol&gt;</c> fragment with one <c>&lt;li&gt;</c> per result, or a
        /// "no match" message inside the list when nothing was found. Title, abstract
        /// and URL are HTML-encoded before being inserted into the markup.
        /// </returns>
        static string ProcessQuery(string[] query)
        {
            /* Segment the query into distinct words, keeping first-seen order. */
            List <string>    query_words = new List <string>();
            HashSet <string> seen_words  = new HashSet <string>();

            PanGu.Segment.Init();
            Segment seg = new Segment();

            if (query != null)
            {
                foreach (string q in query)
                {
                    if (string.IsNullOrEmpty(q))
                    {
                        continue;
                    }
                    ICollection <WordInfo> words = seg.DoSegment(q);
                    foreach (WordInfo wi in words)
                    {
                        // HashSet.Add returns false for duplicates, replacing the
                        // linear List.Contains scan used previously.
                        if (seen_words.Add(wi.Word))
                        {
                            query_words.Add(wi.Word);
                        }
                    }
                }
            }

            /* Retrieve the ids of the documents matching the query words. */
            MongodbAccess   mongo  = new MongodbAccess();
            List <ObjectId> docIds = mongo.GetDocIDByQuery(query_words);

            docIds = SortResult(docIds, query_words);
            List <DocUrlAbstractResult> result = GetResult(docIds);

            StringBuilder strbuilder = new StringBuilder();

            foreach (DocUrlAbstractResult duar in result)
            {
                // The values come from crawled pages, so encode them before placing
                // them in markup (HtmlEncode also escapes the single quote that
                // delimits the href attribute). The anchor is now closed with </a>.
                strbuilder.AppendFormat(
                    "<li><div><span><a href='{2}' target='_blank' class='link'>{0}</a></span><br/><span class='abstract'>{1}</span><br/><span class='url'>{2}</span></div></li>",
                    System.Net.WebUtility.HtmlEncode(duar.title),
                    System.Net.WebUtility.HtmlEncode(duar.abst),
                    System.Net.WebUtility.HtmlEncode(duar.url));
            }
            if (strbuilder.Length == 0)
            {
                strbuilder.Append("No pages match the query.");
            }
            return "<ol>" + strbuilder.ToString() + "</ol>";
        }