// Loads the forward index and the word dictionary through MongodbAccess, then,
// for every word, prepares an InvertedIndexItem and stores an IDF value
// computed as log10(number of indexed pages / inverteditem.webpage_ids.Count).
// NOTE(review): this listing has no braces and some statements appear to be
// missing (see the notes below) - confirm against the full file.
static void CreateInvertedIndexAndIDF()
            MongodbAccess mongo = new MongodbAccess();
            Dictionary<ObjectId, Dictionary<string, int>> forward_index = mongo.GetForwardIndex();
            Util.log("{0} items in forward index.", forward_index.Count);
            HashSet<string> words = mongo.GetWordDict();
            Util.log("{0} items in word dict.", words.Count);

            // One inverted-index entry per word in the dictionary.
            foreach (string w in words)
                InvertedIndexItem inverteditem = new InvertedIndexItem();
                inverteditem.word = w;
                inverteditem.webpage_ids = new BsonArray();
                // Check every page of the forward index for the current word.
                foreach (ObjectId webpageid in forward_index.Keys)
                    // NOTE(review): no body is visible for this `if`; presumably it adds
                    // webpageid to inverteditem.webpage_ids - confirm. Nothing visible
                    // stores inverteditem either.
                    if (forward_index[webpageid].ContainsKey(w))
                // The "* 1.0" forces floating-point division before the logarithm.
                // NOTE(review): if webpage_ids.Count is 0 the quotient is a floating-point
                // division by zero (Infinity, assuming Count is an int), not an exception.
                double idf = Math.Log10(forward_index.Count * 1.0 / inverteditem.webpage_ids.Count);
                mongo.SetWordIDF(w, idf);
 // For every crawled URL, loads the stored page, segments its content with
 // ContetnWordSegment and fills a ForwardIndexItem with one BSON element per
 // accepted word; finally logs how many distinct words were seen.
 // NOTE(review): nothing visible persists forwarditem or the collected word
 // set - the listing looks truncated, confirm against the full file.
 static void CreateForwordIndex()
     MongodbAccess mongo = new MongodbAccess();
     // Distinct words accepted across all pages.
     HashSet<string> words = new HashSet<string>();
     HashSet<string> crawled_urls = mongo.GetCrawledURLs();
     foreach (string url in crawled_urls)
         WebPage page = mongo.GetWebPageByURL(url);
         // Skip URLs for which no page is returned.
         if (page == null) continue;
         // Maps each word to an int (presumably its occurrence count - confirm).
         Dictionary<string, int> dict = ContetnWordSegment(page.content);
         ForwardIndexItem forwarditem = new ForwardIndexItem();
         forwarditem.webpage_id = page._id;
         forwarditem.words = new MongoDB.Bson.BsonDocument();
         foreach (string word in dict.Keys)
             // Words rejected by _check are left out of the index.
             if (!_check(word)) continue;
             forwarditem.words.Add(new MongoDB.Bson.BsonElement(word, dict[word]));
             // HashSet.Add already ignores duplicates, so the Contains test is redundant.
             if (!words.Contains(word)) words.Add(word);
     Util.log("total {0} words.", words.Count);
        // Crawler entry point: loads the set of already-crawled URLs, asks
        // GetWaitingURLs for the URLs still to visit, fetches each one with
        // GetWebPage and logs its completion.
        // NOTE(review): braces and several statement bodies are not visible in this
        // listing - confirm the details against the full file.
        static void Main(string[] args)
            MongodbAccess    mongo        = new MongodbAccess();
            HashSet <string> crawled_urls = mongo.GetCrawledURLs();

            Util.log("{0} urls have been crawled.", crawled_urls.Count);

            // Fixed seed: the pseudo-random sequence is the same on every run.
            Random r = new Random(99);

            HashSet <string> urls = GetWaitingURLs(crawled_urls);

            foreach (string url in urls)
                WebPage page = GetWebPage(url);
                // NOTE(review): no body is visible for this null check.
                if (page == null)
                Util.log("crawl {0} done.", url);
                // r.Next(100) yields 0..99, so this condition holds for 81..99
                // (19 of 100 values). NOTE(review): its body is not visible.
                if (r.Next(100) > 80)

             // NOTE(review): the lines below look like the tail of a commented-out
             // block whose opening delimiter is not visible in this listing.
             * HashSet<string> urls = GetWaitingURLs(null);
             * Console.WriteLine(urls.Count);
             * foreach (string url in urls)
             * {
             *  WebPage page = GetWebPage(url);
             *  if (page == null) continue;
             *  Console.WriteLine("{0} {1}", page.title, page.published_time);
             * }
             * */
 // Builds one DocUrlAbstractResult (title, url, abstract) for each document id
 // by loading the page through MongodbAccess.GetWebPageById.
 // docIds: ids of the pages to describe, processed in the given order.
 // Returns: the list `ret`.
 // NOTE(review): no visible statement adds duar to ret, so as shown the list
 // stays empty - a line is probably missing from this listing; confirm.
 static List<DocUrlAbstractResult> GetResult(List<ObjectId> docIds)
     List<DocUrlAbstractResult> ret = new List<DocUrlAbstractResult>();
     MongodbAccess mongo = new MongodbAccess();
     foreach (ObjectId docId in docIds)
         // NOTE(review): page is dereferenced without a null check.
         WebPage page = mongo.GetWebPageById(docId);
         DocUrlAbstractResult duar = new DocUrlAbstractResult();
         duar.title = page.title;
         duar.url = page.url;
         // GetAbstract looks the page up again by id using its own MongodbAccess.
         duar.abst = GetAbstract(docId);
     return ret;
 /// <summary>
 /// Builds a short abstract for a stored page: up to 150 characters of the
 /// page content, starting at the offset equal to the length of the title.
 /// </summary>
 /// <param name="docId">Id handed to <c>MongodbAccess.GetWebPageById</c>.</param>
 /// <returns>
 /// The excerpt, or an empty string when the page, its title or its content
 /// is missing, or when the content is not longer than the title.
 /// </returns>
 static string GetAbstract(ObjectId docId)
 {
     const int MaxAbstractLength = 150;

     MongodbAccess mongo = new MongodbAccess();
     WebPage page = mongo.GetWebPageById(docId);

     // A lookup can come back empty (other callers in this file test the page
     // for null), so return an empty abstract instead of throwing.
     if (page == null || page.title == null || page.content == null)
     {
         return "";
     }

     // The excerpt skips the first title.Length characters of the content.
     // NOTE(review): presumably the content starts with the title - confirm.
     int start = page.title.Length;
     int remaining = page.content.Length - start;
     if (remaining <= 0)
     {
         return "";
     }

     return page.content.Substring(start, Math.Min(remaining, MaxAbstractLength));
 }
 // Loads the forward index and the word IDF table through MongodbAccess and
 // assigns them to forward_index and word_idf, which are declared outside this
 // method (not visible in this listing).
 static void ForwardIndexAndWordIDFInit()
     MongodbAccess mongo = new MongodbAccess();
     forward_index = mongo.GetForwardIndex();
     word_idf = mongo.GetWordIDF();
 // Runs a search: segments the query strings into words, asks MongodbAccess for
 // the matching document ids, orders them with SortResult, turns them into
 // DocUrlAbstractResult entries and renders those as an HTML ordered list.
 // query: the raw query strings; each one is segmented separately.
 // Returns: "<ol>...</ol>" around the rendered entries, or around a
 // "no match" message when nothing was rendered.
 // NOTE(review): braces and some statements are not visible in this listing -
 // see the notes below and confirm against the full file.
 static string ProcessQuery(string[] query)
     /* Segment the query into words */
     List<string> query_words = new List<string>();
     Segment seg = new Segment();
     foreach (string q in query)
         ICollection<WordInfo> words = seg.DoSegment(q);
         foreach (WordInfo wi in words)
             // NOTE(review): no body is visible for this `if`; presumably it adds
             // wi.Word to query_words - confirm.
             if (!query_words.Contains(wi.Word))
     /* Retrieve the docIds that match the query */
     MongodbAccess mongo = new MongodbAccess();
     List<ObjectId> docIds = mongo.GetDocIDByQuery(query_words);
     docIds = SortResult(docIds, query_words);
     List<DocUrlAbstractResult> result = GetResult(docIds);
     // NOTE(review): the next three lines look like the tail of a commented-out
     // console dump whose opening delimiter is not visible.
     foreach (DocUrlAbstractResult duar in result)
         Console.WriteLine("{0}\n\t{1}", duar.title, duar.url);
      * */
     StringBuilder strbuilder = new StringBuilder();
     // One <li> per result; placeholders are {0}=title, {1}=abstract, {2}=url.
     // NOTE(review): the call these two argument lines belong to is not visible.
     // NOTE(review): the values are inserted into the HTML with no visible escaping,
     // and the anchor is closed with "<a>" instead of a proper closing tag.
     foreach (DocUrlAbstractResult duar in result)
             "<li><div><span><a href='{2}' target='_blank' class='link'>{0}<a></span><br/><span class='abstract'>{1}</span><br/><span class='url'>{2}</span></div></li>",
             duar.title, duar.abst, duar.url));
     // Nothing rendered: emit a message instead (note the "mathch" typo in the text).
     if (strbuilder.Length == 0)
         strbuilder.Append("No pages mathch the query.");
     return "<ol>" + strbuilder.ToString() + "</ol>";
        // Crawler entry point: loads the set of already-crawled URLs, asks
        // GetWaitingURLs for the URLs still to visit, fetches each one with
        // GetWebPage and logs its completion.
        // NOTE(review): a method with the same name and signature appears earlier in
        // this listing, and braces and some statement bodies are not visible here.
        static void Main(string[] args)
            MongodbAccess mongo = new MongodbAccess();
            HashSet<string> crawled_urls = mongo.GetCrawledURLs();
            Util.log("{0} urls have been crawled.", crawled_urls.Count);

            // Fixed seed: the pseudo-random sequence is the same on every run.
            Random r = new Random(99);

            HashSet<string> urls = GetWaitingURLs(crawled_urls);
            foreach (string url in urls)
                WebPage page = GetWebPage(url);
                // Skip URLs for which GetWebPage returns nothing.
                if (page == null) continue;
                Util.log("crawl {0} done.", url);
                // r.Next(100) yields 0..99, so this condition holds for 81..99
                // (19 of 100 values). NOTE(review): its body is not visible.
                if (r.Next(100) > 80)

            // NOTE(review): the lines below look like the tail of a commented-out
            // block whose opening delimiter is not visible; as shown they would
            // declare `urls` a second time.
            HashSet<string> urls = GetWaitingURLs(null);
            foreach (string url in urls)
                WebPage page = GetWebPage(url);
                if (page == null) continue;
                Console.WriteLine("{0} {1}", page.title, page.published_time);
             * */