Beispiel #1
0
        /// <summary>
        /// Builds the inverted index (word -> ids of the pages whose forward-index entry
        /// contains that word) and stores an IDF value for every word in the word dictionary.
        /// </summary>
        /// <remarks>
        /// IDF is computed as log10(total pages / pages containing the word). A word that
        /// occurs in no page is stored with an IDF of 0 rather than the Infinity (or NaN,
        /// when the forward index is empty) that the raw division would produce.
        /// </remarks>
        static void CreateInvertedIndexAndIDF()
        {
            MongodbAccess mongo = new MongodbAccess();
            Dictionary<ObjectId, Dictionary<string, int>> forward_index = mongo.GetForwardIndex();
            Util.log("{0} items in forward index.", forward_index.Count);
            HashSet<string> words = mongo.GetWordDict();
            Util.log("{0} items in word dict.", words.Count);

            foreach (string w in words)
            {
                InvertedIndexItem inverteditem = new InvertedIndexItem();
                inverteditem.word = w;
                inverteditem.webpage_ids = new BsonArray();

                // Enumerate the entries directly so each page costs one lookup (the word
                // test) instead of two (key -> value, then the word test).
                foreach (KeyValuePair<ObjectId, Dictionary<string, int>> entry in forward_index)
                {
                    if (entry.Value.ContainsKey(w))
                    {
                        inverteditem.webpage_ids.Add(entry.Key);
                    }
                }
                mongo.InsertInvertedIndexItem(inverteditem);

                // Guard the division: a zero document frequency must not leak
                // Infinity/NaN into the stored IDF values.
                int documentFrequency = inverteditem.webpage_ids.Count;
                double idf = documentFrequency > 0
                    ? Math.Log10((double)forward_index.Count / documentFrequency)
                    : 0.0;
                mongo.SetWordIDF(w, idf);
            }
        }
Beispiel #2
0
 /// <summary>
 /// Builds the forward index: for every crawled URL whose page can be loaded, segments
 /// the page content into word counts, stores the accepted words as a forward-index
 /// item, and finally saves the set of all accepted words as the word dictionary.
 /// </summary>
 static void CreateForwordIndex()
 {
     PanGu.Segment.Init();
     MongodbAccess mongo = new MongodbAccess();
     HashSet<string> vocabulary = new HashSet<string>();

     foreach (string url in mongo.GetCrawledURLs())
     {
         WebPage page = mongo.GetWebPageByURL(url);
         if (page != null)
         {
             Dictionary<string, int> wordCounts = ContetnWordSegment(page.content);
             ForwardIndexItem item = new ForwardIndexItem
             {
                 webpage_id = page._id,
                 words = new MongoDB.Bson.BsonDocument()
             };

             foreach (KeyValuePair<string, int> wordCount in wordCounts)
             {
                 // Only words accepted by _check are indexed and added to the dictionary.
                 if (_check(wordCount.Key))
                 {
                     item.words.Add(new MongoDB.Bson.BsonElement(wordCount.Key, wordCount.Value));
                     vocabulary.Add(wordCount.Key);
                 }
             }

             mongo.InsertForwardIndexItem(item);
         }
     }

     Util.log("total {0} words.", vocabulary.Count);
     mongo.SaveWordDict(vocabulary);
 }
Beispiel #3
0
        /// <summary>
        /// Crawler entry point: fetches every URL returned by GetWaitingURLs and stores
        /// each page that could be retrieved.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        /// <remarks>
        /// For a dry run, call GetWaitingURLs(null) and print page.title and
        /// page.published_time for each fetched page instead of inserting it.
        /// </remarks>
        static void Main(string[] args)
        {
            MongodbAccess store = new MongodbAccess();
            HashSet<string> alreadyCrawled = store.GetCrawledURLs();
            Util.log("{0} urls have been crawled.", alreadyCrawled.Count);

            // Fixed seed, so the sequence of pauses is the same on every run.
            Random throttle = new Random(99);

            foreach (string pendingUrl in GetWaitingURLs(alreadyCrawled))
            {
                WebPage fetched = GetWebPage(pendingUrl);
                if (fetched != null)
                {
                    store.InsertWebPage(fetched);
                    Util.log("crawl {0} done.", pendingUrl);

                    // Pause for half a second after roughly one in five stored pages.
                    bool pauseNow = throttle.Next(100) > 80;
                    if (pauseNow)
                    {
                        System.Threading.Thread.Sleep(500);
                    }
                }
            }
        }
Beispiel #4
0
 /// <summary>
 /// Turns a list of document ids into displayable results (title, url, abstract),
 /// preserving the order of <paramref name="docIds"/>.
 /// </summary>
 /// <param name="docIds">Ids of the pages to present; may be null or empty.</param>
 /// <returns>
 /// One result per id whose page could be loaded. Ids that no longer resolve to a
 /// page are skipped. Never null.
 /// </returns>
 static List<DocUrlAbstractResult> GetResult(List<ObjectId> docIds)
 {
     List<DocUrlAbstractResult> ret = new List<DocUrlAbstractResult>();
     if (docIds == null)
     {
         return ret;
     }

     MongodbAccess mongo = new MongodbAccess();
     foreach (ObjectId docId in docIds)
     {
         WebPage page = mongo.GetWebPageById(docId);
         if (page == null)
         {
             // A stale id must not bring down the whole result list.
             continue;
         }

         DocUrlAbstractResult duar = new DocUrlAbstractResult();
         duar.title = page.title;
         duar.url = page.url;
         duar.abst = GetAbstract(docId);
         ret.Add(duar);
     }
     return ret;
 }
Beispiel #5
0
 /// <summary>
 /// Returns a short abstract for a page: up to 150 characters of the page content,
 /// starting at the offset equal to the title's length.
 /// </summary>
 /// <param name="docId">Id of the page to summarise.</param>
 /// <returns>
 /// The abstract, or an empty string when the page or its content is missing or the
 /// content is not longer than the title.
 /// </returns>
 static string GetAbstract(ObjectId docId)
 {
     const int MaxAbstractLength = 150;

     MongodbAccess mongo = new MongodbAccess();
     WebPage page = mongo.GetWebPageById(docId);
     if (page == null || page.content == null)
     {
         return "";
     }

     // Skip as many characters as the title has; a missing title skips nothing.
     // NOTE(review): presumably the stored content begins with the title - confirm.
     int start = page.title == null ? 0 : page.title.Length;
     int remaining = page.content.Length - start;
     if (remaining <= 0)
     {
         return "";
     }

     int length = remaining > MaxAbstractLength ? MaxAbstractLength : remaining;
     return page.content.Substring(start, length);
 }
Beispiel #6
0
 /// <summary>
 /// Loads the forward index and the word IDF values through a new MongodbAccess and
 /// assigns them to forward_index and word_idf, which are declared outside this method.
 /// </summary>
 static void ForwardIndexAndWordIDFInit()
 {
     MongodbAccess store = new MongodbAccess();

     forward_index = store.GetForwardIndex();
     word_idf = store.GetWordIDF();
 }
Beispiel #7
0
 /// <summary>
 /// Runs a search: segments the query strings into words, looks up and ranks the
 /// matching documents, and renders them as an HTML ordered list.
 /// </summary>
 /// <param name="query">Raw query strings; null or empty entries are ignored.</param>
 /// <returns>
 /// An <c>&lt;ol&gt;</c> fragment with one <c>&lt;li&gt;</c> per result, or containing a
 /// "no match" message when nothing was found.
 /// </returns>
 static string ProcessQuery(string[] query)
 {
     /* Segment the query into words, keeping first-seen order without duplicates. */
     List<string> query_words = new List<string>();
     HashSet<string> seen_words = new HashSet<string>();
     PanGu.Segment.Init();
     Segment seg = new Segment();
     if (query != null)
     {
         foreach (string q in query)
         {
             if (string.IsNullOrEmpty(q))
             {
                 continue;
             }
             ICollection<WordInfo> words = seg.DoSegment(q);
             foreach (WordInfo wi in words)
             {
                 // HashSet.Add returns false for a word that was already seen.
                 if (seen_words.Add(wi.Word))
                 {
                     query_words.Add(wi.Word);
                 }
             }
         }
     }

     /* Retrieve the ids of the documents that match the query, then rank them. */
     MongodbAccess mongo = new MongodbAccess();
     List<ObjectId> docIds = mongo.GetDocIDByQuery(query_words);
     docIds = SortResult(docIds, query_words);
     List<DocUrlAbstractResult> result = GetResult(docIds);

     StringBuilder strbuilder = new StringBuilder();
     foreach (DocUrlAbstractResult duar in result)
     {
         // Title, abstract and url come from crawled pages, so they are HTML-encoded
         // before being placed into markup (this also encodes the single quote that
         // delimits the href attribute).
         string title = System.Net.WebUtility.HtmlEncode(duar.title);
         string abst = System.Net.WebUtility.HtmlEncode(duar.abst);
         string url = System.Net.WebUtility.HtmlEncode(duar.url);

         strbuilder.AppendFormat(
             "<li><div><span><a href='{2}' target='_blank' class='link'>{0}</a></span><br/><span class='abstract'>{1}</span><br/><span class='url'>{2}</span></div></li>",
             title, abst, url);
     }
     if (strbuilder.Length == 0)
     {
         strbuilder.Append("No pages match the query.");
     }
     return "<ol>" + strbuilder.ToString() + "</ol>";
 }
Beispiel #8
0
        /// <summary>
        /// Crawler entry point: downloads each URL reported by GetWaitingURLs and saves
        /// every page that was fetched successfully.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        /// <remarks>
        /// To inspect pages without storing them, call GetWaitingURLs(null) and print
        /// page.title and page.published_time for each fetched page.
        /// </remarks>
        static void Main(string[] args)
        {
            MongodbAccess db = new MongodbAccess();
            HashSet<string> knownUrls = db.GetCrawledURLs();
            Util.log("{0} urls have been crawled.", knownUrls.Count);

            // Seeded with a constant, so the pauses fall in the same places on every run.
            Random dice = new Random(99);

            HashSet<string> queue = GetWaitingURLs(knownUrls);
            foreach (string target in queue)
            {
                WebPage downloaded = GetWebPage(target);
                if (downloaded != null)
                {
                    db.InsertWebPage(downloaded);
                    Util.log("crawl {0} done.", target);

                    // Occasionally wait 500 ms before moving on to the next URL.
                    if (dice.Next(100) > 80)
                    {
                        System.Threading.Thread.Sleep(500);
                    }
                }
            }
        }