/// <summary>
/// Builds the inverted index from the stored forward index and records an IDF
/// value for every word of the stored word dictionary.
/// </summary>
/// <remarks>
/// For each word, the ids of all forward-index entries whose word table contains
/// the word are collected into one inverted-index item, which is then inserted.
/// The IDF passed to <c>SetWordIDF</c> is
/// log10(number of forward-index entries / number of entries containing the word).
/// </remarks>
static void CreateInvertedIndexAndIDF()
{
    MongodbAccess db = new MongodbAccess();

    Dictionary<ObjectId, Dictionary<string, int>> pageWordCounts = db.GetForwardIndex();
    Util.log("{0} items in forward index.", pageWordCounts.Count);

    HashSet<string> vocabulary = db.GetWordDict();
    Util.log("{0} items in word dict.", vocabulary.Count);

    foreach (string term in vocabulary)
    {
        // Gather the id of every page whose word table mentions this term.
        BsonArray containingPages = new BsonArray();
        foreach (KeyValuePair<ObjectId, Dictionary<string, int>> entry in pageWordCounts)
        {
            if (entry.Value.ContainsKey(term))
            {
                containingPages.Add(entry.Key);
            }
        }

        InvertedIndexItem posting = new InvertedIndexItem();
        posting.word = term;
        posting.webpage_ids = containingPages;
        db.InsertInvertedIndexItem(posting);

        // NOTE(review): when no page contains the term the divisor is zero, so the
        // quotient is not a finite number and is handed to SetWordIDF unchanged.
        double documentFrequency = containingPages.Count;
        double idf = Math.Log10(pageWordCounts.Count / documentFrequency);
        db.SetWordIDF(term, idf);
    }
}
/// <summary>
/// Builds one forward-index item per crawled page and saves the set of all
/// accepted words as the word dictionary.
/// </summary>
/// <remarks>
/// Each crawled URL is loaded; URLs for which no page is returned are skipped.
/// The page content is passed to <c>ContetnWordSegment</c>, and every resulting
/// word that passes <c>_check</c> is written, together with its count, into the
/// item's word document and added to the dictionary.
/// </remarks>
static void CreateForwordIndex()
{
    PanGu.Segment.Init();
    MongodbAccess db = new MongodbAccess();
    HashSet<string> vocabulary = new HashSet<string>();

    foreach (string url in db.GetCrawledURLs())
    {
        WebPage page = db.GetWebPageByURL(url);
        if (page == null)
        {
            continue;
        }

        Dictionary<string, int> wordCounts = ContetnWordSegment(page.content);

        ForwardIndexItem entry = new ForwardIndexItem();
        entry.webpage_id = page._id;
        entry.words = new MongoDB.Bson.BsonDocument();

        foreach (KeyValuePair<string, int> wordCount in wordCounts)
        {
            if (_check(wordCount.Key))
            {
                entry.words.Add(new MongoDB.Bson.BsonElement(wordCount.Key, wordCount.Value));

                // HashSet.Add ignores words that are already present.
                vocabulary.Add(wordCount.Key);
            }
        }

        db.InsertForwardIndexItem(entry);
    }

    Util.log("total {0} words.", vocabulary.Count);
    db.SaveWordDict(vocabulary);
}
/// <summary>
/// Crawler entry point: fetches every URL returned by <c>GetWaitingURLs</c> and
/// stores each page that could be retrieved.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    MongodbAccess db = new MongodbAccess();

    HashSet<string> alreadyCrawled = db.GetCrawledURLs();
    Util.log("{0} urls have been crawled.", alreadyCrawled.Count);

    // Fixed seed, so the sequence of pauses is the same on every run.
    Random throttle = new Random(99);

    foreach (string url in GetWaitingURLs(alreadyCrawled))
    {
        WebPage page = GetWebPage(url);
        if (page != null)
        {
            db.InsertWebPage(page);
            Util.log("crawl {0} done.", url);

            // Pause for half a second whenever the draw from [0, 100) is above 80.
            if (throttle.Next(100) > 80)
            {
                System.Threading.Thread.Sleep(500);
            }
        }
    }

    // Disabled debugging aid: list the title and publication time of every waiting page.
    //
    // HashSet<string> urls = GetWaitingURLs(null);
    // Console.WriteLine(urls.Count);
    // foreach (string url in urls)
    // {
    //     WebPage page = GetWebPage(url);
    //     if (page == null) continue;
    //     Console.WriteLine("{0} {1}", page.title, page.published_time);
    // }
}
/// <summary>
/// Converts a list of document ids into displayable results (title, URL and abstract).
/// </summary>
/// <param name="docIds">Ids of the documents to present, in display order. May be null.</param>
/// <returns>
/// One result per id for which a stored page was found, in the order of
/// <paramref name="docIds"/>; an empty list when <paramref name="docIds"/> is null.
/// </returns>
static List<DocUrlAbstractResult> GetResult(List<ObjectId> docIds)
{
    List<DocUrlAbstractResult> ret = new List<DocUrlAbstractResult>();
    if (docIds == null)
    {
        return ret;
    }

    MongodbAccess mongo = new MongodbAccess();
    foreach (ObjectId docId in docIds)
    {
        WebPage page = mongo.GetWebPageById(docId);
        if (page == null)
        {
            // The index may reference a page that is no longer stored; skip it
            // instead of failing the whole result list with a null dereference.
            continue;
        }

        DocUrlAbstractResult duar = new DocUrlAbstractResult();
        duar.title = page.title;
        duar.url = page.url;
        duar.abst = GetAbstract(docId);
        ret.Add(duar);
    }

    return ret;
}
/// <summary>
/// Returns a short excerpt of a stored page to show under its title.
/// </summary>
/// <param name="docId">Id of the page to summarise.</param>
/// <returns>
/// Up to 150 characters of the page content, starting at the offset equal to the
/// title's length; an empty string when the page or its content is missing, or
/// when the content is not longer than the title.
/// </returns>
static string GetAbstract(ObjectId docId)
{
    const int MaxAbstractLength = 150;

    MongodbAccess mongo = new MongodbAccess();
    WebPage page = mongo.GetWebPageById(docId);
    if (page == null || page.content == null)
    {
        return "";
    }

    // The excerpt starts right after the first title-length characters of the content
    // (presumably the stored content begins with the title - TODO confirm).
    // A missing title is treated as having length zero.
    int start = page.title == null ? 0 : page.title.Length;
    if (start >= page.content.Length)
    {
        return "";
    }

    int length = Math.Min(MaxAbstractLength, page.content.Length - start);
    return page.content.Substring(start, length);
}
/// <summary>
/// Loads the forward index and the word IDF table from storage into
/// <c>forward_index</c> and <c>word_idf</c>, both of which are declared outside
/// this method.
/// </summary>
static void ForwardIndexAndWordIDFInit()
{
    MongodbAccess db = new MongodbAccess();

    forward_index = db.GetForwardIndex();
    word_idf = db.GetWordIDF();
}
/// <summary>
/// Runs a search for the given query strings and renders the hits as an HTML
/// ordered list.
/// </summary>
/// <param name="query">Raw query strings; each one is segmented into words.</param>
/// <returns>
/// An <c>ol</c> element containing one <c>li</c> per hit (linked title, abstract
/// and URL), or a short message when nothing was rendered.
/// </returns>
static string ProcessQuery(string[] query)
{
    /* Segment the query into words, keeping first-seen order and dropping duplicates. */
    List<string> query_words = new List<string>();
    HashSet<string> seen = new HashSet<string>();
    PanGu.Segment.Init();
    Segment seg = new Segment();
    foreach (string q in query)
    {
        ICollection<WordInfo> words = seg.DoSegment(q);
        foreach (WordInfo wi in words)
        {
            // HashSet.Add returns false for a word that was already collected.
            if (seen.Add(wi.Word))
            {
                query_words.Add(wi.Word);
            }
        }
    }

    /* Retrieve the ids of the matching documents, then sort them. */
    MongodbAccess mongo = new MongodbAccess();
    List<ObjectId> docIds = mongo.GetDocIDByQuery(query_words);
    docIds = SortResult(docIds, query_words);
    List<DocUrlAbstractResult> result = GetResult(docIds);

    StringBuilder strbuilder = new StringBuilder();
    foreach (DocUrlAbstractResult duar in result)
    {
        // Title, abstract and URL come from crawled pages, so they are HTML-encoded
        // before being placed into the markup (this also keeps a quote character in
        // a URL from terminating the href attribute).
        string title = System.Net.WebUtility.HtmlEncode(duar.title);
        string abst = System.Net.WebUtility.HtmlEncode(duar.abst);
        string url = System.Net.WebUtility.HtmlEncode(duar.url);

        strbuilder.AppendFormat(
            "<li><div><span><a href='{2}' target='_blank' class='link'>{0}</a></span><br/><span class='abstract'>{1}</span><br/><span class='url'>{2}</span></div></li>",
            title, abst, url);
    }

    if (strbuilder.Length == 0)
    {
        strbuilder.Append("No pages match the query.");
    }

    return "<ol>" + strbuilder.ToString() + "</ol>";
}
/// <summary>
/// Entry point of the crawler. Retrieves each URL produced by
/// <c>GetWaitingURLs</c>, inserts every page that was obtained and logs progress.
/// </summary>
/// <param name="args">Command-line arguments; this method does not use them.</param>
static void Main(string[] args)
{
    MongodbAccess storage = new MongodbAccess();

    HashSet<string> knownUrls = storage.GetCrawledURLs();
    Util.log("{0} urls have been crawled.", knownUrls.Count);

    // Seeded with a constant: the pause pattern repeats identically between runs.
    Random pauseDice = new Random(99);

    HashSet<string> pendingUrls = GetWaitingURLs(knownUrls);
    foreach (string pendingUrl in pendingUrls)
    {
        WebPage fetched = GetWebPage(pendingUrl);
        if (fetched == null)
        {
            // Nothing was retrieved for this URL; move on to the next one.
            continue;
        }

        storage.InsertWebPage(fetched);
        Util.log("crawl {0} done.", pendingUrl);

        bool shouldPause = pauseDice.Next(100) > 80;
        if (shouldPause)
        {
            System.Threading.Thread.Sleep(500);
        }
    }

    /*
     * Disabled debugging aid that prints the title and publication time of every waiting page:
     *
     * HashSet<string> urls = GetWaitingURLs(null);
     * Console.WriteLine(urls.Count);
     * foreach (string url in urls)
     * {
     *     WebPage page = GetWebPage(url);
     *     if (page == null) continue;
     *     Console.WriteLine("{0} {1}", page.title, page.published_time);
     * }
     */
}