/// <summary>
/// Builds the forward index. For every crawled URL whose page can be loaded,
/// the page content is segmented by <c>ContetnWordSegment</c>, each word that
/// passes <c>_check</c> is stored together with its integer value in a forward
/// index item, and the item is inserted into the database. Once all pages are
/// processed, the set of accepted words is logged and saved as the word dictionary.
/// </summary>
static void CreateForwordIndex()
{
    PanGu.Segment.Init();
    MongodbAccess mongo = new MongodbAccess();
    HashSet<string> vocabulary = new HashSet<string>();

    foreach (string url in mongo.GetCrawledURLs())
    {
        WebPage page = mongo.GetWebPageByURL(url);
        if (page != null)
        {
            // Word -> integer value produced by the segmenter
            // (presumably the occurrence count; confirm in ContetnWordSegment).
            Dictionary<string, int> segmented = ContetnWordSegment(page.content);

            ForwardIndexItem item = new ForwardIndexItem();
            item.webpage_id = page._id;
            item.words = new MongoDB.Bson.BsonDocument();

            foreach (KeyValuePair<string, int> entry in segmented)
            {
                if (_check(entry.Key))
                {
                    item.words.Add(new MongoDB.Bson.BsonElement(entry.Key, entry.Value));
                    // HashSet.Add ignores words that are already present.
                    vocabulary.Add(entry.Key);
                }
            }

            mongo.InsertForwardIndexItem(item);
        }
    }

    Util.log("total {0} words.", vocabulary.Count);
    mongo.SaveWordDict(vocabulary);
}
/// <summary>
/// Builds the inverted index and the IDF value for every word in the word dictionary.
/// For each word, the ids of all forward-index entries that contain the word are
/// collected into an inverted index item, the item is inserted, and
/// log10(total entries / entries containing the word) is stored as the word's IDF.
/// </summary>
/// <remarks>
/// A word that appears in no forward-index entry (for example when the word
/// dictionary and the forward index are out of sync) would make the IDF formula
/// divide by zero and yield Infinity or NaN. Such words are stored with an IDF
/// of 0 instead, so that no non-finite value is persisted.
/// </remarks>
static void CreateInvertedIndexAndIDF()
{
    MongodbAccess mongo = new MongodbAccess();

    Dictionary<ObjectId, Dictionary<string, int>> forward_index = mongo.GetForwardIndex();
    Util.log("{0} items in forward index.", forward_index.Count);

    HashSet<string> words = mongo.GetWordDict();
    Util.log("{0} items in word dict.", words.Count);

    int totalDocs = forward_index.Count;

    foreach (string w in words)
    {
        InvertedIndexItem inverteditem = new InvertedIndexItem();
        inverteditem.word = w;
        inverteditem.webpage_ids = new BsonArray();

        // Iterate key/value pairs so each page's word table is looked up only once.
        foreach (KeyValuePair<ObjectId, Dictionary<string, int>> entry in forward_index)
        {
            if (entry.Value.ContainsKey(w))
            {
                inverteditem.webpage_ids.Add(entry.Key);
            }
        }

        mongo.InsertInvertedIndexItem(inverteditem);

        // Guard the division: with zero matching pages the quotient is not finite.
        int docFrequency = inverteditem.webpage_ids.Count;
        double idf = docFrequency > 0
            ? Math.Log10((double)totalDocs / docFrequency)
            : 0.0;
        mongo.SetWordIDF(w, idf);
    }
}
/// <summary>
/// Loads the forward index and the word IDF table from the database and assigns
/// them to <c>forward_index</c> and <c>word_idf</c>, which are declared outside
/// this method.
/// </summary>
static void ForwardIndexAndWordIDFInit()
{
    MongodbAccess db = new MongodbAccess();

    // Forward index first, then the IDF table (same order as the database calls before).
    forward_index = db.GetForwardIndex();
    word_idf = db.GetWordIDF();
}
/// <summary>
/// Returns a short abstract for the page with the given id: up to 150 characters
/// of the page content, starting at the offset equal to the title's length.
/// </summary>
/// <param name="docId">Id of the page to look up.</param>
/// <returns>
/// The abstract text, or an empty string when the page cannot be found, has no
/// content, or its content is not longer than its title.
/// </returns>
static string GetAbstract(ObjectId docId)
{
    const int MaxAbstractLength = 150;

    MongodbAccess mongo = new MongodbAccess();
    WebPage page = mongo.GetWebPageById(docId);

    // A missing page or missing content yields an empty abstract instead of a
    // NullReferenceException.
    if (page == null || page.content == null)
    {
        return "";
    }

    // The abstract starts right after the first title.Length characters of the
    // content (presumably because the stored content is prefixed with the title;
    // verify against the code that writes page.content).
    int start = page.title == null ? 0 : page.title.Length;
    if (start >= page.content.Length)
    {
        return "";
    }

    int length = Math.Min(MaxAbstractLength, page.content.Length - start);
    return page.content.Substring(start, length);
}
/// <summary>
/// Entry point. For every crawled URL whose page can be loaded, runs
/// <c>TextExtract</c> over the page HTML and stores the page title followed by
/// the extracted text as the page content, logging progress per URL and the
/// total at the end.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    MongodbAccess mongo = new MongodbAccess();
    int processed = 0;

    foreach (string url in mongo.GetCrawledURLs())
    {
        WebPage page = mongo.GetWebPageByURL(url);
        if (page != null)
        {
            TextExtract extractor = new TextExtract(page.html, true);

            // Stored content is the title immediately followed by the extracted text.
            string content = page.title + extractor.content;
            mongo.SetWebPageContent(url, content);

            processed++;
            Util.log("{0} {1}", processed, url);
        }
    }

    Util.log("processed {0} urls.", processed);
}
/// <summary>
/// Builds the display records (title, URL, abstract) for the given page ids,
/// in the same order as the ids.
/// </summary>
/// <param name="docIds">Ids of the pages to present; may be null.</param>
/// <returns>
/// One result per id whose page could be loaded. Ids with no matching page are
/// skipped, and a null <paramref name="docIds"/> yields an empty list.
/// </returns>
static List<DocUrlAbstractResult> GetResult(List<ObjectId> docIds)
{
    List<DocUrlAbstractResult> ret = new List<DocUrlAbstractResult>();
    if (docIds == null)
    {
        return ret;
    }

    MongodbAccess mongo = new MongodbAccess();
    foreach (ObjectId docId in docIds)
    {
        WebPage page = mongo.GetWebPageById(docId);

        // An id that no longer resolves to a page is skipped rather than
        // crashing the whole result list with a NullReferenceException.
        if (page == null)
        {
            continue;
        }

        DocUrlAbstractResult duar = new DocUrlAbstractResult();
        duar.title = page.title;
        duar.url = page.url;
        duar.abst = GetAbstract(docId);
        ret.Add(duar);
    }

    return ret;
}
/// <summary>
/// Answers a search query: segments the query strings into distinct words,
/// retrieves and sorts the matching page ids, and renders the results as an
/// HTML ordered list.
/// </summary>
/// <param name="query">Raw query strings; a null array or null elements are ignored.</param>
/// <returns>
/// An <c>&lt;ol&gt;</c> element containing one <c>&lt;li&gt;</c> per result, or a
/// "no match" message inside the list when nothing was found. Title, abstract and
/// URL are HTML-encoded because they originate from crawled pages.
/// </returns>
static string ProcessQuery(string[] query)
{
    // Segment the query into words, keeping each distinct word once in first-seen order.
    List<string> query_words = new List<string>();
    PanGu.Segment.Init();
    Segment seg = new Segment();
    if (query != null)
    {
        foreach (string q in query)
        {
            if (q == null)
            {
                continue;
            }

            ICollection<WordInfo> words = seg.DoSegment(q);
            foreach (WordInfo wi in words)
            {
                if (!query_words.Contains(wi.Word))
                {
                    query_words.Add(wi.Word);
                }
            }
        }
    }

    // Retrieve the ids of the matching documents and order them.
    MongodbAccess mongo = new MongodbAccess();
    List<ObjectId> docIds = mongo.GetDocIDByQuery(query_words);
    docIds = SortResult(docIds, query_words);
    List<DocUrlAbstractResult> result = GetResult(docIds);

    StringBuilder strbuilder = new StringBuilder();
    foreach (DocUrlAbstractResult duar in result)
    {
        // Encode every crawled value before placing it in markup; HtmlEncode also
        // escapes the single quote, so the single-quoted href attribute stays intact.
        strbuilder.AppendFormat(
            "<li><div><span><a href='{2}' target='_blank' class='link'>{0}</a></span><br/><span class='abstract'>{1}</span><br/><span class='url'>{2}</span></div></li>",
            System.Net.WebUtility.HtmlEncode(duar.title),
            System.Net.WebUtility.HtmlEncode(duar.abst),
            System.Net.WebUtility.HtmlEncode(duar.url));
    }

    if (strbuilder.Length == 0)
    {
        strbuilder.Append("No pages match the query.");
    }

    return "<ol>" + strbuilder.ToString() + "</ol>";
}