/// <summary>
/// Turns every entry of <c>wlData["data"]["item_list"]</c> into an
/// <see cref="IW2S_WB_level1link"/> attributed to <paramref name="wbb"/> and passes each
/// one to <c>save_level1_links</c>.
/// </summary>
/// <param name="wlData">
/// Payload to import. The paths read are <c>data.task_id</c>, <c>data.keyword_list</c> and,
/// for each element of <c>data.item_list</c>, <c>weibo_url</c>, <c>text</c> and <c>user</c>
/// (<c>screen_name</c>, <c>profile_image_url</c>, <c>profile_url</c>, <c>urank</c>,
/// <c>verified</c>). Each item is modified in place: <c>task_id</c> and <c>keywords</c> are
/// written onto it.
/// </param>
/// <param name="wbb">
/// Search task whose <c>UsrId</c>, <c>Keyword</c>, <c>ProjectId</c> and <c>_id</c> are copied
/// onto every stored link.
/// </param>
public static void SaveData(JObject wlData, IW2S_WB_BaiduCommend wbb)
{
    var data = wlData["data"];
    var taskid = data["task_id"].Value<int>();
    var keywords = data["keyword_list"];

    foreach (var item in data["item_list"])
    {
        // The item is tagged in place, so a caller that still holds wlData sees these two
        // fields. The tagging originally fed a raw-document upsert that is no longer
        // performed; the unused per-item string/BSON conversion that went with it has been
        // dropped.
        item["task_id"] = taskid;
        item["keywords"] = keywords;

        var postUrl = item["weibo_url"].ToString();
        var abs = item["text"].ToString();

        var user = item["user"];
        string nickName = user["screen_name"].ToString();
        string weiboFace = user["profile_image_url"].ToString();
        string posterUrl = "http://weibo.com/" + user["profile_url"].ToString();

        // A missing or malformed rank / verified flag must not abort the rest of the batch:
        // fall back to 0 / false instead of throwing.
        int rank;
        var rankToken = user["urank"];
        if (rankToken == null || !int.TryParse(rankToken.ToString(), out rank))
        {
            rank = 0;
        }

        bool isBlueV;
        var verifiedToken = user["verified"];
        if (verifiedToken == null || !bool.TryParse(verifiedToken.ToString(), out isBlueV))
        {
            isBlueV = false;
        }

        var linkData = new IW2S_WB_level1link
        {
            PosterUrl = posterUrl,
            PostUrl = postUrl,
            HeadIcon = weiboFace,
            NickName = nickName,
            Description = abs,
            IsBlueV = isBlueV,
            UsrId = wbb.UsrId,
            Keywords = wbb.Keyword,
            // NOTE(review): local time shifted by +8h, kept exactly as before - confirm the
            // intended time zone handling.
            CreatedAt = DateTime.Now.AddHours(8),
            IsDel = false,
            ProjectId = wbb.ProjectId,
            Rank = rank,
            SearchkeywordId = wbb._id,
            // Post URL concatenated with the task id, so the same post found by two tasks
            // produces two different BizIds.
            BizId = string.Format("{0}{1}", postUrl, wbb._id.ToString()).ToObjectId()
        };

        // NOTE(review): the save_level1_links visible in this file is an instance method
        // while this method is static - confirm which overload/receiver this resolves to.
        save_level1_links(new List<IW2S_WB_level1link> { linkData }, wbb);
    }
}
/// <summary>
/// Inserts into the level-1 link collection those entries of <paramref name="links"/> whose
/// <c>BizId</c> is not stored there yet.
/// </summary>
/// <remarks>
/// The input is processed in pages of 100. Each page is de-duplicated by <c>BizId</c>, the
/// collection is queried for the BizIds already present, and only the remaining links of
/// that page are inserted. A line is written to the console for every page that inserted
/// something, and once when the input is null or empty.
/// </remarks>
/// <param name="links">Candidate links; may be null or empty, in which case nothing is stored.</param>
/// <param name="tsk">Task whose <c>Keyword</c> appears in the console messages.</param>
public void save_level1_links(List<IW2S_WB_level1link> links, IW2S_WB_BaiduCommend tsk)
{
    if (links == null || links.Count == 0)
    {
        Console.WriteLine("SUCCESS saving 0 Level 1 Links for " + tsk.Keyword);
        return;
    }

    const int pagesize = 100;
    var col = MongoDBHelper.Instance.GetIW2S_WB_level1links();
    var builder = Builders<IW2S_WB_level1link>.Filter;

    for (int page = 0; page * pagesize < links.Count; page++)
    {
        var list = links.Skip(page * pagesize).Take(pagesize).ToList();
        list = ListDistinctBy(list, x => x.BizId);

        // Look up which of this page's BizIds are already stored.
        List<ObjectId> bizIds = list.Select(x => x.BizId).ToList();
        var existsIds = new HashSet<ObjectId>(
            col.Find(builder.In(x => x.BizId, bizIds)).Project(x => x.BizId).ToList());

        if (existsIds.Count > 0)
        {
            list = list.Where(x => !existsIds.Contains(x.BizId)).ToList();
        }

        if (list.Count == 0)
        {
            continue;
        }

        // Insert only the filtered page. Inserting the whole input here would store
        // already-existing links again, and would do so once per page.
        col.InsertMany(list);
        Console.WriteLine("SUCCESS saving " + list.Count + " Level 1 Links for " + tsk.Keyword);
    }
}
/// <summary>
/// Fetches Baidu weibo-search result pages for <c>p.Keyword</c>, builds one
/// <see cref="IW2S_WB_level1link"/> per result entry and passes each to
/// <c>save_level1_links</c>. Any exception is caught and only its message is printed.
/// </summary>
/// <param name="p">Search task supplying the keyword and the ids copied onto each link.</param>
private void query(IW2S_WB_BaiduCommend p)
{
    // Example request: http://www.baidu.com/s?rtt=2&tn=baiduwb&wd=%E8%80%81%E9%85%B8%E5%A5%B6
    try
    {
        string pageUrl = "http://www.baidu.com/s?rtt=2&tn=baiduwb&wd={0}".FormatStr(p.Keyword);
        int emptyPagesInRow = 0;
        int pagesFetched = 0;
        int position = 1;

        // The loop condition allows pagesFetched values 0..3 (the original comment spoke of
        // "at most 60 pages").
        while (!string.IsNullOrEmpty(pageUrl) && pagesFetched <= 3)
        {
            Console.WriteLine(pageUrl);

            var markup = proxy.GetFastHtmlWithProxyIpAndARE(pageUrl, "utf-8").RemoveSpace();
            if (markup == null)
            {
                break;
            }

            DateTime fetchedAt = DateTime.Now;

            var entries = markup.SubAfter("id=\"weibo\"").SubBefore("</ol>").SplitWith("<li");
            if (entries == null || entries.Length == 0)
            {
                Console.WriteLine("BLOCKED " + p.Keyword);
                break;
            }

            bool pageWasEmpty = true;
            foreach (string entry in entries)
            {
                var detail = entry.SubAfter("weibo_detail");
                string nick = RemoveInivalidChar(
                    detail.RemoveSpace().GetLower().SubBefore("</a>").GetTxtFromHtml2().RemoveSpace().GetLower());
                string authorUrl = entry.GetFirstHref2();
                string avatar = entry.GetFirstAttributeValue("img", "src");
                string summary = entry.SubBefore("weibo_all").GetTxtFromHtml2().GetLower();
                string permalink = detail.SubAfter("weibo_all").GetFirstAttributeValue("href");

                // Skip entries that yield neither an author URL nor any text.
                if (string.IsNullOrEmpty("{0}{1}".FormatStr(authorUrl, summary)))
                {
                    continue;
                }

                IW2S_WB_level1link record = new IW2S_WB_level1link();
                string publishInfo = entry.SubAfter("weibo_info").GetTxtFromHtml2().RemoveSpace().GetLower();

                // Relative age marker: hours, then minutes, then days - first match wins.
                string unit = null;
                if (publishInfo.Contains("小时前"))
                {
                    unit = "小时前";
                }
                else if (publishInfo.Contains("分钟前"))
                {
                    unit = "分钟前";
                }
                else if (publishInfo.Contains("天前"))
                {
                    unit = "天前";
                }

                // PublishTime is left untouched when no marker is present.
                if (unit != null)
                {
                    int ago = 0;
                    string agoText = publishInfo.SubAfter("评论").SubAfter(")").SubBefore(unit);
                    int.TryParse(agoText, out ago);

                    DateTime moment;
                    if (unit == "小时前")
                    {
                        moment = fetchedAt.AddHours(-ago);
                    }
                    else if (unit == "分钟前")
                    {
                        moment = fetchedAt.AddMinutes(-ago);
                    }
                    else
                    {
                        moment = fetchedAt.AddDays(-ago);
                    }

                    // Truncated to the calendar date, then shifted by 8 hours.
                    record.PublishTime = moment.Date.AddHours(8);
                }

                record.PosterUrl = authorUrl;
                record.PostUrl = permalink;
                record.HeadIcon = avatar;
                record.NickName = nick;
                record.Description = summary;
                record.IsBlueV = detail.Contains("weibo_level_icon");
                record.UsrId = p.UsrId;
                record.Keywords = p.Keyword;
                record.CreatedAt = DateTime.Now.AddHours(8);
                record.IsDel = false;
                record.ProjectId = p.ProjectId;
                record.Rank = position;
                record.SearchkeywordId = p._id;
                record.BizId = "{0}{1}".FormatStr(permalink, p._id.ToString()).ToObjectId();

                save_level1_links(new List<IW2S_WB_level1link> { record }, p);

                pageWasEmpty = false;
                emptyPagesInRow = 0;
                position++;
            }

            if (pageWasEmpty)
            {
                emptyPagesInRow++;
            }

            // Give up once more than three consecutive pages produced nothing.
            if (emptyPagesInRow > 3)
            {
                break;
            }

            pagesFetched++;
            pages++;

            // Follow the last href found between the pager marker and the "next page" label.
            pageUrl = markup.SubAfter("id=\"page\"").SubBefore("下一页").GetLastHref2();
            bool isRelative = !string.IsNullOrEmpty(pageUrl) && !pageUrl.IsStartWith("http");
            if (isRelative)
            {
                if (pageUrl.IsStartWith("/"))
                {
                    pageUrl = pageUrl.SubAfter("/");
                }

                pageUrl = "http://www.baidu.com/".GetContact(pageUrl);
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
}