/// <summary>
/// Runs one WeChat search for the given task.
/// </summary>
/// <remarks>
/// Builds a <c>WeChatQuery</c> from the concatenation of the task's Keyword and
/// CommendKeyword and calls its single-argument <c>Query</c>. The value returned by
/// <c>Query</c> is not used here.
/// </remarks>
/// <param name="searchTask">Task to search for.</param>
void Snapshot(IW2S_WX_BaiduCommend searchTask)
{
    string queryText = searchTask.Keyword + searchTask.CommendKeyword;
    WeChatQuery weChatQuery = new WeChatQuery(queryText);

    // Only the call itself matters to this method; its result is discarded.
    weChatQuery.Query(searchTask);
}
/// <summary>
/// Endless worker loop: fetches the next bot task, runs <c>Snapshot</c> for it and
/// records progress on the task document.
/// </summary>
/// <remarks>
/// <para>
/// When no task is available, or the task has no CommendKeyword, the bot is marked
/// ready and the loop polls again after one second.
/// </para>
/// <para>
/// For a usable task, WXStatus and BotStatus are set to 1 before the search and to 2
/// after it (presumably "in progress" and "done" - confirm against the consumers of
/// these fields).
/// </para>
/// <para>
/// The claim update, the search and the completion update all run inside one
/// try/catch so that a failure in any of them is logged and the loop moves on to
/// the next task instead of ending the worker. A task whose search failed keeps
/// status 1.
/// </para>
/// </remarks>
public void Run()
{
    while (true)
    {
        BotTaskService bt = new BotTaskService();
        IW2S_WX_BaiduCommend keyTask = bt.GetBotTask();

        if (keyTask == null || string.IsNullOrEmpty(keyTask.CommendKeyword))
        {
            SetReady();
            log("No search task ! start search Detail !!!");
            Thread.Sleep(1000);
            continue;
        }

        SetBusy();

        try
        {
            // Mark the task as picked up before the (slow) crawl starts.
            var markStarted = new UpdateDocument { { "$set", new QueryDocument { { "WXStatus", 1 }, { "BotStatus", 1 } } } };
            MongoDBHelper.Instance.Get_IW2S_BaiduCommend().UpdateOne(new QueryDocument { { "_id", keyTask._id } }, markStarted);

            Snapshot(keyTask);

            // Only reached when the crawl did not throw.
            var markFinished = new UpdateDocument { { "$set", new QueryDocument { { "WXStatus", 2 }, { "BotStatus", 2 } } } };
            MongoDBHelper.Instance.Get_IW2S_BaiduCommend().UpdateOne(new QueryDocument { { "_id", keyTask._id } }, markFinished);
        }
        catch (Exception ex)
        {
            log(DateTime.Now + "ERROR ." + ex.Message);
            Thread.Sleep(2000);
        }
    }
}
/// <summary>
/// Builds one <c>WX_links</c> edge from <paramref name="searchTsk"/> to every task in
/// <paramref name="taskList"/>, passes the edges to <c>WX_SaveResult</c> and returns them.
/// </summary>
/// <remarks>
/// The edge value is the number of link titles (loaded through
/// <c>GetLinkTitleList</c> for the task's project) that contain both the source
/// task's CommendKeyword and the target task's CommendKeyword. <c>String.Contains</c>
/// is used, so the match is ordinal and case-sensitive. A missing link list or a null
/// keyword yields a value of 0 instead of an exception.
/// </remarks>
/// <param name="searchTsk">Source task; its position in <paramref name="taskList"/> becomes the edge source.</param>
/// <param name="taskList">All tasks to relate the source task to; one edge is produced per entry.</param>
/// <returns>The list of edges that was passed to <c>WX_SaveResult</c>.</returns>
public List<WX_links> Query(IW2S_WX_BaiduCommend searchTsk, List<IW2S_WX_BaiduCommend> taskList)
{
    List<WX_links> linkvaluelist = new List<WX_links>();
    WX_BotTaskService WX_bt = new WX_BotTaskService();
    List<IW2S_WX_level1link> linklist = WX_bt.GetLinkTitleList(searchTsk.ProjectId);

    string sourceKeyword = searchTsk.CommendKeyword;

    // Whether a title mentions the source keyword does not depend on the target task,
    // so that filter is applied once here instead of once per target.
    List<string> sourceTitles = new List<string>();
    if (linklist != null && sourceKeyword != null)
    {
        foreach (var item in linklist)
        {
            if (item != null && !string.IsNullOrEmpty(item.Title) && item.Title.Contains(sourceKeyword))
            {
                sourceTitles.Add(item.Title);
            }
        }
    }

    // Position of the last task carrying the same keyword as the source; 0 when none matches.
    int index = 0;
    for (int i = taskList.Count - 1; i >= 0; i--)
    {
        if (sourceKeyword == taskList[i].CommendKeyword)
        {
            index = i;
            break;
        }
    }

    for (int i = 0; i < taskList.Count; i++)
    {
        string targetKeyword = taskList[i].CommendKeyword;

        int linkNum = 0;
        if (targetKeyword != null)
        {
            foreach (string title in sourceTitles)
            {
                if (title.Contains(targetKeyword))
                {
                    linkNum = linkNum + 1;
                }
            }
        }

        WX_links lk = new WX_links();
        lk.source = index;
        lk.target = i;
        lk.KeywordId = searchTsk._id;
        lk.ProjectId = searchTsk.ProjectId;
        lk.value = linkNum;
        lk.Gid = IDHelper.GetGuid("{0}/{1}/{2}/{3}".FormatStr(lk.source, lk.target, lk.KeywordId, lk.ProjectId));
        linkvaluelist.Add(lk);
    }

    WX_SaveResult(linkvaluelist);
    return linkvaluelist;
}
/// <summary>
/// Resolves the start URL for the task via <c>get_url</c> and collects the links
/// that <c>GetLinks</c> returns for it.
/// </summary>
/// <param name="searchTsk">Task whose Keyword and CommendKeyword are passed to <c>get_url</c>.</param>
/// <returns>
/// <c>null</c> when <c>get_url</c> yields null or an empty string; otherwise a new list
/// containing whatever <c>GetLinks</c> returned (possibly empty).
/// </returns>
public List<IW2S_WX_level1link> Query(IW2S_WX_BaiduCommend searchTsk)
{
    var startUrl = get_url(searchTsk.Keyword, searchTsk.CommendKeyword);
    if (string.IsNullOrEmpty(startUrl))
    {
        // No URL to crawl: signalled with null rather than an empty list.
        return null;
    }

    List<IW2S_WX_level1link> collected = new List<IW2S_WX_level1link>();

    var found = GetLinks(startUrl, searchTsk);
    bool hasLinks = found != null && found.Count > 0;
    if (hasLinks)
    {
        collected.AddRange(found);
    }

    return collected;
}
/// <summary>
/// Runs the two-argument <c>WX_WeChatQuery.Query</c> for one task against a list of tasks.
/// </summary>
/// <remarks>
/// The query object is constructed from the concatenation of the task's Keyword and
/// CommendKeyword. The value returned by <c>Query</c> is not used here.
/// </remarks>
/// <param name="searchTask">Task to run the query for.</param>
/// <param name="listKey">Task list forwarded unchanged to <c>Query</c>.</param>
void WX_Snapshot(IW2S_WX_BaiduCommend searchTask, List<IW2S_WX_BaiduCommend> listKey)
{
    string queryText = searchTask.Keyword + searchTask.CommendKeyword;
    WX_WeChatQuery relationQuery = new WX_WeChatQuery(queryText);

    // Only the call itself matters to this method; its result is discarded.
    relationQuery.Query(searchTask, listKey);
}
/// <summary>
/// Crawls Sogou WeChat search result pages starting at <paramref name="link"/>, builds an
/// <c>IW2S_WX_level1link</c> for every result entry and hands each page's records to
/// <c>SaveResult</c>.
/// </summary>
/// <remarks>
/// <para>
/// At most three result pages are fetched (<c>quried_pages</c> runs 0..2). For every
/// result entry containing "txt-box" the method additionally fetches the article page,
/// the same URL with "&amp;f=json" appended (to extract the "link" value), and the
/// account page behind the entry's source link. Randomised sleeps separate the requests.
/// </para>
/// <para>
/// NOTE(review): the buffer list is cleared right after each page is passed to
/// <c>SaveResult</c>, and every exit path leaves it empty, so callers receive an empty
/// list. That behaviour is kept as found; confirm whether callers expect the links.
/// </para>
/// </remarks>
/// <param name="link">URL of the first result page; crawling stops once this is null or empty.</param>
/// <param name="searchTsk">
/// Task supplying Keyword and CommendKeyword for matching, and UsrId, ProjectId and _id
/// for the stored records.
/// </param>
/// <returns>The buffer list (empty on return, see remarks).</returns>
public List<IW2S_WX_level1link> GetLinks(string link, IW2S_WX_BaiduCommend searchTsk)
{
    List<IW2S_WX_level1link> result = new List<IW2S_WX_level1link>();
    int nohist_pages = 0;
    int quried_pages = 0;

    // One generator and one pattern for the whole crawl instead of new instances per item.
    Random rng = new Random();
    Regex publishDatePattern = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");

    // Search at most 3 result pages.
    while (!string.IsNullOrEmpty(link) && quried_pages <= 2)
    {
        log(link);

        CookieContainer cc = new CookieContainer();
        Encoding enc = null;
        CookieCollection cookiesColl = new CookieCollection();
        CookieCollection cookieCollection = new CookieCollection();
        string Rurl = "http://weixin.sogou.com/";
        string cookie = "";

        // The home page is requested first; its content is not used, only the cookies it hands back.
        TaobaoWebHelper.GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;

        Thread.Sleep(rng.Next(5000, 8000));

        Rurl = link;
        var html = get_html(link, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;

        if (html == null)
        {
            break;
        }
        // Sogou's "no related WeChat articles found" message.
        if (html.Contains("没有找到相关的微信公众号文章"))
        {
            break;
        }

        var tags = html.SplitWith("wx-rb wx-rb3");
        if (tags == null || tags.Length == 0 || tags.Length == 1)
        {
            // Fall back to the marker without the space when the first split found nothing.
            tags = html.SplitWith("wx-rbwx-rb3");
        }
        if (tags == null || tags.Length == 0)
        {
            log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
            break;
        }

        bool nohit = true;
        foreach (var tag in tags)
        {
            if (!tag.Contains("txt-box"))
            {
                continue;
            }

            string title = RemoveInivalidChar(tag.SubAfter("<h4").SubBefore("</h4>").GetTxtFromHtml2().RemoveSpace());
            string href = tag.SubAfter("<h4").SubBefore("</a>").GetFirstHref2();
            string abs = RemoveInivalidChar(tag.SubAfter("<h4>").SubBefore("\"s-p\"").SubBefore("<script>").GetTxtFromHtml2().RemoveSpace());
            string domain = tag.SubLastStringAfter("\"s-p\"").SubBefore("</a").GetTxtFromHtml2().SubAfter("(").SubAfter("(").SubBefore(",").Replace('"', ' ').Trim();
            string SourceLink = tag.SubLastStringAfter("\"s-p\"").SubBefore("</a").GetFirstHref2();
            string TitleImg = tag.SubAfter("img_box2").SubBefore("</a").SubAfter("src=").Replace(">", "").Replace('"', ' ').RemoveSpace();

            // Title and abstract joined; used for the keyword matching below.
            string txt = "{0},{1}".FormatStr(title, abs);
            if (string.IsNullOrEmpty(txt))
            {
                continue;
            }

            if (href.IsStartWith("/websearch"))
            {
                href = "http://weixin.sogou.com" + href.Replace("amp;", "");
            }
            href = href.Replace("amp;", "");

            Thread.Sleep(rng.Next(8000, 20000));
            var htmldetail = get_Detailehtml(href, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);

            // First date-looking token in the article page; stays empty when the page
            // could not be fetched or contains no such token.
            string time = "";
            if (!string.IsNullOrEmpty(htmldetail))
            {
                Match m = publishDatePattern.Match(htmldetail);
                if (m.Success)
                {
                    time = m.Value;
                }
            }

            // Continue with the URL reported back by the detail request.
            href = Rurl;
            var hrefNew = href + "&f=json";
            var htmldetailNewUrl = get_Detailehtml(hrefNew, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
            try
            {
                var uuurl = htmldetailNewUrl.SubAfter("\"link\":").SubBefore(",\"source_url\":").Replace('"', ' ').Replace("\\", "").RemoveSpace();
                // Keep the URL we already have when the JSON response yields no link.
                if (!string.IsNullOrEmpty(uuurl))
                {
                    href = uuurl;
                }
            }
            catch (Exception ex)
            {
                // Best effort: the record is still stored with the URL from the detail request.
                log("link extraction failed: " + ex.Message);
            }

            // Lower-cased once per entry and shared by all four match checks.
            var keywordLower = searchTsk.Keyword.GetLower();
            var commendLower = searchTsk.CommendKeyword.GetLower();
            var txtLower = txt.GetLower();

            bool is_title_matched = title.GetLower().IsContains2(keywordLower, commendLower);
            bool is_abstr_matched = abs.GetLower().IsContains2(keywordLower, commendLower);
            BaiduItemPart part;
            if (is_title_matched && is_abstr_matched)
            {
                part = BaiduItemPart.TitleAbstract;
            }
            else if (is_title_matched)
            {
                part = BaiduItemPart.Title;
            }
            else if (is_abstr_matched)
            {
                part = BaiduItemPart.Abstract;
            }
            else
            {
                part = BaiduItemPart.None;
            }

            bool is_itm_title_matched = txtLower.IsContains(keywordLower);
            bool is_bus_matched = txtLower.IsContains2(commendLower);

            var no = "";
            var qrcode = "";
            var function = "";
            var NoIcon = "";
            var QrcodeIcon = "";

            SourceLink = SourceLink.Replace("amp;", "");
            Thread.Sleep(rng.Next(8000, 15000));
            var htmlNo = get_Nohtml(SourceLink, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
            if (!string.IsNullOrEmpty(htmlNo) && htmlNo.Contains("em_weixinhao"))
            {
                no = htmlNo.SubAfter("em_weixinhao").SubBefore("/label").GetTxtFromHtml2().RemoveSpace();
                qrcode = htmlNo.SubAfter("v-box").SubBefore("<em").SubAfter("src=").Replace(">", "").Replace('"', ' ').RemoveSpace();
                function = htmlNo.SubAfter("功能介绍:</").SubBefore("/span").GetTxtFromHtml2().RemoveSpace();
                // From here on SourceLink holds the text following "微信认证:" on the account page.
                SourceLink = htmlNo.SubAfter("微信认证:").SubBefore("/div").GetTxtFromHtml2().RemoveSpace();
                NoIcon = htmlNo.SubAfter("img-box").SubBefore("</a").SubAfter("src=").SubBefore("onload").Replace(">", "").Replace('"', ' ').RemoveSpace();
                QrcodeIcon = htmlNo.SubAfter("img-box").SubBefore("</a").SubAfter("err:").SubBefore(">").Replace(">", "").Replace('"', ' ').Replace("'", "").RemoveSpace();
            }

            IW2S_WX_level1link l1 = new IW2S_WX_level1link
            {
                BizId = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(title, domain, searchTsk.UsrId)),
                Description = abs,
                Domain = domain,
                UsrId = searchTsk.UsrId,
                LinkUrl = href,
                MatchAt = (byte)part,
                Title = title,
                CreatedAt = DateTime.Now,
                DataCleanStatus = 0,
                Function = function,
                SearchkeywordId = searchTsk._id.ToString(),
                Keywords = searchTsk.Keyword,
                PublicNo = no,
                QrCode = qrcode,
                SourceLink = SourceLink,
                TagType = 0,
                ImgIcon = NoIcon,
                QrCodeIcon = QrcodeIcon,
                ProjectId = searchTsk.ProjectId,
                TitleImg = TitleImg,
                PublishTime = time,
                Html = htmldetail
            };

            // 10 for a CommendKeyword match plus 30 for a Keyword match. This value used
            // to be computed into a local and dropped; it is now stored on the record.
            l1.MatchType = (byte)((is_bus_matched ? 10 : 0) + (is_itm_title_matched ? 30 : 0));

            if (is_bus_matched && is_itm_title_matched)
            {
                l1.Score = 80 + 10;
            }
            else if (is_bus_matched)
            {
                l1.Score = 80;
            }
            else if (is_itm_title_matched)
            {
                l1.Score = 50;
            }

            result.Add(l1);
            nohit = false;
            nohist_pages = 0;
        }

        if (nohit)
        {
            nohist_pages++;
        }
        // Give up after consecutive pages without results.
        // NOTE(review): with the 3-page cap above this threshold (> 3) cannot be reached.
        if (nohist_pages > 3)
        {
            break;
        }

        quried_pages++;
        pages++;

        // sogou: next-page extraction, flagged by the original author as needing a rewrite.
        link = html.SubAfter("sogou_next").SubBefore("下一页").GetLastHref2();
        if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
        {
            if (link.IsStartWith("/"))
            {
                link = link.SubAfter("/");
            }
            link = "http://weixin.sogou.com/weixin".GetContact(link);
        }

        // Persist this page's records, then empty the buffer before the next page.
        SaveResult(result);
        result.Clear();

        Thread.Sleep(rng.Next(8000, 15000));
    }

    return result;
}