// Runs a Sogou query for one task: creates a fresh SogouQuery and passes the
// task to its Query method. Nothing is returned to the caller.
void Snapshot(IW2S_SG_BaiduCommend tsk)
{
    new SogouQuery().Query(tsk);
}
// Builds the first-page Sogou web-search URL for the task's keyword.
// The keyword goes through RemoveSpace() and GetLower(); the result is passed to
// GetUrlEncodedString("utf-8") and substituted into the query template.
// Returns string.Empty when the normalized keyword is null or empty.
string get_urls(IW2S_SG_BaiduCommend tsk)
{
    string normalizedKeyword = tsk.Keyword.RemoveSpace().GetLower();
    if (string.IsNullOrEmpty(normalizedKeyword))
    {
        return string.Empty;
    }

    string sogouUrlFormat = "https://www.sogou.com/web?query={0}&cid=&page=1&ie=utf8&dr=1";
    string encodedKeyword = normalizedKeyword.GetUrlEncodedString("utf-8");
    return sogouUrlFormat.FormatStr(encodedKeyword);
}
// Builds the search URL for the task with get_urls and crawls it with GetLinks.
// Any exception thrown by GetLinks is logged (message followed by stack trace)
// and not rethrown, so a failed crawl does not propagate to the caller.
public void Query(IW2S_SG_BaiduCommend tsk)
{
    // Built outside the try block, as before: a failure while building the URL
    // still propagates to the caller instead of being logged here.
    string link = get_urls(tsk);

    try
    {
        GetLinks(link, tsk);
    }
    catch (Exception ex)
    {
        log(ex.Message + ex.StackTrace);
    }
}
// Builds one SG_links entry per task in taskList, all sharing the same source
// index, saves the entries through SG_SaveResult and returns them.
//
// source : index of the LAST task in taskList whose CommendKeyword equals
//          searchTsk.CommendKeyword (0 when none matches).
// target : index of the task the entry is built for.
// value  : number of stored link titles (from GetLinkTitleList for the project)
//          that contain both searchTsk.CommendKeyword and the target task's
//          CommendKeyword; links with a null/empty title are ignored.
public List<SG_links> Query(IW2S_SG_BaiduCommend searchTsk, List<IW2S_SG_BaiduCommend> taskList)
{
    SG_BotTaskService taskService = new SG_BotTaskService();
    List<IW2S_SG_level1link> storedLinks = taskService.GetLinkTitleList(searchTsk.ProjectId);

    // Keep overwriting on every match so the last matching position wins.
    int sourceIndex = 0;
    int position = 0;
    foreach (IW2S_SG_BaiduCommend candidate in taskList)
    {
        if (searchTsk.CommendKeyword == candidate.CommendKeyword)
        {
            sourceIndex = position;
        }
        position++;
    }

    List<SG_links> edges = new List<SG_links>();
    for (int targetIndex = 0; targetIndex < taskList.Count; targetIndex++)
    {
        string targetKeyword = taskList[targetIndex].CommendKeyword;

        int sharedTitleCount = 0;
        foreach (var stored in storedLinks)
        {
            string title = stored.Title;
            if (string.IsNullOrEmpty(title))
            {
                continue;
            }

            if (title.Contains(searchTsk.CommendKeyword) && title.Contains(targetKeyword))
            {
                sharedTitleCount++;
            }
        }

        SG_links edge = new SG_links
        {
            source = sourceIndex,
            target = targetIndex,
            KeywordId = searchTsk._id,
            ProjectId = searchTsk.ProjectId,
            value = sharedTitleCount
        };
        // The id is derived from the edge's own source/target/keyword/project values.
        edge.Gid = IDHelper.GetGuid("{0}/{1}/{2}/{3}".FormatStr(edge.source, edge.target, edge.KeywordId, edge.ProjectId));
        edges.Add(edge);
    }

    SG_SaveResult(edges);
    return edges;
}
// Endless worker loop. Each iteration:
//   1. asks a new BotTaskService for the next task;
//   2. when there is no task (or its CommendKeyword is null/empty) calls SetReady(),
//      logs, waits 3 seconds and polls again;
//   3. otherwise calls SetBusy(), sets WXStatus/BotStatus to 1, runs Snapshot on the
//      task, then sets WXStatus/BotStatus to 2.
// This method never returns normally.
public void Run()
{
    while (true)
    {
        BotTaskService bt = new BotTaskService();
        IW2S_SG_BaiduCommend keyTask = bt.GetBotTask();
        if (keyTask == null || string.IsNullOrEmpty(keyTask.CommendKeyword))
        {
            SetReady();
            log("没有搜索任务 !!!");
            Thread.Sleep(3000);
            continue;
        }

        SetBusy();

        // NOTE(review): as in the original, the status-1 update and Snapshot run outside
        // the try block, so an exception from either one leaves this loop. Only the
        // status-2 update below is guarded. Confirm whether that is intended.
        MarkBaiduCommendStatus(keyTask, 1);
        Snapshot(keyTask);

        try
        {
            MarkBaiduCommendStatus(keyTask, 2);
        }
        catch (Exception ex)
        {
            log(DateTime.Now + "ERROR ." + ex.Message);
            Thread.Sleep(2000);
        }
    }
}

// Sets both WXStatus and BotStatus of the task's document (matched by _id) to the given value.
private void MarkBaiduCommendStatus(IW2S_SG_BaiduCommend task, int status)
{
    var update = new UpdateDocument { { "$set", new QueryDocument { { "WXStatus", status }, { "BotStatus", status } } } };
    MongoDBHelper.Instance.Get_IW2S_SG_BaiduCommend().UpdateOne(new QueryDocument { { "_id", task._id } }, update);
}
// Creates an SG_WeChatQuery from the task's Keyword concatenated with its
// CommendKeyword and runs its Query(searchTask, listKey) overload.
// The value returned by Query is not used here.
void SG_Snapshot(IW2S_SG_BaiduCommend searchTask, List<IW2S_SG_BaiduCommend> listKey)
{
    SG_WeChatQuery wc = new SG_WeChatQuery(searchTask.Keyword + searchTask.CommendKeyword);
    wc.Query(searchTask, listKey);
}
// Crawls Sogou result pages starting at `link` (at most three pages per call) and
// saves one IW2S_SG_level1link per result through SaveResult, page by page.
//
// For every result block on a page it:
//   - extracts title, href and a summary from the result markup;
//   - fetches the href to find the real target URL (a "window.location.replace"
//     script, otherwise the URL reported back by GetContent);
//   - fetches the target page and takes the first date-like string as publish time;
//   - records the link with the task's user/project/keyword ids.
// Random pauses are inserted before each request and after each page.
// An exception while handling one result is logged and that result is skipped.
//
// link : URL of the first result page; a null/empty value means nothing is crawled.
// tsk  : task whose ids and keyword are stamped onto every saved link.
void GetLinks(string link, IW2S_SG_BaiduCommend tsk)
{
    List<IW2S_SG_level1link> result = new List<IW2S_SG_level1link>();

    // Built once per call instead of once per result. Matches dates such as
    // 2017-3-9, 2017/03/09 or 2017年3月9日.
    Regex publishDatePattern = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");
    // One generator for all random delays in this call.
    Random delayRandom = new Random();

    int nohist_pages = 0;
    int quried_pages = 0;
    while (!string.IsNullOrEmpty(link) && quried_pages <= 2)
    {
        log(link);

        CookieContainer cc = new CookieContainer();
        Encoding enc = null;
        CookieCollection cookiesColl = new CookieCollection();
        CookieCollection cookieCollection = new CookieCollection();
        string Rurl = "https://www.sogou.com/";
        string cookie = "";

        // Request the Sogou home page first; only the cookies it hands back are used.
        GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;

        var html = GetContent(link, 8000, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;
        if (html == null)
        {
            break;
        }

        // Cut the result area out of the page and split it into one chunk per <h3 heading.
        var tags = html.SubAfter("<body").SubAfter("results").SubBefore("hint_container").SplitWith("<h3");
        if (tags == null || tags.Length == 0)
        {
            log("已被sogou屏蔽,请调试! " + tsk.Keyword);
            break;
        }

        bool nohit = true;
        foreach (var tag in tags)
        {
            try
            {
                if (!tag.Contains("<a"))
                {
                    continue;
                }

                string anchor = tag.SubAfter("<a").SubBefore("</a>");
                string title = RemoveInivalidChar(anchor.GetTxtFromHtml2().RemoveSpace());
                string href = anchor.GetFirstHref2();

                // Summary: a later marker overrides an earlier one; the summary_beg/summary_end
                // span is only used when neither of the first two produced text.
                string Jianjie = "";
                if (tag.Contains("简介:"))
                {
                    Jianjie = tag.SubAfter("简介:").SubBefore("</").GetTxtFromHtml2().RemoveSpace();
                }
                if (tag.Contains("cacheresult_summary"))
                {
                    Jianjie = tag.SubAfter("cacheresult_summary").SubBefore("</div>").GetTxtFromHtml2().RemoveSpace();
                }
                if (string.IsNullOrEmpty(Jianjie))
                {
                    Jianjie = tag.SubAfter("summary_beg").SubBefore("summary_end").GetTxtFromHtml2().RemoveSpace();
                }

                // Pause 8-15 s before following the result link.
                Thread.Sleep(delayRandom.Next(8000, 15000));
                var tuplehtml = GetContent(href, 8000, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);

                // Reset for every result. Previously this variable lived at page scope, so a
                // result whose redirect page came back empty silently reused the URL of the
                // previous result and the href fallback below never ran.
                string realUrl = "";
                if (!string.IsNullOrEmpty(tuplehtml))
                {
                    if (tuplehtml.Contains("window.location.replace"))
                    {
                        realUrl = tuplehtml.SubAfter("window.location.replace").SubBefore("</script>").Replace('"', ' ').Replace("(", "").Replace(")", "").RemoveSpace();
                    }
                    else
                    {
                        realUrl = Rurl;
                    }
                }

                if (string.IsNullOrEmpty(realUrl))
                {
                    // No usable target URL was found: fall back to the link from the result page.
                    realUrl = href;
                }
                string domain = GetDomain(realUrl);
                string topDomain = GetLevel1Domain(domain);

                // Turns "&amp;" into "&" by dropping the "amp;" part.
                realUrl = realUrl.Replace("amp;", "");

                // Pause 6-15 s before fetching the target page.
                Thread.Sleep(delayRandom.Next(6000, 15000));
                var htmldetail = GetContent(realUrl, 8000, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);

                // NOTE(review): if GetContent returns null here, Regex.Match throws and the
                // result is dropped by the catch below (unchanged from the original) - confirm
                // whether such results should be kept without a publish time instead.
                Match dateMatch = publishDatePattern.Match(htmldetail);
                string time = dateMatch.Success ? dateMatch.Value : "";

                IW2S_SG_level1link l1 = new IW2S_SG_level1link
                {
                    UsrId = tsk.UsrId,
                    Domain = domain,
                    TopDomain = topDomain,
                    Keywords = string.Format("{0}", tsk.Keyword),
                    LinkUrl = realUrl,
                    Html = htmldetail,
                    BizId = IDHelper.GetGuid("{0}/{1}".FormatStr(realUrl, tsk._id.ToString())),
                    SearchkeywordId = tsk._id.ToString(),
                    // UTC shifted by +8 hours.
                    CreatedAt = DateTime.UtcNow.AddHours(8),
                    Description = Jianjie,
                    Title = title,
                    ProjectId = tsk.ProjectId,
                    PublishTime = time,
                    AlternateFields = "0",
                    DataCleanStatus = 0
                };
                result.Add(l1);
                nohit = false;
                nohist_pages = 0;
            }
            catch (Exception ex)
            {
                log("有错误信息!" + ex.Message);
            }
        }

        if (nohit)
        {
            nohist_pages++;
        }

        // Stop after several consecutive pages without any result.
        // NOTE(review): at most three pages are processed per call, so this counter cannot
        // exceed 3 and the break looks unreachable; the threshold is left unchanged.
        if (nohist_pages > 3)
        {
            break;
        }

        quried_pages++;
        pages++;

        // Find the "next page" link and make it absolute when it is relative.
        link = html.SubAfter("sogou_next").SubBefore("下一页").GetLastHref2();
        if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
        {
            if (link.IsStartWith("/"))
            {
                link = link.SubAfter("/");
            }
            link = "https://www.sogou.com/web".GetContact(link);
        }

        SaveResult(result);
        result.Clear();

        // Pause 6-15 s before requesting the next page.
        Thread.Sleep(delayRandom.Next(6000, 15000));
    }
}