/// <summary>
/// Runs one Google search for the given task by building a <c>GoogleQuery</c> from the
/// task's keyword followed directly by its recommended keyword and calling its
/// <c>Query</c> method.
/// </summary>
/// <param name="searchTask">The task whose Keyword and CommendKeyword drive the search.</param>
/// <remarks>
/// The list returned by <c>Query</c> is intentionally ignored here; nothing in this method
/// consumes it. Persisting the key record (<c>SaveKeyRecord</c>) is currently disabled.
/// </remarks>
void Snapshot(Dnl_Google_BaiduCommend searchTask)
{
    // The two keywords are concatenated without a separator, exactly as before.
    GoogleQuery wc = new GoogleQuery(searchTask.Keyword + searchTask.CommendKeyword);
    wc.Query(searchTask);
}
/// <summary>
/// Endless worker loop. Each iteration asks <c>BotTaskService</c> for the next task; when
/// there is none (or it has no CommendKeyword) it logs, waits one second and retries.
/// Otherwise it sets WXStatus/BotStatus to 1, runs <see cref="Snapshot"/>, and then sets
/// WXStatus/BotStatus to 2.
/// </summary>
/// <remarks>
/// Any exception raised while processing a task (status update or search) is logged and
/// followed by a two second pause, after which the loop continues with the next task.
/// Previously only the final status update was protected, so a single failed download
/// terminated the loop. A task that fails stays at status 1.
/// NOTE(review): confirm whether failed tasks should be reset so they are picked up again.
/// </remarks>
public void Run()
{
    while (true)
    {
        BotTaskService bt = new BotTaskService();
        Dnl_Google_BaiduCommend keyTask = bt.GetBotTask();

        if (keyTask == null || string.IsNullOrEmpty(keyTask.CommendKeyword))
        {
            log("No search task ! start search Detail !!!");
            Thread.Sleep(1000);
            continue;
        }

        try
        {
            SetTaskStatus(keyTask, 1);
            Snapshot(keyTask);
            SetTaskStatus(keyTask, 2);
        }
        catch (Exception ex)
        {
            log(DateTime.Now + "ERROR ." + ex.Message);
            Thread.Sleep(2000);
        }
    }
}

/// <summary>
/// Writes the same value into both the WXStatus and BotStatus fields of the task's
/// document, matched by its <c>_id</c>.
/// </summary>
/// <param name="task">The task whose document is updated.</param>
/// <param name="status">The value stored in WXStatus and BotStatus.</param>
private void SetTaskStatus(Dnl_Google_BaiduCommend task, int status)
{
    var update = new UpdateDocument
    {
        { "$set", new QueryDocument { { "WXStatus", status }, { "BotStatus", status } } }
    };
    MongoDBHelper.Instance.Get_Dnl_Google_BaiduCommend()
        .UpdateOne(new QueryDocument { { "_id", task._id } }, update);
}
/// <summary>
/// Builds the start URL for the task's keyword pair via <c>get_url</c> and crawls it with
/// <see cref="GetLinks"/>.
/// </summary>
/// <param name="searchTsk">The task supplying Keyword and CommendKeyword.</param>
/// <returns>
/// <c>null</c> when no start URL could be built; otherwise a new list holding whatever
/// <see cref="GetLinks"/> returned (possibly empty).
/// </returns>
public List<Dnl_Google_level1link> Query(Dnl_Google_BaiduCommend searchTsk)
{
    var startUrl = get_url(searchTsk.Keyword, searchTsk.CommendKeyword);
    if (string.IsNullOrEmpty(startUrl))
    {
        return null;
    }

    var collected = new List<Dnl_Google_level1link>();
    var crawled = GetLinks(startUrl, searchTsk);
    if (crawled == null || crawled.Count == 0)
    {
        return collected;
    }

    collected.AddRange(crawled);
    return collected;
}
/// <summary>
/// Crawls Google result pages starting at <paramref name="link"/>. For every result entry
/// it extracts title, link and abstract, downloads the linked page, looks for a
/// publication date in it, classifies how the task keywords match, and hands the
/// collected entries of each page to <c>SaveResult</c>.
/// </summary>
/// <param name="link">URL of the first result page; crawling stops when it is null or empty.</param>
/// <param name="searchTsk">The task supplying the keywords and ids stored on each entry.</param>
/// <returns>
/// The working list. Because each page's entries are passed to <c>SaveResult</c> and the
/// list is then cleared, the returned list is empty on every exit path.
/// NOTE(review): confirm callers do not expect the crawled entries here.
/// </returns>
/// <remarks>
/// Sleeps 8-20 s before each detail download and 8-15 s between result pages.
/// An exception while downloading a result page propagates to the caller; a failed
/// detail download is logged and the entry is kept with empty Html.
/// </remarks>
public List<Dnl_Google_level1link> GetLinks(string link, Dnl_Google_BaiduCommend searchTsk)
{
    List<Dnl_Google_level1link> result = new List<Dnl_Google_level1link>();
    Random rng = new Random();
    // Matches 20YY-M-D, 20YY/M/D and the Chinese 20YY年M月D日 form.
    Regex publishDatePattern = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");
    int nohist_pages = 0;
    int quried_pages = 0;

    // Original note: search at most 10 pages.
    // NOTE(review): with "<= 10" and a counter starting at 0 this admits 11 pages; kept as is.
    while (!string.IsNullOrEmpty(link) && quried_pages <= 10)
    {
        log(link);

        string html = DownloadAsSimplifiedChinese(link);
        if (html == null)
        {
            break;
        }

        // Marker text the page shows when the search produced nothing.
        if (html.Contains("没有找到搜索内容!"))
        {
            break;
        }

        var tags = html.SubAfter("<body").SubAfter("center_col").SubBefore("id=\"foot\"");
        var tagsD = tags.SplitWith("class=\"g\"");
        if (tagsD == null || tagsD.Length == 0)
        {
            log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
            break;
        }

        bool nohit = true;
        foreach (var tag in tagsD)
        {
            // Only fragments that contain a heading are treated as result entries.
            if (!tag.Contains("h3"))
            {
                continue;
            }

            var a = tag.SubAfter("h3").SubAfter("a");
            string title = RemoveInivalidChar(a.RemoveSpace().GetLower().SubBefore("</h3>").GetTxtFromHtml2().RemoveSpace().GetLower());

            string href = a.GetFirstHref2();
            if (href.Contains("/url?q="))
            {
                href = href.Replace("/url?q=", "");
            }
            if (!href.Contains("http"))
            {
                href = "https://www.google.com" + href;
            }
            if (string.IsNullOrEmpty(title) && string.IsNullOrEmpty(href))
            {
                continue;
            }
            href = href.Replace("amp;", "");

            string abs = RemoveInivalidChar(tag.SubAfter("class=\"st\"").SubBefore("</span").GetTxtFromHtml2().RemoveSpace().GetLower());
            string domain = GetDomain(href);

            // Original note: filter out entries without the information to protect.
            string txt = "{0},{1}".FormatStr(title, abs);
            if (string.IsNullOrEmpty(txt))
            {
                continue;
            }

            // Random pause before fetching the linked page.
            Thread.Sleep(rng.Next(8000, 20000));

            string htmldetail = "";
            try
            {
                htmldetail = DownloadAsSimplifiedChinese(href);
            }
            catch (Exception ex)
            {
                // Best effort: the entry is still recorded, just without page content.
                log("Detail download failed " + href + " " + ex.Message);
            }

            Match dateMatch = publishDatePattern.Match(htmldetail);
            string timesp = dateMatch.Success ? dateMatch.Value : "";

            bool is_title_matched = title.GetLower().IsContains2(searchTsk.Keyword.ToLower(), searchTsk.CommendKeyword.ToLower());
            bool is_abstr_matched = abs.GetLower().IsContains2(searchTsk.Keyword.GetLower(), searchTsk.CommendKeyword.GetLower());
            BaiduItemPart part = is_title_matched && is_abstr_matched
                ? BaiduItemPart.TitleAbstract
                : is_title_matched
                    ? BaiduItemPart.Title
                    : is_abstr_matched
                        ? BaiduItemPart.Abstract
                        : BaiduItemPart.None;

            bool is_itm_title_matched = txt.GetLower().IsContains(searchTsk.Keyword.GetLower());
            bool is_bus_matched = txt.GetLower().IsContains2(searchTsk.CommendKeyword.GetLower());

            // 90 when both keywords match, 80 for CommendKeyword only, 50 for Keyword only, else 0.
            int score = 0;
            if (is_bus_matched && is_itm_title_matched)
            {
                score = 80 + 10;
            }
            else if (is_bus_matched)
            {
                score = 80;
            }
            else if (is_itm_title_matched)
            {
                score = 50;
            }

            Dnl_Google_level1link l1 = new Dnl_Google_level1link
            {
                UsrId = searchTsk.UsrId,
                Domain = domain.Replace("http://", "").Replace("https://", ""),
                TopDomain = GetLevel1Domain(domain),
                Keywords = string.Format("{0} + {1}", searchTsk.Keyword, searchTsk.CommendKeyword),
                LinkUrl = href,
                MatchAt = (byte)part,
                Html = htmldetail,
                // Bit 0: CommendKeyword matched; bit 1: Keyword matched.
                MatchType = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                AppType = (byte)0,
                BizId = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(href, searchTsk.UsrId, searchTsk.Keyword)),
                SearchkeywordId = searchTsk._id.ToString(),
                CreatedAt = DateTime.UtcNow.AddHours(8),
                Description = abs,
                Title = title,
                Score = score,
                Abstract = abs,
                ProjectId = searchTsk.ProjectId,
                PublishTime = timesp
            };

            result.Add(l1);
            nohit = false;
            nohist_pages = 0;
        }

        if (nohit)
        {
            nohist_pages++;
        }

        // Original note: stop after 3 consecutive pages without results.
        // NOTE(review): "> 3" actually stops on the 4th consecutive empty page; kept as is.
        if (nohist_pages > 3)
        {
            break;
        }

        quried_pages++;
        pages++;

        // Original note: "sougou - needs rewriting" (next-page extraction).
        link = html.SubAfter("id=\"foot\"").SubAfter("text-align:left").SubBefore("下一页").GetLastHref2();
        if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
        {
            if (link.IsStartWith("/"))
            {
                link = link.SubAfter("/");
            }
            link = "https://www.google.com/".GetContact(link);
            link = link.Replace("amp;", "");
        }

        // Hand this page's entries over, then reuse the list for the next page.
        SaveResult(result);
        result.Clear();

        Thread.Sleep(rng.Next(8000, 15000));
    }

    return result;
}

/// <summary>
/// Downloads <paramref name="url"/> with default credentials, decodes the bytes as Big5
/// and converts the text to Simplified Chinese. The WebClient is disposed before returning.
/// </summary>
/// <param name="url">Address to download.</param>
/// <returns>The converted page text.</returns>
private static string DownloadAsSimplifiedChinese(string url)
{
    using (WebClient client = new WebClient())
    {
        client.Credentials = CredentialCache.DefaultCredentials;
        byte[] raw = client.DownloadData(url);
        string text = Encoding.GetEncoding("Big5").GetString(raw);
        return Strings.StrConv(text, VbStrConv.SimplifiedChinese, 0);
    }
}