/// <summary>
/// Downloads <paramref name="url"/> through TaobaoWebHelper.GetContent and, when the page lacks the
/// "b_content" marker, keeps re-requesting it until the marker shows up, the "no content" notice
/// is returned, or a request yields null.
/// </summary>
/// <param name="url">Address to download.</param>
/// <param name="timeout">
/// NOTE(review): not used - every request is issued with the fixed value 8000. Left unchanged
/// because callers outside this view may depend on it; confirm before wiring the parameter through.
/// </param>
/// <param name="cc">Cookie container forwarded to every request.</param>
/// <param name="enc">Encoding forwarded by reference to every request.</param>
/// <param name="Rurl">Set by the last GetContent call.</param>
/// <param name="cookie">Cookie string forwarded to every request.</param>
/// <param name="cookieColl">Cookie collection forwarded by reference to every request.</param>
/// <param name="cookieCollection">Set by the last GetContent call.</param>
/// <returns>The last downloaded page, or null when a retry request returned null.</returns>
string get_html(string url, int timeout, CookieContainer cc, ref Encoding enc, out string Rurl, string cookie, ref CookieCollection cookieColl, out CookieCollection cookieCollection)
{
    var html = TaobaoWebHelper.GetContent(url, 8000, cc, ref enc, out Rurl, cookie, ref cookieColl, out cookieCollection);
    if (html.IsContains2("b_content"))
    {
        return html;
    }

    // NOTE(review): this helper is never used since the proxy/IP-switching calls were disabled;
    // it is still constructed because the constructor's side effects are not visible here.
    WebHelperNoCookieProxy web2 = new WebHelperNoCookieProxy();

    // NOTE(review): there is no retry cap; a page that never contains the marker loops forever.
    bool markerFound = false;
    while (!markerFound)
    {
        html = TaobaoWebHelper.GetContent(url, 8000, cc, ref enc, out Rurl, cookie, ref cookieColl, out cookieCollection);

        // A null response used to crash on html.Contains below. Hand null back instead:
        // the callers in this file already test the result for null.
        if (html == null)
        {
            return null;
        }

        markerFound = html.IsContains2("b_content");

        if (html.Contains("没有找到内容!"))
        {
            return html;
        }

        if (html.Length > 0 && html.Contains("您的访问过于频繁"))
        {
            log("您的访问过于频繁,为确认本次访问为正常用户行为,需要您协助验证。");
            // Back off for a random 1-3 seconds before the next attempt.
            int n = new Random().Next(1000, 3000);
            Thread.Sleep(n);
        }
    }

    return html;
}
/// <summary>
/// Downloads <paramref name="url"/> once through TaobaoWebHelper.GetContent and returns the page as is.
/// </summary>
/// <remarks>
/// A retry loop used to run when the page lacked the "wx-rb wx-rb2" / "wx-rbwx-rb2" markers; it is
/// disabled, so the marker test now only decides whether an (unused) proxy helper is constructed.
/// <paramref name="timeout"/> is not used; the request is issued with the fixed value 8000.
/// </remarks>
string get_Nohtml(string url, int timeout, CookieContainer cc, ref Encoding enc, out string Rurl, string cookie, ref CookieCollection cookieColl, out CookieCollection cookieCollection)
{
    var page = TaobaoWebHelper.GetContent(url, 8000, cc, ref enc, out Rurl, cookie, ref cookieColl, out cookieCollection);

    bool hasMarker = page.IsContains2("wx-rb wx-rb2", "wx-rbwx-rb2");
    if (hasMarker)
    {
        return page;
    }

    // Leftover of the disabled retry path: the helper is created but never called.
    WebHelperNoCookieProxy idleProxy = new WebHelperNoCookieProxy();
    return page;
}
/// <summary>
/// Downloads <paramref name="url"/> through TaobaoWebHelper.GetContent; when the URL is non-empty and
/// the page lacks the "profile_inner" marker, the download is repeated exactly once.
/// </summary>
/// <remarks>
/// If the repeated download carries the "visits too frequent" notice, the notice is logged and the
/// thread sleeps for a random 1-3 seconds before that same page is returned.
/// <paramref name="timeout"/> is not used; requests are issued with the fixed value 8000.
/// </remarks>
/// <returns>The page from the last request that was made.</returns>
string get_Detailehtml(string url, int timeout, CookieContainer cc, ref Encoding enc, out string Rurl, string cookie, ref CookieCollection cookieColl, out CookieCollection cookieCollection)
{
    // Constructed but never called (kept exactly as the original does).
    WebHelperNoCookieProxy idleProxyA = new WebHelperNoCookieProxy();

    var page = TaobaoWebHelper.GetContent(url, 8000, cc, ref enc, out Rurl, cookie, ref cookieColl, out cookieCollection);

    // The URL check happens after the first request, so an empty URL is still requested once.
    if (string.IsNullOrEmpty(url) || page.IsContains2("profile_inner"))
    {
        return page;
    }

    // Constructed but never called (kept exactly as the original does).
    WebHelperNoCookieProxy idleProxyB = new WebHelperNoCookieProxy();

    page = TaobaoWebHelper.GetContent(url, 8000, cc, ref enc, out Rurl, cookie, ref cookieColl, out cookieCollection);

    // The marker is evaluated again, but the outcome does not influence what is returned.
    bool retryHasMarker = page.IsContains2("profile_inner");

    bool throttled = !string.IsNullOrEmpty(page) && page.Contains("您的访问过于频繁");
    if (throttled)
    {
        log("您的访问过于频繁,为确认本次访问为正常用户行为,需要您协助验证。");
        int pauseMs = new Random().Next(1000, 3000);
        Thread.Sleep(pauseMs);
    }

    return page;
}
/// <summary>
/// Crawls Bing result pages starting at <paramref name="link"/>, builds one IW2S_Bing_level1link per
/// result entry (downloading each entry's target page as well) and passes every page's entries to SaveResult.
/// </summary>
/// <param name="link">URL of the first result page; nothing is fetched when it is null or empty.</param>
/// <param name="searchTsk">Search task supplying Keyword, CommendKeyword, UsrId, ProjectId and _id.</param>
/// <returns>
/// The working list. It is cleared after every saved page and every exit path leaves it empty,
/// so collected links reach the outside only through SaveResult.
/// </returns>
public List<IW2S_Bing_level1link> GetLinks(string link, IW2S_Bing_BaiduCommend searchTsk)
{
    List<IW2S_Bing_level1link> result = new List<IW2S_Bing_level1link>();
    int nohist_pages = 0;
    int quried_pages = 0;
    int fanye = 0;

    // The counter starts at 0 and the bound is inclusive, so up to 11 pages are requested.
    while (!string.IsNullOrEmpty(link) && quried_pages <= 10)
    {
        log(link);
        CookieContainer cc = new CookieContainer();
        Encoding enc = null;
        CookieCollection cookiesColl = new CookieCollection();
        CookieCollection cookieCollection = new CookieCollection();
        string Rurl = "http://cn.bing.com/";
        string cookie = "";

        // Request the home page first; its body is discarded, only the returned cookies are carried on.
        TaobaoWebHelper.GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;
        Thread.Sleep(new Random().Next(2000, 5000));

        var html = get_html(link, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;
        if (html == null)
        {
            break;
        }
        if (html.Contains("没有找到搜索内容!"))
        {
            break;
        }

        // Locate the result list: prefer the part between "body" and "/body", fall back to the whole page.
        bool usedWholeDocument = false;
        var tags = html.SubAfter("body").SubBefore("/body").SplitWith("b_content");
        if (tags == null || tags.Length == 0)
        {
            tags = html.SplitWith("b_content");
            usedWholeDocument = true;
        }
        if (tags == null || tags.Length == 0)
        {
            // Previously tags[tags.Length - 1] was indexed without this check.
            log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
            break;
        }

        var tagsD = tags[tags.Length - 1].SubAfter("搜索结果").SubBefore("</ol>").ToString().SplitWith("</li>");
        if ((tagsD == null || tagsD.Length <= 1) && !usedWholeDocument)
        {
            // The fallback split used to be stored in "tags" without recomputing the entries,
            // so it never had any effect. Re-derive the entries from the whole page instead.
            var wholeDocTags = html.SplitWith("b_content");
            if (wholeDocTags != null && wholeDocTags.Length > 0)
            {
                var retried = wholeDocTags[wholeDocTags.Length - 1].SubAfter("搜索结果").SubBefore("</ol>").ToString().SplitWith("</li>");
                if (retried != null && retried.Length > 0)
                {
                    tagsD = retried;
                }
            }
        }
        if (tagsD == null || tagsD.Length == 0)
        {
            log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
            break;
        }

        bool nohit = true;
        foreach (var tag in tagsD)
        {
            if (!tag.Contains("h2"))
            {
                continue;
            }

            var a = tag.SubAfter("h2").SubAfter("a");
            string title = RemoveInivalidChar(a.RemoveSpace().GetLower().SubBefore("</h2>").GetTxtFromHtml2().RemoveSpace().GetLower());
            string href = a.GetFirstHref2();
            if (string.IsNullOrEmpty(title) && string.IsNullOrEmpty(href))
            {
                continue;
            }

            // The check above treats href as possibly null; an entry with a title but a null href
            // used to crash on Replace.
            href = (href ?? "").Replace("amp;", "");
            string abs = RemoveInivalidChar(tag.SubAfter("<p>").SubBefore("</p").GetTxtFromHtml2().RemoveSpace().GetLower());
            string domain = GetDomain(href);

            // Entries without any title/abstract text are skipped.
            string txt = "{0},{1}".FormatStr(title, abs);
            if (string.IsNullOrEmpty(txt))
            {
                continue;
            }

            // Long random pause (8-20 s) before every detail-page request.
            Thread.Sleep(new Random().Next(8000, 20000));
            var htmldetail = "";
            try
            {
                htmldetail = get_Detailehtml(href, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
            }
            catch (Exception ex)
            {
                // Was swallowed silently. The link is still stored, prefixed with the Bing host.
                log("detail fetch failed for " + href + ": " + ex.Message);
                href = "http://cn.bing.com" + href;
            }

            bool is_title_matched = title.GetLower().IsContains2(searchTsk.Keyword.ToLower(), searchTsk.CommendKeyword.ToLower());
            bool is_abstr_matched = abs.GetLower().IsContains2(searchTsk.Keyword.GetLower(), searchTsk.CommendKeyword.GetLower());
            BaiduItemPart part = is_title_matched && is_abstr_matched
                ? BaiduItemPart.TitleAbstract
                : is_title_matched
                    ? BaiduItemPart.Title
                    : is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;

            bool is_itm_title_matched = txt.GetLower().IsContains(searchTsk.Keyword.GetLower());
            bool is_bus_matched = txt.GetLower().IsContains2(searchTsk.CommendKeyword.GetLower());

            // 90 when both keywords are found, 80 for the commend keyword only, 50 for the keyword only, else 0.
            int score = 0;
            if (is_bus_matched && is_itm_title_matched)
            {
                score = 80 + 10;
            }
            else if (is_bus_matched)
            {
                score = 80;
            }
            else if (is_itm_title_matched)
            {
                score = 50;
            }

            IW2S_Bing_level1link l1 = new IW2S_Bing_level1link
            {
                UsrId = searchTsk.UsrId,
                Domain = domain,
                TopDomain = GetLevel1Domain(domain),
                Keywords = string.Format("{0} + {1}", searchTsk.Keyword, searchTsk.CommendKeyword),
                LinkUrl = href,
                MatchAt = (byte)part,
                Html = htmldetail,
                MatchType = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                AppType = 0,
                BizId = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(href, searchTsk.UsrId, searchTsk.Keyword)),
                SearchkeywordId = searchTsk._id.ToString(),
                CreatedAt = DateTime.UtcNow.AddHours(8),
                Description = abs,
                Title = title,
                Score = score,
                Abstract = abs,
                ProjectId = searchTsk.ProjectId
            };

            result.Add(l1);
            nohit = false;
            nohist_pages = 0;
        }

        if (nohit)
        {
            nohist_pages++;
        }
        // Stop once more than 3 consecutive pages produced no entry.
        if (nohist_pages > 3)
        {
            break;
        }

        quried_pages++;
        pages++;

        // The next page is addressed by result offset. The "next" href that used to be parsed
        // from the page was overwritten immediately, so that dead extraction is removed.
        // NOTE(review): the keyword is inserted into the query string unescaped - confirm whether
        // the download helper encodes it.
        fanye = fanye + 10;
        link = "http://cn.bing.com/search?q={0}&first={1}&FORM=PERE3".FormatStr(searchTsk.Keyword, fanye);

        SaveResult(result);
        result.Clear();
        Thread.Sleep(new Random().Next(8000, 15000));
    }

    return result;
}
/// <summary>
/// Walks Taobao search result pages for <c>tsk.TaskName</c>, turns every item record found in the
/// page data into an XListing, appends it to <c>result.Listings</c> and saves each page's listings
/// through SaveResult.
/// </summary>
/// <param name="result">Accumulator whose Listings collection receives every parsed listing.</param>
/// <param name="tsk">Task supplying TaskName, UId, UsrId, ProjectId and _id.</param>
/// <param name="recordId">Forwarded unchanged to SaveResult.</param>
/// <returns>The same <paramref name="result"/> instance that was passed in.</returns>
SnapSearchResult GetLinks(SnapSearchResult result, FreeTask tsk, string recordId)
{
    const int ItemsPerPage = 44;

    List<XListing> xListings = new List<XListing>();
    string link = "https://s.taobao.com/search?q={0}&ie=utf8&sort=default".FormatStr(tsk.TaskName);
    int nohist_pages = 0;
    int quried_pages = 1;
    int Position = 1;
    // Constant site id; parsed once instead of once per item.
    Guid siteId = Guid.Parse("A00A672B-DD05-65FB-4EE0-CFA26EBF2ED5");
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    // At most 20 pages are requested.
    while (!string.IsNullOrEmpty(link) && quried_pages <= 20)
    {
        log(link);
        var html = TaobaoWebHelper.GetSnapshotHtml(link);
        if (html == null)
        {
            return result;
        }

        try
        {
            var tagslist = html.SubAfter("itemlist").SubBefore("recommendAuctions");
            var tags = tagslist.SubAfter("p4pTags").SplitWith("p4pTags");
            if (tags == null || tags.Length == 0)
            {
                log("BLOCKED " + tsk);
                break;
            }

            bool nohit = true;
            foreach (var tag in tags)
            {
                try
                {
                    // Blank check first: it used to come after tag.Contains and could never trigger.
                    // The first record without "raw_title" ends processing of this page.
                    if (tag == null || tag.Trim().Length == 0 || !tag.Contains("raw_title"))
                    {
                        Console.WriteLine(DateTime.Now);
                        break;
                    }

                    string title = RemoveChar(tag.SubAfter("raw_title").SubBefore("pic_url").GetLower());
                    string nid = RemoveChar(tag.SubAfter("nid").SubBefore("category"));
                    string pic_url = RemoveChar(tag.SubAfter("pic_url").SubBefore("detail_url"));
                    string detail_url = "https:" + RemoveChar(tag.SubAfter("detail_url").SubBefore("view_price"));
                    detail_url = Regex.Unescape(detail_url);

                    // Numbers are parsed culture-invariantly and tolerantly. A value that is not a
                    // plain number used to throw, and the catch below then dropped the rest of the page.
                    var view_price = RemoveChar(tag.SubAfter("view_price").SubBefore("view_fee"));
                    double price = 0;
                    if (!string.IsNullOrEmpty(view_price))
                    {
                        double.TryParse(view_price, System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands, invariant, out price);
                    }

                    string item_loc = RemoveChar(tag.SubAfter("item_loc").SubBefore("reserve_price"));

                    string view_sales = RemoveChar(tag.SubAfter("view_sales").SubBefore("comment_count")).Replace("人收货", "").Replace("人付款", "");
                    int days30 = 0;
                    if (!string.IsNullOrEmpty(view_sales))
                    {
                        int.TryParse(view_sales, System.Globalization.NumberStyles.Integer, invariant, out days30);
                    }

                    string comment_count = RemoveChar(tag.SubAfter("comment_count").SubBefore("user_id"));
                    int commentcount = 0;
                    if (!string.IsNullOrEmpty(comment_count))
                    {
                        int.TryParse(comment_count, System.Globalization.NumberStyles.Integer, invariant, out commentcount);
                    }

                    // Shop fields.
                    string user_id = RemoveChar(tag.SubAfter("user_id").SubBefore("nick"));
                    string nick = RemoveChar(tag.SubAfter("nick").SubBefore("shopcard"));
                    string isTmall = RemoveChar(tag.SubAfter("isTmall").SubBefore("delivery"));
                    string shopLink = "https:" + RemoveChar(tag.SubAfter("shopLink").SubBefore("}"));
                    shopLink = Regex.Unescape(shopLink);

                    XListing listing = new XListing
                    {
                        ShopContactUrl = shopLink,
                        ItemDetailUrl = detail_url,
                        ItemPrice = price,
                        ItemName = title,
                        ItemID = nid,
                        ItemLocation = item_loc,
                        ItemSold30Days = days30,
                        Itempic = pic_url,
                        ItemTotalCommentCount = commentcount,
                        UId = tsk.UId,
                        ShopID = user_id,
                        ShopName = nick,
                        ShopLocation = item_loc,
                        ShopIsTmall = isTmall == "true",
                        taskid = tsk._id,
                        taskName = tsk.TaskName,
                        SiteName = "taobao",
                        SiteID = siteId,
                        usrid = tsk.UsrId,
                        ShopIsAuthorized = false,
                        Position = Position,
                        PageNum = quried_pages,
                        ProjectId = tsk.ProjectId
                    };
                    if (listing.ItemDetailUrl != null && listing.ItemName != null)
                    {
                        listing.ItemBotStatus = BotStatus.Ok;
                    }

                    result.Listings.Add(listing);
                    xListings.Add(listing);
                    nohit = false;
                    nohist_pages = 0;
                    Position++;
                }
                catch (Exception we)
                {
                    Console.WriteLine(DateTime.Now + "错误:" + we.Message);
                    break;
                }
            }

            if (nohit)
            {
                nohist_pages++;
            }
            // Stop once more than 3 consecutive pages produced no listing.
            if (nohist_pages > 3)
            {
                break;
            }

            quried_pages++;
            pages++;
            NextPage = NextPage + ItemsPerPage;

            // Both assignments that advanced "link" were commented out, so every iteration fetched
            // the same first page again. The offset is derived from the local page counter so it
            // does not depend on the NextPage field's starting value.
            // NOTE(review): confirm GetSnapshotHtml does not page on its own.
            link = "https://s.taobao.com/search?q={0}&ie=utf8&sort=default&s={1}".FormatStr(tsk.TaskName, (quried_pages - 1) * ItemsPerPage);

            Console.WriteLine(DateTime.Now + "任务名:" + tsk.TaskName + ";开始搜索第" + quried_pages + "页");
            SaveResult(xListings, BotTypes.ItemSnapshot, recordId, tsk);
            xListings.Clear();
            Thread.Sleep(new Random().Next(3000, 6000));
        }
        catch (Exception ex)
        {
            Console.WriteLine(DateTime.Now + "错误:" + ex.Message);
            break;
        }
    }

    return result;
}
/// <summary>
/// Crawls Sogou WeChat article result pages starting at <paramref name="link"/>. For every article entry
/// it downloads the article page, a "&amp;f=json" variant of its final URL and the publisher page, builds an
/// IW2S_WX_level1link and passes each page's entries to SaveResult.
/// </summary>
/// <param name="link">URL of the first result page; nothing is fetched when it is null or empty.</param>
/// <param name="searchTsk">Search task supplying Keyword, CommendKeyword, UsrId, ProjectId and _id.</param>
/// <returns>
/// The working list. It is cleared after every saved page and every exit path leaves it empty,
/// so collected links reach the outside only through SaveResult.
/// </returns>
public List<IW2S_WX_level1link> GetLinks(string link, IW2S_WX_BaiduCommend searchTsk)
{
    List<IW2S_WX_level1link> result = new List<IW2S_WX_level1link>();
    int nohist_pages = 0;
    int quried_pages = 0;

    // First date-like token (2000-2099, "-", "/" or 年月日 separators); built once, not per article.
    Regex publishDateRegex = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");

    // The counter starts at 0 and the bound is inclusive, so up to 3 pages are requested.
    while (!string.IsNullOrEmpty(link) && quried_pages <= 2)
    {
        log(link);
        CookieContainer cc = new CookieContainer();
        Encoding enc = null;
        CookieCollection cookiesColl = new CookieCollection();
        CookieCollection cookieCollection = new CookieCollection();
        string Rurl = "http://weixin.sogou.com/";
        string cookie = "";

        // Request the home page first; its body is discarded, only the returned cookies are carried on.
        TaobaoWebHelper.GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;
        Thread.Sleep(new Random().Next(5000, 8000));

        var html = get_html(link, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;
        if (html == null)
        {
            break;
        }
        if (html.Contains("没有找到相关的微信公众号文章"))
        {
            break;
        }

        // Entries are delimited by the result class name; a second spelling without the space is tried too.
        var tags = html.SplitWith("wx-rb wx-rb3");
        if (tags == null || tags.Length == 0 || tags.Length == 1)
        {
            tags = html.SplitWith("wx-rbwx-rb3");
        }
        if (tags == null || tags.Length == 0)
        {
            log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
            break;
        }

        bool nohit = true;
        foreach (var tag in tags)
        {
            if (!tag.Contains("txt-box"))
            {
                continue;
            }

            string title = RemoveInivalidChar(tag.SubAfter("<h4").SubBefore("</h4>").GetTxtFromHtml2().RemoveSpace());
            string href = tag.SubAfter("<h4").SubBefore("</a>").GetFirstHref2();
            string abs = RemoveInivalidChar(tag.SubAfter("<h4>").SubBefore("\"s-p\"").SubBefore("<script>").GetTxtFromHtml2().RemoveSpace());
            string domain = tag.SubLastStringAfter("\"s-p\"").SubBefore("</a").GetTxtFromHtml2().SubAfter("(").SubAfter("(").SubBefore(",").Replace('"', ' ').Trim();
            string SourceLink = tag.SubLastStringAfter("\"s-p\"").SubBefore("</a").GetFirstHref2();
            string TitleImg = tag.SubAfter("img_box2").SubBefore("</a").SubAfter("src=").Replace(">", "").Replace('"', ' ').RemoveSpace();

            // Entries without any title/abstract text are skipped.
            string txt = "{0},{1}".FormatStr(title, abs);
            if (string.IsNullOrEmpty(txt))
            {
                continue;
            }

            if (href.IsStartWith("/websearch"))
            {
                href = "http://weixin.sogou.com" + href.Replace("amp;", "");
            }
            href = href.Replace("amp;", "");

            // Long random pause (8-20 s) before the article request.
            Thread.Sleep(new Random().Next(8000, 20000));
            var htmldetail = get_Detailehtml(href, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);

            // Regex.Match rejects null input, and the download helper can hand back null.
            Match m = publishDateRegex.Match(htmldetail ?? "");
            string time = "";
            if (m.Groups.Count > 0)
            {
                time = m.Groups[0].Value;
            }

            // Continue with the URL reported by the download (Rurl) and ask for its JSON variant.
            href = Rurl;
            var hrefNew = href + "&f=json";
            var htmldetailNewUrl = get_Detailehtml(hrefNew, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
            try
            {
                var uuurl = htmldetailNewUrl.SubAfter("\"link\":").SubBefore(",\"source_url\":").Replace('"', ' ').Replace("\\", "").RemoveSpace();
                // Keep the previous URL when the JSON variant yields nothing usable.
                if (!string.IsNullOrEmpty(uuurl))
                {
                    href = uuurl;
                }
            }
            catch (Exception ex)
            {
                // Was swallowed silently; the previously determined href stays in place.
                log("link extraction failed for " + hrefNew + ": " + ex.Message);
            }

            bool is_title_matched = title.GetLower().IsContains2(searchTsk.Keyword.ToLower(), searchTsk.CommendKeyword.ToLower());
            bool is_abstr_matched = abs.GetLower().IsContains2(searchTsk.Keyword.GetLower(), searchTsk.CommendKeyword.GetLower());
            BaiduItemPart part = is_title_matched && is_abstr_matched
                ? BaiduItemPart.TitleAbstract
                : is_title_matched
                    ? BaiduItemPart.Title
                    : is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;

            bool is_itm_title_matched = txt.GetLower().IsContains(searchTsk.Keyword.GetLower());
            bool is_bus_matched = txt.GetLower().IsContains2(searchTsk.CommendKeyword.GetLower());

            var no = "";
            var qrcode = "";
            var function = "";
            var NoIcon = "";
            var QrcodeIcon = "";

            SourceLink = SourceLink.Replace("amp;", "");
            Thread.Sleep(new Random().Next(8000, 15000));
            var htmlNo = get_Nohtml(SourceLink, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
            if (!string.IsNullOrEmpty(htmlNo) && htmlNo.Contains("em_weixinhao"))
            {
                no = htmlNo.SubAfter("em_weixinhao").SubBefore("/label").GetTxtFromHtml2().RemoveSpace();
                qrcode = htmlNo.SubAfter("v-box").SubBefore("<em").SubAfter("src=").Replace(">", "").Replace('"', ' ').RemoveSpace();
                function = htmlNo.SubAfter("功能介绍:</").SubBefore("/span").GetTxtFromHtml2().RemoveSpace();
                // From here on SourceLink holds the text that follows the "微信认证:" label, not a URL.
                SourceLink = htmlNo.SubAfter("微信认证:").SubBefore("/div").GetTxtFromHtml2().RemoveSpace();
                NoIcon = htmlNo.SubAfter("img-box").SubBefore("</a").SubAfter("src=").SubBefore("onload").Replace(">", "").Replace('"', ' ').RemoveSpace();
                QrcodeIcon = htmlNo.SubAfter("img-box").SubBefore("</a").SubAfter("err:").SubBefore(">").Replace(">", "").Replace('"', ' ').Replace("'", "").RemoveSpace();
            }

            IW2S_WX_level1link l1 = new IW2S_WX_level1link
            {
                BizId = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(title, domain, searchTsk.UsrId)),
                Description = abs,
                Domain = domain,
                UsrId = searchTsk.UsrId,
                LinkUrl = href,
                MatchAt = (byte)part,
                Title = title,
                CreatedAt = DateTime.Now,
                DataCleanStatus = 0,
                Function = function,
                SearchkeywordId = searchTsk._id.ToString(),
                Keywords = searchTsk.Keyword,
                PublicNo = no,
                QrCode = qrcode,
                SourceLink = SourceLink,
                TagType = 0,
                ImgIcon = NoIcon,
                QrCodeIcon = QrcodeIcon,
                ProjectId = searchTsk.ProjectId,
                TitleImg = TitleImg,
                PublishTime = time,
                Html = htmldetail
            };

            // 90 when both keywords are found, 80 for the commend keyword only, 50 for the keyword only;
            // otherwise Score keeps its default.
            if (is_bus_matched && is_itm_title_matched)
            {
                l1.Score = 80 + 10;
            }
            else if (is_bus_matched)
            {
                l1.Score = 80;
            }
            else if (is_itm_title_matched)
            {
                l1.Score = 50;
            }

            result.Add(l1);
            nohit = false;
            nohist_pages = 0;
        }

        if (nohit)
        {
            nohist_pages++;
        }
        // Stop once more than 3 consecutive pages produced no entry.
        if (nohist_pages > 3)
        {
            break;
        }

        quried_pages++;
        pages++;

        // Follow the "next page" anchor; a relative href is resolved against the Sogou WeChat search path.
        link = html.SubAfter("sogou_next").SubBefore("下一页").GetLastHref2();
        if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
        {
            if (link.IsStartWith("/"))
            {
                link = link.SubAfter("/");
            }
            link = "http://weixin.sogou.com/weixin".GetContact(link);
        }

        SaveResult(result);
        result.Clear();
        Thread.Sleep(new Random().Next(8000, 15000));
    }

    return result;
}