/// <summary>
/// Returns the identifier of this bot process, creating and caching it on first use.
/// </summary>
/// <remarks>
/// The identifier is derived from the host name, the current process id and the value returned
/// by <c>GetTimestamp()</c>, concatenated in that order. Once <c>_botId</c> is non-empty it is
/// returned unchanged.
/// </remarks>
/// <returns>The cached bot identifier.</returns>
public static Guid GenerateBotId()
{
    if (_botId == Guid.Empty)
    {
        // Operands are evaluated left to right: host name, process id, timestamp.
        var seed = Dns.GetHostName() + Process.GetCurrentProcess().Id + GetTimestamp();
        _botId = IDHelper.GetGuid(seed);
    }

    return _botId;
}
/// <summary>
/// Builds one link from <paramref name="searchTsk"/> to every entry of
/// <paramref name="taskList"/>, saves the links with <c>WX_SaveResult</c> and returns them.
/// </summary>
/// <param name="searchTsk">Task whose recommended keyword is the source of every link.</param>
/// <param name="taskList">Tasks that become the link targets, addressed by list position.</param>
/// <returns>
/// One <c>WX_links</c> per entry of <paramref name="taskList"/>. Its <c>value</c> is the number of
/// project link titles that contain both the source and the target recommended keyword.
/// </returns>
public List<WX_links> Query(IW2S_WX_BaiduCommend searchTsk, List<IW2S_WX_BaiduCommend> taskList)
{
    var links = new List<WX_links>();
    var taskService = new WX_BotTaskService();
    List<IW2S_WX_level1link> titledLinks = taskService.GetLinkTitleList(searchTsk.ProjectId);

    // Position of the search task inside taskList: the last matching entry wins, 0 when none matches.
    int sourceIndex = 0;
    for (int pos = 0; pos < taskList.Count; pos++)
    {
        if (searchTsk.CommendKeyword == taskList[pos].CommendKeyword)
        {
            sourceIndex = pos;
        }
    }

    for (int targetIndex = 0; targetIndex < taskList.Count; targetIndex++)
    {
        var edge = new WX_links
        {
            source = sourceIndex,
            target = targetIndex,
            KeywordId = searchTsk._id,
            ProjectId = searchTsk.ProjectId
        };

        // Count the titles that mention both keywords; links without a title are ignored.
        int coOccurrences = 0;
        foreach (var titled in titledLinks)
        {
            if (string.IsNullOrEmpty(titled.Title))
            {
                continue;
            }

            if (titled.Title.Contains(searchTsk.CommendKeyword) && titled.Title.Contains(taskList[targetIndex].CommendKeyword))
            {
                coOccurrences = coOccurrences + 1;
            }
        }

        edge.value = coOccurrences;
        edge.Gid = IDHelper.GetGuid("{0}/{1}/{2}/{3}".FormatStr(edge.source, edge.target, edge.KeywordId, edge.ProjectId));
        links.Add(edge);
    }

    WX_SaveResult(links);
    return links;
}
/// <summary>
/// Worker loop: repeatedly takes a pending task from <c>get_search_to_qry</c>, runs
/// <c>query</c> for it, then records the end time and the number of matching level-1 links.
/// The loop never exits.
/// </summary>
/// <remarks>
/// Timestamps are written as UTC shifted by eight hours. Exceptions raised while running the
/// query are printed to the console (including inner exceptions) and do not stop the loop.
/// </remarks>
public void Run()
{
    while (true)
    {
        Random r = new Random();
        var p = get_search_to_qry();
        if (p == null)
        {
            // Nothing to do: flag the bot as ready and retry after a random 30-100 s pause.
            SetReady();
            Thread.Sleep(r.Next(30000, 100000));
            continue;
        }

        try
        {
            SetBusy();
            var botId = IWSBot.Utility.Utility.GenerateBotId().ToString().Replace("-", "");
            var pro = Process.GetCurrentProcess();
            string processName = IDHelper.GetGuid(pro.MainModule.FileName).ToString();

            // An interval of 0 falls back to one week (7 * 24 hours).
            int botInterval = p.BotIntervalHours == 0 ? 7 * 24 : p.BotIntervalHours;
            var update = new UpdateDocument
            {
                {
                    "$set", new QueryDocument
                    {
                        { "BotStatus", 1 },
                        { "NextBotStartAt", DateTime.UtcNow.AddHours((double)botInterval + 8) },
                        { "BotTag", string.Format("{0}#", processName) },
                        { "BotId", botId }
                    }
                }
            };
            MongoDBHelper.Instance.GetIW2S_WB_BaiduCommends().UpdateOne(new QueryDocument { { "_id", p._id } }, update);
            query(p);
        }
        catch (Exception ex)
        {
            while (ex != null)
            {
                Console.WriteLine("baidu_query ERROR.Message:{0},Statck:{1}".FormatStr(ex.Message, ex.StackTrace));
                ex = ex.InnerException;
            }
        }

        try
        {
            var update = new UpdateDocument { { "$set", new QueryDocument { { "LastBotEndAt", DateTime.UtcNow.AddHours(8) }, { "BotStatus", 2 } } } };
            var commendCol = MongoDBHelper.Instance.GetIW2S_WB_BaiduCommends();
            commendCol.UpdateOne(new QueryDocument { { "_id", p._id } }, update);

            var builder = Builders<IW2S_WB_level1link>.Filter;
            var filter = builder.Eq(x => x.UsrId, p.UsrId);
            filter &= builder.Eq(x => x.SearchkeywordId, p._id);
            filter &= builder.Ne(x => x.DataCleanStatus, (byte)2);

            // The keyword is data, not a pattern: escape it so characters such as '(', '+', '.'
            // or '/' cannot alter or break the regular expression, and pass the options
            // separately instead of relying on "/pattern/options" string parsing.
            var keywordPattern = ".*" + System.Text.RegularExpressions.Regex.Escape(p.Keyword ?? string.Empty) + ".*";
            filter &= builder.Regex(x => x.Description, new BsonRegularExpression(keywordPattern, "i"));

            var col = MongoDBHelper.Instance.GetIW2S_WB_level1links();
            // Grouping by _id yields one group per matching document, so the number of groups
            // is the number of matching links.
            var agreresult = col.Aggregate().Match(filter)
                             .Group(new BsonDocument { { "_id", "$_id" }, { "Count", new BsonDocument("$sum", 1) } })
                             .ToListAsync()
                             .Result;
            var vallinkCount = agreresult.Count;

            update = new UpdateDocument { { "$set", new QueryDocument { { "ValLinkCount", vallinkCount } } } };
            commendCol.UpdateOne(new QueryDocument { { "_id", p._id } }, update);
        }
        catch (Exception ex)
        {
            Console.WriteLine("get_proj_to_qry ERROR ." + ex.Message);
            Thread.Sleep(5000);
        }
    }
}
/// <summary>
/// Walks Google result pages starting at <paramref name="link"/>, converts every result entry
/// into a <c>Dnl_Google_level1link</c> and saves each page's entries through <c>SaveResult</c>.
/// </summary>
/// <param name="link">URL of the first result page; paging stops when it becomes null or empty.</param>
/// <param name="searchTsk">Task supplying keyword, recommended keyword, user id and project id.</param>
/// <returns>
/// The working list. It is cleared after each page has been saved, so it is empty when the
/// method returns normally.
/// </returns>
/// <remarks>
/// Paging also stops when the page reports no results, when no result blocks can be split out
/// (logged as BLOCKED), after more than three consecutive pages without a hit, or once
/// <c>quried_pages</c> exceeds 10. Random pauses are inserted before every detail download and
/// after every page.
/// </remarks>
public List<Dnl_Google_level1link> GetLinks(string link, Dnl_Google_BaiduCommend searchTsk)
{
    List<Dnl_Google_level1link> result = new List<Dnl_Google_level1link>();
    int nohist_pages = 0;
    int quried_pages = 0;

    // Publish-date pattern (2000-2099, "-", "/" or 年月日 separators); built once for all results.
    Regex reg = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");

    while (!string.IsNullOrEmpty(link) && quried_pages <= 10)
    {
        log(link);

        // Download the result page; the client is disposed as soon as the bytes are read.
        string html;
        using (WebClient webClient = new WebClient())
        {
            webClient.Credentials = CredentialCache.DefaultCredentials;
            Byte[] pageData = webClient.DownloadData(link);
            html = Encoding.GetEncoding("Big5").GetString(pageData);
        }
        html = Strings.StrConv(html, VbStrConv.SimplifiedChinese, 0);

        if (html == null)
        {
            break;
        }
        if (html.Contains("没有找到搜索内容!"))
        {
            break;
        }

        var tags = html.SubAfter("<body").SubAfter("center_col").SubBefore("id=\"foot\"");
        var tagsD = tags.SplitWith("class=\"g\"");
        if (tagsD == null || tagsD.Length == 0)
        {
            log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
            break;
        }

        bool nohit = true;
        foreach (var tag in tagsD)
        {
            if (!tag.Contains("h3"))
            {
                continue;
            }

            var a = tag.SubAfter("h3").SubAfter("a");
            string title = RemoveInivalidChar(a.RemoveSpace().GetLower().SubBefore("</h3>").GetTxtFromHtml2().RemoveSpace().GetLower());
            string href = a.GetFirstHref2();
            if (href.Contains("/url?q="))
            {
                href = href.Replace("/url?q=", "");
            }
            if (!href.Contains("http"))
            {
                href = "https://www.google.com" + href;
            }
            if (string.IsNullOrEmpty(title) && string.IsNullOrEmpty(href))
            {
                continue;
            }
            href = href.Replace("amp;", "");

            string abs = RemoveInivalidChar(tag.SubAfter("class=\"st\"").SubBefore("</span").GetTxtFromHtml2().RemoveSpace().GetLower());
            string domain = GetDomain(href);
            int maxScore = 0;
            byte appType = 0;

            string txt = "{0},{1}".FormatStr(title, abs);
            if (string.IsNullOrEmpty(txt))
            {
                continue;
            }

            // Random 8-20 s pause before fetching the detail page.
            int nn = new Random().Next(8000, 20000);
            Thread.Sleep(nn);

            var htmldetail = "";
            try
            {
                using (WebClient webClient2 = new WebClient())
                {
                    webClient2.Credentials = CredentialCache.DefaultCredentials;
                    Byte[] pageData2 = webClient2.DownloadData(href);
                    htmldetail = Encoding.GetEncoding("Big5").GetString(pageData2);
                }
                htmldetail = Strings.StrConv(htmldetail, VbStrConv.SimplifiedChinese, 0);
            }
            catch (Exception ex)
            {
                // Best effort: the result is still recorded, just without detail HTML.
                log("detail download failed " + href + " " + ex.Message);
            }

            // Group 0 always exists; its value is empty when nothing matched.
            Match m = reg.Match(htmldetail);
            string timesp = m.Groups[0].Value;

            bool is_title_matched = title.GetLower().IsContains2(searchTsk.Keyword.ToLower(), searchTsk.CommendKeyword.ToLower());
            bool is_abstr_matched = abs.GetLower().IsContains2(searchTsk.Keyword.GetLower(), searchTsk.CommendKeyword.GetLower());
            BaiduItemPart part = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract
                                 : is_title_matched ? BaiduItemPart.Title
                                 : is_abstr_matched ? BaiduItemPart.Abstract
                                 : BaiduItemPart.None;
            bool is_itm_title_matched = txt.GetLower().IsContains(searchTsk.Keyword.GetLower());
            bool is_bus_matched = txt.GetLower().IsContains2(searchTsk.CommendKeyword.GetLower());

            Dnl_Google_level1link l1 = new Dnl_Google_level1link
            {
                UsrId = searchTsk.UsrId,
                Domain = domain.Replace("http://", "").Replace("https://", ""),
                TopDomain = GetLevel1Domain(domain),
                Keywords = string.Format("{0} + {1}", searchTsk.Keyword, searchTsk.CommendKeyword),
                LinkUrl = href,
                MatchAt = (byte)part,
                Html = htmldetail,
                MatchType = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                AppType = appType,
                BizId = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(href, searchTsk.UsrId, searchTsk.Keyword)),
                SearchkeywordId = searchTsk._id.ToString(),
                CreatedAt = DateTime.UtcNow.AddHours(8),
                Description = abs,
                Title = title,
                Score = maxScore,
                Abstract = abs,
                ProjectId = searchTsk.ProjectId,
                PublishTime = timesp
            };

            // Score: both matched = 90, recommended keyword only = 80, keyword only = 50, neither = 0.
            if (is_bus_matched && is_itm_title_matched)
            {
                l1.Score = 80 + 10;
            }
            else if (is_bus_matched)
            {
                l1.Score = 80;
            }
            else if (is_itm_title_matched)
            {
                l1.Score = 50;
            }

            result.Add(l1);
            nohit = false;
            nohist_pages = 0;
        }

        if (nohit)
        {
            nohist_pages++;
        }
        // Give up after more than three consecutive pages without any result.
        if (nohist_pages > 3)
        {
            break;
        }

        quried_pages++;
        pages++;

        // Locate the "next page" link in the footer and make it absolute.
        link = html.SubAfter("id=\"foot\"").SubAfter("text-align:left").SubBefore("下一页").GetLastHref2();
        if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
        {
            if (link.IsStartWith("/"))
            {
                link = link.SubAfter("/");
            }
            link = "https://www.google.com/".GetContact(link);
            link = link.Replace("amp;", "");
        }

        SaveResult(result);
        result.Clear();

        // Random 8-15 s pause between pages.
        int n = new Random().Next(8000, 15000);
        Thread.Sleep(n);
    }

    return result;
}
/// <summary>
/// Registers a new user with role <c>UserTypes.Free</c> after validating the input and checking
/// that neither the login name nor the e-mail address is already stored.
/// </summary>
/// <param name="uName">Login name; must not be empty.</param>
/// <param name="uPwd1">Password; must not be empty.</param>
/// <param name="uPwd2">Password confirmation; must equal <paramref name="uPwd1"/>.</param>
/// <param name="email">E-mail address; the whole string must be a well-formed address.</param>
/// <returns>
/// A DTO describing the created user, or a DTO whose <c>Error</c> is set when validation fails
/// or the name / e-mail is already taken.
/// </returns>
public IW2SUserDto Regist(string uName, string uPwd1, string uPwd2, string email)
{
    if (string.IsNullOrEmpty(uName) || string.IsNullOrEmpty(uPwd1))
    {
        return new IW2SUserDto { Error = "用户名和密码不能为空" };
    }
    if (!uPwd1.Equals(uPwd2))
    {
        return new IW2SUserDto { Error = "密码不一致!" };
    }
    if (string.IsNullOrEmpty(email))
    {
        return new IW2SUserDto { Error = "邮箱不能为空!" };
    }

    // Anchored with \A ... \z so the entire input has to be an address. Without the anchors
    // any text that merely contains an address somewhere would pass.
    bool dd = System.Text.RegularExpressions.Regex.IsMatch(email, @"\A[\w!#$%&'*+/=?^_`{|}~-]+(?:\.[\w!#$%&'*+/=?^_`{|}~-]+)*@(?:[\w](?:[\w-]*[\w])?\.)+[\w](?:[\w-]*[\w])?\z");
    if (dd == false)
    {
        return new IW2SUserDto { Error = "邮箱格式不正确!" };
    }

    var builder = Builders<IW2SUser>.Filter;
    var filter = builder.Eq(x => x.LoginName, uName);
    var _usr = MongoDBHelper.Instance.Get_IW2SUser().Find(filter).FirstOrDefault();
    if (_usr != null)
    {
        return new IW2SUserDto { Error = "用户名已经存在" };
    }

    filter = builder.Eq(x => x.UsrEmail, email);
    IW2SUser _usr1 = MongoDBHelper.Instance.Get_IW2SUser().Find(filter).FirstOrDefault();
    if (_usr1 != null)
    {
        return new IW2SUserDto { Error = "该邮箱已经注册过,请换一个试试!" };
    }

    // The password is lower-cased before it is passed to GetEncryPwd, so the stored value does
    // not depend on letter case.
    var md5 = EncryptHelper.GetEncryPwd(uPwd1.ToLower());
    _usr = new IW2SUser()
    {
        _id = ObjectId.GenerateNewId(),
        LoginName = uName,
        LoginPwd = md5,
        UsrKey = IDHelper.GetGuid(uName + usr_key),
        applicationState = false,
        IsEmailConfirmed = false,
        UsrEmail = email,
        UsrRole = UserTypes.Free,
        UsrNum = 1,
        Gender = "",
        MobileNo = "",
        Remarks = "",
        PictureSrc = "",
        // NOTE(review): local time plus eight hours; confirm whether UTC plus eight was intended.
        CreatedAt = DateTime.Now.AddHours(8),
        ProjectNum = 2,
        KeywordNum = 20,
        ReportNum = 2,
        LinkNum = 2000
    };
    MongoDBHelper.Instance.Get_IW2SUser().InsertOne(_usr);

    IW2SUserDto freDto = new IW2SUserDto();
    freDto._id = _usr._id.ToString();
    freDto.LoginName = _usr.LoginName;
    freDto.LoginPwd = _usr.LoginPwd;
    freDto.UsrRole = _usr.UsrRole;
    freDto.UsrKey = _usr.UsrKey;
    freDto.UsrEmail = _usr.UsrEmail;
    freDto.IsEmailConfirmed = _usr.IsEmailConfirmed;
    freDto.applicationState = _usr.applicationState;
    freDto.UsrNum = _usr.UsrNum;
    return freDto;
}
/// <summary>
/// Derives the GUID stored for a password.
/// </summary>
/// <param name="pwd">Password text as supplied by the caller.</param>
/// <returns>
/// The result of <c>IDHelper.GetGuid</c> applied to <paramref name="pwd"/> followed by <c>pwd_key</c>.
/// </returns>
public static Guid GetEncryPwd(string pwd)
{
    string keyedPassword = pwd + pwd_key;
    return IDHelper.GetGuid(keyedPassword);
}
/// <summary>
/// Copies task metadata onto every listing and fills in missing bot identifiers for shops,
/// items, comments and sales records.
/// </summary>
/// <param name="tsk">
/// Task whose names and ids are copied to each listing; when null, the copy step is skipped and
/// only the identifier derivation runs.
/// </param>
public void SetID(XTask tsk)
{
    foreach (var listing in Listings.ToList())
    {
        if (tsk != null)
        {
            listing.CompanyName = tsk.CompanyName;
            listing.SiteName = tsk.SiteName;
            listing.DetailQueryName = tsk.DetailQueryName;
            listing.CommentQueryName = tsk.CommentQueryName;
            listing.BuyerListQueryName = tsk.BuyerListQueryName;
            listing.BotComments = tsk.BotComments;
            listing.BotBuyerList = tsk.BotBuyerList;
            listing.RealBrandName = tsk.BrandName;
            listing.RealProductName = tsk.ProductName;
            if (tsk.ItemID.HasValue)
            {
                listing.BotItemID = tsk.ItemID.Value;
            }
            if (tsk.ShopID.HasValue)
            {
                listing.BotShopID = tsk.ShopID.Value;
            }
        }

        // Shop id: derived from the shop id (or, failing that, the shop name) plus the site name.
        if (!listing.BotShopID.HasValue && !string.IsNullOrEmpty(listing.ShopName))
        {
            listing.BotShopID = IDHelper.GetGuid(string.Format("{0},{1}", listing.ShopID ?? listing.ShopName, listing.SiteName));
        }

        // Item id: use the first available seed among item id, item name within the shop,
        // detail URL, and item name within the site.
        if (!listing.BotItemID.HasValue && !string.IsNullOrEmpty(listing.ItemName))
        {
            string itemSeed;
            if (!string.IsNullOrEmpty(listing.ItemID))
            {
                itemSeed = string.Format("{0},{1}", listing.ItemID, listing.SiteName);
            }
            else if (listing.BotShopID.HasValue)
            {
                itemSeed = string.Format("{0},{1}", listing.ItemName, listing.BotShopID);
            }
            else if (!string.IsNullOrEmpty(listing.ItemDetailUrl))
            {
                itemSeed = listing.ItemDetailUrl;
            }
            else
            {
                itemSeed = string.Format("{0},{1}", listing.ItemName, listing.SiteName);
            }
            listing.BotItemID = IDHelper.GetGuid(itemSeed);
        }

        if (listing.ItemBotStatus == BotStatus.Removed)
        {
            listing.ClosedAt = DateTime.Now;
        }

        // Attach comments and sales records to the item id resolved above.
        foreach (var comment in listing.ItemCommentList)
        {
            comment.EntityID = listing.BotItemID;
            comment.ID = IDHelper.GetGuid(string.Format("{0},{1},{2}", listing.BotItemID, comment.Poster, comment.PostAt));
        }
        foreach (var sale in listing.SalesRecords)
        {
            sale.PackageID = listing.BotItemID;
        }
    }
}
/// <summary>
/// Walks Sogou result pages starting at <paramref name="link"/>, resolves each result to its
/// real URL, downloads the target page and saves one <c>IW2S_SG_level1link</c> per result
/// through <c>SaveResult</c>.
/// </summary>
/// <param name="link">URL of the first result page; paging stops when it becomes null or empty.</param>
/// <param name="tsk">Task supplying keyword, user id, task id and project id.</param>
/// <remarks>
/// Paging also stops when a page cannot be fetched, when no result blocks can be split out,
/// after more than three consecutive pages without a hit, or once <c>quried_pages</c> exceeds 2.
/// An exception while handling one result is logged and that result is skipped.
/// </remarks>
void GetLinks(string link, IW2S_SG_BaiduCommend tsk)
{
    List<IW2S_SG_level1link> result = new List<IW2S_SG_level1link>();
    int nohist_pages = 0;
    int quried_pages = 0;

    // Publish-date pattern (2000-2099, "-", "/" or 年月日 separators); built once for all results.
    Regex reg = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");

    while (!string.IsNullOrEmpty(link) && quried_pages <= 2)
    {
        log(link);
        CookieContainer cc = new CookieContainer();
        Encoding enc = null;
        CookieCollection cookiesColl = new CookieCollection();
        CookieCollection cookieCollection = new CookieCollection();
        string Rurl = "https://www.sogou.com/";
        string cookie = "";

        // Request the index page first; only the cookies it hands back are used afterwards.
        GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;

        var html = GetContent(link, 8000, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;
        if (html == null)
        {
            break;
        }

        var tags = html.SubAfter("<body").SubAfter("results").SubBefore("hint_container").SplitWith("<h3");
        if (tags == null || tags.Length == 0)
        {
            log("已被sogou屏蔽,请调试! " + tsk.Keyword);
            break;
        }

        bool nohit = true;
        foreach (var tag in tags)
        {
            try
            {
                if (!tag.Contains("<a"))
                {
                    continue;
                }

                string title = RemoveInivalidChar(tag.SubAfter("<a").SubBefore("</a>").GetTxtFromHtml2().RemoveSpace());
                string href = tag.SubAfter("<a").SubBefore("</a>").GetFirstHref2();

                // Summary text: the later markers take precedence over the earlier ones.
                string Jianjie = "";
                if (tag.Contains("简介:"))
                {
                    Jianjie = tag.SubAfter("简介:").SubBefore("</").GetTxtFromHtml2().RemoveSpace();
                }
                if (tag.Contains("cacheresult_summary"))
                {
                    Jianjie = tag.SubAfter("cacheresult_summary").SubBefore("</div>").GetTxtFromHtml2().RemoveSpace();
                }
                if (string.IsNullOrEmpty(Jianjie))
                {
                    Jianjie = tag.SubAfter("summary_beg").SubBefore("summary_end").GetTxtFromHtml2().RemoveSpace();
                }

                int n = new Random().Next(8000, 15000);
                Thread.Sleep(n);

                // Reset for every result. Previously this variable lived at page scope, so when
                // the redirect fetch returned nothing the URL of the previous result was reused
                // for this result's domain, link and BizId.
                string realUrl = "";
                var tuplehtml = GetContent(href, 8000, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                if (!string.IsNullOrEmpty(tuplehtml))
                {
                    if (tuplehtml.Contains("window.location.replace"))
                    {
                        // Script redirect: take the target out of window.location.replace("...").
                        realUrl = tuplehtml.SubAfter("window.location.replace").SubBefore("</script>").Replace('"', ' ').Replace("(", "").Replace(")", "").RemoveSpace();
                    }
                    else
                    {
                        realUrl = Rurl;
                    }
                }

                // Without a resolved URL the result link itself is used.
                string domain = "";
                if (!string.IsNullOrEmpty(realUrl))
                {
                    domain = GetDomain(realUrl);
                }
                else
                {
                    realUrl = href;
                    domain = GetDomain(href);
                }
                string topDomain = GetLevel1Domain(domain);
                realUrl = realUrl.Replace("amp;", "");

                int nn = new Random().Next(6000, 15000);
                Thread.Sleep(nn);
                var htmldetail = GetContent(realUrl, 8000, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);

                // Regex.Match throws on a null input, so a result whose detail page came back
                // null is logged and skipped by the catch below. Group 0 always exists and is
                // empty when nothing matched.
                Match m = reg.Match(htmldetail);
                string time = m.Groups[0].Value;

                IW2S_SG_level1link l1 = new IW2S_SG_level1link
                {
                    UsrId = tsk.UsrId,
                    Domain = domain,
                    TopDomain = topDomain,
                    Keywords = string.Format("{0}", tsk.Keyword),
                    LinkUrl = realUrl,
                    Html = htmldetail,
                    BizId = IDHelper.GetGuid("{0}/{1}".FormatStr(realUrl, tsk._id.ToString())),
                    SearchkeywordId = tsk._id.ToString(),
                    CreatedAt = DateTime.UtcNow.AddHours(8),
                    Description = Jianjie,
                    Title = title,
                    ProjectId = tsk.ProjectId,
                    PublishTime = time,
                    AlternateFields = "0",
                    DataCleanStatus = 0
                };
                result.Add(l1);
                nohit = false;
                nohist_pages = 0;
            }
            catch (Exception ex)
            {
                log("有错误信息!" + ex.Message);
            }
        }

        if (nohit)
        {
            nohist_pages++;
        }
        // Give up after more than three consecutive pages without any result.
        if (nohist_pages > 3)
        {
            break;
        }

        quried_pages++;
        pages++;

        // Locate the "next page" link and make it absolute.
        link = html.SubAfter("sogou_next").SubBefore("下一页").GetLastHref2();
        if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
        {
            if (link.IsStartWith("/"))
            {
                link = link.SubAfter("/");
            }
            link = "https://www.sogou.com/web".GetContact(link);
        }

        SaveResult(result);
        result.Clear();

        int nn1 = new Random().Next(6000, 15000);
        Thread.Sleep(nn1);
    }
}
/// <summary>
/// Endless worker loop: takes a project from <c>get_search_to_count</c>, runs <c>count</c> for
/// it and writes the resulting link count and end time back to the project document.
/// </summary>
/// <remarks>
/// When no project is available the bot is flagged ready and the loop pauses for a random
/// 30-100 seconds. A failure inside <c>count</c> is logged (including inner exceptions) and the
/// count written back is then 0.
/// </remarks>
public void Run()
{
    for (;;)
    {
        var idleJitter = new Random();
        var project = get_search_to_count();
        if (project == null)
        {
            SetReady();
            Thread.Sleep(idleJitter.Next(30000, 100000));
            continue;
        }

        int linkCount = 0;
        try
        {
            SetBusy();

            string botId = Utility.GenerateBotId().ToString().Replace("-", "");
            var currentProcess = Process.GetCurrentProcess();
            string processTag = IDHelper.GetGuid(currentProcess.MainModule.FileName).ToString();

            // An interval of 0 falls back to one week.
            int intervalHours = project.BotIntervalHours == 0 ? 7 * 24 : project.BotIntervalHours;

            var startFields = new QueryDocument
            {
                { "BotStatus", 1 },
                { "NextBotStartAt", DateTime.UtcNow.AddHours((double)intervalHours + 8) },
                { "BotTag", string.Format("{0}#", processTag) },
                { "BotId", botId }
            };
            var startUpdate = new UpdateDocument { { "$set", startFields } };
            MongoDBHelper.Instance.GetIW2S_Projects().UpdateOne(new QueryDocument { { "_id", project._id } }, startUpdate);

            linkCount = count(project);
        }
        catch (Exception failure)
        {
            // Report the whole exception chain, outermost first.
            for (var current = failure; current != null; current = current.InnerException)
            {
                log("Project_BaiduLinkCount_Count ERROR.Message:{0},Statck:{1}".FormatStr(current.Message, current.StackTrace));
            }
        }

        try
        {
            var endFields = new QueryDocument
            {
                { "LastBotEndAt", DateTime.UtcNow.AddHours(8) },
                { "BotStatus", 2 },
                { "BaiduLinkCount", linkCount }
            };
            var endUpdate = new UpdateDocument { { "$set", endFields } };
            var projects = MongoDBHelper.Instance.GetIW2S_Projects();
            projects.UpdateOne(new QueryDocument { { "_id", project._id } }, endUpdate);
        }
        catch (Exception failure)
        {
            log("get_proj_to_count ERROR ." + failure.Message);
            Thread.Sleep(5000);
        }
    }
}
/// <summary>
/// Endless worker loop: takes an image-search task from <c>get_search_to_qry</c>, runs
/// <c>query</c> for it and records the end time on the task document.
/// </summary>
/// <remarks>
/// When no task is available the bot is flagged ready and the loop pauses for a random
/// 30-100 seconds. A failure inside <c>query</c> is written to the console, including inner
/// exceptions, and does not stop the loop.
/// </remarks>
public void Run()
{
    for (;;)
    {
        var idleJitter = new Random();
        var task = get_search_to_qry();
        if (task == null)
        {
            SetReady();
            Thread.Sleep(idleJitter.Next(30000, 100000));
            continue;
        }

        try
        {
            SetBusy();

            string botId = IWSBot.Utility.Utility.GenerateBotId().ToString().Replace("-", "");
            var currentProcess = Process.GetCurrentProcess();
            string processTag = IDHelper.GetGuid(currentProcess.MainModule.FileName).ToString();

            // An interval of 0 falls back to one week.
            int intervalHours = task.BotIntervalHours == 0 ? 7 * 24 : task.BotIntervalHours;

            var startFields = new QueryDocument
            {
                { "BotStatus", 1 },
                { "NextBotStartAt", DateTime.UtcNow.AddHours((double)intervalHours + 8) },
                { "BotTag", string.Format("{0}#", processTag) },
                { "BotId", botId }
            };
            var startUpdate = new UpdateDocument { { "$set", startFields } };
            MongoDBHelper.Instance.GetIW2S_ImgSearchTasks().UpdateOne(new QueryDocument { { "_id", task._id } }, startUpdate);

            query(task);
        }
        catch (Exception failure)
        {
            // Report the whole exception chain, outermost first.
            for (var current = failure; current != null; current = current.InnerException)
            {
                Console.WriteLine("baidu_query ERROR.Message:{0},Statck:{1}".FormatStr(current.Message, current.StackTrace));
            }
        }

        try
        {
            var endFields = new QueryDocument
            {
                { "LastBotEndAt", DateTime.UtcNow.AddHours(8) },
                { "BotStatus", 2 }
            };
            var endUpdate = new UpdateDocument { { "$set", endFields } };
            var tasks = MongoDBHelper.Instance.GetIW2S_ImgSearchTasks();
            tasks.UpdateOne(new QueryDocument { { "_id", task._id } }, endUpdate);
        }
        catch (Exception failure)
        {
            Console.WriteLine("get_proj_to_qry ERROR ." + failure.Message);
            Thread.Sleep(5000);
        }
    }
}
/// <summary>
/// Endless worker loop: takes a search keyword from <c>get_search_to_qry</c>, tags it with this
/// machine's address and process, runs <c>query</c> for it and then clears the <c>IsBot</c> flag.
/// </summary>
/// <remarks>
/// When no keyword is available the loop pauses for a random 30-100 seconds. A failure inside
/// the query step is logged, including inner exceptions, and does not stop the loop.
/// </remarks>
public void Run()
{
    while (true)
    {
        Random r = new Random();
        var p = get_search_to_qry();
        if (p == null)
        {
            Thread.Sleep(r.Next(30000, 100000));
            continue;
        }

        try
        {
            // Prefer the third address of this host, otherwise the first one. The fallback is
            // guarded with "> 0": indexing [0] on an empty list would throw, and because the
            // exception is caught below, the query for this keyword would be skipped.
            var ipaddrs = System.Net.Dns.GetHostEntry(System.Environment.MachineName).AddressList;
            string ip = string.Empty;
            if (ipaddrs.Length >= 3)
            {
                ip = ipaddrs[2].ToString();
            }
            else if (ipaddrs.Length > 0)
            {
                ip = ipaddrs[0].ToString();
            }

            var pro = Process.GetCurrentProcess();
            string processName = IDHelper.GetGuid(pro.MainModule.FileName).ToString();
            var update = new UpdateDocument
            {
                {
                    "$set", new QueryDocument
                    {
                        { "IsBot", true },
                        { "NextBotStartAt", DateTime.UtcNow.AddHours((double)p.BotIntervalHours + 8) },
                        { "BotTag", string.Format("{0}#{1}", ip, processName) }
                    }
                }
            };
            MongoDBHelper.Instance.Getiws_searchkeywords().UpdateOne(new QueryDocument { { "_id", p._id } }, update);
            query(p);
        }
        catch (Exception ex)
        {
            while (ex != null)
            {
                log("baidu_query ERROR.Message:{0},Statck:{1}".FormatStr(ex.Message, ex.StackTrace));
                ex = ex.InnerException;
            }
        }

        try
        {
            var update = new UpdateDocument { { "$set", new QueryDocument { { "LastBotEndAt", DateTime.UtcNow.AddHours(8) }, { "IsBot", false } } } };
            MongoDBHelper.Instance.Getiws_searchkeywords().UpdateOne(new QueryDocument { { "_id", p._id } }, update);
        }
        catch (Exception ex)
        {
            log("get_proj_to_qry ERROR ." + ex.Message);
            Thread.Sleep(5000);
        }
    }
}
/// <summary>
/// Endless worker loop: takes a keyword from <c>get_search_to_qry</c>, tags it with this bot's
/// process and id, runs <c>query</c> for it and then sets its <c>BotStatus</c> to 2.
/// </summary>
/// <remarks>
/// When no keyword is available the bot is flagged ready and the loop pauses for a random
/// 30-100 seconds. A failure inside <c>query</c> is logged, including inner exceptions, and
/// does not stop the loop.
/// </remarks>
public void Run()
{
    for (;;)
    {
        var idleJitter = new Random();
        var keyword = get_search_to_qry();
        if (keyword == null)
        {
            SetReady();
            Thread.Sleep(idleJitter.Next(30000, 100000));
            continue;
        }

        try
        {
            SetBusy();

            var currentProcess = Process.GetCurrentProcess();
            string processTag = IDHelper.GetGuid(currentProcess.MainModule.FileName).ToString();
            string botId = Utility.GenerateBotId().ToString().Replace("-", "");

            var startFields = new QueryDocument
            {
                { "BotStatus", 1 },
                { "BotTag", string.Format("{0}#", processTag) },
                { "BotId", botId }
            };
            var startUpdate = new UpdateDocument { { "$set", startFields } };
            MongoDBHelper.Instance.GetIW2S_BaiduKeywords().UpdateOne(new QueryDocument { { "_id", keyword._id } }, startUpdate);

            query(keyword);
        }
        catch (Exception failure)
        {
            // Report the whole exception chain, outermost first.
            for (var current = failure; current != null; current = current.InnerException)
            {
                log("baidu_query ERROR.Message:{0},Statck:{1}".FormatStr(current.Message, current.StackTrace));
            }
        }

        try
        {
            var endUpdate = new UpdateDocument { { "$set", new QueryDocument { { "BotStatus", 2 } } } };
            MongoDBHelper.Instance.GetIW2S_BaiduKeywords().UpdateOne(new QueryDocument { { "_id", keyword._id } }, endUpdate);
        }
        catch (Exception failure)
        {
            log("get_proj_to_qry ERROR ." + failure.Message);
            Thread.Sleep(5000);
        }
    }
}
/// <summary>
/// Walks Sogou WeChat search result pages starting at <paramref name="link"/>, builds one
/// <c>IW2S_WX_level1link</c> per article entry (including details of the publishing account)
/// and saves each page's entries through <c>SaveResult</c>.
/// </summary>
/// <param name="link">URL of the first result page; paging stops when it becomes null or empty.</param>
/// <param name="searchTsk">Task supplying keyword, recommended keyword, user id, task id and project id.</param>
/// <returns>
/// The working list. It is cleared after each page has been saved, so it is empty when the
/// method returns normally.
/// </returns>
public List <IW2S_WX_level1link> GetLinks(string link, IW2S_WX_BaiduCommend searchTsk)
{
    List <IW2S_WX_level1link> result = new List <IW2S_WX_level1link>();
    int nohist_pages = 0;
    int quried_pages = 0;
    // Original note: "search at most 10 pages". The condition below actually stops once
    // quried_pages exceeds 2.
    while (!string.IsNullOrEmpty(link) && quried_pages <= 2)
    {
        log(link);
        CookieContainer cc = new CookieContainer();
        Encoding enc = null;
        CookieCollection cookiesColl = new CookieCollection();
        CookieCollection cookieCollection = new CookieCollection();
        string Rurl = "http://weixin.sogou.com/";
        string cookie = "";
        // Request the index page first; its body (hhhtml) is not used afterwards, only the
        // cookie collection handed back through the out parameter.
        string hhhtml = TaobaoWebHelper.GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;
        // Random 5-8 s pause before requesting the result page.
        int gg = new Random().Next(5000, 8000);
        Thread.Sleep(gg);
        Rurl = link;
        var html = get_html(link, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;
        if (html == null)
        {
            break;
        }
        // The page text checked here is the "no matching WeChat articles found" message.
        if (html.Contains("没有找到相关的微信公众号文章"))
        {
            break;
        }
        // Split the page into result blocks; fall back to the marker without a space when the
        // first split produced at most one piece.
        var tags = html.SplitWith("wx-rb wx-rb3");
        if (tags == null || tags.Length == 0 || tags.Length == 1)
        {
            tags = html.SplitWith("wx-rbwx-rb3");
        }
        if (tags == null || tags.Length == 0)
        {
            log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
            break;
        }
        bool nohit = true;
        foreach (var tag in tags)
        {
            if (!tag.Contains("txt-box"))
            {
                continue;
            }
            string title = RemoveInivalidChar(tag.SubAfter("<h4").SubBefore("</h4>").GetTxtFromHtml2().RemoveSpace());
            string href = tag.SubAfter("<h4").SubBefore("</a>").GetFirstHref2();
            string abs = RemoveInivalidChar(tag.SubAfter("<h4>").SubBefore("\"s-p\"").SubBefore("<script>").GetTxtFromHtml2().RemoveSpace());
            string domain = tag.SubLastStringAfter("\"s-p\"").SubBefore("</a").GetTxtFromHtml2().SubAfter("(").SubAfter("(").SubBefore(",").Replace('"', ' ').Trim();
            string SourceLink = tag.SubLastStringAfter("\"s-p\"").SubBefore("</a").GetFirstHref2();
            string TitleImg = tag.SubAfter("img_box2").SubBefore("</a").SubAfter("src=").Replace(">", "").Replace('"', ' ').RemoveSpace();
            // Original note: drop entries that contain none of the information to protect.
            // txt always contains the "," from the format string.
            string txt = "{0},{1}".FormatStr(title, abs);
            if (string.IsNullOrEmpty(txt))
            {
                continue;
            }
            if (href.IsStartWith("/websearch"))
            {
                href = "http://weixin.sogou.com" + href.Replace("amp;", "");
            }
            // href1 is never read; this branch has no effect on href.
            if (href.IsStartWith("s?__biz"))
            {
                var href1 = href.Replace("amp;", "");
            }
            href = href.Replace("amp;", "");
            // Random 8-20 s pause before fetching the article page.
            int nn = new Random().Next(8000, 20000);
            Thread.Sleep(nn);
            var htmldetail = get_Detailehtml(href, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
            // Publish date: first date-like text (2000-2099) found in the article page.
            Regex reg = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");
            Match m = reg.Match(htmldetail);
            string time = "";
            // Groups always contains group 0; its value is empty when nothing matched.
            if (m.Groups.Count > 0)
            {
                time = m.Groups[0].Value;
            }
            // Continue with the URL reported back by the article request and ask for its JSON
            // variant to read the "link" field.
            href = Rurl;
            var hrefNew = href + "&f=json";
            var htmldetailNewUrl = get_Detailehtml(hrefNew, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
            try
            {
                var uuurl = htmldetailNewUrl.SubAfter("\"link\":").SubBefore(",\"source_url\":").Replace('"', ' ').Replace("\\", "").RemoveSpace();
                href = uuurl;
            }
            catch (Exception)
            {
                // Deliberately ignored: href keeps the value assigned from Rurl above.
            }
            bool is_title_matched = title.GetLower().IsContains2(searchTsk.Keyword.ToLower(), searchTsk.CommendKeyword.ToLower());
            bool is_abstr_matched = abs.GetLower().IsContains2(searchTsk.Keyword.GetLower(), searchTsk.CommendKeyword.GetLower());
            BaiduItemPart part = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract : is_title_matched ? BaiduItemPart.Title : is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;
            bool is_itm_title_matched = txt.GetLower().IsContains(searchTsk.Keyword.GetLower());
            bool is_bus_matched = txt.GetLower().IsContains2(searchTsk.CommendKeyword.GetLower());
            // Details of the publishing account, filled from the account page when available.
            var no = "";
            var qrcode = "";
            var function = "";
            var NoIcon = "";
            var QrcodeIcon = "";
            SourceLink = SourceLink.Replace("amp;", "");
            // Random 8-15 s pause before fetching the account page.
            int nnn = new Random().Next(8000, 15000);
            Thread.Sleep(nnn);
            var htmlNo = get_Nohtml(SourceLink, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
            if (!string.IsNullOrEmpty(htmlNo) && htmlNo.Contains("em_weixinhao"))
            {
                no = htmlNo.SubAfter("em_weixinhao").SubBefore("/label").GetTxtFromHtml2().RemoveSpace();
                qrcode = htmlNo.SubAfter("v-box").SubBefore("<em").SubAfter("src=").Replace(">", "").Replace('"', ' ').RemoveSpace();
                function = htmlNo.SubAfter("功能介绍:</").SubBefore("/span").GetTxtFromHtml2().RemoveSpace();
                // SourceLink is overwritten with the text following the "微信认证:" label.
                SourceLink = htmlNo.SubAfter("微信认证:").SubBefore("/div").GetTxtFromHtml2().RemoveSpace();
                NoIcon = htmlNo.SubAfter("img-box").SubBefore("</a").SubAfter("src=").SubBefore("onload").Replace(">", "").Replace('"', ' ').RemoveSpace();
                QrcodeIcon = htmlNo.SubAfter("img-box").SubBefore("</a").SubAfter("err:").SubBefore(">").Replace(">", "").Replace('"', ' ').Replace("'", "").RemoveSpace();
            }
            IW2S_WX_level1link l1 = new IW2S_WX_level1link
            {
                BizId = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(title, domain, searchTsk.UsrId)),
                Description = abs,
                Domain = domain,
                UsrId = searchTsk.UsrId,
                LinkUrl = href,
                MatchAt = (byte)part,
                Title = title,
                CreatedAt = DateTime.Now,
                DataCleanStatus = 0,
                Function = function,
                SearchkeywordId = searchTsk._id.ToString(),
                Keywords = searchTsk.Keyword,
                PublicNo = no,
                QrCode = qrcode,
                SourceLink = SourceLink,
                TagType = 0,
                ImgIcon = NoIcon,
                QrCodeIcon = QrcodeIcon,
                ProjectId = searchTsk.ProjectId,
                TitleImg = TitleImg,
                PublishTime = time,
                Html = htmldetail
            };
            // The two self-assignments below do not change l1.MatchType.
            if (is_bus_matched)
            {
                l1.MatchType = l1.MatchType;
            }
            if (is_itm_title_matched)
            {
                l1.MatchType = l1.MatchType;
            }
            // This local is computed but never read or stored on l1.
            byte MatchType = (byte)((is_bus_matched ? 10 : 0) + (is_itm_title_matched ? 30 : 0));
            // Score: both matched = 90, recommended keyword only = 80, keyword only = 50.
            if (is_bus_matched == true && is_itm_title_matched == true)
            {
                l1.Score = 80 + 10;
            }
            if (is_bus_matched == true && is_itm_title_matched == false)
            {
                l1.Score = 80;
            }
            if (is_bus_matched == false && is_itm_title_matched == true)
            {
                l1.Score = 50;
            }
            result.Add(l1);
            nohit = false;
            nohist_pages = 0;
        }
        if (nohit)
        {
            nohist_pages++;
        }
        // Give up after more than three consecutive pages without any result.
        if (nohist_pages > 3)
        {
            break;
        }
        quried_pages++;
        pages++;
        // Original note: "sougou - needs to be rewritten".
        // Locate the "next page" link and make it absolute.
        link = html.SubAfter("sogou_next").SubBefore("下一页").GetLastHref2();
        if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
        {
            if (link.IsStartWith("/"))
            {
                link = link.SubAfter("/");
            }
            link = "http://weixin.sogou.com/weixin".GetContact(link);
        }
        // Persist this page's entries, then reuse the list for the next page.
        SaveResult(result);
        result.Clear();
        // Random 8-15 s pause between pages.
        int n = new Random().Next(8000, 15000);
        Thread.Sleep(n);
    }
    return(result);
}
/// <summary>
/// Builds the identifier of this object from <c>PackageID</c>, <c>Buyer</c> and the
/// date key of <c>SettleDT</c>.
/// </summary>
/// <returns>
/// The Guid that <c>IDHelper.GetGuid</c> produces for the three values formatted
/// back-to-back with no separator.
/// </returns>
public Guid GetID()
{
    string identityKey = string.Format("{0}{1}{2}", PackageID, Buyer, SettleDT.ToDateKey2());
    Guid id = IDHelper.GetGuid(identityKey);
    return id;
}
/// <summary>
/// Processes a single search-result entry: downloads the target page, extracts
/// keyword-centred abstracts from it, scores the matched patterns, tries to find a
/// publish date in the page, and hands the resulting <c>IW2S_level1link</c> to
/// <paramref name="botmng"/> for saving. Nothing is saved when the target page
/// could not be downloaded.
/// </summary>
/// <param name="tsk">Task the entry belongs to; supplies user id, project id, keyword and record id.</param>
/// <param name="excludedKeywords">Passed through unchanged to <c>save_level1_links</c>.</param>
/// <param name="botmng">Manager used to persist the resulting link.</param>
/// <param name="searchKeywords">Keywords tested against title and description to compute <c>MatchAt</c>.</param>
/// <param name="patterns">Keyword/score pairs searched for in the downloaded page (or in <paramref name="abs"/> as fallback).</param>
/// <param name="title">Title of the search-result entry.</param>
/// <param name="href">Link of the search-result entry; also used as the stored URL when no resolved URL is returned.</param>
/// <param name="abs">Description text of the search-result entry.</param>
/// <param name="domain">Domain of the entry; when empty on entry it is filled from the resolved URL.</param>
/// <param name="tag">Raw markup of the entry; only inspected for the <c>c-icon-v1/2/3</c> markers.</param>
/// <param name="isMarket">Copied to the stored record.</param>
/// <param name="rank">Copied to the stored record.</param>
private void HanleTagData(IW2S_BaiduCommend tsk, List<IW2S_ExcludeKeyword> excludedKeywords, IW2SBotMng botmng, string[] searchKeywords, List<KeywordScore> patterns, string title, string href, string abs, ref string domain, string tag, bool isMarket, int rank)
{
    int maxScore = 0;
    string realUrl = null, detailHtml = null, abstracts = null;
    byte appType = 0;

    // Star level is derived from the icon class present in the entry markup.
    int? baiduVStar = null;
    if (tag.Contains("c-icon-v1"))
    {
        baiduVStar = 1;
    }
    else if (tag.Contains("c-icon-v2"))
    {
        baiduVStar = 2;
    }
    else if (tag.Contains("c-icon-v3"))
    {
        baiduVStar = 3;
    }

    // Download the target page; Item1 is used as the resolved URL, Item2 as the page HTML.
    if (!string.IsNullOrWhiteSpace(href))
    {
        var tuplehtml = get_htmlUrl(href);
        if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
        {
            realUrl = tuplehtml.Item1;
        }
        if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
        {
            detailHtml = tuplehtml.Item2;
        }
        if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
        {
            domain = GetDomain(realUrl);
        }
    }

    // The page is a script-driven redirect stub: follow its first link once.
    if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
    {
        var gourl = detailHtml.GetFirstHref2();
        if (!string.IsNullOrEmpty(gourl))
        {
            var tuplehtml = get_htmlUrl(gourl);
            if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
            {
                realUrl = tuplehtml.Item1;
            }
            if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
            {
                detailHtml = tuplehtml.Item2;
            }
            if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
            {
                domain = GetDomain(realUrl);
            }
        }
    }

    if (string.IsNullOrEmpty(realUrl))
    {
        realUrl = href;
    }

    // Without page content there is nothing to analyse or store.
    if (string.IsNullOrEmpty(detailHtml))
    {
        return;
    }

    List<KeywordScore> matchpatterns = new List<KeywordScore>();
    StringBuilder sbabstracts = new StringBuilder();
    List<string> abstractlist = new List<string>();
    StringBuilder sbabstractlist = new StringBuilder();

    foreach (KeywordScore pattern in patterns)
    {
        string[] splitDetailHtmls = detailHtml.SplitWith(pattern.Keyword);
        if (splitDetailHtmls.Length > 1)
        {
            matchpatterns.Add(pattern);
        }

        StringBuilder sbpatternStr = new StringBuilder();
        for (int i = 0; i < splitDetailHtmls.Length - 1; i++)
        {
            string splitDetailHtml1 = splitDetailHtmls[i];
            // NOTE(review): the last occurrence gets no trailing context because of the
            // "Length - 2" bound (i + 1 is always a valid index here) - confirm intent.
            string splitDetailHtml2 = i < splitDetailHtmls.Length - 2 ? splitDetailHtmls[i + 1] : "";

            // Walk backwards from the keyword to the nearest "before" delimiter that is
            // not preceded by a character from split_num_commas; text is collected reversed.
            for (int j = splitDetailHtml1.Length - 1; j >= 0; j--)
            {
                if (split_bef_commas.Contains(splitDetailHtml1[j]) && j - 1 >= 0 && !split_num_commas.Contains(splitDetailHtml1[j - 1]))
                {
                    break;
                }
                sbpatternStr.Append(splitDetailHtml1[j]);
            }
            // Restore reading order of the leading context.
            for (int q = sbpatternStr.Length - 1; q >= 0; q--)
            {
                sbabstracts.Append(sbpatternStr[q]);
            }
            sbabstracts.Append(pattern.Keyword);
            sbpatternStr.Clear();

            // Walk forwards from the keyword to the nearest "after" delimiter that is
            // not followed by a character from split_num_commas.
            for (int j = 0; j < splitDetailHtml2.Length; j++)
            {
                if (split_aft_commas.Contains(splitDetailHtml2[j]) && j + 1 < splitDetailHtml2.Length && !split_num_commas.Contains(splitDetailHtml2[j + 1]))
                {
                    break;
                }
                sbpatternStr.Append(splitDetailHtml2[j]);
            }
            sbabstracts.Append(sbpatternStr);
            sbpatternStr.Clear();

            string tmpsbabstracts = sbabstracts.ToString();
            tmpsbabstracts = IW2SBaiduQuery.RemoveInivalidChar(tmpsbabstracts.GetTxtFromHtml2().RemoveSpace().GetLower());
            // Keep each distinct snippet once, separated by a space.
            if (!abstractlist.Contains(tmpsbabstracts))
            {
                abstractlist.Add(tmpsbabstracts);
                sbabstractlist.Append(tmpsbabstracts).Append(" ");
            }
            sbabstracts.Clear();
        }
    }

    // Collected abstract snippets.
    abstracts = sbabstractlist.ToString();
    if (!string.IsNullOrEmpty(abstracts) && matchpatterns.Count > 0)
    {
        // Highest pattern score, plus a tenth of every other matched pattern's score.
        maxScore = matchpatterns.Max(x => x.Score ?? 50);
        appType = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
        maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
        maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
    }

    // Fallback: no snippet came out of the page, so score against the result description.
    if (string.IsNullOrEmpty(abstracts) && !string.IsNullOrEmpty(abs))
    {
        matchpatterns = patterns.Where(x => abs.Contains(x.Keyword)).ToList();
        if (matchpatterns.Count > 0)
        {
            maxScore = matchpatterns.Max(x => x.Score ?? 50);
            appType = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
            maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
            maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
        }
    }

    if (maxScore > 100)
    {
        maxScore = 100;
    }

    bool is_title_matched = title.GetLower().IsContains2(searchKeywords);
    bool is_abstr_matched = abs.IsContains2(searchKeywords);
    BaiduItemPart part = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract
        : is_title_matched ? BaiduItemPart.Title
        : is_abstr_matched ? BaiduItemPart.Abstract
        : BaiduItemPart.None;

    // Publish date: first date-looking string that sits in body text rather than
    // inside markup, script or comments.
    Regex reg = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");
    string time = "";
    foreach (Match candidate in reg.Matches(detailHtml))
    {
        if (string.IsNullOrEmpty(candidate.Value))
        {
            continue;
        }

        // Inspect the text following THIS match. Looking the value up again by string
        // cannot tell repeated occurrences of the same date apart.
        string following = detailHtml.Substring(candidate.Index + candidate.Length);
        int nextOpen = following.IndexOf('<');
        int nextClose = following.IndexOf('>');
        int nextQuote = following.IndexOf('"');

        // Body text is followed by an opening '<' before any '>' or '"'.
        // IndexOf yields -1 for "not found", which must not take part in the ordering test.
        bool isBodyText = nextOpen >= 0
            && (nextClose < 0 || nextOpen < nextClose)
            && (nextQuote < 0 || nextOpen < nextQuote);
        if (isBodyText)
        {
            time = candidate.Value;
            break;
        }
    }

    IW2S_level1link l1 = new IW2S_level1link
    {
        UsrId = tsk.UsrId,
        Domain = domain,
        TopDomain = GetLevel1Domain(domain),
        Keywords = string.Format("{0}", tsk.CommendKeyword),
        LinkUrl = realUrl,
        MatchAt = (byte)part,
        Html = detailHtml,
        AppType = appType,
        BizId = IDHelper.GetGuid("{0}/{1}".FormatStr(realUrl, tsk._id.ToString())),
        SearchkeywordId = tsk._id.ToString(),
        CreatedAt = DateTime.UtcNow.AddHours(8),
        Description = abs,
        Title = title,
        Score = maxScore,
        Abstract = abstracts,
        IsMarket = isMarket,
        ProjectId = tsk.ProjectId,
        PublishTime = time,
        AlternateFields = "0",
        Rank = rank
    };
    if (baiduVStar.HasValue)
    {
        l1.BaiduVStar = baiduVStar.Value;
    }

    botmng.save_level1_links(new List<IW2S_level1link> { l1 }, tsk, excludedKeywords);
}
/// <summary>
/// Persists the shops and items found in <paramref name="listings"/> and bumps the
/// link/shop counters of the task record and the task.
/// </summary>
/// <remarks>
/// Listings are modified in place: missing <c>ShopName</c>, <c>BotShopID</c> and
/// <c>BotItemID</c> values are filled in. Shops and items whose ids are already stored
/// are not inserted again; items that already exist are handed to
/// <c>update_level1_links</c> instead.
/// </remarks>
/// <param name="listings">Scraped listings to store.</param>
/// <param name="botType">Passed through to <c>update_level1_links</c>.</param>
/// <param name="recordId">Id (ObjectId string) of the <c>FreeTaskRecord</c> whose counters are updated.</param>
/// <param name="tsk">Task the listings belong to; its <c>_id</c> is part of the generated shop/item ids.</param>
void SaveResult(List<XListing> listings, BotTypes botType, string recordId, FreeTask tsk)
{
    // ---- shops ----------------------------------------------------------
    foreach (var x in listings)
    {
        // Listings without a shop fall back to the site itself as the shop,
        // except on sites whose name contains "taobao" or "alibaba".
        if (string.IsNullOrEmpty(x.ShopName) && !string.IsNullOrEmpty(x.SiteName) && !x.BotShopID.HasValue && !x.SiteName.ToLower().Contains("taobao") && !x.SiteName.ToLower().Contains("alibaba"))
        {
            x.ShopName = x.SiteName;
            x.BotShopID = x.SiteID;
        }
        if (!x.BotShopID.HasValue && !string.IsNullOrEmpty(x.ShopName))
        {
            x.BotShopID = IDHelper.GetGuid(string.Format("{0},{1},{2},{3}", x.ShopName, x.SiteName, tsk._id, x.ShopID));
        }
    }

    List<XListing> shopList = listings.DistinctBy(x => x.BotShopID);

    var shopcol = MongoDBHelper.Instance.Get_FreeBotShop();
    var builder = Builders<FreeBotShop>.Filter;
    List<Guid?> BotShopID = shopList.Select(x => x.BotShopID).ToList();
    var existsshop_objs = shopcol.Find(builder.In(x => x.Shop_id, BotShopID)).Project(x => x.Shop_id).ToList();

    // Hash set keeps the membership test below O(1) per listing.
    var exists_ids = new HashSet<Guid?>();
    foreach (var storedId in existsshop_objs)
    {
        exists_ids.Add(storedId);
    }
    if (exists_ids.Count > 0)
    {
        shopList = shopList.Where(x => !exists_ids.Contains(x.BotShopID)).ToList();
    }

    // NOTE(review): returning here also skips saving items when every shop is already
    // stored. Kept as-is because the counters below rely on dt being set - confirm intent.
    if (shopList == null || shopList.Count == 0)
    {
        return;
    }

    List<FreeBotShop> dt = GetShopList(shopList);
    shopcol.InsertMany(dt);
    log("to save bot_shops");
    log("Done");

    // ---- items ----------------------------------------------------------
    foreach (var x in listings)
    {
        if (!x.BotItemID.HasValue && !string.IsNullOrEmpty(x.ItemName))
        {
            x.BotItemID = IDHelper.GetGuid(string.Format("{0},{1},{2},{3}", x.ItemName, x.SiteName, tsk._id, x.ItemID));
        }
    }

    List<XListing> distinctItems = listings.DistinctBy(x => x.BotItemID);

    var col = MongoDBHelper.Instance.Get_FreeBotItem();
    var itemBuilder = Builders<FreeBotItem>.Filter;
    List<Guid?> BotItemID = distinctItems.Select(x => x.BotItemID).ToList();
    var exists_objs = col.Find(itemBuilder.In(x => x.ItemId, BotItemID)).Project(x => x.ItemId).ToList();

    var existsitem_ids = new HashSet<Guid?>();
    foreach (var storedId in exists_objs)
    {
        existsitem_ids.Add(storedId);
    }

    // Split into items to insert and items that already exist. The already-existing
    // ones were previously selected from an empty list, so nothing was ever updated.
    List<XListing> itemList = distinctItems.Where(x => !existsitem_ids.Contains(x.BotItemID)).ToList();
    List<XListing> updatelinks = distinctItems.Where(x => existsitem_ids.Contains(x.BotItemID)).ToList();

    update_level1_links(updatelinks, botType, recordId, tsk);

    if (itemList.Count == 0)
    {
        return;
    }

    var itemdt = GetItemList(itemList);
    col.InsertMany(itemdt);

    // ---- counters -------------------------------------------------------
    // $inc adds the new counts server-side in a single update, so no prior read is
    // needed; a missing record/task simply matches nothing instead of failing on a
    // null document.
    var counterIncrement = new UpdateDocument
    {
        { "$inc", new QueryDocument { { "LinksNum", itemdt.Count }, { "ShopsNum", dt.Count } } }
    };
    MongoDBHelper.Instance.Get_FreeTaskRecord().UpdateOne(new QueryDocument { { "_id", new ObjectId(recordId) } }, counterIncrement);
    MongoDBHelper.Instance.Get_FreeTask().UpdateOne(new QueryDocument { { "_id", itemdt[0].taskId } }, counterIncrement);

    log("to save listings");
    log("Done");
}
/// <summary>
/// Crawls result pages starting at <paramref name="link"/>, builds one
/// <c>IW2S_Bing_level1link</c> per result entry (including the downloaded target
/// page) and saves them page by page through <c>SaveResult</c>.
/// </summary>
/// <remarks>
/// The loop runs while <c>quried_pages &lt;= 10</c>, and stops early when a page cannot
/// be downloaded, reports no results, cannot be split into entries, or when more than
/// three consecutive pages produced no entry. The method sleeps for random intervals
/// between requests.
/// </remarks>
/// <param name="link">URL of the first result page.</param>
/// <param name="searchTsk">Task supplying keyword, commend keyword, user id and project id.</param>
/// <returns>
/// The working list. It is cleared after every saved page, so callers should not
/// expect it to contain the crawled links.
/// </returns>
public List<IW2S_Bing_level1link> GetLinks(string link, IW2S_Bing_BaiduCommend searchTsk)
{
    List<IW2S_Bing_level1link> result = new List<IW2S_Bing_level1link>();
    int nohist_pages = 0;
    int quried_pages = 0;
    int fanye = 0;

    while (!string.IsNullOrEmpty(link) && quried_pages <= 10)
    {
        log(link);
        CookieContainer cc = new CookieContainer();
        Encoding enc = null;
        CookieCollection cookiesColl = new CookieCollection();
        CookieCollection cookieCollection = new CookieCollection();
        string Rurl = "http://cn.bing.com/";
        string cookie = "";

        // Request the home page first; only the cookies it returns are carried over
        // to the search request, the response body is not used.
        TaobaoWebHelper.GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;

        Thread.Sleep(new Random().Next(2000, 5000));

        var html = get_html(link, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
        cookiesColl = cookieCollection;
        if (html == null)
        {
            break;
        }
        if (html.Contains("没有找到搜索内容!"))
        {
            break;
        }

        var tags = html.SubAfter("body").SubBefore("/body").SplitWith("b_content");
        // Guard the index below: an empty split would otherwise throw.
        if (tags == null || tags.Length == 0)
        {
            log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
            break;
        }

        var tagsD = tags[tags.Length - 1].SubAfter("搜索结果").SubBefore("</ol>").ToString().SplitWith("</li>");
        if (tagsD == null || tagsD.Length == 0)
        {
            log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
            break;
        }

        bool nohit = true;
        foreach (var tag in tagsD)
        {
            if (!tag.Contains("h2"))
            {
                continue;
            }

            var a = tag.SubAfter("h2").SubAfter("a");
            string title = RemoveInivalidChar(a.RemoveSpace().GetLower().SubBefore("</h2>").GetTxtFromHtml2().RemoveSpace().GetLower());
            string href = a.GetFirstHref2();
            if (string.IsNullOrEmpty(title) && string.IsNullOrEmpty(href))
            {
                continue;
            }

            // href may be null while the title is present; avoid dereferencing null.
            href = (href ?? string.Empty).Replace("amp;", "");

            string abs = RemoveInivalidChar(tag.SubAfter("<p>").SubBefore("</p").GetTxtFromHtml2().RemoveSpace().GetLower());
            string domain = GetDomain(href);
            int maxScore = 0;
            byte appType = 0;

            // Skip entries that carry neither title nor description. The combined text
            // below always contains the separating comma, so it cannot be used for this test.
            if (string.IsNullOrEmpty(title) && string.IsNullOrEmpty(abs))
            {
                continue;
            }
            string txt = "{0},{1}".FormatStr(title, abs);

            Thread.Sleep(new Random().Next(8000, 20000));

            var htmldetail = "";
            try
            {
                htmldetail = get_Detailehtml(href, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
            }
            catch (Exception)
            {
                // Best effort: the entry is still stored, without page content, and its
                // link is prefixed with the site root.
                // NOTE(review): presumably meant for relative links; no retry is made.
                href = "http://cn.bing.com" + href;
            }

            bool is_title_matched = title.GetLower().IsContains2(searchTsk.Keyword.ToLower(), searchTsk.CommendKeyword.ToLower());
            bool is_abstr_matched = abs.GetLower().IsContains2(searchTsk.Keyword.GetLower(), searchTsk.CommendKeyword.GetLower());
            BaiduItemPart part = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract
                : is_title_matched ? BaiduItemPart.Title
                : is_abstr_matched ? BaiduItemPart.Abstract
                : BaiduItemPart.None;

            bool is_itm_title_matched = txt.GetLower().IsContains(searchTsk.Keyword.GetLower());
            bool is_bus_matched = txt.GetLower().IsContains2(searchTsk.CommendKeyword.GetLower());

            IW2S_Bing_level1link l1 = new IW2S_Bing_level1link
            {
                UsrId = searchTsk.UsrId,
                Domain = domain,
                TopDomain = GetLevel1Domain(domain),
                Keywords = string.Format("{0} + {1}", searchTsk.Keyword, searchTsk.CommendKeyword),
                LinkUrl = href,
                MatchAt = (byte)part,
                Html = htmldetail,
                MatchType = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                AppType = appType,
                BizId = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(href, searchTsk.UsrId, searchTsk.Keyword)),
                SearchkeywordId = searchTsk._id.ToString(),
                CreatedAt = DateTime.UtcNow.AddHours(8),
                Description = abs,
                Title = title,
                Score = maxScore,
                Abstract = abs,
                ProjectId = searchTsk.ProjectId
            };

            // Fixed scores: both keywords 90, commend keyword only 80, keyword only 50,
            // neither leaves the initial 0.
            if (is_bus_matched && is_itm_title_matched)
            {
                l1.Score = 80 + 10;
            }
            else if (is_bus_matched)
            {
                l1.Score = 80;
            }
            else if (is_itm_title_matched)
            {
                l1.Score = 50;
            }

            result.Add(l1);
            nohit = false;
            nohist_pages = 0;
        }

        if (nohit)
        {
            nohist_pages++;
        }
        // Give up after more than three consecutive pages without any entry.
        if (nohist_pages > 3)
        {
            break;
        }

        quried_pages++;
        pages++;

        // The next page URL is always built from the result offset; a link scraped from
        // the page was computed here before but immediately overwritten, so it is gone.
        fanye = fanye + 10;
        link = "http://cn.bing.com/search?q={0}&first={1}&FORM=PERE3".FormatStr(searchTsk.Keyword, fanye);

        SaveResult(result);
        result.Clear();

        Thread.Sleep(new Random().Next(8000, 15000));
    }

    return result;
}
/// <summary>
/// Crawls result pages starting at <paramref name="link"/>. For every result entry
/// the target page is downloaded, keyword-centred abstracts are extracted and scored,
/// and the resulting <c>level1link</c> is saved through <c>BotMng</c>.
/// </summary>
/// <remarks>
/// The loop runs while <c>quried_pages &lt;= 60</c> and stops early when a page cannot
/// be downloaded or split into entries, when no next-page link is found, or when more
/// than three consecutive pages produced no saved entry. Entries are skipped when
/// their page cannot be downloaded or does not contain both the task keyword and one
/// of the business keywords.
/// </remarks>
/// <param name="link">URL of the first result page.</param>
/// <param name="tsk">Search task supplying keyword, user id and record id.</param>
/// <param name="businessKeyword">Business keyword this crawl was started for; used for <c>MatchType</c> and <c>Keywords</c>.</param>
/// <param name="businessKeywords">All business keywords; searched for in the downloaded pages.</param>
/// <param name="excludedKeywords">Passed through unchanged to <c>save_level1_links</c>.</param>
void GetLinks(string link, searchkeyword tsk, keyword businessKeyword, List<keyword> businessKeywords, List<keyword> excludedKeywords)
{
    BotMng botmng = BotMng.Instance;
    string[] searchKeywords = tsk.Keyword.GetLower().RemoveSpace().Split(';');
    List<KeywordScore> patterns = businessKeywords.Select(x => new KeywordScore { Keyword = x.Txt, Score = x.Score, BizType = x.BizType }).ToList();
    string[] bizPatterns = businessKeywords.Select(x => x.Txt).ToArray();
    // The task keyword itself takes part in matching with a fixed score of 50.
    patterns.Add(new KeywordScore { Keyword = tsk.Keyword, Score = 50, BizType = 0 });

    int nohist_pages = 0;
    int quried_pages = 0;

    while (!string.IsNullOrEmpty(link) && quried_pages <= 60)
    {
        log(link);
        var html = get_html(link);
        if (html == null)
        {
            break;
        }

        var tags = html.SubAfter("content_left").SplitWith("c-container");
        if (tags == null || tags.Length == 0)
        {
            log("BLOCKED " + tsk.Keyword);
            break;
        }

        bool nohit = true;
        foreach (var tag in tags)
        {
            var a = tag.SubAfter("h3").SubAfter("a");
            string title = RemoveInivalidChar(a.RemoveSpace().GetLower().SubBefore("</h3>").GetTxtFromHtml2().RemoveSpace().GetLower());
            string href = a.GetFirstHref2();
            string abs = RemoveInivalidChar(tag.SubAfter("abstract").SubBefore("</div").GetTxtFromHtml2().RemoveSpace().GetLower());
            string domain = tag.SubLastStringAfter("\"f13").SubBefore("</span").GetTxtFromHtml2();
            domain = GetDomain(domain);
            int maxScore = 0;

            // Entries with neither title nor description are skipped.
            string txt = "{0}{1}".FormatStr(title, abs);
            if (string.IsNullOrEmpty(txt))
            {
                continue;
            }

            string realUrl = null, detailHtml = null, abstracts = null;
            byte appType = 0;

            // Download the target page; Item1 is used as the resolved URL, Item2 as the page HTML.
            if (!string.IsNullOrWhiteSpace(href))
            {
                var tuplehtml = get_htmlUrl(href);
                if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
                {
                    realUrl = tuplehtml.Item1;
                }
                if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
                {
                    detailHtml = tuplehtml.Item2;
                }
                if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                {
                    domain = GetDomain(realUrl);
                }
            }

            // The page is a script-driven redirect stub: follow its first link once.
            if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
            {
                var gourl = detailHtml.GetFirstHref2();
                if (!string.IsNullOrEmpty(gourl))
                {
                    var tuplehtml = get_htmlUrl(gourl);
                    if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
                    {
                        realUrl = tuplehtml.Item1;
                    }
                    if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
                    {
                        detailHtml = tuplehtml.Item2;
                    }
                    if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                    {
                        domain = GetDomain(realUrl);
                    }
                }
            }

            if (string.IsNullOrEmpty(realUrl))
            {
                realUrl = href;
            }

            if (string.IsNullOrEmpty(detailHtml))
            {
                continue;
            }
            // The page must mention the task keyword and at least one business keyword.
            if (!detailHtml.Contains(tsk.Keyword) || !detailHtml.IsContains2(bizPatterns))
            {
                continue;
            }

            List<KeywordScore> matchpatterns = new List<KeywordScore>();
            StringBuilder sbabstracts = new StringBuilder();
            List<string> abstractlist = new List<string>();
            StringBuilder sbabstractlist = new StringBuilder();

            foreach (KeywordScore pattern in patterns)
            {
                string[] splitDetailHtmls = detailHtml.SplitWith(pattern.Keyword);
                if (splitDetailHtmls.Length > 1)
                {
                    matchpatterns.Add(pattern);
                }

                StringBuilder sbpatternStr = new StringBuilder();
                for (int i = 0; i < splitDetailHtmls.Length - 1; i++)
                {
                    string splitDetailHtml1 = splitDetailHtmls[i];
                    // NOTE(review): the last occurrence gets no trailing context because of
                    // the "Length - 2" bound (i + 1 is always a valid index here) - confirm intent.
                    string splitDetailHtml2 = i < splitDetailHtmls.Length - 2 ? splitDetailHtmls[i + 1] : "";

                    // Walk backwards from the keyword to the nearest "before" delimiter that
                    // is not preceded by a character from split_num_commas; collected reversed.
                    for (int j = splitDetailHtml1.Length - 1; j >= 0; j--)
                    {
                        if (split_bef_commas.Contains(splitDetailHtml1[j]) && j - 1 >= 0 && !split_num_commas.Contains(splitDetailHtml1[j - 1]))
                        {
                            break;
                        }
                        sbpatternStr.Append(splitDetailHtml1[j]);
                    }
                    // Restore reading order of the leading context.
                    for (int q = sbpatternStr.Length - 1; q >= 0; q--)
                    {
                        sbabstracts.Append(sbpatternStr[q]);
                    }
                    sbabstracts.Append(pattern.Keyword);
                    sbpatternStr.Clear();

                    // Walk forwards from the keyword to the nearest "after" delimiter that
                    // is not followed by a character from split_num_commas.
                    for (int j = 0; j < splitDetailHtml2.Length; j++)
                    {
                        if (split_aft_commas.Contains(splitDetailHtml2[j]) && j + 1 < splitDetailHtml2.Length && !split_num_commas.Contains(splitDetailHtml2[j + 1]))
                        {
                            break;
                        }
                        sbpatternStr.Append(splitDetailHtml2[j]);
                    }
                    sbabstracts.Append(sbpatternStr);
                    sbpatternStr.Clear();

                    string tmpsbabstracts = sbabstracts.ToString();
                    tmpsbabstracts = BaiduQuery.RemoveInivalidChar(tmpsbabstracts.GetTxtFromHtml2().RemoveSpace().GetLower());
                    // Keep each distinct snippet once, separated by a space.
                    if (!abstractlist.Contains(tmpsbabstracts))
                    {
                        abstractlist.Add(tmpsbabstracts);
                        sbabstractlist.Append(tmpsbabstracts).Append(" ");
                    }
                    sbabstracts.Clear();
                }
            }

            abstracts = sbabstractlist.ToString();
            if (!string.IsNullOrEmpty(abstracts) && matchpatterns.Count > 0)
            {
                // Highest pattern score, plus a tenth of every other matched pattern's score.
                maxScore = matchpatterns.Max(x => x.Score ?? 50);
                appType = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
                maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
                maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
            }

            // Fallback: no snippet came out of the page, so score against the description.
            if (string.IsNullOrEmpty(abstracts) && !string.IsNullOrEmpty(abs))
            {
                matchpatterns = patterns.Where(x => abs.Contains(x.Keyword)).ToList();
                // Max() throws on an empty sequence, which would abort the whole crawl.
                if (matchpatterns.Count > 0)
                {
                    maxScore = matchpatterns.Max(x => x.Score ?? 50);
                    appType = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
                    maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
                    maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
                }
            }

            if (maxScore > 100)
            {
                maxScore = 100;
            }

            bool is_bus_matched = txt.IsContains2(businessKeyword.Txt);
            bool is_title_matched = title.GetLower().IsContains2(searchKeywords);
            bool is_abstr_matched = abs.IsContains2(searchKeywords);
            BaiduItemPart part = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract
                : is_title_matched ? BaiduItemPart.Title
                : is_abstr_matched ? BaiduItemPart.Abstract
                : BaiduItemPart.None;
            bool is_itm_title_matched = txt.GetLower().IsContains2(searchKeywords);

            level1link l1 = new level1link
            {
                UsrId = tsk.UsrId,
                Domain = domain,
                TopDomain = GetLevel1Domain(domain),
                Keywords = string.Format("{0} + {1}", tsk.Keyword, businessKeyword.Txt),
                LinkUrl = realUrl,
                MatchAt = (byte)part,
                Html = detailHtml,
                MatchType = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                AppType = appType,
                BizId = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(realUrl, tsk.UsrId, tsk.Keyword)),
                SearchkeywordId = tsk._id.ToString(),
                CreatedAt = DateTime.UtcNow.AddHours(8),
                Description = abs,
                Title = title,
                Score = maxScore,
                Abstract = abstracts
            };

            botmng.save_level1_links(new List<level1link> { l1 }, tsk, excludedKeywords);
            nohit = false;
            nohist_pages = 0;
        }

        if (nohit)
        {
            nohist_pages++;
        }
        // Give up after more than three consecutive pages without any saved entry.
        if (nohist_pages > 3)
        {
            break;
        }

        quried_pages++;
        pages++;

        // Next page: last link between the current-page marker and the "next page" label;
        // relative links are made absolute.
        link = html.SubAfter("fkfk_cur").SubBefore("下一页").GetLastHref2();
        if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
        {
            if (link.IsStartWith("/"))
            {
                link = link.SubAfter("/");
            }
            link = "http://www.baidu.com/".GetContact(link);
        }
    }
}