/// <summary>
/// Inserts one monitoring record for a spider run into the <c>spiderinfo</c> table,
/// trying up to three times, and writes the run's counters to the console and the
/// daily statistics log after every attempt.
/// </summary>
/// <param name="spiderinfo">Identifying fields and counters of the spider run to record.</param>
/// <returns>
/// <c>true</c> if an insert attempt returned a non-negative result; <c>false</c> if all
/// three attempts returned a negative result.
/// </returns>
public bool Control(Spiderinfo spiderinfo)
{
    const int maxAttempts = 3;
    bool stored = false;

    #region Spider monitoring
    // NOTE(review): the statement is assembled by plain string formatting, so a quote inside
    // any text field (e.g. Ext) ends up verbatim in the SQL. Confirm whether MySqlHelp offers
    // a parameterized overload.
    string insertSql = string.Format(
        "insert ignore into spiderinfo(SpiderProgramName,TaskName,SiteName,TemplateName,StartDT,EndDT,RequestMount,CatchMount,StoreMount,ProxyRequestMount,ProxyPercent,VertifyMount,VertifyPercent,TotalMount,TotalLength,IPAddress,Ext) values('{0}','{1}','{2}','{3}','{4}','{5}',{6},{7},{8},{9},{10},{11},{12},{13},{14},'{15}','{16}')",
        spiderinfo.SpiderProgramName,
        spiderinfo.TaskName,
        spiderinfo.SiteName,
        spiderinfo.TemplateName,
        spiderinfo.StartDT,
        spiderinfo.EndDT,
        spiderinfo.RequestMount,
        spiderinfo.CatchMount,
        spiderinfo.StoreMount,
        spiderinfo.ProxyRequestMount,
        spiderinfo.ProxyPercent,
        spiderinfo.VertifyMount,
        spiderinfo.VertifyPercent,
        spiderinfo.TotalMount,
        spiderinfo.TotalLength,
        spiderinfo.IPAddress,
        spiderinfo.Ext);

    // Stop as soon as one attempt succeeds; the summary is still logged for that attempt.
    for (int attempt = 0; attempt < maxAttempts && !stored; attempt++)
    {
        int result = MySqlHelp.Insert(AliyunConn, insertSql, spiderinfo.TaskName + "插入数据失败记录.txt");
        stored = result >= 0;

        string summary = string.Format(
            "爬虫:{0} 请求量:{1} 抓取量:{2} 入库量:{3} 代理请求量:{4} 代理成功率:{5} 打码次数:{6} 成功打码:{7} 打码成功率:{8} 总抓取量:{9} 时间:{10}",
            spiderinfo.TaskName,
            spiderinfo.RequestMount,
            spiderinfo.CatchMount,
            spiderinfo.StoreMount,
            spiderinfo.ProxyRequestMount,
            spiderinfo.ProxyPercent,
            spiderinfo.VertifyMount,
            spiderinfo.VertifyTrueMount,
            spiderinfo.VertifyPercent,
            spiderinfo.TotalMount,
            DateTime.Now);

        Console.WriteLine("【{0}】{1}", result, summary);
        CLog.DiaryLog(summary, spiderinfo.TaskName + "(每半小时统计一次)_" + DateTime.Now.ToString("yyyyMMdd") + ".txt");
    }
    #endregion

    return stored;
}
/// <summary>
/// Executes the given task sources in parallel batches.
/// For every task a <c>MyParallel</c> request is prepared and queued; once a batch is full
/// (or the last task is reached) the queued requests are run with <c>Parallel.Invoke</c>,
/// the shared task lists are reset, the thread sleeps for the configured pause and the
/// console title is refreshed with the current counters.
/// </summary>
/// <param name="allInfoUrls">Task sources to execute.</param>
public void DoTask(List<TaskUrlConfig> allInfoUrls)
{
    List<Action> lss = new List<Action>();

    // Hour slot for which the periodic mail was already sent during this call. Without it
    // every loop iteration that falls into the qualifying minute would send another mail.
    DateTime lastMailSlot = DateTime.MinValue;

    for (int i = 0; i < allInfoUrls.Count; i++)
    {
        // Count every request handed to the spider.
        StaticConfig.Spiderinfo.RequestMount++;
        StaticConfig.CountConfig.Itotal++;

        MyParallel grabAllInfo = new MyParallel
        {
            // Task to process.
            TaskUrl = allInfoUrls[i],
            // Proxy setup; the local proxy is the default. To go through Abuyun either run a
            // local Fiddler proxy that forwards to it, or point Proxy at Abuyun directly.
            ProxyConfig =
            {
                // Local proxy (e.g. Fiddler forwarding to Abuyun).
                Proxy = new WebProxy("192.168.1.1"),
                // Direct Abuyun alternative: new WebProxy("http://http-dyn.abuyun.com:9020")
                ProxyUser = "******",
                ProxyPass = "******"
            },
            // Account cookie; empty because no account is needed here.
            CkConfig =
            {
                Cookie = ""
            }
        };
        lss.Add(new MyParallel(grabAllInfo).MyRequest);

        // The batch size comes from configuration. Clamp it to at least 1: a value of 0 would
        // make the modulo throw DivideByZeroException and a negative one would never match.
        var batchSize = Math.Max(1, StaticConfig.SpConfig.ActionLssNum);
        bool batchFull = i % batchSize == batchSize - 1;
        bool lastTask = i == allInfoUrls.Count - 1;

        if (batchFull || lastTask)
        {
            try
            {
                // Run the queued requests concurrently.
                System.Threading.Tasks.Parallel.Invoke(lss.ToArray());
                Console.WriteLine("***********************************************************");
                // Presumably recycles finished/new tasks - confirm against GcTask.
                GcTask(StaticConfig.TaskUrls, StaticConfig.AddTaskUrls);
            }
            catch (Exception ex)
            {
                // Parallel.Invoke reports failures as an AggregateException whose own message
                // is generic, so log the messages of the underlying exceptions instead.
                string detail = ex.Message;
                AggregateException aggregate = ex as AggregateException;
                if (aggregate != null)
                {
                    List<string> causes = new List<string>();
                    foreach (Exception inner in aggregate.Flatten().InnerExceptions)
                    {
                        causes.Add(inner.Message);
                    }

                    if (causes.Count > 0)
                    {
                        detail = string.Join(" | ", causes.ToArray());
                    }
                }

                Console.WriteLine("并发出错:{0}>>>{1}", detail, DateTime.Now);
                CLog.DiaryLog(detail, $"\\{StaticConfig.Spiderinfo.TaskName}并发出错\\{StaticConfig.Spiderinfo.TaskName}并发出错_{DateTime.Now:yyyyMMdd}.txt");
                // Back off for a minute after a failed batch.
                System.Threading.Thread.Sleep(new TimeSpan(0, 1, 0));
            }

            // Reset the per-batch state.
            lss = new List<Action>();
            StaticConfig.TaskUrls = new List<TaskUrlConfig>();
            StaticConfig.AddTaskUrls = new List<TaskUrlConfig>();
            System.Threading.Thread.Sleep(new TimeSpan(0, 0, StaticConfig.SpConfig.ActionSleepTime));
            System.GC.Collect();
            // Presumably sends a mail notification about request errors - confirm against FixException.
            FixException(StaticConfig.SpConfig.ActionEmail, StaticConfig.Spiderinfo.TaskName, StaticConfig.CountConfig.ErrorCodeStr);
            Console.WriteLine("***********************************************************");
        }

        // Status line shown in the console title so progress is visible at a glance.
        StaticConfig.CountConfig.EMessage = string.Format(
            "{0}[{1:MMddHHmm}]【{2}/{3}>{4:0.000}-{5}】[{6}/{7}]",
            StaticConfig.Spiderinfo.TaskName,
            StaticConfig.SpConfig.DateKs,
            StaticConfig.CountConfig.Itrue,
            StaticConfig.CountConfig.Itotal,
            ((double)StaticConfig.CountConfig.Itrue / StaticConfig.CountConfig.Itotal),
            StaticConfig.CountConfig.Ifasle,
            i,
            allInfoUrls.Count);
        Console.Title = StaticConfig.CountConfig.EMessage;

        #region Notification mail (every 8 hours)
        // Optional feedback mail at minute 0 of every 8th hour. The clock is read once so the
        // hour and minute belong to the same instant, and the mail is sent at most once per
        // hour slot within this call.
        DateTime now = DateTime.Now;
        if (now.Hour % 8 == 0 && now.Minute == 0)
        {
            DateTime slot = new DateTime(now.Year, now.Month, now.Day, now.Hour, 0, 0);
            if (slot != lastMailSlot)
            {
                lastMailSlot = slot;
                MessageEail("每8个小时反馈");
            }
        }
        #endregion
    }
}
/// <summary>
/// Starts a thread that records the spider's counters in the <c>spiderinfo</c> table.
/// While <c>Flg</c> is true a record is written whenever the current minute is a multiple
/// of 30; when <c>Flg</c> turns false after having been true, one more record is written
/// immediately. After each write the per-interval counters are reset to zero.
/// The thread loops forever and is started before this method returns.
/// </summary>
public void AutomaticCount()
{
    // 0 = a write is still owed for the current run, 1 = nothing owed.
    // NOTE(review): Flg presumably means "spider is running" - confirm with its owner.
    int isign = 1;

    #region Automatic spider statistics
    Thread rdsTh = new Thread(() =>
    {
        while (true)
        {
            if (Flg)
            {
                isign = 0;
            }

            if (DateTime.Now.Minute % 30 == 0 || (!Flg && isign == 0))
            {
                // Write on the half-hour mark while Flg is set, or once right after Flg
                // turned false for the first time.
                if (Flg || (!Flg && isign == 0))
                {
                    // Start from the in-memory total and replace it with the latest stored
                    // total when the lookup succeeds.
                    long totalMount = TotalMount;
                    try
                    {
                        string sql = string.Format(
                            "select TotalMount from spiderinfo where SpiderProgramName='{0}' and TaskName='{1}' and SiteName='{2}' and TemplateName='{3}' order by id DESC limit 0,1",
                            SpiderProgramName, TaskName, SiteName, TemplateName);
                        totalMount = long.Parse(
                            SpiderHelp.SaveModule.MySqlHelp.Select(AliyunConn, sql).Rows[0][0].ToString());
                    }
                    catch (Exception ex)
                    {
                        // Best effort: keep the in-memory total, but record why the lookup
                        // failed instead of silently discarding the error.
                        CLog.DiaryLog("查询历史总抓取量失败:" + ex.Message,
                            TaskName + "_数据监控失败记录_" + DateTime.Now.ToString("yyyyMMdd") + ".txt");
                    }

                    TotalMount = totalMount + CatchMount;
                    EndDT = DateTime.Now;
                    ProxyRequestMount = RequestMount;

                    // Ratios are only computed when the counters are consistent; otherwise
                    // the previous value is kept.
                    if (CatchMount > 0 && CatchMount <= RequestMount)
                    {
                        ProxyPercent = Convert.ToDecimal(((double)CatchMount / RequestMount).ToString("0.000"));
                    }

                    if (VertifyTrueMount > 0 && VertifyTrueMount <= VertifyMount)
                    {
                        VertifyPercent = Convert.ToDecimal(((double)VertifyTrueMount / VertifyMount).ToString("0.000"));
                    }

                    string rdsSql = string.Format(
                        "insert ignore into spiderinfo(SpiderProgramName,TaskName,SiteName,TemplateName,StartDT,EndDT,RequestMount,CatchMount,StoreMount,ProxyRequestMount,ProxyPercent,VertifyMount,VertifyPercent,TotalMount,TotalLength,IPAddress,Ext) values ('{0}','{1}','{2}','{3}','{4}','{5}',{6},{7},{8},{9},{10},{11},{12},{13},{14},'{15}','{16}');",
                        SpiderProgramName, TaskName, SiteName, TemplateName, StartDT, EndDT, RequestMount,
                        CatchMount, StoreMount, ProxyRequestMount, ProxyPercent, VertifyMount, VertifyPercent,
                        TotalMount, TotalLength, IPAddress, Ext);

                    // Up to three insert attempts; the counters are logged after each one.
                    int itryMax = 3;
                    while (itryMax > 0)
                    {
                        int iflg = SpiderHelp.SaveModule.MySqlHelp.Insert(AliyunConn, rdsSql, TaskName + "_数据监控失败记录");
                        // Once Flg is false this disarms the immediate-write path until
                        // Flg becomes true again.
                        isign = Flg ? 0 : 1;
                        if (iflg >= 0)
                        {
                            itryMax = 0;
                        }
                        else
                        {
                            itryMax--;
                        }

                        string checkInfo = string.Format(
                            "爬虫:{0} 请求量:{1} 抓取量:{2} 入库量:{3} 代理请求量:{4} 代理成功率:{5} 打码次数:{6} 成功打码:{7} 打码成功率:{8} 总抓取量:{9} 时间:{10}",
                            TaskName, RequestMount, CatchMount, StoreMount, ProxyRequestMount, ProxyPercent,
                            VertifyMount, VertifyTrueMount, VertifyPercent, TotalMount, DateTime.Now);
                        CLog.DiaryLog(checkInfo, TaskName + "(每半小时统计一次)_" + DateTime.Now.ToString("yyyyMMdd") + ".txt");
                    }

                    // Begin a new statistics interval.
                    StartDT = DateTime.Now;
                    VertifyPercent = 0;
                    VertifyMount = 0;
                    VertifyTrueMount = 0;
                    StoreMount = 0;
                    ProxyRequestMount = 0;
                    CatchMount = 0;
                    RequestMount = 0;
                    ProxyPercent = 0;
                    TotalLength = 0;

                    // Sleep past the current minute so the same half-hour mark is not
                    // written twice.
                    Thread.Sleep(new TimeSpan(0, 1, 1));
                }
                else
                {
                    Thread.Sleep(new TimeSpan(0, 0, 59));
                }
            }
            else
            {
                Thread.Sleep(new TimeSpan(0, 0, 59));
            }
        }
    });
    rdsTh.Start();
    #endregion
}
/// <summary>
/// Builds the initial list of task sources (one per page) and stores them in the action
/// table with multi-row <c>INSERT IGNORE</c> statements of up to 100 rows each. Every
/// statement is attempted up to three times. Any exception is written to the console
/// and the diary log instead of being propagated.
/// </summary>
private void TaskAdd()
{
    try
    {
        const int lastPage = 214;
        const int batchSize = 100;

        // One task per page, 1..lastPage inclusive.
        List<TaskUrlConfig> tasks = new List<TaskUrlConfig>();
        for (int page = 1; page <= lastPage; page++)
        {
            string pageUrl = $"XXX";
            tasks.Add(new TaskUrlConfig
            {
                CompanyName = "无",
                Uid = "无",
                Tab = "list",
                Url = pageUrl,
                Md5 = MyConvert.ToUserMd5(pageUrl),
                Method = "get",
                ICount = 0,
                IState = 0,
                Queue_time = DateTime.Now,
                Done_time = DateTime.Now
            });
        }

        Console.WriteLine($@"共计任务:【{tasks.Count}】>>>{DateTime.Now}");

        DateTime startedAt = DateTime.Now;
        string insertHead = $"INSERT IGNORE INTO {actionTable}(CompanyName,Uid,Tab,Url,Md5,Method,ICount,IState,Queue_time,Done_time) VALUES";

        // Value tuples collected for the statement currently being assembled.
        List<string> pendingRows = new List<string>();
        for (int index = 0; index < tasks.Count; index++)
        {
            TaskUrlConfig task = tasks[index];
            pendingRows.Add($"('{task.CompanyName}','{task.Uid}','{task.Tab}','{task.Url}','{task.Md5}','{task.Method}',{task.ICount},{task.IState},now(),now())");

            bool batchFull = index % batchSize == batchSize - 1;
            bool lastTask = index == tasks.Count - 1;
            if (batchFull || lastTask)
            {
                string statement = insertHead + string.Join(",", pendingRows.ToArray());

                // Retry until the insert reports a non-negative result, three times at most.
                int attemptsLeft = 3;
                while (attemptsLeft > 0)
                {
                    int result = spideBll.Insert(statement, $"{taskName}任务入库异常");
                    attemptsLeft = result >= 0 ? 0 : attemptsLeft - 1;
                    Console.WriteLine("入库【{0}】>>>{1}", result, DateTime.Now);
                }

                Console.WriteLine(@"**************************************************");
                pendingRows.Clear();
            }

            // Progress indicator in the console title.
            Console.Title = $@"{taskName}任务入库[{startedAt:MMddHHmm}]【{index}/{tasks.Count}】";
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine($@"{ex.Message}>>>{DateTime.Now}");
        CLog.DiaryLog(ex.Message, $"\\{taskName}任务源入库异常\\{actionTable}任务源入库异常_{DateTime.Now:yyyyMMdd}.txt");
    }
}