/// <summary>
/// Loads pending tasks (rows with <c>IState=0</c>) from the action table and maps each row
/// to a <see cref="TaskUrlConfig"/>.
/// </summary>
/// <param name="startNum">First value of the SQL <c>LIMIT</c> clause.</param>
/// <param name="endNum">Second value of the SQL <c>LIMIT</c> clause.</param>
/// <returns>
/// The mapped tasks. The list is empty when the query returns no rows or when every
/// attempt yields a null table.
/// </returns>
public List<TaskUrlConfig> GetTask(int startNum, int endNum)
{
    List<TaskUrlConfig> tasks = new List<TaskUrlConfig>();

    // A null table counts as a database failure and is retried; three attempts in total.
    for (int attemptsLeft = 3; attemptsLeft > 0;)
    {
        // The query can be adjusted to suit the crawl; "allinfo" means no filter on Tab.
        string sql = StaticConfig.SpConfig.ActionTab == "allinfo"
            ? $"SELECT * FROM {ActionTable} WHERE IState=0 LIMIT {startNum},{endNum}"
            : $"SELECT * FROM {ActionTable} WHERE Tab='{StaticConfig.SpConfig.ActionTab}' and IState=0 LIMIT {startNum},{endNum}";

        System.Data.DataTable table = spideBll.Select(sql);
        if (table == null)
        {
            attemptsLeft--;
            StaticConfig.ErrorCode.IdbError++;
            continue;
        }

        foreach (System.Data.DataRow row in table.Rows)
        {
            TaskUrlConfig task = new TaskUrlConfig
            {
                Id = int.Parse(row["Id"].ToString()),
                Uid = row["Uid"].ToString(),
                CompanyName = row["CompanyName"].ToString(),
                Tab = row["Tab"].ToString(),
                Method = row["Method"].ToString(),
                Url = row["Url"].ToString(),
                Md5 = row["Md5"].ToString(),
                ICount = int.Parse(row["ICount"].ToString()),
                IState = int.Parse(row["IState"].ToString())
            };

            try
            {
                task.Queue_time = Convert.ToDateTime(row["Queue_time"].ToString());
                task.Done_time = Convert.ToDateTime(row["Done_time"].ToString());
            }
            catch
            {
                // If either timestamp cannot be read, both fall back to the current time.
                task.Queue_time = DateTime.Now;
                task.Done_time = DateTime.Now;
            }

            tasks.Add(task);
        }

        // A non-null table (with or without rows) clears the database error counter and ends the retries.
        StaticConfig.ErrorCode.IdbError = 0;
        break;
    }

    return tasks;
}
/// <summary>
/// Requests the list page at <c>TaskUrl.Url</c>, queues every link found on it as a new
/// "base" task in <c>StaticConfig.AddTaskUrls</c>, and records the outcome of the current
/// task in <c>StaticConfig.TaskUrls</c>.
/// </summary>
/// <remarks>
/// <c>IState</c> values written by this method:
/// 1 = page recognised and links queued;
/// 2 = the "information does not exist" alert page was returned;
/// 3 = page recognised but no links were found, or a non-web exception occurred;
/// -1 = unrecognised page (its HTML is saved for inspection);
/// 500 / 403 / 404 = the matching HTTP status code.
/// An empty response body and any other web failure are only reported through
/// <c>WriteRequestResult</c>; the task is not added to <c>StaticConfig.TaskUrls</c>.
/// </remarks>
private void Request_List()
{
    lock (TaskUrl)
    {
        HttpWebResponse response = null;
        try
        {
            #region Request
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(TaskUrl.Url);
            request.Timeout = 10000;
            request.ReadWriteTimeout = 12000;
            request.KeepAlive = false;
            request.AllowAutoRedirect = false;
            request.ServicePoint.Expect100Continue = false; // speeds up loading
            request.ServicePoint.UseNagleAlgorithm = false; // disable Nagle's algorithm to speed up loading
            request.AllowWriteStreamBuffering = false;      // disable buffering to speed up loading
            request.Headers.Add("Upgrade-Insecure-Requests", @"1");
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36";
            request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
            request.Headers.Add("DNT", @"1");
            request.Headers.Set(HttpRequestHeader.AcceptEncoding, "gzip, deflate");
            request.Headers.Set(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8");
            request.Headers.Set(HttpRequestHeader.Cookie, CkConfig.Cookie);

            // The address 192.168.1.1 is skipped; presumably it is a "no proxy" placeholder - confirm.
            if (ProxyConfig.Proxy != null && ProxyConfig.Proxy.Address.Authority != "192.168.1.1")
            {
                request.Proxy = ProxyConfig.Proxy;
            }

            // TODO: proxy authentication added (2018-04-04 14:58:47).
            // NOTE(review): the credentials are applied to whatever proxy the request currently
            // holds, including the default one when ProxyConfig.Proxy was skipped above - confirm intended.
            request.Proxy.Credentials = new NetworkCredential(ProxyConfig.ProxyUser, ProxyConfig.ProxyPass);
            response = (HttpWebResponse)request.GetResponse();
            #endregion

            // TODO: standardised handling of the 200 status code.
            StaticConfig.CountConfig.ErrorCodeStr = StaticConfig.ErrorCode.HandleMethod(response);
            string html = response.ResponseHtml();

            // Parse once and reuse the document for both XPath queries.
            HtmlNode document = MyConvert.ToHtmlNode(html);

            // The page counts as a valid list page only if this marker node is present.
            HtmlNode pageNode = document.SelectSingleNode("//*[@class='chushou-shangpu cl zk']//*[@class='toubu-chushou-shangpu cl']");
            if (pageNode != null)
            {
                StaticConfig.CountConfig.Itrue++;
                HtmlNodeCollection pageNodes = document.SelectNodes("//*[@class='wen fr']//a");
                if (pageNodes != null)
                {
                    foreach (var item in pageNodes)
                    {
                        // Lock to avoid contention on the shared queue of new tasks.
                        lock (StaticConfig.AddTaskUrls)
                        {
                            string urlInfo = item.Attributes["href"].Value;
                            TaskUrlConfig taskUrl = new TaskUrlConfig
                            {
                                Uid = TaskUrl.Uid,
                                CompanyName = item.InnerText,
                                Tab = "base",
                                Url = urlInfo,
                                Md5 = MyConvert.ToUserMd5(urlInfo),
                                Method = "get",
                                Queue_time = DateTime.Now,
                                Done_time = DateTime.Now,
                                ICount = 0,
                                IState = 0
                            };
                            StaticConfig.AddTaskUrls.Add(taskUrl);
                        }
                    }

                    // List pages are intentionally not written to disk.

                    // Record the task as done.
                    RecordTaskState(1);

                    // Reset the analysis error counter only after a successful extraction.
                    // Previously it was also reset right after being incremented in the
                    // failure branch below, so that failure was never counted.
                    StaticConfig.ErrorCode.IanalysisError = 0;
                }
                else
                {
                    RecordTaskState(3);
                    // Count the analysis failure.
                    StaticConfig.ErrorCode.IanalysisError++;
                    WriteRequestResult("解析出错");
                }

                // TODO: total byte length of successfully requested data.
                StaticConfig.Spiderinfo.TotalLength += System.Text.Encoding.UTF8.GetByteCount(html);
                // TODO: count of successfully fetched pages.
                StaticConfig.Spiderinfo.CatchMount++;
                // Reset the "no data" counter.
                StaticConfig.ErrorCode.IzeroError = 0;
                // Reset the "empty response" counter.
                StaticConfig.ErrorCode.InullError = 0;
                // Reset the "unexpected template" counter.
                StaticConfig.ErrorCode.ItabError = 0;
            }
            else if (string.IsNullOrWhiteSpace(html))
            {
                StaticConfig.ErrorCode.InullError++;
                WriteRequestResult("请求为空");
            }
            else if (html.Contains("alert(\'温馨提醒,该信息不存在,可能已经被删除\');"))
            {
                RecordTaskState(2);
                WriteRequestResult("无数据");
            }
            else
            {
                StaticConfig.ErrorCode.ItabError++;
                RecordTaskState(-1);

                #region Store the unrecognised page
                string fileNameError = $"{StaticConfig.Spiderinfo.SiteName}#{TaskUrl.Tab}#{TaskUrl.Md5}.html";
                string pathError = $"{StaticConfig.SpConfig.PathSign}\\Others\\{StaticConfig.Spiderinfo.SiteName}异常\\{DateTime.Now:yyyyMMdd}\\{TaskUrl.Tab}";
                if (ReadToFile(html, fileNameError, pathError))
                {
                    StaticConfig.ErrorCode.IwriteError = 0;
                }
                else
                {
                    StaticConfig.ErrorCode.IwriteError++;
                }
                #endregion

                WriteRequestResult("特殊异常");
            }
        }
        catch (WebException ex)
        {
            if (ex.Status == WebExceptionStatus.ProtocolError)
            {
                response = (HttpWebResponse)ex.Response;

                #region Standardised handling of non-200 status codes
                StaticConfig.CountConfig.ErrorCodeStr = StaticConfig.ErrorCode.HandleMethod(response);
                switch (response.StatusCode)
                {
                    // Only these three statuses are recorded; IState is the numeric HTTP code.
                    case HttpStatusCode.InternalServerError:
                    case HttpStatusCode.Forbidden:
                    case HttpStatusCode.NotFound:
                        StaticConfig.CountConfig.Ifasle++;
                        RecordTaskState((int)response.StatusCode);
                        break;
                }
                #endregion
            }
            WriteRequestResult(ex.Message);
        }
        catch (Exception ex)
        {
            #region Exceptions not reported by the server
            StaticConfig.CountConfig.Ifasle++;
            StaticConfig.ErrorCode.IanalysisError++;
            RecordTaskState(3);
            #endregion
            WriteRequestResult(ex.Message);
        }
        finally
        {
            response?.Close();
        }
    }
}

/// <summary>
/// Stamps <c>TaskUrl</c> with the given state and the current time, then appends it to
/// <c>StaticConfig.TaskUrls</c> while holding that collection's lock.
/// </summary>
/// <param name="state">Value to store in <c>TaskUrl.IState</c>.</param>
private void RecordTaskState(int state)
{
    TaskUrl.IState = state;
    TaskUrl.Done_time = DateTime.Now;
    lock (StaticConfig.TaskUrls)
    {
        StaticConfig.TaskUrls.Add(TaskUrl);
    }
}
/// <summary>
/// Copy constructor: creates an instance that uses the same task, proxy configuration and
/// cookie configuration as <paramref name="myParallel"/>.
/// </summary>
/// <param name="myParallel">Instance whose <c>TaskUrl</c>, <c>ProxyConfig</c> and <c>CkConfig</c> are assigned to the new instance.</param>
/// <exception cref="ArgumentNullException"><paramref name="myParallel"/> is null.</exception>
public MyParallel(MyParallel myParallel)
{
    // Fail with a descriptive exception instead of a bare NullReferenceException.
    if (myParallel == null)
    {
        throw new ArgumentNullException(nameof(myParallel));
    }

    TaskUrl = myParallel.TaskUrl;
    ProxyConfig = myParallel.ProxyConfig;
    CkConfig = myParallel.CkConfig;
}
/// <summary>
/// Builds the initial set of "list" tasks and inserts them into the action table in
/// batches using <c>INSERT IGNORE</c>.
/// </summary>
/// <remarks>
/// Each batch is attempted up to three times; an attempt succeeds when
/// <c>spideBll.Insert</c> returns a non-negative value. Progress is written to the console
/// and the console title. Any exception is written to the console and passed to
/// <c>CLog.DiaryLog</c>; it is not rethrown.
/// </remarks>
private void TaskAdd()
{
    try
    {
        const int maxPage = 214;
        const int batchSize = 100;

        List<TaskUrlConfig> allInfoUrls = new List<TaskUrlConfig>(maxPage);
        for (int page = 1; page <= maxPage; page++)
        {
            // Placeholder URL template: the page number is not interpolated here.
            string urlInfo = $"XXX";
            allInfoUrls.Add(new TaskUrlConfig
            {
                CompanyName = "无",
                Uid = "无",
                Tab = "list",
                Url = urlInfo,
                Md5 = MyConvert.ToUserMd5(urlInfo),
                Method = "get",
                ICount = 0,
                IState = 0,
                Queue_time = DateTime.Now,
                Done_time = DateTime.Now
            });
        }

        Console.WriteLine($@"共计任务:【{allInfoUrls.Count}】>>>{DateTime.Now}");

        DateTime date = DateTime.Now;
        string insertPrefix = $"INSERT IGNORE INTO {actionTable}(CompanyName,Uid,Tab,Url,Md5,Method,ICount,IState,Queue_time,Done_time) VALUES";

        // Collect value tuples and join them once per batch instead of growing a string
        // with += and trimming the trailing comma.
        List<string> batch = new List<string>(batchSize);
        for (int i = 0; i < allInfoUrls.Count; i++)
        {
            TaskUrlConfig task = allInfoUrls[i];

            // NOTE(review): values are embedded in the SQL text without escaping; this is only
            // safe while they are generated locally and contain no quote characters.
            batch.Add($"('{task.CompanyName}','{task.Uid}','{task.Tab}','{task.Url}','{task.Md5}','{task.Method}',{task.ICount},{task.IState},now(),now())");

            bool isLastTask = i == allInfoUrls.Count - 1;
            if (batch.Count == batchSize || isLastTask)
            {
                string statement = insertPrefix + string.Join(",", batch);

                int itryMax = 3;
                while (itryMax > 0)
                {
                    int iflg = spideBll.Insert(statement, $"{taskName}任务入库异常");
                    if (iflg >= 0)
                    {
                        itryMax = 0;
                    }
                    else
                    {
                        itryMax--;
                    }
                    Console.WriteLine("入库【{0}】>>>{1}", iflg, DateTime.Now);
                }

                Console.WriteLine(@"**************************************************");
                batch.Clear();
            }

            // Show a one-based count so the title reaches total/total on the last task.
            Console.Title = $@"{taskName}任务入库[{date:MMddHHmm}]【{i + 1}/{allInfoUrls.Count}】";
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine($@"{ex.Message}>>>{DateTime.Now}");
        CLog.DiaryLog(ex.Message, $"\\{taskName}任务源入库异常\\{actionTable}任务源入库异常_{DateTime.Now:yyyyMMdd}.txt");
    }
}