Пример #1
0
        /// <summary>
        /// Loads pending tasks (rows with IState = 0) from the action table.
        /// </summary>
        /// <remarks>
        /// When <c>StaticConfig.SpConfig.ActionTab</c> is "allinfo" every pending row is eligible;
        /// otherwise only rows whose Tab column equals that value are selected.
        /// A null result from <c>spideBll.Select</c> is treated as a failure and retried, for at most
        /// three attempts; each failure increments <c>StaticConfig.ErrorCode.IdbError</c>, and any
        /// non-null result resets that counter to 0.
        /// </remarks>
        /// <param name="startNum">First LIMIT argument of the query.</param>
        /// <param name="endNum">
        /// Second LIMIT argument of the query. NOTE(review): in MySQL-style <c>LIMIT offset,count</c>
        /// this is a row count rather than an end id - confirm against callers.
        /// </param>
        /// <returns>The tasks that were read; empty when nothing is pending or every attempt failed.</returns>
        public List <TaskUrlConfig> GetTask(int startNum, int endNum)
        {
            List <TaskUrlConfig> allInfoUrls = new List <TaskUrlConfig>();
            int itryMax = 3;

            do
            {
                // The query can be adjusted to the deployment's needs.
                string sql;
                if (StaticConfig.SpConfig.ActionTab == "allinfo")
                {
                    sql = $"SELECT * FROM {ActionTable} WHERE IState=0 LIMIT {startNum},{endNum}";
                }
                else
                {
                    sql = $"SELECT * FROM {ActionTable} WHERE Tab='{StaticConfig.SpConfig.ActionTab}' and IState=0 LIMIT {startNum},{endNum}";
                }

                System.Data.DataTable dt = spideBll.Select(sql);
                if (dt == null)
                {
                    // Query failed: spend one attempt and count the database error.
                    itryMax--;
                    StaticConfig.ErrorCode.IdbError++;
                    continue;
                }

                foreach (System.Data.DataRow row in dt.Rows)
                {
                    TaskUrlConfig company = new TaskUrlConfig
                    {
                        Id          = int.Parse(row["Id"].ToString()),
                        Uid         = row["Uid"].ToString(),
                        CompanyName = row["CompanyName"].ToString(),
                        Tab         = row["Tab"].ToString(),
                        Method      = row["Method"].ToString(),
                        Url         = row["Url"].ToString(),
                        Md5         = row["Md5"].ToString(),
                        ICount      = int.Parse(row["ICount"].ToString()),
                        IState      = int.Parse(row["IState"].ToString())
                    };
                    // Each timestamp falls back on its own, so an unparsable Done_time
                    // no longer discards a valid Queue_time (and vice versa).
                    company.Queue_time = ReadDateTimeOrNow(row, "Queue_time");
                    company.Done_time  = ReadDateTimeOrNow(row, "Done_time");
                    allInfoUrls.Add(company);
                }

                // A non-null result (even with zero rows) ends the retry loop.
                itryMax = 0;
                StaticConfig.ErrorCode.IdbError = 0;
            } while (itryMax > 0);
            return(allInfoUrls);
        }

        /// <summary>
        /// Reads a date/time column from a row, returning the current local time when the
        /// column is absent or its text cannot be parsed.
        /// </summary>
        /// <param name="row">Row to read from.</param>
        /// <param name="column">Name of the column holding the timestamp.</param>
        /// <returns>The parsed value, or <c>DateTime.Now</c> as a fallback.</returns>
        private static DateTime ReadDateTimeOrNow(System.Data.DataRow row, string column)
        {
            DateTime parsed;
            if (row.Table.Columns.Contains(column) && DateTime.TryParse(row[column].ToString(), out parsed))
            {
                return parsed;
            }
            return DateTime.Now;
        }
Пример #2
0
        /// <summary>
        /// Requests the list page at <c>TaskUrl.Url</c>, queues every detail link found on it and
        /// records the outcome of the task.
        /// </summary>
        /// <remarks>
        /// Values written to <c>TaskUrl.IState</c> by this method:
        /// 1 = links extracted; 2 = the page reports that the item no longer exists;
        /// 3 = the page matched the template but no links were found, or a non-web exception occurred;
        /// -1 = unrecognised page (its HTML is written to the error folder);
        /// 403 / 404 / 500 = the corresponding HTTP status.
        /// An empty response body only increments the null-error counter; the task is not recorded.
        /// The whole method runs while holding a lock on <c>TaskUrl</c>.
        /// </remarks>
        private void Request_List()
        {
            lock (TaskUrl)
            {
                HttpWebResponse response = null;
                try
                {
                    #region Request
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(TaskUrl.Url);
                    request.Timeout           = 10000;
                    request.ReadWriteTimeout  = 12000;
                    request.KeepAlive         = false;
                    request.AllowAutoRedirect = false;
                    request.ServicePoint.Expect100Continue = false; // speeds up loading
                    request.ServicePoint.UseNagleAlgorithm = false; // Nagle disabled to speed up loading
                    request.AllowWriteStreamBuffering      = false; // buffering disabled to speed up loading
                    request.Headers.Add("Upgrade-Insecure-Requests", @"1");
                    request.UserAgent =
                        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36";
                    request.Accept =
                        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
                    request.Headers.Add("DNT", @"1");
                    request.Headers.Set(HttpRequestHeader.AcceptEncoding, "gzip, deflate");
                    request.Headers.Set(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8");
                    request.Headers.Set(HttpRequestHeader.Cookie, CkConfig.Cookie);
                    if (ProxyConfig.Proxy != null && ProxyConfig.Proxy.Address.Authority != "192.168.1.1")
                    {
                        request.Proxy = ProxyConfig.Proxy;
                    }
                    // TODO: proxy authentication added (2018-04-04 14:58:47).
                    // NOTE(review): this runs even when the configured proxy was not applied above, so the
                    // credentials are set on whatever proxy the request already has, and a null
                    // request.Proxy would throw here - confirm this is intended.
                    request.Proxy.Credentials = new NetworkCredential(ProxyConfig.ProxyUser, ProxyConfig.ProxyPass);
                    response = (HttpWebResponse)request.GetResponse();
                    #endregion

                    // TODO: standardise handling of the 200 status code.
                    StaticConfig.CountConfig.ErrorCodeStr = StaticConfig.ErrorCode.HandleMethod(response);
                    string html = response.ResponseHtml();

                    // Parse once and reuse the tree for both XPath queries.
                    var document = MyConvert.ToHtmlNode(html);

                    // The page is recognised as a list page only if this marker element is present.
                    HtmlNode pageNode =
                        document.SelectSingleNode("//*[@class='chushou-shangpu cl zk']//*[@class='toubu-chushou-shangpu cl']");
                    if (pageNode != null)
                    {
                        StaticConfig.CountConfig.Itrue++;
                        HtmlNodeCollection pageNodes =
                            document.SelectNodes("//*[@class='wen fr']//a");
                        if (pageNodes != null)
                        {
                            foreach (var item in pageNodes)
                            {
                                // Lock to avoid contention on the shared new-task list.
                                lock (StaticConfig.AddTaskUrls)
                                {
                                    string        urlInfo = item.Attributes["href"].Value;
                                    TaskUrlConfig taskUrl = new TaskUrlConfig
                                    {
                                        Uid         = TaskUrl.Uid,
                                        CompanyName = item.InnerText,
                                        Tab         = "base",
                                        Url         = urlInfo,
                                        Md5         = MyConvert.ToUserMd5(urlInfo),
                                        Method      = "get",
                                        Queue_time  = DateTime.Now,
                                        Done_time   = DateTime.Now,
                                        ICount      = 0,
                                        IState      = 0
                                    };
                                    StaticConfig.AddTaskUrls.Add(taskUrl);
                                }
                            }

                            // List pages themselves are not written to disk; only their links are queued.

                            RecordListTaskState(1);

                            // Links were extracted, so the consecutive analysis-error count starts over.
                            StaticConfig.ErrorCode.IanalysisError = 0;
                        }
                        else
                        {
                            RecordListTaskState(3);
                            // Count the analysis failure. This counter used to be reset to 0 right
                            // after this branch, which made the increment a no-op.
                            StaticConfig.ErrorCode.IanalysisError++;
                            WriteRequestResult("解析出错");
                        }

                        // TODO: total UTF-8 byte length of successfully requested pages.
                        StaticConfig.Spiderinfo.TotalLength += Encoding.UTF8.GetByteCount(html);
                        // TODO: number of successfully fetched pages.
                        StaticConfig.Spiderinfo.CatchMount++;

                        // The expected template came back, so the remaining counters are cleared:
                        // no-data, empty-response and template-mismatch.
                        StaticConfig.ErrorCode.IzeroError = 0;
                        StaticConfig.ErrorCode.InullError = 0;
                        StaticConfig.ErrorCode.ItabError  = 0;
                    }
                    else if (string.IsNullOrWhiteSpace(html))
                    {
                        StaticConfig.ErrorCode.InullError++;
                        WriteRequestResult("请求为空");
                    }
                    else if (html.Contains("alert(\'温馨提醒,该信息不存在,可能已经被删除\');"))
                    {
                        RecordListTaskState(2);
                        WriteRequestResult("无数据");
                    }
                    else
                    {
                        StaticConfig.ErrorCode.ItabError++;
                        RecordListTaskState(-1);

                        #region Store the unrecognised page for later inspection
                        string fileNameError = $"{StaticConfig.Spiderinfo.SiteName}#{TaskUrl.Tab}#{TaskUrl.Md5}.html";
                        string pathError     = $"{StaticConfig.SpConfig.PathSign}\\Others\\{StaticConfig.Spiderinfo.SiteName}异常\\{DateTime.Now:yyyyMMdd}\\{TaskUrl.Tab}";
                        if (ReadToFile(html, fileNameError, pathError))
                        {
                            StaticConfig.ErrorCode.IwriteError = 0;
                        }
                        else
                        {
                            StaticConfig.ErrorCode.IwriteError++;
                        }
                        #endregion
                        WriteRequestResult("特殊异常");
                    }
                }
                catch (WebException ex)
                {
                    if (ex.Status == WebExceptionStatus.ProtocolError)
                    {
                        // Keep the error response in the local so the finally block closes it.
                        response = (HttpWebResponse)ex.Response;

                        StaticConfig.CountConfig.ErrorCodeStr = StaticConfig.ErrorCode.HandleMethod(response);

                        // Only 500, 403 and 404 are recorded; the numeric status becomes the task state.
                        HttpStatusCode status = response.StatusCode;
                        if (status == HttpStatusCode.InternalServerError ||
                            status == HttpStatusCode.Forbidden ||
                            status == HttpStatusCode.NotFound)
                        {
                            StaticConfig.CountConfig.Ifasle++;
                            RecordListTaskState((int)status);
                        }
                    }
                    WriteRequestResult(ex.Message);
                }
                catch (Exception ex)
                {
                    // Failure that did not come from the server's response.
                    StaticConfig.CountConfig.Ifasle++;
                    StaticConfig.ErrorCode.IanalysisError++;
                    RecordListTaskState(3);
                    WriteRequestResult(ex.Message);
                }
                finally
                {
                    response?.Close();
                }
            }
        }

        /// <summary>
        /// Sets the state and completion time on <c>TaskUrl</c> and appends it to
        /// <c>StaticConfig.TaskUrls</c> while holding that list's lock.
        /// </summary>
        /// <param name="state">Value to store in <c>TaskUrl.IState</c>.</param>
        private void RecordListTaskState(int state)
        {
            TaskUrl.IState    = state;
            TaskUrl.Done_time = DateTime.Now;
            lock (StaticConfig.TaskUrls)
            {
                StaticConfig.TaskUrls.Add(TaskUrl);
            }
        }
Пример #3
0
 /// <summary>
 /// Creates a worker from an existing one by taking over its task, proxy and cookie settings.
 /// </summary>
 /// <remarks>
 /// The three members are copied by plain assignment; no deep copy is made here.
 /// </remarks>
 /// <param name="myParallel">Instance whose TaskUrl, ProxyConfig and CkConfig are copied.</param>
 public MyParallel(MyParallel myParallel)
 {
     MyParallel source = myParallel;

     this.TaskUrl     = source.TaskUrl;
     this.ProxyConfig = source.ProxyConfig;
     this.CkConfig    = source.CkConfig;
 }
Пример #4
0
 /// <summary>
 /// Seeds the action table with the initial list-page tasks.
 /// </summary>
 /// <remarks>
 /// Builds one task per page (214 pages), then writes them with INSERT IGNORE in batches of
 /// 100 rows. Each batch is attempted up to three times; a negative return value from
 /// <c>spideBll.Insert</c> counts as a failed attempt. Any exception is written to the console
 /// and to the diary log instead of being propagated.
 /// </remarks>
 private void TaskAdd()
 {
     try
     {
         List <TaskUrlConfig> allInfoUrls = new List <TaskUrlConfig>();
         int maxPage = 214;
         for (int i = 1; i < maxPage + 1; i++)
         {
             // NOTE(review): placeholder URL; presumably the real template embeds the page index - confirm.
             string        urlInfo    = $"XXX";
             TaskUrlConfig allInfoUrl = new TaskUrlConfig
             {
                 CompanyName = "无",
                 Uid         = "无",
                 Tab         = "list",
                 Url         = urlInfo,
                 Md5         = MyConvert.ToUserMd5(urlInfo),
                 Method      = "get",
                 ICount      = 0,
                 IState      = 0,
                 Queue_time  = DateTime.Now,
                 Done_time   = DateTime.Now
             };
             allInfoUrls.Add(allInfoUrl);
         }
         Console.WriteLine($@"共计任务:【{allInfoUrls.Count}】>>>{DateTime.Now}");

         int      lssNum = 100; // rows per INSERT statement
         DateTime date   = DateTime.Now;
         string   sqll   = $"INSERT IGNORE INTO {actionTable}(CompanyName,Uid,Tab,Url,Md5,Method,ICount,IState,Queue_time,Done_time) VALUES";

         // Value tuples of the current batch, comma-separated.
         System.Text.StringBuilder batch = new System.Text.StringBuilder();
         for (int i = 0; i < allInfoUrls.Count; i++)
         {
             TaskUrlConfig task = allInfoUrls[i];
             if (batch.Length > 0)
             {
                 batch.Append(',');
             }
             batch.Append(
                 $"('{task.CompanyName}','{task.Uid}','{task.Tab}','{task.Url}','{task.Md5}','{task.Method}',{task.ICount},{task.IState},now(),now())");

             bool batchFull = i % lssNum == lssNum - 1;
             bool lastRow   = i == allInfoUrls.Count - 1;
             if (batchFull || lastRow)
             {
                 // The statement is identical for every retry, so build it once.
                 string sql     = sqll + batch.ToString();
                 int    itryMax = 3;
                 while (itryMax > 0)
                 {
                     int iflg = spideBll.Insert(sql, $"{taskName}任务入库异常");
                     // A non-negative result ends the retries; otherwise one attempt is spent.
                     itryMax = iflg >= 0 ? 0 : itryMax - 1;
                     Console.WriteLine("入库【{0}】>>>{1}", iflg, DateTime.Now);
                 }
                 Console.WriteLine(@"**************************************************");
                 batch.Clear();
             }

             // Show how many tasks have been handled (1-based), so the last update reads N/N.
             Console.Title = $@"{taskName}任务入库[{date:MMddHHmm}]【{i + 1}/{allInfoUrls.Count}】";
         }
     }
     catch (Exception ex)
     {
         Console.WriteLine($@"{ex.Message}>>>{DateTime.Now}");
         CLog.DiaryLog(ex.Message, $"\\{taskName}任务源入库异常\\{actionTable}任务源入库异常_{DateTime.Now:yyyyMMdd}.txt");
     }
 }