Ejemplo n.º 1
0
        /// <summary>
        /// Quartz job entry point: performs one crawl for proxy IPs. The outgoing proxy is re-resolved
        /// whenever <c>ExecuteCount</c> is a multiple of <c>Speed</c> or the previous crawl returned no rows.
        /// </summary>
        /// <param name="context">
        /// Execution context; the "TaskParam" entry of its JobDataMap is read as a JSON-serialized
        /// <c>ProxyParam</c>. Nothing is crawled when that entry is missing.
        /// </param>
        /// <exception cref="JobExecutionException">
        /// Wraps any failure of the crawl, with <c>RefireImmediately</c> set so the scheduler re-runs the job.
        /// </exception>
        public void Execute(IJobExecutionContext context)
        {
            // A previous run is still in progress: skip this trigger.
            if (isRun)
            {
                return;
            }

            isRun = true;
            try
            {
                object objParam = context.JobDetail.JobDataMap.Get("TaskParam");
                if (objParam == null)
                {
                    return;
                }

                ProxyParam Param = JsonConvert.DeserializeObject<ProxyParam>(objParam.ToString());
                DateTime   start = DateTime.Now;
                LogHelper.WriteLog("\r\n\r\n\r\n\r\n------------------爬虫开始执行获取代理ip任务 " + start.ToString("yyyy-MM-dd HH:mm:ss") + " BEGIN-----------------------------\r\n\r\n");

                // Switch to another proxy IP every Speed executions, or right away when the last run asked for it.
                if (NeedChangeIP || ExecuteCount % Speed == 0)
                {
                    if (NeedChangeIP)
                    {
                        // Round the counter up to the next multiple of Speed so the forced change
                        // lines up with the regular rotation schedule.
                        ExecuteCount = (ExecuteCount / Speed + 1) * Speed;
                    }
                    LogHelper.WriteLog("\r\n\r\n\r\n\r\n------------------开始解析使用的代理ip " + DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss") + " BEGIN-----------------------------\r\n\r\n");
                    ProxyIp = IpProxyGet.GetCorrectIP(Param);
                    LogHelper.WriteLog("------------------保存使用的代理ip:" + ProxyIp + " -----------------------------");
                    SQLHelper.ExecuteNonQuery("INSERT INTO dbo.p_ProxyIPUseHistory(ProxyIP,Type) VALUES (@ProxyIP,'IpProxyJob')", new { ProxyIP = ProxyIp });
                    NeedChangeIP = false;
                }
                Param.ProxyIp = ProxyIp;
                LogHelper.WriteLog("\r\n\r\n\r\n\r\n------------------任务使用的代理ip:" + Param.ProxyIp + "----------------------------\r\n\r\n");

                List<IPProxy> list = IpProxyGet.ParseProxy(Param);
                if (list.Count == 0)
                {
                    // No data came back, which is taken to mean the current IP has been blocked: change it next run.
                    NeedChangeIP = true;
                }

                DateTime end = DateTime.Now;
                ExecuteCount++;
                LogHelper.WriteLog("\r\n\r\n------------------爬虫完成获取代理ip任务:" + end.ToString("yyyy-MM-dd HH:mm:ss") + ",本次共耗时(分):" + (end - start).TotalMinutes + " END------------------------\r\n\r\n\r\n\r\n");
            }
            catch (Exception ex)
            {
                LogHelper.WriteLog("爬虫获取代理ip任务异常", ex);
                // Failed runs count too, so repeated failures eventually reach the next IP rotation.
                ExecuteCount++;

                // The exception must actually be thrown for the scheduler to see RefireImmediately;
                // previously it was built and then discarded, so the refire never happened.
                JobExecutionException e2 = new JobExecutionException(ex);
                e2.RefireImmediately = true;
                throw e2;
            }
            finally
            {
                // Always release the re-entrancy flag, whatever path the run took.
                isRun = false;
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Picks a usable proxy from the database for the proxy job.
        /// Candidates are read page by page (ordered by Speed), skipping deleted entries and proxies already
        /// recorded in the job's use history. A candidate is accepted when it answers
        /// <c>WebUtils.PingProxy</c> and <c>ProxyUtil.GetTotalPage</c> through it returns more than 1.
        /// The accepted proxy is written to the use history; every rejected candidate is flagged
        /// <c>IsDelete</c> and persisted once the search ends.
        /// </summary>
        /// <param name="param">Supplies <c>IpUrl</c> (fetched through each candidate) and <c>DefaultProxyIp</c> (fallback).</param>
        /// <returns>"ip:port" of the first accepted proxy, or <c>param.DefaultProxyIp</c> when none qualifies.</returns>
        public string GetCorrectIp(ProxyParam param)
        {
            IProxyUseHistoryService proxyUseHistoryService = new ProxyUseHistoryService();
            string proxyIp = string.Empty;
            // Current page of candidates
            int currentPage   = 1;
            int PageSize      = 100;
            var proxyType     = ProxyType.Http.GetDescription();
            var proxyJobType  = ProxyUseForType.ProxyJob.GetDescription();
            var updateProxies = new List<Proxy>();

            while (string.IsNullOrEmpty(proxyIp))
            {
                int total;
                var proxyUnUseList = LoadPageEntities(currentPage * PageSize, (currentPage - 1) * PageSize + 1, out total, r => !r.IsDelete && r.Type.Equals(proxyType), o => o.Speed, true);
                var proxyUsedList  = proxyUseHistoryService.LoadEntities(r => r.Type.Equals(proxyJobType)).Select(p => p.ProxyGuid);
                var proxyList      = proxyUnUseList.Where(r => !proxyUsedList.Contains(r.Guid)).ToList();

                // Stop only when the page itself is empty (no more proxies to read). A page whose proxies
                // have all been used before must not end the search: later pages may still hold unused ones.
                if (proxyList.Count == 0 && !proxyUnUseList.Any())
                {
                    break;
                }

                foreach (var item in proxyList)
                {
                    // The proxy must respond to a ping and be able to fetch the page through it.
                    if (WebUtils.PingProxy(item.IP, item.Port) && ProxyUtil.GetTotalPage(param.IpUrl, $"{item.IP}:{item.Port}") > 1)
                    {
                        proxyIp = $"{item.IP}:{item.Port}";
                        proxyUseHistoryService.Add(new ProxyUseHistory()
                        {
                            Guid = Guid.NewGuid(), ProxyGuid = item.Guid, CreatedOn = DateTime.Now, Type = proxyJobType
                        });
                        break;
                    }

                    // Unusable proxy: flag it now, persist the flag after the search.
                    item.IsDelete = true;
                    updateProxies.Add(item);
                }

                currentPage++;
            }

            // Mark the proxies that failed verification as deleted.
            if (updateProxies.Count > 0)
            {
                Update(updateProxies, r => r.IsDelete);
            }
            if (string.IsNullOrEmpty(proxyIp))
            {
                proxyIp = param.DefaultProxyIp;
            }
            return proxyIp;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Crawls every page of the proxy listing at <c>param.IpUrl</c> and returns the proxies found.
        /// The page range 1..total is split into contiguous chunks, one worker thread per chunk
        /// (at most <c>CpuCount</c> workers, never more workers than pages); the last worker also takes
        /// the remainder. The call blocks until all workers have finished.
        /// </summary>
        /// <param name="param">Crawl settings; <c>IpUrl</c> is required, <c>ProxyIp</c> is the proxy used for the requests.</param>
        /// <returns>The list all workers appended to; empty when nothing could be parsed.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="param"/> or its <c>IpUrl</c> is missing, or the page count reported by
        /// <c>GetTotalPage</c> is 1 or less (exception type kept for compatibility with existing callers).
        /// </exception>
        public static List<DBModels.Base.Proxy> ParseProxy(ProxyParam param)
        {
            if (param == null || string.IsNullOrEmpty(param.IpUrl))
            {
                throw new ArgumentNullException(nameof(param), "ParseProxy函数参数空异常");
            }

            // Total number of pages
            int total = GetTotalPage(param.IpUrl, param.ProxyIp);

            if (total <= 1)
            {
                throw new ArgumentNullException(nameof(param), "总页数信息异常");
            }

            // Result list shared by all worker threads
            List<DBModels.Base.Proxy> list = new List<DBModels.Base.Proxy>();

            // Never start more workers than there are pages; previously total <= CpuCount made the loop
            // index past the end of its own argument list and throw.
            int workerCount    = Math.Max(1, Math.Min(CpuCount, total));
            int pagesPerWorker = total / workerCount;

            List<Thread> listThread = new List<Thread>(workerCount);

            for (int i = 0; i < workerCount; i++)
            {
                int start = i * pagesPerWorker + 1;
                // The last worker also takes whatever is left over after the even split.
                int end = (i == workerCount - 1) ? total : start + pagesPerWorker - 1;

                Hashtable table = new Hashtable
                {
                    { "start", start },
                    { "end", end },
                    { "list", list },
                    { "param", param }
                };

                Thread thread = new Thread(DoWork)
                {
                    IsBackground = true,
                    Name         = "PageParse #" + i.ToString()
                };

                LogHelper.WriteInfoLog($"线程{thread.Name}已开启,Start:{start},End:{end}");
                listThread.Add(thread);
                thread.Start(table);
            }

            // Wait for every worker thread to finish.
            foreach (Thread thread in listThread)
            {
                thread.Join();
            }
            if (list.Count == 0)
            {
                LogHelper.WriteInfoLog("爬虫-代理ip任务,没有获取到数据,可能当前ip(" + param.ProxyIp + ")已被服务器封锁");
            }
            return list;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Worker-thread body: parses the listing pages <c>start</c>..<c>end</c> (inclusive) and appends
        /// every accepted proxy to the shared result list.
        /// </summary>
        /// <param name="param">
        /// A <see cref="Hashtable"/> with the keys "start" and "end" (page numbers), "list"
        /// (the <c>List&lt;DBModels.Base.Proxy&gt;</c> to append to) and "param" (the <c>ProxyParam</c>).
        /// </param>
        /// <exception cref="ArgumentNullException">The argument is not a Hashtable, or "list"/"param" is missing or of the wrong type.</exception>
        private static void DoWork(object param)
        {
            // Unpack the arguments
            Hashtable table = param as Hashtable;

            if (table == null)
            {
                throw new ArgumentNullException(nameof(param));
            }
            int start = Convert.ToInt32(table["start"]);
            int end   = Convert.ToInt32(table["end"]);
            List<DBModels.Base.Proxy> list = table["list"] as List<DBModels.Base.Proxy>;

            if (list == null)
            {
                throw new ArgumentNullException(nameof(list));
            }
            ProxyParam proxyParam = table["param"] as ProxyParam;

            if (proxyParam == null)
            {
                throw new ArgumentNullException(nameof(proxyParam));
            }

            for (int i = start; i <= end; i++)
            {
                LogHelper.WriteAsyncLog($"开始解析,页码{start}~{end},当前页码{i}");
                try
                {
                    ParseProxyPage(proxyParam, i, list);
                }
                catch (Exception ex)
                {
                    // An exception escaping a worker thread is unhandled and would take the process down
                    // (e.g. when the proxy in use has stopped working). Log it and move on to the next page.
                    LogHelper.WriteAsyncLog($"解析页码{i}异常:{ex}");
                }
                LogHelper.WriteAsyncLog($"结束解析,页码{start}~{end},当前页码{i}");
            }
        }

        /// <summary>
        /// Downloads one listing page and appends each accepted row of table 'ip_list' to <paramref name="list"/>.
        /// </summary>
        private static void ParseProxyPage(ProxyParam proxyParam, int page, List<DBModels.Base.Proxy> list)
        {
            // Page address
            string url = $"{proxyParam.IpUrl}/{page}";
            var    doc = new HtmlDocument();
            doc.LoadHtml(GetHtml(url, proxyParam.ProxyIp));

            // All rows of the data table; the first row is skipped by the loop below.
            var rows = doc.DocumentNode.SelectNodes(@"//table[@id='ip_list']/tr");
            if (rows == null || rows.Count <= 1)
            {
                return;
            }

            LogHelper.WriteAsyncLog($"当前页码{page},请求地址{url},共{rows.Count}条数据");
            int count = 0;
            for (int j = 1; j < rows.Count; j++)
            {
                HtmlNodeCollection cells = rows[j].SelectNodes("td");
                if (cells == null || cells.Count <= 9)
                {
                    continue;
                }

                string ip   = cells[2].InnerText.Trim();
                string port = cells[3].InnerText.Trim();
                // When ping verification is enabled, only proxies that respond are kept.
                if (proxyParam.IsPingIp && !WebUtils.PingProxy(ip, port))
                {
                    continue;
                }
                count++;
                LogHelper.WriteAsyncLog($"验证IP成功:{ip}:{port}");

                DBModels.Base.Proxy item = new DBModels.Base.Proxy();

                // Country comes from the 'alt' attribute of the first child of cell 1.
                HtmlNode flag = cells[1].FirstChild;
                if (flag != null)
                {
                    HtmlAttribute alt = flag.Attributes["alt"];
                    if (alt != null)
                    {
                        item.Country = alt.Value.Trim();
                    }
                }

                item.IP        = ip;
                item.Port      = port;
                item.ProxyIp   = $"{item.IP}:{item.Port}";
                item.Position  = cells[4].InnerText.Trim();
                item.Anonymity = cells[5].InnerText.Trim();
                item.Type      = cells[6].InnerText.Trim();
                item.Guid      = Guid.NewGuid();
                item.CreatedOn = DateTime.Now;

                // Speed and connect time are only assigned when the bar element and its title exist.
                string speed = GetBarTitle(cells[7]);
                if (speed != null)
                {
                    item.Speed = speed;
                }

                string connectTime = GetBarTitle(cells[8]);
                if (connectTime != null)
                {
                    item.ConnectTime = connectTime;
                }
                item.VerifyTime = cells[9].InnerText.Trim();

                // The list is shared by all worker threads and List<T> is not safe for concurrent
                // writes, so every append is serialized on the list instance.
                lock (list)
                {
                    list.Add(item);
                }
            }
            LogHelper.WriteAsyncLog($"当前页码{page},共{rows.Count}条数据,验证成功{count}条。");
        }

        /// <summary>
        /// Returns the trimmed 'title' attribute of the cell's div[@class='bar'] child, or null when either is absent.
        /// </summary>
        private static string GetBarTitle(HtmlNode cell)
        {
            HtmlNode bar = cell.SelectSingleNode("div[@class='bar']");
            if (bar == null)
            {
                return null;
            }
            HtmlAttribute title = bar.Attributes["title"];
            return title == null ? null : title.Value.Trim();
        }