示例#1
0
        /// <summary>
        /// Looks up a single company: posts the search form through a proxy, follows the
        /// result link to the detail page, parses the detail tables and stores the record.
        /// The whole attempt is retried with a newly fetched proxy whenever a response is
        /// rejected or an exception is thrown.
        /// </summary>
        /// <param name="companyName">Company name used as the search keyword.</param>
        public void SingelSearch(string companyName)
        {
            var client = new HttpClient();
            client.Setting.Timeout = 1000 * 5;
            // Initial request against the entry page before any search is issued.
            client.Create<string>(HttpMethod.Post, firsturl).Send();

            for (;;)
            {
                var record = new CrawlerEntity
                {
                    搜索名称 = companyName,
                    操作人姓名 = TaskEntity.OperatorName,
                    入爬行库时间 = TaskEntity.CreateTime,
                    TaskGuid = TaskEntity.Unique
                };

                try
                {
                    // Pick a proxy from the local store; fall back to the online provider.
                    var proxy = new ProxyDomain().GetByRandom();
                    if (proxy == null)
                    {
                        Console.WriteLine("在线代理临时获取策略启动。");
                        proxy = Proxy.Proxy.GetInstance().GetHttProxyEntity();
                        Console.WriteLine("线上获取到了代理:{0}:{1}", proxy.IpAddress, proxy.Port);
                    }

                    client.Setting.Proxy = new WebProxy(proxy.IpAddress, proxy.Port);

                    // Step 1: search request.
                    var searchForm = new
                    {
                        queryStr = record.搜索名称,
                        module = "",
                        idFlag = "qyxy"
                    };
                    var searchResponse = client.Create<string>(HttpMethod.Post, targetUrl, data: searchForm).Send();
                    if (!searchResponse.IsValid() || ValidText(searchResponse.Result))
                    {
                        RemoveOldIp(proxy);
                        continue;
                    }

                    // Step 2: extract the second-level link from the result list.
                    var links = new JumonyParser().Parse(searchResponse.Result).Find("li a").ToList();
                    if (links.Count < 1)
                    {
                        // No hit: record the empty result and stop.
                        AddNull(record);
                        return;
                    }

                    var nextUrl = "";
                    foreach (var link in links)
                    {
                        // Every link overwrites the previous one, so the last link wins.
                        record.名称 = link.InnerText();
                        nextUrl = url + link.Attribute("href").AttributeValue;
                    }

                    // Step 3: fetch the detail page, reusing the query string of the link.
                    var detailUri = new Uri(firsturl + nextUrl);
                    var detailResponse = client.Create<string>(HttpMethod.Get, zhuUrl + detailUri.Query).Send();
                    var queryValues = new NameValueCollection(URL.GetQueryString(detailUri.Query));
                    if (!detailResponse.IsValid() || ValidText(detailResponse.Result))
                    {
                        RemoveOldIp(proxy);
                        continue;
                    }

                    // Step 4: parse the detail tables; header cells are rewritten to data
                    // cells so a single "tr td" selector collects both.
                    var document = new JumonyParser().Parse(detailResponse.Result.Replace("<th", "<td"));
                    var tables = document.Find("table[class='f-lbiao']").ToList();
                    var cellTexts = new List<string>();
                    foreach (var table in tables)
                    {
                        table.Find("tr td")
                            .ForEach(cell => cellTexts.Add(cell.InnerText().TrimEnd(':').TrimEnd(':').Trim()));
                    }

                    var filled = FillModel(cellTexts);
                    filled.全局唯一编号 = queryValues["reg_bus_ent_id"].ToUpper();
                    new CrawlerDomain().Add(StrategyNo1(filled, record));

                    // Step 5: bookkeeping - bump the proxy usage counter and report progress.
                    proxy.Usage += 1;
                    new ProxyDomain().Update(proxy);
                    Console.WriteLine("{0} 抓取到:{1}", Task.CurrentId, record.搜索名称);
                    return;
                }
                catch (Exception e)
                {
                    // Log the failure; the loop then retries the whole lookup.
                    var logEntry = new LogEntity
                    {
                        LogType = "error",
                        TaskName = TaskEntity.TaskName,
                        ErrorDetails = Task.CurrentId + "线程: " + e.Message,
                        Details = e.ToString(),
                        TriggerTime = DateTime.Now
                    };
                    new LogDomain().Add(logEntry);
                }
            }
        }
示例#2
0
文件: Proxy.cs 项目: CBDlkl/LiGather
 /// <summary>
 /// Validates stored proxies. Starts <paramref name="maxThreadNum"/> tasks; each task
 /// repeatedly picks a random proxy id, loads it and passes it to
 /// <c>ThreadValidate.Doit</c>, stopping as soon as that call returns true or after
 /// <c>MaxValue</c> attempts. Blocks until every task has finished.
 /// </summary>
 /// <param name="maxThreadNum">Number of validation tasks to start.</param>
 public void ValidateCanUse(int maxThreadNum = 1)
 {
     var tasks = new Task[maxThreadNum];
     for (var i = 0; i < maxThreadNum; i++)
     {
         tasks[i] = new Task(() =>
         {
             // One generator per task with a distinct seed: Random is not thread-safe and
             // time-seeded instances created close together can repeat the same sequence.
             var random = new Random(Guid.NewGuid().GetHashCode());
             for (var j = 0; j < MaxValue; j++)
             {
                 ProxyDomain proxyDomain = new ProxyDomain();
                 var maxId = proxyDomain.GetMaxId();
                 if (maxId < 1)
                     break; // no usable id range, nothing to validate

                 // The upper bound of Next is exclusive; +1 keeps the highest id selectable.
                 var id = random.Next(1, maxId + 1);
                 var proxyEntity = proxyDomain.GetById(id);
                 // Skip ids that have no backing record.
                 if (proxyEntity == null || proxyEntity.Id == 0)
                     continue;
                 if (ThreadValidate.Doit(proxyEntity))
                     break;
             }
             Console.WriteLine("完成");
         });
         tasks[i].Start();
     }
     Task.WaitAll(tasks);
 }
示例#3
0
文件: Proxy.cs 项目: CBDlkl/LiGather
 /// <summary>
 /// Collects proxies from the online provider on a dedicated foreground thread and
 /// stores each new one in the database. Returns immediately; the work continues on
 /// the started thread until <paramref name="countNum"/> proxies have been processed.
 /// </summary>
 /// <param name="countNum">Total number of proxies to collect.</param>
 /// <param name="getNum">Number of proxies requested per provider call.</param>
 /// <param name="isValidate">
 /// When true, only proxies that pass <c>ThreadValidate.VerificationIp</c> are stored
 /// and counted.
 /// </param>
 public void ProxySave(int countNum = 100, int getNum = 5, bool isValidate = false)
 {
     new Thread(() =>
     {
         var remaining = countNum;
         // "> 0" instead of "== 0": the counter can never slip past the exit condition.
         while (remaining > 0)
         {
             var ipLists = GetProxyByHttp(getNum).Split(Environment.NewLine.ToCharArray()).ExceptNull().ToList();
             foreach (var ipList in ipLists)
             {
                 if (remaining <= 0)
                     break; // quota reached in the middle of a batch

                 // Check for blank entries before touching the string.
                 if (string.IsNullOrWhiteSpace(ipList))
                     continue;
                 var ipAndPort = ipList.Split(':');
                 if (ipAndPort.Length < 2)
                     continue;

                 Console.WriteLine("获取到代理:" + ipList);
                 var model = new ProxyEntity();
                 model.IpAddress = ipAndPort[0];
                 model.Port = Conv.ToInt(ipAndPort[1]);
                 model.Usage = 0;
                 model.CreateTime = DateTime.Now;

                 // With validation enabled, a proxy that fails the check is neither
                 // stored nor counted.
                 if (isValidate && !ThreadValidate.VerificationIp(model.IpAddress, model.Port))
                     continue;

                 ProxyDomain proxyDomain = new ProxyDomain();
                 if (!proxyDomain.IsExist(model))
                     proxyDomain.Add(model);
                 remaining--;
             }
         }
         Console.WriteLine("IP采集完毕");
     })
     { IsBackground = false }.Start();
 }
示例#4
0
文件: Proxy.cs 项目: CBDlkl/LiGather
 /// <summary>
 /// Fetches one unverified proxy directly from the online provider, stores it in the
 /// database if it is not already there, and returns it. Intended as a temporary
 /// source when the local proxy store has run out.
 /// </summary>
 /// <returns>The proxy built from the first "ip:port" line of the provider response.</returns>
 /// <exception cref="FormatException">
 /// The provider response is empty or its first line is not in "ip:port" form.
 /// </exception>
 public ProxyEntity GetHttProxyEntity()
 {
     var response = GetProxyByHttp();
     if (string.IsNullOrWhiteSpace(response))
         throw new FormatException("Proxy provider returned an empty response.");

     // The response is newline-delimited; use only the first entry and strip stray
     // whitespace so the port does not carry line-break characters.
     var lines = response.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
     var ipAndPort = lines[0].Trim().Split(':');
     if (ipAndPort.Length < 2)
         throw new FormatException("Proxy provider returned a malformed entry: " + lines[0]);

     // One timestamp for both fields so creation and last-use times agree.
     var now = DateTime.Now;
     var model = new ProxyEntity();
     model.IpAddress = ipAndPort[0].Trim();
     model.Port = Conv.ToInt(ipAndPort[1].Trim());
     model.Usage = 1;
     model.CanUse = true;
     model.CreateTime = now;
     model.LastUseTime = now;

     ProxyDomain proxyDomain = new ProxyDomain();
     if (!proxyDomain.IsExist(model))
         proxyDomain.Add(model);
     return model;
 }