Ejemplo n.º 1
0
        /// <summary>
        /// Renders the single-company search page. When a company name is supplied and no
        /// <c>TargeCompanyDomain</c> record with that name exists yet, a one-off crawl task is
        /// created first; afterwards the stored crawl result for that name is looked up.
        /// </summary>
        /// <param name="guid">Identifier supplied by the page; echoed back through <c>ViewBag.Guid</c> and converted into the task's <c>Unique</c> value.</param>
        /// <param name="searchInfo">Company name to search for.</param>
        /// <returns>
        /// The view whose model is the first <c>CrawlerEntity</c> matching <paramref name="searchInfo"/>,
        /// or a <c>null</c> model when nothing was searched or nothing was found.
        /// </returns>
        public ActionResult SingelSearch(string guid = null, string searchInfo = null)
        {
            CrawlerEntity crawlerEntity = null;
            bool hasSearchInfo = !string.IsNullOrWhiteSpace(searchInfo);

            // First visit (no arguments at all): hand the page a fresh identifier and show the empty form.
            if (string.IsNullOrWhiteSpace(guid) && !hasSearchInfo)
            {
                ViewBag.Guid = Guid.NewGuid();
                return View(crawlerEntity);
            }

            ViewBag.Guid = guid;

            // A guid without a company name leaves nothing to look up. Without this guard the
            // request would persist a task and start crawlers for an empty name.
            if (!hasSearchInfo)
            {
                return View(crawlerEntity);
            }

            // NOTE(review): a blank guid combined with a real searchInfo still reaches the crawl with
            // whatever Conv.ToGuid returns for a blank string — confirm that is acceptable.
            var knownCount = new TargeCompanyDomain().Get(t => t.CompanyName.Equals(searchInfo))?.Count;

            // knownCount is null when Get returns null; the lifted comparison is then false,
            // so a missing result is treated the same as "not in the history yet".
            if (!(knownCount > 0))
            {
                QueueSingelSearchCrawl(guid, searchInfo);
            }

            crawlerEntity = new CrawlerDomain().Get(t => t.搜索名称 == searchInfo && t.名称 != null).FirstOrDefault();
            return View(crawlerEntity);
        }

        /// <summary>
        /// Persists a single-search task for <paramref name="searchInfo"/> and starts the crawlers
        /// inside the callback handed to <c>BaseData.InsertMetadata</c>.
        /// </summary>
        private static void QueueSingelSearchCrawl(string guid, string searchInfo)
        {
            const int workerCount = 4;

            // One timestamp for both fields so the task name and creation time always agree.
            DateTime now = DateTime.Now;
            var model = new TaskEntity
            {
                TaskType = EnumTaskType.BjCrawler,
                TaskName = $"单个任务[{now.ToString("G")}]",
                Unique = Conv.ToGuid(guid),
                TaskStateDicId = 1,
                TaskNum = 1,
                CreateTime = now,
                IsSingelSearch = true
            };
            new TaskDomain().Add(model);

            var companyList = new List<string> { searchInfo };
            new BaseData(model).InsertMetadata(companyList, model.TaskName, model, taskEntity =>
            {
                // Every worker runs the same search for the same name.
                var workers = new Task[workerCount];
                for (int i = 0; i < workers.Length; i++)
                {
                    workers[i] = new Task(() =>
                    {
                        var bjqyxy = new Crawler.Bjqyxy.BjCrawler(taskEntity, t => t.TaskGuid.Equals(taskEntity.Unique));
                        bjqyxy.SingelSearch(searchInfo);
                    });
                    workers[i].Start();
                }

                // Task.WaitAny returns as soon as the first worker completes; the rest keep running.
                Task.WaitAny(workers);
            });
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Progress probe for the metadata-insert step: asks <c>TargeCompanyDomain.GetInt</c> for the
 /// records whose <c>TaskGuid</c> equals the task's <c>Unique</c> value
 /// (presumably a row count — confirm against <c>GetInt</c>).
 /// </summary>
 /// <param name="model">Task whose <c>Unique</c> value selects the records.</param>
 /// <returns>JSON carrying the fixed <c>state</c> "doing" and the <c>GetInt</c> result as <c>num</c>.</returns>
 public ActionResult CheckInsertMetadata(TaskEntity model)
 {
     var companies = new TargeCompanyDomain();
     var payload = new
     {
         state = "doing",
         num = companies.GetInt(t => t.TaskGuid == model.Unique)
     };
     return Json(payload);
 }
Ejemplo n.º 3
0
 /// <summary>
 /// Progress probe for the gathering step: asks <c>TargeCompanyDomain.GetInt</c> for the records
 /// of the task (<c>TaskGuid</c> equal to <c>Unique</c>) that are flagged <c>IsSearched</c>
 /// (presumably a row count — confirm against <c>GetInt</c>).
 /// </summary>
 /// <param name="model">Task whose <c>Unique</c> value selects the records.</param>
 /// <returns>JSON carrying the fixed <c>state</c> "doing" and the <c>GetInt</c> result as <c>num</c>.</returns>
 public ActionResult CheckGoGather(TaskEntity model)
 {
     var companies = new TargeCompanyDomain();
     var payload = new
     {
         state = "doing",
         num = companies.GetInt(t => t.TaskGuid == model.Unique && t.IsSearched)
     };
     return Json(payload);
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Crawl loop. Each iteration takes a target company from <c>TargeCompanyDomain</c>, posts its
        /// name to <c>targetUrl</c> through a proxy, follows the detail link found in the response,
        /// parses the detail tables and stores the result through <c>CrawlerDomain</c>.
        /// </summary>
        /// <remarks>
        /// The loop ends only when <c>GetSingel(QueryCondition)</c> returns <c>null</c>.
        /// When a response is not valid, or <c>ValidText</c> returns true for it, the proxy is handed to
        /// <c>RemoveOldIp</c> and the same company is retried on the next iteration.
        /// Any exception is written through <c>LogDomain</c> and the next iteration asks for a company again.
        /// NOTE(review): nothing in this method bounds the number of retries — confirm that the helpers
        /// guarantee progress (e.g. that a failing company is eventually excluded by <c>QueryCondition</c>).
        /// </remarks>
        private void BaseWork()
        {
            bool isReloadCompany = true; // whether the next iteration must fetch a new company name
            string companyOld = ""; // remembers the current company name so a retry can reuse it
            var httpClient = new HttpClient();
            // presumably milliseconds (i.e. 5 seconds) — confirm against the HttpClient wrapper
            httpClient.Setting.Timeout = 1000 * 5;
            // Initial POST to firsturl; the response is discarded (presumably to prime a session — confirm).
            httpClient.Create<string>(HttpMethod.Post, firsturl).Send();
            while (true)
            {
                var targetModel = new CrawlerEntity { 操作人姓名 = TaskEntity.OperatorName, 入爬行库时间 = TaskEntity.CreateTime, TaskGuid = TaskEntity.Unique };
                try
                {
                    // Pre-processing: decide which company name this iteration queries
                    if (isReloadCompany)
                    {
                        var companyEntity = new TargeCompanyDomain().GetSingel(QueryCondition);
                        // No company returned: this is the only exit from the loop.
                        if (companyEntity == null)
                            break;
                        targetModel.搜索名称 = companyEntity.CompanyName; // search name, persisted as-is
                        companyOld = targetModel.搜索名称;
                        isReloadCompany = false;
                    }
                    else
                    {
                        // Retry of the previous company (the earlier attempt ended in a proxy-related continue).
                        targetModel.搜索名称 = companyOld;
                    }

                    // Proxy IP handling
                    var proxyEntity = new ProxyDomain().GetByRandom(); // proxy IP
                    if (proxyEntity == null)
                    {
                        // No stored proxy available: fall back to Proxy.Proxy.GetInstance().
                        Console.WriteLine("在线代理临时获取策略启动。");
                        proxyEntity = Proxy.Proxy.GetInstance().GetHttProxyEntity();
                        Console.WriteLine("线上获取到了代理:{0}:{1}", proxyEntity.IpAddress, proxyEntity.Port);
                    }

                    httpClient.Setting.Proxy = new WebProxy(proxyEntity.IpAddress, proxyEntity.Port);

                    // Search request for the company name.
                    var resultBody = httpClient.Create<string>(HttpMethod.Post, targetUrl, data: new
                    {
                        queryStr = targetModel.搜索名称,
                        module = "",
                        idFlag = "qyxy"
                    }).Send();
                    // Invalid response: give the proxy to RemoveOldIp and retry the same company.
                    if (!resultBody.IsValid())
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    // ValidText returning true is treated as a rejected response as well.
                    if (ValidText(resultBody.Result))
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    // Extract the second-level (detail page) link
                    var parser = new JumonyParser();
                    var urls = parser.Parse(resultBody.Result).Find("li a").ToList();
                    var nextUrl = "";
                    if (urls.Count < 1)
                    {
                        // No link in the result list: pass the model to AddNull and move on to a new company.
                        isReloadCompany = true;
                        AddNull(targetModel);
                        continue;
                    }
                    // Every element overwrites the previous values, so the last link in the list wins.
                    foreach (var htmlElement in urls)
                    {
                        targetModel.名称 = htmlElement.InnerText();
                        nextUrl = url + htmlElement.Attribute("href").AttributeValue;
                    }
                    // Fetch the target detail body: zhuUrl plus the query string of the detail link
                    var resultsecondBody =
                        httpClient.Create<string>(HttpMethod.Get, zhuUrl + new Uri(firsturl + nextUrl).Query).Send();
                    // The same query string as name/value pairs; reg_bus_ent_id is read from it below.
                    var nameValueCollection =
                        new NameValueCollection(URL.GetQueryString(new Uri(firsturl + nextUrl).Query));
                    if (!resultsecondBody.IsValid())
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    if (ValidText(resultsecondBody.Result))
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    // Process the detail body; "<th" is rewritten to "<td" so the "tr td" selector below also sees header cells
                    var sorceIhtml = new JumonyParser().Parse(resultsecondBody.Result.Replace("<th", "<td"));
                    var tableLists = sorceIhtml.Find("table[class='f-lbiao']").ToList();
                    var listall = new List<string>();
                    // Collect the text of every cell, without trailing colons and surrounding whitespace.
                    // NOTE(review): the two TrimEnd(':') calls are identical as written; one may have been
                    // meant to strip a full-width colon — confirm.
                    foreach (var tableList in tableLists)
                        tableList.Find("tr td")
                            .ForEach(t => listall.Add(t.InnerText().TrimEnd(':').TrimEnd(':').Trim()));
                    var fillModel = FillModel(listall);
                    fillModel.全局唯一编号 = nameValueCollection["reg_bus_ent_id"].ToUpper();
                    new CrawlerDomain().Add(StrategyNo1(fillModel, targetModel));
                    // Follow-up bookkeeping: proxy usage counter (and the queried-list state)
                    proxyEntity.Usage = proxyEntity.Usage + 1;
                    new ProxyDomain().Update(proxyEntity);
                    Console.WriteLine("{0} 抓取到:{1}", Task.CurrentId, targetModel.搜索名称);
                }
                catch (Exception e)
                {
                    // Any failure is logged, and the next iteration asks GetSingel for a company again.
                    isReloadCompany = true;
                    new LogDomain().Add(new LogEntity { LogType = "error", TaskName = TaskEntity.TaskName, ErrorDetails = Task.CurrentId + "线程: " + e.Message, Details = e.ToString(), TriggerTime = DateTime.Now });
                    continue;
                }
                // Success: the next iteration picks a new company.
                isReloadCompany = true;
            }
        }