Example #1
0
 /// <summary>
 /// Schedules a crawl request unless its URL is outside the allowed host, is already
 /// reported as existing by the task store, or is already waiting in the in-memory queue.
 /// </summary>
 /// <param name="request">Request to schedule; its <c>Url</c> and <c>ReqName</c> are read.</param>
 /// <returns>A task that completes once the request has been queued, persisted, or rejected.</returns>
 public async Task AddRequest(SpiderRequest request)
 {
     try
     {
         // "*" turns host filtering off; otherwise the URL text must contain AllowHost.
         var hostAllowed = AllowHost == "*" || request.Url.Contains(AllowHost);
         if (!hostAllowed)
         {
             Log.Error($"URL:{request.Url} 不在允许的域名内!");
             return;
         }

         using var scope = serviceProvider.CreateScope();
         var taskStore = scope.ServiceProvider.GetRequiredService<ITask>();

         // URLs the task store already knows about are skipped without logging.
         var alreadyRecorded = await taskStore.CheckExistAsync(request.Url);
         if (alreadyRecorded)
         {
             return;
         }

         Log.Information("添加URL:" + request.Url + $"[{request.Url}]");

         // Skip URLs that are already waiting in the in-memory queue.
         var alreadyQueued = _requestQueue.Any(queued => queued.Url == request.Url);
         if (alreadyQueued)
         {
             return;
         }

         // While the in-memory queue holds at most 100 items the request is queued directly.
         if (_requestQueue.Count() <= 100)
         {
             _requestQueue.Enqueue(request);
             return;
         }

         // Queue is over 100 items: persist the request through the task store instead.
         var overflowTask = new Storage.Entity.TaskEntity()
         {
             TagName = request.ReqName,
             Url     = request.Url
         };
         await taskStore.InsertTaskAsync(overflowTask);
     }
     catch (Exception ex)
     {
         // Failures are logged and swallowed; callers never see an exception from here.
         Log.Error(ex, "添加URL到请求队列失败!");
     }
 }
Example #2
0
 /// <summary>
 /// Processes a request. Derived classes supply the implementation.
 /// </summary>
 /// <param name="request">The request to process.</param>
 /// <returns>The <c>SpiderRequest</c> produced by the implementation.</returns>
 public abstract SpiderRequest Process(SpiderRequest request);
Example #3
0
 /// <summary>
 /// Logs the URL of the incoming request and returns the same request instance.
 /// </summary>
 /// <param name="request">The request being processed; its <c>Url</c> is written to the log.</param>
 /// <returns>The <paramref name="request"/> that was passed in.</returns>
 public override SpiderRequest Process(SpiderRequest request)
 {
     var message = "get request:" + request.Url;
     log(message, INFO, "ExampleRequestMiddleware");
     return request;
 }
Example #4
0
        //ConcurrentDictionary<string, string> testdic = new ConcurrentDictionary<string, string>();
        /// <summary>
        /// Starts the scheduler: one background loop drains the request queue and downloads
        /// pages, a second one drains the response queue and hands each response to
        /// <c>callback</c>.
        /// </summary>
        /// <remarks>
        /// Both loops are fire-and-forget and keep running while <c>status</c> equals
        /// <c>SpiderStatus.Running</c>, pausing <c>_taskHz</c> between iterations.
        /// Failures inside the loops are logged, never propagated to the caller.
        /// </remarks>
        /// <returns>A task that completes as soon as both loops have been started.</returns>
        public async Task RunScheduler()
        {
            Log.Information("任务调度器开始运行...");

            // Task.Run unwraps the async delegate and always uses the default scheduler.
            _ = Task.Run(() => RunRequestLoopAsync());
            _ = Task.Run(() => RunResponseLoopAsync());

            await Task.CompletedTask;
        }

        /// <summary>
        /// Request loop: dequeues pending requests and starts one download task per request;
        /// when the queue is empty it refills it from the task store.
        /// </summary>
        private async Task RunRequestLoopAsync()
        {
            using var loopScope = serviceProvider.CreateScope();
            while (status == SpiderStatus.Running)
            {
                try
                {
                    if (_requestQueueSpeedController.GetFullLoadStatus())
                    {
                        Log.Information($"下载任务已经达到了上限,稍后再试!先暂停:{_taskHz} 队列中一共:{_requestQueue.Count()}条数据");
                        await Task.Delay(_taskHz);
                        continue;
                    }

                    if (_requestQueue.TryDequeue(out var dequeued))
                    {
                        Log.Information($"当前队列还剩下:{_requestQueue.Count()}");
                        SpiderRequest request = dequeued;
                        _ = Task.Run(() => DownloadRequestAsync(request));
                    }
                    else
                    {
                        // Nothing queued in memory: pull up to 100 stored tasks from the task store.
                        Log.Information("请求队列中么有数据了,从本地数据库抓取");
                        var taskStore = loopScope.ServiceProvider.GetRequiredService<ITask>();
                        var storedTasks = await taskStore.TakeTaskAsync(100);
                        if (storedTasks != null)
                        {
                            foreach (var item in storedTasks)
                            {
                                _requestQueue.Enqueue(new SpiderRequest()
                                {
                                    Url     = item.Url,
                                    ReqName = item.TagName
                                });
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    Log.Error(ex, "没有拦截到的错误【429】");
                }
                await Task.Delay(_taskHz);
            }
        }

        /// <summary>
        /// Downloads a single request and enqueues the resulting response. On failure the
        /// request is re-queued while <c>ReTryTimes</c> is at most 3; after that the error is
        /// saved through the task store when <c>IfSave</c> is set.
        /// </summary>
        /// <param name="request">The request taken from the request queue.</param>
        private async Task DownloadRequestAsync(SpiderRequest request)
        {
            using var scope = serviceProvider.CreateScope();

            // Tracks whether Add() was called so Sub() runs exactly once, and only after Add().
            var slotTaken = false;
            void ReleaseSlot()
            {
                if (slotTaken)
                {
                    slotTaken = false;
                    _requestQueueSpeedController.Sub();
                }
            }

            try
            {
                Log.Information($"下载html:{request.Url}");

                var taskStore = scope.ServiceProvider.GetRequiredService<ITask>();
                if (await taskStore.CheckExistAsync(request.Url))
                {
                    Log.Information("当前URL已经下载过了,不进行再次下载");
                    return;
                }

                if (_requestQueueSpeedController.GetFullLoadStatus())
                {
                    // The controller filled up between dequeue and task start: put the
                    // request back instead of dropping it.
                    _requestQueue.Enqueue(request);
                    return;
                }

                _requestQueueSpeedController.Add();
                slotTaken = true;

                string html;
                if (request.IsAjax)
                {
                    var ajaxLoader = scope.ServiceProvider.GetRequiredService<IAjaxDownLoader>();
                    html = await ajaxLoader.DownAsync(request.Url, request.AjaxParamData);
                }
                else
                {
                    var htmlLoader = scope.ServiceProvider.GetRequiredService<IHtmlDownLoader>();
                    html = await htmlLoader.DownAsync(request.Url);
                }

                _responseQueue.Enqueue(new SpiderResponse
                {
                    Request = request,
                    Referer = request.Url,
                    Content = html,
                    CharSet = "utf-8"
                });
            }
            catch (Exception ex)
            {
                Log.Error(ex, "下载文件队列异常!");

                // Free the slot before retry handling so a slow error save does not hold it.
                ReleaseSlot();

                if (request.ReTryTimes <= 3)
                {
                    request.ReTryTimes += 1;
                    _requestQueue.Enqueue(request);
                }
                else if (request.IfSave)
                {
                    try
                    {
                        var errorStore = scope.ServiceProvider.GetRequiredService<ITask>();
                        await errorStore.SaveErrorAsync(request.Url, request.ReqName);
                    }
                    catch (Exception saveEx)
                    {
                        // This task is never awaited, so an exception escaping here would be lost.
                        Log.Error(saveEx, $"保存失败记录时出错:{request.Url}");
                    }
                }
            }
            finally
            {
                // No-op when the slot was never taken or was already released above.
                ReleaseSlot();
            }
        }

        /// <summary>
        /// Response loop: dequeues downloaded responses and starts one processing task per
        /// response.
        /// </summary>
        private async Task RunResponseLoopAsync()
        {
            while (status == SpiderStatus.Running)
            {
                try
                {
                    if (_responseQueueSpeedController.GetFullLoadStatus())
                    {
                        Log.Information($"当前速度已经满载!response队列剩余:{_responseQueue.Count()} 下载队列剩余:{_requestQueue.Count()}");
                        await Task.Delay(_taskHz);
                        continue;
                    }

                    if (_responseQueue.TryDequeue(out var dequeued))
                    {
                        var response = dequeued;
                        _ = Task.Run(() => ProcessResponseAsync(response));
                    }
                }
                catch (Exception ex)
                {
                    Log.Error(ex, "没有拦截到的错误【490】");
                }
                await Task.Delay(_taskHz);
            }
        }

        /// <summary>
        /// Passes one response to <c>callback</c> (when set) and then saves either the URL or
        /// the error through the task store, provided the request has <c>IfSave</c> set.
        /// </summary>
        /// <param name="response">The response taken from the response queue.</param>
        private async Task ProcessResponseAsync(SpiderResponse response)
        {
            using var scope = serviceProvider.CreateScope();

            // Guards against calling Sub() twice when saving the URL fails after the
            // callback has already finished and released the slot.
            var slotTaken = false;
            void ReleaseSlot()
            {
                if (slotTaken)
                {
                    slotTaken = false;
                    _responseQueueSpeedController.Sub();
                }
            }

            try
            {
                ITask taskStore = scope.ServiceProvider.GetRequiredService<ITask>();
                try
                {
                    _responseQueueSpeedController.Add();
                    slotTaken = true;

                    // The callback gets this task's scoped provider through the response.
                    response.serviceProvider = scope.ServiceProvider;
                    if (callback != null)
                    {
                        await callback(response);
                    }
                    ReleaseSlot();

                    // Save the URL only after the callback completed without throwing.
                    if (response.Request.IfSave)
                    {
                        await taskStore.SaveUrlToDbAsync(response.Request.Url);
                    }
                }
                catch (Exception ex)
                {
                    Log.Error(ex, $"解析或保存数据错误:{response.Request.Url}");
                    ReleaseSlot();
                    if (response.Request.IfSave)
                    {
                        await taskStore.SaveErrorAsync(response.Request.Url, response.Request.ReqName);
                    }
                }
            }
            catch (Exception ex)
            {
                // Reached when resolving the task store or saving the error itself fails.
                Log.Error(ex, "写入到ParseItem错误!");
            }
            finally
            {
                ReleaseSlot();
            }
        }