/// <summary>
/// Adds a request to the in-memory request queue, or persists it to the task store
/// when the queue already holds more than 100 entries.
/// </summary>
/// <remarks>
/// The request is ignored when its URL does not contain <c>AllowHost</c> (unless
/// <c>AllowHost</c> is <c>"*"</c>), when <c>ITask.CheckExistAsync</c> reports the URL as
/// existing, or when a request with the same URL is already queued.
/// Exceptions are logged and swallowed; they are never propagated to the caller.
/// </remarks>
/// <param name="request">The request to schedule.</param>
/// <returns>A task that completes once the request has been queued, persisted or rejected.</returns>
public async Task AddRequest(SpiderRequest request)
{
    // Upper bound of requests kept in memory; anything beyond it is written to the task store.
    const int MaxQueuedRequests = 100;

    try
    {
        // Reject URLs outside the allowed host ("*" allows everything).
        if (AllowHost != "*" && !request.Url.Contains(AllowHost))
        {
            Log.Error($"URL:{request.Url} 不在允许的域名内!");
            return;
        }

        using var scope = serviceProvider.CreateScope();
        var task = scope.ServiceProvider.GetRequiredService<ITask>();

        // Nothing to do when the task store already knows this URL.
        if (await task.CheckExistAsync(request.Url))
        {
            return;
        }

        // Log the URL together with the request name (the original printed the URL twice).
        Log.Information($"添加URL:{request.Url}[{request.ReqName}]");

        // Skip URLs that are already waiting in the queue.
        if (_requestQueue.Any(queued => queued.Url == request.Url))
        {
            return;
        }

        if (_requestQueue.Count() > MaxQueuedRequests)
        {
            // Queue is full: park the request in the task store so it can be picked up later.
            await task.InsertTaskAsync(new Storage.Entity.TaskEntity() { TagName = request.ReqName, Url = request.Url });
        }
        else
        {
            _requestQueue.Enqueue(request);
        }
    }
    catch (Exception ex)
    {
        Log.Error(ex, "添加URL到请求队列失败!");
    }
}
/// <summary>
/// Processes a spider request. The method is abstract, so the actual behaviour is
/// supplied by the concrete subclass.
/// </summary>
/// <param name="request">The request to process.</param>
/// <returns>The <see cref="SpiderRequest"/> produced by the implementation.</returns>
public abstract SpiderRequest Process(SpiderRequest request);
/// <summary>
/// Logs the URL of the incoming request and hands the request back unchanged.
/// </summary>
/// <param name="request">The request being processed.</param>
/// <returns>The same <paramref name="request"/> instance that was passed in.</returns>
public override SpiderRequest Process(SpiderRequest request)
{
    var message = $"get request:{request.Url}";
    log(message, INFO, "ExampleRequestMiddleware");
    return request;
}
/// <summary>
/// Starts the task scheduler: one background loop feeds queued requests to download workers,
/// and a second loop hands downloaded responses to the parse callback.
/// </summary>
/// <remarks>
/// Both loops are fire-and-forget and keep running while <c>status</c> equals
/// <c>SpiderStatus.Running</c>. The returned task completes as soon as the loops have been
/// launched; it does not wait for them to finish.
/// </remarks>
/// <returns>A task that completes once both background loops have been started.</returns>
public async Task RunScheduler()
{
    Log.Information("任务调度器开始运行...");

    // Task.Run (instead of Task.Factory.StartNew) unwraps the async delegate and always
    // schedules on the default scheduler, independent of the caller's TaskScheduler.Current.
    _ = Task.Run(() => PumpRequestsAsync());
    _ = Task.Run(() => PumpResponsesAsync());

    await Task.CompletedTask;

    // Moves requests from the queue to download workers; refills the queue from the task
    // store whenever it runs empty.
    async Task PumpRequestsAsync()
    {
        using var refillScope = serviceProvider.CreateScope();
        while (status == SpiderStatus.Running)
        {
            try
            {
                if (_requestQueueSpeedController.GetFullLoadStatus())
                {
                    Log.Information($"下载任务已经达到了上限,稍后再试!先暂停:{_taskHz} 队列中一共:{_requestQueue.Count()}条数据");
                    await Task.Delay(_taskHz);
                    continue;
                }

                if (_requestQueue.TryDequeue(out var dequeued))
                {
                    Log.Information($"当前队列还剩下:{_requestQueue.Count()}");

                    // Per-iteration copy so the worker never observes a later loop value.
                    SpiderRequest request = dequeued;
                    _ = Task.Run(() => DownloadAsync(request));
                }
                else
                {
                    // The in-memory queue is empty: pull the next batch from the task store.
                    Log.Information("请求队列中么有数据了,从本地数据库抓取");
                    var taskStore = refillScope.ServiceProvider.GetRequiredService<ITask>();
                    var pending = await taskStore.TakeTaskAsync(100);
                    if (pending != null)
                    {
                        foreach (var item in pending)
                        {
                            _requestQueue.Enqueue(new SpiderRequest() { Url = item.Url, ReqName = item.TagName });
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Log.Error(ex, "没有拦截到的错误【429】");
            }

            await Task.Delay(_taskHz);
        }
    }

    // Downloads one request and queues its response; a failed download is re-queued while
    // ReTryTimes <= 3 and afterwards recorded as an error (when IfSave is set).
    async Task DownloadAsync(SpiderRequest request)
    {
        using var scope = serviceProvider.CreateScope();
        try
        {
            Log.Information($"下载html:{request.Url}");

            var taskStore = scope.ServiceProvider.GetRequiredService<ITask>();
            if (await taskStore.CheckExistAsync(request.Url))
            {
                Log.Information("当前URL已经下载过了,不进行再次下载");
                return;
            }

            if (_requestQueueSpeedController.GetFullLoadStatus())
            {
                // No download slot is free: put the request back instead of silently losing it.
                _requestQueue.Enqueue(request);
                return;
            }

            _requestQueueSpeedController.Add();
            try
            {
                string html;
                if (request.IsAjax)
                {
                    var ajaxLoader = scope.ServiceProvider.GetRequiredService<IAjaxDownLoader>();
                    html = await ajaxLoader.DownAsync(request.Url, request.AjaxParamData);
                }
                else
                {
                    var htmlLoader = scope.ServiceProvider.GetRequiredService<IHtmlDownLoader>();
                    html = await htmlLoader.DownAsync(request.Url);
                }

                _responseQueue.Enqueue(new SpiderResponse
                {
                    Request = request,
                    Referer = request.Url,
                    Content = html,
                    CharSet = "utf-8"
                });
            }
            finally
            {
                // Pair every Add() with exactly one Sub(), and only after Add() really ran,
                // so the load counter cannot drift when an earlier step throws.
                _requestQueueSpeedController.Sub();
            }
        }
        catch (Exception ex)
        {
            Log.Error(ex, "下载文件队列异常!");

            if (request.ReTryTimes <= 3)
            {
                // Give the request another chance by putting it back on the queue.
                request.ReTryTimes += 1;
                _requestQueue.Enqueue(request);
            }
            else if (request.IfSave)
            {
                ITask db = scope.ServiceProvider.GetRequiredService<ITask>();
                await db.SaveErrorAsync(request.Url, request.ReqName);
            }
        }
    }

    // Takes responses off the response queue and passes each one to the parse callback on
    // its own worker task.
    async Task PumpResponsesAsync()
    {
        while (status == SpiderStatus.Running)
        {
            try
            {
                if (_responseQueueSpeedController.GetFullLoadStatus())
                {
                    Log.Information($"当前速度已经满载!response队列剩余:{_responseQueue.Count()} 下载队列剩余:{_requestQueue.Count()}");
                    await Task.Delay(_taskHz);
                    continue;
                }

                if (_responseQueue.TryDequeue(out var response))
                {
                    _ = Task.Run(async () =>
                    {
                        using var scope = serviceProvider.CreateScope();
                        try
                        {
                            ITask db = scope.ServiceProvider.GetRequiredService<ITask>();
                            try
                            {
                                _responseQueueSpeedController.Add();
                                try
                                {
                                    // Expose the scoped services to the callback through the response.
                                    response.serviceProvider = scope.ServiceProvider;
                                    if (callback != null)
                                    {
                                        await callback(response);
                                    }
                                }
                                finally
                                {
                                    // Released exactly once, even when saving the URL below fails.
                                    _responseQueueSpeedController.Sub();
                                }

                                // Remember the URL as processed only after the callback succeeded.
                                if (response.Request.IfSave)
                                {
                                    await db.SaveUrlToDbAsync(response.Request.Url);
                                }
                            }
                            catch (Exception ex)
                            {
                                Log.Error(ex, $"解析或保存数据错误:{response.Request.Url}");
                                if (response.Request.IfSave)
                                {
                                    await db.SaveErrorAsync(response.Request.Url, response.Request.ReqName);
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            Log.Error(ex, "写入到ParseItem错误!");
                        }
                    });
                }
            }
            catch (Exception ex)
            {
                Log.Error(ex, "没有拦截到的错误【490】");
            }

            await Task.Delay(_taskHz);
        }
    }
}