// Extracts every <a href="..."> link from the downloaded page, keeps only
// absolute http/https/ftp/ftps URLs, and schedules them as follow-up requests.
public SpiderResponse Extract(Response response)
{
    var item = new Item();
    //var reg = new Regex(@"(?is)<a(?:(?!href=).)*href=(['""]?)(?<url>[^""\s>]*)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");
    //var mc = reg.Matches(response.Html);
    var mc = Regex.Matches(response.Html, @"(?is)<a(?:(?!href=).)*href=(['""]?)(?<url>[^""\s>]*)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");
    var rs = new List<Request>();
    foreach (Match m in mc)
    {
        var url = m.Groups["url"].Value;
        if (string.IsNullOrEmpty(url))
        {
            continue;
        }

        // Only keep absolute URLs (http/https/ftp/ftps); relative links are skipped.
        var m1 = Regex.Match(url, @"((ht|f)tps?):\/\/[\w\-]+(\.[\w\-]+)+([\w\-\.,@?^=%&:\/~\+#]*[\w\-\@?^=%&\/~\+#])?");
        if (!m1.Success || string.IsNullOrEmpty(m1.Value))
        {
            continue;
        }

        rs.Add(new Request() { Url = m1.Value, AllowAutoRedirect = true });
    }

    // Record which page these links were extracted from.
    item.Data.Add("url", response.Request.Url);
    return SpiderResponse.Create(item, rs.ToArray());
}
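// A minimal, self-contained sketch of the anchor-extraction regex used above,
// run against a small hand-written HTML fragment. The sample markup and the
// LinkRegexDemo class are illustrative only and are not part of the spider.
using System;
using System.Text.RegularExpressions;

static class LinkRegexDemo
{
    public static void Main()
    {
        const string html = @"<p><a href=""https://example.com/a"">abs</a> <a href='/relative/path'>rel</a></p>";
        var mc = Regex.Matches(html, @"(?is)<a(?:(?!href=).)*href=(['""]?)(?<url>[^""\s>]*)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");
        foreach (Match m in mc)
        {
            // Prints both the absolute and the relative link; the extractor's
            // second regex is what filters the relative one out.
            Console.WriteLine(m.Groups["url"].Value + " -> " + m.Groups["text"].Value);
        }
    }
}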
// Depth 0: the product page. Pull sellerId/spuId/itemId out of the page source
// and queue the Tmall rating API for the first page of reviews.
// Depth 1: the rating response, which is currently just dumped to the console.
public SpiderResponse Extract(Response response)
{
    if (response.Request.Deep == 0)
    {
        var html = response.Html;
        var sellerId = Regex.Match(html, @"sellerId=(\d+?)&").Groups[1].Value;
        var spuId = Regex.Match(html, @"spuId=(\d+?)&").Groups[1].Value;
        var itemId = Regex.Match(html, @"itemId=(\d+?)&").Groups[1].Value;
        var requests = new List<Request>();
        // Only the first results page is requested here; widen the loop to fetch more pages.
        for (int i = 1; i < 2; i++)
        {
            requests.Add(new Request()
            {
                Url = string.Format(@"https://rate.tmall.com/list_detail_rate.htm?itemId={0}&spuId={1}&sellerId={2}&currentPage={3}", itemId, spuId, sellerId, i),
                AllowAutoRedirect = true
            });
        }
        return SpiderResponse.Create(null, requests.ToArray());
    }
    else if (response.Request.Deep == 1)
    {
        // The rating list comes back as JSON; for now it is only printed.
        var data = Newtonsoft.Json.JsonConvert.DeserializeObject<dynamic>(response.Html);
        Console.WriteLine(response.Html);
    }
    return null;
}
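// A minimal sketch of the id-extraction regexes above, run against an invented
// query-string fragment. The sample string is hypothetical and not actual
// Tmall page markup; it only illustrates how the three ids are pulled out and
// recombined into the rating URL.
using System;
using System.Text.RegularExpressions;

static class TmallIdDemo
{
    public static void Main()
    {
        const string sample = "itemId=123456&spuId=789012&sellerId=345678&x=1";
        var itemId = Regex.Match(sample, @"itemId=(\d+?)&").Groups[1].Value;
        var spuId = Regex.Match(sample, @"spuId=(\d+?)&").Groups[1].Value;
        var sellerId = Regex.Match(sample, @"sellerId=(\d+?)&").Groups[1].Value;
        // Rebuild the rating URL exactly as the extractor does, page 1 only.
        Console.WriteLine($"https://rate.tmall.com/list_detail_rate.htm?itemId={itemId}&spuId={spuId}&sellerId={sellerId}&currentPage=1");
    }
}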
// Parses a single show page: title, weekly update day and summary.
private SpiderResponse ExtractEpisode(Response response)
{
    var movie = new Movie();
    var doc = new HtmlDocument();
    var item = new Item();
    var requests = new List<Request>();
    doc.LoadHtml(response.Html);

    // Title sits in the page header; strip spaces and drop everything after the first '-'.
    movie.Title = doc.DocumentNode.SelectSingleNode("//div[contains(@class,'newscontent')]//div[@class='hd']").InnerText.Replace(" ", "").Trim().Split('-')[0];

    // The update-day label is Chinese text from the site ("更新日:每周一" = "updates every Monday"),
    // mapped to 1..6 for Monday..Saturday and 0 for Sunday.
    var day = doc.DocumentNode.SelectSingleNode("//div[contains(@class,'newscontent')]//div[@class='seedlink']/span[1]").InnerText.Trim();
    switch (day)
    {
        case "更新日:每周一": movie.UpdateDay = 1; break;
        case "更新日:每周二": movie.UpdateDay = 2; break;
        case "更新日:每周三": movie.UpdateDay = 3; break;
        case "更新日:每周四": movie.UpdateDay = 4; break;
        case "更新日:每周五": movie.UpdateDay = 5; break;
        case "更新日:每周六": movie.UpdateDay = 6; break;
        case "更新日:每周日": movie.UpdateDay = 0; break;
        default: break;
    }

    movie.Summary = doc.DocumentNode.SelectSingleNode("//div[contains(@class,'newscontent')]//div[@class='newstxt']/p").InnerText;

    //var seasons = doc.DocumentNode.SelectNodes("//div[@class='seasonitem']/h3");
    //foreach (var season in seasons)
    //{
    //    requests.Add(new Request()
    //    {
    //        Method = "POST",
    //        Url = "http://www.ttmeiju.com/index.php/meiju/get_episodies.html",
    //        AllowAutoRedirect = true
    //    });
    //}

    item.Data["url"] = response.Request.Url;
    item.Data["movie"] = movie;
    return SpiderResponse.Create(item, requests.ToArray());
}
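// A minimal alternative sketch for the weekday switch above: the same
// "更新日:每周X" labels mapped through a lookup table. UpdateDayMap and
// ParseUpdateDay are hypothetical helpers, not part of the original class;
// unknown labels fall back to -1 here, whereas the original switch simply
// leaves the default value in place.
using System.Collections.Generic;

static class UpdateDayMap
{
    private static readonly Dictionary<string, int> Days = new Dictionary<string, int>
    {
        ["更新日:每周一"] = 1,
        ["更新日:每周二"] = 2,
        ["更新日:每周三"] = 3,
        ["更新日:每周四"] = 4,
        ["更新日:每周五"] = 5,
        ["更新日:每周六"] = 6,
        ["更新日:每周日"] = 0,
    };

    // Returns the same 0..6 codes as the switch; -1 for anything unrecognised.
    public static int ParseUpdateDay(string label) =>
        Days.TryGetValue(label?.Trim() ?? string.Empty, out var day) ? day : -1;
}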
// Response middleware: when the server answers with a 302, re-queue the request
// against the Location header instead of passing the response down the pipeline.
public override SpiderResponse Process(SpiderResponse response)
{
    if (response.httpResponse.status["StatusCode"] == "302")
    {
        var request = response.request;
        request.Url = response.httpResponse.header["Location"];
        Console.WriteLine("302 to:" + request.Url);
        Spider.AddRequest(request);
        return null;
    }
    return response;
}
// Entry callback used for a binary-download test: dumps the raw response bytes
// to a local file. The writer is wrapped in `using` so the stream is flushed
// and the file handle released.
public override SpiderRequest Start(SpiderResponse response)
{
    Count++;
    Console.OutputEncoding = System.Text.Encoding.UTF8;
    using (var bw = new BinaryWriter(new FileStream(@"D:\test.png", FileMode.Create)))
    {
        bw.Write(response.httpResponse.binaryData);
    }
    //Console.WriteLine(Count.ToString()+":"+response.request.Url+":"+response.httpResponse.body);
    //if (Count == 2)
    //{
    //    Spider.Stop();
    //    return null;
    //}
    //return SpiderRequest.Make("http://www.oschina.net/", Start);
    return null;
}
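// A minimal alternative sketch: when the whole payload is already in memory as
// a byte[], File.WriteAllBytes covers the create/write/flush/close steps in one
// call. The path and the BinaryDumpDemo class are placeholders for illustration.
using System.IO;

static class BinaryDumpDemo
{
    public static void Save(byte[] binaryData)
    {
        // Equivalent effect to the BinaryWriter/FileStream pair above.
        File.WriteAllBytes(@"D:\test.png", binaryData);
    }
}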
// Parses the ttmeiju listing table and queues a request for every show row.
private SpiderResponse ExtractMovie(Response response)
{
    var requests = new List<Request>();
    var html = response.Html;
    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Rows of the "latest" table; SelectNodes returns null when nothing matches.
    var trs = doc.DocumentNode.SelectNodes("//table[@class='latesttable']//tr[contains(@class,'Scontent1') or contains(@class,'Scontent')]");
    if (trs == null)
    {
        return SpiderResponse.Create(null, requests.ToArray());
    }

    foreach (var item in trs)
    {
        // The show link sits in the second cell of each row.
        var link = item.SelectSingleNode("td[2]/a");
        var href = link?.Attributes["href"]?.Value;
        if (string.IsNullOrEmpty(href))
        {
            continue;
        }
        requests.Add(new Request()
        {
            Url = string.Format("http://www.ttmeiju.com{0}", href),
            AllowAutoRedirect = true
        });
    }
    return SpiderResponse.Create(null, requests.ToArray());
}
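// A minimal, self-contained sketch of the XPath used above, run against a tiny
// hand-written table instead of the live listing page; the markup below is
// illustrative only and not taken from ttmeiju.
using System;
using HtmlAgilityPack;

static class LatestTableDemo
{
    public static void Main()
    {
        const string html = @"<table class='latesttable'>
            <tr class='Scontent'><td>1</td><td><a href='/meiju/12345.html'>Some Show</a></td></tr>
            <tr class='Scontent1'><td>2</td><td><a href='/meiju/67890.html'>Another Show</a></td></tr>
        </table>";
        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        var trs = doc.DocumentNode.SelectNodes("//table[@class='latesttable']//tr[contains(@class,'Scontent1') or contains(@class,'Scontent')]");
        foreach (var tr in trs)
        {
            // Same td[2]/a selection as the extractor, printed as an absolute URL.
            var href = tr.SelectSingleNode("td[2]/a").Attributes["href"].Value;
            Console.WriteLine("http://www.ttmeiju.com" + href);
        }
    }
}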
// Example middleware: logs the HTTP status code and passes the response through unchanged.
public override SpiderResponse Process(SpiderResponse response)
{
    log("get response:" + response.httpResponse.status["StatusCode"], INFO, "ExampleRequestMiddleware");
    return response;
}
/// <summary>
/// Middleware hook invoked for each downloaded response; implementations may
/// return the response to pass it on, or null to stop further processing.
/// </summary>
public abstract SpiderResponse Process(SpiderResponse response);
//ConcurrentDictionary<string, string> testdic = new ConcurrentDictionary<string, string>();
/// <summary>
/// Starts the task scheduler.
/// </summary>
/// <returns></returns>
public async Task RunScheduler()
{
    Log.Information("Task scheduler started...");

    // Worker 1: drain the request queue and download each URL; when the queue
    // runs dry, refill it with pending tasks from the local database.
    _ = Task.Factory.StartNew(async () =>
    {
        using var reqscope = serviceProvider.CreateScope();
        while (status == SpiderStatus.Running)
        {
            try
            {
                if (_requestQueueSpeedController.GetFullLoadStatus())
                {
                    Log.Information($"Download slots are full, retrying later. Pausing {_taskHz} ms; {_requestQueue.Count()} requests queued");
                    await Task.Delay(_taskHz);
                    continue;
                }
                if (_requestQueue.TryDequeue(out var req))
                {
                    Log.Information($"Requests left in queue: {_requestQueue.Count()}");
                    SpiderRequest request = req;
                    _ = Task.Factory.StartNew(async () =>
                    {
                        using var scope = serviceProvider.CreateScope();
                        try
                        {
                            Log.Information($"Downloading html: {request.Url}");
                            // Skip URLs that were already downloaded, and bail out if the
                            // download slots filled up while this task was being scheduled.
                            var task = scope.ServiceProvider.GetRequiredService<ITask>();
                            if (await task.CheckExistAsync(req.Url))
                            {
                                Log.Information("URL already downloaded, skipping");
                                return;
                            }
                            if (_requestQueueSpeedController.GetFullLoadStatus())
                            {
                                return;
                            }
                            _requestQueueSpeedController.Add();
                            //todo: check whether this URL has already been collected
                            var html = string.Empty;
                            if (request.IsAjax)
                            {
                                var ajax = scope.ServiceProvider.GetRequiredService<IAjaxDownLoader>();
                                html = await ajax.DownAsync(request.Url, request.AjaxParamData);
                            }
                            else
                            {
                                var htmlloader = scope.ServiceProvider.GetRequiredService<IHtmlDownLoader>();
                                html = await htmlloader.DownAsync(request.Url);
                            }
                            var response = new SpiderResponse
                            {
                                Request = request,
                                Referer = request.Url,
                                Content = html,
                                CharSet = "utf-8"
                            };
                            // Hand the downloaded page over to the parsing worker.
                            _responseQueue.Enqueue(response);
                            _requestQueueSpeedController.Sub();
                        }
                        catch (Exception ex)
                        {
                            Log.Error(ex, "Download queue exception!");
                            // On failure, requeue the request for up to 3 retries,
                            // otherwise record it as an error.
                            _requestQueueSpeedController.Sub();
                            if (request.ReTryTimes <= 3)
                            {
                                request.ReTryTimes += 1;
                                _requestQueue.Enqueue(request);
                            }
                            else
                            {
                                ITask db = scope.ServiceProvider.GetRequiredService<ITask>();
                                if (request.IfSave)
                                {
                                    await db.SaveErrorAsync(request.Url, request.ReqName);
                                }
                            }
                        }
                    });
                }
                else
                {
                    // The request queue is empty, so refill it from the local database.
                    Log.Information("Request queue is empty, fetching tasks from the local database");
                    var task = reqscope.ServiceProvider.GetRequiredService<ITask>();
                    var list = await task.TakeTaskAsync(100);
                    if (list != null && list.Count() > 0)
                    {
                        foreach (var item in list)
                        {
                            _requestQueue.Enqueue(new SpiderRequest() { Url = item.Url, ReqName = item.TagName });
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Log.Error(ex, "Unhandled error [429]");
            }
            await Task.Delay(_taskHz);
        }
    });

    // Worker 2: drain the response queue and run the parse/save callback.
    _ = Task.Factory.StartNew(async () =>
    {
        while (status == SpiderStatus.Running)
        {
            try
            {
                if (_responseQueueSpeedController.GetFullLoadStatus())
                {
                    Log.Information($"Parsing is at full load. Response queue: {_responseQueue.Count()}, request queue: {_requestQueue.Count()}");
                    await Task.Delay(_taskHz);
                    continue;
                }
                using var scope = serviceProvider.CreateScope();
                var log = scope.ServiceProvider.GetRequiredService<ILogger<CrawlerNet>>();
                if (_responseQueue.TryDequeue(out var response))
                {
                    _ = Task.Factory.StartNew(async () =>
                    {
                        using var parseScope = serviceProvider.CreateScope();
                        try
                        {
                            ITask db = parseScope.ServiceProvider.GetRequiredService<ITask>();
                            try
                            {
                                _responseQueueSpeedController.Add();
                                response.serviceProvider = parseScope.ServiceProvider;
                                //await ParseItem(response);
                                if (callback != null)
                                {
                                    await callback(response);
                                }
                                _responseQueueSpeedController.Sub();
                                // After the data is saved successfully, record the URL locally
                                // so it is not downloaded again.
                                if (response.Request.IfSave)
                                {
                                    await db.SaveUrlToDbAsync(response.Request.Url);
                                }
                            }
                            catch (Exception ex)
                            {
                                Log.Error(ex, $"Error while parsing or saving data: {response.Request.Url}");
                                _responseQueueSpeedController.Sub();
                                if (response.Request.IfSave)
                                {
                                    await db.SaveErrorAsync(response.Request.Url, response.Request.ReqName);
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            Log.Error(ex, "Error in the ParseItem stage!");
                        }
                    });
                }
            }
            catch (Exception ex)
            {
                Log.Error(ex, "Unhandled error [490]");
            }
            await Task.Delay(_taskHz);
        }
    });

    await Task.CompletedTask;
}
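// A minimal sketch of the counter-style interface that the two speed
// controllers used above expose (GetFullLoadStatus / Add / Sub). This is a
// hypothetical illustration of the idea, assuming a simple capped in-flight
// counter; it is not the project's actual implementation.
using System.Threading;

class QueueSpeedController
{
    private readonly int _maxConcurrency;
    private int _inFlight;

    public QueueSpeedController(int maxConcurrency) => _maxConcurrency = maxConcurrency;

    // True when no more work should be started right now.
    public bool GetFullLoadStatus() => Volatile.Read(ref _inFlight) >= _maxConcurrency;

    // Called just before a download/parse starts.
    public void Add() => Interlocked.Increment(ref _inFlight);

    // Called when it finishes, successfully or not.
    public void Sub() => Interlocked.Decrement(ref _inFlight);
}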