// Parses the saved JD listing page ("Jd.html") into Product entities and verifies
// the number of entities as well as every mapped field of the first one.
public void ParseEntity()
{
    const string dayFormat = "yyyy-MM-dd";

    // Arrange: a data flow context holding the canned response.
    var scope = SpiderFactory.CreateScope();
    var context = new DataFlowContext(scope);
    var properties = new Dictionary<string, string>
    {
        ["cat"] = "手机",
        ["cat3"] = "110"
    };
    var request = new Request("https://list.jd.com/list.html?cat=9987,653,655", properties);
    context.AddResponse(new Response
    {
        Request = request,
        RawText = File.ReadAllText("Jd.html")
    });

    // Act: run the entity parser synchronously.
    var parser = new DataParser<Product>();
    parser.HandleAsync(context).GetAwaiter().GetResult();

    // Assert: the parsed items are stored under the entity's full type name.
    var rawItems = (List<object>)context.GetItem(typeof(Product).FullName);
    var products = rawItems.Cast<Product>().ToList();

    Assert.Equal(60, products.Count);

    var first = products[0];
    Assert.Equal("手机", first.CategoryName);
    Assert.Equal(110, first.CategoryId);
    Assert.Equal("https://item.jd.com/3031737.html", first.Url);
    Assert.Equal("3031737", first.Sku);
    Assert.Equal("荣耀官方旗舰店", first.ShopName);
    Assert.Equal("荣耀 NOTE 8 4GB+32GB 全网通版 冰河银", first.Name);
    Assert.Equal("1000000904", first.VenderId);
    Assert.Equal("1000000904", first.JdzyShopId);
    Assert.Equal(DateTime.Now.ToString(dayFormat), first.RunId.ToString(dayFormat));
}
// Applies the XPath follow selector to the saved cnblogs page ("cnblogs.html") and
// verifies the number of extracted requests and that the page-2 link is among them.
public void XpathFollow()
{
    // Arrange: a data flow context holding the canned response.
    var scope = SpiderFactory.CreateScope();
    var context = new DataFlowContext(scope);
    var response = new Response
    {
        Request = new Request("http://cnblogs.com"),
        RawText = File.ReadAllText("cnblogs.html")
    };
    context.AddResponse(response);

    // Act: build the follow selector for the pager block and run it on the context.
    var followed = DataParser.XpathFollow(".//div[@class='pager']").Invoke(context);

    // Assert
    Assert.Equal(12, followed.Length);
    Assert.Contains(followed, link => link == "http://cnblogs.com/sitehome/p/2");
}
// Parses the Html fixture into E entities and verifies that the first two
// entities carry the titles "a" and "b", in that order.
public void MultiEntitySelector()
{
    // Arrange: a data flow context holding the in-memory fixture.
    var scope = SpiderFactory.CreateScope();
    var context = new DataFlowContext(scope);
    var response = new Response
    {
        Request = new Request("http://abcd.com"),
        RawText = Html
    };
    context.AddResponse(response);

    // Act: run the entity parser synchronously.
    var entityParser = new DataParser<E>();
    entityParser.HandleAsync(context).GetAwaiter().GetResult();

    // Assert: the parsed items are stored under the entity's full type name.
    var rawItems = (List<object>)context.GetItem(typeof(E).FullName);
    var entities = rawItems.Cast<E>().ToList();

    Assert.Equal("a", entities[0].title);
    Assert.Equal("b", entities[1].title);
}
/// <summary>
/// Handles one message carrying a JSON-serialized list of download responses.
/// Successful responses are pushed through the data-flow pipeline; failed ones are
/// counted and, while below <c>RetryDownloadTimes</c>, re-enqueued on the scheduler.
/// </summary>
/// <param name="message">JSON array of <c>Response</c> objects. Blank messages are logged and ignored.</param>
/// <returns>A task that completes once every response in the message has been handled.</returns>
private async Task HandleMessage(string message)
{
    if (string.IsNullOrWhiteSpace(message))
    {
        _logger.LogWarning($"任务 {Id} 接收到空消息");
        return;
    }

    _lastRequestedTime = DateTime.Now;

    var responses = JsonConvert.DeserializeObject<List<Response>>(message);
    // The deserializer can hand back null (e.g. for the JSON literal "null"),
    // so guard against that as well as against an empty list.
    if (responses == null || responses.Count == 0)
    {
        _logger.LogWarning($"任务 {Id} 接收到空回复");
        return;
    }

    var agentId = responses[0].AgentId;

    var successResponses = responses.Where(x => x.Success).ToList();

    // Download-success statistics.
    if (successResponses.Count > 0)
    {
        var elapsedMilliseconds = successResponses.Sum(x => x.ElapsedMilliseconds);
        await _statisticsService.IncrementDownloadSuccessAsync(agentId, successResponses.Count,
            elapsedMilliseconds);
    }

    // Process the successfully downloaded responses concurrently and wait for all of them.
    // An async lambda handed to Parallel.ForEach becomes "async void": the loop returns at
    // the first await and the work is never awaited. Task.Run keeps the thread-pool
    // parallelism while giving us tasks that can actually be awaited.
    await Task.WhenAll(successResponses.Select(
        response => Task.Run(() => ProcessSuccessResponseAsync(response))));

    var retryResponses = responses
        .Where(x => !x.Success && x.Request.RetriedTimes < RetryDownloadTimes)
        .ToList();
    retryResponses.ForEach(x =>
    {
        x.Request.RetriedTimes++;
        _logger.LogInformation($"任务 {Id} 下载 {x.Request.Url} 失败: {x.Exception}");
    });

    var failedRequests = responses.Where(x => !x.Success).ToList();

    // Download-failure statistics.
    if (failedRequests.Count > 0)
    {
        await _statisticsService.IncrementFailedAsync(Id);
        await _statisticsService.IncrementDownloadFailedAsync(agentId, failedRequests.Count);
    }

    var retryCount = _scheduler.Enqueue(retryResponses.Select(x => x.Request.Clone()));
    if (retryCount > 0)
    {
        await _statisticsService.IncrementTotalAsync(Id, retryCount);
    }
}

/// <summary>
/// Runs the data-flow pipeline for a single successfully downloaded response, re-enqueues
/// the request when the result is empty and <c>RetryWhenResultIsEmpty</c> is set, enqueues
/// follow-up requests within <c>Depth</c>, and records success/failure statistics.
/// Exceptions are logged here and never propagate to the caller.
/// </summary>
/// <param name="response">A response whose download succeeded.</param>
private async Task ProcessSuccessResponseAsync(Response response)
{
    try
    {
        _logger.LogInformation($"任务 {Id} 下载 {response.Request.Url} 成功");

        // Dispose the scope once this response is done so scoped services are released.
        using (var scope = _services.CreateScope())
        {
            var context = new DataFlowContext(scope.ServiceProvider);
            context.AddResponse(response);

            var success = true;
            foreach (var dataFlow in _dataFlows)
            {
                var dataFlowResult = await dataFlow.HandleAsync(context);

                // A "break" inside a switch only leaves the switch, so track explicitly
                // whether the pipeline itself has to stop.
                var stopPipeline = false;
                switch (dataFlowResult)
                {
                    case DataFlowResult.Success:
                    {
                        break;
                    }
                    case DataFlowResult.Failed:
                    {
                        _logger.LogError($"任务 {Id} 数据流处理器 {dataFlow.GetType().Name} 失败");
                        success = false;
                        stopPipeline = true;
                        break;
                    }
                    case DataFlowResult.Terminated:
                    {
                        stopPipeline = true;
                        break;
                    }
                }

                if (stopPipeline)
                {
                    break;
                }
            }

            var resultItems = context.GetItems();
            var resultIsEmpty = resultItems == null ||
                                resultItems.Sum(x => x.Value == null ? 0 : x.Value.Count) == 0;

            // Retry when parsing produced nothing.
            if (resultIsEmpty && RetryWhenResultIsEmpty)
            {
                response.Request.RetriedTimes++;
                response.Request.ComputeHash();
                // A retried request is not added to the total count.
                _scheduler.Enqueue(new[] { response.Request.Clone() });
            }

            // Follow-up requests discovered while parsing.
            var followRequests = context.GetTargetRequests();
            if (followRequests != null && followRequests.Count > 0)
            {
                var requests = new List<Request>();
                foreach (var followRequest in followRequests)
                {
                    followRequest.Depth = response.Request.Depth + 1;
                    // Drop requests that would exceed the configured crawl depth.
                    if (followRequest.Depth <= Depth)
                    {
                        requests.Add(followRequest);
                    }
                }

                var count = _scheduler.Enqueue(requests);
                if (count > 0)
                {
                    await _statisticsService.IncrementTotalAsync(Id, count);
                }
            }

            if (success)
            {
                await _statisticsService.IncrementSuccessAsync(Id);
            }
            else
            {
                await _statisticsService.IncrementFailedAsync(Id);
            }

            var result = success ? "成功" : $"失败: {context.Result}";
            _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} {result}");
        }
    }
    catch (Exception e)
    {
        // Logged as an error, consistent with the failed-data-flow branch above.
        _logger.LogError($"任务 {Id} 处理 {response.Request?.Url} 失败: {e}");
    }
}