Beispiel #1
0
        /// <summary>
        /// Feeds a saved JD listing page ("Jd.html") through <c>DataParser&lt;Product&gt;</c>
        /// and verifies the number of parsed products and the fields of the first one.
        /// </summary>
        public void ParseEntity()
        {
            var context = new DataFlowContext(SpiderFactory.CreateScope());

            // Extra request properties; the assertions below expect them to surface
            // as CategoryName / CategoryId on the parsed entity.
            var properties = new Dictionary<string, string>
            {
                { "cat", "手机" },
                { "cat3", "110" }
            };
            var request = new Request("https://list.jd.com/list.html?cat=9987,653,655", properties);
            var html = File.ReadAllText("Jd.html");
            context.AddResponse(new Response { Request = request, RawText = html });

            var parser = new DataParser<Product>();
            parser.HandleAsync(context).GetAwaiter().GetResult();

            // Parsed entities are stored in the context under the entity type's full name.
            var items = (List<object>)context.GetItem(typeof(Product).FullName);
            var products = items.Cast<Product>().ToList();

            Assert.Equal(60, products.Count);

            var first = products[0];
            Assert.Equal("手机", first.CategoryName);
            Assert.Equal(110, first.CategoryId);
            Assert.Equal("https://item.jd.com/3031737.html", first.Url);
            Assert.Equal("3031737", first.Sku);
            Assert.Equal("荣耀官方旗舰店", first.ShopName);
            Assert.Equal("荣耀 NOTE 8 4GB+32GB 全网通版 冰河银", first.Name);
            Assert.Equal("1000000904", first.VenderId);
            Assert.Equal("1000000904", first.JdzyShopId);
            // RunId is only compared at day granularity against the current local date.
            Assert.Equal(DateTime.Now.ToString("yyyy-MM-dd"), first.RunId.ToString("yyyy-MM-dd"));
        }
Beispiel #2
0
        /// <summary>
        /// Runs the XPath follow selector over a saved cnblogs page ("cnblogs.html")
        /// and checks the number of follow requests and that page 2 is among them.
        /// </summary>
        public void XpathFollow()
        {
            const string expectedUrl = "http://cnblogs.com/sitehome/p/2";

            var context = new DataFlowContext(SpiderFactory.CreateScope());
            var response = new Response
            {
                Request = new Request("http://cnblogs.com"),
                RawText = File.ReadAllText("cnblogs.html")
            };
            context.AddResponse(response);

            // Build the follow selector for the pager element and apply it to the context.
            var followed = DataParser.XpathFollow(".//div[@class='pager']").Invoke(context);

            Assert.Equal(12, followed.Length);
            Assert.Contains(followed, url => url == expectedUrl);
        }
Beispiel #3
0
        /// <summary>
        /// Parses the shared <c>Html</c> fixture with <c>DataParser&lt;E&gt;</c> and checks
        /// that the first two extracted entities carry the titles "a" and "b", in that order.
        /// </summary>
        public void MultiEntitySelector()
        {
            var context = new DataFlowContext(SpiderFactory.CreateScope());
            var response = new Response
            {
                Request = new Request("http://abcd.com"),
                RawText = Html
            };
            context.AddResponse(response);

            var entityParser = new DataParser<E>();
            entityParser.HandleAsync(context).GetAwaiter().GetResult();

            // Parsed entities are stored in the context under the entity type's full name.
            var items = (List<object>)context.GetItem(typeof(E).FullName);
            var entities = items.Cast<E>().ToList();

            Assert.Equal("a", entities[0].title);
            Assert.Equal("b", entities[1].title);
        }
Beispiel #4
0
        /// <summary>
        /// Handles one message containing a JSON array of download responses:
        /// records download statistics, runs every successful response through the
        /// data-flow pipeline, and re-enqueues failed downloads that still have
        /// retries left.
        /// </summary>
        /// <param name="message">JSON-serialized list of <c>Response</c> objects.</param>
        /// <returns>A task that completes once every response in the message has been processed.</returns>
        private async Task HandleMessage(string message)
        {
            if (string.IsNullOrWhiteSpace(message))
            {
                _logger.LogWarning($"任务 {Id} 接收到空消息");
                return;
            }

            _lastRequestedTime = DateTime.Now;
            var responses = JsonConvert.DeserializeObject<List<Response>>(message);

            // DeserializeObject yields null for a JSON "null" payload, so guard
            // against that as well as against an empty array.
            if (responses == null || responses.Count == 0)
            {
                _logger.LogWarning($"任务 {Id} 接收到空回复");
                return;
            }

            var agentId = responses.First().AgentId;

            var successResponses = responses.Where(x => x.Success).ToList();

            // Record download-success statistics.
            if (successResponses.Count > 0)
            {
                var elapsedMilliseconds = successResponses.Sum(x => x.ElapsedMilliseconds);
                await _statisticsService.IncrementDownloadSuccessAsync(agentId, successResponses.Count,
                                                                       elapsedMilliseconds);
            }

            // Process the successfully downloaded responses concurrently.
            // Parallel.ForEach only accepts Action<T>, so an async lambda there becomes
            // async void: the work is fire-and-forget and its failures are unobservable.
            // Task.Run keeps the work on the thread pool while Task.WhenAll lets this
            // method actually await completion.
            await Task.WhenAll(successResponses.Select(
                                   response => Task.Run(() => ProcessSuccessResponseAsync(response))));

            var retryResponses =
                responses.Where(x => !x.Success && x.Request.RetriedTimes < RetryDownloadTimes)
                .ToList();

            retryResponses.ForEach(x =>
            {
                x.Request.RetriedTimes++;
                _logger.LogInformation($"任务 {Id} 下载 {x.Request.Url} 失败: {x.Exception}");
            });

            var failedRequests =
                responses.Where(x => !x.Success)
                .ToList();

            // Record download-failure statistics.
            if (failedRequests.Count > 0)
            {
                await _statisticsService.IncrementFailedAsync(Id);

                await _statisticsService.IncrementDownloadFailedAsync(agentId, failedRequests.Count);
            }

            var retryCount = _scheduler.Enqueue(retryResponses.Select(x => x.Request.Clone()));

            if (retryCount > 0)
            {
                await _statisticsService.IncrementTotalAsync(Id, retryCount);
            }
        }

        /// <summary>
        /// Runs a single successfully downloaded response through the data flows,
        /// enqueues retry/follow requests and updates the success/failed counters.
        /// Exceptions are logged and swallowed so one bad response cannot fail the batch.
        /// </summary>
        /// <param name="response">A response whose download succeeded.</param>
        private async Task ProcessSuccessResponseAsync(Response response)
        {
            try
            {
                _logger.LogInformation($"任务 {Id} 下载 {response.Request.Url} 成功");

                // Dispose the scope once this response is done so scoped services
                // are released instead of accumulating per response.
                using (var scope = _services.CreateScope())
                {
                    var context = new DataFlowContext(scope.ServiceProvider);
                    context.AddResponse(response);

                    var success = await RunDataFlowsAsync(context);

                    var resultItems = context.GetItems();
                    // Retry the request when parsing produced no items at all.
                    if ((resultItems == null || resultItems.Sum(x => x.Value == null ? 0 : x.Value.Count) == 0) &&
                        RetryWhenResultIsEmpty)
                    {
                        response.Request.RetriedTimes++;
                        response.Request.ComputeHash();
                        // A retry is not a new request, so the total counter is not incremented.
                        _scheduler.Enqueue(new[] { response.Request.Clone() });
                    }

                    // Follow-up requests discovered while parsing.
                    var followRequests = context.GetTargetRequests();
                    if (followRequests != null && followRequests.Count > 0)
                    {
                        var requests = new List<Request>();
                        foreach (var followRequest in followRequests)
                        {
                            followRequest.Depth = response.Request.Depth + 1;
                            // Drop requests that would exceed the configured crawl depth.
                            if (followRequest.Depth <= Depth)
                            {
                                requests.Add(followRequest);
                            }
                        }

                        var count = _scheduler.Enqueue(requests);
                        if (count > 0)
                        {
                            await _statisticsService.IncrementTotalAsync(Id, count);
                        }
                    }

                    if (success)
                    {
                        await _statisticsService.IncrementSuccessAsync(Id);
                    }
                    else
                    {
                        await _statisticsService.IncrementFailedAsync(Id);
                    }

                    var result = success ? "成功" : $"失败: {context.Result}";
                    _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} {result}");
                }
            }
            catch (Exception e)
            {
                // A processing failure is an error, not routine information.
                _logger.LogError($"任务 {Id} 处理 {response.Request.Url} 失败: {e}");
            }
        }

        /// <summary>
        /// Executes the configured data flows in order against <paramref name="context"/>.
        /// The pipeline stops at the first flow that reports Failed or Terminated.
        /// </summary>
        /// <param name="context">Context holding the response being processed.</param>
        /// <returns><c>false</c> if a data flow reported Failed; otherwise <c>true</c>.</returns>
        private async Task<bool> RunDataFlowsAsync(DataFlowContext context)
        {
            foreach (var dataFlow in _dataFlows)
            {
                var dataFlowResult = await dataFlow.HandleAsync(context);

                // Explicit returns are used here because a `break` inside a switch
                // only leaves the switch and would let the remaining flows run.
                if (dataFlowResult == DataFlowResult.Failed)
                {
                    _logger.LogError($"任务 {Id} 数据流处理器 {dataFlow.GetType().Name} 失败");
                    return false;
                }

                if (dataFlowResult == DataFlowResult.Terminated)
                {
                    return true;
                }
            }

            return true;
        }