Пример #1
0
        /// <summary>
        /// Stores a single <c>CreateTableEntity7</c> through an insert storage whose
        /// <c>IgnoreCase</c> flag is switched off, then reads the row back from
        /// <c>test.dbo.IgnoreCase</c> and checks each column value.
        /// </summary>
        public async Task IgnoreCase()
        {
            const string dropTableSql = "drop table if exists test.dbo.IgnoreCase;";

            using (var conn = CreateConnection())
            {
                // If the entity's schema does not configure a table name, the class name is used.
                await conn.ExecuteAsync(dropTableSql);

                using (var hostBuilder = GetLocalSpiderHostBuilder())
                {
                    var scopedServices = hostBuilder.Build().CreateScopeServiceProvider();

                    var storage = (RelationalDatabaseEntityStorageBase)CreateStorage(StorageType.Insert);
                    storage.IgnoreCase = false;

                    var context = new DataFlowContext(null, scopedServices);
                    var key     = typeof(CreateTableEntity7).FullName;
                    var source  = new CreateTableEntity7();
                    context.Add(key, source.GetTableMetadata());

                    var parsed = new ParseResult<CreateTableEntity7>();
                    parsed.Add(source);
                    context.AddParseData(key, parsed);

                    await storage.HandleAsync(context);

                    var queried = await conn.QueryAsync<CreateTableEntity7>("SELECT * FROM test.dbo.IgnoreCase");
                    var rows    = queried.ToList();
                    Assert.Single(rows);

                    var stored = rows.First();
                    Assert.Equal("xxx", stored.Str1);
                    Assert.Equal("yyy", stored.Str2);
                    Assert.Equal(655, stored.Required);
                    Assert.Equal(0, stored.Decimal);
                    Assert.Equal(600, stored.Long);
                    Assert.Equal(400, stored.Double);
                    Assert.Equal(200.0F, stored.Float);

                    // Leave the database clean for the next test.
                    await conn.ExecuteAsync(dropTableSql);
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Inserts one <c>CreateTableEntity4</c> through an insert-mode storage, reads it back
        /// from <c>test.dbo.createtableprimay</c>, and uses <c>sp_pkeys</c> to check that the
        /// table has exactly one primary-key column, named "str2".
        /// </summary>
        public async Task CreateTablePrimary()
        {
            const string dropTableSql = "drop table if exists test.dbo.createtableprimay;";

            using var conn = CreateConnection();
            // If the entity's schema does not configure a table name, the class name is used.
            await conn.ExecuteAsync(dropTableSql);

            var storage    = CreateStorage(StorageMode.Insert);
            var context    = new DataFlowContext(null, null, null, null);
            var entityType = typeof(CreateTableEntity4);
            var source     = new CreateTableEntity4();

            var batch = new List<CreateTableEntity4>();
            batch.Add(source);
            context.AddData(entityType, batch);
            await storage.HandleAsync(context);

            var queried = await conn.QueryAsync<CreateTableEntity4>("SELECT * FROM test.dbo.createtableprimay");
            var rows    = queried.ToList();
            Assert.Single(rows);

            var stored = rows.First();
            Assert.Equal("xxx", stored.Str1);
            Assert.Equal("yyy", stored.Str2);
            Assert.Equal(655, stored.Required);
            Assert.Equal(0, stored.Decimal);
            Assert.Equal(600, stored.Long);
            Assert.Equal(400, stored.Double);
            Assert.Equal(200.0F, stored.Float);

            // Ask the server which columns make up the primary key of the created table.
            var keyColumns = (await conn.QueryAsync<IndexInfo>(
                                  @"USE test; EXEC sp_pkeys @table_name='createtableprimay'"))
                             .ToList();
            Assert.Single(keyColumns);
            Assert.Equal("str2", keyColumns[0].COLUMN_NAME);

            await conn.ExecuteAsync(dropTableSql);
        }
Пример #3
0
        /// <summary>
        /// Pushes the same <c>CreateTableEntity4</c> instance three times through an
        /// insert-ignore-duplicate storage with <c>UseTransaction</c> enabled and checks that
        /// exactly one row ends up in <c>test.dbo.createtableprimay</c>.
        /// </summary>
        public async Task UseTransaction()
        {
            const string dropTableSql = "drop table if exists test.dbo.createtableprimay;";

            using (var conn = CreateConnection())
            {
                // If the entity's schema does not configure a table name, the class name is used.
                await conn.ExecuteAsync(dropTableSql);

                var scopedServices = SpiderProvider.Value.CreateScopeServiceProvider();
                var storage =
                    (RelationalDatabaseEntityStorageBase)CreateStorage(StorageType.InsertIgnoreDuplicate);
                storage.UseTransaction = true;

                var context = new DataFlowContext(null, scopedServices);
                var key     = typeof(CreateTableEntity4).FullName;
                var source  = new CreateTableEntity4();
                context.Add(key, source.GetTableMetadata());

                // The same instance is queued three times on purpose: duplicates must be ignored.
                var parsed = new ParseResult<CreateTableEntity4>();
                parsed.Add(source);
                parsed.Add(source);
                parsed.Add(source);
                context.AddParseData(key, parsed);

                await storage.HandleAsync(context);

                var queried = await conn.QueryAsync<CreateTableEntity4>("SELECT * FROM test.dbo.createtableprimay");
                var rows    = queried.ToList();
                Assert.Single(rows);

                var stored = rows.First();
                Assert.Equal("xxx", stored.Str1);
                Assert.Equal("yyy", stored.Str2);
                Assert.Equal(655, stored.Required);
                Assert.Equal(0, stored.Decimal);
                Assert.Equal(600, stored.Long);
                Assert.Equal(400, stored.Double);
                Assert.Equal(200.0F, stored.Float);

                await conn.ExecuteAsync(dropTableSql);
            }
        }
Пример #4
0
        /// <summary>
        /// Hands the same <c>CreateTableEntity4</c> instance three times to an
        /// insert-ignore-duplicate storage and checks that only one row is stored in the
        /// <c>createtableprimay</c> table. Identifiers are quoted with <c>Escape</c>.
        /// </summary>
        public async Task InsertIgnoreDuplicate()
        {
            using (var conn = CreateConnection())
            {
                // If the entity's schema does not configure a table name, the class name is used.
                await conn.ExecuteAsync(
                    $"drop table if exists {Escape}test{Escape}.{Escape}createtableprimay{Escape};");

                var storage = CreateStorage(StorageMode.InsertIgnoreDuplicate);
                var flowContext = new DataFlowContext(null, new SpiderOptions(), new Request(), new Response());

                var entityType = typeof(CreateTableEntity4);
                var source     = new CreateTableEntity4();

                // Three references to one instance: the storage must keep a single row.
                var batch = new List<CreateTableEntity4>();
                batch.Add(source);
                batch.Add(source);
                batch.Add(source);
                flowContext.AddData(entityType, batch);

                await storage.HandleAsync(flowContext);

                var queried = await conn.QueryAsync<CreateTableEntity4>(
                                  $"SELECT * FROM {Escape}test{Escape}.{Escape}createtableprimay{Escape}");
                var rows = queried.ToList();
                Assert.Single(rows);

                var stored = rows.First();
                Assert.Equal("xxx", stored.Str1);
                Assert.Equal("yyy", stored.Str2);
                Assert.Equal(655, stored.Required);
                Assert.Equal(0, stored.Decimal);
                Assert.Equal(600, stored.Long);
                Assert.Equal(400, stored.Double);
                Assert.Equal(200.0F, stored.Float);

                await conn.ExecuteAsync(
                    $"drop table if exists {Escape}test{Escape}.{Escape}createtableprimay{Escape};");
            }
        }
Пример #5
0
        /// <summary>
        /// Checks that a required validator gates follow-request extraction: a parser whose
        /// validator rejects the request URI produces no follow requests, while a parser whose
        /// validator accepts it produces the 12 pager links found in <c>cnblogs.html</c>.
        /// </summary>
        public async Task RequiredValidator()
        {
            var request     = new Request("http://cnblogs.com");
            var dataContext =
                new DataFlowContext(null, new SpiderOptions(), request,
                                    new Response
                                    {
                                        Content = new ByteArrayContent(File.ReadAllBytes("cnblogs.html"))
                                    });

            var dataParser = new TestDataParser();

            dataParser.SetLogger(NullLogger.Instance);
            dataParser.AddFollowRequestQuerier(Selectors.XPath(".//div[@class='pager']"));
            // This pattern cannot match "http://cnblogs.com", so the parser must skip the page.
            dataParser.AddRequiredValidator(r => Regex.IsMatch(r.RequestUri.ToString(), "xxxcnblogs\\.com"));

            await dataParser.HandleAsync(dataContext);

            var requests = dataContext.FollowRequests;

            Assert.Empty(requests);

            var dataContext2 =
                new DataFlowContext(null, new SpiderOptions(), request,
                                    new Response
                                    {
                                        Content = new ByteArrayContent(File.ReadAllBytes("cnblogs.html"))
                                    });
            var dataParser2 = new TestDataParser();

            dataParser2.AddFollowRequestQuerier(Selectors.XPath(".//div[@class='pager']"));
            // Fix: the accepting validator belongs on dataParser2. It was previously added to
            // dataParser (already executed), so the positive case ran without any validator.
            dataParser2.AddRequiredValidator(r => Regex.IsMatch(r.RequestUri.ToString(), "cnblogs\\.com"));

            await dataParser2.HandleAsync(dataContext2);

            requests = dataContext2.FollowRequests;

            Assert.Equal(12, requests.Count);
            Assert.Contains(requests, r => r.RequestUri.ToString() == "http://cnblogs.com/sitehome/p/2");
        }
        /// <summary>
        /// Writes every parse-item group in the context to a file, choosing between the
        /// load-file writer and the insert-SQL writer according to <c>MySqlFileType</c>.
        /// </summary>
        /// <param name="context">Data flow context holding the parse items and their table metadata.</param>
        /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
        protected override Task<DataFlowResult> Store(DataFlowContext context)
        {
            foreach (var group in context.GetParseItems())
            {
                // The table metadata is stored in the context under the same key as the items.
                var metadata = (TableMetadata)context[group.Key];
                var fileType = MySqlFileType;

                if (fileType == MySqlFileType.LoadFile)
                {
                    WriteLoadFile(context, metadata, group.Value);
                }
                else if (fileType == MySqlFileType.InsertSql)
                {
                    WriteInsertFile(context, metadata, group.Value);
                }
                // Any other file type writes nothing.
            }

            return Task.FromResult(DataFlowResult.Success);
        }
Пример #7
0
        /// <summary>
        /// Collects the subject (gallery) links on the listing page and queues one extra
        /// request per link, carrying over the parent's "tag" property and recording the
        /// parent URL as "referer".
        /// </summary>
        /// <param name="context">Data flow context of the listing page.</param>
        public static void GetSubjectUrl(DataFlowContext context)
        {
            var links = context.Selectable
                        .XPath("//*[@id=\"listdiv\"]/ul/li/div[@class='galleryli_title']/a/@href")
                        .GetValues();
            var pending = new List<Request>();

            foreach (var link in links)
            {
                var parent = context.Response.Request;

                var child = new Request();
                child.Url     = link;
                child.OwnerId = parent.OwnerId;
                child.AddProperty("tag", parent.GetProperty("tag"));
                child.AddProperty("referer", parent.Url);

                pending.Add(child);
            }

            // Nothing to enqueue when the page has no subject links.
            if (pending.Count == 0)
            {
                return;
            }

            context.AddExtraRequests(pending.ToArray());
        }
Пример #8
0
        /// <summary>
        /// Extracts the picture addresses from a picture browsing page. Records the page URL
        /// and title as items, then hands one download request per image to
        /// <c>ImageDownloader</c>, tagged with the parent's "tag" and "referer" properties and
        /// the page title as "subject".
        /// </summary>
        /// <param name="context">Data flow context of the picture browsing page.</param>
        public static void GetDetailPictureUrl(DataFlowContext context)
        {
            var response = context.GetResponse();

            context.AddItem("URL", response.Request.Url);

            // The selectable and the page title do not change per image, so resolve them once
            // instead of re-running the XPath query on every loop iteration.
            var selectable = context.GetSelectable();
            var title      = selectable.XPath(".//title").GetValue();
            context.AddItem("Title", title);

            var images = selectable.XPath("//*[@id=\"hgallery\"]/img/@src").GetValues();

            foreach (var image in images)
            {
                // Build the download request for this image URL.
                var request = new Request
                {
                    Url     = image,
                    OwnerId = response.Request.OwnerId
                };
                request.AddProperty("tag", response.Request.GetProperty("tag"));
                request.AddProperty("referer", response.Request.GetProperty("referer"));
                request.AddProperty("subject", title);
                ImageDownloader.GetInstance().AddRequest(request);
            }
        }
Пример #9
0
            /// <summary>
            /// Walks every news block on the page and queues a request for each article link,
            /// attaching the title, summary and view count found in the listing.
            /// </summary>
            /// <param name="context">Data flow context of the news listing page.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var newsBlocks = context.Selectable.XPath(".//div[@class='news_block']").Nodes();

                foreach (var block in newsBlocks)
                {
                    var titleNode = block.Select(Selectors.XPath(".//h2[@class='news_entry']"));
                    var title     = titleNode.GetValue(ValueOption.InnerText);

                    var link = block.Select(Selectors.XPath(".//h2[@class='news_entry']/a/@href")).GetValue();

                    var summaryNode = block.Select(Selectors.XPath(".//div[@class='entry_summary']"));
                    var summary     = summaryNode.GetValue(ValueOption.InnerText);

                    // Strip the trailing label so only the view count remains.
                    var viewText = block.Select(Selectors.XPath(".//span[@class='view']"))
                                   .GetValue(ValueOption.InnerText);
                    var views = viewText.Replace(" 人浏览", "");

                    var followUp = CreateFromRequest(context.Response.Request, link);
                    followUp.AddProperty("title", title);
                    followUp.AddProperty("summary", summary);
                    followUp.AddProperty("views", views);

                    context.AddExtraRequests(followUp);
                }

                return Task.FromResult(DataFlowResult.Success);
            }
Пример #10
0
        /// <summary>
        /// Sends two <c>CreateTableEntity4</c> objects, the second with <c>Str1 = "zzz"</c>,
        /// through an insert-and-update storage and checks that a single row remains in
        /// <c>test.dbo.createtableprimay</c> with <c>Str1</c> equal to "zzz".
        /// </summary>
        public async Task InsertAndUpdate()
        {
            const string dropTableSql = "drop table if exists test.dbo.createtableprimay;";

            using (var conn = CreateConnection())
            {
                // If the entity's schema does not configure a table name, the class name is used.
                await conn.ExecuteAsync(dropTableSql);

                var storage    = CreateStorage(StorageMode.InsertAndUpdate);
                var context    = new DataFlowContext(null, null, null, null);
                var entityType = typeof(CreateTableEntity4);
                var original   = new CreateTableEntity4();

                var modified = new CreateTableEntity4();
                modified.Str1 = "zzz";

                var batch = new List<CreateTableEntity4>();
                batch.Add(original);
                batch.Add(modified);
                context.AddData(entityType, batch);

                await storage.HandleAsync(context);

                var queried = await conn.QueryAsync<CreateTableEntity4>("SELECT * FROM test.dbo.createtableprimay");
                var rows    = queried.ToList();
                Assert.Single(rows);

                var stored = rows.First();
                Assert.Equal("zzz", stored.Str1);
                Assert.Equal("yyy", stored.Str2);
                Assert.Equal(655, stored.Required);
                Assert.Equal(0, stored.Decimal);
                Assert.Equal(600, stored.Long);
                Assert.Equal(400, stored.Double);
                Assert.Equal(200.0F, stored.Float);

                await conn.ExecuteAsync(dropTableSql);
            }
        }
Пример #11
0
        /// <summary>
        /// Runs <c>DataParser&lt;Product&gt;</c> over the saved <c>Jd.html</c> listing page and
        /// checks the number of parsed products, the field values of the first product, and
        /// the number of follow requests.
        /// </summary>
        public async Task ParseEntity()
        {
            // Request properties that the test later expects on the parsed products.
            var properties = new Dictionary<string, object>();
            properties.Add("cat", "手机");
            properties.Add("cat3", "110");
            var request = new Request("https://list.jd.com/list.html?cat=9987,653,655", properties);

            var response = new Response();
            response.Content = new ByteArrayContent(File.ReadAllBytes("Jd.html"));
            var dataContext = new DataFlowContext(null, new SpiderOptions(), request, response);

            var parser = new DataParser<Product>();
            await parser.InitializeAsync();
            parser.UseHtmlSelectableBuilder();

            await parser.HandleAsync(dataContext);

            var products = (List<Product>)dataContext.GetData(typeof(Product));
            Assert.Equal(60, products.Count);

            Assert.Contains("手机商品筛选", products[0].Title);
            Assert.Contains("手机商品筛选", products[1].Title);
            Assert.Contains("手机商品筛选", products[2].Title);

            var first = products[0];
            Assert.Equal("手机", first.CategoryName);
            Assert.Equal(110, first.CategoryId);
            Assert.Equal("https://item.jd.com/3031737.html", first.Url);
            Assert.Equal("3031737", first.Sku);
            Assert.Equal("荣耀官方旗舰店", first.ShopName);
            Assert.Equal("荣耀 NOTE 8 4GB+32GB 全网通版 冰河银", first.Name);
            Assert.Equal("1000000904", first.VenderId);
            Assert.Equal("1000000904", first.JdzyShopId);
            // RunId is only compared at day precision.
            Assert.Equal(DateTimeOffset.Now.ToString("yyyy-MM-dd"), first.RunId.ToString("yyyy-MM-dd"));

            var followRequests = dataContext.FollowRequests;
            Assert.Equal(7, followRequests.Count);
        }
Пример #12
0
        /// <summary>
        /// Stores the items held by the context. Returns success without calling
        /// <c>Store</c> when the context has no items; passes through a failed or terminated
        /// store result; logs any exception and reports failure.
        /// </summary>
        /// <param name="context">Data flow context whose items are to be stored.</param>
        /// <returns>The outcome of the storage step.</returns>
        public override async Task<DataFlowResult> HandleAsync(DataFlowContext context)
        {
            try
            {
                if (context.HasItems)
                {
                    var outcome = await Store(context);

                    // Only failure and termination are propagated; anything else counts as success.
                    switch (outcome)
                    {
                        case DataFlowResult.Failed:
                        case DataFlowResult.Terminated:
                            return outcome;
                    }
                }

                return DataFlowResult.Success;
            }
            catch (Exception e)
            {
                Logger?.LogError($"数据存储发生异常: {e}");
                return DataFlowResult.Failed;
            }
        }
Пример #13
0
        /// <summary>
        /// Deserializes the response body as JSON into <typeparamref name="T"/> when the
        /// request is marked for this parser, stores the object in the context under the
        /// type's short name, and passes it to <c>OnHanlder</c>.
        /// </summary>
        /// <param name="context">Data flow context holding the request and response.</param>
        protected override async Task ParseAsync(DataFlowContext context)
        {
            var properties = context.Request.Properties;

            // Handle the request only if it carries the check marker and the marker names this parser.
            var isTargeted = properties.ContainsKey(REQUEST_CHECK_PROPERTY_NAME)
                             && (properties[REQUEST_CHECK_PROPERTY_NAME] as string) == _parserName;
            if (!isTargeted)
            {
                return;
            }

            var payload = JsonConvert.DeserializeObject<T>(context.Response.ReadAsString());

            context.AddData(typeof(T).Name, payload);
            OnHanlder(context, payload);
        }
Пример #14
0
            /// <summary>
            /// Records the page URL and title, rewrites every song link in the pre-cache list
            /// into its ".mp3" media URL, and hands one download request per distinct song to
            /// <c>Downloader</c>.
            /// </summary>
            /// <param name="context">Data flow context of the song list page.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                context.AddData("URL", context.Response.Request.Url);
                context.AddData("Title", context.Selectable.XPath(".//title").GetValue());

                // Media URL -> song name.
                var songs    = new Dictionary<string, string>();
                var tagNodes = context.Selectable
                               .XPath("//*[@id=\"song-list-pre-cache\"]/ul/li/a").Nodes();

                foreach (var node in tagNodes)
                {
                    var href = node.XPath("./@href").GetValue();
                    if (string.IsNullOrEmpty(href))
                    {
                        // An anchor without an href cannot be turned into a media URL.
                        continue;
                    }

                    var parts = href.Split("song");
                    if (parts.Length < 2)
                    {
                        // The link does not contain "song"; indexing parts[1] would throw.
                        continue;
                    }

                    var url = parts[0] + "song/media/outer/url" + parts[1] + ".mp3";
                    if (songs.ContainsKey(url))
                    {
                        // The same song is listed twice; Dictionary.Add would throw on the
                        // duplicate key and abort the whole page, so keep the first entry.
                        continue;
                    }

                    var name = node.GetValue();
                    songs.Add(url, name);
                    Console.WriteLine("url:" + url + " - name:" + name);
                }

                foreach (var song in songs)
                {
                    var request = new Request
                    {
                        Url     = song.Key,
                        OwnerId = context.Response.Request.OwnerId
                    };
                    request.AddProperty("tag", song.Value);
                    request.AddProperty("path", GetImagePath(song.Value));

                    Downloader.GetInstance().AddRequest(request);
                }

                return Task.FromResult(DataFlowResult.Success);
            }
Пример #15
0
        /// <summary>
        /// Checks that the XPath-based follow-request query finds the 12 pager links in the
        /// saved <c>cnblogs.html</c> page, including the link to page 2.
        /// </summary>
        public void XpathFollow()
        {
            var scopedServices = LocalSpiderProvider.Value.CreateScopeServiceProvider();

            var response = new Response
            {
                Request = new Request("http://cnblogs.com"),
                Content = File.ReadAllBytes("cnblogs.html"),
                CharSet = "UTF-8"
            };
            var flowContext = new DataFlowContext(response, scopedServices);

            // Build the selectable from the response if the context does not have one yet.
            if (flowContext.Selectable == null)
            {
                flowContext.Selectable = flowContext.Response?.ToSelectable();
            }

            var followQuery = DataParserHelper.QueryFollowRequestsByXPath(".//div[@class='pager']");
            var found       = followQuery.Invoke(flowContext);

            Assert.Equal(12, found.Count);
            Assert.Contains(found, r => r == "http://cnblogs.com/sitehome/p/2");
        }
Пример #16
0
        /// <summary>
        /// Parses the response held by the context and queues the follow requests it finds.
        /// </summary>
        /// <param name="context">Processing context; it and its response must not be null.</param>
        /// <returns>
        /// Failed when the context or response is missing or an exception is thrown; the parse
        /// result when it is Failed or Terminated; otherwise Success.
        /// </returns>
        public override async Task<DataFlowResult> HandleAsync(DataFlowContext context)
        {
            if (context?.Response == null)
            {
                Logger?.LogError("Data context or response content is empty");
                return DataFlowResult.Failed;
            }

            try
            {
                // A request this parser does not accept is skipped without affecting the
                // other data flow processors.
                if (CanParse?.Invoke(context.Response.Request) == false)
                {
                    return DataFlowResult.Success;
                }

                // Runs before Parse so the context has its selectable instance initialized.
                SelectableFactory?.Invoke(context);

                var outcome = await Parse(context);

                // Follow requests are collected even when parsing failed or terminated.
                var followUrls = QueryFollowRequests?.Invoke(context);
                AddTargetRequests(context, followUrls);

                var stopped = outcome == DataFlowResult.Failed || outcome == DataFlowResult.Terminated;
                return stopped ? outcome : DataFlowResult.Success;
            }
            catch (Exception e)
            {
                Logger?.LogError($"任务 {context.Response.Request.OwnerId} 数据解析发生异常: {e}");
                return DataFlowResult.Failed;
            }
        }
Пример #17
0
        /// <summary>
        /// Data parsing: runs the parser over the response in the context and queues the
        /// follow requests it finds.
        /// </summary>
        /// <param name="context">Processing context.</param>
        /// <returns>
        /// Failed when the context or response is missing or an exception is thrown; the parse
        /// result when it is Failed or Terminated; otherwise Success.
        /// </returns>
        public override async Task<DataFlowResult> HandleAsync(DataFlowContext context)
        {
            if (context?.Response == null)
            {
                Logger?.LogError("数据上下文或者响应内容为空");
                return DataFlowResult.Failed;
            }

            try
            {
                // Skip requests that do not match; this does not affect the execution of the
                // other data flow processors.
                var accepted = CanParse == null || CanParse(context.Response.Request);
                if (!accepted)
                {
                    return DataFlowResult.Success;
                }

                SelectableFactory?.Invoke(context);

                var outcome = await Parse(context);

                // Follow requests are queued regardless of the parse outcome.
                AddTargetRequests(context, QueryFollowRequests?.Invoke(context));

                switch (outcome)
                {
                    case DataFlowResult.Failed:
                    case DataFlowResult.Terminated:
                        return outcome;
                    default:
                        return DataFlowResult.Success;
                }
            }
            catch (Exception e)
            {
                Logger?.LogError($"任务 {context.Response.Request.OwnerId} 数据解析发生异常: {e}");
                return DataFlowResult.Failed;
            }
        }
Пример #18
0
        /// <summary>
        /// Parses the pagination links of an album listing and queues one extra request per
        /// distinct page link, each tagged with a fixed "tag" property.
        /// </summary>
        /// <param name="context">Data flow context of the album listing page.</param>
        public static void GetSubjectPageUrl(DataFlowContext context)
        {
            // Links already turned into a request, used to drop duplicates.
            var seenLinks = new Dictionary<string, string>();
            var links = context.Selectable
                        .XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href")
                        .GetValues();
            var pending = new List<Request>();

            foreach (var link in links)
            {
                if (seenLinks.ContainsKey(link))
                {
                    continue;
                }

                try
                {
                    var pageRequest = new Request();
                    pageRequest.Url     = link;
                    pageRequest.OwnerId = context.Response.Request.OwnerId;
                    // The tag is hard-coded here rather than copied from the parent request.
                    pageRequest.AddProperty("tag", "萝莉");
                    pending.Add(pageRequest);

                    // Mark the link as seen only after its request was built successfully.
                    seenLinks.Add(link, link);
                }
                catch (Exception e)
                {
                    Console.WriteLine(e);
                }
            }

            if (pending.Count == 0)
            {
                return;
            }

            context.AddExtraRequests(pending.ToArray());
        }
Пример #19
0
        /// <summary>
        /// Parses entities of type <typeparamref name="T"/> from the context's selectable.
        /// Registers the table metadata under the model's type name if absent, builds the
        /// environment values from the request properties and the shared value selectors,
        /// parses either the whole page (no model selector) or each selected node, and
        /// stores or appends the parsed objects as the parse item for the model's type name.
        /// </summary>
        /// <param name="context">Data flow context holding the response and selectable.</param>
        /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
        protected override Task<DataFlowResult> Parse(DataFlowContext context)
        {
            if (!context.Contains(_model.TypeName))
            {
                context.Add(_model.TypeName, _tableMetadata);
            }

            var selectable = context.GetSelectable();
            var results    = new ParseResult<T>();

            if (selectable.Properties == null)
            {
                selectable.Properties = new Dictionary<string, object>();
            }

            // Environment values start from the request properties...
            var environments = new Dictionary<string, string>();
            foreach (var property in context.Response.Request.Properties)
            {
                environments.Add(property.Key, property.Value);
            }

            // ...and shared value selectors add to or overwrite them.
            if (_model.ShareValueSelectors != null)
            {
                foreach (var shared in _model.ShareValueSelectors)
                {
                    string name = shared.Name;
                    var value   = selectable.Select(shared.ToSelector()).GetValue();
                    environments[name] = value;
                }
            }

            if (_model.Selector == null)
            {
                // No model selector: the whole selectable is treated as one entity.
                var single = ParseObject(environments, selectable, 0);
                if (single == null)
                {
                    Logger?.LogWarning($"解析到空数据,类型: {_model.TypeName}");
                }
                else
                {
                    results.Add(single);
                }
            }
            else
            {
                var nodes = selectable.SelectList(_model.Selector.ToSelector()).Nodes()?.ToList();
                if (nodes != null)
                {
                    // Optionally keep only the first or the last Take nodes.
                    if (_model.Take > 0 && nodes.Count > _model.Take)
                    {
                        if (_model.TakeFromHead)
                        {
                            nodes = nodes.Take(_model.Take).ToList();
                        }
                        else
                        {
                            nodes = nodes.Skip(nodes.Count - _model.Take).ToList();
                        }
                    }

                    var index = 0;
                    foreach (var node in nodes)
                    {
                        var parsed = ParseObject(environments, node, index);
                        if (parsed == null)
                        {
                            Logger?.LogWarning($"解析到空数据,类型: {_model.TypeName}");
                        }
                        else
                        {
                            results.Add(parsed);
                        }

                        index++;
                    }
                }
            }

            if (results.Count == 0)
            {
                return Task.FromResult(DataFlowResult.Success);
            }

            // Append to an existing parse item for this type, or register a new one.
            var existing = context.GetParseItem(_model.TypeName);
            if (existing == null)
            {
                context.AddParseItem(_model.TypeName, results);
            }
            else
            {
                ((ParseResult<T>)existing).AddRange(results);
            }

            return Task.FromResult(DataFlowResult.Success);
        }
Пример #20
0
        private async Task HandleMessage(string message)
        {
            if (string.IsNullOrWhiteSpace(message))
            {
                _logger.LogWarning($"任务 {Id} 接收到空消息");
                return;
            }

            var commandMessage = message.ToCommandMessage();

            if (commandMessage != null)
            {
                switch (commandMessage.Command)
                {
                case Framework.AllocateDownloaderCommand:
                {
                    if (commandMessage.Message == "true")
                    {
                        _allocated.Inc();
                    }
                    else
                    {
                        _logger.LogError($"任务 {Id} 分配下载器代理失败");
                        _allocatedSuccess = false;
                    }

                    break;
                }

                default:
                {
                    _logger.LogError($"任务 {Id} 未能处理命令: {message}");
                    break;
                }
                }

                return;
            }

            _lastRequestedTime = DateTime.Now;

            Response[] responses;

            try
            {
                responses = JsonConvert.DeserializeObject <Response[]>(message);
            }
            catch
            {
                _logger.LogError($"任务 {Id} 接收到异常消息: {message}");
                return;
            }

            try
            {
                if (responses.Length == 0)
                {
                    _logger.LogWarning($"任务 {Id} 接收到空回复");
                    return;
                }

                _responded.Add(responses.Length);

                // 只要有回应就从缓存中删除,即便是异常要重新下载会成 EnqueueRequest 中重新加回缓存
                // 此处只需要保证: 发 -> 收 可以一对一删除就可以保证检测机制的正确性
                foreach (var response in responses)
                {
                    _enqueuedRequestDict.TryRemove(response.Request.Hash, out _);
                }

                var agentId = responses.First().AgentId;

                var successResponses = responses.Where(x => x.Success).ToList();
                // 统计下载成功
                if (successResponses.Count > 0)
                {
                    var elapsedMilliseconds = successResponses.Sum(x => x.ElapsedMilliseconds);
                    await _statisticsService.IncrementDownloadSuccessAsync(agentId, successResponses.Count,
                                                                           elapsedMilliseconds);
                }

                // 处理下载成功的请求
                Parallel.ForEach(successResponses, async response =>
                {
                    _logger.LogInformation($"任务 {Id} 下载 {response.Request.Url} 成功");

                    try
                    {
                        var context = new DataFlowContext(response, _services.CreateScope().ServiceProvider);

                        foreach (var dataFlow in _dataFlows)
                        {
                            var dataFlowResult = await dataFlow.HandleAsync(context);
                            var @break         = false;
                            switch (dataFlowResult)
                            {
                            case DataFlowResult.Success:
                                {
                                    continue;
                                }

                            case DataFlowResult.Failed:
                                {
                                    // 如果处理失败,则直接返回
                                    _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 失败: {context.Result}");
                                    await _statisticsService.IncrementFailedAsync(Id);
                                    return;
                                }

                            case DataFlowResult.Terminated:
                                {
                                    @break = true;
                                    break;
                                }
                            }

                            if (@break)
                            {
                                break;
                            }
                        }

                        var resultIsEmpty = !context.HasItems && !context.HasParseItems;
                        // 如果解析结果为空,重试
                        if (resultIsEmpty && RetryWhenResultIsEmpty)
                        {
                            if (response.Request.RetriedTimes < RetryDownloadTimes)
                            {
                                response.Request.RetriedTimes++;
                                await EnqueueRequests(response.Request);
                                // 即然是重试这个请求,则解析必然还会再执行一遍,所以解析到的目标链接、成功状态都应该到最后来处理。
                                _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 解析结果为空,尝试重试");
                                return;
                            }
                        }

                        // 解析的目标请求
                        if (context.FollowRequests != null && context.FollowRequests.Count > 0)
                        {
                            var requests = new List <Request>();
                            foreach (var followRequest in context.FollowRequests)
                            {
                                followRequest.Depth = response.Request.Depth + 1;
                                if (followRequest.Depth <= Depth)
                                {
                                    requests.Add(followRequest);
                                }
                            }

                            var count = _scheduler.Enqueue(requests);
                            if (count > 0)
                            {
                                await _statisticsService.IncrementTotalAsync(Id, count);
                            }
                        }

                        if (!resultIsEmpty)
                        {
                            await _statisticsService.IncrementSuccessAsync(Id);
                            _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 成功");
                        }
                        else
                        {
                            if (RetryWhenResultIsEmpty)
                            {
                                await _statisticsService.IncrementFailedAsync(Id);
                                _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 失败,解析结果为空");
                            }
                            else
                            {
                                await _statisticsService.IncrementSuccessAsync(Id);
                                _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 成功,解析结果为空");
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        await _statisticsService.IncrementFailedAsync(Id);
                        _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 失败: {e}");
                    }
                });

                // TODO: 此处需要优化
                var retryResponses =
                    responses.Where(x => !x.Success && x.Request.RetriedTimes < RetryDownloadTimes)
                    .ToList();
                var downloadFailedResponses =
                    responses.Where(x => !x.Success)
                    .ToList();
                var failedResponses =
                    responses.Where(x => !x.Success && x.Request.RetriedTimes >= RetryDownloadTimes)
                    .ToList();

                if (retryResponses.Count > 0)
                {
                    retryResponses.ForEach(x =>
                    {
                        x.Request.RetriedTimes++;
                        _logger.LogInformation($"任务 {Id} 下载 {x.Request.Url} 失败: {x.Exception}");
                    });
                    await EnqueueRequests(retryResponses.Select(x => x.Request).ToArray());
                }

                // 统计下载失败
                if (downloadFailedResponses.Count > 0)
                {
                    var elapsedMilliseconds = downloadFailedResponses.Sum(x => x.ElapsedMilliseconds);
                    await _statisticsService.IncrementDownloadFailedAsync(agentId,
                                                                          downloadFailedResponses.Count, elapsedMilliseconds);
                }

                // 统计失败
                if (failedResponses.Count > 0)
                {
                    await _statisticsService.IncrementFailedAsync(Id, failedResponses.Count);
                }
            }
            catch (Exception ex)
            {
                _logger.LogError($"任务 {Id} 处理消息 {message} 失败: {ex}");
            }
        }
Пример #21
0
            /// <summary>
            /// Runs <c>GetDetailPictureUrl</c> against the context and reports success.
            /// </summary>
            /// <param name="context">Data-flow context handed to <c>GetDetailPictureUrl</c>.</param>
            /// <returns>An already-completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task <DataFlowResult> Parse(DataFlowContext context)
            {
                GetDetailPictureUrl(context);

                var outcome = Task.FromResult(DataFlowResult.Success);
                return outcome;
            }
Пример #22
0
        /// <summary>
        /// Handles one raw message from a download agent: deserializes the responses it carries,
        /// runs every successful response through the data-flow pipeline, updates the statistics
        /// and re-enqueues failed downloads that still have retries left.
        /// </summary>
        /// <param name="message">JSON text that is deserialized into a list of <c>Response</c> objects.</param>
        /// <returns>A task that completes once every response in the message has been handled.</returns>
        private async Task HandleMessage(string message)
        {
            if (string.IsNullOrWhiteSpace(message))
            {
                _logger.LogWarning($"任务 {Id} 接收到空消息");
                return;
            }

            _lastRequestedTime = DateTime.Now;
            var responses = JsonConvert.DeserializeObject <List <Response> >(message);

            // A JSON "null" payload deserializes to null, so guard against that as well as an empty list.
            if (responses == null || responses.Count == 0)
            {
                _logger.LogWarning($"任务 {Id} 接收到空回复");
                return;
            }

            var agentId = responses.First().AgentId;

            var successResponses = responses.Where(x => x.Success).ToList();

            // Record download-success statistics.
            if (successResponses.Count > 0)
            {
                var elapsedMilliseconds = successResponses.Sum(x => x.ElapsedMilliseconds);
                await _statisticsService.IncrementDownloadSuccessAsync(agentId, successResponses.Count,
                                                                       elapsedMilliseconds);
            }

            // Process the successfully downloaded responses concurrently.
            // Parallel.ForEach cannot await an async lambda (it becomes async void), so each response
            // is run as its own task and the whole batch is awaited instead.
            var processingTasks = successResponses.Select(response => Task.Run(async () =>
            {
                _logger.LogInformation($"任务 {Id} 下载 {response.Request.Url} 成功");

                try
                {
                    // The scope is disposed once this response has gone through the pipeline.
                    using (var scope = _services.CreateScope())
                    {
                        var context = new DataFlowContext(scope.ServiceProvider);
                        context.AddResponse(response);

                        bool success = true;
                        foreach (var dataFlow in _dataFlows)
                        {
                            var dataFlowResult = await dataFlow.HandleAsync(context);

                            if (dataFlowResult == DataFlowResult.Failed)
                            {
                                _logger.LogError($"任务 {Id} 数据流处理器 {dataFlow.GetType().Name} 失败");
                                success = false;
                                // Stop the pipeline: the remaining data flows must not run after a failure.
                                break;
                            }

                            if (dataFlowResult == DataFlowResult.Terminated)
                            {
                                // Stop the pipeline without marking the response as failed.
                                break;
                            }
                        }

                        var resultItems = context.GetItems();
                        // Retry when parsing produced nothing. The retry is bounded by RetryDownloadTimes
                        // (the same limit used for failed downloads below) so that a page which is always
                        // empty cannot be re-enqueued forever.
                        if ((resultItems == null || resultItems.Sum(x => x.Value == null ? 0 : x.Value.Count) == 0) &&
                            RetryWhenResultIsEmpty &&
                            response.Request.RetriedTimes < RetryDownloadTimes)
                        {
                            response.Request.RetriedTimes++;
                            response.Request.ComputeHash();
                            // A retry is not a new request, so the total counter is not incremented.
                            _scheduler.Enqueue(new[] { response.Request.Clone() });
                        }

                        // Enqueue the follow-up requests discovered while parsing.
                        var followRequests = context.GetTargetRequests();
                        if (followRequests != null && followRequests.Count > 0)
                        {
                            var requests = new List <Request>();
                            foreach (var followRequest in followRequests)
                            {
                                followRequest.Depth = response.Request.Depth + 1;
                                if (followRequest.Depth <= Depth)
                                {
                                    requests.Add(followRequest);
                                }
                            }

                            var count = _scheduler.Enqueue(requests);
                            if (count > 0)
                            {
                                await _statisticsService.IncrementTotalAsync(Id, count);
                            }
                        }

                        if (success)
                        {
                            await _statisticsService.IncrementSuccessAsync(Id);
                        }
                        else
                        {
                            await _statisticsService.IncrementFailedAsync(Id);
                        }

                        var result = success ? "成功" : $"失败: {context.Result}";
                        _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} {result}");
                    }
                }
                catch (Exception e)
                {
                    _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 失败: {e}");
                }
            }));
            await Task.WhenAll(processingTasks);

            // Failed downloads that still have retries left.
            var retryResponses =
                responses.Where(x => !x.Success && x.Request.RetriedTimes < RetryDownloadTimes)
                .ToList();

            retryResponses.ForEach(x =>
            {
                x.Request.RetriedTimes++;
                _logger.LogInformation($"任务 {Id} 下载 {x.Request.Url} 失败: {x.Exception}");
            });

            var failedRequests =
                responses.Where(x => !x.Success)
                .ToList();

            // Record download-failure statistics.
            if (failedRequests.Count > 0)
            {
                await _statisticsService.IncrementFailedAsync(Id);

                await _statisticsService.IncrementDownloadFailedAsync(agentId, failedRequests.Count);
            }

            var retryCount = _scheduler.Enqueue(retryResponses.Select(x => x.Request.Clone()));

            if (retryCount > 0)
            {
                await _statisticsService.IncrementTotalAsync(Id, retryCount);
            }
        }
Пример #23
0
 /// <summary>
 /// Not implemented; every call throws.
 /// </summary>
 /// <param name="context">Unused.</param>
 /// <exception cref="NotImplementedException">Thrown on every call.</exception>
 public override Task HandleAsync(DataFlowContext context) => throw new NotImplementedException();
Пример #24
0
        /// <summary>
        /// Handles a batch of download responses: removes them from the in-flight request cache,
        /// runs every successful response through the data-flow pipeline, re-enqueues failed
        /// downloads that still have retries left and updates the statistics.
        /// </summary>
        /// <param name="message">Message whose <c>Data</c> holds the responses; a null or empty payload is ignored with a warning.</param>
        /// <returns>A task that completes once every response in the batch has been handled.</returns>
        private async Task HandleResponseAsync(MessageData <Response[]> message)
        {
            if (message?.Data == null || message.Data.Length == 0)
            {
                Logger.LogWarning($"{Id} receive empty message");
                return;
            }

            _lastRequestedTime = DateTimeOffset.Now;

            // Guaranteed non-empty by the guard above.
            var responses = message.Data;

            try
            {
                _responded.Add(responses.Length);

                // Any response removes its request from the cache; a request that has to be downloaded
                // again is put back into the cache by EnqueueRequests.
                // Only the one-to-one pairing of "sent -> received" matters for the detection mechanism.
                foreach (var response in responses)
                {
                    _enqueuedRequestDict.TryRemove(response.Request.Hash, out _);
                }

                var agentId = responses.First().AgentId;

                var successResponses = responses.Where(x => x.Success).ToList();
                // Record download-success statistics.
                if (successResponses.Count > 0)
                {
                    var elapsedMilliseconds = successResponses.Sum(x => x.ElapsedMilliseconds);
                    await _statisticsService.IncrementDownloadSuccessAsync(agentId, successResponses.Count,
                                                                           elapsedMilliseconds);
                }

                // Process the successfully downloaded responses concurrently.
                // Parallel.ForEach cannot await an async lambda (it becomes async void, so an exception
                // escaping it is unobserved), so each response runs as its own task and the batch is awaited.
                var processingTasks = successResponses.Select(response => Task.Run(async () =>
                {
                    Logger.LogInformation($"{Id} download {response.Request.Url} success");

                    try
                    {
                        using (var scope = Services.CreateScope())
                        {
                            var context = new DataFlowContext(response, scope.ServiceProvider);

                            foreach (var dataFlow in _dataFlows)
                            {
                                var dataFlowResult = await dataFlow.HandleAsync(context);
                                var @break         = false;
                                switch (dataFlowResult)
                                {
                                case DataFlowResult.Success:
                                    {
                                        continue;
                                    }

                                case DataFlowResult.Failed:
                                    {
                                        // A failed data flow ends the handling of this response.
                                        Logger.LogInformation(
                                            $"{Id} handle {response.Request.Url} failed: {context.Message}");
                                        await _statisticsService.IncrementFailedAsync(Id);
                                        return;
                                    }

                                case DataFlowResult.Terminated:
                                    {
                                        @break = true;
                                        break;
                                    }
                                }

                                if (@break)
                                {
                                    break;
                                }
                            }

                            var resultIsEmpty = !context.HasData && !context.HasParseData;
                            // Retry when parsing produced nothing.
                            if (resultIsEmpty && RetryWhenResultIsEmpty)
                            {
                                if (response.Request.RetriedTimes < response.Request.RetryTimes)
                                {
                                    response.Request.RetriedTimes++;
                                    await EnqueueRequests(response.Request);
                                    // The request will be parsed again on retry, so the follow-up links and
                                    // the success status are handled on that final pass.
                                    Logger.LogInformation($"{Id} retry {response.Request.Url} because empty result");
                                    return;
                                }
                            }

                            // Enqueue the follow-up requests discovered while parsing.
                            if (context.ExtraRequests != null && context.ExtraRequests.Count > 0)
                            {
                                var requests = new List <Request>();
                                foreach (var newRequest in context.ExtraRequests)
                                {
                                    newRequest.Depth = response.Request.Depth + 1;
                                    if (newRequest.Depth <= Depth)
                                    {
                                        // Force OwnerId/AgentId here in case the user forgot to set them.
                                        if (string.IsNullOrWhiteSpace(newRequest.OwnerId))
                                        {
                                            newRequest.OwnerId = context.Response.Request.OwnerId;
                                            newRequest.AgentId = context.Response.Request.AgentId;
                                        }

                                        requests.Add(newRequest);
                                    }
                                }

                                var count = _scheduler.Enqueue(requests);
                                if (count > 0)
                                {
                                    await _statisticsService.IncrementTotalAsync(Id, count);
                                }
                            }

                            if (!resultIsEmpty)
                            {
                                await _statisticsService.IncrementSuccessAsync(Id);
                                Logger.LogInformation($"{Id} handle {response.Request.Url} success");
                            }
                            else
                            {
                                if (RetryWhenResultIsEmpty)
                                {
                                    await _statisticsService.IncrementFailedAsync(Id);
                                    Logger.LogInformation(
                                        $"{Id} handle {response.Request.Url} failed,extract result is empty");
                                }
                                else
                                {
                                    await _statisticsService.IncrementSuccessAsync(Id);
                                    Logger.LogInformation(
                                        $"{Id} handle {response.Request.Url} success,extract result is empty");
                                }
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        await _statisticsService.IncrementFailedAsync(Id);
                        Logger.LogInformation($"{Id} handle {response.Request.Url} failed: {e}");
                    }
                }));
                await Task.WhenAll(processingTasks);

                // TODO: this section needs optimizing
                var retryResponses =
                    responses.Where(x => !x.Success && x.Request.RetriedTimes < x.Request.RetryTimes)
                    .ToList();
                var downloadFailedResponses =
                    responses.Where(x => !x.Success)
                    .ToList();
                var failedResponses =
                    responses.Where(x => !x.Success && x.Request.RetriedTimes >= x.Request.RetryTimes)
                    .ToList();

                foreach (var response in downloadFailedResponses)
                {
                    Logger.LogError($"{Id} download failed: {JsonConvert.SerializeObject(response)}");
                }

                foreach (var response in failedResponses)
                {
                    Logger.LogError($"{Id} failed: {JsonConvert.SerializeObject(response)}");
                }

                if (retryResponses.Count > 0)
                {
                    retryResponses.ForEach(x =>
                    {
                        x.Request.RetriedTimes++;
                        Logger.LogInformation($"{Id} download {x.Request.Url} failed: {x.Exception}");
                    });
                    await EnqueueRequests(retryResponses.Select(x => x.Request).ToArray());
                }

                // Record download-failure statistics.
                if (downloadFailedResponses.Count > 0)
                {
                    var elapsedMilliseconds = downloadFailedResponses.Sum(x => x.ElapsedMilliseconds);
                    await _statisticsService.IncrementDownloadFailedAsync(agentId,
                                                                          downloadFailedResponses.Count, elapsedMilliseconds);
                }

                // Record requests that failed for good (no retries left).
                if (failedResponses.Count > 0)
                {
                    await _statisticsService.IncrementFailedAsync(Id, failedResponses.Count);
                }
            }
            catch (Exception ex)
            {
                Logger.LogError($"{Id} handle message {message} failed: {ex}");
            }
        }
Пример #25
0
 /// <summary>
 /// Creates a consumer bound to the given data-flow context.
 /// </summary>
 /// <param name="context">Context stored in the <c>_context</c> field.</param>
 public DecompressConsumer(DataFlowContext context) => _context = context;
Пример #26
0
 /// <summary>
 /// Parses the supplied context; the concrete behavior is provided by derived classes.
 /// </summary>
 /// <param name="context">The data-flow context to parse.</param>
 /// <returns>A task producing a <c>DataFlowResult</c>.</returns>
 protected abstract Task <DataFlowResult> Parse(DataFlowContext context);
Пример #27
0
        /// <summary>
        /// Handles a message carrying one <c>Response</c> or an array of them: removes the responses
        /// from the in-flight request cache, runs every successful response through the data-flow
        /// pipeline, re-enqueues failed downloads that still have retries left and updates the statistics.
        /// </summary>
        /// <param name="cmd">Command name; not used by this method.</param>
        /// <param name="message">Expected to be a <c>Response[]</c> or a single <c>Response</c>; anything else is treated as an empty reply.</param>
        /// <returns>A task that completes once every response in the message has been handled.</returns>
        private async Task HandleDynamicMessage(string cmd, dynamic message)
        {
            _lastRequestedTime = DateTime.Now;

            Response[] responses;

            try
            {
                responses = message as Response[];                // JsonConvert.DeserializeObject<Response[]>(message);
                if (responses == null)
                {
                    var response = message as Response;
                    if (response != null)
                    {
                        responses = new Response[] { response };
                    }
                }
            }
            catch
            {
                _logger.LogError($"Task {Id} received an exception message: {message}");
                return;
            }

            try
            {
                if (responses == null || responses.Length == 0)
                {
                    _logger.LogWarning($"Task {Id} received an empty reply");
                    return;
                }

                _responded.Add(responses.Length);

                // Any response removes its request from the cache; a request that has to be downloaded
                // again is put back into the cache by EnqueueRequests.
                // Only the one-to-one pairing of "sent -> received" matters for the detection mechanism.
                foreach (var response in responses)
                {
                    _enqueuedRequestDict.TryRemove(response.Request.Hash, out _);
                }

                var agentId = responses.First().AgentId;

                var successResponses = responses.Where(x => x.Success).ToList();
                // Record download-success statistics.
                if (successResponses.Count > 0)
                {
                    var elapsedMilliseconds = successResponses.Sum(x => x.ElapsedMilliseconds);
                    await _statisticsService.IncrementDownloadSuccessAsync(agentId, successResponses.Count,
                                                                           elapsedMilliseconds);
                }

                // Process the successfully downloaded responses concurrently.
                // Parallel.ForEach cannot await an async lambda (it becomes async void, so an exception
                // escaping it is unobserved), so each response runs as its own task and the batch is awaited.
                var processingTasks = successResponses.Select(response => Task.Run(async () =>
                {
                    _logger.LogInformation($"Task {Id} Download {response.Request.Url} Success");

                    try
                    {
                        // The scope is disposed once this response has gone through the pipeline.
                        using (var scope = _services.CreateScope())
                        {
                            var context            = new DataFlowContext(response, scope.ServiceProvider);
                            context["ProjectName"] = "Vnexpress.net";
                            foreach (var dataFlow in _dataFlows)
                            {
                                var dataFlowResult = await dataFlow.HandleAsync(context);
                                var @break         = false;
                                switch (dataFlowResult)
                                {
                                case DataFlowResult.Success:
                                    {
                                        continue;
                                    }

                                case DataFlowResult.Failed:
                                    {
                                        // A failed data flow ends the handling of this response.
                                        _logger.LogInformation($"Task {Id} failed to process {response.Request.Url}: {context.Result}");
                                        await _statisticsService.IncrementFailedAsync(Id);
                                        return;
                                    }

                                case DataFlowResult.Terminated:
                                    {
                                        @break = true;
                                        break;
                                    }
                                }

                                if (@break)
                                {
                                    break;
                                }
                            }

                            var resultIsEmpty = !context.HasItems && !context.HasParseItems;
                            // Retry when parsing produced nothing.
                            if (resultIsEmpty && RetryWhenResultIsEmpty)
                            {
                                if (response.Request.RetriedTimes < RetryDownloadTimes)
                                {
                                    response.Request.RetriedTimes++;
                                    await EnqueueRequests(response.Request);

                                    // The request will be parsed again on retry, so the follow-up links and
                                    // the success status are handled on that final pass.
                                    _logger.LogInformation($"Task {Id} processing {response.Request.Url} parsing result is empty, try to try again.");
                                    return;
                                }
                            }

                            // Enqueue the follow-up requests discovered while parsing.
                            if (context.FollowRequests != null && context.FollowRequests.Count > 0)
                            {
                                var requests              = new List <Request>();
                                var currentPageIndex      = 1;
                                var requestPageIndexValue = context.Response.Request.GetProperty("PageIndex");
                                if (!string.IsNullOrWhiteSpace(requestPageIndexValue))
                                {
                                    currentPageIndex = int.Parse(requestPageIndexValue);
                                }
                                foreach (var followRequest in context.FollowRequests)
                                {
                                    if (followRequest.PageIndex <= PageLimit)
                                    {
                                        // Depth only grows for requests on the current page (detail pages),
                                        // not for "next page" requests.
                                        if (followRequest.PageIndex == currentPageIndex)
                                        {
                                            followRequest.Depth = response.Request.Depth + 1;
                                        }
                                        if (followRequest.Depth <= Depth)
                                        {
                                            requests.Add(followRequest);
                                        }
                                    }
                                }

                                var count = _scheduler.Enqueue(requests);
                                if (count > 0)
                                {
                                    await _statisticsService.IncrementTotalAsync(Id, count);
                                }
                            }

                            if (!resultIsEmpty)
                            {
                                await _statisticsService.IncrementSuccessAsync(Id);
                                _logger.LogInformation($"Task {Id} processed {response.Request.Url} successfully.");
                            }
                            else
                            {
                                if (RetryWhenResultIsEmpty)
                                {
                                    await _statisticsService.IncrementFailedAsync(Id);
                                    _logger.LogInformation($"Task {Id} failed to process {response.Request.Url}, parsing result is empty.");
                                }
                                else
                                {
                                    await _statisticsService.IncrementSuccessAsync(Id);
                                    _logger.LogInformation($"Task {Id} processed {response.Request.Url} succeeded, parsing result is empty.");
                                }
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        await _statisticsService.IncrementFailedAsync(Id);
                        _logger.LogInformation($"Task {Id} failed to process {response.Request.Url}: {e}");
                    }
                }));
                await Task.WhenAll(processingTasks);

                // TODO: this section needs optimizing
                var retryResponses =
                    responses.Where(x => !x.Success && x.Request.RetriedTimes < RetryDownloadTimes)
                    .ToList();
                var downloadFailedResponses =
                    responses.Where(x => !x.Success)
                    .ToList();
                var failedResponses =
                    responses.Where(x => !x.Success && x.Request.RetriedTimes >= RetryDownloadTimes)
                    .ToList();

                if (retryResponses.Count > 0)
                {
                    retryResponses.ForEach(x =>
                    {
                        x.Request.RetriedTimes++;
                        _logger.LogInformation($"Task {Id} Download {x.Request.Url} failed: {x.Exception}");
                    });
                    await EnqueueRequests(retryResponses.Select(x => x.Request).ToArray());
                }

                // Record download-failure statistics.
                if (downloadFailedResponses.Count > 0)
                {
                    var elapsedMilliseconds = downloadFailedResponses.Sum(x => x.ElapsedMilliseconds);
                    await _statisticsService.IncrementDownloadFailedAsync(agentId,
                                                                          downloadFailedResponses.Count, elapsedMilliseconds);
                }

                // Record requests that failed for good (no retries left).
                if (failedResponses.Count > 0)
                {
                    await _statisticsService.IncrementFailedAsync(Id, failedResponses.Count);
                }
            }
            catch (Exception ex)
            {
                _logger.LogError($"Task {Id} processing message {message} failed: {ex}");
            }
        }
Пример #28
0
 /// <summary>
 /// Builds the path of the data file for a table.
 /// </summary>
 /// <param name="context">Data-flow context; its request owner selects the data folder.</param>
 /// <param name="tableMetadata">Table metadata used to generate the file name.</param>
 /// <param name="extension">File extension, without the leading dot.</param>
 /// <returns>The data folder combined with the generated file name and the extension.</returns>
 protected virtual string GetDataFile(DataFlowContext context, TableMetadata tableMetadata, string extension)
 {
     var folder   = GetDataFolder(context.Request.Owner);
     var fileName = $"{GenerateFileName(tableMetadata)}.{extension}";

     return Path.Combine(folder, fileName);
 }
Пример #29
0
 /// <summary>
 /// Adds the request URL and the value selected by the XPath <c>.//title</c> to the context as items.
 /// </summary>
 /// <param name="context">Data-flow context that supplies the response and receives the items.</param>
 /// <returns>An already-completed task carrying <c>DataFlowResult.Success</c>.</returns>
 protected override Task <DataFlowResult> Parse(DataFlowContext context)
 {
     var url = context.Response.Request.Url;
     context.AddItem("URL", url);

     var title = context.GetSelectable().XPath(".//title").GetValue();
     context.AddItem("Title", title);

     var outcome = Task.FromResult(DataFlowResult.Success);
     return outcome;
 }
Пример #30
0
        /// <summary>
        /// Extracts objects of type <c>T</c> from the context's selectable according to <c>Model</c>
        /// and hands them to <c>AddParseResult</c>.
        /// </summary>
        /// <remarks>
        /// The environment passed to <c>ParseObject</c> starts from the request properties and is then
        /// overlaid with the values of the model's global value selectors. When the model has no
        /// selector the whole selectable is parsed as one object; otherwise each selected node is
        /// parsed, optionally limited to <c>Model.Take</c> nodes from the head or the tail.
        /// </remarks>
        /// <param name="context">Data-flow context that supplies the selectable and the response.</param>
        /// <returns>The result of the base <c>Parse</c> implementation.</returns>
        protected override Task <DataFlowResult> Parse(DataFlowContext context)
        {
            var selectable = context.Selectable;
            var results    = new ParseResult <T>();

            if (selectable.Properties == null)
            {
                selectable.Properties = new Dictionary <string, object>();
            }

            // Seed the environment with the request's own properties.
            var environments = new Dictionary <string, string>();
            foreach (var pair in context.Response.Request.Properties)
            {
                environments.Add(pair.Key, pair.Value);
            }

            // Global selectors overwrite a request property that has the same name.
            if (Model.GlobalValueSelectors != null)
            {
                foreach (var globalSelector in Model.GlobalValueSelectors)
                {
                    string name = globalSelector.Name;
                    if (!string.IsNullOrWhiteSpace(name))
                    {
                        environments[name] = selectable.Select(globalSelector.ToSelector()).GetValue();
                    }
                }
            }

            if (Model.Selector == null)
            {
                // No entity selector: the whole page is one object.
                var single = ParseObject(environments, selectable, 0);
                if (single == null)
                {
                    Logger?.LogWarning($"解析到空数据,类型: {Model.TypeName}");
                }
                else
                {
                    results.Add(single);
                }
            }
            else
            {
                var entitySelector = Model.Selector.ToSelector();
                var nodes          = selectable.SelectList(entitySelector).Nodes()?.ToList();

                if (nodes != null)
                {
                    if (Model.Take > 0 && nodes.Count > Model.Take)
                    {
                        if (Model.TakeFromHead)
                        {
                            nodes = nodes.Take(Model.Take).ToList();
                        }
                        else
                        {
                            nodes = nodes.Skip(nodes.Count - Model.Take).ToList();
                        }
                    }

                    for (var index = 0; index < nodes.Count; ++index)
                    {
                        var parsed = ParseObject(environments, nodes[index], index);
                        if (parsed == null)
                        {
                            Logger?.LogWarning($"解析到空数据,类型: {Model.TypeName}");
                            continue;
                        }

                        results.Add(parsed);
                    }
                }
            }

            AddParseResult(context, results);

            return base.Parse(context);
        }