/// <summary>
/// Stores one <see cref="CreateTableEntity7"/> through an insert storage whose
/// <c>IgnoreCase</c> flag is switched off, reads the row back from
/// <c>test.dbo.IgnoreCase</c> and checks every column value.
/// </summary>
public async Task IgnoreCase()
{
    const string dropTableSql = "drop table if exists test.dbo.IgnoreCase;";

    using (var conn = CreateConnection())
    {
        // When the entity's schema does not configure a table name, the class name is used.
        await conn.ExecuteAsync(dropTableSql);

        using (var builder = GetLocalSpiderHostBuilder())
        {
            var services = builder.Build().CreateScopeServiceProvider();

            var storage = (RelationalDatabaseEntityStorageBase)CreateStorage(StorageType.Insert);
            storage.IgnoreCase = false;

            var context = new DataFlowContext(null, services);
            var key = typeof(CreateTableEntity7).FullName;
            var source = new CreateTableEntity7();
            context.Add(key, source.GetTableMetadata());
            context.AddParseData(key, new ParseResult<CreateTableEntity7> { source });

            await storage.HandleAsync(context);

            var queried = await conn.QueryAsync<CreateTableEntity7>("SELECT * FROM test.dbo.IgnoreCase");
            var rows = queried.ToList();

            // Exactly one row must have been written.
            Assert.Single(rows);
            var stored = rows[0];
            Assert.Equal("xxx", stored.Str1);
            Assert.Equal("yyy", stored.Str2);
            Assert.Equal(655, stored.Required);
            Assert.Equal(0, stored.Decimal);
            Assert.Equal(600, stored.Long);
            Assert.Equal(400, stored.Double);
            Assert.Equal(200.0F, stored.Float);

            await conn.ExecuteAsync(dropTableSql);
        }
    }
}
/// <summary>
/// Stores one <see cref="CreateTableEntity4"/> in insert mode, checks the row read back
/// from <c>test.dbo.createtableprimay</c>, and asserts that <c>sp_pkeys</c> reports a
/// single primary-key column named <c>str2</c>.
/// </summary>
public async Task CreateTablePrimary()
{
    const string dropTableSql = "drop table if exists test.dbo.createtableprimay;";

    using var conn = CreateConnection();

    // When the entity's schema does not configure a table name, the class name is used.
    await conn.ExecuteAsync(dropTableSql);

    var storage = CreateStorage(StorageMode.Insert);
    var context = new DataFlowContext(null, null, null, null);
    var entityType = typeof(CreateTableEntity4);
    var source = new CreateTableEntity4();
    context.AddData(entityType, new List<CreateTableEntity4> { source });

    await storage.HandleAsync(context);

    var queried = await conn.QueryAsync<CreateTableEntity4>("SELECT * FROM test.dbo.createtableprimay");
    var rows = queried.ToList();

    // Exactly one row must have been written.
    Assert.Single(rows);
    var stored = rows[0];
    Assert.Equal("xxx", stored.Str1);
    Assert.Equal("yyy", stored.Str2);
    Assert.Equal(655, stored.Required);
    Assert.Equal(0, stored.Decimal);
    Assert.Equal(600, stored.Long);
    Assert.Equal(400, stored.Double);
    Assert.Equal(200.0F, stored.Float);

    // Inspect the primary key that was created for the table.
    var keyInfo = await conn.QueryAsync<IndexInfo>(@"USE test; EXEC sp_pkeys @table_name='createtableprimay'");
    var primaries = keyInfo.ToList();
    Assert.Single(primaries);
    Assert.Equal("str2", primaries[0].COLUMN_NAME);

    await conn.ExecuteAsync(dropTableSql);
}
/// <summary>
/// Sends the same <see cref="CreateTableEntity4"/> three times through an
/// insert-ignore-duplicate storage with <c>UseTransaction</c> enabled and asserts that
/// exactly one row ends up in <c>test.dbo.createtableprimay</c>.
/// </summary>
public async Task UseTransaction()
{
    const string dropTableSql = "drop table if exists test.dbo.createtableprimay;";

    using (var conn = CreateConnection())
    {
        // When the entity's schema does not configure a table name, the class name is used.
        await conn.ExecuteAsync(dropTableSql);

        var services = SpiderProvider.Value.CreateScopeServiceProvider();

        var storage = (RelationalDatabaseEntityStorageBase)CreateStorage(StorageType.InsertIgnoreDuplicate);
        storage.UseTransaction = true;

        var context = new DataFlowContext(null, services);
        var key = typeof(CreateTableEntity4).FullName;
        var source = new CreateTableEntity4();
        context.Add(key, source.GetTableMetadata());

        // The same instance is submitted three times.
        context.AddParseData(key, new ParseResult<CreateTableEntity4> { source, source, source });

        await storage.HandleAsync(context);

        var queried = await conn.QueryAsync<CreateTableEntity4>("SELECT * FROM test.dbo.createtableprimay");
        var rows = queried.ToList();

        Assert.Single(rows);
        var stored = rows[0];
        Assert.Equal("xxx", stored.Str1);
        Assert.Equal("yyy", stored.Str2);
        Assert.Equal(655, stored.Required);
        Assert.Equal(0, stored.Decimal);
        Assert.Equal(600, stored.Long);
        Assert.Equal(400, stored.Double);
        Assert.Equal(200.0F, stored.Float);

        await conn.ExecuteAsync(dropTableSql);
    }
}
/// <summary>
/// Sends the same <see cref="CreateTableEntity4"/> three times through an
/// insert-ignore-duplicate storage and asserts that the target table holds exactly one row
/// with the expected column values. Identifiers are quoted with <c>Escape</c>.
/// </summary>
public async Task InsertIgnoreDuplicate()
{
    using (var conn = CreateConnection())
    {
        // When the entity's schema does not configure a table name, the class name is used.
        var dropTableSql = $"drop table if exists {Escape}test{Escape}.{Escape}createtableprimay{Escape};";
        await conn.ExecuteAsync(dropTableSql);

        var storage = CreateStorage(StorageMode.InsertIgnoreDuplicate);
        var context = new DataFlowContext(null, new SpiderOptions(), new Request(), new Response());
        var entityType = typeof(CreateTableEntity4);
        var source = new CreateTableEntity4();

        // The same instance is submitted three times.
        context.AddData(entityType, new List<CreateTableEntity4> { source, source, source });

        await storage.HandleAsync(context);

        var queried = await conn.QueryAsync<CreateTableEntity4>(
            $"SELECT * FROM {Escape}test{Escape}.{Escape}createtableprimay{Escape}");
        var rows = queried.ToList();

        Assert.Single(rows);
        var stored = rows[0];
        Assert.Equal("xxx", stored.Str1);
        Assert.Equal("yyy", stored.Str2);
        Assert.Equal(655, stored.Required);
        Assert.Equal(0, stored.Decimal);
        Assert.Equal(600, stored.Long);
        Assert.Equal(400, stored.Double);
        Assert.Equal(200.0F, stored.Float);

        await conn.ExecuteAsync(dropTableSql);
    }
}
/// <summary>
/// Checks both sides of a required validator: a parser whose validator pattern does not
/// match the request URL yields no follow requests, while a parser whose validator pattern
/// matches yields the pager links.
/// </summary>
public async Task RequiredValidator()
{
    var request = new Request("http://cnblogs.com");

    // Case 1: the pattern "xxxcnblogs\.com" cannot match the request URL.
    var dataContext = new DataFlowContext(null, new SpiderOptions(), request,
        new Response { Content = new ByteArrayContent(File.ReadAllBytes("cnblogs.html")) });

    var dataParser = new TestDataParser();
    dataParser.SetLogger(NullLogger.Instance);
    dataParser.AddFollowRequestQuerier(Selectors.XPath(".//div[@class='pager']"));
    dataParser.AddRequiredValidator(r => Regex.IsMatch(r.RequestUri.ToString(), "xxxcnblogs\\.com"));

    await dataParser.HandleAsync(dataContext);
    var requests = dataContext.FollowRequests;
    Assert.Empty(requests);

    // Case 2: the pattern "cnblogs\.com" matches the request URL.
    var dataContext2 = new DataFlowContext(null, new SpiderOptions(), request,
        new Response { Content = new ByteArrayContent(File.ReadAllBytes("cnblogs.html")) });

    var dataParser2 = new TestDataParser();
    dataParser2.AddFollowRequestQuerier(Selectors.XPath(".//div[@class='pager']"));

    // Fix: the validator was previously added to the first parser (dataParser), so the
    // second parser ran without any validator and this case was never actually tested.
    dataParser2.AddRequiredValidator(r => Regex.IsMatch(r.RequestUri.ToString(), "cnblogs\\.com"));

    await dataParser2.HandleAsync(dataContext2);
    requests = dataContext2.FollowRequests;
    Assert.Equal(12, requests.Count);
    Assert.Contains(requests, r => r.RequestUri.ToString() == "http://cnblogs.com/sitehome/p/2");
}
/// <summary>
/// Writes every parsed item set in the context to a file, choosing the writer from the
/// current <c>MySqlFileType</c> value.
/// </summary>
/// <param name="context">Data-flow context holding the parsed items and their table metadata.</param>
/// <returns>A completed task carrying <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Store(DataFlowContext context)
{
    foreach (var entry in context.GetParseItems())
    {
        // The table metadata is stored in the context under the same key as the items.
        var metadata = (TableMetadata)context[entry.Key];

        var fileType = MySqlFileType;
        if (fileType == MySqlFileType.LoadFile)
        {
            WriteLoadFile(context, metadata, entry.Value);
        }
        else if (fileType == MySqlFileType.InsertSql)
        {
            WriteInsertFile(context, metadata, entry.Value);
        }

        // Any other file type writes nothing for this entry.
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Collects the subject links from the gallery list and queues one request per link.
/// </summary>
/// <param name="context">Data-flow context supplying the selectable page and the originating request.</param>
public static void GetSubjectUrl(DataFlowContext context)
{
    var hrefs = context.Selectable
        .XPath("//*[@id=\"listdiv\"]/ul/li/div[@class='galleryli_title']/a/@href")
        .GetValues();

    var subjectRequests = new List<Request>();
    foreach (var href in hrefs)
    {
        var parent = context.Response.Request;
        var subjectRequest = new Request { Url = href, OwnerId = parent.OwnerId };

        // Carry the parent's tag forward and record the parent URL as the referer.
        subjectRequest.AddProperty("tag", parent.GetProperty("tag"));
        subjectRequest.AddProperty("referer", parent.Url);
        subjectRequests.Add(subjectRequest);
    }

    if (subjectRequests.Count == 0)
    {
        return;
    }

    context.AddExtraRequests(subjectRequests.ToArray());
}
/// <summary>
/// Extracts the image URLs from a picture-browsing page, records the page URL and title as
/// items, and hands one download request per image to <c>ImageDownloader</c>.
/// </summary>
/// <param name="context">Data-flow context supplying the response and the selectable page.</param>
public static void GetDetailPictureUrl(DataFlowContext context)
{
    var response = context.GetResponse();
    var selectable = context.GetSelectable();

    // The title is the same for every image on the page, so query it once rather than
    // re-running the XPath for each image.
    var title = selectable.XPath(".//title").GetValue();

    context.AddItem("URL", response.Request.Url);
    context.AddItem("Title", title);

    var images = selectable.XPath("//*[@id=\"hgallery\"]/img/@src").GetValues();
    foreach (var image in images)
    {
        // Build the download request for this image URL.
        var request = new Request { Url = image, OwnerId = response.Request.OwnerId };
        request.AddProperty("tag", response.Request.GetProperty("tag"));
        request.AddProperty("referer", response.Request.GetProperty("referer"));
        request.AddProperty("subject", title);
        ImageDownloader.GetInstance().AddRequest(request);
    }
}
/// <summary>
/// Walks every news block on the page and queues a request for its entry link, attaching
/// the title, summary and view count as request properties.
/// </summary>
/// <param name="context">Data-flow context supplying the selectable page and the originating request.</param>
/// <returns>A completed task carrying <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var newsBlocks = context.Selectable.XPath(".//div[@class='news_block']").Nodes();

    foreach (var block in newsBlocks)
    {
        var headline = block
            .Select(Selectors.XPath(".//h2[@class='news_entry']"))
            .GetValue(ValueOption.InnerText);
        var link = block
            .Select(Selectors.XPath(".//h2[@class='news_entry']/a/@href"))
            .GetValue();
        var digest = block
            .Select(Selectors.XPath(".//div[@class='entry_summary']"))
            .GetValue(ValueOption.InnerText);

        // Strip the view-count suffix text from the view span's inner text.
        var viewText = block
            .Select(Selectors.XPath(".//span[@class='view']"))
            .GetValue(ValueOption.InnerText);
        var viewCount = viewText.Replace(" 人浏览", "");

        var detailRequest = CreateFromRequest(context.Response.Request, link);
        detailRequest.AddProperty("title", headline);
        detailRequest.AddProperty("summary", digest);
        detailRequest.AddProperty("views", viewCount);
        context.AddExtraRequests(detailRequest);
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Sends two <see cref="CreateTableEntity4"/> instances (the second with <c>Str1 = "zzz"</c>)
/// through an insert-and-update storage and asserts that the table holds a single row whose
/// <c>Str1</c> is "zzz".
/// </summary>
public async Task InsertAndUpdate()
{
    const string dropTableSql = "drop table if exists test.dbo.createtableprimay;";

    using (var conn = CreateConnection())
    {
        // When the entity's schema does not configure a table name, the class name is used.
        await conn.ExecuteAsync(dropTableSql);

        var storage = CreateStorage(StorageMode.InsertAndUpdate);
        var context = new DataFlowContext(null, null, null, null);
        var entityType = typeof(CreateTableEntity4);
        var original = new CreateTableEntity4();
        var batch = new List<CreateTableEntity4>
        {
            original,
            new CreateTableEntity4 { Str1 = "zzz" }
        };
        context.AddData(entityType, batch);

        await storage.HandleAsync(context);

        var queried = await conn.QueryAsync<CreateTableEntity4>("SELECT * FROM test.dbo.createtableprimay");
        var rows = queried.ToList();

        Assert.Single(rows);
        var stored = rows[0];
        Assert.Equal("zzz", stored.Str1);
        Assert.Equal("yyy", stored.Str2);
        Assert.Equal(655, stored.Required);
        Assert.Equal(0, stored.Decimal);
        Assert.Equal(600, stored.Long);
        Assert.Equal(400, stored.Double);
        Assert.Equal(200.0F, stored.Float);

        await conn.ExecuteAsync(dropTableSql);
    }
}
/// <summary>
/// Parses the saved <c>Jd.html</c> listing page into <see cref="Product"/> entities with
/// <see cref="DataParser{T}"/> and checks the entity count, the first entity's fields and
/// the number of follow requests.
/// </summary>
public async Task ParseEntity()
{
    var requestProperties = new Dictionary<string, object>
    {
        { "cat", "手机" },
        { "cat3", "110" }
    };
    var request = new Request("https://list.jd.com/list.html?cat=9987,653,655", requestProperties);
    var response = new Response { Content = new ByteArrayContent(File.ReadAllBytes("Jd.html")) };
    var dataContext = new DataFlowContext(null, new SpiderOptions(), request, response);

    var parser = new DataParser<Product>();
    await parser.InitializeAsync();
    parser.UseHtmlSelectableBuilder();
    await parser.HandleAsync(dataContext);

    var products = (List<Product>)dataContext.GetData(typeof(Product));
    Assert.Equal(60, products.Count);

    // The first three entities must all carry the page title.
    for (var index = 0; index < 3; index++)
    {
        Assert.Contains("手机商品筛选", products[index].Title);
    }

    var first = products[0];
    Assert.Equal("手机", first.CategoryName);
    Assert.Equal(110, first.CategoryId);
    Assert.Equal("https://item.jd.com/3031737.html", first.Url);
    Assert.Equal("3031737", first.Sku);
    Assert.Equal("荣耀官方旗舰店", first.ShopName);
    Assert.Equal("荣耀 NOTE 8 4GB+32GB 全网通版 冰河银", first.Name);
    Assert.Equal("1000000904", first.VenderId);
    Assert.Equal("1000000904", first.JdzyShopId);

    // RunId is compared to today's date at day precision.
    Assert.Equal(DateTimeOffset.Now.ToString("yyyy-MM-dd"), first.RunId.ToString("yyyy-MM-dd"));

    var followRequests = dataContext.FollowRequests;
    Assert.Equal(7, followRequests.Count);
}
/// <summary>
/// Stores the context's items by delegating to <c>Store</c>. A context without items is
/// treated as success; any exception is logged and reported as failure.
/// </summary>
/// <param name="context">Data-flow context whose items should be stored.</param>
/// <returns>
/// The result of <c>Store</c> when it is Failed or Terminated, Failed when an exception was
/// thrown, otherwise Success.
/// </returns>
public override async Task<DataFlowResult> HandleAsync(DataFlowContext context)
{
    try
    {
        if (context.HasItems)
        {
            var outcome = await Store(context);

            // Only Failed and Terminated are propagated; anything else is reported as Success.
            var mustPropagate = outcome == DataFlowResult.Failed
                                || outcome == DataFlowResult.Terminated;
            if (mustPropagate)
            {
                return outcome;
            }
        }

        return DataFlowResult.Success;
    }
    catch (Exception e)
    {
        Logger?.LogError($"数据存储发生异常: {e}");
        return DataFlowResult.Failed;
    }
}
/// <summary>
/// Deserializes the response body as JSON into <typeparamref name="T"/> when the request is
/// marked for this parser, stores the object in the context and invokes <c>OnHanlder</c>.
/// </summary>
/// <param name="context">Data-flow context supplying the request properties and the response.</param>
protected override async Task ParseAsync(DataFlowContext context)
{
    var requestProperties = context.Request.Properties;

    // Only handle requests whose check property names this parser.
    var isForThisParser = requestProperties.ContainsKey(REQUEST_CHECK_PROPERTY_NAME)
                          && (requestProperties[REQUEST_CHECK_PROPERTY_NAME] as string) == _parserName;
    if (!isForThisParser)
    {
        return;
    }

    var payload = context.Response.ReadAsString();
    var model = JsonConvert.DeserializeObject<T>(payload);

    context.AddData(typeof(T).Name, model);
    OnHanlder(context, model);
}
/// <summary>
/// Records the page URL and title, converts every song link in the pre-cache list into an
/// ".mp3" media URL, and hands one download request per distinct URL to <c>Downloader</c>.
/// </summary>
/// <param name="context">Data-flow context supplying the response and the selectable page.</param>
/// <returns>A completed task carrying <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    context.AddData("URL", context.Response.Request.Url);
    context.AddData("Title", context.Selectable.XPath(".//title").GetValue());

    // Media URL -> song name.
    var songs = new Dictionary<string, string>();
    var tagNodes = context.Selectable
        .XPath("//*[@id=\"song-list-pre-cache\"]/ul/li/a").Nodes();

    if (tagNodes != null)
    {
        foreach (var node in tagNodes)
        {
            var href = node.XPath("./@href").GetValue();

            // The media URL is built from the text before and after the first "song"
            // segment; links without such a segment (or without an href) are skipped
            // instead of throwing.
            var parts = href?.Split("song");
            if (parts == null || parts.Length < 2)
            {
                continue;
            }

            var url = parts[0] + "song/media/outer/url" + parts[1] + ".mp3";

            // A repeated link would make Dictionary.Add throw and abort the whole page.
            if (songs.ContainsKey(url))
            {
                continue;
            }

            var name = node.GetValue();
            songs.Add(url, name);
            Console.WriteLine("url:" + url + " - name:" + name);
        }
    }

    foreach (var song in songs)
    {
        var request = new Request { Url = song.Key, OwnerId = context.Response.Request.OwnerId };
        request.AddProperty("tag", song.Value);
        request.AddProperty("path", GetImagePath(song.Value));
        Downloader.GetInstance().AddRequest(request);
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Runs the XPath follow-request query for the pager element against the saved
/// <c>cnblogs.html</c> page and checks the number and content of the extracted links.
/// </summary>
public void XpathFollow()
{
    var services = LocalSpiderProvider.Value.CreateScopeServiceProvider();

    var response = new Response
    {
        Request = new Request("http://cnblogs.com"),
        Content = File.ReadAllBytes("cnblogs.html"),
        CharSet = "UTF-8"
    };
    var dataContext = new DataFlowContext(response, services);

    // Build the selectable from the response when the context does not have one yet.
    if (dataContext.Selectable == null)
    {
        dataContext.Selectable = dataContext.Response?.ToSelectable();
    }

    var queryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".//div[@class='pager']");
    var followUrls = queryFollowRequests(dataContext);

    Assert.Equal(12, followUrls.Count);
    Assert.Contains(followUrls, r => r == "http://cnblogs.com/sitehome/p/2");
}
/// <summary>
/// Runs the parsing stage: builds the selectable, calls <c>Parse</c>, then queries and
/// registers follow requests.
/// </summary>
/// <param name="context">Processing context; it and its response must be non-null.</param>
/// <returns>
/// Failed when the context or response is missing or an exception occurs; the result of
/// <c>Parse</c> when it is Failed or Terminated; otherwise Success.
/// </returns>
public override async Task<DataFlowResult> HandleAsync(DataFlowContext context)
{
    if (context?.Response == null)
    {
        Logger?.LogError("Data context or response content is empty");
        return DataFlowResult.Failed;
    }

    try
    {
        // A request this parser does not accept is skipped with Success so that the other
        // data-flow processors still run.
        var accepted = CanParse == null || CanParse(context.Response.Request);
        if (!accepted)
        {
            return DataFlowResult.Success;
        }

        // Invoked before Parse; per the original author's note this initializes the
        // selectable instance for the context.
        SelectableFactory?.Invoke(context);

        var outcome = await Parse(context);

        // Follow requests are collected regardless of the parse outcome.
        var followUrls = QueryFollowRequests?.Invoke(context);
        AddTargetRequests(context, followUrls);

        var mustPropagate = outcome == DataFlowResult.Failed
                            || outcome == DataFlowResult.Terminated;
        return mustPropagate ? outcome : DataFlowResult.Success;
    }
    catch (Exception e)
    {
        Logger?.LogError($"任务 {context.Response.Request.OwnerId} 数据解析发生异常: {e}");
        return DataFlowResult.Failed;
    }
}
/// <summary>
/// Runs the parsing stage: builds the selectable, calls <c>Parse</c>, then queries and
/// registers follow requests.
/// </summary>
/// <param name="context">Processing context; it and its response must be non-null.</param>
/// <returns>
/// Failed when the context or response is missing or an exception occurs; the result of
/// <c>Parse</c> when it is Failed or Terminated; otherwise Success.
/// </returns>
public override async Task<DataFlowResult> HandleAsync(DataFlowContext context)
{
    if (context?.Response == null)
    {
        Logger?.LogError("数据上下文或者响应内容为空");
        return DataFlowResult.Failed;
    }

    try
    {
        // A request this parser does not accept is skipped with Success so that the other
        // data-flow processors still run.
        var accepted = CanParse == null || CanParse(context.Response.Request);
        if (!accepted)
        {
            return DataFlowResult.Success;
        }

        SelectableFactory?.Invoke(context);

        var outcome = await Parse(context);

        // Follow requests are collected regardless of the parse outcome.
        var followUrls = QueryFollowRequests?.Invoke(context);
        AddTargetRequests(context, followUrls);

        var mustPropagate = outcome == DataFlowResult.Failed
                            || outcome == DataFlowResult.Terminated;
        return mustPropagate ? outcome : DataFlowResult.Success;
    }
    catch (Exception e)
    {
        Logger?.LogError($"任务 {context.Response.Request.OwnerId} 数据解析发生异常: {e}");
        return DataFlowResult.Failed;
    }
}
/// <summary>
/// Parses the pagination links of an album and queues one request per distinct page link.
/// </summary>
/// <param name="context">Data-flow context supplying the selectable page and the originating request.</param>
public static void GetSubjectPageUrl(DataFlowContext context)
{
    // Links already turned into a request; a set replaces the former string-to-string
    // dictionary that was only used for membership checks.
    var seenPages = new HashSet<string>();

    var pages = context.Selectable
        .XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href").GetValues();

    var requestList = new List<Request>();
    foreach (var page in pages)
    {
        // A null link previously threw from the dictionary lookup, outside the try block.
        if (page == null || seenPages.Contains(page))
        {
            continue;
        }

        try
        {
            var request = new Request { Url = page, OwnerId = context.Response.Request.OwnerId };

            // NOTE(review): the tag is hard-coded here rather than copied from the parent
            // request - confirm this is intended.
            request.AddProperty("tag", "萝莉");
            requestList.Add(request);

            // Mark the link as seen only after its request was built successfully.
            seenPages.Add(page);
        }
        catch (Exception e)
        {
            // Best effort: a link that cannot be turned into a request is reported and skipped.
            Console.WriteLine(e);
        }
    }

    if (requestList.Count > 0)
    {
        context.AddExtraRequests(requestList.ToArray());
    }
}
/// <summary>
/// Extracts entities of type <typeparamref name="T"/> from the context's selectable using
/// the model's selectors and appends them to the context's parse items for the model type.
/// </summary>
/// <param name="context">Data-flow context supplying the selectable, the request properties and the item store.</param>
/// <returns>A completed task carrying <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    // Register the table metadata under the model's type name once per context.
    if (!context.Contains(_model.TypeName))
    {
        context.Add(_model.TypeName, _tableMetadata);
    }

    var root = context.GetSelectable();
    var parsed = new ParseResult<T>();

    if (root.Properties == null)
    {
        root.Properties = new Dictionary<string, object>();
    }

    // Environment values start from the request properties ...
    var environments = new Dictionary<string, string>();
    foreach (var property in context.Response.Request.Properties)
    {
        environments.Add(property.Key, property.Value);
    }

    // ... and shared value selectors add to them, overwriting entries with the same name.
    if (_model.ShareValueSelectors != null)
    {
        foreach (var shared in _model.ShareValueSelectors)
        {
            string name = shared.Name;
            var value = root.Select(shared.ToSelector()).GetValue();
            environments[name] = value;
        }
    }

    if (_model.Selector == null)
    {
        // No entity selector: the whole page yields at most one entity.
        var single = ParseObject(environments, root, 0);
        if (single == null)
        {
            Logger?.LogWarning($"解析到空数据,类型: {_model.TypeName}");
        }
        else
        {
            parsed.Add(single);
        }
    }
    else
    {
        var entitySelector = _model.Selector.ToSelector();
        var nodes = root.SelectList(entitySelector).Nodes()?.ToList();
        if (nodes != null)
        {
            // Apply the Take limit from the head or from the tail of the node list.
            if (_model.Take > 0 && nodes.Count > _model.Take)
            {
                if (_model.TakeFromHead)
                {
                    nodes = nodes.Take(_model.Take).ToList();
                }
                else
                {
                    nodes = nodes.Skip(nodes.Count - _model.Take).ToList();
                }
            }

            for (var index = 0; index < nodes.Count; ++index)
            {
                var entity = ParseObject(environments, nodes[index], index);
                if (entity == null)
                {
                    Logger?.LogWarning($"解析到空数据,类型: {_model.TypeName}");
                }
                else
                {
                    parsed.Add(entity);
                }
            }
        }
    }

    if (parsed.Count > 0)
    {
        // Append to the existing result for this type, or register a new one.
        var existing = context.GetParseItem(_model.TypeName);
        if (existing != null)
        {
            ((ParseResult<T>)existing).AddRange(parsed);
        }
        else
        {
            context.AddParseItem(_model.TypeName, parsed);
        }
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Handles one raw message: either a command message (downloader allocation result) or a
/// JSON array of <c>Response</c> objects. Successful responses are pushed through the
/// data-flow chain; failed ones are retried or counted as failed.
/// </summary>
/// <param name="message">Raw message text received by the task.</param>
private async Task HandleMessage(string message)
{
    if (string.IsNullOrWhiteSpace(message))
    {
        _logger.LogWarning($"任务 {Id} 接收到空消息");
        return;
    }

    var commandMessage = message.ToCommandMessage();
    if (commandMessage != null)
    {
        switch (commandMessage.Command)
        {
            case Framework.AllocateDownloaderCommand:
            {
                if (commandMessage.Message == "true")
                {
                    _allocated.Inc();
                }
                else
                {
                    _logger.LogError($"任务 {Id} 分配下载器代理失败");
                    _allocatedSuccess = false;
                }

                break;
            }
            default:
            {
                _logger.LogError($"任务 {Id} 未能处理命令: {message}");
                break;
            }
        }

        return;
    }

    _lastRequestedTime = DateTime.Now;

    Response[] responses;

    try
    {
        responses = JsonConvert.DeserializeObject<Response[]>(message);
    }
    catch
    {
        _logger.LogError($"任务 {Id} 接收到异常消息: {message}");
        return;
    }

    try
    {
        // A JSON "null" payload deserializes to null; treat it like an empty reply
        // instead of letting it surface as a NullReferenceException below.
        if (responses == null || responses.Length == 0)
        {
            _logger.LogWarning($"任务 {Id} 接收到空回复");
            return;
        }

        _responded.Add(responses.Length);

        // Every response removes its request from the cache; a request that has to be
        // downloaded again is put back by EnqueueRequests. Only the one-to-one
        // "sent -> received" removal is needed for the detection mechanism to stay correct.
        foreach (var response in responses)
        {
            _enqueuedRequestDict.TryRemove(response.Request.Hash, out _);
        }

        var agentId = responses.First().AgentId;

        var successResponses = responses.Where(x => x.Success).ToList();

        // Count successful downloads.
        if (successResponses.Count > 0)
        {
            var elapsedMilliseconds = successResponses.Sum(x => x.ElapsedMilliseconds);
            await _statisticsService.IncrementDownloadSuccessAsync(agentId, successResponses.Count,
                elapsedMilliseconds);
        }

        // Process the successfully downloaded responses.
        // NOTE(review): Parallel.ForEach takes a synchronous delegate, so this async lambda
        // is not awaited by the loop - confirm that fire-and-forget processing is intended.
        Parallel.ForEach(successResponses, async response =>
        {
            _logger.LogInformation($"任务 {Id} 下载 {response.Request.Url} 成功");

            try
            {
                // Dispose the DI scope once this response is fully processed; it was
                // previously created and never released.
                using (var scope = _services.CreateScope())
                {
                    var context = new DataFlowContext(response, scope.ServiceProvider);

                    foreach (var dataFlow in _dataFlows)
                    {
                        var dataFlowResult = await dataFlow.HandleAsync(context);
                        var @break = false;
                        switch (dataFlowResult)
                        {
                            case DataFlowResult.Success:
                            {
                                continue;
                            }
                            case DataFlowResult.Failed:
                            {
                                // A failed processor ends the handling of this response.
                                _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 失败: {context.Result}");
                                await _statisticsService.IncrementFailedAsync(Id);
                                return;
                            }
                            case DataFlowResult.Terminated:
                            {
                                @break = true;
                                break;
                            }
                        }

                        if (@break)
                        {
                            break;
                        }
                    }

                    var resultIsEmpty = !context.HasItems && !context.HasParseItems;

                    // Retry when the parse result is empty.
                    if (resultIsEmpty && RetryWhenResultIsEmpty)
                    {
                        if (response.Request.RetriedTimes < RetryDownloadTimes)
                        {
                            response.Request.RetriedTimes++;
                            await EnqueueRequests(response.Request);

                            // The request will be parsed again after the retry, so follow
                            // requests and the success state are handled on the final attempt.
                            _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 解析结果为空,尝试重试");
                            return;
                        }
                    }

                    // Enqueue the follow requests found by the parsers, limited by depth.
                    if (context.FollowRequests != null && context.FollowRequests.Count > 0)
                    {
                        var requests = new List<Request>();
                        foreach (var followRequest in context.FollowRequests)
                        {
                            followRequest.Depth = response.Request.Depth + 1;
                            if (followRequest.Depth <= Depth)
                            {
                                requests.Add(followRequest);
                            }
                        }

                        var count = _scheduler.Enqueue(requests);
                        if (count > 0)
                        {
                            await _statisticsService.IncrementTotalAsync(Id, count);
                        }
                    }

                    if (!resultIsEmpty)
                    {
                        await _statisticsService.IncrementSuccessAsync(Id);
                        _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 成功");
                    }
                    else
                    {
                        if (RetryWhenResultIsEmpty)
                        {
                            await _statisticsService.IncrementFailedAsync(Id);
                            _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 失败,解析结果为空");
                        }
                        else
                        {
                            await _statisticsService.IncrementSuccessAsync(Id);
                            _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 成功,解析结果为空");
                        }
                    }
                }
            }
            catch (Exception e)
            {
                await _statisticsService.IncrementFailedAsync(Id);
                _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 失败: {e}");
            }
        });

        // TODO: this part needs optimizing.
        var retryResponses =
            responses.Where(x => !x.Success && x.Request.RetriedTimes < RetryDownloadTimes)
                .ToList();
        var downloadFailedResponses =
            responses.Where(x => !x.Success)
                .ToList();
        var failedResponses =
            responses.Where(x => !x.Success && x.Request.RetriedTimes >= RetryDownloadTimes)
                .ToList();

        if (retryResponses.Count > 0)
        {
            retryResponses.ForEach(x =>
            {
                x.Request.RetriedTimes++;
                _logger.LogInformation($"任务 {Id} 下载 {x.Request.Url} 失败: {x.Exception}");
            });
            await EnqueueRequests(retryResponses.Select(x => x.Request).ToArray());
        }

        // Count failed downloads.
        if (downloadFailedResponses.Count > 0)
        {
            var elapsedMilliseconds = downloadFailedResponses.Sum(x => x.ElapsedMilliseconds);
            await _statisticsService.IncrementDownloadFailedAsync(agentId,
                downloadFailedResponses.Count, elapsedMilliseconds);
        }

        // Count requests that have used up their retries.
        if (failedResponses.Count > 0)
        {
            await _statisticsService.IncrementFailedAsync(Id, failedResponses.Count);
        }
    }
    catch (Exception ex)
    {
        _logger.LogError($"任务 {Id} 处理消息 {message} 失败: {ex}");
    }
}
/// <summary>
/// Delegates to <c>GetDetailPictureUrl</c> for the given context and reports success.
/// </summary>
/// <param name="context">Data-flow context passed through to <c>GetDetailPictureUrl</c>.</param>
/// <returns>A completed task carrying <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    GetDetailPictureUrl(context);

    var completed = Task.FromResult(DataFlowResult.Success);
    return completed;
}
/// <summary>
/// Handles one raw message containing a JSON list of <c>Response</c> objects. Successful
/// responses are pushed through the data-flow chain; failed ones are counted and, while
/// they have retries left, enqueued again.
/// </summary>
/// <param name="message">Raw message text received by the task.</param>
private async Task HandleMessage(string message)
{
    if (string.IsNullOrWhiteSpace(message))
    {
        _logger.LogWarning($"任务 {Id} 接收到空消息");
        return;
    }

    _lastRequestedTime = DateTime.Now;

    var responses = JsonConvert.DeserializeObject<List<Response>>(message);

    // A JSON "null" payload deserializes to null; treat it like an empty reply instead
    // of failing with a NullReferenceException.
    if (responses == null || responses.Count == 0)
    {
        _logger.LogWarning($"任务 {Id} 接收到空回复");
        return;
    }

    var agentId = responses.First().AgentId;

    var successResponses = responses.Where(x => x.Success).ToList();

    // Count successful downloads.
    if (successResponses.Count > 0)
    {
        var elapsedMilliseconds = successResponses.Sum(x => x.ElapsedMilliseconds);
        await _statisticsService.IncrementDownloadSuccessAsync(agentId, successResponses.Count,
            elapsedMilliseconds);
    }

    // Process the successfully downloaded responses.
    // NOTE(review): Parallel.ForEach takes a synchronous delegate, so this async lambda is
    // not awaited by the loop - confirm that fire-and-forget processing is intended.
    Parallel.ForEach(successResponses, async response =>
    {
        _logger.LogInformation($"任务 {Id} 下载 {response.Request.Url} 成功");

        // Dispose the DI scope once this response is fully processed; it was previously
        // created and never released.
        using (var scope = _services.CreateScope())
        {
            var context = new DataFlowContext(scope.ServiceProvider);
            context.AddResponse(response);

            try
            {
                bool success = true;
                foreach (var dataFlow in _dataFlows)
                {
                    var dataFlowResult = await dataFlow.HandleAsync(context);

                    // Fix: a `break` inside the switch only leaves the switch, so Failed
                    // and Terminated used to fall through to the next processor. A flag
                    // now ends the chain for both results.
                    var stopChain = false;
                    switch (dataFlowResult)
                    {
                        case DataFlowResult.Success:
                        {
                            continue;
                        }
                        case DataFlowResult.Failed:
                        {
                            _logger.LogError($"任务 {Id} 数据流处理器 {dataFlow.GetType().Name} 失败");
                            success = false;
                            stopChain = true;
                            break;
                        }
                        case DataFlowResult.Terminated:
                        {
                            stopChain = true;
                            break;
                        }
                    }

                    if (stopChain)
                    {
                        break;
                    }
                }

                var resultItems = context.GetItems();

                // Retry when the parse result is empty.
                if ((resultItems == null || resultItems.Sum(x => x.Value == null ? 0 : x.Value.Count) == 0) &&
                    RetryWhenResultIsEmpty)
                {
                    response.Request.RetriedTimes++;
                    response.Request.ComputeHash();

                    // A retry is not added to the total count.
                    _scheduler.Enqueue(new[] { response.Request.Clone() });
                }

                // Enqueue the follow requests found by the parsers, limited by depth.
                var followRequests = context.GetTargetRequests();
                if (followRequests != null && followRequests.Count > 0)
                {
                    var requests = new List<Request>();
                    foreach (var followRequest in followRequests)
                    {
                        followRequest.Depth = response.Request.Depth + 1;
                        if (followRequest.Depth <= Depth)
                        {
                            requests.Add(followRequest);
                        }
                    }

                    var count = _scheduler.Enqueue(requests);
                    if (count > 0)
                    {
                        await _statisticsService.IncrementTotalAsync(Id, count);
                    }
                }

                if (success)
                {
                    await _statisticsService.IncrementSuccessAsync(Id);
                }
                else
                {
                    await _statisticsService.IncrementFailedAsync(Id);
                }

                var result = success ? "成功" : $"失败: {context.Result}";
                _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} {result}");
            }
            catch (Exception e)
            {
                _logger.LogInformation($"任务 {Id} 处理 {response.Request.Url} 失败: {e}");
            }
        }
    });

    var retryResponses =
        responses.Where(x => !x.Success && x.Request.RetriedTimes < RetryDownloadTimes)
            .ToList();
    retryResponses.ForEach(x =>
    {
        x.Request.RetriedTimes++;
        _logger.LogInformation($"任务 {Id} 下载 {x.Request.Url} 失败: {x.Exception}");
    });

    var failedRequests =
        responses.Where(x => !x.Success)
            .ToList();

    // Count failed downloads.
    if (failedRequests.Count > 0)
    {
        await _statisticsService.IncrementFailedAsync(Id);
        await _statisticsService.IncrementDownloadFailedAsync(agentId, failedRequests.Count);
    }

    var retryCount = _scheduler.Enqueue(retryResponses.Select(x => x.Request.Clone()));
    if (retryCount > 0)
    {
        await _statisticsService.IncrementTotalAsync(Id, retryCount);
    }
}
/// <summary>
/// Not implemented; calling this method always throws.
/// </summary>
/// <param name="context">Unused.</param>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public override Task HandleAsync(DataFlowContext context) =>
    throw new NotImplementedException();
/// <summary>
/// Handles a batch of download responses: updates the download statistics, runs every
/// successful response through the data-flow chain, and re-enqueues or counts the failed ones.
/// </summary>
/// <param name="message">Message whose <c>Data</c> is the array of responses to handle.</param>
private async Task HandleResponseAsync(MessageData<Response[]> message)
{
    if (message?.Data == null || message.Data.Length == 0)
    {
        Logger.LogWarning($"{Id} receive empty message");
        return;
    }

    _lastRequestedTime = DateTimeOffset.Now;

    var responses = message.Data;

    try
    {
        // NOTE(review): this repeats the length check made at the top of the method.
        if (responses.Length == 0)
        {
            Logger.LogWarning($"{Id} receive empty message");
            return;
        }

        _responded.Add(responses.Length);

        // Every response removes its request from the cache; a request that has to be
        // downloaded again is put back by EnqueueRequests.
        // Only the one-to-one "sent -> received" removal is needed for the detection mechanism to stay correct.
        foreach (var response in responses)
        {
            _enqueuedRequestDict.TryRemove(response.Request.Hash, out _);
        }

        var agentId = responses.First().AgentId;

        var successResponses = responses.Where(x => x.Success).ToList();
        // Count successful downloads.
        if (successResponses.Count > 0)
        {
            var elapsedMilliseconds = successResponses.Sum(x => x.ElapsedMilliseconds);
            await _statisticsService.IncrementDownloadSuccessAsync(agentId, successResponses.Count,
                elapsedMilliseconds);
        }

        // Process the successfully downloaded responses.
        // NOTE(review): Parallel.ForEach takes a synchronous delegate, so this async lambda
        // is not awaited by the loop - confirm that fire-and-forget processing is intended.
        Parallel.ForEach(successResponses, async response =>
        {
            Logger.LogInformation($"{Id} download {response.Request.Url} success");

            try
            {
                // One DI scope per response, disposed when processing of the response ends.
                using (var scope = Services.CreateScope())
                {
                    var context = new DataFlowContext(response, scope.ServiceProvider);

                    foreach (var dataFlow in _dataFlows)
                    {
                        var dataFlowResult = await dataFlow.HandleAsync(context);
                        // Set by the Terminated case so the foreach can be left after the switch.
                        var @break = false;
                        switch (dataFlowResult)
                        {
                            case DataFlowResult.Success:
                            {
                                continue;
                            }
                            case DataFlowResult.Failed:
                            {
                                // A failed processor ends the handling of this response.
                                Logger.LogInformation(
                                    $"{Id} handle {response.Request.Url} failed: {context.Message}");
                                await _statisticsService.IncrementFailedAsync(Id);
                                return;
                            }
                            case DataFlowResult.Terminated:
                            {
                                @break = true;
                                break;
                            }
                        }

                        if (@break)
                        {
                            break;
                        }
                    }

                    var resultIsEmpty = !context.HasData && !context.HasParseData;
                    // Retry when the parse result is empty.
                    if (resultIsEmpty && RetryWhenResultIsEmpty)
                    {
                        if (response.Request.RetriedTimes < response.Request.RetryTimes)
                        {
                            response.Request.RetriedTimes++;
                            await EnqueueRequests(response.Request);
                            // The request will be parsed again after the retry, so follow
                            // requests and the success state are handled on the final attempt.
                            Logger.LogInformation($"{Id} retry {response.Request.Url} because empty result");
                            return;
                        }
                    }

                    // Enqueue the extra requests found by the parsers, limited by depth.
                    if (context.ExtraRequests != null && context.ExtraRequests.Count > 0)
                    {
                        var requests = new List<Request>();
                        foreach (var newRequest in context.ExtraRequests)
                        {
                            newRequest.Depth = response.Request.Depth + 1;
                            if (newRequest.Depth <= Depth)
                            {
                                // Force the OwnerId here in case the user forgot to set it.
                                // NOTE(review): AgentId is only copied when OwnerId was blank.
                                if (string.IsNullOrWhiteSpace(newRequest.OwnerId))
                                {
                                    newRequest.OwnerId = context.Response.Request.OwnerId;
                                    newRequest.AgentId = context.Response.Request.AgentId;
                                }

                                requests.Add(newRequest);
                            }
                        }

                        var count = _scheduler.Enqueue(requests);
                        if (count > 0)
                        {
                            await _statisticsService.IncrementTotalAsync(Id, count);
                        }
                    }

                    if (!resultIsEmpty)
                    {
                        await _statisticsService.IncrementSuccessAsync(Id);
                        Logger.LogInformation($"{Id} handle {response.Request.Url} success");
                    }
                    else
                    {
                        // An empty result counts as failed only when empty results are retried.
                        if (RetryWhenResultIsEmpty)
                        {
                            await _statisticsService.IncrementFailedAsync(Id);
                            Logger.LogInformation(
                                $"{Id} handle {response.Request.Url} failed,extract result is empty");
                        }
                        else
                        {
                            await _statisticsService.IncrementSuccessAsync(Id);
                            Logger.LogInformation(
                                $"{Id} handle {response.Request.Url} success,extract result is empty");
                        }
                    }
                }
            }
            catch (Exception e)
            {
                await _statisticsService.IncrementFailedAsync(Id);
                Logger.LogInformation($"{Id} handle {response.Request.Url} failed: {e}");
            }
        });

        // TODO: this part needs optimizing.
        // Failed downloads that still have retries left.
        var retryResponses =
            responses.Where(x => !x.Success && x.Request.RetriedTimes < x.Request.RetryTimes)
                .ToList();
        // Every failed download in the batch.
        var downloadFailedResponses =
            responses.Where(x => !x.Success)
                .ToList();
        // Failed downloads that have used up their retries.
        var failedResponses =
            responses.Where(x => !x.Success && x.Request.RetriedTimes >= x.Request.RetryTimes)
                .ToList();

        foreach (var response in downloadFailedResponses)
        {
            Logger.LogError($"{Id} download failed: {JsonConvert.SerializeObject(response)}");
        }

        foreach (var response in failedResponses)
        {
            Logger.LogError($"{Id} failed: {JsonConvert.SerializeObject(response)}");
        }

        if (retryResponses.Count > 0)
        {
            retryResponses.ForEach(x =>
            {
                x.Request.RetriedTimes++;
                Logger.LogInformation($"{Id} download {x.Request.Url} failed: {x.Exception}");
            });
            await EnqueueRequests(retryResponses.Select(x => x.Request).ToArray());
        }

        // Count failed downloads.
        if (downloadFailedResponses.Count > 0)
        {
            var elapsedMilliseconds = downloadFailedResponses.Sum(x => x.ElapsedMilliseconds);
            await _statisticsService.IncrementDownloadFailedAsync(agentId,
                downloadFailedResponses.Count, elapsedMilliseconds);
        }

        // Count requests that have used up their retries.
        if (failedResponses.Count > 0)
        {
            await _statisticsService.IncrementFailedAsync(Id, failedResponses.Count);
        }
    }
    catch (Exception ex)
    {
        Logger.LogError($"{Id} handle message {message} failed: {ex}");
    }
}
/// <summary>
/// Creates a consumer bound to the given data-flow context.
/// </summary>
/// <param name="context">Context stored in the <c>_context</c> field.</param>
public DecompressConsumer(DataFlowContext context) => _context = context;
/// <summary>
/// Parses the given context; implemented by derived classes.
/// </summary>
/// <param name="context">Data-flow context to parse.</param>
/// <returns>A task producing the <see cref="DataFlowResult"/> of the parse.</returns>
protected abstract Task <DataFlowResult> Parse(DataFlowContext context);
/// <summary>
/// Handles a download reply: updates the download statistics, runs the data flows
/// over every successfully downloaded response, and re-enqueues failed downloads
/// that still have retries left.
/// </summary>
/// <param name="cmd">Command name of the message; not used by this method.</param>
/// <param name="message">
/// Either a <c>Response[]</c> or a single <c>Response</c>. Any other payload is
/// treated as an empty reply and only logged.
/// </param>
/// <returns>
/// A task that completes once all responses of the message have been processed.
/// Exceptions are logged and never propagate to the caller.
/// </returns>
private async Task HandleDynamicMessage(string cmd, dynamic message)
{
    _lastRequestedTime = DateTime.Now;

    Response[] responses;
    try
    {
        responses = message as Response[];
        if (responses == null)
        {
            var single = message as Response;
            if (single != null)
            {
                responses = new Response[] { single };
            }
        }
    }
    catch
    {
        _logger.LogError($"Task {Id} received an exception message: {message}");
        return;
    }

    try
    {
        if (responses == null || responses.Length == 0)
        {
            _logger.LogWarning($"Task {Id} received an empty reply");
            return;
        }

        _responded.Add(responses.Length);

        // As long as there is a response, it is removed from the cache. Even if the request has to be
        // downloaded again, EnqueueRequests adds it back. Only the one-to-one pairing Send -> Receive
        // must hold so that the detection mechanism stays correct.
        foreach (var response in responses)
        {
            _enqueuedRequestDict.TryRemove(response.Request.Hash, out _);
        }

        var agentId = responses.First().AgentId;
        var successResponses = responses.Where(x => x.Success).ToList();

        // Download-success statistics.
        if (successResponses.Count > 0)
        {
            var elapsedMilliseconds = successResponses.Sum(x => x.ElapsedMilliseconds);
            await _statisticsService.IncrementDownloadSuccessAsync(agentId, successResponses.Count,
                elapsedMilliseconds);
        }

        // Process the successful downloads concurrently. The previous Parallel.ForEach with an async
        // lambda produced "async void" callbacks: the work was fire-and-forget and its faults were
        // unobservable. Real tasks are started here and awaited at the end of the method instead.
        var processing = Task.WhenAll(
            successResponses.Select(r => Task.Run(() => ProcessSuccessResponseAsync(r))));

        // TODO: the bookkeeping below still needs optimizing.
        var downloadFailedResponses = responses.Where(x => !x.Success).ToList();
        var retryResponses = downloadFailedResponses
            .Where(x => x.Request.RetriedTimes < RetryDownloadTimes)
            .ToList();
        var failedResponses = downloadFailedResponses
            .Where(x => x.Request.RetriedTimes >= RetryDownloadTimes)
            .ToList();

        if (retryResponses.Count > 0)
        {
            retryResponses.ForEach(x =>
            {
                x.Request.RetriedTimes++;
                _logger.LogInformation($"Task {Id} Download {x.Request.Url} failed: {x.Exception}");
            });
            await EnqueueRequests(retryResponses.Select(x => x.Request).ToArray());
        }

        // Download-failure statistics.
        if (downloadFailedResponses.Count > 0)
        {
            var elapsedMilliseconds = downloadFailedResponses.Sum(x => x.ElapsedMilliseconds);
            await _statisticsService.IncrementDownloadFailedAsync(agentId, downloadFailedResponses.Count,
                elapsedMilliseconds);
        }

        // Requests that exhausted their retries count as failed.
        if (failedResponses.Count > 0)
        {
            await _statisticsService.IncrementFailedAsync(Id, failedResponses.Count);
        }

        // Observe completion (and any fault) of the response processing started above.
        await processing;
    }
    catch (Exception ex)
    {
        _logger.LogError($"Task {Id} processing message {message} failed: {ex}");
    }

    // Runs all data flows over one successfully downloaded response and records the outcome.
    async Task ProcessSuccessResponseAsync(Response response)
    {
        _logger.LogInformation($"Task {Id} Download {response.Request.Url} Success");
        try
        {
            // Dispose the scope when processing ends so scoped services do not leak per response.
            using (var scope = _services.CreateScope())
            {
                var context = new DataFlowContext(response, scope.ServiceProvider);
                context["ProjectName"] = "Vnexpress.net";

                foreach (var dataFlow in _dataFlows)
                {
                    var dataFlowResult = await dataFlow.HandleAsync(context);
                    if (dataFlowResult == DataFlowResult.Failed)
                    {
                        // If the processing fails, return directly.
                        _logger.LogInformation(
                            $"Task {Id} failed to process {response.Request.Url}: {context.Result}");
                        await _statisticsService.IncrementFailedAsync(Id);
                        return;
                    }

                    if (dataFlowResult == DataFlowResult.Terminated)
                    {
                        // A terminated flow stops the chain but is not a failure.
                        break;
                    }
                }

                var resultIsEmpty = !context.HasItems && !context.HasParseItems;

                // If the parsing result is empty, try again.
                if (resultIsEmpty && RetryWhenResultIsEmpty
                    && response.Request.RetriedTimes < RetryDownloadTimes)
                {
                    response.Request.RetriedTimes++;
                    await EnqueueRequests(response.Request);
                    // The request is retried, so parsing will run again; target links and the
                    // success status are handled by that later run.
                    _logger.LogInformation($"Task {Id} processing {response.Request.Url} parsing result is empty, try to try again.");
                    return;
                }

                // Parsed target requests.
                if (context.FollowRequests != null && context.FollowRequests.Count > 0)
                {
                    var requests = new List<Request>();
                    var currentPageIndex = 1;
                    var requestPageIndexValue = context.Response.Request.GetProperty("PageIndex");
                    if (!string.IsNullOrWhiteSpace(requestPageIndexValue))
                    {
                        currentPageIndex = int.Parse(requestPageIndexValue);
                    }

                    foreach (var followRequest in context.FollowRequests)
                    {
                        if (followRequest.PageIndex > PageLimit)
                        {
                            continue;
                        }

                        // Only increase Depth for detail pages, not for a "next page" link.
                        if (followRequest.PageIndex == currentPageIndex)
                        {
                            followRequest.Depth = response.Request.Depth + 1;
                        }

                        if (followRequest.Depth <= Depth)
                        {
                            requests.Add(followRequest);
                        }
                    }

                    var count = _scheduler.Enqueue(requests);
                    if (count > 0)
                    {
                        await _statisticsService.IncrementTotalAsync(Id, count);
                    }
                }

                if (!resultIsEmpty)
                {
                    await _statisticsService.IncrementSuccessAsync(Id);
                    _logger.LogInformation($"Task {Id} processed {response.Request.Url} successfully.");
                }
                else if (RetryWhenResultIsEmpty)
                {
                    await _statisticsService.IncrementFailedAsync(Id);
                    _logger.LogInformation($"Task {Id} failed to process {response.Request.Url}, parsing result is empty.");
                }
                else
                {
                    await _statisticsService.IncrementSuccessAsync(Id);
                    _logger.LogInformation($"Task {Id} processed {response.Request.Url} succeeded, parsing result is empty.");
                }
            }
        }
        catch (Exception e)
        {
            await _statisticsService.IncrementFailedAsync(Id);
            _logger.LogInformation($"Task {Id} failed to process {response.Request.Url}: {e}");
        }
    }
}
/// <summary>
/// Builds the path of the data file for a table.
/// </summary>
/// <param name="context">Data flow context; the owner of its request selects the data folder.</param>
/// <param name="tableMetadata">Table metadata used to generate the file name.</param>
/// <param name="extension">File extension, appended to the file name after a dot.</param>
/// <returns>The data folder combined with the generated file name and extension.</returns>
protected virtual string GetDataFile(DataFlowContext context, TableMetadata tableMetadata, string extension)
{
    var folder = GetDataFolder(context.Request.Owner);
    var fileName = $"{GenerateFileName(tableMetadata)}.{extension}";
    return Path.Combine(folder, fileName);
}
/// <summary>
/// Records the URL of the request and the value selected by the XPath
/// <c>.//title</c> as the result items "URL" and "Title".
/// </summary>
/// <param name="context">The data flow context that supplies the response and receives the items.</param>
/// <returns>A completed task whose result is <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var requestUrl = context.Response.Request.Url;
    context.AddItem("URL", requestUrl);

    var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
    context.AddItem("Title", pageTitle);

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Extracts entities from the context's selectable according to <c>Model</c>,
/// stores them through <c>AddParseResult</c> and then delegates to the base parser.
/// </summary>
/// <param name="context">The data flow context holding the selectable and the response.</param>
/// <returns>The task returned by the base <c>Parse</c> implementation.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var root = context.Selectable;
    var parsed = new ParseResult<T>();

    if (root.Properties == null)
    {
        root.Properties = new Dictionary<string, object>();
    }

    // Seed the environment with the properties carried by the originating request.
    var env = new Dictionary<string, string>();
    foreach (var pair in context.Response.Request.Properties)
    {
        env.Add(pair.Key, pair.Value);
    }

    // Global values are selected from the root selectable; selectors without a name
    // are ignored and a later value replaces an earlier one with the same name.
    if (Model.GlobalValueSelectors != null)
    {
        foreach (var globalSelector in Model.GlobalValueSelectors)
        {
            string key = globalSelector.Name;
            if (!string.IsNullOrWhiteSpace(key))
            {
                env[key] = root.Select(globalSelector.ToSelector()).GetValue();
            }
        }
    }

    if (Model.Selector == null)
    {
        // No entity selector: the root selectable yields at most one entity.
        var single = ParseObject(env, root, 0);
        if (single == null)
        {
            Logger?.LogWarning($"解析到空数据,类型: {Model.TypeName}");
        }
        else
        {
            parsed.Add(single);
        }
    }
    else
    {
        var nodes = root.SelectList(Model.Selector.ToSelector()).Nodes()?.ToList();
        if (nodes != null)
        {
            // Keep only the first or the last Model.Take nodes when a limit is configured.
            if (Model.Take > 0 && nodes.Count > Model.Take)
            {
                if (Model.TakeFromHead)
                {
                    nodes = nodes.Take(Model.Take).ToList();
                }
                else
                {
                    nodes = nodes.Skip(nodes.Count - Model.Take).ToList();
                }
            }

            var index = 0;
            foreach (var node in nodes)
            {
                var entity = ParseObject(env, node, index);
                if (entity == null)
                {
                    Logger?.LogWarning($"解析到空数据,类型: {Model.TypeName}");
                }
                else
                {
                    parsed.Add(entity);
                }

                index++;
            }
        }
    }

    AddParseResult(context, parsed);
    return base.Parse(context);
}