/// <summary>
/// Runs <c>DataParser&lt;Product&gt;</c> over the saved listing page <c>Jd.html</c> and checks
/// the number of parsed rows plus every asserted field of the first row.
/// </summary>
public async Task ParseEntity()
{
    using (var hostBuilder = GetLocalSpiderHostBuilder())
    {
        var scopedServices = hostBuilder.Build().CreateScopeServiceProvider();

        // The request properties are expected to surface as CategoryName / CategoryId below.
        var request = new Request(
            "https://list.jd.com/list.html?cat=9987,653,655",
            new Dictionary<string, string>
            {
                { "cat", "手机" },
                { "cat3", "110" }
            });
        var response = new Response
        {
            Request = request,
            Content = File.ReadAllBytes("Jd.html"),
            CharSet = "UTF-8"
        };
        var flowContext = new DataFlowContext(response, scopedServices);

        var productParser = new DataParser<Product>();
        await productParser.HandleAsync(flowContext);

        var products = (ParseResult<Product>)flowContext.GetParseData(typeof(Product).FullName);
        Assert.Equal(60, products.Count);

        var first = products[0];
        Assert.Equal("手机", first.CategoryName);
        Assert.Equal(110, first.CategoryId);
        Assert.Equal("https://item.jd.com/3031737.html", first.Url);
        Assert.Equal("3031737", first.Sku);
        Assert.Equal("荣耀官方旗舰店", first.ShopName);
        Assert.Equal("荣耀 NOTE 8 4GB+32GB 全网通版 冰河银", first.Name);
        Assert.Equal("1000000904", first.VenderId);
        Assert.Equal("1000000904", first.JdzyShopId);

        // RunId is only compared at day granularity against the current date.
        Assert.Equal(DateTimeOffset.Now.ToString("yyyy-MM-dd"), first.RunId.ToString("yyyy-MM-dd"));
    }
}
/// <summary>
/// Runs <c>DataParser&lt;Product&gt;</c> over the text of <c>Jd.html</c> and checks the number of
/// parsed rows plus every asserted field of the first row.
/// </summary>
public void ParseEntity()
{
    var scopedServices = LocalSpiderProvider.Value.CreateScopeServiceProvider();

    // The request properties are expected to surface as CategoryName / CategoryId below.
    var request = new Request(
        "https://list.jd.com/list.html?cat=9987,653,655",
        new Dictionary<string, string>
        {
            { "cat", "手机" },
            { "cat3", "110" }
        });
    var response = new Response
    {
        Request = request,
        RawText = File.ReadAllText("Jd.html")
    };
    var flowContext = new DataFlowContext(response, scopedServices);

    // The test is synchronous, so the parser task is waited on in place.
    var productParser = new DataParser<Product>();
    productParser.HandleAsync(flowContext).GetAwaiter().GetResult();

    var products = (ParseResult<Product>)flowContext.GetParseData(typeof(Product).FullName);
    Assert.Equal(60, products.Count);

    var first = products[0];
    Assert.Equal("手机", first.CategoryName);
    Assert.Equal(110, first.CategoryId);
    Assert.Equal("https://item.jd.com/3031737.html", first.Url);
    Assert.Equal("3031737", first.Sku);
    Assert.Equal("荣耀官方旗舰店", first.ShopName);
    Assert.Equal("荣耀 NOTE 8 4GB+32GB 全网通版 冰河银", first.Name);
    Assert.Equal("1000000904", first.VenderId);
    Assert.Equal("1000000904", first.JdzyShopId);

    // RunId is only compared at day granularity against the current date.
    Assert.Equal(DateTime.Now.ToString("yyyy-MM-dd"), first.RunId.ToString("yyyy-MM-dd"));
}
/// <summary>
/// Parses the <c>Html</c> fixture with <c>DataParser&lt;E&gt;</c> and checks that the first two
/// parsed rows carry the titles "a" and "b", in that order.
/// </summary>
public void MultiEntitySelector()
{
    using (var hostBuilder = GetLocalSpiderHostBuilder())
    {
        var scopedServices = hostBuilder.Build().CreateScopeServiceProvider();

        var response = new Response
        {
            Request = new Request("http://abcd.com"),
            Content = Encoding.UTF8.GetBytes(Html),
            CharSet = "UTF-8"
        };
        var flowContext = new DataFlowContext(response, scopedServices);

        // The test is synchronous, so the parser task is waited on in place.
        var entityParser = new DataParser<E>();
        entityParser.HandleAsync(flowContext).GetAwaiter().GetResult();

        var rows = (ParseResult<E>)flowContext.GetParseData(typeof(E).FullName);
        Assert.Equal("a", rows[0].title);
        Assert.Equal("b", rows[1].title);
    }
}
/// <summary>
/// Writes every parse-data entry of the context as one JSON line to the writer
/// obtained from <c>CreateOrOpen</c> for that entry's table metadata.
/// </summary>
/// <param name="context">Data-flow context holding the parse data and, under the same key, its table metadata.</param>
/// <returns><c>DataFlowResult.Success</c> once every entry has been written.</returns>
protected override async Task<DataFlowResult> Store(DataFlowContext context)
{
    foreach (var entry in context.GetParseData())
    {
        // The table metadata is stored in the context under the same key as the parse data.
        var table = (TableMetadata)context[entry.Key];
        var output = CreateOrOpen(context, table, "json");

        var json = JsonConvert.SerializeObject(entry.Value);
        await output.WriteLineAsync(json);
    }

    return DataFlowResult.Success;
}
/// <summary>
/// Prints every parsed item in the context to the console, one JSON document per line.
/// </summary>
/// <param name="context">Data-flow context whose parse data is printed.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
protected override Task<DataFlowResult> Store(DataFlowContext context)
{
    foreach (var entry in context.GetParseData())
    {
        foreach (var row in entry.Value)
        {
            var json = JsonConvert.SerializeObject(row);
            Console.WriteLine(json);
        }
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Publishes <paramref name="result"/> into the context under <c>Model.TypeName</c>.
/// An empty result is ignored; if parse data is already stored under that name the
/// new rows are appended to it, otherwise the result is stored as-is.
/// </summary>
/// <param name="context">Data-flow context receiving the parse data.</param>
/// <param name="result">Rows parsed for the current model.</param>
protected virtual void AddParseResult(DataFlowContext context, ParseResult<T> result)
{
    if (result.Count <= 0)
    {
        return;
    }

    var existing = context.GetParseData(Model.TypeName);
    if (existing != null)
    {
        // Parse data for this model already exists: extend it in place.
        ((ParseResult<T>)existing).AddRange(result);
        return;
    }

    context.AddParseData(Model.TypeName, result);
}
/// <summary>
/// Parses the <c>Html</c> fixture with <c>DataParser&lt;E&gt;</c> and checks that the first two
/// parsed rows carry the titles "a" and "b", in that order.
/// </summary>
public void MultiEntitySelector()
{
    var scopedServices = LocalSpiderProvider.Value.CreateScopeServiceProvider();

    var response = new Response
    {
        Request = new Request("http://abcd.com"),
        RawText = Html
    };
    var flowContext = new DataFlowContext(response, scopedServices);

    // The test is synchronous, so the parser task is waited on in place.
    var entityParser = new DataParser<E>();
    entityParser.HandleAsync(flowContext).GetAwaiter().GetResult();

    var rows = (ParseResult<E>)flowContext.GetParseData(typeof(E).FullName);
    Assert.Equal("a", rows[0].title);
    Assert.Equal("b", rows[1].title);
}
/// <summary>
/// Parses the <c>Html</c> fixture with <c>DataParser&lt;N&gt;</c> and checks the
/// <c>title</c> and <c>dotnetspider</c> values of the first parsed row.
/// </summary>
public void SingleEntitySelector()
{
    var scopedServices = SpiderProvider.Value.CreateScopeServiceProvider();

    var response = new Response
    {
        Request = new Request("http://abcd.com"),
        RawText = Html
    };
    var flowContext = new DataFlowContext(response, scopedServices);

    // The test is synchronous, so the parser task is waited on in place.
    var entityParser = new DataParser<N>();
    entityParser.HandleAsync(flowContext).GetAwaiter().GetResult();

    var rows = (ParseResult<N>)flowContext.GetParseData(typeof(N).FullName);
    var first = rows[0];
    Assert.Equal("i am title", first.title);
    Assert.Equal("i am dotnetspider", first.dotnetspider);
}
/// <summary>
/// Writes every parse-data entry of the context to a file, choosing the writer by the
/// configured <c>MySqlFileType</c>: <c>LoadFile</c> goes to <c>WriteLoadFile</c> and
/// <c>InsertSql</c> goes to <c>WriteInsertFile</c>. Any other value writes nothing.
/// </summary>
/// <param name="context">Data-flow context holding the parse data and, under the same key, its table metadata.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
protected override Task<DataFlowResult> Store(DataFlowContext context)
{
    foreach (var entry in context.GetParseData())
    {
        // The table metadata is stored in the context under the same key as the parse data.
        var table = (TableMetadata)context[entry.Key];

        // Read the configured file type once per entry, as the original switch did.
        var fileType = MySqlFileType;
        if (fileType == MySqlFileType.LoadFile)
        {
            WriteLoadFile(context, table, entry.Value);
        }
        else if (fileType == MySqlFileType.InsertSql)
        {
            WriteInsertFile(context, table, entry.Value);
        }
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Persists every parse-data entry of the context through a database connection,
/// using the SQL statement that matches the configured <c>StorageType</c>.
/// </summary>
/// <remarks>
/// Each entry is attempted up to <c>RetryTimes</c> times. A failure whose inner exception is an
/// <see cref="EndOfStreamException"/> is retried without a rollback; any other failure is
/// rolled back (when a transaction exists) and the entry is abandoned. Failures inside the
/// retry loop are logged rather than propagated.
/// </remarks>
/// <param name="context">Data-flow context holding the parse data and, under the same key, its table metadata.</param>
/// <returns><c>DataFlowResult.Success</c> once all entries have been processed.</returns>
protected override async Task<DataFlowResult> Store(DataFlowContext context)
{
    using (IDbConnection conn = TryCreateDbConnection(context))
    {
        foreach (var entry in context.GetParseData())
        {
            var table = (TableMetadata)context[entry.Key];
            SqlStatements statements = GetSqlStatements(table);

            // Create the database/table only when this CREATE TABLE statement
            // was newly added to the executed-statement cache.
            if (_executedCache.TryAdd(statements.CreateTableSql, new object()))
            {
                EnsureDatabaseAndTableCreated(conn, statements);
            }

            for (int attempt = 0; attempt < RetryTimes; ++attempt)
            {
                IDbTransaction tx = null;
                try
                {
                    if (UseTransaction)
                    {
                        tx = conn.BeginTransaction();
                    }

                    var rows = entry.Value;
                    switch (StorageType)
                    {
                        case StorageType.Insert:
                            await conn.ExecuteAsync(statements.InsertSql, rows, tx);
                            break;

                        case StorageType.InsertIgnoreDuplicate:
                            await conn.ExecuteAsync(statements.InsertIgnoreDuplicateSql, rows, tx);
                            break;

                        case StorageType.Update:
                            if (string.IsNullOrWhiteSpace(statements.UpdateSql))
                            {
                                throw new SpiderException("未能生成更新 SQL");
                            }

                            await conn.ExecuteAsync(statements.UpdateSql, rows, tx);
                            break;

                        case StorageType.InsertAndUpdate:
                            await conn.ExecuteAsync(statements.InsertAndUpdateSql, rows, tx);
                            break;
                    }

                    tx?.Commit();

                    // Stored successfully: no further attempts for this entry.
                    break;
                }
                catch (Exception ex)
                {
                    Logger?.LogError($"尝试插入数据失败: {ex}");

                    // A network failure needs a retry and must not be rolled back.
                    if (ex.InnerException is EndOfStreamException)
                    {
                        continue;
                    }

                    try
                    {
                        tx?.Rollback();
                    }
                    catch (Exception e)
                    {
                        Logger?.LogError($"数据库回滚失败: {e}");
                    }

                    // Any other failure is not retried.
                    break;
                }
                finally
                {
                    tx?.Dispose();
                }
            }
        }
    }

    return DataFlowResult.Success;
}
/// <summary>
/// Extracts entities of type <typeparamref name="T"/> from the context's selectable content
/// and publishes them as parse data under the model's type name.
/// </summary>
/// <remarks>
/// The extraction environment starts from the request properties and is then extended or
/// overridden by the model's named global value selectors. When the model has an entity
/// selector, every selected node (optionally limited by <c>Take</c> / <c>TakeFromHead</c>)
/// is parsed into one entity; otherwise the whole content is parsed as a single entity.
/// Nodes that parse to <c>null</c> are logged and skipped.
/// </remarks>
/// <param name="context">Data-flow context providing the response and receiving the parse data.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    // Register the table metadata under the model's type name the first time it is seen.
    if (!context.Contains(_model.TypeName))
    {
        context.Add(_model.TypeName, _tableMetadata);
    }

    var selectable = context.Selectable;
    var results = new ParseResult<T>();

    if (selectable.Properties == null)
    {
        selectable.Properties = new Dictionary<string, object>();
    }

    // Seed the environment with the properties attached to the originating request.
    var environments = new Dictionary<string, string>();
    foreach (var property in context.Response.Request.Properties)
    {
        environments.Add(property.Key, property.Value);
    }

    if (_model.GlobalValueSelectors != null)
    {
        foreach (var selector in _model.GlobalValueSelectors)
        {
            string name = selector.Name;
            if (string.IsNullOrWhiteSpace(name))
            {
                continue;
            }

            // The indexer adds or overwrites in a single lookup, so a global value
            // takes precedence over a request property of the same name.
            environments[name] = selectable.Select(selector.ToSelector()).GetValue();
        }
    }

    if (_model.Selector == null)
    {
        // No entity selector: the whole content yields at most one entity.
        var obj = ParseObject(environments, selectable, 0);
        if (obj != null)
        {
            results.Add(obj);
        }
        else
        {
            Logger?.LogWarning($"解析到空数据,类型: {_model.TypeName}");
        }
    }
    else
    {
        var selector = _model.Selector.ToSelector();
        var list = selectable.SelectList(selector).Nodes()?.ToList();
        if (list != null)
        {
            // Keep only the first or last Take nodes when a limit is configured.
            if (_model.Take > 0 && list.Count > _model.Take)
            {
                list = _model.TakeFromHead
                    ? list.Take(_model.Take).ToList()
                    : list.Skip(list.Count - _model.Take).ToList();
            }

            for (var i = 0; i < list.Count; ++i)
            {
                // Index the materialised list directly instead of going through ElementAt.
                var obj = ParseObject(environments, list[i], i);
                if (obj != null)
                {
                    results.Add(obj);
                }
                else
                {
                    Logger?.LogWarning($"解析到空数据,类型: {_model.TypeName}");
                }
            }
        }
    }

    if (results.Count > 0)
    {
        // Append to parse data already stored for this model, or store a new entry.
        var items = context.GetParseData(_model.TypeName);
        if (items == null)
        {
            context.AddParseData(_model.TypeName, results);
        }
        else
        {
            ((ParseResult<T>)items).AddRange(results);
        }
    }

    return Task.FromResult(DataFlowResult.Success);
}