// Runs DataParser<Product> over the saved page "Jd.html" and verifies the number of
// parsed products as well as every asserted field of the first one.
public async Task ParseEntity()
        {
            using (var hostBuilder = GetLocalSpiderHostBuilder())
            {
                var serviceProvider = hostBuilder.Build().CreateScopeServiceProvider();

                // Presumably the source of the CategoryName / CategoryId values asserted below.
                var requestProperties = new Dictionary<string, string>
                {
                    { "cat", "手机" },
                    { "cat3", "110" }
                };
                var request = new Request("https://list.jd.com/list.html?cat=9987,653,655", requestProperties);
                var response = new Response
                {
                    Request = request,
                    Content = File.ReadAllBytes("Jd.html"),
                    CharSet = "UTF-8"
                };
                var context = new DataFlowContext(response, serviceProvider);

                var parser = new DataParser<Product>();
                await parser.HandleAsync(context);

                // Parsed entities are stored in the context under the entity type's full name.
                var products = (ParseResult<Product>)context.GetParseData(typeof(Product).FullName);
                Assert.Equal(60, products.Count);

                var first = products[0];
                Assert.Equal("手机", first.CategoryName);
                Assert.Equal(110, first.CategoryId);
                Assert.Equal("https://item.jd.com/3031737.html", first.Url);
                Assert.Equal("3031737", first.Sku);
                Assert.Equal("荣耀官方旗舰店", first.ShopName);
                Assert.Equal("荣耀 NOTE 8 4GB+32GB 全网通版 冰河银", first.Name);
                Assert.Equal("1000000904", first.VenderId);
                Assert.Equal("1000000904", first.JdzyShopId);
                Assert.Equal(DateTimeOffset.Now.ToString("yyyy-MM-dd"), first.RunId.ToString("yyyy-MM-dd"));
            }
        }
示例#2
0
        // Runs DataParser<Product> over the text of "Jd.html" and verifies the number of
        // parsed products as well as every asserted field of the first one.
        public void ParseEntity()
        {
            var scopedServices = LocalSpiderProvider.Value.CreateScopeServiceProvider();

            // Presumably the source of the CategoryName / CategoryId values asserted below.
            var requestProperties = new Dictionary<string, string>
            {
                { "cat", "手机" },
                { "cat3", "110" }
            };
            var response = new Response
            {
                Request = new Request("https://list.jd.com/list.html?cat=9987,653,655", requestProperties),
                RawText = File.ReadAllText("Jd.html")
            };
            var context = new DataFlowContext(response, scopedServices);

            // The test is synchronous, so it blocks until the parser has finished.
            var parser = new DataParser<Product>();
            parser.HandleAsync(context).GetAwaiter().GetResult();

            var products = (ParseResult<Product>)context.GetParseData(typeof(Product).FullName);
            Assert.Equal(60, products.Count);

            var first = products[0];
            Assert.Equal("手机", first.CategoryName);
            Assert.Equal(110, first.CategoryId);
            Assert.Equal("https://item.jd.com/3031737.html", first.Url);
            Assert.Equal("3031737", first.Sku);
            Assert.Equal("荣耀官方旗舰店", first.ShopName);
            Assert.Equal("荣耀 NOTE 8 4GB+32GB 全网通版 冰河银", first.Name);
            Assert.Equal("1000000904", first.VenderId);
            Assert.Equal("1000000904", first.JdzyShopId);
            Assert.Equal(DateTime.Now.ToString("yyyy-MM-dd"), first.RunId.ToString("yyyy-MM-dd"));
        }
        // Parses the in-memory Html document with DataParser<E> and checks the titles
        // of the first two extracted entities.
        public void MultiEntitySelector()
        {
            using (var hostBuilder = GetLocalSpiderHostBuilder())
            {
                var serviceProvider = hostBuilder.Build().CreateScopeServiceProvider();

                var response = new Response
                {
                    Request = new Request("http://abcd.com"),
                    Content = Encoding.UTF8.GetBytes(Html),
                    CharSet = "UTF-8"
                };
                var context = new DataFlowContext(response, serviceProvider);

                // The test is synchronous, so it blocks until the parser has finished.
                new DataParser<E>().HandleAsync(context).GetAwaiter().GetResult();

                var entities = (ParseResult<E>)context.GetParseData(typeof(E).FullName);

                Assert.Equal("a", entities[0].title);
                Assert.Equal("b", entities[1].title);
            }
        }
        /// <summary>
        /// Serializes every parse-data entry of <paramref name="context"/> to JSON and writes it
        /// as one line to the writer returned by <c>CreateOrOpen</c> for that entry's table metadata.
        /// </summary>
        /// <param name="context">Data-flow context holding the parse data and, under the same keys, the table metadata.</param>
        /// <returns><see cref="DataFlowResult.Success"/> once all entries have been written.</returns>
        protected override async Task<DataFlowResult> Store(DataFlowContext context)
        {
            foreach (var entry in context.GetParseData())
            {
                // The table metadata is stored in the context under the same key as the parse data.
                var metadata = (TableMetadata)context[entry.Key];
                var output = CreateOrOpen(context, metadata, "json");

                var json = JsonConvert.SerializeObject(entry.Value);
                await output.WriteLineAsync(json);
            }

            return DataFlowResult.Success;
        }
        /// <summary>
        /// Prints every parsed item held by <paramref name="context"/> to the console,
        /// one JSON document per line.
        /// </summary>
        /// <param name="context">Data-flow context holding the parse data.</param>
        /// <returns>A completed task carrying <see cref="DataFlowResult.Success"/>.</returns>
        protected override Task<DataFlowResult> Store(DataFlowContext context)
        {
            foreach (var entry in context.GetParseData())
            {
                // Each entry's value is itself a sequence of parsed items.
                foreach (var record in entry.Value)
                {
                    var json = JsonConvert.SerializeObject(record);
                    Console.WriteLine(json);
                }
            }

            return Task.FromResult(DataFlowResult.Success);
        }
示例#6
0
 /// <summary>
 /// Records <paramref name="result"/> in <paramref name="context"/> under the model's type name:
 /// appended to the parse data already stored there, or stored as a new entry when none exists.
 /// An empty result is ignored.
 /// </summary>
 /// <param name="context">Data-flow context that owns the parse data.</param>
 /// <param name="result">Newly parsed items to record.</param>
 protected virtual void AddParseResult(DataFlowContext context, ParseResult<T> result)
 {
     // Nothing to record for an empty result.
     if (result.Count <= 0)
     {
         return;
     }

     var existing = context.GetParseData(Model.TypeName);
     if (existing != null)
     {
         // Merge into the entry already stored for this type.
         ((ParseResult<T>)existing).AddRange(result);
         return;
     }

     context.AddParseData(Model.TypeName, result);
 }
示例#7
0
        // Parses the in-memory Html document with DataParser<E> and checks the titles
        // of the first two extracted entities.
        public void MultiEntitySelector()
        {
            var scopedServices = LocalSpiderProvider.Value.CreateScopeServiceProvider();

            var response = new Response
            {
                Request = new Request("http://abcd.com"),
                RawText = Html
            };
            var context = new DataFlowContext(response, scopedServices);

            // The test is synchronous, so it blocks until the parser has finished.
            new DataParser<E>().HandleAsync(context).GetAwaiter().GetResult();

            var entities = (ParseResult<E>)context.GetParseData(typeof(E).FullName);

            Assert.Equal("a", entities[0].title);
            Assert.Equal("b", entities[1].title);
        }
示例#8
0
        // Parses the in-memory Html document with DataParser<N> and checks both asserted
        // fields of the first extracted entity.
        public void SingleEntitySelector()
        {
            var scopedServices = SpiderProvider.Value.CreateScopeServiceProvider();

            var response = new Response
            {
                Request = new Request("http://abcd.com"),
                RawText = Html
            };
            var context = new DataFlowContext(response, scopedServices);

            // The test is synchronous, so it blocks until the parser has finished.
            new DataParser<N>().HandleAsync(context).GetAwaiter().GetResult();

            var entities = (ParseResult<N>)context.GetParseData(typeof(N).FullName);

            Assert.Equal("i am title", entities[0].title);
            Assert.Equal("i am dotnetspider", entities[0].dotnetspider);
        }
示例#9
0
        /// <summary>
        /// Writes every parse-data entry of <paramref name="context"/> through
        /// <c>WriteLoadFile</c> or <c>WriteInsertFile</c>, depending on the configured
        /// <c>MySqlFileType</c>. Entries are skipped when the file type matches neither case.
        /// </summary>
        /// <param name="context">Data-flow context holding the parse data and, under the same keys, the table metadata.</param>
        /// <returns>A completed task carrying <see cref="DataFlowResult.Success"/>.</returns>
        protected override Task<DataFlowResult> Store(DataFlowContext context)
        {
            foreach (var entry in context.GetParseData())
            {
                // The table metadata is stored in the context under the same key as the parse data.
                var metadata = (TableMetadata)context[entry.Key];

                // Read the property once per entry, as the original switch did.
                var fileType = MySqlFileType;
                if (fileType == MySqlFileType.LoadFile)
                {
                    WriteLoadFile(context, metadata, entry.Value);
                }
                else if (fileType == MySqlFileType.InsertSql)
                {
                    WriteInsertFile(context, metadata, entry.Value);
                }
            }

            return Task.FromResult(DataFlowResult.Success);
        }
        /// <summary>
        /// Writes every parse-data entry of <paramref name="context"/> to the database using the
        /// SQL statement that matches <c>StorageType</c>, optionally inside a transaction.
        /// </summary>
        /// <remarks>
        /// Exceptions thrown inside the per-entry retry loop are logged and not rethrown, and the
        /// method returns <see cref="DataFlowResult.Success"/> unconditionally, so a failed write
        /// is visible only in the log. Exceptions raised outside that loop (creating the
        /// connection, building the SQL statements, ensuring the table exists) propagate to the caller.
        /// </remarks>
        /// <param name="context">Data-flow context holding the parse data and, under the same keys, the table metadata.</param>
        /// <returns><see cref="DataFlowResult.Success"/>.</returns>
        protected override async Task <DataFlowResult> Store(DataFlowContext context)
        {
            IDbConnection conn = TryCreateDbConnection(context);

            // One connection is shared by all entries and disposed when the method leaves this block.
            using (conn)
            {
                foreach (var item in context.GetParseData())
                {
                    // The table metadata is stored in the context under the same key as the parse data.
                    var tableMetadata = (TableMetadata)context[item.Key];

                    SqlStatements sqlStatements = GetSqlStatements(tableMetadata);

                    // Run the create step only when this CreateTableSql was newly added to the cache.
                    // NOTE(review): the key is added before the create step runs, so a failed
                    // creation is presumably not attempted again — confirm this is intended.
                    if (_executedCache.TryAdd(sqlStatements.CreateTableSql, new object()))
                    {
                        EnsureDatabaseAndTableCreated(conn, sqlStatements);
                    }

                    // At most RetryTimes attempts; the loop ends early on success or on a non-retryable error.
                    for (int i = 0; i < RetryTimes; ++i)
                    {
                        IDbTransaction transaction = null;
                        try
                        {
                            if (UseTransaction)
                            {
                                transaction = conn.BeginTransaction();
                            }

                            var list = item.Value;
                            // No default case: an unmatched StorageType executes nothing and the
                            // attempt still counts as successful.
                            switch (StorageType)
                            {
                            case StorageType.Insert:
                            {
                                await conn.ExecuteAsync(sqlStatements.InsertSql, list, transaction);

                                break;
                            }

                            case StorageType.InsertIgnoreDuplicate:
                            {
                                await conn.ExecuteAsync(sqlStatements.InsertIgnoreDuplicateSql, list, transaction);

                                break;
                            }

                            case StorageType.Update:
                            {
                                // This exception is caught by the catch block below, so it is
                                // logged and ends the retry loop rather than reaching the caller.
                                if (string.IsNullOrWhiteSpace(sqlStatements.UpdateSql))
                                {
                                    throw new SpiderException("未能生成更新 SQL");
                                }

                                await conn.ExecuteAsync(sqlStatements.UpdateSql, list, transaction);

                                break;
                            }

                            case StorageType.InsertAndUpdate:
                            {
                                await conn.ExecuteAsync(sqlStatements.InsertAndUpdateSql, list, transaction);

                                break;
                            }
                            }

                            transaction?.Commit();
                            // Success: leave the retry loop.
                            break;
                        }
                        catch (Exception ex)
                        {
                            Logger?.LogError($"尝试插入数据失败: {ex}");

                            // A network error should be retried and does not need a Rollback.
                            // Only an exception whose InnerException is an EndOfStreamException is
                            // treated as such; anything else is rolled back and ends the retry loop.
                            var endOfStreamException = ex.InnerException as EndOfStreamException;
                            if (endOfStreamException == null)
                            {
                                try
                                {
                                    transaction?.Rollback();
                                }
                                catch (Exception e)
                                {
                                    Logger?.LogError($"数据库回滚失败: {e}");
                                }

                                break;
                            }
                        }
                        finally
                        {
                            // Runs after every attempt, including those that leave the loop via break.
                            transaction?.Dispose();
                        }
                    }
                }
            }

            return(DataFlowResult.Success);
        }
示例#11
0
        /// <summary>
        /// Extracts entities of type <typeparamref name="T"/> from the selectable content of
        /// <paramref name="context"/> and records them in the context's parse data under the
        /// model's type name.
        /// </summary>
        /// <remarks>
        /// When the model defines an entity selector, each selected node (optionally limited to
        /// <c>Take</c> nodes from the head or the tail) is parsed into one entity; otherwise the
        /// whole selectable is parsed into a single entity. Nodes that parse to <c>null</c> are
        /// skipped with a warning.
        /// </remarks>
        /// <param name="context">Data-flow context providing the selectable content, the request properties and the parse-data store.</param>
        /// <returns>A completed task carrying <see cref="DataFlowResult.Success"/>.</returns>
        protected override Task<DataFlowResult> Parse(DataFlowContext context)
        {
            // Register the table metadata under the model's type name if it is not there yet.
            if (!context.Contains(_model.TypeName))
            {
                context.Add(_model.TypeName, _tableMetadata);
            }

            var selectable = context.Selectable;
            if (selectable.Properties == null)
            {
                selectable.Properties = new Dictionary<string, object>();
            }

            // Environment values start from the request properties ...
            var environments = new Dictionary<string, string>();
            foreach (var property in context.Response.Request.Properties)
            {
                environments.Add(property.Key, property.Value);
            }

            // ... and global value selectors add to them, overriding on a name clash.
            if (_model.GlobalValueSelectors != null)
            {
                foreach (var globalSelector in _model.GlobalValueSelectors)
                {
                    string name = globalSelector.Name;
                    if (string.IsNullOrWhiteSpace(name))
                    {
                        continue;
                    }

                    // The indexer setter adds or replaces in one lookup, so no ContainsKey check is needed.
                    environments[name] = selectable.Select(globalSelector.ToSelector()).GetValue();
                }
            }

            var results = new ParseResult<T>();

            if (_model.Selector == null)
            {
                // No entity selector: the whole selectable yields at most one entity.
                var obj = ParseObject(environments, selectable, 0);
                if (obj != null)
                {
                    results.Add(obj);
                }
                else
                {
                    Logger?.LogWarning($"解析到空数据,类型: {_model.TypeName}");
                }
            }
            else
            {
                var nodes = selectable.SelectList(_model.Selector.ToSelector()).Nodes()?.ToList();
                if (nodes != null)
                {
                    // Limit to Take nodes, counted from the head or from the tail.
                    if (_model.Take > 0 && nodes.Count > _model.Take)
                    {
                        nodes = _model.TakeFromHead
                            ? nodes.Take(_model.Take).ToList()
                            : nodes.Skip(nodes.Count - _model.Take).ToList();
                    }

                    for (var i = 0; i < nodes.Count; ++i)
                    {
                        // Index directly into the materialized list instead of calling ElementAt.
                        var obj = ParseObject(environments, nodes[i], i);
                        if (obj != null)
                        {
                            results.Add(obj);
                        }
                        else
                        {
                            Logger?.LogWarning($"解析到空数据,类型: {_model.TypeName}");
                        }
                    }
                }
            }

            // Append to the parse data already stored for this type, or store a new entry.
            if (results.Count > 0)
            {
                var existing = context.GetParseData(_model.TypeName);
                if (existing == null)
                {
                    context.AddParseData(_model.TypeName, results);
                }
                else
                {
                    ((ParseResult<T>)existing).AddRange(results);
                }
            }

            return Task.FromResult(DataFlowResult.Success);
        }