Beispiel #1
0
        /// <summary>
        /// Serializes everything returned by <c>context.GetData()</c> to JSON and prints it
        /// on the console.
        /// </summary>
        /// <param name="context">Data flow context whose data is dumped.</param>
        /// <returns>An already-completed task carrying <c>DataFlowResult.Success</c>.</returns>
        protected override Task<DataFlowResult> Store(DataFlowContext context)
        {
            var json = JsonConvert.SerializeObject(context.GetData());
            Console.WriteLine(json);

            return Task.FromResult(DataFlowResult.Success);
        }
Beispiel #2
0
        /// <summary>
        /// Groups the context's data by entity type and forwards the grouped result to the
        /// two-argument <c>HandleAsync</c> overload.
        /// </summary>
        /// <remarks>
        /// Only entries whose key is a <see cref="Type"/> assignable to <c>_baseType</c> are kept.
        /// A value that implements <see cref="IEnumerable"/> contributes each of its elements;
        /// any other value is added as a single item.
        /// </remarks>
        /// <param name="context">Data flow context to read from.</param>
        public override async Task HandleAsync(DataFlowContext context)
        {
            if (IsNullOrEmpty(context))
            {
                Logger.LogWarning("数据流上下文不包含实体解析结果");
                return;
            }

            var grouped = new Dictionary<Type, ICollection<dynamic>>();

            foreach (var entry in context.GetData())
            {
                // Skip anything that is not keyed by a supported entity type.
                if (!(entry.Key is Type entityType) || !_baseType.IsAssignableFrom(entityType))
                {
                    continue;
                }

                if (entry.Value is IEnumerable sequence)
                {
                    foreach (var element in sequence)
                    {
                        AddResult(grouped, entityType, element);
                    }

                    continue;
                }

                // Not a sequence: treat the value itself as the single result.
                AddResult(grouped, entityType, entry.Value);
            }

            await HandleAsync(context, grouped);
        }
        /// <summary>
        /// Inserts every data group of the context into MongoDB as <c>BsonDocument</c>s.
        /// </summary>
        /// <remarks>
        /// For each entry of <c>context.GetData()</c> the matching <c>TableMetadata</c> is read
        /// from the context; its schema supplies the database and collection names.
        /// Database handles are kept in <c>_cache</c> so each database is resolved through
        /// <c>_client</c> only when it is not cached yet.
        /// </remarks>
        /// <param name="context">Data flow context holding the data and its table metadata.</param>
        /// <returns><c>DataFlowResult.Success</c> once all groups have been written.</returns>
        protected override async Task<DataFlowResult> Store(DataFlowContext context)
        {
            var items = context.GetData();

            foreach (var item in items)
            {
                var tableMetadata = (TableMetadata)context[item.Key];
                var databaseName  = tableMetadata.Schema.Database;

                // One lookup instead of ContainsKey + TryAdd + indexer.
                if (!_cache.TryGetValue(databaseName, out var db))
                {
                    db = _client.GetDatabase(databaseName);
                    _cache.TryAdd(databaseName, db);
                }

                var bsonDocs = new List<BsonDocument>();
                foreach (var data in item.Value)
                {
                    bsonDocs.Add(BsonDocument.Create(data));
                }

                // The MongoDB driver rejects an empty batch, so a group without
                // any documents is skipped instead of failing the whole store step.
                if (bsonDocs.Count == 0)
                {
                    continue;
                }

                var collection = db.GetCollection<BsonDocument>(tableMetadata.Schema.Table);
                await collection.InsertManyAsync(bsonDocs);
            }

            return DataFlowResult.Success;
        }
Beispiel #4
0
        /// <summary>
        /// Collects the context's data into per-type lists and hands them to the
        /// two-argument <c>StoreAsync</c> overload.
        /// </summary>
        /// <remarks>
        /// Entries are kept only when their key is a <see cref="Type"/> assignable to
        /// <c>_baseType</c>. Values implementing <see cref="IEnumerable"/> are flattened
        /// element by element; other values are inserted as-is.
        /// </remarks>
        /// <param name="context">Data flow context to read from.</param>
        public override async Task HandleAsync(DataFlowContext context)
        {
            if (context.IsEmpty)
            {
                Logger.LogWarning("数据流上下文不包含实体解析结果");
                return;
            }

            var byType = new Dictionary<Type, List<dynamic>>();

            foreach (var pair in context.GetData())
            {
                // Ignore entries that are not keyed by a supported entity type.
                if (!(pair.Key is Type entityType) || !_baseType.IsAssignableFrom(entityType))
                {
                    continue;
                }

                if (pair.Value is IEnumerable sequence)
                {
                    foreach (var element in sequence)
                    {
                        InsertData(byType, entityType, element);
                    }

                    continue;
                }

                // Single value rather than a sequence.
                InsertData(byType, entityType, pair.Value);
            }

            await StoreAsync(context, byType);
        }
Beispiel #5
0
        /// <summary>
        /// Prints the context's data as JSON on the console in cyan, leaving out the entry
        /// whose key is reference-equal to <c>Consts.ResponseBytes</c>.
        /// </summary>
        /// <param name="context">Data flow context whose data is printed.</param>
        /// <returns>An already-completed task.</returns>
        protected override Task StoreAsync(DataFlowContext context)
        {
            var items = context.GetData().Where(x => !ReferenceEquals(x.Key, Consts.ResponseBytes));

            // The console colour is process-wide state: remember it and put it back so
            // output written after this method is not left cyan.
            var previousColor = Console.ForegroundColor;
            Console.ForegroundColor = ConsoleColor.Cyan;
            try
            {
                Console.WriteLine($"{Environment.NewLine}DATA: {JsonConvert.SerializeObject(items)}{Environment.NewLine}");
            }
            finally
            {
                Console.ForegroundColor = previousColor;
            }

            return Task.CompletedTask;
        }
Beispiel #6
0
        /// <summary>
        /// Writes the context's data as a single JSON line to
        /// <c>&lt;data folder of the request owner&gt;/&lt;request hash&gt;.json</c>.
        /// </summary>
        /// <param name="context">Data flow context supplying the request and the data.</param>
        protected override async Task StoreAsync(DataFlowContext context)
        {
            var request = context.Request;
            var folder  = GetDataFolder(request.Owner);
            var path    = Path.Combine(folder, $"{request.Hash}.json");

            // The writer is disposed when the method finishes.
            using var writer = OpenWrite(path);

            var json = JsonConvert.SerializeObject(context.GetData());
            await writer.WriteLineAsync(json);
        }
Beispiel #7
0
        /// <summary>
        /// Prints the context's data as JSON on the console in cyan.
        /// </summary>
        /// <param name="context">Data flow context whose data is printed.</param>
        /// <returns>An already-completed task.</returns>
        protected override Task StoreAsync(DataFlowContext context)
        {
            var items = context.GetData();

            // The console colour is process-wide state: remember it and put it back so
            // output written after this method is not left cyan.
            var previousColor = Console.ForegroundColor;
            Console.ForegroundColor = ConsoleColor.Cyan;
            try
            {
                Console.WriteLine($"{Environment.NewLine}DATA: {JsonConvert.SerializeObject(items)}{Environment.NewLine}");
            }
            finally
            {
                Console.ForegroundColor = previousColor;
            }

            return Task.CompletedTask;
        }
Beispiel #8
0
        /// <summary>
        /// Writes the context's data, minus the entry whose key is reference-equal to
        /// <c>Consts.ResponseBytes</c>, as a single JSON line to
        /// <c>&lt;data folder of the request owner&gt;/&lt;request hash&gt;.json</c>.
        /// </summary>
        /// <param name="context">Data flow context supplying the request and the data.</param>
        protected override async Task StoreAsync(DataFlowContext context)
        {
            var request = context.Request;
            var folder  = GetDataFolder(request.Owner);
            var path    = Path.Combine(folder, $"{request.Hash}.json");

            // The writer is disposed when the method finishes.
            using var writer = OpenWrite(path);

            var payload = context.GetData().Where(x => !ReferenceEquals(x.Key, Consts.ResponseBytes));
            await writer.WriteLineAsync(JsonConvert.SerializeObject(payload));
        }
Beispiel #9
0
        /// <summary>
        /// Appends the context's data as one JSON line through <c>Writer</c>, after calling
        /// <c>CreateFile</c> for <c>&lt;data folder of the owner&gt;/&lt;request hash&gt;.json</c>.
        /// </summary>
        /// <param name="context">Data flow context supplying the response's request and the data.</param>
        /// <returns><c>DataFlowResult.Success</c> after the line has been written.</returns>
        protected override async Task<DataFlowResult> Store(DataFlowContext context)
        {
            var items = context.GetData();

            var request = context.Response.Request;
            var folder  = GetDataFolder(request.OwnerId);
            var path    = Path.Combine(folder, $"{request.Hash}.json");

            CreateFile(path);

            var json = JsonConvert.SerializeObject(items);
            await Writer.WriteLineAsync(json);

            return DataFlowResult.Success;
        }
            /// <summary>
            /// Prints URL, title and view count of the <c>News</c> item stored in the context
            /// under the full type name of <c>News</c>; does nothing when no such item exists.
            /// </summary>
            /// <param name="context">Data flow context to read the item from.</param>
            /// <returns>An already-completed task.</returns>
            protected override Task StoreAsync(DataFlowContext context)
            {
                var key = typeof(News).FullName;

                if (context.GetData(key) is News news)
                {
                    Console.WriteLine($"URL: {news.Url}, TITLE: {news.Title}, VIEWS: {news.Views}");
                }

                return Task.CompletedTask;
            }
Beispiel #11
0
        /// <summary>
        /// Prints each individual data item of the context as its own JSON line on the console.
        /// </summary>
        /// <param name="context">Data flow context whose grouped data is printed.</param>
        /// <returns>An already-completed task carrying <c>DataFlowResult.Success</c>.</returns>
        protected override Task<DataFlowResult> Store(DataFlowContext context)
        {
            foreach (var entry in context.GetData())
            {
                // Each entry's value is itself a sequence of records.
                foreach (var record in entry.Value)
                {
                    var json = JsonConvert.SerializeObject(record);
                    Console.WriteLine(json);
                }
            }

            return Task.FromResult(DataFlowResult.Success);
        }
Beispiel #12
0
            /// <summary>
            /// Inserts the <c>News</c> item stored in the context (keyed by the full type name of
            /// <c>News</c>) into <c>cnblogs2.news2</c> using <c>INSERT IGNORE</c>.
            /// Nothing is written when the context holds no such item.
            /// </summary>
            /// <remarks>
            /// NOTE(review): the connection string, including credentials, is hard-coded here;
            /// consider moving it to configuration.
            /// </remarks>
            /// <param name="context">Data flow context to read the item from.</param>
            protected override async Task StoreAsync(DataFlowContext context)
            {
                var news = (News)context.GetData(typeof(News).FullName);
                if (news == null)
                {
                    return;
                }

                // The connection is disposed asynchronously when the method finishes.
                await using var conn = new MySqlConnection(
                    "Database='mysql';Data Source=localhost;password=1qazZAQ!;User ID=root;Port=3306;");

                // The News object supplies the @Title, @Url, @Summary, @Views and @Content parameters.
                await conn.ExecuteAsync(
                    "INSERT IGNORE INTO cnblogs2.news2 (title, url, summary, views, content) VALUES (@Title, @Url, @Summary, @Views, @Content);",
                    news);
            }
Beispiel #13
0
        /// <summary>
        /// Runs <c>DataParser&lt;E&gt;</c> over the <c>Html</c> fixture and checks that the
        /// first two parsed entities have the titles "a" and "b".
        /// </summary>
        public async Task MultiEntitySelector()
        {
            // Arrange
            var request  = new Request("http://abcd.com");
            var options  = new SpiderOptions();
            var response = new Response
            {
                Content = new ByteArrayContent(Encoding.UTF8.GetBytes(Html))
            };
            var dataContext = new DataFlowContext(null, options, request, response);

            // Act
            await new DataParser<E>().HandleAsync(dataContext);

            // Assert
            var entities = (List<E>)dataContext.GetData(typeof(E));

            Assert.Equal("a", entities[0].title);
            Assert.Equal("b", entities[1].title);
        }
Beispiel #14
0
            /// <summary>
            /// Prints URL, title and view count of the <c>News</c> item stored in the context
            /// under the full type name of <c>News</c>.
            /// </summary>
            /// <remarks>
            /// Logs a warning and does nothing else when <c>IsNullOrEmpty(context)</c> is true.
            /// </remarks>
            /// <param name="context">Data flow context to read the item from.</param>
            /// <returns>An already-completed task.</returns>
            public override Task HandleAsync(DataFlowContext context)
            {
                if (IsNullOrEmpty(context))
                {
                    Logger.LogWarning("数据流上下文不包含解析结果");
                }
                else if (context.GetData(typeof(News).FullName) is News news)
                {
                    Console.WriteLine($"URL: {news.Url}, TITLE: {news.Title}, VIEWS: {news.Views}");
                }

                return Task.CompletedTask;
            }
Beispiel #15
0
        /// <summary>
        /// Stores the raw response bytes of the current request as one Base64-encoded cell
        /// through an HTTP PUT against the REST endpoint configured in <c>_rest</c>.
        /// </summary>
        /// <remarks>
        /// The target table is <c>dotnet_spider:response_{owner}</c> and the row key is the
        /// Base64 form of the request hash. The PUT is attempted at most three times: the
        /// method returns after the first successful attempt, and each failed attempt is
        /// logged. If all attempts fail the method still completes without throwing.
        /// </remarks>
        /// <param name="context">Data flow context supplying the request, the response bytes
        /// and the service provider used to obtain the HTTP client.</param>
        protected override async Task StoreAsync(DataFlowContext context)
        {
            var id    = context.Request.Owner;
            var table = $"dotnet_spider:response_{id}";

            // The factory only runs for a table name that is not yet in the dictionary.
            _tableCreatedDict.GetOrAdd(table, t =>
            {
                EnsureDatabaseAndTableCreated(context, t);
                return true;
            });

            var hash = context.Request.Hash;

            var bytes = context.GetData(Consts.ResponseBytes);
            var data  = Convert.ToBase64String(bytes);

            var httpClient = context.ServiceProvider.GetRequiredService<IHttpClientFactory>().CreateClient(_rest);

            for (var i = 0; i < 3; ++i)
            {
                try
                {
                    // Disposing the request also disposes its content.
                    using (var httpRequestMessage = new HttpRequestMessage(HttpMethod.Put, $"{_rest}{table}/row"))
                    {
                        httpRequestMessage.Headers.TryAddWithoutValidation("Accept", "application/json");
                        var rowKey = hash.ToBase64String();

                        var body =
                            "{\"Row\":[{\"key\":\"" + rowKey +
                            "\", \"Cell\": [{\"column\":\"" + _columnName + "\", \"$\":\"" + data + "\"}]}]}";
                        var content =
                            new StringContent(body,
                                              Encoding.UTF8, "application/json");
                        httpRequestMessage.Content = content;

                        using (var res = await httpClient.SendAsync(httpRequestMessage))
                        {
                            res.EnsureSuccessStatusCode();
                        }
                    }

                    // Stored successfully: stop here instead of sending the same row again.
                    return;
                }
                catch (Exception ex)
                {
                    Logger.LogError($"Store {context.Request.RequestUri} response to HBase failed [{i}]: {ex}");
                }
            }
        }
Beispiel #16
0
 /// <summary>
 /// Adds parsed entities to the collection stored in the context under <c>typeof(T)</c>.
 /// When nothing is stored for that type yet, a new <c>List&lt;T&gt;</c> holding the
 /// results is registered; otherwise the results are appended to the stored collection.
 /// </summary>
 /// <typeparam name="T">Entity type of the parsed results.</typeparam>
 /// <param name="context">Data flow context that receives the results.</param>
 /// <param name="results">Parsed entities; a <c>null</c> value is ignored.</param>
 protected virtual void AddParsedResult<T>(DataFlowContext context, IEnumerable<T> results)
     where T : EntityBase<T>, new()
 {
     if (results == null)
     {
         return;
     }

     var entityType = typeof(T);
     var existing   = context.GetData(entityType);

     if (existing == null)
     {
         // First batch for this type: register a fresh list.
         context.AddData(entityType, new List<T>(results));
         return;
     }

     existing.AddRange(results);
 }
Beispiel #17
0
        /// <summary>
        /// Runs <c>DataParser&lt;N&gt;</c> with the HTML selectable builder over the
        /// <c>Html</c> fixture and checks both fields of the first parsed entity.
        /// </summary>
        public async Task SingleEntitySelector()
        {
            // Arrange
            var request  = new Request("http://abcd.com");
            var options  = new SpiderOptions();
            var response = new Response
            {
                Content = new ByteArrayContent(Encoding.UTF8.GetBytes(Html))
            };
            var dataContext = new DataFlowContext(null, options, request, response);

            var parser = new DataParser<N>();
            parser.UseHtmlSelectableBuilder();

            // Act
            await parser.HandleAsync(dataContext);

            // Assert
            var entities = (List<N>)dataContext.GetData(typeof(N));

            Assert.Equal("i am title", entities[0].title);
            Assert.Equal("i am dotnetspider", entities[0].dotnetspider);
        }
Beispiel #18
0
            /// <summary>
            /// Inserts the <c>News</c> item stored in the context (keyed by the full type name of
            /// <c>News</c>) into <c>cnblogs2.news2</c> using <c>INSERT IGNORE</c>.
            /// </summary>
            /// <remarks>
            /// Logs a warning and returns when <c>IsNullOrEmpty(context)</c> is true; returns
            /// silently when the context holds no <c>News</c> item.
            /// NOTE(review): the connection string, including credentials, is hard-coded here;
            /// consider moving it to configuration.
            /// </remarks>
            /// <param name="context">Data flow context to read the item from.</param>
            public override async Task HandleAsync(DataFlowContext context)
            {
                if (IsNullOrEmpty(context))
                {
                    Logger.LogWarning("数据流上下文不包含解析结果");
                    return;
                }

                var news = (News)context.GetData(typeof(News).FullName);
                if (news == null)
                {
                    return;
                }

                // The connection is disposed asynchronously when the method finishes.
                await using var conn = new MySqlConnection(
                    "Database='mysql';Data Source=localhost;password=1qazZAQ!;User ID=root;Port=3306;");

                // The News object supplies the @Title, @Url, @Summary, @Views and @Content parameters.
                await conn.ExecuteAsync(
                    "INSERT IGNORE INTO cnblogs2.news2 (title, url, summary, views, content) VALUES (@Title, @Url, @Summary, @Views, @Content);",
                    news);
            }
Beispiel #19
0
        /// <summary>
        /// Parses the saved JD list page (<c>Jd.html</c>) with <c>DataParser&lt;Product&gt;</c>
        /// and verifies the number of products, the fields of the first product and the
        /// number of follow-up requests.
        /// </summary>
        public async Task ParseEntity()
        {
            // Arrange: the request properties "cat" and "cat3" are checked below via
            // CategoryName and CategoryId.
            var properties = new Dictionary<string, object>
            {
                { "cat", "手机" },
                { "cat3", "110" }
            };
            var request  = new Request("https://list.jd.com/list.html?cat=9987,653,655", properties);
            var options  = new SpiderOptions();
            var response = new Response
            {
                Content = new ByteArrayContent(File.ReadAllBytes("Jd.html"))
            };
            var dataContext = new DataFlowContext(null, options, request, response);

            var parser = new DataParser<Product>();
            await parser.InitializeAsync();
            parser.UseHtmlSelectableBuilder();

            // Act
            await parser.HandleAsync(dataContext);

            // Assert
            var products = (List<Product>)dataContext.GetData(typeof(Product));

            Assert.Equal(60, products.Count);
            for (var i = 0; i < 3; i++)
            {
                Assert.Contains("手机商品筛选", products[i].Title);
            }

            var first = products[0];
            Assert.Equal("手机", first.CategoryName);
            Assert.Equal(110, first.CategoryId);
            Assert.Equal("https://item.jd.com/3031737.html", first.Url);
            Assert.Equal("3031737", first.Sku);
            Assert.Equal("荣耀官方旗舰店", first.ShopName);
            Assert.Equal("荣耀 NOTE 8 4GB+32GB 全网通版 冰河银", first.Name);
            Assert.Equal("1000000904", first.VenderId);
            Assert.Equal("1000000904", first.JdzyShopId);
            Assert.Equal(DateTimeOffset.Now.ToString("yyyy-MM-dd"), first.RunId.ToString("yyyy-MM-dd"));

            var followUps = dataContext.FollowRequests;

            Assert.Equal(7, followUps.Count);
        }