コード例 #1
0
        /// <summary>
        /// Inserts two SKU rows through a <c>SqlServerEntityPipeline</c>, pushes a modified row for
        /// SKU 110 through a second pipeline built from <c>ProductUpdate</c>, and verifies that the
        /// table still holds two rows and that SKU 110 now carries category "4C".
        /// </summary>
        public void Update()
        {
            ClearDb();

            try
            {
                using (SqlConnection conn = new SqlConnection(ConnectString))
                {
                    ISpider spider = new DefaultSpider("test", new Site());

                    SqlServerEntityPipeline insertPipeline = new SqlServerEntityPipeline(ConnectString);
                    var insertMetadata = EntitySpider.GenerateEntityMetaData(typeof(ProductInsert).GetTypeInfo());
                    insertPipeline.AddEntity(insertMetadata);
                    insertPipeline.InitPipeline(spider);

                    JObject data1 = new JObject {
                        { "Sku", "110" }, { "Category", "3C" }, { "Url", "http://jd.com/110" }, { "CDate", "2016-08-13" }
                    };
                    JObject data2 = new JObject {
                        { "Sku", "111" }, { "Category", "3C" }, { "Url", "http://jd.com/111" }, { "CDate", "2016-08-13" }
                    };
                    insertPipeline.Process(insertMetadata.Entity.Name, new List <JObject> {
                        data1, data2
                    });

                    SqlServerEntityPipeline updatePipeline = new SqlServerEntityPipeline(ConnectString);
                    var updateMetadata = EntitySpider.GenerateEntityMetaData(typeof(ProductUpdate).GetTypeInfo());
                    updatePipeline.AddEntity(updateMetadata);
                    updatePipeline.InitPipeline(spider);

                    // Same SKU as data1 but with a different category.
                    JObject data3 = new JObject {
                        { "Sku", "110" }, { "Category", "4C" }, { "Url", "http://jd.com/110" }, { "CDate", "2016-08-13" }
                    };
                    updatePipeline.Process(updateMetadata.Entity.Name, new List <JObject> {
                        data3
                    });

                    // A SELECT without ORDER BY has no guaranteed row order, so sort by Sku before
                    // asserting on positions.
                    var list = conn.Query <ProductInsert>($"use test;select * from sku_{DateTime.Now.ToString("yyyy_MM_dd")}")
                               .OrderBy(p => p.Sku)
                               .ToList();
                    Assert.AreEqual(2, list.Count);
                    Assert.AreEqual("110", list[0].Sku);
                    Assert.AreEqual("4C", list[0].Category);
                }
            }
            finally
            {
                // Clean up even when an assertion above fails.
                ClearDb();
            }
        }
コード例 #2
0
ファイル: Program.cs プロジェクト: zdw2018/DotnetSpider
        /// <summary>
        /// Entry point: configures the global logger (console plus a rolling file at
        /// <c>logs/spider.txt</c>), runs the entity spider sample and then exits the process.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static async Task Main(string[] args)
        {
            Log.Logger = new LoggerConfiguration()
                         .MinimumLevel.Information()
                         .MinimumLevel.Override("Microsoft.Hosting.Lifetime", LogEventLevel.Warning)
                         .MinimumLevel.Override("Microsoft", LogEventLevel.Warning)
                         .MinimumLevel.Override("System", LogEventLevel.Warning)
                         .MinimumLevel.Override("Microsoft.AspNetCore.Authentication", LogEventLevel.Warning)
                         .Enrich.FromLogContext()
                         .WriteTo.Console().WriteTo.RollingFile("logs/spider.txt")
                         .CreateLogger();

            try
            {
                await EntitySpider.RunAsync();

                // await DistributedSpider.RunAsync();

                Console.WriteLine("Bye!");
            }
            finally
            {
                // Flush pending log events before the process is terminated below
                // (or before an unhandled exception tears it down).
                Log.CloseAndFlush();
            }

            Environment.Exit(0);
        }
コード例 #3
0
        /// <summary>
        /// Builds the spider that fills in shop details for JD SKUs: start URLs are generated from a
        /// MySQL query, responses are stripped of their <c>json( ... );</c> wrapper, and results are
        /// written through a <c>MySqlEntityPipeline</c> for <c>ProductUpdater</c>.
        /// </summary>
        /// <returns>The configured spider.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            Name = "JD Shop details " + DateTimeUtils.RunIdOfMonday;

            var spider = new EntitySpider(new Site());
            spider.TaskGroup  = "JD SKU Weekly";
            spider.CachedSize = 1;
            spider.ThreadNum  = 8;
            spider.Scheduler  = new RedisScheduler("127.0.0.1:6379,serviceName=Scheduler.NET,keepAlive=8,allowAdmin=True,connectTimeout=10000,password=6GS9F2QTkP36GggE0c3XwVwI,abortConnect=True,connectRetry=20");

            // Downloader that cuts the payload out of the "json( ... );" wrapper.
            var downloader = new HttpClientDownloader();
            var wrapperStripper = new SubContentHandler();
            wrapperStripper.Start       = "json(";
            wrapperStripper.End         = ");";
            wrapperStripper.StartOffset = 5;
            wrapperStripper.EndOffset   = 0;
            downloader.DownloadCompleteHandlers = new IDownloadCompleteHandler[] { wrapperStripper };
            spider.Downloader = downloader;

            // Start URLs: one per "sku" value returned by the query below.
            var startUrlSource = new BaseDbPrepareStartUrls();
            startUrlSource.Source        = DataSource.MySql;
            startUrlSource.ConnectString = "Database='test';Data Source= localhost;User ID=root;Password=1qazZAQ!;Port=3306";
            startUrlSource.QueryString   = $"SELECT * FROM jd.sku_v2_{DateTimeUtils.RunIdOfMonday} WHERE shopname is null or shopid is null order by sku";
            var skuColumn = new DataColumn();
            skuColumn.Name = "sku";
            startUrlSource.Columns = new [] { skuColumn };
            var urlTemplates = new List <string>();
            urlTemplates.Add("http://chat1.jd.com/api/checkChat?my=list&pidList={0}&callback=json");
            startUrlSource.FormateStrings = urlTemplates;
            spider.PrepareStartUrls = new PrepareStartUrls[] { startUrlSource };

            spider.AddPipeline(new MySqlEntityPipeline("Database='taobao';Data Source=localhost ;User ID=root;Password=1qazZAQ!;Port=4306"));
            spider.AddEntityType(typeof(ProductUpdater));
            return spider;
        }
コード例 #4
0
        /// <summary>
        /// Builds a spider that queries Baidu News for a fixed keyword and stores
        /// <c>BaiduSearchEntry</c> results through a <c>MySqlEntityPipeline</c>.
        /// </summary>
        /// <returns>The configured spider.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            var site = new Site();
            site.EncodingName = "UTF-8";
            EntitySpider spider = new EntitySpider(site);

            var pipeline = new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306");
            spider.AddPipeline(pipeline);

            var word = "淘宝618";
            string searchUrl = $"http://news.baidu.com/ns?word={word}&tn=news&from=news&cl=2&pn=0&rn=20&ct=1";

            // The keyword travels with the request as extra data under the "Keyword" key.
            var extras = new Dictionary <string, dynamic>();
            extras.Add("Keyword", word);

            spider.AddStartUrl(searchUrl, extras);
            spider.AddEntityType(typeof(BaiduSearchEntry));

            return spider;
        }
コード例 #5
0
        /// <summary>
        /// Builds a single-threaded spider that crawls one JD list page with an
        /// <c>HttpClientDownloader</c> and stores <c>Product</c> entities in MySQL.
        /// </summary>
        /// <returns>The configured spider.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            // An HttpProxyPool (e.g. backed by a KuaidailiProxySupplier) could be set on the Site
            // here to crawl through proxies; none is configured.
            EntitySpider context = new EntitySpider(new Site());

            context.SetThreadNum(1);
            // "HH" (24-hour clock) keeps identities unique across AM/PM; "hh" would repeat every 12 hours.
            context.SetIdentity("JD_sku_store_test_" + DateTime.Now.ToString("yyyy_MM_dd_HHmmss"));
            // Download HTML with the HTTP client downloader.
            context.SetDownloader(new HttpClientDownloader());
            // Save data to MySQL.
            context.AddPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));
            context.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", new Dictionary <string, object> {
                { "name", "手机" }, { "cat3", "655" }
            });
            context.AddEntityType(typeof(Product));
            return context;
        }
コード例 #6
0
        /// <summary>
        /// Inserts two rows through a <c>MySqlEntityPipeline</c> built from <c>Product2Insert</c>,
        /// pushes a changed row for SKU 110 through a pipeline built from <c>Product2Update</c>, and
        /// verifies that two rows remain and that SKU 110 now carries category "AAAA".
        /// </summary>
        public void UpdateWhenUnionPrimary()
        {
            ClearDb();

            try
            {
                using (MySqlConnection conn = new MySqlConnection(ConnectString))
                {
                    ISpider spider = new DefaultSpider("test", new Site());

                    MySqlEntityPipeline insertPipeline = new MySqlEntityPipeline(ConnectString);
                    var insertMetadata = EntitySpider.GenerateEntityDefine(typeof(Product2Insert).GetTypeInfo());
                    insertPipeline.AddEntity(insertMetadata);
                    insertPipeline.InitPipeline(spider);

                    DataObject data1 = new DataObject {
                        { "Sku", "110" }, { "Category1", "4C" }, { "Category", "3C" }, { "Url", "http://jd.com/110" }, { "CDate", "2016-08-13" }
                    };
                    DataObject data2 = new DataObject {
                        { "Sku", "111" }, { "Category1", "4C" }, { "Category", "3C" }, { "Url", "http://jd.com/111" }, { "CDate", "2016-08-13" }
                    };
                    insertPipeline.Process(insertMetadata.Name, new List <DataObject> {
                        data1, data2
                    });

                    MySqlEntityPipeline updatePipeline = new MySqlEntityPipeline(ConnectString);
                    var updateMetadata = EntitySpider.GenerateEntityDefine(typeof(Product2Update).GetTypeInfo());
                    updatePipeline.AddEntity(updateMetadata);
                    updatePipeline.InitPipeline(spider);

                    // Same Sku and Category1 as data1; only Category differs.
                    DataObject data3 = new DataObject {
                        { "Sku", "110" }, { "Category1", "4C" }, { "Category", "AAAA" }, { "Url", "http://jd.com/110" }, { "CDate", "2016-08-13" }
                    };
                    updatePipeline.Process(updateMetadata.Name, new List <DataObject> {
                        data3
                    });

                    // A SELECT without ORDER BY has no guaranteed row order, so sort by Sku before
                    // asserting on positions.
                    var list = conn.Query <Product2Insert>($"select * from test.sku2_{DateTime.Now.ToString("yyyy_MM_dd")}")
                               .OrderBy(p => p.Sku)
                               .ToList();
                    Assert.Equal(2, list.Count);
                    Assert.Equal("110", list[0].Sku);
                    Assert.Equal("AAAA", list[0].Category);
                }
            }
            finally
            {
                // Clean up even when an assertion above fails.
                ClearDb();
            }
        }
コード例 #7
0
ファイル: Program.cs プロジェクト: zdx19981006/C-Spider
        /// <summary>
        /// Entry point: configures the global logger (console plus the rolling file
        /// <c>dotnet-spider.log</c>), runs the entity spider sample and then waits for console
        /// input before returning.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static async Task Main(string[] args)
        {
            var configure = new LoggerConfiguration()
#if DEBUG
                            .MinimumLevel.Verbose()
#else
                            .MinimumLevel.Information()
#endif
                            .MinimumLevel.Override("Microsoft", LogEventLevel.Warning)
                            .Enrich.FromLogContext()
                            .WriteTo.Console().WriteTo
                            .RollingFile("dotnet-spider.log");

            Log.Logger = configure.CreateLogger();

            try
            {
                await EntitySpider.Run();

                // await DistributedSpider.Run();
                Console.Read();
            }
            finally
            {
                // Flush pending log events and release the sinks before the process ends.
                Log.CloseAndFlush();
            }
        }
コード例 #8
0
        /// <summary>
        /// Checks the selector, multi flag and name that <c>EntitySpider.ParseEntityMetaData</c>
        /// produces for <c>Entity7</c>, <c>Entity8</c> and <c>Entity9</c>.
        /// </summary>
        public void EntitySelector()
        {
            // Entity7: XPath selector, multi.
            var xpathEntity = EntitySpider.ParseEntityMetaData(typeof(Entity7).GetTypeInfo()).Entity;
            var xpathSelector = xpathEntity.Selector;
            Assert.Equal("expression", xpathSelector.Expression);
            Assert.Equal(SelectorType.XPath, xpathSelector.Type);
            Assert.True(xpathEntity.Multi);

            // Entity8: CSS selector, multi.
            var cssEntity = EntitySpider.ParseEntityMetaData(typeof(Entity8).GetTypeInfo()).Entity;
            var cssSelector = cssEntity.Selector;
            Assert.Equal("expression2", cssSelector.Expression);
            Assert.Equal(SelectorType.Css, cssSelector.Type);
            Assert.True(cssEntity.Multi);

            // Entity9: no selector, not multi; the name is the full type name.
            var plainEntity = EntitySpider.ParseEntityMetaData(typeof(Entity9).GetTypeInfo()).Entity;
            Assert.False(plainEntity.Multi);
            Assert.Null(plainEntity.Selector);
            Assert.Equal("DotnetSpider.Test.SpiderEntityTest+Entity9", plainEntity.Name);
        }
コード例 #9
0
        /// <summary>
        /// Checks the selector, multi flag and name that <c>EntitySpider.GenerateEntityDefine</c>
        /// produces for <c>Entity7</c>, <c>Entity8</c> and <c>Entity9</c>.
        /// </summary>
        public void EntitySelector()
        {
            // Entity7: XPath selector, multi.
            var xpathDefine = EntitySpider.GenerateEntityDefine(typeof(Entity7).GetTypeInfo());
            var xpathSelector = xpathDefine.Selector;
            Assert.Equal("expression", xpathSelector.Expression);
            Assert.Equal(SelectorType.XPath, xpathSelector.Type);
            Assert.True(xpathDefine.Multi);

            // Entity8: CSS selector, multi.
            var cssDefine = EntitySpider.GenerateEntityDefine(typeof(Entity8).GetTypeInfo());
            var cssSelector = cssDefine.Selector;
            Assert.Equal("expression2", cssSelector.Expression);
            Assert.Equal(SelectorType.Css, cssSelector.Type);
            Assert.True(cssDefine.Multi);

            // Entity9: no selector, not multi; the name is the full type name.
            var plainDefine = EntitySpider.GenerateEntityDefine(typeof(Entity9).GetTypeInfo());
            Assert.False(plainDefine.Multi);
            Assert.Null(plainDefine.Selector);
            Assert.Equal("DotnetSpider.Extension.Test.EntitySpiderTest2+Entity9", plainDefine.Name);
        }
コード例 #10
0
        /// <summary>
        /// Checks the selector, multi flag and name that <c>EntitySpider.GenerateEntityMetaData</c>
        /// produces for <c>Entity7</c>, <c>Entity8</c> and <c>Entity9</c>.
        /// </summary>
        public void EntitySelector()
        {
            // Entity7: XPath selector, multi.
            var xpathEntity = EntitySpider.GenerateEntityMetaData(typeof(Entity7).GetTypeInfo()).Entity;
            var xpathSelector = xpathEntity.Selector;
            Assert.AreEqual("expression", xpathSelector.Expression);
            Assert.AreEqual(SelectorType.XPath, xpathSelector.Type);
            Assert.IsTrue(xpathEntity.Multi);

            // Entity8: CSS selector, multi.
            var cssEntity = EntitySpider.GenerateEntityMetaData(typeof(Entity8).GetTypeInfo()).Entity;
            var cssSelector = cssEntity.Selector;
            Assert.AreEqual("expression2", cssSelector.Expression);
            Assert.AreEqual(SelectorType.Css, cssSelector.Type);
            Assert.IsTrue(cssEntity.Multi);

            // Entity9: no selector, not multi; the name is the full type name.
            var plainEntity = EntitySpider.GenerateEntityMetaData(typeof(Entity9).GetTypeInfo()).Entity;
            Assert.IsFalse(plainEntity.Multi);
            Assert.IsNull(plainEntity.Selector);
            Assert.AreEqual("DotnetSpider.Extension.Test.EntitySpiderTest2+Entity9", plainEntity.Name);
        }
コード例 #11
0
        /// <summary>
        /// Inserts two <c>Product</c> rows, then pushes a changed row for SKU 110 through a pipeline
        /// created with <c>PipelineMode.Update</c> and a third constructor argument of <c>true</c>,
        /// and verifies that two rows remain and that SKU 110 now carries category "4C".
        /// </summary>
        public void UpdateCheckIfSameBeforeUpdate()
        {
            ClearDb();

            try
            {
                using (MySqlConnection conn = new MySqlConnection(ConnectString))
                {
                    ISpider spider = new DefaultSpider("test", new Site());

                    MySqlEntityPipeline insertPipeline = new MySqlEntityPipeline(ConnectString);
                    insertPipeline.InitiEntity(EntitySpider.ParseEntityMetaData(typeof(Product).GetTypeInfo()));
                    insertPipeline.InitPipeline(spider);

                    JObject data1 = new JObject {
                        { "sku", "110" }, { "category", "3C" }, { "url", "http://jd.com/110" }, { "cdate", "2016-08-13" }
                    };
                    JObject data2 = new JObject {
                        { "sku", "111" }, { "category", "3C" }, { "url", "http://jd.com/111" }, { "cdate", "2016-08-13" }
                    };
                    insertPipeline.Process(new List <JObject> {
                        data1, data2
                    });

                    MySqlEntityPipeline updatePipeline = new MySqlEntityPipeline(ConnectString, PipelineMode.Update, true);
                    updatePipeline.InitiEntity(EntitySpider.ParseEntityMetaData(typeof(Product).GetTypeInfo()));
                    updatePipeline.InitPipeline(spider);

                    // Same sku as data1 but with a different category.
                    JObject data3 = new JObject {
                        { "sku", "110" }, { "category", "4C" }, { "url", "http://jd.com/110" }, { "cdate", "2016-08-13" }
                    };
                    updatePipeline.Process(new List <JObject> {
                        data3
                    });

                    // A SELECT without ORDER BY has no guaranteed row order, so sort by Sku before
                    // asserting on positions.
                    var list = conn.Query <Product>($"select * from test.sku_{DateTime.Now.ToString("yyyy_MM_dd")}")
                               .OrderBy(p => p.Sku)
                               .ToList();
                    Assert.AreEqual(2, list.Count);
                    Assert.AreEqual("110", list[0].Sku);
                    Assert.AreEqual("4C", list[0].Category);
                }
            }
            finally
            {
                // Clean up even when an assertion above fails.
                ClearDb();
            }
        }
コード例 #12
0
        /// <summary>
        /// Inserts three rows (one with a null Category) through a <c>SqlServerEntityPipeline</c>
        /// and verifies they can be read back. Returns immediately on non-Windows platforms.
        /// </summary>
        public void Insert()
        {
            if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
            {
                return;
            }

            ClearDb();

            try
            {
                using (SqlConnection conn = new SqlConnection(ConnectString))
                {
                    ISpider spider = new DefaultSpider("test", new Site());

                    SqlServerEntityPipeline insertPipeline = new SqlServerEntityPipeline(ConnectString);
                    // Build the entity definition once and use it for both registration and Process.
                    var metadata = EntitySpider.GenerateEntityDefine(typeof(ProductInsert).GetTypeInfo());
                    insertPipeline.AddEntity(metadata);
                    insertPipeline.InitPipeline(spider);

                    // Common data
                    var data1 = new DataObject {
                        { "Sku", "110" }, { "Category", "3C" }, { "Url", "http://jd.com/110" }, { "CDate", "2016-08-13" }
                    };
                    var data2 = new DataObject {
                        { "Sku", "111" }, { "Category", "3C" }, { "Url", "http://jd.com/111" }, { "CDate", "2016-08-13" }
                    };
                    // Value is null
                    var data3 = new DataObject {
                        { "Sku", "112" }, { "Category", null }, { "Url", "http://jd.com/111" }, { "CDate", "2016-08-13" }
                    };
                    insertPipeline.Process(metadata.Name, new List <DataObject> {
                        data1, data2, data3
                    });

                    // A SELECT without ORDER BY has no guaranteed row order, so sort by Sku before
                    // asserting on positions.
                    var list = conn.Query <ProductInsert>($"use test;select * from sku_{DateTime.Now.ToString("yyyy_MM_dd")}")
                               .OrderBy(p => p.Sku)
                               .ToList();
                    Assert.Equal(3, list.Count);
                    Assert.Equal("110", list[0].Sku);
                    Assert.Equal("111", list[1].Sku);
                    Assert.Null(list[2].Category);
                }
            }
            finally
            {
                // Clean up even when an assertion above fails.
                ClearDb();
            }
        }
コード例 #13
0
        /// <summary>
        /// Inserts two <c>Product2</c> rows through an <c>MsSqlEntityPipeline</c>, pushes a changed
        /// row for SKU 110 through a pipeline created with <c>PipelineMode.Update</c>, and verifies
        /// that two rows remain and that SKU 110 now carries category "AAAA".
        /// </summary>
        public void UpdateWhenUnionPrimary()
        {
            ClearDb();

            try
            {
                using (SqlConnection conn = new SqlConnection(ConnectString))
                {
                    ISpider spider = new DefaultSpider("test", new Site());

                    MsSqlEntityPipeline insertPipeline = new MsSqlEntityPipeline(ConnectString);
                    insertPipeline.InitEntity(EntitySpider.GenerateEntityMetaData(typeof(Product2).GetTypeInfo()));
                    insertPipeline.InitPipeline(spider);

                    JObject data1 = new JObject {
                        { "sku", "110" }, { "category1", "4C" }, { "category", "3C" }, { "url", "http://jd.com/110" }, { "cdate", "2016-08-13" }
                    };
                    JObject data2 = new JObject {
                        { "sku", "111" }, { "category1", "4C" }, { "category", "3C" }, { "url", "http://jd.com/111" }, { "cdate", "2016-08-13" }
                    };
                    insertPipeline.Process(new List <JObject> {
                        data1, data2
                    });

                    MsSqlEntityPipeline updatePipeline = new MsSqlEntityPipeline(ConnectString, PipelineMode.Update);
                    updatePipeline.InitEntity(EntitySpider.GenerateEntityMetaData(typeof(Product2).GetTypeInfo()));
                    updatePipeline.InitPipeline(spider);

                    // Same sku and category1 as data1; only category differs.
                    JObject data3 = new JObject {
                        { "sku", "110" }, { "category1", "4C" }, { "category", "AAAA" }, { "url", "http://jd.com/110" }, { "cdate", "2016-08-13" }
                    };
                    updatePipeline.Process(new List <JObject> {
                        data3
                    });

                    // A SELECT without ORDER BY has no guaranteed row order, so sort by Sku before
                    // asserting on positions.
                    var list = conn.Query <Product2>($"use test;select * from sku2_{DateTime.Now.ToString("yyyy_MM_dd")}")
                               .OrderBy(p => p.Sku)
                               .ToList();
                    Assert.AreEqual(2, list.Count);
                    Assert.AreEqual("110", list[0].Sku);
                    Assert.AreEqual("AAAA", list[0].Category);
                }
            }
            finally
            {
                // Clean up even when an assertion above fails.
                ClearDb();
            }
        }
コード例 #14
0
        /// <summary>
        /// Runs a spider for <c>Entity15</c> against MySQL, then reads the column names and types of
        /// <c>test.table15</c> from <c>information_schema</c> and compares them with the expected
        /// values. The table is dropped at the end.
        /// </summary>
        public void MySqlDataTypeTests()
        {
            using (MySqlConnection conn = new MySqlConnection("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"))
            {
                EntitySpider spider = new EntitySpider(new Site());
                spider.SetIdentity(Guid.NewGuid().ToString("N"));
                spider.SetThreadNum(1);
                spider.AddPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));

                spider.AddStartUrl("http://baidu.com");
                spider.AddEntityType(typeof(Entity15));

                spider.Run("running-test");

                var columnInfos = conn.Query <ColumnInfo>("SELECT COLUMN_NAME as `Name`, COLUMN_TYPE as `Type` FROM information_schema.columns WHERE table_name='table15' AND table_schema = 'test';").ToList();
                Assert.AreEqual(9, columnInfos.Count);

                // Expected column names, by position in the query result.
                string[] expectedNames =
                {
                    "Int", "BigInt", "String", "Time", "Float", "Double", "String1", "cdate", "__id"
                };
                for (int i = 0; i < expectedNames.Length; i++)
                {
                    Assert.AreEqual(expectedNames[i], columnInfos[i].Name);
                }

                // Expected column types, by position in the query result.
                string[] expectedTypes =
                {
                    "int(11)", "bigint(20)", "text", "timestamp", "float", "double", "varchar(100)", "timestamp", "bigint(20)"
                };
                for (int i = 0; i < expectedTypes.Length; i++)
                {
                    Assert.AreEqual(expectedTypes[i], columnInfos[i].Type);
                }

                conn.Execute("drop table `test`.`table15`");
            }
        }
コード例 #15
0
        /// <summary>
        /// Inserts two <c>Product</c> rows through a <c>MySqlEntityPipeline</c>, pushes a changed row
        /// for SKU 110 through a pipeline created with <c>PipelineMode.Update</c>, and verifies that
        /// two rows remain and that SKU 110 now carries category "4C".
        /// </summary>
        public void Update()
        {
            // One connection string shared by the query connection and both pipelines.
            const string connectString = "Database='mysql';Data Source=127.0.0.1;User ID=root;Password=1qazZAQ!;Port=3306";

            ClearDb();

            try
            {
                using (MySqlConnection conn = new MySqlConnection(connectString))
                {
                    ISpider spider = new DefaultSpider("test", new Site());

                    MySqlEntityPipeline insertPipeline = new MySqlEntityPipeline(connectString);
                    insertPipeline.InitiEntity(EntitySpider.ParseEntityMetaData(typeof(Product).GetTypeInfo()));
                    insertPipeline.InitPipeline(spider);

                    JObject data1 = new JObject {
                        { "sku", "110" }, { "category", "3C" }, { "url", "http://jd.com/110" }, { "cdate", "2016-08-13" }
                    };
                    JObject data2 = new JObject {
                        { "sku", "111" }, { "category", "3C" }, { "url", "http://jd.com/111" }, { "cdate", "2016-08-13" }
                    };
                    insertPipeline.Process(new List <JObject> {
                        data1, data2
                    });

                    MySqlEntityPipeline updatePipeline = new MySqlEntityPipeline(connectString, PipelineMode.Update);
                    updatePipeline.InitiEntity(EntitySpider.ParseEntityMetaData(typeof(Product).GetTypeInfo()));
                    updatePipeline.InitPipeline(spider);

                    // Same sku as data1 but with a different category.
                    JObject data3 = new JObject {
                        { "sku", "110" }, { "category", "4C" }, { "url", "http://jd.com/110" }, { "cdate", "2016-08-13" }
                    };
                    updatePipeline.Process(new List <JObject> {
                        data3
                    });

                    // A SELECT without ORDER BY has no guaranteed row order, so sort by Sku before
                    // asserting on positions.
                    var list = conn.Query <Product>($"select * from test.sku_{DateTime.Now.ToString("yyyy_MM_dd")}")
                               .OrderBy(p => p.Sku)
                               .ToList();
                    Assert.Equal(2, list.Count);
                    Assert.Equal("110", list[0].Sku);
                    Assert.Equal("4C", list[0].Category);
                }
            }
            finally
            {
                // Clean up even when an assertion above fails.
                ClearDb();
            }
        }
コード例 #16
0
        /// <summary>
        /// Runs a spider for <c>Entity15</c> against SQL Server, then reads the column names and
        /// types of <c>table15</c> from the system tables and compares the first nine entries with
        /// the expected values. The table is dropped at the end.
        /// </summary>
        public void SqlServerDataTypeTests()
        {
            string connectString = "Server=.\\SQLEXPRESS;Database=test;Trusted_Connection=True;MultipleActiveResultSets=true";

            using (var conn = new SqlConnection(connectString))
            {
                EntitySpider spider = new EntitySpider(new Site());
                spider.SetIdentity(Guid.NewGuid().ToString("N"));
                spider.SetThreadNum(1);
                spider.AddPipeline(new SqlServerEntityPipeline(connectString));

                spider.AddStartUrl("http://baidu.com");
                spider.AddEntityType(typeof(Entity15));

                spider.Run("running-test");

                var columnInfos = conn.Query <ColumnInfo>("USE [test];select  b.name Name,c.name+'(' + cast(c.length as varchar)+')' [Type] from sysobjects a,syscolumns b,systypes c where a.id=b.id and a.name='table15' and a.xtype='U'and b.xtype=c.xtype").ToList();
                // The query returns 11 rows; only the first nine are checked below.
                Assert.AreEqual(11, columnInfos.Count);

                // Expected column names, by position in the query result.
                string[] expectedNames =
                {
                    "Int", "Time", "CDate", "Float", "Double", "BigInt", "__Id", "String", "String1"
                };
                for (int i = 0; i < expectedNames.Length; i++)
                {
                    Assert.AreEqual(expectedNames[i], columnInfos[i].Name);
                }

                // Expected column types, by position in the query result.
                string[] expectedTypes =
                {
                    "int(4)", "datetime(8)", "datetime(8)", "float(8)", "float(8)", "bigint(8)", "bigint(8)", "nvarchar(8000)", "nvarchar(8000)"
                };
                for (int i = 0; i < expectedTypes.Length; i++)
                {
                    Assert.AreEqual(expectedTypes[i], columnInfos[i].Type);
                }

                conn.Execute("USE [test]; drop table [test].dbo.[table15]");
            }
        }
コード例 #17
0
        /// <summary>
        /// Builds a spider that crawls a JD list page with a Chrome <c>WebDriverDownloader</c>,
        /// extracts <c>Product</c> entities and follows paging links found in the
        /// <c>span.p-num</c> region.
        /// </summary>
        /// <returns>The configured spider.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            EntitySpider spider = new EntitySpider(new Site());

            string timestamp = DateTime.Now.ToString("yyyy-MM-dd HHmmss");
            spider.SetIdentity("JD sku/store test " + timestamp);
            spider.AddEntityPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));

            var extras = new Dictionary <string, object>();
            extras.Add("name", "手机");
            extras.Add("cat3", "655");
            spider.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", extras);

            // Target URLs: links inside the paging region that match the "&page=N&" pattern.
            var pagingExtractor = new TargetUrlExtractor();
            var pagingRegion = new Selector();
            pagingRegion.Type = SelectorType.XPath;
            pagingRegion.Expression = "//span[@class=\"p-num\"]";
            pagingExtractor.Region = pagingRegion;
            var pagePatterns = new List <string>();
            pagePatterns.Add(@"&page=[0-9]+&");
            pagingExtractor.Patterns = pagePatterns;
            spider.AddEntityType(typeof(Product), pagingExtractor);

            spider.SetDownloader(new WebDriverDownloader(Browser.Chrome));
            return spider;
        }
コード例 #18
0
        /// <summary>
        /// Builds a spider that requests one ddeng.com product page with fixed cookies and headers
        /// and stores <c>Corp</c> entities through a <c>SqlServerEntityPipeline</c>.
        /// </summary>
        /// <returns>The configured spider.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            EntitySpider spider = new EntitySpider(new Site());

            // Replace the default site with one carrying the session cookies and browser-like headers.
            var site = new Site();
            site.CookiesStringPart = "sid=dea284fc36c24e8cbcd447343d7b8a4e; sn=DD962248; ctid=000000; ctnm=%E5%8F%A4%E9%95%87%E7%81%AF%E9%A5%B0%E6%89%B9%E5%8F%91; ctpv=%E5%B9%BF%E4%B8%9C; JSESSIONID=acbBqFfOD4I63d9PziDvv; DDENG=c4fc08ae2e3ba3efeddbc667c2f45e615a85e80009169501dc244a03e87908aa61146548b97ed9c7dc07af23bfd80bff5008f8c8867a9165d4bd2732aca0db7dedae2e042d3968fcad1150f36be242e8a32a3f59db2a0b39216a59f1628508c5799644532a9d99925f9841b3c13a1f97; userId=10003379; previousUser=%E5%A4%95%E7%8E%89; Hm_lvt_9e33f153f28be198970d205d90a24f28=1466146335; Hm_lpvt_9e33f153f28be198970d205d90a24f28=1466146392; Hm_lvt_54b4cb498afd05463ab4611b38a6f289=1466146335; Hm_lpvt_54b4cb498afd05463ab4611b38a6f289=1466146392; CNZZDATA1256982382=395301521-1466143554-%7C1466143554";
            var headers = new Dictionary <string, string>();
            headers.Add("Cache-Control", "max-age=0");
            headers.Add("Upgrade-Insecure-Requests", "1");
            site.Headers   = headers;
            site.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";
            site.Accept    = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
            spider.SetSite(site);

            spider.AddPipeline(new SqlServerEntityPipeline("Server=.\\SQLEXPRESS;Database=test;Trusted_Connection=True;MultipleActiveResultSets=true"));
            spider.AddStartUrl("http://www.ddeng.com/product/982227");
            spider.AddEntityType(typeof(Corp));

            return spider;
        }
コード例 #19
0
        /// <summary>
        /// Builds a single-threaded spider for a JD list page whose MySQL pipeline is created with
        /// a null connection string and an <c>UpdateConnectString</c> (a connection string plus a
        /// settings query) instead.
        /// </summary>
        /// <returns>The configured spider.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            EntitySpider spider = new EntitySpider(new Site());

            spider.SetThreadNum(1);
            string timestamp = DateTime.Now.ToString("yyyy_MM_dd_HHmmss");
            spider.SetIdentity("JD_sku_store_test_" + timestamp);

            var pipeline = new MySqlEntityPipeline(null);
            var connectStringSource = new DbUpdateConnectString();
            connectStringSource.ConnectString = "Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306";
            connectStringSource.QueryString   = "SELECT value from `dotnetspider`.`settings` where `type`='ConnectString' and `key`='MySql01' LIMIT 1";
            pipeline.UpdateConnectString = connectStringSource;
            spider.AddPipeline(pipeline);

            var extras = new Dictionary <string, object>();
            extras.Add("name", "手机");
            extras.Add("cat3", "655");
            spider.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", extras);

            spider.AddEntityType(typeof(JdSkuSampleSpider.Product));
            return spider;
        }
コード例 #20
0
        /// <summary>
        /// Builds a single-threaded spider for the tsia.org.tw member directory: list pages yield
        /// <c>CompanySummary</c> entities, detail pages yield <c>Company</c> entities, results go
        /// to MySQL and scheduling uses a local Redis instance.
        /// </summary>
        /// <returns>The configured spider.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            EntitySpider context = new EntitySpider(new Site {
            })
            {
                UserId    = "DotnetSpider",
                TaskGroup = "RuthSpider"
            };

            context.SetThreadNum(1);
            // "HH" (24-hour clock) keeps identities unique across AM/PM; "hh" would repeat every 12 hours.
            context.SetIdentity("RuthSpider " + DateTime.Now.ToString("yyyy_MM_dd_HHmmss"));
            context.AddEntityPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));
            context.AddStartUrl("http://www.tsia.org.tw/member_list.php?page=1");

            // Register the company list page entity.
            context.AddEntityType(typeof(CompanySummary), new TargetUrlExtractor
            {
                Patterns = new List <string> {
                    @"member_list.php\?page=\d+"
                }
            });
            // Register the company detail page entity.
            context.AddEntityType(typeof(Company), new TargetUrlExtractor
            {
                Patterns = new List <string> {
                    @"member_info.php\?ID=\d+"
                }
            });
            // Configure the Redis scheduler.
            context.SetScheduler(new RedisScheduler
            {
                Host     = "localhost",
                Password = "",
                Port     = 6379
            });
            return context;
        }
コード例 #21
0
        /// <summary>
        /// Builds a Taobao search spider: one start URL per keyword read from
        /// <c>taobaokeyword.txt</c>, a downloader that cuts the <c>g_page_config</c> payload out of
        /// the page and generates follow-up URLs, a Redis scheduler and a MySQL pipeline for
        /// <c>Item</c> entities.
        /// </summary>
        /// <returns>The configured spider.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            Site site = new Site();

            using (var reader = new StreamReader(File.OpenRead("taobaokeyword.txt")))
            {
                string keyword;
                // Reading stops at end of file or at the first empty line.
                while (!string.IsNullOrEmpty(keyword = reader.ReadLine()))
                {
                    // Escape the keyword so characters such as spaces, '&', '#' or '+' cannot
                    // corrupt the query string. The raw keyword is still passed as extra data.
                    string escapedKeyword = System.Uri.EscapeDataString(keyword);

                    // "tab=1" was previously written as the interpolation hole {1}, which evaluates to "1".
                    site.AddStartUrl($"https://s.taobao.com/search?q={escapedKeyword}&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8&sort=sale-desc&s=0&tab=1&fs=1&filter_tianmao=tmall", new Dictionary <string, object>
                    {
                        { "keyword", keyword }
                    });
                }
            }
            var context = new EntitySpider(site)
            {
                ThreadNum             = 5,
                SkipWhenResultIsEmpty = true,
                Scheduler             = new RedisScheduler("127.0.0.1:6379,serviceName = DotnetSpider,keepAlive = 8,allowAdmin = True,connectTimeout = 10000,password = 6GS9F2QTkP36GggE0c3XwVwI,abortConnect = True,connectRetry = 20"),
                Downloader            = new HttpClientDownloader
                {
                    DownloadCompleteHandlers = new IDownloadCompleteHandler[]
                    {
                        new SubContentHandler
                        {
                            StartOffset = 16,
                            EndOffset   = 22,
                            Start       = "g_page_config = {",
                            End         = "g_srp_loadCss();"
                        },
                        new IncrementTargetUrlsCreator("&s=0", null, 44)
                    }
                }
            };

            context.AddPipeline(new MySqlEntityPipeline("Database = 'mysql'; Data Source = localhost; User ID = root; Password = 1qazZAQ!; Port = 3306"));
            context.AddEntityType(typeof(Item), new MyDataHanlder());
            return context;
        }
コード例 #22
0
        /// <summary>
        /// Creates the spider for the <c>ArticleSummary</c> and <c>Article</c> entities:
        /// ten threads, a time-stamped identity, and a MySqlEntityPipeline.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            var spider = new EntitySpider(new Site());

            // The downloader gets a single download-complete handler.
            spider.Downloader = new HttpClientDownloader
            {
                DownloadCompleteHandlers = new IDownloadCompleteHandler[] { new IncrementTargetUrlsCreator("index_1.shtml") }
            };

            spider.SetThreadNum(10);

            // The identity carries the local start time down to the second.
            string startedAt = DateTime.Now.ToString("yyyy_MM_dd_HHmmss");
            spider.SetIdentity("qidian_" + startedAt);

            var pipeline = new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306");
            spider.AddPipeline(pipeline);

            spider.AddStartUrl("http://oa.jlu.edu.cn/. ");

            spider.AddEntityType(typeof(ArticleSummary));
            spider.AddEntityType(typeof(Article));

            return spider;
        }
コード例 #23
0
        /// <summary>
        /// Creates the single-threaded spider for <c>https://hao.360.cn/</c> that feeds
        /// <c>UpdateHao360Info</c> entities into a MySqlEntityPipeline, scheduled through Redis.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            var spider = new EntitySpider(new Site());

            // Identification and throughput settings, assigned in the same order as before.
            spider.UserId = "DotnetSpider";
            spider.TaskGroup = "HaoBrowser";
            spider.Identity = "HaoBrowser Hao360Spider Buble " + DateTime.Now.ToString("yyyy-MM-dd HHmmss");
            spider.CachedSize = 1;
            spider.ThreadNum = 1;
            spider.SkipWhenResultIsEmpty = true;

            // Two download-complete handlers, in this order: a SubContentHandler bounded by
            // the Start/End markers, then a ReplaceContentHandler that swaps "\/" for "/".
            spider.Downloader = new HttpClientDownloader
            {
                DownloadCompleteHandlers = new IDownloadCompleteHandler[]
                {
                    new SubContentHandler
                    {
                        Start = "sales[\"hotsite_yixing\"] = [",
                        End = "}}",
                        StartOffset = 27,
                        EndOffset = 0
                    },
                    new ReplaceContentHandler
                    {
                        NewValue = "/",
                        OldValue = "\\/"
                    }
                }
            };

            var scheduler = new Extension.Scheduler.RedisScheduler
            {
                Host = "127.0.0.1",
                Port = 6379,
                Password = "******"
            };
            spider.SetScheduler(scheduler);

            spider.AddEntityPipeline(new MySqlEntityPipeline("Database='testhao';Data Source= 127.0.0.1;User ID=root;Password=root@123456;Port=4306"));
            spider.AddStartUrl("https://hao.360.cn/");
            spider.AddEntityType(typeof(UpdateHao360Info));

            return spider;
        }
コード例 #24
0
        /// <summary>
        /// Creates the spider that requests two JD <c>checkChat</c> JSONP endpoints with
        /// browser-like request headers and stores <c>ProductUpdater</c> entities through a
        /// MySqlEntityPipeline.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            var site = new Site()
            {
                UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
                Accept    = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                Headers   = new Dictionary<string, string>
                {
                    { "Accept-Encoding", "gzip, deflate, sdch" },
                    { "Upgrade-Insecure-Requests", "1" },
                    { "Accept-Language", "en,en-US;q=0.8" },
                    // Was "ax-age=0", which is not a Cache-Control directive; the intended
                    // directive is "max-age=0".
                    { "Cache-Control", "max-age=0" },
                }
            };

            site.AddStartUrl("http://chat1.jd.com/api/checkChat?my=list&pidList=3355984&callback=json");
            site.AddStartUrl("http://chat1.jd.com/api/checkChat?my=list&pidList=3682523&callback=json");
            var context = new EntitySpider(site)
            {
                Downloader = new HttpClientDownloader
                {
                    DownloadCompleteHandlers = new IDownloadCompleteHandler[]
                    {
                        // The start URLs request a "json(...)" callback; the offsets equal the
                        // lengths of the Start and End markers, so this presumably strips the
                        // JSONP wrapper - confirm against SubContentHandler.
                        new SubContentHandler
                        {
                            Start       = "json(",
                            End         = ");",
                            StartOffset = 5,
                            EndOffset   = 2
                        }
                    }
                }
            };

            context.AddPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost ;User ID=root;Password=1qazZAQ!;Port=3306"));
            context.AddEntityType(typeof(ProductUpdater));
            return context;
        }
コード例 #25
0
ファイル: Program.cs プロジェクト: yisuo2015/DotnetSpider
        /// <summary>
        /// Program entry point: sizes the thread pool, configures the global Serilog logger
        /// (console plus rolling file <c>logs/spider.log</c>) and runs the entity spider sample.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        /// <returns>A task that completes when the sample run has finished.</returns>
        static async Task Main(string[] args)
        {
            // Pin both the minimum and maximum worker / completion-port thread counts to 255.
            ThreadPool.SetMaxThreads(255, 255);
            ThreadPool.SetMinThreads(255, 255);

            // Information by default; the listed framework sources are raised to Warning.
            Log.Logger = new LoggerConfiguration()
                         .MinimumLevel.Information()
                         .MinimumLevel.Override("Microsoft.Hosting.Lifetime", LogEventLevel.Warning)
                         .MinimumLevel.Override("Microsoft", LogEventLevel.Warning)
                         .MinimumLevel.Override("System", LogEventLevel.Warning)
                         .MinimumLevel.Override("Microsoft.AspNetCore.Authentication", LogEventLevel.Warning)
                         .Enrich.FromLogContext()
                         .WriteTo.Console().WriteTo.RollingFile("logs/spider.log")
                         .CreateLogger();

            try
            {
                // Alternative samples; enable one of these instead of the call below:
                // await DistributedSpider.RunAsync();
                // await ProxySpider.RunAsync();
                // await EntitySpider.RunMySqlQueueAsync();
                await EntitySpider.RunAsync();

                Console.WriteLine("Bye!");
            }
            finally
            {
                // Flush and dispose the sinks so pending log events are written even when the
                // run fails, before the process exits.
                Log.CloseAndFlush();
            }
        }
コード例 #26
0
            /// <summary>
            /// Creates the spider for <c>http://www.cas.cn/kx/kpwz/index.shtml</c> that collects
            /// <c>ArticleSummary</c> entities through a <c>CollectEntityPipeline</c>.
            /// </summary>
            /// <remarks>
            /// Several settings are applied twice (site, thread count, empty sleep time). The
            /// statements run in exactly this order; whether a later call overrides an earlier
            /// one depends on the setters - presumably it does, confirm against EntitySpider.
            /// </remarks>
            /// <returns>The configured <see cref="EntitySpider"/>.</returns>
            protected override EntitySpider GetEntitySpider()
            {
                var spider = new EntitySpider(new Site());

                // Site and threading: each is set once through a method and once more afterwards.
                spider.SetSite(new Site());
                spider.SetThreadNum(2);
                spider.ThreadNum = 1;

                // Crawl behaviour.
                spider.RetryWhenResultIsEmpty = false;
                spider.Deep = 100;
                spider.EmptySleepTime = 5000;
                spider.SetEmptySleepTime(5000);
                spider.ExitWhenComplete = true;
                spider.CachedSize = 1;

                // Infrastructure components.
                spider.SetDownloader(new HttpClientDownloader());
                spider.SetScheduler(new QueueDuplicateRemovedScheduler());

                spider.SkipWhenResultIsEmpty = true;
                spider.SpawnUrl = true;

                // Output, entry URL and entity mapping.
                spider.AddPipeline(new CollectEntityPipeline());
                spider.AddStartUrl("http://www.cas.cn/kx/kpwz/index.shtml");
                spider.AddEntityType(typeof(ArticleSummary));

                return spider;
            }
コード例 #27
0
        // Verifies that MySqlEntityPipeline inserts ProductInsert rows, including a row whose
        // Category is null, into today's test.sku_yyyy_MM_dd table.
        // The database is cleared before the test and again afterwards, even when the test fails.
        public void Insert()
        {
            ClearDb();

            try
            {
                using (MySqlConnection conn = new MySqlConnection(ConnectString))
                {
                    ISpider spider = new DefaultSpider("test", new Site());

                    MySqlEntityPipeline insertPipeline = new MySqlEntityPipeline(ConnectString);
                    var metadata = EntitySpider.GenerateEntityMetaData(typeof(ProductInsert).GetTypeInfo());
                    insertPipeline.AddEntity(metadata);
                    insertPipeline.InitPipeline(spider);

                    // Common data
                    JObject data1 = new JObject {
                        { "Sku", "110" }, { "Category", "3C" }, { "Url", "http://jd.com/110" }, { "CDate", "2016-08-13" }
                    };
                    JObject data2 = new JObject {
                        { "Sku", "111" }, { "Category", "3C" }, { "Url", "http://jd.com/111" }, { "CDate", "2016-08-13" }
                    };
                    // Value is null
                    JObject data3 = new JObject {
                        { "Sku", "112" }, { "Category", null }, { "Url", "http://jd.com/111" }, { "CDate", "2016-08-13" }
                    };
                    insertPipeline.Process(metadata.Name, new List<JObject> {
                        data1, data2, data3
                    });

                    // The query has no ORDER BY, so the database is free to return the rows in
                    // any order; sort by Sku here so the index-based assertions are deterministic.
                    var list = conn.Query<ProductInsert>($"select * from test.sku_{DateTime.Now.ToString("yyyy_MM_dd")}")
                               .OrderBy(p => p.Sku)
                               .ToList();
                    Assert.AreEqual(3, list.Count);
                    Assert.AreEqual("110", list[0].Sku);
                    Assert.AreEqual("111", list[1].Sku);
                    Assert.IsNull(list[2].Category);
                }
            }
            finally
            {
                // Clean up even if the pipeline or an assertion throws, so a failed run does
                // not leave rows behind for later tests.
                ClearDb();
            }
        }
コード例 #28
0
        // Runs EntityExtractor for the Product entity over the saved page "Jd.html" and checks
        // the number of extracted results plus the field values of the first one.
        public void Extract()
        {
            var productDefine = EntitySpider.GenerateEntityDefine(typeof(Product).GetTypeInfo());
            var entityExtractor = new EntityExtractor("test", null, productDefine);

            // Extra values attached to the request; the first result is expected to expose
            // them as CategoryName and CategoryId.
            var requestExtras = new Dictionary<string, dynamic>
            {
                { "cat", "手机" },
                { "cat3", "110" }
            };
            var request = new Request("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", requestExtras);

            // The page content comes from a local file, not from the network.
            var page = new Page(request, null)
            {
                Content = File.ReadAllText(Path.Combine(Core.Environment.BaseDirectory, "Jd.html"))
            };

            var results = entityExtractor.Extract(page);

            Assert.Equal(60, results.Count);

            var first = results[0];
            Assert.Equal("手机", first["CategoryName"]);
            Assert.Equal("110", first["CategoryId"]);
            Assert.Equal("http://item.jd.com/3031737.html", first["Url"]);
            Assert.Equal("3031737", first["Sku"]);
            Assert.Equal("荣耀官方旗舰店", first["ShopName"]);
            Assert.Equal("荣耀 NOTE 8 4GB+32GB 全网通版 冰河银", first["Name"]);
            Assert.Equal("1000000904", first["VenderId"]);
            Assert.Equal("1000000904", first["JdzyShopId"]);
            Assert.Equal(DateTime.Now.ToString("yyyy-MM-dd"), first["RunId"]);
        }
コード例 #29
0
        /// <summary>
        /// Creates the single-threaded spider for a JD product list page that stores
        /// <c>Product</c> entities through a MySqlEntityPipeline.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            EntitySpider context = new EntitySpider(new Site
            {
                //HttpProxyPool = new HttpProxyPool(new KuaidailiProxySupplier("快代理API"))
            });

            context.SetThreadNum(1);
            // "HH" is the 24-hour clock. The previous "hh" (12-hour clock, no AM/PM designator)
            // produced the same identity for runs twelve hours apart.
            context.SetIdentity("JD_sku_store_test_" + DateTime.Now.ToString("yyyy_MM_dd_HHmmss"));
            context.AddEntityPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));
            // The dictionary is attached to the start URL as extra data.
            context.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", new Dictionary<string, object> {
                { "name", "手机" }, { "cat3", "655" }
            });
            // Presumably follow-up URLs are taken from the "p-num" span and must match the
            // "&page=N&" pattern - confirm against TargetUrlExtractor.
            context.AddEntityType(typeof(Product), new TargetUrlExtractor
            {
                Region = new BaseSelector {
                    Type = SelectorType.XPath, Expression = "//span[@class=\"p-num\"]"
                },
                Patterns = new List<string> {
                    @"&page=[0-9]+&"
                }
            });
            return context;
        }
コード例 #30
0
        /// <summary>
        /// Runs the sample spiders one after another, pausing for console input between
        /// most of them.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            // Prints the prompt and blocks until a character can be read from standard input.
            Action pauseForInput = () =>
            {
                Console.WriteLine("Press any key to continue...");
                Console.Read();
            };

            new RegexTestEntitySpider().Run();

            // This spider only receives a start URL; it is never run in this method.
            var idleSpider = new EntitySpider(new Core.Site());
            idleSpider.AddStartUrl("http://www.baidu.com");

            // Fully customized page processor and data pipeline.
            BaseUsage.CustmizeProcessorAndPipeline();
            pauseForInput();

            // Crawl only the specified pages, without traversal.
            BaseUsage.CrawlerPagesWithoutTraverse();
            pauseForInput();

            // Traverse the whole site.
            BaseUsage.CrawlerPagesTraversal();
            pauseForInput();

            new DDengEntitySpider().Run();
            pauseForInput();

            Cnblogs.Run();
            pauseForInput();

            new CasSpider().Run();
            pauseForInput();

            new BaiduSearchSpider().Run();
            pauseForInput();

            new JdShopDetailSpider().Run();
            pauseForInput();

            new JdSkuSampleSpider().Run();
            pauseForInput();

            // The last sample runs without a trailing pause.
            Situoli.Run();
        }