Exemple #1
0
        /// <summary>
        /// Builds the JD SKU sample spider: one worker thread, a timestamped identity,
        /// a MySQL entity pipeline and a single list.jd.com start URL.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            var spider = new EntitySpider(new Site());
            spider.SetThreadNum(1);

            // Identity suffix is a 24-hour timestamp taken when the spider is built.
            string runStamp = DateTime.Now.ToString("yyyy_MM_dd_HHmmss");
            spider.SetIdentity("JD_sku_store_test_" + runStamp);

            // The pipeline is given a static connect string plus a query against
            // `dotnetspider`.`settings`.
            // NOTE(review): presumably the query result overrides the connect string at runtime - confirm.
            var storage = new MySqlEntityPipeline();
            storage.UpdateConnectString = new DbUpdateConnectString
            {
                ConnectString = "Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306",
                QueryString = "SELECT value from `dotnetspider`.`settings` where `type`='ConnectString' and `key`='MySql01' LIMIT 1"
            };
            spider.AddEntityPipeline(storage);

            // Extra data attached to the start URL ("手机" means "mobile phone").
            var startData = new Dictionary<string, object>();
            startData.Add("name", "手机");
            startData.Add("cat3", "655");
            spider.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", startData);

            // Target URLs are taken from the "p-num" span and must match the page pattern.
            var pager = new TargetUrlExtractor();
            pager.Region = new BaseSelector
            {
                Type = SelectorType.XPath,
                Expression = "//span[@class=\"p-num\"]"
            };
            pager.Patterns = new List<string> { @"&page=[0-9]+&" };
            spider.AddEntityType(typeof(JdSkuSampleSpider.Product), pager);

            return spider;
        }
Exemple #2
0
        /// <summary>
        /// Builds the JD SKU sample spider with a user id and task group set,
        /// a MySQL entity pipeline and a single list.jd.com start URL.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            // An HttpProxyPool (KuaidailiProxySupplier) assignment on the Site is disabled here.
            EntitySpider context = new EntitySpider(new Site())
            {
                UserId    = "DotnetSpider",
                TaskGroup = "JdSkuSampleSpider"
            };

            context.SetThreadNum(1);
            // "HH" (24-hour clock) instead of "hh": without an AM/PM designator the
            // 12-hour form yields the same identity for e.g. 03:00:00 and 15:00:00.
            context.SetIdentity("JD_sku_store_test_" + DateTime.Now.ToString("yyyy_MM_dd_HHmmss"));
            context.AddEntityPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));

            // Extra data attached to the start URL ("手机" means "mobile phone").
            context.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", new Dictionary<string, object>
            {
                { "name", "手机" },
                { "cat3", "655" }
            });

            // Target URLs are taken from the "p-num" span and must match the page pattern.
            context.AddEntityType(typeof(Product), new TargetUrlExtractor
            {
                Region = new BaseSelector
                {
                    Type = SelectorType.XPath,
                    Expression = "//span[@class=\"p-num\"]"
                },
                Patterns = new List<string> { @"&page=[0-9]+&" }
            });
            return context;
        }
        /// <summary>
        /// Builds a ten-thread spider over the www.cas.cn "kpwz" index pages that
        /// registers <c>ArticleSummary</c> and <c>Article</c> entity types and writes
        /// through a MySQL entity pipeline.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            var spider = new EntitySpider(new Site());

            // HTTP downloader with one download-complete handler.
            // NOTE(review): IncrementTargetUrlsCreator presumably derives the next index page
            // from "index_1.shtml" - confirm against its implementation.
            var downloader = new HttpClientDownloader();
            downloader.DownloadCompleteHandlers = new IDownloadCompleteHandler[]
            {
                new IncrementTargetUrlsCreator("index_1.shtml")
            };
            spider.Downloader = downloader;

            spider.SetThreadNum(10);
            string runStamp = DateTime.Now.ToString("yyyy_MM_dd_HHmmss");
            spider.SetIdentity("qidian_" + runStamp);

            var storage = new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306");
            spider.AddEntityPipeline(storage);

            spider.AddStartUrl("http://www.cas.cn/kx/kpwz/index.shtml");
            spider.AddStartUrl("http://www.cas.cn/kx/kpwz/index_1.shtml");

            // Index (list) pages.
            // NOTE(review): the '.' in these patterns is unescaped; if they are regular
            // expressions it matches any character - confirm that is acceptable.
            var indexPages = new TargetUrlExtractor();
            indexPages.Patterns = new List<string> { @"index_[0-9]+.shtml", "index.shtml" };
            spider.AddEntityType(typeof(ArticleSummary), indexPages);

            // Article pages.
            var articlePages = new TargetUrlExtractor();
            articlePages.Patterns = new List<string> { @"t[0-9]+_[0-9]+.shtml" };
            spider.AddEntityType(typeof(Article), articlePages);

            return spider;
        }
        /// <summary>
        /// Builds the JD SKU sample spider for user "86Research": one thread, a
        /// date-stamped identity, a MySQL entity pipeline and one list.jd.com start URL.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            // An HttpProxyPool (KuaidailiProxySupplier) assignment on the Site is disabled here.
            var spider = new EntitySpider(new Site());
            spider.UserId = "86Research";
            spider.TaskGroup = "JdSkuSampleSpider";

            spider.SetThreadNum(1);

            // NOTE(review): the identity only has day granularity, so runs on the same
            // day share one identity - confirm this is intended.
            string dayStamp = DateTime.Now.ToString("yyyy_MM_dd");
            spider.SetIdentity("JD_sku_store_test_" + dayStamp);

            var storage = new MySqlEntityPipeline("Database='test';Data Source=redis;User ID=root;Password=1qazZAQ!;Port=3306");
            spider.AddEntityPipeline(storage);

            // Extra data attached to the start URL ("手机" means "mobile phone").
            var startData = new Dictionary<string, object>();
            startData.Add("name", "手机");
            startData.Add("cat3", "655");
            spider.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", startData);

            // Target URLs are taken from the "p-num" span and must match the page pattern.
            var pager = new TargetUrlExtractor();
            pager.Region = new BaseSelector
            {
                Type = SelectorType.XPath,
                Expression = "//span[@class=\"p-num\"]"
            };
            pager.Patterns = new List<string> { @"&page=[0-9]+&" };
            spider.AddEntityType(typeof(Product), pager);

            return spider;
        }
Exemple #5
0
        /// <summary>
        /// Builds a spider that starts at the cnblogs home page, registers the
        /// <c>HomePage</c> entity type and sends results to a console pipeline.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            var spider = new EntitySpider(new Site());

            // Identity carries a 24-hour timestamp taken when the spider is built.
            string runStamp = DateTime.Now.ToString("yyyy-MM-dd HHmmss");
            spider.SetIdentity("cnblogs homepage " + runStamp);

            spider.AddStartUrl("http://www.cnblogs.com");

            var console = new ConsoleEntityPipeline();
            spider.AddPipeline(console);

            spider.AddEntityType(typeof(HomePage));
            return spider;
        }
Exemple #6
0
        /// <summary>
        /// Runs a spider with four entity pipelines and two entity types, then checks
        /// that the pipelines are exposed in registration order and that the MySQL
        /// pipeline reports schema <c>db.table</c>.
        /// </summary>
        /// <remarks>
        /// Requires the MySQL server named in the connect strings. The table is dropped
        /// in a <c>finally</c> block so a failed assertion does not leave it behind.
        /// </remarks>
        public void MultiEntitiesInitPipelines()
        {
            EntitySpider context = new EntitySpider(new Site());

            context.SetThreadNum(1);
            context.SetIdentity("test-MultiEntitiesInitPipelines");
            context.AddEntityPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));
            context.AddEntityPipeline(new MySqlFileEntityPipeline());
            context.AddEntityPipeline(new ConsoleEntityPipeline());
            context.AddEntityPipeline(new JsonFileEntityPipeline());
            // A MongoDbEntityPipeline registration for non-NET_CORE builds is disabled,
            // so every build flavour expects exactly four entity pipelines.
            context.AddStartUrl("http://a.com");
            context.AddEntityType(typeof(Entity13));
            context.AddEntityType(typeof(Entity12));

            try
            {
                context.Run("running-test");

                // Pipelines as exposed directly on the spider.
                var entityPipelines = context.EntityPipelines;
                Assert.Equal(4, entityPipelines.Count);

                var pipeline1 = (MySqlEntityPipeline)entityPipelines[0];
                Assert.Equal("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306", pipeline1.ConnectString);

                Assert.Equal("MySqlFileEntityPipeline", entityPipelines[1].GetType().Name);
                Assert.Equal("ConsoleEntityPipeline", entityPipelines[2].GetType().Name);
                Assert.Equal("JsonFileEntityPipeline", entityPipelines[3].GetType().Name);

                // The spider exposes a single EntityPipeline that holds the four entity pipelines.
                var pipelines = context.GetPipelines();
                Assert.Equal(1, pipelines.Count);
                EntityPipeline pipeline = (EntityPipeline)pipelines[0];
                entityPipelines = pipeline.GetEntityPipelines();
                Assert.Equal(4, entityPipelines.Count);

                pipeline1 = (MySqlEntityPipeline)entityPipelines[0];
                Assert.Equal("db", pipeline1.GetSchema().Database);
                Assert.Equal("table", pipeline1.GetSchema().TableName);
            }
            finally
            {
                // IF EXISTS keeps the cleanup from masking an earlier failure when the
                // run aborted before the table was created.
                using (MySqlConnection conn = new MySqlConnection("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"))
                {
                    conn.Execute("DROP TABLE IF EXISTS db.table");
                }
            }
        }
        /// <summary>
        /// Builds a JD SKU spider that downloads through a Chrome WebDriver and writes
        /// <c>Product</c> entities to a MySQL pipeline.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            var spider = new EntitySpider(new Site());

            // Identity carries a 24-hour timestamp taken when the spider is built.
            string runStamp = DateTime.Now.ToString("yyyy-MM-dd HHmmss");
            spider.SetIdentity("JD sku/store test " + runStamp);

            var storage = new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306");
            spider.AddPipeline(storage);

            // Extra data attached to the start URL ("手机" means "mobile phone").
            var startData = new Dictionary<string, object>();
            startData.Add("name", "手机");
            startData.Add("cat3", "655");
            spider.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", startData);

            spider.AddEntityType(typeof(Product));

            // Pages are fetched through a browser instead of a plain HTTP client.
            var browser = new WebDriverDownloader(Browser.Chrome);
            spider.SetDownloader(browser);

            return spider;
        }
Exemple #8
0
        /// <summary>
        /// Builds a spider for www.sneac.com that uses a GB2312-encoded site definition
        /// and writes <c>Item</c> entities to a SQL Server pipeline.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            var spider = new EntitySpider(new Site());

            // Identity carries a 24-hour timestamp taken when the spider is built.
            string runStamp = DateTime.Now.ToString("yyyy-MM-dd HHmmss");
            spider.SetIdentity("ShanxizhaoshengSpider " + runStamp);

            // Replace the default site with one that declares the page encoding.
            var gbSite = new Site();
            gbSite.EncodingName = "GB2312";
            spider.SetSite(gbSite);

            var storage = new SqlServerEntityPipeline("Data Source=.\\SQLEXPRESS;Initial Catalog=master;Integrated Security=True");
            spider.AddPipeline(storage);

            spider.AddStartUrl("http://www.sneac.com/pgjhcx/ypbkyxjg.jsp?a11709CountNo=2000");
            spider.AddEntityType(typeof(Item));

            return spider;
        }
        /// <summary>
        /// Builds the JD SKU sample spider: one thread, a timestamped identity, a MySQL
        /// entity pipeline, one list.jd.com start URL and the <c>Product</c> entity type.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            // An HttpProxyPool (KuaidailiProxySupplier) assignment on the Site is disabled here.
            EntitySpider context = new EntitySpider(new Site());

            context.SetThreadNum(1);
            // "HH" (24-hour clock) instead of "hh": without an AM/PM designator the
            // 12-hour form yields the same identity for e.g. 03:00:00 and 15:00:00.
            context.SetIdentity("JD_sku_store_test_" + DateTime.Now.ToString("yyyy_MM_dd_HHmmss"));
            context.AddEntityPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));

            // Extra data attached to the start URL ("手机" means "mobile phone").
            context.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", new Dictionary<string, object>
            {
                { "name", "手机" },
                { "cat3", "655" }
            });
            context.AddEntityType(typeof(Product));
            return context;
        }
        /// <summary>
        /// Runs a spider with four pipelines and two entity types, then checks that the
        /// pipelines are exposed in registration order through both <c>Pipelines</c>
        /// and <c>GetPipelines()</c>.
        /// </summary>
        /// <remarks>
        /// Requires the MySQL server named in the connect strings. The tables are dropped
        /// in a <c>finally</c> block so a failed assertion does not leave them behind.
        /// </remarks>
        public void MultiEntitiesInitPipelines()
        {
            EntitySpider context = new EntitySpider(new Site());

            context.SetIdentity(Guid.NewGuid().ToString("N"));
            context.SetThreadNum(1);
            context.AddPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));
            context.AddPipeline(new MySqlFileEntityPipeline());
            context.AddPipeline(new ConsoleEntityPipeline());
            context.AddPipeline(new JsonFileEntityPipeline());

            context.AddStartUrl("http://baidu.com");
            context.AddEntityType(typeof(Entity13));
            context.AddEntityType(typeof(Entity12));

            try
            {
                context.Run("running-test");

                var entityPipelines = context.Pipelines;
                Assert.AreEqual(4, entityPipelines.Count);

                var pipeline1 = (MySqlEntityPipeline)entityPipelines[0];
                Assert.AreEqual("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306", pipeline1.ConnectString);

                Assert.AreEqual("MySqlFileEntityPipeline", entityPipelines[1].GetType().Name);
                Assert.AreEqual("ConsoleEntityPipeline", entityPipelines[2].GetType().Name);
                Assert.AreEqual("JsonFileEntityPipeline", entityPipelines[3].GetType().Name);

                var pipelines = context.GetPipelines();
                Assert.AreEqual(4, pipelines.Count);

                // The value is not used further; the cast itself fails the test if the
                // first pipeline does not implement IEntityPipeline.
                IEntityPipeline pipeline = (IEntityPipeline)pipelines[0];

                // Schema assertions on the MySQL pipeline (database "db", table "table13")
                // are disabled in this version of the test.
            }
            finally
            {
                // IF EXISTS keeps the cleanup from masking an earlier failure and lets
                // the second drop run even when the first table was never created.
                using (MySqlConnection conn = new MySqlConnection("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"))
                {
                    conn.Execute("DROP TABLE IF EXISTS db.table12");
                    conn.Execute("DROP TABLE IF EXISTS db.table13");
                }
            }
        }
Exemple #11
0
        /// <summary>
        /// Runs a spider for <c>Entity15</c> against SQL Server and checks the names and
        /// types of the columns reported for table <c>table15</c>.
        /// </summary>
        /// <remarks>
        /// Requires the local SQLEXPRESS instance named in the connection string. The
        /// table is dropped in a <c>finally</c> block so a failed assertion does not
        /// leave it behind.
        /// </remarks>
        public void SqlServerDataTypeTests()
        {
            using (var conn = new SqlConnection("Server=.\\SQLEXPRESS;Database=test;Trusted_Connection=True;MultipleActiveResultSets=true"))
            {
                EntitySpider context = new EntitySpider(new Site());
                context.SetIdentity(Guid.NewGuid().ToString("N"));
                context.SetThreadNum(1);
                context.AddPipeline(new SqlServerEntityPipeline("Server=.\\SQLEXPRESS;Database=test;Trusted_Connection=True;MultipleActiveResultSets=true"));

                context.AddStartUrl("http://baidu.com");
                context.AddEntityType(typeof(Entity15));

                try
                {
                    context.Run("running-test");

                    // Column name plus "type(length)" for every column of table15.
                    var columns = conn.Query<ColumnInfo>("USE [test];select  b.name Name,c.name+'(' + cast(c.length as varchar)+')' [Type] from sysobjects a,syscolumns b,systypes c where a.id=b.id and a.name='table15' and a.xtype='U'and b.xtype=c.xtype").ToList();
                    // NOTE(review): 11 rows are expected but only the first 9 are inspected;
                    // presumably the xtype join yields extra rows - confirm.
                    Assert.AreEqual(11, columns.Count);

                    Assert.AreEqual("Int", columns[0].Name);
                    Assert.AreEqual("Time", columns[1].Name);
                    Assert.AreEqual("CDate", columns[2].Name);
                    Assert.AreEqual("Float", columns[3].Name);
                    Assert.AreEqual("Double", columns[4].Name);
                    Assert.AreEqual("BigInt", columns[5].Name);
                    Assert.AreEqual("__Id", columns[6].Name);
                    Assert.AreEqual("String", columns[7].Name);
                    Assert.AreEqual("String1", columns[8].Name);

                    Assert.AreEqual("int(4)", columns[0].Type);
                    Assert.AreEqual("datetime(8)", columns[1].Type);
                    Assert.AreEqual("datetime(8)", columns[2].Type);
                    Assert.AreEqual("float(8)", columns[3].Type);
                    Assert.AreEqual("float(8)", columns[4].Type);
                    Assert.AreEqual("bigint(8)", columns[5].Type);
                    Assert.AreEqual("bigint(8)", columns[6].Type);
                    Assert.AreEqual("nvarchar(8000)", columns[7].Type);
                    Assert.AreEqual("nvarchar(8000)", columns[8].Type);
                }
                finally
                {
                    // The OBJECT_ID guard keeps the cleanup from masking an earlier failure
                    // when the run aborted before the table was created.
                    conn.Execute("USE [test]; IF OBJECT_ID(N'[test].dbo.[table15]', N'U') IS NOT NULL drop table [test].dbo.[table15]");
                }
            }
        }
Exemple #12
0
        /// <summary>
        /// Runs a spider for <c>Entity15</c> against MySQL and checks the names and
        /// types of the columns reported for <c>test.table15</c>.
        /// </summary>
        /// <remarks>
        /// Requires the MySQL server named in the connect strings. The table is dropped
        /// in a <c>finally</c> block so a failed assertion does not leave it behind.
        /// </remarks>
        public void MySqlDataTypeTests()
        {
            using (MySqlConnection conn = new MySqlConnection("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"))
            {
                EntitySpider context = new EntitySpider(new Site());
                context.SetIdentity(Guid.NewGuid().ToString("N"));
                context.SetThreadNum(1);
                context.AddPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));

                context.AddStartUrl("http://baidu.com");
                context.AddEntityType(typeof(Entity15));

                try
                {
                    context.Run("running-test");

                    // Column name and column type for every column of test.table15.
                    var columns = conn.Query<ColumnInfo>("SELECT COLUMN_NAME as `Name`, COLUMN_TYPE as `Type` FROM information_schema.columns WHERE table_name='table15' AND table_schema = 'test';").ToList();
                    Assert.AreEqual(9, columns.Count);

                    Assert.AreEqual("Int", columns[0].Name);
                    Assert.AreEqual("BigInt", columns[1].Name);
                    Assert.AreEqual("String", columns[2].Name);
                    Assert.AreEqual("Time", columns[3].Name);
                    Assert.AreEqual("Float", columns[4].Name);
                    Assert.AreEqual("Double", columns[5].Name);
                    Assert.AreEqual("String1", columns[6].Name);
                    Assert.AreEqual("cdate", columns[7].Name);
                    Assert.AreEqual("__id", columns[8].Name);

                    Assert.AreEqual("int(11)", columns[0].Type);
                    Assert.AreEqual("bigint(20)", columns[1].Type);
                    Assert.AreEqual("text", columns[2].Type);
                    Assert.AreEqual("timestamp", columns[3].Type);
                    Assert.AreEqual("float", columns[4].Type);
                    Assert.AreEqual("double", columns[5].Type);
                    Assert.AreEqual("varchar(100)", columns[6].Type);
                    Assert.AreEqual("timestamp", columns[7].Type);
                    Assert.AreEqual("bigint(20)", columns[8].Type);
                }
                finally
                {
                    // IF EXISTS keeps the cleanup from masking an earlier failure when the
                    // run aborted before the table was created.
                    conn.Execute("drop table if exists `test`.`table15`");
                }
            }
        }
Exemple #13
0
        /// <summary>
        /// Builds a JD SKU spider that downloads through a Chrome WebDriver, follows
        /// paging links from the "p-num" span and writes <c>Product</c> entities to a
        /// MySQL entity pipeline.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            var spider = new EntitySpider(new Site());

            // Identity carries a 24-hour timestamp taken when the spider is built.
            string runStamp = DateTime.Now.ToString("yyyy-MM-dd HHmmss");
            spider.SetIdentity("JD sku/store test " + runStamp);

            var storage = new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306");
            spider.AddEntityPipeline(storage);

            // Extra data attached to the start URL ("手机" means "mobile phone").
            var startData = new Dictionary<string, object>();
            startData.Add("name", "手机");
            startData.Add("cat3", "655");
            spider.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", startData);

            // Target URLs are taken from the "p-num" span and must match the page pattern.
            var pager = new TargetUrlExtractor();
            pager.Region = new Selector
            {
                Type = SelectorType.XPath,
                Expression = "//span[@class=\"p-num\"]"
            };
            pager.Patterns = new List<string> { @"&page=[0-9]+&" };
            spider.AddEntityType(typeof(Product), pager);

            // Pages are fetched through a browser instead of a plain HTTP client.
            var browser = new WebDriverDownloader(Browser.Chrome);
            spider.SetDownloader(browser);

            return spider;
        }
Exemple #14
0
        /// <summary>
        /// Builds the "RuthSpider" over the www.tsia.org.tw member list: one thread, a
        /// MySQL entity pipeline, list and detail entity types, and a Redis scheduler.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            EntitySpider context = new EntitySpider(new Site())
            {
                UserId    = "DotnetSpider",
                TaskGroup = "RuthSpider"
            };

            context.SetThreadNum(1);
            // "HH" (24-hour clock) instead of "hh": without an AM/PM designator the
            // 12-hour form yields the same identity for e.g. 03:00:00 and 15:00:00.
            context.SetIdentity("RuthSpider " + DateTime.Now.ToString("yyyy_MM_dd_HHmmss"));
            context.AddEntityPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));
            context.AddStartUrl("http://www.tsia.org.tw/member_list.php?page=1");

            // Add the company list page entity.
            context.AddEntityType(typeof(CompanySummary), new TargetUrlExtractor
            {
                Patterns = new List<string> { @"member_list.php\?page=\d+" }
            });
            // Add the company detail page entity.
            context.AddEntityType(typeof(Company), new TargetUrlExtractor
            {
                Patterns = new List<string> { @"member_info.php\?ID=\d+" }
            });
            // Configure the Redis-backed scheduler.
            context.SetScheduler(new RedisScheduler
            {
                Host     = "localhost",
                Password = "",
                Port     = 6379
            });
            return context;
        }
Exemple #15
0
        /// <summary>
        /// Builds a spider for a www.ddeng.com product page that sends a fixed cookie
        /// string, extra headers, user agent and accept header, and writes <c>Corp</c>
        /// entities to a MySQL pipeline.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            var spider = new EntitySpider(new Site());

            // Identity carries a 24-hour timestamp taken when the spider is built.
            string runStamp = DateTime.Now.ToString("yyyy-MM-dd HHmmss");
            spider.SetIdentity("ddeng.com " + runStamp);

            // Replace the default site with one carrying the request settings.
            // NOTE(review): the cookie string embeds session identifiers captured from a
            // browser session - presumably stale by now; confirm before relying on it.
            var shop = new Site();
            shop.CookiesStringPart = "sid=dea284fc36c24e8cbcd447343d7b8a4e; sn=DD962248; ctid=000000; ctnm=%E5%8F%A4%E9%95%87%E7%81%AF%E9%A5%B0%E6%89%B9%E5%8F%91; ctpv=%E5%B9%BF%E4%B8%9C; JSESSIONID=acbBqFfOD4I63d9PziDvv; DDENG=c4fc08ae2e3ba3efeddbc667c2f45e615a85e80009169501dc244a03e87908aa61146548b97ed9c7dc07af23bfd80bff5008f8c8867a9165d4bd2732aca0db7dedae2e042d3968fcad1150f36be242e8a32a3f59db2a0b39216a59f1628508c5799644532a9d99925f9841b3c13a1f97; userId=10003379; previousUser=%E5%A4%95%E7%8E%89; Hm_lvt_9e33f153f28be198970d205d90a24f28=1466146335; Hm_lpvt_9e33f153f28be198970d205d90a24f28=1466146392; Hm_lvt_54b4cb498afd05463ab4611b38a6f289=1466146335; Hm_lpvt_54b4cb498afd05463ab4611b38a6f289=1466146392; CNZZDATA1256982382=395301521-1466143554-%7C1466143554";

            var extraHeaders = new Dictionary<string, string>();
            extraHeaders.Add("Cache-Control", "max-age=0");
            extraHeaders.Add("Upgrade-Insecure-Requests", "1");
            shop.Headers = extraHeaders;

            shop.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";
            shop.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
            spider.SetSite(shop);

            var storage = new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306");
            spider.AddPipeline(storage);

            spider.AddStartUrl("http://www.ddeng.com/product/982227");
            spider.AddEntityType(typeof(Corp));

            return spider;
        }
            /// <summary>
            /// Builds a spider for the www.cas.cn "kpwz" index page that sets most of the
            /// spider's tuning options explicitly and collects <c>ArticleSummary</c>
            /// entities through a <c>CollectEntityPipeline</c>.
            /// </summary>
            /// <returns>The configured <see cref="EntitySpider"/>.</returns>
            protected override EntitySpider GetEntitySpider()
            {
                var spider = new EntitySpider(new Site());

                // Statement order is kept exactly as written: several options are set
                // twice (thread count, empty-sleep time, site) and the last write wins.
                spider.SetSite(new Site());
                spider.SetThreadNum(2);
                spider.ThreadNum = 1;
                spider.RetryWhenResultIsEmpty = false;
                spider.Deep = 100;
                spider.EmptySleepTime = 5000;
                spider.SetEmptySleepTime(5000);
                spider.ExitWhenComplete = true;
                spider.CachedSize = 1;

                // Explicit downloader and scheduler.
                var downloader = new HttpClientDownloader();
                spider.SetDownloader(downloader);
                var scheduler = new QueueDuplicateRemovedScheduler();
                spider.SetScheduler(scheduler);

                spider.SkipWhenResultIsEmpty = true;
                spider.SpawnUrl = true;

                // Identity carries a 24-hour timestamp taken when the spider is built.
                string runStamp = DateTime.Now.ToString("yyyy_MM_dd_HHmmss");
                spider.SetIdentity("qidian_" + runStamp);

                var collector = new CollectEntityPipeline();
                spider.AddPipeline(collector);

                spider.AddStartUrl("http://www.cas.cn/kx/kpwz/index.shtml");
                spider.AddEntityType(typeof(ArticleSummary));
                return spider;
            }