/// <summary>
/// Seeds two SKU rows through a <c>SqlServerEntityPipeline</c> built from <c>ProductInsert</c>,
/// then sends a row for SKU "110" with category "4C" through a second pipeline built from
/// <c>ProductUpdate</c>, and finally checks the content of today's <c>sku_</c> table.
/// </summary>
public void Update()
{
    ClearDb();

    using (SqlConnection connection = new SqlConnection(ConnectString))
    {
        ISpider owner = new DefaultSpider("test", new Site());

        // Step 1: insert the two seed rows.
        SqlServerEntityPipeline seedPipeline = new SqlServerEntityPipeline(ConnectString);
        var seedMeta = EntitySpider.GenerateEntityMetaData(typeof(ProductInsert).GetTypeInfo());
        seedPipeline.AddEntity(seedMeta);
        seedPipeline.InitPipeline(owner);

        JObject first = new JObject
        {
            { "Sku", "110" },
            { "Category", "3C" },
            { "Url", "http://jd.com/110" },
            { "CDate", "2016-08-13" }
        };
        JObject second = new JObject
        {
            { "Sku", "111" },
            { "Category", "3C" },
            { "Url", "http://jd.com/111" },
            { "CDate", "2016-08-13" }
        };
        seedPipeline.Process(seedMeta.Entity.Name, new List<JObject> { first, second });

        // Step 2: push the changed category for SKU 110 through the update entity.
        SqlServerEntityPipeline changePipeline = new SqlServerEntityPipeline(ConnectString);
        var changeMeta = EntitySpider.GenerateEntityMetaData(typeof(ProductUpdate).GetTypeInfo());
        changePipeline.AddEntity(changeMeta);
        changePipeline.InitPipeline(owner);

        JObject changed = new JObject
        {
            { "Sku", "110" },
            { "Category", "4C" },
            { "Url", "http://jd.com/110" },
            { "CDate", "2016-08-13" }
        };
        changePipeline.Process(changeMeta.Entity.Name, new List<JObject> { changed });

        // Step 3: read back today's table.
        // NOTE(review): the query has no ORDER BY; the rows[0] assertions assume SKU 110 comes back first - confirm.
        string daySuffix = DateTime.Now.ToString("yyyy_MM_dd");
        var rows = connection.Query<ProductInsert>($"use test;select * from sku_{daySuffix}").ToList();

        Assert.AreEqual(2, rows.Count);
        Assert.AreEqual("110", rows[0].Sku);
        Assert.AreEqual("4C", rows[0].Category);
    }

    ClearDb();
}
/// <summary>
/// Program entry point: configures the global Serilog logger (console plus rolling file
/// "logs/spider.txt"), runs <c>EntitySpider.RunAsync</c>, prints "Bye!" and exits with code 0.
/// </summary>
/// <param name="args">Command-line arguments; not used by this method.</param>
static async Task Main(string[] args)
{
    Log.Logger = new LoggerConfiguration()
        .MinimumLevel.Information()
        .MinimumLevel.Override("Microsoft.Hosting.Lifetime", LogEventLevel.Warning)
        .MinimumLevel.Override("Microsoft", LogEventLevel.Warning)
        .MinimumLevel.Override("System", LogEventLevel.Warning)
        .MinimumLevel.Override("Microsoft.AspNetCore.Authentication", LogEventLevel.Warning)
        .Enrich.FromLogContext()
        .WriteTo.Console().WriteTo.RollingFile("logs/spider.txt")
        .CreateLogger();

    try
    {
        await EntitySpider.RunAsync();
        // Alternative sample: await DistributedSpider.RunAsync();
        Console.WriteLine("Bye!");
    }
    finally
    {
        // Environment.Exit terminates the process immediately, so flush and dispose the
        // sinks first; this also runs when the spider throws.
        Log.CloseAndFlush();
    }

    Environment.Exit(0);
}
/// <summary>
/// Builds the spider that fills in shop details: start URLs are prepared from a MySQL query over
/// the weekly <c>jd.sku_v2_</c> table, responses are trimmed by a <c>SubContentHandler</c>, and
/// <c>ProductUpdater</c> entities go to a MySQL pipeline.
/// </summary>
/// <returns>The configured <c>EntitySpider</c>.</returns>
protected override EntitySpider GetEntitySpider()
{
    Name = "JD Shop details " + DateTimeUtils.RunIdOfMonday;

    var spider = new EntitySpider(new Site());
    spider.TaskGroup = "JD SKU Weekly";
    spider.CachedSize = 1;
    spider.ThreadNum = 8;
    spider.Scheduler = new RedisScheduler("127.0.0.1:6379,serviceName=Scheduler.NET,keepAlive=8,allowAdmin=True,connectTimeout=10000,password=6GS9F2QTkP36GggE0c3XwVwI,abortConnect=True,connectRetry=20");

    // The handler is configured with the "json(" / ");" markers, i.e. the JSONP wrapper of the response.
    spider.Downloader = new HttpClientDownloader
    {
        DownloadCompleteHandlers = new IDownloadCompleteHandler[]
        {
            new SubContentHandler
            {
                Start = "json(",
                End = ");",
                StartOffset = 5,
                EndOffset = 0
            }
        }
    };

    // One start URL per SKU row that still lacks a shop name or shop id.
    spider.PrepareStartUrls = new PrepareStartUrls[]
    {
        new BaseDbPrepareStartUrls()
        {
            Source = DataSource.MySql,
            ConnectString = "Database='test';Data Source= localhost;User ID=root;Password=1qazZAQ!;Port=3306",
            QueryString = $"SELECT * FROM jd.sku_v2_{DateTimeUtils.RunIdOfMonday} WHERE shopname is null or shopid is null order by sku",
            Columns = new[]
            {
                new DataColumn { Name = "sku" }
            },
            FormateStrings = new List<string>
            {
                "http://chat1.jd.com/api/checkChat?my=list&pidList={0}&callback=json"
            }
        }
    };

    // NOTE(review): this pipeline targets port 4306 while the start-URL source uses 3306 - confirm this is intended.
    spider.AddPipeline(new MySqlEntityPipeline("Database='taobao';Data Source=localhost ;User ID=root;Password=1qazZAQ!;Port=4306"));
    spider.AddEntityType(typeof(ProductUpdater));

    return spider;
}
/// <summary>
/// Builds a spider that queries Baidu News for one fixed keyword and stores
/// <c>BaiduSearchEntry</c> entities through a MySQL pipeline.
/// </summary>
/// <returns>The configured <c>EntitySpider</c>.</returns>
protected override EntitySpider GetEntitySpider()
{
    Site site = new Site { EncodingName = "UTF-8" };
    EntitySpider spider = new EntitySpider(site);

    MySqlEntityPipeline storage = new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306");
    spider.AddPipeline(storage);

    // The keyword is both substituted into the query string and attached to the request as extra data.
    var keyword = "淘宝618";
    string searchUrl = string.Format("http://news.baidu.com/ns?word={0}&tn=news&from=news&cl=2&pn=0&rn=20&ct=1", keyword);
    var extras = new Dictionary<string, dynamic>
    {
        { "Keyword", keyword }
    };
    spider.AddStartUrl(searchUrl, extras);

    spider.AddEntityType(typeof(BaiduSearchEntry));
    return spider;
}
/// <summary>
/// Builds a single-threaded spider for one JD list page, downloading with
/// <c>HttpClientDownloader</c> and storing <c>Product</c> entities through a MySQL pipeline.
/// </summary>
/// <returns>The configured <c>EntitySpider</c>.</returns>
protected override EntitySpider GetEntitySpider()
{
    EntitySpider context = new EntitySpider(new Site
    {
        // Optional proxy pool (disabled):
        // HttpProxyPool = new HttpProxyPool(new KuaidailiProxySupplier("<Kuaidaili API URL>"))
    });

    context.SetThreadNum(1);

    // "HH" is the 24-hour clock. The previous "hh" (12-hour) produced the same identity for
    // runs started at, for example, 03:15:00 and 15:15:00 on the same day.
    context.SetIdentity("JD_sku_store_test_" + DateTime.Now.ToString("yyyy_MM_dd_HHmmss"));

    // Download HTML with the HTTP client.
    context.SetDownloader(new HttpClientDownloader());

    // Save data to MySQL.
    context.AddPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));

    context.AddStartUrl(
        "http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main",
        new Dictionary<string, object> { { "name", "手机" }, { "cat3", "655" } });
    context.AddEntityType(typeof(Product));

    return context;
}
/// <summary>
/// Inserts two rows through a MySQL pipeline built from <c>Product2Insert</c>, sends a row with
/// the same Sku/Category1 but Category "AAAA" through a pipeline built from
/// <c>Product2Update</c>, and checks today's <c>test.sku2_</c> table.
/// </summary>
public void UpdateWhenUnionPrimary()
{
    ClearDb();

    using (MySqlConnection connection = new MySqlConnection(ConnectString))
    {
        ISpider owner = new DefaultSpider("test", new Site());

        // Seed rows.
        MySqlEntityPipeline seedPipeline = new MySqlEntityPipeline(ConnectString);
        var seedDefine = EntitySpider.GenerateEntityDefine(typeof(Product2Insert).GetTypeInfo());
        seedPipeline.AddEntity(seedDefine);
        seedPipeline.InitPipeline(owner);

        DataObject first = new DataObject
        {
            { "Sku", "110" },
            { "Category1", "4C" },
            { "Category", "3C" },
            { "Url", "http://jd.com/110" },
            { "CDate", "2016-08-13" }
        };
        DataObject second = new DataObject
        {
            { "Sku", "111" },
            { "Category1", "4C" },
            { "Category", "3C" },
            { "Url", "http://jd.com/111" },
            { "CDate", "2016-08-13" }
        };
        seedPipeline.Process(seedDefine.Name, new List<DataObject> { first, second });

        // Changed row for SKU 110.
        MySqlEntityPipeline changePipeline = new MySqlEntityPipeline(ConnectString);
        var changeDefine = EntitySpider.GenerateEntityDefine(typeof(Product2Update).GetTypeInfo());
        changePipeline.AddEntity(changeDefine);
        changePipeline.InitPipeline(owner);

        DataObject changed = new DataObject
        {
            { "Sku", "110" },
            { "Category1", "4C" },
            { "Category", "AAAA" },
            { "Url", "http://jd.com/110" },
            { "CDate", "2016-08-13" }
        };
        changePipeline.Process(changeDefine.Name, new List<DataObject> { changed });

        // NOTE(review): no ORDER BY in the query; rows[0] is assumed to be SKU 110 - confirm.
        string daySuffix = DateTime.Now.ToString("yyyy_MM_dd");
        var rows = connection.Query<Product2Insert>($"select * from test.sku2_{daySuffix}").ToList();

        Assert.Equal(2, rows.Count);
        Assert.Equal("110", rows[0].Sku);
        Assert.Equal("AAAA", rows[0].Category);
    }

    ClearDb();
}
/// <summary>
/// Program entry point: configures the global Serilog logger (console plus rolling file
/// "dotnet-spider.log"), runs <c>EntitySpider.Run</c>, then waits for console input.
/// </summary>
/// <param name="args">Command-line arguments; not used by this method.</param>
static async Task Main(string[] args)
{
    var configure = new LoggerConfiguration()
#if DEBUG
        .MinimumLevel.Verbose()
#else
        .MinimumLevel.Information()
#endif
        .MinimumLevel.Override("Microsoft", LogEventLevel.Warning)
        .Enrich.FromLogContext()
        .WriteTo.Console().WriteTo
        .RollingFile("dotnet-spider.log");
    Log.Logger = configure.CreateLogger();

    try
    {
        await EntitySpider.Run();
        // Alternative sample: await DistributedSpider.Run();
        Console.Read();
    }
    finally
    {
        // Flush and dispose the sinks before the process ends, including on failure.
        Log.CloseAndFlush();
    }
}
/// <summary>
/// Checks the selector, multi flag and name that <c>EntitySpider.ParseEntityMetaData</c>
/// produces for <c>Entity7</c>, <c>Entity8</c> and <c>Entity9</c>.
/// </summary>
public void EntitySelector()
{
    // Entity7: XPath selector, multi.
    var xpathEntity = EntitySpider.ParseEntityMetaData(typeof(Entity7).GetTypeInfo()).Entity;
    Assert.Equal("expression", xpathEntity.Selector.Expression);
    Assert.Equal(SelectorType.XPath, xpathEntity.Selector.Type);
    Assert.True(xpathEntity.Multi);

    // Entity8: CSS selector, multi.
    var cssEntity = EntitySpider.ParseEntityMetaData(typeof(Entity8).GetTypeInfo()).Entity;
    Assert.Equal("expression2", cssEntity.Selector.Expression);
    Assert.Equal(SelectorType.Css, cssEntity.Selector.Type);
    Assert.True(cssEntity.Multi);

    // Entity9: no selector, not multi; the name is the nested type's full name.
    var plainEntity = EntitySpider.ParseEntityMetaData(typeof(Entity9).GetTypeInfo()).Entity;
    Assert.False(plainEntity.Multi);
    Assert.Null(plainEntity.Selector);
    Assert.Equal("DotnetSpider.Test.SpiderEntityTest+Entity9", plainEntity.Name);
}
/// <summary>
/// Checks the selector, multi flag and name that <c>EntitySpider.GenerateEntityDefine</c>
/// produces for <c>Entity7</c>, <c>Entity8</c> and <c>Entity9</c>.
/// </summary>
public void EntitySelector()
{
    // Entity7: XPath selector, multi.
    var xpathDefine = EntitySpider.GenerateEntityDefine(typeof(Entity7).GetTypeInfo());
    var xpathSelector = xpathDefine.Selector;
    Assert.Equal("expression", xpathSelector.Expression);
    Assert.Equal(SelectorType.XPath, xpathSelector.Type);
    Assert.True(xpathDefine.Multi);

    // Entity8: CSS selector, multi.
    var cssDefine = EntitySpider.GenerateEntityDefine(typeof(Entity8).GetTypeInfo());
    var cssSelector = cssDefine.Selector;
    Assert.Equal("expression2", cssSelector.Expression);
    Assert.Equal(SelectorType.Css, cssSelector.Type);
    Assert.True(cssDefine.Multi);

    // Entity9: no selector, not multi; the name is the nested type's full name.
    var plainDefine = EntitySpider.GenerateEntityDefine(typeof(Entity9).GetTypeInfo());
    Assert.False(plainDefine.Multi);
    Assert.Null(plainDefine.Selector);
    Assert.Equal("DotnetSpider.Extension.Test.EntitySpiderTest2+Entity9", plainDefine.Name);
}
/// <summary>
/// Checks the selector, multi flag and name that <c>EntitySpider.GenerateEntityMetaData</c>
/// produces for <c>Entity7</c>, <c>Entity8</c> and <c>Entity9</c>.
/// </summary>
public void EntitySelector()
{
    // Entity7: XPath selector, multi.
    var xpathEntity = EntitySpider.GenerateEntityMetaData(typeof(Entity7).GetTypeInfo()).Entity;
    Assert.AreEqual("expression", xpathEntity.Selector.Expression);
    Assert.AreEqual(SelectorType.XPath, xpathEntity.Selector.Type);
    Assert.IsTrue(xpathEntity.Multi);

    // Entity8: CSS selector, multi.
    var cssEntity = EntitySpider.GenerateEntityMetaData(typeof(Entity8).GetTypeInfo()).Entity;
    Assert.AreEqual("expression2", cssEntity.Selector.Expression);
    Assert.AreEqual(SelectorType.Css, cssEntity.Selector.Type);
    Assert.IsTrue(cssEntity.Multi);

    // Entity9: no selector, not multi; the name is the nested type's full name.
    var plainEntity = EntitySpider.GenerateEntityMetaData(typeof(Entity9).GetTypeInfo()).Entity;
    Assert.IsFalse(plainEntity.Multi);
    Assert.IsNull(plainEntity.Selector);
    Assert.AreEqual("DotnetSpider.Extension.Test.EntitySpiderTest2+Entity9", plainEntity.Name);
}
/// <summary>
/// Inserts two <c>Product</c> rows, then sends a changed category for SKU "110" through a
/// pipeline created with <c>PipelineMode.Update</c> and a third constructor argument of
/// <c>true</c>, and checks today's <c>test.sku_</c> table.
/// </summary>
public void UpdateCheckIfSameBeforeUpdate()
{
    ClearDb();

    using (MySqlConnection connection = new MySqlConnection(ConnectString))
    {
        ISpider owner = new DefaultSpider("test", new Site());

        // Seed rows.
        MySqlEntityPipeline seedPipeline = new MySqlEntityPipeline(ConnectString);
        var seedMeta = EntitySpider.ParseEntityMetaData(typeof(Product).GetTypeInfo());
        seedPipeline.InitiEntity(seedMeta);
        seedPipeline.InitPipeline(owner);

        JObject first = new JObject
        {
            { "sku", "110" },
            { "category", "3C" },
            { "url", "http://jd.com/110" },
            { "cdate", "2016-08-13" }
        };
        JObject second = new JObject
        {
            { "sku", "111" },
            { "category", "3C" },
            { "url", "http://jd.com/111" },
            { "cdate", "2016-08-13" }
        };
        seedPipeline.Process(new List<JObject> { first, second });

        // Update pipeline; the trailing 'true' is presumably the check-if-same-before-update switch
        // named by this test - TODO confirm against the pipeline constructor.
        MySqlEntityPipeline changePipeline = new MySqlEntityPipeline(ConnectString, PipelineMode.Update, true);
        var changeMeta = EntitySpider.ParseEntityMetaData(typeof(Product).GetTypeInfo());
        changePipeline.InitiEntity(changeMeta);
        changePipeline.InitPipeline(owner);

        JObject changed = new JObject
        {
            { "sku", "110" },
            { "category", "4C" },
            { "url", "http://jd.com/110" },
            { "cdate", "2016-08-13" }
        };
        changePipeline.Process(new List<JObject> { changed });

        // NOTE(review): no ORDER BY in the query; rows[0] is assumed to be SKU 110 - confirm.
        string daySuffix = DateTime.Now.ToString("yyyy_MM_dd");
        var rows = connection.Query<Product>($"select * from test.sku_{daySuffix}").ToList();

        Assert.AreEqual(2, rows.Count);
        Assert.AreEqual("110", rows[0].Sku);
        Assert.AreEqual("4C", rows[0].Category);
    }

    ClearDb();
}
/// <summary>
/// Inserts three <c>ProductInsert</c> rows (one with a null category) through a
/// <c>SqlServerEntityPipeline</c> and checks today's <c>sku_</c> table.
/// Returns without asserting anything when not running on Windows.
/// </summary>
public void Insert()
{
    if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
    {
        return;
    }

    ClearDb();

    using (SqlConnection conn = new SqlConnection(ConnectString))
    {
        ISpider spider = new DefaultSpider("test", new Site());

        SqlServerEntityPipeline insertPipeline = new SqlServerEntityPipeline(ConnectString);
        var metadata = EntitySpider.GenerateEntityDefine(typeof(ProductInsert).GetTypeInfo());
        // Register the same definition whose Name is passed to Process below, instead of
        // generating a second, separate definition for the pipeline.
        insertPipeline.AddEntity(metadata);
        insertPipeline.InitPipeline(spider);

        // Common data
        var data1 = new DataObject
        {
            { "Sku", "110" },
            { "Category", "3C" },
            { "Url", "http://jd.com/110" },
            { "CDate", "2016-08-13" }
        };
        var data2 = new DataObject
        {
            { "Sku", "111" },
            { "Category", "3C" },
            { "Url", "http://jd.com/111" },
            { "CDate", "2016-08-13" }
        };
        // Value is null
        var data3 = new DataObject
        {
            { "Sku", "112" },
            { "Category", null },
            { "Url", "http://jd.com/111" },
            { "CDate", "2016-08-13" }
        };
        insertPipeline.Process(metadata.Name, new List<DataObject> { data1, data2, data3 });

        // NOTE(review): no ORDER BY in the query; the index-based assertions assume insertion order - confirm.
        var list = conn.Query<ProductInsert>($"use test;select * from sku_{DateTime.Now.ToString("yyyy_MM_dd")}").ToList();

        Assert.Equal(3, list.Count);
        Assert.Equal("110", list[0].Sku);
        Assert.Equal("111", list[1].Sku);
        Assert.Null(list[2].Category);
    }

    ClearDb();
}
/// <summary>
/// Inserts two <c>Product2</c> rows through an <c>MsSqlEntityPipeline</c>, sends a row with the
/// same sku/category1 but category "AAAA" through a pipeline created with
/// <c>PipelineMode.Update</c>, and checks today's <c>sku2_</c> table.
/// </summary>
public void UpdateWhenUnionPrimary()
{
    ClearDb();

    using (SqlConnection connection = new SqlConnection(ConnectString))
    {
        ISpider owner = new DefaultSpider("test", new Site());

        // Seed rows.
        MsSqlEntityPipeline seedPipeline = new MsSqlEntityPipeline(ConnectString);
        var seedMeta = EntitySpider.GenerateEntityMetaData(typeof(Product2).GetTypeInfo());
        seedPipeline.InitEntity(seedMeta);
        seedPipeline.InitPipeline(owner);

        JObject first = new JObject
        {
            { "sku", "110" },
            { "category1", "4C" },
            { "category", "3C" },
            { "url", "http://jd.com/110" },
            { "cdate", "2016-08-13" }
        };
        JObject second = new JObject
        {
            { "sku", "111" },
            { "category1", "4C" },
            { "category", "3C" },
            { "url", "http://jd.com/111" },
            { "cdate", "2016-08-13" }
        };
        seedPipeline.Process(new List<JObject> { first, second });

        // Changed row for SKU 110.
        MsSqlEntityPipeline changePipeline = new MsSqlEntityPipeline(ConnectString, PipelineMode.Update);
        var changeMeta = EntitySpider.GenerateEntityMetaData(typeof(Product2).GetTypeInfo());
        changePipeline.InitEntity(changeMeta);
        changePipeline.InitPipeline(owner);

        JObject changed = new JObject
        {
            { "sku", "110" },
            { "category1", "4C" },
            { "category", "AAAA" },
            { "url", "http://jd.com/110" },
            { "cdate", "2016-08-13" }
        };
        changePipeline.Process(new List<JObject> { changed });

        // NOTE(review): no ORDER BY in the query; rows[0] is assumed to be SKU 110 - confirm.
        string daySuffix = DateTime.Now.ToString("yyyy_MM_dd");
        var rows = connection.Query<Product2>($"use test;select * from sku2_{daySuffix}").ToList();

        Assert.AreEqual(2, rows.Count);
        Assert.AreEqual("110", rows[0].Sku);
        Assert.AreEqual("AAAA", rows[0].Category);
    }

    ClearDb();
}
/// <summary>
/// Runs a one-thread spider with <c>Entity15</c> against a MySQL pipeline, then reads
/// <c>information_schema.columns</c> for <c>test.table15</c> and checks the column names and
/// types by position before dropping the table.
/// </summary>
public void MySqlDataTypeTests()
{
    using (MySqlConnection connection = new MySqlConnection("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"))
    {
        EntitySpider spider = new EntitySpider(new Site());
        spider.SetIdentity(Guid.NewGuid().ToString("N"));
        spider.SetThreadNum(1);
        spider.AddPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));
        spider.AddStartUrl("http://baidu.com");
        spider.AddEntityType(typeof(Entity15));
        spider.Run("running-test");

        var columns = connection.Query<ColumnInfo>("SELECT COLUMN_NAME as `Name`, COLUMN_TYPE as `Type` FROM information_schema.columns WHERE table_name='table15' AND table_schema = 'test';").ToList();

        // Expected columns, in the order the query returns them.
        string[] expectedNames =
        {
            "Int", "BigInt", "String", "Time", "Float", "Double", "String1", "cdate", "__id"
        };
        string[] expectedTypes =
        {
            "int(11)", "bigint(20)", "text", "timestamp", "float", "double", "varchar(100)", "timestamp", "bigint(20)"
        };

        Assert.AreEqual(9, columns.Count);
        for (int i = 0; i < expectedNames.Length; i++)
        {
            Assert.AreEqual(expectedNames[i], columns[i].Name);
        }
        for (int i = 0; i < expectedTypes.Length; i++)
        {
            Assert.AreEqual(expectedTypes[i], columns[i].Type);
        }

        connection.Execute("drop table `test`.`table15`");
    }
}
/// <summary>
/// Inserts two <c>Product</c> rows through a MySQL pipeline, sends a changed category for SKU
/// "110" through a pipeline created with <c>PipelineMode.Update</c>, and checks today's
/// <c>test.sku_</c> table.
/// </summary>
public void Update()
{
    // The same connection string is used for the query connection and both pipelines.
    const string mysqlConnectString = "Database='mysql';Data Source=127.0.0.1;User ID=root;Password=1qazZAQ!;Port=3306";

    ClearDb();

    using (MySqlConnection connection = new MySqlConnection(mysqlConnectString))
    {
        ISpider owner = new DefaultSpider("test", new Site());

        // Seed rows.
        MySqlEntityPipeline seedPipeline = new MySqlEntityPipeline(mysqlConnectString);
        var seedMeta = EntitySpider.ParseEntityMetaData(typeof(Product).GetTypeInfo());
        seedPipeline.InitiEntity(seedMeta);
        seedPipeline.InitPipeline(owner);

        JObject first = new JObject
        {
            { "sku", "110" },
            { "category", "3C" },
            { "url", "http://jd.com/110" },
            { "cdate", "2016-08-13" }
        };
        JObject second = new JObject
        {
            { "sku", "111" },
            { "category", "3C" },
            { "url", "http://jd.com/111" },
            { "cdate", "2016-08-13" }
        };
        seedPipeline.Process(new List<JObject> { first, second });

        // Changed row for SKU 110.
        MySqlEntityPipeline changePipeline = new MySqlEntityPipeline(mysqlConnectString, PipelineMode.Update);
        var changeMeta = EntitySpider.ParseEntityMetaData(typeof(Product).GetTypeInfo());
        changePipeline.InitiEntity(changeMeta);
        changePipeline.InitPipeline(owner);

        JObject changed = new JObject
        {
            { "sku", "110" },
            { "category", "4C" },
            { "url", "http://jd.com/110" },
            { "cdate", "2016-08-13" }
        };
        changePipeline.Process(new List<JObject> { changed });

        // NOTE(review): no ORDER BY in the query; rows[0] is assumed to be SKU 110 - confirm.
        string daySuffix = DateTime.Now.ToString("yyyy_MM_dd");
        var rows = connection.Query<Product>($"select * from test.sku_{daySuffix}").ToList();

        Assert.Equal(2, rows.Count);
        Assert.Equal("110", rows[0].Sku);
        Assert.Equal("4C", rows[0].Category);
    }

    ClearDb();
}
/// <summary>
/// Runs a one-thread spider with <c>Entity15</c> against a SQL Server pipeline, then reads the
/// column metadata of <c>table15</c> and checks the first nine names and types by position
/// before dropping the table.
/// </summary>
public void SqlServerDataTypeTests()
{
    // Shared by the query connection and the pipeline.
    const string sqlServerConnectString = "Server=.\\SQLEXPRESS;Database=test;Trusted_Connection=True;MultipleActiveResultSets=true";

    using (var connection = new SqlConnection(sqlServerConnectString))
    {
        EntitySpider spider = new EntitySpider(new Site());
        spider.SetIdentity(Guid.NewGuid().ToString("N"));
        spider.SetThreadNum(1);
        spider.AddPipeline(new SqlServerEntityPipeline(sqlServerConnectString));
        spider.AddStartUrl("http://baidu.com");
        spider.AddEntityType(typeof(Entity15));
        spider.Run("running-test");

        var columns = connection.Query<ColumnInfo>("USE [test];select b.name Name,c.name+'(' + cast(c.length as varchar)+')' [Type] from sysobjects a,syscolumns b,systypes c where a.id=b.id and a.name='table15' and a.xtype='U'and b.xtype=c.xtype").ToList();

        // NOTE(review): 11 rows are expected but only the first 9 are inspected - confirm the
        // remaining two rows are intentionally ignored.
        string[] expectedNames =
        {
            "Int", "Time", "CDate", "Float", "Double", "BigInt", "__Id", "String", "String1"
        };
        string[] expectedTypes =
        {
            "int(4)", "datetime(8)", "datetime(8)", "float(8)", "float(8)", "bigint(8)", "bigint(8)", "nvarchar(8000)", "nvarchar(8000)"
        };

        Assert.AreEqual(11, columns.Count);
        for (int i = 0; i < expectedNames.Length; i++)
        {
            Assert.AreEqual(expectedNames[i], columns[i].Name);
        }
        for (int i = 0; i < expectedTypes.Length; i++)
        {
            Assert.AreEqual(expectedTypes[i], columns[i].Type);
        }

        connection.Execute("USE [test]; drop table [test].dbo.[table15]");
    }
}
/// <summary>
/// Builds a spider for one JD list page that downloads through a Chrome
/// <c>WebDriverDownloader</c>, extracts target URLs from the page-number region, and stores
/// <c>Product</c> entities through a MySQL entity pipeline.
/// </summary>
/// <returns>The configured <c>EntitySpider</c>.</returns>
protected override EntitySpider GetEntitySpider()
{
    EntitySpider spider = new EntitySpider(new Site());

    string identity = "JD sku/store test " + DateTime.Now.ToString("yyyy-MM-dd HHmmss");
    spider.SetIdentity(identity);

    spider.AddEntityPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));

    var extras = new Dictionary<string, object>
    {
        { "name", "手机" },
        { "cat3", "655" }
    };
    spider.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", extras);

    // Target URLs come from the span with class "p-num" and must match the page-number pattern.
    var pagingExtractor = new TargetUrlExtractor
    {
        Region = new Selector
        {
            Type = SelectorType.XPath,
            Expression = "//span[@class=\"p-num\"]"
        },
        Patterns = new List<string> { @"&page=[0-9]+&" }
    };
    spider.AddEntityType(typeof(Product), pagingExtractor);

    spider.SetDownloader(new WebDriverDownloader(Browser.Chrome));
    return spider;
}
/// <summary>
/// Builds a spider for one ddeng.com product page using a site with fixed cookies, headers,
/// user agent and accept values, storing <c>Corp</c> entities through a SQL Server pipeline.
/// </summary>
/// <returns>The configured <c>EntitySpider</c>.</returns>
protected override EntitySpider GetEntitySpider()
{
    EntitySpider spider = new EntitySpider(new Site());

    // Replace the default site with one carrying the recorded browser session.
    Site sessionSite = new Site
    {
        CookiesStringPart = "sid=dea284fc36c24e8cbcd447343d7b8a4e; sn=DD962248; ctid=000000; ctnm=%E5%8F%A4%E9%95%87%E7%81%AF%E9%A5%B0%E6%89%B9%E5%8F%91; ctpv=%E5%B9%BF%E4%B8%9C; JSESSIONID=acbBqFfOD4I63d9PziDvv; DDENG=c4fc08ae2e3ba3efeddbc667c2f45e615a85e80009169501dc244a03e87908aa61146548b97ed9c7dc07af23bfd80bff5008f8c8867a9165d4bd2732aca0db7dedae2e042d3968fcad1150f36be242e8a32a3f59db2a0b39216a59f1628508c5799644532a9d99925f9841b3c13a1f97; userId=10003379; previousUser=%E5%A4%95%E7%8E%89; Hm_lvt_9e33f153f28be198970d205d90a24f28=1466146335; Hm_lpvt_9e33f153f28be198970d205d90a24f28=1466146392; Hm_lvt_54b4cb498afd05463ab4611b38a6f289=1466146335; Hm_lpvt_54b4cb498afd05463ab4611b38a6f289=1466146392; CNZZDATA1256982382=395301521-1466143554-%7C1466143554",
        Headers = new Dictionary<string, string>
        {
            { "Cache-Control", "max-age=0" },
            { "Upgrade-Insecure-Requests", "1" }
        },
        UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
        Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    };
    spider.SetSite(sessionSite);

    spider.AddPipeline(new SqlServerEntityPipeline("Server=.\\SQLEXPRESS;Database=test;Trusted_Connection=True;MultipleActiveResultSets=true"));
    spider.AddStartUrl("http://www.ddeng.com/product/982227");
    spider.AddEntityType(typeof(Corp));

    return spider;
}
/// <summary>
/// Builds a single-threaded spider for one JD list page whose MySQL pipeline is created with a
/// null connection string and an <c>UpdateConnectString</c> that names a settings query.
/// </summary>
/// <returns>The configured <c>EntitySpider</c>.</returns>
protected override EntitySpider GetEntitySpider()
{
    EntitySpider spider = new EntitySpider(new Site());
    spider.SetThreadNum(1);

    string identity = "JD_sku_store_test_" + DateTime.Now.ToString("yyyy_MM_dd_HHmmss");
    spider.SetIdentity(identity);

    // The pipeline gets no direct connection string; presumably it resolves one by running
    // QueryString over ConnectString - TODO confirm against DbUpdateConnectString.
    var storage = new MySqlEntityPipeline(null)
    {
        UpdateConnectString = new DbUpdateConnectString
        {
            ConnectString = "Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306",
            QueryString = "SELECT value from `dotnetspider`.`settings` where `type`='ConnectString' and `key`='MySql01' LIMIT 1"
        }
    };
    spider.AddPipeline(storage);

    var extras = new Dictionary<string, object>
    {
        { "name", "手机" },
        { "cat3", "655" }
    };
    spider.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", extras);

    spider.AddEntityType(typeof(JdSkuSampleSpider.Product));
    return spider;
}
/// <summary>
/// Builds a single-threaded spider for the tsia.org.tw member list: list pages yield
/// <c>CompanySummary</c> entities, detail pages yield <c>Company</c> entities, storage is a
/// MySQL entity pipeline and scheduling goes through a local Redis scheduler.
/// </summary>
/// <returns>The configured <c>EntitySpider</c>.</returns>
protected override EntitySpider GetEntitySpider()
{
    // The unused SQL Server connection-string constant that used to sit here was removed;
    // nothing in this method referenced it.
    EntitySpider context = new EntitySpider(new Site { })
    {
        UserId = "DotnetSpider",
        TaskGroup = "RuthSpider"
    };

    context.SetThreadNum(1);

    // "HH" is the 24-hour clock. The previous "hh" (12-hour) gave identical identities to
    // runs twelve hours apart on the same day.
    context.SetIdentity("RuthSpider " + DateTime.Now.ToString("yyyy_MM_dd_HHmmss"));

    context.AddEntityPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));
    context.AddStartUrl("http://www.tsia.org.tw/member_list.php?page=1");

    // Add the company list page entity.
    context.AddEntityType(typeof(CompanySummary), new TargetUrlExtractor
    {
        Patterns = new List<string> { @"member_list.php\?page=\d+" }
    });

    // Add the company detail page entity.
    context.AddEntityType(typeof(Company), new TargetUrlExtractor
    {
        Patterns = new List<string> { @"member_info.php\?ID=\d+" }
    });

    // Configure Redis.
    context.SetScheduler(new RedisScheduler
    {
        Host = "localhost",
        Password = "",
        Port = 6379
    });

    return context;
}
/// <summary>
/// Builds a Taobao search spider: one start URL per keyword line read from
/// "taobaokeyword.txt", a Redis scheduler, a downloader that trims the page to the
/// <c>g_page_config</c> section and creates incremented target URLs, and a MySQL pipeline for
/// <c>Item</c> entities.
/// </summary>
/// <returns>The configured <c>EntitySpider</c>.</returns>
protected override EntitySpider GetEntitySpider()
{
    Site site = new Site();

    using (var keywordFile = new StreamReader(File.OpenRead("taobaokeyword.txt")))
    {
        // Reading stops at the first empty line or at end of file.
        while (true)
        {
            string keyword = keywordFile.ReadLine();
            if (string.IsNullOrEmpty(keyword))
            {
                break;
            }

            // NOTE(review): "{1}" inside this interpolated string is the integer literal 1, so the
            // URL carries "tab=1"; it looks like a leftover string.Format placeholder - confirm.
            // NOTE(review): the keyword is placed into the query string without URL encoding - confirm.
            string searchUrl = "https://" + $"s.taobao.com/search?q={keyword}&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8&sort=sale-desc&s=0&tab={1}&fs=1&filter_tianmao=tmall";
            var extras = new Dictionary<string, object>
            {
                { "keyword", keyword }
            };
            site.AddStartUrl(searchUrl, extras);
        }
    }

    var spider = new EntitySpider(site);
    spider.ThreadNum = 5;
    spider.SkipWhenResultIsEmpty = true;
    spider.Scheduler = new RedisScheduler("127.0.0.1:6379,serviceName = DotnetSpider,keepAlive = 8,allowAdmin = True,connectTimeout = 10000,password = 6GS9F2QTkP36GggE0c3XwVwI,abortConnect = True,connectRetry = 20");
    spider.Downloader = new HttpClientDownloader
    {
        DownloadCompleteHandlers = new IDownloadCompleteHandler[]
        {
            new SubContentHandler
            {
                StartOffset = 16,
                EndOffset = 22,
                Start = "g_page_config = {",
                End = "g_srp_loadCss();"
            },
            new IncrementTargetUrlsCreator("&s=0", null, 44)
        }
    };

    spider.AddPipeline(new MySqlEntityPipeline("Database = 'mysql'; Data Source = localhost; User ID = root; Password = 1qazZAQ!; Port = 3306"));
    spider.AddEntityType(typeof(Item), new MyDataHanlder());

    return spider;
}
/// <summary>
/// Builds a ten-thread spider whose downloader creates incremented target URLs from
/// "index_1.shtml" and which stores <c>ArticleSummary</c> and <c>Article</c> entities through a
/// MySQL pipeline.
/// </summary>
/// <returns>The configured <c>EntitySpider</c>.</returns>
protected override EntitySpider GetEntitySpider()
{
    EntitySpider spider = new EntitySpider(new Site());
    spider.Downloader = new HttpClientDownloader
    {
        DownloadCompleteHandlers = new IDownloadCompleteHandler[]
        {
            new IncrementTargetUrlsCreator("index_1.shtml")
        }
    };

    spider.SetThreadNum(10);

    string identity = "qidian_" + DateTime.Now.ToString("yyyy_MM_dd_HHmmss");
    spider.SetIdentity(identity);

    MySqlEntityPipeline storage = new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306");
    spider.AddPipeline(storage);

    // NOTE(review): the start URL ends with ". " (dot and space), which looks truncated or
    // garbled - confirm the intended address.
    spider.AddStartUrl("http://oa.jlu.edu.cn/. ");

    spider.AddEntityType(typeof(ArticleSummary));
    spider.AddEntityType(typeof(Article));

    return spider;
}
/// <summary>
/// Builds a single-threaded spider for hao.360.cn whose downloader trims the page to the
/// "hotsite_yixing" section and replaces escaped slashes, with a Redis scheduler and a MySQL
/// entity pipeline for <c>UpdateHao360Info</c>.
/// </summary>
/// <returns>The configured <c>EntitySpider</c>.</returns>
protected override EntitySpider GetEntitySpider()
{
    EntitySpider spider = new EntitySpider(new Site());
    spider.UserId = "DotnetSpider";
    spider.TaskGroup = "HaoBrowser";
    spider.Identity = "HaoBrowser Hao360Spider Buble " + DateTime.Now.ToString("yyyy-MM-dd HHmmss");
    spider.CachedSize = 1;
    spider.ThreadNum = 1;
    spider.SkipWhenResultIsEmpty = true;

    // Handlers are listed in this order: sub-content extraction first, then "\/" -> "/" replacement.
    spider.Downloader = new HttpClientDownloader
    {
        DownloadCompleteHandlers = new IDownloadCompleteHandler[]
        {
            new SubContentHandler
            {
                Start = "sales[\"hotsite_yixing\"] = [",
                End = "}}",
                StartOffset = 27,
                EndOffset = 0
            },
            new ReplaceContentHandler
            {
                NewValue = "/",
                OldValue = "\\/"
            }
        }
    };

    var scheduler = new Extension.Scheduler.RedisScheduler
    {
        Host = "127.0.0.1",
        Port = 6379,
        Password = "******"
    };
    spider.SetScheduler(scheduler);

    spider.AddEntityPipeline(new MySqlEntityPipeline("Database='testhao';Data Source= 127.0.0.1;User ID=root;Password=root@123456;Port=4306"));
    spider.AddStartUrl("https://hao.360.cn/");
    spider.AddEntityType(typeof(UpdateHao360Info));

    return spider;
}
/// <summary>
/// Builds a spider for two JD chat-check URLs using a site with browser-like headers, a
/// downloader that trims the "json(" ... ");" wrapper, and a MySQL pipeline for
/// <c>ProductUpdater</c> entities.
/// </summary>
/// <returns>The configured <c>EntitySpider</c>.</returns>
protected override EntitySpider GetEntitySpider()
{
    var site = new Site()
    {
        UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        Headers = new Dictionary<string, string>
        {
            { "Accept-Encoding", "gzip, deflate, sdch" },
            { "Upgrade-Insecure-Requests", "1" },
            { "Accept-Language", "en,en-US;q=0.8" },
            // Was "ax-age=0", which is not a valid Cache-Control directive; the directive is "max-age".
            { "Cache-Control", "max-age=0" },
        }
    };
    site.AddStartUrl("http://chat1.jd.com/api/checkChat?my=list&pidList=3355984&callback=json");
    site.AddStartUrl("http://chat1.jd.com/api/checkChat?my=list&pidList=3682523&callback=json");

    var context = new EntitySpider(site)
    {
        Downloader = new HttpClientDownloader
        {
            DownloadCompleteHandlers = new IDownloadCompleteHandler[]
            {
                new SubContentHandler
                {
                    Start = "json(",
                    End = ");",
                    StartOffset = 5,
                    EndOffset = 2
                }
            }
        }
    };

    context.AddPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost ;User ID=root;Password=1qazZAQ!;Port=3306"));
    context.AddEntityType(typeof(ProductUpdater));

    return context;
}
/// <summary>
/// Program entry point: sets the thread-pool minimum and maximum to 255 threads, configures the
/// global Serilog logger (console plus rolling file "logs/spider.log"), runs
/// <c>EntitySpider.RunAsync</c> and prints "Bye!".
/// </summary>
/// <param name="args">Command-line arguments; not used by this method.</param>
static async Task Main(string[] args)
{
    ThreadPool.SetMaxThreads(255, 255);
    ThreadPool.SetMinThreads(255, 255);

    Log.Logger = new LoggerConfiguration()
        .MinimumLevel.Information()
        .MinimumLevel.Override("Microsoft.Hosting.Lifetime", LogEventLevel.Warning)
        .MinimumLevel.Override("Microsoft", LogEventLevel.Warning)
        .MinimumLevel.Override("System", LogEventLevel.Warning)
        .MinimumLevel.Override("Microsoft.AspNetCore.Authentication", LogEventLevel.Warning)
        .Enrich.FromLogContext()
        .WriteTo.Console().WriteTo.RollingFile("logs/spider.log")
        .CreateLogger();

    try
    {
        // Alternative samples:
        // await DistributedSpider.RunAsync();
        // await ProxySpider.RunAsync();
        // await EntitySpider.RunMySqlQueueAsync();
        await EntitySpider.RunAsync();

        Console.WriteLine("Bye!");
    }
    finally
    {
        // Flush and dispose the sinks before the process ends, including on failure.
        Log.CloseAndFlush();
    }
}
/// <summary>
/// Builds a spider for the cas.cn index page while exercising most of the spider's settings:
/// site, thread count, retry/skip flags, depth, sleep time, downloader, scheduler and a
/// collecting pipeline for <c>ArticleSummary</c>.
/// </summary>
/// <returns>The configured <c>EntitySpider</c>.</returns>
protected override EntitySpider GetEntitySpider()
{
    EntitySpider spider = new EntitySpider(new Site());

    // Statement order is kept exactly as written; several settings are applied twice.
    spider.SetSite(new Site());

    // NOTE(review): the thread count is set to 2 and then immediately to 1 - confirm which is intended.
    spider.SetThreadNum(2);
    spider.ThreadNum = 1;

    spider.RetryWhenResultIsEmpty = false;
    spider.Deep = 100;

    // NOTE(review): the same sleep time is applied through both the property and the setter method.
    spider.EmptySleepTime = 5000;
    spider.SetEmptySleepTime(5000);

    spider.ExitWhenComplete = true;
    spider.CachedSize = 1;

    spider.SetDownloader(new HttpClientDownloader());
    spider.SetScheduler(new QueueDuplicateRemovedScheduler());

    spider.SkipWhenResultIsEmpty = true;
    spider.SpawnUrl = true;

    spider.AddPipeline(new CollectEntityPipeline());
    spider.AddStartUrl("http://www.cas.cn/kx/kpwz/index.shtml");
    spider.AddEntityType(typeof(ArticleSummary));

    return spider;
}
/// <summary>
/// Inserts three <c>ProductInsert</c> rows (one with a null category) through a MySQL pipeline
/// and checks today's <c>test.sku_</c> table.
/// </summary>
public void Insert()
{
    ClearDb();

    using (MySqlConnection connection = new MySqlConnection(ConnectString))
    {
        ISpider owner = new DefaultSpider("test", new Site());

        MySqlEntityPipeline pipeline = new MySqlEntityPipeline(ConnectString);
        var entityMeta = EntitySpider.GenerateEntityMetaData(typeof(ProductInsert).GetTypeInfo());
        pipeline.AddEntity(entityMeta);
        pipeline.InitPipeline(owner);

        // Two ordinary rows.
        JObject first = new JObject
        {
            { "Sku", "110" },
            { "Category", "3C" },
            { "Url", "http://jd.com/110" },
            { "CDate", "2016-08-13" }
        };
        JObject second = new JObject
        {
            { "Sku", "111" },
            { "Category", "3C" },
            { "Url", "http://jd.com/111" },
            { "CDate", "2016-08-13" }
        };
        // A row whose category value is null.
        JObject withNullCategory = new JObject
        {
            { "Sku", "112" },
            { "Category", null },
            { "Url", "http://jd.com/111" },
            { "CDate", "2016-08-13" }
        };
        pipeline.Process(entityMeta.Name, new List<JObject> { first, second, withNullCategory });

        // NOTE(review): no ORDER BY in the query; the index-based assertions assume insertion order - confirm.
        string daySuffix = DateTime.Now.ToString("yyyy_MM_dd");
        var rows = connection.Query<ProductInsert>($"select * from test.sku_{daySuffix}").ToList();

        Assert.AreEqual(3, rows.Count);
        Assert.AreEqual("110", rows[0].Sku);
        Assert.AreEqual("111", rows[1].Sku);
        Assert.AreEqual(null, rows[2].Category);
    }

    ClearDb();
}
/// <summary>
/// Runs an <c>EntityExtractor</c> for <c>Product</c> over the local "Jd.html" fixture and checks
/// the number of results and the fields of the first one.
/// </summary>
public void Extract()
{
    var productDefine = EntitySpider.GenerateEntityDefine(typeof(Product).GetTypeInfo());
    EntityExtractor extractor = new EntityExtractor("test", null, productDefine);

    // The request carries the extra values that the first result is expected to echo back.
    var page = new Page(
        new Request(
            "http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main",
            new Dictionary<string, dynamic>
            {
                { "cat", "手机" },
                { "cat3", "110" }
            }),
        null)
    {
        Content = File.ReadAllText(Path.Combine(Core.Environment.BaseDirectory, "Jd.html"))
    };

    var results = extractor.Extract(page);

    Assert.Equal(60, results.Count);
    Assert.Equal("手机", results[0]["CategoryName"]);
    Assert.Equal("110", results[0]["CategoryId"]);
    Assert.Equal("http://item.jd.com/3031737.html", results[0]["Url"]);
    Assert.Equal("3031737", results[0]["Sku"]);
    Assert.Equal("荣耀官方旗舰店", results[0]["ShopName"]);
    Assert.Equal("荣耀 NOTE 8 4GB+32GB 全网通版 冰河银", results[0]["Name"]);
    Assert.Equal("1000000904", results[0]["VenderId"]);
    Assert.Equal("1000000904", results[0]["JdzyShopId"]);
    Assert.Equal(DateTime.Now.ToString("yyyy-MM-dd"), results[0]["RunId"]);
}
/// <summary>
/// Builds a single-threaded spider for one JD list page that extracts target URLs from the
/// page-number region and stores <c>Product</c> entities through a MySQL entity pipeline.
/// </summary>
/// <returns>The configured <c>EntitySpider</c>.</returns>
protected override EntitySpider GetEntitySpider()
{
    EntitySpider context = new EntitySpider(new Site
    {
        // Optional proxy pool (disabled):
        // HttpProxyPool = new HttpProxyPool(new KuaidailiProxySupplier("<Kuaidaili API URL>"))
    });

    context.SetThreadNum(1);

    // "HH" is the 24-hour clock. The previous "hh" (12-hour) gave identical identities to
    // runs twelve hours apart on the same day.
    context.SetIdentity("JD_sku_store_test_" + DateTime.Now.ToString("yyyy_MM_dd_HHmmss"));

    context.AddEntityPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));

    context.AddStartUrl(
        "http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main",
        new Dictionary<string, object> { { "name", "手机" }, { "cat3", "655" } });

    // Target URLs come from the span with class "p-num" and must match the page-number pattern.
    context.AddEntityType(typeof(Product), new TargetUrlExtractor
    {
        Region = new BaseSelector
        {
            Type = SelectorType.XPath,
            Expression = "//span[@class=\"p-num\"]"
        },
        Patterns = new List<string> { @"&page=[0-9]+&" }
    });

    return context;
}
/// <summary>
/// Sample runner: executes each sample spider in turn, pausing for console input between most
/// of them.
/// </summary>
/// <param name="args">Command-line arguments; not used by this method.</param>
public static void Main(string[] args)
{
    // Prints the prompt and blocks on console input; used between samples.
    Action pause = () =>
    {
        Console.WriteLine("Press any key to continue...");
        Console.Read();
    };

    RegexTestEntitySpider regexSample = new RegexTestEntitySpider();
    regexSample.Run();

    // NOTE(review): this spider gets a start URL but is never run in this method - confirm it is still needed.
    EntitySpider spider = new EntitySpider(new Core.Site());
    spider.AddStartUrl("http://www.baidu.com");

    // Fully customized page processor and pipeline.
    BaseUsage.CustmizeProcessorAndPipeline();
    pause();

    // Crawl the specified pages without traversal.
    BaseUsage.CrawlerPagesWithoutTraverse();
    pause();

    // Traverse the whole site.
    BaseUsage.CrawlerPagesTraversal();
    pause();

    DDengEntitySpider ddengSample = new DDengEntitySpider();
    ddengSample.Run();
    pause();

    Cnblogs.Run();
    pause();

    CasSpider casSample = new CasSpider();
    casSample.Run();
    pause();

    BaiduSearchSpider baiduSample = new BaiduSearchSpider();
    baiduSample.Run();
    pause();

    JdShopDetailSpider jdShopSample = new JdShopDetailSpider();
    jdShopSample.Run();
    pause();

    JdSkuSampleSpider jdSkuSample = new JdSkuSampleSpider();
    jdSkuSample.Run();
    pause();

    Situoli.Run();
}