/// <summary>
/// Sends three product records (the last one with a null category) through an
/// <c>MsSqlEntityPipeline</c> and verifies that three rows can be read back from the
/// <c>sku_</c> table whose suffix is the current date.
/// </summary>
public void Insert()
{
    ClearDb();

    using (var connection = new SqlConnection(ConnectString))
    {
        ISpider crawler = new DefaultSpider("test", new Site());

        var pipeline = new MsSqlEntityPipeline(ConnectString);
        var productMetadata = EntitySpider.ParseEntityMetaData(typeof(Product).GetTypeInfo());
        pipeline.InitiEntity(productMetadata);
        pipeline.InitPipeline(crawler);

        var records = new List<JObject>
        {
            // Common data
            new JObject { { "sku", "110" }, { "category", "3C" }, { "url", "http://jd.com/110" }, { "cdate", "2016-08-13" } },
            new JObject { { "sku", "111" }, { "category", "3C" }, { "url", "http://jd.com/111" }, { "cdate", "2016-08-13" } },
            // Value is null
            new JObject { { "sku", "112" }, { "category", null }, { "url", "http://jd.com/111" }, { "cdate", "2016-08-13" } }
        };
        pipeline.Process(records);

        // The table name carries the current date, so the query is built at read time.
        string query = "use test;select * from sku_" + DateTime.Now.ToString("yyyy_MM_dd");
        var stored = connection.Query<Product>(query).ToList();

        Assert.AreEqual(3, stored.Count);
        Assert.AreEqual("110", stored[0].Sku);
        Assert.AreEqual("111", stored[1].Sku);
        Assert.AreEqual(null, stored[2].Category);
    }

    ClearDb();
}
/// <summary>
/// Pushes the same URL three times plus one distinct URL into a breadth-first
/// <c>QueueDuplicateRemovedScheduler</c>, polls once, and checks the polled URL
/// together with the left/total request counters.
/// </summary>
public void PushAndPollBreadthFirst()
{
    QueueDuplicateRemovedScheduler scheduler = new QueueDuplicateRemovedScheduler();
    scheduler.DepthFirst = false;
    ISpider spider = new DefaultSpider("test", new Site());
    scheduler.Init(spider);

    // "http://www.a.com" is pushed three times; the assertions below expect a total of 2.
    scheduler.Push(new Request("http://www.a.com", 1, null));
    scheduler.Push(new Request("http://www.a.com", 1, null));
    scheduler.Push(new Request("http://www.a.com", 1, null));
    scheduler.Push(new Request("http://www.b.com", 1, null));

    var request = scheduler.Poll();

    // Expected value goes first so a failure reports expected/actual the right way round.
    Assert.Equal("http://www.a.com/", request.Url.ToString());

    long left = scheduler.GetLeftRequestsCount();
    long total = scheduler.GetTotalRequestsCount();

    Assert.Equal(1, left);
    Assert.Equal(2, total);
}
/// <summary>
/// Fills an in-memory <c>QueueDuplicateRemovedScheduler</c> with four requests, reloads them
/// into a Redis scheduler via <c>Reload</c>, and checks that the Redis scheduler polls them
/// back in reverse push order.
/// </summary>
public void Load()
{
    var source = new QueueDuplicateRemovedScheduler();
    ISpider spider = new DefaultSpider("test");

    string[] urls =
    {
        "http://www.a.com/",
        "http://www.b.com/",
        "http://www.c.com/",
        "http://www.d.com/"
    };
    foreach (string url in urls)
    {
        source.Push(new Request(url, null));
    }

    Extension.Scheduler.RedisScheduler redis = GetRedisScheduler(spider.Identity);
    // NOTE(review): Dispose is called before Reload, presumably to reset state left in Redis — confirm.
    redis.Dispose();
    redis.Reload(source.All);

    // The last URL pushed is expected to be polled first.
    for (int i = urls.Length - 1; i >= 0; i--)
    {
        Assert.Equal(urls[i], redis.Poll().Url.ToString());
    }

    redis.Dispose();
}
/// <summary>
/// Initialises a Redis scheduler, disposes it, pushes four requests and checks that the
/// first polled request is the last one that was pushed.
/// </summary>
public void Clear()
{
    Extension.Scheduler.RedisScheduler redis = GetRedisScheduler();
    ISpider spider = new DefaultSpider();
    redis.Init(spider);
    // NOTE(review): Dispose right after Init is presumably what resets existing state — confirm.
    redis.Dispose();

    Request[] pending =
    {
        new Request("http://www.ibm.com/1", null),
        new Request("http://www.ibm.com/2", null),
        new Request("http://www.ibm.com/3", null),
        new Request("http://www.ibm.com/4", null)
    };
    foreach (Request item in pending)
    {
        redis.Push(item);
    }

    Request polled = redis.Poll();
    Assert.AreEqual("http://www.ibm.com/4", polled.Url.ToString());

    redis.Dispose();
}
/// <summary>
/// Pushes one URL three times and a second URL once into a breadth-first
/// <c>QueueDuplicateRemovedScheduler</c>, polls once, and verifies the polled URL and the
/// left/total counters.
/// </summary>
public void PushAndPollBreadthFirst()
{
    var queue = new QueueDuplicateRemovedScheduler { DepthFirst = false };
    ISpider spider = new DefaultSpider("test", new Site());
    queue.Init(spider);

    // Three identical URLs followed by one distinct URL.
    string[] urls = { "http://www.a.com", "http://www.a.com", "http://www.a.com", "http://www.b.com" };
    foreach (string url in urls)
    {
        queue.Push(new Request(url, null) { Site = spider.Site });
    }

    var first = queue.Poll();
    Assert.Equal("http://www.a.com/", first.Url.ToString());

    long remaining = queue.LeftRequestsCount;
    long seen = queue.TotalRequestsCount;
    Assert.Equal(1, remaining);
    Assert.Equal(2, seen);
}
/// <summary>
/// Inserts three products through a <c>CassandraEntityPipeline</c>, switches the pipeline to
/// <c>PipelineMode.InsertAndIgnoreDuplicate</c>, writes a fourth product that reuses the id of
/// the first stored row, and then asserts that the table still holds three rows, that the
/// original sku of that row is gone, and that the new sku/category are present.
/// </summary>
/// <remarks>
/// When compiled with <c>NETSTANDARD</c> the test returns immediately on non-Windows platforms.
/// The Cassandra cluster and session are disposed, and <c>ClearDb</c> runs, even when an
/// assertion fails.
/// </remarks>
public void InsertAndIgnoreDuplicate()
{
#if NETSTANDARD
    if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
    {
        return;
    }
#endif
    ClearDb();
    try
    {
        ISpider spider = new DefaultSpider("test", new Site());
        CassandraEntityPipeline insertPipeline = new CassandraEntityPipeline(connectString);
        var metadata = new EntityDefine<ProductInsert>();
        insertPipeline.AddEntity(metadata);
        insertPipeline.Init();

        // Common data
        var data1 = new ProductInsert { Sku = "110", Category = "3C", Url = "http://jd.com/110", CDate = new DateTime(2016, 8, 13) };
        var data2 = new ProductInsert { Sku = "111", Category = "3C", Url = "http://jd.com/111", CDate = new DateTime(2016, 8, 13) };
        // Value is null
        var data3 = new ProductInsert { Sku = "112", Category = null, Url = "http://jd.com/111", CDate = new DateTime(2016, 8, 13) };
        insertPipeline.Process(metadata.Name, new List<dynamic> { data1, data2, data3 }, spider);

        // Dispose the driver objects so the test does not leak connections.
        using (var cluster = CassandraUtil.CreateCluster(connectString))
        using (var session = cluster.Connect())
        {
            session.ChangeKeyspace("test");

            // First read: the id is needed so the fourth record can target an existing row.
            var results = session
                .Execute($"SELECT * FROM test.sku_cassandra_{DateTime.Now.ToString("yyyy_MM_dd")}")
                .GetRows()
                .Select(row => new ProductInsert
                {
                    Sku = row.GetValue<string>("sku"),
                    Category = row.GetValue<string>("category"),
                    Id = row.GetValue<Guid>("id")
                })
                .ToList();

            insertPipeline.DefaultPipelineModel = PipelineMode.InsertAndIgnoreDuplicate;

            var existing = results.First();
            var sku = existing.Sku;
            var data4 = new ProductInsert { Id = existing.Id, Sku = "113", Category = "asdfasf", Url = "http://jd.com/111", CDate = new DateTime(2016, 8, 13) };
            insertPipeline.Process(metadata.Name, new List<dynamic> { data4 }, spider);

            // Second read: only sku and category are needed for the assertions.
            results = session
                .Execute($"SELECT * FROM test.sku_cassandra_{DateTime.Now.ToString("yyyy_MM_dd")}")
                .GetRows()
                .Select(row => new ProductInsert
                {
                    Sku = row.GetValue<string>("sku"),
                    Category = row.GetValue<string>("category")
                })
                .ToList();

            Assert.Equal(3, results.Count);
            Assert.DoesNotContain(results, r => r.Sku == sku);
            Assert.Contains(results, r => r.Sku == "113");
            Assert.Contains(results, r => r.Category == "asdfasf");
        }
    }
    finally
    {
        // Clean up even when an assertion above throws.
        ClearDb();
    }
}
/// <summary>
/// Runs one <c>Entity15</c> record through a <c>MySqlEntityPipeline</c> and checks, via
/// <c>information_schema.columns</c>, the names and column types of the resulting
/// <c>test.table15</c> table. The table is dropped (best effort) before and after the check.
/// </summary>
public virtual void DataTypes()
{
    string connectionString = "Database='mysql';Data Source=localhost;User ID=root;Password=;Port=3306;SslMode=None;";

    // Expected column names and MySQL column types, in the order the assertions index them.
    string[] expectedNames =
    {
        "int", "bool", "bigint", "string", "time", "float",
        "double", "string1", "string2", "decimal", "creation_time", "creation_date"
    };
    string[] expectedTypes =
    {
        "int(11)", "tinyint(1)", "bigint(20)", "varchar(255)", "timestamp", "float",
        "double", "varchar(100)", "longtext", "decimal(18,2)", "timestamp", "date"
    };

    using (MySqlConnection conn = new MySqlConnection(connectionString))
    {
        try
        {
            conn.Execute("use test; drop table table15;");
        }
        catch
        {
        }

        var spider = new DefaultSpider();
        EntityProcessor<Entity15> processor = new EntityProcessor<Entity15>();
        var pipeline = new MySqlEntityPipeline(connectionString);

        var resultItems = new ResultItems { Request = new Request() };
        var row = new Dictionary<string, dynamic>
        {
            { "int", "1" },
            { "bool", "1" },
            { "bigint", "11" },
            { "string", "aaa" },
            { "time", "2018-06-12" },
            { "float", "1" },
            { "double", "1" },
            { "string1", "abc" },
            { "string2", "abcdd" },
            { "decimal", "1" }
        };
        resultItems.AddOrUpdateResultItem(
            processor.Model.Identity,
            new Tuple<IModel, IList<dynamic>>(processor.Model, new[] { row }));
        pipeline.Process(new ResultItems[] { resultItems }, spider.Logger, spider);

        var columns = conn.Query<ColumnInfo>("SELECT COLUMN_NAME as `Name`, COLUMN_TYPE as `Type` FROM information_schema.columns WHERE table_name='table15' AND table_schema = 'test';").ToList();

        Assert.Equal(12, columns.Count);
        for (int i = 0; i < expectedNames.Length; i++)
        {
            Assert.Equal(expectedNames[i], columns[i].Name);
        }
        for (int i = 0; i < expectedTypes.Length; i++)
        {
            Assert.Equal(expectedTypes[i], columns[i].Type);
        }

        try
        {
            conn.Execute("use test; drop table table15;");
        }
        catch
        {
        }
    }
}
/// <summary>
/// Verifies the error/success/left/total counters of a Redis scheduler: after four pushes,
/// after incrementing the error and success counts, after each of four polls, and after two
/// further polls once nothing is left.
/// </summary>
public void Status()
{
    Extension.Scheduler.RedisScheduler redis = GetRedisScheduler();
    ISpider spider = new DefaultSpider("test", new Site());
    redis.Init(spider);
    // NOTE(review): Dispose right after Init is presumably what resets existing state — confirm.
    redis.Dispose();

    string[] urls = { "http://www.a.com/", "http://www.b.com/", "http://www.c.com/", "http://www.d.com/" };
    foreach (string url in urls)
    {
        redis.Push(new Request(url, null) { Site = spider.Site });
    }

    Assert.Equal(0, redis.ErrorRequestsCount);
    Assert.Equal(4, redis.LeftRequestsCount);
    Assert.Equal(4, redis.TotalRequestsCount);

    redis.IncreaseErrorCount();
    Assert.Equal(1, redis.ErrorRequestsCount);
    Assert.Equal(0, redis.SuccessRequestsCount);

    redis.IncreaseSuccessCount();
    Assert.Equal(1, redis.SuccessRequestsCount);

    // Each poll lowers the left count by one; the other counters stay put.
    for (long expectedLeft = 3; expectedLeft >= 0; expectedLeft--)
    {
        redis.Poll();
        Assert.Equal(expectedLeft, redis.LeftRequestsCount);
        Assert.Equal(1, redis.SuccessRequestsCount);
        Assert.Equal(1, redis.ErrorRequestsCount);
        Assert.Equal(4, redis.TotalRequestsCount);
    }

    // Polling again with nothing left must leave every counter unchanged.
    redis.Poll();
    redis.Poll();
    Assert.Equal(0, redis.LeftRequestsCount);
    Assert.Equal(1, redis.SuccessRequestsCount);
    Assert.Equal(1, redis.ErrorRequestsCount);
    Assert.Equal(4, redis.TotalRequestsCount);

    redis.Dispose();
}
/// <summary>
/// Exercises <c>SubContentHandler</c> on a <c>TestDownloader</c>: four Start/End/offset
/// combinations with their expected page content, then two offsets that are expected to
/// raise a <c>SpiderException</c> with a fixed message.
/// </summary>
public void SubContentHandler()
{
    var spider = new DefaultSpider("test", new Site());
    var request = new Request("http://a.com/", null);

    // Handler configuration paired with the content expected after download.
    var trimmingCases = new[]
    {
        new { Handler = new SubContentHandler { Start = "a", End = "c" }, Expected = "aabbc" },
        new { Handler = new SubContentHandler { Start = "a", End = "c", EndOffset = 1 }, Expected = "aabb" },
        new { Handler = new SubContentHandler { Start = "a", End = "c", StartOffset = 1 }, Expected = "abbc" },
        new { Handler = new SubContentHandler { Start = "a", End = "c", StartOffset = 1, EndOffset = 1 }, Expected = "abb" }
    };
    foreach (var trimmingCase in trimmingCases)
    {
        TestDownloader downloader = new TestDownloader();
        downloader.AddAfterDownloadCompleteHandler(trimmingCase.Handler);

        Page page = downloader.Download(request, spider);

        Assert.AreEqual(trimmingCase.Expected, page.Content);
    }

    // Offsets for which the download is expected to fail.
    var failingHandlers = new[]
    {
        new SubContentHandler { Start = "a", End = "c", StartOffset = 10 },
        new SubContentHandler { Start = "a", End = "c", EndOffset = 20 }
    };
    foreach (var failingHandler in failingHandlers)
    {
        TestDownloader downloader = new TestDownloader();
        downloader.AddAfterDownloadCompleteHandler(failingHandler);

        try
        {
            downloader.Download(request, spider);
            // Reaching this line means no SpiderException was raised.
            throw new System.Exception("test failed.");
        }
        catch (SpiderException exception)
        {
            Assert.AreEqual("Sub content failed. Please check your settings.", exception.Message);
        }
    }
}
/// <summary>
/// Exercises <c>SubContentHandler</c> configured through
/// <c>TestDownloader.DownloadCompleteHandlers</c>: four Start/End/offset combinations with
/// their expected page content, then two offsets that must throw a <c>SpiderException</c>
/// with a fixed message.
/// </summary>
public void SubContentHandler()
{
    var spider = new DefaultSpider("test", new Site());
    var request = new Request("http://a.com/", 0, null);

    // Handler configuration paired with the content expected after download.
    var trimmingCases = new[]
    {
        new { Handler = new SubContentHandler { Start = "a", End = "c" }, Expected = "aabbc" },
        new { Handler = new SubContentHandler { Start = "a", End = "c", EndOffset = 1 }, Expected = "aabb" },
        new { Handler = new SubContentHandler { Start = "a", End = "c", StartOffset = 1 }, Expected = "abbc" },
        new { Handler = new SubContentHandler { Start = "a", End = "c", StartOffset = 1, EndOffset = 1 }, Expected = "abb" }
    };
    foreach (var trimmingCase in trimmingCases)
    {
        var downloader = new TestDownloader
        {
            DownloadCompleteHandlers = new IDownloadCompleteHandler[] { trimmingCase.Handler }
        };

        Page page = downloader.Download(request, spider);

        Assert.Equal(trimmingCase.Expected, page.Content);
    }

    // Offsets for which the download is expected to fail.
    var failingHandlers = new[]
    {
        new SubContentHandler { Start = "a", End = "c", StartOffset = 10 },
        new SubContentHandler { Start = "a", End = "c", EndOffset = 20 }
    };
    foreach (var failingHandler in failingHandlers)
    {
        var downloader = new TestDownloader
        {
            DownloadCompleteHandlers = new IDownloadCompleteHandler[] { failingHandler }
        };

        var exception = Assert.Throws<SpiderException>(() => { downloader.Download(request, spider); });

        Assert.Equal("Sub content failed. Please check your settings.", exception.Message);
    }
}
/// <summary>
/// SQL Server variant of the data-type test: runs one <c>Entity15</c> record through a
/// <c>SqlServerEntityPipeline</c> and checks the column names and <c>type(length)</c> strings
/// reported for <c>table15</c> in the <c>test</c> database. Returns immediately when
/// <c>Env.IsWindows</c> is false.
/// </summary>
public override void DataTypes()
{
    if (!Env.IsWindows)
    {
        return;
    }

    string connectionString = "Server=.\\SQLEXPRESS;Database=master;Trusted_Connection=True;MultipleActiveResultSets=true";

    // Expected names and types for the first twelve rows; the query is asserted to return 15.
    string[] expectedNames =
    {
        "creation_date", "int", "time", "creation_time", "float", "double",
        "bool", "decimal", "bigint", "string", "string1", "string2"
    };
    string[] expectedTypes =
    {
        "date(3)", "int(4)", "datetime(8)", "datetime(8)", "float(8)", "float(8)",
        "bit(1)", "decimal(17)", "bigint(8)", "nvarchar(8000)", "nvarchar(8000)", "nvarchar(8000)"
    };

    using (var conn = new SqlConnection(connectionString))
    {
        try
        {
            conn.Execute("create database test;");
        }
        catch
        {
        }

        try
        {
            conn.Execute("USE [test]; drop table [test].dbo.[table15]");
        }
        catch
        {
        }

        var spider = new DefaultSpider();
        EntityProcessor<Entity15> processor = new EntityProcessor<Entity15>();
        var pipeline = new SqlServerEntityPipeline(connectionString);

        var resultItems = new ResultItems { Request = new Request() };
        var row = new Dictionary<string, dynamic>
        {
            { "int", "1" },
            { "bool", "1" },
            { "bigint", "11" },
            { "string", "aaa" },
            { "time", "2018-06-12" },
            { "float", "1" },
            { "double", "1" },
            { "string1", "abc" },
            { "string2", "abcdd" },
            { "decimal", "1" }
        };
        resultItems.AddOrUpdateResultItem(
            processor.Model.Identity,
            new Tuple<IModel, IEnumerable<dynamic>>(processor.Model, new dynamic[] { row }));
        pipeline.Process(new ResultItems[] { resultItems }, spider);

        var columns = conn.Query<ColumnInfo>("USE [test];select b.name Name,c.name+'(' + cast(c.length as varchar)+')' [Type] from sysobjects a,syscolumns b,systypes c where a.id=b.id and a.name='table15' and a.xtype='U'and b.xtype=c.xtype").ToList();

        Assert.Equal(15, columns.Count);
        for (int i = 0; i < expectedNames.Length; i++)
        {
            Assert.Equal(expectedNames[i], columns[i].Name);
        }
        for (int i = 0; i < expectedTypes.Length; i++)
        {
            Assert.Equal(expectedTypes[i], columns[i].Type);
        }

        conn.Execute("USE [test]; drop table [test].dbo.[table15]");
    }
}