Ejemplo n.º 1
0
        /// <summary>
        /// Builds the entity spider for the https://hao.360.cn/ start page: a single-threaded
        /// spider with an HTTP downloader that post-processes the downloaded content,
        /// a Redis scheduler and a MySQL entity pipeline.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            var spider = new EntitySpider(new Site());

            spider.UserId    = "DotnetSpider";
            spider.TaskGroup = "HaoBrowser";
            // The identity embeds the local start time of this run.
            spider.Identity              = "HaoBrowser Hao360Spider Buble " + DateTime.Now.ToString("yyyy-MM-dd HHmmss");
            spider.CachedSize            = 1;
            spider.ThreadNum             = 1;
            spider.SkipWhenResultIsEmpty = true;

            var downloader = new HttpClientDownloader();
            downloader.DownloadCompleteHandlers = new IDownloadCompleteHandler[]
            {
                // NOTE(review): presumably trims the content to the span between Start and End;
                // 27 is the length of the Start marker - confirm against SubContentHandler.
                new SubContentHandler
                {
                    Start       = "sales[\"hotsite_yixing\"] = [",
                    End         = "}}",
                    StartOffset = 27,
                    EndOffset   = 0
                },
                // NOTE(review): looks like this turns escaped "\/" sequences into plain "/" - confirm.
                new ReplaceContentHandler
                {
                    NewValue = "/",
                    OldValue = "\\/"
                }
            };
            spider.Downloader = downloader;

            // NOTE(review): both connection strings below carry credentials in source;
            // consider loading them from configuration.
            spider.SetScheduler(new Extension.Scheduler.RedisScheduler("127.0.0.1:6379,serviceName=Scheduler.NET,keepAlive=8,allowAdmin=True,connectTimeout=10000,password=6GS9F2QTkP36GggE0c3XwVwI,abortConnect=True,connectRetry=20"));
            spider.AddPipeline(new MySqlEntityPipeline("Database='testhao';Data Source= localhost;User ID=root;Password=root@123456;Port=3306"));
            spider.AddStartUrl("https://hao.360.cn/");
            spider.AddEntityType(typeof(UpdateHao360Info));

            return spider;
        }
Ejemplo n.º 2
0
            /// <summary>
            /// Builds the entity spider that starts at http://www.cas.cn/kx/kpwz/index.shtml,
            /// extracts <c>ArticleSummary</c> entities and hands them to a
            /// <c>CollectEntityPipeline</c>. Also sets this instance's <c>Name</c> and <c>Batch</c>.
            /// </summary>
            /// <returns>The configured <see cref="EntitySpider"/>.</returns>
            protected override EntitySpider GetEntitySpider()
            {
                var spider = new EntitySpider(new Site());

                // NOTE(review): a Site was already passed to the constructor; this second
                // assignment looks redundant - confirm before removing.
                spider.SetSite(new Site());

                // NOTE(review): the thread count is set to 2 and then immediately to 1;
                // the later assignment is the one that remains in effect.
                spider.SetThreadNum(2);
                spider.ThreadNum = 1;

                spider.RetryWhenResultIsEmpty = false;
                spider.Deep = 100;

                // NOTE(review): the empty-sleep time is set twice to the same value (5000).
                spider.EmptySleepTime = 5000;
                spider.SetEmptySleepTime(5000);

                spider.ExitWhenComplete = true;
                spider.CachedSize = 1;
                spider.SetDownloader(new HttpClientDownloader());
                spider.SetScheduler(new QueueDuplicateRemovedScheduler());

                spider.SkipWhenResultIsEmpty = true;
                spider.SpawnUrl = true;
                spider.AddPipeline(new CollectEntityPipeline());
                spider.AddStartUrl("http://www.cas.cn/kx/kpwz/index.shtml");
                spider.AddEntityType(typeof(ArticleSummary));

                // Members of the enclosing type: task name and a batch label built from the local time.
                Name = "qidian";
                Batch = DateTime.Now.ToString("yyyy_MM_dd_HHmmss");

                return spider;
            }
Ejemplo n.º 3
0
        /// <summary>
        /// Runs a spider backed by a <c>RedisScheduler</c> and then asserts that the Redis
        /// keys derived from the spider's identity (queue, set, item hash and the
        /// error/success counters) hold no data afterwards.
        /// </summary>
        /// <remarks>
        /// Requires a Redis server reachable on 127.0.0.1:6379.
        /// </remarks>
        public void ClearScheduler()
        {
            EntitySpider spider = new EntitySpider(new Site());

            // A fresh GUID identity keeps this run's Redis keys separate from other runs.
            spider.Identity = Guid.NewGuid().ToString("N");
            spider.SetScheduler(new RedisScheduler
            {
                Host     = "localhost",
                Password = "******"
            });
            spider.AddStartUrl("https://baidu.com");
            spider.AddEntityPipeline(new ConsoleEntityPipeline());
            spider.AddEntityType(typeof(TestEntity));
            spider.Run();

            var configuration = new ConfigurationOptions()
            {
                ServiceName     = "DotnetSpider",
                ConnectTimeout  = 65530,
                KeepAlive       = 8,
                ConnectRetry    = 3,
                ResponseTimeout = 3000,
                Password        = "******",
                AllowAdmin      = true
            };

            configuration.EndPoints.Add(new DnsEndPoint("127.0.0.1", 6379));

            // The multiplexer owns the Redis connection; dispose it so the test does not leak it.
            using (var redis = ConnectionMultiplexer.Connect(configuration))
            {
                var db = redis.GetDatabase(0);

                // NOTE(review): these key names are assumed to mirror the ones RedisScheduler
                // builds from the identity's MD5 - confirm against the scheduler.
                var md5             = Encrypt.Md5Encrypt(spider.Identity);
                var itemKey         = "item-" + md5;
                var setKey          = "set-" + md5;
                var queueKey        = "queue-" + md5;
                var errorCountKey   = "error-record" + md5;
                var successCountKey = "success-record" + md5;

                // queue
                Assert.Equal(0, db.ListLength(queueKey));
                // set
                Assert.Equal(0, db.SetLength(setKey));
                // item
                Assert.Equal(0, db.HashLength(itemKey));
                // error-count
                Assert.False(db.StringGet(errorCountKey).HasValue);
                // success-count
                Assert.False(db.StringGet(successCountKey).HasValue);
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Builds the entity spider for the tsia.org.tw member directory: it starts at the
        /// first member-list page, registers the <c>CompanySummary</c> and <c>Company</c>
        /// entities with their target-URL patterns, writes results through a MySQL entity
        /// pipeline and schedules requests via Redis.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            EntitySpider context = new EntitySpider(new Site())
            {
                UserId    = "DotnetSpider",
                TaskGroup = "RuthSpider"
            };

            context.SetThreadNum(1);
            // "HH" (24-hour clock): with "hh" and no AM/PM designator, two runs started
            // twelve hours apart on the same day would get the same identity.
            context.SetIdentity("RuthSpider " + DateTime.Now.ToString("yyyy_MM_dd_HHmmss"));
            context.AddEntityPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));
            context.AddStartUrl("http://www.tsia.org.tw/member_list.php?page=1");

            // Register the company list page entity.
            context.AddEntityType(typeof(CompanySummary), new TargetUrlExtractor
            {
                Patterns = new List <string> {
                    @"member_list.php\?page=\d+"
                }
            });
            // Register the company detail page entity.
            context.AddEntityType(typeof(Company), new TargetUrlExtractor
            {
                Patterns = new List <string> {
                    @"member_info.php\?ID=\d+"
                }
            });
            // Configure the Redis scheduler.
            context.SetScheduler(new RedisScheduler
            {
                Host     = "localhost",
                Password = "",
                Port     = 6379
            });
            return context;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Builds the entity spider for the https://hao.360.cn/ start page: a single-threaded
        /// spider with an HTTP downloader that post-processes the downloaded content,
        /// a Redis scheduler on 127.0.0.1:6379 and a MySQL entity pipeline.
        /// </summary>
        /// <returns>The configured <see cref="EntitySpider"/>.</returns>
        protected override EntitySpider GetEntitySpider()
        {
            var spider = new EntitySpider(new Site());

            spider.UserId    = "DotnetSpider";
            spider.TaskGroup = "HaoBrowser";
            // The identity embeds the local start time of this run.
            spider.Identity              = "HaoBrowser Hao360Spider Buble " + DateTime.Now.ToString("yyyy-MM-dd HHmmss");
            spider.CachedSize            = 1;
            spider.ThreadNum             = 1;
            spider.SkipWhenResultIsEmpty = true;

            var downloader = new HttpClientDownloader();
            downloader.DownloadCompleteHandlers = new IDownloadCompleteHandler[]
            {
                // NOTE(review): presumably trims the content to the span between Start and End;
                // 27 is the length of the Start marker - confirm against SubContentHandler.
                new SubContentHandler
                {
                    Start       = "sales[\"hotsite_yixing\"] = [",
                    End         = "}}",
                    StartOffset = 27,
                    EndOffset   = 0
                },
                // NOTE(review): looks like this turns escaped "\/" sequences into plain "/" - confirm.
                new ReplaceContentHandler
                {
                    NewValue = "/",
                    OldValue = "\\/"
                }
            };
            spider.Downloader = downloader;

            var scheduler = new Extension.Scheduler.RedisScheduler();
            scheduler.Host     = "127.0.0.1";
            scheduler.Port     = 6379;
            scheduler.Password = "******";
            spider.SetScheduler(scheduler);

            // NOTE(review): the connection string below carries credentials in source;
            // consider loading it from configuration.
            spider.AddEntityPipeline(new MySqlEntityPipeline("Database='testhao';Data Source= 127.0.0.1;User ID=root;Password=root@123456;Port=4306"));
            spider.AddStartUrl("https://hao.360.cn/");
            spider.AddEntityType(typeof(UpdateHao360Info));

            return spider;
        }