示例#1
0
        /// <summary>
        /// Builds a standalone spider host, registers a <see cref="ProjectDefinition"/> describing the
        /// Vnexpress "Kinh Doanh" crawl as a singleton, and starts a <see cref="VnexpressSpider"/>.
        /// </summary>
        /// <returns>The task returned by the spider's <c>RunAsync</c> call.</returns>
        public static Task Run()
        {
            var spiderBuilder = new SpiderBuilder();

            // Host-level setup: logging, configuration sources, standalone mode.
            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration();
            spiderBuilder.UseStandalone();

            // Crawl description handed to the spider through dependency injection.
            var definition = new ProjectDefinition
            {
                ProjectName      = "Vnexpress Spider",
                Site             = "Vnexpress/Kinh Doanh",
                ItemUrlsSelector = "",
                Urls             = "https://vnexpress.net/kinh-doanh",
                FileFormat       = "*.html",
                FileStorage      = @"P:\Neil.Test\Spider Storage\Vnexpress",
                PageLimit        = 4,
            };
            spiderBuilder.Services.AddSingleton<ProjectDefinition>(definition);

            spiderBuilder.AddSpider<VnexpressSpider>();
            spiderBuilder.UseDynamicMessageQueue();

            var spiderFactory   = spiderBuilder.Build();
            var vnexpressSpider = spiderFactory.Create<VnexpressSpider>();

            return vnexpressSpider.RunAsync();
        }
示例#2
0
        /// <summary>
        /// Runs a spider with <c>RetryWhenResultIsEmpty</c> disabled and the <c>Empty</c> downloader
        /// against a single request, then checks the recorded statistics: one request in total,
        /// one success and no failures, both for the spider and for the first download-statistics entry.
        /// </summary>
        public void DoNotRetryWhenResultIsEmpty()
        {
            var spiderBuilder = new SpiderBuilder();
            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration(null, null, false);
            spiderBuilder.UseStandalone();

            var factory = spiderBuilder.Build();
            var crawler = factory.Create<Spider>();

            crawler.NewGuidId();
            crawler.Name                    = "RetryWhenResultIsEmpty";
            crawler.EmptySleepTime          = 15;
            crawler.RetryDownloadTimes      = 5;
            crawler.RetryWhenResultIsEmpty  = false;
            crawler.DownloaderSettings.Type = DownloaderType.Empty;
            crawler.Scheduler               = new QueueDistinctBfsScheduler();
            crawler.AddRequests("http://www.DoNotRetryWhenResultIsEmpty.com");

            // Block until the crawl has finished so the statistics below are final.
            crawler.RunAsync().Wait();

            var store         = factory.GetRequiredService<IStatisticsStore>();
            var spiderStats   = store.GetSpiderStatisticsAsync(crawler.Id).Result;
            // Arguments (1, 10) are presumably page number and page size - TODO confirm against IStatisticsStore.
            var downloadStats = store.GetDownloadStatisticsListAsync(1, 10).Result[0];

            // Spider-level counters.
            Assert.Equal(1, spiderStats.Total);
            Assert.Equal(0, spiderStats.Failed);
            Assert.Equal(1, spiderStats.Success);

            // Download-level counters.
            Assert.Equal(0, downloadStats.Failed);
            Assert.Equal(1, downloadStats.Success);
        }
        /// <summary>
        /// Starts the <c>ImageDownloader</c> instance, then builds a standalone spider that crawls
        /// the nvshens gallery pages with the Nvshens tag/detail parsers and launches it.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the task returned by <c>RunAsync</c> is not awaited, so this method returns
        /// as soon as the crawl has been started - confirm that callers keep the process alive.
        /// </remarks>
        public static void Run()
        {
            ImageDownloader.GetInstance().Start();

            var spiderBuilder = new SpiderBuilder();
            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration();
            spiderBuilder.UseStandalone();
            spiderBuilder.AddSpider<EntitySpider>();

            var factory = spiderBuilder.Build();
            var crawler = factory.Create<Spider>();

            // Task identifier.
            crawler.Id = Guid.NewGuid().ToString("N");
            // Task name.
            crawler.Name = "宅男女神图片采集";
            // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
            // below 1, smaller is slower. Must not be 0.
            crawler.Speed = 2;
            // Crawl depth.
            crawler.Depth = 5;
            // Plain downloader: a clean HttpClient that does not deal with cookies.
            crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

            // Parsers for tag pages and detail pages (NvshensTagIndexDataParser is currently not registered).
            crawler.AddDataFlow(new NvshensFirstPageTagDataParser());
            crawler.AddDataFlow(new NvshensPageTagDataParser());
            crawler.AddDataFlow(new NvshensFirstPageDetailDataParser());
            crawler.AddDataFlow(new NvshensPageDetailDataParser());

            // Start URL (the gallery root "https://www.nvshens.com/gallery/" is currently not used).
            crawler.AddRequests("https://www.nvshens.com/gallery/luoli/");

            // Launch the crawl without waiting for completion.
            _ = crawler.RunAsync();
        }
        /// <summary>
        /// Builds a standalone spider that follows links on cnblogs.com pages, prints the results
        /// to the console, and launches it.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the task returned by <c>RunAsync</c> is not awaited, so this method returns
        /// as soon as the crawl has been started - confirm that callers keep the process alive.
        /// </remarks>
        public static void Run1()
        {
            var spiderBuilder = new SpiderBuilder();
            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration();
            spiderBuilder.UseStandalone();
            spiderBuilder.AddSpider<EntitySpider>();

            var factory = spiderBuilder.Build();
            var crawler = factory.Create<Spider>();

            // Task identifier.
            crawler.Id = Guid.NewGuid().ToString("N");
            // Task name.
            crawler.Name = "博客园全站采集";
            // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
            // below 1, smaller is slower. Must not be 0.
            crawler.Speed = 1;
            // Crawl depth.
            crawler.Depth = 3;
            // Plain downloader: a clean HttpClient that does not deal with cookies.
            crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

            // Parser limited to URLs matching the cnblogs.com regex; follow-up requests are
            // queried with the XPath ".".
            var pageParser = new DataParser
            {
                SelectableFactory   = context => context.GetSelectable(ContentType.Html),
                CanParse            = DataParserHelper.CanParseByRegex("cnblogs\\.com"),
                QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".")
            };

            // Parse first, then print the crawl results to the console.
            crawler.AddDataFlow(pageParser).AddDataFlow(new ConsoleStorage());

            // Start URL.
            crawler.AddRequests("http://www.cnblogs.com/");

            // Launch the crawl without waiting for completion.
            _ = crawler.RunAsync();
        }
        /// <summary>
        /// Resolves the spider named by the <c>spider</c> configuration value, creates it and starts it.
        /// </summary>
        /// <remarks>
        /// Configuration keys read: <c>spider</c> (required), <c>name</c>, <c>id</c> (defaults to a new
        /// GUID in "N" format), <c>config</c>, <c>args</c> (split on spaces) and <c>distribute</c>
        /// (standalone mode is used unless the value is exactly "true").
        /// NOTE(review): the task returned by <c>RunAsync</c> is not awaited here - confirm that
        /// callers keep the process alive until the crawl completes.
        /// </remarks>
        /// <param name="args">Run arguments, fed into the configuration builder.</param>
        /// <exception cref="SpiderException">Thrown when no spider name is specified.</exception>
        public static void Run(params string[] args)
        {
            Framework.SetEncoding();

            var    configurationBuilder = Framework.CreateConfigurationBuilder(null, args);
            var    configuration        = configurationBuilder.Build();
            string spider = configuration["spider"];

            if (string.IsNullOrWhiteSpace(spider))
            {
                throw new SpiderException("未指定需要执行的爬虫");
            }

            var name       = configuration["name"];
            var id         = configuration["id"] ?? Guid.NewGuid().ToString("N");
            var config     = configuration["config"];
            var arguments  = configuration["args"]?.Split(' ');
            var distribute = configuration["distribute"] == "true";

            PrintEnvironment(args);

            var spiderTypes = DetectSpiders();

            if (spiderTypes == null || spiderTypes.Count == 0)
            {
                return;
            }

            // Type names are identifiers, not linguistic text: compare ordinally and case-insensitively
            // so the lookup does not depend on the current culture (e.g. the Turkish dotless "i").
            var spiderType = spiderTypes.FirstOrDefault(
                x => string.Equals(x.Name, spider, StringComparison.OrdinalIgnoreCase));

            if (spiderType == null)
            {
                ConsoleHelper.WriteLine($"未找到爬虫: {spider}", 0, ConsoleColor.DarkYellow);
                return;
            }

            var builder = new SpiderBuilder();

            builder.AddSerilog();
            builder.ConfigureAppConfiguration(config);
            if (!distribute)
            {
                builder.UseStandalone();
            }

            builder.AddSpider(spiderType);
            var provider = builder.Build();
            var instance = provider.Create(spiderType);

            if (instance == null)
            {
                ConsoleHelper.WriteLine("创建爬虫对象失败", 0, ConsoleColor.DarkYellow);
                return;
            }

            instance.Name = name;
            instance.Id   = id;
            instance.RunAsync(arguments);
        }
示例#6
0
        /// <summary>
        /// Builds a standalone spider factory with Serilog logging, stores it in
        /// <c>SpiderFactory</c>, and logs the value of the "Development" configuration key.
        /// </summary>
        protected TestBase()
        {
            var spiderBuilder = new SpiderBuilder();
            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration(null, null, false);
            spiderBuilder.UseStandalone();

            SpiderFactory = spiderBuilder.Build();

            // Resolve the logger first, then read the configuration value it reports.
            var logger           = SpiderFactory.GetRequiredService<ILogger<TestBase>>();
            var developmentValue = SpiderFactory.GetRequiredService<IConfiguration>()["Development"];

            logger.LogInformation($"Development {developmentValue}");
        }
示例#7
0
        /// <summary>
        /// Builds a standalone spider host for <see cref="VnexpressSpider"/>, registers an
        /// <c>IDynamicMessageQueue</c> singleton whose factory returns <c>null</c>, and starts the spider.
        /// </summary>
        /// <returns>The task returned by the spider's <c>RunAsync</c> call.</returns>
        public static Task Run()
        {
            var spiderBuilder = new SpiderBuilder();

            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration();
            spiderBuilder.UseStandalone();
            spiderBuilder.AddSpider<VnexpressSpider>();

            // NOTE(review): this factory always yields null for IDynamicMessageQueue; presumably an
            // experiment in running without a dynamic queue - confirm this is intended.
            spiderBuilder.Services.AddSingleton<IDynamicMessageQueue, InMemoryMessageQueue>(serviceProvider => null);

            var spiderFactory   = spiderBuilder.Build();
            var vnexpressSpider = spiderFactory.Create<VnexpressSpider>();

            return vnexpressSpider.RunAsync();
        }
示例#8
0
        /// <summary>
        /// Builds a standalone spider host that uses <c>FakeProxyValidator</c> and a proxy supply URL
        /// passed as a command-line style argument, registers a <see cref="ProjectDefinition"/> for
        /// the Vnexpress "Kinh Doanh" crawl (including item/field mappings), and starts an
        /// <see cref="HttpClientSpider"/>.
        /// </summary>
        /// <returns>The task returned by the spider's <c>RunAsync</c> call.</returns>
        public static Task Run()
        {
            var spiderBuilder = new SpiderBuilder();

            spiderBuilder.AddSerilog();
            // FakeProxyValidator is registered here; DefaultProxyValidator is the alternative left unused.
            spiderBuilder.Services.AddSingleton<IProxyValidator, FakeProxyValidator>();

            // The proxy supply endpoint is injected as if it had been given on the command line.
            var proxyArguments = new string[] { "/ProxySupplyUrl=http://localhost:52445/api/proxies" };
            spiderBuilder.ConfigureAppConfiguration(null, proxyArguments, true);
            spiderBuilder.UseStandalone();

            // Crawl description handed to the spider through dependency injection.
            var definition = new ProjectDefinition
            {
                ProjectName                = "Vnexpress Spider",
                Site                       = "Vnexpress/Kinh Doanh",
                ItemUrlsSelector           = "//article/h1[@class='title_news']/a[1];//article[@class='list_news']/h4[@class='title_news']/a[1]",
                Urls                       = "https://vnexpress.net/kinh-doanh",
                FileStorage                = @"P:\Neil.Test\Spider Storage\Vnexpress",
                FileFormat                 = "*.json",
                PageLimit                  = 4,
                Deepth                     = 2,
                NextPageSelector           = "//p[@id='pagination']/a[@class='next']",
                NumberOfConcurrentRequests = 5,
                Mapping                    = new ItemMapping
                {
                    ItemCssSelector = "//section[@id='left_calculator']",
                    Mapping         = new FieldMapping[]
                    {
                        new FieldMapping { Field = "Title",       CssSelector = "//h1[@class='title_news_detail mb10']" },
                        new FieldMapping { Field = "Description", CssSelector = "//p[@class='description']" },
                    }
                }
            };
            spiderBuilder.Services.AddSingleton<ProjectDefinition>(definition);

            spiderBuilder.AddSpider<HttpClientSpider>();
            spiderBuilder.UseDynamicMessageQueue();

            var spiderFactory = spiderBuilder.Build();
            var httpSpider    = spiderFactory.Create<HttpClientSpider>();

            return httpSpider.RunAsync();
        }
示例#9
0
        /// <summary>
        /// Builds a standalone spider that crawls cnblogs.com with <c>CnblogsDataParser</c>,
        /// prints the results to the console, and starts it.
        /// </summary>
        /// <returns>The task returned by the spider's <c>RunAsync</c> call.</returns>
        public static Task Run()
        {
            var spiderBuilder = new SpiderBuilder();
            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration();
            spiderBuilder.UseStandalone();
            spiderBuilder.AddSpider<EntitySpider>();

            var factory = spiderBuilder.Build();
            var crawler = factory.Create<Spider>();

            // Task identifier.
            crawler.NewGuidId();
            // Task name.
            crawler.Name = "博客园全站采集";
            // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
            // below 1, smaller is slower. Must not be 0.
            crawler.Speed = 1;
            // Crawl depth.
            crawler.Depth = 3;
            // Plain downloader: a clean HttpClient that does not deal with cookies.
            crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

            // Parse pages, then print the results to the console.
            crawler.AddDataFlow(new CnblogsDataParser()).AddDataFlow(new ConsoleStorage());

            // Start URL.
            crawler.AddRequests("http://www.cnblogs.com/");

            // Launch the crawl and hand the task back to the caller.
            return crawler.RunAsync();
        }
示例#10
0
        /// <summary>
        /// Builds a standalone spider host (command-line configuration disabled), creates a
        /// <see cref="CnblogsSpider"/> and awaits its run.
        /// </summary>
        /// <remarks>
        /// Manual setup of the spider (scheduler, id, name, speed, depth, downloader type, data flows
        /// and start request) used to be done here and is currently disabled.
        /// NOTE(review): this method is <c>async void</c>, so callers cannot await it and an exception
        /// thrown by the run cannot be observed by them - consider returning <c>Task</c> once all
        /// callers can be updated.
        /// </remarks>
        public static async void Run()
        {
            var spiderBuilder = new SpiderBuilder();

            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration(loadCommandLine: false);
            spiderBuilder.UseStandalone();
            spiderBuilder.AddSpider<CnblogsSpider>();

            var factory       = spiderBuilder.Build();
            var cnblogsSpider = factory.Create<CnblogsSpider>();

            // Launch the crawl and wait for it to finish.
            await cnblogsSpider.RunAsync();
        }