Code Example #1
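Builds a Spider via SpiderBuilder with Serilog, app configuration, and Kafka; sets its id, name, speed, and depth; uses the plain HttpClient downloader; chains a typed DataParser for EntitySpider.CnblogsEntry into the default storage; and seeds two cnblogs news list pages before returning RunAsync().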
        public static Task Run()
        {
            var builder = new SpiderBuilder();

            builder.AddSerilog();
            builder.ConfigureAppConfiguration();
            builder.UserKafka();
            var provider = builder.Build();

            var spider = provider.Create<Spider>();

            spider.Id    = Guid.NewGuid().ToString("N");                // set the task identifier
            spider.Name  = "博客园全站采集";                              // set the task name
            spider.Speed = 2;                                           // crawl speed in requests downloaded per second; > 1 is faster, < 1 is slower, must not be 0
            spider.Depth = 3;                                           // set the crawl depth
            spider.DownloaderSettings.Type = DownloaderType.HttpClient; // plain downloader: a clean HttpClient with no cookie handling
            spider.AddDataFlow(new DataParser<EntitySpider.CnblogsEntry>())
                  .AddDataFlow(spider.GetDefaultStorage());
            spider.AddRequests(
                new Request("https://news.cnblogs.com/n/page/1/", new Dictionary<string, string> {
                    { "网站", "博客园" }
                }),
                new Request("https://news.cnblogs.com/n/page/2/", new Dictionary<string, string> {
                    { "网站", "博客园" }
                }));
            return spider.RunAsync();            // start the crawl
        }
Code Example #2
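A console entry point with the same builder setup, but the DataParser is composed inline (an HTML selectable factory, a cnblogs.com regex gate, and XPath-based link discovery) and results go to a ConsoleStorage; the crawl starts from http://www.cnblogs.com/.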
        static async Task Main(string[] args)
        {
            var builder = new SpiderBuilder();

            builder.AddSerilog();
            builder.ConfigureAppConfiguration();
            builder.UserKafka();
            var provider = builder.Build();
            var spider   = provider.Create<Spider>();

            spider.Id    = Guid.NewGuid().ToString("N");                // set the task identifier
            spider.Name  = "博客园全站采集";                              // set the task name
            spider.Speed = 1;                                           // crawl speed in requests downloaded per second; > 1 is faster, < 1 is slower, must not be 0
            spider.Depth = 3;                                           // set the crawl depth
            spider.DownloaderSettings.Type = DownloaderType.HttpClient; // plain downloader: a clean HttpClient with no cookie handling
            spider.AddDataFlow(new DataParser
            {
                SelectableFactory   = context => context.GetSelectable(ContentType.Html), // build the selectable from the response as HTML
                CanParse            = DataParserHelper.CanParseByRegex("cnblogs\\.com"),   // only parse when the cnblogs\.com regex matches
                QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".")     // discover follow-up requests via XPath
            }).AddDataFlow(new ConsoleStorage());          // print scraped results to the console
            spider.AddRequests("http://www.cnblogs.com/"); // set the seed URL
            await spider.RunAsync();                       // start the crawl

            Console.Read();
        }
Code Example #3
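A standalone launcher: it reads the spider class, id, and name from configuration (hard-coded overrides in this sample), configures Serilog console and rolling-file logging, resolves the spider type by reflection, builds it with Kafka support, and runs it inside a try/catch that logs any failure.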
        static void Main(string[] args)
        {
            try
            {
                var builder = new SpiderBuilder();

                var configurationBuilder = Framework.CreateConfigurationBuilder(null, args);
                var configuration        = configurationBuilder.Build();
                var @class   = configuration["dotnetspider.spider.class"];
                var spiderId = configuration["dotnetspider.spider.id"];

                // hard-coded overrides of the configuration values read above (sample only)
                @class   = "DotnetSpider.Spiders.CnblogsSpider";
                spiderId = "xxxxxxxx";


                var folder = Directory.Exists("/logs/") ? "/logs/" : "";

                var logPath = string.IsNullOrWhiteSpace(spiderId)
                                        ? $"{folder}{DateTime.Now:yyyy-MM-dd HH:mm:ss}.log"
                                        : $"{folder}{spiderId}.log";

                var loggerConfiguration = new LoggerConfiguration()
                                          .MinimumLevel.Information()
                                          .MinimumLevel.Override("Microsoft", LogEventLevel.Warning)
                                          .Enrich.FromLogContext()
                                          .WriteTo.Console()
                                          .WriteTo.RollingFile(logPath);
                builder.AddSerilog(loggerConfiguration);

                var spiderName = configuration["dotnetspider.spider.name"];
                spiderName = "博客园"; // hard-coded override, like @class and spiderId above
                if (string.IsNullOrWhiteSpace(@class) ||
                    string.IsNullOrWhiteSpace(spiderId) ||
                    string.IsNullOrWhiteSpace(spiderName)
                    )
                {
                    Log.Logger.Error($"执行爬虫的参数不正确: class {@class}, id {spiderId}, name {spiderName}");
                    return;
                }

                var type = Type.GetType(@class);
                if (type == null)
                {
                    Log.Logger.Error($"未找到爬虫类型: {@class}");
                    return;
                }

                Log.Logger.Information($"获取爬虫类型 {type.FullName} 成功");
                builder.ConfigureAppConfiguration(configuration);
                builder.UserKafka();
                builder.AddSpider(type);
                var provider = builder.Build();

                var spider = provider.Create(type);
                Log.Logger.Information($"创建爬虫实例成功");
                spider.Id   = spiderId;
                spider.Name = spiderName;

                Log.Logger.Information($"尝试启动爬虫实例");
                spider.Run();

                Log.Logger.Information($"爬虫实例退出");
            }
            catch (Exception e)
            {
                Log.Logger.Error($"执行失败: {e}");
            }
        }