/// <summary>
/// Builds and starts a cnblogs news crawler that parses pages into
/// <c>EntitySpider.CnblogsEntry</c> records and persists them through the
/// configured default storage.
/// </summary>
/// <returns>The task representing the running spider.</returns>
public static Task Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UserKafka();

    var services = spiderBuilder.Build();
    var spider = services.Create<Spider>();

    spider.Id = Guid.NewGuid().ToString("N"); // unique job identifier
    spider.Name = "博客园全站采集";            // job display name
    spider.Speed = 2;                          // downloads per second; must never be 0
    spider.Depth = 3;                          // maximum link-follow depth
    spider.DownloaderSettings.Type = DownloaderType.HttpClient; // plain, cookie-free HttpClient downloader

    // Parse entries into the strongly-typed entity, then hand them to the default storage.
    spider.AddDataFlow(new DataParser<EntitySpider.CnblogsEntry>())
        .AddDataFlow(spider.GetDefaultStorage());

    // Seed requests: the first two news listing pages, each tagged with its site name.
    spider.AddRequests(
        new Request("https://news.cnblogs.com/n/page/1/",
            new Dictionary<string, string> { { "网站", "博客园" } }),
        new Request("https://news.cnblogs.com/n/page/2/",
            new Dictionary<string, string> { { "网站", "博客园" } }));

    return spider.RunAsync(); // start crawling
}
/// <summary>
/// Console entry point: crawls cnblogs.com with a plain-HttpClient downloader,
/// follows every in-site link and prints parsed results to the console.
/// </summary>
static async Task Main(string[] args)
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UserKafka();

    var services = spiderBuilder.Build();
    var spider = services.Create<Spider>();

    spider.Id = Guid.NewGuid().ToString("N"); // unique job identifier
    spider.Name = "博客园全站采集";            // job display name
    spider.Speed = 1;                          // downloads per second; must never be 0
    spider.Depth = 3;                          // maximum link-follow depth
    spider.DownloaderSettings.Type = DownloaderType.HttpClient; // plain, cookie-free HttpClient downloader

    // Treat every response as HTML, only parse URLs matching cnblogs.com,
    // and follow all links discovered on each page.
    var parser = new DataParser
    {
        SelectableFactory = context => context.GetSelectable(ContentType.Html),
        CanParse = DataParserHelper.CanParseByRegex("cnblogs\\.com"),
        QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".")
    };
    spider.AddDataFlow(parser).AddDataFlow(new ConsoleStorage()); // print results to console

    spider.AddRequests("http://www.cnblogs.com/"); // seed URL

    await spider.RunAsync(); // start crawling
    Console.Read();          // keep the process alive after the crawl completes
}
// Entry point: reads the spider type, id and name from configuration, wires up
// Serilog (console + rolling file) and Kafka, resolves the spider type via
// reflection, then runs it synchronously until it exits. Any failure is logged
// rather than allowed to crash the process.
static void Main(string[] args)
{
    try
    {
        var builder = new SpiderBuilder();

        var configurationBuilder = Framework.CreateConfigurationBuilder(null, args);
        var configuration = configurationBuilder.Build();
        var @class = configuration["dotnetspider.spider.class"];
        var spiderId = configuration["dotnetspider.spider.id"];

        // NOTE(review): these hard-coded assignments clobber the values read
        // from configuration just above — looks like leftover debug code;
        // confirm whether they should be removed before shipping.
        @class = "DotnetSpider.Spiders.CnblogsSpider";
        spiderId = "xxxxxxxx";

        // Log into /logs/ when it exists (e.g. a container volume), else the working dir.
        var folder = Directory.Exists("/logs/") ? "/logs/" : "";
        // Log file named after the spider id when available, otherwise a timestamp.
        var logPath = string.IsNullOrWhiteSpace(spiderId)
            ? $"{folder}{DateTime.Now:yyyy-MM-dd HH:mm:ss}.log"
            : $"{folder}{spiderId}.log";
        var loggerConfiguration = new LoggerConfiguration()
            .MinimumLevel.Information()
            .MinimumLevel.Override("Microsoft", LogEventLevel.Warning)
            .Enrich.FromLogContext()
            .WriteTo.Console().WriteTo
            .RollingFile(logPath);
        builder.AddSerilog(loggerConfiguration);

        var spiderName = configuration["dotnetspider.spider.name"];
        // NOTE(review): another hard-coded override of the configured value — verify.
        spiderName = "博客园";

        // All three identifiers are required; bail out with a log entry if any is missing.
        if (string.IsNullOrWhiteSpace(@class) || string.IsNullOrWhiteSpace(spiderId) ||
            string.IsNullOrWhiteSpace(spiderName)
        )
        {
            Log.Logger.Error($"执行爬虫的参数不正确: class {@class}, id {spiderId}, name {spiderName}");
            return;
        }

        // Resolve the spider class by its assembly-qualified name.
        var type = Type.GetType(@class);
        if (type == null)
        {
            Log.Logger.Error($"未找到爬虫类型: {@class}");
            return;
        }

        Log.Logger.Information($"获取爬虫类型 {type.FullName} 成功");
        builder.ConfigureAppConfiguration(configuration);
        builder.UserKafka();
        builder.AddSpider(type);
        var provider = builder.Build();
        var spider = provider.Create(type);
        Log.Logger.Information($"创建爬虫实例成功");
        spider.Id = spiderId;
        spider.Name = spiderName;
        Log.Logger.Information($"尝试启动爬虫实例");
        spider.Run(); // blocks until the crawl finishes
        Log.Logger.Information($"爬虫实例退出");
    }
    catch (Exception e)
    {
        // Catch-all so that any startup or runtime failure is logged instead of
        // terminating the process with an unhandled exception.
        Log.Logger.Error($"执行失败: {e}");
    }
}