/// <summary>
/// Builds one start URL per entry of <c>StockLearningEntities.Stocks</c> and runs a
/// single-threaded spider over them using <c>StockJJRPageProcessor</c> and
/// <c>StockJJRPipeline</c>.
/// </summary>
public void Run()
{
    // Site object carrying the crawl configuration (encoding, headers, cookies, proxy, ...).
    // Defaults are used here.
    var site = new Site();

    // Add the start/feed URLs. The context is only needed while the stock list is read,
    // so it is released as soon as the URLs have been queued.
    // NOTE(review): assumes StockLearningEntities implements IDisposable (the using
    // statement requires it) - confirm against its declaration.
    using (var context = new StockLearningEntities())
    {
        foreach (var stock in context.Stocks.ToList())
        {
            // Ids starting with "0" or "3" use the "sz" URL prefix, everything else "sh".
            // Ordinal comparison: stock ids are identifiers, not linguistic text.
            bool usesSzPrefix = stock.StockId.StartsWith("0", System.StringComparison.Ordinal)
                || stock.StockId.StartsWith("3", System.StringComparison.Ordinal);
            string range = usesSzPrefix ? "sz" : "sh";

            site.AddStartUrl($"http://f9.eastmoney.com/{range}{stock.StockId}.html");
        }
    }

    DotnetSpider.Core.Spider spider = DotnetSpider.Core.Spider.Create(
            site,
            // Scheduler type used for pending requests.
            new QueueDuplicateRemovedScheduler(),
            // Custom page processor.
            new StockJJRPageProcessor())
        // Custom pipeline.
        .AddPipeline(new StockJJRPipeline());

    spider.Downloader = new HttpClientDownloader();
    spider.ThreadNum = 1;
    spider.EmptySleepTime = 3000;

    // Start the crawler.
    spider.Run();
}
/// <summary>
/// Creates a spider with <c>YoukuPageProcessor</c> and <c>YoukuPipeline</c>, queues the
/// category list pages 1 through 4 and runs the crawl.
/// </summary>
public static void Run()
{
    // Scheduler, custom processor and custom pipeline are wired in one fluent chain.
    DotnetSpider.Core.Spider spider = DotnetSpider.Core.Spider
        .Create(new QueueDuplicateRemovedScheduler(), new YoukuPageProcessor())
        .AddPipeline(new YoukuPipeline());

    spider.EncodingName = "UTF-8";

    // Add the start/feed URLs: one request per list page.
    int page = 1;
    while (page < 5)
    {
        spider.AddRequests($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{page}.html");
        ++page;
    }

    // Start the crawler.
    spider.Run();
}
/// <summary>
/// Crawls the URL currently entered in <c>tb_url</c> with <c>TouTiaoPageProcessor</c>
/// and <c>TotiaoPipeline</c> on a single thread.
/// </summary>
private void CustmizeProcessorAndPipeline()
{
    // Site configuration: UTF-8 encoding, RemoveOutboundLinks switched on.
    var site = new Site();
    site.EncodingName = "UTF-8";
    site.RemoveOutboundLinks = true;

    // The only start URL is whatever the text box holds right now.
    site.AddStartUrl(this.tb_url.Text);

    var scheduler = new QueueDuplicateRemovedScheduler();
    var processor = new TouTiaoPageProcessor();
    DotnetSpider.Core.Spider spider = DotnetSpider.Core.Spider.Create(site, scheduler, processor);

    spider.AddPipeline(new TotiaoPipeline());
    spider.Downloader = new HttpClientDownloader();
    spider.ThreadNum = 1;
    spider.EmptySleepTime = 3000;

    // Start the crawler.
    spider.Run();
}
/// <summary>
/// Crawls the eastmoney stock list page with <c>StockListPageProcessor</c> and
/// <c>StockListPipeline</c> on a single thread.
/// </summary>
public void Run()
{
    // Site object carrying the crawl configuration (encoding, headers, cookies, proxy, ...).
    // Defaults are used here.
    var site = new Site();

    // Add the start/feed URL.
    site.AddStartUrl("http://quote.eastmoney.com/stocklist.html");

    // Scheduler and custom processor first, then the custom pipeline.
    var scheduler = new QueueDuplicateRemovedScheduler();
    var processor = new StockListPageProcessor();
    DotnetSpider.Core.Spider spider = DotnetSpider.Core.Spider.Create(site, scheduler, processor);
    spider = spider.AddPipeline(new StockListPipeline());

    spider.Downloader = new HttpClientDownloader();
    spider.ThreadNum = 1;
    spider.EmptySleepTime = 3000;

    // Start the crawler.
    spider.Run();
}
/// <summary>
/// Command-line entry point: parses the arguments, discovers spider types by reflection,
/// configures the spider selected with <c>-s</c> and invokes its <c>Run</c> method.
/// </summary>
/// <param name="args">
/// Arguments of the form <c>key:value</c> or a bare <c>key</c>. <c>-s</c> (spider name) and
/// <c>-tid</c> (task id) are required; <c>-i</c> (identity) and <c>-a</c> (comma-separated
/// arguments forwarded to the spider) are optional. The value <c>guid</c> (any casing) for
/// <c>-i</c> or <c>-tid</c> is replaced by a freshly generated GUID. An argument containing
/// more than one ':' is rejected with a usage message.
/// </param>
/// <remarks>
/// All failures detected here (bad argument shape, missing required keys, duplicate or
/// unknown spider names, no spiders found) are reported on the console and the method returns
/// without running anything.
/// </remarks>
public static void Run(params string[] args)
{
    Console.WriteLine("");
    Spider.PrintInfo();
    Console.WriteLine("");
    Console.ForegroundColor = ConsoleColor.Cyan;
    var commands = string.Join(" ", args);
    Console.WriteLine("Args: " + commands);
    Console.WriteLine("");
    Console.ForegroundColor = ConsoleColor.White;

#if NET_CORE
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif

    // ---- Parse "key:value" / "key" arguments ------------------------------------------
    var arguments = new Dictionary<string, string>();
    foreach (var arg in args)
    {
        var results = arg.Split(':');
        if (results.Length == 2)
        {
            // A later occurrence of the same key overrides the earlier value.
            arguments[results[0].Trim()] = results[1].Trim();
        }
        else if (results.Length == 1)
        {
            // A bare key never overrides a value that was already supplied.
            var key = results[0].Trim();
            if (!arguments.ContainsKey(key))
            {
                arguments.Add(key, string.Empty);
            }
        }
        else
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine("Please use command like: -s:[spider type name] -i:[identity] -a:[arg1,arg2...] -tid:[taskId]");
            Console.ForegroundColor = ConsoleColor.White;
            return;
        }
    }

    if (!arguments.ContainsKey("-s") || !arguments.ContainsKey("-tid"))
    {
        Console.ForegroundColor = ConsoleColor.Red;
        Console.WriteLine("-s or -tid are necessary.");
        Console.ForegroundColor = ConsoleColor.White;
        return;
    }

    string spiderName = arguments["-s"];

    // ---- Discover spiders ---------------------------------------------------------------
#if NET_CORE
    var deps = DependencyContext.Default;
#endif
    var spiders = new Dictionary<string, object>();

    // The two build flavours only differ in how candidate assemblies are located; the
    // loop body after #endif is shared, which is why the opening brace sits inside the #if.
#if NET_CORE
    foreach (var library in deps.CompileLibraries.Where(l =>
        l.Name.EndsWith("dotnetspider.sample", StringComparison.OrdinalIgnoreCase)
        || l.Name.EndsWith("spiders", StringComparison.OrdinalIgnoreCase)
        || l.Name.EndsWith("crawlers", StringComparison.OrdinalIgnoreCase)))
    {
        var asm = Assembly.Load(new AssemblyName(library.Name));
        var types = asm.GetTypes();
#else
    foreach (var file in DetectDlls())
    {
        var asm = Assembly.LoadFrom(file);
        var types = asm.GetTypes();
#endif
        Console.WriteLine($"Fetch assembly: {asm.FullName}.");

        foreach (var type in types)
        {
            // Only types that can be instantiated without arguments are candidates.
            bool hasNonParametersConstructor =
                type.GetConstructors().Any(c => c.IsPublic && c.GetParameters().Length == 0);
            if (!hasNonParametersConstructor)
            {
                continue;
            }

            // Interfaces are matched by full name rather than by Type identity.
            var interfaces = type.GetInterfaces();
            var isNamed = interfaces.Any(t => t.FullName == "DotnetSpider.Core.INamed");
            var isIdentity = interfaces.Any(t => t.FullName == "DotnetSpider.Core.IIdentity");
            var isRunnable = interfaces.Any(t => t.FullName == "DotnetSpider.Core.IRunable");
            if (!(isNamed && isRunnable && isIdentity))
            {
                continue;
            }

            var nameProperty = type.GetProperties().First(p => p.Name == "Name");
            object runner = Activator.CreateInstance(type);
            var name = (string)nameProperty.GetValue(runner);

            if (spiders.ContainsKey(name))
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine();
                Console.WriteLine($"Spider {name} are duplicate.");
                Console.WriteLine();
                Console.ForegroundColor = ConsoleColor.White;
                return;
            }

            spiders.Add(name, runner);
        }
    }

    if (spiders.Count == 0)
    {
        Console.ForegroundColor = ConsoleColor.DarkYellow;
        Console.WriteLine();
        Console.WriteLine("Did not detect any spider.");
        Console.WriteLine();
        Console.ForegroundColor = ConsoleColor.White;
        return;
    }

    Console.WriteLine();
    Console.ForegroundColor = ConsoleColor.Cyan;
    Console.WriteLine($"Detected {spiders.Keys.Count} crawlers.");
    Console.ForegroundColor = ConsoleColor.White;
    Console.WriteLine();
    Console.WriteLine("=================================================================");
    Console.WriteLine();

    if (!spiders.ContainsKey(spiderName))
    {
        Console.WriteLine($"There is no spider named: {spiderName}.");
        return;
    }

    var spider = spiders[spiderName];
    var spiderType = spider.GetType();

    // ---- Apply -i (identity) --------------------------------------------------------------
    if (arguments.ContainsKey("-i"))
    {
        var identityProperty = spiderType.GetProperties().First(p => p.Name == "Identity");
        var identityArg = arguments["-i"];

        // Ordinal, case-insensitive match: "guid" is a keyword, not linguistic text, so it
        // must not depend on the current culture's casing rules.
        if (string.Equals(identityArg, "guid", StringComparison.OrdinalIgnoreCase))
        {
            identityProperty.SetValue(spider, Guid.NewGuid().ToString("N"));
        }
        else if (!string.IsNullOrEmpty(identityArg))
        {
            identityProperty.SetValue(spider, identityArg);
        }
    }

    // ---- Apply -tid (task id) -------------------------------------------------------------
    // "-tid" is guaranteed to be present by the required-argument check above.
    var taskIdProperty = spiderType.GetProperties().First(p => p.Name == "TaskId");
    var taskIdArg = arguments["-tid"];
    if (string.Equals(taskIdArg, "guid", StringComparison.OrdinalIgnoreCase))
    {
        taskIdProperty.SetValue(spider, Guid.NewGuid().ToString("N"));
    }
    else if (!string.IsNullOrEmpty(taskIdArg))
    {
        // Guarded on the task id itself. The previous code tested the identity value here,
        // so an explicit task id was silently ignored unless -i was also supplied.
        taskIdProperty.SetValue(spider, taskIdArg);
    }

    // ---- Run ------------------------------------------------------------------------------
    var method = spiderType.GetMethod("Run");
    if (!arguments.ContainsKey("-a"))
    {
        method.Invoke(spider, new object[] { new string[] { } });
        return;
    }

    var parameters = arguments["-a"].Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
    if (parameters.Contains("report"))
    {
        // When "report" is requested and the spider exposes EmptySleepTime, set it to 1000.
        var emptySleepTime = spiderType.GetProperty("EmptySleepTime");
        if (emptySleepTime != null)
        {
            emptySleepTime.SetValue(spider, 1000);
        }
    }

    method.Invoke(spider, new object[] { parameters });
}