Example #1
0
        /// <summary>
        /// Queues one eastmoney F9 page per stock found in <c>StockLearningEntities.Stocks</c>
        /// and crawls them with <c>StockJJRPageProcessor</c> / <c>StockJJRPipeline</c>.
        /// </summary>
        public void Run()
        {
            // Site carries request-level configuration (encoding, headers, cookies, proxy...).
            // Defaults are used here.
            var site = new Site();

            // Add one start URL per stock. The context is released as soon as the URLs are queued
            // instead of being left for the finalizer.
            // NOTE(review): assumes StockLearningEntities is IDisposable (it looks like an
            // Entity Framework context) - confirm.
            using (var context = new StockLearningEntities())
            {
                foreach (var stock in context.Stocks.ToList())
                {
                    // Ids starting with "0" or "3" get the "sz" prefix; everything else gets "sh".
                    bool useSz = stock.StockId.StartsWith("0") || stock.StockId.StartsWith("3");
                    string range = useSz ? "sz" : "sh";

                    site.AddStartUrl($"http://f9.eastmoney.com/{range}{stock.StockId}.html");
                }
            }

            DotnetSpider.Core.Spider spider = DotnetSpider.Core.Spider.Create(site,
                                                                              // In-memory scheduler that removes duplicate requests.
                                                                              new QueueDuplicateRemovedScheduler(),
                                                                              // Custom page processor.
                                                                              new StockJJRPageProcessor())
                                              // Custom pipeline.
                                              .AddPipeline(new StockJJRPipeline());
            spider.Downloader     = new HttpClientDownloader();
            spider.ThreadNum      = 1;
            spider.EmptySleepTime = 3000;

            // Start the crawler.
            spider.Run();
        }
Example #2
0
 /// <summary>
 /// Crawls pages 1 through 4 of the youku category listing using
 /// <c>YoukuPageProcessor</c> and <c>YoukuPipeline</c>.
 /// </summary>
 public static void Run()
 {
     // In-memory scheduler that removes duplicate requests, plus the youku-specific processor.
     var scheduler = new QueueDuplicateRemovedScheduler();
     var processor = new YoukuPageProcessor();

     // The youku-specific pipeline is attached right after the spider is created.
     DotnetSpider.Core.Spider spider = DotnetSpider.Core.Spider
         .Create(scheduler, processor)
         .AddPipeline(new YoukuPipeline());

     spider.EncodingName = "UTF-8";

     // Add the start/feed URLs: one listing page per page number.
     for (int page = 1; page <= 4; page++)
     {
         spider.AddRequests($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{page}.html");
     }

     // Start the crawler.
     spider.Run();
 }
Example #3
0
        /// <summary>
        /// Crawls the URL currently entered in <c>tb_url</c> using
        /// <c>TouTiaoPageProcessor</c> and <c>TotiaoPipeline</c>.
        /// </summary>
        private void CustmizeProcessorAndPipeline()
        {
            // Site-level settings: UTF-8 encoding, outbound links removed.
            var site = new Site();
            site.EncodingName        = "UTF-8";
            site.RemoveOutboundLinks = true;

            // The single start URL comes from the text box.
            site.AddStartUrl(this.tb_url.Text);

            var scheduler = new QueueDuplicateRemovedScheduler();
            var processor = new TouTiaoPageProcessor();
            DotnetSpider.Core.Spider spider = DotnetSpider.Core.Spider.Create(site, scheduler, processor);

            spider.AddPipeline(new TotiaoPipeline());
            spider.Downloader     = new HttpClientDownloader();
            spider.ThreadNum      = 1;
            spider.EmptySleepTime = 3000;

            // Start the crawler.
            spider.Run();
        }
        /// <summary>
        /// Crawls the eastmoney stock list page using <c>StockListPageProcessor</c>
        /// and <c>StockListPipeline</c>.
        /// </summary>
        public void Run()
        {
            // Site carries request-level configuration (encoding, headers, cookies, proxy...).
            // Defaults are used here.
            var site = new Site();

            // The only start/feed URL.
            site.AddStartUrl("http://quote.eastmoney.com/stocklist.html");

            // In-memory scheduler that removes duplicate requests, plus the custom processor.
            var scheduler = new QueueDuplicateRemovedScheduler();
            var processor = new StockListPageProcessor();

            // The custom pipeline is attached right after the spider is created.
            DotnetSpider.Core.Spider spider = DotnetSpider.Core.Spider
                .Create(site, scheduler, processor)
                .AddPipeline(new StockListPipeline());

            spider.Downloader     = new HttpClientDownloader();
            spider.ThreadNum      = 1;
            spider.EmptySleepTime = 3000;

            // Start the crawler.
            spider.Run();
        }
Example #5
0
        /// <summary>
        /// Command-line entry point: parses the arguments, discovers spider types by reflection,
        /// configures the spider selected with <c>-s</c> and invokes its <c>Run</c> method.
        /// </summary>
        /// <param name="args">
        /// Arguments of the form <c>key:value</c> or a bare <c>key</c>. <c>-s</c> (spider name) and
        /// <c>-tid</c> (task id) are required; <c>-i</c> (identity) and <c>-a</c> (comma-separated
        /// arguments passed on to the spider) are optional. For <c>-i</c> and <c>-tid</c> the value
        /// <c>guid</c> (any casing) generates a new GUID.
        /// </param>
        public static void Run(params string[] args)
        {
            Console.WriteLine("");
            Spider.PrintInfo();
            Console.WriteLine("");
            Console.ForegroundColor = ConsoleColor.Cyan;
            var commands = string.Join(" ", args);

            Console.WriteLine("Args: " + commands);
            Console.WriteLine("");
            Console.ForegroundColor = ConsoleColor.White;

#if NET_CORE
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
            // Parse "key:value" / "key" arguments into a dictionary.
            Dictionary <string, string> arguments = new Dictionary <string, string>();
            foreach (var arg in args)
            {
                var results = arg.Split(':');
                if (results.Length == 2)
                {
                    // A later "key:value" overrides an earlier entry for the same key.
                    arguments[results[0].Trim()] = results[1].Trim();
                }
                else if (results.Length == 1)
                {
                    // A bare switch never overwrites a value that was already supplied.
                    var key = results[0].Trim();
                    if (!arguments.ContainsKey(key))
                    {
                        arguments.Add(key, string.Empty);
                    }
                }
                else
                {
                    // More than one ':' in a single argument is rejected.
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine("Please use command like: -s:[spider type name] -i:[identity] -a:[arg1,arg2...] -tid:[taskId]");
                    Console.ForegroundColor = ConsoleColor.White;
                    return;
                }
            }

            string spiderName;
            string taskId;
            if (!arguments.TryGetValue("-s", out spiderName) || !arguments.TryGetValue("-tid", out taskId))
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine("-s or -tid are necessary.");
                Console.ForegroundColor = ConsoleColor.White;
                return;
            }

#if NET_CORE
            var deps = DependencyContext.Default;
#endif

            // Discover candidate spiders: name -> instance.
            var spiders = new Dictionary <string, object>();
#if NET_CORE
            // Ordinal, case-insensitive suffix match so the result does not depend on the current culture.
            foreach (var library in deps.CompileLibraries.Where(l => l.Name.EndsWith("dotnetspider.sample", StringComparison.OrdinalIgnoreCase) || l.Name.EndsWith("spiders", StringComparison.OrdinalIgnoreCase) || l.Name.EndsWith("crawlers", StringComparison.OrdinalIgnoreCase)))
            {
                var asm   = Assembly.Load(new AssemblyName(library.Name));
                var types = asm.GetTypes();
#else
            foreach (var file in DetectDlls())
            {
                var asm   = Assembly.LoadFrom(file);
                var types = asm.GetTypes();
#endif
                Console.WriteLine($"Fetch assembly: {asm.FullName}.");
                foreach (var type in types)
                {
                    // Only types that can be created without arguments are considered.
                    bool hasNonParametersConstructor = type.GetConstructors().Any(c => c.IsPublic && c.GetParameters().Length == 0);
                    if (!hasNonParametersConstructor)
                    {
                        continue;
                    }

                    // Interfaces are matched by full name rather than by type reference.
                    var interfaces = type.GetInterfaces();

                    var isNamed    = interfaces.Any(t => t.FullName == "DotnetSpider.Core.INamed");
                    var isIdentity = interfaces.Any(t => t.FullName == "DotnetSpider.Core.IIdentity");
                    var isRunnable = interfaces.Any(t => t.FullName == "DotnetSpider.Core.IRunable");

                    if (!(isNamed && isRunnable && isIdentity))
                    {
                        continue;
                    }

                    var    property = type.GetProperties().First(p => p.Name == "Name");
                    object runner   = Activator.CreateInstance(type);
                    var    name     = (string)property.GetValue(runner);
                    if (spiders.ContainsKey(name))
                    {
                        // Two spiders reporting the same name make the selection ambiguous: abort.
                        Console.ForegroundColor = ConsoleColor.Red;
                        Console.WriteLine();
                        Console.WriteLine($"Spider {name} are duplicate.");
                        Console.WriteLine();
                        Console.ForegroundColor = ConsoleColor.White;
                        return;
                    }

                    spiders.Add(name, runner);
                }
            }

            if (spiders.Count == 0)
            {
                Console.ForegroundColor = ConsoleColor.DarkYellow;
                Console.WriteLine();
                Console.WriteLine("Did not detect any spider.");
                Console.WriteLine();
                Console.ForegroundColor = ConsoleColor.White;
                return;
            }

            Console.WriteLine();
            Console.ForegroundColor = ConsoleColor.Cyan;
            Console.WriteLine($"Detected {spiders.Keys.Count} crawlers.");
            Console.ForegroundColor = ConsoleColor.White;
            Console.WriteLine();
            Console.WriteLine("=================================================================");
            Console.WriteLine();

            object spider;
            if (!spiders.TryGetValue(spiderName, out spider))
            {
                Console.WriteLine($"There is no spider named: {spiderName}.");
                return;
            }

            var spiderType = spider.GetType();

            // Optional identity: "guid" generates a fresh one, an empty value leaves the property untouched.
            string identity;
            if (arguments.TryGetValue("-i", out identity))
            {
                var property = spiderType.GetProperties().First(p => p.Name == "Identity");
                if (string.Equals(identity, "guid", StringComparison.OrdinalIgnoreCase))
                {
                    property.SetValue(spider, Guid.NewGuid().ToString("N"));
                }
                else if (!string.IsNullOrEmpty(identity))
                {
                    property.SetValue(spider, identity);
                }
            }

            // Task id (presence was validated above). The emptiness check is made against the
            // -tid value itself, so an explicit task id is applied even when -i is not supplied.
            var taskIdProperty = spiderType.GetProperties().First(p => p.Name == "TaskId");
            if (string.Equals(taskId, "guid", StringComparison.OrdinalIgnoreCase))
            {
                taskIdProperty.SetValue(spider, Guid.NewGuid().ToString("N"));
            }
            else if (!string.IsNullOrEmpty(taskId))
            {
                taskIdProperty.SetValue(spider, taskId);
            }

            var method = spiderType.GetMethod("Run");

            string rawParameters;
            if (!arguments.TryGetValue("-a", out rawParameters))
            {
                // No -a switch: invoke Run with an empty argument array.
                method.Invoke(spider, new object[] { new string[] { } });
                return;
            }

            var parameters = rawParameters.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
            if (parameters.Contains("report"))
            {
                // With the "report" argument, EmptySleepTime is set to 1000 when the spider exposes it.
                var emptySleepTime = spiderType.GetProperty("EmptySleepTime");
                if (emptySleepTime != null)
                {
                    emptySleepTime.SetValue(spider, 1000);
                }
            }

            method.Invoke(spider, new object[] { parameters });
        }