Пример #1
0
        /// <summary>
        /// Performs the one-time initialisation of the spider. In order: initialises the scheduler,
        /// assigns <c>_errorRequestFile</c>, subscribes to <see cref="Console.CancelKeyPress"/>, initialises
        /// every pipeline and hands the site's start requests to the scheduler. Once <c>_init</c> is set,
        /// further calls return immediately.
        /// </summary>
        /// <exception cref="SpiderException">Thrown when <c>Pipelines</c> is null or empty.</exception>
        public void InitComponent()
        {
            if (_init)
            {
                return;
            }

            bool hasPipelines = Pipelines != null && Pipelines.Count != 0;
            if (!hasPipelines)
            {
                throw new SpiderException("Pipelines should not be null.");
            }

            Scheduler.Init(this);

            // The base directory comes from a different API depending on the build target.
#if NET_CORE
            _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppContext.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#else
            _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
#endif
            Console.CancelKeyPress += ConsoleCancelKeyPress;

            foreach (var configuredPipeline in Pipelines)
            {
                configuredPipeline.InitPipeline(this);
            }

            bool hasStartRequests = Site.StartRequests != null && Site.StartRequests.Count > 0;
            if (!hasStartRequests)
            {
                this.Log("[步骤 1] 添加链接到调度中心, 数量: 0.", LogLevel.Info);
            }
            else
            {
                this.Log($"[步骤 1] 添加链接到调度中心, 数量: {Site.StartRequests.Count}.", LogLevel.Info);

                bool pushIndividually = Scheduler is QueueDuplicateRemovedScheduler || Scheduler is PriorityScheduler;
                if (pushIndividually)
                {
                    // These two scheduler types receive the requests one by one, at most four at a time.
                    var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = 4 };
                    Parallel.ForEach(Site.StartRequests, parallelOptions, startRequest => Scheduler.Push(startRequest));
                }
                else
                {
                    // Any other scheduler gets the whole set in one call, after which the start requests are cleared.
                    Scheduler.Load(new HashSet<Request>(Site.StartRequests));
                    ClearStartRequests();
                }
            }

            _waitCountLimit = EmptySleepTime / WaitInterval;

            // Keep the sleep range consistent: the maximum may not be below the minimum.
            if (Site.MaxSleepTime < Site.MinSleepTime)
            {
                Site.MaxSleepTime = Site.MinSleepTime;
            }

            _init = true;
        }
Пример #2
0
 /// <summary>
 /// Downloads the resource addressed by <paramref name="value"/> into the "images" folder under
 /// <c>SpiderEnviroment.GlobalDirectory</c>, unless a file with the same name already exists there.
 /// </summary>
 /// <param name="value">Address of the resource; its file-name part is used as the local file name.</param>
 /// <returns>The unchanged <paramref name="value"/>.</returns>
 /// <remarks>Any exception is logged at error level and then rethrown.</remarks>
 public override string Formate(string value)
 {
     try
     {
         var name = Path.GetFileName(value);
         if (name == null)
         {
             return value;
         }

         string file = Path.Combine(SpiderEnviroment.GlobalDirectory, "images", name);

         // Look for the local copy first so that an already stored file costs no network round trip.
         if (File.Exists(file))
         {
             return value;
         }

         // This override is synchronous, so the download is waited for by blocking on the task.
         var fileData = Client.GetByteArrayAsync(value).Result;

         // The using block closes the stream even when the write fails; the buffer is written in one call.
         using (var stream = BasePipeline.PrepareFile(file).OpenWrite())
         {
             stream.Write(fileData, 0, fileData.Length);
         }

         return value;
     }
     catch (Exception e)
     {
         Logger.SaveLog(LogInfo.Create($"Download file: {value} failed.", Logger.Name, null, LogLevel.Error, e));
         throw;
     }
 }
Пример #3
0
        /// <summary>
        /// Starts a download of the resource addressed by <paramref name="value"/> and, in a continuation,
        /// stores it in the "images" folder under <c>SpiderConsts.GlobalDirectory</c> when no file of that
        /// name exists yet.
        /// </summary>
        /// <param name="value">Address of the resource to download.</param>
        /// <returns>
        /// The file-name part of <paramref name="value"/>. The continuation is not awaited, so the file may
        /// not have been written yet when the method returns.
        /// </returns>
        protected override dynamic FormateValue(dynamic value)
        {
            var name = Path.GetFileName(value);

            if (name != null)
            {
                Task<byte[]> task = Client.GetByteArrayAsync(value);
                task.ContinueWith(t =>
                {
                    if (t.Exception != null)
                    {
                        // NOTE(review): the task returned by ContinueWith is discarded, so this exception
                        // only faults that unobserved task; consider logging the failure instead.
                        throw t.Exception;
                    }
                    var fileData = t.Result;
                    string file = Path.Combine(SpiderConsts.GlobalDirectory, "images", name);
                    if (!File.Exists(file))
                    {
                        // The using block closes the stream even when the write fails; the buffer is
                        // written in one call instead of byte by byte.
                        using (var stream = BasePipeline.PrepareFile(file).OpenWrite())
                        {
                            stream.Write(fileData, 0, fileData.Length);
                        }
                    }
                });
            }
            return name;
        }
Пример #4
0
        /// <summary>
        /// Entry point: builds the pipeline chain, starts an <see cref="HttpListener"/> on three loopback
        /// prefixes and queues every accepted request to the thread pool. The accept loop never ends.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Run the pipeline registrations supplied in bulk by the developer.
            PipelineConfig.Register();
            // Register WebPipeline as the last pipeline.
            PipelineBuilder.Bind<WebPipeline>();
            // Build the pipeline chain and remember its entry point.
            _entrancePipeline = PipelineBuilder.Build();

            // A web container based on HttpListener, which in turn relies on http.sys to listen for
            // the HTTP requests arriving at this server.
            var listener = new HttpListener();
            string[] prefixes =
            {
                "http://127.0.0.1:20001/",
                "http://127.0.0.1:20002/",
                "http://127.0.0.1:20003/"
            };

            foreach (string prefix in prefixes)
            {
                listener.Prefixes.Add(prefix);
            }

            listener.Start();

            Console.WriteLine("HttpListener is started...");

            for (;;)
            {
                // EndGetContext blocks until the pending accept completes, so this pair waits for one request.
                IAsyncResult pendingAccept = listener.BeginGetContext(null, null);
                HttpListenerContext incoming = listener.EndGetContext(pendingAccept);
                ThreadPool.QueueUserWorkItem(ProcessRequest, incoming);
            }
        }
Пример #5
0
        /// <summary>
        /// Starts a download of the resource addressed by <paramref name="value"/> and, in a continuation,
        /// stores it in the "images" folder under <c>SpiderEnviroment.GlobalDirectory</c> when no file of
        /// that name exists yet. A failed download is logged as a warning and otherwise ignored.
        /// </summary>
        /// <param name="value">Address of the resource to download.</param>
        /// <returns>
        /// The file-name part of <paramref name="value"/>. The continuation is not awaited, so the file may
        /// not have been written yet when the method returns.
        /// </returns>
        protected override dynamic FormateValue(dynamic value)
        {
            var name = Path.GetFileName(value);

            if (name != null)
            {
                Task<byte[]> task = Client.GetByteArrayAsync(value);
                task.ContinueWith(t =>
                {
                    if (t.Exception != null)
                    {
                        Logger.SaveLog(LogInfo.Create($"下载文件: {value} 失败.", Logger.Name, null, LogLevel.Warn, t.Exception));
                        return;
                    }
                    var fileData = t.Result;
                    string file = Path.Combine(SpiderEnviroment.GlobalDirectory, "images", name);
                    if (!File.Exists(file))
                    {
                        // The using block closes the stream even when the write fails; the buffer is
                        // written in one call instead of byte by byte.
                        using (var stream = BasePipeline.PrepareFile(file).OpenWrite())
                        {
                            stream.Write(fileData, 0, fileData.Length);
                        }
                    }
                });
            }
            return name;
        }
Пример #6
0
        /// <summary>
        /// Runs the base initialisation, points <c>DataFolder</c> at "&lt;base directory&gt;/&lt;spider identity&gt;/mysql"
        /// and opens an auto-flushing appending writer on "&lt;database&gt;.&lt;table&gt;.data" inside that folder.
        /// </summary>
        /// <param name="spider">Spider whose <c>Identity</c> names the sub-folder.</param>
        public override void InitPipeline(ISpider spider)
        {
            base.InitPipeline(spider);

            // The base directory comes from a different API depending on the build target.
            string baseDirectory;
#if NET_CORE
            baseDirectory = AppContext.BaseDirectory;
#else
            baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
#endif
            DataFolder = Path.Combine(baseDirectory, spider.Identity, "mysql");

            string dataFile = Path.Combine(DataFolder, $"{Schema.Database}.{Schema.TableName}.data");
            Writer = BasePipeline.PrepareFile(dataFile).AppendText();
            Writer.AutoFlush = true;
        }
Пример #7
0
        /// <summary>
        /// Creates the pipeline instances and links them into a chain. (The linking idea is borrowed from
        /// Web API's DelegatingHandler and OWIN's OwinMiddleware; it can be read as an application of the
        /// decorator pattern.)
        /// </summary>
        /// <returns>
        /// The first pipeline of the chain, or <c>null</c> when no pipeline type has been registered.
        /// </returns>
        public static BasePipeline Build()
        {
            BasePipeline nextPipe = null;

            // Walk the registered types from last to first so each instance receives its successor.
            // The list is read backwards rather than reversed in place: reversing the shared list would
            // flip the order again on every further Build() call.
            // NOTE(review): assumes _pipeTypes is an indexable list (Count + indexer) - confirm.
            for (int i = _pipeTypes.Count - 1; i >= 0; i--)
            {
                nextPipe = (BasePipeline)Activator.CreateInstance(_pipeTypes[i], nextPipe);
            }
            return nextPipe;
        }
Пример #8
0
        /// <summary>
        /// Create a spider with pageProcessor.
        /// </summary>
        /// <param name="site">Site definition; must not be null. Its start requests are taken over.</param>
        /// <param name="identity">
        /// Task identity. When null or whitespace, the site's domain is used, or a new GUID when the
        /// domain is null or empty.
        /// </param>
        /// <param name="userid">User id; "DotnetSpider" is used when null or empty.</param>
        /// <param name="taskGroup">Task group; "DotnetSpider" is used when null or empty.</param>
        /// <param name="pageProcessor">Page processor; must not be null.</param>
        /// <param name="scheduler">Scheduler; a <c>QueueDuplicateRemovedScheduler</c> is created when null.</param>
        /// <exception cref="SpiderException">
        /// Thrown when <paramref name="pageProcessor"/> or <paramref name="site"/> is null.
        /// </exception>
        protected Spider(Site site, string identity, string userid, string taskGroup, IPageProcessor pageProcessor, IScheduler scheduler)
        {
            // Validate up front: the identity fallback below reads the site's domain, which previously
            // happened before the site was assigned or checked.
            if (pageProcessor == null)
            {
                throw new SpiderException("PageProcessor should not be null.");
            }
            if (site == null)
            {
                throw new SpiderException("Site should not be null.");
            }

            if (string.IsNullOrWhiteSpace(identity))
            {
                // Read the domain from the argument; the Site member has not been assigned yet.
                Identity = string.IsNullOrEmpty(site.Domain) ? Guid.NewGuid().ToString() : site.Domain;
            }
            else
            {
                Identity = identity;
            }

            UserId    = string.IsNullOrEmpty(userid) ? "DotnetSpider" : userid;
            TaskGroup = string.IsNullOrEmpty(taskGroup) ? "DotnetSpider" : taskGroup;
            Logger    = new Logger(Identity, UserId, TaskGroup);

            _waitCount = 0;
            PageProcessor      = pageProcessor;
            Site               = site;
            PageProcessor.Site = site;
            StartRequests      = Site.StartRequests;
            Scheduler          = scheduler ?? new QueueDuplicateRemovedScheduler();
            Scheduler.Init(this);

            // The base directory comes from a different API depending on the build target.
#if !NET_CORE
            _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
#else
            _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppContext.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
        }
Пример #9
0
        /// <summary>
        /// Performs the one-time initialisation of the spider. In order: resolves the monitors (falling
        /// back to a single <c>NLogMonitor</c>), initialises the scheduler, starts the status-reporting
        /// task, assigns <c>_errorRequestFile</c>, subscribes to <see cref="Console.CancelKeyPress"/>,
        /// initialises every pipeline and hands the site's start requests to the scheduler. Once
        /// <c>_init</c> is set, further calls return immediately.
        /// </summary>
        /// <exception cref="SpiderException">Thrown when <c>Pipelines</c> is null or empty.</exception>
        public void InitComponent()
        {
            if (_init)
            {
                return;
            }

            if (Pipelines == null || Pipelines.Count == 0)
            {
                throw new SpiderException("Pipelines should not be null.");
            }

            _monitors = IocManager.GetServices<IMonitor>().ToList();
            if (_monitors.Count == 0)
            {
                _monitors = new List<IMonitor> {
                    new NLogMonitor()
                };
            }

            Scheduler.Init(this);

            // The reporting loop sleeps and polls until the scheduler has exited, so it is started as a
            // long-running task rather than occupying a thread-pool worker for that whole time.
            _monitorTask = Task.Factory.StartNew(() =>
            {
                var monitor = GetMonitor();
                while (!_scheduler.IsExited)
                {
                    ReportStatus();

                    Thread.Sleep(2000);
                }
                monitor.IsExited = true;
                // One last report after the exit flag has been set on the monitor.
                ReportStatus();
            }, TaskCreationOptions.LongRunning);

            // The base directory comes from a different API depending on the build target.
#if !NET_CORE
            _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
#else
            _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppContext.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
            Console.CancelKeyPress += ConsoleCancelKeyPress;

            foreach (var pipeline in Pipelines)
            {
                pipeline.InitPipeline(this);
            }

            if (Site.StartRequests != null && Site.StartRequests.Count > 0)
            {
                this.Log($"准备步骤: 添加链接到调度中心, 数量 {Site.StartRequests.Count}.", LogLevel.Info);
                if ((Scheduler is QueueDuplicateRemovedScheduler) || (Scheduler is PriorityScheduler))
                {
                    // These two scheduler types receive the requests one by one, at most four at a time.
                    Parallel.ForEach(Site.StartRequests, new ParallelOptions()
                    {
                        MaxDegreeOfParallelism = 4
                    }, request =>
                    {
                        Scheduler.Push(request);
                    });
                }
                else
                {
                    // Any other scheduler gets the whole set in one call, after which the start requests are cleared.
                    Scheduler.Import(new HashSet<Request>(Site.StartRequests));
                    ClearStartRequests();
                }
            }
            else
            {
                this.Log("准备步骤: 添加链接到调度中心, 数量 0.", LogLevel.Info);
            }

            _waitCountLimit = EmptySleepTime / WaitInterval;

            // Keep the sleep range consistent: the maximum may not be below the minimum.
            if (Site.MinSleepTime > Site.MaxSleepTime)
            {
                Site.MaxSleepTime = Site.MinSleepTime;
            }

            _init = true;
        }
 /// <summary>
 /// Creates the pipeline and forwards <paramref name="basePipeline"/> to the base-class constructor.
 /// </summary>
 /// <param name="basePipeline">Pipeline instance handed to the base constructor.</param>
 public AuthenticationPipeline(BasePipeline basePipeline)
     : base(basePipeline)
 {
 }
Пример #11
0
        /// <summary>
        /// Performs the one-time initialisation of the spider. In order: calls <c>PreInitComponent</c>,
        /// resolves the monitor (falling back to <c>NLogMonitor</c>), injects cookies when a
        /// <c>CookieInjector</c> is set, initialises the scheduler, starts the status-reporting task,
        /// assigns <c>_errorRequestFile</c>, subscribes to <see cref="Console.CancelKeyPress"/>,
        /// initialises every pipeline, hands the site's start requests to the scheduler and finally calls
        /// <c>AfterInitComponent</c>. Once <c>_init</c> is set, further calls return immediately.
        /// </summary>
        /// <param name="arguments">Passed through to <c>PreInitComponent</c> and <c>AfterInitComponent</c>.</param>
        /// <exception cref="SpiderException">Thrown when <c>Pipelines</c> is null or empty.</exception>
        protected virtual void InitComponent(params string[] arguments)
        {
            if (_init)
            {
                return;
            }

            this.Log("构建内部模块、准备爬虫数据...", LogLevel.Info);

            if (Pipelines == null || Pipelines.Count == 0)
            {
                throw new SpiderException("Pipelines should not be null.");
            }

            PreInitComponent(arguments);

            _monitor = IocManager.Resolve<IMonitor>() ?? new NLogMonitor();

            if (CookieInjector != null)
            {
                CookieInjector.Inject(this, false);
            }

            Scheduler.Init(this);

            // The reporting loop sleeps and polls until Monitorable.IsExited is set, so it is started as a
            // long-running task rather than occupying a thread-pool worker for that whole time.
            _monitorTask = Task.Factory.StartNew(() =>
            {
                while (!Monitorable.IsExited)
                {
                    ReportStatus();
                    Thread.Sleep(2000);
                }
                // One last report after the exit flag has been observed.
                ReportStatus();
            }, TaskCreationOptions.LongRunning);

            // The base directory comes from a different API depending on the build target.
#if !NET_CORE
            _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
#else
            _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppContext.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
            Console.CancelKeyPress += ConsoleCancelKeyPress;

            foreach (var pipeline in Pipelines)
            {
                pipeline.InitPipeline(this);
            }

            if (Site.StartRequests != null && Site.StartRequests.Count > 0)
            {
                this.Log($"准备步骤: 添加链接到调度中心, 数量 {Site.StartRequests.Count}.", LogLevel.Info);
                if ((Scheduler is QueueDuplicateRemovedScheduler) || (Scheduler is PriorityScheduler))
                {
                    // These two scheduler types receive the requests one by one, at most four at a time.
                    Parallel.ForEach(Site.StartRequests, new ParallelOptions()
                    {
                        MaxDegreeOfParallelism = 4
                    }, request =>
                    {
                        Scheduler.Push(request);
                    });
                }
                else
                {
                    // Any other scheduler gets the whole set in one call, after which the start requests are cleared.
                    Scheduler.Import(new HashSet<Request>(Site.StartRequests));
                    ClearStartRequests();
                }
            }
            else
            {
                this.Log("准备步骤: 添加链接到调度中心, 数量 0.", LogLevel.Info);
            }

            _waitCountLimit = EmptySleepTime / WaitInterval;

            AfterInitComponent(arguments);

            _init = true;
        }
 /// <summary>
 /// Creates the pipeline and forwards <paramref name="basePipeline"/> to the base-class constructor.
 /// </summary>
 /// <param name="basePipeline">Pipeline instance handed to the base constructor.</param>
 public ModelValidPipeline(BasePipeline basePipeline)
     : base(basePipeline)
 {
 }