Esempio n. 1
0
        /// <summary>
        /// Extracts the file name from <paramref name="value"/> and starts a download of the resource
        /// into the "images" folder under <c>SpiderConsts.GlobalDirectory</c>.
        /// </summary>
        /// <param name="value">Address of the resource; it is handed to <c>Path.GetFileName</c> and <c>Client.GetByteArrayAsync</c>.</param>
        /// <returns>
        /// The file name part of <paramref name="value"/>. The download continuation is not awaited,
        /// so the file may not exist yet when this method returns.
        /// </returns>
        protected override dynamic FormateValue(dynamic value)
        {
            var name = Path.GetFileName(value);

            if (name != null)
            {
                Task<byte[]> task = Client.GetByteArrayAsync(value);
                task.ContinueWith(t =>
                {
                    if (t.Exception != null)
                    {
                        // Faults the continuation task; nothing in this method observes it.
                        throw t.Exception;
                    }

                    byte[] fileData = t.Result;
                    string file = Path.Combine(SpiderConsts.GlobalDirectory, "images", name);
                    if (!File.Exists(file))
                    {
                        // 'using' guarantees the handle is released even when the write fails,
                        // and disposing the stream flushes it.
                        using (var stream = BasePipeline.PrepareFile(file).OpenWrite())
                        {
                            // One bulk write instead of one WriteByte call per byte.
                            stream.Write(fileData, 0, fileData.Length);
                        }
                    }
                });
            }
            return name;
        }
Esempio n. 2
0
 /// <summary>
 /// Downloads the resource at <paramref name="value"/> into the "images" folder under
 /// <c>SpiderEnviroment.GlobalDirectory</c>, unless a file with the same name is already there.
 /// </summary>
 /// <param name="value">Address of the resource to download.</param>
 /// <returns><paramref name="value"/>, unchanged.</returns>
 /// <remarks>
 /// Any exception is logged at error level and rethrown. The download is waited on synchronously
 /// via <c>.Result</c>, so download failures surface wrapped in an <see cref="AggregateException"/>.
 /// </remarks>
 public override string Formate(string value)
 {
     try
     {
         var name = Path.GetFileName(value);
         if (name != null)
         {
             string file = Path.Combine(SpiderEnviroment.GlobalDirectory, "images", name);

             // Check for an existing file before touching the network: the payload would be
             // thrown away anyway when the file is already present.
             if (File.Exists(file))
             {
                 return value;
             }

             byte[] fileData = Client.GetByteArrayAsync(value).Result;

             // 'using' releases the handle even when the write throws; Dispose also flushes.
             using (var stream = BasePipeline.PrepareFile(file).OpenWrite())
             {
                 // One bulk write instead of one WriteByte call per byte.
                 stream.Write(fileData, 0, fileData.Length);
             }
         }
         return value;
     }
     catch (Exception e)
     {
         Logger.SaveLog(LogInfo.Create($"Download file: {value} failed.", Logger.Name, null, LogLevel.Error, e));
         throw;
     }
 }
Esempio n. 3
0
        /// <summary>
        /// Performs the one-time setup of the spider: initializes the scheduler, prepares the
        /// error-request file, hooks Ctrl+C handling, initializes every pipeline and hands the
        /// site's start requests to the scheduler. Subsequent calls return immediately.
        /// </summary>
        /// <exception cref="SpiderException">Thrown when <c>Pipelines</c> is null or empty.</exception>
        public void InitComponent()
        {
            // Setup already ran; nothing to do.
            if (_init)
            {
                return;
            }

            bool hasPipelines = Pipelines != null && Pipelines.Count != 0;
            if (!hasPipelines)
            {
                throw new SpiderException("Pipelines should not be null.");
            }

            Scheduler.Init(this);

            // Only the base directory differs between the two target frameworks.
#if !NET_CORE
            string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
#else
            string baseDirectory = AppContext.BaseDirectory;
#endif
            string errorFilePath = Path.Combine(baseDirectory, "ErrorRequests", Identity, "errors.txt");
            _errorRequestFile = BasePipeline.PrepareFile(errorFilePath);
#if NET_CORE
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
            Console.CancelKeyPress += ConsoleCancelKeyPress;

            foreach (var current in Pipelines)
            {
                current.InitPipeline(this);
            }

            bool noStartRequests = Site.StartRequests == null || Site.StartRequests.Count <= 0;
            if (noStartRequests)
            {
                this.Log("[步骤 1] 添加链接到调度中心, 数量: 0.", LogLevel.Info);
            }
            else
            {
                this.Log($"[步骤 1] 添加链接到调度中心, 数量: {Site.StartRequests.Count}.", LogLevel.Info);

                bool pushOneByOne = Scheduler is QueueDuplicateRemovedScheduler
                                    || Scheduler is PriorityScheduler;
                if (!pushOneByOne)
                {
                    // Other schedulers receive the whole set at once, then the start list is cleared.
                    Scheduler.Load(new HashSet<Request>(Site.StartRequests));
                    ClearStartRequests();
                }
                else
                {
                    // These two scheduler types get the requests pushed individually, in parallel.
                    var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = 4 };
                    Parallel.ForEach(Site.StartRequests, parallelOptions, request => Scheduler.Push(request));
                }
            }

            _waitCountLimit = EmptySleepTime / WaitInterval;

            // Keep the sleep range well-formed: the maximum is raised to the minimum if it is below it.
            if (Site.MaxSleepTime < Site.MinSleepTime)
            {
                Site.MaxSleepTime = Site.MinSleepTime;
            }

            _init = true;
        }
Esempio n. 4
0
        /// <summary>
        /// Extracts the file name from <paramref name="value"/> and starts a download of the resource
        /// into the "images" folder under <c>SpiderEnviroment.GlobalDirectory</c>.
        /// </summary>
        /// <param name="value">Address of the resource; it is handed to <c>Path.GetFileName</c> and <c>Client.GetByteArrayAsync</c>.</param>
        /// <returns>
        /// The file name part of <paramref name="value"/>. The download continuation is not awaited,
        /// so the file may not exist yet when this method returns.
        /// </returns>
        protected override dynamic FormateValue(dynamic value)
        {
            var name = Path.GetFileName(value);

            if (name != null)
            {
                Task<byte[]> task = Client.GetByteArrayAsync(value);
                task.ContinueWith(t =>
                {
                    if (t.Exception != null)
                    {
                        // A failed download is logged as a warning and otherwise ignored.
                        Logger.SaveLog(LogInfo.Create($"下载文件: {value} 失败.", Logger.Name, null, LogLevel.Warn, t.Exception));
                        return;
                    }

                    byte[] fileData = t.Result;
                    string file = Path.Combine(SpiderEnviroment.GlobalDirectory, "images", name);
                    if (!File.Exists(file))
                    {
                        // 'using' guarantees the handle is released even when the write fails,
                        // and disposing the stream flushes it.
                        using (var stream = BasePipeline.PrepareFile(file).OpenWrite())
                        {
                            // One bulk write instead of one WriteByte call per byte.
                            stream.Write(fileData, 0, fileData.Length);
                        }
                    }
                });
            }
            return name;
        }
Esempio n. 5
0
        /// <summary>
        /// Runs the base pipeline initialization, then points <c>DataFolder</c> at the
        /// "mysql" folder of the given spider and opens an auto-flushing, appending writer on the
        /// "{database}.{table}.data" file inside it.
        /// </summary>
        /// <param name="spider">Spider whose <c>Identity</c> names the per-spider folder.</param>
        public override void InitPipeline(ISpider spider)
        {
            base.InitPipeline(spider);

            // Only the base directory differs between the two target frameworks.
#if !NET_CORE
            string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
#else
            string baseDirectory = AppContext.BaseDirectory;
#endif
            DataFolder = Path.Combine(baseDirectory, spider.Identity, "mysql");

            string dataFilePath = Path.Combine(DataFolder, $"{Schema.Database}.{Schema.TableName}.data");
            var dataFile = BasePipeline.PrepareFile(dataFilePath);

            Writer = dataFile.AppendText();
            Writer.AutoFlush = true;
        }
Esempio n. 6
0
        /// <summary>
        /// Create a spider for a site with the given page processor and scheduler.
        /// </summary>
        /// <param name="site">Site to crawl; must not be null.</param>
        /// <param name="identity">
        /// Identity of the spider. When null or whitespace, the site's domain is used instead,
        /// or a new GUID when the domain is null or empty.
        /// </param>
        /// <param name="userid">User id; "DotnetSpider" is used when null or empty.</param>
        /// <param name="taskGroup">Task group; "DotnetSpider" is used when null or empty.</param>
        /// <param name="pageProcessor">Page processor; must not be null.</param>
        /// <param name="scheduler">Scheduler; a <c>QueueDuplicateRemovedScheduler</c> is created when null.</param>
        /// <exception cref="SpiderException">
        /// Thrown when <paramref name="pageProcessor"/> or <paramref name="site"/> is null.
        /// </exception>
        protected Spider(Site site, string identity, string userid, string taskGroup, IPageProcessor pageProcessor, IScheduler scheduler)
        {
            // Validate up front: the identity fallback below dereferences the site, so a null
            // site must be rejected with the intended SpiderException before that point.
            if (pageProcessor == null)
            {
                throw new SpiderException("PageProcessor should not be null.");
            }
            if (site == null)
            {
                throw new SpiderException("Site should not be null.");
            }

            if (string.IsNullOrWhiteSpace(identity))
            {
                // Read the domain from the parameter: the Site property is only assigned further down.
                Identity = string.IsNullOrEmpty(site.Domain) ? Guid.NewGuid().ToString() : site.Domain;
            }
            else
            {
                // No format validation is applied to a caller-supplied identity.
                Identity = identity;
            }

            UserId    = string.IsNullOrEmpty(userid) ? "DotnetSpider" : userid;
            TaskGroup = string.IsNullOrEmpty(taskGroup) ? "DotnetSpider" : taskGroup;
            Logger    = new Logger(Identity, UserId, TaskGroup);

            _waitCount = 0;

            PageProcessor      = pageProcessor;
            Site               = site;
            PageProcessor.Site = site;
            StartRequests      = Site.StartRequests;
            Scheduler          = scheduler ?? new QueueDuplicateRemovedScheduler();
            Scheduler.Init(this);
#if !NET_CORE
            _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
#else
            _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppContext.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
        }
Esempio n. 7
0
        /// <summary>
        /// Performs the one-time setup of the spider: resolves the monitors, initializes the
        /// scheduler, starts the status-reporting task, prepares the error-request file, hooks
        /// Ctrl+C handling, initializes every pipeline and hands the site's start requests to the
        /// scheduler. Subsequent calls return immediately.
        /// </summary>
        /// <exception cref="SpiderException">Thrown when <c>Pipelines</c> is null or empty.</exception>
        public void InitComponent()
        {
            // Setup already ran; nothing to do.
            if (_init)
            {
                return;
            }

            bool hasPipelines = Pipelines != null && Pipelines.Count != 0;
            if (!hasPipelines)
            {
                throw new SpiderException("Pipelines should not be null.");
            }

            // Use the registered monitors, falling back to a single NLogMonitor when none exist.
            _monitors = IocManager.GetServices<IMonitor>().ToList();
            if (_monitors.Count == 0)
            {
                _monitors = new List<IMonitor> { new NLogMonitor() };
            }

            Scheduler.Init(this);

            // Reports status every two seconds until the scheduler exits, then once more.
            _monitorTask = Task.Factory.StartNew(() =>
            {
                var statusMonitor = GetMonitor();
                while (true)
                {
                    if (_scheduler.IsExited)
                    {
                        break;
                    }

                    ReportStatus();
                    Thread.Sleep(2000);
                }

                statusMonitor.IsExited = true;
                ReportStatus();
            });

            // Only the base directory differs between the two target frameworks.
#if !NET_CORE
            string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
#else
            string baseDirectory = AppContext.BaseDirectory;
#endif
            string errorFilePath = Path.Combine(baseDirectory, "ErrorRequests", Identity, "errors.txt");
            _errorRequestFile = BasePipeline.PrepareFile(errorFilePath);
#if NET_CORE
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
            Console.CancelKeyPress += ConsoleCancelKeyPress;

            foreach (var current in Pipelines)
            {
                current.InitPipeline(this);
            }

            bool noStartRequests = Site.StartRequests == null || Site.StartRequests.Count <= 0;
            if (noStartRequests)
            {
                this.Log("准备步骤: 添加链接到调度中心, 数量 0.", LogLevel.Info);
            }
            else
            {
                this.Log($"准备步骤: 添加链接到调度中心, 数量 {Site.StartRequests.Count}.", LogLevel.Info);

                bool pushOneByOne = Scheduler is QueueDuplicateRemovedScheduler
                                    || Scheduler is PriorityScheduler;
                if (!pushOneByOne)
                {
                    // Other schedulers receive the whole set at once, then the start list is cleared.
                    Scheduler.Import(new HashSet<Request>(Site.StartRequests));
                    ClearStartRequests();
                }
                else
                {
                    // These two scheduler types get the requests pushed individually, in parallel.
                    var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = 4 };
                    Parallel.ForEach(Site.StartRequests, parallelOptions, request => Scheduler.Push(request));
                }
            }

            _waitCountLimit = EmptySleepTime / WaitInterval;

            // Keep the sleep range well-formed: the maximum is raised to the minimum if it is below it.
            if (Site.MaxSleepTime < Site.MinSleepTime)
            {
                Site.MaxSleepTime = Site.MinSleepTime;
            }

            _init = true;
        }
Esempio n. 8
0
        /// <summary>
        /// Performs the one-time setup of the spider: runs the pre-init hook, resolves the monitor,
        /// injects cookies when an injector is configured, initializes the scheduler, starts the
        /// status-reporting task, prepares the error-request file, hooks Ctrl+C handling,
        /// initializes every pipeline, hands the site's start requests to the scheduler and runs
        /// the after-init hook. Subsequent calls return immediately.
        /// </summary>
        /// <param name="arguments">Arguments forwarded to <c>PreInitComponent</c> and <c>AfterInitComponent</c>.</param>
        /// <exception cref="SpiderException">Thrown when <c>Pipelines</c> is null or empty.</exception>
        protected virtual void InitComponent(params string[] arguments)
        {
            // Setup already ran; nothing to do.
            if (_init)
            {
                return;
            }

            this.Log("构建内部模块、准备爬虫数据...", LogLevel.Info);

            bool hasPipelines = Pipelines != null && Pipelines.Count != 0;
            if (!hasPipelines)
            {
                throw new SpiderException("Pipelines should not be null.");
            }

            PreInitComponent(arguments);

            // Use the registered monitor, falling back to an NLogMonitor when none is resolved.
            _monitor = IocManager.Resolve<IMonitor>() ?? new NLogMonitor();

            CookieInjector?.Inject(this, false);

            Scheduler.Init(this);

            // Reports status every two seconds until the spider has exited, then once more.
            _monitorTask = Task.Factory.StartNew(() =>
            {
                while (true)
                {
                    if (Monitorable.IsExited)
                    {
                        break;
                    }

                    ReportStatus();
                    Thread.Sleep(2000);
                }

                ReportStatus();
            });

            // Only the base directory differs between the two target frameworks.
#if !NET_CORE
            string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
#else
            string baseDirectory = AppContext.BaseDirectory;
#endif
            string errorFilePath = Path.Combine(baseDirectory, "ErrorRequests", Identity, "errors.txt");
            _errorRequestFile = BasePipeline.PrepareFile(errorFilePath);
#if NET_CORE
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
            Console.CancelKeyPress += ConsoleCancelKeyPress;

            foreach (var current in Pipelines)
            {
                current.InitPipeline(this);
            }

            bool noStartRequests = Site.StartRequests == null || Site.StartRequests.Count <= 0;
            if (noStartRequests)
            {
                this.Log("准备步骤: 添加链接到调度中心, 数量 0.", LogLevel.Info);
            }
            else
            {
                this.Log($"准备步骤: 添加链接到调度中心, 数量 {Site.StartRequests.Count}.", LogLevel.Info);

                bool pushOneByOne = Scheduler is QueueDuplicateRemovedScheduler
                                    || Scheduler is PriorityScheduler;
                if (!pushOneByOne)
                {
                    // Other schedulers receive the whole set at once, then the start list is cleared.
                    Scheduler.Import(new HashSet<Request>(Site.StartRequests));
                    ClearStartRequests();
                }
                else
                {
                    // These two scheduler types get the requests pushed individually, in parallel.
                    var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = 4 };
                    Parallel.ForEach(Site.StartRequests, parallelOptions, request => Scheduler.Push(request));
                }
            }

            _waitCountLimit = EmptySleepTime / WaitInterval;

            AfterInitComponent(arguments);

            _init = true;
        }