/// <summary>
/// Starts a background download of the resource addressed by <paramref name="value"/> and
/// saves it under the "images" folder of <c>SpiderConsts.GlobalDirectory</c>, unless a file
/// with the same name is already there.
/// </summary>
/// <param name="value">Address of the resource; its file-name part names the local copy.</param>
/// <returns>
/// The file-name part of <paramref name="value"/>. The method returns without waiting for
/// the download continuation to finish.
/// </returns>
protected override dynamic FormateValue(dynamic value)
{
    var name = Path.GetFileName(value);
    if (name != null)
    {
        Task<byte[]> task = Client.GetByteArrayAsync(value);
        task.ContinueWith(t =>
        {
            if (t.Exception != null)
            {
                // NOTE(review): the continuation task is discarded, so nothing visible here
                // observes this exception. Kept as-is to preserve the existing behavior.
                throw t.Exception;
            }

            byte[] fileData = t.Result;
            string file = Path.Combine(SpiderConsts.GlobalDirectory, "images", name);
            if (!File.Exists(file))
            {
                // 'using' guarantees the handle is released even when the write fails;
                // disposing the stream also flushes it.
                using (var stream = BasePipeline.PrepareFile(file).OpenWrite())
                {
                    stream.Write(fileData, 0, fileData.Length);
                }
            }
        });
    }

    return name;
}
/// <summary>
/// Downloads the resource addressed by <paramref name="value"/> and saves it under the
/// "images" folder of <c>SpiderEnviroment.GlobalDirectory</c>, unless a file with the same
/// name already exists there.
/// </summary>
/// <param name="value">Address of the resource; its file-name part names the local copy.</param>
/// <returns><paramref name="value"/>, unchanged.</returns>
/// <remarks>Any exception is logged at error level and then rethrown to the caller.</remarks>
public override string Formate(string value)
{
    try
    {
        var name = Path.GetFileName(value);
        if (name != null)
        {
            // The method signature is synchronous, so the download is waited for here.
            var fileData = Client.GetByteArrayAsync(value).Result;
            string file = Path.Combine(SpiderEnviroment.GlobalDirectory, "images", name);
            if (File.Exists(file))
            {
                return value;
            }

            // 'using' guarantees the handle is released even when the write fails;
            // disposing the stream also flushes it.
            using (var stream = BasePipeline.PrepareFile(file).OpenWrite())
            {
                stream.Write(fileData, 0, fileData.Length);
            }
        }

        return value;
    }
    catch (Exception e)
    {
        Logger.SaveLog(LogInfo.Create($"Download file: {value} failed.", Logger.Name, null, LogLevel.Error, e));
        throw;
    }
}
/// <summary>
/// One-time initialization of the spider: initializes the scheduler, prepares the
/// error-request file, hooks Ctrl+C handling, initializes every pipeline and hands the
/// site's start requests to the scheduler.
/// </summary>
/// <exception cref="SpiderException">Thrown when <c>Pipelines</c> is null or empty.</exception>
public void InitComponent()
{
    // Already initialized: nothing to do.
    if (_init)
    {
        return;
    }

    bool hasPipelines = Pipelines != null && Pipelines.Count != 0;
    if (!hasPipelines)
    {
        throw new SpiderException("Pipelines should not be null.");
    }

    Scheduler.Init(this);

    // The application base directory is obtained differently on each target framework.
#if !NET_CORE
    string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
#else
    string baseDirectory = AppContext.BaseDirectory;
#endif
    string errorRequestPath = Path.Combine(baseDirectory, "ErrorRequests", Identity, "errors.txt");
    _errorRequestFile = BasePipeline.PrepareFile(errorRequestPath);
#if NET_CORE
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif

    Console.CancelKeyPress += ConsoleCancelKeyPress;

    foreach (var currentPipeline in Pipelines)
    {
        currentPipeline.InitPipeline(this);
    }

    bool hasStartRequests = Site.StartRequests != null && Site.StartRequests.Count > 0;
    if (!hasStartRequests)
    {
        this.Log("[步骤 1] 添加链接到调度中心, 数量: 0.", LogLevel.Info);
    }
    else
    {
        this.Log($"[步骤 1] 添加链接到调度中心, 数量: {Site.StartRequests.Count}.", LogLevel.Info);

        bool pushIndividually = Scheduler is QueueDuplicateRemovedScheduler || Scheduler is PriorityScheduler;
        if (pushIndividually)
        {
            // These two scheduler types receive the requests one by one, on up to four threads.
            Parallel.ForEach(
                Site.StartRequests,
                new ParallelOptions { MaxDegreeOfParallelism = 4 },
                startRequest => { Scheduler.Push(startRequest); });
        }
        else
        {
            // Any other scheduler gets the whole set at once; the start requests are then cleared.
            Scheduler.Load(new HashSet<Request>(Site.StartRequests));
            ClearStartRequests();
        }
    }

    _waitCountLimit = EmptySleepTime / WaitInterval;

    // Keep the sleep range consistent: the maximum is never below the minimum.
    if (Site.MinSleepTime > Site.MaxSleepTime)
    {
        Site.MaxSleepTime = Site.MinSleepTime;
    }

    _init = true;
}
/// <summary>
/// Starts a background download of the resource addressed by <paramref name="value"/> and
/// saves it under the "images" folder of <c>SpiderEnviroment.GlobalDirectory</c>, unless a
/// file with the same name is already there. A failed download is logged as a warning.
/// </summary>
/// <param name="value">Address of the resource; its file-name part names the local copy.</param>
/// <returns>
/// The file-name part of <paramref name="value"/>. The method returns without waiting for
/// the download continuation to finish.
/// </returns>
protected override dynamic FormateValue(dynamic value)
{
    var name = Path.GetFileName(value);
    if (name != null)
    {
        Task<byte[]> task = Client.GetByteArrayAsync(value);
        task.ContinueWith(t =>
        {
            if (t.IsFaulted || t.IsCanceled)
            {
                // A cancelled task carries no Exception (presumably a timeout when Client is
                // an HttpClient - confirm); without this branch reading t.Result would throw
                // inside a continuation that nobody observes, and nothing would be logged.
                Exception error = t.Exception ?? (Exception)new TaskCanceledException(t);
                Logger.SaveLog(LogInfo.Create($"下载文件: {value} 失败.", Logger.Name, null, LogLevel.Warn, error));
                return;
            }

            byte[] fileData = t.Result;
            string file = Path.Combine(SpiderEnviroment.GlobalDirectory, "images", name);
            if (!File.Exists(file))
            {
                // 'using' guarantees the handle is released even when the write fails;
                // disposing the stream also flushes it.
                using (var stream = BasePipeline.PrepareFile(file).OpenWrite())
                {
                    stream.Write(fileData, 0, fileData.Length);
                }
            }
        });
    }

    return name;
}
/// <summary>
/// Initializes the pipeline for <paramref name="spider"/>: runs the base initialization,
/// sets <c>DataFolder</c> to "&lt;base directory&gt;/&lt;spider identity&gt;/mysql" and opens
/// <c>Writer</c> in append mode on the "&lt;database&gt;.&lt;table&gt;.data" file in that folder.
/// </summary>
/// <param name="spider">Spider that owns this pipeline; its identity names the data folder.</param>
public override void InitPipeline(ISpider spider)
{
    base.InitPipeline(spider);

    // The application base directory is obtained differently on each target framework.
#if !NET_CORE
    string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
#else
    string baseDirectory = AppContext.BaseDirectory;
#endif
    DataFolder = Path.Combine(baseDirectory, spider.Identity, "mysql");

    var dataFile = BasePipeline.PrepareFile(Path.Combine(DataFolder, $"{Schema.Database}.{Schema.TableName}.data"));
    Writer = dataFile.AppendText();

    // Every write is flushed to the file immediately.
    Writer.AutoFlush = true;
}
/// <summary>
/// Create a spider for a site with the given identity, page processor and scheduler.
/// </summary>
/// <param name="site">Site to crawl; must not be null.</param>
/// <param name="identity">
/// Task identity. When null or blank, the site's domain is used, or a new GUID string when
/// the domain is null or empty as well.
/// </param>
/// <param name="userid">User id; "DotnetSpider" is used when null or empty.</param>
/// <param name="taskGroup">Task group; "DotnetSpider" is used when null or empty.</param>
/// <param name="pageProcessor">Page processor; must not be null.</param>
/// <param name="scheduler">Scheduler; a <c>QueueDuplicateRemovedScheduler</c> is created when null.</param>
/// <exception cref="SpiderException">
/// Thrown when <paramref name="pageProcessor"/> or <paramref name="site"/> is null.
/// </exception>
protected Spider(Site site, string identity, string userid, string taskGroup, IPageProcessor pageProcessor, IScheduler scheduler)
{
    // Validate the arguments before touching any state. The identity fallback below needs
    // the site, and previously read the Site property before it had been assigned.
    if (pageProcessor == null)
    {
        throw new SpiderException("PageProcessor should not be null.");
    }

    if (site == null)
    {
        throw new SpiderException("Site should not be null.");
    }

    if (string.IsNullOrWhiteSpace(identity))
    {
        Identity = string.IsNullOrEmpty(site.Domain) ? Guid.NewGuid().ToString() : site.Domain;
    }
    else
    {
        Identity = identity;
    }

    UserId = string.IsNullOrEmpty(userid) ? "DotnetSpider" : userid;
    TaskGroup = string.IsNullOrEmpty(taskGroup) ? "DotnetSpider" : taskGroup;
    Logger = new Logger(Identity, UserId, TaskGroup);
    _waitCount = 0;

    PageProcessor = pageProcessor;
    Site = site;
    PageProcessor.Site = site;
    StartRequests = Site.StartRequests;

    Scheduler = scheduler ?? new QueueDuplicateRemovedScheduler();
    Scheduler.Init(this);

    // The application base directory is obtained differently on each target framework.
#if !NET_CORE
    _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
#else
    _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppContext.BaseDirectory, "ErrorRequests", Identity, "errors.txt"));
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
}
/// <summary>
/// One-time initialization of the spider: resolves the monitors, initializes the scheduler,
/// starts the status-reporting task, prepares the error-request file, hooks Ctrl+C handling,
/// initializes every pipeline and hands the site's start requests to the scheduler.
/// </summary>
/// <exception cref="SpiderException">Thrown when <c>Pipelines</c> is null or empty.</exception>
public void InitComponent()
{
    // Already initialized: nothing to do.
    if (_init)
    {
        return;
    }

    bool hasPipelines = Pipelines != null && Pipelines.Count != 0;
    if (!hasPipelines)
    {
        throw new SpiderException("Pipelines should not be null.");
    }

    // Use the registered monitors, or a single NLogMonitor when none is registered.
    _monitors = IocManager.GetServices<IMonitor>().ToList();
    if (_monitors.Count == 0)
    {
        _monitors = new List<IMonitor> { new NLogMonitor() };
    }

    Scheduler.Init(this);

    // Background task: report status every two seconds until the scheduler has exited,
    // then mark the monitor as exited and report one last time.
    _monitorTask = Task.Factory.StartNew(() =>
    {
        var exitMonitor = GetMonitor();
        while (!_scheduler.IsExited)
        {
            ReportStatus();
            Thread.Sleep(2000);
        }

        exitMonitor.IsExited = true;
        ReportStatus();
    });

    // The application base directory is obtained differently on each target framework.
#if !NET_CORE
    string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
#else
    string baseDirectory = AppContext.BaseDirectory;
#endif
    string errorRequestPath = Path.Combine(baseDirectory, "ErrorRequests", Identity, "errors.txt");
    _errorRequestFile = BasePipeline.PrepareFile(errorRequestPath);
#if NET_CORE
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif

    Console.CancelKeyPress += ConsoleCancelKeyPress;

    foreach (var currentPipeline in Pipelines)
    {
        currentPipeline.InitPipeline(this);
    }

    bool hasStartRequests = Site.StartRequests != null && Site.StartRequests.Count > 0;
    if (!hasStartRequests)
    {
        this.Log("准备步骤: 添加链接到调度中心, 数量 0.", LogLevel.Info);
    }
    else
    {
        this.Log($"准备步骤: 添加链接到调度中心, 数量 {Site.StartRequests.Count}.", LogLevel.Info);

        bool pushIndividually = Scheduler is QueueDuplicateRemovedScheduler || Scheduler is PriorityScheduler;
        if (pushIndividually)
        {
            // These two scheduler types receive the requests one by one, on up to four threads.
            Parallel.ForEach(
                Site.StartRequests,
                new ParallelOptions { MaxDegreeOfParallelism = 4 },
                startRequest => { Scheduler.Push(startRequest); });
        }
        else
        {
            // Any other scheduler gets the whole set at once; the start requests are then cleared.
            Scheduler.Import(new HashSet<Request>(Site.StartRequests));
            ClearStartRequests();
        }
    }

    _waitCountLimit = EmptySleepTime / WaitInterval;

    // Keep the sleep range consistent: the maximum is never below the minimum.
    if (Site.MinSleepTime > Site.MaxSleepTime)
    {
        Site.MaxSleepTime = Site.MinSleepTime;
    }

    _init = true;
}
/// <summary>
/// One-time initialization of the spider: runs the pre-init hook, resolves the monitor,
/// injects cookies when an injector is set, initializes the scheduler, starts the
/// status-reporting task, prepares the error-request file, hooks Ctrl+C handling,
/// initializes every pipeline, hands the site's start requests to the scheduler and
/// finally runs the after-init hook.
/// </summary>
/// <param name="arguments">Arguments passed on to <c>PreInitComponent</c> and <c>AfterInitComponent</c>.</param>
/// <exception cref="SpiderException">Thrown when <c>Pipelines</c> is null or empty.</exception>
protected virtual void InitComponent(params string[] arguments)
{
    // Already initialized: nothing to do.
    if (_init)
    {
        return;
    }

    this.Log("构建内部模块、准备爬虫数据...", LogLevel.Info);

    bool hasPipelines = Pipelines != null && Pipelines.Count != 0;
    if (!hasPipelines)
    {
        throw new SpiderException("Pipelines should not be null.");
    }

    PreInitComponent(arguments);

    // Use the registered monitor, or an NLogMonitor when none can be resolved.
    _monitor = IocManager.Resolve<IMonitor>() ?? new NLogMonitor();

    CookieInjector?.Inject(this, false);

    Scheduler.Init(this);

    // Background task: report status every two seconds until Monitorable has exited,
    // then report one last time.
    _monitorTask = Task.Factory.StartNew(() =>
    {
        while (!Monitorable.IsExited)
        {
            ReportStatus();
            Thread.Sleep(2000);
        }

        ReportStatus();
    });

    // The application base directory is obtained differently on each target framework.
#if !NET_CORE
    string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
#else
    string baseDirectory = AppContext.BaseDirectory;
#endif
    string errorRequestPath = Path.Combine(baseDirectory, "ErrorRequests", Identity, "errors.txt");
    _errorRequestFile = BasePipeline.PrepareFile(errorRequestPath);
#if NET_CORE
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif

    Console.CancelKeyPress += ConsoleCancelKeyPress;

    foreach (var currentPipeline in Pipelines)
    {
        currentPipeline.InitPipeline(this);
    }

    bool hasStartRequests = Site.StartRequests != null && Site.StartRequests.Count > 0;
    if (!hasStartRequests)
    {
        this.Log("准备步骤: 添加链接到调度中心, 数量 0.", LogLevel.Info);
    }
    else
    {
        this.Log($"准备步骤: 添加链接到调度中心, 数量 {Site.StartRequests.Count}.", LogLevel.Info);

        bool pushIndividually = Scheduler is QueueDuplicateRemovedScheduler || Scheduler is PriorityScheduler;
        if (pushIndividually)
        {
            // These two scheduler types receive the requests one by one, on up to four threads.
            Parallel.ForEach(
                Site.StartRequests,
                new ParallelOptions { MaxDegreeOfParallelism = 4 },
                startRequest => { Scheduler.Push(startRequest); });
        }
        else
        {
            // Any other scheduler gets the whole set at once; the start requests are then cleared.
            Scheduler.Import(new HashSet<Request>(Site.StartRequests));
            ClearStartRequests();
        }
    }

    _waitCountLimit = EmptySleepTime / WaitInterval;

    AfterInitComponent(arguments);

    _init = true;
}