/// <summary>
/// Records a failed <see cref="Request"/> so the crawl can later be re-run: the request is
/// appended as one JSON line to the error-request file, then <c>RequestFailedEvent</c> is raised.
/// </summary>
/// <param name="request">The request that failed to complete.</param>
protected void OnError(Request request)
{
	// NOTE(review): lock(this) is fragile — any external code holding a lock on this Spider
	// instance can deadlock with us. A private readonly gate field would be safer, but adding
	// a field is outside the scope of this method; kept as-is and flagged.
	lock (this)
	{
		// Write to the file prepared once in the constructor (_errorRequestFile) instead of
		// re-preparing "ErrorRequests.txt" on every call: besides the redundant work, the
		// differing casing produced a *second* file on case-sensitive file systems
		// (.NET Core on Linux), so ReRun would miss the requests logged here.
		// 写入文件中, 用户从最终的结果可以知道有多少个Request没有跑.
		// 提供ReRun, Spider可以重新载入错误的Request重新跑过.
		File.AppendAllText(_errorRequestFile.FullName,
			JsonConvert.SerializeObject(request) + Environment.NewLine,
			Encoding.UTF8);
	}
	RequestFailedEvent?.Invoke(request);
}
/// <summary>
/// Create a spider with pageProcessor.
/// </summary>
/// <param name="site">Target site configuration; must not be null. Also assigned to the page processor.</param>
/// <param name="identity">Task identity; when blank, falls back to the site domain or a new GUID.</param>
/// <param name="userid">Owning user id; defaults to "DotnetSpider" when empty.</param>
/// <param name="taskGroup">Task group name; defaults to "DotnetSpider" when empty.</param>
/// <param name="pageProcessor">Processor applied to each downloaded page; must not be null.</param>
/// <param name="scheduler">Request scheduler; a <see cref="QueueDuplicateRemovedScheduler"/> is used when null.</param>
/// <exception cref="SpiderExceptoin">Thrown when <paramref name="site"/> or <paramref name="pageProcessor"/> is null.</exception>
protected Spider(Site site, string identity, string userid, string taskGroup, IPageProcessor pageProcessor, IScheduler scheduler)
{
	// Validate required dependencies before anything dereferences them.
	// (Previously Identity was derived from the Site *property* before "Site = site" ran and
	// before the null guard, so a blank identity caused a NullReferenceException instead of
	// the intended SpiderExceptoin.)
	if (site == null)
	{
		throw new SpiderExceptoin("Site should not be null.");
	}
	if (pageProcessor == null)
	{
		throw new SpiderExceptoin("PageProcessor should not be null.");
	}

	Site = site;

	// A blank identity falls back to the site domain; a site without a domain gets a GUID.
	if (string.IsNullOrWhiteSpace(identity))
	{
		Identity = string.IsNullOrEmpty(site.Domain) ? Guid.NewGuid().ToString() : site.Domain;
	}
	else
	{
		Identity = identity;
	}

	UserId = string.IsNullOrEmpty(userid) ? "DotnetSpider" : userid;
	TaskGroup = string.IsNullOrEmpty(taskGroup) ? "DotnetSpider" : taskGroup;

	// Logger name encodes identity/user/group so log lines from concurrent spiders stay separable.
	Logger = LogManager.GetLogger($"{Identity}&{UserId}&{TaskGroup}");

	_waitCount = 0;

	PageProcessor = pageProcessor;
	PageProcessor.Site = site;
	StartRequests = site.StartRequests;
	Scheduler = scheduler ?? new QueueDuplicateRemovedScheduler();

#if !NET_CORE
	// Path.Combine instead of manual "\\data\\" concatenation — same location, no doubled separators.
	DataRootDirectory = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "data", Identity);
#else
	DataRootDirectory = Path.Combine(AppContext.BaseDirectory, "data", Identity);
	try
	{
		Console.OutputEncoding = System.Text.Encoding.UTF8;
	}
	catch
	{
		// Some hosts (redirected or detached console) reject changing the encoding; best-effort only.
	}
#endif
	// Prepared once here; OnError appends failed requests to this file for later ReRun.
	_errorRequestFile = FilePersistentBase.PrepareFile(Path.Combine(DataRootDirectory, "errorRequests.txt"));
}