Beispiel #1
0
        // Private gate guarding appends to the error-request file. A dedicated object is
        // used instead of `lock (this)` so that outside code locking on the spider
        // instance cannot contend with, or deadlock against, this method.
        // NOTE(review): if other members relied on `lock (this)` to serialize access to
        // the same file, switch them to this gate as well.
        private readonly object _errorRequestFileLock = new object();

        /// <summary>
        /// Appends the failed request, serialized as one JSON line, to
        /// <c>ErrorRequests.txt</c> under <c>DataRootDirectory</c>, then raises
        /// <c>RequestFailedEvent</c> with the same request.
        /// </summary>
        /// <param name="request">The request that failed.</param>
        protected void OnError(Request request)
        {
            lock (_errorRequestFileLock)
            {
                // Written to a file so the final result shows how many requests did not run;
                // the intent is that a re-run can reload the failed requests and process them again.
                FileInfo file = FilePersistentBase.PrepareFile(Path.Combine(DataRootDirectory, "ErrorRequests.txt"));
                File.AppendAllText(file.FullName, JsonConvert.SerializeObject(request) + Environment.NewLine, Encoding.UTF8);
            }

            // Raised outside the lock so subscribers never run while the file gate is held.
            RequestFailedEvent?.Invoke(request);
        }
Beispiel #2
0
        /// <summary>
        /// Create a spider with pageProcessor.
        /// </summary>
        /// <param name="site">Site configuration; must not be null. Its <c>Domain</c> is the fallback identity and its <c>StartRequests</c> become the spider's start requests.</param>
        /// <param name="identity">Task identity. When null or whitespace, the site's domain is used, or a new GUID string if the domain is null or empty.</param>
        /// <param name="userid">User id; falls back to "DotnetSpider" when null or empty.</param>
        /// <param name="taskGroup">Task group; falls back to "DotnetSpider" when null or empty.</param>
        /// <param name="pageProcessor">Page processor; must not be null. Its <c>Site</c> is set to <paramref name="site"/>.</param>
        /// <param name="scheduler">Scheduler; a <c>QueueDuplicateRemovedScheduler</c> is created when null.</param>
        /// <exception cref="SpiderExceptoin">Thrown when <paramref name="pageProcessor"/> or <paramref name="site"/> is null.</exception>
        protected Spider(Site site, string identity, string userid, string taskGroup, IPageProcessor pageProcessor, IScheduler scheduler)
        {
            // Validate the required collaborators before anything dereferences them.
            // Previously the identity fallback read Site.Domain before Site had been
            // assigned, so a blank identity failed before the null checks could run.
            if (pageProcessor == null)
            {
                throw new SpiderExceptoin("PageProcessor should not be null.");
            }
            if (site == null)
            {
                throw new SpiderExceptoin("Site should not be null.");
            }

            PageProcessor      = pageProcessor;
            Site               = site;
            PageProcessor.Site = site;
            StartRequests      = Site.StartRequests;
            Scheduler          = scheduler ?? new QueueDuplicateRemovedScheduler();

            if (string.IsNullOrWhiteSpace(identity))
            {
                // No explicit identity: prefer the site's domain, otherwise generate a unique one.
                Identity = string.IsNullOrEmpty(site.Domain) ? Guid.NewGuid().ToString() : site.Domain;
            }
            else
            {
                Identity = identity;
            }

            UserId    = string.IsNullOrEmpty(userid) ? "DotnetSpider" : userid;
            TaskGroup = string.IsNullOrEmpty(taskGroup) ? "DotnetSpider" : taskGroup;
            Logger    = LogManager.GetLogger($"{Identity}&{UserId}&{TaskGroup}");

            _waitCount = 0;

#if !NET_CORE
            // Path.Combine instead of hard-coded "\\data\\" so the separator is correct on every platform.
            DataRootDirectory = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "data", Identity);
#else
            DataRootDirectory = Path.Combine(AppContext.BaseDirectory, "data", Identity);
            try
            {
                Console.OutputEncoding = System.Text.Encoding.UTF8;
            }
            catch
            {
                // Deliberate best-effort: a failure to switch the console encoding
                // must not prevent the spider from being constructed.
            }
#endif
            _errorRequestFile = FilePersistentBase.PrepareFile(Path.Combine(DataRootDirectory, "errorRequests.txt"));
        }