/// <summary> /// Create a spider with pageProcessor. /// </summary> /// <param name="identify"></param> /// <param name="pageProcessor"></param> /// <param name="scheduler"></param> protected Spider(string identify, IPageProcessor pageProcessor, IScheduler scheduler) { _waitCount = 0; PageProcessor = pageProcessor; Site = pageProcessor.Site; StartRequests = Site.StartRequests; Scheduler = scheduler ?? new QueueDuplicateRemovedScheduler(); if (string.IsNullOrWhiteSpace(identify)) { Identity = string.IsNullOrEmpty(Site.Domain) ? Guid.NewGuid().ToString() : Site.Domain; } else { if (!IdentifyRegex.IsMatch(identify)) { throw new SpiderExceptoin("Task Identify only can contains A-Z a-z 0-9 _ -"); } Identity = identify; } #if !NET_CORE DataRootDirectory = AppDomain.CurrentDomain.BaseDirectory + "\\data\\" + Identity; #else DataRootDirectory = Path.Combine(Directory.GetCurrentDirectory(), Path.Combine("data", Identity)); #endif }
public WikipediaXmlProcessor(Stream inputStream, IPageProcessor pageProcessor)
{
    if (inputStream == null)
        throw new ArgumentNullException("inputStream");
    if (pageProcessor == null)
        throw new ArgumentNullException("pageProcessor");

    this.InputStream = inputStream;
    this.PageProcessor = pageProcessor;
}
public void OpenPage(string url, IPageProcessor pageProcessor)
{
    // NOTE: as written, this snippet never passes url to the client,
    // never invokes pageProcessor, and discards the response; that
    // wiring is presumably completed elsewhere in HttpRequestClient.
    HttpRequestClient netCenter = new HttpRequestClient()
    {
        AllowAutoRedirect = false
    };
    var response = netCenter.GetResponse();
}
public Spider(Site site, IPageProcessor pageProcessor)
{
    Status = SpiderStatusEnum.Init;
    PageProcessor = pageProcessor;
    PageProcessor.Site = Site = site;
    Scheduler = new MemoryScheduler();
    DownLoader = new HttpDownLoader();
}
public TaskQueueManager(IPageFetcher pageFetcher, IPageProcessor pageProcessor, Action<string> outputAction)
{
    _pageFetcher = pageFetcher;
    _pageProcessor = pageProcessor;
    _outputAction = outputAction;
    _pageSaveJobRunner = new PostSaveJobRunner(ComponentFactory.GetPostRepository());
    _runningInfoRepository = ComponentFactory.GetRunningInfoRepository();
}
void filterStream_Responsing(object sender, EventArgs e)
{
    IPageProcessor processor = sender as IPageProcessor;
    if (processor != null)
    {
        processor.Context.CurrentContent = processor.Context.CurrentContent.Replace("Label", "asdf");
    }
}
/// <summary> /// Create a spider with pageProcessor. /// </summary> /// <param name="identify"></param> /// <param name="pageProcessor"></param> protected Spider(string identify, IPageProcessor pageProcessor) { Logger = LogManager.GetLogger(typeof(Spider)); _waitCount = 0; PageProcessor = pageProcessor; _site = pageProcessor.Site; StartRequests = pageProcessor.Site.GetStartRequests(); _identify = identify; }
/// <summary> /// Create a spider with pageProcessor. /// </summary> /// <param name="site"></param> /// <param name="identity"></param> /// <param name="taskGroup"></param> /// <param name="pageProcessor"></param> /// <param name="scheduler"></param> /// <param name="userid"></param> protected Spider(Site site, string identity, string userid, string taskGroup, IPageProcessor pageProcessor, IScheduler scheduler) : this() { Identity = identity; UserId = userid; PageProcessor = pageProcessor; Site = site; TaskGroup = taskGroup; Scheduler = scheduler; CheckIfSettingsCorrect(); }
public WebCrawler(string url, string outputFolder, bool debugMessages, IPreCrawlProcessor[] preCrawlProcessors, IPageProcessor[] pageProcessors, IPostCrawlProcessor[] postCrawlProcessors)
{
    this.Url = new Uri(url, UriKind.Absolute);
    this.OutputFolder = new Uri(new Uri(Environment.CurrentDirectory + Path.DirectorySeparatorChar), outputFolder + Path.DirectorySeparatorChar);
    this.logger = new Logger(debugMessages);
    this.preCrawlProcessors = preCrawlProcessors;
    this.pageProcessors = pageProcessors;
    this.postCrawlProcessors = postCrawlProcessors;
    this.logger.LogDebugMessage("Output Folder is {0}", this.OutputFolder.LocalPath);
    this.siteRoot = new Uri(this.Url.GetLeftPart(UriPartial.Authority));
    this.pages = new Dictionary<Uri, string> { { this.Url, null } };
}
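A minimal usage sketch for the WebCrawler constructor above. Only the constructor signature is taken from the snippet; the Crawl() entry point and the empty processor arrays are assumptions made for illustration.

// Hypothetical usage of the WebCrawler constructor shown above.
// Crawl() is an assumed entry point; the empty arrays merely satisfy
// the processor parameters.
var crawler = new WebCrawler(
    "https://example.com/",    // must be an absolute URL (UriKind.Absolute)
    "output",                  // resolved relative to Environment.CurrentDirectory
    true,                      // debugMessages
    new IPreCrawlProcessor[0],
    new IPageProcessor[0],
    new IPostCrawlProcessor[0]);
crawler.Crawl();               // assumed entry point, not shown in the snippet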
/// <summary> /// Create a spider with pageProcessor. /// </summary> /// <param name="identity"></param> /// <param name="pageProcessor"></param> /// <param name="scheduler"></param> protected Spider(Site site, string identity, string userid, string taskGroup, IPageProcessor pageProcessor, IScheduler scheduler) { if (string.IsNullOrWhiteSpace(identity)) { Identity = string.IsNullOrEmpty(Site.Domain) ? Guid.NewGuid().ToString() : Site.Domain; } else { //if (!IdentifyRegex.IsMatch(identity)) //{ // throw new SpiderExceptoin("任务ID不能有空字符."); //} Identity = identity; } UserId = string.IsNullOrEmpty(userid) ? "DotnetSpider" : userid; TaskGroup = string.IsNullOrEmpty(taskGroup) ? "DotnetSpider" : taskGroup; Logger = LogManager.GetLogger($"{Identity}&{UserId}&{TaskGroup}"); _waitCount = 0; if (pageProcessor == null) { throw new SpiderExceptoin("PageProcessor should not be null."); } PageProcessor = pageProcessor; Site = site; if (Site == null) { throw new SpiderExceptoin("Site should not be null."); } PageProcessor.Site = site; StartRequests = Site.StartRequests; Scheduler = scheduler ?? new QueueDuplicateRemovedScheduler(); #if !NET_CORE DataRootDirectory = AppDomain.CurrentDomain.BaseDirectory + "\\data\\" + Identity; #else DataRootDirectory = Path.Combine(AppContext.BaseDirectory, "data", Identity); try { Console.OutputEncoding = System.Text.Encoding.UTF8; } catch { } #endif _errorRequestFile = FilePersistentBase.PrepareFile(Path.Combine(DataRootDirectory, "errorRequests.txt")); }
public static MillStatus TryProcessPage<T>(this IPageProcessor pageProcessor, MillRequest request, out MillResult<T> result) where T : class
{
    result = new MillResult<T>
    {
        Url = request.Url,
        HumanReadableDescription = request.HumanReadableDescription
    };
    try
    {
        if (typeof(T) == typeof(List<ThreadHeader>))
        {
            result = pageProcessor.ProcessForumPage(request) as MillResult<T>;
        }
        else if (typeof(T) == typeof(ForumThread))
        {
            result = pageProcessor.ProcessThreadPage(request) as MillResult<T>;
        }
        else
        {
            throw new NotSupportedException();
        }
        return MillStatus.Success;
    }
    catch (NotSignedInException ne)
    {
        Logger.Info("Not signed in while parsing page. URL: {0}\r\nContent: {1}", ne.Request.Url, ne.Request.HtmlContent);
        return MillStatus.NotSignedIn;
    }
    catch (PermissionDeniedException pde)
    {
        Logger.Info("Permission denied while parsing page. URL: {0}\r\nContent: {1}", pde.Request.Url, pde.Request.HtmlContent);
        return MillStatus.PermissionDenied;
    }
    catch (ProcessFaultException pfe)
    {
        Logger.Info("Error while parsing page. Message: {0}\r\nURL: {1}\r\nContent: {2}\r\nInner exception: {3}\r\n", pfe.Message, pfe.Request.Url, pfe.Request.HtmlContent, pfe.InnerException);
        return MillStatus.FormatError;
    }
    catch (Exception e)
    {
        Logger.Info("Error URL: {0}\r\n{1}\r\n{2}", request.Url, e, request.HtmlContent);
        return MillStatus.FormatError;
    }
}
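A hedged calling sketch for the TryProcessPage extension above. The MillRequest initialization and the pageProcessor instance are assumptions; the generic argument List<ThreadHeader> matches the first branch of the type check, and the out/status pattern follows the snippet.

// Sketch: invoking TryProcessPage with the forum-page branch.
// Url and HumanReadableDescription are the only MillRequest members
// known from the snippet; everything else here is illustrative.
var request = new MillRequest
{
    Url = "https://forum.example.com/board/1",
    HumanReadableDescription = "board page 1"
};
MillResult<List<ThreadHeader>> result;
MillStatus status = pageProcessor.TryProcessPage(request, out result);
if (status == MillStatus.Success)
{
    Console.WriteLine("Processed {0}", result.Url);
}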
/// <summary> /// Create a spider with pageProcessor. /// </summary> /// <param name="identity"></param> /// <param name="pageProcessor"></param> /// <param name="scheduler"></param> protected Spider(Site site, string identity, string userid, string taskGroup, IPageProcessor pageProcessor, IScheduler scheduler) { if (string.IsNullOrWhiteSpace(identity)) { Identity = string.IsNullOrEmpty(Site.Domain) ? Guid.NewGuid().ToString() : Site.Domain; } else { //if (!IdentifyRegex.IsMatch(identity)) //{ // throw new SpiderExceptoin("任务ID不能有空字符."); //} Identity = identity; } UserId = string.IsNullOrEmpty(userid) ? "DotnetSpider" : userid; TaskGroup = string.IsNullOrEmpty(taskGroup) ? "DotnetSpider" : taskGroup; Logger = new Logger(Identity, UserId, TaskGroup); _waitCount = 0; if (pageProcessor == null) { throw new SpiderException("PageProcessor should not be null."); } PageProcessor = pageProcessor; Site = site; if (Site == null) { throw new SpiderException("Site should not be null."); } PageProcessor.Site = site; StartRequests = Site.StartRequests; Scheduler = scheduler ?? new QueueDuplicateRemovedScheduler(); Scheduler.Init(this); #if !NET_CORE _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "ErrorRequests", Identity, "errors.txt")); #else _errorRequestFile = BasePipeline.PrepareFile(Path.Combine(AppContext.BaseDirectory, "ErrorRequests", Identity, "errors.txt")); Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); #endif }
void ProcessFile(string src, string dst, string file, bool write, StartTemplate<StartModel> startTemplate, IPageProcessor processor)
{
#if DEBUG
    // Console.Write(".");
#endif
    string srcfile = Path.Combine(src, file);
    string dstfile = null;
    Action<string, string> writer = null;
    if (write)
    {
        if (!_context.PageMap.ContainsKey(srcfile))
        {
            // This file is unpublished.
            return;
        }
        PageInfo pageInfo = _context.PageMap[srcfile];
        dstfile = pageInfo.GetDestinationPath(_context, src, dst, file);
        string dstdir = Path.GetDirectoryName(dstfile);
        Directory.CreateDirectory(dstdir);
        if (_context.Options.Verbose)
        {
            Console.WriteLine(dstfile);
        }
        writer = (d, r) =>
        {
            // Write the output file if a destination is specified.
            if (!String.IsNullOrWhiteSpace(d))
            {
                File.WriteAllText(d, r);
            }
        };
    }
    PageTemplate<PageModel> pageTemplate = processor.ProcessFile(srcfile, dstfile, srcfile, _pageModel, startTemplate, writer);
    if (!write && pageTemplate.Published)
    {
        // Force the use of an empty layout to get just the content.
        var contentStart = new StartTemplate<StartModel>() { ForceLayout = true };
        string content = null;
        PageTemplate<PageModel> excerptTemplate = processor.ProcessFile(srcfile, null, srcfile + "*", _pageModel, contentStart, (d, r) => { content = r; });
        if (pageTemplate.Excerpt == null)
        {
            pageTemplate.Excerpt = ExtractExcerpt(content);
        }
        // If this file is named like a post, get the info.
        PathInfo pathInfo = PathInfo.GetpathInfo(src, file);
        DateTime date = pageTemplate.Date.HasValue ? pageTemplate.Date.Value : (pathInfo == null ? DateTime.MinValue : pathInfo.Date);
        if (date == DateTime.MinValue)
        {
            // Note: It's probably OK for pages not to have dates,
            // since they won't often be listed by date.
            // Console.WriteLine("Warning: No date specified for {0}.", srcfile);
        }
        PageInfo pageInfo;
        if (pathInfo != null)
        {
            pageInfo = new PostInfo(pathInfo);
        }
        else
        {
            pageInfo = new PageInfo();
        }
        pageInfo.Permalink = pageTemplate.Permalink;
        pageInfo.Rebase = pageTemplate.Rebase;
        pageInfo.Title = pageTemplate.Title;
        pageInfo.Content = content;
        pageInfo.Excerpt = pageTemplate.Excerpt;
        pageInfo.Categories = pageTemplate.Categories; // TODO: Copy
        pageInfo.Tags = pageTemplate.Tags; // TODO: Copy
        pageInfo.Date = date;
        dstfile = pageInfo.GetDestinationPath(_context, src, dst, file);
        // Build a URL fragment for internal linking.
        pageInfo.Url = FileUtility.GetInternalUrl(_context, dstfile);
        AddCategories(pageInfo, pageTemplate.Categories);
        AddTags(pageInfo, pageTemplate.Tags);
        _context.PageMap.Add(srcfile, pageInfo);
    }
}
public FileCache(string startUrl, string urlPattern, string path = "/data/dotnetspider/temp/")
{
    _pageProcessor = new SimplePageProcessor(startUrl, urlPattern);
    SetPath(path);
    _downloaderWhenFileMiss = new HttpClientDownloader();
}
/// <summary>
/// Create a spider with pageProcessor and scheduler.
/// </summary>
/// <param name="site">Target site of the task</param>
/// <param name="pageProcessor">Page processor used to parse downloaded pages</param>
/// <param name="scheduler">Scheduler of requests</param>
/// <returns>The spider</returns>
public static Spider Create(Site site, IPageProcessor pageProcessor, IScheduler scheduler)
{
    return new Spider(site, Guid.NewGuid().ToString(), null, null, pageProcessor, scheduler);
}
/// <summary> /// Create a spider with pageProcessor. /// </summary> /// <param name="identify"></param> /// <param name="pageProcessor"></param> /// <returns></returns> public static Spider Create(string identify, IPageProcessor pageProcessor) { return(new Spider(identify, pageProcessor)); }
protected BaseModelSpider(string identify, IPageProcessor pageProcessor, IScheduler scheduler)
    : base(identify, pageProcessor, scheduler)
{
}
public OOSpider(IPageProcessor pageProcessor)
    : base(pageProcessor)
{
    this.modelPipeline = new ModelPipeline<T>();
}
/// <summary>
/// Create a spider with pageProcessor and scheduler.
/// </summary>
/// <param name="pageProcessor">Page processor used to parse downloaded pages</param>
/// <param name="scheduler">Scheduler of requests</param>
/// <returns>The spider</returns>
public static Spider Create(IPageProcessor pageProcessor, IScheduler scheduler)
{
    return new Spider(Guid.NewGuid().ToString(), pageProcessor, scheduler);
}
/// <summary>
/// Initializes a new instance of the <see cref="ControllersRequestHandler" /> class.
/// </summary>
/// <param name="controllersProcessor">The controllers processor.</param>
/// <param name="pageProcessor">The page processor.</param>
/// <param name="redirector">The redirector.</param>
public ControllersRequestHandler(IControllersProcessor controllersProcessor, IPageProcessor pageProcessor, IRedirector redirector)
{
    _controllersProcessor = controllersProcessor;
    _pageProcessor = pageProcessor;
    _redirector = redirector;
}
public EntityGeneralSpider(Site site, string identify, string userid, string taskGroup, IPageProcessor pageProcessor, IScheduler scheduler)
    : base(site, identify, userid, taskGroup, pageProcessor, scheduler)
{
}
/// <summary>
/// Create a spider with pageProcessor.
/// </summary>
/// <param name="pageProcessor">Page processor used to parse downloaded pages</param>
public Spider(IPageProcessor pageProcessor)
{
    this.pageProcessor = pageProcessor;
    this.site = pageProcessor.GetSite();
}
public EntityGeneralSpider(string identify, IPageProcessor pageProcessor, IScheduler scheduler)
    : base(identify, pageProcessor, scheduler)
{
}
public Spider(Site site, IPageProcessor pageProcessor, IScheduler scheduler, IDownLoader downLoader)
    : this(site, pageProcessor, scheduler)
{
    DownLoader = downLoader;
}
/// <summary>
/// Add a page processor to the builder.
/// </summary>
public SpiderBuilder AddPageProcessor(IPageProcessor processor)
{
    _spider.AddPageProcessor(processor);
    return this;
}
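A minimal chaining sketch for the builder method above; only AddPageProcessor comes from the snippet, while MyPageProcessor and the Build() call are hypothetical.

// Hypothetical: AddPageProcessor returns the builder, so calls chain.
// MyPageProcessor : IPageProcessor and Build() are assumed to exist.
var spider = new SpiderBuilder()
    .AddPageProcessor(new MyPageProcessor())
    .Build();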
protected BaseModelSpider(Site site, string identify, string userid, string taskGroup, IPageProcessor pageProcessor, IScheduler scheduler)
    : base(site, identify, userid, taskGroup, pageProcessor, scheduler)
{
}
/// <summary> /// Create a spider with pageProcessor. /// </summary> /// <param name="pageProcessor"></param> /// <returns></returns> public static Spider Create(IPageProcessor pageProcessor) { return(new Spider(null, pageProcessor)); }
public OoSpider(string identify, IPageProcessor pageProcessor)
    : base(identify, pageProcessor)
{
}
/// <summary>
/// Add a page processor to the spider.
/// </summary>
/// <param name="processor">The page processor</param>
/// <returns>The spider</returns>
public virtual Spider AddPageProcessor(IPageProcessor processor)
{
    return AddPageProcessors(processor);
}
/// <summary>
/// Create a spider with identify, pageProcessor and scheduler.
/// </summary>
/// <param name="site">Target site of the task</param>
/// <param name="identify">Identity of the task</param>
/// <param name="userid">Id of the user who starts the task</param>
/// <param name="taskGroup">Group the task belongs to</param>
/// <param name="pageProcessor">Page processor used to parse downloaded pages</param>
/// <param name="scheduler">Scheduler of requests</param>
/// <returns>The spider</returns>
public static Spider Create(Site site, string identify, string userid, string taskGroup, IPageProcessor pageProcessor, IScheduler scheduler)
{
    return new Spider(site, identify, userid, taskGroup, pageProcessor, scheduler);
}
/// <summary> /// Create a spider with pageProcessor. /// </summary> /// <param name="site"></param> /// <param name="pageProcessor"></param> /// <returns></returns> public static Spider Create(Site site, IPageProcessor pageProcessor) { return(new Spider(site, Guid.NewGuid().ToString(), null, null, pageProcessor, new QueueDuplicateRemovedScheduler())); }
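A usage sketch tying the Create overloads together. Site, the factory signatures, and QueueDuplicateRemovedScheduler come from the snippets above; MyPageProcessor, the Site object initializer, and the Run() call are assumptions.

// Sketch: creating and starting a spider via the factories above.
// MyPageProcessor : IPageProcessor and spider.Run() are assumptions,
// and Site.Domain is assumed settable here.
var site = new Site { Domain = "example.com" };
Spider spider = Spider.Create(
    site,
    "my-task",        // identify: A-Z a-z 0-9 _ - only
    "DotnetSpider",   // userid (defaulted when empty)
    "DotnetSpider",   // taskGroup (defaulted when empty)
    new MyPageProcessor(),
    new QueueDuplicateRemovedScheduler());
spider.Run();         // assumed entry point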