/// <summary>
/// Initializes a new instance of the <see cref="WebsiteCrawler"/> class.
/// </summary>
/// <param name="observer">The Observer of the instance to create.</param>
/// <param name="source">The source to observe.</param>
public WebsiteCrawler([NotNull] ICrawlerObserver observer, [NotNull] Source source)
    : base(observer, source)
{
    // Fixed the doc cref: it previously pointed at the base Crawler class.
    // Only compile a matcher when the source actually defines a pattern;
    // RegexOptions.Compiled trades one-time construction cost for faster
    // repeated matching during the crawl.
    if (!string.IsNullOrEmpty(source.Regex))
    {
        this.regex = new Regex(source.Regex, RegexOptions.Compiled);
    }
}
/// <summary>
/// Initializes a new instance of the <see cref="RssCrawler"/> class.
/// </summary>
/// <param name="observer">The Observer of the instance to create.</param>
/// <param name="source">The source to observe.</param>
public RssCrawler([NotNull] ICrawlerObserver observer, [NotNull] Source source)
    : base(observer, source)
{
    // A source carrying a regex or XPath filter is handled by delegating to
    // the website crawler; a plain feed instead tracks recently seen items.
    var hasContentFilter =
        !string.IsNullOrEmpty(source.Regex) || !string.IsNullOrEmpty(source.XPath);

    if (hasContentFilter)
    {
        this.websiteCrawler = new WebsiteCrawler(observer, source);
    }
    else
    {
        // Pre-sized set of already-indexed feed entries to avoid re-emitting them.
        this.lastIndexed = new HashSet<RssKey>(5000);
    }
}
/// <summary>
/// Initializes a new instance of the <see cref="Crawler"/> class.
/// </summary>
/// <param name="workerPool">Pool of workers that execute crawl requests.</param>
/// <param name="crawlRequestFilter">Filter deciding which requests are crawled.</param>
/// <param name="linkExtractor">Extractor used to discover links in responses.</param>
/// <param name="observer">Observer notified of crawl events.</param>
/// <param name="customHttpHeaders">Optional extra HTTP headers sent with each request; an empty set is used when null.</param>
/// <param name="requestTimeout">Optional per-request timeout applied to the HTTP client.</param>
/// <param name="maxRetries">Maximum number of retries per request; must be non-negative.</param>
/// <param name="userAgent">Optional User-Agent string for outgoing requests.</param>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="maxRetries"/> is negative.</exception>
public Crawler(WorkerPool workerPool, ICrawlRequestFilter crawlRequestFilter, ILinkExtractor linkExtractor, ICrawlerObserver observer, IReadOnlyDictionary<string, string> customHttpHeaders = null, TimeSpan? requestTimeout = null, int maxRetries = 0, string userAgent = null)
{
    // BUG FIX: validate the incoming parameter, not the field. The original
    // checked `this.maxRetries`, which is still 0 (the field default) at this
    // point, so a negative argument was never rejected.
    if (maxRetries < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(maxRetries), "Max retries must be non-negative");
    }

    this.crawlRequestFilter = crawlRequestFilter;
    this.linkExtractor = linkExtractor;
    this.observer = observer;
    this.maxRetries = maxRetries;
    this.userAgent = userAgent;
    this.customHttpHeaders = customHttpHeaders ?? new Dictionary<string, string>();
    this.workerPool = workerPool;

    if (requestTimeout != null)
    {
        this.httpClient.Timeout = requestTimeout.Value;
    }
}
/// <summary>
/// Builds a fully wired <see cref="Crawler"/> from command-line options.
/// </summary>
/// <param name="opts">Parsed command-line options driving the crawl.</param>
/// <param name="observer">Observer notified of crawl events.</param>
/// <returns>A configured <see cref="Crawler"/> instance.</returns>
private static Crawler CreateCrawler(Options opts, ICrawlerObserver observer)
{
    // Authorities of the start URLs define the "internal" hosts for filtering.
    var internalAuthorities = opts.StartUrls.Select(url => new Uri(url).Authority);

    var extractor = new CompositeExtractor(
        new HtmlLinkExtractor(),
        new SiteMapLinkExtractor(),
        new RobotsTxtSitemapExtractor());

    var filter = new CompositeFilter(
        new AuthorityFilter(internalAuthorities, opts.TestExternalUrls),
        new MaxDepthFilter(opts.MaxDepth));

    // Translate the optional seconds value into an optional TimeSpan.
    TimeSpan? timeout = null;
    if (opts.RequestTimeout != null)
    {
        timeout = TimeSpan.FromSeconds(opts.RequestTimeout.Value);
    }

    return new Crawler(
        new WorkerPool(opts.NumberOfWorkers),
        filter,
        extractor,
        observer,
        opts.RequestHeaders,
        timeout,
        opts.MaxRetries,
        opts.UserAgent);
}
/// <summary>
/// Initializes a new instance of the <see cref="Crawler"/> class.
/// </summary>
/// <param name="observer">The Observer of the instance to create.</param>
/// <param name="source">The source to observe.</param>
protected Crawler([NotNull] ICrawlerObserver observer, [NotNull] Source source)
{
    // Capture the collaborators; assignments are independent of each other.
    Source = source;
    Observer = observer;
}