/// <summary>
 /// Initializes a new instance of the <see cref="WebsiteCrawler"/> class.
 /// </summary>
 /// <param name="observer">The Observer of the instance to create.</param>
 /// <param name="source">The source to observe.</param>
 public WebsiteCrawler([NotNull] ICrawlerObserver observer, [NotNull] Source source)
     : base(observer, source)
 {
     // Only build the matcher when the source actually configures a pattern;
     // Compiled trades one-time JIT cost for faster repeated matching during a crawl.
     if (!string.IsNullOrEmpty(source.Regex))
     {
         this.regex = new Regex(source.Regex, RegexOptions.Compiled);
     }
 }
 /// <summary>
 /// Initializes a new instance of the <see cref="RssCrawler"/> class.
 /// </summary>
 /// <param name="observer">The Observer of the instance to create.</param>
 /// <param name="source">The source to observe.</param>
 public RssCrawler([NotNull] ICrawlerObserver observer, [NotNull] Source source) : base(observer, source)
 {
     if (!string.IsNullOrEmpty(source.Regex) || !string.IsNullOrEmpty(source.XPath))
     {
         this.websiteCrawler = new WebsiteCrawler(observer, source);
     }
     else
     {
         this.lastIndexed = new HashSet <RssKey>(5000);
     }
 }
// Example #3 (score: 0)
        /// <summary>
        /// Initializes a new instance of the <see cref="Crawler"/> class.
        /// </summary>
        /// <param name="workerPool">The pool supplying workers that execute crawl requests.</param>
        /// <param name="crawlRequestFilter">Decides which discovered URLs are actually crawled.</param>
        /// <param name="linkExtractor">Extracts follow-up links from fetched documents.</param>
        /// <param name="observer">Receives crawl progress and result notifications.</param>
        /// <param name="customHttpHeaders">Optional extra HTTP headers sent with every request; an empty set is used when null.</param>
        /// <param name="requestTimeout">Optional per-request timeout; the HTTP client default is kept when null.</param>
        /// <param name="maxRetries">Number of times a failed request is retried; must be non-negative.</param>
        /// <param name="userAgent">Optional User-Agent string for outgoing requests.</param>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="maxRetries"/> is negative.</exception>
        public Crawler(WorkerPool workerPool, ICrawlRequestFilter crawlRequestFilter, ILinkExtractor linkExtractor, ICrawlerObserver observer,
                       IReadOnlyDictionary<string, string> customHttpHeaders = null, TimeSpan? requestTimeout = null, int maxRetries = 0, string userAgent = null)
        {
            // BUG FIX: the guard previously tested this.maxRetries (the field, still 0
            // at this point) instead of the parameter, so negative values slipped through.
            if (maxRetries < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(maxRetries), "Max retries must be non-negative");
            }

            this.crawlRequestFilter = crawlRequestFilter;
            this.linkExtractor      = linkExtractor;
            this.observer           = observer;
            this.maxRetries         = maxRetries;
            this.userAgent          = userAgent;
            this.customHttpHeaders  = customHttpHeaders ?? new Dictionary<string, string>();
            this.workerPool         = workerPool;

            if (requestTimeout != null)
            {
                this.httpClient.Timeout = requestTimeout.Value;
            }
        }
// Example #4 (score: 0)
        /// <summary>
        /// Builds a fully wired <see cref="Crawler"/> from command-line options.
        /// </summary>
        /// <param name="opts">Parsed command-line options driving crawler configuration.</param>
        /// <param name="observer">Receives crawl progress and result notifications.</param>
        /// <returns>A configured crawler ready to run.</returns>
        private static Crawler CreateCrawler(Options opts, ICrawlerObserver observer)
        {
            // Authorities of the start URLs define which hosts count as "internal".
            var internalAuthorities = opts.StartUrls.Select(url => new Uri(url).Authority);

            var filter = new CompositeFilter(
                new AuthorityFilter(internalAuthorities, opts.TestExternalUrls),
                new MaxDepthFilter(opts.MaxDepth));

            var extractor = new CompositeExtractor(
                new HtmlLinkExtractor(),
                new SiteMapLinkExtractor(),
                new RobotsTxtSitemapExtractor());

            // Leave the timeout unset unless the user supplied one (seconds).
            TimeSpan? timeout = null;
            if (opts.RequestTimeout != null)
            {
                timeout = TimeSpan.FromSeconds(opts.RequestTimeout.Value);
            }

            return new Crawler(
                new WorkerPool(opts.NumberOfWorkers),
                filter,
                extractor,
                observer,
                opts.RequestHeaders,
                timeout,
                opts.MaxRetries,
                opts.UserAgent);
        }
// Example #5 (score: 0)
 /// <summary>
 /// Initializes a new instance of the <see cref="Crawler"/> class.
 /// </summary>
 /// <param name="observer">The Observer of the instance to create.</param>
 /// <param name="source">The source to observe.</param>
 protected Crawler([NotNull] ICrawlerObserver observer, [NotNull] Source source)
 {
     Observer = observer;
     Source   = source;
 }