Exemple #1
0
        /// <summary>
        /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
        /// </summary>
        /// <param name="threadManager">Distributes http requests over multiple threads</param>
        /// <param name="scheduler">Decides what link should be crawled next</param>
        /// <param name="pageRequester">Makes the raw http requests</param>
        /// <param name="htmlParser">Parses a crawled page for it's hyperlinks</param>
        /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
        /// <param name="crawlConfiguration">Configurable crawl values</param>
        /// <param name="memoryManager">Checks the memory usage of the host process</param>
        public WebCrawler(
            CrawlConfiguration crawlConfiguration,
            ICrawlDecisionMaker crawlDecisionMaker,
            IThreadManager threadManager,
            IScheduler scheduler,
            IPageRequester pageRequester,
            IHtmlParser htmlParser,
            IMemoryManager memoryManager)
        {
            _crawlContext = new CrawlContext
            {
                CrawlConfiguration = crawlConfiguration ?? new CrawlConfiguration()
            };
            CrawlBag = _crawlContext.CrawlBag;

            _threadManager      = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
            _scheduler          = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
            _pageRequester      = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration, new WebContentExtractor());
            _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

            if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 ||
                _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
            {
                _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
            }

            _htmlParser = htmlParser ?? new AngleSharpHyperlinkParser(_crawlContext.CrawlConfiguration, null);

            _crawlContext.Scheduler = _scheduler;
        }
Exemple #2
0
        /// <summary>
        /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
        /// </summary>
        /// <param name="threadManager">Distributes http requests over multiple threads</param>
        /// <param name="scheduler">Decides what link should be crawled next</param>
        /// <param name="pageRequester">Makes the raw http requests</param>
        /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param>
        /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
        /// <param name="crawlConfiguration">Configurable crawl values</param>
        /// <param name="memoryManager">Checks the memory usage of the host process</param>
        public WebCrawler(
            CrawlConfiguration crawlConfiguration,
            ICrawlDecisionMaker crawlDecisionMaker,
            IThreadManager threadManager,
            IScheduler scheduler,
            IPageRequester pageRequester,
            IHyperLinkParser hyperLinkParser,
            IMemoryManager memoryManager)
        {
            _crawlContext = new CrawlContext();
            _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile();
            CrawlBag = _crawlContext.CrawlBag;

            _threadManager      = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
            _scheduler          = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
            _pageRequester      = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
            _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

            if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 ||
                _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
            {
                _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
            }

            _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(_crawlContext.CrawlConfiguration.IsRespectMetaRobotsNoFollowEnabled, _crawlContext.CrawlConfiguration.IsRespectAnchorRelNoFollowEnabled);

            _crawlContext.Scheduler = _scheduler;
        }
Exemple #3
0
        /// <summary>
        /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
        /// </summary>
        /// <param name="threadManager">Distributes http requests over multiple threads</param>
        /// <param name="scheduler">Decides what link should be crawled next</param>
        /// <param name="httpRequester">Makes the raw http requests</param>
        /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param>
        /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
        /// <param name="crawlConfiguration">Configurable crawl values</param>
        public WebCrawler(
            CrawlConfiguration crawlConfiguration,
            ICrawlDecisionMaker crawlDecisionMaker,
            IThreadManager threadManager,
            IScheduler scheduler,
            IPageRequester httpRequester,
            IHyperLinkParser hyperLinkParser,
            IMemoryManager memoryManager)
        {
            _crawlContext = new CrawlContext();
            _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile() ?? new CrawlConfiguration();
            CrawlBag = _crawlContext.CrawlBag;

            _threadManager      = threadManager ?? new ManualThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads);
            _scheduler          = scheduler ?? new FifoScheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled);
            _httpRequester      = httpRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
            _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

            if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 ||
                _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
            {
                _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
            }

            _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser();

            _crawlContext.Scheduler = _scheduler;
        }
Exemple #4
0
        public void ConfigureServices(IServiceCollection services)
        {
            services.AddSingleton(Configuration);
            // services.AddTransient<ContentContext>();
            //Custom code
            services.AddDbContextPool <ContentContext>(options =>
            {
                options.UseSqlite(@"Data Source=D:/repos/WebCrawlerPrj/Crawler/DB/content.db;");
            }, poolSize: 200);
            services
            .AddTransient <LaunchOptions>(provider => new LaunchOptions
            {
                Headless = false
            })
            .AddSingleton <IBrowserController, BrowserController>();

            services.AddTransient <CrawlConfiguration>(provider => new CrawlConfiguration
            {
                // HttpRequestTimeoutInSeconds = 30,
                MaxPagesToCrawl      = 0,                    //Max Crawl,
                MaxConcurrentThreads = 20,
                MinCrawlDelayPerDomainMilliSeconds = 300     //Wait this many millisecs between requests
            })
            .AddTransient <IWebContentExtractor, WebContentExtractor>()
            .AddTransient <IPageRequester, PageRequester>()
            //.AddTransient<IPageRequester, ChromiumPageRequester>()
            .AddTransient <IPoliteWebCrawler, PoliteWebCrawler>(provider =>
            {
                var crawlConfiguration = provider.GetRequiredService <CrawlConfiguration>();
                ICrawlDecisionMaker crawlDecisionMaker = null;
                // var crawlDecisionMaker = provider.GetRequiredService<ICrawlDecisionMaker>();
                IThreadManager threadManager = null;
                // var threadManager = provider.GetRequiredService<IThreadManager>();
                IScheduler scheduler = null;
                // var scheduler = provider.GetRequiredService<IScheduler>();
                // IPageRequester pageRequester = null;
                var pageRequester      = provider.GetRequiredService <IPageRequester>();
                IHtmlParser htmlParser = null;
                // var htmlParser = provider.GetRequiredService<IHtmlParser>();
                IMemoryManager memoryManager = null;
                // var memoryManager = provider.GetRequiredService<IMemoryManager>();
                IDomainRateLimiter domainRateLimiter = null;
                // var domainRateLimiter = provider.GetRequiredService<IDomainRateLimiter>();
                IRobotsDotTextFinder robotsDotTextFinder = null;
                // var robotsDotTextFinder = provider.GetRequiredService<IRobotsDotTextFinder>();

                return(new PoliteWebCrawler(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler,
                                            pageRequester, htmlParser, memoryManager, domainRateLimiter, robotsDotTextFinder));
            });
        }
 public PoliteWebCrawler(
     CrawlConfiguration crawlConfiguration,
     ICrawlDecisionMaker crawlDecisionMaker,
     IScheduler scheduler,
     IPageRequester pageRequester,
     IHtmlParser htmlParser,
     IMemoryManager memoryManager,
     IDomainRateLimiter domainRateLimiter,
     IRobotsDotTextFinder robotsDotTextFinder)
     : base(crawlConfiguration, crawlDecisionMaker, scheduler, pageRequester, htmlParser, memoryManager)
 {
     _domainRateLimiter   = domainRateLimiter ?? new DomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);
     _robotsDotTextFinder = robotsDotTextFinder ?? new RobotsDotTextFinder(new PageRequester(_crawlContext.CrawlConfiguration, new WebContentExtractor()));
 }
 public PoliteWebCrawler(
     CrawlConfiguration crawlConfiguration,
     ICrawlDecisionMaker crawlDecisionMaker,
     IThreadManager threadManager,
     IScheduler scheduler,
     IPageRequester pageRequester,
     IHyperLinkParser hyperLinkParser,
     IMemoryManager memoryManager,
     IDomainRateLimiter domainRateLimiter,
     IRobotsDotTextFinder robotsDotTextFinder)
     : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, pageRequester, hyperLinkParser, memoryManager)
 {
     _domainRateLimiter = domainRateLimiter ?? new DomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);
     _robotsDotTextFinder = robotsDotTextFinder ?? new RobotsDotTextFinder(new PageRequester(_crawlContext.CrawlConfiguration));
 }
 public PoliteWebCrawler(
     CrawlConfiguration crawlConfiguration,
     ICrawlDecisionMaker crawlDecisionMaker,
     IThreadManager threadManager,
     IScheduler scheduler,
     IPageRequester httpRequester,
     IHyperLinkParser hyperLinkParser,
     IMemoryManager memoryManager,
     IDomainRateLimiter domainRateLimiter,
     IRobotsDotTextFinder robotsDotTextFinder)
     : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, httpRequester, hyperLinkParser, memoryManager)
 {
     _domainRateLimiter   = domainRateLimiter ?? new DomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);
     _robotsDotTextFinder = robotsDotTextFinder ?? new RobotsDotTextFinder(new PageRequester(_crawlContext.CrawlConfiguration));
 }
Exemple #8
0
 /// <summary>
 /// Creates a crawler instance with custom settings or implementation
 /// </summary>
 /// <param name="threadManager">Distributes http requests over multiple threads</param>
 /// <param name="scheduler">Decides what link should be crawled next</param>
 /// <param name="pageRequester">Makes the raw http requests</param>
 /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param>
 /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
 /// <param name="crawlConfiguration">Configurable crawl values</param>
 /// <param name="memoryManager">Checks the memory usage of the host process</param>
 /// <param name="domainRateLimiter"></param>
 /// <param name="robotsDotTextFinder"></param>
 /// <param name="sitemapLoader"></param>
 public GoogleWebCrawler(
     CrawlConfiguration crawlConfiguration  = null,
     ICrawlDecisionMaker crawlDecisionMaker = null,
     IThreadManager threadManager           = null,
     IScheduler scheduler                     = null,
     IPageRequester pageRequester             = null,
     IHyperLinkParser hyperLinkParser         = null,
     IMemoryManager memoryManager             = null,
     IDomainRateLimiter domainRateLimiter     = null,
     IRobotsDotTextFinder robotsDotTextFinder = null,
     IRobotsSitemapLoader sitemapLoader       = null)
     : base(SetConfig(crawlConfiguration), crawlDecisionMaker, threadManager, scheduler, pageRequester ?? new BrowserPageRequester(crawlConfiguration), hyperLinkParser, memoryManager, domainRateLimiter, robotsDotTextFinder)
 {
     SitemapLoader = sitemapLoader ?? new RobotsSitemapLoader();
 }
		public MultiProxyPoliteWebCrawler(
			MultiProxyCrawlConfiguration crawlConfiguration,
			ICrawlDecisionMaker crawlDecisionMaker,
			IThreadManager threadManager,
			IScheduler scheduler,
			IPageRequester pageRequester,
			IHyperLinkParser hyperLinkParser,
			IMemoryManager memoryManager,
			IMultiProxyDomainRateLimiter domainRateLimiter,
			IRobotsDotTextFinder robotsDotTextFinder)
            : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, pageRequester, hyperLinkParser, memoryManager)
        {
			if ((pageRequester as MultiProxyPageRequester) == null)
				_pageRequester = new MultiProxyPageRequester(crawlConfiguration);
			_domainRateLimiter = domainRateLimiter ?? new MultiProxyDomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);
			_robotsDotTextFinder = robotsDotTextFinder ?? new RobotsDotTextFinder(_pageRequester);
		}
Exemple #10
0
        public WebCrawler(ILogger <WebCrawler> logger, CrawlConfiguration crawlConfiguration, IThreadManager threadManager, ICrawlDecisionMaker crawlDecisionMaker, ICrawlScheduler crawlScheduler, IPageRequester pageRequester, IDocumentParser hyperLinkParser, IRateLimiter rateLimiter)
        {
            _logger             = logger;
            _crawlConfiguration = crawlConfiguration;
            _crawlContext       = new CrawlContext(logger)
            {
                CrawlConfiguration = crawlConfiguration
            };

            _threadManager      = threadManager;
            _crawlDecisionMaker = crawlDecisionMaker;
            _scheduler          = crawlScheduler;
            _pageRequester      = pageRequester;
            _hyperLinkParser    = hyperLinkParser;
            _rateLimiter        = rateLimiter;

            PageCrawlStartingAsync += WebCrawler_PageCrawlStartingAsync;
        }
        public CrawlerLight(CrawlConfiguration crawlConfiguration, ICrawlDecisionMaker crawlDecisionMaker,
                            IThreadManager threadManager, IScheduler scheduler)
        {
            _crawlContext = new CrawlContext
            {
                CrawlConfiguration = crawlConfiguration ?? new CrawlConfiguration()
            };

            _scheduler = scheduler ?? new Scheduler();
            int cpuBound = _crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0
                ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads
                : Environment.ProcessorCount;

            _threadManager          = threadManager ?? new TaskThreadManager(cpuBound, _crawlContext.CancellationTokenSource);
            _linkProvider           = new AngleSharpLinkParser();
            _crawlDecisionMaker     = crawlDecisionMaker ?? new CrawlDirector();
            _crawlContext.Scheduler = _scheduler;
        }
 public ImageAltTextChecker(CrawlConfiguration crawlConfiguration, ICrawlDecisionMaker crawlDecisionMaker, IThreadManager threadManager, IScheduler scheduler, IPageRequester pageRequester, IHyperLinkParser hyperLinkParser, IMemoryManager memoryManager, IDomainRateLimiter domainRateLimiter, IRobotsDotTextFinder robotsDotTextFinder)
     : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, pageRequester, hyperLinkParser, memoryManager, domainRateLimiter, robotsDotTextFinder)
 {
 }
Exemple #13
0
 public WebChecker(CrawlConfiguration crawlConfiguration, ICrawlDecisionMaker crawlDecisionMaker, IThreadManager threadManager, IScheduler scheduler, IPageRequester pageRequester, IHyperLinkParser hyperLinkParser, IMemoryManager memoryManager, IDomainRateLimiter domainRateLimiter, IRobotsDotTextFinder robotsDotTextFinder)
     : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, pageRequester, hyperLinkParser, memoryManager, domainRateLimiter, robotsDotTextFinder)
 {
 }
Exemple #14
0
        /// <summary>
        /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
        /// </summary>
        /// <param name="threadManager">Distributes http requests over multiple threads</param>
        /// <param name="scheduler">Decides what link should be crawled next</param>
        /// <param name="httpRequester">Makes the raw http requests</param>
        /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param>
        /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
        /// <param name="crawlConfiguration">Configurable crawl values</param>
        public WebCrawler(
            CrawlConfiguration crawlConfiguration, 
            ICrawlDecisionMaker crawlDecisionMaker, 
            IThreadManager threadManager, 
            IScheduler scheduler, 
            IPageRequester httpRequester, 
            IHyperLinkParser hyperLinkParser, 
            IMemoryManager memoryManager)
        {
            _crawlContext = new CrawlContext();
            _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile() ?? new CrawlConfiguration();
            CrawlBag = _crawlContext.CrawlBag;

            _threadManager = threadManager ?? new ManualThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads);
            _scheduler = scheduler ?? new FifoScheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled);
            _httpRequester = httpRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
            _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

            if(_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0
                || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
                _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));

            _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser();

            _crawlContext.Scheduler = _scheduler;
        }
Exemple #15
0
        /// <summary>
        /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
        /// </summary>
        /// <param name="threadManager">Distributes http requests over multiple threads</param>
        /// <param name="scheduler">Decides what link should be crawled next</param>
        /// <param name="pageRequester">Makes the raw http requests</param>
        /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param>
        /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
        /// <param name="crawlConfiguration">Configurable crawl values</param>
        /// <param name="memoryManager">Checks the memory usage of the host process</param>
        public WebCrawler(
            CrawlConfiguration crawlConfiguration,
            ICrawlDecisionMaker crawlDecisionMaker,
            IThreadManager threadManager,
            IScheduler scheduler,
            IPageRequester pageRequester,
            IHyperLinkParser hyperLinkParser,
            IMemoryManager memoryManager)
        {
            _crawlContext = new CrawlContext();
            _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile();
            CrawlBag = _crawlContext.CrawlBag;

            _threadManager = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
            _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
            _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
            _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

            if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0
                || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
                _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));

            _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(_crawlContext.CrawlConfiguration.IsRespectMetaRobotsNoFollowEnabled, _crawlContext.CrawlConfiguration.IsRespectAnchorRelNoFollowEnabled, null, _crawlContext.CrawlConfiguration.IsRespectUrlNamedAnchorOrHashbangEnabled);

            _crawlContext.Scheduler = _scheduler;
        }
Exemple #16
0
        public ThinkWebCrawler(ILogger <WebCrawler> logger, CrawlConfiguration crawlConfiguration, IThreadManager threadManager, ICrawlDecisionMaker crawlDecisionMaker, ICrawlScheduler crawlScheduler, IPageRequester pageRequester, IDocumentParser hyperLinkParser, IRateLimiter rateLimiter, IOptions <ThinkCrawlConfiguration> thinkCrawlOpt) : base(logger, crawlConfiguration, threadManager, crawlDecisionMaker, crawlScheduler, pageRequester, hyperLinkParser, rateLimiter)
        {
            _thinkCrawlConfiguration = thinkCrawlOpt.Value;

            Check();
        }