Example #1
0
        /// <summary>
        /// Registers the application's services: the configuration object, a pooled
        /// <c>ContentContext</c> backed by SQLite, the browser launch options and controller,
        /// and the crawler together with the pieces it is built from.
        /// </summary>
        /// <param name="services">Service collection that receives the registrations.</param>
        public void ConfigureServices(IServiceCollection services)
        {
            services.AddSingleton(Configuration);

            // Pooled DbContext (pool size 200) pointing at a SQLite file.
            // NOTE(review): the database path is an absolute, machine-specific location.
            services.AddDbContextPool<ContentContext>(
                dbOptions => dbOptions.UseSqlite(@"Data Source=D:/repos/WebCrawlerPrj/Crawler/DB/content.db;"),
                poolSize: 200);

            // Browser: a fresh, non-headless LaunchOptions per resolution; one shared controller.
            services.AddTransient<LaunchOptions>(_ => new LaunchOptions { Headless = false });
            services.AddSingleton<IBrowserController, BrowserController>();

            // Crawl settings are rebuilt on every resolution.
            services.AddTransient<CrawlConfiguration>(_ =>
            {
                var settings = new CrawlConfiguration();
                // NOTE(review): 0 is presumably meant as "no page limit" - confirm against the crawler library.
                settings.MaxPagesToCrawl = 0;
                settings.MaxConcurrentThreads = 20;
                // Wait this many milliseconds between requests to the same domain.
                settings.MinCrawlDelayPerDomainMilliSeconds = 300;
                return settings;
            });

            services.AddTransient<IWebContentExtractor, WebContentExtractor>();
            services.AddTransient<IPageRequester, PageRequester>();

            services.AddTransient<IPoliteWebCrawler, PoliteWebCrawler>(sp =>
            {
                // Only the configuration and the page requester come from the container.
                var config = sp.GetRequiredService<CrawlConfiguration>();
                var requester = sp.GetRequiredService<IPageRequester>();

                // Everything else is handed over as null. The locals stay typed so the
                // same PoliteWebCrawler constructor overload is selected.
                ICrawlDecisionMaker decisionMaker = null;
                IThreadManager threads = null;
                IScheduler crawlScheduler = null;
                IHtmlParser parser = null;
                IMemoryManager memory = null;
                IDomainRateLimiter rateLimiter = null;
                IRobotsDotTextFinder robotsFinder = null;

                return new PoliteWebCrawler(
                    config,
                    decisionMaker,
                    threads,
                    crawlScheduler,
                    requester,
                    parser,
                    memory,
                    rateLimiter,
                    robotsFinder);
            });
        }
Example #2
0
 /// <summary>
 /// Creates a polite crawler. The configuration, decision maker, scheduler, page requester,
 /// HTML parser and memory manager are forwarded to the base constructor; the rate limiter
 /// and robots.txt finder fall back to default instances when null is supplied.
 /// </summary>
 /// <param name="crawlConfiguration">Forwarded to the base constructor.</param>
 /// <param name="crawlDecisionMaker">Forwarded to the base constructor.</param>
 /// <param name="scheduler">Forwarded to the base constructor.</param>
 /// <param name="pageRequester">Forwarded to the base constructor.</param>
 /// <param name="htmlParser">Forwarded to the base constructor.</param>
 /// <param name="memoryManager">Forwarded to the base constructor.</param>
 /// <param name="domainRateLimiter">Rate limiter to use; when null a <c>DomainRateLimiter</c> is created from the configured minimum crawl delay.</param>
 /// <param name="robotsDotTextFinder">Robots.txt finder to use; when null a <c>RobotsDotTextFinder</c> with its own <c>PageRequester</c> is created.</param>
 public PoliteWebCrawler(
     CrawlConfiguration crawlConfiguration,
     ICrawlDecisionMaker crawlDecisionMaker,
     IScheduler scheduler,
     IPageRequester pageRequester,
     IHtmlParser htmlParser,
     IMemoryManager memoryManager,
     IDomainRateLimiter domainRateLimiter,
     IRobotsDotTextFinder robotsDotTextFinder)
     : base(
         crawlConfiguration,
         crawlDecisionMaker,
         scheduler,
         pageRequester,
         htmlParser,
         memoryManager)
 {
     // Defaults are derived from the crawl context's configuration, not from the raw argument.
     if (domainRateLimiter == null)
     {
         var minDelay = _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds;
         domainRateLimiter = new DomainRateLimiter(minDelay);
     }
     _domainRateLimiter = domainRateLimiter;

     if (robotsDotTextFinder == null)
     {
         var robotsRequester = new PageRequester(_crawlContext.CrawlConfiguration, new WebContentExtractor());
         robotsDotTextFinder = new RobotsDotTextFinder(robotsRequester);
     }
     _robotsDotTextFinder = robotsDotTextFinder;
 }
Example #3
0
 /// <summary>
 /// Creates a polite crawler with explicit collaborators. The first seven arguments are passed
 /// straight to the base constructor; the rate limiter and robots.txt finder are optional and
 /// replaced by default instances when null.
 /// </summary>
 /// <param name="crawlConfiguration">Forwarded to the base constructor.</param>
 /// <param name="crawlDecisionMaker">Forwarded to the base constructor.</param>
 /// <param name="threadManager">Forwarded to the base constructor.</param>
 /// <param name="scheduler">Forwarded to the base constructor.</param>
 /// <param name="pageRequester">Forwarded to the base constructor.</param>
 /// <param name="hyperLinkParser">Forwarded to the base constructor.</param>
 /// <param name="memoryManager">Forwarded to the base constructor.</param>
 /// <param name="domainRateLimiter">Rate limiter to use, or null for a default <c>DomainRateLimiter</c>.</param>
 /// <param name="robotsDotTextFinder">Robots.txt finder to use, or null for a default <c>RobotsDotTextFinder</c>.</param>
 public PoliteWebCrawler(
     CrawlConfiguration crawlConfiguration,
     ICrawlDecisionMaker crawlDecisionMaker,
     IThreadManager threadManager,
     IScheduler scheduler,
     IPageRequester pageRequester,
     IHyperLinkParser hyperLinkParser,
     IMemoryManager memoryManager,
     IDomainRateLimiter domainRateLimiter,
     IRobotsDotTextFinder robotsDotTextFinder)
     : base(
         crawlConfiguration,
         crawlDecisionMaker,
         threadManager,
         scheduler,
         pageRequester,
         hyperLinkParser,
         memoryManager)
 {
     // Use the caller's rate limiter if there is one; otherwise build one from the configured delay.
     _domainRateLimiter = domainRateLimiter != null
         ? domainRateLimiter
         : new DomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);

     // The default robots.txt finder gets a dedicated PageRequester built from the context's configuration.
     _robotsDotTextFinder = robotsDotTextFinder != null
         ? robotsDotTextFinder
         : new RobotsDotTextFinder(new PageRequester(_crawlContext.CrawlConfiguration));
 }
 /// <summary>
 /// Creates a polite crawler. Core collaborators go to the base constructor unchanged; a
 /// missing (null) rate limiter or robots.txt finder is substituted with a default one.
 /// </summary>
 /// <param name="crawlConfiguration">Forwarded to the base constructor.</param>
 /// <param name="crawlDecisionMaker">Forwarded to the base constructor.</param>
 /// <param name="threadManager">Forwarded to the base constructor.</param>
 /// <param name="scheduler">Forwarded to the base constructor.</param>
 /// <param name="httpRequester">Forwarded to the base constructor.</param>
 /// <param name="hyperLinkParser">Forwarded to the base constructor.</param>
 /// <param name="memoryManager">Forwarded to the base constructor.</param>
 /// <param name="domainRateLimiter">Optional rate limiter; defaults to a <c>DomainRateLimiter</c> using the configured minimum crawl delay.</param>
 /// <param name="robotsDotTextFinder">Optional robots.txt finder; defaults to a <c>RobotsDotTextFinder</c> over a new <c>PageRequester</c>.</param>
 public PoliteWebCrawler(
     CrawlConfiguration crawlConfiguration,
     ICrawlDecisionMaker crawlDecisionMaker,
     IThreadManager threadManager,
     IScheduler scheduler,
     IPageRequester httpRequester,
     IHyperLinkParser hyperLinkParser,
     IMemoryManager memoryManager,
     IDomainRateLimiter domainRateLimiter,
     IRobotsDotTextFinder robotsDotTextFinder)
     : base(
         crawlConfiguration,
         crawlDecisionMaker,
         threadManager,
         scheduler,
         httpRequester,
         hyperLinkParser,
         memoryManager)
 {
     if (domainRateLimiter != null)
     {
         _domainRateLimiter = domainRateLimiter;
     }
     else
     {
         // No limiter supplied: throttle using the delay from the crawl context's configuration.
         _domainRateLimiter = new DomainRateLimiter(_crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);
     }

     if (robotsDotTextFinder != null)
     {
         _robotsDotTextFinder = robotsDotTextFinder;
     }
     else
     {
         // No finder supplied: create one with its own PageRequester.
         _robotsDotTextFinder = new RobotsDotTextFinder(new PageRequester(_crawlContext.CrawlConfiguration));
     }
 }
Example #5
0
 /// <summary>
 /// Creates a crawler instance with custom settings or implementations. Every argument is
 /// optional and defaults to null.
 /// </summary>
 /// <param name="crawlConfiguration">Configurable crawl values; passed through <c>SetConfig</c> before reaching the base constructor</param>
 /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
 /// <param name="threadManager">Distributes http requests over multiple threads</param>
 /// <param name="scheduler">Decides what link should be crawled next</param>
 /// <param name="pageRequester">Makes the raw http requests; when null a <c>BrowserPageRequester</c> is created</param>
 /// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks</param>
 /// <param name="memoryManager">Checks the memory usage of the host process</param>
 /// <param name="domainRateLimiter">Forwarded unchanged to the base constructor</param>
 /// <param name="robotsDotTextFinder">Forwarded unchanged to the base constructor</param>
 /// <param name="sitemapLoader">Stored in <c>SitemapLoader</c>; when null a <c>RobotsSitemapLoader</c> is created</param>
 public GoogleWebCrawler(
     CrawlConfiguration crawlConfiguration  = null,
     ICrawlDecisionMaker crawlDecisionMaker = null,
     IThreadManager threadManager           = null,
     IScheduler scheduler                     = null,
     IPageRequester pageRequester             = null,
     IHyperLinkParser hyperLinkParser         = null,
     IMemoryManager memoryManager             = null,
     IDomainRateLimiter domainRateLimiter     = null,
     IRobotsDotTextFinder robotsDotTextFinder = null,
     IRobotsSitemapLoader sitemapLoader       = null)
     : base(
         SetConfig(crawlConfiguration),
         crawlDecisionMaker,
         threadManager,
         scheduler,
         pageRequester ?? new BrowserPageRequester(crawlConfiguration),
         hyperLinkParser,
         memoryManager,
         domainRateLimiter,
         robotsDotTextFinder)
 {
     // NOTE(review): the fallback BrowserPageRequester above receives the raw crawlConfiguration
     // argument (possibly null), not the value returned by SetConfig - confirm this is intended.
     if (sitemapLoader == null)
     {
         SitemapLoader = new RobotsSitemapLoader();
     }
     else
     {
         SitemapLoader = sitemapLoader;
     }
 }
		/// <summary>
		/// Creates a multi-proxy polite crawler. The core collaborators are forwarded to the base
		/// constructor; afterwards the page requester is replaced unless a
		/// <c>MultiProxyPageRequester</c> was supplied, and the rate limiter and robots.txt finder
		/// fall back to defaults when null.
		/// </summary>
		/// <param name="crawlConfiguration">Forwarded to the base constructor; also used to build a replacement <c>MultiProxyPageRequester</c>.</param>
		/// <param name="crawlDecisionMaker">Forwarded to the base constructor.</param>
		/// <param name="threadManager">Forwarded to the base constructor.</param>
		/// <param name="scheduler">Forwarded to the base constructor.</param>
		/// <param name="pageRequester">Forwarded to the base constructor; replaced afterwards if it is null or not a <c>MultiProxyPageRequester</c>.</param>
		/// <param name="hyperLinkParser">Forwarded to the base constructor.</param>
		/// <param name="memoryManager">Forwarded to the base constructor.</param>
		/// <param name="domainRateLimiter">Rate limiter to use, or null for a default <c>MultiProxyDomainRateLimiter</c>.</param>
		/// <param name="robotsDotTextFinder">Robots.txt finder to use, or null for a <c>RobotsDotTextFinder</c> over the current page requester.</param>
		public MultiProxyPoliteWebCrawler(
			MultiProxyCrawlConfiguration crawlConfiguration,
			ICrawlDecisionMaker crawlDecisionMaker,
			IThreadManager threadManager,
			IScheduler scheduler,
			IPageRequester pageRequester,
			IHyperLinkParser hyperLinkParser,
			IMemoryManager memoryManager,
			IMultiProxyDomainRateLimiter domainRateLimiter,
			IRobotsDotTextFinder robotsDotTextFinder)
			: base(
				crawlConfiguration,
				crawlDecisionMaker,
				threadManager,
				scheduler,
				pageRequester,
				hyperLinkParser,
				memoryManager)
		{
			// A null requester, or one of any other type, is swapped for a MultiProxyPageRequester.
			bool hasMultiProxyRequester = pageRequester is MultiProxyPageRequester;
			if (!hasMultiProxyRequester)
			{
				_pageRequester = new MultiProxyPageRequester(crawlConfiguration);
			}

			if (domainRateLimiter == null)
			{
				var minDelay = _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds;
				domainRateLimiter = new MultiProxyDomainRateLimiter(minDelay);
			}
			_domainRateLimiter = domainRateLimiter;

			// The default finder shares whichever page requester is in effect at this point.
			if (robotsDotTextFinder == null)
			{
				robotsDotTextFinder = new RobotsDotTextFinder(_pageRequester);
			}
			_robotsDotTextFinder = robotsDotTextFinder;
		}
 /// <summary>
 /// Creates an <c>ImageAltTextChecker</c>. Adds no behavior of its own: every argument is
 /// handed to the base constructor in the same order.
 /// </summary>
 public ImageAltTextChecker(
     CrawlConfiguration crawlConfiguration,
     ICrawlDecisionMaker crawlDecisionMaker,
     IThreadManager threadManager,
     IScheduler scheduler,
     IPageRequester pageRequester,
     IHyperLinkParser hyperLinkParser,
     IMemoryManager memoryManager,
     IDomainRateLimiter domainRateLimiter,
     IRobotsDotTextFinder robotsDotTextFinder)
     : base(
         crawlConfiguration,
         crawlDecisionMaker,
         threadManager,
         scheduler,
         pageRequester,
         hyperLinkParser,
         memoryManager,
         domainRateLimiter,
         robotsDotTextFinder)
 {
     // Intentionally empty: construction is delegated entirely to the base class.
 }
Example #8
0
 /// <summary>
 /// Creates a <c>WebChecker</c>. All nine collaborators are passed through to the base
 /// constructor unchanged; no additional initialization is performed here.
 /// </summary>
 public WebChecker(
     CrawlConfiguration crawlConfiguration,
     ICrawlDecisionMaker crawlDecisionMaker,
     IThreadManager threadManager,
     IScheduler scheduler,
     IPageRequester pageRequester,
     IHyperLinkParser hyperLinkParser,
     IMemoryManager memoryManager,
     IDomainRateLimiter domainRateLimiter,
     IRobotsDotTextFinder robotsDotTextFinder)
     : base(
         crawlConfiguration,
         crawlDecisionMaker,
         threadManager,
         scheduler,
         pageRequester,
         hyperLinkParser,
         memoryManager,
         domainRateLimiter,
         robotsDotTextFinder)
 {
     // Nothing to do beyond the base-class construction.
 }