/// <summary>
/// Registers the application's services: the configuration instance, a pooled
/// SQLite-backed <c>ContentContext</c>, the browser controller and its launch
/// options, and the crawler together with the pieces it is built from.
/// </summary>
/// <param name="services">Collection that receives the registrations.</param>
public void ConfigureServices(IServiceCollection services)
{
    services.AddSingleton(Configuration);

    // Pooled DbContext (up to 200 instances) over a local SQLite file.
    // NOTE(review): the path is absolute and machine-specific; consider moving it to configuration.
    services.AddDbContextPool<ContentContext>(
        options => options.UseSqlite(@"Data Source=D:/repos/WebCrawlerPrj/Crawler/DB/content.db;"),
        poolSize: 200);

    // Browser automation: a new LaunchOptions per resolution (visible browser window),
    // one shared controller.
    services.AddTransient<LaunchOptions>(sp => new LaunchOptions { Headless = false });
    services.AddSingleton<IBrowserController, BrowserController>();

    services.AddTransient<CrawlConfiguration>(sp => new CrawlConfiguration
    {
        // Max pages to crawl. NOTE(review): 0 presumably means "no limit" - confirm against the crawler library.
        MaxPagesToCrawl = 0,
        MaxConcurrentThreads = 20,
        // Wait this many milliseconds between requests to the same domain.
        MinCrawlDelayPerDomainMilliSeconds = 300
    });

    services.AddTransient<IWebContentExtractor, WebContentExtractor>();
    services.AddTransient<IPageRequester, PageRequester>();

    services.AddTransient<IPoliteWebCrawler, PoliteWebCrawler>(sp =>
    {
        // Only the configuration and the page requester come from the container.
        var crawlConfiguration = sp.GetRequiredService<CrawlConfiguration>();
        var pageRequester = sp.GetRequiredService<IPageRequester>();

        // Every other dependency is deliberately passed as null. The locals stay typed
        // so the same constructor overload is selected.
        // NOTE(review): presumably PoliteWebCrawler substitutes its own defaults for
        // null arguments - confirm against its constructor.
        ICrawlDecisionMaker crawlDecisionMaker = null;
        IThreadManager threadManager = null;
        IScheduler scheduler = null;
        IHtmlParser htmlParser = null;
        IMemoryManager memoryManager = null;
        IDomainRateLimiter domainRateLimiter = null;
        IRobotsDotTextFinder robotsDotTextFinder = null;

        return new PoliteWebCrawler(
            crawlConfiguration,
            crawlDecisionMaker,
            threadManager,
            scheduler,
            pageRequester,
            htmlParser,
            memoryManager,
            domainRateLimiter,
            robotsDotTextFinder);
    });
}
/// <summary>
/// Creates a polite crawler. The configuration, decision maker, scheduler, page
/// requester, HTML parser and memory manager are forwarded to the base constructor;
/// the rate limiter and robots.txt finder are stored here, with defaults created
/// when they are not supplied.
/// </summary>
/// <param name="crawlConfiguration">Forwarded to the base constructor.</param>
/// <param name="crawlDecisionMaker">Forwarded to the base constructor.</param>
/// <param name="scheduler">Forwarded to the base constructor.</param>
/// <param name="pageRequester">Forwarded to the base constructor.</param>
/// <param name="htmlParser">Forwarded to the base constructor.</param>
/// <param name="memoryManager">Forwarded to the base constructor.</param>
/// <param name="domainRateLimiter">
/// Rate limiter to use; when null, a <c>DomainRateLimiter</c> is created from the
/// crawl context's <c>MinCrawlDelayPerDomainMilliSeconds</c>.
/// </param>
/// <param name="robotsDotTextFinder">
/// Robots.txt finder to use; when null, a <c>RobotsDotTextFinder</c> is created over a
/// new <c>PageRequester</c> built from the crawl context's configuration and a new
/// <c>WebContentExtractor</c>.
/// </param>
public PoliteWebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHtmlParser htmlParser,
    IMemoryManager memoryManager,
    IDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(crawlConfiguration, crawlDecisionMaker, scheduler, pageRequester, htmlParser, memoryManager)
{
    if (domainRateLimiter != null)
    {
        _domainRateLimiter = domainRateLimiter;
    }
    else
    {
        _domainRateLimiter = new DomainRateLimiter(
            _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);
    }

    if (robotsDotTextFinder != null)
    {
        _robotsDotTextFinder = robotsDotTextFinder;
    }
    else
    {
        // The fallback finder gets its own requester rather than sharing pageRequester.
        _robotsDotTextFinder = new RobotsDotTextFinder(
            new PageRequester(_crawlContext.CrawlConfiguration, new WebContentExtractor()));
    }
}
/// <summary>
/// Creates a polite crawler. The configuration, decision maker, thread manager,
/// scheduler, page requester, hyperlink parser and memory manager are forwarded to
/// the base constructor; the rate limiter and robots.txt finder are stored here,
/// with defaults created when they are not supplied.
/// </summary>
/// <param name="crawlConfiguration">Forwarded to the base constructor.</param>
/// <param name="crawlDecisionMaker">Forwarded to the base constructor.</param>
/// <param name="threadManager">Forwarded to the base constructor.</param>
/// <param name="scheduler">Forwarded to the base constructor.</param>
/// <param name="pageRequester">Forwarded to the base constructor.</param>
/// <param name="hyperLinkParser">Forwarded to the base constructor.</param>
/// <param name="memoryManager">Forwarded to the base constructor.</param>
/// <param name="domainRateLimiter">
/// Rate limiter to use; when null, a <c>DomainRateLimiter</c> is created from the
/// crawl context's <c>MinCrawlDelayPerDomainMilliSeconds</c>.
/// </param>
/// <param name="robotsDotTextFinder">
/// Robots.txt finder to use; when null, a <c>RobotsDotTextFinder</c> is created over a
/// new <c>PageRequester</c> built from the crawl context's configuration.
/// </param>
public PoliteWebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager,
    IDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, pageRequester, hyperLinkParser, memoryManager)
{
    IDomainRateLimiter rateLimiter = domainRateLimiter;
    if (rateLimiter == null)
    {
        rateLimiter = new DomainRateLimiter(
            _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);
    }
    _domainRateLimiter = rateLimiter;

    IRobotsDotTextFinder robotsFinder = robotsDotTextFinder;
    if (robotsFinder == null)
    {
        // The fallback finder gets its own requester rather than sharing pageRequester.
        robotsFinder = new RobotsDotTextFinder(
            new PageRequester(_crawlContext.CrawlConfiguration));
    }
    _robotsDotTextFinder = robotsFinder;
}
/// <summary>
/// Creates a polite crawler. The configuration, decision maker, thread manager,
/// scheduler, HTTP requester, hyperlink parser and memory manager are forwarded to
/// the base constructor; the rate limiter and robots.txt finder are stored here,
/// with defaults created when they are not supplied.
/// </summary>
/// <param name="crawlConfiguration">Forwarded to the base constructor.</param>
/// <param name="crawlDecisionMaker">Forwarded to the base constructor.</param>
/// <param name="threadManager">Forwarded to the base constructor.</param>
/// <param name="scheduler">Forwarded to the base constructor.</param>
/// <param name="httpRequester">Forwarded to the base constructor.</param>
/// <param name="hyperLinkParser">Forwarded to the base constructor.</param>
/// <param name="memoryManager">Forwarded to the base constructor.</param>
/// <param name="domainRateLimiter">
/// Rate limiter to use; when null, a <c>DomainRateLimiter</c> is created from the
/// crawl context's <c>MinCrawlDelayPerDomainMilliSeconds</c>.
/// </param>
/// <param name="robotsDotTextFinder">
/// Robots.txt finder to use; when null, a <c>RobotsDotTextFinder</c> is created over a
/// new <c>PageRequester</c> built from the crawl context's configuration.
/// </param>
public PoliteWebCrawler(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester httpRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager,
    IDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, httpRequester, hyperLinkParser, memoryManager)
{
    // Fill in the default rate limiter first, then store whichever instance we ended up with.
    if (domainRateLimiter == null)
    {
        domainRateLimiter = new DomainRateLimiter(
            _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);
    }
    _domainRateLimiter = domainRateLimiter;

    // Same pattern for the robots.txt finder; the fallback uses its own requester
    // rather than sharing httpRequester.
    if (robotsDotTextFinder == null)
    {
        robotsDotTextFinder = new RobotsDotTextFinder(
            new PageRequester(_crawlContext.CrawlConfiguration));
    }
    _robotsDotTextFinder = robotsDotTextFinder;
}
/// <summary>
/// Creates a crawler instance with custom settings or implementations. Every
/// argument is optional and defaults to null.
/// </summary>
/// <param name="crawlConfiguration">Configurable crawl values; passed through <c>SetConfig</c> before reaching the base constructor.</param>
/// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links.</param>
/// <param name="threadManager">Distributes http requests over multiple threads.</param>
/// <param name="scheduler">Decides what link should be crawled next.</param>
/// <param name="pageRequester">Makes the raw http requests; when null, a <c>BrowserPageRequester</c> built from <paramref name="crawlConfiguration"/> is used.</param>
/// <param name="hyperLinkParser">Parses a crawled page for its hyperlinks.</param>
/// <param name="memoryManager">Checks the memory usage of the host process.</param>
/// <param name="domainRateLimiter">Forwarded unchanged to the base constructor.</param>
/// <param name="robotsDotTextFinder">Forwarded unchanged to the base constructor.</param>
/// <param name="sitemapLoader">Stored in <c>SitemapLoader</c>; when null, a new <c>RobotsSitemapLoader</c> is used.</param>
public GoogleWebCrawler(
    CrawlConfiguration crawlConfiguration = null,
    ICrawlDecisionMaker crawlDecisionMaker = null,
    IThreadManager threadManager = null,
    IScheduler scheduler = null,
    IPageRequester pageRequester = null,
    IHyperLinkParser hyperLinkParser = null,
    IMemoryManager memoryManager = null,
    IDomainRateLimiter domainRateLimiter = null,
    IRobotsDotTextFinder robotsDotTextFinder = null,
    IRobotsSitemapLoader sitemapLoader = null)
    : base(
        SetConfig(crawlConfiguration),
        crawlDecisionMaker,
        threadManager,
        scheduler,
        // NOTE(review): the fallback requester receives the raw crawlConfiguration
        // (possibly null), not the value returned by SetConfig - confirm this is intended.
        pageRequester ?? new BrowserPageRequester(crawlConfiguration),
        hyperLinkParser,
        memoryManager,
        domainRateLimiter,
        robotsDotTextFinder)
{
    if (sitemapLoader != null)
    {
        SitemapLoader = sitemapLoader;
    }
    else
    {
        SitemapLoader = new RobotsSitemapLoader();
    }
}
/// <summary>
/// Creates a multi-proxy polite crawler. The configuration, decision maker, thread
/// manager, scheduler, page requester, hyperlink parser and memory manager are
/// forwarded to the base constructor; the page requester, rate limiter and
/// robots.txt finder are then adjusted as described on the parameters.
/// </summary>
/// <param name="crawlConfiguration">Forwarded to the base constructor and used to build the fallback page requester.</param>
/// <param name="crawlDecisionMaker">Forwarded to the base constructor.</param>
/// <param name="threadManager">Forwarded to the base constructor.</param>
/// <param name="scheduler">Forwarded to the base constructor.</param>
/// <param name="pageRequester">
/// Forwarded to the base constructor. If it is not a <c>MultiProxyPageRequester</c>
/// (including when it is null), <c>_pageRequester</c> is replaced afterwards with a
/// new <c>MultiProxyPageRequester</c> built from <paramref name="crawlConfiguration"/>.
/// </param>
/// <param name="hyperLinkParser">Forwarded to the base constructor.</param>
/// <param name="memoryManager">Forwarded to the base constructor.</param>
/// <param name="domainRateLimiter">
/// Rate limiter to use; when null, a <c>MultiProxyDomainRateLimiter</c> is created from
/// the crawl context's <c>MinCrawlDelayPerDomainMilliSeconds</c>.
/// </param>
/// <param name="robotsDotTextFinder">
/// Robots.txt finder to use; when null, a <c>RobotsDotTextFinder</c> is created over
/// <c>_pageRequester</c>.
/// </param>
public MultiProxyPoliteWebCrawler(
    MultiProxyCrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager,
    IMultiProxyDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(crawlConfiguration, crawlDecisionMaker, threadManager, scheduler, pageRequester, hyperLinkParser, memoryManager)
{
    // Anything that is not a multi-proxy requester is swapped out after base construction.
    if (!(pageRequester is MultiProxyPageRequester))
    {
        _pageRequester = new MultiProxyPageRequester(crawlConfiguration);
    }

    if (domainRateLimiter != null)
    {
        _domainRateLimiter = domainRateLimiter;
    }
    else
    {
        _domainRateLimiter = new MultiProxyDomainRateLimiter(
            _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds);
    }

    // Must run after the requester check above so the fallback finder sees the final _pageRequester.
    if (robotsDotTextFinder != null)
    {
        _robotsDotTextFinder = robotsDotTextFinder;
    }
    else
    {
        _robotsDotTextFinder = new RobotsDotTextFinder(_pageRequester);
    }
}
/// <summary>
/// Creates an <c>ImageAltTextChecker</c>. All arguments are forwarded unchanged, in
/// the same order, to the base constructor; no additional initialisation happens here.
/// </summary>
/// <param name="crawlConfiguration">Forwarded to the base constructor.</param>
/// <param name="crawlDecisionMaker">Forwarded to the base constructor.</param>
/// <param name="threadManager">Forwarded to the base constructor.</param>
/// <param name="scheduler">Forwarded to the base constructor.</param>
/// <param name="pageRequester">Forwarded to the base constructor.</param>
/// <param name="hyperLinkParser">Forwarded to the base constructor.</param>
/// <param name="memoryManager">Forwarded to the base constructor.</param>
/// <param name="domainRateLimiter">Forwarded to the base constructor.</param>
/// <param name="robotsDotTextFinder">Forwarded to the base constructor.</param>
public ImageAltTextChecker(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager,
    IDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(
        crawlConfiguration,
        crawlDecisionMaker,
        threadManager,
        scheduler,
        pageRequester,
        hyperLinkParser,
        memoryManager,
        domainRateLimiter,
        robotsDotTextFinder)
{
}
/// <summary>
/// Creates a <c>WebChecker</c>. All arguments are forwarded unchanged, in the same
/// order, to the base constructor; no additional initialisation happens here.
/// </summary>
/// <param name="crawlConfiguration">Forwarded to the base constructor.</param>
/// <param name="crawlDecisionMaker">Forwarded to the base constructor.</param>
/// <param name="threadManager">Forwarded to the base constructor.</param>
/// <param name="scheduler">Forwarded to the base constructor.</param>
/// <param name="pageRequester">Forwarded to the base constructor.</param>
/// <param name="hyperLinkParser">Forwarded to the base constructor.</param>
/// <param name="memoryManager">Forwarded to the base constructor.</param>
/// <param name="domainRateLimiter">Forwarded to the base constructor.</param>
/// <param name="robotsDotTextFinder">Forwarded to the base constructor.</param>
public WebChecker(
    CrawlConfiguration crawlConfiguration,
    ICrawlDecisionMaker crawlDecisionMaker,
    IThreadManager threadManager,
    IScheduler scheduler,
    IPageRequester pageRequester,
    IHyperLinkParser hyperLinkParser,
    IMemoryManager memoryManager,
    IDomainRateLimiter domainRateLimiter,
    IRobotsDotTextFinder robotsDotTextFinder)
    : base(
        crawlConfiguration,
        crawlDecisionMaker,
        threadManager,
        scheduler,
        pageRequester,
        hyperLinkParser,
        memoryManager,
        domainRateLimiter,
        robotsDotTextFinder)
{
}