public MangaTests() { var memoryCache = new MemoryCache(new MemoryCacheOptions()); _source = new UnionMangasSource(); _crawler = new UnionMangasCrawler(_source, memoryCache); }
public Brawa(IClock clock, IWebCrawler webCrawler) : base( clock, webCrawler, new Uri("https://www.brawa.de")) { }
public Hornby(IClock clock, IWebCrawler webCrawler) : base( clock, webCrawler, new Uri("https://www.hornby.com")) { }
public void Dispose() { var disposable = _provider as IDisposable; if (disposable != null) disposable.Dispose(); _provider = null; disposable = _repo as IDisposable; if (disposable != null) disposable.Dispose(); _repo = null; if (_crawler != null) { if (IsAsync) { _crawler.PageCrawlStartingAsync -= crawler_ProcessPageCrawlStarting; _crawler.PageCrawlCompletedAsync -= crawler_ProcessPageCrawlCompleted; _crawler.PageCrawlDisallowedAsync -= crawler_PageCrawlDisallowed; _crawler.PageLinksCrawlDisallowedAsync -= crawler_PageLinksCrawlDisallowed; } else { _crawler.PageCrawlStarting -= crawler_ProcessPageCrawlStarting; _crawler.PageCrawlCompleted -= crawler_ProcessPageCrawlCompleted; _crawler.PageCrawlDisallowed -= crawler_PageCrawlDisallowed; _crawler.PageLinksCrawlDisallowed -= crawler_PageLinksCrawlDisallowed; } _crawler = null; } }
private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
{
    IWebCrawler crawler = GetDefaultWebCrawler();

    //Register a lambda expression that will make Abot not crawl any url that has the word "ghost" in it.
    //For example http://a.com/ghost would not get crawled if the link were found during the crawl.
    //If you set the log4net log level to "DEBUG" you will see a log message when any page is not allowed to be crawled.
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPage method is run.
    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        if (pageToCrawl.Uri.AbsoluteUri.Contains("ghost"))
            return new CrawlDecision { Allow = false, Reason = "Scared of ghosts" };

        return new CrawlDecision { Allow = true };
    });

    //Register a lambda expression that will tell Abot to not download the page content for any page after the 5th.
    //Abot will still make the http request but will not read the raw content from the stream.
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldDownloadPageContent method is run.
    crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
    {
        if (crawlContext.CrawledCount >= 5)
            return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" };

        return new CrawlDecision { Allow = true };
    });

    //Register a lambda expression that will tell Abot to not crawl links on any page that is not internal to the root uri.
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run.
    crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
    {
        if (!crawledPage.IsInternal)
            return new CrawlDecision { Allow = false, Reason = "We don't crawl links of external pages" };

        return new CrawlDecision { Allow = true };
    });

    return crawler;
}
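A minimal usage sketch for the snippet above, not part of the original sample: it assumes the GetDefaultWebCrawler() factory defined elsewhere in the same demo and the synchronous Crawl(Uri) API used by the other Abot examples in this listing; the target URL is a placeholder.

//Usage sketch (illustrative only): run the lambda-configured crawler against a
//placeholder site and print each completed page. The URL below is hypothetical.
private static void RunLambdaConfiguredCrawl()
{
    IWebCrawler crawler = GetCustomBehaviorUsingLambdaWebCrawler();

    //Subscribe before crawling so completed pages can be inspected.
    crawler.PageCrawlCompleted += (sender, e) =>
        Console.WriteLine("Completed: " + e.CrawledPage.Uri.AbsoluteUri);

    CrawlResult result = crawler.Crawl(new Uri("http://example.com")); //placeholder uri
    Console.WriteLine(result.ErrorOccurred
        ? "Crawl completed with a fatal error: " + result.ErrorException.Message
        : "Crawl completed without a fatal error.");
}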
public CategoriesParser(ILog logger, IWebCrawler crawler, ICategoryRepository categories, IUnitOfWork uow) { this.logger = logger; this.crawler = crawler; this.categories = categories; this.uow = uow; }
public async Task GetResult(IWebCrawler crawler, IList <string> results) { var result = await crawler.Get(fixture.Create <string>()); output.WriteLine(result.Body); results.Add(result.Body); }
public ExtractorService(IServiceProvider serviceProvider, IHttpContextAccessor contextAccessor) : base(contextAccessor) { _webCrawler = serviceProvider.GetRequiredService <IWebCrawler>(); _imageService = serviceProvider.GetRequiredService <IImageService>(); Configure(); }
public CrawlResult Crawl() { IWebCrawler crawler = InitCrawler(); Uri uriToCrawl = new Uri("http://rabota.ua/jobsearch/vacancy_list"); //http://rabota.ua/jobsearch/vacancy_list?pg=1000 crawler.ShouldCrawlPage((pageToCrawl, crawlContext) => { if (pageToCrawl.Uri.AbsoluteUri.Contains(@"rabota.ua/jobsearch/vacancy_list") && !pageToCrawl.Uri.AbsoluteUri.Contains(@"period")) { return new CrawlDecision { Allow = true } } ; return(new CrawlDecision { Allow = false, Reason = "Parse only job pages" }); }); crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting; crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted; crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed; crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed; CrawlResult result = crawler.Crawl(uriToCrawl); return(result); }
public ModellbahnshopLippe(IWebCrawler webCrawler, Uri startPage) : base(SystemClock.Instance, webCrawler, new Uri(@"https://www.modellbahnshop-lippe.com"), startPage) { }
public DataProcessor(IConsoleManager consoleManager, IStringHelper stringHelper, IWebCrawler webCrawler, IExcelLogger excelLogger, ILogger logger) { _consoleManager = consoleManager; _stringHelper = stringHelper; _webCrawler = webCrawler; _excelLogger = excelLogger; _logger = logger; }
private IWebCrawler CreateCrawlerInstance() { IWebCrawler crawler = _crawlerFactory.CreateInstance(); crawler.PageCrawlCompleted += (s, e) => ProcessCrawledPage(e.CrawlContext, e.CrawledPage); return(crawler); }
public MfcCrawler(IocContainer ioc, IUserRepository userRepository, IPrinter printer, ICommunityDecider communityDecider, int waitTime = 200) { _userRepository = userRepository; _printer = printer; _communityDecider = communityDecider; crawler = ioc.SteamCrawler(1000, waitTime); }
protected BaseWrapper(IClock clock, IWebCrawler webCrawler, Uri baseUri) { Parser = new HtmlParser(); BaseUri = baseUri; _clock = clock; _webCrawler = webCrawler ?? throw new ArgumentNullException(nameof(webCrawler)); }
public void SetUp() { var resourceLoader = new ResourceLoader(); _rootFolder = ConfigurationManager.AppSettings["PathToTestFolder"]; var fileSaver = new FileSaver(); var pageLocator = new ResourceLocationManager(_rootFolder); _webCrawler = new WebCrawlerService.WebCrawler(); }
private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
{
    IWebCrawler crawler = GetManuallyConfiguredWebCrawler(siteToCrawl);

    //Register a lambda expression that will make Abot not crawl any url that has the word "ghost" in it.
    //For example http://a.com/ghost would not get crawled if the link were found during the crawl.
    //If you set the log4net log level to "DEBUG" you will see a log message when any page is not allowed to be crawled.
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPage method is run.
    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        //if (!pageToCrawl.Uri.AbsoluteUri.Contains("chicken") && !pageToCrawl.Uri.AbsoluteUri.Contains("Chicken"))
        if (!pageToCrawl.Uri.AbsoluteUri.Contains(category.Replace(" ", "+")) ||
            /*pageToCrawl.Uri.AbsoluteUri.Contains("navid") ||*/
            pageToCrawl.Uri.AbsoluteUri.Contains("_KG") ||
            pageToCrawl.Uri.AbsoluteUri.Contains("_EA"))
            return new CrawlDecision { Allow = false, Reason = "I only crawl the right pages" };

        return new CrawlDecision { Allow = true };
    });

    //Register a lambda expression that will tell Abot to not download the page content for any page after the 5th.
    //Abot will still make the http request but will not read the raw content from the stream.
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldDownloadPageContent method is run.
    /*crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
    {
        if (crawlContext.CrawledCount >= 5)
            return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" };

        return new CrawlDecision { Allow = true };
    });*/

    //Register a lambda expression that will tell Abot to not crawl links on any page that is not internal to the root uri.
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run.
    crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
    {
        CrawlDecision decision = new CrawlDecision { Allow = true };
        if (crawledPage.Content.Bytes.Length < 100)
            return new CrawlDecision { Allow = false, Reason = "Just crawl links in pages that have at least 100 bytes" };

        return decision;
    });

    return crawler;
}
public WebTester(CancellationTokenSource tokenSource, TestOptions options) { Context = new WebTestContext { TestOptions = options ?? new TestOptions(), CancellationTokenSource = tokenSource ?? new CancellationTokenSource() }; BlockInit(); _crawler = new CrawlerLight(); _crawler.PageCrawlCompletedAsync += _crawler_PageCrawlCompleted; }
public Crawler(string argUrl) { TheWebCrawler = GetManuallyConfiguredWebCrawler(); TheWebCrawler.PageCrawlCompleted += PageCrawlCompletedEvent; TheWebCrawler.PageCrawlDisallowed += PageCrawlDisallowedEvent; TheWebCrawler.PageCrawlStarting += PageCrawlStartingEvent; TheWebCrawler.PageLinksCrawlDisallowed += PageLinksCrawlDisallowedEvent; var crawlResult = TheWebCrawler.Crawl(new Uri(argUrl)); }
private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler() { IWebCrawler crawler = GetDefaultWebCrawler(); crawler.ShouldCrawlPage((pageToCrawl, crawlContext) => { if (pageToCrawl.Uri.AbsoluteUri.Contains("p.html") || pageToCrawl.Uri.AbsoluteUri.Contains("p.html")) { return new CrawlDecision { Allow = true } } ; return(new CrawlDecision { Allow = false, Reason = "Incorrect subdomain" }); }); crawler.ShouldDownloadPageContent((crawledPage, crawlContext) => { if (crawlContext.CrawledCount >= 5) { return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" } } ; return(new CrawlDecision { Allow = true }); }); crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) => { if (!crawledPage.IsInternal) { return new CrawlDecision { Allow = false, Reason = "We dont crawl links of external pages" } } ; return(new CrawlDecision { Allow = true }); }); return(crawler); }
/// <summary> /// Pobieranie kontentów strony komentarzy dla danego produkru z serwisów ceneo.pl i skapice.pl /// </summary> public HttpCommentGeter(string productId, IStatisctics statistic) { product = new Product(); string pageName = "http://www.ceneo.pl/" + productId + "#tab=reviews"; fillProductPropertis(product, pageName); m_webCrawlerCeneo = new CeneoWebCrawler(pageName); m_webCrawlerCeneo.getPagesContent( statistic, product); ILinkToProductFinder productFinder = new SkapiecLinkToProductFinder(); string foundProduct = productFinder.getLinkToProduct(product); if (foundProduct != null) { m_webCrawlerSkapiec = new SkapiecWebCrawler("http://www.skapiec.pl" + productFinder.getLinkToProduct(product) + "#opinie"); m_webCrawlerSkapiec.getPagesContent(statistic, product); } }
public void CrawlAndAssert(IWebCrawler crawler)
{
    crawler.PageCrawlCompletedAsync += crawler_PageCrawlCompleted;

    CrawlResult result = crawler.Crawl(_rootUri);

    Assert.IsNull(result.ErrorException);
    Assert.IsFalse(result.ErrorOccurred);
    Assert.AreSame(_rootUri, result.RootUri);

    List<Discrepancy> descrepancies = GetDescrepancies();
    PrintDescrepancies(descrepancies);

    Assert.AreEqual(0, descrepancies.Count, "There were discrepancies between expected and actual crawl results. See output window for details.");
    Assert.IsTrue(result.Elapsed.TotalSeconds < _maxSecondsToCrawl,
        string.Format("Elapsed time to crawl {0}, over {1} second threshold", result.Elapsed.TotalSeconds, _maxSecondsToCrawl));
}
public DomainCrawlResult Consume(Domain domain, CancellationTokenSource cancellationToken) { if (domain == null) { throw new ArgumentNullException("domain"); } if (cancellationToken == null) { throw new ArgumentNullException("cancellationToken"); } IEnumerable <ICrawlProcessor> processors = _processorProvider.GetProcessors().ToList();//have to .ToList() since the deferred execution will cause a new instance of each processor to be created with every page IWebCrawler crawler = CreateCrawlerInstance(); DomainCrawlResult domainCrawlResult = new DomainCrawlResult(); domainCrawlResult.Domain = domain; try { crawler.CrawlBag.GoDaddyProcessorContext = new ProcessorContext { Domain = domain, PrimaryPersistenceProvider = _processorContext.PrimaryPersistenceProvider, BackupPersistenceProvider = _processorContext.BackupPersistenceProvider, CrawlProcessors = processors }; domainCrawlResult.CrawlResult = crawler.Crawl(domain.Uri, cancellationToken); ProcessCrawledDomain(domainCrawlResult.CrawlResult.CrawlContext); } catch (Exception ex) { string errorMessage = string.Format("Exception occurred while crawling [{0}], error: [{1}]", domain.Uri.AbsoluteUri, ex.Message); domainCrawlResult.CrawlResult = new CrawlResult { ErrorException = ex }; _logger.ErrorFormat(errorMessage, ex); //TODO Statsg fatal error occurred during crawl StatsGLoggerAppender.LogItem(StatLogType.CrawlDaddy_FatalErrorOccured, _config); } LogCrawlResult(domainCrawlResult.CrawlResult); return(domainCrawlResult); }
public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config) { _config = config; //check if a crawl is already defined var existingRun = _repo.GetCrawl(sessionId, crawlerId); if (existingRun != null) { var mssg = string.Format("CrawlerRun exists with sessionId: {0} and crawlerId: {1}; cancelling run ...", sessionId, crawlerId); _logger.Error(mssg); return(false); } Seed = new Uri(seedUrl); CrawlerDefinition = new CrawlerRun() { SessionId = sessionId, SeedUrl = Seed.AbsoluteUri, CrawlerId = crawlerId, BaseDomain = Seed.GetBaseDomain() }; _repo.AddCrawl(CrawlerDefinition); _scheduler = new MyScheduler(new LogicProvider(), CrawlerDefinition, _repo); _crawler = new PoliteWebCrawler(_config, null, null, _scheduler, null, null, null, null, null); _crawler.CrawlBag.SessionId = CrawlerDefinition.SessionId; _crawler.CrawlBag.CrawlerId = CrawlerDefinition.CrawlerId; _crawler.ShouldScheduleLink(ShouldScheduleLink); _crawler.ShouldCrawlPage(ShouldCrawlPage); if (IsAsync) { _crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting; _crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted; _crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed; _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed; } else { _crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting; _crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted; _crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed; _crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed; } return(true); }
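The InitializeCrawler snippet above wires handlers such as crawler_ProcessPageCrawlCompleted to Abot's crawl events but does not show their bodies. The sketch below is an assumed, illustrative handler only: the real implementation in that project is not shown, and the logging calls through _logger are placeholders.

//Illustrative handler sketch (not from the original project): log whether each
//completed page was fetched successfully. PageCrawlCompletedArgs and
//CrawledPage.WebException are standard Abot types; the _logger calls are assumed.
private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    if (crawledPage.WebException != null)
        _logger.Error(string.Format("Crawl of page failed: {0}", crawledPage.Uri.AbsoluteUri));
    else
        _logger.Info(string.Format("Crawl of page succeeded: {0}", crawledPage.Uri.AbsoluteUri));
}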
public async Task CrawlAndAssert(IWebCrawler crawler)
{
    crawler.PageCrawlCompleted += crawler_PageCrawlCompleted;

    CrawlResult result = await crawler.CrawlAsync(_rootUri);

    Assert.IsNull(result.ErrorException);
    Assert.IsFalse(result.ErrorOccurred);
    Assert.AreSame(_rootUri, result.RootUri);

    List<Discrepancy> descrepancies = GetDescrepancies();
    PrintDescrepancies(descrepancies);

    Assert.AreEqual(0, descrepancies.Count, "There were discrepancies between expected and actual crawl results. See output window for details.");
    Assert.IsTrue(result.Elapsed.TotalSeconds < _maxSecondsToCrawl,
        string.Format("Elapsed time to crawl {0}, over {1} second threshold", result.Elapsed.TotalSeconds, _maxSecondsToCrawl));
}
public CrawlerViewModel Crawl(CrawlerViewModel viewModel)
{
    if (!Helper.IsValidUrl(viewModel.UrlToCrawl))
    {
        viewModel.ErrorMsg = "Please enter a valid URL";
        return viewModel;
    }

    allLinksOnPage = new List<Uri>();
    CrawlConfiguration config = new CrawlerNetConfig().Initalize();
    this.crawler = new PoliteWebCrawler(config);
    crawler.PageCrawlCompleted += crawler_PageCrawlCompleted;

    CrawlResult result = crawler.Crawl(new Uri(viewModel.UrlToCrawl));
    if (result.ErrorOccurred)
    {
        viewModel.ErrorMsg = String.Format("Crawler completed with error: {0}", result.ErrorException.Message);
    }

    var isProd = Convert.ToBoolean(ConfigurationManager.AppSettings["IsProd"].ToString());
    if (isProd)
    {
        viewModel.CrawledLinks.AddRange(allLinksOnPage);
    }
    else
    {
        viewModel.CrawledLinks.AddRange(allLinksOnPage.Take(10));
    }

    viewModel.SuccessMsg = "Successfully listed!";
    return viewModel;
}
private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler() { IWebCrawler crawler = GetDefaultWebCrawler(); crawler.ShouldCrawlPage((pageToCrawl, crawlContext) => { if (pageToCrawl.Uri.AbsoluteUri.Contains("ru/news/")) { return new CrawlDecision { Allow = true } } ; return(new CrawlDecision { Allow = false }); }); crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) => { if (!crawledPage.IsInternal) { return new CrawlDecision { Allow = false, Reason = "We dont crawl links of external pages" } } ; return(new CrawlDecision { Allow = true }); }); return(crawler); }
public CrawlResult Crawl()
{
    IWebCrawler crawler = InitCrawler();

    //http://hh.ua/search/vacancy?no_magic=true&items_on_page=100&currency_code=UAH&clusters=true&page=0 //-- ALL
    // --- UA
    Uri uriToCrawl = new Uri("http://hh.ua/search/vacancy?no_magic=true&items_on_page=100&clusters=true&currency_code=UAH&area=5&page=0");

    //var urlPattern = @"^http://hh\.ua/search/vacancy\?no_magic=true&items_on_page=100&currency_code=UAH&clusters=true&page=[0-9]+$"; // -- ALL
    var urlPattern = @"^http://hh\.ua/search/vacancy\?no_magic=true&items_on_page=100&clusters=true&currency_code=UAH&area=5&page=[0-9]+$"; // -- UA

    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        if (Regex.IsMatch(pageToCrawl.Uri.ToString(), urlPattern, RegexOptions.IgnoreCase))
            return new CrawlDecision { Allow = true };

        return new CrawlDecision { Allow = false, Reason = "Parse only job pages" };
    });

    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    CrawlResult result = crawler.Crawl(uriToCrawl);
    return result;
}
private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
{
    IWebCrawler crawler = GetManuallyConfiguredWebCrawler();

    crawler.ShouldDownloadPageContent((crawledPage, crawlContext) => new CrawlDecision { Allow = true });

    crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
    {
        if (!crawledPage.IsInternal)
            return new CrawlDecision { Allow = false, Reason = "We don't crawl links of external pages" };

        return new CrawlDecision { Allow = true };
    });

    return crawler;
}
public void Dispose() { var disposable = _provider as IDisposable; if (disposable != null) { disposable.Dispose(); } _provider = null; disposable = _repo as IDisposable; if (disposable != null) { disposable.Dispose(); } _repo = null; if (_crawler != null) { if (IsAsync) { _crawler.PageCrawlStartingAsync -= crawler_ProcessPageCrawlStarting; _crawler.PageCrawlCompletedAsync -= crawler_ProcessPageCrawlCompleted; _crawler.PageCrawlDisallowedAsync -= crawler_PageCrawlDisallowed; _crawler.PageLinksCrawlDisallowedAsync -= crawler_PageLinksCrawlDisallowed; } else { _crawler.PageCrawlStarting -= crawler_ProcessPageCrawlStarting; _crawler.PageCrawlCompleted -= crawler_ProcessPageCrawlCompleted; _crawler.PageCrawlDisallowed -= crawler_PageCrawlDisallowed; _crawler.PageLinksCrawlDisallowed -= crawler_PageLinksCrawlDisallowed; } _crawler = null; } }
public GroundForcesScraper(IWebCrawler webCrawler, IConsoleManager consoleManager) { _webCrawler = webCrawler; _consoleManager = consoleManager; }
private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler() { IWebCrawler crawler = GetDefaultWebCrawler(); //Register a lambda expression that will make Abot not crawl any url that has the word "ghost" in it. //For example http://a.com/ghost, would not get crawled if the link were found during the crawl. //If you set the log4net log level to "DEBUG" you will see a log message when any page is not allowed to be crawled. //NOTE: This is lambda is run after the regular ICrawlDecsionMaker.ShouldCrawlPage method is run. crawler.ShouldCrawlPage((pageToCrawl, crawlContext) => { CrawlDecision decision; //if (pageToCrawl.Uri == crawlContext.RootUri // || // (pageToCrawl.Uri == new Uri((string)crawlContext.CrawlBag.PriceUri)) // ) // //pageToCrawl.Uri.PathAndQuery.Contains("tid=10301")) //{ // decision = new CrawlDecision { Allow = true }; //} //else //{ // decision = new CrawlDecision { Allow = false, Reason = "不是主页或价格信息" }; //} decision = new CrawlDecision { Allow = true }; Log($"ShouldCrawlPage {decision.Allow} ", pageToCrawl.Uri.AbsoluteUri); return(decision); //if (pageToCrawl.Uri.AbsoluteUri.Contains("ghost")) // return new CrawlDecision { Allow = false, Reason = "Scared of ghosts" }; //return new CrawlDecision { Allow = true }; }); //Register a lambda expression that will tell Abot to not download the page content for any page after 5th. //Abot will still make the http request but will not read the raw content from the stream //NOTE: This lambda is run after the regular ICrawlDecsionMaker.ShouldDownloadPageContent method is run crawler.ShouldDownloadPageContent((crawledPage, crawlContext) => { CrawlDecision decision; if (crawledPage.Uri == crawlContext.RootUri || crawledPage.Uri.PathAndQuery.Contains("tid=10301")) { decision = new CrawlDecision { Allow = true }; } else { decision = new CrawlDecision { Allow = false, ShouldStopCrawl = true, Reason = "不是主页或价格信息" }; } Log($"ShouldDownloadPageContent {decision.Allow} ", crawledPage.Uri.AbsoluteUri); return(decision); //if (crawlContext.CrawledCount >= 5) // return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" }; //return new CrawlDecision { Allow = true }; }); //Register a lambda expression that will tell Abot to not crawl links on any page that is not internal to the root uri. //NOTE: This lambda is run after the regular ICrawlDecsionMaker.ShouldCrawlPageLinks method is run crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) => { Log("ShouldCrawlPageLinks", crawledPage.Uri.AbsoluteUri); //if (crawledPage.Uri == crawlContext.RootUri) //{ // return new CrawlDecision { Allow = true }; //} return(new CrawlDecision { Allow = false, Reason = "需要手工爬链接" }); }); return(crawler); }
public RbotProvider(IWebCrawler Crawler) { this.Crawler = Crawler; }
public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config) { _config = config; //check if a crawl is already defined var existingRun = _repo.GetCrawl(sessionId, crawlerId); if (existingRun != null) { var mssg = string.Format("CrawlerRun exists with sessionId: {0} and crawlerId: {1}; cancelling run ...", sessionId, crawlerId); _logger.Error(mssg); return false; } Seed = new Uri(seedUrl); CrawlerDefinition = new CrawlerRun() { SessionId = sessionId, SeedUrl = Seed.AbsoluteUri, CrawlerId = crawlerId, BaseDomain = Seed.GetBaseDomain() }; _repo.AddCrawl(CrawlerDefinition); _scheduler = new MyScheduler(new LogicProvider(), CrawlerDefinition, _repo); _crawler = new PoliteWebCrawler(_config, null, null, _scheduler, null, null, null, null, null); _crawler.CrawlBag.SessionId = CrawlerDefinition.SessionId; _crawler.CrawlBag.CrawlerId = CrawlerDefinition.CrawlerId; _crawler.ShouldScheduleLink(ShouldScheduleLink); _crawler.ShouldCrawlPage(ShouldCrawlPage); if (IsAsync) { _crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting; _crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted; _crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed; _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed; } else { _crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting; _crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted; _crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed; _crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed; } return true; }
protected WrapperWithStartPage(IClock clock, IWebCrawler webCrawler, Uri baseUri, Uri startPage) : base(clock, webCrawler, baseUri) { StartPage = startPage; }
private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler() { IWebCrawler crawler = GetDefaultWebCrawler(); //Register a lambda expression that will make Abot not crawl any url that has the word "ghost" in it. //For example http://a.com/ghost, would not get crawled if the link were found during the crawl. //If you set the log4net log level to "DEBUG" you will see a log message when any page is not allowed to be crawled. //NOTE: This is lambda is run after the regular ICrawlDecsionMaker.ShouldCrawlPage method is run. crawler.ShouldCrawlPageDecisionMaker = (pageToCrawl, crawlContext) => { if (pageToCrawl.Uri.AbsoluteUri.Contains("ghost")) { return new CrawlDecision { Allow = false, Reason = "Scared of ghosts" } } ; return(new CrawlDecision { Allow = true }); }; //Register a lambda expression that will tell Abot to not download the page content for any page after 5th. //Abot will still make the http request but will not read the raw content from the stream //NOTE: This lambda is run after the regular ICrawlDecsionMaker.ShouldDownloadPageContent method is run crawler.ShouldDownloadPageContentDecisionMaker = (crawledPage, crawlContext) => { if (crawlContext.CrawledCount >= 5) { return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" } } ; return(new CrawlDecision { Allow = true }); }; //Register a lambda expression that will tell Abot to not crawl links on any page that is not internal to the root uri. //NOTE: This lambda is run after the regular ICrawlDecsionMaker.ShouldCrawlPageLinks method is run crawler.ShouldCrawlPageLinksDecisionMaker = (crawledPage, crawlContext) => { if (!crawledPage.IsInternal) { return new CrawlDecision { Allow = false, Reason = "We don't crawl links of external pages" } } ; return(new CrawlDecision { Allow = true }); }; return(crawler); } private static Uri GetSiteToCrawl(string[] args) { string userInputUrl = string.Empty; if (args.Length < 1) { System.Console.WriteLine("Please, enter ABSOLUTE url to crawl (for ex.: https://github.com ):"); userInputUrl = System.Console.ReadLine(); } else { userInputUrl = args[0]; } var isAbsoluteUri = Uri.TryCreate(userInputUrl, UriKind.Absolute, out Uri result); if (string.IsNullOrWhiteSpace(userInputUrl) || !isAbsoluteUri) { throw new ApplicationException("Requare absolute url, without white spaces and not empty"); } return(result); }
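A short sketch, not from the original sample, of how the two helpers above might be combined. It assumes the async CrawlAsync API that the async CrawlAndAssert example elsewhere in this listing uses; the method name RunCrawlAsync is hypothetical.

//Usage sketch (illustrative only): resolve the target site from command-line args,
//build the lambda-configured crawler, and run an async crawl. CrawlAsync is assumed
//to be available on this IWebCrawler, as in the other async examples in this listing.
private static async Task RunCrawlAsync(string[] args)
{
    Uri siteToCrawl = GetSiteToCrawl(args);
    IWebCrawler crawler = GetCustomBehaviorUsingLambdaWebCrawler();

    CrawlResult result = await crawler.CrawlAsync(siteToCrawl);

    System.Console.WriteLine(result.ErrorOccurred
        ? string.Format("Crawl of {0} completed with fatal error: {1}", result.RootUri, result.ErrorException.Message)
        : string.Format("Crawl of {0} completed without error.", result.RootUri));
}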
public AdminController(IWebCrawler webCrawler, ApplicationDbContext db, ITime time) { _webCrawler = webCrawler; _db = db; _time = time; }
public RepositoryUnitTest(RawDataFixture rawDataFixture) { _rawDataFixture = rawDataFixture; _globalWebCrawler = rawDataFixture.CreateGlobalWebCrawler(10); _detailWebCrawler = rawDataFixture.CreateDetailWebCrawler(10); }