public CrawlResult Crawl()
{
    IWebCrawler crawler = InitCrawler();

    // Seed page; paging variant: http://rabota.ua/jobsearch/vacancy_list?pg=1000
    Uri uriToCrawl = new Uri("http://rabota.ua/jobsearch/vacancy_list");

    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        // Only follow vacancy-list pages, skipping the "period" filter variants.
        if (pageToCrawl.Uri.AbsoluteUri.Contains(@"rabota.ua/jobsearch/vacancy_list") &&
            !pageToCrawl.Uri.AbsoluteUri.Contains(@"period"))
        {
            return new CrawlDecision { Allow = true };
        }

        return new CrawlDecision { Allow = false, Reason = "Parse only job pages" };
    });

    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    CrawlResult result = crawler.Crawl(uriToCrawl);
    return result;
}
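// InitCrawler() is defined elsewhere and not shown. The sketch below is one plausible
// implementation using Abot's PoliteWebCrawler; the CrawlConfiguration values are
// illustrative assumptions, not the original settings.
private IWebCrawler InitCrawler()
{
    var config = new CrawlConfiguration
    {
        MaxPagesToCrawl = 1000,                     // assumed page budget
        MaxCrawlDepth = 5,                          // assumed depth limit
        MinCrawlDelayPerDomainMilliSeconds = 1000,  // throttle requests to the same domain
        IsExternalPageCrawlingEnabled = false       // stay on the seed site
    };

    return new PoliteWebCrawler(config);
}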
public void Crawl(int maxDepth, Uri link)
{
    var depth = 0;
    var lastWeight = 0.5;

    foreach (var vertex in crawler.Crawl(link))
    {
        _printer.Print(vertex);

        // Decide whether this vertex starts a new community, based on the previous and current weights.
        var shouldCreate = _communityDecider.ShouldCreateCommunity(lastWeight, vertex.Weight);
        lastWeight = vertex.Weight;

        var newUser = new User
        {
            UserId = vertex.Id,
            Weight = vertex.Weight,
            Community = shouldCreate
        };
        _userRepository.Add(newUser);

        depth++;
        if (depth >= maxDepth)
        {
            break;
        }

        AddChildren(newUser, vertex.Degrees);
    }
}
async Task<CrawlResult> CrawlAndTest(Uri uri)
{
    var res = await _crawler.Crawl(uri, Context.CancellationTokenSource).ConfigureAwait(false);

    // No more work will be queued; wait for the notification pipeline to drain.
    _executionQueueBlock.Complete();
    await _notificationBlock.Completion.ConfigureAwait(false);

    return res;
}
public Crawler(string argUrl)
{
    TheWebCrawler = GetManuallyConfiguredWebCrawler();

    TheWebCrawler.PageCrawlCompleted += PageCrawlCompletedEvent;
    TheWebCrawler.PageCrawlDisallowed += PageCrawlDisallowedEvent;
    TheWebCrawler.PageCrawlStarting += PageCrawlStartingEvent;
    TheWebCrawler.PageLinksCrawlDisallowed += PageLinksCrawlDisallowedEvent;

    // Crawl synchronously from the constructor; the result is currently unused.
    var crawlResult = TheWebCrawler.Crawl(new Uri(argUrl));
}
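// The event handlers wired up above are defined elsewhere. A minimal completed-page handler,
// assuming Abot's standard PageCrawlCompletedArgs signature, could look like this; the
// Console logging is an assumption for illustration.
private void PageCrawlCompletedEvent(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    if (crawledPage.WebException != null || crawledPage.HttpWebResponse == null)
    {
        Console.WriteLine("Crawl of page failed: {0}", crawledPage.Uri.AbsoluteUri);
    }
    else
    {
        Console.WriteLine("Crawl of page succeeded: {0}", crawledPage.Uri.AbsoluteUri);
    }
}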
public void StartCrawl()
{
    DateTime timeStamp = DateTime.Now;
    CrawlerDefinition.StartTime = timeStamp;
    CrawlerDefinition.InProgress = true;
    _repo.UpdateCrawl(CrawlerDefinition);
    OnDomainCrawlStarted(CrawlerDefinition);

    #region log start
    if (_logger.IsDebugEnabled)
    {
        _logger.DebugFormat("Starting crawl sessionId: {0} seed: {1}",
            CrawlerDefinition.SessionId, CrawlerDefinition.SeedUrl);
    }
    #endregion

    CrawlResult result = _crawler.Crawl(Seed, _cancelToken);

    #region log end
    if (_logger.IsDebugEnabled)
    {
        _logger.DebugFormat("Ended crawl elapsed: {0}", result.Elapsed);
    }
    #endregion

    CrawlerDefinition.InProgress = false;
    CrawlerDefinition.EndTime = CrawlerDefinition.StartTime.Add(result.Elapsed);
    CrawlerDefinition.ErrorOccurred = result.ErrorOccurred;
    _repo.UpdateCrawl(CrawlerDefinition);
    OnDomainCrawlEnded(CrawlerDefinition);

    if (result.ErrorOccurred)
    {
        var message = string.Format("Crawl of {0} completed with error: {1}",
            result.RootUri.AbsoluteUri, result.ErrorException.Message);
        _logger.Error(message);
    }
    else
    {
        var message = string.Format("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        _logger.Debug(message);
    }
}
public void Run(string[] args)
{
    // Read the command line arguments.
    _input.ReadArguments(args);
    var domain = _input.DomainUri;
    var filePath = _input.OutputFilePath;
    var wait = _input.WaitBeforeEnd;

    // Crawl the domain, then save and print the results.
    var node = _crawler.Crawl(domain);
    var text = _output.Generate(node);
    _output.Save(filePath, text);
    _output.Write(text, wait);
}
public void CrawlAndAssert(IWebCrawler crawler)
{
    crawler.PageCrawlCompletedAsync += crawler_PageCrawlCompleted;

    CrawlResult result = crawler.Crawl(_rootUri);

    Assert.IsNull(result.ErrorException);
    Assert.IsFalse(result.ErrorOccurred);
    Assert.AreSame(_rootUri, result.RootUri);

    List<Discrepancy> discrepancies = GetDescrepancies();
    PrintDescrepancies(discrepancies);

    Assert.AreEqual(0, discrepancies.Count,
        "There were discrepancies between expected and actual crawl results. See output window for details.");
    Assert.IsTrue(result.Elapsed.TotalSeconds < _maxSecondsToCrawl,
        string.Format("Elapsed time to crawl {0}, over {1} second threshold",
            result.Elapsed.TotalSeconds, _maxSecondsToCrawl));
}
public DomainCrawlResult Consume(Domain domain, CancellationTokenSource cancellationToken)
{
    if (domain == null)
    {
        throw new ArgumentNullException("domain");
    }
    if (cancellationToken == null)
    {
        throw new ArgumentNullException("cancellationToken");
    }

    // Have to .ToList() since deferred execution would create a new instance of each processor for every page.
    IEnumerable<ICrawlProcessor> processors = _processorProvider.GetProcessors().ToList();

    IWebCrawler crawler = CreateCrawlerInstance();
    DomainCrawlResult domainCrawlResult = new DomainCrawlResult();
    domainCrawlResult.Domain = domain;

    try
    {
        crawler.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
        {
            Domain = domain,
            PrimaryPersistenceProvider = _processorContext.PrimaryPersistenceProvider,
            BackupPersistenceProvider = _processorContext.BackupPersistenceProvider,
            CrawlProcessors = processors
        };

        domainCrawlResult.CrawlResult = crawler.Crawl(domain.Uri, cancellationToken);
        ProcessCrawledDomain(domainCrawlResult.CrawlResult.CrawlContext);
    }
    catch (Exception ex)
    {
        string errorMessage = string.Format("Exception occurred while crawling [{0}], error: [{1}]",
            domain.Uri.AbsoluteUri, ex.Message);
        domainCrawlResult.CrawlResult = new CrawlResult { ErrorException = ex };
        _logger.ErrorFormat(errorMessage, ex);

        //TODO StatsG: a fatal error occurred during the crawl
        StatsGLoggerAppender.LogItem(StatLogType.CrawlDaddy_FatalErrorOccured, _config);
    }

    LogCrawlResult(domainCrawlResult.CrawlResult);
    return domainCrawlResult;
}
public CrawlerViewModel Crawl(CrawlerViewModel viewModel)
{
    if (!Helper.IsValidUrl(viewModel.UrlToCrawl))
    {
        viewModel.ErrorMsg = "Please enter a valid URL.";
        return viewModel;
    }

    allLinksOnPage = new List<Uri>();

    CrawlConfiguration config = new CrawlerNetConfig().Initalize();
    this.crawler = new PoliteWebCrawler(config);
    crawler.PageCrawlCompleted += crawler_PageCrawlCompleted;

    CrawlResult result = crawler.Crawl(new Uri(viewModel.UrlToCrawl));
    if (result.ErrorOccurred)
    {
        viewModel.ErrorMsg = String.Format("Crawler completed with error: {0}", result.ErrorException.Message);
    }

    // In production return every crawled link; otherwise cap the list for faster testing.
    var isProd = Convert.ToBoolean(ConfigurationManager.AppSettings["IsProd"].ToString());
    if (isProd)
    {
        viewModel.CrawledLinks.AddRange(allLinksOnPage);
    }
    else
    {
        viewModel.CrawledLinks.AddRange(allLinksOnPage.Take(10));
    }

    viewModel.SuccessMsg = "Successfully listed!";
    return viewModel;
}
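// The crawler_PageCrawlCompleted handler that fills allLinksOnPage is not shown. A minimal
// sketch, assuming it simply records each successfully crawled Uri (the real handler may
// instead collect the links parsed from each page):
private void crawler_PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    if (e.CrawledPage.WebException == null && e.CrawledPage.HttpWebResponse != null)
    {
        allLinksOnPage.Add(e.CrawledPage.Uri);
    }
}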
public CrawlResult Crawl()
{
    IWebCrawler crawler = InitCrawler();

    // Seed for all regions:
    //   http://hh.ua/search/vacancy?no_magic=true&items_on_page=100&currency_code=UAH&clusters=true&page=0
    // Seed for Ukraine only (area=5):
    Uri uriToCrawl = new Uri("http://hh.ua/search/vacancy?no_magic=true&items_on_page=100&clusters=true&currency_code=UAH&area=5&page=0");

    //var urlPattern = @"^http://hh\.ua/search/vacancy\?no_magic=true&items_on_page=100&currency_code=UAH&clusters=true&page=[0-9]+$"; // all regions
    var urlPattern = @"^http://hh\.ua/search/vacancy\?no_magic=true&items_on_page=100&clusters=true&currency_code=UAH&area=5&page=[0-9]+$"; // Ukraine only

    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        if (Regex.IsMatch(pageToCrawl.Uri.ToString(), urlPattern, RegexOptions.IgnoreCase))
        {
            return new CrawlDecision { Allow = true };
        }

        return new CrawlDecision { Allow = false, Reason = "Parse only job pages" };
    });

    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    CrawlResult result = crawler.Crawl(uriToCrawl);
    return result;
}
public Task StartAsync(CancellationToken cancellationToken)
{
    _logger.LogInformation("Service starting");
    _crawler.Crawl();
    return Task.FromResult(0);
}
public DomainCrawlResult Consume(Domain domain, CancellationTokenSource cancellationToken)
{
    if (domain == null)
    {
        throw new ArgumentNullException("domain");
    }
    if (cancellationToken == null)
    {
        throw new ArgumentNullException("cancellationToken");
    }

    LogCrawlBegin(domain);

    // Have to .ToList() since deferred execution would create a new instance of each processor for every page.
    IEnumerable<ICrawlProcessor> processors = _processorProvider.GetProcessors().ToList();

    IWebCrawler crawler = CreateCrawlerInstance();
    DomainCrawlResult domainCrawlResult = new DomainCrawlResult();
    domainCrawlResult.Domain = domain;

    try
    {
        crawler.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
        {
            Domain = domain,
            PrimaryPersistenceProvider = _processorContext.PrimaryPersistenceProvider,
            BackupPersistenceProvider = _processorContext.BackupPersistenceProvider,
            CrawlProcessors = processors
        };

        // Run the parked-page processor first; if the domain is parked there is no need to crawl it.
        ICrawlProcessor parkedProc = processors.FirstOrDefault(p => p.GetType().Name == "ParkedCrawlProcessor");
        CrawlContext cc = new CrawlContext { RootUri = domain.Uri, CrawlBag = crawler.CrawlBag };
        if (parkedProc != null)
        {
            parkedProc.ProcessCrawledDomain(cc);
        }

        // If the domain is not parked, or there is no parked-page processor, crawl the site.
        if (parkedProc == null || !cc.CrawlBag.NoCrawl)
        {
            domainCrawlResult.CrawlResult = crawler.Crawl(domain.Uri, cancellationToken);
            ProcessCrawledDomain(domainCrawlResult.CrawlResult.CrawlContext);
        }
    }
    catch (Exception ex)
    {
        string errorMessage = string.Format("Exception occurred while crawling [{0}], error: [{1}]",
            domain.Uri.AbsoluteUri, ex.Message);
        domainCrawlResult.CrawlResult = new CrawlResult { ErrorException = ex };
        _logger.ErrorFormat(errorMessage, ex);
    }

    // CrawlResult can be null if the domain was never crawled (parked page or no A record).
    if (domainCrawlResult.CrawlResult != null)
    {
        LogCrawlResult(domainCrawlResult.CrawlResult);
    }

    return domainCrawlResult;
}