Example #1
        public CrawlResult Crawl()
        {
            IWebCrawler crawler = InitCrawler();

            Uri uriToCrawl = new Uri("http://rabota.ua/jobsearch/vacancy_list"); //http://rabota.ua/jobsearch/vacancy_list?pg=1000

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                if (pageToCrawl.Uri.AbsoluteUri.Contains(@"rabota.ua/jobsearch/vacancy_list") &&
                    !pageToCrawl.Uri.AbsoluteUri.Contains(@"period"))
                {
                    return new CrawlDecision { Allow = true };
                }

                return new CrawlDecision { Allow = false, Reason = "Parse only job pages" };
            });

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(uriToCrawl);

            return result;
        }
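Example #1 calls an InitCrawler() helper that is not shown in the snippet. A minimal sketch of what such a helper could look like with Abot's CrawlConfiguration and PoliteWebCrawler follows; the property values are illustrative assumptions, not the original project's settings.

        // Hypothetical InitCrawler(): builds an Abot PoliteWebCrawler from a manually created configuration.
        // The limits below are assumptions; the original project's configuration is not shown.
        private IWebCrawler InitCrawler()
        {
            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl = 1000,                    // stop after this many pages
                MaxCrawlDepth = 2,                         // do not follow links deeper than this
                MinCrawlDelayPerDomainMilliSeconds = 1000  // be polite: wait between requests to the same domain
            };

            return new PoliteWebCrawler(config);
        }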
Example #2
        public void Crawl(int maxDepth, Uri link)
        {
            var depth      = 0;
            var lastWeight = 0.5;

            foreach (var vertex in crawler.Crawl(link))
            {
                var community = false;
                _printer.Print(vertex);

                var shouldCreate = _communityDecider.ShouldCreateCommunity(lastWeight, vertex.Weight);
                lastWeight = vertex.Weight;

                if (shouldCreate)
                {
                    community = true;
                }

                var newUser = new User {
                    UserId = vertex.Id, Weight = vertex.Weight, Community = community
                };

                _userRepository.Add(newUser);

                depth++;
                if (depth >= maxDepth)
                {
                    break;
                }

                AddChildren(newUser, vertex.Degrees);
            }
        }
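Example #2 depends on an ICommunityDecider that is not shown. Below is a hypothetical minimal implementation, purely to illustrate the contract assumed by ShouldCreateCommunity(lastWeight, vertex.Weight); the threshold rule is an assumption, not the original project's logic.

        // Hypothetical community decider: the real project's rule is not shown.
        // Assumption: a new community starts when the vertex weight changes by more than a fixed threshold.
        public interface ICommunityDecider
        {
            bool ShouldCreateCommunity(double lastWeight, double currentWeight);
        }

        public class ThresholdCommunityDecider : ICommunityDecider
        {
            private readonly double _threshold;

            public ThresholdCommunityDecider(double threshold = 0.25)
            {
                _threshold = threshold;
            }

            public bool ShouldCreateCommunity(double lastWeight, double currentWeight)
            {
                // A sharp jump or drop in weight is treated as a community boundary.
                return Math.Abs(currentWeight - lastWeight) > _threshold;
            }
        }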
Example #3
        async Task<CrawlResult> CrawlAndTest(Uri uri)
        {
            var res = await _crawler.Crawl(uri, Context.CancellationTokenSource).ConfigureAwait(false);

            _executionQueueBlock.Complete();
            await _notificationBlock.Completion.ConfigureAwait(false);

            return res;
        }
Example #4
        public Crawler(string argUrl)
        {
            TheWebCrawler = GetManuallyConfiguredWebCrawler();

            TheWebCrawler.PageCrawlCompleted       += PageCrawlCompletedEvent;
            TheWebCrawler.PageCrawlDisallowed      += PageCrawlDisallowedEvent;
            TheWebCrawler.PageCrawlStarting        += PageCrawlStartingEvent;
            TheWebCrawler.PageLinksCrawlDisallowed += PageLinksCrawlDisallowedEvent;

            var crawlResult = TheWebCrawler.Crawl(new Uri(argUrl));
        }
Example #5
        public Crawler(string argUrl)
        {
            TheWebCrawler = GetManuallyConfiguredWebCrawler();

            TheWebCrawler.PageCrawlCompleted += PageCrawlCompletedEvent;
            TheWebCrawler.PageCrawlDisallowed += PageCrawlDisallowedEvent;
            TheWebCrawler.PageCrawlStarting += PageCrawlStartingEvent;
            TheWebCrawler.PageLinksCrawlDisallowed += PageLinksCrawlDisallowedEvent;

            var crawlResult = TheWebCrawler.Crawl(new Uri(argUrl));
        }
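Examples #4 and #5 wire four Abot events but do not include the handlers. Here is a minimal sketch of what such handlers commonly look like; the bodies are assumptions, and only the Abot 1.x-style event argument types (PageCrawlStartingArgs, PageCrawlCompletedArgs, PageCrawlDisallowedArgs, PageLinksCrawlDisallowedArgs) are taken as given.

        // Hypothetical event handlers for the subscriptions above (Abot 1.x-style signatures).
        private void PageCrawlStartingEvent(object sender, PageCrawlStartingArgs e)
        {
            Console.WriteLine("About to crawl {0}", e.PageToCrawl.Uri.AbsoluteUri);
        }

        private void PageCrawlCompletedEvent(object sender, PageCrawlCompletedArgs e)
        {
            var page = e.CrawledPage;
            if (page.WebException != null || page.HttpWebResponse == null || page.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of {0} failed", page.Uri.AbsoluteUri);
            }
        }

        private void PageCrawlDisallowedEvent(object sender, PageCrawlDisallowedArgs e)
        {
            Console.WriteLine("Page {0} was not crawled: {1}", e.PageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
        }

        private void PageLinksCrawlDisallowedEvent(object sender, PageLinksCrawlDisallowedArgs e)
        {
            Console.WriteLine("Links on {0} were not crawled: {1}", e.CrawledPage.Uri.AbsoluteUri, e.DisallowedReason);
        }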
Example #6
        public void StartCrawl()
        {
            DateTime timeStamp = DateTime.Now;

            CrawlerDefinition.StartTime  = timeStamp;
            CrawlerDefinition.InProgress = true;
            _repo.UpdateCrawl(CrawlerDefinition);


            OnDomainCrawlStarted(CrawlerDefinition);

            #region log start

            if (_logger.IsDebugEnabled)
            {
                _logger.DebugFormat("Starting crawl sessionId: {0} seed: {1}", CrawlerDefinition.SessionId, CrawlerDefinition.SeedUrl);
            }

            #endregion

            CrawlResult result = _crawler.Crawl(Seed, _cancelToken);

            #region log end

            if (_logger.IsDebugEnabled)
            {
                _logger.DebugFormat("Ended crawl elapsed: {0}", result.Elapsed);
            }

            #endregion

            CrawlerDefinition.InProgress    = false;
            CrawlerDefinition.EndTime       = CrawlerDefinition.StartTime.Add(result.Elapsed);
            CrawlerDefinition.ErrorOccurred = result.ErrorOccurred;

            _repo.UpdateCrawl(CrawlerDefinition);
            OnDomainCrawlEnded(CrawlerDefinition);

            if (result.ErrorOccurred)
            {
                var mssg = string.Format("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
                _logger.Error(mssg);
            }
            else
            {
                var mssg = string.Format("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
                _logger.Debug(mssg);
            }
        }
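Example #6 passes a CancellationTokenSource to Crawl, the same overload used in Examples #9 and #14. A short sketch of how a caller might use it to stop a long-running crawl; the time budget is an assumption.

            // Hypothetical caller-side cancellation: stop the crawl after a fixed time budget (value assumed).
            var cancelSource = new CancellationTokenSource();
            cancelSource.CancelAfter(TimeSpan.FromMinutes(10));

            // Same Crawl(Uri, CancellationTokenSource) overload as in Example #6; once the token
            // is cancelled, the crawler is expected to stop scheduling pages and return its result.
            CrawlResult result = _crawler.Crawl(Seed, cancelSource);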
Example #7
        public void Run(string[] args)
        {
            // Reads from the command line arguments.
            _input.ReadArguments(args);
            var domain   = _input.DomainUri;
            var filePath = _input.OutputFilePath;
            var wait     = _input.WaitBeforeEnd;

            // Crawls the domain and prints the results.
            var node = _crawler.Crawl(domain);
            var text = _output.Generate(node);

            _output.Save(filePath, text);
            _output.Write(text, wait);
        }
Example #8
        public void CrawlAndAssert(IWebCrawler crawler)
        {
            crawler.PageCrawlCompletedAsync += crawler_PageCrawlCompleted;

            CrawlResult result = crawler.Crawl(_rootUri);

            Assert.IsNull(result.ErrorException);
            Assert.IsFalse(result.ErrorOccurred);
            Assert.AreSame(_rootUri, result.RootUri);

            List<Discrepancy> descrepancies = GetDescrepancies();
            PrintDescrepancies(descrepancies);

            Assert.AreEqual(0, descrepancies.Count, "There were discrepancies between expected and actual crawl results. See output window for details.");
            Assert.IsTrue(result.Elapsed.TotalSeconds < _maxSecondsToCrawl, string.Format("Elapsed Time to crawl {0}, over {1} second threshold", result.Elapsed.TotalSeconds, _maxSecondsToCrawl));
        }
Example #9
        public DomainCrawlResult Consume(Domain domain, CancellationTokenSource cancellationToken)
        {
            if (domain == null)
            {
                throw new ArgumentNullException("domain");
            }

            if (cancellationToken == null)
            {
                throw new ArgumentNullException("cancellationToken");
            }

            IEnumerable<ICrawlProcessor> processors = _processorProvider.GetProcessors().ToList(); // have to .ToList() since the deferred execution would cause a new instance of each processor to be created with every page
            IWebCrawler crawler = CreateCrawlerInstance();

            DomainCrawlResult domainCrawlResult = new DomainCrawlResult();

            domainCrawlResult.Domain = domain;
            try
            {
                crawler.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
                {
                    Domain = domain,
                    PrimaryPersistenceProvider = _processorContext.PrimaryPersistenceProvider,
                    BackupPersistenceProvider  = _processorContext.BackupPersistenceProvider,
                    CrawlProcessors            = processors
                };

                domainCrawlResult.CrawlResult = crawler.Crawl(domain.Uri, cancellationToken);

                ProcessCrawledDomain(domainCrawlResult.CrawlResult.CrawlContext);
            }
            catch (Exception ex)
            {
                string errorMessage = string.Format("Exception occurred while crawling [{0}], error: [{1}]", domain.Uri.AbsoluteUri, ex.Message);
                domainCrawlResult.CrawlResult = new CrawlResult {
                    ErrorException = ex
                };

                _logger.Error(errorMessage, ex);
                //TODO Statsg fatal error occurred during crawl
                StatsGLoggerAppender.LogItem(StatLogType.CrawlDaddy_FatalErrorOccured, _config);
            }

            LogCrawlResult(domainCrawlResult.CrawlResult);
            return domainCrawlResult;
        }
Example #10
        public void CrawlAndAssert(IWebCrawler crawler)
        {
            crawler.PageCrawlCompletedAsync += crawler_PageCrawlCompleted;

            CrawlResult result = crawler.Crawl(_rootUri);

            Assert.IsNull(result.ErrorException);
            Assert.IsFalse(result.ErrorOccurred);
            Assert.AreSame(_rootUri, result.RootUri);

            List<Discrepancy> descrepancies = GetDescrepancies();

            PrintDescrepancies(descrepancies);

            Assert.AreEqual(0, descrepancies.Count, "There were discrepancies between expected and actual crawl results. See output window for details.");
            Assert.IsTrue(result.Elapsed.TotalSeconds < _maxSecondsToCrawl, string.Format("Elapsed Time to crawl {0}, over {1} second threshold", result.Elapsed.TotalSeconds, _maxSecondsToCrawl));
        }
Example #11
        public CrawlerViewModel Crawl(CrawlerViewModel viewModel)
        {
            if (!Helper.IsValidUrl(viewModel.UrlToCrawl))
            {
                viewModel.ErrorMsg = "Please enter a valid URL";
                return viewModel;
            }

            allLinksOnPage = new List<Uri>();
            CrawlConfiguration config = new CrawlerNetConfig().Initalize();

            this.crawler = new PoliteWebCrawler(config);

            crawler.PageCrawlCompleted += crawler_PageCrawlCompleted;

            CrawlResult result = crawler.Crawl(new Uri(viewModel.UrlToCrawl));

            if (result.ErrorOccurred)
            {
                viewModel.ErrorMsg = String.Format("Crawler completed with error: {0}", result.ErrorException.Message);
            }

            var isProd = Convert.ToBoolean(ConfigurationManager.AppSettings["IsProd"].ToString());

            if (isProd)
            {
                viewModel.CrawledLinks.AddRange(allLinksOnPage);
            }
            else
            {
                viewModel.CrawledLinks.AddRange(allLinksOnPage.Take(10));
            }

            viewModel.SuccessMsg = "Successfully listed!";

            return viewModel;
        }
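Example #11 fills allLinksOnPage inside a crawler_PageCrawlCompleted handler that is not shown. A hypothetical version follows, assuming the links discovered on each page are exposed through Abot's CrawledPage.ParsedLinks.

        // Hypothetical handler: the original implementation is not shown.
        // Assumes Abot 1.x-style PageCrawlCompletedArgs and CrawledPage.ParsedLinks (IEnumerable<Uri>).
        private void crawler_PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            var crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.ParsedLinks == null)
            {
                return; // request failed or no links were parsed on this page
            }

            allLinksOnPage.AddRange(crawledPage.ParsedLinks);
        }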
Example #12
        public CrawlResult Crawl()
        {
            IWebCrawler crawler = InitCrawler();

            //http://hh.ua/search/vacancy?no_magic=true&items_on_page=100&currency_code=UAH&clusters=true&page=0    //-- ALL
            // --- UA
            Uri uriToCrawl = new Uri("http://hh.ua/search/vacancy?no_magic=true&items_on_page=100&clusters=true&currency_code=UAH&area=5&page=0");


            //var urlPattern=@"^http://hh\.ua/search/vacancy\?no_magic=true&items_on_page=100&currency_code=UAH&clusters=true&page=[0-9]+$"; // -- ALL
            var urlPattern = @"^http://hh\.ua/search/vacancy\?no_magic=true&items_on_page=100&clusters=true&currency_code=UAH&area=5&page=[0-9]+$"; // -- UA

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                if (Regex.IsMatch(pageToCrawl.Uri.ToString(), urlPattern, RegexOptions.IgnoreCase))
                {
                    return new CrawlDecision { Allow = true };
                }

                return new CrawlDecision { Allow = false, Reason = "Parse only job pages" };
            });

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(uriToCrawl);

            return result;
        }
Example #13
 public Task StartAsync(CancellationToken cancellationToken)
 {
     _logger.LogInformation("Service starting");
     _crawler.Crawl();
     return Task.FromResult(0);
 }
Example #14
        public DomainCrawlResult Consume(Domain domain, CancellationTokenSource cancellationToken)
        {
            if (domain == null)
            {
                throw new ArgumentNullException("domain");
            }

            if (cancellationToken == null)
            {
                throw new ArgumentNullException("cancellationToken");
            }

            LogCrawlBegin(domain);

            IEnumerable<ICrawlProcessor> processors = _processorProvider.GetProcessors().ToList(); // have to .ToList() since the deferred execution would cause a new instance of each processor to be created with every page
            IWebCrawler crawler = CreateCrawlerInstance();

            DomainCrawlResult domainCrawlResult = new DomainCrawlResult();

            domainCrawlResult.Domain = domain;
            try
            {
                crawler.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
                {
                    Domain = domain,
                    PrimaryPersistenceProvider = _processorContext.PrimaryPersistenceProvider,
                    BackupPersistenceProvider  = _processorContext.BackupPersistenceProvider,
                    CrawlProcessors            = processors
                };

                // Call the parked-page processor; if the domain is parked there is no need to crawl anything.
                ICrawlProcessor parkedProc = processors.FirstOrDefault(p => p.GetType().Name == "ParkedCrawlProcessor");
                CrawlContext    cc         = new CrawlContext {
                    RootUri = domain.Uri, CrawlBag = crawler.CrawlBag
                };
                if (!Object.Equals(null, parkedProc))
                {
                    parkedProc.ProcessCrawledDomain(cc);
                }

                // If not parked, or there is no parked-page processor, continue crawling the site.
                if (Object.Equals(null, parkedProc) || !cc.CrawlBag.NoCrawl)
                {
                    domainCrawlResult.CrawlResult = crawler.Crawl(domain.Uri, cancellationToken);
                    ProcessCrawledDomain(domainCrawlResult.CrawlResult.CrawlContext);
                }
            }
            catch (Exception ex)
            {
                string errorMessage = string.Format("Exception occurred while crawling [{0}], error: [{1}]", domain.Uri.AbsoluteUri, ex.Message);
                domainCrawlResult.CrawlResult = new CrawlResult {
                    ErrorException = ex
                };

                _logger.Error(errorMessage, ex);
            }

            if (!Object.Equals(null, domainCrawlResult.CrawlResult)) //could be null if we don't crawl it due to being a parked page or no A record
            {
                LogCrawlResult(domainCrawlResult.CrawlResult);
            }

            return domainCrawlResult;
        }