Example #1
0
        /// <summary>
        /// Crawls a single page: raises the page-crawl-starting events, honors the
        /// minimum retry delay for retried pages, performs the HTTP request, and maps
        /// the scheduling metadata from <paramref name="pageToCrawl"/> onto the result.
        /// </summary>
        /// <param name="pageToCrawl">The page to request; its PageBag is merged into the result's bag.</param>
        /// <returns>The crawled page returned by the page requester.</returns>
        protected virtual CrawledPage CrawlThePage(PageToCrawl pageToCrawl)
        {
            _logger.DebugFormat("About to crawl page [{0}]", pageToCrawl.Uri.AbsoluteUri);

            // Raise both the async and sync starting events so subscribers of either style are notified.
            FirePageCrawlStartingEventAsync(pageToCrawl);
            FirePageCrawlStartingEvent(pageToCrawl);

            if (pageToCrawl.IsRetry)
            {
                WaitMinimumRetryDelay(pageToCrawl);
            }

            pageToCrawl.LastRequest = DateTime.Now;

            CrawledPage crawledPage = _pageRequester.MakeRequest(pageToCrawl.Uri, (x) => ShouldDownloadPageContentWrapper(x));

            // Combine the bags BEFORE mapping: Mapper.Map below copies pageToCrawl's bag onto
            // crawledPage, so the merged bag is re-assigned afterwards to keep both sides' entries.
            dynamic combinedPageBag = this.CombinePageBags(pageToCrawl.PageBag, crawledPage.PageBag);

            // NOTE(review): CreateMap is re-registered on every crawl; AutoMapper maps should be
            // configured once at startup (newer AutoMapper versions throw on repeated static config).
            Mapper.CreateMap<PageToCrawl, CrawledPage>();
            Mapper.Map(pageToCrawl, crawledPage);
            crawledPage.PageBag = combinedPageBag;

            if (crawledPage.HttpWebResponse == null)
            {
                _logger.InfoFormat("Page crawl complete, Status:[NA] Url:[{0}] Parent:[{1}] Retry:[{2}]", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri, crawledPage.RetryCount);
            }
            else
            {
                _logger.InfoFormat("Page crawl complete, Status:[{0}] Url:[{1}] Parent:[{2}] Retry:[{3}]", Convert.ToInt32(crawledPage.HttpWebResponse.StatusCode), crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri, crawledPage.RetryCount);
            }

            return crawledPage;
        }
Example #2
0
        /// <summary>
        /// Requests the given page, copies the scheduling metadata from
        /// <paramref name="pageToCrawl"/> onto the result, and logs the outcome.
        /// </summary>
        /// <param name="pageToCrawl">The page to request.</param>
        /// <returns>The crawled page returned by the page requester.</returns>
        protected virtual CrawledPage CrawlThePage(PageToCrawl pageToCrawl)
        {
            FirePageCrawlStartingEventAsync(pageToCrawl);

            pageToCrawl.LastRequest = DateTime.Now;

            CrawledPage result = _pageRequester.MakeRequest(pageToCrawl.Uri, ShouldDownloadPageContent);

            Map(pageToCrawl, result);

            _logger.LogInformation(
                "抓取完毕, 状态:[{0}] 地址:[{1}] 花费:[{2}] 重试:[{3}]",
                result.StatusCode,
                result.Uri.AbsoluteUri,
                result.Elapsed,
                result.RetryCount);

            return result;
        }
        /// <summary>
        /// Downloads and parses the robots.txt file for the given site root.
        /// </summary>
        /// <param name="rootUri">The root uri of the site whose robots.txt should be fetched.</param>
        /// <returns>The parsed robots.txt, or null when no usable robots.txt was found.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="rootUri"/> is null.</exception>
        public IRobotsDotText Find(Uri rootUri)
        {
            if (rootUri == null)
            {
                // nameof keeps the exception's parameter name correct through renames.
                throw new ArgumentNullException(nameof(rootUri));
            }

            Uri         robotsUri = new Uri(rootUri, "/robots.txt");
            CrawledPage page      = _pageRequester.MakeRequest(robotsUri);

            // Anything other than a clean 200 response (errors, exceptions, missing response)
            // is treated as "site has no robots.txt".
            if (page == null || page.WebException != null || page.HttpWebResponse == null || page.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                _logger.DebugFormat("Did not find robots.txt file at [{0}]", robotsUri);
                return null;
            }

            _logger.DebugFormat("Found robots.txt file at [{0}]", robotsUri);

            // NOTE(review): page.Content is assumed non-null on a successful request — confirm against the page requester contract.
            return new RobotsDotText(rootUri, page.Content.Text);
        }
Example #4
0
        /// <summary>
        /// Raises the page-crawl-starting events, performs the HTTP request for the page,
        /// and copies the scheduling metadata from <paramref name="pageToCrawl"/> onto the result.
        /// </summary>
        /// <param name="pageToCrawl">The page to request.</param>
        /// <returns>The crawled page returned by the http requester.</returns>
        protected virtual CrawledPage CrawlThePage(PageToCrawl pageToCrawl)
        {
            _logger.DebugFormat("About to crawl page [{0}]", pageToCrawl.Uri.AbsoluteUri);

            // Notify subscribers of both the async and sync event styles.
            FirePageCrawlStartingEventAsync(pageToCrawl);
            FirePageCrawlStartingEvent(pageToCrawl);

            CrawledPage crawledPage = _httpRequester.MakeRequest(pageToCrawl.Uri, page => ShouldDownloadPageContentWrapper(page));

            AutoMapper.Mapper.CreateMap<PageToCrawl, CrawledPage>();
            AutoMapper.Mapper.Map(pageToCrawl, crawledPage);

            if (crawledPage.HttpWebResponse != null)
            {
                _logger.InfoFormat("Page crawl complete, Status:[{0}] Url:[{1}] Parent:[{2}]", Convert.ToInt32(crawledPage.HttpWebResponse.StatusCode), crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri);
            }
            else
            {
                _logger.InfoFormat("Page crawl complete, Status:[NA] Url:[{0}] Parent:[{1}]", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri);
            }

            return crawledPage;
        }
        /// <summary>
        /// Crawls pages starting from <paramref name="startingUrl"/> until the scheduler
        /// runs out of links or <paramref name="maxUrlsToCrawl"/> pages have been requested.
        /// </summary>
        /// <param name="startingUrl">The first url to schedule.</param>
        /// <param name="maxUrlsToCrawl">Upper bound on the number of pages to request.</param>
        public async Task Crawl(Uri startingUrl, int maxUrlsToCrawl)
        {
            var currentCrawls = 0;

            _scheduler.Add(startingUrl);

            // Loop until the crawl budget is spent or the scheduler is exhausted.
            // (The original kept a redundant copy of maxUrlsToCrawl and a separate
            // crawlComplete flag; this is the equivalent simplified control flow.)
            while (currentCrawls < maxUrlsToCrawl)
            {
                if (_scheduler.Count == 0)
                {
                    // Finished the crawl - nothing left to schedule.
                    break;
                }

                var linkToCrawl = _scheduler.GetNext();
                ProcessPage(await _pageRequester.MakeRequest(linkToCrawl));
                currentCrawls++;
            }
        }
Example #6
0
        /// <summary>
        /// Smoke test: requesting a live site returns a non-null crawled page.
        /// NOTE(review): this hits the real network (google.co.uk), so it is an
        /// integration test and will fail without connectivity.
        /// </summary>
        public void DownloadPage_Google_Downloads()
        {
            var crawledPage = _requester.MakeRequest(new Uri("http://google.co.uk"));

            Assert.IsNotNull(crawledPage);
        }