Example no. 1
0
        /// <summary>
        /// Crawls the given page: raises the "page crawl starting" events, honors the
        /// minimum retry delay for retried pages, issues the HTTP request, and maps the
        /// PageToCrawl metadata onto the resulting CrawledPage.
        /// </summary>
        /// <param name="pageToCrawl">The page to request; its Uri is fetched.</param>
        /// <returns>The CrawledPage produced by the page requester, enriched with the
        /// original PageToCrawl data and a combined PageBag.</returns>
        protected virtual async Task<CrawledPage> CrawlThePageAsync(PageToCrawl pageToCrawl)
        {
            _logger.LogDebug($"About to crawl page [{pageToCrawl.Uri.AbsoluteUri}]");

            // Both the async and the sync flavors of the "starting" event are raised so
            // subscribers of either style get notified.
            // NOTE(review): the async variant is not awaited — an exception thrown by an
            // async subscriber is unobserved; confirm this fire-and-forget is intentional.
            FirePageCrawlStartingEventAsync(pageToCrawl);
            FirePageCrawlStartingEvent(pageToCrawl);

            if (pageToCrawl.IsRetry)
            {
                WaitMinimumRetryDelay(pageToCrawl);
            }

            pageToCrawl.LastRequest = DateTime.Now;

            var crawledPage = await _pageRequester.MakeRequestAsync(pageToCrawl.Uri, ShouldDownloadPageContent);

            // Merge the pre-crawl bag with whatever the requester attached to the result.
            dynamic combinedPageBag = CombinePageBags(pageToCrawl.PageBag, crawledPage.PageBag);

            // NOTE(review): configuring AutoMapper per call is wasteful — CreateMap should
            // run once at startup (or via a MapperConfiguration), not on every crawled page.
            Mapper.CreateMap<PageToCrawl, CrawledPage>();
            Mapper.Map(pageToCrawl, crawledPage);
            crawledPage.PageBag = combinedPageBag;

            if (crawledPage.HttpWebResponse == null)
            {
                _logger.LogInformation($"Page crawl complete, Status:[NA] Url:[{crawledPage.Uri.AbsoluteUri}] Elapsed:[{crawledPage.Elapsed}] Parent:[{crawledPage.ParentUri}] Retry:[{crawledPage.RetryCount}]");
            }
            else
            {
                _logger.LogInformation($"Page crawl complete, Status:[{Convert.ToInt32(crawledPage.HttpWebResponse.StatusCode)}] Url:[{crawledPage.Uri.AbsoluteUri}] Elapsed:[{crawledPage.Elapsed}] Parent:[{crawledPage.ParentUri}] Retry:[{crawledPage.RetryCount}]");
            }

            return crawledPage;
        }
Example no. 2
0
        /// <summary>
        /// Requests the given page and returns the crawled result, raising the
        /// "page crawl starting" event and honoring the retry delay beforehand.
        /// </summary>
        protected virtual async Task <CrawledPage> CrawlThePage(PageToCrawl pageToCrawl)
        {
            Log.Debug("About to crawl page [{0}]", pageToCrawl.Uri.AbsoluteUri);
            FirePageCrawlStartingEvent(pageToCrawl);

            // Retried pages must respect the configured minimum delay before re-requesting.
            if (pageToCrawl.IsRetry)
            {
                WaitMinimumRetryDelay(pageToCrawl);
            }

            pageToCrawl.LastRequest = DateTime.Now;

            var result = await _pageRequester
                .MakeRequestAsync(pageToCrawl.Uri, ShouldDownloadPageContent)
                .ConfigureAwait(false);

            // Carry the pre-crawl metadata over onto the crawled result.
            Map(pageToCrawl, result);

            if (result.HttpResponseMessage != null)
            {
                Log.Information("Page crawl complete, Status:[{0}] Url:[{1}] Elapsed:[{2}] Parent:[{3}] Retry:[{4}]", Convert.ToInt32(result.HttpResponseMessage.StatusCode), result.Uri.AbsoluteUri, result.Elapsed, result.ParentUri, result.RetryCount);
            }
            else
            {
                Log.Information("Page crawl complete, Status:[NA] Url:[{0}] Elapsed:[{1}] Parent:[{2}] Retry:[{3}]", result.Uri.AbsoluteUri, result.Elapsed, result.ParentUri, result.RetryCount);
            }

            return result;
        }
Example no. 3
0
        /// <summary>
        /// Finds the robots.txt file using the rootUri.
        /// If rootUri is http://yahoo.com, it will look for robots at http://yahoo.com/robots.txt.
        /// If rootUri is http://music.yahoo.com, it will look for robots at http://music.yahoo.com/robots.txt
        /// </summary>
        /// <param name="rootUri">The root domain</param>
        /// <returns>Object representing the robots.txt file or returns null</returns>
        /// <exception cref="ArgumentNullException">Thrown when rootUri is null.</exception>
        public IRobotsDotText Find(Uri rootUri)
        {
            if (rootUri == null)
            {
                throw new ArgumentNullException(nameof(rootUri));
            }

            Uri robotsUri = new Uri(rootUri, RobotsTxt);

            // When the crawl starts from a deep page rather than the site root, the
            // resolved robots uri may not share the original uri's prefix — skip robots.
            if (!robotsUri.ToString().Contains(rootUri.ToString()))
            {
                Logger.DebugFormat("Your url couldn't have robots.txt");
                return null;
            }

            // Sync-over-async is unavoidable here because this interface member is
            // synchronous. GetAwaiter().GetResult() is used instead of .Result so a
            // failure surfaces as the original exception, not an AggregateException.
            CrawledPage page = PageRequester.MakeRequestAsync(robotsUri).GetAwaiter().GetResult();

            if (page == null ||
                page.WebException != null ||
                page.HttpWebResponse == null ||
                page.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Logger.DebugFormat("Did not find robots.txt file at [{0}]", robotsUri);
                return null;
            }

            Logger.DebugFormat("Found robots.txt file at [{0}]", robotsUri);
            return new RobotsDotText(rootUri, page.Content.Text);
        }
Example no. 4
0
        /// <summary>
        /// Asynchronously finds the robots.txt file for the given root uri
        /// (e.g. http://yahoo.com -> http://yahoo.com/robots.txt).
        /// </summary>
        /// <param name="rootUri">The root domain to probe for robots.txt.</param>
        /// <returns>An IRobotsDotText wrapping the file's content, or null when the
        /// file could not be fetched (request error, missing response, or non-200
        /// status).</returns>
        /// <exception cref="ArgumentNullException">Thrown when rootUri is null.</exception>
        public async Task<IRobotsDotText> FindAsync(Uri rootUri)
        {
            if (rootUri == null)
            {
                throw new ArgumentNullException(nameof(rootUri));
            }

            var robotsUri = new Uri(rootUri, "/robots.txt");

            // ConfigureAwait(false): library code — no need to resume on the caller's context.
            var page = await _pageRequester.MakeRequestAsync(robotsUri).ConfigureAwait(false);

            if (page == null || page.WebException != null || page.HttpWebResponse == null || page.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                _logger.LogDebug($"Did not find robots.txt file at [{robotsUri}]");
                return null;
            }

            _logger.LogDebug($"Found robots.txt file at [{robotsUri}]");
            return new RobotsDotText(rootUri, page.Content.Text);
        }
        /// <summary>
        /// Downloads robots.txt for the given root uri and returns a parsed
        /// representation, or null when the file is missing or the request fails.
        /// </summary>
        public async Task <IRobotsDotText> FindAsync(Uri rootUri)
        {
            if (rootUri == null)
            {
                throw new ArgumentNullException(nameof(rootUri));
            }

            var robotsLocation = new Uri(rootUri, "/robots.txt");
            var response = await _pageRequester.MakeRequestAsync(robotsLocation).ConfigureAwait(false);

            // The fetch only counts when a response came back cleanly with HTTP 200.
            var fetched = response != null
                && response.HttpRequestException == null
                && response.HttpResponseMessage != null
                && response.HttpResponseMessage.StatusCode == HttpStatusCode.OK;

            if (!fetched)
            {
                Log.Debug("Did not find robots.txt file at [{0}]", robotsLocation);
                return null;
            }

            Log.Debug("Found robots.txt file at [{0}]", robotsLocation);

            return new RobotsDotText(rootUri, response.Content.Text);
        }