コード例 #1
0
        /// <summary>
        /// Writes a failed crawl to the console, files it under warnings or errors,
        /// and requests cancellation once the configured error limit is reached.
        /// </summary>
        /// <param name="error">The failure to report; carries either an exception or an HTTP status.</param>
        public void OnError(CrawlError error)
        {
            var failedWithException = error.Exception != null;

            Console.WriteLine(failedWithException
                ? $"ERROR: {error.Exception.FlattenInnerMessages()}: '{error.Url}'"
                : $"{error.Status}: '{error.Url}' (Referrer: '{error.Referrer}')");

            // Only a plain 404 response (no exception attached) is downgraded to a warning.
            var isWarning = !failedWithException && error.Status == HttpStatusCode.NotFound;

            if (isWarning)
            {
                this.warnings.Add(error);
            }
            else
            {
                this.errors.Add(error);
            }

            // No limit configured, or limit not yet reached: nothing more to do.
            if (!this.maxErrors.HasValue || this.errors.Count < this.maxErrors.Value)
            {
                return;
            }

            this.cancellationTokenSource.Cancel();
        }
コード例 #2
0
ファイル: Crawler.cs プロジェクト: fortedigital/SmokeTester
 /// <summary>
 /// Forwards a crawl failure to the observer: as a retry notification while
 /// attempts remain, or as an error on the last attempt.
 /// </summary>
 /// <param name="isLastTry"><c>true</c> when this was the last attempt.</param>
 /// <param name="crawlError">The failure to forward.</param>
 private void ReportError(bool isLastTry, CrawlError crawlError)
 {
     if (!isLastTry)
     {
         this.observer.OnRetrying(crawlError);
         return;
     }

     this.observer.OnError(crawlError);
 }
コード例 #3
0
        /// <summary>
        /// Stores the given error: a 404 (Not Found) status goes to <c>Warnings</c>,
        /// everything else goes to <c>Errors</c>.
        /// </summary>
        /// <param name="error">The crawl error to store.</param>
        /// <returns>An already completed task; the work is done synchronously.</returns>
        public Task CollectError(CrawlError error)
        {
            if (error.Status != HttpStatusCode.NotFound)
            {
                this.Errors.Add(error);

                return Task.CompletedTask;
            }

            this.Warnings.Add(error);

            return Task.CompletedTask;
        }
コード例 #4
0
ファイル: Crawler.cs プロジェクト: fortedigital/SmokeTester
        /// <summary>
        /// Issues an HTTP GET for the requested URL, reports the outcome to the observer and,
        /// when the response has a success status code, adds the extracted links to the work queue.
        /// </summary>
        /// <param name="request">The URL to fetch, together with its referrer and depth.</param>
        /// <param name="cancellationToken">
        /// Passed to the HTTP call and to the work queue; also checked between extracted links.
        /// </param>
        /// <param name="isLastTry">Forwarded to <c>ReportError</c> whenever a failure is reported.</param>
        /// <returns>
        /// <c>true</c> when the response had a success status code and its links were processed;
        /// <c>false</c> on a non-success status, on cancellation, or on any exception.
        /// </returns>
        private async Task<bool> Crawl(CrawlRequest request, CancellationToken cancellationToken, bool isLastTry)
        {
            this.observer.OnCrawling(request);

            var requestStopWatch = Stopwatch.StartNew();

            try
            {
                // HttpRequestMessage is IDisposable, so it is released once the exchange is over.
                // The using stays inside the try so header failures are still reported below.
                using (var httpRequestMessage = new HttpRequestMessage
                {
                    RequestUri = request.Url,
                    Method     = HttpMethod.Get,
                })
                {
                    if (!string.IsNullOrEmpty(this.userAgent))
                    {
                        httpRequestMessage.Headers.TryAddWithoutValidation("User-Agent", this.userAgent);
                    }

                    foreach (var customHttpHeader in this.customHttpHeaders)
                    {
                        httpRequestMessage.Headers.Add(customHttpHeader.Key, customHttpHeader.Value);
                    }

                    using (var response = await this.httpClient.SendAsync(httpRequestMessage, cancellationToken))
                    {
                        this.discoveredUrls[request.Url].status = response.StatusCode;

                        if (!response.IsSuccessStatusCode)
                        {
                            var crawlError = new CrawlError(request.Url, response.StatusCode, request.Referrer);
                            this.ReportError(isLastTry, crawlError);
                            return false;
                        }

                        this.observer.OnCrawled(new CrawlResult(request.Url, response.StatusCode, request.Referrer, requestStopWatch.Elapsed));

                        var links = await this.linkExtractor.ExtractLinks(request, response.Content);

                        foreach (var url in links)
                        {
                            if (cancellationToken.IsCancellationRequested)
                            {
                                break;
                            }

                            if (!this.ProcessExtractedUrl(request, url))
                            {
                                continue;
                            }

                            var crawlRequest = new CrawlRequest(url, request.Url, request.Depth + 1);

                            if (this.crawlRequestFilter.ShouldCrawl(crawlRequest))
                            {
                                this.workQueue.Add(crawlRequest, cancellationToken);
                            }
                        }

                        return true;
                    }
                }
            }
            catch (OperationCanceledException ex)
            {
                if (!cancellationToken.IsCancellationRequested)
                {
                    // Cancellation we did not request ourselves: most likely a timeout,
                    // but there is no easy way to tell for sure.
                    // https://github.com/dotnet/corefx/issues/20296

                    var exception  = new OperationCanceledException($"Task canceled for {request.Url} after {requestStopWatch.Elapsed} (timeout?).", ex);
                    var crawlError = new CrawlError(request.Url, exception, request.Referrer);
                    this.ReportError(isLastTry, crawlError);
                }

                // Otherwise the caller asked us to stop processing new URLs; nothing to report.
                return false;
            }
            catch (Exception e)
            {
                var crawlError = new CrawlError(request.Url, e, request.Referrer);
                this.ReportError(isLastTry, crawlError);
                return false;
            }
        }
コード例 #5
0
 /// <summary>
 /// Writes a "RETRYING" line to the console for a failed attempt, showing the
 /// flattened exception messages when an exception is attached and the status
 /// plus referrer otherwise.
 /// </summary>
 /// <param name="error">The failure that triggered the retry.</param>
 public void OnRetrying(CrawlError error)
 {
     string line;

     if (error.Exception == null)
     {
         line = $"RETRYING: '{error.Url}' : {error.Status}:  (Referrer: '{error.Referrer}')";
     }
     else
     {
         line = $"RETRYING: '{error.Url}' : {error.Exception.FlattenInnerMessages()}: ";
     }

     Console.WriteLine(line);
 }
コード例 #6
0
ファイル: Program.cs プロジェクト: fortedigital/SmokeTester
        /// <summary>
        /// Prints the referrers recorded for the failed URL: a heading with the total
        /// count, followed by at most ten referrers, one per line.
        /// </summary>
        /// <param name="result">Crawl results keyed by URL; looked up with <c>error.Url</c>.</param>
        /// <param name="error">The error whose URL's referrers are listed.</param>
        private static void WriteReferrers(IReadOnlyDictionary <Uri, CrawledUrlProperties> result, CrawlError error)
        {
            var referrers = result[error.Url].Referrers;
            var total     = referrers.Count();

            // The heading notes truncation only when there are more than ten referrers.
            var heading = total > 10
                ? $"Referrers (showing 10 of {total}):"
                : $"Referrers ({total}):";

            Console.WriteLine(heading);

            var listed = string.Join("\n  ", referrers.Take(10));

            Console.WriteLine($"  {listed}\n");
        }