/// <summary>
/// Records a crawl failure and cancels the whole crawl once the configured
/// error budget is exhausted. Failures with an exception always count as
/// errors; status-only failures count as warnings when the status is 404,
/// otherwise as errors.
/// </summary>
public void OnError(CrawlError error)
{
    if (error.Exception == null)
    {
        Console.WriteLine($"{error.Status}: '{error.Url}' (Referrer: '{error.Referrer}')");

        // A plain 404 is only a warning; every other status is a real error.
        if (error.Status == HttpStatusCode.NotFound)
        {
            this.warnings.Add(error);
        }
        else
        {
            this.errors.Add(error);
        }
    }
    else
    {
        Console.WriteLine($"ERROR: {error.Exception.FlattenInnerMessages()}: '{error.Url}'");
        this.errors.Add(error);
    }

    // Stop crawling once the error count reaches the (optional) cap.
    if (this.maxErrors.HasValue && this.errors.Count >= this.maxErrors)
    {
        this.cancellationTokenSource.Cancel();
    }
}
// Routes a failure to the observer: the final attempt surfaces it as an
// error, any earlier attempt as a retry notification.
private void ReportError(bool isLastTry, CrawlError crawlError)
{
    if (!isLastTry)
    {
        this.observer.OnRetrying(crawlError);
        return;
    }

    this.observer.OnError(crawlError);
}
/// <summary>
/// Buckets a crawl failure: 404s are collected as warnings, everything else
/// as errors. Always completes synchronously.
/// </summary>
public Task CollectError(CrawlError error)
{
    if (error.Status != HttpStatusCode.NotFound)
    {
        this.Errors.Add(error);
    }
    else
    {
        this.Warnings.Add(error);
    }

    return Task.CompletedTask;
}
/// <summary>
/// Fetches a single URL, reports the outcome to the observer, and enqueues any
/// newly discovered links for further crawling.
/// </summary>
/// <param name="request">The URL to fetch, plus its referrer and depth.</param>
/// <param name="cancellationToken">Signals either a timeout or a request to stop the crawl.</param>
/// <param name="isLastTry">True when no further retries will follow; failures then count as final errors.</param>
/// <returns>True on a successful (2xx) response; false on any failure.</returns>
private async Task <bool> Crawl(CrawlRequest request, CancellationToken cancellationToken, bool isLastTry)
{
    this.observer.OnCrawling(request);
    // Times the whole request; used both for the success report and the timeout message below.
    var requestStopWatch = Stopwatch.StartNew();
    try
    {
        var httpRequestMessage = new HttpRequestMessage
        {
            RequestUri = request.Url,
            Method = HttpMethod.Get,
        };
        if (string.IsNullOrEmpty(this.userAgent) == false)
        {
            // TryAddWithoutValidation: a configured agent string may not pass strict header validation.
            httpRequestMessage.Headers.TryAddWithoutValidation("User-Agent", this.userAgent);
        }

        foreach (var customHttpHeader in this.customHttpHeaders)
        {
            httpRequestMessage.Headers.Add(customHttpHeader.Key, customHttpHeader.Value);
        }

        using (var response = await this.httpClient.SendAsync(httpRequestMessage, cancellationToken))
        {
            // Record the final status for this URL whether or not the request succeeded.
            this.discoveredUrls[request.Url].status = response.StatusCode;
            if (response.IsSuccessStatusCode)
            {
                this.observer.OnCrawled(new CrawlResult(request.Url, response.StatusCode, request.Referrer, requestStopWatch.Elapsed));
                var links = await this.linkExtractor.ExtractLinks(request, response.Content);
                foreach (var url in links)
                {
                    // Stop queueing new work as soon as cancellation is requested.
                    if (cancellationToken.IsCancellationRequested)
                    {
                        break;
                    }

                    // NOTE(review): ProcessExtractedUrl presumably dedupes against discoveredUrls — confirm.
                    if (this.ProcessExtractedUrl(request, url))
                    {
                        var crawlRequest = new CrawlRequest(url, request.Url, request.Depth + 1);
                        if (this.crawlRequestFilter.ShouldCrawl(crawlRequest) == false)
                        {
                            continue;
                        }

                        this.workQueue.Add(crawlRequest, cancellationToken);
                    }
                }

                return(true);
            }
            else
            {
                // Non-success HTTP status: report as a final error or as a retry, depending on isLastTry.
                var crawlError = new CrawlError(request.Url, response.StatusCode, request.Referrer);
                this.ReportError(isLastTry, crawlError);
                return(false);
            }
        }
    }
    catch (OperationCanceledException ex)
    {
        if (cancellationToken.IsCancellationRequested == false)
        {
            // it means timeout but there is no easy way to find it out
            // https://github.com/dotnet/corefx/issues/20296
            var exception = new OperationCanceledException($"Task canceled for {request.Url} after {requestStopWatch.Elapsed} (timeout?).", ex);
            var crawlError = new CrawlError(request.Url, exception, request.Referrer);
            this.ReportError(isLastTry, crawlError);
        }

        // otherwise it means request to stop processing new urls
        return(false);
    }
    catch (Exception e)
    {
        // Any other failure (DNS, connection reset, extraction error, ...) is reported the same way.
        var crawlError = new CrawlError(request.Url, e, request.Referrer);
        this.ReportError(isLastTry, crawlError);
        return(false);
    }
}
/// <summary>
/// Logs a retry notice, including either the exception that triggered it or
/// the HTTP status and referrer.
/// </summary>
public void OnRetrying(CrawlError error)
{
    if (error.Exception != null)
    {
        Console.WriteLine($"RETRYING: '{error.Url}' : {error.Exception.FlattenInnerMessages()}: ");
    }
    else
    {
        Console.WriteLine($"RETRYING: '{error.Url}' : {error.Status}: (Referrer: '{error.Referrer}')");
    }
}
/// <summary>
/// Prints the referrers recorded for a failing URL, capped at the first 10
/// entries (with the true total shown when more exist).
/// </summary>
/// <param name="result">Crawl results keyed by URL; must contain <paramref name="error"/>'s URL.</param>
/// <param name="error">The failure whose referrers are printed.</param>
private static void WriteReferrers(IReadOnlyDictionary <Uri, CrawledUrlProperties> result, CrawlError error)
{
    // Materialize once: Referrers may be a lazily-evaluated sequence, and the
    // original code enumerated it twice (Count() and then Take(10)).
    var referrers = result[error.Url].Referrers.ToList();
    if (referrers.Count > 10)
    {
        Console.WriteLine($"Referrers (showing 10 of {referrers.Count}):");
    }
    else
    {
        Console.WriteLine($"Referrers ({referrers.Count}):");
    }

    Console.WriteLine($"  {string.Join("\n  ", referrers.Take(10))}\n");
}