Example #1
        private bool ProcessExtractedUrl(CrawlRequest request, Uri url)
        {
            var newUrl = false;

            if (this.discoveredUrls.TryGetValue(url, out var urlProperties) == false)
            {
                urlProperties = new CrawledUrlPropertiesImpl(url);
                if (this.discoveredUrls.TryAdd(url, urlProperties))
                {
                    newUrl = true;
                    this.observer.OnNewUrl(url);
                }
                else
                {
                    // Another thread registered this URL between TryGetValue
                    // and TryAdd; use the properties it stored.
                    urlProperties = this.discoveredUrls[url];
                }
            }

            // Record the page being crawled as a referrer of the extracted URL.
            urlProperties.referrers.TryAdd(request.Url, 0);

            return newUrl;
        }
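
Side note: the TryGetValue / TryAdd / indexer sequence above is the usual race-safe pattern for a ConcurrentDictionary. Assuming discoveredUrls is a ConcurrentDictionary<Uri, CrawledUrlPropertiesImpl>, the same logic can be expressed with GetOrAdd; a minimal sketch (the method name is hypothetical, and it allocates a candidate even for already-known URLs):

        // Hypothetical alternative, assuming discoveredUrls is a
        // ConcurrentDictionary<Uri, CrawledUrlPropertiesImpl>.
        private bool ProcessExtractedUrlViaGetOrAdd(CrawlRequest request, Uri url)
        {
            var candidate     = new CrawledUrlPropertiesImpl(url);
            var urlProperties = this.discoveredUrls.GetOrAdd(url, candidate);

            // GetOrAdd returns our candidate only if no other thread
            // registered this URL first.
            var newUrl = ReferenceEquals(urlProperties, candidate);
            if (newUrl)
            {
                this.observer.OnNewUrl(url);
            }

            urlProperties.referrers.TryAdd(request.Url, 0);
            return newUrl;
        }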
Example #2
 public bool ShouldCrawl(CrawlRequest crawlRequest)
 {
     // A request is crawled only if every registered filter approves it.
     return this.filters.All(f => f.ShouldCrawl(crawlRequest));
 }
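
For context, ShouldCrawl above AND-combines a list of filters, so any single filter can veto a request. A minimal sketch of one such filter, where the ICrawlRequestFilter interface name and the MaxDepthFilter class are assumptions (only CrawlRequest and its Depth property appear in the examples):

 // Hypothetical filter; the interface name is an assumption.
 public interface ICrawlRequestFilter
 {
     bool ShouldCrawl(CrawlRequest crawlRequest);
 }

 // Rejects requests that would descend deeper than the configured limit.
 public class MaxDepthFilter : ICrawlRequestFilter
 {
     private readonly int maxDepth;

     public MaxDepthFilter(int maxDepth) => this.maxDepth = maxDepth;

     public bool ShouldCrawl(CrawlRequest crawlRequest) =>
         crawlRequest.Depth <= this.maxDepth;
 }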
Example #3
        private async Task<bool> Crawl(CrawlRequest request, CancellationToken cancellationToken, bool isLastTry)
        {
            this.observer.OnCrawling(request);

            var requestStopWatch = Stopwatch.StartNew();

            try
            {
                var httpRequestMessage = new HttpRequestMessage
                {
                    RequestUri = request.Url,
                    Method     = HttpMethod.Get,
                };

                if (string.IsNullOrEmpty(this.userAgent) == false)
                {
                    httpRequestMessage.Headers.TryAddWithoutValidation("User-Agent", this.userAgent);
                }

                foreach (var customHttpHeader in this.customHttpHeaders)
                {
                    httpRequestMessage.Headers.Add(customHttpHeader.Key, customHttpHeader.Value);
                }

                using (var response = await this.httpClient.SendAsync(httpRequestMessage, cancellationToken))
                {
                    // Record the response status on the URL's properties.
                    this.discoveredUrls[request.Url].status = response.StatusCode;

                    if (response.IsSuccessStatusCode)
                    {
                        this.observer.OnCrawled(new CrawlResult(request.Url, response.StatusCode, request.Referrer, requestStopWatch.Elapsed));

                        // Extract links from the page body and enqueue the ones we have not seen.
                        var links = await this.linkExtractor.ExtractLinks(request, response.Content);

                        foreach (var url in links)
                        {
                            if (cancellationToken.IsCancellationRequested)
                            {
                                break;
                            }

                            if (this.ProcessExtractedUrl(request, url))
                            {
                                var crawlRequest = new CrawlRequest(url, request.Url, request.Depth + 1);

                                if (this.crawlRequestFilter.ShouldCrawl(crawlRequest) == false)
                                {
                                    continue;
                                }

                                this.workQueue.Add(crawlRequest, cancellationToken);
                            }
                        }

                        return true;
                    }
                    else
                    {
                        var crawlError = new CrawlError(request.Url, response.StatusCode, request.Referrer);
                        this.ReportError(isLastTry, crawlError);
                        return false;
                    }
                }
            }
            catch (OperationCanceledException ex)
            {
                if (cancellationToken.IsCancellationRequested == false)
                {
                    // Our token was not canceled, so this most likely indicates
                    // an HttpClient timeout, but there is no easy way to tell:
                    // https://github.com/dotnet/corefx/issues/20296

                    var exception  = new OperationCanceledException($"Task canceled for {request.Url} after {requestStopWatch.Elapsed} (timeout?).", ex);
                    var crawlError = new CrawlError(request.Url, exception, request.Referrer);
                    this.ReportError(isLastTry, crawlError);
                }

                // Otherwise cancellation was requested; stop processing new URLs.
                return false;
            }
            catch (Exception e)
            {
                var crawlError = new CrawlError(request.Url, e, request.Referrer);
                this.ReportError(isLastTry, crawlError);
                return false;
            }
        }
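
The isLastTry parameter suggests the caller retries failed requests and reports an error only on the final attempt. A minimal sketch of such a caller, where the method name and the retry budget are assumptions:

        // Hypothetical retry wrapper; maxRetries and the method name are
        // assumptions. Crawl returns false on any failure and reports the
        // error itself when isLastTry is true.
        private async Task CrawlWithRetries(CrawlRequest request, CancellationToken cancellationToken)
        {
            const int maxRetries = 3;

            for (var attempt = 1; attempt <= maxRetries; attempt++)
            {
                var isLastTry = attempt == maxRetries;

                if (await this.Crawl(request, cancellationToken, isLastTry))
                {
                    return;
                }

                if (cancellationToken.IsCancellationRequested)
                {
                    return;
                }
            }
        }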
Example #4
 public void OnCrawling(CrawlRequest request)
 {
     Console.WriteLine($"CRAWLING: {request.Url}");
 }
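
Example #3 also invokes OnNewUrl and OnCrawled on the same observer. A minimal console observer covering all three callbacks could look like the sketch below; the ICrawlerObserver interface name and the CrawlResult property names are assumptions, while the callback signatures follow the calls made in Examples #1 and #3:

 // Hypothetical observer; the interface name is an assumption.
 public class ConsoleCrawlerObserver : ICrawlerObserver
 {
     public void OnNewUrl(Uri url)
     {
         Console.WriteLine($"NEW URL: {url}");
     }

     public void OnCrawling(CrawlRequest request)
     {
         Console.WriteLine($"CRAWLING: {request.Url}");
     }

     public void OnCrawled(CrawlResult result)
     {
         // Assumes CrawlResult exposes a Url property mirroring its
         // constructor's first argument in Example #3.
         Console.WriteLine($"CRAWLED: {result.Url}");
     }
 }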