Example #1
0
 /// <summary>
 /// Sends a DELETE request for the given crawled-page URI to the
 /// "http://athena/pages/crawlable" endpoint. Failures are logged and
 /// swallowed (best-effort delete) — this method never throws.
 /// </summary>
 /// <param name="uri">URI of the crawled page to delete.</param>
 public void Delete(string uri)
 {
     try
     {
         // NOTE(review): creating an HttpClient per call risks socket
         // exhaustion under load — consider a shared instance or
         // IHttpClientFactory. Kept as-is to avoid changing lifetime here.
         using (var client = new HttpClient())
         {
             var requestUri = new Uri("http://athena/pages/crawlable");
             var model      = new CrawledPageModel()
             {
                 Uri = uri
             };
             var serializedModel = JsonConvert.SerializeObject(model);
             var content         = new StringContent(serializedModel, Encoding.UTF8, "application/json");
             using (var request = new HttpRequestMessage()
             {
                 Content    = content,
                 Method     = HttpMethod.Delete,
                 RequestUri = requestUri
             })
             // GetAwaiter().GetResult() blocks like .Result did, but surfaces
             // the original exception instead of an AggregateException, so the
             // catch below logs the real failure.
             using (var res = client.SendAsync(request).GetAwaiter().GetResult())
             {
                 // Response body is intentionally ignored; only failures matter.
             }
         }
     }
     catch (Exception ex)
     {
         // Fixed typo in the log message: "ot" -> "to".
         _logger.LogError(0, ex, $"Failed to delete crawled page [{uri}]");
     }
 }
Example #2
0
        /// <summary>
        /// Issues a HEAD request for every URL discovered on the page and
        /// yields an error result for each link that does not respond with a
        /// success status code. Yields nothing when the page has no links.
        /// </summary>
        /// <param name="page">The crawled page whose discovered links are checked.</param>
        /// <returns>One <see cref="PageCrawlResult"/> per broken link.</returns>
        public IEnumerable <PageCrawlResult> RunCheck(CrawledPageModel page)
        {
            if (page.FoundUrls?.Any() != true)
            {
                yield break;
            }

            foreach (var url in page.FoundUrls)
            {
                using (var message = new HttpRequestMessage(HttpMethod.Head, url))
                // Fixed: the response was never disposed.
                // NOTE(review): blocking on .Result inside an iterator keeps
                // the sync signature but risks deadlocks on a sync context.
                using (var response = _httpClient.SendAsync(message, CancellationToken.None).Result)
                {
                    if (!response.IsSuccessStatusCode)
                    {
                        // Fixed: the terminating ';' sat outside the if block,
                        // leaving this yield return statement unterminated
                        // (compile error).
                        yield return new PageCrawlResult
                               {
                                   Check       = this,
                                   Result      = SiteCrawlResultType.Error,
                                   ExtraValues = new Dictionary <string, string>
                                   {
                                       { BrokenLinkUrl, url.ToString() },
                                       { BrokenLinkStatusCode, response.StatusCode.ToString() }
                                   }
                               };
                    }
                }
            }
        }
        /// <summary>
        /// Performs a GET request for <paramref name="uri"/> and builds a
        /// <see cref="CrawledPageModel"/> carrying the status code, request
        /// timings, and the parsed HTML content. Request and parse failures
        /// are swallowed (best-effort crawl) — the returned model is always
        /// non-null, possibly only partially populated.
        /// </summary>
        /// <param name="uri">The page to fetch. Must not be null.</param>
        /// <returns>The crawled page model for <paramref name="uri"/>.</returns>
        /// <exception cref="ArgumentNullException">When <paramref name="uri"/> is null.</exception>
        public async Task <CrawledPageModel> MakeRequest(Uri uri)
        {
            if (uri is null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            var crawledPage = new CrawledPageModel(uri);
            HttpResponseMessage response = null;

            try
            {
                // NOTE(review): DateTime.UtcNow would be safer for measuring
                // elapsed time; Now is kept to preserve existing behavior.
                crawledPage.RequestStarted = DateTime.Now;
                using (var requestMessage = new HttpRequestMessage(HttpMethod.Get, uri))
                {
                    response = await _httpClient.SendAsync(requestMessage, CancellationToken.None).ConfigureAwait(false);
                }
                crawledPage.StatusCode = Convert.ToInt32(response.StatusCode);
            }
            // Unused `ex` locals removed from all three catches (CS0168);
            // the deliberate swallow-on-failure behavior is preserved.
            catch (HttpRequestException)
            {
                //TODO: Log error?
            }
            catch (TaskCanceledException)
            {
                //TODO: Log error?
            }
            catch (Exception)
            {
                //TODO: Log error?
            }
            finally
            {
                crawledPage.RequestCompleted = DateTime.Now;
                try
                {
                    if (response != null)
                    {
                        var contentStream = await response.Content.ReadAsStreamAsync().ConfigureAwait(false);

                        var htmlDocument = new HtmlDocument();
                        htmlDocument.Load(contentStream, Encoding.UTF8);
                        crawledPage.Content = htmlDocument;
                    }
                }
                catch (Exception)
                {
                    //TODO: Log error?
                }
                finally
                {
                    // Fixed: the HttpResponseMessage was never disposed,
                    // leaking the underlying connection/content stream.
                    response?.Dispose();
                }
            }
            return(crawledPage);
        }
        /// <summary>
        /// Records the links discovered on a crawled page, queues any
        /// not-yet-known same-host links for crawling, marks the page itself
        /// as visited, and raises the page-crawl-completed event.
        /// </summary>
        /// <param name="page">The page that has just been crawled.</param>
        private void ProcessPage(CrawledPageModel page)
        {
            var discovered = _linkParser.GetLinks(page).ToArray();
            page.FoundUrls = discovered;

            foreach (var candidate in discovered)
            {
                // Only follow links we have not seen that stay on the same host.
                if (!_scheduler.IsUriKnown(candidate) && page.Url.Authority == candidate.Authority)
                {
                    _scheduler.Add(candidate);
                }
            }

            _scheduler.AddKnownUri(page.Url);

            var completedArgs = new PageCrawlCompleteArgs { Page = page };
            OnPageCrawlCompleted?.Invoke(this, completedArgs);
        }
        /// <summary>
        /// Extracts every anchor href from the page's parsed HTML, resolved
        /// against the page's own URL. Yields nothing when the page has no
        /// content or no anchors. Hrefs that cannot be parsed into a URI are
        /// skipped instead of aborting the whole enumeration.
        /// </summary>
        /// <param name="page">The crawled page to scan. Must not be null.</param>
        /// <returns>Absolute URIs for each parseable href on the page.</returns>
        /// <exception cref="ArgumentNullException">When <paramref name="page"/> is null.</exception>
        public IEnumerable <Uri> GetLinks(CrawledPageModel page)
        {
            if (page is null)
            {
                throw new ArgumentNullException(nameof(page));
            }

            // The XPath predicate [@href] guarantees the attribute exists below.
            var links = page.Content?.DocumentNode.SelectNodes("//a[@href]");

            if (links is null)
            {
                yield break;
            }

            var baseUri = page.Url;

            foreach (var link in links)
            {
                var hrefValue = link.Attributes["href"].Value;

                // Fixed: a single malformed href previously threw
                // UriFormatException from `new Uri(...)` and killed the
                // entire iteration; unparseable hrefs are now skipped.
                if (Uri.TryCreate(baseUri, hrefValue, out var resolved))
                {
                    yield return resolved;
                }
            }
        }