/// <summary>
/// Best-effort delete of a crawled-page record on the athena service: sends an
/// HTTP DELETE to the crawlable-pages endpoint with a JSON body identifying the
/// page by <paramref name="uri"/>. Any failure is logged and swallowed.
/// </summary>
/// <param name="uri">URI of the crawled page to delete.</param>
public void Delete(string uri)
{
    try
    {
        // NOTE(review): creating a new HttpClient per call risks socket
        // exhaustion under load — consider a shared instance or IHttpClientFactory.
        using (var client = new HttpClient())
        {
            var requestUri = new Uri("http://athena/pages/crawlable");
            var model = new CrawledPageModel() { Uri = uri };
            var serializedModel = JsonConvert.SerializeObject(model);

            // Dispose the request (and its content) — previously leaked.
            using (var content = new StringContent(serializedModel, Encoding.UTF8, "application/json"))
            using (var request = new HttpRequestMessage() { Content = content, Method = HttpMethod.Delete, RequestUri = requestUri })
            {
                // NOTE(review): sync-over-async (.Result) can deadlock on
                // frameworks with a synchronization context; the signature is
                // synchronous, so we block here deliberately.
                var res = client.SendAsync(request).Result;
            }
        }
    }
    catch (Exception ex)
    {
        // Log-message typo fixed ("ot" -> "to"); the delete stays best-effort.
        _logger.LogError(0, ex, $"Failed to delete crawled page [{uri}]");
    }
}
/// <summary>
/// Probes every URL discovered on the crawled page with an HTTP HEAD request
/// and yields a <see cref="PageCrawlResult"/> for each link that does not
/// return a success status code (a "broken link").
/// </summary>
/// <param name="page">Crawled page whose <c>FoundUrls</c> are verified.</param>
/// <returns>Lazily-evaluated error results, one per broken link.</returns>
public IEnumerable<PageCrawlResult> RunCheck(CrawledPageModel page)
{
    // Nothing to check when the page carries no discovered links.
    if (page.FoundUrls?.Any() != true)
    {
        yield break;
    }

    foreach (var url in page.FoundUrls)
    {
        using (var message = new HttpRequestMessage(HttpMethod.Head, url))
        // Dispose the response too — it was previously leaked. (Sync-over-async
        // .Result is kept because the iterator signature is synchronous.)
        using (var response = _httpClient.SendAsync(message, CancellationToken.None).Result)
        {
            if (!response.IsSuccessStatusCode)
            {
                yield return new PageCrawlResult
                {
                    Check = this,
                    Result = SiteCrawlResultType.Error,
                    ExtraValues = new Dictionary<string, string>
                    {
                        { BrokenLinkUrl, url.ToString() },
                        { BrokenLinkStatusCode, response.StatusCode.ToString() }
                    }
                };
            }
        }
    }
}
/// <summary>
/// Fetches <paramref name="uri"/> with an HTTP GET and returns a
/// <see cref="CrawledPageModel"/> carrying the request timings, the numeric
/// status code, and (when a response arrived) the parsed HTML content.
/// Request failures are swallowed; the returned model then has no status/content.
/// </summary>
/// <param name="uri">Absolute URI to fetch; must not be null.</param>
/// <returns>The populated crawl model — never null.</returns>
/// <exception cref="ArgumentNullException">When <paramref name="uri"/> is null.</exception>
public async Task<CrawledPageModel> MakeRequest(Uri uri)
{
    if (uri is null)
    {
        throw new ArgumentNullException(nameof(uri));
    }

    var crawledPage = new CrawledPageModel(uri);
    HttpResponseMessage response = null;
    try
    {
        // NOTE(review): local time is recorded here; UtcNow may be preferable
        // but changing it would alter recorded values — confirm with consumers.
        crawledPage.RequestStarted = DateTime.Now;
        using (var requestMessage = new HttpRequestMessage(HttpMethod.Get, uri))
        {
            response = await _httpClient.SendAsync(requestMessage, CancellationToken.None).ConfigureAwait(false);
        }
        crawledPage.StatusCode = Convert.ToInt32(response.StatusCode);
    }
    catch (Exception)
    {
        // Collapsed the former HttpRequestException / TaskCanceledException /
        // Exception clauses — all were identical empty handlers.
        //TODO: Log error?
    }
    finally
    {
        crawledPage.RequestCompleted = DateTime.Now;
        try
        {
            if (response != null)
            {
                var contentStream = await response.Content.ReadAsStreamAsync().ConfigureAwait(false);
                var htmlDocument = new HtmlDocument();
                htmlDocument.Load(contentStream, Encoding.UTF8);
                crawledPage.Content = htmlDocument;
            }
        }
        catch (Exception)
        {
            //TODO: Log error?
        }
        finally
        {
            // Fix: the response was never disposed (connection/buffer leak).
            response?.Dispose();
        }
    }
    return crawledPage;
}
/// <summary>
/// Post-processes one crawled page: records the links it contains, queues the
/// not-yet-known links that belong to the same authority (host) for crawling,
/// marks the page's own URL as known, and raises <c>OnPageCrawlCompleted</c>.
/// </summary>
private void ProcessPage(CrawledPageModel page)
{
    var discovered = _linkParser.GetLinks(page).ToArray();
    page.FoundUrls = discovered;

    foreach (var candidate in discovered)
    {
        // Only schedule unseen links that stay on the same host as the page.
        if (!_scheduler.IsUriKnown(candidate) && page.Url.Authority == candidate.Authority)
        {
            _scheduler.Add(candidate);
        }
    }

    _scheduler.AddKnownUri(page.Url);

    var completedArgs = new PageCrawlCompleteArgs() { Page = page };
    OnPageCrawlCompleted?.Invoke(this, completedArgs);
}
/// <summary>
/// Extracts every anchor link (<c>//a[@href]</c>) from the page's parsed HTML,
/// resolving relative hrefs against the page's own URL.
/// </summary>
/// <param name="page">Crawled page to parse; must not be null.</param>
/// <returns>
/// Lazily-evaluated absolute URIs; empty when the page has no content or no
/// anchors. Hrefs that cannot be parsed as URIs are skipped.
/// </returns>
/// <exception cref="ArgumentNullException">When <paramref name="page"/> is null.</exception>
public IEnumerable<Uri> GetLinks(CrawledPageModel page)
{
    if (page is null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    var links = page.Content?.DocumentNode.SelectNodes("//a[@href]");
    if (links is null)
    {
        yield break;
    }

    var baseUri = page.Url;
    foreach (var link in links)
    {
        var hrefValue = link.Attributes["href"].Value;
        // Fix: `new Uri(baseUri, hrefValue)` threw UriFormatException on any
        // malformed href, aborting the whole enumeration (and the caller's
        // ToArray()). Skip unparseable hrefs instead of failing the crawl.
        if (Uri.TryCreate(baseUri, hrefValue, out var absoluteUri))
        {
            yield return absoluteUri;
        }
    }
}