/// <summary>
/// Stores a crawled page in the SQL repository together with its outgoing links and extracted content.
/// </summary>
/// <remarks>
/// Pages whose absolute URI does not start with the absolute URI of <c>_uriToCrawl</c> are ignored.
/// All repository work for a page is done while holding a lock on <c>_sqlRepository</c>.
/// </remarks>
/// <param name="crawledPage">
/// The page reported by the crawler. Its <c>HttpWebResponse</c> is null for pages that did not respond.
/// </param>
static void AddNewPage(CrawledPage crawledPage)
{
    var pageUrl = crawledPage.Uri.AbsoluteUri;

    // URLs are identifiers rather than linguistic text, so the prefix check must be ordinal;
    // the culture-sensitive default can treat distinct URLs as matching.
    if (!pageUrl.StartsWith(_uriToCrawl.AbsoluteUri, StringComparison.Ordinal))
    {
        return;
    }

    lock (_sqlRepository)
    {
        var response = crawledPage.HttpWebResponse;

        UrlDto urlDto;
        if (_sqlRepository.FindUrl(pageUrl, out urlDto))
        {
            // The response is null for pages that did not respond; keep the stored values in that case.
            if (response != null)
            {
                urlDto.Status = response.StatusCode;

                // The content type can be "text/html" or carry a suffix such as "text/html;charset=utf-8".
                var contentType = response.ContentType;
                if (contentType != null && contentType.Contains("text/html"))
                {
                    urlDto.IsWebPage = true;
                }
            }
        }
        else
        {
            // First time this URL is seen: it is recorded as a web page, as before.
            urlDto = new UrlDto
            {
                Url = pageUrl,
                IsWebPage = true
            };

            // Only copy the status when a response exists; previously a page that did not
            // respond caused a NullReferenceException here.
            if (response != null)
            {
                urlDto.Status = response.StatusCode;
            }
        }

        var webPageParsing = new WebPageParsing(crawledPage);
        var list = GetLinks(webPageParsing, urlDto);
        urlDto.Links = list.ToArray();
        urlDto.Contents = GetContent(webPageParsing);

        _sqlRepository.InsertOrUpdateUrl(urlDto);
    }
}
/// <summary>
/// Builds the list of link records for the links found on a parsed page.
/// </summary>
/// <remarks>
/// Each link URL is looked up in <c>_sqlRepository</c>. A URL that is not found is inserted
/// with only its <c>Url</c> set, and the DTO returned by the repository is used from then on.
/// </remarks>
/// <param name="webPageParsing">Parser that supplies the page's links.</param>
/// <param name="urlDto">The DTO of the page being processed; not used by this method.</param>
/// <returns>
/// One <c>LinkDto</c> per link, whose <c>TargetUrlId</c> is the target's <c>UrlId</c>, or 0 when it has no value.
/// </returns>
private static List<LinkDto> GetLinks(WebPageParsing webPageParsing, UrlDto urlDto)
{
    var result = new List<LinkDto>();

    foreach (Uri target in webPageParsing.GetLinks())
    {
        UrlDto targetDto;
        bool alreadyKnown = _sqlRepository.FindUrl(target.AbsoluteUri, out targetDto);

        if (!alreadyKnown)
        {
            // Register the target so the link can refer to it.
            var placeholder = new UrlDto { Url = target.AbsoluteUri };
            targetDto = _sqlRepository.InsertOrUpdateUrl(placeholder);
        }

        var linkDto = new LinkDto();
        linkDto.TargetUrlId = targetDto.UrlId.GetValueOrDefault();
        result.Add(linkDto);
    }

    return result;
}
/// <summary>
/// Extracts the content records for a parsed page; currently only the value returned by <c>GetTitle()</c>.
/// </summary>
/// <param name="webPageParsing">Parser that supplies the page's title.</param>
/// <returns>
/// A single-element array holding the title's <c>Element</c>, <c>Line</c> and <c>LinePosition</c>.
/// </returns>
private static ContentDto[] GetContent(WebPageParsing webPageParsing)
{
    var title = webPageParsing.GetTitle();

    return new[]
    {
        new ContentDto
        {
            Element = title.Element,
            Line = title.Line,
            LinePosition = title.LinePosition
        }
    };
}