Exemple #1
0
        /// <summary>
        /// Stores a crawled page in the SQL repository. Pages whose absolute URI does not
        /// start with the absolute URI of <c>_uriToCrawl</c> are ignored. For the others the
        /// matching <see cref="UrlDto"/> is looked up (or created), updated from the HTTP
        /// response when there is one, filled with the page's links and contents, and then
        /// inserted or updated in the repository.
        /// </summary>
        /// <param name="crawledPage">The page reported by the crawler.</param>
        static void AddNewPage(CrawledPage crawledPage)
        {
            // URLs are identifiers, not linguistic text, so compare them ordinally.
            if (!crawledPage.Uri.AbsoluteUri.StartsWith(_uriToCrawl.AbsoluteUri, StringComparison.Ordinal))
            {
                return;
            }

            // Null for pages that did not respond.
            var response = crawledPage.HttpWebResponse;

            lock (_sqlRepository)
            {
                UrlDto urlDto;

                if (_sqlRepository.FindUrl(crawledPage.Uri.AbsoluteUri, out urlDto))
                {
                    if (response != null)
                    {
                        urlDto.Status = response.StatusCode;

                        // This can be text/html or text/html;charset=utf-8
                        var contentType = response.ContentType;
                        if (contentType != null && contentType.Contains("text/html"))
                        {
                            urlDto.IsWebPage = true;
                        }
                    }
                }
                else
                {
                    urlDto = new UrlDto
                        {
                            Url = crawledPage.Uri.AbsoluteUri,
                            IsWebPage = true
                        };

                    // Only read the status code when the page actually responded;
                    // otherwise Status keeps its default value instead of throwing.
                    if (response != null)
                    {
                        urlDto.Status = response.StatusCode;
                    }
                }

                var webPageParsing = new WebPageParsing(crawledPage);
                var list = GetLinks(webPageParsing, urlDto);
                urlDto.Links = list.ToArray();
                urlDto.Contents = GetContent(webPageParsing);
                _sqlRepository.InsertOrUpdateUrl(urlDto);
            }
        }
Exemple #2
0
        /// <summary>
        /// Builds one <see cref="LinkDto"/> for every link returned by
        /// <paramref name="webPageParsing"/>. A link whose URL is not found in the
        /// repository is first inserted through <c>InsertOrUpdateUrl</c>, and the returned
        /// record supplies the target id.
        /// </summary>
        /// <param name="webPageParsing">Parser of the crawled page that supplies the links.</param>
        /// <param name="urlDto">Record of the page being processed; not used by this method.</param>
        /// <returns>The link records, in the order the parser returned the links.</returns>
        private static List<LinkDto> GetLinks(WebPageParsing webPageParsing, UrlDto urlDto)
        {
            var pageLinks = webPageParsing.GetLinks();
            var result = new List<LinkDto>();

            foreach (Uri target in pageLinks)
            {
                UrlDto targetDto;
                bool alreadyKnown = _sqlRepository.FindUrl(target.AbsoluteUri, out targetDto);

                if (!alreadyKnown)
                {
                    // Unknown URL: store it and use the record handed back by the repository.
                    var freshDto = new UrlDto {Url = target.AbsoluteUri};
                    targetDto = _sqlRepository.InsertOrUpdateUrl(freshDto);
                }

                // A record without an id yields 0 as target id.
                var linkDto = new LinkDto()
                    {
                        TargetUrlId = targetDto.UrlId.GetValueOrDefault()
                    };
                result.Add(linkDto);
            }

            return result;
        }
Exemple #3
0
        /// <summary>
        /// Returns the contents extracted from the parsed page. Only the value returned by
        /// <c>GetTitle</c> is extracted, so the array always holds exactly one entry.
        /// </summary>
        /// <param name="webPageParsing">Parser of the crawled page.</param>
        /// <returns>A one-element array holding the title's element, line and line position.</returns>
        private static ContentDto[] GetContent(WebPageParsing webPageParsing)
        {
            var title = webPageParsing.GetTitle();

            return new[]
                {
                    new ContentDto
                        {
                            Element = title.Element,
                            Line = title.Line,
                            LinePosition = title.LinePosition
                        }
                };
        }