示例#1
0
 /// <summary>
 /// Fetches the content for the given URL by delegating to <c>webHelper.GetContent</c>.
 /// </summary>
 /// <param name="url">Address handed to the web helper unchanged.</param>
 /// <returns>Whatever <c>webHelper.GetContent</c> returns for <paramref name="url"/>.</returns>
 public String GetResults(String url) => webHelper.GetContent(url);
示例#2
0
        /// <summary>
        /// Downloads the page at <paramref name="domain"/>, parses it and collects page
        /// statistics (link counts, title/heading counts, social-network link presence, ...).
        /// When <c>_htmlDocumentParser.IsMainPage</c> reports that the page is not the main
        /// page, the site root is downloaded as well and the "without main page matches"
        /// counters are added to the result.
        /// </summary>
        /// <param name="domain">
        /// Page address. <c>http://</c> is prepended when it does not already start with
        /// <c>http://</c> or <c>https://</c> (compared case-insensitively).
        /// </param>
        /// <returns>
        /// Dictionary mapping each computed <see cref="ResultItem"/> to its value rendered as a string.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="domain"/> is <c>null</c>.</exception>
        public async Task<Dictionary<ResultItem, string>> PageParse(string domain)
        {
            if (domain == null)
            {
                throw new ArgumentNullException(nameof(domain));
            }

            // Check for a real scheme prefix: a bare "http" test also matches hosts such as
            // "httpbin.org", which would then reach `new Uri` without any scheme.
            bool hasHttpScheme = domain.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                                 || domain.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasHttpScheme)
            {
                domain = Uri.UriSchemeHttp + Uri.SchemeDelimiter + domain;
            }

            Uri    uri              = new Uri(domain);
            string uriHost          = uri.Host;
            string contentStartPage = await _restHelper.GetContent(domain);

            HtmlParser    startPageParser = new HtmlParser();
            IHtmlDocument parsedStartPage = startPageParser.ParseDocument(contentStartPage);

            // Links: all, internal (contain the host or are root-relative), unique internal.
            List<string> listLinks         = _htmlDocumentParser.GetListSelectorWithAttribute("a", "href", parsedStartPage);
            List<string> listInternalLinks = listLinks
                                             .Where(e => !string.IsNullOrWhiteSpace(e) && (e.Contains(uriHost) || e.StartsWith("/")))
                                             .ToList();
            HashSet<string> uniqueInternalLinks    = listInternalLinks.ToHashSet();
            int             nonUniqueInternalLinks = listInternalLinks.Count - uniqueInternalLinks.Count;

            // listInternalLinks is already free of null/blank entries, so only the "img" test is needed.
            List<string> listInternalLinksWithImgAnchor = listInternalLinks.Where(e => e.Contains("img")).ToList();

            string htmlLangValue = _htmlDocumentParser.GetListSelectorWithAttribute("html", "xml:lang", parsedStartPage).FirstOrDefault()
                                   ?? "does not exist";

            List<string> imgSrcTagList = _htmlDocumentParser.GetListSelectorWithAttribute("img", "src", parsedStartPage);
            List<string> listTitles    = _htmlDocumentParser.GetListSelectorWithAttribute("[title]", "title", parsedStartPage);

            // Enumerable.Average throws on an empty sequence; a page without titles averages to 0.
            int titleAverageLength = listTitles.Count == 0
                                     ? 0
                                     : (int)Math.Round(listTitles.Average(e => e.Length));

            int numberBackgroundImageWithUrl = _regexHelper.RegexList("\"background-image:(\\s*)url", contentStartPage).Count;

            List<string> listH2Header = _htmlDocumentParser.GetListSelectorWithAttribute("h2", "h2", parsedStartPage);
            List<string> listH3Header = _htmlDocumentParser.GetListSelectorWithAttribute("h3", "h3", parsedStartPage);
            List<string> listH4Header = _htmlDocumentParser.GetListSelectorWithAttribute("h4", "h4", parsedStartPage);
            List<string> listH5Header = _htmlDocumentParser.GetListSelectorWithAttribute("h5", "h5", parsedStartPage);
            List<string> listH6Header = _htmlDocumentParser.GetListSelectorWithAttribute("h6", "h6", parsedStartPage);

            bool instagramLink = _htmlDocumentParser.PresenceSocialNetworkLink(listLinks, "instagram");
            bool twitterLink   = _htmlDocumentParser.PresenceSocialNetworkLink(listLinks, "twitter");
            bool facebookLink  = _htmlDocumentParser.PresenceSocialNetworkLink(listLinks, "facebook");
            bool youtubeLink   = _htmlDocumentParser.PresenceSocialNetworkLink(listLinks, "youtube");
            bool vkLink        = _htmlDocumentParser.PresenceSocialNetworkLink(listLinks, "vk");
            bool googleLink    = _htmlDocumentParser.PresenceSocialNetworkLink(listLinks, "google");

            bool isMainPage = _htmlDocumentParser.IsMainPage(domain);

            var dictionary = new Dictionary<ResultItem, string>
            {
                { ResultItem.ThisPageIsMain, isMainPage.ToString() },
                { ResultItem.LinksCount, listLinks.Count.ToString() },
                { ResultItem.InternalLinksCount, listInternalLinks.Count.ToString() },
                { ResultItem.UniqueInternalLinksCount, uniqueInternalLinks.Count.ToString() },
                { ResultItem.NorUniqueInternalLinksCount, nonUniqueInternalLinks.ToString() },
                { ResultItem.InternalLinksHasImgAnchor, listInternalLinksWithImgAnchor.Count.ToString() },
                { ResultItem.HtmlLangValue, htmlLangValue },
                { ResultItem.ImgSrcTagCount, imgSrcTagList.Count.ToString() },
                { ResultItem.TitleTagCount, listTitles.Count.ToString() },
                { ResultItem.AverageTitleTag, titleAverageLength.ToString() },
                { ResultItem.StyleBackgroundImageInUrlCount, numberBackgroundImageWithUrl.ToString() },
                { ResultItem.InstagramLinkExist, instagramLink.ToString() },
                { ResultItem.TwitterLinkExist, twitterLink.ToString() },
                { ResultItem.FacebookLinkExist, facebookLink.ToString() },
                { ResultItem.YoutubeLinkExist, youtubeLink.ToString() },
                { ResultItem.VkLinkExist, vkLink.ToString() },
                { ResultItem.GoogleLinkExist, googleLink.ToString() },
                { ResultItem.H2HeaderCount, listH2Header.Count.ToString() },
                { ResultItem.H3HeaderCount, listH3Header.Count.ToString() },
                { ResultItem.H4HeaderCount, listH4Header.Count.ToString() },
                { ResultItem.H5HeaderCount, listH5Header.Count.ToString() },
                { ResultItem.H6HeaderCount, listH6Header.Count.ToString() },
            };

            if (!isMainPage)
            {
                // Items on this page minus the distinct items that also occur on the main page.
                // This is the formula the link counters have always used; the image, heading and
                // table counters now use it too instead of being compared against listLinks.
                int CountWithoutMatches(List<string> pageItems, List<string> mainPageItems)
                {
                    return pageItems.Count - pageItems.Intersect(mainPageItems).Count();
                }

                // The download must be awaited: calling ToString() on the pending task would
                // yield the task's type name rather than the page HTML.
                // GetLeftPart(Authority) keeps a non-default port, unlike scheme + host.
                string        mainPageUrl     = uri.GetLeftPart(UriPartial.Authority);
                string        contentMainPage = await _restHelper.GetContent(mainPageUrl);
                HtmlParser    mainPageParser  = new HtmlParser();
                IHtmlDocument parsedMainPage  = mainPageParser.ParseDocument(contentMainPage);

                List<string> mainPageLinks = _htmlDocumentParser.GetListSelectorWithAttribute("a", "href", parsedMainPage);

                // NOTE(review): in standard CSS "[table]" selects elements that carry a `table`
                // attribute, not <table> elements - confirm this is what the helper expects.
                List<string> startPageTables = _htmlDocumentParser.GetListSelectorWithAttribute("[table]", "table", parsedStartPage);
                List<string> mainPageTables  = _htmlDocumentParser.GetListSelectorWithAttribute("[table]", "table", parsedMainPage);

                int linksWithoutHomePage = CountWithoutMatches(listLinks, mainPageLinks);
                int internalLinksWithImgAnchorWithoutHomePage = CountWithoutMatches(listInternalLinksWithImgAnchor, mainPageLinks);
                int imgSrcTagWithoutHomePage = CountWithoutMatches(
                    imgSrcTagList,
                    _htmlDocumentParser.GetListSelectorWithAttribute("img", "src", parsedMainPage));
                int tableWithoutHomePage = CountWithoutMatches(startPageTables, mainPageTables);
                int h2HeaderWithoutHomePage = CountWithoutMatches(
                    listH2Header,
                    _htmlDocumentParser.GetListSelectorWithAttribute("h2", "h2", parsedMainPage));
                int h3HeaderWithoutHomePage = CountWithoutMatches(
                    listH3Header,
                    _htmlDocumentParser.GetListSelectorWithAttribute("h3", "h3", parsedMainPage));
                int h4HeaderWithoutHomePage = CountWithoutMatches(
                    listH4Header,
                    _htmlDocumentParser.GetListSelectorWithAttribute("h4", "h4", parsedMainPage));
                int h5HeaderWithoutHomePage = CountWithoutMatches(
                    listH5Header,
                    _htmlDocumentParser.GetListSelectorWithAttribute("h5", "h5", parsedMainPage));
                int h6HeaderWithoutHomePage = CountWithoutMatches(
                    listH6Header,
                    _htmlDocumentParser.GetListSelectorWithAttribute("h6", "h6", parsedMainPage));

                dictionary.Add(ResultItem.LinksCountWithoutMainPageMatches, linksWithoutHomePage.ToString());
                dictionary.Add(ResultItem.InternalLinksHasImgAnchorWithoutMainPageMatches, internalLinksWithImgAnchorWithoutHomePage.ToString());
                dictionary.Add(ResultItem.ImgSrcTagCountWithoutMainPageMatches, imgSrcTagWithoutHomePage.ToString());
                dictionary.Add(ResultItem.TableTagCountWithoutMainPageMatches, tableWithoutHomePage.ToString());
                dictionary.Add(ResultItem.H2HeaderCountWithoutMainPageMatches, h2HeaderWithoutHomePage.ToString());
                dictionary.Add(ResultItem.H3HeaderCountWithoutMainPageMatches, h3HeaderWithoutHomePage.ToString());
                dictionary.Add(ResultItem.H4HeaderCountWithoutMainPageMatches, h4HeaderWithoutHomePage.ToString());
                dictionary.Add(ResultItem.H5HeaderCountWithoutMainPageMatches, h5HeaderWithoutHomePage.ToString());
                dictionary.Add(ResultItem.H6HeaderCountWithoutMainPageMatches, h6HeaderWithoutHomePage.ToString());
            }

            return dictionary;
        }