Beispiel #1
0
        /// <summary>
        /// Downloads the page at <c>digest.DigestURL</c> and builds one <see cref="Link"/> per matching
        /// <c>mcnTextContent</c> table cell found under the <c>templateBody</c> element.
        /// </summary>
        /// <remarks>
        /// The URL of every link is synthesized as <c>DigestURL + "#section-" + index</c>; the title is taken from
        /// the <c>div</c> inside the table that directly precedes the cell's <c>mcnTextBlock</c> table, or is empty
        /// when no such node exists.
        /// </remarks>
        /// <param name="digest">Digest whose page is fetched and parsed; also assigned to every produced link.</param>
        /// <returns>The extracted links in document order; an empty list when no cell matches.</returns>
        public List <Link> GetDigestLinks(Digest digest)
        {
            List <Link> links         = new();
            var         digestContent = htlmContentGetter.GetContent(digest.DigestURL);
            var         linksHtml     = new HtmlDocument();

            linksHtml.LoadHtml(digestContent);
            var linksInDigest = linksHtml.DocumentNode.SelectNodes("//*[@id='templateBody']//table//table//table//td[contains(@class,'mcnTextContent')][not(div)]");

            // HtmlAgilityPack returns null (not an empty collection) when the XPath matches nothing.
            if (linksInDigest == null)
            {
                return(links);
            }

            for (int i = 0; i < linksInDigest.Count; i++)
            {
                HtmlNode link      = linksInDigest[i];
                var      titleNode = link.SelectSingleNode("./ancestor::table[contains(@class,'mcnTextBlock')]/preceding-sibling::table[1]//div");
                string   title     = titleNode != null ? titleNode.InnerText : "";
                var      href      = digest.DigestURL + "#section-" + i;

                // Wrap the cell in a fresh <div> so the normalizer receives a single root node.
                var descriptionNode = HtmlNode.CreateNode("<div></div>");
                descriptionNode.AppendChild(link);
                descriptionNode = contentNormalizer.NormalizeDom(descriptionNode);
                string descriptionText = textSanitizer.Sanitize(descriptionNode.InnerHtml.Trim());

                links.Add(new Link
                {
                    URL         = href,
                    Title       = title,
                    Description = descriptionText,
                    LinkOrder   = i,
                    Digest      = digest
                });
            }
            return(links);
        }
        public List <Link> GetDigestLinks(Digest digest)
        {
            List <Link> links         = new();
            var         digestContent = htmlContentGetter.GetContent(digest.DigestURL);
            var         linkMatches   = Regex.Matches(digestContent, @"([^\<\>\r\n\=]+?)\<br\>\r\n\<br\>\r\n(.+?)\<br\>\r\n\<br\>\r\n\<br\>", RegexOptions.Singleline);

            for (int i = 1; i < linkMatches.Count; i++)
            {
                string   title           = "";
                string   description     = linkMatches[i].Groups[1].Value;
                string   descriptionText = textSanitizer.Sanitize(description.Trim());
                string   hrefText        = linkMatches[i].Groups[2].Value;
                string[] hrefSplitArray  = hrefText.Split(new string[] { "<br>" }, StringSplitOptions.None);
                string   href;
                if (hrefSplitArray.Length == 1)
                {
                    href = Utils.UnshortenLink(hrefSplitArray[0].Trim());
                }
                else
                {
                    href = hrefSplitArray[^ 1];
Beispiel #3
0
        /// <summary>
        /// Downloads the page at <c>digest.DigestURL</c> and builds one <see cref="Link"/> for every
        /// <c>li</c> element located inside a <c>div</c> whose class contains <c>entry</c>.
        /// </summary>
        /// <remarks>
        /// List items without an <c>a</c> descendant are skipped. <c>LinkOrder</c> is the index of the list item
        /// in the matched collection, so it may contain gaps when items are skipped.
        /// </remarks>
        /// <param name="digest">Digest whose page is fetched and parsed; also assigned to every produced link.</param>
        /// <returns>The extracted links in document order; an empty list when no list item matches.</returns>
        public List <Link> GetDigestLinks(Digest digest)
        {
            List <Link> links         = new();
            var         digestContent = htmlContentGetter.GetContent(digest.DigestURL);

            var linksHtml = new HtmlDocument();

            linksHtml.LoadHtml(digestContent);
            var linksInDigest = linksHtml.DocumentNode.SelectNodes("//div[contains(@class,'entry')]//li");

            // HtmlAgilityPack returns null (not an empty collection) when the XPath matches nothing.
            if (linksInDigest == null)
            {
                return(links);
            }

            for (int i = 0; i < linksInDigest.Count; i++)
            {
                HtmlNode link     = linksInDigest[i];
                string   title    = ""; //no titles in this digest
                var      hrefNode = link.SelectSingleNode(".//a");
                if (hrefNode == null)
                {
                    // Nothing to link to in this list item.
                    continue;
                }
                var href = hrefNode.GetAttributeValue("href", "Not found");

                // Wrap a copy of the list item in a fresh <div> so the parsed document itself stays untouched.
                var descriptionNode = HtmlNode.CreateNode("<div></div>");
                descriptionNode.AppendChild(link.Clone());
                descriptionNode = contentNormalizer.NormalizeDom(descriptionNode);
                string descriptionText = textSanitizer.Sanitize(descriptionNode.InnerHtml.Trim());

                links.Add(new Link
                {
                    URL         = href,
                    Title       = title,
                    Description = descriptionText,
                    LinkOrder   = i,
                    Digest      = digest
                });
            }
            return(links);
        }
        /// <summary>
        /// Builds a new <see cref="Digest"/> carrying the description and date of the digest at
        /// <c>digest.DigestURL</c>.
        /// </summary>
        /// <remarks>
        /// The digest URL points to a wrapper page; the real content is loaded from the <c>src</c> of the
        /// <c>iframe</c> with id <c>iframe</c>. The description is assembled from the table rows located between
        /// the <c>INTRO Start</c> and <c>INTRO End</c> HTML comments. The date is taken from the
        /// <c>Last-Modified</c> header of <c>head.jpg</c>, resolved relative to the real content URL.
        /// </remarks>
        /// <param name="digest">Digest supplying the URL, name and provider that are copied to the result.</param>
        /// <returns>A new digest instance; the passed instance is not modified by this method.</returns>
        /// <exception cref="InvalidOperationException">
        /// The wrapper page has no matching iframe, or the image response has no <c>Last-Modified</c> header.
        /// </exception>
        public Digest GetDigestDetails(Digest digest)
        {
            //Initial link leading to a decorated page with iframe, let's get actual link
            string       stubContent  = htmlContentGetter.GetContent(digest.DigestURL);
            HtmlDocument stubDocument = new();

            stubDocument.LoadHtml(stubContent);
            HtmlNode iframeNode = stubDocument.DocumentNode.SelectSingleNode("//iframe[@id='iframe']");
            if (iframeNode == null)
            {
                throw new InvalidOperationException($"No iframe with id 'iframe' found on digest page '{digest.DigestURL}'.");
            }
            string realLink = iframeNode.GetAttributeValue("src", "not found");

            //getting real content
            string       digestContent  = htmlContentGetter.GetContent(realLink);
            HtmlDocument digestDocument = new();

            digestDocument.LoadHtml(digestContent);

            //getting description of the digest
            HtmlNodeCollection descriptionNodes = digestDocument.DocumentNode.SelectNodes("//tr[preceding-sibling::comment()[contains(.,' INTRO Start ')]][following-sibling::comment()[contains(.,' INTRO End ')]]");
            var descriptionNode = HtmlNode.CreateNode("<div></div>");

            // SelectNodes yields null when nothing matches; the description then stays empty.
            if (descriptionNodes != null)
            {
                descriptionNode.AppendChildren(descriptionNodes);
            }
            descriptionNode = contentNormalizer.NormalizeDom(descriptionNode);
            string descriptionText = textSanitizer.Sanitize(descriptionNode.InnerHtml.Trim());

            //very dirty hack to get date, may be broken any time, no way to get something more suitable so far
            // Both the client and the response hold network resources, so dispose them when leaving the method.
            using var imgClient = new HttpClient();

            imgClient.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75");
            var baseUri       = new Uri(realLink);
            var headImageLink = new Uri(baseUri, "head.jpg").AbsoluteUri;

            // The method is synchronous, so the request is awaited by blocking on the task.
            using var imgContent = imgClient.GetAsync(headImageLink).Result;
            DateTimeOffset? lastModified = imgContent.Content.Headers.LastModified;
            if (lastModified == null)
            {
                throw new InvalidOperationException($"Response for '{headImageLink}' has no Last-Modified header; digest date cannot be determined.");
            }
            var fileModifiedDate = lastModified.Value.DateTime;

            var currentDigest = new Digest
            {
                DigestDay         = fileModifiedDate,
                DigestName        = digest.DigestName,
                DigestDescription = descriptionText,
                DigestURL         = digest.DigestURL,
                Provider          = digest.Provider
            };

            return(currentDigest);
        }
        /// <summary>
        /// Downloads the page at <c>digest.DigestURL</c> and builds one <see cref="Link"/> for every
        /// <c>div</c> with class <c>news_item</c>.
        /// </summary>
        /// <remarks>
        /// Title and URL come from the item's <c>h2.news_item-title</c> element and its first <c>a</c> descendant;
        /// items lacking either are skipped. An href without a scheme but containing a slash is resolved against
        /// the scheme and authority of the digest URL. Every href is then passed through
        /// <c>Utils.UnshortenLink</c>. The description is taken from the item's <c>news_item-content</c> div and is
        /// empty when that div is missing.
        /// </remarks>
        /// <param name="digest">Digest whose page is fetched and parsed; also assigned to every produced link.</param>
        /// <returns>The extracted links in document order; an empty list when no item matches.</returns>
        public List <Link> GetDigestLinks(Digest digest)
        {
            List <Link> links         = new();
            var         digestContent = htmlContentGetter.GetContent(digest.DigestURL);
            var         linksHtml     = new HtmlDocument();

            linksHtml.LoadHtml(digestContent);
            var linksInDigest = linksHtml.DocumentNode.SelectNodes("//div[@class='news_item']");

            // HtmlAgilityPack returns null (not an empty collection) when the XPath matches nothing.
            if (linksInDigest == null)
            {
                return(links);
            }

            for (int i = 0; i < linksInDigest.Count; i++)
            {
                HtmlNode link       = linksInDigest[i];
                var      titleNode  = link.SelectSingleNode(".//h2[@class='news_item-title']");
                var      anchorNode = titleNode?.Descendants("a").FirstOrDefault();
                if (anchorNode == null)
                {
                    // No title or no anchor inside it: there is nothing to link to, skip the item.
                    continue;
                }
                var title = titleNode.InnerText;
                var href  = anchorNode.GetAttributeValue("href", "Not found");
                if (!href.Contains("://") && href.Contains('/'))
                {
                    // Site-relative link: resolve it against the scheme and host of the digest page.
                    var digestUrl  = new Uri(digest.DigestURL);
                    var digestBase = new Uri(digestUrl.Scheme + "://" + digestUrl.Authority);
                    href = (new Uri(digestBase, href)).AbsoluteUri;
                }
                href = Utils.UnshortenLink(href);

                var originalDescriptionNode = link.SelectSingleNode(".//div[@class='news_item-content']");
                var description             = "";
                if (originalDescriptionNode != null)
                {
                    var descriptionNode = contentNormalizer.NormalizeDom(originalDescriptionNode);
                    description = textSanitizer.Sanitize(descriptionNode.InnerHtml.Trim());
                }

                links.Add(new Link
                {
                    URL         = href,
                    Title       = title,
                    Description = description,
                    LinkOrder   = i,
                    Digest      = digest
                });
            }
            return(links);
        }