/// <summary>
/// Downloads the digest page and converts each text block in its template body into a <see cref="Link"/>.
/// </summary>
/// <remarks>
/// An entry is every <c>td</c> with class <c>mcnTextContent</c> that has no direct <c>div</c> child,
/// located under the element with id <c>templateBody</c>. The URL of each link is the digest URL
/// followed by a <c>#section-N</c> fragment, where N is the zero-based position of the entry.
/// </remarks>
/// <param name="digest">Digest whose <c>DigestURL</c> is fetched and parsed.</param>
/// <returns>The extracted links in document order; an empty list when no entry matches.</returns>
public List<Link> GetDigestLinks(Digest digest)
{
    List<Link> links = new();
    var digestContent = htlmContentGetter.GetContent(digest.DigestURL);
    var linksHtml = new HtmlDocument();
    linksHtml.LoadHtml(digestContent);

    var linksInDigest = linksHtml.DocumentNode.SelectNodes("//*[@id='templateBody']//table//table//table//td[contains(@class,'mcnTextContent')][not(div)]");
    if (linksInDigest == null)
    {
        // SelectNodes yields null (not an empty collection) when nothing matches.
        return links;
    }

    for (int i = 0; i < linksInDigest.Count; i++)
    {
        HtmlNode link = linksInDigest[i];

        // The title, when present, sits in the table immediately preceding the entry's text block.
        var titleNode = link.SelectSingleNode("./ancestor::table[contains(@class,'mcnTextBlock')]/preceding-sibling::table[1]//div");
        string title = titleNode != null ? titleNode.InnerText : "";

        var href = digest.DigestURL + "#section-" + i;

        // Wrap a copy of the entry in a fresh <div> so the parsed document is left untouched.
        var descriptionNode = HtmlNode.CreateNode("<div></div>");
        descriptionNode.AppendChild(link.Clone());
        descriptionNode = contentNormalizer.NormalizeDom(descriptionNode);
        string descriptionText = textSanitizer.Sanitize(descriptionNode.InnerHtml.Trim());

        links.Add(new Link
        {
            URL = href,
            Title = title,
            Description = descriptionText,
            LinkOrder = i,
            Digest = digest
        });
    }

    return links;
}
public List <Link> GetDigestLinks(Digest digest) { List <Link> links = new(); var digestContent = htmlContentGetter.GetContent(digest.DigestURL); var linkMatches = Regex.Matches(digestContent, @"([^\<\>\r\n\=]+?)\<br\>\r\n\<br\>\r\n(.+?)\<br\>\r\n\<br\>\r\n\<br\>", RegexOptions.Singleline); for (int i = 1; i < linkMatches.Count; i++) { string title = ""; string description = linkMatches[i].Groups[1].Value; string descriptionText = textSanitizer.Sanitize(description.Trim()); string hrefText = linkMatches[i].Groups[2].Value; string[] hrefSplitArray = hrefText.Split(new string[] { "<br>" }, StringSplitOptions.None); string href; if (hrefSplitArray.Length == 1) { href = Utils.UnshortenLink(hrefSplitArray[0].Trim()); } else { href = hrefSplitArray[^ 1];
/// <summary>
/// Downloads the digest page and converts each list item inside an <c>entry</c> block into a <see cref="Link"/>.
/// </summary>
/// <remarks>
/// List items without an anchor are skipped. <c>LinkOrder</c> is the position of the list item on the
/// page, so skipped items leave gaps in the numbering.
/// </remarks>
/// <param name="digest">Digest whose <c>DigestURL</c> is fetched and parsed.</param>
/// <returns>The extracted links in document order; an empty list when no list item matches.</returns>
public List<Link> GetDigestLinks(Digest digest)
{
    List<Link> links = new();
    var digestContent = htmlContentGetter.GetContent(digest.DigestURL);
    var linksHtml = new HtmlDocument();
    linksHtml.LoadHtml(digestContent);

    var linksInDigest = linksHtml.DocumentNode.SelectNodes("//div[contains(@class,'entry')]//li");
    if (linksInDigest == null)
    {
        // SelectNodes yields null (not an empty collection) when nothing matches.
        return links;
    }

    for (int i = 0; i < linksInDigest.Count; i++)
    {
        HtmlNode link = linksInDigest[i];

        // No titles in this digest.
        string title = "";

        var hrefNode = link.SelectSingleNode(".//a");
        if (hrefNode == null)
        {
            continue;
        }

        var href = hrefNode.GetAttributeValue("href", "Not found");

        // The whole list item (copied, so the source document stays intact) becomes the description.
        var descriptionNode = HtmlNode.CreateNode("<div></div>");
        descriptionNode.AppendChild(link.Clone());
        descriptionNode = contentNormalizer.NormalizeDom(descriptionNode);
        string descriptionText = textSanitizer.Sanitize(descriptionNode.InnerHtml.Trim());

        links.Add(new Link
        {
            URL = href,
            Title = title,
            Description = descriptionText,
            LinkOrder = i,
            Digest = digest
        });
    }

    return links;
}
/// <summary>
/// Builds a new <see cref="Digest"/> with the description and date read from the digest's real page.
/// </summary>
/// <remarks>
/// The digest URL points at a wrapper page; the actual content is the <c>src</c> of the iframe with
/// id <c>iframe</c>. The description is taken from the table rows located between the
/// <c>INTRO Start</c> and <c>INTRO End</c> HTML comments. The date is the <c>Last-Modified</c> header
/// of <c>head.jpg</c> resolved relative to the real page.
/// </remarks>
/// <param name="digest">Digest supplying <c>DigestURL</c>, <c>DigestName</c> and <c>Provider</c>.</param>
/// <returns>A new digest carrying the extracted description and date; the input is not modified.</returns>
/// <exception cref="InvalidOperationException">
/// The wrapper page has no iframe, or the image response has no <c>Last-Modified</c> header.
/// </exception>
public Digest GetDigestDetails(Digest digest)
{
    // The initial link leads to a decorated page with an iframe; resolve the actual link first.
    string stubContent = htmlContentGetter.GetContent(digest.DigestURL);
    HtmlDocument stubDocument = new();
    stubDocument.LoadHtml(stubContent);
    HtmlNode iframeNode = stubDocument.DocumentNode.SelectSingleNode("//iframe[@id='iframe']");
    if (iframeNode == null)
    {
        throw new InvalidOperationException($"No iframe with id 'iframe' was found at '{digest.DigestURL}'.");
    }

    string realLink = iframeNode.GetAttributeValue("src", "not found");

    // Fetch the real content.
    string digestContent = htmlContentGetter.GetContent(realLink);
    HtmlDocument digestDocument = new();
    digestDocument.LoadHtml(digestContent);

    // Collect the rows that make up the digest description.
    HtmlNodeCollection descriptionNodes = digestDocument.DocumentNode.SelectNodes("//tr[preceding-sibling::comment()[contains(.,' INTRO Start ')]][following-sibling::comment()[contains(.,' INTRO End ')]]");
    var descriptionNode = HtmlNode.CreateNode("<div></div>");
    if (descriptionNodes != null)
    {
        // SelectNodes yields null when there is no intro section; the description then stays empty.
        descriptionNode.AppendChildren(descriptionNodes);
    }

    descriptionNode = contentNormalizer.NormalizeDom(descriptionNode);
    string descriptionText = textSanitizer.Sanitize(descriptionNode.InnerHtml.Trim());

    // Fragile hack: nothing more suitable is available for the date so far, so the modification
    // time of the header image is used instead. May break at any time.
    using HttpClient imgClient = new();
    imgClient.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75");
    var baseUri = new Uri(realLink);
    var headImageLink = new Uri(baseUri, "head.jpg").AbsoluteUri;

    // Only the headers are needed, so do not wait for the image body to download.
    using var imgResponse = imgClient.GetAsync(headImageLink, HttpCompletionOption.ResponseHeadersRead).Result;
    var lastModified = imgResponse.Content.Headers.LastModified;
    if (lastModified == null)
    {
        throw new InvalidOperationException($"The response for '{headImageLink}' carries no Last-Modified header.");
    }

    var fileModifiedDate = lastModified.Value.DateTime;

    var currentDigest = new Digest
    {
        DigestDay = fileModifiedDate,
        DigestName = digest.DigestName,
        DigestDescription = descriptionText,
        DigestURL = digest.DigestURL,
        Provider = digest.Provider
    };
    return currentDigest;
}
/// <summary>
/// Downloads the digest page and converts each <c>news_item</c> block into a <see cref="Link"/>.
/// </summary>
/// <remarks>
/// The title and URL come from the item's <c>h2.news_item-title</c> and the first anchor inside it.
/// Hrefs that contain a slash but no scheme are resolved against the scheme and authority of the
/// digest URL, and every URL is passed through <c>Utils.UnshortenLink</c>. Items without a title
/// anchor are skipped; items without a <c>news_item-content</c> block get an empty description.
/// <c>LinkOrder</c> is the position of the item on the page, so skipped items leave gaps.
/// </remarks>
/// <param name="digest">Digest whose <c>DigestURL</c> is fetched and parsed.</param>
/// <returns>The extracted links in document order; an empty list when no item matches.</returns>
public List<Link> GetDigestLinks(Digest digest)
{
    List<Link> links = new();
    var digestContent = htmlContentGetter.GetContent(digest.DigestURL);
    var linksHtml = new HtmlDocument();
    linksHtml.LoadHtml(digestContent);

    var linksInDigest = linksHtml.DocumentNode.SelectNodes("//div[@class='news_item']");
    if (linksInDigest == null)
    {
        // SelectNodes yields null (not an empty collection) when nothing matches.
        return links;
    }

    for (int i = 0; i < linksInDigest.Count; i++)
    {
        HtmlNode link = linksInDigest[i];

        var titleNode = link.SelectSingleNode(".//h2[@class='news_item-title']");
        var anchorNode = titleNode?.Descendants("a").FirstOrDefault();
        if (anchorNode == null)
        {
            // Without a title anchor there is nothing to link to.
            continue;
        }

        var title = titleNode.InnerText;
        var href = anchorNode.GetAttributeValue("href", "Not found");

        if (!href.Contains("://") && href.Contains('/'))
        {
            // Site-relative link: resolve it against the root of the digest's host.
            var digestUrl = new Uri(digest.DigestURL);
            var digestBase = new Uri(digestUrl.Scheme + "://" + digestUrl.Authority);
            href = new Uri(digestBase, href).AbsoluteUri;
        }

        href = Utils.UnshortenLink(href);

        var description = "";
        var originalDescriptionNode = link.SelectSingleNode(".//div[@class='news_item-content']");
        if (originalDescriptionNode != null)
        {
            var descriptionNode = contentNormalizer.NormalizeDom(originalDescriptionNode);
            description = textSanitizer.Sanitize(descriptionNode.InnerHtml.Trim());
        }

        links.Add(new Link
        {
            URL = href,
            Title = title,
            Description = description,
            LinkOrder = i,
            Digest = digest
        });
    }

    return links;
}