/// <summary>
/// Scrapes a Stocklink.no article page: extracts the headline, description, body text,
/// author and published/updated timestamps from the parsed document and passes them
/// to <c>SetArticleData</c>.
/// </summary>
/// <param name="url">
/// Address of the article page; forwarded to the base constructor together with
/// <c>PageProcessingOptions.SimplifyPage</c>.
/// </param>
/// <exception cref="InvalidOperationException">
/// A required element is missing from the page, or more than one element matches a lookup.
/// </exception>
/// <exception cref="FormatException">A timestamp is present but cannot be parsed.</exception>
public WebpageScraper_Stocklink(string url)
    : base(url, PageProcessingOptions.SimplifyPage)
{
    // Headline: the <title> text without the site name.
    var titleNode = RequireNode(
        _headNode.Descendants().SingleOrDefault(n => n.Name == "title"),
        "<title> element");
    var title = StripSiteName(titleNode.InnerText);

    var descriptionNode = RequireNode(
        _headNode.Descendants().SingleOrDefault(n => n.Name == "meta"
            && n.Attributes.Contains("name")
            && n.Attributes["name"].Value.Split().Contains("description")),
        "<meta name=\"description\"> element");
    var description = descriptionNode.Attributes["content"].Value;

    // NOTE(review): an earlier revision also split <meta name="keywords"> into a list, but
    // the result was never passed on (the third SetArticleData argument was, and still is,
    // null). The dead lookup was removed because it crashed on pages without that tag.

    //<span id="ctl00_MainArea_m_ArticleContentLbl">
    var contentNode = RequireNode(
        _bodyNode.Descendants().SingleOrDefault(n => n.Name == "span"
            && n.Attributes.Contains("id")
            && n.Attributes["id"].Value.Split().Contains("ctl00_MainArea_m_ArticleContentLbl") // TODO: redefine this weak check
        ),
        "article content <span>");

    // Body text: the inner text of every <p> below the content node, joined without separator.
    var content = string.Concat(
        contentNode.Descendants()
            .Where(n => n.Name == "p")
            .Select(p => p.InnerText));

    //<td class="articleByline">
    //<div id="ctl00_MainArea_m_BylineDiv">
    //  Av:
    //  <a id="ctl00_MainArea_m_ArticleAuthorEmail" href="mailto:[email protected]">Asgeir Nilsen</a>
    //  - StockLink.no</div>
    //<span id="ctl00_MainArea_m_ArticlePublishedDateLbl">Publisert: 07.08.2015 09:55:46</span><span id="ctl00_MainArea_m_ArticleUpdatedDateLbl"> - Oppdatert: 07.08.2015 09:58:
    var authorLink = RequireNode(
        _bodyNode.Descendants().SingleOrDefault(n => n.Name == "a"
            && n.Attributes.Contains("id")
            && n.Attributes["id"].Value.Split().Contains("ctl00_MainArea_m_ArticleAuthorEmail")),
        "author <a> element");
    var author = new Author()
    {
        // The href is a mailto: link; store only the address part.
        Email = StripMailtoScheme(authorLink.Attributes["href"].Value),
        Name = authorLink.InnerText
    };

    var publishedNode = RequireNode(
        _bodyNode.Descendants().SingleOrDefault(n => n.Name == "span"
            && n.Attributes.Contains("id")
            && n.Attributes["id"].Value.Split().Contains("ctl00_MainArea_m_ArticlePublishedDateLbl")),
        "published-date <span>");
    var publishedDate = ParseTimestamp(
        publishedNode.InnerText.Replace("Publisert:", string.Empty));

    // The "updated" label is treated as optional: when the span is missing or empty,
    // null is passed instead of failing the whole scrape.
    var updatedNode = _bodyNode.Descendants().SingleOrDefault(n => n.Name == "span"
        && n.Attributes.Contains("id")
        && n.Attributes["id"].Value.Split().Contains("ctl00_MainArea_m_ArticleUpdatedDateLbl"));
    var updatedText = updatedNode == null
        ? string.Empty
        : updatedNode.InnerText.Replace("Oppdatert:", string.Empty).Trim().TrimStart('-').Trim();
    DateTime? updatedDate = updatedText.Length == 0
        ? (DateTime?)null
        : ParseTimestamp(updatedText);

    base.SetArticleData(title, description, null, content, new List<Author>() { author }, publishedDate, updatedDate);
}

/// <summary>Returns <paramref name="node"/>, or throws a descriptive exception when the lookup found nothing.</summary>
private static T RequireNode<T>(T node, string what) where T : class
{
    if (node == null)
    {
        throw new InvalidOperationException("Stocklink page is missing the expected " + what + ".");
    }

    return node;
}

/// <summary>
/// Removes the "Stocklink.no" site name and the hyphen separator next to it from a page title,
/// leaving hyphens inside the headline itself untouched.
/// </summary>
private static string StripSiteName(string rawTitle)
{
    const string siteName = "Stocklink.no";
    var text = rawTitle.Trim();

    if (text.EndsWith(siteName, StringComparison.Ordinal))
    {
        return text.Substring(0, text.Length - siteName.Length).TrimEnd().TrimEnd('-').TrimEnd();
    }

    if (text.StartsWith(siteName, StringComparison.Ordinal))
    {
        return text.Substring(siteName.Length).TrimStart().TrimStart('-').TrimStart();
    }

    return text.Replace(siteName, string.Empty).Trim();
}

/// <summary>Drops a leading "mailto:" scheme from a link target, if present.</summary>
private static string StripMailtoScheme(string href)
{
    const string scheme = "mailto:";
    var text = href.Trim();
    return text.StartsWith(scheme, StringComparison.OrdinalIgnoreCase)
        ? text.Substring(scheme.Length)
        : text;
}

/// <summary>
/// Parses a day-first timestamp such as "07.08.2015 09:55:46" independently of the current
/// thread culture. Text in any other shape falls back to <see cref="Convert.ToDateTime(string)"/>,
/// which throws <see cref="FormatException"/> when it cannot parse the value.
/// </summary>
private static DateTime ParseTimestamp(string text)
{
    string[] formats = { "d.M.yyyy H:mm:ss", "d.M.yyyy H:mm" };
    var trimmed = text.Trim();

    DateTime parsed;
    if (DateTime.TryParseExact(
            trimmed,
            formats,
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None,
            out parsed))
    {
        return parsed;
    }

    return Convert.ToDateTime(trimmed);
}
/// <summary>
/// Scrapes a Hegnar.no article page: extracts the headline and description from the
/// Open Graph meta tags, the body text from the content container, and the author name
/// and published timestamp from the byline, then passes them to <c>SetArticleData</c>.
/// </summary>
/// <param name="url">
/// Address of the article page; forwarded to the base constructor together with
/// <c>PageProcessingOptions.SimplifyPage</c>.
/// </param>
/// <exception cref="InvalidOperationException">
/// A required element is missing from the page, more than one element matches a lookup,
/// or the byline does not contain a parenthesised date.
/// </exception>
/// <exception cref="FormatException">The byline timestamp cannot be parsed.</exception>
public WebpageScraper_Hegnar(string url)
    : base(url, PageProcessingOptions.SimplifyPage)
{
    var titleNode = RequireNode(
        _headNode.Descendants().SingleOrDefault(n => n.Name == "meta"
            && n.Attributes.Contains("property")
            && n.Attributes["property"].Value.Split().Contains("og:title")),
        "<meta property=\"og:title\"> element");
    var title = titleNode.Attributes["content"].Value;

    var descriptionNode = RequireNode(
        _headNode.Descendants().SingleOrDefault(n => n.Name == "meta"
            && n.Attributes.Contains("property")
            && n.Attributes["property"].Value.Split().Contains("og:description")),
        "<meta property=\"og:description\"> element");
    var description = descriptionNode.Attributes["content"].Value;

    var contentNode = RequireNode(
        _bodyNode.Descendants().SingleOrDefault(n => n.Name == "div"
            && n.Attributes.Contains("class")
            && n.Attributes["class"].Value.Split().Contains("body") // TODO: redefine this weak check
        ),
        "<div class=\"body\"> element");

    // Body text: the inner text of every <p> below the content node, joined without separator.
    var content = string.Concat(
        contentNode.Descendants()
            .Where(n => n.Name == "p")
            .Select(p => p.InnerText));

    //<p class="byline">
    //Artikkel av:
    //Stein Ove Haugen
    //(Hegnar.no - 26.4.15 08:01)
    //</p>
    var bylineNode = RequireNode(
        _bodyNode.Descendants().SingleOrDefault(n => n.Name == "p"
            && n.Attributes.Contains("class")
            && n.Attributes["class"].Value.Split().Contains("byline")),
        "<p class=\"byline\"> element");

    var bylineText = bylineNode.InnerText
        .Replace("Artikkel av:", string.Empty)
        .Replace("Hegnar.no - ", string.Empty);

    // Everything before the first '(' is the author name, what follows is the timestamp.
    var bylineParts = bylineText.Split('(');
    if (bylineParts.Length < 2)
    {
        throw new InvalidOperationException("Hegnar byline does not contain a parenthesised date: '" + bylineText.Trim() + "'.");
    }

    var author = new Author()
    {
        Email = "NA",
        Name = RemoveLineBreaks(bylineParts[0]).Trim()
    };

    // Cut at the closing parenthesis so it does not reach the date parser.
    var publishedText = RemoveLineBreaks(bylineParts[1]);
    var closingParen = publishedText.IndexOf(')');
    if (closingParen >= 0)
    {
        publishedText = publishedText.Substring(0, closingParen);
    }

    var publishedDate = ParseTimestamp(publishedText);

    base.SetArticleData(title, description, null, content, new List<Author>() { author }, publishedDate, null);
}

/// <summary>Returns <paramref name="node"/>, or throws a descriptive exception when the lookup found nothing.</summary>
private static T RequireNode<T>(T node, string what) where T : class
{
    if (node == null)
    {
        throw new InvalidOperationException("Hegnar page is missing the expected " + what + ".");
    }

    return node;
}

/// <summary>Removes carriage returns and line feeds regardless of the platform's newline sequence.</summary>
private static string RemoveLineBreaks(string text)
{
    return text.Replace("\r", string.Empty).Replace("\n", string.Empty);
}

/// <summary>
/// Parses a day-first timestamp such as "26.4.15 08:01" independently of the current
/// thread culture. Text in any other shape falls back to <see cref="Convert.ToDateTime(string)"/>,
/// which throws <see cref="FormatException"/> when it cannot parse the value.
/// </summary>
private static DateTime ParseTimestamp(string text)
{
    string[] formats = { "d.M.yy H:mm", "d.M.yyyy H:mm", "d.M.yy H:mm:ss", "d.M.yyyy H:mm:ss" };
    var trimmed = text.Trim();

    DateTime parsed;
    if (DateTime.TryParseExact(
            trimmed,
            formats,
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None,
            out parsed))
    {
        return parsed;
    }

    return Convert.ToDateTime(trimmed);
}