コード例 #1
0
        /// <summary>
        /// Scrapes a Stocklink.no article page: reads the title and description from the head,
        /// the article text, author and timestamps from the body, and passes them to
        /// <c>SetArticleData</c>.
        /// </summary>
        /// <param name="url">Address of the article page; forwarded unchanged to the base constructor.</param>
        /// <exception cref="InvalidOperationException">
        /// A required element is missing from the page, or matches more than once.
        /// </exception>
        /// <exception cref="FormatException">A timestamp is not in the expected day.month.year format.</exception>
        public WebpageScraper_Stocklink(string url)
            : base(url, PageProcessingOptions.SimplifyPage)
        {
            var titleNode = RequireNode(
                base._headNode.Descendants().SingleOrDefault(n => n.Name == "title"),
                "<title> element");

            // Drop the site name, then strip separators at the edges only, so that
            // hyphens inside the headline itself are preserved.
            var title = titleNode.InnerText.Replace("Stocklink.no", string.Empty).Trim().Trim('-').Trim();

            var descriptionNode = RequireNode(
                base._headNode.Descendants().SingleOrDefault(
                    n => n.Name == "meta" && n.Attributes.Contains("name") && n.Attributes["name"].Value.Split().Contains("description")),
                "description meta tag");

            var description = descriptionNode.Attributes["content"].Value;

            // NOTE(review): the keywords meta tag used to be read into a local that was never
            // passed on; that lookup was removed because it could only fail. Wire it into
            // SetArticleData if one of its parameters is meant to carry keywords.

            //<span id="ctl00_MainArea_m_ArticleContentLbl">
            var contentNode = RequireNode(
                base._bodyNode.Descendants().SingleOrDefault(
                    x => x.Name == "span" && x.Attributes.Contains("id")
                    && x.Attributes["id"].Value.Split().Contains("ctl00_MainArea_m_ArticleContentLbl")), // TODO: redefine this weak check
                "article content span");

            // The article text is the text of every paragraph inside the content node, back to back.
            var content = string.Join(
                string.Empty,
                contentNode.Descendants().Where(n => n.Name == "p").Select(n => n.InnerText).ToArray());

            //<td class="articleByline">
            //<div id="ctl00_MainArea_m_BylineDiv">
            //    Av:
            //    <a id="ctl00_MainArea_m_ArticleAuthorEmail" href="mailto:[email protected]">Asgeir Nilsen</a>
            //    - StockLink.no</div>
            //<span id="ctl00_MainArea_m_ArticlePublishedDateLbl">Publisert:&nbsp;07.08.2015 09:55:46</span><span id="ctl00_MainArea_m_ArticleUpdatedDateLbl">&nbsp;-&nbsp;Oppdatert:&nbsp;07.08.2015 09:58:
            var bylineNode = RequireNode(
                _bodyNode.Descendants().SingleOrDefault(
                    n => n.Name == "a" && n.Attributes.Contains("id") && n.Attributes["id"].Value.Split().Contains("ctl00_MainArea_m_ArticleAuthorEmail")),
                "author link");

            // The link target is a mailto: URI; keep only the address part.
            const string mailtoScheme = "mailto:";
            var email = bylineNode.Attributes["href"].Value.Trim();
            if (email.StartsWith(mailtoScheme, StringComparison.OrdinalIgnoreCase))
            {
                email = email.Substring(mailtoScheme.Length);
            }

            var author = new Author()
            {
                Email = email,
                Name = bylineNode.InnerText.Trim()
            };

            var publishedNode = RequireNode(
                _bodyNode.Descendants().SingleOrDefault(
                    n => n.Name == "span" && n.Attributes.Contains("id") && n.Attributes["id"].Value.Split().Contains("ctl00_MainArea_m_ArticlePublishedDateLbl")),
                "published-date span");
            var publishedDate = ParseTimestamp(TextAfterLabel(publishedNode.InnerText, "Publisert:"));

            // The updated label is treated as optional: when it is absent or empty the
            // updated date is passed as null.
            // NOTE(review): assumes SetArticleData accepts a null updated date — confirm against its signature.
            var updatedNode = _bodyNode.Descendants().SingleOrDefault(
                n => n.Name == "span" && n.Attributes.Contains("id") && n.Attributes["id"].Value.Split().Contains("ctl00_MainArea_m_ArticleUpdatedDateLbl"));
            DateTime? updatedDate = null;
            if (updatedNode != null)
            {
                var updatedText = TextAfterLabel(updatedNode.InnerText, "Oppdatert:");
                if (updatedText.Length > 0)
                {
                    updatedDate = ParseTimestamp(updatedText);
                }
            }

            base.SetArticleData(title, description, null, content, new List<Author>() { author }, publishedDate, updatedDate);
        }

        /// <summary>Returns <paramref name="node"/>, or throws a descriptive exception when the lookup found nothing.</summary>
        private static T RequireNode<T>(T node, string description) where T : class
        {
            if (node == null)
            {
                throw new InvalidOperationException("Stocklink page is missing the expected " + description + ".");
            }

            return node;
        }

        /// <summary>
        /// Returns the trimmed text that follows <paramref name="label"/>, treating both the
        /// <c>&amp;nbsp;</c> entity and the non-breaking space character as ordinary spaces.
        /// When the label is not present the whole (trimmed) text is returned.
        /// </summary>
        private static string TextAfterLabel(string text, string label)
        {
            var normalized = text.Replace("&nbsp;", " ").Replace('\u00A0', ' ');
            var labelStart = normalized.IndexOf(label, StringComparison.Ordinal);
            var value = labelStart < 0 ? normalized : normalized.Substring(labelStart + label.Length);
            return value.Trim();
        }

        /// <summary>
        /// Parses a day.month.year timestamp such as <c>07.08.2015 09:55:46</c> independently
        /// of the culture of the current thread.
        /// </summary>
        private static DateTime ParseTimestamp(string text)
        {
            return DateTime.ParseExact(
                text,
                new[] { "d.M.yyyy H:mm:ss", "d.M.yyyy H:mm" },
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.AllowWhiteSpaces);
        }
コード例 #2
0
        /// <summary>
        /// Scrapes a Hegnar.no article page: reads the Open Graph title and description from
        /// the head, the article text and byline from the body, and passes them to
        /// <c>SetArticleData</c>.
        /// </summary>
        /// <param name="url">Address of the article page; forwarded unchanged to the base constructor.</param>
        /// <exception cref="InvalidOperationException">
        /// A required element is missing from the page, or matches more than once.
        /// </exception>
        /// <exception cref="FormatException">
        /// The byline has no parenthesised date, or the date is not in the expected day.month.year format.
        /// </exception>
        public WebpageScraper_Hegnar(string url)
            : base(url, PageProcessingOptions.SimplifyPage)
        {
            var titleNode = RequireNode(
                base._headNode.Descendants().SingleOrDefault(
                    n => n.Name == "meta" && n.Attributes.Contains("property") && n.Attributes["property"].Value.Split().Contains("og:title")),
                "og:title meta tag");

            var title = titleNode.Attributes["content"].Value;

            var descriptionNode = RequireNode(
                base._headNode.Descendants().SingleOrDefault(
                    n => n.Name == "meta" && n.Attributes.Contains("property") && n.Attributes["property"].Value.Split().Contains("og:description")),
                "og:description meta tag");

            var description = descriptionNode.Attributes["content"].Value;

            var contentNode = RequireNode(
                base._bodyNode.Descendants().SingleOrDefault(
                    x => x.Name == "div" && x.Attributes.Contains("class")
                    && x.Attributes["class"].Value.Split().Contains("body")), // TODO: redefine this weak check
                "article body div");

            // The article text is the text of every paragraph inside the content node, back to back.
            var content = string.Join(
                string.Empty,
                contentNode.Descendants().Where(n => n.Name == "p").Select(n => n.InnerText).ToArray());

            //<p class="byline">
            //Artikkel av:
            //Stein Ove Haugen
            //(Hegnar.no - 26.4.15 08:01)
            //</p>
            var bylineNode = RequireNode(
                _bodyNode.Descendants().SingleOrDefault(
                    n => n.Name == "p" && n.Attributes.Contains("class") && n.Attributes["class"].Value.Split().Contains("byline")),
                "byline paragraph");

            var refined = bylineNode.InnerText.Replace("Artikkel av:", string.Empty).Replace("Hegnar.no - ", string.Empty);

            // What precedes the opening parenthesis is the author, what follows is the date.
            var openParen = refined.IndexOf('(');
            if (openParen < 0)
            {
                throw new FormatException("Hegnar byline does not contain a parenthesised publication date.");
            }

            // Trim() removes any kind of line break and padding, not just Environment.NewLine.
            var authorString = refined.Substring(0, openParen).Trim();

            var author = new Author() { Email = "NA", Name = authorString };

            // Strip the closing parenthesis that remains after cutting at the opening one.
            var publishedString = refined.Substring(openParen + 1).Trim().TrimEnd(')').Trim();
            var publishedDate = ParseBylineTimestamp(publishedString);

            base.SetArticleData(title, description, null, content, new List<Author>() { author }, publishedDate, null);
        }

        /// <summary>Returns <paramref name="node"/>, or throws a descriptive exception when the lookup found nothing.</summary>
        private static T RequireNode<T>(T node, string description) where T : class
        {
            if (node == null)
            {
                throw new InvalidOperationException("Hegnar page is missing the expected " + description + ".");
            }

            return node;
        }

        /// <summary>
        /// Parses a day.month.year byline timestamp such as <c>26.4.15 08:01</c> independently
        /// of the culture of the current thread. Two- and four-digit years are accepted,
        /// with or without seconds.
        /// </summary>
        private static DateTime ParseBylineTimestamp(string text)
        {
            return DateTime.ParseExact(
                text,
                new[] { "d.M.yy H:mm", "d.M.yyyy H:mm", "d.M.yy H:mm:ss", "d.M.yyyy H:mm:ss" },
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.AllowWhiteSpaces);
        }