/// <summary>
        /// Loads the page at <c>_pageURL</c>, walks every <c>tr</c> below the first
        /// <c>tbody</c> and reports one <c>ScrapedArticleArgs</c> per row through
        /// <c>_engine.OnNewsArticleParsedDelegate</c>.
        /// </summary>
        /// <remarks>
        /// Any exception raised while loading or parsing is written to the debug output and
        /// ends the run; rows that were already reported are not withdrawn.
        /// </remarks>
        public void Work()
        {
            try
            {
                var document  = _web.Load(_pageURL);
                var tableBody = document.DocumentNode.SelectSingleNode("//tbody");
                var tableRows = tableBody.SelectNodes(".//tr").ToList();

                foreach (var row in tableRows)
                {
                    // Fixed values for every article produced by this scraper.
                    var article = new ScrapedArticleArgs();
                    article.Publisher     = EnumPublisher.ESMA;
                    article.Body          = "n/a";
                    article.DownloadFiles = "n/a";

                    // The Index_* members select the cell that carries each field.
                    var cells = row.SelectNodes(".//td");

                    article.PublishDate      = cells[Index_Date].InnerText;
                    // No separate "modified" value is read, so the publish date is reused.
                    article.LatestUpdateDate = article.PublishDate;
                    article.Summary          = $"Article Reference: {cells[Index_Summary].InnerText}";

                    // The title cell wraps its text in a link; the same anchor supplies header and URL.
                    var titleLink = cells[Index_Title].SelectSingleNode(".//a");
                    article.Header     = titleLink.InnerText;
                    article.ArticleUrl = titleLink.Attributes["href"].Value;

                    article.ArticleFormat = "pdf";
                    article.ArticleTags   = cells[Index_Tag].InnerText;
                    article.ArticleType   = cells[Index_Type].InnerText;

                    _engine.OnNewsArticleParsedDelegate(article);
                }
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex);
            }
        }
        /// <summary>
        /// Scrapes the result list (<c>ol.search-list</c>) of the page at <c>_pageURL</c> and
        /// produces one <c>ScrapedArticleArgs</c> per <c>li</c> entry.
        /// </summary>
        /// <remarks>
        /// Entries whose header ends in a word containing "[" are reported directly through
        /// <c>_engine.OnNewsArticleParsedDelegate</c>, with that word as the article format.
        /// All other entries are marked "Web Page" and queued as an <c>FCAScrapeArticle</c>.
        /// Entries without a link are skipped. Any exception is written to the debug output
        /// and ends the run.
        /// </remarks>
        public void Work()
        {
            try
            {
                //Go through each news summary item and process
                var summaryListNode  = _web.Load(_pageURL).DocumentNode.SelectSingleNode("//ol[@class='search-list']");
                // SelectNodes yields null (not an empty collection) when nothing matches.
                var summaryItemNodes = summaryListNode?.SelectNodes(".//li");
                if (summaryItemNodes == null)
                {
                    Debug.WriteLine("FCA scrape: no search-list items found at " + _pageURL);
                    return;
                }

                foreach (var summaryItemNode in summaryItemNodes)
                {
                    //Header and URL both come from the entry's first link
                    var linkNode = summaryItemNode.SelectSingleNode(".//a");
                    if (linkNode == null)
                    {
                        // An entry without a link has no header or URL; skip it instead of aborting the page.
                        continue;
                    }

                    ScrapedArticleArgs parsedArticle = new ScrapedArticleArgs()
                    {
                        Publisher = EnumPublisher.FCA, Body = "n/a", DownloadFiles = "n/a"
                    };

                    parsedArticle.Header      = StripLayoutWhitespace(linkNode.InnerText);
                    parsedArticle.ArticleTags = StripLayoutWhitespace(summaryItemNode.SelectSingleNode(".//span[@class='meta-item type']")?.InnerText);
                    parsedArticle.PublishDate = TextAfterLabel(summaryItemNode.SelectSingleNode(".//span[@class='meta-item published-date']")?.InnerText);
                    // Fall back to the publish date when the entry shows no modified date.
                    parsedArticle.LatestUpdateDate = TextAfterLabel(summaryItemNode.SelectSingleNode(".//span[@class='meta-item modified-date']")?.InnerText)
                                                     ?? parsedArticle.PublishDate;
                    parsedArticle.Summary     = StripLayoutWhitespace(summaryItemNode.SelectSingleNode(".//div[@class='search-item__body']")?.InnerText);
                    parsedArticle.ArticleUrl  = linkNode.Attributes["href"].Value;

                    // The last word of the header doubles as a format marker when it contains "[".
                    string[] headerWords = parsedArticle.Header.Split(' ');
                    string lastWord      = headerWords[headerWords.Length - 1];

                    if (lastWord.Contains("["))
                    {
                        parsedArticle.ArticleFormat = lastWord;
                        _engine.OnNewsArticleParsedDelegate(parsedArticle);
                    }
                    else
                    {
                        parsedArticle.ArticleFormat = "Web Page";
                        _engine.AddToQueue(new FCAScrapeArticle(parsedArticle, _engine));
                    }
                }
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex);
            }
        }

        /// <summary>
        /// Removes tab and newline characters from <paramref name="text"/> and trims the result.
        /// Returns <c>null</c> when <paramref name="text"/> is <c>null</c>.
        /// </summary>
        private static string StripLayoutWhitespace(string text)
        {
            return text?.Replace("\t", String.Empty).Replace("\n", String.Empty).Trim();
        }

        /// <summary>
        /// Returns the cleaned text that follows the first ':' in <paramref name="text"/>.
        /// Later colons are kept, and text without any colon is returned whole.
        /// Returns <c>null</c> when <paramref name="text"/> is <c>null</c>.
        /// </summary>
        private static string TextAfterLabel(string text)
        {
            if (text == null)
            {
                return null;
            }

            int separatorIndex = text.IndexOf(':');
            string value = separatorIndex >= 0 ? text.Substring(separatorIndex + 1) : text;
            return StripLayoutWhitespace(value);
        }
        /// <summary>
        /// Scrapes every <c>td</c> below the first <c>tbody</c> of the page at <c>_pageURL</c>
        /// and queues one <c>ESMANewsScrapeArticle</c> per cell through <c>_engine.AddToQueue</c>.
        /// </summary>
        /// <remarks>
        /// The article format starts as "Web Page" and gets "|Download" appended when the cell
        /// offers a file link. Any exception is written to the debug output and ends the run.
        /// </remarks>
        public void Work()
        {
            try
            {
                var summaryNodes = _web.Load(_pageURL).DocumentNode
                                   .SelectSingleNode("//tbody")
                                   .SelectNodes(".//td").ToList();

                foreach (var summary in summaryNodes)
                {
                    ScrapedArticleArgs parsedArticle = new ScrapedArticleArgs()
                    {
                        Publisher = EnumPublisher.ESMA, ArticleFormat = "Web Page", Body = "n/a", DownloadFiles = "n/a"
                    };

                    // Tags are optional; SelectNodes yields null when the cell has no section links.
                    var sectionLinkNodes = summary.SelectNodes(".//div[@class='section_link']");
                    if (sectionLinkNodes != null)
                    {
                        parsedArticle.ArticleTags = string.Join(":", sectionLinkNodes.Select(item => item.InnerText));
                    }

                    // Header and URL must come from the same anchor inside this cell. The previous
                    // "..//a" searched the whole parent row and could pair a header with another cell's link.
                    var linkNode = summary.SelectSingleNode(".//a");
                    string href  = linkNode.Attributes["href"].Value;

                    parsedArticle.Header           = linkNode.InnerText;
                    parsedArticle.ArticleUrl       = $"https://www.esma.europa.eu{href}";
                    parsedArticle.Summary          = summary.SelectSingleNode(".//p").InnerText;
                    parsedArticle.PublishDate      = summary.SelectSingleNode(".//div[@class='field field-type-ds']").InnerText.Replace("\t", String.Empty).Replace("\n", String.Empty).Trim();
                    parsedArticle.LatestUpdateDate = parsedArticle.PublishDate;

                    // A "file" div without a usable link counts as "no download" instead of failing the page.
                    parsedArticle.DownloadFiles = summary.SelectSingleNode(".//div[@class='file']")
                                                  ?.SelectSingleNode(".//a")
                                                  ?.Attributes["href"]
                                                  ?.Value;

                    if (parsedArticle.DownloadFiles != null)
                    {
                        parsedArticle.ArticleFormat += "|Download";
                    }
                    else
                    {
                        parsedArticle.DownloadFiles = "n/a";
                    }

                    _engine.AddToQueue(new ESMANewsScrapeArticle(parsedArticle, _engine));
                }
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex);
            }
        }
        /// <summary>
        /// Collects the <c>afme-article__link</c> anchor of every matching container on the
        /// page at <c>_pageURL</c> and queues one <c>AFMENewsScrapeArticle</c> per anchor.
        /// </summary>
        /// <remarks>
        /// Only the URL and a set of fixed values are filled in here. Any exception is written
        /// to the debug output and ends the run.
        /// </remarks>
        public void Work()
        {
            try
            {
                var root       = _web.Load(_pageURL).DocumentNode;
                var containers = root.SelectNodes("//div[@class='row no-gutters']//div[@class='col-12 px-3']");

                // Resolve all anchors up front, before anything is queued.
                var articleLinks = (from container in containers
                                    select container.SelectSingleNode(".//a[@class='afme-article__link']")).ToList();

                foreach (var articleLink in articleLinks)
                {
                    var article = new ScrapedArticleArgs();
                    article.ArticleType   = "News";
                    article.ArticleTags   = "None";
                    article.Publisher     = EnumPublisher.AFME;
                    article.Body          = "n/a";
                    article.DownloadFiles = "n/a";

                    // The href is prefixed with the site root to form the article URL.
                    article.ArticleUrl = $"https://www.afme.eu{articleLink.Attributes["href"].Value}";

                    _engine.AddToQueue(new AFMENewsScrapeArticle(article, _engine));
                }
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex);
            }
        }
Exemple #5
0
        /// <summary>
        /// Reads the entry tiles inside <c>template-main-container</c> on the page at
        /// <c>_pageURL</c> and queues one <c>AFMEDataScrapeArticle</c> per tile.
        /// </summary>
        /// <remarks>
        /// Header, date and summary are taken from the 2nd, 3rd and 4th <c>div</c> matched by
        /// <c>./div//div</c> below the tile's anchor. Any exception is written to the debug
        /// output and ends the run.
        /// </remarks>
        public void Work()
        {
            try
            {
                var root  = _web.Load(_pageURL).DocumentNode;
                var tiles = root.SelectNodes("//div[@class='template-main-container']/div[@class='row no-gutters']//div[@class='col-12 col-md-6 px-3']");

                // Resolve every tile's anchor before any article is queued.
                var tileLinks = (from tile in tiles
                                 select tile.SelectSingleNode("./div/a")).ToList();

                foreach (var tileLink in tileLinks)
                {
                    var article = new ScrapedArticleArgs();
                    article.ArticleTags   = "None";
                    article.Publisher     = EnumPublisher.AFME;
                    article.Body          = "n/a";
                    article.DownloadFiles = "n/a";

                    // The href is prefixed with the site root to form the article URL.
                    article.ArticleUrl    = "https://www.afme.eu" + tileLink.Attributes["href"].Value;
                    article.ArticleFormat = "pdf";

                    var detailDivs = tileLink.SelectNodes("./div//div");

                    article.Header = detailDivs[1].InnerText.Trim();

                    // One date is shown per tile; it serves as both publish and update date.
                    var shownDate = detailDivs[2].InnerText.Trim();
                    article.PublishDate      = shownDate;
                    article.LatestUpdateDate = shownDate;

                    article.Summary = detailDivs[3].InnerText.Replace("\n", "").Trim();

                    _engine.AddToQueue(new AFMEDataScrapeArticle(article, _engine));
                }
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex);
            }
        }
 /// <summary>
 /// Creates the scrape job for a single article, storing the article data and the owning
 /// engine and creating a fresh <c>HtmlWeb</c> instance for this job.
 /// </summary>
 /// <param name="article">Article data stored in <c>_article</c>.</param>
 /// <param name="engine">Engine stored in <c>_engine</c>.</param>
 public ESMANewsScrapeArticle(ScrapedArticleArgs article, ScrapeEngine engine)
 {
     this._article = article;
     this._engine  = engine;
     this._web     = new HtmlWeb();
 }
Exemple #7
0
 /// <summary>
 /// Creates the scrape job for a single article, storing the article data and the owning
 /// engine and creating a fresh <c>HtmlWeb</c> instance for this job.
 /// </summary>
 /// <param name="newsArticle">Article data stored in <c>_newsArticle</c>.</param>
 /// <param name="engine">Engine stored in <c>_engine</c>.</param>
 public FCAScrapeArticle(ScrapedArticleArgs newsArticle, ScrapeEngine engine)
 {
     this._newsArticle = newsArticle;
     this._engine      = engine;
     this._web         = new HtmlWeb();
 }