/// <summary>
/// Loads the listing page at <c>_pageURL</c>, walks every <c>tr</c> under the first
/// <c>tbody</c> and hands one <see cref="ScrapedArticleArgs"/> per row to
/// <c>_engine.OnNewsArticleParsedDelegate</c>.
/// </summary>
/// <remarks>
/// Errors are written to the debug output and never propagate to the caller. Each row is
/// handled in isolation so a single malformed row does not discard the rest of the page.
/// </remarks>
public void Work()
{
    try
    {
        var tableBody = _web.Load(_pageURL).DocumentNode.SelectSingleNode("//tbody");

        // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when nothing matches.
        var rows = tableBody?.SelectNodes(".//tr");
        if (rows == null)
        {
            Debug.WriteLine("No table rows found at " + _pageURL);
            return;
        }

        foreach (var row in rows.ToList())
        {
            try
            {
                var cells = row.SelectNodes(".//td");
                if (cells == null)
                {
                    // A row without td cells (e.g. a header row) carries no article data.
                    continue;
                }

                // Look the title link up once; it supplies both the header and the URL.
                var titleLink = cells[Index_Title].SelectSingleNode(".//a");
                var publishDate = cells[Index_Date].InnerText;

                ScrapedArticleArgs parsedArticle = new ScrapedArticleArgs()
                {
                    Publisher = EnumPublisher.ESMA,
                    Body = "n/a",
                    DownloadFiles = "n/a",
                    PublishDate = publishDate,
                    // The table exposes a single date, so it doubles as the last-update date.
                    LatestUpdateDate = publishDate,
                    Summary = "Article Reference: " + cells[Index_Summary].InnerText,
                    Header = titleLink.InnerText,
                    ArticleUrl = titleLink.Attributes["href"].Value,
                    ArticleFormat = "pdf",
                    ArticleTags = cells[Index_Tag].InnerText,
                    ArticleType = cells[Index_Type].InnerText
                };

                _engine.OnNewsArticleParsedDelegate(parsedArticle);
            }
            catch (Exception ex)
            {
                // Skip only the offending row; keep processing the remaining ones.
                Debug.WriteLine(ex);
            }
        }
    }
    catch (Exception ex)
    {
        Debug.WriteLine(ex);
    }
}
/// <summary>
/// Loads the search-results page at <c>_pageURL</c> and processes every <c>li</c> under the
/// <c>ol</c> with class <c>search-list</c>. An item whose header ends in a word containing
/// "[" is reported to the engine immediately with that word as its format; every other item
/// is queued as an <see cref="FCAScrapeArticle"/> with format "Web Page".
/// </summary>
/// <remarks>
/// Errors are written to the debug output and never propagate to the caller. Each item is
/// parsed in isolation so one malformed entry does not discard the rest of the page.
/// </remarks>
public void Work()
{
    try
    {
        var summaryListNode = _web.Load(_pageURL).DocumentNode.SelectSingleNode("//ol[@class='search-list']");

        // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when nothing matches.
        var summaryItemNodes = summaryListNode?.SelectNodes(".//li");
        if (summaryItemNodes == null)
        {
            Debug.WriteLine("No search-list items found at " + _pageURL);
            return;
        }

        // Go through each news summary item and process it.
        foreach (var summaryItemNode in summaryItemNodes)
        {
            try
            {
                ScrapedArticleArgs parsedArticle = new ScrapedArticleArgs()
                {
                    Publisher = EnumPublisher.FCA,
                    Body = "n/a",
                    DownloadFiles = "n/a"
                };

                // Header and URL both come from the item's first link.
                var linkNode = summaryItemNode.SelectSingleNode(".//a");
                parsedArticle.Header = StripTabsAndNewlines(linkNode.InnerText);
                parsedArticle.ArticleUrl = linkNode.Attributes["href"].Value;

                // Optional metadata: a missing span leaves the value null.
                parsedArticle.ArticleTags = StripTabsAndNewlines(
                    summaryItemNode.SelectSingleNode(".//span[@class='meta-item type']")?.InnerText);
                parsedArticle.PublishDate = ValueAfterLabel(
                    summaryItemNode.SelectSingleNode(".//span[@class='meta-item published-date']")?.InnerText);

                // Fall back to the publish date when the item has no modified date.
                parsedArticle.LatestUpdateDate = ValueAfterLabel(
                    summaryItemNode.SelectSingleNode(".//span[@class='meta-item modified-date']")?.InnerText)
                    ?? parsedArticle.PublishDate;

                parsedArticle.Summary = StripTabsAndNewlines(
                    summaryItemNode.SelectSingleNode(".//div[@class='search-item__body']")?.InnerText);

                // A trailing word containing "[" is used as the format and the item is
                // reported as-is; anything else is queued for a follow-up page scrape.
                string lastHeaderWord = parsedArticle.Header.Split(' ').Last();
                if (lastHeaderWord.Contains("["))
                {
                    parsedArticle.ArticleFormat = lastHeaderWord;
                    _engine.OnNewsArticleParsedDelegate(parsedArticle);
                }
                else
                {
                    parsedArticle.ArticleFormat = "Web Page";
                    _engine.AddToQueue(new FCAScrapeArticle(parsedArticle, _engine));
                }
            }
            catch (Exception ex)
            {
                // Skip only the offending item; keep processing the remaining ones.
                Debug.WriteLine(ex);
            }
        }
    }
    catch (Exception ex)
    {
        Debug.WriteLine(ex);
    }
}

/// <summary>
/// Removes every tab and newline character from <paramref name="text"/> and trims the result.
/// </summary>
/// <param name="text">Raw node text; may be null.</param>
/// <returns>The cleaned text, or null when <paramref name="text"/> is null.</returns>
private static string StripTabsAndNewlines(string text)
{
    return text?.Replace("\t", String.Empty).Replace("\n", String.Empty).Trim();
}

/// <summary>
/// Extracts the value part of a "label: value" string: everything after the first ':',
/// cleaned of tabs, newlines and surrounding whitespace.
/// </summary>
/// <param name="labelledText">Raw node text; may be null.</param>
/// <returns>
/// Null when <paramref name="labelledText"/> is null; the whole cleaned text when it contains
/// no ':' (instead of throwing); otherwise the cleaned text after the first ':'.
/// </returns>
private static string ValueAfterLabel(string labelledText)
{
    if (labelledText == null)
    {
        return null;
    }

    // Cut at the FIRST colon only, so a value that itself contains ':' is kept intact.
    int colonIndex = labelledText.IndexOf(':');
    string value = colonIndex >= 0 ? labelledText.Substring(colonIndex + 1) : labelledText;
    return StripTabsAndNewlines(value);
}
/// <summary>
/// Loads the listing page at <c>_pageURL</c>, walks every <c>td</c> under the first
/// <c>tbody</c> and queues one <see cref="ESMANewsScrapeArticle"/> per cell.
/// </summary>
/// <remarks>
/// Errors are written to the debug output and never propagate to the caller. Each cell is
/// parsed in isolation so one malformed cell does not discard the rest of the page.
/// </remarks>
public void Work()
{
    try
    {
        var tableBody = _web.Load(_pageURL).DocumentNode.SelectSingleNode("//tbody");

        // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when nothing matches.
        var cells = tableBody?.SelectNodes(".//td");
        if (cells == null)
        {
            Debug.WriteLine("No table cells found at " + _pageURL);
            return;
        }

        foreach (var cell in cells)
        {
            try
            {
                // Header and URL are read from the same anchor inside this cell. The header
                // was previously looked up with "..//a" (relative to the parent), which could
                // pick an anchor from a different cell than the one supplying the URL.
                var link = cell.SelectSingleNode(".//a");

                // Optional attachment: stays null when the file div, its anchor or the
                // href attribute is missing.
                var downloadUrl = cell.SelectSingleNode(".//div[@class='file']")
                    ?.SelectSingleNode(".//a")
                    ?.Attributes["href"]
                    ?.Value;

                var publishDate = cell.SelectSingleNode(".//div[@class='field field-type-ds']").InnerText
                    .Replace("\t", String.Empty)
                    .Replace("\n", String.Empty)
                    .Trim();

                ScrapedArticleArgs parsedArticle = new ScrapedArticleArgs()
                {
                    Publisher = EnumPublisher.ESMA,
                    ArticleFormat = downloadUrl != null ? "Web Page|Download" : "Web Page",
                    Body = "n/a",
                    DownloadFiles = downloadUrl ?? "n/a",
                    Header = link.InnerText,
                    ArticleUrl = $"https://www.esma.europa.eu{link.Attributes["href"].Value}",
                    Summary = cell.SelectSingleNode(".//p").InnerText,
                    PublishDate = publishDate,
                    // The listing exposes a single date, so it doubles as the last-update date.
                    LatestUpdateDate = publishDate
                };

                // Tags are optional; when present they are joined with ':'.
                var tagNodes = cell.SelectNodes(".//div[@class='section_link']");
                if (tagNodes != null)
                {
                    parsedArticle.ArticleTags = string.Join(":", tagNodes.Select(item => item.InnerText));
                }

                _engine.AddToQueue(new ESMANewsScrapeArticle(parsedArticle, _engine));
            }
            catch (Exception ex)
            {
                // Skip only the offending cell; keep processing the remaining ones.
                Debug.WriteLine(ex);
            }
        }
    }
    catch (Exception ex)
    {
        Debug.WriteLine(ex);
    }
}
/// <summary>
/// Loads the listing page at <c>_pageURL</c>, finds the <c>afme-article__link</c> anchor in
/// each article container and queues one <see cref="AFMENewsScrapeArticle"/> per link.
/// Only the article URL is filled in here.
/// </summary>
/// <remarks>
/// Errors are written to the debug output and never propagate to the caller. Containers
/// without a usable link are skipped instead of aborting the remaining articles.
/// </remarks>
public void Work()
{
    try
    {
        var root = _web.Load(_pageURL).DocumentNode;

        // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when nothing matches.
        var containers = root.SelectNodes("//div[@class='row no-gutters']//div[@class='col-12 px-3']");
        if (containers == null)
        {
            Debug.WriteLine("No article containers found at " + _pageURL);
            return;
        }

        foreach (var container in containers)
        {
            var href = container.SelectSingleNode(".//a[@class='afme-article__link']")
                ?.Attributes["href"]
                ?.Value;
            if (href == null)
            {
                // Not every matching container necessarily holds an article link.
                continue;
            }

            ScrapedArticleArgs parsedArticle = new ScrapedArticleArgs()
            {
                ArticleType = "News",
                ArticleTags = "None",
                Publisher = EnumPublisher.AFME,
                Body = "n/a",
                DownloadFiles = "n/a",
                // The href is prefixed with the site root to form the article URL.
                ArticleUrl = "https://www.afme.eu" + href
            };

            _engine.AddToQueue(new AFMENewsScrapeArticle(parsedArticle, _engine));
        }
    }
    catch (Exception ex)
    {
        Debug.WriteLine(ex);
    }
}
/// <summary>
/// Loads the listing page at <c>_pageURL</c>, reads header, date and summary from each
/// article tile and queues one <see cref="AFMEDataScrapeArticle"/> per tile.
/// </summary>
/// <remarks>
/// Errors are written to the debug output and never propagate to the caller. Tiles without
/// a link, or with fewer detail nodes than expected, are skipped instead of aborting the
/// remaining tiles.
/// </remarks>
public void Work()
{
    // Positions of the fields within the tile's "./div//div" node list.
    const int headerIndex = 1;
    const int dateIndex = 2;
    const int summaryIndex = 3;

    try
    {
        var root = _web.Load(_pageURL).DocumentNode;

        // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when nothing matches.
        var tiles = root.SelectNodes("//div[@class='template-main-container']/div[@class='row no-gutters']//div[@class='col-12 col-md-6 px-3']");
        if (tiles == null)
        {
            Debug.WriteLine("No article tiles found at " + _pageURL);
            return;
        }

        foreach (var tile in tiles)
        {
            var link = tile.SelectSingleNode("./div/a");
            var href = link?.Attributes["href"]?.Value;
            var details = link?.SelectNodes("./div//div");
            if (href == null || details == null || details.Count <= summaryIndex)
            {
                // Tile does not have the expected link/detail layout; nothing to report.
                continue;
            }

            var publishDate = details[dateIndex].InnerText.Trim();

            ScrapedArticleArgs parsedArticle = new ScrapedArticleArgs()
            {
                ArticleTags = "None",
                Publisher = EnumPublisher.AFME,
                Body = "n/a",
                DownloadFiles = "n/a",
                // The href is prefixed with the site root to form the article URL.
                ArticleUrl = "https://www.afme.eu" + href,
                ArticleFormat = "pdf",
                Header = details[headerIndex].InnerText.Trim(),
                // The tile exposes a single date, used for both publish and last-update.
                PublishDate = publishDate,
                LatestUpdateDate = publishDate,
                Summary = details[summaryIndex].InnerText.Replace("\n", "").Trim()
            };

            _engine.AddToQueue(new AFMEDataScrapeArticle(parsedArticle, _engine));
        }
    }
    catch (Exception ex)
    {
        Debug.WriteLine(ex);
    }
}
/// <summary>
/// Initializes the instance: stores the supplied article and engine and creates a new
/// <c>HtmlWeb</c> for it.
/// </summary>
/// <param name="article">Article arguments kept in <c>_article</c>.</param>
/// <param name="engine">Engine kept in <c>_engine</c>.</param>
public ESMANewsScrapeArticle(ScrapedArticleArgs article, ScrapeEngine engine)
{
    _article = article;
    _engine = engine;
    _web = new HtmlWeb();
}
/// <summary>
/// Initializes the instance: stores the supplied article and engine and creates a new
/// <c>HtmlWeb</c> for it.
/// </summary>
/// <param name="newsArticle">Article arguments kept in <c>_newsArticle</c>.</param>
/// <param name="engine">Engine kept in <c>_engine</c>.</param>
public FCAScrapeArticle(ScrapedArticleArgs newsArticle, ScrapeEngine engine)
{
    _newsArticle = newsArticle;
    _engine = engine;
    _web = new HtmlWeb();
}