/// <summary>
/// Extracts the article HTML from <paramref name="document"/> and, when the page exposes a
/// "next page" link, follows that link recursively and appends the HTML of every later page.
/// </summary>
/// <param name="document">Page to extract content from; also searched for the next-page link.</param>
/// <param name="pageModel">Provides the next-page selector and URL prefix; also forwarded to getContent.</param>
/// <returns>This page's content followed by the content of all following pages, with no separator.</returns>
public static async Task<string> getArticlePages(IDocument document, PageModel pageModel)
{
    // The pager link is looked up before getContent is called: getContent strips nodes
    // out of the document, so the lookup order is kept as it was.
    var nextLinks = document.QuerySelectorAll(pageModel.PagerNextSelector);
    var hasNextPage = nextLinks.Any() && !string.IsNullOrEmpty(nextLinks[0].GetAttribute("href"));

    if (!hasNextPage)
    {
        // Last (or only) page: nothing further to follow.
        return getContent(document, pageModel);
    }

    // Download the next page first, then extract the current page, then recurse.
    var nextUrl = pageModel.PagerNextUrlPrefix + nextLinks[0].GetAttribute("href");
    var loaderConfig = Configuration.Default.WithDefaultLoader();
    var nextDocument = await BrowsingContext.New(loaderConfig).OpenAsync(nextUrl);

    var currentPage = getContent(document, pageModel);
    var followingPages = await getArticlePages(nextDocument, pageModel);
    return currentPage + followingPages;
}
/// <summary>
/// Downloads the article at <c>rssModel.Link</c>, extracts its fields using the selectors in
/// <paramref name="pageModel"/>, and inserts one row into the <c>articles</c> table.
/// </summary>
/// <param name="rssModel">Feed entry; supplies the article URL and the RSS image URL.</param>
/// <param name="pageModel">Selectors, publisher name and image prefix used for extraction.</param>
/// <remarks>
/// Prints "SKIP", "OK" or "FAIL" to the console. Every exception is caught and reported as a
/// FAIL line, which includes the case where the title selector matches nothing.
/// The method is <c>async void</c>: callers cannot await it or observe completion. The signature
/// is kept as-is for compatibility with existing callers.
/// </remarks>
public static async void getArticle(RssModel rssModel, PageModel pageModel)
{
    try
    {
        var config = Configuration.Default.WithDefaultLoader();
        var document = await BrowsingContext.New(config).OpenAsync(rssModel.Link);

        if (document.QuerySelectorAll(pageModel.SkipSelector).Any())
        {
            Console.WriteLine("SKIP - " + rssModel.Link);
            return;
        }

        // find nodes for text
        var jqTitle = document.QuerySelectorAll(pageModel.TitleSelector);
        var jqSummary = document.QuerySelectorAll(pageModel.SummarySelector);
        var jqCategory = document.QuerySelectorAll(pageModel.CategorySelector);
        var jqImageOriginalUrl = document.QuerySelectorAll(pageModel.ImageOriginalUrlSelector);

        // populate text vars
        var publisher = pageModel.Publisher;
        var category = jqCategory.Any() ? jqCategory[0].TextContent.Safe().ToLower() : "";
        if (string.IsNullOrEmpty(category))
        {
            category = "-";
        }

        // No Any() guard on purpose: a missing title throws and is reported as FAIL below.
        var title = jqTitle[0].TextContent.Safe();
        var summary = jqSummary.Any() ? jqSummary[0].TextContent.Safe() : "";
        var imageOriginalUrl = jqImageOriginalUrl.Any()
            ? pageModel.ImagePrefix + jqImageOriginalUrl[0].GetAttribute("src").Safe()
            : "";

        var content = await getArticlePages(document, pageModel);
        var ts = Helpers.Timestamp();

        // All scraped text is bound as parameters instead of being concatenated into the
        // statement, so quotes in page content can neither break the SQL nor inject into it.
        // The timestamp comes from Helpers.Timestamp(), not from the page; it is left inline
        // exactly as before so the type of the stored value does not change.
        // NOTE(review): if Safe() escapes single quotes for SQL, that escaping is now
        // redundant and would be stored literally - confirm against its definition.
        var sql = "insert into articles "
            + "(original_url,publisher,category,title,summary,content,image_original_url,image_rss_original_url,timestamp) "
            + "values "
            + "(@original_url,@publisher,@category,@title,@summary,@content,@image_original_url,@image_rss_original_url,"
            + ts + ")";

        using (var conn = Db.getConnection())
        {
            conn.Open();
            using (var cmd = new SQLiteCommand(sql, conn))
            {
                addTextParameter(cmd, "@original_url", rssModel.Link);
                addTextParameter(cmd, "@publisher", publisher);
                addTextParameter(cmd, "@category", category);
                addTextParameter(cmd, "@title", title);
                addTextParameter(cmd, "@summary", summary);
                addTextParameter(cmd, "@content", content);
                addTextParameter(cmd, "@image_original_url", imageOriginalUrl);
                addTextParameter(cmd, "@image_rss_original_url", rssModel.RssImage);
                cmd.ExecuteNonQuery();
            }
        }

        Console.WriteLine("OK - " + rssModel.Link);
    }
    catch (Exception e)
    {
        Console.WriteLine("FAIL - " + rssModel.Link + " - " + e.Message);
    }
}

// Binds value as text under the given parameter name. A null value is bound as an empty
// string (not SQL NULL), matching what string concatenation produced before.
private static void addTextParameter(SQLiteCommand cmd, string name, object value)
{
    cmd.Parameters.AddWithValue(name, value == null ? "" : value.ToString());
}
/// <summary>
/// Reads the feed at <paramref name="rssUrl"/>, drops entries whose URL matches the publisher's
/// skip regex or is already returned by getUrlsForPublisher, and calls getArticle for the rest.
/// </summary>
/// <param name="rssUrl">Feed address passed to getRss.</param>
/// <param name="pageModel">Publisher settings; supplies the publisher name and optional skip regex.</param>
public static void checkUrls(string rssUrl, PageModel pageModel)
{
    var publisher = pageModel.Publisher;
    var knownUrls = getUrlsForPublisher(publisher);
    var feedItems = getRss(rssUrl);
    var pending = new List<RssModel>();

    foreach (var feedItem in feedItems)
    {
        var link = feedItem.Link;

        // Publisher-specific exclusion, applied only when a skip pattern is configured.
        if (pageModel.SkipUrlRegex != null && pageModel.GetSkipUrlRegex().IsMatch(link))
        {
            Console.WriteLine("REGX - " + link);
            continue;
        }

        // Already known for this publisher: nothing to do.
        if (knownUrls.Contains(link))
        {
            continue;
        }

        pending.Add(feedItem);
    }

    if (pending.Count == 0)
    {
        Console.WriteLine("No new urls for " + publisher);
        return;
    }

    Console.WriteLine(pending.Count + " new urls for " + publisher);

    // getArticle is async void, so each call returns at its first await;
    // this loop does not wait for the downloads to finish.
    foreach (var feedItem in pending)
    {
        getArticle(feedItem, pageModel);
    }
}
/// <summary>
/// Returns the cleaned inner HTML of the article node selected by
/// <c>pageModel.ArticleNodeSelector</c>.
/// </summary>
/// <param name="document">Document to extract from. It is modified in place: nodes are removed and image attributes rewritten.</param>
/// <param name="pageModel">Supplies the article selector, the junk selector and the image URL prefix.</param>
/// <returns>The article node's inner HTML after cleaning, passed through Safe().</returns>
/// <remarks>
/// NOTE(review): the article node is not null-checked. If the selector matches nothing this
/// presumably throws on the next line; the caller visible in this file wraps the call in
/// try/catch and logs a failure.
/// </remarks>
public static string getContent(IDocument document, PageModel pageModel)
{
    // find nodes for html, remove junk
    var articleNode = document.QuerySelector(pageModel.ArticleNodeSelector);
    foreach (var junk in articleNode.QuerySelectorAll(pageModel.ArticleRemoveSelector).ToList())
    {
        junk.Remove();
    }

    // remove scripts: inline scripts (no src) and any script whose src does not contain an allowed host
    var allowedScripts = new[] { "platform.instagram.com", "platform.twitter.com" };
    foreach (var script in articleNode.QuerySelectorAll("script").ToList())
    {
        var src = script.GetAttribute("src").Safe();
        var isAllowed = !string.IsNullOrEmpty(src) && allowedScripts.Any(host => src.Contains(host));
        if (!isAllowed)
        {
            Console.WriteLine(" - Script blocked: " + src);
            script.Remove();
        }
    }

    // mod images
    foreach (var image in articleNode.QuerySelectorAll("img").ToList())
    {
        // remove dimensions
        image.RemoveAttribute("width");
        image.RemoveAttribute("height");

        // prefix sources that are not absolute; ordinal comparison because a URL scheme
        // is not linguistic text and must not depend on the current culture
        var src = image.GetAttribute("src").Safe();
        if (!string.IsNullOrEmpty(src) && !src.StartsWith("http", StringComparison.Ordinal))
        {
            image.SetAttribute("src", pageModel.ImagePrefix + src);
        }
    }

    return articleNode.InnerHtml.Safe();
}