コード例 #1
0
ファイル: Program.cs プロジェクト: dedabyte/ScrapNews
        /// <summary>
        /// Returns the article HTML of <paramref name="document"/> followed by the HTML of every
        /// further page reachable through the pager's "next" link.
        /// </summary>
        /// <param name="document">Page that has already been loaded.</param>
        /// <param name="pageModel">Provides the pager selector, the prefix put in front of the pager href, and the settings consumed by getContent.</param>
        /// <returns>Content of this page concatenated with the content of all following pages.</returns>
        public static async Task<string> getArticlePages(IDocument document, PageModel pageModel)
        {
            // The pager link is read before getContent is called on this document.
            var nextLinks = document.QuerySelectorAll(pageModel.PagerNextSelector);
            var nextHref = nextLinks.Any() ? nextLinks[0].GetAttribute("href") : null;

            // No usable "next" link: this is the last (or only) page.
            if (string.IsNullOrEmpty(nextHref))
            {
                return getContent(document, pageModel);
            }

            // Load the following page first, then extract this page and recurse into the next one.
            var nextUrl = pageModel.PagerNextUrlPrefix + nextHref;
            var browsingContext = BrowsingContext.New(Configuration.Default.WithDefaultLoader());
            var nextDocument = await browsingContext.OpenAsync(nextUrl);

            var currentContent = getContent(document, pageModel);
            var remainingContent = await getArticlePages(nextDocument, pageModel);

            return currentContent + remainingContent;
        }
コード例 #2
0
ファイル: Program.cs プロジェクト: dedabyte/ScrapNews
        /// <summary>
        /// Loads the article at <c>rssModel.Link</c>, extracts its fields using the selectors in
        /// <paramref name="pageModel"/> and inserts one row into the <c>articles</c> table.
        /// </summary>
        /// <remarks>
        /// Writes a "SKIP", "OK" or "FAIL" line with the link to the console. Every exception is
        /// caught and reported as FAIL, so nothing propagates to the caller. The method is
        /// <c>async void</c>, which means callers cannot await its completion.
        /// </remarks>
        /// <param name="rssModel">Feed entry; its Link is downloaded and its RssImage is stored.</param>
        /// <param name="pageModel">Publisher name, CSS selectors and URL prefixes for this site.</param>
        public static async void getArticle(RssModel rssModel, PageModel pageModel)
        {
            try
            {
                var config = Configuration.Default.WithDefaultLoader();
                var document = await BrowsingContext.New(config).OpenAsync(rssModel.Link);

                // Pages matching the skip selector are not stored.
                if (document.QuerySelectorAll(pageModel.SkipSelector).Any())
                {
                    Console.WriteLine("SKIP - " + rssModel.Link);
                    return;
                }

                // find nodes for text
                var jqTitle = document.QuerySelectorAll(pageModel.TitleSelector);
                var jqSummary = document.QuerySelectorAll(pageModel.SummarySelector);
                var jqCategory = document.QuerySelectorAll(pageModel.CategorySelector);
                var jqImageOriginalUrl = document.QuerySelectorAll(pageModel.ImageOriginalUrlSelector);

                // populate text vars
                var publisher = pageModel.Publisher;
                var category = jqCategory.Any() ? jqCategory[0].TextContent.Safe().ToLower() : "";
                if (string.IsNullOrEmpty(category))
                {
                    category = "-";
                }
                // The title is mandatory: a page without a title node raises an exception here
                // and is reported as FAIL by the catch block below.
                var title = jqTitle[0].TextContent.Safe();
                var summary = jqSummary.Any() ? jqSummary[0].TextContent.Safe() : "";
                var imageOriginalUrl = jqImageOriginalUrl.Any() ? pageModel.ImagePrefix + jqImageOriginalUrl[0].GetAttribute("src").Safe() : "";

                var content = await getArticlePages(document, pageModel);

                var ts = Helpers.Timestamp();

                // Values are bound as parameters instead of being concatenated into the statement,
                // so quotes in scraped text can neither break nor alter the SQL.
                // NOTE(review): if Safe() already doubles single quotes for SQL, remove that
                // escaping there, otherwise stored text will contain doubled quotes - confirm.
                const string sql = "insert into articles " +
                                   "(original_url,publisher,category,title,summary,content,image_original_url,image_rss_original_url,timestamp) " +
                                   "values " +
                                   "(@original_url,@publisher,@category,@title,@summary,@content,@image_original_url,@image_rss_original_url,@timestamp)";

                using (var conn = Db.getConnection())
                {
                    conn.Open();
                    using (var cmd = new SQLiteCommand(sql, conn))
                    {
                        addTextParameter(cmd, "@original_url", rssModel.Link);
                        addTextParameter(cmd, "@publisher", publisher);
                        addTextParameter(cmd, "@category", category);
                        addTextParameter(cmd, "@title", title);
                        addTextParameter(cmd, "@summary", summary);
                        addTextParameter(cmd, "@content", content);
                        addTextParameter(cmd, "@image_original_url", imageOriginalUrl);
                        addTextParameter(cmd, "@image_rss_original_url", rssModel.RssImage);
                        // The timestamp was previously written unquoted; it is bound with its own type.
                        // NOTE(review): presumably numeric - confirm against Helpers.Timestamp().
                        cmd.Parameters.AddWithValue("@timestamp", ts);

                        cmd.ExecuteNonQuery();
                    }
                }

                Console.WriteLine("OK   - " + rssModel.Link);
            }
            catch (Exception e)
            {
                Console.WriteLine("FAIL - " + rssModel.Link + " - " + e.Message);
            }
        }

        // Binds a value as text; null becomes an empty string, which is what the earlier
        // string-concatenated statement stored for null values.
        private static void addTextParameter(SQLiteCommand cmd, string name, object value)
        {
            cmd.Parameters.AddWithValue(name, value == null ? "" : value.ToString());
        }
コード例 #3
0
ファイル: Program.cs プロジェクト: dedabyte/ScrapNews
        /// <summary>
        /// Reads the feed at <paramref name="rssUrl"/>, drops entries whose link matches the
        /// publisher's skip regex or is already returned by getUrlsForPublisher, and calls
        /// getArticle for each remaining entry.
        /// </summary>
        /// <param name="rssUrl">Address of the RSS feed to read.</param>
        /// <param name="pageModel">Publisher settings; also passed on to getArticle.</param>
        public static void checkUrls(string rssUrl, PageModel pageModel)
        {
            var publisher = pageModel.Publisher;

            var knownUrls = getUrlsForPublisher(publisher);
            var feedItems = getRss(rssUrl);

            var pending = new List<RssModel>();
            foreach (var feedItem in feedItems)
            {
                var link = feedItem.Link;

                // Links excluded by the optional skip regex are only logged.
                if (pageModel.SkipUrlRegex != null && pageModel.GetSkipUrlRegex().IsMatch(link))
                {
                    Console.WriteLine("REGX - " + link);
                    continue;
                }

                // Already stored for this publisher.
                if (knownUrls.Contains(link))
                {
                    continue;
                }

                pending.Add(feedItem);
            }

            if (pending.Count == 0)
            {
                Console.WriteLine("No new urls for " + publisher);
                return;
            }

            Console.WriteLine(pending.Count + " new urls for " + publisher);
            foreach (var newItem in pending)
            {
                getArticle(newItem, pageModel);
            }
        }
コード例 #4
0
ファイル: Program.cs プロジェクト: dedabyte/ScrapNews
        /// <summary>
        /// Extracts the article node from <paramref name="document"/>, cleans it up and returns
        /// its inner HTML.
        /// </summary>
        /// <remarks>
        /// The document is modified in place: nodes matching the remove selector and all scripts
        /// that are not on the allow list are removed, image width/height attributes are dropped
        /// and image sources not starting with "http" get <c>pageModel.ImagePrefix</c> prepended.
        /// </remarks>
        /// <param name="document">Loaded article page.</param>
        /// <param name="pageModel">Supplies ArticleNodeSelector, ArticleRemoveSelector and ImagePrefix.</param>
        /// <returns>The inner HTML of the cleaned article node, passed through Safe().</returns>
        /// <exception cref="InvalidOperationException">No element matches ArticleNodeSelector.</exception>
        public static string getContent(IDocument document, PageModel pageModel)
        {
            // find nodes for html, remove junk
            var jqContent = document.QuerySelector(pageModel.ArticleNodeSelector);
            if (jqContent == null)
            {
                // Fail with a message that names the selector instead of a bare null reference error.
                throw new InvalidOperationException("Article node not found for selector: " + pageModel.ArticleNodeSelector);
            }

            foreach (var junk in jqContent.QuerySelectorAll(pageModel.ArticleRemoveSelector).ToList())
            {
                junk.Remove();
            }

            // remove scripts, except embeds loaded from these hosts
            var allowedScripts = new[]
            {
                "platform.instagram.com",
                "platform.twitter.com"
            };
            foreach (var script in jqContent.QuerySelectorAll("script").ToList())
            {
                var src = script.GetAttribute("src").Safe();
                // Inline scripts (no src) and scripts from hosts not on the allow list are dropped.
                if (string.IsNullOrEmpty(src) || !allowedScripts.Any(x => src.Contains(x)))
                {
                    Console.WriteLine("     - Script blocked: " + src);
                    script.Remove();
                }
            }

            // mod images
            foreach (var element in jqContent.QuerySelectorAll("img").ToList())
            {
                // remove dimensions
                element.RemoveAttribute("width");
                element.RemoveAttribute("height");
                // prefix sources; ordinal comparison keeps the scheme check culture-independent
                var src = element.GetAttribute("src").Safe();
                if (!string.IsNullOrEmpty(src) && !src.StartsWith("http", StringComparison.Ordinal))
                {
                    element.SetAttribute("src", pageModel.ImagePrefix + src);
                }
            }

            return jqContent.InnerHtml.Safe();
        }