Example #1
        public void ScrapeArticle()
        {
            var articleInfo = new ArticleInfo
            {
                Url = "http://www.delfi.lt/news/daily/crime/siuolaikines-vergoves-mastai-kelia-siauba.d?id=64756213"
            };

            var article = new DelfiArticleScraper().Scrape(articleInfo);
        }
Example #2
        public void ScrapeComments()
        {
            var ai = new ArticleInfo
            {
                Url =
                    "http://www.delfi.lt/news/daily/education/dalis-sostines-priesmokyklinuku-isikurs-kitur.d?id=64297936",
                Id = { ExternalId = "64297936" }
            };
            var scraper = new DelfiCommentsScraper();
            var comments = scraper.ScrapeRange(ai, 1, 5);
        }
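These two tests build `ArticleInfo` instances by hand, but the type itself never appears in the listing. A minimal sketch of the shape the tests assume, with every member inferred from its usage in this section (so treat the exact declarations as hypothetical):

using System;

public enum Portal { Delfi, Lrytas, PenkMin }

// Hypothetical reconstruction: the composite id must be pre-initialized for the
// nested-initializer syntax `Id = { ExternalId = ... }` used above to compile.
public class ArticleId
{
    public string ExternalId { get; set; }
    public Portal Portal { get; set; }
}

public class ArticleInfo
{
    public ArticleInfo() { Id = new ArticleId(); }

    public ArticleId Id { get; private set; }
    public string Url { get; set; }
    public string Title { get; set; }
    public int CommentCount { get; set; }
    public DateTime DatePublished { get; set; }
    public DateTime DateScraped { get; set; }
}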
Example #3
        public Article Article(ArticleInfo articleInfo)
        {
            try
            {
                return _articleScrapers
                    .First(s => s.Portal == articleInfo.Id.Portal)
                    .Scrape(articleInfo);
            }
            catch (Exception e)
            {
                e.Data["articleInfo"] = articleInfo;
                _log.Error("An error occurred while scraping article.", e);
            }

            return null;
        }
        public Article Scrape(ArticleInfo articleInfo)
        {
            var docNode = Utilities.DownloadPage(articleInfo.Url);

            return new Article
            {
                Id = articleInfo.Id,
                CommentCount = articleInfo.CommentCount,
                Title = articleInfo.Title,
                Url = articleInfo.Url,
                DateScraped = DateTime.UtcNow.AddHours(2), // UTC+2 ≈ Lithuanian local time (ignores DST)
                AuthorName = GetAuthorName(docNode),
                Body = GetBody(docNode),
                DateModified = GetDateModified(docNode),
                DatePublished = GetDatePublished(docNode),
                Keywords = GetKeywords(docNode)
            };
        }
        public List<Comment> ScrapeRange(ArticleInfo articleInfo, int from, int to)
        {
            var comments = new List<Comment>();

            // Delfi serves comments 20 per request; "no" is the 0-based offset
            // of the first comment in the page.
            for (var i = from - 1; i <= to - 1; i += 20)
            {
                var url = articleInfo.Url;
                url = url.AddQueryParameterToUrl("com", 1);
                url = url.AddQueryParameterToUrl("s", 1);
                url = url.AddQueryParameterToUrl("no", i);

                var docNode = Utilities.DownloadPage(url);
                var commentNodes = docNode.SelectNodes("//ul[@id='comments-list']/li");
                comments.AddRange(commentNodes.Select(cn => ParseComment(cn, articleInfo.Id.ExternalId)));
            }

            return comments.Take(to - from + 1).ToList();
        }
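Every scraper in this listing builds its page URLs through an `AddQueryParameterToUrl` string extension that is not shown. A plausible sketch, assuming it does nothing more than append `key=value` with the correct separator (the project's real implementation may differ):

using System;
using System.Web;

public static class UrlExtensions
{
    // Hypothetical sketch: appends key=value to the URL, using '?' for the first
    // query parameter and '&' for every subsequent one.
    public static string AddQueryParameterToUrl(this string url, string key, object value)
    {
        var separator = url.Contains("?") ? "&" : "?";
        return url + separator + HttpUtility.UrlEncode(key) + "=" +
               HttpUtility.UrlEncode(Convert.ToString(value));
    }
}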
Example #6
        public List<Comment> Comments(ArticleInfo articleInfo, int from, int to)
        {
            try
            {
                return _commentScrapers
                    .First(s => s.Portal == articleInfo.Id.Portal)
                    .ScrapeRange(articleInfo, from, to);
            }
            catch (Exception e)
            {
                e.Data["articleInfo"] = articleInfo;
                e.Data["from"] = from;
                e.Data["to"] = to;
                _log.Error("An error occurred while scraping comments.", e);
            }

            return new List<Comment>();
        }
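Both dispatcher methods pick a scraper from an injected collection by matching on `Portal`. A minimal sketch of how that registry might look, with the interface inferred from the calls above and the non-Delfi class names invented for illustration:

using System.Collections.Generic;

// Hypothetical interface inferred from the dispatch code above.
public interface ICommentsScraper
{
    Portal Portal { get; }
    List<Comment> ScrapeRange(ArticleInfo articleInfo, int from, int to);
}

public class ScrapingService
{
    private readonly List<ICommentsScraper> _commentScrapers;

    public ScrapingService()
    {
        // One scraper per portal; First(s => s.Portal == ...) selects the match
        // and throws if the portal is not registered, landing in the catch block.
        _commentScrapers = new List<ICommentsScraper>
        {
            new DelfiCommentsScraper(),
            new LrytasCommentsScraper(),  // hypothetical class name
            new PenkMinCommentsScraper()  // hypothetical class name
        };
    }
}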
        public List<Comment> ScrapeRange(ArticleInfo articleInfo, int from, int to)
        {
            var comments = new List<Comment>();

            var page = 1;
            var pagesToScrape = 100; // upper bound until the real page count is read from page 1

            while(page <= pagesToScrape)
            {
                var url = Lrytas.MainHost;
                url = url.AddQueryParameterToUrl("id", articleInfo.Id.ExternalId);
                url = url.AddQueryParameterToUrl("view", 6);
                url = url.AddQueryParameterToUrl("p", page);

                var docNode = Utilities.DownloadPage(url);

                if (page == 1)
                {
                    try
                    {
                        // The second-to-last link in the pager holds the total page count.
                        var pageNumberNodes = docNode.SelectNodes(".//div[@class='str-pages-div']/a");
                        var pageCount = Convert.ToInt32(pageNumberNodes[pageNumberNodes.Count - 2].InnerText);
                        pagesToScrape = pageCount - (@from / 25);
                    }
                    catch (Exception)
                    {
                        // No pager found: the article has a single page of comments.
                        pagesToScrape = 1;
                    }
                }

                var commentNodes = docNode.SelectNodes(".//div[@class='comment']");
                var scrapedComments = commentNodes.Select(cn => ParseComment(cn, articleInfo.Id.ExternalId)).ToList();

                comments.AddRange(scrapedComments);
                page++;
            }

            // Pages are scraped from the beginning, so drop the first (from - 1) comments before taking the range.
            return comments.Skip(from - 1).Take(to - from + 1).ToList();
        }
        public List<Comment> ScrapeRange(ArticleInfo articleInfo, int from, int to)
        {
            var comments = new List<Comment>();

            // Comments come 50 per page; visit only the pages that cover the requested [from, to] range.
            for (var page = (from - 1) / 50 + 1; page <= (to - 1) / 50 + 1; page++)
            {
                var url = articleInfo.Url;
                url = url.AddQueryParameterToUrl("comments", "");
                url = url.AddQueryParameterToUrl("page", page);
                url = url.AddQueryParameterToUrl("order", "ASC");

                var docNode = Utilities.DownloadPage(url);

                // The comments are embedded in the page as a URL-encoded JavaScript array
                // ("var article_comments = [...];"); cut it out and restore the closing bracket.
                var scriptNode = docNode.SelectNodes(".//script").FirstOrDefault(s => s.InnerText.Contains("article_comments"));
                var json = HttpUtility.UrlDecode(scriptNode.InnerText.GetSubstringBetween("var article_comments = ", "];") + "]");

                var commentsFromJson = JsonConvert.DeserializeObject<List<CommentFromJson>>(json);

                comments.AddRange(commentsFromJson.Select(c =>
                {
                    var comment = new Comment
                    {
                        ArticleExternalId = articleInfo.Id.ExternalId,
                        CommentText = c.content,
                        DateCreated = ParseRelativeDate(c.date),
                        DateScraped = DateTime.UtcNow.AddHours(2),
                        IpAddress = c.ip,
                        UserName = c.name,
                        Id = {ExternalId = c.id, Portal = Portal.PenkMin},
                    };
                    return comment;
                }));
            }

            // The first scraped page starts at a 50-comment boundary, so skip the comments before `from`.
            return comments.Skip((from - 1) % 50).Take(to - from + 1).ToList();
        }
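The PenkMin scraper above relies on two pieces the listing omits: the `GetSubstringBetween` string helper and the `CommentFromJson` DTO. Hedged sketches of both, with the DTO's members taken directly from the property accesses above and the helper's semantics inferred from how the closing "]" is re-appended:

using System;

public static class StringExtensions
{
    // Hypothetical sketch: returns the text between the first occurrence of
    // `start` and the next occurrence of `end`, or null if either marker is missing.
    public static string GetSubstringBetween(this string source, string start, string end)
    {
        var startIndex = source.IndexOf(start, StringComparison.Ordinal);
        if (startIndex < 0) return null;
        startIndex += start.Length;
        var endIndex = source.IndexOf(end, startIndex, StringComparison.Ordinal);
        return endIndex < 0 ? null : source.Substring(startIndex, endIndex - startIndex);
    }
}

// DTO matching the fields read in the Select above; lower-case names mirror the JSON keys.
public class CommentFromJson
{
    public string id { get; set; }
    public string name { get; set; }
    public string content { get; set; }
    public string date { get; set; }
    public string ip { get; set; }
}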
        private static ArticleInfo ParseArticleInfoDiv(HtmlNode articleDiv)
        {
            var linkToArticle = articleDiv.SelectSingleNode("h3/a");
            var dateDiv = articleDiv.SelectSingleNode("div[@class='headline-date']");
            var commentCountNode = articleDiv.SelectSingleNode("h3/a[@class='commentCount']");

            var articleInfo = new ArticleInfo();

            articleInfo.Url = linkToArticle.Attributes["href"].Value;
            if (articleInfo.Url.Contains(@"/video/"))
            {
                // /video/ URLs are Delfi TV articles, which this parser does not handle.
                throw new CommonParsingException("Delfi TV article");
            }

            articleInfo.Id.ExternalId = articleInfo.Url.GetQueryParameterValueFromUrl("id");
            articleInfo.Title = linkToArticle.InnerText;
            articleInfo.DatePublished = DelfiWordyDateParser.Parse(dateDiv.InnerText);
            articleInfo.DateScraped = DateTime.UtcNow.AddHours(2);
            articleInfo.Id.Portal = Portal.Delfi;
            articleInfo.CommentCount = commentCountNode == null ? 0 : Convert.ToInt32(commentCountNode.InnerText.TrimStart('(').TrimEnd(')'));

            // Convert.ToInt32(null) yields 0, so a missing "id" query parameter trips this guard.
            var articleId = Convert.ToInt32(articleInfo.Url.GetQueryParameterValueFromUrl("id"));
            if (articleId == 0) throw new CommonParsingException("Article id not found");

            return articleInfo;
        }
        private static ArticleInfo ParseArticleInfoDiv(HtmlNode articleDiv)
        {
            var linkToArticle = articleDiv.SelectSingleNode("a");
            var dateDiv = articleDiv.SelectSingleNode("div[@class='rubrika-posted']");
            var commentCountNode = articleDiv.SelectSingleNode(".//a[@class='k']");
            if (commentCountNode == null)
            {
                // The external id is read from the comment link's href, so without it the article cannot be identified.
                throw new CommonParsingException("Article id not found");
            }

            var articleInfo = new ArticleInfo();
            articleInfo.Url = new Uri(new Uri(Lrytas.MainHost), linkToArticle.Attributes["href"].Value).ToString();
            articleInfo.Id.ExternalId = commentCountNode.Attributes["href"].Value.GetSubstringBetween("=", "&");
            articleInfo.Title = articleDiv.SelectSingleNode("h2/a").InnerText;
            articleInfo.DatePublished = DateTime.ParseExact(dateDiv.InnerText, "yyyy-MM-dd HH:mm", CultureInfo.InvariantCulture);
            articleInfo.DateScraped = DateTime.UtcNow.AddHours(2);
            articleInfo.Id.Portal = Portal.Lrytas;
            articleInfo.CommentCount = Convert.ToInt32(commentCountNode.InnerText);

            return articleInfo;
        }
        private static ArticleInfo ParseArticleInfoDiv(HtmlNode articleDiv)
        {
            var titleLink = articleDiv.SelectSingleNode(".//h4/a");
            var dateString = articleDiv.SelectSingleNode(".//em[@class='article-date']").InnerText.Trim();
            var commentLink = articleDiv.SelectSingleNode(".//a[@class='comment-text']");
            var commentCount = 0;
            if (commentLink != null)
            {
                commentCount = Convert.ToInt32(commentLink.InnerText.Replace("&nbsp;", "").Trim());
            }

            var articleInfo = new ArticleInfo();
            articleInfo.Title = titleLink.InnerText;
            articleInfo.Url = titleLink.Attributes["href"].Value;
            articleInfo.DatePublished = DateTime.ParseExact(dateString, "yyyy.MM.dd HH:mm", CultureInfo.InvariantCulture);
            articleInfo.DateScraped = DateTime.UtcNow.AddHours(2);
            articleInfo.CommentCount = commentCount;
            articleInfo.Id.Portal = Portal.PenkMin;
            // External id = the last '-'-separated segment of the URL path, with the query string stripped.
            articleInfo.Id.ExternalId = articleInfo.Url.Split('?')[0].Split('-').Last().Trim();

            return articleInfo;
        }
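The Delfi parser reads the article id through a `GetQueryParameterValueFromUrl` extension. A sketch under the assumption that it is a thin wrapper over `HttpUtility.ParseQueryString`, returning null when the key is absent (which is what makes the `Convert.ToInt32(...) == 0` guard above work, since `Convert.ToInt32(null)` yields 0):

using System.Web;

public static class UrlQueryExtensions
{
    // Hypothetical sketch: extracts a single query-string value from a full URL,
    // or null when the URL has no query string or the key is missing.
    public static string GetQueryParameterValueFromUrl(this string url, string key)
    {
        var queryStart = url.IndexOf('?');
        if (queryStart < 0) return null;
        var query = HttpUtility.ParseQueryString(url.Substring(queryStart));
        return query[key];
    }
}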