public void ScrapeArticle()
{
    var articleInfo = new ArticleInfo
    {
        Url = "http://www.delfi.lt/news/daily/crime/siuolaikines-vergoves-mastai-kelia-siauba.d?id=64756213"
    };

    var article = new DelfiArticleScraper().Scrape(articleInfo);
}
public void ScrapeComments()
{
    var ai = new ArticleInfo
    {
        Url = "http://www.delfi.lt/news/daily/education/dalis-sostines-priesmokyklinuku-isikurs-kitur.d?id=64297936",
        Id = { ExternalId = "64297936" }
    };

    var scraper = new DelfiCommentsScraper();
    var comments = scraper.ScrapeRange(ai, 1, 5);
}
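The ArticleInfo type these examples pass around is not shown; its shape can be inferred from how it is used throughout the scrapers. A minimal sketch, assuming a nested identifier type (the name ArticleId and the exact property types are assumptions):

using System;

// Sketch of the data carrier used by all scrapers; fields are inferred
// from the code in this post. ArticleId is an assumed name for the
// nested identifier type; Portal is the enum used elsewhere (Delfi, Lrytas, PenkMin).
public class ArticleId
{
    public string ExternalId { get; set; }
    public Portal Portal { get; set; }
}

public class ArticleInfo
{
    // Get-only with a default instance so nested initializers like
    // Id = { ExternalId = "..." } work.
    public ArticleId Id { get; } = new ArticleId();
    public string Url { get; set; }
    public string Title { get; set; }
    public DateTime DatePublished { get; set; }
    public DateTime DateScraped { get; set; }
    public int CommentCount { get; set; }
}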
public Article Article(ArticleInfo articleInfo)
{
    try
    {
        return _articleScrapers
            .First(s => s.Portal == articleInfo.Id.Portal)
            .Scrape(articleInfo);
    }
    catch (Exception e)
    {
        e.Data["articleInfo"] = articleInfo;
        _log.Error("An error occurred while scraping article.", e);
    }

    return null;
}
public Article Scrape(ArticleInfo articleInfo)
{
    var docNode = Utilities.DownloadPage(articleInfo.Url);

    return new Article
    {
        Id = articleInfo.Id,
        CommentCount = articleInfo.CommentCount,
        Title = articleInfo.Title,
        Url = articleInfo.Url,
        DateScraped = DateTime.UtcNow.AddHours(2),
        AuthorName = GetAuthorName(docNode),
        Body = GetBody(docNode),
        DateModified = GetDateModified(docNode),
        DatePublished = GetDatePublished(docNode),
        Keywords = GetKeywords(docNode)
    };
}
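Utilities.DownloadPage is used by every scraper but is not shown above. A minimal sketch of what it might look like, assuming HtmlAgilityPack and a plain WebClient (the UTF-8 encoding and the use of WebClient are assumptions):

using System.Net;
using System.Text;
using HtmlAgilityPack;

public static class Utilities
{
    // Downloads the page at the given URL and returns the root node
    // of the parsed HTML document, ready for XPath querying.
    public static HtmlNode DownloadPage(string url)
    {
        using (var client = new WebClient { Encoding = Encoding.UTF8 })
        {
            var html = client.DownloadString(url);
            var document = new HtmlDocument();
            document.LoadHtml(html);
            return document.DocumentNode;
        }
    }
}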
public List<Comment> ScrapeRange(ArticleInfo articleInfo, int from, int to)
{
    var comments = new List<Comment>();

    // Delfi serves comments in pages of 20, addressed by the "no" offset parameter.
    for (var i = from - 1; i <= to - 1; i += 20)
    {
        var url = articleInfo.Url;
        url = url.AddQueryParameterToUrl("com", 1);
        url = url.AddQueryParameterToUrl("s", 1);
        url = url.AddQueryParameterToUrl("no", i - 1);

        var docNode = Utilities.DownloadPage(url);
        var commentNodes = docNode.SelectNodes("//ul[@id='comments-list']/li");
        comments.AddRange(commentNodes.Select(cn => ParseComment(cn, articleInfo.Id.ExternalId)));
    }

    return comments.Take(to - from + 1).ToList();
}
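AddQueryParameterToUrl is a string extension used by all of the comment scrapers; it is not shown here. A plausible sketch (the exact implementation, including URL encoding, is an assumption):

using System;

public static class UrlExtensions
{
    // Appends a name/value pair to the URL's query string, choosing
    // '?' or '&' depending on whether a query string already exists.
    public static string AddQueryParameterToUrl(this string url, string name, object value)
    {
        var separator = url.Contains("?") ? "&" : "?";
        return url + separator + name + "=" + Uri.EscapeDataString(Convert.ToString(value));
    }
}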
public List<Comment> Comments(ArticleInfo articleInfo, int from, int to)
{
    try
    {
        return _commentScrapers
            .First(s => s.Portal == articleInfo.Id.Portal)
            .ScrapeRange(articleInfo, from, to);
    }
    catch (Exception e)
    {
        e.Data["articleInfo"] = articleInfo;
        e.Data["from"] = from;
        e.Data["to"] = to;
        _log.Error("An error occurred while scraping comments.", e);
    }

    return new List<Comment>();
}
public List<Comment> ScrapeRange(ArticleInfo articleInfo, int from, int to)
{
    var comments = new List<Comment>();
    var page = 1;
    var pagesToScrape = 100;

    while (page <= pagesToScrape)
    {
        var url = Lrytas.MainHost;
        url = url.AddQueryParameterToUrl("id", articleInfo.Id.ExternalId);
        url = url.AddQueryParameterToUrl("view", 6);
        url = url.AddQueryParameterToUrl("p", page);

        var docNode = Utilities.DownloadPage(url);

        if (page == 1)
        {
            // On the first page, read the total page count from the pager;
            // Lrytas shows 25 comments per page. If the pager cannot be
            // parsed, assume there is only one page.
            try
            {
                var pageNumberNodes = docNode.SelectNodes(".//div[@class='str-pages-div']/a");
                var pageCount = Convert.ToInt32(pageNumberNodes[pageNumberNodes.Count - 2].InnerText);
                pagesToScrape = pageCount - (from / 25);
            }
            catch (Exception)
            {
                pagesToScrape = 1;
            }
        }

        var commentNodes = docNode.SelectNodes(".//div[@class='comment']");
        var scrapedComments = commentNodes.Select(cn => ParseComment(cn, articleInfo.Id.ExternalId)).ToList();
        comments.AddRange(scrapedComments);
        page++;
    }

    return comments.Take(to - from + 1).ToList();
}
public List<Comment> ScrapeRange(ArticleInfo articleInfo, int from, int to)
{
    var comments = new List<Comment>();

    // Comments are embedded in the page as a JSON array inside a script tag,
    // 50 per page, so pages are computed from the requested range.
    for (var page = (from - 1) / 50 + 1; page <= (to - 1) / 50 + 1; page++)
    {
        var url = articleInfo.Url;
        url = url.AddQueryParameterToUrl("comments", "");
        url = url.AddQueryParameterToUrl("page", page);
        url = url.AddQueryParameterToUrl("order", "ASC");

        var docNode = Utilities.DownloadPage(url);
        var scriptNode = docNode.SelectNodes(".//script")
            .FirstOrDefault(s => s.InnerText.Contains("article_comments"));
        var json = HttpUtility.UrlDecode(
            scriptNode.InnerText.GetSubstringBetween("var article_comments = ", "];") + "]");
        var commentsFromJson = JsonConvert.DeserializeObject<List<CommentFromJson>>(json);

        comments.AddRange(commentsFromJson.Select(c =>
        {
            var comment = new Comment
            {
                ArticleExternalId = articleInfo.Id.ExternalId,
                CommentText = c.content,
                DateCreated = ParseRelativeDate(c.date),
                DateScraped = DateTime.UtcNow.AddHours(2),
                IpAddress = c.ip,
                UserName = c.name,
                Id = { ExternalId = c.id, Portal = Portal.PenkMin }
            };
            return comment;
        }));
    }

    return comments.Take(to - from + 1).ToList();
}
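CommentFromJson is the DTO that Json.NET deserializes the article_comments array into. Its shape follows from the fields used above; the property names mirror the JSON keys, while the exact types are assumptions:

// DTO matching the objects in the article_comments JSON array.
// Property names are lower-case to match the JSON keys as used above.
public class CommentFromJson
{
    public string id { get; set; }
    public string name { get; set; }
    public string content { get; set; }
    public string date { get; set; }
    public string ip { get; set; }
}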
private static ArticleInfo ParseArticleInfoDiv(HtmlNode articleDiv) { var linkToArticle = articleDiv.SelectSingleNode("h3/a"); var dateDiv = articleDiv.SelectSingleNode("div[@class='headline-date']"); var commentCountNode = articleDiv.SelectSingleNode("h3/a[@class='commentCount']"); var articleInfo = new ArticleInfo(); articleInfo.Url = linkToArticle.Attributes["href"].Value; if (articleInfo.Url.Contains(@"/video/")) { throw new CommonParsingException("Delfi TV article"); } articleInfo.Id.ExternalId = articleInfo.Url.GetQueryParameterValueFromUrl("id"); articleInfo.Title = linkToArticle.InnerText; articleInfo.DatePublished = DelfiWordyDateParser.Parse(dateDiv.InnerText); articleInfo.DateScraped = DateTime.UtcNow.AddHours(2); articleInfo.Id.Portal = Portal.Delfi; articleInfo.CommentCount = commentCountNode == null ? 0 : Convert.ToInt32(commentCountNode.InnerText.TrimStart('(').TrimEnd(')')); var articleId = Convert.ToInt32(articleInfo.Url.GetQueryParameterValueFromUrl("id")); if (articleId == 0) throw new CommonParsingException("Article id not found"); return articleInfo; }
private static ArticleInfo ParseArticleInfoDiv(HtmlNode articleDiv) { var linkToArticle = articleDiv.SelectSingleNode("a"); var dateDiv = articleDiv.SelectSingleNode("div[@class='rubrika-posted']"); var commentCountNode = articleDiv.SelectSingleNode(".//a[@class='k']"); if (commentCountNode == null) { throw new CommonParsingException("Article id not found"); } var articleInfo = new ArticleInfo(); articleInfo.Url = new Uri (new Uri(Lrytas.MainHost), linkToArticle.Attributes["href"].Value).ToString(); articleInfo.Id.ExternalId = commentCountNode.Attributes["href"].Value.GetSubstringBetween("=", "&"); articleInfo.Title = articleDiv.SelectSingleNode("h2/a").InnerText; articleInfo.DatePublished = DateTime.ParseExact(dateDiv.InnerText, "yyyy-MM-dd HH:mm", CultureInfo.InvariantCulture); articleInfo.DateScraped = DateTime.UtcNow.AddHours(2); articleInfo.Id.Portal = Portal.Lrytas; articleInfo.CommentCount = Convert.ToInt32(commentCountNode.InnerText); return articleInfo; }
private static ArticleInfo ParseArticleInfoDiv(HtmlNode articleDiv)
{
    var titleLink = articleDiv.SelectSingleNode(".//h4/a");
    var dateString = articleDiv.SelectSingleNode(".//em[@class='article-date']").InnerText.Trim();
    var commentLink = articleDiv.SelectSingleNode(".//a[@class='comment-text']");

    var commentCount = 0;
    if (commentLink != null)
    {
        commentCount = Convert.ToInt32(commentLink.InnerText.Replace(" ", "").Trim());
    }

    var articleInfo = new ArticleInfo();
    articleInfo.Title = titleLink.InnerText;
    articleInfo.Url = titleLink.Attributes["href"].Value;
    articleInfo.DatePublished = DateTime.ParseExact(dateString, "yyyy.MM.dd HH:mm", CultureInfo.InvariantCulture);
    articleInfo.DateScraped = DateTime.UtcNow.AddHours(2);
    articleInfo.CommentCount = commentCount;
    articleInfo.Id.Portal = Portal.PenkMin;
    // The external id is the trailing segment of the URL slug, before any query string.
    articleInfo.Id.ExternalId = articleInfo.Url
        .Split(new[] { '?' }, StringSplitOptions.None)[0]
        .Split(new[] { "-" }, StringSplitOptions.None)
        .Last()
        .Trim();

    return articleInfo;
}