/// <summary>
/// Converts one comment HTML node into a <see cref="Comment"/> tagged with <see cref="Portal.Lrytas"/>.
/// </summary>
/// <param name="commentNode">
/// Node of a single comment. Its child <c>p</c> element and its descendant <c>div</c> elements with the
/// classes <c>comment-time</c>, <c>comment-vert1</c>, <c>comment-ip</c> and <c>comment-nr</c> are read.
/// </param>
/// <param name="id">External id of the article the comment belongs to.</param>
/// <returns>The populated comment.</returns>
private Comment ParseComment(HtmlNode commentNode, string id)
{
    var result = new Comment();
    result.ArticleExternalId = id;

    // Comment body: text of the direct <p> child, trimmed and HTML-decoded.
    var rawBody = commentNode.SelectSingleNode("p").InnerText;
    result.CommentText = HttpUtility.HtmlDecode(rawBody.Trim());

    // Creation time is parsed strictly as "yyyy-MM-dd HH:mm".
    var timeText = commentNode.SelectSingleNode(".//div[@class='comment-time']").InnerText;
    result.DateCreated = DateTime.ParseExact(timeText, "yyyy-MM-dd HH:mm", CultureInfo.InvariantCulture);

    // Scrape time is recorded as the current UTC time shifted by a fixed two hours.
    result.DateScraped = DateTime.UtcNow.AddHours(2);

    // The external comment id is the part of the link's href between "comr(" and ",'".
    var linkNode = commentNode.SelectSingleNode(".//div[@class='comment-vert1']/a");
    var href = linkNode.Attributes["href"].Value;
    result.Id.ExternalId = href.GetSubstringBetween("comr(", ",'");

    // IP address: text of the comment-ip div with the "IP:" label removed.
    var ipText = commentNode.SelectSingleNode(".//div[@class='comment-ip']").InnerText;
    result.IpAddress = ipText.Replace("IP:", "").Trim();

    result.Id.Portal = Portal.Lrytas;

    // The user name is whatever follows the first '.' in the comment-nr text.
    // When there is no '.', IndexOf returns -1 and the whole text is used.
    var title = commentNode.SelectSingleNode(".//div[@class='comment-nr']").InnerText;
    var nameStart = title.IndexOf('.') + 1;
    result.UserName = HttpUtility.HtmlDecode(title.Substring(nameStart).Trim());

    return result;
}
/// <summary>
/// Downloads the comment pages of an article that cover the 1-based comment positions
/// <paramref name="from"/>..<paramref name="to"/> (ascending order) and returns the comments in that range.
/// </summary>
/// <param name="articleInfo">Article whose <c>Url</c> is queried and whose external id is stamped on each comment.</param>
/// <param name="from">1-based position of the first comment wanted.</param>
/// <param name="to">1-based position of the last comment wanted (inclusive).</param>
/// <returns>At most <c>to - from + 1</c> comments, starting at position <paramref name="from"/>.</returns>
/// <exception cref="InvalidOperationException">
/// A downloaded page contains no script element mentioning <c>article_comments</c>.
/// </exception>
public List<Comment> ScrapeRange(ArticleInfo articleInfo, int from, int to)
{
    // The page arithmetic assumes the portal serves 50 comments per page.
    const int pageSize = 50;

    var firstPage = (from - 1) / pageSize + 1;
    var lastPage = (to - 1) / pageSize + 1;

    var comments = new List<Comment>();
    for (var page = firstPage; page <= lastPage; page++)
    {
        var url = articleInfo.Url;
        url = url.AddQueryParameterToUrl("comments", "");
        url = url.AddQueryParameterToUrl("page", page);
        url = url.AddQueryParameterToUrl("order", "ASC");

        var docNode = Utilities.DownloadPage(url);

        // SelectNodes may yield null when nothing matches, so guard before using LINQ on it.
        var scriptNodes = docNode.SelectNodes(".//script");
        var scriptNode = scriptNodes == null
            ? null
            : scriptNodes.FirstOrDefault(s => s.InnerText.Contains("article_comments"));
        if (scriptNode == null)
        {
            throw new InvalidOperationException("No script containing article_comments was found at " + url);
        }

        // The comments are embedded as a JavaScript array literal; cut it out (re-adding the
        // closing bracket consumed by the end marker), URL-decode it and parse it as JSON.
        var json = HttpUtility.UrlDecode(
            scriptNode.InnerText.GetSubstringBetween("var article_comments = ", "];") + "]");
        var commentsFromJson = JsonConvert.DeserializeObject<List<CommentFromJson>>(json);

        comments.AddRange(commentsFromJson.Select(c => new Comment
        {
            ArticleExternalId = articleInfo.Id.ExternalId,
            CommentText = c.content,
            DateCreated = ParseRelativeDate(c.date),
            // Scrape time is recorded as the current UTC time shifted by a fixed two hours.
            DateScraped = DateTime.UtcNow.AddHours(2),
            IpAddress = c.ip,
            UserName = c.name,
            Id = {ExternalId = c.id, Portal = Portal.PenkMin},
        }));
    }

    // The first downloaded page starts at a page boundary, not at 'from': drop the comments
    // that precede 'from' on that page before taking the requested amount.
    return comments.Skip((from - 1) % pageSize).Take(to - from + 1).ToList();
}
/// <summary>
/// Converts one comment HTML node into a <see cref="Comment"/> tagged with <see cref="Portal.Delfi"/>.
/// </summary>
/// <param name="commentNode">
/// Node of a single comment. Its direct children are read: the <c>comment-list-comment-anchor</c> link,
/// the <c>comment-author</c> div, the <c>comment-list-el-votes</c> div, the <c>comment-body</c> div and an
/// optional <c>fieldset</c>.
/// </param>
/// <param name="articleId">External id of the article the comment belongs to.</param>
/// <returns>The populated comment.</returns>
private static Comment ParseComment(HtmlNode commentNode, string articleId)
{
    var commentAnchor = commentNode.SelectSingleNode("a[@class='comment-list-comment-anchor']").Attributes["name"].Value;
    var authorNode = commentNode.SelectSingleNode("div[@class='comment-author']");

    // The date element's text holds the timestamp, optionally followed by "IP:" and the address.
    var dateAndIp = authorNode.SelectSingleNode("div[contains(@class, 'comm-date')]").InnerText;
    var parts = dateAndIp.Split(new[] {"IP:"}, StringSplitOptions.None);
    var dateString = parts[0].Trim();
    // Without an "IP:" marker there is no second part; use an empty address rather than failing.
    var ipString = parts.Length > 1 ? parts[1].Trim() : string.Empty;

    // The vote link's rel attribute is colon-separated; index 1 is read as upvotes, index 2 as downvotes.
    var votesString = commentNode.SelectSingleNode("div[@class='comment-list-el-votes']/a").Attributes["rel"].Value;
    var votesParts = votesString.Split(new[] {":"}, StringSplitOptions.None);

    var comment = new Comment();
    comment.ArticleExternalId = articleId;
    // NOTE(review): InnerText is stored without HtmlDecode here - confirm whether entities should be decoded.
    comment.CommentText = commentNode.SelectSingleNode("div[contains(@class, 'comment-body')]").InnerText.Trim();
    comment.DateCreated = DateTime.ParseExact(dateString, "yyyy-MM-dd HH:mm", CultureInfo.InvariantCulture);
    // Scrape time is recorded as the current UTC time shifted by a fixed two hours.
    comment.DateScraped = DateTime.UtcNow.AddHours(2);
    // Scraped numbers are machine data: parse them culture-independently.
    comment.DownVotes = Convert.ToInt32(votesParts[2], CultureInfo.InvariantCulture);
    comment.IpAddress = ipString;
    comment.Id.Portal = Portal.Delfi;
    // The anchor name carries a one-character prefix before the id.
    comment.Id.ExternalId = commentAnchor.Substring(1).Trim();
    comment.Upvotes = Convert.ToInt32(votesParts[1], CultureInfo.InvariantCulture);
    comment.UserName = authorNode.SelectSingleNode("h3").InnerText.Trim();

    // A fieldset child marks a reply; the parent comment id is the q_id query parameter of its rel URL.
    var inResponseToNode = commentNode.SelectSingleNode("fieldset");
    if (inResponseToNode != null)
    {
        var responseUrl = inResponseToNode.Attributes["rel"].Value;
        var queryString = string.Join(string.Empty, responseUrl.Split('?').Skip(1));
        var qs = HttpUtility.ParseQueryString(queryString);
        // Convert.ToInt32 yields 0 when q_id is absent (null input).
        comment.InResponseToCommentId = Convert.ToInt32(qs["q_id"], CultureInfo.InvariantCulture);
    }

    return comment;
}