/// <summary>
/// Populates a <see cref="User"/> from the label/value pairs found among the direct
/// children of <paramref name="node"/>.
/// </summary>
/// <remarks>
/// Every field is located by searching the child nodes for one whose inner text equals
/// a label constant from <c>ParsingHelper</c>; the value is read from the sibling(s)
/// that follow the label. A field whose label or value node is missing is left untouched.
/// </remarks>
/// <param name="node">Node whose child nodes hold the labels and values; checked with <c>NullCheck()</c>.</param>
/// <param name="user">User to fill in; a new instance is created when this is <c>null</c>.</param>
/// <returns>The populated user (the instance passed in, when there was one).</returns>
private User ScrapeUserData(HtmlNode node, User user)
{
    node.NullCheck();
    user = user ?? new User();

    // Finds the direct child whose inner text is exactly the given label.
    Func<string, HtmlNode> findLabel =
        label => node.ChildNodes.FirstOrDefault(x => x.InnerText == label);

    // name
    HtmlNode nameNode = findLabel(ParsingHelper.UserPageName);
    if (nameNode != null && nameNode.NextSibling != null)
    {
        user.Name = nameNode.NextSibling.InnerText.SafeTrim();
    }

    // email - the value sits two siblings after the label
    HtmlNode emailNode = findLabel(ParsingHelper.UserPageEmail);
    if (emailNode != null && emailNode.NextSibling != null && emailNode.NextSibling.NextSibling != null)
    {
        user.Email = emailNode.NextSibling.NextSibling.InnerText.SafeTrim();
    }

    // gender
    HtmlNode genderNode = findLabel(ParsingHelper.UserPageGender);
    if (genderNode != null && genderNode.NextSibling != null)
    {
        string genderString = genderNode.NextSibling.InnerText.SafeTrim();
        if (genderString == ParsingHelper.GenderMale)
        {
            user.Gender = UserGenderEnum.Male;
        }
        else if (genderString == ParsingHelper.GenderFemale)
        {
            user.Gender = UserGenderEnum.Female;
        }
        else
        {
            user.Gender = UserGenderEnum.NotSet;
            this._logger.DebugFormat("ScrapeUserData, Gender not set - STRING: {0}", genderString);
        }
    }

    // birthdate
    HtmlNode birthdateNode = findLabel(ParsingHelper.UserPageBirthDate);
    if (birthdateNode != null && birthdateNode.NextSibling != null)
    {
        DateTime birthdate;
        if (birthdateNode.NextSibling.InnerText.TryParseLogging(out birthdate))
        {
            user.Birthdate = birthdate;
        }
    }

    // Counters: the first run of digits in the sibling that follows the label.
    Regex digitsRegex = new Regex(@"(?<digits>\d+)");
    Action<string, Action<int>> assignCount = (label, assign) =>
    {
        HtmlNode labelNode = findLabel(label);
        if (labelNode == null || labelNode.NextSibling == null)
        {
            return;
        }

        Match match = digitsRegex.Match(labelNode.NextSibling.InnerText);
        if (!match.Success)
        {
            return;
        }

        // TryParse instead of Parse: an over-long digit run (or non-ASCII digits,
        // which \d also matches) must not abort the whole scrape with an exception.
        int count;
        string digits = match.Groups["digits"].Value;
        if (int.TryParse(digits, NumberStyles.None, CultureInfo.InvariantCulture, out count))
        {
            assign(count);
        }
        else
        {
            this._logger.DebugFormat("ScrapeUserData, Count not parsed - STRING: {0}", digits);
        }
    };

    assignCount(ParsingHelper.UserPageForumPosts, count => user.ForumPosts = count);
    assignCount(ParsingHelper.UserPageBlogPosts, count => user.BlogPosts = count);
    assignCount(ParsingHelper.UserPagePicturePosts, count => user.PublishedPictures = count);
    assignCount(ParsingHelper.UserPageCommentPosts, count => user.PublishedComments = count);
    assignCount(ParsingHelper.UserPageVideoPosts, count => user.PublishedVideos = count);

    // registered
    HtmlNode registeredNode = findLabel(ParsingHelper.UserPageRegistered);
    if (registeredNode != null && registeredNode.NextSibling != null)
    {
        DateTime registered;
        if (registeredNode.NextSibling.InnerText.TryParseLogging(out registered))
        {
            user.DateCreated = registered;
        }
    }

    // description - concatenation of the non-empty texts of the siblings after the label
    HtmlNode descriptionNode = findLabel(ParsingHelper.UserPageDescription);
    if (descriptionNode != null && descriptionNode.NextSibling != null)
    {
        StringBuilder description = new StringBuilder();
        HtmlNode sibling = descriptionNode.NextSibling;
        do
        {
            string text = sibling.InnerText.SafeTrim();
            if (text.Length > 0)
            {
                description.AppendFormat("{0} ", text);
            }

            sibling = sibling.NextSibling;
        }
        // The null test prevents a NullReferenceException when the label has a single
        // following sibling. The walk still stops before the final sibling of a longer
        // run, exactly as before.
        // NOTE(review): confirm whether excluding the last sibling is intentional.
        while (sibling != null && sibling.NextSibling != null);

        user.Description = description.ToString().SafeTrim();
    }

    return user;
}
/// <summary>
/// Builds a <see cref="Comment"/> from a single comment node.
/// </summary>
/// <remarks>
/// Reads the user link and comment link from the header, the creation time from the
/// header's last child, the text from the content node and the rating as the sum of the
/// plus and minus values. A missing piece of markup is logged and the corresponding
/// property is left at its default; it does not abort the scrape.
/// </remarks>
/// <param name="commentNode">Root node of one comment; checked with <c>NullCheck()</c>.</param>
/// <returns>The scraped comment, with <c>AccessedDate</c> set to the current UTC time.</returns>
private Comment ScrapeComment(HtmlNode commentNode)
{
    commentNode.NullCheck();

    Comment comment = new Comment()
    {
        AccessedDate = DateTime.UtcNow
    };

    HtmlNode headerNode = commentNode.SelectSingleNode(CommentsPageXPath.HeaderInfo);
    if (headerNode != null)
    {
        // userUrl, url, username, id - expected as exactly two links: [0] user, [1] comment
        IList<HtmlNode> headerLinks = headerNode.ChildNodes.Where(x => x.Name == "a").ToList();
        if (headerLinks.Count == 2)
        {
            HtmlNode userLink = headerLinks[0];
            HtmlNode commentLink = headerLinks[1];

            if (userLink.Attributes["href"] != null)
            {
                comment.UserUrl = userLink.Attributes["href"].Value.SafeTrim().ToFullRtvSloUrl();
                comment.UserId = this.GetIdStringFromUrl(comment.UserUrl);
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapeComment - User url is null - NODE: {0}", commentNode.SerializeHtmlNode());
            }

            comment.UserName = userLink.InnerText.SafeTrim();

            if (commentLink.Attributes["href"] != null)
            {
                comment.Url = commentLink.Attributes["href"].Value.SafeTrim().ToFullRtvSloUrl();
                comment.Id = this.GetIdFromUrl(comment.Url);
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapeComment - Comment url is null - NODE: {0}", commentNode.SerializeHtmlNode());
            }
        }

        // created date time - text of the header's last child
        HtmlNode createdNode = headerNode.LastChild;
        if (createdNode != null)
        {
            string dateCreatedString = createdNode.InnerText.SafeTrim();
            DateTime created;
            if (dateCreatedString.TryParseExactLogging(ParsingHelper.ShortDateTimeParseExactPattern, this.cultureInfo, DateTimeStyles.None, out created))
            {
                comment.DateCreated = created.ToUniversalTime();
            }
        }
        else
        {
            this._logger.ErrorFormat("ScrapingService, ScrapeComment - Created date node is null - NODE: {0}", commentNode.SerializeHtmlNode());
        }
    }
    else
    {
        // SelectSingleNode returns null when nothing matches; log like the other
        // missing-markup cases instead of failing with a NullReferenceException.
        this._logger.ErrorFormat("ScrapingService, ScrapeComment - Header node is null - NODE: {0}", commentNode.SerializeHtmlNode());
    }

    // content
    HtmlNode contentNode = commentNode.SelectSingleNode(CommentsPageXPath.Content);
    if (contentNode != null)
    {
        comment.Content = contentNode.InnerText.SafeTrimAndEscapeHtml();
    }
    else
    {
        this._logger.ErrorFormat("ScrapingService, ScrapeComment - Comment content is null - URL: {0}", comment.Url);
    }

    // rating - sum of the plus and minus values
    HtmlNode ratingNode = commentNode.SelectSingleNode(CommentsPageXPath.Rating);
    HtmlNode plusRatingNode = ratingNode != null ? ratingNode.SelectSingleNode(CommentsPageXPath.PlusRating) : null;
    HtmlNode minusRatingNode = ratingNode != null ? ratingNode.SelectSingleNode(CommentsPageXPath.MinusRating) : null;
    if (plusRatingNode != null && minusRatingNode != null)
    {
        string plusRatingString = plusRatingNode.InnerText.SafeTrim();
        string minusRatingString = minusRatingNode.InnerText.SafeTrim();

        int plusRating = this.ScrapeCommentRating(plusRatingString, comment.Url);
        int minusRating = this.ScrapeCommentRating(minusRatingString, comment.Url);
        comment.Rating = plusRating + minusRating;
    }
    else
    {
        this._logger.ErrorFormat("ScrapingService, ScrapeComment - Comment rating is null - URL: {0}", comment.Url);
    }

    return comment;
}
/// <summary>
/// Fills a <see cref="Post"/> from the content node of a post page: title, subtitle,
/// authors, created/updated timestamps, location, abstract and main text.
/// </summary>
/// <remarks>
/// Missing markup is logged and the affected property is left as it was. Extracting
/// the abstract removes its strong element from <paramref name="contentNode"/>'s tree,
/// so the node is modified by this call.
/// </remarks>
/// <param name="contentNode">Root content node of the post page; checked with <c>NullCheck()</c>.</param>
/// <param name="post">Post to fill in; a new instance is created when this is <c>null</c>.</param>
/// <returns>The populated post (the instance passed in, when there was one).</returns>
private Post ScrapePost(HtmlNode contentNode, Post post)
{
    post = post ?? new Post();
    contentNode.NullCheck();

    // title
    HtmlNode titleNode = contentNode.SelectSingleNode(PostPageXPath.Title);
    if (titleNode != null)
    {
        post.Title = titleNode.InnerText.SafeTrimAndEscapeHtml();
    }
    else
    {
        this._logger.ErrorFormat("ScrapingService, ScrapePost, Title node is null - URL: {0}, NODE: {1}", post.Url, contentNode.SerializeHtmlNode());
    }

    // subtitle
    HtmlNode subtitleNode = contentNode.SelectSingleNode(PostPageXPath.Subtitle);
    if (subtitleNode != null)
    {
        post.Subtitle = subtitleNode.InnerText.SafeTrimAndEscapeHtml();
    }
    else
    {
        this._logger.ErrorFormat("ScrapingService, ScrapePost, Subtitle node is null - URL: {0}, NODE: {1}", post.Url, contentNode.SerializeHtmlNode());
    }

    // author - every <b> child that does not contain a link is taken as one author name
    HtmlNode authorNode = contentNode.SelectSingleNode(PostPageXPath.Auhtor);
    if (authorNode != null)
    {
        IList<HtmlNode> authorNameNodes = authorNode.ChildNodes
            .Where(x => x.Name == "b" && !x.ChildNodes.Any(t => t.Name == "a"))
            .ToList();

        foreach (HtmlNode author in authorNameNodes)
        {
            //TODO http://www.rtvslo.si/mmc-priporoca/dame-niso-sposobne-zmagati-na-dirki-formule-ena/306771
            User authorUser = new User()
            {
                Name = author.InnerText.SafeTrim().Replace(",", string.Empty).Replace("foto:", string.Empty).SafeTrimAndEscapeHtml(),
                Function = UserFunctionEnum.Journalist
            };

            post.Authors.Add(authorUser);
        }
    }

    if (post.Authors.IsEmpty())
    {
        this._logger.WarnFormat("ScrapingService, ScrapePost, Author is empty - URL: {0}", post.Url);
    }

    // info
    HtmlNode infoNode = contentNode.SelectSingleNode(PostPageXPath.InfoContent);
    if (infoNode != null)
    {
        // <div class="info">16. februar 2013 ob 07:22,<br>zadnji poseg: 16. februar 2013 ob 15:16<br>Schladming - MMC RTV SLO</div>
        // Text nodes: first = created, last = location, and (when there are three) middle = updated.
        IList<HtmlNode> textNodes = infoNode.ChildNodes.Where(x => x.Name == "#text").ToList();
        if (textNodes.Count > 1)
        {
            // Created datetime; also the initial value of LastUpdated
            string createdDateTimeString = textNodes.First().InnerText.SafeTrim();
            DateTime createdDate;
            if (createdDateTimeString.TryParseExactLogging(ParsingHelper.LongDateTimeParseExactPattern, this.cultureInfo, DateTimeStyles.None, out createdDate))
            {
                post.DateCreated = createdDate.ToUniversalTime();
                post.LastUpdated = createdDate.ToUniversalTime();
            }

            // Location
            string locationString = textNodes.Last().InnerText;
            IList<string> locationList = locationString.Split(new string[] { "-" }, StringSplitOptions.RemoveEmptyEntries).ToList();
            if (locationList.Count > 1)
            {
                post.Location = locationList.First().SafeTrim();
                if (locationList.Last().SafeTrim() != "MMC RTV SLO")
                {
                    this._logger.DebugFormat("ScrapingService, ScrapePost, InfoNode, Location - URL: {0}, LIST: {1}", post.Url, locationList.SerializeObject());
                }
            }
            else
            {
                this._logger.WarnFormat("ScrapingService, ScrapePost, InfoNode, Location - URL: {0}, NODE: {1}", post.Url, infoNode.SerializeHtmlNode());
            }

            if (textNodes.Count == 3)
            {
                // Updated datetime
                string updatedDateTimeString = textNodes[1].InnerText.SafeTrim();
                Regex dateTimeRegex = new Regex(@"(?<date>[0-9\.]+[\w+\s+]+[0-9\:]+)", RegexOptions.IgnoreCase); //TODO fix regex
                Match dateTimeMatch = dateTimeRegex.Match(updatedDateTimeString);
                if (dateTimeMatch.Success)
                {
                    updatedDateTimeString = dateTimeMatch.Groups["date"].Value;
                    DateTime updatedDate;
                    if (updatedDateTimeString.TryParseExactLogging(ParsingHelper.LongDateTimeParseExactPattern, this.cultureInfo, DateTimeStyles.None, out updatedDate))
                    {
                        // The update time belongs in LastUpdated; writing it to
                        // DateCreated would lose the real creation time.
                        post.LastUpdated = updatedDate.ToUniversalTime();
                    }
                }
            }
        }
        else
        {
            this._logger.ErrorFormat("ScrapingService, ScrapePost, InfoNode - URL: {0}, NODE: {1}", post.Url, infoNode.SerializeHtmlNode());
        }
    }

    // Main content: starts at the first <p> whose first child is not a <div>, then takes
    // every following <p>/<div> of that kind until the author <div> is reached.
    IList<HtmlNode> contentNodes = new List<HtmlNode>();
    foreach (HtmlNode node in contentNode.ChildNodes)
    {
        // ends with author
        if (node.Name == "div" && node.Attributes.Any(x => x.Value == "author"))
        {
            break;
        }

        // Empty elements (no first child) are skipped; previously an empty leading <p>
        // caused a NullReferenceException.
        if (node.FirstChild == null || node.FirstChild.Name == "div")
        {
            continue;
        }

        bool isParagraph = node.Name == "p";
        bool started = contentNodes.Count > 0;
        if (isParagraph || (started && node.Name == "div"))
        {
            contentNodes.Add(node);
        }
    }

    if (contentNodes.Count > 0)
    {
        // Abstract - text inside the strong tag of the first node
        HtmlNode abstractNode = contentNodes.First();
        HtmlNode strongAbstractNode = abstractNode.ChildNodes.FirstOrDefault(x => x.Name == "strong");
        if (strongAbstractNode != null)
        {
            post.Abstract = strongAbstractNode.InnerText.SafeTrimAndEscapeHtml();

            // remove abstract from main content
            abstractNode.ChildNodes.Remove(strongAbstractNode);
        }
        else
        {
            this._logger.WarnFormat("ScrapingService, ScrapePost - Abstract node is null - URL: {0}", post.Url);
        }

        // Content
        StringBuilder content = new StringBuilder();
        foreach (HtmlNode node in contentNodes)
        {
            // child by child, to get white space after paragraph title
            foreach (HtmlNode childNode in node.ChildNodes)
            {
                string text = childNode.InnerText.SafeTrimAndEscapeHtml();
                if (text.Length > 0)
                {
                    content.AppendFormat("{0} ", text);
                }
            }
        }

        post.Content = content.ToString().SafeTrim();
    }
    else
    {
        this._logger.ErrorFormat("ScrapingService, ScrapePost - Post content is null - URL: {0}, NODE: {1}", post.Url, contentNode.SerializeHtmlNode());
    }

    return post;
}
/// <summary>
/// Reads a user's average rating and number of ratings from the rating text node.
/// </summary>
/// <remarks>
/// The node must have exactly two strong child elements: the first holds the rating,
/// the second the number of ratings. A value that cannot be parsed is logged and the
/// corresponding property is left unchanged.
/// </remarks>
/// <param name="node">Rating node; checked with <c>NullCheck()</c>.</param>
/// <param name="user">User to fill in; a new instance is created when this is <c>null</c>.</param>
/// <returns>The updated user (the instance passed in, when there was one).</returns>
/// <exception cref="IndexOutOfRangeException">The node does not have exactly two strong children.</exception>
private User ScrapeUserRating(HtmlNode node, User user)
{
    // Expected markup:
    // <div id="rate_text" style="font-weight:normal;color:#000000;font:9px Arial;">Ocena <strong>4.5</strong> od <strong>642</strong> glasov</div>
    node.NullCheck();
    user = user ?? new User();

    IList<HtmlNode> valueNodes = node.ChildNodes.Where(x => x.Name == "strong").ToList();
    if (valueNodes.Count != 2)
    {
        // Exception type kept for existing callers; the message makes the failure diagnosable.
        throw new IndexOutOfRangeException(string.Format(
            CultureInfo.InvariantCulture,
            "ScrapeUserRating expected 2 <strong> nodes but found {0}.",
            valueNodes.Count));
    }

    // TryParse already rejects null/empty input, so no separate emptiness check is needed.
    string ratingText = valueNodes[0].InnerHtml;
    decimal rating;
    if (decimal.TryParse(ratingText, NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out rating))
    {
        user.Rating = rating;
    }
    else
    {
        this._logger.DebugFormat("ScrapeUserRating, Rating not parsed - STRING: {0}", ratingText);
    }

    // Parsed with the invariant culture, like the rating above, so the result does not
    // depend on the culture of the running thread.
    string ratingsText = valueNodes[1].InnerHtml;
    int ratings;
    if (int.TryParse(ratingsText, NumberStyles.Integer, CultureInfo.InvariantCulture, out ratings))
    {
        user.NumOfRatings = ratings;
    }
    else
    {
        this._logger.DebugFormat("ScrapeUserRating, Number of ratings not parsed - STRING: {0}", ratingsText);
    }

    return user;
}