Ejemplo n.º 1
0
        /// <summary>
        /// Scrapes the title, subtitle, authors, info block (created/updated time and location),
        /// abstract and main content of a post from its content node.
        /// </summary>
        /// <param name="contentNode">HTML node holding the post content; validated with NullCheck().</param>
        /// <param name="post">Post to fill; a new <see cref="Post"/> is created when null.</param>
        /// <returns>The post populated with whatever could be scraped; missing pieces are logged.</returns>
        private Post ScrapePost(HtmlNode contentNode, Post post)
        {
            post = post ?? new Post();

            contentNode.NullCheck();

            // title
            HtmlNode titleNode = contentNode.SelectSingleNode(PostPageXPath.Title);
            if (titleNode != null)
            {
                post.Title = titleNode.InnerText.SafeTrimAndEscapeHtml();
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapePost, Title node is null - URL: {0}, NODE: {1}", post.Url, contentNode.SerializeHtmlNode());
            }

            // subtitle
            HtmlNode subtitleNode = contentNode.SelectSingleNode(PostPageXPath.Subtitle);
            if (subtitleNode != null)
            {
                post.Subtitle = subtitleNode.InnerText.SafeTrimAndEscapeHtml();
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapePost, Subtitle node is null - URL: {0}, NODE: {1}", post.Url, contentNode.SerializeHtmlNode());
            }

            // author
            HtmlNode authorNode = contentNode.SelectSingleNode(PostPageXPath.Auhtor);
            if (authorNode != null)
            {
                // Author names are the <b> children that do not wrap a link.
                IList<HtmlNode> authorNameNodes = authorNode.ChildNodes
                    .Where(x => x.Name == "b" && !x.ChildNodes.Any(t => t.Name == "a"))
                    .ToList();

                foreach (HtmlNode author in authorNameNodes)
                {
                    //TODO http://www.rtvslo.si/mmc-priporoca/dame-niso-sposobne-zmagati-na-dirki-formule-ena/306771
                    User authorUser = new User()
                    {
                        Name = author.InnerText.SafeTrim().Replace(",", string.Empty).Replace("foto:", string.Empty).SafeTrimAndEscapeHtml(),
                        Function = UserFunctionEnum.Journalist
                    };

                    post.Authors.Add(authorUser);
                }
            }

            if (post.Authors.IsEmpty())
            {
                this._logger.WarnFormat("ScrapingService, ScrapePost, Author is empty - URL: {0}", post.Url);
            }

            // info
            HtmlNode infoNode = contentNode.SelectSingleNode(PostPageXPath.InfoContent);
            if (infoNode != null)
            {
                // <div class="info">16. februar 2013 ob 07:22,<br>zadnji poseg: 16. februar 2013 ob 15:16<br>Schladming - MMC RTV SLO</div>

                IList<HtmlNode> textNodes = infoNode.ChildNodes.Where(x => x.Name == "#text").ToList();
                if (textNodes.Count > 1)
                {
                    // Created datetime - first text node
                    string createdDateTimeString = textNodes.First().InnerText.SafeTrim();

                    DateTime createdDate;
                    if (createdDateTimeString.TryParseExactLogging(ParsingHelper.LongDateTimeParseExactPattern, this.cultureInfo, DateTimeStyles.None, out createdDate))
                    {
                        // LastUpdated starts out equal to the created date and is
                        // overwritten below when an explicit update time is present.
                        post.DateCreated = createdDate.ToUniversalTime();
                        post.LastUpdated = createdDate.ToUniversalTime();
                    }

                    // Location - last text node, formatted as "<location> - <source>"
                    string locationString = textNodes.Last().InnerText;
                    IList<string> locationList = locationString.Split(new string[] { "-" }, StringSplitOptions.RemoveEmptyEntries).ToList();
                    if (locationList.Count > 1)
                    {
                        post.Location = locationList.First().SafeTrim();

                        if (locationList.Last().SafeTrim() != "MMC RTV SLO")
                        {
                            this._logger.DebugFormat("ScrapingService, ScrapePost, InfoNode, Location - URL: {0}, LIST: {1}", post.Url, locationList.SerializeObject());
                        }
                    }
                    else
                    {
                        this._logger.WarnFormat("ScrapingService, ScrapePost, InfoNode, Location - URL: {0}, NODE: {1}", post.Url, infoNode.SerializeHtmlNode());
                    }

                    if (textNodes.Count == 3)
                    {
                        // Updated datetime - middle text node
                        string updatedDateTimeString = textNodes[1].InnerText.SafeTrim();

                        Regex dateTimeRegex = new Regex(@"(?<date>[0-9\.]+[\w+\s+]+[0-9\:]+)", RegexOptions.IgnoreCase);

                        //TODO fix regex
                        Match dateTimeMatch = dateTimeRegex.Match(updatedDateTimeString);

                        if (dateTimeMatch.Success)
                        {
                            updatedDateTimeString = dateTimeMatch.Groups["date"].Value;

                            DateTime updatedDate;
                            if (updatedDateTimeString.TryParseExactLogging(ParsingHelper.LongDateTimeParseExactPattern, this.cultureInfo, DateTimeStyles.None, out updatedDate))
                            {
                                // The update time belongs in LastUpdated; writing it to
                                // DateCreated would clobber the created date parsed above.
                                post.LastUpdated = updatedDate.ToUniversalTime();
                            }
                        }
                    }
                }
                else
                {
                    this._logger.ErrorFormat("ScrapingService, ScrapePost, InfoNode - URL: {0}, NODE: {1}", post.Url, infoNode.SerializeHtmlNode());
                }
            }

            // Main content
            IList<HtmlNode> contentNodes = new List<HtmlNode>();
            foreach (HtmlNode node in contentNode.ChildNodes)
            {
                // The author div marks the end of the main content.
                if (node.Name == "div" && node.Attributes.FirstOrDefault(x => x.Value == "author") != null)
                {
                    break;
                }

                // Skip childless nodes (dereferencing FirstChild on them would throw)
                // and nodes whose first child is a div.
                if (node.FirstChild == null || node.FirstChild.Name == "div")
                {
                    continue;
                }

                // Content has to start with a <p>; once started, <p> and <div> nodes are both collected.
                if (node.Name == "p" || (node.Name == "div" && contentNodes.Count > 0))
                {
                    contentNodes.Add(node);
                }
            }

            if (!contentNodes.IsEmpty())
            {
                // Abstract - text inside the strong tag of the first content node
                HtmlNode abstractNode = contentNodes.First();
                HtmlNode strongAbstractNode = abstractNode.ChildNodes.FirstOrDefault(x => x.Name == "strong");
                if (strongAbstractNode != null)
                {
                    post.Abstract = strongAbstractNode.InnerText.SafeTrimAndEscapeHtml();

                    // remove abstract from main content
                    abstractNode.ChildNodes.Remove(strongAbstractNode);
                }
                else
                {
                    this._logger.WarnFormat("ScrapingService, ScrapePost, Abstract node is null - URL: {0}", post.Url);
                }

                // Content
                StringBuilder content = new StringBuilder();

                foreach (HtmlNode node in contentNodes)
                {
                    // Append child by child to get white space after paragraph titles.
                    foreach (HtmlNode childNode in node.ChildNodes)
                    {
                        string text = childNode.InnerText.SafeTrimAndEscapeHtml();
                        if (text.Length > 0)
                        {
                            content.AppendFormat("{0} ", text);
                        }
                    }
                }

                post.Content = content.ToString().SafeTrim();
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapePost - Post content is null - URL: {0}, NODE: {1}", post.Url, contentNode.SerializeHtmlNode());
            }

            return post;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Scrapes post statistics: average rating and number of ratings, number of comments,
        /// number of Facebook likes and number of tweets.
        /// </summary>
        /// <param name="rootNode">Root HTML node of the post page; the rating and comment-count nodes are selected from it.</param>
        /// <param name="post">Post to fill; a new <see cref="Post"/> is created when null. Its Url is used to build the Facebook and Twitter request URLs.</param>
        /// <returns>The post with whichever statistics could be parsed; failures are logged.</returns>
        private Post ScrapePostStatistics(HtmlNode rootNode, Post post)
        {
            post = post ?? new Post();

            // rating
            HtmlNode ratingNode = rootNode.SelectSingleNode(PostPageXPath.RatingContent);
            if (ratingNode != null)
            {
                string ratingContent = ratingNode.InnerText;
                Regex ratingRegex = new Regex(@"\w+\s+(?<rating>[0-9\,]+)\s+\w+\s+(?<numRatings>[0-9]+)", RegexOptions.IgnoreCase);
                Match ratingMatch = ratingRegex.Match(ratingContent);

                if (ratingMatch.Success)
                {
                    decimal rating;
                    int numRatings;
                    if (ratingMatch.Groups["rating"].Value.TryParseLogging(out rating))
                    {
                        post.AvgRating = rating;
                    }

                    if (ratingMatch.Groups["numRatings"].Value.TryParseLogging(out numRatings))
                    {
                        post.NumOfRatings = numRatings;
                    }
                }
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, Rating node is null - URL: {0}, NODE: {1}", post.Url, rootNode.SerializeHtmlNode());
            }

            // num of comments - expected as "(<number>)"
            HtmlNode numOfCommentsNode = rootNode.SelectSingleNode(PostPageXPath.NumOfComments);
            string numOfCommentsText = numOfCommentsNode != null ? numOfCommentsNode.InnerText : null;
            if (!string.IsNullOrEmpty(numOfCommentsText) &&
                numOfCommentsText.StartsWith("(", StringComparison.Ordinal) &&
                numOfCommentsText.EndsWith(")", StringComparison.Ordinal))
            {
                int numOfComments;
                string numOfCommentsString = numOfCommentsText.Replace("(", string.Empty).Replace(")", string.Empty);
                if (int.TryParse(numOfCommentsString, out numOfComments))
                {
                    post.NumOfComments = numOfComments;
                }
                else
                {
                    this._logger.WarnFormat("ScrapingService, ScrapePostStatistics, NumOfComments parsing: {2} - URL: {0}, NODE: {1}", post.Url, rootNode.SerializeHtmlNode(), numOfCommentsString);
                }
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, NumOfComments - URL: {0}, NODE: {1}", post.Url, rootNode.SerializeHtmlNode());
            }

            // FB social plugin
            // https://www.facebook.com/plugins/like.php?href=http%3A%2F%2Fwww.rtvslo.si%2Fsport%2Fodbojka%2Fvodebova-in-fabjanova-ubranili-naslov-drzavnih-prvakinj%2F314078&layout=button_count

            string fbUrlPattern = "http://www.facebook.com/plugins/like.php?href={0}&layout=button_count";
            string encodedUrl = HttpUtility.UrlEncode(post.Url);

            string fbUrl = string.Format(fbUrlPattern, encodedUrl);
            string fbPluginPage = this.CreateWebRequest(new Uri(fbUrl));

            if (!string.IsNullOrEmpty(fbPluginPage))
            {
                HtmlNode fbRootNode = fbPluginPage.CreateRootNode();
                if (fbRootNode != null)
                {
                    int fbLikes;

                    HtmlNode fbLikesNode = fbRootNode.SelectSingleNode(PostPageXPath.FbLikes);
                    if (fbLikesNode != null && !string.IsNullOrEmpty(fbLikesNode.InnerText) && int.TryParse(fbLikesNode.InnerText, out fbLikes))
                    {
                        post.NumOfFbLikes = fbLikes;
                    }
                    else
                    {
                        this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, FbLikes - POST URL: {0}, FB URL: {1}, NODE: {2}", post.Url, fbUrl, fbRootNode.SerializeHtmlNode());
                    }
                }
                else
                {
                    // The root node is null here, so log the raw page text instead of
                    // trying to serialize a null node.
                    this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, FbLikes Node NULL - POST URL: {0}, FB URL: {1}, NODE: {2}", post.Url, fbUrl, fbPluginPage);
                }
            }

            // Tweeter social plugin
            // http://platform.twitter.com/widgets/tweet_button.1375828408.html?url=http%3A%2F%2Fwww.rtvslo.si%2Fzabava%2Fiz-sveta-znanih%2Fboy-george-napadel-isinbajevo-zaradi-homofobnih-izjav%2F315495
            // http://cdn.api.twitter.com/1/urls/count.json?url=http%3A%2F%2Fwww.rtvslo.si%2Fzabava%2Fiz-sveta-znanih%2Fboy-george-napadel-isinbajevo-zaradi-homofobnih-izjav%2F315495&callback=twttr.receiveCount

            string twUrlPattern = "http://cdn.api.twitter.com/1/urls/count.json?url={0}";

            string twUrl = string.Format(twUrlPattern, encodedUrl);
            string twJsonPage = this.CreateWebRequest(new Uri(twUrl));

            if (string.IsNullOrEmpty(twJsonPage))
            {
                // The same guard is applied to the Facebook response above. Without it a
                // null response reaches JObject.Parse, which fails with an exception that
                // the JsonReaderException handler below does not catch.
                this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, NumOfTweets empty response - POST URL: {0}, TW URL: {1}", post.Url, twUrl);
            }
            else
            {
                try
                {
                    JObject twJson = JObject.Parse(twJsonPage);
                    string countString = (string)twJson["count"];

                    int numOfTweets;
                    if (!string.IsNullOrEmpty(countString) && int.TryParse(countString, out numOfTweets))
                    {
                        post.NumOfTweets = numOfTweets;
                    }
                    else
                    {
                        this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, NumOfTweets - POST URL: {0}, TW URL: {1}, NODE: {2}", post.Url, twUrl, twJsonPage);
                    }
                }
                catch (JsonReaderException ex)
                {
                    this._logger.ErrorFormat("ScrapingService, ScrapePostStatistics, NumOfTweets Parse EXCEPTION - POST URL: {0}, TW URL: {1}, NODE: {2}, EX:{3}", post.Url, twUrl, twJsonPage, ex.Message);
                }
            }

            return post;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Scrapes a single comment: user url/id/name, comment url/id, created date, content and rating.
        /// </summary>
        /// <param name="commentNode">HTML node of the comment; validated with NullCheck().</param>
        /// <returns>
        /// A new <see cref="Comment"/> with AccessedDate set to the current UTC time and every
        /// field that could be scraped; missing nodes are logged and the matching fields left unset.
        /// </returns>
        private Comment ScrapeComment(HtmlNode commentNode)
        {
            commentNode.NullCheck();

            Comment comment = new Comment()
            {
                AccessedDate = DateTime.UtcNow,
            };

            HtmlNode headerNode = commentNode.SelectSingleNode(CommentsPageXPath.HeaderInfo);
            if (headerNode != null)
            {
                // userUrl, url, username, id - the header is expected to hold two links:
                // the first for the user, the second for the comment itself
                IList<HtmlNode> innerHeaderNodes = headerNode.ChildNodes.Where(x => x.Name == "a").ToList();
                if (innerHeaderNodes.Count == 2)
                {
                    HtmlNode userLinkNode = innerHeaderNodes[0];
                    HtmlNode commentLinkNode = innerHeaderNodes[1];

                    if (userLinkNode.Attributes["href"] != null)
                    {
                        comment.UserUrl = userLinkNode.Attributes["href"].Value.SafeTrim().ToFullRtvSloUrl();
                        comment.UserId = this.GetIdStringFromUrl(comment.UserUrl);
                    }
                    else
                    {
                        this._logger.ErrorFormat("ScrapingService, ScrapeComment - User url is null - NODE: {0}", commentNode.SerializeHtmlNode());
                    }

                    comment.UserName = userLinkNode.InnerText.SafeTrim();

                    if (commentLinkNode.Attributes["href"] != null)
                    {
                        comment.Url = commentLinkNode.Attributes["href"].Value.SafeTrim().ToFullRtvSloUrl();
                        comment.Id = this.GetIdFromUrl(comment.Url);
                    }
                    else
                    {
                        this._logger.ErrorFormat("ScrapingService, ScrapeComment - Comment url is null - NODE: {0}", commentNode.SerializeHtmlNode());
                    }
                }
                else
                {
                    this._logger.ErrorFormat("ScrapingService, ScrapeComment - Unexpected number of header links: {0} - NODE: {1}", innerHeaderNodes.Count, commentNode.SerializeHtmlNode());
                }

                // created date time - text of the header's last child
                HtmlNode dateNode = headerNode.LastChild;
                if (dateNode != null)
                {
                    string dateCreatedString = dateNode.InnerText.SafeTrim();

                    DateTime created;
                    if (dateCreatedString.TryParseExactLogging(ParsingHelper.ShortDateTimeParseExactPattern, this.cultureInfo, DateTimeStyles.None, out created))
                    {
                        comment.DateCreated = created.ToUniversalTime();
                    }
                }
                else
                {
                    this._logger.ErrorFormat("ScrapingService, ScrapeComment - Created date node is null - NODE: {0}", commentNode.SerializeHtmlNode());
                }
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapeComment - Header node is null - NODE: {0}", commentNode.SerializeHtmlNode());
            }

            // content
            HtmlNode contentNode = commentNode.SelectSingleNode(CommentsPageXPath.Content);
            if (contentNode != null)
            {
                comment.Content = contentNode.InnerText.SafeTrimAndEscapeHtml();
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapeComment - Comment content is null - URL: {0}", comment.Url);
            }

            // rating
            HtmlNode ratingNode = commentNode.SelectSingleNode(CommentsPageXPath.Rating);
            HtmlNode plusRatingNode = ratingNode != null ? ratingNode.SelectSingleNode(CommentsPageXPath.PlusRating) : null;
            HtmlNode minusRatingNode = ratingNode != null ? ratingNode.SelectSingleNode(CommentsPageXPath.MinusRating) : null;

            if (plusRatingNode != null && minusRatingNode != null)
            {
                string plusRatingString = plusRatingNode.InnerText.SafeTrim();
                string minusRatingString = minusRatingNode.InnerText.SafeTrim();

                int plusRating = this.ScrapeCommentRating(plusRatingString, comment.Url);
                int minusRating = this.ScrapeCommentRating(minusRatingString, comment.Url);

                // NOTE(review): the two values are summed; presumably ScrapeCommentRating
                // returns a signed value for the minus rating - confirm.
                comment.Rating = plusRating + minusRating;
            }
            else
            {
                this._logger.ErrorFormat("ScrapingService, ScrapeComment - Comment rating is null - URL: {0}", comment.Url);
            }

            return comment;
        }