/// <summary>
/// Scrapes every comment of a post, following the comment pager when the
/// comments are spread over more than one page.
/// </summary>
/// <param name="mainNode">Node of the already loaded first page; it is searched for the "pager" element and for the element with id "comments".</param>
/// <param name="inHref">URL of the post. "?page=N" is appended to it to load further comment pages, and it is included in error log messages.</param>
/// <param name="inRepo">Repository used to look up comment authors by name and to add authors that are not found.</param>
/// <param name="inFetchCommentsVotes">When true, the vote list of every scraped comment is fetched through VotesAnalyzer.ScrapeListVotesForNode.</param>
/// <returns>The scraped comments; an empty list when the first page has no element with id "comments".</returns>
public static List<Comment> ScrapePostComments(HtmlNode mainNode, string inHref, IModelRepository inRepo, bool inFetchCommentsVotes = true)
{
    List<Comment> listComments = new List<Comment>();

    if (mainNode == null)
    {
        log.Error("ERROR - no page content to scrape comments from " + inHref);
        return listComments;
    }

    // First, check whether the comments span one page or several.
    int lastPageIndex = ParseLastCommentPageIndex(mainNode, inHref);

    HtmlNode pageRoot = mainNode;
    for (int i = 0; i <= lastPageIndex; i++)
    {
        HtmlNode comments = pageRoot.Descendants().SingleOrDefault(x => x.Id == "comments");
        if (comments == null)
        {
            // No comments on this page. Stop paging, but still fall through to the
            // vote fetching below for whatever earlier pages produced.
            break;
        }

        List<HtmlNode> allComments = comments.Descendants()
            .Where(x => x.Id.StartsWith("comment-content", StringComparison.Ordinal))
            .ToList();

        foreach (HtmlNode comment in allComments)
        {
            Comment newComment = ParseCommentNode(comment, inHref, inRepo);
            if (newComment != null)
            {
                listComments.Add(newComment);
            }
        }

        // Reload the HTML for the next page of comments.
        if (i < lastPageIndex)
        {
            string href = inHref + "?page=" + (i + 1).ToString();
            HtmlWeb htmlWeb = new HtmlWeb();
            HtmlDocument htmlDocument = htmlWeb.Load(href);
            pageRoot = htmlDocument.DocumentNode;
        }
    }

    // And now fetch the list of votes for each comment.
    if (inFetchCommentsVotes)
    {
        foreach (Comment comm in listComments)
        {
            comm.Votes = VotesAnalyzer.ScrapeListVotesForNode(comm.Id, comm.Author, "comment", inRepo);
        }
    }

    return listComments;
}

// Reads the page index out of the "?page=N" link found in the second-to-last child of the
// first "pager" element. Returns 0 (single page) when there is no pager or it cannot be parsed.
private static int ParseLastCommentPageIndex(HtmlNode mainNode, string inHref)
{
    HtmlNode pager = mainNode.Descendants()
        .FirstOrDefault(n => n.GetAttributeValue("class", "").Equals("pager"));
    if (pager == null)
    {
        return 0;
    }

    HtmlNode lastChild = pager.LastChild;
    HtmlNode lastPageItem = lastChild != null ? lastChild.PreviousSibling : null;
    if (lastPageItem == null)
    {
        log.Error("ERROR in parsing comments pager " + inHref);
        return 0;
    }

    const string pageMarker = "?page=";
    string pagerHtml = lastPageItem.InnerHtml;
    int markerPos = pagerHtml.IndexOf(pageMarker, StringComparison.Ordinal);
    if (markerPos < 0)
    {
        log.Error("ERROR in parsing comments pager " + inHref);
        return 0;
    }

    // The page number runs from just after "?page=" up to the closing quote of the href.
    int numberStart = markerPos + pageMarker.Length;
    int quotePos = pagerHtml.IndexOf("\"", numberStart, StringComparison.Ordinal);
    int pageIndex;
    if (quotePos < 0 || !int.TryParse(pagerHtml.Substring(numberStart, quotePos - numberStart), out pageIndex))
    {
        log.Error("ERROR in parsing comments pager " + inHref);
        return 0;
    }

    return pageIndex;
}

// Builds a Comment from one "comment-content*" node, adding its author to the repository
// when not yet known. Returns null (after logging) when the node lacks the expected header/text children.
private static Comment ParseCommentNode(HtmlNode comment, string inHref, IModelRepository inRepo)
{
    // ChildNodes[1] is the "author - date" header, ChildNodes[3] is the comment text.
    if (comment.ChildNodes.Count < 4)
    {
        log.Error("ERROR - unexpected comment structure " + inHref);
        return null;
    }

    HtmlNode header = comment.ChildNodes[1];

    // Header text looks like "\n Skviki &mdash; Pon, 28/11/2016 - 16:16."
    string strNameDate = header.InnerText;
    int mdashPos = strNameDate.IndexOf("&mdash", StringComparison.Ordinal);
    if (mdashPos < 2)
    {
        log.Error("ERROR in getting comment author " + inHref);
        return null;
    }

    // The author name sits between the two leading characters and the dash entity.
    string authorName = strNameDate.Substring(2, mdashPos - 2).Trim();

    // Try to get the author's nick as used in the "/user/<nick>" link.
    string authorNick = "";
    string headerHtml = header.InnerHtml;
    int usrInd = headerHtml.IndexOf("/user/", StringComparison.Ordinal);
    if (usrInd != -1)
    {
        int usrInd2 = headerHtml.IndexOf("title=", usrInd, StringComparison.Ordinal);
        // Skip the 6 characters of "/user/" and drop the 2 characters preceding "title=".
        int nickLength = usrInd2 - usrInd - 8;
        if (usrInd2 != -1 && nickLength >= 0)
        {
            authorNick = headerHtml.Substring(usrInd + 6, nickLength);
        }
    }

    // Check if the user exists, add him if not.
    User user = inRepo.GetUserByName(authorName);
    if (user == null)
    {
        user = new User { Name = authorName, NameHtml = authorNick };
        inRepo.AddUser(user);
    }

    Comment newComment = new Comment();
    newComment.Author = user;

    // The date is everything after the last comma of the header text.
    int lastCommaPos = strNameDate.LastIndexOf(',');
    string date = strNameDate.Substring(lastCommaPos + 1);

    HtmlNode votesNode = comment.Descendants()
        .FirstOrDefault(n => n.GetAttributeValue("class", "").Equals("total-votes-plain"));
    int scrapedVotes;
    if (votesNode != null && int.TryParse(Regex.Match(votesNode.InnerText, @"-?\d+").Value, out scrapedVotes))
    {
        newComment.NumScrappedVotes = scrapedVotes;
    }
    else
    {
        log.Error("ERROR in getting comment votes number " + inHref);
    }

    newComment.DatePosted = Utility.ExtractDateTime(date.Trim());
    newComment.Text = comment.ChildNodes[3].InnerText;

    // The numeric comment ID is the part of the element id after the last dash.
    string commentId = comment.Id;
    int dashPos = commentId.LastIndexOf('-');
    int idValue;
    if (dashPos > 0 && int.TryParse(commentId.Substring(dashPos + 1), out idValue))
    {
        newComment.Id = idValue;
    }
    else
    {
        log.Error("ERROR in getting comment ID " + inHref);
    }

    return newComment;
}
/// <summary>
/// Downloads a post page and scrapes it into a Post: ID, votes link, title, text,
/// date, author, number of comments, votes and comments.
/// </summary>
/// <param name="inPostUrl">URL of the post page; stored on the post as HrefLink.</param>
/// <param name="inRepo">Repository used for the duplicate-post check and the author lookup/add; also passed on to the votes and comments scrapers.</param>
/// <param name="isOnFrontPage">Stored on the post as IsOnFrontPage.</param>
/// <param name="inFetchCommentsVotes">Forwarded to CommentsAnalyzer.ScrapePostComments.</param>
/// <returns>
/// The scraped post, or null when a post with the same ID is already in the repository
/// or the page lacks the expected "content-main" / "node" elements (an error is logged).
/// </returns>
public static Post AnalyzePost(string inPostUrl, IModelRepository inRepo, bool isOnFrontPage, bool inFetchCommentsVotes)
{
    Post newPost = new Post();
    newPost.HrefLink = inPostUrl;
    newPost.IsOnFrontPage = isOnFrontPage;

    // One summary line for the log is assembled while scraping.
    StringBuilder output = new StringBuilder();
    output.AppendFormat("Post - {0,-90}", inPostUrl);

    HtmlWeb htmlWeb = new HtmlWeb();
    HtmlDocument htmlDocument = htmlWeb.Load(inPostUrl);

    HtmlNode mainContent = htmlDocument.DocumentNode.Descendants().SingleOrDefault(x => x.Id == "content-main");
    if (mainContent == null)
    {
        log.Error("ERROR - no content-main element found in " + inPostUrl);
        return null;
    }

    // First, the node ID.
    int nodeId;
    string votesLink;
    if (PostAnalyzer.ScrapePostID(mainContent, out nodeId, out votesLink))
    {
        newPost.Id = nodeId;
        newPost.VotesLink = votesLink;
    }

    // Check for the post ID already being in the repo.
    if (inRepo.PostAlreadyExists(newPost.Id))
    {
        log.WarnFormat("WARNING - Post with ID {0} already in the database", newPost.Id);
        return null;
    }
    output.AppendFormat(" ID - {0,5}", newPost.Id);

    // The element with class "node" holds both the title and the text of the post.
    HtmlNode postNode = mainContent.Descendants()
        .FirstOrDefault(n => n.GetAttributeValue("class", "").Equals("node"));
    if (postNode == null)
    {
        log.Error("ERROR - no node element found in " + inPostUrl);
        return null;
    }

    // Title.
    HtmlNode titleNode = postNode.Descendants("h1").FirstOrDefault();
    if (titleNode != null)
    {
        newPost.Title = titleNode.InnerText;
    }
    else
    {
        log.Error("ERROR - scrapping post title " + inPostUrl);
    }

    // Text of the post: everything starting 20 characters after the beginning of the
    // "dodaj komentar" marker. When the marker is missing there is nothing to strip,
    // so the whole inner text is kept.
    string postInnerText = postNode.InnerText;
    int markerPos = postInnerText.IndexOf("dodaj komentar", StringComparison.Ordinal);
    if (markerPos >= 0)
    {
        int textStart = Math.Min(markerPos + 20, postInnerText.Length);
        newPost.Text = postInnerText.Substring(textStart);
    }
    else
    {
        newPost.Text = postInnerText;
    }

    // Date posted ("HH" = 24-hour clock, so afternoon times are not logged ambiguously).
    newPost.DatePosted = ScrapePostDate(mainContent);
    output.AppendFormat(" Date - {0}", newPost.DatePosted.ToString("dd/MM/yyyy HH:mm"));

    // Author.
    string author, authorHtml;
    PostAnalyzer.ScrapePostAuthor(htmlDocument, out author, out authorHtml);
    output.AppendFormat(" Username - {0,-18}", author);

    // Check if the user exists, add him if not.
    User user = inRepo.GetUserByName(author);
    if (user == null)
    {
        user = new User { Name = author, NameHtml = authorHtml };
        Console.WriteLine(user.Name + " ; " + user.NameHtml);
        inRepo.AddUser(user);
    }
    newPost.Author = user;

    newPost.NumCommentsScrapped = CommentsAnalyzer.ScrapePostCommentsNum(mainContent);
    if (newPost.NumCommentsScrapped < 0)
    {
        log.Error("ERROR - scrapping number of comments");
    }
    output.AppendFormat(" Num.comm - {0,3}", newPost.NumCommentsScrapped);

    // Votes can only be fetched when the node ID was scraped successfully.
    if (newPost.Id > 0)
    {
        newPost.Votes = VotesAnalyzer.ScrapeListVotesForNode(newPost.Id, newPost.Author, "node", inRepo);
    }
    // Votes may still be unset here when the ID was not scraped.
    output.AppendFormat(" Votes - {0}", newPost.Votes != null ? newPost.Votes.Count : 0);

    log.Info(output.ToString());

    newPost.Comments = CommentsAnalyzer.ScrapePostComments(mainContent, inPostUrl, inRepo, inFetchCommentsVotes);

    return newPost;
}