/// <summary>
/// Downloads the post page at <paramref name="inPostUrl"/> and builds a <see cref="Post"/> from it:
/// ID, title, text, date, author, scraped comment count, votes and comments.
/// A one-line summary of the scraped values is written to the log.
/// </summary>
/// <param name="inPostUrl">URL of the post page; loaded with <c>HtmlWeb</c> and stored in <c>HrefLink</c>.</param>
/// <param name="inRepo">Repository used to detect already stored posts and to look up or add users.</param>
/// <param name="isOnFrontPage">Copied to <c>IsOnFrontPage</c> of the new post.</param>
/// <param name="inFetchCommentsVotes">Passed on to <c>CommentsAnalyzer.ScrapePostComments</c>.</param>
/// <returns>
/// The populated post, or <c>null</c> when the page has no <c>content-main</c> element or when the
/// repository already contains a post with the scraped ID.
/// </returns>
public static Post AnalyzePost(string inPostUrl, IModelRepository inRepo, bool isOnFrontPage, bool inFetchCommentsVotes)
{
    // The post text is taken from a fixed offset after the start of this marker.
    // NOTE(review): the offset of 20 covers the 14-character marker plus 6 more characters,
    // presumably whitespace that follows it on the page - confirm against a live page.
    const string textMarker = "dodaj komentar";
    const int textOffset = 20;

    Post newPost = new Post();
    newPost.HrefLink = inPostUrl;
    newPost.IsOnFrontPage = isOnFrontPage;

    // A single summary line per post is assembled here and logged once the votes are known.
    StringBuilder output = new StringBuilder();
    output.AppendFormat("Post - {0,-90}", inPostUrl);

    HtmlWeb htmlWeb = new HtmlWeb();
    HtmlDocument htmlDocument = htmlWeb.Load(inPostUrl);
    HtmlNode mainContent = htmlDocument.DocumentNode.Descendants().SingleOrDefault(x => x.Id == "content-main");
    if (mainContent == null)
    {
        // Without the main content nothing below can be scraped; report it instead of failing
        // later with a NullReferenceException.
        log.Error("ERROR - no content-main element found for post " + inPostUrl);
        return null;
    }

    // first, Node ID
    int nodeId;
    string votesLink;
    if (PostAnalyzer.ScrapePostID(mainContent, out nodeId, out votesLink))
    {
        newPost.Id = nodeId;
        newPost.VotesLink = votesLink;
    }

    // check for Post ID already in the repo
    if (inRepo.PostAlreadyExists(newPost.Id))
    {
        log.WarnFormat("WARNING - Post with ID {0} already in the database", newPost.Id);
        return null;
    }
    output.AppendFormat(" ID - {0,5}", newPost.Id);

    // The single element with class "node" carries both the title (<h1>) and the post text.
    // Single() throws when the page does not contain exactly one such element.
    HtmlNode postNode = mainContent.Descendants().Single(n => n.GetAttributeValue("class", "").Equals("node"));

    // title
    newPost.Title = postNode.Descendants("h1").First().InnerText;

    // text of the post
    string nodeText = postNode.InnerText;
    int markerPos = nodeText.IndexOf(textMarker, StringComparison.Ordinal);
    if (markerPos >= 0)
    {
        // Clamp the start so that a marker at the very end yields an empty text instead of throwing.
        newPost.Text = nodeText.Substring(Math.Min(markerPos + textOffset, nodeText.Length));
    }
    else
    {
        // Marker missing: keep the whole node text rather than cutting at an arbitrary position.
        log.WarnFormat("WARNING - marker '{0}' not found in post {1}, keeping the whole node text", textMarker, inPostUrl);
        newPost.Text = nodeText;
    }

    // date posted ("HH" = 24-hour clock; a 12-hour "hh" without AM/PM designator is ambiguous in the log)
    newPost.DatePosted = ScrapePostDate(mainContent);
    output.AppendFormat(" Date - {0}", newPost.DatePosted.ToString("dd/MM/yyyy HH:mm"));

    // author
    string author, authorHtml;
    PostAnalyzer.ScrapePostAuthor(htmlDocument, out author, out authorHtml);
    output.AppendFormat(" Username - {0,-18}", author);

    // check if user exists, add him if not
    User user = inRepo.GetUserByName(author);
    if (user == null)
    {
        user = new User { Name = author, NameHtml = authorHtml };
        Console.WriteLine(user.Name + " ; " + user.NameHtml);
        inRepo.AddUser(user);
    }
    newPost.Author = user;

    // number of comments as shown on the page; a negative value signals a scraping failure
    newPost.NumCommentsScrapped = CommentsAnalyzer.ScrapePostCommentsNum(mainContent);
    if (newPost.NumCommentsScrapped < 0)
    {
        log.Error("ERROR - scrapping number of comments");
    }
    output.AppendFormat(" Num.comm - {0,3}", newPost.NumCommentsScrapped);

    // votes can only be fetched when a valid post ID was scraped
    if (newPost.Id > 0)
    {
        newPost.Votes = VotesAnalyzer.ScrapeListVotesForNode(newPost.Id, newPost.Author, "node", inRepo);
    }
    int votesCount = newPost.Votes != null ? newPost.Votes.Count : 0;
    output.AppendFormat(" Votes - {0}", votesCount);

    log.Info(output.ToString());

    newPost.Comments = CommentsAnalyzer.ScrapePostComments(mainContent, inPostUrl, inRepo, inFetchCommentsVotes);

    return newPost;
}
/// <summary>
/// Collects the comments of a post from <paramref name="mainNode"/> and, when the post has a pager,
/// from the following comment pages. Optionally fetches the list of votes for every comment.
/// </summary>
/// <param name="mainNode">Node of the first page that is searched for the pager and the "comments" element.</param>
/// <param name="inHref">URL of the post; following pages are loaded from <c>inHref + "?page=N"</c>.</param>
/// <param name="inRepo">Repository used to look up comment authors and to add unknown ones.</param>
/// <param name="inFetchCommentsVotes">When true, the votes of every collected comment are scraped as well.</param>
/// <param name="Browser">
/// Optional browser used to load following pages and passed on to the votes scraper;
/// when null, pages are loaded with a new <c>HtmlWeb</c>.
/// </param>
/// <returns>The collected comments; empty when the first page has no "comments" element.</returns>
public static List<Comment> ScrapePostComments(HtmlNode mainNode, string inHref, IModelRepository inRepo, bool inFetchCommentsVotes, ScrapingBrowser Browser = null)
{
    List<Comment> listComments = new List<Comment>();

    // First check whether the comments span several pages: the highest "?page=N" value is read
    // from the link inside the pager's second-to-last child.
    // NOTE(review): an unexpected pager layout still makes this parsing throw - confirm that
    // failing fast is the desired behaviour here.
    HtmlNode pager = mainNode.Descendants().FirstOrDefault(n => n.GetAttributeValue("class", "").Equals("pager"));
    int lastPageIndex = 0;
    if (pager != null)
    {
        string lastLinkHtml = pager.LastChild.PreviousSibling.InnerHtml;
        int queryPos = lastLinkHtml.IndexOf("?page=", StringComparison.Ordinal);
        int quotePos = lastLinkHtml.IndexOf("\"", queryPos, StringComparison.Ordinal);
        string num = lastLinkHtml.Substring(queryPos + 6, quotePos - queryPos - 6);
        lastPageIndex = Convert.ToInt32(num);
    }

    for (int page = 0; page <= lastPageIndex; page++)
    {
        HtmlNode commentsRoot = mainNode.Descendants().SingleOrDefault(x => x.Id == "comments");
        if (commentsRoot == null)
        {
            // No comments on this page: stop paging, but still fetch the votes of the comments
            // collected from earlier pages (a plain return would skip that step).
            break;
        }

        List<HtmlNode> commentNodes = commentsRoot.Descendants()
            .Where(x => x.Id.StartsWith("comment-content", StringComparison.Ordinal))
            .ToList();

        foreach (HtmlNode commentNode in commentNodes)
        {
            Comment newComment = new Comment();

            // ChildNodes[1] holds the header, e.g. "\n Skviki &mdash; Pon, 28/11/2016 - 16:16."
            // The author name is the text between offset 2 and the "&mdash" entity.
            HtmlNode header = commentNode.ChildNodes[1];
            string nameAndDate = header.InnerText;
            int mdashPos = nameAndDate.IndexOf("&mdash", StringComparison.Ordinal);
            string authorName = nameAndDate.Substring(2, mdashPos - 2).Trim();

            // Try to get the author's html nick: the text between "/user/" and the two characters
            // that precede the following "title=". Left empty when the markup does not match.
            string authorNick = "";
            string headerHtml = header.InnerHtml;
            int userPos = headerHtml.IndexOf("/user/", StringComparison.Ordinal);
            if (userPos != -1)
            {
                int titlePos = headerHtml.IndexOf("title=", userPos, StringComparison.Ordinal);
                int nickLength = titlePos - userPos - 8;
                if (nickLength >= 0)
                {
                    authorNick = headerHtml.Substring(userPos + 6, nickLength);
                }
            }

            // check if user exists, add him if not
            User user = inRepo.GetUserByName(authorName);
            if (user == null)
            {
                user = new User { Name = authorName, NameHtml = authorNick };
                inRepo.AddUser(user);
            }
            newComment.Author = user;

            // The date is everything after the last comma of the header.
            int lastCommaPos = nameAndDate.LastIndexOf(',');
            string date = nameAndDate.Substring(lastCommaPos + 1);

            // Total number of votes shown next to the comment (first integer in the element's text).
            HtmlNode totalVotes = commentNode.Descendants()
                .FirstOrDefault(n => n.GetAttributeValue("class", "").Equals("total-votes-plain"));
            string votesText = totalVotes != null ? totalVotes.InnerText : "";
            Match votesMatch = Regex.Match(votesText, @"-?\d+");
            if (votesMatch.Success)
            {
                // Invariant culture: the "-" sign must parse regardless of the machine's regional settings.
                newComment.NumScrappedVotes = Int32.Parse(votesMatch.Value, System.Globalization.CultureInfo.InvariantCulture);
            }
            else
            {
                log.Error("ERROR in getting number of votes for comment " + inHref);
            }

            newComment.DatePosted = Utility.ExtractDateTime(date.Trim());
            newComment.Text = commentNode.ChildNodes[3].InnerText;

            // The numeric comment ID is the part of the element ID after the last dash.
            string commentId = commentNode.Id;
            int dashPos = commentId.LastIndexOf('-');
            int parsedId;
            if (dashPos > 0 &&
                Int32.TryParse(commentId.Substring(dashPos + 1),
                               System.Globalization.NumberStyles.Integer,
                               System.Globalization.CultureInfo.InvariantCulture,
                               out parsedId))
            {
                newComment.Id = parsedId;
            }
            else
            {
                log.Error("ERROR in getting comment ID " + inHref);
            }

            listComments.Add(newComment);
        }

        // Re-initialize the loaded HTML for the next page.
        if (page < lastPageIndex)
        {
            string href = inHref + "?page=" + (page + 1).ToString();
            if (Browser != null)
            {
                WebPage pageResult = Browser.NavigateToPage(new Uri(href));
                mainNode = pageResult.Html;
            }
            else
            {
                HtmlWeb htmlWeb = new HtmlWeb();
                mainNode = htmlWeb.Load(href).DocumentNode;
            }
        }
    }

    // And now fetch the list of votes for each comment.
    if (inFetchCommentsVotes)
    {
        foreach (Comment comm in listComments)
        {
            // Skipping comments whose scraped total is 0 would be much faster, but a comment with
            // e.g. two up-votes and two down-votes also totals 0, so the total is not used as a filter.
            // Only comments without a valid ID are skipped - there is no votes page to request for them.
            if (comm.Id > 0)
            {
                comm.Votes = VotesAnalyzer.ScrapeListVotesForNode(comm.Id, comm.Author, "comment", inRepo, Browser);
            }
        }
    }

    return listComments;
}
/// <summary>
/// Downloads the "who_voted" pages of a post or a comment and returns one <see cref="Vote"/>
/// for every row of the votes table.
/// </summary>
/// <param name="nodeID">ID of the post or comment whose votes are scraped.</param>
/// <param name="nodeAuthor">Author of the post or comment; stored as <c>VoteForUser</c> on every vote.</param>
/// <param name="inType">"node" for getting votes for posts, "comment" for getting votes for comments; used as URL path segment.</param>
/// <param name="inRepo">Repository used to look up voters and to add unknown ones.</param>
/// <param name="inBrowser">Optional browser used to load the pages; when null, a new <c>HtmlWeb</c> is used.</param>
/// <returns>The scraped votes; empty when the first page contains no votes table.</returns>
public static List<Vote> ScrapeListVotesForNode(int nodeID, User nodeAuthor, string inType, IModelRepository inRepo, ScrapingBrowser inBrowser = null)
{
    // Cell texts are read from this offset.
    // NOTE(review): presumably skips leading whitespace emitted in front of every cell value -
    // confirm against a live page before replacing it with Trim().
    const int cellTextOffset = 13;

    List<Vote> listVotes = new List<Vote>();

    // The same base URL is used for every page, so votes of comments are paged under
    // "/comment/..." as well (not under "/node/...").
    string baseHref = "http://pollitika.com/" + inType + "/" + nodeID.ToString() + "/who_voted";
    HtmlNode mainNode = LoadVotesPage(baseHref, inBrowser);

    // First, see whether there is only one page of votes or more.
    HtmlNode pager = mainNode.Descendants().FirstOrDefault(n => n.GetAttributeValue("class", "").Equals("pager"));
    int pageCount = 1;
    if (pager != null)
    {
        // The first page is already loaded, so never go below one page even for a small pager.
        pageCount = Math.Max(1, pager.ChildNodes.Count / 2 - 2);
    }

    for (int i = 0; i < pageCount; i++)
    {
        HtmlNode content = mainNode.Descendants().FirstOrDefault(n => n.GetAttributeValue("class", "").Equals("view-content"));
        if (content == null)
        {
            // no content element at all - treated like a page without a votes table
            break;
        }

        var tables = content.SelectNodes("table");
        if (tables == null)
        {
            // it means there is no table with votes
            break;
        }

        HtmlNode tableBody = tables[0].ChildNodes[3];   // picking up tbody
        var rows = tableBody.SelectNodes("tr");         // null when the body has no rows
        if (rows != null)
        {
            foreach (HtmlNode row in rows)
            {
                var rowCells = row.SelectNodes("th|td");
                Vote newVote = new Vote();

                string userName = rowCells[0].InnerText.Substring(cellTextOffset).TrimEnd();

                // Try to get the voter's html nick: the text between "/user/" and the two characters
                // that precede the following "title=". Left empty when the markup does not match.
                string userNick = "";
                string cellHtml = rowCells[0].InnerHtml;
                int userPos = cellHtml.IndexOf("/user/", StringComparison.Ordinal);
                if (userPos != -1)
                {
                    int titlePos = cellHtml.IndexOf("title=", userPos, StringComparison.Ordinal);
                    int nickLength = titlePos - userPos - 8;
                    if (nickLength >= 0)
                    {
                        userNick = cellHtml.Substring(userPos + 6, nickLength);
                    }
                }

                // check if user exists, add him if not
                User user = inRepo.GetUserByName(userName);
                if (user == null)
                {
                    user = new User { Name = userName, NameHtml = userNick };
                    inRepo.AddUser(user);
                }
                newVote.ByUser = user;
                newVote.VoteForUser = nodeAuthor;

                // Invariant culture: a "-" sign must parse regardless of the machine's regional settings.
                string value = rowCells[1].InnerText.Substring(cellTextOffset).TrimEnd();
                newVote.UpOrDown = Convert.ToInt32(value, System.Globalization.CultureInfo.InvariantCulture);

                string time = rowCells[2].InnerText.Substring(cellTextOffset).TrimEnd();
                newVote.DatePosted = Utility.ExtractDateTime2(time);

                listVotes.Add(newVote);
            }
        }

        // Re-initialize the loaded HTML for the next page.
        if (i < pageCount - 1)
        {
            mainNode = LoadVotesPage(baseHref + "?page=" + (i + 1).ToString(), inBrowser);
        }
    }

    return listVotes;
}

// Loads the page at href through inBrowser when one is given, otherwise through a new HtmlWeb,
// and returns the node of the loaded page.
private static HtmlNode LoadVotesPage(string href, ScrapingBrowser inBrowser)
{
    if (inBrowser == null)
    {
        HtmlWeb htmlWeb = new HtmlWeb();
        return htmlWeb.Load(href).DocumentNode;
    }

    WebPage pageResult = inBrowser.NavigateToPage(new Uri(href));
    return pageResult.Html;
}