Esempio n. 1
0
        /// <summary>
        /// Downloads the post page at <paramref name="inPostUrl"/> and scrapes it into a Post:
        /// ID and votes link, title, text, date, author, scraped comment count, votes and comments.
        /// </summary>
        /// <param name="inPostUrl">URL of the post page; stored as HrefLink and forwarded to the comment scraper.</param>
        /// <param name="inRepo">Repository used to detect already stored posts and to look up / add users.</param>
        /// <param name="isOnFrontPage">Stored unchanged in IsOnFrontPage.</param>
        /// <param name="inFetchCommentsVotes">Forwarded to CommentsAnalyzer.ScrapePostComments.</param>
        /// <returns>
        /// The scraped post, or null when the page has no "content-main" element or when a post
        /// with the scraped ID already exists in <paramref name="inRepo"/>.
        /// </returns>
        public static Post AnalyzePost(string inPostUrl, IModelRepository inRepo, bool isOnFrontPage, bool inFetchCommentsVotes)
        {
            Post newPost = new Post();

            newPost.HrefLink      = inPostUrl;
            newPost.IsOnFrontPage = isOnFrontPage;

            // one log line for the whole post is assembled piece by piece and emitted once at the end
            StringBuilder output = new StringBuilder();

            output.AppendFormat("Post - {0,-90}", inPostUrl);

            HtmlWeb      htmlWeb      = new HtmlWeb();
            HtmlDocument htmlDocument = htmlWeb.Load(inPostUrl);
            HtmlNode     mainContent  = htmlDocument.DocumentNode.Descendants().SingleOrDefault(x => x.Id == "content-main");

            // SingleOrDefault yields null when the element is missing; nothing below can work without it
            if (mainContent == null)
            {
                log.Error("ERROR - no content-main element found in " + inPostUrl);

                return(null);
            }

            // first, Node ID
            int    nodeId;
            string votesLink;

            if (PostAnalyzer.ScrapePostID(mainContent, out nodeId, out votesLink))
            {
                newPost.Id        = nodeId;
                newPost.VotesLink = votesLink;
            }
            else
            {
                log.Error("ERROR - scrapping post ID " + inPostUrl);
            }

            if (inRepo.PostAlreadyExists(newPost.Id))            // check for Post ID already in the repo
            {
                log.WarnFormat("WARNING - Post with ID {0} already in the database", newPost.Id);

                return(null);
            }
            output.AppendFormat(" ID - {0,5}", newPost.Id);

            // the single element with class "node" carries both the title and the text
            HtmlNode postNode = mainContent.Descendants().Single(n => n.GetAttributeValue("class", "").Equals("node"));

            // title
            var titleHtml = postNode.Descendants("h1").ToList();

            newPost.Title = titleHtml[0].InnerText;

            // text of the post: everything that follows the "dodaj komentar" marker
            string postInnerText = postNode.InnerText;
            int    markerPos     = postInnerText.IndexOf("dodaj komentar", StringComparison.Ordinal);

            if (markerPos >= 0)
            {
                // +20 skips the marker and the characters right after it; clamped so a marker
                // sitting at the very end produces an empty text instead of an exception
                newPost.Text = postInnerText.Substring(Math.Min(markerPos + 20, postInnerText.Length));
            }
            else
            {
                // without the marker there is no known prefix to strip, so keep the whole text
                log.WarnFormat("WARNING - 'dodaj komentar' marker not found in {0}", inPostUrl);
                newPost.Text = postInnerText;
            }

            // date posted
            newPost.DatePosted = ScrapePostDate(mainContent);
            output.AppendFormat(" Date - {0}", newPost.DatePosted.ToString("dd/MM/yyyy HH:mm"));

            // author
            string author, authorHtml;

            PostAnalyzer.ScrapePostAuthor(htmlDocument, out author, out authorHtml);
            output.AppendFormat(" Username - {0,-18}", author);

            // check if user exists, add him if not
            User user = inRepo.GetUserByName(author);

            if (user == null)
            {
                user = new User {
                    Name = author, NameHtml = authorHtml
                };

                Console.WriteLine(user.Name + " ; " + user.NameHtml);

                inRepo.AddUser(user);
            }

            newPost.Author = user;

            newPost.NumCommentsScrapped = CommentsAnalyzer.ScrapePostCommentsNum(mainContent);
            if (newPost.NumCommentsScrapped < 0)
            {
                log.Error("ERROR - scrapping number of comments");
            }

            output.AppendFormat("  Num.comm - {0,3}", newPost.NumCommentsScrapped);

            // votes can only be looked up when a valid ID was scraped
            if (newPost.Id > 0)
            {
                newPost.Votes = VotesAnalyzer.ScrapeListVotesForNode(newPost.Id, newPost.Author, "node", inRepo);
            }

            // Votes is only assigned above when the ID is valid, so guard the count for the log line
            int votesCount = newPost.Votes != null ? newPost.Votes.Count : 0;

            output.AppendFormat("  Votes    - {0}", votesCount);

            log.Info(output.ToString());

            newPost.Comments = CommentsAnalyzer.ScrapePostComments(mainContent, inPostUrl, inRepo, inFetchCommentsVotes);

            return(newPost);
        }
        /// <summary>
        /// Scrapes all comments of a post, following the comment pager when there is one, and
        /// optionally fetches the list of votes for every scraped comment.
        /// </summary>
        /// <param name="mainNode">Already loaded HTML of the first page; searched for the pager and the "comments" element.</param>
        /// <param name="inHref">URL of the post; "?page=N" is appended to load the following comment pages.</param>
        /// <param name="inRepo">Repository used to look up comment authors and to add the unknown ones.</param>
        /// <param name="inFetchCommentsVotes">When true, votes are fetched for each comment with a valid ID.</param>
        /// <param name="Browser">Optional browser used for page loads; when null a fresh HtmlWeb is used instead.</param>
        /// <returns>The scraped comments; empty when the first page has no "comments" element.</returns>
        public static List <Comment> ScrapePostComments(HtmlNode mainNode, string inHref, IModelRepository inRepo, bool inFetchCommentsVotes, ScrapingBrowser Browser = null)
        {
            List <Comment> listComments = new List <Comment>();

            // first - check if we have multiple pages of comments
            var itemlist = mainNode.Descendants().Where(n => n.GetAttributeValue("class", "").Equals("pager")).ToList();

            // index of the last comment page (0 when there is no pager)
            int pageCount = 0;

            if (itemlist.Count > 0)
            {
                // the element before the pager's last child holds a link ending in "?page=<last page index>"
                HtmlNode lastChild    = itemlist[0].LastChild;
                HtmlNode lastPageItem = lastChild != null ? lastChild.PreviousSibling : null;
                string   s            = lastPageItem != null ? lastPageItem.InnerHtml : "";

                int n1 = s.IndexOf("?page=", StringComparison.Ordinal);
                int n2 = n1 >= 0 ? s.IndexOf("\"", n1, StringComparison.Ordinal) : -1;

                // fall back to a single page instead of throwing when the pager has an unexpected shape
                if (n2 < 0 || !Int32.TryParse(s.Substring(n1 + 6, n2 - n1 - 6), out pageCount))
                {
                    log.Error("ERROR in getting number of comment pages " + inHref);
                    pageCount = 0;
                }
            }

            for (int i = 0; i <= pageCount; i++)
            {
                HtmlNode comments = mainNode.Descendants().SingleOrDefault(x => x.Id == "comments");

                if (comments == null)           // No comments?
                {
                    // stop paging, but still fall through to the vote pass for comments collected so far
                    break;
                }

                List <HtmlNode> allComments = comments.Descendants().Where(x => x.Id.StartsWith("comment-content", StringComparison.Ordinal)).ToList();

                foreach (var comment in allComments)
                {
                    Comment newComment = new Comment();

                    //comment.ChildNodes[1] has "\n    Skviki &mdash; Pon, 28/11/2016 - 16:16.
                    string strNameDate = comment.ChildNodes[1].InnerText;
                    int    mdashPos    = strNameDate.IndexOf("&mdash", StringComparison.Ordinal);
                    string authorName  = strNameDate.Substring(2, mdashPos - 2).Trim();

                    // let's see if we can get his html nick (text between "/user/" and the following "title=")
                    string authorNick = "";
                    string str        = comment.ChildNodes[1].InnerHtml;
                    int    usrInd     = str.IndexOf("/user/", StringComparison.Ordinal);
                    if (usrInd != -1)
                    {
                        int usrInd2 = str.IndexOf("title=", usrInd, StringComparison.Ordinal);
                        int nickLen = usrInd2 - usrInd - 8;

                        // the nick is optional: keep it empty when "title=" is missing or too close
                        if (usrInd2 != -1 && nickLen > 0)
                        {
                            authorNick = str.Substring(usrInd + 6, nickLen);
                        }
                    }

                    // check if user exists, add him if not
                    User user = inRepo.GetUserByName(authorName);
                    if (user == null)
                    {
                        user = new User {
                            Name = authorName, NameHtml = authorNick
                        };
                        inRepo.AddUser(user);
                    }
                    newComment.Author = user;

                    // the date is whatever follows the last comma of the header line
                    int    lastCommaPos = strNameDate.LastIndexOf(',');
                    string date         = strNameDate.Substring(lastCommaPos + 1);

                    // scraped vote total: first (optionally negative) integer in the "total-votes-plain" element
                    HtmlNode votesNode = comment.Descendants()
                                         .FirstOrDefault(n => n.GetAttributeValue("class", "").Equals("total-votes-plain"));
                    int scrappedVotes;

                    if (votesNode != null && Int32.TryParse(Regex.Match(votesNode.InnerText, @"-?\d+").Value, out scrappedVotes))
                    {
                        newComment.NumScrappedVotes = scrappedVotes;
                    }
                    else
                    {
                        log.Error("ERROR in getting comment votes number " + inHref);
                    }

                    newComment.DatePosted = Utility.ExtractDateTime(date.Trim());

                    newComment.Text = comment.ChildNodes[3].InnerText;

                    // the numeric comment ID is the part of the element ID after the last dash
                    string commentId = comment.Id;
                    int    dashPos   = commentId.LastIndexOf('-');
                    int    idValue;

                    if (dashPos > 0 && Int32.TryParse(commentId.Substring(dashPos + 1), out idValue))
                    {
                        newComment.Id = idValue;
                    }
                    else
                    {
                        log.Error("ERROR in getting comment ID " + inHref);
                    }

                    listComments.Add(newComment);
                }

                // reload the HTML for the next page
                if (i < pageCount)
                {
                    string href = inHref + "?page=" + (i + 1).ToString();

                    if (Browser != null)
                    {
                        WebPage PageResult = Browser.NavigateToPage(new Uri(href));
                        mainNode = PageResult.Html;
                    }
                    else
                    {
                        HtmlWeb      htmlWeb      = new HtmlWeb();
                        HtmlDocument htmlDocument = htmlWeb.Load(href);
                        mainNode = htmlDocument.DocumentNode;
                    }
                }
            }

            // and now we have to fetch list of votes for each one
            if (inFetchCommentsVotes)
            {
                foreach (var comm in listComments)
                {
                    // Skipping comments with NumScrappedVotes == 0 would be much faster, but a comment
                    // with two plus and two minus votes also totals 0, so the total cannot be used as a filter.
                    // Comments whose ID could not be parsed have no valid who_voted page, so skip those.
                    if (comm.Id > 0)
                    {
                        comm.Votes = VotesAnalyzer.ScrapeListVotesForNode(comm.Id, comm.Author, "comment", inRepo, Browser);
                    }
                }
            }

            return(listComments);
        }
        /// <summary>
        /// Scrapes the "who_voted" page(s) of a post or comment into a list of votes.
        /// </summary>
        /// <param name="nodeID">ID of the post or comment whose votes are fetched.</param>
        /// <param name="nodeAuthor">Author of the voted item; stored as VoteForUser on every vote.</param>
        /// <param name="inType">"node" - for getting votes for posts, "comment" - for getting votes for comments.</param>
        /// <param name="inRepo">Repository used to look up voters and to add the unknown ones.</param>
        /// <param name="inBrowser">Optional browser used for page loads; when null a fresh HtmlWeb is used instead.</param>
        /// <returns>Votes collected from all pages; scraping stops at the first page without a votes table.</returns>
        public static List <Vote> ScrapeListVotesForNode(int nodeID, User nodeAuthor, string inType, IModelRepository inRepo, ScrapingBrowser inBrowser = null)
        {
            List <Vote> listVotes = new List <Vote>();

            // every page of the list lives under the same address; follow-up pages only add "?page=N"
            string baseHref = "http://pollitika.com/" + inType + "/" + nodeID.ToString() + "/who_voted";

            int pageCount = 1;

            for (int i = 0; i < pageCount; i++)
            {
                string href = i == 0 ? baseHref : baseHref + "?page=" + i.ToString();

                HtmlNode mainNode;

                if (inBrowser == null)
                {
                    HtmlWeb      htmlWeb      = new HtmlWeb();
                    HtmlDocument htmlDocument = htmlWeb.Load(href);
                    mainNode = htmlDocument.DocumentNode;
                }
                else
                {
                    WebPage PageResult = inBrowser.NavigateToPage(new Uri(href));
                    mainNode = PageResult.Html;
                }

                if (i == 0)
                {
                    // first, let's see whether there is only one page of votes or more
                    HtmlNode pager = mainNode.Descendants().FirstOrDefault(n => n.GetAttributeValue("class", "").Equals("pager"));

                    if (pager != null)
                    {
                        // page count is derived from the number of pager children (heuristic kept from the
                        // original code - TODO confirm for long pagers); never drop below the page already loaded
                        pageCount = Math.Max(1, pager.ChildNodes.Count / 2 - 2);
                    }
                }

                HtmlNode content = mainNode.Descendants().FirstOrDefault(n => n.GetAttributeValue("class", "").Equals("view-content"));
                var      table   = content != null ? content.SelectNodes("table") : null;

                if (table == null)              // it means there is no table with votes
                {
                    return(listVotes);
                }

                var tList = table[0].ChildNodes[3];         // picking up tbody
                var rows  = tList.SelectNodes("tr");

                if (rows == null)               // SelectNodes returns null when nothing matches
                {
                    continue;
                }

                foreach (HtmlNode row in rows)
                {
                    var rowCels = row.SelectNodes("th|td");

                    Vote newVote = new Vote();

                    // each cell's text is read from offset 13 onward (presumably leading markup whitespace)
                    string userName = rowCels[0].InnerText.Substring(13).TrimEnd();

                    // let's see if we can get his html nick (text between "/user/" and the following "title=")
                    string userNick = "";
                    string str      = rowCels[0].InnerHtml;
                    int    usrInd   = str.IndexOf("/user/", StringComparison.Ordinal);
                    if (usrInd != -1)
                    {
                        int usrInd2 = str.IndexOf("title=", usrInd, StringComparison.Ordinal);
                        int nickLen = usrInd2 - usrInd - 8;

                        // the nick is optional: keep it empty when "title=" is missing or too close
                        if (usrInd2 != -1 && nickLen > 0)
                        {
                            userNick = str.Substring(usrInd + 6, nickLen);
                        }
                    }

                    // check if user exists, add him if not
                    User user = inRepo.GetUserByName(userName);
                    if (user == null)
                    {
                        user = new User {
                            Name = userName, NameHtml = userNick
                        };
                        inRepo.AddUser(user);
                    }

                    newVote.ByUser      = user;
                    newVote.VoteForUser = nodeAuthor;

                    string value = rowCels[1].InnerText.Substring(13).TrimEnd();
                    newVote.UpOrDown = Convert.ToInt32(value);

                    string time = rowCels[2].InnerText.Substring(13).TrimEnd();
                    newVote.DatePosted = Utility.ExtractDateTime2(time);

                    listVotes.Add(newVote);
                }
            }
            return(listVotes);
        }