Пример #1
0
        /// <summary>
        /// Scrapes all comments of a post, following the comment pager when one is present.
        /// </summary>
        /// <param name="mainNode">Node of the already loaded first page; it is searched for the pager and for the element with id "comments".</param>
        /// <param name="inHref">URL of the post. "?page=N" is appended to it to load further comment pages, and it is included in error log messages.</param>
        /// <param name="inRepo">Repository used to look up comment authors by name and to add the ones that are not found.</param>
        /// <param name="inFetchCommentsVotes">When true, the list of votes is fetched for every scraped comment after all pages have been read.</param>
        /// <returns>The scraped comments in page order; an empty list when the first page has no "comments" element.</returns>
        public static List <Comment> ScrapePostComments(HtmlNode mainNode, string inHref, IModelRepository inRepo, bool inFetchCommentsVotes = true)
        {
            List <Comment> listComments = new List <Comment>();

            // First, find out whether the comments fit on a single page or span several pages.
            var itemlist = mainNode.Descendants().Where(n => n.GetAttributeValue("class", "").Equals("pager")).ToList();

            int pageCount = 0;

            if (itemlist.Count > 0)
            {
                // NOTE(review): presumably the node before the pager's last child holds the link to the last page - confirm against the site markup.
                HtmlNode lastChild    = itemlist[0].LastChild;
                HtmlNode lastPageNode = lastChild != null ? lastChild.PreviousSibling : null;
                string   s            = lastPageNode != null ? lastPageNode.InnerHtml : "";

                int n1 = s.IndexOf("?page=", StringComparison.Ordinal);
                int n2 = n1 != -1 ? s.IndexOf("\"", n1, StringComparison.Ordinal) : -1;

                // When parsing fails pageCount stays 0, so only the page that is already loaded gets scraped.
                if (n2 == -1 || !int.TryParse(s.Substring(n1 + 6, n2 - n1 - 6), out pageCount))
                {
                    log.Error("ERROR in getting number of comment pages " + inHref);
                }
            }

            HtmlWeb htmlWeb = new HtmlWeb();

            for (int i = 0; i <= pageCount; i++)
            {
                HtmlNode comments = mainNode.Descendants().SingleOrDefault(x => x.Id == "comments");

                if (comments == null)           // No comments?
                {
                    // Stop paging, but still fetch the votes for the comments collected from earlier pages.
                    break;
                }

                List <HtmlNode> allComments = comments.Descendants().Where(x => x.Id.StartsWith("comment-content", StringComparison.Ordinal)).ToList();

                foreach (var comment in allComments)
                {
                    Comment newComment = new Comment();

                    // comment.ChildNodes[1] has "\n    Skviki &mdash; Pon, 28/11/2016 - 16:16.
                    HtmlNode headerNode  = comment.ChildNodes[1];
                    string   strNameDate = headerNode.InnerText;
                    int      mdashPos    = strNameDate.IndexOf("&mdash", StringComparison.Ordinal);
                    string   authorName  = strNameDate.Substring(2, mdashPos - 2).Trim();

                    // The HTML nick is optional: take it from the "/user/..." link when the header has one.
                    string authorNick = "";
                    string headerHtml = headerNode.InnerHtml;
                    int    usrInd     = headerHtml.IndexOf("/user/", StringComparison.Ordinal);
                    if (usrInd != -1)
                    {
                        int usrInd2    = headerHtml.IndexOf("title=", usrInd, StringComparison.Ordinal);
                        int nickLength = usrInd2 - usrInd - 8;     // drops "/user/" (6 chars) and the 2 chars right before "title="

                        if (usrInd2 != -1 && nickLength >= 0)
                        {
                            authorNick = headerHtml.Substring(usrInd + 6, nickLength);
                        }
                    }

                    // check if user exists, add him if not
                    User user = inRepo.GetUserByName(authorName);
                    if (user == null)
                    {
                        user = new User {
                            Name = authorName, NameHtml = authorNick
                        };
                        inRepo.AddUser(user);
                    }
                    newComment.Author = user;

                    // The date is whatever follows the last comma of the header text.
                    string date = strNameDate.Substring(strNameDate.LastIndexOf(',') + 1);

                    HtmlNode votesNode =
                        comment.Descendants()
                        .FirstOrDefault(n => n.GetAttributeValue("class", "").Equals("total-votes-plain"));

                    int numVotes;
                    if (votesNode != null && int.TryParse(Regex.Match(votesNode.InnerText, @"-?\d+").Value, out numVotes))
                    {
                        newComment.NumScrappedVotes = numVotes;
                    }
                    else
                    {
                        log.Error("ERROR in getting comment votes number " + inHref);
                    }

                    newComment.DatePosted = Utility.ExtractDateTime(date.Trim());

                    newComment.Text = comment.ChildNodes[3].InnerText;

                    // The numeric comment ID is the part of the element id after the last dash.
                    string commentId = comment.Id;
                    int    dashPos   = commentId.LastIndexOf('-');
                    int    idValue;
                    if (dashPos > 0 && int.TryParse(commentId.Substring(dashPos + 1), out idValue))
                    {
                        newComment.Id = idValue;
                    }
                    else
                    {
                        log.Error("ERROR in getting comment ID " + inHref);
                    }

                    listComments.Add(newComment);
                }

                // Load the HTML of the next page of comments.
                if (i < pageCount)
                {
                    string href = inHref + "?page=" + (i + 1).ToString();

                    HtmlDocument htmlDocument = htmlWeb.Load(href);

                    mainNode = htmlDocument.DocumentNode;
                }
            }

            // and now we have to fetch list of votes for each one
            if (inFetchCommentsVotes)
            {
                foreach (var comm in listComments)
                {
                    comm.Votes = VotesAnalyzer.ScrapeListVotesForNode(comm.Id, comm.Author, "comment", inRepo);
                }
            }

            return(listComments);
        }
Пример #2
0
        /// <summary>
        /// Loads a post page and scrapes it into a <see cref="Post"/>: ID, votes link, title, text,
        /// date, author, number of comments, votes and comments.
        /// </summary>
        /// <param name="inPostUrl">URL of the post page to load.</param>
        /// <param name="inRepo">Repository used for the duplicate-post check and for looking up / adding the author.</param>
        /// <param name="isOnFrontPage">Value stored in the post's IsOnFrontPage property.</param>
        /// <param name="inFetchCommentsVotes">Passed on to the comment scraper to control whether votes on comments are fetched.</param>
        /// <returns>
        /// The scraped post, or null when the page has no "content-main" element or when a post
        /// with the same ID is already in the repository.
        /// </returns>
        public static Post AnalyzePost(string inPostUrl, IModelRepository inRepo, bool isOnFrontPage, bool inFetchCommentsVotes)
        {
            Post newPost = new Post();

            newPost.HrefLink      = inPostUrl;
            newPost.IsOnFrontPage = isOnFrontPage;

            // Collects a one-line summary of the post that is written to the log further down.
            StringBuilder output = new StringBuilder();

            output.AppendFormat("Post - {0,-90}", inPostUrl);

            HtmlWeb      htmlWeb      = new HtmlWeb();
            HtmlDocument htmlDocument = htmlWeb.Load(inPostUrl);
            HtmlNode     mainContent  = htmlDocument.DocumentNode.Descendants().SingleOrDefault(x => x.Id == "content-main");

            if (mainContent == null)
            {
                // Everything below is scraped from this node, so there is nothing to analyze without it.
                log.Error("ERROR - content-main element not found " + inPostUrl);

                return(null);
            }

            // first, Node ID
            int    nodeId;
            string votesLink;

            if (PostAnalyzer.ScrapePostID(mainContent, out nodeId, out votesLink))
            {
                newPost.Id        = nodeId;
                newPost.VotesLink = votesLink;
            }

            if (inRepo.PostAlreadyExists(newPost.Id))            // check for Post ID already in the repo
            {
                log.WarnFormat("WARNING - Post with ID {0} already in the database", newPost.Id);

                return(null);
            }
            output.AppendFormat(" ID - {0,5}", newPost.Id);

            // The element with class "node" holds both the title and the text of the post.
            HtmlNode postNode = mainContent.Descendants().Single(n => n.GetAttributeValue("class", "").Equals("node"));

            // title
            var titleHtml = postNode.Descendants("h1").ToList();

            newPost.Title = titleHtml[0].InnerText;

            // text of the post
            // NOTE(review): the offset of 20 presumably skips the "dodaj komentar" marker (14 chars) plus
            // trailing whitespace - confirm against the site markup.
            string postText = postNode.InnerText;
            int    n1       = postText.IndexOf("dodaj komentar", StringComparison.Ordinal);

            if (n1 >= 0)
            {
                newPost.Text = postText.Substring(Math.Min(n1 + 20, postText.Length));
            }
            else
            {
                log.WarnFormat("WARNING - 'dodaj komentar' marker not found in post {0}", inPostUrl);

                newPost.Text = postText;
            }

            // date posted
            newPost.DatePosted = ScrapePostDate(mainContent);
            // 24-hour clock ("HH"): a 12-hour value without an AM/PM designator would be ambiguous in the log.
            output.AppendFormat(" Date - {0}", newPost.DatePosted.ToString("dd/MM/yyyy HH:mm"));

            // author
            string author, authorHtml;

            PostAnalyzer.ScrapePostAuthor(htmlDocument, out author, out authorHtml);
            output.AppendFormat(" Username - {0,-18}", author);

            // check if user exists, add him if not
            User user = inRepo.GetUserByName(author);

            if (user == null)
            {
                user = new User {
                    Name = author, NameHtml = authorHtml
                };

                Console.WriteLine(user.Name + " ; " + user.NameHtml);

                inRepo.AddUser(user);
            }

            newPost.Author = user;

            newPost.NumCommentsScrapped = CommentsAnalyzer.ScrapePostCommentsNum(mainContent);
            if (newPost.NumCommentsScrapped < 0)
            {
                log.Error("ERROR - scrapping number of comments");
            }

            output.AppendFormat("  Num.comm - {0,3}", newPost.NumCommentsScrapped);

            // Votes can only be fetched when a valid node ID was scraped.
            if (newPost.Id > 0)
            {
                newPost.Votes = VotesAnalyzer.ScrapeListVotesForNode(newPost.Id, newPost.Author, "node", inRepo);
            }

            // Votes is not assigned here when the ID is missing, so it may still be null.
            output.AppendFormat("  Votes    - {0}", newPost.Votes != null ? newPost.Votes.Count : 0);

            log.Info(output.ToString());

            newPost.Comments = CommentsAnalyzer.ScrapePostComments(mainContent, inPostUrl, inRepo, inFetchCommentsVotes);

            return(newPost);
        }