예제 #1
0
        /// <summary>
        /// Downloads the topic listing of <c>this.Board</c> and hands every blue-tagged topic row
        /// to a <see cref="BackgroundWorker"/> (via <c>worker_DoWork</c>) for further processing.
        /// </summary>
        /// <remarks>
        /// Only page 1 is fetched for now. Rows that cannot be identified (no id, no topic link,
        /// or a non-numeric thread number) are skipped instead of aborting the whole crawl.
        /// </remarks>
        public void CrawlForumBoard()
        {
            // The three table sections of the listing, in document order.
            string[] sectionPaths =
            {
                "//*[@id=\"forum-topics\"]/tbody[1]", //Featured topics (stickied)
                "//*[@id=\"forum-topics\"]/tbody[2]", //Stickied topics
                "//*[@id=\"forum-topics\"]/tbody[3]"  //Unstickied topics
            };

            for (int i = 1; i <= 1; i++) //keep it on page 1 only for now.
            {
                string ht = WebClient.GetRawHtml(String.Format("http://{0}.battle.net/{1}/{2}/forum/{3}/?page={4}",
                                                               this.Board.BlizzArea.Region.RegionAbbreviation.ToLower(),
                                                               this.Board.BlizzArea.Game.GameAbbreviation.ToLower(),
                                                               this.Board.BlizzArea.Language.LanguageAbbreviation.ToLower(),
                                                               this.Board.ForumBoardNo,
                                                               i));

                if (String.IsNullOrEmpty(ht))
                {
                    continue;
                }

                HTDoc document = new HTDoc();
                document.LoadHtml(ht);

                // Collect the <tr> children of every section that is present,
                // e.g. //*[@id="forum-topics"]/tbody[2]/tr[1].
                List<HtmlNode> topicRows = new List<HtmlNode>();
                foreach (string sectionPath in sectionPaths)
                {
                    var section = document.DocumentNode.SelectNodes(sectionPath);

                    // SelectNodes yields null (not an empty collection) when nothing matches;
                    // a board without e.g. featured topics must not abort the crawl.
                    if (section == null)
                    {
                        continue;
                    }

                    foreach (var child in section.Nodes())
                    {
                        if (child.Name == "tr")
                        {
                            topicRows.Add(child);
                        }
                    }
                }

                foreach (var row in topicRows)
                {
                    TrElementData trElementData;
                    if (!TryReadBlueTopicRow(document, row, out trElementData))
                    {
                        continue;
                    }

                    // One worker per topic; the row data is passed as the DoWork argument.
                    BackgroundWorker worker = new BackgroundWorker();
                    worker.DoWork += worker_DoWork;
                    worker.RunWorkerAsync(trElementData);
                }
            }
        }

        /// <summary>
        /// Looks up a node below the table row with the same id as <paramref name="row"/>.
        /// </summary>
        /// <param name="document">The parsed listing page.</param>
        /// <param name="row">The topic row whose id anchors the lookup.</param>
        /// <param name="relativePath">XPath suffix appended to the row selector, e.g. <c>/td[2]/a</c>.</param>
        /// <returns>The first matching node, or null when there is none.</returns>
        private static HtmlNode SelectInRow(HTDoc document, HtmlNode row, string relativePath)
        {
            return document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]{1}", row.Id, relativePath));
        }

        /// <summary>
        /// Extracts the topic data of a single listing row if that row is blue-tagged.
        /// </summary>
        /// <param name="document">The parsed listing page.</param>
        /// <param name="row">A <c>tr</c> element of the topic table.</param>
        /// <param name="trElementData">Receives the extracted data; only meaningful when true is returned.</param>
        /// <returns>
        /// True when the row carries the blue-post icon and its thread number could be read;
        /// false when the row should be ignored.
        /// </returns>
        private bool TryReadBlueTopicRow(HTDoc document, HtmlNode row, out TrElementData trElementData)
        {
            trElementData = new TrElementData();

            // Defaults used whenever the corresponding element is missing from the row.
            trElementData.isNewThread = false;
            trElementData.isBlueTagged = false;
            trElementData.threadAuthor = "undetermined";
            trElementData.lastPoster = "undetermined";
            trElementData.threadTitle = "undetermined";
            trElementData.lastUpdateStamp = "undetermined";
            trElementData.threadNumber = null;
            trElementData.lastPage = 1;

            trElementData.Game = this.Board.BlizzArea.Game.GameAbbreviation;
            trElementData.Lang = this.Board.BlizzArea.Language.LanguageAbbreviation;
            trElementData.Region = this.Board.BlizzArea.Region.RegionAbbreviation;
            trElementData.ForumBoardId = this.Board.Id;
            trElementData.ConnStr = this._connStr;

            // All lookups are anchored on the row id; without one the selector
            // //*[@id=""] could match an unrelated element.
            if (String.IsNullOrEmpty(row.Id))
            {
                return false;
            }

            // e.g. //*[@id="postRow2583878364"]/td[1]/span/span[2]
            var bluePostIcon = SelectInRow(document, row, "/td[1]/span/span[2]");
            if (bluePostIcon == null)
            {
                return false;
            }

            trElementData.isBlueTagged = true;

            // Thread number: the last path segment of the topic link.
            var threadLink = SelectInRow(document, row, "/td[2]/a");
            if (threadLink == null)
            {
                return false;
            }

            var hrefAttribute = threadLink.Attributes["href"];
            if (hrefAttribute == null)
            {
                return false;
            }

            string rawNum = hrefAttribute.Value ?? String.Empty;
            rawNum = rawNum.Substring(rawNum.LastIndexOf('/') + 1);

            long threadNumber;
            if (!Int64.TryParse(rawNum, out threadNumber))
            {
                // Without a thread number the topic cannot be stored, so skip the row.
                return false;
            }

            trElementData.threadNumber = threadNumber;

            // Thread title: prefer the second span, fall back to the only span.
            var threadTitleSpan = SelectInRow(document, row, "/td[2]/a/span[2]");
            if (threadTitleSpan == null)
            {
                threadTitleSpan = SelectInRow(document, row, "/td[2]/a/span");
            }

            if (threadTitleSpan != null)
            {
                trElementData.threadTitle = threadTitleSpan.InnerText;
            }

            // Thread author, suffixed according to the CSS class of the author span.
            var threadAuthorSpan = SelectInRow(document, row, "/td[4]/span/span[1]");
            if (threadAuthorSpan == null)
            {
                threadAuthorSpan = SelectInRow(document, row, "/td[4]/span/span");
            }

            if (threadAuthorSpan != null)
            {
                trElementData.threadAuthor = threadAuthorSpan.InnerText;

                var classAttribute = threadAuthorSpan.Attributes["class"];
                string cssClass = (classAttribute == null || classAttribute.Value == null)
                    ? String.Empty
                    : classAttribute.Value;

                if (cssClass.Contains("blizzard-post"))
                {
                    trElementData.threadAuthor += "-BLIZZ";
                }
                else if (cssClass.Contains("mvp-author"))
                {
                    trElementData.threadAuthor += "-MVP";
                }
            }

            // Update time stamp, taken from the second <meta> of the title cell.
            var lastUpdateStampMeta = SelectInRow(document, row, "/td[2]/meta[2]");
            if (lastUpdateStampMeta != null)
            {
                var contentAttribute = lastUpdateStampMeta.Attributes["content"];
                if (contentAttribute != null)
                {
                    trElementData.lastUpdateStamp = contentAttribute.Value;
                }
            }

            // Page count: the text of the last pager entry; stays 1 when absent or not a number.
            var liPageNums = document.DocumentNode.SelectNodes(String.Format("//*[@id=\"{0}\"]/td[3]/div/ul/li", row.Id));
            if (liPageNums != null && liPageNums.Count > 0)
            {
                int lastPage;
                if (int.TryParse(liPageNums[liPageNums.Count - 1].InnerText, out lastPage))
                {
                    trElementData.lastPage = lastPage;
                }
            }

            // Last poster: prefer the first nested span, fall back to the only one.
            var lastAuthorSpan = SelectInRow(document, row, "/td[7]/a/span[1]/span[1]");
            if (lastAuthorSpan == null)
            {
                lastAuthorSpan = SelectInRow(document, row, "/td[7]/a/span[1]/span");
            }

            if (lastAuthorSpan != null)
            {
                trElementData.lastPoster = lastAuthorSpan.InnerText;
            }

            return true;
        }
예제 #2
0
        /// <summary>
        /// Inserts the crawled topic into the database when it is unknown, or refreshes the
        /// stored topic (title, and crawl flag / page count / last poster / update stamp when
        /// the update stamp changed and no crawl is pending or running).
        /// </summary>
        /// <param name="trElementData">Topic data extracted from a forum listing row.</param>
        /// <remarks>
        /// Failed attempts are retried after a two second pause. After more than ten failures
        /// the last error message is written to the console and the operation is abandoned,
        /// so a permanent failure can no longer spin forever.
        /// </remarks>
        private static void PerformDatabaseOperations(TrElementData trElementData)
        {
            const int maxFailures = 10;
            const int retryDelayMilliseconds = 2000;

            // A topic without a thread number can neither be looked up nor inserted;
            // retrying would fail the same way every time.
            if (!trElementData.threadNumber.HasValue)
            {
                Console.WriteLine("Skipping database operation: thread number is unknown.");
                return;
            }

            long threadNumber = trElementData.threadNumber.Value;
            int failureCount = 0;

            while (true)
            {
                try
                {
                    using (ForumBlogsDataContext entities = new ForumBlogsDataContext(trElementData.ConnStr))
                    {
                        Console.WriteLine("Initiating database connection . . .");

                        // Single round trip: fetch the stored topic (if any) instead of
                        // counting first and querying a second time.
                        Topic existingTopic = (from t in entities.Topics
                                               where t.ThreadNumber == threadNumber
                                               && t.ForumBoard.BlizzArea.Game.GameAbbreviation.ToLower() == trElementData.Game.ToLower()
                                               && t.ForumBoard.BlizzArea.Language.LanguageAbbreviation.ToLower() == trElementData.Lang.ToLower()
                                               && t.ForumBoard.BlizzArea.Region.RegionAbbreviation.ToLower() == trElementData.Region.ToLower()
                                               select t).FirstOrDefault();

                        trElementData.isNewThread = existingTopic == null;

                        if (trElementData.isNewThread)
                        {
                            Console.WriteLine("\nFound new thread to be marked for indexing. \n");

                            Topic newTopic = new Topic();
                            newTopic.ForumBoardId = trElementData.ForumBoardId;
                            newTopic.ThreadTitle = trElementData.threadTitle;
                            newTopic.AuthorOfThread = trElementData.threadAuthor;
                            newTopic.LastPoster = trElementData.lastPoster.Trim();
                            newTopic.ThreadNumber = threadNumber;
                            newTopic.ToBeCrawled = true; //temporary
                            newTopic.NoPages = trElementData.lastPage;
                            // Placeholder date three years in the past for a topic that has not been crawled yet.
                            newTopic.LastPostDate = DateTime.Now.AddYears(-3);
                            newTopic.ThreadUpdate = trElementData.lastUpdateStamp;

                            entities.Topics.InsertOnSubmit(newTopic);
                        }
                        else
                        {
                            if (existingTopic.ThreadTitle != trElementData.threadTitle)
                            {
                                Console.WriteLine("\nThe thread's title was changed.\n");
                                existingTopic.ThreadTitle = trElementData.threadTitle;
                            }

                            // Only re-queue a topic that is neither queued nor currently being crawled.
                            if (existingTopic.ThreadUpdate != trElementData.lastUpdateStamp && !existingTopic.IsBeingCrawled && !existingTopic.ToBeCrawled)
                            {
                                Console.WriteLine("\nFound known thread that was updated. \n");
                                existingTopic.ToBeCrawled = true;
                                existingTopic.NoPages = trElementData.lastPage;
                                existingTopic.LastPoster = trElementData.lastPoster.Trim();
                                existingTopic.ThreadUpdate = trElementData.lastUpdateStamp;
                            }
                        }

                        entities.SubmitChanges();
                    }

                    Console.WriteLine("Finished database operation.");
                    return;
                }
                catch (Exception er)
                {
                    failureCount++;

                    if (failureCount > maxFailures)
                    {
                        // Give up instead of looping forever on a permanent error.
                        Console.WriteLine(er.Message);
                        Console.WriteLine("Giving up on database operation after {0} failed attempts.", failureCount);
                        return;
                    }

                    System.Threading.Thread.Sleep(retryDelayMilliseconds);
                }
            }
        }