/// <summary>
/// Downloads the topic listing of this board and starts a background worker
/// (handled by <c>worker_DoWork</c>) for every topic row that carries a blue-post icon.
/// </summary>
/// <remarks>
/// Only page 1 of the board is fetched for now. Rows whose markup cannot be
/// read (missing link, non-numeric thread number) are skipped instead of
/// aborting the whole crawl.
/// </remarks>
public void CrawlForumBoard()
{
    string region = this.Board.BlizzArea.Region.RegionAbbreviation.ToLower();
    string game = this.Board.BlizzArea.Game.GameAbbreviation.ToLower();
    string language = this.Board.BlizzArea.Language.LanguageAbbreviation.ToLower();

    for (int i = 1; i <= 1; i++) // keep it on page 1 only for now.
    {
        string ht = WebClient.GetRawHtml(String.Format("http://{0}.battle.net/{1}/{2}/forum/{3}/?page={4}", region, game, language, this.Board.ForumBoardNo, i));
        if (String.IsNullOrEmpty(ht))
        {
            continue;
        }

        HTDoc document = new HTDoc();
        document.LoadHtml(ht);

        foreach (var node in CollectTopicRows(document))
        {
            TrElementData trElementData;
            if (!TryReadBlueTaggedRow(document, node, out trElementData))
            {
                continue;
            }

            BackgroundWorker worker = new BackgroundWorker();
            worker.DoWork += worker_DoWork;
            worker.RunWorkerAsync(trElementData);
        }
    }
}

/// <summary>
/// Gathers the <c>tr</c> children of the first three <c>tbody</c> sections of the
/// <c>forum-topics</c> table (featured, stickied and unstickied topics, in that order).
/// </summary>
private static List<HtmlNode> CollectTopicRows(HTDoc document)
{
    List<HtmlNode> topicRows = new List<HtmlNode>();

    for (int section = 1; section <= 3; section++)
    {
        var bodies = document.DocumentNode.SelectNodes(String.Format("//*[@id=\"forum-topics\"]/tbody[{0}]", section));
        if (bodies == null)
        {
            // SelectNodes yields null when nothing matches, e.g. a board without featured topics.
            continue;
        }

        foreach (var child in bodies.Nodes())
        {
            // Drop text/whitespace nodes; only table rows describe topics.
            if (child.Name == "tr")
            {
                topicRows.Add(child);
            }
        }
    }

    return topicRows;
}

/// <summary>
/// Reads the data of one topic row when the row carries a blue-post icon.
/// </summary>
/// <param name="document">The parsed board page the row belongs to.</param>
/// <param name="node">The topic row; its <c>Id</c> is used to address the cells.</param>
/// <param name="trElementData">Receives the scraped row data, pre-filled with defaults.</param>
/// <returns>
/// <c>true</c> when the row is blue-tagged and a thread number could be read;
/// otherwise <c>false</c>.
/// </returns>
private bool TryReadBlueTaggedRow(HTDoc document, HtmlNode node, out TrElementData trElementData)
{
    trElementData = new TrElementData();
    trElementData.isNewThread = false;
    trElementData.isBlueTagged = false;
    trElementData.threadAuthor = "undetermined";
    trElementData.lastPoster = "undetermined";
    trElementData.threadTitle = "undetermined";
    trElementData.lastUpdateStamp = "undetermined";
    trElementData.threadNumber = null;
    trElementData.lastPage = 1;
    trElementData.Game = this.Board.BlizzArea.Game.GameAbbreviation;
    trElementData.Lang = this.Board.BlizzArea.Language.LanguageAbbreviation;
    trElementData.Region = this.Board.BlizzArea.Region.RegionAbbreviation;
    trElementData.ForumBoardId = this.Board.Id;
    trElementData.ConnStr = this._connStr;

    string rowId = node.Id;

    // Example: //*[@id="postRow2583878364"]/td[1]/span/span[2]
    if (SelectInRow(document, rowId, "/td[1]/span/span[2]") == null)
    {
        return false;
    }

    trElementData.isBlueTagged = true;

    // Thread number: the last path segment of the topic link.
    string rawNum = ReadAttribute(SelectInRow(document, rowId, "/td[2]/a"), "href");
    long threadNumber;
    if (rawNum == null || !Int64.TryParse(rawNum.Substring(rawNum.LastIndexOf('/') + 1), out threadNumber))
    {
        Console.WriteLine("Skipping topic row '" + rowId + "': could not read its thread number.");
        return false;
    }

    trElementData.threadNumber = threadNumber;

    // Thread title: prefer the second span inside the link, fall back to the first one.
    var threadTitleSpan = SelectInRow(document, rowId, "/td[2]/a/span[2]")
        ?? SelectInRow(document, rowId, "/td[2]/a/span");
    if (threadTitleSpan != null)
    {
        trElementData.threadTitle = threadTitleSpan.InnerText;
    }

    // Thread author, suffixed according to the CSS class of the author span.
    var threadAuthorSpan = SelectInRow(document, rowId, "/td[4]/span/span[1]")
        ?? SelectInRow(document, rowId, "/td[4]/span/span");
    if (threadAuthorSpan != null)
    {
        trElementData.threadAuthor = threadAuthorSpan.InnerText;

        string cssClass = ReadAttribute(threadAuthorSpan, "class") ?? String.Empty;
        if (cssClass.Contains("blizzard-post"))
        {
            trElementData.threadAuthor += "-BLIZZ";
        }
        else if (cssClass.Contains("mvp-author"))
        {
            trElementData.threadAuthor += "-MVP";
        }
    }

    // Update time stamp.
    string lastUpdateStamp = ReadAttribute(SelectInRow(document, rowId, "/td[2]/meta[2]"), "content");
    if (lastUpdateStamp != null)
    {
        trElementData.lastUpdateStamp = lastUpdateStamp;
    }

    // Page numbers: the last pager entry holds the page count; keep the default of 1 otherwise.
    var liPageNums = document.DocumentNode.SelectNodes(String.Format("//*[@id=\"{0}\"]/td[3]/div/ul/li", rowId));
    if (liPageNums != null && liPageNums.Count > 0)
    {
        int lastPage;
        if (int.TryParse(liPageNums[liPageNums.Count - 1].InnerText, out lastPage))
        {
            trElementData.lastPage = lastPage;
        }
    }

    // Last poster.
    var lastAuthorSpan = SelectInRow(document, rowId, "/td[7]/a/span[1]/span[1]")
        ?? SelectInRow(document, rowId, "/td[7]/a/span[1]/span");
    if (lastAuthorSpan != null)
    {
        trElementData.lastPoster = lastAuthorSpan.InnerText;
    }

    return true;
}

/// <summary>
/// Selects a single node below the element with the given id, or <c>null</c> when nothing matches.
/// </summary>
private static HtmlNode SelectInRow(HTDoc document, string rowId, string relativePath)
{
    return document.DocumentNode.SelectSingleNode("//*[@id=\"" + rowId + "\"]" + relativePath);
}

/// <summary>
/// Returns the value of the named attribute, or <c>null</c> when the node or the attribute is missing.
/// </summary>
private static string ReadAttribute(HtmlNode node, string name)
{
    if (node == null)
    {
        return null;
    }

    var attribute = node.Attributes[name];
    return attribute != null ? attribute.Value : null;
}
/// <summary>
/// Inserts the scraped topic when it is not yet stored, or updates the stored
/// topic's title / crawl flags when the scraped row differs from it.
/// </summary>
/// <param name="trElementData">
/// The scraped row. Its <c>isNewThread</c> field is set according to whether a
/// matching topic was found in the database.
/// </param>
/// <remarks>
/// A failed attempt is logged to the console and retried after a two-second
/// pause. After ten failed attempts the operation is abandoned, so a
/// persistent failure can no longer spin forever.
/// </remarks>
private static void PerformDatabaseOperations(TrElementData trElementData)
{
    const int maxAttempts = 10;
    const int retryDelayMilliseconds = 2000;

    if (trElementData.threadNumber == null)
    {
        // Without a thread number the topic can neither be looked up nor inserted.
        Console.WriteLine("Skipping database operation: the thread number is missing.");
        return;
    }

    var threadNumber = trElementData.threadNumber.Value;

    for (int attempt = 1; attempt <= maxAttempts; attempt++)
    {
        try
        {
            using (ForumBlogsDataContext entities = new ForumBlogsDataContext(trElementData.ConnStr))
            {
                Console.WriteLine("Initiating database connection . . .");

                string game = trElementData.Game.ToLower();
                string language = trElementData.Lang.ToLower();
                string region = trElementData.Region.ToLower();

                // One lookup answers both "does it exist?" and "what is stored?".
                Topic topic = (from t in entities.Topics
                               where t.ThreadNumber == threadNumber
                                  && t.ForumBoard.BlizzArea.Game.GameAbbreviation.ToLower() == game
                                  && t.ForumBoard.BlizzArea.Language.LanguageAbbreviation.ToLower() == language
                                  && t.ForumBoard.BlizzArea.Region.RegionAbbreviation.ToLower() == region
                               select t).FirstOrDefault();

                trElementData.isNewThread = topic == null;

                if (trElementData.isNewThread)
                {
                    Console.WriteLine("\nFound new thread to be marked for indexing. \n");

                    topic = new Topic();
                    topic.ForumBoardId = trElementData.ForumBoardId;
                    topic.ThreadTitle = trElementData.threadTitle;
                    topic.AuthorOfThread = trElementData.threadAuthor;
                    topic.LastPoster = trElementData.lastPoster.Trim();
                    topic.ThreadNumber = threadNumber;
                    topic.ToBeCrawled = true;
                    // temporary
                    topic.NoPages = trElementData.lastPage;
                    topic.LastPostDate = DateTime.Now.AddYears(-3);
                    topic.ThreadUpdate = trElementData.lastUpdateStamp;

                    entities.Topics.InsertOnSubmit(topic);
                }
                else
                {
                    if (topic.ThreadTitle != trElementData.threadTitle)
                    {
                        Console.WriteLine("\nThe thread's title was changed.\n");
                        topic.ThreadTitle = trElementData.threadTitle;
                    }

                    // Only re-queue a topic that is neither being crawled nor already queued.
                    if (topic.ThreadUpdate != trElementData.lastUpdateStamp && !topic.IsBeingCrawled && !topic.ToBeCrawled)
                    {
                        Console.WriteLine("\nFound known thread that was updated. \n");
                        topic.ToBeCrawled = true;
                        topic.NoPages = trElementData.lastPage;
                        topic.LastPoster = trElementData.lastPoster.Trim();
                        topic.ThreadUpdate = trElementData.lastUpdateStamp;
                    }
                }

                entities.SubmitChanges();
            }

            Console.WriteLine("Finished database operation.");
            return;
        }
        catch (Exception er)
        {
            Console.WriteLine("Database operation failed (attempt " + attempt + " of " + maxAttempts + "): " + er.Message);

            if (attempt < maxAttempts)
            {
                System.Threading.Thread.Sleep(retryDelayMilliseconds);
            }
        }
    }

    Console.WriteLine("Giving up on the database operation after " + maxAttempts + " failed attempts.");
}