/// <summary>
/// Background-worker handler that synchronises one scraped thread row with the Topics table.
/// A thread that is not stored yet is inserted and flagged for crawling; a known thread gets its
/// title refreshed and is re-flagged for crawling when its update stamp changed and it is neither
/// queued nor currently being crawled.
/// </summary>
/// <param name="sender">Not used by this handler.</param>
/// <param name="e">Event data whose <c>Argument</c> must be a <c>TrElementData</c> instance.
/// Its <c>isNewThread</c> member is overwritten with the result of the lookup.</param>
/// <exception cref="ArgumentException"><c>e.Argument</c> is null or not a <c>TrElementData</c>.</exception>
private void worker_DoWork(object sender, DoWorkEventArgs e)
{
    const int QuietFailureLimit = 10;
    const int RetryDelayMilliseconds = 2000;

    // A missing or wrongly-typed argument can never succeed on retry, so fail fast
    // instead of sleeping and retrying forever inside the loop below.
    TrElementData trElementData = e.Argument as TrElementData;
    if (trElementData == null)
    {
        throw new ArgumentException("worker_DoWork requires a TrElementData argument.", "e");
    }

    // Loop-invariant lookup keys; invariant casing keeps them independent of the thread culture.
    string game = trElementData.Game.ToLowerInvariant();
    string language = trElementData.Lang.ToLowerInvariant();
    string region = trElementData.Region.ToLowerInvariant();

    bool isSuccess = false;
    int failureCount = 0;
    while (!isSuccess)
    {
        try
        {
            using (ForumBlogsDataContext entities = new ForumBlogsDataContext(trElementData.ConnStr))
            {
                Console.WriteLine("Initiating database connection . . .");

                // One round trip answers both "does it exist?" and "give me the row".
                Topic existingTopic = (from t in entities.Topics
                                       where t.ThreadNumber == trElementData.threadNumber
                                          && t.ForumBoard.BlizzArea.Game.GameAbbreviation.ToLower() == game
                                          && t.ForumBoard.BlizzArea.Language.LanguageAbbreviation.ToLower() == language
                                          && t.ForumBoard.BlizzArea.Region.RegionAbbreviation.ToLower() == region
                                       select t).FirstOrDefault();

                trElementData.isNewThread = (existingTopic == null);
                if (trElementData.isNewThread)
                {
                    Console.WriteLine("\nFound new thread to be marked for indexing. \n");

                    Topic topic = new Topic();
                    topic.ForumBoardId = trElementData.ForumBoardId;
                    topic.ThreadTitle = trElementData.threadTitle;
                    topic.AuthorOfThread = trElementData.threadAuthor;
                    topic.LastPoster = trElementData.lastPoster.Trim();
                    topic.ThreadNumber = trElementData.threadNumber.Value;
                    topic.ToBeCrawled = true; //temporary
                    topic.NoPages = trElementData.lastPage;
                    // Placeholder date three years in the past; the crawl later replaces it.
                    topic.LastPostDate = DateTime.Now.AddYears(-3);
                    topic.ThreadUpdate = trElementData.lastUpdateStamp;

                    entities.Topics.InsertOnSubmit(topic);
                }
                else
                {
                    if (existingTopic.ThreadTitle != trElementData.threadTitle)
                    {
                        Console.WriteLine("\nThe thread's title was changed.\n");
                        existingTopic.ThreadTitle = trElementData.threadTitle;
                    }

                    // Only re-queue a thread that is neither queued nor being crawled right now.
                    if (existingTopic.ThreadUpdate != trElementData.lastUpdateStamp
                        && !existingTopic.IsBeingCrawled
                        && !existingTopic.ToBeCrawled)
                    {
                        Console.WriteLine("\nFound known thread that was updated. \n");
                        existingTopic.ToBeCrawled = true;
                        existingTopic.NoPages = trElementData.lastPage;
                        existingTopic.LastPoster = trElementData.lastPoster.Trim();
                        existingTopic.ThreadUpdate = trElementData.lastUpdateStamp;
                    }
                }

                entities.SubmitChanges();
            }

            isSuccess = true;
            Console.WriteLine("Finished database operation.");
        }
        catch (Exception er)
        {
            // Best-effort retry: database failures are retried indefinitely after a pause.
            // The message is only printed once the failure count exceeds the quiet limit.
            failureCount++;
            System.Threading.Thread.Sleep(RetryDelayMilliseconds);
            if (failureCount > QuietFailureLimit)
            {
                Console.WriteLine(er.Message);
            }
        }
    }
}
/// <summary>
/// Downloads the pages of a forum topic that still need crawling, stores or refreshes every post
/// worth indexing (posts containing the <c>&lt;span id="1"&gt;</c> marker, Blizzard posts and MVP posts),
/// and finally records the last crawled page and clears the topic's crawl flags.
/// </summary>
/// <param name="topic">Topic to crawl. When its <c>LastCrawledPage</c> has no value it is set to 1
/// on this instance.</param>
private void CrawlSpecifiedTopics(Topic topic)
{
    // Page 1 is always crawled; the remaining pages resume from the last crawled page.
    List<int> pageNos = new List<int>();
    pageNos.Add(1);
    if (!topic.LastCrawledPage.HasValue)
    {
        topic.LastCrawledPage = 1;
    }
    for (int page = topic.LastCrawledPage.Value; page <= topic.NoPages; page++)
    {
        if (page != 1)
        {
            pageNos.Add(page);
        }
    }

    // Loop-invariant URL parts; invariant casing keeps the URLs independent of the thread culture.
    string region = topic.ForumBoard.BlizzArea.Region.RegionAbbreviation.ToLowerInvariant();
    string game = topic.ForumBoard.BlizzArea.Game.GameAbbreviation.ToLowerInvariant();
    string language = topic.ForumBoard.BlizzArea.Language.LanguageAbbreviation.ToLowerInvariant();

    int lastCrawledPage = 0;
    foreach (int pageNo in pageNos)
    {
        string pageUrl = String.Format("http://{0}.battle.net/{1}/{2}/forum/topic/{3}?page={4}",
            region, game, language, topic.ThreadNumber, pageNo);
        string ht = WebClient.GetRawHtml(pageUrl);

        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(ht);
        var root = document.DocumentNode;

        // NOTE(review): if the page has no "post-list" element this line presumably throws a
        // NullReferenceException, exactly as before - confirm how callers want that handled.
        var divTags = document.GetElementbyId("post-list").ChildNodes
            .Where(x => x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("topic-post"));

        foreach (var div in divTags)
        {
            if (!div.HasChildNodes)
            {
                continue;
            }

            // Decide whether the post is indexed and what kind of poster wrote it.
            bool indexThisPost = div.InnerHtml.Contains("<span id=\"1\">"); //incredibly fragile
            short posterType = 0;
            string cssClass = div.Attributes["class"].Value;
            if (cssClass.Contains("blizzard"))
            {
                indexThisPost = true;
                posterType = 2;
            }
            else if (cssClass.Contains("mvp"))
            {
                indexThisPost = true;
                posterType = 1;
            }

            if (!indexThisPost)
            {
                continue;
            }

            // Every lookup below is relative to the element carrying this post's id,
            // e.g. //*[@id="post-64426546361"]/div[1]/table/tr/td[2]/div
            string postXPath = String.Format("//*[@id=\"{0}\"]", div.Id);

            #region Post Content
            string postContent = null;
            var postContentTag = root.SelectSingleNode(postXPath + "/div/table/tr/td[2]/div");
            if (postContentTag != null)
            {
                postContent = String.Empty + postContentTag.InnerHtml;
            }
            #endregion

            #region avatar
            string avatarLink = null;
            var avatarImgTag = root.SelectSingleNode(postXPath + "/div/table/tr/td[1]/div/div[1]/div/div/a/img");
            if (avatarImgTag != null)
            {
                // Cut the value of the src attribute out of the raw <img> markup.
                avatarLink = avatarImgTag.OuterHtml;
                string src = avatarLink.Remove(0, avatarLink.IndexOf("src") + 1);
                src = src.Remove(0, src.IndexOf("\"") + 1);
                src = src.Remove(src.IndexOf("\""));

                //TODO: Second condition here might conflict in other regions. Check for this.
                if (!src.Contains("battle.net") && !src.Contains("cms/user_avatar") && !src.Contains("media.blizzard"))
                {
                    src = String.Format("http://{0}.battle.net/", region) + src;
                }

                // The stored markup deliberately keeps "{0}" / "{1}" placeholders for width and height.
                avatarLink = String.Format("<img alt=\"avatar\" src = \"{0}\" width={1} height={2}", src, "{0}", "{1} />");
            }
            #endregion

            #region Direct Post Link
            string linkDirectPost = null;
            var indexATag = root.SelectSingleNode(postXPath + "/div/table/tr/td[3]/div/a");
            if (indexATag == null) //deleted MVP post
            {
                indexATag = root.SelectSingleNode(postXPath + "/div[1]/table/tr/td[3]/div/a");
            }
            if (indexATag != null)
            {
                linkDirectPost = String.Format("http://{0}.battle.net/{1}/{2}/forum/topic/{3}?page={4}{5}",
                    region, game, language, topic.ThreadNumber, pageNo, indexATag.Attributes["href"].Value);
            }
            #endregion

            #region Post Date
            string postDate = null;
            var postDateDiv = root.SelectSingleNode(postXPath + "/div/table/tr/td[3]/div/div[1]");
            if (postDateDiv != null)
            {
                // Normalise non-breaking spaces (raw entity or U+00A0) to plain spaces.
                postDate = postDateDiv.Attributes["data-tooltip"].Value
                    .Replace("&nbsp;", " ")
                    .Replace("\u00A0", " ");
            }
            #endregion

            #region Poster Name
            string posterName = null;
            var posterNameSpan = root.SelectSingleNode(postXPath + "/div/table/tr/td[1]/div/div[2]/div[1]/a/span");
            if (posterNameSpan == null)
            {
                // Alternative layout: //*[@id="post-66887172203"]/div[1]/table/tr/td[1]/div/div/div/a/span
                posterNameSpan = root.SelectSingleNode(postXPath + "/div[1]/table/tr/td[1]/div/div/div/a/span");
            }
            if (posterNameSpan != null)
            {
                posterName = posterNameSpan.InnerText;
            }
            #endregion

            #region Poster Special Title
            string posterSpecialTitle = null;
            if (posterType == 2)
            {
                var posterSpecialTitleDiv = root.SelectSingleNode(postXPath + "/div/table/tr/td[1]/div/div[2]/div[2]");
                if (posterSpecialTitleDiv != null)
                {
                    posterSpecialTitle = posterSpecialTitleDiv.InnerText;
                }
            }
            #endregion

            #region Post Edited
            string postEdited = null;
            var postEditedDiv = root.SelectSingleNode(postXPath + "/div/table/tr/td[2]/div[2]");
            if (postEditedDiv != null)
            {
                postEdited = postEditedDiv.OuterHtml;
            }
            #endregion

            //Combining post edited with post content.
            if (!String.IsNullOrEmpty(postEdited))
            {
                postContent += postEdited;
            }
            if (postContent != null)
            {
                postContent = postContent.Replace("<br>", "<br />");
            }

            // Update the stored post when it exists, otherwise insert it; one context and one
            // lookup replace the former count-then-fetch pair of queries.
            using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
            {
                Post existingPost = (from p in entities.Posts
                                     where p.DirectPostLink == linkDirectPost
                                     select p).FirstOrDefault();

                if (existingPost != null)
                {
                    if (existingPost.PostContent != postContent || existingPost.AvatarLinkOfPost != avatarLink)
                    {
                        existingPost.PostContent = postContent;
                        existingPost.AvatarLinkOfPost = avatarLink;
                        entities.SubmitChanges();
                    }
                }
                else if (postContent != null)
                {
                    Post post = new Post();
                    post.TopicId = topic.Id;
                    post.DirectPostLink = linkDirectPost;
                    post.PosterName = posterName;
                    post.PostDate = postDate;
                    post.PosterType = posterType;
                    post.PosterSpecialTitle = posterSpecialTitle;
                    post.AvatarLinkOfPost = avatarLink;
                    post.PostContent = postContent;

                    try
                    {
                        SetPostDate(post, topic.ForumBoard.BlizzArea.Region.RegionAbbreviation);
                    }
                    catch (Exception ex) //This will happen if time zone is not specified
                    {
                        // Best effort: the post is still stored, just without a resolved date.
                        Console.WriteLine("Could not resolve the post date: " + ex.Message);
                    }

                    entities.Posts.InsertOnSubmit(post);
                    entities.SubmitChanges();
                }
            }
        }

        lastCrawledPage = pageNo;
    }

    using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
    {
        Topic topicToSave = (from t in entities.Topics
                             where t.Id == topic.Id
                             select t).Single();

        // Latest dated Blizzard post (PosterType 2). Posts whose date could not be resolved are
        // skipped so a null date can no longer abort the save and leave the crawl flags set.
        var latestBlueDate = (from p in entities.Posts
                              where p.TopicId == topicToSave.Id
                                 && p.PosterType == 2
                                 && p.PostDateTimeUtc != null
                              orderby p.PostDateTimeUtc descending
                              select p.PostDateTimeUtc).FirstOrDefault();
        if (latestBlueDate.HasValue)
        {
            topicToSave.LastPostDate = latestBlueDate.Value;
        }

        topicToSave.LastCrawledPage = lastCrawledPage;
        topicToSave.IsBeingCrawled = false;
        topicToSave.ToBeCrawled = false;
        entities.SubmitChanges();
    }
}
// Partial method declarations without bodies: unless another part of this partial class
// supplies an implementation, the C# compiler removes the declarations and any calls to them.
// NOTE(review): the names suggest per-operation hooks for deleting, updating and inserting a
// Topic - confirm against the implementing part, which is not visible here.
partial void DeleteTopic(Topic instance);
partial void UpdateTopic(Topic instance);
partial void InsertTopic(Topic instance);
/// <summary>
/// Raises the property-changing notification for "Topics" and then clears the
/// <c>ForumBoard</c> reference of the given topic.
/// </summary>
/// <param name="entity">Topic whose <c>ForumBoard</c> reference is set to null.</param>
// NOTE(review): presumably invoked when a Topic leaves this object's Topics collection - confirm at the call site.
private void detach_Topics(Topic entity)
{
    SendPropertyChanging("Topics");
    entity.ForumBoard = null;
}