/// <summary>
/// Set the UTC date/time of the post object.
/// </summary>
/// <remarks>
/// Reads <c>post.PostDate</c>, drops its last space-separated token (expected to be the
/// time-zone abbreviation), parses what remains and shifts it by a fixed offset for the
/// abbreviation found in the original text: BST -1h, CEST -2h, CET -1h, PST +8h, PDT +7h
/// (GMT and unrecognised abbreviations are not shifted). Text containing BST, CEST, CET or
/// GMT is parsed with the fr-FR culture, text containing PDT or PST with en-US; otherwise
/// the culture is chosen from <paramref name="region"/>.
/// <c>post.PostDateTimeUtc</c> is left untouched when the date is empty, contains
/// "unknown", or when neither the abbreviation nor the region identifies a culture.
/// </remarks>
/// <param name="post">The Post object to set the date/time of.</param>
/// <param name="region">
/// Region abbreviation used when the date carries no recognised time-zone abbreviation:
/// "EU" selects fr-FR and "US" selects en-US.
/// </param>
/// <exception cref="FormatException">The remaining date text cannot be parsed.</exception>
private void SetPostDate(Post post, string region)
{
    string dateToParse = post.PostDate;
    if (String.IsNullOrEmpty(dateToParse) || dateToParse.Contains("unknown"))
    {
        return;
    }

    string[] dateParts = dateToParse.Split(' ');
    int lastIndex = dateParts.Length - 1;

    // A no-break space (U+00A0) is not matched by Split(' ') above, so it can leave the
    // time-zone abbreviation glued to the preceding token.
    // NOTE(review): the original code used an invisible whitespace literal here; it is
    // assumed to have been U+00A0 - confirm against real forum data.
    int noBreakSpaceIndex = dateParts[lastIndex].IndexOf('\u00A0');

    string newDateRaw;
    if (noBreakSpaceIndex >= 0)
    {
        // Cut the glued abbreviation off the last token and keep every token.
        dateParts[lastIndex] = dateParts[lastIndex].Remove(noBreakSpaceIndex);
        newDateRaw = String.Join(" ", dateParts);
    }
    else
    {
        // The last token is dropped entirely.
        newDateRaw = String.Join(" ", dateParts, 0, lastIndex);
    }

    // Pick the culture that decides day/month order.
    string cultureName = null;
    if (dateToParse.Contains("BST") || dateToParse.Contains("CEST") || dateToParse.Contains("CET") || dateToParse.Contains("GMT"))
    {
        cultureName = "fr-FR";
    }
    else if (dateToParse.Contains("PDT") || dateToParse.Contains("PST"))
    {
        cultureName = "en-US";
    }
    else if (region == "EU")
    {
        cultureName = "fr-FR";
    }
    else if (region == "US")
    {
        cultureName = "en-US";
    }

    if (cultureName == null)
    {
        // Nothing identifies how to read the date. Previously default(DateTime)
        // (0001-01-01) was stored as the post's UTC date in this case.
        return;
    }

    DateTime postDate = DateTime.Parse(newDateRaw, CultureInfo.CreateSpecificCulture(cultureName));

    // Fixed offsets from the local zone to UTC.
    if (dateToParse.Contains("BST"))
    {
        postDate = postDate.AddHours(-1);
    }
    else if (dateToParse.Contains("CEST"))
    {
        postDate = postDate.AddHours(-2);
    }
    else if (dateToParse.Contains("CET"))
    {
        postDate = postDate.AddHours(-1);
    }
    else if (dateToParse.Contains("PST"))
    {
        postDate = postDate.AddHours(+8);
    }
    else if (dateToParse.Contains("PDT"))
    {
        postDate = postDate.AddHours(+7);
    }

    post.PostDateTimeUtc = postDate;
}
/// <summary>
/// Crawls the pages of a forum topic and stores the posts selected for indexing.
/// </summary>
/// <remarks>
/// Page 1 is always fetched, followed by every page from the topic's last crawled page up
/// to its page count. A post is indexed when its markup contains
/// <c>&lt;span id="1"&gt;</c>, or when its CSS class contains "blizzard" (poster type 2) or
/// "mvp" (poster type 1). Posts are matched against stored rows by their direct link: an
/// existing row has its content and avatar refreshed when they differ, otherwise a new row
/// is inserted (only when post content was found). Afterwards the topic row is updated with
/// the last crawled page, the most recent dated poster-type-2 post, and both crawl flags
/// cleared.
/// </remarks>
/// <param name="topic">
/// The topic to crawl. Its <c>LastCrawledPage</c> is set to 1 when it has no value.
/// </param>
private void CrawlSpecifiedTopics(Topic topic)
{
    // We are going to make sure page 1 is always crawled.
    List<int> pageNos = new List<int>();
    pageNos.Add(1);

    if (!topic.LastCrawledPage.HasValue)
    {
        topic.LastCrawledPage = 1;
    }

    for (int page = topic.LastCrawledPage.Value; page <= topic.NoPages; page++)
    {
        if (page != 1)
        {
            pageNos.Add(page);
        }
    }

    // URL segments are identifiers, so lower-case them culture-independently.
    string regionAbbreviation = topic.ForumBoard.BlizzArea.Region.RegionAbbreviation;
    string regionHost = regionAbbreviation.ToLowerInvariant();
    string game = topic.ForumBoard.BlizzArea.Game.GameAbbreviation.ToLowerInvariant();
    string language = topic.ForumBoard.BlizzArea.Language.LanguageAbbreviation.ToLowerInvariant();

    int lastCrawledPage = 0;
    foreach (int pageNo in pageNos)
    {
        // The same URL is used to fetch the page and as the prefix of each direct post link.
        string pageUrl = String.Format("http://{0}.battle.net/{1}/{2}/forum/topic/{3}?page={4}", regionHost, game, language, topic.ThreadNumber, pageNo);

        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(WebClient.GetRawHtml(pageUrl));

        // NOTE(review): GetElementbyId returns null when the page has no "post-list"
        // element, which makes the next line throw - confirm whether callers rely on that.
        var postDivs = document.GetElementbyId("post-list").ChildNodes
            .Where(x => x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("topic-post"));

        foreach (var div in postDivs)
        {
            if (!div.HasChildNodes)
            {
                continue;
            }

            string cssClass = div.Attributes["class"].Value;
            short posterType = 0;
            bool indexThisPost = div.InnerHtml.Contains("<span id=\"1\">"); // incredibly fragile

            if (cssClass.Contains("blizzard"))
            {
                indexThisPost = true;
                posterType = 2;
            }
            else if (cssClass.Contains("mvp"))
            {
                indexThisPost = true;
                posterType = 1;
            }

            if (!indexThisPost)
            {
                continue;
            }

            // Post content
            string postContent = null;
            var postContentTag = document.DocumentNode.SelectSingleNode(PostXPath(div.Id, "/div/table/tr/td[2]/div"));
            if (postContentTag != null)
            {
                postContent = String.Empty + postContentTag.InnerHtml;
            }

            // Avatar
            string avatarLink = null;
            var avatarImgTag = document.DocumentNode.SelectSingleNode(PostXPath(div.Id, "/div/table/tr/td[1]/div/div[1]/div/div/a/img"));
            if (avatarImgTag != null)
            {
                avatarLink = BuildAvatarTemplate(avatarImgTag.OuterHtml, regionHost);
            }

            // Direct post link
            string linkDirectPost = null;
            var indexATag = document.DocumentNode.SelectSingleNode(PostXPath(div.Id, "/div/table/tr/td[3]/div/a"));
            if (indexATag == null) // deleted MVP post
            {
                indexATag = document.DocumentNode.SelectSingleNode(PostXPath(div.Id, "/div[1]/table/tr/td[3]/div/a"));
            }
            if (indexATag != null)
            {
                linkDirectPost = pageUrl + indexATag.Attributes["href"].Value;
            }

            // Post date
            string postDate = null;
            var postDateDiv = document.DocumentNode.SelectSingleNode(PostXPath(div.Id, "/div/table/tr/td[3]/div/div[1]"));
            if (postDateDiv != null)
            {
                // Normalise no-break spaces so the date can later be split on ordinary spaces.
                // NOTE(review): the original replaced an invisible whitespace literal; both
                // the U+00A0 character and the "&nbsp;" entity are handled here - confirm.
                postDate = postDateDiv.Attributes["data-tooltip"].Value
                    .Replace("&nbsp;", " ")
                    .Replace('\u00A0', ' ');
            }

            // Poster name
            string posterName = null;
            var posterNameSpan = document.DocumentNode.SelectSingleNode(PostXPath(div.Id, "/div/table/tr/td[1]/div/div[2]/div[1]/a/span"));
            if (posterNameSpan == null)
            {
                posterNameSpan = document.DocumentNode.SelectSingleNode(PostXPath(div.Id, "/div[1]/table/tr/td[1]/div/div/div/a/span"));
            }
            if (posterNameSpan != null)
            {
                posterName = posterNameSpan.InnerText;
            }

            // Poster special title (only read for poster type 2)
            string posterSpecialTitle = null;
            if (posterType == 2)
            {
                var posterSpecialTitleDiv = document.DocumentNode.SelectSingleNode(PostXPath(div.Id, "/div/table/tr/td[1]/div/div[2]/div[2]"));
                if (posterSpecialTitleDiv != null)
                {
                    posterSpecialTitle = posterSpecialTitleDiv.InnerText;
                }
            }

            // Post edited marker
            string postEdited = null;
            var postEditedDiv = document.DocumentNode.SelectSingleNode(PostXPath(div.Id, "/div/table/tr/td[2]/div[2]"));
            if (postEditedDiv != null)
            {
                postEdited = postEditedDiv.OuterHtml;
            }

            // Combining post edited with post content.
            if (!String.IsNullOrEmpty(postEdited))
            {
                postContent += postEdited;
            }

            if (postContent != null)
            {
                postContent = postContent.Replace("<br>", "<br />");
            }

            // One context and one query decide between refreshing an existing post and
            // inserting a new one.
            using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
            {
                Post existingPost = (from p in entities.Posts
                                     where p.DirectPostLink == linkDirectPost
                                     select p).FirstOrDefault();

                if (existingPost != null)
                {
                    if (existingPost.PostContent != postContent || existingPost.AvatarLinkOfPost != avatarLink)
                    {
                        existingPost.PostContent = postContent;
                        existingPost.AvatarLinkOfPost = avatarLink;
                        entities.SubmitChanges();
                    }
                }
                else if (postContent != null)
                {
                    Post post = new Post();
                    post.TopicId = topic.Id;
                    post.DirectPostLink = linkDirectPost;
                    post.PosterName = posterName;
                    post.PostDate = postDate;
                    post.PosterType = posterType;
                    post.PosterSpecialTitle = posterSpecialTitle;
                    post.AvatarLinkOfPost = avatarLink;
                    post.PostContent = postContent;

                    try
                    {
                        SetPostDate(post, regionAbbreviation);
                    }
                    catch (Exception)
                    {
                        // Deliberate best effort: this will happen if time zone is not
                        // specified. The post is still stored, just without a UTC date.
                    }

                    entities.Posts.InsertOnSubmit(post);
                    entities.SubmitChanges();
                }
            }
        }

        lastCrawledPage = pageNo;
    }

    using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
    {
        Topic topicToSave = (from t in entities.Topics
                             where t.Id == topic.Id
                             select t).Single();

        // Only posts that actually have a UTC date are considered. Previously a blue post
        // without a date could make .Value throw here, before the crawl flags were cleared.
        var latestBlueDate = (from p in entities.Posts
                              where p.TopicId == topicToSave.Id && p.PosterType == 2 && p.PostDateTimeUtc != null
                              orderby p.PostDateTimeUtc descending
                              select p.PostDateTimeUtc).FirstOrDefault();

        if (latestBlueDate != null)
        {
            topicToSave.LastPostDate = latestBlueDate.Value;
        }

        topicToSave.LastCrawledPage = lastCrawledPage;
        topicToSave.IsBeingCrawled = false;
        topicToSave.ToBeCrawled = false;
        entities.SubmitChanges();
    }
}

/// <summary>
/// Builds the XPath that selects <paramref name="relativePath"/> beneath the element whose
/// id is <paramref name="postId"/>.
/// </summary>
private static string PostXPath(string postId, string relativePath)
{
    return String.Format("//*[@id=\"{0}\"]{1}", postId, relativePath);
}

/// <summary>
/// Turns the markup of an avatar <c>img</c> element into an <c>img</c> template whose width
/// and height are left as the format placeholders <c>{0}</c> and <c>{1}</c>.
/// </summary>
/// <param name="imgOuterHtml">Outer HTML of the avatar image element.</param>
/// <param name="regionHost">Lower-case region abbreviation used as the battle.net sub-domain.</param>
private static string BuildAvatarTemplate(string imgOuterHtml, string regionHost)
{
    // Take the text between the first pair of double quotes that follows "src".
    string src = imgOuterHtml.Remove(0, imgOuterHtml.IndexOf("src", StringComparison.Ordinal) + 1);
    src = src.Remove(0, src.IndexOf("\"", StringComparison.Ordinal) + 1);
    src = src.Remove(src.IndexOf("\"", StringComparison.Ordinal));

    // TODO: Second condition here might conflict in other regions. Check for this.
    if (!src.Contains("battle.net") && !src.Contains("cms/user_avatar") && !src.Contains("media.blizzard"))
    {
        src = String.Format("http://{0}.battle.net/", regionHost) + src;
    }

    return String.Format("<img alt=\"avatar\" src = \"{0}\" width={1} height={2}", src, "{0}", "{1} />");
}
// Partial-method declarations for deleting, updating and inserting a Post. No bodies are
// given here; under C# rules a partial method without an implementing declaration is
// removed by the compiler together with every call to it.
// NOTE(review): the names match the hooks a LINQ to SQL data context usually declares for
// customising persistence of an entity - confirm against the enclosing class.

// Declared hook that receives the Post being deleted.
partial void DeletePost(Post instance);
// Declared hook that receives the Post being updated.
partial void UpdatePost(Post instance);
// Declared hook that receives the Post being inserted.
partial void InsertPost(Post instance);
/// <summary>
/// Calls <c>SendPropertyChanging("Posts")</c> and then clears the <c>Topic</c> reference of
/// the given post.
/// </summary>
/// <param name="entity">The post whose <c>Topic</c> is set to <c>null</c>.</param>
private void detach_Posts(Post entity)
{
    this.SendPropertyChanging("Posts");
    entity.Topic = null;
}
/// <summary>
/// Calls <c>SendPropertyChanging("Posts")</c> and then points the <c>Topic</c> reference of
/// the given post at this instance.
/// </summary>
/// <param name="entity">The post whose <c>Topic</c> is set to this instance.</param>
private void attach_Posts(Post entity)
{
    this.SendPropertyChanging("Posts");
    entity.Topic = this;
}