/// <summary>
/// Crawls every topic of the given game that is flagged as <c>ToBeCrawled</c>.
/// </summary>
/// <param name="game">Game abbreviation the topics' forum boards must belong to.</param>
public void PerformCrawl(string game)
{
    using (ForumBlogsDataContext db = new ForumBlogsDataContext(_connStr))
    {
        var pendingTopics = db.Topics
            .Where(candidate => candidate.ForumBoard.BlizzArea.Game.GameAbbreviation == game && candidate.ToBeCrawled)
            .ToList();

        // Phase 1: flag the whole batch as in progress and persist that before any crawling starts.
        pendingTopics.ForEach(pending => pending.IsBeingCrawled = true);
        db.SubmitChanges();

        // Phase 2: crawl one topic at a time; a failure on one topic is logged and the batch continues.
        foreach (var pending in pendingTopics)
        {
            try
            {
                CrawlSpecifiedTopics(pending);
                db.SubmitChanges();
            }
            catch (Exception crawlError)
            {
                Console.WriteLine(crawlError.Message);
            }
        }

        // Final flush of anything still pending on this context.
        db.SubmitChanges();
    }
}
/// <summary>
/// Sends the pending Hearthstone entries of the tweet queue to Twitter and records the outcome
/// of each tweet on its queue row.
/// </summary>
/// <remarks>
/// Only entries added within the last 35 minutes, not yet tweeted, matching <c>this.Lang</c>, and
/// whose <c>PostLinkNo</c> is 1 or an even number are selected. Any exception is logged to the
/// console and ends the run; entries already submitted stay marked as tweeted.
/// </remarks>
public override void PerformTwitterOperation()
{
    Console.Write("Starting... ({0}) \n", "Hearthstone");

    const string gameHashtag = "#Hearthstone";
    const string gameNormTag = "[#HS]";

    // The data context is disposable; the using block releases it on every exit path.
    using (ForumBlogsDataContext db = new ForumBlogsDataContext(_connStr))
    {
        try
        {
            var tweetQueue = (from t in db.TweetQueues
                              where t.DateAdded > (DateTime.UtcNow.AddMinutes(-35))
                                    && !t.IsTweeted
                                    && t.BlizzArea.Language.LanguageAbbreviation == this.Lang
                                    && t.BlizzArea.Game.GameAbbreviation == "HEARTHSTONE"
                                    && t.PostLinkNo.HasValue
                                    && (t.PostLinkNo.Value == 1 || t.PostLinkNo.Value % 2 == 0)
                              select t);

            // One authenticated client is enough for the whole batch.
            TwitterService service = new TwitterService(CONSUMER_KEY, CONSUMER_SECRET);
            service.AuthenticateWith(ACCESS_TOKEN, ACCESS_SECRET);

            foreach (var tweet in tweetQueue)
            {
                string tweetedContent = String.Empty;

                if (tweet.IsForumThread)
                {
                    string link = "http://www.blizzposts.com/topic/" + tweet.ItemId + "#" + tweet.PostLinkNo;
                    tweetedContent = String.Format("{6} New blue post in \"{0}\" {1} - by {2} ({3}) {4}{5}",
                        HttpUtility.HtmlDecode(tweet.Title),
                        link,
                        tweet.NameOfThePoster.Trim(),
                        tweet.BlizzArea.Region.RegionAbbreviation,
                        gameHashtag,
                        "", //was bluecount - removed
                        gameNormTag);
                }
                else
                {
                    string link = "http://www.blizzposts.com/blogentry/" + tweet.ItemId;
                    tweetedContent = String.Format("{4} [Blog {3}-{5}] \"{0}\" - {1} {2}",
                        HttpUtility.HtmlDecode(tweet.Title),
                        link,
                        gameHashtag,
                        tweet.BlizzArea.Region.RegionAbbreviation,
                        gameNormTag,
                        this.Lang);
                }

                service.SendTweet(new SendTweetOptions { Status = tweetedContent });

                tweet.TweetResult = service.Response.StatusCode + " - " + service.Response.StatusDescription;
                tweet.TweetedContent = tweetedContent;
                tweet.IsTweeted = true;
                tweet.DateIsTweeted = DateTime.UtcNow;

                Console.WriteLine("Tweeted content: {0}", tweetedContent);
                Console.WriteLine("Tweet Result: {0}\n\n", service.Response.StatusCode);

                // Persist after every tweet so a later failure cannot cause a re-send of this one.
                db.SubmitChanges();
            }
        }
        catch (Exception e)
        {
            Console.WriteLine("An error has ocurred: \n {0} \n", e.Message);
        }
    }
}
/// <summary>
/// Background-worker handler that registers a scraped forum thread row in the database:
/// inserts a new <c>Topic</c> flagged for crawling, or updates a known one whose title or
/// update stamp changed.
/// </summary>
/// <param name="sender">The worker raising the event (unused).</param>
/// <param name="e">Event data; <c>e.Argument</c> is expected to be a <c>TrElementData</c>.</param>
/// <remarks>
/// The database operation is retried after a 2 second pause on failure. After more than
/// <c>MaxFailures</c> failed attempts the last error message is printed and the handler gives up,
/// so a permanent failure can no longer spin forever.
/// </remarks>
private void worker_DoWork(object sender, DoWorkEventArgs e)
{
    const int MaxFailures = 10;
    const int RetryDelayMilliseconds = 2000;

    // Without a usable argument every attempt would fail identically, so bail out up front.
    TrElementData trElementData = e.Argument as TrElementData;
    if (trElementData == null)
    {
        Console.WriteLine("worker_DoWork was started without a TrElementData argument.");
        return;
    }

    bool isSuccess = false;
    int failureCount = 0;

    while (!isSuccess)
    {
        try
        {
            using (ForumBlogsDataContext entities = new ForumBlogsDataContext(trElementData.ConnStr))
            {
                Console.WriteLine("Initiating database connection . . .");

                string game = trElementData.Game.ToLower();
                string lang = trElementData.Lang.ToLower();
                string region = trElementData.Region.ToLower();

                // Single lookup: null means the thread is not known yet.
                Topic existingTopic = (from t in entities.Topics
                                       where t.ThreadNumber == trElementData.threadNumber
                                             && t.ForumBoard.BlizzArea.Game.GameAbbreviation.ToLower() == game
                                             && t.ForumBoard.BlizzArea.Language.LanguageAbbreviation.ToLower() == lang
                                             && t.ForumBoard.BlizzArea.Region.RegionAbbreviation.ToLower() == region
                                       select t).FirstOrDefault();

                trElementData.isNewThread = (existingTopic == null);

                if (trElementData.isNewThread)
                {
                    Console.WriteLine("\nFound new thread to be marked for indexing. \n");

                    Topic newTopic = new Topic();
                    newTopic.ForumBoardId = trElementData.ForumBoardId;
                    newTopic.ThreadTitle = trElementData.threadTitle;
                    newTopic.AuthorOfThread = trElementData.threadAuthor;
                    newTopic.LastPoster = trElementData.lastPoster.Trim();
                    newTopic.ThreadNumber = trElementData.threadNumber.Value;
                    newTopic.ToBeCrawled = true;
                    //temporary
                    newTopic.NoPages = trElementData.lastPage;
                    newTopic.LastPostDate = DateTime.Now.AddYears(-3);
                    newTopic.ThreadUpdate = trElementData.lastUpdateStamp;

                    entities.Topics.InsertOnSubmit(newTopic);
                    entities.SubmitChanges();
                }
                else
                {
                    if (existingTopic.ThreadTitle != trElementData.threadTitle)
                    {
                        Console.WriteLine("\nThe thread's title was changed.\n");
                        existingTopic.ThreadTitle = trElementData.threadTitle;
                    }

                    // Only re-queue a thread that is neither queued nor currently being crawled.
                    if (existingTopic.ThreadUpdate != trElementData.lastUpdateStamp
                        && !existingTopic.IsBeingCrawled
                        && !existingTopic.ToBeCrawled)
                    {
                        Console.WriteLine("\nFound known thread that was updated. \n");
                        existingTopic.ToBeCrawled = true;
                        existingTopic.NoPages = trElementData.lastPage;
                        existingTopic.LastPoster = trElementData.lastPoster.Trim();
                        existingTopic.ThreadUpdate = trElementData.lastUpdateStamp;
                    }

                    entities.SubmitChanges();
                }
            }

            isSuccess = true;
            Console.WriteLine("Finished database operation.");
        }
        catch (Exception er)
        {
            failureCount++;

            if (failureCount > MaxFailures)
            {
                // Retries exhausted: report the last error and stop instead of looping forever.
                Console.WriteLine(er.Message);
                return;
            }

            System.Threading.Thread.Sleep(RetryDelayMilliseconds);
        }
    }
}
/// <summary>
/// Downloads the pages of a forum topic, extracts the posts worth indexing (the post carrying
/// <c>&lt;span id="1"&gt;</c>, Blizzard posts and MVP posts) and inserts or updates them in the
/// database, then marks the topic as crawled.
/// </summary>
/// <param name="topic">
/// Topic to crawl. Its <c>LastCrawledPage</c> is set to 1 on the passed instance when it has no value.
/// </param>
/// <exception cref="InvalidOperationException">
/// A downloaded page has no element with id <c>post-list</c>.
/// </exception>
private void CrawlSpecifiedTopics(Topic topic)
{
    //We are going to make sure page 1 is always crawled.
    List<int> pageNos = new List<int>();
    pageNos.Add(1);

    if (!topic.LastCrawledPage.HasValue)
    {
        topic.LastCrawledPage = 1;
    }

    for (int i = topic.LastCrawledPage.Value; i <= topic.NoPages; i++)
    {
        if (i != 1)
        {
            pageNos.Add(i);
        }
    }

    // URL parts are the same for every page and post of this topic.
    string region = topic.ForumBoard.BlizzArea.Region.RegionAbbreviation.ToLower();
    string game = topic.ForumBoard.BlizzArea.Game.GameAbbreviation.ToLower();
    string lang = topic.ForumBoard.BlizzArea.Language.LanguageAbbreviation.ToLower();

    int lastCrawledPage = 0;

    foreach (int pageNo in pageNos)
    {
        string ht = WebClient.GetRawHtml(String.Format("http://{0}.battle.net/{1}/{2}/forum/topic/{3}?page={4}",
            region,
            game,
            lang,
            topic.ThreadNumber,
            pageNo));

        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(ht);

        var postList = document.GetElementbyId("post-list");
        if (postList == null)
        {
            // Fail with a descriptive error instead of a bare NullReferenceException.
            throw new InvalidOperationException(String.Format(
                "Page {0} of thread {1} has no 'post-list' element.", pageNo, topic.ThreadNumber));
        }

        var divTags = postList.ChildNodes.Where(x => x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("topic-post"));

        foreach (var div in divTags)
        {
            if (!div.HasChildNodes)
            {
                continue;
            }

            bool indexThisPost = false;
            short posterType = 0;

            if (div.InnerHtml.Contains("<span id=\"1\">")) //incredibly fragile
            {
                indexThisPost = true;
            }

            string divClass = div.Attributes["class"].Value;
            if (divClass.Contains("blizzard"))
            {
                indexThisPost = true;
                posterType = 2;
            }
            else if (divClass.Contains("mvp"))
            {
                indexThisPost = true;
                posterType = 1;
            }

            if (!indexThisPost)
            {
                continue;
            }

            string postContent = null;
            string avatarLink = null;
            string linkDirectPost = null;
            string postDate = null;
            string posterName = null;
            string posterSpecialTitle = null;
            string postEdited = null;

            #region Post Content
            //*[@id="post-64426546361"]/div[1]/table/tr/td[2]/div
            var postContentTag = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[2]/div", div.Id));
            if (postContentTag != null)
            {
                postContent = String.Empty;
                postContent += postContentTag.InnerHtml;
            }
            #endregion

            #region avatar
            // //*[@id="post-79781202523"]
            var avatarImgTag = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[1]/div/div[1]/div/div/a/img", div.Id));
            if (avatarImgTag != null)
            {
                // Cut the value of the src attribute out of the raw <img> markup.
                avatarLink = avatarImgTag.OuterHtml;
                string src = avatarLink.Remove(0, avatarLink.IndexOf("src") + 1);
                src = src.Remove(0, src.IndexOf("\"") + 1);
                src = src.Remove(src.IndexOf("\""));

                if (!src.Contains("battle.net") && !src.Contains("cms/user_avatar") && !src.Contains("media.blizzard")) //TODO: Second condition here might conflict in other regions. Check for this.
                {
                    src = String.Format("http://{0}.battle.net/", region) + src;
                }

                // "{0}" and "{1} />" are passed through as literal placeholders in the stored markup.
                avatarLink = String.Format("<img alt=\"avatar\" src = \"{0}\" width={1} height={2}", src, "{0}", "{1} />");
            }
            #endregion

            #region Direct Post Link
            var indexATag = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[3]/div/a", div.Id));
            if (indexATag == null) //deleted MVP post
            {
                indexATag = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div[1]/table/tr/td[3]/div/a", div.Id));
            }

            if (indexATag != null)
            {
                linkDirectPost = String.Format("http://{0}.battle.net/{1}/{2}/forum/topic/{3}?page={4}{5}",
                    region,
                    game,
                    lang,
                    topic.ThreadNumber,
                    pageNo,
                    indexATag.Attributes["href"].Value);
            }
            #endregion

            #region Post Date
            var postDateDiv = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[3]/div/div[1]", div.Id));
            if (postDateDiv != null)
            {
                postDate = postDateDiv.Attributes["data-tooltip"].Value.Replace(" ", " ");
            }
            #endregion

            #region Poster Name
            var posterNameSpan = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[1]/div/div[2]/div[1]/a/span", div.Id));
            //*[@id="post-66887172203"]/div[1]/table/tr/td[1]/div/div/div/a/span
            if (posterNameSpan == null)
            {
                posterNameSpan = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div[1]/table/tr/td[1]/div/div/div/a/span", div.Id));
            }

            if (posterNameSpan != null)
            {
                posterName = posterNameSpan.InnerText;
            }
            #endregion

            #region Poster Special Title
            if (posterType == 2)
            {
                var posterSpecialTitleDiv = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[1]/div/div[2]/div[2]", div.Id));
                if (posterSpecialTitleDiv != null)
                {
                    posterSpecialTitle = posterSpecialTitleDiv.InnerText;
                }
            }
            #endregion

            #region Post Edited
            var postEditedDiv = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[2]/div[2]", div.Id));
            if (postEditedDiv != null)
            {
                postEdited = postEditedDiv.OuterHtml;
            }
            #endregion

            //Combining post edited with post content.
            if (!String.IsNullOrEmpty(postEdited))
            {
                postContent += postEdited;
            }

            if (postContent != null)
            {
                postContent = postContent.Replace("<br>", "<br />");
            }

            // One context and one lookup per post: update the stored post if it exists, otherwise insert it.
            using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
            {
                Post existingPost = (from p in entities.Posts
                                     where p.DirectPostLink == linkDirectPost
                                     select p).FirstOrDefault();

                if (existingPost != null)
                {
                    if (existingPost.PostContent != postContent || existingPost.AvatarLinkOfPost != avatarLink)
                    {
                        existingPost.PostContent = postContent;
                        existingPost.AvatarLinkOfPost = avatarLink;
                    }

                    entities.SubmitChanges();
                }
                else if (postContent != null)
                {
                    Post post = new Post();
                    post.TopicId = topic.Id;
                    post.DirectPostLink = linkDirectPost;
                    post.PosterName = posterName;
                    post.PostDate = postDate;
                    post.PosterType = posterType;
                    post.PosterSpecialTitle = posterSpecialTitle;
                    post.AvatarLinkOfPost = avatarLink;
                    post.PostContent = postContent;

                    try
                    {
                        SetPostDate(post, topic.ForumBoard.BlizzArea.Region.RegionAbbreviation);
                    }
                    catch (Exception) //This will happen if time zone is not specified
                    {
                        // Deliberate best effort: the post is still stored without a parsed date.
                    }

                    entities.Posts.InsertOnSubmit(post);
                    entities.SubmitChanges();
                }
            }
        }

        lastCrawledPage = pageNo;
    }

    using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
    {
        Topic topicToSave = (from t in entities.Topics
                             where t.Id == topic.Id
                             select t).Single();

        // Only the newest blue post is needed; fetch that row alone.
        var lastBlue = (from p in entities.Posts
                        where p.TopicId == topicToSave.Id && p.PosterType == 2
                        orderby p.PostDateTimeUtc descending
                        select p).FirstOrDefault();

        // The date is missing when SetPostDate failed for the post; skip it rather than throw,
        // otherwise the topic would never be marked as crawled.
        if (lastBlue != null && lastBlue.PostDateTimeUtc.HasValue)
        {
            topicToSave.LastPostDate = lastBlue.PostDateTimeUtc.Value;
        }

        topicToSave.LastCrawledPage = lastCrawledPage;
        topicToSave.IsBeingCrawled = false;
        topicToSave.ToBeCrawled = false;
        entities.SubmitChanges();
    }
}
/// <summary>
/// Sends the pending entries of the tweet queue for <c>this.Lang</c> to Twitter using the
/// Russian message templates, and records the outcome of each tweet on its queue row.
/// </summary>
/// <remarks>
/// Only entries added within the last 120 minutes, not yet tweeted, and whose game abbreviation
/// is not "BDA" are selected. Any exception is logged to the console and ends the run; entries
/// already submitted stay marked as tweeted.
/// </remarks>
public override void PerformTwitterOperation()
{
    Console.Write("Starting... ({0})", this.Lang);

    // The data context is disposable; the using block releases it on every exit path.
    using (ForumBlogsDataContext db = new ForumBlogsDataContext(_connStr))
    {
        try
        {
            var tweetQueue = (from t in db.TweetQueues
                              where t.DateAdded > (DateTime.UtcNow.AddMinutes(-120))
                                    && !t.IsTweeted
                                    && t.BlizzArea.Language.LanguageAbbreviation == this.Lang
                                    && t.BlizzArea.Game.GameAbbreviation != "BDA"
                              select t);

            // One authenticated client is enough for the whole batch.
            TwitterService service = new TwitterService(CONSUMER_KEY, CONSUMER_SECRET);
            service.AuthenticateWith(ACCESS_TOKEN, ACCESS_SECRET);

            foreach (var tweet in tweetQueue)
            {
                string gameAbbreviation = tweet.BlizzArea.Game.GameAbbreviation;
                string gameHashtag;
                string gameNormTag = String.Empty;

                // Map the game abbreviation to its hashtag and, for some games, a bracketed short tag.
                switch (gameAbbreviation)
                {
                    case "WOW":
                        gameHashtag = "#Warcraft";
                        gameNormTag = "[#WoW]";
                        break;
                    case "D3":
                        gameHashtag = "#Diablo";
                        gameNormTag = "[#D3]";
                        break;
                    case "SC2":
                        gameHashtag = "#StarCraft";
                        gameNormTag = "[#SC2]";
                        break;
                    case "BLIZZCON":
                        gameHashtag = "#BlizzCon";
                        break;
                    case "HEARTHSTONE":
                        gameHashtag = "#Hearthstone";
                        break;
                    default:
                        gameHashtag = "#" + gameAbbreviation;
                        break;
                }

                string tweetedContent = String.Empty;

                if (tweet.IsForumThread)
                {
                    string link = "http://www.blizzposts.com/topic/" + tweet.ItemId + "#" + tweet.PostLinkNo;
                    tweetedContent = String.Format("{6} Новые синие посты в \"{0}\" {1} - автор {2} ({3}) {4}{5}",
                        HttpUtility.HtmlDecode(tweet.Title),
                        link,
                        tweet.NameOfThePoster.Trim(),
                        "", //was region - removed
                        gameHashtag,
                        "", //was bluecount - removed
                        gameNormTag);
                }
                else
                {
                    string link = "http://www.blizzposts.com/blogentry/" + tweet.ItemId;
                    tweetedContent = String.Format("{4} [блог {3}{5}] \"{0}\" - {1} {2}",
                        HttpUtility.HtmlDecode(tweet.Title),
                        link,
                        gameHashtag,
                        "", //was region - removed
                        gameNormTag,
                        this.Lang);
                }

                service.SendTweet(new SendTweetOptions { Status = tweetedContent });

                tweet.TweetResult = service.Response.StatusCode + " - " + service.Response.StatusDescription;
                tweet.TweetedContent = tweetedContent;
                tweet.IsTweeted = true;
                tweet.DateIsTweeted = DateTime.UtcNow;

                Console.WriteLine("Tweeted content: {0}", tweetedContent);
                Console.WriteLine("Tweet Result: {0}\n\n", service.Response.StatusCode);

                // Persist after every tweet so a later failure cannot cause a re-send of this one.
                db.SubmitChanges();
            }
        }
        catch (Exception e)
        {
            Console.WriteLine("An error has ocurred: \n {0} \n", e.Message);
        }
    }
}
/// <summary>
/// Reads the news feed of the given game/region/language and stores every blog entry that is
/// not yet in the database, taking the entry body from the blog page itself.
/// </summary>
/// <param name="game">Game abbreviation; used in the feed URL and in the existence check.</param>
/// <param name="region">Region abbreviation; used in the feed URL and in the existence check.</param>
/// <param name="lang">Language abbreviation; used in the feed URL and in the existence check.</param>
/// <remarks>
/// Failures while loading the feed propagate to the caller. A failure on a single entry is
/// logged and that entry is skipped; the remaining entries are still processed.
/// </remarks>
public void DoBlogCrawling(string game, string region, string lang)
{
    string feedUrl = String.Format("http://{0}.battle.net/{1}/{2}/feed/news", region.ToLower(), game.ToLower(), lang.ToLower());

    List<MinedBlogEntry> blogEntries;

    // The reader is disposable; materialise the entries before it is released.
    using (XmlReader reader = XmlReader.Create(feedUrl))
    {
        SyndicationFeed feed = SyndicationFeed.Load(reader);

        blogEntries = (from item in feed.Items
                       select new MinedBlogEntry
                       {
                           Title = item.Title.Text,
                           PublicationDate = ConvertFromDateTimeOffset(item.PublishDate.ToUniversalTime()),
                           DirectLink = item.Links[0].Uri.AbsoluteUri,
                           TimeZone = "UTC"
                       }).ToList();
    }

    foreach (var blogEntry in blogEntries)
    {
        try
        {
            using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
            {
                blogEntry.DirectLinkWithAnchor = String.Format("<a href = \"{0}\"> {1} </a>", blogEntry.DirectLink, blogEntry.Title);

                // The entry number is whatever follows "blog/" in the direct link.
                blogEntry.EntryNumber = int.Parse(blogEntry.DirectLink.Remove(0, blogEntry.DirectLink.IndexOf("blog/") + 5));

                bool alreadyStored = (from b in entities.BlogEntries
                                      where b.BlizzArea.Game.GameAbbreviation == game
                                            && b.BlizzArea.Region.RegionAbbreviation == region
                                            && b.BlizzArea.Language.LanguageAbbreviation == lang
                                            && b.EntryNumber == blogEntry.EntryNumber
                                      select b).Any();

                if (alreadyStored)
                {
                    Console.WriteLine("Found a blog entry. This blog entry already exists.");
                    //TODO: We should probably check last update stamp.
                    continue;
                }

                Console.WriteLine("Found a new blog entry.");

                string blogEntryPage = WebClient.GetRawHtml(blogEntry.DirectLink);
                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(blogEntryPage);

                //article-content -> we want the contents of this div. Reason why we aren't getting it from feed directly is because we want the banner image.
                string blogContent = String.Empty;
                string author = String.Empty;

                ////*[@id="blog"]/div[2]
                var blogContentDiv = document.DocumentNode.SelectSingleNode("//*[@id=\"blog\"]/div[2]");
                if (blogContentDiv != null)
                {
                    blogContent = blogContentDiv.InnerHtml;
                    author = document.DocumentNode.SelectSingleNode("//*[@id=\"blog\"]/div[1]/a[1]/span[2]").InnerText;
                }
                else
                {
                    // Alternative page layout: heading image and content sit one level deeper.
                    var headingImage = document.DocumentNode.SelectSingleNode("//*[@id=\"blog\"]/div/div[2]");
                    blogContentDiv = document.DocumentNode.SelectSingleNode("//*[@id=\"blog\"]/div/div[3]");
                    author = document.DocumentNode.SelectSingleNode("//*[@id=\"blog\"]/div/div[1]/div/a").InnerText;

                    if (headingImage != null)
                    {
                        blogContent += headingImage.OuterHtml;
                    }

                    if (blogContentDiv != null)
                    {
                        blogContent += blogContentDiv.InnerHtml;
                    }
                    else
                    {
                        Console.WriteLine("Cannot mine this. The div tags are not in their expected format.");
                        // Skip only this entry; the others may still be minable.
                        continue;
                    }
                }

                blogContent = blogContent.Replace("<br>", "<br />");

                // Entries older than an hour are flagged as already tweeted on the mined entry.
                TimeSpan span = DateTime.UtcNow - blogEntry.PublicationDate;
                if (span.TotalMinutes > 60)
                {
                    blogEntry.IsTweeted = true;
                }

                BlogEntry newBlogEntry = new BlogEntry();
                newBlogEntry.BlizzAreaId = this.BlogPageId;
                newBlogEntry.BlogTitle = blogEntry.Title;
                newBlogEntry.BlogDirectLink = blogEntry.DirectLinkWithAnchor;
                newBlogEntry.BlogContent = blogContent;
                newBlogEntry.BlogDate = blogEntry.PublicationDate;
                newBlogEntry.TimeZone = "UTC";
                newBlogEntry.EntryNumber = blogEntry.EntryNumber;
                newBlogEntry.BlogEntryAuthor = author;

                entities.BlogEntries.InsertOnSubmit(newBlogEntry);
                entities.SubmitChanges();
            }
        }
        catch (Exception e)
        {
            Console.WriteLine("Could not mine blog entry for following reason: \n {0}", e.Message);
        }
    }
}