Example #1
0
        /// <summary>
        /// Crawls every topic of the given game that is currently flagged with <c>ToBeCrawled</c>.
        /// All selected topics are first marked as <c>IsBeingCrawled</c>, then crawled one at a time.
        /// </summary>
        /// <param name="game">Game abbreviation matched against the topic's board area.</param>
        public void PerformCrawl(string game)
        {
            using (ForumBlogsDataContext db = new ForumBlogsDataContext(_connStr))
            {
                // Materialise the pending set once so the same list is flagged and then crawled.
                var pending = db.Topics
                                .Where(t => t.ForumBoard.BlizzArea.Game.GameAbbreviation == game
                                            && t.ToBeCrawled)
                                .ToList();

                // Flag everything up front and persist the flags before any crawling starts.
                pending.ForEach(item => item.IsBeingCrawled = true);
                db.SubmitChanges();

                foreach (var current in pending)
                {
                    try
                    {
                        CrawlSpecifiedTopics(current);
                        db.SubmitChanges();
                    }
                    catch (Exception ex)
                    {
                        // A failing topic is reported and skipped; the remaining topics are still processed.
                        Console.WriteLine(ex.Message);
                    }
                }

                db.SubmitChanges();
            }
        }
        /// <summary>
        /// Sends a tweet for every queued Hearthstone entry of this instance's language that was
        /// added within the last 35 minutes, has not been tweeted yet, and whose post link number
        /// is either 1 or even. Each processed queue row is stamped with the Twitter response and
        /// marked as tweeted.
        /// </summary>
        /// <remarks>
        /// Any exception aborts the remaining queue for this run and is reported on the console;
        /// rows already submitted stay marked as tweeted.
        /// </remarks>
        public override void PerformTwitterOperation()
        {
            Console.Write("Starting... ({0}) \n", "Hearthstone");

            // The data context is disposable; scope it so the connection is released
            // even when tweeting fails part-way through the queue.
            using (ForumBlogsDataContext db = new ForumBlogsDataContext(_connStr))
            {
                try
                {
                    // Short-circuit && throughout (the original mixed & and &&), and an explicit
                    // HasValue test instead of comparing the unwrapped value to null.
                    var tweetQueue = (from t in db.TweetQueues
                                      where t.DateAdded > (DateTime.UtcNow.AddMinutes(-35))
                                      && !t.IsTweeted
                                      && t.BlizzArea.Language.LanguageAbbreviation == this.Lang
                                      && t.BlizzArea.Game.GameAbbreviation == "HEARTHSTONE"
                                      && t.PostLinkNo.HasValue
                                      && (t.PostLinkNo.Value == 1 || t.PostLinkNo.Value % 2 == 0)
                                      select t);

                    // Both tags are fixed for this feed, so they are set once outside the loop.
                    string gameHashtag = "#Hearthstone";
                    string gameNormTag = "[#HS]";

                    foreach (var tweet in tweetQueue)
                    {
                        string tweetedContent = String.Empty;
                        if (tweet.IsForumThread)
                        {
                            string link = "http://www.blizzposts.com/topic/" + tweet.ItemId + "#" + tweet.PostLinkNo;
                            tweetedContent = String.Format("{6} New blue post in \"{0}\" {1} - by {2} ({3}) {4}{5}",
                                                                HttpUtility.HtmlDecode(tweet.Title),
                                                                link,
                                                                tweet.NameOfThePoster.Trim(),
                                                                tweet.BlizzArea.Region.RegionAbbreviation,
                                                                gameHashtag,
                                                                "", //was bluecount - removed
                                                                gameNormTag);
                        }
                        else
                        {
                            string link = "http://www.blizzposts.com/blogentry/" + tweet.ItemId;
                            tweetedContent = String.Format("{4} [Blog {3}-{5}] \"{0}\" - {1} {2}",
                                                     HttpUtility.HtmlDecode(tweet.Title),
                                                     link,
                                                     gameHashtag,
                                                     tweet.BlizzArea.Region.RegionAbbreviation,
                                                     gameNormTag,
                                                     this.Lang);
                        }

                        TwitterService service = new TwitterService(CONSUMER_KEY, CONSUMER_SECRET);
                        service.AuthenticateWith(ACCESS_TOKEN, ACCESS_SECRET);
                        service.SendTweet(new SendTweetOptions { Status = tweetedContent });

                        // The row is marked as tweeted whatever the response status was;
                        // the status itself is recorded in TweetResult.
                        tweet.TweetResult = service.Response.StatusCode + " - " + service.Response.StatusDescription;
                        tweet.TweetedContent = tweetedContent;
                        tweet.IsTweeted = true;
                        tweet.DateIsTweeted = DateTime.UtcNow;

                        Console.WriteLine("Tweeted content: {0}", tweetedContent);
                        Console.WriteLine("Tweet Result: {0}\n\n", service.Response.StatusCode);

                        // Persist per tweet so a later failure cannot cause an already-sent tweet to be re-sent.
                        db.SubmitChanges();
                    }
                }
                catch (Exception e)
                {
                    Console.WriteLine("An error has ocurred: \n {0} \n", e.Message);
                }
            }
        }
        /// <summary>
        /// Background-worker handler that records one scraped forum thread row in the database.
        /// An unknown thread is inserted and flagged for crawling; a known thread has its title
        /// refreshed and is re-flagged for crawling when its update stamp changed.
        /// </summary>
        /// <param name="sender">The worker raising the event (unused).</param>
        /// <param name="e">Event data; <c>e.Argument</c> must be a <c>TrElementData</c>.</param>
        /// <remarks>
        /// Database failures are retried indefinitely with a two second pause; the error message
        /// is only printed once more than ten attempts have failed. A missing or wrong-typed
        /// argument is reported once and the handler returns, because retrying cannot fix it.
        /// </remarks>
        private void worker_DoWork(object sender, DoWorkEventArgs e)
        {
            // Resolve the argument once, outside the retry loop. Previously a null result was
            // dereferenced inside the loop, which threw on every pass and never terminated.
            TrElementData trElementData = e.Argument as TrElementData;
            if (trElementData == null)
            {
                Console.WriteLine("worker_DoWork was started without a TrElementData argument; nothing to do.");
                return;
            }

            bool isSuccess = false;
            int failureCount = 0;
            while (!isSuccess)
            {
                try
                {
                    using (ForumBlogsDataContext entities = new ForumBlogsDataContext(trElementData.ConnStr))
                    {
                        Console.WriteLine("Initiating database connection . . .");

                        // Single lookup: null means the thread is new. This replaces a Count()
                        // followed by a second identical query to fetch the row.
                        Topic topic = (from t in entities.Topics
                                       where t.ThreadNumber == trElementData.threadNumber
                                       && t.ForumBoard.BlizzArea.Game.GameAbbreviation.ToLower() == trElementData.Game.ToLower()
                                       && t.ForumBoard.BlizzArea.Language.LanguageAbbreviation.ToLower() == trElementData.Lang.ToLower()
                                       && t.ForumBoard.BlizzArea.Region.RegionAbbreviation.ToLower() == trElementData.Region.ToLower()
                                       select t).FirstOrDefault();

                        trElementData.isNewThread = (topic == null);

                        if (trElementData.isNewThread)
                        {
                            Console.WriteLine("\nFound new thread to be marked for indexing. \n");

                            topic = new Topic();
                            topic.ForumBoardId = trElementData.ForumBoardId;
                            topic.ThreadTitle = trElementData.threadTitle;
                            topic.AuthorOfThread = trElementData.threadAuthor;
                            topic.LastPoster = trElementData.lastPoster.Trim();
                            topic.ThreadNumber = trElementData.threadNumber.Value;
                            topic.ToBeCrawled = true; //temporary
                            topic.NoPages = trElementData.lastPage;
                            // Placeholder date three years in the past for a thread with no crawled posts yet.
                            topic.LastPostDate = DateTime.Now.AddYears(-3);
                            topic.ThreadUpdate = trElementData.lastUpdateStamp;

                            entities.Topics.InsertOnSubmit(topic);
                        }
                        else
                        {
                            if (topic.ThreadTitle != trElementData.threadTitle)
                            {
                                Console.WriteLine("\nThe thread's title was changed.\n");
                                topic.ThreadTitle = trElementData.threadTitle;
                            }

                            // Only re-flag a thread that is neither queued nor currently being crawled.
                            if (topic.ThreadUpdate != trElementData.lastUpdateStamp && !topic.IsBeingCrawled && !topic.ToBeCrawled)
                            {
                                Console.WriteLine("\nFound known thread that was updated. \n");
                                topic.ToBeCrawled = true;
                                topic.NoPages = trElementData.lastPage;
                                topic.LastPoster = trElementData.lastPoster.Trim();
                                topic.ThreadUpdate = trElementData.lastUpdateStamp;
                            }
                        }

                        entities.SubmitChanges();
                    }
                    isSuccess = true;
                    Console.WriteLine("Finished database operation.");
                }
                catch (Exception er)
                {
                    failureCount++;
                    System.Threading.Thread.Sleep(2000);

                    // Stay quiet for the first ten failures; after that report each one.
                    if (failureCount > 10)
                    {
                        Console.WriteLine(er.Message);
                    }
                }
            }
        }
Example #4
0
        /// <summary>
        /// Downloads the forum pages of <paramref name="topic"/>, stores or refreshes the posts that
        /// qualify for indexing, and finally clears the topic's crawl flags.
        /// </summary>
        /// <param name="topic">
        /// Topic to crawl. Its <c>LastCrawledPage</c> is set to 1 on this instance when it has no value.
        /// </param>
        /// <remarks>
        /// A post is indexed when its markup contains <c>&lt;span id="1"&gt;</c> or when the post's
        /// class attribute contains "blizzard" (poster type 2) or "mvp" (poster type 1).
        /// Exceptions are not handled here and propagate to the caller.
        /// </remarks>
        private void CrawlSpecifiedTopics(Topic topic)
        {
            //We are going to make sure page 1 is always crawled.
            List<int> pageNos = new List<int>();
            pageNos.Add(1);
            if (!topic.LastCrawledPage.HasValue)
            {
                topic.LastCrawledPage = 1;
            }

            // Queue every page from the last crawled one up to the last known page; page 1 is already queued.
            for (int i = topic.LastCrawledPage.Value; i <= topic.NoPages; i++)
            {
                if (i != 1)
                {
                    pageNos.Add(i);
                }
            }

            // Page number of the most recently finished page; written back to the topic at the end.
            int lastCrawledPage = 0;

            for (int i = 0; i <= pageNos.Count - 1; i++)
            {
                // Page URL is built from the lower-cased region, game and language of the topic's board.
                string ht = WebClient.GetRawHtml(String.Format("http://{0}.battle.net/{1}/{2}/forum/topic/{3}?page={4}",
                                                   topic.ForumBoard.BlizzArea.Region.RegionAbbreviation.ToLower(),
                                                   topic.ForumBoard.BlizzArea.Game.GameAbbreviation.ToLower(),
                                                   topic.ForumBoard.BlizzArea.Language.LanguageAbbreviation.ToLower(),
                                                   topic.ThreadNumber,
                                                   pageNos[i]));
                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(ht);

                // Direct children of #post-list whose class contains "topic-post".
                // NOTE(review): the result of GetElementbyId is dereferenced without a null check —
                // presumably it is null when the page has no "post-list" element; confirm.
                var divTags = document.GetElementbyId("post-list").ChildNodes.Where(x => x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("topic-post"));

                foreach (var div in divTags)
                {
                    // Fields scraped for the current post; anything not found stays null.
                    bool indexThisPost = false;
                    string postContent = null;
                    string avatarLink = null;
                    string linkDirectPost = null;
                    string postDate = null;
                    string posterName = null;
                    string posterSpecialTitle = null;
                    string postEdited = null;
                    // 0 = default, 1 = class contains "mvp", 2 = class contains "blizzard" (set below).
                    short posterType = 0;

                    if (div.HasChildNodes)
                    {

                        // Presumably this marker identifies the thread's first post — TODO confirm.
                        if (div.InnerHtml.Contains("<span id=\"1\">")) //incredibly fragile
                        {
                            indexThisPost = true;
                        }

                        if (div.Attributes["class"].Value.Contains("blizzard"))
                        {
                            indexThisPost = true;
                            posterType = 2;
                        }
                        else if (div.Attributes["class"].Value.Contains("mvp"))
                        {
                            indexThisPost = true;
                            posterType = 1;
                        }

                        if (indexThisPost)
                        {
                            #region Post Content
                            // Example XPath: //*[@id="post-64426546361"]/div[1]/table/tr/td[2]/div
                            var postContentTag = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[2]/div", div.Id));

                            if (postContentTag != null)
                            {
                                postContent = String.Empty;

                                postContent += postContentTag.InnerHtml;
                            }
                            #endregion

                            #region avatar

                            // Example XPath root: //*[@id="post-79781202523"]

                            var avatarImgTag = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[1]/div/div[1]/div/div/a/img", div.Id));

                            if (avatarImgTag != null)
                            {
                                avatarLink = avatarImgTag.OuterHtml;

                                // Manual extraction of the src value: drop everything through the 's' of "src",
                                // then everything through the next quote, then cut at the following quote.
                                string src = avatarLink.Remove(0, avatarLink.IndexOf("src") + 1);
                                src = src.Remove(0, src.IndexOf("\"") + 1);
                                src = src.Remove(src.IndexOf("\""));

                                // A src without any of these markers is prefixed with the region's battle.net host.
                                if (!src.Contains("battle.net") && !src.Contains("cms/user_avatar") && !src.Contains("media.blizzard")) //TODO: Second condition here might conflict in other regions. Check for this.
                                {
                                    src = String.Format("http://{0}.battle.net/", topic.ForumBoard.BlizzArea.Region.RegionAbbreviation.ToLower()) + src;
                                }

                                // The width/height slots are filled with the literal text "{0}" and "{1} />", so the
                                // stored markup is itself a format template ending in: width={0} height={1} />
                                // Presumably a consumer substitutes the dimensions later — TODO confirm.
                                avatarLink = String.Format("<img alt=\"avatar\" src = \"{0}\" width={1} height={2}", src, "{0}", "{1} />");
                            }

                            #endregion

                            #region Direct Post Link

                            var indexATag = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[3]/div/a", div.Id));

                            if (indexATag == null) //deleted MVP post
                            {
                                indexATag = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div[1]/table/tr/td[3]/div/a", div.Id));
                            }

                            if (indexATag != null)
                            {
                                // Page URL followed by the anchor's href value.
                                linkDirectPost = String.Format("http://{0}.battle.net/{1}/{2}/forum/topic/{3}?page={4}{5}",
                                                               topic.ForumBoard.BlizzArea.Region.RegionAbbreviation.ToLower(),
                                                               topic.ForumBoard.BlizzArea.Game.GameAbbreviation.ToLower(),
                                                               topic.ForumBoard.BlizzArea.Language.LanguageAbbreviation.ToLower(),
                                                               topic.ThreadNumber,
                                                               pageNos[i],
                                                               indexATag.Attributes["href"].Value);
                            }

                            #endregion

                            #region Post Date

                            var postDateDiv = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[3]/div/div[1]", div.Id));

                            if (postDateDiv != null)
                            {
                                // The date text is read from the data-tooltip attribute with &nbsp; entities replaced by spaces.
                                // NOTE(review): the attribute itself is not null-checked.
                                postDate = postDateDiv.Attributes["data-tooltip"].Value.Replace("&nbsp;", " ");
                            }

                            #endregion

                            #region Poster Name

                            var posterNameSpan = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[1]/div/div[2]/div[1]/a/span", div.Id));

                            // Fallback layout, e.g. //*[@id="post-66887172203"]/div[1]/table/tr/td[1]/div/div/div/a/span
                            if (posterNameSpan == null)
                            {
                                posterNameSpan = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div[1]/table/tr/td[1]/div/div/div/a/span", div.Id));
                            }

                            if (posterNameSpan != null)
                            {
                                posterName = posterNameSpan.InnerText;
                            }

                            #endregion

                            #region Poster Special Title

                            // Only looked up for poster type 2 (class contains "blizzard").
                            if (posterType == 2)
                            {
                                var posterSpecialTitleDiv = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[1]/div/div[2]/div[2]", div.Id));

                                if (posterSpecialTitleDiv != null)
                                {
                                    posterSpecialTitle = posterSpecialTitleDiv.InnerText;
                                }
                            }

                            #endregion

                            #region Post Edited

                            var postEditedDiv = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[2]/div[2]", div.Id));

                            if (postEditedDiv != null)
                            {
                                postEdited = postEditedDiv.OuterHtml;
                            }
                            #endregion

                            //Combining post edited with post content.
                            // Note: if no content node was found, postContent becomes just the edited markup here.
                            if (!String.IsNullOrEmpty(postEdited))
                            {
                                postContent += postEdited;
                            }

                            if (postContent != null)
                                postContent = postContent.Replace("<br>", "<br />");

                            //Does this post exist? Assume false for now.
                            bool postExists = false;

                            //Check if the post exists. If it does exist, make sure there weren't any updates on it.
                            // Posts are matched on their direct link.
                            // NOTE(review): linkDirectPost is still null when no anchor was found above.
                            using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
                            {
                                int postCount = (from p in entities.Posts
                                                 where p.DirectPostLink == linkDirectPost
                                                 select p).Count();

                                postExists = (postCount > 0);

                            }

                            if (postExists)
                            {
                                // Existing post: refresh content and avatar when either differs. The same query runs
                                // again in a fresh context.
                                using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
                                {
                                    Post post = (from p in entities.Posts
                                                 where p.DirectPostLink == linkDirectPost
                                                 select p).ToList()[0];

                                    if (post.PostContent != postContent || post.AvatarLinkOfPost != avatarLink)
                                    {
                                        post.PostContent = postContent;
                                        post.AvatarLinkOfPost = avatarLink;
                                    }

                                    entities.SubmitChanges();
                                }
                            }
                            else
                            {
                                // New post: only stored when some content was scraped.
                                using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
                                {
                                    if (postContent != null)
                                    {
                                        Post post = new Post();
                                        post.TopicId = topic.Id;
                                        post.DirectPostLink = linkDirectPost;
                                        post.PosterName = posterName;
                                        post.PostDate = postDate;
                                        post.PosterType = posterType;
                                        post.PosterSpecialTitle = posterSpecialTitle;
                                        post.AvatarLinkOfPost = avatarLink;
                                        post.PostContent = postContent;

                                        // Best effort: any failure in SetPostDate is swallowed and the post is saved anyway.
                                        try
                                        {
                                            SetPostDate(post, topic.ForumBoard.BlizzArea.Region.RegionAbbreviation);
                                        }
                                        catch (Exception) //This will happen if time zone is not specified
                                        {
                                        }

                                        //if (post.PosterType == 2)
                                        //{
                                        //    topic.LastPostDate = post.PostDateTimeUtc.Value;
                                        //}

                                        entities.Posts.InsertOnSubmit(post);
                                        entities.SubmitChanges();
                                    }

                                }
                            }

                        }
                    }
                }

                lastCrawledPage = pageNos[i];

            }

            // Final bookkeeping in a separate context: the topic row is re-read by id instead of
            // updating the instance that was passed in.
            using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
            {
                Topic topicToSave = (from t in entities.Topics
                                     where t.Id == topic.Id
                                     select t).Single();

                // All type-2 posts of the topic, newest PostDateTimeUtc first.
                var lastBlue = (from p in entities.Posts
                                where p.TopicId == topicToSave.Id
                                && p.PosterType == 2
                                orderby p.PostDateTimeUtc descending
                                select p).ToList();

                // NOTE(review): .Value throws if the first row has no PostDateTimeUtc — possible
                // if SetPostDate failed for every type-2 post; confirm.
                if (lastBlue.Count() > 0)
                    topicToSave.LastPostDate = lastBlue[0].PostDateTimeUtc.Value;

                topicToSave.LastCrawledPage = lastCrawledPage;
                topicToSave.IsBeingCrawled = false;
                topicToSave.ToBeCrawled = false;
                entities.SubmitChanges();
            }
        }
Example #5
0
        /// <summary>
        /// Sends a tweet (Russian wording) for every queued entry of this instance's language that
        /// was added within the last 120 minutes, has not been tweeted yet, and does not belong to
        /// the "BDA" game. Each processed queue row is stamped with the Twitter response and marked
        /// as tweeted.
        /// </summary>
        /// <remarks>
        /// Any exception aborts the remaining queue for this run and is reported on the console;
        /// rows already submitted stay marked as tweeted.
        /// </remarks>
        public override void PerformTwitterOperation()
        {
            Console.Write("Starting... ({0})", this.Lang);

            // The data context is disposable; scope it so the connection is released
            // even when tweeting fails part-way through the queue.
            using (ForumBlogsDataContext db = new ForumBlogsDataContext(_connStr))
            {
                try
                {
                    var tweetQueue = (from t in db.TweetQueues
                                      where t.DateAdded > (DateTime.UtcNow.AddMinutes(-120))
                                      && !t.IsTweeted
                                      && t.BlizzArea.Language.LanguageAbbreviation == this.Lang
                                      && t.BlizzArea.Game.GameAbbreviation != "BDA"
                                      select t);

                    foreach (var tweet in tweetQueue)
                    {
                        // Map the stored game abbreviation to the public hashtag and, for some games,
                        // a bracketed short tag. Unknown games fall back to "#" + abbreviation.
                        string gameAbbreviation = tweet.BlizzArea.Game.GameAbbreviation;
                        string gameHashtag;
                        string gameNormTag = String.Empty;

                        switch (gameAbbreviation)
                        {
                            case "WOW":
                                gameHashtag = "#Warcraft";
                                gameNormTag = "[#WoW]";
                                break;
                            case "D3":
                                gameHashtag = "#Diablo";
                                gameNormTag = "[#D3]";
                                break;
                            case "SC2":
                                gameHashtag = "#StarCraft";
                                gameNormTag = "[#SC2]";
                                break;
                            case "BLIZZCON":
                                gameHashtag = "#BlizzCon";
                                break;
                            case "HEARTHSTONE":
                                gameHashtag = "#Hearthstone";
                                break;
                            default:
                                gameHashtag = "#" + gameAbbreviation;
                                break;
                        }

                        string tweetedContent = String.Empty;
                        if (tweet.IsForumThread)
                        {
                            string link = "http://www.blizzposts.com/topic/" + tweet.ItemId + "#" + tweet.PostLinkNo;
                            tweetedContent = String.Format("{6} Новые синие посты в \"{0}\" {1} - автор {2} ({3}) {4}{5}",
                                                                HttpUtility.HtmlDecode(tweet.Title),
                                                                link,
                                                                tweet.NameOfThePoster.Trim(),
                                                                "", //was region - removed
                                                                gameHashtag,
                                                                "", //was bluecount - removed
                                                                gameNormTag);
                        }
                        else
                        {
                            string link = "http://www.blizzposts.com/blogentry/" + tweet.ItemId;
                            tweetedContent = String.Format("{4} [блог {3}{5}] \"{0}\" - {1} {2}",
                                                     HttpUtility.HtmlDecode(tweet.Title),
                                                     link,
                                                     gameHashtag,
                                                     "", //was region - removed
                                                     gameNormTag,
                                                     this.Lang);
                        }

                        TwitterService service = new TwitterService(CONSUMER_KEY, CONSUMER_SECRET);
                        service.AuthenticateWith(ACCESS_TOKEN, ACCESS_SECRET);
                        service.SendTweet(new SendTweetOptions { Status = tweetedContent });

                        // The row is marked as tweeted whatever the response status was;
                        // the status itself is recorded in TweetResult.
                        tweet.TweetResult = service.Response.StatusCode + " - " + service.Response.StatusDescription;
                        tweet.TweetedContent = tweetedContent;
                        tweet.IsTweeted = true;
                        tweet.DateIsTweeted = DateTime.UtcNow;

                        Console.WriteLine("Tweeted content: {0}", tweetedContent);
                        Console.WriteLine("Tweet Result: {0}\n\n", service.Response.StatusCode);

                        // Persist per tweet so a later failure cannot cause an already-sent tweet to be re-sent.
                        db.SubmitChanges();
                    }
                }
                catch (Exception e)
                {
                    Console.WriteLine("An error has ocurred: \n {0} \n", e.Message);
                }
            }
        }
        /// <summary>
        /// Reads the battle.net news feed for the given game/region/language and stores every blog
        /// entry that is not yet in the database, including the article markup scraped from the
        /// entry's own page.
        /// </summary>
        /// <param name="game">Game abbreviation; lower-cased for the feed URL.</param>
        /// <param name="region">Region abbreviation; lower-cased and used as the feed host prefix.</param>
        /// <param name="lang">Language abbreviation; lower-cased for the feed URL.</param>
        /// <remarks>
        /// Failures while loading the feed propagate to the caller. Failures on an individual
        /// entry are reported on the console and do not stop the remaining entries.
        /// </remarks>
        public void DoBlogCrawling(string game, string region, string lang)
        {
            List<MinedBlogEntry> blogEntries;

            // XmlReader is disposable; previously it was never closed.
            using (XmlReader reader = XmlReader.Create(String.Format("http://{0}.battle.net/{1}/{2}/feed/news", region.ToLower(), game.ToLower(), lang.ToLower())))
            {
                SyndicationFeed feed = SyndicationFeed.Load(reader);
                blogEntries = (from item in feed.Items
                               select new MinedBlogEntry
                               {
                                   Title = item.Title.Text,
                                   PublicationDate = ConvertFromDateTimeOffset(item.PublishDate.ToUniversalTime()),
                                   DirectLink = item.Links[0].Uri.AbsoluteUri,
                                   TimeZone = "UTC"
                               }).ToList();
            }

            foreach (var blogEntry in blogEntries)
            {
                try
                {
                    using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
                    {

                        blogEntry.DirectLinkWithAnchor = String.Format("<a href = \"{0}\"> {1} </a>", blogEntry.DirectLink, blogEntry.Title);
                        // The entry number is whatever follows "blog/" in the link; a non-numeric tail
                        // throws and is reported by the catch below.
                        blogEntry.EntryNumber = int.Parse(blogEntry.DirectLink.Remove(0, blogEntry.DirectLink.IndexOf("blog/") + 5));

                        bool alreadyStored = (from b in entities.BlogEntries
                                              where b.BlizzArea.Game.GameAbbreviation == game
                                              && b.BlizzArea.Region.RegionAbbreviation == region
                                              && b.BlizzArea.Language.LanguageAbbreviation == lang
                                              && b.EntryNumber == blogEntry.EntryNumber
                                              select b).Any();

                        if (!alreadyStored)
                        {
                            Console.WriteLine("Found a new blog entry.");

                            string blogEntryPage = WebClient.GetRawHtml(blogEntry.DirectLink);

                            HtmlDocument document = new HtmlDocument();
                            document.LoadHtml(blogEntryPage);

                            //article-content -> we want the contents of this div. Reason why we aren't getting it from feed directly is because we want the banner image.

                            string blogContent = String.Empty;
                            string author = String.Empty;

                            // First layout: //*[@id="blog"]/div[2] holds the article body.
                            var blogContentDiv = document.DocumentNode.SelectSingleNode("//*[@id=\"blog\"]/div[2]");

                            if (blogContentDiv != null)
                            {
                                blogContent = blogContentDiv.InnerHtml;
                                author = document.DocumentNode.SelectSingleNode("//*[@id=\"blog\"]/div[1]/a[1]/span[2]").InnerText;
                            }
                            else
                            {
                                // Second layout: optional heading image followed by the article body.
                                var headingImage = document.DocumentNode.SelectSingleNode("//*[@id=\"blog\"]/div/div[2]");
                                blogContentDiv = document.DocumentNode.SelectSingleNode("//*[@id=\"blog\"]/div/div[3]");
                                author = document.DocumentNode.SelectSingleNode("//*[@id=\"blog\"]/div/div[1]/div/a").InnerText;

                                if (headingImage != null)
                                {
                                    blogContent += headingImage.OuterHtml;
                                }
                                if (blogContentDiv != null)
                                {
                                    blogContent += blogContentDiv.InnerHtml;
                                }
                                else
                                {
                                    Console.WriteLine("Cannot mine this. The div tags are not in their expected format.");
                                    // Skip only this entry. The previous 'break' abandoned every remaining
                                    // feed entry, unlike all other per-entry failures handled below.
                                    continue;
                                }

                            }

                            blogContent = blogContent.Replace("<br>", "<br />");

                            // Entries published more than an hour ago are flagged as tweeted on the mined entry.
                            // NOTE(review): this flag is not copied onto the BlogEntry saved below — confirm
                            // whether that is intended.
                            TimeSpan span = DateTime.UtcNow - blogEntry.PublicationDate;
                            if (span.TotalMinutes > 60)
                            {
                                blogEntry.IsTweeted = true;
                            }

                            BlogEntry newBlogEntry = new BlogEntry();
                            newBlogEntry.BlizzAreaId = this.BlogPageId;
                            newBlogEntry.BlogTitle = blogEntry.Title;
                            newBlogEntry.BlogDirectLink = blogEntry.DirectLinkWithAnchor;
                            newBlogEntry.BlogContent = blogContent;
                            newBlogEntry.BlogDate = blogEntry.PublicationDate;
                            newBlogEntry.TimeZone = "UTC";
                            newBlogEntry.EntryNumber = blogEntry.EntryNumber;
                            newBlogEntry.BlogEntryAuthor = author;

                            entities.BlogEntries.InsertOnSubmit(newBlogEntry);
                            entities.SubmitChanges();
                        }
                        else
                        {
                            Console.WriteLine("Found a blog entry. This blog entry already exists.");
                            //TODO: We should probably check last update stamp.
                        }

                    }
                }
                catch (Exception e)
                {
                    Console.WriteLine("Could not mine blog entry for following reason: \n {0}", e.Message);
                }

            }
        }