/// <summary>
        /// Background-worker handler that records one scraped forum thread in the database.
        /// </summary>
        /// <remarks>
        /// The thread is looked up by thread number plus the game, language and region
        /// abbreviations (compared in lower case). When no row exists a new <c>Topic</c> is
        /// inserted and flagged for crawling. When a row exists its title is refreshed, and it
        /// is flagged for crawling if its update stamp changed and it is neither being crawled
        /// nor already flagged. <c>isNewThread</c> on the argument is set accordingly.
        /// Any exception causes the whole operation to be retried after a two second pause;
        /// once more than ten attempts have failed the last error message is written to the
        /// console and the handler gives up instead of looping forever.
        /// </remarks>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data whose <c>Argument</c> must be a <c>TrElementData</c>.</param>
        private void worker_DoWork(object sender, DoWorkEventArgs e)
        {
            // Number of failed attempts tolerated before giving up.
            const int maxFailures = 10;
            const int retryDelayMilliseconds = 2000;

            TrElementData trElementData = e.Argument as TrElementData;
            if (trElementData == null)
            {
                // A missing or wrongly typed argument can never succeed on retry.
                Console.WriteLine("worker_DoWork was started without a TrElementData argument.");
                return;
            }

            int failureCount = 0;
            while (true)
            {
                try
                {
                    using (ForumBlogsDataContext entities = new ForumBlogsDataContext(trElementData.ConnStr))
                    {
                        Console.WriteLine("Initiating database connection . . .");

                        // One query answers both "does it exist?" and "which row is it?".
                        Topic existingTopic = (from t in entities.Topics
                                               where t.ThreadNumber == trElementData.threadNumber
                                               && t.ForumBoard.BlizzArea.Game.GameAbbreviation.ToLower() == trElementData.Game.ToLower()
                                               && t.ForumBoard.BlizzArea.Language.LanguageAbbreviation.ToLower() == trElementData.Lang.ToLower()
                                               && t.ForumBoard.BlizzArea.Region.RegionAbbreviation.ToLower() == trElementData.Region.ToLower()
                                               select t).FirstOrDefault();

                        trElementData.isNewThread = (existingTopic == null);

                        if (trElementData.isNewThread)
                        {
                            Console.WriteLine("\nFound new thread to be marked for indexing. \n");

                            Topic topic = new Topic();
                            topic.ForumBoardId = trElementData.ForumBoardId;
                            topic.ThreadTitle = trElementData.threadTitle;
                            topic.AuthorOfThread = trElementData.threadAuthor;
                            topic.LastPoster = trElementData.lastPoster.Trim();
                            topic.ThreadNumber = trElementData.threadNumber.Value;
                            topic.ToBeCrawled = true; //temporary
                            topic.NoPages = trElementData.lastPage;
                            // Placeholder date three years in the past for a thread with no crawled posts yet.
                            topic.LastPostDate = DateTime.Now.AddYears(-3);
                            topic.ThreadUpdate = trElementData.lastUpdateStamp;

                            entities.Topics.InsertOnSubmit(topic);
                        }
                        else
                        {
                            if (existingTopic.ThreadTitle != trElementData.threadTitle)
                            {
                                Console.WriteLine("\nThe thread's title was changed.\n");
                                existingTopic.ThreadTitle = trElementData.threadTitle;
                            }

                            // Only re-flag a thread that is not already queued or in progress.
                            if (existingTopic.ThreadUpdate != trElementData.lastUpdateStamp
                                && !existingTopic.IsBeingCrawled
                                && !existingTopic.ToBeCrawled)
                            {
                                Console.WriteLine("\nFound known thread that was updated. \n");
                                existingTopic.ToBeCrawled = true;
                                existingTopic.NoPages = trElementData.lastPage;
                                existingTopic.LastPoster = trElementData.lastPoster.Trim();
                                existingTopic.ThreadUpdate = trElementData.lastUpdateStamp;
                            }
                        }

                        entities.SubmitChanges();
                    }

                    Console.WriteLine("Finished database operation.");
                    return;
                }
                catch (Exception er)
                {
                    failureCount++;

                    if (failureCount > maxFailures)
                    {
                        // Permanent failure: report it and stop rather than spin forever.
                        Console.WriteLine(er.Message);
                        return;
                    }

                    System.Threading.Thread.Sleep(retryDelayMilliseconds);
                }
            }
        }
Example #2
0
        /// <summary>
        /// Downloads the pages of a forum topic that still need indexing and stores the
        /// posts selected for indexing.
        /// </summary>
        /// <remarks>
        /// Page 1 is always fetched, followed by every page from <c>topic.LastCrawledPage</c>
        /// (set to 1 when it has no value) up to <c>topic.NoPages</c>. A post is indexed when
        /// its markup contains a span with id "1", or when its CSS class contains "blizzard"
        /// (poster type 2) or "mvp" (poster type 1). Posts are matched to database rows by
        /// their direct link: an existing row gets its content and avatar refreshed, an
        /// unknown post with content is inserted. Afterwards the topic row is updated with
        /// the last crawled page, both crawl flags are cleared, and <c>LastPostDate</c> is
        /// set to the newest dated poster-type-2 post when there is one.
        /// </remarks>
        /// <param name="topic">Topic to crawl; its <c>LastCrawledPage</c> is set to 1 when unset.</param>
        private void CrawlSpecifiedTopics(Topic topic)
        {
            if (!topic.LastCrawledPage.HasValue)
            {
                topic.LastCrawledPage = 1;
            }

            //We are going to make sure page 1 is always crawled.
            List<int> pageNos = new List<int>();
            pageNos.Add(1);
            for (int page = topic.LastCrawledPage.Value; page <= topic.NoPages; page++)
            {
                if (page != 1)
                {
                    pageNos.Add(page);
                }
            }

            // These do not change while crawling, so resolve them once instead of per page/post.
            string regionAbbreviation = topic.ForumBoard.BlizzArea.Region.RegionAbbreviation;
            string region = regionAbbreviation.ToLower();
            string game = topic.ForumBoard.BlizzArea.Game.GameAbbreviation.ToLower();
            string language = topic.ForumBoard.BlizzArea.Language.LanguageAbbreviation.ToLower();

            int lastCrawledPage = 0;

            foreach (int pageNo in pageNos)
            {
                string ht = WebClient.GetRawHtml(String.Format("http://{0}.battle.net/{1}/{2}/forum/topic/{3}?page={4}",
                                                   region,
                                                   game,
                                                   language,
                                                   topic.ThreadNumber,
                                                   pageNo));
                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(ht);

                // NOTE(review): GetElementbyId returns null when the page has no "post-list"
                // element (error page, layout change), which makes this line throw. Decide
                // whether such a page should abort the crawl or be skipped.
                var divTags = document.GetElementbyId("post-list").ChildNodes.Where(x => x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("topic-post"));

                foreach (var div in divTags)
                {
                    if (!div.HasChildNodes)
                    {
                        continue;
                    }

                    short posterType = 0;
                    bool indexThisPost = div.InnerHtml.Contains("<span id=\"1\">"); //incredibly fragile

                    string cssClass = div.Attributes["class"].Value;
                    if (cssClass.Contains("blizzard"))
                    {
                        indexThisPost = true;
                        posterType = 2;
                    }
                    else if (cssClass.Contains("mvp"))
                    {
                        indexThisPost = true;
                        posterType = 1;
                    }

                    if (!indexThisPost)
                    {
                        continue;
                    }

                    // ---- Post content ----
                    //*[@id="post-64426546361"]/div[1]/table/tr/td[2]/div
                    string postContent = null;
                    var postContentTag = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[2]/div", div.Id));
                    if (postContentTag != null)
                    {
                        postContent = String.Empty + postContentTag.InnerHtml;
                    }

                    // ---- Avatar ----
                    string avatarLink = null;
                    var avatarImgTag = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[1]/div/div[1]/div/div/a/img", div.Id));
                    if (avatarImgTag != null)
                    {
                        avatarLink = avatarImgTag.OuterHtml;

                        // Cut the value of the src attribute out of the raw <img> markup:
                        // drop everything up to the opening quote, then everything from the closing quote.
                        string src = avatarLink.Remove(0, avatarLink.IndexOf("src") + 1);
                        src = src.Remove(0, src.IndexOf("\"") + 1);
                        src = src.Remove(src.IndexOf("\""));

                        // Anything not recognised as an already-absolute avatar URL is prefixed with the regional host.
                        if (!src.Contains("battle.net") && !src.Contains("cms/user_avatar") && !src.Contains("media.blizzard")) //TODO: Second condition here might conflict in other regions. Check for this.
                        {
                            src = String.Format("http://{0}.battle.net/", region) + src;
                        }

                        // The literal {0}/{1} placeholders for width and height are kept in the stored markup on purpose.
                        avatarLink = String.Format("<img alt=\"avatar\" src = \"{0}\" width={1} height={2}", src, "{0}", "{1} />");
                    }

                    // ---- Direct post link ----
                    string linkDirectPost = null;
                    var indexATag = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[3]/div/a", div.Id));
                    if (indexATag == null) //deleted MVP post
                    {
                        indexATag = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div[1]/table/tr/td[3]/div/a", div.Id));
                    }

                    if (indexATag != null)
                    {
                        linkDirectPost = String.Format("http://{0}.battle.net/{1}/{2}/forum/topic/{3}?page={4}{5}",
                                                       region,
                                                       game,
                                                       language,
                                                       topic.ThreadNumber,
                                                       pageNo,
                                                       indexATag.Attributes["href"].Value);
                    }

                    // ---- Post date ----
                    string postDate = null;
                    var postDateDiv = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[3]/div/div[1]", div.Id));
                    if (postDateDiv != null)
                    {
                        postDate = postDateDiv.Attributes["data-tooltip"].Value.Replace("&nbsp;", " ");
                    }

                    // ---- Poster name ----
                    string posterName = null;
                    var posterNameSpan = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[1]/div/div[2]/div[1]/a/span", div.Id));

                    //*[@id="post-66887172203"]/div[1]/table/tr/td[1]/div/div/div/a/span
                    if (posterNameSpan == null)
                    {
                        posterNameSpan = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div[1]/table/tr/td[1]/div/div/div/a/span", div.Id));
                    }

                    if (posterNameSpan != null)
                    {
                        posterName = posterNameSpan.InnerText;
                    }

                    // ---- Poster special title (only read for poster type 2) ----
                    string posterSpecialTitle = null;
                    if (posterType == 2)
                    {
                        var posterSpecialTitleDiv = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[1]/div/div[2]/div[2]", div.Id));
                        if (posterSpecialTitleDiv != null)
                        {
                            posterSpecialTitle = posterSpecialTitleDiv.InnerText;
                        }
                    }

                    // ---- Post edited marker ----
                    string postEdited = null;
                    var postEditedDiv = document.DocumentNode.SelectSingleNode(String.Format("//*[@id=\"{0}\"]/div/table/tr/td[2]/div[2]", div.Id));
                    if (postEditedDiv != null)
                    {
                        postEdited = postEditedDiv.OuterHtml;
                    }

                    //Combining post edited with post content.
                    if (!String.IsNullOrEmpty(postEdited))
                    {
                        postContent += postEdited;
                    }

                    if (postContent != null)
                    {
                        postContent = postContent.Replace("<br>", "<br />");
                    }

                    // One context and one lookup per post: update the row when it exists, insert otherwise.
                    using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
                    {
                        Post existingPost = (from p in entities.Posts
                                             where p.DirectPostLink == linkDirectPost
                                             select p).FirstOrDefault();

                        if (existingPost != null)
                        {
                            if (existingPost.PostContent != postContent || existingPost.AvatarLinkOfPost != avatarLink)
                            {
                                existingPost.PostContent = postContent;
                                existingPost.AvatarLinkOfPost = avatarLink;
                                entities.SubmitChanges();
                            }
                        }
                        else if (postContent != null)
                        {
                            Post post = new Post();
                            post.TopicId = topic.Id;
                            post.DirectPostLink = linkDirectPost;
                            post.PosterName = posterName;
                            post.PostDate = postDate;
                            post.PosterType = posterType;
                            post.PosterSpecialTitle = posterSpecialTitle;
                            post.AvatarLinkOfPost = avatarLink;
                            post.PostContent = postContent;

                            try
                            {
                                SetPostDate(post, regionAbbreviation);
                            }
                            catch (Exception) //This will happen if time zone is not specified
                            {
                                // Deliberate best effort: the post is still stored, just without
                                // whatever SetPostDate would have filled in.
                            }

                            entities.Posts.InsertOnSubmit(post);
                            entities.SubmitChanges();
                        }
                    }
                }

                lastCrawledPage = pageNo;
            }

            using (ForumBlogsDataContext entities = new ForumBlogsDataContext(this._connStr))
            {
                Topic topicToSave = (from t in entities.Topics
                                     where t.Id == topic.Id
                                     select t).Single();

                // Only the newest dated poster-type-2 post is needed. Posts whose UTC date was
                // never set are excluded so reading .Value below cannot throw.
                var lastBlueDate = (from p in entities.Posts
                                    where p.TopicId == topicToSave.Id
                                    && p.PosterType == 2
                                    && p.PostDateTimeUtc != null
                                    orderby p.PostDateTimeUtc descending
                                    select p.PostDateTimeUtc).FirstOrDefault();

                if (lastBlueDate != null)
                {
                    topicToSave.LastPostDate = lastBlueDate.Value;
                }

                topicToSave.LastCrawledPage = lastCrawledPage;
                topicToSave.IsBeingCrawled = false;
                topicToSave.ToBeCrawled = false;
                entities.SubmitChanges();
            }
        }
 // Partial method declarations for Topic; no implementing bodies are visible in this
 // part of the file.
 // NOTE(review): the names suggest LINQ to SQL designer-generated hooks for deleting,
 // updating and inserting Topic rows — confirm against the data-context class.
 partial void DeleteTopic(Topic instance);
 partial void UpdateTopic(Topic instance);
 partial void InsertTopic(Topic instance);
 /// <summary>
 /// Calls <c>SendPropertyChanging</c> for "Topics" and then clears the given
 /// topic's <c>ForumBoard</c> reference.
 /// </summary>
 /// <param name="entity">Topic whose <c>ForumBoard</c> reference is set to null.</param>
 private void detach_Topics(Topic entity)
 {
     // Notify before mutating the entity.
     this.SendPropertyChanging("Topics");
     entity.ForumBoard = null;
 }