Пример #1
0
        /// <summary>
        /// Runs one scraping pass: for every active account, collects the posts reported as new
        /// by <c>GetNewPosts</c>, attaches them to the account and saves; finally stores the
        /// session statistics (start/finish time, number of posts added) in <c>db.Stats</c>.
        /// </summary>
        public void Scrape()
        {
            using (var context = new NishkriyaContext())
            {
                var session = new ScraperSession();
                session.Start = DateTime.Now;

                var activeAccounts = context.Accounts.Where(a => a.Active).ToList();

                foreach (var account in activeAccounts)
                {
                    // The thread list is re-read for every account and changes are saved
                    // before moving on; per the original note this is what keeps threads
                    // from being duplicated.
                    var knownThreads = context.Threads.ToList();
                    var newPosts = GetNewPosts(account, knownThreads, session).ToList();

                    session.PostsAdded += newPosts.Count;
                    account.Posts.AddRange(newPosts);

                    context.SaveChanges();
                }

                session.Finish = DateTime.Now;

                context.Stats.Add(session);
                context.SaveChanges();
            }
        }
Пример #2
0
        /// <summary>
        /// Downloads the profile page of <paramref name="account"/>, reads the rows of its
        /// "last 10 posts" tab and returns the posts whose hash is not already present in
        /// <c>account.Posts</c>.
        /// </summary>
        /// <param name="account">
        /// Account to scrape; its <c>ForumId</c> is formatted into <c>Settings.Default.ProfileUrl</c>.
        /// </param>
        /// <param name="threads">
        /// Threads known so far. A thread referenced by a post but missing from this list is
        /// created and appended to it.
        /// </param>
        /// <param name="session">
        /// Running session statistics; <c>ThreadsAdded</c>, <c>HadErrors</c> and <c>Errors</c>
        /// are updated here.
        /// </param>
        /// <returns>
        /// The new posts, or an empty array when anything fails. Failures are never propagated:
        /// they are recorded on <paramref name="session"/> and written to the debug output.
        /// </returns>
        private IEnumerable<Post> GetNewPosts(ForumAccount account, List<Thread> threads, ScraperSession session)
        {
            try
            {
                var url = string.Format(Settings.Default.ProfileUrl, account.ForumId);

                var req = (HttpWebRequest)WebRequest.Create(url);
                req.Method = "GET";
                req.ContentType = "application/x-www-form-urlencoded";
                req.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2";
                req.CookieContainer = new CookieContainer();
                req.CookieContainer.Add(new Cookie(".YAFNET_Authentication", Settings.Default.AuthToken, "/", "forums.white-wolf.com"));

                var document = new HtmlDocument();

                // Dispose the response as well as its stream so the connection is released
                // even when parsing throws.
                using (var response = req.GetResponse())
                using (var responseStream = response.GetResponseStream())
                {
                    if (responseStream == null)
                    {
                        throw new NoNullAllowedException();
                    }

                    // Parse straight from the response. The previous copy through an unflushed
                    // StreamWriter/MemoryStream could drop the buffered tail of the page.
                    document.Load(responseStream, new UTF8Encoding());
                }

                document = CleanHtml(document);

                var postsCollection = new List<Post>();

                const string tabSelector = "id('MasterPageContentPlaceHolder_forum_ctl01_ProfileTabs_Last10PostsTab')";
                const string placeholderFragment = "id('MasterPageContentPlaceHolder_forum_ctl01_ProfileTabs_Last10PostsTab')//table//tr[";
                const string anchorSelectorFragment = "]//td/a/@href";
                const string titleSelectorFragment = "]//td/a/text()";
                const string dateSelectorFragment = "]//td/text()[4]";

                // A page without the posts tab is not a profile page we can read; fail loudly
                // so the problem is recorded on the session instead of looking like "no posts".
                if (document.DocumentNode.SelectSingleNode(tabSelector) == null)
                {
                    throw new InvalidOperationException("Last posts tab not found in profile page: " + url);
                }

                foreach (int i in Enumerable.Range(0, 10))
                {
                    // Post headers sit on every other table row, starting at row 1.
                    var tableRow = 1 + (2 * i);

                    var threadIdSelector = String.Format("{0}{1}{2}", placeholderFragment, tableRow, anchorSelectorFragment);
                    var titleSelector = String.Format("{0}{1}{2}", placeholderFragment, tableRow, titleSelectorFragment);
                    var dateSelector = String.Format("{0}{1}{2}", placeholderFragment, tableRow, dateSelectorFragment);

                    var threadAnchor = document.DocumentNode.SelectSingleNode(threadIdSelector);
                    if (threadAnchor == null)
                    {
                        // No such row: treated as the end of the list (presumably an account
                        // with fewer than ten posts). Posts parsed so far are kept.
                        break;
                    }

                    var threadHref = threadAnchor.Attributes[0].Value;
                    // The thread id is the run of digits at the very end of the link.
                    var threadId = int.Parse(Regex.Match(threadHref, @"(\d+)$").Groups[0].Value);

                    var threadTitle = document.DocumentNode.SelectSingleNode(titleSelector).InnerHtml.Trim();

                    var postDate = DateTime.Parse(document.DocumentNode.SelectSingleNode(dateSelector).InnerHtml.Trim());

                    var postContent =
                        document.DocumentNode.SelectSingleNode(
                            "id('MasterPageContentPlaceHolder_forum_ctl01_ProfileTabs_Last10PostsTab_LastPosts_MessagePost_" + i + "')").InnerHtml;

                    Thread thread = threads.SingleOrDefault(s => s.ThreadId == threadId);
                    if (thread == null)
                    {
                        thread = new Thread { ThreadId = threadId, Title = threadTitle };
                        threads.Add(thread);
                        session.ThreadsAdded++;
                    }

                    postsCollection.Add(new Post
                    {
                        Content = postContent,
                        Hash = _hashProvider.Compute(postContent),
                        PostDate = postDate,
                        Thread = thread
                    });
                }

                // Read the stored hashes once, and materialize the result while still inside
                // the try block so a failure while reading account.Posts is recorded as well.
                var knownHashes = account.Posts.Select(p => p.Hash).ToList();
                return postsCollection.Where(newPost => !knownHashes.Contains(newPost.Hash)).ToList();
            }
            catch (Exception ex)
            {
                session.HadErrors = true;
                session.Errors.Add(new Error
                    {
                        Message = ex.Message,
                        Source = ex.Source,
                        StackTrace = ex.StackTrace,
                        // TargetSite can be null; guard so the handler itself cannot throw.
                        TargetSite = ex.TargetSite == null ? null : ex.TargetSite.Name
                    });

                Debug.WriteLine(ex.Message + "\n----\n" + ex.StackTrace);
                return new Post[0];
            }
        }