/// <summary>
/// Fetches comments and/or likes from Facebook for a selection of posts and
/// synchronises the resulting entities, comments and post-likes with the database.
/// </summary>
/// <remarks>
/// The selected posts are split with <c>Chunk(50)</c>. For every chunk the posts are
/// fetched in parallel, the authors of the fetched comments/likes are pushed through
/// <paramref name="entityUpdater"/>, comments and likes are synchronised, and finally
/// every successfully fetched post that is not yet flagged is marked as scraped.
/// A post whose fetch throws is logged and skipped; it is not added to the list of
/// scraped posts. Despite the wording of the log message, the run is not aborted.
/// </remarks>
/// <param name="fb">Facebook client; <c>GetComments</c> and <c>GetLikes</c> are called with each post's <c>fb_id</c>.</param>
/// <param name="lookBackDays">Size of the look-back window in days; only used when <paramref name="dailyScraping"/> is true.</param>
/// <param name="maxUnscrapedPosts">Maximum number of not-yet-scraped posts to include, ordered by date ascending.</param>
/// <param name="dailyScraping">When true, posts dated inside the look-back window and posts that received comments inside it are included as well.</param>
/// <param name="scrapeLikes">When true, likes are fetched and synchronised to <c>dbo.[PostLikes]</c>.</param>
/// <param name="scrapeComments">When true, comments are fetched and synchronised to <c>dbo.[Comments]</c>.</param>
/// <param name="entityUpdater">Builder that receives the authors of comments/likes and is synchronised to <c>dbo.Entities</c> once per chunk.</param>
private static void ScrapePosts(Facebook fb, int lookBackDays, int maxUnscrapedPosts, bool dailyScraping, bool scrapeLikes, bool scrapeComments, UpdateOrInsertBuilder<Entity> entityUpdater)
{
    log.Info("Scraping posts");
    if (scrapeLikes)
    {
        log.Info("Scraping for likes");
    }
    if (scrapeComments)
    {
        log.Info("Scraping for comments");
    }

    using (var db = new FacebookDebatEntities())
    {
        // NOTE(review): 0 presumably disables the command timeout for this context - confirm.
        db.Database.CommandTimeout = 0;

        #region Build list of posts to be scraped for comments
        log.Info("Building post-list");
        var dayLimit = DateTime.Now.AddDays(-lookBackDays);

        // Oldest unscraped posts first, capped at maxUnscrapedPosts.
        var posts = db.Posts.Where(x => !x.scraped).OrderBy(x => x.date).Take(maxUnscrapedPosts).ToList();

        // Daily scraping additionally revisits posts created, or commented on,
        // within the last lookBackDays days.
        if (dailyScraping)
        {
            posts = posts.Union(db.Posts.Where(x => x.date > dayLimit)).ToList();
            var latestedCommentedPosts = db.Comments.Where(x => x.date > dayLimit).GroupBy(x => x.post_id).Select(x => x.Key);
            posts = posts.Union(db.Posts.Where(x => latestedCommentedPosts.Contains(x.id))).ToList();
        }
        #endregion

        log.Info("Found " + posts.Count + " posts");

        foreach (var postChunk in posts.Chunk(50))
        {
            // Materialise once: the chunk is counted, iterated in parallel and projected
            // below, and a lazily evaluated chunk would otherwise be re-enumerated each time.
            var chunkPosts = postChunk.ToList();
            var scrapedPosts = new List<Post>();

            #region Scrape Comments from FB
            List<Facebook.Comment> postComments = new List<Facebook.Comment>();
            List<Facebook.PostLike> postLikes = new List<Facebook.PostLike>();
            // Single gate protecting postComments, postLikes and scrapedPosts together.
            var collectLock = new object();

            log.Info("Getting from " + chunkPosts.Count + " post");
            Parallel.ForEach(chunkPosts, post =>
            {
                try
                {
                    // Comments and likes for one post are requested concurrently.
                    var comments = Task.Run(() => scrapeComments ? fb.GetComments(post.fb_id) : new List<Facebook.Comment>());
                    var likes = Task.Run(() => scrapeLikes ? fb.GetLikes(post.fb_id) : new List<Facebook.PostLike>());

                    // Wait for both so that a failure in one request does not leave the
                    // other task running with an exception nobody observes.
                    Task.WaitAll(comments, likes);

                    lock (collectLock)
                    {
                        postComments.AddRange(comments.Result);
                        postLikes.AddRange(likes.Result);
                        scrapedPosts.Add(post);
                    }
                }
                catch (Exception e)
                {
                    // Only this post is skipped; the remaining posts of the chunk are still processed.
                    log.Warn("Being ignored by Facebook. Aborting.");

                    // Task failures arrive wrapped in an AggregateException whose own message
                    // is generic, so log the underlying causes instead.
                    var aggregate = e as AggregateException;
                    if (aggregate == null)
                    {
                        log.Warn(e.Message);
                    }
                    else
                    {
                        foreach (var inner in aggregate.Flatten().InnerExceptions)
                        {
                            log.Warn(inner.Message);
                        }
                    }
                }
            });
            #endregion

            #region Update Entities
            foreach (var comment in postComments)
            {
                entityUpdater.Process(comment.user_id, () => new Entity { fb_id = comment.user_id, name = comment.user_name });
            }
            foreach (var like in postLikes)
            {
                entityUpdater.Process(like.user_id, () => new Entity { fb_id = like.user_id, name = like.user_name });
            }
            entityUpdater.SyncDatabase(2000, "dbo.Entities", "id", x => new { id = (int?)null, fb_id = x.fb_id, name = x.name });
            #endregion

            // NOTE(review): this fb_id -> id map is reloaded for every chunk; it looks
            // loop-invariant and could probably be hoisted - confirm before changing.
            log.Info("Intializing post-cache");
            var postTranslator = DatabaseTools.ExecuteDictionaryReader("select fb_id, id from dbo.Posts", (x) => (string)x["fb_id"], x => (int)x["id"]);

            log.Info("Intializing entity-cache");
            var entityTranslator = entityUpdater.GetLookup(x => x.id);

            var activePostIds = chunkPosts.Select(x => x.id).ToList();

            #region Update comments
            if (scrapeComments)
            {
                var commentUpdater = new UpdateOrInsertBuilder<Comment>(
                    AlreadyExists: db.Comments.Where(x => activePostIds.Contains(x.post_id)),
                    GetKey: x => x.fb_id + "_" + x.post_id,
                    Updater: (dbItem, memItem) =>
                    {
                        // Only the message text is refreshed on an existing comment.
                        if (dbItem.message != memItem.message)
                        {
                            dbItem.message = memItem.message;
                            return true;
                        }
                        return false;
                    }
                );

                foreach (var comment in postComments)
                {
                    commentUpdater.Process(comment.id + "_" + comment.post_id, () => new Comment()
                    {
                        fb_id = comment.id,
                        message = comment.message,
                        post_id = postTranslator[comment.post_id],
                        entity_id = entityTranslator[comment.user_id],
                        date = comment.date,
                        score = Common.Classifier.Classify(comment.message)
                    });
                }

                commentUpdater.SyncDatabase(2000, "dbo.[Comments]", "id", x => new
                {
                    id = (int?)null,
                    fb_id = x.fb_id,
                    post_id = x.post_id,
                    entity_id = x.entity_id,
                    date = x.date,
                    score = x.score,
                    scored = 1,
                    message = x.message,
                    splitted = 0,
                });
            }
            #endregion

            #region Update likes
            if (scrapeLikes)
            {
                log.Info("Creating PostLike-UpdateOrInsertBuilder");
                var likeUpdater = new UpdateOrInsertBuilder<PostLike>(
                    AlreadyExists: db.PostLikes.Where(x => activePostIds.Contains(x.post_id)),
                    GetKey: x => x.post_id + "_" + x.entity_id,
                    // A like carries no mutable data, so an existing row never needs updating.
                    Updater: (dbItem, memItem) => false);

                log.Info("Processing PostLikes");
                foreach (var like in postLikes)
                {
                    // Resolve each database id once and reuse it for the key and the row.
                    var likedPostId = postTranslator[like.post_id];
                    var likingEntityId = entityTranslator[like.user_id];

                    likeUpdater.Process(likedPostId + "_" + likingEntityId, () => new PostLike()
                    {
                        post_id = likedPostId,
                        entity_id = likingEntityId,
                    });
                }

                likeUpdater.SyncDatabase(10000, "dbo.[PostLikes]", "id", x => new
                {
                    id = (int?)null,
                    post_id = x.post_id,
                    entity_id = x.entity_id,
                });
            }
            #endregion

            log.Info("Marking scraped");
            Parallel.ForEach(scrapedPosts, new ParallelOptions() { MaxDegreeOfParallelism = 10 }, scrapedPost =>
            {
                // Posts that were already flagged (daily re-scrapes) need no update.
                if (!scrapedPost.scraped)
                {
                    DatabaseTools.ExecuteNonQuery("UPDATE dbo.[Posts] SET scraped = 1 WHERE id = @id", new SqlParameter("id", scrapedPost.id));
                }
            });
        }
    }
}