private static void ScrapePosts(Facebook fb, int lookBackDays, int maxUnscrapedPosts, bool dailyScraping, bool scrapeLikes, bool scrapeComments, UpdateOrInsertBuilder<Entity> entityUpdater)
{
    log.Info("Scraping posts");
    if (scrapeLikes)
    {
        log.Info("Scraping for likes");
    }
    if (scrapeComments)
    {
        log.Info("Scraping for comments");
    }

    using (var db = new FacebookDebatEntities())
    {
        db.Database.CommandTimeout = 0;

        #region Build list of posts to be scraped for comments
        log.Info("Building post-list");
        var dayLimit = DateTime.Now.AddDays(-lookBackDays);

        // Get unscraped posts
        var posts = db.Posts.Where(x => !x.scraped).OrderBy(x => x.date).Take(maxUnscrapedPosts).ToList();

        // When running daily, also rescrape posts inside the look-back window
        // and posts that have received comments inside that window.
        if (dailyScraping)
        {
            posts = posts.Union(db.Posts.Where(x => x.date > dayLimit)).ToList();
            var recentlyCommentedPosts = db.Comments.Where(x => x.date > dayLimit).GroupBy(x => x.post_id).Select(x => x.Key);
            posts = posts.Union(db.Posts.Where(x => recentlyCommentedPosts.Contains(x.id))).ToList();
        }
        #endregion

        log.Info("Found " + posts.Count + " posts");

        foreach (var postChunk in posts.Chunk(50))
        {
            var scrapedPosts = new List<Post>();

            #region Scrape comments and likes from Facebook
            var postComments = new List<Facebook.Comment>();
            var postLikes = new List<Facebook.PostLike>();
            log.Info("Fetching data for " + postChunk.Count() + " posts");

            var fbFailed = false;
            Parallel.ForEach(postChunk, post =>
            {
                if (fbFailed)
                {
                    return;
                }
                try
                {
                    var comments = Task.Run(() => scrapeComments ? fb.GetComments(post.fb_id) : new List<Facebook.Comment>());
                    var likes = Task.Run(() => scrapeLikes ? fb.GetLikes(post.fb_id) : new List<Facebook.PostLike>());
                    comments.Wait();
                    likes.Wait();
                    // A single lock object guards all three shared lists.
                    lock (postComments)
                    {
                        postComments.AddRange(comments.Result);
                        postLikes.AddRange(likes.Result);
                        scrapedPosts.Add(post);
                    }
                }
                catch (Exception e)
                {
                    //fbFailed = true; // abort disabled: a failed post is simply skipped
                    log.Warn("Being ignored by Facebook. Skipping post.");
                    log.Warn(e.Message);
                }
            });
            #endregion

            #region Update entities
            foreach (var comment in postComments)
            {
                entityUpdater.Process(comment.user_id, () => new Entity { fb_id = comment.user_id, name = comment.user_name });
            }
            foreach (var like in postLikes)
            {
                entityUpdater.Process(like.user_id, () => new Entity { fb_id = like.user_id, name = like.user_name });
            }
            entityUpdater.SyncDatabase(2000, "dbo.Entities", "id", x => new { id = (int?)null, fb_id = x.fb_id, name = x.name });
            #endregion

            log.Info("Initializing post-cache");
            var postTranslator = DatabaseTools.ExecuteDictionaryReader("select fb_id, id from dbo.Posts", x => (string)x["fb_id"], x => (int)x["id"]);

            log.Info("Initializing entity-cache");
            var entityTranslator = entityUpdater.GetLookup(x => x.id);

            var activePostIds = postChunk.Select(x => x.id).ToList();

            #region Update comments
            if (scrapeComments)
            {
                var commentUpdater = new UpdateOrInsertBuilder<Comment>(
                    AlreadyExists: db.Comments.Where(x => activePostIds.Contains(x.post_id)),
                    GetKey: x => x.fb_id + "_" + x.post_id,
                    Updater: (dbItem, memItem) =>
                    {
                        if (dbItem.message != memItem.message)
                        {
                            dbItem.message = memItem.message;
                            return true;
                        }
                        return false;
                    });

                foreach (var comment in postComments)
                {
                    // The key must match GetKey above: fb comment id + internal post id.
                    commentUpdater.Process(comment.id + "_" + postTranslator[comment.post_id], () => new Comment
                    {
                        fb_id = comment.id,
                        message = comment.message,
                        post_id = postTranslator[comment.post_id],
                        entity_id = entityTranslator[comment.user_id],
                        date = comment.date,
                        score = Common.Classifier.Classify(comment.message)
                    });
                }

                commentUpdater.SyncDatabase(2000, "dbo.[Comments]", "id", x => new
                {
                    id = (int?)null,
                    fb_id = x.fb_id,
                    post_id = x.post_id,
                    entity_id = x.entity_id,
                    date = x.date,
                    score = x.score,
                    scored = 1,
                    message = x.message,
                    splitted = 0,
                });
            }
            #endregion

            #region Update likes
            if (scrapeLikes)
            {
                log.Info("Creating PostLike-UpdateOrInsertBuilder");
                var likeUpdater = new UpdateOrInsertBuilder<PostLike>(
                    AlreadyExists: db.PostLikes.Where(x => activePostIds.Contains(x.post_id)),
                    GetKey: x => x.post_id + "_" + x.entity_id,
                    Updater: (dbItem, memItem) => false);

                log.Info("Processing PostLikes");
                foreach (var like in postLikes)
                {
                    likeUpdater.Process(postTranslator[like.post_id] + "_" + entityTranslator[like.user_id], () => new PostLike
                    {
                        post_id = postTranslator[like.post_id],
                        entity_id = entityTranslator[like.user_id],
                    });
                }

                likeUpdater.SyncDatabase(10000, "dbo.[PostLikes]", "id", x => new
                {
                    id = (int?)null,
                    post_id = x.post_id,
                    entity_id = x.entity_id,
                });
            }
            #endregion

            log.Info("Marking scraped");
            Parallel.ForEach(scrapedPosts, new ParallelOptions { MaxDegreeOfParallelism = 10 }, scrapedPost =>
            {
                if (!scrapedPost.scraped)
                {
                    DatabaseTools.ExecuteNonQuery("UPDATE dbo.[Posts] SET scraped = 1 WHERE id = @id", new SqlParameter("id", scrapedPost.id));
                }
            });
        }
    }
}
public static void SplitWords()
{
    using (var db = new FacebookDebatEntities())
    {
        log.Info("Finding un-split comments");
        var comments = db.Comments.Where(x => !x.splitted).Take(10000).ToList();
        var commentIds = string.Join(",", comments.Select(x => x.id.ToString()).ToArray());

        var deleteWordsTask = Task.Factory.StartNew(() =>
        {
            if (commentIds.Length != 0)
            {
                DatabaseTools.ExecuteNonQuery(string.Format("DELETE FROM CommentWords WHERE comment_id IN ({0})", commentIds));
            }
            log.Info("Finished deleting words");
        });
        var deleteLinksTask = Task.Factory.StartNew(() =>
        {
            if (commentIds.Length != 0)
            {
                DatabaseTools.ExecuteNonQuery(string.Format("DELETE FROM CommentLinks WHERE comment_id IN ({0})", commentIds));
            }
            log.Info("Finished deleting links");
        });

        var commentWords = new List<Tuple<int, string>>();
        var commentLinks = new List<Tuple<int, string>>();

        log.Info("Building word cache");
        var wordCache = new UpdateOrInsertBuilder<Word>(db.Words, x => x.word1, (x, y) => false);
        log.Info("Building link cache");
        var linkCache = new UpdateOrInsertBuilder<Link>(db.Links, x => x.url, (x, y) => false);

        log.Info("Splitting");
        foreach (var comment in comments)
        {
            // Extract links and keep the comment text with the links removed
            string commentWithoutLinks;
            var links = Tools.StripLinks(comment.message, out commentWithoutLinks);
            foreach (var link in links)
            {
                linkCache.Process(link, () => new Link { url = link });
                commentLinks.Add(Tuple.Create(comment.id, link));
            }

            // Split the link-stripped comment into words
            var words = Tools.SplitWords(commentWithoutLinks.ToLower()).Where(x => !string.IsNullOrEmpty(x));
            foreach (var word in words)
            {
                if (word.Length >= 100) // longest word the DB column can hold
                {
                    log.Warn("Ignoring " + word);
                    continue;
                }
                if (word.Any(x => char.IsDigit(x) || x == '_')) // skip words with digits or underscores
                {
                    continue;
                }
                if (word.Length < 2) // skip single characters
                {
                    continue;
                }
                wordCache.Process(word, () => new Word { word1 = word });
                commentWords.Add(Tuple.Create(comment.id, word));
            }
        }

        log.Info("Waiting for delete-tasks");
        Task.WaitAll(deleteLinksTask, deleteWordsTask);

        wordCache.SyncDatabase(2000, "dbo.Words", "id", word => new { id = (int?)null, word = word.word1 });
        linkCache.SyncDatabase(2000, "dbo.Links", "id", link => new { id = (int?)null, url = link.url });

        var linkTranslator = new FacebookDebatEntities().Links.ToDictionary(x => x.url, x => x.id);
        DatabaseTools.ChunkInsert("dbo.CommentLinks", 10000, commentLinks.Select(x => new { id = (int?)null, comment_id = x.Item1, link_id = linkTranslator[x.Item2] }));

        var wordTranslator = new FacebookDebatEntities().Words.ToDictionary(x => x.word1, x => x.id);
        DatabaseTools.ChunkInsert("dbo.CommentWords", 20000, commentWords.Select(x => new { id = (int?)null, comment_id = x.Item1, word_id = wordTranslator[x.Item2] }));

        log.Info("Marking comments as split");
        if (commentIds.Length != 0) // avoid invalid "IN ()" when there was nothing to split
        {
            DatabaseTools.ExecuteNonQuery(string.Format("update dbo.Comments set splitted = 1 where id in ({0})", commentIds));
        }
    }
}
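
// NOTE: Tools.StripLinks and Tools.SplitWords are used above but defined elsewhere in the
// project. The helpers below are only an illustrative sketch of the assumed contract
// (return the URLs found in a comment plus the text with them removed, then split the
// remaining text into tokens); they are not the project's actual implementation.
// The sketch additionally assumes System.Text.RegularExpressions is available.
public static class ToolsSketch
{
    private static readonly Regex LinkRegex = new Regex(@"https?://\S+", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    private static readonly Regex WordSplitRegex = new Regex(@"[^\p{L}]+", RegexOptions.Compiled);

    // Returns all URLs in the message and outputs the message with those URLs blanked out.
    public static List<string> StripLinks(string message, out string messageWithoutLinks)
    {
        var links = LinkRegex.Matches(message).Cast<Match>().Select(m => m.Value).ToList();
        messageWithoutLinks = LinkRegex.Replace(message, " ");
        return links;
    }

    // Splits text on anything that is not a letter; the caller above filters out
    // empty tokens, single characters and tokens containing digits or underscores.
    public static IEnumerable<string> SplitWords(string text)
    {
        return WordSplitRegex.Split(text);
    }
}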