/// <summary>
/// Selects a batch of posts, fetches their comments and/or likes from Facebook in chunks of 50 posts,
/// and writes the resulting entities, comments and likes to the database. Posts whose fetch succeeded
/// are flagged as scraped.
/// </summary>
/// <param name="fb">Facebook client used to fetch comments and likes by the post's fb_id.</param>
/// <param name="lookBackDays">With <paramref name="dailyScraping"/>, posts dated or commented on within this many days are included.</param>
/// <param name="maxUnscrapedPosts">Maximum number of never-scraped posts to pick (oldest first).</param>
/// <param name="dailyScraping">When true, recently created and recently commented posts are added to the batch.</param>
/// <param name="scrapeLikes">When true, likes are fetched and synced to dbo.[PostLikes].</param>
/// <param name="scrapeComments">When true, comments are fetched and synced to dbo.[Comments].</param>
/// <param name="entityUpdater">Entity cache; users seen in comments/likes are processed through it and synced to dbo.Entities.</param>
private static void ScrapePosts(Facebook fb, int lookBackDays, int maxUnscrapedPosts, bool dailyScraping, bool scrapeLikes, bool scrapeComments, UpdateOrInsertBuilder<Entity> entityUpdater)
{
    log.Info("Scraping posts");
    if (scrapeLikes)
    {
        log.Info("Scraping for likes");
    }
    if (scrapeComments)
    {
        log.Info("Scraping for comments");
    }

    using (var db = new FacebookDebatEntities())
    {
        // NOTE(review): 0 presumably disables the command timeout for this context - confirm.
        db.Database.CommandTimeout = 0;

        #region Build list of posts to be scraped for comments
        log.Info("Building post-list");
        var dayLimit = DateTime.Now.AddDays(-lookBackDays);

        // Oldest posts that have never been scraped, capped at maxUnscrapedPosts.
        var posts = db.Posts.Where(x => !x.scraped).OrderBy(x => x.date).Take(maxUnscrapedPosts).ToList();

        // Add posts created, or commented on, within the last lookBackDays days.
        if (dailyScraping)
        {
            posts = posts.Union(db.Posts.Where(x => x.date > dayLimit)).ToList();
            var latestedCommentedPosts = db.Comments.Where(x => x.date > dayLimit).GroupBy(x => x.post_id).Select(x => x.Key);
            posts = posts.Union(db.Posts.Where(x => latestedCommentedPosts.Contains(x.id))).ToList();
        }
        #endregion

        log.Info("Found " + posts.Count + " posts");
        if (posts.Count == 0)
        {
            // Nothing to scrape; skip loading the post lookup below.
            return;
        }

        // fb_id -> database id for all posts. Every post in the batch already exists in dbo.Posts before the
        // loop starts, so this lookup is loaded once instead of once per chunk.
        log.Info("Intializing post-cache");
        var postTranslator = DatabaseTools.ExecuteDictionaryReader("select fb_id, id from dbo.Posts", (x) => (string)x["fb_id"], x => (int)x["id"]);

        foreach (var postChunk in posts.Chunk(50))
        {
            var scrapedPosts = new List<Post>();

            #region Scrape Comments from FB
            var postComments = new List<Facebook.Comment>();
            var postLikes = new List<Facebook.PostLike>();
            // Guards the three result lists, which are appended to from parallel workers.
            var resultGate = new object();

            log.Info("Getting from " + postChunk.Count() + " post");

            // Abort-on-first-failure is currently disabled: nothing sets this flag, so a failed post is
            // skipped and the remaining posts in the chunk are still attempted.
            var fbFailed = false;

            Parallel.ForEach(postChunk, (post) =>
            {
                if (fbFailed)
                {
                    return;
                }

                try
                {
                    var comments = Task.Run(() => scrapeComments ? fb.GetComments(post.fb_id) : new List<Facebook.Comment>());
                    var likes = Task.Run(() => scrapeLikes ? fb.GetLikes(post.fb_id) : new List<Facebook.PostLike>());

                    // Wait on both so that a failure in one task never leaves the other unobserved.
                    Task.WaitAll(comments, likes);

                    lock (resultGate)
                    {
                        postComments.AddRange(comments.Result);
                        postLikes.AddRange(likes.Result);
                        scrapedPosts.Add(post);
                    }
                }
                catch (Exception e)
                {
                    log.Warn("Being ignored by Facebook. Aborting.");

                    // Waiting on a task wraps failures in an AggregateException whose own message is generic;
                    // log the underlying errors instead.
                    var aggregate = e as AggregateException;
                    if (aggregate != null)
                    {
                        foreach (var inner in aggregate.Flatten().InnerExceptions)
                        {
                            log.Warn(inner.Message);
                        }
                    }
                    else
                    {
                        log.Warn(e.Message);
                    }
                }
            });
            #endregion

            #region Update Entities
            // NOTE(review): Start hands the same entityUpdater to two concurrently running ScrapePosts loops;
            // confirm UpdateOrInsertBuilder is safe for concurrent use.
            foreach (var comment in postComments)
            {
                entityUpdater.Process(comment.user_id, () => new Entity { fb_id = comment.user_id, name = comment.user_name });
            }
            foreach (var like in postLikes)
            {
                entityUpdater.Process(like.user_id, () => new Entity { fb_id = like.user_id, name = like.user_name });
            }
            entityUpdater.SyncDatabase(2000, "dbo.Entities", "id", x => new { id = (int?)null, fb_id = x.fb_id, name = x.name });
            #endregion

            // Rebuilt for every chunk, after the entity sync above.
            log.Info("Intializing entity-cache");
            var entityTranslator = entityUpdater.GetLookup(x => x.id);

            var activePostIds = postChunk.Select(x => x.id).ToList();

            #region Update comments
            if (scrapeComments)
            {
                var commentUpdater = new UpdateOrInsertBuilder<Comment>(
                    AlreadyExists: db.Comments.Where(x => activePostIds.Contains(x.post_id)),
                    GetKey: x => x.fb_id + "_" + x.post_id,
                    Updater: (dbItem, memItem) =>
                    {
                        // Only the message text is refreshed on an existing comment.
                        if (dbItem.message != memItem.message)
                        {
                            dbItem.message = memItem.message;
                            return true;
                        }
                        return false;
                    });

                foreach (var comment in postComments)
                {
                    commentUpdater.Process(comment.id + "_" + comment.post_id, () => new Comment()
                    {
                        fb_id = comment.id,
                        message = comment.message,
                        post_id = postTranslator[comment.post_id],
                        entity_id = entityTranslator[comment.user_id],
                        date = comment.date,
                        score = Common.Classifier.Classify(comment.message)
                    });
                }

                commentUpdater.SyncDatabase(2000, "dbo.[Comments]", "id", x => new
                {
                    id = (int?)null,
                    fb_id = x.fb_id,
                    post_id = x.post_id,
                    entity_id = x.entity_id,
                    date = x.date,
                    score = x.score,
                    scored = 1,
                    message = x.message,
                    splitted = 0,
                });
            }
            #endregion

            #region Update likes
            if (scrapeLikes)
            {
                log.Info("Creating PostLike-UpdateOrInsertBuilder");
                var likeUpdater = new UpdateOrInsertBuilder<PostLike>(
                    AlreadyExists: db.PostLikes.Where(x => activePostIds.Contains(x.post_id)),
                    GetKey: x => x.post_id + "_" + x.entity_id,
                    // An existing like has nothing to update.
                    Updater: (dbItem, memItem) => false);

                log.Info("Processing PostLikes");
                foreach (var like in postLikes)
                {
                    likeUpdater.Process(postTranslator[like.post_id] + "_" + entityTranslator[like.user_id], () => new PostLike()
                    {
                        post_id = postTranslator[like.post_id],
                        entity_id = entityTranslator[like.user_id],
                    });
                }

                likeUpdater.SyncDatabase(10000, "dbo.[PostLikes]", "id", x => new
                {
                    id = (int?)null,
                    post_id = x.post_id,
                    entity_id = x.entity_id,
                });
            }
            #endregion

            // Only posts whose fetch succeeded are flagged; failed ones stay unscraped.
            log.Info("Marking scraped");
            Parallel.ForEach(scrapedPosts, new ParallelOptions() { MaxDegreeOfParallelism = 10 }, scrapedPost =>
            {
                if (!scrapedPost.scraped)
                {
                    DatabaseTools.ExecuteNonQuery("UPDATE dbo.[Posts] SET scraped = 1 WHERE id = @id", new SqlParameter("id", scrapedPost.id));
                }
            });
        }
    }
}
/// <summary>
/// Reads the scraper settings from the application configuration, builds the shared entity cache,
/// starts one repeating background task per enabled job plus an always-on aggregation task, and then
/// blocks until a line is read from the console.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
private static void Start(string[] args)
{
    var fb = new Facebook();

    int lookbackDays = int.Parse(ConfigurationManager.AppSettings["LookbackDays"]);
    int maxUnscrapedPosts = int.Parse(ConfigurationManager.AppSettings["MaxUnscrapedPosts"]);
    bool dailyScraping = bool.Parse(ConfigurationManager.AppSettings["DailyScraping"]);

    bool getNames = bool.Parse(ConfigurationManager.AppSettings["GetNames"]);
    int nameScrapeInterval = int.Parse(ConfigurationManager.AppSettings["NameScrapeInterval"]);

    bool getPosts = bool.Parse(ConfigurationManager.AppSettings["GetPosts"]);
    int postScrapeInterval = int.Parse(ConfigurationManager.AppSettings["PostScrapeInterval"]);

    bool getComments = bool.Parse(ConfigurationManager.AppSettings["GetComments"]);
    int commentScrapeInterval = int.Parse(ConfigurationManager.AppSettings["CommentScrapeInterval"]);

    bool getLikes = bool.Parse(ConfigurationManager.AppSettings["GetLikes"]);
    int likeScrapeInterval = int.Parse(ConfigurationManager.AppSettings["LikeScrapeInterval"]);

    bool splitComments = bool.Parse(ConfigurationManager.AppSettings["SplitComments"]);
    int splitCommentInterval = int.Parse(ConfigurationManager.AppSettings["SplitCommentInterval"]);

    // Runs `function` forever with `interval` seconds of sleep between runs when `doIt` is true;
    // otherwise the returned task completes immediately.
    Func<bool, Action, int, Task> StartRepeatingTask = (doIt, function, interval) =>
    {
        return Task.Run(() =>
        {
            if (doIt)
            {
                while (true)
                {
                    CatchWithoutDebug(() => function());
                    Thread.Sleep(interval * 1000);
                }
            }
        });
    };

    UpdateOrInsertBuilder<Entity> entityUpdater;
    log.Info("Initializing Entity cache");
    using (var db = new FacebookDebatEntities())
    {
        entityUpdater = new UpdateOrInsertBuilder<Entity>(
            AlreadyExists: db.Entities,
            GetKey: x => x.fb_id,
            Updater: (dbItem, memItem) =>
            {
                // Only the name is refreshed on an existing entity.
                if (dbItem.name != memItem.name)
                {
                    dbItem.name = memItem.name;
                    return true;
                }
                return false;
            });
    }
    log.Info("Done");

    StartRepeatingTask(getNames, () => GetScrapees(fb), nameScrapeInterval);
    StartRepeatingTask(getPosts, () => GetPosts(fb, lookbackDays, DateTime.Now), postScrapeInterval);
    // Comments and likes are scraped by two separate loops that share the same entityUpdater.
    StartRepeatingTask(getComments, () => ScrapePosts(fb, lookbackDays, maxUnscrapedPosts, dailyScraping, false, true, entityUpdater), commentScrapeInterval);
    StartRepeatingTask(getLikes, () => ScrapePosts(fb, lookbackDays, maxUnscrapedPosts, dailyScraping, true, false, entityUpdater), likeScrapeInterval);
    StartRepeatingTask(splitComments, () => CommentSplitter.SplitWords(), splitCommentInterval);

    // Aggregation runs unconditionally every 10 minutes.
    StartRepeatingTask(true, () =>
    {
        // For entities that are not scrapees: count their comments on posts owned by 'Blå' and 'Rød' scrapees.
        log.Info("Aggregating likes");
        DatabaseTools.ExecuteNonQuery(
            "update e set blalikes = a.blalikes, rodlikes = a.rodlikes " +
            "from dbo.Entities e " +
            "left join dbo.Scrapees s on e.id = s.entity_id " +
            "left join ( " +
            "select c.entity_id, " +
            "sum(case when s.Blok = 'Blå' then 1 else 0 end) as blalikes, " +
            "sum(case when s.Blok = 'Rød' then 1 else 0 end) as rodlikes " +
            "from Comments c " +
            "inner join Posts p on c.post_id = p.id " +
            "inner join Scrapees s on s.entity_id = p.entity_id " +
            "group by c.entity_id " +
            ") a on a.entity_id = e.id " +
            "where s.entity_id is null");

        // Classify entities with more than 4 counted comments by their 'Rød' percentage.
        // NULLIF guards the division: the derived table covers every entity, including those whose
        // rodlikes+blalikes is 0, and the "> 4" filter is not guaranteed to be applied before the
        // ratio is computed. Rows that pass the filter are unaffected by the guard.
        log.Info("Concluding blocks");
        DatabaseTools.ExecuteNonQuery(
            "update e set blok = IIF(ratio >= 65, 'Rød', IIF(ratio <= 35, 'Blå', 'Midt')) " +
            "from Entities e " +
            "inner join ( " +
            "select id, rodlikes, blalikes, rodlikes*100/NULLIF(rodlikes+blalikes, 0) as ratio from Entities " +
            ") a on e.id= a.id " +
            "where (e.rodlikes+e.blalikes) > 4");
    }, 10 * 60);

    // Keep the process alive while the background tasks run.
    Console.ReadLine();
}
/// <summary>
/// Takes up to 10000 comments that are not yet marked as splitted, extracts their links and words,
/// replaces their rows in CommentLinks and CommentWords, and marks the comments as splitted.
/// Returns without touching the database further when there are no such comments.
/// </summary>
public static void SplitWords()
{
    using (var db = new FacebookDebatEntities())
    {
        log.Info("Finding un-splitted comments");
        var comments = db.Comments.Where(x => !x.splitted).Take(10000).ToList();

        if (comments.Count == 0)
        {
            // An empty id list would produce "... in ()", which is not valid SQL, and there is no
            // reason to build the word/link caches when nothing will be processed.
            log.Info("No un-splitted comments found");
            return;
        }

        // Comma-separated integer ids, reused by the DELETE and UPDATE statements below.
        var commentIds = string.Join(",", comments.Select(x => x.id.ToString()).ToArray());

        // Old word/link rows for these comments are removed in the background while splitting runs.
        var deleteWordsTask = Task.Factory.StartNew(() =>
        {
            DatabaseTools.ExecuteNonQuery(string.Format("DELETE FROM CommentWords WHERE comment_id IN ({0})", commentIds));
            log.Info("Finished deleting words");
        });
        var deleteLinksTask = Task.Factory.StartNew(() =>
        {
            DatabaseTools.ExecuteNonQuery(string.Format("DELETE FROM CommentLinks WHERE comment_id IN ({0})", commentIds));
            log.Info("Finished deleting links");
        });

        var commentWords = new List<Tuple<int, string>>();
        var commentLinks = new List<Tuple<int, string>>();

        log.Info("Building comment cache");
        var wordCache = new UpdateOrInsertBuilder<Word>(db.Words, x => x.word1, (x, y) => false);
        log.Info("Building link cache");
        var linkCache = new UpdateOrInsertBuilder<Link>(db.Links, x => x.url, (x, y) => false);

        log.Info("Splitting");
        foreach (var comment in comments)
        {
            // Get links
            String commentWithoutLinks;
            var links = Tools.StripLinks(comment.message, out commentWithoutLinks);
            foreach (var link in links)
            {
                linkCache.Process(link, () => new Link() { url = link });
                commentLinks.Add(Tuple.Create(comment.id, link));
            }

            // Get words from link-stripped comment
            var words = Tools.SplitWords(commentWithoutLinks.ToLower()).Where(x => !string.IsNullOrEmpty(x));
            foreach (var word in words)
            {
                if (word.Length >= 100) // Longest possible word in DB
                {
                    log.Warn("Ignoring " + word);
                    continue;
                }
                if (word.Any(x => char.IsDigit(x) || x == '_')) // no words with underscores or digits
                {
                    continue;
                }
                if (word.Length < 2)
                {
                    continue;
                }

                wordCache.Process(word, () => new Word() { word1 = word });
                commentWords.Add(Tuple.Create(comment.id, word));
            }
        }

        // The old rows must be gone before the new ones are inserted.
        log.Info("Waiting for delete-tasks");
        Task.WaitAll(deleteLinksTask, deleteWordsTask);

        wordCache.SyncDatabase(2000, "dbo.Words", "id", (word) => new { id = (int?)null, word = word.word1 });
        linkCache.SyncDatabase(2000, "dbo.Links", "id", (link) => new { id = (int?)null, url = link.url });

        // A fresh context is used to read the ids of the rows synced above; it is disposed once the
        // lookups have been consumed.
        using (var lookupDb = new FacebookDebatEntities())
        {
            var linkTranslator = lookupDb.Links.ToDictionary(x => x.url, x => x.id);
            DatabaseTools.ChunkInsert("dbo.CommentLinks", 10000, commentLinks.Select(x => new { id = (int?)null, comment_id = x.Item1, link_id = linkTranslator[x.Item2] }));

            var wordTranslator = lookupDb.Words.ToDictionary(x => x.word1, x => x.id);
            DatabaseTools.ChunkInsert("dbo.CommentWords", 20000, commentWords.Select(x => new { id = (int?)null, comment_id = x.Item1, word_id = wordTranslator[x.Item2] }));
        }

        log.Info("Marking splitted");
        DatabaseTools.ExecuteNonQuery(string.Format("update dbo.Comments set splitted = 1 where id in ({0})", commentIds));
    }
}