Example #1
0
        /// <summary>
        /// Scrapes comments and/or likes from Facebook for a selection of stored posts and
        /// writes the resulting entities, comments and likes back to the database.
        /// </summary>
        /// <remarks>
        /// Posts are processed in chunks of 50. For every chunk the method:
        /// 1. fetches comments and likes in parallel (a post whose fetch throws is logged and skipped),
        /// 2. registers the commenting/liking users through <paramref name="entityUpdater"/>,
        /// 3. inserts/updates comments and inserts likes,
        /// 4. flags every successfully fetched post as scraped.
        /// </remarks>
        /// <param name="fb">Facebook client used to fetch comments and likes per post.</param>
        /// <param name="lookBackDays">Size, in days, of the "recent" window used when <paramref name="dailyScraping"/> is set.</param>
        /// <param name="maxUnscrapedPosts">Maximum number of not-yet-scraped posts (oldest first) to include.</param>
        /// <param name="dailyScraping">
        /// When true, posts dated inside the look-back window and posts that have comments dated
        /// inside that window are scraped in addition to the unscraped ones.
        /// </param>
        /// <param name="scrapeLikes">Whether likes are fetched and stored.</param>
        /// <param name="scrapeComments">Whether comments are fetched and stored.</param>
        /// <param name="entityUpdater">
        /// Builder that receives the users seen in comments/likes, is synced to dbo.Entities,
        /// and afterwards supplies the lookup from Facebook user id to entity id.
        /// </param>
        private static void ScrapePosts(Facebook fb, int lookBackDays, int maxUnscrapedPosts, bool dailyScraping, bool scrapeLikes, bool scrapeComments, UpdateOrInsertBuilder<Entity> entityUpdater)
        {
            log.Info("Scraping posts");
            if (scrapeLikes)
            {
                log.Info("Scraping for likes");
            }
            if (scrapeComments)
            {
                log.Info("Scraping for comments");
            }

            using (var db = new FacebookDebatEntities())
            {
                // NOTE(review): presumably 0 disables the command timeout for the long-running queries below - confirm.
                db.Database.CommandTimeout = 0;

                #region Build list of posts to be scraped for comments
                log.Info("Building post-list");
                var dayLimit = DateTime.Now.AddDays(-lookBackDays);

                // Oldest unscraped posts first, capped at maxUnscrapedPosts.
                var posts = db.Posts.Where(x => !x.scraped).OrderBy(x => x.date).Take(maxUnscrapedPosts).ToList();

                // On daily runs also revisit posts dated inside the look-back window and
                // posts that have at least one comment dated inside that window.
                if (dailyScraping)
                {
                    posts = posts.Union(db.Posts.Where(x => x.date > dayLimit)).ToList();
                    var latestCommentedPostIds = db.Comments.Where(x => x.date > dayLimit).GroupBy(x => x.post_id).Select(x => x.Key);
                    posts = posts.Union(db.Posts.Where(x => latestCommentedPostIds.Contains(x.id))).ToList();
                }
                #endregion

                log.Info("Found " + posts.Count + " posts");

                if (posts.Count == 0)
                {
                    // Nothing to scrape; skip building the post cache.
                    return;
                }

                // Maps Facebook post id -> database post id. Nothing in the loop below writes new
                // rows to dbo.Posts, so the cache is built once instead of once per chunk.
                log.Info("Intializing post-cache");
                var postTranslator = DatabaseTools.ExecuteDictionaryReader("select fb_id, id from dbo.Posts", (x) => (string)x["fb_id"], x => (int)x["id"]);

                foreach (var postChunk in posts.Chunk(50))
                {
                    var scrapedPosts = new List<Post>();

                    #region Scrape Comments from FB
                    var postComments = new List<Facebook.Comment>();
                    var postLikes = new List<Facebook.PostLike>();
                    log.Info("Getting from " + postChunk.Count() + " post");

                    // Abort flag: the assignment in the catch block below is intentionally left
                    // disabled, so a failing post is skipped and the remaining posts are still tried.
                    var fbFailed = false;
                    Parallel.ForEach(postChunk, (post) =>
                    {
                        if (fbFailed)
                        {
                            return;
                        }

                        try
                        {
                            // Fetch comments and likes for the same post concurrently.
                            var comments = Task.Run(() => scrapeComments ? fb.GetComments(post.fb_id) : new List<Facebook.Comment>());
                            var likes = Task.Run(() => scrapeLikes ? fb.GetLikes(post.fb_id) : new List<Facebook.PostLike>());
                            comments.Wait();
                            likes.Wait();

                            // postComments doubles as the gate protecting all three result lists.
                            lock (postComments)
                            {
                                postComments.AddRange(comments.Result);
                                postLikes.AddRange(likes.Result);
                                scrapedPosts.Add(post);
                            }
                        }
                        catch (Exception e)
                        {
                            //fbFailed = true;
                            log.Warn("Being ignored by Facebook. Aborting.");

                            // Task.Wait wraps failures in an AggregateException whose own message is
                            // generic; log the underlying cause instead.
                            log.Warn(e.GetBaseException().Message);

                            return;
                        }
                    });
                    #endregion

                    #region Update Entities
                    foreach (var comment in postComments)
                    {
                        entityUpdater.Process(comment.user_id, () => new Entity
                        {
                            fb_id = comment.user_id,
                            name = comment.user_name
                        });
                    }
                    foreach (var like in postLikes)
                    {
                        entityUpdater.Process(like.user_id, () => new Entity
                        {
                            fb_id = like.user_id,
                            name = like.user_name
                        });
                    }
                    entityUpdater.SyncDatabase(2000, "dbo.Entities", "id", x => new
                    {
                        id = (int?)null,
                        fb_id = x.fb_id,
                        name = x.name
                    });
                    #endregion

                    // Must be rebuilt per chunk: the sync above may have added entities.
                    log.Info("Intializing entity-cache");
                    var entityTranslator = entityUpdater.GetLookup(x => x.id);

                    var activePostIds = postChunk.Select(x => x.id).ToList();

                    #region Update comments
                    if (scrapeComments)
                    {
                        var commentUpdater = new UpdateOrInsertBuilder<Comment>(
                            AlreadyExists: db.Comments.Where(x => activePostIds.Contains(x.post_id)),
                            GetKey: x => x.fb_id + "_" + x.post_id,
                            Updater: (dbItem, memItem) =>
                            {
                                // Only the message text is refreshed on an existing comment.
                                if (dbItem.message != memItem.message)
                                {
                                    dbItem.message = memItem.message;
                                    return true;
                                }
                                return false;
                            });

                        foreach (var comment in postComments)
                        {
                            // The key has to use the database post id, exactly like GetKey above;
                            // using the Facebook post id here would never match an existing row.
                            var postId = postTranslator[comment.post_id];
                            commentUpdater.Process(comment.id + "_" + postId, () => new Comment()
                            {
                                fb_id = comment.id,
                                message = comment.message,
                                post_id = postId,
                                entity_id = entityTranslator[comment.user_id],
                                date = comment.date,
                                score = Common.Classifier.Classify(comment.message)
                            });
                        }
                        commentUpdater.SyncDatabase(2000, "dbo.[Comments]", "id", x => new
                        {
                            id = (int?)null,
                            fb_id = x.fb_id,
                            post_id = x.post_id,
                            entity_id = x.entity_id,
                            date = x.date,
                            score = x.score,
                            scored = 1,
                            message = x.message,
                            splitted = 0,
                        });
                    }
                    #endregion

                    #region Update likes
                    if (scrapeLikes)
                    {
                        log.Info("Creating PostLike-UpdateOrInsertBuilder");
                        var likeUpdater = new UpdateOrInsertBuilder<PostLike>(
                            AlreadyExists: db.PostLikes.Where(x => activePostIds.Contains(x.post_id)),
                            GetKey: x => x.post_id + "_" + x.entity_id,
                            // A like carries no mutable data, so existing rows are never updated.
                            Updater: (dbItem, memItem) => false);

                        log.Info("Processing PostLikes");
                        foreach (var like in postLikes)
                        {
                            // Resolve both ids once and reuse them for the key and the new row.
                            var postId = postTranslator[like.post_id];
                            var entityId = entityTranslator[like.user_id];
                            likeUpdater.Process(postId + "_" + entityId, () => new PostLike()
                            {
                                post_id = postId,
                                entity_id = entityId,
                            });
                        }
                        likeUpdater.SyncDatabase(10000, "dbo.[PostLikes]", "id", x => new
                        {
                            id = (int?)null,
                            post_id = x.post_id,
                            entity_id = x.entity_id,
                        });
                    }
                    #endregion

                    log.Info("Marking scraped");
                    Parallel.ForEach(scrapedPosts, new ParallelOptions()
                    {
                        MaxDegreeOfParallelism = 10
                    }, scrapedPost =>
                    {
                        // Posts revisited by the daily run are already flagged; skip the write.
                        if (!scrapedPost.scraped)
                        {
                            DatabaseTools.ExecuteNonQuery("UPDATE dbo.[Posts] SET scraped = 1 WHERE id = @id", new SqlParameter("id", scrapedPost.id));
                        }
                    });
                }
            }
        }