Beispiel #1
0
        /// <summary>
        /// Fetches comments and/or likes from Facebook for a selection of posts and syncs them,
        /// together with the users who wrote/liked them, into the database.
        /// </summary>
        /// <remarks>
        /// The selected posts are processed chunk by chunk. For each chunk the authors are synced
        /// through <paramref name="entityUpdater"/>, then comments and likes are synced, and finally
        /// every post whose fetch succeeded is flagged as scraped. Posts whose fetch threw are
        /// logged and skipped, so they stay unscraped and are picked up again on a later run.
        /// </remarks>
        /// <param name="fb">Facebook client used to fetch comments and likes by post fb_id.</param>
        /// <param name="lookBackDays">Size (in days) of the look-back window used when <paramref name="dailyScraping"/> is set.</param>
        /// <param name="maxUnscrapedPosts">Maximum number of not-yet-scraped posts (oldest first) to include.</param>
        /// <param name="dailyScraping">When true, also re-scrapes posts created, or commented on, within the look-back window.</param>
        /// <param name="scrapeLikes">Whether likes are fetched and stored.</param>
        /// <param name="scrapeComments">Whether comments are fetched and stored.</param>
        /// <param name="entityUpdater">Builder that collects the users seen in comments/likes and syncs them to dbo.Entities.</param>
        private static void ScrapePosts(Facebook fb, int lookBackDays, int maxUnscrapedPosts, bool dailyScraping, bool scrapeLikes, bool scrapeComments, UpdateOrInsertBuilder<Entity> entityUpdater)
        {
            log.Info("Scraping posts");
            if (scrapeLikes)
            {
                log.Info("Scraping for likes");
            }
            if (scrapeComments)
            {
                log.Info("Scraping for comments");
            }
            using (var db = new FacebookDebatEntities())
            {
                db.Database.CommandTimeout = 0;

                #region Build list of posts to be scraped for comments
                log.Info("Building post-list");
                var dayLimit = DateTime.Now.AddDays(-lookBackDays);

                // Get unscraped posts, oldest first
                var posts = db.Posts.Where(x => !x.scraped).OrderBy(x => x.date).Take(maxUnscrapedPosts).ToList();

                // Also include posts that were created, or received a comment, within the last lookBackDays days
                if (dailyScraping)
                {
                    posts = posts.Union(db.Posts.Where(x => x.date > dayLimit)).ToList();
                    var latestedCommentedPosts = db.Comments.Where(x => x.date > dayLimit).GroupBy(x => x.post_id).Select(x => x.Key);
                    posts = posts.Union(db.Posts.Where(x => latestedCommentedPosts.Contains(x.id))).ToList();
                }
                #endregion

                log.Info("Found " + posts.Count + " posts");

                foreach (var postChunk in posts.Chunk(50))
                {
                    // Posts of this chunk whose comments/likes were fetched without an exception
                    var scrapedPosts = new List<Post>();

                    #region Scrape Comments from FB
                    List<Facebook.Comment>  postComments = new List<Facebook.Comment>();
                    List<Facebook.PostLike> postLikes    = new List<Facebook.PostLike>();
                    log.Info("Getting from " + postChunk.Count() + " post");

                    // NOTE: nothing sets this flag at the moment (the assignment in the catch block is
                    // disabled), so a failing post is skipped instead of aborting the whole chunk.
                    var fbFailed = false;
                    Parallel.ForEach(postChunk, (post) =>
                    {
                        if (fbFailed)
                        {
                            return;
                        }

                        try
                        {
                            // Fetch comments and likes of the same post concurrently
                            var comments = Task.Run(() => scrapeComments ? fb.GetComments(post.fb_id) : new List<Facebook.Comment>());
                            var likes    = Task.Run(() => scrapeLikes ? fb.GetLikes(post.fb_id) : new List<Facebook.PostLike>());
                            comments.Wait();
                            likes.Wait();

                            // One lock guards all three result lists, which are shared between the parallel iterations
                            lock (postComments)
                            {
                                postComments.AddRange(comments.Result);
                                postLikes.AddRange(likes.Result);
                                scrapedPosts.Add(post);
                            }
                        }
                        catch (Exception e)
                        {
                            //fbFailed = true;
                            log.Warn("Being ignored by Facebook. Aborting.");
                            log.Warn(e.Message);
                        }
                    });
                    #endregion

                    #region Update Entities
                    foreach (var comment in postComments)
                    {
                        entityUpdater.Process(comment.user_id, () => new Entity
                        {
                            fb_id = comment.user_id,
                            name  = comment.user_name
                        });
                    }
                    foreach (var like in postLikes)
                    {
                        entityUpdater.Process(like.user_id, () => new Entity
                        {
                            fb_id = like.user_id,
                            name  = like.user_name
                        });
                    }
                    entityUpdater.SyncDatabase(2000, "dbo.Entities", "id", x => new
                    {
                        id    = (int?)null,
                        fb_id = x.fb_id,
                        name  = x.name
                    });
                    #endregion

                    // Lookups from Facebook ids to database ids, rebuilt after the entity sync
                    log.Info("Intializing post-cache");
                    var postTranslator = DatabaseTools.ExecuteDictionaryReader("select fb_id, id from dbo.Posts", (x) => (string)x["fb_id"], x => (int)x["id"]);
                    log.Info("Intializing entity-cache");
                    var entityTranslator = entityUpdater.GetLookup(x => x.id);

                    var ActivePostID = postChunk.Select(x => x.id).ToList();

                    #region Update comments
                    if (scrapeComments)
                    {
                        var commentUpdater = new UpdateOrInsertBuilder<Comment>(
                            AlreadyExists: db.Comments.Where(x => ActivePostID.Contains(x.post_id)),
                            // Key of a stored comment: Facebook comment id + database post id
                            GetKey: x => x.fb_id + "_" + x.post_id,
                            Updater: (dbItem, memItem) =>
                            {
                                if (dbItem.message != memItem.message)
                                {
                                    dbItem.message = memItem.message;
                                    return true;
                                }
                                return false;
                            });

                        foreach (var comment in postComments)
                        {
                            // The key must be built from the *database* post id so that it matches GetKey
                            // above. Using the Facebook post id here would never match a stored comment and
                            // every already-known comment would be treated as new.
                            var dbPostId = postTranslator[comment.post_id];

                            commentUpdater.Process(comment.id + "_" + dbPostId, () => new Comment()
                            {
                                fb_id     = comment.id,
                                message   = comment.message,
                                post_id   = dbPostId,
                                entity_id = entityTranslator[comment.user_id],
                                date      = comment.date,
                                score     = Common.Classifier.Classify(comment.message)
                            });
                        }
                        commentUpdater.SyncDatabase(2000, "dbo.[Comments]", "id", x => new
                        {
                            id        = (int?)null,
                            fb_id     = x.fb_id,
                            post_id   = x.post_id,
                            entity_id = x.entity_id,
                            date      = x.date,
                            score     = x.score,
                            scored    = 1,
                            message   = x.message,
                            splitted  = 0,
                        });
                    }
                    #endregion

                    #region Update likes
                    if (scrapeLikes)
                    {
                        log.Info("Creating PostLike-UpdateOrInsertBuilder");
                        var likeUpdater = new UpdateOrInsertBuilder<PostLike>(
                            AlreadyExists: db.PostLikes.Where(x => ActivePostID.Contains(x.post_id)),
                            GetKey: x => x.post_id + "_" + x.entity_id,
                            // A like carries no mutable data, so existing rows never need an update
                            Updater: (dbItem, memItem) => false);

                        log.Info("Processing PostLikes");
                        foreach (var like in postLikes)
                        {
                            likeUpdater.Process(postTranslator[like.post_id] + "_" + entityTranslator[like.user_id], () => new PostLike()
                            {
                                post_id   = postTranslator[like.post_id],
                                entity_id = entityTranslator[like.user_id],
                            });
                        }
                        likeUpdater.SyncDatabase(10000, "dbo.[PostLikes]", "id", x => new
                        {
                            id        = (int?)null,
                            post_id   = x.post_id,
                            entity_id = x.entity_id,
                        });
                    }
                    #endregion

                    log.Info("Marking scraped");
                    Parallel.ForEach(scrapedPosts, new ParallelOptions()
                    {
                        MaxDegreeOfParallelism = 10
                    }, scrapedPost =>
                    {
                        // Posts re-scraped by the daily run are already flagged; skip the needless UPDATE
                        if (!scrapedPost.scraped)
                        {
                            DatabaseTools.ExecuteNonQuery("UPDATE dbo.[Posts] SET scraped = 1 WHERE id = @id", new SqlParameter("id", scrapedPost.id));
                        }
                    });
                }
            }
        }
        /// <summary>
        /// Splits a batch of up to 10000 not-yet-split comments into links and words, replaces the
        /// CommentLinks/CommentWords rows of those comments and marks the comments as splitted.
        /// </summary>
        /// <remarks>
        /// Returns without touching the database when there are no comments left to split.
        /// </remarks>
        public static void SplitWords()
        {
            using (var db = new FacebookDebatEntities())
            {
                log.Info("Finding un-splitted comments");
                var comments = db.Comments.Where(x => !x.splitted).Take(10000).ToList();

                // Nothing to do. Returning here also avoids the invalid "... where id in ()"
                // statement that an empty id list would produce further down.
                if (comments.Count == 0)
                {
                    log.Info("No un-splitted comments found");
                    return;
                }

                // The ids are integers read from the database, so embedding them in the SQL text is safe.
                var commentIds = string.Join(",", comments.Select(x => x.id.ToString()));

                // Remove old word/link rows of these comments in the background while splitting in memory
                var deleteWordsTask = Task.Factory.StartNew(() =>
                {
                    DatabaseTools.ExecuteNonQuery(string.Format("DELETE FROM CommentWords WHERE comment_id IN ({0})", commentIds));
                    log.Info("Finished deleting words");
                });

                var deleteLinksTask = Task.Factory.StartNew(() =>
                {
                    DatabaseTools.ExecuteNonQuery(string.Format("DELETE FROM CommentLinks WHERE comment_id IN ({0})", commentIds));
                    log.Info("Finished deleting links");
                });

                // (comment id, word/link) pairs to be inserted once the word and link ids are known
                var commentWords = new List<Tuple<int, string>>();
                var commentLinks = new List<Tuple<int, string>>();

                log.Info("Building comment cache");
                var wordCache = new UpdateOrInsertBuilder<Word>(db.Words, x => x.word1, (x, y) => false);

                log.Info("Building link cache");
                var linkCache = new UpdateOrInsertBuilder<Link>(db.Links, x => x.url, (x, y) => false);


                log.Info("Splitting");
                foreach (var comment in comments)
                {
                    // Get links
                    String commentWithoutLinks;
                    var    links = Tools.StripLinks(comment.message, out commentWithoutLinks);
                    foreach (var link in links)
                    {
                        linkCache.Process(link, () => new Link()
                        {
                            url = link
                        });
                        commentLinks.Add(Tuple.Create(comment.id, link));
                    }

                    // Get words from link-stripped comment
                    var words = Tools.SplitWords(commentWithoutLinks.ToLower()).Where(x => !string.IsNullOrEmpty(x));
                    foreach (var word in words)
                    {
                        if (word.Length >= 100) // Longest possible word in DB
                        {
                            log.Warn("Ignoring " + word);
                            continue;
                        }

                        if (word.Any(x => char.IsDigit(x) || x == '_')) // no words with underscores or digits
                        {
                            continue;
                        }

                        if (word.Length < 2)
                        {
                            continue;
                        }

                        wordCache.Process(word, () => new Word()
                        {
                            word1 = word
                        });

                        commentWords.Add(Tuple.Create(comment.id, word));
                    }
                }

                // The old rows must be gone before the new ones are inserted
                log.Info("Waiting for delete-tasks");
                Task.WaitAll(deleteLinksTask, deleteWordsTask);

                wordCache.SyncDatabase(2000, "dbo.Words", "id", (word) => new
                {
                    id   = (int?)null,
                    word = word.word1
                });

                linkCache.SyncDatabase(2000, "dbo.Links", "id", (link) => new
                {
                    id  = (int?)null,
                    url = link.url
                });

                // A fresh context is used to read the ids after the sync above; it is disposed
                // as soon as the lookup dictionary has been materialized.
                Dictionary<string, int> linkTranslator;
                using (var linkDb = new FacebookDebatEntities())
                {
                    linkTranslator = linkDb.Links.ToDictionary(x => x.url, x => x.id);
                }
                DatabaseTools.ChunkInsert("dbo.CommentLinks", 10000, commentLinks.Select(x => new
                {
                    id         = (int?)null,
                    comment_id = x.Item1,
                    link_id    = linkTranslator[x.Item2]
                }));

                Dictionary<string, int> wordTranslator;
                using (var wordDb = new FacebookDebatEntities())
                {
                    wordTranslator = wordDb.Words.ToDictionary(x => x.word1, x => x.id);
                }
                DatabaseTools.ChunkInsert("dbo.CommentWords", 20000, commentWords.Select(x => new
                {
                    id         = (int?)null,
                    comment_id = x.Item1,
                    word_id    = wordTranslator[x.Item2]
                }));

                log.Info("Marking splitted");
                DatabaseTools.ExecuteNonQuery(string.Format("update dbo.Comments set splitted = 1 where id in ({0})", commentIds));
            }
        }