        public IActionResult ExportAsCSV([FromBody] ElasticSearchRequest request)
        {
            IEnumerable<ScrapedPost> history = PostScraper.All(request.Query, request.Sort).Data;

            byte[] serialized = CsvSerialization.Serialize(history, CsvSerialization.MapPost);
            return File(serialized, "text/csv", "export.csv");
        }

        public IActionResult ExportAsJson([FromBody] ElasticSearchRequest request)
        {
            IEnumerable<ScrapedPost> history = PostScraper.All(request.Query, request.Sort).Data;

            byte[] serialized = Encoding.UTF8.GetBytes(JsonConvert.SerializeObject(history));
            return File(serialized, "application/json-download", "export.json");
        }
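
        // Sketch (not part of the original controller): reading the JSON export back
        // with Json.NET, to show the payload is just a serialized IEnumerable<ScrapedPost>.
        // The path "export.json" here is hypothetical.
        public static List<ScrapedPost> ReadJsonExport(string path)
        {
            byte[] bytes = System.IO.File.ReadAllBytes(path);
            return JsonConvert.DeserializeObject<List<ScrapedPost>>(Encoding.UTF8.GetString(bytes));
        }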
        public void GoThroughEachPostAndGetTheCommentsOhMyGodThisWillDestroyMyLaptop()
        {
            // Number of posts handled by a previous run; the first LastScrapeAmount
            // posts are assumed to have their comments scraped already.
            const int LastScrapeAmount = 0;
            int i = 0;

            AllResponse<ScrapedPost> posts = PostScraper.All(new SortField[]
            {
                new SortField { Field = "created_time", Order = SortOrder.Descending }
            });

            foreach (ScrapedPost post in posts.Data)
            {
                i++;

                // Only scrape comments for posts created on or after 1 April 2017.
                if (post.CreatedTime < new DateTime(2017, 04, 01))
                {
                    continue;
                }
                if (i > LastScrapeAmount)
                {
                    List<ScrapedComment> comments = CommentScraper.Scrape(post).ToList();
                    Console.WriteLine($"{i}/{posts.TotalCount}: {post.Id}; {comments.Count}");
                }
                else
                {
                    Console.WriteLine($"{i}/{posts.TotalCount}: {post.Id}; Already scraped.");
                }
            }
        }
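
        // Resume sketch: if a long run dies part-way, the index printed above can be
        // fed back in (e.g. const int LastScrapeAmount = 1234;) so the first 1234
        // posts are reported as "Already scraped." and skipped on the next run.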
 public PostScrapeController(PostScraper postScraper, CommentScraper commentScraper, PageScraper pageScraper,
                             ElasticSearchRepository<PageMetadata> pageMetadataRepository,
                             ElasticSearchRepository<PostScrapeHistory> postScrapeHistoryRepository)
 {
     PostScraper                 = postScraper;
     CommentScraper              = commentScraper;
     PageScraper                 = pageScraper;
     PageMetadataRepository      = pageMetadataRepository;
     PostScrapeHistoryRepository = postScrapeHistoryRepository;
 }
 public IEnumerable<ScrapedPost> ImportElasticSearchPosts(string path)
 {
     using (var fileStream = new FileStream(path, FileMode.Open))
     using (var streamReader = new StreamReader(fileStream))
     using (var csvReader = new CsvReader(streamReader))
     {
         csvReader.Configuration.RegisterClassMap<ScrapedPostMapping>();
         foreach (ScrapedPost record in csvReader.GetRecords<ScrapedPost>())
         {
             yield return PostScraper.Save(record, Refresh.False);
         }
     }
 }
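
 // Usage sketch: this is a lazy iterator (yield return inside using blocks), so the
 // file is not even opened until the result is enumerated, and the CSV stays open
 // until enumeration finishes. Materializing forces the whole import in one pass
 // ("controller" and "posts.csv" are hypothetical names):
 //
 //     List<ScrapedPost> imported = controller.ImportElasticSearchPosts("posts.csv").ToList();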
        public PostScrapeHistory ScrapePosts([FromBody] PostScrapeRequest request)
        {
            Debug.Assert(request != null);
            Console.WriteLine("Started Scraping");

            // If no specific pages were specified, scrape them all.
            PageMetadata[] pages;
            if (request.Pages == null)
            {
                pages = PageMetadataRepository.All().Data.ToArray();
            }
            else
            {
                pages = request.Pages.Select(p => PageMetadataRepository.Get(p)).ToArray();
            }

            int numberOfComments = 0;

            ScrapedPost[] posts = PostScraper.Scrape(pages, request.Since, request.Until).ToArray();

            Console.WriteLine($"Started scraping comments for {posts.Length} posts");

            foreach (ScrapedPost post in posts)
            {
                ScrapedComment[] comments = CommentScraper.Scrape(post).ToArray();
                numberOfComments += comments.Length;
                Console.WriteLine(numberOfComments);
            }

            Console.WriteLine($"Done scraping {pages.Length} pages. Scraped {posts.Length} posts with {numberOfComments} comments");

            var postScrape = new PostScrapeHistory
            {
                Id               = Guid.NewGuid().ToString(),
                Since            = request.Since,
                Until            = request.Until,
                ImportStart      = posts.FirstOrDefault()?.Scraped ?? DateTime.Now,
                ImportEnd        = DateTime.Now,
                NumberOfPosts    = posts.Length,
                NumberOfComments = numberOfComments,
                Pages            = pages
            };

            return PostScrapeHistoryRepository.Save(postScrape);
        }
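
        // Hypothetical invocation sketch. The property names mirror how ScrapePosts
        // reads the request; Since and Until are assumed to be DateTime here, and
        // Pages = null would scrape every page in PageMetadataRepository.
        //
        //     var history = controller.ScrapePosts(new PostScrapeRequest
        //     {
        //         Pages = new[] { "somePageId" },
        //         Since = new DateTime(2017, 4, 1),
        //         Until = new DateTime(2017, 5, 1)
        //     });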
Example #7
        static void Main(string[] args)
        {
            PostScraper scraper     = new PostScraper();
            RebbitMQManager manager = new RebbitMQManager();
            var data                = scraper.GetNewsOnPage(1);

            foreach (var item in data)
            {
                try
                {
                    manager.SendMessage(item);
                }
                catch (Exception ex)
                {
                    // Stop on the first failed send instead of retrying.
                    Console.WriteLine(ex.Message);
                    break;
                }
                Console.WriteLine("_________________________");
                Console.WriteLine(item.Title + "\n" + item.Author + "\n" + item.Description + "\n" + item.Url + "\n" + item.Date + "\n" + item.Type + "\n" + item.Topics + "\n" + item.NumberOfViews);

                // Crude rate limit: wait ten seconds between messages.
                Thread.Sleep(10000);
            }
            Console.WriteLine(data.Count);
            Console.ReadKey();
        }
Example #8
        public void ConfigureServices(IServiceCollection services)
        {
            // Boilerplate: add service and create Policy with options
            services.AddCors(options =>
            {
                options.AddPolicy("CorsPolicy",
                                  builder => builder.AllowAnyOrigin()
                                  .AllowAnyMethod()
                                  .AllowAnyHeader()
                                  .AllowCredentials());
            });
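
            // Note: ASP.NET Core 2.2 and later reject AllowAnyOrigin() combined with
            // AllowCredentials(); on those versions an explicit origin list
            // (builder.WithOrigins(...)) is required when credentials are allowed.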

            services.AddMvc();
            services.AddSingleton(Configuration);

            // Register our repositories with ASP.NET Core to allow them to be injected
            // into our controllers. This preserves the same state between the controllers.

            Version facebookGraphAPIVersion = new Version(Configuration["facebook:graphAPIVersion"]);
            string  facebookAppId           = Configuration["facebook:appId"];
            string  facebookAppSecret       = Configuration["facebook:appSecret"];
            var     graphClient             = new GraphClient(facebookGraphAPIVersion, facebookAppId, facebookAppSecret);

            services.AddSingleton(graphClient);

            string elasticSearchUrl          = Configuration["elasticsearch:url"];
            string elasticSearchDefaultIndex = Configuration["elasticsearch:defaultIndex"];

            string elasticSearchUserName = Configuration["elasticsearch:user"];
            string elasticSearchPassword = Configuration["elasticsearch:password"];

            var node = new Uri(elasticSearchUrl);
            Func<ConnectionSettings> settings = () =>
            {
                var connectionSettings = new ConnectionSettings(node);
                if (string.IsNullOrEmpty(elasticSearchUserName))
                {
                    return connectionSettings;
                }

                return connectionSettings.BasicAuthentication(elasticSearchUserName, elasticSearchPassword);
            };

            var pageMetadataRepository = new ElasticSearchRepository<PageMetadata>(settings(), elasticSearchDefaultIndex + "-metadata-page");

            services.AddSingleton(pageMetadataRepository);

            var pageScrapeHistoryRepository = new ElasticSearchRepository<PageScrapeHistory>(settings(), elasticSearchDefaultIndex + "-metadata-pagescrape");

            services.AddSingleton(pageScrapeHistoryRepository);

            var postScrapeRepository = new ElasticSearchRepository<PostScrapeHistory>(settings(), elasticSearchDefaultIndex + "-metadata-postscrape");

            services.AddSingleton(postScrapeRepository);

            var pageScraper = new PageScraper(settings(), elasticSearchDefaultIndex + "-page", graphClient);

            services.AddSingleton(pageScraper);

            var postScraper = new PostScraper(settings(), elasticSearchDefaultIndex + "-post", pageScraper, graphClient);

            services.AddSingleton(postScraper);

            var commentScraper = new CommentScraper(settings(), elasticSearchDefaultIndex + "-comment", graphClient);

            services.AddSingleton(commentScraper);
        }
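
        // The configuration keys read above imply an appsettings.json shaped roughly
        // like this (all values are placeholders, not real settings):
        //
        //     {
        //       "facebook": {
        //         "graphAPIVersion": "2.8",
        //         "appId": "<appId>",
        //         "appSecret": "<appSecret>"
        //       },
        //       "elasticsearch": {
        //         "url": "http://localhost:9200",
        //         "defaultIndex": "<indexPrefix>",
        //         "user": "",
        //         "password": ""
        //       }
        //     }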
Example #9
 public CommentScrapeController(CommentScraper commentScraper, PostScraper postScraper)
 {
     CommentScraper = commentScraper;
     PostScraper    = postScraper;
 }
Example #10
 public ScrapeImporter(PageScraper pageScraper, ElasticSearchRepository<PageMetadata> pageMetadataRepository, PostScraper postScraper)
 {
     PageScraper            = pageScraper;
     PageMetadataRepository = pageMetadataRepository;
     PostScraper            = postScraper;
 }
Example #11
        public IEnumerable<ScrapedPost> ImportPosts(IEnumerable<string> postCSVs)
        {
            var      posts       = new List<ScrapedPost>();
            DateTime now         = DateTime.Now;
            int      numberSaved = 0;

            Read(postCSVs, record =>
            {
                string postId         = (string)record["Media Title"];
                ScrapedPost savedPost = PostScraper.Get(postId);
                if (savedPost != null)
                {
                    // Skip posts that already exist. (Left disabled: `continue` is not
                    // valid inside this lambda; returning from the callback would be
                    // the equivalent.)
                    //Console.WriteLine($"Skipping {postId}.");
                    //continue;
                }

                ScrapedPost post = PostScraper.ScrapePost(postId);
                bool useDatabase = post == null;
                if (post == null)
                {
                    // Post has been deleted - we still want to save it.
                    post = new ScrapedPost {
                        Id = postId
                    };
                    Console.WriteLine($"Post {postId} does not exist.");
                }

                string normalizedPageName = null;
                foreach (string field in record.Keys)
                {
                    string trimmedField = field.Trim();
                    string value        = (string)record[field];

                    // If the post doesn't exist, we need to import various stuff from the page.
                    if (useDatabase)
                    {
                        if (trimmedField == "#_Post_Likes")
                        {
                            // Yuck: whole number likes can have decimal points in the data.
                            // Yuck: some rows are empty, or have invalid entries.
                            if (!int.TryParse(value, NumberStyles.AllowDecimalPoint, null, out int numberOfLikes))
                            {
                                Console.WriteLine("Cannot parse number of likes. Skipping...");
                                post.Reactions = new Reactions
                                {
                                    Summary = new ReactionsSummary {
                                        TotalCount = -1
                                    }
                                };
                                continue;
                            }

                            post.Reactions = new Reactions
                            {
                                Summary = new ReactionsSummary {
                                    TotalCount = numberOfLikes
                                }
                            };
                        }
                        else if (trimmedField == "#_Post_Comments")
                        {
                            // Yuck: whole number comment counts can have decimal points in the data.
                            // Yuck: some rows are empty, or have invalid entries.
                            if (!int.TryParse(value, NumberStyles.AllowDecimalPoint, null, out int numberOfComments))
                            {
                                Console.WriteLine("Cannot parse number of comments. Skipping...");
                                post.Comments = new Comments
                                {
                                    Summary = new CommentsSummary {
                                        TotalCount = -1
                                    }
                                };
                                continue;
                            }

                            post.Comments = new Comments
                            {
                                Summary = new CommentsSummary {
                                    TotalCount = numberOfComments
                                }
                            };
                        }
                        else if (trimmedField == "#_Post_Shares")
                        {
                            // Yuck: whole number share counts can have decimal points in the data.
                            // Yuck: some rows are empty, or have invalid entries.
                            if (!int.TryParse(value, NumberStyles.AllowDecimalPoint, null, out int numberOfShares))
                            {
                                Console.WriteLine("Cannot parse number of shares. Skipping...");
                                post.Shares = new Shares {
                                    Count = -1
                                };
                                continue;
                            }

                            post.Shares = new Shares {
                                Count = numberOfShares
                            };
                        }
                        else if (trimmedField == "Post_Date" || trimmedField == "Excerpt Date")
                        {
                            DateTime date    = DateTime.ParseExact(value, "M/d/yyyy", null);
                            post.CreatedTime = date;
                        }
                        else if (trimmedField == "Excerpt Copy")
                        {
                            post.Message = value;
                        }
                    }

                    // Turn the comma separated list of topics into an array.
                    if (trimmedField == "Codes Applied Combined")
                    {
                        IEnumerable<string> topics = value.Split(',').Select(c => c.Trim());
                        post.Topics = topics;
                    }

                    // Get the page from the post.
                    if (trimmedField == "Page Name")
                    {
                        normalizedPageName = Mappings[value.Trim()].Name;
                    }
                }

                // Get the nearest data we have for page likes at the time the post was created.
                Debug.Assert(normalizedPageName != null);
                PostScraper.UpdateMetadata(post, normalizedPageName);

                // Print the progress to make sure we know something is happening.
                numberSaved++;
                Console.WriteLine(numberSaved);

                // Save the post.
                posts.Add(PostScraper.Save(post, Refresh.False));
            });

            return posts;
        }
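
        // Usage sketch ("importer" and the CSV paths are hypothetical): the importer
        // reads each record, scrapes the live post where possible, falls back to the
        // CSV columns for deleted posts, and saves everything without refreshing the index.
        //
        //     var importer = new ScrapeImporter(pageScraper, pageMetadataRepository, postScraper);
        //     IEnumerable<ScrapedPost> imported = importer.ImportPosts(new[] { "media1.csv", "media2.csv" });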
 public PagedResponse<ScrapedPost> AllPosts([FromBody] ElasticSearchRequest request)
 {
     return PostScraper.Paged(request.PageNumber, request.PageSize, request.Query, request.Sort);
 }

 public ScrapedPost GetPost(string id) => PostScraper.Get(id);