public IActionResult ExportAsCSV([FromBody] ElasticSearchRequest request)
{
    IEnumerable<ScrapedPost> history = PostScraper.All(request.Query, request.Sort).Data;
    byte[] serialized = CsvSerialization.Serialize(history, CsvSerialization.MapPost);
    return File(serialized, "text/csv", "export.csv");
}
public IActionResult ExportAsJson([FromBody] ElasticSearchRequest request)
{
    IEnumerable<ScrapedPost> history = PostScraper.All(request.Query, request.Sort).Data;
    byte[] serialized = Encoding.UTF8.GetBytes(JsonConvert.SerializeObject(history));
    return File(serialized, "application/json-download", "export.json");
}
public void GoThroughEachPostAndGetTheCommentsOhMyGodThisWillDestroyMyLaptop()
{
    const int LastScrapeAmount = 0;
    int i = 0;

    // Fetch every post, newest first.
    AllResponse<ScrapedPost> posts = PostScraper.All(new SortField[]
    {
        new SortField { Field = "created_time", Order = SortOrder.Descending }
    });

    foreach (ScrapedPost post in posts.Data)
    {
        i++;

        // Ignore posts created before April 2017.
        if (post.CreatedTime < new DateTime(2017, 04, 01))
        {
            continue;
        }

        if (i > LastScrapeAmount)
        {
            List<ScrapedComment> comments = CommentScraper.Scrape(post).ToList();
            Console.WriteLine($"{i}/{posts.TotalCount}: {post.Id}; {comments.Count}");
        }
        else
        {
            Console.WriteLine($"{i}/{posts.TotalCount}: {post.Id}; Already scraped.");
        }
    }
}
public PostScrapeController(PostScraper postScraper, CommentScraper commentScraper, PageScraper pageScraper,
    ElasticSearchRepository<PageMetadata> pageMetadataRepository,
    ElasticSearchRepository<PostScrapeHistory> postScrapeHistoryRepository)
{
    PostScraper = postScraper;
    CommentScraper = commentScraper;
    PageScraper = pageScraper;
    PageMetadataRepository = pageMetadataRepository;
    PostScrapeHistoryRepository = postScrapeHistoryRepository;
}
public IEnumerable<ScrapedPost> ImportElasticSearchPosts(string path)
{
    using (var fileStream = new FileStream(path, FileMode.Open))
    using (var streamReader = new StreamReader(fileStream))
    using (var csvReader = new CsvReader(streamReader))
    {
        csvReader.Configuration.RegisterClassMap<ScrapedPostMapping>();
        foreach (ScrapedPost record in csvReader.GetRecords<ScrapedPost>())
        {
            yield return PostScraper.Save(record, Refresh.False);
        }
    }
}
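The importer above registers a `ScrapedPostMapping` class map that is not shown in these excerpts; assuming the CSV library is CsvHelper, a minimal sketch might look like the following (the column names are assumptions, and older CsvHelper versions derive from CsvClassMap<T> instead of ClassMap<T>).

using CsvHelper.Configuration;

// Hypothetical CsvHelper class map for ScrapedPost; column names are assumptions.
public sealed class ScrapedPostMapping : ClassMap<ScrapedPost>
{
    public ScrapedPostMapping()
    {
        Map(p => p.Id).Name("id");
        Map(p => p.Message).Name("message");
        Map(p => p.CreatedTime).Name("created_time");
    }
}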
public PostScrapeHistory ScrapePosts([FromBody] PostScrapeRequest request)
{
    Debug.Assert(request != null);

    Console.WriteLine("Started Scraping");

    // If no specific pages were specified, scrape them all.
    PageMetadata[] pages;
    if (request.Pages == null)
    {
        pages = PageMetadataRepository.All().Data.ToArray();
    }
    else
    {
        pages = request.Pages.Select(p => PageMetadataRepository.Get(p)).ToArray();
    }

    int numberOfComments = 0;
    ScrapedPost[] posts = PostScraper.Scrape(pages, request.Since, request.Until).ToArray();

    Console.WriteLine($"Started scraping comments for {posts.Length} posts");

    foreach (ScrapedPost post in posts)
    {
        ScrapedComment[] comments = CommentScraper.Scrape(post).ToArray();
        numberOfComments += comments.Length;
        Console.WriteLine(numberOfComments);
    }

    Console.WriteLine($"Done scraping {pages.Length} pages. Scraped {posts.Length} posts with {numberOfComments} comments");

    // Record this scrape in the history index.
    var postScrape = new PostScrapeHistory
    {
        Id = Guid.NewGuid().ToString(),
        Since = request.Since,
        Until = request.Until,
        ImportStart = posts.FirstOrDefault()?.Scraped ?? DateTime.Now,
        ImportEnd = DateTime.Now,
        NumberOfPosts = posts.Length,
        NumberOfComments = numberOfComments,
        Pages = pages
    };

    return PostScrapeHistoryRepository.Save(postScrape);
}
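The `PostScrapeRequest` body bound above is not shown in these excerpts; judging only from the properties the action reads (`Pages`, `Since`, `Until`), a rough sketch could be the following, with the property types being assumptions.

// Hypothetical shape of PostScrapeRequest, inferred from usage; types are assumptions.
public class PostScrapeRequest
{
    // Page identifiers to scrape; null means "scrape every known page".
    public IEnumerable<string> Pages { get; set; }

    public DateTime Since { get; set; }
    public DateTime Until { get; set; }
}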
static void Main(string[] args)
{
    PostScraper scraper = new PostScraper();
    RebbitMQManager manager = new RebbitMQManager();
    var data = scraper.GetNewsOnPage(1);

    foreach (var item in data)
    {
        try
        {
            manager.SendMessage(item);
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
            break;
        }

        Console.WriteLine("_________________________");
        Console.WriteLine(item.Title + "\n" + item.Author + "\n" + item.Description + "\n" + item.Url + "\n" +
                          item.Date + "\n" + item.Type + "\n" + item.Topics + "\n" + item.NumberOfViews);

        // Wait ten seconds between messages to avoid flooding the queue.
        Thread.Sleep(10000);
    }

    Console.WriteLine(data.Count);
    Console.ReadKey();
}
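The `RebbitMQManager` used here is not included in these excerpts. A minimal sketch of what `SendMessage` could look like, assuming the RabbitMQ.Client package, is below; the host name, queue name, and JSON serialization are assumptions, not the project's actual implementation.

// Hypothetical implementation sketch; connection details and queue name are placeholders.
public class RebbitMQManager
{
    private readonly ConnectionFactory _factory = new ConnectionFactory { HostName = "localhost" };

    public void SendMessage(object item)
    {
        using (var connection = _factory.CreateConnection())
        using (var channel = connection.CreateModel())
        {
            channel.QueueDeclare(queue: "news", durable: false, exclusive: false, autoDelete: false, arguments: null);
            byte[] body = Encoding.UTF8.GetBytes(JsonConvert.SerializeObject(item));
            channel.BasicPublish(exchange: "", routingKey: "news", basicProperties: null, body: body);
        }
    }
}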
public void ConfigureServices(IServiceCollection services)
{
    // Boilerplate: add service and create Policy with options
    services.AddCors(options =>
    {
        options.AddPolicy("CorsPolicy",
            builder => builder.AllowAnyOrigin()
                              .AllowAnyMethod()
                              .AllowAnyHeader()
                              .AllowCredentials());
    });

    services.AddMvc();
    services.AddSingleton(Configuration);

    // Register our repositories with ASP.NET Core to allow them to be injected
    // into our controllers. This preserves the same state between the controllers.
    Version facebookGraphAPIVersion = new Version(Configuration["facebook:graphAPIVersion"]);
    string facebookAppId = Configuration["facebook:appId"];
    string facebookAppSecret = Configuration["facebook:appSecret"];
    var graphClient = new GraphClient(facebookGraphAPIVersion, facebookAppId, facebookAppSecret);
    services.AddSingleton(graphClient);

    string elasticSearchUrl = Configuration["elasticsearch:url"];
    string elasticSearchDefaultIndex = Configuration["elasticsearch:defaultIndex"];
    string elasticSearchUserName = Configuration["elasticsearch:user"];
    string elasticSearchPassword = Configuration["elasticsearch:password"];
    var node = new Uri(elasticSearchUrl);

    // Each repository and scraper gets its own ConnectionSettings instance.
    Func<ConnectionSettings> settings = () =>
    {
        var connectionSettings = new ConnectionSettings(node);
        if (string.IsNullOrEmpty(elasticSearchUserName))
        {
            return connectionSettings;
        }

        return connectionSettings.BasicAuthentication(elasticSearchUserName, elasticSearchPassword);
    };

    var pageMetadataRepository = new ElasticSearchRepository<PageMetadata>(settings(), elasticSearchDefaultIndex + "-metadata-page");
    services.AddSingleton(pageMetadataRepository);

    var pageScrapeHistoryRepository = new ElasticSearchRepository<PageScrapeHistory>(settings(), elasticSearchDefaultIndex + "-metadata-pagescrape");
    services.AddSingleton(pageScrapeHistoryRepository);

    var postScrapeRepository = new ElasticSearchRepository<PostScrapeHistory>(settings(), elasticSearchDefaultIndex + "-metadata-postscrape");
    services.AddSingleton(postScrapeRepository);

    var pageScraper = new PageScraper(settings(), elasticSearchDefaultIndex + "-page", graphClient);
    services.AddSingleton(pageScraper);

    var postScraper = new PostScraper(settings(), elasticSearchDefaultIndex + "-post", pageScraper, graphClient);
    services.AddSingleton(postScraper);

    var commentScraper = new CommentScraper(settings(), elasticSearchDefaultIndex + "-comment", graphClient);
    services.AddSingleton(commentScraper);
}
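`ConfigureServices` reads the configuration keys `facebook:graphAPIVersion`, `facebook:appId`, `facebook:appSecret`, and the `elasticsearch:*` settings. A minimal sketch of supplying them in a test host follows, using Microsoft.Extensions.Configuration's in-memory provider; every value shown is a placeholder, not the project's real settings.

// A minimal sketch, assuming Microsoft.Extensions.Configuration; all values are placeholders.
var configuration = new ConfigurationBuilder()
    .AddInMemoryCollection(new Dictionary<string, string>
    {
        ["facebook:graphAPIVersion"] = "2.8",
        ["facebook:appId"] = "<your app id>",
        ["facebook:appSecret"] = "<your app secret>",
        ["elasticsearch:url"] = "http://localhost:9200",
        ["elasticsearch:defaultIndex"] = "scraper",
        ["elasticsearch:user"] = "",
        ["elasticsearch:password"] = ""
    })
    .Build();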
public CommentScrapeController(CommentScraper commentScraper, PostScraper postScraper)
{
    CommentScraper = commentScraper;
    PostScraper = postScraper;
}
public ScrapeImporter(PageScraper pageScraper, ElasticSearchRepository<PageMetadata> pageMetadataRepository, PostScraper postScraper)
{
    PageScraper = pageScraper;
    PageMetadataRepository = pageMetadataRepository;
    PostScraper = postScraper;
}
public IEnumerable<ScrapedPost> ImportPosts(IEnumerable<string> postCSVs)
{
    var posts = new List<ScrapedPost>();
    DateTime now = DateTime.Now;
    int numberSaved = 0;

    Read(postCSVs, record =>
    {
        string postId = (string)record["Media Title"];
        ScrapedPost savedPost = PostScraper.Get(postId);
        if (savedPost != null)
        {
            // Skip posts that already exist.
            //Console.WriteLine($"Skipping {postId}.");
            //continue;
        }

        ScrapedPost post = PostScraper.ScrapePost(postId);
        bool useDatabase = post == null;
        if (post == null)
        {
            // Post has been deleted - we still want to save it.
            post = new ScrapedPost { Id = postId };
            Console.WriteLine($"Post {postId} does not exist.");
        }

        string normalizedPageName = null;
        foreach (string field in record.Keys)
        {
            string trimmedField = field.Trim();
            string value = (string)record[field];

            // If the post doesn't exist, we need to import various stuff from the page.
            if (useDatabase)
            {
                if (trimmedField == "#_Post_Likes")
                {
                    // Yuck: whole-number like counts can have decimal points in the data.
                    // Yuck: some rows are empty, or have invalid entries.
                    if (!int.TryParse(value, NumberStyles.AllowDecimalPoint, null, out int numberOfLikes))
                    {
                        Console.WriteLine("Cannot parse number of likes. Skipping...");
                        post.Reactions = new Reactions { Summary = new ReactionsSummary { TotalCount = -1 } };
                        continue;
                    }

                    post.Reactions = new Reactions { Summary = new ReactionsSummary { TotalCount = numberOfLikes } };
                }
                else if (trimmedField == "#_Post_Comments")
                {
                    // Yuck: whole-number comment counts can have decimal points in the data.
                    // Yuck: some rows are empty, or have invalid entries.
                    if (!int.TryParse(value, NumberStyles.AllowDecimalPoint, null, out int numberOfComments))
                    {
                        Console.WriteLine("Cannot parse number of comments. Skipping...");
                        post.Comments = new Comments { Summary = new CommentsSummary { TotalCount = -1 } };
                        continue;
                    }

                    post.Comments = new Comments { Summary = new CommentsSummary { TotalCount = numberOfComments } };
                }
                else if (trimmedField == "#_Post_Shares")
                {
                    // Yuck: whole-number share counts can have decimal points in the data.
                    // Yuck: some rows are empty, or have invalid entries.
                    if (!int.TryParse(value, NumberStyles.AllowDecimalPoint, null, out int numberOfShares))
                    {
                        Console.WriteLine("Cannot parse number of shares. Skipping...");
                        post.Shares = new Shares { Count = -1 };
                        continue;
                    }

                    post.Shares = new Shares { Count = numberOfShares };
                }
                else if (trimmedField == "Post_Date" || trimmedField == "Excerpt Date")
                {
                    DateTime date = DateTime.ParseExact(value, "M/d/yyyy", null);
                    post.CreatedTime = date;
                }
                else if (trimmedField == "Excerpt Copy")
                {
                    post.Message = value;
                }
            }

            // Turn the comma separated list of topics into an array.
            if (trimmedField == "Codes Applied Combined")
            {
                IEnumerable<string> topics = value.Split(',').Select(c => c.Trim());
                post.Topics = topics;
            }

            // Get the page from the post.
            if (trimmedField == "Page Name")
            {
                normalizedPageName = Mappings[value.Trim()].Name;
            }
        }

        // Get the nearest data we have for page likes at the time the post was created.
        Debug.Assert(normalizedPageName != null);
        PostScraper.UpdateMetadata(post, normalizedPageName);

        // Print the progress to make sure we know something is happening.
        numberSaved++;
        Console.WriteLine(numberSaved);

        // Save the post.
        posts.Add(PostScraper.Save(post, Refresh.False));
    });

    return posts;
}
public PagedResponse<ScrapedPost> AllPosts([FromBody] ElasticSearchRequest request)
{
    return PostScraper.Paged(request.PageNumber, request.PageSize, request.Query, request.Sort);
}
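`ElasticSearchRequest` is likewise only visible through its usage in `ExportAsCSV`, `ExportAsJson`, and `AllPosts`; a hypothetical sketch of its shape, with all types being assumptions, might be:

// Hypothetical shape of ElasticSearchRequest, inferred from usage; types are assumptions
// (Query could equally be a raw query string rather than a NEST QueryContainer).
public class ElasticSearchRequest
{
    public QueryContainer Query { get; set; }
    public IEnumerable<SortField> Sort { get; set; }
    public int PageNumber { get; set; }
    public int PageSize { get; set; }
}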
public ScrapedPost GetPost(string id) => PostScraper.Get(id);