/// <summary>
/// Fetches the top-sorted posts of a subreddit and maps each one to a <see cref="NewsThread"/>.
/// </summary>
/// <param name="board">
/// Name of the subreddit to read. It is also used to build the source tag <c>r.{board}</c>
/// stored on every returned thread.
/// </param>
/// <returns>One <see cref="NewsThread"/> per post yielded by the listing, in listing order.</returns>
/// <remarks>
/// Credentials are read through <c>Config</c> from the <c>Reddit:User</c>, <c>Reddit:Pass</c>,
/// <c>Reddit:ClientId</c> and <c>Reddit:ClientSecret</c> keys.
/// </remarks>
public async Task<IEnumerable<NewsThread>> GetThreads(string board)
{
    using (var op = Begin("Get threads for board {0}", board))
    {
        var source = $"r.{board}";
        var webAgent = new BotWebAgent(
            Config("Reddit:User"),
            Config("Reddit:Pass"),
            Config("Reddit:ClientId"),
            Config("Reddit:ClientSecret"),
            "https://github.com/allisterb/Canaan");
        webAgent.UserAgent = "Canaan/0.1";
        var reddit = new RedditSharp.Reddit(webAgent);

        var threads = new List<NewsThread>();
        var r = await reddit.GetSubredditAsync(board);

        // NOTE(review): 400 is passed as the second argument of GetPosts; presumably the
        // maximum number of posts to enumerate - confirm against the RedditSharp version in use.
        await r.GetPosts(Subreddit.Sort.Top, 400).ForEachAsync((post, p) =>
        {
            // Only self posts carry body text/HTML; link posts contribute their target URL instead.
            var text = post.IsSelfPost ? post.SelfText : string.Empty;
            var html = post.IsSelfPost ? post.SelfTextHtml : null;
            NewsThread thread = new NewsThread()
            {
                Id = post.Id + "-" + YY,
                Source = source,
                Position = p + 1, // the callback index is zero-based; positions are stored one-based
                Subject = post.Title,
                DatePublished = post.CreatedUTC,
                User = post.AuthorName,
                Text = text,
                Links = post.IsSelfPost ? WebScraper.ExtractLinksFromHtmlFrag(html) : new Link[] { new Link() { Uri = post.Url } }
            };
            threads.Add(thread);
        });

        // Mark the operation as completed, as the sibling fetch methods do before returning.
        op.Complete();
        return threads;
    }
}
/// <summary>
/// Downloads the thread index (<c>threads.json</c>) of a board and maps every listed thread
/// to a <see cref="NewsThread"/> stub.
/// </summary>
/// <param name="board">Board name used in the request URL and in the <c>4ch.{board}</c> source tag.</param>
/// <returns>
/// One <see cref="NewsThread"/> per entry of every page in the index. Only the number, id,
/// position, last-modified time and reply count are populated here.
/// </returns>
/// <exception cref="System.Net.Http.HttpRequestException">The index request did not return a success status code.</exception>
public async Task<IEnumerable<NewsThread>> GetThreads(string board)
{
    using (var op = Begin("Get threads for board {0}", board))
    {
        string json;
        // Dispose the response as soon as its body has been read.
        using (var r = await HttpClient.GetAsync($"http://a.4cdn.org/{board}/threads.json", CancellationToken))
        {
            r.EnsureSuccessStatusCode();
            json = await r.Content.ReadAsStringAsync();
        }

        var pages = JArray.Parse(json);
        var threads = new List<NewsThread>();
        foreach (dynamic page in pages)
        {
            // Position restarts at 1 on every page, i.e. it is the rank within the page.
            int p = 1;
            foreach (dynamic thread in page.threads)
            {
                var t = new NewsThread()
                {
                    // Tag with the requested board instead of a fixed "4ch.pol", so the
                    // source matches what the other 4chan fetch methods write.
                    Source = $"4ch.{board}",
                    No = thread.no,
                    Id = ((string)thread.no) + "-" + YY,
                    Position = p++,
                    LastModified = DateTimeOffset.FromUnixTimeSeconds((long)thread.last_modified).UtcDateTime,
                    ReplyCount = thread.replies
                };
                threads.Add(t);
            }
        }

        op.Complete();
        return threads;
    }
}
/// <summary>
/// Downloads a single thread, builds its <see cref="NewsThread"/> from the first post, and
/// returns it together with all posts of the thread after reply-linking, text clean-up and
/// NLU prediction.
/// </summary>
/// <param name="board">Board name used in the request URL and in the <c>4ch.{board}</c> source tag.</param>
/// <param name="threadno">Thread number used in the request URL.</param>
/// <returns>The thread and its processed posts.</returns>
/// <exception cref="System.Net.Http.HttpRequestException">The thread request did not return a success status code.</exception>
public async Task<ValueTuple<NewsThread, List<Post>>> GetThread(string board, string threadno)
{
    // Arguments follow the template order: {0} is the thread number, {1} is the board.
    using (var op = Begin("Get thread no {0} for board {1}", threadno, board))
    {
        string json;
        // Dispose the response as soon as its body has been read.
        using (var r = await HttpClient.GetAsync($"http://a.4cdn.org/{board}/thread/{threadno}.json", CancellationToken))
        {
            r.EnsureSuccessStatusCode();
            json = await r.Content.ReadAsStringAsync();
        }

        dynamic o = JObject.Parse(json);
        JArray threadPosts = o.posts;
        // The first entry of "posts" supplies the thread-level fields.
        dynamic subjectPost = threadPosts[0];
        NewsThread thread = new NewsThread()
        {
            Source = $"4ch.{board}",
            No = subjectPost.no,
            Id = subjectPost.no.ToString() + "-" + YY,
            DatePublished = DateTimeOffset.FromUnixTimeSeconds((long)subjectPost.time).UtcDateTime,
            Subject = subjectPost.sub,
            Text = WebScraper.ExtractTextFromHtmlFrag((string)subjectPost.com),
            User = subjectPost.name,
            ReplyCount = subjectPost.replies
        };

        var posts = ParsePostsFromThreadJson(board, json);
        foreach (var p in posts)
        {
            p.ThreadId = thread.Id;
        }

        foreach (var p in posts)
        {
            // Record outgoing reply references and, when the target post is part of this
            // thread, the matching back-reference on the target.
            var rnos = ExtractPostRepliesFromText(p);
            foreach (var no in rnos)
            {
                p.ReplyTo.Add(no.ToString() + "-" + YY);
                var replyto = posts.SingleOrDefault(v => v.No == no);
                if (replyto != null)
                {
                    replyto.Replies.Add(p.Id);
                }
            }

            // Strip everything matched by repliesRegex, every '>' character, and URLs
            // before the text is analysed.
            p.Text = repliesRegex.Replace(p.Text, string.Empty).Replace(">", string.Empty);
            p.Text = WebScraper.RemoveUrlsFromText(p.Text);
            p.HasIdentityHate = HateWords.IdentityHateWords.Any(w => p.Text.Contains(w));

            await NLUService.GetPredictionForPost(p);
            if (p.Entities.Count > 0)
            {
                Info("Detected {0} entities in post {1}.", p.Entities.Count, p.Id);
            }
            if (p.ThreatIntent > 0.0)
            {
                Info("Detected threat intent {0:0.00} in post {1}.", p.ThreatIntent, p.Id);
            }
        }

        op.Complete();
        return (thread, posts);
    }
}
/// <summary>
/// Downloads the JSON of every given thread, fills in the thread-level fields from the first
/// post, builds the <see cref="Post"/> list of each thread, links replies, cleans the post
/// text and runs NLU prediction on every post.
/// </summary>
/// <param name="board">Board name used in the request URLs and in the <c>4ch.{board}</c> source tag.</param>
/// <param name="threads">
/// Threads to fetch. Each fetched thread object is updated in place (subject, text, user,
/// reply count) and used as a key of the result.
/// </param>
/// <param name="delay">Milliseconds to wait after each request has been awaited.</param>
/// <returns>
/// A map from thread to its posts. Threads whose request failed, was cancelled, or did not
/// return HTTP 200 are absent from the map.
/// </returns>
/// <remarks>
/// All requests are started up front when the task array is built; <paramref name="delay"/>
/// only spaces out the awaiting of their results, not the sending of the requests.
/// </remarks>
public async Task<IDictionary<NewsThread, List<Post>>> GetPosts(string board, IEnumerable<NewsThread> threads, int delay = 500)
{
    // Materialize once: the sequence is counted, projected, and searched from parallel
    // workers below, so a lazy source must not be re-enumerated.
    var threadList = threads.ToList();

    using (var op = Begin("Get posts for {0} threads for board {1}", threadList.Count, board))
    {
        Task<HttpResponseMessage>[] threadTasks = threadList
            .Select(t => HttpClient.GetAsync($"http://a.4cdn.org/{board}/thread/{t.No}.json", CancellationToken))
            .ToArray();

        // Await every request individually so that one failure neither hides the remaining
        // requests nor aborts the whole batch.
        foreach (var task in threadTasks)
        {
            try
            {
                await task;
            }
            catch (OperationCanceledException)
            {
                Error("Could not fetch thread. Thread task did not complete.");
            }
            catch (Exception e)
            {
                Error(e, "Could not fetch thread.");
            }
            await Task.Delay(delay);
        }

        // Task.IsCompleted is also true for faulted and cancelled tasks, so select on
        // RanToCompletion to read results only from requests that actually succeeded.
        var threadsJson = new List<string>();
        foreach (var task in threadTasks.Where(t => t.Status == TaskStatus.RanToCompletion))
        {
            using (var response = await task)
            {
                if (response.StatusCode == HttpStatusCode.OK)
                {
                    threadsJson.Add(await response.Content.ReadAsStringAsync());
                }
            }
        }

        var posts = new ConcurrentDictionary<NewsThread, List<Post>>();
        Parallel.ForEach(threadsJson, (j) =>
        {
            dynamic o = JObject.Parse(j);
            JArray threadPosts = o.posts;
            // The first entry of "posts" identifies the thread and supplies its fields.
            dynamic subjectPost = threadPosts[0];
            long threadNo = (long)subjectPost.no;
            NewsThread thread = threadList.Single(t => t.No == threadNo);
            thread.Subject = subjectPost.sub;
            thread.Text = WebScraper.ExtractTextFromHtmlFrag((string)subjectPost.com);
            thread.User = subjectPost.name;
            thread.ReplyCount = subjectPost.replies;

            if (!posts.TryAdd(thread, new List<Post>()))
            {
                Error("Could not add thread {0} to posts collection.", thread.Id);
            }
            List<Post> threadPostList = posts[thread];

            int pos = 1;
            foreach (dynamic post in threadPosts)
            {
                threadPostList.Add(
                    new Post()
                    {
                        Source = $"4ch.{board}",
                        ThreadId = thread.Id,
                        No = post.no,
                        Id = post.no.ToString() + "-" + YY,
                        Position = pos++,
                        User = post.name,
                        DatePublished = DateTimeOffset.FromUnixTimeSeconds((long)post.time).UtcDateTime,
                        Text = WebScraper.ExtractTextFromHtmlFrag((string)post.com),
                        Links = WebScraper.ExtractLinksFromHtmlFrag((string)post.com),
                        Additional = GetAdditionalPropsForPost((JObject)post)
                    });
            }
        });

        foreach (var tp in posts)
        {
            foreach (var p in tp.Value)
            {
                // Record outgoing reply references and, when the target post belongs to the
                // same thread, the matching back-reference on the target.
                var rnos = ExtractPostRepliesFromText(p);
                foreach (var no in rnos)
                {
                    p.ReplyTo.Add(no.ToString() + "-" + YY);
                    var replyto = tp.Value.SingleOrDefault(v => v.No == no);
                    if (replyto != null)
                    {
                        replyto.Replies.Add(p.Id);
                    }
                }

                // Strip everything matched by repliesRegex, every '>' character, and URLs.
                p.Text = repliesRegex.Replace(p.Text, string.Empty).Replace(">", string.Empty);
                p.Text = WebScraper.RemoveUrlsFromText(p.Text);
            }
        }

        foreach (var p in posts.Values.SelectMany(x => x))
        {
            p.HasIdentityHate = HateWords.IdentityHateWords.Any(w => p.Text.Contains(w));
            await NLUService.GetPredictionForPost(p);
            if (p.Entities.Count > 0)
            {
                Info("Detected {0} entities in post {1}.", p.Entities.Count, p.Id);
            }
            if (p.ThreatIntent > 0.0)
            {
                Info("Detected threat intent {0:0.00} in post {1}.", p.ThreatIntent, p.Id);
            }
        }

        op.Complete();
        return posts;
    }
}