/// <summary>
/// Flattens a comment and every nested reply beneath it into a single list of posts,
/// with the comment itself first followed by its descendants in depth-first order.
/// </summary>
/// <param name="board">Board name; used to build the post source tag <c>r.{board}</c>.</param>
/// <param name="pos">Position recorded on the post created for <paramref name="c"/>.</param>
/// <param name="tid">Thread id recorded on every produced post.</param>
/// <param name="c">The comment to convert.</param>
/// <param name="parent">Not read by this method.</param>
/// <returns>The post for <paramref name="c"/> followed by the posts of all nested comments.</returns>
protected IEnumerable<Post> GetPostsFromComment(string board, int pos, string tid, Comment c, Comment parent = null)
{
    var origin = $"r.{board}";
    var flattened = new List<Post>();

    flattened.Add(new Post
    {
        Id = c.Id + "-" + YY,
        ThreadId = tid,
        Source = origin,
        Position = pos,
        DatePublished = c.CreatedUTC,
        User = c.AuthorName,
        Text = c.Body,
        ReplyTo = new List<string>(1) { c.ParentId + "-" + YY },
        Links = WebScraper.ExtractLinksFromHtmlFrag(c.BodyHtml),
    });

    // Each direct reply is expanded recursively; its position is its zero-based
    // index among the replies of this comment.
    int childIndex = 0;
    foreach (var child in c.Comments)
    {
        flattened.AddRange(GetPostsFromComment(board, childIndex, tid, child));
        childIndex++;
    }

    return flattened;
}
/// <summary>
/// Fetches the "top" listing of a subreddit and maps every listed post to a <see cref="NewsThread"/>.
/// </summary>
/// <param name="board">Subreddit name; also used to build the thread source tag <c>r.{board}</c>.</param>
/// <returns>The threads in the order the listing yielded them, with 1-based positions.</returns>
public async Task<IEnumerable<NewsThread>> GetThreads(string board)
{
    using (var op = Begin("Get threads for board {0}", board))
    {
        var source = $"r.{board}";
        var webAgent = new BotWebAgent(
            Config("Reddit:User"),
            Config("Reddit:Pass"),
            Config("Reddit:ClientId"),
            Config("Reddit:ClientSecret"),
            "https://github.com/allisterb/Canaan");
        webAgent.UserAgent = "Canaan/0.1";
        var reddit = new RedditSharp.Reddit(webAgent);
        var threads = new List<NewsThread>();
        var r = await reddit.GetSubredditAsync(board);

        // NOTE(review): 400 is passed as the second argument of GetPosts; presumably a cap
        // on the number of posts fetched -- confirm against the RedditSharp version in use.
        await r.GetPosts(Subreddit.Sort.Top, 400).ForEachAsync((post, p) =>
        {
            // Only self posts carry body text; link posts contribute their target URL instead.
            var text = post.IsSelfPost ? post.SelfText : string.Empty;
            var html = post.IsSelfPost ? post.SelfTextHtml : null;
            NewsThread thread = new NewsThread()
            {
                Id = post.Id + "-" + YY,
                Source = source,
                Position = p + 1, // the callback index is zero-based
                Subject = post.Title,
                DatePublished = post.CreatedUTC,
                User = post.AuthorName,
                Text = text,
                Links = post.IsSelfPost ? WebScraper.ExtractLinksFromHtmlFrag(html) : new Link[] { new Link() { Uri = post.Url } }
            };
            threads.Add(thread);
        });

        // Mark the logged operation as finished, as the other fetch methods do;
        // without this the operation is disposed without ever being completed.
        op.Complete();
        return threads;
    }
}
/// <summary>
/// Parses the JSON document of a single thread into a list of posts.
/// </summary>
/// <remarks>
/// Reads the top-level <c>posts</c> array and, for each element, the <c>no</c>, <c>name</c>,
/// <c>time</c> (Unix seconds) and <c>com</c> (HTML fragment) fields. The thread id of the
/// produced posts is not set here. An empty <c>posts</c> array yields an empty list.
/// </remarks>
/// <param name="board">Board name; used to build the post source tag <c>4ch.{board}</c>.</param>
/// <param name="json">JSON text of the thread.</param>
/// <returns>One post per element of <c>posts</c>, with 1-based positions in document order.</returns>
public List<Post> ParsePostsFromThreadJson(string board, string json)
{
    dynamic o = JObject.Parse(json);
    JArray threadPosts = o.posts;
    List<Post> posts = new List<Post>();
    int pos = 1;
    foreach (dynamic post in threadPosts)
    {
        // The HTML comment body is used twice: once for text, once for links.
        string html = (string)post.com;
        posts.Add(new Post()
        {
            Source = $"4ch.{board}",
            No = post.no,
            Id = post.no.ToString() + "-" + YY,
            Position = pos++,
            User = post.name,
            DatePublished = DateTimeOffset.FromUnixTimeSeconds((long)post.time).UtcDateTime,
            Text = WebScraper.ExtractTextFromHtmlFrag(html),
            Links = WebScraper.ExtractLinksFromHtmlFrag(html),
            // Cast so the helper is bound statically, matching how GetPosts calls it.
            Additional = GetAdditionalPropsForPost((JObject)post)
        });
    }
    return posts;
}
/// <summary>
/// Downloads the JSON of each given thread, converts it into posts, links replies to the
/// posts they reference, and runs hate-word and NLU detection over every post.
/// </summary>
/// <remarks>
/// Threads whose request fails, is cancelled, or returns a status other than 200 OK are left
/// out of the result; failures and cancellations are logged. The matching
/// <see cref="NewsThread"/> instances are updated in place with subject, text, user and
/// reply count taken from the first post of the thread.
/// </remarks>
/// <param name="board">Board name used in the request URL and in the post source tag.</param>
/// <param name="threads">Threads to fetch; each thread's <c>No</c> identifies it on the board.</param>
/// <param name="delay">
/// Milliseconds to pause after each successfully awaited response. All requests are started
/// up front, so this paces the awaiting loop rather than the requests themselves.
/// </param>
/// <returns>A dictionary mapping each successfully fetched thread to its posts.</returns>
public async Task<IDictionary<NewsThread, List<Post>>> GetPosts(string board, IEnumerable<NewsThread> threads, int delay = 500)
{
    // Materialize once: the sequence is enumerated repeatedly below, including from parallel workers.
    List<NewsThread> threadList = threads.ToList();
    using (var op = Begin("Get posts for {0} threads for board {1}", threadList.Count, board))
    {
        Task<HttpResponseMessage>[] threadTasks = threadList
            .Select(t => HttpClient.GetAsync($"http://a.4cdn.org/{board}/thread/{t.No}.json", CancellationToken))
            .ToArray();

        // Await every request individually so one failure neither hides the others
        // nor leaves later tasks unobserved.
        foreach (var task in threadTasks)
        {
            try
            {
                await task;
            }
            catch (OperationCanceledException)
            {
                Error("Could not fetch thread. Thread task did not complete.");
                continue;
            }
            catch (Exception e)
            {
                Error(e, "Could not fetch thread.");
                continue;
            }
            await Task.Delay(delay);
        }

        // Task.IsCompleted is also true for faulted and cancelled tasks, whose Result would
        // rethrow; only tasks that ran to completion carry a response.
        List<HttpResponseMessage> responses = threadTasks
            .Where(t => t.Status == TaskStatus.RanToCompletion)
            .Select(t => t.Result)
            .ToList();

        string[] threadsJson;
        try
        {
            // Read each body exactly once and await the reads instead of blocking on Result.
            threadsJson = await Task.WhenAll(
                responses
                    .Where(r => r.StatusCode == HttpStatusCode.OK)
                    .Select(r => r.Content.ReadAsStringAsync()));
        }
        finally
        {
            foreach (var response in responses)
            {
                response.Dispose();
            }
        }

        var posts = new ConcurrentDictionary<NewsThread, List<Post>>();
        Parallel.ForEach(threadsJson, (j) =>
        {
            dynamic o = JObject.Parse(j);
            JArray threadPosts = o.posts;

            // The first post of the document identifies the thread and carries its metadata.
            dynamic subjectPost = threadPosts[0];
            long subjectNo = (long)subjectPost.no;
            NewsThread thread = threadList.Single(t => t.No == subjectNo);
            thread.Subject = subjectPost.sub;
            thread.Text = WebScraper.ExtractTextFromHtmlFrag((string)subjectPost.com);
            thread.User = subjectPost.name;
            thread.ReplyCount = subjectPost.replies;

            var threadPostList = new List<Post>();
            if (!posts.TryAdd(thread, threadPostList))
            {
                Error("Could not add thread {0} to posts collection.", thread.Id);
                // Another worker owns this key's list; do not mutate it from here.
                return;
            }

            int pos = 1;
            foreach (dynamic post in threadPosts)
            {
                string html = (string)post.com;
                threadPostList.Add(new Post()
                {
                    Source = $"4ch.{board}",
                    ThreadId = thread.Id,
                    No = post.no,
                    Id = post.no.ToString() + "-" + YY,
                    Position = pos++,
                    User = post.name,
                    DatePublished = DateTimeOffset.FromUnixTimeSeconds((long)post.time).UtcDateTime,
                    Text = WebScraper.ExtractTextFromHtmlFrag(html),
                    Links = WebScraper.ExtractLinksFromHtmlFrag(html),
                    Additional = GetAdditionalPropsForPost((JObject)post)
                });
            }
        });

        // Link replies within each thread, then strip reply markers and URLs from the text.
        foreach (var tp in posts)
        {
            foreach (var p in tp.Value)
            {
                var rnos = ExtractPostRepliesFromText(p);
                foreach (var no in rnos)
                {
                    p.ReplyTo.Add(no.ToString() + "-" + YY);
                    // The referenced post may belong to another thread; only same-thread targets get a back-link.
                    var replyto = tp.Value.SingleOrDefault(v => v.No == no);
                    if (replyto != null)
                    {
                        replyto.Replies.Add(p.Id);
                    }
                }
                p.Text = repliesRegex.Replace(p.Text, string.Empty).Replace(">", string.Empty);
                p.Text = WebScraper.RemoveUrlsFromText(p.Text);
            }
        }

        // Detection runs one post at a time, awaiting the NLU service for each.
        foreach (var p in posts.Values.SelectMany(x => x))
        {
            p.HasIdentityHate = HateWords.IdentityHateWords.Any(w => p.Text.Contains(w));
            await NLUService.GetPredictionForPost(p);
            if (p.Entities.Count > 0)
            {
                Info("Detected {0} entities in post {1}.", p.Entities.Count, p.Id);
            }
            if (p.ThreatIntent > 0.0)
            {
                Info("Detected threat intent {0:0.00} in post {1}.", p.ThreatIntent, p.Id);
            }
        }

        op.Complete();
        return posts;
    }
}
/// <summary>
/// Listens to the Gab public streaming endpoint, collects the "update" events received,
/// converts them into posts and runs hate-word and intent detection over each post.
/// </summary>
/// <remarks>
/// Errors while listening are logged and do not discard events already collected.
/// An update that cannot be parsed is logged and skipped.
/// </remarks>
/// <param name="listenTimeout">
/// Listening window in seconds. It is applied as the HTTP client timeout and also checked
/// between lines, so the read loop stops once the window has elapsed.
/// </param>
/// <returns>The posts built from the received updates; empty when none were received.</returns>
public async Task<IEnumerable<Post>> GetUpdates(int listenTimeout)
{
    const string dataPrefix = "data: ";
    TimeSpan listenWindow = TimeSpan.FromSeconds(listenTimeout);

    // NOTE(review): HttpClient.Timeout is documented to throw InvalidOperationException when
    // changed after the client has started a request -- confirm this client is not shared
    // with earlier calls that used a different timeout.
    if (HttpClient.Timeout != listenWindow)
    {
        HttpClient.Timeout = listenWindow;
    }

    List<string> updates = new List<string>();
    List<Post> posts = new List<Post>();
    var requestUri = "https://gab.com/api/v1/streaming/public";
    try
    {
        using (var op = Begin("Listen to Gab live stream for {0} seconds", listenTimeout))
        {
            Stopwatch sw = Stopwatch.StartNew();
            var stream = await HttpClient.GetStreamAsync(requestUri);
            using (var reader = new StreamReader(stream))
            {
                StringBuilder eventBuilder = new StringBuilder();
                bool reading = false;
                string l;

                // Read asynchronously so the calling thread is not blocked on the network,
                // and stop once the listening window is over.
                while (sw.Elapsed < listenWindow && (l = await reader.ReadLineAsync()) != null)
                {
                    if (l.StartsWith("event: update", StringComparison.Ordinal))
                    {
                        reading = true;
                    }
                    else if (reading && l.Length == 0)
                    {
                        // A blank line terminates the current event.
                        updates.Add(eventBuilder.ToString());
                        eventBuilder.Clear();
                        reading = false;
                    }
                    else if (reading)
                    {
                        // Strip only the leading field marker; the same text inside the payload is content.
                        eventBuilder.Append(l.StartsWith(dataPrefix, StringComparison.Ordinal) ? l.Substring(dataPrefix.Length) : l);
                    }
                }
            }
            op.Complete();
        }
    }
    catch (Exception e)
    {
        Error(e, "Error occurred listening to Gab live stream.");
    }

    if (updates.Count == 0)
    {
        Info("0 updates received.");
        return posts;
    }
    Info("{0} updates received from Gab live stream.", updates.Count);

    foreach (var s in updates)
    {
        try
        {
            var props = JObject.Parse(s).Properties();
            var account = (JObject)props.First(p => p.Name == "account").Value;
            var id = props.First(p => p.Name == "id").Value;
            var post = new Post()
            {
                No = (long)id,
                Id = (string)id + "-" + YY,
                DatePublished = (DateTime)props.First(p => p.Name == "created_at").Value,
                // Holds the raw HTML for now; converted to plain text in the next pass.
                Text = (string)props.First(p => p.Name == "content").Value,
                User = (string)account.Properties().First(p => p.Name == "username").Value,
                Source = "gab"
            };
            posts.Add(post);
        }
        catch (Exception e)
        {
            // One malformed or incomplete update must not discard the rest of the batch.
            Error(e, "Could not parse update from Gab live stream.");
        }
    }

    foreach (var post in posts)
    {
        var html = post.Text;
        post.Text = WebScraper.ExtractTextFromHtmlFrag(html);
        post.Links = WebScraper.ExtractLinksFromHtmlFrag(html);
        post.Text = WebScraper.RemoveUrlsFromText(post.Text);
        post.HasIdentityHate = HateWords.IdentityHateWords.Any(w => post.Text.Contains(w));
        await intentService.GetPredictionForPost(post);
        if (post.Entities.Count > 0)
        {
            Info("Detected {0} entities in post {1}.", post.Entities.Count, post.Id);
        }
        if (post.ThreatIntent > 0.0)
        {
            Info("Detected threat intent {0:0.00} in post {1}.", post.ThreatIntent, post.Id);
        }
    }
    return posts;
}