Example #1
0
        /// <summary>
        /// Flattens a comment and all of its nested child comments into a list of posts.
        /// The post built from <paramref name="c"/> comes first, followed by the posts
        /// produced recursively for each child comment, in child order.
        /// </summary>
        /// <param name="board">Board name; used to build the post source as "r.{board}".</param>
        /// <param name="pos">Value stored in the <c>Position</c> of the post built from <paramref name="c"/>.</param>
        /// <param name="tid">Thread id stored on every produced post.</param>
        /// <param name="c">The comment to convert.</param>
        /// <param name="parent">Not read by this method.</param>
        /// <returns>The post for <paramref name="c"/> followed by the posts of all its descendants.</returns>
        protected IEnumerable <Post> GetPostsFromComment(string board, int pos, string tid, Comment c, Comment parent = null)
        {
            var flattened = new List <Post>
            {
                new Post
                {
                    Id            = c.Id + "-" + YY,
                    ThreadId      = tid,
                    Source        = $"r.{board}",
                    Position      = pos,
                    DatePublished = c.CreatedUTC,
                    User          = c.AuthorName,
                    Text          = c.Body,
                    ReplyTo       = new List <string>(1)
                    {
                        c.ParentId + "-" + YY
                    },
                    Links = WebScraper.ExtractLinksFromHtmlFrag(c.BodyHtml),
                }
            };

            if (c.Comments.Count > 0)
            {
                // Each child receives its zero-based index among its siblings as position.
                int childIndex = 0;
                foreach (var child in c.Comments)
                {
                    flattened.AddRange(GetPostsFromComment(board, childIndex, tid, child));
                    childIndex++;
                }
            }
            return(flattened);
        }
Example #2
0
        /// <summary>
        /// Retrieves the top posts of a subreddit and converts each one into a <c>NewsThread</c>.
        /// </summary>
        /// <param name="board">Subreddit name; also used to build the thread source as "r.{board}".</param>
        /// <returns>One thread per retrieved post, with 1-based positions in retrieval order.</returns>
        public async Task <IEnumerable <NewsThread> > GetThreads(string board)
        {
            using (var op = Begin("Get threads for board {0}", board))
            {
                var source   = $"r.{board}";
                var webAgent = new BotWebAgent(Config("Reddit:User"), Config("Reddit:Pass"), Config("Reddit:ClientId"), Config("Reddit:ClientSecret"), "https://github.com/allisterb/Canaan");
                webAgent.UserAgent = "Canaan/0.1";
                var reddit    = new RedditSharp.Reddit(webAgent);
                var threads   = new List <NewsThread>();
                var subreddit = await reddit.GetSubredditAsync(board);

                // NOTE(review): 400 is presumably the maximum number of posts to fetch - confirm against RedditSharp.
                await subreddit.GetPosts(Subreddit.Sort.Top, 400).ForEachAsync((post, p) =>
                {
                    // Only self posts carry their own text/HTML; link posts contribute their target URL instead.
                    var text          = post.IsSelfPost ? post.SelfText : string.Empty;
                    var html          = post.IsSelfPost ? post.SelfTextHtml : null;
                    NewsThread thread = new NewsThread()
                    {
                        Id            = post.Id + "-" + YY,
                        Source        = source,
                        Position      = p + 1,
                        Subject       = post.Title,
                        DatePublished = post.CreatedUTC,
                        User          = post.AuthorName,
                        Text          = text,
                        Links         = post.IsSelfPost ? WebScraper.ExtractLinksFromHtmlFrag(html) : new Link[] { new Link()
                                                                                                                   {
                                                                                                                       Uri = post.Url
                                                                                                                   } }
                    };
                    threads.Add(thread);
                });

                // Mark the operation as finished before returning.
                op.Complete();
                return(threads);
            }
        }
Example #3
0
        /// <summary>
        /// Parses the JSON document of a thread and converts every entry of its
        /// <c>posts</c> array into a <c>Post</c>.
        /// </summary>
        /// <param name="board">Board name; used to build the post source as "4ch.{board}".</param>
        /// <param name="json">JSON text of the thread, expected to contain a <c>posts</c> array.</param>
        /// <returns>
        /// The parsed posts with 1-based positions in document order; an empty list when the
        /// document has no <c>posts</c> member or the array is empty.
        /// </returns>
        public List <Post> ParsePostsFromThreadJson(string board, string json)
        {
            dynamic     o           = JObject.Parse(json);
            JArray      threadPosts = o.posts;
            List <Post> posts       = new List <Post>();

            // A document without a posts member yields no posts instead of a null dereference.
            if (threadPosts == null)
            {
                return(posts);
            }

            string source = $"4ch.{board}";
            int    pos    = 1;

            foreach (dynamic post in threadPosts)
            {
                posts.Add(
                    new Post()
                {
                    Source        = source,
                    No            = post.no,
                    Id            = post.no.ToString() + "-" + YY,
                    Position      = pos++,
                    User          = post.name,
                    // The time field is read as Unix seconds and converted to a UTC DateTime.
                    DatePublished = DateTimeOffset.FromUnixTimeSeconds((long)post.time).UtcDateTime,
                    Text          = WebScraper.ExtractTextFromHtmlFrag((string)post.com),
                    Links         = WebScraper.ExtractLinksFromHtmlFrag((string)post.com),
                    Additional    = GetAdditionalPropsForPost(post)
                });
            }
            return(posts);
        }
Example #4
0
        /// <summary>
        /// Downloads a single thread, builds its <c>NewsThread</c> from the first post and
        /// returns it together with all posts of the thread. Each post gets its reply links
        /// resolved, its text cleaned, the identity-hate flag set and an NLU prediction requested.
        /// </summary>
        /// <param name="board">Board name used in the request URL and in the source "4ch.{board}".</param>
        /// <param name="threadno">Thread number used in the request URL.</param>
        /// <returns>The thread and its posts.</returns>
        /// <exception cref="HttpRequestException">The response status code does not indicate success.</exception>
        public async Task <ValueTuple <NewsThread, List <Post> > > GetThread(string board, string threadno)
        {
            // Argument order follows the placeholders: {0} is the thread number, {1} the board.
            using (var op = Begin("Get thread no {0} for board {1}", threadno, board))
            {
                string json;
                // Dispose the response once its body has been read.
                using (var r = await HttpClient.GetAsync($"http://a.4cdn.org/{board}/thread/{threadno}.json", CancellationToken))
                {
                    r.EnsureSuccessStatusCode();
                    json = await r.Content.ReadAsStringAsync();
                }

                dynamic    o           = JObject.Parse(json);
                JArray     threadPosts = o.posts;
                dynamic    subjectPost = threadPosts[0];
                NewsThread thread      = new NewsThread()
                {
                    Source        = $"4ch.{board}",
                    No            = subjectPost.no,
                    Id            = subjectPost.no.ToString() + "-" + YY,
                    DatePublished = DateTimeOffset.FromUnixTimeSeconds((long)subjectPost.time).UtcDateTime,
                    Subject       = subjectPost.sub,
                    Text          = WebScraper.ExtractTextFromHtmlFrag((string)subjectPost.com),
                    User          = subjectPost.name,
                    ReplyCount    = subjectPost.replies
                };
                var posts = ParsePostsFromThreadJson(board, json);
                foreach (var p in posts)
                {
                    p.ThreadId = thread.Id;
                }
                foreach (var p in posts)
                {
                    // Record the ids this post replies to and, when the target post is part of
                    // this thread, register this post as one of the target's replies.
                    var rnos = ExtractPostRepliesFromText(p);
                    foreach (var no in rnos)
                    {
                        p.ReplyTo.Add(no.ToString() + "-" + YY);
                        var replyto = posts.SingleOrDefault(v => v.No == no);
                        if (replyto != null)
                        {
                            replyto.Replies.Add(p.Id);
                        }
                    }

                    // Strip reply markers, '>' characters and URLs before the text is analysed.
                    p.Text            = repliesRegex.Replace(p.Text, string.Empty).Replace(">", string.Empty);
                    p.Text            = WebScraper.RemoveUrlsFromText(p.Text);
                    p.HasIdentityHate = HateWords.IdentityHateWords.Any(w => p.Text.Contains(w));
                    await NLUService.GetPredictionForPost(p);

                    if (p.Entities.Count > 0)
                    {
                        Info("Detected {0} entities in post {1}.", p.Entities.Count, p.Id);
                    }
                    if (p.ThreatIntent > 0.0)
                    {
                        Info("Detected threat intent {0:0.00} in post {1}.", p.ThreatIntent, p.Id);
                    }
                }
                op.Complete();
                return(thread, posts);
            }
        }
Example #5
0
        /// <summary>
        /// Downloads the JSON of every given thread, updates each thread from its first post
        /// and returns the posts of each thread. Posts get their reply links resolved, their
        /// text cleaned, the identity-hate flag set and an NLU prediction requested.
        /// </summary>
        /// <param name="board">Board name used in the request URLs and in the source "4ch.{board}".</param>
        /// <param name="threads">Threads to fetch; each thread's <c>No</c> selects the document to download.</param>
        /// <param name="delay">Pause in milliseconds after each thread request.</param>
        /// <returns>
        /// A dictionary mapping each successfully fetched thread to its posts. Threads whose
        /// request failed, was cancelled or did not return status 200 are absent.
        /// </returns>
        public async Task <IDictionary <NewsThread, List <Post> > > GetPosts(string board, IEnumerable <NewsThread> threads, int delay = 500)
        {
            // Materialize once so the input sequence is enumerated a single time.
            List <NewsThread> threadList = threads.ToList();

            using (var op = Begin("Get posts for {0} threads for board {1}", threadList.Count, board))
            {
                // Requests are issued one at a time so that the delay actually spaces them out,
                // and each failure is handled on its own so one bad thread cannot hide the
                // outcome of the others.
                var fetched = new List <KeyValuePair <NewsThread, string> >(threadList.Count);
                foreach (var t in threadList)
                {
                    try
                    {
                        using (var r = await HttpClient.GetAsync($"http://a.4cdn.org/{board}/thread/{t.No}.json", CancellationToken))
                        {
                            // Responses with any other status are skipped.
                            if (r.StatusCode == HttpStatusCode.OK)
                            {
                                string body = await r.Content.ReadAsStringAsync();
                                fetched.Add(new KeyValuePair <NewsThread, string>(t, body));
                            }
                        }
                    }
                    catch (OperationCanceledException)
                    {
                        Error("Could not fetch thread. Thread task did not complete.");
                    }
                    catch (Exception e)
                    {
                        Error(e, "Could not fetch thread.");
                    }
                    await Task.Delay(delay);
                }

                var posts = new ConcurrentDictionary <NewsThread, List <Post> >();
                Parallel.ForEach(fetched, (f) =>
                {
                    NewsThread thread   = f.Key;
                    dynamic o           = JObject.Parse(f.Value);
                    JArray threadPosts  = o.posts;
                    dynamic subjectPost = threadPosts[0];
                    thread.Subject      = subjectPost.sub;
                    thread.Text         = WebScraper.ExtractTextFromHtmlFrag((string)subjectPost.com);
                    thread.User         = subjectPost.name;
                    thread.ReplyCount   = subjectPost.replies;

                    // Build the list locally and publish it in one step, so no list is ever
                    // appended to from more than one parallel iteration.
                    var threadPostList = new List <Post>();
                    int pos            = 1;
                    foreach (dynamic post in threadPosts)
                    {
                        threadPostList.Add(
                            new Post()
                        {
                            Source        = $"4ch.{board}",
                            ThreadId      = thread.Id,
                            No            = post.no,
                            Id            = post.no.ToString() + "-" + YY,
                            Position      = pos++,
                            User          = post.name,
                            DatePublished = DateTimeOffset.FromUnixTimeSeconds((long)post.time).UtcDateTime,
                            Text          = WebScraper.ExtractTextFromHtmlFrag((string)post.com),
                            Links         = WebScraper.ExtractLinksFromHtmlFrag((string)post.com),
                            Additional    = GetAdditionalPropsForPost((JObject)post)
                        });
                    }
                    if (!posts.TryAdd(thread, threadPostList))
                    {
                        Error("Could not add thread {0} to posts collection.", thread.Id);
                    }
                });

                foreach (var tp in posts)
                {
                    foreach (var p in tp.Value)
                    {
                        // Record the ids this post replies to and, when the target post is in the
                        // same thread, register this post as one of the target's replies.
                        var rnos = ExtractPostRepliesFromText(p);
                        foreach (var no in rnos)
                        {
                            p.ReplyTo.Add(no.ToString() + "-" + YY);
                            var replyto = tp.Value.SingleOrDefault(v => v.No == no);
                            if (replyto != null)
                            {
                                replyto.Replies.Add(p.Id);
                            }
                        }
                        // Strip reply markers, '>' characters and URLs before the text is analysed.
                        p.Text = repliesRegex.Replace(p.Text, string.Empty).Replace(">", string.Empty);
                        p.Text = WebScraper.RemoveUrlsFromText(p.Text);
                    }
                }

                foreach (var p in posts.Values.SelectMany(x => x))
                {
                    p.HasIdentityHate = HateWords.IdentityHateWords.Any(w => p.Text.Contains(w));

                    await NLUService.GetPredictionForPost(p);

                    if (p.Entities.Count > 0)
                    {
                        Info("Detected {0} entities in post {1}.", p.Entities.Count, p.Id);
                    }
                    if (p.ThreatIntent > 0.0)
                    {
                        Info("Detected threat intent {0:0.00} in post {1}.", p.ThreatIntent, p.Id);
                    }
                }

                op.Complete();
                return(posts);
            }
        }
Example #6
0
        /// <summary>
        /// Listens to the Gab public streaming endpoint, collects the "update" events that
        /// arrive, converts each one into a <c>Post</c> and requests an intent prediction for it.
        /// </summary>
        /// <param name="listenTimeout">Number of seconds to listen; also applied as the HttpClient timeout.</param>
        /// <returns>
        /// The posts built from the received updates; empty when nothing was received.
        /// Errors while listening are logged and the updates gathered so far are still processed.
        /// </returns>
        public async Task <IEnumerable <Post> > GetUpdates(int listenTimeout)
        {
            TimeSpan listenWindow = TimeSpan.FromSeconds(listenTimeout);

            // NOTE(review): HttpClient.Timeout cannot be changed once the client has sent a
            // request; confirm this client is not reused with a different timeout.
            if (HttpClient.Timeout != listenWindow)
            {
                HttpClient.Timeout = listenWindow;
            }
            const string  dataPrefix = "data: ";
            List <string> updates    = new List <string>();
            List <Post>   posts      = new List <Post>();
            var           requestUri = "https://gab.com/api/v1/streaming/public";

            try
            {
                using (var op = Begin("Listen to Gab live stream for {0} seconds", listenTimeout))
                {
                    Stopwatch sw     = Stopwatch.StartNew();
                    var       stream = await HttpClient.GetStreamAsync(requestUri);

                    using (var reader = new StreamReader(stream))
                    {
                        StringBuilder eventBuilder = new StringBuilder();

                        bool   reading = false;
                        string l;
                        // Stop once the listen window has elapsed or the stream ends. The elapsed
                        // time is only checked between lines, so a pending read is not interrupted.
                        while (sw.Elapsed < listenWindow && (l = await reader.ReadLineAsync()) != null)
                        {
                            if (l.StartsWith("event: update", StringComparison.Ordinal))
                            {
                                reading = true;
                                continue;
                            }
                            if (!reading)
                            {
                                // Lines outside an update event are ignored.
                                continue;
                            }
                            if (l.Length == 0)
                            {
                                // A blank line terminates the current event.
                                updates.Add(eventBuilder.ToString());
                                eventBuilder.Clear();
                                reading = false;
                                continue;
                            }
                            // Strip only the leading field name so that "data: " occurring inside
                            // the payload itself is left intact.
                            eventBuilder.Append(l.StartsWith(dataPrefix, StringComparison.Ordinal) ? l.Substring(dataPrefix.Length) : l);
                        }
                    }
                    op.Complete();
                }
            }
            catch (Exception e)
            {
                Error(e, "Error occurred listening to Gab live stream.");
            }

            if (updates.Count == 0)
            {
                Info("0 updates received.");
                return(posts);
            }

            Info("{0} updates received from Gab live stream.", updates.Count);

            foreach (var s in updates)
            {
                try
                {
                    var props   = JObject.Parse(s).Properties().ToList();
                    var account = (JObject)props.First(p => p.Name == "account").Value;
                    var id      = props.First(p => p.Name == "id").Value;

                    var post = new Post()
                    {
                        No            = (long)id,
                        Id            = (string)id + "-" + YY,
                        DatePublished = (DateTime)props.First(p => p.Name == "created_at").Value,
                        Text          = (string)props.First(p => p.Name == "content").Value,
                        User          = (string)account.Properties().First(p => p.Name == "username").Value,
                        Source        = "gab"
                    };
                    posts.Add(post);
                }
                catch (Exception e)
                {
                    // One malformed update must not discard the rest of the batch.
                    Error(e, "Could not parse update received from Gab live stream.");
                }
            }
            foreach (var post in posts)
            {
                // The content arrives as HTML: extract text and links from it, then drop URLs from the text.
                var html = post.Text;
                post.Text            = WebScraper.ExtractTextFromHtmlFrag(html);
                post.Links           = WebScraper.ExtractLinksFromHtmlFrag(html);
                post.Text            = WebScraper.RemoveUrlsFromText(post.Text);
                post.HasIdentityHate = HateWords.IdentityHateWords.Any(w => post.Text.Contains(w));
                await intentService.GetPredictionForPost(post);

                if (post.Entities.Count > 0)
                {
                    Info("Detected {0} entities in post {1}.", post.Entities.Count, post.Id);
                }
                if (post.ThreatIntent > 0.0)
                {
                    Info("Detected threat intent {0:0.00} in post {1}.", post.ThreatIntent, post.Id);
                }
            }
            return(posts);
        }