Пример #1
0
        /// <summary>
        /// Retrieves posts from the subreddit named by <paramref name="board"/> and maps each one to a <see cref="NewsThread"/>.
        /// </summary>
        /// <param name="board">Subreddit name; also used to build the source tag <c>r.{board}</c>.</param>
        /// <returns>The threads built from the received posts, in the order they were received.</returns>
        public async Task <IEnumerable <NewsThread> > GetThreads(string board)
        {
            using (var op = Begin("Get threads for board {0}", board))
            {
                var source   = $"r.{board}";
                var webAgent = new BotWebAgent(Config("Reddit:User"), Config("Reddit:Pass"), Config("Reddit:ClientId"), Config("Reddit:ClientSecret"), "https://github.com/allisterb/Canaan");
                webAgent.UserAgent = "Canaan/0.1";
                var reddit  = new RedditSharp.Reddit(webAgent);
                var threads = new List <NewsThread>();
                var r       = await reddit.GetSubredditAsync(board);

                // Requests the Top listing; 400 is presumably the maximum number of posts - confirm against RedditSharp.
                await r.GetPosts(Subreddit.Sort.Top, 400).ForEachAsync((post, p) =>
                {
                    // Only self posts carry body text/HTML; link posts contribute just their target URL.
                    var text = post.IsSelfPost ? post.SelfText : string.Empty;
                    var html = post.IsSelfPost ? post.SelfTextHtml : null;
                    NewsThread thread = new NewsThread()
                    {
                        Id            = post.Id + "-" + YY,
                        Source        = source,
                        Position      = p + 1, // p is the zero-based index supplied by ForEachAsync
                        Subject       = post.Title,
                        DatePublished = post.CreatedUTC,
                        User          = post.AuthorName,
                        Text          = text,
                        Links         = post.IsSelfPost
                                            ? WebScraper.ExtractLinksFromHtmlFrag(html)
                                            : new Link[] { new Link() { Uri = post.Url } }
                    };
                    threads.Add(thread);
                });

                // Mark the operation as completed, as the other fetch methods do before returning.
                op.Complete();
                return(threads);
            }
        }
Пример #2
0
        /// <summary>
        /// Downloads the thread index (<c>threads.json</c>) of a 4chan board and maps every listed thread to a <see cref="NewsThread"/>.
        /// </summary>
        /// <param name="board">Board name used in the request URL and in the source tag <c>4ch.{board}</c>.</param>
        /// <returns>All threads listed on all pages of the index.</returns>
        /// <remarks>Throws if the HTTP response does not have a success status code.</remarks>
        public async Task <IEnumerable <NewsThread> > GetThreads(string board)
        {
            using (var op = Begin("Get threads for board {0}", board))
            {
                var r = await HttpClient.GetAsync($"http://a.4cdn.org/{board}/threads.json", CancellationToken);

                r.EnsureSuccessStatusCode();
                var json = await r.Content.ReadAsStringAsync();

                var pages   = JArray.Parse(json);
                var threads = new List <NewsThread>();
                foreach (dynamic page in pages)
                {
                    // Position restarts at 1 on every page of the index.
                    int p = 1;
                    foreach (dynamic thread in page.threads)
                    {
                        var t = new NewsThread()
                        {
                            // Use the requested board rather than a fixed "4ch.pol", matching the other 4chan methods.
                            Source       = $"4ch.{board}",
                            No           = thread.no,
                            Id           = ((string)thread.no) + "-" + YY,
                            Position     = p++,
                            LastModified = DateTimeOffset.FromUnixTimeSeconds((long)thread.last_modified).UtcDateTime,
                            ReplyCount   = thread.replies
                        };
                        threads.Add(t);
                    }
                }
                op.Complete();
                return(threads);
            }
        }
Пример #3
0
        /// <summary>
        /// Downloads a single 4chan thread, builds its <see cref="NewsThread"/> from the first post, links replies between
        /// posts, cleans the post text and runs hate-word and NLU analysis on every post.
        /// </summary>
        /// <param name="board">Board name used in the request URL and in the source tag <c>4ch.{board}</c>.</param>
        /// <param name="threadno">Thread number used in the request URL.</param>
        /// <returns>The thread together with its parsed posts.</returns>
        /// <remarks>Throws if the HTTP response does not have a success status code.</remarks>
        public async Task <ValueTuple <NewsThread, List <Post> > > GetThread(string board, string threadno)
        {
            // Argument order matches the placeholders: {0} is the thread number, {1} is the board.
            using (var op = Begin("Get thread no {0} for board {1}", threadno, board))
            {
                var r = await HttpClient.GetAsync($"http://a.4cdn.org/{board}/thread/{threadno}.json", CancellationToken);

                r.EnsureSuccessStatusCode();
                var json = await r.Content.ReadAsStringAsync();

                dynamic    o           = JObject.Parse(json);
                JArray     threadPosts = o.posts;
                // The first entry of "posts" supplies the thread-level fields.
                dynamic    subjectPost = threadPosts[0];
                NewsThread thread      = new NewsThread()
                {
                    Source        = $"4ch.{board}",
                    No            = subjectPost.no,
                    Id            = subjectPost.no.ToString() + "-" + YY,
                    DatePublished = DateTimeOffset.FromUnixTimeSeconds((long)subjectPost.time).UtcDateTime,
                    Subject       = subjectPost.sub,
                    Text          = WebScraper.ExtractTextFromHtmlFrag((string)subjectPost.com),
                    User          = subjectPost.name,
                    ReplyCount    = subjectPost.replies
                };
                var posts = ParsePostsFromThreadJson(board, json);
                foreach (var p in posts)
                {
                    p.ThreadId = thread.Id;
                }
                foreach (var p in posts)
                {
                    // Record which posts this one replies to, and register it as a reply on any target found in this thread.
                    var rnos = ExtractPostRepliesFromText(p);
                    foreach (var no in rnos)
                    {
                        p.ReplyTo.Add(no.ToString() + "-" + YY);
                        var replyto = posts.SingleOrDefault(v => v.No == no);
                        if (replyto != null)
                        {
                            replyto.Replies.Add(p.Id);
                        }
                    }

                    // Strip reply markers, '>' characters and URLs before the text is analysed.
                    p.Text            = repliesRegex.Replace(p.Text, string.Empty).Replace(">", string.Empty);
                    p.Text            = WebScraper.RemoveUrlsFromText(p.Text);
                    p.HasIdentityHate = HateWords.IdentityHateWords.Any(w => p.Text.Contains(w));
                    await NLUService.GetPredictionForPost(p);

                    if (p.Entities.Count > 0)
                    {
                        Info("Detected {0} entities in post {1}.", p.Entities.Count, p.Id);
                    }
                    if (p.ThreatIntent > 0.0)
                    {
                        Info("Detected threat intent {0:0.00} in post {1}.", p.ThreatIntent, p.Id);
                    }
                }
                op.Complete();
                return(thread, posts);
            }
        }
Пример #4
0
        /// <summary>
        /// Downloads the JSON of each given 4chan thread, builds the posts of every thread that was fetched with
        /// status 200, links replies between posts, cleans the post text and runs hate-word and NLU analysis on every post.
        /// </summary>
        /// <param name="board">Board name used in the request URLs and in each post's source tag <c>4ch.{board}</c>.</param>
        /// <param name="threads">Threads to fetch; matched to responses by <c>No</c> and updated in place with subject, text, user and reply count.</param>
        /// <param name="delay">Milliseconds to wait after each successfully awaited request.</param>
        /// <returns>A dictionary mapping each fetched thread to its posts. Threads whose request failed or returned a non-OK status are absent.</returns>
        public async Task <IDictionary <NewsThread, List <Post> > > GetPosts(string board, IEnumerable <NewsThread> threads, int delay = 500)
        {
            // Materialize once: the sequence is counted, projected and searched (from parallel workers) below.
            var threadList = threads.ToList();
            using (var op = Begin("Get posts for {0} threads for board {1}", threadList.Count, board))
            {
                // NOTE(review): all requests are started here, before the loop below awaits them one by one.
                Task <HttpResponseMessage>[] threadTasks =
                    threadList.Select(t => HttpClient.GetAsync($"http://a.4cdn.org/{board}/thread/{t.No}.json", CancellationToken)).ToArray();

                foreach (var task in threadTasks)
                {
                    try
                    {
                        await task;
                        await Task.Delay(delay);
                    }
                    catch (Exception)
                    {
                        // Best effort: the failure is reported in the loop below. Handling it per task keeps
                        // one failed request from leaving the remaining requests unawaited.
                    }
                }

                foreach (var task in threadTasks.Where(t => t.IsFaulted || t.IsCanceled))
                {
                    if (task.Exception != null)
                    {
                        Error(task.Exception, "Could not fetch thread.");
                    }
                    else
                    {
                        Error("Could not fetch thread. Thread task did not complete.");
                    }
                }

                // Only tasks that ran to completion have a Result; IsCompleted is also true for faulted and
                // canceled tasks, whose Result would throw. ToArray starts each body read exactly once.
                Task <string>[] readTasks = threadTasks
                                            .Where(t => t.Status == TaskStatus.RanToCompletion)
                                            .Select(t => t.Result)
                                            .Where(r => r.StatusCode == HttpStatusCode.OK)
                                            .Select(r => r.Content.ReadAsStringAsync())
                                            .ToArray();
                string[] threadsJson = await Task.WhenAll(readTasks);

                var posts = new ConcurrentDictionary <NewsThread, List <Post> >();
                Parallel.ForEach(threadsJson, (j) =>
                {
                    dynamic o           = JObject.Parse(j);
                    JArray threadPosts  = o.posts;
                    // The first entry of "posts" identifies the thread and supplies its thread-level fields.
                    dynamic subjectPost = threadPosts[0];
                    NewsThread thread   = threadList.Single(t => t.No == (long)subjectPost.no);
                    thread.Subject      = subjectPost.sub;
                    thread.Text         = WebScraper.ExtractTextFromHtmlFrag((string)subjectPost.com);
                    thread.User         = subjectPost.name;
                    thread.ReplyCount   = subjectPost.replies;
                    if (!posts.TryAdd(thread, new List <Post>()))
                    {
                        Error("Could not add thread {0} to posts collection.", thread.Id);
                    }
                    int pos = 1;
                    foreach (dynamic post in threadPosts)
                    {
                        posts[thread].Add(
                            new Post()
                        {
                            Source        = $"4ch.{board}",
                            ThreadId      = thread.Id,
                            No            = post.no,
                            Id            = post.no.ToString() + "-" + YY,
                            Position      = pos++,
                            User          = post.name,
                            DatePublished = DateTimeOffset.FromUnixTimeSeconds((long)post.time).UtcDateTime,
                            Text          = WebScraper.ExtractTextFromHtmlFrag((string)post.com),
                            Links         = WebScraper.ExtractLinksFromHtmlFrag((string)post.com),
                            Additional    = GetAdditionalPropsForPost((JObject)post)
                        });
                    }
                });

                foreach (var tp in posts)
                {
                    foreach (var p in tp.Value)
                    {
                        // Record which posts this one replies to, and register it as a reply on any target in the same thread.
                        var rnos = ExtractPostRepliesFromText(p);
                        foreach (var no in rnos)
                        {
                            p.ReplyTo.Add(no.ToString() + "-" + YY);
                            var replyto = tp.Value.SingleOrDefault(v => v.No == no);
                            if (replyto != null)
                            {
                                replyto.Replies.Add(p.Id);
                            }
                        }
                        // Strip reply markers, '>' characters and URLs before the text is analysed.
                        p.Text = repliesRegex.Replace(p.Text, string.Empty).Replace(">", string.Empty);
                        p.Text = WebScraper.RemoveUrlsFromText(p.Text);
                    }
                }

                foreach (var p in posts.Values.SelectMany(x => x))
                {
                    p.HasIdentityHate = HateWords.IdentityHateWords.Any(w => p.Text.Contains(w));

                    await NLUService.GetPredictionForPost(p);

                    if (p.Entities.Count > 0)
                    {
                        Info("Detected {0} entities in post {1}.", p.Entities.Count, p.Id);
                    }
                    if (p.ThreatIntent > 0.0)
                    {
                        Info("Detected threat intent {0:0.00} in post {1}.", p.ThreatIntent, p.Id);
                    }
                }

                op.Complete();
                return(posts);
            }
        }