Exemple #1
0
        /// <summary>
        /// Downloads one page of the chatty, strips the markup that differs between otherwise unchanged loads, and
        /// returns the normalized HTML together with its parsed page.
        /// </summary>
        /// <param name="stopwatch">Optional step timer; when non-null it records a download step and a parse step.</param>
        /// <param name="page">Page number placed in the <c>page</c> query parameter of the chatty URL.</param>
        /// <param name="previousHtml">Normalized HTML from an earlier call for the same page, if any.</param>
        /// <param name="previousChattyPage">Parsed page that corresponds to <paramref name="previousHtml"/>.</param>
        /// <returns>
        /// The normalized HTML and the parsed page. When the normalized HTML equals <paramref name="previousHtml"/>,
        /// <paramref name="previousChattyPage"/> is returned as-is and the HTML is not parsed again.
        /// </returns>
        public async Task<(string Html, ChattyPage Page)> GetChattyPage(
            StepStopwatch stopwatch,
            int page,
            string previousHtml = null,
            ChattyPage previousChattyPage = null)
        {
            stopwatch?.Step($"Download_{page}");
            var pageUrl = $"https://www.shacknews.com/chatty?page={page}";
            var downloadedHtml = await _downloadService.DownloadWithSharedLogin(pageUrl);

            stopwatch?.Step($"Parse_{page}");

            // The progress meter changes on every load; dropping it lets an unchanged page produce identical html.
            var withoutProgressMeter = _progressMeterRegex.Replace(downloadedHtml, "");

            // The comment count appears on all pages, even on pages that have not changed, so drop it as well.
            var normalizedHtml = _commentCountRegex.Replace(withoutProgressMeter, "");

            // Identical html means the caller's previously parsed page is still valid; skip the parse.
            if (normalizedHtml == previousHtml)
            {
                return (normalizedHtml, previousChattyPage);
            }

            return (normalizedHtml, ParseChattyPage(normalizedHtml));
        }
Exemple #2
0
        private async Task <(List <ScrapeState.Page> Pages, Chatty Chatty)> GetChattyWithoutBodies(
            List <ScrapeState.Page> previousPages, StepStopwatch stopwatch)
        {
            ScrapeState.Page GetPreviousPage(int page) =>
            previousPages != null && previousPages.Count >= page
                ? previousPages[page - 1]
                : null;

            var chatty = new Chatty {
                Threads = new List <ChattyThread>(200)
            };

            var currentPage   = 1;
            var lastPage      = 1;
            var newPages      = new List <ScrapeState.Page>();
            var seenThreadIds = new HashSet <int>();

            // The chatty almost always has three or fewer pages, so try downloading the first three in parallel for speed.
            stopwatch.Step("First three pages");
            var firstThreePages = new (string Html, ChattyPage Page)[3];
Exemple #3
0
        /// <summary>
        /// Runs one scrape pass: downloads the chatty and the LOL counts, post-processes the new chatty, pushes it to
        /// the event and chatty providers, and replaces the in-memory scrape state.
        /// </summary>
        /// <remarks>
        /// Any exception thrown inside the pass is logged and not rethrown, so the elapsed time is returned whether
        /// the pass succeeded or failed.
        /// </remarks>
        /// <returns>The elapsed time reported by this pass's <see cref="StepStopwatch"/>.</returns>
        private async Task<TimeSpan> Scrape()
        {
            var stopwatch = new StepStopwatch();

            try
            {
                // Remembered so we can tell at the end whether this pass produced any new events.
                var oldEventId = await _eventProvider.GetLastEventId();

                stopwatch.Step(nameof(GC));
                GC.Collect();

                // Start the LOL count download without awaiting it; the result is only needed once the chatty
                // itself has been processed.
                // NOTE(review): if a step below throws before lolTask is awaited, the task's outcome is never
                // observed in this method — confirm that is acceptable.
                stopwatch.Step(nameof(_lolParser.DownloadChattyLolCounts));
                var lolTask = _lolParser.DownloadChattyLolCounts(_state?.LolJson, _state?.LolCounts);

                var (newPages, newChatty) = await GetChattyWithoutBodies(_state?.Pages, stopwatch);

                // There is nothing to compare against on the first pass (no previous state).
                if (_state != null)
                {
                    stopwatch.Step(nameof(HandleThreadsThatDisappeared));
                    await HandleThreadsThatDisappeared(_state.Chatty, newChatty);
                }

                stopwatch.Step(nameof(ReorderThreads));
                ReorderThreads(newChatty);

                stopwatch.Step(nameof(newChatty.SetDictionaries));
                newChatty.SetDictionaries();

                stopwatch.Step(nameof(CopyPostBodies));
                CopyPostBodies(newChatty);

                stopwatch.Step(nameof(DownloadPostBodies));
                await DownloadPostBodies(newChatty);

                stopwatch.Step(nameof(RemovePostsWithNoBody));
                RemovePostsWithNoBody(newChatty);

                stopwatch.Step(nameof(Cortex.DetectCortexThreads));
                Cortex.DetectCortexThreads(newChatty);

                stopwatch.Step(nameof(FixRelativeLinks));
                FixRelativeLinks(newChatty);

                stopwatch.Step(nameof(_chattyProvider.Update));
                var (lolJson, lolCounts) = await lolTask;
                await _eventProvider.Update(newChatty, lolCounts);

                _chattyProvider.Update(newChatty, lolCounts);

                _state = new ScrapeState
                {
                    Chatty    = newChatty,
                    Pages     = newPages,
                    LolJson   = lolJson,
                    LolCounts = lolCounts
                };

                var newEventId = await _eventProvider.GetLastEventId();

                // Only save the state when the last event ID changed during this pass.
                if (oldEventId != newEventId)
                {
                    stopwatch.Step(nameof(SaveState));
                    await SaveState();
                }

                // Threads in use = pool maximum minus the currently available count.
                ThreadPool.GetMaxThreads(out var maxWorkerThreads, out var maxCompletionPortThreads);
                ThreadPool.GetAvailableThreads(out var availableWorkerThreads, out var availableCompletionPortThreads);
                _logger.LogInformation("Scrape complete. Last event is #{EventId}. {Elapsed}. Worker threads: {WorkerCount}. IOCP threads: {CompletionPortCount}.",
                                       await _eventProvider.GetLastEventId(), stopwatch, maxWorkerThreads - availableWorkerThreads,
                                       maxCompletionPortThreads - availableCompletionPortThreads);
            }
            catch (Exception ex)
            {
                // Use a constant message template with named placeholders (as the success log above does) rather
                // than an interpolated string, so runtime data is passed as arguments and never becomes part of
                // the template itself.
                _logger.LogError(ex, "Scrape failed. {Elapsed}. {Message}", stopwatch, ex.Message);
            }

            return stopwatch.Elapsed;
        }