/// <summary>
/// Downloads one chatty page and returns its normalized HTML plus the parsed page.
/// When the normalized HTML is byte-identical to <paramref name="previousHtml"/>,
/// the previously parsed page is reused to skip a redundant parse.
/// </summary>
/// <param name="stopwatch">Optional step timer; a null stopwatch disables timing.</param>
/// <param name="page">1-based chatty page number to download.</param>
/// <param name="previousHtml">Normalized HTML from the prior scrape of this page, if any.</param>
/// <param name="previousChattyPage">Parsed result from the prior scrape of this page, if any.</param>
public async Task<(string Html, ChattyPage Page)> GetChattyPage(StepStopwatch stopwatch, int page, string previousHtml = null, ChattyPage previousChattyPage = null)
{
    stopwatch?.Step($"Download_{page}");
    var pageUrl = $"https://www.shacknews.com/chatty?page={page}";
    var pageHtml = await _downloadService.DownloadWithSharedLogin(pageUrl);

    stopwatch?.Step($"Parse_{page}");

    // Strip markup that varies on every load so unchanged content produces identical HTML:
    // the progress meter changes each request, and the comment count appears on all pages
    // even when a particular page hasn't changed.
    pageHtml = _progressMeterRegex.Replace(pageHtml, "");
    pageHtml = _commentCountRegex.Replace(pageHtml, "");

    // Unchanged since last scrape — reuse the cached parse.
    if (pageHtml == previousHtml)
    {
        return (pageHtml, previousChattyPage);
    }

    return (pageHtml, ParseChattyPage(pageHtml));
}
private async Task <(List <ScrapeState.Page> Pages, Chatty Chatty)> GetChattyWithoutBodies( List <ScrapeState.Page> previousPages, StepStopwatch stopwatch) { ScrapeState.Page GetPreviousPage(int page) => previousPages != null && previousPages.Count >= page ? previousPages[page - 1] : null; var chatty = new Chatty { Threads = new List <ChattyThread>(200) }; var currentPage = 1; var lastPage = 1; var newPages = new List <ScrapeState.Page>(); var seenThreadIds = new HashSet <int>(); // The chatty almost always has three or fewer pages, so try downloading the first three in parallel for speed. stopwatch.Step("First three pages"); var firstThreePages = new (string Html, ChattyPage Page)[3];
/// <summary>
/// Runs one full scrape cycle: downloads the chatty and LOL counts, diffs against the
/// previous state, post-processes threads, publishes updates, and persists state when
/// the scrape produced new events. Exceptions are logged and swallowed so the caller's
/// scrape loop keeps running.
/// </summary>
/// <returns>Total elapsed time of this scrape attempt, whether it succeeded or failed.</returns>
private async Task<TimeSpan> Scrape()
{
    var stopwatch = new StepStopwatch();
    try
    {
        var oldEventId = await _eventProvider.GetLastEventId();

        // NOTE(review): forcing a collection every cycle is unusual; presumably it bounds
        // the working set between scrapes of this long-running service — confirm still needed.
        stopwatch.Step(nameof(GC));
        GC.Collect();

        // Kick off the LOL count download concurrently with the page scrape; awaited below.
        stopwatch.Step(nameof(_lolParser.DownloadChattyLolCounts));
        var lolTask = _lolParser.DownloadChattyLolCounts(_state?.LolJson, _state?.LolCounts);

        var (newPages, newChatty) = await GetChattyWithoutBodies(_state?.Pages, stopwatch);

        // Only meaningful when a previous scrape exists to diff against.
        if (_state != null)
        {
            stopwatch.Step(nameof(HandleThreadsThatDisappeared));
            await HandleThreadsThatDisappeared(_state.Chatty, newChatty);
        }

        stopwatch.Step(nameof(ReorderThreads));
        ReorderThreads(newChatty);

        stopwatch.Step(nameof(newChatty.SetDictionaries));
        newChatty.SetDictionaries();

        // Reuse bodies we already have before downloading the ones we don't.
        stopwatch.Step(nameof(CopyPostBodies));
        CopyPostBodies(newChatty);

        stopwatch.Step(nameof(DownloadPostBodies));
        await DownloadPostBodies(newChatty);

        stopwatch.Step(nameof(RemovePostsWithNoBody));
        RemovePostsWithNoBody(newChatty);

        stopwatch.Step(nameof(Cortex.DetectCortexThreads));
        Cortex.DetectCortexThreads(newChatty);

        stopwatch.Step(nameof(FixRelativeLinks));
        FixRelativeLinks(newChatty);

        stopwatch.Step(nameof(_chattyProvider.Update));
        var (lolJson, lolCounts) = await lolTask;
        await _eventProvider.Update(newChatty, lolCounts);
        _chattyProvider.Update(newChatty, lolCounts);
        _state = new ScrapeState
        {
            Chatty = newChatty,
            Pages = newPages,
            LolJson = lolJson,
            LolCounts = lolCounts
        };

        // Persist state only when this scrape generated new events.
        var newEventId = await _eventProvider.GetLastEventId();
        if (oldEventId != newEventId)
        {
            stopwatch.Step(nameof(SaveState));
            await SaveState();
        }

        ThreadPool.GetMaxThreads(out var maxWorkerThreads, out var maxCompletionPortThreads);
        ThreadPool.GetAvailableThreads(out var availableWorkerThreads, out var availableCompletionPortThreads);
        // Fix: reuse newEventId rather than issuing a redundant third GetLastEventId() call
        // just to render the log message.
        _logger.LogInformation(
            "Scrape complete. Last event is #{EventId}. {Elapsed}. Worker threads: {WorkerCount}. IOCP threads: {CompletionPortCount}.",
            newEventId, stopwatch,
            maxWorkerThreads - availableWorkerThreads,
            maxCompletionPortThreads - availableCompletionPortThreads);
    }
    catch (Exception ex)
    {
        // Fix (CA2254): use a constant structured-logging template instead of an interpolated
        // string; ex.Message is already captured by passing the exception as the first argument.
        _logger.LogError(ex, "Scrape failed. {Stopwatch}.", stopwatch);
    }
    return stopwatch.Elapsed;
}