/// <summary>
/// Starts one <c>CrawlPageAsync</c> task per page number, waits for all of them,
/// then completes both queues and updates the blog statistics.
/// </summary>
/// <returns>
/// The current value of <c>incompleteCrawl</c>; this method sets it to <c>true</c>
/// when the login check fails.
/// </returns>
private async Task<bool> GetUrlsAsync()
{
    semaphoreSlim = new SemaphoreSlim(shellService.Settings.ConcurrentScans);
    trackedTasks = new List<Task>();
    GenerateTags();

    if (!await CheckIfLoggedInAsync())
    {
        Logger.Error("TumblrHiddenCrawler:GetUrlsAsync: {0}", "User not logged in");
        shellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, blog.Name);

        // Complete BOTH queues, as the normal path at the end of this method does.
        // Previously only postQueue was completed here, so a consumer of jsonQueue
        // (if one is running) would never be told that no items are coming.
        jsonQueue.CompleteAdding();
        postQueue.CompleteAdding();

        incompleteCrawl = true;
        return incompleteCrawl;
    }

    foreach (int pageNumber in GetPageNumbers())
    {
        // Limits the number of page crawls in flight to ConcurrentScans.
        // NOTE(review): the matching Release() is presumably inside CrawlPageAsync - confirm.
        await semaphoreSlim.WaitAsync();
        trackedTasks.Add(CrawlPageAsync(pageNumber));
    }

    await Task.WhenAll(trackedTasks);

    jsonQueue.CompleteAdding();
    postQueue.CompleteAdding();

    UpdateBlogStats();

    return incompleteCrawl;
}
/// <summary>
/// Verifies the login state, then runs a single <c>CrawlPageAsync</c> task, waits for it,
/// completes both queues and updates the blog statistics.
/// </summary>
private async Task GetUrlsAsync()
{
    semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);
    trackedTasks = new List<Task>();

    if (!await CheckIfLoggedInAsync())
    {
        Logger.Error("TumblrTagSearchCrawler:GetUrlsAsync: {0}", "User not logged in");
        ShellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, Blog.Name);

        // Complete BOTH queues, as the normal path below does. Previously only PostQueue
        // was completed here, so a consumer of jsonQueue (if one is running) would never
        // be told that no items are coming.
        PostQueue.CompleteAdding();
        jsonQueue.CompleteAdding();
        return;
    }

    GenerateTags();

    // NOTE(review): the matching Release() is presumably inside CrawlPageAsync - confirm.
    await semaphoreSlim.WaitAsync();
    trackedTasks.Add(CrawlPageAsync());
    await Task.WhenAll(trackedTasks);

    PostQueue.CompleteAdding();
    jsonQueue.CompleteAdding();

    UpdateBlogStats(true);
}
/// <summary>
/// Runs a single <c>CrawlPageAsync</c> task to completion, then completes
/// <c>PostQueue</c> and <c>jsonQueue</c> and calls <c>UpdateBlogStats(true)</c>.
/// </summary>
private async Task GetUrlsAsync()
{
    var concurrentScans = ShellService.Settings.ConcurrentScans;
    semaphoreSlim = new SemaphoreSlim(concurrentScans);
    trackedTasks = new List<Task>();

    GenerateTags();

    // Take one semaphore slot before the crawl task is started.
    await semaphoreSlim.WaitAsync();

    Task crawl = CrawlPageAsync();
    trackedTasks.Add(crawl);

    await Task.WhenAll(trackedTasks);

    // No more items will be produced after this point.
    PostQueue.CompleteAdding();
    jsonQueue.CompleteAdding();

    UpdateBlogStats(true);
}
/// <summary>
/// Starts a <c>CrawlPageAsync</c> task for each page number until the page list is
/// exhausted, <c>completeGrab</c> becomes false, or <c>CheckIfShouldStop()</c> returns true.
/// Afterwards waits for the started tasks, completes both queues and updates the blog statistics.
/// </summary>
/// <returns>The current value of <c>incompleteCrawl</c>.</returns>
private async Task<bool> GetUrlsAsync()
{
    trackedTasks = new List<Task>();
    semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);

    GenerateTags();

    // The page was already loaded in GetHighestPostIdCoreAsync(), so the new number of posts
    // is already available; UpdateTotalPostCountAsync() is deliberately not awaited here.
    await Task.Run(() => Task.CompletedTask);

    foreach (int pageNumber in GetPageNumbers())
    {
        await semaphoreSlim.WaitAsync();

        // Short-circuit keeps the original order: completeGrab is tested first,
        // CheckIfShouldStop() is only consulted when the grab is still complete.
        bool abort = !completeGrab || CheckIfShouldStop();
        if (abort)
        {
            break;
        }

        CheckIfShouldPause();

        Task pageCrawl = CrawlPageAsync(pageNumber);
        trackedTasks.Add(pageCrawl);
    }

    await Task.WhenAll(trackedTasks);

    PostQueue.CompleteAdding();
    jsonQueue.CompleteAdding();

    bool hasLastPostId = GetLastPostId() != 0;
    UpdateBlogStats(hasLastPostId);

    return incompleteCrawl;
}
/// <summary>
/// Starts one <c>CrawlPageAsync</c> task per page number, waits for all of them,
/// then completes <c>jsonQueue</c> and <c>PostQueue</c> and updates the blog statistics.
/// </summary>
/// <returns>The current value of <c>incompleteCrawl</c>.</returns>
private async Task<bool> GetUrlsAsync()
{
    var concurrentScans = ShellService.Settings.ConcurrentScans;
    semaphoreSlim = new SemaphoreSlim(concurrentScans);
    trackedTasks = new List<Task>();

    GenerateTags();

    foreach (int pageNumber in GetPageNumbers())
    {
        // A semaphore slot is taken before each page task is started.
        await semaphoreSlim.WaitAsync();

        Task pageCrawl = CrawlPageAsync(pageNumber);
        trackedTasks.Add(pageCrawl);
    }

    await Task.WhenAll(trackedTasks);

    jsonQueue.CompleteAdding();
    PostQueue.CompleteAdding();

    // UpdateBlogStats receives true when GetLastPostId() is non-zero.
    bool hasLastPostId = GetLastPostId() != 0;
    UpdateBlogStats(hasLastPostId);

    return incompleteCrawl;
}
/// <summary>
/// Refreshes the total post count, then starts a <c>CrawlPageAsync</c> task for each page
/// number until the page list is exhausted, <c>completeGrab</c> becomes false, or
/// <c>CheckIfShouldStop()</c> returns true. Afterwards waits for the started tasks,
/// completes both queues and updates the blog statistics.
/// </summary>
/// <returns>The current value of <c>incompleteCrawl</c>.</returns>
private async Task<bool> GetUrlsAsync()
{
    trackedTasks = new List<Task>();
    semaphoreSlim = new SemaphoreSlim(shellService.Settings.ConcurrentScans);

    GenerateTags();

    await UpdateTotalPostCountAsync();

    foreach (int pageNumber in GetPageNumbers())
    {
        await semaphoreSlim.WaitAsync();

        // Short-circuit keeps the original order: completeGrab is tested first,
        // CheckIfShouldStop() is only consulted when the grab is still complete.
        bool abort = !completeGrab || CheckIfShouldStop();
        if (abort)
        {
            break;
        }

        CheckIfShouldPause();

        Task pageCrawl = CrawlPageAsync(pageNumber);
        trackedTasks.Add(pageCrawl);
    }

    await Task.WhenAll(trackedTasks);

    postQueue.CompleteAdding();
    jsonQueue.CompleteAdding();

    UpdateBlogStats();

    return incompleteCrawl;
}
/// <summary>
/// Verifies the login state, then starts <c>ConcurrentScans</c> crawl tasks. Each task fetches
/// one page through <c>GetSvcPageAsync</c>, deserializes it to <c>TumblrJson</c> and passes it
/// to <c>AddUrlsToDownloadList</c>. When all tasks have finished, both queues are completed
/// and the blog statistics are updated.
/// </summary>
/// <remarks>
/// Failures inside a task never propagate to the caller: they are logged (and, for HTTP 429
/// and timeouts, also shown through <c>shellService.ShowError</c>).
/// </remarks>
private async Task GetUrlsAsync()
{
    SemaphoreSlim semaphoreSlim = new SemaphoreSlim(shellService.Settings.ConcurrentScans);
    List<Task> trackedTasks = new List<Task>();

    if (!await CheckIfLoggedIn())
    {
        Logger.Error("TumblrHiddenCrawler:GetUrlsAsync: {0}", "User not logged in");
        shellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, blog.Name);

        // Complete BOTH queues, as the normal path at the end of this method does, so a
        // consumer of jsonQueue (if one is running) is told that no items are coming.
        jsonQueue.CompleteAdding();
        postQueue.CompleteAdding();
        return;
    }

    // The tag list does not depend on the crawler number, so it is parsed once here
    // instead of once per task (every task used to overwrite the shared field).
    if (!string.IsNullOrWhiteSpace(blog.Tags))
    {
        tags = blog.Tags.Split(',').Select(x => x.Trim()).ToList();
    }

    foreach (int crawlerNumber in Enumerable.Range(0, shellService.Settings.ConcurrentScans))
    {
        await semaphoreSlim.WaitAsync();

        trackedTasks.Add(new Func<Task>(async () =>
        {
            try
            {
                // Arguments are the page size and pageSize * crawlerNumber
                // (presumably limit and offset - confirm against GetSvcPageAsync).
                string document = await GetSvcPageAsync(blog.PageSize.ToString(), (blog.PageSize * crawlerNumber).ToString());
                TumblrJson response = ConvertJsonToClass<TumblrJson>(document);
                await AddUrlsToDownloadList(response, crawlerNumber);
            }
            catch (WebException webException) when (webException.Response != null)
            {
                HttpWebResponse resp = (HttpWebResponse)webException.Response;
                if ((int)resp.StatusCode == 429)
                {
                    // TODO: add retry logic?
                    Logger.Error("TumblrHiddenCrawler:GetUrls:WebException {0}", webException);
                    shellService.ShowError(webException, Resources.LimitExceeded, blog.Name);
                }
            }
            catch (TimeoutException timeoutException)
            {
                // Log tag corrected: this is the hidden crawler and a TimeoutException.
                Logger.Error("TumblrHiddenCrawler:GetUrls:TimeoutException {0}", timeoutException);
                shellService.ShowError(timeoutException, Resources.TimeoutReached, Resources.Crawling, blog.Name);
            }
            catch (Exception exception)
            {
                // Still best-effort (the crawl continues), but no longer silent.
                Logger.Error("TumblrHiddenCrawler:GetUrls: {0}", exception);
            }
            finally
            {
                semaphoreSlim.Release();
            }
        })());
    }

    await Task.WhenAll(trackedTasks);

    jsonQueue.CompleteAdding();
    postQueue.CompleteAdding();

    UpdateBlogStats();
}
/// <summary>
/// Refreshes the total post count, reads the highest post id, then starts one crawl task per
/// page number. Each task fetches a page through <c>GetApiPageAsync</c>, deserializes it to
/// <c>TumblrApiJson</c>, re-evaluates <c>CheckPostAge</c> and passes the response to
/// <c>AddUrlsToDownloadList</c>. Scheduling stops early when a task reported an incomplete
/// grab or cancellation was requested.
/// </summary>
/// <returns>
/// A tuple of the id returned by <c>GetHighestPostIdAsync</c> and a flag that is
/// <c>true</c> when any page request failed with HTTP status 429.
/// </returns>
/// <remarks>
/// Failures inside a task never propagate to the caller: they are logged (and, for HTTP 429
/// and timeouts, also shown through <c>shellService.ShowError</c>).
/// </remarks>
private async Task<Tuple<ulong, bool>> GetUrlsAsync()
{
    var semaphoreSlim = new SemaphoreSlim(shellService.Settings.ConcurrentScans);
    var trackedTasks = new List<Task>();
    var apiLimitHit = false;
    var completeGrab = true;

    await UpdateTotalPostCountAsync();
    int totalPosts = blog.Posts;

    ulong highestId = await GetHighestPostIdAsync();

    foreach (int pageNumber in GetPageNumbers())
    {
        await semaphoreSlim.WaitAsync();

        if (!completeGrab)
        {
            break;
        }

        if (ct.IsCancellationRequested)
        {
            break;
        }

        if (pt.IsPaused)
        {
            // Awaited instead of .Wait(): blocking here ties up the calling thread and can
            // deadlock when the continuation needs the context that is being blocked.
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        trackedTasks.Add(new Func<Task>(async () =>
        {
            try
            {
                string document = await GetApiPageAsync(pageNumber);
                var response = ConvertJsonToClass<TumblrApiJson>(document);

                // Shared with the scheduling loop above, which stops once this turns false.
                completeGrab = CheckPostAge(response);

                if (!string.IsNullOrWhiteSpace(blog.Tags))
                {
                    tags = blog.Tags.Split(',').Select(x => x.Trim()).ToList();
                }

                await AddUrlsToDownloadList(response);
            }
            catch (WebException webException) when (webException.Response != null)
            {
                // The filter already dereferenced webException, so no null-conditional is needed.
                var webRespStatusCode = (int)((HttpWebResponse)webException.Response).StatusCode;
                if (webRespStatusCode == 429)
                {
                    apiLimitHit = true;
                    Logger.Error("TumblrBlogCrawler:GetUrls:WebException {0}", webException);
                    shellService.ShowError(webException, Resources.LimitExceeded, blog.Name);
                }
            }
            catch (TimeoutException timeoutException)
            {
                // Log tag corrected: the logged exception is a TimeoutException.
                Logger.Error("TumblrBlogCrawler:GetUrls:TimeoutException {0}", timeoutException);
                shellService.ShowError(timeoutException, Resources.TimeoutReached, Resources.Crawling, blog.Name);
            }
            catch (Exception exception)
            {
                // Still best-effort (the crawl continues), but no longer silent.
                Logger.Error("TumblrBlogCrawler:GetUrls: {0}", exception);
            }
            finally
            {
                semaphoreSlim.Release();
            }

            // NOTE(review): this counter is updated from several tasks without
            // synchronization - confirm whether an approximate progress value is acceptable.
            numberOfPagesCrawled += blog.PageSize;
            UpdateProgressQueueInformation(Resources.ProgressGetUrlLong, numberOfPagesCrawled, totalPosts);
        })());
    }

    await Task.WhenAll(trackedTasks);

    postQueue.CompleteAdding();
    jsonQueue.CompleteAdding();

    UpdateBlogStats();

    return new Tuple<ulong, bool>(highestId, apiLimitHit);
}
/// <summary>
/// Checks that the blog is online, derives a page budget from the post count and then crawls
/// pages one after another (each <c>CrawlPageAsync</c> call is awaited before the next starts)
/// until the budget is used up, <c>completeGrab</c> becomes false, or
/// <c>CheckIfShouldStop()</c> returns true. Finally completes both queues and updates the
/// blog statistics.
/// </summary>
/// <returns>
/// <c>true</c> when the blog is not online; otherwise the current value of <c>incompleteCrawl</c>.
/// </returns>
private async Task<bool> GetUrlsAsync()
{
    trackedTasks = new List<Task>();
    semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);

    GenerateTags();

    await IsBlogOnlineAsync();

    if (!Blog.Online)
    {
        // Nothing to crawl: close both queues and leave without updating the statistics.
        PostQueue.CompleteAdding();
        jsonQueue.CompleteAdding();
        return true;
    }

    Blog.Posts = twUser.Data.User.Legacy.StatusesCount;

    if (Blog.PageSize == 0)
    {
        Blog.PageSize = 50;
    }

    // Number of additional pages that may be requested after the first one.
    int remainingPages;
    if (Blog.Posts > 3200)
    {
        // Posts beyond 3200 are counted at 20 per page, plus a fixed margin of 50 pages.
        remainingPages = (Blog.Posts - 3200) / 20 + 3200 / Blog.PageSize + 1;
        remainingPages += 50;
    }
    else
    {
        remainingPages = Blog.Posts / Blog.PageSize + 1;
    }

    for (int pageNo = 1; ; pageNo++)
    {
        await semaphoreSlim.WaitAsync();

        // Short-circuit keeps the original order: completeGrab is tested first,
        // CheckIfShouldStop() is only consulted when the grab is still complete.
        if (!completeGrab || CheckIfShouldStop())
        {
            break;
        }

        CheckIfShouldPause();

        await CrawlPageAsync(pageNo);

        if (remainingPages <= 0)
        {
            break;
        }

        remainingPages--;
    }

    PostQueue.CompleteAdding();
    jsonQueue.CompleteAdding();

    bool hasLastPostId = GetLastPostId() != 0;
    UpdateBlogStats(hasLastPostId);

    return incompleteCrawl;
}