private async Task <bool> GetUrlsAsync()
        {
            semaphoreSlim = new SemaphoreSlim(shellService.Settings.ConcurrentScans);
            trackedTasks  = new List <Task>();

            GenerateTags();

            if (!await CheckIfLoggedInAsync())
            {
                Logger.Error("TumblrHiddenCrawler:GetUrlsAsync: {0}", "User not logged in");
                shellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, blog.Name);
                postQueue.CompleteAdding();
                incompleteCrawl = true;
                return(incompleteCrawl);
            }

            foreach (int pageNumber in GetPageNumbers())
            {
                await semaphoreSlim.WaitAsync();

                trackedTasks.Add(CrawlPageAsync(pageNumber));
            }

            await Task.WhenAll(trackedTasks);

            jsonQueue.CompleteAdding();
            postQueue.CompleteAdding();

            UpdateBlogStats();

            return(incompleteCrawl);
        }
        private async Task GetUrlsAsync()
        {
            semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);
            trackedTasks  = new List <Task>();

            if (!await CheckIfLoggedInAsync())
            {
                Logger.Error("TumblrTagSearchCrawler:GetUrlsAsync: {0}", "User not logged in");
                ShellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, Blog.Name);
                PostQueue.CompleteAdding();
                return;
            }

            GenerateTags();

            await semaphoreSlim.WaitAsync();

            trackedTasks.Add(CrawlPageAsync());
            await Task.WhenAll(trackedTasks);

            PostQueue.CompleteAdding();
            jsonQueue.CompleteAdding();

            UpdateBlogStats(true);
        }
示例#3
0
        private async Task GetUrlsAsync()
        {
            semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);
            trackedTasks  = new List <Task>();

            GenerateTags();

            await semaphoreSlim.WaitAsync();

            trackedTasks.Add(CrawlPageAsync());
            await Task.WhenAll(trackedTasks);

            PostQueue.CompleteAdding();
            jsonQueue.CompleteAdding();

            UpdateBlogStats(true);
        }
        private async Task <bool> GetUrlsAsync()
        {
            trackedTasks  = new List <Task>();
            semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);

            GenerateTags();

            // page already loaded in GetHighestPostIdCoreAsync(), so retrieve new number of posts already there
            await Task.Run(() => Task.CompletedTask);

            //await UpdateTotalPostCountAsync();

            foreach (int pageNumber in GetPageNumbers())
            {
                await semaphoreSlim.WaitAsync();

                if (!completeGrab)
                {
                    break;
                }

                if (CheckIfShouldStop())
                {
                    break;
                }

                CheckIfShouldPause();

                trackedTasks.Add(CrawlPageAsync(pageNumber));
            }

            await Task.WhenAll(trackedTasks);

            PostQueue.CompleteAdding();
            jsonQueue.CompleteAdding();

            UpdateBlogStats(GetLastPostId() != 0);

            return(incompleteCrawl);
        }
示例#5
0
        private async Task <bool> GetUrlsAsync()
        {
            semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);
            trackedTasks  = new List <Task>();

            GenerateTags();

            foreach (int pageNumber in GetPageNumbers())
            {
                await semaphoreSlim.WaitAsync();

                trackedTasks.Add(CrawlPageAsync(pageNumber));
            }

            await Task.WhenAll(trackedTasks);

            jsonQueue.CompleteAdding();
            PostQueue.CompleteAdding();

            UpdateBlogStats(GetLastPostId() != 0);

            return(incompleteCrawl);
        }
示例#6
0
        private async Task <bool> GetUrlsAsync()
        {
            trackedTasks  = new List <Task>();
            semaphoreSlim = new SemaphoreSlim(shellService.Settings.ConcurrentScans);

            GenerateTags();

            await UpdateTotalPostCountAsync();

            foreach (int pageNumber in GetPageNumbers())
            {
                await semaphoreSlim.WaitAsync();

                if (!completeGrab)
                {
                    break;
                }

                if (CheckIfShouldStop())
                {
                    break;
                }

                CheckIfShouldPause();

                trackedTasks.Add(CrawlPageAsync(pageNumber));
            }

            await Task.WhenAll(trackedTasks);

            postQueue.CompleteAdding();
            jsonQueue.CompleteAdding();

            UpdateBlogStats();

            return(incompleteCrawl);
        }
示例#7
0
        private async Task GetUrlsAsync()
        {
            SemaphoreSlim semaphoreSlim = new SemaphoreSlim(shellService.Settings.ConcurrentScans);
            List <Task>   trackedTasks  = new List <Task>();

            if (!await CheckIfLoggedIn())
            {
                Logger.Error("TumblrHiddenCrawler:GetUrlsAsync: {0}", "User not logged in");
                shellService.ShowError(new Exception("User not logged in"), Resources.NotLoggedIn, blog.Name);
                postQueue.CompleteAdding();
                return;
            }

            foreach (int crawlerNumber in Enumerable.Range(0, shellService.Settings.ConcurrentScans))
            {
                await semaphoreSlim.WaitAsync();

                trackedTasks.Add(new Func <Task>(async() =>
                {
                    if (!string.IsNullOrWhiteSpace(blog.Tags))
                    {
                        tags = blog.Tags.Split(',').Select(x => x.Trim()).ToList();
                    }

                    try
                    {
                        string document     = await GetSvcPageAsync(blog.PageSize.ToString(), (blog.PageSize * crawlerNumber).ToString());
                        TumblrJson response = ConvertJsonToClass <TumblrJson>(document);
                        await AddUrlsToDownloadList(response, crawlerNumber);
                    }
                    catch (WebException webException) when(webException.Response != null)
                    {
                        HttpWebResponse resp = (HttpWebResponse)webException.Response;
                        if ((int)resp.StatusCode == 429)
                        {
                            // TODO: add retry logic?
                            Logger.Error("TumblrHiddenCrawler:GetUrls:WebException {0}", webException);
                            shellService.ShowError(webException, Resources.LimitExceeded, blog.Name);
                        }
                    }
                    catch (TimeoutException timeoutException)
                    {
                        Logger.Error("TumblrBlogCrawler:GetUrls:WebException {0}", timeoutException);
                        shellService.ShowError(timeoutException, Resources.TimeoutReached, Resources.Crawling, blog.Name);
                    }
                    catch
                    {
                    }
                    finally
                    {
                        semaphoreSlim.Release();
                    }
                })());
            }
            await Task.WhenAll(trackedTasks);

            jsonQueue.CompleteAdding();
            postQueue.CompleteAdding();

            UpdateBlogStats();
        }
示例#8
0
        private async Task <Tuple <ulong, bool> > GetUrlsAsync()
        {
            var semaphoreSlim = new SemaphoreSlim(shellService.Settings.ConcurrentScans);
            var trackedTasks  = new List <Task>();
            var apiLimitHit   = false;
            var completeGrab  = true;

            await UpdateTotalPostCountAsync();

            int totalPosts = blog.Posts;

            ulong highestId = await GetHighestPostIdAsync();

            foreach (int pageNumber in GetPageNumbers())
            {
                await semaphoreSlim.WaitAsync();

                if (!completeGrab)
                {
                    break;
                }

                if (ct.IsCancellationRequested)
                {
                    break;
                }
                if (pt.IsPaused)
                {
                    pt.WaitWhilePausedWithResponseAsyc().Wait();
                }

                trackedTasks.Add(new Func <Task>(async() =>
                {
                    try
                    {
                        string document = await GetApiPageAsync(pageNumber);
                        var response    = ConvertJsonToClass <TumblrApiJson>(document);

                        completeGrab = CheckPostAge(response);

                        if (!string.IsNullOrWhiteSpace(blog.Tags))
                        {
                            tags = blog.Tags.Split(',').Select(x => x.Trim()).ToList();
                        }

                        await AddUrlsToDownloadList(response);
                    }
                    catch (WebException webException) when((webException.Response != null))
                    {
                        var webRespStatusCode = (int)((HttpWebResponse)webException?.Response).StatusCode;
                        if (webRespStatusCode == 429)
                        {
                            apiLimitHit = true;
                            Logger.Error("TumblrBlogCrawler:GetUrls:WebException {0}", webException);
                            shellService.ShowError(webException, Resources.LimitExceeded, blog.Name);
                        }
                    }
                    catch (TimeoutException timeoutException)
                    {
                        Logger.Error("TumblrBlogCrawler:GetUrls:WebException {0}", timeoutException);
                        shellService.ShowError(timeoutException, Resources.TimeoutReached, Resources.Crawling, blog.Name);
                    }
                    catch
                    {
                    }
                    finally
                    {
                        semaphoreSlim.Release();
                    }

                    numberOfPagesCrawled += blog.PageSize;
                    UpdateProgressQueueInformation(Resources.ProgressGetUrlLong, numberOfPagesCrawled, totalPosts);
                })());
            }
            await Task.WhenAll(trackedTasks);

            postQueue.CompleteAdding();
            jsonQueue.CompleteAdding();

            UpdateBlogStats();

            return(new Tuple <ulong, bool>(highestId, apiLimitHit));
        }
        private async Task <bool> GetUrlsAsync()
        {
            trackedTasks  = new List <Task>();
            semaphoreSlim = new SemaphoreSlim(ShellService.Settings.ConcurrentScans);

            GenerateTags();

            await IsBlogOnlineAsync();

            if (!Blog.Online)
            {
                PostQueue.CompleteAdding();
                jsonQueue.CompleteAdding();
                return(true);
            }

            Blog.Posts = twUser.Data.User.Legacy.StatusesCount;
            if (Blog.PageSize == 0)
            {
                Blog.PageSize = 50;
            }

            int currentPage = (Blog.Posts > 3200) ? (Blog.Posts - 3200) / 20 + 3200 / Blog.PageSize + 1 : Blog.Posts / Blog.PageSize + 1;

            if (Blog.Posts > 3200)
            {
                currentPage += 50;
            }
            int pageNo = 1;

            while (true)
            {
                await semaphoreSlim.WaitAsync();

                if (!completeGrab)
                {
                    break;
                }
                if (CheckIfShouldStop())
                {
                    break;
                }
                CheckIfShouldPause();

                await CrawlPageAsync(pageNo);

                if (currentPage > 0)
                {
                    currentPage--;
                    pageNo++;
                }
                else
                {
                    break;
                }
            }

            PostQueue.CompleteAdding();
            jsonQueue.CompleteAdding();

            UpdateBlogStats(GetLastPostId() != 0);

            return(incompleteCrawl);
        }