/// <summary>
/// Consumes <c>xmlQueue</c>, starts a text-post download for every queued item and then
/// waits for all started downloads to finish.
/// </summary>
/// <remarks>
/// No further items are taken once <c>ct</c> is cancelled. Download failures are best-effort:
/// they are written to the debug output and never propagate to the caller.
/// </remarks>
/// <returns>A task that completes when every started download has finished.</returns>
public virtual async Task DownloadCrawlerDataAsync()
{
    var trackedTasks = new List<Task>();
    blog.CreateDataFolder();

    foreach (TumblrCrawlerXmlData downloadItem in xmlQueue.GetConsumingEnumerable())
    {
        if (ct.IsCancellationRequested)
        {
            break;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        trackedTasks.Add(new Func<Task>(async () =>
        {
            try
            {
                await DownloadTextPost(downloadItem);
            }
            catch (Exception e)
            {
                // Best-effort: one failed post must not stop the others, but leave a trace.
                System.Diagnostics.Debug.WriteLine(e.ToString());
            }
        })());
    }

    try
    {
        await Task.WhenAll(trackedTasks);
    }
    catch (Exception e)
    {
        System.Diagnostics.Debug.WriteLine(e.ToString());
    }
}
/// <summary>
/// Crawls a Tumblr blog: collects its media URLs via <c>GetImageUrls</c>, downloads in parallel
/// those not yet contained in <c>blog.Links</c>, updates the blog's counters and saves it.
/// </summary>
/// <param name="blog">Blog to crawl; its counters and link list are updated in place.</param>
/// <param name="progress">Receives one message per downloaded URL and a final empty message.</param>
/// <param name="ct">When cancelled, no further downloads are started.</param>
/// <param name="pt">Worker threads block while this token reports a paused state.</param>
/// <returns>The same <paramref name="blog"/> instance.</returns>
private TumblrBlog CrawlCoreTumblrBlog(TumblrBlog blog, IProgress<DataModels.DownloadProgress> progress, CancellationToken ct, PauseToken pt)
{
    Logger.Verbose("ManagerController.CrawlCoreTumblrBlog:Start");

    var tuple = GetImageUrls(blog, progress, ct, pt);
    blog.TotalCount = tuple.Item1;

    var imageUrls = tuple.Item2.Except(blog.Links.ToList());
    var blogPath = shellService.Settings.DownloadLocation;

    // The integer division can yield 0 (more active blogs than ParallelImages) and
    // ParallelOptions rejects 0; an empty active list would divide by zero. Clamp both to 1.
    int activeItems = Math.Max(1, selectionService.ActiveItems.Count);
    var parallelOptions = new ParallelOptions
    {
        MaxDegreeOfParallelism = Math.Max(1, shellService.Settings.ParallelImages / activeItems)
    };

    Parallel.ForEach(
        imageUrls,
        parallelOptions,
        (currentImageUrl, state) =>
        {
            if (ct.IsCancellationRequested)
            {
                // Break() only stops future iterations; return so this one does not download.
                state.Break();
                return;
            }

            if (pt.IsPaused)
            {
                // Parallel.ForEach bodies are synchronous, so the pause has to block here.
                pt.WaitWhilePausedWithResponseAsyc().Wait();
            }

            string fileName = currentImageUrl.Split('/').Last();
            string fileLocation = Path.Combine(Path.Combine(blogPath, blog.Name), fileName);

            if (Download(blog, fileLocation, currentImageUrl))
            {
                // NOTE(review): blog.Links is mutated from several workers - confirm it is thread-safe.
                blog.Links.Add(currentImageUrl);
                blog.DownloadedImages = (uint)blog.Links.Count();
                blog.Progress = (uint)((double)blog.DownloadedImages / (double)blog.TotalCount * 100);

                // Use a per-iteration object; a shared variable could be overwritten by another worker.
                var downloadProgress = new DataModels.DownloadProgress();
                downloadProgress.Progress = string.Format(CultureInfo.CurrentCulture, Resources.ProgressDownloadImage, currentImageUrl);
                progress.Report(downloadProgress);
            }
        });

    if (!ct.IsCancellationRequested)
    {
        blog.LastCompleteCrawl = DateTime.Now;
    }

    SaveBlog(blog);

    // Clear the progress text once the crawl is over.
    var finalProgress = new DataModels.DownloadProgress();
    finalProgress.Progress = "";
    progress.Report(finalProgress);

    return blog;
}
/// <summary>
/// Extracts photo and video URLs from <paramref name="document"/> and keeps requesting this
/// crawler's following pages until the crawl is cancelled or a page without posts is returned.
/// </summary>
/// <param name="document">Content of the first page to process.</param>
/// <param name="progress">Passed to <c>UpdateProgressQueueInformation</c> after every page.</param>
/// <param name="crawlerNumber">Page number of <paramref name="document"/>; advanced by
/// <c>ParallelScans</c> for each following request.</param>
/// <param name="ct">Ends the loop when cancelled.</param>
/// <param name="pt">The loop waits while this token reports a paused state.</param>
private async Task AddUrlsToDownloadList(string document, IProgress<DownloadProgress> progress, int crawlerNumber, CancellationToken ct, PauseToken pt)
{
    while (true)
    {
        if (ct.IsCancellationRequested)
        {
            return;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        var tags = new List<string>();
        if (!string.IsNullOrWhiteSpace(blog.Tags))
        {
            tags = blog.Tags.Split(',').Select(x => x.Trim()).ToList();
        }

        AddPhotoUrlToDownloadList(document, tags);
        AddVideoUrlToDownloadList(document, tags);

        // Report the value produced by this increment rather than re-reading the shared counter.
        var pagesCrawled = Interlocked.Increment(ref numberOfPagesCrawled);
        UpdateProgressQueueInformation(progress, Resources.ProgressGetUrlShort, pagesCrawled);

        crawlerNumber += shellService.Settings.ParallelScans;
        document = await RequestDataAsync(blog.Url + "/page/" + crawlerNumber);
        if (document.Contains("<div class=\"no_posts_found\">"))
        {
            return;
        }
    }
}
/// <summary>
/// Extracts photo and video URLs from <paramref name="response"/> and then keeps fetching and
/// processing this crawler's following pages.
/// </summary>
/// <param name="response">Content of the first page to process.</param>
/// <param name="tags">Tags forwarded to the URL extraction helpers.</param>
/// <param name="progress">Passed to <c>UpdateProgressQueueInformation</c> after every page.</param>
/// <param name="crawlerNumber">Page index of <paramref name="response"/>; advanced by
/// <c>ParallelScans</c> for each following request.</param>
/// <param name="ct">Ends the loop when cancelled.</param>
/// <param name="pt">The loop waits while this token reports a paused state.</param>
private async Task AddUrlsToDownloadList(string response, IList<string> tags, IProgress<DownloadProgress> progress, int crawlerNumber, CancellationToken ct, PauseToken pt)
{
    while (true)
    {
        if (ct.IsCancellationRequested)
        {
            return;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        try
        {
            AddPhotoUrlToDownloadList(response, tags);
            AddVideoUrlToDownloadList(response, tags);
        }
        catch (NullReferenceException)
        {
            // Best-effort: a page the helpers cannot parse is skipped.
        }

        // Report the value produced by this increment rather than re-reading the shared counter.
        var pagesCrawled = Interlocked.Increment(ref numberOfPagesCrawled);
        UpdateProgressQueueInformation(progress, Resources.ProgressGetUrlShort, pagesCrawled);

        crawlerNumber += shellService.Settings.ParallelScans;

        // The fetched page must become the next iteration's input; previously it was stored in
        // an unused local, so the first page was re-processed on every iteration.
        response = await GetSvcPageAsync(crawlerNumber);

        // TODO(review): the original end-of-blog check ("no posts in the response") was
        // commented out; until it is restored, stop at least when nothing was returned.
        if (string.IsNullOrEmpty(response))
        {
            return;
        }
    }
}
/// <summary>
/// Consumes <c>jsonQueue</c>, starts <c>DownloadPostAsync</c> for every queued item and then
/// waits for all started downloads to finish.
/// </summary>
/// <remarks>
/// Cancelling <c>ct</c> ends the consumption; the resulting
/// <see cref="OperationCanceledException"/> is written to the debug output and not rethrown.
/// </remarks>
/// <returns>A task that completes when every started download has finished.</returns>
public virtual async Task DownloadCrawlerDataAsync()
{
    var trackedTasks = new List<Task>();
    blog.CreateDataFolder();

    try
    {
        foreach (TumblrCrawlerData<T> downloadItem in jsonQueue.GetConsumingEnumerable(ct))
        {
            if (ct.IsCancellationRequested)
            {
                break;
            }

            if (pt.IsPaused)
            {
                // Await the pause instead of blocking a thread with .Wait().
                await pt.WaitWhilePausedWithResponseAsyc();
            }

            trackedTasks.Add(DownloadPostAsync(downloadItem));
        }
    }
    catch (OperationCanceledException e)
    {
        System.Diagnostics.Debug.WriteLine(e.ToString());
    }

    await Task.WhenAll(trackedTasks);
}
/// <summary>
/// Receives items from <c>_xmlQueue</c> while output is available, starts
/// <c>DownloadPostAsync</c> for each of them and then waits for all started downloads.
/// </summary>
/// <remarks>
/// An <see cref="OperationCanceledException"/> raised while receiving is written to the debug
/// output and not rethrown.
/// </remarks>
/// <returns>A task that completes when every started download has finished.</returns>
public async Task DownloadCrawlerDataAsync()
{
    var trackedTasks = new List<Task>();
    _blog.CreateDataFolder();

    try
    {
        while (await _xmlQueue.OutputAvailableAsync(_ct))
        {
            CrawlerData<XDocument> downloadItem = (CrawlerData<XDocument>) await _xmlQueue.ReceiveAsync();

            if (_ct.IsCancellationRequested)
            {
                break;
            }

            if (_pt.IsPaused)
            {
                // Await the pause instead of blocking a thread with .Wait().
                await _pt.WaitWhilePausedWithResponseAsyc();
            }

            trackedTasks.Add(DownloadPostAsync(downloadItem));
        }
    }
    catch (OperationCanceledException e)
    {
        System.Diagnostics.Debug.WriteLine(e.ToString());
    }

    await Task.WhenAll(trackedTasks);
}
/// <summary>
/// When the pause token reports a paused state, blocks the calling thread until the task
/// returned by <c>WaitWhilePausedWithResponseAsyc</c> has completed; otherwise returns at once.
/// </summary>
protected void CheckIfShouldPause()
{
    bool paused = pt.IsPaused;
    if (!paused)
    {
        return;
    }

    pt.WaitWhilePausedWithResponseAsyc().Wait();
}
/// <summary>
/// Waits 500 ms, then awaits the pause token, writing one console line before and one after
/// the pause wait.
/// </summary>
/// <param name="pause">Token that is awaited after the initial delay.</param>
public static async Task SomeMethodAsync(PauseToken pause)
{
    Task initialDelay = Task.Delay(500);
    await initialDelay;

    Console.WriteLine("Before await pause.WaitWhilePausedAsync()");

    await pause.WaitWhilePausedWithResponseAsyc();

    Console.WriteLine("After await pause.WaitWhilePausedAsync()");
}
/// <summary>
/// Endlessly repeats: wait one second, write a console line, await the pause token and write
/// a second console line.
/// </summary>
/// <param name="pause">Token that is awaited once per iteration.</param>
public static async Task SomeMethodAsync(PauseToken pause)
{
    for (;;)
    {
        Task tick = Task.Delay(1000);
        await tick.ConfigureAwait(false);

        Console.WriteLine("Before await pause.WaitWhilePausedAsync()");

        await pause.WaitWhilePausedWithResponseAsyc();

        Console.WriteLine("After await pause.WaitWhilePausedAsync()");
    }
}
/// <summary>
/// Scheduler loop: while not cancelled, picks the next queued blog that is not yet active,
/// checks that it is online and starts its site-specific downloader; when nothing is waiting
/// it sleeps for four seconds.
/// </summary>
/// <param name="pt">The loop waits while this token reports a paused state.</param>
/// <param name="ct">Ends the loop when cancelled; also cancels the idle delay.</param>
private async Task RunCrawlerTasksAsync(PauseToken pt, CancellationToken ct)
{
    while (true)
    {
        if (ct.IsCancellationRequested)
        {
            break;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        bool startDownload = false;
        QueueListItem nextQueueItem = default(QueueListItem);

        // lock releases the monitor on every exit path, including exceptions; the manual
        // Enter/Exit pairs left it held forever whenever the selection code threw.
        lock (_lockObject)
        {
            if (_crawlerService.ActiveItems.Count < QueueManager.Items.Count)
            {
                QueueListItem candidate = QueueManager.Items.Except(_crawlerService.ActiveItems).First();
                IBlog blog = candidate.Blog;

                ICrawler crawler = _crawlerFactory.GetCrawler(blog, new Progress<DownloadProgress>(), pt, ct);
                try
                {
                    // Give the online check at most four seconds.
                    crawler.IsBlogOnlineAsync().Wait(4000);
                }
                finally
                {
                    crawler.Dispose();
                }

                bool alreadyActive = _crawlerService.ActiveItems.Any(item =>
                    item.Blog.Name.Equals(candidate.Blog.Name) && item.Blog.BlogType.Equals(candidate.Blog.BlogType));

                if (alreadyActive || !candidate.Blog.Online)
                {
                    // Duplicates and offline blogs are dropped from the queue.
                    QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(candidate));
                    continue;
                }

                _crawlerService.AddActiveItems(candidate);
                nextQueueItem = candidate;
                startDownload = true;
            }
        }

        // Awaits happen only after the monitor has been released.
        if (startDownload)
        {
            await StartSiteSpecificDownloaderAsync(nextQueueItem, pt, ct);
        }
        else
        {
            await Task.Delay(4000, ct);
        }
    }
}
/// <summary>
/// Consumes <c>postQueue</c> and downloads every post, limiting the number of simultaneous
/// downloads (and separately of video downloads) with semaphores, then saves <c>files</c>.
/// </summary>
/// <remarks>
/// Failures of individual downloads are best-effort: they are written to the debug output
/// and do not affect the returned value.
/// </remarks>
/// <returns><c>true</c> unless waiting for the started downloads threw.</returns>
public virtual async Task<bool> DownloadBlogAsync()
{
    // The integer divisions can yield 0, which would make every WaitAsync hang forever, and an
    // empty active list would divide by zero. Clamp to at least one slot.
    int activeItems = Math.Max(1, crawlerService.ActiveItems.Count);
    var concurrentConnectionsSemaphore = new SemaphoreSlim(Math.Max(1, shellService.Settings.ConcurrentConnections / activeItems));
    var concurrentVideoConnectionsSemaphore = new SemaphoreSlim(Math.Max(1, shellService.Settings.ConcurrentVideoConnections / activeItems));
    var trackedTasks = new List<Task>();
    var completeDownload = true;

    blog.CreateDataFolder();

    foreach (TumblrPost downloadItem in postQueue.GetConsumingEnumerable())
    {
        bool isVideo = downloadItem.GetType() == typeof(VideoPost);

        // Videos additionally need a slot of the video semaphore.
        if (isVideo)
        {
            await concurrentVideoConnectionsSemaphore.WaitAsync();
        }

        await concurrentConnectionsSemaphore.WaitAsync();

        if (ct.IsCancellationRequested)
        {
            break;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        trackedTasks.Add(new Func<Task>(async () =>
        {
            try
            {
                await DownloadPostAsync(downloadItem);
            }
            catch (Exception e)
            {
                // Best-effort: one failed post must not stop the others, but leave a trace.
                System.Diagnostics.Debug.WriteLine(e.ToString());
            }
            finally
            {
                concurrentConnectionsSemaphore.Release();
                if (isVideo)
                {
                    concurrentVideoConnectionsSemaphore.Release();
                }
            }
        })());
    }

    try
    {
        await Task.WhenAll(trackedTasks);
    }
    catch
    {
        completeDownload = false;
    }

    blog.LastDownloadedPhoto = null;
    blog.LastDownloadedVideo = null;

    files.Save();

    return completeDownload;
}
/// <summary>
/// Queues the downloadable content of <paramref name="response"/> and keeps requesting this
/// crawler's following pages until the crawl is cancelled or a page without posts is returned.
/// </summary>
/// <param name="response">First page to process.</param>
/// <param name="crawlerNumber">Page index used for the next request (multiplied by the page
/// size to form the offset); advanced by <c>ConcurrentScans</c> after each request.</param>
private async Task AddUrlsToDownloadList(TumblrJson response, int crawlerNumber)
{
    while (true)
    {
        if (ct.IsCancellationRequested)
        {
            return;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        try
        {
            AddPhotoUrlToDownloadList(response);
            AddVideoUrlToDownloadList(response);
            AddAudioUrlToDownloadList(response);
            AddTextUrlToDownloadList(response);
            AddQuoteUrlToDownloadList(response);
            AddLinkUrlToDownloadList(response);
            AddConversationUrlToDownloadList(response);
            AddAnswerUrlToDownloadList(response);
            AddPhotoMetaUrlToDownloadList(response);
            AddVideoMetaUrlToDownloadList(response);
            AddAudioMetaUrlToDownloadList(response);
            await AddExternalPhotoUrlToDownloadList(response);
        }
        catch (NullReferenceException)
        {
            // Best-effort: a page the helpers cannot process is skipped.
        }

        // Report the value produced by this increment rather than re-reading the shared counter.
        var pagesCrawled = Interlocked.Increment(ref numberOfPagesCrawled);
        UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, pagesCrawled);

        string document = await GetSvcPageAsync(blog.PageSize.ToString(), (blog.PageSize * crawlerNumber).ToString());
        response = ConvertJsonToClass<TumblrJson>(document);
        if (!response.response.posts.Any())
        {
            return;
        }

        crawlerNumber += shellService.Settings.ConcurrentScans;
    }
}
/// <summary>
/// Scheduler loop: picks the next queued blog that is not yet active, checks that it is
/// online and starts its site-specific downloader; when nothing is waiting it sleeps for
/// four seconds.
/// </summary>
/// <param name="ct">Cancellation makes the loop throw <see cref="OperationCanceledException"/>.</param>
/// <param name="pt">The loop waits while this token reports a paused state.</param>
/// <exception cref="OperationCanceledException">The token was cancelled.</exception>
private async Task RunCrawlerTasks(CancellationToken ct, PauseToken pt)
{
    while (true)
    {
        ct.ThrowIfCancellationRequested();

        if (pt.IsPaused)
        {
            // Deliberately kept as a cancellable wait so that cancelling during a pause
            // still ends the loop.
            pt.WaitWhilePausedWithResponseAsyc().Wait(ct);
        }

        bool startDownload = false;
        QueueListItem nextQueueItem = default(QueueListItem);

        // lock releases the monitor on every exit path, including exceptions; the manual
        // Enter/Exit pairs left it held forever whenever the selection code threw.
        lock (lockObject)
        {
            if (crawlerService.ActiveItems.Count() < QueueManager.Items.Count())
            {
                QueueListItem candidate = QueueManager.Items.Except(crawlerService.ActiveItems).First();
                IBlog blog = candidate.Blog;

                IDownloader downloader = DownloaderFactory.GetDownloader(blog.BlogType, shellService, crawlerService, blog);
                // Give the online check at most four seconds.
                downloader.IsBlogOnlineAsync().Wait(4000);

                // Compare whole names (and the blog type): a substring match would treat the
                // queued blog "foo" as a duplicate of an active blog "foobar" and drop it.
                bool alreadyActive = crawlerService.ActiveItems.Any(item =>
                    item.Blog.Name.Equals(candidate.Blog.Name) && item.Blog.BlogType.Equals(candidate.Blog.BlogType));

                if (alreadyActive || !candidate.Blog.Online)
                {
                    QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(candidate));
                    continue;
                }

                crawlerService.AddActiveItems(candidate);
                nextQueueItem = candidate;
                startDownload = true;
            }
        }

        // Awaits happen only after the monitor has been released.
        if (startDownload)
        {
            await StartSiteSpecificDownloader(nextQueueItem, ct, pt);
        }
        else
        {
            await Task.Delay(4000, ct);
        }
    }
}
/// <summary>
/// Walks the tagged-search pages starting at <paramref name="pagination"/>, queueing photo and
/// video URLs, until cancelled, a page without posts is returned, the next page link falls
/// below <paramref name="nextCrawlersPagination"/> or leaves the configured time span.
/// </summary>
/// <param name="pagination">Pagination value of the first page to request.</param>
/// <param name="nextCrawlersPagination">Lower bound; the loop ends once the extracted next
/// page link is smaller than this value.</param>
private async Task AddUrlsToDownloadList(long pagination, long nextCrawlersPagination)
{
    while (true)
    {
        if (ct.IsCancellationRequested)
        {
            return;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        string document = await GetTaggedSearchPageAsync(pagination);
        if (document.Contains("<div class=\"no_posts_found\""))
        {
            return;
        }

        try
        {
            AddPhotoUrlToDownloadList(document);
            AddVideoUrlToDownloadList(document);
        }
        catch (NullReferenceException)
        {
            // Best-effort: a page the helpers cannot parse is skipped.
        }

        // Report the value produced by this increment rather than re-reading the shared counter.
        var pagesCrawled = Interlocked.Increment(ref numberOfPagesCrawled);
        UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, pagesCrawled);

        pagination = ExtractNextPageLink(document);

        if (pagination < nextCrawlersPagination)
        {
            return;
        }

        if (!CheckIfWithinTimespan(pagination))
        {
            return;
        }

        // Disabled in the original: stop after one page when blog.DownloadPages is set.
        //if (!string.IsNullOrEmpty(blog.DownloadPages))
        //    return;
    }
}
/// <summary>
/// Processes the search result in <paramref name="response"/> and keeps requesting this
/// crawler's following pages until cancelled, a result has no post HTML, or
/// <c>blog.DownloadPages</c> is set (in which case only one page is processed).
/// </summary>
/// <param name="response">JSON text of the first search page.</param>
/// <param name="crawlerNumber">Page index of <paramref name="response"/>; advanced by
/// <c>ConcurrentScans</c> for each following request.</param>
private async Task AddUrlsToDownloadList(string response, int crawlerNumber)
{
    while (true)
    {
        if (ct.IsCancellationRequested)
        {
            return;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        var result = ConvertJsonToClass<TumblrSearchJson>(response);
        if (string.IsNullOrEmpty(result.response.posts_html))
        {
            return;
        }

        try
        {
            // The HTML arrives escaped inside the JSON payload.
            string html = Regex.Unescape(result.response.posts_html);
            AddPhotoUrlToDownloadList(html);
            AddVideoUrlToDownloadList(html);
        }
        catch (NullReferenceException)
        {
            // Best-effort: a page the helpers cannot parse is skipped.
        }

        if (!string.IsNullOrEmpty(blog.DownloadPages))
        {
            return;
        }

        // Report the value produced by this increment rather than re-reading the shared counter.
        var pagesCrawled = Interlocked.Increment(ref numberOfPagesCrawled);
        UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, pagesCrawled);

        crawlerNumber += shellService.Settings.ConcurrentScans;
        response = await GetSearchPageAsync(crawlerNumber);
    }
}
/// <summary>
/// Endlessly repeats: wait one second, write a console line, await the pause token and write
/// a second console line. Any exception is written to the console and rethrown.
/// </summary>
/// <param name="pause">Token that is awaited once per iteration.</param>
public static async Task SomeMethodAsync(PauseToken pause)
{
    try
    {
        for (;;)
        {
            Task tick = Task.Delay(1000);
            await tick.ConfigureAwait(false);

            Console.WriteLine("Before await pause.WaitWhilePausedAsync()");

            await pause.WaitWhilePausedWithResponseAsyc();

            Console.WriteLine("After await pause.WaitWhilePausedAsync()");
        }
    }
    catch (Exception failure)
    {
        Console.WriteLine("Exception: {0}", failure);
        throw;
    }
}
/// <summary>
/// Walks the blog's pages starting at <paramref name="crawlerNumber"/> /
/// <paramref name="pagination"/>, queueing photo and video URLs, until cancelled, a page
/// without posts is returned or the next page link leaves the configured time span.
/// </summary>
/// <param name="pagination">Pagination value of the first page to request.</param>
/// <param name="crawlerNumber">Page number of the first request; incremented per page.</param>
private async Task AddUrlsToDownloadList(long pagination, int crawlerNumber)
{
    while (true)
    {
        if (ct.IsCancellationRequested)
        {
            return;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        string pageUrl = blog.Url + "/page/" + crawlerNumber + "/" + pagination;
        string document = await RequestDataAsync(pageUrl, "https://www.tumblr.com/", "https://" + blog.Name.Replace("+", "-") + ".tumblr.com");
        if (document.Contains("<div class=\"no_posts_found\""))
        {
            return;
        }

        try
        {
            AddPhotoUrlToDownloadList(document);
            AddVideoUrlToDownloadList(document);
        }
        catch (NullReferenceException)
        {
            // Best-effort: a page the helpers cannot parse is skipped.
        }

        // Report the value produced by this increment rather than re-reading the shared counter.
        var pagesCrawled = Interlocked.Increment(ref numberOfPagesCrawled);
        UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, pagesCrawled);

        pagination = ExtractNextPageLink(document);
        crawlerNumber++;

        if (!CheckIfWithinTimespan(pagination))
        {
            return;
        }

        // Disabled in the original: stop once the configured page count has been reached.
        //if (CheckIfPagecountReached(numberOfPagesCrawled))
        //    return;
    }
}
/// <summary>
/// Consumes <c>producerConsumerCollection</c> and downloads every post, limiting the number
/// of simultaneous downloads with a semaphore, then saves <c>files</c>.
/// </summary>
/// <param name="progress">Forwarded to <c>DownloadPostAsync</c>.</param>
/// <param name="ct">When cancelled, no further downloads are started.</param>
/// <param name="pt">The loop waits while this token reports a paused state.</param>
/// <remarks>
/// Failures of individual downloads are best-effort: they are written to the debug output
/// and do not affect the returned value.
/// </remarks>
/// <returns><c>true</c> unless waiting for the started downloads threw.</returns>
protected virtual async Task<bool> DownloadBlogAsync(IProgress<DataModels.DownloadProgress> progress, CancellationToken ct, PauseToken pt)
{
    // The integer division can yield 0, which would make every WaitAsync hang forever, and an
    // empty active list would divide by zero. Clamp to at least one slot.
    int activeItems = Math.Max(1, crawlerService.ActiveItems.Count);
    var semaphoreSlim = new SemaphoreSlim(Math.Max(1, shellService.Settings.ParallelImages / activeItems));
    var trackedTasks = new List<Task>();
    var completeDownload = true;

    CreateDataFolder();

    foreach (TumblrPost downloadItem in producerConsumerCollection.GetConsumingEnumerable())
    {
        await semaphoreSlim.WaitAsync();

        if (ct.IsCancellationRequested)
        {
            break;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        trackedTasks.Add(new Func<Task>(async () =>
        {
            try
            {
                await DownloadPostAsync(progress, ct, downloadItem);
            }
            catch (Exception e)
            {
                // Best-effort: one failed post must not stop the others, but leave a trace.
                System.Diagnostics.Debug.WriteLine(e.ToString());
            }
            finally
            {
                semaphoreSlim.Release();
            }
        })());
    }

    try
    {
        await Task.WhenAll(trackedTasks);
    }
    catch
    {
        completeDownload = false;
    }

    blog.LastDownloadedPhoto = null;
    blog.LastDownloadedVideo = null;

    files.Save();

    return completeDownload;
}
/// <summary>
/// Consumes <c>jsonQueue</c>, starts <c>DownloadPostAsync</c> for every queued item and then
/// waits for all started downloads to finish.
/// </summary>
/// <remarks>
/// Cancelling <c>ct</c> ends the consumption; the resulting
/// <see cref="OperationCanceledException"/> is written to the debug output and not rethrown.
/// </remarks>
/// <returns>A task that completes when every started download has finished.</returns>
public virtual async Task DownloadCrawlerDataAsync()
{
    var trackedTasks = new List<Task>();
    blog.CreateDataFolder();

    try
    {
        // Pass the token so that a cancelled crawl does not stay blocked on an empty queue
        // that is never completed.
        foreach (TumblrCrawlerData<T> downloadItem in jsonQueue.GetConsumingEnumerable(ct))
        {
            if (ct.IsCancellationRequested)
            {
                break;
            }

            if (pt.IsPaused)
            {
                // Await the pause instead of blocking a thread with .Wait().
                await pt.WaitWhilePausedWithResponseAsyc();
            }

            trackedTasks.Add(DownloadPostAsync(downloadItem));
        }
    }
    catch (OperationCanceledException e)
    {
        System.Diagnostics.Debug.WriteLine(e.ToString());
    }

    await Task.WhenAll(trackedTasks);
}
/// <summary>
/// Requests the blog's API pages with at most <c>ParallelScans</c> requests in flight, queues
/// the URLs found on each page and finally marks <c>producerConsumerCollection</c> as complete.
/// </summary>
/// <param name="progress">Passed to <c>UpdateProgressQueueInformation</c> after every page.</param>
/// <param name="ct">When cancelled, no further pages are requested.</param>
/// <param name="pt">The loop waits while this token reports a paused state.</param>
/// <returns>
/// The value returned by <c>GetHighestPostId</c> and a flag that is <c>true</c> when a
/// <see cref="WebException"/> whose message contains "429" was caught.
/// </returns>
private async Task<Tuple<ulong, bool>> GetUrlsAsync(IProgress<DownloadProgress> progress, CancellationToken ct, PauseToken pt)
{
    var semaphoreSlim = new SemaphoreSlim(shellService.Settings.ParallelScans);
    var trackedTasks = new List<Task>();
    var numberOfPostsCrawled = 0;
    var apiLimitHit = false;
    var completeGrab = true;

    ulong lastId = GetLastPostId();

    await UpdateTotalPostCount();
    int totalPosts = blog.Posts;

    ulong highestId = await GetHighestPostId();

    foreach (int pageNumber in GetPageNumbers())
    {
        await semaphoreSlim.WaitAsync();

        if (!completeGrab)
        {
            break;
        }

        if (ct.IsCancellationRequested)
        {
            break;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        trackedTasks.Add(new Func<Task>(async () =>
        {
            try
            {
                XDocument document = await GetApiPageAsync(pageNumber);

                completeGrab = CheckPostAge(document, lastId);

                var tags = new List<string>();
                if (!string.IsNullOrWhiteSpace(blog.Tags))
                {
                    tags = blog.Tags.Split(',').Select(x => x.Trim()).ToList();
                }

                AddUrlsToDownloadList(document, tags);
            }
            catch (WebException webException)
            {
                if (webException.Message.Contains("429"))
                {
                    // TODO: add retry logic?
                    apiLimitHit = true;
                    Logger.Error("TumblrDownloader:GetUrls:WebException {0}", webException);
                    shellService.ShowError(webException, Resources.LimitExceeded, blog.Name);
                }
            }
            finally
            {
                semaphoreSlim.Release();
            }

            // Several page tasks can finish at once; add atomically and report the value this
            // task produced instead of a plain read-modify-write on the shared counter.
            int postsCrawled = Interlocked.Add(ref numberOfPostsCrawled, blog.PageSize);
            UpdateProgressQueueInformation(progress, Resources.ProgressGetUrlLong, postsCrawled, totalPosts);
        })());
    }

    await Task.WhenAll(trackedTasks);

    producerConsumerCollection.CompleteAdding();

    if (!ct.IsCancellationRequested && completeGrab)
    {
        UpdateBlogStats();
    }

    return new Tuple<ulong, bool>(highestId, apiLimitHit);
}
/// <summary>
/// Consumes <c>producerConsumerCollection</c> and dispatches every item to the download
/// routine matching its post type, limiting the number of simultaneous downloads with a
/// semaphore, then saves <c>files</c>.
/// </summary>
/// <param name="progress">Forwarded to the individual download routines.</param>
/// <param name="ct">When cancelled, no further downloads are started.</param>
/// <param name="pt">The loop waits while this token reports a paused state.</param>
/// <returns><c>false</c> when the loop was left because of cancellation, otherwise <c>true</c>.</returns>
protected virtual async Task<bool> DownloadBlogAsync(IProgress<DataModels.DownloadProgress> progress, CancellationToken ct, PauseToken pt)
{
    // The integer division can yield 0, which would make every WaitAsync hang forever, and an
    // empty active list would divide by zero. Clamp to at least one slot.
    int activeItems = Math.Max(1, crawlerService.ActiveItems.Count);
    var semaphoreSlim = new SemaphoreSlim(Math.Max(1, shellService.Settings.ParallelImages / activeItems));
    var trackedTasks = new List<Task>();
    var completeDownload = true;

    CreateDataFolder();

    foreach (Tuple<PostTypes, string, string> downloadItem in producerConsumerCollection.GetConsumingEnumerable())
    {
        await semaphoreSlim.WaitAsync();

        if (ct.IsCancellationRequested)
        {
            completeDownload = false;
            break;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        trackedTasks.Add(new Func<Task>(async () =>
        {
            try
            {
                switch (downloadItem.Item1)
                {
                    case PostTypes.Photo:
                        await DownloadPhotoAsync(progress, downloadItem, ct);
                        break;
                    case PostTypes.Video:
                        await DownloadVideoAsync(progress, downloadItem, ct);
                        break;
                    case PostTypes.Audio:
                        await DownloadAudioAsync(progress, downloadItem, ct);
                        break;
                    case PostTypes.Text:
                        DownloadText(progress, downloadItem);
                        break;
                    case PostTypes.Quote:
                        DownloadQuote(progress, downloadItem);
                        break;
                    case PostTypes.Link:
                        DownloadLink(progress, downloadItem);
                        break;
                    case PostTypes.Conversation:
                        DownloadConversation(progress, downloadItem);
                        break;
                    case PostTypes.PhotoMeta:
                        DownloadPhotoMeta(progress, downloadItem);
                        break;
                    case PostTypes.VideoMeta:
                        DownloadVideoMeta(progress, downloadItem);
                        break;
                    case PostTypes.AudioMeta:
                        DownloadAudioMeta(progress, downloadItem);
                        break;
                    default:
                        break;
                }
            }
            finally
            {
                // Release even when the download throws; otherwise each failure permanently
                // removes a slot and the consuming loop eventually stalls in WaitAsync.
                semaphoreSlim.Release();
            }
        })());
    }

    await Task.WhenAll(trackedTasks);

    blog.LastDownloadedPhoto = null;
    blog.LastDownloadedVideo = null;

    files.Save();

    return completeDownload;
}
/// <summary>
/// Scheduler loop: while not cancelled, picks the next queued blog that is not yet active,
/// checks that it is online and starts its site-specific downloader; when nothing is waiting
/// it sleeps for four seconds. Errors of one iteration are logged and shown, then the loop
/// continues.
/// </summary>
/// <param name="pt">The loop waits while this token reports a paused state.</param>
/// <param name="ct">Ends the loop when cancelled; also cancels the idle delay.</param>
private async Task RunCrawlerTasksAsync(PauseToken pt, CancellationToken ct)
{
    while (true)
    {
        if (ct.IsCancellationRequested)
        {
            break;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        try
        {
            bool startDownload = false;
            QueueListItem nextQueueItem = default(QueueListItem);

            // lock releases the monitor on every exit path, so no manual bookkeeping of
            // whether it is still held is needed.
            lock (_lockObject)
            {
                if (_crawlerService.ActiveItems.Count < QueueManager.Items.Count)
                {
                    QueueListItem candidate = QueueManager.Items.Except(_crawlerService.ActiveItems).First();
                    IBlog blog = candidate.Blog;

                    ICrawler crawler = _crawlerFactory.GetCrawler(blog, new Progress<DownloadProgress>(), pt, ct);
                    try
                    {
                        // Give the online check at most four seconds.
                        crawler.IsBlogOnlineAsync().Wait(4000);
                    }
                    finally
                    {
                        crawler.Dispose();
                    }

                    bool alreadyActive = _crawlerService.ActiveItems.Any(item =>
                        item.Blog.Name.Equals(candidate.Blog.Name) && item.Blog.BlogType.Equals(candidate.Blog.BlogType));

                    if (alreadyActive || !candidate.Blog.Online)
                    {
                        // Duplicates and offline blogs are dropped from the queue.
                        QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(candidate));
                        continue;
                    }

                    _crawlerService.AddActiveItems(candidate);
                    nextQueueItem = candidate;
                    startDownload = true;
                }
            }

            // Awaits happen only after the monitor has been released.
            if (startDownload)
            {
                await StartSiteSpecificDownloaderAsync(nextQueueItem, pt, ct);
            }
            else
            {
                await Task.Delay(4000, ct);
            }
        }
        catch (Exception e)
        {
            // Cancelling the idle delay ends up here as well; that is a normal stop and must
            // not be reported to the user as an error.
            if (!ct.IsCancellationRequested)
            {
                Logger.Error("CrawlerController.RunCrawlerTasksAsync: {0}", e);
                _shellService.ShowError(e, "Error starting the next item in the queue.");
            }
        }
    }
}
/// <summary>
/// Requests the blog's API pages (50 posts per page) with at most <c>ParallelScans</c>
/// requests in flight, queues the URLs found on each page and finally marks
/// <c>producerConsumerCollection</c> as complete.
/// </summary>
/// <param name="progress">Passed to <c>UpdateProgressQueueInformation</c> after every page.</param>
/// <param name="ct">When cancelled, no further pages are requested.</param>
/// <param name="pt">The loop waits while this token reports a paused state.</param>
/// <returns>
/// The value returned by <c>GetHighestPostId</c> and a flag that is <c>true</c> when a
/// <see cref="WebException"/> whose message contains "429" was caught.
/// </returns>
private async Task<Tuple<ulong, bool>> GetUrlsAsync(IProgress<DownloadProgress> progress, CancellationToken ct, PauseToken pt)
{
    var semaphoreSlim = new SemaphoreSlim(shellService.Settings.ParallelScans);
    var trackedTasks = new List<Task>();
    var numberOfPostsCrawled = 0;
    var apiLimitHit = false;
    var completeGrab = true;

    ulong lastId = GetLastPostId();

    await UpdateTotalPostCount();
    int totalPosts = blog.Posts;

    ulong highestId = await GetHighestPostId();

    // The Tumblr api v1 shows 50 posts at max, determine the number of pages to crawl
    int totalPages = (totalPosts / 50) + 1;

    foreach (int pageNumber in Enumerable.Range(0, totalPages))
    {
        await semaphoreSlim.WaitAsync();

        if (!completeGrab)
        {
            break;
        }

        if (ct.IsCancellationRequested)
        {
            break;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        trackedTasks.Add(new Func<Task>(async () =>
        {
            try
            {
                XDocument document = await GetApiPageAsync(pageNumber);

                completeGrab = CheckPostAge(document, lastId);

                AddUrlsToDownloadList(document);
            }
            catch (WebException webException)
            {
                if (webException.Message.Contains("429"))
                {
                    // add retry logic?
                    apiLimitHit = true;
                    Logger.Error("TumblrDownloader:GetUrls:WebException {0}", webException);
                    shellService.ShowError(webException, Resources.LimitExceeded, blog.Name);
                }
            }
            finally
            {
                semaphoreSlim.Release();
            }

            // Several page tasks can finish at once; add atomically and report the value this
            // task produced instead of a plain read-modify-write on the shared counter.
            int postsCrawled = Interlocked.Add(ref numberOfPostsCrawled, 50);
            UpdateProgressQueueInformation(progress, Resources.ProgressGetUrl, postsCrawled, totalPosts);
        })());
    }

    await Task.WhenAll(trackedTasks);

    producerConsumerCollection.CompleteAdding();

    if (!ct.IsCancellationRequested && completeGrab)
    {
        UpdateBlogStats();
    }

    return Tuple.Create(highestId, apiLimitHit);
}
/// <summary>
/// Collects the photo and video URLs of a blog by requesting its API pages (20 posts per
/// page) in parallel. When <c>blog.Tags</c> is set, only posts carrying one of those tags are
/// considered.
/// </summary>
/// <param name="blog">Blog whose posts are enumerated.</param>
/// <param name="progress">Receives one message per processed page.</param>
/// <param name="ct">When cancelled, no further pages are requested.</param>
/// <param name="pt">Worker threads block while this token reports a paused state.</param>
/// <returns>The number of distinct URLs found and the list of those URLs.</returns>
public Tuple<uint, List<string>> GetImageUrls(TumblrBlog blog, IProgress<DataModels.DownloadProgress> progress, CancellationToken ct, PauseToken pt)
{
    int numberOfPostsCrawled = 0;
    var images = new List<string>();

    string url = GetApiUrl(blog.Name, 1);
    string authHeader = shellService.OAuthManager.GenerateauthHeader(url, "GET");

    var blogDoc = RequestData(url, authHeader);
    int totalPosts = blogDoc.response.blog.total_posts;

    // Generate URL list of Images
    // the api v2 shows 20 posts at max, determine the number of pages to crawl
    int totalPages = (totalPosts / 20) + 1;

    // An empty tag list means "crawl everything".
    List<string> tags = string.IsNullOrEmpty(blog.Tags)
        ? new List<string>()
        : blog.Tags.Split(',').Select(x => x.Trim()).ToList();

    // The integer division can yield 0, which ParallelOptions rejects, and an empty active
    // list would divide by zero. Clamp both to 1.
    int activeItems = Math.Max(1, selectionService.ActiveItems.Count);
    var parallelOptions = new ParallelOptions
    {
        MaxDegreeOfParallelism = Math.Max(1, shellService.Settings.ParallelImages / activeItems)
    };

    Parallel.For(0, totalPages, parallelOptions, (i, state) =>
    {
        if (ct.IsCancellationRequested)
        {
            // Break() only stops future iterations; return so this one issues no request.
            state.Break();
            return;
        }

        if (pt.IsPaused)
        {
            // Parallel.For bodies are synchronous, so the pause has to block here.
            pt.WaitWhilePausedWithResponseAsyc().Wait();
        }

        try
        {
            // get 20 posts per crawl/page; the URL and header are per-iteration locals because
            // shared variables could pair one worker's URL with another worker's signature.
            string pageUrl = GetApiUrl(blog.Name, 20, i * 20);
            string pageAuthHeader = shellService.OAuthManager.GenerateauthHeader(pageUrl, "GET");
            DataModels.TumblrJson document = RequestData(pageUrl, pageAuthHeader);

            if (shellService.Settings.DownloadImages == true)
            {
                // tags.Contains(tag): the former tags.Equals(tag) compared the list itself with
                // a string and therefore never matched any post.
                foreach (var post in document.response.posts.Where(posts =>
                    posts.type.Equals("photo") && (tags.Count == 0 || posts.tags.Any(tag => tags.Contains(tag)))))
                {
                    if (post.photos == null)
                    {
                        continue;
                    }

                    foreach (var photo in post.photos)
                    {
                        var imageUrl = photo.alt_sizes.ElementAt(shellService.Settings.ImageSizes.IndexOf(shellService.Settings.ImageSize.ToString())).url;
                        if (shellService.Settings.SkipGif == true && imageUrl.EndsWith(".gif"))
                        {
                            continue;
                        }

                        lock (images)
                        {
                            images.Add(imageUrl);
                        }
                    }
                }
            }

            if (shellService.Settings.DownloadVideos == true)
            {
                foreach (var post in document.response.posts.Where(posts =>
                    posts.type.Equals("video") && (tags.Count == 0 || posts.tags.Any(tag => tags.Contains(tag)))))
                {
                    if (shellService.Settings.VideoSize == 1080)
                    {
                        lock (images)
                        {
                            images.Add(post.video_url);
                        }
                    }

                    if (shellService.Settings.VideoSize == 480)
                    {
                        // The 480 variant is addressed by inserting "_480" before the extension.
                        lock (images)
                        {
                            images.Add(post.video_url.Insert(post.video_url.LastIndexOf("."), "_480"));
                        }
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Best-effort per page; print the exception itself so the failure is diagnosable.
            Console.WriteLine(ex);
        }

        // Pages finish concurrently; add atomically and report the value this page produced.
        int postsCrawled = Interlocked.Add(ref numberOfPostsCrawled, 20);
        var newProgress = new DataModels.DownloadProgress();
        newProgress.Progress = string.Format(CultureInfo.CurrentCulture, Resources.ProgressGetUrl, postsCrawled, totalPosts);
        progress.Report(newProgress);
    });

    images = images.Distinct().ToList();
    uint totalImages = (uint)images.Count;

    return Tuple.Create(totalImages, images);
}
/// <summary>
/// Scheduler loop: takes the next queued blog that is not yet active, marks it active and
/// crawls it when it is a <c>TumblrBlog</c>; afterwards the item is removed from the active
/// list (and, unless cancelled, from the queue) on the UI dispatcher. When nothing is waiting
/// the loop sleeps for four seconds.
/// </summary>
/// <param name="ct">Cancellation makes the loop throw.</param>
/// <param name="pt">The loop blocks while this token reports a paused state.</param>
/// <exception cref="OperationCanceledException">The token was cancelled before an iteration
/// or during a crawl.</exception>
private void runCrawlerTasks(CancellationToken ct, PauseToken pt)
{
    while (true)
    {
        // check if stopped
        if (ct.IsCancellationRequested)
        {
            throw new OperationCanceledException(ct);
        }

        // check if paused
        if (pt.IsPaused)
        {
            pt.WaitWhilePausedWithResponseAsyc().Wait();
        }

        // Track whether the queue monitor is held so that it is released on every path,
        // including exceptions thrown while selecting the next blog.
        var queueItems = QueueManager.Items;
        bool queueLocked = false;
        try
        {
            Monitor.Enter(queueItems, ref queueLocked);

            if (selectionService.ActiveItems.Count() < QueueManager.Items.Count())
            {
                var blogToCrawlNext = QueueManager.Items.Except(selectionService.ActiveItems).First();

                lock (selectionService.ActiveItems)
                {
                    selectionService.AddActiveItems(blogToCrawlNext);
                }

                // Release the queue before the long-running crawl.
                Monitor.Exit(queueItems);
                queueLocked = false;

                if (blogToCrawlNext.Blog is TumblrBlog)
                {
                    var blog = (TumblrBlog)blogToCrawlNext.Blog;

                    var progressHandler = new Progress<DataModels.DownloadProgress>(value =>
                    {
                        blogToCrawlNext.Progress = value.Progress;
                    });
                    var progress = progressHandler as IProgress<DataModels.DownloadProgress>;

                    CrawlCoreTumblrBlog(blog, progress, ct, pt);

                    if (ct.IsCancellationRequested)
                    {
                        // Cancelled: keep the item queued, only clear its active state.
                        Application.Current.Dispatcher.BeginInvoke(
                            DispatcherPriority.Background,
                            new Action(() =>
                            {
                                lock (selectionService.ActiveItems)
                                {
                                    selectionService.RemoveActiveItem(blogToCrawlNext);
                                }
                            }));
                        throw new OperationCanceledException(ct);
                    }
                    else
                    {
                        // Finished: remove the item from the queue and from the active list.
                        Application.Current.Dispatcher.BeginInvoke(
                            DispatcherPriority.Background,
                            new Action(() =>
                            {
                                lock (QueueManager.Items)
                                {
                                    QueueManager.RemoveItem(blogToCrawlNext);
                                }

                                lock (selectionService.ActiveItems)
                                {
                                    selectionService.RemoveActiveItem(blogToCrawlNext);
                                }
                            }));
                    }
                }
            }
            else
            {
                Monitor.Exit(queueItems);
                queueLocked = false;

                Task.Delay(4000, ct).Wait();
            }
        }
        finally
        {
            if (queueLocked)
            {
                Monitor.Exit(queueItems);
            }
        }
    }
}
/// <summary>
/// Scheduler loop: while not cancelled, picks the next queued blog that is not yet active,
/// checks that it is online and starts its site-specific downloader; when nothing is waiting
/// it sleeps for four seconds. Blogs that are offline, already active or need a privacy
/// consent are removed from the queue. Errors of one iteration are logged (unless the crawl
/// was cancelled) and the loop continues.
/// </summary>
/// <param name="pt">The loop waits while this token reports a paused state.</param>
/// <param name="ct">Ends the loop when cancelled; also cancels the idle delay.</param>
private async Task RunCrawlerTasksAsync(PauseToken pt, CancellationToken ct)
{
    while (true)
    {
        if (ct.IsCancellationRequested)
        {
            break;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        try
        {
            bool startDownload = false;
            QueueListItem nextQueueItem = default(QueueListItem);

            // lock releases the monitor on every exit path, so no manual bookkeeping of
            // whether it is still held is needed.
            lock (_lockObject)
            {
                if (_crawlerService.ActiveItems.Count < QueueManager.Items.Count)
                {
                    QueueListItem candidate;
                    try
                    {
                        candidate = QueueManager.Items.Except(_crawlerService.ActiveItems).First();
                    }
                    catch (InvalidOperationException)
                    {
                        // Nothing could be selected in this pass; try again.
                        continue;
                    }

                    IBlog blog = candidate.Blog;
                    var privacyConsentNeeded = false;

                    ICrawler crawler = _crawlerFactory.GetCrawler(blog, new Progress<DownloadProgress>(), pt, ct);
                    try
                    {
                        // Give the online check at most four seconds.
                        crawler.IsBlogOnlineAsync().Wait(4000);
                    }
                    catch (AggregateException ex)
                    {
                        if (ex.InnerExceptions.Any(x => x.Message == "Acceptance of privacy consent needed!"))
                        {
                            privacyConsentNeeded = true;
                        }
                    }
                    finally
                    {
                        crawler.Dispose();
                    }

                    bool alreadyActive = _crawlerService.ActiveItems.Any(item =>
                        item.Blog.Name.Equals(candidate.Blog.Name) && item.Blog.BlogType.Equals(candidate.Blog.BlogType));

                    if (privacyConsentNeeded || alreadyActive || !candidate.Blog.Online)
                    {
                        QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(candidate));
                        continue;
                    }

                    _crawlerService.AddActiveItems(candidate);
                    nextQueueItem = candidate;
                    startDownload = true;
                }
            }

            // Awaits happen only after the monitor has been released.
            if (startDownload)
            {
                await StartSiteSpecificDownloaderAsync(nextQueueItem, pt, ct);
            }
            else
            {
                await Task.Delay(4000, ct);
            }
        }
        catch (Exception e)
        {
            // Cancelling the idle delay ends up here as well; that is not an error.
            if (!ct.IsCancellationRequested)
            {
                Logger.Error("CrawlerController.RunCrawlerTasksAsync: {0}", e);
            }
        }
    }
}
/// <summary>
/// Requests the blog's API pages with at most <c>ConcurrentScans</c> requests in flight,
/// queues the content found on each page and finally marks <c>postQueue</c> and
/// <c>jsonQueue</c> as complete and updates the blog statistics.
/// </summary>
/// <remarks>
/// Page failures are best-effort: they are logged and never propagate to the caller.
/// </remarks>
/// <returns>
/// The value returned by <c>GetHighestPostIdAsync</c> and a flag that is <c>true</c> when a
/// page request was answered with HTTP status 429.
/// </returns>
private async Task<Tuple<ulong, bool>> GetUrlsAsync()
{
    var semaphoreSlim = new SemaphoreSlim(shellService.Settings.ConcurrentScans);
    var trackedTasks = new List<Task>();
    var apiLimitHit = false;
    var completeGrab = true;

    await UpdateTotalPostCountAsync();
    int totalPosts = blog.Posts;

    ulong highestId = await GetHighestPostIdAsync();

    foreach (int pageNumber in GetPageNumbers())
    {
        await semaphoreSlim.WaitAsync();

        if (!completeGrab)
        {
            break;
        }

        if (ct.IsCancellationRequested)
        {
            break;
        }

        if (pt.IsPaused)
        {
            // Await the pause instead of blocking a thread with .Wait().
            await pt.WaitWhilePausedWithResponseAsyc();
        }

        trackedTasks.Add(new Func<Task>(async () =>
        {
            try
            {
                string document = await GetApiPageAsync(pageNumber);
                var response = ConvertJsonToClass<TumblrApiJson>(document);

                completeGrab = CheckPostAge(response);

                if (!string.IsNullOrWhiteSpace(blog.Tags))
                {
                    tags = blog.Tags.Split(',').Select(x => x.Trim()).ToList();
                }

                await AddUrlsToDownloadList(response);
            }
            catch (WebException webException) when (webException.Response != null)
            {
                // Not every WebResponse is an HttpWebResponse; a hard cast would throw from
                // inside this handler and fault the whole page task.
                var httpResponse = webException.Response as HttpWebResponse;
                if (httpResponse != null && (int)httpResponse.StatusCode == 429)
                {
                    apiLimitHit = true;
                    Logger.Error("TumblrBlogCrawler:GetUrls:WebException {0}", webException);
                    shellService.ShowError(webException, Resources.LimitExceeded, blog.Name);
                }
            }
            catch (TimeoutException timeoutException)
            {
                Logger.Error("TumblrBlogCrawler:GetUrls:TimeoutException {0}", timeoutException);
                shellService.ShowError(timeoutException, Resources.TimeoutReached, Resources.Crawling, blog.Name);
            }
            catch (Exception e)
            {
                // Best-effort: one failed page must not stop the crawl, but leave a trace.
                Logger.Error("TumblrBlogCrawler:GetUrls: {0}", e);
            }
            finally
            {
                semaphoreSlim.Release();
            }

            // Several page tasks can finish at once; add atomically and report the value this
            // task produced instead of a plain read-modify-write on the shared counter.
            var pagesCrawled = Interlocked.Add(ref numberOfPagesCrawled, blog.PageSize);
            UpdateProgressQueueInformation(Resources.ProgressGetUrlLong, pagesCrawled, totalPosts);
        })());
    }

    await Task.WhenAll(trackedTasks);

    postQueue.CompleteAdding();
    jsonQueue.CompleteAdding();

    UpdateBlogStats();

    return new Tuple<ulong, bool>(highestId, apiLimitHit);
}