示例#1
0
        /// <summary>
        /// Consumes <c>xmlQueue</c> until it is completed or cancellation is requested, starts
        /// <c>DownloadTextPost</c> for every dequeued item and then waits for all started downloads.
        /// </summary>
        /// <remarks>
        /// Downloads are best effort: a failure of a single item is written to the debug output
        /// and does not fault the returned task.
        /// </remarks>
        /// <returns>A task that completes once every started download has finished.</returns>
        public virtual async Task DownloadCrawlerDataAsync()
        {
            var trackedTasks = new List <Task>();

            blog.CreateDataFolder();

            foreach (TumblrCrawlerXmlData downloadItem in xmlQueue.GetConsumingEnumerable())
            {
                if (ct.IsCancellationRequested)
                {
                    break;
                }
                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): blocking inside an async method ties up the
                    // calling thread for the whole pause and can deadlock under a
                    // synchronization context.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                trackedTasks.Add(new Func <Task>(async() =>
                {
                    try
                    {
                        await DownloadTextPost(downloadItem);
                    }
                    catch (Exception e)
                    {
                        // Deliberately not rethrown (best effort), but no longer invisible.
                        System.Diagnostics.Debug.WriteLine(e.ToString());
                    }
                })());
            }

            try
            {
                await Task.WhenAll(trackedTasks);
            }
            catch (Exception e)
            {
                System.Diagnostics.Debug.WriteLine(e.ToString());
            }
        }
示例#2
0
        /// <summary>
        /// Downloads every image url returned by <c>GetImageUrls</c> that is not yet contained in
        /// <c>blog.Links</c>, updates the blog's counters and persists the blog via <c>SaveBlog</c>.
        /// </summary>
        /// <param name="blog">Blog to crawl; its counters and link list are updated in place.</param>
        /// <param name="progress">Receives one message per downloaded image and an empty message at the end.</param>
        /// <param name="ct">When cancelled, no further downloads are started.</param>
        /// <param name="pt">Each worker blocks while this token reports a paused state.</param>
        /// <returns>The <paramref name="blog"/> instance that was passed in.</returns>
        private TumblrBlog CrawlCoreTumblrBlog(TumblrBlog blog, IProgress <DataModels.DownloadProgress> progress, CancellationToken ct, PauseToken pt)
        {
            Logger.Verbose("ManagerController.CrawlCoreTumblrBlog:Start");

            var tuple         = GetImageUrls(blog, progress, ct, pt);
            var newImageCount = tuple.Item1;
            var newImageUrls  = tuple.Item2;

            blog.TotalCount = newImageCount;

            // Snapshot of the known links, so adding to blog.Links below cannot disturb the query.
            var imageUrls = newImageUrls.Except(blog.Links.ToList());

            var blogPath = shellService.Settings.DownloadLocation;

            // The integer division yields 0 when there are more active items than parallel
            // slots (and throws when there are no active items); MaxDegreeOfParallelism only
            // accepts -1 or a positive value, so clamp both operands to at least 1.
            int activeItemCount     = Math.Max(1, selectionService.ActiveItems.Count);
            int degreeOfParallelism = Math.Max(1, shellService.Settings.ParallelImages / activeItemCount);

            Parallel.ForEach(
                imageUrls,
                new ParallelOptions { MaxDegreeOfParallelism = degreeOfParallelism },
                (currentImageUrl, state) =>
            {
                if (ct.IsCancellationRequested)
                {
                    // Break() only prevents later iterations; return so this one stops as well.
                    state.Break();
                    return;
                }
                if (pt.IsPaused)
                {
                    // The loop body is synchronous, so blocking is the only option here.
                    pt.WaitWhilePausedWithResponseAsyc().Wait();
                }

                string fileName     = currentImageUrl.Split('/').Last();
                string fileLocation = Path.Combine(Path.Combine(blogPath, blog.Name), fileName);

                if (Download(blog, fileLocation, currentImageUrl))
                {
                    // NOTE(review): blog.Links and the counters are mutated from several worker
                    // threads; confirm that the collection type tolerates concurrent Add calls.
                    blog.Links.Add(currentImageUrl);
                    blog.DownloadedImages = (uint)blog.Links.Count();
                    blog.Progress         = (uint)((double)blog.DownloadedImages / (double)blog.TotalCount * 100);

                    // Per-iteration instance: a variable shared between workers could be
                    // overwritten by another worker before it is reported.
                    var imageProgress = new DataModels.DownloadProgress();
                    imageProgress.Progress = string.Format(CultureInfo.CurrentCulture, Resources.ProgressDownloadImage, currentImageUrl);
                    progress.Report(imageProgress);
                }
            });

            if (!ct.IsCancellationRequested)
            {
                blog.LastCompleteCrawl = DateTime.Now;
            }
            SaveBlog(blog);

            // An empty message clears the progress display.
            var finalProgress = new DataModels.DownloadProgress();
            finalProgress.Progress = "";
            progress.Report(finalProgress);

            return(blog);
        }
示例#3
0
        /// <summary>
        /// Extracts photo and video urls from <paramref name="document"/> and then keeps requesting
        /// the blog page that lies <c>ParallelScans</c> pages further on, until cancellation is
        /// requested or a page contains the "no_posts_found" marker.
        /// </summary>
        /// <param name="document">Content of the first page to process.</param>
        /// <param name="progress">Forwarded to <c>UpdateProgressQueueInformation</c>.</param>
        /// <param name="crawlerNumber">Page number belonging to <paramref name="document"/>.</param>
        /// <param name="ct">Ends the loop when cancellation is requested.</param>
        /// <param name="pt">Suspends the loop while paused.</param>
        /// <returns>A task that completes when the loop has ended.</returns>
        private async Task AddUrlsToDownloadList(string document, IProgress <DownloadProgress> progress, int crawlerNumber, CancellationToken ct, PauseToken pt)
        {
            while (true)
            {
                if (ct.IsCancellationRequested)
                {
                    return;
                }
                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): do not block a thread inside an async method.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                var tags = new List <string>();
                if (!string.IsNullOrWhiteSpace(blog.Tags))
                {
                    tags = blog.Tags.Split(',').Select(x => x.Trim()).ToList();
                }

                AddPhotoUrlToDownloadList(document, tags);
                AddVideoUrlToDownloadList(document, tags);

                Interlocked.Increment(ref numberOfPagesCrawled);
                UpdateProgressQueueInformation(progress, Resources.ProgressGetUrlShort, numberOfPagesCrawled);

                // Skip ahead by the number of parallel scans before requesting the next page.
                crawlerNumber += shellService.Settings.ParallelScans;
                document       = await RequestDataAsync(blog.Url + "/page/" + crawlerNumber);

                if (document.Contains("<div class=\"no_posts_found\">"))
                {
                    return;
                }
            }
        }
示例#4
0
        /// <summary>
        /// Extracts photo and video urls from <paramref name="response"/> and then keeps requesting
        /// the page that lies <c>ParallelScans</c> pages further on via <c>GetSvcPageAsync</c>,
        /// until cancellation is requested or an empty page is returned.
        /// </summary>
        /// <param name="response">Content of the first page to process.</param>
        /// <param name="tags">Forwarded to the url extraction helpers.</param>
        /// <param name="progress">Forwarded to <c>UpdateProgressQueueInformation</c>.</param>
        /// <param name="crawlerNumber">Page number belonging to <paramref name="response"/>.</param>
        /// <param name="ct">Ends the loop when cancellation is requested.</param>
        /// <param name="pt">Suspends the loop while paused.</param>
        /// <returns>A task that completes when the loop has ended.</returns>
        private async Task AddUrlsToDownloadList(string response, IList <string> tags, IProgress <DownloadProgress> progress, int crawlerNumber, CancellationToken ct, PauseToken pt)
        {
            while (true)
            {
                if (ct.IsCancellationRequested)
                {
                    return;
                }
                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): do not block a thread inside an async method.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                try
                {
                    AddPhotoUrlToDownloadList(response, tags);
                    AddVideoUrlToDownloadList(response, tags);
                }
                catch (NullReferenceException)
                {
                    // Best effort: a page that cannot be parsed is skipped.
                }

                Interlocked.Increment(ref numberOfPagesCrawled);
                UpdateProgressQueueInformation(progress, Resources.ProgressGetUrlShort, numberOfPagesCrawled);

                // The fetched page used to be stored in an unused local, so the same response
                // was parsed over and over; feed it into the next iteration instead.
                response = await GetSvcPageAsync((crawlerNumber + shellService.Settings.ParallelScans));

                // NOTE(review): the original end-of-posts check is commented out; an empty page
                // is treated as the end so the loop can terminate without cancellation.
                if (string.IsNullOrEmpty(response))
                {
                    return;
                }

                crawlerNumber += shellService.Settings.ParallelScans;
            }
        }
示例#5
0
        /// <summary>
        /// Consumes <c>jsonQueue</c> until it is completed or cancelled, starts
        /// <c>DownloadPostAsync</c> for every dequeued item and then awaits all started downloads.
        /// </summary>
        /// <remarks>
        /// An <see cref="OperationCanceledException"/> raised while consuming the queue is written
        /// to the debug output and ends the consumption; downloads already started are still awaited.
        /// </remarks>
        /// <returns>A task that completes once every started download has finished.</returns>
        public virtual async Task DownloadCrawlerDataAsync()
        {
            var trackedTasks = new List <Task>();

            blog.CreateDataFolder();

            try
            {
                foreach (TumblrCrawlerData <T> downloadItem in jsonQueue.GetConsumingEnumerable(ct))
                {
                    if (ct.IsCancellationRequested)
                    {
                        break;
                    }

                    if (pt.IsPaused)
                    {
                        // Await instead of .Wait(): do not block a thread inside an async method.
                        await pt.WaitWhilePausedWithResponseAsyc();
                    }

                    trackedTasks.Add(DownloadPostAsync(downloadItem));
                }
            }
            catch (OperationCanceledException e)
            {
                System.Diagnostics.Debug.WriteLine(e.ToString());
            }

            await Task.WhenAll(trackedTasks);
        }
        /// <summary>
        /// Receives items from <c>_xmlQueue</c> for as long as output is available, starts
        /// <c>DownloadPostAsync</c> for each of them and then awaits all started downloads.
        /// </summary>
        /// <remarks>
        /// An <see cref="OperationCanceledException"/> raised while receiving is written to the
        /// debug output and ends the receive loop; downloads already started are still awaited.
        /// </remarks>
        /// <returns>A task that completes once every started download has finished.</returns>
        public async Task DownloadCrawlerDataAsync()
        {
            var trackedTasks = new List <Task>();

            _blog.CreateDataFolder();

            try
            {
                while (await _xmlQueue.OutputAvailableAsync(_ct))
                {
                    CrawlerData <XDocument> downloadItem = (CrawlerData <XDocument>) await _xmlQueue.ReceiveAsync();

                    if (_ct.IsCancellationRequested)
                    {
                        break;
                    }

                    if (_pt.IsPaused)
                    {
                        // Await instead of .Wait(): do not block a thread inside an async method.
                        await _pt.WaitWhilePausedWithResponseAsyc();
                    }

                    trackedTasks.Add(DownloadPostAsync(downloadItem));
                }
            }
            catch (OperationCanceledException e)
            {
                System.Diagnostics.Debug.WriteLine(e.ToString());
            }

            await Task.WhenAll(trackedTasks);
        }
示例#7
0
 /// <summary>
 /// When <c>pt</c> reports a paused state, blocks the calling thread until the task returned
 /// by <c>WaitWhilePausedWithResponseAsyc</c> has completed; otherwise returns immediately.
 /// </summary>
 protected void CheckIfShouldPause()
 {
     if (!pt.IsPaused)
     {
         return;
     }

     pt.WaitWhilePausedWithResponseAsyc().Wait();
 }
示例#8
0
        /// <summary>
        /// Demo routine: waits 500 ms, then awaits the pause token, writing a console line
        /// before and after that await.
        /// </summary>
        /// <param name="pause">Token whose pause state is awaited.</param>
        /// <returns>A task that completes after the second console line was written.</returns>
        public static async Task SomeMethodAsync(PauseToken pause)
        {
            const string beforeMessage = "Before await pause.WaitWhilePausedAsync()";
            const string afterMessage  = "After await pause.WaitWhilePausedAsync()";

            await Task.Delay(TimeSpan.FromMilliseconds(500));

            Console.WriteLine(beforeMessage);
            await pause.WaitWhilePausedWithResponseAsyc();
            Console.WriteLine(afterMessage);
        }
示例#9
0
        /// <summary>
        /// Demo routine that never returns normally: once per second it awaits the pause token,
        /// writing a console line before and after that await.
        /// </summary>
        /// <param name="pause">Token whose pause state is awaited in every round.</param>
        /// <returns>A task that only ends if one of the awaited operations throws.</returns>
        public static async Task SomeMethodAsync(PauseToken pause)
        {
            const string beforeMessage = "Before await pause.WaitWhilePausedAsync()";
            const string afterMessage  = "After await pause.WaitWhilePausedAsync()";

            for (;;)
            {
                await Task.Delay(TimeSpan.FromMilliseconds(1000)).ConfigureAwait(false);

                Console.WriteLine(beforeMessage);
                await pause.WaitWhilePausedWithResponseAsyc();
                Console.WriteLine(afterMessage);
            }
        }
示例#10
0
        /// <summary>
        /// Worker loop: repeatedly picks the next queued blog that is not yet active, probes it
        /// with a crawler and either starts its download or removes it from the queue. When no
        /// item is waiting, the loop sleeps for four seconds.
        /// </summary>
        /// <param name="pt">Suspends the loop while paused; also handed to crawler and downloader.</param>
        /// <param name="ct">Ends the loop when cancellation is requested.</param>
        /// <returns>A task that completes when the loop has ended.</returns>
        private async Task RunCrawlerTasksAsync(PauseToken pt, CancellationToken ct)
        {
            while (!ct.IsCancellationRequested)
            {
                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): do not block a thread inside an async method.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                QueueListItem nextQueueItem = null;

                // lock releases the monitor on every exit path, including exceptions thrown by
                // First() or the online check; the manual Enter/Exit pairs leaked it in that case.
                lock (_lockObject)
                {
                    if (_crawlerService.ActiveItems.Count < QueueManager.Items.Count)
                    {
                        QueueListItem candidate = QueueManager.Items.Except(_crawlerService.ActiveItems).First();

                        ICrawler crawler = _crawlerFactory.GetCrawler(candidate.Blog, new Progress <DownloadProgress>(), pt, ct);
                        try
                        {
                            // Give the online check at most four seconds.
                            crawler.IsBlogOnlineAsync().Wait(4000);
                        }
                        finally
                        {
                            crawler.Dispose();
                        }

                        bool alreadyActive = _crawlerService.ActiveItems.Any(item =>
                                                                             item.Blog.Name.Equals(candidate.Blog.Name) &&
                                                                             item.Blog.BlogType.Equals(candidate.Blog.BlogType));

                        if (alreadyActive || !candidate.Blog.Online)
                        {
                            // Duplicate or offline blog: drop it and look at the next item right away.
                            QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(candidate));
                            continue;
                        }

                        _crawlerService.AddActiveItems(candidate);
                        nextQueueItem = candidate;
                    }
                }

                // Awaits happen outside the lock.
                if (nextQueueItem != null)
                {
                    await StartSiteSpecificDownloaderAsync(nextQueueItem, pt, ct);
                }
                else
                {
                    await Task.Delay(4000, ct);
                }
            }
        }
示例#11
0
        /// <summary>
        /// Consumes <c>postQueue</c> and downloads every post via <c>DownloadPostAsync</c>,
        /// limiting the number of simultaneous downloads with one semaphore for all posts and
        /// an additional one for posts whose exact type is <c>VideoPost</c>.
        /// </summary>
        /// <remarks>
        /// Failures of individual downloads are written to the debug output and otherwise ignored.
        /// After all downloads finished, the last-downloaded markers are cleared and <c>files</c>
        /// is saved.
        /// </remarks>
        /// <returns><c>false</c> if awaiting the started downloads threw; otherwise <c>true</c>.</returns>
        public virtual async Task <bool> DownloadBlogAsync()
        {
            // The integer divisions yield 0 when there are more active items than connections
            // (and throw when there are no active items). A semaphore with zero slots would make
            // the first WaitAsync below wait forever, so clamp to at least 1.
            int activeItemCount = Math.Max(1, crawlerService.ActiveItems.Count);
            var concurrentConnectionsSemaphore      = new SemaphoreSlim(Math.Max(1, shellService.Settings.ConcurrentConnections / activeItemCount));
            var concurrentVideoConnectionsSemaphore = new SemaphoreSlim(Math.Max(1, shellService.Settings.ConcurrentVideoConnections / activeItemCount));
            var trackedTasks     = new List <Task>();
            var completeDownload = true;

            blog.CreateDataFolder();

            foreach (TumblrPost downloadItem in postQueue.GetConsumingEnumerable())
            {
                if (downloadItem.GetType() == typeof(VideoPost))
                {
                    await concurrentVideoConnectionsSemaphore.WaitAsync();
                }
                await concurrentConnectionsSemaphore.WaitAsync();

                if (ct.IsCancellationRequested)
                {
                    break;
                }
                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): do not block a thread inside an async method.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                trackedTasks.Add(new Func <Task>(async() =>
                {
                    try
                    {
                        await DownloadPostAsync(downloadItem);
                    }
                    catch (Exception e)
                    {
                        // Deliberately not rethrown (best effort), but no longer invisible.
                        System.Diagnostics.Debug.WriteLine(e.ToString());
                    }
                    finally
                    {
                        concurrentConnectionsSemaphore.Release();
                        if (downloadItem.GetType() == typeof(VideoPost))
                        {
                            concurrentVideoConnectionsSemaphore.Release();
                        }
                    }
                })());
            }

            try
            {
                await Task.WhenAll(trackedTasks);
            }
            catch
            {
                completeDownload = false;
            }

            blog.LastDownloadedPhoto = null;
            blog.LastDownloadedVideo = null;

            files.Save();

            return(completeDownload);
        }
示例#12
0
        /// <summary>
        /// Feeds <paramref name="response"/> to all url extraction helpers and then keeps
        /// requesting further pages via <c>GetSvcPageAsync</c>, until cancellation is requested
        /// or a page contains no posts.
        /// </summary>
        /// <param name="response">Deserialized first page to process.</param>
        /// <param name="crawlerNumber">
        /// Page index used for the next request; increased by <c>ConcurrentScans</c> after each request.
        /// </param>
        /// <returns>A task that completes when the loop has ended.</returns>
        private async Task AddUrlsToDownloadList(TumblrJson response, int crawlerNumber)
        {
            while (true)
            {
                if (ct.IsCancellationRequested)
                {
                    return;
                }
                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): do not block a thread inside an async method.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                try
                {
                    AddPhotoUrlToDownloadList(response);
                    AddVideoUrlToDownloadList(response);
                    AddAudioUrlToDownloadList(response);
                    AddTextUrlToDownloadList(response);
                    AddQuoteUrlToDownloadList(response);
                    AddLinkUrlToDownloadList(response);
                    AddConversationUrlToDownloadList(response);
                    AddAnswerUrlToDownloadList(response);
                    AddPhotoMetaUrlToDownloadList(response);
                    AddVideoMetaUrlToDownloadList(response);
                    AddAudioMetaUrlToDownloadList(response);
                    await AddExternalPhotoUrlToDownloadList(response);
                }
                catch (NullReferenceException)
                {
                    // Best effort: a NullReferenceException ends the extraction for this page
                    // (helpers after the failing one are skipped) but not the crawl.
                }

                Interlocked.Increment(ref numberOfPagesCrawled);
                UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);

                // Second argument is page size times page index (presumably the post offset).
                string document = await GetSvcPageAsync(blog.PageSize.ToString(), (blog.PageSize *crawlerNumber).ToString());

                response = ConvertJsonToClass <TumblrJson>(document);
                if (!response.response.posts.Any())
                {
                    return;
                }

                crawlerNumber += shellService.Settings.ConcurrentScans;
            }
        }
示例#13
0
        /// <summary>
        /// Worker loop: repeatedly picks the next queued blog that is not yet active, checks
        /// whether it is online and either starts its download or removes it from the queue.
        /// When no item is waiting, the loop sleeps for four seconds.
        /// </summary>
        /// <param name="ct">Cancelling this token ends the loop with an <see cref="OperationCanceledException"/>.</param>
        /// <param name="pt">Suspends the loop while paused; also handed to the downloader.</param>
        /// <returns>A task that ends when cancellation is observed or an operation throws.</returns>
        private async Task RunCrawlerTasks(CancellationToken ct, PauseToken pt)
        {
            while (true)
            {
                ct.ThrowIfCancellationRequested();

                if (pt.IsPaused)
                {
                    // NOTE(review): this blocks the calling thread, but Wait(ct) is what lets a
                    // paused worker be cancelled, so it is kept as is.
                    pt.WaitWhilePausedWithResponseAsyc().Wait(ct);
                }

                QueueListItem nextQueueItem = null;

                // lock releases the monitor on every exit path, including exceptions thrown by
                // First() or the online check; the manual Enter/Exit pairs leaked it in that case.
                lock (lockObject)
                {
                    if (crawlerService.ActiveItems.Count() < QueueManager.Items.Count())
                    {
                        QueueListItem candidate = QueueManager.Items.Except(crawlerService.ActiveItems).First();
                        IBlog         blog      = candidate.Blog;

                        IDownloader downloader = DownloaderFactory.GetDownloader(blog.BlogType, shellService, crawlerService, blog);

                        // Give the online check at most four seconds.
                        downloader.IsBlogOnlineAsync().Wait(4000);

                        // NOTE(review): Contains also matches active blogs whose name merely
                        // includes the queued name; confirm whether Equals was intended.
                        bool alreadyActive = crawlerService.ActiveItems.Any(item => item.Blog.Name.Contains(candidate.Blog.Name));

                        if (alreadyActive || !candidate.Blog.Online)
                        {
                            // Duplicate or offline blog: drop it and look at the next item right away.
                            QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(candidate));
                            continue;
                        }

                        crawlerService.AddActiveItems(candidate);
                        nextQueueItem = candidate;
                    }
                }

                // Awaits happen outside the lock.
                if (nextQueueItem != null)
                {
                    await StartSiteSpecificDownloader(nextQueueItem, ct, pt);
                }
                else
                {
                    await Task.Delay(4000, ct);
                }
            }
        }
示例#14
0
        /// <summary>
        /// Requests tagged-search pages starting at <paramref name="pagination"/>, extracts photo
        /// and video urls from each page and follows the next-page link taken from the page.
        /// </summary>
        /// <remarks>
        /// The loop ends on cancellation, when a page contains the "no_posts_found" marker, when
        /// the next pagination value drops below <paramref name="nextCrawlersPagination"/>, or
        /// when <c>CheckIfWithinTimespan</c> rejects it.
        /// </remarks>
        /// <param name="pagination">Pagination value of the first page to request.</param>
        /// <param name="nextCrawlersPagination">Lower bound for pagination values handled by this loop.</param>
        /// <returns>A task that completes when the loop has ended.</returns>
        private async Task AddUrlsToDownloadList(long pagination, long nextCrawlersPagination)
        {
            while (true)
            {
                if (ct.IsCancellationRequested)
                {
                    return;
                }

                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): do not block a thread inside an async method.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                string document = await GetTaggedSearchPageAsync(pagination);

                if (document.Contains("<div class=\"no_posts_found\""))
                {
                    return;
                }

                try
                {
                    AddPhotoUrlToDownloadList(document);
                    AddVideoUrlToDownloadList(document);
                }
                catch (NullReferenceException)
                {
                    // Best effort: a page that cannot be parsed is skipped.
                }

                Interlocked.Increment(ref numberOfPagesCrawled);
                UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);
                pagination = ExtractNextPageLink(document);
                if (pagination < nextCrawlersPagination)
                {
                    return;
                }
                if (!CheckIfWithinTimespan(pagination))
                {
                    return;
                }
                //if (!string.IsNullOrEmpty(blog.DownloadPages))
                //    return;
            }
        }
示例#15
0
        /// <summary>
        /// Deserializes <paramref name="response"/>, extracts photo and video urls from its
        /// unescaped <c>posts_html</c> and then keeps requesting the search page that lies
        /// <c>ConcurrentScans</c> pages further on.
        /// </summary>
        /// <remarks>
        /// The loop ends on cancellation, when a response has no <c>posts_html</c>, or after the
        /// first page if <c>blog.DownloadPages</c> is set.
        /// </remarks>
        /// <param name="response">JSON text of the first page to process.</param>
        /// <param name="crawlerNumber">Page number belonging to <paramref name="response"/>.</param>
        /// <returns>A task that completes when the loop has ended.</returns>
        private async Task AddUrlsToDownloadList(string response, int crawlerNumber)
        {
            while (true)
            {
                if (ct.IsCancellationRequested)
                {
                    return;
                }

                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): do not block a thread inside an async method.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                var result = ConvertJsonToClass <TumblrSearchJson>(response);
                if (string.IsNullOrEmpty(result.response.posts_html))
                {
                    return;
                }

                try
                {
                    string html = result.response.posts_html;
                    html = Regex.Unescape(html);
                    AddPhotoUrlToDownloadList(html);
                    AddVideoUrlToDownloadList(html);
                }
                catch (NullReferenceException)
                {
                    // Best effort: a page that cannot be parsed is skipped.
                }

                if (!string.IsNullOrEmpty(blog.DownloadPages))
                {
                    return;
                }

                Interlocked.Increment(ref numberOfPagesCrawled);
                UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);
                response = await GetSearchPageAsync((crawlerNumber + shellService.Settings.ConcurrentScans));

                crawlerNumber += shellService.Settings.ConcurrentScans;
            }
        }
示例#16
0
        /// <summary>
        /// Demo routine that never returns normally: once per second it awaits the pause token,
        /// writing a console line before and after that await. Any exception is written to the
        /// console and rethrown.
        /// </summary>
        /// <param name="pause">Token whose pause state is awaited in every round.</param>
        /// <returns>A task that faults with the exception that ended the loop.</returns>
        public static async Task SomeMethodAsync(PauseToken pause)
        {
            const string beforeMessage = "Before await pause.WaitWhilePausedAsync()";
            const string afterMessage  = "After await pause.WaitWhilePausedAsync()";

            try
            {
                for (;;)
                {
                    await Task.Delay(TimeSpan.FromMilliseconds(1000)).ConfigureAwait(false);

                    Console.WriteLine(beforeMessage);
                    await pause.WaitWhilePausedWithResponseAsyc();
                    Console.WriteLine(afterMessage);
                }
            }
            catch (Exception error)
            {
                Console.WriteLine("Exception: {0}", error);
                throw;
            }
        }
        /// <summary>
        /// Requests blog pages addressed by <paramref name="crawlerNumber"/> and
        /// <paramref name="pagination"/>, extracts photo and video urls from each page and follows
        /// the next-page link taken from the page.
        /// </summary>
        /// <remarks>
        /// The loop ends on cancellation, when a page contains the "no_posts_found" marker, or
        /// when <c>CheckIfWithinTimespan</c> rejects the next pagination value.
        /// </remarks>
        /// <param name="pagination">Pagination value of the first page to request.</param>
        /// <param name="crawlerNumber">Page number of the first page; incremented after every page.</param>
        /// <returns>A task that completes when the loop has ended.</returns>
        private async Task AddUrlsToDownloadList(long pagination, int crawlerNumber)
        {
            while (true)
            {
                if (ct.IsCancellationRequested)
                {
                    return;
                }
                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): do not block a thread inside an async method.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                string document = await RequestDataAsync(blog.Url + "/page/" + crawlerNumber + "/" + pagination, "https://www.tumblr.com/", "https://" + blog.Name.Replace("+", "-") + ".tumblr.com");

                if (document.Contains("<div class=\"no_posts_found\""))
                {
                    return;
                }

                try
                {
                    AddPhotoUrlToDownloadList(document);
                    AddVideoUrlToDownloadList(document);
                }
                catch (NullReferenceException)
                {
                    // Best effort: a page that cannot be parsed is skipped.
                }

                Interlocked.Increment(ref numberOfPagesCrawled);
                UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);
                pagination = ExtractNextPageLink(document);
                crawlerNumber++;
                if (!CheckIfWithinTimespan(pagination))
                {
                    return;
                }
                //if (CheckIfPagecountReached(numberOfPagesCrawled))
                //    return;
            }
        }
示例#18
0
        /// <summary>
        /// Consumes <c>producerConsumerCollection</c> and downloads every post via
        /// <c>DownloadPostAsync</c>, limiting the number of simultaneous downloads with a semaphore.
        /// </summary>
        /// <remarks>
        /// Failures of individual downloads are written to the debug output and otherwise ignored.
        /// After all downloads finished, the last-downloaded markers are cleared and <c>files</c>
        /// is saved.
        /// </remarks>
        /// <param name="progress">Forwarded to <c>DownloadPostAsync</c>.</param>
        /// <param name="ct">When cancelled, no further downloads are started.</param>
        /// <param name="pt">Suspends the scheduling of downloads while paused.</param>
        /// <returns><c>false</c> if awaiting the started downloads threw; otherwise <c>true</c>.</returns>
        protected virtual async Task <bool> DownloadBlogAsync(IProgress <DataModels.DownloadProgress> progress, CancellationToken ct,
                                                              PauseToken pt)
        {
            // The integer division yields 0 when there are more active items than parallel
            // slots (and throws when there are no active items). A semaphore with zero slots
            // would make the first WaitAsync below wait forever, so clamp to at least 1.
            int activeItemCount  = Math.Max(1, crawlerService.ActiveItems.Count);
            var semaphoreSlim    = new SemaphoreSlim(Math.Max(1, shellService.Settings.ParallelImages / activeItemCount));
            var trackedTasks     = new List <Task>();
            var completeDownload = true;

            CreateDataFolder();

            foreach (TumblrPost downloadItem in producerConsumerCollection.GetConsumingEnumerable())
            {
                await semaphoreSlim.WaitAsync();

                if (ct.IsCancellationRequested)
                {
                    break;
                }
                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): do not block a thread inside an async method.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                trackedTasks.Add(new Func <Task>(async() =>
                {
                    try
                    {
                        await DownloadPostAsync(progress, ct, downloadItem);
                    }
                    catch (Exception e)
                    {
                        // Deliberately not rethrown (best effort), but no longer invisible.
                        System.Diagnostics.Debug.WriteLine(e.ToString());
                    }
                    finally
                    {
                        semaphoreSlim.Release();
                    }
                })());
            }

            try
            {
                await Task.WhenAll(trackedTasks);
            }
            catch
            {
                completeDownload = false;
            }

            blog.LastDownloadedPhoto = null;
            blog.LastDownloadedVideo = null;

            files.Save();

            return(completeDownload);
        }
        /// <summary>
        /// Consumes <c>jsonQueue</c> until it is completed or cancellation is requested, starts
        /// <c>DownloadPostAsync</c> for every dequeued item and then awaits all started downloads.
        /// </summary>
        /// <returns>
        /// A task that completes once every started download has finished; it faults if one of
        /// the downloads faults.
        /// </returns>
        public virtual async Task DownloadCrawlerDataAsync()
        {
            var trackedTasks = new List <Task>();

            blog.CreateDataFolder();

            foreach (TumblrCrawlerData <T> downloadItem in jsonQueue.GetConsumingEnumerable())
            {
                if (ct.IsCancellationRequested)
                {
                    break;
                }

                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): do not block a thread inside an async method.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                trackedTasks.Add(DownloadPostAsync(downloadItem));
            }

            await Task.WhenAll(trackedTasks);
        }
示例#20
0
        /// <summary>
        /// Requests the api pages returned by <c>GetPageNumbers</c> with at most
        /// <c>ParallelScans</c> requests in flight, adds the urls found on each page to the
        /// download list and finally marks <c>producerConsumerCollection</c> as complete.
        /// </summary>
        /// <param name="progress">Receives the number of crawled posts after each page.</param>
        /// <param name="ct">When cancelled, no further pages are requested.</param>
        /// <param name="pt">Suspends the scheduling of page requests while paused.</param>
        /// <returns>
        /// The value of <c>GetHighestPostId</c> and a flag telling whether a request failed with
        /// a <see cref="WebException"/> whose message contains "429".
        /// </returns>
        private async Task <Tuple <ulong, bool> > GetUrlsAsync(IProgress <DownloadProgress> progress, CancellationToken ct, PauseToken pt)
        {
            var semaphoreSlim        = new SemaphoreSlim(shellService.Settings.ParallelScans);
            var trackedTasks         = new List <Task>();
            int numberOfPostsCrawled = 0;
            var apiLimitHit          = false;
            var completeGrab         = true;

            ulong lastId = GetLastPostId();

            await UpdateTotalPostCount();

            int totalPosts = blog.Posts;

            ulong highestId = await GetHighestPostId();

            foreach (int pageNumber in GetPageNumbers())
            {
                await semaphoreSlim.WaitAsync();

                if (!completeGrab)
                {
                    break;
                }

                if (ct.IsCancellationRequested)
                {
                    break;
                }
                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): do not block a thread inside an async method.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                trackedTasks.Add(new Func <Task>(async() =>
                {
                    try
                    {
                        XDocument document = await GetApiPageAsync(pageNumber);

                        // NOTE(review): written by several page tasks; a task that finishes late
                        // can overwrite the value another task has set.
                        completeGrab = CheckPostAge(document, lastId);

                        var tags = new List <string>();
                        if (!string.IsNullOrWhiteSpace(blog.Tags))
                        {
                            tags = blog.Tags.Split(',').Select(x => x.Trim()).ToList();
                        }

                        AddUrlsToDownloadList(document, tags);
                    }
                    catch (WebException webException)
                    {
                        // Other WebExceptions are ignored; only the rate limit is reported.
                        if (webException.Message.Contains("429"))
                        {
                            // TODO: add retry logic?
                            apiLimitHit = true;
                            Logger.Error("TumblrDownloader:GetUrls:WebException {0}", webException);
                            shellService.ShowError(webException, Resources.LimitExceeded, blog.Name);
                        }
                    }
                    finally
                    {
                        semaphoreSlim.Release();
                    }

                    // Several page tasks may get here at the same time; a plain += could lose updates.
                    int crawledSoFar = Interlocked.Add(ref numberOfPostsCrawled, blog.PageSize);
                    UpdateProgressQueueInformation(progress, Resources.ProgressGetUrlLong, crawledSoFar, totalPosts);
                })());
            }
            await Task.WhenAll(trackedTasks);

            producerConsumerCollection.CompleteAdding();

            if (!ct.IsCancellationRequested && completeGrab)
            {
                UpdateBlogStats();
            }

            return(new Tuple <ulong, bool>(highestId, apiLimitHit));
        }
示例#21
0
        /// <summary>
        /// Consumes <c>producerConsumerCollection</c> and dispatches every item to the download
        /// routine matching its post type, limiting the number of simultaneous downloads with a
        /// semaphore.
        /// </summary>
        /// <remarks>
        /// A failing download faults the returned task once all started downloads have ended.
        /// On success the last-downloaded markers are cleared and <c>files</c> is saved.
        /// </remarks>
        /// <param name="progress">Forwarded to the individual download routines.</param>
        /// <param name="ct">When cancelled, no further downloads are started and <c>false</c> is returned.</param>
        /// <param name="pt">Suspends the scheduling of downloads while paused.</param>
        /// <returns><c>false</c> if the loop was left because of cancellation; otherwise <c>true</c>.</returns>
        protected virtual async Task <bool> DownloadBlogAsync(IProgress <DataModels.DownloadProgress> progress, CancellationToken ct,
                                                              PauseToken pt)
        {
            // The integer division yields 0 when there are more active items than parallel
            // slots (and throws when there are no active items). A semaphore with zero slots
            // would make the first WaitAsync below wait forever, so clamp to at least 1.
            int activeItemCount  = Math.Max(1, crawlerService.ActiveItems.Count);
            var semaphoreSlim    = new SemaphoreSlim(Math.Max(1, shellService.Settings.ParallelImages / activeItemCount));
            var trackedTasks     = new List <Task>();
            var completeDownload = true;

            CreateDataFolder();

            foreach (Tuple <PostTypes, string, string> downloadItem in producerConsumerCollection.GetConsumingEnumerable())
            {
                await semaphoreSlim.WaitAsync();

                if (ct.IsCancellationRequested)
                {
                    completeDownload = false;
                    break;
                }
                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): do not block a thread inside an async method.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                trackedTasks.Add(new Func <Task>(async() =>
                {
                    try
                    {
                        switch (downloadItem.Item1)
                        {
                        case PostTypes.Photo:
                            await DownloadPhotoAsync(progress, downloadItem, ct);
                            break;

                        case PostTypes.Video:
                            await DownloadVideoAsync(progress, downloadItem, ct);
                            break;

                        case PostTypes.Audio:
                            await DownloadAudioAsync(progress, downloadItem, ct);
                            break;

                        case PostTypes.Text:
                            DownloadText(progress, downloadItem);
                            break;

                        case PostTypes.Quote:
                            DownloadQuote(progress, downloadItem);
                            break;

                        case PostTypes.Link:
                            DownloadLink(progress, downloadItem);
                            break;

                        case PostTypes.Conversation:
                            DownloadConversation(progress, downloadItem);
                            break;

                        case PostTypes.PhotoMeta:
                            DownloadPhotoMeta(progress, downloadItem);
                            break;

                        case PostTypes.VideoMeta:
                            DownloadVideoMeta(progress, downloadItem);
                            break;

                        case PostTypes.AudioMeta:
                            DownloadAudioMeta(progress, downloadItem);
                            break;

                        default:
                            break;
                        }
                    }
                    finally
                    {
                        // Release in finally: a download that throws must still give its slot
                        // back, otherwise the WaitAsync above eventually never completes.
                        semaphoreSlim.Release();
                    }
                })());
            }
            await Task.WhenAll(trackedTasks);

            blog.LastDownloadedPhoto = null;
            blog.LastDownloadedVideo = null;

            files.Save();

            return(completeDownload);
        }
示例#22
0
        /// <summary>
        /// Worker loop: repeatedly picks the next queued blog that is not yet active, probes it
        /// with a crawler and either starts its download or removes it from the queue. When no
        /// item is waiting, the loop sleeps for four seconds.
        /// </summary>
        /// <remarks>
        /// Exceptions raised while selecting or downloading an item are logged and shown to the
        /// user, after which the loop carries on. A cancellation of <paramref name="ct"/> ends
        /// the loop without an error message.
        /// </remarks>
        /// <param name="pt">Suspends the loop while paused; also handed to crawler and downloader.</param>
        /// <param name="ct">Ends the loop when cancellation is requested.</param>
        /// <returns>A task that completes when the loop has ended.</returns>
        private async Task RunCrawlerTasksAsync(PauseToken pt, CancellationToken ct)
        {
            while (true)
            {
                if (ct.IsCancellationRequested)
                {
                    break;
                }

                if (pt.IsPaused)
                {
                    // Await instead of .Wait(): do not block a thread inside an async method.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                try
                {
                    QueueListItem nextQueueItem = null;

                    // lock releases the monitor on every exit path, which replaces the manual
                    // Monitor.Enter/Exit calls and their lockTaken bookkeeping.
                    lock (_lockObject)
                    {
                        if (_crawlerService.ActiveItems.Count < QueueManager.Items.Count)
                        {
                            QueueListItem candidate = QueueManager.Items.Except(_crawlerService.ActiveItems).First();

                            ICrawler crawler = _crawlerFactory.GetCrawler(candidate.Blog, new Progress <DownloadProgress>(), pt, ct);
                            try
                            {
                                // Give the online check at most four seconds.
                                crawler.IsBlogOnlineAsync().Wait(4000);
                            }
                            finally
                            {
                                crawler.Dispose();
                            }

                            bool alreadyActive = _crawlerService.ActiveItems.Any(item =>
                                                                                 item.Blog.Name.Equals(candidate.Blog.Name) &&
                                                                                 item.Blog.BlogType.Equals(candidate.Blog.BlogType));

                            if (alreadyActive || !candidate.Blog.Online)
                            {
                                // Duplicate or offline blog: drop it and look at the next item right away.
                                QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(candidate));
                                continue;
                            }

                            _crawlerService.AddActiveItems(candidate);
                            nextQueueItem = candidate;
                        }
                    }

                    // Awaits happen outside the lock.
                    if (nextQueueItem != null)
                    {
                        await StartSiteSpecificDownloaderAsync(nextQueueItem, pt, ct);
                    }
                    else
                    {
                        await Task.Delay(4000, ct);
                    }
                }
                catch (Exception e)
                {
                    // A requested cancellation is the normal way to stop this loop, not an error.
                    if (e is OperationCanceledException && ct.IsCancellationRequested)
                    {
                        break;
                    }

                    Logger.Error("CrawlerController.RunCrawlerTasksAsync: {0}", e);
                    _shellService.ShowError(e, "Error starting the next item in the queue.");
                }
            }
        }
示例#23
0
        /// <summary>
        /// Requests the blog's API pages concurrently and hands every page to
        /// <c>AddUrlsToDownloadList</c>, then marks the producer/consumer collection as complete.
        /// </summary>
        /// <param name="progress">Passed on to <c>UpdateProgressQueueInformation</c> after each page.</param>
        /// <param name="ct">No further pages are scheduled once cancellation has been requested.</param>
        /// <param name="pt">Scheduling of further pages is suspended while this token is paused.</param>
        /// <returns>
        /// The value returned by <c>GetHighestPostId</c>, and a flag that is <c>true</c> when a page
        /// request failed with a <see cref="WebException"/> whose message contains "429".
        /// </returns>
        private async Task <Tuple <ulong, bool> > GetUrlsAsync(IProgress <DownloadProgress> progress, CancellationToken ct, PauseToken pt)
        {
            var semaphoreSlim        = new SemaphoreSlim(shellService.Settings.ParallelScans);
            var trackedTasks         = new List <Task>();
            var numberOfPostsCrawled = 0;
            var apiLimitHit          = false;
            var completeGrab         = true;

            ulong lastId = GetLastPostId();

            await UpdateTotalPostCount();

            int totalPosts = blog.Posts;

            ulong highestId = await GetHighestPostId();

            // The Tumblr api v1 shows 50 posts at max, determine the number of pages to crawl
            int totalPages = (totalPosts / 50) + 1;

            foreach (int pageNumber in Enumerable.Range(0, totalPages))
            {
                // Throttle: at most ParallelScans page requests are in flight at once.
                await semaphoreSlim.WaitAsync();

                if (!completeGrab)
                {
                    break;
                }

                if (ct.IsCancellationRequested)
                {
                    break;
                }
                if (pt.IsPaused)
                {
                    // Await instead of blocking the calling thread while paused.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                trackedTasks.Add(new Func <Task>(async() =>
                {
                    try
                    {
                        XDocument document = await GetApiPageAsync(pageNumber);

                        completeGrab = CheckPostAge(document, lastId);

                        AddUrlsToDownloadList(document);
                    }
                    catch (WebException webException)
                    {
                        if (webException.Message.Contains("429"))
                        {
                            // add retry logic?
                            apiLimitHit = true;
                            Logger.Error("TumblrDownloader:GetUrls:WebException {0}", webException);
                            shellService.ShowError(webException, Resources.LimitExceeded, blog.Name);
                        }
                    }
                    finally
                    {
                        semaphoreSlim.Release();
                    }

                    // Several page tasks can finish at the same time; update the shared counter atomically.
                    int crawledSoFar = Interlocked.Add(ref numberOfPostsCrawled, 50);
                    UpdateProgressQueueInformation(progress, Resources.ProgressGetUrl, crawledSoFar, totalPosts);
                })());
            }

            try
            {
                await Task.WhenAll(trackedTasks);
            }
            finally
            {
                // Always signal completion, even when a page task faulted, so that a consumer
                // enumerating the collection is not left waiting for items that never arrive.
                producerConsumerCollection.CompleteAdding();
            }

            if (!ct.IsCancellationRequested && completeGrab)
            {
                UpdateBlogStats();
            }

            return(Tuple.Create(highestId, apiLimitHit));
        }
示例#24
0
        /// <summary>
        /// Queries the blog's API pages in parallel and collects the URLs of the photos and
        /// videos found in its posts.
        /// </summary>
        /// <param name="blog">The blog to crawl. When its <c>Tags</c> value is non-empty, only posts carrying one of the comma separated tags are considered.</param>
        /// <param name="progress">Receives a progress message after each crawled page.</param>
        /// <param name="ct">Stops the parallel loop once cancellation has been requested.</param>
        /// <param name="pt">Blocks the loop iterations while the token is paused.</param>
        /// <returns>The number of distinct URLs found together with the list of those URLs.</returns>
        public Tuple <uint, List <string> > GetImageUrls(TumblrBlog blog, IProgress <DataModels.DownloadProgress> progress, CancellationToken ct, PauseToken pt)
        {
            int           numberOfPostsCrawled = 0;
            List <string> images               = new List <string>();
            object        imagesGate           = new object();

            string url        = GetApiUrl(blog.Name, 1);
            string authHeader = shellService.OAuthManager.GenerateauthHeader(url, "GET");

            var blogDoc = RequestData(url, authHeader);

            int totalPosts = blogDoc.response.blog.total_posts;

            // Generate URL list of Images
            // the api v2 shows 20 posts at max, determine the number of pages to crawl
            int totalPages = (totalPosts / 20) + 1;

            // null means "no tag filter": every photo/video post is collected.
            List <string> tagFilter = string.IsNullOrEmpty(blog.Tags)
                ? null
                : blog.Tags.Split(',').Select(x => x.Trim()).ToList();

            // All worker threads append to the same list, so serialize the writes.
            Action <string> addUrl = mediaUrl =>
            {
                lock (imagesGate)
                {
                    images.Add(mediaUrl);
                }
            };

            // Guard against a zero divisor and against a quotient of 0, both of which
            // would make the parallel options invalid.
            int degreeOfParallelism = Math.Max(1, shellService.Settings.ParallelImages / Math.Max(1, selectionService.ActiveItems.Count));

            Parallel.For(0, totalPages,
                         new ParallelOptions {
                MaxDegreeOfParallelism = degreeOfParallelism
            },
                         (i, state) =>
            {
                if (ct.IsCancellationRequested)
                {
                    // Break() only prevents later iterations from starting; leave this one explicitly.
                    state.Break();
                    return;
                }
                if (pt.IsPaused)
                {
                    pt.WaitWhilePausedWithResponseAsyc().Wait();
                }
                try
                {
                    // get 20 posts per crawl/page
                    // The request data is kept in per-iteration locals so that parallel
                    // iterations cannot overwrite each other's URL or auth header.
                    string pageUrl        = GetApiUrl(blog.Name, 20, i * 20);
                    string pageAuthHeader = shellService.OAuthManager.GenerateauthHeader(pageUrl, "GET");

                    DataModels.TumblrJson document = RequestData(pageUrl, pageAuthHeader);

                    if (shellService.Settings.DownloadImages == true)
                    {
                        foreach (Datamodels.Post post in document.response.posts.Where(posts => posts.type.Equals("photo") && (tagFilter == null || posts.tags.Any(tag => tagFilter.Contains(tag)))))
                        {
                            if (post.photos == null)
                            {
                                continue;
                            }
                            foreach (DataModels.Photo photo in post.photos)
                            {
                                var imageUrl = photo.alt_sizes.ElementAt(shellService.Settings.ImageSizes.IndexOf(shellService.Settings.ImageSize.ToString())).url;
                                if (shellService.Settings.SkipGif == true && imageUrl.EndsWith(".gif"))
                                {
                                    continue;
                                }
                                addUrl(imageUrl);
                            }
                        }
                    }
                    if (shellService.Settings.DownloadVideos == true)
                    {
                        foreach (DataModels.Post post in document.response.posts.Where(posts => posts.type.Equals("video") && (tagFilter == null || posts.tags.Any(tag => tagFilter.Contains(tag)))))
                        {
                            if (shellService.Settings.VideoSize == 1080)
                            {
                                addUrl(post.video_url);
                            }
                            if (shellService.Settings.VideoSize == 480)
                            {
                                // The 480 variant carries a "_480" suffix in front of the file extension.
                                addUrl(post.video_url.Insert(post.video_url.LastIndexOf("."), "_480"));
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: a failing page must not abort the whole crawl.
                    Console.WriteLine(ex);
                }

                int crawledSoFar     = Interlocked.Add(ref numberOfPostsCrawled, 20);
                var newProgress      = new DataModels.DownloadProgress();
                newProgress.Progress = string.Format(CultureInfo.CurrentCulture, Resources.ProgressGetUrl, crawledSoFar, totalPosts);
                progress.Report(newProgress);
            }
                         );

            images = images.Distinct().ToList();

            uint totalImages = (uint)images.Count;
            return(Tuple.Create(totalImages, images));
        }
示例#25
0
        /// <summary>
        /// Worker loop: repeatedly takes the next queue item that is not yet active,
        /// marks it active and, if its blog is a <c>TumblrBlog</c>, crawls it.
        /// </summary>
        /// <param name="ct">Ends the loop by throwing once cancellation has been requested.</param>
        /// <param name="pt">Blocks the loop while the token is paused.</param>
        /// <exception cref="OperationCanceledException">Thrown when <paramref name="ct"/> has been cancelled.</exception>
        private void runCrawlerTasks(CancellationToken ct, PauseToken pt)
        {
            while (true)
            {
                // check if stopped
                if (ct.IsCancellationRequested)
                {
                    throw new OperationCanceledException(ct);
                }

                // check if paused
                if (pt.IsPaused)
                {
                    pt.WaitWhilePausedWithResponseAsyc().Wait();
                }

                object queueGate      = QueueManager.Items;
                bool   queueLockTaken = false;
                try
                {
                    Monitor.Enter(queueGate, ref queueLockTaken);

                    if (selectionService.ActiveItems.Count() >= QueueManager.Items.Count())
                    {
                        // Every queued item is already being processed: release the queue and idle.
                        Monitor.Exit(queueGate);
                        queueLockTaken = false;
                        Task.Delay(4000, ct).Wait();
                        continue;
                    }

                    var blogToCrawlNext = QueueManager.Items.Except(selectionService.ActiveItems).First();

                    lock (selectionService.ActiveItems)
                    {
                        selectionService.AddActiveItems(blogToCrawlNext);
                    }

                    // The item is reserved; do not hold the queue lock while crawling.
                    Monitor.Exit(queueGate);
                    queueLockTaken = false;

                    var blog = blogToCrawlNext.Blog as TumblrBlog;
                    if (blog == null)
                    {
                        // Only Tumblr blogs are crawled here; the item stays marked as active.
                        continue;
                    }

                    IProgress <DataModels.DownloadProgress> progress =
                        new Progress <DataModels.DownloadProgress>(value => blogToCrawlNext.Progress = value.Progress);

                    CrawlCoreTumblrBlog(blog, progress, ct, pt);

                    if (ct.IsCancellationRequested)
                    {
                        // Cancelled: the item stays in the queue but is no longer active.
                        Application.Current.Dispatcher.BeginInvoke(
                            DispatcherPriority.Background,
                            new Action(() => {
                            lock (selectionService.ActiveItems)
                            {
                                selectionService.RemoveActiveItem(blogToCrawlNext);
                            }
                        }));
                        throw new OperationCanceledException(ct);
                    }

                    // Finished: drop the item from both the queue and the active list.
                    Application.Current.Dispatcher.BeginInvoke(
                        DispatcherPriority.Background,
                        new Action(() => {
                        lock (QueueManager.Items)
                        {
                            QueueManager.RemoveItem(blogToCrawlNext);
                        }

                        lock (selectionService.ActiveItems)
                        {
                            selectionService.RemoveActiveItem(blogToCrawlNext);
                        }
                    }));
                }
                finally
                {
                    // Reached with the lock still held only when the critical section threw;
                    // releasing it here keeps other workers from blocking forever.
                    if (queueLockTaken)
                    {
                        Monitor.Exit(queueGate);
                    }
                }
            }
        }
示例#26
0
        /// <summary>
        /// Worker loop: until cancellation is requested, reserves the next startable queue item
        /// and runs its site specific downloader, or waits four seconds when every queued item
        /// is already active.
        /// </summary>
        /// <param name="pt">The loop is suspended while this token is paused.</param>
        /// <param name="ct">Ends the loop once cancellation has been requested.</param>
        private async Task RunCrawlerTasksAsync(PauseToken pt, CancellationToken ct)
        {
            while (!ct.IsCancellationRequested)
            {
                if (pt.IsPaused)
                {
                    // Await instead of blocking the calling thread while paused.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                try
                {
                    bool          nothingToStart;
                    QueueListItem nextQueueItem = TryReserveNextQueueItem(pt, ct, out nothingToStart);

                    if (nextQueueItem != null)
                    {
                        await StartSiteSpecificDownloaderAsync(nextQueueItem, pt, ct);
                    }
                    else if (nothingToStart)
                    {
                        await Task.Delay(4000, ct);
                    }
                    // Otherwise a candidate was skipped or removed: look at the queue again right away.
                }
                catch (Exception e)
                {
                    // Exceptions caused by the cancellation itself are expected and not logged.
                    if (!ct.IsCancellationRequested)
                    {
                        Logger.Error("CrawlerController.RunCrawlerTasksAsync: {0}", e);
                    }
                }
            }
        }

        /// <summary>
        /// Under the queue lock, picks the next queue item that is not active yet, checks that it
        /// may be crawled and marks it as active. The lock is released on every exit path.
        /// </summary>
        /// <param name="pt">Pause token handed to the temporary crawler used for the online check.</param>
        /// <param name="ct">Cancellation token handed to the temporary crawler used for the online check.</param>
        /// <param name="nothingToStart">Set to <c>true</c> when all queued items are already active.</param>
        /// <returns>The reserved item, or <c>null</c> when no item was reserved.</returns>
        private QueueListItem TryReserveNextQueueItem(PauseToken pt, CancellationToken ct, out bool nothingToStart)
        {
            nothingToStart = false;

            lock (_lockObject)
            {
                if (!(_crawlerService.ActiveItems.Count < QueueManager.Items.Count))
                {
                    nothingToStart = true;
                    return null;
                }

                QueueListItem candidate;
                try
                {
                    candidate = QueueManager.Items.Except(_crawlerService.ActiveItems).First();
                }
                catch (InvalidOperationException)
                {
                    // No candidate could be taken from the queue this time; retry on the next pass.
                    return null;
                }
                IBlog blog = candidate.Blog;

                var      privacyConsentNeeded = false;
                ICrawler crawler = _crawlerFactory.GetCrawler(blog, new Progress <DownloadProgress>(), pt, ct);
                try
                {
                    // Bounded wait: give the online check at most four seconds.
                    crawler.IsBlogOnlineAsync().Wait(4000);
                }
                catch (AggregateException ex)
                {
                    if (ex.InnerExceptions.Any(x => x.Message == "Acceptance of privacy consent needed!"))
                    {
                        privacyConsentNeeded = true;
                    }
                }
                finally
                {
                    crawler.Dispose();
                }

                bool alreadyActive = _crawlerService.ActiveItems.Any(item =>
                                                                     item.Blog.Name.Equals(candidate.Blog.Name) &&
                                                                     item.Blog.BlogType.Equals(candidate.Blog.BlogType));

                if (privacyConsentNeeded || alreadyActive || !candidate.Blog.Online)
                {
                    // The item cannot be crawled now: drop it from the queue.
                    QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(candidate));
                    return null;
                }

                _crawlerService.AddActiveItems(candidate);
                return candidate;
            }
        }
示例#27
0
        /// <summary>
        /// Requests the blog's API pages concurrently, hands every parsed page to
        /// <c>AddUrlsToDownloadList</c>, then completes the post and json queues and
        /// updates the blog statistics.
        /// </summary>
        /// <returns>
        /// The value returned by <c>GetHighestPostIdAsync</c>, and a flag that is <c>true</c>
        /// when a page request was answered with HTTP status 429.
        /// </returns>
        private async Task <Tuple <ulong, bool> > GetUrlsAsync()
        {
            var semaphoreSlim = new SemaphoreSlim(shellService.Settings.ConcurrentScans);
            var trackedTasks  = new List <Task>();
            var apiLimitHit   = false;
            var completeGrab  = true;

            await UpdateTotalPostCountAsync();

            int totalPosts = blog.Posts;

            ulong highestId = await GetHighestPostIdAsync();

            foreach (int pageNumber in GetPageNumbers())
            {
                // Throttle: at most ConcurrentScans page requests are in flight at once.
                await semaphoreSlim.WaitAsync();

                if (!completeGrab)
                {
                    break;
                }

                if (ct.IsCancellationRequested)
                {
                    break;
                }
                if (pt.IsPaused)
                {
                    // Await instead of blocking the calling thread while paused.
                    await pt.WaitWhilePausedWithResponseAsyc();
                }

                trackedTasks.Add(new Func <Task>(async() =>
                {
                    try
                    {
                        string document = await GetApiPageAsync(pageNumber);
                        var response    = ConvertJsonToClass <TumblrApiJson>(document);

                        completeGrab = CheckPostAge(response);

                        if (!string.IsNullOrWhiteSpace(blog.Tags))
                        {
                            tags = blog.Tags.Split(',').Select(x => x.Trim()).ToList();
                        }

                        await AddUrlsToDownloadList(response);
                    }
                    catch (WebException webException) when(webException.Response is HttpWebResponse)
                    {
                        // The filter guarantees an HTTP response, so the cast below cannot fail.
                        var webRespStatusCode = (int)((HttpWebResponse)webException.Response).StatusCode;
                        if (webRespStatusCode == 429)
                        {
                            apiLimitHit = true;
                            Logger.Error("TumblrBlogCrawler:GetUrls:WebException {0}", webException);
                            shellService.ShowError(webException, Resources.LimitExceeded, blog.Name);
                        }
                    }
                    catch (TimeoutException timeoutException)
                    {
                        Logger.Error("TumblrBlogCrawler:GetUrls:WebException {0}", timeoutException);
                        shellService.ShowError(timeoutException, Resources.TimeoutReached, Resources.Crawling, blog.Name);
                    }
                    catch
                    {
                        // Best effort: any other failure of a single page is ignored so the crawl continues.
                    }
                    finally
                    {
                        semaphoreSlim.Release();
                    }

                    numberOfPagesCrawled += blog.PageSize;
                    UpdateProgressQueueInformation(Resources.ProgressGetUrlLong, numberOfPagesCrawled, totalPosts);
                })());
            }
            await Task.WhenAll(trackedTasks);

            // Signal the consumers that no further items will be queued.
            postQueue.CompleteAdding();
            jsonQueue.CompleteAdding();

            UpdateBlogStats();

            return(new Tuple <ulong, bool>(highestId, apiLimitHit));
        }