        //pick the first plugin that claims the url; fall back to the default plugin otherwise
        public async Task DownloadCrawledUrl(ICrawledUrl crawledUrl, string downloadDirectory)
        {
            if (crawledUrl == null)
            {
                throw new ArgumentNullException(nameof(crawledUrl));
            }
            if (downloadDirectory == null)
            {
                throw new ArgumentNullException(nameof(downloadDirectory));
            }

            IPlugin downloadPlugin = _defaultPlugin;

            if (_plugins != null && _plugins.Count > 0)
            {
                foreach (IPlugin plugin in _plugins)
                {
                    if (await plugin.IsSupportedUrl(crawledUrl.Url))
                    {
                        downloadPlugin = plugin;
                        break;
                    }
                }
            }

            await downloadPlugin.Download(crawledUrl, downloadDirectory);
        }
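
        //A minimal sketch of a plugin the routing loop above would pick up, assuming
        //IPlugin only requires IsSupportedUrl and Download (any further interface
        //members, and the DropboxPlugin name itself, are hypothetical):
        internal sealed class DropboxPlugin : IPlugin
        {
            public Task<bool> IsSupportedUrl(string url)
            {
                //claim dropbox links; anything unclaimed falls through to _defaultPlugin
                return Task.FromResult(url.IndexOf("dropbox.com", StringComparison.OrdinalIgnoreCase) >= 0);
            }

            public Task Download(ICrawledUrl crawledUrl, string downloadDirectory)
            {
                //a real plugin would fetch the url into
                //Path.Combine(downloadDirectory, crawledUrl.DownloadPath)
                return Task.CompletedTask;
            }
        }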
        public async Task Download(ICrawledUrl crawledUrl, string downloadDirectory)
        {
            _logger.Debug($"Received new url: {crawledUrl.Url}, download dir: {downloadDirectory}");

            Match match = _googleDriveRegex.Match(crawledUrl.Url);

            if (!match.Success)
            {
                _logger.Error($"Unable to parse google drive url: {crawledUrl.Url}");
                throw new DownloadException($"Unable to parse google drive url: {crawledUrl.Url}");
            }

            string id = match.Groups[1].Value;

            string downloadPath = Path.Combine(downloadDirectory, crawledUrl.DownloadPath);

            try
            {
                //warning: crawledUrl.DownloadPath returns '' in drive's root
                //CreateDirectory is a no-op when the directory already exists
                Directory.CreateDirectory(downloadPath);
            }
            catch (Exception ex)
            {
                throw new DownloadException($"Unable to create directory for file {crawledUrl.Url}", ex);
            }

            //prefix the target with a slice of the drive id so same-named files don't collide
            downloadPath = Path.Combine(downloadPath, $"{id.Substring(id.Length - 6, 5)}_")
                           .TrimEnd(new[] { '/', '\\' });

            _logger.Debug($"Retrieved id: {id}, download path: {downloadPath}");

            try
            {
                //the engine's Download appears synchronous here; Task.Run keeps this
                //async method from blocking its caller (and also unwraps a Task-returning
                //Download, if that is what the engine actually exposes)
                await Task.Run(() => _engine.Download(id, downloadPath, _overwriteFiles));
            }
            catch (Exception ex)
            {
                _logger.Error("GOOGLE DRIVE ERROR: " + ex);
                throw new DownloadException($"Unable to download {crawledUrl.Url}", ex);
            }
        }
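
        //_googleDriveRegex is not shown in this snippet; a plausible pattern for the
        //common Drive share-link shapes (an assumption, not the project's actual field)
        //whose first capture group yields the id consumed above:
        private static readonly Regex _googleDriveRegex =
            new Regex(@"drive\.google\.com/(?:file/d/|open\?id=|uc\?id=)([\w-]+)",
                      RegexOptions.Compiled | RegexOptions.IgnoreCase);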
        public async Task Download(ICrawledUrl crawledUrl, string downloadDirectory)
        {
            if (_megaDownloader == null)
            {
                _logger.Fatal($"Mega downloader initialization failure (check credentials), {crawledUrl.Url} will not be downloaded!");
                return;
            }

            try
            {
                await _megaDownloader.DownloadUrlAsync(crawledUrl, downloadDirectory);
            }
            catch (DownloadException)
            {
                //rethrow with plain 'throw' to preserve the original stack trace
                throw;
            }
            catch (Exception ex)
            {
                throw new Common.Exceptions.DownloadException($"Unable to download {crawledUrl.Url}: {ex}", ex);
            }
        }
        public async Task DownloadUrlAsync(ICrawledUrl crawledUrl, string downloadPath, bool overwriteFiles = false)
        {
            _logger.Debug($"[MEGA] Staring downloading {crawledUrl.Url}");

            Uri uri = new Uri(crawledUrl.Url);

            if (await IsUrlAFolder(uri))
            {
                await DownloadFolderAsync(uri, Path.Combine(downloadPath, crawledUrl.DownloadPath), overwriteFiles);
            }
            else
            {
                (_, string id, _) = MegaUrlDataExtractor.Extract(crawledUrl.Url);
                INodeInfo fileNodeInfo = await _client.GetNodeFromLinkAsync(uri);

                string path = Path.Combine(downloadPath, crawledUrl.DownloadPath,
                                           $"{id.Substring(0, 5)}_{fileNodeInfo.Name}");
                await DownloadFileAsync(null, uri, fileNodeInfo, path, overwriteFiles);
            }

            _logger.Debug($"[MEGA] Finished downloading {crawledUrl.Url}");
        }
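
        //IsUrlAFolder is referenced above but not shown; a minimal sketch, assuming it
        //only needs to distinguish the two public mega.nz link shapes (new-style
        ///folder/ links and legacy #F! links); the real implementation may differ:
        private Task<bool> IsUrlAFolder(Uri uri)
        {
            string url = uri.ToString();
            return Task.FromResult(url.Contains("/folder/") || url.Contains("#F!"));
        }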
        public async Task Download(List<ICrawledUrl> crawledUrls, string downloadDirectory, CancellationToken cancellationToken)
        {
            if (crawledUrls == null)
            {
                throw new ArgumentNullException(nameof(crawledUrls));
            }
            if (string.IsNullOrEmpty(downloadDirectory))
            {
                throw new ArgumentException("Argument cannot be null or empty", nameof(downloadDirectory));
            }

            using (SemaphoreSlim concurrencySemaphore = new SemaphoreSlim(4)) //todo: allow setting the count here (issue #4)
            {
                List<Task> tasks = new List<Task>();
                for (int i = 0; i < crawledUrls.Count; i++)
                {
                    //wait for a free download slot without blocking the calling thread;
                    //WaitAsync also observes cancellation, so no explicit check is needed
                    await concurrencySemaphore.WaitAsync(cancellationToken);

                    int  entryPos = i;
                    Task task     = Task.Run(async () =>
                    {
                        try
                        {
                            ICrawledUrl entry = crawledUrls[entryPos];

                            if (!_urlChecker.IsValidUrl(entry.Url))
                            {
                                _logger.Error($"Invalid url: {entry.Url}");
                                return;
                            }

                            if (_urlChecker.IsBlacklistedUrl(entry.Url))
                            {
                                _logger.Warn($"Url is blacklisted: {entry.Url}");
                                return;
                            }

                            _logger.Debug($"Downloading {entryPos + 1}/{crawledUrls.Count}: {entry.Url}");

                            try
                            {
                                _logger.Debug($"Calling url processor for: {entry.Url}");
                                bool isDownloadAllowed = await _crawledUrlProcessor.ProcessCrawledUrl(entry, downloadDirectory);

                                if (isDownloadAllowed)
                                {
                                    if (string.IsNullOrWhiteSpace(entry.DownloadPath))
                                    {
                                        throw new DownloadException($"Download path is not filled for {entry.Url}");
                                    }

                                    await _pluginManager.DownloadCrawledUrl(entry, downloadDirectory);
                                }
                                else
                                {
                                    _logger.Debug($"ProcessCrawledUrl returned false, {entry.Url} will be skipped");
                                }

                                //TODO: mark isDownloadAllowed = false entries as skipped
                                entry.IsDownloaded = true;
                                OnFileDownloaded(new FileDownloadedEventArgs(entry.Url, crawledUrls.Count));
                            }
                            catch (DownloadException ex)
                            {
                                string logMessage = $"Error while downloading {entry.Url}: {ex.Message}";
                                if (ex.InnerException != null)
                                {
                                    logMessage += $". Inner Exception: {ex.InnerException}";
                                }
                                _logger.Error(logMessage);
                                OnFileDownloaded(new FileDownloadedEventArgs(entry.Url, crawledUrls.Count,
                                                                             false, logMessage));
                            }
                            catch (Exception ex)
                            {
                                throw new UniversalDownloaderPlatformException(
                                    $"Error while downloading {entry.Url}: {ex.Message}", ex);
                            }
                        }
                        finally
                        {
                            concurrencySemaphore.Release();
                        }
                    }, cancellationToken);

                    tasks.Add(task);
                }

                await Task.WhenAll(tasks);

                _logger.Debug("Finished all tasks");
            }
        }
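
        //A hedged usage sketch; IDownloader, RunExample, 'downloader', 'crawledUrls'
        //and the directory below are placeholder names for whatever the host
        //application has already built:
        public async Task RunExample(IDownloader downloader, List<ICrawledUrl> crawledUrls)
        {
            using (var cts = new CancellationTokenSource())
            {
                try
                {
                    await downloader.Download(crawledUrls, @"C:\Downloads", cts.Token);
                }
                catch (OperationCanceledException)
                {
                    //thrown when cts.Cancel() is observed by WaitAsync mid-run
                }
            }
        }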
        public NewCrawledUrlEventArgs(ICrawledUrl crawledUrl)
        {
            _crawledUrl = crawledUrl ?? throw new ArgumentNullException(nameof(crawledUrl));
        }
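
        //Hedged usage sketch: a crawler would typically surface a freshly discovered
        //url through an event built on these args (the event name is an assumption):
        //
        //    NewCrawledUrl?.Invoke(this, new NewCrawledUrlEventArgs(crawledUrl));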