Ejemplo n.º 1
0
        /// <summary>
        /// Builds the scraper's collaborators: a web client and a downloader that share one
        /// cookie container, a checker for already-downloaded data, an HTML parser configured
        /// with CSS support, and the "more" links fixer.
        /// </summary>
        /// <param name="logger">Logger stored on the scraper and handed to the checker and the downloader.</param>
        /// <param name="context">Scrape context stored on the scraper and handed to the existing-download checker.</param>
        /// <param name="options">Scrape options; WorkingDir and DiaryName determine the local directory and the diary URL.</param>
        public DiaryScraperNew(ILogger <DiaryScraperNew> logger, ScrapeContext context, DiaryScraperOptions options)
        {
            _logger          = logger;
            _cookieContainer = new CookieContainer();
            _webClient       = new CF_WebClient(_cookieContainer);
            _context         = context;
            _options         = options;

            // The checker and the downloader both work inside WorkingDir/DiaryName.
            var diaryDirectory = Path.Combine(_options.WorkingDir, _options.DiaryName);
            _downloadExistingChecker = new DownloadExistingChecker(diaryDirectory, context, _logger);

            var diaryBaseUrl = $"http://{_options.DiaryName}.diary.ru";
            _downloader = new DataDownloader(diaryBaseUrl, diaryDirectory, _cookieContainer, _logger);

            // Publish the URL currently being fetched as a progress value, except for images.
            _downloader.BeforeDownload += (sender, args) =>
            {
                if (args.Resource is DiaryImage)
                {
                    return;
                }

                Progress.Values[ScrapeProgressNames.CurrentUrl] = args.Resource.Url.ToLower();
            };
            _downloader.AfterDownload += OnResourceDownloaded;

            var parserConfiguration = new Configuration().WithCss();
            _parser = new HtmlParser(parserConfiguration);

            _moreFixer = new DiaryMoreLinksFixer(_downloader, _options.WorkingDir, _options.DiaryName);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Downloads a single resource, retrying on failure, and saves it to disk when the
        /// resource carries a relative path.
        /// </summary>
        /// <param name="downloadResource">
        /// Resource to fetch. Its Url is used as-is when it starts with "htt"; otherwise it is
        /// resolved against BaseUrl. When RelativePath is non-empty, the downloaded bytes are
        /// written to that path under the diary directory.
        /// </param>
        /// <param name="ignore404">
        /// When true, an HTTP 404 response is not retried: the resource's LocalPath is set to
        /// an empty string and a result with null data is returned.
        /// </param>
        /// <param name="requestDelay">Time in milliseconds to wait before the first request.</param>
        /// <returns>The resource together with the downloaded bytes (null for an ignored 404).</returns>
        /// <exception cref="ArgumentException">The resource is null or has an empty Url.</exception>
        /// <exception cref="WebException">
        /// Rethrown once the number of failed attempts reaches Constants.DownloadRetryCount.
        /// </exception>
        public async Task <DataDownloaderResult> Download(DownloadResource downloadResource, bool ignore404 = true, int requestDelay = 0)
        {
            // Pause between failed attempts, in milliseconds.
            const int RetryDelayMilliseconds = 2000;

            if (downloadResource == null || string.IsNullOrEmpty(downloadResource.Url))
            {
                throw new ArgumentException("Для скачивания должны быть заполнены пути к данным");
            }

            // The URL is passed as a template argument rather than concatenated into the
            // template, so braces inside a URL are not parsed as placeholders.
            _logger.LogInformation("Downloading data: {Url}", downloadResource.Url);

            // Ordinal comparison: URLs are not linguistic text.
            var uri = downloadResource.Url.StartsWith("htt", StringComparison.Ordinal)
                          ? new Uri(downloadResource.Url)
                          : new Uri(new Uri(BaseUrl), downloadResource.Url);

            // An empty path means "do not persist to disk".
            var filePath = string.IsNullOrEmpty(downloadResource.RelativePath)
                            ? string.Empty
                            : Path.Combine(_diaryPath, downloadResource.RelativePath);

            // NOTE(review): if CF_WebClient is IDisposable (it looks like a WebClient subclass),
            // this per-call instance should be disposed — confirm against its declaration.
            var client = new CF_WebClient(_cookieContainer);

            BeforeDownload?.Invoke(this, new DataDownloaderEventArgs {
                Resource = downloadResource
            });

            // Await the delay instead of Thread.Sleep so the calling thread is not blocked.
            await Task.Delay(requestDelay);

            byte[] downloadedData;
            var    retries = 0;

            while (true)
            {
                try
                {
                    downloadedData = await client.DownloadDataTaskAsync(uri);

                    break;
                }
                catch (WebException e)
                {
                    if (e.Status == WebExceptionStatus.ProtocolError && ignore404)
                    {
                        var response = e.Response as HttpWebResponse;
                        if (response != null && response.StatusCode == HttpStatusCode.NotFound)
                        {
                            // A missing resource is an expected outcome here: report it and stop retrying.
                            _logger.LogWarning("Url not found: {Url}", response.ResponseUri.AbsoluteUri);
                            downloadResource.LocalPath = "";
                            return(new DataDownloaderResult {
                                Resource = downloadResource, DownloadedData = null
                            });
                        }
                    }

                    retries += 1;
                    _logger.LogError(e, "Error, retry count: {RetryCount}", retries);

                    if (retries >= Constants.DownloadRetryCount)
                    {
                        throw;
                    }

                    await Task.Delay(RetryDelayMilliseconds);
                }
            }

            // Subscribers are notified before the data is written to disk.
            AfterDownload?.Invoke(this, new DataDownloaderEventArgs {
                Resource = downloadResource, DownloadedData = downloadedData
            });

            if (!string.IsNullOrEmpty(filePath))
            {
                using (var f = File.Create(filePath))
                {
                    await f.WriteAsync(downloadedData, 0, downloadedData.Length);
                }
            }

            return(new DataDownloaderResult {
                Resource = downloadResource, DownloadedData = downloadedData
            });
        }