/// <summary>
/// Worker loop that repeatedly takes a <see cref="WebDirectory"/> from <paramref name="queue"/> and indexes it.
/// The handler is chosen from <c>Session.Root.Uri</c>: FTP/FTPS, Google Drive, or plain HTTP through the retry policy.
/// </summary>
/// <param name="queue">Shared queue of directories that still have to be processed.</param>
/// <param name="name">Name of this processor; used in log messages and as key in <c>WebDirectoryProcessorInfo</c> and <c>FtpParser.FtpClients</c>.</param>
/// <param name="cancellationToken">Stops the loop when cancelled. It is also passed to <c>Task.Delay</c>, so a cancellation during the idle delay surfaces as an exception from this method.</param>
/// <returns>A task that completes when the loop ends: cancellation was requested, the queue is empty while no processor is running, or the FTP server reported that the maximum number of connections was reached.</returns>
private async Task WebDirectoryProcessor(ConcurrentQueue<WebDirectory> queue, string name, CancellationToken cancellationToken)
{
    Logger.Debug($"Start [{name}]");

    bool maxConnections = false;

    do
    {
        // Counted as "running" for the whole iteration, so other processors keep polling while this one may still enqueue work.
        Interlocked.Increment(ref RunningWebDirectoryThreads);

        if (queue.TryDequeue(out WebDirectory webDirectory))
        {
            try
            {
                lock (WebDirectoryProcessorInfoLock)
                {
                    WebDirectoryProcessorInfo[name] = webDirectory;
                }

                // URLs that were already processed are skipped silently.
                if (!Session.ProcessedUrls.Contains(webDirectory.Url))
                {
                    Session.ProcessedUrls.Add(webDirectory.Url);
                    webDirectory.StartTime = DateTimeOffset.UtcNow;

                    Logger.Info($"[{name}] Begin processing {webDirectory.Url}");

                    if (Session.Root.Uri.Scheme == Constants.UriScheme.Ftp || Session.Root.Uri.Scheme == Constants.UriScheme.Ftps)
                    {
                        WebDirectory parsedWebDirectory = await FtpParser.ParseFtpAsync(name, webDirectory, OpenDirectoryIndexerSettings.Username, OpenDirectoryIndexerSettings.Password);

                        if (webDirectory.CancellationReason == Constants.Ftp_Max_Connections)
                        {
                            webDirectory.CancellationReason = null;
                            maxConnections = true;

                            if (webDirectory.Name == Constants.Root)
                            {
                                webDirectory.Error = true;

                                // Do NOT decrement RunningWebDirectoryThreads here: this exception is caught by the
                                // catch below and execution continues to the single decrement after the try block.
                                // Decrementing here as well would lower the counter twice for one increment.
                                throw new Exception("Error checking FTP because maximum connections reached");
                            }

                            // Requeue
                            Session.ProcessedUrls.Remove(webDirectory.Url);
                            queue.Enqueue(webDirectory);

                            try
                            {
                                await FtpParser.FtpClients[name].DisconnectAsync(cancellationToken);

                                lock (FtpParser.FtpClients)
                                {
                                    FtpParser.FtpClients.Remove(name);
                                }
                            }
                            catch (Exception exFtpDisconnect)
                            {
                                Logger.Error(exFtpDisconnect, "Error disconnecting FTP connection.");
                            }
                        }

                        if (parsedWebDirectory != null)
                        {
                            DirectoryParser.CheckParsedResults(parsedWebDirectory);
                            AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                        }
                    }
                    else if (Session.Root.Uri.Host == Constants.GoogleDriveDomain)
                    {
                        string baseUrl = webDirectory.Url;

                        WebDirectory parsedWebDirectory = await GoogleDriveIndexer.IndexAsync(webDirectory);
                        parsedWebDirectory.Url = baseUrl;

                        AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                    }
                    else
                    {
                        if (Session.Root.Uri.Host == Constants.BlitzfilesTechDomain || SameHostAndDirectory(Session.Root.Uri, webDirectory.Uri))
                        {
                            Logger.Debug($"[{name}] Start download '{webDirectory.Url}'");
                            Session.TotalHttpRequests++;

                            // Dispose the source once the retry policy is done, otherwise the CancelAfter timer
                            // stays alive for the full 5 minutes for every processed directory.
                            using (CancellationTokenSource cancellationTokenSource = new CancellationTokenSource())
                            {
                                cancellationTokenSource.CancelAfter(TimeSpan.FromMinutes(5));

                                Context pollyContext = new Context
                                {
                                    { "Processor", name },
                                    { "WebDirectory", webDirectory },
                                    { "CancellationTokenSource", cancellationTokenSource }
                                };

                                await RetryPolicy.ExecuteAsync(async (context, token) =>
                                {
                                    await ProcessWebDirectoryAsync(name, webDirectory, cancellationTokenSource.Token);
                                }, pollyContext, cancellationTokenSource.Token);
                            }
                        }
                        else
                        {
                            Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' because it is not the same host or path");
                            Session.Skipped++;
                        }
                    }

                    Logger.Info($"[{name}] Finished processing {webDirectory.Url}");
                }
            }
            catch (TaskCanceledException)
            {
                // Cancelled work (e.g. the 5 minute timeout above) is registered as an error for this URL.
                Session.Errors++;
                webDirectory.Error = true;

                if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                {
                    Session.UrlsWithErrors.Add(webDirectory.Url);
                }

                if (webDirectory.ParentDirectory?.Url != null)
                {
                    Logger.Error($"Skipped processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory.Url}'");
                }
                else
                {
                    Logger.Error($"Skipped processing Url: '{webDirectory.Url}'");
                    Session.Root.Error = true;
                }
            }
            catch (Exception ex)
            {
                Logger.Error(ex, $"Error processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory?.Url}'");
            }
            finally
            {
                lock (WebDirectoryProcessorInfoLock)
                {
                    WebDirectoryProcessorInfo.Remove(name);
                }

                // Only mark as finished when no cancellation reason is set on the directory.
                if (string.IsNullOrWhiteSpace(webDirectory.CancellationReason))
                {
                    webDirectory.Finished = true;
                    webDirectory.FinishTime = DateTimeOffset.UtcNow;
                }
            }
        }

        Interlocked.Decrement(ref RunningWebDirectoryThreads);

        // Needed, because of the TryDequeue, no waiting in ConcurrentQueue!
        if (queue.IsEmpty)
        {
            // Don't hog the CPU when queue < threads
            await Task.Delay(TimeSpan.FromMilliseconds(1000), cancellationToken);
        }
        else
        {
            await Task.Delay(TimeSpan.FromMilliseconds(10), cancellationToken);
        }
    }
    while (!cancellationToken.IsCancellationRequested && (!queue.IsEmpty || RunningWebDirectoryThreads > 0) && !maxConnections);

    Logger.Debug($"Finished [{name}]");
}
/// <summary>
/// Worker loop that repeatedly takes a <see cref="WebDirectory"/> from <paramref name="queue"/> and indexes it.
/// The handler is chosen from <c>Session.Root.Uri</c>: FTP, Google Drive, or an HTTP download (with User-Agent
/// fallbacks and Calibre detection) executed through the retry policy.
/// </summary>
/// <param name="queue">Shared queue of directories that still have to be processed.</param>
/// <param name="name">Name of this processor; used in log messages and as key in <c>WebDirectoryProcessorInfo</c>.</param>
/// <param name="token">Stops the loop once cancellation is requested.</param>
/// <returns>A task that completes when cancellation was requested or the queue is empty while no processor is running.</returns>
private async Task WebDirectoryProcessor(ConcurrentQueue<WebDirectory> queue, string name, CancellationToken token)
{
    Logger.Debug($"Start [{name}]");

    do
    {
        // Counted as "running" for the whole iteration, so other processors keep polling while this one may still enqueue work.
        Interlocked.Increment(ref RunningWebDirectoryThreads);

        if (queue.TryDequeue(out WebDirectory webDirectory))
        {
            try
            {
                lock (WebDirectoryProcessorInfoLock)
                {
                    WebDirectoryProcessorInfo[name] = webDirectory;
                }

                if (!Session.ProcessedUrls.Contains(webDirectory.Url))
                {
                    Session.ProcessedUrls.Add(webDirectory.Url);

                    Logger.Info($"[{name}] Begin processing {webDirectory.Url}");

                    if (Session.Root.Uri.Scheme == "ftp")
                    {
                        WebDirectory parsedWebDirectory = await FtpParser.ParseFtpAsync(name, webDirectory);
                        AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                    }
                    else if (Session.Root.Uri.Host == "drive.google.com")
                    {
                        string baseUrl = webDirectory.Url;

                        WebDirectory parsedWebDirectory = await GoogleDriveIndexer.IndexAsync(webDirectory);
                        parsedWebDirectory.Url = baseUrl;

                        AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                    }
                    else
                    {
                        // Paths are compared ordinally; a culture-sensitive prefix match is not meaningful for URL paths.
                        if (webDirectory.Uri.Host == Session.Root.Uri.Host && webDirectory.Uri.LocalPath.StartsWith(Session.Root.Uri.LocalPath, StringComparison.Ordinal))
                        {
                            Logger.Debug($"[{name}] Start download '{webDirectory.Url}'");
                            Session.TotalHttpRequests++;

                            await RetryPolicy.ExecuteAsync(async () =>
                            {
                                webDirectory.StartTime = DateTimeOffset.UtcNow;

                                HttpResponseMessage httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url);
                                string html = null;

                                if (httpResponseMessage.IsSuccessStatusCode)
                                {
                                    html = await GetHtml(httpResponseMessage);
                                }

                                // Fallback 1: retry with the Curl User-Agent when the first request failed or returned an empty body.
                                if ((FirstRequest && !httpResponseMessage.IsSuccessStatusCode) || (httpResponseMessage.IsSuccessStatusCode && string.IsNullOrWhiteSpace(html)))
                                {
                                    Logger.Warn("First request fails, using Curl fallback User-Agent");
                                    HttpClient.DefaultRequestHeaders.UserAgent.Clear();
                                    HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgent_Curl);
                                    httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url);

                                    if (httpResponseMessage.IsSuccessStatusCode)
                                    {
                                        html = await GetHtml(httpResponseMessage);
                                        Logger.Warn("Yes, this Curl User-Agent did the trick!");
                                    }
                                }

                                // Fallback 2: same check again, now with the Chrome User-Agent.
                                if ((FirstRequest && !httpResponseMessage.IsSuccessStatusCode) || (httpResponseMessage.IsSuccessStatusCode && string.IsNullOrWhiteSpace(html)))
                                {
                                    Logger.Warn("First request fails, using Chrome fallback User-Agent");
                                    HttpClient.DefaultRequestHeaders.UserAgent.Clear();
                                    HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgent_Chrome);
                                    httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url);

                                    if (httpResponseMessage.IsSuccessStatusCode)
                                    {
                                        html = await GetHtml(httpResponseMessage);
                                        Logger.Warn("Yes, the Chrome User-Agent did the trick!");
                                    }
                                }

                                bool calibreDetected = false;
                                string calibreVersionString = string.Empty;

                                if (httpResponseMessage.IsSuccessStatusCode)
                                {
                                    FirstRequest = false;

                                    List<string> serverHeaders = new List<string>();

                                    if (httpResponseMessage.Headers.Contains("Server"))
                                    {
                                        serverHeaders = httpResponseMessage.Headers.GetValues("Server").ToList();
                                        calibreDetected = serverHeaders.Any(h => h.Contains("calibre"));
                                    }

                                    if (calibreDetected)
                                    {
                                        calibreVersionString = string.Join("/", serverHeaders);
                                    }
                                    else
                                    {
                                        if (html == null)
                                        {
                                            html = await GetHtml(httpResponseMessage);
                                        }

                                        // Still UNTESTED against a real Calibre page exposing this marker.
                                        // Extracts the text between the marker's opening quote and the next quote.
                                        const string calibreVersionIdentifier = "CALIBRE_VERSION = \"";
                                        int identifierIndex = html?.IndexOf(calibreVersionIdentifier, StringComparison.Ordinal) ?? -1;

                                        if (identifierIndex >= 0)
                                        {
                                            int versionStart = identifierIndex + calibreVersionIdentifier.Length;
                                            int versionEnd = html.IndexOf('"', versionStart);

                                            // Only report Calibre when a non-empty, properly terminated version was found.
                                            if (versionEnd > versionStart)
                                            {
                                                calibreVersionString = html.Substring(versionStart, versionEnd - versionStart);
                                                calibreDetected = true;
                                            }
                                        }
                                    }
                                }

                                if (calibreDetected)
                                {
                                    Version calibreVersion = CalibreParser.ParseVersion(calibreVersionString);

                                    Console.WriteLine($"Calibre {calibreVersion} detected! I will index it at max 100 books per 30 seconds, else it will break Calibre...");
                                    Logger.Info($"Calibre {calibreVersion} detected! I will index it at max 100 books per 30 seconds, else it will break Calibre...");

                                    await CalibreParser.ParseCalibre(HttpClient, httpResponseMessage.RequestMessage.RequestUri, webDirectory, calibreVersion);

                                    return;
                                }

                                Logger.Debug($"[{name}] Finish download '{webDirectory.Url}'");

                                // Process only same site
                                if (httpResponseMessage.RequestMessage.RequestUri.Host == Session.Root.Uri.Host)
                                {
                                    int httpStatusCode = (int)httpResponseMessage.StatusCode;

                                    if (!Session.HttpStatusCodes.ContainsKey(httpStatusCode))
                                    {
                                        Session.HttpStatusCodes[httpStatusCode] = 0;
                                    }

                                    Session.HttpStatusCodes[httpStatusCode]++;

                                    if (httpResponseMessage.IsSuccessStatusCode)
                                    {
                                        if (html == null)
                                        {
                                            html = await GetHtml(httpResponseMessage);
                                        }

                                        Session.TotalHttpTraffic += html.Length;

                                        WebDirectory parsedWebDirectory = await DirectoryParser.ParseHtml(webDirectory, html, HttpClient);
                                        AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                                    }
                                    else
                                    {
                                        Session.Errors++;
                                        webDirectory.Error = true;

                                        if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                                        {
                                            Session.UrlsWithErrors.Add(webDirectory.Url);
                                        }

                                        // Throws for the failed status code so the retry policy sees the failure.
                                        httpResponseMessage.EnsureSuccessStatusCode();
                                    }
                                }
                                else
                                {
                                    Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' which points to '{httpResponseMessage.RequestMessage.RequestUri}'");
                                    Session.Skipped++;
                                }
                            });
                        }
                        else
                        {
                            Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' because it is not the same host or path");
                            Session.Skipped++;
                        }
                    }

                    Logger.Info($"[{name}] Finished processing {webDirectory.Url}");
                }
                else
                {
                    Logger.Warn($"[{name}] Skip, already processed: {webDirectory.Uri}");
                }
            }
            catch (Exception ex)
            {
                // ParentDirectory may be null (a directory without parent); a NullReferenceException here would
                // escape the loop and skip the decrement below.
                Logger.Error(ex, $"Error processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory?.Url}'");
                Session.Errors++;

                if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                {
                    Session.UrlsWithErrors.Add(webDirectory.Url);
                }
            }
            finally
            {
                lock (WebDirectoryProcessorInfoLock)
                {
                    WebDirectoryProcessorInfo.Remove(name);
                }
            }
        }

        Interlocked.Decrement(ref RunningWebDirectoryThreads);

        // Needed! TryDequeue does not wait, so yield briefly before polling the queue again.
        await Task.Delay(TimeSpan.FromMilliseconds(10));
    }
    while (!token.IsCancellationRequested && (!queue.IsEmpty || RunningWebDirectoryThreads > 0));

    Logger.Debug($"Finished [{name}]");
}
/// <summary>
/// Worker loop that repeatedly takes a <see cref="WebDirectory"/> from <paramref name="queue"/> and indexes it.
/// The handler is chosen from <c>Session.Root.Uri</c>: FTP, Google Drive, or plain HTTP through the retry policy
/// with a 5 minute timeout per directory.
/// </summary>
/// <param name="queue">Shared queue of directories that still have to be processed.</param>
/// <param name="name">Name of this processor; used in log messages and as key in <c>WebDirectoryProcessorInfo</c>.</param>
/// <param name="cancellationToken">Stops the loop once cancellation is requested.</param>
/// <returns>A task that completes when cancellation was requested or the queue is empty while no processor is running.</returns>
private async Task WebDirectoryProcessor(ConcurrentQueue<WebDirectory> queue, string name, CancellationToken cancellationToken)
{
    Logger.Debug($"Start [{name}]");

    do
    {
        // Counted as "running" for the whole iteration, so other processors keep polling while this one may still enqueue work.
        Interlocked.Increment(ref RunningWebDirectoryThreads);

        if (queue.TryDequeue(out WebDirectory webDirectory))
        {
            try
            {
                lock (WebDirectoryProcessorInfoLock)
                {
                    WebDirectoryProcessorInfo[name] = webDirectory;
                }

                // URLs that were already processed are skipped silently.
                if (!Session.ProcessedUrls.Contains(webDirectory.Url))
                {
                    Session.ProcessedUrls.Add(webDirectory.Url);
                    webDirectory.StartTime = DateTimeOffset.UtcNow;

                    Logger.Info($"[{name}] Begin processing {webDirectory.Url}");

                    if (Session.Root.Uri.Scheme == "ftp")
                    {
                        WebDirectory parsedWebDirectory = await FtpParser.ParseFtpAsync(name, webDirectory);
                        AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                    }
                    else if (Session.Root.Uri.Host == Constants.GoogleDriveDomain)
                    {
                        string baseUrl = webDirectory.Url;

                        WebDirectory parsedWebDirectory = await GoogleDriveIndexer.IndexAsync(webDirectory);
                        parsedWebDirectory.Url = baseUrl;

                        AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                    }
                    else
                    {
                        if (SameHostAndDirectory(Session.Root.Uri, webDirectory.Uri))
                        {
                            Logger.Debug($"[{name}] Start download '{webDirectory.Url}'");
                            Session.TotalHttpRequests++;

                            // Dispose the source once the retry policy is done, otherwise the CancelAfter timer
                            // stays alive for the full 5 minutes for every processed directory.
                            using (CancellationTokenSource cancellationTokenSource = new CancellationTokenSource())
                            {
                                cancellationTokenSource.CancelAfter(TimeSpan.FromMinutes(5));

                                Context pollyContext = new Context
                                {
                                    { "Processor", name },
                                    { "WebDirectory", webDirectory },
                                    { "CancellationTokenSource", cancellationTokenSource }
                                };

                                await RetryPolicy.ExecuteAsync(async (context, token) =>
                                {
                                    await ProcessWebDirectoryAsync(name, webDirectory, cancellationTokenSource.Token);
                                }, pollyContext, cancellationTokenSource.Token);
                            }
                        }
                        else
                        {
                            Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' because it is not the same host or path");
                            Session.Skipped++;
                        }
                    }

                    Logger.Info($"[{name}] Finished processing {webDirectory.Url}");
                }
            }
            catch (Exception ex)
            {
                if (ex is TaskCanceledException)
                {
                    // Cancelled work (e.g. the 5 minute timeout above) is only a warning, but still counted below.
                    if (webDirectory.ParentDirectory?.Url != null)
                    {
                        Logger.Warn($"Skipped processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory.Url}'");
                    }
                    else
                    {
                        Logger.Warn($"Skipped processing Url: '{webDirectory.Url}'");
                        Session.Root.Error = true;
                    }
                }
                else
                {
                    Logger.Error(ex, $"Error processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory?.Url}'");
                }

                Session.Errors++;

                if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                {
                    Session.UrlsWithErrors.Add(webDirectory.Url);
                }
            }
            finally
            {
                lock (WebDirectoryProcessorInfoLock)
                {
                    WebDirectoryProcessorInfo.Remove(name);
                }

                webDirectory.Finished = true;
                webDirectory.FinishTime = DateTimeOffset.UtcNow;
            }
        }

        Interlocked.Decrement(ref RunningWebDirectoryThreads);

        // Needed! TryDequeue does not wait, so yield briefly before polling the queue again.
        await Task.Delay(TimeSpan.FromMilliseconds(10));
    }
    while (!cancellationToken.IsCancellationRequested && (!queue.IsEmpty || RunningWebDirectoryThreads > 0));

    Logger.Debug($"Finished [{name}]");
}
/// <summary>
/// Worker loop that repeatedly takes a <see cref="WebDirectory"/> from <paramref name="queue"/> and indexes it.
/// The handler is chosen from <c>Session.Root.Uri</c>: FTP, Google Drive, or plain HTTP through the retry policy.
/// </summary>
/// <param name="queue">Shared queue of directories that still have to be processed.</param>
/// <param name="name">Name of this processor; used in log messages and as key in <c>WebDirectoryProcessorInfo</c>.</param>
/// <param name="token">Stops the loop once cancellation is requested.</param>
/// <returns>A task that completes when cancellation was requested or the queue is empty while no processor is running.</returns>
private async Task WebDirectoryProcessor(ConcurrentQueue<WebDirectory> queue, string name, CancellationToken token)
{
    Logger.Debug($"Start [{name}]");

    do
    {
        // Counted as "running" for the whole iteration, so other processors keep polling while this one may still enqueue work.
        Interlocked.Increment(ref RunningWebDirectoryThreads);

        if (queue.TryDequeue(out WebDirectory webDirectory))
        {
            try
            {
                lock (WebDirectoryProcessorInfoLock)
                {
                    WebDirectoryProcessorInfo[name] = webDirectory;
                }

                if (!Session.ProcessedUrls.Contains(webDirectory.Url))
                {
                    Session.ProcessedUrls.Add(webDirectory.Url);

                    Logger.Info($"[{name}] Begin processing {webDirectory.Url}");

                    if (Session.Root.Uri.Scheme == "ftp")
                    {
                        WebDirectory parsedWebDirectory = await FtpParser.ParseFtpAsync(name, webDirectory);
                        AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                    }
                    else if (Session.Root.Uri.Host == "drive.google.com")
                    {
                        string baseUrl = webDirectory.Url;

                        WebDirectory parsedWebDirectory = await GoogleDriveIndexer.IndexAsync(webDirectory);
                        parsedWebDirectory.Url = baseUrl;

                        AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                    }
                    else
                    {
                        // Paths are compared ordinally; a culture-sensitive prefix match is not meaningful for URL paths.
                        if (webDirectory.Uri.Host == Session.Root.Uri.Host && webDirectory.Uri.LocalPath.StartsWith(Session.Root.Uri.LocalPath, StringComparison.Ordinal))
                        {
                            Logger.Debug($"[{name}] Start download '{webDirectory.Url}'");
                            Session.TotalHttpRequests++;

                            Context pollyContext = new Context
                            {
                                { "Processor", name },
                                { "WebDirectory", webDirectory }
                            };

                            await RetryPolicy.ExecuteAsync(ctx => ProcessWebDirectoryAsync(name, webDirectory), pollyContext);
                        }
                        else
                        {
                            Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' because it is not the same host or path");
                            Session.Skipped++;
                        }
                    }

                    Logger.Info($"[{name}] Finished processing {webDirectory.Url}");
                }
                else
                {
                    Logger.Warn($"[{name}] Skip, already processed: {webDirectory.Uri}");
                }
            }
            catch (Exception ex)
            {
                // ParentDirectory may be null (a directory without parent); a NullReferenceException here would
                // escape the loop and skip the decrement below.
                Logger.Error(ex, $"Error processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory?.Url}'");
                Session.Errors++;

                if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                {
                    Session.UrlsWithErrors.Add(webDirectory.Url);
                }
            }
            finally
            {
                lock (WebDirectoryProcessorInfoLock)
                {
                    WebDirectoryProcessorInfo.Remove(name);
                }
            }
        }

        Interlocked.Decrement(ref RunningWebDirectoryThreads);

        // Needed! TryDequeue does not wait, so yield briefly before polling the queue again.
        await Task.Delay(TimeSpan.FromMilliseconds(10));
    }
    while (!token.IsCancellationRequested && (!queue.IsEmpty || RunningWebDirectoryThreads > 0));

    Logger.Debug($"Finished [{name}]");
}