/// <summary>
/// Downloads the page behind <paramref name="webDirectory"/> and hands the result to the matching parser.
/// </summary>
/// <remarks>
/// Steps, in order:
/// <list type="number">
/// <item>If the session parameters contain <c>Constants.Parameters_GdIndex_RootId</c>, the work is delegated to <c>GdIndexParser.ParseIndex</c> and nothing else happens.</item>
/// <item>The URL is requested with the configured User-Agent (when one is set on the command line), then retried with the Curl and Chrome User-Agents when the fallback conditions below are met.</item>
/// <item>A Calibre server (detected through the <c>Server</c> header or a <c>CALIBRE_VERSION</c> marker in the body) is indexed by <c>CalibreParser.ParseCalibre</c>.</item>
/// <item>Otherwise the HTML is parsed by <c>DirectoryParser.ParseHtml</c>, but only when the final request URI is on the same host as <c>Session.Root</c>.</item>
/// </list>
/// Note that this method mutates <c>HttpClient.DefaultRequestHeaders</c> (User-Agent, Referer).
/// </remarks>
/// <param name="name">Label used as a prefix in log messages.</param>
/// <param name="webDirectory">Directory to fetch. Its <c>Url</c> is replaced by the final request URI when a successful response came from a different URI.</param>
/// <param name="cancellationToken">Token passed to the HTTP requests and to the Calibre parser.</param>
/// <exception cref="HttpRequestException">Raised through <c>EnsureSuccessStatusCode</c> when the final response on the root host is not successful.</exception>
private async Task ProcessWebDirectoryAsync(string name, WebDirectory webDirectory, CancellationToken cancellationToken)
{
    if (Session.Parameters.ContainsKey(Constants.Parameters_GdIndex_RootId))
    {
        await Site.GoIndex.GdIndex.GdIndexParser.ParseIndex(HttpClient, webDirectory, string.Empty);
        return;
    }

    // An explicitly configured User-Agent replaces whatever the client currently sends.
    if (!string.IsNullOrWhiteSpace(OpenDirectoryIndexerSettings.CommandLineOptions.UserAgent))
    {
        HttpClient.DefaultRequestHeaders.UserAgent.Clear();
        HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(OpenDirectoryIndexerSettings.CommandLineOptions.UserAgent);
    }

    HttpResponseMessage httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url, cancellationToken);
    string html = null;

    if (httpResponseMessage.IsSuccessStatusCode)
    {
        SetRootUrl(httpResponseMessage);
        html = await GetHtml(httpResponseMessage);
    }

    // Curl fallback: a failure while FirstRequest is still set, a successful but empty body,
    // or a body that mentions HTTP_USER_AGENT (presumably a page rejecting the User-Agent - TODO confirm).
    if ((FirstRequest && !httpResponseMessage.IsSuccessStatusCode) ||
        (httpResponseMessage.IsSuccessStatusCode && string.IsNullOrWhiteSpace(html)) ||
        html?.Contains("HTTP_USER_AGENT") == true)
    {
        Logger.Warn("First request fails, using Curl fallback User-Agent");

        HttpClient.DefaultRequestHeaders.UserAgent.Clear();
        HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(Constants.UserAgent.Curl);
        httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url, cancellationToken);

        if (httpResponseMessage.IsSuccessStatusCode)
        {
            SetRootUrl(httpResponseMessage);
            html = await GetHtml(httpResponseMessage);
            Logger.Warn("Yes, Curl User-Agent did the trick!");
        }
    }

    // Chrome fallback: same conditions as above, minus the HTTP_USER_AGENT check.
    if ((FirstRequest && !httpResponseMessage.IsSuccessStatusCode) ||
        (httpResponseMessage.IsSuccessStatusCode && string.IsNullOrWhiteSpace(html)))
    {
        Logger.Warn("First request fails, using Chrome fallback User-Agent");

        HttpClient.DefaultRequestHeaders.UserAgent.Clear();
        HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(Constants.UserAgent.Chrome);
        httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url, cancellationToken);

        if (httpResponseMessage.IsSuccessStatusCode)
        {
            SetRootUrl(httpResponseMessage);
            html = await GetHtml(httpResponseMessage);
            Logger.Warn("Yes, Chrome User-Agent did the trick!");
        }
    }

    // The Referer default header is only set once; later calls keep the first value.
    if (!HttpClient.DefaultRequestHeaders.Contains("Referer"))
    {
        HttpClient.DefaultRequestHeaders.Add("Referer", webDirectory.Url);
    }

    bool calibreDetected = false;
    string calibreVersionString = string.Empty;

    if (httpResponseMessage.IsSuccessStatusCode)
    {
        FirstRequest = false;

        List<string> serverHeaders = new List<string>();

        if (httpResponseMessage.Headers.Contains("Server"))
        {
            serverHeaders = httpResponseMessage.Headers.GetValues("Server").ToList();
            calibreDetected = serverHeaders.Any(h => h.Contains("calibre"));
        }

        if (calibreDetected)
        {
            calibreVersionString = string.Join("/", serverHeaders);
        }
        else
        {
            if (html == null)
            {
                html = await GetHtml(httpResponseMessage);
            }

            // Still untested against a real Calibre server that only exposes its version in the page body.
            const string calibreVersionIdentifier = "CALIBRE_VERSION = \"";
            int identifierIndex = html?.IndexOf(calibreVersionIdentifier, StringComparison.Ordinal) ?? -1;

            if (identifierIndex >= 0)
            {
                // The version is the text between the identifier's opening quote and the next quote.
                // Substring takes (start, length), so the length must be computed from both indexes.
                int versionStart = identifierIndex + calibreVersionIdentifier.Length;
                int versionEnd = html.IndexOf('"', versionStart);

                if (versionEnd > versionStart)
                {
                    calibreVersionString = html.Substring(versionStart, versionEnd - versionStart);
                    calibreDetected = true;
                }
            }
        }
    }

    if (calibreDetected)
    {
        Version calibreVersion = CalibreParser.ParseVersion(calibreVersionString);
        string calibreMessage = $"Calibre {calibreVersion} detected! I will index it at max 100 books per 30 seconds, else it will break Calibre...";

        Console.WriteLine(calibreMessage);
        Logger.Info(calibreMessage);

        await CalibreParser.ParseCalibre(HttpClient, httpResponseMessage.RequestMessage.RequestUri, webDirectory, calibreVersion, cancellationToken);
        return;
    }

    // Keep the directory URL in sync with the URI that actually produced the successful response.
    if (httpResponseMessage.IsSuccessStatusCode && webDirectory.Url != httpResponseMessage.RequestMessage.RequestUri.ToString())
    {
        webDirectory.Url = httpResponseMessage.RequestMessage.RequestUri.ToString();
    }

    Logger.Debug($"[{name}] Finish download '{webDirectory.Url}'");

    // Process only same site
    if (httpResponseMessage.RequestMessage.RequestUri.Host == Session.Root.Uri.Host)
    {
        int httpStatusCode = (int)httpResponseMessage.StatusCode;

        if (!Session.HttpStatusCodes.ContainsKey(httpStatusCode))
        {
            Session.HttpStatusCodes[httpStatusCode] = 0;
        }

        Session.HttpStatusCodes[httpStatusCode]++;

        if (httpResponseMessage.IsSuccessStatusCode)
        {
            if (html == null)
            {
                html = await GetHtml(httpResponseMessage);
            }

            Session.TotalHttpTraffic += html.Length;

            WebDirectory parsedWebDirectory = await DirectoryParser.ParseHtml(webDirectory, html, HttpClient);

            // Results produced by the "DirectoryListingModel01" parser are added with processSubdirectories = false.
            bool processSubdirectories = parsedWebDirectory.Parser != "DirectoryListingModel01";
            AddProcessedWebDirectory(webDirectory, parsedWebDirectory, processSubdirectories);
        }
        else
        {
            // Surfaces the failure to the caller as an HttpRequestException.
            httpResponseMessage.EnsureSuccessStatusCode();
        }
    }
    else
    {
        Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' which points to '{httpResponseMessage.RequestMessage.RequestUri}'");
        Session.Skipped++;
    }
}
/// <summary>
/// Worker loop: repeatedly takes a <see cref="WebDirectory"/> from <paramref name="queue"/>, indexes it
/// and registers the result, until cancellation is requested or the queue is empty and no worker is busy.
/// </summary>
/// <remarks>
/// Per dequeued directory the loop picks one of three paths based on <c>Session.Root.Uri</c>:
/// FTP (<c>FtpParser.ParseFtpAsync</c>), Google Drive (<c>GoogleDriveIndexer.IndexAsync</c>) or plain HTTP,
/// which runs inside <c>RetryPolicy.ExecuteAsync</c>. URLs already present in <c>Session.ProcessedUrls</c> are skipped.
/// Any exception while processing a directory is logged and counted in <c>Session.Errors</c>; it does not stop the loop.
/// NOTE(review): the HTTP requests below are issued without <paramref name="token"/>; cancellation is only
/// observed by the loop condition between queue items.
/// </remarks>
/// <param name="queue">Queue of directories still to be processed.</param>
/// <param name="name">Label of this worker; key in <c>WebDirectoryProcessorInfo</c> and prefix in log messages.</param>
/// <param name="token">Token that ends the loop once cancellation is requested.</param>
private async Task WebDirectoryProcessor(ConcurrentQueue<WebDirectory> queue, string name, CancellationToken token)
{
    Logger.Debug($"Start [{name}]");

    do
    {
        Interlocked.Increment(ref RunningWebDirectoryThreads);

        try
        {
            if (queue.TryDequeue(out WebDirectory webDirectory))
            {
                try
                {
                    lock (WebDirectoryProcessorInfoLock)
                    {
                        WebDirectoryProcessorInfo[name] = webDirectory;
                    }

                    if (!Session.ProcessedUrls.Contains(webDirectory.Url))
                    {
                        Session.ProcessedUrls.Add(webDirectory.Url);
                        Logger.Info($"[{name}] Begin processing {webDirectory.Url}");

                        if (Session.Root.Uri.Scheme == "ftp")
                        {
                            WebDirectory parsedWebDirectory = await FtpParser.ParseFtpAsync(name, webDirectory);
                            AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                        }
                        else if (Session.Root.Uri.Host == "drive.google.com")
                        {
                            // The indexer result gets the original URL written back before it is registered.
                            string baseUrl = webDirectory.Url;

                            WebDirectory parsedWebDirectory = await GoogleDriveIndexer.IndexAsync(webDirectory);
                            parsedWebDirectory.Url = baseUrl;
                            AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                        }
                        else
                        {
                            // Only URLs on the root host and below the root path are downloaded.
                            // Paths are compared ordinally; a culture-aware comparison is not meaningful for URLs.
                            if (webDirectory.Uri.Host == Session.Root.Uri.Host &&
                                webDirectory.Uri.LocalPath.StartsWith(Session.Root.Uri.LocalPath, StringComparison.Ordinal))
                            {
                                Logger.Debug($"[{name}] Start download '{webDirectory.Url}'");
                                Session.TotalHttpRequests++;

                                await RetryPolicy.ExecuteAsync(async () =>
                                {
                                    webDirectory.StartTime = DateTimeOffset.UtcNow;

                                    HttpResponseMessage httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url);
                                    string html = null;

                                    if (httpResponseMessage.IsSuccessStatusCode)
                                    {
                                        html = await GetHtml(httpResponseMessage);
                                    }

                                    // Curl fallback: failure while FirstRequest is still set, or a successful but empty body.
                                    if ((FirstRequest && !httpResponseMessage.IsSuccessStatusCode) ||
                                        (httpResponseMessage.IsSuccessStatusCode && string.IsNullOrWhiteSpace(html)))
                                    {
                                        Logger.Warn("First request fails, using Curl fallback User-Agent");

                                        HttpClient.DefaultRequestHeaders.UserAgent.Clear();
                                        HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgent_Curl);
                                        httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url);

                                        if (httpResponseMessage.IsSuccessStatusCode)
                                        {
                                            html = await GetHtml(httpResponseMessage);
                                            Logger.Warn("Yes, this Curl User-Agent did the trick!");
                                        }
                                    }

                                    // Chrome fallback: same conditions, evaluated against the latest response.
                                    if ((FirstRequest && !httpResponseMessage.IsSuccessStatusCode) ||
                                        (httpResponseMessage.IsSuccessStatusCode && string.IsNullOrWhiteSpace(html)))
                                    {
                                        Logger.Warn("First request fails, using Chrome fallback User-Agent");

                                        HttpClient.DefaultRequestHeaders.UserAgent.Clear();
                                        HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgent_Chrome);
                                        httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url);

                                        if (httpResponseMessage.IsSuccessStatusCode)
                                        {
                                            html = await GetHtml(httpResponseMessage);
                                            Logger.Warn("Yes, the Chrome User-Agent did the trick!");
                                        }
                                    }

                                    bool calibreDetected = false;
                                    string calibreVersionString = string.Empty;

                                    if (httpResponseMessage.IsSuccessStatusCode)
                                    {
                                        FirstRequest = false;

                                        List<string> serverHeaders = new List<string>();

                                        if (httpResponseMessage.Headers.Contains("Server"))
                                        {
                                            serverHeaders = httpResponseMessage.Headers.GetValues("Server").ToList();
                                            calibreDetected = serverHeaders.Any(h => h.Contains("calibre"));
                                        }

                                        if (calibreDetected)
                                        {
                                            calibreVersionString = string.Join("/", serverHeaders);
                                        }
                                        else
                                        {
                                            if (html == null)
                                            {
                                                html = await GetHtml(httpResponseMessage);
                                            }

                                            // Still untested against a real Calibre server that only exposes its version in the page body.
                                            const string calibreVersionIdentifier = "CALIBRE_VERSION = \"";
                                            int identifierIndex = html?.IndexOf(calibreVersionIdentifier, StringComparison.Ordinal) ?? -1;

                                            if (identifierIndex >= 0)
                                            {
                                                // The version is the text between the identifier's opening quote and the next quote.
                                                // Substring takes (start, length), so the length must be computed from both indexes.
                                                int versionStart = identifierIndex + calibreVersionIdentifier.Length;
                                                int versionEnd = html.IndexOf('"', versionStart);

                                                if (versionEnd > versionStart)
                                                {
                                                    calibreVersionString = html.Substring(versionStart, versionEnd - versionStart);
                                                    calibreDetected = true;
                                                }
                                            }
                                        }
                                    }

                                    if (calibreDetected)
                                    {
                                        Version calibreVersion = CalibreParser.ParseVersion(calibreVersionString);
                                        string calibreMessage = $"Calibre {calibreVersion} detected! I will index it at max 100 books per 30 seconds, else it will break Calibre...";

                                        Console.WriteLine(calibreMessage);
                                        Logger.Info(calibreMessage);

                                        await CalibreParser.ParseCalibre(HttpClient, httpResponseMessage.RequestMessage.RequestUri, webDirectory, calibreVersion);

                                        // Leaves the retried delegate only; the worker loop continues below.
                                        return;
                                    }

                                    Logger.Debug($"[{name}] Finish download '{webDirectory.Url}'");

                                    // Process only same site
                                    if (httpResponseMessage.RequestMessage.RequestUri.Host == Session.Root.Uri.Host)
                                    {
                                        int httpStatusCode = (int)httpResponseMessage.StatusCode;

                                        if (!Session.HttpStatusCodes.ContainsKey(httpStatusCode))
                                        {
                                            Session.HttpStatusCodes[httpStatusCode] = 0;
                                        }

                                        Session.HttpStatusCodes[httpStatusCode]++;

                                        if (httpResponseMessage.IsSuccessStatusCode)
                                        {
                                            if (html == null)
                                            {
                                                html = await GetHtml(httpResponseMessage);
                                            }

                                            Session.TotalHttpTraffic += html.Length;

                                            WebDirectory parsedWebDirectory = await DirectoryParser.ParseHtml(webDirectory, html, HttpClient);
                                            AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                                        }
                                        else
                                        {
                                            // NOTE(review): the error is counted here and again in the outer catch once
                                            // EnsureSuccessStatusCode throws - confirm whether the double count is intended.
                                            Session.Errors++;
                                            webDirectory.Error = true;

                                            if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                                            {
                                                Session.UrlsWithErrors.Add(webDirectory.Url);
                                            }

                                            httpResponseMessage.EnsureSuccessStatusCode();
                                        }
                                    }
                                    else
                                    {
                                        Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' which points to '{httpResponseMessage.RequestMessage.RequestUri}'");
                                        Session.Skipped++;
                                    }
                                });
                            }
                            else
                            {
                                Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' because it is not the same host or path");
                                Session.Skipped++;
                            }
                        }

                        Logger.Info($"[{name}] Finished processing {webDirectory.Url}");
                    }
                    else
                    {
                        Logger.Warn($"[{name}] Skip, already processed: {webDirectory.Uri}");
                    }
                }
                catch (Exception ex)
                {
                    // ParentDirectory is accessed null-safely: a throw inside this handler would escape the loop.
                    Logger.Error(ex, $"Error processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory?.Url}'");
                    Session.Errors++;

                    if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                    {
                        Session.UrlsWithErrors.Add(webDirectory.Url);
                    }
                }
                finally
                {
                    lock (WebDirectoryProcessorInfoLock)
                    {
                        WebDirectoryProcessorInfo.Remove(name);
                    }
                }
            }
        }
        finally
        {
            // Always undo the increment; a leaked count would keep the loop condition of every worker true.
            Interlocked.Decrement(ref RunningWebDirectoryThreads);
        }

        // Needed!
        await Task.Delay(TimeSpan.FromMilliseconds(10));
    }
    while (!token.IsCancellationRequested && (!queue.IsEmpty || RunningWebDirectoryThreads > 0));

    Logger.Debug($"Finished [{name}]");
}