Esempio n. 1
0
        /// <summary>
        /// Worker loop for one named processor. Each pass tries to dequeue a <see cref="WebDirectory"/>
        /// from <paramref name="queue"/> and indexes it via FTP, Google Drive or HTTP, chosen from
        /// <c>Session.Root.Uri</c>. Outcomes are recorded on the directory and on <c>Session</c>.
        /// </summary>
        /// <param name="queue">Shared queue of directories that still have to be processed.</param>
        /// <param name="name">
        /// Processor name. Used in log lines and as the key into <c>WebDirectoryProcessorInfo</c>
        /// and <c>FtpParser.FtpClients</c>.
        /// </param>
        /// <param name="cancellationToken">
        /// Ends the loop once cancellation is requested. It is also passed to the delay between passes
        /// and to the FTP disconnect, so a cancellation that arrives during the delay surfaces as an
        /// exception from that delay rather than as a normal loop exit.
        /// </param>
        /// <returns>A task that completes when the loop ends.</returns>
        private async Task WebDirectoryProcessor(ConcurrentQueue<WebDirectory> queue, string name, CancellationToken cancellationToken)
        {
            Logger.Debug($"Start [{name}]");

            // Set when the FTP parser reports Constants.Ftp_Max_Connections; stops this processor after the current pass.
            bool maxConnections = false;

            do
            {
                // Every pass is bracketed by exactly one increment and one decrement of the shared counter,
                // which the loop conditions of all processors read.
                Interlocked.Increment(ref RunningWebDirectoryThreads);

                if (queue.TryDequeue(out WebDirectory webDirectory))
                {
                    try
                    {
                        lock (WebDirectoryProcessorInfoLock)
                        {
                            WebDirectoryProcessorInfo[name] = webDirectory;
                        }

                        if (!Session.ProcessedUrls.Contains(webDirectory.Url))
                        {
                            Session.ProcessedUrls.Add(webDirectory.Url);
                            webDirectory.StartTime = DateTimeOffset.UtcNow;

                            Logger.Info($"[{name}] Begin processing {webDirectory.Url}");

                            if (Session.Root.Uri.Scheme == Constants.UriScheme.Ftp || Session.Root.Uri.Scheme == Constants.UriScheme.Ftps)
                            {
                                WebDirectory parsedWebDirectory = await FtpParser.ParseFtpAsync(name, webDirectory, OpenDirectoryIndexerSettings.Username, OpenDirectoryIndexerSettings.Password);

                                if (webDirectory.CancellationReason == Constants.Ftp_Max_Connections)
                                {
                                    webDirectory.CancellationReason = null;
                                    maxConnections = true;

                                    if (webDirectory.Name == Constants.Root)
                                    {
                                        webDirectory.Error = true;

                                        // Do not decrement RunningWebDirectoryThreads here. This exception is caught by
                                        // the catch block of this same try, after which the single decrement following
                                        // the try/finally runs. An extra decrement would leave the counter one too low.
                                        throw new Exception("Error checking FTP because maximum connections reached");
                                    }

                                    // Requeue
                                    Session.ProcessedUrls.Remove(webDirectory.Url);
                                    queue.Enqueue(webDirectory);

                                    try
                                    {
                                        await FtpParser.FtpClients[name].DisconnectAsync(cancellationToken);

                                        lock (FtpParser.FtpClients)
                                        {
                                            FtpParser.FtpClients.Remove(name);
                                        }
                                    }
                                    catch (Exception exFtpDisconnect)
                                    {
                                        // Best effort: a failed disconnect is logged and does not fail the pass.
                                        Logger.Error(exFtpDisconnect, "Error disconnecting FTP connection.");
                                    }
                                }

                                if (parsedWebDirectory != null)
                                {
                                    DirectoryParser.CheckParsedResults(parsedWebDirectory);
                                    AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                                }
                            }
                            else if (Session.Root.Uri.Host == Constants.GoogleDriveDomain)
                            {
                                // Remember the queued URL and put it back on the parsed result.
                                string baseUrl = webDirectory.Url;

                                WebDirectory parsedWebDirectory = await GoogleDriveIndexer.IndexAsync(webDirectory);

                                parsedWebDirectory.Url = baseUrl;

                                AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                            }
                            else
                            {
                                if (Session.Root.Uri.Host == Constants.BlitzfilesTechDomain || SameHostAndDirectory(Session.Root.Uri, webDirectory.Uri))
                                {
                                    Logger.Debug($"[{name}] Start download '{webDirectory.Url}'");
                                    Session.TotalHttpRequests++;

                                    // Dispose the source once the request (including retries) is over so the
                                    // 5 minute CancelAfter timer does not outlive the work it guards.
                                    using (CancellationTokenSource cancellationTokenSource = new CancellationTokenSource())
                                    {
                                        cancellationTokenSource.CancelAfter(TimeSpan.FromMinutes(5));

                                        Context pollyContext = new Context
                                        {
                                            { "Processor", name },
                                            { "WebDirectory", webDirectory },
                                            { "CancellationTokenSource", cancellationTokenSource }
                                        };

                                        await RetryPolicy.ExecuteAsync(async (context, token) => { await ProcessWebDirectoryAsync(name, webDirectory, cancellationTokenSource.Token); }, pollyContext, cancellationTokenSource.Token);
                                    }
                                }
                                else
                                {
                                    Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' because it is not the same host or path");

                                    Session.Skipped++;
                                }
                            }

                            Logger.Info($"[{name}] Finished processing {webDirectory.Url}");
                        }
                        else
                        {
                            //Logger.Warn($"[{name}] Skip, already processed: {webDirectory.Uri}");
                        }
                    }
                    catch (Exception ex)
                    {
                        if (ex is TaskCanceledException)
                        {
                            // A cancelled task counts as an error for this URL.
                            Session.Errors++;
                            webDirectory.Error = true;

                            if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                            {
                                Session.UrlsWithErrors.Add(webDirectory.Url);
                            }

                            if (webDirectory.ParentDirectory?.Url != null)
                            {
                                Logger.Error($"Skipped processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory.Url}'");
                            }
                            else
                            {
                                // No parent URL available: flag the session root as failed.
                                Logger.Error($"Skipped processing Url: '{webDirectory.Url}'");
                                Session.Root.Error = true;
                            }
                        }
                        else
                        {
                            Logger.Error(ex, $"Error processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory?.Url}'");
                        }
                    }
                    finally
                    {
                        lock (WebDirectoryProcessorInfoLock)
                        {
                            WebDirectoryProcessorInfo.Remove(name);
                        }

                        // Only directories without a cancellation reason are marked as finished.
                        if (string.IsNullOrWhiteSpace(webDirectory.CancellationReason))
                        {
                            webDirectory.Finished   = true;
                            webDirectory.FinishTime = DateTimeOffset.UtcNow;
                        }
                    }
                }

                Interlocked.Decrement(ref RunningWebDirectoryThreads);

                // Needed, because of the TryDequeue, no waiting in ConcurrentQueue!
                if (queue.IsEmpty)
                {
                    // Don't hog the CPU when queue < threads
                    await Task.Delay(TimeSpan.FromMilliseconds(1000), cancellationToken);
                }
                else
                {
                    await Task.Delay(TimeSpan.FromMilliseconds(10), cancellationToken);
                }
            }
            while (!cancellationToken.IsCancellationRequested && (!queue.IsEmpty || RunningWebDirectoryThreads > 0) && !maxConnections);

            Logger.Debug($"Finished [{name}]");
        }
        /// <summary>
        /// Worker loop for one named processor. Each pass tries to dequeue a <see cref="WebDirectory"/>
        /// from <paramref name="queue"/> and indexes it via FTP, Google Drive or an HTTP download,
        /// chosen from <c>Session.Root.Uri</c>. The HTTP path runs inside <c>RetryPolicy</c>, tries
        /// fallback User-Agents and hands Calibre servers to <c>CalibreParser</c>.
        /// </summary>
        /// <param name="queue">Shared queue of directories that still have to be processed.</param>
        /// <param name="name">Processor name. Used in log lines and as the key into <c>WebDirectoryProcessorInfo</c>.</param>
        /// <param name="token">Checked once per pass in the loop condition; the loop ends when cancellation is requested.</param>
        /// <returns>A task that completes when the loop ends.</returns>
        private async Task WebDirectoryProcessor(ConcurrentQueue<WebDirectory> queue, string name, CancellationToken token)
        {
            Logger.Debug($"Start [{name}]");

            do
            {
                // Every pass is bracketed by one increment and one decrement of the shared counter,
                // which the loop conditions of all processors read.
                Interlocked.Increment(ref RunningWebDirectoryThreads);

                if (queue.TryDequeue(out WebDirectory webDirectory))
                {
                    try
                    {
                        lock (WebDirectoryProcessorInfoLock)
                        {
                            WebDirectoryProcessorInfo[name] = webDirectory;
                        }

                        if (!Session.ProcessedUrls.Contains(webDirectory.Url))
                        {
                            Session.ProcessedUrls.Add(webDirectory.Url);
                            Logger.Info($"[{name}] Begin processing {webDirectory.Url}");

                            if (Session.Root.Uri.Scheme == "ftp")
                            {
                                WebDirectory parsedWebDirectory = await FtpParser.ParseFtpAsync(name, webDirectory);

                                AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                            }
                            else if (Session.Root.Uri.Host == "drive.google.com")
                            {
                                // Remember the queued URL and put it back on the parsed result.
                                string baseUrl = webDirectory.Url;

                                WebDirectory parsedWebDirectory = await GoogleDriveIndexer.IndexAsync(webDirectory);

                                parsedWebDirectory.Url = baseUrl;

                                AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                            }
                            else
                            {
                                if (webDirectory.Uri.Host == Session.Root.Uri.Host && webDirectory.Uri.LocalPath.StartsWith(Session.Root.Uri.LocalPath))
                                {
                                    Logger.Debug($"[{name}] Start download '{webDirectory.Url}'");
                                    Session.TotalHttpRequests++;

                                    await RetryPolicy.ExecuteAsync(async() =>
                                    {
                                        webDirectory.StartTime = DateTimeOffset.UtcNow;

                                        HttpResponseMessage httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url);
                                        string html = null;

                                        if (httpResponseMessage.IsSuccessStatusCode)
                                        {
                                            html = await GetHtml(httpResponseMessage);
                                        }

                                        // Fallback 1: retry with the Curl User-Agent when the first request fails
                                        // or a successful response has no usable body.
                                        if ((FirstRequest && !httpResponseMessage.IsSuccessStatusCode) || (httpResponseMessage.IsSuccessStatusCode && string.IsNullOrWhiteSpace(html)))
                                        {
                                            Logger.Warn("First request fails, using Curl fallback User-Agent");
                                            HttpClient.DefaultRequestHeaders.UserAgent.Clear();
                                            HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgent_Curl);
                                            httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url);

                                            if (httpResponseMessage.IsSuccessStatusCode)
                                            {
                                                html = await GetHtml(httpResponseMessage);
                                                Logger.Warn("Yes, this Curl User-Agent did the trick!");
                                            }
                                        }

                                        // Fallback 2: same condition again, now with the Chrome User-Agent.
                                        if ((FirstRequest && !httpResponseMessage.IsSuccessStatusCode) || (httpResponseMessage.IsSuccessStatusCode && string.IsNullOrWhiteSpace(html)))
                                        {
                                            Logger.Warn("First request fails, using Chrome fallback User-Agent");
                                            HttpClient.DefaultRequestHeaders.UserAgent.Clear();
                                            HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgent_Chrome);
                                            httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url);

                                            if (httpResponseMessage.IsSuccessStatusCode)
                                            {
                                                html = await GetHtml(httpResponseMessage);
                                                Logger.Warn("Yes, the Chrome User-Agent did the trick!");
                                            }
                                        }

                                        bool calibreDetected        = false;
                                        string calibreVersionString = string.Empty;

                                        if (httpResponseMessage.IsSuccessStatusCode)
                                        {
                                            FirstRequest = false;

                                            List<string> serverHeaders = new List<string>();

                                            if (httpResponseMessage.Headers.Contains("Server"))
                                            {
                                                serverHeaders = httpResponseMessage.Headers.GetValues("Server").ToList();

                                                calibreDetected = serverHeaders.Any(h => h.Contains("calibre"));
                                            }

                                            if (calibreDetected)
                                            {
                                                string serverHeader  = string.Join("/", serverHeaders);
                                                calibreVersionString = serverHeader;
                                            }
                                            else
                                            {
                                                if (html == null)
                                                {
                                                    html = await GetHtml(httpResponseMessage);
                                                }

                                                // UNTESTED (cannot find or down Calibre with this issue)
                                                const string calibreVersionIdentifier = "CALIBRE_VERSION = \"";

                                                // Ordinal search, matching the ordinal semantics of string.Contains.
                                                int calibreVersionIdentifierStart = html?.IndexOf(calibreVersionIdentifier, StringComparison.Ordinal) ?? -1;
                                                calibreDetected = calibreVersionIdentifierStart >= 0;

                                                if (calibreDetected)
                                                {
                                                    // Take the text between the identifier's opening quote and the next quote.
                                                    // Substring expects (start, length), so the length is derived from the two indexes.
                                                    int calibreVersionStart = calibreVersionIdentifierStart + calibreVersionIdentifier.Length;
                                                    int calibreVersionEnd   = html.IndexOf('"', calibreVersionStart);

                                                    if (calibreVersionEnd > calibreVersionStart)
                                                    {
                                                        calibreVersionString = html.Substring(calibreVersionStart, calibreVersionEnd - calibreVersionStart);
                                                    }

                                                    // NOTE(review): assumes CalibreParser.ParseVersion accepts the bare quoted value - confirm.
                                                }
                                            }
                                        }

                                        if (calibreDetected)
                                        {
                                            Version calibreVersion = CalibreParser.ParseVersion(calibreVersionString);

                                            Console.WriteLine($"Calibre {calibreVersion} detected! I will index it at max 100 books per 30 seconds, else it will break Calibre...");
                                            Logger.Info($"Calibre {calibreVersion} detected! I will index it at max 100 books per 30 seconds, else it will break Calibre...");

                                            await CalibreParser.ParseCalibre(HttpClient, httpResponseMessage.RequestMessage.RequestUri, webDirectory, calibreVersion);

                                            return;
                                        }

                                        Uri originalUri = new Uri(webDirectory.Url);
                                        Logger.Debug($"[{name}] Finish download '{webDirectory.Url}'");

                                        // Process only same site
                                        if (httpResponseMessage.RequestMessage.RequestUri.Host == Session.Root.Uri.Host)
                                        {
                                            int httpStatusCode = (int)httpResponseMessage.StatusCode;

                                            if (!Session.HttpStatusCodes.ContainsKey(httpStatusCode))
                                            {
                                                Session.HttpStatusCodes[httpStatusCode] = 0;
                                            }

                                            Session.HttpStatusCodes[httpStatusCode]++;

                                            if (httpResponseMessage.IsSuccessStatusCode)
                                            {
                                                if (html == null)
                                                {
                                                    html = await GetHtml(httpResponseMessage);
                                                }

                                                Session.TotalHttpTraffic += html.Length;

                                                WebDirectory parsedWebDirectory = await DirectoryParser.ParseHtml(webDirectory, html, HttpClient);
                                                AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                                            }
                                            else
                                            {
                                                Session.Errors++;
                                                webDirectory.Error = true;

                                                if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                                                {
                                                    Session.UrlsWithErrors.Add(webDirectory.Url);
                                                }

                                                // Throws for the non-success status so the failure leaves the delegate as an exception.
                                                httpResponseMessage.EnsureSuccessStatusCode();
                                            }
                                        }
                                        else
                                        {
                                            Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' which points to '{httpResponseMessage.RequestMessage.RequestUri}'");
                                            Session.Skipped++;
                                        }
                                    });
                                }
                                else
                                {
                                    Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' because it is not the same host or path");

                                    Session.Skipped++;
                                }
                            }

                            Logger.Info($"[{name}] Finished processing {webDirectory.Url}");
                        }
                        else
                        {
                            Logger.Warn($"[{name}] Skip, already processed: {webDirectory.Uri}");
                        }
                    }
                    catch (Exception ex)
                    {
                        // ParentDirectory is read null-conditionally: a null parent must not make the handler
                        // itself throw, which would escape this method and skip the counter decrement below.
                        Logger.Error(ex, $"Error processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory?.Url}'");

                        Session.Errors++;

                        if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                        {
                            Session.UrlsWithErrors.Add(webDirectory.Url);
                        }
                    }
                    finally
                    {
                        lock (WebDirectoryProcessorInfoLock)
                        {
                            WebDirectoryProcessorInfo.Remove(name);
                        }
                    }
                }

                Interlocked.Decrement(ref RunningWebDirectoryThreads);

                // Needed!
                await Task.Delay(TimeSpan.FromMilliseconds(10));
            }
            while (!token.IsCancellationRequested && (!queue.IsEmpty || RunningWebDirectoryThreads > 0));

            Logger.Debug($"Finished [{name}]");
        }
        /// <summary>
        /// Worker loop for one named processor. Each pass tries to dequeue a <see cref="WebDirectory"/>
        /// from <paramref name="queue"/> and indexes it via FTP, Google Drive or HTTP, chosen from
        /// <c>Session.Root.Uri</c>. The HTTP path runs <c>ProcessWebDirectoryAsync</c> inside
        /// <c>RetryPolicy</c> with a 5 minute cancellation timeout.
        /// </summary>
        /// <param name="queue">Shared queue of directories that still have to be processed.</param>
        /// <param name="name">Processor name. Used in log lines and as the key into <c>WebDirectoryProcessorInfo</c>.</param>
        /// <param name="cancellationToken">Checked once per pass in the loop condition; the loop ends when cancellation is requested.</param>
        /// <returns>A task that completes when the loop ends.</returns>
        private async Task WebDirectoryProcessor(ConcurrentQueue<WebDirectory> queue, string name, CancellationToken cancellationToken)
        {
            Logger.Debug($"Start [{name}]");

            do
            {
                // Every pass is bracketed by one increment and one decrement of the shared counter,
                // which the loop conditions of all processors read.
                Interlocked.Increment(ref RunningWebDirectoryThreads);

                if (queue.TryDequeue(out WebDirectory webDirectory))
                {
                    try
                    {
                        lock (WebDirectoryProcessorInfoLock)
                        {
                            WebDirectoryProcessorInfo[name] = webDirectory;
                        }

                        if (!Session.ProcessedUrls.Contains(webDirectory.Url))
                        {
                            Session.ProcessedUrls.Add(webDirectory.Url);
                            webDirectory.StartTime = DateTimeOffset.UtcNow;

                            Logger.Info($"[{name}] Begin processing {webDirectory.Url}");

                            if (Session.Root.Uri.Scheme == "ftp")
                            {
                                WebDirectory parsedWebDirectory = await FtpParser.ParseFtpAsync(name, webDirectory);

                                AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                            }
                            else if (Session.Root.Uri.Host == Constants.GoogleDriveDomain)
                            {
                                // Remember the queued URL and put it back on the parsed result.
                                string baseUrl = webDirectory.Url;

                                WebDirectory parsedWebDirectory = await GoogleDriveIndexer.IndexAsync(webDirectory);

                                parsedWebDirectory.Url = baseUrl;

                                AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                            }
                            else
                            {
                                if (SameHostAndDirectory(Session.Root.Uri, webDirectory.Uri))
                                {
                                    Logger.Debug($"[{name}] Start download '{webDirectory.Url}'");
                                    Session.TotalHttpRequests++;

                                    // Dispose the source once the request (including retries) is over so the
                                    // 5 minute CancelAfter timer does not outlive the work it guards.
                                    using (CancellationTokenSource cancellationTokenSource = new CancellationTokenSource())
                                    {
                                        cancellationTokenSource.CancelAfter(TimeSpan.FromMinutes(5));

                                        Context pollyContext = new Context
                                        {
                                            { "Processor", name },
                                            { "WebDirectory", webDirectory },
                                            { "CancellationTokenSource", cancellationTokenSource }
                                        };

                                        await RetryPolicy.ExecuteAsync(async (context, token) => { await ProcessWebDirectoryAsync(name, webDirectory, cancellationTokenSource.Token); }, pollyContext, cancellationTokenSource.Token);
                                    }
                                }
                                else
                                {
                                    Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' because it is not the same host or path");

                                    Session.Skipped++;
                                }
                            }

                            Logger.Info($"[{name}] Finished processing {webDirectory.Url}");
                        }
                        else
                        {
                            //Logger.Warn($"[{name}] Skip, already processed: {webDirectory.Uri}");
                        }
                    }
                    catch (Exception ex)
                    {
                        if (ex is TaskCanceledException)
                        {
                            // Cancelled tasks are logged as warnings without the exception details.
                            if (webDirectory.ParentDirectory?.Url != null)
                            {
                                Logger.Warn($"Skipped processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory.Url}'");
                            }
                            else
                            {
                                // No parent URL available: flag the session root as failed.
                                Logger.Warn($"Skipped processing Url: '{webDirectory.Url}'");
                                Session.Root.Error = true;
                            }
                        }
                        else
                        {
                            Logger.Error(ex, $"Error processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory?.Url}'");
                        }

                        // Both kinds of failure count as an error for this URL.
                        Session.Errors++;

                        if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                        {
                            Session.UrlsWithErrors.Add(webDirectory.Url);
                        }
                    }
                    finally
                    {
                        lock (WebDirectoryProcessorInfoLock)
                        {
                            WebDirectoryProcessorInfo.Remove(name);
                        }

                        // Marked finished on every outcome, including skipped and failed directories.
                        webDirectory.Finished   = true;
                        webDirectory.FinishTime = DateTimeOffset.UtcNow;
                    }
                }

                Interlocked.Decrement(ref RunningWebDirectoryThreads);

                // Needed!
                await Task.Delay(TimeSpan.FromMilliseconds(10));
            }
            while (!cancellationToken.IsCancellationRequested && (!queue.IsEmpty || RunningWebDirectoryThreads > 0));

            Logger.Debug($"Finished [{name}]");
        }
        /// <summary>
        /// Worker loop for one named processor. Each pass tries to dequeue a <see cref="WebDirectory"/>
        /// from <paramref name="queue"/> and indexes it via FTP, Google Drive or HTTP, chosen from
        /// <c>Session.Root.Uri</c>. The HTTP path runs <c>ProcessWebDirectoryAsync</c> inside
        /// <c>RetryPolicy</c>.
        /// </summary>
        /// <param name="queue">Shared queue of directories that still have to be processed.</param>
        /// <param name="name">Processor name. Used in log lines and as the key into <c>WebDirectoryProcessorInfo</c>.</param>
        /// <param name="token">Checked once per pass in the loop condition; the loop ends when cancellation is requested.</param>
        /// <returns>A task that completes when the loop ends.</returns>
        private async Task WebDirectoryProcessor(ConcurrentQueue<WebDirectory> queue, string name, CancellationToken token)
        {
            Logger.Debug($"Start [{name}]");

            do
            {
                // Every pass is bracketed by one increment and one decrement of the shared counter,
                // which the loop conditions of all processors read.
                Interlocked.Increment(ref RunningWebDirectoryThreads);

                if (queue.TryDequeue(out WebDirectory webDirectory))
                {
                    try
                    {
                        lock (WebDirectoryProcessorInfoLock)
                        {
                            WebDirectoryProcessorInfo[name] = webDirectory;
                        }

                        if (!Session.ProcessedUrls.Contains(webDirectory.Url))
                        {
                            Session.ProcessedUrls.Add(webDirectory.Url);
                            Logger.Info($"[{name}] Begin processing {webDirectory.Url}");

                            if (Session.Root.Uri.Scheme == "ftp")
                            {
                                WebDirectory parsedWebDirectory = await FtpParser.ParseFtpAsync(name, webDirectory);

                                AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                            }
                            else if (Session.Root.Uri.Host == "drive.google.com")
                            {
                                // Remember the queued URL and put it back on the parsed result.
                                string baseUrl = webDirectory.Url;

                                WebDirectory parsedWebDirectory = await GoogleDriveIndexer.IndexAsync(webDirectory);

                                parsedWebDirectory.Url = baseUrl;

                                AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                            }
                            else
                            {
                                if (webDirectory.Uri.Host == Session.Root.Uri.Host && webDirectory.Uri.LocalPath.StartsWith(Session.Root.Uri.LocalPath))
                                {
                                    Logger.Debug($"[{name}] Start download '{webDirectory.Url}'");
                                    Session.TotalHttpRequests++;

                                    // The context carries the processor name and directory alongside the execution.
                                    Context pollyContext = new Context();
                                    pollyContext.Add("Processor", name);
                                    pollyContext.Add("WebDirectory", webDirectory);

                                    await RetryPolicy.ExecuteAsync(ctx => ProcessWebDirectoryAsync(name, webDirectory), pollyContext);
                                }
                                else
                                {
                                    Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' because it is not the same host or path");

                                    Session.Skipped++;
                                }
                            }

                            Logger.Info($"[{name}] Finished processing {webDirectory.Url}");
                        }
                        else
                        {
                            Logger.Warn($"[{name}] Skip, already processed: {webDirectory.Uri}");
                        }
                    }
                    catch (Exception ex)
                    {
                        // ParentDirectory is read null-conditionally: a null parent must not make the handler
                        // itself throw, which would escape this method and skip the counter decrement below.
                        Logger.Error(ex, $"Error processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory?.Url}'");

                        Session.Errors++;

                        if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                        {
                            Session.UrlsWithErrors.Add(webDirectory.Url);
                        }
                    }
                    finally
                    {
                        lock (WebDirectoryProcessorInfoLock)
                        {
                            WebDirectoryProcessorInfo.Remove(name);
                        }
                    }
                }

                Interlocked.Decrement(ref RunningWebDirectoryThreads);

                // Needed!
                await Task.Delay(TimeSpan.FromMilliseconds(10));
            }
            while (!token.IsCancellationRequested && (!queue.IsEmpty || RunningWebDirectoryThreads > 0));

            Logger.Debug($"Finished [{name}]");
        }