Exemplo n.º 1
0
        /// <summary>
        /// Downloads <paramref name="webDirectory"/> over HTTP and dispatches the response: sessions that carry
        /// a GdIndex root id go straight to the GdIndex parser, Calibre servers go to the Calibre parser, and
        /// every other successful same-host response is parsed as an HTML directory listing.
        /// </summary>
        /// <remarks>
        /// When the request fails during the first request, or succeeds with an empty body, it is repeated with
        /// the Curl and then the Chrome User-Agent. The fallbacks are written to
        /// <c>HttpClient.DefaultRequestHeaders</c>, so they remain in effect for later requests sent through
        /// the same client.
        /// </remarks>
        /// <param name="name">Label of the caller; only used as a prefix in log messages.</param>
        /// <param name="webDirectory">
        /// Directory to download. Its <c>Url</c> is overwritten with the final request URI when a successful
        /// response was served from a different URI.
        /// </param>
        /// <param name="cancellationToken">Passed to the HTTP requests and to the Calibre parser.</param>
        /// <exception cref="System.Net.Http.HttpRequestException">
        /// Thrown by <c>EnsureSuccessStatusCode</c> when the same-host response is not successful.
        /// </exception>
        private async Task ProcessWebDirectoryAsync(string name, WebDirectory webDirectory, CancellationToken cancellationToken)
        {
            if (Session.Parameters.ContainsKey(Constants.Parameters_GdIndex_RootId))
            {
                await Site.GoIndex.GdIndex.GdIndexParser.ParseIndex(HttpClient, webDirectory, string.Empty);

                return;
            }

            // A User-Agent given on the command line replaces whatever the client currently sends.
            if (!string.IsNullOrWhiteSpace(OpenDirectoryIndexerSettings.CommandLineOptions.UserAgent))
            {
                HttpClient.DefaultRequestHeaders.UserAgent.Clear();
                HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(OpenDirectoryIndexerSettings.CommandLineOptions.UserAgent);
            }

            HttpResponseMessage httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url, cancellationToken);

            string html = null;

            if (httpResponseMessage.IsSuccessStatusCode)
            {
                SetRootUrl(httpResponseMessage);

                html = await GetHtml(httpResponseMessage);
            }

            // First fallback: Curl User-Agent. Also taken when the body contains "HTTP_USER_AGENT".
            if ((FirstRequest && !httpResponseMessage.IsSuccessStatusCode) ||
                (httpResponseMessage.IsSuccessStatusCode && string.IsNullOrWhiteSpace(html)) ||
                html?.Contains("HTTP_USER_AGENT") == true)
            {
                Logger.Warn("First request fails, using Curl fallback User-Agent");
                HttpClient.DefaultRequestHeaders.UserAgent.Clear();
                HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(Constants.UserAgent.Curl);

                // The previous response is superseded by the retry below; release it instead of leaking it.
                httpResponseMessage.Dispose();
                httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url, cancellationToken);

                if (httpResponseMessage.IsSuccessStatusCode)
                {
                    SetRootUrl(httpResponseMessage);

                    html = await GetHtml(httpResponseMessage);

                    Logger.Warn("Yes, Curl User-Agent did the trick!");
                }
            }

            // Second fallback: Chrome User-Agent.
            if ((FirstRequest && !httpResponseMessage.IsSuccessStatusCode) ||
                (httpResponseMessage.IsSuccessStatusCode && string.IsNullOrWhiteSpace(html)))
            {
                Logger.Warn("First request fails, using Chrome fallback User-Agent");
                HttpClient.DefaultRequestHeaders.UserAgent.Clear();
                HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(Constants.UserAgent.Chrome);

                // Same as above: the superseded response is no longer needed.
                httpResponseMessage.Dispose();
                httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url, cancellationToken);

                if (httpResponseMessage.IsSuccessStatusCode)
                {
                    SetRootUrl(httpResponseMessage);

                    html = await GetHtml(httpResponseMessage);

                    Logger.Warn("Yes, Chrome User-Agent did the trick!");
                }
            }

            // Set a Referer once; it lives on the default headers and is therefore sent with later requests too.
            if (!HttpClient.DefaultRequestHeaders.Contains("Referer"))
            {
                HttpClient.DefaultRequestHeaders.Add("Referer", webDirectory.Url);
            }

            bool calibreDetected = false;
            string calibreVersionString = string.Empty;

            if (httpResponseMessage.IsSuccessStatusCode)
            {
                FirstRequest = false;

                List<string> serverHeaders = new List<string>();

                if (httpResponseMessage.Headers.Contains("Server"))
                {
                    serverHeaders = httpResponseMessage.Headers.GetValues("Server").ToList();

                    calibreDetected = serverHeaders.Any(h => h.Contains("calibre"));
                }

                if (calibreDetected)
                {
                    calibreVersionString = string.Join("/", serverHeaders);
                }
                else
                {
                    if (html == null)
                    {
                        html = await GetHtml(httpResponseMessage);
                    }

                    // UNTESTED (cannot find or down Calibre with this issue)
                    // Extract the quoted value that follows the identifier. Substring takes a LENGTH as its
                    // second argument, so it is computed from the two quote positions.
                    // NOTE(review): assumes CalibreParser.ParseVersion accepts the bare quoted value - confirm.
                    const string calibreVersionIdentifier = "CALIBRE_VERSION = \"";
                    int identifierIndex = html?.IndexOf(calibreVersionIdentifier, StringComparison.Ordinal) ?? -1;

                    if (identifierIndex >= 0)
                    {
                        int versionStart = identifierIndex + calibreVersionIdentifier.Length;
                        int versionEnd = html.IndexOf('"', versionStart);

                        // Only report Calibre when a non-empty, properly terminated version was found.
                        if (versionEnd > versionStart)
                        {
                            calibreDetected = true;
                            calibreVersionString = html.Substring(versionStart, versionEnd - versionStart);
                        }
                    }
                }
            }

            if (calibreDetected)
            {
                Version calibreVersion = CalibreParser.ParseVersion(calibreVersionString);

                Console.WriteLine($"Calibre {calibreVersion} detected! I will index it at max 100 books per 30 seconds, else it will break Calibre...");
                Logger.Info($"Calibre {calibreVersion} detected! I will index it at max 100 books per 30 seconds, else it will break Calibre...");

                await CalibreParser.ParseCalibre(HttpClient, httpResponseMessage.RequestMessage.RequestUri, webDirectory, calibreVersion, cancellationToken);

                return;
            }

            // Adopt the URI the successful response was actually served from.
            if (httpResponseMessage.IsSuccessStatusCode && webDirectory.Url != httpResponseMessage.RequestMessage.RequestUri.ToString())
            {
                webDirectory.Url = httpResponseMessage.RequestMessage.RequestUri.ToString();
            }

            Logger.Debug($"[{name}] Finish download '{webDirectory.Url}'");

            // Process only same site
            if (httpResponseMessage.RequestMessage.RequestUri.Host == Session.Root.Uri.Host)
            {
                int httpStatusCode = (int)httpResponseMessage.StatusCode;

                // Count the status code; a missing key leaves the out value at its default of zero.
                Session.HttpStatusCodes.TryGetValue(httpStatusCode, out var statusCodeCount);
                Session.HttpStatusCodes[httpStatusCode] = statusCodeCount + 1;

                if (httpResponseMessage.IsSuccessStatusCode)
                {
                    if (html == null)
                    {
                        html = await GetHtml(httpResponseMessage);
                    }

                    Session.TotalHttpTraffic += html.Length;

                    WebDirectory parsedWebDirectory = await DirectoryParser.ParseHtml(webDirectory, html, HttpClient);

                    bool processSubdirectories = parsedWebDirectory.Parser != "DirectoryListingModel01";
                    AddProcessedWebDirectory(webDirectory, parsedWebDirectory, processSubdirectories);
                }
                else
                {
                    // Throws for the non-success status so the caller can handle the failure.
                    httpResponseMessage.EnsureSuccessStatusCode();
                }
            }
            else
            {
                Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' which points to '{httpResponseMessage.RequestMessage.RequestUri}'");
                Session.Skipped++;
            }
        }
        /// <summary>
        /// Worker loop: repeatedly takes a directory from <paramref name="queue"/> and processes it, until
        /// cancellation is requested or the queue is empty and <c>RunningWebDirectoryThreads</c> is zero.
        /// </summary>
        /// <param name="queue">Queue of directories that still have to be processed.</param>
        /// <param name="name">
        /// Label of this worker; used as log prefix and as the key in <c>WebDirectoryProcessorInfo</c>.
        /// </param>
        /// <param name="token">
        /// Checked once per loop iteration. It is not passed to the HTTP requests, so a request that is
        /// already running is not interrupted by it.
        /// </param>
        private async Task WebDirectoryProcessor(ConcurrentQueue<WebDirectory> queue, string name, CancellationToken token)
        {
            Logger.Debug($"Start [{name}]");

            do
            {
                Interlocked.Increment(ref RunningWebDirectoryThreads);

                try
                {
                    if (queue.TryDequeue(out WebDirectory webDirectory))
                    {
                        await HandleDequeuedWebDirectoryAsync(name, webDirectory);
                    }
                }
                finally
                {
                    // Always undo the increment: the loop condition below waits for this counter to reach
                    // zero, so a missed decrement would keep the loop running forever.
                    Interlocked.Decrement(ref RunningWebDirectoryThreads);
                }

                // Needed!
                await Task.Delay(TimeSpan.FromMilliseconds(10));
            }
            while (!token.IsCancellationRequested && (!queue.IsEmpty || RunningWebDirectoryThreads > 0));

            Logger.Debug($"Finished [{name}]");
        }

        /// <summary>
        /// Processes one dequeued directory: skips URLs that were already processed, otherwise dispatches to
        /// the FTP, Google Drive or HTTP handling, and records any exception as a session error.
        /// </summary>
        /// <param name="name">Label of the calling worker.</param>
        /// <param name="webDirectory">The directory that was taken from the queue.</param>
        private async Task HandleDequeuedWebDirectoryAsync(string name, WebDirectory webDirectory)
        {
            try
            {
                lock (WebDirectoryProcessorInfoLock)
                {
                    WebDirectoryProcessorInfo[name] = webDirectory;
                }

                if (Session.ProcessedUrls.Contains(webDirectory.Url))
                {
                    Logger.Warn($"[{name}] Skip, already processed: {webDirectory.Uri}");

                    return;
                }

                Session.ProcessedUrls.Add(webDirectory.Url);
                Logger.Info($"[{name}] Begin processing {webDirectory.Url}");

                if (Session.Root.Uri.Scheme == "ftp")
                {
                    WebDirectory parsedWebDirectory = await FtpParser.ParseFtpAsync(name, webDirectory);

                    AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                }
                else if (Session.Root.Uri.Host == "drive.google.com")
                {
                    // Remember the URL before indexing and put it back on the result afterwards.
                    string baseUrl = webDirectory.Url;

                    WebDirectory parsedWebDirectory = await GoogleDriveIndexer.IndexAsync(webDirectory);

                    parsedWebDirectory.Url = baseUrl;

                    AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                }
                else if (webDirectory.Uri.Host == Session.Root.Uri.Host &&
                         webDirectory.Uri.LocalPath.StartsWith(Session.Root.Uri.LocalPath, StringComparison.Ordinal))
                {
                    // Paths are compared ordinally; a culture-sensitive prefix test is not meaningful for URLs.
                    Logger.Debug($"[{name}] Start download '{webDirectory.Url}'");
                    Session.TotalHttpRequests++;

                    await RetryPolicy.ExecuteAsync(() => FetchAndParseHttpDirectoryAsync(name, webDirectory));
                }
                else
                {
                    Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' because it is not the same host or path");

                    Session.Skipped++;
                }

                Logger.Info($"[{name}] Finished processing {webDirectory.Url}");
            }
            catch (Exception ex)
            {
                // ParentDirectory is accessed null-safely: an exception thrown from inside this handler
                // would skip the error bookkeeping below and escape to the worker loop.
                Logger.Error(ex, $"Error processing Url: '{webDirectory.Url}' from parent '{webDirectory.ParentDirectory?.Url}'");

                Session.Errors++;

                if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                {
                    Session.UrlsWithErrors.Add(webDirectory.Url);
                }
            }
            finally
            {
                lock (WebDirectoryProcessorInfoLock)
                {
                    WebDirectoryProcessorInfo.Remove(name);
                }
            }
        }

        /// <summary>
        /// Downloads <paramref name="webDirectory"/> over HTTP (falling back to the Curl and then the Chrome
        /// User-Agent), hands Calibre servers to the Calibre parser and parses every other successful
        /// same-host response as an HTML directory listing.
        /// </summary>
        /// <remarks>
        /// Executed through <c>RetryPolicy</c>, so it presumably can run more than once per directory.
        /// NOTE(review): on a non-success status this increments <c>Session.Errors</c> and then throws; the
        /// calling catch block increments it again when the exception reaches it - confirm this is intended.
        /// </remarks>
        /// <param name="name">Label of the calling worker; only used as a prefix in log messages.</param>
        /// <param name="webDirectory">The directory to download; its <c>StartTime</c> is set on every call.</param>
        /// <exception cref="System.Net.Http.HttpRequestException">
        /// Thrown by <c>EnsureSuccessStatusCode</c> when the same-host response is not successful.
        /// </exception>
        private async Task FetchAndParseHttpDirectoryAsync(string name, WebDirectory webDirectory)
        {
            webDirectory.StartTime = DateTimeOffset.UtcNow;

            HttpResponseMessage httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url);
            string html = null;

            if (httpResponseMessage.IsSuccessStatusCode)
            {
                html = await GetHtml(httpResponseMessage);
            }

            // First fallback: Curl User-Agent.
            if ((FirstRequest && !httpResponseMessage.IsSuccessStatusCode) ||
                (httpResponseMessage.IsSuccessStatusCode && string.IsNullOrWhiteSpace(html)))
            {
                Logger.Warn("First request fails, using Curl fallback User-Agent");
                HttpClient.DefaultRequestHeaders.UserAgent.Clear();
                HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgent_Curl);

                // The previous response is superseded by the retry below; release it instead of leaking it.
                httpResponseMessage.Dispose();
                httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url);

                if (httpResponseMessage.IsSuccessStatusCode)
                {
                    html = await GetHtml(httpResponseMessage);
                    Logger.Warn("Yes, this Curl User-Agent did the trick!");
                }
            }

            // Second fallback: Chrome User-Agent.
            if ((FirstRequest && !httpResponseMessage.IsSuccessStatusCode) ||
                (httpResponseMessage.IsSuccessStatusCode && string.IsNullOrWhiteSpace(html)))
            {
                Logger.Warn("First request fails, using Chrome fallback User-Agent");
                HttpClient.DefaultRequestHeaders.UserAgent.Clear();
                HttpClient.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgent_Chrome);

                // Same as above: the superseded response is no longer needed.
                httpResponseMessage.Dispose();
                httpResponseMessage = await HttpClient.GetAsync(webDirectory.Url);

                if (httpResponseMessage.IsSuccessStatusCode)
                {
                    html = await GetHtml(httpResponseMessage);
                    Logger.Warn("Yes, the Chrome User-Agent did the trick!");
                }
            }

            bool calibreDetected = false;
            string calibreVersionString = string.Empty;

            if (httpResponseMessage.IsSuccessStatusCode)
            {
                FirstRequest = false;

                List<string> serverHeaders = new List<string>();

                if (httpResponseMessage.Headers.Contains("Server"))
                {
                    serverHeaders = httpResponseMessage.Headers.GetValues("Server").ToList();

                    calibreDetected = serverHeaders.Any(h => h.Contains("calibre"));
                }

                if (calibreDetected)
                {
                    calibreVersionString = string.Join("/", serverHeaders);
                }
                else
                {
                    if (html == null)
                    {
                        html = await GetHtml(httpResponseMessage);
                    }

                    // UNTESTED (cannot find or down Calibre with this issue)
                    // Extract the quoted value that follows the identifier. Substring takes a LENGTH as its
                    // second argument, so it is computed from the two quote positions.
                    // NOTE(review): assumes CalibreParser.ParseVersion accepts the bare quoted value - confirm.
                    const string calibreVersionIdentifier = "CALIBRE_VERSION = \"";
                    int identifierIndex = html?.IndexOf(calibreVersionIdentifier, StringComparison.Ordinal) ?? -1;

                    if (identifierIndex >= 0)
                    {
                        int versionStart = identifierIndex + calibreVersionIdentifier.Length;
                        int versionEnd = html.IndexOf('"', versionStart);

                        // Only report Calibre when a non-empty, properly terminated version was found.
                        if (versionEnd > versionStart)
                        {
                            calibreDetected = true;
                            calibreVersionString = html.Substring(versionStart, versionEnd - versionStart);
                        }
                    }
                }
            }

            if (calibreDetected)
            {
                Version calibreVersion = CalibreParser.ParseVersion(calibreVersionString);

                Console.WriteLine($"Calibre {calibreVersion} detected! I will index it at max 100 books per 30 seconds, else it will break Calibre...");
                Logger.Info($"Calibre {calibreVersion} detected! I will index it at max 100 books per 30 seconds, else it will break Calibre...");

                await CalibreParser.ParseCalibre(HttpClient, httpResponseMessage.RequestMessage.RequestUri, webDirectory, calibreVersion);

                return;
            }

            Logger.Debug($"[{name}] Finish download '{webDirectory.Url}'");

            // Process only same site
            if (httpResponseMessage.RequestMessage.RequestUri.Host == Session.Root.Uri.Host)
            {
                int httpStatusCode = (int)httpResponseMessage.StatusCode;

                // Count the status code; a missing key leaves the out value at its default of zero.
                Session.HttpStatusCodes.TryGetValue(httpStatusCode, out var statusCodeCount);
                Session.HttpStatusCodes[httpStatusCode] = statusCodeCount + 1;

                if (httpResponseMessage.IsSuccessStatusCode)
                {
                    if (html == null)
                    {
                        html = await GetHtml(httpResponseMessage);
                    }

                    Session.TotalHttpTraffic += html.Length;

                    WebDirectory parsedWebDirectory = await DirectoryParser.ParseHtml(webDirectory, html, HttpClient);
                    AddProcessedWebDirectory(webDirectory, parsedWebDirectory);
                }
                else
                {
                    Session.Errors++;
                    webDirectory.Error = true;

                    if (!Session.UrlsWithErrors.Contains(webDirectory.Url))
                    {
                        Session.UrlsWithErrors.Add(webDirectory.Url);
                    }

                    // Throws for the non-success status so the caller sees the failure.
                    httpResponseMessage.EnsureSuccessStatusCode();
                }
            }
            else
            {
                Logger.Warn($"[{name}] Skipped result of '{webDirectory.Url}' which points to '{httpResponseMessage.RequestMessage.RequestUri}'");
                Session.Skipped++;
            }
        }