Example 1
        private async void Work()
        {
            var taskNumber = 0;

            semaphore.Wait();
            taskNumber = currentTaskNumber++;
            CurrentTasks.TryAdd(taskNumber, null);
            semaphore.Release();

            while (!cancelSource.IsCancellationRequested)
            {
                Work   w;
                string url;
                try
                {
                    #region Get valid work
                    if (!Manager.IsWorkAvailable || Manager.GetWork(out w, crawlDelay: Config.CrawlDelaySeconds) == false)
                    {
                        // unable to get work, wait a bit and try again
                        CurrentTasks[taskNumber] = null;
                        await Task.Delay(20);

                        continue;
                    }

                    url = w.Url;

                    // check if url is whitelisted
                    if (Extensions.IsUrlWhitelisted(url, Config) == false)
                    {
                        Logger.Log($"Skipping URL '{url}' - Not whitelisted!", Logger.LogSeverity.Debug);
                        continue;
                    }

                    // check robots.txt and blacklisted tags
                    // (this also attempts to download robots.txt on first run)
                    if (await robots.IsUrlExcluded(url, Config, true))
                    {
                        continue;
                    }

                    // get crawl-delay as defined by 'robots.txt'
                    var wait = robots.GetWaitTime(url, Config);

                    // wait the difference between [ROBOTS.TXT CRAWL DELAY] - [GLOBAL CRAWL DELAY]
                    var difference = wait - Config.CrawlDelaySeconds;
                    if (difference > 0)
                    {
                        await Task.Delay(TimeSpan.FromSeconds(difference));
                    }
                    #endregion
                }
                catch (Exception ex)
                {
                    Logger.Log($"Error while trying to get valid work! " + ex.GetDetailedMessage(), Logger.LogSeverity.Warning);
                    continue;
                }

                DateTime? recrawlDate = null;
                var       lastCrawl   = w.LastCrawled;
                w.LastCrawled = DateTime.Now;

                CurrentTasks[taskNumber] = url;
                HttpStatusCode? statusCode = null;

                try
                {
                    // Get response headers - DO NOT READ CONTENT yet (performance reasons)
                    var response = await httpClient
                                   .GetAsync(url, HttpCompletionOption.ResponseHeadersRead, cancelSource.Token);

                    statusCode = response.StatusCode;
                    if (!response.IsSuccessStatusCode || cancelSource.IsCancellationRequested)
                    {
                        #region Failed to crawl
                        // TODO: treat differently based on status code (for ex. if page doesn't exist at all, or if 500, 404,...)
                        switch (response.StatusCode)
                        {
                        case HttpStatusCode.Redirect:
                            // Add the redirected location to backlog
                            var newurl = response.Headers.Location.AbsoluteUri;
                            if (string.IsNullOrEmpty(newurl) == false)
                            {
                                // check if URL is eligible for crawling
                                if (Manager.IsUrlEligibleForCrawl(newurl) == false)
                                {
                                    continue;
                                }
                                // only add URLs that haven't been crawled yet
                                if (Manager.IsUrlCrawled(newurl) == false)
                                {
                                    Manager.AddToBacklog(newurl);
                                }
                            }
                            break;

                        case HttpStatusCode.MethodNotAllowed:
                        case HttpStatusCode.Gone:
                        case HttpStatusCode.BadRequest:
                        case HttpStatusCode.NoContent:
                        case HttpStatusCode.Unauthorized:
                        case HttpStatusCode.NotFound:
                        case HttpStatusCode.Forbidden:
                            // ignore it - mark as failed
                            break;

                        case HttpStatusCode.BadGateway:
                        case HttpStatusCode.TooManyRequests:
                        case HttpStatusCode.InternalServerError:
                            // if no recrawl date is set yet (or there is no previous crawl), schedule one 5 minutes into the future
                            if (w.RecrawlDate == null || lastCrawl == null)
                            {
                                recrawlDate = DateTime.Now.AddMinutes(5);
                            }
                            // if a recrawl was already scheduled, double the interval since last time
                            else
                            {
                                var duration = w.RecrawlDate.Value.Subtract(lastCrawl.Value);
                                recrawlDate = DateTime.Now.Add(duration.Multiply(2));
                            }
                            break;

                        default:
                            break;
                        }

                        // if recrawl date was set
                        if (recrawlDate != null)
                        {
                            w.RecrawlDate = recrawlDate;
                        }

                        w.Success = false;

                        continue;
                        #endregion
                    }

                    var mediaType = response.Content?.Headers?.ContentType?.MediaType;
                    if (mediaType == null)
                    {
                        continue;
                    }

                    // Check if media type is set as a scanning target, if yes, scan it for new URLs
                    if (Config.ScanTargetsMediaTypes.Contains(mediaType))
                    {
                        // scan content for more urls
                        var content = await response.Content.ReadAsStringAsync();

                        UrlFinder.ScanContentAndAddToManager(url, content, Config, plugins, Manager, robots, cancelSource);
                    }

                    // Check if media type is set as an accepted file to download

                    #region Download resource if valid
                    // attempt to get filename
                    var filename = GetFilename(url, mediaType);

                    // check if URL matches defined URL patterns
                    if (Extensions.IsURLMatch(url, Config) == false)
                    {
                        continue;
                    }

                    // don't download file if not acceptable
                    if (IsAcceptable(filename, mediaType) == false ||
                        cancelSource.IsCancellationRequested)
                    {
                        continue;
                    }

                    // check file size limits
                    var size = response.Content.Headers.ContentLength;

                    // if content length is not provided, ignore file
                    if (size == null)
                    {
                        continue;
                    }

                    var sizekB = size / 1024;
                    if (Config.MinimumAllowedFileSizekB != -1 && sizekB < Config.MinimumAllowedFileSizekB)
                    {
                        continue;
                    }
                    if (Config.MaximumAllowedFileSizekB != -1 && sizekB > Config.MaximumAllowedFileSizekB)
                    {
                        continue;
                    }

                    // construct path
                    var directory = GetDirectoryPath(url, true);
                    var path      = Path.Combine(directory, filename);

                    // check plugins
                    if (plugins?.Invoke(p => p.BeforeDownload(url, path), true) == false)
                    {
                        Logger.Log($"Plugin rejected download of '{filename}'", Logger.LogSeverity.Debug);
                        continue;
                    }

                    // get temporary file to download content to
                    var temp = Extensions.GetTempFile(ConfigManager.TemporaryFileTransferDirectory);

                    try
                    {
                        // download content to temporary file
                        using (var fstream = new FileStream(temp, FileMode.Create, FileAccess.Write, FileShare.None))
                            await response.Content.CopyToAsync(fstream);

                        // compare the temp file contents to the destination file - duplicates are detected via MD5 hash comparison
                        path = Extensions.CopyToAndGetPath(temp, path);

                        plugins?.Invoke(p => p.AfterDownload(url, path));
                    }
                    finally
                    {
                        File.Delete(temp);
                    }

                    // log the download
                    RecentDownloads.Add(new DownloadedWork(path, response.Content.Headers.ContentLength.Value));

                    // Logger.Log($"Downloaded '{url}' to '{path}'");
                    w.DownloadLocation = Extensions.GetRelativeFilePath(path, Config);
                    w.IsDownloaded     = true;
                    w.Success          = true;

                    Logger.Log($"Downloaded ({response.StatusCode}) {url}");
                    #endregion
                }
                catch (OperationCanceledException) { }
                catch (NullReferenceException nex)
                {
                    Logger.Log($"NullReferenceException while crawling - {url} - {nex.Message} -- {nex.StackTrace}",
                               Logger.LogSeverity.Error);
                }
                catch (IOException iex)
                {
                    // usually happens when trying to download a file with the same name
                    Logger.Log($"IOException while crawling - {iex.Message}",
                               Logger.LogSeverity.Debug);
                }
                catch (Exception ex)
                {
                    Logger.Log($"Exception while crawling - {url} - ({ex.GetType().Name}) {ex.Message}",
                               Logger.LogSeverity.Debug);
                }
                finally
                {
                    // also log crawled URLs that weren't successfully downloaded but still returned a response
                    if (!w.Success && Config.LogEveryCrawl)
                    {
                        if (statusCode == null)
                        {
                            Logger.Log($"Canceled {url}");

                            // TODO: re-add it to backlog maybe?
                        }
                        else
                        {
                            Logger.Log($"Crawled ({statusCode}) {url}");
                        }
                    }

                    CurrentTasks[taskNumber] = null;
                    Manager.ReportWorkResult(w);
                }
            }
        }
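
The Work() method above is intended to run as several concurrent, long-lived loops that share the same semaphore, CurrentTasks map, cancelSource and Manager. A minimal sketch of how such workers might be started and stopped is shown below; the Start/Stop method names and the worker count are assumptions for illustration, only Work() and cancelSource come from the code above.

        // Hypothetical start/stop sketch - Start/Stop and workerCount are assumptions,
        // only Work() and cancelSource appear in the code above
        public void Start(int workerCount)
        {
            for (var i = 0; i < workerCount; i++)
            {
                // Work() is async void, so each call returns at its first await
                // and keeps crawling in the background as an independent loop
                Work();
            }
        }

        public void Stop()
        {
            // signals every Work() loop to exit after finishing its current iteration
            cancelSource.Cancel();
        }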
Example 2
        /// <summary>
        /// Finds all URLs in the given text content (retrieved from the given URL) based on the given configuration
        /// and automatically adds them to the given work manager.
        /// </summary>
        /// <param name="url">URL the content was retrieved from</param>
        /// <param name="content">Text content to scan for URLs</param>
        /// <param name="config">Worker configuration</param>
        /// <param name="plugins">Plugin manager (optional; a plugin may override URL finding)</param>
        /// <param name="manager">Work manager that found URLs are added to</param>
        /// <param name="robotHandler">Handler used to check robots.txt exclusions</param>
        /// <param name="cancelSource">Cancellation source for aborting the scan</param>
        public static void ScanContentAndAddToManager(string url, string content,
                                                      WorkerConfiguration config, PluginManager plugins, WorkManager manager,
                                                      RobotsHandler robotHandler, CancellationTokenSource cancelSource)
        {
            // check plugins for FindUrls implementation
            PluginInfo foundplugin = null;

            if (plugins != null)
            {
                foreach (var p in plugins.Plugins)
                {
                    if (p.FindUrlsImplemented)
                    {
                        foundplugin = p;
                        break;
                    }
                }
            }

            // find URLs (use the plugin that overrides FindUrls, if one exists)
            if (foundplugin == null)
            {
                foreach (var u in FindUrls(url, content, config))
                {
                    if (cancelSource.IsCancellationRequested)
                    {
                        break;
                    }
                    validateAndAddFoundUrl(u);
                }
            }
            else
            {
                foreach (var u in foundplugin.FindUrls(url, content))
                {
                    if (cancelSource.IsCancellationRequested)
                    {
                        break;
                    }
                    validateAndAddFoundUrl(u);
                }
            }

            // LOCAL FUNCTION FOR VALIDATING FOUND URLS
            void validateAndAddFoundUrl(string u)
            {
                // check if URL is excluded
                if (robotHandler.IsUrlExcluded(u, config).Result)
                {
                    return;
                }

                // check if URL is eligible for crawling
                if (manager.IsUrlEligibleForCrawl(u) == false)
                {
                    return;
                }

                // only add URLs that haven't been crawled yet
                if (manager.IsUrlCrawled(u) == false)
                {
                    manager.AddToBacklog(u);
                }
            }
        }
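
Example 1 calls this method from its crawl loop right after reading the response body. A standalone usage sketch follows; how the WorkerConfiguration, WorkManager and RobotsHandler instances are obtained depends on the surrounding project, so the helper calls shown here are assumptions, not code from the source.

        // Hypothetical usage sketch - the way config/manager/robotHandler are obtained is assumed
        WorkerConfiguration config       = LoadConfig();        // assumed helper
        WorkManager         manager      = CreateWorkManager(); // assumed helper
        RobotsHandler       robotHandler = CreateRobots();      // assumed helper
        var cancelSource = new CancellationTokenSource();

        // scan downloaded content for URLs; eligible, not-yet-crawled ones end up in the backlog
        UrlFinder.ScanContentAndAddToManager(
            "http://example.com/",
            "<a href=\"http://example.com/page\">link</a>",
            config,
            null,            // no PluginManager - the built-in FindUrls is used
            manager,
            robotHandler,
            cancelSource);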