/// <summary>
/// Starts the crawler: creates the shared HTTP client and robots handler,
/// then launches <c>Config.MaxConcurrency</c> long-running worker tasks.
/// </summary>
/// <exception cref="InvalidOperationException">Thrown when the crawler is already active.</exception>
public void Start()
{
    if (IsActive)
    {
        throw new InvalidOperationException("Crawler already active!");
    }

    IsActive = true;

    // One shared HttpClient for all workers (creating a client per request exhausts sockets).
    httpClient = new HttpClient();
    httpClient.DefaultRequestHeaders.Add("User-Agent", Config.UserAgent);

    robots = new RobotsHandler(Config, httpClient);
    cancelSource = new CancellationTokenSource();

    // Task.Factory.StartNew replaces the new Task(...).Start() anti-pattern.
    // LongRunning hints the scheduler to use dedicated threads rather than pool threads.
    // NOTE(review): the token passed here only cancels tasks that have not yet started;
    // Work itself must observe cancelSource.Token to stop a running worker — confirm it does.
    for (int i = 0; i < Config.MaxConcurrency; i++)
    {
        Task.Factory.StartNew(Work, cancelSource.Token, TaskCreationOptions.LongRunning, TaskScheduler.Default);
    }

    StateChanged?.Invoke(this, true);
}
/// <summary>
/// Finds all URLs in the given text content (downloaded from <paramref name="url"/>) and
/// adds every eligible, not-yet-crawled URL to the given work manager's backlog.
/// A plugin that implements FindUrls overrides the built-in extractor.
/// </summary>
/// <param name="url">URL the content was downloaded from.</param>
/// <param name="content">Text content to scan for URLs.</param>
/// <param name="config">Worker configuration passed to the extractor and exclusion checks.</param>
/// <param name="plugins">Optional plugin manager; the first plugin with FindUrls implemented is used. May be null.</param>
/// <param name="manager">Work manager that receives eligible URLs.</param>
/// <param name="robotHandler">Handler used to honor robots.txt exclusions.</param>
/// <param name="cancelSource">Cancellation source checked between URLs to abort the scan.</param>
public static void ScanContentAndAddToManager(string url, string content, WorkerConfiguration config, PluginManager plugins, WorkManager manager, RobotsHandler robotHandler, CancellationTokenSource cancelSource)
{
    // Use the first plugin that overrides FindUrls, if any; otherwise the built-in extractor.
    PluginInfo foundplugin = null;
    if (plugins != null)
    {
        foreach (var p in plugins.Plugins)
        {
            if (p.FindUrlsImplemented)
            {
                foundplugin = p;
                break;
            }
        }
    }

    // Single loop over whichever extractor applies (both are enumerated lazily,
    // so cancellation between items behaves the same as before).
    var foundUrls = foundplugin == null
        ? FindUrls(url, content, config)
        : foundplugin.FindUrls(url, content);

    foreach (var u in foundUrls)
    {
        if (cancelSource.IsCancellationRequested)
        {
            break;
        }

        validateAndAddFoundUrl(u);
    }

    // LOCAL FUNCTION FOR VALIDATING FOUND URLS
    void validateAndAddFoundUrl(string u)
    {
        // NOTE(review): blocking on an async call (.Result) — deadlock risk if a
        // synchronization context is present, and failures surface as AggregateException;
        // consider making this method async and awaiting IsUrlExcluded.
        if (robotHandler.IsUrlExcluded(u, config).Result)
        {
            return;
        }

        // Check if the URL is eligible for crawling at all.
        if (manager.IsUrlEligibleForCrawl(u) == false)
        {
            return;
        }

        // Ignore already-crawled URLs; queue everything else.
        if (!manager.IsUrlCrawled(u))
        {
            manager.AddToBacklog(u);
        }
    }
}