private async void Work()
{
    var taskNumber = 0;

    semaphore.Wait();
    taskNumber = currentTaskNumber++;
    CurrentTasks.TryAdd(taskNumber, null);
    semaphore.Release();

    while (!cancelSource.IsCancellationRequested)
    {
        Work w;
        string url;
        try
        {
            #region Get valid work
            if (!Manager.IsWorkAvailable || Manager.GetWork(out w, crawlDelay: Config.CrawlDelaySeconds) == false)
            {
                // unable to get work, wait a bit and try again
                CurrentTasks[taskNumber] = null;
                await Task.Delay(20);
                continue;
            }

            url = w.Url;

            // check if url is whitelisted
            if (Extensions.IsUrlWhitelisted(url, Config) == false)
            {
                Logger.Log($"Skipping URL '{url}' - Not whitelisted!", Logger.LogSeverity.Debug);
                continue;
            }

            // check robots.txt and blacklisted tags
            // (this also attempts to download robots.txt on first run)
            if (robots.IsUrlExcluded(url, Config, true).Result)
            {
                continue;
            }

            // get crawl-delay as defined by 'robots.txt'
            var wait = robots.GetWaitTime(url, Config);

            // wait the difference between [ROBOTS.TXT CRAWL DELAY] - [GLOBAL CRAWL DELAY]
            var difference = wait - Config.CrawlDelaySeconds;
            if (difference > 0)
            {
                Task.Delay((int)TimeSpan.FromSeconds(difference).TotalMilliseconds).Wait();
            }
            #endregion
        }
        catch (Exception ex)
        {
            Logger.Log("Error while trying to get valid work! " + ex.GetDetailedMessage(), Logger.LogSeverity.Warning);
            continue;
        }

        DateTime? recrawlDate = null;
        var lastCrawl = w.LastCrawled;
        w.LastCrawled = DateTime.Now;

        CurrentTasks[taskNumber] = url;

        HttpStatusCode? statusCode = null;
        try
        {
            // Get response headers - DO NOT READ CONTENT yet (performance reasons)
            var response = await httpClient
                .GetAsync(url, HttpCompletionOption.ResponseHeadersRead, cancelSource.Token);

            statusCode = response.StatusCode;

            if (!response.IsSuccessStatusCode || cancelSource.IsCancellationRequested)
            {
                #region Failed to crawl
                // TODO: treat differently based on status code (for ex. if page doesn't exist at all, or if 500, 404,...)
                switch (response.StatusCode)
                {
                    case HttpStatusCode.Redirect:
                        // Add the redirected location to backlog
                        var newurl = response.Headers.Location.AbsoluteUri;
                        if (string.IsNullOrEmpty(newurl) == false)
                        {
                            // check if URL is eligible for crawling
                            if (Manager.IsUrlEligibleForCrawl(newurl) == false)
                            {
                                continue;
                            }

                            if (Manager.IsUrlCrawled(newurl))
                            {
                                // ignore already-crawled urls
                            }
                            else
                            {
                                Manager.AddToBacklog(newurl);
                            }
                        }
                        break;

                    case HttpStatusCode.MethodNotAllowed:
                    case HttpStatusCode.Gone:
                    case HttpStatusCode.BadRequest:
                    case HttpStatusCode.NoContent:
                    case HttpStatusCode.Unauthorized:
                    case HttpStatusCode.NotFound:
                    case HttpStatusCode.Forbidden:
                        // ignore it - mark as failed
                        break;

                    case HttpStatusCode.BadGateway:
                    case HttpStatusCode.TooManyRequests:
                    case HttpStatusCode.InternalServerError:
                        // if no recrawl date set yet, set it into the future
                        if (w.RecrawlDate == null && w.LastCrawled != null)
                        {
                            recrawlDate = DateTime.Now.AddMinutes(5);
                        }
                        // if recrawl was already set, double it since last time
                        else
                        {
                            var duration = w.RecrawlDate.Value.Subtract(lastCrawl.Value);
                            recrawlDate = DateTime.Now.Add(duration.Multiply(2));
                        }
                        break;

                    default:
                        break;
                }

                // if recrawl date was set
                if (recrawlDate != null)
                {
                    w.RecrawlDate = recrawlDate;
                }

                w.Success = false;

                continue;
                #endregion
            }

            var mediaType = response.Content?.Headers?.ContentType?.MediaType;
            if (mediaType == null)
            {
                continue;
            }

            // Check if media type is set as a scanning target, if yes, scan it for new URLs
            if (Config.ScanTargetsMediaTypes.Count(x => x == mediaType) > 0)
            {
                // scan content for more urls
                var content = await response.Content.ReadAsStringAsync();

                UrlFinder.ScanContentAndAddToManager(url, content, Config, plugins, Manager, robots, cancelSource);
            }

            // Check if media type is set as an accepted file to download
            #region Download resource if valid
            // attempt to get filename
            var filename = GetFilename(url, mediaType);

            // check if URL matches defined URL patterns
            if (Extensions.IsURLMatch(url, Config) == false)
            {
                continue;
            }

            // don't download file if not acceptable
            if (IsAcceptable(filename, mediaType) == false || cancelSource.IsCancellationRequested)
            {
                continue;
            }

            // check file size limits
            var size = response.Content.Headers.ContentLength;

            // if content length is not provided, ignore file
            if (size == null)
            {
                continue;
            }

            var sizekB = size / 1024;
            if (Config.MinimumAllowedFileSizekB != -1 && sizekB < Config.MinimumAllowedFileSizekB)
            {
                continue;
            }
            if (Config.MaximumAllowedFileSizekB != -1 && sizekB > Config.MaximumAllowedFileSizekB)
            {
                continue;
            }

            // construct path
            var directory = GetDirectoryPath(url, true);
            var path = Path.Combine(directory, filename);

            // check plugins
            if (plugins?.Invoke(p => p.BeforeDownload(url, path), true) == false)
            {
                Logger.Log($"Plugin rejected download of '{filename}'", Logger.LogSeverity.Debug);
                continue;
            }

            // get temporary file to download content to
            var temp = Extensions.GetTempFile(ConfigManager.TemporaryFileTransferDirectory);

            try
            {
                // download content to temporary file
                using (var fstream = new FileStream(temp, FileMode.Create, FileAccess.Write, FileShare.None))
                    await response.Content.CopyToAsync(fstream);

                // now compare temp file contents to destination file - check for duplicates using MD5 hash comparing
                // (see the sketch after this method)
                path = Extensions.CopyToAndGetPath(temp, path);

                plugins?.Invoke(p => p.AfterDownload(url, path));
            }
            catch
            {
                throw;
            }
            finally
            {
                File.Delete(temp);
            }

            // log the download
            RecentDownloads.Add(new DownloadedWork(path, response.Content.Headers.ContentLength.Value));

            // Logger.Log($"Downloaded '{url}' to '{path}'");
            w.DownloadLocation = Extensions.GetRelativeFilePath(path, Config);
            w.IsDownloaded = true;
            w.Success = true;

            Logger.Log($"Downloaded ({response.StatusCode}) {url}");
            #endregion
        }
        catch (OperationCanceledException)
        {
        }
        catch (NullReferenceException nex)
        {
            Logger.Log($"NullReferenceException while crawling - {url} - {nex.Message} -- {nex.StackTrace}", Logger.LogSeverity.Error);
        }
        catch (IOException iex)
        {
            // usually happens when trying to download a file with the same name
            Logger.Log($"IOException while crawling - {iex.Message}", Logger.LogSeverity.Debug);
        }
        catch (Exception ex)
        {
            Logger.Log($"Exception while crawling - {url} - ({ex.GetType().Name}) {ex.Message}", Logger.LogSeverity.Debug);
        }
        finally
        {
            // also log crawled URLs that weren't downloaded and had a successful response
            if (!w.Success && Config.LogEveryCrawl)
            {
                if (statusCode == null)
                {
                    Logger.Log($"Canceled {url}");
                    // TODO: re-add it to backlog maybe?
                }
                else
                {
                    Logger.Log($"Crawled ({statusCode}) {url}");
                }
            }

            CurrentTasks[taskNumber] = null;
            Manager.ReportWorkResult(w);
        }
    }
}
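// The duplicate check above is delegated to Extensions.CopyToAndGetPath, which is not shown here.
// Below is a minimal sketch of how such an MD5-based duplicate check could look: hash the downloaded
// temp file and, if a file with that name already exists, reuse the existing copy when the hashes
// match or fall back to a numbered filename when they differ. The method name, numbering scheme and
// exact behaviour are assumptions for illustration only, not the project's actual implementation.
// (Requires System.IO, System.Linq and System.Security.Cryptography.)
private static string CopyToAndGetPathSketch(string tempFile, string destinationPath)
{
    using (var md5 = System.Security.Cryptography.MD5.Create())
    {
        byte[] newHash;
        using (var stream = File.OpenRead(tempFile))
            newHash = md5.ComputeHash(stream);

        var candidate = destinationPath;
        var counter = 1;
        while (File.Exists(candidate))
        {
            byte[] existingHash;
            using (var stream = File.OpenRead(candidate))
                existingHash = md5.ComputeHash(stream);

            // identical content already on disk - reuse the existing file
            if (newHash.SequenceEqual(existingHash))
                return candidate;

            // same name but different content - try a numbered variant instead
            candidate = Path.Combine(
                Path.GetDirectoryName(destinationPath),
                $"{Path.GetFileNameWithoutExtension(destinationPath)} ({counter++}){Path.GetExtension(destinationPath)}");
        }

        File.Copy(tempFile, candidate);
        return candidate;
    }
}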
/// <summary>
/// Finds all URLs in the given text content (downloaded from the given URL) based on the given configuration
/// and automatically adds them to the given work manager.
/// </summary>
/// <param name="url">URL the content was downloaded from</param>
/// <param name="content">Text content to scan for URLs</param>
/// <param name="config">Worker configuration used to validate found URLs</param>
/// <param name="plugins">Plugin manager (a plugin can override URL finding)</param>
/// <param name="manager">Work manager that found URLs are added to</param>
/// <param name="robotHandler">Robots.txt handler used to check whether URLs are excluded</param>
/// <param name="cancelSource">Cancellation source for aborting the scan</param>
public static void ScanContentAndAddToManager(string url, string content,
    WorkerConfiguration config, PluginManager plugins, WorkManager manager,
    RobotsHandler robotHandler, CancellationTokenSource cancelSource)
{
    // check plugins for FindUrls implementation
    PluginInfo foundplugin = null;
    if (plugins != null)
    {
        foreach (var p in plugins.Plugins)
        {
            if (p.FindUrlsImplemented)
            {
                foundplugin = p;
                break;
            }
        }
    }

    // find URLs (use PLUGIN that overrides it, if it exists)
    if (foundplugin == null)
    {
        foreach (var u in FindUrls(url, content, config))
        {
            if (cancelSource.IsCancellationRequested)
            {
                break;
            }

            validateAndAddFoundUrl(u);
        }
    }
    else
    {
        foreach (var u in foundplugin.FindUrls(url, content))
        {
            if (cancelSource.IsCancellationRequested)
            {
                break;
            }

            validateAndAddFoundUrl(u);
        }
    }

    // LOCAL FUNCTION FOR VALIDATING FOUND URLS
    void validateAndAddFoundUrl(string u)
    {
        // check if URL is excluded
        if (robotHandler.IsUrlExcluded(u, config).Result)
        {
            return;
        }

        // check if URL is eligible for crawling
        if (manager.IsUrlEligibleForCrawl(u) == false)
        {
            return;
        }

        if (manager.IsUrlCrawled(u))
        {
            // ignore already-crawled urls
        }
        else
        {
            manager.AddToBacklog(u);
        }
    }
}
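// FindUrls itself is not shown above. A minimal sketch of how URL extraction from page content
// could work is given below, assuming a simple regex over href/src attribute values with relative
// links resolved against the page's own URL. 'FindUrlsSketch' is a hypothetical name; the real
// finder (and any plugin override) may behave differently.
// (Requires System, System.Collections.Generic and System.Text.RegularExpressions.)
private static IEnumerable<string> FindUrlsSketch(string pageUrl, string content)
{
    var baseUri = new Uri(pageUrl);

    // match href="..." / src="..." attribute values
    var matches = System.Text.RegularExpressions.Regex.Matches(
        content,
        @"(?:href|src)\s*=\s*[""']([^""']+)[""']",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);

    foreach (System.Text.RegularExpressions.Match match in matches)
    {
        // resolve relative references and keep only http/https links
        if (Uri.TryCreate(baseUri, match.Groups[1].Value, out var absolute) &&
            (absolute.Scheme == Uri.UriSchemeHttp || absolute.Scheme == Uri.UriSchemeHttps))
        {
            yield return absolute.AbsoluteUri;
        }
    }
}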