/// <summary>
///     Processes a range of FileIDs after crawling. Useful if crawled Files were not processed at crawl time according to the desired ApplicationSettings configuration.
///     Calling this method DOES change the 'LastDiscovered' fields where applicable.
///     This method is not called while crawling; rather, it is intended for post-processing.
/// </summary>
/// <param name="crawler">The Crawler whose ApplicationSettings and WebSettings should be used.</param>
/// <param name="fileIDLowerBound">The inclusive lower bound of the FileID range to process.</param>
/// <param name="fileIDUpperBound">The inclusive upper bound of the FileID range to process.</param>
public static void ProcessFiles(Crawler<TArachnodeDAO> crawler, long fileIDLowerBound, long fileIDUpperBound)
{
    //do not assign the ApplicationSettings. doing so would override the ApplicationSettings you set before calling this method...
    TArachnodeDAO arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), crawler.ApplicationSettings.ConnectionString, crawler.ApplicationSettings, crawler.WebSettings, false, false);

    ConsoleManager<TArachnodeDAO> consoleManager = new ConsoleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    ActionManager<TArachnodeDAO> actionManager = new ActionManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CookieManager cookieManager = new CookieManager();
    MemoryManager<TArachnodeDAO> memoryManager = new MemoryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    RuleManager<TArachnodeDAO> ruleManager = new RuleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager = new CrawlerPeerManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, null, arachnodeDAO);
    Cache<TArachnodeDAO> cache = new Cache<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, crawler, actionManager, cacheManager, crawlerPeerManager, memoryManager, ruleManager);
    DiscoveryManager<TArachnodeDAO> discoveryManager = new DiscoveryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, cache, actionManager, cacheManager, memoryManager, ruleManager);

    //load the CrawlActions, CrawlRules and EngineActions...
    ruleManager.ProcessCrawlRules(crawler);
    actionManager.ProcessCrawlActions(crawler);
    actionManager.ProcessEngineActions(crawler);

    //these three methods are called by the Engine...
    UserDefinedFunctions.RefreshAllowedExtensions(true);
    UserDefinedFunctions.RefreshAllowedSchemes(true);
    UserDefinedFunctions.RefreshDisallowed();

    //instantiate a WebClient to access the ResponseHeaders, and make an initial request so they are populated...
    WebClient<TArachnodeDAO> webClient = new WebClient<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager, cookieManager, new ProxyManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager));

    webClient.GetHttpWebResponse("http://google.com", "GET", null, null, null, null);

    FileManager<TArachnodeDAO> fileManager = new FileManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager, arachnodeDAO);

    for (long i = fileIDLowerBound; i <= fileIDUpperBound; i++)
    {
        ArachnodeDataSet.FilesRow filesRow = null;

        try
        {
            //get the File from the database. we need the source data as we don't store this in the index.
            //even though most of the fields are available in the Document, the File is the authoritative source, so we'll use that for all of the fields.
            filesRow = arachnodeDAO.GetFile(i.ToString());

            if (filesRow != null)
            {
                if (filesRow.Source == null || filesRow.Source.Length == 0)
                {
                    if (File.Exists(discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedFilesDirectory, filesRow.AbsoluteUri, filesRow.FullTextIndexType)))
                    {
                        filesRow.Source = File.ReadAllBytes(discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedFilesDirectory, filesRow.AbsoluteUri, filesRow.FullTextIndexType));
                    }
                    else
                    {
                        Console.WriteLine("FileID: " + i + " was NOT processed successfully.");

                        if (OnFileProcessed != null)
                        {
                            OnFileProcessed.BeginInvoke(filesRow, "FileID: " + i + " was NOT processed successfully.", null, null);
                        }

                        //the Source is not in the database and could not be recovered from disk, so this File cannot be processed.
                        //without this 'continue', the File would fall through to ProcessFile and also be reported as processed successfully...
                        continue;
                    }
                }

                ProcessFile(crawler.ApplicationSettings, crawler.WebSettings, crawler, filesRow, webClient, cache, actionManager, consoleManager, crawlerPeerManager, discoveryManager, fileManager, memoryManager, ruleManager, arachnodeDAO);

                Console.WriteLine("FileID: " + i + " was processed successfully.");

                if (OnFileProcessed != null)
                {
                    OnFileProcessed.BeginInvoke(filesRow, "FileID: " + i + " was processed successfully.", null, null);
                }
            }
        }
        catch (Exception exception)
        {
            Console.WriteLine("FileID: " + i + " was NOT processed successfully.");
            Console.WriteLine(exception.Message);

            if (OnFileProcessed != null)
            {
                OnFileProcessed.BeginInvoke(filesRow, "FileID: " + i + " was NOT processed successfully.", null, null);
                OnFileProcessed.BeginInvoke(filesRow, exception.Message, null, null);
            }

            arachnodeDAO.InsertException(null, null, exception, false);
        }
    }

    //stop the CrawlActions, CrawlRules and EngineActions...
    ruleManager.Stop();
    actionManager.Stop();
}
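
//a minimal usage sketch, not part of the original source: the ApplicationSettings properties shown here
//(InsertFiles, SaveDiscoveredFilesToDisk) are assumptions and should be replaced by whatever configuration
//your crawl actually used; the OnFileProcessed delegate signature is inferred from the BeginInvoke calls above...
private static void ProcessFilesExample(Crawler<TArachnodeDAO> crawler)
{
    //configure the ApplicationSettings BEFORE calling ProcessFiles; ProcessFiles intentionally does not overwrite them...
    crawler.ApplicationSettings.InsertFiles = true;
    crawler.ApplicationSettings.SaveDiscoveredFilesToDisk = true;

    //receive per-File progress notifications...
    OnFileProcessed += (filesRow, message) => Console.WriteLine(message);

    //reprocess FileIDs 1 through 1000, inclusive...
    ProcessFiles(crawler, 1, 1000);
}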