private void Main_Load(object sender, EventArgs e)
{
    try
    {
        _arachnodeDAO = new ArachnodeDAO(_applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true);

        //construct dependencies before the managers that consume them (ConsoleManager before ActionManager, RuleManager before Cache)...
        _consoleManager = new ConsoleManager<ArachnodeDAO>(_applicationSettings, _webSettings);
        _actionManager = new ActionManager<ArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
        _memoryManager = new MemoryManager<ArachnodeDAO>(_applicationSettings, _webSettings);
        _cacheManager = new CacheManager<ArachnodeDAO>(_applicationSettings, _webSettings);
        _crawlerPeerManager = new CrawlerPeerManager<ArachnodeDAO>(_applicationSettings, _webSettings, null, _arachnodeDAO);
        _ruleManager = new RuleManager<ArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
        _cache = new Cache<ArachnodeDAO>(_applicationSettings, _webSettings, null, _actionManager, _cacheManager, _crawlerPeerManager, _memoryManager, _ruleManager);
        _discoveryManager = new DiscoveryManager<ArachnodeDAO>(_applicationSettings, _webSettings, _cache, _actionManager, _cacheManager, _memoryManager, _ruleManager);

        nudWebPageID_ValueChanged(null, null);
        nudFileID_ValueChanged(null, null);
        nudImageID_ValueChanged(null, null);
    }
    catch (Exception exception)
    {
        MessageBox.Show(exception.Message + " ::" + exception.StackTrace, "Browser");
    }
}
public Cache(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler<TArachnodeDAO> crawler, ActionManager<TArachnodeDAO> actionManager, CacheManager<TArachnodeDAO> cacheManager, CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager, MemoryManager<TArachnodeDAO> memoryManager, RuleManager<TArachnodeDAO> ruleManager)
{
    _applicationSettings = applicationSettings;
    _webSettings = webSettings;

    _crawler = crawler;

    _actionManager = actionManager;
    _cacheManager = cacheManager;
    _crawlerPeerManager = crawlerPeerManager;
    _memoryManager = memoryManager;
    _ruleManager = ruleManager;

    _cacheItemRemovedCallback = CacheItemRemoved;
}
public Main()
{
    InitializeComponent();

    _formText = Text;

    //construct dependencies before the managers that consume them (ConsoleManager before ActionManager, RuleManager before Cache)...
    _consoleManager = new ConsoleManager<ArachnodeDAO>(_applicationSettings, _webSettings);
    _actionManager = new ActionManager<ArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
    _memoryManager = new MemoryManager<ArachnodeDAO>(_applicationSettings, _webSettings);
    _cacheManager = new CacheManager<ArachnodeDAO>(_applicationSettings, _webSettings);
    _crawlerPeerManager = new CrawlerPeerManager<ArachnodeDAO>(_applicationSettings, _webSettings, null, null);
    _ruleManager = new RuleManager<ArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
    _cache = new Cache<ArachnodeDAO>(_applicationSettings, _webSettings, null, _actionManager, _cacheManager, _crawlerPeerManager, _memoryManager, _ruleManager);
    _discoveryManager = new DiscoveryManager<ArachnodeDAO>(_applicationSettings, _webSettings, _cache, _actionManager, _cacheManager, _memoryManager, _ruleManager);

    tbAbsoluteUri_KeyUp(this, new KeyEventArgs(Keys.Enter));
}
/// <summary>
///     Processes a range of FileIDs after crawling. Useful if crawled Files were not processed at crawl time according to the desired ApplicationSettings configuration.
///     Calling this method DOES change the 'LastDiscovered' fields where applicable.
///     This method is not intended to be called while crawling, but rather during post-processing.
/// </summary>
/// <param name = "crawler"></param>
/// <param name = "fileIDLowerBound"></param>
/// <param name = "fileIDUpperBound"></param>
public static void ProcessFiles(Crawler<TArachnodeDAO> crawler, long fileIDLowerBound, long fileIDUpperBound)
{
    //do not assign the application settings. doing so will override the ApplicationSettings you set before calling this method...
    TArachnodeDAO arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), crawler.ApplicationSettings.ConnectionString, crawler.ApplicationSettings, crawler.WebSettings, false, false);

    ConsoleManager<TArachnodeDAO> consoleManager = new ConsoleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    ActionManager<TArachnodeDAO> actionManager = new ActionManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CookieManager cookieManager = new CookieManager();
    MemoryManager<TArachnodeDAO> memoryManager = new MemoryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    RuleManager<TArachnodeDAO> ruleManager = new RuleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager = new CrawlerPeerManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, null, arachnodeDAO);
    Cache<TArachnodeDAO> cache = new Cache<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, crawler, actionManager, cacheManager, crawlerPeerManager, memoryManager, ruleManager);
    DiscoveryManager<TArachnodeDAO> discoveryManager = new DiscoveryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, cache, actionManager, cacheManager, memoryManager, ruleManager);

    //load the CrawlActions, CrawlRules and EngineActions...
    ruleManager.ProcessCrawlRules(crawler);
    actionManager.ProcessCrawlActions(crawler);
    actionManager.ProcessEngineActions(crawler);

    //these three methods are called in the Engine.
    UserDefinedFunctions.RefreshAllowedExtensions(true);
    UserDefinedFunctions.RefreshAllowedSchemes(true);
    UserDefinedFunctions.RefreshDisallowed();

    //instantiate a WebClient to access the ResponseHeaders...
    WebClient<TArachnodeDAO> webClient = new WebClient<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager, cookieManager, new ProxyManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager));

    webClient.GetHttpWebResponse("http://google.com", "GET", null, null, null, null);

    FileManager<TArachnodeDAO> fileManager = new FileManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager, arachnodeDAO);

    for (long i = fileIDLowerBound; i <= fileIDUpperBound; i++)
    {
        ArachnodeDataSet.FilesRow filesRow = null;

        try
        {
            //get the File from the database. we need the source data as we don't store this in the index.
            //even though most of the fields are available in the Document, the File is the authoritative source, so we'll use that for all of the fields.
            filesRow = arachnodeDAO.GetFile(i.ToString());

            if (filesRow != null)
            {
                if (filesRow.Source == null || filesRow.Source.Length == 0)
                {
                    if (File.Exists(discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedFilesDirectory, filesRow.AbsoluteUri, filesRow.FullTextIndexType)))
                    {
                        filesRow.Source = File.ReadAllBytes(discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedFilesDirectory, filesRow.AbsoluteUri, filesRow.FullTextIndexType));
                    }
                    else
                    {
                        Console.WriteLine("FileID: " + i + " was NOT processed successfully.");

                        if (OnFileProcessed != null)
                        {
                            OnFileProcessed.BeginInvoke(filesRow, "FileID: " + i + " was NOT processed successfully.", null, null);
                        }
                    }
                }

                ProcessFile(crawler.ApplicationSettings, crawler.WebSettings, crawler, filesRow, webClient, cache, actionManager, consoleManager, crawlerPeerManager, discoveryManager, fileManager, memoryManager, ruleManager, arachnodeDAO);

                Console.WriteLine("FileID: " + i + " was processed successfully.");

                if (OnFileProcessed != null)
                {
                    OnFileProcessed.BeginInvoke(filesRow, "FileID: " + i + " was processed successfully.", null, null);
                }
            }
        }
        catch (Exception exception)
        {
            Console.WriteLine("FileID: " + i + " was NOT processed successfully.");
            Console.WriteLine(exception.Message);

            if (OnFileProcessed != null)
            {
                OnFileProcessed.BeginInvoke(filesRow, "FileID: " + i + " was NOT processed successfully.", null, null);
                OnFileProcessed.BeginInvoke(filesRow, exception.Message, null, null);
            }

            arachnodeDAO.InsertException(null, null, exception, false);
        }
    }

    //stop the CrawlActions, CrawlRules and EngineActions...
    ruleManager.Stop();
    actionManager.Stop();
}
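For orientation, a minimal calling sketch (not part of the original source): it assumes a Crawler<ArachnodeDAO> named crawler has already been constructed (see the constructor below), and that OnFileProcessed is a static delegate whose parameters match the (FilesRow, string) arguments passed to BeginInvoke above.

//assumption: OnFileProcessed accepts (ArachnodeDataSet.FilesRow filesRow, string message)...
Crawler<ArachnodeDAO>.OnFileProcessed += (filesRow, message) => Console.WriteLine(message);

//re-process FileIDs 1 through 1000 using the ApplicationSettings already configured on 'crawler'...
Crawler<ArachnodeDAO>.ProcessFiles(crawler, 1, 1000);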
/// <summary>
///     Processes a FilesRow after crawling.
/// </summary>
/// <param name = "applicationSettings"></param>
/// <param name = "webSettings"></param>
/// <param name = "crawler"></param>
/// <param name = "filesRow">The files row.</param>
/// <param name = "webClient"></param>
/// <param name = "cache"></param>
/// <param name = "actionManager"></param>
/// <param name = "consoleManager"></param>
/// <param name = "crawlerPeerManager"></param>
/// <param name = "discoveryManager"></param>
/// <param name = "fileManager">The file manager.</param>
/// <param name = "memoryManager"></param>
/// <param name = "ruleManager"></param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
public static void ProcessFile(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler<TArachnodeDAO> crawler, ArachnodeDataSet.FilesRow filesRow, WebClient<TArachnodeDAO> webClient, Cache<TArachnodeDAO> cache, ActionManager<TArachnodeDAO> actionManager, ConsoleManager<TArachnodeDAO> consoleManager, CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager, DiscoveryManager<TArachnodeDAO> discoveryManager, FileManager<TArachnodeDAO> fileManager, MemoryManager<TArachnodeDAO> memoryManager, RuleManager<TArachnodeDAO> ruleManager, IArachnodeDAO arachnodeDAO)
{
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(applicationSettings, webSettings);
    CookieManager cookieManager = new CookieManager();
    CrawlRequestManager<TArachnodeDAO> crawlRequestManager = new CrawlRequestManager<TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
    DataTypeManager<TArachnodeDAO> dataTypeManager = new DataTypeManager<TArachnodeDAO>(applicationSettings, webSettings);
    EncodingManager<TArachnodeDAO> encodingManager = new EncodingManager<TArachnodeDAO>(applicationSettings, webSettings);
    PolitenessManager<TArachnodeDAO> politenessManager = new PolitenessManager<TArachnodeDAO>(applicationSettings, webSettings, cache);
    ProxyManager<TArachnodeDAO> proxyManager = new ProxyManager<TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
    HtmlManager<TArachnodeDAO> htmlManager = new HtmlManager<TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
    Crawl<TArachnodeDAO> crawl = new Crawl<TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

    //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
    CrawlRequest<TArachnodeDAO> crawlRequest = new CrawlRequest<TArachnodeDAO>(new Discovery<TArachnodeDAO>(filesRow.AbsoluteUri), 1, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

    crawlRequest.Crawl = crawl;
    crawlRequest.Discovery.DiscoveryType = DiscoveryType.File;
    crawlRequest.Discovery.ID = filesRow.ID;
    crawlRequest.Data = filesRow.Source;
    crawlRequest.ProcessData = true;
    crawlRequest.WebClient = webClient;

    crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

    //parse the ResponseHeaders from the FilesRow.ResponseHeaders string...
    foreach (string responseHeader in filesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
    {
        string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

        string name = responseHeaderSplit[0];
        string value = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, name, true).Value;

        crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
    }

    //refresh the DataTypes in the DataTypeManager... (if necessary)...
    if (dataTypeManager.AllowedDataTypes.Count == 0)
    {
        dataTypeManager.RefreshDataTypes();
    }

    crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

    if (applicationSettings.InsertFiles)
    {
        crawlRequest.Discovery.ID = arachnodeDAO.InsertFile(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertFileSource ? crawlRequest.Data : new byte[] {}, crawlRequest.DataType.FullTextIndexType, applicationSettings.ClassifyAbsoluteUris);
    }

    crawlRequest.ManagedDiscovery = fileManager.ManageFile(crawlRequest, crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractFileMetaData, applicationSettings.InsertFileMetaData, applicationSettings.SaveDiscoveredFilesToDisk);

    actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

    discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
}
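The response-header loop above splits each line on ':' only to recover the header name, then re-extracts the value with UserDefinedFunctions.ExtractResponseHeader, presumably so that values which themselves contain colons (e.g. "Date: Mon, 01 Jan 2024 12:00:00 GMT") are not truncated. A plain-string equivalent is sketched below purely for illustration; the helper name ParseResponseHeaders is hypothetical and not part of the library (WebHeaderCollection is System.Net).

//hypothetical helper illustrating the equivalent parse with plain string operations...
private static void ParseResponseHeaders(string responseHeaders, WebHeaderCollection headers)
{
    foreach (string responseHeader in responseHeaders.Split(new[] { "\r\n" }, StringSplitOptions.RemoveEmptyEntries))
    {
        int index = responseHeader.IndexOf(':');

        if (index > 0)
        {
            //take everything after the first ':' so values containing ':' remain intact...
            headers.Add(responseHeader.Substring(0, index).Trim(), responseHeader.Substring(index + 1).Trim());
        }
    }
}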
/// <summary>
///     Initializes a new instance of the <see cref = "Crawler" /> class.
/// </summary>
public Crawler(ApplicationSettings applicationSettings, WebSettings webSettings, CrawlMode crawlMode, List<CrawlerPeer> crawlerPeers, List<DatabasePeer> databasePeers, bool enableRenderers)
{
    Guid = Guid.NewGuid();

    try
    {
        _applicationSettings = applicationSettings;
        _webSettings = webSettings;

        _arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true);

        _applicationSettings = _arachnodeDAO.ApplicationSettings;

        _consoleManager = new ConsoleManager<TArachnodeDAO>(_applicationSettings, _webSettings);
        _consoleManager.OutputString("arachnode.net " + Assembly.GetExecutingAssembly().GetName().Version, ConsoleColor.Green, ConsoleColor.Gray);

        _actionManager = new ActionManager<TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
        _ruleManager = new RuleManager<TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
        _memoryManager = new MemoryManager<TArachnodeDAO>(_applicationSettings, _webSettings);
        _cacheManager = new CacheManager<TArachnodeDAO>(_applicationSettings, _webSettings);
        _cookieManager = new CookieManager();

        CrawlerPeers = crawlerPeers;
        DatabasePeers = databasePeers;

        _crawlerPeerManager = new CrawlerPeerManager<TArachnodeDAO>(_applicationSettings, _webSettings, CrawlerPeers, (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true));
        _databasePeerManager = new DatabasePeerManager<TArachnodeDAO>(_applicationSettings, _webSettings, DatabasePeers);

        _cache = new Cache<TArachnodeDAO>(_applicationSettings, _webSettings, this, _actionManager, _cacheManager, _crawlerPeerManager, _memoryManager, _ruleManager);

        _dataTypeManager = new DataTypeManager<TArachnodeDAO>(_applicationSettings, _webSettings);
        _discoveryManager = new DiscoveryManager<TArachnodeDAO>(_applicationSettings, _webSettings, _cache, _actionManager, _cacheManager, _memoryManager, _ruleManager);
        _crawlRequestManager = new CrawlRequestManager<TArachnodeDAO>(_applicationSettings, _webSettings, _cache, _consoleManager, _discoveryManager);
        _encodingManager = new EncodingManager<TArachnodeDAO>(_applicationSettings, _webSettings);
        _htmlManager = new HtmlManager<TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager);
        _politenessManager = new PolitenessManager<TArachnodeDAO>(_applicationSettings, _webSettings, _cache);
        _proxyManager = new ProxyManager<TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
        _reportingManager = new ReportingManager<TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);

        //create required directories...
        if (!Directory.Exists(_applicationSettings.ConsoleOutputLogsDirectory))
        {
            Directory.CreateDirectory(_applicationSettings.ConsoleOutputLogsDirectory);
        }
        if (!Directory.Exists(_applicationSettings.DownloadedFilesDirectory))
        {
            Directory.CreateDirectory(_applicationSettings.DownloadedFilesDirectory);
        }
        if (!Directory.Exists(_applicationSettings.DownloadedImagesDirectory))
        {
            Directory.CreateDirectory(_applicationSettings.DownloadedImagesDirectory);
        }
        if (!Directory.Exists(_applicationSettings.DownloadedWebPagesDirectory))
        {
            Directory.CreateDirectory(_applicationSettings.DownloadedWebPagesDirectory);
        }

        QueryProcessor = new QueryProcessor<TArachnodeDAO>();

        _consoleManager.OutputString("Crawler: Initializing Configuration/Database Connection.", ConsoleColor.White, ConsoleColor.Gray);

        LoadCrawlActions(_arachnodeDAO);
        LoadCrawlRules(_arachnodeDAO);

        AreRenderersEnabled = enableRenderers;

        Engine = new Engine<TArachnodeDAO>(_applicationSettings, _webSettings, this, _cache, _actionManager, _cacheManager, _consoleManager, _cookieManager, _crawlRequestManager, _dataTypeManager, _discoveryManager, _encodingManager, _htmlManager, _memoryManager, _politenessManager, _proxyManager, _reportingManager, _ruleManager, enableRenderers, (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true));

        CrawlMode = crawlMode;

        if (CrawlerPeerManager != null && CrawlerPeerManager.CrawlerPeers != null && CrawlerPeerManager.CrawlerPeers.Count != 0)
        {
            ConsoleManager.OutputString("Crawler: Starting CrawlerPeerManager Server", ConsoleColor.White, ConsoleColor.Gray);

            CrawlerPeerManager.StartServer(this, _arachnodeDAO);

            _crawlerPeerManager.SendStatusMessageToCrawlerPeers(_arachnodeDAO);
        }

        if (Debugger.IsAttached)
        {
            _consoleManager.OutputString("Debugger: Attached - Expect Performance Degradation.", ConsoleColor.Yellow, ConsoleColor.Gray);
        }

        //update all core/components/managers with the updated ApplicationSettings...
#if DEMO
        Engine.CrawlRequestCompleted += Engine_CrawlRequestCompleted;

        _stopwatch.Start();
#endif
    }
    catch (InvalidConfigurationException invalidConfigurationException)
    {
        ProcessException(invalidConfigurationException);

        throw new InvalidConfigurationException(invalidConfigurationException.ApplicationSettings, invalidConfigurationException.WebSettings, invalidConfigurationException.Message, InvalidConfigurationExceptionSeverity.Error);
    }
    catch (Exception exception)
    {
        ProcessException(exception);

        throw new Exception(exception.Message, exception);
    }
}
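A minimal construction sketch (not from the original source): ApplicationSettings and WebSettings are assumed to expose parameterless constructors and a writable ConnectionString, and CrawlMode.BreadthFirstByPriority is an assumed enum member; adjust to the actual build.

//assumptions noted above; peers are omitted and renderers are disabled...
ApplicationSettings applicationSettings = new ApplicationSettings();
applicationSettings.ConnectionString = "Data Source=.;Initial Catalog=arachnode.net;Integrated Security=True;";

WebSettings webSettings = new WebSettings();

Crawler<ArachnodeDAO> crawler = new Crawler<ArachnodeDAO>(applicationSettings, webSettings, CrawlMode.BreadthFirstByPriority, null, null, false);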
/// <summary>
///     Processes a WebPagesRow after crawling.
/// </summary>
/// <param name = "applicationSettings"></param>
/// <param name = "webSettings"></param>
/// <param name = "crawler"></param>
/// <param name = "webPagesRow">The web pages row.</param>
/// <param name = "webClient"></param>
/// <param name = "cache"></param>
/// <param name = "actionManager"></param>
/// <param name = "consoleManager"></param>
/// <param name = "crawlerPeerManager"></param>
/// <param name = "discoveryManager"></param>
/// <param name = "memoryManager"></param>
/// <param name = "ruleManager"></param>
/// <param name = "webPageManager">The web page manager.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
public static void ProcessWebPage(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler<TArachnodeDAO> crawler, ArachnodeDataSet.WebPagesRow webPagesRow, WebClient<TArachnodeDAO> webClient, Cache<TArachnodeDAO> cache, ActionManager<TArachnodeDAO> actionManager, ConsoleManager<TArachnodeDAO> consoleManager, CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager, DiscoveryManager<TArachnodeDAO> discoveryManager, MemoryManager<TArachnodeDAO> memoryManager, RuleManager<TArachnodeDAO> ruleManager, WebPageManager<TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
{
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(applicationSettings, webSettings);
    CookieManager cookieManager = new CookieManager();
    CrawlRequestManager<TArachnodeDAO> crawlRequestManager = new CrawlRequestManager<TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
    DataTypeManager<TArachnodeDAO> dataTypeManager = new DataTypeManager<TArachnodeDAO>(applicationSettings, webSettings);
    EncodingManager<TArachnodeDAO> encodingManager = new EncodingManager<TArachnodeDAO>(applicationSettings, webSettings);
    PolitenessManager<TArachnodeDAO> politenessManager = new PolitenessManager<TArachnodeDAO>(applicationSettings, webSettings, cache);
    ProxyManager<TArachnodeDAO> proxyManager = new ProxyManager<TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
    HtmlManager<TArachnodeDAO> htmlManager = new HtmlManager<TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
    Crawl<TArachnodeDAO> crawl = new Crawl<TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

    //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
    CrawlRequest<TArachnodeDAO> crawlRequest = new CrawlRequest<TArachnodeDAO>(new Discovery<TArachnodeDAO>(webPagesRow.AbsoluteUri), webPagesRow.CrawlDepth, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

    crawlRequest.Crawl = crawl;
    crawlRequest.Discovery.DiscoveryType = DiscoveryType.WebPage;
    crawlRequest.Discovery.ID = webPagesRow.ID;
    crawlRequest.Data = webPagesRow.Source;
    crawlRequest.CurrentDepth = webPagesRow.CrawlDepth;
    crawlRequest.Encoding = Encoding.GetEncoding(webPagesRow.CodePage);
    crawlRequest.ProcessData = true;
    crawlRequest.WebClient = webClient;

    crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

    //parse the ResponseHeaders from the WebPagesRow.ResponseHeaders string...
    foreach (string responseHeader in webPagesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
    {
        string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

        string name = responseHeaderSplit[0];
        string value = UserDefinedFunctions.ExtractResponseHeader(webPagesRow.ResponseHeaders, name, true).Value;

        crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
    }

    //refresh the DataTypes in the DataTypeManager... (if necessary)...
    if (dataTypeManager.AllowedDataTypes.Count == 0)
    {
        dataTypeManager.RefreshDataTypes();
    }

    crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

    //now, process the bytes...
    encodingManager.ProcessCrawlRequest(crawlRequest, arachnodeDAO);

    if (applicationSettings.InsertWebPages)
    {
        crawlRequest.Discovery.ID = arachnodeDAO.InsertWebPage(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertWebPageSource ? crawlRequest.Data : new byte[] {}, crawlRequest.Encoding.CodePage, crawlRequest.DataType.FullTextIndexType, crawlRequest.CurrentDepth, applicationSettings.ClassifyAbsoluteUris);
    }

    crawlRequest.ManagedDiscovery = webPageManager.ManageWebPage(crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.Encoding, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractWebPageMetaData, applicationSettings.InsertWebPageMetaData, applicationSettings.SaveDiscoveredWebPagesToDisk);

    //assigning FileAndImageDiscoveries isn't applicable here because Files and Images need to be crawled to be properly classified... without classification we don't know whether they belong in dbo.Files or dbo.Images...
    crawlRequestManager.ProcessEmailAddresses(crawlRequest, arachnodeDAO);
    crawlRequestManager.ProcessHyperLinks(crawlRequest, arachnodeDAO);

    actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

    discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
}
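ProcessWebPage mirrors ProcessFile and expects the caller to supply the same pre-built WebClient, Cache, and manager set that ProcessFiles assembles for Files. A minimal sketch follows, with the caveats that arachnodeDAO.GetWebPage (analogous to GetFile) and the WebPageManager constructor arguments shown are assumptions, not confirmed API.

//assumptions: GetWebPage exists and WebPageManager is constructed like FileManager; 'crawler',
//'webClient', 'cache' and the other managers are built exactly as in ProcessFiles above...
ArachnodeDataSet.WebPagesRow webPagesRow = arachnodeDAO.GetWebPage("1");

WebPageManager<ArachnodeDAO> webPageManager = new WebPageManager<ArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager, arachnodeDAO);

if (webPagesRow != null)
{
    Crawler<ArachnodeDAO>.ProcessWebPage(crawler.ApplicationSettings, crawler.WebSettings, crawler, webPagesRow, webClient, cache, actionManager, consoleManager, crawlerPeerManager, discoveryManager, memoryManager, ruleManager, webPageManager, arachnodeDAO);
}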