/// <summary>
///     Re-processes a stored FilesRow: wraps it in a CrawlRequest, restores its response headers,
///     determines its data type, optionally re-inserts it, and then runs file management and the
///     post-request crawl actions against it.
/// </summary>
/// <param name = "applicationSettings">Settings handed to every manager created here; also supplies the Insert*/Extract*/Save* switches read below.</param>
/// <param name = "webSettings">Web settings handed to every manager created here.</param>
/// <param name = "crawler">The crawler passed to the Crawl that backs the CrawlRequest.</param>
/// <param name = "filesRow">The files row supplying the AbsoluteUri, ID, Source and ResponseHeaders.</param>
/// <param name = "webClient">The web client assigned to the CrawlRequest; its response headers are cleared and rebuilt.</param>
/// <param name = "cache">The cache passed to the CrawlRequestManager and PolitenessManager.</param>
/// <param name = "actionManager">Used to perform the PostRequest crawl actions.</param>
/// <param name = "consoleManager">Passed to the CrawlRequestManager, ProxyManager and Crawl.</param>
/// <param name = "crawlerPeerManager">Not used by this method.</param>
/// <param name = "discoveryManager">Passed to the managers created here; also closes and disposes the managed discovery at the end.</param>
/// <param name = "fileManager">The file manager whose ManageFile produces the managed discovery.</param>
/// <param name = "memoryManager">Not used by this method.</param>
/// <param name = "ruleManager">Passed to the Crawl.</param>
/// <param name = "arachnodeDAO">The arachnode DAO used for InsertFile, the crawl actions and disposal.</param>
public static void ProcessFile(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler<TArachnodeDAO> crawler, ArachnodeDataSet.FilesRow filesRow, WebClient<TArachnodeDAO> webClient, Cache<TArachnodeDAO> cache, ActionManager<TArachnodeDAO> actionManager, ConsoleManager<TArachnodeDAO> consoleManager, CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager, DiscoveryManager<TArachnodeDAO> discoveryManager, FileManager<TArachnodeDAO> fileManager, MemoryManager<TArachnodeDAO> memoryManager, RuleManager<TArachnodeDAO> ruleManager, IArachnodeDAO arachnodeDAO)
{
    // Managers that are local to this call.
    // NOTE(review): cacheManager is never read afterwards; it is kept because its constructor's side effects are not visible here.
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(applicationSettings, webSettings);
    CookieManager cookieManager = new CookieManager();
    CrawlRequestManager<TArachnodeDAO> crawlRequestManager = new CrawlRequestManager<TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
    DataTypeManager<TArachnodeDAO> dataTypeManager = new DataTypeManager<TArachnodeDAO>(applicationSettings, webSettings);
    EncodingManager<TArachnodeDAO> encodingManager = new EncodingManager<TArachnodeDAO>(applicationSettings, webSettings);
    PolitenessManager<TArachnodeDAO> politenessManager = new PolitenessManager<TArachnodeDAO>(applicationSettings, webSettings, cache);
    ProxyManager<TArachnodeDAO> proxyManager = new ProxyManager<TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
    HtmlManager<TArachnodeDAO> htmlManager = new HtmlManager<TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);

    Crawl<TArachnodeDAO> crawl = new Crawl<TArachnodeDAO>(
        applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager,
        crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager,
        politenessManager, proxyManager, ruleManager, true);

    // The internals of SiteCrawler.dll operate on CrawlRequests, so wrap the row in one.
    Discovery<TArachnodeDAO> discovery = new Discovery<TArachnodeDAO>(filesRow.AbsoluteUri);
    CrawlRequest<TArachnodeDAO> request = new CrawlRequest<TArachnodeDAO>(discovery, 1, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

    request.Crawl = crawl;
    request.Discovery.DiscoveryType = DiscoveryType.File;
    request.Discovery.ID = filesRow.ID;
    request.Data = filesRow.Source;
    request.ProcessData = true;
    request.WebClient = webClient;
    request.WebClient.HttpWebResponse.Headers.Clear();

    // Rebuild the response headers from the FilesRow.ResponseHeaders string, one non-empty line at a time.
    char[] lineSeparators = new char[] { '\r', '\n' };

    foreach (string headerLine in filesRow.ResponseHeaders.Split(lineSeparators, StringSplitOptions.RemoveEmptyEntries))
    {
        // Everything before the first ':' is the header name; the value is looked up in the full header string.
        string headerName = headerLine.Split(':')[0];
        string headerValue = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, headerName, true).Value;

        request.WebClient.HttpWebResponse.Headers.Add(headerName, headerValue);
    }

    // Load the DataTypes on first use (the DataTypeManager above was just created).
    if (dataTypeManager.AllowedDataTypes.Count == 0)
    {
        dataTypeManager.RefreshDataTypes();
    }

    request.DataType = dataTypeManager.DetermineDataType(request);

    if (applicationSettings.InsertFiles)
    {
        request.Discovery.ID = arachnodeDAO.InsertFile(
            request.Parent.Uri.AbsoluteUri,
            request.Discovery.Uri.AbsoluteUri,
            request.WebClient.HttpWebResponse.Headers.ToString(),
            applicationSettings.InsertFileSource ? request.Data : new byte[] { },
            request.DataType.FullTextIndexType,
            applicationSettings.ClassifyAbsoluteUris);
    }

    request.ManagedDiscovery = fileManager.ManageFile(
        request,
        request.Discovery.ID.Value,
        request.Discovery.Uri.AbsoluteUri,
        request.Data,
        request.DataType.FullTextIndexType,
        applicationSettings.ExtractFileMetaData,
        applicationSettings.InsertFileMetaData,
        applicationSettings.SaveDiscoveredFilesToDisk);

    actionManager.PerformCrawlActions(request, CrawlActionType.PostRequest, arachnodeDAO);

    discoveryManager.CloseAndDisposeManagedDiscovery(request, arachnodeDAO);
}
/// <summary>
///     Initializes a new instance of the <see cref = "Crawl" /> class: stores the supplied settings and
///     managers, creates this Crawl's own DAO instance, and builds the data, file, image and web page
///     managers plus the web client on top of them.
/// </summary>
/// <param name = "applicationSettings">The application settings; also assigned to the newly created DAO.</param>
/// <param name = "webSettings">The web settings.</param>
/// <param name = "crawler">The crawler.</param>
/// <param name = "actionManager">The action manager.</param>
/// <param name = "consoleManager">The console manager.</param>
/// <param name = "cookieManager">The cookie manager.</param>
/// <param name = "crawlRequestManager">The crawl request manager.</param>
/// <param name = "dataTypeManager">The data type manager.</param>
/// <param name = "discoveryManager">The discovery manager.</param>
/// <param name = "encodingManager">The encoding manager.</param>
/// <param name = "htmlManager">The HTML manager.</param>
/// <param name = "politenessManager">The politeness manager.</param>
/// <param name = "proxyManager">The proxy manager.</param>
/// <param name = "ruleManager">The rule manager.</param>
/// <param name = "processData">if set to <c>true</c> [process data].</param>
public Crawl(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler<TArachnodeDAO> crawler, ActionManager<TArachnodeDAO> actionManager, ConsoleManager<TArachnodeDAO> consoleManager, CookieManager cookieManager, CrawlRequestManager<TArachnodeDAO> crawlRequestManager, DataTypeManager<TArachnodeDAO> dataTypeManager, DiscoveryManager<TArachnodeDAO> discoveryManager, EncodingManager<TArachnodeDAO> encodingManager, HtmlManager<TArachnodeDAO> htmlManager, PolitenessManager<TArachnodeDAO> politenessManager, ProxyManager<TArachnodeDAO> proxyManager, RuleManager<TArachnodeDAO> ruleManager, bool processData)
{
    // --- settings -----------------------------------------------------------------------------
    _applicationSettings = applicationSettings;
    _webSettings = webSettings;

    // --- per-Crawl work queues ----------------------------------------------------------------
    UncrawledCrawlRequests = new PriorityQueue<CrawlRequest<TArachnodeDAO>>();
    UnassignedDiscoveries = new HashSet<string>();

    // --- owner and crawl info -----------------------------------------------------------------
    _crawler = crawler;
    _crawlInfo.MaximumCrawlDepth = 1;

    // --- managers supplied by the caller ------------------------------------------------------
    _actionManager = actionManager;
    _consoleManager = consoleManager;
    _cookieManager = cookieManager;
    _crawlRequestManager = crawlRequestManager;
    _dataTypeManager = dataTypeManager;
    _discoveryManager = discoveryManager;
    _encodingManager = encodingManager;
    _htmlManager = htmlManager;
    _politenessManager = politenessManager;
    _proxyManager = proxyManager;
    _ruleManager = ruleManager;

    _processData = processData;

    // --- DAO owned by this Crawl --------------------------------------------------------------
    // TArachnodeDAO is created reflectively with (connection string, application settings, web settings, false, false).
    object[] daoConstructorArguments =
    {
        _applicationSettings.ConnectionString,
        _applicationSettings,
        _webSettings,
        false,
        false
    };

    _arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), daoConstructorArguments);
    _arachnodeDAO.ApplicationSettings = applicationSettings;
    //_arachnodeDAO.OpenCommandConnections();

    // --- managers built on top of the DAO -----------------------------------------------------
    _dataManager = new DataManager<TArachnodeDAO>(_applicationSettings, _webSettings, _actionManager, _dataTypeManager, _discoveryManager, _ruleManager, _arachnodeDAO);
    _fileManager = new FileManager<TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager, _arachnodeDAO);
    _imageManager = new ImageManager<TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager, _arachnodeDAO);
    _webClient = new WebClient<TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager, _cookieManager, _proxyManager);
    _webPageManager = new WebPageManager<TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager, _htmlManager, _arachnodeDAO);
}
/// <summary>
///     Initializes a new instance of the <see cref = "Crawler" /> class: creates the DAO, adopts the
///     ApplicationSettings exposed by that DAO, builds every manager and the cache, creates the
///     required output directories, loads the crawl actions and rules, creates the Engine and, when
///     crawler peers are configured, starts the CrawlerPeerManager server.
/// </summary>
/// <param name = "applicationSettings">The initial application settings; replaced by the DAO's ApplicationSettings once the DAO has been created.</param>
/// <param name = "webSettings">The web settings.</param>
/// <param name = "crawlMode">The crawl mode assigned to <c>CrawlMode</c>.</param>
/// <param name = "crawlerPeers">The crawler peers assigned to <c>CrawlerPeers</c> and handed to the CrawlerPeerManager.</param>
/// <param name = "databasePeers">The database peers assigned to <c>DatabasePeers</c> and handed to the DatabasePeerManager.</param>
/// <param name = "enableRenderers">Assigned to <c>AreRenderersEnabled</c> and passed to the Engine.</param>
/// <exception cref = "InvalidConfigurationException">Re-created from the caught exception with severity Error, after ProcessException has run.</exception>
/// <exception cref = "Exception">Wraps any other exception (as InnerException), after ProcessException has run.</exception>
public Crawler(ApplicationSettings applicationSettings, WebSettings webSettings, CrawlMode crawlMode, List<CrawlerPeer> crawlerPeers, List<DatabasePeer> databasePeers, bool enableRenderers)
{
    Guid = Guid.NewGuid();

    try
    {
        _applicationSettings = applicationSettings;
        _webSettings = webSettings;

        // Create the DAO first, then switch to the ApplicationSettings instance it exposes so that
        // everything constructed below shares that instance.
        _arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true);
        _applicationSettings = _arachnodeDAO.ApplicationSettings;

        _consoleManager = new ConsoleManager<TArachnodeDAO>(_applicationSettings, _webSettings);
        _consoleManager.OutputString("arachnode.net " + Assembly.GetExecutingAssembly().GetName().Version, ConsoleColor.Green, ConsoleColor.Gray);

        _actionManager = new ActionManager<TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
        _ruleManager = new RuleManager<TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
        _memoryManager = new MemoryManager<TArachnodeDAO>(_applicationSettings, _webSettings);
        // Constructed exactly once; the original built a second CacheManager right after the
        // CookieManager and discarded this one.
        _cacheManager = new CacheManager<TArachnodeDAO>(_applicationSettings, _webSettings);
        _cookieManager = new CookieManager();

        CrawlerPeers = crawlerPeers;
        DatabasePeers = databasePeers;

        // The CrawlerPeerManager receives its own DAO instance.
        _crawlerPeerManager = new CrawlerPeerManager<TArachnodeDAO>(_applicationSettings, _webSettings, CrawlerPeers, (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true));
        _databasePeerManager = new DatabasePeerManager<TArachnodeDAO>(_applicationSettings, _webSettings, DatabasePeers);

        _cache = new Cache<TArachnodeDAO>(_applicationSettings, _webSettings, this, _actionManager, _cacheManager, _crawlerPeerManager, _memoryManager, _ruleManager);

        _dataTypeManager = new DataTypeManager<TArachnodeDAO>(_applicationSettings, _webSettings);
        _discoveryManager = new DiscoveryManager<TArachnodeDAO>(_applicationSettings, _webSettings, _cache, _actionManager, _cacheManager, _memoryManager, _ruleManager);
        _crawlRequestManager = new CrawlRequestManager<TArachnodeDAO>(_applicationSettings, _webSettings, _cache, _consoleManager, _discoveryManager);
        _encodingManager = new EncodingManager<TArachnodeDAO>(_applicationSettings, _webSettings);
        _htmlManager = new HtmlManager<TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager);
        _politenessManager = new PolitenessManager<TArachnodeDAO>(_applicationSettings, _webSettings, _cache);
        _proxyManager = new ProxyManager<TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
        _reportingManager = new ReportingManager<TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);

        // Create the required directories. Directory.CreateDirectory does nothing when the
        // directory already exists, so no separate Directory.Exists check is needed.
        string[] requiredDirectories = new string[]
        {
            _applicationSettings.ConsoleOutputLogsDirectory,
            _applicationSettings.DownloadedFilesDirectory,
            _applicationSettings.DownloadedImagesDirectory,
            _applicationSettings.DownloadedWebPagesDirectory
        };

        foreach (string requiredDirectory in requiredDirectories)
        {
            Directory.CreateDirectory(requiredDirectory);
        }

        QueryProcessor = new QueryProcessor<TArachnodeDAO>();

        _consoleManager.OutputString("Crawler: Initializing Configuration/Database Connection.", ConsoleColor.White, ConsoleColor.Gray);

        LoadCrawlActions(_arachnodeDAO);
        LoadCrawlRules(_arachnodeDAO);

        AreRenderersEnabled = enableRenderers;

        // The Engine also receives its own DAO instance.
        Engine = new Engine<TArachnodeDAO>(_applicationSettings, _webSettings, this, _cache, _actionManager, _cacheManager, _consoleManager, _cookieManager, _crawlRequestManager, _dataTypeManager, _discoveryManager, _encodingManager, _htmlManager, _memoryManager, _politenessManager, _proxyManager, _reportingManager, _ruleManager, enableRenderers, (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true));

        CrawlMode = crawlMode;

        // Only start the peer server when at least one crawler peer is configured.
        if (CrawlerPeerManager != null && CrawlerPeerManager.CrawlerPeers != null && CrawlerPeerManager.CrawlerPeers.Count != 0)
        {
            ConsoleManager.OutputString("Crawler: Starting CrawlerPeerManager Server", ConsoleColor.White, ConsoleColor.Gray);

            CrawlerPeerManager.StartServer(this, _arachnodeDAO);

            _crawlerPeerManager.SendStatusMessageToCrawlerPeers(_arachnodeDAO);
        }

        if (Debugger.IsAttached)
        {
            _consoleManager.OutputString("Debugger: Attached - Expect Performance Degradation.", ConsoleColor.Yellow, ConsoleColor.Gray);
        }

        // NOTE(review): the original carried the comment "update all core/components/managers with the
        // updated ApplicationSettings..." at this point, but no code follows it.
#if DEMO
        Engine.CrawlRequestCompleted += Engine_CrawlRequestCompleted;

        _stopwatch.Start();
#endif
    }
    catch (InvalidConfigurationException invalidConfigurationException)
    {
        ProcessException(invalidConfigurationException);

        // Re-created (not rethrown) so that the severity is Error; kept as-is for existing callers.
        throw new InvalidConfigurationException(invalidConfigurationException.ApplicationSettings, invalidConfigurationException.WebSettings, invalidConfigurationException.Message, InvalidConfigurationExceptionSeverity.Error);
    }
    catch (Exception exception)
    {
        ProcessException(exception);

        // Wrapped with the original as InnerException; kept as-is because callers may rely on it.
        throw new Exception(exception.Message, exception);
    }
}
/// <summary>
///     Re-processes a stored WebPagesRow: wraps it in a CrawlRequest, restores its response headers,
///     determines its data type, runs the encoding manager over it, optionally re-inserts it, and then
///     runs web page management, e-mail address/hyperlink processing and the post-request crawl actions.
/// </summary>
/// <param name = "applicationSettings">Settings handed to every manager created here; also supplies the Insert*/Extract*/Save* switches read below.</param>
/// <param name = "webSettings">Web settings handed to every manager created here.</param>
/// <param name = "crawler">The crawler passed to the Crawl that backs the CrawlRequest.</param>
/// <param name = "webPagesRow">The web pages row supplying the AbsoluteUri, ID, Source, CrawlDepth, CodePage and ResponseHeaders.</param>
/// <param name = "webClient">The web client assigned to the CrawlRequest; its response headers are cleared and rebuilt.</param>
/// <param name = "cache">The cache passed to the CrawlRequestManager and PolitenessManager.</param>
/// <param name = "actionManager">Used to perform the PostRequest crawl actions.</param>
/// <param name = "consoleManager">Passed to the CrawlRequestManager, ProxyManager and Crawl.</param>
/// <param name = "crawlerPeerManager">Not used by this method.</param>
/// <param name = "discoveryManager">Passed to the managers created here; also closes and disposes the managed discovery at the end.</param>
/// <param name = "memoryManager">Not used by this method.</param>
/// <param name = "ruleManager">Passed to the Crawl.</param>
/// <param name = "webPageManager">The web page manager whose ManageWebPage produces the managed discovery.</param>
/// <param name = "arachnodeDAO">The arachnode DAO used for InsertWebPage, encoding, discovery processing, the crawl actions and disposal.</param>
public static void ProcessWebPage(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler<TArachnodeDAO> crawler, ArachnodeDataSet.WebPagesRow webPagesRow, WebClient<TArachnodeDAO> webClient, Cache<TArachnodeDAO> cache, ActionManager<TArachnodeDAO> actionManager, ConsoleManager<TArachnodeDAO> consoleManager, CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager, DiscoveryManager<TArachnodeDAO> discoveryManager, MemoryManager<TArachnodeDAO> memoryManager, RuleManager<TArachnodeDAO> ruleManager, WebPageManager<TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
{
    // Managers that are local to this call.
    // NOTE(review): cacheManager is never read afterwards; it is kept because its constructor's side effects are not visible here.
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(applicationSettings, webSettings);
    CookieManager cookieManager = new CookieManager();
    CrawlRequestManager<TArachnodeDAO> crawlRequestManager = new CrawlRequestManager<TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
    DataTypeManager<TArachnodeDAO> dataTypeManager = new DataTypeManager<TArachnodeDAO>(applicationSettings, webSettings);
    EncodingManager<TArachnodeDAO> encodingManager = new EncodingManager<TArachnodeDAO>(applicationSettings, webSettings);
    PolitenessManager<TArachnodeDAO> politenessManager = new PolitenessManager<TArachnodeDAO>(applicationSettings, webSettings, cache);
    ProxyManager<TArachnodeDAO> proxyManager = new ProxyManager<TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
    HtmlManager<TArachnodeDAO> htmlManager = new HtmlManager<TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);

    Crawl<TArachnodeDAO> crawl = new Crawl<TArachnodeDAO>(
        applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager,
        crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager,
        politenessManager, proxyManager, ruleManager, true);

    // The internals of SiteCrawler.dll operate on CrawlRequests, so wrap the row in one.
    Discovery<TArachnodeDAO> discovery = new Discovery<TArachnodeDAO>(webPagesRow.AbsoluteUri);
    CrawlRequest<TArachnodeDAO> request = new CrawlRequest<TArachnodeDAO>(discovery, webPagesRow.CrawlDepth, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

    request.Crawl = crawl;
    request.Discovery.DiscoveryType = DiscoveryType.WebPage;
    request.Discovery.ID = webPagesRow.ID;
    request.Data = webPagesRow.Source;
    request.CurrentDepth = webPagesRow.CrawlDepth;
    request.Encoding = Encoding.GetEncoding(webPagesRow.CodePage);
    request.ProcessData = true;
    request.WebClient = webClient;
    request.WebClient.HttpWebResponse.Headers.Clear();

    // Rebuild the response headers from the WebPagesRow.ResponseHeaders string, one non-empty line at a time.
    char[] lineSeparators = new char[] { '\r', '\n' };

    foreach (string headerLine in webPagesRow.ResponseHeaders.Split(lineSeparators, StringSplitOptions.RemoveEmptyEntries))
    {
        // Everything before the first ':' is the header name; the value is looked up in the full header string.
        string headerName = headerLine.Split(':')[0];
        string headerValue = UserDefinedFunctions.ExtractResponseHeader(webPagesRow.ResponseHeaders, headerName, true).Value;

        request.WebClient.HttpWebResponse.Headers.Add(headerName, headerValue);
    }

    // Load the DataTypes on first use (the DataTypeManager above was just created).
    if (dataTypeManager.AllowedDataTypes.Count == 0)
    {
        dataTypeManager.RefreshDataTypes();
    }

    request.DataType = dataTypeManager.DetermineDataType(request);

    // Now, process the bytes.
    encodingManager.ProcessCrawlRequest(request, arachnodeDAO);

    if (applicationSettings.InsertWebPages)
    {
        request.Discovery.ID = arachnodeDAO.InsertWebPage(
            request.Discovery.Uri.AbsoluteUri,
            request.WebClient.HttpWebResponse.Headers.ToString(),
            applicationSettings.InsertWebPageSource ? request.Data : new byte[] { },
            request.Encoding.CodePage,
            request.DataType.FullTextIndexType,
            request.CurrentDepth,
            applicationSettings.ClassifyAbsoluteUris);
    }

    request.ManagedDiscovery = webPageManager.ManageWebPage(
        request.Discovery.ID.Value,
        request.Discovery.Uri.AbsoluteUri,
        request.Data,
        request.Encoding,
        request.DataType.FullTextIndexType,
        applicationSettings.ExtractWebPageMetaData,
        applicationSettings.InsertWebPageMetaData,
        applicationSettings.SaveDiscoveredWebPagesToDisk);

    // Assigning FileAndImageDiscoveries isn't applicable here: Files and Images have to be crawled to be
    // classified, and without classification we don't know whether they belong in dbo.Files or dbo.Images.

    crawlRequestManager.ProcessEmailAddresses(request, arachnodeDAO);
    crawlRequestManager.ProcessHyperLinks(request, arachnodeDAO);

    actionManager.PerformCrawlActions(request, CrawlActionType.PostRequest, arachnodeDAO);

    discoveryManager.CloseAndDisposeManagedDiscovery(request, arachnodeDAO);
}