コード例 #1
0
        internal DiscoveryProcessor(ApplicationSettings applicationSettings, Crawler <TArachnodeDAO> crawler, CrawlRequestManager <TArachnodeDAO> crawlRequestManager)
        {
            _applicationSettings = applicationSettings;

            _crawler             = crawler;
            _crawlRequestManager = crawlRequestManager;

            _arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString);
            _arachnodeDAO.ApplicationSettings = _applicationSettings;
        }
コード例 #2
0
        /// <summary>
        ///     Processes a FilesRow after crawling.
        /// </summary>
        /// <param name = "filesRow">The files row.</param>
        /// <param name="webClient"></param>
        /// <param name="actionManager"></param>
        /// <param name="consoleManager"></param>
        /// <param name="discoveryManager"></param>
        /// <param name = "fileManager">The file manager.</param>
        /// <param name = "fileManager">The file manager.</param>
        /// <param name="memoryManager"></param>
        /// <param name="ruleManager"></param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        /// <param name = "imageManager">The image manager.</param>
        public static void ProcessFile(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ArachnodeDataSet.FilesRow filesRow, WebClient <TArachnodeDAO> webClient, Cache <TArachnodeDAO> cache, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager, DiscoveryManager <TArachnodeDAO> discoveryManager, FileManager <TArachnodeDAO> fileManager, MemoryManager <TArachnodeDAO> memoryManager, RuleManager <TArachnodeDAO> ruleManager, IArachnodeDAO arachnodeDAO)
        {
            CacheManager <TArachnodeDAO> cacheManager = new CacheManager <TArachnodeDAO>(applicationSettings, webSettings);
            CookieManager cookieManager = new CookieManager();;
            CrawlRequestManager <TArachnodeDAO> crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
            DataTypeManager <TArachnodeDAO>     dataTypeManager     = new DataTypeManager <TArachnodeDAO>(applicationSettings, webSettings);
            EncodingManager <TArachnodeDAO>     encodingManager     = new EncodingManager <TArachnodeDAO>(applicationSettings, webSettings);
            PolitenessManager <TArachnodeDAO>   politenessManager   = new PolitenessManager <TArachnodeDAO>(applicationSettings, webSettings, cache);
            ProxyManager <TArachnodeDAO>        proxyManager        = new ProxyManager <TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
            HtmlManager <TArachnodeDAO>         htmlManager         = new HtmlManager <TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
            Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

            //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
            CrawlRequest <TArachnodeDAO> crawlRequest = new CrawlRequest <TArachnodeDAO>(new Discovery <TArachnodeDAO>(filesRow.AbsoluteUri), 1, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

            crawlRequest.Crawl = crawl;
            crawlRequest.Discovery.DiscoveryType = DiscoveryType.File;
            crawlRequest.Discovery.ID            = filesRow.ID;
            crawlRequest.Data        = filesRow.Source;
            crawlRequest.ProcessData = true;
            crawlRequest.WebClient   = webClient;

            crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

            //parse the ResponseHeaders from the FilesRow.ResponseHeaders string...
            foreach (string responseHeader in filesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
            {
                string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

                string name  = responseHeaderSplit[0];
                string value = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, name, true).Value;

                crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
            }

            //refresh the DataTypes in the DataTypeManager... (if necessary)...
            if (dataTypeManager.AllowedDataTypes.Count == 0)
            {
                dataTypeManager.RefreshDataTypes();
            }

            crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

            if (applicationSettings.InsertFiles)
            {
                crawlRequest.Discovery.ID = arachnodeDAO.InsertFile(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertFileSource ? crawlRequest.Data : new byte[] { }, crawlRequest.DataType.FullTextIndexType, applicationSettings.ClassifyAbsoluteUris);
            }

            crawlRequest.ManagedDiscovery = fileManager.ManageFile(crawlRequest, crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractFileMetaData, applicationSettings.InsertFileMetaData, applicationSettings.SaveDiscoveredFilesToDisk);

            actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

            discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
        }
コード例 #3
0
ファイル: Crawl.cs プロジェクト: vishalishere/arachnode.net
        /// <summary>
        ///     Initializes a new instance of the <see cref = "Crawl" /> class.
        /// </summary>
        /// <param name = "crawler">The crawler.</param>
        /// <param name="actionManager"></param>
        /// <param name="crawlRequestManager"></param>
        /// <param name="discoveryManager"></param>
        /// <param name="htmlManager"></param>
        /// <param name="politenessManager"></param>
        /// <param name="ruleManager"></param>
        /// <param name = "processData">if set to <c>true</c> [process data].</param>
        public Crawl(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CookieManager cookieManager, CrawlRequestManager <TArachnodeDAO> crawlRequestManager, DataTypeManager <TArachnodeDAO> dataTypeManager, DiscoveryManager <TArachnodeDAO> discoveryManager, EncodingManager <TArachnodeDAO> encodingManager, HtmlManager <TArachnodeDAO> htmlManager, PolitenessManager <TArachnodeDAO> politenessManager, ProxyManager <TArachnodeDAO> proxyManager, RuleManager <TArachnodeDAO> ruleManager, bool processData)
        {
            _applicationSettings = applicationSettings;
            _webSettings         = webSettings;

            UncrawledCrawlRequests = new PriorityQueue <CrawlRequest <TArachnodeDAO> >();
            UnassignedDiscoveries  = new HashSet <string>();

            _crawler = crawler;

            _crawlInfo.MaximumCrawlDepth = 1;

            _actionManager       = actionManager;
            _consoleManager      = consoleManager;
            _cookieManager       = cookieManager;
            _crawlRequestManager = crawlRequestManager;
            _dataTypeManager     = dataTypeManager;
            _discoveryManager    = discoveryManager;
            _encodingManager     = encodingManager;
            _htmlManager         = htmlManager;
            _politenessManager   = politenessManager;
            _proxyManager        = proxyManager;
            _ruleManager         = ruleManager;

            _processData = processData;

            _arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, false, false);
            _arachnodeDAO.ApplicationSettings = applicationSettings;

            //_arachnodeDAO.OpenCommandConnections();

            _dataManager    = new DataManager <TArachnodeDAO>(_applicationSettings, _webSettings, _actionManager, _dataTypeManager, _discoveryManager, _ruleManager, _arachnodeDAO);
            _fileManager    = new FileManager <TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager, _arachnodeDAO);
            _imageManager   = new ImageManager <TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager, _arachnodeDAO);
            _webClient      = new WebClient <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager, _cookieManager, _proxyManager);
            _webPageManager = new WebPageManager <TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager, _htmlManager, _arachnodeDAO);
        }
コード例 #4
0
        /// <summary>
        ///     Initializes a new instance of the <see cref = "Crawler" /> class.
        /// </summary>
        public Crawler(ApplicationSettings applicationSettings, WebSettings webSettings, CrawlMode crawlMode, List <CrawlerPeer> crawlerPeers, List <DatabasePeer> databasePeers, bool enableRenderers)
        {
            Guid = Guid.NewGuid();

            try
            {
                _applicationSettings = applicationSettings;
                _webSettings         = webSettings;

                _arachnodeDAO        = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true);
                _applicationSettings = _arachnodeDAO.ApplicationSettings;

                _consoleManager = new ConsoleManager <TArachnodeDAO>(_applicationSettings, _webSettings);

                _consoleManager.OutputString("arachnode.net " + Assembly.GetExecutingAssembly().GetName().Version, ConsoleColor.Green, ConsoleColor.Gray);

                _actionManager = new ActionManager <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
                _ruleManager   = new RuleManager <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);

                _memoryManager = new MemoryManager <TArachnodeDAO>(_applicationSettings, _webSettings);
                _cacheManager  = new CacheManager <TArachnodeDAO>(_applicationSettings, _webSettings);

                _cookieManager = new CookieManager();
                _cacheManager  = new CacheManager <TArachnodeDAO>(_applicationSettings, _webSettings);

                CrawlerPeers  = crawlerPeers;
                DatabasePeers = databasePeers;

                _crawlerPeerManager  = new CrawlerPeerManager <TArachnodeDAO>(_applicationSettings, _webSettings, CrawlerPeers, (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true));
                _databasePeerManager = new DatabasePeerManager <TArachnodeDAO>(_applicationSettings, _webSettings, DatabasePeers);

                _cache = new Cache <TArachnodeDAO>(_applicationSettings, _webSettings, this, _actionManager, _cacheManager, _crawlerPeerManager, _memoryManager, _ruleManager);

                _dataTypeManager     = new DataTypeManager <TArachnodeDAO>(_applicationSettings, _webSettings);
                _discoveryManager    = new DiscoveryManager <TArachnodeDAO>(_applicationSettings, _webSettings, _cache, _actionManager, _cacheManager, _memoryManager, _ruleManager);
                _crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(_applicationSettings, _webSettings, _cache, _consoleManager, _discoveryManager);
                _encodingManager     = new EncodingManager <TArachnodeDAO>(_applicationSettings, _webSettings);
                _htmlManager         = new HtmlManager <TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager);
                _politenessManager   = new PolitenessManager <TArachnodeDAO>(_applicationSettings, _webSettings, _cache);
                _proxyManager        = new ProxyManager <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
                _reportingManager    = new ReportingManager <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);

                //create required directories...
                if (!Directory.Exists(_applicationSettings.ConsoleOutputLogsDirectory))
                {
                    Directory.CreateDirectory(_applicationSettings.ConsoleOutputLogsDirectory);
                }

                if (!Directory.Exists(_applicationSettings.DownloadedFilesDirectory))
                {
                    Directory.CreateDirectory(_applicationSettings.DownloadedFilesDirectory);
                }

                if (!Directory.Exists(_applicationSettings.DownloadedImagesDirectory))
                {
                    Directory.CreateDirectory(_applicationSettings.DownloadedImagesDirectory);
                }

                if (!Directory.Exists(_applicationSettings.DownloadedWebPagesDirectory))
                {
                    Directory.CreateDirectory(_applicationSettings.DownloadedWebPagesDirectory);
                }

                QueryProcessor = new QueryProcessor <TArachnodeDAO>();

                _consoleManager.OutputString("Crawler: Initializing Configuration/Database Connection.", ConsoleColor.White, ConsoleColor.Gray);

                LoadCrawlActions(_arachnodeDAO);
                LoadCrawlRules(_arachnodeDAO);

                AreRenderersEnabled = enableRenderers;

                Engine = new Engine <TArachnodeDAO>(_applicationSettings, _webSettings, this, _cache, _actionManager, _cacheManager, _consoleManager, _cookieManager, _crawlRequestManager, _dataTypeManager, _discoveryManager, _encodingManager, _htmlManager, _memoryManager, _politenessManager, _proxyManager, _reportingManager, _ruleManager, enableRenderers, (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true));

                CrawlMode = crawlMode;

                /**/

                if (CrawlerPeerManager != null && CrawlerPeerManager.CrawlerPeers != null && CrawlerPeerManager.CrawlerPeers.Count != 0)
                {
                    ConsoleManager.OutputString("Crawler: Starting CrawlerPeerManager Server", ConsoleColor.White, ConsoleColor.Gray);

                    CrawlerPeerManager.StartServer(this, _arachnodeDAO);

                    _crawlerPeerManager.SendStatusMessageToCrawlerPeers(_arachnodeDAO);
                }

                /**/

                if (Debugger.IsAttached)
                {
                    _consoleManager.OutputString("Debugger: Attached - Expect Performance Degradation.", ConsoleColor.Yellow, ConsoleColor.Gray);
                }

                //update all core/components/managers with the updated ApplicationSettings...
#if DEMO
                Engine.CrawlRequestCompleted += Engine_CrawlRequestCompleted;

                _stopwatch.Start();
#endif
            }
            catch (InvalidConfigurationException invalidConfigurationException)
            {
                ProcessException(invalidConfigurationException);

                throw new InvalidConfigurationException(invalidConfigurationException.ApplicationSettings, invalidConfigurationException.WebSettings, invalidConfigurationException.Message, InvalidConfigurationExceptionSeverity.Error);
            }
            catch (Exception exception)
            {
                ProcessException(exception);

                throw new Exception(exception.Message, exception);
            }
        }
コード例 #5
0
        /// <summary>
        ///     Processes a WebPagesRow after crawling.
        /// </summary>
        /// <param name = "webPagesRow">The web pages row.</param>
        /// <param name="webClient"></param>
        /// <param name="actionManager"></param>
        /// <param name="consoleManager"></param>
        /// <param name="discoveryManager"></param>
        /// <param name="memoryManager"></param>
        /// <param name="ruleManager"></param>
        /// <param name = "webPageManager">The web page manager.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        /// <param name = "fileManager">The file manager.</param>
        /// <param name = "imageManager">The image manager.</param>
        public static void ProcessWebPage(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ArachnodeDataSet.WebPagesRow webPagesRow, WebClient <TArachnodeDAO> webClient, Cache <TArachnodeDAO> cache, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager, DiscoveryManager <TArachnodeDAO> discoveryManager, MemoryManager <TArachnodeDAO> memoryManager, RuleManager <TArachnodeDAO> ruleManager, WebPageManager <TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
        {
            CacheManager <TArachnodeDAO> cacheManager = new CacheManager <TArachnodeDAO>(applicationSettings, webSettings);
            CookieManager cookieManager = new CookieManager();
            CrawlRequestManager <TArachnodeDAO> crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
            DataTypeManager <TArachnodeDAO>     dataTypeManager     = new DataTypeManager <TArachnodeDAO>(applicationSettings, webSettings);
            EncodingManager <TArachnodeDAO>     encodingManager     = new EncodingManager <TArachnodeDAO>(applicationSettings, webSettings);
            PolitenessManager <TArachnodeDAO>   politenessManager   = new PolitenessManager <TArachnodeDAO>(applicationSettings, webSettings, cache);
            ProxyManager <TArachnodeDAO>        proxyManager        = new ProxyManager <TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
            HtmlManager <TArachnodeDAO>         htmlManager         = new HtmlManager <TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
            Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

            //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
            CrawlRequest <TArachnodeDAO> crawlRequest = new CrawlRequest <TArachnodeDAO>(new Discovery <TArachnodeDAO>(webPagesRow.AbsoluteUri), webPagesRow.CrawlDepth, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

            crawlRequest.Crawl = crawl;
            crawlRequest.Discovery.DiscoveryType = DiscoveryType.WebPage;
            crawlRequest.Discovery.ID            = webPagesRow.ID;
            crawlRequest.Data         = webPagesRow.Source;
            crawlRequest.CurrentDepth = webPagesRow.CrawlDepth;
            crawlRequest.Encoding     = Encoding.GetEncoding(webPagesRow.CodePage);
            crawlRequest.ProcessData  = true;
            crawlRequest.WebClient    = webClient;

            crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

            //parse the ResponseHeaders from the WebPagesRow.ResponseHeaders string...
            foreach (string responseHeader in webPagesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
            {
                string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

                string name  = responseHeaderSplit[0];
                string value = UserDefinedFunctions.ExtractResponseHeader(webPagesRow.ResponseHeaders, name, true).Value;

                crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
            }

            //refresh the DataTypes in the DataTypeManager... (if necessary)...
            if (dataTypeManager.AllowedDataTypes.Count == 0)
            {
                dataTypeManager.RefreshDataTypes();
            }

            crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

            //now, process the bytes...
            encodingManager.ProcessCrawlRequest(crawlRequest, arachnodeDAO);

            if (applicationSettings.InsertWebPages)
            {
                crawlRequest.Discovery.ID = arachnodeDAO.InsertWebPage(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertWebPageSource ? crawlRequest.Data : new byte[] { }, crawlRequest.Encoding.CodePage, crawlRequest.DataType.FullTextIndexType, crawlRequest.CurrentDepth, applicationSettings.ClassifyAbsoluteUris);
            }

            crawlRequest.ManagedDiscovery = webPageManager.ManageWebPage(crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.Encoding, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractWebPageMetaData, applicationSettings.InsertWebPageMetaData, applicationSettings.SaveDiscoveredWebPagesToDisk);

            //assigning FileAndImageDiscoveries isn't applicable because Files and Images need to be crawled to be properly classified... without classification we don't know whether they belong in dbo.Files or dbo.Images...
            crawlRequestManager.ProcessEmailAddresses(crawlRequest, arachnodeDAO);
            crawlRequestManager.ProcessHyperLinks(crawlRequest, arachnodeDAO);

            actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

            discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
        }