Ejemplo n.º 1
0
        /// <summary>
        ///     Processes a FilesRow after crawling.
        /// </summary>
        /// <remarks>
        ///     Wraps the stored row (source bytes and response headers) in a CrawlRequest, determines its DataType,
        ///     optionally inserts the file through <paramref name="arachnodeDAO"/>, hands it to
        ///     <paramref name="fileManager"/>, performs the PostRequest crawl actions and finally closes and disposes
        ///     the managed discovery. The response headers held by <paramref name="webClient"/> are cleared and
        ///     replaced with the headers stored on the row.
        /// </remarks>
        /// <param name="applicationSettings">The application settings; passed to the managers created here and read for the InsertFiles, InsertFileSource, ClassifyAbsoluteUris, ExtractFileMetaData, InsertFileMetaData and SaveDiscoveredFilesToDisk switches.</param>
        /// <param name="webSettings">The web settings; passed to the managers created here.</param>
        /// <param name="crawler">The crawler; passed to the Crawl created here.</param>
        /// <param name="filesRow">The files row; its AbsoluteUri, ID, Source and ResponseHeaders are read.</param>
        /// <param name="webClient">The web client assigned to the CrawlRequest; its response headers are overwritten.</param>
        /// <param name="cache">The cache; passed to the CrawlRequestManager and PolitenessManager created here.</param>
        /// <param name="actionManager">The action manager; used for the Crawl and to perform the PostRequest crawl actions.</param>
        /// <param name="consoleManager">The console manager; passed to the managers created here.</param>
        /// <param name="crawlerPeerManager">The crawler peer manager; not referenced by this method.</param>
        /// <param name="discoveryManager">The discovery manager; used for the Crawl and to close and dispose the managed discovery.</param>
        /// <param name="fileManager">The file manager.</param>
        /// <param name="memoryManager">The memory manager; not referenced by this method.</param>
        /// <param name="ruleManager">The rule manager; passed to the Crawl created here.</param>
        /// <param name="arachnodeDAO">The arachnode DAO.</param>
        public static void ProcessFile(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ArachnodeDataSet.FilesRow filesRow, WebClient <TArachnodeDAO> webClient, Cache <TArachnodeDAO> cache, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager, DiscoveryManager <TArachnodeDAO> discoveryManager, FileManager <TArachnodeDAO> fileManager, MemoryManager <TArachnodeDAO> memoryManager, RuleManager <TArachnodeDAO> ruleManager, IArachnodeDAO arachnodeDAO)
        {
            //NOTE(review): cacheManager is not referenced again in this method; it is still constructed because any side effects of its constructor are not visible from here.
            CacheManager <TArachnodeDAO> cacheManager = new CacheManager <TArachnodeDAO>(applicationSettings, webSettings);
            CookieManager cookieManager = new CookieManager();
            CrawlRequestManager <TArachnodeDAO> crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
            DataTypeManager <TArachnodeDAO>     dataTypeManager     = new DataTypeManager <TArachnodeDAO>(applicationSettings, webSettings);
            EncodingManager <TArachnodeDAO>     encodingManager     = new EncodingManager <TArachnodeDAO>(applicationSettings, webSettings);
            PolitenessManager <TArachnodeDAO>   politenessManager   = new PolitenessManager <TArachnodeDAO>(applicationSettings, webSettings, cache);
            ProxyManager <TArachnodeDAO>        proxyManager        = new ProxyManager <TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
            HtmlManager <TArachnodeDAO>         htmlManager         = new HtmlManager <TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
            Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

            //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
            CrawlRequest <TArachnodeDAO> crawlRequest = new CrawlRequest <TArachnodeDAO>(new Discovery <TArachnodeDAO>(filesRow.AbsoluteUri), 1, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

            crawlRequest.Crawl = crawl;
            crawlRequest.Discovery.DiscoveryType = DiscoveryType.File;
            crawlRequest.Discovery.ID            = filesRow.ID;
            crawlRequest.Data        = filesRow.Source;
            crawlRequest.ProcessData = true;
            crawlRequest.WebClient   = webClient;

            //the headers currently on the (caller supplied) WebClient are discarded and replaced by the stored ones...
            crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

            //parse the ResponseHeaders from the FilesRow.ResponseHeaders string...
            foreach (string responseHeader in filesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
            {
                //only the text before the first ':' is taken from the line; the value is looked up by name in the complete header string...
                string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

                string name  = responseHeaderSplit[0];
                string value = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, name, true).Value;

                crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
            }

            //refresh the DataTypes in the DataTypeManager... (if necessary)...
            if (dataTypeManager.AllowedDataTypes.Count == 0)
            {
                dataTypeManager.RefreshDataTypes();
            }

            crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

            if (applicationSettings.InsertFiles)
            {
                //the ID returned by the insert replaces the ID taken from the row; an empty array is stored when InsertFileSource is off...
                crawlRequest.Discovery.ID = arachnodeDAO.InsertFile(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertFileSource ? crawlRequest.Data : new byte[] { }, crawlRequest.DataType.FullTextIndexType, applicationSettings.ClassifyAbsoluteUris);
            }

            crawlRequest.ManagedDiscovery = fileManager.ManageFile(crawlRequest, crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractFileMetaData, applicationSettings.InsertFileMetaData, applicationSettings.SaveDiscoveredFilesToDisk);

            actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

            discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
        }
Ejemplo n.º 2
0
        /// <summary>
        ///     Processes the crawl request.
        /// </summary>
        /// <remarks>
        ///     For a Discovery that is still Undiscovered: checks politeness, runs the PreRequest discovery management,
        ///     rules and actions, delegates the request itself to the data manager, then runs the PostRequest discovery
        ///     management, encoding processing and rules before handing the request to the crawl request manager.
        ///     Any other DiscoveryState is reported as a cache hit.
        ///     Exceptions thrown inside the try block are caught and recorded through the DAO instead of being propagated.
        ///     A throttled request, a request canceled while the Engine is not in the Start state, and a request that is
        ///     resubmitted after a connection failure all return early and skip the bookkeeping at the end of the method.
        /// </remarks>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "obeyCrawlRules">if set to <c>true</c> [obey crawl rules].</param>
        /// <param name = "executeCrawlActions">if set to <c>true</c> [execute crawl actions].</param>
        public void ProcessCrawlRequest(CrawlRequest <TArachnodeDAO> crawlRequest, bool obeyCrawlRules, bool executeCrawlActions)
        {
            //HACK:!!!  Solve this!!!
//#if DEMO
//            return;
//#endif

            bool wasACacheHit = false;

            try
            {
                crawlRequest.WebClient = WebClient;

                if (crawlRequest.Discovery.DiscoveryState == DiscoveryState.Undiscovered)
                {
                    if (!_politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestRequested, _arachnodeDAO))
                    {
                        //throttled: notify the Engine and leave; nothing below runs for this CrawlRequest.
                        Crawler.Engine.OnCrawlRequestThrottled(crawlRequest);

                        return;
                    }

                    _consoleManager.OutputProcessCrawlRequest(_crawlInfo.ThreadNumber, crawlRequest);

                    _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.PreRequest, _arachnodeDAO);

                    if (obeyCrawlRules)
                    {
                        //the return value is ignored; the outcome is read back from crawlRequest.IsDisallowed below.
                        _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PreRequest, _arachnodeDAO);
                    }

                    if (executeCrawlActions)
                    {
                        _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PreRequest, _arachnodeDAO);
                    }

                    if (!crawlRequest.IsDisallowed)
                    {
                        _stopwatch.Reset();
                        _stopwatch.Start();

                        try
                        {
                            _dataManager.ProcessCrawlRequest(crawlRequest, obeyCrawlRules, executeCrawlActions);
                        }
                        catch (Exception exception2)
                        {
                            //deliberately wrapped: the outer catch inspects and records exception.InnerException.
                            throw new Exception(exception2.Message, exception2);
                        }
                        finally
                        {
                            _stopwatch.Stop();

                            _crawlInfo.TotalHttpWebResponseTime += _stopwatch.Elapsed;
                            crawlRequest.HttpWebResponseTime     = _stopwatch.Elapsed;

                            _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);
                        }

                        Counters.GetInstance().TotalBytesDiscovered(crawlRequest.Data.LongLength);

                        _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.PostRequest, _arachnodeDAO);

                        _encodingManager.ProcessCrawlRequest(crawlRequest, _arachnodeDAO);

                        if (obeyCrawlRules)
                        {
                            _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PostRequest, _arachnodeDAO);
                        }

                        //the CrawlRequest could be Disallowed by a PreGet CrawlRule - specifically DataType.cs.
                        if (!crawlRequest.IsDisallowed)
                        {
                            if (_processData)
                            {
                                _crawlRequestManager.ProcessCrawlRequest(crawlRequest, _fileManager, _imageManager, _webPageManager, _arachnodeDAO);
                            }
                        }
                        else
                        {
                            HandleDisallowedCrawlRequest(crawlRequest);
                        }
                    }
                    else
                    {
                        //disallowed before any request was made: politeness is still told that the request completed.
                        _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);

                        HandleDisallowedCrawlRequest(crawlRequest);
                    }
                }
                else
                {
                    wasACacheHit = true;

                    //this should only occur when you submit a CR from a rule, or action...
                    _consoleManager.OutputCacheHit(_crawlInfo, crawlRequest, crawlRequest.Discovery);
                }
            }
            catch (Exception exception)
            {
                _stopwatch.Stop();

                if (Crawler.Engine.State != EngineState.Start)
                {
                    //the request was aborted as it was long running and Engine was requested to Stop.
                    if ((crawlRequest.WebClient.WebException != null && crawlRequest.WebClient.WebException.Status == WebExceptionStatus.RequestCanceled) || (exception.InnerException != null && exception.InnerException.Message == "The request was aborted: The request was canceled."))
                    {
                        return;
                    }
                }

                //the message prefix is a fixed string, so it is compared ordinally rather than with the current culture.
                if (crawlRequest.WebClient.WebException != null && crawlRequest.Discovery.HttpWebRequestRetriesRemaining != 0 && crawlRequest.WebClient.WebException.Message.StartsWith("Unable to connect to the remote server", StringComparison.Ordinal))
                {
                    _politenessManager.ResubmitCrawlRequest(crawlRequest, false, _arachnodeDAO);

                    _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCanceled, _arachnodeDAO);

                    return;
                }

                try
                {
                    _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);
                }
                catch (Exception exception2)
                {
                    //a failure while updating politeness replaces the exception that is recorded below.
                    exception = exception2;
                }

                //record the innermost available exception (see the wrapping around _dataManager above).
                if (exception.InnerException == null)
                {
                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                }
                else
                {
                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception.InnerException, false);
                }

                crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);

                if (_applicationSettings.InsertDisallowedAbsoluteUris)
                {
                    if (crawlRequest.Discovery.DiscoveryState == DiscoveryState.Undiscovered)
                    {
                        _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception.Message, _applicationSettings.ClassifyAbsoluteUris);
                    }
                    else
                    {
                        if (_applicationSettings.InsertDisallowedAbsoluteUriDiscoveries)
                        {
                            _arachnodeDAO.InsertDisallowedAbsoluteUriDiscovery(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri);
                        }
                    }
                }

                _consoleManager.OutputException(_crawlInfo.ThreadNumber, crawlRequest, _arachnodeDAO.LastExceptionID, _arachnodeDAO.LastExceptionMessage);
            }

            if (crawlRequest.IsFromDatabase)
            {
                _arachnodeDAO.DeleteCrawlRequest(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri);
            }

            _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.Discovered, _arachnodeDAO);

            if (!wasACacheHit)
            {
                if (executeCrawlActions)
                {
                    _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, _arachnodeDAO);
                }

                Crawler.Engine.OnCrawlRequestCompleted(crawlRequest);
            }

            _consoleManager.OutputProcessCrawlRequest(_crawlInfo.ThreadNumber, crawlRequest);

            Counters.GetInstance().ReportCurrentDepth(crawlRequest.CurrentDepth);

            Counters.GetInstance().CrawlRequestRemoved();

            Counters.GetInstance().CrawlRequestProcessed();

            _crawlInfo.TotalCrawlRequestsProcessed++;
        }

        /// <summary>
        ///     Handles a CrawlRequest that was disallowed: determines its DataType when no ContentType is set,
        ///     inserts the disallowed AbsoluteUri when the application settings ask for it, and outputs the reason.
        /// </summary>
        /// <param name = "crawlRequest">The disallowed crawl request.</param>
        private void HandleDisallowedCrawlRequest(CrawlRequest <TArachnodeDAO> crawlRequest)
        {
            if (crawlRequest.DataType.ContentType == null)
            {
                crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);
            }

            if (_applicationSettings.InsertDisallowedAbsoluteUris)
            {
                _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.IsDisallowedReason, _applicationSettings.ClassifyAbsoluteUris);
            }

            _consoleManager.OutputIsDisallowedReason(_crawlInfo, crawlRequest);
        }
Ejemplo n.º 3
0
        /// <summary>
        ///     Processes a WebPagesRow after crawling.
        /// </summary>
        /// <remarks>
        ///     The stored row (source bytes, code page, crawl depth and response headers) is wrapped in a CrawlRequest,
        ///     its DataType is determined and its bytes are run through the encoding manager. The page is then optionally
        ///     inserted through <paramref name="arachnodeDAO"/>, managed by <paramref name="webPageManager"/>, scanned for
        ///     email addresses and hyperlinks, passed to the PostRequest crawl actions, and its managed discovery is
        ///     closed and disposed. The response headers held by <paramref name="webClient"/> are cleared and replaced
        ///     with the headers stored on the row.
        /// </remarks>
        /// <param name="applicationSettings">The application settings; passed to the managers created here and read for the InsertWebPages, InsertWebPageSource, ClassifyAbsoluteUris, ExtractWebPageMetaData, InsertWebPageMetaData and SaveDiscoveredWebPagesToDisk switches.</param>
        /// <param name="webSettings">The web settings; passed to the managers created here.</param>
        /// <param name="crawler">The crawler; passed to the Crawl created here.</param>
        /// <param name="webPagesRow">The web pages row; its AbsoluteUri, CrawlDepth, ID, Source, CodePage and ResponseHeaders are read.</param>
        /// <param name="webClient">The web client assigned to the CrawlRequest; its response headers are overwritten.</param>
        /// <param name="cache">The cache; passed to the CrawlRequestManager and PolitenessManager created here.</param>
        /// <param name="actionManager">The action manager; used for the Crawl and to perform the PostRequest crawl actions.</param>
        /// <param name="consoleManager">The console manager; passed to the managers created here.</param>
        /// <param name="crawlerPeerManager">The crawler peer manager; not referenced by this method.</param>
        /// <param name="discoveryManager">The discovery manager; used for the Crawl and to close and dispose the managed discovery.</param>
        /// <param name="memoryManager">The memory manager; not referenced by this method.</param>
        /// <param name="ruleManager">The rule manager; passed to the Crawl created here.</param>
        /// <param name="webPageManager">The web page manager.</param>
        /// <param name="arachnodeDAO">The arachnode DAO.</param>
        public static void ProcessWebPage(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ArachnodeDataSet.WebPagesRow webPagesRow, WebClient <TArachnodeDAO> webClient, Cache <TArachnodeDAO> cache, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager, DiscoveryManager <TArachnodeDAO> discoveryManager, MemoryManager <TArachnodeDAO> memoryManager, RuleManager <TArachnodeDAO> ruleManager, WebPageManager <TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
        {
            //collaborators required by the Crawl constructor; created in the same order as before...
            //NOTE(review): unusedCacheManager is never read in this method - confirm whether its construction is still needed.
            CacheManager <TArachnodeDAO> unusedCacheManager = new CacheManager <TArachnodeDAO>(applicationSettings, webSettings);
            CookieManager cookies = new CookieManager();
            CrawlRequestManager <TArachnodeDAO> requestManager = new CrawlRequestManager <TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
            DataTypeManager <TArachnodeDAO> dataTypes = new DataTypeManager <TArachnodeDAO>(applicationSettings, webSettings);
            EncodingManager <TArachnodeDAO> encodings = new EncodingManager <TArachnodeDAO>(applicationSettings, webSettings);
            PolitenessManager <TArachnodeDAO> politeness = new PolitenessManager <TArachnodeDAO>(applicationSettings, webSettings, cache);
            ProxyManager <TArachnodeDAO> proxies = new ProxyManager <TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
            HtmlManager <TArachnodeDAO> html = new HtmlManager <TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
            Crawl <TArachnodeDAO> owningCrawl = new Crawl <TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookies, requestManager, dataTypes, discoveryManager, encodings, html, politeness, proxies, ruleManager, true);

            //the internals of SiteCrawler.dll operate on CrawlRequests, so wrap the row in one...
            Discovery <TArachnodeDAO> rowDiscovery = new Discovery <TArachnodeDAO>(webPagesRow.AbsoluteUri);
            CrawlRequest <TArachnodeDAO> request = new CrawlRequest <TArachnodeDAO>(rowDiscovery, webPagesRow.CrawlDepth, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

            request.Crawl = owningCrawl;
            request.Discovery.DiscoveryType = DiscoveryType.WebPage;
            request.Discovery.ID = webPagesRow.ID;
            request.Data = webPagesRow.Source;
            request.CurrentDepth = webPagesRow.CrawlDepth;
            request.Encoding = Encoding.GetEncoding(webPagesRow.CodePage);
            request.ProcessData = true;
            request.WebClient = webClient;

            //replace whatever headers the WebClient currently carries with the ones stored on the row...
            request.WebClient.HttpWebResponse.Headers.Clear();

            char[] lineSeparators = new char[] { '\r', '\n' };
            char[] nameSeparator = new char[] { ':' };

            string storedHeaders = webPagesRow.ResponseHeaders;
            string[] headerLines = storedHeaders.Split(lineSeparators, StringSplitOptions.RemoveEmptyEntries);

            for (int lineIndex = 0; lineIndex < headerLines.Length; lineIndex++)
            {
                //the name is the text before the first ':'; the value is looked up by name in the complete header string...
                string headerName = headerLines[lineIndex].Split(nameSeparator)[0];
                string headerValue = UserDefinedFunctions.ExtractResponseHeader(storedHeaders, headerName, true).Value;

                request.WebClient.HttpWebResponse.Headers.Add(headerName, headerValue);
            }

            //load the DataTypes only when none are available yet...
            if (dataTypes.AllowedDataTypes.Count == 0)
            {
                dataTypes.RefreshDataTypes();
            }

            request.DataType = dataTypes.DetermineDataType(request);

            //decode the bytes...
            encodings.ProcessCrawlRequest(request, arachnodeDAO);

            if (applicationSettings.InsertWebPages)
            {
                //an empty array is stored in place of the source when InsertWebPageSource is off...
                byte[] sourceToInsert;

                if (applicationSettings.InsertWebPageSource)
                {
                    sourceToInsert = request.Data;
                }
                else
                {
                    sourceToInsert = new byte[] { };
                }

                //the ID returned by the insert replaces the ID taken from the row...
                request.Discovery.ID = arachnodeDAO.InsertWebPage(request.Discovery.Uri.AbsoluteUri, request.WebClient.HttpWebResponse.Headers.ToString(), sourceToInsert, request.Encoding.CodePage, request.DataType.FullTextIndexType, request.CurrentDepth, applicationSettings.ClassifyAbsoluteUris);
            }

            request.ManagedDiscovery = webPageManager.ManageWebPage(request.Discovery.ID.Value, request.Discovery.Uri.AbsoluteUri, request.Data, request.Encoding, request.DataType.FullTextIndexType, applicationSettings.ExtractWebPageMetaData, applicationSettings.InsertWebPageMetaData, applicationSettings.SaveDiscoveredWebPagesToDisk);

            //assigning FileAndImageDiscoveries isn't applicable because Files and Images need to be crawled to be properly classified... without classification we don't know whether they belong in dbo.Files or dbo.Images...
            requestManager.ProcessEmailAddresses(request, arachnodeDAO);
            requestManager.ProcessHyperLinks(request, arachnodeDAO);

            actionManager.PerformCrawlActions(request, CrawlActionType.PostRequest, arachnodeDAO);

            discoveryManager.CloseAndDisposeManagedDiscovery(request, arachnodeDAO);
        }