Exemplo n.º 1
0
        /// <summary>
        ///     Processes a FilesRow after crawling.
        /// </summary>
        /// <param name = "filesRow">The files row.</param>
        /// <param name="webClient"></param>
        /// <param name="actionManager"></param>
        /// <param name="consoleManager"></param>
        /// <param name="discoveryManager"></param>
        /// <param name = "fileManager">The file manager.</param>
        /// <param name = "fileManager">The file manager.</param>
        /// <param name="memoryManager"></param>
        /// <param name="ruleManager"></param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        /// <param name = "imageManager">The image manager.</param>
        public static void ProcessFile(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ArachnodeDataSet.FilesRow filesRow, WebClient <TArachnodeDAO> webClient, Cache <TArachnodeDAO> cache, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager, DiscoveryManager <TArachnodeDAO> discoveryManager, FileManager <TArachnodeDAO> fileManager, MemoryManager <TArachnodeDAO> memoryManager, RuleManager <TArachnodeDAO> ruleManager, IArachnodeDAO arachnodeDAO)
        {
            CacheManager <TArachnodeDAO> cacheManager = new CacheManager <TArachnodeDAO>(applicationSettings, webSettings);
            CookieManager cookieManager = new CookieManager();;
            CrawlRequestManager <TArachnodeDAO> crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
            DataTypeManager <TArachnodeDAO>     dataTypeManager     = new DataTypeManager <TArachnodeDAO>(applicationSettings, webSettings);
            EncodingManager <TArachnodeDAO>     encodingManager     = new EncodingManager <TArachnodeDAO>(applicationSettings, webSettings);
            PolitenessManager <TArachnodeDAO>   politenessManager   = new PolitenessManager <TArachnodeDAO>(applicationSettings, webSettings, cache);
            ProxyManager <TArachnodeDAO>        proxyManager        = new ProxyManager <TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
            HtmlManager <TArachnodeDAO>         htmlManager         = new HtmlManager <TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
            Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

            //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
            CrawlRequest <TArachnodeDAO> crawlRequest = new CrawlRequest <TArachnodeDAO>(new Discovery <TArachnodeDAO>(filesRow.AbsoluteUri), 1, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

            crawlRequest.Crawl = crawl;
            crawlRequest.Discovery.DiscoveryType = DiscoveryType.File;
            crawlRequest.Discovery.ID            = filesRow.ID;
            crawlRequest.Data        = filesRow.Source;
            crawlRequest.ProcessData = true;
            crawlRequest.WebClient   = webClient;

            crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

            //parse the ResponseHeaders from the FilesRow.ResponseHeaders string...
            foreach (string responseHeader in filesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
            {
                string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

                string name  = responseHeaderSplit[0];
                string value = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, name, true).Value;

                crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
            }

            //refresh the DataTypes in the DataTypeManager... (if necessary)...
            if (dataTypeManager.AllowedDataTypes.Count == 0)
            {
                dataTypeManager.RefreshDataTypes();
            }

            crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

            if (applicationSettings.InsertFiles)
            {
                crawlRequest.Discovery.ID = arachnodeDAO.InsertFile(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertFileSource ? crawlRequest.Data : new byte[] { }, crawlRequest.DataType.FullTextIndexType, applicationSettings.ClassifyAbsoluteUris);
            }

            crawlRequest.ManagedDiscovery = fileManager.ManageFile(crawlRequest, crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractFileMetaData, applicationSettings.InsertFileMetaData, applicationSettings.SaveDiscoveredFilesToDisk);

            actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

            discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
        }
Exemplo n.º 2
0
        /// <summary>
        ///     Processes a WebPagesRow after crawling.
        /// </summary>
        /// <param name = "webPagesRow">The web pages row.</param>
        /// <param name="webClient"></param>
        /// <param name="actionManager"></param>
        /// <param name="consoleManager"></param>
        /// <param name="discoveryManager"></param>
        /// <param name="memoryManager"></param>
        /// <param name="ruleManager"></param>
        /// <param name = "webPageManager">The web page manager.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        /// <param name = "fileManager">The file manager.</param>
        /// <param name = "imageManager">The image manager.</param>
        public static void ProcessWebPage(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ArachnodeDataSet.WebPagesRow webPagesRow, WebClient <TArachnodeDAO> webClient, Cache <TArachnodeDAO> cache, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager, DiscoveryManager <TArachnodeDAO> discoveryManager, MemoryManager <TArachnodeDAO> memoryManager, RuleManager <TArachnodeDAO> ruleManager, WebPageManager <TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
        {
            CacheManager <TArachnodeDAO> cacheManager = new CacheManager <TArachnodeDAO>(applicationSettings, webSettings);
            CookieManager cookieManager = new CookieManager();
            CrawlRequestManager <TArachnodeDAO> crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
            DataTypeManager <TArachnodeDAO>     dataTypeManager     = new DataTypeManager <TArachnodeDAO>(applicationSettings, webSettings);
            EncodingManager <TArachnodeDAO>     encodingManager     = new EncodingManager <TArachnodeDAO>(applicationSettings, webSettings);
            PolitenessManager <TArachnodeDAO>   politenessManager   = new PolitenessManager <TArachnodeDAO>(applicationSettings, webSettings, cache);
            ProxyManager <TArachnodeDAO>        proxyManager        = new ProxyManager <TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
            HtmlManager <TArachnodeDAO>         htmlManager         = new HtmlManager <TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
            Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

            //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
            CrawlRequest <TArachnodeDAO> crawlRequest = new CrawlRequest <TArachnodeDAO>(new Discovery <TArachnodeDAO>(webPagesRow.AbsoluteUri), webPagesRow.CrawlDepth, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

            crawlRequest.Crawl = crawl;
            crawlRequest.Discovery.DiscoveryType = DiscoveryType.WebPage;
            crawlRequest.Discovery.ID            = webPagesRow.ID;
            crawlRequest.Data         = webPagesRow.Source;
            crawlRequest.CurrentDepth = webPagesRow.CrawlDepth;
            crawlRequest.Encoding     = Encoding.GetEncoding(webPagesRow.CodePage);
            crawlRequest.ProcessData  = true;
            crawlRequest.WebClient    = webClient;

            crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

            //parse the ResponseHeaders from the WebPagesRow.ResponseHeaders string...
            foreach (string responseHeader in webPagesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
            {
                string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

                string name  = responseHeaderSplit[0];
                string value = UserDefinedFunctions.ExtractResponseHeader(webPagesRow.ResponseHeaders, name, true).Value;

                crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
            }

            //refresh the DataTypes in the DataTypeManager... (if necessary)...
            if (dataTypeManager.AllowedDataTypes.Count == 0)
            {
                dataTypeManager.RefreshDataTypes();
            }

            crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

            //now, process the bytes...
            encodingManager.ProcessCrawlRequest(crawlRequest, arachnodeDAO);

            if (applicationSettings.InsertWebPages)
            {
                crawlRequest.Discovery.ID = arachnodeDAO.InsertWebPage(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertWebPageSource ? crawlRequest.Data : new byte[] { }, crawlRequest.Encoding.CodePage, crawlRequest.DataType.FullTextIndexType, crawlRequest.CurrentDepth, applicationSettings.ClassifyAbsoluteUris);
            }

            crawlRequest.ManagedDiscovery = webPageManager.ManageWebPage(crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.Encoding, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractWebPageMetaData, applicationSettings.InsertWebPageMetaData, applicationSettings.SaveDiscoveredWebPagesToDisk);

            //assigning FileAndImageDiscoveries isn't applicable because Files and Images need to be crawled to be properly classified... without classification we don't know whether they belong in dbo.Files or dbo.Images...
            crawlRequestManager.ProcessEmailAddresses(crawlRequest, arachnodeDAO);
            crawlRequestManager.ProcessHyperLinks(crawlRequest, arachnodeDAO);

            actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

            discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
        }
Exemplo n.º 3
0
        public override void ProcessCrawlRequest(CrawlRequest <TArachnodeDAO> crawlRequest, bool obeyCrawlRules, bool executeCrawlActions)
        {
            IssueWebRequest(crawlRequest, "GET");

            crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);

            if (obeyCrawlRules)
            {
                _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PreGet, _arachnodeDAO);
            }

            if (executeCrawlActions)
            {
                _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PreGet, _arachnodeDAO);
            }

            if (!crawlRequest.IsDisallowed)
            {
                try
                {
                    if (crawlRequest.WebClient.HttpWebResponse != null)
                    {
                        crawlRequest.ProcessData = true;

                        bool isLastModifiedOutdated = true;

                        try
                        {
                            isLastModifiedOutdated = crawlRequest.WebClient.HttpWebResponse.LastModified != DateTime.Now;
                        }
                        catch (Exception exception)
                        {
                            _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                        }

                        if (isLastModifiedOutdated)
                        {
                            switch (crawlRequest.DataType.DiscoveryType)
                            {
                            case DiscoveryType.File:
                                if (ApplicationSettings.AssignFileAndImageDiscoveries)     //ANODET: robots.txt
                                {
                                    ArachnodeDataSet.FilesRow filesRow = _arachnodeDAO.GetFile(crawlRequest.Discovery.Uri.AbsoluteUri);

                                    if (filesRow == null)
                                    {
                                        crawlRequest.ProcessData = true;
                                    }
                                    else
                                    {
                                        if (!filesRow.IsResponseHeadersNull())
                                        {
                                            DateTime lastModified;

                                            SqlString lastModifiedValue = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, "Last-Modified: ", false);

                                            if (!lastModifiedValue.IsNull && DateTime.TryParse(lastModifiedValue.Value, out lastModified))
                                            {
                                                //crawlRequest.WebClient.HttpWebResponse.LastModified will equal DateTime.Now (or close to it) if the 'Last-Modified' ResponseHeader is not present...
                                                if ((crawlRequest.WebClient.HttpWebResponse).LastModified > lastModified)
                                                {
                                                    crawlRequest.ProcessData = true;
                                                }
                                                else
                                                {
                                                    crawlRequest.ProcessData = false;
                                                }
                                            }
                                            else
                                            {
                                                crawlRequest.ProcessData = false;
                                            }
                                        }
                                        else
                                        {
                                            crawlRequest.ProcessData = true;
                                        }

                                        if (!crawlRequest.ProcessData)
                                        {
                                            if (filesRow.Source.Length != 0)
                                            {
                                                crawlRequest.Data = filesRow.Source;
                                            }
                                            else
                                            {
                                                string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedFilesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                                if (File.Exists(discoveryPath))
                                                {
                                                    crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                                }
                                                else
                                                {
                                                    try
                                                    {
                                                        throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the Files database table or at _applicationSettings.DownloadedFilesDirectory.  Therefore, the data was re-downloaded from the server.  The File file may have been deleted from disk or the 'Source' column in the 'Files' table may have been cleared or a previous crawl may have crawled with both _applicationSettings.InsertFileSource = false and _applicationSettings.SaveDiscoveredFilesToDisk = false.");
                                                    }
                                                    catch (Exception exception)
                                                    {
                                                        _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                                    }

                                                    crawlRequest.ProcessData = true;
                                                }
                                            }
                                        }
                                    }
                                }
                                else
                                {
                                    crawlRequest.ProcessData = false;
                                }
                                break;

                            case DiscoveryType.Image:
                                if (ApplicationSettings.AssignFileAndImageDiscoveries)
                                {
                                    ArachnodeDataSet.ImagesRow imagesRow = _arachnodeDAO.GetImage(crawlRequest.Discovery.Uri.AbsoluteUri);

                                    if (imagesRow == null)
                                    {
                                        crawlRequest.ProcessData = true;
                                    }
                                    else
                                    {
                                        if (!imagesRow.IsResponseHeadersNull())
                                        {
                                            DateTime lastModified;

                                            SqlString lastModifiedValue = UserDefinedFunctions.ExtractResponseHeader(imagesRow.ResponseHeaders, "Last-Modified: ", false);

                                            if (!lastModifiedValue.IsNull && DateTime.TryParse(lastModifiedValue.Value, out lastModified))
                                            {
                                                //crawlRequest.WebClient.HttpWebResponse.LastModified will equal DateTime.Now (or close to it) if the 'Last-Modified' ResponseHeader is not present...
                                                if (crawlRequest.WebClient.HttpWebResponse.LastModified > lastModified)
                                                {
                                                    crawlRequest.ProcessData = true;
                                                }
                                                else
                                                {
                                                    crawlRequest.ProcessData = false;
                                                }
                                            }
                                            else
                                            {
                                                crawlRequest.ProcessData = false;
                                            }

                                            if (!crawlRequest.ProcessData)
                                            {
                                                if (imagesRow.Source.Length != 0)
                                                {
                                                    crawlRequest.Data = imagesRow.Source;
                                                }
                                                else
                                                {
                                                    string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedImagesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                                    if (File.Exists(discoveryPath))
                                                    {
                                                        crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                                    }
                                                    else
                                                    {
                                                        try
                                                        {
                                                            throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the Images database table or at _applicationSettings.DownloadedImagesDirectory.  Therefore, the data was downloaded from the server.  The Image file may have been deleted from disk or the 'Source' column in the 'Images' table may have been cleared.  A previous crawl may have crawled with both _applicationSettings.InsertImageSource = false and _applicationSettings.SaveDiscoveredImagesToDisk = false.");
                                                        }
                                                        catch (Exception exception)
                                                        {
                                                            _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                                        }

                                                        crawlRequest.ProcessData = true;
                                                    }
                                                }
                                            }
                                        }
                                        else
                                        {
                                            crawlRequest.ProcessData = true;
                                        }
                                    }
                                }
                                else
                                {
                                    crawlRequest.ProcessData = false;
                                }
                                break;

                            case DiscoveryType.WebPage:
                                ArachnodeDataSet.WebPagesRow webPagesRow = _arachnodeDAO.GetWebPage(crawlRequest.Discovery.Uri.AbsoluteUri);

                                if (webPagesRow == null)
                                {
                                    crawlRequest.ProcessData = true;
                                }
                                else
                                {
                                    if ((crawlRequest.WebClient.HttpWebResponse).LastModified > webPagesRow.LastDiscovered)
                                    {
                                        crawlRequest.ProcessData = true;
                                    }
                                    else
                                    {
                                        crawlRequest.ProcessData = false;
                                    }

                                    if (!crawlRequest.ProcessData)
                                    {
                                        if (webPagesRow.Source.Length != 0)
                                        {
                                            crawlRequest.Data = webPagesRow.Source;
                                        }
                                        else
                                        {
                                            string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                            if (File.Exists(discoveryPath))
                                            {
                                                crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                            }
                                            else
                                            {
                                                try
                                                {
                                                    throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the WebPages database table or at _applicationSettings.DownloadedWebPagesDirectory.  Therefore, the data was re-downloaded from the server.  The WebPage file may have been deleted from disk or the 'Source' column in the 'WebPages' table may have been cleared or a previous crawl may have crawled with both _applicationSettings.InsertWebPageSource = false and _applicationSettings.SaveDiscoveredWebPagesToDisk = false.");
                                                }
                                                catch (Exception exception)
                                                {
                                                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                                }

                                                crawlRequest.ProcessData = true;
                                            }
                                        }
                                    }
                                }
                                break;

                            case DiscoveryType.None:
                                crawlRequest.ProcessData = true;
                                break;
                            }
                        }
                    }
                }
                catch (Exception exception)
                {
                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                }

                if (crawlRequest.ProcessData)
                {
                    if (crawlRequest.Data != null)
                    {
                    }

                    if (crawlRequest.RenderType == RenderType.None)
                    {
                        if (crawlRequest.Discovery.Uri.Scheme.ToLowerInvariant() != "ftp")
                        {
                            if (crawlRequest.WebClient.HttpWebResponse != null && crawlRequest.WebClient.HttpWebResponse.Method == "HEAD")
                            {
                                IssueWebRequest(crawlRequest, "GET");
                            }

                            if (crawlRequest.WebClient.HttpWebResponse != null)
                            {
                                crawlRequest.Data = crawlRequest.WebClient.DownloadHttpData(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "gzip", crawlRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "deflate", crawlRequest.Crawl.Crawler.CookieContainer);
                            }
                        }
                        else
                        {
                            crawlRequest.Data = crawlRequest.WebClient.DownloadFtpData(crawlRequest.Discovery.Uri.AbsoluteUri);
                        }
                    }
                    else
                    {
                        RendererResponse rendererResponse = crawlRequest.Crawl.Crawler.Engine.Render(crawlRequest, RenderAction.Render, crawlRequest.RenderType);

                        if (rendererResponse != null)
                        {
                            if (rendererResponse.HTMLDocumentClass != null)
                            {
                                crawlRequest.Encoding = Encoding.GetEncoding(rendererResponse.HTMLDocumentClass.charset);

                                string outerHTML = rendererResponse.HTMLDocumentClass.documentElement.outerHTML;

                                crawlRequest.Data         = crawlRequest.Encoding.GetBytes(outerHTML);
                                crawlRequest.DecodedHtml  = HttpUtility.HtmlDecode(outerHTML);
                                crawlRequest.Html         = outerHTML;
                                crawlRequest.HtmlDocument = rendererResponse.HTMLDocumentClass;
                            }

                            crawlRequest.RendererMessage = rendererResponse.RendererMessage;
                        }
                    }
                }
            }
            else
            {
                if (crawlRequest.Data == null)
                {
                }
            }

            if (crawlRequest.Data == null)
            {
                crawlRequest.Data = new byte[0];
            }
        }