/// <summary> /// Processes a FilesRow after crawling. /// </summary> /// <param name = "filesRow">The files row.</param> /// <param name="webClient"></param> /// <param name="actionManager"></param> /// <param name="consoleManager"></param> /// <param name="discoveryManager"></param> /// <param name = "fileManager">The file manager.</param> /// <param name = "fileManager">The file manager.</param> /// <param name="memoryManager"></param> /// <param name="ruleManager"></param> /// <param name = "arachnodeDAO">The arachnode DAO.</param> /// <param name = "imageManager">The image manager.</param> public static void ProcessFile(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ArachnodeDataSet.FilesRow filesRow, WebClient <TArachnodeDAO> webClient, Cache <TArachnodeDAO> cache, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager, DiscoveryManager <TArachnodeDAO> discoveryManager, FileManager <TArachnodeDAO> fileManager, MemoryManager <TArachnodeDAO> memoryManager, RuleManager <TArachnodeDAO> ruleManager, IArachnodeDAO arachnodeDAO) { CacheManager <TArachnodeDAO> cacheManager = new CacheManager <TArachnodeDAO>(applicationSettings, webSettings); CookieManager cookieManager = new CookieManager();; CrawlRequestManager <TArachnodeDAO> crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager); DataTypeManager <TArachnodeDAO> dataTypeManager = new DataTypeManager <TArachnodeDAO>(applicationSettings, webSettings); EncodingManager <TArachnodeDAO> encodingManager = new EncodingManager <TArachnodeDAO>(applicationSettings, webSettings); PolitenessManager <TArachnodeDAO> politenessManager = new PolitenessManager <TArachnodeDAO>(applicationSettings, webSettings, cache); ProxyManager <TArachnodeDAO> proxyManager = new ProxyManager <TArachnodeDAO>(applicationSettings, webSettings, consoleManager); HtmlManager <TArachnodeDAO> htmlManager = new HtmlManager <TArachnodeDAO>(applicationSettings, webSettings, discoveryManager); Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true); //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on... CrawlRequest <TArachnodeDAO> crawlRequest = new CrawlRequest <TArachnodeDAO>(new Discovery <TArachnodeDAO>(filesRow.AbsoluteUri), 1, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None); crawlRequest.Crawl = crawl; crawlRequest.Discovery.DiscoveryType = DiscoveryType.File; crawlRequest.Discovery.ID = filesRow.ID; crawlRequest.Data = filesRow.Source; crawlRequest.ProcessData = true; crawlRequest.WebClient = webClient; crawlRequest.WebClient.HttpWebResponse.Headers.Clear(); //parse the ResponseHeaders from the FilesRow.ResponseHeaders string... foreach (string responseHeader in filesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries)) { string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray()); string name = responseHeaderSplit[0]; string value = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, name, true).Value; crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value); } //refresh the DataTypes in the DataTypeManager... (if necessary)... if (dataTypeManager.AllowedDataTypes.Count == 0) { dataTypeManager.RefreshDataTypes(); } crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest); if (applicationSettings.InsertFiles) { crawlRequest.Discovery.ID = arachnodeDAO.InsertFile(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertFileSource ? crawlRequest.Data : new byte[] { }, crawlRequest.DataType.FullTextIndexType, applicationSettings.ClassifyAbsoluteUris); } crawlRequest.ManagedDiscovery = fileManager.ManageFile(crawlRequest, crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractFileMetaData, applicationSettings.InsertFileMetaData, applicationSettings.SaveDiscoveredFilesToDisk); actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO); discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO); }
/// <summary> /// Processes a WebPagesRow after crawling. /// </summary> /// <param name = "webPagesRow">The web pages row.</param> /// <param name="webClient"></param> /// <param name="actionManager"></param> /// <param name="consoleManager"></param> /// <param name="discoveryManager"></param> /// <param name="memoryManager"></param> /// <param name="ruleManager"></param> /// <param name = "webPageManager">The web page manager.</param> /// <param name = "arachnodeDAO">The arachnode DAO.</param> /// <param name = "fileManager">The file manager.</param> /// <param name = "imageManager">The image manager.</param> public static void ProcessWebPage(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ArachnodeDataSet.WebPagesRow webPagesRow, WebClient <TArachnodeDAO> webClient, Cache <TArachnodeDAO> cache, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager, DiscoveryManager <TArachnodeDAO> discoveryManager, MemoryManager <TArachnodeDAO> memoryManager, RuleManager <TArachnodeDAO> ruleManager, WebPageManager <TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO) { CacheManager <TArachnodeDAO> cacheManager = new CacheManager <TArachnodeDAO>(applicationSettings, webSettings); CookieManager cookieManager = new CookieManager(); CrawlRequestManager <TArachnodeDAO> crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager); DataTypeManager <TArachnodeDAO> dataTypeManager = new DataTypeManager <TArachnodeDAO>(applicationSettings, webSettings); EncodingManager <TArachnodeDAO> encodingManager = new EncodingManager <TArachnodeDAO>(applicationSettings, webSettings); PolitenessManager <TArachnodeDAO> politenessManager = new PolitenessManager <TArachnodeDAO>(applicationSettings, webSettings, cache); ProxyManager <TArachnodeDAO> proxyManager = new ProxyManager <TArachnodeDAO>(applicationSettings, webSettings, consoleManager); HtmlManager <TArachnodeDAO> htmlManager = new HtmlManager <TArachnodeDAO>(applicationSettings, webSettings, discoveryManager); Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true); //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on... CrawlRequest <TArachnodeDAO> crawlRequest = new CrawlRequest <TArachnodeDAO>(new Discovery <TArachnodeDAO>(webPagesRow.AbsoluteUri), webPagesRow.CrawlDepth, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None); crawlRequest.Crawl = crawl; crawlRequest.Discovery.DiscoveryType = DiscoveryType.WebPage; crawlRequest.Discovery.ID = webPagesRow.ID; crawlRequest.Data = webPagesRow.Source; crawlRequest.CurrentDepth = webPagesRow.CrawlDepth; crawlRequest.Encoding = Encoding.GetEncoding(webPagesRow.CodePage); crawlRequest.ProcessData = true; crawlRequest.WebClient = webClient; crawlRequest.WebClient.HttpWebResponse.Headers.Clear(); //parse the ResponseHeaders from the WebPagesRow.ResponseHeaders string... foreach (string responseHeader in webPagesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries)) { string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray()); string name = responseHeaderSplit[0]; string value = UserDefinedFunctions.ExtractResponseHeader(webPagesRow.ResponseHeaders, name, true).Value; crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value); } //refresh the DataTypes in the DataTypeManager... (if necessary)... if (dataTypeManager.AllowedDataTypes.Count == 0) { dataTypeManager.RefreshDataTypes(); } crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest); //now, process the bytes... encodingManager.ProcessCrawlRequest(crawlRequest, arachnodeDAO); if (applicationSettings.InsertWebPages) { crawlRequest.Discovery.ID = arachnodeDAO.InsertWebPage(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertWebPageSource ? crawlRequest.Data : new byte[] { }, crawlRequest.Encoding.CodePage, crawlRequest.DataType.FullTextIndexType, crawlRequest.CurrentDepth, applicationSettings.ClassifyAbsoluteUris); } crawlRequest.ManagedDiscovery = webPageManager.ManageWebPage(crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.Encoding, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractWebPageMetaData, applicationSettings.InsertWebPageMetaData, applicationSettings.SaveDiscoveredWebPagesToDisk); //assigning FileAndImageDiscoveries isn't applicable because Files and Images need to be crawled to be properly classified... without classification we don't know whether they belong in dbo.Files or dbo.Images... crawlRequestManager.ProcessEmailAddresses(crawlRequest, arachnodeDAO); crawlRequestManager.ProcessHyperLinks(crawlRequest, arachnodeDAO); actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO); discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO); }
public override void ProcessCrawlRequest(CrawlRequest <TArachnodeDAO> crawlRequest, bool obeyCrawlRules, bool executeCrawlActions) { IssueWebRequest(crawlRequest, "GET"); crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest); if (obeyCrawlRules) { _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PreGet, _arachnodeDAO); } if (executeCrawlActions) { _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PreGet, _arachnodeDAO); } if (!crawlRequest.IsDisallowed) { try { if (crawlRequest.WebClient.HttpWebResponse != null) { crawlRequest.ProcessData = true; bool isLastModifiedOutdated = true; try { isLastModifiedOutdated = crawlRequest.WebClient.HttpWebResponse.LastModified != DateTime.Now; } catch (Exception exception) { _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false); } if (isLastModifiedOutdated) { switch (crawlRequest.DataType.DiscoveryType) { case DiscoveryType.File: if (ApplicationSettings.AssignFileAndImageDiscoveries) //ANODET: robots.txt { ArachnodeDataSet.FilesRow filesRow = _arachnodeDAO.GetFile(crawlRequest.Discovery.Uri.AbsoluteUri); if (filesRow == null) { crawlRequest.ProcessData = true; } else { if (!filesRow.IsResponseHeadersNull()) { DateTime lastModified; SqlString lastModifiedValue = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, "Last-Modified: ", false); if (!lastModifiedValue.IsNull && DateTime.TryParse(lastModifiedValue.Value, out lastModified)) { //crawlRequest.WebClient.HttpWebResponse.LastModified will equal DateTime.Now (or close to it) if the 'Last-Modified' ResponseHeader is not present... if ((crawlRequest.WebClient.HttpWebResponse).LastModified > lastModified) { crawlRequest.ProcessData = true; } else { crawlRequest.ProcessData = false; } } else { crawlRequest.ProcessData = false; } } else { crawlRequest.ProcessData = true; } if (!crawlRequest.ProcessData) { if (filesRow.Source.Length != 0) { crawlRequest.Data = filesRow.Source; } else { string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedFilesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType); if (File.Exists(discoveryPath)) { crawlRequest.Data = File.ReadAllBytes(discoveryPath); } else { try { throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the Files database table or at _applicationSettings.DownloadedFilesDirectory. Therefore, the data was re-downloaded from the server. The File file may have been deleted from disk or the 'Source' column in the 'Files' table may have been cleared or a previous crawl may have crawled with both _applicationSettings.InsertFileSource = false and _applicationSettings.SaveDiscoveredFilesToDisk = false."); } catch (Exception exception) { _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false); } crawlRequest.ProcessData = true; } } } } } else { crawlRequest.ProcessData = false; } break; case DiscoveryType.Image: if (ApplicationSettings.AssignFileAndImageDiscoveries) { ArachnodeDataSet.ImagesRow imagesRow = _arachnodeDAO.GetImage(crawlRequest.Discovery.Uri.AbsoluteUri); if (imagesRow == null) { crawlRequest.ProcessData = true; } else { if (!imagesRow.IsResponseHeadersNull()) { DateTime lastModified; SqlString lastModifiedValue = UserDefinedFunctions.ExtractResponseHeader(imagesRow.ResponseHeaders, "Last-Modified: ", false); if (!lastModifiedValue.IsNull && DateTime.TryParse(lastModifiedValue.Value, out lastModified)) { //crawlRequest.WebClient.HttpWebResponse.LastModified will equal DateTime.Now (or close to it) if the 'Last-Modified' ResponseHeader is not present... if (crawlRequest.WebClient.HttpWebResponse.LastModified > lastModified) { crawlRequest.ProcessData = true; } else { crawlRequest.ProcessData = false; } } else { crawlRequest.ProcessData = false; } if (!crawlRequest.ProcessData) { if (imagesRow.Source.Length != 0) { crawlRequest.Data = imagesRow.Source; } else { string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedImagesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType); if (File.Exists(discoveryPath)) { crawlRequest.Data = File.ReadAllBytes(discoveryPath); } else { try { throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the Images database table or at _applicationSettings.DownloadedImagesDirectory. Therefore, the data was downloaded from the server. The Image file may have been deleted from disk or the 'Source' column in the 'Images' table may have been cleared. A previous crawl may have crawled with both _applicationSettings.InsertImageSource = false and _applicationSettings.SaveDiscoveredImagesToDisk = false."); } catch (Exception exception) { _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false); } crawlRequest.ProcessData = true; } } } } else { crawlRequest.ProcessData = true; } } } else { crawlRequest.ProcessData = false; } break; case DiscoveryType.WebPage: ArachnodeDataSet.WebPagesRow webPagesRow = _arachnodeDAO.GetWebPage(crawlRequest.Discovery.Uri.AbsoluteUri); if (webPagesRow == null) { crawlRequest.ProcessData = true; } else { if ((crawlRequest.WebClient.HttpWebResponse).LastModified > webPagesRow.LastDiscovered) { crawlRequest.ProcessData = true; } else { crawlRequest.ProcessData = false; } if (!crawlRequest.ProcessData) { if (webPagesRow.Source.Length != 0) { crawlRequest.Data = webPagesRow.Source; } else { string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType); if (File.Exists(discoveryPath)) { crawlRequest.Data = File.ReadAllBytes(discoveryPath); } else { try { throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the WebPages database table or at _applicationSettings.DownloadedWebPagesDirectory. Therefore, the data was re-downloaded from the server. The WebPage file may have been deleted from disk or the 'Source' column in the 'WebPages' table may have been cleared or a previous crawl may have crawled with both _applicationSettings.InsertWebPageSource = false and _applicationSettings.SaveDiscoveredWebPagesToDisk = false."); } catch (Exception exception) { _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false); } crawlRequest.ProcessData = true; } } } } break; case DiscoveryType.None: crawlRequest.ProcessData = true; break; } } } } catch (Exception exception) { _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false); } if (crawlRequest.ProcessData) { if (crawlRequest.Data != null) { } if (crawlRequest.RenderType == RenderType.None) { if (crawlRequest.Discovery.Uri.Scheme.ToLowerInvariant() != "ftp") { if (crawlRequest.WebClient.HttpWebResponse != null && crawlRequest.WebClient.HttpWebResponse.Method == "HEAD") { IssueWebRequest(crawlRequest, "GET"); } if (crawlRequest.WebClient.HttpWebResponse != null) { crawlRequest.Data = crawlRequest.WebClient.DownloadHttpData(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "gzip", crawlRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "deflate", crawlRequest.Crawl.Crawler.CookieContainer); } } else { crawlRequest.Data = crawlRequest.WebClient.DownloadFtpData(crawlRequest.Discovery.Uri.AbsoluteUri); } } else { RendererResponse rendererResponse = crawlRequest.Crawl.Crawler.Engine.Render(crawlRequest, RenderAction.Render, crawlRequest.RenderType); if (rendererResponse != null) { if (rendererResponse.HTMLDocumentClass != null) { crawlRequest.Encoding = Encoding.GetEncoding(rendererResponse.HTMLDocumentClass.charset); string outerHTML = rendererResponse.HTMLDocumentClass.documentElement.outerHTML; crawlRequest.Data = crawlRequest.Encoding.GetBytes(outerHTML); crawlRequest.DecodedHtml = HttpUtility.HtmlDecode(outerHTML); crawlRequest.Html = outerHTML; crawlRequest.HtmlDocument = rendererResponse.HTMLDocumentClass; } crawlRequest.RendererMessage = rendererResponse.RendererMessage; } } } } else { if (crawlRequest.Data == null) { } } if (crawlRequest.Data == null) { crawlRequest.Data = new byte[0]; } }