/// <summary>
///     Returns the source bytes for a File, preferring the copy stored in the Files table and
///     falling back to the copy saved on disk at ApplicationSettings.DownloadedFilesDirectory.
/// </summary>
/// <param name = "fileAbsoluteUriOrID">The AbsoluteUri or the ID of the File to look up.</param>
/// <param name = "arachnodeDAO">The DAO used to retrieve the Files row.</param>
/// <returns>The File source bytes, or null when no Files row exists for the supplied key.</returns>
public override byte[] GetFileSource(string fileAbsoluteUriOrID, IArachnodeDAO arachnodeDAO)
{
    if (ApplicationSettings.DownloadedFilesDirectory == null)
    {
        throw new Exception("_applicationSettings.DownloadedFilesDirectory is null. This is usually the result of failing to initialize the Application configuration from the ArachnodeDAO.");
    }

    ArachnodeDataSet.FilesRow filesRow = arachnodeDAO.GetFile(fileAbsoluteUriOrID);

    if (filesRow == null)
    {
        return null;
    }

    //the Source column may be null when InsertFileSource was false at crawl time - guard before reading Length...
    if (filesRow.Source != null && filesRow.Source.Length != 0)
    {
        return filesRow.Source;
    }

    //the Source was not stored in the database - look for the File on disk...
    string discoveryPath = GetDiscoveryPath(ApplicationSettings.DownloadedFilesDirectory, filesRow.AbsoluteUri, filesRow.FullTextIndexType);

    if (!File.Exists(discoveryPath))
    {
        throw new Exception("Could not find the File Source in the database or on disk.");
    }

    return File.ReadAllBytes(discoveryPath);
}
/// <summary>
///     Handles the FileUtilities OnFileProcessed event: prepends the status message to the
///     post-processing status RichTextBox, keeping only the most recent 10,000 characters.
///     Marshals onto the UI thread via BeginInvoke since the event is raised from a worker.
/// </summary>
/// <param name = "filesRow">The files row that was processed (unused here).</param>
/// <param name = "message">The status message to display.</param>
private void FileUtilities_OnFileProcessed(ArachnodeDataSet.FilesRow filesRow, string message)
{
    BeginInvoke((MethodInvoker)delegate
        {
            //newest message goes on top...
            rtbPostProcessingStatus.Text = message + Environment.NewLine + rtbPostProcessingStatus.Text;

            //trim the control's (possibly normalized) text rather than a local copy...
            if (rtbPostProcessingStatus.Text.Length > 10000)
            {
                rtbPostProcessingStatus.Text = rtbPostProcessingStatus.Text.Substring(0, 10000);
            }
        });
}
/// <summary>
///     Handles a change of the FileID NumericUpDown: loads the corresponding Files row,
///     shows its on-disk discovery path, and optionally auto-views the File.
/// </summary>
/// <param name = "sender">The event source, forwarded to btnViewFile_Click.</param>
/// <param name = "e">The event arguments, forwarded to btnViewFile_Click.</param>
private void nudFileID_ValueChanged(object sender, EventArgs e)
{
    _filesRow = _arachnodeDAO.GetFile(nudFileID.Value.ToString());

    //guard clause: no such File - report in the browser control and hide the path link...
    if (_filesRow == null)
    {
        llFileDiscoveryPathDirectory.Visible = false;

        wbMain.DocumentText = "The File with the ID of " + nudFileID.Value + " does not exist.";

        return;
    }

    _fileDiscoveryPath = _discoveryManager.GetDiscoveryPath(_applicationSettings.DownloadedFilesDirectory, _filesRow.AbsoluteUri, _filesRow.FullTextIndexType);

    llFileDiscoveryPathDirectory.Visible = true;
    llFileDiscoveryPathDirectory.Text = Path.GetDirectoryName(_fileDiscoveryPath);

    if (cbAutoView.Checked)
    {
        btnViewFile_Click(sender, e);
    }
}
/// <summary>
///     Process a range of FileID after crawling. Useful if crawled Files were not processed at crawl time according to desired ApplicationSettings configuration.
///     Calling this method DOES change the 'LastDiscovered' fields where applicable.
///     This method is not intended to be used when crawling, rather during post-processing.
/// </summary>
/// <param name = "crawler">The crawler whose ApplicationSettings and WebSettings drive the processing.</param>
/// <param name = "fileIDLowerBound">The inclusive lower bound of the FileID range.</param>
/// <param name = "fileIDUpperBound">The inclusive upper bound of the FileID range.</param>
public static void ProcessFiles(Crawler<TArachnodeDAO> crawler, long fileIDLowerBound, long fileIDUpperBound)
{
    //do not assign the application settings. doing so will override the ApplicationSetting you set before calling this method...
    TArachnodeDAO arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), crawler.ApplicationSettings.ConnectionString, crawler.ApplicationSettings, crawler.WebSettings, false, false);

    ConsoleManager<TArachnodeDAO> consoleManager = new ConsoleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    ActionManager<TArachnodeDAO> actionManager = new ActionManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CookieManager cookieManager = new CookieManager();
    MemoryManager<TArachnodeDAO> memoryManager = new MemoryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    RuleManager<TArachnodeDAO> ruleManager = new RuleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager = new CrawlerPeerManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, null, arachnodeDAO);
    Cache<TArachnodeDAO> cache = new Cache<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, crawler, actionManager, cacheManager, crawlerPeerManager, memoryManager, ruleManager);
    DiscoveryManager<TArachnodeDAO> discoveryManager = new DiscoveryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, cache, actionManager, cacheManager, memoryManager, ruleManager);

    //load the CrawlActions, CrawlRules and EngineActions...
    ruleManager.ProcessCrawlRules(crawler);
    actionManager.ProcessCrawlActions(crawler);
    actionManager.ProcessEngineActions(crawler);

    //these three methods are called in the Engine.
    UserDefinedFunctions.RefreshAllowedExtensions(true);
    UserDefinedFunctions.RefreshAllowedSchemes(true);
    UserDefinedFunctions.RefreshDisallowed();

    //instantiate a WebClient to access the ResponseHeaders...
    WebClient<TArachnodeDAO> webClient = new WebClient<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager, cookieManager, new ProxyManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager));

    webClient.GetHttpWebResponse("http://google.com", "GET", null, null, null, null);

    FileManager<TArachnodeDAO> fileManager = new FileManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager, arachnodeDAO);

    for (long i = fileIDLowerBound; i <= fileIDUpperBound; i++)
    {
        ArachnodeDataSet.FilesRow filesRow = null;

        try
        {
            //get the File from the database. we need the source data as we don't store this in the index.
            //even though most of the fields are available in the Document, the File is the authoritative source, so we'll use that for all of the fields.
            filesRow = arachnodeDAO.GetFile(i.ToString());

            if (filesRow != null)
            {
                if (filesRow.Source == null || filesRow.Source.Length == 0)
                {
                    //compute the path once - it was previously computed twice for the same inputs...
                    string discoveryPath = discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedFilesDirectory, filesRow.AbsoluteUri, filesRow.FullTextIndexType);

                    if (File.Exists(discoveryPath))
                    {
                        filesRow.Source = File.ReadAllBytes(discoveryPath);
                    }
                    else
                    {
                        Console.WriteLine("FileID: " + i + " was NOT processed successfully.");

                        //copy the delegate to a local to avoid a race with unsubscription between the null check and the invocation...
                        var onFileProcessed = OnFileProcessed;

                        if (onFileProcessed != null)
                        {
                            onFileProcessed.BeginInvoke(filesRow, "FileID: " + i + " was NOT processed successfully.", null, null);
                        }
                    }
                }

                ProcessFile(crawler.ApplicationSettings, crawler.WebSettings, crawler, filesRow, webClient, cache, actionManager, consoleManager, crawlerPeerManager, discoveryManager, fileManager, memoryManager, ruleManager, arachnodeDAO);

                Console.WriteLine("FileID: " + i + " was processed successfully.");

                var onFileProcessed2 = OnFileProcessed;

                if (onFileProcessed2 != null)
                {
                    onFileProcessed2.BeginInvoke(filesRow, "FileID: " + i + " was processed successfully.", null, null);
                }
            }
        }
        catch (Exception exception)
        {
            Console.WriteLine("FileID: " + i + " was NOT processed successfully.");
            Console.WriteLine(exception.Message);

            var onFileProcessed = OnFileProcessed;

            if (onFileProcessed != null)
            {
                onFileProcessed.BeginInvoke(filesRow, "FileID: " + i + " was NOT processed successfully.", null, null);
                onFileProcessed.BeginInvoke(filesRow, exception.Message, null, null);
            }

            arachnodeDAO.InsertException(null, null, exception, false);
        }
    }

    //stop the CrawlActions, CrawlRules and EngineActions...
    ruleManager.Stop();
    actionManager.Stop();
}
/// <summary>
///     Processes a FilesRow after crawling: reconstructs a CrawlRequest from the stored row
///     (including its ResponseHeaders), re-inserts/manages the File per the ApplicationSettings,
///     and performs the PostRequest CrawlActions.
/// </summary>
/// <param name = "applicationSettings">The application settings.</param>
/// <param name = "webSettings">The web settings.</param>
/// <param name = "crawler">The crawler.</param>
/// <param name = "filesRow">The files row.</param>
/// <param name = "webClient">The web client supplying the HttpWebResponse whose Headers are rebuilt.</param>
/// <param name = "cache">The cache.</param>
/// <param name = "actionManager">The action manager.</param>
/// <param name = "consoleManager">The console manager.</param>
/// <param name = "crawlerPeerManager">The crawler peer manager.</param>
/// <param name = "discoveryManager">The discovery manager.</param>
/// <param name = "fileManager">The file manager.</param>
/// <param name = "memoryManager">The memory manager.</param>
/// <param name = "ruleManager">The rule manager.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
public static void ProcessFile(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler<TArachnodeDAO> crawler, ArachnodeDataSet.FilesRow filesRow, WebClient<TArachnodeDAO> webClient, Cache<TArachnodeDAO> cache, ActionManager<TArachnodeDAO> actionManager, ConsoleManager<TArachnodeDAO> consoleManager, CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager, DiscoveryManager<TArachnodeDAO> discoveryManager, FileManager<TArachnodeDAO> fileManager, MemoryManager<TArachnodeDAO> memoryManager, RuleManager<TArachnodeDAO> ruleManager, IArachnodeDAO arachnodeDAO)
{
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(applicationSettings, webSettings);
    CookieManager cookieManager = new CookieManager();
    CrawlRequestManager<TArachnodeDAO> crawlRequestManager = new CrawlRequestManager<TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
    DataTypeManager<TArachnodeDAO> dataTypeManager = new DataTypeManager<TArachnodeDAO>(applicationSettings, webSettings);
    EncodingManager<TArachnodeDAO> encodingManager = new EncodingManager<TArachnodeDAO>(applicationSettings, webSettings);
    PolitenessManager<TArachnodeDAO> politenessManager = new PolitenessManager<TArachnodeDAO>(applicationSettings, webSettings, cache);
    ProxyManager<TArachnodeDAO> proxyManager = new ProxyManager<TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
    HtmlManager<TArachnodeDAO> htmlManager = new HtmlManager<TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
    Crawl<TArachnodeDAO> crawl = new Crawl<TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

    //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
    CrawlRequest<TArachnodeDAO> crawlRequest = new CrawlRequest<TArachnodeDAO>(new Discovery<TArachnodeDAO>(filesRow.AbsoluteUri), 1, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

    crawlRequest.Crawl = crawl;
    crawlRequest.Discovery.DiscoveryType = DiscoveryType.File;
    crawlRequest.Discovery.ID = filesRow.ID;
    crawlRequest.Data = filesRow.Source;
    crawlRequest.ProcessData = true;
    crawlRequest.WebClient = webClient;

    crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

    //parse the ResponseHeaders from the FilesRow.ResponseHeaders string...
    foreach (string responseHeader in filesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
    {
        string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

        string name = responseHeaderSplit[0];
        string value = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, name, true).Value;

        crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
    }

    //refresh the DataTypes in the DataTypeManager... (if necessary)...
    if (dataTypeManager.AllowedDataTypes.Count == 0)
    {
        dataTypeManager.RefreshDataTypes();
    }

    crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

    if (applicationSettings.InsertFiles)
    {
        crawlRequest.Discovery.ID = arachnodeDAO.InsertFile(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertFileSource ? crawlRequest.Data : new byte[] { }, crawlRequest.DataType.FullTextIndexType, applicationSettings.ClassifyAbsoluteUris);
    }

    crawlRequest.ManagedDiscovery = fileManager.ManageFile(crawlRequest, crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractFileMetaData, applicationSettings.InsertFileMetaData, applicationSettings.SaveDiscoveredFilesToDisk);

    actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

    discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
}
/// <summary>
///     Processes a CrawlRequest: issues a GET, evaluates PreGet CrawlRules/CrawlActions, decides
///     via the 'Last-Modified' response header and previously stored rows whether the Discovery's
///     data is stale, and either reuses the stored/on-disk data or downloads (or renders) it anew.
/// </summary>
/// <param name = "crawlRequest">The crawl request to process.</param>
/// <param name = "obeyCrawlRules">Whether the PreGet CrawlRules are evaluated.</param>
/// <param name = "executeCrawlActions">Whether the PreGet CrawlActions are performed.</param>
public override void ProcessCrawlRequest(CrawlRequest<TArachnodeDAO> crawlRequest, bool obeyCrawlRules, bool executeCrawlActions)
{
    IssueWebRequest(crawlRequest, "GET");

    crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);

    if (obeyCrawlRules)
    {
        _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PreGet, _arachnodeDAO);
    }

    if (executeCrawlActions)
    {
        _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PreGet, _arachnodeDAO);
    }

    if (!crawlRequest.IsDisallowed)
    {
        try
        {
            if (crawlRequest.WebClient.HttpWebResponse != null)
            {
                crawlRequest.ProcessData = true;

                bool isLastModifiedOutdated = true;

                try
                {
                    //HttpWebResponse.LastModified equals DateTime.Now (or close to it) when no 'Last-Modified' header was sent...
                    isLastModifiedOutdated = crawlRequest.WebClient.HttpWebResponse.LastModified != DateTime.Now;
                }
                catch (Exception exception)
                {
                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                }

                if (isLastModifiedOutdated)
                {
                    switch (crawlRequest.DataType.DiscoveryType)
                    {
                        case DiscoveryType.File:
                            if (ApplicationSettings.AssignFileAndImageDiscoveries) //ANODET: robots.txt
                            {
                                ArachnodeDataSet.FilesRow filesRow = _arachnodeDAO.GetFile(crawlRequest.Discovery.Uri.AbsoluteUri);

                                if (filesRow == null)
                                {
                                    crawlRequest.ProcessData = true;
                                }
                                else
                                {
                                    if (!filesRow.IsResponseHeadersNull())
                                    {
                                        DateTime lastModified;
                                        SqlString lastModifiedValue = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, "Last-Modified: ", false);

                                        if (!lastModifiedValue.IsNull && DateTime.TryParse(lastModifiedValue.Value, out lastModified))
                                        {
                                            //crawlRequest.WebClient.HttpWebResponse.LastModified will equal DateTime.Now (or close to it) if the 'Last-Modified' ResponseHeader is not present...
                                            if (crawlRequest.WebClient.HttpWebResponse.LastModified > lastModified)
                                            {
                                                crawlRequest.ProcessData = true;
                                            }
                                            else
                                            {
                                                crawlRequest.ProcessData = false;
                                            }
                                        }
                                        else
                                        {
                                            crawlRequest.ProcessData = false;
                                        }
                                    }
                                    else
                                    {
                                        crawlRequest.ProcessData = true;
                                    }

                                    //the data is not stale - reuse the stored Source, falling back to the on-disk copy...
                                    if (!crawlRequest.ProcessData)
                                    {
                                        //the Source column may be null when InsertFileSource was false - guard before reading Length...
                                        if (filesRow.Source != null && filesRow.Source.Length != 0)
                                        {
                                            crawlRequest.Data = filesRow.Source;
                                        }
                                        else
                                        {
                                            string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedFilesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                            if (File.Exists(discoveryPath))
                                            {
                                                crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                            }
                                            else
                                            {
                                                //throw/catch to record the anomaly without aborting - the data is simply re-downloaded...
                                                try
                                                {
                                                    throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the Files database table or at _applicationSettings.DownloadedFilesDirectory. Therefore, the data was re-downloaded from the server. The File file may have been deleted from disk or the 'Source' column in the 'Files' table may have been cleared or a previous crawl may have crawled with both _applicationSettings.InsertFileSource = false and _applicationSettings.SaveDiscoveredFilesToDisk = false.");
                                                }
                                                catch (Exception exception)
                                                {
                                                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                                }

                                                crawlRequest.ProcessData = true;
                                            }
                                        }
                                    }
                                }
                            }
                            else
                            {
                                crawlRequest.ProcessData = false;
                            }
                            break;
                        case DiscoveryType.Image:
                            if (ApplicationSettings.AssignFileAndImageDiscoveries)
                            {
                                ArachnodeDataSet.ImagesRow imagesRow = _arachnodeDAO.GetImage(crawlRequest.Discovery.Uri.AbsoluteUri);

                                if (imagesRow == null)
                                {
                                    crawlRequest.ProcessData = true;
                                }
                                else
                                {
                                    if (!imagesRow.IsResponseHeadersNull())
                                    {
                                        DateTime lastModified;
                                        SqlString lastModifiedValue = UserDefinedFunctions.ExtractResponseHeader(imagesRow.ResponseHeaders, "Last-Modified: ", false);

                                        if (!lastModifiedValue.IsNull && DateTime.TryParse(lastModifiedValue.Value, out lastModified))
                                        {
                                            //crawlRequest.WebClient.HttpWebResponse.LastModified will equal DateTime.Now (or close to it) if the 'Last-Modified' ResponseHeader is not present...
                                            if (crawlRequest.WebClient.HttpWebResponse.LastModified > lastModified)
                                            {
                                                crawlRequest.ProcessData = true;
                                            }
                                            else
                                            {
                                                crawlRequest.ProcessData = false;
                                            }
                                        }
                                        else
                                        {
                                            crawlRequest.ProcessData = false;
                                        }
                                    }
                                    else
                                    {
                                        crawlRequest.ProcessData = true;
                                    }

                                    //hoisted out of the ResponseHeaders check for parity with the File case; a no-op when ProcessData is true...
                                    if (!crawlRequest.ProcessData)
                                    {
                                        //the Source column may be null when InsertImageSource was false - guard before reading Length...
                                        if (imagesRow.Source != null && imagesRow.Source.Length != 0)
                                        {
                                            crawlRequest.Data = imagesRow.Source;
                                        }
                                        else
                                        {
                                            string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedImagesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                            if (File.Exists(discoveryPath))
                                            {
                                                crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                            }
                                            else
                                            {
                                                //throw/catch to record the anomaly without aborting - the data is simply re-downloaded...
                                                try
                                                {
                                                    throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the Images database table or at _applicationSettings.DownloadedImagesDirectory. Therefore, the data was downloaded from the server. The Image file may have been deleted from disk or the 'Source' column in the 'Images' table may have been cleared. A previous crawl may have crawled with both _applicationSettings.InsertImageSource = false and _applicationSettings.SaveDiscoveredImagesToDisk = false.");
                                                }
                                                catch (Exception exception)
                                                {
                                                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                                }

                                                crawlRequest.ProcessData = true;
                                            }
                                        }
                                    }
                                }
                            }
                            else
                            {
                                crawlRequest.ProcessData = false;
                            }
                            break;
                        case DiscoveryType.WebPage:
                            ArachnodeDataSet.WebPagesRow webPagesRow = _arachnodeDAO.GetWebPage(crawlRequest.Discovery.Uri.AbsoluteUri);

                            if (webPagesRow == null)
                            {
                                crawlRequest.ProcessData = true;
                            }
                            else
                            {
                                if (crawlRequest.WebClient.HttpWebResponse.LastModified > webPagesRow.LastDiscovered)
                                {
                                    crawlRequest.ProcessData = true;
                                }
                                else
                                {
                                    crawlRequest.ProcessData = false;
                                }

                                if (!crawlRequest.ProcessData)
                                {
                                    //the Source column may be null when InsertWebPageSource was false - guard before reading Length...
                                    if (webPagesRow.Source != null && webPagesRow.Source.Length != 0)
                                    {
                                        crawlRequest.Data = webPagesRow.Source;
                                    }
                                    else
                                    {
                                        string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                        if (File.Exists(discoveryPath))
                                        {
                                            crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                        }
                                        else
                                        {
                                            //throw/catch to record the anomaly without aborting - the data is simply re-downloaded...
                                            try
                                            {
                                                throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the WebPages database table or at _applicationSettings.DownloadedWebPagesDirectory. Therefore, the data was re-downloaded from the server. The WebPage file may have been deleted from disk or the 'Source' column in the 'WebPages' table may have been cleared or a previous crawl may have crawled with both _applicationSettings.InsertWebPageSource = false and _applicationSettings.SaveDiscoveredWebPagesToDisk = false.");
                                            }
                                            catch (Exception exception)
                                            {
                                                _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                            }

                                            crawlRequest.ProcessData = true;
                                        }
                                    }
                                }
                            }
                            break;
                        case DiscoveryType.None:
                            crawlRequest.ProcessData = true;
                            break;
                    }
                }
            }
        }
        catch (Exception exception)
        {
            _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
        }

        if (crawlRequest.ProcessData)
        {
            if (crawlRequest.RenderType == RenderType.None)
            {
                if (crawlRequest.Discovery.Uri.Scheme.ToLowerInvariant() != "ftp")
                {
                    //a prior HEAD request has no body - re-issue as GET before downloading...
                    if (crawlRequest.WebClient.HttpWebResponse != null && crawlRequest.WebClient.HttpWebResponse.Method == "HEAD")
                    {
                        IssueWebRequest(crawlRequest, "GET");
                    }

                    if (crawlRequest.WebClient.HttpWebResponse != null)
                    {
                        crawlRequest.Data = crawlRequest.WebClient.DownloadHttpData(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "gzip", crawlRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "deflate", crawlRequest.Crawl.Crawler.CookieContainer);
                    }
                }
                else
                {
                    crawlRequest.Data = crawlRequest.WebClient.DownloadFtpData(crawlRequest.Discovery.Uri.AbsoluteUri);
                }
            }
            else
            {
                RendererResponse rendererResponse = crawlRequest.Crawl.Crawler.Engine.Render(crawlRequest, RenderAction.Render, crawlRequest.RenderType);

                if (rendererResponse != null)
                {
                    if (rendererResponse.HTMLDocumentClass != null)
                    {
                        crawlRequest.Encoding = Encoding.GetEncoding(rendererResponse.HTMLDocumentClass.charset);

                        string outerHTML = rendererResponse.HTMLDocumentClass.documentElement.outerHTML;

                        crawlRequest.Data = crawlRequest.Encoding.GetBytes(outerHTML);
                        crawlRequest.DecodedHtml = HttpUtility.HtmlDecode(outerHTML);
                        crawlRequest.Html = outerHTML;
                        crawlRequest.HtmlDocument = rendererResponse.HTMLDocumentClass;
                    }

                    crawlRequest.RendererMessage = rendererResponse.RendererMessage;
                }
            }
        }

    //downstream consumers expect a non-null Data...
    if (crawlRequest.Data == null)
    {
        crawlRequest.Data = new byte[0];
    }
}