/// <summary>
/// Retrieves the binary Source for an Image, first from the database row and, when the row's
/// Source column is empty, from the downloaded-images directory on disk.
/// </summary>
/// <param name="imageAbsoluteUriOrID">The AbsoluteUri or the ID of the Image to look up.</param>
/// <param name="arachnodeDAO">The DAO used to query the Images table.</param>
/// <returns>The Image bytes, or null if no Images row exists for the given AbsoluteUri/ID.</returns>
/// <exception cref="Exception">
/// Thrown when ApplicationSettings.DownloadedImagesDirectory is not configured, or when the
/// Source is present neither in the database nor on disk.
/// </exception>
public override byte[] GetImageSource(string imageAbsoluteUriOrID, IArachnodeDAO arachnodeDAO)
{
    //fix: the original allocated a ManagedImage here that was never used - removed.
    if (ApplicationSettings.DownloadedImagesDirectory == null)
    {
        throw new Exception("_applicationSettings.DownloadedImagesDirectory is null. This is usually the result of failing to initialize the Application configuration from the ArachnodeDAO.");
    }

    ArachnodeDataSet.ImagesRow imagesRow = arachnodeDAO.GetImage(imageAbsoluteUriOrID);

    if (imagesRow == null)
    {
        return null;
    }

    if (imagesRow.Source.Length != 0)
    {
        //the database holds the authoritative copy when InsertImageSource was enabled at crawl time.
        return imagesRow.Source;
    }

    //the Source column is empty - fall back to the copy saved to disk at crawl time.
    string discoveryPath = GetDiscoveryPath(ApplicationSettings.DownloadedImagesDirectory, imagesRow.AbsoluteUri, imagesRow.FullTextIndexType);

    if (!File.Exists(discoveryPath))
    {
        throw new Exception("Could not find the Image Source in the database or on disk.");
    }

    return File.ReadAllBytes(discoveryPath);
}
/// <summary>
/// Handles the ImageUtilities OnImageProcessed event by prepending the status message to the
/// post-processing RichTextBox, marshalled onto the UI thread via BeginInvoke.
/// The box is capped at 10,000 characters so it does not grow without bound.
/// </summary>
/// <param name="imagesRow">The Images row that was processed (unused by this handler).</param>
/// <param name="message">The status text to display.</param>
private void ImageUtilities_OnImageProcessed(ArachnodeDataSet.ImagesRow imagesRow, string message)
{
    //this event fires on a worker thread; marshal the control update to the UI thread.
    BeginInvoke(new MethodInvoker(delegate
        {
            //newest message goes on top.
            rtbPostProcessingStatus.Text = string.Concat(message, Environment.NewLine, rtbPostProcessingStatus.Text);

            //trim the oldest text once the buffer exceeds the cap.
            if (rtbPostProcessingStatus.Text.Length > 10000)
            {
                rtbPostProcessingStatus.Text = rtbPostProcessingStatus.Text.Substring(0, 10000);
            }
        }));
    //Application.DoEvents();
    //Thread.Sleep(100);
}
/// <summary>
/// Handles changes to the ImageID NumericUpDown: loads the corresponding Images row, shows the
/// discovery path link when the row exists, and optionally auto-views the image.
/// </summary>
/// <param name="sender">The NumericUpDown control.</param>
/// <param name="e">Event data.</param>
private void nudImageID_ValueChanged(object sender, EventArgs e)
{
    _imagesRow = _arachnodeDAO.GetImage(nudImageID.Value.ToString());

    //guard clause: nothing to show when no Images row exists for this ID.
    if (_imagesRow == null)
    {
        llImageDiscoveryPathDirectory.Visible = false;
        wbMain.DocumentText = "The Image with the ID of " + nudImageID.Value + " does not exist.";
        return;
    }

    _imageDiscoveryPath = _discoveryManager.GetDiscoveryPath(_applicationSettings.DownloadedImagesDirectory, _imagesRow.AbsoluteUri, _imagesRow.FullTextIndexType);

    llImageDiscoveryPathDirectory.Visible = true;
    llImageDiscoveryPathDirectory.Text = Path.GetDirectoryName(_imageDiscoveryPath);

    //when auto-view is enabled, render the image immediately.
    if (cbAutoView.Checked)
    {
        btnViewImage_Click(sender, e);
    }
}
/// <summary>
/// Process a range of ImageID after crawling. Useful if crawled Images were not processed at crawl time according to desired ApplicationSettings configuration.
/// Calling this method DOES change the 'LastDiscovered' fields where applicable.
/// This method is not called when crawling; rather, it is used during post-processing.
/// </summary>
/// <param name="crawler">The Crawler whose ApplicationSettings/WebSettings configure processing.</param>
/// <param name="imageIDLowerBound">Inclusive lower bound of the ImageID range to process.</param>
/// <param name="imageIDUpperBound">Inclusive upper bound of the ImageID range to process.</param>
public static void ProcessImages(Crawler<TArachnodeDAO> crawler, long imageIDLowerBound, long imageIDUpperBound)
{
    //do not assign the application settings. doing so will override the ApplicationSetting you set before calling this method...
    TArachnodeDAO arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), crawler.ApplicationSettings.ConnectionString, crawler.ApplicationSettings, crawler.WebSettings, false, false);

    //build the manager graph required by SiteCrawler internals for post-processing.
    ConsoleManager<TArachnodeDAO> consoleManager = new ConsoleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    ActionManager<TArachnodeDAO> actionManager = new ActionManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CookieManager cookieManager = new CookieManager();
    MemoryManager<TArachnodeDAO> memoryManager = new MemoryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    RuleManager<TArachnodeDAO> ruleManager = new RuleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager = new CrawlerPeerManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, null, arachnodeDAO);
    Cache<TArachnodeDAO> cache = new Cache<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, crawler, actionManager, cacheManager, crawlerPeerManager, memoryManager, ruleManager);
    DiscoveryManager<TArachnodeDAO> discoveryManager = new DiscoveryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, cache, actionManager, cacheManager, memoryManager, ruleManager);

    //load the CrawlActions, CrawlRules and EngineActions...
    ruleManager.ProcessCrawlRules(crawler);
    actionManager.ProcessCrawlActions(crawler);
    actionManager.ProcessEngineActions(crawler);

    //these three methods are called in the Engine.
    UserDefinedFunctions.RefreshAllowedExtensions(true);
    UserDefinedFunctions.RefreshAllowedSchemes(true);
    UserDefinedFunctions.RefreshDisallowed();

    //instantiate a WebClient to access the ResponseHeaders...
    WebClient<TArachnodeDAO> webClient = new WebClient<TArachnodeDAO>(crawler.ApplicationSettings, arachnodeDAO.WebSettings, consoleManager, cookieManager, new ProxyManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager));
    //warm-up request so the WebClient has a populated HttpWebResponse to work with.
    webClient.GetHttpWebResponse("http://google.com", "GET", null, null, null, null);

    ImageManager<TArachnodeDAO> imageManager = new ImageManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager, arachnodeDAO);

    for (long i = imageIDLowerBound; i <= imageIDUpperBound; i++)
    {
        ArachnodeDataSet.ImagesRow imagesRow = null;

        try
        {
            //get the Image from the database. we need the source data as we don't store this in the index.
            //even though most of the fields are available in the Document, the Image is the authoritative source, so we'll use that for all of the fields.
            imagesRow = arachnodeDAO.GetImage(i.ToString());

            if (imagesRow != null)
            {
                if (imagesRow.Source == null || imagesRow.Source.Length == 0)
                {
                    //fix: compute the discovery path once - the original evaluated GetDiscoveryPath twice with identical arguments.
                    string discoveryPath = discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedImagesDirectory, imagesRow.AbsoluteUri, imagesRow.FullTextIndexType);

                    if (File.Exists(discoveryPath))
                    {
                        imagesRow.Source = File.ReadAllBytes(discoveryPath);
                    }
                    else
                    {
                        //no Source in the database and no file on disk - report and skip loading.
                        Console.WriteLine("ImageID: " + i + " was NOT processed successfully.");

                        if (OnImageProcessed != null)
                        {
                            OnImageProcessed.BeginInvoke(imagesRow, "ImageID: " + i + " was NOT processed successfully.", null, null);
                        }
                    }
                }

                ProcessImage(crawler.ApplicationSettings, crawler.WebSettings, crawler, imagesRow, webClient, cache, actionManager, consoleManager, crawlerPeerManager, discoveryManager, imageManager, memoryManager, ruleManager, arachnodeDAO);

                Console.WriteLine("ImageID: " + i + " was processed successfully.");

                if (OnImageProcessed != null)
                {
                    OnImageProcessed.BeginInvoke(imagesRow, "ImageID: " + i + " was processed successfully.", null, null);
                }
            }
        }
        catch (Exception exception)
        {
            //a failure on one ImageID is logged and recorded, then processing continues with the next ID.
            Console.WriteLine("ImageID: " + i + " was NOT processed successfully.");
            Console.WriteLine(exception.Message);

            if (OnImageProcessed != null)
            {
                OnImageProcessed.BeginInvoke(imagesRow, "ImageID: " + i + " was NOT processed successfully.", null, null);
                OnImageProcessed.BeginInvoke(imagesRow, exception.Message, null, null);
            }

            arachnodeDAO.InsertException(null, null, exception, false);
        }
    }

    //stop the CrawlActions, CrawlRules and EngineActions...
    ruleManager.Stop();
    actionManager.Stop();
}
/// <summary>
/// Processes an ImagesRow after crawling: rebuilds a CrawlRequest around the stored row,
/// replays its stored ResponseHeaders onto the WebClient, re-inserts the Image when configured,
/// manages the Image (metadata extraction / save-to-disk) and runs PostRequest CrawlActions.
/// </summary>
/// <param name="applicationSettings">The settings that control insertion/metadata/save behavior.</param>
/// <param name="webSettings">The web settings.</param>
/// <param name="crawler">The owning Crawler.</param>
/// <param name="imagesRow">The Images row to process.</param>
/// <param name="webClient">The WebClient whose HttpWebResponse carries the replayed headers.</param>
/// <param name="cache">The Cache.</param>
/// <param name="actionManager">The action manager.</param>
/// <param name="consoleManager">The console manager.</param>
/// <param name="crawlerPeerManager">The crawler peer manager.</param>
/// <param name="discoveryManager">The discovery manager.</param>
/// <param name="imageManager">The image manager.</param>
/// <param name="memoryManager">The memory manager.</param>
/// <param name="ruleManager">The rule manager.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
public static void ProcessImage(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler<TArachnodeDAO> crawler, ArachnodeDataSet.ImagesRow imagesRow, WebClient<TArachnodeDAO> webClient, Cache<TArachnodeDAO> cache, ActionManager<TArachnodeDAO> actionManager, ConsoleManager<TArachnodeDAO> consoleManager, CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager, DiscoveryManager<TArachnodeDAO> discoveryManager, ImageManager<TArachnodeDAO> imageManager, MemoryManager<TArachnodeDAO> memoryManager, RuleManager<TArachnodeDAO> ruleManager, IArachnodeDAO arachnodeDAO)
{
    //build the manager graph a Crawl requires.
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(applicationSettings, webSettings);
    CookieManager cookieManager = new CookieManager();
    CrawlRequestManager<TArachnodeDAO> crawlRequestManager = new CrawlRequestManager<TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
    DataTypeManager<TArachnodeDAO> dataTypeManager = new DataTypeManager<TArachnodeDAO>(applicationSettings, webSettings);
    EncodingManager<TArachnodeDAO> encodingManager = new EncodingManager<TArachnodeDAO>(applicationSettings, webSettings);
    PolitenessManager<TArachnodeDAO> politenessManager = new PolitenessManager<TArachnodeDAO>(applicationSettings, webSettings, cache);
    ProxyManager<TArachnodeDAO> proxyManager = new ProxyManager<TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
    HtmlManager<TArachnodeDAO> htmlManager = new HtmlManager<TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
    Crawl<TArachnodeDAO> crawl = new Crawl<TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

    //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
    CrawlRequest<TArachnodeDAO> crawlRequest = new CrawlRequest<TArachnodeDAO>(new Discovery<TArachnodeDAO>(imagesRow.AbsoluteUri), 1, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);
    crawlRequest.Crawl = crawl;
    crawlRequest.Discovery.DiscoveryType = DiscoveryType.Image;
    crawlRequest.Discovery.ID = imagesRow.ID;
    crawlRequest.Data = imagesRow.Source;
    crawlRequest.ProcessData = true;
    crawlRequest.WebClient = webClient;

    //replace the warm-up response's headers with the headers stored for this Image.
    crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

    //parse the ResponseHeaders from the ImagesRow.ResponseHeaders string...
    foreach (string responseHeader in imagesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
    {
        string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

        string name = responseHeaderSplit[0];
        //ExtractResponseHeader is used (rather than responseHeaderSplit[1]) so values containing ':' are preserved.
        string value = UserDefinedFunctions.ExtractResponseHeader(imagesRow.ResponseHeaders, name, true).Value;

        crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
    }

    //refresh the DataTypes in the DataTypeManager... (if necessary)...
    if (dataTypeManager.AllowedDataTypes.Count == 0)
    {
        dataTypeManager.RefreshDataTypes();
    }

    crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

    if (applicationSettings.InsertImages)
    {
        //an empty byte[] is inserted when InsertImageSource is disabled.
        crawlRequest.Discovery.ID = arachnodeDAO.InsertImage(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertImageSource ? crawlRequest.Data : new byte[] {}, crawlRequest.DataType.FullTextIndexType);
    }

    crawlRequest.ManagedDiscovery = imageManager.ManageImage(crawlRequest, crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractImageMetaData, applicationSettings.InsertImageMetaData, applicationSettings.SaveDiscoveredImagesToDisk);

    actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

    discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
}
/// <summary>
/// Processes a CrawlRequest: issues the web request, determines the DataType, applies PreGet
/// rules/actions, and then either reuses previously stored data (when the 'Last-Modified'
/// ResponseHeader shows the stored File/Image/WebPage is still fresh) or downloads the data via
/// HTTP/FTP or the renderer. Guarantees crawlRequest.Data is non-null on return.
/// </summary>
/// <param name="crawlRequest">The CrawlRequest to process; its Data/DataType/ProcessData are populated.</param>
/// <param name="obeyCrawlRules">Whether PreGet CrawlRules are evaluated (may set IsDisallowed).</param>
/// <param name="executeCrawlActions">Whether PreGet CrawlActions are performed.</param>
public override void ProcessCrawlRequest(CrawlRequest<TArachnodeDAO> crawlRequest, bool obeyCrawlRules, bool executeCrawlActions)
{
    IssueWebRequest(crawlRequest, "GET");

    crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);

    if (obeyCrawlRules)
    {
        _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PreGet, _arachnodeDAO);
    }

    if (executeCrawlActions)
    {
        _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PreGet, _arachnodeDAO);
    }

    if (!crawlRequest.IsDisallowed)
    {
        try
        {
            if (crawlRequest.WebClient.HttpWebResponse != null)
            {
                crawlRequest.ProcessData = true;

                //NOTE(review): LastModified != DateTime.Now is effectively always true (it is only
                //false when the header timestamp equals the current instant); the original comment
                //below suggests LastModified defaults to ~DateTime.Now when the header is absent -
                //confirm the intended comparison.
                bool isLastModifiedOutdated = true;

                try
                {
                    isLastModifiedOutdated = crawlRequest.WebClient.HttpWebResponse.LastModified != DateTime.Now;
                }
                catch (Exception exception)
                {
                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                }

                if (isLastModifiedOutdated)
                {
                    switch (crawlRequest.DataType.DiscoveryType)
                    {
                        case DiscoveryType.File:
                            if (ApplicationSettings.AssignFileAndImageDiscoveries) //ANODET: robots.txt
                            {
                                ArachnodeDataSet.FilesRow filesRow = _arachnodeDAO.GetFile(crawlRequest.Discovery.Uri.AbsoluteUri);

                                if (filesRow == null)
                                {
                                    crawlRequest.ProcessData = true;
                                }
                                else
                                {
                                    if (!filesRow.IsResponseHeadersNull())
                                    {
                                        DateTime lastModified;

                                        SqlString lastModifiedValue = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, "Last-Modified: ", false);

                                        if (!lastModifiedValue.IsNull && DateTime.TryParse(lastModifiedValue.Value, out lastModified))
                                        {
                                            //crawlRequest.WebClient.HttpWebResponse.LastModified will equal DateTime.Now (or close to it) if the 'Last-Modified' ResponseHeader is not present...
                                            if ((crawlRequest.WebClient.HttpWebResponse).LastModified > lastModified)
                                            {
                                                crawlRequest.ProcessData = true;
                                            }
                                            else
                                            {
                                                crawlRequest.ProcessData = false;
                                            }
                                        }
                                        else
                                        {
                                            crawlRequest.ProcessData = false;
                                        }
                                    }
                                    else
                                    {
                                        crawlRequest.ProcessData = true;
                                    }

                                    //data is not stale: reuse the stored Source (database first, then disk).
                                    if (!crawlRequest.ProcessData)
                                    {
                                        if (filesRow.Source.Length != 0)
                                        {
                                            crawlRequest.Data = filesRow.Source;
                                        }
                                        else
                                        {
                                            string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedFilesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                            if (File.Exists(discoveryPath))
                                            {
                                                crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                            }
                                            else
                                            {
                                                //stored copy is missing everywhere: record the condition and fall back to re-downloading.
                                                try
                                                {
                                                    throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the Files database table or at _applicationSettings.DownloadedFilesDirectory. Therefore, the data was re-downloaded from the server. The File file may have been deleted from disk or the 'Source' column in the 'Files' table may have been cleared or a previous crawl may have crawled with both _applicationSettings.InsertFileSource = false and _applicationSettings.SaveDiscoveredFilesToDisk = false.");
                                                }
                                                catch (Exception exception)
                                                {
                                                    _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                                }

                                                crawlRequest.ProcessData = true;
                                            }
                                        }
                                    }
                                }
                            }
                            else
                            {
                                crawlRequest.ProcessData = false;
                            }
                            break;
                        case DiscoveryType.Image:
                            if (ApplicationSettings.AssignFileAndImageDiscoveries)
                            {
                                ArachnodeDataSet.ImagesRow imagesRow = _arachnodeDAO.GetImage(crawlRequest.Discovery.Uri.AbsoluteUri);

                                if (imagesRow == null)
                                {
                                    crawlRequest.ProcessData = true;
                                }
                                else
                                {
                                    if (!imagesRow.IsResponseHeadersNull())
                                    {
                                        DateTime lastModified;

                                        SqlString lastModifiedValue = UserDefinedFunctions.ExtractResponseHeader(imagesRow.ResponseHeaders, "Last-Modified: ", false);

                                        if (!lastModifiedValue.IsNull && DateTime.TryParse(lastModifiedValue.Value, out lastModified))
                                        {
                                            //crawlRequest.WebClient.HttpWebResponse.LastModified will equal DateTime.Now (or close to it) if the 'Last-Modified' ResponseHeader is not present...
                                            if (crawlRequest.WebClient.HttpWebResponse.LastModified > lastModified)
                                            {
                                                crawlRequest.ProcessData = true;
                                            }
                                            else
                                            {
                                                crawlRequest.ProcessData = false;
                                            }
                                        }
                                        else
                                        {
                                            crawlRequest.ProcessData = false;
                                        }

                                        //NOTE(review): unlike the File case, this reload block sits INSIDE the
                                        //IsResponseHeadersNull branch in the original - preserved as-is; confirm
                                        //whether the asymmetry with the File case is intentional.
                                        if (!crawlRequest.ProcessData)
                                        {
                                            if (imagesRow.Source.Length != 0)
                                            {
                                                crawlRequest.Data = imagesRow.Source;
                                            }
                                            else
                                            {
                                                string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedImagesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                                if (File.Exists(discoveryPath))
                                                {
                                                    crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                                }
                                                else
                                                {
                                                    //stored copy is missing everywhere: record the condition and fall back to re-downloading.
                                                    try
                                                    {
                                                        throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the Images database table or at _applicationSettings.DownloadedImagesDirectory. Therefore, the data was downloaded from the server. The Image file may have been deleted from disk or the 'Source' column in the 'Images' table may have been cleared. A previous crawl may have crawled with both _applicationSettings.InsertImageSource = false and _applicationSettings.SaveDiscoveredImagesToDisk = false.");
                                                    }
                                                    catch (Exception exception)
                                                    {
                                                        _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                                    }

                                                    crawlRequest.ProcessData = true;
                                                }
                                            }
                                        }
                                    }
                                    else
                                    {
                                        crawlRequest.ProcessData = true;
                                    }
                                }
                            }
                            else
                            {
                                crawlRequest.ProcessData = false;
                            }
                            break;
                        case DiscoveryType.WebPage:
                            ArachnodeDataSet.WebPagesRow webPagesRow = _arachnodeDAO.GetWebPage(crawlRequest.Discovery.Uri.AbsoluteUri);

                            if (webPagesRow == null)
                            {
                                crawlRequest.ProcessData = true;
                            }
                            else
                            {
                                //WebPages compare against LastDiscovered rather than a stored 'Last-Modified' header.
                                if ((crawlRequest.WebClient.HttpWebResponse).LastModified > webPagesRow.LastDiscovered)
                                {
                                    crawlRequest.ProcessData = true;
                                }
                                else
                                {
                                    crawlRequest.ProcessData = false;
                                }

                                if (!crawlRequest.ProcessData)
                                {
                                    if (webPagesRow.Source.Length != 0)
                                    {
                                        crawlRequest.Data = webPagesRow.Source;
                                    }
                                    else
                                    {
                                        string discoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.DataType.FullTextIndexType);

                                        if (File.Exists(discoveryPath))
                                        {
                                            crawlRequest.Data = File.ReadAllBytes(discoveryPath);
                                        }
                                        else
                                        {
                                            //stored copy is missing everywhere: record the condition and fall back to re-downloading.
                                            try
                                            {
                                                throw new Exception("The 'LastModified' HttpResponse Header indicated that the Data was not stale, but the Data (Source) could not be found in the WebPages database table or at _applicationSettings.DownloadedWebPagesDirectory. Therefore, the data was re-downloaded from the server. The WebPage file may have been deleted from disk or the 'Source' column in the 'WebPages' table may have been cleared or a previous crawl may have crawled with both _applicationSettings.InsertWebPageSource = false and _applicationSettings.SaveDiscoveredWebPagesToDisk = false.");
                                            }
                                            catch (Exception exception)
                                            {
                                                _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
                                            }

                                            crawlRequest.ProcessData = true;
                                        }
                                    }
                                }
                            }
                            break;
                        case DiscoveryType.None:
                            crawlRequest.ProcessData = true;
                            break;
                    }
                }
            }
        }
        catch (Exception exception)
        {
            _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
        }

        //fix: removed the original's empty 'if (crawlRequest.Data != null) { }' dead block here.
        if (crawlRequest.ProcessData)
        {
            if (crawlRequest.RenderType == RenderType.None)
            {
                if (crawlRequest.Discovery.Uri.Scheme.ToLowerInvariant() != "ftp")
                {
                    //a HEAD probe does not carry a body; re-issue as GET before downloading.
                    if (crawlRequest.WebClient.HttpWebResponse != null && crawlRequest.WebClient.HttpWebResponse.Method == "HEAD")
                    {
                        IssueWebRequest(crawlRequest, "GET");
                    }

                    if (crawlRequest.WebClient.HttpWebResponse != null)
                    {
                        crawlRequest.Data = crawlRequest.WebClient.DownloadHttpData(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "gzip", crawlRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "deflate", crawlRequest.Crawl.Crawler.CookieContainer);
                    }
                }
                else
                {
                    crawlRequest.Data = crawlRequest.WebClient.DownloadFtpData(crawlRequest.Discovery.Uri.AbsoluteUri);
                }
            }
            else
            {
                //render the page and capture the rendered HTML as the request data.
                RendererResponse rendererResponse = crawlRequest.Crawl.Crawler.Engine.Render(crawlRequest, RenderAction.Render, crawlRequest.RenderType);

                if (rendererResponse != null)
                {
                    if (rendererResponse.HTMLDocumentClass != null)
                    {
                        crawlRequest.Encoding = Encoding.GetEncoding(rendererResponse.HTMLDocumentClass.charset);

                        string outerHTML = rendererResponse.HTMLDocumentClass.documentElement.outerHTML;

                        crawlRequest.Data = crawlRequest.Encoding.GetBytes(outerHTML);
                        crawlRequest.DecodedHtml = HttpUtility.HtmlDecode(outerHTML);
                        crawlRequest.Html = outerHTML;
                        crawlRequest.HtmlDocument = rendererResponse.HTMLDocumentClass;
                    }

                    crawlRequest.RendererMessage = rendererResponse.RendererMessage;
                }
            }
        }
    }
    //fix: removed the original's dead 'else { if (crawlRequest.Data == null) { } }' branch here.

    //downstream consumers expect a non-null Data buffer.
    if (crawlRequest.Data == null)
    {
        crawlRequest.Data = new byte[0];
    }
}