/// <summary>
/// Renders the stored source of a single WebPage into uxLWebPage.
/// The request is only served when the query string consists of exactly two keys,
/// "discoveryID" followed by "absoluteUri"; any other query string leaves the page untouched.
/// The source is taken from the database row when present, otherwise from the downloaded copy on disk.
/// Any failure is shown in uxLblException and recorded through ArachnodeDAO.InsertException.
/// </summary>
/// <param name = "sender">The event source.</param>
/// <param name = "e">The event data.</param>
protected void Page_Load(object sender, EventArgs e)
{
    EnableViewState = false;

    try
    {
        //the keys must be present in this exact order, and nothing else may be supplied.
        bool isExpectedQuery = Request.QueryString.Count == 2
                               && Request.QueryString.AllKeys[0] == "discoveryID"
                               && Request.QueryString.AllKeys[1] == "absoluteUri";

        if (!isExpectedQuery)
        {
            return;
        }

        ArachnodeDataSet.WebPagesRow row = ArachnodeDAO.GetWebPage(Request.QueryString["discoveryID"]);

        if (row == null)
        {
            uxLblException.Text = "The WebPage was not found in the database.";
            uxLblException.Visible = true;
            return;
        }

        string markup;
        bool hasStoredSource = row.Source != null && row.Source.Length != 0;

        if (hasStoredSource)
        {
            //decode the bytes held in the database using the code page recorded for the WebPage.
            markup = Encoding.GetEncoding(row.CodePage).GetString(row.Source);
        }
        else
        {
            //nothing in the database: fall back to the downloaded copy on disk.
            string pathOnDisk = DiscoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, row.AbsoluteUri, row.FullTextIndexType);

            if (!File.Exists(pathOnDisk))
            {
                uxLblException.Text = "The WebPage source was not found in the database or on disk.";
                uxLblException.Visible = true;
                return;
            }

            markup = File.ReadAllText(pathOnDisk, Encoding.GetEncoding(row.CodePage));
        }

        //ANODET: Should the Uri qualification be a configuration setting? Hotlinking isn't exactly polite, but does provide the best user experience. (Version 1.5)
        uxLWebPage.Text = HtmlManager.CreateHtmlDocument(row.AbsoluteUri, row.FullTextIndexType, markup, UriQualificationType.AbsoluteWhenDownloadedDiscoveryIsUnavailable, ArachnodeDAO, true).DocumentNode.OuterHtml;
    }
    catch (Exception exception)
    {
        uxLblException.Text = exception.Message;
        uxLblException.Visible = true;

        ArachnodeDAO.InsertException(null, null, exception, false);
    }
}
/// <summary>
/// Loads the WebPage row for the ID currently selected in nudWebPageID.
/// When the row exists, the discovery path is recalculated, its directory is shown in
/// llWebPageDiscoveryPathDirectory, and the WebPage is displayed if cbAutoView is checked.
/// When the row does not exist, the link is hidden and a message is written to wbMain.
/// </summary>
/// <param name = "sender">The event source; forwarded to btnViewWebPage_Click.</param>
/// <param name = "e">The event data; forwarded to btnViewWebPage_Click.</param>
private void nudWebPageID_ValueChanged(object sender, EventArgs e)
{
    _webPagesRow = _arachnodeDAO.GetWebPage(nudWebPageID.Value.ToString());

    if (_webPagesRow == null)
    {
        //no such WebPage: hide the path link and tell the user.
        llWebPageDiscoveryPathDirectory.Visible = false;

        string notFoundMessage = "The WebPage with the ID of " + nudWebPageID.Value + " does not exist.";
        wbMain.DocumentText = notFoundMessage;

        return;
    }

    _webPageDiscoveryPath = _discoveryManager.GetDiscoveryPath(
        _applicationSettings.DownloadedWebPagesDirectory,
        _webPagesRow.AbsoluteUri,
        _webPagesRow.FullTextIndexType);

    llWebPageDiscoveryPathDirectory.Visible = true;
    llWebPageDiscoveryPathDirectory.Text = Path.GetDirectoryName(_webPageDiscoveryPath);

    if (!cbAutoView.Checked)
    {
        return;
    }

    btnViewWebPage_Click(sender, e);
}
/// <summary>
/// 	Processes a range of FileIDs after crawling.  Useful if crawled Files were not processed at crawl time according to the desired ApplicationSettings configuration.
/// 	Calling this method DOES change the 'LastDiscovered' fields where applicable.
/// 	This method is not intended to be called while crawling, but rather during post-processing.
/// 	Files whose source is available neither in the database nor on disk are reported as not processed and are skipped.
/// 	The outcome for each File is written to the Console and published through OnFileProcessed; exceptions are recorded through the DAO and do not stop the run.
/// </summary>
/// <param name = "crawler">Supplies the ApplicationSettings and WebSettings used to construct the managers, and is passed on to the rule/action loaders and to ProcessFile.</param>
/// <param name = "fileIDLowerBound">The first FileID to process (inclusive).</param>
/// <param name = "fileIDUpperBound">The last FileID to process (inclusive).</param>
public static void ProcessFiles(Crawler<TArachnodeDAO> crawler, long fileIDLowerBound, long fileIDUpperBound)
{
    //do not assign the application settings.  doing so will override the ApplicationSetting you set before calling this method...
    TArachnodeDAO arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), crawler.ApplicationSettings.ConnectionString, crawler.ApplicationSettings, crawler.WebSettings, false, false);

    ConsoleManager<TArachnodeDAO> consoleManager = new ConsoleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    ActionManager<TArachnodeDAO> actionManager = new ActionManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CookieManager cookieManager = new CookieManager();
    MemoryManager<TArachnodeDAO> memoryManager = new MemoryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    RuleManager<TArachnodeDAO> ruleManager = new RuleManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager = new CrawlerPeerManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, null, arachnodeDAO);
    Cache<TArachnodeDAO> cache = new Cache<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, crawler, actionManager, cacheManager, crawlerPeerManager, memoryManager, ruleManager);
    DiscoveryManager<TArachnodeDAO> discoveryManager = new DiscoveryManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, cache, actionManager, cacheManager, memoryManager, ruleManager);

    //load the CrawlActions, CrawlRules and EngineActions...
    ruleManager.ProcessCrawlRules(crawler);
    actionManager.ProcessCrawlActions(crawler);
    actionManager.ProcessEngineActions(crawler);

    try
    {
        //these three methods are called in the Engine.
        UserDefinedFunctions.RefreshAllowedExtensions(true);
        UserDefinedFunctions.RefreshAllowedSchemes(true);
        UserDefinedFunctions.RefreshDisallowed();

        //instantiate a WebClient to access the ResponseHeaders...
        WebClient<TArachnodeDAO> webClient = new WebClient<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager, cookieManager, new ProxyManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager));

        webClient.GetHttpWebResponse("http://google.com", "GET", null, null, null, null);

        FileManager<TArachnodeDAO> fileManager = new FileManager<TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager, arachnodeDAO);

        for (long i = fileIDLowerBound; i <= fileIDUpperBound; i++)
        {
            //declared outside the try so the catch block can still hand the row (possibly null) to OnFileProcessed.
            ArachnodeDataSet.FilesRow filesRow = null;

            try
            {
                //get the File from the database.  we need the source data as we don't store this in the index.
                //even though most of the fields are available in the Document, the File is the authoritative source, so we'll use that for all of the fields.
                filesRow = arachnodeDAO.GetFile(i.ToString());

                if (filesRow == null)
                {
                    //no File with this ID; nothing to process and nothing to report.
                    continue;
                }

                if (filesRow.Source == null || filesRow.Source.Length == 0)
                {
                    //the source isn't in the database; fall back to the downloaded copy on disk.
                    string discoveryPath = discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedFilesDirectory, filesRow.AbsoluteUri, filesRow.FullTextIndexType);

                    if (!File.Exists(discoveryPath))
                    {
                        string missingSourceMessage = "FileID: " + i + " was NOT processed successfully.";

                        Console.WriteLine(missingSourceMessage);

                        if (OnFileProcessed != null)
                        {
                            OnFileProcessed.BeginInvoke(filesRow, missingSourceMessage, null, null);
                        }

                        //without any source there is nothing to process; skip the File rather than
                        //falling through and additionally reporting it as processed successfully.
                        continue;
                    }

                    filesRow.Source = File.ReadAllBytes(discoveryPath);
                }

                ProcessFile(crawler.ApplicationSettings, crawler.WebSettings, crawler, filesRow, webClient, cache, actionManager, consoleManager, crawlerPeerManager, discoveryManager, fileManager, memoryManager, ruleManager, arachnodeDAO);

                string successMessage = "FileID: " + i + " was processed successfully.";

                Console.WriteLine(successMessage);

                if (OnFileProcessed != null)
                {
                    OnFileProcessed.BeginInvoke(filesRow, successMessage, null, null);
                }
            }
            catch (Exception exception)
            {
                string failureMessage = "FileID: " + i + " was NOT processed successfully.";

                Console.WriteLine(failureMessage);
                Console.WriteLine(exception.Message);

                if (OnFileProcessed != null)
                {
                    OnFileProcessed.BeginInvoke(filesRow, failureMessage, null, null);
                    OnFileProcessed.BeginInvoke(filesRow, exception.Message, null, null);
                }

                arachnodeDAO.InsertException(null, null, exception, false);
            }
        }
    }
    finally
    {
        //stop the CrawlActions, CrawlRules and EngineActions...
        //done in a finally so they are stopped even if an exception escapes the loop (e.g. from InsertException).
        ruleManager.Stop();
        actionManager.Stop();
    }
}