/// <summary>
/// Handles the form's Load event: creates the DAO and the managers, then invokes the
/// WebPage, File and Image ID value-changed handlers once.
/// Any exception is reported in a message box instead of being propagated.
/// </summary>
/// <param name="sender">The event source (not used).</param>
/// <param name="e">The event data (not used).</param>
private void Main_Load(object sender, EventArgs e)
{
    try
    {
        _arachnodeDAO = new ArachnodeDAO(_applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true);

        // Created in dependency order: a manager must be assigned before it is handed to
        // another constructor, otherwise that constructor receives the field's previous (null) value.
        _consoleManager = new ConsoleManager<ArachnodeDAO>(_applicationSettings, _webSettings);
        _actionManager = new ActionManager<ArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
        _memoryManager = new MemoryManager<ArachnodeDAO>(_applicationSettings, _webSettings);
        _cacheManager = new CacheManager<ArachnodeDAO>(_applicationSettings, _webSettings);
        _crawlerPeerManager = new CrawlerPeerManager<ArachnodeDAO>(_applicationSettings, _webSettings, null, _arachnodeDAO);
        _ruleManager = new RuleManager<ArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
        _cache = new Cache<ArachnodeDAO>(_applicationSettings, _webSettings, null, _actionManager, _cacheManager, _crawlerPeerManager, _memoryManager, _ruleManager);
        _discoveryManager = new DiscoveryManager<ArachnodeDAO>(_applicationSettings, _webSettings, _cache, _actionManager, _cacheManager, _memoryManager, _ruleManager);

        nudWebPageID_ValueChanged(null, null);
        nudFileID_ValueChanged(null, null);
        nudImageID_ValueChanged(null, null);
    }
    catch (Exception exception)
    {
        MessageBox.Show(exception.Message + " ::" + exception.StackTrace, "Browser");
    }
}
/// <summary>
/// Waits until the configured pause since the last completed WebPage request has elapsed, then
/// disallows the request when the completed plus canceled request count has reached the daily maximum.
/// </summary>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="arachnodeDAO">The arachnode DAO (not used by this rule).</param>
/// <returns>
/// <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    // Requests whose Priority equals 1000001 skip the wait entirely.
    // NOTE(review): an earlier comment described this sentinel as double.MaxValue (a request needed to
    // satisfy content belonging to a WebPage, e.g. an image or a file); the code tests 1000001 - confirm.
    if (crawlRequest.Priority != 1000001)
    {
        // Poll in 10 ms steps until enough time has passed since the last completed WebPage request.
        while ((DateTime.Now - crawlRequest.Politeness.LastWebPageHttpWebRequestCompleted).TotalMilliseconds < _threadSleepTimeInMillisecondsBetweenWebRequests)
        {
            Thread.Sleep(10);
        }
    }

    crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

    // More than a day since the window started: open a new window and clear the completed counter.
    TimeSpan oneDay = TimeSpan.FromDays(1);
    if (DateTime.Now - crawlRequest.Politeness.FirstHttpWebRequest > oneDay)
    {
        crawlRequest.Politeness.FirstHttpWebRequest = DateTime.Now;
        crawlRequest.Politeness.TotalHttpWebRequestsCompleted = 0;
    }

    var requestsInWindow = crawlRequest.Politeness.TotalHttpWebRequestsCompleted + crawlRequest.Politeness.TotalHttpWebRequestsCanceled;
    if (requestsInWindow >= _maximumNumberOfWebRequestsPerHostPerDay)
    {
        crawlRequest.IsDisallowedReason = "Too many HttpWebRequests per day.";
        return true;
    }

    return false;
}
/// <summary>
/// Returns the source of a stored WebPage as text. The bytes held in the database row are used when
/// present; otherwise the source is read from the discovery path under the downloaded-WebPages directory.
/// </summary>
/// <param name="webPageAbsoluteUriOrID">The value passed to <c>arachnodeDAO.GetWebPage</c>.</param>
/// <param name="arachnodeDAO">The arachnode DAO used for the lookup.</param>
/// <returns>The decoded source, or <c>null</c> when the DAO returns no row.</returns>
/// <exception cref="Exception">
/// The downloaded-WebPages directory setting is null, or the source is neither in the row nor on disk.
/// </exception>
public override string GetWebPageSource(string webPageAbsoluteUriOrID, IArachnodeDAO arachnodeDAO)
{
    if (ApplicationSettings.DownloadedWebPagesDirectory == null)
    {
        throw new Exception("_applicationSettings.DownloadedWebPagesDirectory is null. This is usually the result of failing to initialize the Application configuration from the ArachnodeDAO.");
    }

    ArachnodeDataSet.WebPagesRow row = arachnodeDAO.GetWebPage(webPageAbsoluteUriOrID);
    if (row == null)
    {
        return null;
    }

    // Source stored in the database: decode it with the row's code page.
    if (row.Source.Length != 0)
    {
        return Encoding.GetEncoding(row.CodePage).GetString(row.Source);
    }

    // Empty Source in the database: fall back to the copy on disk.
    string pathOnDisk = GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, row.AbsoluteUri, row.FullTextIndexType);
    if (!File.Exists(pathOnDisk))
    {
        throw new Exception("Could not find the WebPage Source in the database or on disk.");
    }

    return File.ReadAllText(pathOnDisk, Encoding.GetEncoding(row.CodePage));
}
/// <summary>
/// Returns the raw bytes of a stored Image. The bytes held in the database row are used when present;
/// otherwise they are read from the discovery path under the downloaded-Images directory.
/// </summary>
/// <param name="imageAbsoluteUriOrID">The value passed to <c>arachnodeDAO.GetImage</c>.</param>
/// <param name="arachnodeDAO">The arachnode DAO used for the lookup.</param>
/// <returns>The image bytes, or <c>null</c> when the DAO returns no row.</returns>
/// <exception cref="Exception">
/// The downloaded-Images directory setting is null, or the source is neither in the row nor on disk.
/// </exception>
public override byte[] GetImageSource(string imageAbsoluteUriOrID, IArachnodeDAO arachnodeDAO)
{
    if (ApplicationSettings.DownloadedImagesDirectory == null)
    {
        throw new Exception("_applicationSettings.DownloadedImagesDirectory is null. This is usually the result of failing to initialize the Application configuration from the ArachnodeDAO.");
    }

    ArachnodeDataSet.ImagesRow imagesRow = arachnodeDAO.GetImage(imageAbsoluteUriOrID);
    if (imagesRow == null)
    {
        return null;
    }

    // Source stored in the database: return it as-is.
    if (imagesRow.Source.Length != 0)
    {
        return imagesRow.Source;
    }

    // Empty Source in the database: fall back to the copy on disk.
    string discoveryPath = GetDiscoveryPath(ApplicationSettings.DownloadedImagesDirectory, imagesRow.AbsoluteUri, imagesRow.FullTextIndexType);
    if (!File.Exists(discoveryPath))
    {
        throw new Exception("Could not find the Image Source in the database or on disk.");
    }

    return File.ReadAllBytes(discoveryPath);
}
/// <summary>
/// Disallows a crawl request whose MaximumDepth is below -1, equal to -1, or above the configured maximum depth.
/// </summary>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="arachnodeDAO">The arachnode DAO (not used by this rule).</param>
/// <returns>
/// <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

    var maximumDepth = crawlRequest.MaximumDepth;

    // Pick the first matching reason; null means the depth is acceptable.
    string reason = null;
    if (maximumDepth < -1)
    {
        reason = "CrawlRequest.MaximumDepth cannot equal less than -1.";
    }
    else if (maximumDepth == -1)
    {
        reason = "CrawlRequest.MaximumDepth cannot equal -1.";
    }
    else if (maximumDepth > _maximumCrawlRequestDepth)
    {
        reason = "CrawlRequest.MaximumDepth cannot exceed " + _maximumCrawlRequestDepth + ".";
    }

    if (reason == null)
    {
        return false;
    }

    crawlRequest.IsDisallowedReason = reason;
    return true;
}
/// <summary>
/// Initializes the data manager: forwards the settings to the base class and stores the supplied collaborators.
/// </summary>
/// <param name="applicationSettings">The application settings (passed to the base constructor).</param>
/// <param name="webSettings">The web settings (passed to the base constructor).</param>
/// <param name="actionManager">The action manager.</param>
/// <param name="dataTypeManager">The data type manager.</param>
/// <param name="discoveryManager">The discovery manager.</param>
/// <param name="ruleManager">The rule manager.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
protected ADataManager(
    ApplicationSettings applicationSettings,
    WebSettings webSettings,
    ActionManager<TArachnodeDAO> actionManager,
    DataTypeManager<TArachnodeDAO> dataTypeManager,
    DiscoveryManager<TArachnodeDAO> discoveryManager,
    RuleManager<TArachnodeDAO> ruleManager,
    IArachnodeDAO arachnodeDAO)
    : base(applicationSettings, webSettings)
{
    _actionManager = actionManager;
    _dataTypeManager = dataTypeManager;
    _discoveryManager = discoveryManager;
    _ruleManager = ruleManager;
    _arachnodeDAO = arachnodeDAO;
}
/// <summary>
/// Determines whether the specified crawl request is disallowed by the host's robots.txt.
/// The robots.txt is requested and parsed when the request's Politeness has no disallowed paths yet,
/// or when the stored paths are more than one day old.
/// </summary>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
/// <returns>
/// <c>true</c> if the request's decoded AbsoluteUri starts with one of the decoded disallowed paths; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    //ANODET: When you add the multi-server caching, the robots.txt file will need to be sent to all other CachePeers.

    // Nothing is checked when the request has not been assigned to a Crawl yet
    // (i.e. when called by the Engine prior to assignment).
    if (crawlRequest.Crawl == null)
    {
        return false;
    }

    string robotsDotTextAbsoluteUri = crawlRequest.Discovery.Uri.Scheme + Uri.SchemeDelimiter + crawlRequest.Discovery.Uri.Host + "/robots.txt";

    crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

    // Only consult robots.txt when the robots.txt AbsoluteUri itself is not disallowed.
    if (!UserDefinedFunctions.IsDisallowedForAbsoluteUri(robotsDotTextAbsoluteUri, false, false))
    {
        // (Re)load when nothing is stored yet or the stored paths are older than a day.
        if (crawlRequest.Politeness.DisallowedPaths == null
            || DateTime.Now.Subtract(crawlRequest.Politeness.DisallowedPathsSince) > TimeSpan.FromDays(1))
        {
            var crawler = crawlRequest.Crawl.Crawler;

            CrawlRequest<TArachnodeDAO> robotsDotTextRequest = new CrawlRequest<TArachnodeDAO>(
                crawlRequest,
                crawler.Cache.GetDiscovery(robotsDotTextAbsoluteUri, arachnodeDAO),
                1,
                1,
                (short)UriClassificationType.Host,
                (short)UriClassificationType.Host,
                double.MaxValue,
                RenderType.None,
                RenderType.None);

            robotsDotTextRequest.Discovery.DiscoveryState = DiscoveryState.Undiscovered;
            robotsDotTextRequest.Politeness = crawlRequest.Politeness;

            // A dedicated Crawl, built from the owning Crawler's managers, processes the robots.txt request.
            Crawl<TArachnodeDAO> crawl = new Crawl<TArachnodeDAO>(
                crawler.ApplicationSettings,
                crawler.WebSettings,
                crawler,
                crawler.ActionManager,
                crawler.ConsoleManager,
                crawler.CookieManager,
                crawler.CrawlRequestManager,
                crawler.DataTypeManager,
                crawler.DiscoveryManager,
                crawler.EncodingManager,
                crawler.HtmlManager,
                crawler.PolitenessManager,
                crawler.ProxyManager,
                crawler.RuleManager,
                false);

            robotsDotTextRequest.Crawl = crawl;

            crawl.ProcessCrawlRequest(robotsDotTextRequest, false, false);

            crawlRequest.Politeness.DisallowedPathsSince = DateTime.Now;

            //The DataManager will not download the byte stream if ApplicationSettings.AssignFileAndImageDiscoveries is set to false. This is by design.
            // NOTE(review): the AbsoluteUri passed below is the original request's, not the robots.txt one -
            // confirm DownloadHttpData does not use it to choose what to download.
            if (robotsDotTextRequest.Data != null && robotsDotTextRequest.Data.Length == 0 && robotsDotTextRequest.WebClient.WebException == null)
            {
                robotsDotTextRequest.Data = robotsDotTextRequest.WebClient.DownloadHttpData(
                    crawlRequest.Discovery.Uri.AbsoluteUri,
                    robotsDotTextRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "gzip",
                    robotsDotTextRequest.WebClient.HttpWebResponse.ContentEncoding.ToLowerInvariant() == "deflate",
                    crawler.CookieContainer);
            }

            SiteCrawler.Value.RobotsDotText robotsDotText = _robotsDotTextManager.ParseRobotsDotTextSource(
                new Uri(crawlRequest.Discovery.Uri.Scheme + Uri.SchemeDelimiter + crawlRequest.Discovery.Uri.Host),
                robotsDotTextRequest.Data);

            crawlRequest.Politeness.CrawlDelayInMilliseconds = robotsDotText.CrawlDelay * 1000;
            crawlRequest.Politeness.DisallowedPaths = robotsDotText.DisallowedPaths;
        }

        if (crawlRequest.Politeness != null && crawlRequest.Politeness.DisallowedPaths != null)
        {
            // Decoded once: the request's AbsoluteUri does not change while the paths are scanned.
            string decodedAbsoluteUri = HttpUtility.UrlDecode(crawlRequest.Discovery.Uri.AbsoluteUri);

            foreach (string disallowedPath in crawlRequest.Politeness.DisallowedPaths)
            {
                // Ordinal: URI prefixes are compared character by character, independent of the current culture.
                if (decodedAbsoluteUri.StartsWith(HttpUtility.UrlDecode(disallowedPath), StringComparison.Ordinal))
                {
                    crawlRequest.IsDisallowedReason = "Prohibited by robots.txt.";
                    return true;
                }
            }
        }
    }

    return false;
}
/// <summary>
/// Example rule hook for a Discovery. This implementation never disallows anything.
/// </summary>
/// <param name="discovery">The discovery.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
/// <returns>Always <c>false</c>.</returns>
public override bool IsDisallowed(Discovery<TArachnodeDAO> discovery, IArachnodeDAO arachnodeDAO)
{
    // Application-specific logic would go here, for example:
    //   discovery.IsStorable = discovery.Uri.AbsoluteUri.ToLowerInvariant().Contains(".aspx");
    // A plugin could decide here whether the Discovery is disallowed; this example does not.
    return false;
}
/// <summary>
/// Sets the request's Politeness.MaximumActiveHttpWebRequests to 2 when a Politeness is present.
/// This rule never disallows a request.
/// </summary>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="arachnodeDAO">The arachnode DAO (not used by this rule).</param>
/// <returns>Always <c>false</c>.</returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    const int maximumActiveHttpWebRequests = 2;

    if (crawlRequest.Politeness != null)
    {
        crawlRequest.Politeness.MaximumActiveHttpWebRequests = maximumActiveHttpWebRequests;
    }

    return false;
}
/// <summary>
/// Stores the supplied collaborators and creates this processor's own <typeparamref name="TArachnodeDAO"/>
/// instance from the connection string in the application settings.
/// </summary>
/// <param name="applicationSettings">The application settings.</param>
/// <param name="crawler">The crawler.</param>
/// <param name="crawlRequestManager">The crawl request manager.</param>
internal DiscoveryProcessor(
    ApplicationSettings applicationSettings,
    Crawler<TArachnodeDAO> crawler,
    CrawlRequestManager<TArachnodeDAO> crawlRequestManager)
{
    _applicationSettings = applicationSettings;
    _crawler = crawler;
    _crawlRequestManager = crawlRequestManager;

    // The DAO type is only known as a type parameter, so it is instantiated reflectively
    // through its constructor that accepts the connection string.
    object arachnodeDAO = Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString);

    _arachnodeDAO = (TArachnodeDAO)arachnodeDAO;
    _arachnodeDAO.ApplicationSettings = _applicationSettings;
}
/// <summary>
/// Disallows a crawl request whose DataType has a DiscoveryType of <c>DiscoveryType.None</c>.
/// </summary>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="arachnodeDAO">The arachnode DAO (not used by this rule).</param>
/// <returns>
/// <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

    if (crawlRequest.DataType.DiscoveryType != DiscoveryType.None)
    {
        return false;
    }

    // The reason carries the ContentType of the unassigned DataType.
    crawlRequest.IsDisallowedReason = "Disallowed by unassigned DataType. (" + crawlRequest.DataType.ContentType + ")";
    return true;
}
/// <summary>
/// Disallows a crawl request whose response status code is anything other than <c>HttpStatusCode.OK</c>.
/// </summary>
/// <param name="crawlRequest">The crawl request; its WebClient.HttpWebResponse is read without a null check.</param>
/// <param name="arachnodeDAO">The arachnode DAO (not used by this rule).</param>
/// <returns>
/// <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

    if (crawlRequest.WebClient.HttpWebResponse.StatusCode == HttpStatusCode.OK)
    {
        return false;
    }

    // The reason carries the offending status code.
    crawlRequest.IsDisallowedReason = "Disallowed by Status. (" + crawlRequest.WebClient.HttpWebResponse.StatusCode + ")";
    return true;
}
/// <summary>
/// Disallows a crawl request whose response LastModified time lies more than the configured
/// maximum number of hours in the past. Requests without a WebClient or response are allowed.
/// </summary>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="arachnodeDAO">The arachnode DAO (not used by this rule).</param>
/// <returns>
/// <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    if (crawlRequest.WebClient == null || crawlRequest.WebClient.HttpWebResponse == null)
    {
        return false;
    }

    TimeSpan age = DateTime.Now - crawlRequest.WebClient.HttpWebResponse.LastModified;
    if (age.TotalHours > _maximumTotalHoursOld)
    {
        // NOTE(review): the reason is written to the Discovery directly, and OutputIsDisallowedReason
        // is not assigned here, unlike the sibling rules - confirm this is intended.
        crawlRequest.Discovery.IsDisallowedReason = "More than maximum total hours old.";
        return true;
    }

    return false;
}
/// <summary>
/// Disallows a crawl request whose response ContentLength exceeds the configured maximum in bytes.
/// Requests without a WebClient or response are allowed.
/// </summary>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="arachnodeDAO">The arachnode DAO (not used by this rule).</param>
/// <returns>
/// <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

    if (crawlRequest.WebClient == null || crawlRequest.WebClient.HttpWebResponse == null)
    {
        return false;
    }

    if (crawlRequest.WebClient.HttpWebResponse.ContentLength > _maximumContentLengthInBytes)
    {
        crawlRequest.IsDisallowedReason = "Disallowed by ContentLength.";
        return true;
    }

    return false;
}
/// <summary>
/// Evaluates the response headers with <c>UserDefinedFunctions.IsDisallowedForResponseHeaders</c>;
/// the outcome is inverted when <c>_negateIsDisallowed</c> is set.
/// </summary>
/// <param name="crawlRequest">The crawl request; its WebClient.HttpWebResponse is read without a null check.</param>
/// <param name="arachnodeDAO">The arachnode DAO (not used by this rule).</param>
/// <returns>
/// <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

    // The reason is assigned up front, whatever the outcome of the check below.
    crawlRequest.IsDisallowedReason = "Disallowed by ResponseHeaders.";

    bool isDisallowed = false;
    if (UserDefinedFunctions.IsDisallowedForResponseHeaders(crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), false))
    {
        isDisallowed = true;
    }

    return _negateIsDisallowed ? !isDisallowed : isDisallowed;
}
/// <summary>
/// Disallows a crawl request whose ResponseUri is crawl-restricted. Otherwise the current Discovery is
/// managed as Discovered, the request's Discovery is replaced by the one for the ResponseUri, and that
/// Discovery is managed as PreRequest.
/// </summary>
/// <param name="crawlRequest">The crawl request; its Crawl, WebClient and HttpWebResponse are read without null checks.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
/// <returns>
/// <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

    var crawler = crawlRequest.Crawl.Crawler;

    if (crawler.DiscoveryManager.IsCrawlRestricted(crawlRequest, crawlRequest.WebClient.HttpWebResponse.ResponseUri.AbsoluteUri))
    {
        crawlRequest.IsDisallowedReason = "Disallowed by ResponseUri. " + crawlRequest.WebClient.HttpWebResponse.ResponseUri.AbsoluteUri;
        return true;
    }

    // Order matters: finish with the current Discovery before swapping in the ResponseUri's Discovery.
    crawler.DiscoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.Discovered, arachnodeDAO);

    crawlRequest.Discovery = crawler.Cache.GetDiscovery(crawlRequest.WebClient.HttpWebResponse.ResponseUri, arachnodeDAO);

    crawler.DiscoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.PreRequest, arachnodeDAO);

    return false;
}
/// <summary>
/// For WebPage requests, evaluates the decoded HTML with <c>UserDefinedFunctions.IsDisallowedForSource</c>;
/// the outcome is inverted when <c>_negateIsDisallowed</c> is set. Other DataTypes are never disallowed here.
/// </summary>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="arachnodeDAO">The arachnode DAO (not used by this rule).</param>
/// <returns>
/// <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    // Only WebPages are inspected; the negation is not applied to anything else.
    if (crawlRequest.DataType.DiscoveryType != DiscoveryType.WebPage)
    {
        return false;
    }

    crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

    // The reason is assigned up front, whatever the outcome of the check below.
    crawlRequest.IsDisallowedReason = "Disallowed by Source.";

    bool isDisallowed = false;
    if (UserDefinedFunctions.IsDisallowedForSource(crawlRequest.DecodedHtml, false))
    {
        isDisallowed = true;
    }

    return _negateIsDisallowed ? !isDisallowed : isDisallowed;
}
/// <summary>
/// Replaces <c>CrawlActions</c> with a new dictionary, then instantiates one crawl action per row returned
/// by the DAO and registers it under its TypeName.
/// </summary>
/// <param name="arachnodeDAO">The arachnode DAO supplying the CrawlActions rows.</param>
internal void LoadCrawlActions(IArachnodeDAO arachnodeDAO)
{
    CrawlActions = new Dictionary<string, ACrawlAction<TArachnodeDAO>>();

    foreach (ArachnodeDataSet.CrawlActionsRow row in arachnodeDAO.GetCrawlActions())
    {
        // Instantiate the action type named by the row.
        ObjectHandle handle = Engine<TArachnodeDAO>.GetObjectHandle(row.AssemblyName, row.TypeName, _applicationSettings, _webSettings);
        ACrawlAction<TArachnodeDAO> action = (ACrawlAction<TArachnodeDAO>)handle.Unwrap();

        // Copy the row's configuration onto the instance.
        action.AssemblyName = row.AssemblyName;
        action.IsEnabled = row.IsEnabled;
        action.Order = row.Order;
        action.CrawlActionType = (CrawlActionType)Enum.Parse(typeof(CrawlActionType), row.CrawlActionTypeID.ToString());

        // Settings is only copied when the column holds a value.
        if (!row.IsSettingsNull())
        {
            action.Settings = row.Settings;
        }

        action.TypeName = row.TypeName;

        CrawlActions.Add(action.TypeName, action);
    }
}
/// <summary>
/// Gets the discovery for an AbsoluteUri, using the cache key derived from it.
/// </summary>
/// <param name="absoluteUri">The AbsoluteUri of the discovery to get.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
/// <returns>The discovery; its Uri is reset to <paramref name="absoluteUri"/> when the two differ.</returns>
public Discovery<TArachnodeDAO> GetDiscovery(string absoluteUri, IArachnodeDAO arachnodeDAO)
{
    string cacheKey = _cacheManager.GetCacheKey(absoluteUri);

    // Extension point: _memoryManager.HasDesiredMaximumMemoryUsageInMegabytesEverBeenMet does not
    // currently change how the discovery is looked up, so no branch on it is needed here.
    Discovery<TArachnodeDAO> discovery = GetDiscovery(absoluteUri, cacheKey, arachnodeDAO);

    // Make sure the caller gets a discovery whose Uri matches exactly what was asked for.
    if (discovery.Uri.AbsoluteUri != absoluteUri)
    {
        discovery.Uri = new Uri(absoluteUri);
    }

    return discovery;
}
/// <summary>
/// Reads the <c>ArachnodeDAO</c> property during initialization, then raises the base Init processing.
/// </summary>
/// <param name="e">The event data.</param>
protected override void OnInit(EventArgs e)
{
    //populates the Application and Web settings...
    // The value itself is not needed; the property is read for that effect only.
    IArachnodeDAO arachnodeDAO = ArachnodeDAO;

    // An override must call the base implementation so that registered Init handlers still run.
    base.OnInit(e);
}
/// <summary>
/// For non-rendered WebPage requests, determines the text encoding and populates the request's
/// Encoding, DecodedHtml and Html properties.
/// The charset from the Content-Type header is tried first, then <c>DetermineEncoding</c>; UTF-8 is the last resort.
/// </summary>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="arachnodeDAO">The arachnode DAO; used to record exceptions when every detection attempt fails.</param>
public override void ProcessCrawlRequest(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    //Rendering determines the Encoding...
    if (crawlRequest.RenderType != RenderType.None || crawlRequest.DataType.DiscoveryType != DiscoveryType.WebPage)
    {
        return;
    }

    // Take whatever follows the single '=' in the header (e.g. "text/html; charset=utf-8") as the charset name.
    string charset = null;
    string contentTypeHeader = crawlRequest.WebClient.HttpWebResponse.Headers["Content-Type"];
    if (contentTypeHeader != null)
    {
        string[] contentTypeHeaderSplit = contentTypeHeader.Split('=');
        if (contentTypeHeaderSplit.Length == 2)
        {
            charset = contentTypeHeaderSplit[1].Replace("utf8", "utf-8");
        }
    }

    Encoding encoding = null;
    string decodedHtml = null;

    try
    {
        //first, try and get the Encoding from the 'Content-Type'...
        if (!string.IsNullOrEmpty(charset))
        {
            encoding = Encoding.GetEncoding(charset);
        }
        else
        {
            decodedHtml = DetermineEncoding(crawlRequest, out encoding);
        }
    }
    catch (Exception exception)
    {
        try
        {
            //if there is an error, try and get the Encoding from the 'Charset'...
            decodedHtml = DetermineEncoding(crawlRequest, out encoding);
        }
        catch (Exception exception2)
        {
            //if there is an error, default to UTF8.
            arachnodeDAO.InsertException(crawlRequest.Discovery.Uri.AbsoluteUri, null, exception, false);
            arachnodeDAO.InsertException(crawlRequest.Discovery.Uri.AbsoluteUri, null, exception2, false);

            encoding = Encoding.UTF8;
        }
    }

    crawlRequest.Encoding = encoding;

    // Reuse the text DetermineEncoding produced when the encoding is UTF-8; otherwise decode the raw
    // bytes - once - and use the result for both properties.
    string html = encoding == Encoding.UTF8 && decodedHtml != null
        ? decodedHtml
        : encoding.GetString(crawlRequest.Data);

    crawlRequest.DecodedHtml = HttpUtility.HtmlDecode(html);
    crawlRequest.Html = html;
}
/// <summary>
/// Determines whether the specified crawl request is disallowed by delegating to the overload
/// that takes the Uri of the request's Discovery.
/// </summary>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="arachnodeDAO">The arachnode DAO (not forwarded).</param>
/// <returns>
/// <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    var discoveryUri = crawlRequest.Discovery.Uri;

    return IsDisallowed(crawlRequest, discoveryUri);
}
/// <summary>
/// Runs the enabled crawl rules registered for the given rule type against a discovery.
/// </summary>
/// <param name="discovery">The discovery.</param>
/// <param name="crawlRuleType">Type of the rule; selects which rule collection is evaluated.</param>
/// <param name="arachnodeDAO">The arachnode DAO; also used to record exceptions thrown by rules.</param>
/// <returns>
/// <c>true</c> if an enabled rule disallows the discovery or a rule throws; otherwise, <c>false</c>
/// (including when <paramref name="crawlRuleType"/> is not PreRequest, PreGet or PostRequest).
/// </returns>
public override bool IsDisallowed(Discovery<TArachnodeDAO> discovery, CrawlRuleType crawlRuleType, IArachnodeDAO arachnodeDAO)
{
    // Select the rule groups for the requested stage. The non-generic IEnumerable is used so the
    // selection works with whatever concrete collection type the rule fields expose.
    System.Collections.IEnumerable crawlRuleGroups;

    switch (crawlRuleType)
    {
        case CrawlRuleType.PreRequest:
            crawlRuleGroups = _preRequestCrawlRules.Values;
            break;
        case CrawlRuleType.PreGet:
            crawlRuleGroups = _preGetCrawlRules.Values;
            break;
        case CrawlRuleType.PostRequest:
            crawlRuleGroups = _postRequestCrawlRules.Values;
            break;
        default:
            return false;
    }

    foreach (List<ACrawlRule<TArachnodeDAO>> crawlRules in crawlRuleGroups)
    {
        foreach (ACrawlRule<TArachnodeDAO> crawlRule in crawlRules)
        {
            try
            {
                if (crawlRule.IsEnabled && crawlRule.IsDisallowed(discovery, arachnodeDAO))
                {
                    discovery.IsDisallowed = true;

                    return true;
                }
            }
            catch (Exception exception)
            {
                // A failing rule is recorded and treated as a disallow; discovery.IsDisallowed is not set on this path.
                arachnodeDAO.InsertException(discovery.Uri.AbsoluteUri, discovery.Uri.AbsoluteUri, exception, false);

                return true;
            }
        }
    }

    return false;
}
/// <summary>
/// Example action: resubmits the crawl request through the PolitenessManager when the response came
/// from a Uri ending in "503.html" or carries the ServiceUnavailable status code.
/// </summary>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
public override void PerformAction(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    //as an example...
    // Nothing to inspect without a WebClient, a response and a ResponseUri.
    if (crawlRequest.WebClient == null
        || crawlRequest.WebClient.HttpWebResponse == null
        || crawlRequest.WebClient.HttpWebResponse.ResponseUri == null)
    {
        return;
    }

    if (crawlRequest.WebClient.HttpWebResponse.ResponseUri.AbsoluteUri.EndsWith("503.html")
        || crawlRequest.WebClient.HttpWebResponse.StatusCode == HttpStatusCode.ServiceUnavailable)
    {
        crawlRequest.Crawl.Crawler.PolitenessManager.ResubmitCrawlRequest(crawlRequest, false, arachnodeDAO);
    }
}
/// <summary>
/// Determines whether the specified discovery is disallowed by delegating to the overload
/// that takes the discovery's Uri.
/// </summary>
/// <param name="discovery">The discovery.</param>
/// <param name="arachnodeDAO">The arachnode DAO (not forwarded).</param>
/// <returns>
/// <c>true</c> if the specified discovery is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(Discovery<TArachnodeDAO> discovery, IArachnodeDAO arachnodeDAO)
{
    var discoveryUri = discovery.Uri;

    return IsDisallowed(discovery, discoveryUri);
}
/// <summary>
/// Processes a FilesRow after crawling: wraps the row in a CrawlRequest, restores its response headers,
/// determines its DataType, optionally re-inserts the File, manages it through the FileManager and
/// performs the PostRequest crawl actions.
/// </summary>
/// <param name="applicationSettings">The application settings.</param>
/// <param name="webSettings">The web settings.</param>
/// <param name="crawler">The crawler handed to the Crawl created for this file.</param>
/// <param name="filesRow">The files row.</param>
/// <param name="webClient">The web client assigned to the CrawlRequest; its response headers are cleared and rebuilt from the row.</param>
/// <param name="cache">The cache.</param>
/// <param name="actionManager">The action manager.</param>
/// <param name="consoleManager">The console manager.</param>
/// <param name="crawlerPeerManager">The crawler peer manager (not used by this method).</param>
/// <param name="discoveryManager">The discovery manager.</param>
/// <param name="fileManager">The file manager.</param>
/// <param name="memoryManager">The memory manager (not used by this method).</param>
/// <param name="ruleManager">The rule manager.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
public static void ProcessFile(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler<TArachnodeDAO> crawler, ArachnodeDataSet.FilesRow filesRow, WebClient<TArachnodeDAO> webClient, Cache<TArachnodeDAO> cache, ActionManager<TArachnodeDAO> actionManager, ConsoleManager<TArachnodeDAO> consoleManager, CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager, DiscoveryManager<TArachnodeDAO> discoveryManager, FileManager<TArachnodeDAO> fileManager, MemoryManager<TArachnodeDAO> memoryManager, RuleManager<TArachnodeDAO> ruleManager, IArachnodeDAO arachnodeDAO)
{
    // Managers that are not supplied by the caller are created locally for the Crawl below.
    CookieManager cookieManager = new CookieManager();
    CrawlRequestManager<TArachnodeDAO> crawlRequestManager = new CrawlRequestManager<TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
    DataTypeManager<TArachnodeDAO> dataTypeManager = new DataTypeManager<TArachnodeDAO>(applicationSettings, webSettings);
    EncodingManager<TArachnodeDAO> encodingManager = new EncodingManager<TArachnodeDAO>(applicationSettings, webSettings);
    PolitenessManager<TArachnodeDAO> politenessManager = new PolitenessManager<TArachnodeDAO>(applicationSettings, webSettings, cache);
    ProxyManager<TArachnodeDAO> proxyManager = new ProxyManager<TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
    HtmlManager<TArachnodeDAO> htmlManager = new HtmlManager<TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);

    Crawl<TArachnodeDAO> crawl = new Crawl<TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

    //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
    CrawlRequest<TArachnodeDAO> crawlRequest = new CrawlRequest<TArachnodeDAO>(new Discovery<TArachnodeDAO>(filesRow.AbsoluteUri), 1, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

    crawlRequest.Crawl = crawl;
    crawlRequest.Discovery.DiscoveryType = DiscoveryType.File;
    crawlRequest.Discovery.ID = filesRow.ID;
    crawlRequest.Data = filesRow.Source;
    crawlRequest.ProcessData = true;
    crawlRequest.WebClient = webClient;

    crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

    //parse the ResponseHeaders from the FilesRow.ResponseHeaders string...
    foreach (string responseHeader in filesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
    {
        // The header name is the text before the first ':'; the value is extracted from the full header string by name.
        string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

        string name = responseHeaderSplit[0];
        string value = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, name, true).Value;

        crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
    }

    //refresh the DataTypes in the DataTypeManager... (if necessary)...
    if (dataTypeManager.AllowedDataTypes.Count == 0)
    {
        dataTypeManager.RefreshDataTypes();
    }

    crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

    if (applicationSettings.InsertFiles)
    {
        // The source bytes are only stored when InsertFileSource is set; otherwise an empty array is inserted.
        crawlRequest.Discovery.ID = arachnodeDAO.InsertFile(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertFileSource ? crawlRequest.Data : new byte[] { }, crawlRequest.DataType.FullTextIndexType, applicationSettings.ClassifyAbsoluteUris);
    }

    crawlRequest.ManagedDiscovery = fileManager.ManageFile(crawlRequest, crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractFileMetaData, applicationSettings.InsertFileMetaData, applicationSettings.SaveDiscoveredFilesToDisk);

    actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

    discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
}
/// <summary>
/// Initializes the WebPageManager by forwarding every argument to the base constructor.
/// </summary>
/// <param name="applicationSettings">The application settings.</param>
/// <param name="webSettings">The web settings.</param>
/// <param name="discoveryManager">The discovery manager.</param>
/// <param name="htmlManager">The HTML manager.</param>
/// <param name="arachnodeDAO">Must be thread-safe.</param>
public WebPageManager(
    ApplicationSettings applicationSettings,
    WebSettings webSettings,
    DiscoveryManager<TArachnodeDAO> discoveryManager,
    HtmlManager<TArachnodeDAO> htmlManager,
    IArachnodeDAO arachnodeDAO)
    : base(applicationSettings, webSettings, discoveryManager, htmlManager, arachnodeDAO)
{
}
/// <summary>
/// Example action: remembers the Crawler of the first crawl request it sees in <c>_crawler</c>.
/// </summary>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="arachnodeDAO">The arachnode DAO (not used by this action).</param>
public override void PerformAction(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    //here you would insert/update your data storage...

    // Already captured on an earlier call: nothing to do.
    if (_crawler != null)
    {
        return;
    }

    _crawler = crawlRequest.Crawl.Crawler;
}
/// <summary>
/// Initializes the file manager: forwards the settings to the base class and stores the
/// discovery manager and the DAO.
/// </summary>
/// <param name="applicationSettings">The application settings (passed to the base constructor).</param>
/// <param name="webSettings">The web settings (passed to the base constructor).</param>
/// <param name="discoveryManager">The discovery manager.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
protected AFileManager(
    ApplicationSettings applicationSettings,
    WebSettings webSettings,
    DiscoveryManager<TArachnodeDAO> discoveryManager,
    IArachnodeDAO arachnodeDAO)
    : base(applicationSettings, webSettings)
{
    _arachnodeDAO = arachnodeDAO;
    _discoveryManager = discoveryManager;
}
/// <summary>
/// Discovery-level check for this rule. This implementation never disallows a discovery.
/// </summary>
/// <param name="discovery">The discovery (not inspected).</param>
/// <param name="arachnodeDAO">The arachnode DAO (not used).</param>
/// <returns>Always <c>false</c>.</returns>
public override bool IsDisallowed(Discovery<TArachnodeDAO> discovery, IArachnodeDAO arachnodeDAO)
{
    return false;
}