/// <summary>
/// Determines whether the specified crawl request is disallowed by the robots.txt file of its site.
/// </summary>
/// <remarks>
/// Nothing is checked (and <c>false</c> is returned) when the request has not been assigned to a Crawl,
/// or when the robots.txt AbsoluteUri itself is reported as disallowed by
/// <c>UserDefinedFunctions.IsDisallowedForAbsoluteUri</c>.
/// Otherwise, when the cached disallowed paths are missing or older than one day, robots.txt is
/// requested through a dedicated Crawl, parsed, and the resulting crawl delay and disallowed paths
/// are stored on the request's Politeness.  The request is disallowed when its URL-decoded AbsoluteUri
/// starts with any URL-decoded disallowed path.
/// </remarks>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
/// <returns>
/// 	<c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
/// </returns>
public override bool IsDisallowed(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    //ANODET: When you add the multi-server caching, the robots.txt file will need to be sent to all other CachePeers.

    //if we're not being called by the Engine prior to assigning to a Crawl...
    if (crawlRequest.Crawl == null)
    {
        return false;
    }

    // robots.txt is scoped to scheme + authority.  Uri.Authority (unlike Uri.Host) keeps a non-default port,
    // so the file is requested from the same server the crawl request targets and the parsed paths can
    // prefix-match the request's AbsoluteUri, which also carries the port.
    string siteRootAbsoluteUri = crawlRequest.Discovery.Uri.Scheme + Uri.SchemeDelimiter + crawlRequest.Discovery.Uri.Authority;
    string robotsDotTextAbsoluteUri = siteRootAbsoluteUri + "/robots.txt";

    crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

    if (!UserDefinedFunctions.IsDisallowedForAbsoluteUri(robotsDotTextAbsoluteUri, false, false))
    {
        // (Re)load robots.txt when nothing has been cached yet or the cached copy is older than one day.
        bool isRobotsDotTextStale = crawlRequest.Politeness.DisallowedPaths == null
                                    || DateTime.Now.Subtract(crawlRequest.Politeness.DisallowedPathsSince) > TimeSpan.FromDays(1);

        if (isRobotsDotTextStale)
        {
            CrawlRequest<TArachnodeDAO> robotsDotTextRequest = new CrawlRequest<TArachnodeDAO>(
                crawlRequest,
                crawlRequest.Crawl.Crawler.Cache.GetDiscovery(robotsDotTextAbsoluteUri, arachnodeDAO),
                1,
                1,
                (short)UriClassificationType.Host,
                (short)UriClassificationType.Host,
                double.MaxValue,
                RenderType.None,
                RenderType.None);

            robotsDotTextRequest.Discovery.DiscoveryState = DiscoveryState.Undiscovered;
            robotsDotTextRequest.Politeness = crawlRequest.Politeness;

            // A separate Crawl, wired to the managers of the owning Crawler, processes the robots.txt request.
            Crawl<TArachnodeDAO> crawl = new Crawl<TArachnodeDAO>(
                crawlRequest.Crawl.Crawler.ApplicationSettings,
                crawlRequest.Crawl.Crawler.WebSettings,
                crawlRequest.Crawl.Crawler,
                crawlRequest.Crawl.Crawler.ActionManager,
                crawlRequest.Crawl.Crawler.ConsoleManager,
                crawlRequest.Crawl.Crawler.CookieManager,
                crawlRequest.Crawl.Crawler.CrawlRequestManager,
                crawlRequest.Crawl.Crawler.DataTypeManager,
                crawlRequest.Crawl.Crawler.DiscoveryManager,
                crawlRequest.Crawl.Crawler.EncodingManager,
                crawlRequest.Crawl.Crawler.HtmlManager,
                crawlRequest.Crawl.Crawler.PolitenessManager,
                crawlRequest.Crawl.Crawler.ProxyManager,
                crawlRequest.Crawl.Crawler.RuleManager,
                false);

            robotsDotTextRequest.Crawl = crawl;

            crawl.ProcessCrawlRequest(robotsDotTextRequest, false, false);

            crawlRequest.Politeness.DisallowedPathsSince = DateTime.Now;

            //The DataManager will not download the byte stream if ApplicationSettings.AssignFileAndImageDiscoveries is set to false.  This is by design.
            if (robotsDotTextRequest.Data != null && robotsDotTextRequest.Data.Length == 0 && robotsDotTextRequest.WebClient.WebException == null)
            {
                // Read the encoding once and compare without allocating lowered copies; string.Equals also tolerates null.
                string contentEncoding = robotsDotTextRequest.WebClient.HttpWebResponse.ContentEncoding;

                // NOTE(review): the AbsoluteUri passed here is the crawl request's, not the robots.txt one - kept as in the
                // original because DownloadHttpData's use of this argument is not visible from here; confirm it is intended.
                robotsDotTextRequest.Data = robotsDotTextRequest.WebClient.DownloadHttpData(
                    crawlRequest.Discovery.Uri.AbsoluteUri,
                    string.Equals(contentEncoding, "gzip", StringComparison.OrdinalIgnoreCase),
                    string.Equals(contentEncoding, "deflate", StringComparison.OrdinalIgnoreCase),
                    crawlRequest.Crawl.Crawler.CookieContainer);
            }

            SiteCrawler.Value.RobotsDotText robotsDotText = _robotsDotTextManager.ParseRobotsDotTextSource(new Uri(siteRootAbsoluteUri), robotsDotTextRequest.Data);

            crawlRequest.Politeness.CrawlDelayInMilliseconds = robotsDotText.CrawlDelay * 1000;
            crawlRequest.Politeness.DisallowedPaths = robotsDotText.DisallowedPaths;
        }

        if (crawlRequest.Politeness != null && crawlRequest.Politeness.DisallowedPaths != null)
        {
            // Loop-invariant: decode the request's AbsoluteUri once rather than once per disallowed path.
            string decodedAbsoluteUri = HttpUtility.UrlDecode(crawlRequest.Discovery.Uri.AbsoluteUri);

            foreach (string disallowedPath in crawlRequest.Politeness.DisallowedPaths)
            {
                // A null entry would make StartsWith throw and an empty entry would prefix-match every AbsoluteUri.
                if (string.IsNullOrEmpty(disallowedPath))
                {
                    continue;
                }

                // Ordinal: URI prefixes must be compared character by character, not by culture-specific rules.
                if (decodedAbsoluteUri.StartsWith(HttpUtility.UrlDecode(disallowedPath), StringComparison.Ordinal))
                {
                    crawlRequest.IsDisallowedReason = "Prohibited by robots.txt.";

                    return true;
                }
            }
        }
    }

    return false;
}
/// <summary>
/// Runs the disallow rules against a URI and reports the first rule that rejects it.
/// </summary>
/// <remarks>
/// The rules are evaluated in this order: AbsoluteUri, Domain, Extension, FileExtension, Host, Scheme,
/// repeating AbsoluteUri, maximum directory depth, named anchor and query string.
/// The outcome of each rule is inverted when the rule's <c>_negateIsDisallowedFor...</c> flag is set.
/// Evaluation stops at the first rejecting rule, whose description is stored in
/// <c>aDisallowed.IsDisallowedReason</c>; later rules are not evaluated.
/// </remarks>
/// <param name = "aDisallowed">The item that receives the OutputIsDisallowedReason setting and the disallowed reason.</param>
/// <param name = "uri">The URI to evaluate.</param>
/// <returns>
/// 	<c>true</c> if any rule rejects the URI; otherwise, <c>false</c>.
/// </returns>
protected bool IsDisallowed(ADisallowed<TArachnodeDAO> aDisallowed, Uri uri)
{
    aDisallowed.OutputIsDisallowedReason = OutputIsDisallowedReason;

    string absoluteUri = uri.AbsoluteUri;

    // Raw outcome of the rule currently being evaluated, before its negation flag is applied.
    bool ruleMatched;

    // --- AbsoluteUri ---
    ruleMatched = false;
    if (UserDefinedFunctions.IsDisallowedForAbsoluteUri(absoluteUri, true, false))
    {
        ruleMatched = true;
    }
    if (ruleMatched != _negateIsDisallowedForAbsoluteUri)
    {
        aDisallowed.IsDisallowedReason = "Disallowed by AbsoluteUri.";
        return true;
    }

    // --- Domain ---
    ruleMatched = false;
    if (UserDefinedFunctions.IsDisallowedForDomain(absoluteUri, false, false))
    {
        ruleMatched = true;
    }
    if (ruleMatched != _negateIsDisallowedForDomain)
    {
        aDisallowed.IsDisallowedReason = "Disallowed by Domain.";
        return true;
    }

    // --- Extension ---
    ruleMatched = false;
    if (UserDefinedFunctions.IsDisallowedForExtension(absoluteUri, false, false))
    {
        ruleMatched = true;
    }
    if (ruleMatched != _negateIsDisallowedForExtension)
    {
        aDisallowed.IsDisallowedReason = "Disallowed by Extension.";
        return true;
    }

    // --- FileExtension ---
    ruleMatched = false;
    if (UserDefinedFunctions.IsDisallowedForFileExtension(absoluteUri, false, false))
    {
        ruleMatched = true;
    }
    if (ruleMatched != _negateIsDisallowedForFileExtension)
    {
        aDisallowed.IsDisallowedReason = "Disallowed by FileExtension.";
        return true;
    }

    // --- Host ---
    ruleMatched = false;
    if (UserDefinedFunctions.IsDisallowedForHost(absoluteUri, false, false))
    {
        ruleMatched = true;
    }
    if (ruleMatched != _negateIsDisallowedForHost)
    {
        aDisallowed.IsDisallowedReason = "Disallowed by Host.";
        return true;
    }

    // --- Scheme ---
    ruleMatched = false;
    if (UserDefinedFunctions.IsDisallowedForScheme(absoluteUri, false, false))
    {
        ruleMatched = true;
    }
    if (ruleMatched != _negateIsDisallowedForScheme)
    {
        aDisallowed.IsDisallowedReason = "Disallowed by Scheme.";
        return true;
    }

    // --- Repeating AbsoluteUri ---
    ruleMatched = _detectRepeatingAbsoluteUrisRegex.IsMatch(absoluteUri);
    if (ruleMatched != _negateIsDisallowedForRepeatingAbsoluteUri)
    {
        aDisallowed.IsDisallowedReason = "Disallowed by repeating AbsoluteUri.";
        return true;
    }

    // --- Maximum directory depth ---
    // Removing every '/' shortens the AbsoluteUri by its slash count; the rule matches when that
    // shortening exceeds _maximumDirectoryDepth + 2.
    int lengthWithoutSlashes = absoluteUri.Replace("/", string.Empty).Length;
    ruleMatched = lengthWithoutSlashes < absoluteUri.Length - (_maximumDirectoryDepth + 2);
    if (ruleMatched != _negateIsDisallowedForMaximumDirectoryDepth)
    {
        aDisallowed.IsDisallowedReason = "Disallowed by maximum directory depth.";
        return true;
    }

    // --- Named anchor ---
    // uri.Fragment is only read when named anchors are disallowed.
    ruleMatched = _disallowNamedAnchors && !string.IsNullOrEmpty(uri.Fragment);
    if (ruleMatched != _negateIsDisallowedForNamedAnchor)
    {
        aDisallowed.IsDisallowedReason = "Disallowed by named anchor.";
        return true;
    }

    // --- Query string ---
    // uri.Query is only read when query strings are disallowed.
    ruleMatched = _disallowQueryStrings && !string.IsNullOrEmpty(uri.Query);
    if (ruleMatched != _negateIsDisallowedForQueryString)
    {
        aDisallowed.IsDisallowedReason = "Disallowed by query string.";
        return true;
    }

    return false;
}