/// <summary>
        ///     Determines whether the specified crawl request is disallowed by the robots.txt file of its host.
        /// </summary>
        /// <remarks>
        ///     The robots.txt file is requested through a dedicated Crawl when no disallowed paths are held by
        ///     <c>crawlRequest.Politeness</c>, or when the held paths are more than one day old.  The parsed crawl
        ///     delay and disallowed paths are stored back on <c>crawlRequest.Politeness</c>.  No check is made when
        ///     the request has no Crawl, when the robots.txt AbsoluteUri itself is disallowed, or when the request
        ///     has no Politeness.
        /// </remarks>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO, used to obtain the Discovery for the robots.txt AbsoluteUri.</param>
        /// <returns>
        ///     <c>true</c> if the specified crawl request is disallowed; otherwise, <c>false</c>.
        /// </returns>
        public override bool IsDisallowed(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            //ANODET: When you add the multi-server caching, the robots.txt file will need to be sent to all other CachePeers.

            //if we're being called by the Engine prior to assigning to a Crawl there is nothing to process the robots.txt request with...
            if (crawlRequest.Crawl == null)
            {
                return false;
            }

            string hostRootAbsoluteUri = crawlRequest.Discovery.Uri.Scheme + Uri.SchemeDelimiter + crawlRequest.Discovery.Uri.Host;
            string robotsDotTextAbsoluteUri = hostRootAbsoluteUri + "/robots.txt";

            crawlRequest.OutputIsDisallowedReason = OutputIsDisallowedReason;

            //if the robots.txt AbsoluteUri is itself disallowed, it cannot be requested...
            if (UserDefinedFunctions.IsDisallowedForAbsoluteUri(robotsDotTextAbsoluteUri, false, false))
            {
                return false;
            }

            //without a Politeness there is nowhere to read or store the disallowed paths; guard here rather than after the first dereference.
            if (crawlRequest.Politeness == null)
            {
                return false;
            }

            //(re)load the robots.txt rules when none are held, or when the held rules are older than one day.
            if (crawlRequest.Politeness.DisallowedPaths == null || DateTime.Now.Subtract(crawlRequest.Politeness.DisallowedPathsSince) > TimeSpan.FromDays(1))
            {
                CrawlRequest <TArachnodeDAO> robotsDotTextRequest = new CrawlRequest <TArachnodeDAO>(crawlRequest, crawlRequest.Crawl.Crawler.Cache.GetDiscovery(robotsDotTextAbsoluteUri, arachnodeDAO), 1, 1, (short)UriClassificationType.Host, (short)UriClassificationType.Host, double.MaxValue, RenderType.None, RenderType.None);
                robotsDotTextRequest.Discovery.DiscoveryState = DiscoveryState.Undiscovered;
                robotsDotTextRequest.Politeness = crawlRequest.Politeness;

                Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(crawlRequest.Crawl.Crawler.ApplicationSettings, crawlRequest.Crawl.Crawler.WebSettings, crawlRequest.Crawl.Crawler, crawlRequest.Crawl.Crawler.ActionManager, crawlRequest.Crawl.Crawler.ConsoleManager, crawlRequest.Crawl.Crawler.CookieManager, crawlRequest.Crawl.Crawler.CrawlRequestManager, crawlRequest.Crawl.Crawler.DataTypeManager, crawlRequest.Crawl.Crawler.DiscoveryManager, crawlRequest.Crawl.Crawler.EncodingManager, crawlRequest.Crawl.Crawler.HtmlManager, crawlRequest.Crawl.Crawler.PolitenessManager, crawlRequest.Crawl.Crawler.ProxyManager, crawlRequest.Crawl.Crawler.RuleManager, false);

                robotsDotTextRequest.Crawl = crawl;

                crawl.ProcessCrawlRequest(robotsDotTextRequest, false, false);

                crawlRequest.Politeness.DisallowedPathsSince = DateTime.Now;

                //The DataManager will not download the byte stream if ApplicationSettings.AssignFileAndImageDicoveries is set to false.  This is by design.
                if (robotsDotTextRequest.Data != null && robotsDotTextRequest.Data.Length == 0 && robotsDotTextRequest.WebClient.WebException == null)
                {
                    //string.Equals tolerates a null ContentEncoding, which ToLowerInvariant() would throw on.
                    string contentEncoding = robotsDotTextRequest.WebClient.HttpWebResponse.ContentEncoding;
                    bool isGzip = string.Equals(contentEncoding, "gzip", StringComparison.OrdinalIgnoreCase);
                    bool isDeflate = string.Equals(contentEncoding, "deflate", StringComparison.OrdinalIgnoreCase);

                    //download the robots.txt file itself - not the AbsoluteUri of the CrawlRequest being checked.
                    robotsDotTextRequest.Data = robotsDotTextRequest.WebClient.DownloadHttpData(robotsDotTextAbsoluteUri, isGzip, isDeflate, crawlRequest.Crawl.Crawler.CookieContainer);
                }

                SiteCrawler.Value.RobotsDotText robotsDotText = _robotsDotTextManager.ParseRobotsDotTextSource(new Uri(hostRootAbsoluteUri), robotsDotTextRequest.Data);

                crawlRequest.Politeness.CrawlDelayInMilliseconds = robotsDotText.CrawlDelay * 1000;
                crawlRequest.Politeness.DisallowedPaths          = robotsDotText.DisallowedPaths;
            }

            if (crawlRequest.Politeness.DisallowedPaths == null)
            {
                return false;
            }

            //decode the requested AbsoluteUri once; it does not change while the paths are enumerated.
            string decodedAbsoluteUri = HttpUtility.UrlDecode(crawlRequest.Discovery.Uri.AbsoluteUri);

            foreach (string disallowedPath in crawlRequest.Politeness.DisallowedPaths)
            {
                string decodedDisallowedPath = HttpUtility.UrlDecode(disallowedPath);

                //a null path cannot be compared, and an empty path would be a prefix of every AbsoluteUri; neither names a disallowed location.
                if (string.IsNullOrEmpty(decodedDisallowedPath))
                {
                    continue;
                }

                //AbsoluteUris are not linguistic text; compare ordinally so the current culture cannot change the result.
                if (decodedAbsoluteUri.StartsWith(decodedDisallowedPath, StringComparison.Ordinal))
                {
                    crawlRequest.IsDisallowedReason = "Prohibited by robots.txt.";

                    return true;
                }
            }

            return false;
        }
Exemple #2
0
        /// <summary>
        ///     Runs the disallow rules against a URI and reports whether any of them rejects it.
        /// </summary>
        /// <remarks>
        ///     The rules are evaluated in a fixed order: AbsoluteUri, Domain, Extension, FileExtension, Host, Scheme,
        ///     repeating AbsoluteUri, maximum directory depth, named anchor and query string.  The raw result of each
        ///     rule is inverted when the matching <c>_negateIsDisallowedFor...</c> field is set.  Evaluation stops at
        ///     the first rule that rejects the URI, and that rule's description is stored in
        ///     <c>aDisallowed.IsDisallowedReason</c>.
        /// </remarks>
        /// <param name = "aDisallowed">The object that receives <c>OutputIsDisallowedReason</c> and, on rejection, <c>IsDisallowedReason</c>.</param>
        /// <param name = "uri">The URI to evaluate.</param>
        /// <returns>
        ///     <c>true</c> if any rule rejects the URI; otherwise, <c>false</c>.
        /// </returns>
        protected bool IsDisallowed(ADisallowed <TArachnodeDAO> aDisallowed, Uri uri)
        {
            aDisallowed.OutputIsDisallowedReason = OutputIsDisallowedReason;

            string absoluteUri = uri.AbsoluteUri;

            // For every rule below: "raw result != negate flag" yields the raw result when the flag is
            // false and the inverted result when the flag is true.

            // AbsoluteUri.
            if (UserDefinedFunctions.IsDisallowedForAbsoluteUri(absoluteUri, true, false) != _negateIsDisallowedForAbsoluteUri)
            {
                aDisallowed.IsDisallowedReason = "Disallowed by AbsoluteUri.";
                return true;
            }

            // Domain.
            if (UserDefinedFunctions.IsDisallowedForDomain(absoluteUri, false, false) != _negateIsDisallowedForDomain)
            {
                aDisallowed.IsDisallowedReason = "Disallowed by Domain.";
                return true;
            }

            // Extension.
            if (UserDefinedFunctions.IsDisallowedForExtension(absoluteUri, false, false) != _negateIsDisallowedForExtension)
            {
                aDisallowed.IsDisallowedReason = "Disallowed by Extension.";
                return true;
            }

            // FileExtension.
            if (UserDefinedFunctions.IsDisallowedForFileExtension(absoluteUri, false, false) != _negateIsDisallowedForFileExtension)
            {
                aDisallowed.IsDisallowedReason = "Disallowed by FileExtension.";
                return true;
            }

            // Host.
            if (UserDefinedFunctions.IsDisallowedForHost(absoluteUri, false, false) != _negateIsDisallowedForHost)
            {
                aDisallowed.IsDisallowedReason = "Disallowed by Host.";
                return true;
            }

            // Scheme.
            if (UserDefinedFunctions.IsDisallowedForScheme(absoluteUri, false, false) != _negateIsDisallowedForScheme)
            {
                aDisallowed.IsDisallowedReason = "Disallowed by Scheme.";
                return true;
            }

            // Repeating AbsoluteUri, as detected by _detectRepeatingAbsoluteUrisRegex.
            if (_detectRepeatingAbsoluteUrisRegex.IsMatch(absoluteUri) != _negateIsDisallowedForRepeatingAbsoluteUri)
            {
                aDisallowed.IsDisallowedReason = "Disallowed by repeating AbsoluteUri.";
                return true;
            }

            // Maximum directory depth: the length lost by stripping every '/' is the number of slashes.
            // NOTE(review): the "+ 2" presumably discounts the two slashes of the scheme delimiter - confirm.
            string absoluteUriWithoutSlashes = absoluteUri.Replace("/", string.Empty);
            bool exceedsMaximumDirectoryDepth = absoluteUriWithoutSlashes.Length < absoluteUri.Length - (_maximumDirectoryDepth + 2);

            if (exceedsMaximumDirectoryDepth != _negateIsDisallowedForMaximumDirectoryDepth)
            {
                aDisallowed.IsDisallowedReason = "Disallowed by maximum directory depth.";
                return true;
            }

            // Named anchor: Uri.Fragment is inspected in place of the earlier regex check
            // (_detectAbsoluteUrisWithNamedAnchorRegex).
            bool hasDisallowedNamedAnchor = _disallowNamedAnchors && !string.IsNullOrEmpty(uri.Fragment);

            if (hasDisallowedNamedAnchor != _negateIsDisallowedForNamedAnchor)
            {
                aDisallowed.IsDisallowedReason = "Disallowed by named anchor.";
                return true;
            }

            // Query string: Uri.Query is inspected in place of the earlier regex check
            // (_detectAbsoluteUrisWithQueryStringRegex).
            bool hasDisallowedQueryString = _disallowQueryStrings && !string.IsNullOrEmpty(uri.Query);

            if (hasDisallowedQueryString != _negateIsDisallowedForQueryString)
            {
                aDisallowed.IsDisallowedReason = "Disallowed by query string.";
                return true;
            }

            return false;
        }