Пример #1
0
        /// <summary>
        ///     Determines the data type of a crawl request from the Content-Type header of its web response
        ///     and the file extension extracted from the discovery's absolute URI.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request whose web response and discovery URI are inspected.</param>
        /// <returns>
        ///     The result of the (contentType, extension) overload when the content type is a key of AllowedDataTypes;
        ///     otherwise a DataType built from the matching _contentTypesByName entry, falling back to the
        ///     "UNKNOWN" entry when the content type is not registered or the response carries no content type.
        /// </returns>
        public override DataType DetermineDataType(CrawlRequest <TArachnodeDAO> crawlRequest)
        {
            // Lower-case with the invariant culture: the URI is machine data, and a culture-sensitive ToLower()
            // (e.g. tr-TR mapping 'I' to a dotless 'ı') would corrupt the extracted extension.
            string extension = UserDefinedFunctions.ExtractFileExtension(crawlRequest.Discovery.Uri.AbsoluteUri.ToLowerInvariant()).Value;

            if (crawlRequest.WebClient.HttpWebResponse == null || string.IsNullOrEmpty(crawlRequest.WebClient.HttpWebResponse.ContentType))
            {
                // No usable Content-Type header: report "UNKNOWN" without a content type or extension.
                return(new DataType(null, _contentTypesByName["UNKNOWN"], DiscoveryType.None, null, null, null));
            }

            // Keep only the media type (drop parameters such as "; charset=..."), strip quotes, and trim the
            // optional whitespace that may surround the ';' separator so dictionary lookups match.
            string contentType = crawlRequest.WebClient.HttpWebResponse.ContentType.Split(';')[0].ToLowerInvariant().Replace("\"", "").Trim();

            if (AllowedDataTypes.ContainsKey(contentType))
            {
                return(DetermineDataType(contentType, extension));
            }

            // Content types that are not allowed still get a DataType; unregistered ones map to "UNKNOWN".
            string contentTypeKey = _contentTypesByName.ContainsKey(contentType) ? contentType : "UNKNOWN";

            return(new DataType(contentType, _contentTypesByName[contentTypeKey], DiscoveryType.None, extension, null, null));
        }
Пример #2
0
        /// <summary>
        ///     Determines whether the specified absolute URI is restricted for the given crawl request,
        ///     according to the flags set in <paramref name = "uriClassificationType" />.
        /// </summary>
        /// <remarks>
        ///     The Domain, Extension, FileExtension, Host and Scheme flags each require the corresponding part of
        ///     <paramref name = "absoluteUri" /> to equal that of the crawl request's discovery URI.
        ///     The OriginalDirectory* flags compare the directory of <paramref name = "absoluteUri" /> with the
        ///     directory of the crawl request's originator (or of its parent when there is no originator).
        ///     Side effect: in the LevelUp/LevelDown branches that carry neither OriginalDirectory nor
        ///     OriginalDirectoryLevel, crawlRequest.IsStorable is set to false when the discovery URI equals
        ///     <paramref name = "absoluteUri" /> and crawlRequest.CurrentDepth is 1.
        /// </remarks>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "absoluteUri">The absolute URI.</param>
        /// <param name = "uriClassificationType">Bitwise combination of UriClassificationType values, as a short.</param>
        /// <returns>
        ///     <c>true</c> if the specified crawl request is restricted; otherwise, <c>false</c>.
        /// </returns>
        protected override bool IsRestricted(CrawlRequest <TArachnodeDAO> crawlRequest, string absoluteUri, short uriClassificationType)
        {
            if (uriClassificationType == (short)UriClassificationType.None)
            {
                return(false);
            }

            if ((uriClassificationType & (short)UriClassificationType.Domain) == (short)UriClassificationType.Domain)
            {
                if (UserDefinedFunctions.ExtractDomain(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractDomain(absoluteUri))
                {
                    return(true);
                }
            }

            if ((uriClassificationType & (short)UriClassificationType.Extension) == (short)UriClassificationType.Extension)
            {
                if (UserDefinedFunctions.ExtractExtension(crawlRequest.Discovery.Uri.AbsoluteUri, false) != UserDefinedFunctions.ExtractExtension(absoluteUri, false))
                {
                    return(true);
                }
            }

            if ((uriClassificationType & (short)UriClassificationType.FileExtension) == (short)UriClassificationType.FileExtension)
            {
                if (UserDefinedFunctions.ExtractFileExtension(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractFileExtension(absoluteUri))
                {
                    return(true);
                }
            }

            if ((uriClassificationType & (short)UriClassificationType.Host) == (short)UriClassificationType.Host)
            {
                if (UserDefinedFunctions.ExtractHost(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractHost(absoluteUri))
                {
                    return(true);
                }
            }

            if ((uriClassificationType & (short)UriClassificationType.Scheme) == (short)UriClassificationType.Scheme)
            {
                if (UserDefinedFunctions.ExtractScheme(crawlRequest.Discovery.Uri.AbsoluteUri, false) != UserDefinedFunctions.ExtractScheme(absoluteUri, false))
                {
                    return(true);
                }
            }

            // NOTE(review): this is a numeric comparison on a flags value; it relies on the directory-related
            // flags being numerically >= OriginalDirectoryLevelUp - confirm against the enum declaration.
            if (uriClassificationType >= (short)UriClassificationType.OriginalDirectoryLevelUp)
            {
                string crawlRequestOriginatorAbsoluteUriDirectory;

                // NOTE(review): LocalPath is HTML-encoded before Path.GetDirectoryName - presumably to neutralize
                // characters such as '<', '>' and '"' that Path rejects on .NET Framework; confirm.
                if (crawlRequest.Originator == null)
                {
                    crawlRequestOriginatorAbsoluteUriDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(crawlRequest.Parent.Uri.LocalPath));
                }
                else
                {
                    crawlRequestOriginatorAbsoluteUriDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(crawlRequest.Originator.Uri.LocalPath));
                }

                string absoluteUriDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(new Uri(absoluteUri).LocalPath));

                // Path.GetDirectoryName returns null for a root path; treat that as the root directory.
                if (crawlRequestOriginatorAbsoluteUriDirectory == null)
                {
                    crawlRequestOriginatorAbsoluteUriDirectory = "\\";
                }

                if (absoluteUriDirectory == null)
                {
                    absoluteUriDirectory = "\\";
                }

                // Directory depth is measured as the number of '\' separators in the directory string.
                int originatorDepth = crawlRequestOriginatorAbsoluteUriDirectory.Length - crawlRequestOriginatorAbsoluteUriDirectory.Replace("\\", string.Empty).Length;
                int absoluteUriDepth = absoluteUriDirectory.Length - absoluteUriDirectory.Replace("\\", string.Empty).Length;

                bool isLevelUp = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevelUp) == (short)UriClassificationType.OriginalDirectoryLevelUp;
                bool isLevelDown = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevelDown) == (short)UriClassificationType.OriginalDirectoryLevelDown;
                bool isOriginalDirectory = (uriClassificationType & (short)UriClassificationType.OriginalDirectory) == (short)UriClassificationType.OriginalDirectory;
                bool isOriginalDirectoryLevel = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevel) == (short)UriClassificationType.OriginalDirectoryLevel;

                if (isLevelUp)
                {
                    if (isOriginalDirectory || isOriginalDirectoryLevel)
                    {
                        if (isOriginalDirectory && originatorDepth <= absoluteUriDepth)
                        {
                            // At the same depth or deeper, only the originator's own directory and the root are allowed.
                            if (crawlRequestOriginatorAbsoluteUriDirectory != absoluteUriDirectory && absoluteUriDirectory != "\\")
                            {
                                return(true);
                            }

                            return(false);
                        }

                        if (isOriginalDirectoryLevel && originatorDepth < absoluteUriDepth)
                        {
                            return(true);
                        }
                    }
                    else if (originatorDepth <= absoluteUriDepth)
                    {
                        if (crawlRequest.Discovery.Uri.AbsoluteUri != absoluteUri)
                        {
                            return(true);
                        }

                        if (crawlRequest.CurrentDepth == 1)
                        {
                            crawlRequest.IsStorable = false;

                            return(false);
                        }
                    }
                }

                if (isLevelDown)
                {
                    if (isOriginalDirectory || isOriginalDirectoryLevel)
                    {
                        if (isOriginalDirectory)
                        {
                            if (originatorDepth >= absoluteUriDepth && crawlRequestOriginatorAbsoluteUriDirectory != absoluteUriDirectory)
                            {
                                return(true);
                            }

                            // Ordinal comparison: these are path strings, not linguistic text.
                            if (!absoluteUriDirectory.StartsWith(crawlRequestOriginatorAbsoluteUriDirectory, StringComparison.Ordinal))
                            {
                                return(true);
                            }
                        }

                        if (isOriginalDirectoryLevel && originatorDepth > absoluteUriDepth)
                        {
                            return(true);
                        }
                    }
                    else if (originatorDepth >= absoluteUriDepth)
                    {
                        if (crawlRequest.Discovery.Uri.AbsoluteUri != absoluteUri)
                        {
                            return(true);
                        }

                        if (crawlRequest.CurrentDepth == 1)
                        {
                            crawlRequest.IsStorable = false;

                            return(false);
                        }
                    }
                }

                // The strict checks apply only when neither LevelUp nor LevelDown relaxes them. The original
                // condition tested LevelDown twice, so a LevelUp-only classification was still subjected to
                // the exact directory/level match below.
                if (!isLevelUp && !isLevelDown)
                {
                    if (isOriginalDirectory && crawlRequestOriginatorAbsoluteUriDirectory != absoluteUriDirectory)
                    {
                        return(true);
                    }

                    if (isOriginalDirectoryLevel && originatorDepth != absoluteUriDepth)
                    {
                        return(true);
                    }
                }
            }


            return(false);
        }