/// <summary>
/// Determines the data type of a crawl request from the Content-Type header of its HTTP response.
/// </summary>
/// <param name = "crawlRequest">The crawl request whose discovery URI and HTTP response are inspected.</param>
/// <returns>
/// When the response has a content type that is a key of <c>AllowedDataTypes</c>, the result of
/// <c>DetermineDataType(contentType, extension)</c>. Otherwise a <c>DataType</c> built from the
/// <c>_contentTypesByName</c> entry for that content type, or from the "UNKNOWN" entry when the content
/// type is not registered. When there is no response, or the response has no content type, a
/// <c>DataType</c> built from the "UNKNOWN" entry with a null content type and a null extension.
/// </returns>
public override DataType DetermineDataType(CrawlRequest <TArachnodeDAO> crawlRequest)
{
    // The extension is extracted before the response is examined (same order as before, so any
    // failure while extracting it still surfaces first).
    // Invariant lower-casing: culture-sensitive ToLower() changes 'I' to a dotless 'ı' under
    // Turkish cultures, which would corrupt extensions such as ".GIF".
    string extension = UserDefinedFunctions.ExtractFileExtension(crawlRequest.Discovery.Uri.AbsoluteUri.ToLowerInvariant()).Value;

    string rawContentType = crawlRequest.WebClient.HttpWebResponse == null
        ? null
        : crawlRequest.WebClient.HttpWebResponse.ContentType;

    if (string.IsNullOrEmpty(rawContentType))
    {
        return new DataType(null, _contentTypesByName["UNKNOWN"], DiscoveryType.None, null, null, null);
    }

    // Keep only the media type (drop parameters such as "; charset=utf-8"), strip quotes, and trim:
    // servers may emit optional whitespace before the ';', which previously produced a key like
    // "text/html " that matched nothing.
    string contentType = rawContentType.Split(';')[0].Replace("\"", "").Trim().ToLowerInvariant();

    if (AllowedDataTypes.ContainsKey(contentType))
    {
        return DetermineDataType(contentType, extension);
    }

    // Content types that are not allowed are still reported, using their registered entry when one
    // exists and the "UNKNOWN" entry otherwise.
    string contentTypeKey = _contentTypesByName.ContainsKey(contentType) ? contentType : "UNKNOWN";

    return new DataType(contentType, _contentTypesByName[contentTypeKey], DiscoveryType.None, extension, null, null);
}
/// <summary>
/// Determines whether the specified absolute URI is restricted for the crawl request, given a bitwise
/// combination of <c>UriClassificationType</c> flags.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "absoluteUri">The absolute URI.</param>
/// <param name = "uriClassificationType">Type of the URI classification (flags combined as a <c>short</c>).</param>
/// <returns>
/// <c>true</c> if the specified crawl request is restricted; otherwise, <c>false</c>.
/// </returns>
/// <remarks>
/// The Domain, Extension, FileExtension, Host and Scheme flags each restrict the URI when the
/// corresponding component extracted from the crawl request's discovery URI differs from the one
/// extracted from <paramref name = "absoluteUri" />.
/// The directory flags compare the directory of the originator (or of the parent when there is no
/// originator) with the directory of <paramref name = "absoluteUri" />; directory depth is measured
/// by counting backslashes in the value returned by <c>Path.GetDirectoryName</c>.
/// Side effect: in the OriginalDirectoryLevelUp / OriginalDirectoryLevelDown branches that have neither
/// OriginalDirectory nor OriginalDirectoryLevel set, <c>crawlRequest.IsStorable</c> is set to
/// <c>false</c> when the URI equals the discovery URI and <c>CurrentDepth</c> is 1.
/// </remarks>
protected override bool IsRestricted(CrawlRequest <TArachnodeDAO> crawlRequest, string absoluteUri, short uriClassificationType)
{
    if (uriClassificationType == (short)UriClassificationType.None)
    {
        return false;
    }

    if ((uriClassificationType & (short)UriClassificationType.Domain) == (short)UriClassificationType.Domain)
    {
        if (UserDefinedFunctions.ExtractDomain(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractDomain(absoluteUri))
        {
            return true;
        }
    }

    if ((uriClassificationType & (short)UriClassificationType.Extension) == (short)UriClassificationType.Extension)
    {
        if (UserDefinedFunctions.ExtractExtension(crawlRequest.Discovery.Uri.AbsoluteUri, false) != UserDefinedFunctions.ExtractExtension(absoluteUri, false))
        {
            return true;
        }
    }

    if ((uriClassificationType & (short)UriClassificationType.FileExtension) == (short)UriClassificationType.FileExtension)
    {
        if (UserDefinedFunctions.ExtractFileExtension(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractFileExtension(absoluteUri))
        {
            return true;
        }
    }

    if ((uriClassificationType & (short)UriClassificationType.Host) == (short)UriClassificationType.Host)
    {
        if (UserDefinedFunctions.ExtractHost(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractHost(absoluteUri))
        {
            return true;
        }
    }

    if ((uriClassificationType & (short)UriClassificationType.Scheme) == (short)UriClassificationType.Scheme)
    {
        if (UserDefinedFunctions.ExtractScheme(crawlRequest.Discovery.Uri.AbsoluteUri, false) != UserDefinedFunctions.ExtractScheme(absoluteUri, false))
        {
            return true;
        }
    }

    // The directory rules below are only evaluated for classification values at or above
    // OriginalDirectoryLevelUp.
    if (uriClassificationType < (short)UriClassificationType.OriginalDirectoryLevelUp)
    {
        return false;
    }

    // Fall back to the parent when the request has no originator.
    string originatorLocalPath = crawlRequest.Originator == null
        ? crawlRequest.Parent.Uri.LocalPath
        : crawlRequest.Originator.Uri.LocalPath;

    // NOTE(review): the paths are HTML-encoded before the directory is extracted; this is kept
    // as-is, but it looks unusual for a file-system style path - confirm it is intentional.
    // GetDirectoryName returns null for a root path; that case is normalised to a single backslash.
    string originatorDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(originatorLocalPath)) ?? "\\";
    string absoluteUriDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(new Uri(absoluteUri).LocalPath)) ?? "\\";

    // Directory depth = number of backslashes in the directory string.
    // NOTE(review): assumes GetDirectoryName yields backslash-separated paths - confirm on non-Windows hosts.
    int originatorDepth = originatorDirectory.Length - originatorDirectory.Replace("\\", string.Empty).Length;
    int absoluteUriDepth = absoluteUriDirectory.Length - absoluteUriDirectory.Replace("\\", string.Empty).Length;

    bool levelUp = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevelUp) == (short)UriClassificationType.OriginalDirectoryLevelUp;
    bool levelDown = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevelDown) == (short)UriClassificationType.OriginalDirectoryLevelDown;
    bool originalDirectory = (uriClassificationType & (short)UriClassificationType.OriginalDirectory) == (short)UriClassificationType.OriginalDirectory;
    bool originalDirectoryLevel = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevel) == (short)UriClassificationType.OriginalDirectoryLevel;

    if (levelUp)
    {
        if (originalDirectory || originalDirectoryLevel)
        {
            if (originalDirectory && originatorDepth <= absoluteUriDepth)
            {
                // At the same depth or deeper, only the originator's own directory and the root
                // are permitted; this decision is final for the whole method.
                return originatorDirectory != absoluteUriDirectory && absoluteUriDirectory != "\\";
            }

            if (originalDirectoryLevel && originatorDepth < absoluteUriDepth)
            {
                return true;
            }
        }
        else if (originatorDepth <= absoluteUriDepth)
        {
            if (crawlRequest.Discovery.Uri.AbsoluteUri != absoluteUri)
            {
                return true;
            }

            if (crawlRequest.CurrentDepth == 1)
            {
                crawlRequest.IsStorable = false;

                return false;
            }
        }
    }

    if (levelDown)
    {
        if (originalDirectory || originalDirectoryLevel)
        {
            if (originalDirectory)
            {
                if (originatorDepth >= absoluteUriDepth && originatorDirectory != absoluteUriDirectory)
                {
                    return true;
                }

                // Ordinal comparison, consistent with the ordinal '!=' comparisons used for every
                // other path check here; the culture-sensitive default can mis-match paths.
                if (!absoluteUriDirectory.StartsWith(originatorDirectory, StringComparison.Ordinal))
                {
                    return true;
                }
            }

            if (originalDirectoryLevel && originatorDepth > absoluteUriDepth)
            {
                return true;
            }
        }
        else if (originatorDepth >= absoluteUriDepth)
        {
            if (crawlRequest.Discovery.Uri.AbsoluteUri != absoluteUri)
            {
                return true;
            }

            if (crawlRequest.CurrentDepth == 1)
            {
                crawlRequest.IsStorable = false;

                return false;
            }
        }
    }

    // Exact-match rules apply only when neither direction flag is set. The previous condition tested
    // OriginalDirectoryLevelDown twice, so these rules also ran when only OriginalDirectoryLevelUp
    // was set and restricted every URI above the originator directory, defeating that flag.
    if (!levelUp && !levelDown)
    {
        if (originalDirectory && originatorDirectory != absoluteUriDirectory)
        {
            return true;
        }

        if (originalDirectoryLevel && originatorDepth != absoluteUriDepth)
        {
            return true;
        }
    }

    return false;
}