/// <summary>
/// Looks up the priority registered for the host of the supplied URI.
/// </summary>
/// <param name="absoluteUri">The absolute URI whose host is used as the lookup key.</param>
/// <returns>
/// The value stored in <c>_priorities</c> for the extracted host, or 0 when the host has no entry.
/// </returns>
public override double? GetPriorityForHost(string absoluteUri)
{
    // The host (as produced by UserDefinedFunctions.ExtractHost) is the dictionary key.
    var host = UserDefinedFunctions.ExtractHost(absoluteUri).Value;

    double? priority;
    bool isKnownHost = _priorities.TryGetValue(host, out priority);

    if (!isKnownHost)
    {
        // Hosts without an entry fall back to 0.
        return 0;
    }

    return priority;
}
/// <summary>
/// Looks up the strength registered for the host of the supplied URI.
/// </summary>
/// <param name="absoluteUri">The absolute URI whose host is used as the lookup key.</param>
/// <returns>
/// The value stored in <c>_hyperLinks_MOST_POPULAR_HOSTS_BY_HOSTS</c> for the extracted host,
/// or 0 when the host has no entry.
/// </returns>
public override double? GetStrengthForHost(string absoluteUri)
{
    // The host (as produced by UserDefinedFunctions.ExtractHost) is the dictionary key.
    var host = UserDefinedFunctions.ExtractHost(absoluteUri).Value;

    double? hostStrength;
    bool isKnownHost = _hyperLinks_MOST_POPULAR_HOSTS_BY_HOSTS.TryGetValue(host, out hostStrength);

    if (!isKnownHost)
    {
        // Hosts without an entry fall back to 0.
        return 0;
    }

    return hostStrength;
}
/// <summary>
/// Determines whether the response held by a crawl request's web client indicates a redirect.
/// </summary>
/// <param name="crawlRequest">The crawl request whose web client request/response pair is inspected.</param>
/// <returns>
/// <c>true</c> when the response status code is 300-303, 307 or 308, or when the host of the response URI
/// differs from the host of the request URI; <c>false</c> otherwise, including when the crawl request has
/// no web client or no response.
/// </returns>
public override bool WasCrawlRequestRedirected(CrawlRequest<TArachnodeDAO> crawlRequest)
{
    // Without a response there is nothing to inspect.
    if (crawlRequest.WebClient == null || crawlRequest.WebClient.HttpWebResponse == null)
    {
        return false;
    }

    //http://msdn.microsoft.com/en-us/library/system.net.httpstatuscode.aspx
    var statusCode = (int)crawlRequest.WebClient.HttpWebResponse.StatusCode;

    // 300-303, 307 and 308 (Permanent Redirect, RFC 7538) are the redirect status codes.
    // 304-306 are not treated as redirects by this check.
    bool isRedirectStatus = (statusCode >= 300 && statusCode <= 303) || statusCode == 307 || statusCode == 308;

    if (isRedirectStatus)
    {
        return true;
    }

    // A redirect that was already followed shows up as a response URI on a different host than the request URI.
    var requestHost = UserDefinedFunctions.ExtractHost(crawlRequest.WebClient.HttpWebRequest.RequestUri.AbsoluteUri).Value;
    var responseHost = UserDefinedFunctions.ExtractHost(crawlRequest.WebClient.HttpWebResponse.ResponseUri.AbsoluteUri).Value;

    return requestHost != responseHost;
}
/// <summary>
/// Populates the supplied index document with the fields describing a discovery and passes it to
/// <c>AddDocument</c>.
/// </summary>
/// <param name="document">The document that receives the fields.</param>
/// <param name="discoveryID">The discovery ID; stored as "discoveryid" and used as the numeric part of "indexkey".</param>
/// <param name="discoveryType">The discovery type; its lower-cased name is stored as "discoverytype" and its first letter prefixes "indexkey".</param>
/// <param name="absoluteUri">The absolute URI of the discovery; also the source of the domain/extension/host/scheme fields.</param>
/// <param name="contentToIndex">The content indexed as "text" and searched for the title.</param>
/// <param name="codePage">The code page, stored as "codepage".</param>
/// <param name="fullTextIndexType">The full text index type, stored as "fulltextindextype".</param>
/// <param name="strength">The strength passed through to <c>AddDocument</c>.</param>
/// <param name="discoveryPath">The discovery path, stored (not indexed) as "discoverypath".</param>
/// <param name="threadNumber">The thread number; not used by this implementation.</param>
protected override void CreateDocument(Document document, long discoveryID, DiscoveryType discoveryType, string absoluteUri, string contentToIndex, int codePage, string fullTextIndexType, float strength, string discoveryPath, int threadNumber)
{
    //a bare bones example of what you could do to add a new field to the index...
    //if (discoveryType == DiscoveryType.WebPage)
    //{
    //    HtmlDocument htmlDocument = new HtmlDocument();
    //    htmlDocument.LoadHtml(contentToIndex);
    //    HtmlNode htmlNode = htmlDocument.DocumentNode.SelectSingleNode("/html/body");
    //    string body = htmlNode.InnerText;
    //    document.Add(new Field("body", "Mike", Field.Store.NO, Field.Index.UN_TOKENIZED));
    //}

    // These values are machine-readable keys stored NOT_ANALYZED, so they must not depend on the current
    // culture: a culture-sensitive ToLower() turns 'I' into a dotless 'ı' under Turkish/Azeri cultures, and
    // number formatting may vary as well.
    System.Globalization.CultureInfo invariantCulture = System.Globalization.CultureInfo.InvariantCulture;
    string discoveryTypeName = discoveryType.ToString().ToLowerInvariant();
    string discoveryIdText = discoveryID.ToString(invariantCulture);

    document.Add(new Field("indexkey", discoveryTypeName.Substring(0, 1) + discoveryIdText, Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("discoveryid", discoveryIdText, Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("discoverytype", discoveryTypeName, Field.Store.YES, Field.Index.NOT_ANALYZED));

    //Discovery
    document.Add(new Field("absoluteuri", absoluteUri, Field.Store.YES, Field.Index.ANALYZED));

    //core fields
    document.Add(new Field("text", contentToIndex, Field.Store.NO, Field.Index.ANALYZED));
    document.Add(new Field("codepage", codePage.ToString(invariantCulture), Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("title", _title.Match(contentToIndex).Groups["Title"].Value.Trim(), Field.Store.YES, Field.Index.ANALYZED));

    //DiscoveryPath
    document.Add(new Field("discoverypath", discoveryPath, Field.Store.YES, Field.Index.NO));

    //AbsoluteUri Classification
    document.Add(new Field("domain", UserDefinedFunctions.ExtractDomain(absoluteUri).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("extension", UserDefinedFunctions.ExtractExtension(absoluteUri, false).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("host", UserDefinedFunctions.ExtractHost(absoluteUri).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("scheme", UserDefinedFunctions.ExtractScheme(absoluteUri, false).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));

    //FullTextIndexType - used to store the extension that can be used with the default IIS MIME types configuration... (.pl images cannot be served without MIME type modification...)
    document.Add(new Field("fulltextindextype", fullTextIndexType, Field.Store.YES, Field.Index.NOT_ANALYZED));

    AddDocument(document, absoluteUri, strength);
}
/// <summary>
/// Determines whether the specified absolute URI is restricted for the specified crawl request under the
/// given URI classification flags.
/// </summary>
/// <remarks>
/// The Domain, Extension, FileExtension, Host and Scheme flags each compare the corresponding part of the
/// crawl request's discovery URI with that of <paramref name="absoluteUri"/>; any mismatch restricts.
/// The OriginalDirectory* flags compare the directory of the originator (or of the parent when there is no
/// originator) with the directory of <paramref name="absoluteUri"/>, using the number of backslashes in the
/// directory as its depth. The exact-match directory rules apply only when neither
/// OriginalDirectoryLevelUp nor OriginalDirectoryLevelDown is set.
/// Side effect: in the LevelUp/LevelDown branches without OriginalDirectory/OriginalDirectoryLevel,
/// <c>crawlRequest.IsStorable</c> is set to <c>false</c> when the URI is the discovery URI itself and
/// <c>CurrentDepth</c> is 1.
/// </remarks>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "absoluteUri">The absolute URI.</param>
/// <param name = "uriClassificationType">Bitwise combination of <c>UriClassificationType</c> values.</param>
/// <returns>
/// <c>true</c> if the specified crawl request is restricted; otherwise, <c>false</c>.
/// </returns>
protected override bool IsRestricted(CrawlRequest<TArachnodeDAO> crawlRequest, string absoluteUri, short uriClassificationType)
{
    if (uriClassificationType == (short)UriClassificationType.None)
    {
        return false;
    }

    if ((uriClassificationType & (short)UriClassificationType.Domain) == (short)UriClassificationType.Domain)
    {
        if (UserDefinedFunctions.ExtractDomain(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractDomain(absoluteUri))
        {
            return true;
        }
    }

    if ((uriClassificationType & (short)UriClassificationType.Extension) == (short)UriClassificationType.Extension)
    {
        if (UserDefinedFunctions.ExtractExtension(crawlRequest.Discovery.Uri.AbsoluteUri, false) != UserDefinedFunctions.ExtractExtension(absoluteUri, false))
        {
            return true;
        }
    }

    if ((uriClassificationType & (short)UriClassificationType.FileExtension) == (short)UriClassificationType.FileExtension)
    {
        if (UserDefinedFunctions.ExtractFileExtension(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractFileExtension(absoluteUri))
        {
            return true;
        }
    }

    if ((uriClassificationType & (short)UriClassificationType.Host) == (short)UriClassificationType.Host)
    {
        if (UserDefinedFunctions.ExtractHost(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractHost(absoluteUri))
        {
            return true;
        }
    }

    if ((uriClassificationType & (short)UriClassificationType.Scheme) == (short)UriClassificationType.Scheme)
    {
        if (UserDefinedFunctions.ExtractScheme(crawlRequest.Discovery.Uri.AbsoluteUri, false) != UserDefinedFunctions.ExtractScheme(absoluteUri, false))
        {
            return true;
        }
    }

    // Directory based restrictions.
    if (uriClassificationType >= (short)UriClassificationType.OriginalDirectoryLevelUp)
    {
        // The reference directory comes from the originator, or from the parent when there is no originator.
        string crawlRequestOriginatorAbsoluteUriDirectory;

        if (crawlRequest.Originator == null)
        {
            crawlRequestOriginatorAbsoluteUriDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(crawlRequest.Parent.Uri.LocalPath));
        }
        else
        {
            crawlRequestOriginatorAbsoluteUriDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(crawlRequest.Originator.Uri.LocalPath));
        }

        string absoluteUriDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(new Uri(absoluteUri).LocalPath));

        // Path.GetDirectoryName yields null when there is no directory information (e.g. a root path);
        // treat that as the root directory.
        if (crawlRequestOriginatorAbsoluteUriDirectory == null)
        {
            crawlRequestOriginatorAbsoluteUriDirectory = "\\";
        }

        if (absoluteUriDirectory == null)
        {
            absoluteUriDirectory = "\\";
        }

        // Directory depth = number of backslashes in the directory string.
        // NOTE(review): relies on Path.GetDirectoryName producing backslash separated paths - confirm on non-Windows hosts.
        int originatorDepth = crawlRequestOriginatorAbsoluteUriDirectory.Length - crawlRequestOriginatorAbsoluteUriDirectory.Replace("\\", string.Empty).Length;
        int absoluteUriDepth = absoluteUriDirectory.Length - absoluteUriDirectory.Replace("\\", string.Empty).Length;

        bool levelUp = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevelUp) == (short)UriClassificationType.OriginalDirectoryLevelUp;
        bool levelDown = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevelDown) == (short)UriClassificationType.OriginalDirectoryLevelDown;
        bool originalDirectory = (uriClassificationType & (short)UriClassificationType.OriginalDirectory) == (short)UriClassificationType.OriginalDirectory;
        bool originalDirectoryLevel = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevel) == (short)UriClassificationType.OriginalDirectoryLevel;

        if (levelUp)
        {
            if (originalDirectory || originalDirectoryLevel)
            {
                if (originalDirectory)
                {
                    // Same depth or deeper than the originator: only the originator's own directory and the root pass.
                    if (originatorDepth <= absoluteUriDepth)
                    {
                        if (crawlRequestOriginatorAbsoluteUriDirectory != absoluteUriDirectory && absoluteUriDirectory != "\\")
                        {
                            return true;
                        }

                        return false;
                    }
                }

                if (originalDirectoryLevel)
                {
                    // Anything deeper than the originator is restricted.
                    if (originatorDepth < absoluteUriDepth)
                    {
                        return true;
                    }
                }
            }
            else
            {
                if (originatorDepth <= absoluteUriDepth)
                {
                    if (crawlRequest.Discovery.Uri.AbsoluteUri != absoluteUri)
                    {
                        return true;
                    }

                    // The URI is the discovery itself: allowed, but marked as not storable at depth 1.
                    if (crawlRequest.CurrentDepth == 1)
                    {
                        crawlRequest.IsStorable = false;

                        return false;
                    }
                }
            }
        }

        if (levelDown)
        {
            if (originalDirectory || originalDirectoryLevel)
            {
                if (originalDirectory)
                {
                    // Same depth or shallower than the originator: must be the originator's own directory.
                    if (originatorDepth >= absoluteUriDepth)
                    {
                        if (crawlRequestOriginatorAbsoluteUriDirectory != absoluteUriDirectory)
                        {
                            return true;
                        }
                    }

                    // Deeper directories must start with the originator's directory.
                    if (!absoluteUriDirectory.StartsWith(crawlRequestOriginatorAbsoluteUriDirectory))
                    {
                        return true;
                    }
                }

                if (originalDirectoryLevel)
                {
                    // Anything shallower than the originator is restricted.
                    if (originatorDepth > absoluteUriDepth)
                    {
                        return true;
                    }
                }
            }
            else
            {
                if (originatorDepth >= absoluteUriDepth)
                {
                    if (crawlRequest.Discovery.Uri.AbsoluteUri != absoluteUri)
                    {
                        return true;
                    }

                    // The URI is the discovery itself: allowed, but marked as not storable at depth 1.
                    if (crawlRequest.CurrentDepth == 1)
                    {
                        crawlRequest.IsStorable = false;

                        return false;
                    }
                }
            }
        }

        // Exact-match rules apply only when no level direction was requested. Testing LevelDown twice here
        // would let these rules run for LevelUp-only classifications and restrict every URI above the
        // originator's directory, so the check must cover both LevelUp and LevelDown.
        if (!levelUp && !levelDown)
        {
            if (originalDirectory)
            {
                if (crawlRequestOriginatorAbsoluteUriDirectory != absoluteUriDirectory)
                {
                    return true;
                }
            }

            if (originalDirectoryLevel)
            {
                if (originatorDepth != absoluteUriDepth)
                {
                    return true;
                }
            }
        }
    }

    return false;
}