/// <summary>
/// Populates <paramref name="cookieContainer"/> with the cookies reported by
/// <c>InternetGetCookieEx</c> for <paramref name="absoluteUri"/>. The cookies are registered
/// under two URIs: scheme + extracted domain, and scheme + "www." + that domain's host.
/// A URI that already has cookies in the container is left untouched.
/// </summary>
/// <param name="absoluteUri">The absolute URI whose cookies should be looked up.</param>
/// <param name="cookieContainer">
/// The container to populate. When <c>null</c> the method returns immediately: the parameter is
/// passed by value, so a container created here could never be observed by the caller.
/// </param>
public override void GetCookies(string absoluteUri, CookieContainer cookieContainer)
{
    if (cookieContainer == null)
    {
        // Previously a throw-away CookieContainer was created, filled and discarded here;
        // skipping the work has the same visible result without taking the lock.
        return;
    }

    try
    {
        lock (_cookieContainerLock)
        {
            Uri requestUri = new Uri(absoluteUri);

            Uri domainUri = new Uri(requestUri.Scheme + Uri.SchemeDelimiter + UserDefinedFunctions.ExtractDomain(absoluteUri).Value);
            AddInternetCookiesIfAbsent(absoluteUri, domainUri, cookieContainer);

            Uri wwwUri = new Uri(domainUri.Scheme + Uri.SchemeDelimiter + "www." + domainUri.Host);
            AddInternetCookiesIfAbsent(absoluteUri, wwwUri, cookieContainer);
        }
    }
    catch (Exception)
    {
        // Deliberately best-effort: a malformed URI or a cookie that cannot be parsed/added
        // must not abort the caller; the container is simply left as it was at the point of failure.
    }
}

/// <summary>
/// Adds the cookies returned by <c>InternetGetCookieEx</c> for <paramref name="absoluteUri"/> to
/// <paramref name="cookieContainer"/> under <paramref name="targetUri"/>, but only when the
/// container currently holds no cookies for that URI and a cookie header was returned.
/// </summary>
private void AddInternetCookiesIfAbsent(string absoluteUri, Uri targetUri, CookieContainer cookieContainer)
{
    if (cookieContainer.GetCookies(targetUri).Count != 0)
    {
        return;
    }

    string cookieHeaders = InternetGetCookieEx(absoluteUri);

    if (cookieHeaders != null)
    {
        cookieContainer.Add(targetUri, BuildCookieCollection(cookieHeaders));
    }
}
/// <summary>
/// Adds the standard set of fields for a discovery to <paramref name="document"/> and then hands
/// the document to <c>AddDocument</c> together with <paramref name="absoluteUri"/> and
/// <paramref name="strength"/>.
/// </summary>
/// <param name="document">The document that receives the fields.</param>
/// <param name="discoveryID">The discovery ID; stored as "discoveryid" and as the numeric part of "indexkey".</param>
/// <param name="discoveryType">The discovery type; its lower-cased name is stored as "discoverytype" and its first letter prefixes "indexkey".</param>
/// <param name="absoluteUri">The absolute URI; stored and analyzed, and the source of the domain/extension/host/scheme fields.</param>
/// <param name="contentToIndex">The content to index; analyzed as "text" and matched against the title expression.</param>
/// <param name="codePage">The code page; stored as "codepage".</param>
/// <param name="fullTextIndexType">The full text index type; stored as "fulltextindextype".</param>
/// <param name="strength">Passed through to <c>AddDocument</c>.</param>
/// <param name="discoveryPath">The discovery path; stored (not indexed) as "discoverypath".</param>
/// <param name="threadNumber">Not used by this implementation.</param>
protected override void CreateDocument(Document document, long discoveryID, DiscoveryType discoveryType, string absoluteUri, string contentToIndex, int codePage, string fullTextIndexType, float strength, string discoveryPath, int threadNumber)
{
    //a bare bones example of what you could do to add a new field to the index...
    //if (discoveryType == DiscoveryType.WebPage)
    //{
    //    HtmlDocument htmlDocument = new HtmlDocument();
    //    htmlDocument.LoadHtml(contentToIndex);
    //    HtmlNode htmlNode = htmlDocument.DocumentNode.SelectSingleNode("/html/body");
    //    string body = htmlNode.InnerText;
    //    document.Add(new Field("body", "Mike", Field.Store.NO, Field.Index.UN_TOKENIZED));
    //}

    // These values are NOT_ANALYZED lookup keys, so they must not depend on the thread's culture:
    // with a culture-sensitive ToLower() "Image" becomes "ımage" (dotless i) under tr-TR.
    string discoveryTypeName = discoveryType.ToString().ToLowerInvariant();

    document.Add(new Field("indexkey", discoveryTypeName.Substring(0, 1) + discoveryID, Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("discoveryid", discoveryID.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("discoverytype", discoveryTypeName, Field.Store.YES, Field.Index.NOT_ANALYZED));

    //Discovery
    document.Add(new Field("absoluteuri", absoluteUri, Field.Store.YES, Field.Index.ANALYZED));

    //core fields
    document.Add(new Field("text", contentToIndex, Field.Store.NO, Field.Index.ANALYZED));
    document.Add(new Field("codepage", codePage.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("title", _title.Match(contentToIndex).Groups["Title"].Value.Trim(), Field.Store.YES, Field.Index.ANALYZED));

    //DiscoveryPath
    document.Add(new Field("discoverypath", discoveryPath, Field.Store.YES, Field.Index.NO));

    //AbsoluteUri Classification
    document.Add(new Field("domain", UserDefinedFunctions.ExtractDomain(absoluteUri).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("extension", UserDefinedFunctions.ExtractExtension(absoluteUri, false).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("host", UserDefinedFunctions.ExtractHost(absoluteUri).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("scheme", UserDefinedFunctions.ExtractScheme(absoluteUri, false).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));

    //FullTextIndexType - used to store the extension that can be used with the default IIS MIME types configuration... (.pl images cannot be served without MIME type modification...)
    document.Add(new Field("fulltextindextype", fullTextIndexType, Field.Store.YES, Field.Index.NOT_ANALYZED));

    AddDocument(document, absoluteUri, strength);
}
/// <summary>
/// Adds <paramref name="cookieCollection"/> to <paramref name="cookieContainer"/> under two URIs
/// derived from <paramref name="absoluteUri"/>: scheme + extracted domain, and
/// scheme + "www." + that domain's host. Does nothing when either the container or the
/// collection is <c>null</c>; any exception raised along the way is swallowed.
/// </summary>
/// <param name="absoluteUri">The absolute URI the cookies belong to.</param>
/// <param name="cookieContainer">The container to update.</param>
/// <param name="cookieCollection">The cookies to add.</param>
public override void UpdateCookies(string absoluteUri, CookieContainer cookieContainer, CookieCollection cookieCollection)
{
    try
    {
        lock (_cookieContainerLock)
        {
            if (cookieContainer == null || cookieCollection == null)
            {
                return;
            }

            string scheme = new Uri(absoluteUri).Scheme;

            // first registration: the bare domain...
            Uri domainUri = new Uri(scheme + Uri.SchemeDelimiter + UserDefinedFunctions.ExtractDomain(absoluteUri).Value);
            cookieContainer.Add(domainUri, cookieCollection);

            // second registration: the "www." host of that domain...
            Uri wwwUri = new Uri(domainUri.Scheme + Uri.SchemeDelimiter + "www." + domainUri.Host);
            cookieContainer.Add(wwwUri, cookieCollection);
        }
    }
    catch (Exception)
    {
    }
}
/// <summary>
/// Crawl-request-completed handler. Extracts the domain of the completed discovery and compares
/// it with "nbc.com"; the branch taken for every other domain is currently empty.
/// </summary>
/// <param name="sender">The crawl request that completed.</param>
private void Engine_CrawlRequestCompleted2(CrawlRequest<ArachnodeDAO> sender)
{
    string completedDomain = UserDefinedFunctions.ExtractDomain(sender.Discovery.Uri.AbsoluteUri).Value;
    bool isNbcDomain = completedDomain == "nbc.com";

    if (!isNbcDomain)
    {
        // intentionally empty
    }
}
/// <summary>
/// Applies politeness bookkeeping to <paramref name="crawlRequest"/>.
/// A Politeness instance is attached to the request first (looked up in the cache by domain, or
/// created and cached). For <c>PolitenessState.HttpWebRequestRequested</c> the request is either
/// resubmitted (crawl delay not elapsed, too many active requests, or auto-throttle delay not
/// elapsed) or admitted, in which case the active-request counter is incremented. For every other
/// state the active-request counter is decremented and the per-discovery-type and overall
/// statistics are updated.
/// </summary>
/// <param name="crawlRequest">The crawl request; when <c>null</c> nothing is done.</param>
/// <param name="politenessState">The state being reported for the request.</param>
/// <param name="arachnodeDAO">Passed through to <c>ResubmitCrawlRequest</c>.</param>
/// <returns>
/// <c>false</c> when the request was resubmitted instead of being admitted; otherwise <c>true</c>.
/// </returns>
public override bool ManagePoliteness(CrawlRequest<TArachnodeDAO> crawlRequest, PolitenessState politenessState, IArachnodeDAO arachnodeDAO)
{
    if (crawlRequest == null)
    {
        return true;
    }

    if (crawlRequest.Politeness == null)
    {
        //politeness/throttling can operate per host (cars.msn.com) or per domain (msn.com)...
        //string host = UserDefinedFunctions.ExtractHost(crawlRequest.Discovery.Uri.AbsoluteUri).Value;
        //domain = host;
        string politenessKey = UserDefinedFunctions.ExtractDomain(crawlRequest.Discovery.Uri.AbsoluteUri).Value;

        crawlRequest.Politeness = _cache.GetPoliteness(politenessKey);

        if (crawlRequest.Politeness == null)
        {
            crawlRequest.Politeness = new Politeness(politenessKey);
            crawlRequest.Politeness.FirstHttpWebRequest = DateTime.Now;

            _cache.AddPoliteness(crawlRequest.Politeness);
        }
    }

    if (crawlRequest.Politeness == null)
    {
        return true;
    }

    if (politenessState == PolitenessState.HttpWebRequestRequested)
    {
        // explicit crawl delay still running, or the concurrent request ceiling has been reached...
        if ((crawlRequest.Politeness.CrawlDelayInMilliseconds != 0 && DateTime.Now.Subtract(crawlRequest.Politeness.LastHttpWebRequestCompleted).TotalMilliseconds < crawlRequest.Politeness.CrawlDelayInMilliseconds)
            || crawlRequest.Politeness.ActiveHttpWebRequests >= crawlRequest.Politeness.MaximumActiveHttpWebRequests)
        {
            ResubmitCrawlRequest(crawlRequest, true, arachnodeDAO);

            return false;
        }

        // auto-throttling only applies when no explicit crawl delay is configured...
        if (ApplicationSettings.AutoThrottleHttpWebRequests
            && crawlRequest.Politeness.AutoThrottleHttpWebRequests
            && crawlRequest.Politeness.CrawlDelayInMilliseconds == 0)
        {
            if (crawlRequest.Politeness.LastHttpWebRequestCompleted == DateTime.MinValue)
            {
                crawlRequest.Politeness.LastHttpWebRequestCompleted = crawlRequest.Politeness.LastHttpWebRequestRequested;
            }

            double canceledAfterCompletedMilliseconds = crawlRequest.Politeness.LastHttpWebRequestCanceled.Subtract(crawlRequest.Politeness.LastHttpWebRequestCompleted).TotalMilliseconds;
            double sinceLastRequestedMilliseconds = DateTime.Now.Subtract(crawlRequest.Politeness.LastHttpWebRequestRequested).TotalMilliseconds;

            if (crawlRequest.Politeness.AutoThrottleCrawlDelayInMilliseconds == 0 && canceledAfterCompletedMilliseconds > 0)
            {
                crawlRequest.Politeness.AutoThrottleCrawlDelayInMilliseconds = canceledAfterCompletedMilliseconds;
            }

            if (crawlRequest.Politeness.AutoThrottleCrawlDelayInMilliseconds > sinceLastRequestedMilliseconds)
            {
                if (canceledAfterCompletedMilliseconds > 0)
                {
                    crawlRequest.Politeness.AutoThrottleCrawlDelayInMilliseconds = canceledAfterCompletedMilliseconds;
                }

                ResubmitCrawlRequest(crawlRequest, true, arachnodeDAO);

                return false;
            }

            // the throttle delay has elapsed: shrink it by 10% for the next request...
            crawlRequest.Politeness.AutoThrottleCrawlDelayInMilliseconds *= 0.9;
        }

        lock (_lock)
        {
            crawlRequest.Politeness.ActiveHttpWebRequests++;
        }

        crawlRequest.Politeness.LastHttpWebRequestRequested = DateTime.Now;

        return true;
    }

    lock (_lock)
    {
        crawlRequest.Politeness.ActiveHttpWebRequests--;

        if (crawlRequest.Politeness.ActiveHttpWebRequests < 0)
        {
            //shouldn't occur...
            crawlRequest.Politeness.ActiveHttpWebRequests = 0;
        }
    }

    bool wasCompleted = politenessState == PolitenessState.HttpWebRequestCompleted;
    bool wasCanceled = politenessState == PolitenessState.HttpWebRequestCanceled;

    // per-discovery-type statistics...
    switch (crawlRequest.DataType.DiscoveryType)
    {
        case DiscoveryType.File:
            if (wasCompleted)
            {
                crawlRequest.Politeness.LastFileHttpWebRequestCompleted = DateTime.Now;
                crawlRequest.Politeness.TotalFileHttpWebRequestsCompleted++;
            }
            else if (wasCanceled)
            {
                crawlRequest.Politeness.LastFileHttpWebRequestCanceled = DateTime.Now;
                crawlRequest.Politeness.TotalFileHttpWebRequestsCanceled++;
            }

            if (crawlRequest.Data != null)
            {
                crawlRequest.Politeness.TotalFileDownloadedBytes += crawlRequest.Data.LongLength;
            }

            crawlRequest.Politeness.TotalFileHttpWebResponseTime += crawlRequest.HttpWebResponseTime;
            break;

        case DiscoveryType.Image:
            if (wasCompleted)
            {
                crawlRequest.Politeness.LastImageHttpWebRequestCompleted = DateTime.Now;
                crawlRequest.Politeness.TotalImageHttpWebRequestsCompleted++;
            }
            else if (wasCanceled)
            {
                crawlRequest.Politeness.LastImageHttpWebRequestCanceled = DateTime.Now;
                crawlRequest.Politeness.TotalImageHttpWebRequestsCanceled++;
            }

            if (crawlRequest.Data != null)
            {
                crawlRequest.Politeness.TotalImageDownloadedBytes += crawlRequest.Data.LongLength;
            }

            crawlRequest.Politeness.TotalImageHttpWebResponseTime += crawlRequest.HttpWebResponseTime;
            break;

        case DiscoveryType.WebPage:
            if (wasCompleted)
            {
                crawlRequest.Politeness.LastWebPageHttpWebRequestCompleted = DateTime.Now;
                crawlRequest.Politeness.TotalWebPageHttpWebRequestsCompleted++;
            }
            else if (wasCanceled)
            {
                crawlRequest.Politeness.LastWebPageHttpWebRequestCanceled = DateTime.Now;
                crawlRequest.Politeness.TotalWebPageHttpWebRequestsCanceled++;
            }

            if (crawlRequest.Data != null)
            {
                crawlRequest.Politeness.TotalWebPageDownloadedBytes += crawlRequest.Data.LongLength;
            }

            crawlRequest.Politeness.TotalWebPageHttpWebResponseTime += crawlRequest.HttpWebResponseTime;
            break;
    }

    // overall statistics...
    if (wasCompleted)
    {
        crawlRequest.Politeness.LastHttpWebRequestCompleted = DateTime.Now;
        crawlRequest.Politeness.TotalHttpWebRequestsCompleted++;
    }
    else if (wasCanceled)
    {
        crawlRequest.Politeness.LastHttpWebRequestCanceled = DateTime.Now;
        crawlRequest.Politeness.TotalHttpWebRequestsCanceled++;

        // a canceled request switches auto-throttling on for this Politeness...
        crawlRequest.Politeness.AutoThrottleHttpWebRequests = true;
    }

    if (crawlRequest.Data != null)
    {
        crawlRequest.Politeness.TotalDownloadedBytes += crawlRequest.Data.LongLength;
    }

    crawlRequest.Politeness.TotalHttpWebResponseTime += crawlRequest.HttpWebResponseTime;

    return true;
}
/// <summary>
/// Determines whether following <paramref name="absoluteUri"/> from the specified crawl request
/// is restricted by the classification flags in <paramref name="uriClassificationType"/>.
/// </summary>
/// <remarks>
/// Domain, Extension, FileExtension, Host and Scheme flags restrict a URI whose corresponding
/// part differs from that of the crawl request's discovery. The OriginalDirectory* flags compare
/// the directory of <paramref name="absoluteUri"/> with the directory of the request's originator
/// (or of its parent when there is no originator), using the number of '\' separators as depth.
/// In the LevelUp/LevelDown-only branches a request at <c>CurrentDepth == 1</c> whose own URI is
/// being tested is allowed but marked as not storable.
/// </remarks>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "absoluteUri">The absolute URI.</param>
/// <param name = "uriClassificationType">Type of the URI classification.</param>
/// <returns>
/// 	<c>true</c> if the specified crawl request is restricted; otherwise, <c>false</c>.
/// </returns>
protected override bool IsRestricted(CrawlRequest<TArachnodeDAO> crawlRequest, string absoluteUri, short uriClassificationType)
{
    if (uriClassificationType == (short)UriClassificationType.None)
    {
        return false;
    }

    if (HasClassification(uriClassificationType, UriClassificationType.Domain))
    {
        if (UserDefinedFunctions.ExtractDomain(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractDomain(absoluteUri))
        {
            return true;
        }
    }

    if (HasClassification(uriClassificationType, UriClassificationType.Extension))
    {
        if (UserDefinedFunctions.ExtractExtension(crawlRequest.Discovery.Uri.AbsoluteUri, false) != UserDefinedFunctions.ExtractExtension(absoluteUri, false))
        {
            return true;
        }
    }

    if (HasClassification(uriClassificationType, UriClassificationType.FileExtension))
    {
        if (UserDefinedFunctions.ExtractFileExtension(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractFileExtension(absoluteUri))
        {
            return true;
        }
    }

    if (HasClassification(uriClassificationType, UriClassificationType.Host))
    {
        if (UserDefinedFunctions.ExtractHost(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractHost(absoluteUri))
        {
            return true;
        }
    }

    if (HasClassification(uriClassificationType, UriClassificationType.Scheme))
    {
        if (UserDefinedFunctions.ExtractScheme(crawlRequest.Discovery.Uri.AbsoluteUri, false) != UserDefinedFunctions.ExtractScheme(absoluteUri, false))
        {
            return true;
        }
    }

    if (uriClassificationType < (short)UriClassificationType.OriginalDirectoryLevelUp)
    {
        // no directory-based classification requested...
        return false;
    }

    // the directory of the originator, falling back to the parent when there is no originator...
    string originLocalPath = crawlRequest.Originator == null
                                 ? crawlRequest.Parent.Uri.LocalPath
                                 : crawlRequest.Originator.Uri.LocalPath;

    string originDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(originLocalPath));
    string targetDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(new Uri(absoluteUri).LocalPath));

    // GetDirectoryName returns null for a root path; treat that as the root directory...
    if (originDirectory == null)
    {
        originDirectory = "\\";
    }

    if (targetDirectory == null)
    {
        targetDirectory = "\\";
    }

    int originDepth = CountDirectorySeparators(originDirectory);
    int targetDepth = CountDirectorySeparators(targetDirectory);

    bool levelUp = HasClassification(uriClassificationType, UriClassificationType.OriginalDirectoryLevelUp);
    bool levelDown = HasClassification(uriClassificationType, UriClassificationType.OriginalDirectoryLevelDown);
    bool originalDirectory = HasClassification(uriClassificationType, UriClassificationType.OriginalDirectory);
    bool originalDirectoryLevel = HasClassification(uriClassificationType, UriClassificationType.OriginalDirectoryLevel);

    if (levelUp)
    {
        if (originalDirectory || originalDirectoryLevel)
        {
            if (originalDirectory && originDepth <= targetDepth)
            {
                // same level or deeper: only the original directory itself or the root is allowed...
                if (originDirectory != targetDirectory && targetDirectory != "\\")
                {
                    return true;
                }

                return false;
            }

            if (originalDirectoryLevel && originDepth < targetDepth)
            {
                return true;
            }
        }
        else if (originDepth <= targetDepth)
        {
            if (crawlRequest.Discovery.Uri.AbsoluteUri != absoluteUri)
            {
                return true;
            }

            if (crawlRequest.CurrentDepth == 1)
            {
                crawlRequest.IsStorable = false;

                return false;
            }
        }
    }

    if (levelDown)
    {
        if (originalDirectory || originalDirectoryLevel)
        {
            if (originalDirectory)
            {
                if (originDepth >= targetDepth && originDirectory != targetDirectory)
                {
                    return true;
                }

                // deeper targets must live underneath the original directory...
                if (!targetDirectory.StartsWith(originDirectory))
                {
                    return true;
                }
            }

            if (originalDirectoryLevel && originDepth > targetDepth)
            {
                return true;
            }
        }
        else if (originDepth >= targetDepth)
        {
            if (crawlRequest.Discovery.Uri.AbsoluteUri != absoluteUri)
            {
                return true;
            }

            if (crawlRequest.CurrentDepth == 1)
            {
                crawlRequest.IsStorable = false;

                return false;
            }
        }
    }

    // Exact-match rules apply only when NEITHER direction flag is set. The previous condition
    // tested OriginalDirectoryLevelDown twice, so LevelUp combined with OriginalDirectory or
    // OriginalDirectoryLevel also fell into this branch and every upward link was restricted.
    if (!levelUp && !levelDown)
    {
        if (originalDirectory && originDirectory != targetDirectory)
        {
            return true;
        }

        if (originalDirectoryLevel && originDepth != targetDepth)
        {
            return true;
        }
    }

    return false;
}

/// <summary>
/// Returns <c>true</c> when every bit of <paramref name="flag"/> is set in <paramref name="uriClassificationType"/>.
/// </summary>
private static bool HasClassification(short uriClassificationType, UriClassificationType flag)
{
    short flagValue = (short)flag;

    return (uriClassificationType & flagValue) == flagValue;
}

/// <summary>
/// Counts the '\' characters in <paramref name="directory"/>; used as the directory depth.
/// </summary>
private static int CountDirectorySeparators(string directory)
{
    return directory.Length - directory.Replace("\\", string.Empty).Length;
}