/// <summary>
/// Applies politeness handling to <paramref name="crawlRequest"/> for the given <paramref name="politenessState"/>.
/// </summary>
/// <param name="crawlRequest">The crawl request being processed.</param>
/// <param name="politenessState">The politeness state to handle for this request.</param>
/// <param name="arachnodeDAO">Data access object made available to the implementation.</param>
/// <returns>
/// A flag produced by the implementation.
/// NOTE(review): presumably <c>true</c> means the request may proceed and <c>false</c> means it was deferred - confirm against implementations and callers.
/// </returns>
public abstract bool ManagePoliteness(CrawlRequest <TArachnodeDAO> crawlRequest, PolitenessState politenessState, IArachnodeDAO arachnodeDAO);
/// <summary>
/// Gates outgoing HTTP requests and records request statistics on the <c>Politeness</c> entry
/// attached to <paramref name="crawlRequest"/>.
/// </summary>
/// <remarks>
/// When the request has no <c>Politeness</c> yet, one is looked up in the cache by domain and,
/// if the cache has none, created, time-stamped and added to the cache.
/// For <c>PolitenessState.HttpWebRequestRequested</c> the request is resubmitted (and <c>false</c>
/// returned) when the configured crawl delay has not elapsed, when the active request limit is
/// reached, or when the auto-throttle delay has not elapsed; otherwise the active request count
/// is incremented and the request time recorded.
/// For every other state the active request count is decremented (clamped at zero) and the
/// per-discovery-type and overall counters, byte totals and response times are updated.
/// NOTE(review): the active request limit is read outside the lock that guards the increment.
/// </remarks>
/// <param name="crawlRequest">The crawl request being processed; <c>null</c> is tolerated.</param>
/// <param name="politenessState">The politeness state to handle for this request.</param>
/// <param name="arachnodeDAO">Data access object forwarded to <c>ResubmitCrawlRequest</c>.</param>
/// <returns>
/// <c>false</c> when the crawl request was resubmitted instead of being allowed to proceed;
/// <c>true</c> in every other case, including a <c>null</c> <paramref name="crawlRequest"/>.
/// </returns>
public override bool ManagePoliteness(CrawlRequest <TArachnodeDAO> crawlRequest, PolitenessState politenessState, IArachnodeDAO arachnodeDAO)
{
    if (crawlRequest == null)
    {
        return true;
    }

    if (crawlRequest.Politeness == null)
    {
        // Politeness is keyed by domain (msn.com) here; the host (cars.msn.com) is the
        // alternative granularity for throttling.
        string domain = UserDefinedFunctions.ExtractDomain(crawlRequest.Discovery.Uri.AbsoluteUri).Value;

        crawlRequest.Politeness = _cache.GetPoliteness(domain);

        if (crawlRequest.Politeness == null)
        {
            // First time this domain is seen: create, stamp and cache its entry.
            var created = new Politeness(domain);

            crawlRequest.Politeness = created;
            created.FirstHttpWebRequest = DateTime.Now;
            _cache.AddPoliteness(created);
        }
    }

    var politeness = crawlRequest.Politeness;

    if (politeness == null)
    {
        return true;
    }

    if (politenessState == PolitenessState.HttpWebRequestRequested)
    {
        // An explicit crawl delay applies only when it is non-zero.
        bool crawlDelayPending =
            politeness.CrawlDelayInMilliseconds != 0
            && (DateTime.Now - politeness.LastHttpWebRequestCompleted).TotalMilliseconds < politeness.CrawlDelayInMilliseconds;

        if (crawlDelayPending || politeness.ActiveHttpWebRequests >= politeness.MaximumActiveHttpWebRequests)
        {
            ResubmitCrawlRequest(crawlRequest, true, arachnodeDAO);
            return false;
        }

        // Auto-throttling is used only when enabled globally, enabled for this entry,
        // and no explicit crawl delay is configured.
        if (ApplicationSettings.AutoThrottleHttpWebRequests
            && politeness.AutoThrottleHttpWebRequests
            && politeness.CrawlDelayInMilliseconds == 0)
        {
            if (politeness.LastHttpWebRequestCompleted == DateTime.MinValue)
            {
                // Nothing has completed yet: fall back to the last requested time.
                politeness.LastHttpWebRequestCompleted = politeness.LastHttpWebRequestRequested;
            }

            double canceledAfterCompletedMs = (politeness.LastHttpWebRequestCanceled - politeness.LastHttpWebRequestCompleted).TotalMilliseconds;
            double sinceLastRequestedMs = (DateTime.Now - politeness.LastHttpWebRequestRequested).TotalMilliseconds;
            bool canceledIsLater = canceledAfterCompletedMs > 0;

            if (politeness.AutoThrottleCrawlDelayInMilliseconds == 0 && canceledIsLater)
            {
                politeness.AutoThrottleCrawlDelayInMilliseconds = canceledAfterCompletedMs;
            }

            if (politeness.AutoThrottleCrawlDelayInMilliseconds > sinceLastRequestedMs)
            {
                if (canceledIsLater)
                {
                    politeness.AutoThrottleCrawlDelayInMilliseconds = canceledAfterCompletedMs;
                }

                ResubmitCrawlRequest(crawlRequest, true, arachnodeDAO);
                return false;
            }

            // The delay has elapsed: shrink the auto-throttle delay by 10%.
            politeness.AutoThrottleCrawlDelayInMilliseconds *= 0.9;
        }

        lock (_lock)
        {
            politeness.ActiveHttpWebRequests++;
        }

        politeness.LastHttpWebRequestRequested = DateTime.Now;

        return true;
    }

    // Any state other than "requested" releases one active request slot.
    lock (_lock)
    {
        politeness.ActiveHttpWebRequests--;

        if (politeness.ActiveHttpWebRequests < 0)
        {
            // Shouldn't occur; clamp so the counter never goes negative.
            politeness.ActiveHttpWebRequests = 0;
        }
    }

    bool completed = politenessState == PolitenessState.HttpWebRequestCompleted;
    bool canceled = politenessState == PolitenessState.HttpWebRequestCanceled;

    // Per-discovery-type statistics.
    switch (crawlRequest.DataType.DiscoveryType)
    {
        case DiscoveryType.File:
            if (completed)
            {
                politeness.LastFileHttpWebRequestCompleted = DateTime.Now;
                politeness.TotalFileHttpWebRequestsCompleted++;
            }
            else if (canceled)
            {
                politeness.LastFileHttpWebRequestCanceled = DateTime.Now;
                politeness.TotalFileHttpWebRequestsCanceled++;
            }

            if (crawlRequest.Data != null)
            {
                politeness.TotalFileDownloadedBytes += crawlRequest.Data.LongLength;
            }

            politeness.TotalFileHttpWebResponseTime += crawlRequest.HttpWebResponseTime;
            break;

        case DiscoveryType.Image:
            if (completed)
            {
                politeness.LastImageHttpWebRequestCompleted = DateTime.Now;
                politeness.TotalImageHttpWebRequestsCompleted++;
            }
            else if (canceled)
            {
                politeness.LastImageHttpWebRequestCanceled = DateTime.Now;
                politeness.TotalImageHttpWebRequestsCanceled++;
            }

            if (crawlRequest.Data != null)
            {
                politeness.TotalImageDownloadedBytes += crawlRequest.Data.LongLength;
            }

            politeness.TotalImageHttpWebResponseTime += crawlRequest.HttpWebResponseTime;
            break;

        case DiscoveryType.WebPage:
            if (completed)
            {
                politeness.LastWebPageHttpWebRequestCompleted = DateTime.Now;
                politeness.TotalWebPageHttpWebRequestsCompleted++;
            }
            else if (canceled)
            {
                politeness.LastWebPageHttpWebRequestCanceled = DateTime.Now;
                politeness.TotalWebPageHttpWebRequestsCanceled++;
            }

            if (crawlRequest.Data != null)
            {
                politeness.TotalWebPageDownloadedBytes += crawlRequest.Data.LongLength;
            }

            politeness.TotalWebPageHttpWebResponseTime += crawlRequest.HttpWebResponseTime;
            break;
    }

    // Overall statistics, independent of discovery type.
    if (completed)
    {
        politeness.LastHttpWebRequestCompleted = DateTime.Now;
        politeness.TotalHttpWebRequestsCompleted++;
    }
    else if (canceled)
    {
        politeness.LastHttpWebRequestCanceled = DateTime.Now;
        politeness.TotalHttpWebRequestsCanceled++;

        // A canceled request switches auto-throttling on for this entry.
        politeness.AutoThrottleHttpWebRequests = true;
    }

    if (crawlRequest.Data != null)
    {
        politeness.TotalDownloadedBytes += crawlRequest.Data.LongLength;
    }

    politeness.TotalHttpWebResponseTime += crawlRequest.HttpWebResponseTime;

    return true;
}