/// <summary>
/// Computes a hash code by combining CrawlStep, Properties and Referrer
/// with the conventional 397 multiplier; null members contribute 0.
/// </summary>
/// <returns>The combined hash code.</returns>
public override int GetHashCode()
{
    unchecked
    {
        int hash = CrawlStep?.GetHashCode() ?? 0;
        hash = (hash * 397) ^ (Properties?.GetHashCode() ?? 0);
        hash = (hash * 397) ^ (Referrer?.GetHashCode() ?? 0);
        return hash;
    }
}
/// <summary>
/// Executes OnDownloadException event
/// </summary>
/// <param name="exception">The exception thrown while downloading.</param>
/// <param name="crawlStep">The step whose download failed.</param>
/// <param name="referrer">The step from which <paramref name="crawlStep"/> was discovered.</param>
private void OnDownloadException(Exception exception, CrawlStep crawlStep, CrawlStep referrer)
{
    long downloadErrors = Interlocked.Increment(ref m_DownloadErrors);

    // BUG FIX: the original tested MaximumHttpDownloadErrors.Value > downloadErrors,
    // which cancelled the crawl while the error count was still BELOW the limit and
    // never once it was reached. Stop only when the limit has been hit or exceeded.
    if (MaximumHttpDownloadErrors.HasValue && downloadErrors >= MaximumHttpDownloadErrors.Value)
    {
        m_Logger.Error("Number of maximum failed downloads exceeded({0}), cancelling crawl", MaximumHttpDownloadErrors.Value);
        StopCrawl();
    }

    m_Logger.Error("Download exception while downloading {0}, error was {1}", crawlStep.Uri, exception);
    DownloadException.ExecuteEvent(this, () => new DownloadExceptionEventArgs(crawlStep, referrer, exception));
}
/// <summary>
/// Executes OnDownloadException event
/// </summary>
/// <param name="exception">The exception thrown while downloading.</param>
/// <param name="crawlStep">The step whose download failed.</param>
/// <param name="referrer">The step from which <paramref name="crawlStep"/> was discovered.</param>
private void OnDownloadException(Exception exception, CrawlStep crawlStep, CrawlStep referrer)
{
    var downloadErrors = Interlocked.Increment(ref this.m_DownloadErrors);

    // BUG FIX: the original tested MaximumHttpDownloadErrors.Value > downloadErrors,
    // which cancelled the crawl while the error count was still BELOW the limit and
    // never once it was reached. Stop only when the limit has been hit or exceeded.
    if (this.MaximumHttpDownloadErrors.HasValue && downloadErrors >= this.MaximumHttpDownloadErrors.Value)
    {
        this.m_Logger.Error("Number of maximum failed downloads exceeded({0}), cancelling crawl", this.MaximumHttpDownloadErrors.Value);
        this.StopCrawl();
    }

    this.m_Logger.Error("Download exception while downloading {0}, error was {1}", crawlStep.Uri, exception);
    DownloadException?.Invoke(this, new DownloadExceptionEventArgs(crawlStep, referrer, exception));
}
/// <summary>
/// Download content from a url
/// </summary>
/// <param name="step">Step in crawler that contains url to download</param>
/// <returns>Downloaded content, or null when the download threw an exception.</returns>
private PropertyBag Download(CrawlStep step)
{
    try
    {
        var downloader = m_DownloaderFactory.GetDownloader();
        m_Logger.Verbose("Downloading {0}", step.Uri);
        return downloader.Download(step, DownloadMethod.Get);
    }
    catch (Exception ex)
    {
        // Failures are reported through the download-exception path; the caller gets null.
        OnDownloadException(ex, step);
        return null;
    }
}
/// <summary>
/// Returns true to continue crawl of this url, else false
/// </summary>
/// <param name="crawlStep">The step about to be downloaded.</param>
/// <returns>True if the crawler should proceed with this step, false if a subscriber cancelled it.</returns>
private bool OnBeforeDownload(CrawlStep crawlStep)
{
    // Snapshot the event to avoid a race between the null check and the invocation.
    EventHandler<BeforeDownloadEventArgs> beforeDownloadTmp = BeforeDownload;
    if (beforeDownloadTmp.IsNull())
    {
        // No subscribers: fall back to the step's own permission flag.
        return(crawlStep.IsAllowed);
    }

    // Pre-cancel when the step itself is disallowed; subscribers may override.
    BeforeDownloadEventArgs e = new BeforeDownloadEventArgs(!crawlStep.IsAllowed, crawlStep);
    beforeDownloadTmp(this, e);
    return(!e.Cancel);
}
/// <summary>
/// Returns true to continue crawl of this url, else false
/// </summary>
/// <param name="crawlStep">The step that was downloaded.</param>
/// <param name="response">The downloaded content for the step.</param>
/// <returns>True if the crawler should continue processing this step, false if a subscriber cancelled it.</returns>
private bool OnAfterDownload(CrawlStep crawlStep, PropertyBag response)
{
    // Snapshot the event to avoid a race between the null check and the invocation.
    EventHandler<AfterDownloadEventArgs> afterDownloadTmp = AfterDownload;
    if (afterDownloadTmp.IsNull())
    {
        // No subscribers: fall back to the step's own permission flag.
        return(crawlStep.IsAllowed);
    }

    // Pre-cancel when the step itself is disallowed; subscribers may override.
    AfterDownloadEventArgs e = new AfterDownloadEventArgs(!crawlStep.IsAllowed, response);
    afterDownloadTmp(this, e);
    return(!e.Cancel);
}
/// <summary>
/// Queue a new step on the crawler queue
/// </summary>
/// <param name="uri">url to crawl</param>
/// <param name="depth">depth of the url</param>
/// <param name="referrer">Step which the url was located</param>
/// <param name="properties">Custom properties</param>
public void AddStep(Uri uri, int depth, CrawlStep referrer, Dictionary<string, object> properties)
{
    if (!m_Crawling)
    {
        throw new InvalidOperationException("Crawler must be running before adding steps");
    }

    if (m_CrawlStopped)
    {
        return;
    }

    // Only http(s) urls within the configured depth that pass the rules are queued.
    bool httpScheme = uri.Scheme == Uri.UriSchemeHttps || uri.Scheme == Uri.UriSchemeHttp;
    bool depthExceeded = MaximumCrawlDepth.HasValue && MaximumCrawlDepth.Value > 0 && depth >= MaximumCrawlDepth.Value;
    if (!httpScheme || depthExceeded || !IsAllowedUrl(uri, referrer))
    {
        // A rejected root step means there is nothing to crawl at all.
        if (depth == 0)
        {
            StopCrawl();
        }

        return;
    }

    // Register returns false for urls already seen; skip duplicates silently.
    if (!m_CrawlerHistory.Register(uri.GetUrlKeyString(UriSensitivity)))
    {
        return;
    }

    // Make new crawl step
    CrawlStep crawlStep = new CrawlStep(uri, depth)
    {
        IsExternalUrl = IsExternalUrl(uri),
        IsAllowed = true,
    };

    m_CrawlerQueue.Push(new CrawlerQueueEntry
    {
        CrawlStep = crawlStep,
        Referrer = referrer,
        Properties = properties
    });

    m_Logger.Verbose("Added {0} to queue referred from {1}",
        crawlStep.Uri, referrer.IsNull() ? string.Empty : referrer.Uri.ToString());
    StartNew();
}
/// <summary>
/// Queue a new step on the crawler queue
/// </summary>
/// <param name="uri">url to crawl</param>
/// <param name="depth">depth of the url</param>
/// <param name="referrer">Step which the url was located</param>
/// <param name="properties">Custom properties</param>
public async Task AddStepAsync(Uri uri, int depth, CrawlStep referrer, Dictionary<string, object> properties)
{
    if (!this.m_Crawling)
    {
        throw new InvalidOperationException("Crawler must be running before adding steps");
    }

    if (this.m_CrawlStopped)
    {
        return;
    }

    var allowedReferrer = await this.m_CrawlerRules.IsAllowedUrlAsync(uri, referrer);

    // Use the framework scheme constants instead of the hard-coded "https"/"http"
    // strings, matching the synchronous AddStep.
    // NOTE(review): unlike AddStep, a duplicate url (Register returning false) at
    // depth 0 also stops the crawl here — confirm this asymmetry is intended.
    if ((uri.Scheme != Uri.UriSchemeHttps && uri.Scheme != Uri.UriSchemeHttp) || // Only accept http(s) schema
        (this.MaximumCrawlDepth.HasValue && this.MaximumCrawlDepth.Value > 0 && depth >= this.MaximumCrawlDepth.Value) ||
        !allowedReferrer ||
        !this.m_CrawlerHistory.Register(uri.GetUrlKeyString(this.UriSensitivity)))
    {
        if (depth == 0)
        {
            this.StopCrawl();
        }

        return;
    }

    // Make new crawl step
    var crawlStep = new CrawlStep(uri, depth)
    {
        IsExternalUrl = this.m_CrawlerRules.IsExternalUrl(uri),
        IsAllowed = true,
    };

    this.m_CrawlerQueue.Push(new CrawlerQueueEntry
    {
        CrawlStep = crawlStep,
        Referrer = referrer,
        Properties = properties
    });

    this.m_Logger.Verbose("Added {0} to queue referred from {1}",
        crawlStep.Uri, referrer.IsNull() ? string.Empty : referrer.Uri.ToString());
    this.ProcessQueue();
}
/// <summary>
/// Checks if the crawler should follow an url
/// </summary>
/// <param name="uri">Url to check</param>
/// <param name="referrer">Step from which the url was discovered</param>
/// <returns>True if the crawler should follow the url, else false</returns>
protected virtual bool IsAllowedUrl(Uri uri, CrawlStep referrer)
{
    // Reject over-long urls; limits of 10 or less are ignored.
    if (MaximumUrlSize.HasValue && MaximumUrlSize.Value > 10 && uri.ToString().Length > MaximumUrlSize.Value)
    {
        return false;
    }

    // An include-filter match whitelists the url outright, before any other rule.
    bool included = !IncludeFilter.IsNull() && IncludeFilter.Any(f => f.Match(uri, referrer));
    if (included)
    {
        return true;
    }

    // An exclude-filter match rejects it.
    bool excluded = !ExcludeFilter.IsNull() && ExcludeFilter.Any(f => f.Match(uri, referrer));
    if (excluded)
    {
        return false;
    }

    if (IsExternalUrl(uri))
    {
        return false;
    }

    // Finally defer to robots.txt when configured to respect it.
    return !AdhereToRobotRules || m_Robot.IsAllowed(UserAgent, uri);
}
/// <summary>
/// Compares this entry to another by their CrawlStep.
/// </summary>
/// <param name="other">Entry to compare against; null sorts before any instance.</param>
/// <returns>A negative, zero or positive value per the CrawlStep ordering.</returns>
public int CompareTo(CrawlerQueueEntry other)
{
    // Per the IComparable<T> contract any instance compares greater than null;
    // the original dereferenced other.CrawlStep and threw NullReferenceException.
    if (other == null)
    {
        return 1;
    }

    // CrawlStep can be null (GetHashCode guards for it); order null steps first.
    if (CrawlStep == null)
    {
        return other.CrawlStep == null ? 0 : -1;
    }

    return CrawlStep.CompareTo(other.CrawlStep);
}