Example #1
0
 /// <summary>
 /// Combines the hash codes of <see cref="CrawlStep"/>, Properties and Referrer
 /// using the conventional 397 multiplier; null members contribute zero.
 /// </summary>
 /// <returns>A hash code consistent with the entry's component members</returns>
 public override int GetHashCode()
 {
     unchecked
     {
         int hash = CrawlStep == null ? 0 : CrawlStep.GetHashCode();
         hash = (hash * 397) ^ (Properties == null ? 0 : Properties.GetHashCode());
         hash = (hash * 397) ^ (Referrer == null ? 0 : Referrer.GetHashCode());
         return hash;
     }
 }
Example #2
0
        /// <summary>
        /// Executes OnDownloadException event
        /// </summary>
        private void OnDownloadException(Exception exception, CrawlStep crawlStep, CrawlStep referrer)
        {
            long downloadErrors = Interlocked.Increment(ref m_DownloadErrors);

            if (MaximumHttpDownloadErrors.HasValue && MaximumHttpDownloadErrors.Value > downloadErrors)
            {
                m_Logger.Error("Number of maximum failed downloads exceeded({0}), cancelling crawl", MaximumHttpDownloadErrors.Value);
                StopCrawl();
            }

            m_Logger.Error("Download exception while downloading {0}, error was {1}", crawlStep.Uri, exception);
            DownloadException.ExecuteEvent(this, () => new DownloadExceptionEventArgs(crawlStep, referrer, exception));
        }
Example #3
0
        /// <summary>
        /// Executes OnDownloadException event
        /// </summary>
        /// <summary>
        /// Executes OnDownloadException event: counts the failure, cancels the crawl
        /// once the configured error limit is exceeded, logs the error and raises
        /// the DownloadException event.
        /// </summary>
        /// <param name="exception">The exception thrown while downloading</param>
        /// <param name="crawlStep">The step whose download failed</param>
        /// <param name="referrer">The step from which <paramref name="crawlStep"/> was discovered</param>
        private void OnDownloadException(Exception exception, CrawlStep crawlStep, CrawlStep referrer)
        {
            var downloadErrors = Interlocked.Increment(ref this.m_DownloadErrors);

            // BUGFIX: the comparison was inverted (limit > errors), which cancelled the
            // crawl on the very first download error instead of when the configured
            // maximum number of errors was actually exceeded.
            if (this.MaximumHttpDownloadErrors.HasValue && downloadErrors > this.MaximumHttpDownloadErrors.Value)
            {
                this.m_Logger.Error("Number of maximum failed downloads exceeded({0}), cancelling crawl", this.MaximumHttpDownloadErrors.Value);
                this.StopCrawl();
            }

            this.m_Logger.Error("Download exception while downloading {0}, error was {1}", crawlStep.Uri, exception);
            DownloadException?.Invoke(this, new DownloadExceptionEventArgs(crawlStep, referrer, exception));
        }
Example #4
0
        /// <summary>
        /// Download content from a url
        /// </summary>
        /// <param name="step">Step in crawler that contains url to download</param>
        /// <returns>Downloaded content</returns>
        /// <summary>
        /// Download content from a url
        /// </summary>
        /// <param name="step">Step in crawler that contains url to download</param>
        /// <returns>Downloaded content, or null when the download failed</returns>
        private PropertyBag Download(CrawlStep step)
        {
            try
            {
                IWebDownloader downloader = m_DownloaderFactory.GetDownloader();
                m_Logger.Verbose("Downloading {0}", step.Uri);
                return downloader.Download(step, DownloadMethod.Get);
            }
            catch (Exception ex)
            {
                // Route the failure through the normal error pipeline; caller
                // treats a null result as "nothing downloaded".
                OnDownloadException(ex, step);
                return null;
            }
        }
Example #5
0
        /// <summary>
        /// Returns true to continue crawl of this url, else false
        /// </summary>
        /// <returns>True if this step should be cancelled, else false</returns>
        /// <summary>
        /// Returns true to continue crawl of this url, else false
        /// </summary>
        /// <returns>True if this step should be cancelled, else false</returns>
        private bool OnBeforeDownload(CrawlStep crawlStep)
        {
            // Snapshot the delegate so subscribers can't change between the
            // null check and the invocation.
            EventHandler <BeforeDownloadEventArgs> handler = BeforeDownload;

            if (!handler.IsNull())
            {
                BeforeDownloadEventArgs args = new BeforeDownloadEventArgs(!crawlStep.IsAllowed, crawlStep);
                handler(this, args);
                return !args.Cancel;
            }

            // No subscribers: fall back to the step's own permission flag.
            return crawlStep.IsAllowed;
        }
Example #6
0
        /// <summary>
        /// Returns true to continue crawl of this url, else false
        /// </summary>
        /// <returns>True if this step should be cancelled, else false</returns>
        /// <summary>
        /// Returns true to continue crawl of this url, else false
        /// </summary>
        /// <returns>True if this step should be cancelled, else false</returns>
        private bool OnAfterDownload(CrawlStep crawlStep, PropertyBag response)
        {
            // Snapshot the delegate so subscribers can't change between the
            // null check and the invocation.
            EventHandler <AfterDownloadEventArgs> handler = AfterDownload;

            if (!handler.IsNull())
            {
                AfterDownloadEventArgs args = new AfterDownloadEventArgs(!crawlStep.IsAllowed, response);
                handler(this, args);
                return !args.Cancel;
            }

            // No subscribers: fall back to the step's own permission flag.
            return crawlStep.IsAllowed;
        }
Example #7
0
        /// <summary>
        /// Queue a new step on the crawler queue
        /// </summary>
        /// <param name="uri">url to crawl</param>
        /// <param name="depth">depth of the url</param>
        /// <param name="referrer">Step which the url was located</param>
        /// <param name="properties">Custom properties</param>
        /// <summary>
        /// Queue a new step on the crawler queue
        /// </summary>
        /// <param name="uri">url to crawl</param>
        /// <param name="depth">depth of the url</param>
        /// <param name="referrer">Step which the url was located</param>
        /// <param name="properties">Custom properties</param>
        public void AddStep(Uri uri, int depth, CrawlStep referrer, Dictionary <string, object> properties)
        {
            if (!m_Crawling)
            {
                throw new InvalidOperationException("Crawler must be running before adding steps");
            }

            if (m_CrawlStopped)
            {
                return;
            }

            // Only http(s) urls within the configured depth limit are accepted;
            // the crawl rules get the final word.
            bool schemeAccepted = uri.Scheme == Uri.UriSchemeHttps || uri.Scheme == Uri.UriSchemeHttp;
            bool depthAccepted  = !MaximumCrawlDepth.HasValue || MaximumCrawlDepth.Value <= 0 || depth < MaximumCrawlDepth.Value;

            if (!schemeAccepted || !depthAccepted || !IsAllowedUrl(uri, referrer))
            {
                // A rejected root url means there is nothing left to crawl.
                if (depth == 0)
                {
                    StopCrawl();
                }

                return;
            }

            // Drop urls that were already seen (subject to the configured sensitivity).
            if (!m_CrawlerHistory.Register(uri.GetUrlKeyString(UriSensitivity)))
            {
                return;
            }

            // Make new crawl step
            CrawlStep crawlStep = new CrawlStep(uri, depth)
            {
                IsExternalUrl = IsExternalUrl(uri),
                IsAllowed     = true,
            };

            m_CrawlerQueue.Push(new CrawlerQueueEntry
            {
                CrawlStep  = crawlStep,
                Referrer   = referrer,
                Properties = properties
            });
            m_Logger.Verbose("Added {0} to queue referred from {1}",
                             crawlStep.Uri, referrer.IsNull() ? string.Empty : referrer.Uri.ToString());
            StartNew();
        }
Example #8
0
        /// <summary>
        ///     Queue a new step on the crawler queue
        /// </summary>
        /// <param name = "uri">url to crawl</param>
        /// <param name = "depth">depth of the url</param>
        /// <param name = "referrer">Step which the url was located</param>
        /// <param name = "properties">Custom properties</param>
        /// <summary>
        ///     Queue a new step on the crawler queue
        /// </summary>
        /// <param name = "uri">url to crawl</param>
        /// <param name = "depth">depth of the url</param>
        /// <param name = "referrer">Step which the url was located</param>
        /// <param name = "properties">Custom properties</param>
        public async Task AddStepAsync(Uri uri, int depth, CrawlStep referrer, Dictionary <string, object> properties)
        {
            if (!this.m_Crawling)
            {
                throw new InvalidOperationException("Crawler must be running before adding steps");
            }

            if (this.m_CrawlStopped)
            {
                return;
            }

            // NOTE: the rules check is awaited up front, before the cheap
            // scheme/depth checks — preserved from the original ordering.
            var allowedReferrer = await this.m_CrawlerRules.IsAllowedUrlAsync(uri, referrer);

            // Only http(s) urls within the configured depth limit that pass the
            // rules and were not seen before are accepted.
            bool schemeAccepted = uri.Scheme == "https" || uri.Scheme == "http";
            bool depthAccepted  = !this.MaximumCrawlDepth.HasValue || this.MaximumCrawlDepth.Value <= 0 || depth < this.MaximumCrawlDepth.Value;

            if (!schemeAccepted ||
                !depthAccepted ||
                !allowedReferrer ||
                !this.m_CrawlerHistory.Register(uri.GetUrlKeyString(this.UriSensitivity)))
            {
                // A rejected root url means there is nothing left to crawl.
                if (depth == 0)
                {
                    StopCrawl();
                }

                return;
            }

            // Make new crawl step
            var crawlStep = new CrawlStep(uri, depth)
            {
                IsExternalUrl = this.m_CrawlerRules.IsExternalUrl(uri),
                IsAllowed     = true,
            };

            this.m_CrawlerQueue.Push(new CrawlerQueueEntry
            {
                CrawlStep  = crawlStep,
                Referrer   = referrer,
                Properties = properties
            });
            this.m_Logger.Verbose("Added {0} to queue referred from {1}",
                                  crawlStep.Uri, referrer.IsNull() ? string.Empty : referrer.Uri.ToString());
            ProcessQueue();
        }
Example #9
0
        /// <summary>
        /// Checks if the crawler should follow an url
        /// </summary>
        /// <param name="uri">Url to check</param>
        /// <param name="referrer"></param>
        /// <returns>True if the crawler should follow the url, else false</returns>
        /// <summary>
        /// Checks if the crawler should follow an url
        /// </summary>
        /// <param name="uri">Url to check</param>
        /// <param name="referrer"></param>
        /// <returns>True if the crawler should follow the url, else false</returns>
        protected virtual bool IsAllowedUrl(Uri uri, CrawlStep referrer)
        {
            // Overlong urls are rejected outright (limit only honored above 10 chars).
            if (MaximumUrlSize.HasValue && MaximumUrlSize.Value > 10 && uri.ToString().Length > MaximumUrlSize.Value)
            {
                return false;
            }

            // An include-filter match wins over everything that follows.
            if (!IncludeFilter.IsNull())
            {
                if (IncludeFilter.Any(f => f.Match(uri, referrer)))
                {
                    return true;
                }
            }

            if (!ExcludeFilter.IsNull())
            {
                if (ExcludeFilter.Any(f => f.Match(uri, referrer)))
                {
                    return false;
                }
            }

            if (IsExternalUrl(uri))
            {
                return false;
            }

            // Finally defer to robots.txt when configured to do so.
            if (AdhereToRobotRules)
            {
                return m_Robot.IsAllowed(UserAgent, uri);
            }

            return true;
        }
Example #10
0
 /// <summary>
 /// Compares this entry to another by their <see cref="CrawlStep"/> members.
 /// </summary>
 /// <param name="other">The entry to compare against; may be null</param>
 /// <returns>
 /// A negative, zero or positive value per the <see cref="IComparable{T}"/>
 /// contract; any instance sorts after null.
 /// </returns>
 public int CompareTo(CrawlerQueueEntry other)
 {
     // BUGFIX: the IComparable<T> contract requires comparing greater than null
     // instead of throwing a NullReferenceException on other.CrawlStep.
     if (ReferenceEquals(other, null))
     {
         return 1;
     }

     return CrawlStep.CompareTo(other.CrawlStep);
 }