/// <summary>
/// Decides whether the document held in the property bag can be skipped instead of being added.
/// </summary>
/// <param name="propertyBag">Property bag that is queried for its "HtmlDoc" entry.</param>
/// <param name="updateCrawler">Crawler instance passed along in the "SiteCrawler:DocumentAnalyse" event arguments.</param>
/// <param name="id">Identifier forwarded unchanged to the HtmlDocument based overload.</param>
/// <param name="message">
/// Empty when nothing was evaluated, a fixed text when an event handler requested the skip,
/// otherwise whatever the HtmlDocument based overload assigned.
/// </param>
/// <returns>
/// <c>true</c> when an event handler or the HtmlDocument based overload indicates a skip;
/// <c>false</c> otherwise, including when the bag holds no usable HtmlDocument.
/// </returns>
protected bool EvaluateSkipConditions(PropertyBag propertyBag, UpdateContextAwareCrawler updateCrawler, string id, out String message)
{
    message = String.Empty;

    // Nothing to evaluate without a populated "HtmlDoc" entry.
    var docProperty = propertyBag["HtmlDoc"];
    if (docProperty == null || docProperty.Value == null)
    {
        return false;
    }

    // The entry must actually contain an HtmlDocument.
    var document = docProperty.Value as HtmlDocument;
    if (document == null)
    {
        return false;
    }

    // Give event subscribers the chance to inspect the document and veto it.
    var analyseArgs = new CrawlDocumentAnalyseEventArgs(updateCrawler, document);
    Event.RaiseEvent("SiteCrawler:DocumentAnalyse", analyseArgs);

    // A set Skip flag means a handler asked for this document to be skipped.
    if (analyseArgs.Skip)
    {
        message = "CrawlDocumentAnalyse Skip = true";
        return true;
    }

    // No handler objected: the document based overload makes the final call and sets the message.
    return EvaluateSkipConditions(document, id, out message);
}
/// <summary>
/// Executes a single crawl run for the urls returned by <c>GetTransformedUrls</c>.
/// </summary>
/// <remarks>
/// The run is skipped when <c>_isrunning</c> is already set. Before crawling, a backup of the
/// index directory is created and the documents matching the current <c>Tags</c> term are deleted.
/// Any exception thrown during the run is logged and not rethrown; the directory backup is then
/// restored. The backup is deleted afterwards unless <c>_cancelled</c> is set.
/// </remarks>
/// <param name="context">Index update context handed to <c>GetIndexWriter</c> and to the crawler instance.</param>
private void Crawl(IndexUpdateContext context)
{
    // Quick exit without taking the lock when a run is already in progress.
    if (_isrunning)
    {
        // The logger is treated as optional everywhere else in this method, so guard it here as well.
        if (_logger != null)
        {
            _logger.InfoFormat("Crawler is already running, aborting");
        }
        return;
    }

    lock (_runninglock)
    {
        // Re-check now that the lock is held.
        if (_isrunning)
        {
            if (_logger != null)
            {
                _logger.InfoFormat("Crawler is already running, aborting");
            }
            return;
        }

        // Resolve the directory BEFORE raising the running flag: this call is not covered by the
        // try/finally below, so a failure here must not leave _isrunning stuck at true.
        var dir = _directoryHelper.GetDirectoryName(_index);

        _isrunning = true;
        _cancelled = false;
        try
        {
            _directoryHelper.CreateDirectoryBackup(dir);

            // Remove the documents indexed under the current tags before re-crawling.
            GetIndexWriter(context).DeleteDocuments(new Term(BuiltinFields.Tags, ValueOrEmpty(Tags)));

            var runningContextId = ShortID.NewId();
            var urls = GetTransformedUrls().ToList();
            if (_logger != null)
            {
                urls.ForEach(url => _logger.InfoFormat("Starting url: {0}", url));
            }

            // Use the logging processor only when debug output is actually enabled.
            var documentProcessor = (_logger != null && _logger.IsDebugEnabled)
                ? new LogHtmlDocumentProcessor(_logger, _indexFilters, _followFilters)
                : new HtmlDocumentProcessor(_indexFilters, _followFilters);

            using (var c = new UpdateContextAwareCrawler(context, runningContextId, urls, new LogLoggerBridge(_logger), documentProcessor, this))
            {
                if (_logger != null)
                {
                    _logger.Info(String.Format("Crawler started: Using {0} threads", MaximumThreadCount));
                }

                c.AdhereToRobotRules = AdhereToRobotRules;
                c.MaximumThreadCount = MaximumThreadCount;
                c.UriSensitivity = UriSensitivity;

                // The limits below are only applied when a positive value is configured.
                if (MaximumCrawlDepth > 0)
                {
                    c.MaximumCrawlDepth = MaximumCrawlDepth;
                }
                if (MaximumDocuments > 0)
                {
                    c.MaximumCrawlCount = MaximumDocuments;
                }
                if (MaximumCrawlTime.TotalMinutes > 0)
                {
                    c.MaximumCrawlTime = MaximumCrawlTime;
                }

                c.UseCookies = UseCookies;
                c.ExcludeFilter = new[] { new RegexFilter(new Regex(RegexExcludeFilter)) };

                c.AfterDownload += CrawlerAfterDownload;
                c.PipelineException += CrawlerPipelineException;
                c.DownloadException += CrawlerDownloadException;
                c.Cancelled += CrawlerCancelled;

                Event.RaiseEvent("SiteCrawler:Started", new CrawlStartedEventArgs(c));
                c.Crawl();
                Event.RaiseEvent("SiteCrawler:Finished", new CrawlFinishedEventArgs(c));
            }
        }
        catch (Exception crawlException)
        {
            if (_logger != null)
            {
                _logger.Error(GetExceptionLog(crawlException).ToString());
            }

            // Roll back to the backup; once restored, clear the cancelled flag so the
            // finally block removes the backup directory.
            if (_directoryHelper.RestoreDirectoryBackup(dir))
            {
                _cancelled = false;
            }
        }
        finally
        {
            if (_logger != null)
            {
                _logger.Info("Crawler finished");
            }

            _isrunning = false;

            // The backup is kept when the run ended in the cancelled state.
            if (!_cancelled)
            {
                _directoryHelper.DeleteBackupDirectory(dir);
            }
        }
    }
}