Example no. 1
        /// <summary>
        /// Evaluates whether the current document must be added to the index or whether it can be skipped.
        /// </summary>
        /// <param name="propertyBag">The property bag holding data extracted for the downloaded document.</param>
        /// <param name="updateCrawler">The crawler performing the current index update.</param>
        /// <param name="id">The identifier of the document being evaluated.</param>
        /// <param name="message">Set to the reason for skipping; empty when the document is not skipped.</param>
        /// <returns><c>true</c> if the document should be skipped; otherwise <c>false</c>.</returns>
        protected bool EvaluateSkipConditions(PropertyBag propertyBag, UpdateContextAwareCrawler updateCrawler, string id, out string message)
        {
            message = string.Empty;
            var htmlDocProp = propertyBag["HtmlDoc"];

            if (htmlDocProp != null && htmlDocProp.Value != null)
            {
                var htmlDoc = htmlDocProp.Value as HtmlDocument;
                if (htmlDoc != null)
                {
                    // Raise a custom event indicating that a document is being analysed
                    var args = new CrawlDocumentAnalyseEventArgs(updateCrawler, htmlDoc);
                    Event.RaiseEvent("SiteCrawler:DocumentAnalyse", args);
                    // When the Skip flag is set, the event handlers have indicated that this document should be skipped
                    if (args.Skip)
                    {
                        message = "CrawlDocumentAnalyse Skip = true";
                        return true;
                    }

                    if (EvaluateSkipConditions(htmlDoc, id, out message))
                    {
                        return true;
                    }
                }
            }
            return false;
        }
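
The "SiteCrawler:DocumentAnalyse" event raised above lets external handlers veto indexing of a document by setting args.Skip. A minimal sketch of such a handler follows, assuming standard Sitecore event wiring and that CrawlDocumentAnalyseEventArgs exposes the parsed HtmlAgilityPack document (the HtmlDocument property name and the noindex policy are illustrative assumptions, not part of the code above):

using System;
using HtmlAgilityPack;
using Sitecore.Events;

public class SkipNoIndexDocuments
{
    // Hypothetical handler; register it for "SiteCrawler:DocumentAnalyse" in the
    // <events> section of the Sitecore configuration.
    public void OnDocumentAnalyse(object sender, EventArgs args)
    {
        // Sitecore wraps RaiseEvent parameters; the first one is the args instance raised above.
        var analyseArgs = Event.ExtractParameter(args, 0) as CrawlDocumentAnalyseEventArgs;
        if (analyseArgs == null)
        {
            return;
        }

        // Illustrative policy: skip pages marked <meta name="robots" content="noindex">.
        // HtmlDocument is an assumed property name on CrawlDocumentAnalyseEventArgs.
        HtmlDocument htmlDoc = analyseArgs.HtmlDocument;
        var noIndex = htmlDoc.DocumentNode.SelectSingleNode(
            "//meta[@name='robots' and contains(@content, 'noindex')]");
        if (noIndex != null)
        {
            analyseArgs.Skip = true;
        }
    }
}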
Example no. 2
        private void Crawl(IndexUpdateContext context)
        {
            if (_isrunning)
            {
                if (_logger != null)
                {
                    _logger.Info("Crawler is already running, aborting");
                }
                return;
            }

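            // Double-checked locking: _isrunning is tested again inside the lock so that
            // two threads that both pass the unsynchronized check above cannot start concurrent crawls.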
            lock (_runninglock)
            {
                if (_isrunning)
                {
                    if (_logger != null)
                    {
                        _logger.Info("Crawler is already running, aborting");
                    }
                    return;
                }
                _isrunning = true;

                var dir = _directoryHelper.GetDirectoryName(_index);

                _cancelled = false;
                try
                {
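                    // Back up the index directory so it can be restored if the crawl fails.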
                    _directoryHelper.CreateDirectoryBackup(dir);
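                    // Remove everything previously indexed under this crawler's tag before re-crawling.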
                    GetIndexWriter(context).DeleteDocuments(new Term(BuiltinFields.Tags, ValueOrEmpty(Tags)));

                    var runningContextId = ShortID.NewId();
                    var urls             = GetTransformedUrls().ToList();
                    if (_logger != null)
                    {
                        urls.ForEach(url => _logger.InfoFormat("Starting url: {0}", url));
                    }

                    var documentProcessor = (_logger != null && _logger.IsDebugEnabled)
                            ? new LogHtmlDocumentProcessor(_logger, _indexFilters, _followFilters)
                            : new HtmlDocumentProcessor(_indexFilters, _followFilters);

                    using (var c = new UpdateContextAwareCrawler(context, runningContextId, urls, new LogLoggerBridge(_logger), documentProcessor, this))
                    {
                        if (_logger != null)
                        {
                            _logger.InfoFormat("Crawler started: Using {0} threads", MaximumThreadCount);
                        }
                        c.AdhereToRobotRules = AdhereToRobotRules;
                        c.MaximumThreadCount = MaximumThreadCount;
                        c.UriSensitivity     = UriSensitivity;

                        if (MaximumCrawlDepth > 0)
                        {
                            c.MaximumCrawlDepth = MaximumCrawlDepth;
                        }

                        if (MaximumDocuments > 0)
                        {
                            c.MaximumCrawlCount = MaximumDocuments;
                        }

                        if (MaximumCrawlTime.TotalMinutes > 0)
                        {
                            c.MaximumCrawlTime = MaximumCrawlTime;
                        }

                        c.UseCookies    = UseCookies;
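                        // URLs matching the configured exclude pattern will not be crawled.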
                        c.ExcludeFilter = new[]
                        {
                            new RegexFilter(new Regex(RegexExcludeFilter))
                        };

                        c.AfterDownload     += CrawlerAfterDownload;
                        c.PipelineException += CrawlerPipelineException;
                        c.DownloadException += CrawlerDownloadException;
                        c.Cancelled         += CrawlerCancelled;

                        Event.RaiseEvent("SiteCrawler:Started", new CrawlStartedEventArgs(c));

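                        // Blocks until the crawl finishes, is cancelled, or hits one of the limits configured above.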
                        c.Crawl();

                        Event.RaiseEvent("SiteCrawler:Finished", new CrawlFinishedEventArgs(c));
                    }
                }
                catch (Exception crawlException)
                {
                    if (_logger != null)
                    {
                        _logger.Error(GetExceptionLog(crawlException).ToString());
                    }
                    if (_directoryHelper.RestoreDirectoryBackup(dir))
                    {
                        _cancelled = false;
                    }
                }
                finally
                {
                    if (_logger != null)
                    {
                        _logger.Info("Crawler finished");
                    }
                    _isrunning = false;
                    if (!_cancelled)
                    {
                        _directoryHelper.DeleteBackupDirectory(dir);
                    }
                }
            }
        }
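
Note the backup lifecycle in Example no. 2: a backup of the index directory is taken before the destructive DeleteDocuments call, the backup is restored in the catch block when the crawl throws, and it is deleted in the finally block unless the crawl was cancelled, in which case the backup is kept.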