Example #1
0
        //IPipelineStep.Process
        /// <summary>
        /// Pipeline step that indexes a crawled document: deletes any existing index entry
        /// for the page, then — depending on the HTTP status — adds an updated document,
        /// records a not-found, or aborts the crawl on fatal server errors.
        /// </summary>
        /// <param name="crawler">The running crawler; must be an <c>UpdateContextAwareCrawler</c>, otherwise this step is skipped.</param>
        /// <param name="propertyBag">Per-page crawl data (URI, HTTP status, crawl depth, downloaded content).</param>
        /// <exception cref="Exception">Any exception thrown while processing is logged and rethrown.</exception>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            try
            {
                // This step only works with crawlers that carry an index update context.
                var updateCrawler = crawler as UpdateContextAwareCrawler;
                if (updateCrawler == null)
                {
                    if (_logger != null)
                    {
                        _logger.Info("Crawler is not an UpdateContextAwareCrawler, we can't deal with this crawler");
                    }
                    return;
                }

                Uri currentUri = ExtractCurrentUri(propertyBag);

                // PathAndQuery doubles as the document id in the index.
                string id          = currentUri.PathAndQuery;
                String depthString = CreateDepthString(propertyBag.Step.Depth);

                if (_logger != null)
                {
                    _logger.Info(String.Format("{0}| Process | HTTP-{1} | {2}", depthString, propertyBag.StatusCode, id));
                }

                // The update context is shared across crawler threads; serialize all index writes.
                lock (updateCrawler.UpdateContext)
                {
                    // Always remove the stale entry first, regardless of the new status.
                    GetIndexWriter(updateCrawler.UpdateContext).DeleteDocuments(new Term(BuiltinFields.Path, id));
                    if (propertyBag.StatusCode == HttpStatusCode.OK)
                    {
                        //this should have been done by NCrawler, but let's do it here... could move this to separate crawlerrulesservice class, but then we'd have to download the content again
                        String message;
                        if (crawler.AdhereToRobotRules &&
                            EvaluateSkipConditions(propertyBag, updateCrawler, id, out message))
                        {
                            if (_logger != null)
                            {
                                _logger.Info(String.Format("{0}| Skipped | {1} | {2}", depthString, message, id));
                            }
                            return;
                        }

                        var document = CreateDocument(propertyBag, updateCrawler.RunningContextId, id);
                        updateCrawler.UpdateContext.AddDocument(document);

                        //Raise event that the given document is updated
                        Event.RaiseEvent("SiteCrawler:DocumentUpdated", new CrawlDocumentUpdatedEventArgs(updateCrawler, document));

                        if (_logger != null)
                        {
                            _logger.InfoFormat("{0}| Add/Update | {1}", depthString, id);
                        }
                    }
                    else if (propertyBag.StatusCode == HttpStatusCode.NotFound)
                    {
                        if (_logger != null)
                        {
                            _logger.InfoFormat("Crawler encountered 404 for [{0}]", id);
                        }
                        //Raise an event that the Document was not found
                        Event.RaiseEvent("SiteCrawler:DocumentNotFound",
                                         new CrawlDocumentErrorEventArgs(updateCrawler, id, propertyBag));
                    }
                    else if (propertyBag.StatusCode == HttpStatusCode.ServiceUnavailable)
                    {
                        // BUGFIX: guard the logger like every other branch — previously this
                        // threw a NullReferenceException when no logger was configured.
                        if (_logger != null)
                        {
                            _logger.WarnFormat("Crawler encountered status {0} ({1}) for document {2}, ABORTING CRAWLER!!",
                                               propertyBag.StatusCode.ToString(), propertyBag.StatusDescription, id);
                        }
                        Event.RaiseEvent("SiteCrawler:DocumentError", new CrawlDocumentErrorEventArgs(updateCrawler, id, propertyBag));
                        //server is shutting down or is too busy, abort the indexing!!
                        crawler.Cancel();
                    }
                    else
                    {
                        if (_logger != null)
                        {
                            _logger.WarnFormat("Crawler encountered status {0} ({1}) for document {2}",
                                               propertyBag.StatusCode.ToString(), propertyBag.StatusDescription, id);
                        }
                        //Raise an event that the document request returned an error
                        Event.RaiseEvent("SiteCrawler:DocumentError", new CrawlDocumentErrorEventArgs(updateCrawler, id, propertyBag));
                        // An error on the very first request means the whole site is unreachable.
                        if (propertyBag.Step.Depth == 0)
                        {
                            if (_logger != null)
                            {
                                _logger.Warn("ABORTING CRAWLER DUE TO ERROR ON FIRST REQUEST");
                            }
                            crawler.Cancel();
                        }
                    }
                }
            }
            catch (Exception crawlExeption)
            {
                if (_logger != null)
                {
                    _logger.Error(GetExceptionLog(crawlExeption).ToString());
                }

                // Rethrow with the original stack trace so the pipeline sees the failure.
                throw;
            }
        }