//IPipelineStep.Process
/// <summary>
/// Processes a crawled resource for an <c>UpdateContextAwareCrawler</c>: first deletes any
/// previously indexed document for the URL, then — depending on the HTTP status — re-indexes
/// the document (200), raises a not-found event (404), aborts the whole crawl (503), or raises
/// a generic error event (anything else). Crawlers that are not update-context-aware are ignored.
/// </summary>
/// <param name="crawler">The running crawler; only an <c>UpdateContextAwareCrawler</c> is processed.</param>
/// <param name="propertyBag">Per-request state: URI, HTTP status, crawl depth, response data.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    try
    {
        var updateCrawler = crawler as UpdateContextAwareCrawler;
        if (updateCrawler == null)
        {
            if (_logger != null)
            {
                _logger.Info("Crawler is not an UpdateContextAwareCrawler, we can't deal with this crawler");
            }
            return;
        }

        Uri currentUri = ExtractCurrentUri(propertyBag);
        string id = currentUri.PathAndQuery;
        String depthString = CreateDepthString(propertyBag.Step.Depth);
        if (_logger != null)
        {
            _logger.Info(String.Format("{0}| Process | HTTP-{1} | {2}", depthString, propertyBag.StatusCode, id));
        }

        // Serialize all index mutations: NCrawler may run pipeline steps concurrently,
        // but the update context must only be touched by one thread at a time.
        lock (updateCrawler.UpdateContext)
        {
            // Always remove any stale document first so a re-crawl never leaves duplicates in the index.
            GetIndexWriter(updateCrawler.UpdateContext).DeleteDocuments(new Term(BuiltinFields.Path, id));

            if (propertyBag.StatusCode == HttpStatusCode.OK)
            {
                // This should have been done by NCrawler, but let's do it here... could move this to a
                // separate crawler-rules-service class, but then we'd have to download the content again.
                String message;
                if (crawler.AdhereToRobotRules && EvaluateSkipConditions(propertyBag, updateCrawler, id, out message))
                {
                    if (_logger != null)
                    {
                        _logger.Info(String.Format("{0}| Skipped | {1} | {2}", depthString, message, id));
                    }
                    return;
                }

                var document = CreateDocument(propertyBag, updateCrawler.RunningContextId, id);
                updateCrawler.UpdateContext.AddDocument(document);

                // Raise event that the given document is updated.
                Event.RaiseEvent("SiteCrawler:DocumentUpdated", new CrawlDocumentUpdatedEventArgs(updateCrawler, document));
                if (_logger != null)
                {
                    _logger.InfoFormat("{0}| Add/Update | {1}", depthString, id);
                }
            }
            else if (propertyBag.StatusCode == HttpStatusCode.NotFound)
            {
                if (_logger != null)
                {
                    _logger.InfoFormat("Crawler encountered 404 for [{0}]", id);
                }

                // Raise an event that the document was not found.
                Event.RaiseEvent("SiteCrawler:DocumentNotFound", new CrawlDocumentErrorEventArgs(updateCrawler, id, propertyBag));
            }
            else if (propertyBag.StatusCode == HttpStatusCode.ServiceUnavailable)
            {
                // BUG FIX: this branch previously logged without the null guard used everywhere else,
                // which could throw a NullReferenceException precisely when the server is in trouble.
                if (_logger != null)
                {
                    _logger.WarnFormat("Crawler encountered status {0} ({1}) for document {2}, ABORTING CRAWLER!!", propertyBag.StatusCode.ToString(), propertyBag.StatusDescription, id);
                }
                Event.RaiseEvent("SiteCrawler:DocumentError", new CrawlDocumentErrorEventArgs(updateCrawler, id, propertyBag));

                // Server is shutting down or is too busy: abort the indexing.
                crawler.Cancel();
            }
            else
            {
                if (_logger != null)
                {
                    _logger.WarnFormat("Crawler encountered status {0} ({1}) for document {2}", propertyBag.StatusCode.ToString(), propertyBag.StatusDescription, id);
                }

                // Raise an event that the document request returned an error.
                Event.RaiseEvent("SiteCrawler:DocumentError", new CrawlDocumentErrorEventArgs(updateCrawler, id, propertyBag));

                // A failure on the very first request means the whole crawl is pointless — stop it.
                if (propertyBag.Step.Depth == 0)
                {
                    if (_logger != null)
                    {
                        _logger.Warn("ABORTING CRAWLER DUE TO ERROR ON FIRST REQUEST");
                    }
                    crawler.Cancel();
                }
            }
        }
    }
    catch (Exception crawlException)
    {
        if (_logger != null)
        {
            _logger.Error(GetExceptionLog(crawlException).ToString());
        }
        // Bare rethrow preserves the original stack trace for NCrawler's pipeline error handling.
        throw;
    }
}