/// <summary>
/// Starts the content crawl. This method retrieves all searchable content from the CMS and
/// calls the assigned indexer, which posts the content to the search server. The crawler runs
/// in two modes, selected by the crawlStartDate parameter.
///
/// Full Crawl: When crawlStartDate is null, all searchable content is retrieved from the CMS
/// and posted to the search server. During a full crawl the indexer compares what is posted
/// with what is currently in the index and deletes everything in the index that is not in the
/// current set of posted items.
///
/// Partial Crawl: When crawlStartDate is set, only content published on or after that date is
/// retrieved and posted to the search server. In addition, deleted and archived items are
/// removed from the search index.
/// </summary>
/// <param name="statusCallback">Callback used to report crawl progress.</param>
/// <param name="crawlStartDate">Publish-date cutoff for a partial crawl; null triggers a full crawl.</param>
/// <returns>The index results, including delete count and total duration.</returns>
public IndexResults RunCrawler(Global<T>.StatusCallBack statusCallback, DateTime? crawlStartDate)
{
    var dateStart = DateTime.Now;
    var fullCrawl = (crawlStartDate == null);

    var searchablePages = GetSearchablePages(ContentReference.RootPage, crawlStartDate);

    var results = fullCrawl
        ? _contentIndexer.RunFullIndex(searchablePages, statusCallback, IndexerCallback)
        : _contentIndexer.RunUpdate(searchablePages, statusCallback, IndexerCallback);

    if (!fullCrawl)
    {
        // A partial crawl must also purge content that was trashed or archived since the cutoff.
        results.DeleteCnt = _contentIndexer.Delete(GetTrashCanPages(ContentReference.RootPage, crawlStartDate));
        results.DeleteCnt += _contentIndexer.Delete(GetArchivedPages(ContentReference.RootPage, crawlStartDate));
    }

    results.Duration = DateTime.Now - dateStart;
    return results;
}
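// A minimal usage sketch, not from the original source: how a caller might trigger a full
// versus a partial crawl. The ContentCrawler<StandardPage> type, the StatusCallBack signature
// (assumed here to take a single status string), and the 24-hour cutoff are assumptions for
// illustration only.
public static void RunScheduledCrawl(ContentCrawler<StandardPage> crawler, bool fullCrawl)
{
    // Hypothetical progress handler; the real delegate signature may differ.
    Global<StandardPage>.StatusCallBack statusCallback = status => Console.WriteLine(status);

    // Passing null for crawlStartDate runs a full crawl; a date runs a partial crawl that
    // only picks up content published on or after that date.
    var results = crawler.RunCrawler(statusCallback, fullCrawl ? (DateTime?)null : DateTime.Now.AddDays(-1));

    Console.WriteLine("Crawl finished in {0}, {1} deletions", results.Duration, results.DeleteCnt);
}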
/// <summary>
/// Starts the web crawl. Beginning at the configured seed URL, the crawler follows links,
/// extracts searchable content from each page, and posts the results to the assigned indexer.
/// </summary>
/// <returns>The crawl results, including page, index, and error counts plus total duration.</returns>
public CrawlerResults Run()
{
    var results = new CrawlerResults { SourceId = _crawlSettings.SourceId };
    var startTime = DateTime.Now;

    LoggerInfo("Starting Web Crawl");

    BaseUrl = UrlParser.GetHostName(_crawlSettings.SeedUrl);
    BaseSchema = UrlParser.GetSchema(_crawlSettings.SeedUrl);

    var seedPageResp = HttpClient.GetRequest(_crawlSettings.SeedUrl);
    LoggerInfo(string.Format("Crawling {0}", _crawlSettings.SeedUrl));

    LinksProcessed = new List<string>();
    LinksToIndex = new List<string>();

    // Collect links from the seed page and process them, starting at depth 1.
    var links = GetLinks(seedPageResp);
    var depth = 1;
    ProcessLinks(links, depth);

    var searchableContent = new List<IWebCrawlPage>();
    foreach (var link in LinksToIndex)
    {
        LoggerDebug(string.Format("Extracting {0}", link));

        var page = ProcessPage(link);
        if (page != null)
        {
            searchableContent.Add(page);
        }

        // Stop the crawl early if the status callback signals it.
        if (HandleStatusCallBack())
        {
            return results;
        }

        // Throttle requests so the crawl does not overload the target site.
        System.Threading.Thread.Sleep(1000);
    }

    LoggerInfo("Running Indexer");
    var indexResults = _Indexer.RunUpdate(searchableContent, null, null);

    results.CrawlPages = searchableContent;
    results.CrawledCnt = searchableContent.Count;
    results.IndexedCnt = indexResults.TotalCnt;
    results.TotalCnt = indexResults.TotalCnt;
    results.ErrorCnt = indexResults.ErrorCnt;
    results.Duration = DateTime.Now - startTime;

    LoggerInfo("Web Crawler finished.");
    return results;
}
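// A minimal usage sketch, not from the original source: wiring up and running the web crawler.
// The WebCrawler and CrawlSettings type names and the settings-based constructor are
// assumptions; the SourceId and SeedUrl properties mirror the fields the Run method reads above.
public static void CrawlExternalSite()
{
    var settings = new CrawlSettings
    {
        SourceId = "external-site",           // tags indexed items with their source
        SeedUrl = "https://www.example.com/"  // crawl starts from this page's links
    };

    var crawler = new WebCrawler(settings);   // hypothetical constructor
    var results = crawler.Run();

    Console.WriteLine("Crawled {0} pages, indexed {1}, {2} errors, took {3}",
        results.CrawledCnt, results.IndexedCnt, results.ErrorCnt, results.Duration);
}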