/// <summary>
        /// Starts the content crawl. This method retrieves all searchable content from the CMS
        /// and calls the assigned indexer, which posts the content to the search server. The crawler runs in
        /// two modes, selected by the crawlStartDate parameter.
        ///
        /// Full Crawl: When crawlStartDate is null, all searchable content is retrieved from the CMS
        /// and posted to the search server. During a full crawl the indexer compares what is
        /// posted with what is currently in the index and deletes everything in the index that is not in
        /// the current set of posted items.
        ///
        /// Partial Crawl: When a crawlStartDate is supplied, only content published on or after
        /// that date is retrieved and posted to the search server. In addition,
        /// deleted and archived items are removed from the search index.
        /// </summary>
        /// <returns>The results of the index run.</returns>
        public IndexResults RunCrawler(Global<T>.StatusCallBack statusCallback, DateTime? crawlStartDate)
        {
            var dateStart = DateTime.Now;

            var fullCrawl = (crawlStartDate == null);

            var searchablePages = GetSearchablePages(ContentReference.RootPage, crawlStartDate);

            var results = (fullCrawl) ?
                          _contentIndexer.RunFullIndex(searchablePages, statusCallback, IndexerCallback) :
                          _contentIndexer.RunUpdate(searchablePages, statusCallback, IndexerCallback);

            if (!fullCrawl)
            {
                results.DeleteCnt  = _contentIndexer.Delete(GetTrashCanPages(ContentReference.RootPage, crawlStartDate));
                results.DeleteCnt += _contentIndexer.Delete(GetArchivedPages(ContentReference.RootPage, crawlStartDate));
            }

            results.Duration = (DateTime.Now - dateStart);

            return results;
        }
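
A minimal caller sketch for the two crawl modes follows. The ContentCrawler type, its construction, and the string-based status callback are illustrative assumptions; only RunCrawler, its parameters, and the IndexResults fields used below come from the method above.

        // Hypothetical caller sketch: ContentCrawler stands in for whatever class exposes
        // RunCrawler, and the status callback is assumed to accept a single string message.
        public void RunScheduledCrawl(ContentCrawler crawler, bool fullCrawl)
        {
            // A null crawl start date triggers a full crawl (the index is reconciled against
            // everything posted); a date triggers a partial crawl of recently published content.
            DateTime? crawlStartDate = fullCrawl ? (DateTime?)null : DateTime.Now.AddDays(-1);

            var results = crawler.RunCrawler(message => Console.WriteLine(message), crawlStartDate);

            Console.WriteLine("Deleted {0} items in {1}", results.DeleteCnt, results.Duration);
        }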
Example #2
        /// <summary>
        /// Runs the web crawl. Starting from the configured seed URL, the crawler follows links,
        /// extracts searchable content from each page, and posts that content to the indexer.
        /// </summary>
        /// <returns>The results of the crawl, including the crawled pages, counts, and duration.</returns>
        public CrawlerResults Run()
        {
            var results = new CrawlerResults
            {
                SourceId = _crawlSettings.SourceId
            };

            var startTime = DateTime.Now;

            LoggerInfo("Starting Web Crawl");

            BaseUrl    = UrlParser.GetHostName(_crawlSettings.SeedUrl);
            BaseSchema = UrlParser.GetSchema(_crawlSettings.SeedUrl);

            var seedPageResp = HttpClient.GetRequest(_crawlSettings.SeedUrl);

            LoggerInfo(string.Format("Crawling {0}", _crawlSettings.SeedUrl));

            LinksProcessed = new List<string>();
            LinksToIndex   = new List<string>();

            var links = GetLinks(seedPageResp);
            var depth = 1;

            ProcessLinks(links, depth);

            var searchableContent = new List<IWebCrawlPage>();

            foreach (var link in LinksToIndex)
            {
                LoggerDebug(string.Format("Extracting {0}", link));

                var page = ProcessPage(link);

                if (page != null)
                {
                    searchableContent.Add(page);
                }

                if (HandleStatusCallBack())
                {
                    return results;
                }

                // Throttle requests so the crawl does not overload the target site.
                System.Threading.Thread.Sleep(1000);
            }

            LoggerInfo("Running Indexer");


            var indexResults = _Indexer.RunUpdate(searchableContent, null, null);

            results.CrawlPages = searchableContent;
            results.CrawledCnt = searchableContent.Count;
            results.IndexedCnt = indexResults.TotalCnt;
            results.TotalCnt   = indexResults.TotalCnt;
            results.ErrorCnt   = indexResults.ErrorCnt;
            results.Duration   = (DateTime.Now - startTime);

            LoggerInfo("Web Crawler finished.");

            return results;
        }
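
A usage sketch for the web crawler above. The WebCrawler and CrawlerSettings type names and the construction shown here are assumptions; SeedUrl and the CrawlerResults fields read at the end appear in Run() itself.

        // Hypothetical usage: construct the crawler with its settings, run it, and report the counts.
        public static void CrawlSite()
        {
            var settings = new CrawlerSettings
            {
                // Assumed property; Run() reads _crawlSettings.SeedUrl as the starting page.
                SeedUrl = "https://www.example.com/"
            };

            var crawler = new WebCrawler(settings);
            var results = crawler.Run();

            Console.WriteLine("Crawled {0} pages, indexed {1}, errors {2}, duration {3}",
                              results.CrawledCnt, results.IndexedCnt, results.ErrorCnt, results.Duration);
        }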