Code Example #1
        private SourceChanges InitialCrawl(WebConnectorJobConfiguration config, WebCrawlerJobState state)
        {
            // Seed the crawl with a single queue item pointing at the configured start URL.
            return CrawlLinks(config, new List<PageQueueItem>()
            {
                new PageQueueItem()
                {
                    CreateDate = DateTime.UtcNow,
                    Id         = config.StartUrl,
                    Url        = new Uri(config.StartUrl)
                }
            }, state);
        }
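The snippet above only exercises a handful of `PageQueueItem` members. A minimal sketch of the shape those usages imply, offered purely as an assumption for reference (the real type may well carry more members; `Depth` is inferred from its use in Code Example #3):

public class PageQueueItem
{
    public string   Id         { get; set; } // starts out as the URL string; may differ once a canonical URL is found
    public Uri      Url        { get; set; }
    public DateTime CreateDate { get; set; }
    public int      Depth      { get; set; } // link distance from the start URL (see Code Example #3)
}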
Code Example #2
        /// <summary>
        /// Tries to get the current job state.
        /// If no state exists, a new job state is created, initialized, and persisted.
        /// </summary>
        /// <param name="config">The job configuration whose name identifies the state to load.</param>
        /// <returns>A copy of the current job state.</returns>
        private WebCrawlerJobState GetCurrentJobState(WebConnectorJobConfiguration config)
        {
            var state = _stateService.LoadState(config.JobName);

            if (state == null)
            {
                Log.Information($"{config.JobName}: Initializing fresh crawl");
                ResetConnector(config.JobName);
                state = new WebCrawlerJobState()
                {
                    InitDate          = DateTime.UtcNow,
                    Message           = "Initializing...",
                    State             = JobState.InitialCrawling,
                    LastExecutionDate = DateTime.UtcNow,
                    Name              = config.JobName
                };
                _stateService.SaveState(state);
            }
            // Return a copy so callers cannot mutate the cached instance directly.
            return new WebCrawlerJobState(state);
        }
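The final statement hands back a copy rather than the loaded instance, so the caller's later mutations (the `BatchCount` and `Message` updates in Code Example #3) do not leak into whatever `_stateService` caches until the state is saved again. A copy constructor consistent with the properties shown here could look like the sketch below; this is an assumption, and the real type may copy more members:

public class WebCrawlerJobState
{
    public DateTime InitDate          { get; set; }
    public string   Message           { get; set; }
    public JobState State             { get; set; }
    public DateTime LastExecutionDate { get; set; }
    public string   Name              { get; set; }
    public int      BatchCount        { get; set; }

    public WebCrawlerJobState() { }            // still needed by the object initializer in Code Example #2

    public WebCrawlerJobState(WebCrawlerJobState other)
    {
        InitDate          = other.InitDate;
        Message           = other.Message;
        State             = other.State;
        LastExecutionDate = other.LastExecutionDate;
        Name              = other.Name;
        BatchCount        = other.BatchCount;  // updated by CrawlLinks in Code Example #3
    }
}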
Code Example #3
        /// <summary>
        /// Handles the given queue items (links).
        /// Checks whether each link should be downloaded.
        /// </summary>
        /// <param name="config">The job configuration.</param>
        /// <param name="queue">The queue items (links) to crawl.</param>
        /// <param name="state">The job state to update with progress information.</param>
        /// <returns>The resulting source changes, one document per crawled page.</returns>
        private SourceChanges CrawlLinks(WebConnectorJobConfiguration config, IList<PageQueueItem> queue, WebCrawlerJobState state)
        {
            if (!queue.Any())
            {
                return new SourceChanges();
            }

            var documents = new List<IDocument>(queue.Count);
            var info      = new StringBuilder();

            foreach (var queueItem in queue)
            {
                Log.Information($"Crawling {queueItem.Id}");
                var pageState = _documentStateService.Get(config.JobName, queueItem.Id).ToPageState();
                if (!pageState.ShouldVerify())
                {
                    // Already crawled and not yet due for re-verification, so skip it.
                    info.AppendLine($"{queueItem.Id} (Skipped since it already exists, will be verified {pageState.VerifyDate})");
                    continue;
                }

                var page = _pageService.Download(queueItem.Url, config);
                if (page.Id != queueItem.Id) // the page uses e.g. a canonical URL; check for a DocumentState under that id
                {
                    pageState       = _documentStateService.Get(config.JobName, page.Id).ToPageState();
                    queueItem.Depth = pageState?.Depth ?? queueItem.Depth;
                }
                page.Depth = queueItem.Depth;

                documents.Add(HandlePage(page, config, pageState, info));
            }

            var queueCount = _queueService.Count(config.JobName);

            state.BatchCount = queueCount;
            state.Message    = $"Handled {queue.Count} pages ({queueCount} left in queue): \r\n{info}";
            return new SourceChanges(documents);
        }
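The skip decision hinges on `PageState.ShouldVerify()`, which the code pairs with a `VerifyDate`: an existing page is only re-downloaded once its scheduled verification date has passed. One plausible reading, sketched as an assumption since the source does not show the method body (`Depth` is included because Code Example #3 reads it):

public class PageState
{
    public int      Depth      { get; set; }
    public DateTime VerifyDate { get; set; }

    public bool ShouldVerify()
    {
        // Re-verify once the scheduled verification date has passed;
        // until then the existing document is considered fresh and is skipped.
        return VerifyDate <= DateTime.UtcNow;
    }
}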