/// <summary>
/// Kicks off a crawl from the configured start url by handing a single-item
/// queue to <c>CrawlLinks</c>.
/// </summary>
/// <param name="config">Job configuration; its <c>StartUrl</c> is used both as the id and the url of the first queue item.</param>
/// <param name="state">Job state that is passed through to <c>CrawlLinks</c>.</param>
/// <returns>Whatever <c>CrawlLinks</c> returns for the start item.</returns>
private SourceChanges InitialCrawl(WebConnectorJobConfiguration config, WebCrawlerJobState state)
{
    var startItem = new PageQueueItem
    {
        CreateDate = DateTime.UtcNow,
        Id = config.StartUrl,
        Url = new Uri(config.StartUrl)
    };
    var seedQueue = new List<PageQueueItem> { startItem };

    return CrawlLinks(config, seedQueue, state);
}
/// <summary>
/// Loads the stored state for the job named in <paramref name="config"/>.
/// When no state has been stored yet, the connector is reset and a fresh state
/// (flagged as <c>JobState.InitialCrawling</c>) is created and saved first.
/// </summary>
/// <param name="config">Job configuration; its <c>JobName</c> identifies the state to load.</param>
/// <returns>A new <c>WebCrawlerJobState</c> constructed from the loaded or newly created state.</returns>
private WebCrawlerJobState GetCurrentJobState(WebConnectorJobConfiguration config)
{
    var current = _stateService.LoadState(config.JobName);

    if (current == null)
    {
        // Nothing stored for this job yet: start from a clean slate.
        Log.Information($"{config.JobName}: Initializing fresh crawl");
        ResetConnector(config.JobName);

        current = new WebCrawlerJobState
        {
            InitDate = DateTime.UtcNow,
            Message = "Initializing..",
            State = JobState.InitialCrawling,
            LastExecutionDate = DateTime.UtcNow,
            Name = config.JobName,
        };

        _stateService.SaveState(current);
    }

    // Callers always receive a new instance built from the state, never the loaded object itself.
    return new WebCrawlerJobState(current);
}
/// <summary>
/// Works through a batch of queued links. A link whose stored page state says it
/// should be verified is downloaded and passed to <c>HandlePage</c>; any other link
/// is skipped and noted in the status message.
/// </summary>
/// <param name="config">Job configuration; supplies the job name and is passed on to the download and to <c>HandlePage</c>.</param>
/// <param name="queue">The links to handle in this batch.</param>
/// <param name="state">Job state whose <c>BatchCount</c> and <c>Message</c> are updated once the batch is done.</param>
/// <returns>
/// A <c>SourceChanges</c> built from the results of <c>HandlePage</c>,
/// or an empty <c>SourceChanges</c> when <paramref name="queue"/> has no items.
/// </returns>
private SourceChanges CrawlLinks(WebConnectorJobConfiguration config, IList<PageQueueItem> queue, WebCrawlerJobState state)
{
    if (!queue.Any())
    {
        return new SourceChanges();
    }

    var handledDocuments = new List<IDocument>(queue.Count);
    var report = new StringBuilder();

    foreach (var link in queue)
    {
        Log.Information($"Crawling {link.Id}");

        var storedState = _documentStateService.Get(config.JobName, link.Id).ToPageState();
        if (storedState.ShouldVerify())
        {
            var downloaded = _pageService.Download(link.Url, config);

            // The page uses e.g. a canonical url, so look up the DocumentState stored under that id instead.
            if (downloaded.Id != link.Id)
            {
                storedState = _documentStateService.Get(config.JobName, downloaded.Id).ToPageState();
                link.Depth = storedState?.Depth ?? link.Depth;
            }

            downloaded.Depth = link.Depth;

            var handled = HandlePage(downloaded, config, storedState, report);
            handledDocuments.Add(handled);
        }
        else
        {
            report.AppendLine($"{link.Id} (Skipped since it already exists, will be verified {storedState.VerifyDate})");
        }
    }

    var remaining = _queueService.Count(config.JobName);
    state.BatchCount = remaining;
    state.Message = $"Handled {queue.Count} pages ({remaining} left in queue): \r\n{report}";

    return new SourceChanges(handledDocuments);
}