public HttpResponseObject Download(Uri url, WebConnectorJobConfiguration config, NetworkCredential credential = null)
{
    using (var handler = new HttpClientHandler()
    {
        Credentials = credential ?? CredentialCache.DefaultCredentials,
        AllowAutoRedirect = true
    })
    using (var client = new HttpClient(handler) { Timeout = _timeout })
    {
        try
        {
            // Blocks on the async call; headers are read first so large bodies
            // are only streamed when the status code indicates success.
            var response = client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead).Result;
            if (!response.IsSuccessStatusCode)
            {
                return new HttpResponseObject()
                {
                    ConfigUsed = config,
                    RequestUrl = url,
                    StatusCode = response.StatusCode
                };
            }

            return CreateResponseObject(response, config);
        }
        catch (Exception e)
        {
            Log.Logger.Error($"Error when crawling {url}: {e.Message}");
            throw; // rethrow without resetting the stack trace
        }
    }
}
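// HttpResponseObject itself is not shown in this listing. A minimal sketch of
// its shape, inferred from how it is constructed and read in this file
// (property types are assumptions based on the assigned values):
public class HttpResponseObject
{
    public Uri RequestUrl { get; set; }
    public WebConnectorJobConfiguration ConfigUsed { get; set; }
    public string ContentType { get; set; }
    public Encoding Encoding { get; set; }
    public HttpContentHeaders Headers { get; set; }
    public HttpStatusCode StatusCode { get; set; }
    public byte[] Response { get; set; }
}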
private IDocument HandleNotAuthorizedPage(NotAuthorizedPage page, WebConnectorJobConfiguration config, StringBuilder information)
{
    _documentStateService.Delete(config.JobName, page.Id);
    information.AppendLine($"{page.Id} (Page not authorized. Sending delete command)");
    Log.Warning($"Not authorized to see page {page.Url} ({page.Reason})");
    return new DeleteDocument(page.Id, config.JobName);
}
private static void CreateCrawlBlogWebJob(IContainer container)
{
    var config = new WebConnectorJobConfiguration()
    {
        StartUrl = "http://blog.cwa.me.uk/",
        Credential = null,
        DefaultVerifyFrequency = new Frequency() { Minutes = 2 },
        Depth = 2,
        JobName = "TestWebJob",
        NumberOfPagesPerExecution = 10
    };
    var webJob = new Job()
    {
        ConcurrentLimit = 1,
        Configuration = config,
        Description = "Testing",
        Enabled = true,
        Name = "TestWebJob",
        TriggerCronSyntax = "*/10 * * * * ?"
    };
    container.Resolve<IJobService>().SaveOrUpdate(webJob);
}
private static void CreateFileSystemJob(IContainer container)
{
    var config = new WebConnectorJobConfiguration()
    {
        StartUrl = "https://www.taqaglobal.com/investors/h1-2017-financial-results",
        Credential = null,
        DefaultVerifyFrequency = new Frequency() { Minutes = 2 },
        Depth = 1,
        JobName = "FileSystemJob",
        NumberOfPagesPerExecution = 10,
        PageFilter = new PageFilter() { ExcludeBinaryPages = false }
    };
    var webJob = new Job()
    {
        ConcurrentLimit = 1,
        Configuration = config,
        Description = "Testing",
        Enabled = true,
        Name = "FileSystemJob",
        TriggerCronSyntax = "*/10 * * * * ?"
    };
    container.Resolve<IJobService>().SaveOrUpdate(webJob);
}
public Page Download(Uri url, WebConnectorJobConfiguration config)
{
    var responseObject = _downloadHandler.Download(url, config, config.Credential.ToNetworkCredential());
    var pageHandler = _pageHandlers.FirstOrDefault(x => x.CanHandle(responseObject));
    return pageHandler == null
        ? new IgnoredPage(url.ToString(), $"No page handler registered that supports {responseObject}")
        : pageHandler.Extract(responseObject);
}
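// ToNetworkCredential is not shown in this listing. Since job configurations
// may set Credential = null (see the job setup methods above), it is
// presumably a null-tolerant extension method. A minimal sketch, assuming the
// credential type stores plain user name/password/domain strings (the
// Credential type name and its members here are assumed):
public static class CredentialExtensions
{
    public static NetworkCredential ToNetworkCredential(this Credential credential)
    {
        // A null result falls back to CredentialCache.DefaultCredentials
        // inside the download handler, so null simply maps to null here.
        if (credential == null)
        {
            return null;
        }

        return new NetworkCredential(credential.UserName, credential.Password, credential.Domain);
    }
}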
protected DateTime GetVerifyDate(HttpContentHeaders headers, WebConnectorJobConfiguration config)
{
    // Example: Expires: Sun, 19 Nov 1978 05:00:00 GMT
    var expiresHeader = headers.Expires;

    // Fall back to the configured frequency when the header is missing
    // or its value is more than a day in the past.
    if (!expiresHeader.HasValue || expiresHeader.Value.UtcDateTime < DateTime.UtcNow.AddDays(-1))
    {
        return DateTime.UtcNow + config.DefaultVerifyFrequency.ToTimeSpan();
    }

    return expiresHeader.Value.UtcDateTime;
}
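// Frequency.ToTimeSpan is not shown in this listing. Given that the job setup
// methods above only populate a Minutes property, a minimal sketch could be
// (additional units, if any exist, are unknown and omitted):
public class Frequency
{
    public int Minutes { get; set; }

    public TimeSpan ToTimeSpan()
    {
        return TimeSpan.FromMinutes(Minutes);
    }
}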
private SourceChanges InitialCrawl(WebConnectorJobConfiguration config, WebCrawlerJobState state)
{
    return CrawlLinks(config, new List<PageQueueItem>()
    {
        new PageQueueItem()
        {
            CreateDate = DateTime.UtcNow,
            Id = config.StartUrl,
            Url = new Uri(config.StartUrl)
        }
    }, state);
}
private IDocument HandleBinaryPage(BinaryPage page, WebConnectorJobConfiguration config, PageState pageState, StringBuilder information)
{
    if (!pageState.PageIsChanged(page))
    {
        information.AppendLine($"{page.Id} (No change detected, skipping)");
        return new IgnoreDocument(page.Id, config.JobName, "No change detected, skipping");
    }

    Log.Information($"Adding BinaryPage {page.Id}");
    _documentStateService.UpdatePageState(page, config.JobName);
    information.AppendLine(pageState == null
        ? $"{page.Id} (Adding new binary page)"
        : $"{page.Id} (Adding updated binary page)");
    return _pageService.ConvertToDocument(page, config.JobName);
}
public SourceChanges ExecuteFetch(WebConnectorJobConfiguration config)
{
    var timer = Stopwatch.StartNew();
    var state = GetCurrentJobState(config);
    state.LastExecutionDate = DateTime.UtcNow;
    var queue = _queueService.Pop(config.JobName, config.NumberOfPagesPerExecution).ToPageQueueItems();
    try
    {
        // An empty queue during the initial crawl means we start from the StartUrl.
        if (!queue.Any() && state.State == JobState.InitialCrawling)
        {
            var changes = InitialCrawl(config, state);
            state.State = JobState.IncrementalCrawling;
            return changes;
        }

        // An empty queue during incremental crawling means we re-verify known documents.
        if (!queue.Any())
        {
            var docsToVerify = _documentStateService.PushVerifyDocumentsToQueue(config.JobName);
            Log.Information($"{config.JobName}: {docsToVerify} documents that should be verified pushed to queue");
            if (docsToVerify > 0)
            {
                queue = _queueService.Pop(config.JobName, config.NumberOfPagesPerExecution).ToPageQueueItems();
            }
        }

        state.Status = JobStatus.Ok;
        return CrawlLinks(config, queue, state);
    }
    catch (Exception e)
    {
        var error = $"{config.JobName}: Error when crawling, {e.Message}. Pushing {queue.Count} queue items back to stack";
        Log.Error(e, error);
        state.SetErrorState(e);
        _queueService.Push(config.JobName, queue.ToQueueItems());
        state.BatchCount = _queueService.Count(config.JobName);
        return new SourceChanges();
    }
    finally
    {
        timer.Stop();
        state.LastExecutionTimeMs = (int)timer.ElapsedMilliseconds;
        _stateService.SaveState(state);
    }
}
private int HandleLinks(Page page, WebConnectorJobConfiguration config)
{
    if (page.Depth >= config.Depth)
    {
        Log.Information($"{config.JobName}: {page.Url} is at depth {page.Depth}, which is the maximum allowed depth ({config.Depth}); links from this page will be ignored");
        return 0;
    }

    var links = _pageService.ScrapeLinks(page).ToList();
    var validLinks = _pageService.GetValidLinks(page, links, config.LinkFilter);
    var newLinks = _queueService.Push(
        config.JobName,
        validLinks.Select(link => link.ToQueueItem(page)).ToList());
    Log.Information($"{config.JobName}: Added {newLinks} new links (valid links / total links: " +
                    $"{validLinks.Count}/{links.Count}) from {page.Url.AbsoluteUri} (depth: {page.Depth})");
    return newLinks;
}
/// <summary>
/// Tries to get the current job state.
/// If no state exists, a new job state is created and initialized.
/// </summary>
/// <param name="config"></param>
private WebCrawlerJobState GetCurrentJobState(WebConnectorJobConfiguration config)
{
    var state = _stateService.LoadState(config.JobName);
    if (state == null)
    {
        Log.Information($"{config.JobName}: Initializing fresh crawl");
        ResetConnector(config.JobName);
        state = new WebCrawlerJobState()
        {
            InitDate = DateTime.UtcNow,
            Message = "Initializing...",
            State = JobState.InitialCrawling,
            LastExecutionDate = DateTime.UtcNow,
            Name = config.JobName,
        };
        _stateService.SaveState(state);
    }

    return new WebCrawlerJobState(state);
}
private IDocument HandleWebpage(WebPage page, WebConnectorJobConfiguration config, PageState pageState, StringBuilder information)
{
    if (!pageState.PageIsChanged(page))
    {
        information.AppendLine($"{page.Id} (No change detected, skipping)");
        return new IgnoreDocument(page.Id, config.JobName, "No change detected, skipping");
    }

    var newLinkCount = HandleLinks(page, config);
    Log.Information($"Adding WebPage {page.Id}");
    _documentStateService.UpdatePageState(page, config.JobName);
    var linkInfo = page.Depth >= config.Depth
        ? $"and ignoring links since the depth is {page.Depth}"
        : $"with {newLinkCount} new links (with depth {page.Depth + 1})";
    information.AppendLine(pageState == null
        ? $"{page.Id} (Adding new html page {linkInfo})"
        : $"{page.Id} (Adding updated html page {linkInfo})");
    return _pageService.ConvertToDocument(page, config.JobName);
}
/// <summary>
/// Handles the given queue items (links)
/// and checks whether each link should be downloaded.
/// </summary>
/// <param name="config"></param>
/// <param name="queue"></param>
/// <param name="state"></param>
/// <returns></returns>
private SourceChanges CrawlLinks(WebConnectorJobConfiguration config, IList<PageQueueItem> queue, WebCrawlerJobState state)
{
    if (!queue.Any())
    {
        return new SourceChanges();
    }

    var documents = new List<IDocument>(queue.Count);
    var info = new StringBuilder();
    foreach (var queueItem in queue)
    {
        Log.Information($"Crawling {queueItem.Id}");
        var pageState = _documentStateService.Get(config.JobName, queueItem.Id).ToPageState();
        if (!pageState.ShouldVerify())
        {
            info.AppendLine($"{queueItem.Id} (Skipped since it already exists, will be verified {pageState.VerifyDate})");
            continue;
        }

        var page = _pageService.Download(queueItem.Url, config);
        if (page.Id != queueItem.Id) // the page uses e.g. a canonical URL; check for a DocumentState under that id
        {
            pageState = _documentStateService.Get(config.JobName, page.Id).ToPageState();
            queueItem.Depth = pageState?.Depth ?? queueItem.Depth;
        }

        page.Depth = queueItem.Depth;
        documents.Add(HandlePage(page, config, pageState, info));
    }

    var queueCount = _queueService.Count(config.JobName);
    state.BatchCount = queueCount;
    state.Message = $"Handled {queue.Count} pages ({queueCount} left in queue): \r\n{info}";
    return new SourceChanges(documents);
}
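// ShouldVerify and PageIsChanged are invoked on a pageState that can be null
// (HandleWebpage and HandleBinaryPage later check pageState == null), so they
// are presumably null-tolerant extension methods. A minimal sketch of the
// intended semantics; the change check via a Hash property is an assumption:
public static class PageStateExtensions
{
    // A page with no recorded state, or whose verify date has passed,
    // should be downloaded and verified again.
    public static bool ShouldVerify(this PageState pageState)
    {
        return pageState == null || pageState.VerifyDate <= DateTime.UtcNow;
    }

    // A page with no recorded state counts as changed; otherwise compare a
    // change marker, e.g. a content hash, against the stored state.
    public static bool PageIsChanged(this PageState pageState, Page page)
    {
        return pageState == null || pageState.Hash != page.Hash;
    }
}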
private IDocument HandlePage(Page page, WebConnectorJobConfiguration config, PageState pageState, StringBuilder information)
{
    switch (page)
    {
        case IgnoredPage ignoredPage:
            return HandleIgnorePage(ignoredPage, config, information);
        case NotFoundPage notFoundPage:
            return HandleNotFoundPage(notFoundPage, config, information);
        case NotAuthorizedPage notAuthorizedPage:
            return HandleNotAuthorizedPage(notAuthorizedPage, config, information);
        case WebPage webPage:
            return HandleWebpage(webPage, config, pageState, information);
        case BinaryPage binaryPage:
            return HandleBinaryPage(binaryPage, config, pageState, information);
        default:
            throw new NotSupportedException("No support for handling page of type " + page.GetType().Name);
    }
}
private HttpResponseObject CreateResponseObject(HttpResponseMessage response, WebConnectorJobConfiguration config)
{
    var encoding = GetEncoding(response);
    var responseObject = new HttpResponseObject()
    {
        RequestUrl = response.RequestMessage.RequestUri,
        ConfigUsed = config,
        ContentType = response.Content.Headers.ContentType.MediaType,
        Encoding = encoding,
        Headers = response.Content.Headers,
        StatusCode = response.StatusCode,
    };
    using (var stream = response.Content.ReadAsStreamAsync().Result)
    {
        responseObject.Response = ReadFully(stream, response.Content.Headers.ContentLength);
    }

    return responseObject;
}
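// ReadFully is not shown in this listing. A minimal sketch, assuming it drains
// the stream into a byte array and uses the Content-Length header (when
// present) only as an initial capacity hint:
private static byte[] ReadFully(Stream stream, long? expectedLength)
{
    var capacity = expectedLength.HasValue && expectedLength.Value > 0
        ? (int)expectedLength.Value
        : 0;
    using (var memoryStream = new MemoryStream(capacity))
    {
        stream.CopyTo(memoryStream);
        return memoryStream.ToArray();
    }
}

// GetEncoding is likewise not shown; a minimal sketch, assuming it falls back
// to UTF-8 when the response declares no (or an unknown) charset:
private static Encoding GetEncoding(HttpResponseMessage response)
{
    var charset = response.Content.Headers.ContentType?.CharSet;
    if (string.IsNullOrEmpty(charset))
    {
        return Encoding.UTF8;
    }

    try
    {
        return Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        return Encoding.UTF8;
    }
}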
private IDocument HandleNotFoundPage(NotFoundPage page, WebConnectorJobConfiguration config, StringBuilder information)
{
    _documentStateService.Delete(config.JobName, page.Id);
    information.AppendLine($"{page.Id} (Page not found. Sending delete command)");
    return new DeleteDocument(page.Id, config.JobName);
}
private IDocument HandleIgnorePage(IgnoredPage page, WebConnectorJobConfiguration config, StringBuilder information)
{
    information.AppendLine($"{page.Id} (Page is ignored: {page.Reason})");
    return new IgnoreDocument(page.Id, config.JobName, page.Reason);
}