Example #1
 public HttpResponseObject Download(Uri url, WebConnectorJobConfiguration config, NetworkCredential credential = null)
 {
     using (var handler = new HttpClientHandler()
     {
         Credentials = credential ?? CredentialCache.DefaultCredentials,
         AllowAutoRedirect = true
     })
     using (var client = new HttpClient(handler)
     {
         Timeout = _timeout
     })
     {
         try
         {
             var response = client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead).Result;
             if (!response.IsSuccessStatusCode)
             {
                 return new HttpResponseObject()
                 {
                     ConfigUsed = config,
                     RequestUrl = url,
                     StatusCode = response.StatusCode
                 };
             }
             return CreateResponseObject(response, config);
         }
         catch (Exception e)
         {
             Log.Logger.Error($"Error when crawling {url}: {e.Message}");
             throw; // rethrow preserving the original stack trace ("throw e;" would reset it)
         }
     }
 }
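
A note on this example: blocking on .Result can deadlock under a synchronization context (classic ASP.NET, UI threads). A minimal async sketch of the same method, assuming the HttpResponseObject, CreateResponseObject, and _timeout members shown in these examples:

 public async Task<HttpResponseObject> DownloadAsync(Uri url, WebConnectorJobConfiguration config, NetworkCredential credential = null)
 {
     using (var handler = new HttpClientHandler()
     {
         Credentials = credential ?? CredentialCache.DefaultCredentials,
         AllowAutoRedirect = true
     })
     using (var client = new HttpClient(handler) { Timeout = _timeout })
     {
         // Await instead of blocking, so no thread is pinned while waiting.
         var response = await client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead).ConfigureAwait(false);
         if (!response.IsSuccessStatusCode)
         {
             return new HttpResponseObject()
             {
                 ConfigUsed = config,
                 RequestUrl = url,
                 StatusCode = response.StatusCode
             };
         }
         return CreateResponseObject(response, config);
     }
 }
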
 private IDocument HandleNotAuthorizedPage(NotAuthorizedPage page, WebConnectorJobConfiguration config, StringBuilder information)
 {
     _documentStateService.Delete(config.JobName, page.Id);
     information.AppendLine($"{page.Id} (Page not authorized.)");
     Log.Warning($"Not authorized to see page {page.Url} ({page.Reason})");
     return new DeleteDocument(page.Id, config.JobName);
 }
Example #3
        private static void CreateCrawlBlogWebJob(IContainer container)
        {
            var config = new WebConnectorJobConfiguration()
            {
                StartUrl               = "http://blog.cwa.me.uk/",
                Credential             = null,
                DefaultVerifyFrequency = new Frequency()
                {
                    Minutes = 2
                },
                Depth   = 2,
                JobName = "TestWebJob",
                NumberOfPagesPerExecution = 10
            };
            var webJob = new Job()
            {
                ConcurrentLimit   = 1,
                Configuration     = config,
                Description       = "Testing",
                Enabled           = true,
                Name              = "TestWebJob",
                TriggerCronSyntax = "*/10 * * * * ?"
            };

            container.Resolve<IJobService>().SaveOrUpdate(webJob);
        }
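
The six-field TriggerCronSyntax (seconds field first, ? for day-of-week) is Quartz-style cron, so */10 * * * * ? fires every 10 seconds. The scheduler consuming it is not shown in these examples; if it is Quartz.NET, the expression can be inspected like this (a sketch, not confirmed by the source):

        // Assumes the Quartz.NET scheduler; only the raw cron string appears in the source.
        var cron = new Quartz.CronExpression("*/10 * * * * ?"); // sec min hour day-of-month month day-of-week
        var next = cron.GetNextValidTimeAfter(DateTimeOffset.UtcNow); // next fire, at most 10 seconds away
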
Example #4
        private static void CreateFileSystemJob(IContainer container)
        {
            var config = new WebConnectorJobConfiguration()
            {
                StartUrl               = "https://www.taqaglobal.com/investors/h1-2017-financial-results",
                Credential             = null,
                DefaultVerifyFrequency = new Frequency()
                {
                    Minutes = 2
                },
                Depth   = 1,
                JobName = "FileSystemJob",
                NumberOfPagesPerExecution = 10,
                PageFilter = new PageFilter()
                {
                    ExcludeBinaryPages = false
                }
            };
            var webJob = new Job()
            {
                ConcurrentLimit   = 1,
                Configuration     = config,
                Description       = "Testing",
                Enabled           = true,
                Name              = "FileSystemJob",
                TriggerCronSyntax = "*/10 * * * * ?"
            };

            container.Resolve<IJobService>().SaveOrUpdate(webJob);
        }
Example #5
        public Page Download(Uri url, WebConnectorJobConfiguration config)
        {
            var responseObject = _downloadHandler.Download(url, config, config.Credential.ToNetworkCredential());

            var pageHandler = _pageHandlers.FirstOrDefault(x => x.CanHandle(responseObject));

            return pageHandler == null
                ? new IgnoredPage(url.ToString(), $"No Page handler registered that supports {responseObject}")
                : pageHandler.Extract(responseObject);
        }
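
Note that config.Credential is null in the job configurations above, yet ToNetworkCredential() is called on it without a null check, which points to a null-tolerant extension method. A hypothetical sketch (the Credential type and its members are not shown in these examples):

        public static class CredentialExtensions
        {
            // Hypothetical: the UserName/Password members are assumed, not shown in the source.
            public static NetworkCredential ToNetworkCredential(this Credential credential)
            {
                return credential == null
                    ? null
                    : new NetworkCredential(credential.UserName, credential.Password);
            }
        }
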
        protected DateTime GetVerifyDate(HttpContentHeaders headers, WebConnectorJobConfiguration config)
        {
            //Example: Expires:Sun, 19 Nov 1978 05:00:00 GMT
            var expiresHead = headers.Expires;

            // If there is no Expires value, or it is more than a day in the past, fall back to the configured frequency
            if (!expiresHead.HasValue || expiresHead.Value.DateTime < DateTime.UtcNow.AddDays(-1))
            {
                return DateTime.UtcNow + config.DefaultVerifyFrequency.ToTimeSpan();
            }

            return expiresHead.Value.DateTime;
        }
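
The Frequency type only appears here as new Frequency() { Minutes = 2 } and via ToTimeSpan(); one plausible minimal shape consistent with that usage (the Days and Hours members are assumptions):

        public class Frequency
        {
            public int Days { get; set; }    // assumed, not shown in these examples
            public int Hours { get; set; }   // assumed, not shown in these examples
            public int Minutes { get; set; }

            public TimeSpan ToTimeSpan() => new TimeSpan(Days, Hours, Minutes, 0);
        }
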
 private SourceChanges InitialCrawl(WebConnectorJobConfiguration config, WebCrawlerJobState state)
 {
     return CrawlLinks(config,
                       new List<PageQueueItem>()
                       {
                           new PageQueueItem()
                           {
                               CreateDate = DateTime.UtcNow,
                               Id = config.StartUrl,
                               Url = new Uri(config.StartUrl)
                           }
                       }, state);
 }
        private IDocument HandleBinaryPage(BinaryPage page, WebConnectorJobConfiguration config, PageState pageState,
                                           StringBuilder information)
        {
            if (!pageState.PageIsChanged(page))
            {
                information.AppendLine($"{page.Id} (No change detected, skipping)");
                return new IgnoreDocument(page.Id, config.JobName, "No change detected, skipping");
            }

            Log.Information($"Adding BinaryPage {page.Id}");
            _documentStateService.UpdatePageState(page, config.JobName);
            information.AppendLine(pageState == null
                    ? $"{page.Id} (Adding new binary page)"
                    : $"{page.Id} (Adding updated binary page)");
            return _pageService.ConvertToDocument(page, config.JobName);
        }
        public SourceChanges ExecuteFetch(WebConnectorJobConfiguration config)
        {
            var timer = Stopwatch.StartNew();
            var state = GetCurrentJobState(config);

            state.LastExecutionDate = DateTime.UtcNow;

            var queue = _queueService.Pop(config.JobName, config.NumberOfPagesPerExecution).ToPageQueueItems();

            try
            {
                if (!queue.Any() && state.State == JobState.InitialCrawling)
                {
                    var changes = InitialCrawl(config, state);
                    state.State = JobState.IncrementalCrawling;
                    return changes;
                }
                if (!queue.Any())
                {
                    var docsToVerify = _documentStateService.PushVerifyDocumentsToQueue(config.JobName);
                    Log.Information($"{config.JobName}: {docsToVerify} documents that should be verified pushed to queue");
                    if (docsToVerify > 0)
                    {
                        queue = _queueService.Pop(config.JobName, config.NumberOfPagesPerExecution).ToPageQueueItems();
                    }
                }

                state.Status = JobStatus.Ok;
                return CrawlLinks(config, queue, state);
            }
            catch (Exception e)
            {
                var error = $"{config.JobName}: Error when crawling, {e.Message}. Pushing {queue.Count} queue items back to stack";
                Log.Error(e, error);
                state.SetErrorState(e);
                _queueService.Push(config.JobName, queue.ToQueueItems());
                state.BatchCount = _queueService.Count(config.JobName);
                return new SourceChanges();
            }
            finally
            {
                timer.Stop();
                state.LastExecutionTimeMs = (int)timer.ElapsedMilliseconds;
                _stateService.SaveState(state);
            }
        }
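
ExecuteFetch drives a two-phase state machine: a first pass that crawls from StartUrl, then incremental passes that work the queue and re-verify known pages. The JobState enum itself is not shown; the transitions above imply at least these values (a sketch):

        public enum JobState
        {
            InitialCrawling,     // first execution: crawl from config.StartUrl
            IncrementalCrawling  // later executions: pop the queue and verify known pages
        }
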
        private int HandleLinks(Page page, WebConnectorJobConfiguration config)
        {
            if (page.Depth >= config.Depth)
            {
                Log.Information($"{config.JobName}: {page.Url} is at depth {page.Depth} which is the maximum allowed depth ({config.Depth}), links from this page will be ignored");
                return 0;
            }
            var links      = _pageService.ScrapeLinks(page).ToList();
            var validLinks = _pageService.GetValidLinks(page, links, config.LinkFilter);

            var newLinks = _queueService.Push(
                config.JobName,
                validLinks.Select(link => link.ToQueueItem(page)).ToList());

            Log.Information($"{config.JobName}: Added {newLinks} new links (total valid links / total links: " +
                            $"{validLinks.Count}/{links.Count}) from {page.Url.AbsoluteUri} (depth: {page.Depth})");
            return newLinks;
        }
        /// <summary>
        /// Tries to get the current job state.
        /// If no state exists, a new job state is created, persisted, and returned.
        /// </summary>
        /// <param name="config">The job configuration whose state should be loaded.</param>
        private WebCrawlerJobState GetCurrentJobState(WebConnectorJobConfiguration config)
        {
            var state = _stateService.LoadState(config.JobName);

            if (state == null)
            {
                Log.Information($"{config.JobName}: Initializing fresh crawl");
                ResetConnector(config.JobName);
                state = new WebCrawlerJobState()
                {
                    InitDate          = DateTime.UtcNow,
                    Message           = "Initializing...",
                    State             = JobState.InitialCrawling,
                    LastExecutionDate = DateTime.UtcNow,
                    Name = config.JobName,
                };
                _stateService.SaveState(state);
            }
            return new WebCrawlerJobState(state);
        }
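
Returning new WebCrawlerJobState(state) instead of the loaded instance suggests a copy constructor, plausibly so that mutations made during the current execution only reach persistence via SaveState. A hypothetical sketch covering the members used in these examples:

        public WebCrawlerJobState(WebCrawlerJobState other)
        {
            // Hypothetical: copies only the members that appear in these examples.
            InitDate            = other.InitDate;
            Message             = other.Message;
            State               = other.State;
            Status              = other.Status;
            LastExecutionDate   = other.LastExecutionDate;
            LastExecutionTimeMs = other.LastExecutionTimeMs;
            BatchCount          = other.BatchCount;
            Name                = other.Name;
        }
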
        private IDocument HandleWebpage(WebPage page, WebConnectorJobConfiguration config, PageState pageState, StringBuilder information)
        {
            if (!pageState.PageIsChanged(page))
            {
                information.AppendLine($"{page.Id} (No change detected, skipping)");
                return new IgnoreDocument(page.Id, config.JobName, "No change detected, skipping");
            }

            var newLinkCount = HandleLinks(page, config);

            Log.Information($"Adding WebPage {page.Id}");
            _documentStateService.UpdatePageState(page, config.JobName);
            var linkInfo = page.Depth >= config.Depth
                    ? $"and ignoring links since the depth is {page.Depth}"
                    : $"with {newLinkCount} new links (with depth {page.Depth + 1})";

            information.AppendLine(pageState == null
                    ? $"{page.Id} (Adding new html page {linkInfo})"
                    : $"{page.Id} (Adding updated html page {linkInfo})");
            return _pageService.ConvertToDocument(page, config.JobName);
        }
        /// <summary>
        /// Handles the given queue items (links).
        /// Checks whether each link should be downloaded.
        /// </summary>
        /// <param name="config">The job configuration.</param>
        /// <param name="queue">The queue items to crawl.</param>
        /// <param name="state">The current job state, updated with batch info.</param>
        /// <returns>The resulting source changes.</returns>
        private SourceChanges CrawlLinks(WebConnectorJobConfiguration config, IList<PageQueueItem> queue, WebCrawlerJobState state)
        {
            if (!queue.Any())
            {
                return new SourceChanges();
            }

            var documents = new List<IDocument>(queue.Count);
            var info      = new StringBuilder();

            foreach (var queueItem in queue)
            {
                Log.Information($"Crawling {queueItem.Id}");
                var pageState = _documentStateService.Get(config.JobName, queueItem.Id).ToPageState();
                if (!pageState.ShouldVerify())
                {
                    info.AppendLine($"{queueItem.Id} (Skipped since it already exists, will be verified {pageState.VerifyDate})");
                    continue;
                }

                var page = _pageService.Download(queueItem.Url, config);
                if (page.Id != queueItem.Id) // the page uses e.g. a canonical URL; check for a DocumentState under that id
                {
                    pageState       = _documentStateService.Get(config.JobName, page.Id).ToPageState();
                    queueItem.Depth = pageState?.Depth ?? queueItem.Depth;
                }
                page.Depth = queueItem.Depth;

                documents.Add(HandlePage(page, config, pageState, info));
            }

            var queueCount = _queueService.Count(config.JobName);

            state.BatchCount = queueCount;
            state.Message    = $"Handled {queue.Count} pages ({queueCount} left in queue): \r\n{info}";
            return new SourceChanges(documents);
        }
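
CrawlLinks calls pageState.ShouldVerify() even though ToPageState() can return null (HandleWebpage checks pageState == null later), so ShouldVerify is plausibly a null-tolerant extension method keyed on the VerifyDate computed by GetVerifyDate. A sketch under that assumption:

        public static class PageStateExtensions
        {
            // Hypothetical: a null state means the page is unknown and should be crawled.
            public static bool ShouldVerify(this PageState state)
            {
                return state == null || state.VerifyDate <= DateTime.UtcNow;
            }
        }
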
 private IDocument HandlePage(Page page, WebConnectorJobConfiguration config, PageState pageState, StringBuilder information)
 {
     if (page is IgnoredPage ignoredPage)
     {
         return HandleIgnorePage(ignoredPage, config, information);
     }
     if (page is NotFoundPage notFoundPage)
     {
         return HandleNotFoundPage(notFoundPage, config, information);
     }
     if (page is NotAuthorizedPage notAuthorizedPage)
     {
         return HandleNotAuthorizedPage(notAuthorizedPage, config, information);
     }
     if (page is WebPage webPage)
     {
         return HandleWebpage(webPage, config, pageState, information);
     }
     if (page is BinaryPage binaryPage)
     {
         return HandleBinaryPage(binaryPage, config, pageState, information);
     }
     throw new NotSupportedException("No support for handling page of type " + page.GetType().Name);
 }
Example #15
        private HttpResponseObject CreateResponseObject(HttpResponseMessage response, WebConnectorJobConfiguration config)
        {
            Encoding encoding       = GetEncoding(response);
            var      responseObject = new HttpResponseObject()
            {
                RequestUrl  = response.RequestMessage.RequestUri,
                ConfigUsed  = config,
                ContentType = response.Content.Headers.ContentType.MediaType,
                Encoding    = encoding,
                Headers     = response.Content.Headers,
                StatusCode  = response.StatusCode,
            };

            using (var sr = response.Content.ReadAsStreamAsync().Result)
            {
                responseObject.Response = ReadFully(sr, response.Content.Headers.ContentLength);
            }

            return responseObject;
        }
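
ReadFully is not shown; given the call ReadFully(sr, response.Content.Headers.ContentLength), it plausibly drains the stream into a byte array, pre-sizing the buffer when the server reported a Content-Length. A sketch:

        private static byte[] ReadFully(Stream input, long? expectedLength)
        {
            // Pre-size the buffer when a Content-Length was reported.
            using (var ms = expectedLength.HasValue
                       ? new MemoryStream((int)expectedLength.Value)
                       : new MemoryStream())
            {
                input.CopyTo(ms);
                return ms.ToArray();
            }
        }
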
 private IDocument HandleNotFoundPage(NotFoundPage page, WebConnectorJobConfiguration config, StringBuilder information)
 {
     _documentStateService.Delete(config.JobName, page.Id);
     information.AppendLine($"{page.Id} (Page not found. Sending delete command)");
     return new DeleteDocument(page.Id, config.JobName);
 }
 private IDocument HandleIgnorePage(IgnoredPage page, WebConnectorJobConfiguration config, StringBuilder information)
 {
     information.AppendLine($"{page.Id} (Page is ignored: {page.Reason})");
     return new IgnoreDocument(page.Id, config.JobName, page.Reason);
 }