Пример #1
0
        /// <summary>
        /// Processes a single page: increments the crawled-page counter, attaches an
        /// examiner to the page and then hands the page to every inspector in turn.
        /// </summary>
        /// <param name="page">The page to process.</param>
        private void Callback(Page page)
        {
            Interlocked.Increment(ref _numberOfPagesCrawled);

            // If no examiner can be created the failure is logged and the inspectors are not run.
            try
            {
                var examiner = _examinerFactory.CreateExaminer(_log, page);
                page.Examiner = examiner;
            }
            catch (Exception creationFailure)
            {
                var failureMessage = new LogMessage(LogType.Error, "Uncaught exception when creating examiner", creationFailure, page.Uri);
                _log.Log(failureMessage);
                return;
            }

            foreach (var inspector in _inspectors)
            {
                // A failing inspector is logged and does not stop the remaining inspectors from running.
                try
                {
                    inspector.InspectPage(page);
                }
                catch (Exception inspectionFailure)
                {
                    var failureMessage = new LogMessage(LogType.Error, "Uncaught exception when calling InspectPage", inspectionFailure, page.Uri);
                    _log.Log(new InspectorLogMessage(inspector, failureMessage));
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Looks up the HTTP status code of every external link on the page and reports
        /// an error for each link whose status is not <see cref="HttpStatusCode.OK"/>.
        /// </summary>
        /// <remarks>
        /// Status codes are cached in <c>_checkedExternalLinks</c>; when a link is already
        /// cached, <c>GetHttpStatusCode</c> is not called for it again.
        /// </remarks>
        /// <param name="page">The page whose external links are checked.</param>
        private void CheckExternalLinks(Page page)
        {
            foreach (var externalLink in page.GetExternalLinks())
            {
                try
                {
                    // Single cache lookup instead of ContainsKey followed by the indexer.
                    HttpStatusCode httpStatusCode;
                    if (!_checkedExternalLinks.TryGetValue(externalLink, out httpStatusCode))
                    {
                        httpStatusCode = GetHttpStatusCode(externalLink);
                        _checkedExternalLinks.GetOrAdd(externalLink, httpStatusCode);
                    }

                    if (httpStatusCode != HttpStatusCode.OK)
                    {
                        Error(externalLink, page.Uri, httpStatusCode);
                    }
                }
                catch (Exception exception)
                {
                    // One failing link is reported and the loop moves on to the next link.
                    Error("Could not get page", exception, externalLink);
                }
            }
        }
 /// <summary>
 /// Writes an info message for every link of the page that equals an entry in
 /// <c>_targets</c>, provided info logging is enabled.
 /// </summary>
 /// <param name="page">The page whose links are examined.</param>
 public override void InspectPage(Page page)
 {
     foreach (var link in page.Examiner.Links)
     {
         var isTarget = _targets.Exists(target => target == link);
         if (!isTarget || !IsInfoEnabled)
         {
             continue;
         }

         Info($"{link} referenced from {page.Uri}");
     }
 }
 /// <summary>
 /// Writes an info message for every link of the page whose scheme equals an entry
 /// in <c>_schemes</c>, provided info logging is enabled.
 /// </summary>
 /// <param name="page">The page whose links are examined.</param>
 public override void InspectPage(Page page)
 {
     foreach (var link in page.Examiner.Links)
     {
         var schemeMatches = _schemes.Exists(scheme => scheme == link.Scheme);
         if (!schemeMatches || !IsInfoEnabled)
         {
             continue;
         }

         Info($"{link} found on {page.Uri}");
     }
 }
 /// <summary>
 /// Writes an info message for every external link of the page that is not contained
 /// in <c>_urisToIgnore</c>, provided info logging is enabled.
 /// </summary>
 /// <param name="page">The page whose external links are examined.</param>
 public override void InspectPage(Page page)
 {
     foreach (var externalLink in page.GetExternalLinks())
     {
         var isIgnored = _urisToIgnore.Contains(externalLink);
         if (isIgnored || !IsInfoEnabled)
         {
             continue;
         }

         Info($"{externalLink} found on {page.Uri}");
     }
 }
        /// <summary>
        /// Reports an error when the page's content byte size is greater than
        /// <c>_byteSize</c>; otherwise writes the same message at info level when info
        /// logging is enabled.
        /// </summary>
        /// <param name="page">The page whose content size is checked.</param>
        public override void InspectPage(Page page)
        {
            var exceedsLimit = page.ContentByteSize > _byteSize;
            if (exceedsLimit)
            {
                Error(GetMessage(page));
            }
            else if (IsInfoEnabled)
            {
                Info(GetMessage(page));
            }
        }
Пример #7
0
        /// <summary>
        /// Reports an error when the page's response status is not
        /// <see cref="HttpStatusCode.OK"/>. For an OK response the page's external links
        /// are checked when <c>_checkExternalLinks</c> is set.
        /// </summary>
        /// <param name="page">The page whose response is inspected.</param>
        public override void InspectPage(Page page)
        {
            if (page.Response.StatusCode != HttpStatusCode.OK)
            {
                Error(page.Uri, page.Referrer, page.Response.StatusCode);
                return;
            }

            if (_checkExternalLinks)
            {
                CheckExternalLinks(page);
            }
        }
        /// <summary>
        /// Computes the time between request start and request completion in whole
        /// milliseconds and reports an error when it is greater than <c>_milliseconds</c>;
        /// otherwise writes the same message at info level when info logging is enabled.
        /// </summary>
        /// <param name="page">The page whose load time is checked.</param>
        public override void InspectPage(Page page)
        {
            var duration = page.RequestCompletedOnUtc - page.RequestStartedOnUtc;

            // The cast truncates the fractional part of the millisecond count.
            var elapsedMilliseconds = (int) duration.TotalMilliseconds;

            if (elapsedMilliseconds > _milliseconds)
            {
                Error(GetMessage(page.Uri, elapsedMilliseconds));
            }
            else if (IsInfoEnabled)
            {
                Info(GetMessage(page.Uri, elapsedMilliseconds));
            }
        }
Пример #9
0
        /// <summary>
        /// Builds a <see cref="Page"/> from a <see cref="CrawledPage"/>.
        /// </summary>
        /// <remarks>
        /// The completion time is <c>DownloadContentCompleted</c> when it has a value and
        /// <c>RequestCompleted</c> otherwise. Both timestamps are converted to universal time.
        /// The byte size is the length of the content bytes, or 0 when there are none.
        /// </remarks>
        /// <param name="crawledPage">The crawled page to convert.</param>
        /// <returns>The newly created page.</returns>
        private Page Convert(CrawledPage crawledPage)
        {
            var completedOn = crawledPage.DownloadContentCompleted ?? crawledPage.RequestCompleted;
            var contentLength = crawledPage.Content.Bytes?.Length ?? 0;

            return new Page(
                crawledPage.Uri,
                crawledPage.ParentUri,
                crawledPage.RequestStarted.ToUniversalTime(),
                completedOn.ToUniversalTime(),
                Convert(crawledPage.HttpWebRequest),
                Convert(crawledPage.HttpWebResponse),
                crawledPage.Content.Text,
                contentLength);
        }
 /// <summary>
 /// Creates an <see cref="AngleSharpExaminer"/> from the given log and the page's URI and content.
 /// </summary>
 /// <param name="log">The log passed on to the examiner.</param>
 /// <param name="page">The page supplying the URI and content.</param>
 /// <returns>The new examiner.</returns>
 public IExaminer CreateExaminer(ILog log, Page page) =>
     new AngleSharpExaminer(log, page.Uri, page.Content);
 /// <summary>
 /// Formats a message stating the page's URI and its content size in bytes.
 /// </summary>
 /// <param name="page">The page to describe.</param>
 /// <returns>The formatted message.</returns>
 private string GetMessage(Page page) =>
     $"{page.Uri} is {page.ContentByteSize} bytes";
Пример #12
0
 /// <summary>
 /// Inspects the given page. Derived types supply the actual inspection logic.
 /// </summary>
 /// <param name="page">The page to inspect.</param>
 public abstract void InspectPage(Page page);