/// <summary>
/// Processes one crawled page: bumps the crawled-page counter, attaches an examiner
/// to the page and then hands the page to every registered inspector.
/// </summary>
/// <param name="page">The page that was just crawled.</param>
private void Callback(Page page)
{
    // Atomic increment of the crawled-page counter.
    Interlocked.Increment(ref _numberOfPagesCrawled);

    try
    {
        page.Examiner = _examinerFactory.CreateExaminer(_log, page);
    }
    catch (Exception examinerFailure)
    {
        // Inspectors are skipped for this page when no examiner could be created.
        var failureMessage = new LogMessage(
            LogType.Error,
            "Uncaught exception when creating examiner",
            examinerFailure,
            page.Uri);
        _log.Log(failureMessage);
        return;
    }

    foreach (var current in _inspectors)
    {
        try
        {
            current.InspectPage(page);
        }
        catch (Exception inspectionFailure)
        {
            // A failing inspector is logged and does not stop the remaining inspectors.
            var failureMessage = new LogMessage(
                LogType.Error,
                "Uncaught exception when calling InspectPage",
                inspectionFailure,
                page.Uri);
            _log.Log(new InspectorLogMessage(current, failureMessage));
        }
    }
}
/// <summary>
/// Checks every external link of <paramref name="page"/> and reports each one whose
/// HTTP status code is not <see cref="HttpStatusCode.OK"/>. Status codes are cached in
/// <c>_checkedExternalLinks</c> so each link is requested at most once per cache entry.
/// </summary>
/// <param name="page">The page whose external links are checked.</param>
private void CheckExternalLinks(Page page)
{
    foreach (var externalLink in page.GetExternalLinks())
    {
        try
        {
            HttpStatusCode httpStatusCode;

            // Single lookup instead of ContainsKey + indexer: avoids hashing the key twice
            // and reads the cached value in one step.
            if (!_checkedExternalLinks.TryGetValue(externalLink, out httpStatusCode))
            {
                httpStatusCode = GetHttpStatusCode(externalLink);
                _checkedExternalLinks.GetOrAdd(externalLink, httpStatusCode);
            }

            if (httpStatusCode != HttpStatusCode.OK)
            {
                Error(externalLink, page.Uri, httpStatusCode);
            }
        }
        catch (Exception exception)
        {
            // A failure on one link is reported and must not stop the remaining links.
            Error("Could not get page", exception, externalLink);
        }
    }
}
/// <summary>
/// Writes an info message for every link on the page that equals one of the
/// configured targets, provided info logging is enabled.
/// </summary>
/// <param name="page">The page whose examiner links are scanned.</param>
public override void InspectPage(Page page)
{
    foreach (var candidate in page.Examiner.Links)
    {
        var isTarget = _targets.Exists(target => target == candidate);

        // IsInfoEnabled is only consulted for links that actually matched a target.
        if (!isTarget || !IsInfoEnabled)
        {
            continue;
        }

        Info($"{candidate} referenced from {page.Uri}");
    }
}
/// <summary>
/// Writes an info message for every link on the page whose scheme is one of the
/// configured schemes, provided info logging is enabled.
/// </summary>
/// <param name="page">The page whose examiner links are scanned.</param>
public override void InspectPage(Page page)
{
    foreach (var candidate in page.Examiner.Links)
    {
        var schemeMatches = _schemes.Exists(scheme => scheme == candidate.Scheme);
        if (schemeMatches)
        {
            // Checked after the scheme match, in the same order as the combined condition.
            if (IsInfoEnabled)
            {
                Info($"{candidate} found on {page.Uri}");
            }
        }
    }
}
/// <summary>
/// Writes an info message for every external link of the page that is not in the
/// ignore list, provided info logging is enabled.
/// </summary>
/// <param name="page">The page whose external links are reported.</param>
public override void InspectPage(Page page)
{
    foreach (var candidate in page.GetExternalLinks())
    {
        // Ignored URIs are skipped before the log level is consulted.
        if (_urisToIgnore.Contains(candidate))
        {
            continue;
        }

        if (!IsInfoEnabled)
        {
            continue;
        }

        Info($"{candidate} found on {page.Uri}");
    }
}
/// <summary>
/// Reports an error when the page content is larger than the configured byte size;
/// otherwise writes the same message at info level if info logging is enabled.
/// </summary>
/// <param name="page">The page whose content size is checked.</param>
public override void InspectPage(Page page)
{
    var exceedsLimit = page.ContentByteSize > _byteSize;

    if (exceedsLimit)
    {
        Error(GetMessage(page));
    }
    else if (IsInfoEnabled)
    {
        Info(GetMessage(page));
    }
}
/// <summary>
/// Reports an error for a page whose response status is not <see cref="HttpStatusCode.OK"/>.
/// For an OK page, optionally checks the page's external links.
/// </summary>
/// <param name="page">The page whose response status is inspected.</param>
public override void InspectPage(Page page)
{
    var pageIsOk = page.Response.StatusCode == HttpStatusCode.OK;

    if (!pageIsOk)
    {
        Error(page.Uri, page.Referrer, page.Response.StatusCode);
        return;
    }

    // External links are only checked for pages that loaded successfully.
    if (_checkExternalLinks)
    {
        CheckExternalLinks(page);
    }
}
/// <summary>
/// Reports an error when the page took longer to load than the configured number of
/// milliseconds; otherwise writes the same message at info level if info logging is enabled.
/// </summary>
/// <param name="page">The page whose request timing is inspected.</param>
public override void InspectPage(Page page)
{
    // Load time is the span between request start and completion, truncated to whole milliseconds.
    var elapsed = page.RequestCompletedOnUtc - page.RequestStartedOnUtc;
    var elapsedMilliseconds = (int) elapsed.TotalMilliseconds;

    if (elapsedMilliseconds > _milliseconds)
    {
        Error(GetMessage(page.Uri, elapsedMilliseconds));
    }
    else if (IsInfoEnabled)
    {
        Info(GetMessage(page.Uri, elapsedMilliseconds));
    }
}
/// <summary>
/// Maps a <see cref="CrawledPage"/> to a <see cref="Page"/>.
/// </summary>
/// <param name="crawledPage">The crawled page to convert.</param>
/// <returns>A new <see cref="Page"/> carrying the URIs, UTC timings, converted request and response, text content and content size.</returns>
private Page Convert(CrawledPage crawledPage)
{
    // Prefer the moment the content download finished; fall back to the request completion time.
    var completedAt = crawledPage.DownloadContentCompleted ?? crawledPage.RequestCompleted;

    // Content without bytes counts as zero bytes.
    var contentLength = crawledPage.Content.Bytes?.Length ?? 0;

    return new Page(
        crawledPage.Uri,
        crawledPage.ParentUri,
        crawledPage.RequestStarted.ToUniversalTime(),
        completedAt.ToUniversalTime(),
        Convert(crawledPage.HttpWebRequest),
        Convert(crawledPage.HttpWebResponse),
        crawledPage.Content.Text,
        contentLength);
}
/// <summary>
/// Creates an <see cref="AngleSharpExaminer"/> for the given page.
/// </summary>
/// <param name="log">The log passed on to the examiner.</param>
/// <param name="page">The page whose URI and content the examiner receives.</param>
/// <returns>The newly created examiner.</returns>
public IExaminer CreateExaminer(ILog log, Page page) =>
    new AngleSharpExaminer(log, page.Uri, page.Content);
/// <summary>
/// Builds the message text stating the page URI and its content size in bytes.
/// </summary>
/// <param name="page">The page to describe.</param>
/// <returns>A string of the form "&lt;uri&gt; is &lt;size&gt; bytes".</returns>
private string GetMessage(Page page) =>
    $"{page.Uri} is {page.ContentByteSize} bytes";
/// <summary>
/// Inspects a single page. Each concrete inspector supplies its own check.
/// </summary>
/// <param name="page">The page to inspect.</param>
public abstract void InspectPage(Page page);