/// <summary>
/// Runs a discovered page through the host check, the optional rewrite hook, the
/// <c>Moved301</c> lookup, the already-executed check and the <c>ShouldFetch</c>
/// subscribers, and enqueues it on <c>qAdded</c> when none of them rejects it.
/// </summary>
/// <param name="pageToVisit">Address of the page that was discovered.</param>
/// <param name="sourcePage">Address handed to the <c>Link</c> constructor together with <paramref name="pageToVisit"/>.</param>
/// <returns>
/// The enqueued <c>Link</c>, or <c>null</c> when the page is on a different host and host
/// violations are not allowed, when <c>alreadyExecuted</c> reports true for it, or when a
/// <c>ShouldFetch</c> subscriber sets <c>Cancel</c>.
/// </returns>
private Link addPage(Uri pageToVisit, Uri sourcePage)
{
    if (pageToVisit.Host != BaseUri.Host && !Configuration.SpiderAllowHostViolation)
    {
        string host = pageToVisit.Host;
        bool firstViolation = false;

        // The membership test and the insert happen under the same lock so that the
        // collection is never read while another thread is adding to it, and so that
        // each host is recorded (and warned about) only once.
        lock (hViolated)
        {
            if (!hViolated.Contains(host)) // ignore the entire domain
            {
                hViolated.Add(host);
                firstViolation = true;
            }
        }

        // Logged after the lock is released to keep the critical section short.
        if (firstViolation)
        {
            log.Warning($"[WRN] Host Violation {pageToVisit}");
        }

        return null;
    }

    var lnk = new Link(pageToVisit, sourcePage);

    if (FetchRewrite != null)
    {
        var ev = new FetchRewriteEventArgs(pageToVisit);
        FetchRewrite(this, ev);

        // Default Uri Equality ignores Fragment, so the string forms are compared instead.
        if (ev.NewUri != null && ev.NewUri.ToString() != pageToVisit.ToString())
        {
            if (ev.ShowOnLog)
            {
                log.Information($"[REW] {pageToVisit} -> {ev.NewUri}");
            }

            lnk.ResourceRewritten(ev.NewUri);
        }
    }

    // The key is taken after the rewrite step, so it reflects whatever lnk.Uri is now.
    string movedKey = lnk.Uri.ToString();
    if (SpiderWorkData.Moved301.ContainsKey(movedKey))
    {
        string newUri = SpiderWorkData.Moved301[movedKey];
        lnk.ResourceMoved(new Uri(newUri));

        // NOTE(review): the check right after this block repeats this one for moved
        // links; both are kept to preserve the original call sequence.
        if (alreadyExecuted(lnk.Uri))
        {
            return null;
        }
    }

    if (alreadyExecuted(lnk.Uri))
    {
        return null;
    }

    // Give subscribers the last word before the link is queued.
    var args = new ShouldFetchEventArgs(lnk);
    ShouldFetch?.Invoke(this, args);
    if (args.Cancel)
    {
        return null;
    }

    qAdded.Enqueue(lnk);
    return lnk;
}
/// <summary>
/// Handles the downloader's should-fetch request. A link whose address is listed in
/// <c>SpiderWorkData.Error404</c> is cancelled with reason <c>PreviousError</c>; any
/// other link is tagged with the <c>Downloader</c> source and passed to <c>shouldFetch</c>.
/// </summary>
/// <param name="Sender">Forwarded unchanged to <c>shouldFetch</c>.</param>
/// <param name="args">Request being decided; updated in place.</param>
private void Downloader_ShouldFetch(object Sender, ShouldFetchEventArgs args)
{
    string target = args.Link.Uri.ToString();
    bool failedBefore = SpiderWorkData.Error404.Contains(target);

    if (!failedBefore)
    {
        args.Source = FetchEventArgs.EventSource.Downloader;
        shouldFetch(Sender, args);
    }
    else
    {
        // Rejected here; shouldFetch is not consulted for these links.
        args.Reason = ShouldFetchEventArgs.Reasons.PreviousError;
        args.Cancel = true;
    }
}
/// <summary>
/// Decides whether a link should be fetched. The request is cancelled with reason
/// <c>AlreadyFetched</c> when <c>alreadyExecuted</c> reports true for it; otherwise the
/// <c>ShouldFetch</c> subscribers are asked, and a cancellation that carries no reason
/// is recorded as <c>UserCancelled</c> and logged.
/// </summary>
/// <param name="Sender">Not used in this method; the event is raised with <c>this</c> as sender.</param>
/// <param name="args">Request being decided; updated in place.</param>
private void shouldFetch(object Sender, ShouldFetchEventArgs args)
{
    if (alreadyExecuted(args.Link))
    {
        args.Cancel = true;
        args.Reason = ShouldFetchEventArgs.Reasons.AlreadyFetched;
        return;
    }

    // Ask user
    ShouldFetch?.Invoke(this, args);

    if (!args.Cancel)
    {
        return;
    }

    // A subscriber cancelled without saying why: attribute it to the user.
    if (args.Reason == ShouldFetchEventArgs.Reasons.None)
    {
        args.Reason = ShouldFetchEventArgs.Reasons.UserCancelled;
    }

    if (args.Reason == ShouldFetchEventArgs.Reasons.UserCancelled)
    {
        log.Information($"[USER CANCEL] {args.Link}");
    }
}
/// <summary>
/// Handles the cacher's should-fetch request: tags the request with the
/// <c>Cacher</c> source and passes it to <c>shouldFetch</c>.
/// </summary>
/// <param name="Sender">Forwarded unchanged to <c>shouldFetch</c>.</param>
/// <param name="args">Request being decided; updated in place.</param>
private void Cacher_ShouldFetch(object Sender, ShouldFetchEventArgs args)
{
    // The source is set first so that it is already in place when shouldFetch runs.
    args.Source = FetchEventArgs.EventSource.Cacher;

    shouldFetch(Sender, args);
}
/// <summary>
/// Runs a discovered page through the host check, the optional <c>AddPageFilter</c>,
/// the optional rewrite hook, the <c>Moved301</c> lookup, the already-executed check and
/// the <c>ShouldFetch</c> subscribers, and enqueues it on <c>qAdded</c> when none of
/// them rejects it.
/// </summary>
/// <param name="pageToVisit">Address of the page that was discovered.</param>
/// <param name="sourcePage">Address handed to the <c>Link</c> constructor together with <paramref name="pageToVisit"/>.</param>
/// <returns>
/// The enqueued <c>Link</c>, or <c>null</c> when the page is a host violation and those
/// are not allowed, when <c>AddPageFilter</c> returns false, when <c>alreadyExecuted</c>
/// reports true for it, or when a <c>ShouldFetch</c> subscriber sets <c>Cancel</c>.
/// </returns>
private Link addPage(Uri pageToVisit, Uri sourcePage)
{
    bool hostRestricted = !Configuration.SpiderAllowHostViolation;
    if (hostRestricted && isHostViolation(pageToVisit, BaseUri))
    {
        lock (hViolated)
        {
            // ignore the entire domain: each offending host is recorded and warned about once
            bool known = hViolated.Contains(pageToVisit.Host);
            if (!known)
            {
                hViolated.Add(pageToVisit.Host);
                log.Warning($"[WRN] Host Violation {pageToVisit}");
            }
        }

        return null;
    }

    var lnk = new Link(pageToVisit, sourcePage);

    // A filter that throws is logged and reported through OnError; processing of the
    // page then continues as if no filter had rejected it.
    try
    {
        if (AddPageFilter != null)
        {
            bool accepted = AddPageFilter(lnk);
            if (!accepted)
            {
                return null;
            }
        }
    }
    catch (Exception ex)
    {
        log.Error(ex, "AddPageFilter error");
        OnError?.Invoke(this, new ErrorEventArgs
        {
            Source = FetchEventArgs.EventSource.Scheduler,
            Exception = ex
        });
    }

    if (FetchRewrite != null)
    {
        var ev = new FetchRewriteEventArgs(pageToVisit);
        FetchRewrite(this, ev);

        // Default Uri Equality ignores Fragment, hence the comparison of string forms.
        bool rewritten = ev.NewUri != null && ev.NewUri.ToString() != pageToVisit.ToString();
        if (rewritten)
        {
            if (ev.ShowOnLog)
            {
                log.Information($"[REW] {pageToVisit} -> {ev.NewUri}");
            }

            lnk.ResourceRewritten(ev.NewUri);
        }
    }

    // Looked up after the rewrite step, so the key reflects whatever lnk.Uri is now.
    string movedKey = lnk.Uri.ToString();
    if (SpiderWorkData.Moved301.ContainsKey(movedKey))
    {
        lnk.ResourceMoved(new Uri(SpiderWorkData.Moved301[movedKey]));

        if (alreadyExecuted(lnk.Uri))
        {
            return null;
        }
    }

    if (alreadyExecuted(lnk.Uri))
    {
        return null;
    }

    // A subscriber that throws is logged and reported through OnError; the link is
    // still enqueued afterwards.
    try
    {
        var args = new ShouldFetchEventArgs(lnk);
        ShouldFetch?.Invoke(this, args);

        if (args.Cancel)
        {
            return null;
        }
    }
    catch (Exception ex)
    {
        log.Error(ex, "ShouldFetch error");
        OnError?.Invoke(this, new ErrorEventArgs
        {
            Source = FetchEventArgs.EventSource.Scheduler,
            Exception = ex
        });
    }

    qAdded.Enqueue(lnk);
    return lnk;
}