Beispiel #1
0
        /// <summary>
        /// Evaluates a discovered page and, when accepted, wraps it in a <see cref="Link"/>
        /// and places it on the added-pages queue.
        /// </summary>
        /// <param name="pageToVisit">Uri of the page being considered.</param>
        /// <param name="sourcePage">Uri handed to the <see cref="Link"/> constructor together with <paramref name="pageToVisit"/>.</param>
        /// <returns>
        /// The enqueued link, or <c>null</c> when the page is rejected: its host differs from the
        /// base host and host violations are not allowed, it was already executed, or a
        /// ShouldFetch subscriber cancelled it.
        /// </returns>
        private Link addPage(Uri pageToVisit, Uri sourcePage)
        {
            if (pageToVisit.Host != BaseUri.Host && !Configuration.SpiderAllowHostViolation)
            {
                // The membership test and the insert must happen under the same lock;
                // otherwise two threads can both pass the test and the collection is
                // read while another thread may be mutating it.
                bool firstViolation = false;
                lock (hViolated)
                {
                    if (!hViolated.Contains(pageToVisit.Host)) // ignore the entire domain
                    {
                        hViolated.Add(pageToVisit.Host);
                        firstViolation = true;
                    }
                }

                // Warn only once per host; logging stays outside the lock to keep it short.
                if (firstViolation)
                {
                    log.Warning($"[WRN] Host Violation {pageToVisit}");
                }
                return null;
            }

            var lnk = new Link(pageToVisit, sourcePage);

            // Snapshot the delegate so it cannot become null between the check and the call.
            var rewriteHandler = FetchRewrite;
            if (rewriteHandler != null)
            {
                var ev = new FetchRewriteEventArgs(pageToVisit);
                rewriteHandler(this, ev);
                // Default Uri Equality ignores Fragment, so compare the string forms instead
                if (ev.NewUri != null && ev.NewUri.ToString() != pageToVisit.ToString())
                {
                    if (ev.ShowOnLog)
                    {
                        log.Information($"[REW] {pageToVisit} -> {ev.NewUri}");
                    }
                    lnk.ResourceRewritten(ev.NewUri);
                }
            }

            // Single lookup instead of ContainsKey followed by the indexer.
            string movedTo;
            if (SpiderWorkData.Moved301.TryGetValue(lnk.Uri.ToString(), out movedTo))
            {
                lnk.ResourceMoved(new Uri(movedTo));
            }

            // One check covers both the moved and the unmoved case; it runs after
            // ResourceMoved so it sees whatever lnk.Uri is at that point.
            if (alreadyExecuted(lnk.Uri))
            {
                return null;
            }

            var args = new ShouldFetchEventArgs(lnk);

            ShouldFetch?.Invoke(this, args);
            if (args.Cancel)
            {
                return null;
            }

            qAdded.Enqueue(lnk);
            return lnk;
        }
Beispiel #2
0
        /// <summary>
        /// Should-fetch callback used for the downloader. Cancels the request with reason
        /// <c>PreviousError</c> when the link's Uri is listed in <c>SpiderWorkData.Error404</c>;
        /// otherwise marks the event source as Downloader and defers to <c>shouldFetch</c>.
        /// </summary>
        /// <param name="Sender">Forwarded unchanged to <c>shouldFetch</c>.</param>
        /// <param name="args">Event arguments carrying the link; updated in place.</param>
        private void Downloader_ShouldFetch(object Sender, ShouldFetchEventArgs args)
        {
            var knownErrors = SpiderWorkData.Error404;
            bool failedBefore = knownErrors.Contains(args.Link.Uri.ToString());

            if (!failedBefore)
            {
                // Not a recorded error: run the shared decision logic.
                args.Source = FetchEventArgs.EventSource.Downloader;
                shouldFetch(Sender, args);
                return;
            }

            // Recorded error: cancel without consulting the shared logic.
            args.Reason = ShouldFetchEventArgs.Reasons.PreviousError;
            args.Cancel = true;
        }
Beispiel #3
0
        /// <summary>
        /// Shared should-fetch decision. Cancels with reason <c>AlreadyFetched</c> when the link
        /// was already executed; otherwise raises <c>ShouldFetch</c> and, if a subscriber
        /// cancelled without giving a reason, records <c>UserCancelled</c> and logs it.
        /// </summary>
        /// <param name="Sender">Original sender; not used here (the event is raised with <c>this</c>).</param>
        /// <param name="args">Event arguments carrying the link; updated in place.</param>
        private void shouldFetch(object Sender, ShouldFetchEventArgs args)
        {
            bool seenBefore = alreadyExecuted(args.Link);
            if (seenBefore)
            {
                args.Cancel = true;
                args.Reason = ShouldFetchEventArgs.Reasons.AlreadyFetched;
                return;
            }

            // Let subscribers veto the fetch.
            ShouldFetch?.Invoke(this, args);

            if (!args.Cancel)
            {
                return;
            }

            // A cancellation without an explicit reason is attributed to the user.
            if (args.Reason == ShouldFetchEventArgs.Reasons.None)
            {
                args.Reason = ShouldFetchEventArgs.Reasons.UserCancelled;
            }

            // Only user cancellations are logged here.
            if (args.Reason == ShouldFetchEventArgs.Reasons.UserCancelled)
            {
                log.Information($"[USER CANCEL] {args.Link}");
            }
        }
Beispiel #4
0
 /// <summary>
 /// Should-fetch callback that tags the request with the Cacher event source and
 /// forwards it to <c>shouldFetch</c>.
 /// NOTE(review): presumably subscribed to the cacher's ShouldFetch event - confirm at the wiring site.
 /// </summary>
 /// <param name="Sender">Forwarded unchanged to <c>shouldFetch</c>.</param>
 /// <param name="args">Event arguments; <c>Source</c> is set before forwarding.</param>
 private void Cacher_ShouldFetch(object Sender, ShouldFetchEventArgs args)
 {
     // Record where the request came from before running the shared logic.
     args.Source = FetchEventArgs.EventSource.Cacher;
     shouldFetch(Sender, args);
 }
Beispiel #5
0
        /// <summary>
        /// Evaluates a discovered page and, when accepted, wraps it in a <see cref="Link"/>
        /// and places it on the added-pages queue.
        /// </summary>
        /// <param name="pageToVisit">Uri of the page being considered.</param>
        /// <param name="sourcePage">Uri handed to the <see cref="Link"/> constructor together with <paramref name="pageToVisit"/>.</param>
        /// <returns>
        /// The enqueued link, or <c>null</c> when the page is rejected: host violation while
        /// violations are not allowed, AddPageFilter returned false, the link was already
        /// executed, or a ShouldFetch subscriber cancelled it.
        /// </returns>
        /// <remarks>
        /// Exceptions thrown by AddPageFilter or ShouldFetch subscribers are logged and reported
        /// through OnError; the page is then still processed as if the callback had accepted it.
        /// </remarks>
        private Link addPage(Uri pageToVisit, Uri sourcePage)
        {
            if (!Configuration.SpiderAllowHostViolation && isHostViolation(pageToVisit, BaseUri))
            {
                lock (hViolated)
                {
                    // Warn only the first time a host is seen; later links are dropped silently.
                    if (!hViolated.Contains(pageToVisit.Host)) // ignore the entire domain
                    {
                        hViolated.Add(pageToVisit.Host);
                        log.Warning($"[WRN] Host Violation {pageToVisit}");
                    }
                }
                return null;
            }

            var lnk = new Link(pageToVisit, sourcePage);

            try
            {
                // Snapshot the delegate so it cannot become null between the check and the call.
                var pageFilter = AddPageFilter;
                if (pageFilter != null && !pageFilter(lnk))
                {
                    return null;
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failing filter is reported but does not reject the page.
                log.Error(ex, "AddPageFilter error");
                OnError?.Invoke(this, new ErrorEventArgs()
                {
                    Source = FetchEventArgs.EventSource.Scheduler, Exception = ex
                });
            }

            var rewriteHandler = FetchRewrite;
            if (rewriteHandler != null)
            {
                var ev = new FetchRewriteEventArgs(pageToVisit);
                rewriteHandler(this, ev);
                // Default Uri Equality ignores Fragment, so compare the string forms instead
                if (ev.NewUri != null && ev.NewUri.ToString() != pageToVisit.ToString())
                {
                    if (ev.ShowOnLog)
                    {
                        log.Information($"[REW] {pageToVisit} -> {ev.NewUri}");
                    }
                    lnk.ResourceRewritten(ev.NewUri);
                }
            }

            // Single lookup instead of ContainsKey followed by the indexer.
            string movedTo;
            if (SpiderWorkData.Moved301.TryGetValue(lnk.Uri.ToString(), out movedTo))
            {
                lnk.ResourceMoved(new Uri(movedTo));
            }

            // One check covers both the moved and the unmoved case; it runs after
            // ResourceMoved so it sees whatever lnk.Uri is at that point.
            if (alreadyExecuted(lnk.Uri))
            {
                return null;
            }

            try
            {
                var args = new ShouldFetchEventArgs(lnk);
                ShouldFetch?.Invoke(this, args);
                if (args.Cancel)
                {
                    return null;
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failing subscriber is reported but does not reject the page.
                log.Error(ex, "ShouldFetch error");
                OnError?.Invoke(this, new ErrorEventArgs()
                {
                    Source = FetchEventArgs.EventSource.Scheduler, Exception = ex
                });
            }

            qAdded.Enqueue(lnk);
            return lnk;
        }