Example #1
0
        /// <summary>
        /// Runs a freshly created crawl job for the given subscription, reusing
        /// the subscription's cached URL history (created lazily on first use).
        /// </summary>
        private void ExecuteJobForSubscription(Subscription subscription)
        {
            var job = _jobFactory.CreateInstance();

            // One processed-URL list per subscription id.
            // NOTE(review): the List<Uri> itself is not thread-safe — confirm
            // that jobs for the same subscription never run concurrently.
            var processedUrls = _alreadyProcessedUrls.GetOrAdd(
                subscription.Id, _ => new List<Uri>());

            job.Execute(subscription, processedUrls);
        }
Example #2
0
 /// <summary>
 /// Full status snapshot for a crawl job. Marked with [JsonConstructor] so
 /// JSON.NET deserializes through this overload rather than the
 /// single-argument convenience constructor.
 /// </summary>
 [JsonConstructor] // need this to tell JSON.NET which constructor to pick
 public JobStatusUpdate(CrawlJob job, CrawlJobStats stats, JobStatus status, DateTime startTime, DateTime?endTime)
 {
     Job = job;
     Stats = stats;
     Status = status;
     StartTime = startTime;
     EndTime = endTime;
 }
Example #3
0
 /// <summary>
 /// Sets up the master actor for one crawl job: fresh status and stats,
 /// a 5-second receive timeout, then waits for the downloads tracker.
 /// </summary>
 public CrawlMaster(CrawlJob job)
 {
     Job = job;

     RunningStatus = new JobStatusUpdate(Job);
     // The index page itself counts as one "discovered" document.
     TotalStats = new CrawlJobStats(Job).Copy(1);

     // Fire a ReceiveTimeout message if nothing arrives within 5 seconds.
     Context.SetReceiveTimeout(TimeSpan.FromSeconds(5));
     WaitingForTracker();
 }
 /// <summary>
 /// Initializes the crawl master for a job and begins waiting for its
 /// downloads tracker.
 /// </summary>
 public CrawlMaster(CrawlJob job)
 {
     Job           = job;
     RunningStatus = new JobStatusUpdate(Job);
     TotalStats    = new CrawlJobStats(Job);

     // Fire a ReceiveTimeout message if nothing arrives within 5 seconds.
     Context.SetReceiveTimeout(TimeSpan.FromSeconds(5));

     WaitingForTracker();
 }
 /// <summary>
 /// Creates the master actor for a crawl job: fresh status/stats objects,
 /// a 5-second receive timeout, then transitions to waiting for a tracker.
 /// </summary>
 public CrawlMaster(CrawlJob job)
 {
     Job = job;
     RunningStatus = new JobStatusUpdate(Job);
     TotalStats = new CrawlJobStats(Job);

     // A ReceiveTimeout is delivered if no message arrives in 5 seconds.
     Context.SetReceiveTimeout(TimeSpan.FromSeconds(5));
     WaitingForTracker();
 }
Example #6
0
 /// <summary>
 /// Coordinates downloads for a single crawl job.
 /// </summary>
 /// <param name="job">The crawl job being executed.</param>
 /// <param name="commander">Actor that issues crawl commands.</param>
 /// <param name="downloadsTracker">Actor tracking download state.</param>
 /// <param name="maxConcurrentDownloads">Upper bound on simultaneous downloads.</param>
 public DownloadCoordinator(CrawlJob job, IActorRef commander, IActorRef downloadsTracker, long maxConcurrentDownloads)
 {
     Job = job;
     Commander = commander;
     DownloadsTracker = downloadsTracker;
     MaxConcurrentDownloads = maxConcurrentDownloads;
     Stats = new CrawlJobStats(Job);

     Receiving();
 }
 /// <summary>
 /// Builds a download coordinator for one crawl job and enters the
 /// Receiving behavior.
 /// </summary>
 public DownloadCoordinator(CrawlJob job, IActorRef commander, IActorRef downloadsTracker, long maxConcurrentDownloads)
 {
     Job                    = job;
     Commander              = commander;
     DownloadsTracker       = downloadsTracker;
     MaxConcurrentDownloads = maxConcurrentDownloads;
     Stats                  = new CrawlJobStats(Job);
     Receiving();
 }
        /// <summary>
        /// Download coordinator that materializes an Akka.Streams graph: a
        /// bounded actor-backed source broadcasts each CrawlDocument into an
        /// HTML pipeline and an image pipeline, and the results are delivered
        /// back to this actor through actor-ref sinks.
        /// </summary>
        /// <param name="job">Crawl job this coordinator serves.</param>
        /// <param name="commander">Actor issuing crawl commands.</param>
        /// <param name="downloadsTracker">Actor tracking download state.</param>
        /// <param name="maxConcurrentDownloads">Configured concurrency cap.
        /// NOTE(review): the flows below use DefaultMaxConcurrentDownloads
        /// instead of this parameter — confirm that is intended.</param>
        public DownloadCoordinator(CrawlJob job, IActorRef commander, IActorRef downloadsTracker,
                                   long maxConcurrentDownloads)
        {
            Job = job;
            DownloadsTracker       = downloadsTracker;
            MaxConcurrentDownloads = maxConcurrentDownloads;
            Commander = commander;
            Stats     = new CrawlJobStats(Job);
            // Sinks that forward stream elements to this actor as messages;
            // stream completion is signalled with a StreamCompleteTick.
            var selfHtmlSink = Sink.ActorRef <CheckDocuments>(Self, StreamCompleteTick.Instance);
            var selfDocSink  = Sink.ActorRef <CompletedDocument>(Self, StreamCompleteTick.Instance);
            var selfImgSink  = Sink.ActorRef <CompletedDocument>(Self, StreamCompleteTick.Instance);
            // HTML downloads, throttled to 30 elements per 5 seconds (burst 100).
            var htmlFlow     = Flow.Create <CrawlDocument>().Via(DownloadFlow.SelectDocType())
                               .Throttle(30, TimeSpan.FromSeconds(5), 100, ThrottleMode.Shaping)
                               .Via(DownloadFlow.ProcessHtmlDownloadFor(DefaultMaxConcurrentDownloads, HttpClientFactory.GetClient()));

            // Image downloads, throttled to 30 elements per second (burst 100).
            var imageFlow = Flow.Create <CrawlDocument>()
                            .Via(DownloadFlow.SelectDocType())
                            .Throttle(30, TimeSpan.FromSeconds(1), 100, ThrottleMode.Shaping)
                            .Via(DownloadFlow.ProcessImageDownloadFor(DefaultMaxConcurrentDownloads, HttpClientFactory.GetClient()))
                            .Via(DownloadFlow.ProcessCompletedDownload());

            // Bounded inlet for new crawl documents; newest items are dropped
            // once 5000 are buffered.
            var source = Source.ActorRef <CrawlDocument>(5000, OverflowStrategy.DropTail);

            var graph = GraphDsl.Create(source, (builder, s) =>
            {
                // html flows
                var downloadHtmlFlow       = builder.Add(htmlFlow);
                var downloadBroadcast      = builder.Add(new Broadcast <DownloadHtmlResult>(2));
                var completedDownload      = builder.Add(DownloadFlow.ProcessCompletedHtmlDownload());
                var parseCompletedDownload = builder.Add(ParseFlow.GetParseFlow(Job));
                var htmlSink = builder.Add(selfHtmlSink);
                var docSink  = builder.Add(selfDocSink);
                // Each HTML result is both recorded as a completed download and
                // parsed for further documents to crawl.
                builder.From(downloadHtmlFlow).To(downloadBroadcast);
                builder.From(downloadBroadcast.Out(0)).To(completedDownload.Inlet);
                builder.From(downloadBroadcast.Out(1)).To(parseCompletedDownload.Inlet);
                builder.From(parseCompletedDownload).To(htmlSink);
                builder.From(completedDownload).To(docSink);

                // image flows
                var imgSink           = builder.Add(selfImgSink);
                var downloadImageFlow = builder.Add(imageFlow);
                builder.From(downloadImageFlow).To(imgSink);

                // Every source element is broadcast to both pipelines.
                // NOTE(review): presumably each pipeline's SelectDocType stage
                // filters to its own document kind — confirm in DownloadFlow.
                var sourceBroadcast = builder.Add(new Broadcast <CrawlDocument>(2));
                builder.From(sourceBroadcast.Out(0)).To(downloadImageFlow.Inlet);
                builder.From(sourceBroadcast.Out(1)).To(downloadHtmlFlow.Inlet);

                builder.From(s.Outlet).To(sourceBroadcast.In);

                return(ClosedShape.Instance);
            });

            // Materializing yields the IActorRef used to feed documents into the stream.
            SourceActor = Context.Materializer().Materialize(graph);

            Receiving();
        }
Example #9
0
 /// <summary>
 /// Determines whether <paramref name="rawUri"/> is already an absolute URI,
 /// or can be combined with the job's root URI to produce an absolute
 /// HTTP/HTTPS URI.
 /// </summary>
 /// <param name="jobRoot">Job whose <c>Root</c> URI is the base for relative links.</param>
 /// <param name="rawUri">Raw href/src value scraped from a page.</param>
 /// <returns>true if an absolute http(s) URI can be formed; otherwise false.</returns>
 public static bool CanMakeAbsoluteUri(CrawlJob jobRoot, string rawUri)
 {
     if (Uri.IsWellFormedUriString(rawUri, UriKind.Absolute))
     {
         return true;
     }

     try
     {
         var absUri = new Uri(jobRoot.Root, rawUri);

         // Only web schemes are crawlable; rejects mailto:, ftp:, javascript:, etc.
         return absUri.Scheme.Equals(Uri.UriSchemeHttp) || absUri.Scheme.Equals(Uri.UriSchemeHttps);
     }
     catch (UriFormatException)
     {
         // Malformed relative link — not crawlable. Was a bare catch; narrowed
         // so unrelated failures (e.g. a null Root) are no longer swallowed.
         return false;
     }
 }
Example #10
0
 //
 // GET: /Crawl/Run
 // Kicks off a crawl (synchronous or not) and reports how long it took
 // plus the links that remain unhandled.
 public ActionResult Run(bool sync = true)
 {
     var stopwatch = Stopwatch.StartNew();

     var job = new CrawlJob();
     if (sync)
     {
         job.SyncDigLinks();
     }
     else
     {
         job.DigLinks();
     }

     stopwatch.Stop();

     return Json(
         new
         {
             Span = stopwatch.Elapsed.ToString(),
             UnhandledLinks = job.GetUnhandledLinks(999, DateTime.Now.AddMinutes(30))
         },
         JsonRequestBehavior.AllowGet);
 }
Example #11
0
        /// <summary>
        /// Smoke test: crawls reddit.com for up to 100 pages / 3 minutes /
        /// 2000 results, scraping img src values, and dumps the outcome to
        /// the console.
        /// </summary>
        public async Task Test()
        {
            var rateLimiter = new RollingWindowRateLimiter(10000, TimeSpan.FromMinutes(1));
            var webAgent = new WebAgent(rateLimiter, new DefaultProxyService());

            var job = new CrawlJob
            {
                Domain = new Uri("https://reddit.com/"),
                CompletionConditions = new List<ICrawlCompletionCondition>
                {
                    new MaxPagesCrawledCondition(100),
                    new MaxTimeCondition(TimeSpan.FromMinutes(3)),
                    new MaxResultsFoundCondition(2000)
                },
                ThreadAllowance = 10,
                // over18=1 cookie pre-set for reddit.com.
                Cookies = new List<Cookie>
                {
                    new Cookie("over18", "1", "/", "reddit.com")
                },
                // Capture <img ... src="..."> values into the "image" group.
                Regex = "<img.+?src=\"(?<image>.+?)\""
            };

            using (var crawler = new Crawler(webAgent))
            {
                var results = await crawler.Crawl(job);

                Console.WriteLine(results.CrawlCount);
                Console.WriteLine(results.QueueSize);
                Console.WriteLine(results.ResultsCount);

                foreach (var item in results.Data)
                {
                    Console.WriteLine(item.Item2);
                }
            }
        }
 /// <summary>
 /// Worker that parses downloaded documents for the given job, reporting
 /// back to the coordinator actor.
 /// </summary>
 public ParseWorker(CrawlJob jobRoot, IActorRef coordinatorActor)
 {
     CoordinatorActor = coordinatorActor;
     JobRoot          = jobRoot;

     WaitingForDownloadActor();
 }
 /// <summary>
 /// Convenience constructor for a brand-new job: no stats yet, status
 /// Starting, started now (UTC), no end time.
 /// </summary>
 public JobStatusUpdate(CrawlJob job) : this(job, null, JobStatus.Starting, DateTime.UtcNow, null)
 {
 }
Example #14
0
 /// <summary>Reply carrying the master actor of a running job found for <c>key</c>.</summary>
 public JobFound(CrawlJob key, IActorRef crawlMaster)
 {
     Key = key;
     CrawlMaster = crawlMaster;
 }
Example #15
0
 /// <summary>Message asking that <c>subscriber</c> receive updates for <c>job</c>.</summary>
 public SubscribeToJob(CrawlJob job, IActorRef subscriber)
 {
     Job = job;
     Subscriber = subscriber;
 }
Example #16
0
        /// <summary>
        /// Builds the parse stage of the crawl pipeline: for each downloaded
        /// HTML document, extracts all img/src and a/href values, keeps those
        /// that resolve to absolute in-domain URIs, and emits them as a
        /// CheckDocuments request.
        /// </summary>
        /// <param name="jobRoot">Job whose root URI and domain constrain which links are kept.</param>
        /// <returns>Flow from DownloadHtmlResult to CheckDocuments.</returns>
        public static Flow <DownloadHtmlResult, CheckDocuments, NotUsed> GetParseFlow(CrawlJob jobRoot)
        {
            // Shared filter/convert chain for both images and page links —
            // previously duplicated inline for each tag type.
            Func<IEnumerable<string>, bool, IEnumerable<CrawlDocument>> toCrawlDocuments =
                (rawValues, isImage) => rawValues
                    .Where(v => CanMakeAbsoluteUri(jobRoot, v))
                    .Select(v => ToAsboluteUri(jobRoot, v))
                    .Where(u => AbsoluteUriIsInDomain(jobRoot, u))
                    .Select(u => new CrawlDocument(u, isImage));

            return Flow.Create <DownloadHtmlResult>().Async()
                   .Select(downloadHtmlResult =>
            {
                var doc = new HtmlDocument();
                doc.LoadHtml(downloadHtmlResult.Content);

                var requestedUrls = new List <CrawlDocument>();

                // SelectNodes returns null (not an empty collection) when no
                // nodes match, so each tag type is guarded.
                var imgs = doc.DocumentNode.SelectNodes("//img[@src]");
                if (imgs != null)
                {
                    requestedUrls.AddRange(toCrawlDocuments(imgs.Select(x => x.Attributes["src"].Value), true));
                }

                var links = doc.DocumentNode.SelectNodes("//a[@href]");
                if (links != null)
                {
                    requestedUrls.AddRange(toCrawlDocuments(links.Select(x => x.Attributes["href"].Value), false));
                }

                // Timeout budget scales with the number of documents (5s each).
                return new CheckDocuments(requestedUrls, ActorRefs.NoSender, TimeSpan.FromMilliseconds(requestedUrls.Count * 5000));
            });
        }
Example #17
0
 /// <summary>Command to start <c>job</c> on behalf of <c>requestor</c>.</summary>
 public StartJob(CrawlJob job, ActorRef requestor)
 {
     Job = job;
     Requestor = requestor;
 }
 /// <summary>Reply announcing the tracker actor created for job <c>key</c>.</summary>
 public CreatedTracker(CrawlJob key, ActorRef tracker)
 {
     Key = key;
     Tracker = tracker;
 }
 /// <summary>Reply carrying the master actor of a running job found for <c>key</c>.</summary>
 public JobFound(CrawlJob key, IActorRef crawlMaster)
 {
     Key         = key;
     CrawlMaster = crawlMaster;
 }
 /// <summary>Reply carrying the tracker actor found for job <c>key</c>.</summary>
 public TrackerFound(CrawlJob key, ActorRef tracker)
 {
     Tracker = tracker;
     Key = key;
 }
 /// <summary>Notification that the tracker for job <c>key</c> is no longer alive.</summary>
 public TrackerDead(CrawlJob key)
 {
     Key = key;
 }
 /// <summary>Reply indicating no tracker exists for job <c>key</c>.</summary>
 public TrackerNotFound(CrawlJob key)
 {
     Key = key;
 }
 /// <summary>Request for the download tracker associated with job <c>key</c>.</summary>
 public GetDownloadTracker(CrawlJob key)
 {
     Key = key;
 }
 /// <summary>Request for job <c>key</c>'s download tracker, replying to <c>originator</c>.</summary>
 public RequestDownloadTrackerFor(CrawlJob key, ActorRef originator)
 {
     Key = key;
     Originator = originator;
 }
 /// <summary>Query for a running job identified by <c>key</c>.</summary>
 public FindRunningJob(CrawlJob key)
 {
     Key = key;
 }
 /// <summary>Command to stop <c>job</c> on behalf of <c>requestor</c>.</summary>
 public StopJob(CrawlJob job, IActorRef requestor)
 {
     Job       = job;
     Requestor = requestor;
 }
 /// <summary>Reply announcing the tracker actor created for job <c>key</c>.</summary>
 public CreatedTracker(CrawlJob key, IActorRef tracker)
 {
     Key     = key;
     Tracker = tracker;
 }
Example #28
0
 /// <summary>
 /// Checks whether <paramref name="otherUri"/> belongs to the domain being
 /// crawled. Host names are case-insensitive (RFC 3986 §3.2.2), so compare
 /// ignoring case rather than with case-sensitive <c>==</c>.
 /// </summary>
 /// <param name="jobRoot">Job whose <c>Domain</c> defines the crawl boundary.</param>
 /// <param name="otherUri">Absolute URI to test.</param>
 /// <returns>true when the URI's host matches the job's domain.</returns>
 public static bool AbsoluteUriIsInDomain(CrawlJob jobRoot, Uri otherUri)
 {
     return string.Equals(jobRoot.Domain, otherUri.Host, StringComparison.OrdinalIgnoreCase);
 }
 /// <summary>Statistics container keyed by the crawl job it describes.</summary>
 public CrawlJobStats(CrawlJob key)
 {
     Key = key;
 }
 /// <summary>Message removing <c>subscriber</c> from updates for <c>job</c>.</summary>
 public UnsubscribeFromJob(CrawlJob job, ActorRef subscriber)
 {
     Job = job;
     Subscriber = subscriber;
 }
 /// <summary>Human-readable status update about a document within a job.</summary>
 public JobStatusMessage(CrawlJob job, string documentTitle, string message)
 {
     Job           = job;
     DocumentTitle = documentTitle;
     Message       = message;
 }
 /// <summary>Reply indicating no running job exists for <c>key</c>.</summary>
 public JobNotFound(CrawlJob key)
 {
     Key = key;
 }
Example #33
0
 /// <summary>
 /// Converts a raw link value into an absolute URI, resolving relative
 /// values against the job's root. (Method-name typo "Asbolute" is kept —
 /// callers depend on it.)
 /// </summary>
 public static Uri ToAsboluteUri(CrawlJob jobRoot, string rawUri)
 {
     if (Uri.IsWellFormedUriString(rawUri, UriKind.Absolute))
     {
         return new Uri(rawUri, UriKind.Absolute);
     }

     return new Uri(jobRoot.Root, rawUri);
 }
Example #34
0
 /// <summary>Statistics container keyed by the crawl job it describes.</summary>
 public CrawlJobStats(CrawlJob key)
 {
     Key = key;
 }
Example #35
0
 /// <summary>Query for a running job identified by <c>key</c>.</summary>
 public FindRunningJob(CrawlJob key)
 {
     Key = key;
 }
 /// <summary>
 /// Status snapshot for a job that is just starting: status Starting,
 /// start time now (UTC).
 /// </summary>
 public JobStatusUpdate(CrawlJob job)
 {
     Job       = job;
     Status    = JobStatus.Starting;
     StartTime = DateTime.UtcNow;
 }
Example #37
0
 /// <summary>Reply indicating no running job exists for <c>key</c>.</summary>
 public JobNotFound(CrawlJob key)
 {
     Key = key;
 }
 /// <summary>Message removing <c>subscriber</c> from updates for <c>job</c>.</summary>
 public UnsubscribeFromJob(CrawlJob job, IActorRef subscriber)
 {
     Job        = job;
     Subscriber = subscriber;
 }
Example #39
0
 /// <summary>
 /// Status snapshot for a job that is just starting: status Starting,
 /// start time now (UTC).
 /// </summary>
 public JobStatusUpdate(CrawlJob job)
 {
     Job = job;
     Status = JobStatus.Starting;
     StartTime = DateTime.UtcNow;
 }
 /// <summary>Reply carrying the tracker actor found for job <c>key</c>.</summary>
 public TrackerFound(CrawlJob key, IActorRef tracker)
 {
     Tracker = tracker;
     Key     = key;
 }
 /// <summary>Message asking that <c>subscriber</c> receive updates for <c>job</c>.</summary>
 public SubscribeToJob(CrawlJob job, IActorRef subscriber)
 {
     Job        = job;
     Subscriber = subscriber;
 }
 /// <summary>Notification that the tracker for job <c>key</c> is no longer alive.</summary>
 public TrackerDead(CrawlJob key)
 {
     Key = key;
 }
 /// <summary>Request for job <c>key</c>'s download tracker, replying to <c>originator</c>.</summary>
 public RequestDownloadTrackerFor(CrawlJob key, IActorRef originator)
 {
     Key        = key;
     Originator = originator;
 }
Example #44
0
 /// <summary>Human-readable status update about a document within a job.</summary>
 public JobStatusMessage(CrawlJob job, string documentTitle, string message)
 {
     Job = job;
     DocumentTitle = documentTitle;
     Message = message;
 }
Example #45
0
 /// <summary>
 /// Worker that parses downloaded documents for the given job, reporting
 /// back to the coordinator actor.
 /// </summary>
 public ParseWorker(CrawlJob jobRoot, IActorRef coordinatorActor)
 {
     CoordinatorActor = coordinatorActor;
     JobRoot = jobRoot;
     WaitingForDownloadActor();
 }
 public GetDownloadTracker(CrawlJob key)
 {
     Key = key;
 }
Example #47
0
 /// <summary>Command to stop <c>job</c> on behalf of <c>requestor</c>.</summary>
 public StopJob(CrawlJob job, IActorRef requestor)
 {
     Job = job;
     Requestor = requestor;
 }
 /// <summary>Reply indicating no tracker exists for job <c>key</c>.</summary>
 public TrackerNotFound(CrawlJob key)
 {
     Key = key;
 }