/// <summary>
///     Creates the master for a single crawl job: stores the job, builds fresh
///     status and stats objects for it, arms a 5 second receive timeout and then
///     calls <c>WaitingForTracker</c>.
/// </summary>
/// <param name="job">The crawl job this instance is responsible for.</param>
public CrawlMaster(CrawlJob job)
{
    Job = job;

    // Both tracking objects are created from the same job
    TotalStats = new CrawlJobStats(Job);
    RunningStatus = new JobStatusUpdate(Job);

    var receiveTimeout = TimeSpan.FromSeconds(5);
    Context.SetReceiveTimeout(receiveTimeout);

    WaitingForTracker();
}
Exemple #2
0
 /// <summary>
 ///     Creates a status update with every member supplied explicitly.
 /// </summary>
 /// <param name="job">The job the update refers to.</param>
 /// <param name="stats">The stats attached to the update.</param>
 /// <param name="status">The job status value.</param>
 /// <param name="startTime">The start time to record.</param>
 /// <param name="endTime">The end time to record, or <c>null</c> when there is none.</param>
 [JsonConstructor] // marks the constructor JSON.NET should pick
 public JobStatusUpdate(CrawlJob job, CrawlJobStats stats, JobStatus status, DateTime startTime, DateTime? endTime)
 {
     // what is being reported on
     Job = job;
     Stats = stats;
     Status = status;

     // when it ran
     StartTime = startTime;
     EndTime = endTime;
 }
 /// <summary>
 ///     Creates the master for a single crawl job. The total stats start from
 ///     <c>Copy(1)</c> of a fresh stats object so the index page is already
 ///     counted as discovered.
 /// </summary>
 /// <param name="job">The crawl job this instance is responsible for.</param>
 public CrawlMaster(CrawlJob job)
 {
     Job = job;
     RunningStatus = new JobStatusUpdate(Job);

     // count the index page as "discovered"
     var freshStats = new CrawlJobStats(Job);
     TotalStats = freshStats.Copy(1);

     var receiveTimeout = TimeSpan.FromSeconds(5);
     Context.SetReceiveTimeout(receiveTimeout);

     WaitingForTracker();
 }
Exemple #4
0
 /// <summary>
 ///     Creates a coordinator for the given job, remembers its collaborators and
 ///     download limit, starts with a fresh stats object and registers the
 ///     message handlers via <c>Receiving</c>.
 /// </summary>
 /// <param name="job">The crawl job being processed.</param>
 /// <param name="commander">Actor that receives the published stats.</param>
 /// <param name="downloadsTracker">Actor that document checks are forwarded to.</param>
 /// <param name="maxConcurrentDownloads">Stored as <c>MaxConcurrentDownloads</c>.</param>
 public DownloadCoordinator(CrawlJob job, IActorRef commander, IActorRef downloadsTracker, long maxConcurrentDownloads)
 {
     // collaborators
     Commander = commander;
     DownloadsTracker = downloadsTracker;

     // job configuration and per-job state
     Job = job;
     MaxConcurrentDownloads = maxConcurrentDownloads;
     Stats = new CrawlJobStats(Job);

     Receiving();
 }
        /// <summary>
        ///     Creates a coordinator for the given job, builds and materializes the
        ///     download stream graph, keeps the materialized value in
        ///     <c>SourceActor</c> and registers the message handlers via <c>Receiving</c>.
        /// </summary>
        /// <param name="job">The crawl job being processed; also handed to the parse flow.</param>
        /// <param name="commander">Stored as <c>Commander</c>.</param>
        /// <param name="downloadsTracker">Stored as <c>DownloadsTracker</c>.</param>
        /// <param name="maxConcurrentDownloads">Stored as <c>MaxConcurrentDownloads</c>.</param>
        public DownloadCoordinator(CrawlJob job, IActorRef commander, IActorRef downloadsTracker,
                                   long maxConcurrentDownloads)
        {
            Job = job;
            DownloadsTracker       = downloadsTracker;
            MaxConcurrentDownloads = maxConcurrentDownloads;
            Commander = commander;
            Stats     = new CrawlJobStats(Job);
            // Three sinks that all target this actor (Self) and use StreamCompleteTick.Instance
            // as the completion message.
            var selfHtmlSink = Sink.ActorRef <CheckDocuments>(Self, StreamCompleteTick.Instance);
            var selfDocSink  = Sink.ActorRef <CompletedDocument>(Self, StreamCompleteTick.Instance);
            var selfImgSink  = Sink.ActorRef <CompletedDocument>(Self, StreamCompleteTick.Instance);
            // HTML pipeline: doc-type selection -> throttle (30 per 5 seconds) -> HTML download.
            // NOTE(review): both pipelines pass DefaultMaxConcurrentDownloads, not the
            // maxConcurrentDownloads argument stored above - confirm this is intended.
            var htmlFlow     = Flow.Create <CrawlDocument>().Via(DownloadFlow.SelectDocType())
                               .Throttle(30, TimeSpan.FromSeconds(5), 100, ThrottleMode.Shaping)
                               .Via(DownloadFlow.ProcessHtmlDownloadFor(DefaultMaxConcurrentDownloads, HttpClientFactory.GetClient()));

            // Image pipeline: doc-type selection -> throttle (30 per 1 second) -> image download
            // -> completed-download processing.
            var imageFlow = Flow.Create <CrawlDocument>()
                            .Via(DownloadFlow.SelectDocType())
                            .Throttle(30, TimeSpan.FromSeconds(1), 100, ThrottleMode.Shaping)
                            .Via(DownloadFlow.ProcessImageDownloadFor(DefaultMaxConcurrentDownloads, HttpClientFactory.GetClient()))
                            .Via(DownloadFlow.ProcessCompletedDownload());

            // Actor-backed source created with a buffer size of 5000 and the DropTail overflow strategy.
            var source = Source.ActorRef <CrawlDocument>(5000, OverflowStrategy.DropTail);

            var graph = GraphDsl.Create(source, (builder, s) =>
            {
                // html flows
                // The download result is broadcast two ways: output 0 goes through completed-download
                // processing into docSink, output 1 goes through the parse flow into htmlSink.
                var downloadHtmlFlow       = builder.Add(htmlFlow);
                var downloadBroadcast      = builder.Add(new Broadcast <DownloadHtmlResult>(2));
                var completedDownload      = builder.Add(DownloadFlow.ProcessCompletedHtmlDownload());
                var parseCompletedDownload = builder.Add(ParseFlow.GetParseFlow(Job));
                var htmlSink = builder.Add(selfHtmlSink);
                var docSink  = builder.Add(selfDocSink);
                builder.From(downloadHtmlFlow).To(downloadBroadcast);
                builder.From(downloadBroadcast.Out(0)).To(completedDownload.Inlet);
                builder.From(downloadBroadcast.Out(1)).To(parseCompletedDownload.Inlet);
                builder.From(parseCompletedDownload).To(htmlSink);
                builder.From(completedDownload).To(docSink);

                // image flows
                var imgSink           = builder.Add(selfImgSink);
                var downloadImageFlow = builder.Add(imageFlow);
                builder.From(downloadImageFlow).To(imgSink);

                // Every incoming document is broadcast to both the image and the HTML pipeline.
                var sourceBroadcast = builder.Add(new Broadcast <CrawlDocument>(2));
                builder.From(sourceBroadcast.Out(0)).To(downloadImageFlow.Inlet);
                builder.From(sourceBroadcast.Out(1)).To(downloadHtmlFlow.Inlet);

                // Finally connect the source to the broadcast.
                builder.From(s.Outlet).To(sourceBroadcast.In);

                return(ClosedShape.Instance);
            });

            // Materializing the graph yields the value kept in SourceActor.
            SourceActor = Context.Materializer().Materialize(graph);

            Receiving();
        }
        /// <summary>
        ///     Registers the message handlers of this coordinator. Handlers are
        ///     registered in the same order as before.
        /// </summary>
        private void Receiving()
        {
            // Publish accumulated stats to the commander, then start counting afresh
            Receive<PublishStatsTick>(tick =>
            {
                if (Stats.IsEmpty)
                {
                    return;
                }

                _logger.Info("Publishing {0} to parent", Stats);
                Commander.Tell(Stats.Copy());

                // reset our stats after publishing
                Stats = Stats.Reset();
            });

            // Forward document checks to the downloads tracker and name our parent as
            // the sender, so the reply goes to the parent rather than to us
            Receive<CheckDocuments>(check => DownloadsTracker.Tell(check, Context.Parent));

            // Fold newly discovered documents into the local stats
            Receive<DiscoveredDocuments>(found =>
            {
                Stats = Stats.WithDiscovered(found);
            });

            // Pass each document of the batch on to the stream source
            Receive<ProcessDocuments>(batch =>
            {
                foreach (var document in batch.Documents)
                {
                    SourceActor.Tell(document);
                }
            });

            // A single download request: pass its document on to the stream source
            Receive<IDownloadDocument>(request => SourceActor.Tell(request.Document));

            // Record a finished download in the local stats, logging before and after
            Receive<CompletedDocument>(done =>
            {
                _logger.Info("Logging completed download {0} bytes {1}", done.Document.DocumentUri, done.NumBytes);
                Stats = Stats.WithCompleted(done);
                _logger.Info("Total stats {0}", Stats);
            });

            Receive<StreamCompleteTick>(_ => _logger.Info("Stream has completed. No more messages to process."));
        }
Exemple #7
0
        /// <summary>
        ///     Adds the counters of <paramref name="other" /> to the counters of this
        ///     instance and returns the result as a copy.
        /// </summary>
        /// <param name="other">The stats to add to this instance.</param>
        /// <returns>
        ///     A copy carrying the summed counters when <see cref="CanMerge" /> accepts
        ///     <paramref name="other" />; otherwise this instance unchanged.
        /// </returns>
        public CrawlJobStats Merge(CrawlJobStats other)
        {
            // Not mergeable: hand back this instance untouched
            if (!CanMerge(other))
            {
                return this;
            }

            var htmlDiscovered  = HtmlDocumentsDiscovered + other.HtmlDocumentsDiscovered;
            var imgDiscovered   = ImagesDiscovered + other.ImagesDiscovered;
            var htmlDownloaded  = HtmlDocumentsDownloaded + other.HtmlDocumentsDownloaded;
            var imgDownloaded   = ImagesDownloaded + other.ImagesDownloaded;
            var htmlBytes       = HtmlBytesDownloaded + other.HtmlBytesDownloaded;
            var imgBytes        = ImageBytesDownloaded + other.ImageBytesDownloaded;

            return Copy(htmlDiscovered, imgDiscovered, htmlDownloaded, imgDownloaded, htmlBytes, imgBytes);
        }
Exemple #8
0
 /// <summary>
 ///     Determine if this instance can merge with another <see cref="CrawlJobStats" />
 /// </summary>
 /// <param name="other">The stats to compare against; may be <c>null</c>.</param>
 /// <returns>
 ///     <c>true</c> when <paramref name="other" /> is not <c>null</c> and its key equals
 ///     the key of this instance; otherwise <c>false</c>.
 /// </returns>
 public bool CanMerge(CrawlJobStats other)
 {
     // Nothing to merge with: answer false instead of failing with a
     // NullReferenceException when reading other.Key.
     if (ReferenceEquals(other, null))
     {
         return false;
     }

     return Key.Equals(other.Key);
 }
 /// <summary>
 ///     Builds a new <see cref="JobStatusUpdate" /> that carries
 ///     <paramref name="newStats" /> and keeps the job, status, start time and
 ///     end time of this instance.
 /// </summary>
 /// <param name="newStats">The stats for the new update.</param>
 /// <returns>The newly created update.</returns>
 public JobStatusUpdate WithStats(CrawlJobStats newStats)
 {
     var updated = new JobStatusUpdate(Job, newStats, Status, StartTime, EndTime);
     return updated;
 }
Exemple #10
0
        /// <summary>
        ///     Registers the message handlers of this coordinator. Handlers are
        ///     registered in the same order as before.
        /// </summary>
        private void Receiving()
        {
            // Publish accumulated stats to the commander, then start counting afresh
            Receive<PublishStatsTick>(tick =>
            {
                if (Stats.IsEmpty)
                {
                    return;
                }

                _logger.Info("Publishing {0} to parent", Stats);
                Commander.Tell(Stats.Copy());

                // reset our stats after publishing
                Stats = Stats.Reset();
            });

            // Forward document checks to the downloads tracker, which replies back to us
            Receive<CheckDocuments>(check => DownloadsTracker.Tell(check));

            // Fold newly discovered documents into the local stats
            Receive<DiscoveredDocuments>(found =>
            {
                Stats = Stats.WithDiscovered(found);
            });

            // Turn each document of the batch into a download request and send it to the parent
            Receive<ProcessDocuments>(batch =>
            {
                foreach (var document in batch.Documents)
                {
                    // Context.Parent is the router between the coordinators and the Commander
                    if (!document.IsImage)
                    {
                        Context.Parent.Tell(new DownloadWorker.DownloadHtmlDocument(document));
                        continue;
                    }

                    Context.Parent.Tell(new DownloadWorker.DownloadImage(document));
                }
            });

            // hand the work off to the downloaders
            Receive<DownloadWorker.IDownloadDocument>(request => DownloaderRouter.Tell(request));

            Receive<CompletedDocument>(done =>
            {
                //TODO: send verbose status messages to commander here?
                Stats = Stats.WithCompleted(done);
            });

            /* Set all of our local downloaders to message our local parsers */
            Receive<DownloadWorker.RequestParseActor>(
                ask => Sender.Tell(new DownloadWorker.SetParseActor(ParserRouter)));

            /* Set all of our local parsers to message our local downloaders */
            Receive<ParseWorker.RequestDownloadActor>(
                ask => Sender.Tell(new ParseWorker.SetDownloadActor(DownloaderRouter)));
        }
        /// <summary>
        /// Sums the counters of this instance and <paramref name="other"/> into a copy.
        /// </summary>
        /// <param name="other">The stats to add to this instance.</param>
        /// <returns>
        /// A copy carrying the summed counters when <see cref="CanMerge"/> accepts
        /// <paramref name="other"/>; otherwise this instance unchanged.
        /// </returns>
        public CrawlJobStats Merge(CrawlJobStats other)
        {
            // Not mergeable: hand back this instance untouched
            if (!CanMerge(other))
            {
                return this;
            }

            // Discovered counts
            var discoveredHtml = HtmlDocumentsDiscovered + other.HtmlDocumentsDiscovered;
            var discoveredImages = ImagesDiscovered + other.ImagesDiscovered;

            // Downloaded counts
            var downloadedHtml = HtmlDocumentsDownloaded + other.HtmlDocumentsDownloaded;
            var downloadedImages = ImagesDownloaded + other.ImagesDownloaded;

            // Byte totals
            var bytesHtml = HtmlBytesDownloaded + other.HtmlBytesDownloaded;
            var bytesImages = ImageBytesDownloaded + other.ImageBytesDownloaded;

            return Copy(discoveredHtml, discoveredImages, downloadedHtml, downloadedImages, bytesHtml, bytesImages);
        }
 /// <summary>
 /// Determine if this instance can merge with another <see cref="CrawlJobStats"/>
 /// </summary>
 /// <param name="other">The stats to compare against; may be <c>null</c>.</param>
 /// <returns>
 /// <c>true</c> when <paramref name="other"/> is not <c>null</c> and its key equals
 /// the key of this instance; otherwise <c>false</c>.
 /// </returns>
 public bool CanMerge(CrawlJobStats other)
 {
     // Nothing to merge with: answer false instead of failing with a
     // NullReferenceException when reading other.Key.
     if (ReferenceEquals(other, null))
     {
         return false;
     }

     return Key.Equals(other.Key);
 }