/// <summary>
/// Creates a new master actor responsible for supervising the given crawl job.
/// </summary>
/// <param name="job">The crawl job this master coordinates.</param>
public CrawlMaster(CrawlJob job)
{
    Job = job;
    RunningStatus = new JobStatusUpdate(job);
    TotalStats = new CrawlJobStats(job);

    // If no message arrives within 5 seconds, we'll receive a ReceiveTimeout.
    Context.SetReceiveTimeout(TimeSpan.FromSeconds(5));

    // Begin in the "waiting for tracker" behavior.
    WaitingForTracker();
}
/// <summary>
/// Deserialization constructor.
/// </summary>
/// <remarks>
/// Marked with <see cref="JsonConstructorAttribute"/> so JSON.NET knows which
/// constructor to pick when rehydrating a serialized status update.
/// </remarks>
[JsonConstructor]
public JobStatusUpdate(CrawlJob job, CrawlJobStats stats, JobStatus status, DateTime startTime, DateTime? endTime)
{
    Job = job;
    Stats = stats;
    Status = status;
    StartTime = startTime;
    EndTime = endTime;
}
/// <summary>
/// Creates a new master actor responsible for supervising the given crawl job.
/// </summary>
/// <param name="job">The crawl job this master coordinates.</param>
public CrawlMaster(CrawlJob job)
{
    Job = job;
    RunningStatus = new JobStatusUpdate(job);

    // Seed the totals with one discovered document: the job's index page itself.
    TotalStats = new CrawlJobStats(job).Copy(1);

    // If no message arrives within 5 seconds, we'll receive a ReceiveTimeout.
    Context.SetReceiveTimeout(TimeSpan.FromSeconds(5));

    // Begin in the "waiting for tracker" behavior.
    WaitingForTracker();
}
/// <summary>
/// Creates a coordinator that distributes download work for a single crawl job.
/// </summary>
/// <param name="job">The crawl job being processed.</param>
/// <param name="commander">Actor that receives aggregated stats updates.</param>
/// <param name="downloadsTracker">Actor that tracks which documents have been seen.</param>
/// <param name="maxConcurrentDownloads">Upper bound on simultaneous downloads.</param>
public DownloadCoordinator(CrawlJob job, IActorRef commander, IActorRef downloadsTracker, long maxConcurrentDownloads)
{
    Job = job;
    Commander = commander;
    DownloadsTracker = downloadsTracker;
    MaxConcurrentDownloads = maxConcurrentDownloads;
    Stats = new CrawlJobStats(job);
    Receiving();
}
/// <summary>
/// Creates a coordinator and materializes the Akka.Streams graph that performs
/// the actual HTML and image downloads for this crawl job.
/// </summary>
/// <param name="job">The crawl job being processed.</param>
/// <param name="commander">Actor that receives aggregated stats updates.</param>
/// <param name="downloadsTracker">Actor that tracks which documents have been seen.</param>
/// <param name="maxConcurrentDownloads">Configured download concurrency limit (stored on MaxConcurrentDownloads).</param>
public DownloadCoordinator(CrawlJob job, IActorRef commander, IActorRef downloadsTracker, long maxConcurrentDownloads)
{
    Job = job;
    DownloadsTracker = downloadsTracker;
    MaxConcurrentDownloads = maxConcurrentDownloads;
    Commander = commander;
    Stats = new CrawlJobStats(Job);

    // Sinks that feed stream outputs back to this actor as ordinary messages;
    // StreamCompleteTick.Instance is delivered once the stream completes.
    var selfHtmlSink = Sink.ActorRef<CheckDocuments>(Self, StreamCompleteTick.Instance);
    var selfDocSink = Sink.ActorRef<CompletedDocument>(Self, StreamCompleteTick.Instance);
    var selfImgSink = Sink.ActorRef<CompletedDocument>(Self, StreamCompleteTick.Instance);

    // HTML pipeline: classify -> throttle (30 elements per 5s, burst 100) -> download.
    // NOTE(review): both pipelines use DefaultMaxConcurrentDownloads rather than the
    // maxConcurrentDownloads constructor argument — confirm that's intended.
    var htmlFlow = Flow.Create<CrawlDocument>().Via(DownloadFlow.SelectDocType())
        .Throttle(30, TimeSpan.FromSeconds(5), 100, ThrottleMode.Shaping)
        .Via(DownloadFlow.ProcessHtmlDownloadFor(DefaultMaxConcurrentDownloads, HttpClientFactory.GetClient()));

    // Image pipeline: classify -> throttle (30 elements per 1s, burst 100) -> download -> mark completed.
    var imageFlow = Flow.Create<CrawlDocument>()
        .Via(DownloadFlow.SelectDocType())
        .Throttle(30, TimeSpan.FromSeconds(1), 100, ThrottleMode.Shaping)
        .Via(DownloadFlow.ProcessImageDownloadFor(DefaultMaxConcurrentDownloads, HttpClientFactory.GetClient()))
        .Via(DownloadFlow.ProcessCompletedDownload());

    // Buffered entry point for documents; drops the newest element once 5000 are queued.
    var source = Source.ActorRef<CrawlDocument>(5000, OverflowStrategy.DropTail);

    var graph = GraphDsl.Create(source, (builder, s) =>
    {
        // html flows
        var downloadHtmlFlow = builder.Add(htmlFlow);
        var downloadBroadcast = builder.Add(new Broadcast<DownloadHtmlResult>(2));
        var completedDownload = builder.Add(DownloadFlow.ProcessCompletedHtmlDownload());
        var parseCompletedDownload = builder.Add(ParseFlow.GetParseFlow(Job));
        var htmlSink = builder.Add(selfHtmlSink);
        var docSink = builder.Add(selfDocSink);
        builder.From(downloadHtmlFlow).To(downloadBroadcast);
        // Each downloaded HTML result is both recorded as completed AND parsed for new links.
        builder.From(downloadBroadcast.Out(0)).To(completedDownload.Inlet);
        builder.From(downloadBroadcast.Out(1)).To(parseCompletedDownload.Inlet);
        builder.From(parseCompletedDownload).To(htmlSink);
        builder.From(completedDownload).To(docSink);

        // image flows
        var imgSink = builder.Add(selfImgSink);
        var downloadImageFlow = builder.Add(imageFlow);
        builder.From(downloadImageFlow).To(imgSink);

        // Every incoming CrawlDocument is broadcast to BOTH pipelines; the
        // SelectDocType stage in each presumably filters out documents that
        // don't belong to that pipeline — verify against DownloadFlow.
        var sourceBroadcast = builder.Add(new Broadcast<CrawlDocument>(2));
        builder.From(sourceBroadcast.Out(0)).To(downloadImageFlow.Inlet);
        builder.From(sourceBroadcast.Out(1)).To(downloadHtmlFlow.Inlet);
        builder.From(s.Outlet).To(sourceBroadcast.In);
        return(ClosedShape.Instance);
    });

    // Materialize the graph; SourceActor is the handle we push documents into.
    SourceActor = Context.Materializer().Materialize(graph);

    Receiving();
}
/// <summary>
/// Registers the message handlers for this actor (streams-based coordinator).
/// </summary>
private void Receiving()
{
    Receive<PublishStatsTick>(tick =>
    {
        if (Stats.IsEmpty)
            return;

        _logger.Info("Publishing {0} to parent", Stats);
        Commander.Tell(Stats.Copy());

        // Stats are deltas: clear them once they've been pushed upstream.
        Stats = Stats.Reset();
    });

    // A ParseWorker found candidate documents; forward to the downloads tracker,
    // but have it reply to our parent router so work gets distributed more evenly.
    Receive<CheckDocuments>(check => DownloadsTracker.Tell(check, Context.Parent));

    // Fold newly discovered documents into our local stats.
    Receive<DiscoveredDocuments>(discovered => Stats = Stats.WithDiscovered(discovered));

    // The DownloadsTracker confirmed these documents need processing - feed the stream.
    Receive<ProcessDocuments>(process =>
    {
        foreach (var doc in process.Documents)
            SourceActor.Tell(doc);
    });

    // Hand individual download requests off to the stream as well.
    Receive<IDownloadDocument>(download => SourceActor.Tell(download.Document));

    Receive<CompletedDocument>(completed =>
    {
        _logger.Info("Logging completed download {0} bytes {1}", completed.Document.DocumentUri, completed.NumBytes);
        Stats = Stats.WithCompleted(completed);
        _logger.Info("Total stats {0}", Stats);
    });

    Receive<StreamCompleteTick>(_ => _logger.Info("Stream has completed. No more messages to process."));
}
/// <summary>
/// Combine two stats objects
/// </summary>
/// <param name="other">The stats instance to fold into this one.</param>
/// <returns>A new combined instance, or this instance unchanged when the two can't merge.</returns>
public CrawlJobStats Merge(CrawlJobStats other)
{
    // Only stats for the same job can be combined.
    if (!CanMerge(other))
        return this;

    return Copy(
        HtmlDocumentsDiscovered + other.HtmlDocumentsDiscovered,
        ImagesDiscovered + other.ImagesDiscovered,
        HtmlDocumentsDownloaded + other.HtmlDocumentsDownloaded,
        ImagesDownloaded + other.ImagesDownloaded,
        HtmlBytesDownloaded + other.HtmlBytesDownloaded,
        ImageBytesDownloaded + other.ImageBytesDownloaded);
}
/// <summary>
/// Determine if this instance can merge with another <see cref="CrawlJobStats" />
/// </summary>
/// <param name="other">The stats instance to test; may be null.</param>
/// <returns>true when <paramref name="other"/> is non-null and belongs to the same job (matching Key).</returns>
public bool CanMerge(CrawlJobStats other)
{
    // Guard against null so callers (e.g. Merge) can't trip a NullReferenceException
    // on other.Key; a null argument simply isn't mergeable.
    return other != null && Key.Equals(other.Key);
}
/// <summary>
/// Produces a new status snapshot carrying <paramref name="newStats"/> while
/// preserving this instance's job, status, and timestamps.
/// </summary>
public JobStatusUpdate WithStats(CrawlJobStats newStats)
    => new JobStatusUpdate(Job, newStats, Status, StartTime, EndTime);
/// <summary>
/// Registers the message handlers for this actor (router-based coordinator).
/// </summary>
private void Receiving()
{
    Receive<PublishStatsTick>(tick =>
    {
        if (Stats.IsEmpty)
            return;

        _logger.Info("Publishing {0} to parent", Stats);
        Commander.Tell(Stats.Copy());

        // Stats are deltas: clear them once they've been pushed upstream.
        Stats = Stats.Reset();
    });

    // A ParseWorker found candidate documents; the tracker replies back to us.
    Receive<CheckDocuments>(check => DownloadsTracker.Tell(check));

    // Fold newly discovered documents into our local stats.
    Receive<DiscoveredDocuments>(discovered => Stats = Stats.WithDiscovered(discovered));

    // The tracker confirmed these documents need downloading.
    Receive<ProcessDocuments>(process =>
    {
        foreach (var doc in process.Documents)
        {
            // Context.Parent is the router between the coordinators and the Commander
            if (doc.IsImage)
            {
                Context.Parent.Tell(new DownloadWorker.DownloadImage(doc));
            }
            else
            {
                Context.Parent.Tell(new DownloadWorker.DownloadHtmlDocument(doc));
            }
        }
    });

    // Hand the work off to the downloaders.
    Receive<DownloadWorker.IDownloadDocument>(download => DownloaderRouter.Tell(download));

    Receive<CompletedDocument>(completed =>
    {
        //TODO: send verbose status messages to commander here?
        Stats = Stats.WithCompleted(completed);
    });

    /* Set all of our local downloaders to message our local parsers */
    Receive<DownloadWorker.RequestParseActor>(request => Sender.Tell(new DownloadWorker.SetParseActor(ParserRouter)));

    /* Set all of our local parsers to message our local downloaders */
    Receive<ParseWorker.RequestDownloadActor>(request => Sender.Tell(new ParseWorker.SetDownloadActor(DownloaderRouter)));
}
/// <summary>
/// Combine two stats objects
/// </summary>
/// <param name="other">The stats instance to fold into this one.</param>
/// <returns>A new combined instance, or this instance unchanged when the two can't merge.</returns>
public CrawlJobStats Merge(CrawlJobStats other)
{
    return CanMerge(other)
        ? Copy(
            HtmlDocumentsDiscovered + other.HtmlDocumentsDiscovered,
            ImagesDiscovered + other.ImagesDiscovered,
            HtmlDocumentsDownloaded + other.HtmlDocumentsDownloaded,
            ImagesDownloaded + other.ImagesDownloaded,
            HtmlBytesDownloaded + other.HtmlBytesDownloaded,
            ImageBytesDownloaded + other.ImageBytesDownloaded)
        : this;
}
/// <summary>
/// Determine if this instance can merge with another <see cref="CrawlJobStats"/>
/// </summary>
/// <param name="other">The stats instance to test; may be null.</param>
/// <returns>true when <paramref name="other"/> is non-null and belongs to the same job (matching Key).</returns>
public bool CanMerge(CrawlJobStats other)
{
    // Guard against null so callers (e.g. Merge) can't trip a NullReferenceException
    // on other.Key; a null argument simply isn't mergeable.
    return other != null && Key.Equals(other.Key);
}