// Verifies that when the job store throws while transitioning a job's state,
// the service requests a Fail transition exactly once and logs
// "Error while transitioning job state." exactly once at Error level.
public async Task ShallLogErrorOnJobTransitionError()
{
    // Arrange: one job in the Created state; the second Take() cancels the service.
    var job = new InferenceJob();
    job.JobId = "1";
    job.PayloadId = "1";
    job.State = InferenceJobState.Created;
    job.Source = "Source";
    job.SetStoragePath("/job");

    _jobStore
        .SetupSequence(store => store.Take(It.IsAny<CancellationToken>()))
        .Returns(Task.FromResult(job))
        .Returns(() =>
        {
            _cancellationTokenSource.Cancel();
            throw new OperationCanceledException();
        });

    // Every state transition fails.
    _jobStore
        .Setup(store => store.TransitionState(
            It.IsAny<InferenceJob>(),
            It.IsAny<InferenceJobStatus>(),
            It.IsAny<CancellationToken>()))
        .Throws(new Exception("error"));

    var sut = new JobSubmissionService(
        _instanceCleanupQueue.Object,
        _logger.Object,
        _serviceScopeFactory.Object,
        _fileSystem.Object,
        _configuration);

    // Act
    await sut.StartAsync(_cancellationTokenSource.Token);
    BlockUntilCanceled(_cancellationTokenSource.Token);

    // Assert
    _jobStore.Verify(
        store => store.TransitionState(job, InferenceJobStatus.Fail, It.IsAny<CancellationToken>()),
        Times.Once());
    _logger.VerifyLogging("Error while transitioning job state.", LogLevel.Error, Times.Once());
}
// Verifies that a failed transition below the retry limit puts the job back
// into the expected state, increments the try count from 1 to 2, logs the
// retry message once, and saves the change once with the caller's token.
public async Task TransitionState_Fail_ShallTransitionJob(InferenceJobState initalState, InferenceJobState endingState)
{
    // Arrange
    var job = new InferenceJob();
    job.JobId = Guid.NewGuid().ToString();
    job.PayloadId = Guid.NewGuid().ToString();
    job.SetStoragePath("/path/to/job");
    job.State = initalState;
    job.TryCount = 1;

    var cancellationSource = new CancellationTokenSource();
    _inferenceJobRepository.SetupSequence(p => p.AsQueryable())
        .Returns((new List<InferenceJob>() { job }).AsQueryable());
    _inferenceJobRepository.Setup(p => p.SaveChangesAsync(It.IsAny<CancellationToken>()));

    var jobStore = new ClaraJobRepository(
        _logger.Object,
        _configuration,
        _fileSystem,
        _inferenceJobRepository.Object);

    // Act
    var result = await jobStore.TransitionState(job, InferenceJobStatus.Fail, cancellationSource.Token);

    // Assert
    Assert.Equal(job, result);
    // Check the state on the returned job. The previous assertion compared
    // endingState with itself and could never fail.
    Assert.Equal(endingState, result.State);
    Assert.Equal(2, result.TryCount);
    _logger.VerifyLoggingMessageBeginsWith($"Putting inference job {job.JobId} back to {endingState} state for retry.", LogLevel.Information, Times.Once());
    _inferenceJobRepository.Verify(p => p.SaveChangesAsync(cancellationSource.Token), Times.Once());
}
/// <summary>
/// Saves pending repository changes for <paramref name="job"/>, retrying up to
/// three times with an exponential (2^attempt seconds) wait between attempts.
/// Jobs in the Completed or Faulted state are detached from the repository
/// after a successful save.
/// </summary>
/// <param name="job">The inference job being updated; must not be null.</param>
/// <param name="cancellationToken">Token passed to the retry policy and the save call.</param>
private async Task UpdateInferenceJob(InferenceJob job, CancellationToken cancellationToken = default)
{
    Guard.Against.Null(job, nameof(job));

    var retryPolicy = Policy
        .Handle<Exception>()
        .WaitAndRetryAsync(
            3,
            attempt => TimeSpan.FromSeconds(Math.Pow(2, attempt)),
            (exception, timeSpan, retryCount, context) =>
            {
                _logger.Log(LogLevel.Warning, exception, $"Failed to update job. Waiting {timeSpan} before next retry. Retry attempt {retryCount}.");
            });

    await retryPolicy
        .ExecuteAsync(
            async (token) =>
            {
                _logger.Log(LogLevel.Debug, $"Updating inference job.");
                await _inferenceJobRepository.SaveChangesAsync(token);

                // Stop tracking jobs that have reached the Completed or Faulted state.
                var isCompleted = job.State == InferenceJobState.Completed;
                if (isCompleted || job.State == InferenceJobState.Faulted)
                {
                    _inferenceJobRepository.Detach(job);
                }

                _logger.Log(LogLevel.Debug, $"Inference job updated.");
            },
            cancellationToken)
        .ConfigureAwait(false);
}
// Verifies that updating a job with a Success status logs both removal
// messages and issues exactly one delete call for the job's CRD.
public async Task UpdateSuccess_ShallDeleteJobCrd()
{
    // Arrange
    var deleteResponse = new HttpOperationResponse<object>
    {
        Response = new HttpResponseMessage()
    };
    _kubernetesClient
        .Setup(client => client.DeleteNamespacedCustomObjectWithHttpMessagesAsync(
            It.IsAny<CustomResourceDefinition>(),
            It.IsAny<string>()))
        .Returns(Task.FromResult(deleteResponse));

    var clararJob = new Job
    {
        JobId = Guid.NewGuid().ToString(),
        PayloadId = Guid.NewGuid().ToString()
    };
    var item = new InferenceJob("/path/to/job", clararJob);

    var sut = new JobStore(
        _loggerFactory.Object,
        _configuration,
        _kubernetesClient.Object,
        _fileSystem);

    // Act
    await sut.Update(item, InferenceJobStatus.Success);

    // Assert
    _logger.VerifyLogging($"Removing job {item.JobId} from job store as completed.", LogLevel.Information, Times.Once());
    _logger.VerifyLogging($"Job {item.JobId} removed from job store.", LogLevel.Information, Times.Once());
    _kubernetesClient.Verify(
        client => client.DeleteNamespacedCustomObjectWithHttpMessagesAsync(It.IsAny<CustomResourceDefinition>(), item.JobId),
        Times.Once());
}
// Verifies that when processing a job raises an exception the service logs
// "Error uploading payloads/starting job." once and updates the job with a
// Fail status once.
public async Task ShallFailJobOnException()
{
    // Arrange: one job, then cancellation on the second Take().
    var job = new InferenceJob("/job", new Job { JobId = "1", PayloadId = "1" });

    _jobStore
        .SetupSequence(store => store.Take(It.IsAny<CancellationToken>()))
        .Returns(Task.FromResult(job))
        .Returns(() =>
        {
            _cancellationTokenSource.Cancel();
            throw new OperationCanceledException();
        });
    _jobStore.Setup(store => store.Update(It.IsAny<InferenceJob>(), It.IsAny<InferenceJobStatus>()));

    var sut = new JobSubmissionService(
        _instanceCleanupQueue.Object,
        _logger.Object,
        _jobsApi.Object,
        _payloadsApi.Object,
        _jobStore.Object,
        _fileSystem.Object);

    // Act
    await sut.StartAsync(_cancellationTokenSource.Token);
    BlockUntilCanceled(_cancellationTokenSource.Token);

    // Assert
    _logger.VerifyLogging("Error uploading payloads/starting job.", LogLevel.Error, Times.Once());
    _jobStore.Verify(store => store.Update(job, InferenceJobStatus.Fail), Times.Once());
}
// Verifies that updating a job (TryCount = 2) with a Fail status logs the
// retry messages, patches the CRD exactly once, and never deletes it.
public async Task UpdateFail_ShallUpdateCountAndUpdateCrd()
{
    // Arrange
    var deleteResponse = new HttpOperationResponse<object> { Response = new HttpResponseMessage() };
    _kubernetesClient
        .Setup(client => client.DeleteNamespacedCustomObjectWithHttpMessagesAsync(
            It.IsAny<CustomResourceDefinition>(),
            It.IsAny<string>()))
        .Returns(Task.FromResult(deleteResponse));

    var patchResponse = new HttpOperationResponse<object> { Response = new HttpResponseMessage() };
    _kubernetesClient
        .Setup(client => client.PatchNamespacedCustomObjectWithHttpMessagesAsync(
            It.IsAny<CustomResourceDefinition>(),
            It.IsAny<object>(),
            It.IsAny<string>()))
        .Returns(Task.FromResult(patchResponse));

    var claraJob = new Job
    {
        JobId = Guid.NewGuid().ToString(),
        PayloadId = Guid.NewGuid().ToString()
    };
    var item = new InferenceJob("/path/to/job", claraJob);
    item.TryCount = 2;

    var sut = new JobStore(
        _loggerFactory.Object,
        _configuration,
        _kubernetesClient.Object,
        _fileSystem);

    // Act
    await sut.Update(item, InferenceJobStatus.Fail);

    // Assert
    _logger.VerifyLogging($"Adding job {item.JobId} back to job store for retry.", LogLevel.Debug, Times.Once());
    _logger.VerifyLogging($"Job {item.JobId} added back to job store for retry.", LogLevel.Information, Times.Once());
    _kubernetesClient.Verify(
        client => client.DeleteNamespacedCustomObjectWithHttpMessagesAsync(It.IsAny<CustomResourceDefinition>(), item.JobId),
        Times.Never());
    _kubernetesClient.Verify(
        client => client.PatchNamespacedCustomObjectWithHttpMessagesAsync(It.IsAny<CustomResourceDefinition>(), It.IsAny<object>(), item.JobId),
        Times.Once());
}
/// <summary>
/// Updates the job store after a submission attempt.
/// On <see cref="InferenceJobStatus.Success"/> the job is deleted from the store.
/// On any other status the job's try count is incremented: if it then exceeds
/// <c>MaxRetryLimit</c> the job is deleted, otherwise it is put back into the
/// Queued state and saved for another attempt.
/// </summary>
/// <param name="request">The job whose submission attempt finished.</param>
/// <param name="status">Outcome of the submission attempt.</param>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
public async Task Update(InferenceJob request, InferenceJobStatus status)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException
    // raised part-way through the update.
    if (request is null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    if (status == InferenceJobStatus.Success)
    {
        _logger.Log(LogLevel.Information, $"Removing job {request.JobId} from job store as completed.");
        await Delete(request);
        return;
    }

    // The try count is incremented before it is compared with the limit.
    if (++request.TryCount > MaxRetryLimit)
    {
        _logger.Log(LogLevel.Information, $"Exceeded maximum job submission retries; removing job {request.JobId} from job store.");
        await Delete(request);
        return;
    }

    _logger.Log(LogLevel.Debug, $"Adding job {request.JobId} back to job store for retry.");
    request.State = InferenceJobState.Queued;
    _logger.Log(LogLevel.Debug, $"Updating request {request.JobId} to Queued.");
    await UpdateInferenceJob(request);
    _logger.Log(LogLevel.Information, $"Job {request.JobId} added back to job store for retry.");
}
// Verifies that a failed transition for a job already at TryCount = 3 ends in
// the Faulted state with TryCount = 4, logs the "exceeded maximum number of
// retries" warning once, and saves once with the caller's token.
public async Task TransitionState_Fail_ShallPutJobInFaultedState()
{
    // Arrange
    var job = new InferenceJob
    {
        JobId = Guid.NewGuid().ToString(),
        PayloadId = Guid.NewGuid().ToString()
    };
    job.SetStoragePath("/path/to/job");
    job.State = InferenceJobState.Creating;
    job.TryCount = 3;

    var tokenSource = new CancellationTokenSource();
    var storedJobs = new List<InferenceJob>() { job };
    _inferenceJobRepository
        .SetupSequence(repository => repository.AsQueryable())
        .Returns(storedJobs.AsQueryable());
    _inferenceJobRepository.Setup(repository => repository.SaveChangesAsync(It.IsAny<CancellationToken>()));

    var sut = new ClaraJobRepository(
        _logger.Object,
        _configuration,
        _fileSystem,
        _inferenceJobRepository.Object);

    // Act
    var result = await sut.TransitionState(job, InferenceJobStatus.Fail, tokenSource.Token);

    // Assert
    Assert.Equal(job, result);
    Assert.Equal(InferenceJobState.Faulted, result.State);
    Assert.Equal(4, result.TryCount);
    _logger.VerifyLoggingMessageBeginsWith($"Job {job.JobId} exceeded maximum number of retries.", LogLevel.Warning, Times.Once());
    _inferenceJobRepository.Verify(repository => repository.SaveChangesAsync(tokenSource.Token), Times.Once());
}
// Verifies that a job in the Starting state is started through the jobs API
// once and then transitioned with a Success status once.
public async Task StartsJobAndTransitionsState()
{
    // Arrange: one job in the Starting state, then cancellation.
    var job = new InferenceJob();
    job.JobId = "1";
    job.PayloadId = "1";
    job.State = InferenceJobState.Starting;
    job.Source = "Source";
    job.SetStoragePath("/job");

    _jobStore
        .SetupSequence(store => store.Take(It.IsAny<CancellationToken>()))
        .Returns(Task.FromResult(job))
        .Returns(() =>
        {
            _cancellationTokenSource.Cancel();
            throw new OperationCanceledException();
        });
    _jobStore.Setup(store => store.TransitionState(
        It.IsAny<InferenceJob>(),
        It.IsAny<InferenceJobStatus>(),
        It.IsAny<CancellationToken>()));
    _jobsApi.Setup(api => api.Start(It.IsAny<Job>()));

    var sut = new JobSubmissionService(
        _instanceCleanupQueue.Object,
        _logger.Object,
        _serviceScopeFactory.Object,
        _fileSystem.Object,
        _configuration);

    // Act
    await sut.StartAsync(_cancellationTokenSource.Token);
    BlockUntilCanceled(_cancellationTokenSource.Token);

    // Assert
    _jobStore.Verify(
        store => store.TransitionState(job, InferenceJobStatus.Success, It.IsAny<CancellationToken>()),
        Times.Once());
    _jobsApi.Verify(api => api.Start(It.IsAny<Job>()), Times.Once());
}
// Verifies that Take() returns the stored job, moves it from the initial state
// to the expected ending state, and logs the state change once.
public async Task Take_ShallReturnAJob(InferenceJobState initalState, InferenceJobState endingState)
{
    // Arrange
    var job = new InferenceJob
    {
        JobId = Guid.NewGuid().ToString(),
        PayloadId = Guid.NewGuid().ToString()
    };
    job.SetStoragePath("/path/to/job");
    job.State = initalState;

    var tokenSource = new CancellationTokenSource();
    var storedJobs = new List<InferenceJob>() { job };
    _inferenceJobRepository
        .SetupSequence(repository => repository.AsQueryable())
        .Returns(storedJobs.AsQueryable());

    var sut = new ClaraJobRepository(
        _logger.Object,
        _configuration,
        _fileSystem,
        _inferenceJobRepository.Object);

    // Act
    var result = await sut.Take(tokenSource.Token);

    // Assert
    Assert.Equal(job, result);
    Assert.Equal(endingState, job.State);
    _logger.VerifyLoggingMessageBeginsWith($"Updating inference job {job.JobId} from {initalState} to {endingState}.", LogLevel.Information, Times.Once());
}
/// <summary>
/// Copies the job's payload into its storage path: first the instances
/// (via <c>CopyInstances</c>), then the resources (via <c>CopyResources</c>).
/// </summary>
/// <param name="request">The job whose payload is copied; must not be null.</param>
private void MakeACopyOfPayload(InferenceJob request)
{
    Guard.Against.Null(request, nameof(request));

    var instanceCount = request.Instances.Count;
    var destination = request.JobPayloadsStoragePath;
    _logger.Log(LogLevel.Information, $"Copying {instanceCount} instances to {destination}.");

    CopyInstances(request);
    CopyResources(request);
}
/// <summary>
/// Uploads every file found under the job's payload storage path to the job's
/// payload, using up to <c>ParallelUploads</c> concurrent uploads. Each file is
/// queued for cleanup as soon as its upload succeeds.
/// </summary>
/// <param name="job">The job whose files are uploaded; must not be null.</param>
/// <param name="basePath">
/// Prefix removed from each file path to form the upload name. May be empty; a
/// trailing directory separator is appended when missing.
/// </param>
/// <exception cref="PayloadUploadException">One or more files failed to upload.</exception>
private async Task UploadFiles(InferenceJob job, string basePath)
{
    Guard.Against.Null(job, nameof(job));
    Guard.Against.Null(basePath, nameof(basePath)); // allow empty

    var filePaths = _fileSystem.Directory.GetFiles(job.JobPayloadsStoragePath, "*", System.IO.SearchOption.AllDirectories);

    if (!basePath.EndsWith(_fileSystem.Path.DirectorySeparatorChar))
    {
        basePath += _fileSystem.Path.DirectorySeparatorChar;
    }

    using var loggingScope = _logger.BeginScope(new LogginDataDictionary<string, object>
    {
        { "BasePath", basePath },
        { "JobId", job.JobId },
        { "PayloadId", job.PayloadId }
    });

    _logger.Log(LogLevel.Information, "Uploading {0} files.", filePaths.LongLength);

    // Incremented from concurrent upload workers, hence Interlocked below.
    var failureCount = 0;

    var options = new ExecutionDataflowBlockOptions
    {
        MaxDegreeOfParallelism = _configuration.Value.Services.Platform.ParallelUploads
    };

    var block = new ActionBlock<string>(
        async (file) =>
        {
            try
            {
                using var scope = _serviceScopeFactory.CreateScope();
                var payloadsApi = scope.ServiceProvider.GetRequiredService<IPayloads>();

                // Strip only a leading base path. Replacing every occurrence would
                // also remove matching segments deeper in the path (and, for an
                // empty base path, every directory separator).
                var name = file.StartsWith(basePath, StringComparison.Ordinal)
                    ? file.Substring(basePath.Length)
                    : file;

                await payloadsApi.Upload(job.PayloadId, name, file);

                // remove file immediately upon success upload to avoid another upload on next retry
                _cleanupQueue.QueueInstance(file);
            }
            catch (Exception ex)
            {
                _logger.Log(LogLevel.Error, ex, $"Error uploading file: {file}.");
                Interlocked.Increment(ref failureCount);
            }
        },
        options);

    foreach (var file in filePaths)
    {
        block.Post(file);
    }

    block.Complete();
    await block.Completion;

    if (failureCount != 0)
    {
        throw new PayloadUploadException($"Failed to upload {failureCount} files.");
    }

    _logger.Log(LogLevel.Information, "Upload to payload completed.");
}
/// <summary>
/// Builds a <see cref="JobCustomResource"/> for the given job: kind and API
/// version come from <c>CustomResourceDefinition.JobsCrd</c>, the resource is
/// named after the job ID, the job itself is the spec, and the status is
/// <c>InferenceJobCrdStatus.Default</c>.
/// </summary>
/// <param name="request">The job to wrap in a custom resource.</param>
/// <returns>The newly created custom resource.</returns>
private JobCustomResource CreateFromRequest(InferenceJob request)
{
    var resource = new JobCustomResource();
    resource.Kind = CustomResourceDefinition.JobsCrd.Kind;
    resource.ApiVersion = CustomResourceDefinition.JobsCrd.ApiVersion;
    resource.Metadata = new k8s.Models.V1ObjectMeta { Name = request.JobId };
    resource.Spec = request;
    resource.Status = InferenceJobCrdStatus.Default;
    return resource;
}
/// <summary>
/// Creates a job through the <c>IJobs</c> API using the job's pipeline ID, name,
/// priority and source metadata, then stores the returned job and payload IDs
/// on <paramref name="job"/>.
/// </summary>
/// <param name="job">The job to create; must not be null.</param>
private async Task CreateJob(InferenceJob job)
{
    Guard.Against.Null(job, nameof(job));

    var jobMetadata = new JobMetadataBuilder();
    jobMetadata.AddSourceName(job.Source);

    // The API client is resolved from a scope that lives for this call only.
    using var serviceScope = _serviceScopeFactory.CreateScope();
    var api = serviceScope.ServiceProvider.GetRequiredService<IJobs>();

    var created = await api.Create(job.PipelineId, job.JobName, job.Priority, jobMetadata);

    job.JobId = created.JobId;
    job.PayloadId = created.PayloadId;

    _logger.Log(LogLevel.Information, $"New JobId={job.JobId}, PayloadId={job.PayloadId}.");
}
/// <summary>
/// Generates a directory for the job under
/// <c>{TemporaryDataDirFullPath}/jobs/{JobId}</c> and records it as the job's
/// storage path.
/// </summary>
/// <param name="job">The job to configure; must not be null.</param>
/// <exception cref="JobStoreException">The directory could not be generated.</exception>
private void ConfigureStoragePath(InferenceJob job)
{
    Guard.Against.Null(job, nameof(job));

    var requestedPath = _fileSystem.Path.Combine(
        _configuration.Value.Storage.TemporaryDataDirFullPath,
        "jobs",
        $"{job.JobId}");

    if (!_fileSystem.Directory.TryGenerateDirectory(requestedPath, out var targetStoragePath))
    {
        throw new JobStoreException($"Failed to generate a temporary storage location");
    }

    _logger.Log(LogLevel.Information, $"Job payloads directory set to {targetStoragePath}");
    job.SetStoragePath(targetStoragePath);
}
/// <summary>
/// Moves <paramref name="job"/> to its next state based on the outcome of the
/// step it was in, then saves the change.
/// <para>
/// Success: Creating→Created, MetadataUploading→MetadataUploaded,
/// PayloadUploading→PayloadUploaded, Starting→Completed; the try count is reset
/// to 0 and <c>LastUpdate</c> is set to <see cref="DateTime.MinValue"/>.
/// </para>
/// <para>
/// Any other status: the try count is incremented. If it exceeds the configured
/// <c>MaxRetries</c> the job becomes Faulted; otherwise it steps back
/// (Creating→Queued, MetadataUploading→Created,
/// PayloadUploading→MetadataUploaded, Starting→PayloadUploaded).
/// <c>LastUpdate</c> is set to the current UTC time.
/// </para>
/// </summary>
/// <param name="job">The job to transition; must not be null.</param>
/// <param name="status">Outcome of the step the job was in.</param>
/// <param name="cancellationToken">Token forwarded to the save operation.</param>
/// <returns>The same <paramref name="job"/> instance, after the update.</returns>
/// <exception cref="ApplicationException">The job is in a state with no defined transition.</exception>
public async Task<InferenceJob> TransitionState(InferenceJob job, InferenceJobStatus status, CancellationToken cancellationToken = default)
{
    Guard.Against.Null(job, nameof(job));

    if (status == InferenceJobStatus.Success)
    {
        var originalState = job.State;

        InferenceJobState nextState;
        switch (originalState)
        {
            case InferenceJobState.Creating:
                nextState = InferenceJobState.Created;
                break;
            case InferenceJobState.MetadataUploading:
                nextState = InferenceJobState.MetadataUploaded;
                break;
            case InferenceJobState.PayloadUploading:
                nextState = InferenceJobState.PayloadUploaded;
                break;
            case InferenceJobState.Starting:
                nextState = InferenceJobState.Completed;
                break;
            default:
                throw new ApplicationException($"unsupported job state {job.State}");
        }

        job.State = nextState;
        job.TryCount = 0;
        job.LastUpdate = DateTime.MinValue;

        _logger.Log(LogLevel.Information, $"Updating inference job state {job.JobId} from {originalState} to {job.State}.");
        await UpdateInferenceJob(job, cancellationToken);
        return job;
    }

    // Failure path: count the attempt before checking the retry limit.
    job.TryCount++;
    if (job.TryCount > _configuration.Value.Services.Platform.MaxRetries)
    {
        _logger.Log(LogLevel.Warning, $"Job {job.JobId} exceeded maximum number of retries.");
        job.State = InferenceJobState.Faulted;
    }
    else
    {
        InferenceJobState retryState;
        switch (job.State)
        {
            case InferenceJobState.Creating:
                retryState = InferenceJobState.Queued;
                break;
            case InferenceJobState.MetadataUploading:
                retryState = InferenceJobState.Created;
                break;
            case InferenceJobState.PayloadUploading:
                retryState = InferenceJobState.MetadataUploaded;
                break;
            case InferenceJobState.Starting:
                retryState = InferenceJobState.PayloadUploaded;
                break;
            default:
                throw new ApplicationException($"unsupported job state {job.State}");
        }

        job.State = retryState;
        _logger.Log(LogLevel.Information, $"Putting inference job {job.JobId} back to {job.State} state for retry.");
    }

    job.LastUpdate = DateTime.UtcNow;
    await UpdateInferenceJob(job, cancellationToken);
    return job;
}
/// <summary>
/// Deletes the job's custom object through the Kubernetes client, retrying up
/// to three times (waiting 2^attempt seconds) on
/// <see cref="HttpOperationException"/>, and throws if the final response does
/// not carry a success status code.
/// </summary>
/// <param name="request">The job whose custom object is deleted.</param>
private async Task Delete(InferenceJob request)
{
    var retryPolicy = Policy
        .Handle<HttpOperationException>()
        .WaitAndRetryAsync(
            3,
            attempt => TimeSpan.FromSeconds(Math.Pow(2, attempt)),
            (exception, timeSpan, retryCount, context) =>
            {
                _logger.Log(LogLevel.Warning, exception, $"Failed to delete job {request.JobId} in CRD. Waiting {timeSpan} before next retry. Retry attempt {retryCount}. {(exception as HttpOperationException)?.Response?.Content}");
            });

    var response = await retryPolicy
        .ExecuteAsync(async () =>
            await _kubernetesClient.DeleteNamespacedCustomObjectWithHttpMessagesAsync(CustomResourceDefinition.JobsCrd, request.JobId))
        .ConfigureAwait(false);

    response.Response.EnsureSuccessStatusCode();

    _logger.Log(LogLevel.Information, $"Job {request.JobId} removed from job store.");
}
/// <summary>
/// Queues every file under the job's payload storage path for deletion via the
/// cleanup queue. Does nothing when the directory does not exist.
/// </summary>
/// <param name="job">The job whose files are queued for cleanup; must not be null.</param>
private void CleanupJobFiles(InferenceJob job)
{
    Guard.Against.Null(job, nameof(job));

    var storagePathExists = _fileSystem.Directory.Exists(job.JobPayloadsStoragePath);
    if (!storagePathExists)
    {
        return;
    }

    // Scope stays open until the method returns so both log entries carry the IDs.
    using var loggingScope = _logger.BeginScope(new LogginDataDictionary<string, object>
    {
        { "JobId", job.JobId },
        { "PayloadId", job.PayloadId }
    });

    var filePaths = _fileSystem.Directory.GetFiles(job.JobPayloadsStoragePath, "*", System.IO.SearchOption.AllDirectories);
    _logger.Log(LogLevel.Debug, $"Notifying Disk Reclaimer to delete {filePaths.LongLength} files.");

    for (var index = 0; index < filePaths.Length; index++)
    {
        _cleanupQueue.QueueInstance(filePaths[index]);
    }

    _logger.Log(LogLevel.Information, $"Notified Disk Reclaimer to delete {filePaths.LongLength} files.");
}
// Verifies that Add() calls AddAsync once and SaveChangesAsync once on the
// underlying repository.
public async Task Add_ShallAddItem()
{
    // Arrange
    var job = new InferenceJob
    {
        JobId = Guid.NewGuid().ToString(),
        PayloadId = Guid.NewGuid().ToString()
    };
    var instance = InstanceGenerator.GenerateInstance("./aet", "aet", fileSystem: _fileSystem);
    job.Instances.Add(instance);

    var sut = new ClaraJobRepository(
        _logger.Object,
        _configuration,
        _fileSystem,
        _inferenceJobRepository.Object);

    // Act
    await sut.Add(job);

    // Assert
    _inferenceJobRepository.Verify(
        repository => repository.AddAsync(It.IsAny<InferenceJob>(), It.IsAny<CancellationToken>()),
        Times.Once());
    _inferenceJobRepository.Verify(
        repository => repository.SaveChangesAsync(It.IsAny<CancellationToken>()),
        Times.Once());
}
/// <summary>
/// Main loop of the job submitter. Until cancellation is requested it takes a
/// job from the job store, uploads the job's files, starts the job, marks it as
/// successful and removes the uploaded files. Unexpected failures mark the job
/// as failed so the job store can decide whether to retry.
/// </summary>
/// <param name="cancellationToken">Token that stops the loop.</param>
private async Task BackgroundProcessing(CancellationToken cancellationToken)
{
    _logger.Log(LogLevel.Information, "Job Submitter Hosted Service is running.");

    while (!cancellationToken.IsCancellationRequested)
    {
        InferenceJob job = null;
        try
        {
            job = await _jobStore.Take(cancellationToken);
            using (_logger.BeginScope(new Dictionary<string, object> { { "JobId", job.JobId }, { "PayloadId", job.PayloadId } }))
            {
                var files = _fileSystem.Directory.GetFiles(job.JobPayloadsStoragePath, "*", System.IO.SearchOption.AllDirectories);
                await UploadFiles(job, job.JobPayloadsStoragePath, files);
                await _jobsApi.Start(job);
                await _jobStore.Update(job, InferenceJobStatus.Success);
                RemoveFiles(files);
            }
        }
        catch (OperationCanceledException ex)
        {
            // Supply the argument for the {0} placeholder, which was previously left unfilled.
            _logger.Log(LogLevel.Warning, ex, "Job Store Service canceled: {0}", ex.Message);
        }
        catch (InvalidOperationException ex)
        {
            _logger.Log(LogLevel.Warning, ex, "Job Store Service may be disposed or Jobs API returned an error: {0}", ex.Message);
        }
        catch (Exception ex)
        {
            _logger.Log(LogLevel.Error, ex, "Error uploading payloads/starting job.");
            if (job != null)
            {
                try
                {
                    await _jobStore.Update(job, InferenceJobStatus.Fail);
                }
                catch (Exception updateException)
                {
                    // A failure here must not escape the catch block: that would end
                    // the loop and stop all further job submissions.
                    _logger.Log(LogLevel.Error, updateException, $"Failed to update job {job.JobId} after a submission error.");
                }
            }
        }
    }

    _logger.Log(LogLevel.Information, "Cancellation requested.");
}
/// <summary>
/// Copies each of the job's resource files into the FHIR storage directory
/// derived from the job's payload storage path, overwriting existing files.
/// Copies that fail with a disk-full I/O error are retried, sleeping
/// <c>retryCount × 1000 ms</c> between attempts, with at most three retries
/// in total for the whole call; any other failure is logged and rethrown.
/// </summary>
/// <param name="request">The job whose resources are copied; must not be null.</param>
private void CopyResources(InferenceJob request)
{
    Guard.Against.Null(request, nameof(request));

    // True when the low 16 bits of the HRESULT match one of the disk-full codes.
    bool IsDiskFull(IOException error)
    {
        var code = error.HResult & 0xFFFF;
        return code == ERROR_HANDLE_DISK_FULL || code == ERROR_DISK_FULL;
    }

    var pending = new Stack<string>(request.Resources);
    var retrySleepMs = 1000;
    var retryCount = 0;

    while (pending.Count > 0)
    {
        try
        {
            var targetDirectory = _fileSystem.Path.GetFhirStoragePath(request.JobPayloadsStoragePath);
            _fileSystem.Directory.CreateDirectoryIfNotExists(targetDirectory);

            // Peek first; the entry is only removed once the copy has succeeded.
            var source = pending.Peek();
            var filename = _fileSystem.Path.GetFileName(source);
            var destPath = _fileSystem.Path.Combine(targetDirectory, filename);
            _fileSystem.File.Copy(source, destPath, true);
            _logger.Log(LogLevel.Debug, $"Resource {filename} moved to {destPath}");
            pending.Pop();
        }
        catch (IOException ex) when (IsDiskFull(ex))
        {
            retryCount++;
            if (retryCount > 3)
            {
                _logger.Log(LogLevel.Error, ex, $"Error copying file to {request.JobPayloadsStoragePath}; destination may be out of disk space.  Exceeded maximum retries.");
                throw;
            }

            _logger.Log(LogLevel.Error, ex, $"Error copying file to {request.JobPayloadsStoragePath}; destination may be out of disk space, will retry in {retrySleepMs}ms.");
            Thread.Sleep(retryCount * retrySleepMs);
        }
        catch (Exception ex)
        {
            _logger.Log(LogLevel.Error, ex, $"Failed to copy file {request.JobPayloadsStoragePath}.");
            throw;
        }
    }

    var remaining = pending.Count;
    var summaryLevel = remaining == 0 ? LogLevel.Information : LogLevel.Warning;
    _logger.Log(summaryLevel, $"Copied {request.Resources.Count - remaining:D} files to '{request.JobPayloadsStoragePath}'.");
}
// Verifies that when AddAsync keeps throwing, Add() surfaces the exception and
// "Error saving inference job." is logged three times at Error level.
public async Task Add_ShallRetryOnFailure()
{
    // Arrange
    var job = new InferenceJob
    {
        JobId = Guid.NewGuid().ToString(),
        PayloadId = Guid.NewGuid().ToString()
    };
    var instance = InstanceGenerator.GenerateInstance("./aet", "aet", fileSystem: _fileSystem);
    job.Instances.Add(instance);

    var sut = new ClaraJobRepository(
        _logger.Object,
        _configuration,
        _fileSystem,
        _inferenceJobRepository.Object);

    _inferenceJobRepository
        .Setup(repository => repository.AddAsync(It.IsAny<InferenceJob>(), It.IsAny<CancellationToken>()))
        .Throws(new Exception("error"));

    // Act + Assert
    await Assert.ThrowsAsync<Exception>(async () => await sut.Add(job));

    _logger.VerifyLoggingMessageBeginsWith($"Error saving inference job.", LogLevel.Error, Times.Exactly(3));
}
/// <summary>
/// Builds job metadata from the files under the job's payload storage path
/// (using the configured <c>UploadMetadata</c> and <c>MetadataDicomSource</c>
/// settings) and, when the result is not null or empty, adds it to the job
/// through the <c>IJobs</c> API.
/// </summary>
/// <param name="job">The job to attach metadata to; must not be null.</param>
private async Task UploadMetadata(InferenceJob job)
{
    Guard.Against.Null(job, nameof(job));

    using var serviceScope = _serviceScopeFactory.CreateScope();

    var files = _fileSystem.Directory.GetFiles(job.JobPayloadsStoragePath, "*", System.IO.SearchOption.AllDirectories);

    var platformSettings = _configuration.Value.Services.Platform;
    var builderFactory = serviceScope.ServiceProvider.GetRequiredService<IJobMetadataBuilderFactory>();
    var metadata = builderFactory.Build(
        platformSettings.UploadMetadata,
        platformSettings.MetadataDicomSource,
        files);

    // Nothing to upload.
    if (metadata.IsNullOrEmpty())
    {
        return;
    }

    var api = serviceScope.ServiceProvider.GetRequiredService<IJobs>();
    await api.AddMetadata(job, metadata);
}
// Verifies that a job in the MetadataUploading state has its files listed
// once, its metadata built once, and is transitioned with a Success status once.
public async Task UploadsMetadataAndTransitionsState()
{
    // Arrange: one job in the MetadataUploading state, then cancellation.
    var job = new InferenceJob();
    job.JobId = "1";
    job.PayloadId = "1";
    job.State = InferenceJobState.MetadataUploading;
    job.Source = "Source";
    job.SetStoragePath("/job");

    _jobStore
        .SetupSequence(store => store.Take(It.IsAny<CancellationToken>()))
        .Returns(Task.FromResult(job))
        .Returns(() =>
        {
            _cancellationTokenSource.Cancel();
            throw new OperationCanceledException();
        });
    _jobStore.Setup(store => store.TransitionState(
        It.IsAny<InferenceJob>(),
        It.IsAny<InferenceJobStatus>(),
        It.IsAny<CancellationToken>()));

    _fileSystem
        .Setup(fs => fs.Directory.GetFiles(It.IsAny<string>(), It.IsAny<string>(), It.IsAny<SearchOption>()))
        .Returns(new string[] { "/file1", "/file2" });

    var builtMetadata = new JobMetadataBuilder() { { "Test", "TestValue" } };
    _jobMetadataBuilderFactory
        .Setup(factory => factory.Build(It.IsAny<bool>(), It.IsAny<IReadOnlyList<string>>(), It.IsAny<IReadOnlyList<string>>()))
        .Returns(builtMetadata);

    var sut = new JobSubmissionService(
        _instanceCleanupQueue.Object,
        _logger.Object,
        _serviceScopeFactory.Object,
        _fileSystem.Object,
        _configuration);

    // Act
    await sut.StartAsync(_cancellationTokenSource.Token);
    BlockUntilCanceled(_cancellationTokenSource.Token);

    // Assert
    _jobStore.Verify(
        store => store.TransitionState(job, InferenceJobStatus.Success, It.IsAny<CancellationToken>()),
        Times.Once());
    _fileSystem.Verify(
        fs => fs.Directory.GetFiles(It.IsAny<string>(), It.IsAny<string>(), It.IsAny<SearchOption>()),
        Times.Once());
    _jobMetadataBuilderFactory.Verify(
        factory => factory.Build(It.IsAny<bool>(), It.IsAny<IReadOnlyList<string>>(), It.IsAny<IReadOnlyList<string>>()),
        Times.Once());
}
// Verifies that when every payload upload throws, each failure is logged, the
// aggregate "Failed to upload 3 files." error is logged once, the job is
// transitioned with a Fail status once, and no file is queued for cleanup.
public async Task ShallFailJobOnPayloadUploadException()
{
    // Arrange: one job in the PayloadUploading state, then cancellation.
    var job = new InferenceJob();
    job.JobId = "1";
    job.PayloadId = "1";
    job.State = InferenceJobState.PayloadUploading;
    job.Source = "Source";
    job.SetStoragePath("/job");

    _jobStore
        .SetupSequence(store => store.Take(It.IsAny<CancellationToken>()))
        .Returns(Task.FromResult(job))
        .Returns(() =>
        {
            _cancellationTokenSource.Cancel();
            throw new OperationCanceledException();
        });
    _jobStore.Setup(store => store.TransitionState(
        It.IsAny<InferenceJob>(),
        It.IsAny<InferenceJobStatus>(),
        It.IsAny<CancellationToken>()));

    var files = new string[] { "/file1", "file2", "file3" };
    _fileSystem
        .Setup(fs => fs.Directory.GetFiles(It.IsAny<string>(), It.IsAny<string>(), System.IO.SearchOption.AllDirectories))
        .Returns(files);

    // Every upload attempt fails.
    _payloadsApi
        .Setup(api => api.Upload(It.IsAny<string>(), It.IsAny<string>(), It.IsAny<string>()))
        .Throws(new Exception("error"));
    _instanceCleanupQueue.Setup(queue => queue.QueueInstance(It.IsAny<string>()));

    var sut = new JobSubmissionService(
        _instanceCleanupQueue.Object,
        _logger.Object,
        _serviceScopeFactory.Object,
        _fileSystem.Object,
        _configuration);

    // Act
    await sut.StartAsync(_cancellationTokenSource.Token);
    BlockUntilCanceled(_cancellationTokenSource.Token);

    // Assert
    _logger.VerifyLoggingMessageBeginsWith("Error uploading file:", LogLevel.Error, Times.Exactly(files.Length));
    _logger.VerifyLogging($"Failed to upload {files.Length} files.", LogLevel.Error, Times.Once());
    _jobStore.Verify(
        store => store.TransitionState(job, InferenceJobStatus.Fail, It.IsAny<CancellationToken>()),
        Times.Once());
    _instanceCleanupQueue.Verify(queue => queue.QueueInstance(It.IsAny<string>()), Times.Never());
}
// Verifies that a job in the PayloadUploading state has its three files
// uploaded and queued for cleanup, is transitioned with a Success status once,
// and never triggers a metadata upload.
public async Task UploadsPayloadAndTransitionsState()
{
    // Arrange: one job in the PayloadUploading state, then cancellation.
    var job = new InferenceJob();
    job.JobId = "1";
    job.PayloadId = "1";
    job.State = InferenceJobState.PayloadUploading;
    job.Source = "Source";
    job.SetStoragePath("/job");

    _jobStore
        .SetupSequence(store => store.Take(It.IsAny<CancellationToken>()))
        .Returns(Task.FromResult(job))
        .Returns(() =>
        {
            _cancellationTokenSource.Cancel();
            throw new OperationCanceledException();
        });
    _jobStore.Setup(store => store.TransitionState(
        It.IsAny<InferenceJob>(),
        It.IsAny<InferenceJobStatus>(),
        It.IsAny<CancellationToken>()));

    _fileSystem
        .Setup(fs => fs.Directory.GetFiles(It.IsAny<string>(), It.IsAny<string>(), System.IO.SearchOption.AllDirectories))
        .Returns(new string[] { "/file1", "/file2", "/file3" });
    _payloadsApi.Setup(api => api.Upload(It.IsAny<string>(), It.IsAny<string>(), It.IsAny<string>()));
    _instanceCleanupQueue.Setup(queue => queue.QueueInstance(It.IsAny<string>()));

    var sut = new JobSubmissionService(
        _instanceCleanupQueue.Object,
        _logger.Object,
        _serviceScopeFactory.Object,
        _fileSystem.Object,
        _configuration);

    // Act
    await sut.StartAsync(_cancellationTokenSource.Token);
    BlockUntilCanceled(_cancellationTokenSource.Token);

    // Assert
    _logger.VerifyLogging("Uploading 3 files.", LogLevel.Information, Times.Once());
    _logger.VerifyLogging("Upload to payload completed.", LogLevel.Information, Times.Once());
    _jobStore.Verify(
        store => store.TransitionState(job, InferenceJobStatus.Success, It.IsAny<CancellationToken>()),
        Times.Once());
    _jobsApi.Verify(
        api => api.AddMetadata(It.IsAny<Job>(), It.IsAny<Dictionary<string, string>>()),
        Times.Never());
    _instanceCleanupQueue.Verify(queue => queue.QueueInstance(It.IsAny<string>()), Times.Exactly(3));
}
// Verifies that when every file copy fails with a disk-full IOException, Add()
// logs the "will retry" error three times, logs the "Exceeded maximum retries"
// error once, and rethrows the IOException.
public async Task Add_ShallRetryCopyThenThrow()
{
    // Arrange: a file system that delegates to the shared one, except that
    // File.Copy always throws an IOException carrying the disk-full code.
    var failingFileSystem = new Mock<IFileSystem>();
    failingFileSystem.Setup(fs => fs.Directory).Returns(_fileSystem.Directory);
    failingFileSystem.Setup(fs => fs.Path).Returns(_fileSystem.Path);
    failingFileSystem
        .Setup(fs => fs.File.Create(It.IsAny<string>()))
        .Returns((string path) => _fileSystem.File.Create(path));
    failingFileSystem
        .Setup(fs => fs.File.Copy(It.IsAny<string>(), It.IsAny<string>(), It.IsAny<bool>()))
        .Throws(new IOException("error", ClaraJobRepository.ERROR_DISK_FULL));

    var job = new InferenceJob
    {
        JobId = Guid.NewGuid().ToString(),
        PayloadId = Guid.NewGuid().ToString()
    };
    job.SetStoragePath("/path/to/job");
    job.Instances.Add(InstanceGenerator.GenerateInstance("./aet", "aet", fileSystem: failingFileSystem.Object));
    _configuration.Value.Storage.Temporary = "./aet";

    var cancellationSource = new CancellationTokenSource();
    var storedJobs = new List<InferenceJob>() { job };
    _inferenceJobRepository
        .SetupSequence(repository => repository.AsQueryable())
        .Returns(storedJobs.AsQueryable());

    var sut = new ClaraJobRepository(
        _logger.Object,
        _configuration,
        failingFileSystem.Object,
        _inferenceJobRepository.Object);

    // Act + Assert
    await Assert.ThrowsAsync<IOException>(async () => await sut.Add(job));

    _logger.VerifyLoggingMessageBeginsWith($"Error copying file to {job.JobPayloadsStoragePath}; destination may be out of disk space, will retry in {1000}ms.", LogLevel.Error, Times.Exactly(3));
    _logger.VerifyLoggingMessageBeginsWith($"Error copying file to {job.JobPayloadsStoragePath}; destination may be out of disk space.  Exceeded maximum retries.", LogLevel.Error, Times.Once());
}
// Verifies the happy path: the upload log messages appear once each, the job
// is started once, updated with a Success status once, and three files are
// queued for cleanup.
public async Task ShallCompleteRequest()
{
    // Arrange: one job, then cancellation on the second Take().
    var job = new InferenceJob("/job", new Job { JobId = "JID", PayloadId = "PID" });

    _jobStore
        .SetupSequence(store => store.Take(It.IsAny<CancellationToken>()))
        .Returns(Task.FromResult(job))
        .Returns(() =>
        {
            _cancellationTokenSource.Cancel();
            throw new OperationCanceledException();
        });
    _jobStore.Setup(store => store.Update(It.IsAny<InferenceJob>(), It.IsAny<InferenceJobStatus>()));

    _fileSystem
        .Setup(fs => fs.Directory.GetFiles(It.IsAny<string>(), It.IsAny<string>(), System.IO.SearchOption.AllDirectories))
        .Returns(new string[] { "/file1", "file2", "file3" });
    _payloadsApi.Setup(api => api.Upload(It.IsAny<string>(), It.IsAny<string>(), It.IsAny<IEnumerable<string>>()));
    _jobsApi.Setup(api => api.Start(It.IsAny<Job>()));
    _instanceCleanupQueue.Setup(queue => queue.QueueInstance(It.IsAny<string>()));

    var sut = new JobSubmissionService(
        _instanceCleanupQueue.Object,
        _logger.Object,
        _jobsApi.Object,
        _payloadsApi.Object,
        _jobStore.Object,
        _fileSystem.Object);

    // Act
    await sut.StartAsync(_cancellationTokenSource.Token);
    BlockUntilCanceled(_cancellationTokenSource.Token);

    // Assert
    _logger.VerifyLogging("Uploading 3 files.", LogLevel.Information, Times.Once());
    _logger.VerifyLogging("Upload to payload completed.", LogLevel.Information, Times.Once());
    _jobsApi.Verify(api => api.Start(job), Times.Once());
    _jobStore.Verify(store => store.Update(job, InferenceJobStatus.Success), Times.Once());
    _instanceCleanupQueue.Verify(queue => queue.QueueInstance(It.IsAny<string>()), Times.Exactly(3));
}
/// <summary>
/// Copies each of the job's instances to
/// <c>{JobPayloadsStoragePath}/{SopInstanceUid}.dcm</c>, overwriting existing
/// files. Copies failing with an I/O error whose low 16 HRESULT bits are 0x27
/// or 0x70 are retried, sleeping <c>retryCount × 1000 ms</c> between attempts,
/// with at most three retries in total for the whole call; any other failure is
/// logged and rethrown.
/// </summary>
/// <param name="request">The job whose instances are copied.</param>
private void MakeACopyOfPayload(InferenceJob request)
{
    // Error codes that trigger a retry; the log messages below describe them
    // as the destination possibly being out of disk space.
    const int handleDiskFullCode = 0x27;
    const int diskFullCode = 0x70;

    bool IsDiskFull(IOException error)
    {
        var code = error.HResult & 0xFFFF;
        return code == handleDiskFullCode || code == diskFullCode;
    }

    _logger.Log(LogLevel.Information, $"Copying {request.Instances.Count} instances to {request.JobPayloadsStoragePath}.");

    var pending = new Stack<InstanceStorageInfo>(request.Instances);
    var retrySleepMs = 1000;
    var retryCount = 0;

    while (pending.Count > 0)
    {
        try
        {
            // Peek first; the entry is only removed once the copy has succeeded.
            var instance = pending.Peek();
            var destPath = _fileSystem.Path.Combine(request.JobPayloadsStoragePath, $"{instance.SopInstanceUid}.dcm");
            _fileSystem.File.Copy(instance.InstanceStorageFullPath, destPath, true);
            _logger.Log(LogLevel.Debug, $"Instance {instance.SopInstanceUid} moved to {destPath}");
            pending.Pop();
        }
        catch (IOException ex) when (IsDiskFull(ex))
        {
            retryCount++;
            if (retryCount > 3)
            {
                _logger.Log(LogLevel.Error, ex, $"Error copying file to {request.JobPayloadsStoragePath}; destination may be out of disk space.  Exceeded maximum retries.");
                throw;
            }

            _logger.Log(LogLevel.Error, ex, $"Error copying file to {request.JobPayloadsStoragePath}; destination may be out of disk space, will retry in {retrySleepMs}ms");
            Thread.Sleep(retryCount * retrySleepMs);
        }
        catch (Exception ex)
        {
            _logger.Log(LogLevel.Error, ex, $"Failed to copy file {request.JobPayloadsStoragePath}");
            throw;
        }
    }

    var remaining = pending.Count;
    var summaryLevel = remaining == 0 ? LogLevel.Information : LogLevel.Warning;
    _logger.Log(summaryLevel, $"Copied {request.Instances.Count - remaining} files to {request.JobPayloadsStoragePath}.");
}
/// <summary>
/// Copies each of the job's instances into the DICOM storage directory derived
/// from the job's payload storage path. Copies that fail with a disk-full I/O
/// error are retried, sleeping <c>retryCount × 1000 ms</c> between attempts,
/// with at most three retries in total for the whole call; any other failure is
/// logged and rethrown.
/// </summary>
/// <param name="request">The job whose instances are copied; must not be null.</param>
private void CopyInstances(InferenceJob request)
{
    Guard.Against.Null(request, nameof(request));

    // True when the low 16 bits of the HRESULT match one of the disk-full codes.
    bool IsDiskFull(IOException error)
    {
        var code = error.HResult & 0xFFFF;
        return code == ERROR_HANDLE_DISK_FULL || code == ERROR_DISK_FULL;
    }

    var pending = new Stack<InstanceStorageInfo>(request.Instances);
    var retrySleepMs = 1000;
    var retryCount = 0;

    while (pending.Count > 0)
    {
        try
        {
            // Peek first; the entry is only removed once the copy has succeeded.
            var instance = pending.Peek();
            var dicomDirectory = _fileSystem.Path.GetDicomStoragePath(request.JobPayloadsStoragePath);
            var destinationFile = instance.CopyTo(dicomDirectory);
            _logger.Log(LogLevel.Debug, $"Instance {instance.SopInstanceUid} moved to {destinationFile}");
            pending.Pop();
        }
        catch (IOException ex) when (IsDiskFull(ex))
        {
            retryCount++;
            if (retryCount > 3)
            {
                _logger.Log(LogLevel.Error, ex, $"Error copying file to {request.JobPayloadsStoragePath}; destination may be out of disk space.  Exceeded maximum retries.");
                throw;
            }

            _logger.Log(LogLevel.Error, ex, $"Error copying file to {request.JobPayloadsStoragePath}; destination may be out of disk space, will retry in {retrySleepMs}ms.");
            Thread.Sleep(retryCount * retrySleepMs);
        }
        catch (Exception ex)
        {
            _logger.Log(LogLevel.Error, ex, $"Failed to copy file {request.JobPayloadsStoragePath}.");
            throw;
        }
    }

    var remaining = pending.Count;
    var summaryLevel = remaining == 0 ? LogLevel.Information : LogLevel.Warning;
    _logger.Log(summaryLevel, $"Copied {request.Instances.Count - remaining:D} files to '{request.JobPayloadsStoragePath}'.");
}