/// <summary>
/// Runs 9873 items through an executor whose anonymizer returns its input unchanged
/// (batch size 100, 10 partitions), then asserts the consumer's final offset and batch
/// count, and that the totals reported through <see cref="Progress{T}"/> equal the item count.
/// </summary>
public async Task GivenAPartitionedExecutor_WhenExecute_ResultShouldBeReturnedInOrder()
{
    int itemCount = 9873;
    var testConsumer = new TestFhirDataConsumer(itemCount);
    FhirPartitionedExecutor<string, string> executor = new FhirPartitionedExecutor<string, string>(new TestFhirDataReader(itemCount), testConsumer, (content) => content)
    {
        BatchSize = 100,
        PartitionCount = 10
    };

    int totalCount = 0;
    int consumeCount = 0;
    Progress<BatchAnonymizeProgressDetail> progress = new Progress<BatchAnonymizeProgressDetail>();
    progress.ProgressChanged += (obj, args) =>
    {
        // Handlers may run on different threads, so accumulate atomically.
        Interlocked.Add(ref totalCount, args.ProcessCompleted);
        Interlocked.Add(ref consumeCount, args.ConsumeCompleted);
    };

    await executor.ExecuteAsync(CancellationToken.None, progress: progress);

    Assert.Equal(itemCount, testConsumer.CurrentOffset);
    // 9873 items in batches of 100 -> 99 batches (the last one is partial).
    Assert.Equal(99, testConsumer.BatchCount);

    // Progress report is triggered by event, wait 1 second here in case progress not report.
    // The comment must sit on its own line so the statements below are not swallowed by it.
    await Task.Delay(TimeSpan.FromSeconds(1));
    Assert.Equal(itemCount, totalCount);
    Assert.Equal(itemCount, consumeCount);
}
/// <summary>
/// Runs 9873 items through an executor whose anonymizer returns its input unchanged
/// (batch size 100, 10 partitions), then asserts the consumer's final offset and batch
/// count, and that the totals reported through <see cref="Progress{T}"/> equal the item count.
/// </summary>
public async Task GivenAPartitionedExecutor_WhenExecute_ResultShouldBeReturnedInOrder()
{
    int itemCount = 9873;
    var testConsumer = new TestFhirDataConsumer(itemCount);
    FhirPartitionedExecutor executor = new FhirPartitionedExecutor(new TestFhirDataReader(itemCount), testConsumer, (content) => content)
    {
        BatchSize = 100,
        PartitionCount = 10
    };

    int totalCount = 0;
    int consumeCount = 0;
    Progress<BatchAnonymizeProgressDetail> progress = new Progress<BatchAnonymizeProgressDetail>();
    progress.ProgressChanged += (obj, args) =>
    {
        // Handlers may run on different threads, so accumulate atomically.
        Interlocked.Add(ref totalCount, args.ProcessCompleted);
        Interlocked.Add(ref consumeCount, args.ConsumeCompleted);
    };

    await executor.ExecuteAsync(CancellationToken.None, progress: progress);

    // These values are owned by the consumer and are final once ExecuteAsync has completed.
    Assert.Equal(itemCount, testConsumer.CurrentOffset);
    // 9873 items in batches of 100 -> 99 batches (the last one is partial).
    Assert.Equal(99, testConsumer.BatchCount);

    // Progress<T> invokes ProgressChanged handlers asynchronously (captured
    // SynchronizationContext or the thread pool), so some reports can still be in flight
    // when ExecuteAsync returns. Give them a moment before asserting the totals.
    await Task.Delay(TimeSpan.FromSeconds(1));
    Assert.Equal(itemCount, totalCount);
    Assert.Equal(itemCount, consumeCount);
}
/// <summary>
/// Copies a source blob through the partitioned executor using a pass-through function
/// into a freshly created container, then asserts that the source and target blobs have
/// the same content length. The temporary container is deleted afterwards.
/// </summary>
/// <param name="connectionString">Storage connection string used for both source and target.</param>
/// <param name="containerName">Container holding the source blob.</param>
/// <param name="blobName">Name of the source blob.</param>
public async Task GivenABlobFile_WhenExecutorWithoutAnonymize_DataShouldBeSame(string connectionString, string containerName, string blobName)
{
    // Random names keep this run's output isolated from any other data in the account.
    string targetContainerName = Guid.NewGuid().ToString("N");
    string targetBlobName = Guid.NewGuid().ToString("N");

    BlobContainerClient containerClient = new BlobContainerClient(connectionString, targetContainerName);
    await containerClient.CreateIfNotExistsAsync();

    try
    {
        BlobClient sourceBlobClient = new BlobClient(connectionString, containerName, blobName, DataFactoryCustomActivity.BlobClientOptions.Value);
        BlockBlobClient targetBlobClient = new BlockBlobClient(connectionString, targetContainerName, targetBlobName, DataFactoryCustomActivity.BlobClientOptions.Value);

        using FhirBlobDataStream stream = new FhirBlobDataStream(sourceBlobClient);
        using FhirStreamReader reader = new FhirStreamReader(stream);
        FhirBlobConsumer consumer = new FhirBlobConsumer(targetBlobClient);

        var executor = new FhirPartitionedExecutor<string, string>(reader, consumer, content => content);
        await executor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false);

        // Fetch the properties asynchronously instead of blocking a thread on network I/O.
        var sourceProperties = await sourceBlobClient.GetPropertiesAsync().ConfigureAwait(false);
        var targetProperties = await targetBlobClient.GetPropertiesAsync().ConfigureAwait(false);
        Assert.Equal(sourceProperties.Value.ContentLength, targetProperties.Value.ContentLength);
    }
    finally
    {
        // Always remove the temporary container, even when the assertion fails.
        await containerClient.DeleteIfExistsAsync().ConfigureAwait(false);
    }
}
/// <summary>
/// Anonymizes every blob under <paramref name="inputBlobPrefix"/> that
/// <c>IsInputFileInJsonFormat</c> accepts, handling one blob per executor item.
/// </summary>
/// <param name="inputData">Supplies the storage connection strings and the <c>SkipExistedFile</c> flag.</param>
/// <param name="inputContainer">Container the source blobs are listed from.</param>
/// <param name="outputContainer">Container that receives the anonymized blobs.</param>
/// <param name="inputBlobPrefix">Prefix used to list the input blobs and to derive output names.</param>
/// <param name="outputBlobPrefix">Prefix used to derive output names.</param>
private async Task AnonymizeBlobsInJsonFormat(ActivityInputData inputData, BlobContainerClient inputContainer, BlobContainerClient outputContainer, string inputBlobPrefix, string outputBlobPrefix)
{
    IEnumerable<BlobItem> jsonBlobs = inputContainer
        .GetBlobs(BlobTraits.None, BlobStates.None, inputBlobPrefix, default)
        .Where(candidate => IsInputFileInJsonFormat(candidate.Name));
    var blobReader = new FhirEnumerableReader<BlobItem>(jsonBlobs);

    // The per-blob work writes the destination blob itself; the returned string is a placeholder.
    Func<BlobItem, Task<string>> processBlobAsync = async (blob) =>
    {
        string targetName = GetOutputBlobName(blob.Name, inputBlobPrefix, outputBlobPrefix);
        Console.WriteLine($"[{blob.Name}]:Processing... output to container '{outputContainer.Name}'");

        var sourceClient = new BlobClient(inputData.SourceStorageConnectionString, inputContainer.Name, blob.Name, BlobClientOptions.Value);
        var targetClient = new BlockBlobClient(inputData.DestinationStorageConnectionString, outputContainer.Name, targetName, BlobClientOptions.Value);

        // Existence is queried only when skipping is enabled (short-circuit evaluation).
        if (inputData.SkipExistedFile && await targetClient.ExistsAsync().ConfigureAwait(false))
        {
            Console.WriteLine($"[{blob.Name}]:'{targetName}' already exist. Skip");
            return string.Empty;
        }

        // Remove any previous output before writing the new one.
        await targetClient.DeleteIfExistsAsync().ConfigureAwait(false);
        await AnonymizeSingleBlobInJsonFormatAsync(sourceClient, targetClient, blob.Name, inputBlobPrefix).ConfigureAwait(false);

        return string.Empty;
    };

    // No consumer is passed (null); each item is a single blob (batch size 1).
    var blobExecutor = new FhirPartitionedExecutor<BlobItem, string>(blobReader, null, processBlobAsync)
    {
        PartitionCount = Environment.ProcessorCount * 2,
        BatchSize = 1
    };

    await blobExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false);
}
/// <summary>
/// Configures the test consumer with <c>BreakOnOffset = 2342</c> and asserts that executing
/// over 91873 items surfaces an <see cref="IOException"/>.
/// </summary>
public async Task GivenAPartitionedExecutor_WhenIOExceptionThrowFromConsumer_ExecutionShouldStop()
{
    int totalItems = 91873;

    var breakingConsumer = new TestFhirDataConsumer(totalItems)
    {
        BreakOnOffset = 2342
    };
    var dataReader = new TestFhirDataReader(totalItems);

    // Pass-through anonymizer: this test only exercises the consumer side.
    var partitionedExecutor = new FhirPartitionedExecutor<string, string>(dataReader, breakingConsumer, (content) => content);

    Func<Task> runExecutor = async () => await partitionedExecutor.ExecuteAsync(CancellationToken.None);

    await Assert.ThrowsAsync<IOException>(runExecutor);
}
/// <summary>
/// Replaces the executor's anonymizer with one that always throws
/// <see cref="InvalidOperationException"/> and asserts that <c>ExecuteAsync</c>, called with
/// <c>true</c> as its second argument, surfaces that exception.
/// </summary>
public async Task GivenAPartitionedExecutorBreakOnExceptionEnabled_WhenExceptionThrow_ExecutionShouldStop()
{
    int totalItems = 9873;
    var dataConsumer = new TestFhirDataConsumer(totalItems);

    // Constructed with a pass-through function; the failing one is assigned afterwards.
    var partitionedExecutor = new FhirPartitionedExecutor(new TestFhirDataReader(totalItems), dataConsumer, (content) => content);
    partitionedExecutor.AnonymizerFunction = (content) => throw new InvalidOperationException();

    Func<Task> runExecutor = async () => await partitionedExecutor.ExecuteAsync(CancellationToken.None, true);

    await Assert.ThrowsAsync<InvalidOperationException>(runExecutor);
}
/// <summary>
/// Builds an executor whose anonymizer always throws <see cref="InvalidOperationException"/>
/// and asserts that <c>ExecuteAsync</c> surfaces that exception.
/// </summary>
public async Task GivenAPartitionedExecutorBreakOnExceptionEnabled_WhenExceptionThrow_ExecutionShouldStop()
{
    int totalItems = 9873;
    var dataConsumer = new TestFhirDataConsumer(totalItems);

    // Every item fails as soon as the executor invokes this function.
    Func<string, string> alwaysThrow = content => throw new InvalidOperationException();

    var partitionedExecutor = new FhirPartitionedExecutor<string, string>(
        new TestFhirDataReader(totalItems),
        dataConsumer,
        alwaysThrow);

    Func<Task> runExecutor = async () => await partitionedExecutor.ExecuteAsync(CancellationToken.None);

    await Assert.ThrowsAsync<InvalidOperationException>(runExecutor);
}
/// <summary>
/// Anonymizes every <c>*.ndjson</c> file found in <paramref name="inputFolder"/>, writing
/// each result to the path returned by <c>GetResourceOutputFileName</c>. Files are processed
/// one after another; each file is run through a partitioned executor.
/// </summary>
/// <param name="inputFolder">Folder that is searched for bulk data files.</param>
/// <param name="outputFolder">Folder passed to <c>GetResourceOutputFileName</c> to derive output paths.</param>
/// <param name="isRecursive">
/// When true, sub-directories are searched too and the output file's directory is created
/// before writing.
/// </param>
public void AnonymizeBulkDataFolder(string inputFolder, string outputFolder, bool isRecursive)
{
    var directorySearchOption = isRecursive ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
    var bulkResourceFileList = Directory.EnumerateFiles(inputFolder, "*.ndjson", directorySearchOption).ToList();
    // The list is already materialized, so read its Count property instead of LINQ Count().
    Console.WriteLine($"Find {bulkResourceFileList.Count} bulk data resource files in '{inputFolder}'.");

    foreach (var bulkResourceFileName in bulkResourceFileList)
    {
        Console.WriteLine($"Processing {bulkResourceFileName}");

        var bulkResourceOutputFileName = GetResourceOutputFileName(bulkResourceFileName, inputFolder, outputFolder);
        if (isRecursive)
        {
            var resourceOutputFolder = Path.GetDirectoryName(bulkResourceOutputFileName);
            Directory.CreateDirectory(resourceOutputFolder);
        }

        // Running totals for this file; the progress handler below updates them.
        int completedCount = 0;
        int failedCount = 0;
        int consumeCompletedCount = 0;

        using (FileStream inputStream = new FileStream(bulkResourceFileName, FileMode.Open))
        using (FileStream outputStream = new FileStream(bulkResourceOutputFileName, FileMode.Create))
        {
            using FhirStreamReader reader = new FhirStreamReader(inputStream);
            using FhirStreamConsumer consumer = new FhirStreamConsumer(outputStream);
            Func<string, string> anonymizeFunction = (content) => _engine.AnonymizeJson(content);

            Stopwatch stopWatch = Stopwatch.StartNew();

            FhirPartitionedExecutor executor = new FhirPartitionedExecutor(reader, consumer, anonymizeFunction);
            executor.PartitionCount = Environment.ProcessorCount;

            Progress<BatchAnonymizeProgressDetail> progress = new Progress<BatchAnonymizeProgressDetail>();
            progress.ProgressChanged += (obj, args) =>
            {
                Interlocked.Add(ref completedCount, args.ProcessCompleted);
                Interlocked.Add(ref failedCount, args.ProcessFailed);
                Interlocked.Add(ref consumeCompletedCount, args.ConsumeCompleted);

                // Kept on one line: a regular interpolated string cannot contain a raw newline.
                Console.WriteLine($"[{stopWatch.Elapsed.ToString()}]: {completedCount} Process completed. {failedCount} Process failed. {consumeCompletedCount} Consume completed.");
            };

            // This method is synchronous, so block until the file is done. Failures from the
            // task surface here wrapped in an AggregateException.
            executor.ExecuteAsync(CancellationToken.None, false, progress).Wait();
        }

        Console.WriteLine($"Finished processing '{bulkResourceFileName}'!");
    }
}
/// <summary>
/// Anonymizes every <c>*.json</c> file under <c>_inputFolder</c>. Each file path is one
/// executor item handled by <c>FileAnonymize</c>; no consumer is supplied and result order
/// is not kept. Progress totals are printed to the console as they are reported.
/// </summary>
public async Task AnonymizeAsync()
{
    SearchOption searchScope = SearchOption.TopDirectoryOnly;
    if (_options.IsRecursive)
    {
        searchScope = SearchOption.AllDirectories;
    }

    var jsonFiles = Directory.EnumerateFiles(_inputFolder, "*.json", searchScope).ToList();
    Console.WriteLine($"Find {jsonFiles.Count} json resource files in '{_inputFolder}'.");

    var fileReader = new FhirEnumerableReader<string>(jsonFiles);
    var fileExecutor = new FhirPartitionedExecutor<string, string>(fileReader, null)
    {
        KeepOrder = false,
        BatchSize = 1,
        PartitionCount = Environment.ProcessorCount * 2,
        AnonymizerFunctionAsync = async file =>
        {
            try
            {
                return await FileAnonymize(file).ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                // Record which file failed on stderr, then let the executor see the failure.
                Console.Error.WriteLine($"Error:\nResource: {file}\nErrorMessage: {ex.ToString()}");
                throw;
            }
        }
    };

    Stopwatch timer = Stopwatch.StartNew();

    int succeeded = 0;
    int failed = 0;
    var progressReporter = new Progress<BatchAnonymizeProgressDetail>();
    progressReporter.ProgressChanged += (sender, detail) =>
    {
        // Each report is added to the running totals before printing.
        Interlocked.Add(ref succeeded, detail.ProcessCompleted);
        Interlocked.Add(ref failed, detail.ProcessFailed);

        Console.WriteLine($"[{timer.Elapsed.ToString()}][tid:{detail.CurrentThreadId}]: {succeeded} Process completed. {failed} Process failed.");
    };

    await fileExecutor.ExecuteAsync(cancellationToken: CancellationToken.None, false, progressReporter).ConfigureAwait(false);
}
/// <summary>
/// Starts an execution over 9873 items with an anonymizer that sleeps 10 ms per item,
/// cancels the token after 1000 ms, and asserts that <c>ExecuteAsync</c> throws
/// <see cref="OperationCanceledException"/>.
/// </summary>
public async Task GivenAPartitionedExecutor_WhenCancelled_OperationCancelledExceptionShouldBeThrown()
{
    int itemCount = 9873;
    var testConsumer = new TestFhirDataConsumer(itemCount);
    FhirPartitionedExecutor executor = new FhirPartitionedExecutor(new TestFhirDataReader(itemCount), testConsumer, (content) => content);
    executor.AnonymizerFunction = (content) =>
    {
        // Slow every item down; presumably so the run is still in progress when the
        // cancellation timer fires.
        Thread.Sleep(10);
        return content;
    };

    // CancellationTokenSource owns a timer once CancelAfter is called; dispose it when the
    // test finishes so it is not leaked.
    using CancellationTokenSource source = new CancellationTokenSource();
    source.CancelAfter(1000);

    await Assert.ThrowsAsync<OperationCanceledException>(async () => await executor.ExecuteAsync(source.Token));
}
/// <summary>
/// Runs 29873 items through an executor with <c>KeepOrder = false</c> (batch size 100,
/// 12 partitions) using an async pass-through function that delays roughly one call in a
/// hundred by 200 ms. Asserts the consumer's final offset and batch count, and that the
/// totals reported through <see cref="Progress{T}"/> equal the item count.
/// </summary>
public async Task GivenAPartitionedExecutorNotKeepOrder_WhenExecute_AllResultShouldBeReturned()
{
    int itemCount = 29873;
    var testConsumer = new TestFhirDataConsumer(itemCount)
    {
        CheckOrder = false
    };

    // System.Random is not thread-safe, and this delegate is presumably invoked from several
    // partitions at once. Unsynchronized use can corrupt the generator so that Next() keeps
    // returning 0, which would make every item take the 200 ms branch. Guard it with a lock.
    Random random = new Random();
    object randomLock = new object();
    Func<string, Task<string>> anonymizeFunc = async content =>
    {
        bool shouldDelay;
        lock (randomLock)
        {
            shouldDelay = random.Next() % 100 == 0;
        }

        if (shouldDelay)
        {
            await Task.Delay(TimeSpan.FromMilliseconds(200));
        }

        return content;
    };

    FhirPartitionedExecutor<string, string> executor = new FhirPartitionedExecutor<string, string>(new TestFhirDataReader(itemCount), testConsumer, anonymizeFunc)
    {
        BatchSize = 100,
        PartitionCount = 12,
        KeepOrder = false
    };

    int totalCount = 0;
    int consumeCount = 0;
    Progress<BatchAnonymizeProgressDetail> progress = new Progress<BatchAnonymizeProgressDetail>();
    progress.ProgressChanged += (obj, args) =>
    {
        // Handlers may run on different threads, so accumulate atomically.
        Interlocked.Add(ref totalCount, args.ProcessCompleted);
        Interlocked.Add(ref consumeCount, args.ConsumeCompleted);
    };

    await executor.ExecuteAsync(CancellationToken.None, progress: progress);

    Assert.Equal(itemCount, testConsumer.CurrentOffset);
    // 29873 items in batches of 100 -> 299 batches (the last one is partial).
    Assert.Equal(299, testConsumer.BatchCount);

    // Progress report is triggered by event, wait 1 second here in case progress not report.
    // The comment must sit on its own line so the statements below are not swallowed by it.
    await Task.Delay(TimeSpan.FromSeconds(1));
    Assert.Equal(itemCount, totalCount);
    Assert.Equal(itemCount, consumeCount);
}
/// <summary>
/// Streams one blob through the anonymizer engine and writes the result to
/// <paramref name="outputBlobClient"/>, printing running progress totals to the console.
/// </summary>
/// <param name="inputBlobClient">Client for the blob to read.</param>
/// <param name="outputBlobClient">Client for the blob that receives the anonymized output.</param>
/// <param name="blobName">Blob name; used as file context for the engine and as a log prefix.</param>
/// <param name="inputFolderPrefix">Input prefix passed to the engine's file context.</param>
private async Task AnonymizeSingleBlobInNdJsonFormatAsync(BlobClient inputBlobClient, BlockBlobClient outputBlobClient, string blobName, string inputFolderPrefix)
{
    int processedCount = 0;
    int skippedCount = 0;
    int consumedCount = 0;

    using FhirBlobDataStream inputStream = new FhirBlobDataStream(inputBlobClient);
    // Dispose the reader as well as the stream, matching how FhirStreamReader is used elsewhere.
    using FhirStreamReader reader = new FhirStreamReader(inputStream);
    FhirBlobConsumer consumer = new FhirBlobConsumer(outputBlobClient);

    // One engine is created for the blob and reused for every resource in it.
    var engine = AnonymizerEngine.CreateWithFileContext(_configFile, blobName, inputFolderPrefix);
    Func<string, string> anonymizerFunction = (item) =>
    {
        try
        {
            return engine.AnonymizeJson(item);
        }
        catch (Exception ex)
        {
            // Short notice on stdout, full detail on stderr, then rethrow so the executor
            // sees the failure.
            Console.WriteLine($"[{blobName}]: Anonymize partial failed, you can find detail error message in stderr.txt.");
            Console.Error.WriteLine($"[{blobName}]: Resource: {item}\nErrorMessage: {ex.Message}\n Details: {ex.ToString()}\nStackTrace: {ex.StackTrace}");
            throw;
        }
    };

    Stopwatch stopWatch = Stopwatch.StartNew();
    FhirPartitionedExecutor<string, string> executor = new FhirPartitionedExecutor<string, string>(reader, consumer, anonymizerFunction);
    executor.PartitionCount = Environment.ProcessorCount * 2;

    Progress<BatchAnonymizeProgressDetail> progress = new Progress<BatchAnonymizeProgressDetail>();
    progress.ProgressChanged += (obj, args) =>
    {
        // Each report is added to the running totals before printing.
        Interlocked.Add(ref processedCount, args.ProcessCompleted);
        Interlocked.Add(ref skippedCount, args.ProcessSkipped);
        Interlocked.Add(ref consumedCount, args.ConsumeCompleted);

        Console.WriteLine($"[{stopWatch.Elapsed.ToString()}][tid:{args.CurrentThreadId}]: {processedCount} Completed. {skippedCount} Skipped. {consumedCount} consume completed.");
    };

    await executor.ExecuteAsync(CancellationToken.None, progress).ConfigureAwait(false);
}
/// <summary>
/// Anonymizes every <c>*.ndjson</c> file under <c>_inputFolder</c>, one file at a time.
/// Each file is streamed through a partitioned executor into the path returned by
/// <c>GetResourceOutputFileName</c>, and progress totals are printed to the console.
/// </summary>
public async Task AnonymizeAsync()
{
    SearchOption searchScope = SearchOption.TopDirectoryOnly;
    if (_isRecursive)
    {
        searchScope = SearchOption.AllDirectories;
    }

    var ndjsonFiles = Directory.EnumerateFiles(_inputFolder, "*.ndjson", searchScope).ToList();
    Console.WriteLine($"Find {ndjsonFiles.Count} bulk data resource files in '{_inputFolder}'.");

    foreach (var sourcePath in ndjsonFiles)
    {
        Console.WriteLine($"Processing {sourcePath}");

        var targetPath = GetResourceOutputFileName(sourcePath, _inputFolder, _outputFolder);
        if (_isRecursive)
        {
            // Create the output file's directory before opening the output stream.
            Directory.CreateDirectory(Path.GetDirectoryName(targetPath));
        }

        // Running totals for this file; the progress handler below updates them.
        int succeeded = 0;
        int failed = 0;
        int consumed = 0;

        using (FileStream source = new FileStream(sourcePath, FileMode.Open))
        using (FileStream target = new FileStream(targetPath, FileMode.Create))
        {
            using FhirStreamReader streamReader = new FhirStreamReader(source);
            using FhirStreamConsumer streamConsumer = new FhirStreamConsumer(target);

            // NOTE(review): a new engine and settings object are built for every resource;
            // confirm whether that is required before considering hoisting them.
            Func<string, string> anonymizeResource = (content) =>
            {
                try
                {
                    var resourceEngine = AnonymizerEngine.CreateWithFileContext(_configFilePath, sourcePath, _inputFolder);
                    var resourceSettings = new AnonymizerSettings()
                    {
                        IsPrettyOutput = false,
                        ValidateInput = _validateInput,
                        ValidateOutput = _validateOutput
                    };

                    return resourceEngine.AnonymizeJson(content, resourceSettings);
                }
                catch (Exception ex)
                {
                    // Record the failing resource on stderr, then let the executor see the failure.
                    Console.Error.WriteLine($"Error:\nResource: {content}\nErrorMessage: {ex.ToString()}");
                    throw;
                }
            };

            Stopwatch timer = Stopwatch.StartNew();

            var fileExecutor = new FhirPartitionedExecutor<string, string>(streamReader, streamConsumer, anonymizeResource)
            {
                PartitionCount = Environment.ProcessorCount * 2
            };

            var progressReporter = new Progress<BatchAnonymizeProgressDetail>();
            progressReporter.ProgressChanged += (sender, detail) =>
            {
                Interlocked.Add(ref succeeded, detail.ProcessCompleted);
                Interlocked.Add(ref failed, detail.ProcessFailed);
                Interlocked.Add(ref consumed, detail.ConsumeCompleted);

                Console.WriteLine($"[{timer.Elapsed.ToString()}][tid:{detail.CurrentThreadId}]: {succeeded} Process completed. {failed} Process failed. {consumed} Consume completed.");
            };

            await fileExecutor.ExecuteAsync(CancellationToken.None, false, progressReporter).ConfigureAwait(false);
        }

        // Printed after the reader, consumer and both streams have been disposed.
        Console.WriteLine($"Finished processing '{sourcePath}'!");
    }
}