// Streams the source blob through a FhirPartitionedExecutor that uses an identity
// transform (content => content), writing into a blob inside a freshly created
// container, then checks that source and target report the same content length.
// The temporary container is removed in the finally block whether or not the test passes.
public async Task GivenABlobFile_WhenExecutorWithoutAnonymize_DataShouldBeSame(string connectionString, string containerName, string blobName)
{
    // Random names keep this run's output separate from the source container.
    string targetContainerName = Guid.NewGuid().ToString("N");
    string targetBlobName = Guid.NewGuid().ToString("N");

    BlobContainerClient containerClient = new BlobContainerClient(connectionString, targetContainerName);
    await containerClient.CreateIfNotExistsAsync();

    try
    {
        BlobClient sourceBlobClient = new BlobClient(connectionString, containerName, blobName, DataFactoryCustomActivity.BlobClientOptions.Value);
        BlockBlobClient targetBlobClient = new BlockBlobClient(connectionString, targetContainerName, targetBlobName, DataFactoryCustomActivity.BlobClientOptions.Value);

        using FhirBlobDataStream stream = new FhirBlobDataStream(sourceBlobClient);
        using FhirStreamReader reader = new FhirStreamReader(stream);
        FhirBlobConsumer consumer = new FhirBlobConsumer(targetBlobClient);

        var executor = new FhirPartitionedExecutor<string, string>(reader, consumer, content => content);
        await executor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false);

        // Fetch the blob properties asynchronously rather than blocking on the
        // synchronous GetProperties() network call inside an async method.
        var sourceProperties = await sourceBlobClient.GetPropertiesAsync().ConfigureAwait(false);
        var targetProperties = await targetBlobClient.GetPropertiesAsync().ConfigureAwait(false);

        Assert.Equal(sourceProperties.Value.ContentLength, targetProperties.Value.ContentLength);
    }
    finally
    {
        await containerClient.DeleteIfExistsAsync().ConfigureAwait(false);
    }
}
/// <summary>
/// Runs the anonymizer over every "*.ndjson" file found in <paramref name="inputFolder"/>,
/// writing each result to the path returned by GetResourceOutputFileName.
/// Progress is reported on the console while each file is processed.
/// </summary>
/// <param name="inputFolder">Folder that is searched for "*.ndjson" files.</param>
/// <param name="outputFolder">Folder passed to GetResourceOutputFileName to compute each output path.</param>
/// <param name="isRecursive">
/// When true, sub-directories are searched as well and the output file's directory is created before writing.
/// </param>
public void AnonymizeBulkDataFolder(string inputFolder, string outputFolder, bool isRecursive)
{
    var directorySearchOption = isRecursive ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
    var bulkResourceFileList = Directory.EnumerateFiles(inputFolder, "*.ndjson", directorySearchOption).ToList();
    Console.WriteLine($"Find {bulkResourceFileList.Count} bulk data resource files in '{inputFolder}'.");

    foreach (var bulkResourceFileName in bulkResourceFileList)
    {
        Console.WriteLine($"Processing {bulkResourceFileName}");

        var bulkResourceOutputFileName = GetResourceOutputFileName(bulkResourceFileName, inputFolder, outputFolder);
        if (isRecursive)
        {
            var resourceOutputFolder = Path.GetDirectoryName(bulkResourceOutputFileName);
            Directory.CreateDirectory(resourceOutputFolder);
        }

        // Running totals; the progress handler updates them with Interlocked.Add.
        int completedCount = 0;
        int failedCount = 0;
        int consumeCompletedCount = 0;

        // The input is only read, so request read access explicitly: the two-argument
        // FileStream constructor asks for read/write access and fails on read-only files.
        using (FileStream inputStream = new FileStream(bulkResourceFileName, FileMode.Open, FileAccess.Read))
        using (FileStream outputStream = new FileStream(bulkResourceOutputFileName, FileMode.Create))
        {
            using FhirStreamReader reader = new FhirStreamReader(inputStream);
            using FhirStreamConsumer consumer = new FhirStreamConsumer(outputStream);
            Func<string, string> anonymizeFunction = (content) => _engine.AnonymizeJson(content);

            Stopwatch stopWatch = Stopwatch.StartNew();

            FhirPartitionedExecutor executor = new FhirPartitionedExecutor(reader, consumer, anonymizeFunction);
            executor.PartitionCount = Environment.ProcessorCount;

            Progress<BatchAnonymizeProgressDetail> progress = new Progress<BatchAnonymizeProgressDetail>();
            progress.ProgressChanged += (obj, args) =>
            {
                Interlocked.Add(ref completedCount, args.ProcessCompleted);
                Interlocked.Add(ref failedCount, args.ProcessFailed);
                Interlocked.Add(ref consumeCompletedCount, args.ConsumeCompleted);

                Console.WriteLine($"[{stopWatch.Elapsed.ToString()}]: {completedCount} Process completed. {failedCount} Process failed. {consumeCompletedCount} Consume completed.");
            };

            // This method is synchronous by contract, so it blocks until the executor finishes.
            // Wait() is kept (failures surface as AggregateException) so callers see no change.
            executor.ExecuteAsync(CancellationToken.None, false, progress).Wait();
        }

        Console.WriteLine($"Finished processing '{bulkResourceFileName}'!");
    }
}
// Writes three lines ("abc", "bcd" and an empty line) into an in-memory stream and
// verifies that FhirStreamReader.NextAsync returns them in order, then returns null.
public async Task GivenAFhirStreamReader_WhenLoadData_ShouldLoadAllDataFromStream()
{
    string[] expectedLines = { "abc", "bcd", "" };

    using MemoryStream buffer = new MemoryStream();
    using StreamWriter lineWriter = new StreamWriter(buffer);
    foreach (string line in expectedLines)
    {
        await lineWriter.WriteLineAsync(line);
    }

    // Push the buffered text into the stream and rewind so the reader starts at the beginning.
    lineWriter.Flush();
    buffer.Position = 0;

    using FhirStreamReader streamReader = new FhirStreamReader(buffer);
    foreach (string expected in expectedLines)
    {
        Assert.Equal(expected, await streamReader.NextAsync());
    }

    // Once every line has been consumed the reader yields null.
    Assert.Null(await streamReader.NextAsync());
}
/// <summary>
/// Reads the input blob through a <c>FhirStreamReader</c>, anonymizes each item with an engine
/// created from <c>_configFile</c>, and hands the results to a <c>FhirBlobConsumer</c> over the output blob.
/// Progress is written to the console; failures of individual items are logged and rethrown.
/// </summary>
/// <param name="inputBlobClient">Client for the blob to read.</param>
/// <param name="outputBlobClient">Client for the block blob that receives the output.</param>
/// <param name="blobName">Blob name; used as a log prefix and passed to the engine factory.</param>
/// <param name="inputFolderPrefix">Folder prefix passed to the engine factory.</param>
private async Task AnonymizeSingleBlobInNdJsonFormatAsync(BlobClient inputBlobClient, BlockBlobClient outputBlobClient, string blobName, string inputFolderPrefix)
{
    // Running totals; the progress handler updates them with Interlocked.Add.
    var processedCount = 0;
    int skippedCount = 0;
    var consumedCount = 0;

    using FhirBlobDataStream inputStream = new FhirBlobDataStream(inputBlobClient);
    // Dispose the reader as well; previously only the underlying stream was disposed.
    using FhirStreamReader reader = new FhirStreamReader(inputStream);
    FhirBlobConsumer consumer = new FhirBlobConsumer(outputBlobClient);

    var engine = AnonymizerEngine.CreateWithFileContext(_configFile, blobName, inputFolderPrefix);
    Func<string, string> anonymizerFunction = (item) =>
    {
        try
        {
            return engine.AnonymizeJson(item);
        }
        catch (Exception ex)
        {
            // Short notice on stdout, full details on stderr, then let the executor see the failure.
            Console.WriteLine($"[{blobName}]: Anonymize partial failed, you can find detail error message in stderr.txt.");
            Console.Error.WriteLine($"[{blobName}]: Resource: {item}\nErrorMessage: {ex.Message}\n Details: {ex.ToString()}\nStackTrace: {ex.StackTrace}");
            throw;
        }
    };

    Stopwatch stopWatch = Stopwatch.StartNew();

    FhirPartitionedExecutor<string, string> executor = new FhirPartitionedExecutor<string, string>(reader, consumer, anonymizerFunction);
    executor.PartitionCount = Environment.ProcessorCount * 2;

    Progress<BatchAnonymizeProgressDetail> progress = new Progress<BatchAnonymizeProgressDetail>();
    progress.ProgressChanged += (obj, args) =>
    {
        Interlocked.Add(ref processedCount, args.ProcessCompleted);
        Interlocked.Add(ref skippedCount, args.ProcessSkipped);
        Interlocked.Add(ref consumedCount, args.ConsumeCompleted);

        Console.WriteLine($"[{stopWatch.Elapsed.ToString()}][tid:{args.CurrentThreadId}]: {processedCount} Completed. {skippedCount} Skipped. {consumedCount} consume completed.");
    };

    await executor.ExecuteAsync(CancellationToken.None, progress).ConfigureAwait(false);
}
/// <summary>
/// Anonymizes every "*.ndjson" file found in <c>_inputFolder</c> (including sub-directories when
/// <c>_isRecursive</c> is set), writing each result to the path returned by GetResourceOutputFileName.
/// Progress is reported on the console; a failure on a single resource is logged to stderr and rethrown.
/// </summary>
/// <returns>A task that completes once all discovered files have been processed.</returns>
public async Task AnonymizeAsync()
{
    var directorySearchOption = _isRecursive ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
    var bulkResourceFileList = Directory.EnumerateFiles(_inputFolder, "*.ndjson", directorySearchOption).ToList();
    Console.WriteLine($"Find {bulkResourceFileList.Count} bulk data resource files in '{_inputFolder}'.");

    foreach (var bulkResourceFileName in bulkResourceFileList)
    {
        Console.WriteLine($"Processing {bulkResourceFileName}");

        var bulkResourceOutputFileName = GetResourceOutputFileName(bulkResourceFileName, _inputFolder, _outputFolder);
        if (_isRecursive)
        {
            var resourceOutputFolder = Path.GetDirectoryName(bulkResourceOutputFileName);
            Directory.CreateDirectory(resourceOutputFolder);
        }

        // Running totals; the progress handler updates them with Interlocked.Add.
        int completedCount = 0;
        int failedCount = 0;
        int consumeCompletedCount = 0;

        // The input is only read, so request read access explicitly: the two-argument
        // FileStream constructor asks for read/write access and fails on read-only files.
        using (FileStream inputStream = new FileStream(bulkResourceFileName, FileMode.Open, FileAccess.Read))
        using (FileStream outputStream = new FileStream(bulkResourceOutputFileName, FileMode.Create))
        {
            using FhirStreamReader reader = new FhirStreamReader(inputStream);
            using FhirStreamConsumer consumer = new FhirStreamConsumer(outputStream);

            Func<string, string> anonymizeFunction = (content) =>
            {
                try
                {
                    // NOTE(review): an engine is created for every resource; hoisting it out of the
                    // lambda would avoid repeated work but requires confirming the engine is thread-safe.
                    var engine = AnonymizerEngine.CreateWithFileContext(_configFilePath, bulkResourceFileName, _inputFolder);
                    var settings = new AnonymizerSettings()
                    {
                        IsPrettyOutput = false,
                        ValidateInput = _validateInput,
                        ValidateOutput = _validateOutput
                    };
                    return engine.AnonymizeJson(content, settings);
                }
                catch (Exception ex)
                {
                    Console.Error.WriteLine($"Error:\nResource: {content}\nErrorMessage: {ex.ToString()}");
                    throw;
                }
            };

            Stopwatch stopWatch = Stopwatch.StartNew();

            FhirPartitionedExecutor<string, string> executor = new FhirPartitionedExecutor<string, string>(reader, consumer, anonymizeFunction);
            executor.PartitionCount = Environment.ProcessorCount * 2;

            Progress<BatchAnonymizeProgressDetail> progress = new Progress<BatchAnonymizeProgressDetail>();
            progress.ProgressChanged += (obj, args) =>
            {
                Interlocked.Add(ref completedCount, args.ProcessCompleted);
                Interlocked.Add(ref failedCount, args.ProcessFailed);
                Interlocked.Add(ref consumeCompletedCount, args.ConsumeCompleted);

                Console.WriteLine($"[{stopWatch.Elapsed.ToString()}][tid:{args.CurrentThreadId}]: {completedCount} Process completed. {failedCount} Process failed. {consumeCompletedCount} Consume completed.");
            };

            await executor.ExecuteAsync(CancellationToken.None, false, progress).ConfigureAwait(false);
        }

        Console.WriteLine($"Finished processing '{bulkResourceFileName}'!");
    }
}