/// <summary>
/// Runs a generic partitioned executor with a pass-through transform over 9873 test items
/// (batch size 100, 10 partitions) and checks the consumer's final offset, its batch count
/// and the totals accumulated from the progress reports.
/// </summary>
public async Task GivenAPartitionedExecutor_WhenExecute_ResultShouldBeReturnedInOrder()
        {
            int expectedItems = 9873;
            var consumer      = new TestFhirDataConsumer(expectedItems);
            var reader        = new TestFhirDataReader(expectedItems);

            var executor = new FhirPartitionedExecutor<string, string>(reader, consumer, item => item)
            {
                BatchSize      = 100,
                PartitionCount = 10
            };

            // Running totals fed by the progress callback below.
            int processedTotal = 0;
            int consumedTotal  = 0;

            var progress = new Progress<BatchAnonymizeProgressDetail>();
            progress.ProgressChanged += (sender, detail) =>
            {
                Interlocked.Add(ref processedTotal, detail.ProcessCompleted);
                Interlocked.Add(ref consumedTotal, detail.ConsumeCompleted);
            };

            await executor.ExecuteAsync(CancellationToken.None, progress : progress);

            Assert.Equal(expectedItems, consumer.CurrentOffset);
            Assert.Equal(99, consumer.BatchCount);

            // Progress reports arrive through an event, so give pending reports one second to land
            // before checking the accumulated totals.
            await Task.Delay(TimeSpan.FromSeconds(1));

            Assert.Equal(expectedItems, processedTotal);
            Assert.Equal(expectedItems, consumedTotal);
        }
        /// <summary>
        /// Runs a partitioned executor with a pass-through transform over 9873 test items
        /// (batch size 100, 10 partitions) and checks the consumer's final offset, its batch count
        /// and the totals accumulated from the progress reports.
        /// </summary>
        public async Task GivenAPartitionedExecutor_WhenExecute_ResultShouldBeReturnedInOrder()
        {
            int itemCount    = 9873;
            var testConsumer = new TestFhirDataConsumer(itemCount);
            FhirPartitionedExecutor executor = new FhirPartitionedExecutor(new TestFhirDataReader(itemCount), testConsumer, (content) => content)
            {
                BatchSize      = 100,
                PartitionCount = 10
            };

            int totalCount   = 0;
            int consumeCount = 0;
            Progress<BatchAnonymizeProgressDetail> progress = new Progress<BatchAnonymizeProgressDetail>();

            progress.ProgressChanged += (obj, args) =>
            {
                Interlocked.Add(ref totalCount, args.ProcessCompleted);
                Interlocked.Add(ref consumeCount, args.ConsumeCompleted);
            };
            await executor.ExecuteAsync(CancellationToken.None, progress : progress);

            // The consumer state is final once ExecuteAsync has completed, so check it right away.
            Assert.Equal(itemCount, testConsumer.CurrentOffset);
            Assert.Equal(99, testConsumer.BatchCount);

            // Progress<T> raises ProgressChanged asynchronously (on the captured SynchronizationContext,
            // or on the thread pool when there is none), so the last reports may still be in flight when
            // ExecuteAsync returns. Wait briefly before asserting on the accumulated totals.
            await Task.Delay(TimeSpan.FromSeconds(1));

            Assert.Equal(itemCount, totalCount);
            Assert.Equal(itemCount, consumeCount);
        }
        /// <summary>
        /// Pipes a source blob through a pass-through executor into a blob inside a freshly created
        /// container, then asserts that source and target report the same content length.
        /// The temporary container is removed afterwards, whether or not the assertion passed.
        /// </summary>
        /// <param name="connectionString">Storage connection string used for both source and target.</param>
        /// <param name="containerName">Container holding the source blob.</param>
        /// <param name="blobName">Name of the source blob.</param>
        public async Task GivenABlobFile_WhenExecutorWithoutAnonymize_DataShouldBeSame(string connectionString, string containerName, string blobName)
        {
            // Random names for the scratch container and the blob written into it.
            string scratchContainerName = Guid.NewGuid().ToString("N");
            string scratchBlobName      = Guid.NewGuid().ToString("N");

            var scratchContainer = new BlobContainerClient(connectionString, scratchContainerName);
            await scratchContainer.CreateIfNotExistsAsync();

            try
            {
                var source = new BlobClient(connectionString, containerName, blobName, DataFactoryCustomActivity.BlobClientOptions.Value);
                var target = new BlockBlobClient(connectionString, scratchContainerName, scratchBlobName, DataFactoryCustomActivity.BlobClientOptions.Value);

                using var sourceStream = new FhirBlobDataStream(source);
                using var sourceReader = new FhirStreamReader(sourceStream);
                var targetConsumer = new FhirBlobConsumer(target);

                FhirPartitionedExecutor<string, string> executor =
                    new FhirPartitionedExecutor<string, string>(sourceReader, targetConsumer, content => content);
                await executor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false);

                var expectedLength = source.GetProperties().Value.ContentLength;
                var actualLength   = target.GetProperties().Value.ContentLength;
                Assert.Equal(expectedLength, actualLength);
            }
            finally
            {
                await scratchContainer.DeleteIfExistsAsync().ConfigureAwait(false);
            }
        }
        /// <summary>
        /// Lists the blobs under <paramref name="inputBlobPrefix"/> whose names pass
        /// <c>IsInputFileInJsonFormat</c> and anonymizes them one blob per batch through a
        /// partitioned executor that has no consumer.
        /// </summary>
        /// <param name="inputData">Supplies the storage connection strings and the skip-existing flag.</param>
        /// <param name="inputContainer">Container that is listed for input blobs.</param>
        /// <param name="outputContainer">Container that receives the output blobs.</param>
        /// <param name="inputBlobPrefix">Prefix used to list input blobs and to derive output names.</param>
        /// <param name="outputBlobPrefix">Prefix used to derive output blob names.</param>
        private async Task AnonymizeBlobsInJsonFormat(ActivityInputData inputData, BlobContainerClient inputContainer, BlobContainerClient outputContainer, string inputBlobPrefix, string outputBlobPrefix)
        {
            // Per-blob work item. The returned string is always empty.
            Func<BlobItem, Task<string>> processBlobAsync = async sourceBlob =>
            {
                string targetBlobName = GetOutputBlobName(sourceBlob.Name, inputBlobPrefix, outputBlobPrefix);
                Console.WriteLine($"[{sourceBlob.Name}]:Processing... output to container '{outputContainer.Name}'");

                var sourceClient = new BlobClient(inputData.SourceStorageConnectionString, inputContainer.Name, sourceBlob.Name, BlobClientOptions.Value);
                var targetClient = new BlockBlobClient(inputData.DestinationStorageConnectionString, outputContainer.Name, targetBlobName, BlobClientOptions.Value);

                // The existence check is only issued when skipping is enabled.
                bool skip = inputData.SkipExistedFile && await targetClient.ExistsAsync().ConfigureAwait(false);
                if (!skip)
                {
                    // Remove any previous output before writing the new one.
                    await targetClient.DeleteIfExistsAsync().ConfigureAwait(false);
                    await AnonymizeSingleBlobInJsonFormatAsync(sourceClient, targetClient, sourceBlob.Name, inputBlobPrefix).ConfigureAwait(false);
                }
                else
                {
                    Console.WriteLine($"[{sourceBlob.Name}]:'{targetBlobName}' already exist. Skip");
                }

                return string.Empty;
            };

            IEnumerable<BlobItem> jsonBlobs = inputContainer
                .GetBlobs(BlobTraits.None, BlobStates.None, inputBlobPrefix, default)
                .Where(candidate => IsInputFileInJsonFormat(candidate.Name));
            var blobReader = new FhirEnumerableReader<BlobItem>(jsonBlobs);

            var executor = new FhirPartitionedExecutor<BlobItem, string>(blobReader, null, processBlobAsync)
            {
                PartitionCount = Environment.ProcessorCount * 2,
                BatchSize      = 1
            };

            await executor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false);
        }
        /// <summary>
        /// Expects <c>ExecuteAsync</c> to surface an <see cref="IOException"/> when the test consumer
        /// is configured with <c>BreakOnOffset = 2342</c> (presumably the offset at which the test
        /// consumer fails; confirm against <c>TestFhirDataConsumer</c>).
        /// </summary>
        public async Task GivenAPartitionedExecutor_WhenIOExceptionThrowFromConsumer_ExecutionShouldStop()
        {
            int totalItems = 91873;

            var breakingConsumer = new TestFhirDataConsumer(totalItems);
            breakingConsumer.BreakOnOffset = 2342;

            var source   = new TestFhirDataReader(totalItems);
            var executor = new FhirPartitionedExecutor<string, string>(source, breakingConsumer, item => item);

            await Assert.ThrowsAsync<IOException>(
                async () => await executor.ExecuteAsync(CancellationToken.None));
        }
        /// <summary>
        /// Replaces the executor's anonymizer function with one that always throws and expects
        /// <c>ExecuteAsync</c>, called with <c>true</c> as its second argument, to surface the
        /// <see cref="InvalidOperationException"/>.
        /// </summary>
        public async Task GivenAPartitionedExecutorBreakOnExceptionEnabled_WhenExceptionThrow_ExecutionShouldStop()
        {
            int totalItems = 9873;
            var consumer   = new TestFhirDataConsumer(totalItems);

            // Constructed with a pass-through transform, which is then replaced by the throwing one.
            var executor = new FhirPartitionedExecutor(new TestFhirDataReader(totalItems), consumer, item => item)
            {
                AnonymizerFunction = item =>
                {
                    throw new InvalidOperationException();
                }
            };

            await Assert.ThrowsAsync<InvalidOperationException>(
                async () => await executor.ExecuteAsync(CancellationToken.None, true));
        }
Esempio n. 7
0
        /// <summary>
        /// Builds a generic executor whose transform always throws and expects
        /// <c>ExecuteAsync</c> to surface the <see cref="InvalidOperationException"/>.
        /// </summary>
        public async Task GivenAPartitionedExecutorBreakOnExceptionEnabled_WhenExceptionThrow_ExecutionShouldStop()
        {
            int totalItems = 9873;
            var consumer   = new TestFhirDataConsumer(totalItems);

            // Explicitly typed so that the synchronous constructor overload is selected.
            Func<string, string> alwaysThrows = content => throw new InvalidOperationException();

            var reader   = new TestFhirDataReader(totalItems);
            var executor = new FhirPartitionedExecutor<string, string>(reader, consumer, alwaysThrows);

            await Assert.ThrowsAsync<InvalidOperationException>(
                async () => await executor.ExecuteAsync(CancellationToken.None));
        }
        /// <summary>
        /// Anonymizes every <c>*.ndjson</c> file found in <paramref name="inputFolder"/>, one file at a
        /// time, writing each result to the path returned by <c>GetResourceOutputFileName</c>.
        /// Blocks the calling thread until each file has been processed.
        /// </summary>
        /// <param name="inputFolder">Folder that is searched for bulk data files.</param>
        /// <param name="outputFolder">Folder passed to <c>GetResourceOutputFileName</c> to derive output paths.</param>
        /// <param name="isRecursive">When true, sub-folders are searched and output folders are created as needed.</param>
        public void AnonymizeBulkDataFolder(string inputFolder, string outputFolder, bool isRecursive)
        {
            var searchScope = isRecursive ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
            var ndjsonFiles = Directory.EnumerateFiles(inputFolder, "*.ndjson", searchScope).ToList();

            Console.WriteLine($"Find {ndjsonFiles.Count} bulk data resource files in '{inputFolder}'.");

            foreach (var sourcePath in ndjsonFiles)
            {
                Console.WriteLine($"Processing {sourcePath}");

                var targetPath = GetResourceOutputFileName(sourcePath, inputFolder, outputFolder);
                if (isRecursive)
                {
                    // Output may land in a sub-folder that does not exist yet.
                    Directory.CreateDirectory(Path.GetDirectoryName(targetPath));
                }

                // Running totals for this file, updated from the progress callback.
                int processed = 0;
                int failed    = 0;
                int consumed  = 0;

                using (var inputStream = new FileStream(sourcePath, FileMode.Open))
                using (var outputStream = new FileStream(targetPath, FileMode.Create))
                {
                    using var reader   = new FhirStreamReader(inputStream);
                    using var consumer = new FhirStreamConsumer(outputStream);
                    Func<string, string> anonymize = content => _engine.AnonymizeJson(content);

                    var timer = Stopwatch.StartNew();

                    var executor = new FhirPartitionedExecutor(reader, consumer, anonymize)
                    {
                        PartitionCount = Environment.ProcessorCount
                    };

                    var progress = new Progress<BatchAnonymizeProgressDetail>();
                    progress.ProgressChanged += (sender, detail) =>
                    {
                        Interlocked.Add(ref processed, detail.ProcessCompleted);
                        Interlocked.Add(ref failed, detail.ProcessFailed);
                        Interlocked.Add(ref consumed, detail.ConsumeCompleted);
                        Console.WriteLine($"[{timer.Elapsed.ToString()}]: {processed} Process completed. {failed} Process failed. {consumed} Consume completed.");
                    };

                    // Synchronous entry point: wait for the asynchronous run to finish before closing the streams.
                    executor.ExecuteAsync(CancellationToken.None, false, progress).Wait();
                }

                Console.WriteLine($"Finished processing '{sourcePath}'!");
            }
        }
        /// <summary>
        /// Anonymizes every <c>*.json</c> file found in the input folder by passing each file path to
        /// <c>FileAnonymize</c> through an unordered, one-item-per-batch partitioned executor.
        /// Failures are written to standard error and rethrown; progress is printed to the console.
        /// </summary>
        public async Task AnonymizeAsync()
        {
            var searchScope = _options.IsRecursive ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
            var jsonFiles   = Directory.EnumerateFiles(_inputFolder, "*.json", searchScope).ToList();

            Console.WriteLine($"Find {jsonFiles.Count} json resource files in '{_inputFolder}'.");

            var fileReader = new FhirEnumerableReader<string>(jsonFiles);
            var executor   = new FhirPartitionedExecutor<string, string>(fileReader, null)
            {
                KeepOrder               = false,
                BatchSize               = 1,
                PartitionCount          = Environment.ProcessorCount * 2,
                AnonymizerFunctionAsync = async file =>
                {
                    try
                    {
                        return await FileAnonymize(file).ConfigureAwait(false);
                    }
                    catch (Exception ex)
                    {
                        // Record which file failed, then let the executor see the failure.
                        Console.Error.WriteLine($"Error:\nResource: {file}\nErrorMessage: {ex.ToString()}");
                        throw;
                    }
                }
            };

            var timer = Stopwatch.StartNew();

            // Running totals updated from the progress callback.
            int processed = 0;
            int failed    = 0;

            var progress = new Progress<BatchAnonymizeProgressDetail>();
            progress.ProgressChanged += (sender, detail) =>
            {
                Interlocked.Add(ref processed, detail.ProcessCompleted);
                Interlocked.Add(ref failed, detail.ProcessFailed);

                Console.WriteLine($"[{timer.Elapsed.ToString()}][tid:{detail.CurrentThreadId}]: {processed} Process completed. {failed} Process failed.");
            };

            await executor.ExecuteAsync(cancellationToken : CancellationToken.None, false, progress).ConfigureAwait(false);
        }
        /// <summary>
        /// Runs an executor whose anonymizer function sleeps 10 ms per item, requests cancellation
        /// after one second, and expects <c>ExecuteAsync</c> to throw
        /// <see cref="OperationCanceledException"/>.
        /// </summary>
        public async Task GivenAPartitionedExecutor_WhenCancelled_OperationCancelledExceptionShouldBeThrown()
        {
            int itemCount    = 9873;
            var testConsumer = new TestFhirDataConsumer(itemCount);
            FhirPartitionedExecutor executor = new FhirPartitionedExecutor(new TestFhirDataReader(itemCount), testConsumer, (content) => content);

            executor.AnonymizerFunction = (content) =>
            {
                // Slow every item down; the intent is presumably for the run to still be in progress when the token fires.
                Thread.Sleep(10);
                return content;
            };

            // CancellationTokenSource owns a timer once CancelAfter is called; dispose it when the test is done.
            using (CancellationTokenSource source = new CancellationTokenSource())
            {
                source.CancelAfter(1000);
                await Assert.ThrowsAsync<OperationCanceledException>(async () => await executor.ExecuteAsync(source.Token));
            }
        }
        /// <summary>
        /// Runs an unordered executor (batch size 100, 12 partitions) over 29873 test items with an
        /// asynchronous pass-through transform that randomly delays some items by 200 ms, then checks
        /// the consumer's final offset, its batch count and the totals from the progress reports.
        /// </summary>
        public async Task GivenAPartitionedExecutorNotKeepOrder_WhenExecute_AllResultShouldBeReturned()
        {
            int itemCount    = 29873;
            var testConsumer = new TestFhirDataConsumer(itemCount)
            {
                CheckOrder = false
            };

            Random random     = new Random();
            object randomGate = new object();
            Func<string, Task<string>> anonymizeFunc = async content =>
            {
                // System.Random is not thread-safe, and with 12 partitions this delegate may be invoked
                // concurrently. Unsynchronized access can corrupt its state so that Next() keeps
                // returning 0, which would send every item down the 200 ms delay path.
                int roll;
                lock (randomGate)
                {
                    roll = random.Next();
                }

                if (roll % 100 == 0)
                {
                    await Task.Delay(TimeSpan.FromMilliseconds(200));
                }

                return content;
            };
            FhirPartitionedExecutor<string, string> executor = new FhirPartitionedExecutor<string, string>(new TestFhirDataReader(itemCount), testConsumer, anonymizeFunc)
            {
                BatchSize      = 100,
                PartitionCount = 12,
                KeepOrder      = false
            };

            int totalCount   = 0;
            int consumeCount = 0;
            Progress<BatchAnonymizeProgressDetail> progress = new Progress<BatchAnonymizeProgressDetail>();

            progress.ProgressChanged += (obj, args) =>
            {
                Interlocked.Add(ref totalCount, args.ProcessCompleted);
                Interlocked.Add(ref consumeCount, args.ConsumeCompleted);
            };
            await executor.ExecuteAsync(CancellationToken.None, progress : progress);

            Assert.Equal(itemCount, testConsumer.CurrentOffset);
            Assert.Equal(299, testConsumer.BatchCount);

            // Progress reports arrive through an event, so wait one second in case some are still pending.
            await Task.Delay(TimeSpan.FromSeconds(1));

            Assert.Equal(itemCount, totalCount);
            Assert.Equal(itemCount, consumeCount);
        }
        /// <summary>
        /// Reads the content of <paramref name="inputBlobClient"/> through a <c>FhirStreamReader</c>,
        /// anonymizes each item with an engine created for <paramref name="blobName"/>, and hands the
        /// results to a <c>FhirBlobConsumer</c> built over <paramref name="outputBlobClient"/>.
        /// Progress is printed to the console; item failures are logged to standard error and rethrown.
        /// </summary>
        /// <param name="inputBlobClient">Client for the blob to read.</param>
        /// <param name="outputBlobClient">Client for the blob to write.</param>
        /// <param name="blobName">Blob name used for the engine's file context and in log messages.</param>
        /// <param name="inputFolderPrefix">Input prefix passed to the engine's file context.</param>
        private async Task AnonymizeSingleBlobInNdJsonFormatAsync(BlobClient inputBlobClient, BlockBlobClient outputBlobClient, string blobName, string inputFolderPrefix)
        {
            // Running totals updated from the progress callback.
            int processedCount = 0;
            int skippedCount   = 0;
            int consumedCount  = 0;

            using FhirBlobDataStream inputStream = new FhirBlobDataStream(inputBlobClient);
            // The reader is disposable as well; release it together with the stream.
            using FhirStreamReader reader = new FhirStreamReader(inputStream);
            FhirBlobConsumer consumer = new FhirBlobConsumer(outputBlobClient);
            var engine = AnonymizerEngine.CreateWithFileContext(_configFile, blobName, inputFolderPrefix);
            Func<string, string> anonymizerFunction = (item) =>
            {
                try
                {
                    return engine.AnonymizeJson(item);
                }
                catch (Exception ex)
                {
                    // Short notice on stdout, full details on stderr, then propagate to the executor.
                    Console.WriteLine($"[{blobName}]: Anonymize partial failed, you can find detail error message in stderr.txt.");
                    Console.Error.WriteLine($"[{blobName}]: Resource: {item}\nErrorMessage: {ex.Message}\n Details: {ex.ToString()}\nStackTrace: {ex.StackTrace}");
                    throw;
                }
            };

            Stopwatch stopWatch = Stopwatch.StartNew();
            FhirPartitionedExecutor<string, string> executor = new FhirPartitionedExecutor<string, string>(reader, consumer, anonymizerFunction);

            executor.PartitionCount = Environment.ProcessorCount * 2;
            Progress<BatchAnonymizeProgressDetail> progress = new Progress<BatchAnonymizeProgressDetail>();

            progress.ProgressChanged += (obj, args) =>
            {
                Interlocked.Add(ref processedCount, args.ProcessCompleted);
                Interlocked.Add(ref skippedCount, args.ProcessSkipped);
                Interlocked.Add(ref consumedCount, args.ConsumeCompleted);

                Console.WriteLine($"[{stopWatch.Elapsed.ToString()}][tid:{args.CurrentThreadId}]: {processedCount} Completed. {skippedCount} Skipped. {consumedCount} consume completed.");
            };

            await executor.ExecuteAsync(CancellationToken.None, progress).ConfigureAwait(false);
        }
        /// <summary>
        /// Anonymizes every <c>*.ndjson</c> file found in the input folder, one file at a time, writing
        /// each result to the path returned by <c>GetResourceOutputFileName</c>. Each record is
        /// anonymized with an engine created for the current file; failures are written to standard
        /// error and rethrown, and progress is printed to the console.
        /// </summary>
        public async Task AnonymizeAsync()
        {
            var searchScope = _isRecursive ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
            var ndjsonFiles = Directory.EnumerateFiles(_inputFolder, "*.ndjson", searchScope).ToList();

            Console.WriteLine($"Find {ndjsonFiles.Count} bulk data resource files in '{_inputFolder}'.");

            foreach (var sourcePath in ndjsonFiles)
            {
                Console.WriteLine($"Processing {sourcePath}");

                var targetPath = GetResourceOutputFileName(sourcePath, _inputFolder, _outputFolder);
                if (_isRecursive)
                {
                    // Output may land in a sub-folder that does not exist yet.
                    Directory.CreateDirectory(Path.GetDirectoryName(targetPath));
                }

                // Running totals for this file, updated from the progress callback.
                int processed = 0;
                int failed    = 0;
                int consumed  = 0;

                using (var inputStream = new FileStream(sourcePath, FileMode.Open))
                using (var outputStream = new FileStream(targetPath, FileMode.Create))
                {
                    using var reader   = new FhirStreamReader(inputStream);
                    using var consumer = new FhirStreamConsumer(outputStream);

                    Func<string, string> anonymize = record =>
                    {
                        try
                        {
                            // The engine is created first, then the settings, then the record is anonymized.
                            return AnonymizerEngine
                                .CreateWithFileContext(_configFilePath, sourcePath, _inputFolder)
                                .AnonymizeJson(record, new AnonymizerSettings()
                                {
                                    IsPrettyOutput = false,
                                    ValidateInput  = _validateInput,
                                    ValidateOutput = _validateOutput
                                });
                        }
                        catch (Exception ex)
                        {
                            Console.Error.WriteLine($"Error:\nResource: {record}\nErrorMessage: {ex.ToString()}");
                            throw;
                        }
                    };

                    var timer = Stopwatch.StartNew();

                    var executor = new FhirPartitionedExecutor<string, string>(reader, consumer, anonymize)
                    {
                        PartitionCount = Environment.ProcessorCount * 2
                    };

                    var progress = new Progress<BatchAnonymizeProgressDetail>();
                    progress.ProgressChanged += (sender, detail) =>
                    {
                        Interlocked.Add(ref processed, detail.ProcessCompleted);
                        Interlocked.Add(ref failed, detail.ProcessFailed);
                        Interlocked.Add(ref consumed, detail.ConsumeCompleted);

                        Console.WriteLine($"[{timer.Elapsed.ToString()}][tid:{detail.CurrentThreadId}]: {processed} Process completed. {failed} Process failed. {consumed} Consume completed.");
                    };

                    await executor.ExecuteAsync(CancellationToken.None, false, progress).ConfigureAwait(false);
                }

                Console.WriteLine($"Finished processing '{sourcePath}'!");
            }
        }