public static async Task GivenInvalidSchemaType_WhenProcess_ExceptionShouldBeThrown()
{
    // Arrange: processor wired with the shared schema manager, arrow options and null logger.
    var processor = new ParquetDataProcessor(_fhirSchemaManager, _arrowConfigurationOptions, _nullParquetDataProcessorLogger);
    var batch = new JsonBatchData(_testPatients);

    // Act & Assert: an unknown schema type must surface as a ParquetDataProcessorException.
    await Assert.ThrowsAsync<ParquetDataProcessorException>(
        () => processor.ProcessAsync(batch, new ProcessParameters("InvalidResourceType")));
}
public static async Task GivenAValidInputData_WhenProcess_CorrectResultShouldBeReturned()
{
    // Arrange: processor over the standard test patient batch.
    var processor = new ParquetDataProcessor(_fhirSchemaManager, _arrowConfigurationOptions, _nullParquetDataProcessorLogger);
    var batch = new JsonBatchData(_testPatients);

    // Act: convert the batch with the "Patient" schema.
    var resultBatch = await processor.ProcessAsync(batch, new ProcessParameters("Patient"));

    // Assert: produced parquet bytes match the checked-in expected file exactly.
    var actualStream = new MemoryStream();
    resultBatch.Value.CopyTo(actualStream);

    var expectedStream = GetExpectedParquetStream(Path.Combine(_expectTestDataFolder, "Expected_Patient.parquet"));
    Assert.Equal(expectedStream.ToArray(), actualStream.ToArray());
}
public static async Task GivenInvalidJsonBatchData_WhenProcess_ExceptionShouldBeThrown()
{
    var processor = new ParquetDataProcessor(_fhirSchemaManager, _arrowConfigurationOptions, _nullParquetDataProcessorLogger);

    // "name" holds a plain string where the Patient schema expects an array, making the record invalid.
    var malformedPatient = new JObject
    {
        { "resourceType", "Patient" },
        { "name", "Invalid field content, should be an array." },
    };
    var malformedBatch = new JsonBatchData(new List<JObject> { malformedPatient, malformedPatient });

    // Processing an invalid batch must surface as a ParquetDataProcessorException.
    await Assert.ThrowsAsync<ParquetDataProcessorException>(
        () => processor.ProcessAsync(malformedBatch, new ProcessParameters("Patient")));
}
/// <summary>
/// Converts a batch of JSON data into a parquet stream for the schema type named in
/// <paramref name="processParameters"/>.
/// </summary>
/// <param name="inputData">Batch of JSON objects to convert.</param>
/// <param name="processParameters">Carries the target schema type.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>
/// The parquet result wrapped in a <see cref="StreamBatchData"/>, or null when no data was converted.
/// </returns>
/// <exception cref="ParquetDataProcessorException">
/// Thrown when the schema type is unknown or the parquet conversion fails.
/// </exception>
public Task<StreamBatchData> ProcessAsync(
    JsonBatchData inputData,
    ProcessParameters processParameters,
    CancellationToken cancellationToken = default)
{
    cancellationToken.ThrowIfCancellationRequested();

    // Preprocess data.
    JsonBatchData preprocessedData = Preprocess(inputData, processParameters.SchemaType, cancellationToken);

    // Get FHIR schema for the input data.
    var schema = _fhirSchemaManager.GetSchema(processParameters.SchemaType);
    if (schema == null)
    {
        // Single message local avoids the log text and exception text drifting apart.
        var message = $"The FHIR schema node could not be found for schema type '{processParameters.SchemaType}'.";
        _logger.LogError(message);
        throw new ParquetDataProcessorException(message);
    }

    var inputStream = ConvertJsonDataToStream(processParameters.SchemaType, preprocessedData.Values);
    if (inputStream == null)
    {
        // Return null if no data has been converted.
        return Task.FromResult<StreamBatchData>(null);
    }

    // Convert JSON data to parquet stream.
    try
    {
        var resultStream = _parquetConverterWrapper.ConvertToParquetStream(processParameters.SchemaType, inputStream);
        return Task.FromResult(
            new StreamBatchData(
                resultStream,
                preprocessedData.Values.Count(),
                processParameters.SchemaType));
    }
    catch (Exception ex)
    {
        var message = $"Exception happened when converting input data to parquet for \"{processParameters.SchemaType}\".";

        // Pass the caught exception to the logger so its stack trace is not lost
        // (the original call logged only the message text).
        _logger.LogError(ex, message);
        throw new ParquetDataProcessorException(message, ex);
    }
}
public static async Task GivenAValidMultipleLargeInputData_WhenProcess_CorrectResultShouldBeReturned()
{
    // Arrange: build a large batch by repeating the large-patient fixture 100 times.
    var largePatientSet = TestUtils.LoadNdjsonData(Path.Combine(_testDataFolder, "Large_Patient.ndjson"));
    var repeatedData = Enumerable.Repeat(largePatientSet, 100).SelectMany(set => set);

    var processor = new ParquetDataProcessor(_fhirSchemaManager, _arrowConfigurationOptions, _nullParquetDataProcessorLogger);
    var batch = new JsonBatchData(repeatedData);

    // Act.
    var resultBatch = await processor.ProcessAsync(batch, new ProcessParameters("Patient"));

    // Assert: parquet output matches the expected multi-large-size fixture byte for byte.
    var actualStream = new MemoryStream();
    resultBatch.Value.CopyTo(actualStream);

    var expectedStream = GetExpectedParquetStream(Path.Combine(_expectTestDataFolder, "Expected_Patient_MultipleLargeSize.parquet"));
    Assert.Equal(expectedStream.ToArray(), actualStream.ToArray());
}
public static async Task GivenDataAllRecordsLengthLargerThanBlockSize_WhenProcess_NullResultShouldReturned()
{
    // Set BlockSize smaller than every record in _testPatients, so all records are
    // dropped and no parquet result can be produced.
    // (Previous comment was copy-pasted from the "some records larger" test and
    // referred to a shortPatientData object that does not exist in this test.)
    var arrowConfigurationOptions = Options.Create(new ArrowConfiguration()
    {
        ReadOptions = new ArrowReadOptionsConfiguration() { BlockSize = 50 },
    });

    var parquetDataProcessor = new ParquetDataProcessor(_fhirSchemaManager, arrowConfigurationOptions, _nullParquetDataProcessorLogger);

    var testData = new List<JObject>(_testPatients);
    var jsonBatchData = new JsonBatchData(testData);

    StreamBatchData result = await parquetDataProcessor.ProcessAsync(jsonBatchData, new ProcessParameters("Patient"));

    // When every record exceeds the block size, the processor returns null.
    Assert.Null(result);
}
/// <summary>
/// Preprocesses a JSON batch against the FHIR schema for the given schema type,
/// keeping only records that process successfully.
/// </summary>
/// <param name="inputData">Batch of JSON objects to preprocess.</param>
/// <param name="schemaType">Name of the FHIR schema to process against.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>A new batch containing the successfully processed records.</returns>
/// <exception cref="ParquetDataProcessorException">
/// Thrown when no schema exists for <paramref name="schemaType"/>.
/// </exception>
public JsonBatchData Preprocess(
    JsonBatchData inputData,
    string schemaType,
    CancellationToken cancellationToken = default)
{
    cancellationToken.ThrowIfCancellationRequested();

    // Resolve the FHIR schema for the input data; preprocessing is impossible without it.
    var schemaNode = _fhirSchemaManager.GetSchema(schemaType);
    if (schemaNode == null)
    {
        _logger.LogError($"The FHIR schema node could not be found for schema type '{schemaType}'.");
        throw new ParquetDataProcessorException($"The FHIR schema node could not be found for schema type '{schemaType}'.");
    }

    // Process each record against the schema; records that fail produce null and are filtered out.
    var processedRecords = inputData.Values
        .Select(record => ProcessStructObject(record, schemaNode))
        .Where(result => result != null);

    return new JsonBatchData(processedRecords);
}
public static async Task GivenDataWithSomeRecordsLengthLargerThanBlockSize_WhenProcess_LargeRecordsShouldBeIgnored()
{
    // A minimal record that stays under the small block size configured below.
    var shortPatientData = new JObject
    {
        { "resourceType", "Patient" },
        { "id", "example" },
    };
    var testData = new List<JObject>(_testPatients) { shortPatientData };

    // Set BlockSize small here; only shortPatientData can be retained and converted
    // to the parquet result — the larger records are ignored.
    var arrowConfigurationOptions = Options.Create(new ArrowConfiguration()
    {
        ReadOptions = new ArrowReadOptionsConfiguration() { BlockSize = 50 },
    });

    var parquetDataProcessor = new ParquetDataProcessor(_fhirSchemaManager, arrowConfigurationOptions, _nullParquetDataProcessorLogger);
    var jsonBatchData = new JsonBatchData(testData);

    var resultBatchData = await parquetDataProcessor.ProcessAsync(jsonBatchData, new ProcessParameters("Patient"));

    var resultStream = new MemoryStream();
    resultBatchData.Value.CopyTo(resultStream);

    var expectedResult = GetExpectedParquetStream(Path.Combine(_expectTestDataFolder, "Expected_Patient_IgnoreLargeLength.parquet"));
    Assert.Equal(expectedResult.ToArray(), resultStream.ToArray());
}