// NOTE: the modifiers and return type were missing from this excerpt. The body awaits and
// returns the result of an Either-style validation chain, so an async Task-returning
// signature is assumed here; the exact success type name is a placeholder.
public async Task<Either<List<ValidationError>, ProcessorStatistics>> Validate(
    SubjectData subjectData,
    ExecutionContext executionContext,
    ImportMessage message)
{
    _logger.LogInformation($"Validating: {message.DataFileName}");
    await _importStatusService.UpdateStatus(message.Release.Id, message.DataFileName, IStatus.STAGE_1);

    return await ValidateCsvFile(subjectData.DataBlob, false)
        .OnSuccessDo(async () => await ValidateCsvFile(subjectData.MetaBlob, true))
        .OnSuccess(async () =>
        {
            await using var dataFileStream = await _fileStorageService.StreamBlob(subjectData.DataBlob);
            var dataFileTable = DataTableUtils.CreateFromStream(dataFileStream);

            await using var metaFileStream = await _fileStorageService.StreamBlob(subjectData.MetaBlob);
            var metaFileTable = DataTableUtils.CreateFromStream(metaFileStream);

            return await ValidateMetaHeader(metaFileTable.Columns)
                .OnSuccess(() => ValidateMetaRows(metaFileTable.Columns, metaFileTable.Rows))
                .OnSuccess(() => ValidateObservationHeaders(dataFileTable.Columns))
                .OnSuccess(() =>
                    ValidateAndCountObservations(
                            dataFileTable.Columns,
                            dataFileTable.Rows,
                            executionContext,
                            message.Release.Id,
                            message.DataFileName)
                        .OnSuccess(result =>
                        {
                            _logger.LogInformation($"Validating: {message.DataFileName} complete");
                            return result;
                        }));
        });
}
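// The chain above relies on an Either-style result type with OnSuccess/OnSuccessDo
// combinators that short-circuit on the first failure. That type is not included in the
// excerpt; the following is a minimal sketch of how such combinators could work (type and
// member names are assumptions, and the project presumably defines further overloads,
// e.g. parameterless and synchronous variants, which are omitted here).
public class Either<TFailure, TSuccess>
{
    private readonly TFailure _failure;
    private readonly TSuccess _success;

    public bool IsRight { get; }

    public Either(TFailure failure) { _failure = failure; IsRight = false; }
    public Either(TSuccess success) { _success = success; IsRight = true; }

    public TFailure Left => _failure;
    public TSuccess Right => _success;
}

public static class EitherTaskExtensions
{
    // Runs the next step only if the previous step succeeded; otherwise propagates the failure.
    public static async Task<Either<TFailure, TNext>> OnSuccess<TFailure, TSuccess, TNext>(
        this Task<Either<TFailure, TSuccess>> task,
        Func<TSuccess, Task<Either<TFailure, TNext>>> next)
    {
        var result = await task;
        return result.IsRight
            ? await next(result.Right)
            : new Either<TFailure, TNext>(result.Left);
    }

    // Runs a side effect on success and returns the original result unchanged.
    public static async Task<Either<TFailure, TSuccess>> OnSuccessDo<TFailure, TSuccess>(
        this Task<Either<TFailure, TSuccess>> task,
        Func<Task> action)
    {
        var result = await task;
        if (result.IsRight)
        {
            await action();
        }
        return result;
    }
}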
public async Task ImportFiltersAndLocations(
    DataColumnCollection cols,
    DataRowCollection rows,
    SubjectMeta subjectMeta,
    StatisticsDbContext context,
    Guid releaseId,
    string dataFileName)
{
    // Clearing the caches is required here as the seeder shares the cache with all subjects
    _importerFilterService.ClearCache();
    _importerLocationService.ClearCache();

    var headers = CsvUtil.GetColumnValues(cols);
    var rowCount = 1;
    var totalRows = rows.Count;

    foreach (DataRow row in rows)
    {
        if (rowCount % STAGE_2_ROW_CHECK == 0)
        {
            var currentStatus = await _importStatusService.GetImportStatus(releaseId, dataFileName);

            if (currentStatus.IsFinishedOrAborting())
            {
                _logger.LogInformation($"Import for {dataFileName} has finished or is being aborted, " +
                                       "so finishing importing Filters and Locations early");
                return;
            }

            await _importStatusService.UpdateStatus(releaseId,
                dataFileName,
                IStatus.STAGE_2,
                (double) rowCount / totalRows * 100);
        }

        CreateFiltersAndLocationsFromCsv(context, CsvUtil.GetRowValues(row), headers, subjectMeta.Filters);
        rowCount++;
    }
}
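// CsvUtil.GetColumnValues and GetRowValues are not shown in the excerpt. Plausible minimal
// implementations would look like the following sketch (assumed for illustration only; the
// project's actual helpers may trim, quote-handle, or validate differently).
using System.Collections.Generic;
using System.Data;
using System.Linq;

public static class CsvUtil
{
    // Header names of a parsed CSV, in column order.
    public static List<string> GetColumnValues(DataColumnCollection cols)
    {
        return cols.Cast<DataColumn>().Select(col => col.ColumnName.Trim()).ToList();
    }

    // Cell values of a single row as strings, in column order.
    public static List<string> GetRowValues(DataRow row)
    {
        return row.ItemArray.Select(item => item?.ToString()?.Trim() ?? "").ToList();
    }
}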
private async Task SplitFiles(
    ImportMessage message,
    SubjectData subjectData,
    DataTable dataFileTable)
{
    var headerList = CsvUtil.GetColumnValues(dataFileTable.Columns);
    var batches = dataFileTable.Rows.OfType<DataRow>().Batch(message.RowsPerBatch);
    var batchCount = 1;
    var numRows = dataFileTable.Rows.Count + 1;
    var numBatches = (int) Math.Ceiling((double) dataFileTable.Rows.Count / message.RowsPerBatch);

    var existingBatchFiles = await _fileStorageService.GetBatchFilesForDataFile(
        message.Release.Id,
        message.DataFileName);

    // Materialised into a set so the per-batch Contains checks below don't re-enumerate
    var existingBatchFileNumbers = existingBatchFiles
        .Select(blobInfo => GetBatchNumberFromBatchFileName(blobInfo.FileName))
        .ToHashSet();

    // TODO: EES-1608 - this flag keeps track of whether any batch files have been generated to date.
    // It is used in a legacy check to determine whether or not to generate a "no rows" batch file.
    // EES-1608 will investigate the circumstances that could lead to a "no rows" batch file
    // situation, and whether this check can be removed entirely.
    var batchFilesExist = existingBatchFileNumbers.Any();

    foreach (var batch in batches)
    {
        var currentStatus = await _importStatusService.GetImportStatus(message.Release.Id, message.DataFileName);

        if (currentStatus.IsFinishedOrAborting())
        {
            _logger.LogInformation($"Import for {message.DataFileName} is finished or aborting - " +
                                   "stopping creating batch files");
            return;
        }

        var batchFileName = $"{message.DataFileName}_{batchCount:000000}";

        if (existingBatchFileNumbers.Contains(batchCount))
        {
            _logger.LogInformation($"Batch file {batchFileName} already exists - not recreating");
            batchCount++;
            continue;
        }

        var batchFilePath = $"{BatchesDir}/{batchFileName}";

        await using var stream = new MemoryStream();
        var writer = new StreamWriter(stream);
        await writer.FlushAsync();
        var table = new DataTable();
        CopyColumns(dataFileTable, table);
        CopyRows(table, batch.ToList(), headerList);

        var percentageComplete = (double) batchCount / numBatches * 100;

        await _importStatusService.UpdateStatus(message.Release.Id,
            message.DataFileName,
            IStatus.STAGE_3,
            percentageComplete);

        // If the batch has no lines then don't create a batch file or message - unless it's the
        // last batch and there are zero lines in total, in which case create a zero-lines batch
        if (table.Rows.Count == 0 && (batchCount != numBatches || batchFilesExist))
        {
            batchCount++;
            continue;
        }

        WriteDataTableToStream(table, writer);
        await writer.FlushAsync();
        stream.Seek(0, SeekOrigin.Begin);

        await _fileStorageService.UploadStream(
            message.Release.Id,
            fileType: FileType.Data,
            fileName: batchFilePath,
            stream: stream,
            contentType: "text/csv",
            FileStorageUtils.GetDataFileMetaValues(
                name: subjectData.DataBlob.Name,
                metaFileName: subjectData.DataBlob.GetMetaFileName(),
                userName: subjectData.DataBlob.GetUserName(),
                numberOfRows: numRows
            )
        );

        batchFilesExist = true;
        batchCount++;
    }
}
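// The Batch(...) call above chunks the data rows into groups of RowsPerBatch. The operator
// itself is not shown in the excerpt; if it is the MoreLINQ-style Batch, it behaves like
// this minimal sketch (assumed equivalent; the real operator may reuse buffers).
using System.Collections.Generic;

public static class BatchExtensions
{
    public static IEnumerable<List<T>> Batch<T>(this IEnumerable<T> source, int size)
    {
        var bucket = new List<T>(size);

        foreach (var item in source)
        {
            bucket.Add(item);

            // Emit a full chunk and start a fresh one
            if (bucket.Count == size)
            {
                yield return bucket;
                bucket = new List<T>(size);
            }
        }

        // Emit the final, possibly short, chunk
        if (bucket.Count > 0)
        {
            yield return bucket;
        }
    }
}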
// Returning Task rather than async void lets the Functions runtime observe the
// completion and failure of this trigger.
public async Task ProcessUploads(
    [QueueTrigger("imports-pending")] ImportMessage message,
    ExecutionContext executionContext,
    [Queue("imports-pending")] ICollector<ImportMessage> importStagesMessageQueue,
    [Queue("imports-available")] ICollector<ImportObservationsMessage> importObservationsMessageQueue)
{
    try
    {
        var status = await _importStatusService.GetImportStatus(message.Release.Id, message.DataFileName);

        _logger.LogInformation("Processor Function processing import message for " +
                               $"{message.DataFileName} at stage {status.Status}");

        switch (status.Status)
        {
            case IStatus.CANCELLING:
                _logger.LogInformation($"Import for {message.DataFileName} is in the process of being " +
                                       "cancelled, so not processing to the next import stage - marking as " +
                                       "CANCELLED");
                await _importStatusService.UpdateStatus(message.Release.Id,
                    message.DataFileName,
                    IStatus.CANCELLED,
                    100);
                break;
            case IStatus.CANCELLED:
                _logger.LogInformation($"Import for {message.DataFileName} is cancelled, so not " +
                                       "processing any further");
                break;
            case IStatus.QUEUED:
            case IStatus.PROCESSING_ARCHIVE_FILE:
            {
                if (message.ArchiveFileName != "")
                {
                    _logger.LogInformation($"Unpacking archive for {message.DataFileName}");
                    await _processorService.ProcessUnpackingArchive(message);
                }

                await _importStatusService.UpdateStatus(message.Release.Id, message.DataFileName, IStatus.STAGE_1);
                importStagesMessageQueue.Add(message);
                break;
            }
            case IStatus.STAGE_1:
                await _processorService.ProcessStage1(message, executionContext);
                await _importStatusService.UpdateStatus(message.Release.Id, message.DataFileName, IStatus.STAGE_2);
                importStagesMessageQueue.Add(message);
                break;
            case IStatus.STAGE_2:
                await _processorService.ProcessStage2(message);
                await _importStatusService.UpdateStatus(message.Release.Id, message.DataFileName, IStatus.STAGE_3);
                importStagesMessageQueue.Add(message);
                break;
            case IStatus.STAGE_3:
                await _processorService.ProcessStage3(message);
                await _importStatusService.UpdateStatus(message.Release.Id, message.DataFileName, IStatus.STAGE_4);
                importStagesMessageQueue.Add(message);
                break;
            case IStatus.STAGE_4:
                await _processorService.ProcessStage4Messages(message, importObservationsMessageQueue);
                break;
        }
    }
    catch (Exception e)
    {
        var ex = GetInnerException(e);

        await _batchService.FailImport(message.Release.Id,
            message.DataFileName,
            new List<ValidationError>
            {
                new ValidationError(ex.Message)
            });

        _logger.LogError(ex, $"{GetType().Name} function FAILED for Datafile: " +
                             $"{message.DataFileName} : {ex.Message}");
        _logger.LogError(ex.StackTrace);
    }
}
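// The IStatus values switched on above drive the stage machine: each stage re-queues the
// message so the next invocation picks up where the last left off. The enum is not defined
// in the excerpt; a plausible shape, inferred from the stages used here (an assumption,
// not the actual source), would be:
public enum IStatus
{
    QUEUED,
    PROCESSING_ARCHIVE_FILE,
    STAGE_1,     // validation
    STAGE_2,     // filters and locations
    STAGE_3,     // splitting into batch files
    STAGE_4,     // queueing observation import messages
    CANCELLING,
    CANCELLED,
    COMPLETE,
    FAILED
}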
public async Task ImportObservations(ImportObservationsMessage message, StatisticsDbContext context)
{
    var releaseId = message.ReleaseId;

    var status = await _importStatusService.GetImportStatus(releaseId, message.DataFileName);

    if (status.IsFinished())
    {
        _logger.LogInformation($"Import for {message.DataFileName} already finished with state " +
                               $"{status.Status} - ignoring Observations in file {message.ObservationsFilePath}");
        return;
    }

    if (status.Status == CANCELLING)
    {
        _logger.LogInformation($"Import for {message.DataFileName} is CANCELLING - " +
                               $"ignoring Observations in file {message.ObservationsFilePath} " +
                               "and marking the import as CANCELLED");
        await _importStatusService.UpdateStatus(releaseId, message.DataFileName, CANCELLED, 100);
        return;
    }

    var subjectData = await _fileStorageService.GetSubjectData(message.ReleaseId, message.ObservationsFilePath);
    var releaseSubject = GetReleaseSubjectLink(message.ReleaseId, message.SubjectId, context);

    await using var dataFileStream = await _fileStorageService.StreamBlob(subjectData.DataBlob);
    var dataFileTable = DataTableUtils.CreateFromStream(dataFileStream);

    await using var metaFileStream = await _fileStorageService.StreamBlob(subjectData.MetaBlob);
    var metaFileTable = DataTableUtils.CreateFromStream(metaFileStream);

    // ExecuteAsync (rather than Execute) ensures the retrying execution strategy awaits the
    // transactional work and can retry it as a single unit.
    await context.Database.CreateExecutionStrategy().ExecuteAsync(async () =>
    {
        await using var transaction = await context.Database.BeginTransactionAsync();

        await _importerService.ImportObservations(
            dataFileTable.Columns,
            dataFileTable.Rows,
            releaseSubject.Subject,
            _importerService.GetMeta(metaFileTable, releaseSubject.Subject, context),
            message.BatchNo,
            message.RowsPerBatch,
            context
        );

        await transaction.CommitAsync();
        await context.Database.CloseConnectionAsync();
    });

    if (message.NumBatches > 1)
    {
        await _fileStorageService.DeleteBlobByPath(message.ObservationsFilePath);
    }

    await CheckComplete(releaseId, message, context);
}
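// CreateExecutionStrategy() only returns a retrying strategy if the context was configured
// with one; otherwise it is a no-op wrapper. A typical registration that enables transient-
// fault retries would look like this sketch (assumes SQL Server; the class and method names
// here are illustrative, not taken from the source).
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.DependencyInjection;

public static class StatisticsDbConfig
{
    public static void AddStatisticsDb(IServiceCollection services, string connectionString)
    {
        services.AddDbContext<StatisticsDbContext>(options =>
            options.UseSqlServer(
                connectionString,
                // Makes CreateExecutionStrategy() return a retrying strategy, which is why
                // the explicit transaction above must be wrapped in ExecuteAsync.
                sqlServerOptions => sqlServerOptions.EnableRetryOnFailure()));
    }
}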