// Validates every row of the metadata file, collecting one error per unparseable row.
private static async Task<Either<IEnumerable<ValidationError>, Unit>> ValidateMetaRows(
    DataColumnCollection cols,
    DataRowCollection rows)
{
    var errors = new List<ValidationError>();
    var idx = 0;

    // The column headers are the same for every row, so read them once up front.
    var colValues = CsvUtil.GetColumnValues(cols);

    foreach (DataRow row in rows)
    {
        idx++;

        try
        {
            ImporterMetaService.GetMetaRow(colValues, row);
        }
        catch (Exception e)
        {
            errors.Add(new ValidationError(
                $"error at row {idx}: {MetaFileHasInvalidValues.GetEnumLabel()} : {e.Message}"));
        }
    }

    if (errors.Count > 0)
    {
        return errors;
    }

    return Unit.Instance;
}
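// The "return errors;" and "return Unit.Instance;" statements above rely on implicit
// conversions defined on the Either type. A minimal sketch of such a type is shown below,
// assuming the common two-case shape - this is illustrative only, not the project's
// actual Either definition:
public class Either<TLeft, TRight>
{
    private readonly TLeft _left;
    private readonly TRight _right;

    public bool IsLeft { get; }

    private Either(TLeft left)
    {
        _left = left;
        IsLeft = true;
    }

    private Either(TRight right)
    {
        _right = right;
        IsLeft = false;
    }

    public TLeft Left => _left;
    public TRight Right => _right;

    // These operators are what let a List<ValidationError> or Unit.Instance convert to
    // the Either return type without explicit wrapping.
    public static implicit operator Either<TLeft, TRight>(TLeft left) => new Either<TLeft, TRight>(left);
    public static implicit operator Either<TLeft, TRight>(TRight right) => new Either<TLeft, TRight>(right);
}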
// Builds the SubjectMeta (filters and indicators) for a subject from the metadata CSV.
public SubjectMeta Get(
    DataColumnCollection cols,
    DataRowCollection rows,
    Subject subject,
    StatisticsDbContext context)
{
    var metaRows = GetMetaRows(CsvUtil.GetColumnValues(cols), rows);
    var filters = GetFilters(metaRows, subject, context).ToList();
    var indicators = GetIndicators(metaRows, subject, context).ToList();

    return new SubjectMeta
    {
        Filters = filters,
        Indicators = indicators
    };
}
// Validates every observation row, reporting progress as it goes, and returns either the
// collected validation errors or the statistics needed to drive the batching stage.
private async Task<Either<IEnumerable<ValidationError>, ProcessorStatistics>> ValidateAndCountObservations(
    DataColumnCollection cols,
    DataRowCollection rows,
    ExecutionContext executionContext,
    Guid releaseId,
    string dataFileName)
{
    var idx = 0;
    var filteredRows = 0;
    var totalRowCount = 0;
    var errors = new List<ValidationError>();
    var dataRows = rows.Count;

    // The column headers are the same for every row, so read them once up front.
    var colValues = CsvUtil.GetColumnValues(cols);

    foreach (DataRow row in rows)
    {
        idx++;

        // Cap the error report at the first one hundred failures.
        if (errors.Count == 100)
        {
            errors.Add(new ValidationError(FirstOneHundredErrors.GetEnumLabel()));
            break;
        }

        try
        {
            var rowValues = CsvUtil.GetRowValues(row);

            ImporterService.GetGeographicLevel(rowValues, colValues);
            ImporterService.GetTimeIdentifier(rowValues, colValues);
            ImporterService.GetYear(rowValues, colValues);

            if (!IsGeographicLevelIgnored(rowValues, colValues))
            {
                filteredRows++;
            }
        }
        catch (Exception e)
        {
            errors.Add(new ValidationError($"error at row {idx}: {e.Message}"));
        }

        totalRowCount++;

        if (totalRowCount % STAGE_1_ROW_CHECK == 0)
        {
            await _importStatusService.UpdateStatus(releaseId,
                dataFileName,
                IStatus.STAGE_1,
                (double) totalRowCount / dataRows * 100);
        }
    }

    if (errors.Count > 0)
    {
        return errors;
    }

    await _importStatusService.UpdateStatus(releaseId, dataFileName, IStatus.STAGE_1, 100);

    var rowsPerBatch = Convert.ToInt32(LoadAppSettings(executionContext).GetValue<string>("RowsPerBatch"));

    return new ProcessorStatistics
    {
        FilteredObservationCount = filteredRows,
        RowsPerBatch = rowsPerBatch,
        NumBatches = FileStorageUtils.GetNumBatches(totalRowCount, rowsPerBatch)
    };
}
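// FileStorageUtils.GetNumBatches is referenced above but not shown. Given that SplitFiles
// below derives numBatches via Math.Ceiling over the row count, a plausible equivalent is
// plain ceiling division - a sketch under that assumption, not the confirmed implementation:
public static int GetNumBatches(int totalRows, int rowsPerBatch)
{
    return (int) Math.Ceiling((double) totalRows / rowsPerBatch);
}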
// Splits the source data file into batch CSV files and uploads each batch to file storage,
// skipping batches that already exist so an interrupted import can resume.
private async Task SplitFiles(
    ImportMessage message,
    SubjectData subjectData,
    DataTable dataFileTable)
{
    var headerList = CsvUtil.GetColumnValues(dataFileTable.Columns);
    var batches = dataFileTable.Rows.OfType<DataRow>().Batch(message.RowsPerBatch);
    var batchCount = 1;
    // Total data rows plus one for the header row.
    var numRows = dataFileTable.Rows.Count + 1;
    var numBatches = (int) Math.Ceiling((double) dataFileTable.Rows.Count / message.RowsPerBatch);

    var existingBatchFiles = await _fileStorageService.GetBatchFilesForDataFile(
        message.Release.Id, message.DataFileName);

    var existingBatchFileNumbers = existingBatchFiles
        .Select(blobInfo => GetBatchNumberFromBatchFileName(blobInfo.FileName))
        // Materialise so the batch numbers are computed once rather than on every Contains check.
        .ToList();

    // TODO: EES-1608 - this flag keeps track of whether any batch files have been generated to date.
    // It is used in a legacy check to determine whether or not to generate a "no rows" batch file.
    // EES-1608 will investigate what the circumstances are that could lead to a "no rows" batch file
    // situation, and whether this check can actually be entirely removed or not.
    var batchFilesExist = existingBatchFileNumbers.Any();

    foreach (var batch in batches)
    {
        var currentStatus = await _importStatusService.GetImportStatus(message.Release.Id, message.DataFileName);

        if (currentStatus.IsFinishedOrAborting())
        {
            _logger.LogInformation($"Import for {message.DataFileName} is finished or aborting - " +
                                   "stopping creating batch files");
            return;
        }

        var batchFileName = $"{message.DataFileName}_{batchCount:000000}";

        if (existingBatchFileNumbers.Contains(batchCount))
        {
            _logger.LogInformation($"Batch file {batchFileName} already exists - not recreating");
            batchCount++;
            continue;
        }

        var batchFilePath = $"{BatchesDir}/{batchFileName}";

        await using var stream = new MemoryStream();
        var writer = new StreamWriter(stream);
        await writer.FlushAsync();

        var table = new DataTable();
        CopyColumns(dataFileTable, table);
        CopyRows(table, batch.ToList(), headerList);

        var percentageComplete = (double) batchCount / numBatches * 100;

        await _importStatusService.UpdateStatus(message.Release.Id,
            message.DataFileName,
            IStatus.STAGE_3,
            percentageComplete);

        // If no lines then don't create a batch or message, unless it's the last one and there
        // are zero lines in total, in which case create a zero-lines batch.
        if (table.Rows.Count == 0 && (batchCount != numBatches || batchFilesExist))
        {
            batchCount++;
            continue;
        }

        WriteDataTableToStream(table, writer);
        await writer.FlushAsync();
        stream.Seek(0, SeekOrigin.Begin);

        await _fileStorageService.UploadStream(
            message.Release.Id,
            fileType: FileType.Data,
            fileName: batchFilePath,
            stream: stream,
            contentType: "text/csv",
            FileStorageUtils.GetDataFileMetaValues(
                name: subjectData.DataBlob.Name,
                metaFileName: subjectData.DataBlob.GetMetaFileName(),
                userName: subjectData.DataBlob.GetUserName(),
                numberOfRows: numRows
            )
        );

        batchFilesExist = true;
        batchCount++;
    }
}
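// GetBatchNumberFromBatchFileName is referenced above but not shown. Since SplitFiles names
// batch files "{dataFileName}_{batchCount:000000}", a plausible implementation parses the
// zero-padded suffix after the last underscore - an illustrative assumption only:
private static int GetBatchNumberFromBatchFileName(string batchFileName)
{
    var suffix = batchFileName.Substring(batchFileName.LastIndexOf('_') + 1);
    return int.Parse(suffix);
}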
// Later variant of SplitFiles driven by a DataImport record: splits the source data file
// into batch CSV files and uploads each batch to blob storage, skipping existing batches.
private async Task SplitFiles(
    DataImport dataImport,
    DataTable dataFileTable)
{
    var colValues = CsvUtil.GetColumnValues(dataFileTable.Columns);
    var batches = dataFileTable.Rows.OfType<DataRow>().Batch(dataImport.RowsPerBatch);
    var batchCount = 1;
    // Total data rows plus one for the header row.
    var numRows = dataFileTable.Rows.Count + 1;
    var numBatches = (int) Math.Ceiling((double) dataFileTable.Rows.Count / dataImport.RowsPerBatch);

    var existingBatchFiles = await _batchService.GetBatchFilesForDataFile(dataImport.File);

    var existingBatchFileNumbers = existingBatchFiles
        .Select(blobInfo => GetBatchNumberFromBatchFileName(blobInfo.FileName))
        // Materialise so the batch numbers are computed once rather than on every Contains check.
        .ToList();

    // TODO: EES-1608 - this flag keeps track of whether any batch files have been generated to date.
    // It is used in a legacy check to determine whether or not to generate a "no rows" batch file.
    // EES-1608 will investigate what the circumstances are that could lead to a "no rows" batch file
    // situation, and whether this check can actually be entirely removed or not.
    var batchFilesExist = existingBatchFileNumbers.Any();

    foreach (var batch in batches)
    {
        var currentStatus = await _dataImportService.GetImportStatus(dataImport.Id);

        if (currentStatus.IsFinishedOrAborting())
        {
            _logger.LogInformation(
                $"Import for {dataImport.File.Filename} is finished or aborting - stopping creating batch files");
            return;
        }

        if (existingBatchFileNumbers.Contains(batchCount))
        {
            _logger.LogInformation($"Batch {batchCount} already exists - not recreating");
            batchCount++;
            continue;
        }

        await using var stream = new MemoryStream();
        var writer = new StreamWriter(stream);
        await writer.FlushAsync();

        var table = new DataTable();
        CopyColumns(dataFileTable, table);
        CopyRows(table, batch.ToList(), colValues, dataImport.HasSoleGeographicLevel());

        var percentageComplete = (double) batchCount / numBatches * 100;

        await _dataImportService.UpdateStatus(dataImport.Id, DataImportStatus.STAGE_3, percentageComplete);

        // If no lines then don't create a batch, unless it's the last one and there are zero
        // lines in total, in which case create a zero-lines batch.
        if (table.Rows.Count == 0 && (batchCount != numBatches || batchFilesExist))
        {
            _logger.LogInformation(
                $"Skipping batch file for row count {table.Rows.Count} with batchCount {batchCount} " +
                $"and numBatches {numBatches} and batchFilesExist {batchFilesExist} and batch {batch.Count()}");
            batchCount++;
            continue;
        }

        WriteDataTableToStream(table, writer);
        await writer.FlushAsync();
        stream.Seek(0, SeekOrigin.Begin);

        await _blobStorageService.UploadStream(
            containerName: PrivateReleaseFiles,
            path: dataImport.File.BatchPath(batchCount),
            stream: stream,
            contentType: "text/csv",
            metadata: GetDataFileMetaValues(
                metaFileName: dataImport.MetaFile.Filename,
                numberOfRows: numRows
            ));

        batchFilesExist = true;
        batchCount++;
    }
}
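// Both SplitFiles variants rely on a Batch extension to chunk the DataTable rows. This
// matches the shape of MoreLINQ's Batch operator; a minimal self-contained equivalent is
// sketched below for reference (the project may simply use MoreLINQ itself):
public static class EnumerableBatchExtensions
{
    public static IEnumerable<List<T>> Batch<T>(this IEnumerable<T> source, int size)
    {
        var bucket = new List<T>(size);

        foreach (var item in source)
        {
            bucket.Add(item);

            if (bucket.Count == size)
            {
                yield return bucket;
                bucket = new List<T>(size);
            }
        }

        // Emit the final, possibly partial, bucket.
        if (bucket.Count > 0)
        {
            yield return bucket;
        }
    }
}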