/// <summary>
        /// Checks every meta row by attempting to parse it with
        /// <c>ImporterMetaService.GetMetaRow</c> and collects one validation error
        /// for each row whose parsing throws.
        /// </summary>
        /// <param name="cols">Columns of the meta file table.</param>
        /// <param name="rows">Rows of the meta file table.</param>
        /// <returns>
        /// The collected errors when at least one row failed, otherwise <c>Unit.Instance</c>.
        /// </returns>
        /// <remarks>
        /// The method contains no awaits, so it runs to completion synchronously.
        /// </remarks>
        private static async Task<Either<IEnumerable<ValidationError>, Unit>> ValidateMetaRows(
            DataColumnCollection cols,
            DataRowCollection rows)
        {
            var failures  = new List<ValidationError>();
            var rowNumber = 0;

            foreach (DataRow metaRow in rows)
            {
                // Row numbers reported in error messages are 1-based.
                rowNumber += 1;

                try
                {
                    // The parsed result is discarded; only whether parsing throws matters here.
                    var columnValues = CsvUtil.GetColumnValues(cols);
                    ImporterMetaService.GetMetaRow(columnValues, metaRow);
                }
                catch (Exception e)
                {
                    var description =
                        $"error at row {rowNumber}: {MetaFileHasInvalidValues.GetEnumLabel()} : {e.Message}";
                    failures.Add(new ValidationError(description));
                }
            }

            if (failures.Count == 0)
            {
                return Unit.Instance;
            }

            return failures;
        }
        /// <summary>
        /// Builds the <see cref="SubjectMeta"/> for a subject from the meta file table:
        /// the meta rows are read from the table, then the filters and indicators are
        /// derived from those rows and materialised into lists.
        /// </summary>
        /// <param name="cols">Columns of the meta file table.</param>
        /// <param name="rows">Rows of the meta file table.</param>
        /// <param name="subject">Subject passed through to the filter and indicator lookups.</param>
        /// <param name="context">Statistics database context passed through to the lookups.</param>
        /// <returns>A <see cref="SubjectMeta"/> holding the filter and indicator lists.</returns>
        public SubjectMeta Get(DataColumnCollection cols, DataRowCollection rows, Subject subject, StatisticsDbContext context)
        {
            var columnValues = CsvUtil.GetColumnValues(cols);
            var parsedRows   = GetMetaRows(columnValues, rows);

            // Filters are resolved before indicators, and both before the result is constructed.
            var filterList    = GetFilters(parsedRows, subject, context).ToList();
            var indicatorList = GetIndicators(parsedRows, subject, context).ToList();

            var subjectMeta = new SubjectMeta
            {
                Filters = filterList,
                Indicators = indicatorList
            };

            return subjectMeta;
        }
        // Validates every data row and counts observations for the import of `dataFileName`.
        // Returns the collected validation errors when any row fails; otherwise returns a
        // ProcessorStatistics with the filtered observation count, rows per batch and batch count.
        // Progress is reported against IStatus.STAGE_1 through _importStatusService.
        // NOTE(review): the modifiers and return type of this declaration are not visible in
        // this excerpt; the awaits below imply the method is async - confirm against the full file.
        ValidateAndCountObservations(
            DataColumnCollection cols,
            DataRowCollection rows,
            ExecutionContext executionContext,
            Guid releaseId,
            string dataFileName)
        {
            var idx           = 0;
            var filteredRows  = 0;
            var totalRowCount = 0;
            var errors        = new List <ValidationError>();
            var dataRows      = rows.Count;

            foreach (DataRow row in rows)
            {
                idx++;
                // Cap the report: once 100 errors have been collected, append the
                // FirstOneHundredErrors label as a final entry and stop validating.
                if (errors.Count == 100)
                {
                    errors.Add(new ValidationError(FirstOneHundredErrors.GetEnumLabel()));
                    break;
                }

                try
                {
                    var rowValues = CsvUtil.GetRowValues(row);
                    // NOTE(review): column values are re-read for every row although `cols`
                    // does not change inside this loop.
                    var colValues = CsvUtil.GetColumnValues(cols);

                    // The return values are discarded; any exception thrown by these calls is
                    // recorded below as a validation error for this row.
                    ImporterService.GetGeographicLevel(rowValues, colValues);
                    ImporterService.GetTimeIdentifier(rowValues, colValues);
                    ImporterService.GetYear(rowValues, colValues);

                    // Only rows whose geographic level is not ignored count as observations.
                    if (!IsGeographicLevelIgnored(rowValues, colValues))
                    {
                        filteredRows++;
                    }
                }
                catch (Exception e)
                {
                    errors.Add(new ValidationError($"error at row {idx}: {e.Message}"));
                }

                // Counted whether or not the row validated successfully.
                totalRowCount++;

                // Report progress every STAGE_1_ROW_CHECK rows; the cast to double avoids
                // integer division when computing the percentage.
                if (totalRowCount % STAGE_1_ROW_CHECK == 0)
                {
                    await _importStatusService.UpdateStatus(releaseId,
                                                            dataFileName,
                                                            IStatus.STAGE_1,
                                                            (double)totalRowCount / dataRows * 100);
                }
            }

            // Any error fails validation; the 100% status update below is not sent in that case.
            if (errors.Count > 0)
            {
                return(errors);
            }

            await _importStatusService.UpdateStatus(releaseId,
                                                    dataFileName,
                                                    IStatus.STAGE_1,
                                                    100);

            // The "RowsPerBatch" setting is read as a string and converted to an int.
            var rowsPerBatch = Convert.ToInt32(LoadAppSettings(executionContext).GetValue <string>("RowsPerBatch"));

            // NumBatches is derived from the total row count, not the filtered observation count.
            return(new ProcessorStatistics
            {
                FilteredObservationCount = filteredRows,
                RowsPerBatch = rowsPerBatch,
                NumBatches = FileStorageUtils.GetNumBatches(totalRowCount, rowsPerBatch)
            });
        }
        /// <summary>
        /// Splits the rows of <paramref name="dataFileTable"/> into batches of
        /// <c>message.RowsPerBatch</c> rows and uploads each batch as a CSV file under
        /// <c>BatchesDir</c>, reporting progress against <c>IStatus.STAGE_3</c>.
        /// </summary>
        /// <remarks>
        /// Batch numbers that already have a batch file are skipped rather than recreated.
        /// Processing stops as soon as the import status reports finished or aborting.
        /// </remarks>
        /// <param name="message">Import message supplying the release, data file name and batch size.</param>
        /// <param name="subjectData">Subject data whose data blob provides the upload metadata.</param>
        /// <param name="dataFileTable">Table holding the full data file contents.</param>
        private async Task SplitFiles(
            ImportMessage message,
            SubjectData subjectData,
            DataTable dataFileTable)
        {
            var headerList = CsvUtil.GetColumnValues(dataFileTable.Columns);
            var batches    = dataFileTable.Rows.OfType<DataRow>().Batch(message.RowsPerBatch);
            // Row count plus one is what gets stored in the upload metadata
            // (presumably to account for the header line - confirm).
            var numRows    = dataFileTable.Rows.Count + 1;
            var numBatches = (int)Math.Ceiling((double)dataFileTable.Rows.Count / message.RowsPerBatch);

            var existingBatchFiles = await _fileStorageService.GetBatchFilesForDataFile(
                message.Release.Id,
                message.DataFileName);

            // Materialise the batch numbers once. The previous deferred query re-ran the
            // projection over every existing file on each Any()/Contains() call inside the loop.
            var existingBatchFileNumbers = new HashSet<int>(
                existingBatchFiles.Select(blobInfo => GetBatchNumberFromBatchFileName(blobInfo.FileName)));

            // TODO: EES-1608 - this flag keeps a track of whether any batch files have been generated to date.
            // It is used in a legacy check to determine whether or not to generate a "no rows" batch file.
            // EES-1608 will investigate what the circumstances are that could lead to a "no rows" batch file
            // situation, and whether this check can actually be entirely removed or not.
            var batchFilesExist = existingBatchFileNumbers.Count > 0;

            // 1-based batch number; incremented once at the top of each iteration so that
            // every skip path below stays in step without its own increment.
            var batchCount = 0;

            foreach (var batch in batches)
            {
                batchCount++;

                var currentStatus = await _importStatusService.GetImportStatus(message.Release.Id, message.DataFileName);

                if (currentStatus.IsFinishedOrAborting())
                {
                    _logger.LogInformation($"Import for {message.DataFileName} is finished or aborting - " +
                                           $"stopping creating batch files");
                    return;
                }

                var batchFileName = $"{message.DataFileName}_{batchCount:000000}";

                if (existingBatchFileNumbers.Contains(batchCount))
                {
                    _logger.LogInformation($"Batch file {batchFileName} already exists - not recreating");
                    continue;
                }

                var batchFilePath = $"{BatchesDir}/{batchFileName}";

                var table = new DataTable();
                CopyColumns(dataFileTable, table);
                CopyRows(table, batch.ToList(), headerList);

                var percentageComplete = (double)batchCount / numBatches * 100;

                // Progress is reported even when the batch turns out to be skipped below.
                await _importStatusService.UpdateStatus(message.Release.Id,
                                                        message.DataFileName,
                                                        IStatus.STAGE_3,
                                                        percentageComplete);

                // If no lines then don't create a batch or message unless it's the last one & there are zero
                // lines in total in which case create a zero lines batch
                if (table.Rows.Count == 0 && (batchCount != numBatches || batchFilesExist))
                {
                    continue;
                }

                // The stream and writer are only created once we know the batch will be uploaded.
                await using var stream = new MemoryStream();
                var writer = new StreamWriter(stream);

                WriteDataTableToStream(table, writer);
                await writer.FlushAsync();

                // Rewind so the upload reads the CSV from the beginning.
                stream.Seek(0, SeekOrigin.Begin);

                await _fileStorageService.UploadStream(
                    message.Release.Id,
                    fileType : FileType.Data,
                    fileName : batchFilePath,
                    stream : stream,
                    contentType : "text/csv",
                    FileStorageUtils.GetDataFileMetaValues(
                        name: subjectData.DataBlob.Name,
                        metaFileName: subjectData.DataBlob.GetMetaFileName(),
                        userName: subjectData.DataBlob.GetUserName(),
                        numberOfRows: numRows
                        )
                    );

                batchFilesExist = true;
            }
        }
Beispiel #5
0
        /// <summary>
        /// Splits the rows of <paramref name="dataFileTable"/> into batches of
        /// <c>dataImport.RowsPerBatch</c> rows and uploads each batch as a CSV blob to the
        /// <c>PrivateReleaseFiles</c> container, reporting progress against
        /// <c>DataImportStatus.STAGE_3</c>.
        /// </summary>
        /// <remarks>
        /// Batch numbers that already have a batch file are skipped rather than recreated.
        /// Processing stops as soon as the import status reports finished or aborting.
        /// </remarks>
        /// <param name="dataImport">Import record supplying the file, meta file and batch size.</param>
        /// <param name="dataFileTable">Table holding the full data file contents.</param>
        private async Task SplitFiles(
            DataImport dataImport,
            DataTable dataFileTable)
        {
            var colValues  = CsvUtil.GetColumnValues(dataFileTable.Columns);
            var batches    = dataFileTable.Rows.OfType<DataRow>().Batch(dataImport.RowsPerBatch);
            // Row count plus one is what gets stored in the upload metadata
            // (presumably to account for the header line - confirm).
            var numRows    = dataFileTable.Rows.Count + 1;
            var numBatches = (int)Math.Ceiling((double)dataFileTable.Rows.Count / dataImport.RowsPerBatch);

            var existingBatchFiles = await _batchService.GetBatchFilesForDataFile(dataImport.File);

            // Materialise the batch numbers once. The previous deferred query re-ran the
            // projection over every existing file on each Any()/Contains() call inside the loop.
            var existingBatchFileNumbers = new HashSet<int>(
                existingBatchFiles.Select(blobInfo => GetBatchNumberFromBatchFileName(blobInfo.FileName)));

            // TODO: EES-1608 - this flag keeps a track of whether any batch files have been generated to date.
            // It is used in a legacy check to determine whether or not to generate a "no rows" batch file.
            // EES-1608 will investigate what the circumstances are that could lead to a "no rows" batch file
            // situation, and whether this check can actually be entirely removed or not.
            var batchFilesExist = existingBatchFileNumbers.Count > 0;

            // 1-based batch number; incremented once at the top of each iteration so that
            // every skip path below stays in step without its own increment.
            var batchCount = 0;

            foreach (var batch in batches)
            {
                batchCount++;

                var currentStatus = await _dataImportService.GetImportStatus(dataImport.Id);

                if (currentStatus.IsFinishedOrAborting())
                {
                    _logger.LogInformation(
                        $"Import for {dataImport.File.Filename} is finished or aborting - stopping creating batch files");
                    return;
                }

                if (existingBatchFileNumbers.Contains(batchCount))
                {
                    _logger.LogInformation($"Batch {batchCount} already exists - not recreating");
                    continue;
                }

                var table = new DataTable();
                CopyColumns(dataFileTable, table);
                CopyRows(table, batch.ToList(), colValues, dataImport.HasSoleGeographicLevel());

                var percentageComplete = (double)batchCount / numBatches * 100;

                // Progress is reported even when the batch turns out to be skipped below.
                await _dataImportService.UpdateStatus(dataImport.Id, DataImportStatus.STAGE_3, percentageComplete);

                // If no lines then don't create a batch unless it's the last one & there are zero
                // lines in total in which case create a zero lines batch
                if (table.Rows.Count == 0 && (batchCount != numBatches || batchFilesExist))
                {
                    _logger.LogInformation($"Skipping batch file for row count {table.Rows.Count} with batchCount {batchCount} and numBatches {numBatches} and batchFilesExist {batchFilesExist} and batch {batch.Count()}");
                    continue;
                }

                // The stream and writer are only created once we know the batch will be uploaded.
                await using var stream = new MemoryStream();
                var writer = new StreamWriter(stream);

                WriteDataTableToStream(table, writer);
                await writer.FlushAsync();

                // Rewind so the upload reads the CSV from the beginning.
                stream.Seek(0, SeekOrigin.Begin);

                await _blobStorageService.UploadStream(
                    containerName : PrivateReleaseFiles,
                    path : dataImport.File.BatchPath(batchCount),
                    stream : stream,
                    contentType : "text/csv",
                    metadata : GetDataFileMetaValues(
                        metaFileName: dataImport.MetaFile.Filename,
                        numberOfRows: numRows
                        ));

                batchFilesExist = true;
            }
        }