/// <summary>
/// Merges the rows of <paramref name="tableLoadResultToMerge"/> into
/// <paramref name="latestTableLoadResult"/> and reports how many rows were added and updated.
/// </summary>
/// <param name="latestTableLoadResult">Table that is mutated in place to receive the merged rows.</param>
/// <param name="tableLoadResultToMerge">Table whose rows are compared against, and merged into, the latest table.</param>
/// <returns>
/// A merge result carrying the latest table's definition name plus the number of new and updated rows.
/// </returns>
private DatasetDataTableMergeResult Merge(TableLoadResult latestTableLoadResult, TableLoadResult tableLoadResultToMerge)
{
    // Both row sets are materialised before the latest table is touched, so the
    // mutations below cannot influence which rows count as "new" or "updated".
    IEnumerable<RowLoadResult> rowsToAdd = tableLoadResultToMerge
        .GetRowsMissingFrom(latestTableLoadResult)
        .ToArray();
    IEnumerable<RowLoadResult> rowsToUpdate = tableLoadResultToMerge
        .GetRowsWhereFieldsDifferFromMatchIn(latestTableLoadResult)
        .ToArray();

    latestTableLoadResult.UpdateMatchingRowsWithFieldsValuesFrom(rowsToUpdate);
    latestTableLoadResult.AddRows(rowsToAdd);

    DatasetDataTableMergeResult tableMergeResult = new DatasetDataTableMergeResult
    {
        TableDefinitionName = latestTableLoadResult.TableDefinition.Name,
        NewRowsCount = rowsToAdd.Count(),
        UpdatedRowsCount = rowsToUpdate.Count()
    };

    return tableMergeResult;
}
/// <summary>
/// Resolves the requested version of <paramref name="dataset"/>, loads its table data and
/// persists it against the build project of the given specification.
/// </summary>
/// <param name="dataset">Dataset whose history is searched for <paramref name="version"/>.</param>
/// <param name="specificationId">Specification used to look up the build project.</param>
/// <param name="relationshipId">Dataset relationship id, passed through to <c>PersistDataset</c>.</param>
/// <param name="version">Dataset version to process; must match exactly one history entry.</param>
/// <param name="user">Author reference, passed through to <c>PersistDataset</c>.</param>
/// <returns>The build project found for <paramref name="specificationId"/>.</returns>
/// <exception cref="NonRetriableException">
/// Thrown when the dataset version, the dataset definition, the build project or the table
/// load result cannot be found.
/// </exception>
private async Task<BuildProject> ProcessDataset(Dataset dataset, string specificationId, string relationshipId, int version, Reference user)
{
    string dataDefinitionId = dataset.Definition.Id;

    DatasetVersion datasetVersion = dataset.History.SingleOrDefault(v => v.Version == version);
    if (datasetVersion == null)
    {
        // Arguments follow the template's placeholder order: name, id, version.
        _logger.Error("Dataset version not found for dataset '{name}' ({id}) version '{version}'", dataset.Name, dataset.Id, version);
        throw new NonRetriableException($"Dataset version not found for dataset '{dataset.Name}' ({dataset.Id}) version '{version}'");
    }

    string fullBlobName = datasetVersion.BlobName;

    DatasetDefinition datasetDefinition =
        (await _datasetRepository.GetDatasetDefinitionsByQuery(m => m.Id == dataDefinitionId))?.FirstOrDefault();
    if (datasetDefinition == null)
    {
        string definitionNotFound = $"Unable to find a data definition for id: {dataDefinitionId}, for blob: {fullBlobName}";
        _logger.Error(definitionNotFound);
        throw new NonRetriableException(definitionNotFound);
    }

    BuildProject buildProject = await _calcsRepository.GetBuildProjectBySpecificationId(specificationId);
    if (buildProject == null)
    {
        _logger.Error($"Unable to find a build project for specification id: {specificationId}");
        throw new NonRetriableException($"Unable to find a build project for id: {specificationId}");
    }

    TableLoadResult loadResult = await GetTableResult(fullBlobName, datasetDefinition);
    if (loadResult == null)
    {
        const string loadFailed = "Failed to load table result";
        _logger.Error(loadFailed);
        throw new NonRetriableException(loadFailed);
    }

    await PersistDataset(loadResult, dataset, datasetDefinition, buildProject, specificationId, relationshipId, version, user);

    return buildProject;
}
/// <summary>
/// Splits the loaded table rows into one source dataset per provider, versions them against
/// the currently stored source datasets, saves the changes and refreshes the dataset
/// aggregations and provider summaries for the specification.
/// </summary>
/// <param name="loadResult">Table rows loaded for the dataset version being processed.</param>
/// <param name="dataset">Dataset the rows belong to; its id and name are stamped on new versions.</param>
/// <param name="datasetDefinition">Definition used to resolve provider ids and field definition ids.</param>
/// <param name="buildProject">Build project whose dataset relationships are searched for <paramref name="relationshipId"/>.</param>
/// <param name="specificationId">Specification the source datasets are stored against.</param>
/// <param name="relationshipId">Id of the dataset relationship being populated; validated by <c>Guard</c>.</param>
/// <param name="version">Dataset version recorded on newly created source dataset versions.</param>
/// <param name="user">Author recorded on created versions.</param>
/// <remarks>
/// Logs an error and returns without saving when the build project has no dataset relationships,
/// or none matching <paramref name="relationshipId"/>.
/// </remarks>
private async Task PersistDataset(TableLoadResult loadResult, Dataset dataset, DatasetDefinition datasetDefinition, BuildProject buildProject, string specificationId, string relationshipId, int version, Reference user)
{
    // Validate the argument before doing any remote work.
    Guard.IsNullOrWhiteSpace(relationshipId, nameof(relationshipId));

    IEnumerable<ProviderSummary> providerSummaries = await _providerService.FetchCoreProviderData();

    if (buildProject.DatasetRelationships == null)
    {
        _logger.Error($"No dataset relationships found for build project with id : '{buildProject.Id}' for specification '{specificationId}'");
        return;
    }

    DatasetRelationshipSummary relationshipSummary = buildProject.DatasetRelationships.FirstOrDefault(m => m.Relationship.Id == relationshipId);
    if (relationshipSummary == null)
    {
        _logger.Error($"No dataset relationship found for build project with id : {buildProject.Id} with data definition id {datasetDefinition.Id} and relationshipId '{relationshipId}'");
        return;
    }

    // Index the currently stored source datasets by provider id. Entries are removed as they are
    // matched below, so whatever remains afterwards has no rows in the new load.
    ConcurrentDictionary<string, ProviderSourceDataset> existingCurrent = new ConcurrentDictionary<string, ProviderSourceDataset>();

    IEnumerable<ProviderSourceDataset> existingCurrentDatasets = await _providerResultsRepositoryPolicy.ExecuteAsync(
        () => _providersResultsRepository.GetCurrentProviderSourceDatasets(specificationId, relationshipId));

    if (existingCurrentDatasets.AnyWithNullCheck())
    {
        foreach (ProviderSourceDataset currentDataset in existingCurrentDatasets)
        {
            existingCurrent.TryAdd(currentDataset.ProviderId, currentDataset);
        }
    }

    ConcurrentDictionary<string, ProviderSourceDataset> resultsByProviderId = new ConcurrentDictionary<string, ProviderSourceDataset>();
    ConcurrentDictionary<string, ProviderSourceDataset> updateCurrentDatasets = new ConcurrentDictionary<string, ProviderSourceDataset>();

    // Loop-invariant: read the toggle once rather than once per row per provider.
    bool useFieldDefinitionIds = _featureToggle.IsUseFieldDefinitionIdsInSourceDatasetsEnabled();

    Parallel.ForEach(loadResult.Rows, (RowLoadResult row) =>
    {
        IEnumerable<string> allProviderIds = GetProviderIdsForIdentifier(datasetDefinition, row, providerSummaries);

        foreach (string providerId in allProviderIds)
        {
            // The factory may run on several threads for the same key, but only one result is
            // stored and every caller receives that stored instance.
            ProviderSourceDataset sourceDataset = resultsByProviderId.GetOrAdd(providerId, id =>
            {
                ProviderSourceDataset created = new ProviderSourceDataset
                {
                    DataGranularity = relationshipSummary.DataGranularity,
                    SpecificationId = specificationId,
                    DefinesScope = relationshipSummary.DefinesScope,
                    DataRelationship = new Reference(relationshipSummary.Relationship.Id, relationshipSummary.Relationship.Name),
                    DatasetRelationshipSummary = new Reference(relationshipSummary.Id, relationshipSummary.Name),
                    ProviderId = id
                };

                created.Current = new ProviderSourceDatasetVersion
                {
                    Rows = new List<Dictionary<string, object>>(),
                    Dataset = new VersionReference(dataset.Id, dataset.Name, version),
                    Date = DateTimeOffset.Now.ToLocalTime(),
                    ProviderId = id,
                    Version = 1,
                    PublishStatus = Models.Versioning.PublishStatus.Draft,
                    ProviderSourceDatasetId = created.Id,
                    Author = user
                };

                return created;
            });

            if (useFieldDefinitionIds)
            {
                // Re-key the row by field definition id; built outside the lock to keep it short.
                Dictionary<string, object> fieldsByDefinitionId = new Dictionary<string, object>();

                foreach (KeyValuePair<string, object> rowField in row.Fields)
                {
                    foreach (TableDefinition tableDefinition in datasetDefinition.TableDefinitions)
                    {
                        FieldDefinition fieldDefinition = tableDefinition.FieldDefinitions.FirstOrDefault(m => m.Name == rowField.Key);

                        if (fieldDefinition != null)
                        {
                            fieldsByDefinitionId.Add(fieldDefinition.Id, rowField.Value);
                        }
                    }
                }

                // Several rows can map to the same provider and Rows is a plain List, which is not
                // safe for concurrent Add calls. The dataset instance is created in this method and
                // not yet shared elsewhere, so it is a safe lock target.
                lock (sourceDataset)
                {
                    sourceDataset.DataDefinitionId = relationshipSummary.DatasetDefinition.Id;
                    sourceDataset.Current.Rows.Add(fieldsByDefinitionId);
                }
            }
            else
            {
                // Same reasoning as above: serialise mutation of the shared per-provider dataset.
                lock (sourceDataset)
                {
                    sourceDataset.DataDefinition = new Reference(relationshipSummary.DatasetDefinition.Id, relationshipSummary.DatasetDefinition.Name);
                    sourceDataset.Current.Rows.Add(row.Fields);
                }
            }
        }
    });

    ConcurrentBag<ProviderSourceDatasetVersion> historyToSave = new ConcurrentBag<ProviderSourceDatasetVersion>();
    List<Task> historySaveTasks = new List<Task>(resultsByProviderId.Count);

    // Caps the number of version-creation tasks in flight at any one time.
    SemaphoreSlim throttler = new SemaphoreSlim(initialCount: 15);

    foreach (KeyValuePair<string, ProviderSourceDataset> providerSourceDataset in resultsByProviderId)
    {
        await throttler.WaitAsync();

        historySaveTasks.Add(
            Task.Run(async () =>
            {
                try
                {
                    string providerId = providerSourceDataset.Key;
                    ProviderSourceDataset sourceDataset = providerSourceDataset.Value;

                    if (existingCurrent.TryGetValue(providerId, out ProviderSourceDataset existingDataset))
                    {
                        // Rows are compared via their serialised form; a new version is only
                        // created when the content actually differs.
                        string existingDatasetJson = JsonConvert.SerializeObject(existingDataset.Current.Rows);
                        string latestDatasetJson = JsonConvert.SerializeObject(sourceDataset.Current.Rows);

                        if (existingDatasetJson != latestDatasetJson)
                        {
                            ProviderSourceDatasetVersion newVersion = existingDataset.Current.Clone() as ProviderSourceDatasetVersion;

                            newVersion = await _sourceDatasetsVersionRepository.CreateVersion(newVersion, existingDataset.Current, providerId);
                            newVersion.Author = user;
                            newVersion.Rows = sourceDataset.Current.Rows;

                            sourceDataset.Current = newVersion;

                            updateCurrentDatasets.TryAdd(providerId, sourceDataset);
                            historyToSave.Add(newVersion);
                        }

                        // Matched: whatever is left in existingCurrent afterwards is missing from this load.
                        existingCurrent.TryRemove(providerId, out _);
                    }
                    else
                    {
                        // No stored dataset for this provider: the freshly built version is saved as-is.
                        updateCurrentDatasets.TryAdd(providerId, sourceDataset);
                        historyToSave.Add(sourceDataset.Current);
                    }
                }
                finally
                {
                    throttler.Release();
                }
            }));
    }

    await TaskHelper.WhenAllAndThrow(historySaveTasks.ToArray());

    if (updateCurrentDatasets.Count > 0)
    {
        _logger.Information($"Saving {updateCurrentDatasets.Count} updated source datasets");

        await _providerResultsRepositoryPolicy.ExecuteAsync(
            () => _providersResultsRepository.UpdateCurrentProviderSourceDatasets(updateCurrentDatasets.Values));
    }

    if (_featureToggle.IsProviderResultsSpecificationCleanupEnabled() && !existingCurrent.IsEmpty)
    {
        _logger.Information($"Removing {existingCurrent.Count} missing source datasets");

        await _providerResultsRepositoryPolicy.ExecuteAsync(
            () => _providersResultsRepository.DeleteCurrentProviderSourceDatasets(existingCurrent.Values));

        // Cleanup messages are sent in batches of 1000 datasets.
        foreach (IEnumerable<ProviderSourceDataset> providerSourceDataSets in existingCurrent.Values.Partition<ProviderSourceDataset>(1000))
        {
            await SendProviderSourceDatasetCleanupMessageToTopic(specificationId, ServiceBusConstants.TopicNames.ProviderSourceDatasetCleanup, providerSourceDataSets);
        }
    }

    if (!historyToSave.IsEmpty)
    {
        _logger.Information($"Saving {historyToSave.Count} items to history");

        await _sourceDatasetsVersionRepository.SaveVersions(historyToSave);
    }

    Reference relationshipReference = new Reference(relationshipSummary.Relationship.Id, relationshipSummary.Relationship.Name);

    DatasetAggregations datasetAggregations = GenerateAggregations(datasetDefinition, loadResult, specificationId, relationshipReference);

    if (!datasetAggregations.Fields.IsNullOrEmpty())
    {
        await _datasetsAggregationsRepository.CreateDatasetAggregations(datasetAggregations);
    }

    // Drop the cached aggregations for the specification now that they may have changed.
    await _cacheProvider.RemoveAsync<List<CalculationAggregation>>($"{CacheKeys.DatasetAggregationsForSpecification}{specificationId}");

    await PopulateProviderSummariesForSpecification(specificationId, providerSummaries);
}
/// <summary>
/// Builds sum, average, min and max aggregations for every column of the loaded table whose
/// field definition is both aggregable and numeric.
/// </summary>
/// <param name="datasetDefinition">Definition whose table field definitions are matched to columns by name.</param>
/// <param name="tableLoadResult">Loaded rows; the first row determines which columns are considered.</param>
/// <param name="specificationId">Specification id stored on the result.</param>
/// <param name="datasetRelationship">Relationship whose id is stored on the result and whose name forms the identifier prefix.</param>
/// <returns>
/// The aggregations for the table; <c>Fields</c> is empty when there are no rows or no
/// aggregable numeric columns.
/// </returns>
private DatasetAggregations GenerateAggregations(DatasetDefinition datasetDefinition, TableLoadResult tableLoadResult, string specificationId, Reference datasetRelationship)
{
    List<AggregatedField> aggregatedFields = new List<AggregatedField>();

    DatasetAggregations datasetAggregations = new DatasetAggregations
    {
        SpecificationId = specificationId,
        DatasetRelationshipId = datasetRelationship.Id,
        Fields = new List<AggregatedField>()
    };

    string identifierPrefix = $"Datasets.{DatasetTypeGenerator.GenerateIdentifier(datasetRelationship.Name)}";

    IEnumerable<FieldDefinition> allFieldDefinitions = datasetDefinition.TableDefinitions.SelectMany(m => m.FieldDefinitions);

    RowLoadResult firstRow = tableLoadResult.Rows.FirstOrDefault();
    if (firstRow == null)
    {
        return datasetAggregations;
    }

    // Materialise once so the SelectMany is not re-run for every column.
    List<FieldDefinition> fieldDefinitions = allFieldDefinitions.ToList();

    foreach (KeyValuePair<string, object> field in firstRow.Fields)
    {
        FieldDefinition fieldDefinition = fieldDefinitions.FirstOrDefault(m => m.Name == field.Key);

        // A column without a matching definition cannot be aggregated; skip it rather than
        // dereferencing a null definition.
        if (fieldDefinition == null || !fieldDefinition.IsAggregable || !fieldDefinition.IsNumeric)
        {
            continue;
        }

        string fieldName = fieldDefinition.Name;
        string identifierName = $"{identifierPrefix}.{DatasetTypeGenerator.GenerateIdentifier(fieldName)}";

        // Collect the column's values in a single pass; null cells count as zero.
        // The first row contains this column, so the array is never empty.
        decimal[] values = tableLoadResult.Rows
            .SelectMany(m => m.Fields.Where(f => f.Key == fieldName))
            .Select(f => f.Value != null ? Convert.ToDecimal(f.Value) : 0m)
            .ToArray();

        aggregatedFields.Add(new AggregatedField { FieldDefinitionName = identifierName, FieldType = AggregatedType.Sum, Value = values.Sum() });
        aggregatedFields.Add(new AggregatedField { FieldDefinitionName = identifierName, FieldType = AggregatedType.Average, Value = values.Average() });
        aggregatedFields.Add(new AggregatedField { FieldDefinitionName = identifierName, FieldType = AggregatedType.Min, Value = values.Min() });
        aggregatedFields.Add(new AggregatedField { FieldDefinitionName = identifierName, FieldType = AggregatedType.Max, Value = values.Max() });
    }

    datasetAggregations.Fields = aggregatedFields;

    return datasetAggregations;
}
/// <summary>
/// Reads the latest dataset file and the file to merge, merges each table of the latter into
/// the matching table of the former and, when anything changed, overwrites the merge file in
/// blob storage with the merged data.
/// </summary>
/// <param name="datasetDefinition">Definition used to read and write the Excel data.</param>
/// <param name="latestBlobFileName">Blob name of the latest dataset file.</param>
/// <param name="blobFileNameToMerge">Blob name of the file being merged; replaced with the merged data when there are changes.</param>
/// <returns>
/// Per-table merge results. <c>ErrorMessage</c> is set when either file cannot be read or the
/// upload of the merged file fails.
/// </returns>
public async Task<DatasetDataMergeResult> Merge(DatasetDefinition datasetDefinition, string latestBlobFileName, string blobFileNameToMerge)
{
    DatasetDataMergeResult mergeOutcome = new DatasetDataMergeResult();

    (bool latestRead, string latestReadError, List<TableLoadResult> latestTables) =
        await ReadExcelDatasetData(datasetDefinition, latestBlobFileName);

    if (!latestRead)
    {
        mergeOutcome.ErrorMessage = latestReadError;
        _logger.Error(latestReadError);

        return mergeOutcome;
    }

    (bool mergeFileRead, string mergeFileReadError, List<TableLoadResult> tablesToMerge) =
        await ReadExcelDatasetData(datasetDefinition, blobFileNameToMerge);

    if (!mergeFileRead)
    {
        mergeOutcome.ErrorMessage = mergeFileReadError;
        _logger.Error(mergeFileReadError);

        return mergeOutcome;
    }

    foreach (TableLoadResult latestTable in latestTables)
    {
        TableLoadResult incomingTable = tablesToMerge.FirstOrDefault(
            x => x.TableDefinition?.Name == latestTable.TableDefinition.Name);

        bool nothingToMerge = incomingTable == null || !incomingTable.Rows.Any();

        // The table-level merge updates latestTable in place with the incoming table's data.
        DatasetDataTableMergeResult tableOutcome = nothingToMerge
            ? new DatasetDataTableMergeResult { TableDefinitionName = latestTable.TableDefinition.Name }
            : Merge(latestTable, incomingTable);

        mergeOutcome.TablesMergeResults.Add(tableOutcome);
    }

    if (!mergeOutcome.HasChanges)
    {
        return mergeOutcome;
    }

    // NOTE: Rows were added to or updated in the latest (previous version) dataset, so the
    // merge file is replaced with the merged data.
    byte[] workbookBytes = _excelDatasetWriter.Write(datasetDefinition, latestTables);
    ICloudBlob targetBlob = await _blobClient.GetBlobReferenceFromServerAsync(blobFileNameToMerge);

    try
    {
        await using MemoryStream workbookStream = new MemoryStream(workbookBytes);

        await _blobClientPolicy.ExecuteAsync(() => targetBlob.UploadFromStreamAsync(workbookStream));
    }
    catch (Exception ex)
    {
        mergeOutcome.ErrorMessage = $"Failed to upload {datasetDefinition.Name} to blob storage after merge.";
        _logger.Error(ex, mergeOutcome.ErrorMessage);
    }

    return mergeOutcome;
}