/// <summary>
/// Runs a dataset-deletion job: clones the repository, removes the dataset's CSV and
/// metadata files plus its RDF data, regenerates the HTML pages, commits and pushes the
/// changes, and finally removes the dataset record from the dataset store.
/// </summary>
/// <param name="jobInfo">Details of the deletion job to run.</param>
/// <param name="userInfo">The account of the user who requested the deletion.</param>
/// <exception cref="WorkerException">
/// Thrown when the dataset record cannot be removed from the dataset store after the
/// repository itself has already been updated.
/// </exception>
protected override async Task RunJob(JobInfo jobInfo, UserAccount userInfo)
{
    var targetDirectory = Path.Combine(Configuration.RepoBaseDir, jobInfo.JobId);
    Log.Information("Using local directory {localDirPath}", targetDirectory);

    // Clone the target repository into the job-specific working directory
    Log.Information("Clone Repository: {gitRepositoryUrl} => {targetDirectory}", jobInfo.GitRepositoryUrl, targetDirectory);
    await _git.CloneRepository(jobInfo.GitRepositoryUrl, targetDirectory, AuthenticationToken, userInfo);

    var datasetIri = new Uri(jobInfo.DatasetIri);

    // Remove the dataset's source CSV/CSVM files and its RDF data, then regenerate HTML pages
    DeleteCsvAndMetadata(targetDirectory, jobInfo.DatasetId, ProgressLog);
    var dataDockRepository = _repositoryFactory.GetRepositoryForJob(jobInfo, ProgressLog);
    dataDockRepository.DeleteDataset(datasetIri);
    await UpdateHtmlPagesAsync(dataDockRepository, null);

    // Only push when the commit actually recorded changes
    if (await _git.CommitChanges(targetDirectory, $"Deleted dataset {datasetIri}", userInfo))
    {
        await _git.PushChanges(jobInfo.GitRepositoryUrl, targetDirectory, AuthenticationToken);
    }

    try
    {
        await _datasetStore.DeleteDatasetAsync(jobInfo.OwnerId, jobInfo.RepositoryId, jobInfo.DatasetId);
    }
    catch (Exception ex)
    {
        Log.Error(ex, "Failed to remove dataset record.");
        throw new WorkerException(ex, "Failed to remove dataset record. Your repository is updated but the dataset may still show in the main lodlab portal");
    }

    // FIX: the template previously read "{OwnerId}/RepositoryId/{DatasetId}" — RepositoryId was
    // missing its braces, so the repository id argument was never bound and the literal text
    // "RepositoryId" appeared in the log output.
    Log.Information("Dataset Deleted: {OwnerId}/{RepositoryId}/{DatasetId}", jobInfo.OwnerId, jobInfo.RepositoryId, jobInfo.DatasetId);
    ProgressLog.DatasetDeleted(jobInfo.OwnerId, jobInfo.RepositoryId, jobInfo.DatasetId);
}
/// <summary>
/// Runs a CSV import job: clones the repository, copies the CSV and CSVW metadata files
/// into it, converts the CSV to RDF, updates the repository's data and HTML pages,
/// commits/pushes the changes and creates a release, then records the dataset in the
/// dataset store.
/// </summary>
/// <param name="job">Details of the import job to run.</param>
/// <param name="userAccount">The account of the user who submitted the job.</param>
/// <exception cref="WorkerException">
/// Thrown when the dataset record cannot be updated in the dataset store after the
/// repository itself has already been updated.
/// </exception>
protected override async Task RunJob(JobInfo job, UserAccount userAccount)
{
    ProgressLog.Info("Starting import job processing for " + userAccount.UserId);
    var targetDirectory = Path.Combine(_configuration.RepoBaseDir, job.JobId);
    Log.Information("Using local directory {localDirPath}", targetDirectory);

    // Clone the repository
    await _git.CloneRepository(job.GitRepositoryUrl, targetDirectory, AuthenticationToken, userAccount);

    // Retrieve CSV and CSVM files to src directory in the repository
    await AddCsvFilesToRepository(targetDirectory, job.DatasetId, job.CsvFileName, job.CsvFileId, job.CsvmFileId);
    var csvPath = Path.Combine(targetDirectory, "csv", job.DatasetId, job.CsvFileName);
    var metaPath = Path.Combine(targetDirectory, "csv", job.DatasetId, job.CsvFileName + "-metadata.json");

    // Parse the JSON metadata (the CSVW metadata file copied alongside the CSV above)
    JObject metadataJson;
    using (var metadataReader = File.OpenText(metaPath))
    {
        var metadataString = metadataReader.ReadToEnd();
        metadataJson = JObject.Parse(metadataString);
    }

    // Run the CSV to RDF conversion.
    // Build the IRIs that identify the repository, publisher, dataset and the three
    // graphs (dataset metadata, root metadata, definitions) written below.
    var repositoryUri = new Uri(_dataDockUriService.GetRepositoryUri(job.OwnerId, job.RepositoryId));
    var publisherIri = new Uri(_dataDockUriService.GetRepositoryPublisherIdentifier(job.OwnerId, job.RepositoryId));
    var datasetUri = new Uri(job.DatasetIri);
    var datasetMetadataGraphIri = new Uri(datasetUri + "/metadata");
    var rootMetadataGraphIri = new Uri(_dataDockUriService.GetMetadataGraphIdentifier(job.OwnerId, job.RepositoryId));
    var definitionsGraphIri = new Uri(_dataDockUriService.GetDefinitionsGraphIdentifier(job.OwnerId, job.RepositoryId));

    // Release tag combines the dataset id with a UTC timestamp so each import produces a unique release
    var dateTag = DateTime.UtcNow.ToString("yyyyMMdd_HHmmss");
    var releaseTag = MakeSafeTag(job.DatasetId + "_" + dateTag);
    var publisher = await GetPublisherContactInfo(job.OwnerId, job.RepositoryId);
    // Download links advertised in the generated metadata: the gzipped NTriples release
    // asset on GitHub and the raw CSV within the repository
    var ntriplesDownloadLink = new Uri($"https://github.com/{job.OwnerId}/{job.RepositoryId}/releases/download/{releaseTag}/{releaseTag}.nt.gz");
    var csvDownloadLink = new Uri(repositoryUri + $"csv/{job.DatasetId}/{job.CsvFileName}");

    // Log the CSV header row (diagnostic only; the reader is closed straight after)
    using (var tmpReader = File.OpenText(csvPath))
    {
        var header = tmpReader.ReadLine();
        Log.Information("CSV header: {CsvHeader}", header);
    }

    // Generate the dataset graph from the CSV plus its parsed metadata, then derive the
    // metadata and definitions graphs from it
    var metadataBaseUri = new Uri(datasetUri + "/csv/" + job.CsvFileName + "-metadata.json");
    IGraph datasetGraph = await GenerateDatasetGraphAsync(csvPath, metadataJson, metadataBaseUri);
    IGraph metadataGraph = GenerateMetadataGraph(datasetUri, publisherIri, metadataJson, new[] { ntriplesDownloadLink, csvDownloadLink }, datasetGraph);
    IGraph definitionsGraph = GenerateDefinitionsGraph(metadataJson);

    // Write the generated graphs into the repository and regenerate the affected HTML pages
    var dataDataDockRepository = _dataDataDockRepositoryFactory.GetRepositoryForJob(job, ProgressLog);
    dataDataDockRepository.UpdateDataset(
        datasetGraph, datasetUri, job.OverwriteExistingData,
        metadataGraph, datasetMetadataGraphIri,
        definitionsGraph, definitionsGraphIri,
        publisherIri, publisher,
        "", "", rootMetadataGraphIri);
    await UpdateHtmlPagesAsync(dataDataDockRepository, new[] { datasetUri, datasetMetadataGraphIri, rootMetadataGraphIri });

    // Add and Commit all changes; only push and create a release when the commit
    // actually recorded changes
    if (await _git.CommitChanges(targetDirectory, $"Added {job.CsvFileName} to dataset {job.DatasetIri}", userAccount))
    {
        await _git.PushChanges(job.GitRepositoryUrl, targetDirectory, AuthenticationToken);
        await _git.MakeRelease(datasetGraph, releaseTag, job.OwnerId, job.RepositoryId, job.DatasetId, targetDirectory, AuthenticationToken);
    }

    // Update the dataset repository
    try
    {
        var voidMetadataJson = ExtractVoidMetadata(metadataGraph);
        var datasetInfo = new DatasetInfo
        {
            OwnerId = job.OwnerId,
            RepositoryId = job.RepositoryId,
            DatasetId = job.DatasetId,
            LastModified = DateTime.UtcNow,
            CsvwMetadata = metadataJson,
            VoidMetadata = voidMetadataJson,
            ShowOnHomePage = job.IsPublic,
            // "dcat:keyword" may be absent from the metadata; Tags is null in that case
            Tags = metadataJson["dcat:keyword"]?.ToObject<List<string>>()
        };
        await _datasetStore.CreateOrUpdateDatasetRecordAsync(datasetInfo);
        ProgressLog.DatasetUpdated(datasetInfo);
    }
    catch (Exception ex)
    {
        Log.Error(ex, "Failed to update dataset record");
        throw new WorkerException(ex, "Failed to update dataset record. Your repository is updated, but may not show in the portal.");
    }
}