Example #1
0
        /// <summary>
        /// Runs a CSV import job: clones the job's git repository, adds the CSV and its
        /// CSVW metadata file, generates the dataset, metadata and definitions graphs,
        /// applies them to the DataDock repository, regenerates the affected HTML pages,
        /// commits/pushes/releases the changes, and finally creates or updates the
        /// dataset record in the dataset store.
        /// </summary>
        /// <param name="job">The import job to process (repository, dataset and file identifiers).</param>
        /// <param name="userAccount">The account the job runs for; passed to the git clone and commit operations.</param>
        /// <exception cref="WorkerException">
        /// Thrown when creating or updating the dataset record fails. Exceptions raised by the
        /// earlier steps are not caught here and propagate to the caller unchanged.
        /// </exception>
        protected override async Task RunJob(JobInfo job, UserAccount userAccount)
        {
            ProgressLog.Info("Starting import job processing for " + userAccount.UserId);

            var targetDirectory = Path.Combine(_configuration.RepoBaseDir, job.JobId);

            Log.Information("Using local directory {localDirPath}", targetDirectory);

            // Clone the repository
            await _git.CloneRepository(job.GitRepositoryUrl, targetDirectory, AuthenticationToken, userAccount);

            // Retrieve the CSV and CSVM (metadata) files into the cloned repository.
            // The files are read back below from csv/{DatasetId}/ under the target directory.
            await AddCsvFilesToRepository(targetDirectory,
                                          job.DatasetId,
                                          job.CsvFileName,
                                          job.CsvFileId,
                                          job.CsvmFileId);

            var csvPath  = Path.Combine(targetDirectory, "csv", job.DatasetId, job.CsvFileName);
            var metaPath = Path.Combine(targetDirectory, "csv", job.DatasetId, job.CsvFileName + "-metadata.json");

            // Parse the JSON metadata (read asynchronously so the worker thread is not blocked on disk I/O)
            JObject metadataJson;

            using (var metadataReader = File.OpenText(metaPath))
            {
                var metadataString = await metadataReader.ReadToEndAsync();
                metadataJson = JObject.Parse(metadataString);
            }

            // Run the CSV to RDF conversion
            var repositoryUri           = new Uri(_dataDockUriService.GetRepositoryUri(job.OwnerId, job.RepositoryId));
            var publisherIri            = new Uri(_dataDockUriService.GetRepositoryPublisherIdentifier(job.OwnerId, job.RepositoryId));
            var datasetUri              = new Uri(job.DatasetIri);
            var datasetMetadataGraphIri = new Uri(datasetUri + "/metadata");
            var rootMetadataGraphIri    = new Uri(_dataDockUriService.GetMetadataGraphIdentifier(job.OwnerId, job.RepositoryId));
            var definitionsGraphIri     = new Uri(_dataDockUriService.GetDefinitionsGraphIdentifier(job.OwnerId, job.RepositoryId));

            // Format with the invariant culture: the tag ends up in the release download URL,
            // so it must not vary with the host's culture or calendar settings.
            var dateTag    = DateTime.UtcNow.ToString("yyyyMMdd_HHmmss", System.Globalization.CultureInfo.InvariantCulture);
            var releaseTag = MakeSafeTag(job.DatasetId + "_" + dateTag);
            var publisher  = await GetPublisherContactInfo(job.OwnerId, job.RepositoryId);

            var ntriplesDownloadLink =
                new Uri($"https://github.com/{job.OwnerId}/{job.RepositoryId}/releases/download/{releaseTag}/{releaseTag}.nt.gz");
            var csvDownloadLink =
                new Uri(repositoryUri + $"csv/{job.DatasetId}/{job.CsvFileName}");

            // Log the first line of the CSV as a diagnostic aid; the value is not used otherwise.
            using (var tmpReader = File.OpenText(csvPath))
            {
                var header = await tmpReader.ReadLineAsync();
                Log.Information("CSV header: {CsvHeader}", header);
            }

            var    metadataBaseUri = new Uri(datasetUri + "/csv/" + job.CsvFileName + "-metadata.json");
            IGraph datasetGraph    = await GenerateDatasetGraphAsync(csvPath, metadataJson, metadataBaseUri);

            IGraph metadataGraph = GenerateMetadataGraph(datasetUri, publisherIri, metadataJson,
                                                         new[] { ntriplesDownloadLink, csvDownloadLink }, datasetGraph);

            IGraph definitionsGraph = GenerateDefinitionsGraph(metadataJson);

            var dataDataDockRepository = _dataDataDockRepositoryFactory.GetRepositoryForJob(job, ProgressLog);

            // NOTE(review): the two empty-string arguments are passed through as-is;
            // their meaning is defined by UpdateDataset, which is not visible here.
            dataDataDockRepository.UpdateDataset(
                datasetGraph, datasetUri, job.OverwriteExistingData,
                metadataGraph, datasetMetadataGraphIri,
                definitionsGraph, definitionsGraphIri,
                publisherIri, publisher,
                "", "",
                rootMetadataGraphIri);

            await UpdateHtmlPagesAsync(dataDataDockRepository,
                                       new[] { datasetUri, datasetMetadataGraphIri, rootMetadataGraphIri });

            // Add and Commit all changes; push and create the release only when the commit call reports success
            if (await _git.CommitChanges(targetDirectory,
                                         $"Added {job.CsvFileName} to dataset {job.DatasetIri}", userAccount))
            {
                await _git.PushChanges(job.GitRepositoryUrl, targetDirectory, AuthenticationToken);

                await _git.MakeRelease(datasetGraph, releaseTag, job.OwnerId, job.RepositoryId, job.DatasetId, targetDirectory, AuthenticationToken);
            }

            // Update the dataset repository
            try
            {
                var voidMetadataJson = ExtractVoidMetadata(metadataGraph);
                var datasetInfo      = new DatasetInfo
                {
                    OwnerId        = job.OwnerId,
                    RepositoryId   = job.RepositoryId,
                    DatasetId      = job.DatasetId,
                    LastModified   = DateTime.UtcNow,
                    CsvwMetadata   = metadataJson,
                    VoidMetadata   = voidMetadataJson,
                    ShowOnHomePage = job.IsPublic,
                    // Tags stays null when the metadata has no "dcat:keyword" entry
                    Tags           = metadataJson["dcat:keyword"]?.ToObject <List <string> >()
                };
                await _datasetStore.CreateOrUpdateDatasetRecordAsync(datasetInfo);

                ProgressLog.DatasetUpdated(datasetInfo);
            }
            catch (Exception ex)
            {
                // Log the underlying failure, then surface a user-facing message via WorkerException.
                Log.Error(ex, "Failed to update dataset record");
                throw new WorkerException(ex,
                                          "Failed to update dataset record. Your repository is updated, but may not show in the portal.");
            }
        }