Ejemplo n.º 1
0
        /// <summary>
        /// Copies the uploaded CSV file and, when one was supplied, its metadata file from the job
        /// file store into the <c>csv/{datasetId}</c> directory of the local repository.
        /// </summary>
        /// <param name="repositoryDirectory">Root directory of the local repository working copy.</param>
        /// <param name="datasetId">Dataset identifier; used as the sub-directory name under <c>csv</c>.</param>
        /// <param name="csvFileName">File name to give the CSV file inside the repository.</param>
        /// <param name="csvFileId">Job file store identifier of the uploaded CSV file.</param>
        /// <param name="csvmFileId">Job file store identifier of the uploaded metadata file, or <c>null</c> to skip it.</param>
        /// <exception cref="WorkerException">Thrown when any step of the copy fails; wraps the original exception.</exception>
        private async Task AddCsvFilesToRepository(string repositoryDirectory, string datasetId, string csvFileName, string csvFileId, string csvmFileId)
        {
            try
            {
                ProgressLog.Info("Copying source CSV and metadata files to repository directory csv/{0}", datasetId);
                var datasetCsvDirPath = Path.Combine(repositoryDirectory, "csv", datasetId);

                // CreateDirectory does nothing when the directory already exists, so no Exists check is needed.
                Directory.CreateDirectory(datasetCsvDirPath);

                var csvFilePath = Path.Combine(datasetCsvDirPath, csvFileName);

                // Dispose the source stream as well as the destination so the file store handle is released.
                await using (var csvFileStream = await _jobFileStore.GetFileAsync(csvFileId))
                await using (var csvOutStream = File.Open(csvFilePath, FileMode.Create, FileAccess.Write))
                {
                    await csvFileStream.CopyToAsync(csvOutStream);
                }

                if (csvmFileId != null)
                {
                    // The metadata file sits next to the CSV file, named "<csv file name>-metadata.json".
                    var csvmFilePath = csvFilePath + "-metadata.json";

                    await using (var csvmFileStream = await _jobFileStore.GetFileAsync(csvmFileId))
                    await using (var csvmOutStream = File.Open(csvmFilePath, FileMode.Create, FileAccess.Write))
                    {
                        await csvmFileStream.CopyToAsync(csvmOutStream);
                    }
                }
            }
            catch (Exception ex)
            {
                Log.Error(ex, "Failed to copy CSV/CSVM files");
                throw new WorkerException(ex, "Failed to copy CSV/CSVM files from upload to Github repository.");
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Creates a new graph and fills it with the column definitions that the metadata
        /// extractor generates from the supplied metadata JSON.
        /// </summary>
        /// <param name="metadataJson">The parsed metadata JSON object.</param>
        /// <returns>The newly created graph that was passed to the extractor.</returns>
        private Graph GenerateDefinitionsGraph(JObject metadataJson)
        {
            var columnDefinitions = new Graph();
            var extractor         = new MetdataExtractor();

            ProgressLog.Info("Extracting column property definitions");
            extractor.GenerateColumnDefinitions(metadataJson, columnDefinitions);

            return columnDefinitions;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Builds the dataset metadata graph: runs the metadata extractor over the metadata JSON,
        /// then links the dataset node to each download URI (void:dataDump) and to up to ten
        /// distinct URI subjects taken from the data graph (void:exampleResource).
        /// </summary>
        /// <param name="datasetUri">URI of the dataset node the extra statements are attached to.</param>
        /// <param name="publisherIri">Publisher IRI passed through to the metadata extractor.</param>
        /// <param name="metadataJson">The parsed metadata JSON object.</param>
        /// <param name="downloadUris">URIs asserted as data dumps of the dataset.</param>
        /// <param name="dataGraph">Data graph supplying the triple count and the example subjects.</param>
        /// <returns>The newly created metadata graph.</returns>
        private Graph GenerateMetadataGraph(Uri datasetUri, Uri publisherIri, JObject metadataJson, IEnumerable <Uri> downloadUris, IGraph dataGraph)
        {
            var result    = new Graph();
            var extractor = new MetdataExtractor();

            ProgressLog.Info("Extracting dataset metadata");
            extractor.Run(metadataJson, result, publisherIri, dataGraph.Triples.Count, DateTime.UtcNow);

            var datasetNode              = result.CreateUriNode(datasetUri);
            var dataDumpPredicate        = result.CreateUriNode(new Uri("http://rdfs.org/ns/void#dataDump"));
            var exampleResourcePredicate = result.CreateUriNode(new Uri("http://rdfs.org/ns/void#exampleResource"));

            // One void:dataDump statement per download link.
            foreach (var link in downloadUris)
            {
                var linkNode = result.CreateUriNode(link);
                result.Assert(datasetNode, dataDumpPredicate, linkNode);
            }

            // At most ten distinct URI subjects from the data graph serve as example resources.
            var sampleSubjects = dataGraph.Triples
                                 .Select(triple => triple.Subject)
                                 .OfType <IUriNode>()
                                 .Distinct()
                                 .Take(10);
            foreach (var subject in sampleSubjects)
            {
                result.Assert(datasetNode, exampleResourcePredicate, subject);
            }

            return result;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Parses the table metadata and converts the CSV file at <paramref name="csvPath"/> into
        /// a new RDF graph.
        /// </summary>
        /// <param name="csvPath">Local path of the CSV file; used to resolve the first table of the parsed table group.</param>
        /// <param name="metadataJson">The parsed metadata JSON object describing the table.</param>
        /// <param name="metadataUri">URI handed to the metadata parser.</param>
        /// <returns>The graph populated by the conversion.</returns>
        /// <exception cref="WorkerException">
        /// Thrown when the table metadata cannot be read or is invalid, or when the converter
        /// reports one or more errors.
        /// </exception>
        private async Task <Graph> GenerateDatasetGraphAsync(string csvPath, JObject metadataJson, Uri metadataUri)
        {
            var parser     = new JsonMetadataParser(null, metadataUri);
            var tableGroup = new TableGroup();

            try
            {
                var tableMeta = parser.ParseTable(tableGroup, metadataJson);
                if (tableMeta == null)
                {
                    throw new WorkerException("CSV Conversion failed. Unable to read CSV table metadata.");
                }
            }
            catch (MetadataParseException ex)
            {
                // Pass the message as a template argument: concatenating it into the template would
                // make any braces in the parser message be interpreted as template holes.
                Log.Error(ex, "Invalid CSV table metadata: {ErrorMessage}", ex.Message);
                throw new WorkerException(ex, "CSV conversion failed. Invalid CSV table metadata: " + ex.Message);
            }

            var graph = new Graph();

            ProgressLog.Info("Running CSV to RDF conversion");
            var graphHandler  = new GraphHandler(graph);
            var tableResolver = new LocalTableResolver(tableGroup.Tables[0].Url, csvPath);
            // "this" is handed to the converter together with the report interval.
            var converter     = new Converter(graphHandler, tableResolver, ConverterMode.Minimal, (msg) => ProgressLog.Error(msg), this, reportInterval: CsvConversionReportInterval);
            await converter.ConvertAsync(tableGroup);

            if (converter.Errors.Any())
            {
                foreach (var e in converter.Errors)
                {
                    ProgressLog.Error(e);
                }
                throw new WorkerException("One or more errors were encountered during the CSV to RDF conversion.");
            }
            return graph;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Looks up the default publisher contact details, preferring the repository settings and
        /// falling back to the owner settings when the repository has none.
        /// </summary>
        /// <param name="ownerId">Owner identifier; the owner-level lookup is skipped when this is <c>null</c>.</param>
        /// <param name="repoId">Repository identifier.</param>
        /// <returns>
        /// The default publisher that was found, or <c>null</c> when neither level defines one or
        /// when any exception occurs during the lookup.
        /// </returns>
        private async Task <ContactInfo> GetPublisherContactInfo(string ownerId, string repoId)
        {
            try
            {
                ProgressLog.Info("Attempting to retrieve publisher contact information from repository settings");

                // First choice: publisher configured on the repository itself.
                var settingsForRepo = await RepoSettingsStore.GetRepoSettingsAsync(ownerId, repoId);
                var repoPublisher   = settingsForRepo?.DefaultPublisher;
                if (repoPublisher != null)
                {
                    ProgressLog.Info("Returning publisher from repository settings");
                    return repoPublisher;
                }

                ProgressLog.Info("No publisher info found in repository settings");

                // Second choice: publisher configured on the owner, when an owner id is available.
                if (ownerId != null)
                {
                    ProgressLog.Info("Attempting to retrieve publisher contact information from repository owner's settings");
                    var settingsForOwner = await OwnerSettingsStore.GetOwnerSettingsAsync(ownerId);
                    var ownerPublisher   = settingsForOwner?.DefaultPublisher;
                    if (ownerPublisher != null)
                    {
                        ProgressLog.Info("Returning publisher from repository owner's settings");
                        return ownerPublisher;
                    }
                }

                // Nothing configured at either level.
                ProgressLog.Info("No publisher info found in repository owner's settings");
                return null;
            }
            catch (Exception)
            {
                // Best effort: a failed lookup is reported and treated as "no publisher".
                ProgressLog.Error("Error when attempting to retrieve publisher contact information from repository/owner settings");
                return null;
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Runs an import job: clones the repository, copies the uploaded CSV and metadata files
        /// into it, converts the CSV to RDF, updates the repository data and HTML pages,
        /// commits the changes (pushing and creating a release when the commit call returns true),
        /// and finally creates or updates the dataset record in the dataset store.
        /// </summary>
        /// <param name="job">The job being processed; supplies the ids, file names and URLs used throughout.</param>
        /// <param name="userAccount">The user account passed to the git clone and commit calls.</param>
        /// <exception cref="WorkerException">
        /// Thrown here when creating or updating the dataset record fails. Other failures
        /// propagate from the calls made along the way.
        /// </exception>
        protected override async Task RunJob(JobInfo job, UserAccount userAccount)
        {
            ProgressLog.Info("Starting import job processing for " + userAccount.UserId);

            // The working directory is the configured base directory plus the job id.
            var targetDirectory = Path.Combine(_configuration.RepoBaseDir, job.JobId);

            Log.Information("Using local directory {localDirPath}", targetDirectory);

            // Clone the repository
            await _git.CloneRepository(job.GitRepositoryUrl, targetDirectory, AuthenticationToken, userAccount);

            // Copy the uploaded CSV and CSVM files into the working directory
            // (the paths read back below are under csv/<DatasetId>)
            await AddCsvFilesToRepository(targetDirectory,
                                          job.DatasetId,
                                          job.CsvFileName,
                                          job.CsvFileId,
                                          job.CsvmFileId);

            var csvPath  = Path.Combine(targetDirectory, "csv", job.DatasetId, job.CsvFileName);
            var metaPath = Path.Combine(targetDirectory, "csv", job.DatasetId, job.CsvFileName + "-metadata.json");

            // Parse the JSON metadata
            // NOTE(review): the metadata file is opened unconditionally; this presumably relies on
            // job.CsvmFileId always being set so that the file was copied above. TODO confirm.
            JObject metadataJson;

            using (var metadataReader = File.OpenText(metaPath))
            {
                var metadataString = metadataReader.ReadToEnd();
                metadataJson = JObject.Parse(metadataString);
            }

            // Work out the URIs/IRIs and the release tag used by the conversion and publishing steps
            var repositoryUri           = new Uri(_dataDockUriService.GetRepositoryUri(job.OwnerId, job.RepositoryId));
            var publisherIri            = new Uri(_dataDockUriService.GetRepositoryPublisherIdentifier(job.OwnerId, job.RepositoryId));
            var datasetUri              = new Uri(job.DatasetIri);
            var datasetMetadataGraphIri = new Uri(datasetUri + "/metadata");
            var rootMetadataGraphIri    = new Uri(_dataDockUriService.GetMetadataGraphIdentifier(job.OwnerId, job.RepositoryId));
            var definitionsGraphIri     = new Uri(_dataDockUriService.GetDefinitionsGraphIdentifier(job.OwnerId, job.RepositoryId));
            // The release tag is the dataset id plus a UTC timestamp, passed through MakeSafeTag
            var dateTag    = DateTime.UtcNow.ToString("yyyyMMdd_HHmmss");
            var releaseTag = MakeSafeTag(job.DatasetId + "_" + dateTag);
            var publisher  = await GetPublisherContactInfo(job.OwnerId, job.RepositoryId);

            // Download links recorded in the metadata graph: the release's N-Triples archive and the CSV file
            var ntriplesDownloadLink =
                new Uri($"https://github.com/{job.OwnerId}/{job.RepositoryId}/releases/download/{releaseTag}/{releaseTag}.nt.gz");
            var csvDownloadLink =
                new Uri(repositoryUri + $"csv/{job.DatasetId}/{job.CsvFileName}");

            // Log the first line of the CSV file
            using (var tmpReader = File.OpenText(csvPath))
            {
                var header = tmpReader.ReadLine();
                Log.Information("CSV header: {CsvHeader}", header);
            }

            // Run the CSV to RDF conversion and build the three graphs handed to the repository update
            var    metadataBaseUri = new Uri(datasetUri + "/csv/" + job.CsvFileName + "-metadata.json");
            IGraph datasetGraph    = await GenerateDatasetGraphAsync(csvPath, metadataJson, metadataBaseUri);

            IGraph metadataGraph = GenerateMetadataGraph(datasetUri, publisherIri, metadataJson,
                                                         new[] { ntriplesDownloadLink, csvDownloadLink }, datasetGraph);

            IGraph definitionsGraph = GenerateDefinitionsGraph(metadataJson);



            var dataDataDockRepository = _dataDataDockRepositoryFactory.GetRepositoryForJob(job, ProgressLog);

            // NOTE(review): the meaning of the two empty-string arguments is not visible here - confirm against UpdateDataset
            dataDataDockRepository.UpdateDataset(
                datasetGraph, datasetUri, job.OverwriteExistingData,
                metadataGraph, datasetMetadataGraphIri,
                definitionsGraph, definitionsGraphIri,
                publisherIri, publisher,
                "", "",
                rootMetadataGraphIri);

            await UpdateHtmlPagesAsync(dataDataDockRepository,
                                       new[] { datasetUri, datasetMetadataGraphIri, rootMetadataGraphIri });

            // Add and Commit all changes; push and create the release only when CommitChanges returns true
            if (await _git.CommitChanges(targetDirectory,
                                         $"Added {job.CsvFileName} to dataset {job.DatasetIri}", userAccount))
            {
                await _git.PushChanges(job.GitRepositoryUrl, targetDirectory, AuthenticationToken);

                await _git.MakeRelease(datasetGraph, releaseTag, job.OwnerId, job.RepositoryId, job.DatasetId, targetDirectory, AuthenticationToken);
            }

            // Create or update the dataset record in the dataset store
            // (this runs whether or not the commit branch above was taken)
            try
            {
                var voidMetadataJson = ExtractVoidMetadata(metadataGraph);
                var datasetInfo      = new DatasetInfo
                {
                    OwnerId        = job.OwnerId,
                    RepositoryId   = job.RepositoryId,
                    DatasetId      = job.DatasetId,
                    LastModified   = DateTime.UtcNow,
                    CsvwMetadata   = metadataJson,
                    VoidMetadata   = voidMetadataJson,
                    ShowOnHomePage = job.IsPublic,
                    // Tags is null when the metadata has no "dcat:keyword" entry
                    Tags           = metadataJson["dcat:keyword"]?.ToObject <List <string> >()
                };
                await _datasetStore.CreateOrUpdateDatasetRecordAsync(datasetInfo);

                ProgressLog.DatasetUpdated(datasetInfo);
            }
            catch (Exception ex)
            {
                Log.Error(ex, "Failed to update dataset record");
                throw new WorkerException(ex,
                                          "Failed to update dataset record. Your repository is updated, but may not show in the portal.");
            }
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Writes a progress log entry stating how many rows the CSV conversion has processed.
 /// </summary>
 /// <param name="value">The row count to report.</param>
 public void Report(int value) =>
     ProgressLog.Info("CSV conversion processed {0} rows", value);
Ejemplo n.º 8
0
        /// <summary>
        /// Assembles the portal information for a repository from the owner settings, the owner's
        /// GitHub organisation or user profile, and the repository's search button settings.
        /// </summary>
        /// <param name="ownerId">Owner identifier; when <c>null</c> no lookup is made and <c>null</c> is returned.</param>
        /// <param name="repoId">Repository identifier.</param>
        /// <param name="authenticationToken">Token used to create the GitHub client.</param>
        /// <returns>
        /// The populated portal information, or <c>null</c> when <paramref name="ownerId"/> is
        /// <c>null</c> or any exception occurs during the lookup.
        /// </returns>
        private async Task <PortalInfoDrop> GetPortalSettingsInfo(string ownerId, string repoId, string authenticationToken)
        {
            try
            {
                ProgressLog.Info("Attempting to retrieve portal settings information from owner settings");
                if (ownerId == null)
                {
                    // no settings
                    ProgressLog.Info("No owner settings found");
                    return null;
                }

                var portalInfo = new PortalInfoDrop
                {
                    OwnerId        = ownerId,
                    RepositoryName = repoId
                };

                ProgressLog.Info("Attempting to retrieve publisher contact information from repository owner's settings");
                var ownerSettings = await OwnerSettingsStore.GetOwnerSettingsAsync(ownerId);

                if (ownerSettings != null)
                {
                    portalInfo.IsOrg             = ownerSettings.IsOrg;
                    portalInfo.ShowDashboardLink = ownerSettings.DisplayDataDockLink;
                    if (!string.IsNullOrEmpty(ownerSettings.TwitterHandle))
                    {
                        portalInfo.Twitter = ownerSettings.TwitterHandle;
                    }

                    // Each profile field is copied only when the matching Display* owner setting is on.
                    var client = GitHubClientFactory.CreateClient(authenticationToken);
                    if (ownerSettings.IsOrg)
                    {
                        var org = await client.Organization.Get(ownerId);

                        if (org == null)
                        {
                            // NOTE(review): this early return skips the search button lookup below - confirm that is intended.
                            return portalInfo;
                        }

                        portalInfo.OwnerDisplayName = org.Name ?? ownerId;
                        if (ownerSettings.DisplayGitHubBlogUrl)
                        {
                            portalInfo.Website = org.Blog;
                        }
                        if (ownerSettings.DisplayGitHubAvatar)
                        {
                            portalInfo.LogoUrl = org.AvatarUrl;
                        }
                        if (ownerSettings.DisplayGitHubDescription)
                        {
                            portalInfo.Description = org.Bio;
                        }
                        if (ownerSettings.DisplayGitHubLocation)
                        {
                            portalInfo.Location = org.Location;
                        }
                        if (ownerSettings.DisplayGitHubIssuesLink)
                        {
                            portalInfo.GitHubHtmlUrl = org.HtmlUrl;
                        }
                    }
                    else
                    {
                        var user = await client.User.Get(ownerId);

                        if (user == null)
                        {
                            // NOTE(review): this early return skips the search button lookup below - confirm that is intended.
                            return portalInfo;
                        }

                        portalInfo.OwnerDisplayName = user.Name ?? ownerId;
                        if (ownerSettings.DisplayGitHubBlogUrl)
                        {
                            portalInfo.Website = user.Blog;
                        }
                        if (ownerSettings.DisplayGitHubAvatar)
                        {
                            portalInfo.LogoUrl = user.AvatarUrl;
                        }
                        if (ownerSettings.DisplayGitHubDescription)
                        {
                            portalInfo.Description = user.Bio;
                        }
                        if (ownerSettings.DisplayGitHubLocation)
                        {
                            portalInfo.Location = user.Location;
                        }
                        if (ownerSettings.DisplayGitHubIssuesLink)
                        {
                            portalInfo.GitHubHtmlUrl = user.HtmlUrl;
                        }
                    }
                }

                ProgressLog.Info("Looking up repository portal search buttons from settings for {0} repository.", repoId);

                var repoSettings = await RepoSettingsStore.GetRepoSettingsAsync(ownerId, repoId);

                var repoSearchButtons = repoSettings?.SearchButtons;
                if (!string.IsNullOrEmpty(repoSearchButtons))
                {
                    portalInfo.RepoSearchButtons = GetSearchButtons(repoSearchButtons);
                }
                return portalInfo;
            }
            catch (Exception)
            {
                // Best effort: a failed lookup is reported and treated as "no portal information".
                ProgressLog.Error("Error when attempting to retrieve portal information from owner settings");
                return null;
            }
        }