/// <summary>
/// Copies the uploaded CSV file, and its metadata file when one was supplied, from the
/// job file store into the <c>csv/{datasetId}</c> directory under the repository directory.
/// </summary>
/// <param name="repositoryDirectory">Root directory of the local repository working copy.</param>
/// <param name="datasetId">Dataset identifier; used as the sub-directory name under <c>csv</c>.</param>
/// <param name="csvFileName">File name given to the CSV file inside the repository.</param>
/// <param name="csvFileId">Job file store identifier of the uploaded CSV file.</param>
/// <param name="csvmFileId">Job file store identifier of the uploaded metadata file, or <c>null</c> to skip copying it.</param>
/// <exception cref="WorkerException">Thrown when creating the directory or copying either file fails; wraps the original exception.</exception>
private async Task AddCsvFilesToRepository(string repositoryDirectory, string datasetId, string csvFileName, string csvFileId, string csvmFileId)
{
    try
    {
        ProgressLog.Info("Copying source CSV and metadata files to repository directory csv/{0}", datasetId);
        var datasetCsvDirPath = Path.Combine(repositoryDirectory, "csv", datasetId);

        // Directory.CreateDirectory does nothing when the directory already exists,
        // so no separate existence check is needed.
        Directory.CreateDirectory(datasetCsvDirPath);

        var csvFilePath = Path.Combine(datasetCsvDirPath, csvFileName);

        // Dispose the source stream as well as the destination so the job file store
        // handle is released even when the copy fails part-way through.
        await using (var csvFileStream = await _jobFileStore.GetFileAsync(csvFileId))
        await using (var csvOutStream = File.Open(csvFilePath, FileMode.Create, FileAccess.Write))
        {
            await csvFileStream.CopyToAsync(csvOutStream);
        }

        if (csvmFileId != null)
        {
            // The metadata file sits next to the CSV file, named "<csv file name>-metadata.json".
            var csvmFilePath = csvFilePath + "-metadata.json";
            await using var csvmFileStream = await _jobFileStore.GetFileAsync(csvmFileId);
            await using var csvmOutStream = File.Open(csvmFilePath, FileMode.Create, FileAccess.Write);
            await csvmFileStream.CopyToAsync(csvmOutStream);
        }
    }
    catch (Exception ex)
    {
        Log.Error(ex, "Failed to copy CSV/CSVM files");
        throw new WorkerException(ex, "Failed to copy CSV/CSVM files from upload to Github repository.");
    }
}
/// <summary>
/// Creates a new graph and has a <c>MetdataExtractor</c> generate the column
/// definitions from the supplied metadata JSON into it.
/// </summary>
/// <param name="metadataJson">The parsed CSV metadata document.</param>
/// <returns>The newly created graph passed to the extractor.</returns>
private Graph GenerateDefinitionsGraph(JObject metadataJson)
{
    var columnDefinitions = new Graph();
    var extractor = new MetdataExtractor();

    ProgressLog.Info("Extracting column property definitions");
    extractor.GenerateColumnDefinitions(metadataJson, columnDefinitions);

    return columnDefinitions;
}
/// <summary>
/// Builds the dataset metadata graph: runs the metadata extractor over the metadata JSON,
/// then links the dataset to each download URI (<c>void:dataDump</c>) and to up to ten
/// distinct URI subjects taken from the data graph (<c>void:exampleResource</c>).
/// </summary>
/// <param name="datasetUri">URI of the dataset node that the added statements describe.</param>
/// <param name="publisherIri">Publisher IRI handed to the metadata extractor.</param>
/// <param name="metadataJson">The parsed CSV metadata document.</param>
/// <param name="downloadUris">URIs to assert as data dumps of the dataset.</param>
/// <param name="dataGraph">The converted data graph; supplies the triple count and the example subjects.</param>
/// <returns>The newly created metadata graph.</returns>
private Graph GenerateMetadataGraph(Uri datasetUri, Uri publisherIri, JObject metadataJson, IEnumerable<Uri> downloadUris, IGraph dataGraph)
{
    var result = new Graph();
    var extractor = new MetdataExtractor();

    ProgressLog.Info("Extracting dataset metadata");
    extractor.Run(metadataJson, result, publisherIri, dataGraph.Triples.Count, DateTime.UtcNow);

    var datasetNode = result.CreateUriNode(datasetUri);
    var dataDumpPredicate = result.CreateUriNode(new Uri("http://rdfs.org/ns/void#dataDump"));
    var exampleResourcePredicate = result.CreateUriNode(new Uri("http://rdfs.org/ns/void#exampleResource"));

    // One void:dataDump statement per download link.
    foreach (var downloadUri in downloadUris)
    {
        var downloadNode = result.CreateUriNode(downloadUri);
        result.Assert(datasetNode, dataDumpPredicate, downloadNode);
    }

    // Up to ten distinct URI subjects from the data graph become example resources.
    var sampleSubjects = dataGraph.Triples
        .Select(triple => triple.Subject)
        .OfType<IUriNode>()
        .Distinct()
        .Take(10);
    foreach (var subject in sampleSubjects)
    {
        result.Assert(datasetNode, exampleResourcePredicate, subject);
    }

    return result;
}
/// <summary>
/// Parses the CSV table metadata and runs the CSV to RDF conversion of the file at
/// <paramref name="csvPath"/>, collecting the output in a new graph.
/// </summary>
/// <param name="csvPath">Local path of the CSV file; the first table's URL is resolved to this path.</param>
/// <param name="metadataJson">The parsed CSV metadata document describing the table.</param>
/// <param name="metadataUri">URI passed to the metadata parser for the metadata document.</param>
/// <returns>The graph populated by the conversion.</returns>
/// <exception cref="WorkerException">
/// Thrown when the table metadata cannot be read or is invalid, or when the converter
/// reports one or more errors.
/// </exception>
private async Task<Graph> GenerateDatasetGraphAsync(string csvPath, JObject metadataJson, Uri metadataUri)
{
    var parser = new JsonMetadataParser(null, metadataUri);
    var tableGroup = new TableGroup();
    try
    {
        var tableMeta = parser.ParseTable(tableGroup, metadataJson);
        if (tableMeta == null)
        {
            throw new WorkerException("CSV Conversion failed. Unable to read CSV table metadata.");
        }
    }
    catch (MetadataParseException ex)
    {
        // Pass the parser message as a template property rather than concatenating it into
        // the template: the message may contain '{' / '}' which the logger would otherwise
        // try to interpret as property placeholders.
        Log.Error(ex, "Invalid CSV table metadata: {ErrorMessage}", ex.Message);
        throw new WorkerException(ex, "CSV conversion failed. Invalid CSV table metadata: " + ex.Message);
    }

    var graph = new Graph();
    ProgressLog.Info("Running CSV to RDF conversion");
    var graphHandler = new GraphHandler(graph);

    // Map the URL of the first table to the local CSV file.
    var tableResolver = new LocalTableResolver(tableGroup.Tables[0].Url, csvPath);

    // "this" is handed to the converter together with the report interval; conversion
    // error messages are forwarded to the progress log as they are reported.
    var converter = new Converter(graphHandler, tableResolver, ConverterMode.Minimal, (msg) => ProgressLog.Error(msg), this, reportInterval: CsvConversionReportInterval);
    await converter.ConvertAsync(tableGroup);

    if (converter.Errors.Any())
    {
        foreach (var e in converter.Errors)
        {
            ProgressLog.Error(e);
        }

        throw new WorkerException("One or more errors were encountered during the CSV to RDF conversion.");
    }

    return graph;
}
/// <summary>
/// Looks up the default publisher contact information, first in the repository settings
/// and then, when <paramref name="ownerId"/> is not null, in the owner settings.
/// </summary>
/// <param name="ownerId">Identifier of the repository owner.</param>
/// <param name="repoId">Identifier of the repository.</param>
/// <returns>
/// The first default publisher found, or <c>null</c> when neither settings record has one
/// or when the lookup fails. Lookup failures are logged and never thrown.
/// </returns>
private async Task<ContactInfo> GetPublisherContactInfo(string ownerId, string repoId)
{
    try
    {
        ProgressLog.Info("Attempting to retrieve publisher contact information from repository settings");

        // Repository-level settings take precedence.
        var repoSettings = await RepoSettingsStore.GetRepoSettingsAsync(ownerId, repoId);
        if (repoSettings?.DefaultPublisher != null)
        {
            ProgressLog.Info("Returning publisher from repository settings");
            return repoSettings.DefaultPublisher;
        }

        // No repository-level publisher: fall back to the owner level.
        ProgressLog.Info("No publisher info found in repository settings");
        if (ownerId != null)
        {
            ProgressLog.Info("Attempting to retrieve publisher contact information from repository owner's settings");
            var ownerSettings = await OwnerSettingsStore.GetOwnerSettingsAsync(ownerId);
            if (ownerSettings?.DefaultPublisher != null)
            {
                ProgressLog.Info("Returning publisher from repository owner's settings");
                return ownerSettings.DefaultPublisher;
            }
        }

        // No settings / publisher found for that repository.
        ProgressLog.Info("No publisher info found in repository owner's settings");
        return null;
    }
    catch (Exception ex)
    {
        // The lookup stays best-effort (a missing publisher must not fail the job), but the
        // exception is recorded so the failure can be diagnosed.
        Log.Error(ex, "Error when attempting to retrieve publisher contact information from repository/owner settings");
        ProgressLog.Error("Error when attempting to retrieve publisher contact information from repository/owner settings");
        return null;
    }
}
/// <summary>
/// Runs an import job: clones the repository, copies the uploaded CSV and metadata files
/// into it, converts the CSV to RDF, updates the repository content and HTML pages,
/// commits, pushes and releases the changes, and finally updates the dataset record.
/// </summary>
/// <param name="job">The job description (repository, dataset and uploaded file identifiers).</param>
/// <param name="userAccount">The account on whose behalf the job runs; passed to the clone and commit operations.</param>
/// <exception cref="WorkerException">
/// Thrown when updating the dataset record fails. The helper methods called here for
/// copying the files and converting the CSV also throw this type on failure.
/// </exception>
protected override async Task RunJob(JobInfo job, UserAccount userAccount)
{
    ProgressLog.Info("Starting import job processing for {0}", userAccount.UserId);

    var targetDirectory = Path.Combine(_configuration.RepoBaseDir, job.JobId);
    Log.Information("Using local directory {localDirPath}", targetDirectory);

    // Clone the repository
    await _git.CloneRepository(job.GitRepositoryUrl, targetDirectory, AuthenticationToken, userAccount);

    // Retrieve CSV and CSVM files to the csv/{datasetId} directory in the repository
    await AddCsvFilesToRepository(targetDirectory, job.DatasetId, job.CsvFileName, job.CsvFileId, job.CsvmFileId);
    var csvPath = Path.Combine(targetDirectory, "csv", job.DatasetId, job.CsvFileName);
    var metaPath = Path.Combine(targetDirectory, "csv", job.DatasetId, job.CsvFileName + "-metadata.json");

    // Parse the JSON metadata (read asynchronously so the worker thread is not blocked on file I/O)
    JObject metadataJson;
    using (var metadataReader = File.OpenText(metaPath))
    {
        var metadataString = await metadataReader.ReadToEndAsync();
        metadataJson = JObject.Parse(metadataString);
    }

    // Work out the identifiers and download links used by the conversion and the release
    var repositoryUri = new Uri(_dataDockUriService.GetRepositoryUri(job.OwnerId, job.RepositoryId));
    var publisherIri = new Uri(_dataDockUriService.GetRepositoryPublisherIdentifier(job.OwnerId, job.RepositoryId));
    var datasetUri = new Uri(job.DatasetIri);
    var datasetMetadataGraphIri = new Uri(datasetUri + "/metadata");
    var rootMetadataGraphIri = new Uri(_dataDockUriService.GetMetadataGraphIdentifier(job.OwnerId, job.RepositoryId));
    var definitionsGraphIri = new Uri(_dataDockUriService.GetDefinitionsGraphIdentifier(job.OwnerId, job.RepositoryId));
    var dateTag = DateTime.UtcNow.ToString("yyyyMMdd_HHmmss");
    var releaseTag = MakeSafeTag(job.DatasetId + "_" + dateTag);
    var publisher = await GetPublisherContactInfo(job.OwnerId, job.RepositoryId);
    var ntriplesDownloadLink = new Uri($"https://github.com/{job.OwnerId}/{job.RepositoryId}/releases/download/{releaseTag}/{releaseTag}.nt.gz");
    var csvDownloadLink = new Uri(repositoryUri + $"csv/{job.DatasetId}/{job.CsvFileName}");

    // Log the CSV header line for diagnostics
    using (var tmpReader = File.OpenText(csvPath))
    {
        var header = await tmpReader.ReadLineAsync();
        Log.Information("CSV header: {CsvHeader}", header);
    }

    // Run the CSV to RDF conversion and build the accompanying metadata and definitions graphs
    var metadataBaseUri = new Uri(datasetUri + "/csv/" + job.CsvFileName + "-metadata.json");
    IGraph datasetGraph = await GenerateDatasetGraphAsync(csvPath, metadataJson, metadataBaseUri);
    IGraph metadataGraph = GenerateMetadataGraph(datasetUri, publisherIri, metadataJson, new[] { ntriplesDownloadLink, csvDownloadLink }, datasetGraph);
    IGraph definitionsGraph = GenerateDefinitionsGraph(metadataJson);

    // Write the graphs into the repository and refresh the affected HTML pages
    var dataDataDockRepository = _dataDataDockRepositoryFactory.GetRepositoryForJob(job, ProgressLog);
    dataDataDockRepository.UpdateDataset(
        datasetGraph, datasetUri, job.OverwriteExistingData,
        metadataGraph, datasetMetadataGraphIri,
        definitionsGraph, definitionsGraphIri,
        publisherIri, publisher,
        "", "",
        rootMetadataGraphIri);
    await UpdateHtmlPagesAsync(dataDataDockRepository, new[] { datasetUri, datasetMetadataGraphIri, rootMetadataGraphIri });

    // Add and Commit all changes; push and release only when the commit call returns true
    if (await _git.CommitChanges(targetDirectory, $"Added {job.CsvFileName} to dataset {job.DatasetIri}", userAccount))
    {
        await _git.PushChanges(job.GitRepositoryUrl, targetDirectory, AuthenticationToken);
        await _git.MakeRelease(datasetGraph, releaseTag, job.OwnerId, job.RepositoryId, job.DatasetId, targetDirectory, AuthenticationToken);
    }

    // Update the dataset repository
    try
    {
        var voidMetadataJson = ExtractVoidMetadata(metadataGraph);
        var datasetInfo = new DatasetInfo
        {
            OwnerId = job.OwnerId,
            RepositoryId = job.RepositoryId,
            DatasetId = job.DatasetId,
            LastModified = DateTime.UtcNow,
            CsvwMetadata = metadataJson,
            VoidMetadata = voidMetadataJson,
            ShowOnHomePage = job.IsPublic,
            Tags = metadataJson["dcat:keyword"]?.ToObject<List<string>>()
        };
        await _datasetStore.CreateOrUpdateDatasetRecordAsync(datasetInfo);
        ProgressLog.DatasetUpdated(datasetInfo);
    }
    catch (Exception ex)
    {
        Log.Error(ex, "Failed to update dataset record");
        throw new WorkerException(ex, "Failed to update dataset record. Your repository is updated, but may not show in the portal.");
    }
}
/// <summary>
/// Writes a progress message with the number of rows processed so far.
/// </summary>
/// <param name="value">Row count inserted into the progress message.</param>
// NOTE(review): presumably this is the progress callback invoked by the CSV converter
// (this instance is passed to its constructor) — confirm against the class declaration.
public void Report(int value) =>
    ProgressLog.Info("CSV conversion processed {0} rows", value);
/// <summary>
/// Builds the portal information for an owner/repository from the owner settings, the
/// owner's GitHub organization or user profile, and the repository's search button settings.
/// </summary>
/// <param name="ownerId">Identifier (GitHub login) of the repository owner; when <c>null</c> nothing is looked up.</param>
/// <param name="repoId">Identifier of the repository.</param>
/// <param name="authenticationToken">Token used to create the GitHub client.</param>
/// <returns>
/// The populated portal information, or <c>null</c> when <paramref name="ownerId"/> is
/// <c>null</c> or the lookup fails. Lookup failures are logged and never thrown. When the
/// GitHub organization/user lookup returns <c>null</c>, the partially populated portal
/// information is returned without the search buttons.
/// </returns>
private async Task<PortalInfoDrop> GetPortalSettingsInfo(string ownerId, string repoId, string authenticationToken)
{
    try
    {
        ProgressLog.Info("Attempting to retrieve portal settings information from owner settings");
        if (ownerId == null)
        {
            // no settings
            ProgressLog.Info("No owner settings found");
            return null;
        }

        var portalInfo = new PortalInfoDrop { OwnerId = ownerId, RepositoryName = repoId };
        ProgressLog.Info("Attempting to retrieve portal settings information from repository owner's settings");
        var ownerSettings = await OwnerSettingsStore.GetOwnerSettingsAsync(ownerId);
        if (ownerSettings != null)
        {
            portalInfo.IsOrg = ownerSettings.IsOrg;
            portalInfo.ShowDashboardLink = ownerSettings.DisplayDataDockLink;
            if (!string.IsNullOrEmpty(ownerSettings.TwitterHandle))
            {
                portalInfo.Twitter = ownerSettings.TwitterHandle;
            }

            // Each profile field is only copied when the owner has opted in to displaying it.
            var client = GitHubClientFactory.CreateClient(authenticationToken);
            if (ownerSettings.IsOrg)
            {
                var org = await client.Organization.Get(ownerId);
                if (org == null)
                {
                    return portalInfo;
                }

                portalInfo.OwnerDisplayName = org.Name ?? ownerId;
                if (ownerSettings.DisplayGitHubBlogUrl)
                {
                    portalInfo.Website = org.Blog;
                }
                if (ownerSettings.DisplayGitHubAvatar)
                {
                    portalInfo.LogoUrl = org.AvatarUrl;
                }
                if (ownerSettings.DisplayGitHubDescription)
                {
                    portalInfo.Description = org.Bio;
                }
                if (ownerSettings.DisplayGitHubLocation)
                {
                    portalInfo.Location = org.Location;
                }
                if (ownerSettings.DisplayGitHubIssuesLink)
                {
                    portalInfo.GitHubHtmlUrl = org.HtmlUrl;
                }
            }
            else
            {
                var user = await client.User.Get(ownerId);
                if (user == null)
                {
                    return portalInfo;
                }

                portalInfo.OwnerDisplayName = user.Name ?? ownerId;
                if (ownerSettings.DisplayGitHubBlogUrl)
                {
                    portalInfo.Website = user.Blog;
                }
                if (ownerSettings.DisplayGitHubAvatar)
                {
                    portalInfo.LogoUrl = user.AvatarUrl;
                }
                if (ownerSettings.DisplayGitHubDescription)
                {
                    portalInfo.Description = user.Bio;
                }
                if (ownerSettings.DisplayGitHubLocation)
                {
                    portalInfo.Location = user.Location;
                }
                if (ownerSettings.DisplayGitHubIssuesLink)
                {
                    portalInfo.GitHubHtmlUrl = user.HtmlUrl;
                }
            }
        }

        ProgressLog.Info("Looking up repository portal search buttons from settings for {0} repository.", repoId);
        var repoSettings = await RepoSettingsStore.GetRepoSettingsAsync(ownerId, repoId);
        var repoSearchButtons = repoSettings?.SearchButtons;
        if (!string.IsNullOrEmpty(repoSearchButtons))
        {
            portalInfo.RepoSearchButtons = GetSearchButtons(repoSearchButtons);
        }

        return portalInfo;
    }
    catch (Exception ex)
    {
        // The lookup stays best-effort, but the exception is recorded so the failure can be diagnosed.
        Log.Error(ex, "Error when attempting to retrieve portal information from owner settings");
        ProgressLog.Error("Error when attempting to retrieve portal information from owner settings");
        return null;
    }
}