public void Update_SourceTable_ExportDate(string schema_name, string table_name)
{
    // Stamps exported_on = CURRENT_TIMESTAMP on every row of the given source
    // table. Large tables are updated in batches of 50,000 ids so the update
    // does not run as a single very large transaction.
    // N.B. schema_name / table_name are concatenated into the SQL — callers
    // must pass trusted, internally generated identifiers only.
    try
    {
        // Max id is used as a proxy for the row count when sizing batches.
        int rec_count = GetMaxId(schema_name, table_name);
        int rec_batch = 50000;
        string sql_string = @"UPDATE " + schema_name + "." + table_name + @" s
                              SET exported_on = CURRENT_TIMESTAMP ";
        if (rec_count > rec_batch)
        {
            for (int r = 1; r <= rec_count; r += rec_batch)
            {
                // Each batch covers ids r .. r + rec_batch - 1 (SQL upper bound exclusive).
                string batch_sql_string = sql_string
                    + " where s.id >= " + r.ToString()
                    + " and s.id < " + (r + rec_batch).ToString();
                ExecuteSQL(batch_sql_string);

                string feedback = "Updated " + schema_name + "." + table_name
                                  + " export date, " + r.ToString() + " to ";
                // Report the true upper id of the batch, capped at rec_count on the
                // final batch. (The original test used 'r + rec_batch < rec_count',
                // omitting the '- 1', which over-reported the end of a full batch
                // whenever rec_count fell exactly on 'r + rec_batch'.)
                feedback += (r + rec_batch - 1 < rec_count)
                    ? (r + rec_batch - 1).ToString()
                    : rec_count.ToString();
                logging_repo.LogLine(feedback);
            }
        }
        else
        {
            ExecuteSQL(sql_string);
            logging_repo.LogLine("Updated " + schema_name + "." + table_name + " export date, as a single batch");
        }
    }
    catch (Exception e)
    {
        logging_repo.LogError("In update export date (" + schema_name + "."
                              + table_name + ") to aggregate table: " + e.Message);
    }
}
public void GetStatisticsBySource()
{
    // Generate and store per-source record counts for this aggregation event.
    // A connection string is derived for each registered source database,
    // each ad table is counted, and the results stored as a SourceSummary.
    IEnumerable<Source> sources = logging_repo.RetrieveDataSources();

    // Remove any stats previously stored against this aggregation event.
    logging_repo.DeleteSameEventDBStats(agg_event_id);
    logging_repo.LogHeader("Statistics for each source database");

    foreach (Source src in sources)
    {
        string db_conn = logging_repo.FetchConnString(src.database_name);
        SourceSummary summary = new SourceSummary(agg_event_id, src.database_name);

        // Counts for the study-related tables.
        summary.study_recs = logging_repo.GetRecNum("studies", db_conn);
        summary.study_identifiers_recs = logging_repo.GetRecNum("study_identifiers", db_conn);
        summary.study_titles_recs = logging_repo.GetRecNum("study_titles", db_conn);
        summary.study_contributors_recs = logging_repo.GetRecNum("study_contributors", db_conn);
        summary.study_topics_recs = logging_repo.GetRecNum("study_topics", db_conn);
        summary.study_features_recs = logging_repo.GetRecNum("study_features", db_conn);
        summary.study_references_recs = logging_repo.GetRecNum("study_references", db_conn);
        summary.study_relationships_recs = logging_repo.GetRecNum("study_relationships", db_conn);

        // Counts for the data-object-related tables.
        summary.data_object_recs = logging_repo.GetRecNum("data_objects", db_conn);
        summary.object_datasets_recs = logging_repo.GetRecNum("object_datasets", db_conn);
        summary.object_instances_recs = logging_repo.GetRecNum("object_instances", db_conn);
        summary.object_titles_recs = logging_repo.GetRecNum("object_titles", db_conn);
        summary.object_dates_recs = logging_repo.GetRecNum("object_dates", db_conn);
        summary.object_contributors_recs = logging_repo.GetRecNum("object_contributors", db_conn);
        summary.object_topics_recs = logging_repo.GetRecNum("object_topics", db_conn);
        summary.object_identifiers_recs = logging_repo.GetRecNum("object_identifiers", db_conn);
        summary.object_descriptions_recs = logging_repo.GetRecNum("object_descriptions", db_conn);
        summary.object_rights_recs = logging_repo.GetRecNum("object_rights", db_conn);
        summary.object_relationships_recs = logging_repo.GetRecNum("object_relationships", db_conn);

        logging_repo.StoreSourceSummary(summary);
        logging_repo.LogLine("Summary stats generated for " + src.database_name + " tables");
    }
}
static async Task<int> RunOptionsAndReturnExitCodeAsync(Options opts)
{
    // Entry point for a single aggregation run.
    // N.B. The aggregation process re-aggregates all the data from scratch.
    // Returns 0 on success, -1 on any unhandled exception.
    LoggingDataLayer logging_repo = new LoggingDataLayer();
    Aggregator ag = new Aggregator(logging_repo);
    logging_repo.OpenLogFile(opts);
    try
    {
        // AggregateDataAsync closes the log itself on the success path.
        await ag.AggregateDataAsync(opts);
        return 0;
    }
    catch (Exception e)
    {
        // Exception.StackTrace and Exception.TargetSite can legitimately be
        // null (e.g. an exception constructed but never thrown) — guard so
        // that the error handler cannot itself throw NullReferenceException.
        logging_repo.LogError("Unhandled exception: " + e.Message);
        logging_repo.LogLine(e.StackTrace ?? "No stack trace available");
        logging_repo.LogLine(e.TargetSite?.Name ?? "No target site available");
        logging_repo.CloseLog();
        return -1;
    }
}
public void CheckStudyLinks()
{
    // Identify any study in nk.temp_study_ids that corresponds to a study
    // already in the all_ids_studies table, linked to it via the
    // study-study link table. Such a study matches the left hand side of
    // the link table (the one to be replaced), and takes on the study_id
    // of the 'preferred' right hand side. That id should already exist
    // because studies are added in 'more preferred first' order.
    // NOTE(review): the original wrapped this in a 'using' over an
    // NpgsqlConnection that was never referenced — db.ExecuteSQL manages
    // its own connection — so the unused connection has been removed.
    string sql_string = @"UPDATE nk.temp_study_ids t
                   SET study_id = s.study_id, is_preferred = false
                   FROM nk.study_study_links k
                        INNER JOIN nk.all_ids_studies s
                        ON k.preferred_sd_sid = s.sd_sid
                        AND k.preferred_source_id = s.source_id
                   WHERE t.sd_sid = k.sd_sid
                   AND t.source_id = k.source_id;";
    int res = db.ExecuteSQL(sql_string);
    logging_repo.LogLine(res.ToString() + " existing studies found");

    // Also create a small table that has just the study_ids and sd_sids for
    // the already existing studies (used in the import of any additional
    // data from these studies).
    sql_string = @"DROP TABLE IF EXISTS nk.existing_studies;
                   CREATE TABLE nk.existing_studies as
                   SELECT sd_sid, study_id
                   FROM nk.temp_study_ids
                   WHERE is_preferred = false";
    db.ExecuteSQL(sql_string);
}
public void LoopThroughStudyRecords(JSONStudyDataLayer repo, int min_id, int max_id, bool also_do_files, int offset)
{
    // Builds a JSON representation of each study with an id between min_id
    // and max_id, stores the linearised json in the database via the
    // processor and — when also_do_files is true — also writes an indented
    // copy to disk, grouped in folders of one batch (10,000 ids) each.
    // NOTE(review): the 'offset' parameter is never used in this body —
    // presumably intended to skip already-processed records; confirm
    // against callers before relying on it.
    JSONStudyProcessor processor = new JSONStudyProcessor(repo);

    // Do 10,000 ids at a time
    int batch = 10000;
    //int batch = 100; // testing
    string folder_path = "";
    int k = 0;   // running total of records processed, for progress logging
    for (int n = min_id; n <= max_id; n += batch)
    {
        if (also_do_files)
        {
            // Create folder for the next batch, obtaining the parent path from repo
            string folder_name = "studies " + n.ToString() + " to " + (n + batch - 1).ToString();
            folder_path = Path.Combine(repo.StudyJsonFolder, folder_name);
            if (!Directory.Exists(folder_path))
            {
                Directory.CreateDirectory(folder_path);
            }
            else
            {
                // Folder already exists (e.g. from a previous run) —
                // first clear files from folder so stale output is removed.
                DirectoryInfo di = new DirectoryInfo(folder_path);
                foreach (FileInfo file in di.EnumerateFiles())
                {
                    file.Delete();
                }
            }
        }

        IEnumerable <int> id_numbers = repo.FetchIds(n, batch);
        foreach (int id in id_numbers)
        {
            // Construct single study object, drawing data from various database tables
            // and serialise to a formatted json string, then store json in the database.
            JSONStudy st = processor.CreateStudyObject(id);
            if (st != null)
            {
                var linear_json = JsonConvert.SerializeObject(st);
                processor.StoreJSONStudyInDB(id, linear_json);
                if (also_do_files)
                {
                    // Indented variant only for the on-disk copy.
                    var formatted_json = JsonConvert.SerializeObject(st, Formatting.Indented);
                    string file_name = "study " + id.ToString() + ".json";
                    string full_path = Path.Combine(folder_path, file_name);
                    File.WriteAllText(full_path, formatted_json);
                }
            }
            k++;
            // Progress feedback every 1,000 records.
            if (k % 1000 == 0)
            {
                logging_repo.LogLine(k.ToString() + " records processed");
            }
        }
    }
}
// Orchestrates an entire aggregation run, driven by the option flags:
// transfer_data (rebuild aggregate st/ob/nk tables and re-import all sources),
// create_core (rebuild and fill the core tables), do_statistics, create_json.
// NOTE(review): there is no try/finally here — if an exception escapes
// mid-way, the temporary foreign table wrappers are not dropped and the
// log is not closed; the caller's handler deals with logging only.
public async Task AggregateDataAsync(Options opts)
{
    logging_repo.LogParameters(opts);
    DataLayer repo = new DataLayer("mdr");

    // set up the context DB as two sets of foreign tables
    // as it is used in several places
    repo.SetUpTempContextFTWs();

    if (opts.transfer_data)
    {
        // Establish the mdr and logging repo layers.
        logging_repo.LogHeader("Establish aggregate schemas");

        // In the mdr database, establish new tables,
        // for the three schemas st, ob, nk (schemas should already exist).
        // All existing aggregate data is deleted first — this is a
        // from-scratch re-aggregation, not an incremental update.
        SchemaBuilder sb = new SchemaBuilder(repo.ConnString, logging_repo);
        sb.DeleteStudyTables();
        sb.DeleteObjectTables();
        sb.DeleteLinkTables();
        sb.BuildNewStudyTables();
        sb.BuildNewObjectTables();
        sb.BuildNewLinkTables();
        logging_repo.LogLine("Tables created");

        // construct the aggregation event record
        AggregationEvent agg_event = new AggregationEvent(agg_event_id);

        // Derive a new table of inter-study relationships -
        // First get a list of all the study sources and
        // ensure it is sorted correctly (by preference rating).
        IEnumerable <Source> sources = logging_repo.RetrieveDataSources()
                                 .OrderBy(s => s.preference_rating);
        logging_repo.LogLine("Sources obtained");
        StudyLinkBuilder slb = new StudyLinkBuilder(repo, logging_repo);
        slb.CollectStudyStudyLinks(sources);
        slb.ProcessStudyStudyLinks();
        logging_repo.LogLine("Study-study links identified");

        // Start the data transfer process
        logging_repo.LogHeader("Data Transfer");

        // Loop through the study sources (in preference order).
        // In each case establish and then drop the source tables
        // in a foreign table wrapper.
        int num_studies_imported = 0;
        int num_objects_imported = 0;
        foreach (Source s in sources)
        {
            string schema_name = repo.SetUpTempFTW(s.database_name);
            string conn_string = logging_repo.FetchConnString(s.database_name);
            DataTransferBuilder tb = new DataTransferBuilder(s, schema_name, conn_string, logging_repo);
            if (s.has_study_tables)
            {
                // Study-based source: process study ids first, then the
                // object ids that hang off those studies.
                tb.ProcessStudyIds();
                num_studies_imported += tb.TransferStudyData();
                tb.ProcessStudyObjectIds();
            }
            else
            {
                // Object-only source (no study tables).
                tb.ProcessStandaloneObjectIds();
            }
            num_objects_imported += tb.TransferObjectData();
            repo.DropTempFTW(s.database_name);
        }

        // Also use the study groups to set up study_relationship records
        slb.CreateStudyGroupRecords();

        // Update aggregation event record with import and total counts.
        agg_event.num_studies_imported = num_studies_imported;
        agg_event.num_objects_imported = num_objects_imported;
        string mdr_string = logging_repo.FetchConnString("mdr");
        agg_event.num_total_studies = logging_repo.GetAggregateRecNum("studies", "st", mdr_string);
        agg_event.num_total_objects = logging_repo.GetAggregateRecNum("data_objects", "ob", mdr_string);
        agg_event.num_total_study_object_links = logging_repo.GetAggregateRecNum("all_ids_data_objects", "nk", mdr_string);
        logging_repo.StoreAggregationEvent(agg_event);
        // NOTE(review): context FTWs are dropped here and again at the end
        // of the method — presumably the drop is idempotent; confirm.
        repo.DropTempContextFTWs();
    }

    if (opts.create_core)
    {
        // create core tables
        SchemaBuilder sb = new SchemaBuilder(repo.ConnString, logging_repo);
        logging_repo.LogHeader("Set up");
        sb.DeleteCoreTables();
        sb.BuildNewCoreTables();

        // transfer data to core tables
        DataTransferBuilder tb = new DataTransferBuilder(logging_repo);
        logging_repo.LogHeader("Transferring study data");
        tb.TransferCoreStudyData();
        logging_repo.LogHeader("Transferring object data");
        tb.TransferCoreObjectData();
        logging_repo.LogHeader("Transferring link data");
        tb.TransferCoreLinkData();

        // Include generation of data provenance strings.
        // Need an additional temporary FTW link to mon.
        logging_repo.LogHeader("Finish");
        repo.SetUpTempFTW("mon");
        tb.GenerateProvenanceData();
        repo.DropTempFTW("mon");
    }

    if (opts.do_statistics)
    {
        // Stats are recorded against the most recent aggregation event.
        int last_agg_event_id = logging_repo.GetLastAggEventId();
        StatisticsBuilder stb = new StatisticsBuilder(last_agg_event_id, logging_repo);
        stb.GetStatisticsBySource();
        stb.GetSummaryStatistics();
    }

    if (opts.create_json)
    {
        string conn_string = logging_repo.FetchConnString("mdr");
        JSONHelper jh = new JSONHelper(conn_string, logging_repo);

        // Create json fields.
        // if tables are to be left as they are, add false as
        // an additional boolean (default = true)
        // if tables are to have further data appended add an integer
        // offset that represents the records to skip (default = 0)
        logging_repo.LogHeader("Creating JSON study data");
        jh.CreateJSONStudyData(opts.also_do_files);
        //jh.UpdateJSONStudyData(opts.also_do_files);
        logging_repo.LogHeader("Creating JSON object data");
        jh.CreateJSONObjectData(opts.also_do_files);
        //jh.UpdateJSONObjectData(opts.also_do_files);
    }

    repo.DropTempContextFTWs();
    logging_repo.CloseLog();
}
public int LoadDataObjects(string schema_name)
{
    // Copies core data object records from the given source schema into the
    // aggregate ob.data_objects table, taking the aggregate object id from
    // the previously populated nk.temp_objects_to_add mapping table.
    // Returns the number of records transferred.
    string sql_string = $@"INSERT INTO ob.data_objects(id,
                    display_title, version, doi, doi_status_id, publication_year,
                    object_class_id, object_type_id, managing_org_id, managing_org,
                    lang_code, access_type_id, access_details, access_details_url,
                    url_last_checked, eosc_category, add_study_contribs,
                    add_study_topics)
                    SELECT t.object_id,
                    s.display_title, s.version, s.doi, s.doi_status_id, s.publication_year,
                    s.object_class_id, s.object_type_id, s.managing_org_id, s.managing_org,
                    s.lang_code, s.access_type_id, s.access_details, s.access_details_url,
                    s.url_last_checked, s.eosc_category, s.add_study_contribs,
                    s.add_study_topics
                    FROM {schema_name}.data_objects s
                    INNER JOIN nk.temp_objects_to_add t
                    on s.sd_oid = t.sd_oid ";

    int transferred = db.ExecuteTransferSQL(sql_string, schema_name, "data_objects", "");
    logging_repo.LogLine("Loaded records - " + transferred.ToString() + " data_objects");

    // Stamp the export date back on the source table's rows.
    db.Update_SourceTable_ExportDate(schema_name, "data_objects");
    return transferred;
}
public void ProcessStudyIds()
{
    // Get the new study data as a set of study records, using the ad
    // database as the source: set up a temporary table holding the sd_sid
    // of every study, then fill it.
    st_tr.SetUpTempStudyIdsTable();
    IEnumerable<StudyId> fetched_ids = st_tr.FetchStudyIds(source.id, source_conn_string);
    logging_repo.LogLine("Study Ids obtained");
    st_tr.StoreStudyIds(CopyHelpers.study_ids_helper, fetched_ids);
    logging_repo.LogLine("Study Ids stored");

    // Check the temp table ids against the study_study links, amend the
    // table to reflect the 'preferred' ids, then back load the correct
    // study ids into the all-ids table.
    st_tr.CheckStudyLinks();
    logging_repo.LogLine("Study Ids checked");
    st_tr.UpdateAllStudyIdsTable(source.id);
    logging_repo.LogLine("Study Ids processed");
}