public void GetStatisticsBySource()
{
    // Gather summary statistics for every source database: fetch the list
    // of sources, derive a connection string for each, count the records
    // in each ad table, and persist one SourceSummary row per source.
    IEnumerable<Source> sources = logging_repo.RetrieveDataSources();

    // Remove any statistics previously stored for this aggregation event
    // before regenerating them.
    logging_repo.DeleteSameEventDBStats(agg_event_id);
    logging_repo.LogHeader("Statistics for each source database");

    foreach (Source s in sources)
    {
        string conn_string = logging_repo.FetchConnString(s.database_name);

        // One record count per ad table, collected via an object initializer.
        SourceSummary sm = new SourceSummary(agg_event_id, s.database_name)
        {
            study_recs = logging_repo.GetRecNum("studies", conn_string),
            study_identifiers_recs = logging_repo.GetRecNum("study_identifiers", conn_string),
            study_titles_recs = logging_repo.GetRecNum("study_titles", conn_string),
            study_contributors_recs = logging_repo.GetRecNum("study_contributors", conn_string),
            study_topics_recs = logging_repo.GetRecNum("study_topics", conn_string),
            study_features_recs = logging_repo.GetRecNum("study_features", conn_string),
            study_references_recs = logging_repo.GetRecNum("study_references", conn_string),
            study_relationships_recs = logging_repo.GetRecNum("study_relationships", conn_string),
            data_object_recs = logging_repo.GetRecNum("data_objects", conn_string),
            object_datasets_recs = logging_repo.GetRecNum("object_datasets", conn_string),
            object_instances_recs = logging_repo.GetRecNum("object_instances", conn_string),
            object_titles_recs = logging_repo.GetRecNum("object_titles", conn_string),
            object_dates_recs = logging_repo.GetRecNum("object_dates", conn_string),
            object_contributors_recs = logging_repo.GetRecNum("object_contributors", conn_string),
            object_topics_recs = logging_repo.GetRecNum("object_topics", conn_string),
            object_identifiers_recs = logging_repo.GetRecNum("object_identifiers", conn_string),
            object_descriptions_recs = logging_repo.GetRecNum("object_descriptions", conn_string),
            object_rights_recs = logging_repo.GetRecNum("object_rights", conn_string),
            object_relationships_recs = logging_repo.GetRecNum("object_relationships", conn_string)
        };

        logging_repo.StoreSourceSummary(sm);
        logging_repo.LogLine("Summary stats generated for " + s.database_name + " tables");
    }
}
// Top-level orchestration of the aggregation pipeline. Four phases, each
// gated by a command-line option: transfer source data into the st/ob/nk
// schemas, build the core tables, generate statistics, and emit JSON.
// NOTE(review): the method is declared async but contains no await — it runs
// synchronously and returns a completed Task; confirm whether async work was
// intended here.
public async Task AggregateDataAsync(Options opts)
{
    logging_repo.LogParameters(opts);
    DataLayer repo = new DataLayer("mdr");

    // set up the context DB as two sets of foreign tables
    // as it is used in several places
    repo.SetUpTempContextFTWs();

    if (opts.transfer_data)
    {
        // Establish the mdr and logging repo layers.
        logging_repo.LogHeader("Establish aggregate schemas");

        // In the mdr database, establish new tables,
        // for the three schemas st, ob, nk (schemas should already exist)
        SchemaBuilder sb = new SchemaBuilder(repo.ConnString, logging_repo);
        sb.DeleteStudyTables();
        sb.DeleteObjectTables();
        sb.DeleteLinkTables();
        sb.BuildNewStudyTables();
        sb.BuildNewObjectTables();
        sb.BuildNewLinkTables();
        logging_repo.LogLine("Tables created");

        // construct the aggregation event record
        AggregationEvent agg_event = new AggregationEvent(agg_event_id);

        // Derive a new table of inter-study relationships -
        // First get a list of all the study sources and
        // ensure it is sorted correctly (by preference rating, so the
        // most-preferred source for any given study is processed first).
        IEnumerable<Source> sources = logging_repo.RetrieveDataSources()
                                      .OrderBy(s => s.preference_rating);
        logging_repo.LogLine("Sources obtained");
        StudyLinkBuilder slb = new StudyLinkBuilder(repo, logging_repo);
        slb.CollectStudyStudyLinks(sources);
        slb.ProcessStudyStudyLinks();
        logging_repo.LogLine("Study-study links identified");

        // Start the data transfer process
        logging_repo.LogHeader("Data Transfer");

        // Loop through the study sources (in preference order)
        // In each case establish and then drop the source tables
        // in a foreign table wrapper
        int num_studies_imported = 0;
        int num_objects_imported = 0;
        foreach (Source s in sources)
        {
            string schema_name = repo.SetUpTempFTW(s.database_name);
            string conn_string = logging_repo.FetchConnString(s.database_name);
            DataTransferBuilder tb = new DataTransferBuilder(s, schema_name, conn_string, logging_repo);
            if (s.has_study_tables)
            {
                // Study-based source: transfer studies first, then the
                // objects hanging off them.
                tb.ProcessStudyIds();
                num_studies_imported += tb.TransferStudyData();
                tb.ProcessStudyObjectIds();
            }
            else
            {
                // Object-only source (e.g. PubMed): objects are linked to
                // studies via external reference data instead.
                tb.ProcessStandaloneObjectIds();
            }
            num_objects_imported += tb.TransferObjectData();
            repo.DropTempFTW(s.database_name);
        }

        // Also use the study groups to set up study_relationship records
        slb.CreateStudyGroupRecords();

        // Update aggregation event record with import counts and the
        // resulting totals in the aggregate schemas.
        agg_event.num_studies_imported = num_studies_imported;
        agg_event.num_objects_imported = num_objects_imported;
        string mdr_string = logging_repo.FetchConnString("mdr");
        agg_event.num_total_studies = logging_repo.GetAggregateRecNum("studies", "st", mdr_string);
        agg_event.num_total_objects = logging_repo.GetAggregateRecNum("data_objects", "ob", mdr_string);
        agg_event.num_total_study_object_links = logging_repo.GetAggregateRecNum("all_ids_data_objects", "nk", mdr_string);
        logging_repo.StoreAggregationEvent(agg_event);

        // NOTE(review): context FTWs are dropped here AND again at the end of
        // the method — presumably the drop is idempotent, but confirm whether
        // later phases (create_core) still expect them to exist.
        repo.DropTempContextFTWs();
    }

    if (opts.create_core)
    {
        // create core tables
        SchemaBuilder sb = new SchemaBuilder(repo.ConnString, logging_repo);
        logging_repo.LogHeader("Set up");
        sb.DeleteCoreTables();
        sb.BuildNewCoreTables();

        // transfer data to core tables
        DataTransferBuilder tb = new DataTransferBuilder(logging_repo);
        logging_repo.LogHeader("Transferring study data");
        tb.TransferCoreStudyData();
        logging_repo.LogHeader("Transferring object data");
        tb.TransferCoreObjectData();
        logging_repo.LogHeader("Transferring link data");
        tb.TransferCoreLinkData();

        // Include generation of data provenance strings
        // Need an additional temporary FTW link to mon
        logging_repo.LogHeader("Finish");
        repo.SetUpTempFTW("mon");
        tb.GenerateProvenanceData();
        repo.DropTempFTW("mon");
    }

    if (opts.do_statistics)
    {
        // Statistics are generated against the most recent aggregation
        // event, which may not be the one created in this run.
        int last_agg_event_id = logging_repo.GetLastAggEventId();
        StatisticsBuilder stb = new StatisticsBuilder(last_agg_event_id, logging_repo);
        stb.GetStatisticsBySource();
        stb.GetSummaryStatistics();
    }

    if (opts.create_json)
    {
        string conn_string = logging_repo.FetchConnString("mdr");
        JSONHelper jh = new JSONHelper(conn_string, logging_repo);

        // Create json fields.
        // if tables are to be left as they are, add false as
        // an additional boolean (default = true)
        // if tables are to have further data appended add an integer
        // offset that represents the records to skip (default = 0)
        logging_repo.LogHeader("Creating JSON study data");
        jh.CreateJSONStudyData(opts.also_do_files);
        //jh.UpdateJSONStudyData(opts.also_do_files);
        logging_repo.LogHeader("Creating JSON object data");
        jh.CreateJSONObjectData(opts.also_do_files);
        //jh.UpdateJSONObjectData(opts.also_do_files);
    }

    repo.DropTempContextFTWs();
    logging_repo.CloseLog();
}
// Builds object id records for sources that have data objects but no study
// tables of their own. Objects are matched to studies via externally held
// object-study links. Currently only PubMed falls into this category.
public void ProcessStandaloneObjectIds()
{
    // Source id for PubMed — the only standalone object source at present.
    const int pubmed_source_id = 100135;

    ob_tr.SetUpTempObjectIdsTables();

    // process the data using available object-study links
    // (may be multiple study links per object)
    // exact process likely to differ with different standalone
    // object sources - at the moment only PubMed in this category
    if (source.id == pubmed_source_id)
    {
        // Get the source -study- pmid link data
        // A table of PMID bank data was created during data download, but this
        // may have been date limited (probably was) so the total of records
        // in the ad tables needs to be used.
        // This needs to be combined with the references in those sources
        // that contain study_reference tables
        PubmedTransferHelper pm_tr = new PubmedTransferHelper();
        pm_tr.SetupTempPMIDTable();
        pm_tr.SetupDistinctPMIDTable();

        IEnumerable<PMIDLink> bank_object_ids = pm_tr.FetchBankPMIDs();
        pm_tr.StorePMIDLinks(CopyHelpers.pmid_links_helper, bank_object_ids);
        logging_repo.LogLine("PMID bank object Ids obtained");

        // Loop through the study databases that hold
        // study_reference tables, i.e. with pmid ids
        IEnumerable<Source> sources = logging_repo.RetrieveDataSources();
        foreach (Source s in sources)
        {
            if (s.has_study_references)
            {
                IEnumerable<PMIDLink> source_references = pm_tr.FetchSourceReferences(s.id, s.database_name);
                pm_tr.StorePMIDLinks(CopyHelpers.pmid_links_helper, source_references);
            }
        }
        logging_repo.LogLine("PMID source object Ids obtained");

        pm_tr.FillDistinctPMIDsTable();
        pm_tr.DropTempPMIDTable();

        // Try and tidy some of the worst data anomalies
        // before updating the data to the permanent tables.
        pm_tr.CleanPMIDsdsidData1();
        pm_tr.CleanPMIDsdsidData2();
        pm_tr.CleanPMIDsdsidData3();
        pm_tr.CleanPMIDsdsidData4();
        logging_repo.LogLine("PMID Ids cleaned");

        // Transfer data to all_ids_data_objects table.
        pm_tr.TransferPMIDLinksToObjectIds();
        ob_tr.UpdateObjectsWithStudyIds(source.id);
        logging_repo.LogLine("Object Ids matched to study ids");

        // Use study-study link table to get preferred sd_sid
        // then drop any resulting duplicates from study-pmid table
        pm_tr.InputPreferredSDSIDS();

        // add in study-pmid links to all_ids_objects
        ob_tr.UpdateAllObjectIdsTable(source.id);
        logging_repo.LogLine("PMID Ids added to table");

        // use min of ids to set all object ids the same for the same pmid
        pm_tr.ResetIdsOfDuplicatedPMIDs();
        // (fixed typo in log message: was "deduplicatedd")
        logging_repo.LogLine("PMID Ids deduplicated");

        // make new table of distinct pmids to add
        ob_tr.FillObjectsToAddTable(source.id);
        logging_repo.LogLine("PMID Ids processed");
    }
}