// For every data source, counts the records held in each of its study and
// data object tables and stores the counts as a SourceSummary tagged with
// the current aggregation event id.
public void GetStatisticsBySource()
        {
            // Get the list of sources.
            IEnumerable<Source> all_sources = logging_repo.RetrieveDataSources();

            // Judging by its name, this removes statistics already stored for
            // the same aggregation event, so that a re-run does not duplicate them.
            logging_repo.DeleteSameEventDBStats(agg_event_id);
            logging_repo.LogHeader("Statistics for each source database");

            foreach (Source src in all_sources)
            {
                // Each source lives in its own database, so derive the
                // connection string for it before counting anything.
                string source_conn = logging_repo.FetchConnString(src.database_name);

                var summary = new SourceSummary(agg_event_id, src.database_name)
                {
                    // record counts for the study tables
                    study_recs               = logging_repo.GetRecNum("studies", source_conn),
                    study_identifiers_recs   = logging_repo.GetRecNum("study_identifiers", source_conn),
                    study_titles_recs        = logging_repo.GetRecNum("study_titles", source_conn),
                    study_contributors_recs  = logging_repo.GetRecNum("study_contributors", source_conn),
                    study_topics_recs        = logging_repo.GetRecNum("study_topics", source_conn),
                    study_features_recs      = logging_repo.GetRecNum("study_features", source_conn),
                    study_references_recs    = logging_repo.GetRecNum("study_references", source_conn),
                    study_relationships_recs = logging_repo.GetRecNum("study_relationships", source_conn),

                    // record counts for the data object tables
                    data_object_recs          = logging_repo.GetRecNum("data_objects", source_conn),
                    object_datasets_recs      = logging_repo.GetRecNum("object_datasets", source_conn),
                    object_instances_recs     = logging_repo.GetRecNum("object_instances", source_conn),
                    object_titles_recs        = logging_repo.GetRecNum("object_titles", source_conn),
                    object_dates_recs         = logging_repo.GetRecNum("object_dates", source_conn),
                    object_contributors_recs  = logging_repo.GetRecNum("object_contributors", source_conn),
                    object_topics_recs        = logging_repo.GetRecNum("object_topics", source_conn),
                    object_identifiers_recs   = logging_repo.GetRecNum("object_identifiers", source_conn),
                    object_descriptions_recs  = logging_repo.GetRecNum("object_descriptions", source_conn),
                    object_rights_recs        = logging_repo.GetRecNum("object_rights", source_conn),
                    object_relationships_recs = logging_repo.GetRecNum("object_relationships", source_conn)
                };

                // Store the completed summary and note it in the log.
                logging_repo.StoreSourceSummary(summary);
                logging_repo.LogLine("Summary stats generated for " + src.database_name + " tables");
            }
        }
        // Runs the aggregation stages selected in opts, always in this order:
        //   1. opts.transfer_data - rebuild the st, ob and nk tables and import
        //                           the data from each source database
        //   2. opts.create_core   - rebuild the core tables and fill them
        //   3. opts.do_statistics - generate per-source and summary statistics
        //   4. opts.create_json   - generate the JSON study and object data
        //
        // The context foreign tables are set up once before the first stage
        // and dropped once after the last one, after which the log is closed.
        //
        // NOTE: the body contains no await, so although the method is declared
        // async all of the work runs synchronously before the Task is returned.
        // The signature is left unchanged for the benefit of existing callers.
        public async Task AggregateDataAsync(Options opts)
        {
            logging_repo.LogParameters(opts);
            DataLayer repo = new DataLayer("mdr");

            // set up the context DB as two sets of foreign tables
            // as it is used in several places
            repo.SetUpTempContextFTWs();


            if (opts.transfer_data)
            {
                // Establish the mdr and logging repo layers.
                logging_repo.LogHeader("Establish aggregate schemas");

                // In the mdr database, establish new tables,
                // for the three schemas st, ob, nk (schemas should already exist)
                SchemaBuilder sb = new SchemaBuilder(repo.ConnString, logging_repo);
                sb.DeleteStudyTables();
                sb.DeleteObjectTables();
                sb.DeleteLinkTables();

                sb.BuildNewStudyTables();
                sb.BuildNewObjectTables();
                sb.BuildNewLinkTables();
                logging_repo.LogLine("Tables created");

                // construct the aggregation event record
                AggregationEvent agg_event = new AggregationEvent(agg_event_id);

                // Derive a new table of inter-study relationships -
                // First get a list of all the study sources and
                // ensure it is sorted correctly.
                // The sorted list is materialised once: it is enumerated both
                // by the link builder and by the transfer loop below, and a
                // deferred OrderBy would otherwise be re-evaluated each time.
                IEnumerable <Source> sources = logging_repo.RetrieveDataSources()
                                               .OrderBy(s => s.preference_rating)
                                               .ToList();
                logging_repo.LogLine("Sources obtained");

                StudyLinkBuilder slb = new StudyLinkBuilder(repo, logging_repo);
                slb.CollectStudyStudyLinks(sources);
                slb.ProcessStudyStudyLinks();
                logging_repo.LogLine("Study-study links identified");

                // Start the data transfer process
                logging_repo.LogHeader("Data Transfer");

                // Loop through the study sources (in preference order)
                // In each case establish and then drop the source tables
                // in a foreign table wrapper
                int num_studies_imported = 0;
                int num_objects_imported = 0;

                foreach (Source s in sources)
                {
                    string schema_name     = repo.SetUpTempFTW(s.database_name);
                    string conn_string     = logging_repo.FetchConnString(s.database_name);
                    DataTransferBuilder tb = new DataTransferBuilder(s, schema_name, conn_string, logging_repo);
                    if (s.has_study_tables)
                    {
                        tb.ProcessStudyIds();
                        num_studies_imported += tb.TransferStudyData();
                        tb.ProcessStudyObjectIds();
                    }
                    else
                    {
                        // Sources without study tables only contribute objects.
                        tb.ProcessStandaloneObjectIds();
                    }
                    num_objects_imported += tb.TransferObjectData();
                    repo.DropTempFTW(s.database_name);
                }

                // Also use the study groups to set up study_relationship records
                slb.CreateStudyGroupRecords();

                // Update aggregation event record.

                agg_event.num_studies_imported = num_studies_imported;
                agg_event.num_objects_imported = num_objects_imported;

                string mdr_string = logging_repo.FetchConnString("mdr");
                agg_event.num_total_studies            = logging_repo.GetAggregateRecNum("studies", "st", mdr_string);
                agg_event.num_total_objects            = logging_repo.GetAggregateRecNum("data_objects", "ob", mdr_string);
                agg_event.num_total_study_object_links = logging_repo.GetAggregateRecNum("all_ids_data_objects", "nk", mdr_string);

                logging_repo.StoreAggregationEvent(agg_event);

                // The context foreign tables are deliberately NOT dropped here.
                // They are set up unconditionally at the top of the method, so
                // they stay in place for the stages below and are dropped
                // exactly once at the end of the method.
            }


            if (opts.create_core)
            {
                // create core tables
                SchemaBuilder sb = new SchemaBuilder(repo.ConnString, logging_repo);
                logging_repo.LogHeader("Set up");
                sb.DeleteCoreTables();
                sb.BuildNewCoreTables();

                // transfer data to core tables
                DataTransferBuilder tb = new DataTransferBuilder(logging_repo);
                logging_repo.LogHeader("Transferring study data");
                tb.TransferCoreStudyData();
                logging_repo.LogHeader("Transferring object data");
                tb.TransferCoreObjectData();
                logging_repo.LogHeader("Transferring link data");
                tb.TransferCoreLinkData();

                // Include generation of data provenance strings
                // Need an additional temporary FTW link to mon
                logging_repo.LogHeader("Finish");
                repo.SetUpTempFTW("mon");
                tb.GenerateProvenanceData();
                repo.DropTempFTW("mon");
            }


            if (opts.do_statistics)
            {
                // Statistics are attached to the most recent aggregation event
                // as reported by the logging repository.
                int last_agg_event_id = logging_repo.GetLastAggEventId();
                StatisticsBuilder stb = new StatisticsBuilder(last_agg_event_id, logging_repo);
                stb.GetStatisticsBySource();
                stb.GetSummaryStatistics();
            }


            if (opts.create_json)
            {
                string     conn_string = logging_repo.FetchConnString("mdr");
                JSONHelper jh          = new JSONHelper(conn_string, logging_repo);

                // Create json fields.

                // if tables are to be left as they are, add false as
                // an additional boolean (default = true)
                // if tables are to have further data appended add an integer
                // offset that represents the records to skip (default = 0)

                logging_repo.LogHeader("Creating JSON study data");
                jh.CreateJSONStudyData(opts.also_do_files);
                //jh.UpdateJSONStudyData(opts.also_do_files);
                logging_repo.LogHeader("Creating JSON object data");
                jh.CreateJSONObjectData(opts.also_do_files);
                //jh.UpdateJSONObjectData(opts.also_do_files);
            }

            // Single tear-down point for the context foreign tables,
            // matching the single set-up call at the top.
            repo.DropTempContextFTWs();
            logging_repo.CloseLog();
        }
// Example #3
// 0
        // Builds the object id records for a source that has no study tables
        // of its own, using whatever object-study links are available.
        // The temporary object id tables are always set up; the remaining
        // processing currently only runs for the PubMed source.
        public void ProcessStandaloneObjectIds()
        {
            // Id of the PubMed source - at the moment the only standalone
            // object source that is processed here.
            const int pubmed_source_id = 100135;

            ob_tr.SetUpTempObjectIdsTables();

            // process the data using available object-study links
            // (may be multiple study links per object)
            // exact process likely to differ with different standalone
            // object sources - at the moment only PubMed in this category

            if (source.id != pubmed_source_id)
            {
                return;
            }

            // Get the source -study- pmid link data
            // A table of PMID bank data was created during data download, but this
            // may have been date limited (probably was) so the total of records
            // in the ad tables needs to be used.
            // This needs to be combined with the references in those sources
            // that contain study_reference tables

            PubmedTransferHelper pm_tr = new PubmedTransferHelper();
            pm_tr.SetupTempPMIDTable();
            pm_tr.SetupDistinctPMIDTable();

            IEnumerable <PMIDLink> bank_object_ids = pm_tr.FetchBankPMIDs();
            pm_tr.StorePMIDLinks(CopyHelpers.pmid_links_helper, bank_object_ids);
            logging_repo.LogLine("PMID bank object Ids obtained");

            // Loop through the study databases that hold
            // study_reference tables, i.e. with pmid ids
            IEnumerable <Source> sources = logging_repo.RetrieveDataSources();
            foreach (Source s in sources)
            {
                if (!s.has_study_references)
                {
                    continue;
                }

                IEnumerable <PMIDLink> source_references = pm_tr.FetchSourceReferences(s.id, s.database_name);
                pm_tr.StorePMIDLinks(CopyHelpers.pmid_links_helper, source_references);
            }
            logging_repo.LogLine("PMID source object Ids obtained");

            // Both sets of links are now stored, so the distinct table
            // can be filled and the temporary table dropped.
            pm_tr.FillDistinctPMIDsTable();
            pm_tr.DropTempPMIDTable();

            // Try and tidy some of the worst data anomalies
            // before updating the data to the permanent tables.

            pm_tr.CleanPMIDsdsidData1();
            pm_tr.CleanPMIDsdsidData2();
            pm_tr.CleanPMIDsdsidData3();
            pm_tr.CleanPMIDsdsidData4();
            logging_repo.LogLine("PMID Ids cleaned");

            // Transfer data to all_ids_data_objects table.

            pm_tr.TransferPMIDLinksToObjectIds();
            ob_tr.UpdateObjectsWithStudyIds(source.id);
            logging_repo.LogLine("Object Ids matched to study ids");

            // Use study-study link table to get preferred sd_sid
            // then drop any resulting duplicates from study-pmid table
            pm_tr.InputPreferredSDSIDS();

            // add in study-pmid links to all_ids_objects
            ob_tr.UpdateAllObjectIdsTable(source.id);
            logging_repo.LogLine("PMID Ids added to table");

            // use min of ids to set all object ids the same for the same pmid
            pm_tr.ResetIdsOfDuplicatedPMIDs();
            logging_repo.LogLine("PMID Ids deduplicated");

            // make new table of distinct pmids to add
            ob_tr.FillObjectsToAddTable(source.id);
            logging_repo.LogLine("PMID Ids processed");
        }