Example #1
0
        public void Update_SourceTable_ExportDate(string schema_name, string table_name)
        {
            // Stamps exported_on with the current timestamp for every row of the
            // named source table. Large tables are updated in id-ranged batches
            // so that each UPDATE statement stays a manageable size.
            try
            {
                int max_id     = GetMaxId(schema_name, table_name);
                int batch_size = 50000;
                string sql_string = @"UPDATE " + schema_name + "." + table_name + @" s
                                      SET exported_on = CURRENT_TIMESTAMP ";

                if (max_id <= batch_size)
                {
                    // Small table - a single statement will do.
                    ExecuteSQL(sql_string);
                    logging_repo.LogLine("Updated " + schema_name + "." + table_name + " export date, as a single batch");
                }
                else
                {
                    // Walk the id range in half-open [start, start + batch_size) windows.
                    for (int start = 1; start <= max_id; start += batch_size)
                    {
                        ExecuteSQL(sql_string + " where s.id >= " + start.ToString() + " and s.id < " + (start + batch_size).ToString());
                        string feedback = "Updated " + schema_name + "." + table_name + " export date, " + start.ToString() + " to ";
                        feedback += (start + batch_size < max_id) ? (start + batch_size - 1).ToString() : max_id.ToString();
                        logging_repo.LogLine(feedback);
                    }
                }
            }
            catch (Exception e)
            {
                string res = e.Message;
                logging_repo.LogError("In update export date (" + schema_name + "." + table_name + ") to aggregate table: " + res);
            }
        }
        public void GetStatisticsBySource()
        {
            // For each registered data source: derive its connection string,
            // count the records in every study and object table, and store
            // the resulting summary against the current aggregation event.
            IEnumerable <Source> sources = logging_repo.RetrieveDataSources();

            // Remove any stats previously recorded for this aggregation event
            // before writing fresh ones.
            logging_repo.DeleteSameEventDBStats(agg_event_id);
            logging_repo.LogHeader("Statistics for each source database");

            foreach (Source src in sources)
            {
                string cs = logging_repo.FetchConnString(src.database_name);
                SourceSummary summary = new SourceSummary(agg_event_id, src.database_name);

                // Study side tables
                summary.study_recs               = logging_repo.GetRecNum("studies", cs);
                summary.study_identifiers_recs   = logging_repo.GetRecNum("study_identifiers", cs);
                summary.study_titles_recs        = logging_repo.GetRecNum("study_titles", cs);
                summary.study_contributors_recs  = logging_repo.GetRecNum("study_contributors", cs);
                summary.study_topics_recs        = logging_repo.GetRecNum("study_topics", cs);
                summary.study_features_recs      = logging_repo.GetRecNum("study_features", cs);
                summary.study_references_recs    = logging_repo.GetRecNum("study_references", cs);
                summary.study_relationships_recs = logging_repo.GetRecNum("study_relationships", cs);

                // Data object side tables
                summary.data_object_recs          = logging_repo.GetRecNum("data_objects", cs);
                summary.object_datasets_recs      = logging_repo.GetRecNum("object_datasets", cs);
                summary.object_instances_recs     = logging_repo.GetRecNum("object_instances", cs);
                summary.object_titles_recs        = logging_repo.GetRecNum("object_titles", cs);
                summary.object_dates_recs         = logging_repo.GetRecNum("object_dates", cs);
                summary.object_contributors_recs  = logging_repo.GetRecNum("object_contributors", cs);
                summary.object_topics_recs        = logging_repo.GetRecNum("object_topics", cs);
                summary.object_identifiers_recs   = logging_repo.GetRecNum("object_identifiers", cs);
                summary.object_descriptions_recs  = logging_repo.GetRecNum("object_descriptions", cs);
                summary.object_rights_recs        = logging_repo.GetRecNum("object_rights", cs);
                summary.object_relationships_recs = logging_repo.GetRecNum("object_relationships", cs);

                logging_repo.StoreSourceSummary(summary);
                logging_repo.LogLine("Summary stats generated for " + src.database_name + " tables");
            }
        }
Example #3
0
        static async Task <int> RunOptionsAndReturnExitCodeAsync(Options opts)
        {
            // Entry point for an aggregation run. Returns 0 on success, -1 on
            // any unhandled exception (after logging it and closing the log).
            // N.B. The aggregation process re-aggregates all the data from scratch.

            LoggingDataLayer logging_repo = new LoggingDataLayer();
            Aggregator       ag           = new Aggregator(logging_repo);

            logging_repo.OpenLogFile(opts);

            try
            {
                // On the success path AggregateDataAsync closes the log itself.
                await ag.AggregateDataAsync(opts);

                return 0;
            }
            catch (Exception e)
            {
                // Both StackTrace and TargetSite can legitimately be null
                // (e.g. exceptions thrown from dynamically emitted code);
                // previously this handler could itself throw a
                // NullReferenceException, hiding the original error and
                // leaving the log unclosed. Null-safe access fixes that.
                logging_repo.LogError("Unhandled exception: " + e.Message);
                logging_repo.LogLine(e.StackTrace ?? "No stack trace available");
                logging_repo.LogLine(e.TargetSite?.Name ?? "Unknown target site");
                logging_repo.CloseLog();
                return -1;
            }
        }
Example #4
0
        public void CheckStudyLinks()
        {
            // Does any study id correspond to a study already in the
            // all_ids_studies table, that is linked to it via the study-study
            // link table? Such a study will match the left hand side of the
            // study-study link table (the one to be replaced), and take on the
            // study_id used for the 'preferred' right hand side. This should
            // already exist because addition of studies is done in the order
            // 'more preferred first'.
            // N.B. The NpgsqlConnection previously constructed here was never
            // used - all statements run through the db helper - so the
            // redundant using block has been removed.

            string sql_string = @"UPDATE nk.temp_study_ids t
                       SET study_id = s.study_id, is_preferred = false
                       FROM nk.study_study_links k
                            INNER JOIN nk.all_ids_studies s
                            ON k.preferred_sd_sid = s.sd_sid
                            AND k.preferred_source_id = s.source_id
                       WHERE t.sd_sid = k.sd_sid
                       AND t.source_id =  k.source_id;";
            int res = db.ExecuteSQL(sql_string);
            logging_repo.LogLine(res.ToString() + " existing studies found");

            // Also create a small table that has just the study_ids and sd_sids
            // for the already existing studies (used in the import of any
            // additional data from these studies).

            sql_string = @"DROP TABLE IF EXISTS nk.existing_studies;
                           CREATE TABLE nk.existing_studies as 
                                   SELECT sd_sid, study_id
                                   FROM nk.temp_study_ids
                                   WHERE is_preferred = false";
            db.ExecuteSQL(sql_string);
        }
Example #5
0
        public void LoopThroughStudyRecords(JSONStudyDataLayer repo, int min_id, int max_id, bool also_do_files, int offset)
        {
            // Serialises every study with an id in [min_id, max_id] to json:
            // the compact form is always stored in the database, and when
            // also_do_files is set an indented copy is also written to disk,
            // one folder per batch of ids.
            JSONStudyProcessor processor = new JSONStudyProcessor(repo);

            // Do 10,000 ids at a time
            int batch = 10000;
            //int batch = 100;  // testing

            string folder_path = "";
            int processed = 0;

            for (int start_id = min_id; start_id <= max_id; start_id += batch)
            {
                if (also_do_files)
                {
                    // Make sure an empty folder exists for this batch,
                    // taking the parent path from the repo.
                    string folder_name = "studies " + start_id.ToString() + " to " + (start_id + batch - 1).ToString();
                    folder_path = Path.Combine(repo.StudyJsonFolder, folder_name);
                    if (Directory.Exists(folder_path))
                    {
                        // Folder left over from an earlier run - clear its files first.
                        foreach (FileInfo file in new DirectoryInfo(folder_path).EnumerateFiles())
                        {
                            file.Delete();
                        }
                    }
                    else
                    {
                        Directory.CreateDirectory(folder_path);
                    }
                }

                foreach (int id in repo.FetchIds(start_id, batch))
                {
                    // Construct a single study object, drawing data from the
                    // various database tables, serialise it to a json string
                    // and store that json in the database.
                    JSONStudy st = processor.CreateStudyObject(id);
                    if (st != null)
                    {
                        processor.StoreJSONStudyInDB(id, JsonConvert.SerializeObject(st));
                        if (also_do_files)
                        {
                            string file_name = "study " + id.ToString() + ".json";
                            File.WriteAllText(Path.Combine(folder_path, file_name),
                                              JsonConvert.SerializeObject(st, Formatting.Indented));
                        }
                    }

                    processed++;
                    if (processed % 1000 == 0)
                    {
                        logging_repo.LogLine(processed.ToString() + " records processed");
                    }
                }
            }
        }
        public async Task AggregateDataAsync(Options opts)
        {
            // Top-level driver for the aggregation pipeline. Each stage runs
            // only when its option flag is set: (1) transfer source data into
            // fresh aggregate tables, (2) rebuild the core tables, (3) generate
            // statistics, (4) create json versions of the data.
            logging_repo.LogParameters(opts);
            DataLayer repo = new DataLayer("mdr");

            // set up the context DB as two sets of foreign tables
            // as it is used in several places
            repo.SetUpTempContextFTWs();


            if (opts.transfer_data)
            {
                // Establish the mdr and logging repo layers.
                logging_repo.LogHeader("Establish aggregate schemas");

                // In the mdr database, establish new tables,
                // for the three schemas st, ob, nk (schemas should already exist).
                // The aggregation is a full rebuild, so existing tables are
                // dropped first.
                SchemaBuilder sb = new SchemaBuilder(repo.ConnString, logging_repo);
                sb.DeleteStudyTables();
                sb.DeleteObjectTables();
                sb.DeleteLinkTables();

                sb.BuildNewStudyTables();
                sb.BuildNewObjectTables();
                sb.BuildNewLinkTables();
                logging_repo.LogLine("Tables created");

                // construct the aggregation event record
                AggregationEvent agg_event = new AggregationEvent(agg_event_id);

                // Derive a new table of inter-study relationships -
                // First get a list of all the study sources and
                // ensure it is sorted correctly (most preferred source first,
                // which the transfer loop below depends on).

                IEnumerable <Source> sources = logging_repo.RetrieveDataSources()
                                               .OrderBy(s => s.preference_rating);
                logging_repo.LogLine("Sources obtained");

                StudyLinkBuilder slb = new StudyLinkBuilder(repo, logging_repo);
                slb.CollectStudyStudyLinks(sources);
                slb.ProcessStudyStudyLinks();
                logging_repo.LogLine("Study-study links identified");

                // Start the data transfer process
                logging_repo.LogHeader("Data Transfer");

                // Loop through the study sources (in preference order)
                // In each case establish and then drop the source tables
                // in a foreign table wrapper
                int num_studies_imported = 0;
                int num_objects_imported = 0;

                foreach (Source s in sources)
                {
                    string schema_name     = repo.SetUpTempFTW(s.database_name);
                    string conn_string     = logging_repo.FetchConnString(s.database_name);
                    DataTransferBuilder tb = new DataTransferBuilder(s, schema_name, conn_string, logging_repo);
                    // Sources without study tables hold 'standalone' data
                    // objects only, so take the object-only path.
                    if (s.has_study_tables)
                    {
                        tb.ProcessStudyIds();
                        num_studies_imported += tb.TransferStudyData();
                        tb.ProcessStudyObjectIds();
                    }
                    else
                    {
                        tb.ProcessStandaloneObjectIds();
                    }
                    num_objects_imported += tb.TransferObjectData();
                    repo.DropTempFTW(s.database_name);
                }

                // Also use the study groups to set up study_relationship records
                slb.CreateStudyGroupRecords();

                // Update aggregation event record with import and total counts.

                agg_event.num_studies_imported = num_studies_imported;
                agg_event.num_objects_imported = num_objects_imported;

                string mdr_string = logging_repo.FetchConnString("mdr");
                agg_event.num_total_studies            = logging_repo.GetAggregateRecNum("studies", "st", mdr_string);
                agg_event.num_total_objects            = logging_repo.GetAggregateRecNum("data_objects", "ob", mdr_string);
                agg_event.num_total_study_object_links = logging_repo.GetAggregateRecNum("all_ids_data_objects", "nk", mdr_string);

                logging_repo.StoreAggregationEvent(agg_event);
                // NOTE(review): DropTempContextFTWs is also called unconditionally
                // at the end of this method - presumably the drop is idempotent
                // (DROP ... IF EXISTS); confirm before relying on it.
                repo.DropTempContextFTWs();
            }


            if (opts.create_core)
            {
                // create core tables
                SchemaBuilder sb = new SchemaBuilder(repo.ConnString, logging_repo);
                logging_repo.LogHeader("Set up");
                sb.DeleteCoreTables();
                sb.BuildNewCoreTables();

                // transfer data to core tables
                DataTransferBuilder tb = new DataTransferBuilder(logging_repo);
                logging_repo.LogHeader("Transferring study data");
                tb.TransferCoreStudyData();
                logging_repo.LogHeader("Transferring object data");
                tb.TransferCoreObjectData();
                logging_repo.LogHeader("Transferring link data");
                tb.TransferCoreLinkData();

                // Include generation of data provenance strings
                // Need an additional temporary FTW link to mon
                logging_repo.LogHeader("Finish");
                repo.SetUpTempFTW("mon");
                tb.GenerateProvenanceData();
                repo.DropTempFTW("mon");
            }


            if (opts.do_statistics)
            {
                // Stats are recorded against the most recent aggregation event.
                int last_agg_event_id = logging_repo.GetLastAggEventId();
                StatisticsBuilder stb = new StatisticsBuilder(last_agg_event_id, logging_repo);
                stb.GetStatisticsBySource();
                stb.GetSummaryStatistics();
            }


            if (opts.create_json)
            {
                string     conn_string = logging_repo.FetchConnString("mdr");
                JSONHelper jh          = new JSONHelper(conn_string, logging_repo);

                // Create json fields.

                // if tables are to be left as they are, add false as
                // an additional boolean (default = true)
                // if tables are to have further data appended add an integer
                // offset that represents the records to skip (default = 0)

                logging_repo.LogHeader("Creating JSON study data");
                jh.CreateJSONStudyData(opts.also_do_files);
                //jh.UpdateJSONStudyData(opts.also_do_files);
                logging_repo.LogHeader("Creating JSON object data");
                jh.CreateJSONObjectData(opts.also_do_files);
                //jh.UpdateJSONObjectData(opts.also_do_files);
            }

            repo.DropTempContextFTWs();
            logging_repo.CloseLog();
        }
Example #7
0
        public int LoadDataObjects(string schema_name)
        {
            // Copies the new data object records from the given source schema
            // into ob.data_objects, matching on sd_oid via the temporary id
            // table, then stamps the source rows with the export date.
            // Returns the number of records transferred.
            string sql_string = @"INSERT INTO ob.data_objects(id,
                    display_title, version, doi, doi_status_id, publication_year,
                    object_class_id, object_type_id, managing_org_id, managing_org,
                    lang_code, access_type_id, access_details, access_details_url,
                    url_last_checked, eosc_category, add_study_contribs, 
                    add_study_topics)
                    SELECT t.object_id,
                    s.display_title, s.version, s.doi, s.doi_status_id, s.publication_year,
                    s.object_class_id, s.object_type_id, s.managing_org_id, s.managing_org,
                    s.lang_code, s.access_type_id, s.access_details, s.access_details_url,
                    s.url_last_checked, s.eosc_category, s.add_study_contribs, 
                    s.add_study_topics
                    FROM " + schema_name + @".data_objects s
                    INNER JOIN nk.temp_objects_to_add t
                    on s.sd_oid = t.sd_oid ";

            // The db layer carries out the transfer (possibly in batches).
            int num_transferred = db.ExecuteTransferSQL(sql_string, schema_name, "data_objects", "");
            logging_repo.LogLine("Loaded records - " + num_transferred.ToString() + " data_objects");

            db.Update_SourceTable_ExportDate(schema_name, "data_objects");
            return num_transferred;
        }
Example #8
0
        public void ProcessStudyIds()
        {
            // Gathers the ids of all studies in the source's ad database into
            // a temporary table, then reconciles them against the study-study
            // links so each row ends up holding its 'preferred' study id.

            st_tr.SetUpTempStudyIdsTable();
            var study_ids = st_tr.FetchStudyIds(source.id, source_conn_string);
            logging_repo.LogLine("Study Ids obtained");

            st_tr.StoreStudyIds(CopyHelpers.study_ids_helper, study_ids);
            logging_repo.LogLine("Study Ids stored");

            // Check the temp table ids against the study_study links and
            // change the table to reflect the 'preferred' ids, then back
            // load the corrected study ids into the all-ids table.
            st_tr.CheckStudyLinks();
            logging_repo.LogLine("Study Ids checked");

            st_tr.UpdateAllStudyIdsTable(source.id);
            logging_repo.LogLine("Study Ids processed");
        }