Exemple #1
0
        /// <summary>
        /// Harvest the publications for each person and write them to the
        /// database named in the DSN text box.
        /// </summary>
        /// <remarks>
        /// A failure in any setup step (opening the database, creating the tables,
        /// loading publication types, importing or reading people, clearing
        /// interrupted data) is reported with AddLogEntryWithErrorBox and aborts
        /// the harvest. A failure while harvesting a single person is only logged,
        /// and the loop carries on with the next person. The harvest returns early
        /// if InterruptClicked is set after a person has been processed.
        /// </remarks>
        /// <param name="PeopleFile">Filename of the people file</param>
        /// <param name="PublicationTypeFile">Filename of publication type file</param>
        /// <param name="ContinueFromInterruption">True if continuing from a previously interrupted harvest</param>
        public void Harvest(string PeopleFile, string PublicationTypeFile, bool ContinueFromInterruption)
        {
            // Both files must exist before anything else happens, even when
            // continuing (in which case neither file is read again below)
            if (!File.Exists(PeopleFile))
            {
                MessageBox.Show("The People file '" + PeopleFile + "' does not exist", "People file not found", MessageBoxButtons.OK, MessageBoxIcon.Warning);
                return;
            }
            if (!File.Exists(PublicationTypeFile))
            {
                MessageBox.Show("The Publication Type file '" + PublicationTypeFile + "' does not exist", "Publication Type file not found", MessageBoxButtons.OK, MessageBoxIcon.Warning);
                return;
            }

            UpdateDatabaseStatus();
            AddLogEntry(ContinueFromInterruption ? "Continuing interrupted harvest" : "Beginning harvesting");

            // Reset lastDSNSelected to make sure that the next check for interrupted data is NOT skipped
            lastDSNSelected = "";

            Harvester harvester;
            Database  DB;

            // Open the database, create the harvester and apply any language restriction
            try
            {
                DB        = new Database(DSN.Text);
                harvester = new Harvester(DB);

                if (LanguageList.Text != "")
                {
                    // The language list is comma-separated; entries are passed on exactly as typed
                    string[] Languages = LanguageList.Text.Split(',');
                    harvester.Languages = Languages;
                    foreach (string Language in Languages)
                    {
                        AddLogEntry("Adding language restriction: " + Language);
                    }
                }
                else
                {
                    AddLogEntry("No language restriction added");
                }
            }
            catch (Exception ex)
            {
                AddLogEntryWithErrorBox(ex.Message, "Unable to begin harvesting");
                return;
            }

            // Initialize the database (skipped when continuing, so existing data is kept)
            try
            {
                if (!ContinueFromInterruption)
                {
                    AddLogEntry("Initializing the database");
                    harvester.CreateTables();
                    UpdateDatabaseStatus();
                }
            }
            catch (Exception ex)
            {
                AddLogEntryWithErrorBox(ex.Message, "Unable to initialize database");
                return;
            }

            PublicationTypes pubTypes;

            if (ContinueFromInterruption)
            {
                // If we're continuing, read the publication types from the database
                try
                {
                    AddLogEntry("Reading publication types from the database");
                    pubTypes = new PublicationTypes(DB);
                }
                catch (Exception ex)
                {
                    AddLogEntryWithErrorBox(ex.Message, "Unable to read publication types");
                    return;
                }

                // Remove any data left over from the interruption. This is guarded
                // like every other database step so that a failure is reported to
                // the user instead of escaping from Harvest.
                try
                {
                    AddLogEntry("Removing any data left over from the previous interruption");
                    harvester.ClearDataAfterInterruption();
                    UpdateDatabaseStatus();
                }
                catch (Exception ex)
                {
                    AddLogEntryWithErrorBox(ex.Message, "Unable to remove data left over from the previous interruption");
                    return;
                }
            }
            else
            {
                // Read the publication types from the file and write them to the database
                try
                {
                    AddLogEntry("Writing publication types to database");
                    pubTypes = new PublicationTypes(Path.GetDirectoryName(PublicationTypeFile), Path.GetFileName(PublicationTypeFile));
                    pubTypes.WriteToDB(DB);
                    UpdateDatabaseStatus();
                }
                catch (Exception ex)
                {
                    AddLogEntryWithErrorBox(ex.Message, "Unable to read publication types");
                    return;
                }

                // Read the people from the file and write them to the database
                try
                {
                    AddLogEntry("Reading people from " + Path.GetFileName(PeopleFile) + " and writing them to the database");
                    harvester.ImportPeople(PeopleFile);
                    UpdateDatabaseStatus();
                }
                catch (Exception ex)
                {
                    AddLogEntryWithErrorBox(ex.Message, "Unable to read the people from " + Path.GetFileName(PeopleFile));
                    return;
                }
            }

            // Callback that shows per-publication progress in the status bar
            Harvester.GetPublicationsStatus StatusCallback = delegate(int number, int total, int averageTime)
            {
                // The progress bar is deliberately not updated here -- it tracks the
                // person total, and updating it per publication makes the UI look messy
                toolStripStatusLabel1.Text = "Reading publication " + number.ToString() + " of " + total.ToString() + " (" + averageTime.ToString() + " ms average)";
                UpdateDatabaseStatus();
                Application.DoEvents();
            };

            // Callback that either shows a message in the status bar or writes it to the log
            Harvester.GetPublicationsMessage MessageCallback = delegate(string Message, bool StatusBarOnly)
            {
                if (StatusBarOnly)
                {
                    toolStripStatusLabel1.Text = Message;
                    Application.DoEvents();
                }
                else
                {
                    AddLogEntry(Message);
                }
            };

            // Callback that reports the current value of InterruptClicked
            Harvester.CheckForInterrupt InterruptCallback = delegate()
            {
                return InterruptClicked;
            };

            // Get each person's publications and write them to the database
            NCBI ncbi = new NCBI("medline");

            if (NCBI.ApiKeyExists)
            {
                AddLogEntry("Using API key: " + NCBI.ApiKeyPath);
            }
            else
            {
                AddLogEntry("Performance is limited to under 3 requests per second.");
                AddLogEntry("Consider pasting an API key into " + NCBI.ApiKeyPath);
                AddLogEntry("Or set the NCBI_API_KEY_FILE environemnt variable to the API key file path");
                AddLogEntry("For more information, see https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/");
            }

            // Reading the people back is a database step too, so report a failure
            // the same way as the other setup steps
            People people;
            try
            {
                people = new People(DB);
            }
            catch (Exception ex)
            {
                AddLogEntryWithErrorBox(ex.Message, "Unable to read the people from the database");
                return;
            }

            int totalPeopleInPersonList = people.PersonList.Count;
            int numberOfPeopleProcessed = 0;

            toolStripProgressBar1.Minimum = 0;
            toolStripProgressBar1.Maximum = totalPeopleInPersonList;

            foreach (Person person in people.PersonList)
            {
                numberOfPeopleProcessed++;

                // Advance the bar for every person, including skipped ones, so that a
                // continued harvest does not sit at zero while skipping harvested people.
                // The counter never exceeds the list count, so it stays within Maximum.
                toolStripProgressBar1.Value = numberOfPeopleProcessed;

                try
                {
                    // If continuing from interruption, only harvest unharvested people
                    if (ContinueFromInterruption && person.Harvested)
                    {
                        AddLogEntry("Already retrieved publications for " + person.Last + " (" + person.Setnb + ")");
                        continue;
                    }

                    AddLogEntry("Getting publications for " + person.Last + " (" + person.Setnb + "), number " + numberOfPeopleProcessed.ToString() + " of " + totalPeopleInPersonList.ToString());

                    double AverageMilliseconds;
                    int    NumPublications = harvester.GetPublications(ncbi, pubTypes, person, StatusCallback, MessageCallback, InterruptCallback, out AverageMilliseconds);

                    if (InterruptClicked)
                    {
                        AddLogEntry("Publication harvesting was interrupted");
                        UpdateDatabaseStatus();
                        return;
                    }

                    AddLogEntry("Wrote " + NumPublications.ToString() + " publications, average write time " + Convert.ToString(Math.Round(AverageMilliseconds, 1)) + " ms");
                    UpdateDatabaseStatus();
                }
                catch (Exception ex)
                {
                    // One person's failure is logged without aborting the rest of the harvest
                    AddLogEntry("An error occurred while reading publications for " + person.Last + " (" + person.Setnb + "): " + ex.Message);
                }
            }

            AddLogEntry("Finished reading publications");
            UpdateDatabaseStatus();
        }
Exemple #2
0
        /// <summary>
        /// Simulates an interrupted harvest by marking one person as unharvested
        /// and deleting one publication row, then verifies that
        /// ClearDataAfterInterruption removes exactly the rows tied to that
        /// person and that publication, and nothing else.
        /// </summary>
        public void TestClearingDataAfterInterruption()
        {
            // Row counts expected in the fixture data: Tobian (setnb A5401532) has
            // 5 publications; publication 12462241 has 9 MeSH headings, 82 authors
            // and the single grant inserted below
            const int ExpectedTobianPublications = 5;
            const int ExpectedHeadings           = 9;
            const int ExpectedAuthors            = 82;
            const int ExpectedGrants             = 1;

            // Set up the database
            TestHarvester.GetPublicationsFromInput1XLS_Using_MockNCBI(false, new string[] { "eng" }, 22);

            Database  DB        = new Database("Publication Harvester Unit Test");
            Harvester harvester = new Harvester(DB);

            // Add a grant for publication 12462241, since it doesn't have one
            ArrayList Parameters = new ArrayList();
            Parameters.Add(Database.Parameter(12462241));
            Parameters.Add(Database.Parameter("Fake grant ID"));
            DB.ExecuteNonQuery("INSERT INTO PublicationGrants (PMID, GrantID) VALUES ( ? , ? )", Parameters);

            // Before anything is tampered with, neither check should report a problem
            Assert.IsFalse(harvester.InterruptedDataExists());
            Assert.IsFalse(harvester.UnharvestedPeopleExist());

            // Verify that there are publications for Tobian (setnb A5401532).
            // AreEqual is used instead of IsTrue(a == b) so that a failure
            // reports the actual count.
            Assert.AreEqual(ExpectedTobianPublications, DB.GetIntValue(
                              @"SELECT Count(*) 
                    FROM PeoplePublications
                   WHERE Setnb = 'A5401532'"));
            int TotalPublicationsBeforeClear
                = DB.GetIntValue("SELECT Count(*) FROM PeoplePublications");

            // Verify that there are MeSH headings, authors and grants for
            // publication 12462241 (which belongs to Guillemin)
            Assert.AreEqual(ExpectedHeadings, DB.GetIntValue(
                              @"SELECT Count(*) 
                    FROM PublicationMeSHHeadings
                   WHERE PMID = 12462241"));
            int TotalHeadingsBeforeClear
                = DB.GetIntValue("SELECT Count(*) FROM PublicationMeSHHeadings");

            Assert.AreEqual(ExpectedAuthors, DB.GetIntValue(
                              @"SELECT Count(*) 
                    FROM PublicationAuthors
                   WHERE PMID = 12462241"));
            int TotalAuthorsBeforeClear
                = DB.GetIntValue("SELECT Count(*) FROM PublicationAuthors");

            Assert.AreEqual(ExpectedGrants, DB.GetIntValue(
                              @"SELECT Count(*) 
                    FROM PublicationGrants
                   WHERE PMID = 12462241"));
            int TotalGrantsBeforeClear
                = DB.GetIntValue("SELECT Count(*) FROM PublicationGrants");

            // Simulate the interruption: set Tobian's Harvested to 0 ...
            DB.ExecuteNonQuery("UPDATE People SET Harvested = 0 WHERE Setnb = 'A5401532'");

            // ... and remove publication 12462241
            DB.ExecuteNonQuery("DELETE FROM Publications WHERE PMID = 12462241");

            // Verify that InterruptedDataExists and UnharvestedPeopleExist both return true
            Assert.IsTrue(harvester.InterruptedDataExists());
            Assert.IsTrue(harvester.UnharvestedPeopleExist());

            // Execute ClearDataAfterInterruption
            harvester.ClearDataAfterInterruption();

            // Verify that InterruptedDataExists returns false again, now that
            // the interrupted data has been cleared, but that there are still
            // unharvested people
            Assert.IsFalse(harvester.InterruptedDataExists());
            Assert.IsTrue(harvester.UnharvestedPeopleExist());

            // Verify that only Tobian's publications have been removed
            Assert.AreEqual(
                TotalPublicationsBeforeClear - ExpectedTobianPublications,
                DB.GetIntValue("SELECT Count(*) FROM PeoplePublications"));

            // Verify that only publication 12462241's grants, headings and authors have been removed
            Assert.AreEqual(
                TotalHeadingsBeforeClear - ExpectedHeadings,
                DB.GetIntValue("SELECT Count(*) FROM PublicationMeSHHeadings"));

            Assert.AreEqual(
                TotalAuthorsBeforeClear - ExpectedAuthors,
                DB.GetIntValue("SELECT Count(*) FROM PublicationAuthors"));

            Assert.AreEqual(
                TotalGrantsBeforeClear - ExpectedGrants,
                DB.GetIntValue("SELECT Count(*) FROM PublicationGrants"));
        }