/// <summary>
/// Harvest each of the publications in the people file
/// </summary>
/// <remarks>
/// Setup failures (missing files, database or publication type errors) are
/// reported to the user and end the harvest early. A failure while harvesting
/// one person is only logged, and the harvest moves on to the next person.
/// The harvest also ends early if an interrupt is requested.
/// </remarks>
/// <param name="PeopleFile">Filename of the people file</param>
/// <param name="PublicationTypeFile">Filename of publication type file</param>
/// <param name="ContinueFromInterruption">True if continuing from a previously interrupted harvest</param>
public void Harvest(string PeopleFile, string PublicationTypeFile, bool ContinueFromInterruption)
{
    // First verify that the files exist
    if (!File.Exists(PeopleFile))
    {
        MessageBox.Show("The People file '" + PeopleFile + "' does not exist", "People file not found", MessageBoxButtons.OK, MessageBoxIcon.Warning);
        return;
    }

    if (!File.Exists(PublicationTypeFile))
    {
        MessageBox.Show("The Publication Type file '" + PublicationTypeFile + "' does not exist", "Publication Type file not found", MessageBoxButtons.OK, MessageBoxIcon.Warning);
        return;
    }

    UpdateDatabaseStatus();

    if (ContinueFromInterruption)
    {
        AddLogEntry("Continuing interrupted harvest");
    }
    else
    {
        AddLogEntry("Beginning harvesting");
    }

    // Reset lastDSNSelected to make sure that the next check for interrupted data is NOT skipped
    lastDSNSelected = "";

    // Initialize the database connection and the harvester
    Harvester harvester;
    Database DB;
    try
    {
        DB = new Database(DSN.Text);
        harvester = new Harvester(DB);

        // Set the language restriction (a comma-separated list; empty means no restriction)
        string[] Languages;
        if (LanguageList.Text != "")
        {
            Languages = LanguageList.Text.Split(',');
            harvester.Languages = Languages;
            foreach (string Language in Languages)
            {
                AddLogEntry("Adding language restriction: " + Language);
            }
        }
        else
        {
            AddLogEntry("No language restriction added");
        }
    }
    catch (Exception ex)
    {
        AddLogEntryWithErrorBox(ex.Message, "Unable to begin harvesting");
        return;
    }

    // Initialize the database (skipped when continuing, so existing data is kept)
    try
    {
        if (!ContinueFromInterruption)
        {
            AddLogEntry("Initializing the database");
            harvester.CreateTables();
            UpdateDatabaseStatus();
        }
    }
    catch (Exception ex)
    {
        AddLogEntryWithErrorBox(ex.Message, "Unable to initialize database");
        return;
    }

    PublicationTypes pubTypes;
    if (ContinueFromInterruption)
    {
        // If we're continuing, read the publication types from the database
        try
        {
            AddLogEntry("Reading publication types from the database");
            pubTypes = new PublicationTypes(DB);
        }
        catch (Exception ex)
        {
            AddLogEntryWithErrorBox(ex.Message, "Unable to read publication types");
            return;
        }

        // Remove any data left over from the interruption. This is guarded like
        // every other database step so a failure is reported instead of escaping.
        try
        {
            AddLogEntry("Removing any data left over from the previous interruption");
            harvester.ClearDataAfterInterruption();
        }
        catch (Exception ex)
        {
            AddLogEntryWithErrorBox(ex.Message, "Unable to remove data left over from the interruption");
            return;
        }

        UpdateDatabaseStatus();
    }
    else
    {
        // Read the publication types from the file and write them to the database
        try
        {
            AddLogEntry("Writing publication types to database");
            pubTypes = new PublicationTypes(Path.GetDirectoryName(PublicationTypeFile), Path.GetFileName(PublicationTypeFile));
            pubTypes.WriteToDB(DB);
            UpdateDatabaseStatus();
        }
        catch (Exception ex)
        {
            AddLogEntryWithErrorBox(ex.Message, "Unable to read publication types");
            return;
        }

        // Read the people
        try
        {
            AddLogEntry("Reading people from " + Path.GetFileName(PeopleFile) + " and writing them to the database");
            harvester.ImportPeople(PeopleFile);
            UpdateDatabaseStatus();
        }
        catch (Exception ex)
        {
            AddLogEntryWithErrorBox(ex.Message, "Unable to read the people from " + Path.GetFileName(PeopleFile));
            return;
        }
    }

    // Make an anonymous callback function that keeps track of the callback data.
    // The progress bar is deliberately not updated here -- it leads to a messy-looking
    // UI because it's also updated for the person total.
    Harvester.GetPublicationsStatus StatusCallback = delegate(int number, int total, int averageTime)
    {
        toolStripStatusLabel1.Text = "Reading publication " + number.ToString() + " of " + total.ToString() + " (" + averageTime.ToString() + " ms average)";
        UpdateDatabaseStatus();
        Application.DoEvents();
    };

    // Make an anonymous callback function that logs any messages passed back
    Harvester.GetPublicationsMessage MessageCallback = delegate(string Message, bool StatusBarOnly)
    {
        if (StatusBarOnly)
        {
            toolStripStatusLabel1.Text = Message;
            Application.DoEvents();
        }
        else
        {
            AddLogEntry(Message);
        }
    };

    // Make an anonymous callback function to return the value of Interrupt for CheckForInterrupt
    Harvester.CheckForInterrupt InterruptCallback = delegate()
    {
        return InterruptClicked;
    };

    // Get each person's publications and write them to the database
    NCBI ncbi = new NCBI("medline");
    if (NCBI.ApiKeyExists)
    {
        AddLogEntry("Using API key: " + NCBI.ApiKeyPath);
    }
    else
    {
        AddLogEntry("Performance is limited to under 3 requests per second.");
        AddLogEntry("Consider pasting an API key into " + NCBI.ApiKeyPath);
        AddLogEntry("Or set the NCBI_API_KEY_FILE environment variable to the API key file path");
        AddLogEntry("For more information, see https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/");
    }

    // Load the people from the database, reporting a failure like the other setup steps
    People people;
    try
    {
        people = new People(DB);
    }
    catch (Exception ex)
    {
        AddLogEntryWithErrorBox(ex.Message, "Unable to read people from the database");
        return;
    }

    int totalPeopleInPersonList = people.PersonList.Count;
    int numberOfPeopleProcessed = 0;

    toolStripProgressBar1.Minimum = 0;
    toolStripProgressBar1.Maximum = totalPeopleInPersonList;

    foreach (Person person in people.PersonList)
    {
        numberOfPeopleProcessed++;
        try
        {
            // Advance the bar for every person, including those skipped below, so it
            // reflects the position in the list and is full when the loop ends
            toolStripProgressBar1.Value = numberOfPeopleProcessed;

            // If continuing from interruption, only harvest unharvested people
            if (ContinueFromInterruption && person.Harvested)
            {
                AddLogEntry("Already retrieved publications for " + person.Last + " (" + person.Setnb + ")");
                continue;
            }

            AddLogEntry("Getting publications for " + person.Last + " (" + person.Setnb + "), number " + numberOfPeopleProcessed.ToString() + " of " + totalPeopleInPersonList.ToString());

            double AverageMilliseconds;
            int NumPublications = harvester.GetPublications(ncbi, pubTypes, person, StatusCallback, MessageCallback, InterruptCallback, out AverageMilliseconds);

            // Stop the whole harvest once an interrupt has been requested
            if (InterruptClicked)
            {
                AddLogEntry("Publication harvesting was interrupted");
                UpdateDatabaseStatus();
                return;
            }

            AddLogEntry("Wrote " + NumPublications.ToString() + " publications, average write time " + Convert.ToString(Math.Round(AverageMilliseconds, 1)) + " ms");
            UpdateDatabaseStatus();
        }
        catch (Exception ex)
        {
            // A failure for one person is logged and the harvest moves on to the next
            AddLogEntry("An error occurred while reading publications for " + person.Last + " (" + person.Setnb + "): " + ex.Message);
        }
    }

    AddLogEntry("Finished reading publications");
    UpdateDatabaseStatus();
}
/// <summary>
/// Simulates an interrupted harvest (one person marked unharvested and one
/// publication row deleted) and verifies that ClearDataAfterInterruption
/// removes exactly the rows counted for them beforehand.
/// </summary>
public void TestClearingDataAfterInterruption()
{
    // Set up the database
    TestHarvester.GetPublicationsFromInput1XLS_Using_MockNCBI(false, new string[] { "eng" }, 22);
    Database DB = new Database("Publication Harvester Unit Test");
    Harvester harvester = new Harvester(DB);

    // Add a grant for publication 12462241, since it doesn't have one
    ArrayList Parameters = new ArrayList();
    Parameters.Add(Database.Parameter(12462241));
    Parameters.Add(Database.Parameter("Fake grant ID"));
    DB.ExecuteNonQuery("INSERT INTO PublicationGrants (PMID, GrantID) VALUES ( ? , ? )", Parameters);

    // Verify that InterruptedDataExists and UnharvestedPeopleExist both return false
    Assert.IsFalse(harvester.InterruptedDataExists());
    Assert.IsFalse(harvester.UnharvestedPeopleExist());

    // Verify that there are publications for Tobian (setnb A5401532)
    // (there should be 5 publications)
    Assert.AreEqual(5, DB.GetIntValue(
        @"SELECT Count(*) FROM PeoplePublications WHERE Setnb = 'A5401532'"));
    int TotalPublicationsBeforeClear = DB.GetIntValue("SELECT Count(*) FROM PeoplePublications");

    // Verify that there are authors and MeSH headings for publication 12462241
    // (which belongs to Guillemin, and should have 82 authors and 9 headings)
    Assert.AreEqual(9, DB.GetIntValue(
        @"SELECT Count(*) FROM PublicationMeSHHeadings WHERE PMID = 12462241"));
    int TotalHeadingsBeforeClear = DB.GetIntValue("SELECT Count(*) FROM PublicationMeSHHeadings");

    Assert.AreEqual(82, DB.GetIntValue(
        @"SELECT Count(*) FROM PublicationAuthors WHERE PMID = 12462241"));
    int TotalAuthorsBeforeClear = DB.GetIntValue("SELECT Count(*) FROM PublicationAuthors");

    // The only grant for this publication is the fake one inserted above
    Assert.AreEqual(1, DB.GetIntValue(
        @"SELECT Count(*) FROM PublicationGrants WHERE PMID = 12462241"));
    int TotalGrantsBeforeClear = DB.GetIntValue("SELECT Count(*) FROM PublicationGrants");

    // Set Tobian's Harvested to 0
    DB.ExecuteNonQuery("UPDATE People SET Harvested = 0 WHERE Setnb = 'A5401532'");

    // Remove publication 12462241
    DB.ExecuteNonQuery("DELETE FROM Publications WHERE PMID = 12462241");

    // Verify that InterruptedDataExists and UnharvestedPeopleExist both return true
    Assert.IsTrue(harvester.InterruptedDataExists());
    Assert.IsTrue(harvester.UnharvestedPeopleExist());

    // Execute ClearDataAfterInterruption
    harvester.ClearDataAfterInterruption();

    // Verify that InterruptedDataExists returns false again, now that
    // the interrupted data has been cleared, but that there are still
    // unharvested people
    Assert.IsFalse(harvester.InterruptedDataExists());
    Assert.IsTrue(harvester.UnharvestedPeopleExist());

    // Verify that only Tobian's publications have been removed
    Assert.AreEqual(TotalPublicationsBeforeClear - 5,
        DB.GetIntValue("SELECT Count(*) FROM PeoplePublications"));

    // Verify that only publication 12462241's grants, headings and authors have been removed
    Assert.AreEqual(TotalHeadingsBeforeClear - 9,
        DB.GetIntValue("SELECT Count(*) FROM PublicationMeSHHeadings"));
    Assert.AreEqual(TotalAuthorsBeforeClear - 82,
        DB.GetIntValue("SELECT Count(*) FROM PublicationAuthors"));
    Assert.AreEqual(TotalGrantsBeforeClear - 1,
        DB.GetIntValue("SELECT Count(*) FROM PublicationGrants"));
}