/// <summary>
/// Sets the NCBI API key from the API key file field on the form, then
/// traces whether an API key is in use.
/// </summary>
/// <returns>
/// True if the field is empty or names a file that exists; false if the
/// field names a file that cannot be found (a message box is shown first).
/// </returns>
private bool SetApiKeyFromFormField()
{
    // Read and trim the field once so every check below sees the same value.
    string apiKeyFilePath = apiKeyFileTextBox.Text.Trim();

    if (!string.IsNullOrWhiteSpace(apiKeyFilePath))
    {
        if (!File.Exists(apiKeyFilePath))
        {
            MessageBox.Show($"API key file not found: {apiKeyFilePath}");
            return false;
        }

        NCBI.GetApiKey(apiKeyFilePath);
    }

    if (NCBI.ApiKeyExists)
    {
        Trace.WriteLine("Using API key: " + NCBI.ApiKeyPath);
    }
    else
    {
        // No key available: tell the user how to supply one.
        Trace.WriteLine("Performance is limited to under 3 requests per second.");
        Trace.WriteLine("Consider pasting an API key into " + NCBI.ApiKeyPath);
        Trace.WriteLine("Or set the NCBI_API_KEY_FILE environment variable to the API key file path");
        Trace.WriteLine("For more information, see https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/");
    }

    return true;
}
/// <summary>
/// Creates a colleague finder, storing the supplied collaborators and
/// building a Harvester and the PublicationTypes from the database.
/// </summary>
/// <param name="DB">Database used to construct the Harvester and the PublicationTypes</param>
/// <param name="roster">AAMC roster object</param>
/// <param name="ncbi">NCBI object kept for later use by this finder</param>
/// <param name="alternateTableName">Alternate PeoplePublications table name, null if not using it</param>
public ColleagueFinder(Database DB, Roster roster, NCBI ncbi, string alternateTableName)
{
    // Keep the collaborators handed in by the caller
    this.DB = DB;
    this.roster = roster;
    this.ncbi = ncbi;

    // Build the helpers that are backed by the same database
    harvester = new Harvester(DB);
    pubTypes = new PublicationTypes(DB);

    this.AlternateTableName = alternateTableName;
}
/// <summary>
/// Sets the NCBI API key from the API key file field on the form.
/// </summary>
/// <returns>
/// True if the field is empty or names a file that exists; false if the
/// field names a file that cannot be found (a message box is shown first).
/// </returns>
private bool SetApiKeyFromFormField()
{
    string keyFilePath = ApiKeyFile.Text.Trim();

    // Nothing entered: there is no key file to load, which is not an error.
    if (string.IsNullOrWhiteSpace(keyFilePath))
    {
        return true;
    }

    if (!File.Exists(keyFilePath))
    {
        MessageBox.Show($"API key file not found: {keyFilePath}");
        return false;
    }

    NCBI.GetApiKey(keyFilePath);
    return true;
}
/// <summary>
/// Use the NCBI Elink request to retrieve related IDs for one or more publication IDs
/// </summary>
/// <param name="ids">IDs to retrieve</param>
/// <returns>A string with XML results from elink.fcgi</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="ids"/> is null</exception>
private static string ExecuteRelatedSearch(IEnumerable<int> ids)
{
    if (ids == null)
    {
        throw new ArgumentNullException(nameof(ids));
    }

    // Build the form-encoded body: the first ID follows the leading "id=",
    // every later ID is introduced by its own "&id=".
    StringBuilder body = new StringBuilder();
    body.AppendFormat("dbfrom={0}&db={1}&id=", ELINK_DBFROM, ELINK_DB);

    string separator = string.Empty;
    foreach (int id in ids)
    {
        body.Append(separator).Append(id);
        separator = "&id=";
    }

    // "&cmd=neighbor_score" is what makes the response include <Score> elements
    body.Append("&cmd=neighbor_score");
    body.Append(NCBI.ApiKeyParam);

    byte[] payload = Encoding.UTF8.GetBytes(body.ToString());

    WebRequest request = WebRequest.Create(ELINK_URL);
    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";
    request.ContentLength = payload.Length;

    using (Stream requestStream = request.GetRequestStream())
    {
        requestStream.Write(payload, 0, payload.Length);
    }

    return NCBI.ExecuteWebRequest(request);
}
/// <summary>
/// Verifies that parsing malformed XML throws the expected error and that
/// the offending text is written to pubharvester_error.log.
/// </summary>
public void MalformedXML()
{
    string malformed = "this is some malformed xml!";
    try
    {
        NCBI.ParseSearchResults(malformed);

        // Reaching this point means no exception was thrown. The failure
        // raised here is caught below, where the message check then fails.
        Assert.Fail();
    }
    catch (Exception ex)
    {
        // Verify the error is thrown
        Assert.IsTrue(ex.Message == "Unable to process XML returned by the NCBI server. Offending XML has been written to pubharvester_error.log.");

        // Read the error log, releasing the file handle as soon as it is read
        string logContents;
        using (StreamReader reader = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "\\pubharvester_error.log"))
        {
            logContents = reader.ReadToEnd();
        }

        // The malformed XML is expected on the fifth line from the end of the log
        string[] lines = logContents.Split('\n');
        Assert.IsTrue(lines[lines.Length - 5].Trim() == malformed.Trim());
    }
}
/// <summary>
/// Parses a well-formed eSearch response and checks the WebEnv, QueryKey,
/// Count and Found values read from it.
/// </summary>
public void NormalResults()
{
    // Values expected to be read back from the response below
    const string expectedWebEnv = "01jHC0pmRm0V5DX0SCaTpJ0OqIA1N2LSKc2-Uus4KHDqRMj7m9Lz@@u66F4IOFk0AAH4@OH4AAAAQ";

    string searchResponse = @"<?xml version=""1.0""?> <!DOCTYPE eSearchResult PUBLIC ""-//NLM//DTD eSearchResult, 11 May 2002//EN"" ""http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSearch_020511.dtd""> <eSearchResult> <Count>99</Count> <RetMax>1</RetMax> <RetStart>0</RetStart> <QueryKey>1</QueryKey> <WebEnv>01jHC0pmRm0V5DX0SCaTpJ0OqIA1N2LSKc2-Uus4KHDqRMj7m9Lz@@u66F4IOFk0AAH4@OH4AAAAQ</WebEnv> <IdList> <Id>15904469</Id> </IdList> <TranslationSet> <Translation> <From>STELLMAN SD</From> <To>STELLMAN SD[Author]</To> </Translation> </TranslationSet> <TranslationStack> <TermSet> <Term>STELLMAN SD[Author]</Term> <Field>Author</Field> <Count>99</Count> <Explode>Y</Explode> </TermSet> <OP>GROUP</OP> </TranslationStack> </eSearchResult>";

    NCBI.EsearchResults parsed = NCBI.ParseSearchResults(searchResponse);

    Assert.IsTrue(parsed.WebEnv == expectedWebEnv);
    Assert.IsTrue(parsed.QueryKey == 1);
    Assert.IsTrue(parsed.Count == 99);
    Assert.IsTrue(parsed.Found == true);
}
/// <summary>
/// Rebuilds the "Colleague Generator Unit Test" database: drops the known
/// tables, recreates them, then writes every person from the test people
/// file and harvests their publications through the mock NCBI object.
/// </summary>
public void TestColleaguesSetUp()
{
    string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;

    // Create the AAMC roster object
    roster = new Roster(baseDirectory + "\\Test Data\\TestRoster\\testroster.csv");

    // Callbacks required by GetPublications(); the test ignores status and
    // messages, and never asks for an interrupt.
    Harvester.GetPublicationsStatus statusCallback = (int number, int total, int averageTime) => { };
    Harvester.GetPublicationsMessage messageCallback = (string Message, bool StatusBarOnly) => { };
    Harvester.CheckForInterrupt interruptCallback = () => false;

    // Read the people file
    People peopleFromFile = new People(baseDirectory + "\\Test Data\\TestColleagues", "PeopleFile.xls");

    // Drop all tables from the test database
    DB = new Database("Colleague Generator Unit Test");
    string[] tablesToDrop =
    {
        "colleaguepublications", "colleagues", "meshheadings", "people",
        "peoplepublications", "publicationauthors", "publicationgrants",
        "publicationmeshheadings", "publications", "pubtypecategories",
        "starcolleagues"
    };
    foreach (string tableName in tablesToDrop)
    {
        DB.ExecuteNonQuery("DROP TABLE IF EXISTS " + tableName + ";");
    }

    // Create the test database
    harvester = new Harvester(DB);
    harvester.CreateTables();
    ColleagueFinder.CreateTables(DB);

    // Populate it using the Mock NCBI object
    ncbi = new MockNCBI("Medline");
    PubTypes = new PublicationTypes(baseDirectory + "\\Test Data\\TestColleagues", "PublicationTypes.csv");

    // Write each person and his publications to the database
    double averageMilliseconds;
    foreach (Person person in peopleFromFile.PersonList)
    {
        person.WriteToDB(DB);
        harvester.GetPublications(ncbi, PubTypes, person, statusCallback, messageCallback, interruptCallback, out averageMilliseconds);
    }
}
/// <summary>
/// Use the tests from TestColleagues to set up the database,
/// then find the colleagues, get their publications and
/// remove false colleagues.
///
/// This is a static void so that it can be called by other tests.
/// </summary>
/// <param name="DB">Receives the test database</param>
/// <param name="harvester">Receives a Harvester created on the test database</param>
/// <param name="PubTypes">Receives the publication types read from PublicationTypes.csv</param>
/// <param name="ncbi">Receives the mock NCBI object</param>
/// <param name="Languages">Languages passed to CreateExtraArticlesForTobianAndBunn</param>
public static void DoSetUp(out Database DB, out Harvester harvester, out PublicationTypes PubTypes, out NCBI ncbi, string[] Languages)
{
    // First recreate the database
    DB = new Database("Colleague Generator Unit Test");
    ColleagueFinder.CreateTables(DB);

    // Then use the test fixture setup in TestColleagues to populate it
    TestColleagues testColleagues = new TestColleagues();
    testColleagues.TestColleaguesSetUp();

    // Write the publication types to the database
    PubTypes = new PublicationTypes(
        AppDomain.CurrentDomain.BaseDirectory + "\\Test Data\\TestColleagues",
        "PublicationTypes.csv"
        );
    PubTypes.WriteToDB(DB);

    // Create the other objects from the database
    harvester = new Harvester(DB);
    Roster roster = new Roster(AppDomain.CurrentDomain.BaseDirectory + "\\Test Data\\TestRoster\\testroster.csv");
    ncbi = new MockNCBI("Medline");

    // Find the colleagues and publications
    // NOTE(review): the colleague publications are always fetched for "eng";
    // the Languages argument is only used for the extra articles below.
    ColleagueFinder finder = new ColleagueFinder(DB, roster, ncbi, null);
    People people = new People(DB);
    foreach (Person person in people.PersonList)
    {
        Person[] found = finder.FindPotentialColleagues(person);
        if (found != null)
        {
            finder.GetColleaguePublications(found, new string[] { "eng" }, new List<int> { 1, 2, 3 });
        }
    }

    // Remove false colleagues
    ColleagueFinder.RemoveFalseColleagues(DB, null, "PeoplePublications");

    // Create the extra articles for Bunn and Tobian.
    // Verify that Bunn and Tobian have five articles in common, with years
    // ranging from 1993 to 2001.
    CreateExtraArticlesForTobianAndBunn(DB, PubTypes, Languages);
    DataTable Result = DB.ExecuteQuery(
        @"SELECT p.Year, p.PMID, pp.PositionType AS StarPositionType, cp.PositionType AS ColleaguePositionType, p.Journal FROM Publications p, ColleaguePublications cp, PeoplePublications pp WHERE pp.Setnb = 'A5401532' AND cp.Setnb = 'A4800524' AND p.PMID = pp.PMID AND p.PMID = cp.PMID ORDER BY p.Year ASC");

    // Expected value goes first so a failure reports expected/actual correctly
    Assert.AreEqual(5, Result.Rows.Count);
    Assert.AreEqual(1993, Result.Rows[0]["Year"]);
    Assert.AreEqual(2001, Result.Rows[4]["Year"]);
}
/// <summary>
/// Harvest each of the publications in the people file
/// </summary>
/// <remarks>
/// Failures while setting up (opening the database, creating tables, reading
/// publication types or people) are logged with an error box and end the
/// harvest. A failure while harvesting one person is logged and the loop
/// moves on to the next person.
/// </remarks>
/// <param name="PeopleFile">Filename of the people file</param>
/// <param name="PublicationTypeFile">Filename of publication type file</param>
/// <param name="ContinueFromInterruption">True if continuing from a previously interrupted harvest</param>
public void Harvest(string PeopleFile, string PublicationTypeFile, bool ContinueFromInterruption)
{
    // First verify that the files exist
    if (!File.Exists(PeopleFile))
    {
        MessageBox.Show("The People file '" + PeopleFile + "' does not exist", "People file not found", MessageBoxButtons.OK, MessageBoxIcon.Warning);
        return;
    }
    if (!File.Exists(PublicationTypeFile))
    {
        MessageBox.Show("The Publication Type file '" + PublicationTypeFile + "' does not exist", "Publication Type file not found", MessageBoxButtons.OK, MessageBoxIcon.Warning);
        return;
    }

    UpdateDatabaseStatus();

    if (ContinueFromInterruption)
    {
        AddLogEntry("Continuing interrupted harvest");
    }
    else
    {
        AddLogEntry("Beginning harvesting");
    }

    // Reset lastDSNSelected to make sure that the next check for interrupted data is NOT skipped
    lastDSNSelected = "";

    // Initialize the harvester
    Harvester harvester;
    Database DB;

    // Initialize objects
    try
    {
        DB = new Database(DSN.Text);
        harvester = new Harvester(DB);

        // Set the language restriction
        if (LanguageList.Text != "")
        {
            string[] Languages = LanguageList.Text.Split(',');
            harvester.Languages = Languages;
            foreach (string Language in Languages)
            {
                AddLogEntry("Adding language restriction: " + Language);
            }
        }
        else
        {
            AddLogEntry("No language restriction added");
        }
    }
    catch (Exception ex)
    {
        AddLogEntryWithErrorBox(ex.Message, "Unable to begin harvesting");
        return;
    }

    // Initialize the database
    try
    {
        if (!ContinueFromInterruption)
        {
            AddLogEntry("Initializing the database");
            harvester.CreateTables();
            UpdateDatabaseStatus();
        }
    }
    catch (Exception ex)
    {
        AddLogEntryWithErrorBox(ex.Message, "Unable to initialize database");
        return;
    }

    PublicationTypes pubTypes;
    if (ContinueFromInterruption)
    {
        // If we're continuing, read the publication types from the database
        try
        {
            AddLogEntry("Reading publication types from the database");
            pubTypes = new PublicationTypes(DB);
        }
        catch (Exception ex)
        {
            AddLogEntryWithErrorBox(ex.Message, "Unable to read publication types");
            return;
        }

        // Remove any data left over from the interruption
        AddLogEntry("Removing any data left over from the previous interruption");
        harvester.ClearDataAfterInterruption();
        UpdateDatabaseStatus();
    }
    else
    {
        // Read the publication types from the file and write them to the database
        try
        {
            AddLogEntry("Writing publication types to database");
            pubTypes = new PublicationTypes(Path.GetDirectoryName(PublicationTypeFile), Path.GetFileName(PublicationTypeFile));
            pubTypes.WriteToDB(DB);
            UpdateDatabaseStatus();
        }
        catch (Exception ex)
        {
            AddLogEntryWithErrorBox(ex.Message, "Unable to read publication types");
            return;
        }

        // Read the people
        try
        {
            AddLogEntry("Reading people from " + Path.GetFileName(PeopleFile) + " and writing them to the database");
            harvester.ImportPeople(PeopleFile);
            UpdateDatabaseStatus();
        }
        catch (Exception ex)
        {
            AddLogEntryWithErrorBox(ex.Message, "Unable to read the people from " + Path.GetFileName(PeopleFile));
            return;
        }
    }

    // Make an anonymous callback function that keeps track of the callback data
    Harvester.GetPublicationsStatus StatusCallback = delegate(int number, int total, int averageTime)
    {
        // The progress bar is deliberately not updated here -- it leads to a
        // messy-looking UI because it's also updated for the person total
        toolStripStatusLabel1.Text = "Reading publication " + number.ToString() + " of " + total.ToString() + " (" + averageTime.ToString() + " ms average)";
        UpdateDatabaseStatus();
        Application.DoEvents();
    };

    // Make an anonymous callback function that logs any messages passed back
    Harvester.GetPublicationsMessage MessageCallback = delegate(string Message, bool StatusBarOnly)
    {
        if (StatusBarOnly)
        {
            toolStripStatusLabel1.Text = Message;
            Application.DoEvents();
        }
        else
        {
            AddLogEntry(Message);
        }
    };

    // Make an anonymous callback function to return the value of Interrupt for CheckForInterrupt
    Harvester.CheckForInterrupt InterruptCallback = delegate()
    {
        return(InterruptClicked);
    };

    // Get each person's publications and write them to the database
    NCBI ncbi = new NCBI("medline");
    if (NCBI.ApiKeyExists)
    {
        AddLogEntry("Using API key: " + NCBI.ApiKeyPath);
    }
    else
    {
        AddLogEntry("Performance is limited to under 3 requests per second.");
        AddLogEntry("Consider pasting an API key into " + NCBI.ApiKeyPath);
        AddLogEntry("Or set the NCBI_API_KEY_FILE environment variable to the API key file path");
        AddLogEntry("For more information, see https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/");
    }

    People people = new People(DB);
    int totalPeopleInPersonList = people.PersonList.Count;
    int numberOfPeopleProcessed = 0;

    toolStripProgressBar1.Minimum = 0;
    toolStripProgressBar1.Maximum = totalPeopleInPersonList;

    foreach (Person person in people.PersonList)
    {
        numberOfPeopleProcessed++;
        try
        {
            // If continuing from interruption, only harvest unharvested people
            if ((!ContinueFromInterruption) || (!person.Harvested))
            {
                AddLogEntry("Getting publications for " + person.Last + " (" + person.Setnb + "), number " + numberOfPeopleProcessed.ToString() + " of " + totalPeopleInPersonList.ToString());
                toolStripProgressBar1.Value = numberOfPeopleProcessed;

                double AverageMilliseconds;
                int NumPublications = harvester.GetPublications(ncbi, pubTypes, person, StatusCallback, MessageCallback, InterruptCallback, out AverageMilliseconds);

                if (InterruptClicked)
                {
                    AddLogEntry("Publication harvesting was interrupted");
                    UpdateDatabaseStatus();
                    return;
                }

                AddLogEntry("Wrote " + NumPublications.ToString() + " publications, average write time " + Convert.ToString(Math.Round(AverageMilliseconds, 1)) + " ms");
                UpdateDatabaseStatus();
            }
            else
            {
                AddLogEntry("Already retrieved publications for " + person.Last + " (" + person.Setnb + ")");
            }
        }
        catch (Exception ex)
        {
            // One person's failure is logged; the loop continues with the next person
            AddLogEntry("An error occurred while reading publications for " + person.Last + " (" + person.Setnb + "): " + ex.Message);
        }
    }

    AddLogEntry("Finished reading publications");
    UpdateDatabaseStatus();
}
/// <summary>
/// Retrieve the publications for found colleagues
/// </summary>
/// <remarks>
/// The form is disabled and shows the wait cursor while the retrieval runs.
/// Both are restored in a finally block, so an unexpected exception (for
/// example from the database) no longer leaves the form disabled.
/// </remarks>
/// <param name="sender">Control that raised the event</param>
/// <param name="e">Event data (unused)</param>
private void RetrieveColleaguePublications_Click(object sender, EventArgs e)
{
    if (!SetApiKeyFromFormField())
    {
        return;
    }

    this.Enabled = false;
    this.Cursor = Cursors.WaitCursor;

    try
    {
        // Set the language restriction
        string[] Languages = null;
        if (LanguageList.Text != "")
        {
            Languages = LanguageList.Text.Split(',');
            foreach (string Language in Languages)
            {
                AddLogEntry("Adding language restriction: " + Language);
            }
        }
        else
        {
            AddLogEntry("No language restriction added");
        }

        Database DB = new Database(DSN.Text);

        // Clear any publications left over after an interruption
        DB.ExecuteNonQuery(
            @"DELETE p.* FROM ColleaguePublications p, Colleagues c WHERE c.Setnb = p.Setnb AND c.Harvested = 0"
            );

        // Clear any errors
        int NumberOfErrors = DB.GetIntValue("SELECT Count(*) FROM Colleagues WHERE Error IS NOT NULL");
        AddLogEntry("Clearing " + NumberOfErrors.ToString() + " colleagues with errors");
        DB.ExecuteNonQuery(
            @"DELETE p.* FROM ColleaguePublications p, Colleagues c WHERE c.Setnb = p.Setnb AND c.Error IS NOT NULL"
            );
        DB.ExecuteNonQuery(
            @"UPDATE Colleagues SET Harvested = 0, Error = NULL, ErrorMessage = NULL WHERE Error IS NOT NULL"
            );

        // Confirm that the reset above actually cleared the errors
        NumberOfErrors = DB.GetIntValue("SELECT Count(*) FROM Colleagues WHERE Error = 1");
        if (NumberOfErrors != 0)
        {
            AddLogEntry("WARNING: " + NumberOfErrors + " errors were not cleared!");
        }
        UpdateStatus();

        // Retrieve the publications for each unharvested colleague
        NCBI ncbi = new NCBI("Medline");
        AddApiKeyLogEntries();
        ColleagueFinder finder = new ColleagueFinder(DB, roster, ncbi, GetPeoplePublicationTableName());
        People Colleagues = new People(DB, "Colleagues");
        int Total = Colleagues.PersonList.Count;
        int Count = 0;
        toolStripProgressBar1.Minimum = 0;
        toolStripProgressBar1.Maximum = Total;

        foreach (Person person in Colleagues.PersonList)
        {
            Count++;
            toolStripProgressBar1.Value = Count;
            statusStrip1.Refresh();

            if (person.Harvested == false)
            {
                AddLogEntry("Harvesting publications for colleague " + person.Last + " (" + person.Setnb + "), " + Count.ToString() + " of " + Total.ToString());
                UpdateStatus();

                // Check for existing publications in ColleaguePublications
                ArrayList Parameters = new ArrayList();
                Parameters.Add(Database.Parameter(person.Setnb));
                int ExistingPublications = DB.GetIntValue(
                    "SELECT Count(*) FROM ColleaguePublications WHERE Setnb = ?", Parameters);

                if (ExistingPublications > 0)
                {
                    // Publications are already there: just mark the colleague as harvested
                    DB.ExecuteNonQuery("UPDATE Colleagues SET Harvested = 1 WHERE Setnb = ?", Parameters);
                    AddLogEntry(ExistingPublications.ToString() + " publications already exist for colleague " + person.Last + " (" + person.Setnb + "), " + Count.ToString() + " of " + Total.ToString());
                }
                else
                {
                    Person[] personArray = new Person[1];
                    personArray[0] = person;
                    try
                    {
                        List<int> allowedPubTypes = new List<int>();
                        foreach (string type in AllowedPubTypeCategories.Text.Split(','))
                        {
                            allowedPubTypes.Add(int.Parse(type));
                        }
                        finder.GetColleaguePublications(personArray, Languages, allowedPubTypes);
                        GC.Collect(); // no need to wait for finalizers, because they don't do anything
                    }
                    catch (Exception ex)
                    {
                        AddLogEntry(ex.Message);

                        // Set the error for the colleague
                        Parameters = new ArrayList();
                        Parameters.Add(Database.Parameter(ex.Message));
                        Parameters.Add(Database.Parameter(person.Setnb));
                        DB.ExecuteNonQuery(
                            @"UPDATE Colleagues SET Error = 1, ErrorMessage = ? WHERE Setnb = ?", Parameters);
                    }
                }

                UpdateStatus();
            }
            else
            {
                AddLogEntry("Already harvested publications for colleague " + person.Last + " (" + person.Setnb + "), " + Count.ToString() + " of " + Total.ToString());
            }
        }
    }
    finally
    {
        // Always give the form back to the user, even when something above threw
        this.Cursor = Cursors.Default;
        this.Enabled = true;
    }
}
/// <summary>
/// Generate the colleagues using the Colleagues class
/// </summary>
/// <remarks>
/// The form is disabled and shows the wait cursor while the search runs.
/// Both are restored in a finally block, so an unexpected exception no
/// longer leaves the form disabled.
/// </remarks>
/// <param name="sender">Control that raised the event</param>
/// <param name="e">Event data (unused)</param>
private void FindPotentialColleagues_Click(object sender, EventArgs e)
{
    if (!SetApiKeyFromFormField())
    {
        return;
    }

    // If the tables aren't populated, don't look for colleagues.
    Database UpdateDB = new Database(DSN.Text);
    bool TablesCreated;
    int NumPeople;
    int NumHarvestedPeople;
    int NumPublications;
    int NumErrors;
    UpdateDB.GetStatus(out TablesCreated, out NumPeople, out NumHarvestedPeople, out NumPublications, out NumErrors);
    if ((!TablesCreated) || (NumPeople == 0) || (NumHarvestedPeople == 0))
    {
        MessageBox.Show("Please select a database that the Publication Harvester has been run on.", "Specify a Valid Database", MessageBoxButtons.OK, MessageBoxIcon.Stop);
        return;
    }

    // Reset the database by default -- if the user says "Yes" when asked,
    // then continue from the previous search.
    bool ResetDatabase = true;

    if (DiadsFound.Text != "0")
    {
        if (MessageBox.Show("Colleagues have already been found. Are you sure you want to re-find them (or continue the previous search)?",
            "Re-find Colleagues?", MessageBoxButtons.YesNo, MessageBoxIcon.Hand) == DialogResult.No)
        {
            return;
        }

        if (MessageBox.Show("Should the colleague search continue where it was previously left off? (Click 'No' to reset the database and start finding new colleagues.)",
            "Continue from previous search?", MessageBoxButtons.YesNo, MessageBoxIcon.Hand) == DialogResult.Yes)
        {
            ResetDatabase = false;
        }
    }

    this.Enabled = false;
    this.Cursor = Cursors.WaitCursor;

    try
    {
        Database DB = new Database(DSN.Text);

        // Use the ResetDatabase variable to determine whether or not to resume
        // a past find. If resuming, get the Setnb's of stars from the StarColleagues
        // table so they can be skipped.
        if (ResetDatabase)
        {
            ColleagueFinder.CreateTables(DB);
        }

        // A set de-duplicates the Setnbs and gives constant-time lookups in the loop below
        DataTable StarSetnbsResult = DB.ExecuteQuery("SELECT StarSetnb FROM StarColleagues");
        HashSet<string> StarSetnbs = new HashSet<string>();
        foreach (DataRow Row in StarSetnbsResult.Rows)
        {
            StarSetnbs.Add(Row[0].ToString());
        }

        NCBI ncbi = new NCBI("Medline");
        AddApiKeyLogEntries();
        ColleagueFinder finder = new ColleagueFinder(DB, roster, ncbi, GetPeoplePublicationTableName());
        People Stars = new People(DB);
        int NumStars = Stars.PersonList.Count;
        toolStripProgressBar1.Minimum = 0;
        toolStripProgressBar1.Maximum = NumStars;
        int Count = 0;

        foreach (Person Star in Stars.PersonList)
        {
            Count++;
            toolStripProgressBar1.Value = Count;
            statusStrip1.Refresh();
            Application.DoEvents();

            if (StarSetnbs.Contains(Star.Setnb))
            {
                AddLogEntry("Already found colleagues for " + Star.Last + " (" + Star.Setnb + "), " + Count.ToString() + " of " + NumStars.ToString());
            }
            else
            {
                AddLogEntry("Finding " + Star.Last + " (" + Star.Setnb + "), " + Count.ToString() + " of " + NumStars.ToString());
                UpdateStatus();

                // The returned array is not needed here, so it is not stored
                finder.FindPotentialColleagues(Star);
            }
        }

        UpdateStatus();
    }
    finally
    {
        // Always give the form back to the user, even when something above threw
        this.Cursor = Cursors.Default;
        this.Enabled = true;
    }
}
/// <summary>
/// Creates a new NCBI object, constructed with "medline", and stores it in
/// <c>ncbi</c>. Presumably the fixture set-up for the web query tests.
/// </summary>
public void TestNCBIWebQuerySetUp() => ncbi = new NCBI("medline");