/// <summary>
/// Create a Reports object: set the default people report sections, load the
/// journal weights from an Excel file, and build the PublicationTypes object
/// </summary>
/// <param name="DB">Database stored on this object and used to build the PublicationTypes</param>
/// <param name="JournalWeightsFilename">Filename of the journal weights file
/// (read through NpoiHelper; rows must have "JOURNAL TITLE" and "JIF" columns)</param>
public Reports(Database DB, string JournalWeightsFilename)
{
    PeopleReportSections = DefaultPeopleReportSections();

    // The reader takes the folder and the file name as separate arguments
    DataTable Results = NpoiHelper.ReadExcelFileToDataTable(
        Path.GetDirectoryName(JournalWeightsFilename),
        Path.GetFileName(JournalWeightsFilename));

    // Populate the Weights hash table. The comparer makes journal title
    // lookups case-insensitive; when a title appears more than once, the
    // first row wins.
    Weights = new Hashtable(StringComparer.CurrentCultureIgnoreCase);
    foreach (DataRow Row in Results.Rows)
    {
        string JournalTitle = Row["JOURNAL TITLE"].ToString();
        if (!Weights.ContainsKey(JournalTitle))
        {
            // Stored as a (boxed) float
            Weights.Add(JournalTitle, Convert.ToSingle(Row["JIF"]));
        }
    }

    this.PubTypes = new PublicationTypes(DB);
    this.DB = DB;
}
/// <summary>
/// Parse a block of Medline citation text into PublicationList
/// </summary>
/// <param name="medlineData">String with data full of Medline citations</param>
/// <param name="pubTypes">PublicationTypes object, handed to GetNextPublication for each citation</param>
public Publications(string medlineData, PublicationTypes pubTypes)
{
    // An empty result leaves the list null
    if (CheckForEmptyResult(medlineData))
    {
        PublicationList = null;
        return;
    }

    // Start from whatever PublicationList already holds (if anything)
    List<Publication> collected = new List<Publication>();
    if (PublicationList != null)
    {
        collected.AddRange(PublicationList);
    }

    // GetNextPublication strips one citation off the front of medlineData
    // per call and returns false once nothing is left
    Publication next;
    while (GetNextPublication(ref medlineData, out next, pubTypes))
    {
        collected.Add(next);
    }

    // The list is null, not an empty array, when nothing was collected
    if (collected.Count > 0)
    {
        PublicationList = collected.ToArray();
    }
    else
    {
        PublicationList = null;
    }
}
/// <summary>
/// Write a publication, together with its authors, MeSH headings and grants,
/// to the database
/// </summary>
/// <param name="publication">Publication to write</param>
/// <param name="DB">Database to add to</param>
/// <param name="PubTypes">PublicationTypes object, used to look up the category number of the publication type</param>
/// <param name="Languages">The publication must match one of these languages or it will be rejected; null accepts every language</param>
/// <returns>True if the publication was written or is in the database already, false if it was rejected because of its language</returns>
public static bool WriteToDB(Publication publication, Database DB, PublicationTypes PubTypes, string[] Languages)
{
    ArrayList Parameters;

    // A publication whose PMID is already in Publications counts as written
    Parameters = new ArrayList();
    Parameters.Add(Database.Parameter(publication.PMID));
    int existingRows = DB.GetIntValue(
        @"SELECT Count(*) FROM Publications p WHERE p.PMID = ?", Parameters
        );
    if (existingRows > 0)
    {
        return true;
    }

    // Language filter -- skipped entirely when Languages is null
    if (Languages != null)
    {
        bool languageAccepted = false;
        foreach (string candidate in Languages)
        {
            if (publication.Language == candidate)
            {
                languageAccepted = true;
                break;
            }
        }
        if (!languageAccepted)
        {
            return false;
        }
    }

    // Authors: one PublicationAuthors row per author.
    // NOTE(review): an earlier comment here said existing author rows are
    // deleted first, but this method issues no DELETE -- cleanup after an
    // interrupted add is presumably left to
    // Harvester.ClearDataAfterInterruption(); confirm.
    if (publication.Authors != null)
    {
        int lastIndex = publication.Authors.GetUpperBound(0);
        for (int index = 0; index <= lastIndex; index++)
        {
            // First/Last are written as 1/0 flags
            int isFirst = (index == 0) ? 1 : 0;
            int isLast = (index == lastIndex) ? 1 : 0;

            Parameters = new ArrayList();
            Parameters.Add(Database.Parameter(publication.PMID));
            Parameters.Add(Database.Parameter(index + 1)); // author positions are 1-based
            Parameters.Add(Database.Parameter(Database.Left(publication.Authors[index], 70)));
            Parameters.Add(Database.Parameter(isFirst));
            Parameters.Add(Database.Parameter(isLast));
            DB.ExecuteNonQuery(
                @"INSERT INTO PublicationAuthors (PMID, Position, Author, First, Last) VALUES (? , ? , ? , ? , ?)"
                , Parameters);
        }
    }

    // MeSH headings: look each heading up in MeSHHeadings (adding it when
    // missing), then link it to the publication
    if (publication.MeSHHeadings != null)
    {
        foreach (object rawHeading in publication.MeSHHeadings)
        {
            string headingText = Database.Left((string)rawHeading, 255);

            Parameters = new ArrayList();
            Parameters.Add(Database.Parameter(headingText));
            DataTable matches = DB.ExecuteQuery("SELECT ID FROM MeSHHeadings WHERE Heading = ?", Parameters);

            int headingId;
            if (matches.Rows.Count > 0)
            {
                headingId = Convert.ToInt32(matches.Rows[0][0]);
            }
            else
            {
                Parameters = new ArrayList();
                Parameters.Add(Database.Parameter(headingText));
                DB.ExecuteNonQuery("INSERT INTO MeSHHeadings (Heading) VALUES (?)", Parameters);
                headingId = DB.GetIntValue("SELECT LAST_INSERT_ID()");
            }

            Parameters = new ArrayList();
            Parameters.Add(Database.Parameter(publication.PMID));
            Parameters.Add(Database.Parameter(headingId));
            DB.ExecuteNonQuery(
                @"INSERT INTO PublicationMeSHHeadings (PMID, MeSHHeadingID) VALUES ( ? , ? 
)", Parameters);
        }
    }

    // Grants: a publication may list the same grant twice, so only insert a
    // (PMID, GrantID) pair that is not there yet
    if (publication.Grants != null)
    {
        foreach (object rawGrant in publication.Grants)
        {
            string grantId = Database.Left((string)rawGrant, 50);

            Parameters = new ArrayList();
            Parameters.Add(Database.Parameter(publication.PMID));
            Parameters.Add(Database.Parameter(grantId));
            int grantRows = DB.GetIntValue(
                @"SELECT Count(*) FROM PublicationGrants WHERE PMID = ? AND GrantID = ?", Parameters);

            if (grantRows == 0)
            {
                Parameters = new ArrayList();
                Parameters.Add(Database.Parameter(publication.PMID));
                Parameters.Add(Database.Parameter(grantId));
                DB.ExecuteNonQuery(
                    @"INSERT INTO PublicationGrants (PMID, GrantID) VALUES ( ? , ? )", Parameters);
            }
        }
    }

    // The title is stored without single or double quotes
    string title = String.IsNullOrEmpty(publication.Title)
        ? ""
        : publication.Title.Replace("\"", "").Replace("'", "");

    int authorCount = (publication.Authors == null) ? 0 : publication.Authors.Length;

    // The Publications row goes in last: per the fault tolerance scheme the
    // author and heading rows are not "final" until it exists, and can be
    // cleared with Harvester.ClearDataAfterInterruption().
    Parameters = new ArrayList();
    Parameters.Add(Database.Parameter(publication.PMID));
    Parameters.Add(Database.Parameter(Database.Left(publication.Journal, 128)));
    Parameters.Add(Database.Parameter(publication.Year));
    Parameters.Add(Database.Parameter(authorCount));
    Parameters.Add(Database.Parameter(Database.Left(publication.Month, 32)));
    Parameters.Add(Database.Parameter(Database.Left(publication.Day, 32)));
    Parameters.Add(Database.Parameter(Database.Left(title, 244)));
    Parameters.Add(Database.Parameter(Database.Left(publication.Volume, 32)));
    Parameters.Add(Database.Parameter(Database.Left(publication.Issue, 32)));
    Parameters.Add(Database.Parameter(Database.Left(publication.Pages, 50)));
    // Publication type and the category number it maps to
    Parameters.Add(Database.Parameter(publication.PubType));
    Parameters.Add(Database.Parameter(PubTypes.GetCategoryNumber(publication.PubType)));
    DB.ExecuteNonQuery(
        @"INSERT INTO Publications (PMID, Journal, Year, Authors, Month, Day, Title, Volume, Issue, Pages, PubType, PubTypeCategoryID) VALUES (? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? , ? )", Parameters);

    return true;
}
/// <summary>
/// Read a string containing a Medline tag and its data and populate the
/// appropriate property of the publication
/// </summary>
/// <param name="publication">Publication whose properties are filled in</param>
/// <param name="MedlineTag">A Medline tag and its data (with newlines stripped out).
/// Null, too-short or malformed strings are ignored.</param>
/// <param name="pubTypes">PublicationTypes object; its OverrideFirstCategory table decides
/// whether a later PT tag replaces the first one</param>
public static void ProcessMedlineTag(ref Publication publication, string MedlineTag, PublicationTypes pubTypes)
{
    // Verify that the string is really a tag -- it must start with four characters
    // followed by a dash and a space. If not, don't process it.
    if ((MedlineTag == null) || (MedlineTag.Length < 6) || (MedlineTag.Substring(4, 2) != "- "))
    {
        return;
    }

    string data = MedlineTag.Substring(5).Trim();

    // Process the tags
    switch (MedlineTag.Substring(0, 4).Trim())
    {
        case "PMID": // "PMID- 16319490"
            {
                // int.MinValue marks a PMID that could not be read. TryParse
                // (instead of Convert.ToInt32) keeps an out-of-range number
                // from throwing and turns it into the same marker.
                int parsedPmid;
                if (IsNumeric(data) && int.TryParse(data, out parsedPmid))
                {
                    publication.PMID = parsedPmid;
                }
                else
                {
                    publication.PMID = int.MinValue;
                }
                break;
            }

        case "DP": // "DP  - 2005 Nov 24"
            // The length check keeps Substring(0, 4) from throwing on a
            // date shorter than four characters
            if ((data.Length >= 4) && IsNumeric(data.Substring(0, 4)))
            {
                // Grab the year first, then the rest is month and day.
                publication.Year = Convert.ToInt32(data.Substring(0, 4));
                string monthAndDay = data.Substring(4).Trim();
                int firstSpace = monthAndDay.IndexOf(' ');
                if (firstSpace >= 0)
                {
                    publication.Month = monthAndDay.Substring(0, firstSpace);
                    publication.Day = monthAndDay.Substring(firstSpace + 1);
                }
                else if (monthAndDay.Length != 0)
                {
                    publication.Month = monthAndDay;
                }
            }
            break;

        case "TI": // "TI  - Title..."
            publication.Title = data;
            break;

        case "TA": // "TA  - Hum Hered"
            publication.Journal = data;
            break;

        case "GR": // "GR  - GM 28356/GM/NIGMS"
            // Each GR tag adds one entry to the Grants list
            if (publication.Grants == null)
            {
                publication.Grants = new ArrayList();
            }
            publication.Grants.Add(data);
            break;

        case "PT": // "PT  - Clinical Trial"
            // Only copy PubType if it's the first PT tag encountered in this
            // publication or if the publication type is flagged as an
            // "override first category" publication type
            if ((publication.PubType == null) || pubTypes.OverrideFirstCategory.ContainsKey(data))
            {
                publication.PubType = data;
            }
            break;

        case "IP": // "IP  - 3"
            publication.Issue = data;
            break;

        case "PG": // "PG  - 134-142"
            publication.Pages = data;
            break;

        case "LA": // "LA  - eng"
            // Invariant lower-casing: language codes are identifiers, so the
            // result must not depend on the current culture
            publication.Language = data.ToLowerInvariant();
            break;

        case "VI": // "VI  - 60"
            publication.Volume = data;
            break;

        case "AU": // "AU  - Wang T"
            if (publication.Authors == null)
            {
                // First author: create the Authors array
                publication.Authors = new string[] { data };
            }
            else
            {
                // Otherwise, copy the authors to a larger array and put the
                // new author at the end
                string[] grown = new string[publication.Authors.Length + 1];
                publication.Authors.CopyTo(grown, 0);
                grown[grown.Length - 1] = data;
                publication.Authors = grown;
            }
            break;

        case "MH": // "MH  - <MeSH heading>"
            // If this is the first MeSH heading, create the new ArrayList
            if (publication.MeSHHeadings == null)
            {
                publication.MeSHHeadings = new ArrayList();
            }
            publication.MeSHHeadings.Add(data);
            break;
    }
}
/// <summary>
/// Strip the next publication off of the top of the string.
/// </summary>
/// <param name="MedlineData">A string containing the Medline data, passed
/// by reference so that the consumed text can be removed from it</param>
/// <param name="publication">Receives the publication that was read (a new,
/// empty Publication when nothing was read)</param>
/// <param name="pubTypes">PublicationTypes object, passed on to ProcessMedlineTag</param>
/// <returns>True if a publication was read, false otherwise</returns>
private static bool GetNextPublication(ref string MedlineData, out Publication publication, PublicationTypes pubTypes)
{
    Publication parsed = new Publication();
    StringReader reader = new StringReader(MedlineData);

    // No lines at all means no more publications
    string current = reader.ReadLine();
    if (current == null)
    {
        publication = new Publication();
        return false;
    }

    // Skip blank lines at the top: each pass drops the blank line that was
    // just read by resetting MedlineData to everything after it
    while (current.Trim().Length == 0)
    {
        MedlineData = reader.ReadToEnd();
        reader = new StringReader(MedlineData);
        current = reader.ReadLine();
        if (current == null)
        {
            publication = new Publication();
            return false;
        }
    }

    // MedlineData now starts at the first non-blank line; read it again
    // from the beginning
    reader = new StringReader(MedlineData);

    // Handle one tag per pass until a blank line or the end of the data
    for (current = reader.ReadLine();
         (current != null) && (current.Trim().Length != 0);
         current = reader.ReadLine())
    {
        // Lines that start with a space continue the current tag; join
        // them onto it, separated by single spaces
        while (reader.Peek() == ' ')
        {
            current = current + " " + reader.ReadLine().Trim();
        }

        Publications.ProcessMedlineTag(ref parsed, current, pubTypes);
    }

    // Whatever is left belongs to the following publications
    MedlineData = reader.ReadToEnd();
    publication = parsed;
    return true;
}
/// <summary>
/// Retrieve counts from a publication list: starting at Index, count the run
/// of publications that match a year and publication type, by author position
/// </summary>
/// <param name="PublicationList">Publication list to retrieve counts from,
/// sorted by year, publication type and author position. A null or empty
/// list leaves all counts at zero.</param>
/// <param name="Index">Offset in the publication list of the first publication
/// matching the year and publication type. It is advanced past every
/// publication that was counted.</param>
/// <param name="Year">Year to match for</param>
/// <param name="PublicationType">Publication type (category number) to match for</param>
/// <param name="PubTypes">PublicationTypes object used to turn each publication's type into a category number</param>
/// <param name="DB">Database passed to Publications.GetAuthorPosition</param>
/// <param name="person">Person whose author position is looked up for each publication</param>
/// <param name="Weights">Journal weights keyed by journal name, with float values</param>
/// <param name="PeoplePublicationsTable">Table name passed to Publications.GetAuthorPosition</param>
public Counts(Publication[] PublicationList, ref int Index, int Year, int PublicationType, PublicationTypes PubTypes, Database DB, Person person, Hashtable Weights, string PeoplePublicationsTable)
{
    // Return zero counts if the publication list is missing or empty.
    // (Publications.PublicationList is null, not empty, when nothing was found.)
    if ((PublicationList == null) || (PublicationList.Length == 0))
    {
        return;
    }

    // Return zero counts if the index is out of bounds
    if ((Index < 0) || (Index >= PublicationList.Length))
    {
        return;
    }

    // Return zero counts if the index doesn't point to a match -- that means
    // there are no matches
    Publication pub = PublicationList[Index];
    int PubType = PubTypes.GetCategoryNumber(pub.PubType);
    if ((pub.Year != Year) || (PubType != PublicationType))
    {
        return;
    }

    // If we get this far, we have a match. Move forward through the publication
    // list, adding to the counts, until we find a non-matching publication or
    // the list runs out.
    do
    {
        // Get the weight for the journal (zero when the journal is unknown)
        float Weight = 0;
        if (pub.Journal != null && Weights.ContainsKey(pub.Journal))
        {
            Weight += (float)Weights[pub.Journal];
        }

        // Get the position type, and increment the correct counter
        Harvester.AuthorPositions PositionType;
        Publications.GetAuthorPosition(DB, pub.PMID, person, out PositionType, PeoplePublicationsTable);
        switch (PositionType)
        {
            case Harvester.AuthorPositions.First:
                First++;
                FirstWeighted += Weight;
                break;

            case Harvester.AuthorPositions.Last:
                Last++;
                LastWeighted += Weight;
                break;

            case Harvester.AuthorPositions.Second:
                Second++;
                SecondWeighted += Weight;
                break;

            case Harvester.AuthorPositions.NextToLast:
                NextToLast++;
                NextToLastWeighted += Weight;
                break;

            // A position of None is counted together with Middle
            case Harvester.AuthorPositions.Middle:
            case Harvester.AuthorPositions.None:
                Middle++;
                MiddleWeighted += Weight;
                break;
        }

        // Step to the next publication, if there is one, so the loop
        // condition can test it
        Index++;
        if (Index < PublicationList.Length)
        {
            pub = PublicationList[Index];
            PubType = PubTypes.GetCategoryNumber(pub.PubType);
        }
    } while ((Index < PublicationList.Length)
          && (PublicationList[Index].Year == Year)
          && (PubType == PublicationType));
}
/// <summary>
/// Retrieve the publications for a person and write them to the database
/// </summary>
/// <param name="ncbi">NCBI web query object</param>
/// <param name="pubTypes">PublicationTypes object</param>
/// <param name="person">Person to retrieve publications for</param>
/// <param name="StatusCallback">Callback function to return status; called once per publication found</param>
/// <param name="MessageCallback">Callback function to send messages</param>
/// <param name="InterruptCallback">Callback function that returns true when the user interrupted the harvest</param>
/// <param name="AverageMilliseconds">Average time (in milliseconds) of each publication write</param>
/// <returns>Number of publications written. If the person was already harvested, the
/// number of PeoplePublications rows for the person; 0 if the NCBI search failed or
/// returned an error page.</returns>
public int GetPublications(NCBI ncbi, PublicationTypes pubTypes, Person person,
    GetPublicationsStatus StatusCallback, GetPublicationsMessage MessageCallback,
    CheckForInterrupt InterruptCallback, out double AverageMilliseconds)
{
    ArrayList Parameters;
    DateTime StartTime;
    DateTime EndTime;
    double TotalMilliseconds = 0;
    AverageMilliseconds = 0;
    int numberFound = 0;
    int numberWritten = 0;

    // Double-check that the person is really unharvested. If we try to
    // write publications for a person who already has publications, it will
    // cause an error -- and that could happen if this person was already
    // written from a duplicate person earlier.
    Parameters = new ArrayList();
    Parameters.Add(Database.Parameter(person.Setnb));
    int HarvestedCount = DB.GetIntValue("SELECT Count(*) FROM People WHERE Setnb = ? AND Harvested = 1", Parameters);
    if (HarvestedCount > 0)
    {
        MessageCallback("Already harvested publications for " + person.Last + " (" + person.Setnb + ")", false);
        Parameters = new ArrayList();
        Parameters.Add(Database.Parameter(person.Setnb));
        return DB.GetIntValue("SELECT Count(*) FROM PeoplePublications WHERE Setnb = ?", Parameters);
    }

    MessageCallback("Retrieving data from NCBI", true);

    // Find any other people with the same names and search criteria.
    // Any publications found for this person should also be found
    // for them, so when we write the rows to PeoplePublications later
    // we'll also write them for the other people as well.

    // Look in the database for any other people with the same
    // values for name1, name2, name3, name4, name5, name6, and MedlineSearch.
    // Name columns beyond the person's own names must be NULL.
    string NamesClause = "";
    Parameters = new ArrayList();
    for (int i = 0; i < 6; i++)
    {
        if (i < person.Names.Length)
        {
            Parameters.Add(Database.Parameter(person.Names[i]));
            NamesClause += " Name" + ((int)(i + 1)).ToString() + " = ? AND ";
        }
        else
        {
            NamesClause += " Name" + ((int)(i + 1)).ToString() + " IS NULL AND ";
        }
    }
    Parameters.Add(Database.Parameter(person.MedlineSearch));
    Parameters.Add(Database.Parameter(person.Setnb));
    DataTable Results = DB.ExecuteQuery("SELECT " + Database.PEOPLE_COLUMNS +
        @"FROM People WHERE Harvested = 0 AND " + NamesClause +
        @" MedlineSearch = ? AND Setnb <> ?", Parameters
        );
    ArrayList DuplicatePeople = new ArrayList();
    foreach (DataRow Row in Results.Rows)
    {
        Person dupe = new Person(Row, Results.Columns);
        DuplicatePeople.Add(dupe);
        MessageCallback("Also writing publications for " + dupe.Last + " (" + dupe.Setnb + ") with same names and search criteria", false);
    }

    // Search NCBI -- if an error is thrown, write that error to the database
    string results;
    try
    {
        results = ncbi.Search(person.MedlineSearch);

        // Only the first 100 characters are checked for the error marker.
        // The length is clamped so a short response does not make
        // Substring throw (which would be reported as a bogus error).
        string resultsStart = (results.Length > 100) ? results.Substring(0, 100) : results;
        if (resultsStart.Contains("Error occurred"))
        {
            // NCBI returns an HTML error page in the results
            //
            // <html>
            // <body>
            // <br/><h2>Error occurred: Unable to obtain query #1</h2><br/>
            // ...
            //
            // If NCBI returns an empty result set with no publications, it will give the error:
            // Error occurred: Empty result - nothing todo
            //
            // That error should generate a warning and mark the person as harvested in the database.
            // Any other error should be written to the database as an error.
            string Error = results.Substring(results.IndexOf("Error occurred", StringComparison.Ordinal));

            // Cut the message at the next tag, if there is one after it
            int tagStart = Error.IndexOf('<');
            if (tagStart >= 0)
            {
                Error = Error.Substring(0, tagStart);
            }

            string Message;
            if (Error.ToLower().Contains("empty result"))
            {
                Message = "Warning for " + person.Last + " (" + person.Setnb + "): no publications found (NCBI returned empty results)";
                person.Harvested = true;
                person.WriteToDB(DB);
            }
            else
            {
                Message = "Error reading publications for " + person.Last + " (" + person.Setnb + "): NCBI returned '" + Error + "'";
                person.WriteErrorToDB(DB, Message);
            }
            MessageCallback(Message, false);
            return 0;
        }
    }
    catch (Exception ex)
    {
        string Message = "Error reading publications for " + person.Last + " (" + person.Setnb + "): " + ex.Message;
        person.WriteErrorToDB(DB, Message);
        MessageCallback(Message, false);
        return 0;
    }

    Publications mpr = new Publications(results, pubTypes);
    if (mpr.PublicationList != null)
    {
        foreach (Publication publication in mpr.PublicationList)
        {
            numberFound++;

            // Exit immediately if the user interrupted the harvest
            if (InterruptCallback())
            {
                return numberWritten;
            }

            try
            {
                // Start timing, to calculate the average time returned in
                // the callback status function
                StartTime = DateTime.Now;

                // Find the (1-based) position of the person among the
                // authors. The search stops at the first author position
                // that sets a non-zero value; a "*" name that does not
                // match the author sets the position to -1.
                int AuthorPosition = 0;
                for (int i = 1; (publication.Authors != null) && (AuthorPosition == 0) && (i <= publication.Authors.Length); i++)
                {
                    foreach (string name in person.Names)
                    {
                        if (StringComparer.CurrentCultureIgnoreCase.Equals(publication.Authors[i - 1], name))
                        {
                            AuthorPosition = i;
                        }
                        else if (name == "*")
                        {
                            AuthorPosition = -1;
                        }
                    }
                }

                // A PMID of int.MinValue means the PMID could not be read, so
                // we don't have a way to process the publication -- it was
                // probably a Medline search result error.
                if (publication.PMID == int.MinValue)
                {
                    string errorMessage = "Found an invalid publication";
                    if (!string.IsNullOrEmpty(publication.Title))
                    {
                        errorMessage += " (Title = '" + publication.Title + "')";
                    }
                    person.WriteErrorToDB(DB, errorMessage);
                    MessageCallback(errorMessage, false);
                }
                else if (publication.PMID == 0)
                {
                    // Reported, but not written to the database as an error
                    string errorMessage = "WARNING: Found a publication with PMID = 0, not marking this as an error";
                    if (!string.IsNullOrEmpty(publication.Title))
                    {
                        errorMessage += " (Title = '" + publication.Title + "')";
                    }
                    MessageCallback(errorMessage, false);
                }
                // If for some reason the author doesn't exist in the publication, send a message back
                else if (AuthorPosition == 0)
                {
                    MessageCallback("Publication " + publication.PMID + " does not contain author " + person.Setnb, false);
                }
                else
                {
                    // Write the publication to the database
                    if (Publications.WriteToDB(publication, DB, pubTypes, Languages))
                    {
                        // Exit immediately if the user interrupted the harvest
                        if (InterruptCallback())
                        {
                            return numberWritten;
                        }

                        // Only increment the publication count if the publication
                        // is actually written or already in the database
                        numberWritten++;

                        // Only add the row to PeoplePublications if the publication
                        // was written, or was already in the database. (For example,
                        // if the publication is not in English, it won't be written.)
                        Publications.WritePeoplePublicationsToDB(DB, person, publication);

                        // Write the publication for each of the other people
                        foreach (Person dupe in DuplicatePeople)
                        {
                            Publications.WritePeoplePublicationsToDB(DB, dupe, publication);
                        }

                        // Calculate the average time per publication in milliseconds
                        EndTime = DateTime.Now;
                        TimeSpan Difference = EndTime - StartTime;
                        TotalMilliseconds += Difference.TotalMilliseconds;
                        AverageMilliseconds = TotalMilliseconds / numberWritten;
                    }
                }
            }
            catch (Exception ex)
            {
                // A failure on one publication is recorded and the loop
                // moves on to the next one
                person.WriteErrorToDB(DB, ex.Message);
                MessageCallback("Error writing publication " + publication.PMID.ToString() + ": " + ex.Message, false);
            }

            StatusCallback(numberFound, mpr.PublicationList.Length, (int)AverageMilliseconds);
        }
    }

    // Make sure each of the people with the same names and search query
    // are marked as harvested and have their errors cleared
    foreach (Person dupe in DuplicatePeople)
    {
        Parameters = new ArrayList();
        Parameters.Add(Database.Parameter(dupe.Setnb));
        DB.ExecuteNonQuery(
            @"UPDATE People SET Harvested = 1, Error = NULL, ErrorMessage = NULL WHERE Setnb = ?", Parameters);
    }

    // Once the publications are all read, update People.Harvested, as part of
    // the fault-tolerance scheme -- PeoplePublications rows are only "final" when
    // this value is updated for the person. Any others can be cleared using
    // ClearDataAfterInterruption().
    Parameters = new ArrayList();
    Parameters.Add(Database.Parameter(person.Setnb));
    DB.ExecuteNonQuery(@"UPDATE People SET Harvested = 1 WHERE Setnb = ?", Parameters);

    return numberWritten;
}