/// <summary>
/// Write the publications report: a CSV header row followed by one row
/// per publication for every person that is not being skipped
/// </summary>
/// <param name="SetnbsToSkip">Setnbs of people to leave out of the report (null to include everyone)</param>
/// <param name="writer">Writer to write the report to</param>
/// <param name="StatusCallback">Callback invoked once per person with the running count and the total</param>
/// <param name="MessageCallback">Callback that receives "skipping" and error messages</param>
public void PubsReport(ArrayList SetnbsToSkip, StreamWriter writer, ReportStatus StatusCallback, ReportMessage MessageCallback)
{
    // Header row
    string[] headerColumns = { "setnb", "pmid", "journal_name", "year", "Month", "day", "title", "Volume", "issue", "position","nbauthors","Bin", "Pages", "Publication_type" };
    writer.WriteLine(String.Join(",", headerColumns));

    // One pass over everybody in the people table
    People allPeople = new People(DB, PeopleTable);
    int personCount = allPeople.PersonList.Count;
    int personNumber = 0;
    foreach (Person person in allPeople.PersonList)
    {
        personNumber++;
        StatusCallback(personNumber, personCount, person, false);

        // People listed in SetnbsToSkip only get a message, no rows
        bool skipPerson = (SetnbsToSkip != null) && SetnbsToSkip.Contains(person.Setnb);
        if (skipPerson)
        {
            MessageCallback("Skipping " + person.Last + " (" + person.Setnb + ")");
            continue;
        }

        // Loading the publications can throw (the original author notes it
        // does so when the person has none); report the problem through the
        // message callback and carry on with the next person
        Publications personPubs = null;
        try
        {
            personPubs = new Publications(DB, person, PeoplePublicationsTable, false);
        }
        catch (Exception ex)
        {
            MessageCallback("Unable to retrive publications for " + person.Last + " (" + person.Setnb + "): " + ex.Message);
        }

        if (personPubs == null || personPubs.PublicationList == null)
        {
            continue;
        }

        // One report row per publication
        foreach (Publication pub in personPubs.PublicationList)
        {
            writer.WriteLine(PubsReportRow(person, pub));
        }
    }
}
/// <summary>
/// Write the MeSH Heading report: one row per person per year per heading,
/// giving the number of that person's publications in that year that carry
/// the heading
/// </summary>
/// <param name="writer">Writer to send the report to</param>
/// <param name="StatusCallback">Callback invoked once per person with the running count and the total</param>
/// <param name="MessageCallback">Callback that receives the message of any exception raised while processing a person</param>
public void MeSHHeadingReport(StreamWriter writer, ReportStatus StatusCallback, ReportMessage MessageCallback)
{
    // Header
    writer.WriteLine("setnb,year,heading,count");

    People allPeople = new People(DB, PeopleTable);
    int personCount = allPeople.PersonList.Count;
    int personNumber = 0;
    foreach (Person person in allPeople.PersonList)
    {
        // Report status
        personNumber++;
        StatusCallback(personNumber, personCount, person, false);

        // A failure for one person is reported and does not stop the report
        try
        {
            // Year range covered by the person's publications (0 = not seen yet)
            int firstYear = 0;
            int lastYear = 0;

            Publications personPubs = new Publications(DB, person, PeoplePublicationsTable, false);

            // countsByYear: year -> (heading -> number of occurrences)
            Hashtable countsByYear = new Hashtable();
            if (personPubs.PublicationList != null)
            {
                foreach (Publication pub in personPubs.PublicationList)
                {
                    if (firstYear == 0 || firstYear > pub.Year)
                    {
                        firstYear = pub.Year;
                    }
                    if (lastYear == 0 || lastYear < pub.Year)
                    {
                        lastYear = pub.Year;
                    }

                    // Make sure the year has a heading table, even if this
                    // publication turns out to have no headings
                    Hashtable headingCounts = (Hashtable)countsByYear[pub.Year];
                    if (headingCounts == null)
                    {
                        headingCounts = new Hashtable();
                        countsByYear[pub.Year] = headingCounts;
                    }

                    if (pub.MeSHHeadings == null)
                    {
                        continue;
                    }
                    foreach (string heading in pub.MeSHHeadings)
                    {
                        object previous = headingCounts[heading];
                        headingCounts[heading] = (previous == null) ? 1 : ((int)previous) + 1;
                    }
                }
            }

            // Emit the rows in ascending year order
            for (int year = firstYear; year <= lastYear; year++)
            {
                Hashtable headingCounts = (Hashtable)countsByYear[year];
                if (headingCounts == null)
                {
                    // No publications in this year
                    continue;
                }

                foreach (DictionaryEntry entry in headingCounts)
                {
                    StringWriter row = new StringWriter();
                    row.Write(person.Setnb);                           // setnb
                    Reports.WriteCSV(year.ToString(), row);            // year
                    Reports.WriteCSV((string)entry.Key, row);          // heading
                    Reports.WriteCSV(entry.Value.ToString(), row);     // count
                    writer.WriteLine(row.ToString());
                }
            }
        }
        catch (Exception ex)
        {
            MessageCallback(ex.Message);
        }
    }
}
/// <summary>
/// Write the People report: a header row generated from
/// PeopleReportSections, followed by one row per person per year between
/// that person's earliest and latest publication year
/// </summary>
/// <param name="SetnbsToSkip">Setnbs of people to leave out of the report (null to include everyone)</param>
/// <param name="writer">Writer to write the CSV rows to</param>
/// <param name="StatusCallback">Callback invoked once per person, and once per year within a person</param>
/// <param name="MessageCallback">Callback that receives "skipping" and error messages</param>
public void PeopleReport(ArrayList SetnbsToSkip, StreamWriter writer, ReportStatus StatusCallback, ReportMessage MessageCallback)
{
    // Write the header row -- this must be generated dynamically
    // based on the values in PeopleReportSections

    // The same twelve base columns are repeated for every section
    string[] BaseColumnNames = {
        "pubcount", "wghtd_pubcount", "pubcount_pos1", "wghtd_pubcount_pos1",
        "pubcount_posN", "wghtd_pubcount_posN", "pubcount_posM", "wghtd_pubcount_posM",
        "pubcount_posNTL", "wghtd_pubcount_posNTL", "pubcount_pos2", "wghtd_pubcount_pos2"
    };

    // write the keys
    writer.Write("setnb,year");

    // write a set of column names for each element in PeopleReportSections
    for (int i = 0; i < PeopleReportSections.Length; i++)
    {
        string values = PeopleReportSections[i].ToLower().Trim();
        if (values == "all")
        {
            // all bins -- use the base column names as-is
            writer.Write("," + String.Join(",", BaseColumnNames));
        }
        else
        {
            // strip any +'s from the value type, so "1+2+3" turns into "123"
            values = values.Replace("+", "");

            // replace pubcount_posM with 123pubcount_posM
            // replace wghtd_pubcount_pos1 with wghtd_123pubcount_pos1
            for (int j = 0; j < BaseColumnNames.Length; j++)
            {
                string Column;
                if (BaseColumnNames[j].Contains("wghtd_"))
                {
                    Column = BaseColumnNames[j].Replace("wghtd_", "wghtd_" + values);
                }
                else
                {
                    Column = values + BaseColumnNames[j];
                }
                writer.Write("," + Column);
            }
        }
    }
    writer.WriteLine();

    // Write the rows for each person
    People people = new People(DB, PeopleTable);
    int Total = people.PersonList.Count;
    int Number = 0;
    foreach (Person person in people.PersonList)
    {
        Number++;
        StatusCallback(Number, Total, person, false);

        // Skip the person if the Setnb is in SetnbsToSkip
        if ((SetnbsToSkip != null) && SetnbsToSkip.Contains(person.Setnb))
        {
            MessageCallback("Skipping " + person.Last + " (" + person.Setnb + ")");
            continue;
        }

        // Get the person's publications. If there are no publications for
        // the person, this will throw an error, which is reported through
        // the message callback.
        Publications pubs;
        try
        {
            pubs = new Publications(DB, person, PeoplePublicationsTable, false);
        }
        catch (Exception ex)
        {
            MessageCallback(ex.Message);
            pubs = null;
        }

        // Nothing to report for a person without publications. The list must
        // be checked BEFORE it is sorted or indexed: Array.Sort throws on a
        // null array and PublicationList[0] throws on an empty one, which
        // would abort the report for everyone else.
        if (pubs == null || pubs.PublicationList == null || pubs.PublicationList.Length == 0)
        {
            continue;
        }

        // Sort the list of publications -- ReportRow relies on this ordering
        // when it searches the list by year
        PublicationComparer Comparer = new PublicationComparer();
        Comparer.DB = DB;
        Comparer.person = person;
        Comparer.publicationTypes = PubTypes;
        Array.Sort(pubs.PublicationList, Comparer);

        // Find the minimum and maximum years
        int YearMinimum = pubs.PublicationList[0].Year;
        int YearMaximum = pubs.PublicationList[0].Year;
        foreach (Publication pub in pubs.PublicationList)
        {
            if (pub.Year < YearMinimum)
            {
                YearMinimum = pub.Year;
            }
            if (pub.Year > YearMaximum)
            {
                YearMaximum = pub.Year;
            }
        }

        // Write one row per year in the range, including years in which
        // the person has no publications
        for (int Year = YearMinimum; Year <= YearMaximum; Year++)
        {
            StatusCallback(Year - YearMinimum, YearMaximum - YearMinimum, person, true);
            writer.WriteLine(ReportRow(person, pubs, Year));
        }
    }
}
/// <summary>
/// Create a row in the People report
/// </summary>
/// <param name="person">Person to write</param>
/// <param name="Pubs">Publications to use as input. The publication list must
/// already be sorted (PeopleReport sorts it with PublicationComparer), because
/// it is searched by year with a binary search.</param>
/// <param name="Year">Year to write</param>
/// <returns>The row in CSV format</returns>
/// <exception cref="Exception">PeopleReportSections contains a section that is
/// neither "all", a known category number, nor a "+"-separated list of known
/// category numbers</exception>
public string ReportRow(Person person, Publications Pubs, int Year)
{
    // This function only loops through the list of publications once. To make
    // that possible the list is sorted beforehand, and a single Index is
    // passed by reference to each Counts constructor, which advances it past
    // the publications it has counted.
    //
    // The row is divided into sections: the two key columns, then one set of
    // count columns for each entry in PeopleReportSections.
    //
    // NOTE(review): the category list is queried from the database on every
    // call (i.e. once per person per year); caching it would need state
    // outside this method.
    StringBuilder sb = new StringBuilder();
    StringWriter writer = new StringWriter(sb);

    // Write the keys
    //    setnb -- Person identifier
    //    year  -- Year of the row
    writer.Write(person.Setnb + ",");
    writer.Write(Year.ToString());

    // The array has been sorted, so we can search for the year. Note that the
    // binary search may not return the first matching index, so we need to
    // rewind. A negative result (year not found) is left as-is: the Counts
    // constructor treats an out-of-range index as "no matches".
    PublicationYearFinder YearFinder = new PublicationYearFinder();
    int Index = Array.BinarySearch(Pubs.PublicationList, Year, YearFinder);
    while ((Index > 0) && (Pubs.PublicationList[Index - 1].Year == Year))
    {
        Index--;
    }

    // Build a Hashtable, keyed by publication type category ID, holding a
    // Counts object for each category. The categories are processed in
    // ascending ID order so the shared Index moves forward only.
    Hashtable CategoryCounts = new Hashtable();
    DataTable CategoryTable = DB.ExecuteQuery(
        @"SELECT DISTINCT PubTypeCategoryID FROM PubTypeCategories ORDER BY PubTypeCategoryID;");
    int NumCategories = CategoryTable.Rows.Count;
    int[] Categories = new int[NumCategories];
    for (int RowNum = 0; RowNum < NumCategories; RowNum++)
    {
        int Category = Convert.ToInt32(CategoryTable.Rows[RowNum]["PubTypeCategoryID"]);
        Categories[RowNum] = Category;
        CategoryCounts[Category] = new Counts(
            Pubs.PublicationList, ref Index, Year, Category, PubTypes, DB, person, Weights, PeoplePublicationsTable);
    }

    // For each section in PeopleReportSections, write the appropriate section
    // using the Counts objects that were just calculated
    for (int SectionNum = 0; SectionNum < PeopleReportSections.Length; SectionNum++)
    {
        string Section = PeopleReportSections[SectionNum];

        // Recognise "all" the same way the header row in PeopleReport does
        // (case-insensitive, surrounding whitespace ignored), so a section
        // that gets header columns also gets data columns
        if (Section.Trim().ToLower() == "all")
        {
            // The section is "all" -- generate a count of all values
            Counts[] AllCountObjects = new Counts[NumCategories];
            for (int i = 0; i < NumCategories; i++)
            {
                AllCountObjects[i] = (Counts)CategoryCounts[Categories[i]];
            }
            Counts AllCounts = new Counts(AllCountObjects);
            AllCounts.WriteCounts(writer);
        }
        else if (Section.Contains("+"))
        {
            // The section is a sum of categories (like "1+2+3")
            string[] SectionSplit = Section.Split('+');
            Counts[] SumCountObjects = new Counts[SectionSplit.Length];
            for (int i = 0; i < SectionSplit.Length; i++)
            {
                SumCountObjects[i] = FindCategoryCounts(CategoryCounts, SectionSplit[i], Section);
            }
            Counts SumCounts = new Counts(SumCountObjects);
            SumCounts.WriteCounts(writer);
        }
        else
        {
            // The section contains a single bin. Look it up by category ID
            // (not by position in the Categories array, which only coincides
            // with the ID when the IDs happen to be 0, 1, 2, ...).
            Counts SingleBinCounts = FindCategoryCounts(CategoryCounts, Section, Section);
            SingleBinCounts.WriteCounts(writer);
        }
    }

    return sb.ToString();
}

/// <summary>
/// Look up the Counts object for one category number taken from a report section
/// </summary>
/// <param name="CategoryCounts">Counts objects keyed by category ID</param>
/// <param name="CategoryText">Text of the category number to look up</param>
/// <param name="Section">Full section text, used in the error message</param>
/// <returns>The Counts object for the category</returns>
/// <exception cref="Exception">The text is not numeric or is not a known category ID</exception>
private static Counts FindCategoryCounts(Hashtable CategoryCounts, string CategoryText, string Section)
{
    if (!Publications.IsNumeric(CategoryText))
    {
        throw new Exception("ReportSections contains invalid section '" + Section + "'");
    }

    int CategoryID = Convert.ToInt32(CategoryText);
    if (!CategoryCounts.ContainsKey(CategoryID))
    {
        throw new Exception("ReportSections contains invalid section '" + Section + "'");
    }

    return (Counts)CategoryCounts[CategoryID];
}
/// <summary>
/// Retrieve counts from a publication list. Starting at Index, counts the
/// run of consecutive publications that match both the year and the
/// publication type category, and leaves Index pointing just past that run.
/// All counts stay at zero when there is nothing to count.
/// </summary>
/// <param name="PublicationList">Publication list to retrieve counts from,
/// sorted by year, publication type and author position</param>
/// <param name="Index">Offset in the publication list of the first publication
/// matching the year and publication type; advanced past every publication
/// that is counted</param>
/// <param name="Year">Year to match for</param>
/// <param name="PublicationType">Publication type category to match for</param>
/// <param name="PubTypes">Used to map a publication's type to its category number</param>
/// <param name="DB">Database passed to the author position lookup</param>
/// <param name="person">Person whose author position is looked up</param>
/// <param name="Weights">Journal weights keyed by journal name</param>
/// <param name="PeoplePublicationsTable">Table name passed to the author position lookup</param>
public Counts(Publication[] PublicationList, ref int Index, int Year, int PublicationType,
    PublicationTypes PubTypes, Database DB, Person person, Hashtable Weights, string PeoplePublicationsTable)
{
    // Nothing to count: empty list, or an index outside the list
    int listLength = PublicationList.Length;
    if (listLength == 0 || Index < 0 || Index >= listLength)
    {
        return;
    }

    // Nothing to count if the index doesn't point at a matching publication
    Publication current = PublicationList[Index];
    int currentCategory = PubTypes.GetCategoryNumber(current.PubType);
    if (current.Year != Year || currentCategory != PublicationType)
    {
        return;
    }

    // "current" is a match. Count it, step forward, and keep going until
    // the list runs out or the next publication no longer matches.
    while (true)
    {
        // Weight of the journal (zero when the journal is missing or unweighted)
        float journalWeight = 0;
        if (current.Journal != null && Weights.ContainsKey(current.Journal))
        {
            journalWeight += (float)Weights[current.Journal];
        }

        // Bump the plain and weighted counters for the author's position
        Harvester.AuthorPositions position;
        Publications.GetAuthorPosition(DB, current.PMID, person, out position, PeoplePublicationsTable);
        switch (position)
        {
            case Harvester.AuthorPositions.First:
                First++;
                FirstWeighted += journalWeight;
                break;

            case Harvester.AuthorPositions.Second:
                Second++;
                SecondWeighted += journalWeight;
                break;

            case Harvester.AuthorPositions.NextToLast:
                NextToLast++;
                NextToLastWeighted += journalWeight;
                break;

            case Harvester.AuthorPositions.Last:
                Last++;
                LastWeighted += journalWeight;
                break;

            // "None" is counted together with "Middle"
            case Harvester.AuthorPositions.Middle:
            case Harvester.AuthorPositions.None:
                Middle++;
                MiddleWeighted += journalWeight;
                break;
        }

        Index++;
        if (Index >= listLength)
        {
            break;
        }

        current = PublicationList[Index];
        currentCategory = PubTypes.GetCategoryNumber(current.PubType);
        if (current.Year != Year || currentCategory != PublicationType)
        {
            break;
        }
    }
}
/// <summary>
/// Retrieve the publications for a person and write them to the database
/// </summary>
/// <param name="ncbi">NCBI web query object</param>
/// <param name="pubTypes">PublicationTypes object</param>
/// <param name="person">Person to retrieve publications for</param>
/// <param name="StatusCallback">Callback function to return status</param>
/// <param name="MessageCallback">Callback function to send messages</param>
/// <param name="InterruptCallback">Callback function that returns true when the
/// user has interrupted the harvest; the method then returns immediately
/// without marking the person as harvested</param>
/// <param name="AverageMilliseconds">Average time (in milliseconds) of each publication write</param>
/// <returns>Number of publications written (or, if the person was already
/// harvested, the number of PeoplePublications rows for the person)</returns>
public int GetPublications(NCBI ncbi, PublicationTypes pubTypes, Person person,
    GetPublicationsStatus StatusCallback, GetPublicationsMessage MessageCallback,
    CheckForInterrupt InterruptCallback, out double AverageMilliseconds)
{
    ArrayList Parameters;
    DateTime StartTime;
    DateTime EndTime;
    double TotalMilliseconds = 0;
    AverageMilliseconds = 0;
    int numberFound = 0;
    int numberWritten = 0;

    // Double-check that the person is really unharvested. If we try to
    // write publications for a person who already has publications, it will
    // cause an error -- and that could happen if this person was already
    // written from a duplicate person earlier.
    Parameters = new ArrayList();
    Parameters.Add(Database.Parameter(person.Setnb));
    int HarvestedCount = DB.GetIntValue("SELECT Count(*) FROM People WHERE Setnb = ? AND Harvested = 1", Parameters);
    if (HarvestedCount > 0)
    {
        MessageCallback("Already harvested publications for " + person.Last + " (" + person.Setnb + ")", false);

        // A fresh parameter list is built for the second query rather than
        // reusing the one already handed to the first query
        Parameters = new ArrayList();
        Parameters.Add(Database.Parameter(person.Setnb));
        return DB.GetIntValue("SELECT Count(*) FROM PeoplePublications WHERE Setnb = ?", Parameters);
    }

    MessageCallback("Retrieving data from NCBI", true);

    // Find any other unharvested people with the same names and search
    // criteria (name1..name6 and MedlineSearch). Any publications found for
    // this person should also be found for them, so when we write the rows
    // to PeoplePublications later we also write them for the other people.
    string NamesClause = "";
    Parameters = new ArrayList();
    for (int i = 0; i < 6; i++)
    {
        if (i < person.Names.Length)
        {
            Parameters.Add(Database.Parameter(person.Names[i]));
            NamesClause += " Name" + (i + 1).ToString() + " = ? AND ";
        }
        else
        {
            NamesClause += " Name" + (i + 1).ToString() + " IS NULL AND ";
        }
    }
    Parameters.Add(Database.Parameter(person.MedlineSearch));
    Parameters.Add(Database.Parameter(person.Setnb));
    DataTable Results = DB.ExecuteQuery("SELECT " + Database.PEOPLE_COLUMNS
        + @"FROM People WHERE Harvested = 0 AND " + NamesClause + @" MedlineSearch = ? AND Setnb <> ?",
        Parameters);

    ArrayList DuplicatePeople = new ArrayList();
    foreach (DataRow Row in Results.Rows)
    {
        Person dupe = new Person(Row, Results.Columns);
        DuplicatePeople.Add(dupe);
        MessageCallback("Also writing publications for " + dupe.Last + " (" + dupe.Setnb + ") with same names and search criteria", false);
    }

    // Search NCBI -- if an error is thrown, write that error to the database
    string results;
    try
    {
        results = ncbi.Search(person.MedlineSearch);

        // Only the start of the response is inspected for the error marker.
        // The response can be shorter than 100 characters (a short error
        // page, or an empty response), so clamp the length instead of
        // letting Substring throw.
        string resultsStart = results.Substring(0, Math.Min(100, results.Length));
        if (resultsStart.Contains("Error occurred"))
        {
            // NCBI returns an HTML error page in the results
            //
            // <html>
            // <body>
            // <br/><h2>Error occurred: Unable to obtain query #1</h2><br/>
            // ...
            //
            // If NCBI returns an empty result set with no publications, it will give the error:
            // Error occurred: Empty result - nothing todo
            //
            // That error should generate a warning and mark the person as harvested in the database.
            // Any other error should be written to the database as an error.
            string Error = results.Substring(results.IndexOf("Error occurred"));

            // Cut the message off at the next tag, if there is one. The tag
            // must be looked for in Error itself: the full response may
            // contain "<" only before the error text.
            int TagStart = Error.IndexOf('<');
            if (TagStart >= 0)
            {
                Error = Error.Substring(0, TagStart);
            }

            string Message;
            if (Error.ToLower().Contains("empty result"))
            {
                Message = "Warning for " + person.Last + " (" + person.Setnb + "): no publications found (NCBI returned empty results)";
                person.Harvested = true;
                person.WriteToDB(DB);
            }
            else
            {
                Message = "Error reading publications for " + person.Last + " (" + person.Setnb + "): NCBI returned '" + Error + "'";
                person.WriteErrorToDB(DB, Message);
            }
            MessageCallback(Message, false);
            return 0;
        }
    }
    catch (Exception ex)
    {
        string Message = "Error reading publications for " + person.Last + " (" + person.Setnb + "): " + ex.Message;
        person.WriteErrorToDB(DB, Message);
        MessageCallback(Message, false);
        return 0;
    }

    Publications mpr = new Publications(results, pubTypes);
    if (mpr.PublicationList != null)
    {
        foreach (Publication publication in mpr.PublicationList)
        {
            numberFound++;

            // Exit immediately if the user interrupted the harvest
            if (InterruptCallback())
            {
                return numberWritten;
            }

            try
            {
                // Time each write so the average can be returned through the
                // status callback. UTC is used so that a local clock shift
                // (e.g. daylight saving) cannot distort the elapsed time.
                StartTime = DateTime.UtcNow;

                // Find the 1-based position of the person among the authors.
                // 0 means "not found"; -1 means one of the person's names is
                // the wildcard "*". The scan stops at the first author
                // position that sets a non-zero value.
                int AuthorPosition = 0;
                for (int i = 1; (publication.Authors != null) && (AuthorPosition == 0) && (i <= publication.Authors.Length); i++)
                {
                    foreach (string name in person.Names)
                    {
                        if (StringComparer.CurrentCultureIgnoreCase.Equals(publication.Authors[i - 1], name))
                        {
                            AuthorPosition = i;
                        }
                        else if (name == "*")
                        {
                            AuthorPosition = -1;
                        }
                    }
                }

                if (publication.PMID == int.MinValue)
                {
                    // A PMID of int.MinValue is reported as an invalid
                    // publication and recorded as an error for the person
                    string errorMessage = "Found an invalid publication";
                    if (!string.IsNullOrEmpty(publication.Title))
                    {
                        errorMessage += " (Title = '" + publication.Title + "')";
                    }
                    person.WriteErrorToDB(DB, errorMessage);
                    MessageCallback(errorMessage, false);
                }
                else if (publication.PMID == 0)
                {
                    // A PMID of 0 is skipped with a warning only
                    string errorMessage = "WARNING: Found a publication with PMID = 0, not marking this as an error";
                    if (!string.IsNullOrEmpty(publication.Title))
                    {
                        errorMessage += " (Title = '" + publication.Title + "')";
                    }
                    MessageCallback(errorMessage, false);
                }
                else if (AuthorPosition == 0)
                {
                    // If for some reason the author doesn't exist in the publication, send a message back
                    MessageCallback("Publication " + publication.PMID + " does not contain author " + person.Setnb, false);
                }
                else if (Publications.WriteToDB(publication, DB, pubTypes, Languages))
                {
                    // Exit immediately if the user interrupted the harvest
                    if (InterruptCallback())
                    {
                        return numberWritten;
                    }

                    // Only increment the publication count if the publication
                    // is actually written or already in the database
                    numberWritten++;

                    // Only add the row to PeoplePublications if the publication
                    // was written, or was already in the database. (For example,
                    // if the publication is not in English, it won't be written.)
                    Publications.WritePeoplePublicationsToDB(DB, person, publication);

                    // Write the publication for each of the other people
                    foreach (Person dupe in DuplicatePeople)
                    {
                        Publications.WritePeoplePublicationsToDB(DB, dupe, publication);
                    }

                    // Calculate the average time per publication in milliseconds
                    EndTime = DateTime.UtcNow;
                    TimeSpan Difference = EndTime - StartTime;
                    TotalMilliseconds += Difference.TotalMilliseconds;
                    AverageMilliseconds = TotalMilliseconds / numberWritten;
                }
            }
            catch (Exception ex)
            {
                // A failure on one publication is recorded and the harvest
                // continues with the next one
                person.WriteErrorToDB(DB, ex.Message);
                MessageCallback("Error writing publication " + publication.PMID.ToString() + ": " + ex.Message, false);
            }

            StatusCallback(numberFound, mpr.PublicationList.Length, (int)AverageMilliseconds);
        }
    }

    // Make sure each of the people with the same names and search query
    // are marked as harvested and have their errors cleared
    foreach (Person dupe in DuplicatePeople)
    {
        Parameters = new ArrayList();
        Parameters.Add(Database.Parameter(dupe.Setnb));
        DB.ExecuteNonQuery(
            @"UPDATE People SET Harvested = 1, Error = NULL, ErrorMessage = NULL WHERE Setnb = ?",
            Parameters);
    }

    // Once the publications are all read, update People.Harvested, as part of
    // the fault-tolerance scheme -- PeoplePublications rows are only "final" when
    // this value is updated for the person. Any others can be cleared using
    // ClearDataAfterInterruption().
    Parameters = new ArrayList();
    Parameters.Add(Database.Parameter(person.Setnb));
    DB.ExecuteNonQuery(@"UPDATE People SET Harvested = 1 WHERE Setnb = ?", Parameters);

    return numberWritten;
}