/// <summary>
/// Access the web to resolve protein metadata not directly found in fasta file.
/// The fasta text importer will have left search hints in ProteinMetadata.
/// </summary>
/// <param name="progressMonitor">receives progress updates and is checked for cancellation between steps</param>
/// <param name="status">progress status, passed by reference to every progress update</param>
/// <param name="fastaImporter">object that accesses the web, or pretends to if in a test</param>
/// <param name="parseOnly">if true, attempt to parse protein metadata from descriptions but do not proceed to web access</param>
/// <param name="done">will return true if there is nothing more to look up; false on any early return</param>
/// <returns>true on success, false if a progress update or cancellation check asked us to stop</returns>
public bool LookupProteinMetadata(IProgressMonitor progressMonitor, ref IProgressStatus status, WebEnabledFastaImporter fastaImporter, bool parseOnly, out bool done)
{
    // Above this many ids we read lengths for the whole table instead of building an IN (...) list
    const int maxIdsInClause = 1000;

    var unsearchedProteins = new List<ProteinSearchInfo>();
    done = false;

    // If we're here, it's because the background loader is done digesting and has moved on to protein metadata,
    // or because the PeptideSettingsUI thread needs to have protein metadata resolved for uniqueness purposes before
    // it can proceed.  Either way, we should be working on a temp copy and be the only one needing write access, so get a lock now
    using (ISession session = OpenWriteSession()) // We may update the protdb file with web search results
    {
        if (!UpdateProgressAndCheckForCancellation(progressMonitor, ref status,
            Resources.ProteomeDb_LookupProteinMetadata_looking_for_unresolved_protein_details, 0))
        {
            return false;
        }

        // get a list of proteins with unresolved metadata websearches
        var proteinNames = session.CreateCriteria(typeof(DbProteinName)).List<DbProteinName>()
            .Where(x => x.WebSearchInfo.NeedsSearch())
            .ToList();
        var proteinsToSearch = proteinNames
            .Where(proteinName => proteinName.GetProteinMetadata().GetPendingSearchTerm().Length > 0)
            .ToList();
        if (!UpdateProgressAndCheckForCancellation(progressMonitor, ref status,
            Resources.ProteomeDb_LookupProteinMetadata_looking_for_unresolved_protein_details, 0))
        {
            return false;
        }

        // and a list of proteins which have never been considered for metadata search
        var untaggedProteins = proteinNames.Where(proteinName => proteinName.WebSearchInfo.IsEmpty()).ToList();
        foreach (var untaggedProtein in untaggedProteins)
        {
            untaggedProtein.SetWebSearchCompleted(); // by default take this out of consideration for next time
            var metadata = untaggedProtein.GetProteinMetadata();
            if (metadata.HasMissingMetadata())
            {
                var search = fastaImporter.ParseProteinMetaData(metadata);
                if (search != null)
                {
                    metadata = untaggedProtein.ChangeProteinMetadata(metadata.Merge(search)); // don't stomp name by accident
                    metadata = untaggedProtein.ChangeProteinMetadata(metadata.ChangeWebSearchInfo(search.WebSearchInfo));
                }
            }
            if (metadata.NeedsSearch())
            {
                proteinsToSearch.Add(untaggedProtein); // add to the list of things to commit back to the db
            }
        }

        // Get the lengths of the sequences without getting the sequences themselves, for best speed
        var proteinIds = proteinsToSearch.Select(name => name.Protein.Id.Value).Distinct().ToArray();
        var proteinLengths = new Dictionary<long, int>();
        if (proteinIds.Length > 0) // With no ids there is nothing to read, and an empty "IN ()" list is not portable SQL
        {
            using (var cmd = session.Connection.CreateCommand())
            {
                string sql = @"SELECT Id, LENGTH(Sequence) AS SequenceLength FROM ProteomeDbProtein P";
                if (proteinIds.Length < maxIdsInClause)
                {
                    sql += @" WHERE P.Id IN (" + string.Join(@",", proteinIds) + @")";
                }
                cmd.CommandText = sql;
                using (var reader = cmd.ExecuteReader())
                {
                    while (reader.Read())
                    {
                        var id = reader.GetValue(0);
                        var len = reader.GetValue(1);
                        proteinLengths.Add(Convert.ToInt64(id), Convert.ToInt32(len));
                        if (proteinLengths.Count % 100 == 0) // Periodic cancellation check
                        {
                            if (!UpdateProgressAndCheckForCancellation(progressMonitor, ref status,
                                Resources.ProteomeDb_LookupProteinMetadata_looking_for_unresolved_protein_details, 0))
                            {
                                return false;
                            }
                        }
                    }
                }
            }
        }

        foreach (var p in proteinsToSearch)
        {
            // A protein whose length was not read gets length 0 (TryGetValue leaves the default)
            int length;
            proteinLengths.TryGetValue(p.Protein.Id.GetValueOrDefault(), out length);
            unsearchedProteins.Add(new ProteinSearchInfo(p, length));
        }

        // Write back the ones that were formerly without search terms, but which now indicate no search is possible
        var nowUnsearchable = untaggedProteins
            .Where(untagged => !untagged.GetProteinMetadata().NeedsSearch())
            .ToList();
        if (nowUnsearchable.Count > 0) // did any get set as unsearchable?
        {
            using (var transaction = session.BeginTransaction())
            {
                foreach (var untagged in nowUnsearchable)
                {
                    session.SaveOrUpdate(untagged); // update the metadata
                }
                transaction.Commit();
            }
        }

        if (unsearchedProteins.Count > 0 && !parseOnly)
        {
            int resultsCount = 0;
            int unsearchedCount = unsearchedProteins.Count;
            bool success;
            do
            {
                success = false; // Until we see at least one succeed this round
                var results = new List<DbProteinName>();
                if (progressMonitor.IsCanceled)
                {
                    return false;
                }

                // The "true" arg means "do just one batch then return"
                foreach (var result in fastaImporter.DoWebserviceLookup(unsearchedProteins, progressMonitor, true))
                {
                    if (result == null)
                    {
                        continue;
                    }
                    string message = string.Format(
                        Resources.ProteomeDb_LookupProteinMetadata_Retrieving_details_for__0__proteins,
                        unsearchedProteins.Count);
                    // Make it clearer when web access is faked during testing
                    if (fastaImporter.IsAccessFaked)
                    {
                        message = @"FAKED: " + message;
                    }
                    if (!UpdateProgressAndCheckForCancellation(progressMonitor, ref status, message,
                        100 * resultsCount++ / unsearchedCount))
                    {
                        return false;
                    }
                    success = true;
                    results.Add(result.ProteinDbInfo);
                }

                if (results.Count > 0) // save this batch
                {
                    using (var transaction = session.BeginTransaction())
                    {
                        foreach (var result in results)
                        {
                            session.SaveOrUpdate(result);
                        }
                        transaction.Commit();
                    }
                }

                // Edit this list rather than rederive with database access:
                // drop everything that no longer needs a search
                unsearchedProteins.RemoveAll(p => !p.GetProteinMetadata().NeedsSearch());
            } while (success); // Stop once a whole round produced no results
        }
        done = unsearchedProteins.Count == 0;
    } // End writesession
    return true;
}
/// <summary>
/// Access the web to resolve protein metadata not directly found in fasta file.
/// The fasta text importer will have left search hints in ProteinMetadata.
/// </summary>
/// <param name="progressMonitor">invoked with a message and percent complete; a false return stops the lookup</param>
/// <param name="fastaImporter">object that accesses the web, or pretends to if in a test</param>
/// <param name="polite">if true, don't try to resolve everything in one go, assume we can come back later
/// (NOTE: not currently consulted by this implementation)</param>
/// <returns>true on success, false if the progress monitor asked us to stop</returns>
public bool LookupProteinMetadata(ProgressMonitor progressMonitor, WebEnabledFastaImporter fastaImporter, bool polite = false)
{
    // Above this many ids we read lengths for the whole table instead of building an IN (...) list
    const int maxIdsInClause = 1000;

    var unsearchedProteins = new List<ProteinSearchInfo>();
    List<DbProteinName> untaggedProteins;
    using (ISession session = OpenSession())
    {
        if (!progressMonitor.Invoke(Resources.ProteomeDb_LookupProteinMetadata_looking_for_unresolved_protein_details, 0))
        {
            return false;
        }

        // get a list of proteins with unresolved metadata websearches
        var proteinNames = session.CreateCriteria(typeof(DbProteinName)).List<DbProteinName>();
        var proteinsToSearch = proteinNames
            .Where(proteinName => proteinName.GetProteinMetadata().GetPendingSearchTerm().Length > 0)
            .ToList();

        // and a list of proteins which have never been considered for metadata search
        untaggedProteins = proteinNames.Where(proteinName => proteinName.WebSearchInfo.IsEmpty()).ToList();
        foreach (var untaggedProtein in untaggedProteins)
        {
            untaggedProtein.SetWebSearchCompleted(); // by default take this out of consideration for next time
            var metadata = untaggedProtein.GetProteinMetadata();
            if (metadata.HasMissingMetadata())
            {
                var search = fastaImporter.ParseProteinMetaData(metadata);
                if (search != null)
                {
                    metadata = untaggedProtein.ChangeProteinMetadata(metadata.Merge(search)); // don't stomp name by accident
                    metadata = untaggedProtein.ChangeProteinMetadata(metadata.ChangeWebSearchInfo(search.WebSearchInfo));
                }
            }
            if (metadata.NeedsSearch())
            {
                proteinsToSearch.Add(untaggedProtein); // add to the list of things to commit back to the db
            }
        }

        // Get the lengths of the sequences without getting the sequences themselves, for best speed
        var proteinIds = proteinsToSearch.Select(name => name.Protein.Id.Value).Distinct().ToArray();
        var proteinLengths = new Dictionary<long, int>();
        if (proteinIds.Length > 0) // With no ids there is nothing to read, and an empty "IN ()" list is not portable SQL
        {
            using (var cmd = session.Connection.CreateCommand())
            {
                string sql = "SELECT Id, LENGTH(Sequence) AS SequenceLength FROM ProteomeDbProtein P"; // Not L10N
                if (proteinIds.Length < maxIdsInClause)
                {
                    sql += " WHERE P.Id IN (" + // Not L10N
                           string.Join(",", proteinIds) + ")"; // Not L10N
                }
                cmd.CommandText = sql;
                using (var reader = cmd.ExecuteReader())
                {
                    while (reader.Read())
                    {
                        var id = reader.GetValue(0);
                        var len = reader.GetValue(1);
                        proteinLengths.Add(Convert.ToInt64(id), Convert.ToInt32(len));
                    }
                }
            }
        }

        foreach (var p in proteinsToSearch)
        {
            // A protein whose length was not read gets length 0 (TryGetValue leaves the default)
            int length;
            proteinLengths.TryGetValue(p.Protein.Id.GetValueOrDefault(), out length);
            unsearchedProteins.Add(new ProteinSearchInfo(p, length));
        }
    }

    // Write back the ones that were formerly without search terms, but which now indicate no search is possible
    var nowUnsearchable = untaggedProteins
        .Where(untagged => !untagged.GetProteinMetadata().NeedsSearch())
        .ToList();
    if (nowUnsearchable.Count > 0) // did any get set as unsearchable?
    {
        using (ISession session = OpenWriteSession())
        using (var transaction = session.BeginTransaction())
        {
            foreach (var untagged in nowUnsearchable)
            {
                session.SaveOrUpdate(untagged); // update the metadata
            }
            transaction.Commit();
        }
    }

    if (unsearchedProteins.Count > 0)
    {
        int resultsCount = 0;
        int unsearchedCount = unsearchedProteins.Count;
        bool success;
        do
        {
            success = false; // Until we see at least one succeed this round
            var results = new List<DbProteinName>();

            // The "true" arg means "do just one batch then return"
            foreach (var result in fastaImporter.DoWebserviceLookup(unsearchedProteins, null, true))
            {
                if (result == null)
                {
                    continue;
                }
                string message = string.Format(
                    Resources.ProteomeDb_LookupProteinMetadata_Retrieving_details_for__0__proteins,
                    unsearchedProteins.Count);
                if (!progressMonitor.Invoke(message, 100 * resultsCount++ / unsearchedCount))
                {
                    return false;
                }
                success = true;
                results.Add(result.ProteinDbInfo);
            }

            if (results.Count > 0) // save this batch
            {
                // Disposal by the using blocks closes the session; no explicit Close() is needed
                using (var session = OpenWriteSession())
                using (var transaction = session.BeginTransaction())
                {
                    foreach (var result in results)
                    {
                        session.SaveOrUpdate(result);
                    }
                    transaction.Commit();
                }
            }

            // Edit this list rather than rederive with database access:
            // drop everything that no longer needs a search
            unsearchedProteins.RemoveAll(p => !p.GetProteinMetadata().NeedsSearch());
        } while (success); // Stop once a whole round produced no results
    }
    return true;
}