/// <summary>
/// Opens a schema version 0.0 proteome database from the test zip file, checks that
/// non-protdb inputs are rejected, fills in protein metadata, and verifies that adding
/// a FASTA file brings the database up to the current schema version.
/// </summary>
/// <param name="testContext">test context used to unpack the test files</param>
/// <param name="doActualWebAccess">if true the importer is built with a null search provider and the
/// web-only assertions are checked; if false a fake search provider is used</param>
public void DoTestOlderProteomeDb(TestContext testContext, bool doActualWebAccess)
{
    using (var testFilesDir = new TestFilesDir(testContext, ZIP_FILE))
    {
        string tinyFastaPath = testFilesDir.GetTestPath("tiny.fasta");
        string oldProtDbPath = testFilesDir.GetTestPath("celegans_mini.protdb"); // a version 0 protdb file
        string bibliospecPath = testFilesDir.GetTestPath("random.blib"); // a bibliospec file

        // Opening a random (non-database) file as a protdb file should fail
        AssertEx.ThrowsException<DbException>(() => ProteomeDb.OpenProteomeDb(tinyFastaPath));
        // Opening a database file that is not a protdb file should fail too
        AssertEx.ThrowsException<FileLoadException>(() => ProteomeDb.OpenProteomeDb(bibliospecPath));

        using (ProteomeDb proteomeDb = ProteomeDb.OpenProteomeDb(oldProtDbPath))
        {
            // The initial db from our zipfile should be ancient
            Assert.IsTrue(proteomeDb.GetSchemaVersionMajor() == 0);
            Assert.IsTrue(proteomeDb.GetSchemaVersionMinor() == 0);
            Assert.AreEqual(9, proteomeDb.GetProteinCount());

            var originalProtein = proteomeDb.GetProteinByName("Y18D10A.20");
            Assert.IsNotNull(originalProtein);
            Assert.IsTrue(String.IsNullOrEmpty(originalProtein.Accession)); // old db won't have this populated

            // With real web access pass a null provider, otherwise substitute the fake one
            WebEnabledFastaImporter.FakeWebSearchProvider fakeProvider = null;
            if (!doActualWebAccess)
            {
                fakeProvider = new WebEnabledFastaImporter.FakeWebSearchProvider();
            }
            var searcher = new WebEnabledFastaImporter(fakeProvider);

            Assert.IsTrue(proteomeDb.LookupProteinMetadata(Progress, searcher)); // add any missing protein metadata
            var resolvedProtein = proteomeDb.GetProteinByName("Y18D10A.20");
            Assert.IsNotNull(resolvedProtein);
            if (doActualWebAccess)
            {
                // We can actually go to the web for metadata
                Assert.AreEqual("Q9XW16", resolvedProtein.Accession);
            }

            using (var fastaReader = new StreamReader(tinyFastaPath))
            {
                proteomeDb.AddFastaFile(fastaReader, (message, percent) => true);
            }

            // The act of writing should update to the current version
            Assert.AreEqual(ProteomeDb.SCHEMA_VERSION_MAJOR_CURRENT, proteomeDb.GetSchemaVersionMajor());
            Assert.AreEqual(ProteomeDb.SCHEMA_VERSION_MINOR_CURRENT, proteomeDb.GetSchemaVersionMinor());
            Assert.AreEqual(19, proteomeDb.GetProteinCount());

            // Check for properly processed protein metadata
            Assert.IsTrue(proteomeDb.LookupProteinMetadata(Progress, searcher));
            var importedProtein = proteomeDb.GetProteinByName("IPI00000044");
            Assert.IsNotNull(importedProtein);
            Assert.AreEqual("P01127", importedProtein.Accession); // We get this offline with our ipi->uniprot mapper
            if (doActualWebAccess)
            {
                // But this we get only with web access
                Assert.AreEqual("PDGFB_HUMAN", importedProtein.PreferredName);
            }

            // TODO: bspratt fix this - GetDigestion has no notion of a Db that has been added to,
            // doesn't digest the new proteins and returns immediately (issue #304)
            // Enzyme trypsin = EnzymeList.GetDefault();
            //proteomeDb.Digest(new ProteaseImpl(trypsin), (msg, progress) => true);
            //Digestion digestion = proteomeDb.GetDigestion(trypsin.Name);
            //var digestedProteins0 = digestion.GetProteinsWithSequencePrefix("EDGWVK", 100);
            //Assert.IsTrue(digestedProteins0.Count >= 1);
        }
    }
}
/// <summary>
/// Imports the proteins read from a FASTA stream into this database inside a single transaction.
/// A protein whose sequence is already known is not inserted again; only names it does not
/// already have are added to it. Web searches are not performed here, the text is just parsed.
/// </summary>
/// <param name="reader">FASTA text to import; its base stream position is used to report progress</param>
/// <param name="progressMonitor">called with a status message and percent complete; returning false
/// abandons the import before the transaction is committed</param>
public void AddFastaFile(StreamReader reader, ProgressMonitor progressMonitor)
{
    var idsBySequence = new Dictionary<string, ProtIdNames>();
    using (ISession session = OpenWriteSession())
    {
        // Index the proteins already stored, keyed by sequence
        foreach (DbProtein storedProtein in session.CreateCriteria(typeof(DbProtein)).List())
        {
            if (!storedProtein.Id.HasValue)
            {
                continue;
            }
            idsBySequence.Add(storedProtein.Sequence, new ProtIdNames(storedProtein.Id.Value, storedProtein.Names));
        }

        int addedCount = 0;
        using (var transaction = session.BeginTransaction())
        using (IDbCommand insertProtein = session.Connection.CreateCommand())
        using (IDbCommand insertName = session.Connection.CreateCommand())
        {
            // just parse, no search for now
            var fastaImporter = new WebEnabledFastaImporter(new WebEnabledFastaImporter.DelayedWebSearchProvider());

            insertProtein.CommandText =
                "INSERT INTO ProteomeDbProtein (Version, Sequence) Values (1,?);select last_insert_rowid();"; // Not L10N
            insertProtein.Parameters.Add(new SQLiteParameter());

            insertName.CommandText =
                "INSERT INTO ProteomeDbProteinName (Version, Protein, IsPrimary, Name, Description, PreferredName, Accession, Gene, Species, WebSearchStatus) Values(1,?,?,?,?,?,?,?,?,?)"; // Not L10N
            // One placeholder each for: Id, IsPrimary, Name, Description, PreferredName,
            // Accession, Gene, Species, WebSearchInfo
            const int nameParameterCount = 9;
            for (int i = 0; i < nameParameterCount; i++)
            {
                insertName.Parameters.Add(new SQLiteParameter());
            }
            Action<int, object> setNameValue =
                (index, value) => ((SQLiteParameter)insertName.Parameters[index]).Value = value;

            foreach (DbProtein importedProtein in fastaImporter.Import(reader))
            {
                int percentRead = (int)(reader.BaseStream.Position * 100 / (reader.BaseStream.Length + 1));
                string status = string.Format(Resources.ProteomeDb_AddFastaFile_Added__0__proteins, addedCount);
                if (!progressMonitor(status, percentRead))
                {
                    return;
                }

                ProtIdNames idAndNames;
                bool existingProtein = idsBySequence.TryGetValue(importedProtein.Sequence, out idAndNames);
                if (!existingProtein)
                {
                    // New sequence: insert it and remember the row id the database assigned
                    ((SQLiteParameter)insertProtein.Parameters[0]).Value = importedProtein.Sequence;
                    long newProteinId = Convert.ToInt64(insertProtein.ExecuteScalar());
                    idAndNames = new ProtIdNames(newProteinId, new DbProteinName[0]);
                    idsBySequence.Add(importedProtein.Sequence, idAndNames);
                    addedCount++;
                }

                foreach (var importedName in importedProtein.Names)
                {
                    // Skip any names that already exist
                    bool nameAlreadyKnown =
                        idAndNames.Names.Any(knownName => Equals(knownName.Name, importedName.Name));
                    if (nameAlreadyKnown)
                    {
                        continue;
                    }
                    try
                    {
                        setNameValue(0, idAndNames.Id);
                        // A name added to a pre-existing protein is never stored as primary
                        setNameValue(1, importedName.IsPrimary && !existingProtein);
                        setNameValue(2, importedName.Name);
                        setNameValue(3, importedName.Description);
                        setNameValue(4, importedName.PreferredName);
                        setNameValue(5, importedName.Accession);
                        setNameValue(6, importedName.Gene);
                        setNameValue(7, importedName.Species);
                        setNameValue(8, importedName.WebSearchStatus); // represent as a string for ease of serialization
                        insertName.ExecuteNonQuery();
                    }
                    catch (Exception exception)
                    {
                        // A failed name insert is reported on the console and the import carries on
                        Console.Out.WriteLine(exception);
                    }
                }
            }

            if (!progressMonitor(Resources.ProteomeDb_AddFastaFile_Saving_changes, 99))
            {
                return;
            }
            transaction.Commit();
        }

        AnalyzeDb(session);
        progressMonitor(
            string.Format(Resources.ProteomeDb_AddFastaFile_Finished_importing__0__proteins, addedCount), 100);
    }
}
/// <summary>
/// Access the web to resolve protein metadata not directly found in fasta file.
/// The fasta text importer will have left search hints in ProteinMetadata.
/// Proteins that have never been considered for a search are first parsed for search hints;
/// those that turn out not to need a search are written back immediately, the rest are
/// looked up in batches, and each batch of results is saved as it arrives.
/// </summary>
/// <param name="progressMonitor">called with a status message and percent complete; returning false cancels the lookup</param>
/// <param name="fastaImporter">object that accesses the web, or pretends to if in a test</param>
/// <param name="polite">if true, don't try to resolve everything in one go, assume we can come back later.
/// NOTE(review): this flag is not read anywhere in the method body - confirm whether that is intended</param>
/// <returns>true on success, false if the progress monitor asked to cancel</returns>
public bool LookupProteinMetadata(ProgressMonitor progressMonitor, WebEnabledFastaImporter fastaImporter, bool polite = false)
{
    var unsearchedProteins = new List<ProteinSearchInfo>();
    List<DbProteinName> untaggedProteins;
    using (ISession session = OpenSession())
    {
        if (!progressMonitor.Invoke(Resources.ProteomeDb_LookupProteinMetadata_looking_for_unresolved_protein_details, 0))
        {
            return false;
        }

        // get a list of proteins with unresolved metadata websearches
        var proteinNames = session.CreateCriteria(typeof (DbProteinName)).List<DbProteinName>();
        var proteinsToSearch =
            proteinNames.Where(proteinName => (proteinName.GetProteinMetadata().GetPendingSearchTerm().Length > 0))
                .ToList();
        // and a list of proteins which have never been considered for metadata search
        untaggedProteins = proteinNames.Where(proteinName => proteinName.WebSearchInfo.IsEmpty()).ToList();

        foreach (var untaggedProtein in untaggedProteins)
        {
            untaggedProtein.SetWebSearchCompleted(); // by default take this out of consideration for next time
            var metadata = untaggedProtein.GetProteinMetadata();
            if (metadata.HasMissingMetadata())
            {
                var search = fastaImporter.ParseProteinMetaData(metadata);
                if (search != null)
                {
                    metadata = untaggedProtein.ChangeProteinMetadata(metadata.Merge(search)); // don't stomp name by accident
                    metadata = untaggedProtein.ChangeProteinMetadata(metadata.ChangeWebSearchInfo(search.WebSearchInfo));
                }
            }
            if (metadata.NeedsSearch())
            {
                proteinsToSearch.Add(untaggedProtein); // add to the list of things to commit back to the db
            }
        }

        // Get the lengths of the sequences without getting the sequences themselves, for best speed
        var proteinIds = proteinsToSearch.Select(name => name.Protein.Id.Value).Distinct().ToArray();
        var proteinLengths = new Dictionary<long, int>();
        if (proteinIds.Length > 0) // nothing to search means no lengths are needed, so skip the query entirely
        {
            using (var cmd = session.Connection.CreateCommand())
            {
                string sql = "SELECT Id, LENGTH(Sequence) AS SequenceLength FROM ProteomeDbProtein P"; // Not L10N
                // Short id lists restrict the query; otherwise lengths are read for every protein
                if (proteinIds.Length < 1000)
                {
                    sql += " WHERE P.Id IN (" + // Not L10N
                           string.Join(",", proteinIds) + ")"; // Not L10N
                }
                cmd.CommandText = sql;
                using (var reader = cmd.ExecuteReader())
                {
                    while (reader.Read())
                    {
                        var id = reader.GetValue(0);
                        var len = reader.GetValue(1);
                        proteinLengths.Add(Convert.ToInt64(id), Convert.ToInt32(len));
                    }
                }
            }
        }
        foreach (var p in proteinsToSearch)
        {
            int length;
            // length stays 0 when no row was found for this protein id
            proteinLengths.TryGetValue(p.Protein.Id.GetValueOrDefault(), out length);
            unsearchedProteins.Add(new ProteinSearchInfo(p, length));
        }
    }

    // Write back the ones that were formerly without search terms, but which now indicate no search is possible
    var unsearchable = untaggedProteins.Where(untagged => !untagged.GetProteinMetadata().NeedsSearch()).ToList();
    if (unsearchable.Any()) // did any get set as unsearchable?
    {
        using (ISession session = OpenWriteSession())
        using (var transaction = session.BeginTransaction())
        {
            foreach (var untagged in unsearchable)
            {
                session.SaveOrUpdate(untagged); // update the metadata
            }
            transaction.Commit();
        }
    }

    if (unsearchedProteins.Any())
    {
        int resultsCount = 0;
        int unsearchedCount = unsearchedProteins.Count;
        bool success;
        do
        {
            success = false; // Until we see at least one succeed this round
            var results = new List<DbProteinName>();
            // The "true" arg means "do just one batch then return"
            foreach (var result in fastaImporter.DoWebserviceLookup(unsearchedProteins, null, true))
            {
                if (result == null)
                {
                    continue;
                }
                if (!progressMonitor.Invoke(
                        string.Format(
                            Resources.ProteomeDb_LookupProteinMetadata_Retrieving_details_for__0__proteins,
                            unsearchedProteins.Count), 100 * resultsCount++ / unsearchedCount))
                {
                    return false;
                }
                success = true;
                results.Add(result.ProteinDbInfo);
            }
            if (results.Any()) // save this batch
            {
                // Disposing the session closes it, so no explicit Close() call is needed
                using (var session = OpenWriteSession())
                using (var transaction = session.BeginTransaction())
                {
                    foreach (var result in results)
                    {
                        session.SaveOrUpdate(result);
                    }
                    transaction.Commit();
                }
            }
            // Edit this list rather than rederive with database access:
            // drop everything that no longer needs a search, in a single pass
            unsearchedProteins.RemoveAll(p => !p.GetProteinMetadata().NeedsSearch());
        } while (success);
    }
    return true;
}
/// <summary>
/// Reads every file in FastaFilePaths, groups the names and descriptions found there by
/// protein sequence, hands the result to UpdatePeptides and then asks the form to close.
/// Stops early if UpdateProgress returns false. Whatever happens, _running is cleared and
/// any threads waiting on this object's monitor are woken.
/// </summary>
private void DoWork()
{
    try
    {
        // just do the basic name+description parsing, no regex or web access - we don't use extended metadata here
        var fastaImporter = new WebEnabledFastaImporter(new WebEnabledFastaImporter.FakeWebSearchProvider());
        var proteins = new Dictionary<string, ProteinData>();
        foreach (var path in FastaFilePaths)
        {
            var statusText = "Reading FASTA file " + Path.GetFileName(path);
            var fileInfo = new FileInfo(path);
            // Import from the same reader whose stream position drives the progress value,
            // and make sure the file handle is released when this file is done
            using (var reader = File.OpenText(path))
            {
                foreach (var protein in fastaImporter.Import(reader))
                {
                    if (!UpdateProgress(statusText, (int) (reader.BaseStream.Position * 100 / fileInfo.Length)))
                    {
                        return;
                    }
                    ProteinData proteinData;
                    if (!proteins.TryGetValue(protein.Sequence, out proteinData))
                    {
                        proteinData = new ProteinData(protein.Sequence);
                        proteins.Add(protein.Sequence, proteinData);
                    }
                    foreach (var name in protein.Names)
                    {
                        proteinData.AddName(name.Name, name.Description);
                    }
                }
            }
        }
        UpdatePeptides(proteins);
        if (!IsDisposed)
        {
            BeginInvoke(new Action(Close));
        }
    }
    finally
    {
        lock (this)
        {
            _running = false;
            Monitor.PulseAll(this);
        }
    }
}