/// <summary>
/// Example/benchmark: digests every protein in the bundled yeast FASTA file, picks a
/// reproducible pseudo-random subset of the resulting peptides to act as "identified"
/// sequences, groups the proteins against that subset, and prints counts and timings.
/// </summary>
/// <param name="protease">Protease used for the digest and passed to the grouping call.</param>
/// <param name="percentIdentified">Fraction of all digested peptides to treat as identified.</param>
/// <param name="maxMissed">Maximum missed cleavages, passed to both digestion and grouping.</param>
/// <param name="minLength">Minimum peptide length passed to the digest.</param>
/// <param name="maxLength">Maximum peptide length passed to the digest.</param>
public static void ExampleProteinGrouping(IProtease protease, double percentIdentified = 0.01, int maxMissed = 3, int minLength = 5, int maxLength = 50)
{
    Stopwatch timer = new Stopwatch();
    timer.Start();

    // Pre-size the lists so they do not have to grow repeatedly while loading.
    List<Peptide> allPeptides = new List<Peptide>(1000000);
    List<Protein> allProteins = new List<Protein>(7000);

    using (FastaReader fasta = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein entry in fasta.ReadNextProtein())
        {
            allPeptides.AddRange(entry.Digest(protease, maxMissed, minLength, maxLength));
            allProteins.Add(entry);
        }
    }

    Console.WriteLine("Loaded {0:N0} peptides from {1:N0} proteins in {2} ms", allPeptides.Count, allProteins.Count, timer.ElapsedMilliseconds);

    // From here on only the sampling and grouping phase is timed.
    timer.Restart();

    // Fixed seed, so the same peptides are picked on every run.
    Random shuffler = new Random(480912341);

    // Shuffle, then keep the first x % as the "identified" peptides.
    int sampleSize = (int)(allPeptides.Count * percentIdentified);
    List<Peptide> identifiedPeptides = allPeptides
        .OrderBy(p => shuffler.Next())
        .Take(sampleSize)
        .ToList();

    List<ProteinGroup> proteinGroups = ProteinGroup
        .GroupProteins(allProteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed)
        .ToList();

    timer.Stop();

    Console.WriteLine("{0:N0} proteins produced {1:N0} protein groups from {2:N0} identified sequences", allProteins.Count, proteinGroups.Count, identifiedPeptides.Count);
    Console.WriteLine();
    Console.WriteLine("Time elapsed: {0} ms", timer.ElapsedMilliseconds);
}
/// <summary>
/// Protein-grouping benchmark: digests the bundled yeast FASTA file, samples a
/// reproducible pseudo-random fraction of the peptides as "identified", groups the
/// proteins against that sample, and prints counts, total elapsed time and working set.
/// </summary>
/// <param name="protease">Protease used for the digest and passed to the grouping call.</param>
/// <param name="percentIdentified">Fraction of all digested peptides to treat as identified.</param>
/// <param name="maxMissed">Maximum missed cleavages, passed to both digestion and grouping.</param>
/// <param name="minLength">Minimum peptide length passed to the digest.</param>
/// <param name="maxLength">Maximum peptide length passed to the digest.</param>
public static void Start(IProtease protease, double percentIdentified = 0.05, int maxMissed = 3, int minLength = 5, int maxLength = 35)
{
    Console.WriteLine("**Start Protein Grouping**");

    // The timer is never restarted, so the final figure covers loading and grouping.
    Stopwatch timer = new Stopwatch();
    timer.Start();

    List<Peptide> allPeptides = new List<Peptide>();
    List<Protein> allProteins = new List<Protein>();

    using (FastaReader fasta = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein entry in fasta.ReadNextProtein())
        {
            foreach (Peptide digested in entry.Digest(protease, maxMissed, minLength, maxLength))
            {
                allPeptides.Add(digested);
            }

            allProteins.Add(entry);
        }
    }

    Console.WriteLine("Loaded {0:N0} peptides from {1:N0} proteins in {2} ms", allPeptides.Count, allProteins.Count, timer.ElapsedMilliseconds);

    // Fixed seed to make it reproducible
    Random shuffler = new Random(480912341);

    // Shuffle, then keep the first x % as the "identified" peptides.
    int sampleSize = (int)(allPeptides.Count * percentIdentified);
    List<Peptide> identifiedPeptides = allPeptides
        .OrderBy(p => shuffler.Next())
        .Take(sampleSize)
        .ToList();

    List<ProteinGroup> proteinGroups = ProteinGroup
        .GroupProteins(allProteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed)
        .ToList();

    timer.Stop();

    Console.WriteLine("{0:N0} proteins produced {1:N0} protein groups from {2:N0} identified sequences", allProteins.Count, proteinGroups.Count, identifiedPeptides.Count);
    Console.WriteLine("Time elapsed: {0}", timer.Elapsed);

    long workingSetMegabytes = System.Environment.WorkingSet / (1024 * 1024);
    Console.WriteLine("Memory used: {0:N0} MB", workingSetMegabytes);
    Console.WriteLine("**END Protein Grouping**");
}
/// <summary>
/// Groups the proteins read from a FASTA file against a set of observed sequences,
/// using a single protease.
/// </summary>
/// <param name="fastaFile">Path of the FASTA file to read proteins from.</param>
/// <param name="protease">Protease to group with; wrapped in a one-element array for the multi-protease overload.</param>
/// <param name="observeredSequences">Observed sequences, passed through to the multi-protease overload.</param>
/// <param name="peptideComparer">Equality comparer for sequences, passed through to the multi-protease overload.</param>
/// <param name="MaxMissedCleavages">Maximum missed cleavages, passed through to the multi-protease overload.</param>
/// <returns>The protein groups, fully computed before this method returns.</returns>
public static IEnumerable<ProteinGroup> GroupProteins(string fastaFile, IProtease protease, IEnumerable<IAminoAcidSequence> observeredSequences, IEqualityComparer<IAminoAcidSequence> peptideComparer, int MaxMissedCleavages = 3)
{
    using (FastaReader fasta = new FastaReader(fastaFile))
    {
        // Materialize the result while the reader is still open. Returning the sequence
        // directly would let a deferred enumeration run after the using block has
        // disposed the reader, i.e. read proteins from a closed file.
        return new List<ProteinGroup>(
            GroupProteins(fasta.ReadNextProtein(), new[] { protease }, observeredSequences, peptideComparer, MaxMissedCleavages));
    }
}
/// <summary>
/// Morpheus benchmark: digests the bundled yeast FASTA file into a set of peptide
/// candidates, loads them into a Morpheus search engine, and prints the elapsed time
/// and the process working set.
/// </summary>
/// <param name="protease">Protease used for the digest.</param>
/// <param name="maxMissed">Maximum missed cleavages passed to the digest.</param>
/// <param name="minLength">Minimum peptide length passed to the digest.</param>
/// <param name="maxLength">Maximum peptide length passed to the digest.</param>
public static void Start(IProtease protease, int maxMissed = 3, int minLength = 5, int maxLength = 35)
{
    Console.WriteLine("**Start Morpheus Search**");
    Stopwatch watch = new Stopwatch();
    watch.Start();

    // Generate peptide candidates. A set is used, so peptides that compare equal are kept once.
    HashSet<Peptide> peptides = new HashSet<Peptide>();
    using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein protein in reader.ReadNextProtein())
        {
            foreach (Peptide peptide in protein.Digest(protease, maxMissed, minLength, maxLength))
            {
                peptides.Add(peptide);
            }
        }
    }

    MSSearchEngine engine = new MorpheusSearchEngine();
    engine.PrecursorMassTolerance = Tolerance.FromPPM(100);
    engine.ProductMassTolerance = Tolerance.FromPPM(10);
    engine.LoadPeptides(peptides);

    watch.Stop();
    Console.WriteLine("Time elapsed: {0}", watch.Elapsed);
    Console.WriteLine("Memory used: {0:N0} MB", Environment.WorkingSet / (1024 * 1024));
    Console.WriteLine("**End Morpheus Search**");
}
/// <summary>
/// Morpheus benchmark: digests the bundled yeast FASTA file into a set of peptide
/// candidates, loads them into a Morpheus search engine, and prints the elapsed time
/// and the process working set.
/// </summary>
/// <param name="protease">Protease used for the digest.</param>
/// <param name="maxMissed">Maximum missed cleavages passed to the digest.</param>
/// <param name="minLength">Minimum peptide length passed to the digest.</param>
/// <param name="maxLength">Maximum peptide length passed to the digest.</param>
public static void Start(IProtease protease, int maxMissed = 3, int minLength = 5, int maxLength = 35)
{
    Console.WriteLine("**Start Morpheus Search**");
    Stopwatch watch = new Stopwatch();
    watch.Start();

    // Generate peptide candidates. A set is used, so peptides that compare equal are kept once.
    HashSet<Peptide> peptides = new HashSet<Peptide>();
    using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein protein in reader.ReadNextProtein())
        {
            foreach (Peptide peptide in protein.Digest(protease, maxMissed, minLength, maxLength))
            {
                peptides.Add(peptide);
            }
        }
    }

    MSSearchEngine engine = new MorpheusSearchEngine();
    engine.PrecursorMassTolerance = Tolerance.FromPPM(100);
    engine.ProductMassTolerance = Tolerance.FromPPM(10);
    engine.LoadPeptides(peptides);

    watch.Stop();
    Console.WriteLine("Time elapsed: {0}", watch.Elapsed);
    Console.WriteLine("Memory used: {0:N0} MB", Environment.WorkingSet / (1024 * 1024));
    Console.WriteLine("**End Morpheus Search**");
}
/// <summary>
/// Digestion benchmark: digests every protein in the bundled yeast FASTA file,
/// computes an m/z value for each peptide, and prints counts, elapsed time and the
/// process working set.
/// </summary>
/// <param name="protease">Protease used for the digest.</param>
/// <param name="maxMissed">Maximum missed cleavages passed to the digest; also echoed in the summary line.</param>
/// <param name="minLength">Minimum peptide length passed to the digest.</param>
/// <param name="maxLength">Maximum peptide length passed to the digest.</param>
/// <param name="storeSequenceString">Value assigned to the static <c>AminoAcidPolymer.StoreSequenceString</c> before digesting.</param>
public static void Start(IProtease protease, int maxMissed = 1, int minLength = 0, int maxLength = int.MaxValue, bool storeSequenceString = true)
{
    Console.WriteLine("**Start Digestion**");

    Stopwatch timer = new Stopwatch();
    timer.Start();

    List<Peptide> digestedPeptides = new List<Peptide>();
    List<Protein> sourceProteins = new List<Protein>();
    List<double> mzValues = new List<double>();

    // Static setting on AminoAcidPolymer: assigned before any protein is read.
    AminoAcidPolymer.StoreSequenceString = storeSequenceString;

    using (FastaReader fasta = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein entry in fasta.ReadNextProtein())
        {
            foreach (Peptide digested in entry.Digest(protease, maxMissed, minLength, maxLength))
            {
                digestedPeptides.Add(digested);

                // forces the calculation of the mass and thus chemical formula
                double mz = digested.ToMz(2);
                mzValues.Add(mz);
            }

            sourceProteins.Add(entry);
        }
    }

    timer.Stop();

    Console.WriteLine("{0:N0} proteins produced {1:N0} peptides using {2:N0} missed cleavages", sourceProteins.Count, digestedPeptides.Count, maxMissed);
    Console.WriteLine("Time elapsed: {0}", timer.Elapsed);

    long workingSetMegabytes = Environment.WorkingSet / (1024 * 1024);
    Console.WriteLine("Memory used: {0:N0} MB", workingSetMegabytes);
    Console.WriteLine("**End Digestion**");
}
/// <summary>
/// Runs protein grouping repeatedly while ramping the identified fraction from 0 up
/// to (at most) 1 in increments of <paramref name="percentIdentifiedSteps"/>, printing
/// the number of peptides and protein groups at each step.
/// </summary>
/// <param name="protease">Protease used for the digest and passed to the grouping call.</param>
/// <param name="percentIdentifiedSteps">Increment of the identified fraction per step; must be greater than zero.</param>
/// <param name="maxMissed">Maximum missed cleavages, passed to both digestion and grouping.</param>
/// <param name="minLength">Minimum peptide length passed to the digest.</param>
/// <param name="maxLength">Maximum peptide length passed to the digest.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="percentIdentifiedSteps"/> is zero, negative or NaN. Such a step could
/// never advance the ramp towards 1.
/// </exception>
public static void StartRamp(IProtease protease, double percentIdentifiedSteps = 0.05, int maxMissed = 3, int minLength = 5, int maxLength = 35)
{
    // Written as !(x > 0) so that NaN is rejected as well.
    if (!(percentIdentifiedSteps > 0))
    {
        throw new ArgumentOutOfRangeException("percentIdentifiedSteps", percentIdentifiedSteps, "The step size must be greater than zero.");
    }

    List<Peptide> peps = new List<Peptide>();
    List<Protein> proteins = new List<Protein>();

    using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein protein in reader.ReadNextProtein())
        {
            foreach (Peptide peptide in protein.Digest(protease, maxMissed, minLength, maxLength))
            {
                peps.Add(peptide);
            }

            proteins.Add(protein);
        }
    }

    // Fixed seed to make it reproducible
    Random random = new Random(480912341);
    peps = peps.OrderBy(x => random.Next()).ToList();

    // Drive the ramp with an integer step index instead of repeatedly adding a double:
    // accumulated rounding error can otherwise leave the counter a hair above or below
    // 1.0, which drops or distorts the final step. The small tolerance keeps quotients
    // that land just under a whole number (e.g. 1 / 0.05) from losing their last step.
    long stepCount = (long)Math.Min(Math.Floor(1.0 / percentIdentifiedSteps + 1e-9), int.MaxValue);

    for (long step = 0; step <= stepCount; step++)
    {
        double percentIdentified = Math.Min(1.0, step * percentIdentifiedSteps);

        // Take the first x % to act as our identified peptides
        List<Peptide> identifiedPeptides = peps.Take((int)(peps.Count * percentIdentified)).ToList();

        List<ProteinGroup> proteinGroups = ProteinGroup.GroupProteins(proteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed).ToList();

        Console.WriteLine("{0} peptides {1} protein groups", identifiedPeptides.Count, proteinGroups.Count);
    }
}
/// <summary>
/// Digestion benchmark: digests every protein in the bundled yeast FASTA file,
/// computes an m/z value for each peptide, and prints counts, elapsed time and the
/// process working set.
/// </summary>
/// <param name="protease">Protease used for the digest.</param>
/// <param name="maxMissed">Maximum missed cleavages passed to the digest; also echoed in the summary line.</param>
/// <param name="minLength">Minimum peptide length passed to the digest.</param>
/// <param name="maxLength">Maximum peptide length passed to the digest.</param>
/// <param name="storeSequenceString">Value assigned to the static <c>AminoAcidPolymer.StoreSequenceString</c> before digesting.</param>
public static void Start(IProtease protease, int maxMissed = 1, int minLength = 0, int maxLength = int.MaxValue, bool storeSequenceString = true)
{
    Console.WriteLine("**Start Digestion**");

    Stopwatch timer = new Stopwatch();
    timer.Start();

    List<Peptide> digestedPeptides = new List<Peptide>();
    List<Protein> sourceProteins = new List<Protein>();
    List<double> mzValues = new List<double>();

    // Static setting on AminoAcidPolymer: assigned before any protein is read.
    AminoAcidPolymer.StoreSequenceString = storeSequenceString;

    using (FastaReader fasta = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein entry in fasta.ReadNextProtein())
        {
            foreach (Peptide digested in entry.Digest(protease, maxMissed, minLength, maxLength))
            {
                digestedPeptides.Add(digested);

                // forces the calculation of the mass and thus chemical formula
                double mz = digested.ToMz(2);
                mzValues.Add(mz);
            }

            sourceProteins.Add(entry);
        }
    }

    timer.Stop();

    Console.WriteLine("{0:N0} proteins produced {1:N0} peptides using {2:N0} missed cleavages", sourceProteins.Count, digestedPeptides.Count, maxMissed);
    Console.WriteLine("Time elapsed: {0}", timer.Elapsed);

    long workingSetMegabytes = System.Environment.WorkingSet / (1024 * 1024);
    Console.WriteLine("Memory used: {0:N0} MB", workingSetMegabytes);
    Console.WriteLine("**End Digestion**");
}
/// <summary>
/// Example: digests every protein in the bundled yeast FASTA file with trypsin,
/// collects the monoisotopic mass of each peptide, and prints the elapsed time.
/// </summary>
public static void ExampleDigestion()
{
    const string fastaFilePath = "Resources/yeast_uniprot_120226.fasta";
    const int maxMissedCleavages = 3;
    const int minPeptideLength = 5;
    const int maxPeptideLength = 50;

    // Looked up before the timer starts, so protease lookup is not part of the measurement.
    IProtease trypsin = Protease.GetProtease("Trypsin");

    List<double> peptideMasses = new List<double>();

    Stopwatch timer = new Stopwatch();
    timer.Start();

    using (FastaReader fasta = new FastaReader(fastaFilePath))
    {
        foreach (Protein entry in fasta.ReadNextProtein())
        {
            foreach (Peptide digested in entry.Digest(trypsin, maxMissedCleavages, minPeptideLength, maxPeptideLength))
            {
                peptideMasses.Add(digested.MonoisotopicMass);
            }
        }
    }

    // Optional report, currently disabled:
    //Console.WriteLine("Average Peptide Mass = {0:F4}", peptideMasses.Average());

    timer.Stop();
    Console.WriteLine("Time elapsed: {0}", timer.Elapsed);
}
/// <summary>
/// Placeholder for a protein-grouping benchmark. The method currently does nothing:
/// its former implementation was disabled, so every argument is ignored.
/// </summary>
/// <param name="protease">Unused.</param>
/// <param name="maxMissed">Unused.</param>
/// <param name="minLength">Unused.</param>
/// <param name="maxLength">Unused.</param>
public static void Start(IProtease protease, int maxMissed = 3, int minLength = 5, int maxLength = 35)
{
    // Intentionally empty. The disabled implementation digested the yeast FASTA file,
    // registered the proteins through ProteinGroup.SetProteins and built one group per
    // peptide through ProteinGroup.Group. Recover it from version control if it is
    // needed again.
}
/// <summary>
/// Example/benchmark: digests every protein in the bundled yeast FASTA file, picks a
/// reproducible pseudo-random subset of the resulting peptides to act as "identified"
/// sequences, groups the proteins against that subset, and prints counts and timings.
/// </summary>
/// <param name="protease">Protease used for the digest and passed to the grouping call.</param>
/// <param name="percentIdentified">Fraction of all digested peptides to treat as identified.</param>
/// <param name="maxMissed">Maximum missed cleavages, passed to both digestion and grouping.</param>
/// <param name="minLength">Minimum peptide length passed to the digest.</param>
/// <param name="maxLength">Maximum peptide length passed to the digest.</param>
public static void ExampleProteinGrouping(IProtease protease, double percentIdentified = 0.01, int maxMissed = 3, int minLength = 5, int maxLength = 50)
{
    Stopwatch timer = new Stopwatch();
    timer.Start();

    // Pre-size the lists so they do not have to grow repeatedly while loading.
    List<Peptide> allPeptides = new List<Peptide>(1000000);
    List<Protein> allProteins = new List<Protein>(7000);

    using (FastaReader fasta = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein entry in fasta.ReadNextProtein())
        {
            allPeptides.AddRange(entry.Digest(protease, maxMissed, minLength, maxLength));
            allProteins.Add(entry);
        }
    }

    Console.WriteLine("Loaded {0:N0} peptides from {1:N0} proteins in {2} ms", allPeptides.Count, allProteins.Count, timer.ElapsedMilliseconds);

    // From here on only the sampling and grouping phase is timed.
    timer.Restart();

    // Fixed seed, so the same peptides are picked on every run.
    Random shuffler = new Random(480912341);

    // Shuffle, then keep the first x % as the "identified" peptides.
    int sampleSize = (int)(allPeptides.Count * percentIdentified);
    List<Peptide> identifiedPeptides = allPeptides
        .OrderBy(p => shuffler.Next())
        .Take(sampleSize)
        .ToList();

    List<ProteinGroup> proteinGroups = ProteinGroup
        .GroupProteins(allProteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed)
        .ToList();

    timer.Stop();

    Console.WriteLine("{0:N0} proteins produced {1:N0} protein groups from {2:N0} identified sequences", allProteins.Count, proteinGroups.Count, identifiedPeptides.Count);
    Console.WriteLine();
    Console.WriteLine("Time elapsed: {0} ms", timer.ElapsedMilliseconds);
}
/// <summary>
/// Click handler for the digest button: digests the selected organism with the
/// selected enzyme on a background task.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnDigest_Click(object sender, EventArgs e)
{
    var selectedEnzyme = lbxDigestion.SelectedItem;
    var selectedOrganism = lbxOrganisms.SelectedItem;

    // Nothing to do unless both lists have a selection; calling ToString() on a
    // missing selection would throw a NullReferenceException.
    // NOTE(review): assumes SelectedItem is null when nothing is selected (as for a
    // WinForms ListBox) - confirm the control type.
    if (selectedEnzyme == null || selectedOrganism == null)
    {
        return;
    }

    String enzymeName = selectedEnzyme.ToString();
    IProtease protease = Enzymes.AllEnzymes()[enzymeName];
    String organism = selectedOrganism.ToString();

    // The enzyme name doubles as the digestion name; no description is supplied.
    RunBackground(() => proteomeDb.GetOrganism(organism).Digest(protease, enzymeName, null, UpdateProgress));
}
/// <summary>
/// Sets <c>NumProteaseTermini</c> on the spectrum to the largest protease-termini
/// count found among its peptides.
/// </summary>
/// <param name="protease">Protease whose <c>GetProteaseTerminiCount</c> is evaluated for each peptide.</param>
/// <param name="s">Spectrum whose peptides are examined and whose <c>NumProteaseTermini</c> is assigned.</param>
/// <remarks>
/// The first and last characters of each peptide's <c>Sequence</c> are passed as the
/// flanking residues, with <c>'-'</c> as the terminal character. As with the LINQ
/// <c>Max</c> operator in general, an empty peptide collection makes this throw.
/// </remarks>
public static void AssignNumProteaseTermini(this IProtease protease, IIdentifiedSpectrum s)
{
    // Only the maximum is needed, so there is no point in de-duplicating the counts first.
    s.NumProteaseTermini = s.Peptides
        .Select(p =>
        {
            var beforeChar = p.Sequence[0];
            var afterChar = p.Sequence[p.Sequence.Length - 1];
            return protease.GetProteaseTerminiCount(beforeChar, p.PureSequence, afterChar, '-');
        })
        .Max();
}
/// <summary>
/// Creates the settings for a protein match and, when a protease is supplied, looks up
/// the digestion stored under that protease's name in the proteome database.
/// </summary>
/// <param name="proteomeDbPath">Path object used to open the proteome database.</param>
/// <param name="protease">Protease to match with; may be null, in which case no digestion is looked up.</param>
/// <param name="proteinMatchTypes">Match types to store in <c>MatchTypes</c>.</param>
/// <param name="searchText">Search text to store in <c>SearchText</c>.</param>
public ProteinMatchSettings(ProteomeDbPath proteomeDbPath, IProtease protease, ProteinMatchType proteinMatchTypes, String searchText)
{
    ProteomeDbPath = proteomeDbPath;
    Protease = protease;

    // Without a protease, Digestion is simply never assigned here.
    if (protease != null)
    {
        // The database is only held open for the duration of the lookup.
        using (var database = proteomeDbPath.OpenProteomeDb())
        {
            Digestion = database.GetDigestion(protease.Name);
        }
    }

    MatchTypes = proteinMatchTypes;
    SearchText = searchText;
}
/// <summary>
/// Counts how many ends of a peptide (0, 1 or 2) count as protease termini.
/// </summary>
/// <param name="protease">Protease whose <c>IsCleavageSite</c> decides each end.</param>
/// <param name="beforeChar">Residue immediately before the peptide.</param>
/// <param name="pureSeq">Peptide sequence; its first and last characters are inspected, so it must not be empty.</param>
/// <param name="afterChar">Residue immediately after the peptide.</param>
/// <param name="terminalChar">Terminal marker character passed through to <c>IsCleavageSite</c>.</param>
/// <param name="positionInProtein">Position of the peptide in its protein.</param>
/// <returns>The number of peptide ends that count as protease termini.</returns>
public static int GetNumProteaseTermini(this IProtease protease, char beforeChar, string pureSeq, char afterChar, char terminalChar, int positionInProtein)
{
    // Leading end: a preceding 'M' with the peptide at position 2 always counts
    // (presumably a removed initiator methionine - confirm); otherwise the protease
    // decides. The cleavage check is only evaluated when the special case does not apply.
    bool leadingEndCounts =
        (beforeChar == 'M' && positionInProtein == 2)
        || protease.IsCleavageSite(beforeChar, pureSeq[0], terminalChar);

    // Trailing end: decided by the protease alone.
    bool trailingEndCounts = protease.IsCleavageSite(pureSeq[pureSeq.Length - 1], afterChar, terminalChar);

    return (leadingEndCounts ? 1 : 0) + (trailingEndCounts ? 1 : 0);
}
/// <summary>
/// Morpheus benchmark: digests the bundled yeast FASTA file into a set of peptide
/// candidates, loads them into a Morpheus search engine, opens the bundled Thermo raw
/// file, and prints the elapsed time and the process working set. The actual spectrum
/// search is currently disabled.
/// </summary>
/// <param name="protease">Protease used for the digest.</param>
/// <param name="maxMissed">Maximum missed cleavages passed to the digest.</param>
/// <param name="minLength">Minimum peptide length passed to the digest.</param>
/// <param name="maxLength">Maximum peptide length passed to the digest.</param>
public static void Start(IProtease protease, int maxMissed = 3, int minLength = 5, int maxLength = 35)
{
    Console.WriteLine("**Start Morpheus Search**");
    Stopwatch watch = new Stopwatch();
    watch.Start();

    // Generate peptide candidates. A set is used, so peptides that compare equal are kept once.
    HashSet<Peptide> peptides = new HashSet<Peptide>();
    using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein protein in reader.ReadNextProtein())
        {
            foreach (Peptide peptide in protein.Digest(protease, maxMissed, minLength, maxLength))
            {
                peptides.Add(peptide);
            }
        }
    }

    MSSearchEngine engine = new MorpheusSearchEngine();
    engine.PrecursorMassTolerance = Tolerance.FromPPM(100);
    engine.ProductMassTolerance = Tolerance.FromPPM(10);
    engine.LoadPeptides(peptides);

    // The raw file is still constructed and disposed, but the search itself is disabled.
    using (MSDataFile msDataFile = new ThermoRawFile("Resources/ThermoRawFileMS1MS2.raw"))
    {
        //SortedMaxSizedContainer<PeptideSpectralMatch> psms = engine.Search(msDataFile.Where(scan => scan.MsnOrder > 1));

        //foreach (MSDataScan scan in msDataFile.Where(scan => scan.MsnOrder > 1))
        //{
        //    List<PeptideSpectralMatch> psms = engine.Search(scan);
        //    Console.WriteLine("{0} {1}", scan.SpectrumNumber, psms.Count);
        //}
    }

    watch.Stop();
    Console.WriteLine("Time elapsed: {0}", watch.Elapsed);
    Console.WriteLine("Memory used: {0:N0} MB", System.Environment.WorkingSet / (1024 * 1024));
    Console.WriteLine("**End Morpheus Search**");
}
/// <summary>
/// Protein-grouping benchmark: digests the bundled yeast FASTA file, samples a
/// reproducible pseudo-random fraction of the peptides as "identified", groups the
/// proteins against that sample, and prints counts, total elapsed time and working set.
/// </summary>
/// <param name="protease">Protease used for the digest and passed to the grouping call.</param>
/// <param name="percentIdentified">Fraction of all digested peptides to treat as identified.</param>
/// <param name="maxMissed">Maximum missed cleavages, passed to both digestion and grouping.</param>
/// <param name="minLength">Minimum peptide length passed to the digest.</param>
/// <param name="maxLength">Maximum peptide length passed to the digest.</param>
public static void Start(IProtease protease, double percentIdentified = 0.05, int maxMissed = 3, int minLength = 5, int maxLength = 35)
{
    Console.WriteLine("**Start Protein Grouping**");

    // The timer is never restarted, so the final figure covers loading and grouping.
    Stopwatch timer = new Stopwatch();
    timer.Start();

    List<Peptide> allPeptides = new List<Peptide>();
    List<Protein> allProteins = new List<Protein>();

    using (FastaReader fasta = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein entry in fasta.ReadNextProtein())
        {
            foreach (Peptide digested in entry.Digest(protease, maxMissed, minLength, maxLength))
            {
                allPeptides.Add(digested);
            }

            allProteins.Add(entry);
        }
    }

    Console.WriteLine("Loaded {0:N0} peptides from {1:N0} proteins in {2} ms", allPeptides.Count, allProteins.Count, timer.ElapsedMilliseconds);

    // Fixed seed to make it reproducible
    Random shuffler = new Random(480912341);

    // Shuffle, then keep the first x % as the "identified" peptides.
    int sampleSize = (int)(allPeptides.Count * percentIdentified);
    List<Peptide> identifiedPeptides = allPeptides
        .OrderBy(p => shuffler.Next())
        .Take(sampleSize)
        .ToList();

    List<ProteinGroup> proteinGroups = ProteinGroup
        .GroupProteins(allProteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed)
        .ToList();

    timer.Stop();

    Console.WriteLine("{0:N0} proteins produced {1:N0} protein groups from {2:N0} identified sequences", allProteins.Count, proteinGroups.Count, identifiedPeptides.Count);
    Console.WriteLine("Time elapsed: {0}", timer.Elapsed);

    long workingSetMegabytes = System.Environment.WorkingSet / (1024 * 1024);
    Console.WriteLine("Memory used: {0:N0} MB", workingSetMegabytes);
    Console.WriteLine("**END Protein Grouping**");
}
/// <summary>
/// Single-protease convenience overload: wraps <paramref name="protease"/> in a
/// one-element array and delegates to the multi-protease overload.
/// </summary>
/// <param name="proteins">Proteins to group.</param>
/// <param name="protease">The single protease to group with.</param>
/// <param name="observeredSequences">Observed sequences, passed through unchanged.</param>
/// <param name="peptideComparer">Equality comparer for sequences, passed through unchanged.</param>
/// <param name="MaxMissedCleavages">Maximum missed cleavages, passed through unchanged.</param>
/// <param name="minPepPerProtein">
/// Not used by this overload: the value is not passed on to the delegated call.
/// NOTE(review): confirm whether the multi-protease overload accepts this value and
/// should receive it.
/// </param>
/// <returns>Whatever the multi-protease overload returns.</returns>
public static IEnumerable<ProteinGroup> GroupProteins(IEnumerable<Protein> proteins, IProtease protease, IEnumerable<IAminoAcidSequence> observeredSequences, IEqualityComparer<IAminoAcidSequence> peptideComparer, int MaxMissedCleavages = 3, int minPepPerProtein = 1)
{
    IProtease[] singleProtease = { protease };
    return GroupProteins(proteins, singleProtease, observeredSequences, peptideComparer, MaxMissedCleavages);
}
/// <summary>
/// Digests this protein into peptides with a single protease by delegating to the
/// multi-protease overload.
/// </summary>
/// <param name="protease">The protease to digest with</param>
/// <param name="maxMissedCleavages">The max number of missed cleavages generated, 0 means no missed cleavages</param>
/// <param name="minLength">The minimum length (in amino acids) of the peptide</param>
/// <param name="maxLength">The maximum length (in amino acids) of the peptide</param>
/// <param name="initiatorMethonine">Passed through unchanged to the multi-protease overload</param>
/// <param name="includeModifications">Passed through unchanged to the multi-protease overload</param>
/// <param name="semiDigestion">Passed through unchanged to the multi-protease overload</param>
/// <returns>The digested peptides produced by the multi-protease overload</returns>
public virtual IEnumerable<Peptide> Digest(IProtease protease, int maxMissedCleavages = 3, int minLength = 1, int maxLength = int.MaxValue, bool initiatorMethonine = true, bool includeModifications = false, bool semiDigestion = false)
{
    IProtease[] singleProtease = { protease };
    return Digest(singleProtease, maxMissedCleavages, minLength, maxLength, initiatorMethonine, includeModifications, semiDigestion);
}
/// <summary>
/// Digests the base sequence of an amino acid polymer with a single protease by
/// delegating to the sequence-based, multi-protease overload.
/// </summary>
/// <param name="polymer">Polymer whose <c>BaseSequence</c> is digested; must not be null.</param>
/// <param name="protease">The single protease to digest with.</param>
/// <param name="maxMissedCleavages">Passed through unchanged.</param>
/// <param name="minLength">Passed through unchanged.</param>
/// <param name="maxLength">Passed through unchanged.</param>
/// <param name="methionineInitiator">Passed through unchanged.</param>
/// <param name="semiDigestion">Passed through unchanged.</param>
/// <returns>The peptide sequences produced by the delegated overload.</returns>
public static IEnumerable<string> Digest(AminoAcidPolymer polymer, IProtease protease, int maxMissedCleavages, int minLength, int maxLength, bool methionineInitiator, bool semiDigestion)
{
    var baseSequence = polymer.BaseSequence;
    IProtease[] singleProtease = { protease };
    return Digest(baseSequence, singleProtease, maxMissedCleavages, minLength, maxLength, methionineInitiator, semiDigestion);
}
/// <summary>
/// Digests a polymer with a single protease using fixed settings: up to 3 missed
/// cleavages, peptide lengths from 1 to <see cref="int.MaxValue"/>, and the two
/// boolean options of the full overload set to <c>true</c> and <c>false</c> respectively.
/// </summary>
/// <param name="sequence">Polymer to digest.</param>
/// <param name="protease">Protease to digest with.</param>
/// <returns>The peptide sequences produced by the full overload.</returns>
public static IEnumerable<string> Digest(AminoAcidPolymer sequence, IProtease protease)
{
    const int defaultMaxMissedCleavages = 3;
    const int defaultMinLength = 1;
    const int defaultMaxLength = int.MaxValue;
    const bool sixthArgument = true;   // second-to-last argument of the full overload
    const bool seventhArgument = false; // last argument of the full overload

    return Digest(sequence, protease, defaultMaxMissedCleavages, defaultMinLength, defaultMaxLength, sixthArgument, seventhArgument);
}
/// <summary>
/// Digests every protein of this organism with the given protease and stores the
/// resulting peptides, and their links to the proteins, in the proteome database.
/// </summary>
/// <param name="protease">Protease used to digest each protein; its <c>MaxMissedCleavages</c> is recorded on the digestion.</param>
/// <param name="name">Name stored on the new digestion record.</param>
/// <param name="description">Description stored on the new digestion record.</param>
/// <param name="progressMonitor">Callback receiving a status message and a percentage; returning <c>false</c> cancels the digest.</param>
/// <returns>
/// The new <see cref="Digestion"/>, or <c>null</c> when the progress monitor cancelled.
/// On cancellation the transaction is never committed.
/// </returns>
public Digestion Digest(IProtease protease, String name, String description, ProgressMonitor progressMonitor)
{
    using (ISession session = ProteomeDb.OpenWriteSession())
    {
        DbOrganism organism = GetEntity(session);
        session.BeginTransaction();

        DbDigestion digestion = new DbDigestion
        {
            Name = name,
            Description = description,
            Organism = organism,
            MaxMissedCleavages = protease.MaxMissedCleavages
        };
        session.Save(digestion);

        if (!progressMonitor.Invoke("Listing proteins", 0))
        {
            return null;
        }

        List<DbProtein> proteins = new List<DbProtein>(organism.Proteins);

        // Maps each peptide sequence to its row id so every distinct peptide is inserted once.
        Dictionary<String, long> digestedPeptideIds = new Dictionary<string, long>();

        const String sqlPeptide = "INSERT INTO ProteomeDbDigestedPeptide (Digestion, MissedCleavages, Sequence, Version) VALUES(@Digestion,@MissedCleavages,@Sequence,1);select last_insert_rowid();";
        const String sqlPeptideProtein = "INSERT INTO ProteomeDbDigestedPeptideProtein (StartIndex, Peptide, Protein, Version) VALUES(?,?,?,1);";

        // Both commands are disposed as soon as the inserts are done (or on cancellation /
        // failure) instead of being left for the garbage collector.
        using (var commandPeptide = session.Connection.CreateCommand())
        using (var commandProtein = session.Connection.CreateCommand())
        {
            commandPeptide.CommandText = sqlPeptide;
            commandPeptide.Parameters.Add(new SQLiteParameter("@Digestion"));
            commandPeptide.Parameters.Add(new SQLiteParameter("@MissedCleavages"));
            commandPeptide.Parameters.Add(new SQLiteParameter("@Sequence"));

            commandProtein.CommandText = sqlPeptideProtein;
            commandProtein.Parameters.Add(new SQLiteParameter("@StartIndex"));
            commandProtein.Parameters.Add(new SQLiteParameter("@Peptide"));
            commandProtein.Parameters.Add(new SQLiteParameter("@Protein"));

            for (int i = 0; i < proteins.Count; i++)
            {
                if (!progressMonitor.Invoke("Digesting " + proteins.Count + " proteins", 100 * i / proteins.Count))
                {
                    return null;
                }

                Protein protein = new Protein(this, proteins[i]);
                foreach (DigestedPeptide digestedPeptide in protease.Digest(protein))
                {
                    // Peptides longer than the maximum are not stored at all.
                    if (digestedPeptide.Sequence.Length > MAX_PEPTIDE_LENGTH)
                    {
                        continue;
                    }

                    long digestedPeptideId;
                    if (!digestedPeptideIds.TryGetValue(digestedPeptide.Sequence, out digestedPeptideId))
                    {
                        // First occurrence of this sequence: insert it and remember its row id.
                        ((SQLiteParameter)commandPeptide.Parameters[0]).Value = digestion.Id;
                        ((SQLiteParameter)commandPeptide.Parameters[1]).Value = digestedPeptide.MissedCleavages;
                        ((SQLiteParameter)commandPeptide.Parameters[2]).Value = digestedPeptide.Sequence;
                        digestedPeptideId = Convert.ToInt64(commandPeptide.ExecuteScalar());
                        digestedPeptideIds.Add(digestedPeptide.Sequence, digestedPeptideId);
                    }

                    // Link the peptide to the protein it was found in.
                    ((SQLiteParameter)commandProtein.Parameters[0]).Value = digestedPeptide.Index;
                    ((SQLiteParameter)commandProtein.Parameters[1]).Value = digestedPeptideId;
                    ((SQLiteParameter)commandProtein.Parameters[2]).Value = proteins[i].Id;
                    commandProtein.ExecuteNonQuery();
                }
            }
        }

        if (!progressMonitor.Invoke("Committing transaction", 99))
        {
            return null;
        }

        session.Transaction.Commit();
        progressMonitor.Invoke(
            "Digested " + proteins.Count + " proteins into " + digestedPeptideIds.Count + " unique peptides",
            100);
        return new Digestion(this, digestion);
    }
}
/// <summary>
/// Digests this protein into peptides with a single protease by delegating to the
/// multi-protease overload.
/// </summary>
/// <param name="protease">The protease to digest with</param>
/// <param name="maxMissedCleavages">The max number of missed cleavages generated, 0 means no missed cleavages</param>
/// <param name="minLength">The minimum length (in amino acids) of the peptide</param>
/// <param name="maxLength">The maximum length (in amino acids) of the peptide</param>
/// <param name="initiatorMethonine">Passed through unchanged to the multi-protease overload</param>
/// <param name="includeModifications">Passed through unchanged to the multi-protease overload</param>
/// <param name="semiDigestion">Passed through unchanged to the multi-protease overload</param>
/// <returns>The digested peptides produced by the multi-protease overload</returns>
public virtual IEnumerable<Peptide> Digest(IProtease protease, int maxMissedCleavages = 3, int minLength = 1, int maxLength = int.MaxValue, bool initiatorMethonine = true, bool includeModifications = false, bool semiDigestion = false)
{
    IProtease[] singleProtease = { protease };
    return Digest(singleProtease, maxMissedCleavages, minLength, maxLength, initiatorMethonine, includeModifications, semiDigestion);
}
/// <summary>
/// Digests every protein in the database with the given protease and stores the
/// resulting (length-truncated) peptide sequences and their links to proteins.
/// </summary>
/// <param name="protease">Protease to digest with; its <c>Name</c> identifies the digestion record.</param>
/// <param name="progressMonitor">Callback receiving a status message and a percentage; returning <c>false</c> cancels the digest.</param>
/// <returns>
/// The <see cref="Digestion"/> for the protease. An existing digestion whose
/// <c>MaxSequenceLength</c> already reaches <c>MAX_SEQUENCE_LENGTH</c> is returned as-is.
/// Returns <c>null</c> when the progress monitor cancelled, in which case the
/// transaction is left uncommitted.
/// </returns>
public Digestion Digest(IProtease protease, ProgressMonitor progressMonitor)
{
    using (ISession session = OpenWriteSession())
    {
        DbDigestion dbDigestion = GetDbDigestion(protease.Name);
        HashSet<string> existingSequences = new HashSet<string>();
        using (var transaction = session.BeginTransaction())
        {
            if (dbDigestion != null)
            {
                if (dbDigestion.MaxSequenceLength >= MAX_SEQUENCE_LENGTH)
                {
                    // Already digested at the current maximum length: nothing to do.
                    return new Digestion(this, dbDigestion);
                }
                if (!progressMonitor.Invoke(Resources.ProteomeDb_Digest_Listing_existing_peptides, 0))
                {
                    return null;
                }
                // Load the sequences already stored for this digestion so they are skipped below.
                IQuery query = session.CreateQuery("SELECT P.Sequence FROM " // Not L10N
                                                   + typeof(DbDigestedPeptide) + " P WHERE P.Digestion = :Digestion") // Not L10N
                    .SetParameter("Digestion", dbDigestion); // Not L10N
                List<String> listSequences = new List<string>();
                query.List(listSequences);
                existingSequences.UnionWith(listSequences);
                dbDigestion.MaxSequenceLength = MAX_SEQUENCE_LENGTH;
                session.Update(dbDigestion);
            }
            else
            {
                dbDigestion = new DbDigestion
                {
                    Name = protease.Name,
                    MinSequenceLength = MIN_SEQUENCE_LENGTH,
                    MaxSequenceLength = MAX_SEQUENCE_LENGTH,
                };
                session.Save(dbDigestion);
            }
            if (!progressMonitor.Invoke(Resources.ProteomeDb_Digest_Listing_proteins, 0))
            {
                return null;
            }
            List<DbProtein> proteins = new List<DbProtein>();
            session.CreateCriteria(typeof(DbProtein)).List(proteins);

            // Maps each truncated sequence to its row id so every distinct peptide is inserted once.
            Dictionary<String, long> digestedPeptideIds = new Dictionary<string, long>();
            const String sqlPeptide =
                "INSERT INTO ProteomeDbDigestedPeptide (Digestion, Sequence) VALUES(?,?);select last_insert_rowid();"; // Not L10N
            using (var commandPeptide = session.Connection.CreateCommand())
            using (var commandProtein = session.Connection.CreateCommand())
            {
                commandPeptide.CommandText = sqlPeptide;
                commandPeptide.Parameters.Add(new SQLiteParameter());
                commandPeptide.Parameters.Add(new SQLiteParameter());
                const String sqlPeptideProtein =
                    "INSERT INTO ProteomeDbDigestedPeptideProtein (Peptide, Protein) VALUES(?,?);"; // Not L10N
                commandProtein.CommandText = sqlPeptideProtein;
                // Exactly one parameter per '?' placeholder in the statement above.
                commandProtein.Parameters.Add(new SQLiteParameter());
                commandProtein.Parameters.Add(new SQLiteParameter());
                for (int i = 0; i < proteins.Count; i++)
                {
                    // Sequences already linked to this protein; truncation can make
                    // different peptides of one protein collide.
                    var proteinSequences = new HashSet<string>();
                    if (!progressMonitor.Invoke(string.Format(Resources.ProteomeDb_Digest_Digesting__0__proteins, proteins.Count), 100 * i / proteins.Count))
                    {
                        return null;
                    }
                    Protein protein = new Protein(ProteomeDbPath, proteins[i]);
                    foreach (DigestedPeptide digestedPeptide in protease.Digest(protein))
                    {
                        if (digestedPeptide.Sequence.Length < dbDigestion.MinSequenceLength)
                        {
                            continue;
                        }
                        // Only the first MaxSequenceLength characters are stored.
                        String truncatedSequence = digestedPeptide.Sequence.Substring(
                            0, Math.Min(digestedPeptide.Sequence.Length, dbDigestion.MaxSequenceLength));
                        if (existingSequences.Contains(truncatedSequence))
                        {
                            continue;
                        }
                        // Add returns false when the sequence was already recorded for this protein.
                        if (!proteinSequences.Add(truncatedSequence))
                        {
                            continue;
                        }
                        long digestedPeptideId;
                        if (!digestedPeptideIds.TryGetValue(truncatedSequence, out digestedPeptideId))
                        {
                            ((SQLiteParameter)commandPeptide.Parameters[0]).Value = dbDigestion.Id;
                            ((SQLiteParameter)commandPeptide.Parameters[1]).Value = truncatedSequence;
                            digestedPeptideId = Convert.ToInt64(commandPeptide.ExecuteScalar());
                            digestedPeptideIds.Add(truncatedSequence, digestedPeptideId);
                        }
                        ((SQLiteParameter)commandProtein.Parameters[0]).Value = digestedPeptideId;
                        ((SQLiteParameter)commandProtein.Parameters[1]).Value = protein.Id;
                        commandProtein.ExecuteNonQuery();
                    }
                }
            }
            if (!progressMonitor.Invoke(Resources.ProteomeDb_AddFastaFile_Saving_changes, 99))
            {
                return null;
            }
            transaction.Commit();
            AnalyzeDb(session);
            progressMonitor.Invoke(
                string.Format(Resources.ProteomeDb_Digest_Digested__0__proteins_into__1__unique_peptides,
                              proteins.Count, digestedPeptideIds.Count), 100);
        }
        return new Digestion(this, dbDigestion);
    }
}
/// <summary>
/// Digests every protein in the database with the given protease and stores the
/// resulting (length-truncated) peptide sequences and their links to proteins.
/// </summary>
/// <param name="protease">Protease to digest with; its <c>Name</c> identifies the digestion record.</param>
/// <param name="progressMonitor">Callback receiving a status message and a percentage; returning <c>false</c> cancels the digest.</param>
/// <returns>
/// The <see cref="Digestion"/> for the protease. An existing digestion whose
/// <c>MaxSequenceLength</c> already reaches <c>MAX_SEQUENCE_LENGTH</c> is returned as-is.
/// Returns <c>null</c> when the progress monitor cancelled, in which case the
/// transaction is left uncommitted.
/// </returns>
public Digestion Digest(IProtease protease, ProgressMonitor progressMonitor)
{
    using (ISession session = OpenWriteSession())
    {
        DbDigestion dbDigestion = GetDbDigestion(protease.Name);
        HashSet<string> existingSequences = new HashSet<string>();
        using (var transaction = session.BeginTransaction())
        {
            if (dbDigestion != null)
            {
                if (dbDigestion.MaxSequenceLength >= MAX_SEQUENCE_LENGTH)
                {
                    // Already digested at the current maximum length: nothing to do.
                    return new Digestion(this, dbDigestion);
                }
                if (!progressMonitor.Invoke(Resources.ProteomeDb_Digest_Listing_existing_peptides, 0))
                {
                    return null;
                }
                // Load the sequences already stored for this digestion so they are skipped below.
                IQuery query = session.CreateQuery("SELECT P.Sequence FROM " // Not L10N
                                                   + typeof(DbDigestedPeptide) + " P WHERE P.Digestion = :Digestion") // Not L10N
                    .SetParameter("Digestion", dbDigestion); // Not L10N
                List<String> listSequences = new List<string>();
                query.List(listSequences);
                existingSequences.UnionWith(listSequences);
                dbDigestion.MaxSequenceLength = MAX_SEQUENCE_LENGTH;
                session.Update(dbDigestion);
            }
            else
            {
                dbDigestion = new DbDigestion
                {
                    Name = protease.Name,
                    MinSequenceLength = MIN_SEQUENCE_LENGTH,
                    MaxSequenceLength = MAX_SEQUENCE_LENGTH,
                };
                session.Save(dbDigestion);
            }
            if (!progressMonitor.Invoke(Resources.ProteomeDb_Digest_Listing_proteins, 0))
            {
                return null;
            }
            List<DbProtein> proteins = new List<DbProtein>();
            session.CreateCriteria(typeof(DbProtein)).List(proteins);

            // Maps each truncated sequence to its row id so every distinct peptide is inserted once.
            Dictionary<String, long> digestedPeptideIds = new Dictionary<string, long>();
            const String sqlPeptide =
                "INSERT INTO ProteomeDbDigestedPeptide (Digestion, Sequence) VALUES(?,?);select last_insert_rowid();"; // Not L10N
            using (var commandPeptide = session.Connection.CreateCommand())
            using (var commandProtein = session.Connection.CreateCommand())
            {
                commandPeptide.CommandText = sqlPeptide;
                commandPeptide.Parameters.Add(new SQLiteParameter());
                commandPeptide.Parameters.Add(new SQLiteParameter());
                const String sqlPeptideProtein =
                    "INSERT INTO ProteomeDbDigestedPeptideProtein (Peptide, Protein) VALUES(?,?);"; // Not L10N
                commandProtein.CommandText = sqlPeptideProtein;
                // Exactly one parameter per '?' placeholder in the statement above.
                commandProtein.Parameters.Add(new SQLiteParameter());
                commandProtein.Parameters.Add(new SQLiteParameter());
                for (int i = 0; i < proteins.Count; i++)
                {
                    // Sequences already linked to this protein; truncation can make
                    // different peptides of one protein collide.
                    var proteinSequences = new HashSet<string>();
                    if (!progressMonitor.Invoke(string.Format(Resources.ProteomeDb_Digest_Digesting__0__proteins, proteins.Count), 100 * i / proteins.Count))
                    {
                        return null;
                    }
                    Protein protein = new Protein(ProteomeDbPath, proteins[i]);
                    foreach (DigestedPeptide digestedPeptide in protease.Digest(protein))
                    {
                        if (digestedPeptide.Sequence.Length < dbDigestion.MinSequenceLength)
                        {
                            continue;
                        }
                        // Only the first MaxSequenceLength characters are stored.
                        String truncatedSequence = digestedPeptide.Sequence.Substring(
                            0, Math.Min(digestedPeptide.Sequence.Length, dbDigestion.MaxSequenceLength));
                        if (existingSequences.Contains(truncatedSequence))
                        {
                            continue;
                        }
                        // Add returns false when the sequence was already recorded for this protein.
                        if (!proteinSequences.Add(truncatedSequence))
                        {
                            continue;
                        }
                        long digestedPeptideId;
                        if (!digestedPeptideIds.TryGetValue(truncatedSequence, out digestedPeptideId))
                        {
                            ((SQLiteParameter)commandPeptide.Parameters[0]).Value = dbDigestion.Id;
                            ((SQLiteParameter)commandPeptide.Parameters[1]).Value = truncatedSequence;
                            digestedPeptideId = Convert.ToInt64(commandPeptide.ExecuteScalar());
                            digestedPeptideIds.Add(truncatedSequence, digestedPeptideId);
                        }
                        ((SQLiteParameter)commandProtein.Parameters[0]).Value = digestedPeptideId;
                        ((SQLiteParameter)commandProtein.Parameters[1]).Value = protein.Id;
                        commandProtein.ExecuteNonQuery();
                    }
                }
            }
            if (!progressMonitor.Invoke(Resources.ProteomeDb_AddFastaFile_Saving_changes, 99))
            {
                return null;
            }
            transaction.Commit();
            AnalyzeDb(session);
            progressMonitor.Invoke(
                string.Format(Resources.ProteomeDb_Digest_Digested__0__proteins_into__1__unique_peptides,
                              proteins.Count, digestedPeptideIds.Count), 100);
        }
        return new Digestion(this, dbDigestion);
    }
}
/// <summary>
/// Digests every protein in the database with the given protease and stores the
/// resulting (length-truncated) peptide sequences and their links to proteins.
/// Digestion runs in parallel batches first; the results are then written to the
/// database in a single transaction.
/// </summary>
/// <param name="protease">Protease to digest with; its <c>Name</c> identifies the digestion record.</param>
/// <param name="maxMissedCleavages">Maximum missed cleavages passed to <c>DigestSequence</c>.</param>
/// <param name="progressMonitor">Receives progress updates and is consulted for cancellation.</param>
/// <param name="status">Progress status passed by reference to the progress/cancellation helper.</param>
/// <param name="delayDbIndexing">When <c>true</c>, <c>AnalyzeDb</c> is not run after a successful commit.</param>
/// <returns>
/// The <see cref="Digestion"/> for the protease. An existing digestion whose
/// <c>MaxSequenceLength</c> already reaches <c>MAX_SEQUENCE_LENGTH</c> is returned as-is.
/// Returns <c>null</c> when there are no proteins, when the operation was cancelled,
/// or when the commit failed.
/// </returns>
public Digestion Digest(IProtease protease, int maxMissedCleavages, IProgressMonitor progressMonitor, ref IProgressStatus status, bool delayDbIndexing = false)
{
    using (ISession session = OpenWriteSession())
    {
        DbDigestion dbDigestion = GetDbDigestion(protease.Name, session);
        HashSet<string> existingSequences; // TODO(bspratt) - the logic around this seems fishy, investigate. Probably never actually been used. Part of fix for issue #304, probably
        if (dbDigestion != null)
        {
            if (dbDigestion.MaxSequenceLength >= MAX_SEQUENCE_LENGTH)
            {
                // Already digested at the current maximum length: nothing to do.
                return new Digestion(this, dbDigestion);
            }
            if (!UpdateProgressAndCheckForCancellation(progressMonitor, ref status, Resources.ProteomeDb_Digest_Listing_existing_peptides, 0))
            {
                return null;
            }
            // Load the sequences already stored for this digestion so they are skipped below.
            IQuery query = session.CreateQuery("SELECT P.Sequence FROM " // Not L10N
                                               + typeof(DbDigestedPeptide) + " P WHERE P.Digestion = :Digestion") // Not L10N
                .SetParameter("Digestion", dbDigestion); // Not L10N
            List<String> listSequences = new List<string>();
            query.List(listSequences);
            existingSequences = new HashSet<string>(listSequences);
            dbDigestion.MaxSequenceLength = MAX_SEQUENCE_LENGTH;
        }
        else
        {
            dbDigestion = new DbDigestion
            {
                Name = protease.Name,
                MinSequenceLength = MIN_SEQUENCE_LENGTH,
                MaxSequenceLength = MAX_SEQUENCE_LENGTH,
            };
            existingSequences = new HashSet<string>();
        }
        if (!UpdateProgressAndCheckForCancellation(progressMonitor, ref status, Resources.ProteomeDb_Digest_Listing_proteins, 0))
        {
            return null;
        }
        var dbProteins = new List<DbProtein>();
        session.CreateCriteria(typeof(DbProtein)).List(dbProteins);

        // Digest the proteins
        var proteinCount = dbProteins.Count;
        if (proteinCount == 0)
        {
            return null;
        }
        var proteinsList = new Protein[proteinCount];
        var truncatedSequences = new HashSet<string>[proteinCount]; // One hashset of sequences for each protein of interest
        const int N_DIGEST_THREADS = 16; // Arbitrary value - do a progress/cancel check every nth protein
        string message = string.Format(Resources.ProteomeDb_Digest_Digesting__0__proteins, proteinCount);
        // Digestion accounts for the first half of the progress range (0-50%).
        for (var i = 0; i < proteinCount; i += N_DIGEST_THREADS)
        {
            var endRange = Math.Min(proteinCount, i + N_DIGEST_THREADS);
            if (!UpdateProgressAndCheckForCancellation(progressMonitor, ref status, message, 50 * endRange / proteinCount))
            {
                return null;
            }
            // The Protein wrappers for this batch are created sequentially, before the parallel loop.
            for (int ii = i; ii < endRange; ii++)
            {
                var protein = new Protein(ProteomeDbPath, dbProteins[ii]);
                proteinsList[ii] = protein;
            }
            // Each iteration writes only to its own array slot; existingSequences is only read.
            Parallel.For(i, endRange, ii =>
            {
                var proteinSequences = new HashSet<string>(); // We only save the first dbDigestion.MaxSequenceLength characters of each peptide so collisions are likely
                truncatedSequences[ii] = proteinSequences; // One hashset of sequences for each protein of interest
                foreach (var digestedPeptide in protease.DigestSequence(proteinsList[ii].Sequence, maxMissedCleavages, null))
                {
                    if (digestedPeptide.Sequence.Length < dbDigestion.MinSequenceLength)
                    {
                        continue;
                    }
                    var truncatedSequence = digestedPeptide.Sequence.Substring(
                        0, Math.Min(digestedPeptide.Sequence.Length, dbDigestion.MaxSequenceLength));
                    if (!existingSequences.Contains(truncatedSequence))
                    {
                        proteinSequences.Add(truncatedSequence);
                    }
                }
            });
        }

        // Now write to db
        if (!UpdateProgressAndCheckForCancellation(progressMonitor, ref status, Resources.ProteomeDb_AddFastaFile_Saving_changes, 50))
        {
            return null;
        }
        bool committed = true;
        int digestedPeptideIdsCount;
        try
        {
            using (var transaction = session.BeginTransaction())
            {
                session.SaveOrUpdate(dbDigestion);

                // Maps each truncated sequence to its row id so every distinct peptide is inserted once.
                Dictionary<String, long> digestedPeptideIds = new Dictionary<string, long>();
                const String sqlPeptide =
                    "INSERT INTO ProteomeDbDigestedPeptide (Digestion, Sequence) VALUES(?,?);select last_insert_rowid();"; // Not L10N
                using (var commandPeptide = session.Connection.CreateCommand())
                using (var commandProtein = session.Connection.CreateCommand())
                {
                    commandPeptide.CommandText = sqlPeptide;
                    commandPeptide.Parameters.Add(new SQLiteParameter());
                    commandPeptide.Parameters.Add(new SQLiteParameter());
                    const String sqlPeptideProtein =
                        "INSERT INTO ProteomeDbDigestedPeptideProtein (Peptide, Protein) VALUES(?,?);"; // Not L10N
                    commandProtein.CommandText = sqlPeptideProtein;
                    // Exactly one parameter per '?' placeholder in the statement above.
                    commandProtein.Parameters.Add(new SQLiteParameter());
                    commandProtein.Parameters.Add(new SQLiteParameter());
                    // Writing accounts for the second half of the progress range (50-100%).
                    for (int i = 0; i < proteinCount; i++)
                    {
                        var protein = proteinsList[i];
                        if (!UpdateProgressAndCheckForCancellation(progressMonitor, ref status, message, 50 * (proteinCount + i) / proteinCount))
                        {
                            return null;
                        }
                        foreach (var truncatedSequence in truncatedSequences[i])
                        {
                            long digestedPeptideId;
                            if (!digestedPeptideIds.TryGetValue(truncatedSequence, out digestedPeptideId))
                            {
                                ((SQLiteParameter)commandPeptide.Parameters[0]).Value = dbDigestion.Id;
                                ((SQLiteParameter)commandPeptide.Parameters[1]).Value = truncatedSequence;
                                digestedPeptideId = Convert.ToInt64(commandPeptide.ExecuteScalar());
                                digestedPeptideIds.Add(truncatedSequence, digestedPeptideId);
                            }
                            ((SQLiteParameter)commandProtein.Parameters[0]).Value = digestedPeptideId;
                            ((SQLiteParameter)commandProtein.Parameters[1]).Value = protein.Id;
                            commandProtein.ExecuteNonQuery();
                        }
                    }
                }
                // A failed commit is recorded rather than propagated; the method then returns null.
                try
                {
                    transaction.Commit();
                }
                catch (Exception)
                {
                    committed = false;
                }
                digestedPeptideIdsCount = digestedPeptideIds.Count;
            }
        }
        catch (Exception)
        {
            // Only a failure that follows a failed commit is swallowed; anything else propagates.
            if (!committed)
            {
                return null; // Interrupted
            }
            else
            {
                throw;
            }
        }
        if (committed && !delayDbIndexing)
        {
            AnalyzeDb(session); // This runs asynchronously, and interferes with writes
        }
        if (committed)
        {
            progressMonitor.UpdateProgress(new ProgressStatus(string.Format(Resources.ProteomeDb_Digest_Digested__0__proteins_into__1__unique_peptides,
                proteinCount, digestedPeptideIdsCount)).ChangePercentComplete(100));
        }
        return committed ? new Digestion(this, dbDigestion) : null;
    }
}
/// <summary>
/// Runs protein grouping repeatedly while ramping the fraction of peptides treated as
/// "identified" from 0% up to 100%, printing the number of identified peptides and the
/// number of resulting protein groups for every step.
/// </summary>
/// <param name="protease">Protease used to digest every protein; also passed to the grouping call.</param>
/// <param name="percentIdentifiedSteps">
/// Increment of the identified fraction between consecutive runs (0.05 means 5%).
/// Must be a finite number greater than zero.
/// </param>
/// <param name="maxMissed">Maximum missed cleavages, forwarded to the digestion and to the grouping call.</param>
/// <param name="minLength">Minimum length argument forwarded to the digestion.</param>
/// <param name="maxLength">Maximum length argument forwarded to the digestion.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="percentIdentifiedSteps"/> is NaN, infinite, zero or negative.
/// </exception>
public static void StartRamp(IProtease protease, double percentIdentifiedSteps = 0.05, int maxMissed = 3, int minLength = 5, int maxLength = 35)
{
    // A zero or negative step would never reach 100% (endless loop), and a non-finite
    // step cannot describe a ramp, so reject those values up front.
    if (double.IsNaN(percentIdentifiedSteps) || double.IsInfinity(percentIdentifiedSteps) || percentIdentifiedSteps <= 0)
    {
        throw new ArgumentOutOfRangeException("percentIdentifiedSteps", percentIdentifiedSteps, "The step size must be a finite number greater than zero.");
    }

    List<Peptide> peps = new List<Peptide>();
    List<Protein> proteins = new List<Protein>();

    using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein protein in reader.ReadNextProtein())
        {
            peps.AddRange(protein.Digest(protease, maxMissed, minLength, maxLength));
            proteins.Add(protein);
        }
    }

    // Fixed seed to make it reproducible
    Random random = new Random(480912341);

    // Shuffle once; every step of the ramp then takes a longer prefix of the same ordering.
    peps = peps.OrderBy(x => random.Next()).ToList();

    // Derive each fraction from an integer step index instead of repeatedly adding the
    // step to a double: repeated addition accumulates rounding error, which can push the
    // last value slightly above 1 and silently drop the 100% step.
    const double Tolerance = 1e-9;
    for (int stepIndex = 0; stepIndex * percentIdentifiedSteps <= 1.0 + Tolerance; stepIndex++)
    {
        // Clamp so the tolerance above can never request more than all peptides.
        double percentIdentified = Math.Min(1.0, stepIndex * percentIdentifiedSteps);

        // Take the first x % to act as our identified peptides
        List<Peptide> identifiedPeptides = peps.Take((int) (peps.Count * percentIdentified)).ToList();

        List<ProteinGroup> proteinGroups = ProteinGroup.GroupProteins(proteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed).ToList();

        Console.WriteLine("{0} peptides {1} protein groups", identifiedPeptides.Count, proteinGroups.Count);
    }
}