/// <summary>
/// Digesting a protein made of two identical "DEREK" repeats with LysC and zero missed
/// cleavages must yield both copies of the peptide rather than collapsing them into one.
/// </summary>
public void DuplicatePeptidesReturn()
{
    Protein prot = new Protein("DEREKDEREK");

    var peptides = prot.Digest(Protease.GetProtease("LysC"), 0).ToList();

    // Expected value goes first so a failure message reports "expected 2, but was N".
    Assert.AreEqual(2, peptides.Count);
}
/// <summary>
/// Example that writes a small pepXML file ("example.pepXML" under
/// <c>Examples.BASE_DIRECTORY</c>) containing a search summary, three modifications
/// and a single spectrum with one peptide spectral match.
/// </summary>
public static void WritePepXml()
{
    string filePath = Path.Combine(Examples.BASE_DIRECTORY, "example.pepXML");
    Console.WriteLine("Writing to " + filePath);

    // The using block guarantees the writer is disposed even if one of the write calls throws.
    using (PepXmlWriter writer = new PepXmlWriter(filePath))
    {
        // Header: sample protease and search summary
        writer.WriteSampleProtease(Protease.Trypsin);

        writer.StartSearchSummary("OMSSA", true, true);
        writer.WriteProteinDatabase("Resources/yeast_uniprot_120226.fasta");
        writer.WriteSearchProtease(Protease.Trypsin, 3);

        // Modifications considered by the search
        writer.WriteModification(ModificationDictionary.GetModification("Acetyl"), ModificationSites.K | ModificationSites.NPep);
        writer.WriteModification(ModificationDictionary.GetModification("CAM"), ModificationSites.C);
        writer.WriteModification(ModificationDictionary.GetModification("Phospho"), ModificationSites.S | ModificationSites.T | ModificationSites.Y, false);

        // Switch the writer over to the spectra section before emitting any spectrum
        writer.SetCurrentStage(PepXmlWriter.Stage.Spectra, true);

        writer.StartSpectrum(15, 1.234, 523.4324, 3);

        // A single PSM scored with an OMSSA e-value
        PeptideSpectralMatch psm = new PeptideSpectralMatch(PeptideSpectralMatchScoreType.OmssaEvalue);
        psm.Score = 1.5e-5;
        Protein protein = new Protein("", "Test Protein");
        psm.Peptide = new Peptide("DEREK", protein);
        psm.Charge = 3;
        writer.WritePSM(psm);

        writer.EndSpectrum();
    }
}
/// <summary>
/// The two peptides produced by a LysC digest (zero missed cleavages) of "DEREKDEREK"
/// must compare equal to each other.
/// </summary>
public void DuplicatePeptidesAreEqualivant()
{
    Protein repeatProtein = new Protein("DEREKDEREK");
    var lysC = Protease.GetProtease("LysC");

    var digested = repeatProtein.Digest(lysC, 0).ToList();
    var firstPeptide = digested[0];
    var secondPeptide = digested[1];

    Assert.AreEqual(firstPeptide, secondPeptide);
}
/// <summary>
/// Writes the given protein by forwarding its sequence and description
/// to the two-argument <c>Write</c> overload.
/// </summary>
/// <param name="protein">The protein whose sequence and description are written</param>
public void Write(Protein protein)
{
    var sequence = protein.Sequence;
    var description = protein.Description;

    Write(sequence, description);
}
/// <summary>
/// Performs an in silico digestion of all the proteins found within the fasta file and maps
/// the supplied unique peptides onto the proteins that produce them. Peptides that cannot be
/// mapped by digestion are searched for by plain substring matching as a fallback.
/// </summary>
/// <param name="fastaFile">The fasta filename to perform the digestion on</param>
/// <param name="uniquePeptides">The unique peptides to map; looked up by peptide sequence with every 'I' replaced by 'L'</param>
/// <param name="proteases">The proteases to digest every protein with</param>
/// <param name="semiDigestion">Perform a semi digestion</param>
/// <returns>Every protein that had at least one of the unique peptides mapped to it</returns>
/// <exception cref="ArgumentException">Thrown when one or more peptides could not be mapped, even by the brute force search</exception>
private List<Protein> GetMappedProteinsFromFasta(string fastaFile, Dictionary<string, Peptide> uniquePeptides, IList<Protease> proteases, bool semiDigestion = false)
{
    string fastaFileniceName = Path.GetFileName(fastaFile);

    // Comma separated list of the protease names, for logging only
    StringBuilder sb = new StringBuilder();
    foreach (Protease protease in proteases)
    {
        sb.Append(protease.Name);
        sb.Append(',');
    }
    if (sb.Length > 0)
    {
        sb.Remove(sb.Length - 1, 1);
    }

    Log("Performing {0}{1} digestion on {2}...", semiDigestion ? "semi " : "", sb, fastaFileniceName);

    int forwardProteins = 0, decoyProteins = 0, forwardProteinsMapped = 0, decoyProteinsMapped = 0, pepsMapped = 0;

    // All proteins that have a peptide that was in the input files. A dictionary (rather than a
    // hash set) is used so the instance that is already stored can be retrieved for an equal protein.
    Dictionary<Protein, Protein> proteins = new Dictionary<Protein, Protein>(1 << 13);

    // Min and Max length of peptides
    int minLength = semiDigestion ? 1 : _smallestPeptide - 1;
    int maxLength = semiDigestion ? int.MaxValue : _largestPeptide + 1;

    // Open the reader for the protein database in the .fasta format
    using (FastaReader reader = new FastaReader(fastaFile))
    {
        // Read in each protein one-by-one
        foreach (Fasta fasta in reader.ReadNextFasta())
        {
            // Create a new protein from the fasta
            Protein prot = new Protein(fasta.Description, fasta.Sequence);

            // Check if the protein is a decoy protein or not
            if (prot.IsDecoy)
            {
                decoyProteins++;
            }
            else
            {
                forwardProteins++;
            }

            // Loop over each protease
            foreach (Protease protease in proteases)
            {
                // Digest the protein with the given protease and max missed cleavages, limiting the
                // peptides to the configured length window (speed improvement)
                foreach (string pepSeq in AminoAcidPolymer.Digest(prot.Sequence, protease, MaxMissedCleavages, minLength, maxLength, semiDigestion: semiDigestion))
                {
                    // Is this one of the unique peptide sequences? If not, we don't care about it.
                    // The lookup key is the leucine sequence (all I's replaced by L's).
                    Peptide pep;
                    if (!uniquePeptides.TryGetValue(pepSeq.Replace('I', 'L'), out pep))
                    {
                        continue;
                    }

                    // Resolve the stored instance so that a protein equal to one already in the
                    // dictionary contributes its peptides to the instance that gets returned,
                    // exactly like the brute force pass below does.
                    Protein mappedProtein;
                    if (!proteins.TryGetValue(prot, out mappedProtein))
                    {
                        proteins.Add(prot, prot);
                        mappedProtein = prot;
                        if (prot.IsDecoy)
                        {
                            decoyProteinsMapped++;
                        }
                        else
                        {
                            forwardProteinsMapped++;
                        }
                    }

                    // Add the peptide to the protein
                    mappedProtein.AddPeptide(pep);

                    // Mark that this peptide was successfully mapped, this is for error checking purposes
                    if (!pep.IsMapped)
                    {
                        pepsMapped++;
                        pep.IsMapped = true;
                    }
                }
            }
        }
    }

    // Check to see if every peptide is matched, if not try using a brute force search method instead
    if (uniquePeptides.Count > pepsMapped)
    {
        // Get all the peptides that weren't mapped
        List<Peptide> unMapedPeptides = uniquePeptides.Values.Where(p => !p.IsMapped).ToList();

        Log("[WARNING] Couldn't find every peptide using digestion method (wrong enzyme perhaps?), trying brute force search instead on the remaining {0} peptides...", unMapedPeptides.Count);
        ProgressUpdate(0.1);

        using (FastaReader reader = new FastaReader(fastaFile))
        {
            // Read in each protein one-by-one
            foreach (Fasta fasta in reader.ReadNextFasta())
            {
                string seq = fasta.Sequence.Replace('I', 'L');

                foreach (Peptide pep2 in unMapedPeptides)
                {
                    if (!seq.Contains(pep2.LeucineSequence))
                    {
                        continue;
                    }

                    Protein prot = new Protein(fasta.Description, fasta.Sequence);
                    Protein realProt;
                    if (proteins.TryGetValue(prot, out realProt))
                    {
                        // Add the peptide to the protein that is already stored
                        realProt.AddPeptide(pep2);
                    }
                    else
                    {
                        proteins.Add(prot, prot);
                        if (prot.IsDecoy)
                        {
                            decoyProteinsMapped++;
                        }
                        else
                        {
                            forwardProteinsMapped++;
                        }

                        // Add the peptide to the protein
                        prot.AddPeptide(pep2);
                    }

                    // Mark that this peptide was successfully mapped, this is for error checking purposes
                    pep2.IsMapped = true;
                }
            }
        }

        // Still missing peptides?
        List<Peptide> stillUnmapped = unMapedPeptides.Where(p => !p.IsMapped).ToList();
        if (stillUnmapped.Count > 0)
        {
            foreach (Peptide pep2 in stillUnmapped)
            {
                Log("[ERROR]\tPeptide {0} was not isMapped", pep2);
            }

            throw new ArgumentException(
                string.Format(
                    "[ERROR] Unable to map every peptide ({0}) to {1}. You might be using either the wrong database, enzyme, or max missed cleavages!",
                    stillUnmapped.Count, fastaFileniceName));
        }
    }

    Log("Every unique peptide was successfully mapped to at least one protein");

    Log("{0:N0} of {1:N0} ({2:F2}%) target proteins were mapped at least once", forwardProteinsMapped, forwardProteins, PercentOf(forwardProteinsMapped, forwardProteins));
    Log("{0:N0} of {1:N0} ({2:F2}%) decoy proteins were mapped at least once", decoyProteinsMapped, decoyProteins, PercentOf(decoyProteinsMapped, decoyProteins));

    // Reset the reported progress to zero (original note: forces the progress bar into marquee mode)
    ProgressUpdate(0.0);

    // Return a list of all the proteins that were mapped at least once
    return proteins.Values.ToList();
}

/// <summary>
/// Returns <paramref name="part"/> as a percentage of <paramref name="total"/>, or 0 when
/// <paramref name="total"/> is 0 (e.g. a database without decoys) so the log never shows "NaN%".
/// </summary>
private static double PercentOf(int part, int total)
{
    return total == 0 ? 0.0 : 100.0 * part / total;
}
/// <summary>
/// Writes "proteins_per_minute.csv" into <paramref name="outputDirectory"/>. For every
/// time point 0, 1, 2, ... (at most 1000 rows) it records how many peptides have a PSM with a
/// retention time at or before that point, and how many protein groups built from those
/// peptides pass the FDR. Writing stops early once every peptide has been seen.
/// </summary>
/// <param name="allPeptides">All peptides; their <c>ProteinGroups</c> are cleared for every time point</param>
/// <param name="proteins">The proteins whose peptides are re-grouped at every time point</param>
/// <param name="outputDirectory">Directory the csv file is written to</param>
private void WriteProteinsPerMinute(List<Peptide> allPeptides, List<Protein> proteins, string outputDirectory)
{
    string fileName = Path.Combine(outputDirectory, "proteins_per_minute.csv");
    Log("Writing file " + fileName);

    double totalPeptideCount = allPeptides.Count;

    using (StreamWriter output = new StreamWriter(fileName))
    {
        output.WriteLine("Time (min),Unique Peptides,Protein Groups");

        for (double minute = 0; minute < 1000; minute++)
        {
            // Peptides that have at least one PSM at or before the current time point
            HashSet<Peptide> seenPeptides = new HashSet<Peptide>(
                allPeptides.Where(pep => pep.PSMs.Any(psm => psm.RetentionTime <= minute)));
            List<Protein> seenProteins = new List<Protein>();

            // Start from a clean slate before the proteins are grouped again
            foreach (Peptide peptide in allPeptides)
            {
                peptide.ProteinGroups.Clear();
            }

            if (seenPeptides.Count > 0)
            {
                foreach (Protein protein in proteins)
                {
                    // A fresh copy of the protein is only created once its first seen peptide shows up
                    Protein copy = null;
                    bool copyCreated = false;

                    foreach (Peptide peptide in protein.Peptides)
                    {
                        if (!seenPeptides.Contains(peptide))
                        {
                            continue;
                        }

                        if (!copyCreated)
                        {
                            copy = new Protein(protein.Description, protein.Sequence);
                            seenProteins.Add(copy);
                            copyCreated = true;
                        }

                        copy.AddPeptide(peptide);
                    }
                }
            }

            List<ProteinGroup> proteinGroups = GroupProteins(seenProteins, false);
            int passingGroups = proteinGroups.Count(g => g.PassesFDR);

            output.WriteLine(minute + "," + seenPeptides.Count + "," + passingGroups);
            ProgressUpdate(seenPeptides.Count / totalPeptideCount);

            // Nothing more to gain once every peptide has been observed
            if (seenPeptides.Count >= totalPeptideCount)
            {
                break;
            }
        }
    }
}
/// <summary>
/// Test fixture setup: builds <c>_proteinA</c> from a fixed amino acid sequence.
/// </summary>
public void Setup()
{
    // The sequence is split into chunks purely for readability; it is a single compile-time constant.
    const string proteinASequence =
        "MMRGFKQRLIKKTTGSSSSSSSKKKDKEKEKEKSSTTSSTSKKPASASSSSHGTTHSSASSTGSKSTTEKGKQSGSVPSQ" +
        "GKHHSSSTSKTKTATTPSSSSSSSRSSSVSRSGSSSTKKTSSRKGQEQSKQSQQPSQSQKQGSSSSSAAIMNPTPVLTVT" +
        "KDDKSTSGEDHAHPTLLGAVSAVPSSPISNASGTAVSSDVENGNSNNNNMNINTSNTQDANHASSQSIDIPRSSHSFERL" +
        "PTPTKLNPDTDLELIKTPQRHSSSRFEPSRYTPLTKLPNFNEVSPEERIPLFIAKVDQCNTMFDFNDPSFDIQGKEIKRS" +
        "TLDELIEFLVTNRFTYTNEMYAHVVNMFKINLFRPIPPPVNPVGDIYDPDEDEPVNELAWPHMQAVYEFFLRFVESPDFN" +
        "HQIAKQYIDQDFILKLLELFDSEDIRERDCLKTTLHRIYGKFLSLRSFIRRSMNNIFLQFIYETEKFNGVAELLEILGSI" +
        "INGFALPLKEEHKVFLVRILIPLHKVRCLSLYHPQLAYCIVQFLEKDPLLTEEVVMGLLRYWPKINSTKEIMFLNEIEDI" +
        "FEVIEPLEFIKVEVPLFVQLAKCISSPHFQVAEKVLSYWNNEYFLNLCIENAEVILPIIFPALYELTSQLELDTANGEDS" +
        "ISDPYMLVEQAINSGSWNRAIHAMAFKALKIFLETNPVLYENCNALYLSSVKETQQRKVQREENWSKLEEYVKNLRINND" +
        "KDQYTIKNPELRNSFNTASENNTLNEENENDCDSEIQ";

    _proteinA = new Protein(proteinASequence);
}
/// <summary>
/// A semi-tryptic digest (digest arguments 0, 5 and 10, with semi digestion enabled)
/// of the test sequence is expected to produce exactly 17 peptides.
/// </summary>
public void SemiTrypiticDigestion()
{
    Protein testProtein = new Protein("MMRGFKQRLIKKTTGSSSSSSSKKKDKEKEKEKSSTTSSTSKKPASASSSSHGTTHSSASSTGSKSTTEKGKQSGSVPSQ");
    var trypsin = Protease.GetProtease("Trypsin");

    var digested = testProtein.Digest(trypsin, 0, 5, 10, semiDigestion: true).ToList();
    int peptideCount = digested.Count;

    Assert.AreEqual(17, peptideCount);
}