/// <summary>
/// Reads a tab-separated protease definition file and builds a lookup keyed by protease name.
/// </summary>
/// <remarks>
/// The first line of the file is read and discarded. Every following non-blank line must contain at least
/// eight tab-separated fields, in this order: name, comma-separated sequences inducing cleavage,
/// comma-separated sequences preventing cleavage, cleavage terminus (parsed case-insensitively as
/// <see cref="TerminusType"/>), cleavage specificity (parsed case-insensitively as
/// <see cref="CleavageSpecificity"/>), PSI-MS accession number, PSI-MS name, and site regular expression.
/// Blank or whitespace-only lines (for example a trailing empty line) are skipped.
/// </remarks>
/// <param name="proteasesLocation">Path of the protease definition file to read.</param>
/// <returns>A dictionary mapping each protease's <c>Name</c> to the constructed <see cref="Protease"/>.</returns>
private static Dictionary<string, Protease> LoadProteaseDictionary(string proteasesLocation)
{
    var proteaseDictionary = new Dictionary<string, Protease>();
    char[] listSeparator = { ',' };

    using (StreamReader reader = new StreamReader(proteasesLocation))
    {
        // The first line is a header row; discard it.
        reader.ReadLine();

        string line;
        while ((line = reader.ReadLine()) != null)
        {
            // A blank line has no fields to index, so skip it instead of failing on fields[1].
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            string[] fields = line.Split('\t');
            string name = fields[0];
            string[] sequencesInducingCleavage = fields[1].Split(listSeparator, StringSplitOptions.RemoveEmptyEntries);
            string[] sequencesPreventingCleavage = fields[2].Split(listSeparator, StringSplitOptions.RemoveEmptyEntries);
            var cleavageTerminus = (TerminusType)Enum.Parse(typeof(TerminusType), fields[3], true);
            var cleavageSpecificity = (CleavageSpecificity)Enum.Parse(typeof(CleavageSpecificity), fields[4], true);
            string psiMsAccessionNumber = fields[5];
            string psiMsName = fields[6];
            string siteRegexp = fields[7];

            var protease = new Protease(name, sequencesInducingCleavage, sequencesPreventingCleavage, cleavageTerminus, cleavageSpecificity, psiMsAccessionNumber, psiMsName, siteRegexp);

            // Add (rather than the indexer) keeps the original behavior of throwing on a duplicate name.
            proteaseDictionary.Add(protease.Name, protease);
        }
    }

    return proteaseDictionary;
}
/// <summary>
/// Gets peptides for semispecific digestion of a protein
/// </summary>
/// <remarks>
/// Three sets of intervals are collected, in this order:
/// 1. For each digestion site i, the interval from site i to the site MaximumMissedCleavages + 1 positions
///    later ("semi"), plus a variant starting at residue 2 ("semi:M cleaved") when the protease reports
///    that the initiator methionine is cleaved.
/// 2. Shorter intervals spanning 1..min(MaximumMissedCleavages, last site index) sites, anchored at the last
///    site ("semiN") or at the first site ("semiC") depending on DigestionParams.TerminusTypeSemiProtease.
/// 3. One interval per proteolysis product that has both a begin and an end position and does not cover
///    the whole protein. Products with an unknown begin or end position are skipped.
/// </remarks>
/// <param name="protein">The protein to digest.</param>
/// <returns>
/// The modified peptides produced by calling GetModifiedPeptides on every collected interval.
/// The sequence is evaluated lazily.
/// </returns>
public IEnumerable<PeptideWithSetModifications> SemiSpecificDigestion(Protein protein)
{
    List<Peptide> intervals = new List<Peptide>();
    List<int> oneBasedIndicesToCleaveAfter = Protease.GetDigestionSiteIndices(protein.BaseSequence);

    for (int i = 0; i < oneBasedIndicesToCleaveAfter.Count - MaximumMissedCleavages - 1; i++)
    {
        // Index of the digestion site that closes the interval opened at site i.
        int endSiteIndex = i + MaximumMissedCleavages + 1;

        if (Protease.Retain(i, InitiatorMethionineBehavior, protein[0])
            && Protease.OkayLength(oneBasedIndicesToCleaveAfter[endSiteIndex] - oneBasedIndicesToCleaveAfter[i], MinPeptidesLength, MaxPeptidesLength))
        {
            intervals.Add(new Peptide(protein,
                oneBasedIndicesToCleaveAfter[i] + 1,
                oneBasedIndicesToCleaveAfter[endSiteIndex],
                oneBasedIndicesToCleaveAfter[endSiteIndex] - oneBasedIndicesToCleaveAfter[i],
                "semi"));
        }

        if (Protease.Cleave(i, InitiatorMethionineBehavior, protein[0])
            && Protease.OkayLength(oneBasedIndicesToCleaveAfter[endSiteIndex] - 1, MinPeptidesLength, MaxPeptidesLength))
        {
            // Same end site, but the interval starts at residue 2.
            intervals.Add(new Peptide(protein,
                2,
                oneBasedIndicesToCleaveAfter[endSiteIndex],
                oneBasedIndicesToCleaveAfter[endSiteIndex] - 1,
                "semi:M cleaved"));
        }
    }

    int lastIndex = oneBasedIndicesToCleaveAfter.Count - 1;
    int maxIndex = Math.Min(MaximumMissedCleavages, lastIndex);
    for (int i = 1; i <= maxIndex; i++)
    {
        if (DigestionParams.TerminusTypeSemiProtease == TerminusType.N) //tricky, it's N because we want the extra peptide at the C terminus |_
        {
            if (Protease.OkayLength(oneBasedIndicesToCleaveAfter[lastIndex] - oneBasedIndicesToCleaveAfter[lastIndex - i], MinPeptidesLength, MaxPeptidesLength))
            {
                intervals.Add(new Peptide(protein,
                    oneBasedIndicesToCleaveAfter[lastIndex - i] + 1,
                    oneBasedIndicesToCleaveAfter[lastIndex],
                    oneBasedIndicesToCleaveAfter[lastIndex] - oneBasedIndicesToCleaveAfter[lastIndex - i],
                    "semiN"));
            }
        }
        else //TerminusType.C
        {
            if (Protease.OkayLength(oneBasedIndicesToCleaveAfter[i] - oneBasedIndicesToCleaveAfter[0], MinPeptidesLength, MaxPeptidesLength))
            {
                intervals.Add(new Peptide(protein,
                    oneBasedIndicesToCleaveAfter[0] + 1,
                    oneBasedIndicesToCleaveAfter[i],
                    oneBasedIndicesToCleaveAfter[i] - oneBasedIndicesToCleaveAfter[0],
                    "semiC"));
            }
        }
    }

    // Also digest using the proteolysis product start/end indices.
    // A nullable position compared with != is true when the position is null, so without the HasValue
    // guards a product with an unknown begin or end would pass the filter and then throw on .Value.
    intervals.AddRange(
        protein.ProteolysisProducts
        .Where(proteolysisProduct =>
            proteolysisProduct.OneBasedBeginPosition.HasValue
            && proteolysisProduct.OneBasedEndPosition.HasValue
            && (proteolysisProduct.OneBasedBeginPosition != 1 || proteolysisProduct.OneBasedEndPosition != protein.Length))
        .Select(proteolysisProduct => new Peptide(protein,
            proteolysisProduct.OneBasedBeginPosition.Value,
            proteolysisProduct.OneBasedEndPosition.Value,
            0,
            proteolysisProduct.Type + " start")));

    return intervals.SelectMany(peptide => peptide.GetModifiedPeptides(AllKnownFixedModifications, DigestionParams, VariableModifications));
}
/// <summary>
/// Digests a protein with the configured protease and expands every resulting interval
/// into its modified peptide forms.
/// </summary>
/// <param name="protein">The protein to digest.</param>
/// <returns>
/// The modified peptides obtained by calling GetModifiedPeptides on each digestion interval.
/// The sequence is evaluated lazily.
/// </returns>
public IEnumerable<PeptideWithSetModifications> Digestion(Protein protein)
{
    var digestionIntervals = Protease.GetDigestionIntervals(
        protein,
        MaximumMissedCleavages,
        InitiatorMethionineBehavior,
        MinPeptidesLength,
        MaxPeptidesLength);

    // Flatten: one interval yields zero or more modified peptides.
    return from interval in digestionIntervals
           from modifiedPeptide in interval.GetModifiedPeptides(AllKnownFixedModifications, DigestionParams, VariableModifications)
           select modifiedPeptide;
}
/// <summary>
/// Chooses a parsimonious set of proteins for the peptides in CompactPeptideToProteinPeptideMatching
/// and builds protein groups from the result.
/// </summary>
/// <remarks>
/// This method mutates the value sets of CompactPeptideToProteinPeptideMatching: it may add peptides
/// (mod-agnostic expansion), removes non-decoy associations from peptides that have a decoy association,
/// removes non-contaminant associations from peptides that have a contaminant association, and finally
/// removes every association whose protein is not in the parsimonious list.
/// </remarks>
/// <returns>
/// An empty list when CompactPeptideToProteinPeptideMatching has no values; otherwise the result of
/// ConstructProteinGroups called with the unique peptides and all remaining peptides.
/// </returns>
private List<ProteinGroup> ApplyProteinParsimony()
{
    //if dictionary is empty return an empty list of protein groups
    if (!CompactPeptideToProteinPeptideMatching.Values.Any())
    {
        return new List<ProteinGroup>();
    }

    // digesting an XML database results in a non-mod-agnostic digestion; need to fix this if mod-agnostic parsimony enabled
    if (!TreatModPeptidesAsDifferentPeptides) //user want modified and unmodified peptides treated the same
    {
        // dictionary where string key is the base sequence and the HashSet is all PeptideWithSetModifications with the same sequence
        // can access which protein these matching peptides came from through the PeptideWithSetModifications object
        var baseSeqToProteinMatch = new Dictionary<string, HashSet<PeptideWithSetModifications>>();
        foreach (var peptide in CompactPeptideToProteinPeptideMatching.SelectMany(b => b.Value))
        {
            if (baseSeqToProteinMatch.TryGetValue(peptide.BaseSequence, out HashSet<PeptideWithSetModifications> value))
            {
                value.Add(peptide);
            }
            else
            {
                baseSeqToProteinMatch[peptide.BaseSequence] = new HashSet<PeptideWithSetModifications> { peptide };
            }
        }

        // reverse lookup: every compact peptide that a given PeptideWithSetModifications is stored under
        var peptideToCompactPeptides = new Dictionary<PeptideWithSetModifications, List<CompactPeptideBase>>();
        foreach (var pep in CompactPeptideToProteinPeptideMatching)
        {
            foreach (var pepWithSetMods in pep.Value)
            {
                if (peptideToCompactPeptides.TryGetValue(pepWithSetMods, out List<CompactPeptideBase> list))
                {
                    list.Add(pep.Key);
                }
                else
                {
                    peptideToCompactPeptides.Add(pepWithSetMods, new List<CompactPeptideBase> { pep.Key });
                }
            }
        }

        foreach (var baseSequence in baseSeqToProteinMatch)
        {
            if (baseSequence.Value.Count > 1 && baseSequence.Value.Any(p => p.NumMods > 0))
            {
                // list of proteins along with start/end residue in protein and the # missed cleavages
                var peptideInProteinInfo = new List<Tuple<Protein, DigestionParams, int, int, int>>();
                foreach (var peptide in baseSequence.Value)
                {
                    peptideInProteinInfo.Add(new Tuple<Protein, DigestionParams, int, int, int>(
                        peptide.Protein,
                        peptide.DigestionParams,
                        peptide.OneBasedStartResidueInProtein,
                        peptide.OneBasedEndResidueInProtein,
                        (int)peptide.MissedCleavages));
                }

                // for every peptide sharing this base sequence, create a copy of it at every recorded
                // protein location and register the copy under each compact peptide the original was under
                foreach (var peptide in baseSequence.Value)
                {
                    foreach (var proteinInfo in peptideInProteinInfo)
                    {
                        var pep = new PeptideWithSetModifications(
                            proteinInfo.Item1,
                            proteinInfo.Item2,
                            proteinInfo.Item3,
                            proteinInfo.Item4,
                            peptide.PeptideDescription,
                            proteinInfo.Item5,
                            peptide.AllModsOneIsNterminus,
                            peptide.NumFixedMods);

                        foreach (var compactPeptide in peptideToCompactPeptides[peptide])
                        {
                            CompactPeptideToProteinPeptideMatching[compactPeptide].Add(pep);
                        }
                    }
                }
            }
        }
    }

    var proteinToPeptidesMatching = new Dictionary<Protein, HashSet<CompactPeptideBase>>();
    var parsimonyProteinList = new Dictionary<Protein, HashSet<CompactPeptideBase>>();
    var proteinsWithUniquePeptides = new Dictionary<Protein, HashSet<PeptideWithSetModifications>>();

    // peptide matched to fullseq (used depending on user preference)
    var compactPeptideToFullSeqMatch = CompactPeptideToProteinPeptideMatching.ToDictionary(x => x.Key, x => x.Value.First().Sequence);

    foreach (var kvp in CompactPeptideToProteinPeptideMatching)
    {
        HashSet<Protein> proteinsAssociatedWithThisPeptide = new HashSet<Protein>(kvp.Value.Select(p => p.Protein));
        if (proteinsAssociatedWithThisPeptide.Count == 1)
        {
            if (!proteinsWithUniquePeptides.TryGetValue(kvp.Value.First().Protein, out HashSet<PeptideWithSetModifications> peptides))
            {
                proteinsWithUniquePeptides.Add(kvp.Value.First().Protein, new HashSet<PeptideWithSetModifications>(kvp.Value));
            }
            else
            {
                peptides.UnionWith(kvp.Value);
            }
        }
        // multiprotease parsimony is "weird" because a peptide sequence can be shared between
        // two proteins but technically be a "unique" peptide because it is unique in that protease digestion
        // this code marks these types of peptides as unique
        else
        {
            foreach (var peptide in kvp.Value)
            {
                Protease protease = peptide.DigestionParams.Protease;
                int sameProteaseCount = kvp.Value.Count(v => v.DigestionParams.Protease == protease);
                if (sameProteaseCount == 1)
                {
                    if (!proteinsWithUniquePeptides.TryGetValue(peptide.Protein, out HashSet<PeptideWithSetModifications> peps))
                    {
                        proteinsWithUniquePeptides.Add(peptide.Protein, new HashSet<PeptideWithSetModifications> { peptide });
                    }
                    else
                    {
                        // NOTE(review): the branch above records only this peptide, while this one unions in
                        // every peptide in kvp.Value (including those of other proteins) - confirm intended.
                        peps.UnionWith(kvp.Value);
                    }
                }
            }
        }

        // if a peptide is associated with a decoy protein, remove all target protein associations with the peptide
        if (kvp.Value.Any(p => p.Protein.IsDecoy))
        {
            kvp.Value.RemoveWhere(p => !p.Protein.IsDecoy);
        }

        // if a peptide is associated with a contaminant protein, remove all target protein associations with the peptide
        if (kvp.Value.Any(p => p.Protein.IsContaminant))
        {
            kvp.Value.RemoveWhere(p => !p.Protein.IsContaminant);
        }
    }

    // makes dictionary with proteins as keys and list of associated peptides as the value (makes parsimony algo easier)
    foreach (var kvp in CompactPeptideToProteinPeptideMatching)
    {
        foreach (var peptide in kvp.Value)
        {
            if (!proteinToPeptidesMatching.TryGetValue(peptide.Protein, out HashSet<CompactPeptideBase> peptides))
            {
                proteinToPeptidesMatching.Add(peptide.Protein, new HashSet<CompactPeptideBase>() { kvp.Key });
            }
            else
            {
                peptides.Add(kvp.Key);
            }
        }
    }

    // build protein list for each peptide before parsimony has been applied
    var peptideSeqProteinListMatch = new Dictionary<string, HashSet<Protein>>();
    foreach (var kvp in proteinToPeptidesMatching)
    {
        foreach (var peptide in kvp.Value)
        {
            string pepSequence;
            if (!TreatModPeptidesAsDifferentPeptides)
            {
                // mod-agnostic key: concatenation of the terminal masses and the monoisotopic mass
                string nTerminalMasses = peptide.NTerminalMasses == null ? "" : string.Join("", peptide.NTerminalMasses.Select(b => b.ToString(CultureInfo.InvariantCulture)));
                string cTerminalMasses = peptide.CTerminalMasses == null ? "" : string.Join("", peptide.CTerminalMasses.Select(b => b.ToString(CultureInfo.InvariantCulture)));
                pepSequence = nTerminalMasses + cTerminalMasses + peptide.MonoisotopicMassIncludingFixedMods.ToString(CultureInfo.InvariantCulture);
            }
            else
            {
                pepSequence = compactPeptideToFullSeqMatch[peptide];
            }

            if (!peptideSeqProteinListMatch.TryGetValue(pepSequence, out HashSet<Protein> proteinListHere))
            {
                peptideSeqProteinListMatch.Add(pepSequence, new HashSet<Protein>() { kvp.Key });
            }
            else
            {
                proteinListHere.Add(kvp.Key);
            }
        }
    }

    // dictionary associates proteins w/ unused base seqs (list will shrink over time)
    var algDictionary = new Dictionary<Protein, HashSet<string>>();
    foreach (var kvp in peptideSeqProteinListMatch)
    {
        foreach (var protein in kvp.Value)
        {
            if (algDictionary.TryGetValue(protein, out HashSet<string> newPeptideBaseSeqs))
            {
                newPeptideBaseSeqs.Add(kvp.Key);
            }
            else
            {
                algDictionary.Add(protein, new HashSet<string> { kvp.Key });
            }
        }
    }

    // dictionary associates proteins w/ all of their base seqs (list will NOT shrink over time).
    // Each set is copied: sharing the HashSet instances with algDictionary would make these sets shrink
    // together with it and turn the total-count tie-break below into a no-op.
    var proteinToPepSeqMatch = algDictionary.ToDictionary(x => x.Key, x => new HashSet<string>(x.Value));

    // *** main parsimony loop
    bool uniquePeptidesLeft = proteinsWithUniquePeptides.Any();
    int numNewSeqs = algDictionary.Max(p => p.Value.Count);

    while (numNewSeqs != 0)
    {
        var possibleBestProteinList = new List<KeyValuePair<Protein, HashSet<string>>>();

        // proteins with unique peptides are taken first, one per iteration
        if (uniquePeptidesLeft)
        {
            var proteinsWithUniquePeptidesLeft = algDictionary.Where(p => proteinsWithUniquePeptides.ContainsKey(p.Key));
            if (proteinsWithUniquePeptidesLeft.Any())
            {
                possibleBestProteinList.Add(proteinsWithUniquePeptidesLeft.First());
            }
            else
            {
                uniquePeptidesLeft = false;
            }
        }

        // gets list of proteins with the most unaccounted-for peptide base sequences
        if (!uniquePeptidesLeft)
        {
            possibleBestProteinList = algDictionary.Where(p => p.Value.Count == numNewSeqs).ToList();
        }

        Protein bestProtein = possibleBestProteinList.First().Key;
        HashSet<string> newSeqs = new HashSet<string>(algDictionary[bestProtein]);

        // may need to select different protein
        if (possibleBestProteinList.Count > 1)
        {
            var proteinsWithTheseBaseSeqs = new HashSet<Protein>();
            foreach (var kvp in possibleBestProteinList)
            {
                if (newSeqs.IsSubsetOf(kvp.Value))
                {
                    proteinsWithTheseBaseSeqs.Add(kvp.Key);
                }
            }

            if (proteinsWithTheseBaseSeqs.Count > 1)
            {
                // tie-break: among candidates covering the same new sequences, prefer the protein
                // with the most sequences overall
                var proteinsOrderedByTotalPeptideCount = new Dictionary<Protein, HashSet<string>>();
                foreach (var protein in proteinsWithTheseBaseSeqs)
                {
                    proteinsOrderedByTotalPeptideCount.Add(protein, proteinToPepSeqMatch[protein]);
                }

                bestProtein = proteinsOrderedByTotalPeptideCount.OrderByDescending(candidate => candidate.Value.Count).First().Key;
            }
        }

        parsimonyProteinList.Add(bestProtein, proteinToPeptidesMatching[bestProtein]);

        // remove used peptides from their proteins
        foreach (var newBaseSeq in newSeqs)
        {
            HashSet<Protein> proteinsWithThisPeptide = peptideSeqProteinListMatch[newBaseSeq];
            foreach (var protein in proteinsWithThisPeptide)
            {
                algDictionary[protein].Remove(newBaseSeq);
            }
        }

        algDictionary.Remove(bestProtein);
        numNewSeqs = algDictionary.Any() ? algDictionary.Max(p => p.Value.Count) : 0;
    }
    // *** done with parsimony

    // add indistinguishable proteins
    var proteinsGroupedByNumPeptides = proteinToPeptidesMatching.GroupBy(p => p.Value.Count);
    var parsimonyProteinsGroupedByNumPeptides = parsimonyProteinList.GroupBy(p => p.Value.Count);
    var indistinguishableProteins = new ConcurrentDictionary<Protein, HashSet<CompactPeptideBase>>();

    foreach (var group in proteinsGroupedByNumPeptides)
    {
        var parsimonyProteinsWithSameNumPeptides = parsimonyProteinsGroupedByNumPeptides.FirstOrDefault(p => p.Key == group.Key);
        var list = group.ToList();
        if (parsimonyProteinsWithSameNumPeptides != null)
        {
            Parallel.ForEach(
                Partitioner.Create(0, list.Count),
                new ParallelOptions { MaxDegreeOfParallelism = commonParameters.MaxThreadsToUsePerFile },
                (range, loopState) =>
                {
                    for (int i = range.Item1; i < range.Item2; i++)
                    {
                        foreach (var parsimonyProteinWithThisNumPeptides in parsimonyProteinsWithSameNumPeptides)
                        {
                            // a different protein with exactly the same peptide set as a parsimonious protein
                            if (parsimonyProteinWithThisNumPeptides.Key != list[i].Key
                                && proteinToPeptidesMatching[parsimonyProteinWithThisNumPeptides.Key].SetEquals(proteinToPeptidesMatching[list[i].Key]))
                            {
                                indistinguishableProteins.GetOrAdd(list[i].Key, proteinToPeptidesMatching[list[i].Key]);
                            }
                        }
                    }
                });
        }
    }

    foreach (var protein in indistinguishableProteins)
    {
        if (!parsimonyProteinList.ContainsKey(protein.Key))
        {
            parsimonyProteinList.Add(protein.Key, protein.Value);
        }
    }

    // multiprotease parsimony:
    // this code is a workaround to add back proteins to the parsimonious list that were removed
    // because unique peptides were mistaken for shared peptides. see the "multiprotease parsimony"
    // branch of the unique-peptide loop earlier in this method for more info
    if (ListOfDigestionParams.Select(v => v.Protease).Distinct().Count() > 1)
    {
        HashSet<Protein> parsimonyProteinSet = new HashSet<Protein>(parsimonyProteinList.Keys);

        // add back in proteins that contain unique peptides
        foreach (var prot in proteinsWithUniquePeptides)
        {
            if (!parsimonyProteinSet.Contains(prot.Key))
            {
                // NOTE(review): proteinsWithUniquePeptides is filled before the decoy/contaminant pruning above,
                // so this lookup presumably can miss if all of a protein's associations were pruned - verify.
                parsimonyProteinList.Add(prot.Key, proteinToPeptidesMatching[prot.Key]);
            }
        }
    }

    // drop every peptide-protein association whose protein did not make the parsimonious list
    foreach (var kvp in CompactPeptideToProteinPeptideMatching)
    {
        kvp.Value.RemoveWhere(p => !parsimonyProteinList.ContainsKey(p.Protein));
    }

    return ConstructProteinGroups(
        new HashSet<PeptideWithSetModifications>(proteinsWithUniquePeptides.Values.SelectMany(p => p)),
        new HashSet<PeptideWithSetModifications>(CompactPeptideToProteinPeptideMatching.Values.SelectMany(p => p)));
}