/// <summary>
/// Draws one circle on <c>SequenceDrawingCanvas</c> for every modification parsed from
/// <paramref name="fullSequence"/>.
/// </summary>
/// <remarks>
/// A modification whose type is "O-Glycosylation" is drawn gray unless the localized-glycan list of the
/// current <c>SpectrumMatch</c> contains an entry for its position; every other modification is drawn
/// with <c>MetaDrawSettings.ModificationAnnotationColor</c>.
/// </remarks>
/// <param name="fullSequence">Full (modified) peptide sequence used to construct the peptide.</param>
/// <param name="yLoc">Vertical base position; circles are drawn 2 units below it.</param>
protected void AnnotateModifications(string fullSequence, int yLoc)
{
    var peptide = new PeptideWithSetModifications(fullSequence, GlobalVariables.AllModsKnownDictionary);

    // read glycans if applicable; the list stays null when the match has no glycan localization level
    List<Tuple<int, string, double>> localGlycans = null;
    if (SpectrumMatch.GlycanLocalizationLevel != null)
    {
        localGlycans = PsmFromTsv.ReadLocalizedGlycan(SpectrumMatch.LocalizedGlycan);
    }

    // annotate mods
    foreach (var mod in peptide.AllModsOneIsNterminus)
    {
        double xLocation = (mod.Key - 1) * MetaDrawSettings.AnnotatedSequenceTextSpacing - 12;
        double yLocation = yLoc + 2;

        // An O-glycan counts as localized only when a glycan entry (Item1 + 1) matches the mod key.
        // The null check prevents a NullReferenceException when no glycan list was read;
        // in that case the glycan is treated as not localized and drawn gray.
        bool isUnlocalizedOGlycan = mod.Value.ModificationType == "O-Glycosylation"
            && !(localGlycans != null && localGlycans.Any(p => p.Item1 + 1 == mod.Key));

        if (isUnlocalizedOGlycan)
        {
            DrawCircle(SequenceDrawingCanvas, new Point(xLocation, yLocation), Brushes.Gray);
        }
        else
        {
            DrawCircle(SequenceDrawingCanvas, new Point(xLocation, yLocation), MetaDrawSettings.ModificationAnnotationColor);
        }
    }
}
/// <summary>
/// Tests whether a peptide overlaps an applied sequence variation and, when requested,
/// whether the overlapping residues actually differ from the original sequence.
/// </summary>
/// <param name="pep">Peptide whose one-based start/end residues in the protein define its span.</param>
/// <param name="appliedVariation">Applied variation supplying the begin/end positions and the original and variant sequences.</param>
/// <param name="checkUnique">
/// When false, any overlap is reported as an intersection. When true, the overlap must also show a
/// sequence change (or the original sequence must be shorter/longer than the overlap).
/// </param>
/// <returns>True if the peptide intersects the variation under the rules above; otherwise false.</returns>
private static bool IntersectsWithVariation(PeptideWithSetModifications pep, SequenceVariation appliedVariation, bool checkUnique)
{
    int overlapStart = Math.Max(pep.OneBasedStartResidueInProtein, appliedVariation.OneBasedBeginPosition);
    int overlapEnd = Math.Min(pep.OneBasedEndResidueInProtein, appliedVariation.OneBasedEndPosition);

    // no residues in common
    if (overlapEnd < overlapStart)
    {
        return false;
    }

    // any overlap is enough when uniqueness is not requested
    if (!checkUnique)
    {
        return true;
    }

    int overlapLength = overlapEnd - overlapStart + 1;
    int offsetIntoVariant = overlapStart - appliedVariation.OneBasedBeginPosition;

    // if the original sequence is too short or too long, the overlap is unique by definition
    bool originalTooShort = appliedVariation.OriginalSequence.Length - offsetIntoVariant < overlapLength;
    bool originalTooLong = appliedVariation.OriginalSequence.Length > overlapLength
        && pep.OneBasedEndResidueInProtein > overlapEnd;
    if (originalTooShort || originalTooLong)
    {
        return true;
    }

    // otherwise compare the overlapping stretch of the original and the variant
    string originalSlice = appliedVariation.OriginalSequence.Substring(offsetIntoVariant, overlapLength);
    string variantSlice = appliedVariation.VariantSequence.Substring(offsetIntoVariant, overlapLength);
    return !string.Equals(originalSlice, variantSlice);
}
/// <summary>
/// Creates a peptide that takes its modifications from one peptide and everything else from another.
/// </summary>
/// <param name="modsFromThisOne">
/// Supplies the modification dictionary and the fixed-modification count. The dictionary reference is
/// assigned directly, not copied.
/// </param>
/// <param name="everythingElseFromThisOne">
/// Supplies the protein, start/end residues, missed cleavages and peptide description passed to the base constructor.
/// </param>
public PeptideWithSetModifications(PeptideWithSetModifications modsFromThisOne, PeptideWithSetModifications everythingElseFromThisOne)
    : base(
        everythingElseFromThisOne.Protein,
        everythingElseFromThisOne.OneBasedStartResidueInProtein,
        everythingElseFromThisOne.OneBasedEndResidueInProtein,
        everythingElseFromThisOne.MissedCleavages,
        everythingElseFromThisOne.PeptideDescription)
{
    numFixedMods = modsFromThisOne.numFixedMods;
    allModsOneIsNterminus = modsFromThisOne.allModsOneIsNterminus;
}
/// <summary>
/// Builds the string describing a detected sequence variation, including any modifications
/// that sit on the variant residues.
/// </summary>
/// <param name="p">Peptide whose modifications and protein context are used.</param>
/// <param name="applied">The applied sequence variation being described.</param>
/// <returns>
/// The original sequence, followed by the variation's one-based begin position, followed by the
/// full sequence of a peptide spanning the variation with the selected modifications.
/// </returns>
private static string SequenceVariantString(PeptideWithSetModifications p, SequenceVariation applied)
{
    var modsOnVariantOneIsNTerm = p.AllModsOneIsNterminus
        .Where(kv =>
        {
            // key 1 is kept only when the variation begins at position 1
            bool firstKeyOnVariantAtPositionOne = kv.Key == 1 && applied.OneBasedBeginPosition == 1;

            // mod key translated to a one-based residue position in the protein
            int residueInProtein = kv.Key - 2 + p.OneBasedStartResidueInProtein;
            bool insideVariation = applied.OneBasedBeginPosition <= residueInProtein
                && residueInProtein <= applied.OneBasedEndPosition;

            return firstKeyOnVariantAtPositionOne || insideVariation;
        })
        .ToDictionary(kv => kv.Key - applied.OneBasedBeginPosition + 1, kv => kv.Value);

    // peptide spanning exactly the variation, carrying only the mods selected above
    var variantWithAnyMods = new PeptideWithSetModifications(
        p.Protein,
        p.DigestionParams,
        applied.OneBasedBeginPosition,
        applied.OneBasedEndPosition,
        p.CleavageSpecificityForFdrCategory,
        p.PeptideDescription,
        p.MissedCleavages,
        modsOnVariantOneIsNTerm,
        p.NumFixedMods);

    return $"{applied.OriginalSequence}{applied.OneBasedBeginPosition}{variantWithAnyMods.FullSequence}";
}
/// <summary>
/// Returns a clone of <paramref name="psm"/> in which every best-matching peptide has been replaced by
/// the peptide produced by <c>CreateSilacPwsm(silacLabel, peptide)</c>; notches are preserved.
/// </summary>
/// <param name="psm">Match whose best-matching peptides are converted.</param>
/// <param name="silacLabel">Label handed to <c>CreateSilacPwsm</c> for each peptide.</param>
/// <returns>The result of <c>psm.Clone</c> called with the converted peptide list.</returns>
public static PeptideSpectralMatch GetSilacPsm(PeptideSpectralMatch psm, SilacLabel silacLabel)
{
    List<(int Notch, PeptideWithSetModifications Peptide)> convertedMatches = psm.BestMatchingPeptides
        .Select(match => (Notch: match.Notch, Peptide: CreateSilacPwsm(silacLabel, match.Peptide)))
        .ToList();

    return psm.Clone(convertedMatches);
}
/// <summary>
/// Builds a compact peptide from a peptide with set modifications, computing fragment masses only
/// for the requested terminus (or both when <paramref name="terminusType"/> is <c>None</c>).
/// </summary>
/// <param name="peptideWithSetModifications">Peptide supplying the sequence, modifications and monoisotopic mass.</param>
/// <param name="terminusType">
/// <c>N</c> fills only <c>NTerminalMasses</c>, <c>C</c> fills only <c>CTerminalMasses</c>, <c>None</c> fills both;
/// whichever array is not computed is left null.
/// </param>
public CompactPeptide(PeptideWithSetModifications peptideWithSetModifications, TerminusType terminusType)
{
    bool includeNTerminal = terminusType == TerminusType.None || terminusType == TerminusType.N;
    bool includeCTerminal = terminusType == TerminusType.None || terminusType == TerminusType.C;

    // N-terminal walk starts at index 0 and moves forward
    NTerminalMasses = includeNTerminal
        ? ComputeFollowingFragmentMasses(peptideWithSetModifications, 0, 0, 1).ToArray()
        : null;

    // C-terminal walk starts one past the last residue and moves backward
    CTerminalMasses = includeCTerminal
        ? ComputeFollowingFragmentMasses(peptideWithSetModifications, 0, peptideWithSetModifications.Length + 1, -1).ToArray()
        : null;

    MonoisotopicMassIncludingFixedMods = peptideWithSetModifications.MonoisotopicMass;
}
/// <summary>
/// Returns a new peptide in which an extra mass has been placed on one position of this peptide.
/// </summary>
/// <param name="j">Index of the position to localize to; the modification dictionary key used is <c>j + 2</c>.</param>
/// <param name="massToLocalize">Mass to place at that position.</param>
/// <returns>
/// A new peptide with a copy of this peptide's modifications in which the entry at key <c>j + 2</c>
/// is a modification whose mass is <paramref name="massToLocalize"/> plus the mass of any modification
/// that was already there.
/// </returns>
public PeptideWithSetModifications Localize(int j, double massToLocalize)
{
    int targetKey = j + 2;

    // work on a copy so this peptide's own dictionary is left untouched
    var localizedMods = new Dictionary<int, ModificationWithMass>(allModsOneIsNterminus);

    // fold the mass of a pre-existing modification into the new one
    double existingMass = 0;
    if (localizedMods.TryGetValue(targetKey, out ModificationWithMass existingMod))
    {
        existingMass = existingMod.monoisotopicMass;
        localizedMods.Remove(targetKey);
    }

    var combinedMod = new ModificationWithMass(null, null, null, TerminusLocalization.Any, massToLocalize + existingMass);
    localizedMods.Add(targetKey, combinedMod);

    return new PeptideWithSetModifications(
        numFixedMods,
        Protein,
        OneBasedStartResidueInProtein,
        OneBasedEndResidueInProtein,
        localizedMods,
        MissedCleavages);
}
/// <summary>
/// Lazily yields cumulative fragment masses while walking along a peptide in one direction,
/// each rounded to <c>digitsForRoundingMasses</c> digits.
/// </summary>
/// <remarks>
/// Positions 0 and <c>yyy.Length + 1</c> contribute no residue mass and never yield a value
/// (presumably they stand for the two termini; confirm against callers). The modification for a
/// position is looked up under key <c>oneBasedIndexToLookAt + 1</c>. When a modification has several
/// neutral losses (or sits on one of the two boundary positions), the walk branches: the method
/// recurses once per neutral loss and then stops its own loop.
/// </remarks>
/// <param name="yyy">Peptide providing residues (via its indexer), length and modification dictionary.</param>
/// <param name="prevMass">Mass accumulated before the current position.</param>
/// <param name="oneBasedIndexToLookAt">Position to start from.</param>
/// <param name="direction">Step added to the position each iteration; the loop conditions handle 1 and -1.</param>
/// <returns>A deferred sequence of rounded masses.</returns>
protected static IEnumerable <double> ComputeFollowingFragmentMasses(PeptideWithSetModifications yyy, double prevMass, int oneBasedIndexToLookAt, int direction)
{
    ModificationWithMass currentModification = null;
    do
    {
        // add the residue mass unless we are on one of the two boundary positions
        if (oneBasedIndexToLookAt != 0 && oneBasedIndexToLookAt != yyy.Length + 1)
        {
            prevMass += Residue.ResidueMonoisotopicMass[yyy[oneBasedIndexToLookAt - 1]];
        }

        // If modification exists
        if (yyy.allModsOneIsNterminus.TryGetValue(oneBasedIndexToLookAt + 1, out currentModification))
        {
            // simple case: exactly one neutral loss on a non-boundary position; fold it in and keep walking
            if (currentModification.neutralLosses.Count == 1 && oneBasedIndexToLookAt != 0 && oneBasedIndexToLookAt != yyy.Length + 1)
            {
                prevMass += currentModification.monoisotopicMass - currentModification.neutralLosses.First();
                yield return(Math.Round(prevMass, digitsForRoundingMasses));
            }
            else
            {
                // branching case: every neutral loss starts its own continuation of the walk
                foreach (double nl in currentModification.neutralLosses)
                {
                    var theMass = prevMass + currentModification.monoisotopicMass - nl;
                    // boundary positions never yield a mass of their own
                    if (oneBasedIndexToLookAt != 0 && oneBasedIndexToLookAt != yyy.Length + 1)
                    {
                        yield return(Math.Round(theMass, digitsForRoundingMasses));
                    }
                    // recurse only if the next position is still inside the range the outer loop would visit
                    if ((direction == 1 && oneBasedIndexToLookAt + direction < yyy.Length) || (direction == -1 && oneBasedIndexToLookAt + direction > 1))
                    {
                        foreach (var nextMass in ComputeFollowingFragmentMasses(yyy, theMass, oneBasedIndexToLookAt + direction, direction))
                        {
                            yield return(Math.Round(nextMass, digitsForRoundingMasses));
                        }
                    }
                }
                // the recursive calls have already produced everything that follows this position
                break;
            }
        }
        else if (oneBasedIndexToLookAt != 0 && oneBasedIndexToLookAt != yyy.Length + 1) // No modification exists
        {
            yield return(Math.Round(prevMass, digitsForRoundingMasses));
        }
        oneBasedIndexToLookAt += direction;
    } while ((oneBasedIndexToLookAt > 1 && direction == -1) || (oneBasedIndexToLookAt < yyy.Length && direction == 1));
}
/// <summary>
/// Reports whether the peptide identifies at least one of the sequence variations applied to its protein.
/// </summary>
/// <param name="pwsm">Peptide to test against <c>pwsm.Protein.AppliedSequenceVariations</c>.</param>
/// <returns>
/// True as soon as <c>IntersectsAndIdentifiesVariation</c> reports <c>identifies</c> for any applied
/// variation; false when there are no applied variations or none is identified.
/// </returns>
private static bool PeptideIsVariant(PeptideWithSetModifications pwsm)
{
    return pwsm.Protein.AppliedSequenceVariations
        .Any(variant => pwsm.IntersectsAndIdentifiesVariation(variant).identifies);
}
/// <summary>
/// Rewrites the proteins behind each PSM so they appear only light
/// (a protein sequence should look like PROTEINK instead of PROTEINa).
/// </summary>
/// <param name="originalPsms">PSMs whose best-matching peptides are rebuilt.</param>
/// <param name="labels">Candidate labels; the relevant one is chosen per peptide from its base sequence.</param>
/// <returns>
/// A new list with one clone per input PSM. Each peptide is recreated with the same properties; when a
/// relevant label is found, its protein is replaced by one whose base sequence has the label residue
/// (and any additional label residues) swapped back to the original amino acids.
/// </returns>
public static List<PeptideSpectralMatch> UpdateProteinSequencesToLight(List<PeptideSpectralMatch> originalPsms, List<SilacLabel> labels)
{
    var lightPsms = new List<PeptideSpectralMatch>();

    foreach (PeptideSpectralMatch psm in originalPsms)
    {
        var lightMatches = new List<(int Notch, PeptideWithSetModifications Peptide)>();

        // snapshot the matches before building replacements
        foreach ((int Notch, PeptideWithSetModifications Peptide) match in psm.BestMatchingPeptides.ToList())
        {
            PeptideWithSetModifications source = match.Peptide;
            SilacLabel label = GetRelevantLabelFromBaseSequence(source.BaseSequence, labels);

            // without a relevant label the protein is reused unchanged
            Protein lightProtein = source.Protein;
            if (label != null)
            {
                string lightSequence = lightProtein.BaseSequence.Replace(label.AminoAcidLabel, label.OriginalAminoAcid);
                if (label.AdditionalLabels != null)
                {
                    foreach (SilacLabel extraLabel in label.AdditionalLabels)
                    {
                        lightSequence = lightSequence.Replace(extraLabel.AminoAcidLabel, extraLabel.OriginalAminoAcid);
                    }
                }

                lightProtein = new Protein(source.Protein, lightSequence);
            }

            var rebuilt = new PeptideWithSetModifications(
                lightProtein,
                source.DigestionParams,
                source.OneBasedStartResidueInProtein,
                source.OneBasedEndResidueInProtein,
                source.CleavageSpecificityForFdrCategory,
                source.PeptideDescription,
                source.MissedCleavages,
                source.AllModsOneIsNterminus,
                source.NumFixedMods,
                source.BaseSequence);

            lightMatches.Add((match.Notch, rebuilt));
        }

        lightPsms.Add(psm.Clone(lightMatches));
    }

    return lightPsms;
}
/// <summary>
/// Clones a PSM so that it holds exactly one peptide: a copy of <paramref name="pwsm"/> whose base
/// sequence is replaced by <paramref name="labeledBaseSequence"/>.
/// </summary>
/// <param name="psm">Match to clone.</param>
/// <param name="notch">Notch stored alongside the labeled peptide.</param>
/// <param name="pwsm">Peptide supplying every property except the base sequence.</param>
/// <param name="labeledBaseSequence">Base sequence passed to the new peptide.</param>
/// <returns>The result of <c>psm.Clone</c> called with the single-entry peptide list.</returns>
public static PeptideSpectralMatch GetLabeledPsm(PeptideSpectralMatch psm, int notch, PeptideWithSetModifications pwsm, string labeledBaseSequence)
{
    var relabeled = new PeptideWithSetModifications(
        pwsm.Protein,
        pwsm.DigestionParams,
        pwsm.OneBasedStartResidueInProtein,
        pwsm.OneBasedEndResidueInProtein,
        pwsm.CleavageSpecificityForFdrCategory,
        pwsm.PeptideDescription,
        pwsm.MissedCleavages,
        pwsm.AllModsOneIsNterminus,
        pwsm.NumFixedMods,
        labeledBaseSequence);

    var singleMatch = new List<(int Notch, PeptideWithSetModifications Peptide)>();
    singleMatch.Add((notch, relabeled));

    return psm.Clone(singleMatch);
}
/// <summary>
/// Needed for parsimony, where there are ambiguous PSMs (quantification ignores ambiguity).
/// Clones the PSM, converting each best-matching peptide whose protein carries a relevant label.
/// </summary>
/// <param name="psm">Possibly ambiguous match to convert.</param>
/// <param name="silacLabels">Candidate labels; the relevant one is chosen from each peptide's protein base sequence.</param>
/// <returns>
/// A clone of <paramref name="psm"/>. Peptides without a relevant label are carried over unchanged;
/// the others are replaced by <c>CreateSilacPwsm(true, label, peptide)</c>.
/// </returns>
public static PeptideSpectralMatch GetSilacPsmFromAmbiguousPsm(PeptideSpectralMatch psm, List<SilacLabel> silacLabels)
{
    var convertedMatches = new List<(int Notch, PeptideWithSetModifications Peptide)>();

    foreach ((int Notch, PeptideWithSetModifications Peptide) match in psm.BestMatchingPeptides)
    {
        SilacLabel label = GetRelevantLabelFromBaseSequence(match.Peptide.Protein.BaseSequence, silacLabels);

        // nothing to convert for this peptide
        if (label == null)
        {
            convertedMatches.Add(match);
            continue;
        }

        // create light pwsm
        PeptideWithSetModifications lightPeptide = CreateSilacPwsm(true, label, match.Peptide);
        convertedMatches.Add((match.Notch, lightPeptide));
    }

    return psm.Clone(convertedMatches);
}
/// <summary>
/// Creates a copy of a peptide whose base sequence has the label residues swapped back to the
/// original amino acids (the light sequence).
/// </summary>
/// <param name="silacLabel">Primary label; its <c>AdditionalLabels</c>, if any, are applied afterwards in order.</param>
/// <param name="pwsm">Peptide to copy; only the base sequence differs in the result.</param>
/// <returns>A new peptide with the same properties and the converted base sequence.</returns>
public static PeptideWithSetModifications CreateSilacPwsm(SilacLabel silacLabel, PeptideWithSetModifications pwsm)
{
    // primary label first
    string lightSequence = pwsm.BaseSequence.Replace(silacLabel.AminoAcidLabel, silacLabel.OriginalAminoAcid);

    // then every additional label, if the primary label carries any
    if (silacLabel.AdditionalLabels != null)
    {
        foreach (SilacLabel extraLabel in silacLabel.AdditionalLabels)
        {
            lightSequence = lightSequence.Replace(extraLabel.AminoAcidLabel, extraLabel.OriginalAminoAcid);
        }
    }

    // the base sequence is the only thing changing
    var lightPeptide = new PeptideWithSetModifications(
        pwsm.Protein,
        pwsm.DigestionParams,
        pwsm.OneBasedStartResidueInProtein,
        pwsm.OneBasedEndResidueInProtein,
        pwsm.CleavageSpecificityForFdrCategory,
        pwsm.PeptideDescription,
        pwsm.MissedCleavages,
        pwsm.AllModsOneIsNterminus,
        pwsm.NumFixedMods,
        lightSequence);
    return lightPeptide;
}
/// <summary>
/// Creates a peptide covering a different residue range of the same protein, carrying over only the
/// modifications of <paramref name="modsFromThisOne"/> that fall inside the new range.
/// </summary>
/// <remarks>
/// Modification keys are re-based so they are relative to <paramref name="proteinOneBasedStart"/>.
/// The base constructor receives <c>proteinOneBasedEnd - proteinOneBasedStart</c> in the
/// missed-cleavages position. This constructor does not assign <c>numFixedMods</c>.
/// </remarks>
/// <param name="modsFromThisOne">Peptide supplying the protein, description and candidate modifications.</param>
/// <param name="proteinOneBasedStart">One-based start residue of the new peptide in the protein.</param>
/// <param name="proteinOneBasedEnd">One-based end residue of the new peptide in the protein.</param>
public PeptideWithSetModifications(PeptideWithSetModifications modsFromThisOne, int proteinOneBasedStart, int proteinOneBasedEnd)
    : base(
        modsFromThisOne.Protein,
        proteinOneBasedStart,
        proteinOneBasedEnd,
        proteinOneBasedEnd - proteinOneBasedStart,
        modsFromThisOne.PeptideDescription)
{
    int sourceStart = modsFromThisOne.OneBasedStartResidueInProtein;

    // key window (in the source peptide's numbering) that maps onto the new residue range
    int lowestKeyExclusive = 1 + proteinOneBasedStart - sourceStart;
    int highestKeyInclusive = 2 + proteinOneBasedEnd - sourceStart;

    // amount added to a source key to express it relative to the new start
    int keyShift = sourceStart - proteinOneBasedStart;

    allModsOneIsNterminus = modsFromThisOne.allModsOneIsNterminus
        .Where(mod => mod.Key > lowestKeyExclusive && mod.Key <= highestKeyInclusive)
        .ToDictionary(mod => mod.Key + keyShift, mod => mod.Value);
}
/// <summary>
/// Builds a parsimonious protein list from the FDR-filtered peptides and PSMs and returns the protein groups
/// constructed from it.
/// Parsimony algorithm based on: https://www.ncbi.nlm.nih.gov/pubmed/14632076 Anal Chem. 2003 Sep 1;75(17):4646-58.
/// </summary>
/// <remarks>
/// Stages, in order:
/// 0. (only for modification-agnostic parsimony) create extra peptide-protein associations so every PSM of a
///    base sequence is linked to every protein that base sequence was matched to;
/// 1. per protease, add proteins that own a peptide sequence matched to exactly one protein;
/// 2. build the sequence/protein lookup tables and drop sequences already explained by stage-1 proteins;
/// 3. greedy selection of the protein explaining the most remaining sequences (ties broken by total sequence count);
/// 4. add back proteins whose sequence set equals that of a parsimonious protein;
/// 5. trim protein matches of PSMs that have at least one parsimonious protein.
/// Side effects: adds to <c>_fdrFilteredPeptides</c> (stage 0) and mutates the protein matches of PSMs
/// (stages 0 and 5). Note that peptide objects with the same sequence can be associated with different proteins.
/// </remarks>
/// <returns>The protein groups built by <c>ConstructProteinGroups</c>; an empty list when no peptides were observed.</returns>
private List <ProteinGroup> RunProteinParsimonyEngine()
{
    // parsimonious list of proteins built by this protein parsimony engine
    HashSet <Protein> parsimoniousProteinList = new HashSet <Protein>();

    // list of peptides that can only be digestion products of one protein in the proteome (considering different protease digestion rules)
    HashSet <PeptideWithSetModifications> uniquePeptides = new HashSet <PeptideWithSetModifications>();

    // if there are no peptides observed, there are no proteins; return an empty list of protein groups
    if (_fdrFilteredPeptides.Count == 0)
    {
        return(new List <ProteinGroup>());
    }

    // Parsimony stage 0: create peptide-protein associations if needed because the user wants a modification-agnostic parsimony
    if (!_treatModPeptidesAsDifferentPeptides)
    {
        foreach (var protease in _fdrFilteredPsms.GroupBy(p => p.DigestionParams.Protease))
        {
            Dictionary <string, List <PeptideSpectralMatch> > sequenceWithPsms = new Dictionary <string, List <PeptideSpectralMatch> >();

            // for each protease, match the base sequence of each peptide to its PSMs
            foreach (PeptideSpectralMatch psm in protease)
            {
                if (sequenceWithPsms.TryGetValue(psm.BaseSequence, out List <PeptideSpectralMatch> peptidesForThisBaseSequence))
                {
                    peptidesForThisBaseSequence.Add(psm);
                }
                else
                {
                    sequenceWithPsms[psm.BaseSequence] = new List <PeptideSpectralMatch> { psm };
                }
            }

            // create new peptide-protein associations
            foreach (var baseSequence in sequenceWithPsms)
            {
                // every distinct (notch, peptide) pair seen across the PSMs sharing this base sequence
                var peptidesWithNotchInfo = baseSequence.Value.SelectMany(p => p.BestMatchingPeptides).Distinct().ToList();

                // if the base seq has >1 PeptideWithSetMods object and has >0 mods, it might need to be matched to new proteins
                if (peptidesWithNotchInfo.Count > 1 && peptidesWithNotchInfo.Any(p => p.Peptide.NumMods > 0))
                {
                    // list of proteins along with start/end residue in protein and the # missed cleavages
                    // this is needed to create new PeptideWithSetModification objects
                    // tuple layout: protein, digestion params, start residue, end residue, missed cleavages, notch
                    var peptideInProteinInfo = new List <Tuple <Protein, DigestionParams, int, int, int, int> >();
                    foreach (var peptide in peptidesWithNotchInfo)
                    {
                        peptideInProteinInfo.Add(new Tuple <Protein, DigestionParams, int, int, int, int>(peptide.Peptide.Protein, peptide.Peptide.DigestionParams, peptide.Peptide.OneBasedStartResidueInProtein, peptide.Peptide.OneBasedEndResidueInProtein, peptide.Peptide.MissedCleavages, peptide.Notch));
                    }

                    // add the protein associations to the PSM
                    foreach (PeptideSpectralMatch psm in baseSequence.Value)
                    {
                        foreach (var proteinInfo in peptideInProteinInfo)
                        {
                            // the new peptide keeps the modifications of the PSM's first best-matching peptide
                            // but takes its location from the recorded protein info
                            var originalPep = psm.BestMatchingPeptides.First().Peptide;
                            var pep = new PeptideWithSetModifications(proteinInfo.Item1, proteinInfo.Item2, proteinInfo.Item3, proteinInfo.Item4, originalPep.CleavageSpecificityForFdrCategory, originalPep.PeptideDescription, proteinInfo.Item5, originalPep.AllModsOneIsNterminus, originalPep.NumFixedMods);
                            _fdrFilteredPeptides.Add(pep);
                            psm.AddProteinMatch((proteinInfo.Item6, pep));
                        }
                    }
                }
            }
        }
    }

    // Parsimony stage 1: add proteins with unique peptides (for each protease)
    var peptidesGroupedByProtease = _fdrFilteredPeptides.GroupBy(p => p.DigestionParams.Protease);
    foreach (var peptidesForThisProtease in peptidesGroupedByProtease)
    {
        Dictionary <string, List <Protein> > peptideSequenceToProteinsForThisProtease = new Dictionary <string, List <Protein> >();
        Dictionary <string, List <PeptideWithSetModifications> > sequenceToPwsm = new Dictionary <string, List <PeptideWithSetModifications> >();

        foreach (PeptideWithSetModifications peptide in peptidesForThisProtease)
        {
            string sequence = peptide.BaseSequence;
            if (_treatModPeptidesAsDifferentPeptides)
            {
                //these and next set to full sequence but might be base sequence. treat modified as unique makes sense to use full
                sequence = peptide.FullSequence;
            }

            if (peptideSequenceToProteinsForThisProtease.TryGetValue(sequence, out List <Protein> proteinsForThisPeptideSequence))
            {
                proteinsForThisPeptideSequence.Add(peptide.Protein);
            }
            else
            {
                peptideSequenceToProteinsForThisProtease.Add(sequence, new List <Protein> { peptide.Protein });
            }

            if (sequenceToPwsm.TryGetValue(sequence, out List <PeptideWithSetModifications> peptidesForThisSequence))
            {
                peptidesForThisSequence.Add(peptide);
            }
            else
            {
                sequenceToPwsm.Add(sequence, new List <PeptideWithSetModifications> { peptide });
            }
        }

        // a sequence counts as unique when exactly one protein entry was recorded for it
        foreach (var uniquePeptide in peptideSequenceToProteinsForThisProtease.Where(p => p.Value.Count == 1))
        {
            // add the protein with the unique peptide to the parsimonious protein list
            Protein proteinWithUniquePeptideSequence = uniquePeptide.Value.First();
            parsimoniousProteinList.Add(proteinWithUniquePeptideSequence);

            // add the unique peptide to the list of unique peptides
            PeptideWithSetModifications uniquePwsm = sequenceToPwsm[uniquePeptide.Key].First();
            uniquePeptides.Add(uniquePwsm);
        }
    }

    // Parsimony stage 2: build the peptide-protein matching structure for the parsimony greedy algorithm
    // and remove all peptides observed by proteins with unique peptides
    Dictionary <ParsimonySequence, List <Protein> > peptideSequenceToProteins = new Dictionary <ParsimonySequence, List <Protein> >();

    // this dictionary associates proteins w/ all peptide sequences (list will NOT shrink over time)
    // this is used in case of greedy algorithm ties to figure out which protein has more total peptides observed
    Dictionary <Protein, HashSet <ParsimonySequence> > proteinToPepSeqMatch = new Dictionary <Protein, HashSet <ParsimonySequence> >();

    foreach (var peptide in _fdrFilteredPeptides)
    {
        ParsimonySequence sequence = new ParsimonySequence(peptide, _treatModPeptidesAsDifferentPeptides);

        if (peptideSequenceToProteins.TryGetValue(sequence, out List <Protein> proteinsForThisPeptideSequence))
        {
            proteinsForThisPeptideSequence.Add(peptide.Protein);
        }
        else
        {
            peptideSequenceToProteins.Add(sequence, new List <Protein> { peptide.Protein });
        }

        if (proteinToPepSeqMatch.TryGetValue(peptide.Protein, out var peptideSequences))
        {
            peptideSequences.Add(sequence);
        }
        else
        {
            proteinToPepSeqMatch.Add(peptide.Protein, new HashSet <ParsimonySequence> { sequence });
        }
    }

    // remove the peptides observed by proteins with unique peptides
    // (collected first, then removed, so the dictionary is not modified while it is enumerated)
    HashSet <ParsimonySequence> toRemove = new HashSet <ParsimonySequence>();
    foreach (var seq in peptideSequenceToProteins)
    {
        bool observedAlready = seq.Value.Any(p => parsimoniousProteinList.Contains(p));
        if (observedAlready)
        {
            toRemove.Add(seq.Key);
        }
    }
    foreach (var sequence in toRemove)
    {
        peptideSequenceToProteins.Remove(sequence);
    }

    if (peptideSequenceToProteins.Any())
    {
        // Parsimony stage 3: greedy algorithm

        // dictionary with proteins as keys and list of associated peptide sequences as the values.
        // this data structure makes parsimony easier because the algorithm can look up a protein's peptides
        // to remove them from the list of available peptides. this list will shrink as the algorithm progresses
        var algDictionary = new Dictionary <Protein, HashSet <string> >();
        // same association, keyed by the full ParsimonySequence instead of its string
        var algDictionaryProtease = new Dictionary <Protein, HashSet <ParsimonySequence> >();
        foreach (var kvp in peptideSequenceToProteins)
        {
            foreach (var protein in kvp.Value)
            {
                if (algDictionaryProtease.TryGetValue(protein, out HashSet <ParsimonySequence> peptideSequencesWithProtease))
                {
                    peptideSequencesWithProtease.Add(kvp.Key);
                }
                else
                {
                    algDictionaryProtease.Add(protein, new HashSet <ParsimonySequence> { kvp.Key });
                }

                if (algDictionary.TryGetValue(protein, out HashSet <string> peptideSequences))
                {
                    peptideSequences.Add(kvp.Key.Sequence);
                }
                else
                {
                    algDictionary.Add(protein, new HashSet <string> { kvp.Key.Sequence });
                }
            }
        }

        // *** greedy algorithm loop
        // algDictionary is non-empty here because peptideSequenceToProteins has at least one entry
        int numNewSeqs = algDictionary.Max(p => p.Value.Count);
        while (numNewSeqs != 0)
        {
            // gets list of proteins with the most unaccounted-for peptide sequences
            var possibleBestProteinList = algDictionary.Where(p => p.Value.Count == numNewSeqs).ToList();

            Protein bestProtein = possibleBestProteinList.First().Key;

            // may need to select different protein in case of a greedy algorithm tie
            // the protein with the most total peptide sequences wins in this case (doesn't matter if parsimony has grabbed them or not)
            if (possibleBestProteinList.Count > 1)
            {
                int highestNumTotalPep = proteinToPepSeqMatch[bestProtein].Count;
                foreach (var kvp in possibleBestProteinList)
                {
                    if (proteinToPepSeqMatch[kvp.Key].Count > highestNumTotalPep)
                    {
                        highestNumTotalPep = proteinToPepSeqMatch[kvp.Key].Count;
                        bestProtein = kvp.Key;
                    }
                }
            }

            parsimoniousProteinList.Add(bestProtein);

            // remove observed peptide seqs
            // (iterate over a copy because the underlying set is modified inside the loop)
            List <ParsimonySequence> temp = algDictionaryProtease[bestProtein].ToList();
            foreach (ParsimonySequence peptideSequence in temp)
            {
                List <Protein> proteinsWithThisPeptide = peptideSequenceToProteins[peptideSequence];

                foreach (var protein in proteinsWithThisPeptide)
                {
                    algDictionary[protein].Remove(peptideSequence.Sequence);
                    algDictionaryProtease[protein].Remove(peptideSequence);
                }
            }

            algDictionary.Remove(bestProtein);
            algDictionaryProtease.Remove(bestProtein);
            numNewSeqs = algDictionary.Any() ? algDictionary.Max(p => p.Value.Count) : 0;
        }
        // *** done with greedy algorithm

        // Parsimony stage 4: add back indistinguishable proteins (proteins that have identical peptide sets as parsimonious proteins)
        // grouping by set size first means SetEquals is only attempted between sets of equal size
        var allProteinsGroupedByNumPeptides = proteinToPepSeqMatch.GroupBy(p => p.Value.Count);
        var parsimonyProteinsGroupedByNumPeptides = parsimoniousProteinList.GroupBy(p => proteinToPepSeqMatch[p].Count);
        var indistinguishableProteins = new ConcurrentBag <Protein>();

        foreach (var group in allProteinsGroupedByNumPeptides)
        {
            var parsimonyProteinsWithSameNumPeptides = parsimonyProteinsGroupedByNumPeptides.FirstOrDefault(p => p.Key == group.Key);
            var list = group.ToList();

            if (parsimonyProteinsWithSameNumPeptides != null)
            {
                Parallel.ForEach(Partitioner.Create(0, list.Count),
                    new ParallelOptions { MaxDegreeOfParallelism = CommonParameters.MaxThreadsToUsePerFile },
                    (range, loopState) =>
                    {
                        for (int i = range.Item1; i < range.Item2; i++)
                        {
                            Protein otherProtein = list[i].Key;

                            foreach (var parsimonyProtein in parsimonyProteinsWithSameNumPeptides)
                            {
                                // if the two proteins have the same set of peptide sequences, they're indistinguishable
                                if (parsimonyProtein != otherProtein && proteinToPepSeqMatch[parsimonyProtein].SetEquals(proteinToPepSeqMatch[otherProtein]))
                                {
                                    indistinguishableProteins.Add(otherProtein);
                                }
                            }
                        }
                    }
                );
            }
        }

        // the parsimonious list is only extended after the parallel comparison has finished
        foreach (Protein protein in indistinguishableProteins)
        {
            parsimoniousProteinList.Add(protein);
        }
    }

    // Parsimony stage 5: remove peptide objects that do not have proteins in the parsimonious list
    foreach (PeptideSpectralMatch psm in _allPsms)
    {
        // if this PSM has a protein in the parsimonious list, it removes the proteins NOT in the parsimonious list
        // otherwise, no proteins are removed (i.e., for PSMs that cannot be explained by a parsimonious protein,
        // no protein associations are removed)
        if (psm.BestMatchingPeptides.Any(p => parsimoniousProteinList.Contains(p.Peptide.Protein)))
        {
            psm.TrimProteinMatches(parsimoniousProteinList);
        }
    }

    // construct protein groups
    List <ProteinGroup> proteinGroups = ConstructProteinGroups(uniquePeptides);

    // finished with parsimony
    return(proteinGroups);
}
/// <summary>
/// Applies protein parsimony to <c>compactPeptideToProteinPeptideMatching</c> and returns the resulting
/// protein groups.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. for modification-agnostic parsimony, add peptide-protein associations so every peptide of a base
///    sequence is linked to every protein that base sequence was matched to;
/// 2. record proteins that own unique peptides, and drop target associations of peptides that also
///    match a decoy or a contaminant;
/// 3. greedy selection: proteins with unique peptides first, then the protein explaining the most
///    remaining sequences, with ties among equivalent candidates broken by total sequence count;
/// 4. add proteins whose peptide set equals that of a selected protein;
/// 5. remove peptide associations to proteins that were not selected.
/// Side effect: the hash sets stored in <c>compactPeptideToProteinPeptideMatching</c> are modified in place.
/// </remarks>
/// <returns>
/// The protein groups built by <c>ConstructProteinGroups</c>; an empty list when there are no
/// peptide-protein associations at all.
/// </returns>
private List<ProteinGroup> ApplyProteinParsimony()
{
    if (!compactPeptideToProteinPeptideMatching.Values.Any())
    {
        return new List<ProteinGroup>();
    }

    // digesting an XML database results in a non-mod-agnostic digestion; need to fix this if mod-agnostic parsimony enabled
    if (!treatModPeptidesAsDifferentPeptides)
    {
        // base sequence -> every peptide object observed with that base sequence
        var baseSeqToProteinMatch = new Dictionary<string, HashSet<PeptideWithSetModifications>>();
        foreach (var peptide in compactPeptideToProteinPeptideMatching.SelectMany(b => b.Value))
        {
            if (baseSeqToProteinMatch.TryGetValue(peptide.BaseSequence, out HashSet<PeptideWithSetModifications> value))
            {
                value.Add(peptide);
            }
            else
            {
                baseSeqToProteinMatch[peptide.BaseSequence] = new HashSet<PeptideWithSetModifications> { peptide };
            }
        }

        // reverse lookup: peptide object -> compact peptides it is currently associated with
        var pwsmToCompactPeptides = new Dictionary<PeptideWithSetModifications, List<CompactPeptideBase>>();
        foreach (var pep in compactPeptideToProteinPeptideMatching)
        {
            foreach (var pepWithSetMods in pep.Value)
            {
                if (pwsmToCompactPeptides.TryGetValue(pepWithSetMods, out List<CompactPeptideBase> list))
                {
                    list.Add(pep.Key);
                }
                else
                {
                    pwsmToCompactPeptides.Add(pepWithSetMods, new List<CompactPeptideBase> { pep.Key });
                }
            }
        }

        foreach (var baseSequence in baseSeqToProteinMatch)
        {
            if (baseSequence.Value.Count > 1 && baseSequence.Value.Any(p => p.NumMods > 0))
            {
                // list of proteins along with start/end residue in protein and the # missed cleavages
                var peptideInProteinInfo = new List<Tuple<Protein, int, int, int>>();
                foreach (var peptide in baseSequence.Value)
                {
                    peptideInProteinInfo.Add(new Tuple<Protein, int, int, int>(peptide.Protein, peptide.OneBasedStartResidueInProtein, peptide.OneBasedEndResidueInProtein, (int)peptide.MissedCleavages));
                }

                // give every peptide of this base sequence an association with every recorded protein location,
                // keeping the peptide's own modifications
                foreach (var peptide in baseSequence.Value)
                {
                    foreach (var proteinInfo in peptideInProteinInfo)
                    {
                        var pep = new PeptideWithSetModifications(proteinInfo.Item1, proteinInfo.Item2, proteinInfo.Item3, peptide.PeptideDescription, proteinInfo.Item4, peptide.allModsOneIsNterminus, peptide.numFixedMods);
                        foreach (var compactPeptide in pwsmToCompactPeptides[peptide])
                        {
                            compactPeptideToProteinPeptideMatching[compactPeptide].Add(pep);
                        }
                    }
                }
            }
        }
    }

    var proteinToPeptidesMatching = new Dictionary<Protein, HashSet<CompactPeptideBase>>();
    var parsimonyProteinList = new Dictionary<Protein, HashSet<CompactPeptideBase>>();
    var proteinsWithUniquePeptides = new Dictionary<Protein, HashSet<PeptideWithSetModifications>>();

    // peptide matched to fullseq (used depending on user preference)
    var compactPeptideToFullSeqMatch = compactPeptideToProteinPeptideMatching.ToDictionary(x => x.Key, x => x.Value.First().Sequence);

    foreach (var kvp in compactPeptideToProteinPeptideMatching)
    {
        // finds unique peptides (peptides that can belong to only one protein)
        HashSet<Protein> proteinsAssociatedWithThisPeptide = new HashSet<Protein>(kvp.Value.Select(p => p.Protein));
        if (proteinsAssociatedWithThisPeptide.Count == 1)
        {
            if (!proteinsWithUniquePeptides.TryGetValue(kvp.Value.First().Protein, out HashSet<PeptideWithSetModifications> peptides))
            {
                proteinsWithUniquePeptides.Add(kvp.Value.First().Protein, new HashSet<PeptideWithSetModifications>(kvp.Value));
            }
            else
            {
                peptides.UnionWith(kvp.Value);
            }
        }

        // if a peptide is associated with a decoy protein, remove all target protein associations with the peptide
        if (kvp.Value.Any(p => p.Protein.IsDecoy))
        {
            kvp.Value.RemoveWhere(p => !p.Protein.IsDecoy);
        }

        // if a peptide is associated with a contaminant protein, remove all target protein associations with the peptide
        if (kvp.Value.Any(p => p.Protein.IsContaminant))
        {
            kvp.Value.RemoveWhere(p => !p.Protein.IsContaminant);
        }
    }

    // makes dictionary with proteins as keys and list of associated peptides as the value (makes parsimony algo easier)
    foreach (var kvp in compactPeptideToProteinPeptideMatching)
    {
        foreach (var peptide in kvp.Value)
        {
            if (!proteinToPeptidesMatching.TryGetValue(peptide.Protein, out HashSet<CompactPeptideBase> peptides))
            {
                proteinToPeptidesMatching.Add(peptide.Protein, new HashSet<CompactPeptideBase>() { kvp.Key });
            }
            else
            {
                peptides.Add(kvp.Key);
            }
        }
    }

    // build protein list for each peptide before parsimony has been applied
    var peptideSeqProteinListMatch = new Dictionary<string, HashSet<Protein>>();
    foreach (var kvp in proteinToPeptidesMatching)
    {
        foreach (var peptide in kvp.Value)
        {
            // string key identifying the peptide: concatenated fragment masses plus total mass,
            // or the stored full sequence, depending on the user preference
            string pepSequence;
            if (!treatModPeptidesAsDifferentPeptides)
            {
                pepSequence = string.Join("", peptide.NTerminalMasses.Select(b => b.ToString(CultureInfo.InvariantCulture)))
                    + string.Join("", peptide.CTerminalMasses.Select(b => b.ToString(CultureInfo.InvariantCulture)))
                    + peptide.MonoisotopicMassIncludingFixedMods.ToString(CultureInfo.InvariantCulture);
            }
            else
            {
                pepSequence = compactPeptideToFullSeqMatch[peptide];
            }

            if (!peptideSeqProteinListMatch.TryGetValue(pepSequence, out HashSet<Protein> proteinListHere))
            {
                peptideSeqProteinListMatch.Add(pepSequence, new HashSet<Protein>() { kvp.Key });
            }
            else
            {
                proteinListHere.Add(kvp.Key);
            }
        }
    }

    // dictionary associates proteins w/ unused base seqs (list will shrink over time)
    var algDictionary = new Dictionary<Protein, HashSet<string>>();
    foreach (var kvp in peptideSeqProteinListMatch)
    {
        foreach (var protein in kvp.Value)
        {
            if (algDictionary.TryGetValue(protein, out HashSet<string> newPeptideBaseSeqs))
            {
                newPeptideBaseSeqs.Add(kvp.Key);
            }
            else
            {
                algDictionary.Add(protein, new HashSet<string> { kvp.Key });
            }
        }
    }

    // dictionary associates proteins w/ ALL their seqs (list will NOT shrink over time).
    // Each set must be copied: sharing the HashSet instances with algDictionary would let the greedy
    // loop below shrink these sets too, turning the total-count tie-break into a no-op.
    var proteinToPepSeqMatch = algDictionary.ToDictionary(x => x.Key, x => new HashSet<string>(x.Value));

    // *** main parsimony loop
    bool uniquePeptidesLeft = proteinsWithUniquePeptides.Any();

    int numNewSeqs = algDictionary.Max(p => p.Value.Count);

    while (numNewSeqs != 0)
    {
        var possibleBestProteinList = new List<KeyValuePair<Protein, HashSet<string>>>();

        // proteins with unique peptides are always taken first, one per iteration
        if (uniquePeptidesLeft)
        {
            var proteinsWithUniquePeptidesLeft = algDictionary.Where(p => proteinsWithUniquePeptides.ContainsKey(p.Key));
            if (proteinsWithUniquePeptidesLeft.Any())
            {
                possibleBestProteinList.Add(proteinsWithUniquePeptidesLeft.First());
            }
            else
            {
                uniquePeptidesLeft = false;
            }
        }

        // gets list of proteins with the most unaccounted-for peptide base sequences
        if (!uniquePeptidesLeft)
        {
            possibleBestProteinList = algDictionary.Where(p => p.Value.Count == numNewSeqs).ToList();
        }

        Protein bestProtein = possibleBestProteinList.First().Key;

        // snapshot of the sequences the first candidate would newly explain
        HashSet<string> newSeqs = new HashSet<string>(algDictionary[bestProtein]);

        // may need to select different protein
        if (possibleBestProteinList.Count > 1)
        {
            // candidates that would explain (at least) the same new sequences
            var proteinsWithTheseBaseSeqs = new HashSet<Protein>();
            foreach (var kvp in possibleBestProteinList)
            {
                if (newSeqs.IsSubsetOf(kvp.Value))
                {
                    proteinsWithTheseBaseSeqs.Add(kvp.Key);
                }
            }

            // among those, prefer the protein with the most sequences overall
            if (proteinsWithTheseBaseSeqs.Count > 1)
            {
                var proteinsOrderedByTotalPeptideCount = new Dictionary<Protein, HashSet<string>>();
                foreach (var protein in proteinsWithTheseBaseSeqs)
                {
                    proteinsOrderedByTotalPeptideCount.Add(protein, proteinToPepSeqMatch[protein]);
                }
                bestProtein = proteinsOrderedByTotalPeptideCount.OrderByDescending(kvp => kvp.Value.Count).First().Key;
            }
        }

        parsimonyProteinList.Add(bestProtein, proteinToPeptidesMatching[bestProtein]);

        // remove used peptides from their proteins
        foreach (var newBaseSeq in newSeqs)
        {
            HashSet<Protein> proteinsWithThisPeptide = peptideSeqProteinListMatch[newBaseSeq];
            foreach (var protein in proteinsWithThisPeptide)
            {
                algDictionary[protein].Remove(newBaseSeq);
            }
        }

        algDictionary.Remove(bestProtein);
        if (algDictionary.Any())
        {
            numNewSeqs = algDictionary.Max(p => p.Value.Count);
        }
        else
        {
            numNewSeqs = 0;
        }
    }
    // *** done with parsimony

    // add indistinguishable proteins
    // grouping by set size first means SetEquals is only attempted between sets of equal size
    var proteinsGroupedByNumPeptides = proteinToPeptidesMatching.GroupBy(p => p.Value.Count);
    var parsimonyProteinsGroupedByNumPeptides = parsimonyProteinList.GroupBy(p => p.Value.Count);
    var indistinguishableProteins = new ConcurrentDictionary<Protein, HashSet<CompactPeptideBase>>();

    foreach (var group in proteinsGroupedByNumPeptides)
    {
        var parsimonyProteinsWithSameNumPeptides = parsimonyProteinsGroupedByNumPeptides.FirstOrDefault(p => p.Key == group.Key);
        var list = group.ToList();
        if (parsimonyProteinsWithSameNumPeptides != null)
        {
            Parallel.ForEach(Partitioner.Create(0, list.Count),
                new ParallelOptions { MaxDegreeOfParallelism = -1 },
                (range, loopState) =>
                {
                    for (int i = range.Item1; i < range.Item2; i++)
                    {
                        foreach (var parsimonyProteinWithThisNumPeptides in parsimonyProteinsWithSameNumPeptides)
                        {
                            if (parsimonyProteinWithThisNumPeptides.Key != list[i].Key)
                            {
                                if (proteinToPeptidesMatching[parsimonyProteinWithThisNumPeptides.Key].SetEquals(proteinToPeptidesMatching[list[i].Key]))
                                {
                                    indistinguishableProteins.GetOrAdd(list[i].Key, proteinToPeptidesMatching[list[i].Key]);
                                }
                            }
                        }
                    }
                }
            );
        }
    }

    // the parsimony list is only extended after the parallel comparison has finished
    foreach (var protein in indistinguishableProteins)
    {
        parsimonyProteinList.Add(protein.Key, protein.Value);
    }

    // drop every peptide association whose protein did not make it into the parsimony list
    foreach (var kvp in compactPeptideToProteinPeptideMatching)
    {
        kvp.Value.RemoveWhere(p => !parsimonyProteinList.ContainsKey(p.Protein));
    }

    Status("Finished Parsimony");
    return ConstructProteinGroups(
        new HashSet<PeptideWithSetModifications>(proteinsWithUniquePeptides.Values.SelectMany(p => p)),
        new HashSet<PeptideWithSetModifications>(compactPeptideToProteinPeptideMatching.Values.SelectMany(p => p)));
}
/// <summary>
/// Builds a new <see cref="PeptideWithSetModifications"/> that mirrors <paramref name="pwsm"/> in every
/// digestion detail but is attached to the protein returned by <c>CreateSilacProtein</c>.
/// </summary>
/// <param name="heavyToLight">Forwarded unchanged to <c>CreateSilacProtein</c>.</param>
/// <param name="silacLabel">Forwarded unchanged to <c>CreateSilacProtein</c>.</param>
/// <param name="pwsm">Peptide whose digestion parameters, residue positions, cleavage specificity,
/// description, missed cleavages and modifications are copied into the result.</param>
/// <returns>A new peptide object; <paramref name="pwsm"/> itself is not modified here.</returns>
public static PeptideWithSetModifications CreateSilacPwsm(bool heavyToLight, SilacLabel silacLabel, PeptideWithSetModifications pwsm) =>
    new PeptideWithSetModifications(
        // only the parent protein differs from the source peptide
        CreateSilacProtein(heavyToLight, silacLabel, pwsm.Protein),
        pwsm.DigestionParams,
        pwsm.OneBasedStartResidueInProtein,
        pwsm.OneBasedEndResidueInProtein,
        pwsm.CleavageSpecificityForFdrCategory,
        pwsm.PeptideDescription,
        pwsm.MissedCleavages,
        pwsm.AllModsOneIsNterminus,
        pwsm.NumFixedMods);
/// <summary>
/// Computes the feature record for a single PSM, stores it in
/// <c>psm.PsmData_forPEPandPercolator</c>, and returns it.
/// A feature is only computed when its name appears in <paramref name="trainingVariables"/>;
/// otherwise it keeps its default (0, 1 for the PSM count, NaN for the hydrophobicity z-score).
/// </summary>
/// <param name="psm">The spectral match being described; its <c>PsmData_forPEPandPercolator</c> is overwritten.</param>
/// <param name="sequenceToPsmCount">PSM counts keyed by the "|"-joined full sequences of the PSM's best matching peptides.</param>
/// <param name="timeDependantHydrophobicityAverageAndDeviation_unmodified">Lookup used when the selected peptide's base and full sequences are equal.</param>
/// <param name="timeDependantHydrophobicityAverageAndDeviation_modified">Lookup used when the selected peptide's base and full sequences differ.</param>
/// <param name="chargeStateMode">Charge state the PSM's precursor charge is compared against.</param>
/// <param name="selectedPeptide">Peptide used for the modification, missed-cleavage, ion-series, hydrophobicity and variant features.</param>
/// <param name="trainingVariables">Names of the features to compute.</param>
/// <param name="notchToUse">Value reported as the notch when "Notch" is requested.</param>
/// <param name="label">Ignored: the stored label is always derived from <c>psm.IsDecoy</c>.</param>
/// <returns>The same <see cref="PsmData"/> instance that was stored on <paramref name="psm"/>.</returns>
public static PsmData CreateOnePsmDataEntry(PeptideSpectralMatch psm, Dictionary<string, int> sequenceToPsmCount, Dictionary<string, Dictionary<int, Tuple<double, double>>> timeDependantHydrophobicityAverageAndDeviation_unmodified, Dictionary<string, Dictionary<int, Tuple<double, double>>> timeDependantHydrophobicityAverageAndDeviation_modified, int chargeStateMode, PeptideWithSetModifications selectedPeptide, string[] trainingVariables, int notchToUse, bool label)
{
    // true when the caller asked for the named feature
    bool Wants(string featureName) => trainingVariables.Contains(featureName);

    // integer part of the score
    float totalMatchingFragmentCount = Wants("TotalMatchingFragmentCount")
        ? (float)Math.Floor(psm.Score)
        : 0;

    // number of additional candidate peptides, capped at 10
    float ambiguity = Wants("Ambiguity")
        ? Math.Min((float)(psm.PeptidesToMatchingFragments.Keys.Count - 1), 10)
        : 0;

    // fractional part of the score
    float intensity = Wants("Intensity")
        ? (float)(psm.Score - (int)psm.Score)
        : 0;

    // never positive: negated distance from the charge-state mode
    float chargeDifference = Wants("PrecursorChargeDiffToMode")
        ? -Math.Abs(chargeStateMode - psm.ScanPrecursorCharge)
        : 0;

    float deltaScore = Wants("DeltaScore")
        ? (float)psm.DeltaScore
        : 0;

    float psmCount = 1;
    if (Wants("PsmCount"))
    {
        string sequenceKey = String.Join("|", psm.BestMatchingPeptides.Select(p => p.Peptide.FullSequence));
        float observedCount = sequenceToPsmCount[sequenceKey];

        // Counts are snapped to the nearest bucket for stability: low counts give good statistics,
        // but the few peptides with very high counts may be targets or decoys and would otherwise
        // make the values swing between extremes. On a tie the smaller bucket wins (stable sort).
        int[] countBuckets = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 75, 100, 200, 300, 400, 500 };
        psmCount = countBuckets.OrderBy(bucket => Math.Abs(observedCount - bucket)).First();
    }

    int notch = Wants("Notch") ? notchToUse : 0;

    // number of modified positions, capped at 10
    float modCount = Wants("ModsCount")
        ? Math.Min((float)selectedPeptide.AllModsOneIsNterminus.Keys.Count(), 10)
        : 0;

    float missedCleavages = Wants("MissedCleavagesCount")
        ? selectedPeptide.MissedCleavages
        : 0;

    float longestSeq = Wants("LongestFragmentIonSeries")
        ? psm.GetLongestIonSeriesBidirectional(selectedPeptide)
        : 0;

    // a peptide whose full sequence equals its base sequence is scored against the "unmodified" lookup
    float hydrophobicityZscore = float.NaN;
    bool sequenceIsUnmodified = selectedPeptide.BaseSequence.Equals(selectedPeptide.FullSequence);
    if (Wants("HydrophobicityZScore"))
    {
        var hydrophobicityLookup = sequenceIsUnmodified
            ? timeDependantHydrophobicityAverageAndDeviation_unmodified
            : timeDependantHydrophobicityAverageAndDeviation_modified;
        hydrophobicityZscore = GetSSRCalcHydrophobicityZScore(psm, selectedPeptide, hydrophobicityLookup);
    }

    bool isVariantPeptide = PeptideIsVariant(selectedPeptide);

    // the label always reflects target/decoy status, whatever the caller passed in
    bool isTarget = !psm.IsDecoy;

    psm.PsmData_forPEPandPercolator = new PsmData
    {
        TotalMatchingFragmentCount = totalMatchingFragmentCount,
        Intensity = intensity,
        PrecursorChargeDiffToMode = chargeDifference,
        DeltaScore = deltaScore,
        Notch = notch,
        PsmCount = psmCount,
        ModsCount = modCount,
        MissedCleavagesCount = missedCleavages,
        Ambiguity = ambiguity,
        LongestFragmentIonSeries = longestSeq,
        HydrophobicityZScore = hydrophobicityZscore,
        IsVariantPeptide = Convert.ToSingle(isVariantPeptide),
        Label = isTarget
    };

    return psm.PsmData_forPEPandPercolator;
}
/// <summary>
/// Returns how many standard deviations the SSRCalc3-predicted hydrophobicity of
/// <paramref name="Peptide"/> lies from the average stored for the PSM's file and retention-time bin.
/// </summary>
/// <param name="psm">Supplies the file path and scan retention time used to find the reference values.</param>
/// <param name="Peptide">Peptide whose hydrophobicity is predicted.</param>
/// <param name="d">Outer key: full file path. Inner key: retention time rounded to the nearest even
/// integer. Value: <c>Item1</c> is the value the prediction is compared against and <c>Item2</c> the
/// divisor (callers in this file pass "average and deviation" lookups).</param>
/// <returns>
/// The absolute z-score, capped at 10. The cap is also returned when no reference entry exists for the
/// file or time bin, or when the computed value is NaN or infinite (e.g. a zero deviation).
/// </returns>
private static float GetSSRCalcHydrophobicityZScore(PeptideSpectralMatch psm, PeptideWithSetModifications Peptide, Dictionary<string, Dictionary<int, Tuple<double, double>>> d)
{
    // each "Z" is one standard deviation, so a cap of 10 is quite large
    const double maxHydrophobicityZscore = 10;

    double hydrophobicityZscore = double.NaN;

    if (d.TryGetValue(psm.FullFilePath, out Dictionary<int, Tuple<double, double>> valuesByTime))
    {
        // retention times are binned to the nearest even integer
        int time = (int)(2 * Math.Round(psm.ScanRetentionTime / 2d, 0));

        if (valuesByTime.TryGetValue(time, out Tuple<double, double> averageAndDeviation))
        {
            // Using SSRCalc3 but probably any number of different calculators could be used instead.
            // One could also use the CE mobility. The calculator is only built once we know there is
            // a reference value to compare against.
            SSRCalc3 calc = new SSRCalc3("SSRCalc 3.0 (300A)", SSRCalc3.Column.A300);
            double predictedHydrophobicity = calc.ScoreSequence(Peptide);

            hydrophobicityZscore = Math.Abs(averageAndDeviation.Item1 - predictedHydrophobicity) / averageAndDeviation.Item2;
        }
    }

    if (double.IsNaN(hydrophobicityZscore) || double.IsInfinity(hydrophobicityZscore) || hydrophobicityZscore > maxHydrophobicityZscore)
    {
        hydrophobicityZscore = maxHydrophobicityZscore;
    }

    return (float)hydrophobicityZscore;
}
/// <summary>
/// Reduces the peptide-to-protein associations in <c>CompactPeptideToProteinPeptideMatching</c> to a
/// parsimonious protein list and builds protein groups from what remains.
/// Steps: (0) optionally add associations so modified and unmodified forms of a base sequence map to the
/// same proteins; (1) record proteins that own unique peptides and drop target associations of peptides
/// that also match decoys/contaminants; (2) greedily pick proteins, unique-peptide proteins first, then
/// those explaining the most still-unexplained sequences; (3) add proteins indistinguishable from a chosen
/// protein; (4) with multiple proteases, add back proteins that own unique peptides; (5) prune
/// associations to proteins that were not kept.
/// </summary>
/// <returns>
/// The result of <c>ConstructProteinGroups</c> for the unique peptides and all remaining peptides,
/// or an empty list when there are no peptide associations.
/// </returns>
private List<ProteinGroup> ApplyProteinParsimony()
{
    // if dictionary is empty return an empty list of protein groups
    if (!CompactPeptideToProteinPeptideMatching.Values.Any())
    {
        return new List<ProteinGroup>();
    }

    // digesting an XML database results in a non-mod-agnostic digestion; need to fix this if mod-agnostic parsimony enabled
    if (!TreatModPeptidesAsDifferentPeptides) // user wants modified and unmodified peptides treated the same
    {
        // key: base sequence; value: all PeptideWithSetModifications sharing that base sequence
        // (the protein each one came from is reachable through the PeptideWithSetModifications object)
        var baseSeqToProteinMatch = new Dictionary<string, HashSet<PeptideWithSetModifications>>();
        foreach (var peptide in CompactPeptideToProteinPeptideMatching.SelectMany(b => b.Value))
        {
            if (baseSeqToProteinMatch.TryGetValue(peptide.BaseSequence, out HashSet<PeptideWithSetModifications> value))
            {
                value.Add(peptide);
            }
            else
            {
                baseSeqToProteinMatch[peptide.BaseSequence] = new HashSet<PeptideWithSetModifications> { peptide };
            }
        }

        // reverse lookup: PeptideWithSetModifications -> compact peptides it is listed under
        var pwsmToCompactPeptides = new Dictionary<PeptideWithSetModifications, List<CompactPeptideBase>>();
        foreach (var pep in CompactPeptideToProteinPeptideMatching)
        {
            foreach (var pepWithSetMods in pep.Value)
            {
                if (pwsmToCompactPeptides.TryGetValue(pepWithSetMods, out List<CompactPeptideBase> list))
                {
                    list.Add(pep.Key);
                }
                else
                {
                    pwsmToCompactPeptides.Add(pepWithSetMods, new List<CompactPeptideBase> { pep.Key });
                }
            }
        }

        foreach (var baseSequence in baseSeqToProteinMatch)
        {
            if (baseSequence.Value.Count > 1 && baseSequence.Value.Any(p => p.NumMods > 0))
            {
                // list of proteins along with start/end residue in protein and the # missed cleavages
                var peptideInProteinInfo = new List<Tuple<Protein, DigestionParams, int, int, int>>();
                foreach (var peptide in baseSequence.Value)
                {
                    peptideInProteinInfo.Add(new Tuple<Protein, DigestionParams, int, int, int>(peptide.Protein, peptide.DigestionParams, peptide.OneBasedStartResidueInProtein, peptide.OneBasedEndResidueInProtein, (int)peptide.MissedCleavages));
                }

                // give every form of this base sequence an association with every protein location
                foreach (var peptide in baseSequence.Value)
                {
                    foreach (var proteinInfo in peptideInProteinInfo)
                    {
                        var pep = new PeptideWithSetModifications(proteinInfo.Item1, proteinInfo.Item2, proteinInfo.Item3, proteinInfo.Item4, peptide.PeptideDescription, proteinInfo.Item5, peptide.AllModsOneIsNterminus, peptide.NumFixedMods);
                        foreach (var compactPeptide in pwsmToCompactPeptides[peptide])
                        {
                            CompactPeptideToProteinPeptideMatching[compactPeptide].Add(pep);
                        }
                    }
                }
            }
        }
    }

    var proteinToPeptidesMatching = new Dictionary<Protein, HashSet<CompactPeptideBase>>();
    var parsimonyProteinList = new Dictionary<Protein, HashSet<CompactPeptideBase>>();
    var proteinsWithUniquePeptides = new Dictionary<Protein, HashSet<PeptideWithSetModifications>>();

    // peptide matched to fullseq (used depending on user preference)
    var compactPeptideToFullSeqMatch = CompactPeptideToProteinPeptideMatching.ToDictionary(x => x.Key, x => x.Value.First().Sequence);

    foreach (var kvp in CompactPeptideToProteinPeptideMatching)
    {
        HashSet<Protein> proteinsAssociatedWithThisPeptide = new HashSet<Protein>(kvp.Value.Select(p => p.Protein));
        if (proteinsAssociatedWithThisPeptide.Count == 1)
        {
            if (!proteinsWithUniquePeptides.TryGetValue(kvp.Value.First().Protein, out HashSet<PeptideWithSetModifications> peptides))
            {
                proteinsWithUniquePeptides.Add(kvp.Value.First().Protein, new HashSet<PeptideWithSetModifications>(kvp.Value));
            }
            else
            {
                peptides.UnionWith(kvp.Value);
            }
        }
        // multiprotease parsimony is "weird" because a peptide sequence can be shared between
        // two proteins but technically be a "unique" peptide because it is unique in that protease digestion
        // this code marks these types of peptides as unique
        else
        {
            foreach (var peptide in kvp.Value)
            {
                Protease protease = peptide.DigestionParams.Protease;
                int sameProteaseCount = kvp.Value.Count(v => v.DigestionParams.Protease == protease);
                if (sameProteaseCount == 1)
                {
                    if (!proteinsWithUniquePeptides.TryGetValue(peptide.Protein, out HashSet<PeptideWithSetModifications> peps))
                    {
                        proteinsWithUniquePeptides.Add(peptide.Protein, new HashSet<PeptideWithSetModifications> { peptide });
                    }
                    else
                    {
                        // Only this peptide is unique to the protein. Adding all of kvp.Value here would
                        // mark other proteins' (possibly shared) peptides as unique, and would do so only
                        // when the protein happened to already have an entry.
                        peps.Add(peptide);
                    }
                }
            }
        }

        // if a peptide is associated with a decoy protein, remove all target protein associations with the peptide
        if (kvp.Value.Any(p => p.Protein.IsDecoy))
        {
            kvp.Value.RemoveWhere(p => !p.Protein.IsDecoy);
        }

        // if a peptide is associated with a contaminant protein, remove all target protein associations with the peptide
        if (kvp.Value.Any(p => p.Protein.IsContaminant))
        {
            kvp.Value.RemoveWhere(p => !p.Protein.IsContaminant);
        }
    }

    // makes dictionary with proteins as keys and list of associated peptides as the value (makes parsimony algo easier)
    foreach (var kvp in CompactPeptideToProteinPeptideMatching)
    {
        foreach (var peptide in kvp.Value)
        {
            if (!proteinToPeptidesMatching.TryGetValue(peptide.Protein, out HashSet<CompactPeptideBase> peptides))
            {
                proteinToPeptidesMatching.Add(peptide.Protein, new HashSet<CompactPeptideBase>() { kvp.Key });
            }
            else
            {
                peptides.Add(kvp.Key);
            }
        }
    }

    // build protein list for each peptide before parsimony has been applied
    var peptideSeqProteinListMatch = new Dictionary<string, HashSet<Protein>>();
    foreach (var kvp in proteinToPeptidesMatching)
    {
        foreach (var peptide in kvp.Value)
        {
            string pepSequence;
            if (!TreatModPeptidesAsDifferentPeptides)
            {
                // mod-agnostic key: concatenated terminal masses plus the monoisotopic mass
                string nTerminalMasses = peptide.NTerminalMasses == null ? "" : string.Join("", peptide.NTerminalMasses.Select(b => b.ToString(CultureInfo.InvariantCulture)));
                string cTerminalMasses = peptide.CTerminalMasses == null ? "" : string.Join("", peptide.CTerminalMasses.Select(b => b.ToString(CultureInfo.InvariantCulture)));
                pepSequence = nTerminalMasses + cTerminalMasses + peptide.MonoisotopicMassIncludingFixedMods.ToString(CultureInfo.InvariantCulture);
            }
            else
            {
                pepSequence = compactPeptideToFullSeqMatch[peptide];
            }

            if (!peptideSeqProteinListMatch.TryGetValue(pepSequence, out HashSet<Protein> proteinListHere))
            {
                peptideSeqProteinListMatch.Add(pepSequence, new HashSet<Protein>() { kvp.Key });
            }
            else
            {
                proteinListHere.Add(kvp.Key);
            }
        }
    }

    // dictionary associates proteins w/ unused base seqs (list will shrink over time)
    var algDictionary = new Dictionary<Protein, HashSet<string>>();
    foreach (var kvp in peptideSeqProteinListMatch)
    {
        foreach (var protein in kvp.Value)
        {
            if (algDictionary.TryGetValue(protein, out HashSet<string> newPeptideBaseSeqs))
            {
                newPeptideBaseSeqs.Add(kvp.Key);
            }
            else
            {
                algDictionary.Add(protein, new HashSet<string> { kvp.Key });
            }
        }
    }

    // dictionary associates proteins w/ all of their base seqs (list will NOT shrink over time).
    // Each set is copied: sharing the HashSet instances with algDictionary would make this snapshot
    // shrink too, turning the "total peptide count" tie-break below into a no-op.
    var proteinToPepSeqMatch = algDictionary.ToDictionary(x => x.Key, x => new HashSet<string>(x.Value));

    // *** main parsimony loop
    bool uniquePeptidesLeft = proteinsWithUniquePeptides.Any();
    int numNewSeqs = algDictionary.Max(p => p.Value.Count);

    while (numNewSeqs != 0)
    {
        var possibleBestProteinList = new List<KeyValuePair<Protein, HashSet<string>>>();

        // proteins with unique peptides are always taken first, one per iteration
        if (uniquePeptidesLeft)
        {
            var proteinsWithUniquePeptidesLeft = algDictionary.Where(p => proteinsWithUniquePeptides.ContainsKey(p.Key));
            if (proteinsWithUniquePeptidesLeft.Any())
            {
                possibleBestProteinList.Add(proteinsWithUniquePeptidesLeft.First());
            }
            else
            {
                uniquePeptidesLeft = false;
            }
        }

        // gets list of proteins with the most unaccounted-for peptide base sequences
        if (!uniquePeptidesLeft)
        {
            possibleBestProteinList = algDictionary.Where(p => p.Value.Count == numNewSeqs).ToList();
        }

        Protein bestProtein = possibleBestProteinList.First().Key;
        HashSet<string> newSeqs = new HashSet<string>(algDictionary[bestProtein]);

        // may need to select different protein
        if (possibleBestProteinList.Count > 1)
        {
            var proteinsWithTheseBaseSeqs = new HashSet<Protein>();
            foreach (var kvp in possibleBestProteinList)
            {
                if (newSeqs.IsSubsetOf(kvp.Value))
                {
                    proteinsWithTheseBaseSeqs.Add(kvp.Key);
                }
            }

            // several proteins explain the same new sequences: prefer the one with the most peptides overall
            if (proteinsWithTheseBaseSeqs.Count > 1)
            {
                var proteinsOrderedByTotalPeptideCount = new Dictionary<Protein, HashSet<string>>();
                foreach (var protein in proteinsWithTheseBaseSeqs)
                {
                    proteinsOrderedByTotalPeptideCount.Add(protein, proteinToPepSeqMatch[protein]);
                }
                bestProtein = proteinsOrderedByTotalPeptideCount.OrderByDescending(kvp => kvp.Value.Count).First().Key;
            }
        }

        parsimonyProteinList.Add(bestProtein, proteinToPeptidesMatching[bestProtein]);

        // remove used peptides from their proteins
        foreach (var newBaseSeq in newSeqs)
        {
            HashSet<Protein> proteinsWithThisPeptide = peptideSeqProteinListMatch[newBaseSeq];
            foreach (var protein in proteinsWithThisPeptide)
            {
                algDictionary[protein].Remove(newBaseSeq);
            }
        }

        algDictionary.Remove(bestProtein);
        numNewSeqs = algDictionary.Any() ? algDictionary.Max(p => p.Value.Count) : 0;
    }
    // *** done with parsimony

    // add indistinguishable proteins
    var proteinsGroupedByNumPeptides = proteinToPeptidesMatching.GroupBy(p => p.Value.Count);
    // materialized once: parsimonyProteinList is not modified until the loop below has finished, and a
    // deferred GroupBy would otherwise be re-evaluated for every group
    var parsimonyProteinsGroupedByNumPeptides = parsimonyProteinList.GroupBy(p => p.Value.Count).ToList();
    var indistinguishableProteins = new ConcurrentDictionary<Protein, HashSet<CompactPeptideBase>>();

    foreach (var group in proteinsGroupedByNumPeptides)
    {
        var parsimonyProteinsWithSameNumPeptides = parsimonyProteinsGroupedByNumPeptides.FirstOrDefault(p => p.Key == group.Key);
        var list = group.ToList();

        // only proteins with the same number of peptides can have identical peptide sets
        if (parsimonyProteinsWithSameNumPeptides != null)
        {
            Parallel.ForEach(Partitioner.Create(0, list.Count), new ParallelOptions { MaxDegreeOfParallelism = commonParameters.MaxThreadsToUsePerFile }, (range, loopState) =>
            {
                for (int i = range.Item1; i < range.Item2; i++)
                {
                    foreach (var parsimonyProteinWithThisNumPeptides in parsimonyProteinsWithSameNumPeptides)
                    {
                        if (parsimonyProteinWithThisNumPeptides.Key != list[i].Key
                            && proteinToPeptidesMatching[parsimonyProteinWithThisNumPeptides.Key].SetEquals(proteinToPeptidesMatching[list[i].Key]))
                        {
                            indistinguishableProteins.GetOrAdd(list[i].Key, proteinToPeptidesMatching[list[i].Key]);
                        }
                    }
                }
            });
        }
    }

    foreach (var protein in indistinguishableProteins)
    {
        if (!parsimonyProteinList.ContainsKey(protein.Key))
        {
            parsimonyProteinList.Add(protein.Key, protein.Value);
        }
    }

    // multiprotease parsimony:
    // this code is a workaround to add back proteins to the parsimonious list that were removed
    // because unique peptides were mistaken for shared peptides (see the multiprotease
    // unique-peptide marking earlier in this method)
    if (ListOfDigestionParams.Select(v => v.Protease).Distinct().Count() > 1)
    {
        // add back in proteins that contain unique peptides
        foreach (var prot in proteinsWithUniquePeptides)
        {
            // A protein recorded as having a unique peptide can be missing from proteinToPeptidesMatching
            // when decoy/contaminant filtering removed its only association; skip it rather than throw.
            if (!parsimonyProteinList.ContainsKey(prot.Key)
                && proteinToPeptidesMatching.TryGetValue(prot.Key, out HashSet<CompactPeptideBase> matchedPeptides))
            {
                parsimonyProteinList.Add(prot.Key, matchedPeptides);
            }
        }
    }

    // drop associations to proteins that did not make the parsimonious list
    foreach (var kvp in CompactPeptideToProteinPeptideMatching)
    {
        kvp.Value.RemoveWhere(p => !parsimonyProteinList.ContainsKey(p.Protein));
    }

    return ConstructProteinGroups(
        new HashSet<PeptideWithSetModifications>(proteinsWithUniquePeptides.Values.SelectMany(p => p)),
        new HashSet<PeptideWithSetModifications>(CompactPeptideToProteinPeptideMatching.Values.SelectMany(p => p)));
}
/// <summary>
/// Builds a <see cref="PsmData"/> feature record for one PSM. Unlike the training-variable overload,
/// every feature is always computed, except the hydrophobicity z-score, which stays NaN unless
/// <paramref name="searchType"/> is "standard".
/// </summary>
/// <param name="psm">The spectral match to describe.</param>
/// <param name="sequenceToPsmCount">PSM counts keyed by the "|"-joined full sequences of the PSM's best matching peptides.</param>
/// <param name="timeDependantHydrophobicityAverageAndDeviation_unmodified">Lookup used when the peptide's base and full sequences are equal.</param>
/// <param name="timeDependantHydrophobicityAverageAndDeviation_modified">Lookup used when the peptide's base and full sequences differ.</param>
/// <param name="selectedPeptide">Peptide to describe; when null, the first of the PSM's best matching peptides is used.</param>
/// <param name="searchType">Only the value "standard" enables the hydrophobicity z-score.</param>
/// <param name="notchToUse">Notch to report; falls back to <c>psm.Notch</c>, then to 0.</param>
/// <param name="trueOrFalse">Explicit label; when null, the label is true for non-decoy PSMs.</param>
/// <returns>A new <see cref="PsmData"/>; nothing is stored on <paramref name="psm"/> by this overload.</returns>
public static PsmData CreateOnePsmDataEntry(PeptideSpectralMatch psm, Dictionary<string, int> sequenceToPsmCount, Dictionary<string, Dictionary<int, Tuple<double, double>>> timeDependantHydrophobicityAverageAndDeviation_unmodified, Dictionary<string, Dictionary<int, Tuple<double, double>>> timeDependantHydrophobicityAverageAndDeviation_modified, PeptideWithSetModifications selectedPeptide, string searchType, int? notchToUse, bool? trueOrFalse = null)
{
    float ambiguity = (float)psm.PeptidesToMatchingFragments.Keys.Count;
    // fractional part of the score
    float intensity = (float)(psm.Score - (int)psm.Score);
    float charge = psm.ScanPrecursorCharge;
    float deltaScore = (float)psm.DeltaScore;

    string sequenceKey = String.Join("|", psm.BestMatchingPeptides.Select(p => p.Peptide.FullSequence));
    float psmCount = sequenceToPsmCount[sequenceKey];

    // explicit notch wins, then the PSM's own notch, then 0
    int notch = notchToUse ?? psm.Notch ?? 0;

    PeptideWithSetModifications peptide = selectedPeptide == null
        ? psm.BestMatchingPeptides.First().Peptide
        : selectedPeptide;

    float modCount = peptide.AllModsOneIsNterminus.Keys.Count();
    float missedCleavages = peptide.MissedCleavages;
    float longestSeq = psm.GetLongestIonSeriesBidirectional(peptide);

    // a peptide whose full sequence equals its base sequence is scored against the "unmodified" lookup
    float hydrophobicityZscore = float.NaN;
    bool sequenceIsUnmodified = peptide.BaseSequence.Equals(peptide.FullSequence);
    if (searchType == "standard")
    {
        var hydrophobicityLookup = sequenceIsUnmodified
            ? timeDependantHydrophobicityAverageAndDeviation_unmodified
            : timeDependantHydrophobicityAverageAndDeviation_modified;
        hydrophobicityZscore = GetSSRCalcHydrophobicityZScore(psm, peptide, hydrophobicityLookup);
    }

    // caller-supplied label takes precedence over target/decoy status
    bool label = trueOrFalse ?? !psm.IsDecoy;

    return new PsmData()
    {
        Intensity = intensity,
        ScanPrecursorCharge = charge,
        DeltaScore = deltaScore,
        Notch = notch,
        PsmCount = psmCount,
        ModsCount = modCount,
        MissedCleavagesCount = missedCleavages,
        Ambiguity = ambiguity,
        LongestFragmentIonSeries = longestSeq,
        HydrophobicityZScore = hydrophobicityZscore,
        Label = label
    };
}
/// <summary> /// TODO: Summarize parsimony; /// Parsimony algorithm based on: https://www.ncbi.nlm.nih.gov/pubmed/14632076 Anal Chem. 2003 Sep 1;75(17):4646-58. /// TODO: Note describing that peptide objects with the same sequence are associated with different proteins /// </summary> private List <ProteinGroup> RunProteinParsimonyEngine() { // parsimonious list of proteins built by this protein parsimony engine HashSet <Protein> parsimoniousProteinList = new HashSet <Protein>(); // list of peptides that can only be digestion products of one protein in the proteome (considering different protease digestion rules) HashSet <PeptideWithSetModifications> uniquePeptides = new HashSet <PeptideWithSetModifications>(); // if there are no peptides observed, there are no proteins; return an empty list of protein groups if (_fdrFilteredPeptides.Count == 0) { return(new List <ProteinGroup>()); } // Parsimony stage 0: create peptide-protein associations if needed because the user wants a modification-agnostic parsimony // this is needed for edge cases digesting a protein .xml from UniProt that has peptide sequences shared between proteins // that have unevenly-shared modifications if (!_treatModPeptidesAsDifferentPeptides) { foreach (var protease in _fdrFilteredPsms.GroupBy(p => p.DigestionParams.Protease)) { Dictionary <string, List <PeptideSpectralMatch> > sequenceWithPsms = new Dictionary <string, List <PeptideSpectralMatch> >(); // for each protease, match the base sequence of each peptide to its PSMs foreach (PeptideSpectralMatch psm in protease) { if (sequenceWithPsms.TryGetValue(psm.BaseSequence, out List <PeptideSpectralMatch> peptidesForThisBaseSequence)) { peptidesForThisBaseSequence.Add(psm); } else { sequenceWithPsms[psm.BaseSequence] = new List <PeptideSpectralMatch> { psm }; } } var sequenceWithPsmsList = sequenceWithPsms.ToList(); // create new peptide-protein associations as needed Parallel.ForEach(Partitioner.Create(0, sequenceWithPsmsList.Count), new 
ParallelOptions { MaxDegreeOfParallelism = CommonParameters.MaxThreadsToUsePerFile }, (range, loopState) => { for (int i = range.Item1; i < range.Item2; i++) { var baseSequence = sequenceWithPsmsList[i]; var peptidesWithNotchInfo = baseSequence.Value.SelectMany(p => p.BestMatchingPeptides).Distinct().ToList(); // if the base seq has >1 PeptideWithSetMods object and has >0 mods, it might need to be matched to new proteins if (peptidesWithNotchInfo.Count > 1 && peptidesWithNotchInfo.Any(p => p.Peptide.NumMods > 0)) { bool needToAddPeptideToProteinAssociations = false; // numProteinsForThisBaseSequence is the total number of proteins that this base sequence is a digestion product of int numProteinsForThisBaseSequence = peptidesWithNotchInfo.Select(p => p.Peptide.Protein).Distinct().Count(); if (numProteinsForThisBaseSequence == 1) { continue; } foreach (var psm in baseSequence.Value) { // numProteinsForThisPsm is the number of proteins that this PSM's peptides are associated with int numProteinsForThisPsm = psm.BestMatchingPeptides.Select(p => p.Peptide.Protein).Distinct().Count(); if (numProteinsForThisPsm != numProteinsForThisBaseSequence) { // this PSM is not matched to all the proteins that it should be matched to // at this point we know that we need to make some new peptide-protein associations needToAddPeptideToProteinAssociations = true; } } if (!needToAddPeptideToProteinAssociations) { continue; } // this gets the digestion info for all of the peptide-protein associations that should exist var proteinToPeptideInfo = new Dictionary <Protein, (DigestionParams DigestParams, int OneBasedStart, int OneBasedEnd, int MissedCleavages, int Notch, CleavageSpecificity CleavageSpecificity)>(); foreach (PeptideSpectralMatch psm in baseSequence.Value) { foreach (var peptideWithNotch in psm.BestMatchingPeptides) { PeptideWithSetModifications peptide = peptideWithNotch.Peptide; Protein protein = peptide.Protein; if (!proteinToPeptideInfo.ContainsKey(protein)) { 
proteinToPeptideInfo.Add(protein, (peptideWithNotch.Peptide.DigestionParams, peptideWithNotch.Peptide.OneBasedStartResidueInProtein, peptideWithNotch.Peptide.OneBasedEndResidueInProtein, peptideWithNotch.Peptide.MissedCleavages, peptideWithNotch.Notch, peptideWithNotch.Peptide.CleavageSpecificityForFdrCategory)); } } } // create any new associations that need to be made foreach (PeptideSpectralMatch psm in baseSequence.Value) { PeptideWithSetModifications originalPeptide = psm.BestMatchingPeptides.First().Peptide; HashSet <Protein> psmProteins = new HashSet <Protein>(psm.BestMatchingPeptides.Select(p => p.Peptide.Protein)); foreach (var proteinWithDigestInfo in proteinToPeptideInfo) { if (!psmProteins.Contains(proteinWithDigestInfo.Key)) { var pep = new PeptideWithSetModifications( proteinWithDigestInfo.Key, proteinWithDigestInfo.Value.DigestParams, proteinWithDigestInfo.Value.OneBasedStart, proteinWithDigestInfo.Value.OneBasedEnd, proteinWithDigestInfo.Value.CleavageSpecificity, originalPeptide.PeptideDescription, proteinWithDigestInfo.Value.MissedCleavages, originalPeptide.AllModsOneIsNterminus, originalPeptide.NumFixedMods); lock (_fdrFilteredPeptides) { _fdrFilteredPeptides.Add(pep); } psm.AddProteinMatch((proteinWithDigestInfo.Value.Notch, pep)); } } } } } }
/// <summary>
/// Reports whether a peptide's residue range fully covers a splice site.
/// </summary>
/// <param name="pep">Peptide whose one-based start and end residues in the protein are compared.</param>
/// <param name="site">Splice site whose one-based begin and end positions must both fall inside the peptide.</param>
/// <returns>
/// True when the peptide starts at or before the site's begin position and ends at or after its end position.
/// </returns>
private static bool Includes(PeptideWithSetModifications pep, SpliceSite site)
{
    // peptide begins after the site does: cannot contain it
    if (pep.OneBasedStartResidueInProtein > site.OneBasedBeginPosition)
    {
        return false;
    }

    return pep.OneBasedEndResidueInProtein >= site.OneBasedEndPosition;
}
/// <summary>
/// Builds a <see cref="PsmData"/> feature record for one PSM, including how often the first peptide's
/// protein accession appears according to <paramref name="accessionCounts"/>.
/// </summary>
/// <param name="psm">The spectral match to describe.</param>
/// <param name="notch">Notch value copied into the result.</param>
/// <param name="firstPeptide">Peptide used for the modification, missed-cleavage, ion-series and accession features.</param>
/// <param name="accessionCounts">Appearance counts keyed by protein accession; accessions not present count as 1.</param>
/// <param name="sequenceToPsmCount">PSM counts keyed by the "|"-joined full sequences of the PSM's best matching peptides.</param>
/// <param name="trueOrFalse">Explicit label; when null, the label is true for non-decoy PSMs.</param>
/// <returns>A new <see cref="PsmData"/> populated from the arguments.</returns>
public static PsmData CreateOnePsmDataFromPsm2(PeptideSpectralMatch psm, int notch, PeptideWithSetModifications firstPeptide, Dictionary<string, int> accessionCounts, Dictionary<string, int> sequenceToPsmCount, bool? trueOrFalse = null)
{
    // don't think ambiguity is helping, but it is still recorded in the result
    float ambiguity = (float)psm.PeptidesToMatchingFragments.Keys.Count;
    // fractional part of the score
    float intensity = (float)(psm.Score - (int)psm.Score);
    float charge = psm.ScanPrecursorCharge;
    float deltaScore = (float)psm.DeltaScore;
    float psmCount = sequenceToPsmCount[String.Join("|", psm.BestMatchingPeptides.Select(p => p.Peptide.FullSequence))];
    float modCount = firstPeptide.AllModsOneIsNterminus.Keys.Count();

    // todo: for non-specific cleavage, ignore missed cleavages
    float missedCleavages = firstPeptide.MissedCleavages;
    float longestSeq = psm.GetLongestIonSeriesBidirectional(firstPeptide);

    // single lookup; an accession that was never counted (or an empty dictionary) defaults to 1
    float appearances = accessionCounts.TryGetValue(firstPeptide.Protein.Accession, out int accessionCount)
        ? accessionCount
        : 1;

    // caller-supplied label takes precedence over target/decoy status
    bool label = trueOrFalse ?? !psm.IsDecoy;

    return new PsmData()
    {
        Intensity = intensity,
        ScanPrecursorCharge = charge,
        DeltaScore = deltaScore,
        Notch = notch,
        PsmCount = psmCount,
        ModsCount = modCount,
        MissedCleavagesCount = missedCleavages,
        Ambiguity = ambiguity,
        LongestFragmentIonSeries = longestSeq,
        AccessionAppearances = appearances,
        Label = label
    };
}