/// <summary>
/// Comparison delegate target that orders protein groups from highest to lowest
/// <c>SummedMorpheusScore</c>. When two groups have equal summed scores, the result
/// of <c>left.Target.CompareTo(right.Target)</c> decides the order.
/// </summary>
/// <param name="left">First protein group to compare.</param>
/// <param name="right">Second protein group to compare.</param>
/// <returns>
/// A negative value if <paramref name="left"/> sorts before <paramref name="right"/>,
/// a positive value if it sorts after, and zero if both the score and the
/// <c>Target</c> comparison are equal.
/// </returns>
public static int DescendingSummedMorpheusScoreProteinGroupComparison(ProteinGroup left, ProteinGroup right)
{
    // Negating the natural (ascending) score comparison yields descending order.
    int byScoreDescending = -(left.SummedMorpheusScore.CompareTo(right.SummedMorpheusScore));

    // Equal scores fall through to the Target tie-break.
    return byScoreDescending != 0
        ? byScoreDescending
        : left.Target.CompareTo(right.Target);
}
/// <summary>
/// Builds a parsimonious list of protein groups for the given peptide-spectrum matches (PSMs).
/// Steps: (1) collect the PSMs whose Morpheus score is at least the threshold and the distinct
/// base leucine sequences of their peptides; (2) digest every protein read from the database and
/// record which proteins produce each of those sequences; (3) create one protein group per
/// protein description and attach the matching PSMs; (4) sort the groups with
/// <c>DescendingSummedMorpheusScoreProteinGroupComparison</c>; (5) merge equal-scoring groups whose
/// peptide sequence sets are equal; (6) remove groups whose peptide sequences are all covered
/// by higher-ranked groups.
/// </summary>
/// <param name="peptideSpectrumMatches">PSMs to explain. The sequence is enumerated exactly once.</param>
/// <param name="morpheusScoreThreshold">Only PSMs with <c>MorpheusScore &gt;= morpheusScoreThreshold</c> are used.</param>
/// <param name="proteinFastaDatabase">Database stream handed to <c>ProteomeDatabaseReader.ReadProteins</c>.</param>
/// <param name="onTheFlyDecoys">Forwarded to <c>ProteomeDatabaseReader.ReadProteins</c>.</param>
/// <param name="knownVariableModifications">
/// Forwarded to <c>ProteomeDatabaseReader.ReadProteins</c> only when
/// <c>REQUIRE_MATCHING_KNOWN_MODIFICATIONS_IN_PROTEIN_PARSIMONY</c> is true; otherwise null is passed.
/// </param>
/// <param name="protease">Forwarded to <c>Protein.Digest</c>.</param>
/// <param name="maximumMissedCleavages">Forwarded to <c>Protein.Digest</c>.</param>
/// <param name="initiatorMethionineBehavior">Forwarded to <c>Protein.Digest</c>.</param>
/// <param name="maximumThreads">Used as <c>ParallelOptions.MaxDegreeOfParallelism</c> for the digestion loop.</param>
/// <returns>The remaining protein groups, in the sorted order established in step (4).</returns>
public static List<ProteinGroup> ApplyProteinParsimony(IEnumerable<PeptideSpectrumMatch> peptideSpectrumMatches, double morpheusScoreThreshold, FileStream proteinFastaDatabase, bool onTheFlyDecoys, IDictionary<string, Modification> knownVariableModifications, Protease protease, int maximumMissedCleavages, InitiatorMethionineBehavior initiatorMethionineBehavior, int maximumThreads)
{
    // Enumerate the input once: keep the PSMs that pass the threshold and register each distinct
    // base leucine peptide sequence. Reusing this list below guarantees that every PSM processed
    // later has a dictionary entry, even if the caller passed a lazy or non-repeatable sequence.
    List<PeptideSpectrumMatch> qualifying_psms = new List<PeptideSpectrumMatch>();
    Dictionary<string, List<Protein>> peptide_proteins = new Dictionary<string, List<Protein>>();
    foreach (PeptideSpectrumMatch psm in peptideSpectrumMatches)
    {
        if (psm.MorpheusScore >= morpheusScoreThreshold)
        {
            qualifying_psms.Add(psm);
            if (!peptide_proteins.ContainsKey(psm.Peptide.BaseLeucineSequence))
            {
                peptide_proteins.Add(psm.Peptide.BaseLeucineSequence, new List<Protein>());
            }
        }
    }

    // record all proteins that could have been the source of each peptide
    ParallelOptions parallel_options = new ParallelOptions();
    parallel_options.MaxDegreeOfParallelism = maximumThreads;
    Parallel.ForEach(ProteomeDatabaseReader.ReadProteins(proteinFastaDatabase, onTheFlyDecoys, REQUIRE_MATCHING_KNOWN_MODIFICATIONS_IN_PROTEIN_PARSIMONY ? knownVariableModifications : null), parallel_options, protein =>
    {
        foreach (Peptide peptide in protein.Digest(protease, maximumMissedCleavages, initiatorMethionineBehavior, null, null))
        {
            // The per-sequence protein lists are shared by all worker threads, so they are
            // only touched while holding the lock on the (method-local) dictionary.
            lock (peptide_proteins)
            {
                List<Protein> proteins;
                if (peptide_proteins.TryGetValue(peptide.BaseLeucineSequence, out proteins))
                {
                    List<Peptide> peptides;
                    if (!protein.IdentifiedPeptides.TryGetValue(peptide.BaseLeucineSequence, out peptides))
                    {
                        peptides = new List<Peptide>();
                        peptides.Add(peptide);
                        protein.IdentifiedPeptides.Add(peptide.BaseLeucineSequence, peptides);
                    }
                    else
                    {
                        peptides.Add(peptide);
                    }
                    // NOTE(review): the protein is appended once per matching digested peptide, so if
                    // Digest yields the same base leucine sequence more than once for a protein, the
                    // protein appears repeatedly in this list and each PSM is later added to its group
                    // once per occurrence -- confirm whether that is intended.
                    proteins.Add(protein);
                }
            }
        }
    });

    // create protein groups (initially with just one protein each) and assign PSMs to them
    Dictionary<string, ProteinGroup> proteins_by_description = new Dictionary<string, ProteinGroup>();
    foreach (PeptideSpectrumMatch psm in qualifying_psms)
    {
        foreach (Protein protein in peptide_proteins[psm.Peptide.BaseLeucineSequence])
        {
            // check to make sure this protein's known modifications match the PSM's
            if (REQUIRE_MATCHING_KNOWN_MODIFICATIONS_IN_PROTEIN_PARSIMONY && !ProteinKnownModificationsMatchPsm(protein, psm))
            {
                continue;
            }

            ProteinGroup protein_group;
            if (!proteins_by_description.TryGetValue(protein.Description, out protein_group))
            {
                protein_group = new ProteinGroup();
                protein_group.Add(protein);
                protein_group.PeptideSpectrumMatches.Add(psm);
                proteins_by_description.Add(protein.Description, protein_group);
            }
            else
            {
                protein_group.PeptideSpectrumMatches.Add(psm);
            }
        }
    }

    List<ProteinGroup> protein_groups = new List<ProteinGroup>(proteins_by_description.Values);
    protein_groups.Sort(ProteinGroup.DescendingSummedMorpheusScoreProteinGroupComparison);

    // todo: remove shared peptides from lower-scoring protein group?

    MergeIndistinguishableProteinGroups(protein_groups);
    RemoveSubsetAndSubsumableProteinGroups(protein_groups);

    return protein_groups;
}

// Returns true unless some known variable modification of the PSM's peptide is not listed among the
// protein's known modifications at the corresponding protein residue number.
private static bool ProteinKnownModificationsMatchPsm(Protein protein, PeptideSpectrumMatch psm)
{
    if (psm.Peptide.VariableModifications != null && psm.Peptide.VariableModifications.Count > 0)
    {
        foreach (KeyValuePair<int, Modification> kvp in psm.Peptide.VariableModifications)
        {
            if (kvp.Value.Known)
            {
                // The modification key is offset by the peptide's start residue number to form the
                // key used for the lookup in the protein's known modifications.
                List<Modification> protein_modifications = null;
                if (protein.KnownModifications == null
                    || !protein.KnownModifications.TryGetValue(psm.Peptide.StartResidueNumber - 1 + kvp.Key, out protein_modifications)
                    || !protein_modifications.Contains(kvp.Value))
                {
                    return false;
                }
            }
        }
    }
    return true;
}

// Merges, in place, each group with the following groups that have an equal peptide sequence set,
// stopping the scan at the first following group whose summed score is lower.
private static void MergeIndistinguishableProteinGroups(List<ProteinGroup> protein_groups)
{
    // merge indistinguishable proteins (technically protein groups but they only contain a single protein thus far)
    for (int i = 0; i < protein_groups.Count - 1; i++)
    {
        ProteinGroup protein_group = protein_groups[i];
        int j = i + 1;
        while (j < protein_groups.Count)
        {
            ProteinGroup lower_protein_group = protein_groups[j];
            if (lower_protein_group.SummedMorpheusScore < protein_group.SummedMorpheusScore)
            {
                break;
            }

            if (lower_protein_group.BaseLeucinePeptideSequences.SetEquals(protein_group.BaseLeucinePeptideSequences))
            {
                protein_group.UnionWith(lower_protein_group);  // should only ever be one protein in the group to add
                // Do not advance j: the next candidate has shifted into this index.
                protein_groups.RemoveAt(j);
            }
            else
            {
                j++;
            }
        }
    }
}

// Removes, in place, every group (other than the first) whose base leucine peptide sequences are all
// contained in the groups that precede it in the list.
private static void RemoveSubsetAndSubsumableProteinGroups(List<ProteinGroup> protein_groups)
{
    // Walk from the end of the list towards the front so removals never disturb unvisited indices.
    for (int k = protein_groups.Count - 1; k >= 1; k--)
    {
        ProteinGroup protein_group = protein_groups[k];
        HashSet<string> protein_group_peptides = new HashSet<string>(protein_group.BaseLeucinePeptideSequences);
        for (int l = 0; l < k; l++)
        {
            ProteinGroup higher_protein_group = protein_groups[l];
            protein_group_peptides.ExceptWith(higher_protein_group.BaseLeucinePeptideSequences);
            if (protein_group_peptides.Count == 0)
            {
                break;
            }
        }

        if (protein_group_peptides.Count == 0)
        {
            protein_groups.RemoveAt(k);
        }
    }
}