Exemple #1
0
        /// <summary>
        /// Comparison for ordering protein groups by <c>SummedMorpheusScore</c>, highest score first.
        /// When the scores compare as equal, the result of comparing the groups' <c>Target</c> values is used instead.
        /// </summary>
        /// <param name="left">First protein group to compare.</param>
        /// <param name="right">Second protein group to compare.</param>
        /// <returns>
        /// A negative value if <paramref name="left"/> sorts before <paramref name="right"/>,
        /// a positive value if it sorts after, or zero if both the scores and the <c>Target</c> values compare as equal.
        /// </returns>
        public static int DescendingSummedMorpheusScoreProteinGroupComparison(ProteinGroup left, ProteinGroup right)
        {
            int ascendingByScore = left.SummedMorpheusScore.CompareTo(right.SummedMorpheusScore);

            // Scores tie: fall back to the natural ordering of Target.
            if (ascendingByScore == 0)
            {
                return left.Target.CompareTo(right.Target);
            }

            // Flip the sign so that the larger score sorts first.
            return -ascendingByScore;
        }
Exemple #2
0
        /// <summary>
        /// Builds protein groups from the peptide-spectrum matches (PSMs) that score at or above
        /// <paramref name="morpheusScoreThreshold"/>. The steps are:
        /// 1. collect the distinct base leucine sequences of the qualifying PSMs;
        /// 2. read the proteins from the database, digest each one, and record which proteins produce each collected sequence;
        /// 3. create one protein group per protein description and attach the qualifying PSMs to it;
        /// 4. sort the groups with <c>ProteinGroup.DescendingSummedMorpheusScoreProteinGroupComparison</c>;
        /// 5. merge groups whose base leucine peptide sequence sets are equal;
        /// 6. remove groups whose sequences are all covered by groups earlier in the list.
        /// </summary>
        /// <param name="peptideSpectrumMatches">PSMs to consider. The sequence is enumerated exactly once.</param>
        /// <param name="morpheusScoreThreshold">Minimum <c>MorpheusScore</c> (inclusive) a PSM needs in order to be used.</param>
        /// <param name="proteinFastaDatabase">Stream handed to <c>ProteomeDatabaseReader.ReadProteins</c>.</param>
        /// <param name="onTheFlyDecoys">Passed through to <c>ProteomeDatabaseReader.ReadProteins</c>.</param>
        /// <param name="knownVariableModifications">
        /// Passed to <c>ProteomeDatabaseReader.ReadProteins</c> only when
        /// <c>REQUIRE_MATCHING_KNOWN_MODIFICATIONS_IN_PROTEIN_PARSIMONY</c> is true; otherwise null is passed.
        /// </param>
        /// <param name="protease">Passed to <c>Protein.Digest</c>.</param>
        /// <param name="maximumMissedCleavages">Passed to <c>Protein.Digest</c>.</param>
        /// <param name="initiatorMethionineBehavior">Passed to <c>Protein.Digest</c>.</param>
        /// <param name="maximumThreads">Used as <c>ParallelOptions.MaxDegreeOfParallelism</c> for the digestion step.</param>
        /// <returns>The remaining protein groups, in the order produced by the sort, after merging and removal.</returns>
        public static List<ProteinGroup> ApplyProteinParsimony(IEnumerable<PeptideSpectrumMatch> peptideSpectrumMatches, double morpheusScoreThreshold, FileStream proteinFastaDatabase, bool onTheFlyDecoys, IDictionary<string, Modification> knownVariableModifications, Protease protease, int maximumMissedCleavages, InitiatorMethionineBehavior initiatorMethionineBehavior, int maximumThreads)
        {
            // Enumerate the input exactly once. The PSMs that pass the threshold are kept in a list so that the
            // grouping step below sees precisely the PSMs whose sequences were registered here; re-enumerating a
            // deferred or non-repeatable sequence could yield PSMs with unregistered sequences (failing the
            // dictionary lookup) and would repeat any upstream work.
            List<PeptideSpectrumMatch> qualifying_psms = new List<PeptideSpectrumMatch>();

            // make a list of the all the distinct base leucine peptide sequences
            Dictionary<string, List<Protein>> peptide_proteins = new Dictionary<string, List<Protein>>();

            foreach (PeptideSpectrumMatch psm in peptideSpectrumMatches)
            {
                if (psm.MorpheusScore >= morpheusScoreThreshold)
                {
                    qualifying_psms.Add(psm);

                    if (!peptide_proteins.ContainsKey(psm.Peptide.BaseLeucineSequence))
                    {
                        peptide_proteins.Add(psm.Peptide.BaseLeucineSequence, new List<Protein>());
                    }
                }
            }

            // record all proteins that could have been the source of each peptide
            ParallelOptions parallel_options = new ParallelOptions();

            parallel_options.MaxDegreeOfParallelism = maximumThreads;
            Parallel.ForEach(ProteomeDatabaseReader.ReadProteins(proteinFastaDatabase, onTheFlyDecoys, REQUIRE_MATCHING_KNOWN_MODIFICATIONS_IN_PROTEIN_PARSIMONY ? knownVariableModifications : null), parallel_options, protein =>
            {
                foreach (Peptide peptide in protein.Digest(protease, maximumMissedCleavages, initiatorMethionineBehavior, null, null))
                {
                    // The per-sequence protein lists are shared by all worker threads, so every access is serialized.
                    lock (peptide_proteins)
                    {
                        List<Protein> proteins;
                        if (peptide_proteins.TryGetValue(peptide.BaseLeucineSequence, out proteins))
                        {
                            // Remember on the protein itself which of its digested peptides matched a collected sequence.
                            List<Peptide> peptides;
                            if (!protein.IdentifiedPeptides.TryGetValue(peptide.BaseLeucineSequence, out peptides))
                            {
                                peptides = new List<Peptide>();
                                peptides.Add(peptide);
                                protein.IdentifiedPeptides.Add(peptide.BaseLeucineSequence, peptides);
                            }
                            else
                            {
                                peptides.Add(peptide);
                            }
                            proteins.Add(protein);
                        }
                    }
                }
            }
                             );

            // create protein groups (initially with just one protein each) and assign PSMs to them
            Dictionary<string, ProteinGroup> proteins_by_description = new Dictionary<string, ProteinGroup>();

            // Only PSMs that already passed the threshold are in this list, so no second score check is needed.
            foreach (PeptideSpectrumMatch psm in qualifying_psms)
            {
                foreach (Protein protein in peptide_proteins[psm.Peptide.BaseLeucineSequence])
                {
                    if (REQUIRE_MATCHING_KNOWN_MODIFICATIONS_IN_PROTEIN_PARSIMONY)
                    {
                        // check to make sure this protein's known modifications match the PSM's
                        bool known_modification_match = true;
                        if (psm.Peptide.VariableModifications != null && psm.Peptide.VariableModifications.Count > 0)
                        {
                            foreach (KeyValuePair<int, Modification> kvp in psm.Peptide.VariableModifications)
                            {
                                if (kvp.Value.Known)
                                {
                                    // NOTE(review): StartResidueNumber - 1 + kvp.Key presumably converts the peptide-relative
                                    // modification index into the key space of protein.KnownModifications -- confirm the convention.
                                    List<Modification> protein_modifications = null;
                                    if (protein.KnownModifications == null ||
                                        !protein.KnownModifications.TryGetValue(psm.Peptide.StartResidueNumber - 1 + kvp.Key, out protein_modifications) ||
                                        !protein_modifications.Contains(kvp.Value))
                                    {
                                        known_modification_match = false;
                                        break;
                                    }
                                }
                            }
                            if (!known_modification_match)
                            {
                                // This protein does not carry one of the PSM's known modifications; skip it for this PSM.
                                continue;
                            }
                        }
                    }

                    ProteinGroup protein_group;
                    if (!proteins_by_description.TryGetValue(protein.Description, out protein_group))
                    {
                        protein_group = new ProteinGroup();
                        protein_group.Add(protein);
                        protein_group.PeptideSpectrumMatches.Add(psm);
                        proteins_by_description.Add(protein.Description, protein_group);
                    }
                    else
                    {
                        protein_group.PeptideSpectrumMatches.Add(psm);
                    }
                }
            }

            List<ProteinGroup> protein_groups = new List<ProteinGroup>(proteins_by_description.Values);

            protein_groups.Sort(ProteinGroup.DescendingSummedMorpheusScoreProteinGroupComparison);

            // todo: remove shared peptides from lower-scoring protein group?

            // merge indistinguishable proteins (technically protein groups but they only contain a single protein thus far)
            for (int i = 0; i < protein_groups.Count - 1; i++)
            {
                ProteinGroup protein_group = protein_groups[i];

                int j = i + 1;
                while (j < protein_groups.Count)
                {
                    ProteinGroup lower_protein_group = protein_groups[j];

                    // Stop scanning as soon as a later group scores strictly lower than the current one.
                    if (lower_protein_group.SummedMorpheusScore < protein_group.SummedMorpheusScore)
                    {
                        break;
                    }

                    if (lower_protein_group.BaseLeucinePeptideSequences.SetEquals(protein_group.BaseLeucinePeptideSequences))
                    {
                        protein_group.UnionWith(lower_protein_group);  // should only ever be one protein in the group to add
                        // j is not advanced: after the removal the next candidate now sits at index j.
                        protein_groups.RemoveAt(j);
                    }
                    else
                    {
                        j++;
                    }
                }
            }

            // remove subset and subsumable protein groups
            // Walk from the end of the list towards the front so that removing index k never shifts an unvisited index.
            int k = protein_groups.Count - 1;

            while (k >= 1)
            {
                ProteinGroup protein_group = protein_groups[k];

                // Working copy of this group's sequences; whatever is left after subtracting every earlier group is unique to it.
                HashSet<string> protein_group_peptides = new HashSet<string>(protein_group.BaseLeucinePeptideSequences);

                for (int l = 0; l < k; l++)
                {
                    ProteinGroup higher_protein_group = protein_groups[l];

                    protein_group_peptides.ExceptWith(higher_protein_group.BaseLeucinePeptideSequences);
                    if (protein_group_peptides.Count == 0)
                    {
                        break;
                    }
                }

                // Every sequence is already covered by the groups before this one, so the group is dropped.
                if (protein_group_peptides.Count == 0)
                {
                    protein_groups.RemoveAt(k);
                }
                k--;
            }

            return protein_groups;
        }