Esempio n. 1
0
        /// <summary>
        /// Reads a tab-separated protease definition file and builds a lookup of proteases keyed by name.
        /// </summary>
        /// <remarks>
        /// The first line of the file is read and discarded (NOTE(review): presumably a column header - confirm against the data file).
        /// Every following non-blank line is split on tabs and must provide at least eight fields, in this order:
        /// name, comma-separated sequences inducing cleavage, comma-separated sequences preventing cleavage,
        /// cleavage terminus, cleavage specificity, PSI-MS accession number, PSI-MS name, site regular expression.
        /// Blank or whitespace-only lines (for example a trailing empty line) are skipped instead of failing on a missing field.
        /// </remarks>
        /// <param name="proteasesLocation">Path of the protease definition file.</param>
        /// <returns>A dictionary mapping each protease's <c>Name</c> to the <c>Protease</c> built from its line.</returns>
        /// <exception cref="ArgumentException">
        /// A terminus or specificity field does not name a member of its enum (matching is case-insensitive),
        /// or two lines produce the same protease name.
        /// </exception>
        /// <exception cref="IndexOutOfRangeException">A non-blank line has fewer than eight tab-separated fields.</exception>
        private static Dictionary<string, Protease> LoadProteaseDictionary(string proteasesLocation)
        {
            var dict = new Dictionary<string, Protease>();
            char[] listSeparator = { ',' };

            using (StreamReader proteases = new StreamReader(proteasesLocation))
            {
                // the first line is not a protease entry; discard it
                proteases.ReadLine();

                string line;
                while ((line = proteases.ReadLine()) != null)
                {
                    // tolerate empty lines (commonly a trailing newline at the end of the file)
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    string[] fields = line.Split('\t');

                    string name = fields[0];
                    string[] sequences_inducing_cleavage = fields[1].Split(listSeparator, StringSplitOptions.RemoveEmptyEntries);
                    string[] sequences_preventing_cleavage = fields[2].Split(listSeparator, StringSplitOptions.RemoveEmptyEntries);
                    var cleavage_terminus = (TerminusType)Enum.Parse(typeof(TerminusType), fields[3], true);
                    var cleavage_specificity = (CleavageSpecificity)Enum.Parse(typeof(CleavageSpecificity), fields[4], true);
                    string psi_ms_accession_number = fields[5];
                    string psi_ms_name = fields[6];
                    string site_regexp = fields[7];

                    var protease = new Protease(name, sequences_inducing_cleavage, sequences_preventing_cleavage, cleavage_terminus, cleavage_specificity, psi_ms_accession_number, psi_ms_name, site_regexp);

                    // keyed by the name the Protease reports, which is what the original code used as well
                    dict.Add(protease.Name, protease);
                }
            }

            return dict;
        }
Esempio n. 2
0
        /// <summary>
        /// Gets peptides for semispecific digestion of a protein
        /// </summary>
        /// <remarks>
        /// Three kinds of intervals are collected before modifications are applied:
        /// intervals between cleavage sites that are <c>MaximumMissedCleavages + 1</c> sites apart,
        /// shorter intervals anchored at the last (or first) cleavage site, and intervals taken from the
        /// protein's proteolysis products. Proteolysis products with a missing begin or end position are ignored.
        /// </remarks>
        /// <param name="protein">Protein to digest.</param>
        /// <returns>
        /// A deferred sequence: the modified peptides of every collected interval are produced by
        /// <c>GetModifiedPeptides</c> as the result is enumerated.
        /// </returns>
        public IEnumerable<PeptideWithSetModifications> SemiSpecificDigestion(Protein protein)
        {
            List<Peptide> intervals = new List<Peptide>();
            List<int> oneBasedIndicesToCleaveAfter = Protease.GetDigestionSiteIndices(protein.BaseSequence);

            // intervals from site i to the site MaximumMissedCleavages + 1 positions further on
            for (int i = 0; i < oneBasedIndicesToCleaveAfter.Count - MaximumMissedCleavages - 1; i++)
            {
                if (Protease.Retain(i, InitiatorMethionineBehavior, protein[0]) &&
                    Protease.OkayLength(oneBasedIndicesToCleaveAfter[i + MaximumMissedCleavages + 1] - oneBasedIndicesToCleaveAfter[i], MinPeptidesLength, MaxPeptidesLength))
                {
                    intervals.Add(new Peptide(protein, oneBasedIndicesToCleaveAfter[i] + 1, oneBasedIndicesToCleaveAfter[i + MaximumMissedCleavages + 1],
                                              oneBasedIndicesToCleaveAfter[i + MaximumMissedCleavages + 1] - oneBasedIndicesToCleaveAfter[i], "semi"));
                }

                // variant that starts at residue 2, i.e. without the first residue of the protein
                if (Protease.Cleave(i, InitiatorMethionineBehavior, protein[0]) &&
                    Protease.OkayLength(oneBasedIndicesToCleaveAfter[i + MaximumMissedCleavages + 1] - 1, MinPeptidesLength, MaxPeptidesLength))
                {
                    intervals.Add(new Peptide(protein, 2, oneBasedIndicesToCleaveAfter[i + MaximumMissedCleavages + 1],
                                              oneBasedIndicesToCleaveAfter[i + MaximumMissedCleavages + 1] - 1, "semi:M cleaved"));
                }
            }

            // shorter intervals (1 .. MaximumMissedCleavages sites wide, capped by the number of sites available)
            int lastIndex = oneBasedIndicesToCleaveAfter.Count - 1;
            int maxIndex = MaximumMissedCleavages < lastIndex ? MaximumMissedCleavages : lastIndex;

            for (int i = 1; i <= maxIndex; i++)
            {
                if (DigestionParams.TerminusTypeSemiProtease == TerminusType.N) //tricky, it's N because we want the extra peptide at the C terminus |_
                {
                    // anchored at the last cleavage site, reaching back i sites
                    if (Protease.OkayLength(oneBasedIndicesToCleaveAfter[lastIndex] - oneBasedIndicesToCleaveAfter[lastIndex - i], MinPeptidesLength, MaxPeptidesLength))
                    {
                        intervals.Add(new Peptide(protein, oneBasedIndicesToCleaveAfter[lastIndex - i] + 1, oneBasedIndicesToCleaveAfter[lastIndex],
                                                  oneBasedIndicesToCleaveAfter[lastIndex] - oneBasedIndicesToCleaveAfter[lastIndex - i], "semiN"));
                    }
                }
                else //TerminusType.C
                {
                    // anchored at the first cleavage site, reaching forward i sites
                    if (Protease.OkayLength(oneBasedIndicesToCleaveAfter[i] - oneBasedIndicesToCleaveAfter[0], MinPeptidesLength, MaxPeptidesLength))
                    {
                        intervals.Add(new Peptide(protein, oneBasedIndicesToCleaveAfter[0] + 1, oneBasedIndicesToCleaveAfter[i],
                                                  oneBasedIndicesToCleaveAfter[i] - oneBasedIndicesToCleaveAfter[0], "semiC"));
                    }
                }
            }

            // Also digest using the proteolysis product start/end indices.
            // Both positions are dereferenced with .Value below, so a product lacking either one is skipped here;
            // without the HasValue checks a missing position passes the "!=" comparison and .Value then throws.
            // Products spanning the whole protein (begin 1, end protein.Length) are excluded as before.
            intervals.AddRange(
                protein.ProteolysisProducts
                .Where(proteolysisProduct => proteolysisProduct.OneBasedBeginPosition.HasValue
                                             && proteolysisProduct.OneBasedEndPosition.HasValue
                                             && (proteolysisProduct.OneBasedBeginPosition != 1 || proteolysisProduct.OneBasedEndPosition != protein.Length))
                .Select(proteolysisProduct => new Peptide(protein, proteolysisProduct.OneBasedBeginPosition.Value, proteolysisProduct.OneBasedEndPosition.Value,
                                                          0, proteolysisProduct.Type + " start")));

            return intervals.SelectMany(peptide => peptide.GetModifiedPeptides(AllKnownFixedModifications, DigestionParams, VariableModifications));
        }
Esempio n. 3
0
        /// <summary>
        /// Digests a protein with the configured protease and expands every resulting interval
        /// into its modified peptide forms.
        /// </summary>
        /// <param name="protein">Protein to digest.</param>
        /// <returns>
        /// The modified peptides of all digestion intervals. The intervals are requested from the protease
        /// when this method is called; the modified forms are generated as the result is enumerated.
        /// </returns>
        public IEnumerable<PeptideWithSetModifications> Digestion(Protein protein)
        {
            return Protease
                   .GetDigestionIntervals(protein, MaximumMissedCleavages, InitiatorMethionineBehavior, MinPeptidesLength, MaxPeptidesLength)
                   .SelectMany(interval => interval.GetModifiedPeptides(AllKnownFixedModifications, DigestionParams, VariableModifications));
        }
        /// <summary>
        /// Reduces the proteins referenced by <c>CompactPeptideToProteinPeptideMatching</c> to a parsimonious set
        /// and builds protein groups from the remaining peptide-to-protein associations.
        /// </summary>
        /// <remarks>
        /// The method works in these stages:
        /// 1. When <c>TreatModPeptidesAsDifferentPeptides</c> is false, every form of a base sequence that occurs in
        ///    several forms (at least one of them modified) is re-created at every observed protein location.
        /// 2. Peptides that match a single protein, or that are the only match for their protease, are recorded as
        ///    unique. Associations with non-decoy proteins are dropped when a decoy also matches, then associations
        ///    with non-contaminant proteins are dropped when a contaminant also matches.
        /// 3. A greedy loop repeatedly selects a protein (proteins with unique peptides first, then the protein
        ///    explaining the most still-unexplained peptide keys) until no unexplained keys remain.
        /// 4. Proteins whose peptide set equals that of a selected protein are added as indistinguishable.
        /// 5. With more than one distinct protease, proteins that own unique peptides are added back.
        /// 6. Associations with proteins that were not selected are removed.
        /// Side effect: the hash sets stored in <c>CompactPeptideToProteinPeptideMatching</c> are modified in place
        /// (peptides are added in stage 1 and removed in stages 2 and 6).
        /// </remarks>
        /// <returns>
        /// An empty list when there are no matched peptides; otherwise the result of <c>ConstructProteinGroups</c>
        /// for the unique peptides and all remaining peptides.
        /// </returns>
        private List<ProteinGroup> ApplyProteinParsimony()
        {
            // if dictionary is empty return an empty list of protein groups
            if (!CompactPeptideToProteinPeptideMatching.Values.Any())
            {
                return new List<ProteinGroup>();
            }

            // digesting an XML database results in a non-mod-agnostic digestion; need to fix this if mod-agnostic parsimony enabled
            if (!TreatModPeptidesAsDifferentPeptides) // user wants modified and unmodified peptides treated the same
            {
                // key: base sequence; value: all PeptideWithSetModifications sharing that base sequence
                // (the protein each one came from is reachable through the PeptideWithSetModifications object)
                var baseSeqToProteinMatch = new Dictionary<string, HashSet<PeptideWithSetModifications>>();
                foreach (var peptide in CompactPeptideToProteinPeptideMatching.SelectMany(b => b.Value))
                {
                    if (baseSeqToProteinMatch.TryGetValue(peptide.BaseSequence, out HashSet<PeptideWithSetModifications> value))
                    {
                        value.Add(peptide);
                    }
                    else
                    {
                        baseSeqToProteinMatch[peptide.BaseSequence] = new HashSet<PeptideWithSetModifications> { peptide };
                    }
                }

                // reverse lookup: peptide -> every compact peptide it is currently matched to
                var peptideToCompactPeptides = new Dictionary<PeptideWithSetModifications, List<CompactPeptideBase>>();
                foreach (var pep in CompactPeptideToProteinPeptideMatching)
                {
                    foreach (var pepWithSetMods in pep.Value)
                    {
                        if (peptideToCompactPeptides.TryGetValue(pepWithSetMods, out List<CompactPeptideBase> list))
                        {
                            list.Add(pep.Key);
                        }
                        else
                        {
                            peptideToCompactPeptides.Add(pepWithSetMods, new List<CompactPeptideBase> { pep.Key });
                        }
                    }
                }

                foreach (var baseSequence in baseSeqToProteinMatch)
                {
                    // only base sequences that occur in more than one form, at least one of them modified
                    if (baseSequence.Value.Count > 1 && baseSequence.Value.Any(p => p.NumMods > 0))
                    {
                        // list of proteins along with start/end residue in protein and the # missed cleavages
                        var peptideInProteinInfo = new List<Tuple<Protein, DigestionParams, int, int, int>>();
                        foreach (var peptide in baseSequence.Value)
                        {
                            peptideInProteinInfo.Add(new Tuple<Protein, DigestionParams, int, int, int>(peptide.Protein, peptide.DigestionParams, peptide.OneBasedStartResidueInProtein, peptide.OneBasedEndResidueInProtein, (int)peptide.MissedCleavages));
                        }

                        // re-create each form (its description and modifications) at every recorded location and
                        // attach it to the compact peptides the original form was matched to
                        foreach (var peptide in baseSequence.Value)
                        {
                            foreach (var proteinInfo in peptideInProteinInfo)
                            {
                                var pep = new PeptideWithSetModifications(proteinInfo.Item1, proteinInfo.Item2, proteinInfo.Item3, proteinInfo.Item4, peptide.PeptideDescription, proteinInfo.Item5, peptide.AllModsOneIsNterminus, peptide.NumFixedMods);
                                foreach (var compactPeptide in peptideToCompactPeptides[peptide])
                                {
                                    CompactPeptideToProteinPeptideMatching[compactPeptide].Add(pep);
                                }
                            }
                        }
                    }
                }
            }

            var proteinToPeptidesMatching = new Dictionary<Protein, HashSet<CompactPeptideBase>>();
            var parsimonyProteinList = new Dictionary<Protein, HashSet<CompactPeptideBase>>();
            var proteinsWithUniquePeptides = new Dictionary<Protein, HashSet<PeptideWithSetModifications>>();

            // peptide matched to fullseq (used depending on user preference)
            var compactPeptideToFullSeqMatch = CompactPeptideToProteinPeptideMatching.ToDictionary(x => x.Key, x => x.Value.First().Sequence);

            foreach (var kvp in CompactPeptideToProteinPeptideMatching)
            {
                HashSet<Protein> proteinsAssociatedWithThisPeptide = new HashSet<Protein>(kvp.Value.Select(p => p.Protein));
                if (proteinsAssociatedWithThisPeptide.Count == 1)
                {
                    // every match belongs to the same protein, so all of them are unique to it
                    if (!proteinsWithUniquePeptides.TryGetValue(kvp.Value.First().Protein, out HashSet<PeptideWithSetModifications> peptides))
                    {
                        proteinsWithUniquePeptides.Add(kvp.Value.First().Protein, new HashSet<PeptideWithSetModifications>(kvp.Value));
                    }
                    else
                    {
                        peptides.UnionWith(kvp.Value);
                    }
                }
                // multiprotease parsimony is "weird" because a peptide sequence can be shared between
                // two proteins but technically be a "unique" peptide because it is unique in that protease digestion
                // this code marks these types of peptides as unique
                else
                {
                    foreach (var peptide in kvp.Value)
                    {
                        Protease protease = peptide.DigestionParams.Protease;
                        int sameProteaseCount = kvp.Value.Count(v => v.DigestionParams.Protease == protease);

                        if (sameProteaseCount == 1)
                        {
                            if (!proteinsWithUniquePeptides.TryGetValue(peptide.Protein, out HashSet<PeptideWithSetModifications> peps))
                            {
                                proteinsWithUniquePeptides.Add(peptide.Protein, new HashSet<PeptideWithSetModifications> { peptide });
                            }
                            else
                            {
                                // only this peptide is unique to the protein; kvp.Value also holds the matches to the
                                // other proteins, which must not be recorded as unique peptides of this one
                                peps.Add(peptide);
                            }
                        }
                    }
                }

                // if a peptide is associated with a decoy protein, remove all target protein associations with the peptide
                if (kvp.Value.Any(p => p.Protein.IsDecoy))
                {
                    kvp.Value.RemoveWhere(p => !p.Protein.IsDecoy);
                }

                // if a peptide is associated with a contaminant protein, remove all target protein associations with the peptide
                if (kvp.Value.Any(p => p.Protein.IsContaminant))
                {
                    kvp.Value.RemoveWhere(p => !p.Protein.IsContaminant);
                }
            }

            // makes dictionary with proteins as keys and list of associated peptides as the value (makes parsimony algo easier)
            foreach (var kvp in CompactPeptideToProteinPeptideMatching)
            {
                foreach (var peptide in kvp.Value)
                {
                    if (!proteinToPeptidesMatching.TryGetValue(peptide.Protein, out HashSet<CompactPeptideBase> peptides))
                    {
                        proteinToPeptidesMatching.Add(peptide.Protein, new HashSet<CompactPeptideBase> { kvp.Key });
                    }
                    else
                    {
                        peptides.Add(kvp.Key);
                    }
                }
            }

            // build protein list for each peptide before parsimony has been applied
            var peptideSeqProteinListMatch = new Dictionary<string, HashSet<Protein>>();

            foreach (var kvp in proteinToPeptidesMatching)
            {
                foreach (var peptide in kvp.Value)
                {
                    string pepSequence;
                    if (!TreatModPeptidesAsDifferentPeptides)
                    {
                        // mod-agnostic key: the terminal masses and the monoisotopic mass, concatenated as text
                        string nTerminalMasses = peptide.NTerminalMasses == null ? "" : string.Join("", peptide.NTerminalMasses.Select(b => b.ToString(CultureInfo.InvariantCulture)));
                        string cTerminalMasses = peptide.CTerminalMasses == null ? "" : string.Join("", peptide.CTerminalMasses.Select(b => b.ToString(CultureInfo.InvariantCulture)));
                        pepSequence = nTerminalMasses + cTerminalMasses + peptide.MonoisotopicMassIncludingFixedMods.ToString(CultureInfo.InvariantCulture);
                    }
                    else
                    {
                        pepSequence = compactPeptideToFullSeqMatch[peptide];
                    }

                    if (!peptideSeqProteinListMatch.TryGetValue(pepSequence, out HashSet<Protein> proteinListHere))
                    {
                        peptideSeqProteinListMatch.Add(pepSequence, new HashSet<Protein> { kvp.Key });
                    }
                    else
                    {
                        proteinListHere.Add(kvp.Key);
                    }
                }
            }

            // dictionary associates proteins w/ unused base seqs (list will shrink over time)
            var algDictionary = new Dictionary<Protein, HashSet<string>>();

            foreach (var kvp in peptideSeqProteinListMatch)
            {
                foreach (var protein in kvp.Value)
                {
                    if (algDictionary.TryGetValue(protein, out HashSet<string> newPeptideBaseSeqs))
                    {
                        newPeptideBaseSeqs.Add(kvp.Key);
                    }
                    else
                    {
                        algDictionary.Add(protein, new HashSet<string> { kvp.Key });
                    }
                }
            }

            // dictionary associates proteins w/ all of their base seqs (list will NOT shrink over time).
            // Each set is copied: sharing the HashSet instances with algDictionary would make this snapshot shrink
            // together with the working sets and defeat the total-peptide-count tie-break in the loop below.
            var proteinToPepSeqMatch = algDictionary.ToDictionary(x => x.Key, x => new HashSet<string>(x.Value));

            // *** main parsimony loop
            bool uniquePeptidesLeft = proteinsWithUniquePeptides.Any();

            int numNewSeqs = algDictionary.Max(p => p.Value.Count);

            while (numNewSeqs != 0)
            {
                var possibleBestProteinList = new List<KeyValuePair<Protein, HashSet<string>>>();

                // proteins with unique peptides are selected first, one per iteration
                if (uniquePeptidesLeft)
                {
                    var proteinsWithUniquePeptidesLeft = algDictionary.Where(p => proteinsWithUniquePeptides.ContainsKey(p.Key));
                    if (proteinsWithUniquePeptidesLeft.Any())
                    {
                        possibleBestProteinList.Add(proteinsWithUniquePeptidesLeft.First());
                    }
                    else
                    {
                        uniquePeptidesLeft = false;
                    }
                }

                // gets list of proteins with the most unaccounted-for peptide base sequences
                if (!uniquePeptidesLeft)
                {
                    possibleBestProteinList = algDictionary.Where(p => p.Value.Count == numNewSeqs).ToList();
                }

                Protein bestProtein = possibleBestProteinList.First().Key;
                // copied because the working set in algDictionary is modified while this one is iterated below
                HashSet<string> newSeqs = new HashSet<string>(algDictionary[bestProtein]);

                // may need to select different protein
                if (possibleBestProteinList.Count > 1)
                {
                    var proteinsWithTheseBaseSeqs = new HashSet<Protein>();

                    foreach (var kvp in possibleBestProteinList)
                    {
                        if (newSeqs.IsSubsetOf(kvp.Value))
                        {
                            proteinsWithTheseBaseSeqs.Add(kvp.Key);
                        }
                    }

                    // several candidates explain the same new sequences: prefer the one with the most sequences overall
                    if (proteinsWithTheseBaseSeqs.Count > 1)
                    {
                        var proteinsOrderedByTotalPeptideCount = new Dictionary<Protein, HashSet<string>>();
                        foreach (var protein in proteinsWithTheseBaseSeqs)
                        {
                            proteinsOrderedByTotalPeptideCount.Add(protein, proteinToPepSeqMatch[protein]);
                        }
                        bestProtein = proteinsOrderedByTotalPeptideCount.OrderByDescending(kvp => kvp.Value.Count).First().Key;
                    }
                }

                parsimonyProteinList.Add(bestProtein, proteinToPeptidesMatching[bestProtein]);

                // remove used peptides from their proteins
                foreach (var newBaseSeq in newSeqs)
                {
                    HashSet<Protein> proteinsWithThisPeptide = peptideSeqProteinListMatch[newBaseSeq];

                    foreach (var protein in proteinsWithThisPeptide)
                    {
                        algDictionary[protein].Remove(newBaseSeq);
                    }
                }
                algDictionary.Remove(bestProtein);
                numNewSeqs = algDictionary.Any() ? algDictionary.Max(p => p.Value.Count) : 0;
            }

            // *** done with parsimony

            // add indistinguishable proteins
            var proteinsGroupedByNumPeptides = proteinToPeptidesMatching.GroupBy(p => p.Value.Count);
            // materialised once; parsimonyProteinList is not modified until the parallel comparison below has finished
            var parsimonyProteinsByNumPeptides = parsimonyProteinList
                                                 .GroupBy(p => p.Value.Count)
                                                 .ToDictionary(g => g.Key, g => g.ToList());
            var indistinguishableProteins = new ConcurrentDictionary<Protein, HashSet<CompactPeptideBase>>();

            foreach (var group in proteinsGroupedByNumPeptides)
            {
                var list = group.ToList();

                // only proteins with the same number of peptides can have identical peptide sets
                if (parsimonyProteinsByNumPeptides.TryGetValue(group.Key, out var parsimonyProteinsWithSameNumPeptides))
                {
                    Parallel.ForEach(Partitioner.Create(0, list.Count),
                                     new ParallelOptions { MaxDegreeOfParallelism = commonParameters.MaxThreadsToUsePerFile },
                                     (range, loopState) =>
                    {
                        for (int i = range.Item1; i < range.Item2; i++)
                        {
                            foreach (var parsimonyProteinWithThisNumPeptides in parsimonyProteinsWithSameNumPeptides)
                            {
                                if (parsimonyProteinWithThisNumPeptides.Key != list[i].Key &&
                                    proteinToPeptidesMatching[parsimonyProteinWithThisNumPeptides.Key].SetEquals(proteinToPeptidesMatching[list[i].Key]))
                                {
                                    indistinguishableProteins.GetOrAdd(list[i].Key, proteinToPeptidesMatching[list[i].Key]);
                                }
                            }
                        }
                    });
                }
            }

            foreach (var protein in indistinguishableProteins)
            {
                if (!parsimonyProteinList.ContainsKey(protein.Key))
                {
                    parsimonyProteinList.Add(protein.Key, protein.Value);
                }
            }

            // multiprotease parsimony:
            // this code is a workaround to add back proteins to the parsimonious list that were removed
            // because unique peptides were mistaken for shared peptides. see the loop above that fills
            // proteinsWithUniquePeptides for more info
            if (ListOfDigestionParams.Select(v => v.Protease).Distinct().Count() > 1)
            {
                HashSet<Protein> parsimonyProteinSet = new HashSet<Protein>(parsimonyProteinList.Keys);

                // add back in proteins that contain unique peptides
                foreach (var prot in proteinsWithUniquePeptides)
                {
                    // uniqueness was recorded before the decoy/contaminant pruning, so a protein whose only peptide
                    // was pruned has no entry in proteinToPeptidesMatching; it has nothing to contribute and is skipped
                    if (!parsimonyProteinSet.Contains(prot.Key) &&
                        proteinToPeptidesMatching.TryGetValue(prot.Key, out HashSet<CompactPeptideBase> peptidesOfProtein))
                    {
                        parsimonyProteinList.Add(prot.Key, peptidesOfProtein);
                    }
                }
            }

            // keep only the associations with proteins that survived parsimony
            foreach (var kvp in CompactPeptideToProteinPeptideMatching)
            {
                kvp.Value.RemoveWhere(p => !parsimonyProteinList.ContainsKey(p.Protein));
            }

            return ConstructProteinGroups(
                new HashSet<PeptideWithSetModifications>(proteinsWithUniquePeptides.Values.SelectMany(p => p)),
                new HashSet<PeptideWithSetModifications>(CompactPeptideToProteinPeptideMatching.Values.SelectMany(p => p)));
        }