Esempio n. 1
0
 /// <summary>
 /// Creates a top-down theoretical proteoform for every top-down proteoform that matches this protein and
 /// sequence, unless <paramref name="new_theoreticals"/> already holds a theoretical with the same PTM set.
 /// </summary>
 /// <param name="prot">Protein (or protein sequence group) the theoreticals are built for.</param>
 /// <param name="seq">Sequence a top-down proteoform must have exactly in order to be considered.</param>
 /// <param name="accession">Accession prefix; "_P" plus the running counter is appended to it.</param>
 /// <param name="unmodified_mass">Passed through to the TheoreticalProteoform constructor.</param>
 /// <param name="decoy_number">Negative means no "_DECOY_" suffix is added to the description.</param>
 /// <param name="lysine_count">Passed through to the TheoreticalProteoform constructor.</param>
 /// <param name="new_theoreticals">List that is searched for existing PTM sets and that receives the new theoreticals.</param>
 /// <param name="ptm_set_counter">First value used for the "_P#" suffix; incremented locally per added theoretical (passed by value).</param>
 /// <param name="mod_ranks">Passed to the PtmSet constructor.</param>
 /// <param name="added_ptm_penalty">Passed to the PtmSet constructor.</param>
 public void add_topdown_theoreticals(ProteinWithGoTerms prot, string seq, string accession, double unmodified_mass, int decoy_number, int lysine_count, List <TheoreticalProteoform> new_theoreticals, int ptm_set_counter, Dictionary <double, int> mod_ranks, int added_ptm_penalty)
 {
     //candidates: accession matches one of the protein's accessions and the sequence is identical.
     //ordered by accession, then by descending sequence length --> order matters for creating theoreticals.
     //NOTE(review): the protein accession is only split on '_' while the top-down accession is also split on '-' -- confirm this asymmetry is intended.
     var matching_topdowns = Sweet.lollipop.topdown_proteoforms
         .Where(td => prot.AccessionList.Select(acc => acc.Split('_')[0]).Contains(td.accession.Split('_')[0].Split('-')[0])
                      && td.sequence == seq)
         .OrderBy(td => td.accession)
         .ThenByDescending(td => td.sequence.Length);

     foreach (TopDownProteoform topdown in matching_topdowns)
     {
         //a theoretical with this PTM set already exists --> nothing to add for this top-down proteoform
         if (new_theoreticals.Any(existing => existing.ptm_set.same_ptmset(topdown.topdown_ptm_set, true)))
         {
             continue;
         }

         //otherwise make a new theoretical proteoform carrying the top-down proteoform's PTM combination
         PtmSet ptm_set = new PtmSet(topdown.topdown_ptm_set.ptm_combination, mod_ranks, added_ptm_penalty);

         ProteinWithGoTerms[] source_proteins;
         if (prot is ProteinSequenceGroup sequence_group)
         {
             source_proteins = sequence_group.proteinWithGoTermList.ToArray();
         }
         else
         {
             source_proteins = new ProteinWithGoTerms[] { prot };
         }

         string ptm_suffix   = "_P" + ptm_set_counter.ToString();
         string decoy_suffix = decoy_number < 0 ? "" : "_DECOY_" + decoy_number.ToString();

         TheoreticalProteoform theoretical = new TheoreticalProteoform(
             accession + ptm_suffix,
             prot.FullDescription + ptm_suffix + decoy_suffix,
             seq,
             source_proteins,
             unmodified_mass,
             lysine_count,
             ptm_set,
             decoy_number < 0,
             false,
             theoretical_proteins);
         theoretical.topdown_theoretical = true;

         new_theoreticals.Add(theoretical);
         ptm_set_counter++;
     }
 }
Esempio n. 2
0
        /// <summary>
        /// If a top-down proteoform's sequence doesn't exist among the database proteins with its accession, add a new protein entry for it.
        /// Top-down proteoforms whose accession matches no protein in <c>expanded_proteins</c> are marked as not accepted.
        /// For the others, gene name and GeneID are filled in from the matching proteins, and the new entries are
        /// appended to <c>expanded_proteins</c> once all proteoforms have been processed.
        /// </summary>
        public void add_topdown_sequences()
        {
            List<ProteinWithGoTerms> new_proteins = new List<ProteinWithGoTerms>();

            foreach (TopDownProteoform topdown in Sweet.lollipop.topdown_proteoforms.OrderBy(t => t.modified_mass))
            {
                //base accession: text before the first '_' with any '-' suffix removed.
                //computed once so every comparison below uses the same normalisation on both sides.
                string topdown_accession = topdown.accession.Split('_')[0].Split('-')[0];

                List<ProteinWithGoTerms> candidate_theoreticals = expanded_proteins.Where(protein => protein.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(topdown_accession)).ToList();
                if (candidate_theoreticals.Count == 0)
                {
                    topdown.accepted = false;
                    continue;
                }

                topdown.gene_name = new GeneName(candidate_theoreticals.SelectMany(protein => protein.GeneNames));
                topdown.geneID    = string.Join("; ", candidate_theoreticals.SelectMany(protein => protein.DatabaseReferences.Where(r => r.Type == "GeneID").Select(r => r.Id)).Distinct());

                //nothing to add if the sequence is already in the database or was already added in this run
                bool sequence_already_present = candidate_theoreticals.Any(protein => protein.BaseSequence == topdown.sequence)
                                                || new_proteins.Any(added => added.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(topdown_accession) && added.BaseSequence == topdown.sequence);
                if (sequence_already_present)
                {
                    continue;
                }

                //count entries with the same begin/end but a different sequence, so the new accession gets a distinguishing "_<count>" suffix.
                int old_proteins_with_same_begin_end_diff_sequence = candidate_theoreticals.Count(protein => protein.ProteolysisProducts.First().OneBasedBeginPosition == topdown.topdown_begin && protein.ProteolysisProducts.First().OneBasedEndPosition == topdown.topdown_end && protein.BaseSequence != topdown.sequence);
                //fix: compare against the normalised accession. Previously the '-' suffix was stripped on the protein side only,
                //so a top-down accession containing '-' never matched the entries added below and duplicate accessions could be generated.
                int new_proteins_with_same_begin_end_diff_sequence = new_proteins.Count(added => added.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(topdown_accession) && added.ProteolysisProducts.First().OneBasedBeginPosition == topdown.topdown_begin && added.ProteolysisProducts.First().OneBasedEndPosition == topdown.topdown_end && added.BaseSequence != topdown.sequence);
                int count = old_proteins_with_same_begin_end_diff_sequence + new_proteins_with_same_begin_end_diff_sequence;

                //annotations (gene names, modifications, names, references, GO terms) are copied from the first matching database protein
                ProteinWithGoTerms template    = candidate_theoreticals.First();
                ProteinWithGoTerms new_protein = new ProteinWithGoTerms(
                    topdown.sequence,
                    topdown_accession + "_" + topdown.topdown_begin + "frag" + topdown.topdown_end + (count > 0 ? "_" + count : ""),
                    template.GeneNames.ToList(),
                    template.OneBasedPossibleLocalizedModifications,
                    new List<ProteolysisProduct>()
                    {
                        new ProteolysisProduct(topdown.topdown_begin, topdown.topdown_end, "full")
                    },
                    template.Name,
                    template.FullName,
                    false,
                    false,
                    template.DatabaseReferences,
                    template.GoTerms);
                new_protein.topdown_protein = true;
                new_proteins.Add(new_protein);
            }
            expanded_proteins = expanded_proteins.Concat(new_proteins).ToArray();
        }
        /// <summary>
        /// If a top-down proteoform's sequence doesn't exist among the database proteins with its accession, add a new protein entry for it.
        /// A top-down proteoform is moved to <c>Sweet.lollipop.topdown_proteoforms_no_theoretical</c> when its own accession,
        /// or the accession of any of its ambiguous hits, matches no protein in <c>expanded_proteins</c>.
        /// Gene names are filled in for the proteoform and for each ambiguous hit that is found, and the new entries
        /// are appended to <c>expanded_proteins</c> once all proteoforms have been processed.
        /// </summary>
        public void add_topdown_sequences()
        {
            List<ProteinWithGoTerms> new_proteins = new List<ProteinWithGoTerms>();

            foreach (TopDownProteoform topdown in Sweet.lollipop.topdown_proteoforms.OrderBy(t => t.modified_mass))
            {
                //base accession: text before the first '_' with any '-' suffix removed.
                //computed once so every comparison below uses the same normalisation on both sides.
                string topdown_accession = topdown.accession.Split('_')[0].Split('-')[0];

                List<ProteinWithGoTerms> candidate_theoreticals = expanded_proteins.Where(protein => protein.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(topdown_accession)).ToList();

                //every ambiguous hit must be in the database too; those that are get their gene name assigned
                bool accessions_in_database = true;
                foreach (var hit in topdown.ambiguous_topdown_hits)
                {
                    string hit_accession = hit.accession.Split('_')[0].Split('-')[0];
                    var ambiguous_hit_theoretical = expanded_proteins.Where(protein => protein.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(hit_accession)).ToList();
                    if (ambiguous_hit_theoretical.Count > 0)
                    {
                        hit.gene_name = new GeneName(ambiguous_hit_theoretical.SelectMany(protein => protein.GeneNames));
                    }
                    else
                    {
                        accessions_in_database = false;
                    }
                }

                if (candidate_theoreticals.Count == 0 || !accessions_in_database)
                {
                    Sweet.lollipop.topdown_proteoforms_no_theoretical.Add(topdown);
                    continue;
                }

                topdown.topdown_geneName = new GeneName(candidate_theoreticals.SelectMany(protein => protein.GeneNames));

                //nothing to add if the sequence is already in the database or was already added in this run
                bool sequence_already_present = candidate_theoreticals.Any(protein => protein.BaseSequence == topdown.sequence)
                                                || new_proteins.Any(added => added.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(topdown_accession) && added.BaseSequence == topdown.sequence);
                if (sequence_already_present)
                {
                    continue;
                }

                //count entries with the same begin/end but a different sequence, so the new accession gets a distinguishing "_<count>" suffix.
                int old_proteins_with_same_begin_end_diff_sequence = candidate_theoreticals.Count(protein => protein.ProteolysisProducts.First().OneBasedBeginPosition == topdown.topdown_begin && protein.ProteolysisProducts.First().OneBasedEndPosition == topdown.topdown_end && protein.BaseSequence != topdown.sequence);
                //fix: compare against the normalised accession. Previously the '-' suffix was stripped on the protein side only,
                //so a top-down accession containing '-' never matched the entries added below and duplicate accessions could be generated.
                int new_proteins_with_same_begin_end_diff_sequence = new_proteins.Count(added => added.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(topdown_accession) && added.ProteolysisProducts.First().OneBasedBeginPosition == topdown.topdown_begin && added.ProteolysisProducts.First().OneBasedEndPosition == topdown.topdown_end && added.BaseSequence != topdown.sequence);
                int count = old_proteins_with_same_begin_end_diff_sequence + new_proteins_with_same_begin_end_diff_sequence;

                //annotations (gene names, modifications, names, references, GO terms) are copied from the first matching database protein
                ProteinWithGoTerms template    = candidate_theoreticals.First();
                ProteinWithGoTerms new_protein = new ProteinWithGoTerms(
                    topdown.sequence,
                    topdown_accession + "_" + topdown.topdown_begin + "frag" + topdown.topdown_end + (count > 0 ? "_" + count : ""),
                    template.GeneNames.ToList(),
                    template.OneBasedPossibleLocalizedModifications,
                    new List<ProteolysisProduct>()
                    {
                        new ProteolysisProduct(topdown.topdown_begin, topdown.topdown_end, "full")
                    },
                    template.Name,
                    template.FullName,
                    false,
                    false,
                    template.DatabaseReferences,
                    template.GoTerms);
                new_protein.topdown_protein = true;
                new_proteins.Add(new_protein);
            }
            //drop the proteoforms that could not be matched to the database, then publish the added proteins
            Sweet.lollipop.topdown_proteoforms = Sweet.lollipop.topdown_proteoforms.Except(Sweet.lollipop.topdown_proteoforms_no_theoretical).ToList();
            expanded_proteins = expanded_proteins.Concat(new_proteins).ToArray();
        }
Esempio n. 4
0
        /// <summary>
        /// Builds the theoretical proteoforms for one protein sequence and appends them to <paramref name="theoretical_proteoforms"/>.
        /// Sequences longer than 3000 residues, or containing a residue missing from <c>aaIsotopeMassList</c>, are skipped.
        /// </summary>
        /// <param name="seq">Sequence the theoreticals are built for.</param>
        /// <param name="prot">Source protein; its BaseSequence is scanned for variable-modification targets.</param>
        /// <param name="modifications">Modifications keyed by position; the lists are copied, so the input is not modified.</param>
        /// <param name="accession">Accession prefix; "_P#" is appended per PTM set.</param>
        /// <param name="theoretical_proteoforms">Shared output list; it is locked while the new theoreticals are appended.</param>
        /// <param name="decoy_number">Negative means no "_DECOY_" suffix is added to the description.</param>
        /// <param name="variableModifications">Modifications added at every position whose residue equals the modification's Target.</param>
        public void EnterTheoreticalProteformFamily(string seq, ProteinWithGoTerms prot, IDictionary <int, List <Modification> > modifications, string accession, List <TheoreticalProteoform> theoretical_proteoforms, int decoy_number, IEnumerable <Modification> variableModifications)
        {
            //only sequences of at most 3000 residues made up entirely of known residues are processed
            bool sequence_is_usable = seq.Length <= 3000 && seq.All(residue => aaIsotopeMassList.ContainsKey(residue));
            if (!sequence_is_usable)
            {
                return;
            }

            List<TheoreticalProteoform> new_theoreticals = new List<TheoreticalProteoform>();

            //Properties of this sequence
            double unmodified_mass    = TheoreticalProteoform.CalculateProteoformMass(seq, new List<Ptm>());
            int    lysine_count       = seq.Count(residue => residue == 'K');
            bool   check_contaminants = theoretical_proteins.Any(item => item.Key.ContaminantDB);

            //Working copy of the localized modifications, extended below with the variable modifications
            Dictionary<int, List<Modification>> localized_mods = modifications.ToDictionary(kv => kv.Key, kv => new List<Modification>(kv.Value));

            foreach (Modification variable_mod in variableModifications)
            {
                for (int index = 0; index < prot.BaseSequence.Length; index++)
                {
                    if (prot.BaseSequence[index].ToString() != variable_mod.Target.ToString())
                    {
                        continue;
                    }

                    int position = index + 1; //dictionary keys are one-based positions
                    if (localized_mods.TryGetValue(position, out List<Modification> mods_here))
                    {
                        mods_here.Add(variable_mod);
                    }
                    else
                    {
                        localized_mods.Add(position, new List<Modification> { variable_mod });
                    }
                }
            }

            int ptm_set_counter = 1;

            //a top-down protein sequence only gets the PTMs of its top-down proteoforms (added in add_topdown_theoreticals below)
            if (!prot.topdown_protein)
            {
                List<PtmSet> ptm_sets = PtmCombos.get_combinations(localized_mods, Sweet.lollipop.max_ptms, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2, limit_triples_and_greater);

                //Each ptm combination is numbered with _P# to distinguish it from the counts in ProteinSequenceGroups (_#G) and TheoreticalPfGps (_#T)
                foreach (PtmSet ptm_set in ptm_sets)
                {
                    ProteinWithGoTerms[] source_proteins;
                    if (prot is ProteinSequenceGroup sequence_group)
                    {
                        source_proteins = sequence_group.proteinWithGoTermList.ToArray();
                    }
                    else
                    {
                        source_proteins = new ProteinWithGoTerms[] { prot };
                    }

                    string ptm_suffix   = "_P" + ptm_set_counter.ToString();
                    string decoy_suffix = decoy_number < 0 ? "" : "_DECOY_" + decoy_number.ToString();

                    TheoreticalProteoform theoretical = new TheoreticalProteoform(
                        accession + ptm_suffix,
                        prot.FullDescription + ptm_suffix + decoy_suffix,
                        seq,
                        source_proteins,
                        unmodified_mass,
                        lysine_count,
                        ptm_set,
                        decoy_number < 0,
                        check_contaminants,
                        theoretical_proteins);
                    theoretical.topdown_theoretical = prot.topdown_protein;

                    new_theoreticals.Add(theoretical);
                    ptm_set_counter++;
                }
            }

            add_topdown_theoreticals(prot, seq, accession, unmodified_mass, decoy_number, lysine_count, new_theoreticals, ptm_set_counter, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2);

            lock (theoretical_proteoforms)
            {
                theoretical_proteoforms.AddRange(new_theoreticals);
            }
        }