/// <summary>
/// Adds a theoretical proteoform to <paramref name="new_theoreticals"/> for every top-down proteoform
/// that maps to <paramref name="prot"/> by base accession and has exactly the sequence
/// <paramref name="seq"/>, unless the list already contains a theoretical with the same PTM set.
/// </summary>
/// <param name="prot">Protein (or protein sequence group) the theoreticals are built from.</param>
/// <param name="seq">Sequence a top-down proteoform must have to be considered.</param>
/// <param name="accession">Accession prefix; "_P" plus the running counter is appended to it.</param>
/// <param name="unmodified_mass">Passed through to the TheoreticalProteoform constructor.</param>
/// <param name="decoy_number">Non-negative values append "_DECOY_n" to the description; negative values append nothing.</param>
/// <param name="lysine_count">Passed through to the TheoreticalProteoform constructor.</param>
/// <param name="new_theoreticals">List that is searched for an existing PTM set and receives the new theoreticals.</param>
/// <param name="ptm_set_counter">First "_P" number to use. Passed by value, so the caller does not see the increments.</param>
/// <param name="mod_ranks">Passed through to the PtmSet constructor.</param>
/// <param name="added_ptm_penalty">Passed through to the PtmSet constructor.</param>
public void add_topdown_theoreticals(ProteinWithGoTerms prot, string seq, string accession, double unmodified_mass, int decoy_number, int lysine_count, List<TheoreticalProteoform> new_theoreticals, int ptm_set_counter, Dictionary<double, int> mod_ranks, int added_ptm_penalty)
{
    // Reduce every protein accession to its base form (text before the first '_' and the first '-'),
    // the same normalisation that is applied to the top-down accession below. Previously only '_' was
    // stripped on the protein side, so a protein accession containing '-' could never match.
    // Built once instead of once per top-down proteoform.
    HashSet<string> protein_base_accessions = new HashSet<string>(
        prot.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]));

    // Order by accession, then by descending sequence length: the order matters for creating
    // theoreticals because the first proteoform with a given PTM set wins.
    IEnumerable<TopDownProteoform> matching_topdowns = Sweet.lollipop.topdown_proteoforms
        .Where(p => protein_base_accessions.Contains(p.accession.Split('_')[0].Split('-')[0]) && p.sequence == seq)
        .OrderBy(p => p.accession)
        .ThenByDescending(p => p.sequence.Length);

    foreach (TopDownProteoform topdown in matching_topdowns)
    {
        // A theoretical with the same PTM set already explains this top-down proteoform.
        if (new_theoreticals.Any(existing => existing.ptm_set.same_ptmset(topdown.topdown_ptm_set, true)))
        {
            continue;
        }

        // Otherwise make a new theoretical proteoform carrying the top-down PTM combination.
        PtmSet ptm_set = new PtmSet(topdown.topdown_ptm_set.ptm_combination, mod_ranks, added_ptm_penalty);
        string counter_suffix = "_P" + ptm_set_counter.ToString();
        string decoy_suffix = decoy_number < 0 ? "" : "_DECOY_" + decoy_number.ToString();

        // A sequence group contributes all of its member proteins; a plain protein only itself.
        var source_proteins = prot is ProteinSequenceGroup sequence_group
            ? sequence_group.proteinWithGoTermList.ToArray()
            : new ProteinWithGoTerms[] { prot };

        TheoreticalProteoform theoretical = new TheoreticalProteoform(
            accession + counter_suffix,
            prot.FullDescription + counter_suffix + decoy_suffix,
            seq,
            source_proteins,
            unmodified_mass,
            lysine_count,
            ptm_set,
            decoy_number < 0,
            false,
            theoretical_proteins);
        theoretical.topdown_theoretical = true;

        new_theoreticals.Add(theoretical);
        ptm_set_counter++;
    }
}
/// <summary>
/// If a top-down proteoform's sequence doesn't exist among the expanded proteins, adds it.
/// For every top-down proteoform whose base accession is found in <c>expanded_proteins</c>, the gene
/// name and gene ID are filled in, and a new top-down protein entry is created when no entry with that
/// accession already has the proteoform's sequence. Proteoforms whose accession is not found are
/// marked as not accepted. The new entries are appended to <c>expanded_proteins</c> at the end.
/// </summary>
public void add_topdown_sequences()
{
    List<ProteinWithGoTerms> new_proteins = new List<ProteinWithGoTerms>();

    foreach (TopDownProteoform topdown in Sweet.lollipop.topdown_proteoforms.OrderBy(t => t.modified_mass))
    {
        // Base accession = text before the first '_' and before the first '-'. Computed once and used
        // for every comparison below; previously one comparison skipped the '-' split, so the
        // duplicate counter stayed at 0 for top-down accessions containing '-'.
        string base_accession = topdown.accession.Split('_')[0].Split('-')[0];

        List<ProteinWithGoTerms> candidate_theoreticals = expanded_proteins
            .Where(p => p.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(base_accession))
            .ToList();

        if (candidate_theoreticals.Count == 0)
        {
            // Accession is not in the database at all.
            topdown.accepted = false;
            continue;
        }

        topdown.gene_name = new GeneName(candidate_theoreticals.SelectMany(t => t.GeneNames));
        topdown.geneID = string.Join("; ", candidate_theoreticals.SelectMany(p => p.DatabaseReferences.Where(r => r.Type == "GeneID").Select(r => r.Id)).Distinct());

        // Nothing to add when the sequence is already known for this accession, either in the
        // database or among the entries created earlier in this loop.
        bool sequence_already_present =
            candidate_theoreticals.Any(p => p.BaseSequence == topdown.sequence)
            || new_proteins.Any(p => p.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(base_accession)
                && p.BaseSequence == topdown.sequence);
        if (sequence_already_present)
        {
            continue;
        }

        // Count entries that cover the same begin/end positions with a different sequence; the count
        // is appended to the generated accession so that such entries stay distinguishable.
        int old_proteins_with_same_begin_end_diff_sequence = candidate_theoreticals.Count(t =>
            t.ProteolysisProducts.First().OneBasedBeginPosition == topdown.topdown_begin
            && t.ProteolysisProducts.First().OneBasedEndPosition == topdown.topdown_end
            && t.BaseSequence != topdown.sequence);
        int new_proteins_with_same_begin_end_diff_sequence = new_proteins.Count(t =>
            t.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(base_accession)
            && t.ProteolysisProducts.First().OneBasedBeginPosition == topdown.topdown_begin
            && t.ProteolysisProducts.First().OneBasedEndPosition == topdown.topdown_end
            && t.BaseSequence != topdown.sequence);
        int count = old_proteins_with_same_begin_end_diff_sequence + new_proteins_with_same_begin_end_diff_sequence;

        // The first candidate supplies the annotation (gene names, modifications, names, references,
        // GO terms) for the new entry.
        // NOTE(review): the template's OneBasedPossibleLocalizedModifications is passed through as-is,
        // not re-indexed to the fragment — confirm that downstream code expects this.
        ProteinWithGoTerms template = candidate_theoreticals.First();
        ProteinWithGoTerms topdown_protein = new ProteinWithGoTerms(
            topdown.sequence,
            base_accession + "_" + topdown.topdown_begin + "frag" + topdown.topdown_end + (count > 0 ? "_" + count : ""),
            template.GeneNames.ToList(),
            template.OneBasedPossibleLocalizedModifications,
            new List<ProteolysisProduct>() { new ProteolysisProduct(topdown.topdown_begin, topdown.topdown_end, "full") },
            template.Name,
            template.FullName,
            false,
            false,
            template.DatabaseReferences,
            template.GoTerms);
        topdown_protein.topdown_protein = true;
        new_proteins.Add(topdown_protein);
    }

    expanded_proteins = expanded_proteins.Concat(new_proteins).ToArray();
}
/// <summary>
/// If a top-down proteoform's sequence doesn't exist among the expanded proteins, adds it.
/// A top-down proteoform is kept only when its own accession and the accessions of all of its
/// ambiguous hits are found in <c>expanded_proteins</c>; otherwise it is moved to
/// <c>Sweet.lollipop.topdown_proteoforms_no_theoretical</c>. For kept proteoforms the gene name is
/// filled in and, when no entry with that accession already has the proteoform's sequence, a new
/// top-down protein entry is created. The new entries are appended to <c>expanded_proteins</c>.
/// </summary>
public void add_topdown_sequences()
{
    List<ProteinWithGoTerms> new_proteins = new List<ProteinWithGoTerms>();

    foreach (TopDownProteoform topdown in Sweet.lollipop.topdown_proteoforms.OrderBy(t => t.modified_mass))
    {
        // Base accession = text before the first '_' and before the first '-'. Computed once and used
        // for every comparison below; previously one comparison skipped the '-' split, so the
        // duplicate counter stayed at 0 for top-down accessions containing '-'.
        string base_accession = topdown.accession.Split('_')[0].Split('-')[0];

        List<ProteinWithGoTerms> candidate_theoreticals = expanded_proteins
            .Where(p => p.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(base_accession))
            .ToList();

        // Every ambiguous hit must be in the database too; hits that are found get their gene name
        // assigned even when another hit turns out to be missing.
        bool accessions_in_database = true;
        foreach (var hit in topdown.ambiguous_topdown_hits)
        {
            string hit_base_accession = hit.accession.Split('_')[0].Split('-')[0];
            var ambiguous_hit_theoretical = expanded_proteins
                .Where(p => p.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(hit_base_accession))
                .ToList();

            if (ambiguous_hit_theoretical.Count > 0)
            {
                hit.gene_name = new GeneName(ambiguous_hit_theoretical.SelectMany(t => t.GeneNames));
            }
            else
            {
                accessions_in_database = false;
            }
        }

        if (candidate_theoreticals.Count == 0 || !accessions_in_database)
        {
            Sweet.lollipop.topdown_proteoforms_no_theoretical.Add(topdown);
            continue;
        }

        topdown.topdown_geneName = new GeneName(candidate_theoreticals.SelectMany(t => t.GeneNames));

        // Nothing to add when the sequence is already known for this accession, either in the
        // database or among the entries created earlier in this loop.
        bool sequence_already_present =
            candidate_theoreticals.Any(p => p.BaseSequence == topdown.sequence)
            || new_proteins.Any(p => p.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(base_accession)
                && p.BaseSequence == topdown.sequence);
        if (sequence_already_present)
        {
            continue;
        }

        // Count entries that cover the same begin/end positions with a different sequence; the count
        // is appended to the generated accession so that such entries stay distinguishable.
        int old_proteins_with_same_begin_end_diff_sequence = candidate_theoreticals.Count(t =>
            t.ProteolysisProducts.First().OneBasedBeginPosition == topdown.topdown_begin
            && t.ProteolysisProducts.First().OneBasedEndPosition == topdown.topdown_end
            && t.BaseSequence != topdown.sequence);
        int new_proteins_with_same_begin_end_diff_sequence = new_proteins.Count(t =>
            t.AccessionList.Select(a => a.Split('_')[0].Split('-')[0]).Contains(base_accession)
            && t.ProteolysisProducts.First().OneBasedBeginPosition == topdown.topdown_begin
            && t.ProteolysisProducts.First().OneBasedEndPosition == topdown.topdown_end
            && t.BaseSequence != topdown.sequence);
        int count = old_proteins_with_same_begin_end_diff_sequence + new_proteins_with_same_begin_end_diff_sequence;

        // The first candidate supplies the annotation (gene names, modifications, names, references,
        // GO terms) for the new entry.
        // NOTE(review): the template's OneBasedPossibleLocalizedModifications is passed through as-is,
        // not re-indexed to the fragment — confirm that downstream code expects this.
        ProteinWithGoTerms template = candidate_theoreticals.First();
        ProteinWithGoTerms topdown_protein = new ProteinWithGoTerms(
            topdown.sequence,
            base_accession + "_" + topdown.topdown_begin + "frag" + topdown.topdown_end + (count > 0 ? "_" + count : ""),
            template.GeneNames.ToList(),
            template.OneBasedPossibleLocalizedModifications,
            new List<ProteolysisProduct>() { new ProteolysisProduct(topdown.topdown_begin, topdown.topdown_end, "full") },
            template.Name,
            template.FullName,
            false,
            false,
            template.DatabaseReferences,
            template.GoTerms);
        topdown_protein.topdown_protein = true;
        new_proteins.Add(topdown_protein);
    }

    // Drop the proteoforms that could not be matched to the database, then publish the new entries.
    Sweet.lollipop.topdown_proteoforms = Sweet.lollipop.topdown_proteoforms.Except(Sweet.lollipop.topdown_proteoforms_no_theoretical).ToList();
    expanded_proteins = expanded_proteins.Concat(new_proteins).ToArray();
}
/// <summary>
/// Builds the theoretical proteoforms for one sequence of <paramref name="prot"/> and appends them to
/// <paramref name="theoretical_proteoforms"/> (under a lock on that list). Sequences longer than 3000
/// residues, or containing a residue that is missing from <c>aaIsotopeMassList</c>, are skipped.
/// </summary>
/// <param name="seq">Sequence the theoreticals are built for.</param>
/// <param name="prot">Protein (or protein sequence group) the sequence belongs to.</param>
/// <param name="modifications">Localized modifications by position; copied, never modified here.</param>
/// <param name="accession">Accession prefix; "_P" plus a running counter is appended to it.</param>
/// <param name="theoretical_proteoforms">Shared output list that receives the new theoreticals.</param>
/// <param name="decoy_number">Non-negative values append "_DECOY_n" to the description; negative values append nothing.</param>
/// <param name="variableModifications">Modifications added at every position of <c>prot.BaseSequence</c> whose residue equals the modification's target.</param>
public void EnterTheoreticalProteformFamily(string seq, ProteinWithGoTerms prot, IDictionary<int, List<Modification>> modifications, string accession, List<TheoreticalProteoform> theoretical_proteoforms, int decoy_number, IEnumerable<Modification> variableModifications)
{
    // Guard: nothing is produced for over-long sequences or unknown residues.
    if (seq.Length > 3000 || seq.Any(residue => !aaIsotopeMassList.ContainsKey(residue)))
    {
        return;
    }

    List<TheoreticalProteoform> family = new List<TheoreticalProteoform>();

    // Properties of this sequence.
    double unmodified_mass = TheoreticalProteoform.CalculateProteoformMass(seq, new List<Ptm>());
    int lysine_count = seq.Count(residue => residue == 'K');
    bool check_contaminants = theoretical_proteins.Any(item => item.Key.ContaminantDB);

    // Work on a copy (new dictionary, new lists) so the caller's modifications stay untouched.
    Dictionary<int, List<Modification>> possibleLocalizedMods = modifications.ToDictionary(
        kv => kv.Key,
        kv => new List<Modification>(kv.Value));

    // Add each variable modification at every one-based position whose residue matches its target.
    foreach (Modification variable_mod in variableModifications)
    {
        for (int position = 1; position <= prot.BaseSequence.Length; position++)
        {
            if (prot.BaseSequence[position - 1].ToString() != variable_mod.Target.ToString())
            {
                continue;
            }

            if (possibleLocalizedMods.TryGetValue(position, out List<Modification> mods_at_position))
            {
                mods_at_position.Add(variable_mod);
            }
            else
            {
                possibleLocalizedMods.Add(position, new List<Modification> { variable_mod });
            }
        }
    }

    int ptm_set_counter = 1;

    // Top-down protein sequences only get the PTMs of their top-down proteoforms, which
    // add_topdown_theoreticals handles below; all other proteins get every PTM combination.
    if (!prot.topdown_protein)
    {
        List<PtmSet> unique_ptm_groups = PtmCombos.get_combinations(
            possibleLocalizedMods,
            Sweet.lollipop.max_ptms,
            Sweet.lollipop.modification_ranks,
            Sweet.lollipop.mod_rank_first_quartile / 2,
            limit_triples_and_greater);

        // The PTM combinations are numbered with _P# to distinguish them from the counts in
        // ProteinSequenceGroups (_#G) and TheoreticalPfGps (_#T).
        foreach (PtmSet ptm_set in unique_ptm_groups)
        {
            string counter_suffix = "_P" + ptm_set_counter.ToString();
            string decoy_suffix = decoy_number < 0 ? "" : "_DECOY_" + decoy_number.ToString();

            // A sequence group contributes all of its member proteins; a plain protein only itself.
            var source_proteins = prot is ProteinSequenceGroup sequence_group
                ? sequence_group.proteinWithGoTermList.ToArray()
                : new ProteinWithGoTerms[] { prot };

            TheoreticalProteoform theoretical = new TheoreticalProteoform(
                accession + counter_suffix,
                prot.FullDescription + counter_suffix + decoy_suffix,
                seq,
                source_proteins,
                unmodified_mass,
                lysine_count,
                ptm_set,
                decoy_number < 0,
                check_contaminants,
                theoretical_proteins);
            theoretical.topdown_theoretical = prot.topdown_protein;

            family.Add(theoretical);
            ptm_set_counter++;
        }
    }

    add_topdown_theoreticals(
        prot,
        seq,
        accession,
        unmodified_mass,
        decoy_number,
        lysine_count,
        family,
        ptm_set_counter,
        Sweet.lollipop.modification_ranks,
        Sweet.lollipop.mod_rank_first_quartile / 2);

    // The output list is shared, so it is only touched while holding its lock.
    lock (theoretical_proteoforms)
    {
        theoretical_proteoforms.AddRange(family);
    }
}