/// <summary>
/// Copies the aggregation state of <paramref name="e"/> into this proteoform.
/// The quantitative values (quant) are intentionally left out.
/// </summary>
/// <param name="e">Proteoform whose aggregation state is copied.</param>
private void copy_aggregate(ExperimentalProteoform e)
{
    // Scalar aggregation values (quant is deliberately not copied)
    this.root = e.root;
    this.agg_intensity = e.agg_intensity;
    this.agg_mass = e.agg_mass;
    this.modified_mass = e.modified_mass;
    this.agg_rt = e.agg_rt;
    this.lysine_count = e.lysine_count;
    this.is_target = e.is_target;
    this.family = e.family;

    // Collections get their own list instances, so adding to or removing from
    // one proteoform's list does not affect the other's.
    this.aggregated = new List<IAggregatable>(e.aggregated);
    this.lt_quant_components = new List<Component>(e.lt_quant_components);
    this.lt_verification_components = new List<Component>(e.lt_verification_components);
    this.hv_quant_components = new List<Component>(e.hv_quant_components);
    this.hv_verification_components = new List<Component>(e.hv_verification_components);
    this.biorepIntensityList = new List<BiorepIntensity>(e.biorepIntensityList);

    // Manual validation descriptions
    this.manual_validation_id = e.manual_validation_id;
    this.manual_validation_verification = e.manual_validation_verification;
    this.manual_validation_quant = e.manual_validation_quant;
}
/// <summary>
/// Walks the accepted relations of this proteoform and tries to assign an identity to the
/// experimental proteoform on the other side of each relation.
/// </summary>
/// <returns>The experimental proteoforms for which assign_pf_identity was invoked.</returns>
public List<ExperimentalProteoform> identify_connected_experimentals()
{
    List<ExperimentalProteoform> newly_identified = new List<ExperimentalProteoform>();

    // Relations closest to their candidate ptmset delta mass come first, then the rest
    // in order of relation delta mass (the order has to be the same every round).
    List<ProteoformRelation> ordered_relations = relationships
        .Where(rel => rel.Accepted)
        .OrderBy(rel => rel.candidate_ptmset != null ? Math.Abs(rel.candidate_ptmset.mass - rel.DeltaMass) : rel.DeltaMass * 1e6)
        .Distinct()
        .ToList();

    foreach (ProteoformRelation r in ordered_relations)
    {
        ExperimentalProteoform e = r.connected_proteoforms
            .OfType<ExperimentalProteoform>()
            .FirstOrDefault(p => p != this);
        if (e == null)
        {
            continue; // Looking at an ET pair, expecting an EE pair
        }

        // Theoretical starting point, else the theoretical reference of this experimental,
        // else null (experimental without theoretical reference).
        TheoreticalProteoform theoretical_base =
            (this as TheoreticalProteoform) ?? (linked_proteoform_references.First() as TheoreticalProteoform);

        double mass_tolerance = modified_mass / 1000000 * Sweet.lollipop.mass_tolerance;
        PtmSet with_mod_change = determine_mod_change(e, this, theoretical_base, r, this.ptm_set);

        if (with_mod_change == null)
        {
            // No modification change was found: only a delta mass within tolerance
            // lets the neighbour inherit this proteoform's ptm set unchanged.
            if (Math.Abs(r.peak.DeltaMass) <= mass_tolerance)
            {
                lock (r)
                    lock (e)
                        assign_pf_identity(e, ptm_set, r, theoretical_base);
                newly_identified.Add(e);
            }
            continue;
        }

        lock (r)
            lock (e)
                assign_pf_identity(e, with_mod_change, r, theoretical_base);
        newly_identified.Add(e);
    }

    return newly_identified;
}
//Selecting numerator and denominator is not implemented
/// <summary>
/// Creates the quantitative values for an experimental proteoform and links the two
/// objects to each other.
/// </summary>
/// <param name="eP">Proteoform that will own these quantitative values.</param>
public QuantitativeProteoformValues(ExperimentalProteoform eP)
{
    // Back-reference on the proteoform first, then the forward reference on this object.
    eP.quant = this;
    this.proteoform = eP;
}
// COPYING CONSTRUCTOR
/// <summary>
/// Builds a new experimental proteoform from an existing one: the base is initialised from
/// the source's accession, mass, lysine count and target flag, the aggregation state is
/// copied, and a fresh set of quantitative values is attached.
/// </summary>
/// <param name="eP">Proteoform to copy from.</param>
public ExperimentalProteoform(ExperimentalProteoform eP)
    : base(eP.accession, eP.modified_mass, eP.lysine_count, eP.is_target)
{
    copy_aggregate(eP);

    // Quantitative values are not copied; the copy starts with its own instance.
    this.quant = new QuantitativeProteoformValues(this);
}
/// <summary>
/// Walks the accepted relations of this proteoform and tries to identify the experimental
/// proteoform on the other side of each relation, either by adding a PtmSet to, or by
/// removing a PtmSet from, this proteoform's current ptm set.
/// </summary>
/// <param name="all_possible_ptmsets">Candidate PtmSets considered as losses.</param>
/// <param name="all_mods_with_mass">Modifications passed on to generate_possible_added_ptmsets.</param>
/// <returns>The experimental proteoforms for which assign_pf_identity was invoked.</returns>
public List<ExperimentalProteoform> identify_connected_experimentals(List<PtmSet> all_possible_ptmsets, List<ModificationWithMass> all_mods_with_mass)
{
    List<ExperimentalProteoform> identified = new List<ExperimentalProteoform>();

    foreach (ProteoformRelation r in relationships.Where(rel => rel.Accepted).Distinct().ToList())
    {
        ExperimentalProteoform e = r.connected_proteoforms.OfType<ExperimentalProteoform>().FirstOrDefault(p => p != this);
        if (e == null)
        {
            continue; // Looking at an ET pair, expecting an EE pair
        }

        double mass_tolerance = modified_mass / 1000000 * (double)SaveState.lollipop.mass_tolerance;
        int sign = Math.Sign(e.modified_mass - modified_mass);

        // give EE relations the correct sign, but don't switch negative ET relation deltaM's
        double deltaM = Math.Sign(r.peak.DeltaMass) < 0 ? r.peak.DeltaMass : sign * r.peak.DeltaMass;

        // Theoretical starting point, else the theoretical reference of this experimental,
        // else null (experimental without theoretical reference).
        TheoreticalProteoform theoretical_base =
            (this as TheoreticalProteoform) ?? (linked_proteoform_references.First() as TheoreticalProteoform);
        string theoretical_base_sequence = theoretical_base != null ? theoretical_base.sequence : "";

        PtmSet best_addition = generate_possible_added_ptmsets(r.peak.possiblePeakAssignments, deltaM, mass_tolerance, all_mods_with_mass, theoretical_base, theoretical_base_sequence, 1)
            .OrderBy(x => (double)x.ptm_rank_sum + Math.Abs(x.mass - deltaM) * 10E-6) // major score: delta rank; tie breaker: deltaM, where it's always less than 1
            .FirstOrDefault();

        PtmSet best_loss = null;
        foreach (PtmSet set in all_possible_ptmsets)
        {
            bool within_loss_tolerance = deltaM >= -set.mass - mass_tolerance && deltaM <= -set.mass + mass_tolerance;
            bool better_than_current_best_loss = best_loss == null || Math.Abs(deltaM - (-set.mass)) < Math.Abs(deltaM - (-best_loss.mass));
            if (!within_loss_tolerance || !better_than_current_best_loss)
            {
                continue;
            }

            // All modifications of the candidate must be present in the current set to remove
            // them, counting multiplicity: a candidate that loses the same modification twice
            // needs two copies in the current set. Each match is consumed from a scratch copy
            // so that it cannot be counted again.
            var remaining_mods = this.ptm_set.ptm_combination.Select(ptm => ptm.modification).ToList();
            bool can_be_removed = true;
            foreach (var lost in set.ptm_combination)
            {
                if (!remaining_mods.Remove(lost.modification))
                {
                    can_be_removed = false;
                    break;
                }
            }

            if (can_be_removed)
            {
                best_loss = set;
            }
        }

        if (best_addition == null && best_loss == null)
        {
            // If they're the same and someone hasn't labeled 0 difference with a "ModificationWithMass", then label it null
            if (Math.Abs(r.peak.DeltaMass) <= mass_tolerance)
            {
                lock (r)
                    lock (e)
                        assign_pf_identity(e, this, ptm_set, r, sign, null);
                identified.Add(e);
            }
            continue;
        }

        // Make the new ptmset with ptms removed or added
        PtmSet with_mod_change = null;
        if (best_loss == null)
        {
            with_mod_change = new PtmSet(new List<Ptm>(this.ptm_set.ptm_combination.Concat(best_addition.ptm_combination).Where(ptm => ptm.modification.monoisotopicMass != 0).ToList()));
        }
        else
        {
            List<Ptm> new_combo = new List<Ptm>(this.ptm_set.ptm_combination);
            foreach (Ptm ptm in best_loss.ptm_combination)
            {
                new_combo.Remove(new_combo.FirstOrDefault(asdf => asdf.modification == ptm.modification));
            }
            with_mod_change = new PtmSet(new_combo);
        }

        lock (r)
            lock (e)
                assign_pf_identity(e, this, with_mod_change, r, sign, best_loss != null ? best_loss : best_addition);
        identified.Add(e);
    }

    return identified;
}
/// <summary>
/// Builds the Cytoscape node table for the given families and theoretical proteoforms and
/// returns it as a string via get_table_string.
/// </summary>
/// <param name="families">Families whose experimental proteoforms become nodes.</param>
/// <param name="quantitative">When true, four extra columns (numerator condition, denominator condition, significance, pie chart) are added.</param>
/// <param name="color_scheme">Passed to get_piechart_string for quantified experimental nodes.</param>
/// <param name="node_label">Passed to get_proteoform_shared_name.</param>
/// <param name="node_label_position">Not used by this method.</param>
/// <param name="node_position">Looked up in Lollipop.node_positioning to choose the layout order.</param>
/// <param name="double_rounding">Passed to get_proteoform_shared_name.</param>
/// <param name="theoreticals">Theoretical proteoforms that become nodes.</param>
/// <param name="gene_centric_families">When true, one extra node per distinct preferred gene name is added.</param>
/// <param name="preferred_gene_label">Passed to get_prefered_name when collecting gene names.</param>
/// <returns>The node table rendered by get_table_string.</returns>
public static string get_cytoscape_nodes_tsv(List<ProteoformFamily> families, bool quantitative, string color_scheme, string node_label, string node_label_position, string node_position, int double_rounding, IEnumerable<TheoreticalProteoform> theoreticals, bool gene_centric_families, string preferred_gene_label)
{
    DataTable node_table = new DataTable();
    node_table.Columns.Add("accession", typeof(string));
    node_table.Columns.Add(proteoform_type_header, typeof(string));
    node_table.Columns.Add(size_header, typeof(double));
    node_table.Columns.Add(tooltip_header, typeof(string));
    node_table.Columns.Add(layout_header, typeof(int));
    if (quantitative)
    {
        node_table.Columns.Add(SaveState.lollipop.numerator_condition, typeof(string));
        node_table.Columns.Add(SaveState.lollipop.denominator_condition, typeof(string));
        node_table.Columns.Add(significant_header, typeof(string));
        node_table.Columns.Add(piechart_header, typeof(string));
    }

    //Choose the layout order
    IEnumerable<Proteoform> layout_order;
    switch (Lollipop.node_positioning.ToList().IndexOf(node_position))
    {
        case 0: //arbitrary circle
        case 2: //mass circle
        default:
            layout_order = families.SelectMany(f => f.experimental_proteoforms).OfType<Proteoform>().Concat(theoreticals).OrderBy(p => p.modified_mass);
            break;
        case 1: //mass-based spiral
            layout_order = theoreticals.OrderByDescending(p => p.modified_mass).OfType<Proteoform>().Concat(families.SelectMany(f => f.experimental_proteoforms).OrderBy(p => p.modified_mass));
            break;
    }

    int layout_rank = 1;
    foreach (Proteoform p in layout_order.ToList())
    {
        if (p as TheoreticalProteoform != null)
        {
            string node_type = String.Equals(p.ptm_description, "unmodified", StringComparison.CurrentCultureIgnoreCase) ? unmodified_theoretical_label : modified_theoretical_label;
            node_table.Rows.Add(get_proteoform_shared_name(p, node_label, double_rounding), node_type, mock_intensity, "", layout_rank);
        }

        ExperimentalProteoform ep = p as ExperimentalProteoform;
        if (ep != null)
        {
            bool quantified = quantitative && ep.quant.intensitySum != 0;
            string node_type = quantitative && ep.quant.intensitySum == 0 ? experimental_notQuantified_label : experimental_label;

            //Names and size
            string shared_name = get_proteoform_shared_name(p, node_label, double_rounding);
            string total_intensity = quantitative ?
                ep.quant.intensitySum == 0 ? mock_intensity : ((double)ep.quant.intensitySum).ToString() :
                ep.agg_intensity.ToString();

            //Set tooltip information
            // NOTE(review): the lysine-count element carries its own "; " prefix (or is empty), so the
            // joined tooltip contains a doubled separator either way; kept as-is to leave the output unchanged.
            string tooltip = String.Join("; ", new string[]
            {
                "Accession = " + p.accession.ToString(),
                "Aggregated Mass = " + ep.agg_mass.ToString(),
                "Aggregated Retention Time = " + ep.agg_rt.ToString(),
                "Total Intensity = " + total_intensity,
                "Aggregated Component Count = " + ep.aggregated_components.Count.ToString(),
                SaveState.lollipop.neucode_labeled ? "; Lysine Count = " + p.lysine_count : "",
                "Abundant Component for Manual Validation of Identification: " + ep.manual_validation_id,
                "Abundant Component for Manual Validation of Identification Validation: " + ep.manual_validation_verification
            });

            if (quantitative && ep.quant.intensitySum > 0)
            {
                tooltip += "\\n\\nQuantitation Results:" + String.Join("; ", new string[]
                {
                    "Q-Value = " + ep.quant.FDR.ToString(),
                    "Log2FC = " + ep.quant.logFoldChange.ToString(),
                    "Variance = " + ep.quant.variance.ToString(),
                    "Significant = " + ep.quant.significant.ToString(),
                    SaveState.lollipop.numerator_condition + " Quantitative Component Count = " + ep.lt_quant_components.Count.ToString(),
                    SaveState.lollipop.denominator_condition + " Quantitative Component Count = " + ep.hv_quant_components.Count.ToString(),
                    "Abundant Component for Manual Validation of Quantification: " + ep.manual_validation_quant
                });
            }

            if (quantified)
            {
                node_table.Rows.Add(shared_name, node_type, total_intensity, tooltip, layout_rank, ((double)ep.quant.lightIntensitySum).ToString(), ((double)ep.quant.heavyIntensitySum).ToString(), ep.quant.significant.ToString(), get_piechart_string(color_scheme));
            }
            else if (quantitative)
            {
                // Quantitative table, but nothing was quantified for this node: leave the extra columns blank.
                node_table.Rows.Add(shared_name, node_type, total_intensity, tooltip, layout_rank, "", "", "", "");
            }
            else
            {
                node_table.Rows.Add(shared_name, node_type, total_intensity, tooltip, layout_rank);
            }
        }

        // The rank advances for every proteoform in the layout order, whichever kind it is.
        layout_rank++;
    }

    if (gene_centric_families)
    {
        foreach (string gene_name in theoreticals.Select(t => t.gene_name.get_prefered_name(preferred_gene_label)).Distinct())
        {
            if (gene_name == null)
            {
                continue;
            }

            if (quantitative)
            {
                node_table.Rows.Add(gene_name, gene_name_label, mock_intensity, "Other Gene Names: ", 0, "", "", "", "");
            }
            else
            {
                node_table.Rows.Add(gene_name, gene_name_label, mock_intensity, "Other Gene Names: ", 0);
            }
        }
    }

    return get_table_string(node_table);
}
/// <summary>
/// Tries to assign an identification to the experimental proteoform <paramref name="e"/>, using this
/// proteoform as the node the identification is propagated from.
/// If <paramref name="e"/> has no identification yet, it receives <paramref name="set"/> (after trimming,
/// see below) as its primary identification. Otherwise the new identification either replaces the
/// existing one (when the "reduce ambiguity" settings prefer it), is ignored, or is recorded as an
/// additional ambiguous identification.
/// </summary>
/// <param name="e">Experimental proteoform that receives the identification.</param>
/// <param name="set">Candidate PtmSet; a copy of its ptm list is trimmed, the argument itself is not modified here.</param>
/// <param name="begin">Begin position of the candidate identification.</param>
/// <param name="end">End position of the candidate identification.</param>
/// <param name="r">Relation through which the identification is made.</param>
/// <param name="theoretical_base">Theoretical proteoform the identification is based on; dereferenced without a null check.</param>
/// <param name="linked_proteoform_references">Reference chain leading to this proteoform, or null.</param>
/// <param name="check_ambiguous_IDs">When true, ambiguous identifications of <paramref name="e"/> are pruned and those of this proteoform are propagated to <paramref name="e"/>.</param>
/// <returns>True when a primary or ambiguous identification was assigned to <paramref name="e"/> by this call or a nested call.</returns>
private bool assign_pf_identity(ExperimentalProteoform e, PtmSet set, int begin, int end, ProteoformRelation r, TheoreticalProteoform theoretical_base, List <Proteoform> linked_proteoform_references, bool check_ambiguous_IDs)
{
    bool identification_assigned = false;

    // Either the ppm filter is switched off, or the mass error (scaled to ppm of e's mass) is below the limit.
    if (!Sweet.lollipop.id_use_ppm_tolerance || Math.Abs(e.calculate_mass_error(theoretical_base, set, begin, end) * 1e6 / e.modified_mass) < Sweet.lollipop.id_ppm_tolerance)
    {
        int new_begin = begin;
        int new_end = end;

        // Work on a copy of the ptm list so the caller's set is left untouched.
        PtmSet new_set = new PtmSet(new List <Ptm>(set.ptm_combination));
        List <Ptm> remove = new List <Ptm>();

        //do retention of M first
        // Every "AminoAcid" ptm moves the begin one position back and is dropped from the ptm set.
        foreach (var mod in new_set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
        {
            new_begin--;
            remove.Add(mod);
        }

        // A "Missing" ptm whose target matches the first (or otherwise the last) residue is expressed
        // as a changed begin/end instead of being kept in the ptm set.
        foreach (var mod in new_set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
        {
            if (!new_set.ptm_combination.Any(m => m.modification.ModificationType == "AminoAcid") && begin >= theoretical_base.begin)
            {
                if (theoretical_base.sequence[begin - theoretical_base.begin].ToString() == mod.modification.Target.ToString())
                {
                    new_begin++;
                    remove.Add(mod); //dont have in ptmset --> change the begin & end
                }
            }
            // NOTE(review): unlike the begin check above, this index is not range-guarded — confirm end is always inside the sequence.
            if (!remove.Contains(mod) && theoretical_base.sequence[end - theoretical_base.begin].ToString() == mod.modification.Target.ToString())
            {
                new_end--;
                remove.Add(mod);
            }
        }

        foreach (var ptm in remove)
        {
            new_set.ptm_combination.Remove(ptm);
        }
        new_set = new PtmSet(new_set.ptm_combination);

        if (e.linked_proteoform_references == null)
        {
            // e has no identification yet: this one becomes its primary identification.
            identification_assigned = true;
            if (linked_proteoform_references != null)
            {
                e.linked_proteoform_references = new List <Proteoform>(linked_proteoform_references);
                e.linked_proteoform_references.Add(this);
            }
            else
            {
                e.linked_proteoform_references = new List <Proteoform>() { theoretical_base };
            }
            e.relation_to_id = r;
            e.ptm_set = new_set;
            e.begin = new_begin;
            e.end = new_end;
            if (e.gene_name == null)
            {
                e.gene_name = theoretical_base.gene_name;
            }
            else
            {
                // NOTE(review): the value returned by Concat is discarded; if this is Enumerable.Concat the
                // statement has no effect on e.gene_name — confirm whether a merge was intended.
                e.gene_name.gene_names.Concat(this.gene_name.gene_names);
            }
        }
        else
        {
            // e already has an identification; only consider the new one when e is not part of the reference chain.
            if (linked_proteoform_references != null && !linked_proteoform_references.Contains(e))
            {
                // The new identification differs when gene name, sequence or ptm set differ from e's current ones.
                bool different_id = e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) != theoretical_base.gene_name.get_prefered_name(Lollipop.preferred_gene_label)
                    || ExperimentalProteoform.get_sequence(e.linked_proteoform_references.First() as TheoreticalProteoform, e.begin, e.end) != ExperimentalProteoform.get_sequence(theoretical_base, new_begin, new_end)
                    || !e.ptm_set.same_ptmset(new_set, true);

                // Non-zero-mass modifications listed for the new theoretical base and for e's current one.
                List <Modification> this_known_mods = theoretical_base.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(v => v.MonoisotopicMass != 0).ToList();
                List <Modification> previous_id_known_mods = (e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(v => v.MonoisotopicMass != 0).ToList();

                // Replacement path: the new identification wins on the enabled criteria
                // (top-down theoretical over non-top-down; fully annotated/adduct ptms over not).
                if (!Sweet.lollipop.topdown_theoretical_reduce_ambiguity || (theoretical_base.topdown_theoretical && !(e.linked_proteoform_references.First() as TheoreticalProteoform).topdown_theoretical))
                {
                    if (!Sweet.lollipop.annotated_PTMs_reduce_ambiguity
                        || (new_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || this_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification)))
                            && !e.ptm_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || previous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification)))))
                    {
                        if (Sweet.lollipop.topdown_theoretical_reduce_ambiguity || Sweet.lollipop.annotated_PTMs_reduce_ambiguity)
                        {
                            if (Sweet.lollipop.remove_bad_connections && different_id) //&& e.relation_to_id != r)
                            {
                                e.relation_to_id.Identification = false;
                                e.relation_to_id.represented_ptmset = null;
                            }

                            // Wipe the existing identification of e ...
                            e.linked_proteoform_references = null;
                            e.ptm_set = new PtmSet(new List <Ptm>());
                            e.begin = 0;
                            e.end = 0;
                            e.gene_name = null;
                            e.ambiguous_identifications.Clear();
                            ProteoformRelation relation = null;
                            e.relation_to_id = relation;

                            //reassign the topdown - based ID
                            // ... and run the assignment again; e.linked_proteoform_references is now null, so the
                            // call takes the "no identification yet" branch above.
                            return(this.assign_pf_identity(e, set, begin, end, r, theoretical_base, linked_proteoform_references, true));
                        }
                    }
                }

                if (Sweet.lollipop.topdown_theoretical_reduce_ambiguity && (e.linked_proteoform_references.First() as TheoreticalProteoform).topdown_theoretical && !theoretical_base.topdown_theoretical)
                {
                    // Empty branch: e's current base is a top-down theoretical and the new one is not, so nothing is recorded.
                }
                else if (Sweet.lollipop.annotated_PTMs_reduce_ambiguity
                    && !new_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || this_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification)))
                    && e.ptm_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || previous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))))
                {
                    // Empty branch: e's current ptms are all adducts/annotated and the new ones are not, so nothing is recorded.
                }
                else
                {
                    // Neither identification is preferred: record the new one as ambiguous, unless an equal one is already listed.
                    if (different_id)
                    {
                        var new_linked_proteoform_references = new List <Proteoform>(linked_proteoform_references);
                        new_linked_proteoform_references.Add(this);
                        AmbiguousIdentification new_id = new AmbiguousIdentification(new_begin, new_end, new_set, r, theoretical_base, new_linked_proteoform_references);
                        lock (e.ambiguous_identifications)
                        {
                            if (!e.ambiguous_identifications.Any(p => p.theoretical_base.gene_name.primary == new_id.theoretical_base.gene_name.primary && ExperimentalProteoform.get_sequence(p.theoretical_base, p.begin, p.end) == ExperimentalProteoform.get_sequence(new_id.theoretical_base, new_id.begin, new_id.end) && p.ptm_set.same_ptmset(new_id.ptm_set, true)))
                            {
                                e.ambiguous_identifications.Add(new_id);
                                identification_assigned = true;
                            }
                        }
                    }
                }
            }
        }
    }

    if (check_ambiguous_IDs)
    {
        //remove bad relations if using td to reduce ambiguity
        if (identification_assigned)
        {
            List <AmbiguousIdentification> to_remove = new List <AmbiguousIdentification>();
            List <Modification> previous_id_known_mods = (e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(m => m.MonoisotopicMass != 0).ToList();

            // A top-down theoretical base removes the ambiguous IDs that are not based on a top-down theoretical.
            if (theoretical_base.topdown_theoretical && Sweet.lollipop.topdown_theoretical_reduce_ambiguity)
            {
                to_remove.AddRange(e.ambiguous_identifications.Where(id => !id.theoretical_base.topdown_theoretical));
            }

            // When e's own ptms are all adducts/annotated, drop the ambiguous IDs that contain a ptm
            // that is neither an adduct nor listed for their own theoretical base.
            if (Sweet.lollipop.annotated_PTMs_reduce_ambiguity && e.ptm_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || previous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))))
            {
                foreach (var ambiguous_id in e.ambiguous_identifications)
                {
                    List <Modification> ambiguous_id_known_mods = ambiguous_id.theoretical_base.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(m => m.MonoisotopicMass != 0).ToList();
                    if (ambiguous_id.ptm_set.ptm_combination.Any(mod1 => !modification_is_adduct(mod1.modification) && !ambiguous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))))
                    {
                        to_remove.Add(ambiguous_id);
                    }
                }
            }

            foreach (var x in to_remove)
            {
                // Contains guards against entries that were queued twice by the two rules above.
                if (e.ambiguous_identifications.Contains(x))
                {
                    e.ambiguous_identifications.Remove(x);
                    if (Sweet.lollipop.remove_bad_connections)
                    {
                        // The relation of e's primary identification is never un-flagged here.
                        if (e.relation_to_id != x.relation)
                        {
                            x.relation.Identification = false;
                            x.relation.represented_ptmset = null;
                        }
                    }
                }
            }

            // Relations of the remaining ambiguous IDs are (re-)flagged as identifications.
            foreach (var x in e.ambiguous_identifications)
            {
                x.relation.Identification = true;
            }
        }

        // Propagate each ambiguous identification of this proteoform to e as well.
        if (this as ExperimentalProteoform != null && (this as ExperimentalProteoform).ambiguous_identifications.Count > 0)
        {
            lock ((this as ExperimentalProteoform).ambiguous_identifications)
            {
                int count = (this as ExperimentalProteoform).ambiguous_identifications.Count;
                PtmSet[] new_ptm_set = new PtmSet[count];

                // The mod changes are computed in parallel, one result slot per ambiguous ID.
                // NOTE(review): every iteration passes the same relation r to determine_mod_change —
                // confirm that call is safe to run concurrently on a shared relation.
                Parallel.For(0, count, i =>
                {
                    var id = (this as ExperimentalProteoform).ambiguous_identifications[i];
                    new_ptm_set[i] = determine_mod_change(e, this, id.theoretical_base, r, id.ptm_set, id.begin, id.end);
                });

                // The assignments themselves run sequentially, with check_ambiguous_IDs = false so the
                // nested calls do not repeat this propagation step.
                for (int i = 0; i < count; i++)
                {
                    if (new_ptm_set[i] != null)
                    {
                        var id = (this as ExperimentalProteoform).ambiguous_identifications[i];
                        if (assign_pf_identity(e, new_ptm_set[i], id.begin, id.end, r, id.theoretical_base, id.linked_proteoform_references, false))
                        {
                            identification_assigned = true;
                        }
                    }
                }
            }
        }
    }
    return(identification_assigned);
}
/// <summary>
/// Determines the PtmSet that experimental proteoform <paramref name="e"/> would have if the delta mass
/// of relation <paramref name="r"/> is explained by adding a PtmSet to, or removing a PtmSet from,
/// <paramref name="this_ptmset"/>. A matching loss takes precedence over an addition.
/// As a side effect, r.represented_ptmset is set to the chosen loss/addition when it is still null.
/// </summary>
/// <param name="e">Experimental proteoform on the far side of the relation.</param>
/// <param name="p">Proteoform the change is computed from; its mass sets the tolerance and the sign of the delta mass.</param>
/// <param name="theoretical_base">Passed on to generate_possible_added_ptmsets.</param>
/// <param name="r">Relation whose peak supplies the delta mass and the candidate PtmSets.</param>
/// <param name="this_ptmset">Current PtmSet that is extended or reduced; it is not modified.</param>
/// <param name="begin">Passed on to generate_possible_added_ptmsets.</param>
/// <param name="end">Passed on to generate_possible_added_ptmsets.</param>
/// <returns>The new PtmSet, or null when neither an addition nor a loss explains the delta mass.</returns>
private static PtmSet determine_mod_change(ExperimentalProteoform e, Proteoform p, TheoreticalProteoform theoretical_base, ProteoformRelation r, PtmSet this_ptmset, int begin, int end)
{
    double mass_tolerance = p.modified_mass / 1000000 * Sweet.lollipop.mass_tolerance;
    int sign = Math.Sign(e.modified_mass - p.modified_mass);

    // give EE relations the correct sign, but don't switch negative ET relation deltaM's
    double deltaM = Math.Sign(r.peak.DeltaMass) < 0 ? r.peak.DeltaMass : sign * r.peak.DeltaMass;

    // EE relations have PtmSets around both positive and negative deltaM, so remove the ones around the opposite of the deltaM of interest
    List<PtmSet> possible_additions = r.peak.possiblePeakAssignments
        .Where(peak => Math.Abs(peak.mass - deltaM) <= 1)
        .ToList();

    PtmSet best_addition = generate_possible_added_ptmsets(possible_additions, Sweet.lollipop.theoretical_database.all_mods_with_mass, theoretical_base, begin, end, this_ptmset, 1, true)
        .OrderBy(x => (double)x.ptm_rank_sum + Math.Abs(x.mass - deltaM) * 10E-6) // major score: delta rank; tie breaker: deltaM, where it's always less than 1
        .FirstOrDefault();

    // Collect every candidate whose negated mass matches deltaM and whose modifications can all be
    // taken out of the current set. The loop is sequential, so the list needs no locking.
    List<PtmSet> best_losses = new List<PtmSet>();
    foreach (PtmSet set in r.peak.possiblePeakAssignments)
    {
        bool within_loss_tolerance = deltaM >= -set.mass - mass_tolerance && deltaM <= -set.mass + mass_tolerance;
        if (!within_loss_tolerance)
        {
            continue;
        }

        // Look the ids up once per candidate instead of once per comparison.
        var these_ids = this_ptmset.ptm_combination.Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList();
        var those_ids = set.ptm_combination.Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList();

        // all must be in the current set to remove them (at least as many copies of each id as the loss needs)
        bool can_be_removed = those_ids.All(id1 => these_ids.Count(id2 => id2 == id1) >= those_ids.Count(id2 => id2 == id1));
        if (can_be_removed)
        {
            best_losses.Add(set);
        }
    }

    // OrderBy is stable, so among equally close losses the first one encountered wins.
    PtmSet best_loss = best_losses.OrderBy(s => Math.Abs(deltaM - (-s.mass))).FirstOrDefault();
    if (best_addition == null && best_loss == null)
    {
        return null;
    }

    // Make the new ptmset with ptms removed or added
    PtmSet with_mod_change;
    if (best_loss == null)
    {
        // Zero-mass ptms are dropped from the combined set.
        with_mod_change = new PtmSet(new List<Ptm>(this_ptmset.ptm_combination
            .Concat(best_addition.ptm_combination)
            .Where(ptm => ptm.modification.MonoisotopicMass != 0)));
    }
    else
    {
        List<Ptm> new_combo = new List<Ptm>(this_ptmset.ptm_combination);
        foreach (Ptm ptm in best_loss.ptm_combination)
        {
            // Remove one ptm with the same id per lost ptm.
            var lost_id = UnlocalizedModification.LookUpId(ptm.modification);
            new_combo.Remove(new_combo.FirstOrDefault(candidate => UnlocalizedModification.LookUpId(candidate.modification) == lost_id));
        }
        with_mod_change = new PtmSet(new_combo);
    }

    // Only the first determination for a relation records which PtmSet the relation represents.
    if (r.represented_ptmset == null)
    {
        r.represented_ptmset = best_loss == null ? best_addition : best_loss;
    }

    return with_mod_change;
}
/// <summary>
/// Assigns an identification to the experimental proteoform <paramref name="e"/>, propagated from this
/// proteoform. If <paramref name="e"/> has no identification yet, it receives <paramref name="set"/> and
/// this proteoform's begin/end (adjusted for "AminoAcid"/"Missing" ptms). Otherwise, a differing
/// identification is recorded in e.ambiguous_identifications. Ambiguous identifications of this
/// proteoform are propagated to <paramref name="e"/> too, and e.uniprot_mods / e.novel_mods are recomputed.
/// </summary>
/// <param name="e">Experimental proteoform that receives the identification.</param>
/// <param name="set">Candidate PtmSet. NOTE(review): its ptm list is modified in place by this method (see below).</param>
/// <param name="r">Relation through which the identification is made; passed to determine_mod_change.</param>
/// <param name="theoretical_base">Theoretical proteoform the identification is based on; dereferenced without a null check.</param>
private void assign_pf_identity(ExperimentalProteoform e, PtmSet set, ProteoformRelation r, TheoreticalProteoform theoretical_base)
{
    if (e.linked_proteoform_references == null)
    {
        // e has no identification yet: its reference chain is this proteoform's chain plus this proteoform.
        // NOTE(review): the List constructor throws when this.linked_proteoform_references is null — confirm it is always set here.
        e.linked_proteoform_references = new List <Proteoform>(this.linked_proteoform_references);
        e.linked_proteoform_references.Add(this);

        // NOTE(review): e.ptm_set now refers to the caller's set object, so the removals below also
        // change the list held by `set` — confirm callers do not rely on it staying intact.
        e.ptm_set = set;
        e.begin = this.begin;
        e.end = this.end;
        List <Ptm> remove = new List <Ptm>();

        //do retention of M first
        // Every "AminoAcid" ptm moves the begin one position back and is dropped from the ptm set.
        foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
        {
            e.begin--;
            remove.Add(mod);
        }

        // A "Missing" ptm matching the first (or otherwise the last) residue becomes a changed begin/end.
        foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
        {
            if (theoretical_base.sequence[this.begin - theoretical_base.begin].ToString() == mod.modification.Target.ToString())
            {
                e.begin++;
                remove.Add(mod); //dont have in ptmset --> change the begin & end
            }
            // NOTE(review): this index is relative to this.begin, while the begin check above is relative
            // to theoretical_base.begin — confirm which offset is intended for the last residue.
            else if (theoretical_base.sequence[this.end - this.begin].ToString() == mod.modification.Target.ToString())
            {
                e.end--;
                remove.Add(mod);
            }
        }

        foreach (var ptm in remove)
        {
            e.ptm_set.ptm_combination.Remove(ptm);
        }
        e.ptm_set = new PtmSet(e.ptm_set.ptm_combination);

        if (e.gene_name == null)
        {
            e.gene_name = this.gene_name;
        }
        else if (!e.topdown_id)
        {
            // NOTE(review): the value returned by Concat is discarded; if this is Enumerable.Concat the
            // statement has no effect on e.gene_name — confirm whether a merge was intended.
            e.gene_name.gene_names.Concat(this.gene_name.gene_names);
        }
    }
    else
    {
        //check if assign
        // e is already identified: derive begin/end/ptm set for the new identification and compare with e's.
        int begin = this.begin;
        int end = this.end;

        // NOTE(review): ptm_set refers to the caller's set object, so the removals below also change the list held by `set`.
        PtmSet ptm_set = set;
        List <Ptm> remove = new List <Ptm>();

        //do retention of M first
        foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
        {
            begin--;
            remove.Add(mod);
        }
        foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
        {
            if (theoretical_base.sequence[this.begin - theoretical_base.begin].ToString() == mod.modification.Target.ToString())
            {
                begin++;
                remove.Add(mod); //dont have in ptmset --> change the begin & end
            }
            // NOTE(review): same offset question as in the branch above (this.begin vs theoretical_base.begin).
            else if (theoretical_base.sequence[this.end - this.begin].ToString() == mod.modification.Target.ToString())
            {
                end--;
                remove.Add(mod);
            }
        }
        foreach (var ptm in remove)
        {
            ptm_set.ptm_combination.Remove(ptm);
        }
        ptm_set = new PtmSet(ptm_set.ptm_combination);

        // A different gene name, begin, end or ptm set makes e ambiguous.
        if (e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) != this.gene_name.get_prefered_name(Lollipop.preferred_gene_label) || e.begin != begin || e.end != end || !e.ptm_set.same_ptmset(ptm_set, true))
        {
            e.ambiguous = true;

            // The stored reference is the first element of this proteoform's chain, or this proteoform itself when it has none.
            Proteoform linked_proteoform_reference = this.linked_proteoform_references == null || this.linked_proteoform_references.Count == 0 ? this : this.linked_proteoform_references.First();

            // Tuple layout: (reference proteoform, begin, end, ptm set).
            Tuple <Proteoform, int, int, PtmSet> new_id = new Tuple <Proteoform, int, int, PtmSet>(linked_proteoform_reference, begin, end, ptm_set);
            lock (e.ambiguous_identifications)
            {
                // Skip identifications that are already listed (same preferred gene name, begin, end and ptm set).
                if (!e.ambiguous_identifications.Any(p => p.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) == new_id.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) && p.Item2 == new_id.Item2 && p.Item3 == new_id.Item3 && p.Item4.same_ptmset(new_id.Item4, true)))
                {
                    e.ambiguous_identifications.Add(new_id);
                }
            }
        }
    }

    // Propagate each ambiguous identification of this proteoform to e as well.
    if (this as ExperimentalProteoform != null && (this as ExperimentalProteoform).ambiguous)
    {
        foreach (var id in this.ambiguous_identifications)
        {
            // NOTE(review): the `as` cast yields null when Item1 is not a TheoreticalProteoform; the result is
            // dereferenced below without a check — confirm Item1 is always theoretical here.
            TheoreticalProteoform id_theoretical_base = id.Item1 as TheoreticalProteoform;
            int begin = id.Item2;
            int end = id.Item3;
            var remove = new List <Ptm>();
            var ptm_set = determine_mod_change(e, this, id_theoretical_base, r, id.Item4);
            if (ptm_set == null)
            {
                continue;
            }

            //do retention of M first
            foreach (var mod in ptm_set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
            {
                begin--;
                remove.Add(mod);
            }
            foreach (var mod in ptm_set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
            {
                if (id_theoretical_base.sequence[id.Item2 - id.Item1.begin].ToString() == mod.modification.Target.ToString())
                {
                    begin++;
                    remove.Add(mod); //dont have in ptmset --> change the begin & end
                }
                // NOTE(review): index is end minus begin of the ambiguous ID, not relative to the base's begin — confirm.
                else if (id_theoretical_base.sequence[id.Item3 - id.Item2].ToString() == mod.modification.Target.ToString())
                {
                    end--;
                    remove.Add(mod);
                }
            }
            foreach (var ptm in remove)
            {
                ptm_set.ptm_combination.Remove(ptm);
            }
            ptm_set = new PtmSet(ptm_set.ptm_combination);

            lock (e.ambiguous_identifications)
            {
                var new_id = new Tuple <Proteoform, int, int, PtmSet>(id.Item1, begin, end, ptm_set);

                // Add only when it differs from e's primary identification and is not listed yet.
                if ((e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) != new_id.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) || e.begin != new_id.Item2 || e.end != new_id.Item3 || !e.ptm_set.same_ptmset(new_id.Item4, true))
                    && !e.ambiguous_identifications.Any(p => p.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) == new_id.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) && p.Item2 == new_id.Item2 && p.Item3 == new_id.Item3 && p.Item4.same_ptmset(new_id.Item4, true)))
                {
                    e.ambiguous_identifications.Add(new_id);
                    e.ambiguous = true;
                }
            }
        }
    }

    // Rebuild the annotated-modification summary over the primary and all ambiguous ptm sets,
    // ignoring "Deconvolution Error" ptms; ids are de-duplicated and sorted.
    e.uniprot_mods = "";
    foreach (string mod in e.ptm_set.ptm_combination.Concat(e.ambiguous_identifications.SelectMany(i => i.Item4.ptm_combination)).Where(ptm => ptm.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList().Distinct().OrderBy(m => m))
    {
        // positions with mod
        // Positions within [e.begin, e.end] where the first expanded protein lists this modification id.
        List <int> theo_ptms = theoretical_base.ExpandedProteinList.First()
            .OneBasedPossibleLocalizedModifications
            .Where(p => p.Key >= e.begin && p.Key <= e.end && p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
            .Select(m => m.Key).ToList();
        if (theo_ptms.Count > 0)
        {
            e.uniprot_mods += mod + " @ " + string.Join(", ", theo_ptms) + "; ";
        }

        // More copies of the modification than listed positions (in the primary or any ambiguous ptm set) marks e as having novel mods.
        if (e.ptm_set.ptm_combination.Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                .Count(m => m == mod) > theo_ptms.Count
            || e.ambiguous_identifications.Any(i => i.Item4.ptm_combination.Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                .Count(m => m == mod) > theo_ptms.Count))
        {
            e.novel_mods = true;
        }
    }

    //else if (!e.topdown_id && e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) != this.gene_name.get_prefered_name(Lollipop.preferred_gene_label)
    //    && e.linked_proteoform_references.Count == this.linked_proteoform_references.Count + 1)
    //{
    //    e.ambiguous = true;
    //}
}
/// <summary>
/// Propagates identifications through the accepted relations of this family and then
/// annotates every experimental proteoform with adduct status, UniProt modification
/// positions, an ambiguity level (1-5) with description, and whether it is a new
/// intact-mass identification.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. If <c>Sweet.lollipop.identify_from_td_nodes</c> is set, identify from top-down proteoforms
///    using a theoretical proteoform built from each top-down hit.
/// 2. Identify from every theoretical proteoform (non-top-down theoreticals first).
/// 3. Repeatedly identify from the newly identified experimentals until a round adds nothing.
/// 4. In parallel, annotate each experimental proteoform.
/// 5. If <c>Sweet.lollipop.remove_bad_connections</c> is set and there was something to identify
///    from, set each relation's <c>Accepted</c> to its <c>Identification</c> value.
/// </remarks>
public void identify_experimentals()
{
    HashSet<ExperimentalProteoform> identified_experimentals = new HashSet<ExperimentalProteoform>();

    // Step 1: start from top-down proteoforms whose base accession is in the theoretical database.
    if (Sweet.lollipop.identify_from_td_nodes)
    {
        foreach (TopDownProteoform topdown in experimental_proteoforms.Where(e => e.topdown_id))
        {
            Sweet.lollipop.theoretical_database
                .theoreticals_by_accession[Sweet.lollipop.target_proteoform_community.community_number]
                .TryGetValue(topdown.accession.Split('_')[0].Split('-')[0], out var t);

            if (t != null && t.Count > 0)
            {
                TheoreticalProteoform theoretical = new TheoreticalProteoform(topdown.accession, topdown.name, topdown.sequence, t.First().ExpandedProteinList, topdown.modified_mass, topdown.lysine_count, topdown.topdown_ptm_set, true, false, null);
                theoretical.topdown_theoretical = true;
                theoretical.new_topdown_proteoform = true;
                theoretical.begin = topdown.topdown_begin;
                theoretical.end = topdown.topdown_end;

                foreach (ExperimentalProteoform e in topdown.identify_connected_experimentals(theoretical, topdown.topdown_begin, topdown.topdown_end, new PtmSet(topdown.topdown_ptm_set.ptm_combination), null))
                {
                    identified_experimentals.Add(e);
                }
            }
        }
    }

    // Step 2: identify from theoretical proteoforms; OrderBy on a bool puts topdown_theoretical == false first.
    foreach (TheoreticalProteoform t in theoretical_proteoforms.OrderBy(t => t.topdown_theoretical))
    {
        lock (identified_experimentals)
        {
            foreach (ExperimentalProteoform e in t.identify_connected_experimentals(t, t.begin, t.end, t.ptm_set, t.linked_proteoform_references))
            {
                identified_experimentals.Add(e);
            }
        }
    }

    // Step 3: continue looking for new experimental identifications until no more remain to be identified.
    // The first round is ordered by the smallest |delta mass - candidate PTM set mass| (1e6 when the
    // proteoform has no relation with a candidate PTM set), then by modified mass.
    List<ExperimentalProteoform> newly_identified_experimentals = new List<ExperimentalProteoform>(identified_experimentals)
        .OrderBy(p => p.relationships.Count(r => r.candidate_ptmset != null) > 0
            ? p.relationships.Where(r => r.candidate_ptmset != null).Min(r => Math.Abs(r.DeltaMass - r.candidate_ptmset.mass))
            : 1e6)
        .ThenBy(p => p.modified_mass)
        .ToList();

    while (newly_identified_experimentals.Count > 0)
    {
        HashSet<ExperimentalProteoform> tmp_new_experimentals = new HashSet<ExperimentalProteoform>();
        foreach (ExperimentalProteoform id_experimental in newly_identified_experimentals)
        {
            lock (identified_experimentals)
            {
                lock (tmp_new_experimentals)
                {
                    foreach (ExperimentalProteoform new_e in id_experimental.identify_connected_experimentals(id_experimental.linked_proteoform_references.First() as TheoreticalProteoform, id_experimental.begin, id_experimental.end, id_experimental.ptm_set, id_experimental.linked_proteoform_references))
                    {
                        identified_experimentals.Add(new_e);
                        tmp_new_experimentals.Add(new_e);
                    }
                }
            }
        }
        newly_identified_experimentals = new List<ExperimentalProteoform>(tmp_new_experimentals);
    }

    // Keys of the form "<base accession>_<sequence>_<sorted PTM ids>" for every top-down proteoform;
    // used below to decide whether an intact-mass identification is new.
    List<string> topdown_ids = Sweet.lollipop.topdown_proteoforms
        .Select(p => p.accession.Split('_')[0].Split('-')[0]
            + "_" + p.sequence
            + "_" + string.Join(", ", p.topdown_ptm_set.ptm_combination.Where(m => m.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).OrderBy(m => m)))
        .ToList();

    // Step 4: annotate every experimental proteoform.
    Parallel.ForEach(experimental_proteoforms, e =>
    {
        //determine identified experimentals that are adducts
        //checks if any experimentals have same mods as e's ptmset, except e has additional adduct only mods.
        e.adduct = e.linked_proteoform_references != null
            && e.ptm_set.ptm_combination.Any(m => Proteoform.modification_is_adduct(m.modification))
            && experimental_proteoforms.Any(l => l.linked_proteoform_references != null
                && l.gene_name.get_prefered_name(Lollipop.preferred_gene_label) == e.gene_name.get_prefered_name(Lollipop.preferred_gene_label)
                && l.ptm_set.ptm_combination.Count < e.ptm_set.ptm_combination.Count
                && e.ptm_set.ptm_combination.Where(m => l.ptm_set.ptm_combination.Count(p => UnlocalizedModification.LookUpId(p.modification) == UnlocalizedModification.LookUpId(m.modification)) != e.ptm_set.ptm_combination.Count(p => UnlocalizedModification.LookUpId(p.modification) == UnlocalizedModification.LookUpId(m.modification)))
                    .Count(p => !Proteoform.modification_is_adduct(p.modification)) == 0
            );

        if (e is TopDownProteoform topdown_e)
        {
            topdown_e.set_correct_id();
        }

        if (e.linked_proteoform_references != null)
        {
            // Distinct, sorted, non-adduct modification ids of the primary identification.
            var mods = e.ptm_set.ptm_combination.Where(p => !Proteoform.modification_is_adduct(p.modification))
                .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList().Distinct().OrderBy(m => m).ToList();

            e.uniprot_mods = "";
            string add = "";
            foreach (string mod in mods)
            {
                // positions with mod
                List<int> theo_ptms = (e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList
                    .SelectMany(p => p.OneBasedPossibleLocalizedModifications)
                    .Where(p => p.Key >= e.begin && p.Key <= e.end && p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
                    .Select(m => m.Key).ToList();

                if (theo_ptms.Count > 0)
                {
                    add += mod + " @ " + string.Join(", ", theo_ptms) + "; ";
                }

                // More copies of the mod than annotated positions within begin..end => flagged as novel.
                if (e.ptm_set.ptm_combination.Where(ptm => ptm.modification.ModificationType != "Deconvolution Error" && !Proteoform.modification_is_adduct(ptm.modification)).Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                        .Count(m => m == mod) > theo_ptms.Count)
                {
                    e.novel_mods = true;
                }
            }

            e.uniprot_mods += add;
            if (add.Length == 0)
            {
                e.uniprot_mods += "N/A";
            }

            // Same report for each ambiguous identification, appended after a " | " separator.
            foreach (var ambig_id in e.ambiguous_identifications)
            {
                var ambig_mods = ambig_id.ptm_set.ptm_combination.Where(p => !Proteoform.modification_is_adduct(p.modification))
                    .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList().Distinct().OrderBy(m => m).ToList();

                e.uniprot_mods += " | ";
                add = "";
                foreach (var mod in ambig_mods)
                {
                    // positions with mod
                    List<int> theo_ptms = ambig_id.theoretical_base.ExpandedProteinList
                        .SelectMany(p => p.OneBasedPossibleLocalizedModifications)
                        .Where(p => p.Key >= ambig_id.begin && p.Key <= ambig_id.end && p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
                        .Select(m => m.Key).ToList();

                    if (theo_ptms.Count > 0)
                    {
                        add += mod + " @ " + string.Join(", ", theo_ptms) + "; ";
                    }

                    if (ambig_id.ptm_set.ptm_combination.Where(ptm => ptm.modification.ModificationType != "Deconvolution Error" && !Proteoform.modification_is_adduct(ptm.modification)).Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                            .Count(m => m == mod) > theo_ptms.Count)
                    {
                        e.novel_mods = true;
                    }
                }

                e.uniprot_mods += add;
                if (add.Length == 0)
                {
                    e.uniprot_mods += "N/A";
                }
            }
        }

        //determine level #
        e.proteoform_level_description = "";
        if (e.linked_proteoform_references == null)
        {
            e.proteoform_level = 5;
            e.proteoform_level_description = "Unidentified";
        }
        else if (e.ambiguous_identifications.Count == 0)
        {
            if (e.ptm_set.ptm_combination.Count == 0)
            {
                e.proteoform_level = 1;
            }
            else
            {
                e.proteoform_level = 2;
                e.proteoform_level_description += "PTM localization ambiguity; ";
            }

            //check if accessions had been grouped in constructing the theoretical database
            if ((e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.AccessionList).Select(a => a.Split('_')[0]).Distinct().Count() > 1)
            {
                e.proteoform_level += 1;
                e.proteoform_level_description += "Gene ambiguity; ";
            }
        }
        else
        {
            var unique_accessions = new List<string>() { e.linked_proteoform_references.First().accession.Split('_')[0].Split('-')[0] }
                .Concat(e.ambiguous_identifications.Select(a => a.theoretical_base.accession.Split('_')[0].Split('-')[0]))
                .Distinct();

            var unique_sequences = new List<string>() { ExperimentalProteoform.get_sequence(e.linked_proteoform_references.First() as TheoreticalProteoform, e.begin, e.end) }
                .Concat(e.ambiguous_identifications.Select(a => ExperimentalProteoform.get_sequence(a.theoretical_base, a.begin, a.end)))
                .Distinct();

            // Both the primary and the ambiguous PTM lists are sorted before joining so that the
            // same set of PTMs always yields the same string, regardless of ptm_combination order.
            var unique_PTMs = new List<string>() { string.Join(", ", e.ptm_set.ptm_combination.Where(m => m.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).OrderBy(m => m)) }
                .Concat(e.ambiguous_identifications.Select(a => string.Join(", ", a.ptm_set.ptm_combination.Where(m => m.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).OrderBy(m => m))))
                .Distinct();

            int gene_ambiguity = unique_accessions.Count() > 1 ? 1 : 0;

            //check if accessions had been grouped in constructing the theoretical database
            if ((e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.AccessionList).Select(a => a.Split('_')[0]).Distinct().Count() > 1)
            {
                gene_ambiguity = 1;
            }

            int sequence_ambiguity = unique_sequences.Count() > 1 ? 1 : 0;
            int PTM_ambiguity = unique_PTMs.Count() > 1 ? 1 : 0;
            int PTM_location = e.ptm_set.ptm_combination.Count(m => m.modification.ModificationType != "Deconvolution Error") > 0
                || e.ambiguous_identifications.Any(a => a.ptm_set.ptm_combination.Count(m => m.modification.ModificationType != "Deconvolution Error") > 0)
                ? 1
                : 0;

            e.proteoform_level = 1 + gene_ambiguity + sequence_ambiguity + PTM_ambiguity + PTM_location;
            if (gene_ambiguity > 0)
            {
                e.proteoform_level_description += "Gene ambiguity; ";
            }
            if (sequence_ambiguity > 0)
            {
                e.proteoform_level_description += "Sequence ambiguity; ";
            }
            if (PTM_ambiguity > 0)
            {
                e.proteoform_level_description += "PTM identity ambiguity; ";
            }
            if (PTM_location > 0)
            {
                e.proteoform_level_description += "PTM localization ambiguity; ";
            }
        }

        if (e.proteoform_level == 1)
        {
            e.proteoform_level_description = "Unambiguous";
        }

        //determine if new intact-mass ID
        e.new_intact_mass_id = false;
        if (!e.topdown_id && e.linked_proteoform_references != null && e.ambiguous_identifications.Count == 0)
        {
            string this_id = string.Join(",", (e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.AccessionList.Select(a => a.Split('_')[0])).Distinct())
                + "_" + ExperimentalProteoform.get_sequence(e.linked_proteoform_references.First() as TheoreticalProteoform, e.begin, e.end)
                + "_" + string.Join(", ", e.ptm_set.ptm_combination.Where(m => m.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).OrderBy(m => m));

            // Split once; the parts do not depend on the top-down id being compared.
            string[] this_id_parts = this_id.Split('_');

            // New when no top-down id matches on (one of the) accession(s), sequence and PTM string.
            if (!topdown_ids.Any(t => this_id_parts[0].Split(',').Contains(t.Split('_')[0])
                    && this_id_parts[1] == t.Split('_')[1]
                    && this_id_parts[2] == t.Split('_')[2]))
            {
                e.new_intact_mass_id = true;
            }
        }
    });

    // Step 5: optionally keep only the relations that were used for an identification.
    if (Sweet.lollipop.remove_bad_connections)
    {
        if (theoretical_proteoforms.Count > 0 || (Sweet.lollipop.identify_from_td_nodes && experimental_proteoforms.Count(e => e.topdown_id) > 0))
        {
            Parallel.ForEach(relations, r => { r.Accepted = r.Identification; });
        }
    }
}