/// <summary>
/// Decides whether a relation of the requested type may be formed between two proteoforms,
/// using their mass difference and, for experimental pairs, their retention-time difference.
/// </summary>
/// <param name="pf1">First proteoform of the pair.</param>
/// <param name="pf2_with_allowed_lysines">Second proteoform of the pair.</param>
/// <param name="relation_type">Kind of comparison being attempted.</param>
/// <returns>
/// True when every criterion for <paramref name="relation_type"/> holds;
/// false otherwise, including for relation types that are not handled here.
/// </returns>
public bool allowed_relation(Proteoform pf1, Proteoform pf2_with_allowed_lysines, ProteoformComparison relation_type)
{
    switch (relation_type)
    {
        case ProteoformComparison.ExperimentalTheoretical:
        case ProteoformComparison.ExperimentalDecoy:
        {
            // pf1 - pf2 mass difference must sit inside the configured E-T window.
            bool inside_et_window =
                (pf1.modified_mass - pf2_with_allowed_lysines.modified_mass) >= Sweet.lollipop.et_low_mass_difference
                && (pf1.modified_mass - pf2_with_allowed_lysines.modified_mass) <= Sweet.lollipop.et_high_mass_difference;
            if (!inside_et_window)
            {
                return false;
            }

            // Theoreticals flagged new_topdown_proteoform are only allowed when add_td_theoreticals is set.
            return Sweet.lollipop.add_td_theoreticals
                || !(pf2_with_allowed_lysines as TheoreticalProteoform).new_topdown_proteoform;
        }

        case ProteoformComparison.ExperimentalExperimental:
        {
            // Distinct proteoforms only, with pf1 at least as heavy as pf2.
            bool distinct_and_ordered =
                pf1 != pf2_with_allowed_lysines
                && pf1.modified_mass >= pf2_with_allowed_lysines.modified_mass;
            if (!distinct_and_ordered)
            {
                return false;
            }

            if (!(pf1.modified_mass - pf2_with_allowed_lysines.modified_mass <= Sweet.lollipop.ee_max_mass_difference))
            {
                return false;
            }

            return Math.Abs((pf1 as ExperimentalProteoform).agg_rt - (pf2_with_allowed_lysines as ExperimentalProteoform).agg_rt)
                <= Sweet.lollipop.ee_max_RetentionTime_difference;
        }

        case ProteoformComparison.ExperimentalFalse:
        {
            bool mass_criteria_met =
                pf1.modified_mass >= pf2_with_allowed_lysines.modified_mass
                && pf1 != pf2_with_allowed_lysines
                && (pf1.modified_mass - pf2_with_allowed_lysines.modified_mass <= Sweet.lollipop.ee_max_mass_difference);
            if (!mass_criteria_met)
            {
                return false;
            }

            // Unlabeled data: 10 minutes is hard coded as the minimum RT separation for two
            // proteoforms to count as unrelated. NeuCode-labeled data instead requires the
            // separation to stay below the E-E retention-time limit.
            return Sweet.lollipop.neucode_labeled
                ? Math.Abs((pf1 as ExperimentalProteoform).agg_rt - (pf2_with_allowed_lysines as ExperimentalProteoform).agg_rt) < Sweet.lollipop.ee_max_RetentionTime_difference
                : Math.Abs((pf1 as ExperimentalProteoform).agg_rt - (pf2_with_allowed_lysines as ExperimentalProteoform).agg_rt) > 10;
        }

        default:
            return false;
    }
}
/// <summary>
/// Decides whether a relation of the requested type may be formed between two proteoforms.
/// Criteria combine mass difference, lysine count (NeuCode-labeled data only), the PTM
/// composition of <paramref name="pf2"/> (E-T / E-D only) and retention time (E-E / E-F only).
/// </summary>
/// <param name="pf1">First proteoform of the pair.</param>
/// <param name="pf2">Second proteoform of the pair.</param>
/// <param name="relation_type">Kind of comparison being attempted.</param>
/// <returns>True when every criterion for the relation type holds; false otherwise.</returns>
public bool allowed_relation(Proteoform pf1, Proteoform pf2, ProteoformComparison relation_type)
{
    if (relation_type == ProteoformComparison.ExperimentalTheoretical
        || relation_type == ProteoformComparison.ExperimentalDecoy)
    {
        bool lysines_compatible = !SaveState.lollipop.neucode_labeled || pf2.lysine_count == pf1.lysine_count;
        if (!lysines_compatible)
        {
            return false;
        }

        bool inside_et_window =
            (pf1.modified_mass - pf2.modified_mass) >= SaveState.lollipop.et_low_mass_difference
            && (pf1.modified_mass - pf2.modified_mass) <= SaveState.lollipop.et_high_mass_difference;
        if (!inside_et_window)
        {
            return false;
        }

        // Combinations of three or more PTMs are accepted only when every PTM has the
        // same monoisotopic mass as the first one.
        var combination = pf2.ptm_set.ptm_combination;
        if (combination.Count < 3)
        {
            return true;
        }
        var reference_mass = combination.First().modification.monoisotopicMass;
        return combination.All(ptm => ptm.modification.monoisotopicMass == reference_mass);
    }

    if (relation_type == ProteoformComparison.ExperimentalExperimental)
    {
        bool ordered_and_distinct = pf1.modified_mass >= pf2.modified_mass && pf1 != pf2;
        if (!ordered_and_distinct)
        {
            return false;
        }

        bool lysines_compatible = !SaveState.lollipop.neucode_labeled || pf1.lysine_count == pf2.lysine_count;
        if (!lysines_compatible)
        {
            return false;
        }

        if (!(pf1.modified_mass - pf2.modified_mass <= SaveState.lollipop.ee_max_mass_difference))
        {
            return false;
        }

        return Math.Abs(((ExperimentalProteoform)pf1).agg_rt - ((ExperimentalProteoform)pf2).agg_rt)
            <= SaveState.lollipop.ee_max_RetentionTime_difference;
    }

    if (relation_type == ProteoformComparison.ExperimentalFalse)
    {
        bool mass_criteria_met =
            pf1.modified_mass >= pf2.modified_mass
            && pf1 != pf2
            && (pf1.modified_mass - pf2.modified_mass <= SaveState.lollipop.ee_max_mass_difference);
        if (!mass_criteria_met)
        {
            return false;
        }

        if (SaveState.lollipop.neucode_labeled)
        {
            // Labeled data: lysine counts must differ by more than the allowed missed lysines
            // while the retention times stay within the E-E limit.
            return Math.Abs(pf1.lysine_count - pf2.lysine_count) > SaveState.lollipop.missed_lysines
                && Math.Abs(((ExperimentalProteoform)pf1).agg_rt - ((ExperimentalProteoform)pf2).agg_rt) < SaveState.lollipop.ee_max_RetentionTime_difference;
        }

        // Unlabeled data: retention times must be more than twice the E-E limit apart.
        return Math.Abs(((ExperimentalProteoform)pf1).agg_rt - ((ExperimentalProteoform)pf2).agg_rt)
            > SaveState.lollipop.ee_max_RetentionTime_difference * 2;
    }

    return false;
}
/// <summary>
/// Propagates this proteoform's identity to <paramref name="e"/> and records on
/// <paramref name="r"/> which PTM set the relation represents.
/// </summary>
/// <param name="e">Experimental proteoform receiving the identification.</param>
/// <param name="theoretical_reference">Not read by this method.</param>
/// <param name="set">PTM set assigned to <paramref name="e"/> when it has no linked references yet.</param>
/// <param name="r">Relation connecting this proteoform and <paramref name="e"/>.</param>
/// <param name="sign">Multiplier applied to the delta mass of an experimental-experimental relation.</param>
/// <param name="change">PTM set stored on <paramref name="r"/> when it has none yet.</param>
private void assign_pf_identity(ExperimentalProteoform e, Proteoform theoretical_reference, PtmSet set, ProteoformRelation r, int sign, PtmSet change)
{
    // Only the first assignment through a relation fixes its PTM set (and the sign of an E-E delta mass).
    bool relation_unassigned = r.represented_ptmset == null;
    if (relation_unassigned)
    {
        r.represented_ptmset = change;
        bool is_ee_relation = r.RelationType == ProteoformComparison.ExperimentalExperimental;
        if (is_ee_relation)
        {
            r.DeltaMass *= sign;
        }
    }

    // Only the first identification sets the reference chain (this proteoform's chain plus itself) and PTM set.
    bool not_yet_identified = e.linked_proteoform_references == null;
    if (not_yet_identified)
    {
        e.linked_proteoform_references = new List<Proteoform>(this.linked_proteoform_references) { this };
        e.ptm_set = set;
    }

    if (e.gene_name == null)
    {
        e.gene_name = this.gene_name;
        return;
    }

    // NOTE(review): Enumerable.Concat returns a new sequence and the result is discarded here,
    // so no gene names are actually merged into e.gene_name. Kept as-is to preserve behavior;
    // confirm whether a merge was intended.
    e.gene_name.gene_names.Concat(this.gene_name.gene_names);
}
/// <summary>
/// Decides whether a relation of the requested type may be formed between two proteoforms,
/// using their mass difference and, for experimental pairs, their retention-time difference.
/// </summary>
/// <param name="pf1">First proteoform of the pair.</param>
/// <param name="pf2_with_allowed_lysines">Second proteoform of the pair.</param>
/// <param name="relation_type">Kind of comparison being attempted.</param>
/// <returns>
/// True when every criterion for <paramref name="relation_type"/> holds;
/// false otherwise, including for relation types that are not handled here.
/// </returns>
public bool allowed_relation(Proteoform pf1, Proteoform pf2_with_allowed_lysines, ProteoformComparison relation_type)
{
    bool is_et_or_ed = relation_type == ProteoformComparison.ExperimentalTheoretical
        || relation_type == ProteoformComparison.ExperimentalDecoy;
    if (is_et_or_ed)
    {
        // pf1 - pf2 mass difference must sit inside the configured E-T window.
        return (pf1.modified_mass - pf2_with_allowed_lysines.modified_mass) >= Sweet.lollipop.et_low_mass_difference
            && (pf1.modified_mass - pf2_with_allowed_lysines.modified_mass) <= Sweet.lollipop.et_high_mass_difference;
    }

    if (relation_type == ProteoformComparison.ExperimentalExperimental)
    {
        // Distinct proteoforms only, with pf1 at least as heavy as pf2.
        bool distinct_and_ordered =
            pf1 != pf2_with_allowed_lysines
            && pf1.modified_mass >= pf2_with_allowed_lysines.modified_mass;
        if (!distinct_and_ordered)
        {
            return false;
        }

        if (!(pf1.modified_mass - pf2_with_allowed_lysines.modified_mass <= Sweet.lollipop.ee_max_mass_difference))
        {
            return false;
        }

        return Math.Abs((pf1 as ExperimentalProteoform).agg_rt - (pf2_with_allowed_lysines as ExperimentalProteoform).agg_rt)
            <= Sweet.lollipop.ee_max_RetentionTime_difference;
    }

    if (relation_type == ProteoformComparison.ExperimentalFalse)
    {
        bool mass_criteria_met =
            pf1.modified_mass >= pf2_with_allowed_lysines.modified_mass
            && pf1 != pf2_with_allowed_lysines
            && (pf1.modified_mass - pf2_with_allowed_lysines.modified_mass <= Sweet.lollipop.ee_max_mass_difference);
        if (!mass_criteria_met)
        {
            return false;
        }

        // NeuCode-labeled data requires the RT separation to stay below the E-E limit;
        // unlabeled data requires it to exceed twice that limit.
        if (Sweet.lollipop.neucode_labeled)
        {
            return Math.Abs((pf1 as ExperimentalProteoform).agg_rt - (pf2_with_allowed_lysines as ExperimentalProteoform).agg_rt)
                < Sweet.lollipop.ee_max_RetentionTime_difference;
        }

        return Math.Abs((pf1 as ExperimentalProteoform).agg_rt - (pf2_with_allowed_lysines as ExperimentalProteoform).agg_rt)
            > Sweet.lollipop.ee_max_RetentionTime_difference * 2;
    }

    return false;
}
/// <summary>
/// Builds the shared display name of a proteoform node.
/// </summary>
/// <param name="p">Proteoform to name.</param>
/// <param name="node_label">
/// Selected node label; when it equals <c>Lollipop.node_labels[1]</c> and an experimental
/// proteoform has linked references, the first reference's accession and a PTM description are appended.
/// </param>
/// <param name="double_rounding">Number of decimals used when rounding the aggregated mass.</param>
/// <returns>
/// For experimental proteoforms: rounded mass + "_Da_" + accession (optionally followed by the
/// identification). For theoretical proteoforms: accession and PTM description. Otherwise the accession.
/// </returns>
public static string get_proteoform_shared_name(Proteoform p, string node_label, int double_rounding)
{
    ExperimentalProteoform experimental = p as ExperimentalProteoform;
    if (experimental == null)
    {
        bool is_theoretical = p as TheoreticalProteoform != null;
        return is_theoretical
            ? p.accession + " " + p.ptm_description
            : p.accession;
    }

    string name = Math.Round(experimental.agg_mass, double_rounding) + "_Da_" + experimental.accession;

    bool append_identification =
        node_label == Lollipop.node_labels[1]
        && experimental.linked_proteoform_references != null
        && experimental.linked_proteoform_references.Count > 0;
    if (!append_identification)
    {
        return name;
    }

    // Accession of the first linked reference, then either "Unmodified" or the PTM ids joined by "; ".
    string reference_accession = (experimental.linked_proteoform_references.First() as TheoreticalProteoform).accession;
    string ptm_text;
    if (experimental.ptm_set.ptm_combination.Count == 0)
    {
        ptm_text = "Unmodified";
    }
    else
    {
        ptm_text = String.Join("; ", experimental.ptm_set.ptm_combination.Select(ptm => SaveState.lollipop.theoretical_database.unlocalized_lookup[ptm.modification].id));
    }

    name += " " + reference_accession + " " + ptm_text;
    return name;
}
/// <summary>
/// Determines the PTM set that results from applying the mass shift of relation
/// <paramref name="r"/> to <paramref name="this_ptmset"/>, either by adding PTMs
/// (best-ranked candidate near the peak's delta mass) or by removing PTMs already present.
/// A removal, when one is found, takes precedence over an addition.
/// </summary>
/// <param name="e">Experimental proteoform being identified; its mass fixes the sign of the shift.</param>
/// <param name="p">Proteoform on the other end of the relation; supplies mass, begin, end and PTM set.</param>
/// <param name="theoretical_base">Theoretical proteoform passed through to generate_possible_added_ptmsets.</param>
/// <param name="r">Relation whose peak supplies the delta mass and candidate peak assignments.</param>
/// <param name="this_ptmset">PTM set the change is applied to.</param>
/// <returns>The new PTM set, or null when neither an addition nor a removal explains the shift.</returns>
/// <remarks>
/// Side effect: when <c>r.represented_ptmset</c> is null it is set to the chosen change, and the
/// delta mass of an experimental-experimental relation is multiplied by the computed sign.
/// </remarks>
private static PtmSet determine_mod_change(ExperimentalProteoform e, Proteoform p, TheoreticalProteoform theoretical_base, ProteoformRelation r, PtmSet this_ptmset)
{
    double mass_tolerance = p.modified_mass / 1000000 * Sweet.lollipop.mass_tolerance;
    int sign = Math.Sign(e.modified_mass - p.modified_mass);

    // give EE relations the correct sign, but don't switch negative ET relation deltaM's
    double deltaM = Math.Sign(r.peak.DeltaMass) < 0 ? r.peak.DeltaMass : sign * r.peak.DeltaMass;

    // EE relations have PtmSets around both positive and negative deltaM, so remove the ones around the opposite of the deltaM of interest
    List<PtmSet> possible_additions = r.peak.possiblePeakAssignments.Where(peak => Math.Abs(peak.mass - deltaM) <= 1).ToList();

    // major score: delta rank; tie breaker: deltaM, where it's always less than 1
    PtmSet best_addition = generate_possible_added_ptmsets(possible_additions, Sweet.lollipop.theoretical_database.all_mods_with_mass, theoretical_base, p.begin, p.end, p.ptm_set, 1, true)
        .OrderBy(x => (double)x.ptm_rank_sum + Math.Abs(x.mass - deltaM) * 10E-6)
        .FirstOrDefault();

    // Modifications of the current set; identical for every candidate, so built once, on first use.
    List<Modification> these_mods = null;
    PtmSet best_loss = null;
    foreach (PtmSet set in Sweet.lollipop.theoretical_database.all_possible_ptmsets)
    {
        // Cheap numeric filters first: most candidates fail here and never reach the counting step.
        bool within_loss_tolerance = deltaM >= -set.mass - mass_tolerance && deltaM <= -set.mass + mass_tolerance;
        if (!within_loss_tolerance)
        {
            continue;
        }

        bool better_than_current_best_loss = best_loss == null || Math.Abs(deltaM - (-set.mass)) < Math.Abs(deltaM - (-best_loss.mass));
        if (!better_than_current_best_loss)
        {
            continue;
        }

        if (these_mods == null)
        {
            these_mods = this_ptmset.ptm_combination.Select(ptm => ptm.modification).ToList();
        }
        List<Modification> those_mods = set.ptm_combination.Select(ptm => ptm.modification).ToList();

        // all must be in the current set to remove them:
        // # of each mod in current set must be greater than or equal to # in set to remove.
        bool can_be_removed = those_mods.All(m1 => these_mods.Count(m2 => m2.OriginalId == m1.OriginalId) >= those_mods.Count(m2 => m2.OriginalId == m1.OriginalId));
        if (can_be_removed)
        {
            best_loss = set;
        }
    }

    if (best_addition == null && best_loss == null)
    {
        return null;
    }

    // Make the new ptmset with ptms removed or added
    PtmSet with_mod_change;
    if (best_loss == null)
    {
        // Zero-mass modifications are dropped from the combined set.
        with_mod_change = new PtmSet(this_ptmset.ptm_combination
            .Concat(best_addition.ptm_combination)
            .Where(ptm => ptm.modification.MonoisotopicMass != 0)
            .ToList());
    }
    else
    {
        List<Ptm> new_combo = new List<Ptm>(this_ptmset.ptm_combination);
        foreach (Ptm ptm in best_loss.ptm_combination)
        {
            // NOTE(review): eligibility above matches on OriginalId while this removal matches with
            // Equals; if the two disagree, FirstOrDefault yields null and nothing is removed.
            new_combo.Remove(new_combo.FirstOrDefault(candidate => candidate.modification.Equals(ptm.modification)));
        }
        with_mod_change = new PtmSet(new_combo);
    }

    if (r.represented_ptmset == null)
    {
        r.represented_ptmset = best_loss == null ? best_addition : best_loss;
        if (r.RelationType == ProteoformComparison.ExperimentalExperimental)
        {
            r.DeltaMass *= sign;
        }
    }

    return with_mod_change;
}
/// <summary>
/// Groups the community's experimental proteoforms into families, optionally merges families
/// gene-centrically, and then identifies the experimental proteoforms of every family.
/// </summary>
/// <returns>The community's list of families after construction and identification.</returns>
/// <remarks>
/// Families are grown in batches of at most <see cref="Environment.ProcessorCount"/> seeds, one
/// thread per seed. Seeds in the same batch can grow into the same family, so families whose first
/// proteoform was already claimed by an earlier family of the batch are discarded.
/// Duplicate detection uses a hash set, so it relies on Proteoform's Equals/GetHashCode,
/// as the Except call on the remaining seeds already does.
/// </remarks>
public List<ProteoformFamily> construct_families()
{
    ProteoformFamily.reset_family_counter();
    Stack<Proteoform> remaining = new Stack<Proteoform>(this.experimental_proteoforms.ToArray());
    List<ProteoformFamily> running_families = new List<ProteoformFamily>();
    List<Thread> active = new List<Thread>();

    // 'active' is always empty between batches, so the seed stack alone drives the loop.
    while (remaining.Count > 0)
    {
        // Start one construction thread per seed, up to the processor count.
        while (remaining.Count > 0 && active.Count < Environment.ProcessorCount)
        {
            Proteoform root = remaining.Pop();
            ProteoformFamily fam = new ProteoformFamily(root);
            Thread t = new Thread(new ThreadStart(fam.construct_family));
            t.Start();
            running_families.Add(fam);
            active.Add(t);
        }

        foreach (Thread t in active)
        {
            t.Join();
        }

        // Keep the first family that claims a proteoform; later duplicates come from arbitrary seed selection.
        HashSet<Proteoform> cumulative_proteoforms = new HashSet<Proteoform>();
        List<ProteoformFamily> unique_families = new List<ProteoformFamily>();
        foreach (ProteoformFamily family in running_families)
        {
            if (cumulative_proteoforms.Contains(family.proteoforms.First()))
            {
                continue;
            }

            cumulative_proteoforms.UnionWith(family.proteoforms);
            Parallel.ForEach(family.proteoforms, p =>
            {
                lock (p)
                {
                    p.family = family;
                }
            });
            unique_families.Add(family);
        }

        this.families.AddRange(unique_families);

        // Proteoforms absorbed into a family no longer need to act as seeds.
        remaining = new Stack<Proteoform>(remaining.Except(cumulative_proteoforms));
        running_families.Clear();
        active.Clear();
    }

    if (Sweet.lollipop.gene_centric_families)
    {
        families = combine_gene_families(families).ToList();
    }

    Sweet.lollipop.theoretical_database.aaIsotopeMassList = new AminoAcidMasses(Sweet.lollipop.carbamidomethylation, Sweet.lollipop.neucode_labeled).AA_Masses;
    Parallel.ForEach(families, f => f.identify_experimentals());
    return families;
}
/// <summary>
/// Groups the community's accepted experimental proteoforms into families, optionally merges
/// families gene-centrically, and then identifies the experimental proteoforms of every family.
/// </summary>
/// <returns>The community's list of families after construction and identification.</returns>
/// <remarks>
/// Families are grown in batches of at most <see cref="Environment.ProcessorCount"/> seeds, one
/// thread per seed. Seeds in the same batch can grow into the same family, so families whose first
/// proteoform was already claimed by an earlier family of the batch are discarded.
/// Duplicate detection uses a hash set, so it relies on Proteoform's Equals/GetHashCode,
/// as the Except call on the remaining seeds already does.
/// </remarks>
public List<ProteoformFamily> construct_families()
{
    Stack<Proteoform> remaining = new Stack<Proteoform>(this.experimental_proteoforms.Where(e => e.accepted).ToArray());
    List<ProteoformFamily> running_families = new List<ProteoformFamily>();
    List<Thread> active = new List<Thread>();

    // 'active' is always empty between batches, so the seed stack alone drives the loop.
    while (remaining.Count > 0)
    {
        // Start one construction thread per seed, up to the processor count.
        while (remaining.Count > 0 && active.Count < Environment.ProcessorCount)
        {
            Proteoform root = remaining.Pop();
            ProteoformFamily fam = new ProteoformFamily(root);
            Thread t = new Thread(new ThreadStart(fam.construct_family));
            t.Start();
            running_families.Add(fam);
            active.Add(t);
        }

        foreach (Thread t in active)
        {
            t.Join();
        }

        // Keep the first family that claims a proteoform; later duplicates come from arbitrary seed selection.
        HashSet<Proteoform> cumulative_proteoforms = new HashSet<Proteoform>();
        List<ProteoformFamily> unique_families = new List<ProteoformFamily>();
        foreach (ProteoformFamily family in running_families)
        {
            if (cumulative_proteoforms.Contains(family.proteoforms.First()))
            {
                continue;
            }

            cumulative_proteoforms.UnionWith(family.proteoforms);
            Parallel.ForEach(family.proteoforms, p =>
            {
                lock (p)
                {
                    p.family = family;
                }
            });
            unique_families.Add(family);
        }

        this.families.AddRange(unique_families);

        // Proteoforms absorbed into a family no longer need to act as seeds.
        remaining = new Stack<Proteoform>(remaining.Except(cumulative_proteoforms));
        running_families.Clear();
        active.Clear();
    }

    if (gene_centric_families)
    {
        families = combine_gene_families(families).ToList();
    }

    Parallel.ForEach(families, f => f.identify_experimentals());
    return families;
}
/// <summary>
/// Applies a mass shift to the experimental proteoforms of this peak's grouped relations.
/// </summary>
/// <param name="shift">Shift passed on to each proteoform's shift_masses.</param>
/// <param name="neucode_labeled">Labeling flag passed on to each proteoform's shift_masses.</param>
/// <returns>
/// False when this peak is not an experimental-theoretical peak (nothing is shifted);
/// true otherwise.
/// </returns>
public bool shift_experimental_masses(int shift, bool neucode_labeled)
{
    bool is_et_peak = RelationType == ProteoformComparison.ExperimentalTheoretical;
    if (!is_et_peak)
    {
        return false; //Not currently intended for ee relations
    }

    foreach (ProteoformRelation relation in this.grouped_relations)
    {
        Proteoform first = relation.connected_proteoforms[0];
        if (!(first is ExperimentalProteoform))
        {
            continue;
        }

        // Shift each experimental proteoform at most once, and only if it belongs to the target community.
        ExperimentalProteoform experimental = (ExperimentalProteoform)first;
        bool eligible = experimental.mass_shifted == false
            && SaveState.lollipop.target_proteoform_community.experimental_proteoforms.Contains(first);
        if (eligible)
        {
            experimental.shift_masses(shift, neucode_labeled);
        }
    }

    return true;
}
/// <summary>
/// Creates a relation between two proteoforms, picks a candidate PTM set for
/// experimental-theoretical / experimental-decoy relations, and decides whether the
/// delta mass lies outside "no man's land".
/// </summary>
/// <param name="pf1">First connected proteoform; supplies the lysine count for NeuCode-labeled data.</param>
/// <param name="pf2">Second connected proteoform; must be a TheoreticalProteoform for a candidate PTM set to be assigned.</param>
/// <param name="relation_type">Kind of comparison this relation represents.</param>
/// <param name="delta_mass">Mass difference between the two proteoforms.</param>
/// <param name="current_directory">Directory containing elements.dat, loaded the first time the chemical formulas are needed.</param>
public ProteoformRelation(Proteoform pf1, Proteoform pf2, ProteoformComparison relation_type, double delta_mass, string current_directory)
{
    connected_proteoforms[0] = pf1;
    connected_proteoforms[1] = pf2;
    RelationType = relation_type;
    DeltaMass = delta_mass;

    // The read and the increment must share one critical section; reading the counter outside
    // the lock lets two concurrently constructed relations receive the same InstanceId.
    lock (SaveState.lollipop)
    {
        InstanceId = instanceCounter;
        instanceCounter += 1;
    }

    if (CH2 == null || HPO3 == null)
    {
        Loaders.LoadElements(Path.Combine(current_directory, "elements.dat"));
        CH2 = ChemicalFormula.ParseFormula("C1 H2");
        HPO3 = ChemicalFormula.ParseFormula("H1 O3 P1");
    }

    if (SaveState.lollipop.neucode_labeled)
    {
        lysine_count = pf1.lysine_count;
    }

    // Candidate sets are looked up by the delta mass rounded to one decimal.
    if ((relation_type == ProteoformComparison.ExperimentalTheoretical || relation_type == ProteoformComparison.ExperimentalDecoy)
        && SaveState.lollipop.theoretical_database.possible_ptmset_dictionary.TryGetValue(Math.Round(delta_mass, 1), out List<PtmSet> candidate_sets)
        && pf2 as TheoreticalProteoform != null)
    {
        TheoreticalProteoform t = pf2 as TheoreticalProteoform;
        double mass_tolerance = t.modified_mass / 1000000 * (double)SaveState.lollipop.mass_tolerance;
        List<PtmSet> narrower_range_of_candidates = candidate_sets.Where(s => Math.Abs(s.mass - delta_mass) < 0.05).ToList();

        // major score: delta rank; tie breaker: deltaM, where it's always less than 1
        candidate_ptmset = t.generate_possible_added_ptmsets(narrower_range_of_candidates, delta_mass, mass_tolerance, SaveState.lollipop.theoretical_database.all_mods_with_mass, t, t.sequence, SaveState.lollipop.mod_rank_first_quartile)
            .OrderBy(x => x.ptm_rank_sum + Math.Abs(Math.Abs(x.mass) - Math.Abs(delta_mass)) * 10E-6)
            .FirstOrDefault();
    }

    // Start the model (0 Da) at the mass defect of CH2 or HPO3 itself, allowing the peak width tolerance on either side
    double half_peak_width = RelationType == ProteoformComparison.ExperimentalTheoretical || RelationType == ProteoformComparison.ExperimentalDecoy
        ? SaveState.lollipop.peak_width_base_et / 2
        : SaveState.lollipop.peak_width_base_ee / 2;
    double low_decimal_bound = half_peak_width
        + ((CH2.MonoisotopicMass - Math.Truncate(CH2.MonoisotopicMass)) / CH2.MonoisotopicMass)
        * (Math.Abs(delta_mass) <= CH2.MonoisotopicMass ? CH2.MonoisotopicMass : Math.Abs(delta_mass));
    double high_decimal_bound = 1 - half_peak_width
        + ((HPO3.MonoisotopicMass - Math.Ceiling(HPO3.MonoisotopicMass)) / HPO3.MonoisotopicMass)
        * (Math.Abs(delta_mass) <= HPO3.MonoisotopicMass ? HPO3.MonoisotopicMass : Math.Abs(delta_mass));
    double delta_mass_decimal = Math.Abs(delta_mass - Math.Truncate(delta_mass));

    // Outside when the fractional part is at or below the low bound, at or above the high bound,
    // or when the two bounds have crossed.
    outside_no_mans_land = delta_mass_decimal <= low_decimal_bound
        || delta_mass_decimal >= high_decimal_bound
        || high_decimal_bound <= low_decimal_bound;
}
/// <summary>
/// Creates a family around a seed proteoform and gives it the next family id.
/// </summary>
/// <param name="seed">Proteoform the family is grown from.</param>
public ProteoformFamily(Proteoform seed)
{
    // Advance the counter first; the new value is this family's id.
    this.family_id = ++family_counter;
    this.seed = seed;
}
/// <summary>
/// Fills <c>topdown_uniprot_mods</c> with, for each non-adduct modification of the top-down
/// identification, the positions at which the matching theoretical lists that modification, and
/// sets <c>topdown_novel_mods</c> when a modification occurs more often than it is listed.
/// The same summary is appended, separated by " | ", for every ambiguous top-down hit.
/// </summary>
/// <remarks>
/// The string is left empty when no theoreticals exist for the target community or none matches
/// the base accession (the accession text before the first '_' and '-').
/// </remarks>
private void get_uniprot_mods()
{
    // One id per non-adduct PTM occurrence; the distinct, ordered ids drive the report.
    var topdown_mod_ids = topdown_ptm_set.ptm_combination
        .Where(p => !Proteoform.modification_is_adduct(p.modification))
        .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
        .ToList();
    var mods = topdown_mod_ids.Distinct().OrderBy(m => m).ToList();

    topdown_uniprot_mods = "";

    if (!Sweet.lollipop.theoretical_database.theoreticals_by_accession.TryGetValue(Sweet.lollipop.target_proteoform_community.community_number, out var theoreticals_for_community))
    {
        return;
    }

    theoreticals_for_community.TryGetValue(accession.Split('_')[0].Split('-')[0], out var matching_theoretical);
    if (matching_theoretical == null)
    {
        return;
    }

    string add = "";
    foreach (string mod in mods)
    {
        // positions with mod
        List<int> theo_ptms = matching_theoretical.First().ExpandedProteinList
            .SelectMany(p => p.OneBasedPossibleLocalizedModifications)
            .Where(p => p.Key >= topdown_begin && p.Key <= topdown_end && p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
            .Select(m => m.Key)
            .ToList();

        if (theo_ptms.Count > 0)
        {
            add += mod + " @ " + string.Join(", ", theo_ptms) + "; ";
        }

        // More occurrences than listed positions means at least one is not annotated.
        if (topdown_mod_ids.Count(m => m == mod) > theo_ptms.Count)
        {
            topdown_novel_mods = true;
        }
    }

    topdown_uniprot_mods += add;
    if (add.Length == 0)
    {
        topdown_uniprot_mods += "N/A";
    }

    // NOTE(review): ambiguous hits are checked against the theoretical matched by THIS
    // proteoform's accession, exactly as before (the per-hit lookup used the same key and is
    // therefore reused here). If ambiguous hits can belong to other accessions, the lookup
    // should probably use the hit's own accession - confirm before changing.
    foreach (var ambig_id in ambiguous_topdown_hits)
    {
        var ambig_mod_ids = ambig_id.ptm_list
            .Where(p => !Proteoform.modification_is_adduct(p.modification))
            .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
            .ToList();
        var ambig_mods = ambig_mod_ids.Distinct().OrderBy(m => m).ToList();

        topdown_uniprot_mods += " | ";
        add = "";
        foreach (var mod in ambig_mods)
        {
            // positions with mod
            List<int> theo_ptms = matching_theoretical.First().ExpandedProteinList
                .SelectMany(p => p.OneBasedPossibleLocalizedModifications)
                .Where(p => p.Key >= ambig_id.begin && p.Key <= ambig_id.end && p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
                .Select(m => m.Key)
                .ToList();

            if (theo_ptms.Count > 0)
            {
                add += mod + " @ " + string.Join(", ", theo_ptms) + "; ";
            }

            if (ambig_mod_ids.Count(m => m == mod) > theo_ptms.Count)
            {
                topdown_novel_mods = true;
            }
        }

        topdown_uniprot_mods += add;
        if (add.Length == 0)
        {
            topdown_uniprot_mods += "N/A";
        }
    }
}
/// <summary>
/// Groups the community's accepted experimental proteoforms into families, optionally merges
/// families gene-centrically, and then identifies the experimental proteoforms of every family.
/// </summary>
/// <returns>The community's list of families after construction and identification.</returns>
/// <remarks>
/// Families are grown in batches of at most <see cref="Environment.ProcessorCount"/> seeds, one
/// thread per seed. Seeds in the same batch can grow into the same family, so families whose first
/// proteoform was already claimed by an earlier family of the batch are discarded.
/// Duplicate detection uses a hash set, so it relies on Proteoform's Equals/GetHashCode,
/// as the Except call on the remaining seeds already does.
/// </remarks>
public List<ProteoformFamily> construct_families()
{
    ProteoformFamily.reset_family_counter();
    Stack<Proteoform> remaining = new Stack<Proteoform>(this.experimental_proteoforms.Where(e => e.accepted).ToArray());
    List<ProteoformFamily> running_families = new List<ProteoformFamily>();
    List<Thread> active = new List<Thread>();

    // 'active' is always empty between batches, so the seed stack alone drives the loop.
    while (remaining.Count > 0)
    {
        // Start one construction thread per seed, up to the processor count.
        while (remaining.Count > 0 && active.Count < Environment.ProcessorCount)
        {
            Proteoform root = remaining.Pop();
            ProteoformFamily fam = new ProteoformFamily(root);
            Thread t = new Thread(new ThreadStart(fam.construct_family));
            t.Start();
            running_families.Add(fam);
            active.Add(t);
        }

        foreach (Thread t in active)
        {
            t.Join();
        }

        // Keep the first family that claims a proteoform; later duplicates come from arbitrary seed selection.
        HashSet<Proteoform> cumulative_proteoforms = new HashSet<Proteoform>();
        List<ProteoformFamily> unique_families = new List<ProteoformFamily>();
        foreach (ProteoformFamily family in running_families)
        {
            if (cumulative_proteoforms.Contains(family.proteoforms.First()))
            {
                continue;
            }

            cumulative_proteoforms.UnionWith(family.proteoforms);
            Parallel.ForEach(family.proteoforms, p =>
            {
                lock (p)
                {
                    p.family = family;
                }
            });
            unique_families.Add(family);
        }

        this.families.AddRange(unique_families);

        // Proteoforms absorbed into a family no longer need to act as seeds.
        remaining = new Stack<Proteoform>(remaining.Except(cumulative_proteoforms));
        running_families.Clear();
        active.Clear();
    }

    if (Lollipop.gene_centric_families)
    {
        families = combine_gene_families(families).ToList();
    }

    Sweet.lollipop.theoretical_database.aaIsotopeMassList = new AminoAcidMasses(Sweet.lollipop.carbamidomethylation, Sweet.lollipop.neucode_labeled).AA_Masses;
    Parallel.ForEach(families, f => f.identify_experimentals());

    //read in BU results if available, map to proteoforms.
    //Sweet.lollipop.BottomUpPSMList.Clear();
    //BottomUpReader.bottom_up_PTMs_not_in_dictionary.Clear();
    //foreach (InputFile file in Sweet.lollipop.input_files.Where(f => f.purpose == Purpose.BottomUp))
    //{
    //    Sweet.lollipop.BottomUpPSMList.AddRange(BottomUpReader.ReadBUFile(file.complete_path, theoreticals_by_accession.Values.ToList()));
    //}

    return families;
}
/// <summary>
/// Creates a relation between two proteoforms, picks a candidate PTM set (notch-based when
/// notches are enabled for the relation type, peak-width-based for E-T / E-D relations
/// otherwise), and decides whether the delta mass lies outside "no man's land".
/// </summary>
/// <param name="pf1">First connected proteoform; supplies the lysine count and the mass used for ppm notch tolerances.</param>
/// <param name="pf2">Second connected proteoform; must be a TheoreticalProteoform for the peak-width-based candidate search.</param>
/// <param name="relation_type">Kind of comparison this relation represents.</param>
/// <param name="delta_mass">Mass difference between the two proteoforms.</param>
/// <param name="current_directory">Not read by this constructor.</param>
public ProteoformRelation(Proteoform pf1, Proteoform pf2, ProteoformComparison relation_type, double delta_mass, string current_directory)
{
    connected_proteoforms[0] = pf1;
    connected_proteoforms[1] = pf2;
    RelationType = relation_type;
    DeltaMass = delta_mass;

    // The read and the increment must share one critical section; reading the counter outside
    // the lock lets two concurrently constructed relations receive the same InstanceId.
    lock (Sweet.lollipop)
    {
        InstanceId = instanceCounter;
        instanceCounter += 1;
    }

    if (CH2 == null || HPO3 == null)
    {
        CH2 = ChemicalFormula.ParseFormula("C1 H2");
        HPO3 = ChemicalFormula.ParseFormula("H1 O3 P1");
    }

    if (Sweet.lollipop.neucode_labeled)
    {
        lysine_count = pf1.lysine_count;
    }

    List<PtmSet> candidate_sets = new List<PtmSet>();
    if (Sweet.lollipop.et_use_notch && (relation_type == ProteoformComparison.ExperimentalTheoretical || relation_type == ProteoformComparison.ExperimentalDecoy))
    {
        if (!Sweet.lollipop.et_notch_ppm)
        {
            // Absolute tolerance: gather every dictionary bin (0.1 wide) across the tolerance window.
            // NOTE(review): 'mass' accumulates 0.1 steps in floating point, so whether the last bin
            // is visited can depend on rounding - confirm this is acceptable.
            double mass = delta_mass - Sweet.lollipop.notch_tolerance_et;
            while (mass <= delta_mass + Sweet.lollipop.notch_tolerance_et)
            {
                Sweet.lollipop.theoretical_database.possible_ptmset_dictionary_notches.TryGetValue(Math.Round(mass, 1), out List<PtmSet> candidates);
                if (candidates != null)
                {
                    candidate_sets.AddRange(candidates);
                }
                mass += 0.1;
            }
            candidate_sets = candidate_sets.Distinct().ToList();
        }
        else
        {
            // ppm tolerance: only the bin of the rounded delta mass is consulted (null when absent).
            Sweet.lollipop.theoretical_database.possible_ptmset_dictionary_notches.TryGetValue(Math.Round(delta_mass, 1), out candidate_sets);
        }

        if (candidate_sets != null)
        {
            candidate_sets = candidate_sets
                .Where(s => Sweet.lollipop.et_notch_ppm
                    ? Math.Abs(s.mass - delta_mass) * 1e6 / pf1.modified_mass < Sweet.lollipop.notch_tolerance_et
                    : Math.Abs(s.mass - delta_mass) < Sweet.lollipop.notch_tolerance_et)
                .ToList();
            candidate_ptmset = candidate_sets.OrderBy(s => s.ptm_rank_sum).FirstOrDefault();
        }
    }
    else if (Sweet.lollipop.ee_use_notch && (relation_type == ProteoformComparison.ExperimentalExperimental || relation_type == ProteoformComparison.ExperimentalFalse))
    {
        if (!Sweet.lollipop.ee_notch_ppm)
        {
            // Absolute tolerance: gather every dictionary bin (0.1 wide) across the tolerance window.
            double mass = delta_mass - Sweet.lollipop.notch_tolerance_ee;
            while (mass <= delta_mass + Sweet.lollipop.notch_tolerance_ee)
            {
                Sweet.lollipop.theoretical_database.possible_ptmset_dictionary_notches.TryGetValue(Math.Round(mass, 1), out List<PtmSet> candidates);
                if (candidates != null)
                {
                    candidate_sets.AddRange(candidates);
                }
                mass += 0.1;
            }
            candidate_sets = candidate_sets.Distinct().ToList();
        }
        else
        {
            // ppm tolerance: only the bin of the rounded delta mass is consulted (null when absent).
            Sweet.lollipop.theoretical_database.possible_ptmset_dictionary_notches.TryGetValue(Math.Round(delta_mass, 1), out candidate_sets);
        }

        if (candidate_sets != null)
        {
            candidate_sets = candidate_sets
                .Where(s => Sweet.lollipop.ee_notch_ppm
                    ? Math.Abs(s.mass - delta_mass) * 1e6 / pf1.modified_mass < Sweet.lollipop.notch_tolerance_ee
                    : Math.Abs(s.mass - delta_mass) < Sweet.lollipop.notch_tolerance_ee)
                .ToList();
            candidate_ptmset = candidate_sets.OrderBy(s => s.ptm_rank_sum).FirstOrDefault();
        }
    }
    else if (relation_type == ProteoformComparison.ExperimentalTheoretical || relation_type == ProteoformComparison.ExperimentalDecoy)
    {
        if (Sweet.lollipop.peak_width_base_et > 0.09)
        {
            // Wide peaks span several 0.1 Da bins, so every bin across the peak width is gathered.
            double mass = delta_mass - Sweet.lollipop.peak_width_base_et;
            while (mass <= delta_mass + Sweet.lollipop.peak_width_base_et)
            {
                Sweet.lollipop.theoretical_database.possible_ptmset_dictionary.TryGetValue(Math.Round(mass, 1), out List<PtmSet> candidates);
                if (candidates != null)
                {
                    candidate_sets.AddRange(candidates);
                }
                mass += 0.1;
            }
            candidate_sets = candidate_sets.Distinct().ToList();
        }
        else
        {
            Sweet.lollipop.theoretical_database.possible_ptmset_dictionary.TryGetValue(Math.Round(delta_mass, 1), out candidate_sets);
        }

        if (pf2 as TheoreticalProteoform != null && candidate_sets != null && candidate_sets.Count > 0)
        {
            // For E-T / E-D relations this branch is only reached when et_use_notch was false above,
            // so in practice the peak-width filter below is the path taken.
            List<PtmSet> narrower_range_of_candidates = new List<PtmSet>();
            if (Sweet.lollipop.et_use_notch)
            {
                narrower_range_of_candidates = candidate_sets;
            }
            else
            {
                narrower_range_of_candidates = candidate_sets
                    .Where(s => Math.Abs(s.mass - delta_mass) < Sweet.lollipop.peak_width_base_et)
                    .ToList();
            }

            TheoreticalProteoform t = pf2 as TheoreticalProteoform;

            // major score: delta rank; tie breaker: deltaM, where it's always less than 1
            candidate_ptmset = Proteoform.generate_possible_added_ptmsets(narrower_range_of_candidates, Sweet.lollipop.theoretical_database.all_mods_with_mass, t, pf2.begin, pf2.end, pf2.ptm_set, Sweet.lollipop.mod_rank_first_quartile, false)
                .OrderBy(x => x.ptm_rank_sum + Math.Abs(Math.Abs(x.mass) - Math.Abs(delta_mass)) * 10E-6)
                .FirstOrDefault();
        }
    }

    // Start the model (0 Da) at the mass defect of CH2 or HPO3 itself, allowing the peak width tolerance on either side
    double half_peak_width = RelationType == ProteoformComparison.ExperimentalTheoretical || RelationType == ProteoformComparison.ExperimentalDecoy
        ? Sweet.lollipop.peak_width_base_et / 2
        : Sweet.lollipop.peak_width_base_ee / 2;
    double low_decimal_bound = half_peak_width
        + ((CH2.MonoisotopicMass - Math.Truncate(CH2.MonoisotopicMass)) / CH2.MonoisotopicMass)
        * (Math.Abs(delta_mass) <= CH2.MonoisotopicMass ? CH2.MonoisotopicMass : Math.Abs(delta_mass));
    double high_decimal_bound = 1 - half_peak_width
        + ((HPO3.MonoisotopicMass - Math.Ceiling(HPO3.MonoisotopicMass)) / HPO3.MonoisotopicMass)
        * (Math.Abs(delta_mass) <= HPO3.MonoisotopicMass ? HPO3.MonoisotopicMass : Math.Abs(delta_mass));
    double delta_mass_decimal = Math.Abs(delta_mass - Math.Truncate(delta_mass));

    outside_no_mans_land = delta_mass_decimal <= low_decimal_bound
        || delta_mass_decimal >= high_decimal_bound
        || high_decimal_bound <= low_decimal_bound;

    // With notches enabled for the relation type, every relation counts as outside no man's land.
    if (Sweet.lollipop.et_use_notch && (relation_type == ProteoformComparison.ExperimentalTheoretical || relation_type == ProteoformComparison.ExperimentalDecoy))
    {
        outside_no_mans_land = true;
    }
    if (Sweet.lollipop.ee_use_notch && (relation_type == ProteoformComparison.ExperimentalExperimental || relation_type == ProteoformComparison.ExperimentalFalse))
    {
        outside_no_mans_land = true;
    }
}
/// <summary>
/// Records on experimental proteoform <paramref name="e"/> the identification implied by this
/// proteoform and <paramref name="set"/>. If <paramref name="e"/> has no linked references yet it
/// receives the identification directly; otherwise a differing identification marks it ambiguous.
/// Ambiguous identifications of this proteoform are propagated to <paramref name="e"/>, and
/// <c>e.uniprot_mods</c> / <c>e.novel_mods</c> are recomputed at the end.
/// </summary>
/// <param name="e">Experimental proteoform being identified.</param>
/// <param name="set">PTM set proposed for <paramref name="e"/>.</param>
/// <param name="r">Relation connecting this proteoform and <paramref name="e"/>; passed to determine_mod_change.</param>
/// <param name="theoretical_base">Theoretical proteoform whose sequence and protein list are consulted.</param>
private void assign_pf_identity(ExperimentalProteoform e, PtmSet set, ProteoformRelation r, TheoreticalProteoform theoretical_base)
{
    // First identification of e: take over reference chain, PTM set and begin/end from this proteoform.
    if (e.linked_proteoform_references == null)
    {
        e.linked_proteoform_references = new List <Proteoform>(this.linked_proteoform_references);
        e.linked_proteoform_references.Add(this);
        e.ptm_set = set;
        e.begin = this.begin;
        e.end = this.end;
        List <Ptm> remove = new List <Ptm>();
        //do retention of M first
        // Each "AminoAcid" mod moves begin one position earlier and is dropped from the PTM set.
        foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
        {
            e.begin--;
            remove.Add(mod);
        }
        // Each "Missing" mod trims one residue: from the start if the residue at this.begin matches
        // the mod's target, otherwise from the end if the residue indexed below matches.
        foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
        {
            if (theoretical_base.sequence[this.begin - theoretical_base.begin].ToString() == mod.modification.Target.ToString())
            {
                e.begin++;
                remove.Add(mod); //dont have in ptmset --> change the begin & end
            }
            // NOTE(review): the start check indexes relative to theoretical_base.begin, but this one uses
            // this.end - this.begin; the two agree only when this.begin == theoretical_base.begin - confirm.
            else if (theoretical_base.sequence[this.end - this.begin].ToString() == mod.modification.Target.ToString())
            {
                e.end--;
                remove.Add(mod);
            }
        }
        // NOTE(review): e.ptm_set was assigned from 'set' above, so this presumably removes from the
        // caller's PtmSet list as well - confirm no caller reuses 'set' afterwards.
        foreach (var ptm in remove)
        {
            e.ptm_set.ptm_combination.Remove(ptm);
        }
        e.ptm_set = new PtmSet(e.ptm_set.ptm_combination);
        if (e.gene_name == null)
        {
            e.gene_name = this.gene_name;
        }
        else if (!e.topdown_id)
        {
            // NOTE(review): the result of Concat is discarded, so no gene names are merged here.
            e.gene_name.gene_names.Concat(this.gene_name.gene_names);
        }
    }
    else
    {
        //check if assign
        // e is already identified: derive the begin/end/PTM set this identification would give,
        // using the same adjustments as above but on local copies of begin and end.
        int begin = this.begin;
        int end = this.end;
        PtmSet ptm_set = set;
        List <Ptm> remove = new List <Ptm>();
        //do retention of M first
        foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
        {
            begin--;
            remove.Add(mod);
        }
        foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
        {
            if (theoretical_base.sequence[this.begin - theoretical_base.begin].ToString() == mod.modification.Target.ToString())
            {
                begin++;
                remove.Add(mod); //dont have in ptmset --> change the begin & end
            }
            // NOTE(review): same index question as in the branch above (this.end - this.begin).
            else if (theoretical_base.sequence[this.end - this.begin].ToString() == mod.modification.Target.ToString())
            {
                end--;
                remove.Add(mod);
            }
        }
        // ptm_set refers to the same object as 'set' here, so the removal acts on set's list.
        foreach
        (var ptm in remove)
        {
            ptm_set.ptm_combination.Remove(ptm);
        }
        ptm_set = new PtmSet(ptm_set.ptm_combination);
        // A different preferred gene name, begin, end or PTM set makes e ambiguous.
        if (e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) != this.gene_name.get_prefered_name(Lollipop.preferred_gene_label)
            || e.begin != begin || e.end != end || !e.ptm_set.same_ptmset(ptm_set, true))
        {
            e.ambiguous = true;
            // The recorded reference is this proteoform's first linked reference, or this proteoform itself when it has none.
            Proteoform linked_proteoform_reference = this.linked_proteoform_references == null || this.linked_proteoform_references.Count == 0 ? this : this.linked_proteoform_references.First();
            Tuple <Proteoform, int, int, PtmSet> new_id = new Tuple <Proteoform, int, int, PtmSet>(linked_proteoform_reference, begin, end, ptm_set);
            lock (e.ambiguous_identifications)
            {
                // Add only if no equivalent identification (same gene name, begin, end and PTM set) is stored yet.
                if (!e.ambiguous_identifications.Any(p => p.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) == new_id.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label)
                    && p.Item2 == new_id.Item2 && p.Item3 == new_id.Item3 && p.Item4.same_ptmset(new_id.Item4, true)))
                {
                    e.ambiguous_identifications.Add(new_id);
                }
            }
        }
    }
    // When this proteoform is itself an ambiguous experimental proteoform, carry each of its
    // alternative identifications over to e, shifted by the mod change of relation r.
    if (this as ExperimentalProteoform != null && (this as ExperimentalProteoform).ambiguous)
    {
        // NOTE(review): this.ambiguous_identifications is enumerated without a lock, while
        // e.ambiguous_identifications is modified under one - confirm no concurrent writers.
        foreach (var id in this.ambiguous_identifications)
        {
            TheoreticalProteoform id_theoretical_base = id.Item1 as TheoreticalProteoform;
            int begin = id.Item2;
            int end = id.Item3;
            var remove = new List <Ptm>();
            var ptm_set = determine_mod_change(e, this, id_theoretical_base, r, id.Item4);
            // No addition or loss explains the relation for this alternative; skip it.
            if (ptm_set == null)
            {
                continue;
            }
            //do retention of M first
            foreach (var mod in ptm_set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
            {
                begin--;
                remove.Add(mod);
            }
            foreach (var mod in ptm_set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
            {
                if (id_theoretical_base.sequence[id.Item2 - id.Item1.begin].ToString() == mod.modification.Target.ToString())
                {
                    begin++;
                    remove.Add(mod); //dont have in ptmset --> change the begin & end
                }
                // NOTE(review): same index question as above (id.Item3 - id.Item2 rather than an offset from id.Item1.begin).
                else if (id_theoretical_base.sequence[id.Item3 - id.Item2].ToString() ==
                    mod.modification.Target.ToString())
                {
                    end--;
                    remove.Add(mod);
                }
            }
            foreach (var ptm in remove)
            {
                ptm_set.ptm_combination.Remove(ptm);
            }
            ptm_set = new PtmSet(ptm_set.ptm_combination);
            lock (e.ambiguous_identifications)
            {
                var new_id = new Tuple <Proteoform, int, int, PtmSet>(id.Item1, begin, end, ptm_set);
                // Store the alternative only if it differs from e's current identification and is not already stored.
                if ((e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) != new_id.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label)
                    || e.begin != new_id.Item2 || e.end != new_id.Item3 || !e.ptm_set.same_ptmset(new_id.Item4, true))
                    && !e.ambiguous_identifications.Any(p => p.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) == new_id.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label)
                    && p.Item2 == new_id.Item2 && p.Item3 == new_id.Item3 && p.Item4.same_ptmset(new_id.Item4, true)))
                {
                    e.ambiguous_identifications.Add(new_id);
                    e.ambiguous = true;
                }
            }
        }
    }
    // Rebuild the modification summary from e's PTM set plus all ambiguous identifications,
    // ignoring "Deconvolution Error" entries.
    e.uniprot_mods = "";
    foreach (string mod in e.ptm_set.ptm_combination.Concat(e.ambiguous_identifications.SelectMany(i => i.Item4.ptm_combination)).Where(ptm => ptm.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList().Distinct().OrderBy(m => m))
    {
        // positions with mod
        List <int> theo_ptms = theoretical_base.ExpandedProteinList.First()
            .OneBasedPossibleLocalizedModifications
            .Where(p => p.Key >= e.begin && p.Key <= e.end && p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
            .Select(m => m.Key).ToList();
        if (theo_ptms.Count > 0)
        {
            e.uniprot_mods += mod + " @ " + string.Join(", ", theo_ptms) + "; ";
        }
        // A mod occurring more often (in e's set or in any ambiguous set) than it is listed is flagged as novel.
        if (e.ptm_set.ptm_combination.Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
            .Count(m => m == mod) > theo_ptms.Count
            || e.ambiguous_identifications.Any(i => i.Item4.ptm_combination.Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
            .Count(m => m == mod) > theo_ptms.Count))
        {
            e.novel_mods = true;
        }
    }
    // Disabled alternative ambiguity check, kept for reference:
    //else if (!e.topdown_id && e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) != this.gene_name.get_prefered_name(Lollipop.preferred_gene_label)
    //    && e.linked_proteoform_references.Count == this.linked_proteoform_references.Count + 1)
    //{
    //    e.ambiguous = true;
    //}
}
/// <summary>
/// Identifies the experimental proteoforms reachable from this object's theoretical
/// (and, optionally, top-down) proteoforms, then annotates every experimental proteoform.
/// <para>
/// 1. When <c>Sweet.lollipop.identify_from_td_nodes</c> is set, each top-down proteoform whose
///    base accession is found in the theoretical database is wrapped in a temporary
///    <c>TheoreticalProteoform</c> and used as a starting point.
/// 2. Every theoretical proteoform is used as a starting point (non-top-down theoreticals first).
/// 3. Newly identified experimentals are expanded repeatedly until a pass identifies nothing.
/// 4. In parallel, each experimental proteoform gets its <c>adduct</c>, <c>uniprot_mods</c>,
///    <c>novel_mods</c>, <c>proteoform_level</c>, <c>proteoform_level_description</c> and
///    <c>new_intact_mass_id</c> values assigned.
/// 5. When <c>Sweet.lollipop.remove_bad_connections</c> is set and there was something to identify
///    from, each relation's <c>Accepted</c> is set to its <c>Identification</c>.
/// </para>
/// </summary>
public void identify_experimentals()
{
    HashSet<ExperimentalProteoform> identified_experimentals = new HashSet<ExperimentalProteoform>();

    // Seed identifications from top-down proteoforms, if enabled.
    if (Sweet.lollipop.identify_from_td_nodes)
    {
        foreach (TopDownProteoform topdown in experimental_proteoforms.Where(candidate => candidate.topdown_id))
        {
            // Look up theoreticals by the base accession (text before the first '_' and '-').
            Sweet.lollipop.theoretical_database
                .theoreticals_by_accession[Sweet.lollipop.target_proteoform_community.community_number]
                .TryGetValue(topdown.accession.Split('_')[0].Split('-')[0], out var matching_theoreticals);

            if (matching_theoreticals != null && matching_theoreticals.Count > 0)
            {
                TheoreticalProteoform theoretical = new TheoreticalProteoform(
                    topdown.accession, topdown.name, topdown.sequence,
                    matching_theoreticals.First().ExpandedProteinList,
                    topdown.modified_mass, topdown.lysine_count, topdown.topdown_ptm_set,
                    true, false, null);
                theoretical.topdown_theoretical = true;
                theoretical.new_topdown_proteoform = true;
                theoretical.begin = topdown.topdown_begin;
                theoretical.end = topdown.topdown_end;

                foreach (ExperimentalProteoform e in topdown.identify_connected_experimentals(
                    theoretical, topdown.topdown_begin, topdown.topdown_end,
                    new PtmSet(topdown.topdown_ptm_set.ptm_combination), null))
                {
                    identified_experimentals.Add(e);
                }
            }
        }
    }

    // Seed identifications from theoretical proteoforms; OrderBy on a bool puts
    // topdown_theoretical == false first.
    foreach (TheoreticalProteoform t in theoretical_proteoforms.OrderBy(theo => theo.topdown_theoretical))
    {
        foreach (ExperimentalProteoform e in t.identify_connected_experimentals(
            t, t.begin, t.end, t.ptm_set, t.linked_proteoform_references))
        {
            identified_experimentals.Add(e);
        }
    }

    //Continue looking for new experimental identifications until no more remain to be identified
    // The first pass is ordered by the smallest |DeltaMass - candidate PTM set mass| among the
    // proteoform's relations (1e6 when no relation has a candidate PTM set), then by modified mass.
    List<ExperimentalProteoform> newly_identified_experimentals = new List<ExperimentalProteoform>(identified_experimentals)
        .OrderBy(p => p.relationships.Any(r => r.candidate_ptmset != null)
            ? p.relationships.Where(r => r.candidate_ptmset != null).Min(r => Math.Abs(r.DeltaMass - r.candidate_ptmset.mass))
            : 1e6)
        .ThenBy(p => p.modified_mass)
        .ToList();

    while (newly_identified_experimentals.Count > 0)
    {
        HashSet<ExperimentalProteoform> tmp_new_experimentals = new HashSet<ExperimentalProteoform>();
        foreach (ExperimentalProteoform id_experimental in newly_identified_experimentals)
        {
            foreach (ExperimentalProteoform new_e in id_experimental.identify_connected_experimentals(
                id_experimental.linked_proteoform_references.First() as TheoreticalProteoform,
                id_experimental.begin, id_experimental.end, id_experimental.ptm_set,
                id_experimental.linked_proteoform_references))
            {
                identified_experimentals.Add(new_e);
                tmp_new_experimentals.Add(new_e);
            }
        }
        newly_identified_experimentals = new List<ExperimentalProteoform>(tmp_new_experimentals);
    }

    // Build "accession_sequence_sortedPTMs" for each top-down proteoform and split it once on '_'.
    // Only the first three '_'-separated fields are compared below.
    List<string[]> topdown_id_parts = Sweet.lollipop.topdown_proteoforms
        .Select(p => (p.accession.Split('_')[0].Split('-')[0] + "_" + p.sequence + "_"
            + string.Join(", ", p.topdown_ptm_set.ptm_combination
                .Where(m => m.modification.ModificationType != "Deconvolution Error")
                .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                .OrderBy(m => m))).Split('_'))
        .ToList();

    Parallel.ForEach(experimental_proteoforms, e =>
    {
        //determine identified experimentals that are adducts
        //checks if any experimentals have same mods as e's ptmset, except e has additional adduct only mods.
        e.adduct = e.linked_proteoform_references != null
            && e.ptm_set.ptm_combination.Any(m => Proteoform.modification_is_adduct(m.modification))
            && experimental_proteoforms.Any(l => l.linked_proteoform_references != null
                && l.gene_name.get_prefered_name(Lollipop.preferred_gene_label) == e.gene_name.get_prefered_name(Lollipop.preferred_gene_label)
                && l.ptm_set.ptm_combination.Count < e.ptm_set.ptm_combination.Count
                // every modification whose multiplicity differs between l and e must be an adduct
                && e.ptm_set.ptm_combination
                    .Where(m => l.ptm_set.ptm_combination.Count(p => UnlocalizedModification.LookUpId(p.modification) == UnlocalizedModification.LookUpId(m.modification))
                        != e.ptm_set.ptm_combination.Count(p => UnlocalizedModification.LookUpId(p.modification) == UnlocalizedModification.LookUpId(m.modification)))
                    .All(p => Proteoform.modification_is_adduct(p.modification)));

        if (e is TopDownProteoform td_proteoform)
        {
            td_proteoform.set_correct_id();
        }

        // Resolved after set_correct_id(), matching the point where the reference was first read before.
        TheoreticalProteoform linked_theoretical = e.linked_proteoform_references != null
            ? e.linked_proteoform_references.First() as TheoreticalProteoform
            : null;

        if (e.linked_proteoform_references != null)
        {
            var mods = e.ptm_set.ptm_combination
                .Where(p => !Proteoform.modification_is_adduct(p.modification))
                .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                .Distinct()
                .OrderBy(m => m)
                .ToList();

            e.uniprot_mods = "";
            string add = "";
            foreach (string mod in mods)
            {
                // positions with mod
                List<int> theo_ptms = linked_theoretical.ExpandedProteinList
                    .SelectMany(p => p.OneBasedPossibleLocalizedModifications)
                    .Where(p => p.Key >= e.begin && p.Key <= e.end && p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
                    .Select(m => m.Key)
                    .ToList();

                if (theo_ptms.Count > 0)
                {
                    add += mod + " @ " + string.Join(", ", theo_ptms) + "; ";
                }

                // more copies of this mod than listed positions within begin..end => novel
                if (e.ptm_set.ptm_combination
                        .Where(ptm => ptm.modification.ModificationType != "Deconvolution Error" && !Proteoform.modification_is_adduct(ptm.modification))
                        .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                        .Count(m => m == mod) > theo_ptms.Count)
                {
                    e.novel_mods = true;
                }
            }

            e.uniprot_mods += add;
            if (add.Length == 0)
            {
                e.uniprot_mods += "N/A";
            }

            // Each ambiguous identification contributes its own " | "-separated section.
            foreach (var ambig_id in e.ambiguous_identifications)
            {
                var ambig_mods = ambig_id.ptm_set.ptm_combination
                    .Where(p => !Proteoform.modification_is_adduct(p.modification))
                    .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                    .Distinct()
                    .OrderBy(m => m)
                    .ToList();

                e.uniprot_mods += " | ";
                add = "";
                foreach (var mod in ambig_mods)
                {
                    // positions with mod
                    List<int> theo_ptms = ambig_id.theoretical_base.ExpandedProteinList
                        .SelectMany(p => p.OneBasedPossibleLocalizedModifications)
                        .Where(p => p.Key >= ambig_id.begin && p.Key <= ambig_id.end && p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
                        .Select(m => m.Key)
                        .ToList();

                    if (theo_ptms.Count > 0)
                    {
                        add += mod + " @ " + string.Join(", ", theo_ptms) + "; ";
                    }

                    if (ambig_id.ptm_set.ptm_combination
                            .Where(ptm => ptm.modification.ModificationType != "Deconvolution Error" && !Proteoform.modification_is_adduct(ptm.modification))
                            .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                            .Count(m => m == mod) > theo_ptms.Count)
                    {
                        e.novel_mods = true;
                    }
                }

                e.uniprot_mods += add;
                if (add.Length == 0)
                {
                    e.uniprot_mods += "N/A";
                }
            }
        }

        //determine level #
        e.proteoform_level_description = "";
        if (e.linked_proteoform_references == null)
        {
            e.proteoform_level = 5;
            e.proteoform_level_description = "Unidentified";
        }
        else
        {
            //check if accessions had been grouped in constructing the theoretical database
            bool accessions_grouped = linked_theoretical.ExpandedProteinList
                .SelectMany(p => p.AccessionList)
                .Select(a => a.Split('_')[0])
                .Distinct()
                .Count() > 1;

            if (e.ambiguous_identifications.Count == 0)
            {
                if (e.ptm_set.ptm_combination.Count == 0)
                {
                    e.proteoform_level = 1;
                }
                else
                {
                    e.proteoform_level = 2;
                    e.proteoform_level_description += "PTM localization ambiguity; ";
                }

                if (accessions_grouped)
                {
                    e.proteoform_level += 1;
                    e.proteoform_level_description += "Gene ambiguity; ";
                }
            }
            else
            {
                var unique_accessions = new List<string>() { e.linked_proteoform_references.First().accession.Split('_')[0].Split('-')[0] }
                    .Concat(e.ambiguous_identifications.Select(a => a.theoretical_base.accession.Split('_')[0].Split('-')[0]))
                    .Distinct();

                var unique_sequences = new List<string>() { ExperimentalProteoform.get_sequence(linked_theoretical, e.begin, e.end) }
                    .Concat(e.ambiguous_identifications.Select(a => ExperimentalProteoform.get_sequence(a.theoretical_base, a.begin, a.end)))
                    .Distinct();

                // Both the primary and the ambiguous PTM id lists are sorted before joining so that
                // the same set of PTMs always yields the same string, whatever its stored order.
                var unique_PTMs = new List<string>()
                    {
                        string.Join(", ", e.ptm_set.ptm_combination
                            .Where(m => m.modification.ModificationType != "Deconvolution Error")
                            .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                            .OrderBy(m => m))
                    }
                    .Concat(e.ambiguous_identifications.Select(a => string.Join(", ", a.ptm_set.ptm_combination
                        .Where(m => m.modification.ModificationType != "Deconvolution Error")
                        .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                        .OrderBy(m => m))))
                    .Distinct();

                int gene_ambiguity = (unique_accessions.Count() > 1 || accessions_grouped) ? 1 : 0;
                int sequence_ambiguity = unique_sequences.Count() > 1 ? 1 : 0;
                int PTM_ambiguity = unique_PTMs.Count() > 1 ? 1 : 0;
                int PTM_location =
                    e.ptm_set.ptm_combination.Any(m => m.modification.ModificationType != "Deconvolution Error")
                    || e.ambiguous_identifications.Any(a => a.ptm_set.ptm_combination.Any(m => m.modification.ModificationType != "Deconvolution Error"))
                        ? 1
                        : 0;

                e.proteoform_level = 1 + gene_ambiguity + sequence_ambiguity + PTM_ambiguity + PTM_location;
                if (gene_ambiguity > 0)
                {
                    e.proteoform_level_description += "Gene ambiguity; ";
                }
                if (sequence_ambiguity > 0)
                {
                    e.proteoform_level_description += "Sequence ambiguity; ";
                }
                if (PTM_ambiguity > 0)
                {
                    e.proteoform_level_description += "PTM identity ambiguity; ";
                }
                if (PTM_location > 0)
                {
                    e.proteoform_level_description += "PTM localization ambiguity; ";
                }
            }
        }

        if (e.proteoform_level == 1)
        {
            e.proteoform_level_description = "Unambiguous";
        }

        //determine if new intact-mass ID
        e.new_intact_mass_id = false;
        if (!e.topdown_id && e.linked_proteoform_references != null && e.ambiguous_identifications.Count == 0)
        {
            string this_id = string.Join(",", linked_theoretical.ExpandedProteinList.SelectMany(p => p.AccessionList.Select(a => a.Split('_')[0])).Distinct())
                + "_" + ExperimentalProteoform.get_sequence(linked_theoretical, e.begin, e.end)
                + "_" + string.Join(", ", e.ptm_set.ptm_combination
                    .Where(m => m.modification.ModificationType != "Deconvolution Error")
                    .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                    .OrderBy(m => m));

            // this_id always contains two '_' separators, so indices 0..2 exist.
            string[] this_id_parts = this_id.Split('_');
            string[] this_accessions = this_id_parts[0].Split(',');

            if (!topdown_id_parts.Any(td => this_accessions.Contains(td[0])
                && this_id_parts[1] == td[1]
                && this_id_parts[2] == td[2]))
            {
                e.new_intact_mass_id = true;
            }
        }
    });

    if (Sweet.lollipop.remove_bad_connections)
    {
        if (theoretical_proteoforms.Count > 0
            || (Sweet.lollipop.identify_from_td_nodes && experimental_proteoforms.Any(e => e.topdown_id)))
        {
            Parallel.ForEach(relations, r => { r.Accepted = r.Identification; });
        }
    }
}