/// <summary>
/// Rebuilds the theoretical database: clears results from earlier runs, loads every protein
/// database input file, ranks the modifications, regenerates the PTM-set collection and its
/// lookup dictionary, and finally calls <c>make_theoretical_proteoforms</c>.
/// </summary>
/// <param name="current_directory">Directory passed to <c>get_mods</c> to obtain the known modifications.</param>
public void get_theoretical_proteoforms(string current_directory)
{
    //Clear out data from potential previous runs
    foreach (ProteoformCommunity community in Sweet.lollipop.decoy_proteoform_communities.Values)
    {
        community.theoretical_proteoforms = new TheoreticalProteoform[0];
    }
    theoretical_proteins.Clear();

    //Read the UniProt-XML and ptmlist
    List<Modification> all_known_modifications = get_mods(current_directory);
    Parallel.ForEach(Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
    {
        if (database.extension == ".xml")
        {
            // A single lock guards both shared collections. The XML loader is handed all_known_modifications,
            // so appending to that list under a different lock could mutate it while another worker is
            // still reading it (List<T> does not support concurrent readers and writers).
            // NOTE(review): the lock spans the load itself, so databases are processed one at a time.
            lock (theoretical_proteins)
            {
                theoretical_proteins.Add(
                    database,
                    ProteinDbLoader.LoadProteinXML(database.complete_path, true, DecoyType.None, all_known_modifications, database.ContaminantDB, Sweet.lollipop.mod_types_to_exclude, out Dictionary<string, Modification> um).ToArray());

                // Make the modifications declared in this database available to databases loaded afterwards
                all_known_modifications.AddRange(
                    ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path)
                        .Where(m => !Sweet.lollipop.mod_types_to_exclude.Contains(m.ModificationType)));
            }
        }
        else if (database.extension == ".fasta")
        {
            // NOTE(review): UniprotFullNameRegex is passed in two consecutive positions -- confirm this is intended.
            lock (theoretical_proteins)
            {
                theoretical_proteins.Add(
                    database,
                    ProteinDbLoader.LoadProteinFasta(database.complete_path, true, DecoyType.None, database.ContaminantDB, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors).ToArray());
            }
        }
    });

    Sweet.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);
    unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List<Modification> { new Ptm().modification }));
    load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

    //this is for ptmsets --> used in RELATIONS
    all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList();

    // the method above doesn't make 2 or more of a kind, so we make it here
    for (int i = 2; i <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); i++)
    {
        all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2)));
    }

    //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
    possible_ptmset_dictionary = make_ptmset_dictionary();

    make_theoretical_proteoforms();
}
/// <summary>
/// Rebuilds the theoretical database: clears results from earlier runs, reads the PTM lists and
/// protein databases, merges in the modification files found in the "Mods" folder, regenerates
/// the PTM-set collection and lookup, and then generates target and decoy theoretical proteoforms.
/// Returns immediately when <c>ready_to_make_database</c> reports false.
/// </summary>
/// <param name="current_directory">Directory passed to <c>ready_to_make_database</c> and containing the "Mods" folder that is scanned.</param>
public void get_theoretical_proteoforms(string current_directory)
{
    if (!ready_to_make_database(current_directory))
    {
        return;
    }

    //Clear out data from potential previous runs
    foreach (ProteoformCommunity community in SaveState.lollipop.decoy_proteoform_communities.Values)
    {
        community.theoretical_proteoforms = new TheoreticalProteoform[0];
    }
    theoretical_proteins.Clear();

    //Read the UniProt-XML and ptmlist
    List<ModificationWithLocation> all_known_modifications = SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.PtmList)
        .SelectMany(file => PtmListLoader.ReadModsFromFile(file.complete_path))
        .ToList();
    uniprotModifications = make_modification_dictionary(all_known_modifications);

    Parallel.ForEach(SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
    {
        // Declared per invocation so workers do not share one captured out-target; the value is not used.
        Dictionary<string, Modification> um;

        // A single lock guards both shared collections. The XML loader is handed all_known_modifications,
        // so appending to that list under a different lock could mutate it while another worker is
        // still reading it (List<T> does not support concurrent readers and writers).
        // NOTE(review): the lock spans the load itself, so databases are processed one at a time.
        lock (theoretical_proteins)
        {
            theoretical_proteins.Add(
                database,
                ProteinDbLoader.LoadProteinXML(database.complete_path, false, all_known_modifications, database.ContaminantDB, SaveState.lollipop.mod_types_to_exclude, out um).ToArray());

            // Make the modifications declared in this database available to databases loaded afterwards
            all_known_modifications.AddRange(
                ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path)
                    .OfType<ModificationWithLocation>()
                    .Where(m => !SaveState.lollipop.mod_types_to_exclude.Contains(m.modificationType)));
        }
    });

    foreach (string filename in Directory.GetFiles(Path.Combine(current_directory, "Mods")))
    {
        bool is_variable_file = filename.EndsWith("variable.txt");

        // Empty variable modifications if not selected.
        // Materialized once so the file is not read again on each later enumeration, should the loader be lazy.
        IEnumerable<ModificationWithLocation> new_mods = !is_variable_file || SaveState.lollipop.methionine_oxidation
            ? PtmListLoader.ReadModsFromFile(filename).ToList()
            : new List<ModificationWithLocation>();

        if (is_variable_file)
        {
            variableModifications = new_mods.OfType<ModificationWithMass>().ToList();
        }

        if (filename.EndsWith("intact_mods.txt"))
        {
            // get rid of the unlocalized mods if they're already present (matched on exact monoisotopic mass);
            // a hash set keeps each membership test constant-time instead of scanning a list
            HashSet<double> old_masses = new HashSet<double>(all_known_modifications.OfType<ModificationWithMass>().Select(m => m.monoisotopicMass));
            new_mods = new_mods.OfType<ModificationWithMass>().Where(m => !old_masses.Contains(m.monoisotopicMass)).ToList();
        }

        all_known_modifications.AddRange(new_mods);
    }

    // Remove duplicate entries before rebuilding the dictionary
    all_known_modifications = new HashSet<ModificationWithLocation>(all_known_modifications).ToList();
    uniprotModifications = make_modification_dictionary(all_known_modifications);
    all_mods_with_mass = uniprotModifications.SelectMany(kv => kv.Value).OfType<ModificationWithMass>().Concat(variableModifications).ToList();
    SaveState.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);
    unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List<ModificationWithMass> { new Ptm().modification }));
    load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

    //Generate all two-member sets and all three-member (or greater) sets of the same modification (three-member combinatorics gets out of hand for assignment)
    all_possible_ptmsets = PtmCombos.generate_all_ptmsets(Math.Min(2, SaveState.lollipop.max_ptms), all_mods_with_mass, SaveState.lollipop.modification_ranks, SaveState.lollipop.mod_rank_first_quartile / 2).ToList();
    for (int i = 2; i < SaveState.lollipop.max_ptms + 1; i++)
    {
        all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), SaveState.lollipop.modification_ranks, SaveState.lollipop.mod_rank_first_quartile / 2)));
    }

    //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
    possible_ptmset_dictionary = make_ptmset_dictionary();

    expanded_proteins = expand_protein_entries(theoretical_proteins.Values.SelectMany(p => p).ToArray());
    aaIsotopeMassList = new AminoAcidMasses(SaveState.lollipop.carbamidomethylation, SaveState.lollipop.natural_lysine_isotope_abundance, SaveState.lollipop.neucode_light_lysine, SaveState.lollipop.neucode_heavy_lysine).AA_Masses;
    if (SaveState.lollipop.combine_identical_sequences)
    {
        expanded_proteins = group_proteins_by_sequence(expanded_proteins);
    }

    // Take on harder problems first to use parallelization more effectively
    expanded_proteins = expanded_proteins.OrderBy(x => x.OneBasedPossibleLocalizedModifications.Count).ToArray();
    process_entries(expanded_proteins, variableModifications);
    process_decoys(expanded_proteins, variableModifications);

    if (SaveState.lollipop.combine_theoretical_proteoforms_byMass)
    {
        SaveState.lollipop.target_proteoform_community.theoretical_proteoforms = group_proteoforms_by_mass(SaveState.lollipop.target_proteoform_community.theoretical_proteoforms);
        foreach (ProteoformCommunity community in SaveState.lollipop.decoy_proteoform_communities.Values)
        {
            community.theoretical_proteoforms = group_proteoforms_by_mass(community.theoretical_proteoforms);
        }
    }
}
/// <summary>
/// Creates the theoretical proteoforms for one sequence and appends them to
/// <paramref name="theoretical_proteoforms"/>. Nothing is added when the sequence is longer than
/// 3000 residues or contains a residue that is missing from <c>aaIsotopeMassList</c>.
/// </summary>
/// <param name="seq">Sequence used for the length/residue check, the unmodified mass and the lysine count.</param>
/// <param name="prot">Protein entry; its <c>BaseSequence</c> is scanned for variable modification sites.</param>
/// <param name="modifications">Modifications keyed by position; copied before variable modifications are merged in.</param>
/// <param name="accession">Accession prefix; "_P" plus a running counter is appended for each PTM set.</param>
/// <param name="theoretical_proteoforms">Shared output list; locked while the new entries are appended.</param>
/// <param name="decoy_number">When not negative, "_DECOY_" plus this number is appended to the description.</param>
/// <param name="variableModifications">Modifications added at every position of <c>prot.BaseSequence</c> whose residue matches the modification's target.</param>
public void EnterTheoreticalProteformFamily(string seq, ProteinWithGoTerms prot, IDictionary<int, List<Modification>> modifications, string accession, List<TheoreticalProteoform> theoretical_proteoforms, int decoy_number, IEnumerable<Modification> variableModifications)
{
    // Guard: skip over-long sequences and sequences with residues that have no mass entry
    bool sequence_too_long = seq.Length > 3000;
    if (sequence_too_long || seq.Any(residue => !aaIsotopeMassList.ContainsKey(residue)))
    {
        return;
    }

    List<TheoreticalProteoform> family = new List<TheoreticalProteoform>();

    //Calculate the properties of this sequence
    double unmodified_mass = TheoreticalProteoform.CalculateProteoformMass(seq, new List<Ptm>());
    int lysine_count = seq.Count(residue => residue == 'K');
    bool check_contaminants = theoretical_proteins.Any(entry => entry.Key.ContaminantDB);

    //Figure out the possible ptm sets, working on copies so the caller's lists are left untouched
    Dictionary<int, List<Modification>> mods_by_position = new Dictionary<int, List<Modification>>();
    foreach (KeyValuePair<int, List<Modification>> site in modifications)
    {
        mods_by_position.Add(site.Key, new List<Modification>(site.Value));
    }

    foreach (Modification variable_mod in variableModifications)
    {
        for (int zero_based = 0; zero_based < prot.BaseSequence.Length; zero_based++)
        {
            if (prot.BaseSequence[zero_based].ToString() != variable_mod.Target.ToString())
            {
                continue;
            }

            int one_based = zero_based + 1; // the dictionary is keyed by one-based position
            if (mods_by_position.TryGetValue(one_based, out List<Modification> mods_at_site))
            {
                mods_at_site.Add(variable_mod);
            }
            else
            {
                mods_by_position.Add(one_based, new List<Modification> { variable_mod });
            }
        }
    }

    int ptm_set_counter = 1;

    //if top-down protein sequence, only add PTMs from that top-down proteoforms (will happen in add_topdown_theoreticals method)
    if (!prot.topdown_protein)
    {
        string decoy_suffix = decoy_number < 0 ? "" : "_DECOY_" + decoy_number.ToString();
        List<PtmSet> unique_ptm_groups = PtmCombos.get_combinations(mods_by_position, Sweet.lollipop.max_ptms, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2, limit_triples_and_greater);

        //Enumerate the ptm combinations with _P# to distinguish from the counts in ProteinSequenceGroups (_#G) and TheoreticalPfGps (_#T)
        foreach (PtmSet ptm_set in unique_ptm_groups)
        {
            string numbered_suffix = "_P" + ptm_set_counter.ToString();

            // A sequence group contributes all of its member proteins; otherwise just this protein
            ProteinWithGoTerms[] source_proteins = prot is ProteinSequenceGroup sequence_group
                ? sequence_group.proteinWithGoTermList.ToArray()
                : new ProteinWithGoTerms[] { prot };

            TheoreticalProteoform theoretical = new TheoreticalProteoform(
                accession + numbered_suffix,
                prot.FullDescription + numbered_suffix + decoy_suffix,
                seq,
                source_proteins,
                unmodified_mass,
                lysine_count,
                ptm_set,
                decoy_number < 0,
                check_contaminants,
                theoretical_proteins);
            theoretical.topdown_theoretical = prot.topdown_protein;

            family.Add(theoretical);
            ptm_set_counter++;
        }
    }

    add_topdown_theoreticals(prot, seq, accession, unmodified_mass, decoy_number, lysine_count, family, ptm_set_counter, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2);

    // The output list is shared between callers, so append under its lock
    lock (theoretical_proteoforms)
    {
        theoretical_proteoforms.AddRange(family);
    }
}
/// <summary>
/// Rebuilds the theoretical database: clears results from earlier runs, reads the PTM lists and
/// protein databases, merges in the modification files found in the "Mods" folder, ranks the
/// modifications, regenerates the PTM-set collection and its lookup dictionary, and finally calls
/// <c>make_theoretical_proteoforms</c>. Returns immediately when <c>ready_to_make_database</c> reports false.
/// </summary>
/// <param name="current_directory">Directory passed to <c>ready_to_make_database</c> and containing the "Mods" folder (PSI-MOD file and modification lists).</param>
public void get_theoretical_proteoforms(string current_directory)
{
    if (!ready_to_make_database(current_directory))
    {
        return;
    }

    //Clear out data from potential previous runs
    foreach (ProteoformCommunity community in Sweet.lollipop.decoy_proteoform_communities.Values)
    {
        community.theoretical_proteoforms = new TheoreticalProteoform[0];
    }
    theoretical_proteins.Clear();

    //Read the UniProt-XML and ptmlist
    var psiModDeserialized = Loaders.LoadPsiMod(Path.Combine(current_directory, "Mods", "PSI-MOD.obo.xml"));
    Dictionary<string, int> formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized);
    List<Modification> all_known_modifications = Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.PtmList)
        .SelectMany(file => PtmListLoader.ReadModsFromFile(file.complete_path, formalChargesDictionary, out List<(Modification, string)> filteredModificationsWithWarnings))
        .ToList();
    uniprotModifications = make_modification_dictionary(all_known_modifications);

    Parallel.ForEach(Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
    {
        if (database.extension == ".xml")
        {
            // A single lock guards both shared collections. The XML loader is handed all_known_modifications,
            // so appending to that list under a different lock could mutate it while another worker is
            // still reading it (List<T> does not support concurrent readers and writers).
            // NOTE(review): the lock spans the load itself, so databases are processed one at a time.
            lock (theoretical_proteins)
            {
                theoretical_proteins.Add(
                    database,
                    ProteinDbLoader.LoadProteinXML(database.complete_path, true, DecoyType.None, all_known_modifications, database.ContaminantDB, Sweet.lollipop.mod_types_to_exclude, out Dictionary<string, Modification> um).ToArray());

                // Make the modifications declared in this database available to databases loaded afterwards
                all_known_modifications.AddRange(
                    ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path)
                        .Where(m => !Sweet.lollipop.mod_types_to_exclude.Contains(m.ModificationType)));
            }
        }
        else if (database.extension == ".fasta")
        {
            // NOTE(review): UniprotFullNameRegex is passed in two consecutive positions -- confirm this is intended.
            lock (theoretical_proteins)
            {
                theoretical_proteins.Add(
                    database,
                    ProteinDbLoader.LoadProteinFasta(database.complete_path, true, DecoyType.None, database.ContaminantDB, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors).ToArray());
            }
        }
    });

    foreach (string filename in Directory.GetFiles(Path.Combine(current_directory, "Mods")))
    {
        // Empty variable modifications if not selected
        List<Modification> new_mods = !filename.EndsWith("variable.txt") || Sweet.lollipop.methionine_oxidation
            ? PtmListLoader.ReadModsFromFile(filename, formalChargesDictionary, out List<(Modification, string)> filteredModificationsWithWarnings).ToList()
            : new List<Modification>();

        if (filename.EndsWith("variable.txt"))
        {
            variableModifications = new_mods;
        }

        all_known_modifications.AddRange(new_mods);
    }

    // Remove duplicate entries before rebuilding the dictionary
    all_known_modifications = new HashSet<Modification>(all_known_modifications).ToList();
    uniprotModifications = make_modification_dictionary(all_known_modifications);
    all_mods_with_mass = uniprotModifications.SelectMany(kv => kv.Value).Concat(variableModifications).ToList();
    Sweet.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);
    unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List<Modification> { new Ptm().modification }));
    load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

    //this is for ptmsets --> used in RELATIONS
    all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList();

    // the method above doesn't make 2 or more of a kind, so we make it here
    for (int i = 2; i <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); i++)
    {
        all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2)));
    }

    //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
    possible_ptmset_dictionary = make_ptmset_dictionary();

    make_theoretical_proteoforms();
}