Пример #1
0
        /// <summary>
        /// Rebuilds the theoretical proteoform database from the protein database input files.
        /// </summary>
        /// <remarks>
        /// In order: resets the decoy communities' theoretical proteoforms and the loaded-protein table,
        /// loads every ".xml" / ".fasta" protein database, ranks the modifications, builds the
        /// unlocalized-modification lookup and the PTM-set tables, and finally calls
        /// make_theoretical_proteoforms(). Databases with any other extension are ignored.
        /// </remarks>
        /// <param name="current_directory">Directory handed to get_mods when reading the known modifications.</param>
        public void get_theoretical_proteoforms(string current_directory)
        {
            //Clear out data from potential previous runs
            foreach (ProteoformCommunity community in Sweet.lollipop.decoy_proteoform_communities.Values)
            {
                community.theoretical_proteoforms = new TheoreticalProteoform[0];
            }

            theoretical_proteins.Clear();

            //Read the UniProt-XML and ptmlist
            List<Modification> all_known_modifications = get_mods(current_directory);

            Parallel.ForEach(Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
            {
                // A single lock guards BOTH shared collections. Previously the list of known modifications was
                // read (by LoadProteinXML) under the theoretical_proteins lock but appended to under a different
                // lock, so one thread could enumerate the list while another was growing it. List<T> does not
                // support a reader running concurrently with a writer.
                // The loads were already executed inside a lock, so this does not reduce the effective parallelism.
                lock (theoretical_proteins)
                {
                    if (database.extension == ".xml")
                    {
                        theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, true, DecoyType.None, all_known_modifications, database.ContaminantDB, Sweet.lollipop.mod_types_to_exclude, out Dictionary<string, Modification> um).ToArray());

                        // Remember the modifications declared by this XML file, minus the excluded modification types
                        all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).Where(m => !Sweet.lollipop.mod_types_to_exclude.Contains(m.ModificationType)));
                    }
                    else if (database.extension == ".fasta")
                    {
                        theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinFasta(database.complete_path, true, DecoyType.None, database.ContaminantDB, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex,
                                                                                            ProteinDbLoader.UniprotOrganismRegex, out var dbErrors).ToArray());
                    }
                }
            });

            Sweet.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);

            // The default Ptm's modification is appended so that it is part of the unlocalized lookup as well
            unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List<Modification> {
                new Ptm().modification
            }));
            // NOTE(review): this path is built from Environment.CurrentDirectory, not from current_directory -- confirm that is intended
            load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));


            //this is for ptmsets --> used in RELATIONS
            all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList();
            for (int i = 2; i <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); i++) // the method above doesn't make 2 or more of a kind, so we make it here
            {
                all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2)));
            }

            //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
            possible_ptmset_dictionary = make_ptmset_dictionary();
            make_theoretical_proteoforms();
        }
        /// <summary>
        /// Rebuilds the theoretical proteoform database from the PTM-list and protein database input files
        /// plus the modification files found in the "Mods" folder of <paramref name="current_directory"/>.
        /// </summary>
        /// <remarks>
        /// Does nothing when ready_to_make_database reports that the inputs are not ready. Otherwise it resets
        /// state from previous runs, loads the databases and modifications, builds the PTM-set tables, expands
        /// and processes the protein entries (targets and decoys) and, if requested, groups the resulting
        /// theoretical proteoforms by mass.
        /// </remarks>
        /// <param name="current_directory">Directory that contains the "Mods" folder; also passed to ready_to_make_database.</param>
        public void get_theoretical_proteoforms(string current_directory)
        {
            if (!ready_to_make_database(current_directory))
            {
                return;
            }

            //Clear out data from potential previous runs
            foreach (ProteoformCommunity community in SaveState.lollipop.decoy_proteoform_communities.Values)
            {
                community.theoretical_proteoforms = new TheoreticalProteoform[0];
            }
            theoretical_proteins.Clear();

            //Read the UniProt-XML and ptmlist
            List<ModificationWithLocation> all_known_modifications = SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.PtmList).SelectMany(file => PtmListLoader.ReadModsFromFile(file.complete_path)).ToList();

            uniprotModifications = make_modification_dictionary(all_known_modifications);

            Parallel.ForEach(SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
            {
                // A single lock guards BOTH shared collections. Previously the list of known modifications was
                // read (by LoadProteinXML) under the theoretical_proteins lock but appended to under a different
                // lock, so one thread could enumerate the list while another was growing it. List<T> does not
                // support a reader running concurrently with a writer.
                // The loads were already executed inside a lock, so this does not reduce the effective parallelism.
                lock (theoretical_proteins)
                {
                    // Declared per iteration so that parallel iterations do not share one captured out target
                    Dictionary<string, Modification> um;
                    theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, false, all_known_modifications, database.ContaminantDB, SaveState.lollipop.mod_types_to_exclude, out um).ToArray());

                    // Remember the modifications declared by this XML file, minus the excluded modification types
                    all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).OfType<ModificationWithLocation>().Where(m => !SaveState.lollipop.mod_types_to_exclude.Contains(m.modificationType)));
                }
            });

            foreach (string filename in Directory.GetFiles(Path.Combine(current_directory, "Mods")))
            {
                bool is_variable_file = filename.EndsWith("variable.txt");

                // Materialized once: the result is consumed up to twice below, and a lazily evaluated loader
                // would otherwise be run again for the second enumeration.
                List<ModificationWithLocation> new_mods = !is_variable_file || SaveState.lollipop.methionine_oxidation ?
                                                          PtmListLoader.ReadModsFromFile(filename).ToList() :
                                                          new List<ModificationWithLocation>(); // Empty variable modifications if not selected
                if (is_variable_file)
                {
                    variableModifications = new_mods.OfType<ModificationWithMass>().ToList();
                }
                if (filename.EndsWith("intact_mods.txt"))
                {
                    // Set lookup instead of a linear List.Contains for every candidate modification
                    HashSet<double> known_masses = new HashSet<double>(all_known_modifications.OfType<ModificationWithMass>().Select(m => m.monoisotopicMass));
                    new_mods = new_mods.OfType<ModificationWithMass>()
                                       .Where(m => !known_masses.Contains(m.monoisotopicMass)) // get rid of the unlocalized mods if they're already present
                                       .Cast<ModificationWithLocation>()
                                       .ToList();
                }
                all_known_modifications.AddRange(new_mods);
            }

            // De-duplicate before rebuilding the modification dictionary
            all_known_modifications = new HashSet<ModificationWithLocation>(all_known_modifications).ToList();
            uniprotModifications    = make_modification_dictionary(all_known_modifications);
            all_mods_with_mass      = uniprotModifications.SelectMany(kv => kv.Value).OfType<ModificationWithMass>().Concat(variableModifications).ToList();
            SaveState.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);

            // The default Ptm's modification is appended so that it is part of the unlocalized lookup as well
            unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List<ModificationWithMass> {
                new Ptm().modification
            }));
            // NOTE(review): this path is built from Environment.CurrentDirectory, not from current_directory -- confirm that is intended
            load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

            //Generate all two-member sets and all three-member (or greater) sets of the same modification (three-member combinatorics gets out of hand for assignment)
            all_possible_ptmsets = PtmCombos.generate_all_ptmsets(Math.Min(2, SaveState.lollipop.max_ptms), all_mods_with_mass, SaveState.lollipop.modification_ranks, SaveState.lollipop.mod_rank_first_quartile / 2).ToList();
            for (int i = 2; i < SaveState.lollipop.max_ptms + 1; i++)
            {
                all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), SaveState.lollipop.modification_ranks, SaveState.lollipop.mod_rank_first_quartile / 2)));
            }

            //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
            possible_ptmset_dictionary = make_ptmset_dictionary();

            expanded_proteins = expand_protein_entries(theoretical_proteins.Values.SelectMany(p => p).ToArray());
            aaIsotopeMassList = new AminoAcidMasses(SaveState.lollipop.carbamidomethylation, SaveState.lollipop.natural_lysine_isotope_abundance, SaveState.lollipop.neucode_light_lysine, SaveState.lollipop.neucode_heavy_lysine).AA_Masses;
            if (SaveState.lollipop.combine_identical_sequences)
            {
                expanded_proteins = group_proteins_by_sequence(expanded_proteins);
            }

            expanded_proteins = expanded_proteins.OrderBy(x => x.OneBasedPossibleLocalizedModifications.Count).ToArray(); // Take on harder problems first to use parallelization more effectively
            process_entries(expanded_proteins, variableModifications);
            process_decoys(expanded_proteins, variableModifications);

            if (SaveState.lollipop.combine_theoretical_proteoforms_byMass)
            {
                SaveState.lollipop.target_proteoform_community.theoretical_proteoforms = group_proteoforms_by_mass(SaveState.lollipop.target_proteoform_community.theoretical_proteoforms);
                foreach (ProteoformCommunity community in SaveState.lollipop.decoy_proteoform_communities.Values)
                {
                    community.theoretical_proteoforms = group_proteoforms_by_mass(community.theoretical_proteoforms);
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Creates the theoretical proteoforms for one protein sequence and appends them to
        /// <paramref name="theoretical_proteoforms"/>.
        /// </summary>
        /// <remarks>
        /// Sequences longer than 3000 residues, or containing a residue that has no entry in aaIsotopeMassList,
        /// are skipped without adding anything. For proteins not flagged as top-down, one proteoform is created
        /// per PTM combination returned by PtmCombos.get_combinations; add_topdown_theoreticals is always called
        /// afterwards with the running counter.
        /// </remarks>
        /// <param name="seq">Sequence used for the length/residue check, the mass calculation and the lysine count.</param>
        /// <param name="prot">Protein entry; its BaseSequence is scanned for variable-modification targets.</param>
        /// <param name="modifications">Modifications keyed by position; copied, never mutated.</param>
        /// <param name="accession">Accession prefix for the generated proteoform names ("_P#" is appended).</param>
        /// <param name="theoretical_proteoforms">Shared output list; appended to under a lock on the list itself.</param>
        /// <param name="decoy_number">Negative for targets; otherwise appended to the description as "_DECOY_#".</param>
        /// <param name="variableModifications">Modifications added at every position whose residue matches the modification's Target.</param>
        public void EnterTheoreticalProteformFamily(string seq, ProteinWithGoTerms prot, IDictionary<int, List<Modification>> modifications, string accession, List<TheoreticalProteoform> theoretical_proteoforms, int decoy_number, IEnumerable<Modification> variableModifications)
        {
            // Guard: nothing is produced for over-long sequences or residues without a known mass
            bool too_long = seq.Length > 3000;
            if (too_long || seq.Any(residue => !aaIsotopeMassList.ContainsKey(residue)))
            {
                return;
            }

            List<TheoreticalProteoform> family = new List<TheoreticalProteoform>();

            //Calculate the properties of this sequence
            double unmodified_mass    = TheoreticalProteoform.CalculateProteoformMass(seq, new List<Ptm>());
            int    lysine_count       = seq.Count(residue => residue == 'K');
            bool   check_contaminants = theoretical_proteins.Any(item => item.Key.ContaminantDB);

            //Figure out the possible ptm sets, working on a copy so the caller's lists stay untouched
            Dictionary<int, List<Modification>> mods_by_position = modifications.ToDictionary(kv => kv.Key, kv => new List<Modification>(kv.Value));

            foreach (Modification variable_mod in variableModifications)
            {
                // Positions are one-based
                for (int position = 1; position <= prot.BaseSequence.Length; position++)
                {
                    if (prot.BaseSequence[position - 1].ToString() != variable_mod.Target.ToString())
                    {
                        continue;
                    }

                    if (mods_by_position.TryGetValue(position, out List<Modification> mods_here))
                    {
                        mods_here.Add(variable_mod);
                    }
                    else
                    {
                        mods_by_position.Add(position, new List<Modification> { variable_mod });
                    }
                }
            }

            int next_ptm_set_number = 1;
            string decoy_suffix = decoy_number < 0 ? "" : "_DECOY_" + decoy_number.ToString();

            //if top-down protein sequence, only add PTMs from that top-down proteoforms (will happen in add_topdown_theoreticals method)
            if (!prot.topdown_protein)
            {
                //Enumerate the ptm combinations with _P# to distinguish from the counts in ProteinSequenceGroups (_#G) and TheoreticalPfGps (_#T)
                foreach (PtmSet ptm_set in PtmCombos.get_combinations(mods_by_position, Sweet.lollipop.max_ptms, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2, limit_triples_and_greater))
                {
                    string numbered_tag = "_P" + next_ptm_set_number.ToString();

                    // A sequence group contributes all of its member proteins; a plain protein contributes itself
                    var source_proteins = prot is ProteinSequenceGroup sequence_group
                        ? sequence_group.proteinWithGoTermList.ToArray()
                        : new ProteinWithGoTerms[] { prot };

                    family.Add(new TheoreticalProteoform(
                        accession + numbered_tag,
                        prot.FullDescription + numbered_tag + decoy_suffix,
                        seq,
                        source_proteins,
                        unmodified_mass,
                        lysine_count,
                        ptm_set,
                        decoy_number < 0,
                        check_contaminants,
                        theoretical_proteins)
                    {
                        topdown_theoretical = prot.topdown_protein
                    });
                    next_ptm_set_number++;
                }
            }

            add_topdown_theoreticals(prot, seq, accession, unmodified_mass, decoy_number, lysine_count, family, next_ptm_set_number, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2);

            lock (theoretical_proteoforms)
            {
                theoretical_proteoforms.AddRange(family);
            }
        }
Пример #4
0
        /// <summary>
        /// Rebuilds the theoretical proteoform database from the PTM-list and protein database input files
        /// plus the modification files found in the "Mods" folder of <paramref name="current_directory"/>.
        /// </summary>
        /// <remarks>
        /// Does nothing when ready_to_make_database reports that the inputs are not ready. Otherwise it resets
        /// state from previous runs, loads every ".xml" / ".fasta" protein database (other extensions are
        /// ignored), gathers the modifications, builds the PTM-set tables and calls make_theoretical_proteoforms().
        /// </remarks>
        /// <param name="current_directory">Directory that contains the "Mods" folder (including "PSI-MOD.obo.xml"); also passed to ready_to_make_database.</param>
        public void get_theoretical_proteoforms(string current_directory)
        {
            if (!ready_to_make_database(current_directory))
            {
                return;
            }

            //Clear out data from potential previous runs
            foreach (ProteoformCommunity community in Sweet.lollipop.decoy_proteoform_communities.Values)
            {
                community.theoretical_proteoforms = new TheoreticalProteoform[0];
            }

            theoretical_proteins.Clear();

            //Read the UniProt-XML and ptmlist
            var psiModDeserialized = Loaders.LoadPsiMod(Path.Combine(current_directory, "Mods", "PSI-MOD.obo.xml"));
            Dictionary<string, int> formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized);

            List<Modification> all_known_modifications = Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.PtmList)
                                                         .SelectMany(file => PtmListLoader.ReadModsFromFile(file.complete_path, formalChargesDictionary, out List<(Modification, string)> filteredModificationsWithWarnings))
                                                         .ToList();

            uniprotModifications = make_modification_dictionary(all_known_modifications);
            Parallel.ForEach(Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
            {
                // A single lock guards BOTH shared collections. Previously the list of known modifications was
                // read (by LoadProteinXML) under the theoretical_proteins lock but appended to under a different
                // lock, so one thread could enumerate the list while another was growing it. List<T> does not
                // support a reader running concurrently with a writer.
                // The loads were already executed inside a lock, so this does not reduce the effective parallelism.
                lock (theoretical_proteins)
                {
                    if (database.extension == ".xml")
                    {
                        theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, true, DecoyType.None, all_known_modifications, database.ContaminantDB, Sweet.lollipop.mod_types_to_exclude, out Dictionary<string, Modification> um).ToArray());

                        // Remember the modifications declared by this XML file, minus the excluded modification types
                        all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).Where(m => !Sweet.lollipop.mod_types_to_exclude.Contains(m.ModificationType)));
                    }
                    else if (database.extension == ".fasta")
                    {
                        theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinFasta(database.complete_path, true, DecoyType.None, database.ContaminantDB, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex,
                                                                                            ProteinDbLoader.UniprotOrganismRegex, out var dbErrors).ToArray());
                    }
                }
            });

            foreach (string filename in Directory.GetFiles(Path.Combine(current_directory, "Mods")))
            {
                List<Modification> new_mods = !filename.EndsWith("variable.txt") || Sweet.lollipop.methionine_oxidation ?
                                              PtmListLoader.ReadModsFromFile(filename, formalChargesDictionary, out List<(Modification, string)> filteredModificationsWithWarnings).ToList() :
                                              new List<Modification>(); // Empty variable modifications if not selected
                if (filename.EndsWith("variable.txt"))
                {
                    variableModifications = new_mods;
                }
                all_known_modifications.AddRange(new_mods);
            }

            // De-duplicate before rebuilding the modification dictionary
            all_known_modifications = new HashSet<Modification>(all_known_modifications).ToList();
            uniprotModifications    = make_modification_dictionary(all_known_modifications);

            all_mods_with_mass = uniprotModifications.SelectMany(kv => kv.Value).Concat(variableModifications).ToList();
            Sweet.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);

            // The default Ptm's modification is appended so that it is part of the unlocalized lookup as well
            unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List<Modification> {
                new Ptm().modification
            }));
            // NOTE(review): this path is built from Environment.CurrentDirectory, not from current_directory -- confirm that is intended
            load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

            //this is for ptmsets --> used in RELATIONS
            all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList();
            for (int i = 2; i <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); i++) // the method above doesn't make 2 or more of a kind, so we make it here
            {
                all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2)));
            }

            //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
            possible_ptmset_dictionary = make_ptmset_dictionary();
            make_theoretical_proteoforms();
        }