public void Modification_read_write_into_proteinDb()
        {
            Loaders.LoadElements(Path.Combine(TestContext.CurrentContext.TestDirectory, "elements2.dat"));
            var sampleModList = PtmListLoader.ReadModsFromFile(Path.Combine(TestContext.CurrentContext.TestDirectory, "z.txt")).ToList();

            Assert.AreEqual(1, sampleModList.OfType <ModificationWithMass>().Count());
            Protein protein = new Protein("MCSSSSSSSSSS", "accession", "organism", new List <Tuple <string, string> >(), new Dictionary <int, List <Modification> > {
                { 2, sampleModList.OfType <Modification>().ToList() }
            }, null, "name", "full_name", false, false, new List <DatabaseReference>(), new List <SequenceVariation>(), new List <DisulfideBond>());

            Assert.AreEqual(1, protein.OneBasedPossibleLocalizedModifications[2].OfType <ModificationWithMass>().Count());
            ProteinDbWriter.WriteXmlDatabase(new Dictionary <string, HashSet <Tuple <int, Modification> > >(), new List <Protein> {
                protein
            }, Path.Combine(TestContext.CurrentContext.TestDirectory, "test_modifications_with_proteins.xml"));
            List <Protein> new_proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "test_modifications_with_proteins.xml"), true, DecoyType.None, new List <Modification>(), false, new List <string>(), out Dictionary <string, Modification> um);

            Assert.AreEqual(1, new_proteins.Count);
            Assert.AreEqual(1, new_proteins[0].OneBasedPossibleLocalizedModifications.Count);
            Assert.AreEqual(1, new_proteins[0].OneBasedPossibleLocalizedModifications.SelectMany(kv => kv.Value).Count());
            Assert.AreEqual("Type", new_proteins[0].OneBasedPossibleLocalizedModifications.SelectMany(kv => kv.Value).OfType <ModificationWithMass>().First().modificationType);
            Assert.AreEqual("Palmitoylation of C", new_proteins[0].OneBasedPossibleLocalizedModifications[2][0].id);
            Assert.AreEqual(1, new_proteins[0].OneBasedPossibleLocalizedModifications[2].OfType <ModificationWithMass>().Count());

            // Check that Modifications were saved after last load
            Assert.AreEqual(1, ProteinDbLoader.GetPtmListFromProteinXml(Path.Combine(TestContext.CurrentContext.TestDirectory, @"test_modifications_with_proteins.xml")).Count);
            Assert.True(ProteinDbLoader.GetPtmListFromProteinXml(Path.Combine(TestContext.CurrentContext.TestDirectory, @"test_modifications_with_proteins.xml"))[0] == new_proteins[0].OneBasedPossibleLocalizedModifications.SelectMany(kv => kv.Value).First());

            //But that we can still read modifications from other protein XMLs that exist
            Assert.AreEqual(0, ProteinDbLoader.GetPtmListFromProteinXml(Path.Combine(TestContext.CurrentContext.TestDirectory, "xml.xml")).Count);
        }
        public static List <AnalysisResults> AnalyzeResultsFolder(string folder, IEnumerable <Modification> commonMods)
        {
            // setup
            var xmlMods = ProteinDbLoader.GetPtmListFromProteinXml(Task2GptmdDatabase(folder));
            Dictionary <string, Modification> mods = GetModificationDictWithMotifs(commonMods.Concat(xmlMods));

            // read PSM results
            var results = new List <AnalysisResults>();

            string[] files = new[] { Task1SearchPSMs(folder), Task1SearchPeptides(folder), Task3SearchPSMs(folder), Task3SearchPeptides(folder) };
            foreach (var file in files)
            {
                using (var stream = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.Read))
                {
                    Stream fileStream = file.EndsWith("gz") ? // allow for .bgz and .tgz, which are (rarely) used
                                        (Stream)(new GZipStream(stream, CompressionMode.Decompress)) :
                                        stream;

                    StreamReader       fasta   = new StreamReader(fileStream);
                    List <string>      columns = null;
                    List <PsmFromText> targets = new List <PsmFromText>();
                    List <PsmFromText> decoys  = new List <PsmFromText>();

                    while (true)
                    {
                        string line = "";
                        line = fasta.ReadLine();
                        if (line == null)
                        {
                            break;
                        }

                        if (line.StartsWith("File Name")) // header
                        {
                            columns = line.Split('\t').ToList();
                        }
                        else
                        {
                            PsmFromText psm = new PsmFromText(columns, line.Split('\t').ToList(), FdrCutoff, mods);
                            if (psm.QValue < FdrCutoff)
                            {
                                (psm.IsDecoy ? decoys : targets).Add(psm);
                            }
                            else
                            {
                                break;
                            }
                        }
                    }

                    AnalysisResults result = new AnalysisResults(file, targets, decoys);
                    result.WriteToConsole();
                    results.Add(result);
                }
            }
            return(results);
        }
        /// <summary>
        /// Requires at least one ProteinDatabase input file and one input file listing modifications.
        /// </summary>
        /// <returns></returns>
        public bool ready_to_make_database(string current_directory)
        {
            Loaders.LoadElements(Path.Combine(current_directory, "elements.dat"));
            List <InputFile> proteinDbs = SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.ProteinDatabase).ToList();

            return(proteinDbs.Count > 0 &&
                   (proteinDbs.Any(file => ProteinDbLoader.GetPtmListFromProteinXml(file.complete_path).Count > 0) ||
                    SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.PtmList).Count() > 0));
        }
Beispiel #4
0
        /// <summary>
        /// Requires at least one ProteinDatabase input file and one input file listing modifications.
        /// </summary>
        /// <returns></returns>
        public bool ready_to_make_database(string current_directory)
        {
            Loaders.LoadElements();
            List <InputFile> proteinDbs = Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList();

            return(proteinDbs.Count > 0 &&
                   (proteinDbs.Any(file => file.extension == ".xml" && ProteinDbLoader.GetPtmListFromProteinXml(file.complete_path).Count > 0) ||
                    Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.PtmList).Count() > 0));
        }
Beispiel #5
0
        public void get_theoretical_proteoforms(string current_directory)
        {
            //Clear out data from potential previous runs
            foreach (ProteoformCommunity community in Sweet.lollipop.decoy_proteoform_communities.Values)
            {
                community.theoretical_proteoforms = new TheoreticalProteoform[0];
            }

            theoretical_proteins.Clear();

            //Read the UniProt-XML and ptmlist
            List <Modification> all_known_modifications = get_mods(current_directory);

            Parallel.ForEach(Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
            {
                if (database.extension == ".xml")
                {
                    lock (theoretical_proteins)
                        theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, true, DecoyType.None, all_known_modifications, database.ContaminantDB, Sweet.lollipop.mod_types_to_exclude, out Dictionary <string, Modification> um).ToArray());
                    lock (all_known_modifications) all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).Where(m => !Sweet.lollipop.mod_types_to_exclude.Contains(m.ModificationType)));
                }
                else if (database.extension == ".fasta")
                {
                    lock (theoretical_proteins)
                        theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinFasta(database.complete_path, true, DecoyType.None, database.ContaminantDB, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex,
                                                                                            ProteinDbLoader.UniprotOrganismRegex, out var dbErrors).ToArray());
                }
            });

            Sweet.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);

            unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List <Modification> {
                new Ptm().modification
            }));
            load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));


            //this is for ptmsets --> used in RELATIONS
            all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList();
            for (int i = 2; i <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); i++) // the method above doesn't make 2 or more of a kind, so we make it here
            {
                all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2)));
            }

            //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
            possible_ptmset_dictionary = make_ptmset_dictionary();
            make_theoretical_proteoforms();
        }
Beispiel #6
0
        public void Test_getptms_from_mzLibxml_without_prep()
        {
            List <Modification> ok = ProteinDbLoader.GetPtmListFromProteinXml(Path.Combine(TestContext.CurrentContext.TestDirectory, @"cRAP_databaseGPTMD.xml"));

            Assert.AreEqual(70, ok.Count);
        }
        public void get_theoretical_proteoforms(string current_directory)
        {
            if (!ready_to_make_database(current_directory))
            {
                return;
            }

            //Clear out data from potential previous runs
            foreach (ProteoformCommunity community in SaveState.lollipop.decoy_proteoform_communities.Values)
            {
                community.theoretical_proteoforms = new TheoreticalProteoform[0];
            }
            theoretical_proteins.Clear();

            //Read the UniProt-XML and ptmlist
            List <ModificationWithLocation> all_known_modifications = SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.PtmList).SelectMany(file => PtmListLoader.ReadModsFromFile(file.complete_path)).ToList();

            uniprotModifications = make_modification_dictionary(all_known_modifications);

            Dictionary <string, Modification> um;

            Parallel.ForEach(SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
            {
                lock (theoretical_proteins) theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, false, all_known_modifications, database.ContaminantDB, SaveState.lollipop.mod_types_to_exclude, out um).ToArray());
                lock (all_known_modifications) all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).OfType <ModificationWithLocation>().Where(m => !SaveState.lollipop.mod_types_to_exclude.Contains(m.modificationType)));
            });

            foreach (string filename in Directory.GetFiles(Path.Combine(current_directory, "Mods")))
            {
                var new_mods = !filename.EndsWith("variable.txt") || SaveState.lollipop.methionine_oxidation ?
                               PtmListLoader.ReadModsFromFile(filename) :
                               new List <ModificationWithLocation>(); // Empty variable modifications if not selected
                if (filename.EndsWith("variable.txt"))
                {
                    variableModifications = new_mods.OfType <ModificationWithMass>().ToList();
                }
                if (filename.EndsWith("intact_mods.txt"))
                {
                    List <double> old_mods = all_known_modifications.OfType <ModificationWithMass>().Select(m => m.monoisotopicMass).ToList();
                    new_mods = new_mods.OfType <ModificationWithMass>().Where(m => !old_mods.Contains(m.monoisotopicMass)); // get rid of the unlocalized mods if they're already present
                }
                all_known_modifications.AddRange(new_mods);
            }

            all_known_modifications = new HashSet <ModificationWithLocation>(all_known_modifications).ToList();
            uniprotModifications    = make_modification_dictionary(all_known_modifications);
            all_mods_with_mass      = uniprotModifications.SelectMany(kv => kv.Value).OfType <ModificationWithMass>().Concat(variableModifications).ToList();
            SaveState.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);

            unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List <ModificationWithMass> {
                new Ptm().modification
            }));
            load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

            //Generate all two-member sets and all three-member (or greater) sets of the same modification (three-member combinitorics gets out of hand for assignment)
            all_possible_ptmsets = PtmCombos.generate_all_ptmsets(Math.Min(2, SaveState.lollipop.max_ptms), all_mods_with_mass, SaveState.lollipop.modification_ranks, SaveState.lollipop.mod_rank_first_quartile / 2).ToList();
            for (int i = 2; i < SaveState.lollipop.max_ptms + 1; i++)
            {
                all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), SaveState.lollipop.modification_ranks, SaveState.lollipop.mod_rank_first_quartile / 2)));
            }

            //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
            possible_ptmset_dictionary = make_ptmset_dictionary();

            expanded_proteins = expand_protein_entries(theoretical_proteins.Values.SelectMany(p => p).ToArray());
            aaIsotopeMassList = new AminoAcidMasses(SaveState.lollipop.carbamidomethylation, SaveState.lollipop.natural_lysine_isotope_abundance, SaveState.lollipop.neucode_light_lysine, SaveState.lollipop.neucode_heavy_lysine).AA_Masses;
            if (SaveState.lollipop.combine_identical_sequences)
            {
                expanded_proteins = group_proteins_by_sequence(expanded_proteins);
            }

            expanded_proteins = expanded_proteins.OrderBy(x => x.OneBasedPossibleLocalizedModifications.Count).ToArray(); // Take on harder problems first to use parallelization more effectively
            process_entries(expanded_proteins, variableModifications);
            process_decoys(expanded_proteins, variableModifications);

            if (SaveState.lollipop.combine_theoretical_proteoforms_byMass)
            {
                SaveState.lollipop.target_proteoform_community.theoretical_proteoforms = group_proteoforms_by_mass(SaveState.lollipop.target_proteoform_community.theoretical_proteoforms);
                foreach (ProteoformCommunity community in SaveState.lollipop.decoy_proteoform_communities.Values)
                {
                    community.theoretical_proteoforms = group_proteoforms_by_mass(community.theoretical_proteoforms);
                }
            }
        }
Beispiel #8
0
        public void get_theoretical_proteoforms(string current_directory)
        {
            if (!ready_to_make_database(current_directory))
            {
                return;
            }

            //Clear out data from potential previous runs
            foreach (ProteoformCommunity community in Sweet.lollipop.decoy_proteoform_communities.Values)
            {
                community.theoretical_proteoforms = new TheoreticalProteoform[0];
            }

            theoretical_proteins.Clear();

            //Read the UniProt-XML and ptmlist
            var psiModDeserialized = Loaders.LoadPsiMod(Path.Combine(current_directory, "Mods", "PSI-MOD.obo.xml"));
            Dictionary <string, int> formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized);

            List <Modification> all_known_modifications = Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.PtmList)
                                                          .SelectMany(file => PtmListLoader.ReadModsFromFile(file.complete_path, formalChargesDictionary, out List <(Modification, string)> filteredModificationsWithWarnings))
                                                          .ToList();

            uniprotModifications = make_modification_dictionary(all_known_modifications);
            Parallel.ForEach(Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
            {
                if (database.extension == ".xml")
                {
                    lock (theoretical_proteins)
                        theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, true, DecoyType.None, all_known_modifications, database.ContaminantDB, Sweet.lollipop.mod_types_to_exclude, out Dictionary <string, Modification> um).ToArray());
                    lock (all_known_modifications) all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).Where(m => !Sweet.lollipop.mod_types_to_exclude.Contains(m.ModificationType)));
                }
                else if (database.extension == ".fasta")
                {
                    lock (theoretical_proteins)
                        theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinFasta(database.complete_path, true, DecoyType.None, database.ContaminantDB, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex,
                                                                                            ProteinDbLoader.UniprotOrganismRegex, out var dbErrors).ToArray());
                }
            });

            foreach (string filename in Directory.GetFiles(Path.Combine(current_directory, "Mods")))
            {
                List <Modification> new_mods = !filename.EndsWith("variable.txt") || Sweet.lollipop.methionine_oxidation ?
                                               PtmListLoader.ReadModsFromFile(filename, formalChargesDictionary, out List <(Modification, string)> filteredModificationsWithWarnings).ToList() :
                                               new List <Modification>(); // Empty variable modifications if not selected
                if (filename.EndsWith("variable.txt"))
                {
                    variableModifications = new_mods;
                }
                all_known_modifications.AddRange(new_mods);
            }

            all_known_modifications = new HashSet <Modification>(all_known_modifications).ToList();
            uniprotModifications    = make_modification_dictionary(all_known_modifications);

            all_mods_with_mass = uniprotModifications.SelectMany(kv => kv.Value).Concat(variableModifications).ToList();
            Sweet.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);

            unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List <Modification> {
                new Ptm().modification
            }));
            load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

            //this is for ptmsets --> used in RELATIONS
            all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList();
            for (int i = 2; i <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); i++) // the method above doesn't make 2 or more of a kind, so we make it here
            {
                all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2)));
            }

            //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
            possible_ptmset_dictionary = make_ptmset_dictionary();
            make_theoretical_proteoforms();
        }