/// <summary>
/// Round-trips a protein carrying one localized modification through
/// ProteinDbWriter.WriteXmlDatabase and ProteinDbLoader.LoadProteinXML, then checks that
/// the modification survives and that GetPtmListFromProteinXml reports it for that file.
/// </summary>
public void Modification_read_write_into_proteinDb()
{
    string testDirectory = TestContext.CurrentContext.TestDirectory;
    string writtenXmlPath = Path.Combine(testDirectory, "test_modifications_with_proteins.xml");

    // Arrange: load elements and the sample modification list
    Loaders.LoadElements(Path.Combine(testDirectory, "elements2.dat"));
    var sampleModList = PtmListLoader.ReadModsFromFile(Path.Combine(testDirectory, "z.txt")).ToList();
    Assert.AreEqual(1, sampleModList.OfType<ModificationWithMass>().Count());

    // Attach the sample modifications at one-based position 2
    var localizedMods = new Dictionary<int, List<Modification>>
    {
        { 2, sampleModList.OfType<Modification>().ToList() }
    };
    Protein protein = new Protein(
        "MCSSSSSSSSSS",
        "accession",
        "organism",
        new List<Tuple<string, string>>(),
        localizedMods,
        null,
        "name",
        "full_name",
        false,
        false,
        new List<DatabaseReference>(),
        new List<SequenceVariation>(),
        new List<DisulfideBond>());
    Assert.AreEqual(1, protein.OneBasedPossibleLocalizedModifications[2].OfType<ModificationWithMass>().Count());

    // Act: write the database and read it back
    ProteinDbWriter.WriteXmlDatabase(
        new Dictionary<string, HashSet<Tuple<int, Modification>>>(),
        new List<Protein> { protein },
        writtenXmlPath);
    List<Protein> new_proteins = ProteinDbLoader.LoadProteinXML(
        writtenXmlPath,
        true,
        DecoyType.None,
        new List<Modification>(),
        false,
        new List<string>(),
        out Dictionary<string, Modification> um);

    // Assert: exactly one protein with exactly one modification, unchanged
    Assert.AreEqual(1, new_proteins.Count);
    var reloadedMods = new_proteins[0].OneBasedPossibleLocalizedModifications;
    Assert.AreEqual(1, reloadedMods.Count);
    Assert.AreEqual(1, reloadedMods.SelectMany(kv => kv.Value).Count());
    Assert.AreEqual("Type", reloadedMods.SelectMany(kv => kv.Value).OfType<ModificationWithMass>().First().modificationType);
    Assert.AreEqual("Palmitoylation of C", reloadedMods[2][0].id);
    Assert.AreEqual(1, reloadedMods[2].OfType<ModificationWithMass>().Count());

    // Check that Modifications were saved after last load
    Assert.AreEqual(1, ProteinDbLoader.GetPtmListFromProteinXml(writtenXmlPath).Count);
    Assert.True(ProteinDbLoader.GetPtmListFromProteinXml(writtenXmlPath)[0] == reloadedMods.SelectMany(kv => kv.Value).First());

    // But that we can still read modifications from other protein XMLs that exist
    Assert.AreEqual(0, ProteinDbLoader.GetPtmListFromProteinXml(Path.Combine(testDirectory, "xml.xml")).Count);
}
/// <summary>
/// Reads the four tab-separated search result files of a results folder (Task1/Task3 PSMs and
/// peptides) and summarizes the targets and decoys found below <c>FdrCutoff</c> in each.
/// </summary>
/// <param name="folder">Results folder; the individual file paths are derived from it by the Task* helpers.</param>
/// <param name="commonMods">Modifications combined with those listed in the folder's GPTMD database XML.</param>
/// <returns>One <c>AnalysisResults</c> per file, in the order the files were read. Each result is also written to the console.</returns>
public static List<AnalysisResults> AnalyzeResultsFolder(string folder, IEnumerable<Modification> commonMods)
{
    // setup
    var xmlMods = ProteinDbLoader.GetPtmListFromProteinXml(Task2GptmdDatabase(folder));
    Dictionary<string, Modification> mods = GetModificationDictWithMotifs(commonMods.Concat(xmlMods));

    // read PSM results
    var results = new List<AnalysisResults>();
    string[] files = new[]
    {
        Task1SearchPSMs(folder),
        Task1SearchPeptides(folder),
        Task3SearchPSMs(folder),
        Task3SearchPeptides(folder)
    };

    foreach (var file in files)
    {
        List<string> columns = null;
        List<PsmFromText> targets = new List<PsmFromText>();
        List<PsmFromText> decoys = new List<PsmFromText>();

        // Dispose the decompression stream and the reader as well as the raw file stream.
        // Disposing the same FileStream more than once (uncompressed case) is harmless.
        using (var stream = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.Read))
        using (Stream fileStream = file.EndsWith("gz", System.StringComparison.Ordinal) // allow for .bgz and .tgz, which are (rarely) used
            ? (Stream)new GZipStream(stream, CompressionMode.Decompress)
            : stream)
        using (var reader = new StreamReader(fileStream))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (line.StartsWith("File Name", System.StringComparison.Ordinal)) // header
                {
                    columns = line.Split('\t').ToList();
                    continue;
                }

                // NOTE(review): columns is still null here if a data row precedes the header line;
                // how PsmFromText handles that is not visible from this method — confirm.
                PsmFromText psm = new PsmFromText(columns, line.Split('\t').ToList(), FdrCutoff, mods);
                if (psm.QValue < FdrCutoff)
                {
                    (psm.IsDecoy ? decoys : targets).Add(psm);
                }
                else
                {
                    // Reading stops at the first row that is not below the cutoff
                    // (presumably rows are ordered by q-value — confirm).
                    break;
                }
            }
        }

        AnalysisResults result = new AnalysisResults(file, targets, decoys);
        result.WriteToConsole();
        results.Add(result);
    }

    return results;
}
/// <summary>
/// Checks whether a theoretical database can be built: there must be at least one
/// ProteinDatabase input file, and modifications must be available either from one of those
/// databases or from at least one PtmList input file.
/// </summary>
/// <param name="current_directory">Directory that contains "elements.dat", which is loaded first.</param>
/// <returns>True when both a protein database and a source of modifications are present.</returns>
public bool ready_to_make_database(string current_directory)
{
    Loaders.LoadElements(Path.Combine(current_directory, "elements.dat"));

    List<InputFile> proteinDbs = SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.ProteinDatabase).ToList();
    if (proteinDbs.Count == 0)
    {
        return false;
    }

    // Modifications embedded in any of the protein databases are enough
    if (proteinDbs.Any(file => ProteinDbLoader.GetPtmListFromProteinXml(file.complete_path).Count > 0))
    {
        return true;
    }

    // Otherwise a separate modification list file is required
    return SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.PtmList).Any();
}
/// <summary>
/// Checks whether a theoretical database can be built: there must be at least one
/// ProteinDatabase input file, and modifications must be available either from an ".xml"
/// protein database or from at least one PtmList input file.
/// </summary>
/// <param name="current_directory">Not read by this method; elements are loaded via the parameterless Loaders.LoadElements().</param>
/// <returns>True when both a protein database and a source of modifications are present.</returns>
public bool ready_to_make_database(string current_directory)
{
    Loaders.LoadElements();

    List<InputFile> proteinDbs = Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList();
    if (proteinDbs.Count == 0)
    {
        return false;
    }

    // Only XML databases are queried for embedded modifications
    bool anyDatabaseHasPtms = proteinDbs.Any(file =>
        file.extension == ".xml"
        && ProteinDbLoader.GetPtmListFromProteinXml(file.complete_path).Count > 0);
    if (anyDatabaseHasPtms)
    {
        return true;
    }

    // Otherwise a separate modification list file is required
    return Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.PtmList).Any();
}
/// <summary>
/// Rebuilds the theoretical database: clears previous results, loads every ProteinDatabase
/// input file (".xml" or ".fasta"), ranks modifications, generates the candidate PTM sets and
/// their lookup table, and finally creates the theoretical proteoforms.
/// </summary>
/// <param name="current_directory">Passed to get_mods to obtain the initial list of known modifications.</param>
public void get_theoretical_proteoforms(string current_directory)
{
    //Clear out data from potential previous runs
    foreach (ProteoformCommunity community in Sweet.lollipop.decoy_proteoform_communities.Values)
    {
        community.theoretical_proteoforms = new TheoreticalProteoform[0];
    }
    theoretical_proteins.Clear();

    //Read the UniProt-XML and ptmlist
    List<Modification> all_known_modifications = get_mods(current_directory);
    Parallel.ForEach(Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
    {
        if (database.extension == ".xml")
        {
            // One lock guards both collections: LoadProteinXML is handed all_known_modifications,
            // so it must not run while another worker is appending to that list.
            lock (theoretical_proteins)
            {
                theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, true, DecoyType.None, all_known_modifications, database.ContaminantDB, Sweet.lollipop.mod_types_to_exclude, out Dictionary<string, Modification> um).ToArray());
                all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).Where(m => !Sweet.lollipop.mod_types_to_exclude.Contains(m.ModificationType)));
            }
        }
        else if (database.extension == ".fasta")
        {
            // NOTE(review): UniprotFullNameRegex is passed for two consecutive regex arguments — confirm this is intended.
            lock (theoretical_proteins)
            {
                theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinFasta(database.complete_path, true, DecoyType.None, database.ContaminantDB, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors).ToArray());
            }
        }
    });

    Sweet.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);
    unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List<Modification> { new Ptm().modification }));
    // NOTE(review): this path is based on Environment.CurrentDirectory rather than current_directory — confirm.
    load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

    //this is for ptmsets --> used in RELATIONS
    all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList();
    for (int i = 2; i <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); i++) // the method above doesn't make 2 or more of a kind, so we make it here
    {
        all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2)));
    }

    //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
    possible_ptmset_dictionary = make_ptmset_dictionary();

    make_theoretical_proteoforms();
}
/// <summary>
/// Reads the modification list straight from "cRAP_databaseGPTMD.xml" in the test directory,
/// without loading the proteins first, and expects 70 modifications.
/// </summary>
public void Test_getptms_from_mzLibxml_without_prep()
{
    string databasePath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"cRAP_databaseGPTMD.xml");

    List<Modification> modifications = ProteinDbLoader.GetPtmListFromProteinXml(databasePath);

    Assert.AreEqual(70, modifications.Count);
}
/// <summary>
/// Rebuilds the theoretical database: clears previous results, loads modifications and protein
/// XML databases, merges in the modification files from the "Mods" folder, generates candidate
/// PTM sets, expands the protein entries and processes targets and decoys.
/// Does nothing when ready_to_make_database reports that the required inputs are missing.
/// </summary>
/// <param name="current_directory">Directory passed to ready_to_make_database and containing the "Mods" folder.</param>
public void get_theoretical_proteoforms(string current_directory)
{
    if (!ready_to_make_database(current_directory))
    {
        return;
    }

    //Clear out data from potential previous runs
    foreach (ProteoformCommunity community in SaveState.lollipop.decoy_proteoform_communities.Values)
    {
        community.theoretical_proteoforms = new TheoreticalProteoform[0];
    }
    theoretical_proteins.Clear();

    //Read the UniProt-XML and ptmlist
    List<ModificationWithLocation> all_known_modifications = SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.PtmList).SelectMany(file => PtmListLoader.ReadModsFromFile(file.complete_path)).ToList();
    uniprotModifications = make_modification_dictionary(all_known_modifications);
    Parallel.ForEach(SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
    {
        // One lock guards both collections: LoadProteinXML is handed all_known_modifications,
        // so it must not run while another worker is appending to that list.
        lock (theoretical_proteins)
        {
            // Kept local to the worker so parallel iterations do not share one out-variable
            Dictionary<string, Modification> um;
            theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, false, all_known_modifications, database.ContaminantDB, SaveState.lollipop.mod_types_to_exclude, out um).ToArray());
            all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).OfType<ModificationWithLocation>().Where(m => !SaveState.lollipop.mod_types_to_exclude.Contains(m.modificationType)));
        }
    });

    foreach (string filename in Directory.GetFiles(Path.Combine(current_directory, "Mods")))
    {
        var new_mods = !filename.EndsWith("variable.txt") || SaveState.lollipop.methionine_oxidation ?
            PtmListLoader.ReadModsFromFile(filename) :
            new List<ModificationWithLocation>(); // Empty variable modifications if not selected

        if (filename.EndsWith("variable.txt"))
        {
            variableModifications = new_mods.OfType<ModificationWithMass>().ToList();
        }

        if (filename.EndsWith("intact_mods.txt"))
        {
            // Set lookup instead of a list scan for every candidate modification
            HashSet<double> old_masses = new HashSet<double>(all_known_modifications.OfType<ModificationWithMass>().Select(m => m.monoisotopicMass));
            new_mods = new_mods.OfType<ModificationWithMass>().Where(m => !old_masses.Contains(m.monoisotopicMass)); // get rid of the unlocalized mods if they're already present
        }

        all_known_modifications.AddRange(new_mods);
    }

    // Remove duplicates before building the final modification dictionary
    all_known_modifications = new HashSet<ModificationWithLocation>(all_known_modifications).ToList();
    uniprotModifications = make_modification_dictionary(all_known_modifications);
    all_mods_with_mass = uniprotModifications.SelectMany(kv => kv.Value).OfType<ModificationWithMass>().Concat(variableModifications).ToList();
    SaveState.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);
    unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List<ModificationWithMass> { new Ptm().modification }));
    // NOTE(review): this path is based on Environment.CurrentDirectory rather than current_directory — confirm.
    load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

    //Generate all two-member sets and all three-member (or greater) sets of the same modification (three-member combinatorics gets out of hand for assignment)
    all_possible_ptmsets = PtmCombos.generate_all_ptmsets(Math.Min(2, SaveState.lollipop.max_ptms), all_mods_with_mass, SaveState.lollipop.modification_ranks, SaveState.lollipop.mod_rank_first_quartile / 2).ToList();
    for (int i = 2; i < SaveState.lollipop.max_ptms + 1; i++)
    {
        all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), SaveState.lollipop.modification_ranks, SaveState.lollipop.mod_rank_first_quartile / 2)));
    }

    //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
    possible_ptmset_dictionary = make_ptmset_dictionary();

    expanded_proteins = expand_protein_entries(theoretical_proteins.Values.SelectMany(p => p).ToArray());
    aaIsotopeMassList = new AminoAcidMasses(SaveState.lollipop.carbamidomethylation, SaveState.lollipop.natural_lysine_isotope_abundance, SaveState.lollipop.neucode_light_lysine, SaveState.lollipop.neucode_heavy_lysine).AA_Masses;
    if (SaveState.lollipop.combine_identical_sequences)
    {
        expanded_proteins = group_proteins_by_sequence(expanded_proteins);
    }

    expanded_proteins = expanded_proteins.OrderBy(x => x.OneBasedPossibleLocalizedModifications.Count).ToArray(); // Take on harder problems first to use parallelization more effectively
    process_entries(expanded_proteins, variableModifications);
    process_decoys(expanded_proteins, variableModifications);

    if (SaveState.lollipop.combine_theoretical_proteoforms_byMass)
    {
        SaveState.lollipop.target_proteoform_community.theoretical_proteoforms = group_proteoforms_by_mass(SaveState.lollipop.target_proteoform_community.theoretical_proteoforms);
        foreach (ProteoformCommunity community in SaveState.lollipop.decoy_proteoform_communities.Values)
        {
            community.theoretical_proteoforms = group_proteoforms_by_mass(community.theoretical_proteoforms);
        }
    }
}
/// <summary>
/// Rebuilds the theoretical database: clears previous results, loads modifications (using the
/// formal charges taken from PSI-MOD) and every ProteinDatabase input file (".xml" or ".fasta"),
/// merges in the modification files from the "Mods" folder, generates candidate PTM sets and
/// their lookup table, and finally creates the theoretical proteoforms.
/// Does nothing when ready_to_make_database reports that the required inputs are missing.
/// </summary>
/// <param name="current_directory">Directory passed to ready_to_make_database and containing the "Mods" folder.</param>
public void get_theoretical_proteoforms(string current_directory)
{
    if (!ready_to_make_database(current_directory))
    {
        return;
    }

    //Clear out data from potential previous runs
    foreach (ProteoformCommunity community in Sweet.lollipop.decoy_proteoform_communities.Values)
    {
        community.theoretical_proteoforms = new TheoreticalProteoform[0];
    }
    theoretical_proteins.Clear();

    //Read the UniProt-XML and ptmlist
    var psiModDeserialized = Loaders.LoadPsiMod(Path.Combine(current_directory, "Mods", "PSI-MOD.obo.xml"));
    Dictionary<string, int> formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized);
    List<Modification> all_known_modifications = Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.PtmList)
        .SelectMany(file => PtmListLoader.ReadModsFromFile(file.complete_path, formalChargesDictionary, out List<(Modification, string)> filteredModificationsWithWarnings))
        .ToList();
    uniprotModifications = make_modification_dictionary(all_known_modifications);

    Parallel.ForEach(Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
    {
        if (database.extension == ".xml")
        {
            // One lock guards both collections: LoadProteinXML is handed all_known_modifications,
            // so it must not run while another worker is appending to that list.
            lock (theoretical_proteins)
            {
                theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, true, DecoyType.None, all_known_modifications, database.ContaminantDB, Sweet.lollipop.mod_types_to_exclude, out Dictionary<string, Modification> um).ToArray());
                all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).Where(m => !Sweet.lollipop.mod_types_to_exclude.Contains(m.ModificationType)));
            }
        }
        else if (database.extension == ".fasta")
        {
            // NOTE(review): UniprotFullNameRegex is passed for two consecutive regex arguments — confirm this is intended.
            lock (theoretical_proteins)
            {
                theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinFasta(database.complete_path, true, DecoyType.None, database.ContaminantDB, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors).ToArray());
            }
        }
    });

    foreach (string filename in Directory.GetFiles(Path.Combine(current_directory, "Mods")))
    {
        List<Modification> new_mods = !filename.EndsWith("variable.txt") || Sweet.lollipop.methionine_oxidation ?
            PtmListLoader.ReadModsFromFile(filename, formalChargesDictionary, out List<(Modification, string)> filteredModificationsWithWarnings).ToList() :
            new List<Modification>(); // Empty variable modifications if not selected

        if (filename.EndsWith("variable.txt"))
        {
            variableModifications = new_mods;
        }

        all_known_modifications.AddRange(new_mods);
    }

    // Remove duplicates before building the final modification dictionary
    all_known_modifications = new HashSet<Modification>(all_known_modifications).ToList();
    uniprotModifications = make_modification_dictionary(all_known_modifications);
    all_mods_with_mass = uniprotModifications.SelectMany(kv => kv.Value).Concat(variableModifications).ToList();
    Sweet.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);
    unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List<Modification> { new Ptm().modification }));
    // NOTE(review): this path is based on Environment.CurrentDirectory rather than current_directory — confirm.
    load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

    //this is for ptmsets --> used in RELATIONS
    all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList();
    for (int i = 2; i <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); i++) // the method above doesn't make 2 or more of a kind, so we make it here
    {
        all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2)));
    }

    //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
    possible_ptmset_dictionary = make_ptmset_dictionary();

    make_theoretical_proteoforms();
}