public static void TestReverseDecoyFasta() { List <Protein> prots = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"fasta.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_fullName_expression, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_gene_expression, ProteinDbLoader.uniprot_organism_expression, out var a); Assert.AreEqual("MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKRISGLIYEETRGVLKVFLENVIRDAVTYTEHAKRKTVTAMDVVYALKRQGRTLYGFGG", prots[0].BaseSequence); Assert.AreEqual("MGGFGYLTRGQRKLAYVVDMATVTKRKAHETYTVADRIVNELFVKLVGRTEEYILGSIRKVGGRRALRRIAPKTIGQINDRLVKRHRKAGGKGLGKGGKGRGS", prots[1].BaseSequence); }
public void Test_read_xml_write_read_fasta() { ModificationMotif.TryGetMotif("X", out ModificationMotif motif); var nice = new List <Modification> { new Modification("fayk", null, "mt", null, motif, "Anywhere.", null, null, null, null, null, null, null, null) }; List <Protein> ok = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"xml2.xml"), true, DecoyType.None, nice, false, null, out Dictionary <string, Modification> un); ProteinDbWriter.WriteFastaDatabase(ok, Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"rewrite_xml_test.fasta"), "|"); List <Protein> ok2 = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"rewrite_xml_test.fasta"), true, DecoyType.None, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var b); Assert.AreEqual(ok.Count, ok2.Count); Assert.True(Enumerable.Range(0, ok.Count).All(i => ok[i].BaseSequence == ok2[i].BaseSequence)); Assert.True(Enumerable.Range(0, ok.Count).All(i => ok[i].Name == ok2[i].Name)); Assert.True(Enumerable.Range(0, ok.Count).All(i => ok[i].Organism == ok2[i].Organism)); Assert.True(Enumerable.Range(0, ok.Count).All(i => ok[i].GeneNames.First().Item2 == ok2[i].GeneNames.First().Item2)); Assert.True(ok.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedBeginPosition == null || prod.OneBasedBeginPosition > 0 && prod.OneBasedBeginPosition <= p.Length))); Assert.True(ok.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedEndPosition == null || prod.OneBasedEndPosition > 0 && prod.OneBasedEndPosition <= p.Length))); Assert.True(ok2.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedBeginPosition == null || prod.OneBasedBeginPosition > 0 && prod.OneBasedBeginPosition <= p.Length))); Assert.True(ok2.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedEndPosition == null || prod.OneBasedEndPosition > 0 && prod.OneBasedEndPosition <= p.Length))); }
public static void FastaTest() { List <Protein> prots = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"fasta.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var a); ProteinDbWriter.WriteFastaDatabase(prots, Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"rewrite_fasta.fasta"), "|"); List <Protein> prots2 = ProteinDbLoader.LoadProteinFasta( Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"rewrite_fasta.fasta"), true, DecoyType.None, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var un); Assert.AreEqual("P62805", prots.First().Accession); Assert.AreEqual("H4_HUMAN", prots.First().Name); Assert.AreEqual("Histone H4", prots.First().FullName); Assert.AreEqual("HIST1H4A", prots.First().GeneNames.First().Item2); Assert.AreEqual("H**o sapiens", prots.First().Organism); Assert.AreEqual("P62805", prots2.First().Accession); Assert.AreEqual("H4_HUMAN", prots2.First().Name); Assert.AreEqual("Histone H4", prots2.First().FullName); Assert.AreEqual("HIST1H4A", prots2.First().GeneNames.First().Item2); Assert.AreEqual("H**o sapiens", prots2.First().Organism); }
public static void TestSlideDecoyFasta() { List <Protein> prots = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"fasta.fasta"), true, DecoyType.Slide, false, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_fullName_expression, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_gene_expression, ProteinDbLoader.uniprot_organism_expression, out var a); Assert.AreEqual("MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKRISGLIYEETRGVLKVFLENVIRDAVTYTEHAKRKTVTAMDVVYALKRQGRTLYGFGG", prots[0].BaseSequence); Assert.AreEqual("MVRRRNAQGIGKGAGRKLRRSGGVGRGSKLLYKEGRKVHKKFLEDVIRGATTPTIHRKAKRVGAKDIVGAIKEQTRGLLGVGLGNFIYDTVGYRELAYRVTMT", prots[1].BaseSequence); }
public static void FastaTest() { List <Protein> prots = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"fasta.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_fullName_expression, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_gene_expression); Assert.AreEqual("P62805", prots.First().Accession); Assert.AreEqual("H4_HUMAN Histone H4", prots.First().FullName); Assert.AreEqual("HIST1H4A", prots.First().GeneNames.First().Item2); }
public static void IsoformReadTest() { string filepath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"DatabaseTests\isoformTest.fasta"); var proteinList = ProteinDbLoader.LoadProteinFasta(filepath, true, DecoyType.None, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors); Assert.AreNotEqual(proteinList[2].Accession, proteinList[4].Accession); }
public static void ReplaceBadStdevTwo() { //here we are adding a really hydrophobic psm at the same time as a regular peptide so that there is a big difference in their computed hydrophobicities. The stdev of these hydrophobicities is out of whach the the collective and so it needs to get replaced by the global average var variableModifications = new List <Modification>(); var fixedModifications = new List <Modification>(); var origDataFile = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML"); MyFileManager myFileManager = new MyFileManager(true); CommonParameters CommonParameters = new CommonParameters(digestionParams: new DigestionParams()); var myMsDataFile = myFileManager.LoadFile(origDataFile, CommonParameters); var searchModes = new SinglePpmAroundZeroSearchMode(5); List <Protein> proteinList = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\hela_snip_for_unitTest.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors, -1); var listOfSortedms2Scans = MetaMorpheusTask.GetMs2Scans(myMsDataFile, @"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML", CommonParameters).OrderBy(b => b.PrecursorMass).ToArray(); //adding a new scan that creates a psm at an isolated retention time. This will ultimately cause PEP to replace its retention time standard deviation "Z-score" with the global average. Ms2ScanWithSpecificMass topMs2Scan = listOfSortedms2Scans[395]; int newOneBasedScanNumber = 1000; MzRange range = new MzRange(topMs2Scan.TheScan.MassSpectrum.XArray.Min(), topMs2Scan.TheScan.MassSpectrum.XArray.Max()); MzSpectrum mzs = new MzSpectrum(topMs2Scan.TheScan.MassSpectrum.XArray, topMs2Scan.TheScan.MassSpectrum.YArray, true); double newRetentionTime = topMs2Scan.TheScan.RetentionTime - 25; MsDataScan msd = new MsDataScan(mzs, newOneBasedScanNumber, 2, topMs2Scan.TheScan.IsCentroid, Polarity.Positive, newRetentionTime, range, "", MZAnalyzerType.Orbitrap, topMs2Scan.TheScan.TotalIonCurrent, topMs2Scan.TheScan.InjectionTime, topMs2Scan.TheScan.NoiseData, "", topMs2Scan.TheScan.SelectedIonMZ, topMs2Scan.TheScan.SelectedIonChargeStateGuess, topMs2Scan.TheScan.SelectedIonIntensity, topMs2Scan.TheScan.IsolationMz, topMs2Scan.TheScan.IsolationWidth, DissociationType.HCD, topMs2Scan.TheScan.OneBasedPrecursorScanNumber, topMs2Scan.TheScan.SelectedIonMonoisotopicGuessMz); Ms2ScanWithSpecificMass mwsm = new Ms2ScanWithSpecificMass(msd, topMs2Scan.PrecursorMonoisotopicPeakMz, topMs2Scan.PrecursorCharge, topMs2Scan.FullFilePath, new CommonParameters(), topMs2Scan.ExperimentalFragments); Ms2ScanWithSpecificMass[] extendedArray = new Ms2ScanWithSpecificMass[listOfSortedms2Scans.Length + 1]; for (int i = 0; i < listOfSortedms2Scans.Length; i++) { extendedArray[i] = listOfSortedms2Scans[i]; } extendedArray[listOfSortedms2Scans.Length] = mwsm; extendedArray = extendedArray.OrderBy(b => b.PrecursorMass).ToArray(); PeptideSpectralMatch[] allPsmsArray = new PeptideSpectralMatch[extendedArray.Length]; new ClassicSearchEngine(allPsmsArray, extendedArray, variableModifications, fixedModifications, null, null, null, proteinList, searchModes, CommonParameters, new List <string>()).Run(); List <PeptideSpectralMatch> nonNullPsms = allPsmsArray.Where(p => p != null).ToList(); nonNullPsms = nonNullPsms.OrderByDescending(p => p.Score).ToList(); List <PeptideSpectralMatch> psmBloated = new List <PeptideSpectralMatch>(); psmBloated.AddRange(nonNullPsms); int arrayMax = nonNullPsms.Count; psmBloated.AddRange(nonNullPsms.GetRange(2, arrayMax - 2)); psmBloated.AddRange(nonNullPsms.GetRange(2, arrayMax - 2)); PeptideSpectralMatch pp = psmBloated.Where(p => p.ScanRetentionTime < (newRetentionTime + 1)).First(); PeptideWithSetModifications newPwsmTwo = new PeptideWithSetModifications(new Protein("WAGVLPWFPWAAVVWGFWF", "ACCESSION", "ORGANISM"), new DigestionParams(), 1, 2, CleavageSpecificity.Full, "", 0, new Dictionary <int, Modification>(), 0); PeptideSpectralMatch newPsmTwo = new PeptideSpectralMatch(newPwsmTwo, pp.BestMatchingPeptides.First().Notch, pp.Score, pp.ScanIndex, mwsm, new DigestionParams(), pp.MatchedFragmentIons); psmBloated.Add(newPsmTwo); FdrAnalysisResults fdrResultsClassicDelta = (FdrAnalysisResults)(new FdrAnalysisEngine(psmBloated.Where(p => p != null).ToList(), 1, CommonParameters, new List <string>()).Run()); }
public static void BadFastaTest() { ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"bad4.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_fullName_expression, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_gene_expression, ProteinDbLoader.uniprot_organism_expression, out var a); Assert.AreEqual(a.Count, 1); ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"bad3.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_fullName_expression, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_gene_expression, ProteinDbLoader.uniprot_organism_expression, out var b); Assert.AreEqual(b.Count, 1); ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"blank.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_fullName_expression, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_gene_expression, ProteinDbLoader.uniprot_organism_expression, out var c); Assert.AreEqual(c.Count, 1); }
public void Test_accession_regex_weird() { Regex bad = new Regex(@"/()/"); List <Protein> ok = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"test_ensembl.pep.all.fasta"), false, false, bad, bad, bad, bad); ProteinDbWriter.WriteFastaDatabase(ok, Path.Combine(TestContext.CurrentContext.TestDirectory, @"rewrite_test_ensembl.pep.all.fasta"), " "); List <Protein> ok2 = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"rewrite_test_ensembl.pep.all.fasta"), false, false, bad, bad, bad, bad); Assert.AreEqual("ENSP00000381386 pep:known chromosome:GRCh37:22:24313554:24316773:-1 gene:ENSG00000099977 transcript:ENST00000398344 gene_biotype:protein_coding transcript_biotype:protein_coding", ok[0].Accession); Assert.AreEqual("ENSP00000381386 pep:known chromosome:GRCh37:22:24313554:24316773:-1 gene:ENSG00000099977 transcript:ENST00000398344 gene_biotype:protein_coding transcript_biotype:protein_coding", ok2[0].Accession); Assert.AreEqual(ok.Count, ok2.Count); Assert.True(Enumerable.Range(0, ok.Count).All(i => ok[i].BaseSequence == ok2[i].BaseSequence)); }
public static void RemoveAmbiguousPeptides() { var variableModifications = new List <Modification>(); var fixedModifications = new List <Modification>(); var origDataFile = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML"); MyFileManager myFileManager = new MyFileManager(true); CommonParameters CommonParameters = new CommonParameters(digestionParams: new DigestionParams()); var myMsDataFile = myFileManager.LoadFile(origDataFile, CommonParameters); var searchModes = new SinglePpmAroundZeroSearchMode(5); List <Protein> proteinList = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\hela_snip_for_unitTest.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors, -1); var listOfSortedms2Scans = MetaMorpheusTask.GetMs2Scans(myMsDataFile, @"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML", CommonParameters).OrderBy(b => b.PrecursorMass).ToArray(); PeptideSpectralMatch[] allPsmsArray = new PeptideSpectralMatch[listOfSortedms2Scans.Length]; new ClassicSearchEngine(allPsmsArray, listOfSortedms2Scans, variableModifications, fixedModifications, null, null, null, proteinList, searchModes, CommonParameters, new List <string>()).Run(); var nonNullPsms = allPsmsArray.Where(p => p != null).ToList(); nonNullPsms.OrderByDescending(p => p.Score); var maxScore = nonNullPsms.Select(n => n.Score).Max(); PeptideSpectralMatch maxScorePsm = nonNullPsms.Where(n => n.Score == maxScore).First(); Protein newProteinToRemove = new Protein("RREMVE", "BUBBA", isDecoy: false); PeptideWithSetModifications pwsmToRemove = new PeptideWithSetModifications(newProteinToRemove, new DigestionParams(), 1, 6, CleavageSpecificity.Full, "peptideDescription", 2, new Dictionary <int, Modification>(), 1, "RREMVE"); maxScorePsm.AddOrReplace(pwsmToRemove, maxScore, 1, true, maxScorePsm.MatchedFragmentIons, maxScore); maxScorePsm.ResolveAllAmbiguities(); List <PeptideSpectralMatch> psmBloated = new List <PeptideSpectralMatch>(); psmBloated.AddRange(nonNullPsms); psmBloated.AddRange(nonNullPsms.GetRange(0, nonNullPsms.Count - 2)); foreach (PeptideSpectralMatch psm in nonNullPsms.GetRange(0, nonNullPsms.Count - 2)) { Protein newDecoyProtein = new Protein(psm.BestMatchingPeptides.First().Peptide.BaseSequence + "K", "DECOY_" + psm.BestMatchingPeptides.First().Peptide.Protein.Accession, isDecoy: true); PeptideWithSetModifications pwsmDecoy = new PeptideWithSetModifications(newDecoyProtein, new DigestionParams(), 1, psm.BestMatchingPeptides.First().Peptide.BaseSequence.Length + 1, CleavageSpecificity.Full, "peptideDescription", 2, new Dictionary <int, Modification>(), 1, psm.BestMatchingPeptides.First().Peptide.BaseSequence + "K"); PeptideSpectralMatch decoyPsm = new PeptideSpectralMatch(pwsmDecoy, 1, psm.Score, psm.ScanIndex, listOfSortedms2Scans[psm.ScanIndex], new DigestionParams(), psm.MatchedFragmentIons); decoyPsm.ResolveAllAmbiguities(); psmBloated.Add(decoyPsm); } PeptideSpectralMatch oldBloatedMaxScorePsm = psmBloated.Where(n => n.Score == maxScore).First(); int countOfBestPeptidesBloatedMax = oldBloatedMaxScorePsm.BestMatchingPeptides.Count(); FdrAnalysisResults fdrResultsClassicDelta = (FdrAnalysisResults)(new FdrAnalysisEngine(psmBloated.Where(p => p != null).ToList(), 1, CommonParameters, new List <string>()).Run()); PeptideSpectralMatch newMaxScorePsm = psmBloated.Where(n => n.Score == maxScore).First(); Assert.AreEqual(countOfBestPeptidesBloatedMax - 1, newMaxScorePsm.BestMatchingPeptides.Count()); }
/// <summary> /// Ensembl coding domain sequences (CDS) sometimes don't have start or stop codons annotated. /// The only way I can figure out how to tell which they are is to read in the protein FASTA and find the ones starting with X's or containing a stop codon '*' /// </summary> /// <param name="spritzDirectory"></param> /// <param name="proteinFastaPath"></param> /// <returns></returns> public void GetImportantProteinAccessions(string spritzDirectory, string proteinFastaPath) { Regex transcriptAccession = new Regex(@"(transcript:)([A-Za-z0-9_.]+)"); // need to include transcript accessions for when a GTF file is used and transcript IDs become the protein IDs List <Protein> proteins = ProteinDbLoader.LoadProteinFasta(proteinFastaPath, true, DecoyType.None, false, ProteinDbLoader.EnsemblAccessionRegex, ProteinDbLoader.EnsemblFullNameRegex, ProteinDbLoader.EnsemblFullNameRegex, ProteinDbLoader.EnsemblGeneNameRegex, null, out List <string> errors); ProteinAccessionSequence = proteins.Select(p => new KeyValuePair <string, string>(p.Accession, p.BaseSequence)) .Concat(proteins.Select(p => new KeyValuePair <string, string>(transcriptAccession.Match(p.FullName).Groups[2].Value, p.BaseSequence))) .ToDictionary(kv => kv.Key, kv => kv.Value); HashSet <string> badOnes = new HashSet <string>(proteins.Where(p => p.BaseSequence.Contains('X') || p.BaseSequence.Contains('*')).SelectMany(p => new string[] { p.Accession, transcriptAccession.Match(p.FullName).Groups[2].Value })); BadProteinAccessions = badOnes; SelenocysteineProteinAccessions = proteins.Where(p => !badOnes.Contains(p.Accession) && p.BaseSequence.Contains('U')).ToDictionary(p => p.Accession, p => p.BaseSequence); }
public static void BadFastaTest() { ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"bad4.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var a); Assert.AreEqual(1, a.Count); ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"bad3.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var b); Assert.AreEqual(2, b.Count); ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"blank.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var c); Assert.AreEqual(1, c.Count); }
public void Test_read_write_read_fasta() { List <Protein> ok = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"test_ensembl.pep.all.fasta"), false, false, ProteinDbLoader.ensembl_accession_expression, ProteinDbLoader.ensembl_fullName_expression, ProteinDbLoader.ensembl_accession_expression, ProteinDbLoader.ensembl_gene_expression); ProteinDbWriter.WriteFastaDatabase(ok, Path.Combine(TestContext.CurrentContext.TestDirectory, @"rewrite_test_ensembl.pep.all.fasta"), " "); List <Protein> ok2 = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"rewrite_test_ensembl.pep.all.fasta"), false, false, ProteinDbLoader.ensembl_accession_expression, ProteinDbLoader.ensembl_fullName_expression, ProteinDbLoader.ensembl_accession_expression, ProteinDbLoader.ensembl_gene_expression); Assert.AreEqual(ok.Count, ok2.Count); Assert.True(Enumerable.Range(0, ok.Count).All(i => ok[i].BaseSequence == ok2[i].BaseSequence)); Assert.True(ok.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedBeginPosition == null || prod.OneBasedBeginPosition > 0 && prod.OneBasedBeginPosition <= p.Length))); Assert.True(ok.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedEndPosition == null || prod.OneBasedEndPosition > 0 && prod.OneBasedEndPosition <= p.Length))); Assert.True(ok2.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedBeginPosition == null || prod.OneBasedBeginPosition > 0 && prod.OneBasedBeginPosition <= p.Length))); Assert.True(ok2.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedEndPosition == null || prod.OneBasedEndPosition > 0 && prod.OneBasedEndPosition <= p.Length))); }
public void get_theoretical_proteoforms(string current_directory) { //Clear out data from potential previous runs foreach (ProteoformCommunity community in Sweet.lollipop.decoy_proteoform_communities.Values) { community.theoretical_proteoforms = new TheoreticalProteoform[0]; } theoretical_proteins.Clear(); //Read the UniProt-XML and ptmlist List <Modification> all_known_modifications = get_mods(current_directory); Parallel.ForEach(Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database => { if (database.extension == ".xml") { lock (theoretical_proteins) theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, true, DecoyType.None, all_known_modifications, database.ContaminantDB, Sweet.lollipop.mod_types_to_exclude, out Dictionary <string, Modification> um).ToArray()); lock (all_known_modifications) all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).Where(m => !Sweet.lollipop.mod_types_to_exclude.Contains(m.ModificationType))); } else if (database.extension == ".fasta") { lock (theoretical_proteins) theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinFasta(database.complete_path, true, DecoyType.None, database.ContaminantDB, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors).ToArray()); } }); Sweet.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass); unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List <Modification> { new Ptm().modification })); load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames")); //this is for ptmsets --> used in RELATIONS all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList(); for (int i = 2; i <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); i++) // the method above doesn't make 2 or more of a kind, so we make it here { all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2))); } //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations possible_ptmset_dictionary = make_ptmset_dictionary(); make_theoretical_proteoforms(); }
public void Test_read_write_read_fasta() { List <Protein> ok = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"test_ensembl.pep.all.fasta"), true, DecoyType.None, false, ProteinDbLoader.EnsemblAccessionRegex, ProteinDbLoader.EnsemblFullNameRegex, ProteinDbLoader.EnsemblAccessionRegex, ProteinDbLoader.EnsemblGeneNameRegex, null, out var a); ProteinDbWriter.WriteFastaDatabase(ok, Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"rewrite_test_ensembl.pep.all.fasta"), " "); List <Protein> ok2 = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"rewrite_test_ensembl.pep.all.fasta"), true, DecoyType.None, false, ProteinDbLoader.EnsemblAccessionRegex, ProteinDbLoader.EnsemblFullNameRegex, ProteinDbLoader.EnsemblAccessionRegex, ProteinDbLoader.EnsemblGeneNameRegex, null, out var b); Assert.AreEqual(ok.Count, ok2.Count); Assert.True(Enumerable.Range(0, ok.Count).All(i => ok[i].BaseSequence == ok2[i].BaseSequence)); Assert.True(ok.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedBeginPosition == null || prod.OneBasedBeginPosition > 0 && prod.OneBasedBeginPosition <= p.Length))); Assert.True(ok.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedEndPosition == null || prod.OneBasedEndPosition > 0 && prod.OneBasedEndPosition <= p.Length))); Assert.True(ok2.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedBeginPosition == null || prod.OneBasedBeginPosition > 0 && prod.OneBasedBeginPosition <= p.Length))); Assert.True(ok2.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedEndPosition == null || prod.OneBasedEndPosition > 0 && prod.OneBasedEndPosition <= p.Length))); }
// Load proteins from XML or FASTA databases and keep them associated with the database file name from which they came from protected List <Protein> LoadProteins(DbForDigestion database) { List <string> dbErrors = new List <string>(); List <Protein> proteinList = new List <Protein>(); string theExtension = Path.GetExtension(database.FilePath).ToLowerInvariant(); bool compressed = theExtension.EndsWith("gz"); // allows for .bgz and .tgz, too which are used on occasion theExtension = compressed ? Path.GetExtension(Path.GetFileNameWithoutExtension(database.FilePath)).ToLowerInvariant() : theExtension; if (theExtension.Equals(".fasta") || theExtension.Equals(".fa")) { proteinList = ProteinDbLoader.LoadProteinFasta(database.FilePath, true, DecoyType.None, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out dbErrors, -1); if (!proteinList.Any()) { Warn("Warning: No protein entries were found in the database"); return(new List <Protein>() { }); } else { return(proteinList); } } else { List <string> modTypesToExclude = new List <string> { }; proteinList = ProteinDbLoader.LoadProteinXML(database.FilePath, true, DecoyType.None, GlobalVariables.AllModsKnown, false, modTypesToExclude, out Dictionary <string, Modification> um, -1, 4, 1); if (!proteinList.Any()) { Warn("Warning: No protein entries were found in the database"); return(new List <Protein>() { }); } else { return(proteinList); } } }
public static List <Protein> LoadProteinDb(string fileName, bool generateTargets, DecoyType decoyType, int MaxThreadsToUse) { List <string> dbErrors = new List <string>(); List <Protein> proteinList = new List <Protein>(); string theExtension = Path.GetExtension(fileName).ToLowerInvariant(); bool compressed = theExtension.EndsWith("gz"); // allows for .bgz and .tgz, too which are used on occasion theExtension = compressed ? Path.GetExtension(Path.GetFileNameWithoutExtension(fileName)).ToLowerInvariant() : theExtension; if (theExtension.Equals(".fasta") || theExtension.Equals(".fa")) { proteinList = ProteinDbLoader.LoadProteinFasta(fileName, generateTargets, decoyType, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out dbErrors, MaxThreadsToUse); } return(proteinList.Where(p => p.BaseSequence.Length > 0).ToList()); }
public static void TestHyphenAccession() { string fastaFile = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "TestHypenAccession.fasta"); string header = @">TTLL10-203.TTLL10-202|x|x|x|x|x|TTLL10|x"; List <string> output = new List <string> { header }; output.Add("PEPTIDE"); File.WriteAllLines(fastaFile, output); var proteins = ProteinDbLoader.LoadProteinFasta(fastaFile, true, DecoyType.None, false, out var errors); Assert.That(proteins.First().Accession == "TTLL10-203.TTLL10-202"); File.Delete(fastaFile); }
public static void TestStringSanitation() { string messedUpSequence = @"PRO�EIN�"; // just test the string sanitation method alone var sanitized = ProteinDbLoader.SanitizeAminoAcidSequence(messedUpSequence, 'X'); Assert.That(sanitized == "PROXEINX"); // test reading from a fasta Protein protein = new Protein(messedUpSequence, "accession"); string fastaPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"messedUp.fasta"); ProteinDbWriter.WriteFastaDatabase(new List <Protein> { protein }, fastaPath, "|"); var fastaProteins = ProteinDbLoader.LoadProteinFasta(fastaPath, true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var a); Assert.That(fastaProteins.First(p => !p.IsDecoy).BaseSequence == "PROXEINX"); // digest and fragment to check that there isn't a crash var peptides = fastaProteins.First().Digest(new DigestionParams(), new List <Modification>(), new List <Modification>()).ToList(); foreach (PeptideWithSetModifications peptide in peptides) { List <Product> fragments = new List <Product>(); peptide.Fragment(DissociationType.HCD, FragmentationTerminus.Both, fragments); } // test reading from an XML string xmlPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"messedUp.xml"); ProteinDbWriter.WriteXmlDatabase(new Dictionary <string, HashSet <Tuple <int, Modification> > >(), new List <Protein> { protein }, xmlPath); var xmlProteins = ProteinDbLoader.LoadProteinXML(xmlPath, true, DecoyType.Reverse, new List <Modification>(), false, new List <string>(), out var unk); Assert.That(xmlProteins.First(p => !p.IsDecoy).BaseSequence == "PROXEINX"); }
public static void ProteaseModTest() { Loaders.LoadElements(); string subFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, @"DigestionTest"); Directory.CreateDirectory(subFolder); string databasePath1 = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "ProteaseModTest.fasta"); var protDic = ProteaseDictionary.LoadProteaseDictionary(Path.Combine(GlobalVariables.DataDir, @"ProteolyticDigestion", @"proteases.tsv"), GlobalVariables.ProteaseMods); DigestionParams param = new DigestionParams(protease: "CNBr", maxMissedCleavages: 0); var proteinList = ProteinDbLoader.LoadProteinFasta(databasePath1, true, DecoyType.None, false, out List <string> errors); var protein = proteinList[0]; var peptides = protein.Digest(param, new List <Modification>(), new List <Modification>()).ToList(); Assert.AreEqual(2, peptides.Count()); Assert.AreNotEqual(peptides[0].FullSequence, peptides[1].FullSequence); Assert.AreEqual(882.39707781799996, peptides[0].MonoisotopicMass); Assert.AreEqual(930.400449121, peptides[1].MonoisotopicMass); }
public static void TestDifferentHeaderStyles() { // uniprot database string fastaFile = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "uniprot_aifm1.fasta"); var proteins = ProteinDbLoader.LoadProteinFasta(fastaFile, true, DecoyType.Reverse, false, out var errors); Assert.That(proteins.Count == 2); var targetProtein = proteins.First(p => !p.IsDecoy); Assert.That(targetProtein.Accession == "Q9Z0X1"); Assert.That(targetProtein.GeneNames.Count() == 1); Assert.That(targetProtein.GeneNames.First().Item2 == "Aifm1"); Assert.That(targetProtein.FullName == "Apoptosis-inducing factor 1, mitochondrial"); Assert.That(targetProtein.Name == "AIFM1_MOUSE"); Assert.That(targetProtein.Organism == "Mus musculus"); // gencode database fastaFile = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "gencode_mmp20.fa"); proteins = ProteinDbLoader.LoadProteinFasta(fastaFile, true, DecoyType.Reverse, false, out errors); Assert.That(proteins.Count == 2); targetProtein = proteins.First(p => !p.IsDecoy); Assert.That(targetProtein.Accession == "ENSMUSP00000034487.2"); Assert.That(targetProtein.GeneNames.Count() == 1); Assert.That(targetProtein.GeneNames.First().Item2 == "Mmp20"); Assert.That(targetProtein.FullName == "Mmp20-201"); // ensembl database fastaFile = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "ensembl_prrc2a.fa"); proteins = ProteinDbLoader.LoadProteinFasta(fastaFile, true, DecoyType.Reverse, false, out errors); Assert.That(proteins.Count == 2); targetProtein = proteins.First(p => !p.IsDecoy); Assert.That(targetProtein.Accession == "ENSP00000372947.2"); Assert.That(targetProtein.GeneNames.Count() == 1); Assert.That(targetProtein.GeneNames.First().Item2 == "ENSG00000206427.11"); }
private static List <Protein> LoadProteinDb(string fileName) { List <string> dbErrors = new List <string>(); List <Protein> proteinList = new List <Protein>(); string theExtension = Path.GetExtension(fileName).ToLowerInvariant(); bool compressed = theExtension.EndsWith("gz"); // allows for .bgz and .tgz, too which are used on occasion theExtension = compressed ? Path.GetExtension(Path.GetFileNameWithoutExtension(fileName)).ToLowerInvariant() : theExtension; if (theExtension.Equals(".fasta") || theExtension.Equals(".fa")) { proteinList = ProteinDbLoader.LoadProteinFasta(fileName, true, DecoyType.None, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out dbErrors); } else { proteinList = ProteinDbLoader.LoadProteinXML(fileName, true, DecoyType.None, null, false, null, out var um); } return(proteinList.Where(p => p.BaseSequence.Length > 0).ToList()); }
public static string TransferModifications(string sourceXmlPath, string destinationXmlPath) { var uniprotPtms = ProteinAnnotation.GetUniProtMods(Environment.CurrentDirectory); var uniprot = ProteinDbLoader.LoadProteinXML(sourceXmlPath, true, DecoyType.None, uniprotPtms, false, null, out var un); string outxml = Path.Combine(Path.GetDirectoryName(destinationXmlPath), Path.GetFileNameWithoutExtension(destinationXmlPath) + ".withmods.xml"); var nonVariantProts = destinationXmlPath.EndsWith(".xml") | destinationXmlPath.EndsWith(".xml.gz") ? ProteinDbLoader.LoadProteinXML(destinationXmlPath, true, DecoyType.None, uniprotPtms, false, null, out un).Select(p => p.NonVariantProtein).Distinct() : ProteinDbLoader.LoadProteinFasta(destinationXmlPath, true, DecoyType.None, false, ProteinDbLoader.UniprotAccessionRegex, PgmNameRegex, PgmNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var un2).Select(p => p.NonVariantProtein).Distinct(); var newProts = ProteinAnnotation.CombineAndAnnotateProteins(uniprot, nonVariantProts.ToList()); ProteinDbWriter.WriteXmlDatabase(null, newProts, outxml); string outfasta = Path.Combine(Path.GetDirectoryName(destinationXmlPath), Path.GetFileNameWithoutExtension(destinationXmlPath) + ".fasta"); string outfastaWithDecoys = Path.Combine(Path.GetDirectoryName(destinationXmlPath), Path.GetFileNameWithoutExtension(destinationXmlPath) + ".withdecoys.fasta"); var prot = newProts.FirstOrDefault(p => p.Accession.Contains("_")); var protsForFasta = newProts.SelectMany(p => p.GetVariantProteins()).Where(p => !p.BaseSequence.EndsWith('?')).ToList(); var decoyProtsForFasta = ProteinDbLoader.LoadProteinXML(destinationXmlPath, true, DecoyType.Reverse, uniprotPtms, false, null, out un).Where(p => !p.BaseSequence.EndsWith('?')).ToList(); ProteinDbWriter.WriteFastaDatabase(protsForFasta, outfasta, "|"); ProteinDbWriter.WriteFastaDatabase(decoyProtsForFasta, outfastaWithDecoys, "|"); File.WriteAllLines(outfastaWithDecoys, File.ReadAllLines(outfastaWithDecoys).Select(line => line.Replace("mz|DECOY_", "rev_mz|"))); return(outxml); }
public void Test_read_Ensembl_pepAllFasta() { ModificationMotif.TryGetMotif("X", out ModificationMotif motif); var nice = new List <Modification> { new Modification("fayk", null, "mt", null, motif, "Anywhere.", null, null, null, null, null, null, null, null) }; List <Protein> ok = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"test_ensembl.pep.all.fasta"), true, DecoyType.None, false, ProteinDbLoader.EnsemblAccessionRegex, ProteinDbLoader.EnsemblFullNameRegex, ProteinDbLoader.EnsemblAccessionRegex, ProteinDbLoader.EnsemblGeneNameRegex, null, out var a); ProteinDbWriter.WriteXmlDatabase(new Dictionary <string, HashSet <Tuple <int, Modification> > >(), ok, Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"rewrite_test_ensembl.pep.all.xml")); List <Protein> ok2 = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"rewrite_test_ensembl.pep.all.xml"), true, DecoyType.None, nice, false, null, out Dictionary <string, Modification> un); Assert.AreEqual(ok.Count, ok2.Count); Assert.True(Enumerable.Range(0, ok.Count).All(i => ok[i].BaseSequence == ok2[i].BaseSequence)); Assert.AreEqual("ENSP00000381386", ok[0].Accession); Assert.AreEqual("ENSP00000215773", ok[1].Accession); Assert.AreEqual("ENSG00000099977", ok[0].GeneNames.First().Item2); Assert.AreEqual("ENSG00000099977", ok[1].GeneNames.First().Item2); Assert.AreEqual("pep:known chromosome:GRCh37:22:24313554:24316773:-1 gene:ENSG00000099977 transcript:ENST00000398344 gene_biotype:protein_coding transcript_biotype:protein_coding", ok[0].FullName); Assert.AreEqual("pep:known chromosome:GRCh37:22:24313554:24322019:-1 gene:ENSG00000099977 transcript:ENST00000350608 gene_biotype:protein_coding transcript_biotype:protein_coding", ok[1].FullName); Assert.AreEqual(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"test_ensembl.pep.all.fasta"), ok[0].DatabaseFilePath); Assert.AreEqual("ENSP00000381386", ok2[0].Accession); Assert.AreEqual("ENSP00000215773", ok2[1].Accession); Assert.AreEqual("ENSG00000099977", ok2[0].GeneNames.First().Item2); Assert.AreEqual("ENSG00000099977", ok2[1].GeneNames.First().Item2); Assert.AreEqual("pep:known chromosome:GRCh37:22:24313554:24316773:-1 gene:ENSG00000099977 transcript:ENST00000398344 gene_biotype:protein_coding transcript_biotype:protein_coding", ok2[0].FullName); Assert.AreEqual("pep:known chromosome:GRCh37:22:24313554:24322019:-1 gene:ENSG00000099977 transcript:ENST00000350608 gene_biotype:protein_coding transcript_biotype:protein_coding", ok2[1].FullName); Assert.AreEqual(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"rewrite_test_ensembl.pep.all.xml"), ok2[0].DatabaseFilePath); Assert.True(ok.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedBeginPosition == null || prod.OneBasedBeginPosition > 0 && prod.OneBasedBeginPosition <= p.Length))); Assert.True(ok.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedEndPosition == null || prod.OneBasedEndPosition > 0 && prod.OneBasedEndPosition <= p.Length))); Assert.True(ok2.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedBeginPosition == null || prod.OneBasedBeginPosition > 0 && prod.OneBasedBeginPosition <= p.Length))); Assert.True(ok2.All(p => p.ProteolysisProducts.All(prod => prod.OneBasedEndPosition == null || prod.OneBasedEndPosition > 0 && prod.OneBasedEndPosition <= p.Length))); }
public static void TestIndexEngineLowRes() { var proteinList = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"indexEngineTestFasta.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors, -1); var variableModifications = new List <Modification>(); var fixedModifications = new List <Modification>(); var localizeableModifications = new List <Modification>(); Dictionary <Modification, ushort> modsDictionary = new Dictionary <Modification, ushort>(); foreach (var mod in fixedModifications) { modsDictionary.Add(mod, 0); } int i = 1; foreach (var mod in variableModifications) { modsDictionary.Add(mod, (ushort)i); i++; } foreach (var mod in localizeableModifications) { modsDictionary.Add(mod, (ushort)i); i++; } CommonParameters CommonParameters = new CommonParameters(dissociationType: DissociationType.LowCID, maxThreadsToUsePerFile: 1, scoreCutoff: 1, digestionParams: new DigestionParams(protease: "trypsin", minPeptideLength: 1)); var engine = new IndexingEngine(proteinList, variableModifications, fixedModifications, null, 1, DecoyType.Reverse, CommonParameters, 30000, false, new List <FileInfo>(), new List <string>()); var results = (IndexingResults)engine.Run(); Assert.AreEqual(10, results.PeptideIndex.Count); var bubba = results.FragmentIndex; var tooBubba = results.PrecursorIndex; var digestedList = proteinList[0].Digest(CommonParameters.DigestionParams, new List <Modification>(), variableModifications).ToList(); digestedList.AddRange(proteinList[1].Digest(CommonParameters.DigestionParams, new List <Modification>(), variableModifications)); Assert.AreEqual(10, digestedList.Count); foreach (PeptideWithSetModifications peptide in digestedList) { Assert.Contains(peptide, results.PeptideIndex); var fragments = peptide.Fragment(CommonParameters.DissociationType, FragmentationTerminus.Both).ToList(); int positionInPeptideIndex = results.PeptideIndex.IndexOf(peptide); foreach (Product fragment in fragments.Where(f => f.ProductType == ProductType.b || f.ProductType == ProductType.y)) { // mass of the fragment double fragmentMass = Math.Round(fragment.NeutralMass / 1.0005079, 0) * 1.0005079; int integerMassRepresentation = (int)Math.Round(fragmentMass * 1000); // look up the peptides that have fragments with this mass // the result of the lookup is a list of peptide IDs that have this fragment mass List <int> fragmentBin = results.FragmentIndex[integerMassRepresentation]; // this list should contain this peptide! Assert.Contains(positionInPeptideIndex, fragmentBin); } } foreach (var fdfd in digestedList) { Assert.Contains(fdfd, results.PeptideIndex); } }
public static void TestComputePEPValueTopDown() { //just making sure that topdown data goes through the pep calculator without crashing. CommonParameters CommonParameters = new CommonParameters( digestionParams: new DigestionParams(protease: "top-down"), scoreCutoff: 1, assumeOrphanPeaksAreZ1Fragments: false); var variableModifications = new List <Modification>(); var fixedModifications = new List <Modification>(); List <Protein> proteinList = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\HeLaFakeTopDown.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors, -1); var origDataFile = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML"); MyFileManager myFileManager = new MyFileManager(true); var myMsDataFile = myFileManager.LoadFile(origDataFile, CommonParameters); var searchMode = new SinglePpmAroundZeroSearchMode(5); Tolerance DeconvolutionMassTolerance = new PpmTolerance(5); var listOfSortedms2Scans = MetaMorpheusTask.GetMs2Scans(myMsDataFile, null, new CommonParameters()).OrderBy(b => b.PrecursorMass).ToArray(); PeptideSpectralMatch[] allPsmsArray = new PeptideSpectralMatch[listOfSortedms2Scans.Length]; new ClassicSearchEngine(allPsmsArray, listOfSortedms2Scans, variableModifications, fixedModifications, null, null, null, proteinList, searchMode, CommonParameters, new List <string>()).Run(); var nonNullPsms = allPsmsArray.Where(p => p != null).ToList(); List <PeptideSpectralMatch> moreNonNullPSMs = new List <PeptideSpectralMatch>(); int reps = 3; for (int i = 0; i < reps; i++) { foreach (PeptideSpectralMatch psm in nonNullPsms) { moreNonNullPSMs.Add(psm); } } FdrAnalysisResults fdrResultsClassicDelta = (FdrAnalysisResults)(new FdrAnalysisEngine(moreNonNullPSMs.Where(p => p != null).ToList(), 1, CommonParameters, new List <string>()).Run()); var maxScore = nonNullPsms.Select(n => n.Score).Max(); PeptideSpectralMatch maxScorePsm = nonNullPsms.Where(n => n.Score == maxScore).First(); Dictionary <string, int> sequenceToPsmCount = new Dictionary <string, int>(); List <string> sequences = new List <string>(); foreach (PeptideSpectralMatch psm in nonNullPsms) { var ss = psm.BestMatchingPeptides.Select(b => b.Peptide.FullSequence).ToList(); sequences.Add(String.Join("|", ss)); } var s = sequences.GroupBy(i => i); foreach (var grp in s) { sequenceToPsmCount.Add(grp.Key, grp.Count()); } Dictionary <string, Dictionary <int, Tuple <double, double> > > fileSpecificRetTimeHI_behavior = new Dictionary <string, Dictionary <int, Tuple <double, double> > >(); Dictionary <string, Dictionary <int, Tuple <double, double> > > fileSpecificRetTemHI_behaviorModifiedPeptides = new Dictionary <string, Dictionary <int, Tuple <double, double> > >(); string[] trainingVariables = PsmData.trainingInfos["topDown"]; int chargeStateMode = 4; var(notch, pwsm) = maxScorePsm.BestMatchingPeptides.First(); var maxPsmData = PEP_Analysis.CreateOnePsmDataEntry(maxScorePsm, sequenceToPsmCount, fileSpecificRetTimeHI_behavior, fileSpecificRetTemHI_behaviorModifiedPeptides, chargeStateMode, pwsm, trainingVariables, notch, !pwsm.Protein.IsDecoy); Assert.That(maxScorePsm.PeptidesToMatchingFragments.Count - 1, Is.EqualTo(maxPsmData.Ambiguity)); Assert.That(maxScorePsm.DeltaScore, Is.EqualTo(maxPsmData.DeltaScore).Within(0.05)); Assert.That((float)(maxScorePsm.Score - (int)maxScorePsm.Score), Is.EqualTo(maxPsmData.Intensity).Within(0.05)); Assert.AreEqual(maxPsmData.HydrophobicityZScore, float.NaN); Assert.That(maxScorePsm.BestMatchingPeptides.Select(p => p.Peptide).First().MissedCleavages, Is.EqualTo(maxPsmData.MissedCleavagesCount)); Assert.That(maxScorePsm.BestMatchingPeptides.Select(p => p.Peptide).First().AllModsOneIsNterminus.Values.Count(), Is.EqualTo(maxPsmData.ModsCount)); Assert.That(maxScorePsm.Notch ?? 0, Is.EqualTo(maxPsmData.Notch)); Assert.That(maxScorePsm.PsmCount, Is.EqualTo(maxPsmData.PsmCount * reps)); Assert.That(-Math.Abs(chargeStateMode - maxScorePsm.ScanPrecursorCharge), Is.EqualTo(maxPsmData.PrecursorChargeDiffToMode)); Assert.AreEqual((float)0, maxPsmData.IsVariantPeptide); }
public static void TestRetrieveUniProtProteome() { string filepath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"DatabaseTests"); //UP000008595 is Uukuniemi virus (strain S23) (Uuk) which only has 4 proteins string returnedFilePath = ProteinDbRetriever.RetrieveProteome("UP000008595", filepath, ProteinDbRetriever.ProteomeFormat.fasta, ProteinDbRetriever.Reviewed.yes, ProteinDbRetriever.Compress.yes, ProteinDbRetriever.IncludeIsoforms.yes); filepath += "\\UP000008595_reviewed_isoform.fasta.gz"; Assert.AreEqual(filepath, returnedFilePath); Assert.IsTrue(File.Exists(filepath)); var proteinList = ProteinDbLoader.LoadProteinFasta(filepath, true, DecoyType.None, false, out var dbErrors, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex); Assert.AreEqual(4, proteinList.Count); Assert.IsTrue(proteinList.Select(p => p.Accession).ToList().Contains("P33453")); Assert.AreEqual("MLLAICSRTIRQQGLNCPPAVTFTSSHMRPPIPSFLLWTEGSDVLMDFDLDTIPAGSVTGSSIGPKFKIKTQAASSFVHDFTFAHWCDASDMPLRDHFPLVNDTFDHWTPDFISQRLDGSKVVVEFTTNRSDQEQSLISAFNTKVGKYEVALHNRSTTSSILFGVVV" + "VSETTVVTNLNLNQQEVDELCFRFLVARAVHLEMTTKMIIPEYDDEDEDKRSREVKAAFHSVQPDWNVTEANFAPFSRRMFSNFAQMEPDKEYLAHIILDSLKQAQADLDGNHYLNESLTEQARLDRNREESLNMVKDFERDFNNAAQRSAWSHKSTVPFPGVIPKVSGDTTSLSRLVEL" + "PVITGGSDATIRAWRSAYGSVSNGTVERCDEDVERERRAALCSLTVEELEESKALRMKYHRCKIDNGMMDKLDLAMQGVEAKEFKNHPSIIKKRSKSKKTFPLTADTRDIDLFLHHDDLMFNNEHSQTPPAAMIEAVKAGADAQSLHGLDKSANPWYASALWFLGLPIGLWLFMCTCIGVEL" + "SISLKQHCGRQKFIIKKLRFFDIFLLIKPTNSGSHVFYSIAFPESAILGKLHRSQCFKGLQFEDGWFWTEFSSFKMSKLTNVVKCLSTGFNLFWFWRDYYEVPFWAGNEKDFQTGKQRANKMFKFCLLMLLEDKARTEEIATLSRYVMMEGFVSPPCIPKPQKMIEKLPNLARTKFQVWLISR" + "MLQTIIRVSDYPFKITAGHKSANWTGMFNWVTGEPIESTQKLISLFYLGYLKNKEESPERNASIGMYKKILEYEDKHPGRYTYLGLGDPPSDDTRFHEYSISLLKHLCIHAEHDLRRNWGESFKAMISRDIVDAIASLDLERLATLKASSNFNEEWYQKRGDGKTYHRSKVLEKVSKYVKKSSSH" + "VHHIMEECLRKVESQGCMHVCLFKKPQHGGLREIYVLGFEERVVQLVIETIARQICKRFKSETLTNPKQKLAIPETHGLRAVKTCGIHHETVATSDDAAKWNQCHHVTKFALMLCHFTDPLFHGFIIRGCSMFMKKRIMIDQSLIDIIDSHTTLETSDAYLQKIHRGYHGSLDDQPRWISRGGAFVQ" + "TETGMMQGILHYTSSLLHTLLQEWLRTFSQRFIRTRVSVDQRPDVLVDVLQSSDDSGMMISFPSTDKGATGKYRYLSALIFKYKKVIGKYLGIYSSVKSTNNTLHLLEFNSEFFFHINHNRPLLRWITACDTISEQESLASRQEEMYNNLTSVLEGGGSFSLVSFCQFGQLLLHYTLLGMTVSPLFLEY" + "IKLVSEIKDPSLGYFLMDHPFGSGLSGFKYNVWVAVQNSILGSRYRSLLEAIQNSDSAAPKKTLDTTTSGTFVQSTIIRFGDRKKWQRLVDRLNLPEDWLDVIDKNPEIVYRRPRDGFEVSLRIAEKVHSPGVSNSLSKGNCIIRVISSSVYILSRSILSDGLAWLYDEEEEVKRPLLYKVMNQPELDLHSRLTPA" + "QLSTLFPMMAEFEKLQTHLRSYMKIEGEFISKKKVITQTRVNILETERFLRARPEDLIADKWFGFTRTRMTPRTFKEEWENLTSVFPWLTGNPSETLELSPFQHHVQLRNFFSRLDLKGRDIRIIGAPIKKSSGVSNVSTAIRDNFFPRFVLTHIPDEAAMERIEAAGILKHALFLTVTGPYTDQSKLDMCRDF" + "ITSSEPITLKPNHGKTRTNVLSLFQDYFSKRGPDIIFNRIQMANCGVIGGFTSPQKPKEVDGKIVYTGDGVWRGIVDGFQIQLVITYMPKQKSNELKSITVNSDRCISALSSFCQSWCKEMGVFNTEDFSKTQRFSKASFFMHKFKISGSKQTLGAPIFIVSEKIFRPICWDPSKLEFRVRGNTLNLTY" + "KEVNPGAGQRMFNILSYTVKDTDVSDENAFKLMSLSPRHKFHGREPSTSWICMRALPISTIDKLLERILNRERISGSIDNERLAECFKNVMESTLRRKGVFLSEFSRATQKMLDGLSRDMLDFFAEAGLNDDLLLEEEPWLSGLDTFMLDDEAYLEEYNLGPFGVFSVEQEMNTKYYHHLLLD" + "SLVEDVIQKLSLDGLRKLFQEEEAPLEYKKEVIRLLNILQRDASQIKWKSRDLLSENMGLDVDDDMFG", proteinList.Where(p => p.Accession == "P33453").FirstOrDefault().BaseSequence); File.Delete(filepath); //fasta; unreviewed; non-compressed; no isoforms filepath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"DatabaseTests"); returnedFilePath = ProteinDbRetriever.RetrieveProteome("UP000008595", filepath, ProteinDbRetriever.ProteomeFormat.fasta, ProteinDbRetriever.Reviewed.no, ProteinDbRetriever.Compress.no, ProteinDbRetriever.IncludeIsoforms.no); filepath += "\\UP000008595_unreviewed.fasta"; Assert.AreEqual(filepath, returnedFilePath); Assert.IsTrue(File.Exists(filepath)); File.Delete(filepath); //xml; reviewed; compresseded; no isoforms filepath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"DatabaseTests"); returnedFilePath = ProteinDbRetriever.RetrieveProteome("UP000008595", filepath, ProteinDbRetriever.ProteomeFormat.xml, ProteinDbRetriever.Reviewed.yes, ProteinDbRetriever.Compress.yes, ProteinDbRetriever.IncludeIsoforms.no); filepath += "\\UP000008595_reviewed.xml.gz"; Assert.AreEqual(filepath, returnedFilePath); Assert.IsTrue(File.Exists(filepath)); File.Delete(filepath); //xml; unreviewed; non-compresseded; no isoforms filepath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"DatabaseTests"); returnedFilePath = ProteinDbRetriever.RetrieveProteome("UP000008595", filepath, ProteinDbRetriever.ProteomeFormat.xml, ProteinDbRetriever.Reviewed.no, ProteinDbRetriever.Compress.no, ProteinDbRetriever.IncludeIsoforms.no); filepath += "\\UP000008595_unreviewed.xml"; Assert.AreEqual(filepath, returnedFilePath); Assert.IsTrue(File.Exists(filepath)); File.Delete(filepath); //junk null return filepath = "pathDoesNotExists"; returnedFilePath = ProteinDbRetriever.RetrieveProteome("UP000008595", filepath, ProteinDbRetriever.ProteomeFormat.xml, ProteinDbRetriever.Reviewed.no, ProteinDbRetriever.Compress.no, ProteinDbRetriever.IncludeIsoforms.no); filepath += "\\UP000008595_unreviewed.xml"; Assert.IsNull(returnedFilePath); //we don't support filetypes other than fasta or xml currently //requesting gff or other file formats will return null for now. filepath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"DatabaseTests"); returnedFilePath = ProteinDbRetriever.RetrieveProteome("UP000008595", filepath, ProteinDbRetriever.ProteomeFormat.gff, ProteinDbRetriever.Reviewed.no, ProteinDbRetriever.Compress.no, ProteinDbRetriever.IncludeIsoforms.no); filepath += "\\UP000008595_unreviewed.xml"; Assert.IsNull(returnedFilePath); }
public static void Load_fasta_handle_tooHigh_indices() { ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"bad.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_fullName_expression, ProteinDbLoader.uniprot_accession_expression, ProteinDbLoader.uniprot_gene_expression); }
public static void TestComputePEPValue() { var variableModifications = new List <Modification>(); var fixedModifications = new List <Modification>(); var origDataFile = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML"); MyFileManager myFileManager = new MyFileManager(true); CommonParameters CommonParameters = new CommonParameters(digestionParams: new DigestionParams()); var myMsDataFile = myFileManager.LoadFile(origDataFile, CommonParameters); var searchModes = new SinglePpmAroundZeroSearchMode(5); List <Protein> proteinList = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\hela_snip_for_unitTest.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors, -1); var listOfSortedms2Scans = MetaMorpheusTask.GetMs2Scans(myMsDataFile, null, CommonParameters).OrderBy(b => b.PrecursorMass).ToArray(); PeptideSpectralMatch[] allPsmsArray = new PeptideSpectralMatch[listOfSortedms2Scans.Length]; new ClassicSearchEngine(allPsmsArray, listOfSortedms2Scans, variableModifications, fixedModifications, null, proteinList, searchModes, CommonParameters, new List <string>()).Run(); FdrAnalysisResults fdrResultsClassicDelta = (FdrAnalysisResults)(new FdrAnalysisEngine(allPsmsArray.Where(p => p != null).ToList(), 1, CommonParameters, new List <string>()).Run()); var nonNullPsms = allPsmsArray.Where(p => p != null).ToList(); var nonNullPsmsOriginalCopy = allPsmsArray.Where(p => p != null).ToList(); var accessionCounts = PEP_Analysis.GetAccessionCounts(nonNullPsms); var maxScore = nonNullPsms.Select(n => n.Score).Max(); var maxScorePsm = nonNullPsms.Where(n => n.Score == maxScore).First(); Dictionary <string, int> sequenceToPsmCount = new Dictionary <string, int>(); List <string> sequences = new List <string>(); foreach (PeptideSpectralMatch psm in nonNullPsms) { var ss = psm.BestMatchingPeptides.Select(b => b.Peptide.FullSequence).ToList(); sequences.Add(String.Join("|", ss)); } var s = sequences.GroupBy(i => i); foreach (var grp in s) { sequenceToPsmCount.Add(grp.Key, grp.Count()); } var maxPsmData = PEP_Analysis.CreateOnePsmDataFromPsm(maxScorePsm, accessionCounts, sequenceToPsmCount); Assert.That(maxScorePsm.PeptidesToMatchingFragments.Count, Is.EqualTo(maxPsmData.Ambiguity)); Assert.That(maxScorePsm.DeltaScore, Is.EqualTo(maxPsmData.DeltaScore).Within(0.05)); Assert.That((float)(maxScorePsm.Score - (int)maxScorePsm.Score), Is.EqualTo(maxPsmData.Intensity).Within(0.05)); Assert.That(maxScorePsm.BestMatchingPeptides.Select(p => p.Peptide).First().MissedCleavages, Is.EqualTo(maxPsmData.MissedCleavagesCount)); Assert.That(maxScorePsm.BestMatchingPeptides.Select(p => p.Peptide).First().AllModsOneIsNterminus.Values.Count(), Is.EqualTo(maxPsmData.ModsCount)); Assert.That(maxScorePsm.Notch ?? 0, Is.EqualTo(maxPsmData.Notch)); Assert.That(maxScorePsm.PsmCount, Is.EqualTo(maxPsmData.PsmCount)); Assert.That(maxScorePsm.ScanPrecursorCharge, Is.EqualTo(maxPsmData.ScanPrecursorCharge)); PEP_Analysis.ComputePEPValuesForAllPSMsGeneric(nonNullPsms); int trueCount = 0; foreach (var item in allPsmsArray.Where(p => p != null)) { var b = item.FdrInfo.PEP; if (b >= 0.5) { trueCount++; } } Assert.GreaterOrEqual(32, trueCount); }
public static void Load_fasta_handle_tooHigh_indices() { var p = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"bad.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var a); }