/// <summary>
/// Builds a gene model from the chromosome 14 FASTA and HardReverseStrand/reverse.gff3, then checks
/// both the retrieved coding sequence (against HardReverseStrand/codingSeq.fa) and the translated
/// protein sequence of the first transcript against expected values.
/// </summary>
public void TranslateHardReverseStrand()
{
    string testDirectory = TestContext.CurrentContext.TestDirectory;
    string genomePath = Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.14.fa");
    string geneModelPath = Path.Combine(testDirectory, "HardReverseStrand", "reverse.gff3");
    string codingSequencePath = Path.Combine(testDirectory, "HardReverseStrand", "codingSeq.fa");

    // Expected translation, kept in 60-residue chunks for readability.
    string expectedProtein =
        "MNLQAQPKAQNKRKRCLFGGQEPAPKEQPPPLQPPQQSIRVKEEQYLGHEGPGGAVSTSQ" +
        "PVELPPPSSLALLNSVVYGPERTSAAMLSQQVASVKWPNSVMAPGRGPERGGGGGVSDSS" +
        "WQQQPGQPPPHSTWNCHSLSLYSATKGSPHPGVGVPTYYNHPEALKREKAGGPQLDRYVR" +
        "PMMPQKVQLEVGRPQAPLNSFHAAKKPPNQSLPLQPFQLAFGHQVNRQVFRQGPPPPNPV" +
        "AAFPPQKQQQQQQPQQQQQQQQAALPQMPLFENFYSMPQQPSQQPQDFGLQPAGPLGQSH" +
        "LAHHSMAPYPFPPNPDMNPELRKALLQDSAPQPALPQVQIPFPRRSRRLSKEGILPPSAL" +
        "DGAGTQPGQEATGNLFLHHWPLQQPPPGSLGQPHPEALGFPLELRESQLLPDGERLAPNG" +
        "REREAPAMGSEEGMRAVSTGDCGQVLRGGVIQSTRRRRRASQEANLLTLAQKAVELASLQ" +
        "NAKDGSGSEEKRKSVLASTTKCGVEFSEPSLATKRAREDSGMVPLIIPVSVPVRTVDPTE" +
        "AAQAGGLDEDGKGPEQNPAEHKPSVIVTRRRSTRIPGTDAQAQAEDMNVKLEGEPSVRKP" +
        "KQRPRPEPLIIPTKAGTFIAPPVYSNITPYQSHLRSPVRLADHPSERSFELPPYTPPPIL" +
        "SPVREGSGLYFNAIISTSTIPAPPPITPKSAHRTLLRTNSAEVTPPVLSVMGEATPVSIE" +
        "PRINVGSRFQAEIPLMRDRALAAADPHKADLVWQPWEDLESSREKQRQVEDLLTAACSSI" +
        "FPGAGTNQELALHCLHESRGDILETLNKLLLKKPLRPHNHPLATYHYTGSDQWKMAERKL" +
        "FNKGIAIYKKDFFLVQKLIQTKTVAQCVEFYYTYKKQVKIGRNGTLTFGDVDTSDEKSAQ" +
        "EEVEVDIKTSQKFPRVPLPRRESPSEERLEPKREVKEPRKEGEEEVPEIQEKEEQEEGRE" +
        "RSRRAAAVKATQTLQANESASDILILRSHESNAPGSAGGQASEKPREGTGKSRRALPFSE" +
        "KKKKTETFSKTQNQENTFPCKKCGR";

    Genome referenceGenome = new Genome(genomePath);
    GeneModel model = new GeneModel(referenceGenome, geneModelPath);
    List<Protein> translated = model.Translate(true).ToList();
    ISequence referenceCoding = new FastAParser().Parse(codingSequencePath).First();

    // The coding sequence read from the fixture must match what the first transcript reports.
    string expectedCoding = SequenceExtensions.ConvertToString(referenceCoding);
    string actualCoding = SequenceExtensions.ConvertToString(model.Genes[0].Transcripts[0].RetrieveCodingSequence());
    Assert.AreEqual(expectedCoding, actualCoding);

    Assert.AreEqual(expectedProtein, translated[0].BaseSequence);
}
/// <summary>
/// Translates the single transcript described by TestData/chr1_one_transcript_reverse.gtf
/// against TestData/chr1_sample.fa and checks the first protein's sequence.
/// </summary>
public void TranslateReverseStrand()
{
    string dataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData");
    string genomePath = Path.Combine(dataDirectory, "chr1_sample.fa");
    string geneModelPath = Path.Combine(dataDirectory, "chr1_one_transcript_reverse.gtf");
    string expectedProtein = "FFYFIIWSLTLLPRAGLELLTSSDPPASASQSVGITGVSHHAQ";

    Genome referenceGenome = new Genome(genomePath);
    GeneModel model = new GeneModel(referenceGenome, geneModelPath);
    List<Protein> translated = model.Translate(true).ToList();

    Assert.AreEqual(expectedProtein, translated[0].BaseSequence);
}
/// <summary>
/// Translates the single transcript described by TestData/chr7_one_transcript_reverse.gtf
/// against the chromosome 7 FASTA and checks the first protein's sequence.
/// </summary>
public void TranslateAnotherReverseStrand()
{
    // See http://useast.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000233306;r=7:38362864-38363518;t=ENST00000426402
    string testDirectory = TestContext.CurrentContext.TestDirectory;
    string genomePath = Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.7.fa");
    string geneModelPath = Path.Combine(testDirectory, "TestData", "chr7_one_transcript_reverse.gtf");
    string expectedProtein = "MQWALAVLLAFLSPASQKSSNLEGRTKSVIRQTGSSAEITCDLAEGSNGYIHWYLHQEGKAPQRLQYYDSYNSKVVLESGVSPGKYYTYASTRNNLRLILRNLIENDFGVYYCATWDG";

    Genome referenceGenome = new Genome(genomePath);
    GeneModel model = new GeneModel(referenceGenome, geneModelPath);
    List<Protein> translated = model.Translate(true).ToList();

    Assert.AreEqual(expectedProtein, translated[0].BaseSequence);
}
/// <summary>
/// Translates TestData/chr5_selenocysteineContaining.gff3 against the chromosome 5 FASTA, passing
/// the accession collections obtained from Homo_sapiens.GRCh38.pep.all.fa to the translation,
/// and checks the first protein's sequence (which contains 'U' residues).
/// </summary>
public void TranslateSelenocysteineContaining()
{
    string testDirectory = TestContext.CurrentContext.TestDirectory;
    string genomePath = Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.5.fa");
    string geneModelPath = Path.Combine(testDirectory, "TestData", "chr5_selenocysteineContaining.gff3");
    string proteinFastaPath = Path.Combine(testDirectory, "Homo_sapiens.GRCh38.pep.all.fa");
    string expectedProtein = "MWRSLGLALALCLLPSGGTESQDQSSLCKQPPAWSIRDQDPMLNSNGSVTVVALLQASUYLCILQASKLEDLRVKLKKEGYSNISYIVVNHQGISSRLKYTHLKNKVSEHIPVYQQEENQTDVWTLLNGSKDDFLIYDRCGRLVYHLGLPFSFLTFPYVEEAIKIAYCEKKCGNCSLTTLKDEDFCKRVSLATVDKTVETPSPHYHHEHHHNHGHQHLGSSELSENQQPGAPNAPTHPAPPGLHHHHKHKGQHRQGHPENRDMPASEDLQDLQKKLCRKRCINQLLCKLPTDSELAPRSUCCHCRHLIFEKTGSAITUQCKENLPSLCSUQGLRAEENITESCQURLPPAAUQISQQLIPTEASASURUKNQAKKUEUPSN";

    Genome referenceGenome = new Genome(genomePath);
    GeneModel model = new GeneModel(referenceGenome, geneModelPath);

    // Only the second and third outputs are consumed by the translation below.
    GeneModel.GetImportantProteinAccessions(
        proteinFastaPath,
        out Dictionary<string, string> accessionToSequence,
        out HashSet<string> badAccessions,
        out Dictionary<string, string> se);

    List<Protein> translated = model.Translate(true, badAccessions, se).ToList();

    Assert.AreEqual(expectedProtein, translated[0].BaseSequence);
}
/// <summary>
/// Translates two single-transcript gene models (TestData/chrM_one_transcript_reverse.gtf and
/// TestData/chrM_one_transcript_reverse2.gtf) against the MT chromosome FASTA and checks the
/// first protein produced by each.
/// </summary>
public void TranslateMTSeq()
{
    string testDirectory = TestContext.CurrentContext.TestDirectory;
    string genomePath = Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.MT.fa");
    string firstModelPath = Path.Combine(testDirectory, "TestData", "chrM_one_transcript_reverse.gtf");
    string secondModelPath = Path.Combine(testDirectory, "TestData", "chrM_one_transcript_reverse2.gtf");

    string firstExpected =
        "MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLLQPFADAMKLFTKEPLKPATSTITLYITAPTLALTIALLLWTPLPMPNPLVNLNLGLLFILATSSLAVYSILWSGWASNSNYALIGALRAVAQTISYE" +
        "VTLAIILLSTLLMSGSFNLSTLITTQEHLWLLLPSWPLAMMWFISTLAETNRTPFDLAEGESELVSGFNIEYAAGPFALFFMAEYTNIIMMNTLTTTIFLGTTYDALSPELYTTYFVTKTLLLTSLFLWIRTAYPRFRYDQLMHLLWKNFLPLTLALLMWYVSMPITISSIPPQT";
    string secondExpected =
        "MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVLTKKMNPRSTEAAIKYFLTQATASMILLMAILFNNMLSGQWTMTNTTNQYSSLMIMMAMAMKLGMAPFHFWVPEVTQGTPLTSGLLLLTWQKLAPISIMYQISPS" +
        "LNVSLLLTLSILSIMAGSWGGLNQTQLRKILAYSSITHMGWMMAVLPYNPNMTILNLTIYIILTTTAFLLLNLNSSTTTLLLSRTWNKLTWLTPLIPSTLLSLGGLPPLTGFLPKWAIIEEFTKNNSLIIPTIMATITLLNLYFYLRLIYSTSITLLPMSNNVKM" +
        "KWQFEHTKPTPFLPTLIALTTLLLPISPFMLMIL";

    // Both gene models are read against the same genome instance.
    Genome referenceGenome = new Genome(genomePath);

    GeneModel firstModel = new GeneModel(referenceGenome, firstModelPath);
    List<Protein> firstTranslated = firstModel.Translate(true).ToList();
    Assert.AreEqual(firstExpected, firstTranslated[0].BaseSequence);

    GeneModel secondModel = new GeneModel(referenceGenome, secondModelPath);
    List<Protein> secondTranslated = secondModel.Translate(true).ToList();
    Assert.AreEqual(secondExpected, secondTranslated[0].BaseSequence);
}
/// <summary>
/// Times and checks that all proteins in the pep.all.fasta protein fasta file are the same as are output by this library
/// </summary>
/// <remarks>
/// Downloads the Ensembl release-81 genome, gene model and protein files into the current
/// directory when any decompressed copy is missing or empty, decompresses them, translates the
/// gene model and compares each translated sequence with the sequence of the same accession in
/// the protein fasta. The result is written to the console, the method waits for a key press,
/// and the reference files are then deleted. The reported time covers everything up to (but not
/// including) the comparison loop.
/// </remarks>
private static void SameProteins()
{
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();

    // download and decompress references
    string genomeFasta = "Homo_sapiens.GRCh38.dna.primary_assembly.fa";
    string geneModelFile = "Homo_sapiens.GRCh38.81.gff3";
    string proteinFasta = "Homo_sapiens.GRCh38.pep.all.fa";
    string[] gunzippedFiles = new[] { genomeFasta, geneModelFile, proteinFasta };

    // A reference file needs (re)creating when it does not exist or has no content.
    bool IsMissingOrEmpty(string path)
    {
        return !File.Exists(path) || new FileInfo(path).Length == 0;
    }

    // Deletes every file in the current directory that shares the reference file's base name
    // (this removes both the decompressed file and its .gz archive).
    void DeleteFilesSharingBaseName(string referenceFile)
    {
        string pattern = Path.GetFileNameWithoutExtension(referenceFile) + "*";
        foreach (string file in Directory.GetFiles(Environment.CurrentDirectory, pattern))
        {
            File.Delete(file);
        }
    }

    // If any decompressed file is unusable, all three archives are downloaded again.
    if (gunzippedFiles.Any(IsMissingOrEmpty))
    {
        using (WebClient client = new WebClient())
        {
            client.DownloadFile(@"ftp://ftp.ensembl.org/pub/release-81//fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz", "Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz");
            client.DownloadFile(@"ftp://ftp.ensembl.org/pub/release-81/gff3/homo_sapiens/Homo_sapiens.GRCh38.81.gff3.gz", "Homo_sapiens.GRCh38.81.gff3.gz");
            client.DownloadFile(@"ftp://ftp.ensembl.org/pub/release-81//fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz", "Homo_sapiens.GRCh38.pep.all.fa.gz");
        }
    }

    // Only the files that are actually missing or empty are decompressed.
    foreach (string gunzippedFile in gunzippedFiles)
    {
        if (IsMissingOrEmpty(gunzippedFile))
        {
            using (FileStream stream = new FileStream(gunzippedFile + ".gz", FileMode.Open))
            using (GZipStream gunzip = new GZipStream(stream, CompressionMode.Decompress))
            using (FileStream output = File.Create(gunzippedFile))
            {
                gunzip.CopyTo(output);
            }
        }
    }

    GeneModel.GetImportantProteinAccessions(proteinFasta, out Dictionary<string, string> proteinAccessionSequence, out HashSet<string> bad, out Dictionary<string, string> se);
    Genome genome = new Genome(genomeFasta);
    GeneModel geneModel = new GeneModel(genome, geneModelFile);
    List<Protein> geneBasedProteins = geneModel.Translate(true, bad, se);

    // NOTE(review): EnsemblFullNameRegex is passed for two consecutive regex parameters -
    // confirm against the LoadProteinFasta signature that this is intended.
    List<Protein> pepDotAll = ProteinDbLoader.LoadProteinFasta(proteinFasta, true, DecoyType.None, false,
        ProteinDbLoader.EnsemblAccessionRegex, ProteinDbLoader.EnsemblFullNameRegex, ProteinDbLoader.EnsemblFullNameRegex,
        ProteinDbLoader.EnsemblGeneNameRegex, null, out List<string> errors);
    Dictionary<string, string> accSeq = geneBasedProteins.ToDictionary(p => p.Accession, p => p.BaseSequence);

    // The stopwatch is stopped exactly once, here; a second Stop() after the loop would be a no-op.
    stopwatch.Stop();

    // Proteins whose accession was not produced by the translation are skipped. Sequences
    // containing '*' or 'X' are no longer filtered here; that is handled by the bad accessions.
    bool allAreEqual = true;
    foreach (Protein reference in pepDotAll)
    {
        if (accSeq.TryGetValue(reference.Accession, out string translatedSequence)
            && reference.BaseSequence != translatedSequence)
        {
            allAreEqual = false;
            break;
        }
    }

    // TimeSpan.Minutes is only the 0-59 component, so whole total minutes are reported instead
    // to avoid silently dropping hours on long runs.
    TimeSpan elapsed = stopwatch.Elapsed;
    int wholeMinutes = (int)elapsed.TotalMinutes;

    Console.WriteLine("Finished checking that all proteins are the same.");
    Console.WriteLine("Time elapsed: " + wholeMinutes.ToString() + " minutes and " + elapsed.Seconds.ToString() + " seconds.");
    Console.WriteLine("Result: all proteins are " + (allAreEqual ? "" : "not ") + "equal ");
    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();

    DeleteFilesSharingBaseName(genomeFasta);
    DeleteFilesSharingBaseName(geneModelFile);
    DeleteFilesSharingBaseName(proteinFasta);
}