public void TranslateSelenocysteineContaining()
{
    // Translates the gene model described by chr5_selenocysteineContaining.gff3 against the
    // chromosome 5 reference and checks the base sequence of the first protein produced.
    // The expected sequence contains 'U' residues (the one-letter code for selenocysteine).
    const string expectedSequence = "MWRSLGLALALCLLPSGGTESQDQSSLCKQPPAWSIRDQDPMLNSNGSVTVVALLQASUYLCILQASKLEDLRVKLKKEGYSNISYIVVNHQGISSRLKYTHLKNKVSEHIPVYQQEENQTDVWTLLNGSKDDFLIYDRCGRLVYHLGLPFSFLTFPYVEEAIKIAYCEKKCGNCSLTTLKDEDFCKRVSLATVDKTVETPSPHYHHEHHHNHGHQHLGSSELSENQQPGAPNAPTHPAPPGLHHHHKHKGQHRQGHPENRDMPASEDLQDLQKKLCRKRCINQLLCKLPTDSELAPRSUCCHCRHLIFEKTGSAITUQCKENLPSLCSUQGLRAEENITESCQURLPPAAUQISQQLIPTEASASURUKNQAKKUEUPSN";

    // All input files are resolved relative to the test run directory.
    string testDirectory = TestContext.CurrentContext.TestDirectory;
    string chromosomeFastaPath = Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.5.fa");
    string geneModelPath = Path.Combine(testDirectory, "TestData", "chr5_selenocysteineContaining.gff3");
    string proteinFastaPath = Path.Combine(testDirectory, "Homo_sapiens.GRCh38.pep.all.fa");

    Genome chromosome = new Genome(chromosomeFastaPath);
    GeneModel model = new GeneModel(chromosome, geneModelPath);

    // Only the bad-accession set and the third dictionary are handed to Translate;
    // the first dictionary is not needed by this test.
    GeneModel.GetImportantProteinAccessions(
        proteinFastaPath,
        out Dictionary<string, string> _,
        out HashSet<string> badAccessions,
        out Dictionary<string, string> selenocysteineContaining);

    List<Protein> translated = model.Translate(true, badAccessions, selenocysteineContaining).ToList();

    Assert.AreEqual(expectedSequence, translated[0].BaseSequence);
}
/// <summary>
/// Times and checks that all proteins in the pep.all.fasta protein fasta file are the same as are output by this library
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. If any of the three decompressed reference files is missing or empty in the working directory,
///    all three gzipped Ensembl release 81 files are downloaded over FTP.
/// 2. Each missing or empty reference file is decompressed from its ".gz" counterpart.
/// 3. The gene model is translated and every protein from the protein fasta whose accession was also
///    produced by the translation is compared by base sequence; the comparison stops at the first mismatch.
/// 4. The elapsed time and the result are written to the console, and the method waits for a key press.
/// 5. Every file in the current directory whose name starts with one of the reference file names
///    (without its last extension) is deleted.
/// </remarks>
private static void SameProteins()
{
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();

    // download and decompress references
    string genomeFasta = "Homo_sapiens.GRCh38.dna.primary_assembly.fa";
    string geneModelFile = "Homo_sapiens.GRCh38.81.gff3";
    string proteinFasta = "Homo_sapiens.GRCh38.pep.all.fa";
    string[] gunzippedFiles = new[] { genomeFasta, geneModelFile, proteinFasta };

    // Download locations, in the same order as gunzippedFiles.
    string[] downloadUrls = new[]
    {
        @"ftp://ftp.ensembl.org/pub/release-81//fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
        @"ftp://ftp.ensembl.org/pub/release-81/gff3/homo_sapiens/Homo_sapiens.GRCh38.81.gff3.gz",
        @"ftp://ftp.ensembl.org/pub/release-81//fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz",
    };

    // A reference file needs (re)creating when it does not exist or has no content.
    bool IsMissingOrEmpty(string path) => !File.Exists(path) || new FileInfo(path).Length == 0;

    // A single missing reference triggers a download of all three archives.
    if (gunzippedFiles.Any(IsMissingOrEmpty))
    {
        using (WebClient client = new WebClient())
        {
            for (int i = 0; i < gunzippedFiles.Length; i++)
            {
                client.DownloadFile(downloadUrls[i], gunzippedFiles[i] + ".gz");
            }
        }
    }

    foreach (string gunzippedFile in gunzippedFiles)
    {
        if (!IsMissingOrEmpty(gunzippedFile))
        {
            continue;
        }

        using (FileStream compressed = new FileStream(gunzippedFile + ".gz", FileMode.Open))
        using (GZipStream gunzip = new GZipStream(compressed, CompressionMode.Decompress))
        using (FileStream decompressed = File.Create(gunzippedFile))
        {
            gunzip.CopyTo(decompressed);
        }
    }

    // The accession-to-sequence dictionary returned first is not needed here.
    GeneModel.GetImportantProteinAccessions(
        proteinFasta,
        out Dictionary<string, string> _,
        out HashSet<string> bad,
        out Dictionary<string, string> se);
    Genome genome = new Genome(genomeFasta);
    GeneModel geneModel = new GeneModel(genome, geneModelFile);
    List<Protein> geneBasedProteins = geneModel.Translate(true, bad, se);
    List<Protein> pepDotAll = ProteinDbLoader.LoadProteinFasta(
        proteinFasta,
        true,
        DecoyType.None,
        false,
        ProteinDbLoader.EnsemblAccessionRegex,
        ProteinDbLoader.EnsemblFullNameRegex,
        ProteinDbLoader.EnsemblFullNameRegex,
        ProteinDbLoader.EnsemblGeneNameRegex,
        null,
        out List<string> _);
    Dictionary<string, string> accSeq = geneBasedProteins.ToDictionary(protein => protein.Accession, protein => protein.BaseSequence);

    // Per the original note, sequences containing '*' or 'X' used to be filtered out here;
    // that is now handled with the bad accessions passed to Translate.
    // Proteins whose accession was not produced by the translation are skipped.
    bool allAreEqual = true;
    foreach (Protein reference in pepDotAll)
    {
        if (accSeq.TryGetValue(reference.Accession, out string translatedSequence)
            && reference.BaseSequence != translatedSequence)
        {
            allAreEqual = false;
            break;
        }
    }

    // Stop the timer once, after the comparison, so the reported time covers the check itself.
    stopwatch.Stop();
    TimeSpan elapsed = stopwatch.Elapsed;

    // TotalMinutes rather than Minutes: Minutes is only the 0-59 component and would drop whole hours.
    int wholeMinutes = (int)elapsed.TotalMinutes;

    Console.WriteLine("Finished checking that all proteins are the same.");
    Console.WriteLine("Time elapsed: " + wholeMinutes.ToString() + " minutes and " + elapsed.Seconds.ToString() + " seconds.");
    Console.WriteLine("Result: all proteins are " + (allAreEqual ? "" : "not ") + "equal ");
    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();

    // Remove both the decompressed references and their ".gz" archives.
    foreach (string referenceFile in gunzippedFiles)
    {
        string searchPattern = Path.GetFileNameWithoutExtension(referenceFile) + "*";
        foreach (string file in Directory.GetFiles(Environment.CurrentDirectory, searchPattern))
        {
            File.Delete(file);
        }
    }
}