Example #1
0
        /// <summary>
        /// Builds a gene model from HardReverseStrand/reverse.gff3 over the chromosome 14 FASTA file and checks two things:
        /// the coding sequence retrieved for the first transcript of the first gene matches HardReverseStrand/codingSeq.fa,
        /// and the first translated protein has the expected amino acid sequence.
        /// </summary>
        public void TranslateHardReverseStrand()
        {
            const string expectedProteinSequence =
                "MNLQAQPKAQNKRKRCLFGGQEPAPKEQPPPLQPPQQSIRVKEEQYLGHEGPGGAVSTSQ" +
                "PVELPPPSSLALLNSVVYGPERTSAAMLSQQVASVKWPNSVMAPGRGPERGGGGGVSDSS" +
                "WQQQPGQPPPHSTWNCHSLSLYSATKGSPHPGVGVPTYYNHPEALKREKAGGPQLDRYVR" +
                "PMMPQKVQLEVGRPQAPLNSFHAAKKPPNQSLPLQPFQLAFGHQVNRQVFRQGPPPPNPV" +
                "AAFPPQKQQQQQQPQQQQQQQQAALPQMPLFENFYSMPQQPSQQPQDFGLQPAGPLGQSH" +
                "LAHHSMAPYPFPPNPDMNPELRKALLQDSAPQPALPQVQIPFPRRSRRLSKEGILPPSAL" +
                "DGAGTQPGQEATGNLFLHHWPLQQPPPGSLGQPHPEALGFPLELRESQLLPDGERLAPNG" +
                "REREAPAMGSEEGMRAVSTGDCGQVLRGGVIQSTRRRRRASQEANLLTLAQKAVELASLQ" +
                "NAKDGSGSEEKRKSVLASTTKCGVEFSEPSLATKRAREDSGMVPLIIPVSVPVRTVDPTE" +
                "AAQAGGLDEDGKGPEQNPAEHKPSVIVTRRRSTRIPGTDAQAQAEDMNVKLEGEPSVRKP" +
                "KQRPRPEPLIIPTKAGTFIAPPVYSNITPYQSHLRSPVRLADHPSERSFELPPYTPPPIL" +
                "SPVREGSGLYFNAIISTSTIPAPPPITPKSAHRTLLRTNSAEVTPPVLSVMGEATPVSIE" +
                "PRINVGSRFQAEIPLMRDRALAAADPHKADLVWQPWEDLESSREKQRQVEDLLTAACSSI" +
                "FPGAGTNQELALHCLHESRGDILETLNKLLLKKPLRPHNHPLATYHYTGSDQWKMAERKL" +
                "FNKGIAIYKKDFFLVQKLIQTKTVAQCVEFYYTYKKQVKIGRNGTLTFGDVDTSDEKSAQ" +
                "EEVEVDIKTSQKFPRVPLPRRESPSEERLEPKREVKEPRKEGEEEVPEIQEKEEQEEGRE" +
                "RSRRAAAVKATQTLQANESASDILILRSHESNAPGSAGGQASEKPREGTGKSRRALPFSE" +
                "KKKKTETFSKTQNQENTFPCKKCGR";

            // Resolve every input relative to the test directory up front.
            string testDirectory      = TestContext.CurrentContext.TestDirectory;
            string genomePath         = Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.14.fa");
            string annotationPath     = Path.Combine(testDirectory, "HardReverseStrand", "reverse.gff3");
            string codingSequencePath = Path.Combine(testDirectory, "HardReverseStrand", "codingSeq.fa");

            var       chromosome     = new Genome(genomePath);
            var       model          = new GeneModel(chromosome, annotationPath);
            var       translated     = model.Translate(true).ToList();
            ISequence expectedCoding = new FastAParser().Parse(codingSequencePath).First();

            // The coding sequence of the first transcript must equal the reference FASTA entry.
            string expectedCodingText = SequenceExtensions.ConvertToString(expectedCoding);
            string actualCodingText   = SequenceExtensions.ConvertToString(model.Genes[0].Transcripts[0].RetrieveCodingSequence());
            Assert.AreEqual(expectedCodingText, actualCodingText);

            Assert.AreEqual(expectedProteinSequence, translated[0].BaseSequence);
        }
Example #2
0
        /// <summary>
        /// Builds a gene model from TestData/chr1_one_transcript_reverse.gtf over TestData/chr1_sample.fa, translates it,
        /// and checks the sequence of the first resulting protein.
        /// </summary>
        public void TranslateReverseStrand()
        {
            const string expectedSequence = "FFYFIIWSLTLLPRAGLELLTSSDPPASASQSVGITGVSHHAQ";

            string testDirectory  = TestContext.CurrentContext.TestDirectory;
            string genomePath     = Path.Combine(testDirectory, "TestData", "chr1_sample.fa");
            string annotationPath = Path.Combine(testDirectory, "TestData", "chr1_one_transcript_reverse.gtf");

            var chromosome = new Genome(genomePath);
            var model      = new GeneModel(chromosome, annotationPath);
            var translated = model.Translate(true).ToList();

            Assert.AreEqual(expectedSequence, translated[0].BaseSequence);
        }
Example #3
0
        /// <summary>
        /// Builds a gene model from TestData/chr7_one_transcript_reverse.gtf over the chromosome 7 FASTA file, translates it,
        /// and checks the sequence of the first resulting protein.
        /// </summary>
        public void TranslateAnotherReverseStrand()
        {
            // See http://useast.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000233306;r=7:38362864-38363518;t=ENST00000426402
            const string expectedSequence =
                "MQWALAVLLAFLSPASQKSSNLEGRTKSVIRQTGSSAEITCDLAEGSNGYIHWYLHQEGKAPQRLQYYDSYNSKVVLESGVSPGKYYTYASTRNNLRLILRNLIENDFGVYYCATWDG";

            string testDirectory  = TestContext.CurrentContext.TestDirectory;
            string genomePath     = Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.7.fa");
            string annotationPath = Path.Combine(testDirectory, "TestData", "chr7_one_transcript_reverse.gtf");

            var chromosome = new Genome(genomePath);
            var model      = new GeneModel(chromosome, annotationPath);
            var translated = model.Translate(true).ToList();

            Assert.AreEqual(expectedSequence, translated[0].BaseSequence);
        }
Example #4
0
        /// <summary>
        /// Builds a gene model from TestData/chr5_selenocysteineContaining.gff3 over the chromosome 5 FASTA file, translates it
        /// using the second and third outputs of <c>GeneModel.GetImportantProteinAccessions</c> (read from the pep.all FASTA file),
        /// and checks the sequence of the first resulting protein. The expected sequence contains 'U' residues.
        /// </summary>
        public void TranslateSelenocysteineContaining()
        {
            const string expectedSequence =
                "MWRSLGLALALCLLPSGGTESQDQSSLCKQPPAWSIRDQDPMLNSNGSVTVVALLQASUYLCILQASKLEDLRVKLKKEGYSNISYIVVNHQGISSRLKYTHLKNKVSEHIPVYQQEENQTDVWTLLNGSKDDFLIYDRCGRLVYHLGLPFSFLTFPYVEEAIKIAYCEKKCGNCSLTTLKDEDFCKRVSLATVDKTVETPSPHYHHEHHHNHGHQHLGSSELSENQQPGAPNAPTHPAPPGLHHHHKHKGQHRQGHPENRDMPASEDLQDLQKKLCRKRCINQLLCKLPTDSELAPRSUCCHCRHLIFEKTGSAITUQCKENLPSLCSUQGLRAEENITESCQURLPPAAUQISQQLIPTEASASURUKNQAKKUEUPSN";

            string testDirectory    = TestContext.CurrentContext.TestDirectory;
            string genomePath       = Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.5.fa");
            string annotationPath   = Path.Combine(testDirectory, "TestData", "chr5_selenocysteineContaining.gff3");
            string proteinFastaPath = Path.Combine(testDirectory, "Homo_sapiens.GRCh38.pep.all.fa");

            var chromosome = new Genome(genomePath);
            var model      = new GeneModel(chromosome, annotationPath);

            // The first output is not needed here; the other two are forwarded to Translate.
            GeneModel.GetImportantProteinAccessions(
                proteinFastaPath,
                out Dictionary <string, string> accessionSequences,
                out HashSet <string> badAccessions,
                out Dictionary <string, string> seLookup);

            var translated = model.Translate(true, badAccessions, seLookup).ToList();

            Assert.AreEqual(expectedSequence, translated[0].BaseSequence);
        }
Example #5
0
        /// <summary>
        /// Translates two gene models (TestData/chrM_one_transcript_reverse.gtf and TestData/chrM_one_transcript_reverse2.gtf)
        /// against the same chromosome MT FASTA file and checks the first protein produced by each.
        /// </summary>
        public void TranslateMTSeq()
        {
            const string expectedFirstSequence =
                "MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLLQPFADAMKLFTKEPLKPATSTITLYITAPTLALTIALLLWTPLPMPNPLVNLNLGLLFILATSSLAVYSILWSGWASNSNYALIGALRAVAQTISYE" +
                "VTLAIILLSTLLMSGSFNLSTLITTQEHLWLLLPSWPLAMMWFISTLAETNRTPFDLAEGESELVSGFNIEYAAGPFALFFMAEYTNIIMMNTLTTTIFLGTTYDALSPELYTTYFVTKTLLLTSLFLWIRTAYPRFRYDQLMHLLWKNFLPLTLALLMWYVSMPITISSIPPQT";
            const string expectedSecondSequence =
                "MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVLTKKMNPRSTEAAIKYFLTQATASMILLMAILFNNMLSGQWTMTNTTNQYSSLMIMMAMAMKLGMAPFHFWVPEVTQGTPLTSGLLLLTWQKLAPISIMYQISPS" +
                "LNVSLLLTLSILSIMAGSWGGLNQTQLRKILAYSSITHMGWMMAVLPYNPNMTILNLTIYIILTTTAFLLLNLNSSTTTLLLSRTWNKLTWLTPLIPSTLLSLGGLPPLTGFLPKWAIIEEFTKNNSLIIPTIMATITLLNLYFYLRLIYSTSITLLPMSNNVKM" +
                "KWQFEHTKPTPFLPTLIALTTLLLPISPFMLMIL";

            string testDirectory = TestContext.CurrentContext.TestDirectory;
            var    mitochondrial = new Genome(Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.MT.fa"));

            // First annotation file.
            var firstModel    = new GeneModel(mitochondrial, Path.Combine(testDirectory, "TestData", "chrM_one_transcript_reverse.gtf"));
            var firstProteins = firstModel.Translate(true).ToList();
            Assert.AreEqual(expectedFirstSequence, firstProteins[0].BaseSequence);

            // Second annotation file, translated against the same genome instance.
            var secondModel    = new GeneModel(mitochondrial, Path.Combine(testDirectory, "TestData", "chrM_one_transcript_reverse2.gtf"));
            var secondProteins = secondModel.Translate(true).ToList();
            Assert.AreEqual(expectedSecondSequence, secondProteins[0].BaseSequence);
        }
        /// <summary>
        /// Times and checks that all proteins in the pep.all.fasta protein fasta file are the same as are output by this library.
        /// </summary>
        /// <remarks>
        /// Steps, all relative to the current working directory:
        /// 1. If any of the three decompressed reference files is missing or empty, download all three .gz archives
        ///    from the Ensembl release-81 FTP site, then decompress whichever files are missing or empty.
        /// 2. Translate the gene model and load the protein FASTA.
        /// 3. Compare sequences by accession, stopping at the first mismatch. Accessions that the gene model did not
        ///    produce are skipped.
        /// 4. Print the elapsed time and the result, wait for a key press, then delete every file whose name starts
        ///    with one of the reference file stems.
        /// The stopwatch covers steps 1 and 2 only; it is stopped before the comparison loop.
        /// </remarks>
        private static void SameProteins()
        {
            Stopwatch stopwatch = Stopwatch.StartNew();

            // download and decompress references
            string genomeFasta   = "Homo_sapiens.GRCh38.dna.primary_assembly.fa";
            string geneModelFile = "Homo_sapiens.GRCh38.81.gff3";
            string proteinFasta  = "Homo_sapiens.GRCh38.pep.all.fa";

            string[] gunzippedFiles = new[] { genomeFasta, geneModelFile, proteinFasta };
            if (!gunzippedFiles.All(f => File.Exists(f) && new FileInfo(f).Length > 0))
            {
                using (WebClient client = new WebClient())
                {
                    client.DownloadFile(@"ftp://ftp.ensembl.org/pub/release-81//fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz", "Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz");
                    client.DownloadFile(@"ftp://ftp.ensembl.org/pub/release-81/gff3/homo_sapiens/Homo_sapiens.GRCh38.81.gff3.gz", "Homo_sapiens.GRCh38.81.gff3.gz");
                    client.DownloadFile(@"ftp://ftp.ensembl.org/pub/release-81//fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz", "Homo_sapiens.GRCh38.pep.all.fa.gz");
                }
            }

            foreach (string gunzippedFile in gunzippedFiles)
            {
                if (File.Exists(gunzippedFile) && new FileInfo(gunzippedFile).Length > 0)
                {
                    continue; // already decompressed
                }

                // The archive is only read, so open it read-only rather than with read/write access.
                using (FileStream compressed = File.OpenRead(gunzippedFile + ".gz"))
                using (GZipStream gunzip = new GZipStream(compressed, CompressionMode.Decompress))
                using (FileStream decompressed = File.Create(gunzippedFile))
                {
                    gunzip.CopyTo(decompressed);
                }
            }

            GeneModel.GetImportantProteinAccessions(proteinFasta, out Dictionary <string, string> proteinAccessionSequence, out HashSet <string> bad, out Dictionary <string, string> se);

            Genome         genome            = new Genome(genomeFasta);
            GeneModel      geneModel         = new GeneModel(genome, geneModelFile);
            // ToList() keeps this call consistent with the other Translate call sites in this file.
            List <Protein> geneBasedProteins = geneModel.Translate(true, bad, se).ToList();
            List <Protein> pepDotAll         = ProteinDbLoader.LoadProteinFasta(proteinFasta, true, DecoyType.None, false,
                                                                                ProteinDbLoader.EnsemblAccessionRegex, ProteinDbLoader.EnsemblFullNameRegex, ProteinDbLoader.EnsemblFullNameRegex, ProteinDbLoader.EnsemblGeneNameRegex, null, out List <string> errors);
            Dictionary <string, string> accSeq = geneBasedProteins.ToDictionary(p => p.Accession, p => p.BaseSequence);

            // Timing ends here; the comparison below is not included in the reported time.
            stopwatch.Stop();
            TimeSpan elapsed = stopwatch.Elapsed;

            bool allAreEqual = true;

            foreach (Protein p in pepDotAll)
            {
                // Per the original note, sequences containing '*' or 'X' no longer need filtering here;
                // that is now handled with the bad accessions passed to Translate.
                if (accSeq.TryGetValue(p.Accession, out string seq) && p.BaseSequence != seq)
                {
                    allAreEqual = false;
                    break;
                }
            }

            Console.WriteLine("Finished checking that all proteins are the same.");
            // TimeSpan.Minutes is only the 0-59 component; use whole TotalMinutes so runs over an hour report correctly.
            Console.WriteLine("Time elapsed: " + ((int)elapsed.TotalMinutes).ToString() + " minutes and " + elapsed.Seconds.ToString() + " seconds.");
            Console.WriteLine("Result: all proteins are " + (allAreEqual ? "" : "not ") + "equal ");
            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();

            // Remove both the decompressed files and their archives (anything sharing the file-name stem).
            foreach (string referenceFile in gunzippedFiles)
            {
                foreach (string file in Directory.GetFiles(Environment.CurrentDirectory, Path.GetFileNameWithoutExtension(referenceFile) + "*"))
                {
                    File.Delete(file);
                }
            }
        }