/// <summary>
/// Annotates coding sequences in the sample PacBio gene model using start codons from the
/// sample GFF3 reference model, then checks the accession and sequence of the first two proteins.
/// </summary>
public void GffAppliedToOther()
        {
            string testData = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData");

            GeneModel reference = new GeneModel(genome, Path.Combine(testData, "sample_gff.gff3"));
            GeneModel alternate = new GeneModel(genome, Path.Combine(testData, "sample_pacbio.gtf"));

            alternate.CreateCDSFromAnnotatedStartCodons(reference);
            List<Protein> translated = alternate.Genes.SelectMany(gene => gene.Translate(true)).ToList();

            // Forward strand, single coding region
            Protein forward = translated[0];
            Assert.AreEqual("PB2015.1.1", forward.Accession);
            Assert.AreEqual(
                "MVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRKVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVTWGIGFLHSVSQLAFAVHLLFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQHRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPIIYTLRNKDMKTAIRQLRKWDAHSSVKF",
                forward.BaseSequence);

            // Reverse strand, single coding region
            Protein reverse = translated[1];
            Assert.AreEqual("PB2015.2.1", reverse.Accession);
            Assert.AreEqual(
                "TSLWTPQAKLPTFQQLLHTQLLPPSGLFRPSSCFTRAFPGPTFVSWQPSLARFLPVSQQP" +
                "RQAQVLPHTGLSTSSLCLTVASPRPTPVPGHHLRAQNLLKSDSLVPTAASWWPMKAQNLL" +
                "KLTCPGPAPASCQRLQAQPLPHGGFSRPTSSSWLGLQAQLLPHNSLFWPSSCPANGGQCR" +
                "PKTSSSQTLQAHLLLPGGINRPSFDLRTASAGPALASQGLFPGPALASWQLPQAKFLPAC" +
                "QQPQQAQLLPHSGPFRPNL",
                reverse.BaseSequence);
        }
        /// <summary>
        /// Loads the sample GTF, checks the total transcript count, and translates every gene.
        /// </summary>
        public void GtfBasics()
        {
            string    gtfPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData", "sample_gtf.gtf");
            GeneModel model   = new GeneModel(genome, gtfPath);

            int transcriptCount = model.Genes.SelectMany(gene => gene.Transcripts).Count();
            Assert.AreEqual(165, transcriptCount);

            // The result is not inspected; materializing the list forces translation of every gene.
            List<Protein> translated = model.Genes.SelectMany(gene => gene.Translate(true)).ToList();
        }
Exemple #3
0
        /// <summary>
        /// Loads the gene model in HardReverseStrand/reverse.gff3 and checks that the first transcript's
        /// coding sequence matches codingSeq.fa and that the first translated protein matches the expected sequence.
        /// </summary>
        public void TranslateHardReverseStrand()
        {
            string testDir = TestContext.CurrentContext.TestDirectory;
            string caseDir = Path.Combine(testDir, "HardReverseStrand");

            Genome        chr14      = new Genome(Path.Combine(testDir, "Homo_sapiens.GRCh38.dna.chromosome.14.fa"));
            GeneModel     model      = new GeneModel(chr14, Path.Combine(caseDir, "reverse.gff3"));
            List<Protein> translated = model.Translate(true).ToList();
            ISequence     expectedCds = new FastAParser().Parse(Path.Combine(caseDir, "codingSeq.fa")).First();

            // Compare the coding sequence of the first transcript with the one stored on disk.
            string expectedCdsText = SequenceExtensions.ConvertToString(expectedCds);
            string actualCdsText   = SequenceExtensions.ConvertToString(model.Genes[0].Transcripts[0].RetrieveCodingSequence());
            Assert.AreEqual(expectedCdsText, actualCdsText);

            string expectedProtein =
                "MNLQAQPKAQNKRKRCLFGGQEPAPKEQPPPLQPPQQSIRVKEEQYLGHEGPGGAVSTSQ" +
                "PVELPPPSSLALLNSVVYGPERTSAAMLSQQVASVKWPNSVMAPGRGPERGGGGGVSDSS" +
                "WQQQPGQPPPHSTWNCHSLSLYSATKGSPHPGVGVPTYYNHPEALKREKAGGPQLDRYVR" +
                "PMMPQKVQLEVGRPQAPLNSFHAAKKPPNQSLPLQPFQLAFGHQVNRQVFRQGPPPPNPV" +
                "AAFPPQKQQQQQQPQQQQQQQQAALPQMPLFENFYSMPQQPSQQPQDFGLQPAGPLGQSH" +
                "LAHHSMAPYPFPPNPDMNPELRKALLQDSAPQPALPQVQIPFPRRSRRLSKEGILPPSAL" +
                "DGAGTQPGQEATGNLFLHHWPLQQPPPGSLGQPHPEALGFPLELRESQLLPDGERLAPNG" +
                "REREAPAMGSEEGMRAVSTGDCGQVLRGGVIQSTRRRRRASQEANLLTLAQKAVELASLQ" +
                "NAKDGSGSEEKRKSVLASTTKCGVEFSEPSLATKRAREDSGMVPLIIPVSVPVRTVDPTE" +
                "AAQAGGLDEDGKGPEQNPAEHKPSVIVTRRRSTRIPGTDAQAQAEDMNVKLEGEPSVRKP" +
                "KQRPRPEPLIIPTKAGTFIAPPVYSNITPYQSHLRSPVRLADHPSERSFELPPYTPPPIL" +
                "SPVREGSGLYFNAIISTSTIPAPPPITPKSAHRTLLRTNSAEVTPPVLSVMGEATPVSIE" +
                "PRINVGSRFQAEIPLMRDRALAAADPHKADLVWQPWEDLESSREKQRQVEDLLTAACSSI" +
                "FPGAGTNQELALHCLHESRGDILETLNKLLLKKPLRPHNHPLATYHYTGSDQWKMAERKL" +
                "FNKGIAIYKKDFFLVQKLIQTKTVAQCVEFYYTYKKQVKIGRNGTLTFGDVDTSDEKSAQ" +
                "EEVEVDIKTSQKFPRVPLPRRESPSEERLEPKREVKEPRKEGEEEVPEIQEKEEQEEGRE" +
                "RSRRAAAVKATQTLQANESASDILILRSHESNAPGSAGGQASEKPREGTGKSRRALPFSE" +
                "KKKKTETFSKTQNQENTFPCKKCGR";
            Assert.AreEqual(expectedProtein, translated[0].BaseSequence);
        }
        /// <summary>
        /// Replaces this object's DNA with a cross between its current DNA and <paramref name="otherDna"/>.
        /// Genes are paired by index; each new gene receives one allele picked from this DNA's gene and one
        /// picked from the other DNA's gene, and keeps this DNA's gene set. Afterwards the attributes are
        /// re-summed and <c>DNAUpdated</c> is raised.
        /// </summary>
        /// <param name="otherDna">DNA to combine with. Its gene list must have the same length as this DNA's gene list.</param>
        /// <exception cref="InvalidOperationException">No DNA has been set on this object yet.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="otherDna"/> is null.</exception>
        /// <exception cref="ArgumentException">The two gene lists differ in length.</exception>
        public void SmashDNA(DnaModel otherDna)
        {
            // Both parents are assumed to carry the same genes in the same order; only the counts are verified here.
            if (_myDna == null)
            {
                throw new InvalidOperationException("WHY YOU GOT NO DNA FOO!? P.S. You need to have SetDNA before you call this method.");
            }
            if (otherDna == null)
            {
                throw new ArgumentNullException(nameof(otherDna));
            }
            if (_myDna.GeneList.Count != otherDna.GeneList.Count)
            {
                throw new ArgumentException("Number of Genes must match, will later add exception handling since we may want to change this value often as we add new genes.", nameof(otherDna));
            }

            var newDna = new DnaModel();

            for (int i = 0; i < _myDna.GeneList.Count; i++)
            {
                var myGene    = _myDna.GeneList[i];
                var theirGene = otherDna.GeneList[i];

                // One allele from each parent; the gene set is taken from this object's gene.
                var newGene = new GeneModel();
                newGene.AlleleList.Add(SplitTheGene(myGene));
                newGene.AlleleList.Add(SplitTheGene(theirGene));
                newGene.GeneSet = myGene.GeneSet;
                newDna.GeneList.Add(newGene);
            }
            _myDna = newDna;
            SumAttributes();

            // Null-conditional invoke avoids a race between the null check and the call.
            DNAUpdated?.Invoke(this, EventArgs.Empty);
        }
        /// <summary>
        /// Picks one allele at random from <paramref name="gene"/>.
        /// </summary>
        /// <param name="gene">Gene whose allele list is sampled.</param>
        /// <returns>A randomly chosen element of <c>gene.AlleleList</c>.</returns>
        private AlleleModel SplitTheGene(GeneModel gene)
        {
            //To Implement: Awesome cracking sounds.

            // Random.Range(int, int) excludes the upper bound, so every index of the list is a candidate.
            // The previous Math.Round(Random.value) could only yield 0 or 1, i.e. it assumed exactly two alleles.
            int index = UnityEngine.Random.Range(0, gene.AlleleList.Count);

            return gene.AlleleList[index];
        }
        /// <summary>
        /// Applies a single variant from chr_1_one_heterozygous_synonymous.vcf to a one-transcript gene model
        /// and checks that the protein sequence is unchanged while the variant is reported in the protein
        /// name and in the written FASTA header.
        /// </summary>
        public void OneTranscriptOneHeterozygousSynonymous()
        {
            string testDir = TestContext.CurrentContext.TestDirectory;
            string vcfDir  = Path.Combine(testDir, "TestVcfs");

            Genome        genome   = new Genome(Path.Combine(testDir, "TestData", "chr1_sample.fa"));
            VCFParser     parser   = new VCFParser(Path.Combine(vcfDir, "chr_1_one_heterozygous_synonymous.vcf"));
            List<Variant> variants = parser.Select(entry => new Variant(null, entry, genome)).ToList();

            Assert.AreEqual(1, variants.Count);

            GeneModel        model             = new GeneModel(genome, Path.Combine(testDir, "TestData", "chr1_one_transcript.gtf"));
            List<Protein>    referenceProteins = model.Translate(true).ToList();
            List<Transcript> variantTranscripts = model.ApplyVariants(variants);
            List<Protein>    variantProteins   = variantTranscripts.Select(transcript => transcript.Protein()).ToList();

            Assert.AreEqual(1, model.Genes.Count);
            Assert.AreEqual(1, variantProteins.Count);
            Assert.AreEqual(1, referenceProteins.Count);

            // A set of both sequences has one element exactly when the variant left the sequence unchanged.
            HashSet<string> distinctSequences = new HashSet<string>
            {
                variantProteins[0].BaseSequence,
                referenceProteins[0].BaseSequence
            };
            Assert.AreEqual(1, distinctSequences.Count);

            string silentTag       = FunctionalClass.SILENT.ToString();
            string heterozygousTag = GenotypeType.HETEROZYGOUS.ToString();
            Assert.IsTrue(variantProteins.Any(p => p.FullName.Contains(silentTag)));       // synonymous
            Assert.IsTrue(variantProteins.Any(p => p.FullName.Contains(heterozygousTag))); // heterozygous
            Assert.IsTrue(variantProteins.Any(p => p.FullName.Contains("1:69666")));

            // Round-trip through a FASTA file and inspect its first line.
            string fastaPath = Path.Combine(vcfDir, "chr_1_one_heterozygous_synonymous.fasta");
            ProteinDbWriter.WriteFastaDatabase(variantProteins, fastaPath, " ");

            string[] fastaLines = File.ReadAllLines(fastaPath);
            string   header     = fastaLines[0];
            Assert.IsTrue(header.Contains(FunctionalClass.SILENT.ToString())); // synonymous
            Assert.IsTrue(header.Contains("1:69666"));
        }
        /// <summary>
        /// Applies every variant in chr19problematic.vcf to the gene model in problematicChr19Gene.gff3.
        /// Contains no assertions; it passes when nothing throws.
        /// </summary>
        public void ProblematicChr19Gene()
        {
            string testDir = TestContext.CurrentContext.TestDirectory;
            string caseDir = Path.Combine(testDir, "ProblematicChr19");

            Genome    chr19 = new Genome(Path.Combine(testDir, "Homo_sapiens.GRCh38.dna.chromosome.19.fa"));
            GeneModel model = new GeneModel(chr19, Path.Combine(caseDir, "problematicChr19Gene.gff3"));

            VCFParser parser = new VCFParser(Path.Combine(caseDir, "chr19problematic.vcf"));
            var variants = parser
                .Select(v => new Variant(null, v, chr19.Chromosomes[0]))
                .ToList();

            model.ApplyVariants(variants);
        }
Exemple #8
0
        /// <summary>
        /// Translates the gene model in chr1_one_transcript_reverse.gtf and checks the first protein's sequence.
        /// </summary>
        public void TranslateReverseStrand()
        {
            string testData = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData");

            Genome        chr1Sample = new Genome(Path.Combine(testData, "chr1_sample.fa"));
            GeneModel     model      = new GeneModel(chr1Sample, Path.Combine(testData, "chr1_one_transcript_reverse.gtf"));
            List<Protein> translated = model.Translate(true).ToList();

            const string expected = "FFYFIIWSLTLLPRAGLELLTSSDPPASASQSVGITGVSHHAQ";
            Assert.AreEqual(expected, translated[0].BaseSequence);
        }
 /// <summary>
 /// Applies the single-character substitutions from chr19problematic.vcf to the gene model in
 /// chr19variantTranscript.gff3 and builds a protein for each resulting transcript.
 /// Contains no assertions; it passes when nothing throws.
 /// </summary>
 public void Chr19VariantTranscript()
 {
     string testDir = TestContext.CurrentContext.TestDirectory;
     string caseDir = Path.Combine(testDir, "ProblematicChr19");

     Genome    chr19  = new Genome(Path.Combine(testDir, "Homo_sapiens.GRCh38.dna.chromosome.19.fa"));
     GeneModel model  = new GeneModel(chr19, Path.Combine(caseDir, "chr19variantTranscript.gff3"));
     VCFParser parser = new VCFParser(Path.Combine(caseDir, "chr19problematic.vcf"));

     // Keep only variants whose reference and second allele strings are both one character long.
     var singleBaseVariants = parser
         .Select(v => new Variant(null, v, chr19.Chromosomes[0]))
         .Where(v => v.SecondAlleleString.Length == 1 && v.ReferenceAlleleString.Length == 1)
         .ToList();

     List<Transcript> variantTranscripts = model.ApplyVariants(singleBaseVariants).ToList();
     List<Protein>    variantProteins    = variantTranscripts.Select(transcript => transcript.Protein(null)).ToList();
 }
        /// <summary>
        /// Annotates coding sequences in the sample PacBio gene model from the sample GFF3 reference
        /// and writes the result to sample_pacbio_merged.gtf. Contains no assertions.
        /// </summary>
        public void OutputGtfFromGeneModel()
        {
            string testData = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData");

            GeneModel reference = new GeneModel(genome, Path.Combine(testData, "sample_gff.gff3"));
            GeneModel alternate = new GeneModel(genome, Path.Combine(testData, "sample_pacbio.gtf"));

            alternate.CreateCDSFromAnnotatedStartCodons(reference);

            string mergedGtfPath = Path.Combine(testData, "sample_pacbio_merged.gtf");
            alternate.PrintToGTF(mergedGtfPath);
        }
Exemple #11
0
        /// <summary>
        /// Translates the gene model in chr7_one_transcript_reverse.gtf and checks the first protein's sequence.
        /// </summary>
        public void TranslateAnotherReverseStrand()
        {
            // See http://useast.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000233306;r=7:38362864-38363518;t=ENST00000426402

            string testDir = TestContext.CurrentContext.TestDirectory;

            Genome        chr7       = new Genome(Path.Combine(testDir, "Homo_sapiens.GRCh38.dna.chromosome.7.fa"));
            GeneModel     model      = new GeneModel(chr7, Path.Combine(testDir, "TestData", "chr7_one_transcript_reverse.gtf"));
            List<Protein> translated = model.Translate(true).ToList();

            const string expected = "MQWALAVLLAFLSPASQKSSNLEGRTKSVIRQTGSSAEITCDLAEGSNGYIHWYLHQEGKAPQRLQYYDSYNSKVVLESGVSPGKYYTYASTRNNLRLILRNLIENDFGVYYCATWDG";
            Assert.AreEqual(expected, translated[0].BaseSequence);
        }
Exemple #12
0
        /// <summary>
        /// Translates the gene model in chr5_selenocysteineContaining.gff3, passing along the accession sets
        /// read from the Ensembl protein FASTA, and checks the first protein's sequence.
        /// </summary>
        public void TranslateSelenocysteineContaining()
        {
            string testDir = TestContext.CurrentContext.TestDirectory;

            Genome    chr5  = new Genome(Path.Combine(testDir, "Homo_sapiens.GRCh38.dna.chromosome.5.fa"));
            GeneModel model = new GeneModel(chr5, Path.Combine(testDir, "TestData", "chr5_selenocysteineContaining.gff3"));

            // Only the second and third outputs are forwarded to Translate; the first is not used here.
            string pepFasta = Path.Combine(testDir, "Homo_sapiens.GRCh38.pep.all.fa");
            GeneModel.GetImportantProteinAccessions(
                pepFasta,
                out Dictionary<string, string> accessionToSequence,
                out HashSet<string> badAccessions,
                out Dictionary<string, string> selenocysteineContaining);

            List<Protein> translated = model.Translate(true, badAccessions, selenocysteineContaining).ToList();

            const string expected = "MWRSLGLALALCLLPSGGTESQDQSSLCKQPPAWSIRDQDPMLNSNGSVTVVALLQASUYLCILQASKLEDLRVKLKKEGYSNISYIVVNHQGISSRLKYTHLKNKVSEHIPVYQQEENQTDVWTLLNGSKDDFLIYDRCGRLVYHLGLPFSFLTFPYVEEAIKIAYCEKKCGNCSLTTLKDEDFCKRVSLATVDKTVETPSPHYHHEHHHNHGHQHLGSSELSENQQPGAPNAPTHPAPPGLHHHHKHKGQHRQGHPENRDMPASEDLQDLQKKLCRKRCINQLLCKLPTDSELAPRSUCCHCRHLIFEKTGSAITUQCKENLPSLCSUQGLRAEENITESCQURLPPAAUQISQQLIPTEASASURUKNQAKKUEUPSN";
            Assert.AreEqual(expected, translated[0].BaseSequence);
        }
Exemple #13
0
        /// <summary>
        /// Filters GTF or GFF entries that lack strand information, optionally drops transcripts whose
        /// StringTie abundance estimates are not positive, and then annotates coding sequences using the
        /// start codons of a reference gene model.
        /// The filtered model is written next to <paramref name="gtfPath"/> with the suffix ".filtered.gtf",
        /// and the CDS-annotated model with the suffix ".filtered.withcds.gtf".
        /// </summary>
        /// <remarks>
        /// Within each chromosome, a feature is kept only if it has a strand and the most recent stranded
        /// "transcript" feature before it (or the feature itself) was accepted; features preceding the
        /// first transcript are therefore dropped.
        /// </remarks>
        /// <param name="gtfPath">Path of the GTF/GFF file to filter.</param>
        /// <param name="referenceGenomePath">Path of the genome file used to load both gene models.</param>
        /// <param name="referenceGeneModelPath">Path of the reference gene model that supplies the annotated start codons.</param>
        /// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">
        /// When true, a transcript is accepted only if both its FPKM and TPM attributes are present and parse to a value greater than zero.
        /// </param>
        public static void FilterGtfEntriesWithoutStrand(string gtfPath, string referenceGenomePath, string referenceGeneModelPath, bool filterEntriesWithZeroAbundanceStringtieEstimates = false)
        {
            // GTF attribute values are machine-written, so they are parsed with the invariant culture
            // rather than whatever culture the current thread happens to use.
            bool IsPositiveNumber(string text)
            {
                return double.TryParse(text,
                                       System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                                       System.Globalization.CultureInfo.InvariantCulture,
                                       out double value) &&
                       value > 0;
            }

            var    chromFeatures   = GeneModel.SimplerParse(gtfPath);
            string filteredGtfPath = Path.Combine(Path.GetDirectoryName(gtfPath), Path.GetFileNameWithoutExtension(gtfPath) + ".filtered.gtf");

            using (var file = File.Create(filteredGtfPath))
            {
                var formatter = new GffFormatter();
                foreach (var chromISeq in chromFeatures)
                {
                    var filteredFeatures = new List<MetadataListItem<List<string>>>();

                    // A missing or unexpectedly typed "features" entry is treated as "no features".
                    if (chromISeq.Metadata.TryGetValue("features", out object featuresObj) &&
                        featuresObj is List<MetadataListItem<List<string>>> features)
                    {
                        bool okayTranscript = false;
                        foreach (var feature in features)
                        {
                            if (!feature.SubItems.TryGetValue("strand", out _))
                            {
                                continue;
                            }
                            if (feature.Key == "transcript")
                            {
                                // Attributes are only needed to judge transcripts, so they are only parsed here.
                                var  attributes = GeneModel.SplitAttributes(feature.FreeText);
                                bool okayFpkm   = !filterEntriesWithZeroAbundanceStringtieEstimates ||
                                                  (attributes.TryGetValue("FPKM", out string fpkm) && IsPositiveNumber(fpkm));
                                bool okayTpm    = !filterEntriesWithZeroAbundanceStringtieEstimates ||
                                                  (attributes.TryGetValue("TPM", out string tpm) && IsPositiveNumber(tpm));
                                okayTranscript = okayFpkm && okayTpm;
                            }
                            if (okayTranscript)
                            {
                                filteredFeatures.Add(feature);
                            }
                        }
                    }
                    chromISeq.Metadata["features"] = filteredFeatures;
                }
                formatter.Format(file, chromFeatures);
            }

            // Reload the filtered model and add CDS annotations taken from the reference model.
            Genome    ensemblGenome      = new Genome(referenceGenomePath);
            GeneModel newGeneModel       = new GeneModel(ensemblGenome, filteredGtfPath);
            GeneModel referenceGeneModel = new GeneModel(ensemblGenome, referenceGeneModelPath);

            newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);
            string filteredGtfWithCdsPath = Path.Combine(Path.GetDirectoryName(filteredGtfPath), Path.GetFileNameWithoutExtension(filteredGtfPath) + ".withcds.gtf");

            newGeneModel.PrintToGTF(filteredGtfWithCdsPath);
        }
Exemple #14
0
        /// <summary>
        /// Builds a DNA model with one gene per gene set in <c>CompleteGenome.TheGenome</c>.
        /// Each gene receives two alleles chosen by two separate random draws from its set, so they may coincide.
        /// </summary>
        /// <returns>The newly built DNA model.</returns>
        private DnaModel GenerateDefaultDNA()
        {
            var dna = new DnaModel();

            foreach (var geneSet in CompleteGenome.TheGenome)
            {
                int alleleCount = geneSet.allAlleles.Count;

                // Draw both indices before building the gene, in the same order as the allele list is filled.
                int firstIndex  = UnityEngine.Random.Range(0, alleleCount);
                int secondIndex = UnityEngine.Random.Range(0, alleleCount);

                var gene = new GeneModel();
                gene.AlleleList.Add(geneSet.allAlleles[firstIndex]);
                gene.AlleleList.Add(geneSet.allAlleles[secondIndex]);
                gene.GeneSet = geneSet;

                dna.GeneList.Add(gene);
            }

            return dna;
        }
Exemple #15
0
        /// <summary>
        /// Translates two single-transcript gene models against the MT chromosome FASTA and checks the
        /// first protein produced by each.
        /// </summary>
        public void TranslateMTSeq()
        {
            string testDir  = TestContext.CurrentContext.TestDirectory;
            string testData = Path.Combine(testDir, "TestData");

            Genome mito = new Genome(Path.Combine(testDir, "Homo_sapiens.GRCh38.dna.chromosome.MT.fa"));

            // First gene model
            GeneModel     model      = new GeneModel(mito, Path.Combine(testData, "chrM_one_transcript_reverse.gtf"));
            List<Protein> translated = model.Translate(true).ToList();

            string expectedFirst =
                "MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLLQPFADAMKLFTKEPLKPATSTITLYITAPTLALTIALLLWTPLPMPNPLVNLNLGLLFILATSSLAVYSILWSGWASNSNYALIGALRAVAQTISYE" +
                "VTLAIILLSTLLMSGSFNLSTLITTQEHLWLLLPSWPLAMMWFISTLAETNRTPFDLAEGESELVSGFNIEYAAGPFALFFMAEYTNIIMMNTLTTTIFLGTTYDALSPELYTTYFVTKTLLLTSLFLWIRTAYPRFRYDQLMHLLWKNFLPLTLALLMWYVSMPITISSIPPQT";
            Assert.AreEqual(expectedFirst, translated[0].BaseSequence);

            // Second gene model, same genome
            model      = new GeneModel(mito, Path.Combine(testData, "chrM_one_transcript_reverse2.gtf"));
            translated = model.Translate(true).ToList();

            string expectedSecond =
                "MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVLTKKMNPRSTEAAIKYFLTQATASMILLMAILFNNMLSGQWTMTNTTNQYSSLMIMMAMAMKLGMAPFHFWVPEVTQGTPLTSGLLLLTWQKLAPISIMYQISPS" +
                "LNVSLLLTLSILSIMAGSWGGLNQTQLRKILAYSSITHMGWMMAVLPYNPNMTILNLTIYIILTTTAFLLLNLNSSTTTLLLSRTWNKLTWLTPLIPSTLLSLGGLPPLTGFLPKWAIIEEFTKNNSLIIPTIMATITLLNLYFYLRLIYSTSITLLPMSNNVKM" +
                "KWQFEHTKPTPFLPTLIALTTLLLPISPFMLMIL";
            Assert.AreEqual(expectedSecond, translated[0].BaseSequence);
        }
Exemple #16
0
        /// <summary>
        /// Filters GTF or GFF entries that lack strand information and writes the remaining
        /// entries to <paramref name="gtfOutPath"/>, overwriting any existing file.
        /// </summary>
        /// <remarks>
        /// Within each chromosome, a feature is kept only if it has a strand and the most recent stranded
        /// "transcript" feature before it (or the feature itself) was accepted; features preceding the
        /// first transcript are therefore dropped.
        /// </remarks>
        /// <param name="gtfPath">Path of the GTF/GFF file to filter.</param>
        /// <param name="gtfOutPath">Path of the filtered file to create.</param>
        /// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">
        /// When true, a transcript is accepted only if both its FPKM and TPM attributes are present and parse to a value greater than zero.
        /// </param>
        public void FilterGtfEntriesWithoutStrand(string gtfPath, string gtfOutPath, bool filterEntriesWithZeroAbundanceStringtieEstimates)
        {
            // GTF attribute values are machine-written, so they are parsed with the invariant culture
            // rather than whatever culture the current thread happens to use.
            bool IsPositiveNumber(string text)
            {
                return double.TryParse(text,
                                       System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                                       System.Globalization.CultureInfo.InvariantCulture,
                                       out double value) &&
                       value > 0;
            }

            var chromFeatures = GeneModel.SimplerParse(gtfPath);

            using (var file = File.Create(gtfOutPath))
            {
                var formatter = new GffFormatter();
                foreach (var chromISeq in chromFeatures)
                {
                    var filteredFeatures = new List<MetadataListItem<List<string>>>();

                    // A missing or unexpectedly typed "features" entry is treated as "no features".
                    if (chromISeq.Metadata.TryGetValue("features", out object featuresObj) &&
                        featuresObj is List<MetadataListItem<List<string>>> features)
                    {
                        bool okayTranscript = false;
                        foreach (var feature in features)
                        {
                            if (!feature.SubItems.TryGetValue("strand", out _))
                            {
                                continue;
                            }
                            if (feature.Key == "transcript")
                            {
                                // Attributes are only needed to judge transcripts, so they are only parsed here.
                                var  attributes = GeneModel.SplitAttributes(feature.FreeText);
                                bool okayFpkm   = !filterEntriesWithZeroAbundanceStringtieEstimates ||
                                                  (attributes.TryGetValue("FPKM", out string fpkm) && IsPositiveNumber(fpkm));
                                bool okayTpm    = !filterEntriesWithZeroAbundanceStringtieEstimates ||
                                                  (attributes.TryGetValue("TPM", out string tpm) && IsPositiveNumber(tpm));
                                okayTranscript = okayFpkm && okayTpm;
                            }
                            if (okayTranscript)
                            {
                                filteredFeatures.Add(feature);
                            }
                        }
                    }
                    chromISeq.Metadata["features"] = filteredFeatures;
                }
                formatter.Format(file, chromFeatures);
            }
        }
        /// <summary>
        /// Builds a one-codon transcript ("AAA"), applies the variant from ugh.vcf, and checks that the
        /// variant transcript reads "AGA", translates to "R" instead of "K", and is annotated as missense.
        /// </summary>
        public void TestMissenseMutation()
        {
            // Make a transcript: one gene, one exon and one CDS, all spanning positions 1-3 on the "+" strand
            byte[]   bases = "AAA".Select(cc => (byte)cc).ToArray();
            Sequence dna   = new Sequence(Alphabets.DNA, bases, false);
            dna.ID = "1";

            Chromosome chromosome = new Chromosome(dna, null);
            Gene       gene       = new Gene("", chromosome, "", "+", 1, 3, null);
            Transcript transcript = new Transcript("", gene, "", "+", 1, 3, "", null, null);
            Exon       exon       = new Exon(transcript, dna, "", 1, 3, dna.ID, "+", null, null);
            transcript.Exons = new List<Exon> { exon };

            CDS codingRegion = new CDS(transcript, dna.ID, "", "+", 1, 3, null, 0);
            transcript.CodingDomainSequences = new List<CDS> { codingRegion };

            // Make a missense mutation
            // ugh.vcf has a homozygous variation that should change the codon from AAA to AGA, which code for K and R
            // # CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample
            // 1   2 .   A   G   64.77 . info   GT:AD:DP:GQ:PL  1/1:2,3:5:69:93,0,69
            string        vcfPath  = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestVcfs", "ugh.vcf");
            List<Variant> variants = new VCFParser(vcfPath)
                                     .Select(v => new Variant(null, v, new Chromosome(dna, null)))
                                     .ToList();

            // Make sure it makes it into the DNA sequence
            transcript.Variants = new HashSet<Variant>(variants);
            List<Transcript> mutated = GeneModel.ApplyVariantsCombinitorially(transcript);

            // The original transcript is left as it was ...
            Assert.AreEqual("AAA", SequenceExtensions.ConvertToString(transcript.Exons[0].Sequence));
            Assert.AreEqual("K", transcript.Protein().BaseSequence);

            // ... while the first variant transcript carries the substitution.
            Transcript firstMutated = mutated[0];
            Assert.AreEqual("AGA", SequenceExtensions.ConvertToString(firstMutated.Exons[0].Sequence));
            Assert.AreEqual("R", firstMutated.Protein().BaseSequence);

            // Make sure it gets annotated as a missense mutation
            string missenseTag = FunctionalClass.MISSENSE.ToString();
            Assert.IsTrue(mutated[0].VariantAnnotations.Any(annotation => annotation.Contains(missenseTag)));
        }
        /// <summary>
        /// Check that annotations from ensembl make it in from a merged gene model.
        /// Developer harness: annotates CDS in a merged StringTie model from the reference model's start
        /// codons, writes the result as GTF, and prints timing and transcript counts.
        /// Uses hard-coded local paths and waits for a key press before returning.
        /// </summary>
        private static void StringtieAndEnsembl202122CDS()
        {
            Stopwatch stopwatch = Stopwatch.StartNew();

            Genome    genome       = new Genome(@"E:\source\repos\Spritz\Test\bin\Debug\TestData\202122.karyotypic.fa");
            string    referenceGff = @"E:\source\repos\Spritz\Test\bin\Debug\TestData\202122.gtf";
            string    alternateGff = @"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-806392539.filtered.gtf";
            GeneModel r            = new GeneModel(genome, referenceGff);
            GeneModel a            = new GeneModel(genome, alternateGff);

            a.CreateCDSFromAnnotatedStartCodons(r);
            a.PrintToGTF(@"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-806392539.filtered.withcds.gtf");

            stopwatch.Stop();
            TimeSpan elapsed = stopwatch.Elapsed;

            // This method annotates CDS; it does not compare proteins, so the message says so.
            Console.WriteLine("Finished annotating coding sequences from the reference gene model.");
            // TotalMinutes keeps whole hours in the figure; the Minutes component alone wraps at 60.
            Console.WriteLine("Time elapsed: " + ((int)elapsed.TotalMinutes).ToString() + " minutes and " + elapsed.Seconds.ToString() + " seconds.");
            Console.WriteLine("Result: there are " + a.Genes.Sum(g => g.Transcripts.Count) + " transcript isoforms in " + alternateGff);
            Console.WriteLine("Result: " + a.Genes.Sum(g => g.Transcripts.Count(t => t.IsProteinCoding())) + " of those transcript isoforms are now annotated as protein coding");
            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();
        }
        /// <summary>
        /// Annotates PacBio transcript model for MCF7 with start codons in the reference model.
        /// Developer harness: writes the annotated model to disk and prints timing and transcript counts.
        /// Uses hard-coded local paths and waits for a key press before returning.
        /// </summary>
        private static void PacBioCds()
        {
            Stopwatch stopwatch = Stopwatch.StartNew();

            Genome    genome       = new Genome(@"E:\ProjectsActive\MCF7PacBio\Homo_sapiens.GRCh37.73.dna.primary_assembly.fa");
            string    referenceGff = @"E:\ProjectsActive\MCF7PacBio\Homo_sapiens.GRCh37.73.gtf";
            string    alternateGff = @"E:\ProjectsActive\MCF7PacBio\IsoSeq_MCF72015edition_polished.unimapped.ensembl.unimapped.gff";
            GeneModel r            = new GeneModel(genome, referenceGff);
            GeneModel a            = new GeneModel(genome, alternateGff);

            a.CreateCDSFromAnnotatedStartCodons(r);
            a.PrintToGTF(@"E:\ProjectsActive\MCF7PacBio\CDSAnnotated_IsoSeq_MCF7_2015edition_polished.unimapped.gff");

            stopwatch.Stop();
            TimeSpan elapsed = stopwatch.Elapsed;

            // This method annotates CDS; it does not compare proteins, so the message says so.
            Console.WriteLine("Finished annotating coding sequences from the reference gene model.");
            // TotalMinutes keeps whole hours in the figure; the Minutes component alone wraps at 60.
            Console.WriteLine("Time elapsed: " + ((int)elapsed.TotalMinutes).ToString() + " minutes and " + elapsed.Seconds.ToString() + " seconds.");
            Console.WriteLine("Result: there are " + a.Genes.Sum(g => g.Transcripts.Count) + " PacBio transcript isoforms");
            Console.WriteLine("Result: " + a.Genes.Sum(g => g.Transcripts.Count(t => t.IsProteinCoding())) + " PacBio transcript isoforms are now annotated as protein coding");
            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();
        }
        /// <summary>
        /// Check that annotations from ensembl make it in from a merged gene model.
        /// Developer harness: annotates CDS in a merged StringTie model from the reference model's start
        /// codons, writes the result as GTF, and prints timing and transcript counts.
        /// Uses hard-coded local paths and waits for a key press before returning.
        /// </summary>
        private static void StringtieAndEnsemblFullCDS()
        {
            Stopwatch stopwatch = Stopwatch.StartNew();

            Genome    genome       = new Genome(@"E:\source\repos\Spritz\Test\bin\Debug\TestData\Homo_sapiens.GRCh38.dna.primary_assembly.fa");
            string    referenceGff = @"E:\source\repos\Spritz\Test\bin\Debug\TestData\Homo_sapiens.GRCh38.81.gtf";
            string    alternateGff = @"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-1914802334.filtered.gtf";
            GeneModel r            = new GeneModel(genome, referenceGff);
            GeneModel a            = new GeneModel(genome, alternateGff);

            a.CreateCDSFromAnnotatedStartCodons(r);
            a.PrintToGTF(@"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-1914802334.filtered.withcds.gtf");

            stopwatch.Stop();
            TimeSpan elapsed = stopwatch.Elapsed;

            // This method annotates CDS; it does not compare proteins, so the message says so.
            Console.WriteLine("Finished annotating coding sequences from the reference gene model.");
            // TotalMinutes keeps whole hours in the figure; the Minutes component alone wraps at 60.
            Console.WriteLine("Time elapsed: " + ((int)elapsed.TotalMinutes).ToString() + " minutes and " + elapsed.Seconds.ToString() + " seconds.");
            Console.WriteLine("Result: there are " + a.Genes.Sum(g => g.Transcripts.Count) + " transcript isoforms in " + alternateGff);
            Console.WriteLine("Result: " + a.Genes.Sum(g => g.Transcripts.Count(t => t.IsProteinCoding())) + " of those transcript isoforms are now annotated as protein coding");
            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();
        }
        /// <summary>
        /// Times and checks that all proteins in the pep.all.fasta protein fasta file are the same as are output by this library.
        /// Developer harness: downloads and decompresses whichever reference files are missing from the
        /// current directory, translates the gene model, compares sequences by accession, prints the
        /// outcome, waits for a key press, and finally deletes the reference files again.
        /// </summary>
        private static void SameProteins()
        {
            Stopwatch stopwatch = Stopwatch.StartNew();

            string genomeFasta   = "Homo_sapiens.GRCh38.dna.primary_assembly.fa";
            string geneModelFile = "Homo_sapiens.GRCh38.81.gff3";
            string proteinFasta  = "Homo_sapiens.GRCh38.pep.all.fa";

            // Each decompressed reference file paired with the location of its gzipped download.
            var referenceUrls = new Dictionary<string, string>
            {
                { genomeFasta,   @"ftp://ftp.ensembl.org/pub/release-81//fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" },
                { geneModelFile, @"ftp://ftp.ensembl.org/pub/release-81/gff3/homo_sapiens/Homo_sapiens.GRCh38.81.gff3.gz" },
                { proteinFasta,  @"ftp://ftp.ensembl.org/pub/release-81//fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz" },
            };

            // download and decompress references; only files that are absent or empty are fetched,
            // so one missing file no longer triggers a re-download of all three archives
            var missingReferences = referenceUrls
                .Where(reference => !File.Exists(reference.Key) || new FileInfo(reference.Key).Length == 0)
                .ToList();
            if (missingReferences.Count > 0)
            {
                using (WebClient client = new WebClient())
                {
                    foreach (var reference in missingReferences)
                    {
                        string gzippedFile = reference.Key + ".gz";
                        client.DownloadFile(reference.Value, gzippedFile);

                        using (FileStream compressed = File.OpenRead(gzippedFile))
                        using (GZipStream gunzip = new GZipStream(compressed, CompressionMode.Decompress))
                        using (FileStream decompressed = File.Create(reference.Key))
                        {
                            gunzip.CopyTo(decompressed);
                        }
                    }
                }
            }

            // Only the second and third outputs are used below.
            GeneModel.GetImportantProteinAccessions(proteinFasta, out Dictionary<string, string> _, out HashSet<string> bad, out Dictionary<string, string> se);

            Genome        genome            = new Genome(genomeFasta);
            GeneModel     geneModel         = new GeneModel(genome, geneModelFile);
            List<Protein> geneBasedProteins = geneModel.Translate(true, bad, se);
            List<Protein> pepDotAll         = ProteinDbLoader.LoadProteinFasta(proteinFasta, true, DecoyType.None, false,
                                                                               ProteinDbLoader.EnsemblAccessionRegex, ProteinDbLoader.EnsemblFullNameRegex, ProteinDbLoader.EnsemblFullNameRegex, ProteinDbLoader.EnsemblGeneNameRegex, null, out List<string> _);
            Dictionary<string, string> accSeq = geneBasedProteins.ToDictionary(p => p.Accession, p => p.BaseSequence);

            // The reported time covers download, loading and translation, but not the comparison below.
            stopwatch.Stop();
            TimeSpan elapsed = stopwatch.Elapsed;

            // Accessions absent from the translated set are skipped; the scan stops at the first mismatch.
            bool allAreEqual = pepDotAll.All(p => !accSeq.TryGetValue(p.Accession, out string seq) || p.BaseSequence == seq);

            Console.WriteLine("Finished checking that all proteins are the same.");
            // TotalMinutes keeps whole hours in the figure; the Minutes component alone wraps at 60.
            Console.WriteLine("Time elapsed: " + ((int)elapsed.TotalMinutes).ToString() + " minutes and " + elapsed.Seconds.ToString() + " seconds.");
            Console.WriteLine("Result: all proteins are " + (allAreEqual ? "" : "not ") + "equal ");
            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();

            // Remove every file in the current directory whose name starts with a reference's base name.
            foreach (string reference in new[] { genomeFasta, geneModelFile, proteinFasta })
            {
                foreach (var file in Directory.GetFiles(Environment.CurrentDirectory, Path.GetFileNameWithoutExtension(reference) + "*"))
                {
                    File.Delete(file);
                }
            }
        }
Exemple #22
0
        /// <summary>
        /// Generate sample specific protein database starting with fastq files
        /// </summary>
        /// <remarks>
        /// Runs these stages in order, each gated by the corresponding <c>Parameters</c> setting:
        /// prepare the Ensembl genome fasta; align reads (only when fastqs are provided);
        /// filter the reference gene model and convert it to BED12; build a merged gene model
        /// with CDS annotations (isoform analysis only); generate or download a SnpEff database;
        /// discover gene fusions; call variants; and finally pass the selected protein XMLs,
        /// the UniProt XML path and any fusion proteins to <c>TransferModifications</c>.
        /// The result is stored in <c>VariantAnnotatedProteinXmlDatabases</c>.
        /// </remarks>
        public void GenerateSampleSpecificProteinDatabases()
        {
            // Download references and align reads
            Downloads.PrepareEnsemblGenomeFasta(Parameters.AnalysisDirectory, Parameters.GenomeFasta);
            if (Parameters.Fastqs != null)
            {
                Alignment.Parameters = new AlignmentParameters
                {
                    SpritzDirectory          = Parameters.SpritzDirectory,
                    AnalysisDirectory        = Parameters.AnalysisDirectory,
                    Reference                = Parameters.Reference,
                    Threads                  = Parameters.Threads,
                    Fastqs                   = Parameters.Fastqs,
                    ExperimentType           = Parameters.ExperimentType,
                    StrandSpecific           = Parameters.StrandSpecific,
                    InferStrandSpecificity   = Parameters.InferStrandSpecificity,
                    OverwriteStarAlignment   = Parameters.OverwriteStarAlignment,
                    GenomeStarIndexDirectory = Parameters.GenomeStarIndexDirectory,
                    ReorderedFastaPath       = Downloads.ReorderedFastaPath,
                    GeneModelGtfOrGffPath    = Parameters.ReferenceGeneModelGtfOrGff,
                    UseReadSubset            = Parameters.UseReadSubset,
                    ReadSubset               = Parameters.ReadSubset
                };

                Alignment.PerformAlignment();
                Downloads.GetImportantProteinAccessions(Parameters.SpritzDirectory, Parameters.ProteinFastaPath);
            }

            EnsemblDownloadsWrapper.FilterGeneModel(Parameters.AnalysisDirectory, Parameters.ReferenceGeneModelGtfOrGff, Downloads.EnsemblGenome, out string filteredGeneModelForScalpel);
            string sortedBed12Path = BEDOPSWrapper.GffOrGtf2Bed12(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, filteredGeneModelForScalpel);
            GeneModel referenceGeneModel = new GeneModel(Downloads.EnsemblGenome, Parameters.ReferenceGeneModelGtfOrGff);

            // Default location next to the reference gene model; used if no fastqs are provided
            string referenceGeneModelProteinXml = Path.Combine(
                Path.GetDirectoryName(Parameters.ReferenceGeneModelGtfOrGff),
                Path.GetFileNameWithoutExtension(Parameters.ReferenceGeneModelGtfOrGff) + ".protein.xml");

            // Merge reference gene model and a new gene model (either specified or stringtie-generated)
            string newGeneModelPath           = Parameters.NewGeneModelGtfOrGff;
            string mergedGeneModelWithCdsPath = null;
            string mergedGeneModelProteinXml  = null;
            string reference                  = Parameters.Reference; // replaced below when a new SnpEff database is generated

            if (Parameters.DoTranscriptIsoformAnalysis)
            {
                StringtieWrapper stringtie = new StringtieWrapper();
                if (newGeneModelPath == null)
                {
                    // No gene model was supplied, so reconstruct transcripts from the sorted BAM files
                    stringtie.TranscriptReconstruction(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Threads, Parameters.ReferenceGeneModelGtfOrGff, Downloads.EnsemblGenome, Parameters.StrandSpecific, Parameters.InferStrandSpecificity, Alignment.SortedBamFiles, true);
                    newGeneModelPath = stringtie.FilteredMergedGtfPath;
                }
                else
                {
                    // A gene model was supplied: convert its first column, then merge it with the reference model
                    newGeneModelPath = EnsemblDownloadsWrapper.ConvertFirstColumnUCSC2Ensembl(Parameters.SpritzDirectory, Parameters.Reference, Parameters.NewGeneModelGtfOrGff);
                    string mergedGeneModelPath = Path.Combine(Path.GetDirectoryName(newGeneModelPath), Path.GetFileNameWithoutExtension(newGeneModelPath) + ".merged.gtf");
                    WrapperUtility.GenerateAndRunScript(
                        WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "MergeTranscriptModels.bash"),
                        StringtieWrapper.MergeTranscriptPredictions(Parameters.SpritzDirectory, Parameters.ReferenceGeneModelGtfOrGff, new List<string> { newGeneModelPath }, mergedGeneModelPath)).WaitForExit();
                    newGeneModelPath = mergedGeneModelPath;
                }

                // Determine CDS from start codons of reference gene model
                // In the future, we could also try ORF finding to expand this (e.g. https://github.com/TransDecoder/TransDecoder/wiki)
                GeneModel newGeneModel = new GeneModel(Downloads.EnsemblGenome, newGeneModelPath);
                newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);

                mergedGeneModelWithCdsPath = Path.Combine(Path.GetDirectoryName(newGeneModelPath), Path.GetFileNameWithoutExtension(newGeneModelPath) + ".withcds.gtf");
                newGeneModel.PrintToGTF(mergedGeneModelWithCdsPath);
            }

            // SnpEff databases or outputing protein XMLs from gene models
            if (Parameters.DoTranscriptIsoformAnalysis) // isoform analysis, so generate a new snpeff database
            {
                reference = SnpEffWrapper.GenerateDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Downloads.ReorderedFastaPath, Parameters.ProteinFastaPath, mergedGeneModelWithCdsPath);

                if (Parameters.Fastqs == null || Parameters.SkipVariantAnalysis) // isoform analysis without variant analysis, so generate a protein database directly from merged gtf
                {
                    mergedGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, reference, mergedGeneModelWithCdsPath);
                }
            }
            else // no isoform analysis
            {
                new SnpEffWrapper(1).DownloadSnpEffDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference);
                if (Parameters.Fastqs == null) // no isoform analysis and no fastqs
                {
                    referenceGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference, Parameters.ReferenceGeneModelGtfOrGff);
                }
            }

            // Gene Fusion Discovery
            List<Protein> fusionProteins = new List<Protein>();

            if (Parameters.DoFusionAnalysis)
            {
                Fusion.Parameters.SpritzDirectory   = Parameters.SpritzDirectory;
                Fusion.Parameters.AnalysisDirectory = Parameters.AnalysisDirectory;
                Fusion.Parameters.Reference         = Parameters.Reference;
                Fusion.Parameters.Threads           = Parameters.Threads;
                Fusion.Parameters.Fastqs            = Parameters.Fastqs;
                Fusion.DiscoverGeneFusions();
                fusionProteins = Fusion.FusionProteins;
            }

            // Variant Calling
            if (Parameters.Fastqs != null && !Parameters.SkipVariantAnalysis)
            {
                VariantCalling.CallVariants(
                    Parameters.SpritzDirectory,
                    Parameters.AnalysisDirectory,
                    Parameters.ExperimentType,
                    reference,
                    Parameters.Threads,
                    sortedBed12Path,
                    Parameters.EnsemblKnownSitesPath,
                    Alignment.DedupedBamFiles,
                    Downloads.ReorderedFastaPath,
                    Downloads.EnsemblGenome,
                    Parameters.QuickSnpEffWithoutStats,
                    Parameters.IndelFinder,
                    Parameters.VariantCallingWorkers);
            }

            // Transfer features from UniProt
            List<string> xmlsToUse;

            if (VariantCalling.CombinedAnnotatedProteinXmlPaths.Count > 0)
            {
                xmlsToUse = VariantCalling.CombinedAnnotatedProteinXmlPaths;
            }
            // keep, since it might be useful for making a final database: .Concat(new[] { VariantCalling.CombinedAnnotatedProteinXmlPath }).ToList()
            else
            {
                // NOTE(review): mergedGeneModelProteinXml is still null when isoform analysis ran with
                // fastqs and variant analysis enabled; if variant calling produced no XML paths, the
                // list below then holds a null entry -- confirm TransferModifications tolerates that.
                xmlsToUse = new List<string>
                {
                    Parameters.DoTranscriptIsoformAnalysis ? mergedGeneModelProteinXml : referenceGeneModelProteinXml
                };
            }

            VariantAnnotatedProteinXmlDatabases = new TransferModificationsFlow().TransferModifications(Parameters.SpritzDirectory, Parameters.UniProtXmlPath, xmlsToUse, fusionProteins);
        }