/// <summary>
/// Loads a reference model (sample_gff.gff3) and an alternate model (sample_pacbio.gtf),
/// creates CDS entries on the alternate model from the reference's annotated start codons,
/// and checks the accession and sequence of the first two translated proteins.
/// </summary>
public void GffAppliedToOther()
{
    const string expectedForwardSequence =
        "MVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRKVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVTWGIGFLHSVSQLAFAVHLLFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQHRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPIIYTLRNKDMKTAIRQLRKWDAHSSVKF";
    const string expectedReverseSequence =
        "TSLWTPQAKLPTFQQLLHTQLLPPSGLFRPSSCFTRAFPGPTFVSWQPSLARFLPVSQQP" +
        "RQAQVLPHTGLSTSSLCLTVASPRPTPVPGHHLRAQNLLKSDSLVPTAASWWPMKAQNLL" +
        "KLTCPGPAPASCQRLQAQPLPHGGFSRPTSSSWLGLQAQLLPHNSLFWPSSCPANGGQCR" +
        "PKTSSSQTLQAHLLLPGGINRPSFDLRTASAGPALASQGLFPGPALASWQLPQAKFLPAC" +
        "QQPQQAQLLPHSGPFRPNL";

    string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData");
    var reference = new GeneModel(genome, Path.Combine(testDataDirectory, "sample_gff.gff3"));
    var alternate = new GeneModel(genome, Path.Combine(testDataDirectory, "sample_pacbio.gtf"));
    alternate.CreateCDSFromAnnotatedStartCodons(reference);

    List<Protein> translated =
        (from gene in alternate.Genes
         from protein in gene.Translate(true)
         select protein).ToList();

    // Forward strand, single coding region
    Protein forward = translated[0];
    Assert.AreEqual("PB2015.1.1", forward.Accession);
    Assert.AreEqual(expectedForwardSequence, forward.BaseSequence);

    // Reverse strand, single coding region
    Protein reverse = translated[1];
    Assert.AreEqual("PB2015.2.1", reverse.Accession);
    Assert.AreEqual(expectedReverseSequence, reverse.BaseSequence);
}
/// <summary>
/// Loads sample_gtf.gtf, checks that it yields 165 transcripts in total,
/// and then translates every gene.
/// </summary>
public void GtfBasics()
{
    string gtfPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData", "sample_gtf.gtf");
    var model = new GeneModel(genome, gtfPath);

    int transcriptCount =
        (from gene in model.Genes
         from transcript in gene.Transcripts
         select transcript).Count();
    Assert.AreEqual(165, transcriptCount);

    // The result is not inspected; materializing the list forces every translation to run.
    List<Protein> translated =
        (from gene in model.Genes
         from protein in gene.Translate(true)
         select protein).ToList();
}
/// <summary>
/// Translates the gene model in HardReverseStrand/reverse.gff3 against chromosome 14 and checks
/// both the retrieved coding sequence (against codingSeq.fa) and the resulting protein sequence.
/// </summary>
public void TranslateHardReverseStrand()
{
    const string expectedProteinSequence =
        "MNLQAQPKAQNKRKRCLFGGQEPAPKEQPPPLQPPQQSIRVKEEQYLGHEGPGGAVSTSQ" +
        "PVELPPPSSLALLNSVVYGPERTSAAMLSQQVASVKWPNSVMAPGRGPERGGGGGVSDSS" +
        "WQQQPGQPPPHSTWNCHSLSLYSATKGSPHPGVGVPTYYNHPEALKREKAGGPQLDRYVR" +
        "PMMPQKVQLEVGRPQAPLNSFHAAKKPPNQSLPLQPFQLAFGHQVNRQVFRQGPPPPNPV" +
        "AAFPPQKQQQQQQPQQQQQQQQAALPQMPLFENFYSMPQQPSQQPQDFGLQPAGPLGQSH" +
        "LAHHSMAPYPFPPNPDMNPELRKALLQDSAPQPALPQVQIPFPRRSRRLSKEGILPPSAL" +
        "DGAGTQPGQEATGNLFLHHWPLQQPPPGSLGQPHPEALGFPLELRESQLLPDGERLAPNG" +
        "REREAPAMGSEEGMRAVSTGDCGQVLRGGVIQSTRRRRRASQEANLLTLAQKAVELASLQ" +
        "NAKDGSGSEEKRKSVLASTTKCGVEFSEPSLATKRAREDSGMVPLIIPVSVPVRTVDPTE" +
        "AAQAGGLDEDGKGPEQNPAEHKPSVIVTRRRSTRIPGTDAQAQAEDMNVKLEGEPSVRKP" +
        "KQRPRPEPLIIPTKAGTFIAPPVYSNITPYQSHLRSPVRLADHPSERSFELPPYTPPPIL" +
        "SPVREGSGLYFNAIISTSTIPAPPPITPKSAHRTLLRTNSAEVTPPVLSVMGEATPVSIE" +
        "PRINVGSRFQAEIPLMRDRALAAADPHKADLVWQPWEDLESSREKQRQVEDLLTAACSSI" +
        "FPGAGTNQELALHCLHESRGDILETLNKLLLKKPLRPHNHPLATYHYTGSDQWKMAERKL" +
        "FNKGIAIYKKDFFLVQKLIQTKTVAQCVEFYYTYKKQVKIGRNGTLTFGDVDTSDEKSAQ" +
        "EEVEVDIKTSQKFPRVPLPRRESPSEERLEPKREVKEPRKEGEEEVPEIQEKEEQEEGRE" +
        "RSRRAAAVKATQTLQANESASDILILRSHESNAPGSAGGQASEKPREGTGKSRRALPFSE" +
        "KKKKTETFSKTQNQENTFPCKKCGR";

    string testDirectory = TestContext.CurrentContext.TestDirectory;
    string fixtureDirectory = Path.Combine(testDirectory, "HardReverseStrand");

    var chromosome14 = new Genome(Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.14.fa"));
    var model = new GeneModel(chromosome14, Path.Combine(fixtureDirectory, "reverse.gff3"));
    List<Protein> translated = model.Translate(true).ToList();

    // The expected coding sequence is the first record of the fixture FASTA.
    ISequence expectedCoding = new FastAParser().Parse(Path.Combine(fixtureDirectory, "codingSeq.fa")).First();
    ISequence actualCoding = model.Genes[0].Transcripts[0].RetrieveCodingSequence();

    Assert.AreEqual(
        SequenceExtensions.ConvertToString(expectedCoding),
        SequenceExtensions.ConvertToString(actualCoding));
    Assert.AreEqual(expectedProteinSequence, translated[0].BaseSequence);
}
/// <summary>
/// Replaces this object's DNA with a new DNA built gene-by-gene from the current DNA and
/// <paramref name="otherDna"/>. For each gene position, one allele is picked (via SplitTheGene)
/// from this object's gene and one from the other's; the new gene keeps this object's GeneSet.
/// Afterwards SumAttributes is called and DNAUpdated is raised.
/// </summary>
/// <param name="otherDna">DNA to combine with; must have the same number of genes as the current DNA.</param>
/// <exception cref="InvalidOperationException">No DNA has been set on this object yet.</exception>
/// <exception cref="ArgumentNullException"><paramref name="otherDna"/> is null.</exception>
/// <exception cref="ArgumentException">The two DNAs do not have the same number of genes.</exception>
public void SmashDNA(DnaModel otherDna)
{
    if (_myDna == null)
    {
        throw new InvalidOperationException("WHY YOU GOT NO DNA FOO!? P.S. You need to have SetDNA before you call this method.");
    }
    if (otherDna == null)
    {
        throw new ArgumentNullException("otherDna");
    }
    // Genes are paired by index, so both parents must carry the same number of genes.
    if (_myDna.GeneList.Count != otherDna.GeneList.Count)
    {
        throw new ArgumentException("Number of Genes must match, will later add exception handling since we may want to change this value often as we add new genes.", "otherDna");
    }

    var newDna = new DnaModel();
    for (int i = 0; i < _myDna.GeneList.Count; i++)
    {
        var myGene = _myDna.GeneList[i];
        var theirGene = otherDna.GeneList[i];

        // One allele from each parent.
        var newGene = new GeneModel();
        newGene.AlleleList.Add(SplitTheGene(myGene));
        newGene.AlleleList.Add(SplitTheGene(theirGene));
        newGene.GeneSet = myGene.GeneSet;
        newDna.GeneList.Add(newGene);
    }

    _myDna = newDna;
    SumAttributes();

    // Copy the delegate first so a concurrent unsubscribe cannot null it between check and call.
    var handler = DNAUpdated;
    if (handler != null)
    {
        handler.Invoke(this, EventArgs.Empty);
    }
}
/// <summary>
/// Picks one allele at random from <paramref name="gene"/>'s allele list.
/// </summary>
/// <param name="gene">Gene to pick from; its AlleleList must contain at least one allele.</param>
/// <returns>A randomly selected element of the gene's AlleleList.</returns>
/// <exception cref="InvalidOperationException">The gene has no alleles.</exception>
AlleleModel SplitTheGene(GeneModel gene)
{
    //To Implement: Awesome cracking sounds.
    int alleleCount = gene.AlleleList.Count;
    if (alleleCount == 0)
    {
        throw new InvalidOperationException("Cannot split a gene that has no alleles.");
    }

    // The int overload of Random.Range excludes the upper bound, so this is always a valid
    // index. Unlike rounding Random.value to 0 or 1, it works for any allele count.
    int pick = UnityEngine.Random.Range(0, alleleCount);
    return gene.AlleleList[pick];
}
/// <summary>
/// Applies the single variant in chr_1_one_heterozygous_synonymous.vcf to a one-transcript
/// gene model and checks that the protein sequence is unchanged while the protein name (and
/// the first line of the written FASTA) carries the expected annotation text.
/// </summary>
public void OneTranscriptOneHeterozygousSynonymous()
{
    string testDirectory = TestContext.CurrentContext.TestDirectory;
    string silentLabel = FunctionalClass.SILENT.ToString();
    const string variantLocation = "1:69666";

    var genome = new Genome(Path.Combine(testDirectory, "TestData", "chr1_sample.fa"));
    var vcf = new VCFParser(Path.Combine(testDirectory, "TestVcfs", "chr_1_one_heterozygous_synonymous.vcf"));
    List<Variant> variants = (from record in vcf select new Variant(null, record, genome)).ToList();
    Assert.AreEqual(1, variants.Count);

    var geneModel = new GeneModel(genome, Path.Combine(testDirectory, "TestData", "chr1_one_transcript.gtf"));
    List<Protein> referenceProteins = geneModel.Translate(true).ToList();
    List<Transcript> variantTranscripts = geneModel.ApplyVariants(variants);
    List<Protein> variantProteins = variantTranscripts.Select(transcript => transcript.Protein()).ToList();

    Assert.AreEqual(1, geneModel.Genes.Count);
    Assert.AreEqual(1, variantProteins.Count);
    Assert.AreEqual(1, referenceProteins.Count);

    // A set of the two sequences has one element exactly when they are identical.
    var distinctSequences = new HashSet<string>
    {
        variantProteins[0].BaseSequence,
        referenceProteins[0].BaseSequence
    };
    Assert.AreEqual(1, distinctSequences.Count);

    bool AnyFullNameContains(string text) => variantProteins.Any(p => p.FullName.Contains(text));
    Assert.IsTrue(AnyFullNameContains(silentLabel)); // synonymous
    Assert.IsTrue(AnyFullNameContains(GenotypeType.HETEROZYGOUS.ToString())); // heterozygous
    Assert.IsTrue(AnyFullNameContains(variantLocation));

    string proteinFasta = Path.Combine(testDirectory, "TestVcfs", "chr_1_one_heterozygous_synonymous.fasta");
    ProteinDbWriter.WriteFastaDatabase(variantProteins, proteinFasta, " ");
    string[] fastaLines = File.ReadLines(proteinFasta).ToArray();
    string headerLine = fastaLines[0];
    Assert.IsTrue(headerLine.Contains(silentLabel)); // synonymous
    Assert.IsTrue(headerLine.Contains(variantLocation));
}
/// <summary>
/// Applies the variants in chr19problematic.vcf to the gene model in problematicChr19Gene.gff3.
/// There are no assertions; the call only has to complete.
/// </summary>
public void ProblematicChr19Gene()
{
    string testDirectory = TestContext.CurrentContext.TestDirectory;
    string fixtureDirectory = Path.Combine(testDirectory, "ProblematicChr19");

    var chromosome19 = new Genome(Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.19.fa"));
    var model = new GeneModel(chromosome19, Path.Combine(fixtureDirectory, "problematicChr19Gene.gff3"));

    // Every VCF record is placed on the first chromosome of the loaded genome.
    var variants = new VCFParser(Path.Combine(fixtureDirectory, "chr19problematic.vcf"))
        .Select(record => new Variant(null, record, chromosome19.Chromosomes[0]))
        .ToList();

    model.ApplyVariants(variants);
}
/// <summary>
/// Translates the single transcript in chr1_one_transcript_reverse.gtf and checks the
/// resulting protein sequence.
/// </summary>
public void TranslateReverseStrand()
{
    const string expectedSequence = "FFYFIIWSLTLLPRAGLELLTSSDPPASASQSVGITGVSHHAQ";
    string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData");

    var chr1Sample = new Genome(Path.Combine(testDataDirectory, "chr1_sample.fa"));
    var model = new GeneModel(chr1Sample, Path.Combine(testDataDirectory, "chr1_one_transcript_reverse.gtf"));
    List<Protein> translated = model.Translate(true).ToList();

    Assert.AreEqual(expectedSequence, translated[0].BaseSequence);
}
/// <summary>
/// Applies the single-base substitutions from chr19problematic.vcf to the gene model in
/// chr19variantTranscript.gff3 and builds a protein for every resulting transcript.
/// There are no assertions; the calls only have to complete.
/// </summary>
public void Chr19VariantTranscript()
{
    string testDirectory = TestContext.CurrentContext.TestDirectory;
    string fixtureDirectory = Path.Combine(testDirectory, "ProblematicChr19");

    var chromosome19 = new Genome(Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.19.fa"));
    var model = new GeneModel(chromosome19, Path.Combine(fixtureDirectory, "chr19variantTranscript.gff3"));

    // Keep only variants whose reference and second allele strings are both one character long.
    var singleBaseVariants =
        (from record in new VCFParser(Path.Combine(fixtureDirectory, "chr19problematic.vcf"))
         select new Variant(null, record, chromosome19.Chromosomes[0]) into variant
         where variant.SecondAlleleString.Length == 1 && variant.ReferenceAlleleString.Length == 1
         select variant).ToList();

    List<Transcript> variantTranscripts = model.ApplyVariants(singleBaseVariants).ToList();
    List<Protein> variantProteins = variantTranscripts.Select(transcript => transcript.Protein(null)).ToList();
}
/// <summary>
/// Creates CDS entries on the sample_pacbio.gtf model from the start codons annotated in
/// sample_gff.gff3 and writes the result to sample_pacbio_merged.gtf in the TestData folder.
/// </summary>
public void OutputGtfFromGeneModel()
{
    string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData");

    var reference = new GeneModel(genome, Path.Combine(testDataDirectory, "sample_gff.gff3"));
    var alternate = new GeneModel(genome, Path.Combine(testDataDirectory, "sample_pacbio.gtf"));

    alternate.CreateCDSFromAnnotatedStartCodons(reference);
    alternate.PrintToGTF(Path.Combine(testDataDirectory, "sample_pacbio_merged.gtf"));
}
/// <summary>
/// Translates the single transcript in chr7_one_transcript_reverse.gtf against chromosome 7
/// and checks the resulting protein sequence.
/// </summary>
public void TranslateAnotherReverseStrand()
{
    // See http://useast.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000233306;r=7:38362864-38363518;t=ENST00000426402
    const string expectedSequence = "MQWALAVLLAFLSPASQKSSNLEGRTKSVIRQTGSSAEITCDLAEGSNGYIHWYLHQEGKAPQRLQYYDSYNSKVVLESGVSPGKYYTYASTRNNLRLILRNLIENDFGVYYCATWDG";
    string testDirectory = TestContext.CurrentContext.TestDirectory;

    var chromosome7 = new Genome(Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.7.fa"));
    var model = new GeneModel(chromosome7, Path.Combine(testDirectory, "TestData", "chr7_one_transcript_reverse.gtf"));
    List<Protein> translated = model.Translate(true).ToList();

    Assert.AreEqual(expectedSequence, translated[0].BaseSequence);
}
/// <summary>
/// Translates chr5_selenocysteineContaining.gff3 against chromosome 5, passing the two
/// collections returned by GetImportantProteinAccessions to the translation, and checks the
/// first protein's sequence (which contains 'U' residues).
/// </summary>
public void TranslateSelenocysteineContaining()
{
    const string expectedSequence = "MWRSLGLALALCLLPSGGTESQDQSSLCKQPPAWSIRDQDPMLNSNGSVTVVALLQASUYLCILQASKLEDLRVKLKKEGYSNISYIVVNHQGISSRLKYTHLKNKVSEHIPVYQQEENQTDVWTLLNGSKDDFLIYDRCGRLVYHLGLPFSFLTFPYVEEAIKIAYCEKKCGNCSLTTLKDEDFCKRVSLATVDKTVETPSPHYHHEHHHNHGHQHLGSSELSENQQPGAPNAPTHPAPPGLHHHHKHKGQHRQGHPENRDMPASEDLQDLQKKLCRKRCINQLLCKLPTDSELAPRSUCCHCRHLIFEKTGSAITUQCKENLPSLCSUQGLRAEENITESCQURLPPAAUQISQQLIPTEASASURUKNQAKKUEUPSN";
    string testDirectory = TestContext.CurrentContext.TestDirectory;

    var chromosome5 = new Genome(Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.5.fa"));
    var model = new GeneModel(chromosome5, Path.Combine(testDirectory, "TestData", "chr5_selenocysteineContaining.gff3"));

    // Only the second and third outputs are used below; the first is not needed by this test.
    GeneModel.GetImportantProteinAccessions(
        Path.Combine(testDirectory, "Homo_sapiens.GRCh38.pep.all.fa"),
        out Dictionary<string, string> proteinAccessionSequence,
        out HashSet<string> bad,
        out Dictionary<string, string> se);

    List<Protein> translated = model.Translate(true, bad, se).ToList();

    Assert.AreEqual(expectedSequence, translated[0].BaseSequence);
}
/// <summary>
/// Filters GTF or GFF entries that lack strand information and writes the remainder to
/// "&lt;input name&gt;.filtered.gtf" beside the input. Optionally also drops transcripts whose
/// StringTie FPKM or TPM estimate is missing, unparsable or not positive. Features are kept
/// or dropped according to the most recent "transcript" entry seen on the same sequence.
/// Finally, CDS entries are created from the reference model's annotated start codons and
/// the result is written to "&lt;input name&gt;.filtered.withcds.gtf".
/// </summary>
/// <param name="gtfPath">Path of the GTF/GFF file to filter.</param>
/// <param name="referenceGenomePath">Path of the genome used to load both gene models.</param>
/// <param name="referenceGeneModelPath">Path of the reference gene model that supplies start codons.</param>
/// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">
/// When true, a transcript is kept only if both its FPKM and TPM attributes parse to a value greater than zero.
/// </param>
public static void FilterGtfEntriesWithoutStrand(string gtfPath, string referenceGenomePath, string referenceGeneModelPath, bool filterEntriesWithZeroAbundanceStringtieEstimates = false)
{
    // Same number styles double.TryParse uses by default, but with an explicit invariant
    // culture so "1.5" is not misread on machines whose locale uses ',' as the decimal mark.
    const System.Globalization.NumberStyles abundanceStyles =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var chromFeatures = GeneModel.SimplerParse(gtfPath);
    string filteredGtfPath = Path.Combine(Path.GetDirectoryName(gtfPath), Path.GetFileNameWithoutExtension(gtfPath) + ".filtered.gtf");
    using (var file = File.Create(filteredGtfPath))
    {
        var formatter = new GffFormatter();
        foreach (var chromISeq in chromFeatures)
        {
            var filteredFeatures = new List<MetadataListItem<List<string>>>();

            // A missing or unexpectedly-typed "features" entry is treated as "no features".
            if (chromISeq.Metadata.TryGetValue("features", out object featuresObj)
                && featuresObj is List<MetadataListItem<List<string>>> features)
            {
                // Carries the verdict of the latest transcript over to the features that follow it.
                bool okayTranscript = false;
                foreach (var feature in features)
                {
                    if (!feature.SubItems.ContainsKey("strand"))
                    {
                        continue;
                    }

                    if (feature.Key == "transcript")
                    {
                        // Attributes are only needed for the abundance check on transcripts.
                        var attributes = GeneModel.SplitAttributes(feature.FreeText);
                        bool okayFpkm = !filterEntriesWithZeroAbundanceStringtieEstimates
                            || (attributes.TryGetValue("FPKM", out string fpkm)
                                && double.TryParse(fpkm, abundanceStyles, invariant, out double fpkmValue)
                                && fpkmValue > 0);
                        bool okayTpm = !filterEntriesWithZeroAbundanceStringtieEstimates
                            || (attributes.TryGetValue("TPM", out string tpm)
                                && double.TryParse(tpm, abundanceStyles, invariant, out double tpmValue)
                                && tpmValue > 0);
                        okayTranscript = okayFpkm && okayTpm;
                    }

                    if (okayTranscript)
                    {
                        filteredFeatures.Add(feature);
                    }
                }
            }
            chromISeq.Metadata["features"] = filteredFeatures;
        }
        formatter.Format(file, chromFeatures);
    }

    // Annotate coding regions on the filtered model using the reference model's start codons.
    Genome ensemblGenome = new Genome(referenceGenomePath);
    GeneModel newGeneModel = new GeneModel(ensemblGenome, filteredGtfPath);
    GeneModel referenceGeneModel = new GeneModel(ensemblGenome, referenceGeneModelPath);
    newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);
    string filteredGtfWithCdsPath = Path.Combine(Path.GetDirectoryName(filteredGtfPath), Path.GetFileNameWithoutExtension(filteredGtfPath) + ".withcds.gtf");
    newGeneModel.PrintToGTF(filteredGtfWithCdsPath);
}
/// <summary>
/// Builds a new DNA with one gene per entry of CompleteGenome.TheGenome. Each gene gets two
/// alleles drawn independently at random from that entry's allAlleles list, and the entry
/// itself as its GeneSet.
/// </summary>
/// <returns>The newly created DNA.</returns>
private DnaModel GenerateDefaultDNA()
{
    var dna = new DnaModel();

    foreach (var geneSet in CompleteGenome.TheGenome)
    {
        int alleleCount = geneSet.allAlleles.Count;

        // Two independent draws, so both alleles may turn out to be the same one.
        int firstPick = UnityEngine.Random.Range(0, alleleCount);
        int secondPick = UnityEngine.Random.Range(0, alleleCount);

        var gene = new GeneModel();
        gene.AlleleList.Add(geneSet.allAlleles[firstPick]);
        gene.AlleleList.Add(geneSet.allAlleles[secondPick]);
        gene.GeneSet = geneSet;

        dna.GeneList.Add(gene);
    }

    return dna;
}
/// <summary>
/// Translates two single-transcript gene models against the MT chromosome and checks the
/// first protein's sequence for each.
/// </summary>
public void TranslateMTSeq()
{
    const string expectedFirst =
        "MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLLQPFADAMKLFTKEPLKPATSTITLYITAPTLALTIALLLWTPLPMPNPLVNLNLGLLFILATSSLAVYSILWSGWASNSNYALIGALRAVAQTISYE" +
        "VTLAIILLSTLLMSGSFNLSTLITTQEHLWLLLPSWPLAMMWFISTLAETNRTPFDLAEGESELVSGFNIEYAAGPFALFFMAEYTNIIMMNTLTTTIFLGTTYDALSPELYTTYFVTKTLLLTSLFLWIRTAYPRFRYDQLMHLLWKNFLPLTLALLMWYVSMPITISSIPPQT";
    const string expectedSecond =
        "MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVLTKKMNPRSTEAAIKYFLTQATASMILLMAILFNNMLSGQWTMTNTTNQYSSLMIMMAMAMKLGMAPFHFWVPEVTQGTPLTSGLLLLTWQKLAPISIMYQISPS" +
        "LNVSLLLTLSILSIMAGSWGGLNQTQLRKILAYSSITHMGWMMAVLPYNPNMTILNLTIYIILTTTAFLLLNLNSSTTTLLLSRTWNKLTWLTPLIPSTLLSLGGLPPLTGFLPKWAIIEEFTKNNSLIIPTIMATITLLNLYFYLRLIYSTSITLLPMSNNVKM" +
        "KWQFEHTKPTPFLPTLIALTTLLLPISPFMLMIL";

    string testDirectory = TestContext.CurrentContext.TestDirectory;
    var mitochondrion = new Genome(Path.Combine(testDirectory, "Homo_sapiens.GRCh38.dna.chromosome.MT.fa"));

    // Loads the named TestData gene model and returns the sequence of its first translated protein.
    string FirstTranslatedSequence(string gtfFileName)
    {
        var model = new GeneModel(mitochondrion, Path.Combine(testDirectory, "TestData", gtfFileName));
        List<Protein> translated = model.Translate(true).ToList();
        return translated[0].BaseSequence;
    }

    Assert.AreEqual(expectedFirst, FirstTranslatedSequence("chrM_one_transcript_reverse.gtf"));
    Assert.AreEqual(expectedSecond, FirstTranslatedSequence("chrM_one_transcript_reverse2.gtf"));
}
/// <summary>
/// Filters GTF or GFF entries that lack strand information and writes the remainder to
/// <paramref name="gtfOutPath"/> (an existing file is overwritten). Optionally also drops
/// transcripts whose StringTie FPKM or TPM estimate is missing, unparsable or not positive.
/// Features are kept or dropped according to the most recent "transcript" entry seen on the
/// same sequence.
/// </summary>
/// <param name="gtfPath">Path of the GTF/GFF file to filter.</param>
/// <param name="gtfOutPath">Path the filtered file is written to.</param>
/// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">
/// When true, a transcript is kept only if both its FPKM and TPM attributes parse to a value greater than zero.
/// </param>
public void FilterGtfEntriesWithoutStrand(string gtfPath, string gtfOutPath, bool filterEntriesWithZeroAbundanceStringtieEstimates)
{
    // Same number styles double.TryParse uses by default, but with an explicit invariant
    // culture so "1.5" is not misread on machines whose locale uses ',' as the decimal mark.
    const System.Globalization.NumberStyles abundanceStyles =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var chromFeatures = GeneModel.SimplerParse(gtfPath);
    using (var file = File.Create(gtfOutPath))
    {
        var formatter = new GffFormatter();
        foreach (var chromISeq in chromFeatures)
        {
            var filteredFeatures = new List<MetadataListItem<List<string>>>();

            // A missing or unexpectedly-typed "features" entry is treated as "no features".
            if (chromISeq.Metadata.TryGetValue("features", out object featuresObj)
                && featuresObj is List<MetadataListItem<List<string>>> features)
            {
                // Carries the verdict of the latest transcript over to the features that follow it.
                bool okayTranscript = false;
                foreach (var feature in features)
                {
                    if (!feature.SubItems.ContainsKey("strand"))
                    {
                        continue;
                    }

                    if (feature.Key == "transcript")
                    {
                        // Attributes are only needed for the abundance check on transcripts.
                        var attributes = GeneModel.SplitAttributes(feature.FreeText);
                        bool okayFpkm = !filterEntriesWithZeroAbundanceStringtieEstimates
                            || (attributes.TryGetValue("FPKM", out string fpkm)
                                && double.TryParse(fpkm, abundanceStyles, invariant, out double fpkmValue)
                                && fpkmValue > 0);
                        bool okayTpm = !filterEntriesWithZeroAbundanceStringtieEstimates
                            || (attributes.TryGetValue("TPM", out string tpm)
                                && double.TryParse(tpm, abundanceStyles, invariant, out double tpmValue)
                                && tpmValue > 0);
                        okayTranscript = okayFpkm && okayTpm;
                    }

                    if (okayTranscript)
                    {
                        filteredFeatures.Add(feature);
                    }
                }
            }
            chromISeq.Metadata["features"] = filteredFeatures;
        }
        formatter.Format(file, chromFeatures);
    }
}
/// <summary>
/// Builds a one-codon transcript ("AAA"), applies the variant from ugh.vcf, and checks that the
/// variant transcript's exon reads "AGA", translates to "R" instead of "K", and carries a
/// missense annotation.
/// </summary>
public void TestMissenseMutation()
{
    const string strand = "+";
    const int start = 1;
    const int end = 3;

    // Make a transcript
    var codon = new Sequence(Alphabets.DNA, "AAA".Select(cc => (byte)cc).ToArray(), false)
    {
        ID = "1"
    };
    var chromosome = new Chromosome(codon, null);
    var gene = new Gene("", chromosome, "", strand, start, end, null);
    var transcript = new Transcript("", gene, "", strand, start, end, "", null, null);
    var exon = new Exon(transcript, codon, "", start, end, codon.ID, strand, null, null);
    transcript.Exons = new List<Exon> { exon };
    var codingRegion = new CDS(transcript, codon.ID, "", strand, start, end, null, 0);
    transcript.CodingDomainSequences = new List<CDS> { codingRegion };

    // Make a missense mutation
    // ugh.vcf has a homozygous variation that should change the codon from AAA to AGA, which code for K and R
    // # CHROM  POS  ID  REF  ALT  QUAL   FILTER  INFO  FORMAT          sample
    // 1        2    .   A    G    64.77  .       info  GT:AD:DP:GQ:PL  1/1:2,3:5:69:93,0,69
    string vcfPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestVcfs", "ugh.vcf");
    List<Variant> variants =
        (from record in new VCFParser(vcfPath)
         select new Variant(null, record, new Chromosome(codon, null))).ToList();

    // Make sure it makes it into the DNA sequence
    transcript.Variants = new HashSet<Variant>(variants);
    List<Transcript> variantTranscripts = GeneModel.ApplyVariantsCombinitorially(transcript);

    // The original transcript must be left untouched.
    Assert.AreEqual("AAA", SequenceExtensions.ConvertToString(transcript.Exons[0].Sequence));
    Assert.AreEqual("K", transcript.Protein().BaseSequence);

    Transcript mutated = variantTranscripts[0];
    Assert.AreEqual("AGA", SequenceExtensions.ConvertToString(mutated.Exons[0].Sequence));
    Assert.AreEqual("R", mutated.Protein().BaseSequence);

    // Make sure it gets annotated as a missense mutation
    string missenseLabel = FunctionalClass.MISSENSE.ToString();
    Assert.IsTrue(mutated.VariantAnnotations.Any(annotation => annotation.Contains(missenseLabel)));
}
/// <summary>
/// Check that annotations from ensembl make it in from a merged gene model.
/// Loads the reference and merged StringTie models from fixed local paths, creates CDS entries
/// on the merged model from the reference's annotated start codons, writes the result as GTF,
/// then prints timing and transcript counts and waits for a key press.
/// </summary>
private static void StringtieAndEnsembl202122CDS()
{
    var stopwatch = Stopwatch.StartNew();

    string referenceGff = @"E:\source\repos\Spritz\Test\bin\Debug\TestData\202122.gtf";
    string alternateGff = @"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-806392539.filtered.gtf";

    var genome = new Genome(@"E:\source\repos\Spritz\Test\bin\Debug\TestData\202122.karyotypic.fa");
    var reference = new GeneModel(genome, referenceGff);
    var alternate = new GeneModel(genome, alternateGff);
    alternate.CreateCDSFromAnnotatedStartCodons(reference);
    alternate.PrintToGTF(@"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-806392539.filtered.withcds.gtf");

    stopwatch.Stop();
    var elapsed = stopwatch.Elapsed;

    Console.WriteLine("Finished checking that all proteins are the same.");
    Console.WriteLine($"Time elapsed: {elapsed.Minutes} minutes and {elapsed.Seconds} seconds.");
    Console.WriteLine($"Result: there are {alternate.Genes.Sum(g => g.Transcripts.Count)} transcript isoforms in {alternateGff}");
    Console.WriteLine($"Result: {alternate.Genes.Sum(g => g.Transcripts.Count(t => t.IsProteinCoding()))} of those transcript isoforms are new annotated as protein coding");
    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();
}
/// <summary>
/// Annotates PacBio transcript model for MCF7 with start codons in the reference model.
/// Reads both models from fixed local paths, writes the CDS-annotated model, then prints
/// timing and transcript counts and waits for a key press.
/// </summary>
private static void PacBioCds()
{
    var stopwatch = Stopwatch.StartNew();

    string referenceGff = @"E:\ProjectsActive\MCF7PacBio\Homo_sapiens.GRCh37.73.gtf";
    string alternateGff = @"E:\ProjectsActive\MCF7PacBio\IsoSeq_MCF72015edition_polished.unimapped.ensembl.unimapped.gff";

    var genome = new Genome(@"E:\ProjectsActive\MCF7PacBio\Homo_sapiens.GRCh37.73.dna.primary_assembly.fa");
    var reference = new GeneModel(genome, referenceGff);
    var pacBio = new GeneModel(genome, alternateGff);
    pacBio.CreateCDSFromAnnotatedStartCodons(reference);
    pacBio.PrintToGTF(@"E:\ProjectsActive\MCF7PacBio\CDSAnnotated_IsoSeq_MCF7_2015edition_polished.unimapped.gff");

    stopwatch.Stop();
    var elapsed = stopwatch.Elapsed;

    Console.WriteLine("Finished checking that all proteins are the same.");
    Console.WriteLine($"Time elapsed: {elapsed.Minutes} minutes and {elapsed.Seconds} seconds.");
    Console.WriteLine($"Result: there are {pacBio.Genes.Sum(g => g.Transcripts.Count)} PacBio transcript isoforms");
    Console.WriteLine($"Result: {pacBio.Genes.Sum(g => g.Transcripts.Count(t => t.IsProteinCoding()))} PacBio transcript isoforms are new annotated as protein coding");
    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();
}
/// <summary>
/// Check that annotations from ensembl make it in from a merged gene model.
/// Full-genome variant: loads the reference and merged StringTie models from fixed local
/// paths, creates CDS entries on the merged model from the reference's annotated start codons,
/// writes the result as GTF, then prints timing and transcript counts and waits for a key press.
/// </summary>
private static void StringtieAndEnsemblFullCDS()
{
    var stopwatch = Stopwatch.StartNew();

    string referenceGff = @"E:\source\repos\Spritz\Test\bin\Debug\TestData\Homo_sapiens.GRCh38.81.gtf";
    string alternateGff = @"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-1914802334.filtered.gtf";

    var genome = new Genome(@"E:\source\repos\Spritz\Test\bin\Debug\TestData\Homo_sapiens.GRCh38.dna.primary_assembly.fa");
    var reference = new GeneModel(genome, referenceGff);
    var alternate = new GeneModel(genome, alternateGff);

    // Debugging probe kept from the original: looks up a transcript with an empty ID.
    var transcriptWithEmptyId = alternate.Genes
        .SelectMany(gene => gene.Transcripts)
        .FirstOrDefault(transcript => transcript.ID == ""); // should be null

    alternate.CreateCDSFromAnnotatedStartCodons(reference);
    alternate.PrintToGTF(@"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-1914802334.filtered.withcds.gtf");

    stopwatch.Stop();
    var elapsed = stopwatch.Elapsed;

    Console.WriteLine("Finished checking that all proteins are the same.");
    Console.WriteLine($"Time elapsed: {elapsed.Minutes} minutes and {elapsed.Seconds} seconds.");
    Console.WriteLine($"Result: there are {alternate.Genes.Sum(g => g.Transcripts.Count)} transcript isoforms in {alternateGff}");
    Console.WriteLine($"Result: {alternate.Genes.Sum(g => g.Transcripts.Count(t => t.IsProteinCoding()))} of those transcript isoforms are new annotated as protein coding");
    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();
}
/// <summary>
/// Times and checks that all proteins in the pep.all.fasta protein fasta file are the same as are output by this library.
/// Downloads and decompresses the Ensembl release-81 genome, gene model and protein files
/// into the current directory when needed, compares sequences by accession, prints the
/// outcome, waits for a key press, and finally deletes the reference files.
/// </summary>
private static void SameProteins()
{
    var stopwatch = Stopwatch.StartNew();

    // download and decompress references
    const string genomeFasta = "Homo_sapiens.GRCh38.dna.primary_assembly.fa";
    const string geneModelFile = "Homo_sapiens.GRCh38.81.gff3";
    const string proteinFasta = "Homo_sapiens.GRCh38.pep.all.fa";
    string[] gunzippedFiles = { genomeFasta, geneModelFile, proteinFasta };

    bool IsMissingOrEmpty(string path) => !File.Exists(path) || new FileInfo(path).Length == 0;

    // If any decompressed file is absent or empty, all three archives are fetched again.
    if (gunzippedFiles.Any(IsMissingOrEmpty))
    {
        using (var client = new WebClient())
        {
            client.DownloadFile(@"ftp://ftp.ensembl.org/pub/release-81//fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz", genomeFasta + ".gz");
            client.DownloadFile(@"ftp://ftp.ensembl.org/pub/release-81/gff3/homo_sapiens/Homo_sapiens.GRCh38.81.gff3.gz", geneModelFile + ".gz");
            client.DownloadFile(@"ftp://ftp.ensembl.org/pub/release-81//fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz", proteinFasta + ".gz");
        }
    }

    // Only the files that are actually missing or empty get decompressed.
    foreach (string target in gunzippedFiles)
    {
        if (!IsMissingOrEmpty(target))
        {
            continue;
        }

        using (var compressed = new FileStream(target + ".gz", FileMode.Open))
        using (var gunzip = new GZipStream(compressed, CompressionMode.Decompress))
        using (var output = File.Create(target))
        {
            gunzip.CopyTo(output);
        }
    }

    GeneModel.GetImportantProteinAccessions(
        proteinFasta,
        out Dictionary<string, string> proteinAccessionSequence,
        out HashSet<string> bad,
        out Dictionary<string, string> se);

    var genome = new Genome(genomeFasta);
    var geneModel = new GeneModel(genome, geneModelFile);
    List<Protein> geneBasedProteins = geneModel.Translate(true, bad, se);
    List<Protein> pepDotAll = ProteinDbLoader.LoadProteinFasta(
        proteinFasta,
        true,
        DecoyType.None,
        false,
        ProteinDbLoader.EnsemblAccessionRegex,
        ProteinDbLoader.EnsemblFullNameRegex,
        ProteinDbLoader.EnsemblFullNameRegex,
        ProteinDbLoader.EnsemblGeneNameRegex,
        null,
        out List<string> errors);
    Dictionary<string, string> accSeq = geneBasedProteins.ToDictionary(p => p.Accession, p => p.BaseSequence);

    // The comparison below is not part of the measured time.
    stopwatch.Stop();

    // Accessions missing from the translated set are ignored; evaluation stops at the first mismatch.
    // now handled with the badAccessions // && !p.BaseSequence.Contains('*') && !seq.Contains('*') && !p.BaseSequence.Contains('X'))
    bool allAreEqual = pepDotAll.All(p => !accSeq.TryGetValue(p.Accession, out string seq) || p.BaseSequence == seq);

    var elapsed = stopwatch.Elapsed;
    Console.WriteLine("Finished checking that all proteins are the same.");
    Console.WriteLine($"Time elapsed: {elapsed.Minutes} minutes and {elapsed.Seconds} seconds.");
    Console.WriteLine($"Result: all proteins are {(allAreEqual ? "" : "not ")}equal ");
    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();

    // Remove each reference file together with anything sharing its name stem (e.g. the .gz archive).
    foreach (string referenceFile in gunzippedFiles)
    {
        string pattern = Path.GetFileNameWithoutExtension(referenceFile) + "*";
        foreach (string file in Directory.GetFiles(Environment.CurrentDirectory, pattern))
        {
            File.Delete(file);
        }
    }
}
/// <summary>
/// Generate sample specific protein database starting with fastq files.
/// Runs the stages in order: reference preparation and optional alignment, gene-model
/// filtering, optional transcript-isoform merging with CDS annotation, SnpEff database setup,
/// optional fusion discovery, optional variant calling, and finally modification transfer,
/// whose result is stored in VariantAnnotatedProteinXmlDatabases.
/// </summary>
public void GenerateSampleSpecificProteinDatabases()
{
    // Download references and align reads
    Downloads.PrepareEnsemblGenomeFasta(Parameters.AnalysisDirectory, Parameters.GenomeFasta);
    if (Parameters.Fastqs != null)
    {
        Alignment.Parameters = new AlignmentParameters
        {
            SpritzDirectory = Parameters.SpritzDirectory,
            AnalysisDirectory = Parameters.AnalysisDirectory,
            Reference = Parameters.Reference,
            Threads = Parameters.Threads,
            Fastqs = Parameters.Fastqs,
            ExperimentType = Parameters.ExperimentType,
            StrandSpecific = Parameters.StrandSpecific,
            InferStrandSpecificity = Parameters.InferStrandSpecificity,
            OverwriteStarAlignment = Parameters.OverwriteStarAlignment,
            GenomeStarIndexDirectory = Parameters.GenomeStarIndexDirectory,
            ReorderedFastaPath = Downloads.ReorderedFastaPath,
            GeneModelGtfOrGffPath = Parameters.ReferenceGeneModelGtfOrGff,
            UseReadSubset = Parameters.UseReadSubset,
            ReadSubset = Parameters.ReadSubset
        };
        Alignment.PerformAlignment();
        Downloads.GetImportantProteinAccessions(Parameters.SpritzDirectory, Parameters.ProteinFastaPath);
    }

    EnsemblDownloadsWrapper.FilterGeneModel(
        Parameters.AnalysisDirectory,
        Parameters.ReferenceGeneModelGtfOrGff,
        Downloads.EnsemblGenome,
        out string filteredGeneModelForScalpel);
    string sortedBed12Path = BEDOPSWrapper.GffOrGtf2Bed12(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, filteredGeneModelForScalpel);
    var referenceGeneModel = new GeneModel(Downloads.EnsemblGenome, Parameters.ReferenceGeneModelGtfOrGff);

    // used if no fastqs are provided
    string referenceGeneModelProteinXml = Path.Combine(
        Path.GetDirectoryName(Parameters.ReferenceGeneModelGtfOrGff),
        Path.GetFileNameWithoutExtension(Parameters.ReferenceGeneModelGtfOrGff) + ".protein.xml");

    // Merge reference gene model and a new gene model (either specified or stringtie-generated)
    string newGeneModelPath = Parameters.NewGeneModelGtfOrGff;
    string mergedGeneModelWithCdsPath = null;
    string mergedGeneModelProteinXml = null;
    string snpEffReference = Parameters.Reference;
    if (Parameters.DoTranscriptIsoformAnalysis)
    {
        var stringtie = new StringtieWrapper();
        if (newGeneModelPath != null)
        {
            // A gene model was supplied: convert its first column and merge it with the reference model.
            newGeneModelPath = EnsemblDownloadsWrapper.ConvertFirstColumnUCSC2Ensembl(Parameters.SpritzDirectory, Parameters.Reference, Parameters.NewGeneModelGtfOrGff);
            string mergedGeneModelPath = Path.Combine(
                Path.GetDirectoryName(newGeneModelPath),
                Path.GetFileNameWithoutExtension(newGeneModelPath) + ".merged.gtf");
            WrapperUtility.GenerateAndRunScript(
                WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "MergeTranscriptModels.bash"),
                StringtieWrapper.MergeTranscriptPredictions(
                    Parameters.SpritzDirectory,
                    Parameters.ReferenceGeneModelGtfOrGff,
                    new List<string> { newGeneModelPath },
                    mergedGeneModelPath))
                .WaitForExit();
            newGeneModelPath = mergedGeneModelPath;
        }
        else
        {
            // No gene model supplied: reconstruct transcripts from the sorted alignments.
            stringtie.TranscriptReconstruction(
                Parameters.SpritzDirectory,
                Parameters.AnalysisDirectory,
                Parameters.Threads,
                Parameters.ReferenceGeneModelGtfOrGff,
                Downloads.EnsemblGenome,
                Parameters.StrandSpecific,
                Parameters.InferStrandSpecificity,
                Alignment.SortedBamFiles,
                true);
            newGeneModelPath = stringtie.FilteredMergedGtfPath;
        }

        // Determine CDS from start codons of reference gene model
        // In the future, we could also try ORF finding to expand this (e.g. https://github.com/TransDecoder/TransDecoder/wiki)
        var newGeneModel = new GeneModel(Downloads.EnsemblGenome, newGeneModelPath);
        newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);
        mergedGeneModelWithCdsPath = Path.Combine(
            Path.GetDirectoryName(newGeneModelPath),
            Path.GetFileNameWithoutExtension(newGeneModelPath) + ".withcds.gtf");
        newGeneModel.PrintToGTF(mergedGeneModelWithCdsPath);
    }

    // SnpEff databases or outputing protein XMLs from gene models
    if (!Parameters.DoTranscriptIsoformAnalysis)
    {
        // no isoform analysis
        new SnpEffWrapper(1).DownloadSnpEffDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference);
        if (Parameters.Fastqs == null)
        {
            // no isoform analysis and no fastqs
            referenceGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(
                Parameters.SpritzDirectory,
                Parameters.AnalysisDirectory,
                Parameters.Reference,
                Parameters.ReferenceGeneModelGtfOrGff);
        }
    }
    else
    {
        // isoform analysis, so generate a new snpeff database
        snpEffReference = SnpEffWrapper.GenerateDatabase(
            Parameters.SpritzDirectory,
            Parameters.AnalysisDirectory,
            Downloads.ReorderedFastaPath,
            Parameters.ProteinFastaPath,
            mergedGeneModelWithCdsPath);
        if (Parameters.Fastqs == null || Parameters.SkipVariantAnalysis)
        {
            // isoform analysis without variant analysis, so generate a protein database directly from merged gtf
            mergedGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(
                Parameters.SpritzDirectory,
                Parameters.AnalysisDirectory,
                snpEffReference,
                mergedGeneModelWithCdsPath);
        }
    }

    // Gene Fusion Discovery
    var fusionProteins = new List<Protein>();
    if (Parameters.DoFusionAnalysis)
    {
        Fusion.Parameters.SpritzDirectory = Parameters.SpritzDirectory;
        Fusion.Parameters.AnalysisDirectory = Parameters.AnalysisDirectory;
        Fusion.Parameters.Reference = Parameters.Reference;
        Fusion.Parameters.Threads = Parameters.Threads;
        Fusion.Parameters.Fastqs = Parameters.Fastqs;
        Fusion.DiscoverGeneFusions();
        fusionProteins = Fusion.FusionProteins;
    }

    // Variant Calling
    if (Parameters.Fastqs != null && !Parameters.SkipVariantAnalysis)
    {
        VariantCalling.CallVariants(
            Parameters.SpritzDirectory,
            Parameters.AnalysisDirectory,
            Parameters.ExperimentType,
            snpEffReference,
            Parameters.Threads,
            sortedBed12Path,
            Parameters.EnsemblKnownSitesPath,
            Alignment.DedupedBamFiles,
            Downloads.ReorderedFastaPath,
            Downloads.EnsemblGenome,
            Parameters.QuickSnpEffWithoutStats,
            Parameters.IndelFinder,
            Parameters.VariantCallingWorkers);
    }

    // Transfer features from UniProt
    List<string> xmlsToUse;
    if (VariantCalling.CombinedAnnotatedProteinXmlPaths.Count <= 0)
    {
        // No variant-annotated XMLs: fall back to the protein XML generated from a gene model above.
        string geneModelProteinXml;
        if (Parameters.DoTranscriptIsoformAnalysis)
        {
            geneModelProteinXml = mergedGeneModelProteinXml;
        }
        else
        {
            geneModelProteinXml = referenceGeneModelProteinXml;
        }
        xmlsToUse = new List<string> { geneModelProteinXml };
    }
    else
    {
        // keep, since it might be useful for making a final database: .Concat(new[] { VariantCalling.CombinedAnnotatedProteinXmlPath }).ToList()
        xmlsToUse = VariantCalling.CombinedAnnotatedProteinXmlPaths;
    }

    VariantAnnotatedProteinXmlDatabases = new TransferModificationsFlow().TransferModifications(
        Parameters.SpritzDirectory,
        Parameters.UniProtXmlPath,
        xmlsToUse,
        fusionProteins);
}