public void GffAppliedToOther()
{
    // Both annotation files live in the TestData folder next to the test assembly.
    string testDataFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData");
    var referenceModel = new GeneModel(genome, Path.Combine(testDataFolder, "sample_gff.gff3"));
    var pacbioModel = new GeneModel(genome, Path.Combine(testDataFolder, "sample_pacbio.gtf"));

    // Transfer start codons annotated in the reference onto the alternate model, then translate.
    pacbioModel.CreateCDSFromAnnotatedStartCodons(referenceModel);
    List<Protein> translated = pacbioModel.Genes
        .SelectMany(gene => gene.Translate(true))
        .ToList();

    // Forward strand, single coding region
    const string forwardSequence =
        "MVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRKVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVTWGIGFLHSVSQLAFAVHLLFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQHRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPIIYTLRNKDMKTAIRQLRKWDAHSSVKF";
    Assert.AreEqual("PB2015.1.1", translated[0].Accession);
    Assert.AreEqual(forwardSequence, translated[0].BaseSequence);

    // Reverse strand, single coding region
    const string reverseSequence =
        "TSLWTPQAKLPTFQQLLHTQLLPPSGLFRPSSCFTRAFPGPTFVSWQPSLARFLPVSQQP" +
        "RQAQVLPHTGLSTSSLCLTVASPRPTPVPGHHLRAQNLLKSDSLVPTAASWWPMKAQNLL" +
        "KLTCPGPAPASCQRLQAQPLPHGGFSRPTSSSWLGLQAQLLPHNSLFWPSSCPANGGQCR" +
        "PKTSSSQTLQAHLLLPGGINRPSFDLRTASAGPALASQGLFPGPALASWQLPQAKFLPAC" +
        "QQPQQAQLLPHSGPFRPNL";
    Assert.AreEqual("PB2015.2.1", translated[1].Accession);
    Assert.AreEqual(reverseSequence, translated[1].BaseSequence);
}
public void OutputGtfFromGeneModel()
{
    // Inputs and the output all live in the TestData folder next to the test assembly.
    string testDataFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData");
    var referenceModel = new GeneModel(genome, Path.Combine(testDataFolder, "sample_gff.gff3"));
    var pacbioModel = new GeneModel(genome, Path.Combine(testDataFolder, "sample_pacbio.gtf"));

    // Annotate CDS from the reference start codons and write the result out as GTF.
    pacbioModel.CreateCDSFromAnnotatedStartCodons(referenceModel);
    pacbioModel.PrintToGTF(Path.Combine(testDataFolder, "sample_pacbio_merged.gtf"));
}
/// <summary>
/// Filters GTF or GFF entries that lack strand information, optionally filters by zero-abundance
/// StringTie estimates, and then adds CDS annotations from a reference gene model.
/// </summary>
/// <remarks>
/// Features are kept only while the most recent "transcript" feature on the same sequence passed
/// the filter, so features that appear before the first transcript of a sequence are dropped.
/// Two files are written next to <paramref name="gtfPath"/>: "[name].filtered.gtf" and
/// "[name].filtered.withcds.gtf".
/// </remarks>
/// <param name="gtfPath">Path of the GTF or GFF file to filter.</param>
/// <param name="referenceGenomePath">Path used to construct the <c>Genome</c> for both gene models.</param>
/// <param name="referenceGeneModelPath">Path of the reference gene model whose annotated start codons are used to create CDS.</param>
/// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">
/// When true, a transcript is kept only if both its FPKM and TPM attributes parse to a value greater than zero.
/// </param>
public static void FilterGtfEntriesWithoutStrand(string gtfPath, string referenceGenomePath, string referenceGeneModelPath, bool filterEntriesWithZeroAbundanceStringtieEstimates = false)
{
    var chromFeatures = GeneModel.SimplerParse(gtfPath);
    string filteredGtfPath = Path.Combine(Path.GetDirectoryName(gtfPath), Path.GetFileNameWithoutExtension(gtfPath) + ".filtered.gtf");

    using (var file = File.Create(filteredGtfPath))
    {
        var formatter = new GffFormatter();
        foreach (var chromISeq in chromFeatures)
        {
            var filteredFeatures = new List<MetadataListItem<List<string>>>();

            // The type pattern guards against a missing or unexpectedly-typed "features" entry,
            // which would otherwise surface as a NullReferenceException in the loop below.
            if (chromISeq.Metadata.TryGetValue("features", out object featuresObj)
                && featuresObj is List<MetadataListItem<List<string>>> features)
            {
                bool okayTranscript = false;
                foreach (var feature in features)
                {
                    // Entries without a strand are dropped outright.
                    if (!feature.SubItems.TryGetValue("strand", out List<string> _))
                    {
                        continue;
                    }

                    if (feature.Key == "transcript")
                    {
                        var attributes = GeneModel.SplitAttributes(feature.FreeText);

                        // Abundance values in GTF attributes use '.' as the decimal separator, so they are
                        // parsed with the invariant culture rather than whatever culture the machine runs.
                        bool okayFpkm = !filterEntriesWithZeroAbundanceStringtieEstimates
                            || (attributes.TryGetValue("FPKM", out string fpkm)
                                && double.TryParse(
                                    fpkm,
                                    System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                                    System.Globalization.CultureInfo.InvariantCulture,
                                    out double fpkmValue)
                                && fpkmValue > 0);
                        bool okayTpm = !filterEntriesWithZeroAbundanceStringtieEstimates
                            || (attributes.TryGetValue("TPM", out string tpm)
                                && double.TryParse(
                                    tpm,
                                    System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                                    System.Globalization.CultureInfo.InvariantCulture,
                                    out double tpmValue)
                                && tpmValue > 0);
                        okayTranscript = okayFpkm && okayTpm;
                    }

                    // Non-transcript features inherit the verdict of the last transcript seen.
                    if (okayTranscript)
                    {
                        filteredFeatures.Add(feature);
                    }
                }
            }

            // Always replace the entry, so sequences without usable features end up with an empty list.
            chromISeq.Metadata["features"] = filteredFeatures;
        }
        formatter.Format(file, chromFeatures);
    }

    // Add CDS at the end: annotate the filtered model from the reference start codons and write it out.
    Genome ensemblGenome = new Genome(referenceGenomePath);
    GeneModel newGeneModel = new GeneModel(ensemblGenome, filteredGtfPath);
    GeneModel referenceGeneModel = new GeneModel(ensemblGenome, referenceGeneModelPath);
    newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);
    string filteredGtfWithCdsPath = Path.Combine(Path.GetDirectoryName(filteredGtfPath), Path.GetFileNameWithoutExtension(filteredGtfPath) + ".withcds.gtf");
    newGeneModel.PrintToGTF(filteredGtfWithCdsPath);
}
/// <summary>
/// Annotates PacBio transcript model for MCF7 with start codons in the reference model
/// </summary>
/// <remarks>
/// Reads the genome and both gene models from hard-coded local paths, writes the CDS-annotated
/// model out as GTF, prints the elapsed time and transcript counts, and waits for a key press.
/// </remarks>
private static void PacBioCds()
{
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();

    Genome genome = new Genome(@"E:\ProjectsActive\MCF7PacBio\Homo_sapiens.GRCh37.73.dna.primary_assembly.fa");
    string referenceGff = @"E:\ProjectsActive\MCF7PacBio\Homo_sapiens.GRCh37.73.gtf";
    string alternateGff = @"E:\ProjectsActive\MCF7PacBio\IsoSeq_MCF72015edition_polished.unimapped.ensembl.unimapped.gff";
    GeneModel r = new GeneModel(genome, referenceGff);
    GeneModel a = new GeneModel(genome, alternateGff);
    a.CreateCDSFromAnnotatedStartCodons(r);
    a.PrintToGTF(@"E:\ProjectsActive\MCF7PacBio\CDSAnnotated_IsoSeq_MCF7_2015edition_polished.unimapped.gff");

    stopwatch.Stop();
    TimeSpan elapsed = stopwatch.Elapsed;

    // TotalMinutes (truncated) is reported because Minutes is only the 0-59 component of the
    // elapsed time and would silently drop whole hours on long runs.
    int elapsedMinutes = (int)elapsed.TotalMinutes;

    Console.WriteLine("Finished checking that all proteins are the same.");
    Console.WriteLine("Time elapsed: " + elapsedMinutes.ToString() + " minutes and " + elapsed.Seconds.ToString() + " seconds.");
    Console.WriteLine("Result: there are " + a.Genes.Sum(g => g.Transcripts.Count) + " PacBio transcript isoforms");
    Console.WriteLine("Result: " + a.Genes.Sum(g => g.Transcripts.Count(t => t.IsProteinCoding())) + " PacBio transcript isoforms are new annotated as protein coding");
    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();
}
/// <summary>
/// Check that annotations from ensembl make it in from a merged gene model
/// </summary>
/// <remarks>
/// Reads the 202122 genome, its reference GTF and a merged StringTie model from hard-coded local
/// paths, writes the CDS-annotated model out as GTF, prints the elapsed time and transcript
/// counts, and waits for a key press.
/// </remarks>
private static void StringtieAndEnsembl202122CDS()
{
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();

    Genome genome = new Genome(@"E:\source\repos\Spritz\Test\bin\Debug\TestData\202122.karyotypic.fa");
    string referenceGff = @"E:\source\repos\Spritz\Test\bin\Debug\TestData\202122.gtf";
    string alternateGff = @"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-806392539.filtered.gtf";
    GeneModel r = new GeneModel(genome, referenceGff);
    GeneModel a = new GeneModel(genome, alternateGff);
    a.CreateCDSFromAnnotatedStartCodons(r);
    a.PrintToGTF(@"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-806392539.filtered.withcds.gtf");

    stopwatch.Stop();
    TimeSpan elapsed = stopwatch.Elapsed;

    // TotalMinutes (truncated) is reported because Minutes is only the 0-59 component of the
    // elapsed time and would silently drop whole hours on long runs.
    int elapsedMinutes = (int)elapsed.TotalMinutes;

    Console.WriteLine("Finished checking that all proteins are the same.");
    Console.WriteLine("Time elapsed: " + elapsedMinutes.ToString() + " minutes and " + elapsed.Seconds.ToString() + " seconds.");
    Console.WriteLine("Result: there are " + a.Genes.Sum(g => g.Transcripts.Count) + " transcript isoforms in " + alternateGff);
    Console.WriteLine("Result: " + a.Genes.Sum(g => g.Transcripts.Count(t => t.IsProteinCoding())) + " of those transcript isoforms are new annotated as protein coding");
    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();
}
/// <summary>
/// Check that annotations from ensembl make it in from a merged gene model
/// </summary>
/// <remarks>
/// Reads the GRCh38 primary assembly, its reference GTF and a merged StringTie model from
/// hard-coded local paths, writes the CDS-annotated model out as GTF, prints the elapsed time
/// and transcript counts, and waits for a key press.
/// </remarks>
private static void StringtieAndEnsemblFullCDS()
{
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();

    Genome genome = new Genome(@"E:\source\repos\Spritz\Test\bin\Debug\TestData\Homo_sapiens.GRCh38.dna.primary_assembly.fa");
    string referenceGff = @"E:\source\repos\Spritz\Test\bin\Debug\TestData\Homo_sapiens.GRCh38.81.gtf";
    string alternateGff = @"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-1914802334.filtered.gtf";
    GeneModel r = new GeneModel(genome, referenceGff);
    GeneModel a = new GeneModel(genome, alternateGff);
    a.CreateCDSFromAnnotatedStartCodons(r);
    a.PrintToGTF(@"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-1914802334.filtered.withcds.gtf");

    stopwatch.Stop();
    TimeSpan elapsed = stopwatch.Elapsed;

    // TotalMinutes (truncated) is reported because Minutes is only the 0-59 component of the
    // elapsed time and would silently drop whole hours on long runs.
    int elapsedMinutes = (int)elapsed.TotalMinutes;

    Console.WriteLine("Finished checking that all proteins are the same.");
    Console.WriteLine("Time elapsed: " + elapsedMinutes.ToString() + " minutes and " + elapsed.Seconds.ToString() + " seconds.");
    Console.WriteLine("Result: there are " + a.Genes.Sum(g => g.Transcripts.Count) + " transcript isoforms in " + alternateGff);
    Console.WriteLine("Result: " + a.Genes.Sum(g => g.Transcripts.Count(t => t.IsProteinCoding())) + " of those transcript isoforms are new annotated as protein coding");
    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();
}
/// <summary>
/// Generate sample specific protein database starting with fastq files
/// </summary>
/// <remarks>
/// Steps, in order: prepare the genome fasta and (when fastqs are given) align reads; filter and
/// convert the reference gene model; optionally build a merged gene model with CDS; set up SnpEff;
/// optionally discover gene fusions; optionally call variants; finally transfer modifications
/// onto the chosen protein XMLs and store the result in <c>VariantAnnotatedProteinXmlDatabases</c>.
/// </remarks>
public void GenerateSampleSpecificProteinDatabases()
{
    // ---- Reference download and read alignment ----
    Downloads.PrepareEnsemblGenomeFasta(Parameters.AnalysisDirectory, Parameters.GenomeFasta);
    if (Parameters.Fastqs != null)
    {
        Alignment.Parameters = new AlignmentParameters
        {
            SpritzDirectory = Parameters.SpritzDirectory,
            AnalysisDirectory = Parameters.AnalysisDirectory,
            Reference = Parameters.Reference,
            Threads = Parameters.Threads,
            Fastqs = Parameters.Fastqs,
            ExperimentType = Parameters.ExperimentType,
            StrandSpecific = Parameters.StrandSpecific,
            InferStrandSpecificity = Parameters.InferStrandSpecificity,
            OverwriteStarAlignment = Parameters.OverwriteStarAlignment,
            GenomeStarIndexDirectory = Parameters.GenomeStarIndexDirectory,
            ReorderedFastaPath = Downloads.ReorderedFastaPath,
            GeneModelGtfOrGffPath = Parameters.ReferenceGeneModelGtfOrGff,
            UseReadSubset = Parameters.UseReadSubset,
            ReadSubset = Parameters.ReadSubset,
        };
        Alignment.PerformAlignment();
        Downloads.GetImportantProteinAccessions(Parameters.SpritzDirectory, Parameters.ProteinFastaPath);
    }

    // ---- Reference gene model: filtered copy, BED12 conversion, in-memory model ----
    EnsemblDownloadsWrapper.FilterGeneModel(
        Parameters.AnalysisDirectory,
        Parameters.ReferenceGeneModelGtfOrGff,
        Downloads.EnsemblGenome,
        out string scalpelGeneModelPath);
    string bed12Path = BEDOPSWrapper.GffOrGtf2Bed12(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, scalpelGeneModelPath);
    var ensemblGeneModel = new GeneModel(Downloads.EnsemblGenome, Parameters.ReferenceGeneModelGtfOrGff);

    // Used if no fastqs are provided
    string referenceProteinXml = Path.Combine(
        Path.GetDirectoryName(Parameters.ReferenceGeneModelGtfOrGff),
        Path.GetFileNameWithoutExtension(Parameters.ReferenceGeneModelGtfOrGff) + ".protein.xml");

    // ---- Merge the reference model with a new model (user-specified or stringtie-generated) ----
    string candidateGeneModelPath = Parameters.NewGeneModelGtfOrGff;
    string withCdsGtfPath = null;
    string mergedProteinXml = null;
    string snpEffReference = Parameters.Reference;
    if (Parameters.DoTranscriptIsoformAnalysis)
    {
        StringtieWrapper stringtie = new StringtieWrapper();
        if (candidateGeneModelPath != null)
        {
            // A gene model was supplied: convert its first column, then merge it with the reference.
            candidateGeneModelPath = EnsemblDownloadsWrapper.ConvertFirstColumnUCSC2Ensembl(
                Parameters.SpritzDirectory,
                Parameters.Reference,
                Parameters.NewGeneModelGtfOrGff);
            string mergedGtfPath = Path.Combine(
                Path.GetDirectoryName(candidateGeneModelPath),
                Path.GetFileNameWithoutExtension(candidateGeneModelPath) + ".merged.gtf");
            string mergeScriptPath = WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "MergeTranscriptModels.bash");
            var mergeCommands = StringtieWrapper.MergeTranscriptPredictions(
                Parameters.SpritzDirectory,
                Parameters.ReferenceGeneModelGtfOrGff,
                new List<string> { candidateGeneModelPath },
                mergedGtfPath);
            WrapperUtility.GenerateAndRunScript(mergeScriptPath, mergeCommands).WaitForExit();
            candidateGeneModelPath = mergedGtfPath;
        }
        else
        {
            // No gene model was supplied: reconstruct transcripts with stringtie.
            stringtie.TranscriptReconstruction(
                Parameters.SpritzDirectory,
                Parameters.AnalysisDirectory,
                Parameters.Threads,
                Parameters.ReferenceGeneModelGtfOrGff,
                Downloads.EnsemblGenome,
                Parameters.StrandSpecific,
                Parameters.InferStrandSpecificity,
                Alignment.SortedBamFiles,
                true);
            candidateGeneModelPath = stringtie.FilteredMergedGtfPath;
        }

        // Determine CDS from start codons of the reference gene model.
        // In the future, ORF finding could expand this (e.g. https://github.com/TransDecoder/TransDecoder/wiki)
        var mergedModel = new GeneModel(Downloads.EnsemblGenome, candidateGeneModelPath);
        mergedModel.CreateCDSFromAnnotatedStartCodons(ensemblGeneModel);
        withCdsGtfPath = Path.Combine(
            Path.GetDirectoryName(candidateGeneModelPath),
            Path.GetFileNameWithoutExtension(candidateGeneModelPath) + ".withcds.gtf");
        mergedModel.PrintToGTF(withCdsGtfPath);
    }

    // ---- SnpEff databases, or protein XMLs written straight from gene models ----
    if (Parameters.DoTranscriptIsoformAnalysis)
    {
        // Isoform analysis, so generate a new snpeff database
        snpEffReference = SnpEffWrapper.GenerateDatabase(
            Parameters.SpritzDirectory,
            Parameters.AnalysisDirectory,
            Downloads.ReorderedFastaPath,
            Parameters.ProteinFastaPath,
            withCdsGtfPath);

        // Isoform analysis without variant analysis: protein database comes directly from the merged gtf
        if (Parameters.Fastqs == null || Parameters.SkipVariantAnalysis)
        {
            mergedProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(
                Parameters.SpritzDirectory,
                Parameters.AnalysisDirectory,
                snpEffReference,
                withCdsGtfPath);
        }
    }
    else
    {
        // No isoform analysis
        new SnpEffWrapper(1).DownloadSnpEffDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference);

        // No isoform analysis and no fastqs
        if (Parameters.Fastqs == null)
        {
            referenceProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(
                Parameters.SpritzDirectory,
                Parameters.AnalysisDirectory,
                Parameters.Reference,
                Parameters.ReferenceGeneModelGtfOrGff);
        }
    }

    // ---- Gene fusion discovery ----
    List<Protein> fusionProteins = new List<Protein>();
    if (Parameters.DoFusionAnalysis)
    {
        Fusion.Parameters.SpritzDirectory = Parameters.SpritzDirectory;
        Fusion.Parameters.AnalysisDirectory = Parameters.AnalysisDirectory;
        Fusion.Parameters.Reference = Parameters.Reference;
        Fusion.Parameters.Threads = Parameters.Threads;
        Fusion.Parameters.Fastqs = Parameters.Fastqs;
        Fusion.DiscoverGeneFusions();
        fusionProteins = Fusion.FusionProteins;
    }

    // ---- Variant calling ----
    if (Parameters.Fastqs != null && !Parameters.SkipVariantAnalysis)
    {
        VariantCalling.CallVariants(
            Parameters.SpritzDirectory,
            Parameters.AnalysisDirectory,
            Parameters.ExperimentType,
            snpEffReference,
            Parameters.Threads,
            bed12Path,
            Parameters.EnsemblKnownSitesPath,
            Alignment.DedupedBamFiles,
            Downloads.ReorderedFastaPath,
            Downloads.EnsemblGenome,
            Parameters.QuickSnpEffWithoutStats,
            Parameters.IndelFinder,
            Parameters.VariantCallingWorkers);
    }

    // ---- Transfer features from UniProt ----
    // Prefer the variant-annotated XMLs; otherwise fall back to the single gene-model XML.
    // Kept as a note, since it might be useful for making a final database:
    // .Concat(new[] { VariantCalling.CombinedAnnotatedProteinXmlPath }).ToList()
    List<string> proteinXmls = VariantCalling.CombinedAnnotatedProteinXmlPaths.Count > 0
        ? VariantCalling.CombinedAnnotatedProteinXmlPaths
        : new List<string> { Parameters.DoTranscriptIsoformAnalysis ? mergedProteinXml : referenceProteinXml };

    VariantAnnotatedProteinXmlDatabases = new TransferModificationsFlow().TransferModifications(
        Parameters.SpritzDirectory,
        Parameters.UniProtXmlPath,
        proteinXmls,
        fusionProteins);
}