/// <summary>
/// lncRNA discovery from fastq files: prepares the genome fasta, aligns the reads as RNA sequencing data,
/// reconstructs transcripts with <c>StringtieWrapper</c>, and runs the slncky annotation script on every
/// reconstructed transcript model.
/// </summary>
/// <remarks>
/// Assigns <c>ReconstructedTranscriptModels</c>, <c>MergedTranscriptModel</c> and <c>SlnckyOutPrefix</c>.
/// <c>SlnckyOutPrefix</c> is reassigned for each transcript model, so once this method returns it holds
/// the prefix of the last model that was processed.
/// </remarks>
public void LncRNADiscoveryFromFastqs()
{
    // Setup: genome reference
    var ensembl = new EnsemblDownloadsWrapper();
    ensembl.PrepareEnsemblGenomeFasta(Parameters.AnalysisDirectory, Parameters.GenomeFasta);

    // Alignment: copy this flow's settings into a fresh alignment flow; the experiment type is always RNA sequencing here
    var aligner = new AlignmentFlow();
    aligner.Parameters = new AlignmentParameters
    {
        SpritzDirectory = Parameters.SpritzDirectory,
        AnalysisDirectory = Parameters.AnalysisDirectory,
        Reference = Parameters.Reference,
        Threads = Parameters.Threads,
        Fastqs = Parameters.Fastqs,
        ExperimentType = ExperimentType.RNASequencing,
        StrandSpecific = Parameters.StrandSpecific,
        InferStrandSpecificity = Parameters.InferStrandSpecificity,
        OverwriteStarAlignment = Parameters.OverwriteStarAlignment,
        GenomeStarIndexDirectory = Parameters.GenomeStarIndexDirectory,
        ReorderedFastaPath = ensembl.ReorderedFastaPath,
        GeneModelGtfOrGffPath = Parameters.GeneModelGtfOrGff,
        UseReadSubset = Parameters.UseReadSubset,
        ReadSubset = Parameters.ReadSubset,
    };
    aligner.PerformAlignment();

    ensembl.GetImportantProteinAccessions(Parameters.SpritzDirectory, Parameters.ProteinFasta);
    EnsemblDownloadsWrapper.FilterGeneModel(
        Parameters.AnalysisDirectory,
        Parameters.GeneModelGtfOrGff,
        ensembl.EnsemblGenome,
        out string filteredGeneModel);

    // The returned BED12 path is not read anywhere in this method; the conversion call itself is kept.
    BEDOPSWrapper.GffOrGtf2Bed12(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, filteredGeneModel);

    // Transcript Reconstruction
    var reconstruction = new StringtieWrapper();
    reconstruction.TranscriptReconstruction(
        Parameters.SpritzDirectory,
        Parameters.AnalysisDirectory,
        Parameters.Threads,
        Parameters.GeneModelGtfOrGff,
        ensembl.EnsemblGenome,
        Parameters.StrandSpecific,
        Parameters.InferStrandSpecificity,
        aligner.SortedBamFiles,
        true);
    ReconstructedTranscriptModels = reconstruction.FilteredTranscriptGtfPaths;
    MergedTranscriptModel = reconstruction.FilteredMergedGtfPath;

    // Annotate lncRNAs: one script run per reconstructed transcript model, each awaited before the next starts
    foreach (string transcriptGtf in ReconstructedTranscriptModels)
    {
        string scriptPath = WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "SlnckyAnnotation.bash");

        // Output prefix: <gtf directory>/<gtf name without extension>.slnckyOut/annotated
        string outputFolderName = Path.GetFileNameWithoutExtension(transcriptGtf) + ".slnckyOut";
        SlnckyOutPrefix = Path.Combine(Path.GetDirectoryName(transcriptGtf), outputFolderName, "annotated");

        var annotationCommands = SlnckyWrapper.Annotate(
            Parameters.SpritzDirectory,
            Parameters.AnalysisDirectory,
            Parameters.Threads,
            transcriptGtf,
            Parameters.Reference,
            SlnckyOutPrefix);
        WrapperUtility.GenerateAndRunScript(scriptPath, annotationCommands).WaitForExit();
    }

    // Write quantification tables for differential expression analysis (using stringtie TPM values)
}
/// <summary>
/// Generate sample specific protein databases, optionally starting from fastq files.
/// </summary>
/// <remarks>
/// Steps, in order:
/// <list type="number">
/// <item><description>Prepare the genome fasta and, when <c>Parameters.Fastqs</c> is not null, align the reads.</description></item>
/// <item><description>Filter the reference gene model and convert it to BED12 (passed to variant calling).</description></item>
/// <item><description>When <c>Parameters.DoTranscriptIsoformAnalysis</c> is set, obtain a new gene model (StringTie reconstruction,
/// or the user-specified model merged with the reference), add CDS from the reference start codons, and write it as GTF.</description></item>
/// <item><description>Generate (isoform analysis) or download (no isoform analysis) the SnpEff database; when variant calling will
/// not run, generate a protein XML directly from the gene model.</description></item>
/// <item><description>Optionally discover gene fusions.</description></item>
/// <item><description>Call variants when fastqs are present and <c>Parameters.SkipVariantAnalysis</c> is false.</description></item>
/// <item><description>Transfer UniProt modifications onto the chosen XMLs and store the result in
/// <c>VariantAnnotatedProteinXmlDatabases</c>.</description></item>
/// </list>
/// </remarks>
public void GenerateSampleSpecificProteinDatabases()
{
    // Download references and align reads
    Downloads.PrepareEnsemblGenomeFasta(Parameters.AnalysisDirectory, Parameters.GenomeFasta);
    if (Parameters.Fastqs != null)
    {
        Alignment.Parameters = new AlignmentParameters();
        Alignment.Parameters.SpritzDirectory = Parameters.SpritzDirectory;
        Alignment.Parameters.AnalysisDirectory = Parameters.AnalysisDirectory;
        Alignment.Parameters.Reference = Parameters.Reference;
        Alignment.Parameters.Threads = Parameters.Threads;
        Alignment.Parameters.Fastqs = Parameters.Fastqs;
        Alignment.Parameters.ExperimentType = Parameters.ExperimentType;
        Alignment.Parameters.StrandSpecific = Parameters.StrandSpecific;
        Alignment.Parameters.InferStrandSpecificity = Parameters.InferStrandSpecificity;
        Alignment.Parameters.OverwriteStarAlignment = Parameters.OverwriteStarAlignment;
        Alignment.Parameters.GenomeStarIndexDirectory = Parameters.GenomeStarIndexDirectory;
        Alignment.Parameters.ReorderedFastaPath = Downloads.ReorderedFastaPath;
        Alignment.Parameters.GeneModelGtfOrGffPath = Parameters.ReferenceGeneModelGtfOrGff;
        Alignment.Parameters.UseReadSubset = Parameters.UseReadSubset;
        Alignment.Parameters.ReadSubset = Parameters.ReadSubset;
        Alignment.PerformAlignment();
        Downloads.GetImportantProteinAccessions(Parameters.SpritzDirectory, Parameters.ProteinFastaPath);
    }

    EnsemblDownloadsWrapper.FilterGeneModel(Parameters.AnalysisDirectory, Parameters.ReferenceGeneModelGtfOrGff, Downloads.EnsemblGenome, out string filteredGeneModelForScalpel);
    string sortedBed12Path = BEDOPSWrapper.GffOrGtf2Bed12(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, filteredGeneModelForScalpel);
    GeneModel referenceGeneModel = new GeneModel(Downloads.EnsemblGenome, Parameters.ReferenceGeneModelGtfOrGff);

    // Default location next to the reference gene model; replaced below by the path of the generated XML
    // whenever the reference XML is produced directly (no isoform analysis and no variant calling).
    string referenceGeneModelProteinXml = Path.Combine(
        Path.GetDirectoryName(Parameters.ReferenceGeneModelGtfOrGff),
        Path.GetFileNameWithoutExtension(Parameters.ReferenceGeneModelGtfOrGff) + ".protein.xml");

    // Merge reference gene model and a new gene model (either specified or stringtie-generated)
    string newGeneModelPath = Parameters.NewGeneModelGtfOrGff;
    string mergedGeneModelWithCdsPath = null;
    string mergedGeneModelProteinXml = null;
    string reference = Parameters.Reference;
    if (Parameters.DoTranscriptIsoformAnalysis)
    {
        StringtieWrapper stringtie = new StringtieWrapper();
        if (newGeneModelPath == null)
        {
            // No gene model was specified, so reconstruct one from the sorted alignments
            stringtie.TranscriptReconstruction(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Threads, Parameters.ReferenceGeneModelGtfOrGff, Downloads.EnsemblGenome,
                Parameters.StrandSpecific, Parameters.InferStrandSpecificity, Alignment.SortedBamFiles, true);
            newGeneModelPath = stringtie.FilteredMergedGtfPath;
        }
        else
        {
            // A gene model was specified: convert its first column (UCSC to Ensembl), then merge it with the reference model
            newGeneModelPath = EnsemblDownloadsWrapper.ConvertFirstColumnUCSC2Ensembl(Parameters.SpritzDirectory, Parameters.Reference, Parameters.NewGeneModelGtfOrGff);
            string mergedGeneModelPath = Path.Combine(Path.GetDirectoryName(newGeneModelPath), Path.GetFileNameWithoutExtension(newGeneModelPath) + ".merged.gtf");
            WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "MergeTranscriptModels.bash"),
                StringtieWrapper.MergeTranscriptPredictions(Parameters.SpritzDirectory, Parameters.ReferenceGeneModelGtfOrGff, new List<string> { newGeneModelPath }, mergedGeneModelPath))
                .WaitForExit();
            newGeneModelPath = mergedGeneModelPath;
        }

        // Determine CDS from start codons of reference gene model
        // In the future, we could also try ORF finding to expand this (e.g. https://github.com/TransDecoder/TransDecoder/wiki)
        GeneModel newGeneModel = new GeneModel(Downloads.EnsemblGenome, newGeneModelPath);
        newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);
        mergedGeneModelWithCdsPath = Path.Combine(Path.GetDirectoryName(newGeneModelPath), Path.GetFileNameWithoutExtension(newGeneModelPath) + ".withcds.gtf");
        newGeneModel.PrintToGTF(mergedGeneModelWithCdsPath);
    }

    // Variant calling only runs when fastqs are provided and variant analysis is not skipped (see the Variant Calling
    // step below). In every other case the protein XML has to be generated directly from a gene model.
    bool variantCallingWillRun = Parameters.Fastqs != null && !Parameters.SkipVariantAnalysis;

    // SnpEff databases or outputting protein XMLs from gene models
    if (Parameters.DoTranscriptIsoformAnalysis) // isoform analysis, so generate a new snpeff database
    {
        reference = SnpEffWrapper.GenerateDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Downloads.ReorderedFastaPath, Parameters.ProteinFastaPath, mergedGeneModelWithCdsPath);
        if (!variantCallingWillRun) // isoform analysis without variant analysis, so generate a protein database directly from merged gtf
        {
            mergedGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, reference, mergedGeneModelWithCdsPath);
        }
    }
    else // no isoform analysis
    {
        new SnpEffWrapper(1).DownloadSnpEffDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference);

        // Same condition as the isoform branch above. Checking only for missing fastqs would leave the
        // "fastqs provided but variant analysis skipped" case pointing at a reference XML path that nothing
        // in this method generated.
        if (!variantCallingWillRun) // no isoform analysis and no variant analysis
        {
            referenceGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference, Parameters.ReferenceGeneModelGtfOrGff);
        }
    }

    // Gene Fusion Discovery
    List<Protein> fusionProteins = new List<Protein>();
    if (Parameters.DoFusionAnalysis)
    {
        // NOTE(review): Fusion.Parameters is not assigned a new instance here (unlike Alignment.Parameters above);
        // presumably it is initialized elsewhere — confirm.
        Fusion.Parameters.SpritzDirectory = Parameters.SpritzDirectory;
        Fusion.Parameters.AnalysisDirectory = Parameters.AnalysisDirectory;
        Fusion.Parameters.Reference = Parameters.Reference;
        Fusion.Parameters.Threads = Parameters.Threads;
        Fusion.Parameters.Fastqs = Parameters.Fastqs;
        Fusion.DiscoverGeneFusions();
        fusionProteins = Fusion.FusionProteins;
    }

    // Variant Calling
    if (variantCallingWillRun)
    {
        // "reference" is the newly generated SnpEff database when isoform analysis ran, otherwise Parameters.Reference
        VariantCalling.CallVariants(
            Parameters.SpritzDirectory,
            Parameters.AnalysisDirectory,
            Parameters.ExperimentType,
            reference,
            Parameters.Threads,
            sortedBed12Path,
            Parameters.EnsemblKnownSitesPath,
            Alignment.DedupedBamFiles,
            Downloads.ReorderedFastaPath,
            Downloads.EnsemblGenome,
            Parameters.QuickSnpEffWithoutStats,
            Parameters.IndelFinder,
            Parameters.VariantCallingWorkers);
    }

    // Transfer features from UniProt
    List<string> xmlsToUse;
    if (VariantCalling.CombinedAnnotatedProteinXmlPaths.Count > 0)
    {
        xmlsToUse = VariantCalling.CombinedAnnotatedProteinXmlPaths;
        // keep, since it might be useful for making a final database: .Concat(new[] { VariantCalling.CombinedAnnotatedProteinXmlPath }).ToList()
    }
    else
    {
        // No variant-annotated XMLs: fall back to the XML built directly from the merged or reference gene model.
        // NOTE(review): if variant calling ran with isoform analysis but produced no XML paths,
        // mergedGeneModelProteinXml is still null at this point — confirm how TransferModifications handles that.
        xmlsToUse = new List<string> { Parameters.DoTranscriptIsoformAnalysis ? mergedGeneModelProteinXml : referenceGeneModelProteinXml };
    }
    VariantAnnotatedProteinXmlDatabases = new TransferModificationsFlow().TransferModifications(Parameters.SpritzDirectory, Parameters.UniProtXmlPath, xmlsToUse, fusionProteins);
}