//public string CombinedGatkGvcfFilePath { get; private set; }
//public string CombinedGatkVcfFilePath { get; private set; }
//public string CombinedGatkFilteredVcfFilePath { get; private set; }
//public string CombinedAnnotatedVcfFilePath { get; private set; }
//public string CombinedSnpEffHtmlFilePath { get; private set; }
//public string CombinedAnnotatedGenesSummaryPath { get; private set; }
//public string CombinedAnnotatedProteinFastaPath { get; private set; }
//public string CombinedAnnotatedProteinXmlPath { get; private set; }

/// <summary>
/// Builds one variant-calling bash script per deduplicated BAM file, then generates and runs a
/// runner script that launches those scripts in the background in batches and waits for them.
/// The output paths reported by the wrappers are appended to this object's path lists
/// (GatkGvcfFilePaths, GatkVcfFilePaths, GatkFilteredVcfFilePaths, ScalpelVcfFilePaths,
/// ScalpelFilteredVcfFilePaths, CombinedVcfFilePaths, CombinedSortedVcfFilePaths,
/// CombinedAnnotatedVcfFilePaths, CombinedSnpEffHtmlFilePaths,
/// CombinedAnnotatedProteinFastaPaths, CombinedAnnotatedProteinXmlPaths).
/// This call blocks until the runner script process exits.
/// </summary>
/// <param name="spritzDirectory">Passed through to every wrapper call; also combined with <paramref name="ensemblKnownSitesPath"/> for the variant-calling step.</param>
/// <param name="analysisDirectory">Directory handed to WrapperUtility.GetAnalysisScriptPath for the generated scripts and to some wrapper calls.</param>
/// <param name="experimentType">When equal to ExperimentType.RNASequencing, SplitNCigarReads is run before base recalibration.</param>
/// <param name="reference">Reference name passed to SnpEffWrapper.PrimaryVariantAnnotation.</param>
/// <param name="threads">Total thread budget; divided across <paramref name="workers"/> (each script gets at least one thread).</param>
/// <param name="sortedBed12Path">BED12 path passed to ScalpelWrapper.CallIndels.</param>
/// <param name="ensemblKnownSitesPath">Known-sites path passed to base recalibration and variant calling.</param>
/// <param name="dedupedBamFiles">BAM files to process; one script and one log file are produced per entry.</param>
/// <param name="reorderedFastaPath">Genome fasta path passed to the GATK, Scalpel and sort steps.</param>
/// <param name="genome">Currently unused by this method.</param>
/// <param name="quickSnpEff">Currently unused by this method (only referenced by the disabled code at the end).</param>
/// <param name="indelFinder">"scalpel" or "gatk" (case-insensitive); any other value, including null, annotates the filtered HaplotypeCaller VCF.</param>
/// <param name="workers">Number of scripts started before waiting for the batch to finish; values below one are treated as one for batching and thread division.</param>
public void CallVariants(string spritzDirectory, string analysisDirectory, ExperimentType experimentType, string reference, int threads, string sortedBed12Path, string ensemblKnownSitesPath, List<string> dedupedBamFiles, string reorderedFastaPath, Genome genome, bool quickSnpEff, string indelFinder, int workers)
{
    // Guard the arithmetic below: a worker count of zero would otherwise divide by zero
    // (both in the thread split and in the batching modulus).
    int concurrentScripts = Math.Max(1, workers);

    // Threads per script. The previous form assigned the result of a postfix increment,
    // which yields the old value, so the count stayed at zero whenever threads < workers.
    int workerThreads = Math.Max(1, threads / concurrentScripts);

    // Loop-invariant indel finder selection; the static Equals tolerates a null indelFinder.
    bool useScalpel = string.Equals(indelFinder, "scalpel", System.StringComparison.InvariantCultureIgnoreCase);
    bool useGatkIndels = string.Equals(indelFinder, "gatk", System.StringComparison.InvariantCultureIgnoreCase);

    // Generate scripts for each BAM file
    List<string> variantCallingBashScripts = new List<string>();
    foreach (string dedupedBam in dedupedBamFiles)
    {
        List<string> variantCallingCommands = new List<string>();

        // GATK (the wrapper receives the caller's worker count unchanged)
        var gatk = new GATKWrapper(workers);
        if (experimentType == ExperimentType.RNASequencing)
        {
            variantCallingCommands.AddRange(gatk.SplitNCigarReads(spritzDirectory, reorderedFastaPath, dedupedBam));
            variantCallingCommands.AddRange(gatk.BaseRecalibration(spritzDirectory, analysisDirectory, reorderedFastaPath, gatk.SplitTrimBamPath, ensemblKnownSitesPath));
        }
        else
        {
            variantCallingCommands.AddRange(gatk.BaseRecalibration(spritzDirectory, analysisDirectory, reorderedFastaPath, dedupedBam, ensemblKnownSitesPath));
        }
        variantCallingCommands.AddRange(gatk.VariantCalling(spritzDirectory, experimentType, workerThreads, reorderedFastaPath, gatk.RecalibratedBamPath, Path.Combine(spritzDirectory, ensemblKnownSitesPath)));
        GatkGvcfFilePaths.Add(gatk.HaplotypeCallerGvcfPath);
        GatkVcfFilePaths.Add(gatk.HaplotypeCallerVcfPath);
        GatkFilteredVcfFilePaths.Add(gatk.FilteredHaplotypeCallerVcfPath);

        // Scalpel
        var scalpel = new ScalpelWrapper();
        if (useScalpel)
        {
            string scalpelOutputDirectory = Path.Combine(Path.GetDirectoryName(dedupedBam), Path.GetFileNameWithoutExtension(dedupedBam) + "_scalpelOut");
            variantCallingCommands.AddRange(scalpel.CallIndels(spritzDirectory, workerThreads, reorderedFastaPath, sortedBed12Path, dedupedBam, scalpelOutputDirectory));
            ScalpelVcfFilePaths.Add(scalpel.IndelVcfPath);
            ScalpelFilteredVcfFilePaths.Add(scalpel.FilteredIndelVcfPath);
        }

        // Combine & Annotate
        var vcftools = new VcfToolsWrapper();
        var snpEff = new SnpEffWrapper(workers);
        var outprefix = Path.Combine(Path.GetDirectoryName(gatk.RecalibratedBamPath), Path.GetFileNameWithoutExtension(gatk.RecalibratedBamPath));
        if (useScalpel)
        {
            // Concatenate the GATK and Scalpel calls, sort them, and annotate the sorted VCF
            variantCallingCommands.Add(vcftools.Concatenate(spritzDirectory, new string[] { gatk.FilteredHaplotypeCallerVcfPath, scalpel.FilteredIndelVcfPath }, outprefix));
            variantCallingCommands.AddRange(gatk.SortVCF(spritzDirectory, analysisDirectory, vcftools.VcfConcatenatedPath, reorderedFastaPath));
            CombinedVcfFilePaths.Add(vcftools.VcfConcatenatedPath);
            CombinedSortedVcfFilePaths.Add(gatk.SortedVcfPath);
            variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.SortedVcfPath));
        }
        else if (useGatkIndels)
        {
            variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.HaplotypeCallerVcfPath));
        }
        else
        {
            variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.FilteredHaplotypeCallerVcfPath));
        }
        CombinedAnnotatedVcfFilePaths.Add(snpEff.AnnotatedVcfPath);
        CombinedSnpEffHtmlFilePaths.Add(snpEff.HtmlReportPath);
        CombinedAnnotatedProteinFastaPaths.Add(snpEff.VariantProteinFastaPath);
        CombinedAnnotatedProteinXmlPaths.Add(snpEff.VariantProteinXmlPath);

        string littleScriptName = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, $"VariantCalling{dedupedBam.GetHashCode().ToString()}.bash");
        WrapperUtility.GenerateScript(littleScriptName, variantCallingCommands);
        variantCallingBashScripts.Add(littleScriptName);
    }

    // Run the scripts in parallel
    string scriptName = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, $"VariantCalling.bash");
    List<string> runnerCommands = new List<string>();
    List<int> runners = new List<int>();
    for (int i = 1; i <= variantCallingBashScripts.Count; i++)
    {
        // runs in parallel unless it's spawning enough workers or at the end of the line
        string bamPath = dedupedBamFiles[i - 1];
        string bashScript = variantCallingBashScripts[i - 1];
        string logPath = Path.Combine(Path.GetDirectoryName(bamPath), Path.GetFileNameWithoutExtension(bamPath) + ".variantCalling.log");
        string convertedLogPath = WrapperUtility.ConvertWindowsPath(logPath);
        string displayedLogPath = convertedLogPath.Trim('"');
        bool waitForWorkersToFinish = i % concurrentScripts == 0 || i == variantCallingBashScripts.Count;

        runnerCommands.Add($"echo \"Running {bashScript} in the background. See {displayedLogPath} for output.\"");
        runnerCommands.Add($"bash {WrapperUtility.ConvertWindowsPath(bashScript)} &> {convertedLogPath} &");
        runnerCommands.Add($"proc{i.ToString()}=$!");
        runners.Add(i);

        if (waitForWorkersToFinish)
        {
            // Wait on every background process started since the last batch boundary
            foreach (int runner in runners)
            {
                runnerCommands.Add($"wait $proc{runner.ToString()}");
            }
            runners.Clear();
        }
    }
    WrapperUtility.GenerateAndRunScript(scriptName, runnerCommands).WaitForExit();

    // Combine GVCFs and make a final database
    // This doesn't work because CombineGVCFs doesn't handle MNPs... MergeVcfs doesn't actually decide anything about overlapping variants...
    // https://github.com/broadinstitute/gatk/issues/1385
    // Note: I'm not going to figure out how to merge scalpel VCFs just yet until I find out whether it does any better than GATK...
    //if (GatkVcfFilePaths.Count > 1 && !indelFinder.Equals("scalpel", System.StringComparison.InvariantCultureIgnoreCase))
    //{
    //    var gatk = new GATKWrapper();
    //    var snpEff = new SnpEffWrapper();
    //    variantCallingCommands.AddRange(gatk.CombineAndGenotypeGvcfs(spritzDirectory, reorderedFastaPath, GatkVcfFilePaths));
    //    if (indelFinder.Equals("gatk", System.StringComparison.InvariantCultureIgnoreCase))
    //    {
    //        variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.HaplotypeCallerVcfPath, quickSnpEff));
    //    }
    //    else
    //    {
    //        variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.FilteredHaplotypeCallerVcfPath, quickSnpEff));
    //    }
    //    CombinedGatkGvcfFilePath = gatk.HaplotypeCallerGvcfPath;
    //    CombinedGatkVcfFilePath = gatk.HaplotypeCallerVcfPath;
    //    CombinedAnnotatedVcfFilePath = snpEff.AnnotatedVcfPath;
    //    CombinedSnpEffHtmlFilePath = snpEff.HtmlReportPath;
    //    CombinedAnnotatedProteinFastaPath = snpEff.VariantProteinFastaPath;
    //    CombinedAnnotatedProteinXmlPath = snpEff.VariantProteinXmlPath;
    //}
}
/// <summary>
/// Generate sample specific protein database starting with fastq files
/// </summary>
/// <remarks>
/// Steps, in order: prepare the genome fasta; align reads when fastqs are given; filter the
/// reference gene model and convert it to BED12; optionally build a merged gene model with CDS
/// (transcript isoform analysis); prepare the SnpEff database; optionally discover gene fusions;
/// optionally call variants; finally hand the chosen protein XMLs to TransferModificationsFlow
/// and store the result in VariantAnnotatedProteinXmlDatabases.
/// When variant calling yields no annotated protein XMLs, the protein XML generated directly from
/// the merged (isoform analysis) or reference gene model is used instead.
/// </remarks>
public void GenerateSampleSpecificProteinDatabases()
{
    // Download references and align reads
    Downloads.PrepareEnsemblGenomeFasta(Parameters.AnalysisDirectory, Parameters.GenomeFasta);
    if (Parameters.Fastqs != null)
    {
        Alignment.Parameters = new AlignmentParameters();
        Alignment.Parameters.SpritzDirectory = Parameters.SpritzDirectory;
        Alignment.Parameters.AnalysisDirectory = Parameters.AnalysisDirectory;
        Alignment.Parameters.Reference = Parameters.Reference;
        Alignment.Parameters.Threads = Parameters.Threads;
        Alignment.Parameters.Fastqs = Parameters.Fastqs;
        Alignment.Parameters.ExperimentType = Parameters.ExperimentType;
        Alignment.Parameters.StrandSpecific = Parameters.StrandSpecific;
        Alignment.Parameters.InferStrandSpecificity = Parameters.InferStrandSpecificity;
        Alignment.Parameters.OverwriteStarAlignment = Parameters.OverwriteStarAlignment;
        Alignment.Parameters.GenomeStarIndexDirectory = Parameters.GenomeStarIndexDirectory;
        Alignment.Parameters.ReorderedFastaPath = Downloads.ReorderedFastaPath;
        Alignment.Parameters.GeneModelGtfOrGffPath = Parameters.ReferenceGeneModelGtfOrGff;
        Alignment.Parameters.UseReadSubset = Parameters.UseReadSubset;
        Alignment.Parameters.ReadSubset = Parameters.ReadSubset;
        Alignment.PerformAlignment();
        Downloads.GetImportantProteinAccessions(Parameters.SpritzDirectory, Parameters.ProteinFastaPath);
    }
    EnsemblDownloadsWrapper.FilterGeneModel(Parameters.AnalysisDirectory, Parameters.ReferenceGeneModelGtfOrGff, Downloads.EnsemblGenome, out string filteredGeneModelForScalpel);
    string sortedBed12Path = BEDOPSWrapper.GffOrGtf2Bed12(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, filteredGeneModelForScalpel);
    GeneModel referenceGeneModel = new GeneModel(Downloads.EnsemblGenome, Parameters.ReferenceGeneModelGtfOrGff);

    // Default location next to the reference gene model; replaced below whenever the XML is generated here
    string referenceGeneModelProteinXml = Path.Combine(
        Path.GetDirectoryName(Parameters.ReferenceGeneModelGtfOrGff),
        Path.GetFileNameWithoutExtension(Parameters.ReferenceGeneModelGtfOrGff) + ".protein.xml");

    // Merge reference gene model and a new gene model (either specified or stringtie-generated)
    string newGeneModelPath = Parameters.NewGeneModelGtfOrGff;
    string mergedGeneModelWithCdsPath = null;
    string mergedGeneModelProteinXml = null;
    string reference = Parameters.Reference;
    if (Parameters.DoTranscriptIsoformAnalysis)
    {
        StringtieWrapper stringtie = new StringtieWrapper();
        if (newGeneModelPath == null)
        {
            stringtie.TranscriptReconstruction(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Threads, Parameters.ReferenceGeneModelGtfOrGff, Downloads.EnsemblGenome, Parameters.StrandSpecific, Parameters.InferStrandSpecificity, Alignment.SortedBamFiles, true);
            newGeneModelPath = stringtie.FilteredMergedGtfPath;
        }
        else
        {
            newGeneModelPath = EnsemblDownloadsWrapper.ConvertFirstColumnUCSC2Ensembl(Parameters.SpritzDirectory, Parameters.Reference, Parameters.NewGeneModelGtfOrGff);
            string mergedGeneModelPath = Path.Combine(Path.GetDirectoryName(newGeneModelPath), Path.GetFileNameWithoutExtension(newGeneModelPath) + ".merged.gtf");
            WrapperUtility.GenerateAndRunScript(
                WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "MergeTranscriptModels.bash"),
                StringtieWrapper.MergeTranscriptPredictions(Parameters.SpritzDirectory, Parameters.ReferenceGeneModelGtfOrGff, new List<string> { newGeneModelPath }, mergedGeneModelPath)).WaitForExit();
            newGeneModelPath = mergedGeneModelPath;
        }

        // Determine CDS from start codons of reference gene model
        // In the future, we could also try ORF finding to expand this (e.g. https://github.com/TransDecoder/TransDecoder/wiki)
        GeneModel newGeneModel = new GeneModel(Downloads.EnsemblGenome, newGeneModelPath);
        newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);
        mergedGeneModelWithCdsPath = Path.Combine(Path.GetDirectoryName(newGeneModelPath), Path.GetFileNameWithoutExtension(newGeneModelPath) + ".withcds.gtf");
        newGeneModel.PrintToGTF(mergedGeneModelWithCdsPath);
    }

    // SnpEff databases or outputing protein XMLs from gene models
    if (Parameters.DoTranscriptIsoformAnalysis) // isoform analysis, so generate a new snpeff database
    {
        reference = SnpEffWrapper.GenerateDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Downloads.ReorderedFastaPath, Parameters.ProteinFastaPath, mergedGeneModelWithCdsPath);
        if (Parameters.Fastqs == null || Parameters.SkipVariantAnalysis) // isoform analysis without variant analysis, so generate a protein database directly from merged gtf
        {
            mergedGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, reference, mergedGeneModelWithCdsPath);
        }
    }
    else // no isoform analysis
    {
        new SnpEffWrapper(1).DownloadSnpEffDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference);

        // No isoform analysis and no variant analysis (no fastqs, or variant analysis skipped):
        // generate the reference protein XML here. This mirrors the isoform branch above; checking
        // only for missing fastqs left the fallback XML ungenerated when variant analysis was skipped.
        if (Parameters.Fastqs == null || Parameters.SkipVariantAnalysis)
        {
            referenceGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference, Parameters.ReferenceGeneModelGtfOrGff);
        }
    }

    // Gene Fusion Discovery
    List<Protein> fusionProteins = new List<Protein>();
    if (Parameters.DoFusionAnalysis)
    {
        Fusion.Parameters.SpritzDirectory = Parameters.SpritzDirectory;
        Fusion.Parameters.AnalysisDirectory = Parameters.AnalysisDirectory;
        Fusion.Parameters.Reference = Parameters.Reference;
        Fusion.Parameters.Threads = Parameters.Threads;
        Fusion.Parameters.Fastqs = Parameters.Fastqs;
        Fusion.DiscoverGeneFusions();
        fusionProteins = Fusion.FusionProteins;
    }

    // Variant Calling
    if (Parameters.Fastqs != null && !Parameters.SkipVariantAnalysis)
    {
        VariantCalling.CallVariants(
            Parameters.SpritzDirectory,
            Parameters.AnalysisDirectory,
            Parameters.ExperimentType,
            reference,
            Parameters.Threads,
            sortedBed12Path,
            Parameters.EnsemblKnownSitesPath,
            Alignment.DedupedBamFiles,
            Downloads.ReorderedFastaPath,
            Downloads.EnsemblGenome,
            Parameters.QuickSnpEffWithoutStats,
            Parameters.IndelFinder,
            Parameters.VariantCallingWorkers);
    }

    // Transfer features from UniProt
    List<string> xmlsToUse = null;
    if (VariantCalling.CombinedAnnotatedProteinXmlPaths.Count > 0)
    {
        xmlsToUse = VariantCalling.CombinedAnnotatedProteinXmlPaths;
        // keep, since it might be useful for making a final database: .Concat(new[] { VariantCalling.CombinedAnnotatedProteinXmlPath }).ToList()
    }
    else
    {
        // No variant-annotated XMLs: fall back to the XML derived from the gene model
        xmlsToUse = new List<string>
        {
            Parameters.DoTranscriptIsoformAnalysis ? mergedGeneModelProteinXml : referenceGeneModelProteinXml
        };
    }
    VariantAnnotatedProteinXmlDatabases = new TransferModificationsFlow().TransferModifications(Parameters.SpritzDirectory, Parameters.UniProtXmlPath, xmlsToUse, fusionProteins);
}