/// <summary>
/// Aligns the configured fastq files. RNA-Seq experiments get a STAR genome index followed by
/// <c>TwoPassAlignment</c>; every other experiment type is trimmed with Skewer, aligned with
/// Bowtie2, and post-processed through <c>GATKWrapper.PrepareBamAndFasta</c>.
/// </summary>
/// <remarks>
/// The Bowtie2 path appends to <c>FastqsForAlignment</c>, <c>SortedBamFiles</c> and
/// <c>DedupedBamFiles</c>, and blocks until the generated bash script has exited.
/// </remarks>
public void PerformAlignment()
{
    // STAR is capped at 18 threads: more than that throws a segmentation fault while sorting the BAM files.
    // The trouble is the number of open files used for sorting, which increases with the thread count;
    // 18 is the max that works with the default max number of open files.
    const int maxStarThreads = 18;
    int cappedStarThreads = Math.Min(maxStarThreads, Parameters.Threads);

    if (Parameters.ExperimentType == ExperimentType.RNASequencing)
    {
        // Alignment preparation: build the STAR genome index and wait for it to finish.
        var genomeGenerateScript = WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "GenomeGenerate.bash");
        var genomeIndexCommands = STARWrapper.GenerateGenomeIndex(
            Parameters.SpritzDirectory,
            Parameters.Threads,
            Parameters.GenomeStarIndexDirectory,
            new string[] { Parameters.ReorderedFastaPath },
            Parameters.GeneModelGtfOrGffPath,
            Parameters.Fastqs);
        WrapperUtility.GenerateAndRunScript(genomeGenerateScript, genomeIndexCommands).WaitForExit();

        TwoPassAlignment(cappedStarThreads, Parameters.OverwriteStarAlignment);
        return;
    }

    // Non-RNA-Seq path: trim every fastq set first.
    // NOTE(review): the literal 19 and the 'false' flag are passed through unchanged; their meaning
    // is defined by SkewerWrapper.Trim, which is not visible here.
    foreach (string[] rawFastqs in Parameters.Fastqs)
    {
        SkewerWrapper.Trim(
            Parameters.SpritzDirectory,
            Parameters.AnalysisDirectory,
            Parameters.Threads,
            19,
            rawFastqs,
            false,
            out string[] trimmedSet,
            out string unusedSkewerLog);
        FastqsForAlignment.Add(trimmedSet);
    }

    TopHatWrapper.GenerateBowtieIndex(
        Parameters.SpritzDirectory,
        Parameters.AnalysisDirectory,
        Parameters.ReorderedFastaPath,
        out string indexPrefix);

    var bowtieScriptLines = new List<string>();
    bowtieScriptLines.Add("echo \"Aligning reads with bowtie2.\"");

    foreach (string[] readsToAlign in FastqsForAlignment)
    {
        // alignment
        var bowtieLines = TopHatWrapper.Bowtie2Align(
            Parameters.SpritzDirectory,
            Parameters.AnalysisDirectory,
            indexPrefix,
            Parameters.Threads,
            readsToAlign,
            Parameters.StrandSpecific,
            out string alignedSortedBam);
        bowtieScriptLines.AddRange(bowtieLines);
        bowtieScriptLines.Add(SamtoolsWrapper.IndexBamCommand(alignedSortedBam));

        // mark duplicates (a fresh wrapper per BAM, since the prepared path is read back from it)
        var bamPreparer = new GATKWrapper(1);
        var prepareLines = bamPreparer.PrepareBamAndFasta(
            Parameters.SpritzDirectory,
            Parameters.AnalysisDirectory,
            Parameters.Threads,
            alignedSortedBam,
            Parameters.ReorderedFastaPath,
            Parameters.Reference);
        bowtieScriptLines.AddRange(prepareLines);
        bowtieScriptLines.Add(SamtoolsWrapper.IndexBamCommand(bamPreparer.PreparedBamPath));

        SortedBamFiles.Add(alignedSortedBam);
        DedupedBamFiles.Add(bamPreparer.PreparedBamPath);
    }

    var bowtieScriptPath = WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "BowtieAlignment.bash");
    WrapperUtility.GenerateAndRunScript(bowtieScriptPath, bowtieScriptLines).WaitForExit();
}
/// <summary>
/// Refreshes the dbSNP VCF, gene model and genome fasta text boxes for the Ensembl reference
/// currently entered in <c>txtEnsemblReference</c>. Each box receives the path reported by the
/// wrapper when that file exists on disk, and the result of <c>TrimQuotesOrNull(null)</c> otherwise.
/// </summary>
/// <remarks>
/// If <c>AnalysisDirectory</c> is non-empty but does not exist, a message box is shown and the method
/// returns after updating only the dbSNP box.
/// </remarks>
private void UpdateReference()
{
    // Does dbSNP vcf already exist?
    var knownSitesWrapper = new GATKWrapper(1);
    string knownSitesVcf = knownSitesWrapper.DownloadEnsemblKnownVariantSites(
        EverythingRunnerEngine.SpritzDirectory,
        true,
        txtEnsemblReference.Text,
        true);
    txtDbsnpVcfReference.Text = File.Exists(knownSitesVcf)
        ? knownSitesVcf
        : TrimQuotesOrNull(null);

    // Does gene model already exist? Bail out first if a specified analysis directory is missing.
    bool analysisDirectoryMissing = !string.IsNullOrEmpty(AnalysisDirectory) && !Directory.Exists(AnalysisDirectory);
    if (analysisDirectoryMissing)
    {
        MessageBox.Show("Analysis directory does not exist.", "Workflow", MessageBoxButton.OK);
        return;
    }

    var downloads = new EnsemblDownloadsWrapper();
    downloads.DownloadReferences(
        EverythingRunnerEngine.SpritzDirectory,
        EverythingRunnerEngine.SpritzDirectory,
        txtEnsemblReference.Text,
        true);

    // The GFF3 gene model is preferred over the GTF one when both exist.
    txtGeneModelGtfOrGff.Text = File.Exists(downloads.Gff3GeneModelPath)
        ? downloads.Gff3GeneModelPath
        : File.Exists(downloads.GtfGeneModelPath)
            ? downloads.GtfGeneModelPath
            : TrimQuotesOrNull(null);

    // Does genome reference already exist?
    txtGenomeFasta.Text = File.Exists(downloads.GenomeFastaPath)
        ? downloads.GenomeFastaPath
        : TrimQuotesOrNull(null);
}
// Disabled together with the GVCF-combining step at the end of CallVariants:
//public string CombinedGatkGvcfFilePath { get; private set; }
//public string CombinedGatkVcfFilePath { get; private set; }
//public string CombinedGatkFilteredVcfFilePath { get; private set; }
//public string CombinedAnnotatedVcfFilePath { get; private set; }
//public string CombinedSnpEffHtmlFilePath { get; private set; }
//public string CombinedAnnotatedGenesSummaryPath { get; private set; }
//public string CombinedAnnotatedProteinFastaPath { get; private set; }
//public string CombinedAnnotatedProteinXmlPath { get; private set; }

/// <summary>
/// Writes one variant-calling bash script per deduplicated BAM file (GATK, optionally Scalpel,
/// then SnpEff annotation) and runs those scripts as background jobs in batches of up to
/// <paramref name="workers"/>, blocking until the runner script has exited.
/// </summary>
/// <remarks>
/// The output paths reported by each wrapper are appended to this instance's result lists
/// (<c>GatkGvcfFilePaths</c>, <c>GatkVcfFilePaths</c>, <c>GatkFilteredVcfFilePaths</c>,
/// <c>ScalpelVcfFilePaths</c>, <c>ScalpelFilteredVcfFilePaths</c>, <c>CombinedVcfFilePaths</c>,
/// <c>CombinedSortedVcfFilePaths</c> and the <c>CombinedAnnotated*</c> / <c>CombinedSnpEffHtmlFilePaths</c> lists).
/// </remarks>
/// <param name="spritzDirectory">Directory passed to every wrapper; also combined with <paramref name="ensemblKnownSitesPath"/> for the GATK variant-calling step.</param>
/// <param name="analysisDirectory">Directory in which the generated bash scripts are placed.</param>
/// <param name="experimentType">For <c>ExperimentType.RNASequencing</c>, a SplitNCigarReads step is added before base recalibration.</param>
/// <param name="reference">Reference name handed to SnpEff.</param>
/// <param name="threads">Total threads available; divided across <paramref name="workers"/>, with at least one thread per script.</param>
/// <param name="sortedBed12Path">BED12 file handed to Scalpel.</param>
/// <param name="ensemblKnownSitesPath">Known variant sites file used for base recalibration and variant calling.</param>
/// <param name="dedupedBamFiles">BAM files to call variants on; one script is generated per entry.</param>
/// <param name="reorderedFastaPath">Genome fasta handed to GATK and Scalpel.</param>
/// <param name="genome">Not used by this method body.</param>
/// <param name="quickSnpEff">Not used by this method body (only referenced in the disabled merge step).</param>
/// <param name="indelFinder">"scalpel" adds Scalpel indel calling and concatenation; "gatk" annotates the unfiltered HaplotypeCaller VCF; anything else (including null) annotates the filtered VCF. Case-insensitive.</param>
/// <param name="workers">Number of scripts run concurrently. Values below 1 are treated as 1 for scheduling; the wrappers still receive the value as given.</param>
public void CallVariants(string spritzDirectory, string analysisDirectory, ExperimentType experimentType, string reference, int threads, string sortedBed12Path, string ensemblKnownSitesPath, List<string> dedupedBamFiles, string reorderedFastaPath, Genome genome, bool quickSnpEff, string indelFinder, int workers)
{
    // Scheduling arithmetic is loop-invariant, so compute it once.
    // Clamp to at least one concurrent script so neither the division below nor the modulo in the
    // runner loop can divide by zero.
    int concurrentScripts = Math.Max(1, workers);

    // Every script gets at least one thread. (The previous "x = x == 0 ? x++ : x" assigned the
    // pre-increment value back, leaving the count at 0 whenever threads < workers.)
    int workerThreads = Math.Max(1, threads / concurrentScripts);

    // Static string.Equals tolerates a null indelFinder, which then falls through to the default branch.
    bool useScalpel = string.Equals(indelFinder, "scalpel", System.StringComparison.InvariantCultureIgnoreCase);
    bool useGatkIndels = string.Equals(indelFinder, "gatk", System.StringComparison.InvariantCultureIgnoreCase);

    // Generate scripts for each BAM file
    List<string> variantCallingBashScripts = new List<string>();
    List<SnpEffWrapper> snpeffs = new List<SnpEffWrapper>();
    foreach (string dedupedBam in dedupedBamFiles)
    {
        List<string> variantCallingCommands = new List<string>();

        // GATK
        var gatk = new GATKWrapper(workers);
        if (experimentType == ExperimentType.RNASequencing)
        {
            variantCallingCommands.AddRange(gatk.SplitNCigarReads(spritzDirectory, reorderedFastaPath, dedupedBam));
            variantCallingCommands.AddRange(gatk.BaseRecalibration(spritzDirectory, analysisDirectory, reorderedFastaPath, gatk.SplitTrimBamPath, ensemblKnownSitesPath));
        }
        else
        {
            variantCallingCommands.AddRange(gatk.BaseRecalibration(spritzDirectory, analysisDirectory, reorderedFastaPath, dedupedBam, ensemblKnownSitesPath));
        }
        variantCallingCommands.AddRange(gatk.VariantCalling(spritzDirectory, experimentType, workerThreads, reorderedFastaPath, gatk.RecalibratedBamPath, Path.Combine(spritzDirectory, ensemblKnownSitesPath)));
        GatkGvcfFilePaths.Add(gatk.HaplotypeCallerGvcfPath);
        GatkVcfFilePaths.Add(gatk.HaplotypeCallerVcfPath);
        GatkFilteredVcfFilePaths.Add(gatk.FilteredHaplotypeCallerVcfPath);

        // Scalpel
        var scalpel = new ScalpelWrapper();
        if (useScalpel)
        {
            string scalpelOutputDirectory = Path.Combine(Path.GetDirectoryName(dedupedBam), Path.GetFileNameWithoutExtension(dedupedBam) + "_scalpelOut");
            variantCallingCommands.AddRange(scalpel.CallIndels(spritzDirectory, workerThreads, reorderedFastaPath, sortedBed12Path, dedupedBam, scalpelOutputDirectory));
            ScalpelVcfFilePaths.Add(scalpel.IndelVcfPath);
            ScalpelFilteredVcfFilePaths.Add(scalpel.FilteredIndelVcfPath);
        }

        // Combine & Annotate
        var vcftools = new VcfToolsWrapper();
        var snpEff = new SnpEffWrapper(workers);
        var outprefix = Path.Combine(Path.GetDirectoryName(gatk.RecalibratedBamPath), Path.GetFileNameWithoutExtension(gatk.RecalibratedBamPath));
        if (useScalpel)
        {
            variantCallingCommands.Add(vcftools.Concatenate(spritzDirectory, new string[] { gatk.FilteredHaplotypeCallerVcfPath, scalpel.FilteredIndelVcfPath }, outprefix));
            variantCallingCommands.AddRange(gatk.SortVCF(spritzDirectory, analysisDirectory, vcftools.VcfConcatenatedPath, reorderedFastaPath));
            CombinedVcfFilePaths.Add(vcftools.VcfConcatenatedPath);
            CombinedSortedVcfFilePaths.Add(gatk.SortedVcfPath);
            variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.SortedVcfPath));
        }
        else if (useGatkIndels)
        {
            variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.HaplotypeCallerVcfPath));
        }
        else
        {
            variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.FilteredHaplotypeCallerVcfPath));
        }
        CombinedAnnotatedVcfFilePaths.Add(snpEff.AnnotatedVcfPath);
        CombinedSnpEffHtmlFilePaths.Add(snpEff.HtmlReportPath);
        CombinedAnnotatedProteinFastaPaths.Add(snpEff.VariantProteinFastaPath);
        CombinedAnnotatedProteinXmlPaths.Add(snpEff.VariantProteinXmlPath);
        snpeffs.Add(snpEff);

        // NOTE(review): the per-BAM script name is derived from string.GetHashCode(), which can collide
        // and is not guaranteed stable across processes; kept as-is so existing script names do not change.
        string littleScriptName = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, $"VariantCalling{dedupedBam.GetHashCode()}.bash");
        WrapperUtility.GenerateScript(littleScriptName, variantCallingCommands);
        variantCallingBashScripts.Add(littleScriptName);
    }

    // Run the scripts in parallel
    string scriptName = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "VariantCalling.bash");
    List<string> runnerCommands = new List<string>();
    List<int> runners = new List<int>();
    for (int i = 1; i <= variantCallingBashScripts.Count; i++)
    {
        string bamPath = dedupedBamFiles[i - 1];
        string scriptPath = variantCallingBashScripts[i - 1];
        string logPath = Path.Combine(Path.GetDirectoryName(bamPath), Path.GetFileNameWithoutExtension(bamPath) + ".variantCalling.log");
        string convertedLogPath = WrapperUtility.ConvertWindowsPath(logPath);
        string displayLogPath = convertedLogPath.Trim('"'); // quotes are dropped only for the echo message

        // runs in parallel unless it's spawning enough workers or at the end of the line
        bool waitForWorkersToFinish = i % concurrentScripts == 0 || i == variantCallingBashScripts.Count;

        runnerCommands.Add($"echo \"Running {scriptPath} in the background. See {displayLogPath} for output.\"");
        runnerCommands.Add($"bash {WrapperUtility.ConvertWindowsPath(scriptPath)} &> {convertedLogPath} &");
        runnerCommands.Add($"proc{i}=$!");
        runners.Add(i);

        if (waitForWorkersToFinish)
        {
            // Block on every job started in this batch before launching the next batch.
            foreach (int runner in runners)
            {
                runnerCommands.Add($"wait $proc{runner}");
            }
            runners.Clear();
        }
    }
    WrapperUtility.GenerateAndRunScript(scriptName, runnerCommands).WaitForExit();

    // Combine GVCFs and make a final database
    // This doesn't work because CombineGVCFs doesn't handle MNPs... MergeVcfs doesn't actually decide anything about overlapping variants...
    // https://github.com/broadinstitute/gatk/issues/1385
    // Note: I'm not going to figure out how to merge scalpel VCFs just yet until I find out whether it does any better than GATK...
    //if (GatkVcfFilePaths.Count > 1 && !indelFinder.Equals("scalpel", System.StringComparison.InvariantCultureIgnoreCase))
    //{
    //    var gatk = new GATKWrapper();
    //    var snpEff = new SnpEffWrapper();
    //    variantCallingCommands.AddRange(gatk.CombineAndGenotypeGvcfs(spritzDirectory, reorderedFastaPath, GatkVcfFilePaths));
    //    if (indelFinder.Equals("gatk", System.StringComparison.InvariantCultureIgnoreCase))
    //    {
    //        variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.HaplotypeCallerVcfPath, quickSnpEff));
    //    }
    //    else
    //    {
    //        variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.FilteredHaplotypeCallerVcfPath, quickSnpEff));
    //    }
    //    CombinedGatkGvcfFilePath = gatk.HaplotypeCallerGvcfPath;
    //    CombinedGatkVcfFilePath = gatk.HaplotypeCallerVcfPath;
    //    CombinedAnnotatedVcfFilePath = snpEff.AnnotatedVcfPath;
    //    CombinedSnpEffHtmlFilePath = snpEff.HtmlReportPath;
    //    CombinedAnnotatedProteinFastaPath = snpEff.VariantProteinFastaPath;
    //    CombinedAnnotatedProteinXmlPath = snpEff.VariantProteinXmlPath;
    //}
}