示例#1
0
        /// <summary>
        /// Aligns the configured fastq files. RNA-Seq experiments get a STAR genome index
        /// followed by a two-pass STAR alignment; every other experiment type (WGS or exome
        /// sequencing) is trimmed with skewer and then aligned with Bowtie2.
        /// </summary>
        public void PerformAlignment()
        {
            // Capped at 18: more threads than that cause a segmentation fault while sorting the BAM files
            int cappedStarThreads = Math.Min(18, Parameters.Threads);

            if (Parameters.ExperimentType == ExperimentType.RNASequencing)
            {
                // Alignment preparation: write and run the genome index script
                var genomeGenerateScript = WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "GenomeGenerate.bash");
                var genomeGenerateCommands = STARWrapper.GenerateGenomeIndex(
                    Parameters.SpritzDirectory,
                    Parameters.Threads,
                    Parameters.GenomeStarIndexDirectory,
                    new string[] { Parameters.ReorderedFastaPath },
                    Parameters.GeneModelGtfOrGffPath,
                    Parameters.Fastqs);
                WrapperUtility.GenerateAndRunScript(genomeGenerateScript, genomeGenerateCommands).WaitForExit();

                // The number of open files needed for sorting grows with the thread count;
                // 18 is the most that works with the default limit on open files
                TwoPassAlignment(cappedStarThreads, Parameters.OverwriteStarAlignment);
                return;
            }

            // Trim every set of reads before alignment
            foreach (string[] rawFastqs in Parameters.Fastqs)
            {
                SkewerWrapper.Trim(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Threads, 19, rawFastqs, false, out string[] trimmed, out string _);
                FastqsForAlignment.Add(trimmed);
            }

            TopHatWrapper.GenerateBowtieIndex(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.ReorderedFastaPath, out string indexPrefix);

            var bowtieScriptLines = new List<string>();
            bowtieScriptLines.Add("echo \"Aligning reads with bowtie2.\"");

            foreach (string[] trimmedFastqs in FastqsForAlignment)
            {
                // Align, then index the sorted BAM
                var alignCommands = TopHatWrapper.Bowtie2Align(
                    Parameters.SpritzDirectory,
                    Parameters.AnalysisDirectory,
                    indexPrefix,
                    Parameters.Threads,
                    trimmedFastqs,
                    Parameters.StrandSpecific,
                    out string alignedSortedBam);
                bowtieScriptLines.AddRange(alignCommands);
                bowtieScriptLines.Add(SamtoolsWrapper.IndexBamCommand(alignedSortedBam));

                // Mark duplicates, then index the prepared BAM
                var duplicateMarker = new GATKWrapper(1);
                var prepareCommands = duplicateMarker.PrepareBamAndFasta(
                    Parameters.SpritzDirectory,
                    Parameters.AnalysisDirectory,
                    Parameters.Threads,
                    alignedSortedBam,
                    Parameters.ReorderedFastaPath,
                    Parameters.Reference);
                bowtieScriptLines.AddRange(prepareCommands);
                bowtieScriptLines.Add(SamtoolsWrapper.IndexBamCommand(duplicateMarker.PreparedBamPath));

                // Record the outputs for later steps
                SortedBamFiles.Add(alignedSortedBam);
                DedupedBamFiles.Add(duplicateMarker.PreparedBamPath);
            }

            var bowtieScriptPath = WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "BowtieAlignment.bash");
            WrapperUtility.GenerateAndRunScript(bowtieScriptPath, bowtieScriptLines).WaitForExit();
        }
示例#2
0
        /// <summary>
        /// Refreshes the reference text boxes (known-variant VCF, gene model, genome fasta)
        /// with the paths reported by the download wrappers for the selected Ensembl reference.
        /// A box is filled only when its file exists on disk; otherwise it receives the
        /// result of <c>TrimQuotesOrNull(null)</c>.
        /// </summary>
        private void UpdateReference()
        {
            // Known variant sites: use the VCF path only when the file is already present
            var knownSitesWrapper = new GATKWrapper(1);
            string knownSitesVcf = knownSitesWrapper.DownloadEnsemblKnownVariantSites(EverythingRunnerEngine.SpritzDirectory, true, txtEnsemblReference.Text, true);
            txtDbsnpVcfReference.Text = File.Exists(knownSitesVcf)
                ? knownSitesVcf
                : TrimQuotesOrNull(null);

            // A non-empty analysis directory must exist before the remaining fields are touched
            if (!string.IsNullOrEmpty(AnalysisDirectory) && !Directory.Exists(AnalysisDirectory))
            {
                MessageBox.Show("Analysis directory does not exist.", "Workflow", MessageBoxButton.OK);
                return;
            }

            var downloads = new EnsemblDownloadsWrapper();
            downloads.DownloadReferences(EverythingRunnerEngine.SpritzDirectory, EverythingRunnerEngine.SpritzDirectory, txtEnsemblReference.Text, true);

            // Gene model: the GFF3 file takes precedence over the GTF file
            if (File.Exists(downloads.Gff3GeneModelPath))
            {
                txtGeneModelGtfOrGff.Text = downloads.Gff3GeneModelPath;
            }
            else
            {
                txtGeneModelGtfOrGff.Text = File.Exists(downloads.GtfGeneModelPath)
                    ? downloads.GtfGeneModelPath
                    : TrimQuotesOrNull(null);
            }

            // Genome reference fasta
            txtGenomeFasta.Text = File.Exists(downloads.GenomeFastaPath)
                ? downloads.GenomeFastaPath
                : TrimQuotesOrNull(null);
        }
示例#3
0
        //public string CombinedGatkGvcfFilePath { get; private set; }
        //public string CombinedGatkVcfFilePath { get; private set; }
        //public string CombinedGatkFilteredVcfFilePath { get; private set; }
        //public string CombinedAnnotatedVcfFilePath { get; private set; }
        //public string CombinedSnpEffHtmlFilePath { get; private set; }
        //public string CombinedAnnotatedGenesSummaryPath { get; private set; }
        //public string CombinedAnnotatedProteinFastaPath { get; private set; }
        //public string CombinedAnnotatedProteinXmlPath { get; private set; }

        /// <summary>
        /// Writes one variant-calling bash script per deduplicated BAM file (GATK steps,
        /// optional Scalpel indel calling, then SnpEff annotation), and then runs those scripts
        /// in the background in batches of <paramref name="workers"/>, waiting for each batch
        /// to finish before starting the next. Output paths are appended to this instance's
        /// path lists as the scripts are generated.
        /// </summary>
        /// <param name="spritzDirectory">Directory passed to every wrapper as the Spritz location.</param>
        /// <param name="analysisDirectory">Directory used for the generated scripts and passed to the GATK wrapper.</param>
        /// <param name="experimentType">RNA-Seq input gets an extra SplitNCigarReads step before base recalibration.</param>
        /// <param name="reference">Reference name passed to the SnpEff annotation.</param>
        /// <param name="threads">Total thread budget; it is divided evenly between the workers.</param>
        /// <param name="sortedBed12Path">BED12 file passed to Scalpel.</param>
        /// <param name="ensemblKnownSitesPath">Known variant sites passed to base recalibration and variant calling.</param>
        /// <param name="dedupedBamFiles">BAM files to call variants on; one script is generated for each.</param>
        /// <param name="reorderedFastaPath">Genome fasta passed to the GATK and Scalpel wrappers.</param>
        /// <param name="genome">Not used by the current implementation.</param>
        /// <param name="quickSnpEff">Not used by the current implementation.</param>
        /// <param name="indelFinder">"scalpel" adds Scalpel indel calls; "gatk" annotates the unfiltered HaplotypeCaller VCF; anything else annotates the filtered one. Case-insensitive.</param>
        /// <param name="workers">Number of scripts run concurrently; values below 1 are treated as 1 when batching and splitting threads.</param>
        public void CallVariants(string spritzDirectory, string analysisDirectory, ExperimentType experimentType, string reference, int threads, string sortedBed12Path, string ensemblKnownSitesPath,
            List<string> dedupedBamFiles, string reorderedFastaPath, Genome genome, bool quickSnpEff, string indelFinder, int workers)
        {
            // Split the thread budget evenly between workers, with at least one thread each.
            // The earlier `workerThreads == 0 ? workerThreads++ : workerThreads` left the value at
            // zero, because postfix ++ yields the value from before the increment.
            // Clamping the worker count also avoids dividing by zero here and in the batching below.
            int workerCount = Math.Max(1, workers);
            int workerThreads = Math.Max(1, threads / workerCount);

            // Generate scripts for each BAM file
            List<string> variantCallingBashScripts = new List<string>();
            foreach (string dedupedBam in dedupedBamFiles)
            {
                List<string> variantCallingCommands = new List<string>();

                // GATK
                var gatk = new GATKWrapper(workers);
                if (experimentType == ExperimentType.RNASequencing)
                {
                    variantCallingCommands.AddRange(gatk.SplitNCigarReads(spritzDirectory, reorderedFastaPath, dedupedBam));
                    variantCallingCommands.AddRange(gatk.BaseRecalibration(spritzDirectory, analysisDirectory, reorderedFastaPath, gatk.SplitTrimBamPath, ensemblKnownSitesPath));
                }
                else
                {
                    variantCallingCommands.AddRange(gatk.BaseRecalibration(spritzDirectory, analysisDirectory, reorderedFastaPath, dedupedBam, ensemblKnownSitesPath));
                }
                variantCallingCommands.AddRange(gatk.VariantCalling(spritzDirectory, experimentType, workerThreads, reorderedFastaPath, gatk.RecalibratedBamPath, Path.Combine(spritzDirectory, ensemblKnownSitesPath)));
                GatkGvcfFilePaths.Add(gatk.HaplotypeCallerGvcfPath);
                GatkVcfFilePaths.Add(gatk.HaplotypeCallerVcfPath);
                GatkFilteredVcfFilePaths.Add(gatk.FilteredHaplotypeCallerVcfPath);

                // Scalpel
                var scalpel = new ScalpelWrapper();
                bool useScalpel = indelFinder.Equals("scalpel", System.StringComparison.InvariantCultureIgnoreCase);
                if (useScalpel)
                {
                    string scalpelOutputDirectory = Path.Combine(Path.GetDirectoryName(dedupedBam), Path.GetFileNameWithoutExtension(dedupedBam) + "_scalpelOut");
                    variantCallingCommands.AddRange(scalpel.CallIndels(spritzDirectory, workerThreads, reorderedFastaPath, sortedBed12Path, dedupedBam, scalpelOutputDirectory));
                    ScalpelVcfFilePaths.Add(scalpel.IndelVcfPath);
                    ScalpelFilteredVcfFilePaths.Add(scalpel.FilteredIndelVcfPath);
                }

                // Combine & Annotate
                var vcftools = new VcfToolsWrapper();
                var snpEff = new SnpEffWrapper(workers);
                var outprefix = Path.Combine(Path.GetDirectoryName(gatk.RecalibratedBamPath), Path.GetFileNameWithoutExtension(gatk.RecalibratedBamPath));
                if (useScalpel)
                {
                    // Merge the GATK and Scalpel calls, sort the result, and annotate the sorted VCF
                    variantCallingCommands.Add(vcftools.Concatenate(spritzDirectory, new string[] { gatk.FilteredHaplotypeCallerVcfPath, scalpel.FilteredIndelVcfPath }, outprefix));
                    variantCallingCommands.AddRange(gatk.SortVCF(spritzDirectory, analysisDirectory, vcftools.VcfConcatenatedPath, reorderedFastaPath));
                    CombinedVcfFilePaths.Add(vcftools.VcfConcatenatedPath);
                    CombinedSortedVcfFilePaths.Add(gatk.SortedVcfPath);
                    variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.SortedVcfPath));
                }
                else if (indelFinder.Equals("gatk", System.StringComparison.InvariantCultureIgnoreCase))
                {
                    variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.HaplotypeCallerVcfPath));
                }
                else
                {
                    variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.FilteredHaplotypeCallerVcfPath));
                }
                CombinedAnnotatedVcfFilePaths.Add(snpEff.AnnotatedVcfPath);
                CombinedSnpEffHtmlFilePaths.Add(snpEff.HtmlReportPath);
                CombinedAnnotatedProteinFastaPaths.Add(snpEff.VariantProteinFastaPath);
                CombinedAnnotatedProteinXmlPaths.Add(snpEff.VariantProteinXmlPath);

                // NOTE(review): string hash codes are not guaranteed to be stable between processes
                // or runtimes, so this script name may differ from run to run for the same BAM file.
                string littleScriptName = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, $"VariantCalling{dedupedBam.GetHashCode().ToString()}.bash");
                WrapperUtility.GenerateScript(littleScriptName, variantCallingCommands);
                variantCallingBashScripts.Add(littleScriptName);
            }

            // Run the scripts in parallel
            string scriptName = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "VariantCalling.bash");
            List<string> runnerCommands = new List<string>();
            List<int> runners = new List<int>();
            for (int i = 1; i <= variantCallingBashScripts.Count; i++)
            {
                // runs in parallel unless it's spawning enough workers or at the end of the line
                string logPath = Path.Combine(Path.GetDirectoryName(dedupedBamFiles[i - 1]), Path.GetFileNameWithoutExtension(dedupedBamFiles[i - 1]) + ".variantCalling.log");
                bool waitForWorkersToFinish = i % workerCount == 0 || i == variantCallingBashScripts.Count;
                runnerCommands.Add($"echo \"Running {variantCallingBashScripts[i - 1]} in the background. See {WrapperUtility.ConvertWindowsPath(logPath).Trim('"')} for output.\"");
                runnerCommands.Add($"bash {WrapperUtility.ConvertWindowsPath(variantCallingBashScripts[i - 1])} &> {WrapperUtility.ConvertWindowsPath(logPath)} &");
                runnerCommands.Add($"proc{i.ToString()}=$!");
                runners.Add(i);

                if (waitForWorkersToFinish)
                {
                    // Block on every background job started in this batch before launching more
                    runners.ForEach(r => runnerCommands.Add($"wait $proc{r.ToString()}"));
                    runners.Clear();
                }
            }
            WrapperUtility.GenerateAndRunScript(scriptName, runnerCommands).WaitForExit();

            // Combine GVCFs and make a final database
            // This doesn't work because CombineGVCFs doesn't handle MNPs... MergeVcfs doesn't actually decide anything about overlapping variants...
            // https://github.com/broadinstitute/gatk/issues/1385
            // Note: I'm not going to figure out how to merge scalpel VCFs just yet until I find out whether it does any better than GATK...
            //if (GatkVcfFilePaths.Count > 1 && !indelFinder.Equals("scalpel", System.StringComparison.InvariantCultureIgnoreCase))
            //{
            //    var gatk = new GATKWrapper();
            //    var snpEff = new SnpEffWrapper();
            //    variantCallingCommands.AddRange(gatk.CombineAndGenotypeGvcfs(spritzDirectory, reorderedFastaPath, GatkVcfFilePaths));
            //    if (indelFinder.Equals("gatk", System.StringComparison.InvariantCultureIgnoreCase))
            //    {
            //        variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.HaplotypeCallerVcfPath, quickSnpEff));
            //    }
            //    else
            //    {
            //        variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.FilteredHaplotypeCallerVcfPath, quickSnpEff));
            //    }
            //    CombinedGatkGvcfFilePath = gatk.HaplotypeCallerGvcfPath;
            //    CombinedGatkVcfFilePath = gatk.HaplotypeCallerVcfPath;
            //    CombinedAnnotatedVcfFilePath = snpEff.AnnotatedVcfPath;
            //    CombinedSnpEffHtmlFilePath = snpEff.HtmlReportPath;
            //    CombinedAnnotatedProteinFastaPath = snpEff.VariantProteinFastaPath;
            //    CombinedAnnotatedProteinXmlPath = snpEff.VariantProteinXmlPath;
            //}
        }