Beispiel #1
0
        //public string CombinedGatkGvcfFilePath { get; private set; }
        //public string CombinedGatkVcfFilePath { get; private set; }
        //public string CombinedGatkFilteredVcfFilePath { get; private set; }
        //public string CombinedAnnotatedVcfFilePath { get; private set; }
        //public string CombinedSnpEffHtmlFilePath { get; private set; }
        //public string CombinedAnnotatedGenesSummaryPath { get; private set; }
        //public string CombinedAnnotatedProteinFastaPath { get; private set; }
        //public string CombinedAnnotatedProteinXmlPath { get; private set; }

        /// <summary>
        /// Builds one variant-calling bash script per deduplicated BAM file (GATK, optionally Scalpel,
        /// then SnpEff annotation), runs those scripts in background batches of at most
        /// <paramref name="workers"/> at a time, and records the expected output paths in this
        /// object's path collections.
        /// </summary>
        /// <param name="spritzDirectory">Directory handed to every tool wrapper invoked here.</param>
        /// <param name="analysisDirectory">Directory in which the generated bash scripts are placed.</param>
        /// <param name="experimentType">When RNA sequencing, SplitNCigarReads is run before base recalibration.</param>
        /// <param name="reference">Reference name passed to SnpEff for annotation.</param>
        /// <param name="threads">Total thread budget; it is divided evenly among the workers.</param>
        /// <param name="sortedBed12Path">BED12 file passed to Scalpel when Scalpel is the indel finder.</param>
        /// <param name="ensemblKnownSitesPath">Known-sites file passed to GATK base recalibration and variant calling.</param>
        /// <param name="dedupedBamFiles">BAM files to call variants on; one script is generated for each.</param>
        /// <param name="reorderedFastaPath">Genome FASTA passed to GATK and Scalpel.</param>
        /// <param name="genome">Currently unused by this method.</param>
        /// <param name="quickSnpEff">Currently unused by this method.</param>
        /// <param name="indelFinder">"scalpel" or "gatk" (case-insensitive); any other value, including null, annotates the filtered GATK VCF.</param>
        /// <param name="workers">Number of scripts to run concurrently; values below one are treated as one.</param>
        public void CallVariants(string spritzDirectory, string analysisDirectory, ExperimentType experimentType, string reference, int threads, string sortedBed12Path, string ensemblKnownSitesPath,
            List<string> dedupedBamFiles, string reorderedFastaPath, Genome genome, bool quickSnpEff, string indelFinder, int workers)
        {
            // A non-positive worker count would divide by zero below, so batch with at least one worker.
            int workerCount = Math.Max(1, workers);

            // Threads given to each concurrently running script. The previous expression
            // (`workerThreads == 0 ? workerThreads++ : workerThreads`) assigned the pre-increment
            // value back, leaving the count at zero whenever threads < workers.
            int workerThreads = Math.Max(1, threads / workerCount);

            // The static Equals tolerates a null indelFinder, which then falls through to the default branch.
            bool useScalpel = string.Equals(indelFinder, "scalpel", System.StringComparison.InvariantCultureIgnoreCase);
            bool useGatkIndels = string.Equals(indelFinder, "gatk", System.StringComparison.InvariantCultureIgnoreCase);

            // Generate scripts for each BAM file
            List<string> variantCallingBashScripts = new List<string>();
            foreach (string dedupedBam in dedupedBamFiles)
            {
                List<string> variantCallingCommands = new List<string>();

                // GATK
                var gatk = new GATKWrapper(workers);
                if (experimentType == ExperimentType.RNASequencing)
                {
                    variantCallingCommands.AddRange(gatk.SplitNCigarReads(spritzDirectory, reorderedFastaPath, dedupedBam));
                    variantCallingCommands.AddRange(gatk.BaseRecalibration(spritzDirectory, analysisDirectory, reorderedFastaPath, gatk.SplitTrimBamPath, ensemblKnownSitesPath));
                }
                else
                {
                    variantCallingCommands.AddRange(gatk.BaseRecalibration(spritzDirectory, analysisDirectory, reorderedFastaPath, dedupedBam, ensemblKnownSitesPath));
                }
                variantCallingCommands.AddRange(gatk.VariantCalling(spritzDirectory, experimentType, workerThreads, reorderedFastaPath, gatk.RecalibratedBamPath, Path.Combine(spritzDirectory, ensemblKnownSitesPath)));
                GatkGvcfFilePaths.Add(gatk.HaplotypeCallerGvcfPath);
                GatkVcfFilePaths.Add(gatk.HaplotypeCallerVcfPath);
                GatkFilteredVcfFilePaths.Add(gatk.FilteredHaplotypeCallerVcfPath);

                // Scalpel
                var scalpel = new ScalpelWrapper();
                if (useScalpel)
                {
                    string scalpelOutputDirectory = Path.Combine(Path.GetDirectoryName(dedupedBam), Path.GetFileNameWithoutExtension(dedupedBam) + "_scalpelOut");
                    variantCallingCommands.AddRange(scalpel.CallIndels(spritzDirectory, workerThreads, reorderedFastaPath, sortedBed12Path, dedupedBam, scalpelOutputDirectory));
                    ScalpelVcfFilePaths.Add(scalpel.IndelVcfPath);
                    ScalpelFilteredVcfFilePaths.Add(scalpel.FilteredIndelVcfPath);
                }

                // Combine & Annotate
                var vcftools = new VcfToolsWrapper();
                var snpEff = new SnpEffWrapper(workers);
                var outprefix = Path.Combine(Path.GetDirectoryName(gatk.RecalibratedBamPath), Path.GetFileNameWithoutExtension(gatk.RecalibratedBamPath));
                if (useScalpel)
                {
                    // Concatenate the GATK and Scalpel calls, sort them, and annotate the sorted result.
                    variantCallingCommands.Add(vcftools.Concatenate(spritzDirectory, new string[] { gatk.FilteredHaplotypeCallerVcfPath, scalpel.FilteredIndelVcfPath }, outprefix));
                    variantCallingCommands.AddRange(gatk.SortVCF(spritzDirectory, analysisDirectory, vcftools.VcfConcatenatedPath, reorderedFastaPath));
                    CombinedVcfFilePaths.Add(vcftools.VcfConcatenatedPath);
                    CombinedSortedVcfFilePaths.Add(gatk.SortedVcfPath);
                    variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.SortedVcfPath));
                }
                else if (useGatkIndels)
                {
                    variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.HaplotypeCallerVcfPath));
                }
                else
                {
                    variantCallingCommands.AddRange(snpEff.PrimaryVariantAnnotation(spritzDirectory, reference, gatk.FilteredHaplotypeCallerVcfPath));
                }
                CombinedAnnotatedVcfFilePaths.Add(snpEff.AnnotatedVcfPath);
                CombinedSnpEffHtmlFilePaths.Add(snpEff.HtmlReportPath);
                CombinedAnnotatedProteinFastaPaths.Add(snpEff.VariantProteinFastaPath);
                CombinedAnnotatedProteinXmlPaths.Add(snpEff.VariantProteinXmlPath);

                // NOTE(review): the script name is derived from string.GetHashCode(), which is not
                // guaranteed to be stable across processes or runtimes, so names may differ between runs.
                string littleScriptName = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, $"VariantCalling{dedupedBam.GetHashCode()}.bash");
                WrapperUtility.GenerateScript(littleScriptName, variantCallingCommands);
                variantCallingBashScripts.Add(littleScriptName);
            }

            // Run the scripts in parallel
            string scriptName = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "VariantCalling.bash");
            List<string> runnerCommands = new List<string>();
            List<int> runners = new List<int>();
            for (int index = 0; index < variantCallingBashScripts.Count; index++)
            {
                // Process numbers are 1-based so the generated shell variables read proc1, proc2, ...
                int procNumber = index + 1;
                string scriptPath = variantCallingBashScripts[index];
                string bamPath = dedupedBamFiles[index];
                string logPath = Path.Combine(Path.GetDirectoryName(bamPath), Path.GetFileNameWithoutExtension(bamPath) + ".variantCalling.log");
                string convertedLogPath = WrapperUtility.ConvertWindowsPath(logPath);
                string displayLogPath = convertedLogPath.Trim('"');

                // runs in parallel unless it's spawning enough workers or at the end of the line
                bool waitForWorkersToFinish = procNumber % workerCount == 0 || procNumber == variantCallingBashScripts.Count;
                runnerCommands.Add($"echo \"Running {scriptPath} in the background. See {displayLogPath} for output.\"");
                runnerCommands.Add($"bash {WrapperUtility.ConvertWindowsPath(scriptPath)} &> {convertedLogPath} &");
                runnerCommands.Add($"proc{procNumber}=$!");
                runners.Add(procNumber);

                if (waitForWorkersToFinish)
                {
                    // Block until every script in the current batch has finished before starting the next batch.
                    foreach (int runner in runners)
                    {
                        runnerCommands.Add($"wait $proc{runner}");
                    }
                    runners.Clear();
                }
            }
            WrapperUtility.GenerateAndRunScript(scriptName, runnerCommands).WaitForExit();

            // Combine GVCFs and make a final database
            // This doesn't work because CombineGVCFs doesn't handle MNPs... MergeVcfs doesn't actually decide anything about overlapping variants...
            // https://github.com/broadinstitute/gatk/issues/1385
            // Note: merging Scalpel VCFs is deliberately not attempted until it is known whether Scalpel does any better than GATK.
            // The disabled implementation combined and genotyped the per-sample GVCFs with GATK, annotated the
            // result with SnpEff, and stored the paths in single "Combined*" properties.
        }
Beispiel #2
0
        /// <summary>
        /// Generate sample specific protein database starting with fastq files.
        /// </summary>
        /// <remarks>
        /// Steps, in order: prepare the Ensembl genome FASTA; align reads when FASTQs are supplied;
        /// filter the reference gene model and convert it to BED12; optionally build a merged gene
        /// model with CDS for transcript isoform analysis; prepare the SnpEff database or a protein
        /// XML from the gene model; optionally discover gene fusions; call variants when FASTQs are
        /// supplied and variant analysis is not skipped; finally transfer modifications from the
        /// UniProt XML and store the result in <c>VariantAnnotatedProteinXmlDatabases</c>.
        /// </remarks>
        public void GenerateSampleSpecificProteinDatabases()
        {
            // Download references and align reads
            Downloads.PrepareEnsemblGenomeFasta(Parameters.AnalysisDirectory, Parameters.GenomeFasta);
            if (Parameters.Fastqs != null)
            {
                Alignment.Parameters = new AlignmentParameters
                {
                    SpritzDirectory = Parameters.SpritzDirectory,
                    AnalysisDirectory = Parameters.AnalysisDirectory,
                    Reference = Parameters.Reference,
                    Threads = Parameters.Threads,
                    Fastqs = Parameters.Fastqs,
                    ExperimentType = Parameters.ExperimentType,
                    StrandSpecific = Parameters.StrandSpecific,
                    InferStrandSpecificity = Parameters.InferStrandSpecificity,
                    OverwriteStarAlignment = Parameters.OverwriteStarAlignment,
                    GenomeStarIndexDirectory = Parameters.GenomeStarIndexDirectory,
                    ReorderedFastaPath = Downloads.ReorderedFastaPath,
                    GeneModelGtfOrGffPath = Parameters.ReferenceGeneModelGtfOrGff,
                    UseReadSubset = Parameters.UseReadSubset,
                    ReadSubset = Parameters.ReadSubset
                };

                Alignment.PerformAlignment();
                Downloads.GetImportantProteinAccessions(Parameters.SpritzDirectory, Parameters.ProteinFastaPath);
            }
            EnsemblDownloadsWrapper.FilterGeneModel(Parameters.AnalysisDirectory, Parameters.ReferenceGeneModelGtfOrGff, Downloads.EnsemblGenome, out string filteredGeneModelForScalpel);
            string sortedBed12Path = BEDOPSWrapper.GffOrGtf2Bed12(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, filteredGeneModelForScalpel);
            GeneModel referenceGeneModel = new GeneModel(Downloads.EnsemblGenome, Parameters.ReferenceGeneModelGtfOrGff);

            // Default protein XML path next to the reference gene model; used if no fastqs are provided
            string referenceGeneModelProteinXml = Path.Combine(
                Path.GetDirectoryName(Parameters.ReferenceGeneModelGtfOrGff),
                Path.GetFileNameWithoutExtension(Parameters.ReferenceGeneModelGtfOrGff) + ".protein.xml");

            // Merge reference gene model and a new gene model (either specified or stringtie-generated)
            string newGeneModelPath = Parameters.NewGeneModelGtfOrGff;
            string mergedGeneModelWithCdsPath = null;
            string mergedGeneModelProteinXml = null;
            string reference = Parameters.Reference;

            if (Parameters.DoTranscriptIsoformAnalysis)
            {
                StringtieWrapper stringtie = new StringtieWrapper();
                if (newGeneModelPath == null)
                {
                    // No gene model was specified, so reconstruct transcripts from the sorted alignments.
                    stringtie.TranscriptReconstruction(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Threads, Parameters.ReferenceGeneModelGtfOrGff, Downloads.EnsemblGenome, Parameters.StrandSpecific, Parameters.InferStrandSpecificity, Alignment.SortedBamFiles, true);
                    newGeneModelPath = stringtie.FilteredMergedGtfPath;
                }
                else
                {
                    // A gene model was specified: convert its first column to Ensembl naming, then merge it with the reference model.
                    newGeneModelPath = EnsemblDownloadsWrapper.ConvertFirstColumnUCSC2Ensembl(Parameters.SpritzDirectory, Parameters.Reference, Parameters.NewGeneModelGtfOrGff);
                    string mergedGeneModelPath = Path.Combine(Path.GetDirectoryName(newGeneModelPath), Path.GetFileNameWithoutExtension(newGeneModelPath) + ".merged.gtf");
                    List<string> mergeCommands = StringtieWrapper.MergeTranscriptPredictions(
                        Parameters.SpritzDirectory,
                        Parameters.ReferenceGeneModelGtfOrGff,
                        new List<string> { newGeneModelPath },
                        mergedGeneModelPath);
                    WrapperUtility.GenerateAndRunScript(
                        WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "MergeTranscriptModels.bash"),
                        mergeCommands).WaitForExit();
                    newGeneModelPath = mergedGeneModelPath;
                }

                // Determine CDS from start codons of reference gene model
                // In the future, we could also try ORF finding to expand this (e.g. https://github.com/TransDecoder/TransDecoder/wiki)
                GeneModel newGeneModel = new GeneModel(Downloads.EnsemblGenome, newGeneModelPath);
                newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);

                mergedGeneModelWithCdsPath = Path.Combine(Path.GetDirectoryName(newGeneModelPath), Path.GetFileNameWithoutExtension(newGeneModelPath) + ".withcds.gtf");
                newGeneModel.PrintToGTF(mergedGeneModelWithCdsPath);
            }

            // SnpEff databases or outputting protein XMLs from gene models
            if (Parameters.DoTranscriptIsoformAnalysis) // isoform analysis, so generate a new snpeff database
            {
                reference = SnpEffWrapper.GenerateDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Downloads.ReorderedFastaPath, Parameters.ProteinFastaPath, mergedGeneModelWithCdsPath);

                if (Parameters.Fastqs == null || Parameters.SkipVariantAnalysis) // isoform analysis without variant analysis, so generate a protein database directly from merged gtf
                {
                    mergedGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, reference, mergedGeneModelWithCdsPath);
                }
            }
            else // no isoform analysis
            {
                new SnpEffWrapper(1).DownloadSnpEffDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference);

                // NOTE(review): unlike the isoform branch above, this does not generate an XML when fastqs are
                // provided but SkipVariantAnalysis is set; in that case the default path computed earlier is
                // used without being generated here. Confirm whether that is intended.
                if (Parameters.Fastqs == null) // no isoform analysis and no fastqs
                {
                    referenceGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference, Parameters.ReferenceGeneModelGtfOrGff);
                }
            }

            // Gene Fusion Discovery
            List<Protein> fusionProteins = new List<Protein>();

            if (Parameters.DoFusionAnalysis)
            {
                // Fusion.Parameters is not created here; presumably it is initialized elsewhere. TODO confirm.
                Fusion.Parameters.SpritzDirectory = Parameters.SpritzDirectory;
                Fusion.Parameters.AnalysisDirectory = Parameters.AnalysisDirectory;
                Fusion.Parameters.Reference = Parameters.Reference;
                Fusion.Parameters.Threads = Parameters.Threads;
                Fusion.Parameters.Fastqs = Parameters.Fastqs;
                Fusion.DiscoverGeneFusions();
                fusionProteins = Fusion.FusionProteins;
            }

            // Variant Calling
            if (Parameters.Fastqs != null && !Parameters.SkipVariantAnalysis)
            {
                VariantCalling.CallVariants(
                    Parameters.SpritzDirectory,
                    Parameters.AnalysisDirectory,
                    Parameters.ExperimentType,
                    reference,
                    Parameters.Threads,
                    sortedBed12Path,
                    Parameters.EnsemblKnownSitesPath,
                    Alignment.DedupedBamFiles,
                    Downloads.ReorderedFastaPath,
                    Downloads.EnsemblGenome,
                    Parameters.QuickSnpEffWithoutStats,
                    Parameters.IndelFinder,
                    Parameters.VariantCallingWorkers);
            }

            // Transfer features from UniProt
            List<string> xmlsToUse;
            if (VariantCalling.CombinedAnnotatedProteinXmlPaths.Count > 0)
            {
                // keep, since it might be useful for making a final database: .Concat(new[] { VariantCalling.CombinedAnnotatedProteinXmlPath }).ToList()
                xmlsToUse = VariantCalling.CombinedAnnotatedProteinXmlPaths;
            }
            else
            {
                // No variant-annotated XMLs were recorded, so fall back to the gene-model protein XML.
                xmlsToUse = new List<string>
                {
                    Parameters.DoTranscriptIsoformAnalysis ? mergedGeneModelProteinXml : referenceGeneModelProteinXml
                };
            }
            VariantAnnotatedProteinXmlDatabases = new TransferModificationsFlow().TransferModifications(Parameters.SpritzDirectory, Parameters.UniProtXmlPath, xmlsToUse, fusionProteins);
        }