Exemple #1
0
        /// <summary>
        /// Builds the bash commands for Cufflinks transcript assembly of a BAM file. Note that fragment bias estimation (--frag-bias-correct) and multi-read rescuing (--multi-read-correct) are not used.
        /// These take a lot of time, and they only provide better abundance estimates, which we use RSEM for.
        /// </summary>
        /// <param name="spritzDirectory">Directory handed to <c>WrapperUtility.ChangeToToolsDirectoryCommand</c> for the first command.</param>
        /// <param name="analysisDirectory">Analysis directory used to resolve the CufflinksRun.bash script path.</param>
        /// <param name="threads">Value for --num-threads; also passed to the BAM sorting command.</param>
        /// <param name="bamPath">BAM file to assemble. It is sorted first when its header lacks an SO:coordinate entry.</param>
        /// <param name="geneModelGtfOrGffPath">Gene model passed to --GTF-guide and to the strandedness inference.</param>
        /// <param name="genome">Genome passed to <c>BAMProperties</c> when strandedness is inferred.</param>
        /// <param name="strandSpecific">Whether to treat the library as stranded when inference is not requested.</param>
        /// <param name="inferStrandSpecificity">When true, strandedness is taken from <c>BAMProperties</c> instead of <paramref name="strandSpecific"/>.</param>
        /// <param name="outputDirectory">Receives the Cufflinks output directory: the BAM path with its extension replaced by ".cufflinksOutput".</param>
        /// <returns>The bash command lines, in execution order.</returns>
        public static List <string> AssembleTranscripts(string spritzDirectory, string analysisDirectory, int threads, string bamPath, string geneModelGtfOrGffPath, Genome genome, bool strandSpecific, bool inferStrandSpecificity, out string outputDirectory)
        {
            bool isStranded = strandSpecific;

            if (inferStrandSpecificity)
            {
                // Any strandedness other than None is treated as a stranded library.
                BAMProperties bamProperties = new BAMProperties(bamPath, geneModelGtfOrGffPath, genome, 0.8);
                isStranded = bamProperties.Strandedness != Strandedness.None;
            }

            // Paths derived from the BAM location: <bam>.cufflinksSortCheck, <bam>.cufflinksOutput and <bam>.sorted.bam
            string bamDirectory    = Path.GetDirectoryName(bamPath);
            string bamBaseName     = Path.GetFileNameWithoutExtension(bamPath);
            string sortedCheckPath = Path.Combine(bamDirectory, bamBaseName + ".cufflinksSortCheck");

            outputDirectory = Path.Combine(bamDirectory, bamBaseName + ".cufflinksOutput");

            // NOTE(review): this script path is computed but never used; the call is kept in case the helper has side effects -- confirm and remove.
            string script_name = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "CufflinksRun.bash");

            string convertedBamPath     = WrapperUtility.ConvertWindowsPath(bamPath);
            string convertedSortedCheck = WrapperUtility.ConvertWindowsPath(sortedCheckPath);
            string convertedSortedBam   = WrapperUtility.ConvertWindowsPath(Path.Combine(bamDirectory, bamBaseName + ".sorted.bam"));
            string convertedTranscripts = WrapperUtility.ConvertWindowsPath(Path.Combine(outputDirectory, TranscriptsFilename));

            return(new List <string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

                // The check file stays empty unless the BAM header declares coordinate sorting.
                "samtools view -H " + convertedBamPath + " | grep SO:coordinate > " + convertedSortedCheck,
                "if [ ! -s " + convertedSortedCheck + " ]; then " + SamtoolsWrapper.SortBam(bamPath, threads) + "; fi",

                // Use the original BAM when it was already sorted, otherwise the freshly sorted copy.
                "bam=" + convertedBamPath,
                "if [ ! -s " + convertedSortedCheck + " ]; then bam=" + convertedSortedBam + "; fi",

                // Skip assembly when a non-empty transcripts file already exists.
                "if [[ ! -f " + convertedTranscripts + " || ! -s " + convertedTranscripts + " ]]; then " +
                "cufflinks-2.2.1/cufflinks " +
                " --num-threads " + threads.ToString() +
                " --GTF-guide " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath) +
                " --output-dir " + WrapperUtility.ConvertWindowsPath(outputDirectory) +
                // Leading space is required; without it the flag is glued onto the output directory path.
                (isStranded ? " --library-type fr-firststrand" : "") +
                " $bam" +
                "; fi",
            });
        }
Exemple #2
0
        /// <summary>
        /// Builds the bash commands for StringTie transcript reconstruction of a BAM file.
        /// The BAM is coordinate-sorted first when its header lacks an SO:coordinate entry, and
        /// StringTie is only run when the output GTF is missing or empty.
        /// </summary>
        /// <param name="spritzDirectory">Directory handed to <c>WrapperUtility.ChangeToToolsDirectoryCommand</c> for the first command.</param>
        /// <param name="threads">Value for StringTie's -p option; also passed to the BAM sorting command.</param>
        /// <param name="bamPath">BAM file to reconstruct transcripts from.</param>
        /// <param name="geneModelGtfOrGffPath">Gene model passed to StringTie's -G option and to the strandedness inference.</param>
        /// <param name="genome">Genome passed to <c>BAMProperties</c> when strandedness is inferred.</param>
        /// <param name="strandSpecific">Strandedness to use when inference is not requested.</param>
        /// <param name="inferStrandSpecificity">When true, strandedness is taken from <c>BAMProperties</c> instead of <paramref name="strandSpecific"/>.</param>
        /// <param name="outputTranscriptGtfPath">Receives the output GTF path: the BAM path with its extension replaced by ".gtf".</param>
        /// <returns>The bash command lines, in execution order.</returns>
        public static List <string> AssembleTranscripts(string spritzDirectory, int threads, string bamPath, string geneModelGtfOrGffPath, Genome genome,
                                                        Strandedness strandSpecific, bool inferStrandSpecificity, out string outputTranscriptGtfPath)
        {
            Strandedness strandedness = strandSpecific;

            if (inferStrandSpecificity)
            {
                BAMProperties bamProperties = new BAMProperties(bamPath, geneModelGtfOrGffPath, genome, 0.8);
                strandedness = bamProperties.Strandedness;
            }

            // Paths derived from the BAM location: <bam>.cufflinksSortCheck, <bam>.gtf and <bam>.sorted.bam
            string bamDirectory    = Path.GetDirectoryName(bamPath);
            string bamBaseName     = Path.GetFileNameWithoutExtension(bamPath);
            string sortedCheckPath = Path.Combine(bamDirectory, bamBaseName + ".cufflinksSortCheck");

            outputTranscriptGtfPath = Path.Combine(bamDirectory, bamBaseName + ".gtf");

            string convertedBamPath     = WrapperUtility.ConvertWindowsPath(bamPath);
            string convertedSortedCheck = WrapperUtility.ConvertWindowsPath(sortedCheckPath);
            string convertedSortedBam   = WrapperUtility.ConvertWindowsPath(Path.Combine(bamDirectory, bamBaseName + ".sorted.bam"));
            string convertedOutputGtf   = WrapperUtility.ConvertWindowsPath(outputTranscriptGtfPath);

            // No flag for unstranded data; Forward maps to --fr and anything else to --rf.
            // The leading space is required; without it the flag is glued onto the -o output path.
            string strandOption = strandedness == Strandedness.None ? "" : strandedness == Strandedness.Forward ? " --fr" : " --rf";

            return(new List <string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

                // The check file stays empty unless the BAM header declares coordinate sorting.
                "samtools view -H " + convertedBamPath + " | grep SO:coordinate > " + convertedSortedCheck,
                "if [ ! -s " + convertedSortedCheck + " ]; then " + SamtoolsWrapper.SortBam(bamPath, threads) + "; fi",

                // Use the original BAM when it was already sorted, otherwise the freshly sorted copy.
                "bam=" + convertedBamPath,
                "if [ ! -s " + convertedSortedCheck + " ]; then bam=" + convertedSortedBam + "; fi",

                // Skip reconstruction when a non-empty output GTF already exists.
                "if [[ ! -f " + convertedOutputGtf + " || ! -s " + convertedOutputGtf + " ]]; then",
                "  echo \"Performing stringtie transcript reconstruction on " + bamPath + "\"",
                "  stringtie $bam " +
                " -p " + threads.ToString() +
                " -G " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath) +
                " -o " + convertedOutputGtf +
                strandOption,
                "fi",
            });
        }
        // Checks that BAMProperties reports an unstranded, single-end library for the
        // unstrandedSingle202122.bam fixture in the TestData folder.
        public void BAMPropertiesStrandSpecificityTest()
        {
            // All fixtures live in the TestData folder under the test directory.
            string testDataFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData");
            string alignmentPath  = Path.Combine(testDataFolder, "unstrandedSingle202122.bam");
            string geneModelPath  = Path.Combine(testDataFolder, "202122.gtf");
            Genome fixtureGenome  = new Genome(Path.Combine(testDataFolder, "202122.fa"));

            BAMProperties properties = new BAMProperties(alignmentPath, geneModelPath, fixtureGenome, 0.8);

            Assert.AreEqual(Strandedness.None, properties.Strandedness);
            Assert.AreEqual(RnaSeqProtocol.SingleEnd, properties.Protocol);
        }
Exemple #4
0
        /// <summary>
        /// Infers the strandedness of reads by aligning a subset of them and inspecting the resulting BAM file.
        /// Runs three steps in sequence, waiting for each script to exit: genome index generation,
        /// FASTQ subsetting, and alignment of the subset.
        /// </summary>
        /// <param name="spritzDirectory">Spritz directory forwarded to the STAR wrapper commands.</param>
        /// <param name="analysisDirectory">Directory where the generated scripts and the subset FASTQs are placed.</param>
        /// <param name="threads">Thread count forwarded to index generation and alignment.</param>
        /// <param name="fastqPaths">FASTQ files of one sample to subset and align.</param>
        /// <param name="genomeStarIndexDirectory">STAR genome index directory to generate and align against.</param>
        /// <param name="reorderedFasta">Genome FASTA used for index generation and for constructing the <c>Genome</c>.</param>
        /// <param name="geneModelGtfOrGff">Gene model used for index generation and for the BAM analysis.</param>
        /// <returns>The <c>BAMProperties</c> computed from the subset alignment.</returns>
        public static BAMProperties InferStrandedness(string spritzDirectory, string analysisDirectory, int threads, string[] fastqPaths, string genomeStarIndexDirectory,
                                                      string reorderedFasta, string geneModelGtfOrGff)
        {
            // Step 1: prepare the genome index for alignment
            string indexScriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "GenomeGenerate.bash");
            var indexCommands = STARWrapper.GenerateGenomeIndex(
                spritzDirectory,
                threads,
                genomeStarIndexDirectory,
                new string[] { reorderedFasta },
                geneModelGtfOrGff,
                new List <string[]> { fastqPaths });
            WrapperUtility.GenerateAndRunScript(indexScriptPath, indexCommands).WaitForExit();

            // Step 2: take a subset of the reads (30000 is passed as the subset size argument)
            STARWrapper.SubsetFastqs(spritzDirectory, analysisDirectory, fastqPaths, 30000, analysisDirectory, out string[] subsetFastqs);

            // The alignment output prefix is the first subset FASTQ path without its extension
            string firstSubsetFastq = subsetFastqs[0];
            string subsetOutPrefix  = Path.Combine(Path.GetDirectoryName(firstSubsetFastq), Path.GetFileNameWithoutExtension(firstSubsetFastq));

            // Step 3: align the subset
            string alignScriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "AlignSubset.bash");
            var alignCommands = STARWrapper.BasicAlignReadCommands(spritzDirectory, threads, genomeStarIndexDirectory, subsetFastqs, subsetOutPrefix, false, STARGenomeLoadOption.LoadAndKeep);
            WrapperUtility.GenerateAndRunScript(alignScriptPath, alignCommands).WaitForExit();

            // Analyze the subset BAM against the gene model
            string subsetBamPath = subsetOutPrefix + STARWrapper.BamFileSuffix;
            return new BAMProperties(subsetBamPath, geneModelGtfOrGff, new Genome(reorderedFasta), 0.8);
        }
Exemple #5
0
        /// <summary>
        /// Performs the bulk of two-pass alignments: per-sample strandedness inference and trimming,
        /// first-pass alignment, second-pass genome generation from the first-pass splice junctions,
        /// and second-pass alignment. All alignment commands are collected into one AlignReads.bash
        /// script, which is run to completion at the end.
        /// </summary>
        /// <param name="threads">Thread count forwarded to trimming, index generation and alignment.</param>
        /// <param name="overWriteStarAlignment">When true, first-pass commands are emitted even if the splice junction file already exists; also forwarded to the second-pass alignment.</param>
        private void TwoPassAlignment(int threads, bool overWriteStarAlignment)
        {
            // Trimming and strand specificity
            // The genome is loaded once here and shared by every strandedness inference below.
            Genome genome = new Genome(Parameters.ReorderedFastaPath);

            foreach (string[] fq in Parameters.Fastqs)
            {
                // Infer strand specificity before trimming because trimming can change read pairings
                string[] fqForAlignment      = fq;
                bool     localStrandSpecific = Parameters.StrandSpecific;
                if (Parameters.InferStrandSpecificity || Parameters.UseReadSubset)
                {
                    STARWrapper.SubsetFastqs(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, fqForAlignment,
                                             Parameters.ReadSubset, Parameters.AnalysisDirectory, out string[] subsetFastqs);
                    if (Parameters.UseReadSubset)
                    {
                        fqForAlignment = subsetFastqs;
                    }
                    if (Parameters.InferStrandSpecificity)
                    {
                        string subsetOutPrefix = Path.Combine(Path.GetDirectoryName(subsetFastqs[0]), Path.GetFileNameWithoutExtension(subsetFastqs[0]));
                        WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "AlignSubset.bash"),
                                                            STARWrapper.BasicAlignReadCommands(Parameters.SpritzDirectory, threads, Parameters.GenomeStarIndexDirectory, subsetFastqs, subsetOutPrefix, false, STARGenomeLoadOption.LoadAndKeep))
                        .WaitForExit();
                        BAMProperties bamProperties = new BAMProperties(subsetOutPrefix + STARWrapper.BamFileSuffix, Parameters.GeneModelGtfOrGffPath, genome, 0.8);
                        localStrandSpecific = bamProperties.Strandedness != Strandedness.None;
                    }
                }

                SkewerWrapper.Trim(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, threads, 19, fqForAlignment, false, out string[] trimmedFastqs, out string skewerLog);
                fqForAlignment = trimmedFastqs;

                // These two lists are appended in lockstep, so entry i of one belongs to entry i of the other.
                StrandSpecificities.Add(localStrandSpecific);
                FastqsForAlignment.Add(fqForAlignment);
            }

            // Alignment
            List <string> alignmentCommands = new List <string>();

            // First pass: index directly rather than searching the list for each sample.
            for (int i = 0; i < FastqsForAlignment.Count; i++)
            {
                string[] fq        = FastqsForAlignment[i];
                string   outPrefix = Path.Combine(Path.GetDirectoryName(fq[0]), Path.GetFileNameWithoutExtension(fq[0]));
                if (!File.Exists(outPrefix + STARWrapper.SpliceJunctionFileSuffix) || overWriteStarAlignment)
                {
                    alignmentCommands.AddRange(STARWrapper.FirstPassAlignmentCommands(Parameters.SpritzDirectory, threads, Parameters.GenomeStarIndexDirectory, fq, outPrefix, StrandSpecificities[i], STARGenomeLoadOption.LoadAndKeep));
                }
                FirstPassSpliceJunctions.Add(outPrefix + STARWrapper.SpliceJunctionFileSuffix);
            }

            // Derive a suffix for the second-pass genome directory from the set of input file paths.
            // NOTE(review): string.GetHashCode is not guaranteed to be stable across processes or runtimes,
            // so this suffix may differ between runs on the same inputs -- confirm whether that is acceptable.
            int uniqueSuffix = 1;

            foreach (string f in FastqsForAlignment.SelectMany(f => f))
            {
                uniqueSuffix = uniqueSuffix ^ f.GetHashCode();
            }
            alignmentCommands.AddRange(STARWrapper.RemoveGenome(Parameters.SpritzDirectory, Parameters.GenomeStarIndexDirectory));
            alignmentCommands.AddRange(STARWrapper.ProcessFirstPassSpliceCommands(FirstPassSpliceJunctions, uniqueSuffix, out string spliceJunctionStartDatabase));
            SecondPassGenomeDirectory = Parameters.GenomeStarIndexDirectory + "SecondPass" + uniqueSuffix.ToString();
            alignmentCommands.AddRange(STARWrapper.GenerateGenomeIndex(Parameters.SpritzDirectory, threads, SecondPassGenomeDirectory, new string[] { Parameters.ReorderedFastaPath }, Parameters.GeneModelGtfOrGffPath, Parameters.Fastqs, spliceJunctionStartDatabase));

            // Second pass: align against the regenerated genome and record the expected output files.
            for (int i = 0; i < FastqsForAlignment.Count; i++)
            {
                string[] fq        = FastqsForAlignment[i];
                string   outPrefix = Path.Combine(Path.GetDirectoryName(fq[0]), Path.GetFileNameWithoutExtension(fq[0]));
                OutputPrefixes.Add(outPrefix);
                alignmentCommands.AddRange(STARWrapper.AlignRNASeqReadsForVariantCalling(Parameters.SpritzDirectory, threads, SecondPassGenomeDirectory, fq, outPrefix, overWriteStarAlignment, StrandSpecificities[i], STARGenomeLoadOption.LoadAndKeep));
                SortedBamFiles.Add(outPrefix + STARWrapper.SortedBamFileSuffix);
                DedupedBamFiles.Add(outPrefix + STARWrapper.DedupedBamFileSuffix);
                ChimericSamFiles.Add(outPrefix + STARWrapper.ChimericSamFileSuffix);
                ChimericJunctionFiles.Add(outPrefix + STARWrapper.ChimericJunctionsFileSuffix);
            }
            alignmentCommands.AddRange(STARWrapper.RemoveGenome(Parameters.SpritzDirectory, SecondPassGenomeDirectory));
            WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "AlignReads.bash"), alignmentCommands).WaitForExit();
        }
Exemple #6
0
        /// <summary>
        /// Command-line entry point. Verifies the bash setup, handles tool installation,
        /// parses the options, resolves the input FASTQs (downloading SRAs when requested),
        /// and dispatches to the workflow selected by the command option.
        /// </summary>
        /// <param name="args">Raw command-line arguments.</param>
        /// <exception cref="FileNotFoundException">Thrown when the bash setup check fails.</exception>
        /// <exception cref="ArgumentException">Thrown when the command, experiment type, or required inputs are invalid.</exception>
        public static void Main(string[] args)
        {
            if (!WrapperUtility.CheckBashSetup())
            {
                throw new FileNotFoundException("The Windows Subsystem for Linux has not been enabled. Please see https://smith-chem-wisc.github.io/Spritz/ for more details.");
            }

            // main setup involves installing tools
            if (args.Contains(ManageToolsFlow.Command))
            {
                ManageToolsFlow.Install(Path.GetDirectoryName(Assembly.GetEntryAssembly().Location));
                return;
            }

            Parsed <Options> result = Parser.Default.ParseArguments <Options>(args) as Parsed <Options>;

            // Parsing failed: print usage guidance and wait for a key press before exiting
            if (result == null)
            {
                Console.WriteLine("Please use GUI.exe if you are a first time user of Spritz.");
                Console.WriteLine("It aims to guide you through setting up tools and running a workflow.");
                Console.WriteLine();
                Console.WriteLine("See above for commandline arguments for CMD.exe.");
                Console.WriteLine("    Required: -c for a command");
                Console.WriteLine("    1) Setting up tools: -c setup");
                Console.WriteLine("    2) Generating a protein database from ensembl: -c proteins");
                Console.WriteLine("    3) Analyzing variants: -c proteins");
                Console.WriteLine("          Also required: --fq1 (and --fq2 if paired-end) for FASTQ files that exist or -s to download an SRA (see https://www.ncbi.nlm.nih.gov/sra).");
                Console.WriteLine();
                Console.WriteLine("Press any key to exit.");
                Console.ReadKey();
                return;
            }

            Options options = result.Value;

            FinishSetup(options);

            // Download SRAs if they're specified
            bool            useSraMethod    = options.SraAccession != null && options.SraAccession.StartsWith("SR");
            List <string[]> fastqsSeparated = useSraMethod ?
                                              SRAToolkitWrapper.GetFastqsFromSras(options.SpritzDirectory, options.Threads, options.AnalysisDirectory, options.SraAccession) :
                                              SeparateFastqs(options.Fastq1, options.Fastq2);

            if (options.Command.Equals(SampleSpecificProteinDBFlow.Command, StringComparison.InvariantCultureIgnoreCase))
            {
                if (options.ReferenceVcf == null)
                {
                    options.ReferenceVcf = new GATKWrapper(1).DownloadEnsemblKnownVariantSites(options.SpritzDirectory, true, options.Reference, false);
                }

                if (options.UniProtXml == null)
                {
                    Console.WriteLine("Note: You can specify a UniProt XML file with the -x flag to transfer modifications and database references.");
                }

                // Parse the experiment type
                ExperimentType experimentType;
                if (options.ExperimentType == ExperimentType.RNASequencing.ToString())
                {
                    experimentType = ExperimentType.RNASequencing;
                }
                else if (options.ExperimentType == ExperimentType.WholeGenomeSequencing.ToString())
                {
                    experimentType = ExperimentType.WholeGenomeSequencing;
                }
                else if (options.ExperimentType == ExperimentType.ExomeSequencing.ToString())
                {
                    experimentType = ExperimentType.ExomeSequencing;
                }
                else
                {
                    throw new ArgumentException("Error: experiment type was not recognized.");
                }

                // Check that options make sense with experiment type
                if (options.DoTranscriptIsoformAnalysis && experimentType != ExperimentType.RNASequencing)
                {
                    throw new ArgumentException("Error: cannot do isoform analysis without RNA sequencing data.");
                }

                SampleSpecificProteinDBFlow flow = new SampleSpecificProteinDBFlow();
                flow.Parameters.SpritzDirectory             = options.SpritzDirectory;
                flow.Parameters.AnalysisDirectory           = options.AnalysisDirectory;
                flow.Parameters.Reference                   = options.Reference;
                flow.Parameters.Threads                     = options.Threads;
                flow.Parameters.Fastqs                      = fastqsSeparated;
                flow.Parameters.ExperimentType              = experimentType;
                flow.Parameters.StrandSpecific              = options.StrandSpecific;
                flow.Parameters.InferStrandSpecificity      = options.InferStrandSpecificity;
                flow.Parameters.OverwriteStarAlignment      = options.OverwriteStarAlignments;
                flow.Parameters.GenomeStarIndexDirectory    = options.GenomeStarIndexDirectory;
                flow.Parameters.GenomeFasta                 = options.GenomeFasta;
                flow.Parameters.ProteinFastaPath            = options.ProteinFastaPath;
                flow.Parameters.ReferenceGeneModelGtfOrGff  = options.GeneModelGtfOrGff;
                flow.Parameters.NewGeneModelGtfOrGff        = options.NewGeneModelGtfOrGff;
                flow.Parameters.EnsemblKnownSitesPath       = options.ReferenceVcf;
                flow.Parameters.UniProtXmlPath              = options.UniProtXml;
                flow.Parameters.SkipVariantAnalysis         = options.SkipVariantAnalysis;
                flow.Parameters.DoTranscriptIsoformAnalysis = options.DoTranscriptIsoformAnalysis;
                flow.Parameters.DoFusionAnalysis            = options.DoFusionAnalysis;
                flow.Parameters.IndelFinder                 = options.IndelFinder;
                flow.Parameters.VariantCallingWorkers       = options.VariantCallingWorkers;
                flow.GenerateSampleSpecificProteinDatabases();

                Console.WriteLine("done");
            }

            else if (options.Command.Equals(LncRNADiscoveryFlow.Command, StringComparison.InvariantCultureIgnoreCase))
            {
                if (options.ExperimentType != null && !options.ExperimentType.Equals(ExperimentType.RNASequencing.ToString()))
                {
                    throw new ArgumentException("Error: lncRNA discovery requires RNA-Seq reads.");
                }
                LncRNADiscoveryFlow lnc = new LncRNADiscoveryFlow();
                lnc.Parameters.SpritzDirectory          = options.SpritzDirectory;
                lnc.Parameters.AnalysisDirectory        = options.AnalysisDirectory;
                lnc.Parameters.Reference                = options.Reference;
                lnc.Parameters.Threads                  = options.Threads;
                lnc.Parameters.Fastqs                   = fastqsSeparated;
                lnc.Parameters.StrandSpecific           = options.StrandSpecific;
                lnc.Parameters.InferStrandSpecificity   = options.InferStrandSpecificity;
                lnc.Parameters.OverwriteStarAlignment   = options.OverwriteStarAlignments;
                lnc.Parameters.GenomeStarIndexDirectory = options.GenomeStarIndexDirectory;
                lnc.Parameters.GenomeFasta              = options.GenomeFasta;
                lnc.Parameters.ProteinFasta             = options.ProteinFastaPath;
                lnc.Parameters.GeneModelGtfOrGff        = options.GeneModelGtfOrGff;
                lnc.LncRNADiscoveryFromFastqs();
                return;
            }

            else if (options.Command.Equals(GeneFusionDiscoveryFlow.Command, StringComparison.InvariantCultureIgnoreCase))
            {
                if (options.ExperimentType != null && !options.ExperimentType.Equals(ExperimentType.RNASequencing.ToString()))
                {
                    throw new ArgumentException("Error: gene fusion discovery with STAR fusion requires RNA-Seq reads.");
                }

                GeneFusionDiscoveryFlow flow = new GeneFusionDiscoveryFlow();
                flow.Parameters.SpritzDirectory   = options.SpritzDirectory;
                flow.Parameters.AnalysisDirectory = options.AnalysisDirectory;
                flow.Parameters.Reference         = options.Reference;
                flow.Parameters.Threads           = options.Threads;
                flow.Parameters.Fastqs            = fastqsSeparated;
                flow.DiscoverGeneFusions();
                return;
            }

            // Matched case-insensitively, like the other commands
            else if (options.Command.Equals(TransferModificationsFlow.Command, StringComparison.InvariantCultureIgnoreCase))
            {
                // Two comma-separated XML paths are needed; fail with a clear message
                // instead of a null dereference or an index-out-of-range error.
                string[] xmls = options.UniProtXml?.Split(',');
                if (xmls == null || xmls.Length < 2)
                {
                    throw new ArgumentException("Error: transferring modifications requires two comma-separated UniProt XML paths with the -x flag.");
                }
                TransferModificationsFlow transfer = new TransferModificationsFlow();
                transfer.TransferModifications(options.SpritzDirectory, xmls[0], xmls[1]);
                return;
            }

            else if (options.Command.Equals(TranscriptQuantificationFlow.Command, StringComparison.InvariantCultureIgnoreCase))
            {
                if (options.ExperimentType != null && !options.ExperimentType.Equals(ExperimentType.RNASequencing.ToString()))
                {
                    throw new ArgumentException("Error: transcript quantification requires RNA-Seq reads.");
                }

                // Each FASTQ set is quantified separately, with its own strandedness inference when requested
                foreach (string[] fastq in fastqsSeparated)
                {
                    Strandedness strandedness = options.StrandSpecific ? Strandedness.Forward : Strandedness.None;
                    if (options.InferStrandSpecificity)
                    {
                        var bamProps = AlignmentFlow.InferStrandedness(options.SpritzDirectory, options.AnalysisDirectory, options.Threads,
                                                                       fastq, options.GenomeStarIndexDirectory, options.GenomeFasta, options.GeneModelGtfOrGff);
                        strandedness = bamProps.Strandedness;
                    }
                    TranscriptQuantificationFlow quantify = new TranscriptQuantificationFlow();
                    quantify.Parameters = new TranscriptQuantificationParameters(
                        options.SpritzDirectory,
                        options.AnalysisDirectory,
                        options.GenomeFasta,
                        options.Threads,
                        options.GeneModelGtfOrGff,
                        RSEMAlignerOption.STAR,
                        strandedness,
                        fastq,
                        true);
                    quantify.QuantifyTranscripts();
                }
                return;
            }

            // Matched case-insensitively, like the other commands
            else if (options.Command.Equals("strandedness", StringComparison.InvariantCultureIgnoreCase))
            {
                // Single-end when only the first FASTQ is given, paired-end otherwise
                string[] fastqs = options.Fastq2 == null ?
                                  new[] { options.Fastq1 } :
                                  new[] { options.Fastq1, options.Fastq2 };
                BAMProperties b = AlignmentFlow.InferStrandedness(options.SpritzDirectory, options.AnalysisDirectory, options.Threads,
                                                                  fastqs, options.GenomeStarIndexDirectory, options.GenomeFasta, options.GeneModelGtfOrGff);
                Console.WriteLine(b.ToString());
                return;
            }

            else
            {
                throw new ArgumentException($"Error: command not recognized, {options.Command}");
            }
        }