/// <summary>
/// Builds the bash commands for transcript assembly with Cufflinks.
/// Note that fragment bias estimation (--frag-bias-correct) and multi-read rescuing (--multi-read-correct) are not used.
/// These take a lot of time, and they only provide better abundance estimates, which we use RSEM for.
/// </summary>
/// <param name="spritzDirectory">Directory passed to WrapperUtility.ChangeToToolsDirectoryCommand.</param>
/// <param name="analysisDirectory">Analysis directory used to resolve the analysis script path.</param>
/// <param name="threads">Value for the Cufflinks --num-threads option; also passed to the BAM sort command.</param>
/// <param name="bamPath">Alignment file to assemble; it is sorted first if its header lacks SO:coordinate.</param>
/// <param name="geneModelGtfOrGffPath">Gene model passed to Cufflinks as --GTF-guide.</param>
/// <param name="genome">Genome handed to BAMProperties when strandedness is inferred.</param>
/// <param name="strandSpecific">Whether the library is stranded; ignored when inferStrandSpecificity is true.</param>
/// <param name="inferStrandSpecificity">If true, strandedness is taken from BAMProperties instead of strandSpecific.</param>
/// <param name="outputDirectory">Receives the Cufflinks output directory, located next to the BAM file.</param>
/// <returns>The list of bash command strings; nothing is executed by this method.</returns>
public static List<string> AssembleTranscripts(string spritzDirectory, string analysisDirectory, int threads, string bamPath, string geneModelGtfOrGffPath, Genome genome, bool strandSpecific, bool inferStrandSpecificity, out string outputDirectory)
{
    bool isStranded = strandSpecific;
    if (inferStrandSpecificity)
    {
        BAMProperties bamProperties = new BAMProperties(bamPath, geneModelGtfOrGffPath, genome, 0.8);
        isStranded = bamProperties.Strandedness != Strandedness.None;
    }

    string bamDirectory = Path.GetDirectoryName(bamPath);
    string bamBaseName = Path.GetFileNameWithoutExtension(bamPath);
    string sortedCheckPath = Path.Combine(bamDirectory, bamBaseName + ".cufflinksSortCheck");
    string sortedBamPath = Path.Combine(bamDirectory, bamBaseName + ".sorted.bam");
    outputDirectory = Path.Combine(bamDirectory, bamBaseName + ".cufflinksOutput");
    string transcriptsPath = Path.Combine(outputDirectory, TranscriptsFilename);

    // NOTE(review): the returned script path is not used here; the call is kept in case
    // GetAnalysisScriptPath has side effects (e.g. preparing the script folder) -- TODO confirm.
    _ = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "CufflinksRun.bash");

    // The leading space matters: without it the option is glued onto the --output-dir path.
    string libraryTypeOption = isStranded ? " --library-type fr-firststrand" : "";

    return new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

        // Record whether the BAM header declares coordinate sorting; an empty check file means "not sorted".
        "samtools view -H " + WrapperUtility.ConvertWindowsPath(bamPath) + " | grep SO:coordinate > " + WrapperUtility.ConvertWindowsPath(sortedCheckPath),
        "if [ ! -s " + WrapperUtility.ConvertWindowsPath(sortedCheckPath) + " ]; then " + SamtoolsWrapper.SortBam(bamPath, threads) + "; fi",

        // Point $bam at the sorted copy when sorting was required.
        "bam=" + WrapperUtility.ConvertWindowsPath(bamPath),
        "if [ ! -s " + WrapperUtility.ConvertWindowsPath(sortedCheckPath) + " ]; then bam=" + WrapperUtility.ConvertWindowsPath(sortedBamPath) + "; fi",

        // Only run Cufflinks when the transcripts file is missing or empty.
        "if [[ ! -f " + WrapperUtility.ConvertWindowsPath(transcriptsPath) + " || ! -s " + WrapperUtility.ConvertWindowsPath(transcriptsPath) + " ]]; then " +
            "cufflinks-2.2.1/cufflinks " +
            " --num-threads " + threads.ToString() +
            " --GTF-guide " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath) +
            " --output-dir " + WrapperUtility.ConvertWindowsPath(outputDirectory) +
            libraryTypeOption +
            " $bam" +
            "; fi",
    };
}
/// <summary>
/// Builds the bash commands for transcript assembly with StringTie.
/// The BAM file is checked for coordinate sorting (and sorted if needed) before reconstruction,
/// and StringTie is only run when the output GTF is missing or empty.
/// </summary>
/// <param name="spritzDirectory">Directory passed to WrapperUtility.ChangeToToolsDirectoryCommand.</param>
/// <param name="threads">Value for the StringTie -p option; also passed to the BAM sort command.</param>
/// <param name="bamPath">Alignment file to assemble; it is sorted first if its header lacks SO:coordinate.</param>
/// <param name="geneModelGtfOrGffPath">Gene model passed to StringTie as -G.</param>
/// <param name="genome">Genome handed to BAMProperties when strandedness is inferred.</param>
/// <param name="strandSpecific">Strandedness of the library; ignored when inferStrandSpecificity is true.</param>
/// <param name="inferStrandSpecificity">If true, strandedness is taken from BAMProperties instead of strandSpecific.</param>
/// <param name="outputTranscriptGtfPath">Receives the path of the output GTF, located next to the BAM file.</param>
/// <returns>The list of bash command strings; nothing is executed by this method.</returns>
public static List<string> AssembleTranscripts(string spritzDirectory, int threads, string bamPath, string geneModelGtfOrGffPath, Genome genome, Strandedness strandSpecific, bool inferStrandSpecificity, out string outputTranscriptGtfPath)
{
    Strandedness strandedness = strandSpecific;
    if (inferStrandSpecificity)
    {
        BAMProperties bamProperties = new BAMProperties(bamPath, geneModelGtfOrGffPath, genome, 0.8);
        strandedness = bamProperties.Strandedness;
    }

    string bamDirectory = Path.GetDirectoryName(bamPath);
    string bamBaseName = Path.GetFileNameWithoutExtension(bamPath);
    string sortedCheckPath = Path.Combine(bamDirectory, bamBaseName + ".cufflinksSortCheck");
    string sortedBamPath = Path.Combine(bamDirectory, bamBaseName + ".sorted.bam");
    outputTranscriptGtfPath = Path.Combine(bamDirectory, bamBaseName + ".gtf");

    // The leading space matters: without it the option is glued onto the -o output path.
    string strandOption;
    if (strandedness == Strandedness.None)
    {
        strandOption = "";
    }
    else if (strandedness == Strandedness.Forward)
    {
        strandOption = " --fr";
    }
    else
    {
        strandOption = " --rf";
    }

    return new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

        // Record whether the BAM header declares coordinate sorting; an empty check file means "not sorted".
        "samtools view -H " + WrapperUtility.ConvertWindowsPath(bamPath) + " | grep SO:coordinate > " + WrapperUtility.ConvertWindowsPath(sortedCheckPath),
        "if [ ! -s " + WrapperUtility.ConvertWindowsPath(sortedCheckPath) + " ]; then " + SamtoolsWrapper.SortBam(bamPath, threads) + "; fi",

        // Point $bam at the sorted copy when sorting was required.
        "bam=" + WrapperUtility.ConvertWindowsPath(bamPath),
        "if [ ! -s " + WrapperUtility.ConvertWindowsPath(sortedCheckPath) + " ]; then bam=" + WrapperUtility.ConvertWindowsPath(sortedBamPath) + "; fi",

        // Only run StringTie when the output GTF is missing or empty.
        "if [[ ! -f " + WrapperUtility.ConvertWindowsPath(outputTranscriptGtfPath) + " || ! -s " + WrapperUtility.ConvertWindowsPath(outputTranscriptGtfPath) + " ]]; then",
        " echo \"Performing stringtie transcript reconstruction on " + bamPath + "\"",
        " stringtie $bam " +
            " -p " + threads.ToString() +
            " -G " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath) +
            " -o " + WrapperUtility.ConvertWindowsPath(outputTranscriptGtfPath) +
            strandOption,
        "fi",
    };
}
public void BAMPropertiesStrandSpecificityTest()
{
    // Locate the fixtures under the test run's TestData folder.
    string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData");
    string bamPath = Path.Combine(testDataDirectory, "unstrandedSingle202122.bam");
    string geneModelPath = Path.Combine(testDataDirectory, "202122.gtf");
    string genomeFastaPath = Path.Combine(testDataDirectory, "202122.fa");

    BAMProperties properties = new BAMProperties(bamPath, geneModelPath, new Genome(genomeFastaPath), 0.8);

    // This BAM file is expected to be reported as unstranded, single-end data.
    Assert.AreEqual(Strandedness.None, properties.Strandedness);
    Assert.AreEqual(RnaSeqProtocol.SingleEnd, properties.Protocol);
}
/// <summary>
/// Infers the strandedness of reads by aligning a 30000-read subset of the FASTQ files
/// and evaluating the resulting BAM file with BAMProperties.
/// </summary>
/// <param name="spritzDirectory">Spritz directory forwarded to the STAR wrapper commands.</param>
/// <param name="analysisDirectory">Directory used for the generated scripts and the subset FASTQ files.</param>
/// <param name="threads">Thread count forwarded to genome generation and alignment.</param>
/// <param name="fastqPaths">FASTQ files to subset and align.</param>
/// <param name="genomeStarIndexDirectory">STAR genome index directory.</param>
/// <param name="reorderedFasta">Genome FASTA used for index generation and for constructing the Genome.</param>
/// <param name="geneModelGtfOrGff">Gene model used for index generation and by BAMProperties.</param>
/// <returns>The BAMProperties computed from the subset alignment.</returns>
public static BAMProperties InferStrandedness(string spritzDirectory, string analysisDirectory, int threads, string[] fastqPaths, string genomeStarIndexDirectory, string reorderedFasta, string geneModelGtfOrGff)
{
    // Step 1: generate the STAR genome index and wait for the script to finish
    string indexScriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "GenomeGenerate.bash");
    var indexCommands = STARWrapper.GenerateGenomeIndex(
        spritzDirectory,
        threads,
        genomeStarIndexDirectory,
        new string[] { reorderedFasta },
        geneModelGtfOrGff,
        new List<string[]> { fastqPaths });
    WrapperUtility.GenerateAndRunScript(indexScriptPath, indexCommands).WaitForExit();

    // Step 2: take a subset of the reads; the output prefix is derived from the first subset file
    STARWrapper.SubsetFastqs(spritzDirectory, analysisDirectory, fastqPaths, 30000, analysisDirectory, out string[] subsetFastqs);
    string firstSubsetFastq = subsetFastqs[0];
    string subsetOutPrefix = Path.Combine(Path.GetDirectoryName(firstSubsetFastq), Path.GetFileNameWithoutExtension(firstSubsetFastq));

    // Step 3: align the subset and wait for the script to finish
    string alignScriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "AlignSubset.bash");
    var alignCommands = STARWrapper.BasicAlignReadCommands(
        spritzDirectory,
        threads,
        genomeStarIndexDirectory,
        subsetFastqs,
        subsetOutPrefix,
        false,
        STARGenomeLoadOption.LoadAndKeep);
    WrapperUtility.GenerateAndRunScript(alignScriptPath, alignCommands).WaitForExit();

    // Step 4: evaluate the subset alignment
    string subsetBamPath = subsetOutPrefix + STARWrapper.BamFileSuffix;
    return new BAMProperties(subsetBamPath, geneModelGtfOrGff, new Genome(reorderedFasta), 0.8);
}
/// <summary>
/// Performs the bulk of two-pass alignments: prepares each FASTQ set (optional subsetting,
/// optional strandedness inference, trimming), then assembles the first-pass alignment,
/// second-pass genome generation and second-pass alignment commands into a single
/// "AlignReads.bash" script and waits for it to finish.
/// </summary>
/// <param name="threads">Thread count forwarded to trimming, genome generation and alignment.</param>
/// <param name="overWriteStarAlignment">If true, first-pass alignment commands are added even when the splice junction file already exists; the flag is also forwarded to the second-pass alignment.</param>
private void TwoPassAlignment(int threads, bool overWriteStarAlignment)
{
    // Output prefix for a FASTQ set: directory plus extension-less name of its first file.
    string PrefixFor(string[] fastqSet)
    {
        string firstFastq = fastqSet[0];
        return Path.Combine(Path.GetDirectoryName(firstFastq), Path.GetFileNameWithoutExtension(firstFastq));
    }

    // Trimming and strand specificity
    // Note: this Genome instance is not referenced again within this method.
    Genome genome = new Genome(Parameters.ReorderedFastaPath);
    foreach (string[] inputFastqs in Parameters.Fastqs)
    {
        // Infer strand specificity before trimming because trimming can change read pairings
        string[] fastqsToAlign = inputFastqs;
        bool isStrandSpecific = Parameters.StrandSpecific;

        if (Parameters.InferStrandSpecificity || Parameters.UseReadSubset)
        {
            STARWrapper.SubsetFastqs(
                Parameters.SpritzDirectory,
                Parameters.AnalysisDirectory,
                fastqsToAlign,
                Parameters.ReadSubset,
                Parameters.AnalysisDirectory,
                out string[] subsetFastqs);

            if (Parameters.UseReadSubset)
            {
                fastqsToAlign = subsetFastqs;
            }

            if (Parameters.InferStrandSpecificity)
            {
                string subsetOutPrefix = PrefixFor(subsetFastqs);
                string subsetScriptPath = WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "AlignSubset.bash");
                var subsetCommands = STARWrapper.BasicAlignReadCommands(
                    Parameters.SpritzDirectory,
                    threads,
                    Parameters.GenomeStarIndexDirectory,
                    subsetFastqs,
                    subsetOutPrefix,
                    false,
                    STARGenomeLoadOption.LoadAndKeep);
                WrapperUtility.GenerateAndRunScript(subsetScriptPath, subsetCommands).WaitForExit();

                BAMProperties subsetProperties = new BAMProperties(
                    subsetOutPrefix + STARWrapper.BamFileSuffix,
                    Parameters.GeneModelGtfOrGffPath,
                    new Genome(Parameters.ReorderedFastaPath),
                    0.8);
                isStrandSpecific = subsetProperties.Strandedness != Strandedness.None;
            }
        }

        SkewerWrapper.Trim(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, threads, 19, fastqsToAlign, false, out string[] trimmedFastqs, out string trimLog);
        fastqsToAlign = trimmedFastqs;

        // The two lists are kept parallel: entry i of one describes entry i of the other.
        StrandSpecificities.Add(isStrandSpecific);
        FastqsForAlignment.Add(fastqsToAlign);
    }

    // Alignment: first pass
    List<string> commands = new List<string>();
    foreach (string[] fastqSet in FastqsForAlignment)
    {
        string prefix = PrefixFor(fastqSet);
        string spliceJunctionPath = prefix + STARWrapper.SpliceJunctionFileSuffix;
        bool needsFirstPass = overWriteStarAlignment || !File.Exists(spliceJunctionPath);
        if (needsFirstPass)
        {
            bool strandSpecific = StrandSpecificities[FastqsForAlignment.IndexOf(fastqSet)];
            commands.AddRange(STARWrapper.FirstPassAlignmentCommands(
                Parameters.SpritzDirectory,
                threads,
                Parameters.GenomeStarIndexDirectory,
                fastqSet,
                prefix,
                strandSpecific,
                STARGenomeLoadOption.LoadAndKeep));
        }
        FirstPassSpliceJunctions.Add(spliceJunctionPath);
    }

    // Suffix for the second-pass genome directory: XOR of the hash codes of every FASTQ path
    int uniqueSuffix = 1;
    foreach (string[] fastqSet in FastqsForAlignment)
    {
        foreach (string fastqPath in fastqSet)
        {
            uniqueSuffix ^= fastqPath.GetHashCode();
        }
    }

    // Second-pass genome generation from the first-pass splice junctions
    commands.AddRange(STARWrapper.RemoveGenome(Parameters.SpritzDirectory, Parameters.GenomeStarIndexDirectory));
    commands.AddRange(STARWrapper.ProcessFirstPassSpliceCommands(FirstPassSpliceJunctions, uniqueSuffix, out string spliceJunctionStartDatabase));
    SecondPassGenomeDirectory = Parameters.GenomeStarIndexDirectory + "SecondPass" + uniqueSuffix.ToString();
    commands.AddRange(STARWrapper.GenerateGenomeIndex(
        Parameters.SpritzDirectory,
        threads,
        SecondPassGenomeDirectory,
        new string[] { Parameters.ReorderedFastaPath },
        Parameters.GeneModelGtfOrGffPath,
        Parameters.Fastqs,
        spliceJunctionStartDatabase));

    // Alignment: second pass, recording the expected output files for each FASTQ set
    foreach (string[] fastqSet in FastqsForAlignment)
    {
        string prefix = PrefixFor(fastqSet);
        OutputPrefixes.Add(prefix);

        bool strandSpecific = StrandSpecificities[FastqsForAlignment.IndexOf(fastqSet)];
        commands.AddRange(STARWrapper.AlignRNASeqReadsForVariantCalling(
            Parameters.SpritzDirectory,
            threads,
            SecondPassGenomeDirectory,
            fastqSet,
            prefix,
            overWriteStarAlignment,
            strandSpecific,
            STARGenomeLoadOption.LoadAndKeep));

        SortedBamFiles.Add(prefix + STARWrapper.SortedBamFileSuffix);
        DedupedBamFiles.Add(prefix + STARWrapper.DedupedBamFileSuffix);
        ChimericSamFiles.Add(prefix + STARWrapper.ChimericSamFileSuffix);
        ChimericJunctionFiles.Add(prefix + STARWrapper.ChimericJunctionsFileSuffix);
    }

    commands.AddRange(STARWrapper.RemoveGenome(Parameters.SpritzDirectory, SecondPassGenomeDirectory));

    string alignScriptPath = WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "AlignReads.bash");
    WrapperUtility.GenerateAndRunScript(alignScriptPath, commands).WaitForExit();
}
/// <summary>
/// Command-line entry point. Verifies the bash setup, handles tool installation,
/// parses the command-line options, and dispatches to the workflow selected by the command option.
/// </summary>
/// <param name="args">Command-line arguments.</param>
/// <exception cref="FileNotFoundException">Thrown when the bash setup check fails.</exception>
/// <exception cref="ArgumentException">Thrown when the command or experiment type is not recognized, or when the options are inconsistent with the chosen workflow.</exception>
public static void Main(string[] args)
{
    if (!WrapperUtility.CheckBashSetup())
    {
        throw new FileNotFoundException("The Windows Subsystem for Linux has not been enabled. Please see https://smith-chem-wisc.github.io/Spritz/ for more details.");
    }

    // main setup involves installing tools
    if (args.Contains(ManageToolsFlow.Command))
    {
        ManageToolsFlow.Install(Path.GetDirectoryName(Assembly.GetEntryAssembly().Location));
        return;
    }

    // The cast yields null when the arguments could not be parsed successfully.
    Parsed<Options> result = Parser.Default.ParseArguments<Options>(args) as Parsed<Options>;
    if (result == null)
    {
        Console.WriteLine("Please use GUI.exe if you are a first time user of Spritz.");
        Console.WriteLine("It aims to guide you through setting up tools and running a workflow.");
        Console.WriteLine();
        Console.WriteLine("See above for commandline arguments for CMD.exe.");
        Console.WriteLine(" Required: -c for a command");
        Console.WriteLine(" 1) Setting up tools: -c setup");
        Console.WriteLine(" 2) Generating a protein database from ensembl: -c proteins");
        Console.WriteLine(" 3) Analyzing variants: -c proteins");
        Console.WriteLine(" Also required: --fq1 (and --fq2 if paired-end) for FASTQ files that exist or -s to download an SRA (see https://www.ncbi.nlm.nih.gov/sra).");
        Console.WriteLine();
        Console.WriteLine("Press any key to exit.");
        Console.ReadKey();
        return;
    }

    Options options = result.Value;
    FinishSetup(options);

    // Download SRAs if they're specified
    bool useSraMethod = options.SraAccession != null && options.SraAccession.StartsWith("SR");
    List<string[]> fastqsSeparated = useSraMethod ?
        SRAToolkitWrapper.GetFastqsFromSras(options.SpritzDirectory, options.Threads, options.AnalysisDirectory, options.SraAccession) :
        SeparateFastqs(options.Fastq1, options.Fastq2);

    if (options.Command.Equals(SampleSpecificProteinDBFlow.Command, StringComparison.InvariantCultureIgnoreCase))
    {
        if (options.ReferenceVcf == null)
        {
            options.ReferenceVcf = new GATKWrapper(1).DownloadEnsemblKnownVariantSites(options.SpritzDirectory, true, options.Reference, false);
        }
        if (options.UniProtXml == null)
        {
            Console.WriteLine("Note: You can specify a UniProt XML file with the -x flag to transfer modifications and database references.");
        }

        // Parse the experiment type
        ExperimentType experimentType;
        if (options.ExperimentType == ExperimentType.RNASequencing.ToString())
        {
            experimentType = ExperimentType.RNASequencing;
        }
        else if (options.ExperimentType == ExperimentType.WholeGenomeSequencing.ToString())
        {
            experimentType = ExperimentType.WholeGenomeSequencing;
        }
        else if (options.ExperimentType == ExperimentType.ExomeSequencing.ToString())
        {
            experimentType = ExperimentType.ExomeSequencing;
        }
        else
        {
            throw new ArgumentException("Error: experiment type was not recognized.");
        }

        // Check that options make sense with experiment type
        if (options.DoTranscriptIsoformAnalysis && experimentType != ExperimentType.RNASequencing)
        {
            throw new ArgumentException("Error: cannot do isoform analysis without RNA sequencing data.");
        }

        SampleSpecificProteinDBFlow flow = new SampleSpecificProteinDBFlow();
        flow.Parameters.SpritzDirectory = options.SpritzDirectory;
        flow.Parameters.AnalysisDirectory = options.AnalysisDirectory;
        flow.Parameters.Reference = options.Reference;
        flow.Parameters.Threads = options.Threads;
        flow.Parameters.Fastqs = fastqsSeparated;
        flow.Parameters.ExperimentType = experimentType;
        flow.Parameters.StrandSpecific = options.StrandSpecific;
        flow.Parameters.InferStrandSpecificity = options.InferStrandSpecificity;
        flow.Parameters.OverwriteStarAlignment = options.OverwriteStarAlignments;
        flow.Parameters.GenomeStarIndexDirectory = options.GenomeStarIndexDirectory;
        flow.Parameters.GenomeFasta = options.GenomeFasta;
        flow.Parameters.ProteinFastaPath = options.ProteinFastaPath;
        flow.Parameters.ReferenceGeneModelGtfOrGff = options.GeneModelGtfOrGff;
        flow.Parameters.NewGeneModelGtfOrGff = options.NewGeneModelGtfOrGff;
        flow.Parameters.EnsemblKnownSitesPath = options.ReferenceVcf;
        flow.Parameters.UniProtXmlPath = options.UniProtXml;
        flow.Parameters.SkipVariantAnalysis = options.SkipVariantAnalysis;
        flow.Parameters.DoTranscriptIsoformAnalysis = options.DoTranscriptIsoformAnalysis;
        flow.Parameters.DoFusionAnalysis = options.DoFusionAnalysis;
        flow.Parameters.IndelFinder = options.IndelFinder;
        flow.Parameters.VariantCallingWorkers = options.VariantCallingWorkers;
        flow.GenerateSampleSpecificProteinDatabases();
        Console.WriteLine("done");
    }
    else if (options.Command.Equals(LncRNADiscoveryFlow.Command, StringComparison.InvariantCultureIgnoreCase))
    {
        if (options.ExperimentType != null && !options.ExperimentType.Equals(ExperimentType.RNASequencing.ToString()))
        {
            throw new ArgumentException("Error: lncRNA discovery requires RNA-Seq reads.");
        }

        LncRNADiscoveryFlow lnc = new LncRNADiscoveryFlow();
        lnc.Parameters.SpritzDirectory = options.SpritzDirectory;
        lnc.Parameters.AnalysisDirectory = options.AnalysisDirectory;
        lnc.Parameters.Reference = options.Reference;
        lnc.Parameters.Threads = options.Threads;
        lnc.Parameters.Fastqs = fastqsSeparated;
        lnc.Parameters.StrandSpecific = options.StrandSpecific;
        lnc.Parameters.InferStrandSpecificity = options.InferStrandSpecificity;
        lnc.Parameters.OverwriteStarAlignment = options.OverwriteStarAlignments;
        lnc.Parameters.GenomeStarIndexDirectory = options.GenomeStarIndexDirectory;
        lnc.Parameters.GenomeFasta = options.GenomeFasta;
        lnc.Parameters.ProteinFasta = options.ProteinFastaPath;
        lnc.Parameters.GeneModelGtfOrGff = options.GeneModelGtfOrGff;
        lnc.LncRNADiscoveryFromFastqs();
        return;
    }
    else if (options.Command.Equals(GeneFusionDiscoveryFlow.Command, StringComparison.InvariantCultureIgnoreCase))
    {
        if (options.ExperimentType != null && !options.ExperimentType.Equals(ExperimentType.RNASequencing.ToString()))
        {
            throw new ArgumentException("Error: gene fusion discovery with STAR fusion requires RNA-Seq reads.");
        }

        GeneFusionDiscoveryFlow flow = new GeneFusionDiscoveryFlow();
        flow.Parameters.SpritzDirectory = options.SpritzDirectory;
        flow.Parameters.AnalysisDirectory = options.AnalysisDirectory;
        flow.Parameters.Reference = options.Reference;
        flow.Parameters.Threads = options.Threads;
        flow.Parameters.Fastqs = fastqsSeparated;
        flow.DiscoverGeneFusions();
        return;
    }
    else if (options.Command.Equals(TransferModificationsFlow.Command, StringComparison.InvariantCultureIgnoreCase))
    {
        // Two comma-separated XML paths are required; fail with a clear message rather than
        // a NullReferenceException or IndexOutOfRangeException.
        string[] xmls = options.UniProtXml?.Split(',');
        if (xmls == null || xmls.Length < 2)
        {
            throw new ArgumentException("Error: transferring modifications requires two comma-separated UniProt XML paths specified with the -x flag.");
        }

        TransferModificationsFlow transfer = new TransferModificationsFlow();
        transfer.TransferModifications(options.SpritzDirectory, xmls[0], xmls[1]);
        return;
    }
    else if (options.Command.Equals(TranscriptQuantificationFlow.Command, StringComparison.InvariantCultureIgnoreCase))
    {
        if (options.ExperimentType != null && !options.ExperimentType.Equals(ExperimentType.RNASequencing.ToString()))
        {
            throw new ArgumentException("Error: transcript quantification requires RNA-Seq reads.");
        }

        foreach (string[] fastq in fastqsSeparated)
        {
            // The boolean option maps to Forward/None; inference (if requested) overrides it.
            Strandedness strandedness = options.StrandSpecific ? Strandedness.Forward : Strandedness.None;
            if (options.InferStrandSpecificity)
            {
                var bamProps = AlignmentFlow.InferStrandedness(options.SpritzDirectory, options.AnalysisDirectory, options.Threads,
                    fastq, options.GenomeStarIndexDirectory, options.GenomeFasta, options.GeneModelGtfOrGff);
                strandedness = bamProps.Strandedness;
            }

            TranscriptQuantificationFlow quantify = new TranscriptQuantificationFlow();
            quantify.Parameters = new TranscriptQuantificationParameters(
                options.SpritzDirectory,
                options.AnalysisDirectory,
                options.GenomeFasta,
                options.Threads,
                options.GeneModelGtfOrGff,
                RSEMAlignerOption.STAR,
                strandedness,
                fastq,
                true);
            quantify.QuantifyTranscripts();
        }
        return;
    }
    else if (options.Command.Equals("strandedness", StringComparison.InvariantCultureIgnoreCase))
    {
        string[] fastqs = options.Fastq2 == null ?
            new[] { options.Fastq1 } :
            new[] { options.Fastq1, options.Fastq2 };
        BAMProperties b = AlignmentFlow.InferStrandedness(options.SpritzDirectory, options.AnalysisDirectory, options.Threads,
            fastqs, options.GenomeStarIndexDirectory, options.GenomeFasta, options.GeneModelGtfOrGff);
        Console.WriteLine(b.ToString());
        return;
    }
    else
    {
        throw new ArgumentException($"Error: command not recognized, {options.Command}");
    }
}