/// <summary>
/// Builds the commands that run STAR-Fusion directly on one or two fastq files.
/// Sets OutputDirectoryPath to "&lt;first fastq name&gt;FusionAnalysis" next to the first fastq and creates it,
/// and creates a "_STARFusionTmp" scratch folder inside the analysis directory.
/// </summary>
/// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand for the first command.</param>
/// <param name="analysisDirectory">Folder in which the "_STARFusionTmp" scratch folder is created.</param>
/// <param name="threads">Value passed to --CPU.</param>
/// <param name="fastqs">First entry becomes --left_fq; a second entry, if present, becomes --right_fq. Further entries are ignored.</param>
/// <returns>Two commands: the change-to-tools-directory command, then the STAR-Fusion invocation.</returns>
/// <exception cref="FileNotFoundException">ReferenceLibraryDirectory is null.</exception>
/// <exception cref="ArgumentException"><paramref name="fastqs"/> is null or empty.</exception>
public List<string> RunStarFusion(string spritzDirectory, string analysisDirectory, int threads, string[] fastqs)
{
    if (ReferenceLibraryDirectory == null)
    {
        throw new FileNotFoundException("STAR-Fusion reference library was not generated prior to running STAR-Fusion.");
    }

    bool hasFastqs = fastqs != null && fastqs.Length > 0;
    if (!hasFastqs)
    {
        throw new ArgumentException("No fastqs were passed into STAR-Fusion.");
    }

    // Results go into a folder that sits beside the first fastq
    string leftFastq = fastqs[0];
    string outputFolderName = Path.GetFileNameWithoutExtension(leftFastq) + "FusionAnalysis";
    OutputDirectoryPath = Path.Combine(Path.GetDirectoryName(leftFastq), outputFolderName);
    Directory.CreateDirectory(OutputDirectoryPath);

    string scratchDirectory = Path.Combine(analysisDirectory, "_STARFusionTmp");
    Directory.CreateDirectory(scratchDirectory);

    // --right_fq is only supplied for paired-end input
    string rightFastqOption = "";
    if (fastqs.Length > 1)
    {
        rightFastqOption = " --right_fq " + WrapperUtility.ConvertWindowsPath(fastqs[1]);
    }

    string starFusionOptions =
        " --examine_coding_effect" +
        $" --left_fq {WrapperUtility.ConvertWindowsPath(leftFastq)}" +
        rightFastqOption +
        $" --CPU {threads}" +
        $" --output_dir {WrapperUtility.ConvertWindowsPath(OutputDirectoryPath)}" +
        $" --genome_lib_dir {WrapperUtility.ConvertWindowsPath(ReferenceLibraryDirectory)}" +
        $" --tmpdir {WrapperUtility.ConvertWindowsPath(scratchDirectory)}";

    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add(StarFusionDirectoryName + "/STAR-Fusion " + starFusionOptions);
    return commands;
}
/// <summary>
/// Builds the commands that align reads with STAR and output an alignment map.
/// STAR is only run when the expected output file is missing or empty.
/// Note: fastqs must have \n line endings, not \r\n.
/// </summary>
/// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand for the first command.</param>
/// <param name="threads">Value passed to --runThreadN.</param>
/// <param name="genomeDir">Value passed to --genomeDir.</param>
/// <param name="fastqFiles">Files passed to --readFilesIn; a .gz or .bz2 extension on any of them selects the matching --readFilesCommand.</param>
/// <param name="outprefix">Value passed to --outFileNamePrefix; also the stem of the output file whose presence is checked.</param>
/// <param name="strandSpecific">Not used by this method.</param>
/// <param name="genomeLoad">Value passed to --genomeLoad.</param>
/// <param name="outSamType">Value passed to --outSAMtype; also selects which output file suffix is checked before running STAR.</param>
/// <returns>Three commands: change to the tools directory, the guarded STAR run, and either a genome-removal command or an empty string.</returns>
public static List<string> BasicAlignReadCommands(string spritzDirectory, int threads, string genomeDir, string[] fastqFiles, string outprefix, bool strandSpecific = true, STARGenomeLoadOption genomeLoad = STARGenomeLoadOption.NoSharedMemory, string outSamType = "BAM Unsorted")
{
    string reads_in = "\"" + string.Join("\" \"", fastqFiles.Select(f => WrapperUtility.ConvertWindowsPath(f))) + "\"";
    string read_command =
        fastqFiles.Any(f => Path.GetExtension(f) == ".gz") ? " --readFilesCommand zcat -c" :
        fastqFiles.Any(f => Path.GetExtension(f) == ".bz2") ? " --readFilesCommand bzip2 -c" :
        "";

    // Process is IDisposable; release the handle as soon as the value has been read
    long virtualMemorySize;
    using (Process currentProcess = Process.GetCurrentProcess())
    {
        virtualMemorySize = currentProcess.VirtualMemorySize64;
    }

    string arguments =
        " --genomeLoad " + genomeLoad.ToString() +
        " --runThreadN " + threads.ToString() +
        " --genomeDir \"" + WrapperUtility.ConvertWindowsPath(genomeDir) + "\"" +
        " --readFilesIn " + reads_in +
        " --outSAMtype " + outSamType +
        " --limitBAMsortRAM " + virtualMemorySize.ToString() +
        " --outSAMstrandField intronMotif" + // adds XS tag to all alignments that contain a splice junction
        " --outFilterIntronMotifs RemoveNoncanonical" + // for cufflinks
        " --outFileNamePrefix " + WrapperUtility.ConvertWindowsPath(outprefix) +
        read_command;

    // Pick the output file that indicates a finished run for the requested output type
    string fileToCheck = WrapperUtility.ConvertWindowsPath(outprefix +
        (outSamType.Contains("Sorted") ? SortedBamFileSuffix :
            outSamType.Contains("Unsorted") ? BamFileSuffix :
            SpliceJunctionFileSuffix));

    return new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "if [[ ( ! -f " + WrapperUtility.ConvertWindowsPath(fileToCheck) + " || ! -s " + WrapperUtility.ConvertWindowsPath(fileToCheck) + " ) ]]; then STAR" + arguments + "; fi",

        // NOTE(review): File.Exists is evaluated when the commands are generated, not when the script runs
        File.Exists(outprefix + BamFileSuffix) && genomeLoad == STARGenomeLoadOption.LoadAndRemove ?
            "STAR --genomeLoad " + STARGenomeLoadOption.Remove.ToString() :
            ""
    };
}
/// <summary>
/// Builds the commands that annotate a VCF with SnpEff.
/// Sets AnnotatedVcfPath, HtmlReportPath, AnnotatedGenesSummaryPath, VariantProteinFastaPath and
/// VariantProteinXmlPath, all derived from the input VCF path, and creates the SnpEff data folder if needed.
/// </summary>
/// <param name="spritzDirectory">Root folder containing Tools/SnpEff/data.</param>
/// <param name="reference">Case-insensitive prefix used to pick the database folder under Tools/SnpEff/data.</param>
/// <param name="inputVcfPath">VCF to annotate; the output paths are placed beside it.</param>
/// <param name="fromReference">When true, neither the input VCF nor the redirect into the annotated VCF is added to the SnpEff command.</param>
/// <returns>An empty list if a non-empty annotated VCF already exists; otherwise the annotation commands.</returns>
public List<string> PrimaryVariantAnnotation(string spritzDirectory, string reference, string inputVcfPath, bool fromReference = false)
{
    string inputStem = Path.Combine(Path.GetDirectoryName(inputVcfPath), Path.GetFileNameWithoutExtension(inputVcfPath));
    string annotatedStem = inputStem + ".snpEffAnnotated";
    AnnotatedVcfPath = annotatedStem + ".vcf";
    HtmlReportPath = annotatedStem + ".html";
    AnnotatedGenesSummaryPath = annotatedStem + ".genes.txt";
    VariantProteinFastaPath = annotatedStem + ".protein.fasta";
    VariantProteinXmlPath = annotatedStem + ".protein.xml";

    string snpEffDataDirectory = Path.Combine(spritzDirectory, "Tools", "SnpEff", "data");
    Directory.CreateDirectory(snpEffDataDirectory);
    string[] installedDatabases = Directory.GetDirectories(snpEffDataDirectory);

    // Nothing to do when a previous run already produced a non-empty annotated VCF
    bool alreadyAnnotated = File.Exists(AnnotatedVcfPath) && new FileInfo(AnnotatedVcfPath).Length > 0;
    if (alreadyAnnotated)
    {
        return new List<string>();
    }

    // The first installed database whose folder name starts with the reference name (ignoring case);
    // if none matches, the database name contributes nothing to the command
    string databaseDirectory = installedDatabases.FirstOrDefault(x => Path.GetFileName(x).StartsWith(reference, true, null));
    string databaseName = Path.GetFileName(databaseDirectory);

    string annotatedVcf = WrapperUtility.ConvertWindowsPath(AnnotatedVcfPath);
    string proteinFasta = WrapperUtility.ConvertWindowsPath(VariantProteinFastaPath);
    string proteinXml = WrapperUtility.ConvertWindowsPath(VariantProteinXmlPath);

    string inputAndRedirect = "";
    if (!fromReference)
    {
        inputAndRedirect = " " + WrapperUtility.ConvertWindowsPath(inputVcfPath) + " > " + annotatedVcf;
    }

    string snpEffCommand =
        SnpEff(Workers) +
        " -v -stats " + WrapperUtility.ConvertWindowsPath(HtmlReportPath) +
        " -fastaProt " + proteinFasta +
        " -xmlProt " + proteinXml +
        " " + databaseName +
        inputAndRedirect;

    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add(snpEffCommand);

    // ensure that the files get closed before continuing
    commands.Add(WrapperUtility.EnsureClosedFileCommands(annotatedVcf));
    commands.Add(WrapperUtility.EnsureClosedFileCommands(proteinFasta));
    commands.Add(WrapperUtility.EnsureClosedFileCommands(proteinXml));

    // remove the annotated VCF file if snpEff didn't work, e.g. if there was no VCF file to annotate
    commands.Add("if [[ ( -f " + annotatedVcf + " && ! -s " + annotatedVcf + " ) ]]; then");
    commands.Add("  rm " + annotatedVcf);
    commands.Add("fi");
    return commands;
}
/// <summary>
/// Scalpel installation folder cannot be moved and still work. There must be some hard-coded path references.
/// This compares the location recorded in the location-check file with the location expected for this
/// Spritz directory, and if they differ, runs the remove script followed by the install script.
/// </summary>
/// <param name="spritzDirectory">Root folder containing the Tools folder.</param>
/// <returns>
/// true if the recorded location matches the expected one; false if the location-check file does not exist
/// (nothing is run in that case) or if the locations differed and the remove/install scripts were run.
/// </returns>
public bool CheckInstallation(string spritzDirectory)
{
    // Don't go further if installation hasn't been run at all
    string locationRecordPath = Path.Combine(spritzDirectory, "Tools", ScalpelLocationCheckFilename);
    if (!File.Exists(locationRecordPath))
    {
        return false;
    }

    // The remove script is written regardless of whether it ends up being needed
    string removeScriptPath = WriteRemoveScript(spritzDirectory);
    string checkScriptPath = WrapperUtility.GetInstallationScriptPath(spritzDirectory, "CheckScalpelInstallation.bash");

    string expectedInstallPath = Path.Combine(spritzDirectory, "Tools", "scalpel-" + ScalpelVersion);
    string expectedLocation = WrapperUtility.ConvertWindowsPath(expectedInstallPath);
    string recordedLocation = File.ReadAllText(locationRecordPath).TrimEnd();

    // Surrounding double quotes on the converted path are ignored for the comparison
    if (recordedLocation == expectedLocation.Trim('"'))
    {
        return true;
    }

    // Remove and reinstall if it moved
    List<string> reinstallCommands = new List<string>();
    reinstallCommands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    reinstallCommands.Add("bash " + WrapperUtility.ConvertWindowsPath(removeScriptPath));
    reinstallCommands.Add("bash " + WrapperUtility.ConvertWindowsPath(WriteInstallScript(spritzDirectory)));
    WrapperUtility.GenerateAndRunScript(checkScriptPath, reinstallCommands).WaitForExit();
    return false;
}
/// <summary>
/// Builds the commands that reconstruct transcripts from a BAM file with stringtie.
/// The BAM is sorted first if its header does not report coordinate sorting, and stringtie is only
/// run when the output GTF is missing or empty.
/// </summary>
/// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand for the first command.</param>
/// <param name="threads">Value passed to stringtie -p and to the BAM sort command.</param>
/// <param name="bamPath">Alignment file to assemble; the outputs are placed beside it.</param>
/// <param name="geneModelGtfOrGffPath">Gene model passed to stringtie -G.</param>
/// <param name="genome">Passed to BAMProperties when strandedness is inferred.</param>
/// <param name="strandSpecific">Strandedness to use when it is not inferred.</param>
/// <param name="inferStrandSpecificity">When true, strandedness is taken from BAMProperties instead of <paramref name="strandSpecific"/>.</param>
/// <param name="outputTranscriptGtfPath">Set to the bam path with its extension replaced by ".gtf".</param>
/// <returns>The commands to run, in order.</returns>
public static List<string> AssembleTranscripts(string spritzDirectory, int threads, string bamPath, string geneModelGtfOrGffPath, Genome genome, Strandedness strandSpecific, bool inferStrandSpecificity, out string outputTranscriptGtfPath)
{
    Strandedness strandedness = strandSpecific;
    if (inferStrandSpecificity)
    {
        BAMProperties bamProperties = new BAMProperties(bamPath, geneModelGtfOrGffPath, genome, 0.8);
        strandedness = bamProperties.Strandedness;
    }

    string bamDirectory = Path.GetDirectoryName(bamPath);
    string bamName = Path.GetFileNameWithoutExtension(bamPath);
    string sortedCheckPath = Path.Combine(bamDirectory, bamName + ".cufflinksSortCheck");
    outputTranscriptGtfPath = Path.Combine(bamDirectory, bamName + ".gtf");

    // The leading space matters: without it the flag is glued onto the -o path
    string strandOption =
        strandedness == Strandedness.None ? "" :
        strandedness == Strandedness.Forward ? " --fr" :
        " --rf";

    return new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

        // The check file is non-empty only if the BAM header reports coordinate sorting
        "samtools view -H " + WrapperUtility.ConvertWindowsPath(bamPath) + " | grep SO:coordinate > " + WrapperUtility.ConvertWindowsPath(sortedCheckPath),
        "if [ ! -s " + WrapperUtility.ConvertWindowsPath(sortedCheckPath) + " ]; then " + SamtoolsWrapper.SortBam(bamPath, threads) + "; fi",

        // Use the ".sorted.bam" file when the input was not already sorted
        "bam=" + WrapperUtility.ConvertWindowsPath(bamPath),
        "if [ ! -s " + WrapperUtility.ConvertWindowsPath(sortedCheckPath) + " ]; then bam=" + WrapperUtility.ConvertWindowsPath(Path.Combine(bamDirectory, bamName + ".sorted.bam")) + "; fi",

        "if [[ ! -f " + WrapperUtility.ConvertWindowsPath(outputTranscriptGtfPath) + " || ! -s " + WrapperUtility.ConvertWindowsPath(outputTranscriptGtfPath) + " ]]; then",
        "  echo \"Performing stringtie transcript reconstruction on " + bamPath + "\"",
        "  stringtie $bam " +
            " -p " + threads.ToString() +
            " -G " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath) +
            " -o " + WrapperUtility.ConvertWindowsPath(outputTranscriptGtfPath) +
            strandOption,
        "fi",
    };
}
// Need to filter VCF by FILTER = PASS; there are several reasons they don't accept calls that I trust
// There's an attribute "ZYG" for zygosity, which distinguishes heterozygous ("het") from homozygous calls
/// <summary>
/// Builds the commands that call indels with scalpel-discovery and then strip SNVs from the result.
/// Runs CheckInstallation first, sets IndelVcfPath to "variants.indel.vcf" inside <paramref name="outdir"/>,
/// and sets FilteredIndelVcfPath from the VcfToolsWrapper after the SNV-removal command has been built.
/// </summary>
/// <param name="spritzDirectory">Passed to CheckInstallation, ChangeToToolsDirectoryCommand and RemoveAllSnvs.</param>
/// <param name="threads">Value passed to --numprocs.</param>
/// <param name="genomeFastaP">Value passed to --ref.</param>
/// <param name="bedPath">Value passed to --bed.</param>
/// <param name="bamPath">Value passed to --bam.</param>
/// <param name="outdir">Value passed to --dir; also the folder of the indel VCF.</param>
/// <returns>The commands to run, in order.</returns>
public List<string> CallIndels(string spritzDirectory, int threads, string genomeFastaP, string bedPath, string bamPath, string outdir)
{
    CheckInstallation(spritzDirectory);

    VcfToolsWrapper snvRemover = new VcfToolsWrapper();
    IndelVcfPath = Path.Combine(outdir, "variants.indel.vcf");
    string indelVcf = WrapperUtility.ConvertWindowsPath(IndelVcfPath);

    // scalpel uses 0-indexing, where SnpEff uses 1-indexing; a conversion step to 1-indexing is currently disabled
    string scalpelCommand =
        $"  scalpel-{ScalpelVersion}/scalpel-discovery --single " +
        $"--bam {WrapperUtility.ConvertWindowsPath(bamPath)}" +
        $" --ref {WrapperUtility.ConvertWindowsPath(genomeFastaP)}" +
        $" --bed {WrapperUtility.ConvertWindowsPath(bedPath)}" +
        $" --numprocs {threads}" +
        $" --dir {WrapperUtility.ConvertWindowsPath(outdir)}";

    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));

    // Only call indels when the indel VCF is missing or empty
    commands.Add("if [[ ! -f " + indelVcf + " || ! -s " + indelVcf + " ]]; then ");
    commands.Add(scalpelCommand);
    commands.Add("fi");

    // vcf-concat doesn't keep all INFO header lines, so just dump the INFO from each variant
    commands.Add(snvRemover.RemoveAllSnvs(spritzDirectory, IndelVcfPath, false, true));

    FilteredIndelVcfPath = snvRemover.VcfWithoutSnvsPath;
    return commands;
}
/// <summary>
/// Prepares an Ensembl genome fasta for alignment and all following analysis. The main issue is that Ensembl
/// orders chromosomes lexicographically, not karyotypically, like some software like GATK expects.
/// Sets EnsemblGenome and ReorderedFastaPath; a ".karyotypic.fa" file is written beside the input only when
/// the genome is not already karyotypic and that file does not exist yet.
/// </summary>
/// <param name="analysisDirectory">Folder used to locate the "Gzippy.bash" script when the fasta must be decompressed.</param>
/// <param name="genomeFasta">Genome fasta path; a ".gz" or ".tgz" file is run through gunzip first.</param>
public void PrepareEnsemblGenomeFasta(string analysisDirectory, string genomeFasta)
{
    string fastaExtension = Path.GetExtension(genomeFasta);
    bool needsDecompression = fastaExtension == ".gz" || fastaExtension == ".tgz";
    if (needsDecompression)
    {
        string decompressScript = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Gzippy.bash");
        List<string> decompressCommands = new List<string>();
        decompressCommands.Add("gunzip " + WrapperUtility.ConvertWindowsPath(genomeFasta));
        WrapperUtility.GenerateAndRunScript(decompressScript, decompressCommands).WaitForExit();

        // Continue with the path minus its compression extension
        // NOTE(review): for ".tgz" input this assumes gunzip leaves a file at exactly that path — confirm
        genomeFasta = Path.ChangeExtension(genomeFasta, null);
    }

    // We need to use the same fasta file throughout and have all the VCF and GTF chromosome reference IDs be the same as these.
    // Right now this is based on ensembl references, so those are the chromosome IDs I will be using throughout
    // TODO: try this with UCSC references to judge whether there's a difference in quality / yield / FDR etc in subsequent proteomics analysis
    // This file needs to be in karyotypic order; this allows us not to have to reorder it for GATK analysis
    string karyotypicName = Path.GetFileNameWithoutExtension(genomeFasta) + ".karyotypic.fa";
    ReorderedFastaPath = Path.Combine(Path.GetDirectoryName(genomeFasta), karyotypicName);
    EnsemblGenome = new Genome(genomeFasta);

    if (EnsemblGenome.IsKaryotypic())
    {
        // Already in the right order: use the input file as is
        ReorderedFastaPath = genomeFasta;
        return;
    }

    EnsemblGenome.Chromosomes = EnsemblGenome.KaryotypicOrder();
    bool reorderedFastaExists = File.Exists(ReorderedFastaPath);
    if (!reorderedFastaExists)
    {
        Genome.WriteFasta(EnsemblGenome.Chromosomes.Select(x => x.Sequence), ReorderedFastaPath);
    }
}
/// <summary>
/// Builds the commands that create a base recalibration table and apply it to a BAM file.
/// Sets RecalibrationTablePath (".recaltable") and RecalibratedBamPath (".recal.bam"), both beside the input BAM.
/// Each GATK step is skipped when its output already exists and is non-empty.
/// </summary>
/// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand for the first command.</param>
/// <param name="analysisDirectory">Not used by this method.</param>
/// <param name="genomeFasta">Reference passed to -R; index commands for it are emitted first.</param>
/// <param name="bam">Input alignment passed to -I.</param>
/// <param name="knownSitesVcf">
/// VCF passed to --known-sites. When null or empty, both the --known-sites option and the
/// VCF indexing step are left out (the indexing step becomes an empty command).
/// </param>
/// <returns>The commands to run, in order.</returns>
public List<string> BaseRecalibration(string spritzDirectory, string analysisDirectory, string genomeFasta, string bam, string knownSitesVcf)
{
    string bamDirectory = Path.GetDirectoryName(bam);
    string bamName = Path.GetFileNameWithoutExtension(bam);
    RecalibrationTablePath = Path.Combine(bamDirectory, bamName + ".recaltable");
    RecalibratedBamPath = Path.Combine(bamDirectory, bamName + ".recal.bam");

    string recalibrationTable = WrapperUtility.ConvertWindowsPath(RecalibrationTablePath);
    string recalibratedBam = WrapperUtility.ConvertWindowsPath(RecalibratedBamPath);

    // Keep the index step and the --known-sites option in agreement: both are emitted, or neither is
    bool hasKnownSites = !string.IsNullOrEmpty(knownSitesVcf);
    string knownSitesIndexCommand = "";
    string knownSitesOption = "";
    if (hasKnownSites)
    {
        string knownSites = WrapperUtility.ConvertWindowsPath(knownSitesVcf);
        knownSitesIndexCommand = "if [ ! -f " + knownSites + ".idx ]; then " + Gatk(Workers) + " IndexFeatureFile -F " + knownSites + "; fi";
        knownSitesOption = " --known-sites " + knownSites;
    }

    return new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
        GenomeDictionaryIndexCommand(genomeFasta),

        // check that reference VCF is indexed
        knownSitesIndexCommand,

        "if [[ ( ! -f " + recalibrationTable + " || ! -s " + recalibrationTable + " ) ]]; then " +
            Gatk(Workers) +
            " BaseRecalibrator" +
            " -R " + WrapperUtility.ConvertWindowsPath(genomeFasta) +
            " -I " + WrapperUtility.ConvertWindowsPath(bam) +
            knownSitesOption +
            " -O " + recalibrationTable +
            "; fi",

        "if [[ ( ! -f " + recalibratedBam + " || ! -s " + recalibratedBam + " ) ]]; then " +
            Gatk(Workers) +
            " ApplyBQSR" +
            " -R " + WrapperUtility.ConvertWindowsPath(genomeFasta) +
            " -I " + WrapperUtility.ConvertWindowsPath(bam) +
            " --bqsr-recal-file " + recalibrationTable +
            " -O " + recalibratedBam +
            "; fi",

        SamtoolsWrapper.IndexBamCommand(RecalibratedBamPath),
    };
}
/// <summary>
/// Generates and runs a "Hisat2Align.bash" script that aligns reads with hisat2, then waits for it to exit.
/// A single fastq is aligned as unpaired reads (-U); otherwise the first two fastqs are aligned as
/// mates (-1 / -2) and any further entries are ignored.
/// </summary>
/// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand for the first command.</param>
/// <param name="analysisDirectory">Folder used to locate the generated script.</param>
/// <param name="IndexPrefix">Value passed to hisat2 -x.</param>
/// <param name="fastqPaths">One fastq for unpaired reads, or two for paired reads.</param>
/// <param name="outputDirectory">
/// Despite its name, set to the SAM file name given to hisat2 -S:
/// "Hisat2OutUnpaired.sam" or "Hisat2OutPaired.sam".
/// </param>
/// <exception cref="ArgumentException"><paramref name="fastqPaths"/> is null or empty.</exception>
public static void Align(string spritzDirectory, string analysisDirectory, string IndexPrefix, string[] fastqPaths, out string outputDirectory)
{
    // Fail with a clear message instead of an index or null-reference error further down
    if (fastqPaths == null || fastqPaths.Length == 0)
    {
        throw new ArgumentException("No fastq files were passed into hisat2.", nameof(fastqPaths));
    }

    bool isPaired = fastqPaths.Length > 1;
    outputDirectory = isPaired ? "Hisat2OutPaired.sam" : "Hisat2OutUnpaired.sam";

    string readsOption = isPaired
        ? " -1 " + WrapperUtility.ConvertWindowsPath(fastqPaths[0]) + " -2 " + WrapperUtility.ConvertWindowsPath(fastqPaths[1])
        : " -U " + WrapperUtility.ConvertWindowsPath(fastqPaths[0]);

    WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Hisat2Align.bash"), new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "hisat2-2.1.0/hisat2 -q -x" +
            " " + WrapperUtility.ConvertWindowsPath(IndexPrefix) +
            readsOption +
            " -S " + outputDirectory,
    }).WaitForExit();
}
/// <summary>
/// Builds the commands for transcript assembly with cufflinks. Note that fragment bias estimation
/// (--frag-bias-correct) and multi-read rescuing (--multi-read-correct) are not used.
/// These take a lot of time, and they only provide better abundance estimates, which we use RSEM for.
/// The BAM is sorted first if its header does not report coordinate sorting, and cufflinks is only run
/// when the transcripts file in the output directory is missing or empty.
/// </summary>
/// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand for the first command.</param>
/// <param name="analysisDirectory">Only used to look up a script path that is not otherwise used here.</param>
/// <param name="threads">Value passed to --num-threads and to the BAM sort command.</param>
/// <param name="bamPath">Alignment file to assemble; the outputs are placed beside it.</param>
/// <param name="geneModelGtfOrGffPath">Gene model passed to --GTF-guide.</param>
/// <param name="genome">Passed to BAMProperties when strandedness is inferred.</param>
/// <param name="strandSpecific">Whether to treat the library as stranded when strandedness is not inferred.</param>
/// <param name="inferStrandSpecificity">When true, strandedness is taken from BAMProperties instead of <paramref name="strandSpecific"/>.</param>
/// <param name="outputDirectory">Set to the bam path with its extension replaced by ".cufflinksOutput".</param>
/// <returns>The commands to run, in order.</returns>
public static List<string> AssembleTranscripts(string spritzDirectory, string analysisDirectory, int threads, string bamPath, string geneModelGtfOrGffPath, Genome genome, bool strandSpecific, bool inferStrandSpecificity, out string outputDirectory)
{
    bool isStranded = strandSpecific;
    if (inferStrandSpecificity)
    {
        BAMProperties bamProperties = new BAMProperties(bamPath, geneModelGtfOrGffPath, genome, 0.8);
        isStranded = bamProperties.Strandedness != Strandedness.None;
    }

    string bamDirectory = Path.GetDirectoryName(bamPath);
    string bamName = Path.GetFileNameWithoutExtension(bamPath);
    string sortedCheckPath = Path.Combine(bamDirectory, bamName + ".cufflinksSortCheck");
    outputDirectory = Path.Combine(bamDirectory, bamName + ".cufflinksOutput");

    // NOTE(review): this value is never used; the call is retained only in case GetAnalysisScriptPath has side effects — confirm and remove
    string script_name = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "CufflinksRun.bash");

    string transcriptsPath = WrapperUtility.ConvertWindowsPath(Path.Combine(outputDirectory, TranscriptsFilename));

    // The leading space matters: without it the option is glued onto the --output-dir path
    string libraryTypeOption = isStranded ? " --library-type fr-firststrand" : "";

    return new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

        // The check file is non-empty only if the BAM header reports coordinate sorting
        "samtools view -H " + WrapperUtility.ConvertWindowsPath(bamPath) + " | grep SO:coordinate > " + WrapperUtility.ConvertWindowsPath(sortedCheckPath),
        "if [ ! -s " + WrapperUtility.ConvertWindowsPath(sortedCheckPath) + " ]; then " + SamtoolsWrapper.SortBam(bamPath, threads) + "; fi",

        // Use the ".sorted.bam" file when the input was not already sorted
        "bam=" + WrapperUtility.ConvertWindowsPath(bamPath),
        "if [ ! -s " + WrapperUtility.ConvertWindowsPath(sortedCheckPath) + " ]; then bam=" + WrapperUtility.ConvertWindowsPath(Path.Combine(bamDirectory, bamName + ".sorted.bam")) + "; fi",

        "if [[ ! -f " + transcriptsPath + " || ! -s " + transcriptsPath + " ]]; then " +
            "cufflinks-2.2.1/cufflinks " +
            " --num-threads " + threads.ToString() +
            " --GTF-guide " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath) +
            " --output-dir " + WrapperUtility.ConvertWindowsPath(outputDirectory) +
            libraryTypeOption +
            " $bam" +
            "; fi",
    };
}
/// <summary>
/// Gets commands to prepare an RSEM reference in a new directory beside the reference fasta.
/// Sets ReferenceIndexPrefix to the fasta's base name inside that directory.
/// rsem-prepare-reference is only run when the "SA" file in that directory is missing or empty.
/// </summary>
/// <param name="spritzDirectory">Passed to ChangeToToolsDirectoryCommand and GetAlignerOption.</param>
/// <param name="referenceFastaPath">Reference fasta handed to rsem-prepare-reference; its location and name determine the reference directory.</param>
/// <param name="threads">Value passed to --num-threads.</param>
/// <param name="geneModelPath">Gene model; a ".gff*" extension selects --gff3 and ".gtf" selects --gtf, anything else adds no gene model option.</param>
/// <param name="aligner">Aligner option; also selects the "RsemStarReference" or "RsemBowtieReference" directory suffix.</param>
/// <returns>The commands to run, in order.</returns>
public List<string> PrepareReferenceCommands(string spritzDirectory, string referenceFastaPath, int threads, string geneModelPath, RSEMAlignerOption aligner)
{
    // make option strings, including putting reference files into a new directory
    string alignerOption = GetAlignerOption(spritzDirectory, aligner);
    string threadOption = "--num-threads " + threads.ToString();
    string geneModelExtension = Path.GetExtension(geneModelPath);

    // TrimStart rather than Substring(1), so a gene model path without an extension does not throw
    // NOTE(review): string.GetHashCode is not guaranteed to be stable across processes or runtimes, so this
    // directory name may differ from run to run and defeat the reuse check below — confirm on the target runtime
    string referencePrefixDirectory = Path.Combine(Path.GetDirectoryName(referenceFastaPath), Path.GetFileNameWithoutExtension(referenceFastaPath)) +
        (aligner == RSEMAlignerOption.STAR ? "RsemStarReference" : "RsemBowtieReference") +
        "_" + geneModelExtension.TrimStart('.').ToUpperInvariant() +
        geneModelPath.GetHashCode().ToString();
    ReferenceIndexPrefix = Path.Combine(referencePrefixDirectory, Path.GetFileNameWithoutExtension(referenceFastaPath));

    string geneModelOption =
        geneModelExtension.StartsWith(".gff", StringComparison.Ordinal) ? "--gff3 " + WrapperUtility.ConvertWindowsPath(geneModelPath) :
        geneModelExtension == ".gtf" ? "--gtf " + WrapperUtility.ConvertWindowsPath(geneModelPath) :
        null;

    string suffixArrayPath = WrapperUtility.ConvertWindowsPath(Path.Combine(referencePrefixDirectory, "SA"));

    // construct the commands
    var scriptStrings = new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "cd RSEM-1.3.0",

        // -p so that re-running with an existing reference directory is not an error
        "mkdir -p " + WrapperUtility.ConvertWindowsPath(referencePrefixDirectory),

        // Prepare the reference when the SA file is missing OR empty, matching the other guards in this code base
        "if [[ ! -f " + suffixArrayPath + " || ! -s " + suffixArrayPath + " ]]; then " +
            "./rsem-prepare-reference " +
            geneModelOption + " " +
            alignerOption + " " +
            threadOption + " " +
            WrapperUtility.ConvertWindowsPath(referenceFastaPath) + " " +
            WrapperUtility.ConvertWindowsPath(ReferenceIndexPrefix) +
            "; fi"
    };
    return scriptStrings;
}
/// <summary>
/// Builds the commands that run STAR-Fusion from an existing chimeric junction file.
/// Sets OutputDirectoryPath to "&lt;junction file name&gt;FusionAnalysis" next to the junction file and creates it,
/// and creates a "_STARFusionTmp" scratch folder inside the analysis directory.
/// </summary>
/// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand for the first command.</param>
/// <param name="analysisDirectory">Folder in which the "_STARFusionTmp" scratch folder is created.</param>
/// <param name="threads">Value passed to --CPU.</param>
/// <param name="chimericOutJunction">File passed to --chimeric_junction.</param>
/// <returns>Two commands: the change-to-tools-directory command, then the STAR-Fusion invocation.</returns>
/// <exception cref="FileNotFoundException">ReferenceLibraryDirectory is null.</exception>
public List<string> RunStarFusion(string spritzDirectory, string analysisDirectory, int threads, string chimericOutJunction)
{
    if (ReferenceLibraryDirectory == null)
    {
        throw new FileNotFoundException("STAR-Fusion reference library was not generated prior to running STAR-Fusion.");
    }

    // Results go into a folder that sits beside the junction file
    string junctionDirectory = Path.GetDirectoryName(chimericOutJunction);
    string outputFolderName = Path.GetFileNameWithoutExtension(chimericOutJunction) + "FusionAnalysis";
    OutputDirectoryPath = Path.Combine(junctionDirectory, outputFolderName);
    Directory.CreateDirectory(OutputDirectoryPath);

    string scratchDirectory = Path.Combine(analysisDirectory, "_STARFusionTmp");
    Directory.CreateDirectory(scratchDirectory);

    string starFusionOptions = string.Concat(
        " --examine_coding_effect",
        $" --CPU {threads}",
        $" --output_dir {WrapperUtility.ConvertWindowsPath(OutputDirectoryPath)}",
        $" --genome_lib_dir {WrapperUtility.ConvertWindowsPath(ReferenceLibraryDirectory)}",
        $" --chimeric_junction {WrapperUtility.ConvertWindowsPath(chimericOutJunction)}",
        $" --tmpdir {WrapperUtility.ConvertWindowsPath(scratchDirectory)}");

    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add(StarFusionDirectoryName + "/STAR-Fusion " + starFusionOptions);
    return commands;
}
/// <summary>
/// Builds the commands that align reads with STAR into a coordinate-sorted BAM, index it, remove duplicate
/// reads (a step that's recommended for variant calling) and index the deduplicated BAM.
/// Note: fastqs must have \n line endings, not \r\n.
/// </summary>
/// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand for the first command.</param>
/// <param name="threads">Value passed to --runThreadN and to the dedup command.</param>
/// <param name="genomeDir">Value passed to --genomeDir.</param>
/// <param name="fastqFiles">Files passed to --readFilesIn; a .gz or .bz2 extension on any of them selects the matching --readFilesCommand.</param>
/// <param name="outprefix">Value passed to --outFileNamePrefix; also the stem of the sorted and deduplicated BAM paths.</param>
/// <param name="overwriteStarAlignment">When false, alignment and deduplication are each skipped if their output exists and is non-empty; when true they always run.</param>
/// <param name="strandSpecific">Not used by this method.</param>
/// <param name="genomeLoad">Value passed to --genomeLoad.</param>
/// <returns>The commands to run, in order; guard lines are empty strings when overwriting.</returns>
public static List<string> AlignRNASeqReadsForVariantCalling(string spritzDirectory, int threads, string genomeDir, string[] fastqFiles, string outprefix, bool overwriteStarAlignment, bool strandSpecific = true, STARGenomeLoadOption genomeLoad = STARGenomeLoadOption.NoSharedMemory)
{
    string reads_in = string.Join(" ", fastqFiles.Select(f => WrapperUtility.ConvertWindowsPath(f)));
    string read_command =
        fastqFiles.Any(f => Path.GetExtension(f) == ".gz") ? " --readFilesCommand zcat -c" :
        fastqFiles.Any(f => Path.GetExtension(f) == ".bz2") ? " --readFilesCommand bzip2 -c" :
        "";

    // Process is IDisposable; release the handle as soon as the value has been read
    long virtualMemorySize;
    using (Process currentProcess = Process.GetCurrentProcess())
    {
        virtualMemorySize = currentProcess.VirtualMemorySize64;
    }

    string alignmentArguments =
        " --genomeLoad " + genomeLoad.ToString() +
        " --runMode alignReads" +
        " --runThreadN " + threads.ToString() +
        " --genomeDir " + WrapperUtility.ConvertWindowsPath(genomeDir) +
        " --readFilesIn " + reads_in +
        " --outSAMtype BAM SortedByCoordinate" +
        " --outBAMcompression 10" +
        " --limitBAMsortRAM " + virtualMemorySize.ToString() +
        " --outFileNamePrefix " + WrapperUtility.ConvertWindowsPath(outprefix) +

        // chimeric junction settings are currently disabled:
        // --chimSegmentMin 12, --chimJunctionOverhangMin 12, --alignSJDBoverhangMin 10, --alignMatesGapMax 100000,
        // --alignIntronMax 100000, --chimSegmentReadGapMax 3, --alignSJstitchMismatchNmax 5 -1 5 5

        // stringtie parameters
        " --outSAMstrandField intronMotif" + // adds XS tag to all alignments that contain a splice junction
        " --outFilterIntronMotifs RemoveNoncanonical" + // for cufflinks

        // gatk parameters
        " --outSAMattrRGline ID:1 PU:platform PL:illumina SM:sample LB:library" + // this could shorten the time for samples that aren't multiplexed in preprocessing for GATK
        " --outSAMmapqUnique 60" + // this is used to ensure compatibility with GATK without having to use the GATK hacks
        read_command; // note in the future, two sets of reads can be comma separated here, and the RGline can also be comma separated to distinguish them later

    string sortedBam = WrapperUtility.ConvertWindowsPath(outprefix + SortedBamFileSuffix);
    string dedupedBam = WrapperUtility.ConvertWindowsPath(outprefix + DedupedBamFileSuffix);

    return new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

        // align, unless a non-empty sorted BAM already exists and overwriting was not requested
        overwriteStarAlignment ? "" : "if [[ ( ! -f " + sortedBam + " || ! -s " + sortedBam + " ) ]]; then",
        "  STAR" + alignmentArguments,
        overwriteStarAlignment ? "" : "fi",
        SamtoolsWrapper.IndexBamCommand(sortedBam),

        // deduplicate, with the same kind of guard on the deduplicated BAM
        overwriteStarAlignment ? "" : "if [[ ( ! -f " + dedupedBam + " || ! -s " + dedupedBam + " ) ]]; then",
        "  " + StarDedupCommand(threads, outprefix + SortedBamFileSuffix, outprefix + Path.GetFileNameWithoutExtension(SortedBamFileSuffix)),
        overwriteStarAlignment ? "" : "fi",
        SamtoolsWrapper.IndexBamCommand(dedupedBam),

        // NOTE(review): File.Exists is evaluated when the commands are generated, not when the script runs
        File.Exists(outprefix + BamFileSuffix) && File.Exists(outprefix + DedupedBamFileSuffix) && genomeLoad == STARGenomeLoadOption.LoadAndRemove ?
            "STAR --genomeLoad " + STARGenomeLoadOption.Remove.ToString() :
            "",
    };
}
/// <summary>
/// Get command to concatenate a set of VCF files with vcf-concat.
/// Sets VcfConcatenatedPath to <paramref name="outPrefix"/> + ".concat.vcf"; the command only runs
/// vcf-concat when that file is missing or empty.
/// </summary>
/// <param name="spritzDirectory">Not used by this method.</param>
/// <param name="vcfInputs">Windows-formatted VCF paths</param>
/// <param name="outPrefix">Path stem of the concatenated VCF.</param>
/// <returns>command to run vcftools to concatenate a set of VCF files</returns>
public string Concatenate(string spritzDirectory, IEnumerable<string> vcfInputs, string outPrefix)
{
    VcfConcatenatedPath = outPrefix + ".concat.vcf";
    string concatenatedVcf = WrapperUtility.ConvertWindowsPath(VcfConcatenatedPath);

    IEnumerable<string> convertedInputs = vcfInputs.Select(v => WrapperUtility.ConvertWindowsPath(v));
    string inputList = string.Join(" ", convertedInputs);

    // The spacing inside the test brackets is reproduced exactly as the command has always been emitted
    string guard = "if [ ! -f " + concatenatedVcf + " ] || [  ! -s " + concatenatedVcf + " ]; then ";
    string concatenation = "vcf-concat " + inputList + " > " + concatenatedVcf;
    return guard + concatenation + "; fi";
}
/// <summary>
/// Gets a command that extracts the named sequences from a fasta with "samtools faidx" into a new fasta.
/// The command does nothing when the output fasta already exists.
/// </summary>
/// <param name="inputFasta">Fasta to extract sequences from.</param>
/// <param name="sequenceNames">Names of the sequences to extract; none may contain a space, since they are space-separated on the command line.</param>
/// <param name="outputFasta">Fasta that the extracted sequences are redirected into.</param>
/// <returns>The guarded samtools faidx command.</returns>
/// <exception cref="ArgumentException">A sequence name contains a space.</exception>
public static string GetSequencesFromFasta(string inputFasta, IEnumerable<string> sequenceNames, string outputFasta)
{
    // Materialize once so the check, the message and the command all see the same names
    List<string> names = sequenceNames.ToList();
    if (names.Any(name => name.Contains(" ")))
    {
        throw new ArgumentException("A sequence name query had a space in it; this is not supported: " + string.Join(",", names) + ".");
    }

    return "if [ ! -f " + WrapperUtility.ConvertWindowsPath(outputFasta) + " ]; then samtools faidx " + WrapperUtility.ConvertWindowsPath(inputFasta) +
        " " + string.Join(" ", names) + " > " + WrapperUtility.ConvertWindowsPath(outputFasta) + "; fi";
}
/// <summary>
/// Generates and runs a "FilterGeneModel.bash" script that uses grep to keep only the gene model lines
/// whose first column is one of the genome's chromosome names, plus lines starting with "#",
/// then waits for the script to exit.
/// </summary>
/// <param name="analysisDirectory">Folder used to locate the generated script.</param>
/// <param name="geneModelGtfOrGff">Gene model file to filter.</param>
/// <param name="genome">Genome whose chromosome FriendlyName values select the lines to keep.</param>
/// <param name="filteredGeneModel">Set to the gene model path with ".filtered" inserted before its extension.</param>
public static void FilterGeneModel(string analysisDirectory, string geneModelGtfOrGff, Genome genome, out string filteredGeneModel)
{
    // Anchor each chromosome name with the tab that ends the first column; a bare "^1" would also keep
    // chromosomes "10", "11", ... The tab is written literally into the quoted pattern.
    // Assumes the gene model is tab-delimited, as the GTF and GFF3 formats specify.
    IEnumerable<string> chromosomePatterns = genome.Chromosomes.Select(c => "^" + c.FriendlyName + "\t");
    string grepQuery = "\"" + string.Join(@"\|", chromosomePatterns.Concat(new[] { "^#" })) + "\"";

    filteredGeneModel = Path.Combine(Path.GetDirectoryName(geneModelGtfOrGff), Path.GetFileNameWithoutExtension(geneModelGtfOrGff)) + ".filtered" + Path.GetExtension(geneModelGtfOrGff);

    WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "FilterGeneModel.bash"), new List<string>
    {
        "grep " + grepQuery + " " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGff) + " > " + WrapperUtility.ConvertWindowsPath(filteredGeneModel)
    }).WaitForExit();
}
/// <summary>
/// Builds the commands that index the genome fasta, add read groups to and coordinate-sort the BAM
/// (I'm just using one group, so it's more a formality), and mark duplicates.
/// Sets PreparedBamPath to the duplicate-marked BAM and creates a "tmp" folder in the Spritz directory.
/// </summary>
/// <param name="spritzDirectory">Passed to ChangeToToolsDirectoryCommand; also the parent of the "tmp" folder given to --TMP_DIR.</param>
/// <param name="analysisDirectory">Not used by this method.</param>
/// <param name="threads">Not used by this method.</param>
/// <param name="bam">Input alignment; all intermediate and output files are placed beside it.</param>
/// <param name="genomeFasta">Genome fasta for which index commands are emitted.</param>
/// <param name="reference">Not used by this method.</param>
/// <returns>The commands to run, in order.</returns>
public List<string> PrepareBamAndFasta(string spritzDirectory, string analysisDirectory, int threads, string bam, string genomeFasta, string reference)
{
    string bamDirectory = Path.GetDirectoryName(bam);
    string bamName = Path.GetFileNameWithoutExtension(bam);
    string sortedCheckPath = Path.Combine(bamDirectory, bamName + ".headerSorted");
    string readGroupedCheckfile = Path.Combine(bamDirectory, bamName + ".headerReadGrouped");
    string sortedBam = Path.Combine(bamDirectory, bamName + ".sorted.bam");
    string groupedBamPath = Path.Combine(Path.GetDirectoryName(sortedBam), Path.GetFileNameWithoutExtension(sortedBam) + ".grouped.bam");
    string markedDuplicatesBamPath = Path.Combine(Path.GetDirectoryName(groupedBamPath), Path.GetFileNameWithoutExtension(groupedBamPath) + ".marked.bam");
    string markedDuplicateMetrics = Path.Combine(Path.GetDirectoryName(groupedBamPath), Path.GetFileNameWithoutExtension(groupedBamPath) + ".marked.metrics");

    string tmpDir = Path.Combine(spritzDirectory, "tmp");
    Directory.CreateDirectory(tmpDir);

    string groupedBam = WrapperUtility.ConvertWindowsPath(groupedBamPath);
    string markedBam = WrapperUtility.ConvertWindowsPath(markedDuplicatesBamPath);

    List<string> commands = new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
        GenomeDictionaryIndexCommand(genomeFasta),

        // record whether the BAM header reports coordinate sorting and read groups
        "samtools view -H " + WrapperUtility.ConvertWindowsPath(bam) + " | grep SO:coordinate > " + WrapperUtility.ConvertWindowsPath(sortedCheckPath),
        "samtools view -H " + WrapperUtility.ConvertWindowsPath(bam) + " | grep '^@RG' > " + WrapperUtility.ConvertWindowsPath(readGroupedCheckfile),

        // group and sort (note, using picard-tools works, but picard.jar somehow is truncating the BAM files)
        // skipped when either the grouped BAM or the final marked BAM is already present and non-empty
        "if [[ ( ! -f " + groupedBam + " || ! -s " + groupedBam + " ) && " +
            " ( ! -f " + markedBam + " || ! -s " + markedBam + " ) ]]; then " +
            Gatk(Workers, 2) +
            " AddOrReplaceReadGroups" +
            " -PU platform  -PL illumina -SM sample -LB library" +
            " -I " + WrapperUtility.ConvertWindowsPath(bam) +
            " -O " + groupedBam +
            " -SO coordinate" +
            " --TMP_DIR " + WrapperUtility.ConvertWindowsPath(tmpDir) +
            "; fi",
        SamtoolsWrapper.IndexBamCommand(groupedBamPath),

        // mark duplicates (AS means assume sorted; note, using picard-tools works, but picard.jar somehow is truncating the BAM files)
        "if [[ ( ! -f " + markedBam + " || ! -s " + markedBam + " ) ]]; then " +
            Gatk(Workers) +
            " MarkDuplicates" + // formerly picard
            " -I " + groupedBam +
            " -O " + markedBam +
            " -M " + WrapperUtility.ConvertWindowsPath(markedDuplicateMetrics) +
            " --TMP_DIR " + WrapperUtility.ConvertWindowsPath(tmpDir) +
            " -AS true" +
            "; fi",
        SamtoolsWrapper.IndexBamCommand(markedDuplicatesBamPath),

        // clean up: once the marked BAM exists and is non-empty, the intermediate grouped BAM is no longer needed
        // (the grouping step above already tolerates the grouped BAM being absent in that situation);
        // -f keeps a re-run quiet when the grouped BAM was removed earlier
        "if [[ ( -f " + markedBam + " && -s " + markedBam + " ) ]]; then " +
            "rm -f " + groupedBam + "; fi",
    };

    PreparedBamPath = markedDuplicatesBamPath;
    return commands;
}
/// <summary>
/// Writes the list of available SnpEff databases to snpEffDatabases.txt, picks the first entry whose
/// name starts with <paramref name="reference"/> (case-insensitive), and appends genome and
/// mitochondrial codon-table entries for it to Tools/SnpEff/snpEff.config.
/// Returns without doing anything when both the list file and a matching database folder already exist.
/// </summary>
/// <remarks>
/// See here for how to generate databases from scratch:
/// http://lab.loman.net/2012/11/16/how-to-get-snpeff-working-with-bacterial-genomes-from-ncbi/
/// </remarks>
/// <param name="spritzDirectory">Spritz root containing Tools/SnpEff.</param>
/// <param name="analysisDirectory">Directory passed to WrapperUtility.GetAnalysisScriptPath for the generated scripts.</param>
/// <param name="reference">Prefix of the SnpEff database name to look for, e.g. a genome build name.</param>
/// <exception cref="ArgumentException">No listed SnpEff database starts with <paramref name="reference"/>.</exception>
public void DownloadSnpEffDatabase(string spritzDirectory, string analysisDirectory, string reference)
{
    DatabaseListPath = Path.Combine(spritzDirectory, "snpEffDatabases.txt");

    // check for existing list and database
    bool databaseListExists = File.Exists(DatabaseListPath);
    string databaseDirectory = Path.Combine(spritzDirectory, "Tools", "SnpEff", "data");
    string[] existingDatabases = Directory.Exists(databaseDirectory) ? Directory.GetDirectories(databaseDirectory) : new string[0];

    // invariant culture here as well, so both prefix matches in this method behave the same on every machine
    bool databaseExists = existingDatabases.Any(d => Path.GetFileName(d).StartsWith(reference, true, CultureInfo.InvariantCulture));
    if (databaseListExists && databaseExists)
    {
        return;
    }

    // download database list
    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseDownloadList.bash");
    WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "echo \"Downloading list of SnpEff references\"",
        SnpEff(Workers) + " databases > " + WrapperUtility.ConvertWindowsPath(DatabaseListPath),
        WrapperUtility.EnsureClosedFileCommands(DatabaseListPath)
    }).WaitForExit();

    // the database name is the first tab-separated column of each line
    List<string> databases = File.ReadLines(DatabaseListPath)
        .Select(line => line.Split('\t')[0].TrimEnd())
        .ToList();

    string snpeffReference = databases.FirstOrDefault(d => d.StartsWith(reference, true, CultureInfo.InvariantCulture));
    if (snpeffReference == null)
    {
        // previously this surfaced as a NullReferenceException while building the config commands
        throw new ArgumentException("No SnpEff database starting with \"" + reference + "\" was found in " + DatabaseListPath + ".", nameof(reference));
    }

    // download database (it downloads automatically now, with more feedback), but still need the mitochondrial references
    string snpEffConfig = WrapperUtility.ConvertWindowsPath(Path.Combine(spritzDirectory, "Tools", "SnpEff", "snpEff.config"));
    scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseDownload.bash");
    WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "echo \"\n# " + snpeffReference + "\" >> " + snpEffConfig,
        "echo \"" + snpeffReference + ".genome : Human genome " + snpeffReference.Split('.')[0] + " using RefSeq transcripts\" >> " + snpEffConfig,
        "echo \"" + snpeffReference + ".reference : ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/\" >> " + snpEffConfig,
        "echo \"\t" + snpeffReference + ".M.codonTable : Vertebrate_Mitochondrial\" >> " + snpEffConfig,
        "echo \"\t" + snpeffReference + ".MT.codonTable : Vertebrate_Mitochondrial\" >> " + snpEffConfig,
    }).WaitForExit();
}
/// <summary>
/// Builds the bash command that creates the sequence dictionary (.dict, next to the fasta) used by
/// many GATK tools. The command is a no-op when the dictionary file already exists.
/// </summary>
/// <param name="genomeFastaPath">Path to the genome fasta.</param>
/// <returns>A single bash conditional wrapping GATK CreateSequenceDictionary.</returns>
private string GenomeDictionaryIndexCommand(string genomeFastaPath)
{
    string fastaDirectory = Path.GetDirectoryName(genomeFastaPath);
    string fastaName = Path.GetFileNameWithoutExtension(genomeFastaPath);
    string dictionaryArg = WrapperUtility.ConvertWindowsPath(Path.Combine(fastaDirectory, fastaName + ".dict"));

    string createDictionary = Gatk(Workers) + " CreateSequenceDictionary"; // formerly a picard tool
    string fastaArg = WrapperUtility.ConvertWindowsPath(genomeFastaPath);

    return $"if [ ! -f {dictionaryArg} ]; then {createDictionary} -R {fastaArg} -O {dictionaryArg}; fi";
}
/// <summary>
/// Builds the commands that copy a transcript GTF without the lines reporting an FPKM of exactly
/// "0.0000000000". The filtering is skipped when the filtered file already exists and is non-empty.
/// </summary>
/// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand.</param>
/// <param name="transcriptGtfPath">GTF file to filter.</param>
/// <param name="filteredTranscriptGtfPath">Receives the output path: the input path with ".filtered" inserted before the extension.</param>
/// <returns>Bash commands in execution order.</returns>
public static List<string> RemoveZeroAbundanceCufflinksPredictionsCommand(string spritzDirectory, string transcriptGtfPath, out string filteredTranscriptGtfPath)
{
    string pathWithoutExtension = Path.Combine(Path.GetDirectoryName(transcriptGtfPath), Path.GetFileNameWithoutExtension(transcriptGtfPath));
    filteredTranscriptGtfPath = pathWithoutExtension + ".filtered" + Path.GetExtension(transcriptGtfPath);

    string inputArg = WrapperUtility.ConvertWindowsPath(transcriptGtfPath);
    string filteredArg = WrapperUtility.ConvertWindowsPath(filteredTranscriptGtfPath);

    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add($"echo \"Removing zero-abundance transcripts from {transcriptGtfPath}\"");
    commands.Add($"if [[ ! -f {filteredArg} || ! -s {filteredArg} ]]; then grep -v 'FPKM \"0.0000000000\"' {inputArg} > {filteredArg}; fi");
    return commands;
}
/// <summary>
/// Builds the vcftools command that writes a copy of a VCF with all indels removed.
/// Sets VcfWithoutIndelsPath to the output path (input path with its extension replaced by ".NoIndels.vcf").
/// </summary>
/// <param name="spritzDirectory">Not used by this method.</param>
/// <param name="vcfPath">VCF file to filter.</param>
/// <param name="keepInfo">When true, adds --recode-INFO-all.</param>
/// <param name="applyFilter">When true, adds --remove-filtered-all.</param>
/// <returns>Bash command that runs vcftools unless the output already exists and is non-empty.</returns>
public string RemoveAllIndels(string spritzDirectory, string vcfPath, bool keepInfo, bool applyFilter)
{
    string pathWithoutExtension = Path.Combine(Path.GetDirectoryName(vcfPath), Path.GetFileNameWithoutExtension(vcfPath));
    VcfWithoutIndelsPath = pathWithoutExtension + ".NoIndels.vcf";

    string outputArg = WrapperUtility.ConvertWindowsPath(VcfWithoutIndelsPath);
    string inputArg = WrapperUtility.ConvertWindowsPath(vcfPath);

    // optional switches, in the order they appear on the command line
    string optionalFlags = string.Empty;
    if (applyFilter)
    {
        optionalFlags += " --remove-filtered-all ";
    }
    if (keepInfo)
    {
        optionalFlags += " --recode-INFO-all ";
    }

    return $"if [ ! -f {outputArg} ] || [  ! -s {outputArg} ]; then vcftools  --remove-indels --vcf {inputArg} --recode {optionalFlags} --stdout > {outputArg}; fi";
}
/// <summary>
/// Builds the vcftools command that keeps only the sites passing a minimum mean depth (--min-meanDP).
/// Sets VcfDepthFilteredPath to the output path (input path with its extension replaced by ".DPFilter.vcf").
/// </summary>
/// <param name="spritzDirectory">Not used by this method.</param>
/// <param name="vcfPath">VCF file to filter.</param>
/// <param name="keepInfo">When true, adds --recode-INFO-all.</param>
/// <param name="minDepth">Value passed to --min-meanDP.</param>
/// <returns>Bash command that runs vcftools unless the output already exists and is non-empty.</returns>
public string AverageGenotypeDepthFilter(string spritzDirectory, string vcfPath, bool keepInfo, float minDepth)
{
    VcfDepthFilteredPath = Path.Combine(Path.GetDirectoryName(vcfPath), Path.GetFileNameWithoutExtension(vcfPath)) + ".DPFilter.vcf";

    string outputArg = WrapperUtility.ConvertWindowsPath(VcfDepthFilteredPath);

    // Format with the invariant culture: the default ToString() follows the machine's culture and
    // would emit e.g. "2,5" under a comma-decimal locale, which is not a valid command-line number.
    string minDepthArg = minDepth.ToString(System.Globalization.CultureInfo.InvariantCulture);

    return "if [ ! -f " + outputArg + " ] || [ " + " ! -s " + outputArg + " ]; then " +
        "vcftools " +
        " --vcf " + WrapperUtility.ConvertWindowsPath(vcfPath) +
        " --min-meanDP " + minDepthArg +
        " --recode " +
        (keepInfo ? " --recode-INFO-all " : "") +
        " --stdout > " + outputArg +
        "; fi";
}
/// <summary>
/// Builds the STAR command line that reads alignments from an existing BAM
/// (--runMode inputAlignmentsFromBAM) and applies --bamRemoveDuplicatesType UniqueIdentical.
/// </summary>
/// <param name="threads">Value passed to --runThreadN.</param>
/// <param name="inputBamPath">BAM passed to --inputBAMfile.</param>
/// <param name="outBamPath">Prefix passed to --outFileNamePrefix.</param>
/// <returns>The complete STAR command.</returns>
public static string StarDedupCommand(int threads, string inputBamPath, string outBamPath)
{
    // Process is IDisposable; release the handle as soon as the one value we need has been read.
    long virtualMemoryBytes;
    using (Process currentProcess = Process.GetCurrentProcess())
    {
        virtualMemoryBytes = currentProcess.VirtualMemorySize64;
    }

    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
    string dedupArguments =
        " --runMode inputAlignmentsFromBAM" +
        " --bamRemoveDuplicatesType UniqueIdentical" + // this could shorten the time for samples that aren't multiplexed, too; might only work with sortedBAM input from --inputBAMfile
        " --limitBAMsortRAM " + virtualMemoryBytes.ToString(invariant) +
        " --runThreadN " + threads.ToString(invariant) +
        " --outBAMcompression 10" +
        " --inputBAMfile " + WrapperUtility.ConvertWindowsPath(inputBamPath) +
        " --outFileNamePrefix " + WrapperUtility.ConvertWindowsPath(outBamPath);
    return "STAR" + dedupArguments;
}
/// <summary>
/// Generic method for subsetting a BAM file to a genome region with GATK PrintReads.
/// Useful for testing new methods. Generates the script SubsetBam.bash and waits for it to finish.
/// </summary>
/// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand.</param>
/// <param name="analysisDirectory">Directory passed to WrapperUtility.GetAnalysisScriptPath for the generated script.</param>
/// <param name="threads">Kept for signature compatibility; GATK4 PrintReads has no thread-count argument, so it is not passed on.</param>
/// <param name="bam">Input BAM (-I).</param>
/// <param name="genomeFasta">Reference fasta (-R).</param>
/// <param name="genomeRegion">Interval passed to -L.</param>
/// <param name="outputBam">Output BAM (-O).</param>
public void SubsetBam(string spritzDirectory, string analysisDirectory, int threads, string bam, string genomeFasta, string genomeRegion, string outputBam)
{
    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SubsetBam.bash");

    // Gatk(...) is given GATK4-style arguments everywhere else (-O, bundled Picard tools), so use the
    // GATK4 PrintReads arguments here too: the GATK3-only "--num_threads" and lowercase "-o" are rejected.
    List<string> commands = new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        Gatk(Workers) +
            " PrintReads" +
            " -R " + WrapperUtility.ConvertWindowsPath(genomeFasta) +
            " -I " + WrapperUtility.ConvertWindowsPath(bam) +
            " -O " + WrapperUtility.ConvertWindowsPath(outputBam) +
            " -L " + genomeRegion,
    };

    WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();
}
/// <summary>
/// Runs hisat2-build for a genome fasta unless IndexExists reports that the index is already there.
/// </summary>
/// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand.</param>
/// <param name="analysisDirectory">Directory passed to WrapperUtility.GetAnalysisScriptPath for the generated script.</param>
/// <param name="genomeFasta">Genome fasta to index.</param>
/// <param name="IndexPrefix">Receives the index prefix: the fasta path without its extension.</param>
public static void GenerateIndex(string spritzDirectory, string analysisDirectory, string genomeFasta, out string IndexPrefix)
{
    string fastaDirectory = Path.GetDirectoryName(genomeFasta);
    string fastaName = Path.GetFileNameWithoutExtension(genomeFasta);
    IndexPrefix = Path.Combine(fastaDirectory, fastaName);

    if (!IndexExists(genomeFasta))
    {
        string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Hisat2Build.bash");
        string changeDirectory = WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory);
        string fastaArg = WrapperUtility.ConvertWindowsPath(genomeFasta);
        string prefixArg = WrapperUtility.ConvertWindowsPath(IndexPrefix);

        List<string> commands = new List<string>
        {
            changeDirectory,
            $"hisat2-2.1.0/hisat2-build {fastaArg} {prefixArg}"
        };
        WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();
    }
}
/// <summary>
/// Calls DownloadUCSCKnownVariantSites, converts the result with ConvertVCFChromosomesUCSC2Ensembl
/// (storing it in EnsemblKnownSitesPath) and, unless this is a dry run, runs GATK IndexFeatureFile
/// for each of the two VCFs that has no ".idx" file yet.
/// </summary>
/// <param name="spritzDirectory">Spritz root; also used as the location of the generated script.</param>
/// <param name="commonOnly">Forwarded to DownloadUCSCKnownVariantSites.</param>
/// <param name="reference">Forwarded to the download and conversion steps.</param>
/// <param name="dryRun">When true, no indexing script is generated or run.</param>
/// <returns>The value of EnsemblKnownSitesPath.</returns>
public string DownloadEnsemblKnownVariantSites(string spritzDirectory, bool commonOnly, string reference, bool dryRun)
{
    DownloadUCSCKnownVariantSites(spritzDirectory, commonOnly, reference, dryRun);
    EnsemblKnownSitesPath = ConvertVCFChromosomesUCSC2Ensembl(spritzDirectory, UcscKnownSitesPath, reference, dryRun);

    if (dryRun)
    {
        return EnsemblKnownSitesPath;
    }

    // indexing is used for most GATK tools
    string scriptPath = WrapperUtility.GetAnalysisScriptPath(spritzDirectory, "IndexKnownVariantSites.bash");
    List<string> commands = new List<string> { WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory) };
    foreach (string knownSitesVcf in new[] { UcscKnownSitesPath, EnsemblKnownSitesPath })
    {
        string vcfArg = WrapperUtility.ConvertWindowsPath(knownSitesVcf);
        commands.Add($"if [ ! -f {vcfArg}.idx ]; then {Gatk(Workers)} IndexFeatureFile -F {vcfArg}; fi");
    }
    WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();

    return EnsemblKnownSitesPath;
}
/// <summary>
/// Trims single-end or paired-end reads with skewer, unless the trimmed files already exist or this is a dry run.
/// Only the first two entries of <paramref name="readPaths"/> are used; any further entries are ignored.
/// </summary>
/// <param name="spritzDirectory">Spritz root; the adapter file is taken from Tools/BBMap/resources/adapters.fa under it.</param>
/// <param name="analysisDirectory">Directory passed to WrapperUtility.GetAnalysisScriptPath for the generated script.</param>
/// <param name="threads">Value passed to skewer -t.</param>
/// <param name="qualityFilter">Value passed to skewer -q.</param>
/// <param name="readPaths">One fastq (single-end) or two (paired-end); a ".gz" extension on the first file is stripped when naming outputs.</param>
/// <param name="dryRun">When true, the output paths are computed but skewer is not run.</param>
/// <param name="readTrimmedPaths">Receives the trimmed fastq path(s): one entry for single-end input, two for paired-end, empty when there is no input.</param>
/// <param name="log">Receives the path of the "-trimmed.log" file, or "" when there is no input.</param>
public static void Trim(string spritzDirectory, string analysisDirectory, int threads, int qualityFilter, string[] readPaths, bool dryRun, out string[] readTrimmedPaths, out string log)
{
    log = "";

    // Treat a missing array like an empty one instead of failing with a NullReferenceException
    if (readPaths == null || readPaths.Length == 0)
    {
        readTrimmedPaths = new string[0];
        return;
    }

    bool paired = readPaths.Length > 1;

    // Output names are all derived from the first read file, with ".gz" and then the fastq extension removed
    bool compressed = Path.GetExtension(readPaths[0]) == ".gz";
    string firstUncompressed = compressed
        ? Path.Combine(Path.GetDirectoryName(readPaths[0]), Path.GetFileNameWithoutExtension(readPaths[0]))
        : readPaths[0];
    string outputDirectory = Path.GetDirectoryName(firstUncompressed);
    string outputName = Path.GetFileNameWithoutExtension(firstUncompressed);

    // Size the result to the files that are actually trimmed, so callers never see null slots
    // for the ignored third-and-later inputs.
    string[] trimmedPaths = paired
        ? new[]
        {
            Path.Combine(outputDirectory, outputName + "-trimmed-pair1.fastq"),
            Path.Combine(outputDirectory, outputName + "-trimmed-pair2.fastq")
        }
        : new[] { Path.Combine(outputDirectory, outputName + "-trimmed.fastq") };
    readTrimmedPaths = trimmedPaths;
    log = Path.Combine(outputDirectory, outputName + "-trimmed.log");

    bool alreadyTrimmed = trimmedPaths.All(File.Exists);
    if (alreadyTrimmed || dryRun)
    {
        return;
    }

    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Skewered.bash");
    WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "skewer-0.2.2/skewer" +
            " -q " + qualityFilter.ToString() +
            " -o " + WrapperUtility.ConvertWindowsPath(Path.Combine(outputDirectory, outputName)) +
            " -t " + threads.ToString() +
            " -x " + WrapperUtility.ConvertWindowsPath(Path.Combine(spritzDirectory, "Tools", "BBMap", "resources", "adapters.fa")) +
            " " + WrapperUtility.ConvertWindowsPath(readPaths[0]) +
            (paired ? " " + WrapperUtility.ConvertWindowsPath(readPaths[1]) : ""),
    }).WaitForExit();
}
/// <summary>
/// Downloads the dbSNP reference VCF file if it doesn't exist: fetches TargetFileLocation with wget,
/// decompresses it and moves it to UcscKnownSitesPath.
/// </summary>
/// <param name="spritzDirectory">Working directory for the download; also used as the location of the generated script.</param>
/// <param name="commonOnly">Forwarded to KnownVariantSitesFileExists.</param>
/// <param name="reference">Forwarded to KnownVariantSitesFileExists.</param>
/// <param name="dryRun">When true, nothing is downloaded.</param>
/// <returns>The value of UcscKnownSitesPath.</returns>
public string DownloadUCSCKnownVariantSites(string spritzDirectory, bool commonOnly, string reference, bool dryRun)
{
    // NOTE(review): TargetFileLocation and the Ucsc* paths are presumably set by this call — confirm.
    bool knownSitesExists = KnownVariantSitesFileExists(spritzDirectory, commonOnly, reference);
    if (knownSitesExists || dryRun)
    {
        return UcscKnownSitesPath;
    }

    string downloadArg = WrapperUtility.ConvertWindowsPath(UcscKnownSitesDownloadPath);
    string scriptPath = WrapperUtility.GetAnalysisScriptPath(spritzDirectory, "DownloadUcscVariants.bash");
    WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
    {
        "cd " + WrapperUtility.ConvertWindowsPath(spritzDirectory),
        "wget " + TargetFileLocation,
        "gunzip " + downloadArg + ".gz",

        // gunzip already deletes the .gz after decompressing, so a plain "rm" always failed here;
        // -f keeps the clean-up for the case where the archive is still around without raising an error.
        "rm -f " + downloadArg + ".gz",
        "mv " + downloadArg + " " + WrapperUtility.ConvertWindowsPath(UcscKnownSitesPath)
    }).WaitForExit();

    return UcscKnownSitesPath;
}
/// <summary>
/// Bundles splice junctions from first-pass alignments into a single splice junction file for
/// second-pass alignment. Lines containing "MT" are filtered out, which excludes splice junctions
/// for mitochondrial chromosome alignments.
/// </summary>
/// <param name="spliceJunctionOuts">Splice junction files from the first pass; must not be empty.</param>
/// <param name="uniqueSuffix">Number appended to "combined" in the output file name.</param>
/// <param name="spliceJunctionStarts">Receives the combined file path, located next to the first input file.</param>
/// <returns>A single bash command that builds the combined file if it does not exist yet.</returns>
/// <exception cref="ArgumentException"><paramref name="spliceJunctionOuts"/> is empty.</exception>
public static List<string> ProcessFirstPassSpliceCommands(List<string> spliceJunctionOuts, int uniqueSuffix, out string spliceJunctionStarts)
{
    if (spliceJunctionOuts.Count == 0)
    {
        throw new ArgumentException("STARWrapper.ProcessFirstPassSpliceCommands: No splice junctions detected for second-pass genome generation.");
    }

    string outputDirectory = Path.GetDirectoryName(spliceJunctionOuts[0]);
    string combinedName = string.Concat("combined", uniqueSuffix.ToString(), ".", SpliceJunctionFileSuffix);
    string combinedPath = Path.Combine(outputDirectory, combinedName);
    spliceJunctionStarts = combinedPath;

    // awk keeps rows whose fifth column is positive and prints columns 1-3 plus the strand character
    const string awkProgram = "awk 'BEGIN {OFS=\"\t\"; strChar[0]=\".\"; strChar[1]=\"+\"; strChar[2]=\"-\";} {if($5>0){print $1,$2,$3,strChar[$4]}}' ";

    string combinedArg = WrapperUtility.ConvertWindowsPath(combinedPath);
    List<string> inputArgs = new List<string>();
    foreach (string junctionFile in spliceJunctionOuts)
    {
        inputArgs.Add(WrapperUtility.ConvertWindowsPath(junctionFile));
    }

    string command = string.Concat(
        "if [ ! -f ", combinedArg, " ]; then ",
        awkProgram,
        string.Join(" ", inputArgs),
        " | grep -v 'MT' >> ", combinedArg,
        "; fi");

    return new List<string> { command };
}
/// <summary>
/// Downloads Ensembl references for GRCh37 or GRCh38.
///
/// Sets GenomeFastaPath, GtfGeneModelPath, Gff3GeneModelPath, and ProteinFastaPath properties
/// (all to "" when <paramref name="reference"/> is neither build). For GRCh37 the GFF3 path is the GTF path.
/// </summary>
/// <param name="spritzDirectory">Not used by this method.</param>
/// <param name="targetDirectory">Directory the files are downloaded into; also used as the location of the generated script.</param>
/// <param name="reference">"GRCh37" or "GRCh38" (case-insensitive).</param>
/// <param name="dryRun">When true, the path properties are set but nothing is downloaded.</param>
public void DownloadReferences(string spritzDirectory, string targetDirectory, string reference, bool dryRun)
{
    // Build names are identifiers, not linguistic text, so compare ordinally
    bool downloadGrch37 = string.Equals(reference, "GRCh37", StringComparison.OrdinalIgnoreCase);
    bool downloadGrch38 = string.Equals(reference, "GRCh38", StringComparison.OrdinalIgnoreCase);

    if (downloadGrch37)
    {
        GenomeFastaPath = Path.Combine(targetDirectory, GRCh37PrimaryAssemblyFilename);
        GtfGeneModelPath = Path.Combine(targetDirectory, GRCh37GtfGeneModelFilename);
        Gff3GeneModelPath = GtfGeneModelPath; // GRCh37 reuses the GTF in place of a GFF3
        ProteinFastaPath = Path.Combine(targetDirectory, GRCh37ProteinFastaFilename);
    }
    else if (downloadGrch38)
    {
        GenomeFastaPath = Path.Combine(targetDirectory, GRCh38PrimaryAssemblyFilename);
        GtfGeneModelPath = Path.Combine(targetDirectory, GRCh38GtfGeneModelFilename);
        Gff3GeneModelPath = Path.Combine(targetDirectory, GRCh38Gff3GeneModelFilename);
        ProteinFastaPath = Path.Combine(targetDirectory, GRCh38ProteinFastaFilename);
    }
    else
    {
        GenomeFastaPath = "";
        GtfGeneModelPath = "";
        Gff3GeneModelPath = "";
        ProteinFastaPath = "";
    }

    if ((!downloadGrch37 && !downloadGrch38) || dryRun)
    {
        return;
    }

    WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(targetDirectory, "DownloadEnsemblReference.bash"), new List<string>
    {
        $"cd {WrapperUtility.ConvertWindowsPath(targetDirectory)}",
        DownloadAndGunzipCommand(downloadGrch38 ? GRCh38PrimaryAssemblyUrl : GRCh37PrimaryAssemblyUrl, Path.GetFileName(GenomeFastaPath)),
        DownloadAndGunzipCommand(downloadGrch38 ? GRCh38GtfGeneModelUrl : GRCh37GtfGeneModelUrl, Path.GetFileName(GtfGeneModelPath)),
        DownloadAndGunzipCommand(downloadGrch38 ? GRCh38Gff3GeneModelUrl : GRCh37GtfGeneModelUrl, Path.GetFileName(Gff3GeneModelPath)), // note GRCh37 calls the gtf url instead
        DownloadAndGunzipCommand(downloadGrch38 ? GRCh38ProteinFastaUrl : GRCh37ProteinFastaUrl, Path.GetFileName(ProteinFastaPath)),
    }).WaitForExit();

    // TODO: try writing the genome fasta in karyotypic order before alignments
    // (Genome.WriteFasta(new Genome(path).KaryotypicOrder(), path)); does the gtf then need to be reordered?
}

/// <summary>
/// Builds the bash command that streams a gzipped URL through gunzip into <paramref name="fileName"/>
/// (relative to the current directory) unless that file already exists and is non-empty.
/// </summary>
private static string DownloadAndGunzipCommand(string url, string fileName)
{
    // "-s" rather than "-f": an empty file left by an earlier failed download must not block a retry.
    // If gunzip reports a failure (e.g. truncated stream), the partial output is removed for the same reason.
    return $"if [ ! -s {fileName} ]; then wget -O - {url} | gunzip -c > {fileName} || rm -f {fileName}; fi";
}