/// <summary>
/// Creates recalibration table and recalibrates reads.
/// Sets <c>RecalibrationTablePath</c> and <c>RecalibratedBamPath</c> to files in the same directory as
/// <paramref name="bam"/>, named after it with the ".recaltable" and ".recal.bam" suffixes.
/// </summary>
/// <param name="spritzDirectory">Passed to <c>WrapperUtility.ChangeToToolsDirectoryCommand</c> for the first command.</param>
/// <param name="analysisDirectory">Not used by this method.</param>
/// <param name="genomeFasta">Reference fasta given to GATK with <c>-R</c>; index and dictionary commands are emitted for it.</param>
/// <param name="bam">Input BAM given to BaseRecalibrator and ApplyBQSR with <c>-I</c>.</param>
/// <param name="knownSitesVcf">VCF given to BaseRecalibrator with <c>--known-sites</c>; null or empty means none is passed.</param>
/// <returns>The shell commands, in execution order. A skipped step is represented by an empty string.</returns>
public List<string> BaseRecalibration(string spritzDirectory, string analysisDirectory, string genomeFasta, string bam, string knownSitesVcf)
{
    string bamDirectory = Path.GetDirectoryName(bam);
    string bamBaseName = Path.GetFileNameWithoutExtension(bam);
    RecalibrationTablePath = Path.Combine(bamDirectory, bamBaseName + ".recaltable");
    RecalibratedBamPath = Path.Combine(bamDirectory, bamBaseName + ".recal.bam");

    string fasta = WrapperUtility.ConvertWindowsPath(genomeFasta);
    string inputBam = WrapperUtility.ConvertWindowsPath(bam);
    string recalTable = WrapperUtility.ConvertWindowsPath(RecalibrationTablePath);
    string recalBam = WrapperUtility.ConvertWindowsPath(RecalibratedBamPath);

    // The known-sites VCF is optional: without it there is nothing to index and no --known-sites argument.
    // Previously the index command was emitted unconditionally, yielding "IndexFeatureFile -F " with no file.
    bool hasKnownSites = !string.IsNullOrEmpty(knownSitesVcf);
    string knownSites = hasKnownSites ? WrapperUtility.ConvertWindowsPath(knownSitesVcf) : "";

    // check that reference VCF is indexed
    string indexKnownSitesCommand = hasKnownSites
        ? "if [ ! -f " + knownSites + ".idx ]; then " + Gatk(Workers) + " IndexFeatureFile -F " + knownSites + "; fi"
        : "";

    // Only build the table when it is missing or empty
    string baseRecalibratorCommand =
        "if [[ ( ! -f " + recalTable + " || ! -s " + recalTable + " ) ]]; then " +
        Gatk(Workers) +
        " BaseRecalibrator" +
        " -R " + fasta +
        " -I " + inputBam +
        (hasKnownSites ? " --known-sites " + knownSites : "") +
        " -O " + recalTable +
        "; fi";

    // Only apply the recalibration when the recalibrated BAM is missing or empty
    string applyBqsrCommand =
        "if [[ ( ! -f " + recalBam + " || ! -s " + recalBam + " ) ]]; then " +
        Gatk(Workers) +
        " ApplyBQSR" +
        " -R " + fasta +
        " -I " + inputBam +
        " --bqsr-recal-file " + recalTable +
        " -O " + recalBam +
        "; fi";

    return new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
        GenomeDictionaryIndexCommand(genomeFasta),
        indexKnownSitesCommand,
        baseRecalibratorCommand,
        applyBqsrCommand,
        SamtoolsWrapper.IndexBamCommand(RecalibratedBamPath),
    };
}
/// <summary>
/// Aligns reads and outputs alignment map and chimeric alignments. Duplicate reads are removed (deduped) from the alignment map, a step that's recommended for variant calling.
/// Note: fastqs must have \n line endings, not \r\n.
/// </summary>
/// <param name="spritzDirectory">Passed to <c>WrapperUtility.ChangeToToolsDirectoryCommand</c> for the first command.</param>
/// <param name="threads">Value for STAR's <c>--runThreadN</c>; also passed to <c>StarDedupCommand</c>.</param>
/// <param name="genomeDir">Value for STAR's <c>--genomeDir</c>.</param>
/// <param name="fastqFiles">Read files for <c>--readFilesIn</c>, joined with spaces. A ".gz" or ".bz2" extension on any file selects a decompressing <c>--readFilesCommand</c>.</param>
/// <param name="outprefix">Value for <c>--outFileNamePrefix</c>; output BAM paths are this prefix plus the class's BAM suffixes.</param>
/// <param name="overwriteStarAlignment">When true, the "skip if the output already exists and is non-empty" guards around alignment and deduplication are omitted.</param>
/// <param name="strandSpecific">Not used by this method.</param>
/// <param name="genomeLoad">Value for STAR's <c>--genomeLoad</c>.</param>
/// <returns>The shell commands, in execution order. Omitted guard lines are represented by empty strings.</returns>
public static List<string> AlignRNASeqReadsForVariantCalling(string spritzDirectory, int threads, string genomeDir, string[] fastqFiles, string outprefix, bool overwriteStarAlignment, bool strandSpecific = true, STARGenomeLoadOption genomeLoad = STARGenomeLoadOption.NoSharedMemory)
{
    string reads_in = string.Join(" ", fastqFiles.Select(f => WrapperUtility.ConvertWindowsPath(f)));

    // STAR reads the stdout of this command, so it has to DEcompress.
    // "bzip2 -c" (the previous value) compresses; "bunzip2 -c" is the decompressing form.
    string read_command;
    if (fastqFiles.Any(f => Path.GetExtension(f) == ".gz"))
    {
        read_command = " --readFilesCommand zcat -c";
    }
    else if (fastqFiles.Any(f => Path.GetExtension(f) == ".bz2"))
    {
        read_command = " --readFilesCommand bunzip2 -c";
    }
    else
    {
        read_command = "";
    }

    // The BAM sorting RAM limit is taken from this process's virtual memory size; dispose the handle once read.
    string bamSortRamLimit;
    using (Process currentProcess = Process.GetCurrentProcess())
    {
        bamSortRamLimit = currentProcess.VirtualMemorySize64.ToString();
    }

    string alignmentArguments =
        " --genomeLoad " + genomeLoad.ToString() +
        " --runMode alignReads" +
        " --runThreadN " + threads.ToString() +
        " --genomeDir " + WrapperUtility.ConvertWindowsPath(genomeDir) +
        " --readFilesIn " + reads_in +
        " --outSAMtype BAM SortedByCoordinate" +
        " --outBAMcompression 10" +
        " --limitBAMsortRAM " + bamSortRamLimit +
        " --outFileNamePrefix " + WrapperUtility.ConvertWindowsPath(outprefix) +

        // chimeric junction settings (currently disabled)
        //" --chimSegmentMin 12" +
        //" --chimJunctionOverhangMin 12" +
        //" --alignSJDBoverhangMin 10" +
        //" --alignMatesGapMax 100000" +
        //" --alignIntronMax 100000" +
        //" --chimSegmentReadGapMax 3" +
        //" --alignSJstitchMismatchNmax 5 -1 5 5" +

        // stringtie parameters
        " --outSAMstrandField intronMotif" + // adds XS tag to all alignments that contain a splice junction
        " --outFilterIntronMotifs RemoveNoncanonical" + // for cufflinks

        // gatk parameters
        " --outSAMattrRGline ID:1 PU:platform PL:illumina SM:sample LB:library" + // this could shorten the time for samples that aren't multiplexed in preprocessing for GATK
        " --outSAMmapqUnique 60" + // this is used to ensure compatibility with GATK without having to use the GATK hacks
        read_command; // note in the future, two sets of reads can be comma separated here, and the RGline can also be comma separated to distinguish them later

    string sortedBam = WrapperUtility.ConvertWindowsPath(outprefix + SortedBamFileSuffix);
    string dedupedBam = WrapperUtility.ConvertWindowsPath(outprefix + DedupedBamFileSuffix);

    // Evaluated now, while the script is being generated, not when the script runs
    bool removeLoadedGenome =
        File.Exists(outprefix + BamFileSuffix)
        && File.Exists(outprefix + DedupedBamFileSuffix)
        && genomeLoad == STARGenomeLoadOption.LoadAndRemove;

    return new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

        // align, unless a non-empty sorted BAM is already there
        overwriteStarAlignment ? "" : "if [[ ( ! -f " + sortedBam + " || ! -s " + sortedBam + " ) ]]; then",
        " STAR" + alignmentArguments,
        overwriteStarAlignment ? "" : "fi",
        SamtoolsWrapper.IndexBamCommand(sortedBam),

        // dedup, unless a non-empty deduped BAM is already there
        overwriteStarAlignment ? "" : "if [[ ( ! -f " + dedupedBam + " || ! -s " + dedupedBam + " ) ]]; then",
        " " + StarDedupCommand(threads, outprefix + SortedBamFileSuffix, outprefix + Path.GetFileNameWithoutExtension(SortedBamFileSuffix)),
        overwriteStarAlignment ? "" : "fi",
        SamtoolsWrapper.IndexBamCommand(dedupedBam),

        removeLoadedGenome ? "STAR --genomeLoad " + STARGenomeLoadOption.Remove.ToString() : "",
    };
}
/// <summary>
/// Groups (I'm just using one group, so it's more a formality) and sorts reads. Marks duplicates.
/// Creates a "tmp" directory under <paramref name="spritzDirectory"/> and sets <c>PreparedBamPath</c>
/// to the marked-duplicates BAM path.
/// </summary>
/// <param name="spritzDirectory">Passed to <c>WrapperUtility.ChangeToToolsDirectoryCommand</c>; also the parent of the "tmp" directory handed to GATK as <c>--TMP_DIR</c>.</param>
/// <param name="analysisDirectory">Not used by this method.</param>
/// <param name="threads">Not used by this method.</param>
/// <param name="bam">Input BAM; every intermediate and output file is written next to it.</param>
/// <param name="genomeFasta">Reference fasta for which index and dictionary commands are emitted.</param>
/// <param name="reference">Not used by this method.</param>
/// <returns>The shell commands, in execution order.</returns>
public List<string> PrepareBamAndFasta(string spritzDirectory, string analysisDirectory, int threads, string bam, string genomeFasta, string reference)
{
    string bamDirectory = Path.GetDirectoryName(bam);
    string bamBaseName = Path.GetFileNameWithoutExtension(bam);

    string sortedCheckPath = Path.Combine(bamDirectory, bamBaseName + ".headerSorted");
    string readGroupedCheckfile = Path.Combine(bamDirectory, bamBaseName + ".headerReadGrouped");

    // Each output name is derived from the previous one: <bam>.sorted.grouped.bam, <bam>.sorted.grouped.marked.bam, ...
    string sortedBam = Path.Combine(bamDirectory, bamBaseName + ".sorted.bam");
    string groupedBamPath = Path.Combine(Path.GetDirectoryName(sortedBam), Path.GetFileNameWithoutExtension(sortedBam) + ".grouped.bam");
    string markedDuplicatesBamPath = Path.Combine(Path.GetDirectoryName(groupedBamPath), Path.GetFileNameWithoutExtension(groupedBamPath) + ".marked.bam");
    string markedDuplicateMetrics = Path.Combine(Path.GetDirectoryName(groupedBamPath), Path.GetFileNameWithoutExtension(groupedBamPath) + ".marked.metrics");

    string tmpDir = Path.Combine(spritzDirectory, "tmp");
    Directory.CreateDirectory(tmpDir);

    string inputBam = WrapperUtility.ConvertWindowsPath(bam);
    string groupedBam = WrapperUtility.ConvertWindowsPath(groupedBamPath);
    string markedBam = WrapperUtility.ConvertWindowsPath(markedDuplicatesBamPath);
    string tmp = WrapperUtility.ConvertWindowsPath(tmpDir);

    List<string> commands = new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
        GenomeDictionaryIndexCommand(genomeFasta),

        // record whether the header declares coordinate sorting and read groups
        "samtools view -H " + inputBam + " | grep SO:coordinate > " + WrapperUtility.ConvertWindowsPath(sortedCheckPath),
        "samtools view -H " + inputBam + " | grep '^@RG' > " + WrapperUtility.ConvertWindowsPath(readGroupedCheckfile),

        // group and sort (note, using picard-tools works, but picard.jar somehow is truncating the BAM files)
        // Skipped when either the grouped BAM or the final marked BAM is already present and non-empty.
        "if [[ ( ! -f " + groupedBam + " || ! -s " + groupedBam + " ) && " +
            " ( ! -f " + markedBam + " || ! -s " + markedBam + " ) ]]; then " +
            Gatk(Workers, 2) +
            " AddOrReplaceReadGroups" +
            " -PU platform -PL illumina -SM sample -LB library" +
            " -I " + inputBam +
            " -O " + groupedBam +
            " -SO coordinate" +
            " --TMP_DIR " + tmp +
            "; fi",
        // NOTE(review): once clean-up has removed the grouped BAM, a re-run reaches this index command
        // with the BAM gone — confirm SamtoolsWrapper.IndexBamCommand tolerates that.
        SamtoolsWrapper.IndexBamCommand(groupedBamPath),

        // mark duplicates (AS means assume sorted; note, using picard-tools works, but picard.jar somehow is truncating the BAM files)
        "if [[ ( ! -f " + markedBam + " || ! -s " + markedBam + " ) ]]; then " +
            Gatk(Workers) +
            " MarkDuplicates" + // formerly picard
            " -I " + groupedBam +
            " -O " + markedBam +
            " -M " + WrapperUtility.ConvertWindowsPath(markedDuplicateMetrics) +
            " --TMP_DIR " + tmp +
            " -AS true" +
            "; fi",
        SamtoolsWrapper.IndexBamCommand(markedDuplicatesBamPath),

        // clean up: drop the intermediate grouped BAM only once the marked BAM exists and is non-empty.
        // (The test used to be inverted, deleting the intermediate exactly when the final output was missing.)
        "if [[ -f " + markedBam + " && -s " + markedBam + " ]]; then " +
            "rm -f " + groupedBam + "; fi",
    };

    PreparedBamPath = markedDuplicatesBamPath;
    return commands;
}
/// <summary>
/// Gets commands to calculate expression against an RSEM reference.
/// Sets <c>OutputPrefix</c> from the first fastq: its directory and base name, then "_", its upper-cased
/// extension, and the hash code of <paramref name="referencePrefix"/>.
/// </summary>
/// <param name="spritzDirectory">Passed to <c>WrapperUtility.ChangeToToolsDirectoryCommand</c> and <c>GetAlignerOption</c>.</param>
/// <param name="referencePrefix">RSEM reference prefix, passed to rsem-calculate-expression.</param>
/// <param name="threads">Value for <c>--num-threads</c>; also passed to <c>SamtoolsWrapper.SortBam</c>.</param>
/// <param name="aligner">Aligner selection, translated by <c>GetAlignerOption</c>.</param>
/// <param name="strandedness">Currently not passed to RSEM (see the note in the body).</param>
/// <param name="fastqPaths">One entry (single-end) or two entries (paired-end). Each entry may be a comma-separated list of files. The array is not modified.</param>
/// <param name="doOuptutBam">True to request <c>--output-genome-bam</c> and to sort and index the genome BAM; false for <c>--no-bam-output</c>.</param>
/// <returns>The shell commands, in execution order.</returns>
/// <exception cref="ArgumentNullException"><paramref name="fastqPaths"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="fastqPaths"/> has no entries or more than two.</exception>
public List<string> CalculateExpressionCommands(string spritzDirectory, string referencePrefix, int threads, RSEMAlignerOption aligner, Strandedness strandedness, string[] fastqPaths, bool doOuptutBam)
{
    if (fastqPaths == null)
    {
        throw new ArgumentNullException(nameof(fastqPaths));
    }
    if (fastqPaths.Length < 1)
    {
        // (paramName, message): the single-string constructor would treat the message as the parameter name
        throw new ArgumentOutOfRangeException(nameof(fastqPaths), "No fastq files were given for RSEM calculate expression.");
    }
    if (fastqPaths.Length > 2)
    {
        throw new ArgumentOutOfRangeException(nameof(fastqPaths), "Too many fastq file types given for RSEM calculate expression.");
    }

    List<string> scriptCommands = new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "cd RSEM-1.3.0",
    };

    // Work on a copy: the entries are rewritten below, and the caller's array must not change underneath it
    string[] analysisFastqPaths = fastqPaths.ToArray();
    string alignerOption = GetAlignerOption(spritzDirectory, aligner);
    string threadOption = "--num-threads " + threads.ToString();

    // NOTE(review): the original code built "--strandedness " + strandedness.ToString().ToLowerInvariant()
    // but never added it to the command line, so strandedness has no effect. It is left out here because
    // the enum's values cannot be checked against RSEM's accepted values from this method alone.

    // Decompress files if needed
    // The '--star-gzipped-read-file' and '--star-bzipped-read-file' options work, but then the rest of RSEM doesn't when using compressed files...
    bool fastqIsGunzipped = analysisFastqPaths[0].EndsWith("gz", StringComparison.Ordinal);
    bool fastqIsBunzipped = analysisFastqPaths[0].EndsWith("bz2", StringComparison.Ordinal)
        || analysisFastqPaths[0].EndsWith("bz", StringComparison.Ordinal);
    if (fastqIsGunzipped || fastqIsBunzipped)
    {
        string decompressionCommand = fastqIsGunzipped ? "gunzip" : "bunzip2";
        for (int i = 0; i < analysisFastqPaths.Length; i++)
        {
            // An entry may list several files separated by commas; each needs its own command and
            // its own extension stripped, otherwise only the last file of the list is handled.
            string[] files = analysisFastqPaths[i].Split(',');
            for (int j = 0; j < files.Length; j++)
            {
                scriptCommands.Add($"{decompressionCommand} --keep {WrapperUtility.ConvertWindowsPath(files[j])}");
                files[j] = Path.ChangeExtension(files[j], null);
            }
            analysisFastqPaths[i] = string.Join(",", files);
        }
    }

    string firstMates = string.Join(",", analysisFastqPaths[0].Split(',').Select(f => WrapperUtility.ConvertWindowsPath(f)));
    string inputOption = analysisFastqPaths.Length == 1
        ? firstMates
        : "--paired-end " + firstMates + " " +
            string.Join(",", analysisFastqPaths[1].Split(',').Select(f => WrapperUtility.ConvertWindowsPath(f)));

    string bamOption = doOuptutBam ? "--output-genome-bam" : "--no-bam-output";

    // NOTE(review): string.GetHashCode() may not be stable across processes/runtimes, in which case the
    // prefix (and the skip-if-exists guards below) would differ between runs — confirm for the target runtime.
    string firstFastq = analysisFastqPaths[0].Split(',')[0];
    OutputPrefix = Path.Combine(
        Path.GetDirectoryName(firstFastq),
        Path.GetFileNameWithoutExtension(firstFastq) +
            "_" +
            Path.GetExtension(firstFastq).TrimStart('.').ToUpperInvariant() + // TrimStart: no exception when there is no extension
            referencePrefix.GetHashCode().ToString());

    // RSEM likes to sort the transcript.bam file, which takes forever and isn't very useful, I've found. Just sort the genome.bam file instead
    string sortedGenomeBam = WrapperUtility.ConvertWindowsPath(OutputPrefix + GenomeSortedBamSuffix);
    string samtoolsCommands = !doOuptutBam ?
        "" :
        // run when the sorted BAM is missing OR empty, matching the guards used by the other wrappers
        "if [[ ( ! -f " + sortedGenomeBam + " || ! -s " + sortedGenomeBam + " ) ]]; then\n" +
            " " + SamtoolsWrapper.SortBam(OutputPrefix + GenomeBamSuffix, threads) + "\n" +
            " " + SamtoolsWrapper.IndexBamCommand(OutputPrefix + GenomeSortedBamSuffix) + "\n" +
        "fi";

    // construct the commands
    string isoformResults = WrapperUtility.ConvertWindowsPath(OutputPrefix + IsoformResultsSuffix);
    scriptCommands.AddRange(new List<string>
    {
        "if [[ ( ! -f " + isoformResults + " || ! -s " + isoformResults + " ) ]]; then " +
            "./rsem-calculate-expression " +
            "--time " + // include timed results
            "--calc-ci " + // posterior calculation of 95% confidence intervals
            alignerOption + " " +
            threadOption + " " +
            bamOption + " " +
            inputOption + " " +
            WrapperUtility.ConvertWindowsPath(referencePrefix) + " " +
            WrapperUtility.ConvertWindowsPath(OutputPrefix) +
        "; fi",
        samtoolsCommands
    });
    return scriptCommands;
}
/// <summary>
/// Splits and trims splice junction reads with SplitNCigarReads.
/// Apparently cigars are genomic intervals, and splice junctions are represented by a bunch of N's (unknown nucleotide); HaplotypeCaller requires splitting them in the BAM file.
///
/// It's tempting to want to run a few of these at the same time because it's not well parallelized. It's just not worth it. It uses quite a bit of RAM and racks the I/O at the beginning when reading the BAM files.
/// Could possibly do 4 at a time on 128 GB RAM and 28 processors.
///
/// Sets <c>SplitTrimBamPath</c> to "&lt;dedupedBam without extension&gt;.fixedQuals.split.bam" in the same directory.
/// </summary>
/// <param name="spritzDirectory">Passed to <c>WrapperUtility.ChangeToToolsDirectoryCommand</c> for the first command.</param>
/// <param name="genomeFasta">Reference fasta given to SplitNCigarReads with <c>-R</c>; index and dictionary commands are emitted for it.</param>
/// <param name="dedupedBam">Input BAM; it is indexed, passed through FixMisencodedBaseQualityReads, and then split.</param>
/// <returns>The shell commands, in execution order.</returns>
public List<string> SplitNCigarReads(string spritzDirectory, string genomeFasta, string dedupedBam)
{
    string fixedQualsBam = Path.Combine(Path.GetDirectoryName(dedupedBam), Path.GetFileNameWithoutExtension(dedupedBam) + ".fixedQuals.bam");
    SplitTrimBamPath = Path.Combine(Path.GetDirectoryName(fixedQualsBam), Path.GetFileNameWithoutExtension(fixedQualsBam) + ".split.bam");

    string inputBam = WrapperUtility.ConvertWindowsPath(dedupedBam);
    string fixedBam = WrapperUtility.ConvertWindowsPath(fixedQualsBam);
    string splitBam = WrapperUtility.ConvertWindowsPath(SplitTrimBamPath);

    // This also filters malformed reads
    string fixMisencodedQualsCmd =
        Gatk(Workers) +
        " FixMisencodedBaseQualityReads" +
        " -I " + inputBam +
        " -O " + fixedBam;

    // The two split commands differ only in their input BAM.
    // Options tried previously and left out: --num_threads (not supported), -rf ReassignOneMappingQuality
    // with -RMQF 255 / -RMQT 60 (mapping quality is already set to 60 by STAR), -U ALLOW_N_CIGAR_READS.
    string splitCommandStart =
        Gatk(Workers) +
        " SplitNCigarReads" +
        " -R " + WrapperUtility.ConvertWindowsPath(genomeFasta) +
        " -I ";
    string splitCommandEnd = " -O " + splitBam;
    string splitFixedQualsCmd = splitCommandStart + fixedBam + splitCommandEnd;
    string splitOriginalCmd = splitCommandStart + inputBam + splitCommandEnd;

    List<string> commands = new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
        GenomeDictionaryIndexCommand(genomeFasta),

        // split and trim reads (some datasets are probably going to have misencoded quality scores; -fixMisencodedQuals just subtracts 31 from all quality scores if possible...)
        // exit code of 2 means that FixMisencodedBaseQualityReads errored out because the base quality scores were already correctly encoded
        SamtoolsWrapper.IndexBamCommand(dedupedBam),

        // Guard on the final split BAM rather than the intermediate fixed-quals BAM: the intermediate may
        // never be written (exit code 2), and its presence alone does not mean the split has been done.
        "if [[ ( ! -f " + splitBam + " || ! -s " + splitBam + " ) ]]; then",
        " " + fixMisencodedQualsCmd,
        " if [ $? -ne 2 ]; then",
        " " + splitFixedQualsCmd, // qualities were re-encoded: split the fixed BAM
        " else",
        " " + splitOriginalCmd, // qualities were already fine: split the original BAM
        " fi",
        "fi",
        SamtoolsWrapper.IndexBamCommand(SplitTrimBamPath),
    };
    return commands;
}