/// <summary>
/// Generates the bash script that downloads samtools and builds it, together with the htslib
/// copy that ships inside the samtools release directory.
/// </summary>
/// <param name="spritzDirectory">Directory handed to the WrapperUtility helpers to resolve the script path and the tools directory.</param>
/// <returns>Path of the generated installation script.</returns>
public string WriteInstallScript(string spritzDirectory)
{
    string installScript = WrapperUtility.GetInstallationScriptPath(spritzDirectory, "InstallSamtools.bash");

    string release = $"samtools-{SamtoolsVersion}";
    string tarball = $"{release}.tar.bz2";

    var lines = new List<string>();
    lines.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));

    // Everything is skipped when the release directory is already present.
    lines.Add($"if [ ! -d {release} ]; then");
    lines.Add($" wget --no-check https://github.com/samtools/samtools/releases/download/{SamtoolsVersion}/{tarball}");
    lines.Add($" tar -jxvf {tarball}");
    lines.Add($" rm {tarball}");

    // Build htslib first, from its folder inside the samtools release.
    lines.Add($" cd {release}/htslib-{SamtoolsVersion}");
    lines.Add(" ./configure"); // configures install to /usr/local/bin and /usr/local/share
    lines.Add(" make");
    lines.Add(" make install");

    // Then step back up and build samtools itself.
    lines.Add(" cd ..");
    lines.Add(" ./configure");
    lines.Add(" make");
    lines.Add(" make install");
    lines.Add("fi");

    WrapperUtility.GenerateScript(installScript, lines);
    return installScript;
}
/// <summary>
/// Builds the commands that merge several transcript models (GTF) into one GTF with "stringtie --merge".
/// </summary>
/// <param name="spritzDirectory">Directory handed to WrapperUtility to produce the change-to-tools-directory command.</param>
/// <param name="geneModelGtfOrGffPath">Reference gene model passed to stringtie with -G.</param>
/// <param name="transcriptGtfPaths">Transcript GTF files to merge; their resolved paths are written to a list file.</param>
/// <param name="combinedTranscriptGtfOutputPath">Merged GTF to produce; the merge is skipped when it already exists and is non-empty.</param>
/// <returns>The shell commands, one per list element.</returns>
public static List<string> MergeTranscriptPredictions(string spritzDirectory, string geneModelGtfOrGffPath, List<string> transcriptGtfPaths, string combinedTranscriptGtfOutputPath)
{
    // The list file sits next to the output and shares its base name.
    string outputFolder = Path.GetDirectoryName(combinedTranscriptGtfOutputPath);
    string outputStem = Path.GetFileNameWithoutExtension(combinedTranscriptGtfOutputPath);
    string listFile = Path.Combine(outputFolder, outputStem) + "_gtflist.txt";

    string changeDirectory = WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory);

    string quotedInputs = string.Join("\" \"", transcriptGtfPaths.Select(gtf => WrapperUtility.ConvertWindowsPath(gtf)));
    string listFileForShell = WrapperUtility.ConvertWindowsPath(listFile);
    string mergedForShell = WrapperUtility.ConvertWindowsPath(combinedTranscriptGtfOutputPath);
    string geneModelForShell = WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath);
    string mergeGap = GapBetweenTranscriptsToMergeTogether.ToString();

    var commands = new List<string>();
    commands.Add(changeDirectory);
    commands.Add($"readlink -f \"{quotedInputs}\" > {listFileForShell}");
    commands.Add($"if [[ ! -f {mergedForShell} || ! -s {mergedForShell} ]]; then ");
    commands.Add($" echo \"Performing stringtie transcript merger on GTF list:{listFile}\"");

    // Options deliberately left at the stringtie defaults:
    //   " -T 0 -F 0" : filtering is done elsewhere; this really does lead to a lot of bad transcript predictions.
    //   " -f 0.01"   : minimum isoform fraction -- use default in stringtie for now
    commands.Add($" stringtie --merge  -G {geneModelForShell} -o {mergedForShell} -g {mergeGap} {listFileForShell}");
    commands.Add("fi");
    return commands;
}
/// <summary>
/// Generates the bash script that installs STAR. The same script also clones and builds seqtk,
/// which is useful for subsetting fastq files.
/// </summary>
/// <param name="spritzDirectory">Directory handed to the WrapperUtility helpers to resolve the script path and the tools directory.</param>
/// <returns>Path of the generated installation script.</returns>
public string WriteInstallScript(string spritzDirectory)
{
    string installScript = WrapperUtility.GetInstallationScriptPath(spritzDirectory, "InstallStar.bash");

    var lines = new List<string>();

    // seqtk: clone and build unless its directory is already there.
    lines.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    lines.Add("if [ ! -d seqtk ]; then ");
    lines.Add(" git clone https://github.com/lh3/seqtk.git");
    lines.Add(" cd seqtk; make");
    lines.Add("fi");

    // STAR: fetch the source archive, build both executables and copy them to /usr/local/bin.
    string starFolder = "STAR-" + STARVersion;
    string tarball = STARVersion + ".tar.gz";
    lines.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    lines.Add($"if [ ! -d {starFolder} ]; then ");
    lines.Add($" wget https://github.com/alexdobin/STAR/archive/{tarball}; tar xvf {tarball}; rm {tarball}");
    lines.Add($" cd {starFolder}/source");
    lines.Add(" make STAR");
    lines.Add(" cp STAR /usr/local/bin");
    lines.Add(" make clean");
    lines.Add(" make STARlong");
    lines.Add(" cp STARlong /usr/local/bin");
    lines.Add("fi");

    WrapperUtility.GenerateScript(installScript, lines);
    return installScript;
}
/// <summary>
/// Writes an installation script for SRAToolkit. The script downloads and unpacks the toolkit
/// only when no existing installation is found in the tools directory.
/// </summary>
/// <param name="spritzDirectory">Directory handed to the WrapperUtility helpers to resolve the script path and the tools directory.</param>
/// <returns>Path of the generated installation script.</returns>
public string WriteInstallScript(string spritzDirectory)
{
    string scriptPath = WrapperUtility.GetInstallationScriptPath(spritzDirectory, "InstallSRAToolkit.bash");
    WrapperUtility.GenerateScript(scriptPath, new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

        // Probe for the toolkit's fasterq-dump executable. The previous probe looked for
        // "faster-dump", which is not a toolkit binary, so it never matched and the archive
        // was downloaded again on every run.
        "if ls sratoolkit*/bin/fasterq-dump 1> /dev/null 2>&1; then",
        " echo \"Found SRAToolkit.\"",
        "else",
        " wget " + DownloadLocation,
        // NOTE(review): the archive name is hard-coded and presumably matches DownloadLocation -- confirm when bumping the version.
        " tar -xvf sratoolkit.2.9.2-ubuntu64.tar.gz",
        " rm sratoolkit.2.9.2-ubuntu64.tar.gz",
        "fi"

        // Disabled Aspera ASCP client installation, kept for reference:
        //"wget " + AscpDownloadLocation,
        //"tar -xvf aspera-connect*.tar.gz",
        //"rm apera-connect*.tar.gz",
        //"echo \"Installing Aspera ASCP Download Client\"",
        //"sudo sh aspera-connect*.sh",
        //"rm ascp-install-3.5.4.102989-linux-64.sh",
    });
    return scriptPath;
}
/// <summary>
/// Gets the commands that calculate expression against an RSEM reference.
/// Also sets <c>OutputPrefix</c>, which is derived from the first fastq path and the reference prefix.
/// </summary>
/// <param name="spritzDirectory">Directory handed to WrapperUtility and GetAlignerOption.</param>
/// <param name="referencePrefix">RSEM reference prefix passed to rsem-calculate-expression.</param>
/// <param name="threads">Thread count passed as --num-threads and to the samtools sort command.</param>
/// <param name="aligner">Aligner selection, translated by GetAlignerOption.</param>
/// <param name="strandedness">Currently not forwarded to RSEM (see the note in the body).</param>
/// <param name="fastqPaths">One element (single-end) or two elements (paired-end); each element may be a comma-separated list of files. The array is not modified.</param>
/// <param name="doOuptutBam">When true, RSEM writes a genome BAM, which is then sorted and indexed.</param>
/// <returns>The shell commands, one per list element.</returns>
/// <exception cref="ArgumentNullException"><paramref name="fastqPaths"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="fastqPaths"/> has zero or more than two elements.</exception>
public List<string> CalculateExpressionCommands(string spritzDirectory, string referencePrefix, int threads, RSEMAlignerOption aligner, Strandedness strandedness, string[] fastqPaths, bool doOuptutBam)
{
    if (fastqPaths == null)
    {
        throw new ArgumentNullException(nameof(fastqPaths));
    }
    // The one-string constructor of ArgumentOutOfRangeException takes a parameter name, so the
    // message has to go through the (paramName, message) overload to be reported as a message.
    if (fastqPaths.Length < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(fastqPaths), "No fastq files were given for RSEM calculate expression.");
    }
    if (fastqPaths.Length > 2)
    {
        throw new ArgumentOutOfRangeException(nameof(fastqPaths), "Too many fastq file types given for RSEM calculate expression.");
    }

    List<string> scriptCommands = new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "cd RSEM-1.3.0",
    };

    // Work on a copy: the paths are rewritten below when the inputs are compressed,
    // and that must not leak into the caller's array.
    string[] analysisFastqPaths = (string[])fastqPaths.Clone();
    string alignerOption = GetAlignerOption(spritzDirectory, aligner);
    string threadOption = "--num-threads " + threads.ToString();
    // NOTE(review): a "--strandedness" option used to be built from the strandedness argument here
    // but was never appended to the command line; it is left out until the intent is confirmed.

    // Decompress files if needed
    // The '--star-gzipped-read-file' and '--star-bzipped-read-file' options work, but then the rest of RSEM doesn't when using compressed files...
    // Only the first entry is inspected; all entries are assumed to share its compression.
    bool fastqIsGunzipped = analysisFastqPaths[0].EndsWith("gz", StringComparison.Ordinal);
    bool fastqIsBunzipped = analysisFastqPaths[0].EndsWith("bz2", StringComparison.Ordinal) || analysisFastqPaths[0].EndsWith("bz", StringComparison.Ordinal);
    if (fastqIsGunzipped || fastqIsBunzipped)
    {
        string decompressionCommand = fastqIsGunzipped ? "gunzip" : "bunzip2";
        for (int i = 0; i < analysisFastqPaths.Length; i++)
        {
            scriptCommands.Add($"{decompressionCommand} --keep {WrapperUtility.ConvertWindowsPath(analysisFastqPaths[i])}");
            analysisFastqPaths[i] = Path.ChangeExtension(analysisFastqPaths[i], null);
        }
    }

    // Each array element may itself be a comma-separated list; convert every member path.
    Func<string, string> convertPathList = list => string.Join(",", list.Split(',').Select(f => WrapperUtility.ConvertWindowsPath(f)));
    string inputOption = analysisFastqPaths.Length == 1
        ? convertPathList(analysisFastqPaths[0])
        : "--paired-end " + convertPathList(analysisFastqPaths[0]) + " " + convertPathList(analysisFastqPaths[1]);

    string bamOption = doOuptutBam ? "--output-genome-bam" : "--no-bam-output";

    // Output prefix: <dir>/<name>_<EXTENSION><hash of referencePrefix>.
    // TrimStart('.') equals the former Substring(1) for any real extension, without throwing when there is none.
    // NOTE(review): string.GetHashCode() is not guaranteed to be stable across processes or runtimes,
    // so the prefix (and the "already done" checks below) may not match between runs -- confirm.
    string firstFastq = analysisFastqPaths[0].Split(',')[0];
    OutputPrefix = Path.Combine(
        Path.GetDirectoryName(firstFastq),
        Path.GetFileNameWithoutExtension(firstFastq) + "_" + Path.GetExtension(firstFastq).TrimStart('.').ToUpperInvariant() + referencePrefix.GetHashCode().ToString());

    // RSEM likes to sort the transcript.bam file, which takes forever and isn't very useful, I've found. Just sort the genome.bam file instead
    // The guard re-runs when the sorted BAM is missing OR empty ("||"); with "&&" an empty file counted as done.
    string samtoolsCommands = !doOuptutBam ?
        "" :
        "if [[ ! -f " + WrapperUtility.ConvertWindowsPath(OutputPrefix + GenomeSortedBamSuffix) + " || ! -s " + WrapperUtility.ConvertWindowsPath(OutputPrefix + GenomeSortedBamSuffix) + " ]]; then\n" +
        " " + SamtoolsWrapper.SortBam(OutputPrefix + GenomeBamSuffix, threads) + "\n" +
        " " + SamtoolsWrapper.IndexBamCommand(OutputPrefix + GenomeSortedBamSuffix) + "\n" +
        "fi";

    // construct the commands
    scriptCommands.AddRange(new List<string>
    {
        // Same missing-or-empty guard as above, on the isoform results file.
        "if [[ ! -f " + WrapperUtility.ConvertWindowsPath(OutputPrefix + IsoformResultsSuffix) + " || ! -s " + WrapperUtility.ConvertWindowsPath(OutputPrefix + IsoformResultsSuffix) + " ]]; then " +
            "./rsem-calculate-expression " +
            "--time " + // include timed results
            "--calc-ci " + // posterior calculation of 95% confidence intervals
            alignerOption + " " +
            threadOption + " " +
            bamOption + " " +
            inputOption + " " +
            WrapperUtility.ConvertWindowsPath(referencePrefix) + " " +
            WrapperUtility.ConvertWindowsPath(OutputPrefix) +
            "; fi",
        samtoolsCommands
    });
    return scriptCommands;
}
/// <summary>
/// Builds the commands that combine several GVCF files, genotype the combined GVCF into a
/// regular VCF and write an indel-free copy of that VCF.
/// Sets <c>HaplotypeCallerGvcfPath</c>, <c>HaplotypeCallerVcfPath</c> and <c>FilteredHaplotypeCallerVcfPath</c>;
/// all three are placed in the directory of the first input GVCF.
/// </summary>
/// <param name="spritzDirectory">Directory handed to WrapperUtility to produce the change-to-tools-directory command.</param>
/// <param name="genomeFasta">Reference genome passed to GATK with -R.</param>
/// <param name="gvcfPaths">GVCF files to combine; at least two are required.</param>
/// <returns>The shell commands, one per list element.</returns>
/// <exception cref="ArgumentException"><paramref name="gvcfPaths"/> is null or has fewer than two entries.</exception>
public List<string> CombineAndGenotypeGvcfs(string spritzDirectory, string genomeFasta, List<string> gvcfPaths)
{
    if (gvcfPaths == null || gvcfPaths.Count <= 1)
    {
        throw new ArgumentException("CombineAndGenotypeGvcfs exception: no gvcfs were specified to combine");
    }

    // Derive a suffix for the combined file name from the input paths.
    // NOTE(review): string.GetHashCode() is not guaranteed to be stable across processes or runtimes,
    // so the same inputs may map to a different file name on another run -- confirm.
    int uniqueSuffix = 1;
    foreach (string f in gvcfPaths)
    {
        uniqueSuffix = uniqueSuffix ^ f.GetHashCode();
    }

    HaplotypeCallerGvcfPath = Path.Combine(Path.GetDirectoryName(gvcfPaths.First()), $"combined{uniqueSuffix}.g.vcf.gz");
    HaplotypeCallerVcfPath = Path.Combine(Path.GetDirectoryName(HaplotypeCallerGvcfPath), $"{Path.GetFileNameWithoutExtension(Path.GetFileNameWithoutExtension(HaplotypeCallerGvcfPath))}.gt.vcf");
    FilteredHaplotypeCallerVcfPath = Path.Combine(Path.GetDirectoryName(HaplotypeCallerGvcfPath), $"{Path.GetFileNameWithoutExtension(HaplotypeCallerGvcfPath)}.NoIndels.vcf");

    List<string> commands = new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
        GenomeDictionaryIndexCommand(genomeFasta)
    };

    string fasta = WrapperUtility.ConvertWindowsPath(genomeFasta);
    string combinedGvcf = WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath);
    string genotypedVcf = WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath);
    string filteredVcf = WrapperUtility.ConvertWindowsPath(FilteredHaplotypeCallerVcfPath);

    foreach (string gvcf in gvcfPaths)
    {
        // double check that the gvcf file is indexed; compressed (.gz) files get a .tbi index
        // rather than .idx, so probing for .idx next to a .gz file re-indexed it on every run
        string indexSuffix = gvcf.EndsWith(".gz", StringComparison.OrdinalIgnoreCase) ? ".tbi" : ".idx";
        commands.Add($"if [ ! -f {WrapperUtility.ConvertWindowsPath(gvcf + indexSuffix)} ]; then {Gatk(Workers)} IndexFeatureFile -F {WrapperUtility.ConvertWindowsPath(gvcf)}; fi");
    }

    // combine GVCFs (skipped when the combined file already exists and is non-empty)
    string combineCommand =
        $"if [ ! -f {combinedGvcf} ] || [ ! -s {combinedGvcf} ]; then " +
        Gatk(Workers) +
        " CombineGVCFs" +
        " -R " + fasta +
        " -V " + string.Join(" -V ", gvcfPaths.Select(gvcf => WrapperUtility.ConvertWindowsPath(gvcf))) +
        " -O " + combinedGvcf +
        "; fi";
    commands.Add(combineCommand);

    // the combined file is always .g.vcf.gz, so its index is the .tbi file
    commands.Add($"if [ ! -f {WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath + ".tbi")} ]; then {Gatk(Workers)} IndexFeatureFile -F {combinedGvcf}; fi");

    // genotype the gvcf file into a traditional vcf file
    string genotypeCommand =
        $"if [ ! -f {genotypedVcf} ] || [ ! -s {genotypedVcf} ]; then " +
        Gatk(Workers) +
        " GenotypeGVCFs" +
        " -R " + fasta +
        " -V " + combinedGvcf +
        " -O " + genotypedVcf +
        "; fi";
    commands.Add(genotypeCommand);

    // filter out indels; the guard must test the FILTERED output. It used to test the genotyped
    // VCF, which the previous step has just produced, so SelectVariants never ran.
    string filterIndelsCommand =
        $"if [ ! -f {filteredVcf} ] || [ ! -s {filteredVcf} ]; then " +
        Gatk(Workers) +
        " SelectVariants" +
        " --select-type-to-exclude INDEL" +
        " -R " + fasta +
        " -V " + genotypedVcf +
        " -O " + filteredVcf +
        "; fi";
    commands.Add(filterIndelsCommand);

    return commands;
}
/// <summary>
/// Builds the commands that run HaplotypeCaller on a single BAM file, genotype the resulting
/// GVCF and write an indel-free copy of the genotyped VCF.
/// Sets <c>HaplotypeCallerGvcfPath</c>, <c>HaplotypeCallerVcfPath</c> and <c>FilteredHaplotypeCallerVcfPath</c>,
/// all placed next to <paramref name="splitTrimBam"/>.
/// </summary>
/// <param name="spritzDirectory">Directory handed to WrapperUtility to produce the change-to-tools-directory command.</param>
/// <param name="experimentType">For RNASequencing, soft-clipped bases are not used by HaplotypeCaller.</param>
/// <param name="threads">Value passed as --native-pair-hmm-threads.</param>
/// <param name="genomeFasta">Reference genome passed to GATK with -R.</param>
/// <param name="splitTrimBam">Input BAM passed to HaplotypeCaller with -I.</param>
/// <param name="dbsnpReferenceVcfPath">Known-variants VCF passed with --dbsnp; indexed first when no .idx file is found.</param>
/// <returns>The shell commands, one per list element.</returns>
public List<string> VariantCalling(string spritzDirectory, ExperimentType experimentType, int threads, string genomeFasta, string splitTrimBam, string dbsnpReferenceVcfPath)
{
    string bamFolder = Path.GetDirectoryName(splitTrimBam);
    string bamStem = Path.GetFileNameWithoutExtension(splitTrimBam);
    HaplotypeCallerGvcfPath = Path.Combine(bamFolder, bamStem + ".g.vcf.gz");
    HaplotypeCallerVcfPath = Path.Combine(bamFolder, bamStem + ".g.gt.vcf");
    FilteredHaplotypeCallerVcfPath = Path.Combine(bamFolder, bamStem + ".g.gt.NoIndels.vcf");

    // NOTE(review): this instance is never used below; kept because its constructor is not visible here.
    var vcftools = new VcfToolsWrapper();

    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add(SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta));
    commands.Add(GenomeDictionaryIndexCommand(genomeFasta));

    string fasta = WrapperUtility.ConvertWindowsPath(genomeFasta);
    string bam = WrapperUtility.ConvertWindowsPath(splitTrimBam);
    string dbsnp = WrapperUtility.ConvertWindowsPath(dbsnpReferenceVcfPath);
    string gvcf = WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath);
    string vcf = WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath);
    string noIndelsVcf = WrapperUtility.ConvertWindowsPath(FilteredHaplotypeCallerVcfPath);
    string threadCount = threads.ToString();

    string softClipOption;
    if (experimentType == ExperimentType.RNASequencing)
    {
        softClipOption = " --dont-use-soft-clipped-bases true";
    }
    else
    {
        softClipOption = "";
    }

    // check that reference VCF is indexed
    commands.Add($"if [ ! -f {dbsnp}.idx ]; then {Gatk(Workers)} IndexFeatureFile -F {dbsnp}; fi");

    // call variants
    // "-ERC GVCF" prompts phasing!
    // "--max-mnp-distance 3": this can't be used for joint genotyping here, but this setting is
    // available in mutect2 for doing tumor vs normal calls
    commands.Add(
        $"if [ ! -f {gvcf} ] || [  ! -s {gvcf} ]; then {Gatk(Workers, 2)} HaplotypeCaller" +
        $" --native-pair-hmm-threads {threadCount} -R {fasta} -I {bam}" +
        $" --min-base-quality-score 20{softClipOption}" +
        $" --dbsnp {dbsnp} -O {gvcf}" +
        " -ERC GVCF --max-mnp-distance 3; fi");

    // index compressed gvcf file
    commands.Add("if [ ! -f " + WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath + ".tbi") + " ]; then " + Gatk(Workers) + " IndexFeatureFile -F " + gvcf + "; fi");

    // genotype the gvcf file into a traditional vcf file
    commands.Add(
        $"if [ ! -f {vcf} ] || [  ! -s {vcf} ]; then {Gatk(Workers, 2)} GenotypeGVCFs" +
        $" -R {fasta} -V {gvcf} -O {vcf}; fi");
    commands.Add("if [ ! -f " + WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath + ".idx") + " ]; then " + Gatk(Workers) + " IndexFeatureFile -F " + vcf + "; fi");

    // filter out indels
    commands.Add(
        $"if [ ! -f {noIndelsVcf} ] || [  ! -s {noIndelsVcf} ]; then {Gatk(Workers, 2)} SelectVariants" +
        $" --select-type-to-exclude INDEL -R {fasta} -V {vcf} -O {noIndelsVcf}; fi");
    commands.Add("if [ ! -f " + WrapperUtility.ConvertWindowsPath(FilteredHaplotypeCallerVcfPath + ".idx") + " ]; then " + Gatk(Workers) + " IndexFeatureFile -F " + noIndelsVcf + "; fi");

    // Disabled step, kept for reference:
    // filter variants (RNA-Seq specific params... need to check out recommendations before using DNA-Seq)
    //"if [ ! -f " + WrapperUtility.ConvertWindowsPath(newVcf) + " ] || [ " + " ! -s " + WrapperUtility.ConvertWindowsPath(newVcf) + " ]; then " +
    //    Gatk() +
    //    " -T VariantFiltration" +
    //    " -nct " + threads.ToString() +
    //    " -R " + WrapperUtility.ConvertWindowsPath(genomeFasta) +
    //    " -V " + WrapperUtility.ConvertWindowsPath(unfliteredVcf) +
    //    " -window 35 -cluster 3" + // filter out clusters of 3 snps within 35 bases (https://software.broadinstitute.org/gatk/documentation/topic?name=methods)
    //    " -filterName FS -filter \"FS > 30.0\"" +
    //    " -filterName QD -filter \"QD < 2.0\"" +
    //    " -o " + WrapperUtility.ConvertWindowsPath(newVcf) +
    //    "; fi",

    return commands;
}
/// <summary>
/// Builds the commands that split and trim splice-junction reads with GATK SplitNCigarReads.
/// Splice junctions are represented in the CIGAR by runs of N, and HaplotypeCaller requires such
/// reads to be split in the BAM file.
///
/// Running several of these at once is tempting because the tool is not well parallelized, but it
/// is not worth it: it uses quite a bit of RAM and is heavy on I/O while first reading the BAM files.
/// Could possibly do 4 at a time on 128 GB RAM and 28 processors.
///
/// Sets <c>SplitTrimBamPath</c> to the ".fixedQuals.split.bam" file next to <paramref name="dedupedBam"/>.
/// </summary>
/// <param name="spritzDirectory">Directory handed to WrapperUtility to produce the change-to-tools-directory command.</param>
/// <param name="genomeFasta">Reference genome passed to GATK with -R.</param>
/// <param name="dedupedBam">Input BAM; also the fallback SplitNCigarReads input when the quality fix exits with code 2.</param>
/// <returns>The shell commands, one per list element.</returns>
public List<string> SplitNCigarReads(string spritzDirectory, string genomeFasta, string dedupedBam)
{
    string fixedQualsBam = Path.Combine(
        Path.GetDirectoryName(dedupedBam),
        Path.GetFileNameWithoutExtension(dedupedBam) + ".fixedQuals.bam");
    SplitTrimBamPath = Path.Combine(
        Path.GetDirectoryName(fixedQualsBam),
        Path.GetFileNameWithoutExtension(fixedQualsBam) + ".split.bam");

    string fasta = WrapperUtility.ConvertWindowsPath(genomeFasta);
    string deduped = WrapperUtility.ConvertWindowsPath(dedupedBam);
    string fixedQuals = WrapperUtility.ConvertWindowsPath(fixedQualsBam);
    string splitOutput = WrapperUtility.ConvertWindowsPath(SplitTrimBamPath);

    // This also filters malformed reads
    string fixQualsCommand = $"{Gatk(Workers)} FixMisencodedBaseQualityReads -I {deduped} -O {fixedQuals}";

    // Options that are intentionally not passed to SplitNCigarReads:
    //   " --num_threads " + threads.ToString()  -- not supported
    //   " -rf ReassignOneMappingQuality"        -- doing this with STAR
    //   " -RMQF 255"
    //   " -RMQT 60"                             -- default mapping quality is 60; required for RNA-Seq aligners
    //   " -U ALLOW_N_CIGAR_READS"
    string splitFixedCommand = $"{Gatk(Workers)} SplitNCigarReads -R {fasta} -I {fixedQuals} -O {splitOutput}";
    string splitOriginalCommand = $"{Gatk(Workers)} SplitNCigarReads -R {fasta} -I {deduped} -O {splitOutput}";

    var commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add(SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta));
    commands.Add(GenomeDictionaryIndexCommand(genomeFasta));

    // split and trim reads (some datasets are probably going to have misencoded quality scores; -fixMisencodedQuals just subtracts 31 from all quality scores if possible...)
    // exit code of 2 means that FixMisencodedBaseQualityReads errored out because the base quality scores were already correctly encoded
    commands.Add(SamtoolsWrapper.IndexBamCommand(dedupedBam));
    commands.Add($"if [[ ( ! -f {fixedQuals} || ! -s {fixedQuals} ) ]]; then");
    commands.Add(" " + fixQualsCommand);
    commands.Add(" if [ $? -ne 2 ]; then");
    commands.Add(" " + splitFixedCommand);      // any exit code other than 2: split the fixed-quality BAM
    commands.Add(" else");
    commands.Add(" " + splitOriginalCommand);   // exit code 2: split the original BAM
    commands.Add(" fi");
    commands.Add("fi");
    commands.Add(SamtoolsWrapper.IndexBamCommand(SplitTrimBamPath));

    return commands;
}