/// <summary>
/// Creates a SnpEff database for a custom gene model, unless the database was already built.
/// </summary>
/// <param name="spritzDirectory">Directory whose Tools/SnpEff subfolder holds the SnpEff data folder and snpEff.config.</param>
/// <param name="analysisDirectory">Directory used to resolve the path of the generated bash script.</param>
/// <param name="genomeFastaPath">Genome FASTA that is copied into SnpEff's data/genomes folder.</param>
/// <param name="referenceProteinFastaPath">Protein FASTA that is copied into the reference folder as protein.fa.</param>
/// <param name="geneModelGtfOrGffPath">Gene model file; an extension ending in "gtf" (any casing) selects -gtf22, anything else -gff3.</param>
/// <returns>Name of the snpEff reference that was generated</returns>
public static string GenerateDatabase(string spritzDirectory, string analysisDirectory, string genomeFastaPath, string referenceProteinFastaPath, string geneModelGtfOrGffPath)
{
    string geneModelExtension = Path.GetExtension(geneModelGtfOrGffPath);

    // NOTE(review): string.GetHashCode() is not guaranteed to be stable across processes or runtimes,
    // so the "already made" check below may never hit on runtimes with randomized string hashing,
    // and the name can contain a minus sign. Left unchanged so existing databases keep their names -- confirm.
    string snpEffReferenceName = geneModelExtension.Substring(1).ToUpperInvariant() + geneModelGtfOrGffPath.GetHashCode().ToString();

    string snpEffDirectory = Path.Combine(spritzDirectory, "Tools", "SnpEff");
    string snpEffReferenceFolderPath = Path.Combine(snpEffDirectory, "data", snpEffReferenceName);
    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseGeneration.bash");

    // Ordinal, case-insensitive: a ".GTF" file is still a GTF file and must not be built as GFF3.
    bool isGtf = geneModelExtension.EndsWith("gtf", System.StringComparison.OrdinalIgnoreCase);
    string geneModelOption = isGtf ? "-gtf22" : "-gff3";

    // if the database is already made, don't remake it
    if (File.Exists(Path.Combine(snpEffReferenceFolderPath, "snpEffectPredictor.bin")))
    {
        return snpEffReferenceName;
    }

    string configPath = WrapperUtility.ConvertWindowsPath(Path.Combine(snpEffDirectory, "snpEff.config"));
    string genomesFolderPath = Path.Combine(snpEffDirectory, "data", "genomes");

    WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "cd SnpEff",

        // create data folder for this reference, and copy the custom gene model (can also copy regulatory annotations)
        // -p: also creates "data" when it is missing and stays quiet when the folder already exists
        "mkdir -p data/" + snpEffReferenceName,
        // the copied file gets a lower-case extension so that it matches the lower-case genes.* name on case-sensitive file systems
        "cp " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath) + " " +
            WrapperUtility.ConvertWindowsPath(Path.Combine(snpEffReferenceFolderPath, "genes" + geneModelExtension.ToLowerInvariant())),
        "cp " + WrapperUtility.ConvertWindowsPath(referenceProteinFastaPath) + " " +
            WrapperUtility.ConvertWindowsPath(Path.Combine(snpEffReferenceFolderPath, "protein.fa")),

        // copy the genome to the genomes folder (-p: the folder is shared between references and usually exists already)
        "mkdir -p " + WrapperUtility.ConvertWindowsPath(genomesFolderPath),
        "cp " + WrapperUtility.ConvertWindowsPath(genomeFastaPath) + " " +
            WrapperUtility.ConvertWindowsPath(Path.Combine(genomesFolderPath, snpEffReferenceName + ".fa")),

        // configure SnpEff for this custom reference
        // note: if different organism is used in the future, this becomes pretty complex... probably would list the organisms from snpEff.config in the GUI
        "echo \"\n# " + snpEffReferenceName + "\" >> " + configPath,
        "echo \"" + snpEffReferenceName + ".genome : Homo_sapiens\" >> " + configPath,
        "echo \"" + snpEffReferenceName + ".reference : ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/\" >> " + configPath,
        "echo \"\t" + snpEffReferenceName + ".M.codonTable : Vertebrate_Mitochondrial\" >> " + configPath,
        "echo \"\t" + snpEffReferenceName + ".MT.codonTable : Vertebrate_Mitochondrial\" >> " + configPath,

        // build snpEff model
        "cd ..",
        SnpEff(1) + " build " + geneModelOption + " -v " + snpEffReferenceName,
    }).WaitForExit();

    return snpEffReferenceName;
}
/// <summary>
/// Builds the bash commands that annotate a VCF file with SnpEff and records the expected output
/// paths (annotated VCF, HTML report, gene summary, protein FASTA and protein XML) on this instance.
/// </summary>
/// <param name="spritzDirectory">Directory whose Tools/SnpEff/data folder is searched for the database.</param>
/// <param name="reference">Prefix (case-insensitive) of the SnpEff database folder name to use.</param>
/// <param name="inputVcfPath">VCF file to annotate; all output paths are derived from it.</param>
/// <param name="fromReference">When true, the input VCF and the output redirection are left off the SnpEff command.</param>
/// <returns>The commands to run, or an empty list when a non-empty annotated VCF already exists.</returns>
public List<string> PrimaryVariantAnnotation(string spritzDirectory, string reference, string inputVcfPath, bool fromReference = false)
{
    string vcfFolder = Path.GetDirectoryName(inputVcfPath);
    string annotatedPrefix = Path.Combine(vcfFolder, Path.GetFileNameWithoutExtension(inputVcfPath)) + ".snpEffAnnotated";
    AnnotatedVcfPath = annotatedPrefix + ".vcf";
    HtmlReportPath = annotatedPrefix + ".html";
    AnnotatedGenesSummaryPath = annotatedPrefix + ".genes.txt";
    VariantProteinFastaPath = annotatedPrefix + ".protein.fasta";
    VariantProteinXmlPath = annotatedPrefix + ".protein.xml";

    string dataFolder = Path.Combine(spritzDirectory, "Tools", "SnpEff", "data");
    Directory.CreateDirectory(dataFolder);
    string[] databaseFolders = Directory.GetDirectories(dataFolder);

    // nothing to do when a previous run already produced a non-empty annotated VCF
    FileInfo annotatedVcf = new FileInfo(AnnotatedVcfPath);
    if (annotatedVcf.Exists && annotatedVcf.Length > 0)
    {
        return new List<string>();
    }

    string annotatedVcfBash = WrapperUtility.ConvertWindowsPath(AnnotatedVcfPath);
    string proteinFastaBash = WrapperUtility.ConvertWindowsPath(VariantProteinFastaPath);
    string proteinXmlBash = WrapperUtility.ConvertWindowsPath(VariantProteinXmlPath);

    // first database folder whose name starts with the requested reference; the name is empty when none matches
    string databaseFolder = databaseFolders.FirstOrDefault(x => Path.GetFileName(x).StartsWith(reference, true, null));
    string databaseName = Path.GetFileName(databaseFolder);

    string inputAndOutput = "";
    if (!fromReference)
    {
        inputAndOutput = $" {WrapperUtility.ConvertWindowsPath(inputVcfPath)} > {annotatedVcfBash}";
    }

    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add(
        SnpEff(Workers) + " -v -stats " + WrapperUtility.ConvertWindowsPath(HtmlReportPath) +
        " -fastaProt " + proteinFastaBash +
        " -xmlProt " + proteinXmlBash +
        " " + databaseName +
        inputAndOutput);

    // ensure that the files get closed before continuing
    commands.Add(WrapperUtility.EnsureClosedFileCommands(annotatedVcfBash));
    commands.Add(WrapperUtility.EnsureClosedFileCommands(proteinFastaBash));
    commands.Add(WrapperUtility.EnsureClosedFileCommands(proteinXmlBash));

    // remove the annotated VCF file if snpEff didn't work, e.g. if there was no VCF file to annotate
    commands.Add("if [[ ( -f " + annotatedVcfBash + " && ! -s " + annotatedVcfBash + " ) ]]; then");
    commands.Add(" rm " + annotatedVcfBash);
    commands.Add("fi");
    return commands;
}
/// <summary>
/// Aligns reads in fastq files using TopHat2 by generating and running a bash script, and waits for it to finish.
/// </summary>
/// <param name="spritzDirectory">Directory passed to the change-to-tools-directory command.</param>
/// <param name="analysisDirectory">Directory used to resolve the path of the generated TophatRun.bash script.</param>
/// <param name="bowtieIndexPrefix">Bowtie index prefix handed to tophat2.</param>
/// <param name="threads">Value for tophat2's --num-threads option.</param>
/// <param name="fastqPaths">Fastq files to align; they are passed to tophat2 comma-separated, and the first one determines the temporary and output folders.</param>
/// <param name="strandSpecific">When true, adds "--library-type fr-firststrand".</param>
/// <param name="outputDirectory">Receives the TopHat output folder: the first fastq's name without extension plus "TophatOut", next to that fastq.</param>
public static void Align(string spritzDirectory, string analysisDirectory, string bowtieIndexPrefix, int threads, string[] fastqPaths, bool strandSpecific, out string outputDirectory)
{
    string firstFastq = fastqPaths[0];
    string fastqFolder = Path.GetDirectoryName(firstFastq);
    string tempDir = Path.Combine(fastqFolder, "tmpDir");
    outputDirectory = Path.Combine(fastqFolder, Path.GetFileNameWithoutExtension(firstFastq) + "TophatOut");
    Directory.CreateDirectory(tempDir);

    string tempDirBash = WrapperUtility.ConvertWindowsPath(tempDir);
    string libraryTypeOption = strandSpecific ? " --library-type fr-firststrand" : "";
    string commaSeparatedFastqs = string.Join(",", fastqPaths.Select(x => WrapperUtility.ConvertWindowsPath(x)));

    // note: passing the gene model with --GTF is deliberately left out, because it triggers tophat to try building an index
    string tophatCommand =
        $"tophat-2.1.1/tophat2 --num-threads {threads.ToString()}" +
        $" --output-dir {WrapperUtility.ConvertWindowsPath(outputDirectory)}" +
        $" --tmp-dir {tempDirBash}" +
        libraryTypeOption +
        $" {WrapperUtility.ConvertWindowsPath(bowtieIndexPrefix)}" +
        $" {commaSeparatedFastqs}";

    List<string> script = new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        tophatCommand,
        // remove the temporary folder once tophat is done
        $"if [ -d {tempDirBash} ]; then rm -r {tempDirBash}; fi",
    };

    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "TophatRun.bash");
    WrapperUtility.GenerateAndRunScript(scriptPath, script).WaitForExit();
}
/// <summary>
/// Makes sure the list of SnpEff databases has been written to snpEffDatabases.txt and appends
/// configuration entries (including mitochondrial codon tables) for the requested reference to snpEff.config.
/// Does nothing when both the list file and a matching database folder already exist.
/// </summary>
/// <remarks>
/// See here for how to generate databases from scratch:
/// http://lab.loman.net/2012/11/16/how-to-get-snpeff-working-with-bacterial-genomes-from-ncbi/
/// </remarks>
/// <param name="spritzDirectory">Directory that receives snpEffDatabases.txt and whose Tools/SnpEff folder holds the data folder and snpEff.config.</param>
/// <param name="analysisDirectory">Directory used to resolve the paths of the generated bash scripts.</param>
/// <param name="reference">Prefix (case-insensitive) of the SnpEff database name to look up.</param>
/// <exception cref="System.ArgumentException">No entry of the downloaded database list starts with <paramref name="reference"/>.</exception>
public void DownloadSnpEffDatabase(string spritzDirectory, string analysisDirectory, string reference)
{
    DatabaseListPath = Path.Combine(spritzDirectory, "snpEffDatabases.txt");

    // check for existing list and database
    bool databaseListExists = File.Exists(DatabaseListPath);
    string databaseDirectory = Path.Combine(spritzDirectory, "Tools", "SnpEff", "data");
    string[] existingDatabases = Directory.Exists(databaseDirectory) ? Directory.GetDirectories(databaseDirectory) : new string[0];
    bool databaseExists = existingDatabases.Any(d => Path.GetFileName(d).StartsWith(reference, true, null));
    if (databaseListExists && databaseExists)
    {
        return;
    }

    // download database list
    string databaseListBashPath = WrapperUtility.ConvertWindowsPath(DatabaseListPath);
    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseDownloadList.bash");
    WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "echo \"Downloading list of SnpEff references\"",
        SnpEff(Workers) + " databases > " + databaseListBashPath,
        // converted path, like every other EnsureClosedFileCommands call: the command runs inside bash
        WrapperUtility.EnsureClosedFileCommands(databaseListBashPath)
    }).WaitForExit();

    // the database name is the first tab-separated column of each line
    List<string> databases = File.ReadLines(DatabaseListPath)
        .Select(line => line.Split('\t')[0].TrimEnd())
        .ToList();

    string snpeffReference = databases.FirstOrDefault(d => d.StartsWith(reference, true, CultureInfo.InvariantCulture));
    if (snpeffReference == null)
    {
        // fail with a clear message instead of a NullReferenceException while building the script below
        throw new System.ArgumentException(
            "DownloadSnpEffDatabase exception: no SnpEff database starting with \"" + reference + "\" was found in " + DatabaseListPath,
            nameof(reference));
    }

    // download database (it downloads automatically now, with more feedback), but still need the mitochondrial references
    string configPath = WrapperUtility.ConvertWindowsPath(Path.Combine(spritzDirectory, "Tools", "SnpEff", "snpEff.config"));
    scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseDownload.bash");
    WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "echo \"\n# " + snpeffReference + "\" >> " + configPath,
        "echo \"" + snpeffReference + ".genome : Human genome " + snpeffReference.Split('.')[0] + " using RefSeq transcripts\" >> " + configPath,
        "echo \"" + snpeffReference + ".reference : ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/\" >> " + configPath,
        "echo \"\t" + snpeffReference + ".M.codonTable : Vertebrate_Mitochondrial\" >> " + configPath,
        "echo \"\t" + snpeffReference + ".MT.codonTable : Vertebrate_Mitochondrial\" >> " + configPath,
    }).WaitForExit();
}
/// <summary>
/// Builds the bash commands that create a base recalibration table (GATK BaseRecalibrator) and apply it
/// (GATK ApplyBQSR). Sets RecalibrationTablePath and RecalibratedBamPath next to the input BAM.
/// </summary>
/// <param name="spritzDirectory">Directory passed to the change-to-tools-directory command.</param>
/// <param name="analysisDirectory">Not used by this method.</param>
/// <param name="genomeFasta">Reference genome FASTA; index and dictionary commands for it are emitted first.</param>
/// <param name="bam">BAM file to recalibrate.</param>
/// <param name="knownSitesVcf">Known variant sites VCF; "--known-sites" is only added when this is not the empty string.</param>
/// <returns>The commands to run; each GATK step is skipped when its output already exists and is non-empty.</returns>
public List<string> BaseRecalibration(string spritzDirectory, string analysisDirectory, string genomeFasta, string bam, string knownSitesVcf)
{
    string bamFolder = Path.GetDirectoryName(bam);
    string bamName = Path.GetFileNameWithoutExtension(bam);
    RecalibrationTablePath = Path.Combine(bamFolder, bamName + ".recaltable");
    RecalibratedBamPath = Path.Combine(bamFolder, bamName + ".recal.bam");

    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add(SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta));
    commands.Add(GenomeDictionaryIndexCommand(genomeFasta));

    string knownSitesBash = WrapperUtility.ConvertWindowsPath(knownSitesVcf);
    string genomeBash = WrapperUtility.ConvertWindowsPath(genomeFasta);
    string bamBash = WrapperUtility.ConvertWindowsPath(bam);
    string tableBash = WrapperUtility.ConvertWindowsPath(RecalibrationTablePath);
    string recalibratedBash = WrapperUtility.ConvertWindowsPath(RecalibratedBamPath);

    // check that reference VCF is indexed
    // NOTE(review): this check is emitted even when knownSitesVcf is empty, while --known-sites below is conditional -- confirm that is intended
    commands.Add($"if [ ! -f {knownSitesBash}.idx ]; then {Gatk(Workers)} IndexFeatureFile -F {knownSitesBash}; fi");

    // create the recalibration table unless a non-empty one is already there
    string knownSitesOption = knownSitesVcf != "" ? " --known-sites " + knownSitesBash : "";
    commands.Add(
        $"if [[ ( ! -f {tableBash} || ! -s {tableBash} ) ]]; then " +
        $"{Gatk(Workers)} BaseRecalibrator -R {genomeBash} -I {bamBash}" +
        knownSitesOption +
        $" -O {tableBash}; fi");

    // apply the table unless a non-empty recalibrated BAM is already there
    commands.Add(
        $"if [[ ( ! -f {recalibratedBash} || ! -s {recalibratedBash} ) ]]; then " +
        $"{Gatk(Workers)} ApplyBQSR -R {genomeBash} -I {bamBash}" +
        $" --bqsr-recal-file {tableBash} -O {recalibratedBash}; fi");

    commands.Add(SamtoolsWrapper.IndexBamCommand(RecalibratedBamPath));
    return commands;
}
/// <summary>
/// Builds the bash commands that align reads with bowtie2 and pipe the result through
/// "samtools view -b" into a sorted BAM file. The alignment is skipped when a non-empty sorted BAM already exists.
/// </summary>
/// <param name="spritzDirectory">Directory passed to the change-to-tools-directory command.</param>
/// <param name="analysisDirectory">Not used by this method.</param>
/// <param name="bowtieIndexPrefix">Bowtie2 index prefix (-x).</param>
/// <param name="threads">Thread count for bowtie2 (-p) and for the BAM sort.</param>
/// <param name="fastqPaths">One fastq (passed with -U) or otherwise the first two entries as mates (-1 / -2).</param>
/// <param name="strandSpecific">Not used by this method.</param>
/// <param name="sortedBamFilePath">Receives the output path: the first fastq's path without extension plus ".sorted.bam".</param>
/// <returns>The commands to run.</returns>
public static List<string> Bowtie2Align(string spritzDirectory, string analysisDirectory, string bowtieIndexPrefix, int threads, string[] fastqPaths, bool strandSpecific, out string sortedBamFilePath)
{
    string firstFastq = fastqPaths[0];
    sortedBamFilePath = Path.Combine(Path.GetDirectoryName(firstFastq), Path.GetFileNameWithoutExtension(firstFastq)) + ".sorted.bam";
    string sortedBamBash = WrapperUtility.ConvertWindowsPath(sortedBamFilePath);

    // single-end reads use -U, otherwise the first two files are treated as mates
    string readsOption;
    if (fastqPaths.Length == 1)
    {
        readsOption = " -U " + WrapperUtility.ConvertWindowsPath(firstFastq);
    }
    else
    {
        readsOption = " -1 " + WrapperUtility.ConvertWindowsPath(firstFastq) + " -2 " + WrapperUtility.ConvertWindowsPath(fastqPaths[1]);
    }

    string alignAndSort =
        " bowtie2-2.3.4/bowtie2 " +
        " -x " + WrapperUtility.ConvertWindowsPath(bowtieIndexPrefix) +
        " -p " + threads.ToString() +
        readsOption +
        " | samtools view -b - " +
        " | " + SamtoolsWrapper.SortBamFromStdin(sortedBamFilePath, threads);

    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add($"if [[ ( ! -f {sortedBamBash} || ! -s {sortedBamBash} ) ]]; then");
    commands.Add(alignAndSort);
    commands.Add("fi");
    return commands;
}
/// <summary>
/// Builds the bash commands that combine several GVCF files (GATK CombineGVCFs), genotype the combined
/// file (GenotypeGVCFs) and filter out indels (SelectVariants). Sets HaplotypeCallerGvcfPath,
/// HaplotypeCallerVcfPath and FilteredHaplotypeCallerVcfPath, all in the folder of the first GVCF.
/// </summary>
/// <param name="spritzDirectory">Directory passed to the change-to-tools-directory command.</param>
/// <param name="genomeFasta">Reference genome FASTA; index and dictionary commands for it are emitted first.</param>
/// <param name="gvcfPaths">At least two GVCF files to combine.</param>
/// <returns>The commands to run; each GATK step is skipped when its own output already exists and is non-empty.</returns>
/// <exception cref="ArgumentException"><paramref name="gvcfPaths"/> is null or has fewer than two entries.</exception>
public List<string> CombineAndGenotypeGvcfs(string spritzDirectory, string genomeFasta, List<string> gvcfPaths)
{
    if (gvcfPaths == null || gvcfPaths.Count <= 1)
    {
        throw new ArgumentException("CombineAndGenotypeGvcfs exception: no gvcfs were specified to combine");
    }

    // NOTE(review): string.GetHashCode() is not guaranteed to be stable across processes or runtimes, and XOR
    // cancels out repeated paths, so this suffix may differ between runs for the same inputs -- confirm that is acceptable.
    int uniqueSuffix = 1;
    foreach (string f in gvcfPaths)
    {
        uniqueSuffix = uniqueSuffix ^ f.GetHashCode();
    }

    HaplotypeCallerGvcfPath = Path.Combine(Path.GetDirectoryName(gvcfPaths.First()), $"combined{uniqueSuffix}.g.vcf.gz");
    HaplotypeCallerVcfPath = Path.Combine(Path.GetDirectoryName(HaplotypeCallerGvcfPath), $"{Path.GetFileNameWithoutExtension(Path.GetFileNameWithoutExtension(HaplotypeCallerGvcfPath))}.gt.vcf");
    FilteredHaplotypeCallerVcfPath = Path.Combine(Path.GetDirectoryName(HaplotypeCallerGvcfPath), $"{Path.GetFileNameWithoutExtension(HaplotypeCallerGvcfPath)}.NoIndels.vcf");

    string genomeBash = WrapperUtility.ConvertWindowsPath(genomeFasta);
    string combinedGvcfBash = WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath);
    string genotypedVcfBash = WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath);
    string filteredVcfBash = WrapperUtility.ConvertWindowsPath(FilteredHaplotypeCallerVcfPath);

    // Index file expected next to a variant file: ".tbi" for compressed (.gz) files, which is also what
    // VariantCalling checks for its .g.vcf.gz output, and ".idx" otherwise. Checking ".idx" for a .gz file
    // never finds the index, so the file would be re-indexed on every run.
    Func<string, string> indexCheckCommand = path =>
    {
        string bashPath = WrapperUtility.ConvertWindowsPath(path);
        string indexSuffix = path.EndsWith(".gz", StringComparison.OrdinalIgnoreCase) ? ".tbi" : ".idx";
        return $"if [ ! -f {bashPath}{indexSuffix} ]; then {Gatk(Workers)} IndexFeatureFile -F {bashPath}; fi";
    };

    List<string> commands = new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
        GenomeDictionaryIndexCommand(genomeFasta)
    };

    // double check that each input gvcf file is indexed
    foreach (string gvcf in gvcfPaths)
    {
        commands.Add(indexCheckCommand(gvcf));
    }

    // combine GVCFs
    string combineCommand =
        "if [ ! -f " + combinedGvcfBash + " ] || [ " + " ! -s " + combinedGvcfBash + " ]; then " +
        Gatk(Workers) +
        " CombineGVCFs" +
        " -R " + genomeBash +
        " -V " + string.Join(" -V ", gvcfPaths.Select(gvcf => WrapperUtility.ConvertWindowsPath(gvcf))) +
        " -O " + combinedGvcfBash +
        "; fi";
    commands.Add(combineCommand);
    commands.Add(indexCheckCommand(HaplotypeCallerGvcfPath));

    // genotype the gvcf file into a traditional vcf file
    string genotypeCommand =
        "if [ ! -f " + genotypedVcfBash + " ] || [ " + " ! -s " + genotypedVcfBash + " ]; then " +
        Gatk(Workers) +
        " GenotypeGVCFs" +
        " -R " + genomeBash +
        " -V " + combinedGvcfBash +
        " -O " + genotypedVcfBash +
        "; fi";
    commands.Add(genotypeCommand);

    // filter out indels; the guard must test the FILTERED output. Testing the genotyped VCF, which the previous
    // step has just created, would skip this step every time.
    string filterIndelsCommand =
        "if [ ! -f " + filteredVcfBash + " ] || [ " + " ! -s " + filteredVcfBash + " ]; then " +
        Gatk(Workers) +
        " SelectVariants" +
        " --select-type-to-exclude INDEL" +
        " -R " + genomeBash +
        " -V " + genotypedVcfBash +
        " -O " + filteredVcfBash +
        "; fi";
    commands.Add(filterIndelsCommand);

    return commands;
}
/// <summary>
/// Builds the bash commands that add a read group to a BAM file while coordinate-sorting it
/// (GATK AddOrReplaceReadGroups) and then mark duplicates (GATK MarkDuplicates).
/// Only one read group is used, so the grouping is more of a formality.
/// Sets PreparedBamPath to the duplicate-marked BAM.
/// </summary>
/// <param name="spritzDirectory">Directory passed to the change-to-tools-directory command; a "tmp" folder is created inside it.</param>
/// <param name="analysisDirectory">Not used by this method.</param>
/// <param name="threads">Not used by this method.</param>
/// <param name="bam">Input BAM file; all intermediate and output files are placed next to it.</param>
/// <param name="genomeFasta">Reference genome FASTA; index and dictionary commands for it are emitted first.</param>
/// <param name="reference">Not used by this method.</param>
/// <returns>The commands to run.</returns>
public List<string> PrepareBamAndFasta(string spritzDirectory, string analysisDirectory, int threads, string bam, string genomeFasta, string reference)
{
    string bamFolder = Path.GetDirectoryName(bam);
    string bamName = Path.GetFileNameWithoutExtension(bam);

    // files that capture the "sorted" and "read group" header lines of the input BAM
    string sortedCheckPath = Path.Combine(bamFolder, bamName + ".headerSorted");
    string readGroupedCheckfile = Path.Combine(bamFolder, bamName + ".headerReadGrouped");

    // output names are chained: <bam>.sorted.grouped.bam -> <bam>.sorted.grouped.marked.bam
    string sortedBam = Path.Combine(bamFolder, bamName + ".sorted.bam");
    string groupedBamPath = Path.Combine(Path.GetDirectoryName(sortedBam), Path.GetFileNameWithoutExtension(sortedBam) + ".grouped.bam");
    string groupedFolder = Path.GetDirectoryName(groupedBamPath);
    string groupedName = Path.GetFileNameWithoutExtension(groupedBamPath);
    string markedDuplicatesBamPath = Path.Combine(groupedFolder, groupedName + ".marked.bam");
    string markedDuplicateMetrics = Path.Combine(groupedFolder, groupedName + ".marked.metrics");

    string tmpDir = Path.Combine(spritzDirectory, "tmp");
    Directory.CreateDirectory(tmpDir);

    string bamBash = WrapperUtility.ConvertWindowsPath(bam);
    string groupedBash = WrapperUtility.ConvertWindowsPath(groupedBamPath);
    string markedBash = WrapperUtility.ConvertWindowsPath(markedDuplicatesBamPath);
    string tmpBash = WrapperUtility.ConvertWindowsPath(tmpDir);

    // bash tests that are true when the respective file is missing or empty
    string groupedMissingOrEmpty = $"( ! -f {groupedBash} || ! -s {groupedBash} )";
    string markedMissingOrEmpty = $"( ! -f {markedBash} || ! -s {markedBash} )";

    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add(SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta));
    commands.Add(GenomeDictionaryIndexCommand(genomeFasta));
    commands.Add($"samtools view -H {bamBash} | grep SO:coordinate > {WrapperUtility.ConvertWindowsPath(sortedCheckPath)}");
    commands.Add($"samtools view -H {bamBash} | grep '^@RG' > {WrapperUtility.ConvertWindowsPath(readGroupedCheckfile)}");

    // group and sort (note, using picard-tools works, but picard.jar somehow is truncating the BAM files);
    // only done when neither the grouped nor the marked BAM is available yet
    commands.Add(
        $"if [[ {groupedMissingOrEmpty} &&  {markedMissingOrEmpty} ]]; then " +
        Gatk(Workers, 2) +
        " AddOrReplaceReadGroups -PU platform -PL illumina -SM sample -LB library" +
        $" -I {bamBash} -O {groupedBash} -SO coordinate --TMP_DIR {tmpBash}; fi");
    commands.Add(SamtoolsWrapper.IndexBamCommand(groupedBamPath));

    // mark duplicates (AS means assume sorted; note, using picard-tools works, but picard.jar somehow is truncating the BAM files)
    commands.Add(
        $"if [[ {markedMissingOrEmpty} ]]; then " +
        Gatk(Workers) +
        " MarkDuplicates" + // formerly picard
        $" -I {groupedBash} -O {markedBash} -M {WrapperUtility.ConvertWindowsPath(markedDuplicateMetrics)}" +
        $" --TMP_DIR {tmpBash} -AS true; fi");
    commands.Add(SamtoolsWrapper.IndexBamCommand(markedDuplicatesBamPath));

    // clean up
    // NOTE(review): this removes the grouped BAM when the marked BAM is MISSING or EMPTY, i.e. after a failed
    // MarkDuplicates; if the intent was to drop the intermediate file after success, the test is inverted -- confirm
    commands.Add($"if [[ {markedMissingOrEmpty} ]]; then rm {groupedBash}; fi");

    PreparedBamPath = markedDuplicatesBamPath;
    return commands;
}
/// <summary>
/// Builds the bash commands that call variants on a single BAM file with GATK HaplotypeCaller (GVCF mode),
/// genotype the GVCF into a regular VCF and filter out indels, indexing each output along the way.
/// Sets HaplotypeCallerGvcfPath, HaplotypeCallerVcfPath and FilteredHaplotypeCallerVcfPath next to the BAM.
/// </summary>
/// <param name="spritzDirectory">Directory passed to the change-to-tools-directory command.</param>
/// <param name="experimentType">For RNASequencing, "--dont-use-soft-clipped-bases true" is added to HaplotypeCaller.</param>
/// <param name="threads">Value for HaplotypeCaller's --native-pair-hmm-threads option.</param>
/// <param name="genomeFasta">Reference genome FASTA; index and dictionary commands for it are emitted first.</param>
/// <param name="splitTrimBam">BAM file to call variants on.</param>
/// <param name="dbsnpReferenceVcfPath">dbSNP VCF passed to HaplotypeCaller with --dbsnp.</param>
/// <returns>The commands to run; each GATK step is skipped when its output already exists and is non-empty.</returns>
public List<string> VariantCalling(string spritzDirectory, ExperimentType experimentType, int threads, string genomeFasta, string splitTrimBam, string dbsnpReferenceVcfPath)
{
    string bamFolder = Path.GetDirectoryName(splitTrimBam);
    string bamName = Path.GetFileNameWithoutExtension(splitTrimBam);
    HaplotypeCallerGvcfPath = Path.Combine(bamFolder, bamName + ".g.vcf.gz");
    HaplotypeCallerVcfPath = Path.Combine(bamFolder, bamName + ".g.gt.vcf");
    FilteredHaplotypeCallerVcfPath = Path.Combine(bamFolder, bamName + ".g.gt.NoIndels.vcf");

    // constructed as in the original implementation, although the instance is not used further in this method
    var vcfToolsWrapper = new VcfToolsWrapper();

    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add(SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta));
    commands.Add(GenomeDictionaryIndexCommand(genomeFasta));

    string genomeBash = WrapperUtility.ConvertWindowsPath(genomeFasta);
    string dbsnpBash = WrapperUtility.ConvertWindowsPath(dbsnpReferenceVcfPath);
    string gvcfBash = WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath);
    string vcfBash = WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath);
    string filteredBash = WrapperUtility.ConvertWindowsPath(FilteredHaplotypeCallerVcfPath);

    // check that reference VCF is indexed
    commands.Add($"if [ ! -f {dbsnpBash}.idx ]; then {Gatk(Workers)} IndexFeatureFile -F {dbsnpBash}; fi");

    // call variants
    string softClipOption = "";
    if (experimentType == ExperimentType.RNASequencing)
    {
        softClipOption = " --dont-use-soft-clipped-bases true";
    }
    commands.Add(
        $"if [ ! -f {gvcfBash} ] || [  ! -s {gvcfBash} ]; then " +
        Gatk(Workers, 2) +
        " HaplotypeCaller" +
        $" --native-pair-hmm-threads {threads.ToString()}" +
        $" -R {genomeBash}" +
        $" -I {WrapperUtility.ConvertWindowsPath(splitTrimBam)}" +
        " --min-base-quality-score 20" +
        softClipOption +
        $" --dbsnp {dbsnpBash}" +
        $" -O {gvcfBash}" +
        " -ERC GVCF" + // this prompts phasing!
        " --max-mnp-distance 3" + // note: this can't be used for joint genotyping here, but this setting is available in mutect2 for doing tumor vs normal calls
        "; fi");

    // index compressed gvcf file
    commands.Add(
        "if [ ! -f " + WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath + ".tbi") + " ]; then " +
        Gatk(Workers) + " IndexFeatureFile -F " + gvcfBash + "; fi");

    // genotype the gvcf file into a traditional vcf file
    commands.Add(
        $"if [ ! -f {vcfBash} ] || [  ! -s {vcfBash} ]; then " +
        Gatk(Workers, 2) +
        $" GenotypeGVCFs -R {genomeBash} -V {gvcfBash} -O {vcfBash}; fi");
    commands.Add(
        "if [ ! -f " + WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath + ".idx") + " ]; then " +
        Gatk(Workers) + " IndexFeatureFile -F " + vcfBash + "; fi");

    // filter out indels
    commands.Add(
        $"if [ ! -f {filteredBash} ] || [  ! -s {filteredBash} ]; then " +
        Gatk(Workers, 2) +
        $" SelectVariants --select-type-to-exclude INDEL -R {genomeBash} -V {vcfBash} -O {filteredBash}; fi");
    commands.Add(
        "if [ ! -f " + WrapperUtility.ConvertWindowsPath(FilteredHaplotypeCallerVcfPath + ".idx") + " ]; then " +
        Gatk(Workers) + " IndexFeatureFile -F " + filteredBash + "; fi");

    // No VariantFiltration step is run here. An earlier, disabled draft used RNA-Seq specific settings
    // (-window 35 -cluster 3 to drop clusters of 3 SNPs within 35 bases, plus the filters "FS > 30.0" and
    // "QD < 2.0"); recommendations need to be checked before using it for DNA-Seq.
    return commands;
}
/// <summary>
/// Builds the bash commands that split and trim splice junction reads with GATK SplitNCigarReads.
/// Cigars describe genomic intervals, and splice junctions are represented by runs of N's (unknown nucleotide);
/// HaplotypeCaller requires splitting such reads in the BAM file.
///
/// It is tempting to run a few of these at the same time because the tool is not well parallelized, but it is
/// not worth it: it uses quite a bit of RAM and strains the I/O at the beginning when reading the BAM files.
/// Could possibly do 4 at a time on 128 GB RAM and 28 processors.
///
/// Sets SplitTrimBamPath to the ".fixedQuals.split.bam" file next to the input BAM.
/// </summary>
/// <param name="spritzDirectory">Directory passed to the change-to-tools-directory command.</param>
/// <param name="genomeFasta">Reference genome FASTA; index and dictionary commands for it are emitted first.</param>
/// <param name="dedupedBam">BAM file to process.</param>
/// <returns>The commands to run.</returns>
public List<string> SplitNCigarReads(string spritzDirectory, string genomeFasta, string dedupedBam)
{
    string bamFolder = Path.GetDirectoryName(dedupedBam);
    string fixedQualsBam = Path.Combine(bamFolder, Path.GetFileNameWithoutExtension(dedupedBam) + ".fixedQuals.bam");
    SplitTrimBamPath = Path.Combine(Path.GetDirectoryName(fixedQualsBam), Path.GetFileNameWithoutExtension(fixedQualsBam) + ".split.bam");

    string dedupedBash = WrapperUtility.ConvertWindowsPath(dedupedBam);
    string fixedQualsBash = WrapperUtility.ConvertWindowsPath(fixedQualsBam);

    // This also filters malformed reads
    string fixMisencodedQualsCmd = $"{Gatk(Workers)} FixMisencodedBaseQualityReads -I {dedupedBash} -O {fixedQualsBash}";

    // The split command is the same for both inputs; only the -I argument differs.
    // Options considered but not used: --num_threads (not supported), -rf ReassignOneMappingQuality with
    // -RMQF 255 / -RMQT 60 (done with STAR; default mapping quality is 60, required for RNA-Seq aligners),
    // and -U ALLOW_N_CIGAR_READS.
    System.Func<string, string> splitCommandFor = inputBash =>
        Gatk(Workers) +
        " SplitNCigarReads" +
        " -R " + WrapperUtility.ConvertWindowsPath(genomeFasta) +
        " -I " + inputBash +
        " -O " + WrapperUtility.ConvertWindowsPath(SplitTrimBamPath);
    string splitFixedQualsCmd = splitCommandFor(fixedQualsBash);
    string splitOriginalCmd = splitCommandFor(dedupedBash);

    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add(SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta));
    commands.Add(GenomeDictionaryIndexCommand(genomeFasta));

    // split and trim reads (some datasets are probably going to have misencoded quality scores; fixing them just subtracts 31 from all quality scores if possible...)
    // exit code of 2 means that FixMisencodedBaseQualityReads errored out because the base quality scores were already correctly encoded;
    // in that case the original BAM is split instead of the fixed one
    commands.Add(SamtoolsWrapper.IndexBamCommand(dedupedBam));
    commands.Add($"if [[ ( ! -f {fixedQualsBash} || ! -s {fixedQualsBash} ) ]]; then");
    commands.Add(" " + fixMisencodedQualsCmd);
    commands.Add(" if [ $? -ne 2 ]; then");
    commands.Add(" " + splitFixedQualsCmd);
    commands.Add(" else");
    commands.Add(" " + splitOriginalCmd);
    commands.Add(" fi");
    commands.Add("fi");
    commands.Add(SamtoolsWrapper.IndexBamCommand(SplitTrimBamPath));
    return commands;
}
/// <summary>
/// Obtains the UCSC known variant sites via DownloadUCSCKnownVariantSites, converts them with
/// ConvertVCFChromosomesUCSC2Ensembl and, unless this is a dry run, runs a script that indexes
/// both VCF files with GATK IndexFeatureFile when their .idx files are missing.
/// </summary>
/// <param name="spritzDirectory">Directory passed through to the download/conversion steps and used to place the indexing script.</param>
/// <param name="commonOnly">Passed through to DownloadUCSCKnownVariantSites.</param>
/// <param name="reference">Passed through to the download and conversion steps.</param>
/// <param name="dryRun">When true, no indexing script is run.</param>
/// <returns>The value of EnsemblKnownSitesPath.</returns>
public string DownloadEnsemblKnownVariantSites(string spritzDirectory, bool commonOnly, string reference, bool dryRun)
{
    DownloadUCSCKnownVariantSites(spritzDirectory, commonOnly, reference, dryRun);
    EnsemblKnownSitesPath = ConvertVCFChromosomesUCSC2Ensembl(spritzDirectory, UcscKnownSitesPath, reference, dryRun);

    if (dryRun)
    {
        return EnsemblKnownSitesPath;
    }

    // indexing is used for most GATK tools
    string ucscBash = WrapperUtility.ConvertWindowsPath(UcscKnownSitesPath);
    string ensemblBash = WrapperUtility.ConvertWindowsPath(EnsemblKnownSitesPath);
    List<string> indexingCommands = new List<string>();
    indexingCommands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    indexingCommands.Add($"if [ ! -f {ucscBash}.idx ]; then {Gatk(Workers)} IndexFeatureFile -F {ucscBash}; fi");
    indexingCommands.Add($"if [ ! -f {ensemblBash}.idx ]; then {Gatk(Workers)} IndexFeatureFile -F {ensemblBash}; fi");

    string scriptPath = WrapperUtility.GetAnalysisScriptPath(spritzDirectory, "IndexKnownVariantSites.bash");
    WrapperUtility.GenerateAndRunScript(scriptPath, indexingCommands).WaitForExit();

    return EnsemblKnownSitesPath;
}
/// <summary>
/// Downloads dbSNP reference VCF file if it doesn't exist.
/// The download script fetches TargetFileLocation with wget, decompresses the archive and renames
/// the result to UcscKnownSitesPath.
/// </summary>
/// <param name="spritzDirectory">Directory the script changes into before downloading; also used to place the script.</param>
/// <param name="commonOnly">Passed to KnownVariantSitesFileExists.</param>
/// <param name="reference">Passed to KnownVariantSitesFileExists.</param>
/// <param name="dryRun">When true, nothing is downloaded.</param>
/// <returns>The value of UcscKnownSitesPath.</returns>
public string DownloadUCSCKnownVariantSites(string spritzDirectory, bool commonOnly, string reference, bool dryRun)
{
    // NOTE(review): TargetFileLocation, UcscKnownSitesDownloadPath and UcscKnownSitesPath are presumably
    // set by KnownVariantSitesFileExists -- confirm against its implementation
    bool knownSitesExists = KnownVariantSitesFileExists(spritzDirectory, commonOnly, reference);
    if (!knownSitesExists && !dryRun)
    {
        string downloadedBash = WrapperUtility.ConvertWindowsPath(UcscKnownSitesDownloadPath);
        WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(spritzDirectory, "DownloadUcscVariants.bash"), new List<string>
        {
            "cd " + WrapperUtility.ConvertWindowsPath(spritzDirectory),
            "wget " + TargetFileLocation,
            "gunzip " + downloadedBash + ".gz",
            // gunzip already removes the archive on success, so a plain "rm" would always report an error;
            // -f keeps this a harmless safety net for the case where the archive is still there
            "rm -f " + downloadedBash + ".gz",
            "mv " + downloadedBash + " " + WrapperUtility.ConvertWindowsPath(UcscKnownSitesPath)
        }).WaitForExit();
    }
    return UcscKnownSitesPath;
}
/// <summary>
/// Downloads the fastq files of an SRA accession with fasterq-dump into the analysis directory,
/// unless the "_1.fastq" / "_2.fastq" files of that accession are already there.
/// Sets LogPath and FastqPaths.
/// </summary>
/// <param name="spritzDirectory">Directory passed to the change-to-tools-directory command.</param>
/// <param name="threads">Value for fasterq-dump's --threads option.</param>
/// <param name="analysisDirectory">Directory that receives the fastq files, the download log and the generated script.</param>
/// <param name="sraAccession">SRA accession to download.</param>
public void Fetch(string spritzDirectory, int threads, string analysisDirectory, string sraAccession)
{
    LogPath = Path.Combine(analysisDirectory, sraAccession + "download.log");

    // A raw (untrimmed) fastq file. Only the FILE NAME is inspected, so that a directory whose name happens
    // to contain "trimmed" does not discard every file.
    System.Func<string, bool> isRawFastq = path =>
    {
        if (path == null)
        {
            return false;
        }
        string fileName = Path.GetFileName(path);
        return !fileName.Contains("trimmed") && fileName.EndsWith(".fastq", System.StringComparison.Ordinal);
    };

    FastqPaths = new[] { sraAccession + "_1.fastq", sraAccession + "_2.fastq" }
        .Select(f => Path.Combine(analysisDirectory, f))
        .Where(f => File.Exists(f))
        .ToArray();
    if (FastqPaths.Length > 0) // already downloaded
    {
        FastqPaths = FastqPaths.Where(isRawFastq).ToArray();
        return;
    }

    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Download" + sraAccession + ".bash");
    WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
    {
        $"echo \"Downloading {sraAccession}\"",
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        $"sratoolkit*/bin/fasterq-dump --progress --threads {threads.ToString()} --split-files --outdir \"{WrapperUtility.ConvertWindowsPath(analysisDirectory)}\" {sraAccession} 2> {WrapperUtility.ConvertWindowsPath(LogPath)}",
    }).WaitForExit();

    // Directory.GetFiles does not guarantee any order, and mate order (_1 before _2) matters downstream;
    // also apply the same "raw fastq" filter as the already-downloaded path above.
    FastqPaths = Directory.GetFiles(analysisDirectory, sraAccession + "*.fastq")
        .Where(isRawFastq)
        .OrderBy(f => f, System.StringComparer.Ordinal)
        .ToArray();
}