Beispiel #1
0
        /// <summary>
        /// Creates a snpeff model for a custom gene model, unless its snpEffectPredictor.bin already exists.
        /// </summary>
        /// <remarks>
        /// The generated bash script copies the gene model, protein FASTA and genome FASTA into the SnpEff
        /// data folder, appends entries for the new reference to snpEff.config, and runs "build".
        /// This method waits for the script to exit before returning.
        /// </remarks>
        /// <param name="spritzDirectory">Directory that contains Tools/SnpEff</param>
        /// <param name="analysisDirectory">Directory handed to WrapperUtility.GetAnalysisScriptPath for the generated script</param>
        /// <param name="genomeFastaPath">Genome FASTA; copied to data/genomes/&lt;reference name&gt;.fa</param>
        /// <param name="referenceProteinFastaPath">Protein FASTA; copied to protein.fa in the reference folder</param>
        /// <param name="geneModelGtfOrGffPath">Gene model; an extension ending in "gtf" (any casing) selects -gtf22, anything else -gff3</param>
        /// <returns>Name of the snpEff reference that was generated</returns>
        public static string GenerateDatabase(string spritzDirectory, string analysisDirectory, string genomeFastaPath, string referenceProteinFastaPath, string geneModelGtfOrGffPath)
        {
            string geneModelExtension = Path.GetExtension(geneModelGtfOrGffPath);
            string snpEffDirectory    = Path.Combine(spritzDirectory, "Tools", "SnpEff");

            // NOTE(review): string.GetHashCode() is not guaranteed to be stable across processes or runtimes,
            // so this name (and the "already made" check below) may not match a database built by an earlier run.
            string snpEffReferenceName       = geneModelExtension.Substring(1).ToUpperInvariant() + geneModelGtfOrGffPath.GetHashCode().ToString();
            string snpEffReferenceFolderPath = Path.Combine(snpEffDirectory, "data", snpEffReferenceName);
            string scriptPath                = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseGeneration.bash");

            // ordinal, case-insensitive so that ".GTF" is not silently built as gff3
            string geneModelOption = geneModelExtension.EndsWith("gtf", System.StringComparison.OrdinalIgnoreCase) ? "-gtf22" : "-gff3";

            // if the database is already made, don't remake it
            if (File.Exists(Path.Combine(snpEffReferenceFolderPath, "snpEffectPredictor.bin")))
            {
                return snpEffReferenceName;
            }

            string genomesFolderPath = Path.Combine(snpEffDirectory, "data", "genomes");
            string configPath        = WrapperUtility.ConvertWindowsPath(Path.Combine(snpEffDirectory, "snpEff.config"));

            WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                "cd SnpEff",

                // create data folder for this reference, and copy the custom gene model (can also copy regulatory annotations)
                // -p: also creates a missing "data" folder and does not fail if the folder is already there
                "mkdir -p data/" + snpEffReferenceName,
                "cp " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath) + " " + WrapperUtility.ConvertWindowsPath(Path.Combine(snpEffReferenceFolderPath, "genes" + geneModelExtension)),
                "cp " + WrapperUtility.ConvertWindowsPath(referenceProteinFastaPath) + " " + WrapperUtility.ConvertWindowsPath(Path.Combine(snpEffReferenceFolderPath, "protein.fa")),

                // copy the genome to the genomes folder (-p: the folder is shared, so it may exist from an earlier build)
                "mkdir -p " + WrapperUtility.ConvertWindowsPath(genomesFolderPath),
                "cp " + WrapperUtility.ConvertWindowsPath(genomeFastaPath) + " " + WrapperUtility.ConvertWindowsPath(Path.Combine(genomesFolderPath, snpEffReferenceName + ".fa")),

                // configure SnpEff for this custom reference
                // note: if different organism is used in the future, this becomes pretty complex... probably would list the organisms from snpEff.config in the GUI
                "echo \"\n# " + snpEffReferenceName + "\" >> " + configPath,
                "echo \"" + snpEffReferenceName + ".genome : Homo_sapiens\" >> " + configPath,
                "echo \"" + snpEffReferenceName + ".reference : ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/\" >> " + configPath,
                "echo \"\t" + snpEffReferenceName + ".M.codonTable : Vertebrate_Mitochondrial\" >> " + configPath,
                "echo \"\t" + snpEffReferenceName + ".MT.codonTable : Vertebrate_Mitochondrial\" >> " + configPath,

                // build snpEff model (from the Tools directory again)
                "cd ..",
                SnpEff(1) + " build " + geneModelOption + " -v " + snpEffReferenceName,
            }).WaitForExit();

            return snpEffReferenceName;
        }
Beispiel #2
0
        /// <summary>
        /// Sets the SnpEff output paths derived from <paramref name="inputVcfPath"/> and builds the bash
        /// commands that annotate it.
        /// </summary>
        /// <param name="spritzDirectory">Directory that contains Tools/SnpEff; Tools/SnpEff/data is created if missing</param>
        /// <param name="reference">Prefix (compared case-insensitively) used to pick the first matching folder under Tools/SnpEff/data as the database name</param>
        /// <param name="inputVcfPath">VCF to annotate; its directory and base name prefix every output path</param>
        /// <param name="fromReference">When true, the input VCF and the redirect into the annotated VCF are left off the SnpEff command</param>
        /// <returns>The commands to run, or an empty list when the annotated VCF already exists and is not empty</returns>
        public List<string> PrimaryVariantAnnotation(string spritzDirectory, string reference, string inputVcfPath, bool fromReference = false)
        {
            string inputDirectory = Path.GetDirectoryName(inputVcfPath);
            string inputBaseName  = Path.GetFileNameWithoutExtension(inputVcfPath);
            string outPrefix      = Path.Combine(inputDirectory, inputBaseName);

            AnnotatedVcfPath          = outPrefix + ".snpEffAnnotated.vcf";
            HtmlReportPath            = outPrefix + ".snpEffAnnotated.html";
            AnnotatedGenesSummaryPath = outPrefix + ".snpEffAnnotated.genes.txt";
            VariantProteinFastaPath   = outPrefix + ".snpEffAnnotated.protein.fasta";
            VariantProteinXmlPath     = outPrefix + ".snpEffAnnotated.protein.xml";

            string dataDirectory = Path.Combine(spritzDirectory, "Tools", "SnpEff", "data");
            Directory.CreateDirectory(dataDirectory);
            string[] databaseDirectories = Directory.GetDirectories(dataDirectory);

            // nothing to do when a non-empty annotated VCF is already on disk
            bool alreadyAnnotated = File.Exists(AnnotatedVcfPath) && new FileInfo(AnnotatedVcfPath).Length > 0;
            if (alreadyAnnotated)
            {
                return new List<string>();
            }

            string bashAnnotatedVcf = WrapperUtility.ConvertWindowsPath(AnnotatedVcfPath);
            string bashHtmlReport   = WrapperUtility.ConvertWindowsPath(HtmlReportPath);
            string bashProteinFasta = WrapperUtility.ConvertWindowsPath(VariantProteinFastaPath);
            string bashProteinXml   = WrapperUtility.ConvertWindowsPath(VariantProteinXmlPath);

            // first database folder whose name starts with the requested reference
            string databaseDirectory = databaseDirectories.FirstOrDefault(dir => Path.GetFileName(dir).StartsWith(reference, true, null));
            string databaseName      = Path.GetFileName(databaseDirectory);

            string vcfArguments = "";
            if (!fromReference)
            {
                vcfArguments = $" {WrapperUtility.ConvertWindowsPath(inputVcfPath)} > {bashAnnotatedVcf}";
            }

            List<string> commands = new List<string>();
            commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
            commands.Add($"{SnpEff(Workers)} -v -stats {bashHtmlReport} -fastaProt {bashProteinFasta} -xmlProt {bashProteinXml} {databaseName}{vcfArguments}");

            // ensure that the files get closed before continuing
            commands.Add(WrapperUtility.EnsureClosedFileCommands(bashAnnotatedVcf));
            commands.Add(WrapperUtility.EnsureClosedFileCommands(bashProteinFasta));
            commands.Add(WrapperUtility.EnsureClosedFileCommands(bashProteinXml));

            // remove the annotated VCF file if snpEff didn't work, e.g. if there was no VCF file to annotate
            commands.Add($"if [[ ( -f {bashAnnotatedVcf} && ! -s {bashAnnotatedVcf} ) ]]; then");
            commands.Add($"  rm {bashAnnotatedVcf}");
            commands.Add("fi");

            return commands;
        }
Beispiel #3
0
        /// <summary>
        /// Aligns reads in fastq files using TopHat2 and waits for the generated script to exit.
        /// </summary>
        /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand</param>
        /// <param name="analysisDirectory">Directory handed to WrapperUtility.GetAnalysisScriptPath for "TophatRun.bash"</param>
        /// <param name="bowtieIndexPrefix">Bowtie index prefix given to tophat2</param>
        /// <param name="threads">Value for --num-threads</param>
        /// <param name="fastqPaths">Fastq files, passed to tophat2 as one comma-separated argument; the first one determines the temporary and output directories</param>
        /// <param name="strandSpecific">When true, adds "--library-type fr-firststrand"</param>
        /// <param name="outputDirectory">Set to "&lt;first fastq name without extension&gt;TophatOut" next to the first fastq file</param>
        public static void Align(string spritzDirectory, string analysisDirectory, string bowtieIndexPrefix, int threads, string[] fastqPaths,
                                 bool strandSpecific, out string outputDirectory)
        {
            string firstFastq     = fastqPaths[0];
            string fastqDirectory = Path.GetDirectoryName(firstFastq);
            string tempDir        = Path.Combine(fastqDirectory, "tmpDir");

            outputDirectory = Path.Combine(fastqDirectory, Path.GetFileNameWithoutExtension(firstFastq) + "TophatOut");
            Directory.CreateDirectory(tempDir);

            string scriptPath  = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "TophatRun.bash");
            string bashTempDir = WrapperUtility.ConvertWindowsPath(tempDir);

            // tophat2 arguments, joined with single spaces below
            // note: no --GTF option is passed; supplying the gene model triggers tophat to try building an index
            List<string> tophatArguments = new List<string>
            {
                "tophat-2.1.1/tophat2",
                "--num-threads " + threads.ToString(),
                "--output-dir " + WrapperUtility.ConvertWindowsPath(outputDirectory),
                "--tmp-dir " + bashTempDir,
            };
            if (strandSpecific)
            {
                tophatArguments.Add("--library-type fr-firststrand");
            }
            tophatArguments.Add(WrapperUtility.ConvertWindowsPath(bowtieIndexPrefix));
            tophatArguments.Add(string.Join(",", fastqPaths.Select(fastq => WrapperUtility.ConvertWindowsPath(fastq))));

            List<string> script = new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                string.Join(" ", tophatArguments),

                // remove the temporary directory again once tophat is done
                $"if [ -d {bashTempDir} ]; then rm -r {bashTempDir}; fi",
            };

            WrapperUtility.GenerateAndRunScript(scriptPath, script).WaitForExit();
        }
Beispiel #4
0
        /// <summary>
        /// Downloads the list of SnpEff databases and appends configuration entries (including the
        /// mitochondrial codon tables) for the first listed database whose name starts with
        /// <paramref name="reference"/>. Returns early when both the list file and a matching database
        /// folder already exist.
        /// </summary>
        /// <remarks>
        /// See here for how to generate databases from scratch:
        /// http://lab.loman.net/2012/11/16/how-to-get-snpeff-working-with-bacterial-genomes-from-ncbi/
        /// </remarks>
        /// <param name="spritzDirectory">Directory that contains Tools/SnpEff; the database list is written to snpEffDatabases.txt in it</param>
        /// <param name="analysisDirectory">Directory handed to WrapperUtility.GetAnalysisScriptPath for the generated scripts</param>
        /// <param name="reference">Case-insensitive prefix of the SnpEff database name</param>
        /// <exception cref="System.ArgumentException">No listed database name starts with <paramref name="reference"/></exception>
        public void DownloadSnpEffDatabase(string spritzDirectory, string analysisDirectory, string reference)
        {
            DatabaseListPath = Path.Combine(spritzDirectory, "snpEffDatabases.txt");

            // check for existing list and database
            bool   databaseListExists = File.Exists(DatabaseListPath);
            string databaseDirectory  = Path.Combine(spritzDirectory, "Tools", "SnpEff", "data");

            string[] existingDatabases = Directory.Exists(databaseDirectory) ? Directory.GetDirectories(databaseDirectory) : new string[0];
            bool     databaseExists    = existingDatabases.Any(d => Path.GetFileName(d).StartsWith(reference, true, null));

            if (databaseListExists && databaseExists)
            {
                return;
            }

            // download database list
            string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseDownloadList.bash");

            WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                "echo \"Downloading list of SnpEff references\"",
                SnpEff(Workers) + " databases > " + WrapperUtility.ConvertWindowsPath(DatabaseListPath),

                // the path ends up in a bash command, so convert it like every other path in the script
                WrapperUtility.EnsureClosedFileCommands(WrapperUtility.ConvertWindowsPath(DatabaseListPath))
            }).WaitForExit();

            // the database name is the first tab-separated column of each line
            string snpeffReference = File.ReadLines(DatabaseListPath)
                .Select(line => line.Split('\t')[0].TrimEnd())
                .FirstOrDefault(d => d.StartsWith(reference, true, CultureInfo.InvariantCulture));

            if (snpeffReference == null)
            {
                // fail with a clear message instead of a NullReferenceException while building the script below
                throw new System.ArgumentException($"No SnpEff database with a name starting with '{reference}' was found in {DatabaseListPath}", nameof(reference));
            }

            string configPath = WrapperUtility.ConvertWindowsPath(Path.Combine(spritzDirectory, "Tools", "SnpEff", "snpEff.config"));

            // download database (it downloads automatically now, with more feedback), but still need the mitochondrial references
            scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseDownload.bash");
            WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                "echo \"\n# " + snpeffReference + "\" >> " + configPath,
                "echo \"" + snpeffReference + ".genome : Human genome " + snpeffReference.Split('.')[0] + " using RefSeq transcripts\" >> " + configPath,
                "echo \"" + snpeffReference + ".reference : ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/\" >> " + configPath,
                "echo \"\t" + snpeffReference + ".M.codonTable : Vertebrate_Mitochondrial\" >> " + configPath,
                "echo \"\t" + snpeffReference + ".MT.codonTable : Vertebrate_Mitochondrial\" >> " + configPath,
            }).WaitForExit();
        }
Beispiel #5
0
        /// <summary>
        /// Creates recalibration table and recalibrates reads.
        /// Sets RecalibrationTablePath and RecalibratedBamPath next to <paramref name="bam"/>.
        /// </summary>
        /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand</param>
        /// <param name="analysisDirectory">Not used by this method</param>
        /// <param name="genomeFasta">Reference genome FASTA (-R)</param>
        /// <param name="bam">Input BAM (-I); the output names are derived from it</param>
        /// <param name="knownSitesVcf">VCF of known sites; when null or empty, neither the index check nor --known-sites is emitted</param>
        /// <returns>Bash commands; each GATK step is skipped by the script when its output already exists and is not empty</returns>
        public List<string> BaseRecalibration(string spritzDirectory, string analysisDirectory, string genomeFasta, string bam, string knownSitesVcf)
        {
            string bamDirectory = Path.GetDirectoryName(bam);
            string bamBaseName  = Path.GetFileNameWithoutExtension(bam);

            RecalibrationTablePath = Path.Combine(bamDirectory, bamBaseName + ".recaltable");
            RecalibratedBamPath    = Path.Combine(bamDirectory, bamBaseName + ".recal.bam");

            bool hasKnownSites = !string.IsNullOrEmpty(knownSitesVcf);

            List<string> commands = new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
                GenomeDictionaryIndexCommand(genomeFasta),
            };

            // check that reference VCF is indexed; without a known-sites file this would be an
            // IndexFeatureFile call with no input, so it is only emitted when a file was given
            if (hasKnownSites)
            {
                commands.Add(
                    "if [ ! -f " + WrapperUtility.ConvertWindowsPath(knownSitesVcf) + ".idx ]; then " +
                    Gatk(Workers) + " IndexFeatureFile -F " + WrapperUtility.ConvertWindowsPath(knownSitesVcf) + "; fi");
            }

            // create the recalibration table
            commands.Add(
                "if [[ ( ! -f " + WrapperUtility.ConvertWindowsPath(RecalibrationTablePath) + " || ! -s " + WrapperUtility.ConvertWindowsPath(RecalibrationTablePath) + " ) ]]; then " +
                Gatk(Workers) +
                " BaseRecalibrator" +
                " -R " + WrapperUtility.ConvertWindowsPath(genomeFasta) +
                " -I " + WrapperUtility.ConvertWindowsPath(bam) +
                (hasKnownSites ? " --known-sites " + WrapperUtility.ConvertWindowsPath(knownSitesVcf) : "") +
                " -O " + WrapperUtility.ConvertWindowsPath(RecalibrationTablePath) +
                "; fi");

            // apply the recalibration table to the reads
            commands.Add(
                "if [[ ( ! -f " + WrapperUtility.ConvertWindowsPath(RecalibratedBamPath) + " || ! -s " + WrapperUtility.ConvertWindowsPath(RecalibratedBamPath) + " ) ]]; then " +
                Gatk(Workers) +
                " ApplyBQSR" +
                " -R " + WrapperUtility.ConvertWindowsPath(genomeFasta) +
                " -I " + WrapperUtility.ConvertWindowsPath(bam) +
                " --bqsr-recal-file " + WrapperUtility.ConvertWindowsPath(RecalibrationTablePath) +
                " -O " + WrapperUtility.ConvertWindowsPath(RecalibratedBamPath) +
                "; fi");

            commands.Add(SamtoolsWrapper.IndexBamCommand(RecalibratedBamPath));

            return commands;
        }
Beispiel #6
0
 /// <summary>
 /// Builds the bash commands that align reads with bowtie2 and pipe the result through
 /// "samtools view -b" into a sorted BAM. The script skips the alignment when the sorted BAM
 /// already exists and is not empty.
 /// </summary>
 /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand</param>
 /// <param name="analysisDirectory">Not used by this method</param>
 /// <param name="bowtieIndexPrefix">Index prefix given to bowtie2 with -x</param>
 /// <param name="threads">Value for -p; also passed to SamtoolsWrapper.SortBamFromStdin</param>
 /// <param name="fastqPaths">One file is passed with -U; otherwise the first two are passed with -1 and -2</param>
 /// <param name="strandSpecific">Not used by this method</param>
 /// <param name="sortedBamFilePath">Set to the first fastq path with its extension replaced by ".sorted.bam"</param>
 /// <returns>The commands to run</returns>
 public static List<string> Bowtie2Align(string spritzDirectory, string analysisDirectory, string bowtieIndexPrefix, int threads, string[] fastqPaths,
                                         bool strandSpecific, out string sortedBamFilePath)
 {
     string firstFastq = fastqPaths[0];
     sortedBamFilePath = Path.Combine(Path.GetDirectoryName(firstFastq), Path.GetFileNameWithoutExtension(firstFastq)) + ".sorted.bam";

     string bashSortedBam = WrapperUtility.ConvertWindowsPath(sortedBamFilePath);

     // unpaired reads use -U, paired reads use -1/-2
     string readArguments;
     if (fastqPaths.Length == 1)
     {
         readArguments = " -U " + WrapperUtility.ConvertWindowsPath(firstFastq);
     }
     else
     {
         readArguments = " -1 " + WrapperUtility.ConvertWindowsPath(firstFastq) + " -2 " + WrapperUtility.ConvertWindowsPath(fastqPaths[1]);
     }

     List<string> commands = new List<string>();
     commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
     commands.Add($"if [[ ( ! -f {bashSortedBam} || ! -s {bashSortedBam} ) ]]; then");
     commands.Add($"  bowtie2-2.3.4/bowtie2  -x {WrapperUtility.ConvertWindowsPath(bowtieIndexPrefix)} -p {threads.ToString()}{readArguments} | samtools view -b -  | {SamtoolsWrapper.SortBamFromStdin(sortedBamFilePath, threads)}");
     commands.Add("fi");
     return commands;
 }
Beispiel #7
0
        /// <summary>
        /// Builds the bash commands that combine several GVCF files, genotype the combined file and
        /// filter indels out of the result. Sets HaplotypeCallerGvcfPath, HaplotypeCallerVcfPath and
        /// FilteredHaplotypeCallerVcfPath, all located next to the first GVCF.
        /// </summary>
        /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand</param>
        /// <param name="genomeFasta">Reference genome FASTA (-R)</param>
        /// <param name="gvcfPaths">GVCF files to combine; at least two are required</param>
        /// <returns>Bash commands; each GATK step is skipped by the script when its output already exists and is not empty</returns>
        /// <exception cref="ArgumentException"><paramref name="gvcfPaths"/> is null or holds fewer than two paths</exception>
        public List<string> CombineAndGenotypeGvcfs(string spritzDirectory, string genomeFasta, List<string> gvcfPaths)
        {
            if (gvcfPaths == null || gvcfPaths.Count <= 1)
            {
                throw new ArgumentException("CombineAndGenotypeGvcfs exception: at least two gvcfs must be specified to combine");
            }

            // NOTE(review): string.GetHashCode() is not guaranteed to be stable across processes or runtimes,
            // so the combined file name may differ between runs for the same inputs.
            int uniqueSuffix = 1;
            foreach (string f in gvcfPaths)
            {
                uniqueSuffix ^= f.GetHashCode();
            }

            string outputDirectory = Path.GetDirectoryName(gvcfPaths.First());
            HaplotypeCallerGvcfPath        = Path.Combine(outputDirectory, $"combined{uniqueSuffix}.g.vcf.gz");
            HaplotypeCallerVcfPath         = Path.Combine(Path.GetDirectoryName(HaplotypeCallerGvcfPath), $"{Path.GetFileNameWithoutExtension(Path.GetFileNameWithoutExtension(HaplotypeCallerGvcfPath))}.gt.vcf");
            FilteredHaplotypeCallerVcfPath = Path.Combine(Path.GetDirectoryName(HaplotypeCallerGvcfPath), $"{Path.GetFileNameWithoutExtension(HaplotypeCallerGvcfPath)}.NoIndels.vcf");

            List<string> commands = new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
                GenomeDictionaryIndexCommand(genomeFasta)
            };

            foreach (string gvcf in gvcfPaths)
            {
                // double check that the gvcf file is indexed; block-compressed files get a .tbi index rather
                // than .idx, so testing for .idx on a .gz file would re-index it on every run
                string indexExtension = gvcf.EndsWith(".gz", StringComparison.OrdinalIgnoreCase) ? ".tbi" : ".idx";
                commands.Add($"if [ ! -f {WrapperUtility.ConvertWindowsPath(gvcf)}{indexExtension} ]; then {Gatk(Workers)} IndexFeatureFile -F {WrapperUtility.ConvertWindowsPath(gvcf)}; fi");
            }

            // combine GVCFs
            string combineCommand =
                "if [ ! -f " + WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath) + " ] || [ " + " ! -s " + WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath) + " ]; then " +
                Gatk(Workers) +
                " CombineGVCFs" +
                " -R " + WrapperUtility.ConvertWindowsPath(genomeFasta) +
                " -V " + string.Join(" -V ", gvcfPaths.Select(gvcf => WrapperUtility.ConvertWindowsPath(gvcf))) +
                " -O " + WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath) +
                "; fi";

            commands.Add(combineCommand);

            // the combined file is always .g.vcf.gz, so its index is the .tbi file
            commands.Add($"if [ ! -f {WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath)}.tbi ]; then {Gatk(Workers)} IndexFeatureFile -F {WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath)}; fi");

            // genotype the gvcf file into a traditional vcf file
            string genotypeCommand =
                "if [ ! -f " + WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath) + " ] || [ " + " ! -s " + WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath) + " ]; then " +
                Gatk(Workers) +
                " GenotypeGVCFs" +
                " -R " + WrapperUtility.ConvertWindowsPath(genomeFasta) +
                " -V " + WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath) +
                " -O " + WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath) +
                "; fi";

            commands.Add(genotypeCommand);

            // filter out indels; the guard has to test the filtered output, because the genotyped
            // VCF always exists at this point and would otherwise prevent the filter from ever running
            string filterIndelsCommand =
                "if [ ! -f " + WrapperUtility.ConvertWindowsPath(FilteredHaplotypeCallerVcfPath) + " ] || [ " + " ! -s " + WrapperUtility.ConvertWindowsPath(FilteredHaplotypeCallerVcfPath) + " ]; then " +
                Gatk(Workers) +
                " SelectVariants" +
                " --select-type-to-exclude INDEL" +
                " -R " + WrapperUtility.ConvertWindowsPath(genomeFasta) +
                " -V " + WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath) +
                " -O " + WrapperUtility.ConvertWindowsPath(FilteredHaplotypeCallerVcfPath) +
                "; fi";

            commands.Add(filterIndelsCommand);

            return commands;
        }
Beispiel #8
0
        /// <summary>
        /// Groups (I'm just using one group, so it's more a formality) and sorts reads. Marks duplicates.
        /// Sets PreparedBamPath to the duplicate-marked BAM and creates a "tmp" folder in the Spritz directory.
        /// </summary>
        /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand; its "tmp" subfolder is used as --TMP_DIR</param>
        /// <param name="analysisDirectory">Not used by this method</param>
        /// <param name="threads">Not used by this method</param>
        /// <param name="bam">Input BAM; all intermediate and output files are named after it and placed next to it</param>
        /// <param name="genomeFasta">Genome FASTA for which the fasta index and dictionary commands are emitted</param>
        /// <param name="reference">Not used by this method</param>
        /// <returns>Bash commands; each GATK step is skipped by the script when its output already exists and is not empty</returns>
        public List<string> PrepareBamAndFasta(string spritzDirectory, string analysisDirectory, int threads, string bam, string genomeFasta, string reference)
        {
            string bamDirectory = Path.GetDirectoryName(bam);
            string bamBaseName  = Path.GetFileNameWithoutExtension(bam);

            string sortedCheckPath         = Path.Combine(bamDirectory, bamBaseName + ".headerSorted");
            string readGroupedCheckfile    = Path.Combine(bamDirectory, bamBaseName + ".headerReadGrouped");
            string sortedBam               = Path.Combine(bamDirectory, bamBaseName + ".sorted.bam");
            string groupedBamPath          = Path.Combine(Path.GetDirectoryName(sortedBam), Path.GetFileNameWithoutExtension(sortedBam) + ".grouped.bam");
            string markedDuplicatesBamPath = Path.Combine(Path.GetDirectoryName(groupedBamPath), Path.GetFileNameWithoutExtension(groupedBamPath) + ".marked.bam");
            string markedDuplicateMetrics  = Path.Combine(Path.GetDirectoryName(groupedBamPath), Path.GetFileNameWithoutExtension(groupedBamPath) + ".marked.metrics");

            string tmpDir = Path.Combine(spritzDirectory, "tmp");
            Directory.CreateDirectory(tmpDir);

            string bashBam        = WrapperUtility.ConvertWindowsPath(bam);
            string bashGroupedBam = WrapperUtility.ConvertWindowsPath(groupedBamPath);
            string bashMarkedBam  = WrapperUtility.ConvertWindowsPath(markedDuplicatesBamPath);
            string bashTmpDir     = WrapperUtility.ConvertWindowsPath(tmpDir);

            List<string> commands = new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

                SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
                GenomeDictionaryIndexCommand(genomeFasta),

                // record whether the header reports coordinate sorting and read groups
                "samtools view -H " + bashBam + " | grep SO:coordinate > " + WrapperUtility.ConvertWindowsPath(sortedCheckPath),
                "samtools view -H " + bashBam + " | grep '^@RG' > " + WrapperUtility.ConvertWindowsPath(readGroupedCheckfile),

                // group and sort (note, using picard-tools works, but picard.jar somehow is truncating the BAM files)
                // skipped when either the grouped BAM or the final marked BAM is already there
                "if [[ ( ! -f " + bashGroupedBam + " || ! -s " + bashGroupedBam + " ) && " +
                " ( ! -f " + bashMarkedBam + " || ! -s " + bashMarkedBam + " ) ]]; then " +
                Gatk(Workers, 2) +
                " AddOrReplaceReadGroups" +
                " -PU platform  -PL illumina -SM sample -LB library" +
                " -I " + bashBam +
                " -O " + bashGroupedBam +
                " -SO coordinate" +
                " --TMP_DIR " + bashTmpDir +
                "; fi",
                SamtoolsWrapper.IndexBamCommand(groupedBamPath),

                // mark duplicates (AS means assume sorted; note, using picard-tools works, but picard.jar somehow is truncating the BAM files)
                "if [[ ( ! -f " + bashMarkedBam + " || ! -s " + bashMarkedBam + " ) ]]; then " +
                Gatk(Workers) +
                " MarkDuplicates" +     // formerly picard
                " -I " + bashGroupedBam +
                " -O " + bashMarkedBam +
                " -M " + WrapperUtility.ConvertWindowsPath(markedDuplicateMetrics) +
                " --TMP_DIR " + bashTmpDir +
                " -AS true" +
                "; fi",
                SamtoolsWrapper.IndexBamCommand(markedDuplicatesBamPath),

                // clean up: the grouped BAM is only an intermediate, so remove it once the marked BAM exists
                // and is not empty (the previous guard was inverted and removed it only when marking had failed)
                "if [[ ( -f " + bashMarkedBam + " && -s " + bashMarkedBam + " ) && -f " + bashGroupedBam + " ]]; then " +
                "rm " + bashGroupedBam + "; fi",
            };

            PreparedBamPath = markedDuplicatesBamPath;
            return commands;
        }
Beispiel #9
0
        /// <summary>
        /// HaplotypeCaller for calling variants on each RNA-Seq BAM file individually.
        /// Sets HaplotypeCallerGvcfPath, HaplotypeCallerVcfPath and FilteredHaplotypeCallerVcfPath next to
        /// <paramref name="splitTrimBam"/>.
        /// </summary>
        /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand</param>
        /// <param name="experimentType">RNASequencing adds "--dont-use-soft-clipped-bases true" to HaplotypeCaller</param>
        /// <param name="threads">Value for --native-pair-hmm-threads</param>
        /// <param name="genomeFasta">Reference genome FASTA (-R)</param>
        /// <param name="splitTrimBam">Input BAM (-I); the output names are derived from it</param>
        /// <param name="dbsnpReferenceVcfPath">VCF passed to --dbsnp; indexed first when no .idx file is found</param>
        /// <returns>Bash commands; each GATK step is skipped by the script when its output already exists and is not empty</returns>
        public List<string> VariantCalling(string spritzDirectory, ExperimentType experimentType, int threads, string genomeFasta, string splitTrimBam, string dbsnpReferenceVcfPath)
        {
            string bamDirectory = Path.GetDirectoryName(splitTrimBam);
            string bamBaseName  = Path.GetFileNameWithoutExtension(splitTrimBam);

            HaplotypeCallerGvcfPath        = Path.Combine(bamDirectory, bamBaseName + ".g.vcf.gz");
            HaplotypeCallerVcfPath         = Path.Combine(bamDirectory, bamBaseName + ".g.gt.vcf");
            FilteredHaplotypeCallerVcfPath = Path.Combine(bamDirectory, bamBaseName + ".g.gt.NoIndels.vcf");

            // instantiated but not otherwise used in this method
            var vcftools = new VcfToolsWrapper();

            string bashFasta       = WrapperUtility.ConvertWindowsPath(genomeFasta);
            string bashBam         = WrapperUtility.ConvertWindowsPath(splitTrimBam);
            string bashDbsnp       = WrapperUtility.ConvertWindowsPath(dbsnpReferenceVcfPath);
            string bashGvcf        = WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath);
            string bashVcf         = WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath);
            string bashFilteredVcf = WrapperUtility.ConvertWindowsPath(FilteredHaplotypeCallerVcfPath);

            string bashGvcfIndex        = WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath + ".tbi");
            string bashVcfIndex         = WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath + ".idx");
            string bashFilteredVcfIndex = WrapperUtility.ConvertWindowsPath(FilteredHaplotypeCallerVcfPath + ".idx");

            string softClipOption = "";
            if (experimentType == ExperimentType.RNASequencing)
            {
                softClipOption = " --dont-use-soft-clipped-bases true";
            }

            List<string> commands = new List<string>();
            commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
            commands.Add(SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta));
            commands.Add(GenomeDictionaryIndexCommand(genomeFasta));

            // check that reference VCF is indexed
            commands.Add($"if [ ! -f {bashDbsnp}.idx ]; then {Gatk(Workers)} IndexFeatureFile -F {bashDbsnp}; fi");

            // call variants
            // "-ERC GVCF" prompts phasing
            // "--max-mnp-distance 3" can't be used for joint genotyping here, but this setting is available in mutect2 for doing tumor vs normal calls
            commands.Add(
                $"if [ ! -f {bashGvcf} ] || [  ! -s {bashGvcf} ]; then {Gatk(Workers, 2)} HaplotypeCaller" +
                $" --native-pair-hmm-threads {threads.ToString()}" +
                $" -R {bashFasta}" +
                $" -I {bashBam}" +
                $" --min-base-quality-score 20{softClipOption}" +
                $" --dbsnp {bashDbsnp}" +
                $" -O {bashGvcf}" +
                " -ERC GVCF --max-mnp-distance 3; fi");

            // index compressed gvcf file
            commands.Add($"if [ ! -f {bashGvcfIndex} ]; then {Gatk(Workers)} IndexFeatureFile -F {bashGvcf}; fi");

            // genotype the gvcf file into a traditional vcf file
            commands.Add(
                $"if [ ! -f {bashVcf} ] || [  ! -s {bashVcf} ]; then {Gatk(Workers, 2)} GenotypeGVCFs" +
                $" -R {bashFasta}" +
                $" -V {bashGvcf}" +
                $" -O {bashVcf}; fi");
            commands.Add($"if [ ! -f {bashVcfIndex} ]; then {Gatk(Workers)} IndexFeatureFile -F {bashVcf}; fi");

            // filter out indels
            commands.Add(
                $"if [ ! -f {bashFilteredVcf} ] || [  ! -s {bashFilteredVcf} ]; then {Gatk(Workers, 2)} SelectVariants" +
                " --select-type-to-exclude INDEL" +
                $" -R {bashFasta}" +
                $" -V {bashVcf}" +
                $" -O {bashFilteredVcf}; fi");
            commands.Add($"if [ ! -f {bashFilteredVcfIndex} ]; then {Gatk(Workers)} IndexFeatureFile -F {bashFilteredVcf}; fi");

            // TODO: variant filtration is currently disabled. The earlier RNA-Seq specific draft used
            // VariantFiltration with "-window 35 -cluster 3" (filter out clusters of 3 snps within 35 bases,
            // https://software.broadinstitute.org/gatk/documentation/topic?name=methods), plus the filters
            // "FS > 30.0" and "QD < 2.0"; need to check out recommendations before using it for DNA-Seq.

            return commands;
        }
Beispiel #10
0
        /// <summary>
        /// Builds the shell commands that split reads spanning splice junctions with GATK SplitNCigarReads.
        /// Splice junctions are represented in CIGAR strings by runs of N's (unknown nucleotide), and
        /// HaplotypeCaller requires such reads to be split in the BAM file.
        ///
        /// It is tempting to run several of these at once because the tool is not well parallelized, but it is
        /// not worth it: it uses quite a bit of RAM and is heavy on I/O at the beginning while reading the BAM files.
        /// Could possibly do 4 at a time on 128 GB RAM and 28 processors.
        /// </summary>
        /// <param name="spritzDirectory">Directory handed to <c>WrapperUtility.ChangeToToolsDirectoryCommand</c>.</param>
        /// <param name="genomeFasta">Genome FASTA; FASTA-index and dictionary-index commands are emitted for it.</param>
        /// <param name="dedupedBam">Input BAM; the intermediate and output BAM paths are derived from its location and name.</param>
        /// <returns>The ordered script lines. As a side effect, <c>SplitTrimBamPath</c> is set to the output BAM path.</returns>
        public List <string> SplitNCigarReads(string spritzDirectory, string genomeFasta, string dedupedBam)
        {
            // Intermediate: <input>.fixedQuals.bam next to the input; final output: <input>.fixedQuals.split.bam
            string fixedQualsName = Path.GetFileNameWithoutExtension(dedupedBam) + ".fixedQuals.bam";
            string fixedQualsBam  = Path.Combine(Path.GetDirectoryName(dedupedBam), fixedQualsName);
            string splitName      = Path.GetFileNameWithoutExtension(fixedQualsBam) + ".split.bam";
            SplitTrimBamPath = Path.Combine(Path.GetDirectoryName(fixedQualsBam), splitName);

            // Script-side spellings of every path used in the command lines below
            string genomeFastaArg   = WrapperUtility.ConvertWindowsPath(genomeFasta);
            string dedupedBamArg    = WrapperUtility.ConvertWindowsPath(dedupedBam);
            string fixedQualsBamArg = WrapperUtility.ConvertWindowsPath(fixedQualsBam);
            string splitBamArg      = WrapperUtility.ConvertWindowsPath(SplitTrimBamPath);

            // This also filters malformed reads
            string fixQualsCommand = $"{Gatk(Workers)} FixMisencodedBaseQualityReads -I {dedupedBamArg} -O {fixedQualsBamArg}";

            // Two variants of the same SplitNCigarReads call, differing only in the input BAM.
            // Not passed: --num_threads (not supported), and the older -rf ReassignOneMappingQuality / -RMQF 255 /
            // -RMQT 60 / -U ALLOW_N_CIGAR_READS options (mapping-quality reassignment is done with STAR).
            string splitFixedQualsCommand = $"{Gatk(Workers)} SplitNCigarReads -R {genomeFastaArg} -I {fixedQualsBamArg} -O {splitBamArg}";
            string splitDedupedCommand    = $"{Gatk(Workers)} SplitNCigarReads -R {genomeFastaArg} -I {dedupedBamArg} -O {splitBamArg}";

            var script = new List<string>();
            script.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
            script.Add(SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta));
            script.Add(GenomeDictionaryIndexCommand(genomeFasta));
            script.Add(SamtoolsWrapper.IndexBamCommand(dedupedBam));

            // Split and trim reads. Some datasets probably have misencoded quality scores, so the quality fix is
            // attempted first, and only when the fixed-quals BAM is missing or empty.
            // An exit code of 2 means FixMisencodedBaseQualityReads errored out because the base quality scores
            // were already correctly encoded; in that case the original deduped BAM is split instead.
            script.Add($"if [[ ( ! -f {fixedQualsBamArg} || ! -s {fixedQualsBamArg} ) ]]; then");
            script.Add($"  {fixQualsCommand}");
            script.Add("  if [ $? -ne 2 ]; then");
            script.Add($"    {splitFixedQualsCommand}");
            script.Add("  else");
            script.Add($"    {splitDedupedCommand}");
            script.Add("  fi");
            script.Add("fi");
            script.Add(SamtoolsWrapper.IndexBamCommand(SplitTrimBamPath));

            return script;
        }
Beispiel #11
0
 /// <summary>
 /// Runs <c>DownloadUCSCKnownVariantSites</c>, passes the resulting UCSC VCF through
 /// <c>ConvertVCFChromosomesUCSC2Ensembl</c>, and, unless this is a dry run, generates and runs a script
 /// that indexes both VCFs with GATK IndexFeatureFile when their .idx files are missing.
 /// </summary>
 /// <param name="spritzDirectory">Directory used for the tools-directory command and the script path.</param>
 /// <param name="commonOnly">Forwarded to <c>DownloadUCSCKnownVariantSites</c>.</param>
 /// <param name="reference">Forwarded to the download and conversion steps.</param>
 /// <param name="dryRun">When true, the indexing script is neither generated nor run.</param>
 /// <returns>The value stored in <c>EnsemblKnownSitesPath</c>.</returns>
 public string DownloadEnsemblKnownVariantSites(string spritzDirectory, bool commonOnly, string reference, bool dryRun)
 {
     DownloadUCSCKnownVariantSites(spritzDirectory, commonOnly, reference, dryRun);
     EnsemblKnownSitesPath = ConvertVCFChromosomesUCSC2Ensembl(spritzDirectory, UcscKnownSitesPath, reference, dryRun);

     if (dryRun)
     {
         return EnsemblKnownSitesPath;
     }

     // indexing is used for most GATK tools
     string scriptPath = WrapperUtility.GetAnalysisScriptPath(spritzDirectory, "IndexKnownVariantSites.bash");
     var indexingScript = new List<string>
     {
         WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory)
     };

     // One guarded indexing line per VCF: UCSC first, then Ensembl
     foreach (string vcfPath in new[] { UcscKnownSitesPath, EnsemblKnownSitesPath })
     {
         string vcfArg = WrapperUtility.ConvertWindowsPath(vcfPath);
         indexingScript.Add($"if [ ! -f {vcfArg}.idx ]; then {Gatk(Workers)} IndexFeatureFile -F {vcfArg}; fi");
     }

     WrapperUtility.GenerateAndRunScript(scriptPath, indexingScript).WaitForExit();
     return EnsemblKnownSitesPath;
 }
Beispiel #12
0
        /// <summary>
        /// Downloads dbSNP reference VCF file if it doesn't exist
        /// </summary>
        /// <param name="spritzDirectory">Directory the download script changes into and is stored under.</param>
        /// <param name="commonOnly">Forwarded to <c>KnownVariantSitesFileExists</c>.</param>
        /// <param name="reference">Forwarded to <c>KnownVariantSitesFileExists</c>.</param>
        /// <param name="dryRun">When true, nothing is downloaded even if the file is missing.</param>
        /// <returns>The value of <c>UcscKnownSitesPath</c>.</returns>
        public string DownloadUCSCKnownVariantSites(string spritzDirectory, bool commonOnly, string reference, bool dryRun)
        {
            // NOTE(review): TargetFileLocation, UcscKnownSitesDownloadPath and UcscKnownSitesPath are not assigned
            // in this method; presumably KnownVariantSitesFileExists sets them, so it is called even on a dry run. Confirm.
            bool knownSitesExists = KnownVariantSitesFileExists(spritzDirectory, commonOnly, reference);

            if (!knownSitesExists && !dryRun)
            {
                WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(spritzDirectory, "DownloadUcscVariants.bash"), new List <string>
                {
                    "cd " + WrapperUtility.ConvertWindowsPath(spritzDirectory),
                    "wget " + TargetFileLocation,
                    "gunzip " + WrapperUtility.ConvertWindowsPath(UcscKnownSitesDownloadPath) + ".gz",

                    // A successful gunzip already replaces the .gz with the decompressed file, so a plain rm
                    // would always fail here; -f keeps the cleanup for a failed gunzip without erroring otherwise.
                    "rm -f " + WrapperUtility.ConvertWindowsPath(UcscKnownSitesDownloadPath) + ".gz",
                    "mv " + WrapperUtility.ConvertWindowsPath(UcscKnownSitesDownloadPath) + " " + WrapperUtility.ConvertWindowsPath(UcscKnownSitesPath)
                }).WaitForExit();
            }
            return(UcscKnownSitesPath);
        }
Beispiel #13
0
        /// <summary>
        /// Fetches the FASTQ files for an SRA accession with fasterq-dump, unless
        /// <c>{sraAccession}_1.fastq</c> or <c>{sraAccession}_2.fastq</c> already exists in the analysis directory.
        /// Sets <c>LogPath</c> and <c>FastqPaths</c>.
        /// </summary>
        /// <param name="spritzDirectory">Directory handed to <c>WrapperUtility.ChangeToToolsDirectoryCommand</c>.</param>
        /// <param name="threads">Value passed to fasterq-dump's <c>--threads</c> option.</param>
        /// <param name="analysisDirectory">Output directory for the FASTQ files, the log, and the generated script.</param>
        /// <param name="sraAccession">Accession to download; also the prefix of the expected FASTQ file names.</param>
        public void Fetch(string spritzDirectory, int threads, string analysisDirectory, string sraAccession)
        {
            LogPath    = Path.Combine(analysisDirectory, sraAccession + "download.log");
            FastqPaths = new[] { sraAccession + "_1.fastq", sraAccession + "_2.fastq" }
                .Select(f => Path.Combine(analysisDirectory, f))
                .Where(f => File.Exists(f))
                .ToArray();

            if (FastqPaths.Length > 0) // already downloaded
            {
                // Test for "trimmed" on the file name only. Matching against the full path would discard every
                // existing file whenever the analysis directory itself contains "trimmed", leaving FastqPaths
                // empty without triggering a download.
                FastqPaths = FastqPaths
                    .Where(x => x != null && !Path.GetFileName(x).Contains("trimmed") && x.EndsWith(".fastq"))
                    .ToArray();
                return;
            }

            string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Download" + sraAccession + ".bash");

            WrapperUtility.GenerateAndRunScript(scriptPath, new List <string>
            {
                $"echo \"Downloading {sraAccession}\"",
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

                // stderr of fasterq-dump is redirected into the log file
                $"sratoolkit*/bin/fasterq-dump --progress --threads {threads.ToString()} --split-files --outdir \"{WrapperUtility.ConvertWindowsPath(analysisDirectory)}\" {sraAccession} 2> {WrapperUtility.ConvertWindowsPath(LogPath)}",
            }).WaitForExit();

            // Pick up whatever FASTQ files now exist for this accession
            FastqPaths = Directory.GetFiles(analysisDirectory, sraAccession + "*.fastq").ToArray();
        }