示例#1
0
        /// <summary>
        /// Builds the shell commands that create a base-quality recalibration table for a BAM file
        /// (GATK BaseRecalibrator) and then apply it (GATK ApplyBQSR) to write a recalibrated BAM.
        /// Sets RecalibrationTablePath and RecalibratedBamPath to files next to the input BAM
        /// (same directory and base name, ending in ".recaltable" and ".recal.bam").
        /// Each GATK step is wrapped in a shell test so it only runs when its output is missing or empty.
        /// </summary>
        /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand to build the first command.</param>
        /// <param name="analysisDirectory">Not used by this method.</param>
        /// <param name="genomeFasta">Reference FASTA passed to GATK with -R; index commands for it are emitted first.</param>
        /// <param name="bam">Input BAM passed to GATK with -I.</param>
        /// <param name="knownSitesVcf">
        /// VCF passed to BaseRecalibrator with --known-sites. When null or empty, neither the VCF
        /// indexing command nor the --known-sites argument is emitted.
        /// </param>
        /// <returns>The shell commands, in execution order.</returns>
        public List<string> BaseRecalibration(string spritzDirectory, string analysisDirectory, string genomeFasta, string bam, string knownSitesVcf)
        {
            string bamDirectory = Path.GetDirectoryName(bam);
            string bamBaseName = Path.GetFileNameWithoutExtension(bam);
            RecalibrationTablePath = Path.Combine(bamDirectory, bamBaseName + ".recaltable");
            RecalibratedBamPath = Path.Combine(bamDirectory, bamBaseName + ".recal.bam");

            bool hasKnownSites = !string.IsNullOrEmpty(knownSitesVcf);
            string unixGenomeFasta = WrapperUtility.ConvertWindowsPath(genomeFasta);
            string unixBam = WrapperUtility.ConvertWindowsPath(bam);
            string unixRecalTable = WrapperUtility.ConvertWindowsPath(RecalibrationTablePath);
            string unixRecalBam = WrapperUtility.ConvertWindowsPath(RecalibratedBamPath);

            List<string> commands = new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
                GenomeDictionaryIndexCommand(genomeFasta),
            };

            // Check that the reference VCF is indexed. Skipped when no VCF was given, so that an
            // empty path does not produce a malformed "IndexFeatureFile -F " command.
            string knownSitesArgument = "";
            if (hasKnownSites)
            {
                string unixKnownSites = WrapperUtility.ConvertWindowsPath(knownSitesVcf);
                commands.Add("if [ ! -f " + unixKnownSites + ".idx ]; then " + Gatk(Workers) + " IndexFeatureFile -F " + unixKnownSites + "; fi");
                knownSitesArgument = " --known-sites " + unixKnownSites;
            }

            // Build the recalibration table unless a non-empty one already exists.
            commands.Add(
                "if [[ ( ! -f " + unixRecalTable + " || ! -s " + unixRecalTable + " ) ]]; then " +
                Gatk(Workers) +
                " BaseRecalibrator" +
                " -R " + unixGenomeFasta +
                " -I " + unixBam +
                knownSitesArgument +
                " -O " + unixRecalTable +
                "; fi");

            // Apply the table unless a non-empty recalibrated BAM already exists.
            commands.Add(
                "if [[ ( ! -f " + unixRecalBam + " || ! -s " + unixRecalBam + " ) ]]; then " +
                Gatk(Workers) +
                " ApplyBQSR" +
                " -R " + unixGenomeFasta +
                " -I " + unixBam +
                " --bqsr-recal-file " + unixRecalTable +
                " -O " + unixRecalBam +
                "; fi");

            commands.Add(SamtoolsWrapper.IndexBamCommand(RecalibratedBamPath));
            return commands;
        }
示例#2
0
        /// <summary>
        /// Builds the shell commands that align reads with STAR (coordinate-sorted BAM output),
        /// index the sorted BAM, run the command returned by StarDedupCommand on it, and index the
        /// deduplicated BAM. Deduplication is a step that's recommended for variant calling.
        /// Note: fastqs must have \n line endings, not \r\n.
        /// </summary>
        /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand to build the first command.</param>
        /// <param name="threads">Value for STAR --runThreadN; also passed to StarDedupCommand.</param>
        /// <param name="genomeDir">Value for STAR --genomeDir.</param>
        /// <param name="fastqFiles">Read files for STAR --readFilesIn, joined with spaces. If any ends in ".gz" or ".bz2" a matching --readFilesCommand is added.</param>
        /// <param name="outprefix">Value for STAR --outFileNamePrefix; the BAM suffix constants are appended to it to locate outputs.</param>
        /// <param name="overwriteStarAlignment">When false, alignment and deduplication are each wrapped in a shell test that skips them if the output exists and is non-empty. When true, they always run and the wrapper lines are emitted as empty strings.</param>
        /// <param name="strandSpecific">Not used by this method.</param>
        /// <param name="genomeLoad">Value for STAR --genomeLoad.</param>
        /// <returns>The shell commands, in execution order (may contain empty strings).</returns>
        public static List<string> AlignRNASeqReadsForVariantCalling(string spritzDirectory, int threads, string genomeDir, string[] fastqFiles,
                                                                      string outprefix, bool overwriteStarAlignment, bool strandSpecific = true, STARGenomeLoadOption genomeLoad = STARGenomeLoadOption.NoSharedMemory)
        {
            string readsIn = string.Join(" ", fastqFiles.Select(f => WrapperUtility.ConvertWindowsPath(f)));

            // STAR pipes each read file through this command, so it must DEcompress to stdout.
            // "bzip2 -c" would try to compress the .bz2 input; "bunzip2 -c" decompresses it.
            string readCommand = "";
            if (fastqFiles.Any(f => Path.GetExtension(f) == ".gz"))
            {
                readCommand = " --readFilesCommand zcat -c";
            }
            else if (fastqFiles.Any(f => Path.GetExtension(f) == ".bz2"))
            {
                readCommand = " --readFilesCommand bunzip2 -c";
            }

            string alignmentArguments =
                " --genomeLoad " + genomeLoad.ToString() +
                " --runMode alignReads" +
                " --runThreadN " + threads.ToString() +
                " --genomeDir " + WrapperUtility.ConvertWindowsPath(genomeDir) +
                " --readFilesIn " + readsIn +
                " --outSAMtype BAM SortedByCoordinate" +
                " --outBAMcompression 10" +
                " --limitBAMsortRAM " + Process.GetCurrentProcess().VirtualMemorySize64.ToString() +
                " --outFileNamePrefix " + WrapperUtility.ConvertWindowsPath(outprefix) +

                // chimeric junction settings (currently disabled)
                //" --chimSegmentMin 12" +
                //" --chimJunctionOverhangMin 12" +
                //" --alignSJDBoverhangMin 10" +
                //" --alignMatesGapMax 100000" +
                //" --alignIntronMax 100000" +
                //" --chimSegmentReadGapMax 3" +
                //" --alignSJstitchMismatchNmax 5 -1 5 5" +

                // stringtie parameters
                " --outSAMstrandField intronMotif" +            // adds XS tag to all alignments that contain a splice junction
                " --outFilterIntronMotifs RemoveNoncanonical" + // for cufflinks

                // gatk parameters
                " --outSAMattrRGline ID:1 PU:platform  PL:illumina SM:sample LB:library" + // this could shorten the time for samples that aren't multiplexed in preprocessing for GATK
                " --outSAMmapqUnique 60" +                                                 // this is used to ensure compatibility with GATK without having to use the GATK hacks
                readCommand;                                                               // note in the future, two sets of reads can be comma separated here, and the RGline can also be comma separated to distinguish them later

            string unixSortedBam = WrapperUtility.ConvertWindowsPath(outprefix + SortedBamFileSuffix);
            string unixDedupedBam = WrapperUtility.ConvertWindowsPath(outprefix + DedupedBamFileSuffix);

            // When overwriting, the "if ... then" / "fi" wrapper lines collapse to empty strings so
            // the wrapped command always runs.
            string alignGuardOpen = overwriteStarAlignment ? "" :
                "if [[ ( ! -f " + unixSortedBam + " || ! -s " + unixSortedBam + " ) ]]; then";
            string dedupGuardOpen = overwriteStarAlignment ? "" :
                "if [[ ( ! -f " + unixDedupedBam + " || ! -s " + unixDedupedBam + " ) ]]; then";
            string guardClose = overwriteStarAlignment ? "" : "fi";

            // Note: these File.Exists checks run while the command list is being built, not when
            // the generated script executes.
            bool removeLoadedGenome =
                File.Exists(outprefix + BamFileSuffix) &&
                File.Exists(outprefix + DedupedBamFileSuffix) &&
                genomeLoad == STARGenomeLoadOption.LoadAndRemove;

            return new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

                alignGuardOpen,
                "  STAR" + alignmentArguments,
                guardClose,
                SamtoolsWrapper.IndexBamCommand(unixSortedBam),

                dedupGuardOpen,
                "  " + StarDedupCommand(threads, outprefix + SortedBamFileSuffix, outprefix + Path.GetFileNameWithoutExtension(SortedBamFileSuffix)),
                guardClose,
                SamtoolsWrapper.IndexBamCommand(unixDedupedBam),

                removeLoadedGenome ? "STAR --genomeLoad " + STARGenomeLoadOption.Remove.ToString() : "",
            };
        }
示例#3
0
        /// <summary>
        /// Builds the shell commands that index the reference, add read groups to and coordinate-sort
        /// a BAM (GATK AddOrReplaceReadGroups; a single read group is used, so grouping is more a
        /// formality), and then mark duplicates (GATK MarkDuplicates).
        /// Creates the "tmp" directory under <paramref name="spritzDirectory"/> immediately and sets
        /// PreparedBamPath to the marked-duplicates BAM path.
        /// </summary>
        /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand; also the parent of the "tmp" directory handed to GATK as --TMP_DIR.</param>
        /// <param name="analysisDirectory">Not used by this method.</param>
        /// <param name="threads">Not used by this method.</param>
        /// <param name="bam">Input BAM; all intermediate and output files are written next to it.</param>
        /// <param name="genomeFasta">Reference FASTA for which index commands are emitted.</param>
        /// <param name="reference">Not used by this method.</param>
        /// <returns>The shell commands, in execution order.</returns>
        public List<string> PrepareBamAndFasta(string spritzDirectory, string analysisDirectory, int threads, string bam, string genomeFasta, string reference)
        {
            string bamDirectory = Path.GetDirectoryName(bam);
            string bamBaseName = Path.GetFileNameWithoutExtension(bam);

            string sortedCheckPath = Path.Combine(bamDirectory, bamBaseName + ".headerSorted");
            string readGroupedCheckfile = Path.Combine(bamDirectory, bamBaseName + ".headerReadGrouped");
            string sortedBam = Path.Combine(bamDirectory, bamBaseName + ".sorted.bam");
            string groupedBamPath = Path.Combine(Path.GetDirectoryName(sortedBam), Path.GetFileNameWithoutExtension(sortedBam) + ".grouped.bam");
            string markedDuplicatesBamPath = Path.Combine(Path.GetDirectoryName(groupedBamPath), Path.GetFileNameWithoutExtension(groupedBamPath) + ".marked.bam");
            string markedDuplicateMetrics = Path.Combine(Path.GetDirectoryName(groupedBamPath), Path.GetFileNameWithoutExtension(groupedBamPath) + ".marked.metrics");

            string tmpDir = Path.Combine(spritzDirectory, "tmp");
            Directory.CreateDirectory(tmpDir);

            string unixBam = WrapperUtility.ConvertWindowsPath(bam);
            string unixGroupedBam = WrapperUtility.ConvertWindowsPath(groupedBamPath);
            string unixMarkedBam = WrapperUtility.ConvertWindowsPath(markedDuplicatesBamPath);
            string unixTmpDir = WrapperUtility.ConvertWindowsPath(tmpDir);

            List<string> commands = new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

                SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
                GenomeDictionaryIndexCommand(genomeFasta),

                // record whether the BAM header declares coordinate sorting / read groups
                "samtools view -H " + unixBam + " | grep SO:coordinate > " + WrapperUtility.ConvertWindowsPath(sortedCheckPath),
                "samtools view -H " + unixBam + " | grep '^@RG' > " + WrapperUtility.ConvertWindowsPath(readGroupedCheckfile),

                // group and sort (note, using picard-tools works, but picard.jar somehow is truncating the BAM files)
                // Skipped when either the grouped BAM or the final marked BAM already exists and is non-empty.
                "if [[ ( ! -f " + unixGroupedBam + " || ! -s " + unixGroupedBam + " ) && " +
                " ( ! -f " + unixMarkedBam + " || ! -s " + unixMarkedBam + " ) ]]; then " +
                Gatk(Workers, 2) +
                " AddOrReplaceReadGroups" +
                " -PU platform  -PL illumina -SM sample -LB library" +
                " -I " + unixBam +
                " -O " + unixGroupedBam +
                " -SO coordinate" +
                " --TMP_DIR " + unixTmpDir +
                "; fi",
                // NOTE(review): after cleanup removes the grouped BAM, this index command still runs on
                // later invocations - confirm IndexBamCommand tolerates a missing BAM.
                SamtoolsWrapper.IndexBamCommand(groupedBamPath),

                // mark duplicates (AS means assume sorted; note, using picard-tools works, but picard.jar somehow is truncating the BAM files)
                "if [[ ( ! -f " + unixMarkedBam + " || ! -s " + unixMarkedBam + " ) ]]; then " +
                Gatk(Workers) +
                " MarkDuplicates" +     // formerly picard
                " -I " + unixGroupedBam +
                " -O " + unixMarkedBam +
                " -M " + WrapperUtility.ConvertWindowsPath(markedDuplicateMetrics) +
                " --TMP_DIR " + unixTmpDir +
                " -AS true" +
                "; fi",
                SamtoolsWrapper.IndexBamCommand(markedDuplicatesBamPath),

                // clean up: remove the intermediate grouped BAM only once the marked BAM exists and is
                // non-empty. The previous test was inverted and removed the intermediate only when
                // MarkDuplicates had NOT produced output. "rm -f" keeps reruns quiet when the
                // intermediate is already gone.
                "if [[ -f " + unixMarkedBam + " && -s " + unixMarkedBam + " ]]; then " +
                "rm -f " + unixGroupedBam + "; fi",
            };

            PreparedBamPath = markedDuplicatesBamPath;
            return commands;
        }
示例#4
0
        /// <summary>
        /// Builds the shell commands that run rsem-calculate-expression against an RSEM reference
        /// and, when a genome BAM is requested, sort and index that BAM with samtools.
        /// Compressed fastq inputs (detected from the first entry's ending: "gz", "bz2" or "bz") are
        /// decompressed with --keep first, and the decompressed paths are used for RSEM.
        /// Sets OutputPrefix, derived from the first fastq path and a hash of the reference prefix.
        /// The caller's <paramref name="fastqPaths"/> array is not modified.
        /// </summary>
        /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand and GetAlignerOption.</param>
        /// <param name="referencePrefix">RSEM reference prefix passed to rsem-calculate-expression.</param>
        /// <param name="threads">Value for --num-threads; also passed to SamtoolsWrapper.SortBam.</param>
        /// <param name="aligner">Aligner selection, translated to a command-line option by GetAlignerOption.</param>
        /// <param name="strandedness">Currently not forwarded to rsem-calculate-expression (see the review note in the body).</param>
        /// <param name="fastqPaths">One entry (single-end) or two entries (paired-end). Each entry may be a comma-separated list of files.</param>
        /// <param name="doOuptutBam">True to pass --output-genome-bam and add the samtools sort/index commands; false to pass --no-bam-output.</param>
        /// <returns>The shell commands, in execution order (the last entry is an empty string when no BAM is requested).</returns>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="fastqPaths"/> has no entries or more than two entries.</exception>
        public List<string> CalculateExpressionCommands(string spritzDirectory, string referencePrefix, int threads, RSEMAlignerOption aligner, Strandedness strandedness,
                                                         string[] fastqPaths, bool doOuptutBam)
        {
            // The single-string constructor treats its argument as the parameter NAME, so pass the
            // name and the message separately.
            if (fastqPaths.Length < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(fastqPaths), "No fastq files were given for RSEM calculate expression.");
            }
            if (fastqPaths.Length > 2)
            {
                throw new ArgumentOutOfRangeException(nameof(fastqPaths), "Too many fastq file types given for RSEM calculate expression.");
            }

            List<string> scriptCommands = new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                "cd RSEM-1.3.0",
            };

            // Work on a copy: the paths are rewritten below and the caller's array must stay intact.
            string[] analysisFastqPaths = (string[])fastqPaths.Clone();
            string alignerOption = GetAlignerOption(spritzDirectory, aligner);
            string threadOption = "--num-threads " + threads.ToString();

            // NOTE(review): strandedness is accepted but no "--strandedness" option is added to the
            // command below - confirm whether that is intended before wiring it in.

            // Decompress files if needed
            // The '--star-gzipped-read-file' and '--star-bzipped-read-file' options work, but then the rest of RSEM doesn't when using compressed files...
            bool fastqIsGunzipped = analysisFastqPaths[0].EndsWith("gz", StringComparison.Ordinal);
            bool fastqIsBunzipped = analysisFastqPaths[0].EndsWith("bz2", StringComparison.Ordinal) || analysisFastqPaths[0].EndsWith("bz", StringComparison.Ordinal);

            if (fastqIsGunzipped || fastqIsBunzipped)
            {
                string decompressionCommand = fastqIsGunzipped ? "gunzip" : "bunzip2";
                for (int i = 0; i < analysisFastqPaths.Length; i++)
                {
                    // An entry may be a comma-separated list; decompress and rename each file on its
                    // own rather than treating the whole list as a single path.
                    string[] files = analysisFastqPaths[i].Split(',');
                    for (int j = 0; j < files.Length; j++)
                    {
                        scriptCommands.Add($"{decompressionCommand} --keep {WrapperUtility.ConvertWindowsPath(files[j])}");
                        files[j] = Path.ChangeExtension(files[j], null);
                    }
                    analysisFastqPaths[i] = string.Join(",", files);
                }
            }

            // Convert every file of every entry, keeping the comma-separated grouping.
            string[] unixInputs = analysisFastqPaths
                .Select(entry => string.Join(",", entry.Split(',').Select(f => WrapperUtility.ConvertWindowsPath(f))))
                .ToArray();
            string inputOption = unixInputs.Length == 1 ?
                unixInputs[0] :
                "--paired-end " + unixInputs[0] + " " + unixInputs[1];

            string bamOption = doOuptutBam ? "--output-genome-bam" : "--no-bam-output";

            // NOTE(review): string.GetHashCode() is not guaranteed to be stable across runtimes or
            // processes, so this prefix may differ between runs and defeat the "skip if output
            // exists" checks below - confirm on the target runtime.
            string firstFastq = analysisFastqPaths[0].Split(',')[0];
            OutputPrefix = Path.Combine(Path.GetDirectoryName(firstFastq),
                                        Path.GetFileNameWithoutExtension(firstFastq) +
                                        "_" + Path.GetExtension(firstFastq).TrimStart('.').ToUpperInvariant() + // TrimStart tolerates a path with no extension
                                        referencePrefix.GetHashCode().ToString());

            string unixSortedBam = WrapperUtility.ConvertWindowsPath(OutputPrefix + GenomeSortedBamSuffix);
            string unixIsoformResults = WrapperUtility.ConvertWindowsPath(OutputPrefix + IsoformResultsSuffix);

            // RSEM likes to sort the transcript.bam file, which takes forever and isn't very useful, I've found. Just sort the genome.bam file instead
            // Runs when the sorted BAM is missing OR empty, matching the guards used by the other wrappers.
            string samtoolsCommands = !doOuptutBam ?
                                      "" :
                                      "if [[ ! -f " + unixSortedBam + " || ! -s " + unixSortedBam + " ]]; then\n" +
                                      "  " + SamtoolsWrapper.SortBam(OutputPrefix + GenomeBamSuffix, threads) + "\n" +
                                      "  " + SamtoolsWrapper.IndexBamCommand(OutputPrefix + GenomeSortedBamSuffix) + "\n" +
                                      "fi";

            // construct the commands
            scriptCommands.Add(
                "if [[ ! -f " + unixIsoformResults + " || ! -s " + unixIsoformResults + " ]]; then " +
                "./rsem-calculate-expression " +
                "--time " +         // include timed results
                "--calc-ci " +      // posterior calculation of 95% confidence intervals
                alignerOption + " " +
                threadOption + " " +
                bamOption + " " +
                inputOption + " " +
                WrapperUtility.ConvertWindowsPath(referencePrefix) + " " +
                WrapperUtility.ConvertWindowsPath(OutputPrefix) +
                "; fi");
            scriptCommands.Add(samtoolsCommands);
            return scriptCommands;
        }
示例#5
0
        /// <summary>
        /// Builds the shell commands that split reads spanning splice junctions with GATK SplitNCigarReads.
        /// Splice junctions appear as N operations in the CIGAR string, and HaplotypeCaller requires
        /// such reads to be split in the BAM file.
        /// GATK FixMisencodedBaseQualityReads is run first; if it exits with code 2 the original BAM
        /// is split instead of the fixed one.
        /// Sets SplitTrimBamPath to the split BAM, written next to <paramref name="dedupedBam"/>
        /// with ".fixedQuals.split.bam" appended to its base name.
        ///
        /// It's tempting to want to run a few of these at the same time because it's not well parallelized. It's just not worth it. It uses quite a bit of RAM and racks the I/O at the beginning when reading the BAM files.
        /// Could possibly do 4 at a time on 128 GB RAM and 28 processors.
        /// </summary>
        /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand to build the first command.</param>
        /// <param name="genomeFasta">Reference FASTA passed to SplitNCigarReads with -R; index commands for it are emitted first.</param>
        /// <param name="dedupedBam">Input BAM to fix and split.</param>
        /// <returns>The shell commands, in execution order.</returns>
        public List<string> SplitNCigarReads(string spritzDirectory, string genomeFasta, string dedupedBam)
        {
            string fixedQualsBam = Path.Combine(Path.GetDirectoryName(dedupedBam), Path.GetFileNameWithoutExtension(dedupedBam) + ".fixedQuals.bam");

            SplitTrimBamPath = Path.Combine(Path.GetDirectoryName(fixedQualsBam), Path.GetFileNameWithoutExtension(fixedQualsBam) + ".split.bam");

            string unixGenomeFasta = WrapperUtility.ConvertWindowsPath(genomeFasta);
            string unixDedupedBam = WrapperUtility.ConvertWindowsPath(dedupedBam);
            string unixFixedQualsBam = WrapperUtility.ConvertWindowsPath(fixedQualsBam);
            string unixSplitBam = WrapperUtility.ConvertWindowsPath(SplitTrimBamPath);

            // This also filters malformed reads
            string fixMisencodedQualsCmd =
                Gatk(Workers) +
                " FixMisencodedBaseQualityReads" +
                " -I " + unixDedupedBam +
                " -O " + unixFixedQualsBam;

            // The two split commands differ only in their input BAM. No thread-count option is
            // passed (noted as unsupported), and mapping-quality reassignment is not requested here
            // (per the original note, that is handled with STAR).
            string splitFixedQualsCmd =
                Gatk(Workers) +
                " SplitNCigarReads" +
                " -R " + unixGenomeFasta +
                " -I " + unixFixedQualsBam +
                " -O " + unixSplitBam;

            string splitOriginalCmd =
                Gatk(Workers) +
                " SplitNCigarReads" +
                " -R " + unixGenomeFasta +
                " -I " + unixDedupedBam +
                " -O " + unixSplitBam;

            List<string> commands = new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
                GenomeDictionaryIndexCommand(genomeFasta),

                // split and trim reads (some datasets are probably going to have misencoded quality scores)
                // exit code of 2 means that FixMisencodedBaseQualityReads errored out because the base quality scores were already correctly encoded
                SamtoolsWrapper.IndexBamCommand(dedupedBam),

                // Guard on the FINAL output. Testing the intermediate fixed-quals BAM would skip a
                // split that failed earlier, and would rerun everything whenever the exit-code-2
                // path left no fixed-quals BAM behind.
                "if [[ ( ! -f " + unixSplitBam + " || ! -s " + unixSplitBam + " ) ]]; then",
                "  " + fixMisencodedQualsCmd,
                "  if [ $? -ne 2 ]; then",
                "    " + splitFixedQualsCmd,
                "  else",
                "    " + splitOriginalCmd,
                "  fi",
                "fi",
                SamtoolsWrapper.IndexBamCommand(SplitTrimBamPath),
            };

            return commands;
        }