Example #1
0
        /// <summary>
        /// Perform transcript reconstruction using stringtie. Assembly commands are collected for every sorted BAM file,
        /// followed by commands that merge the resulting gene models with the reference gene model; the combined script
        /// is then run to completion. Afterwards each per-sample GTF and the merged GTF are passed through
        /// FilterGtfEntriesWithoutStrand.
        /// Adds to TranscriptGtfPaths and FilteredTranscriptGtfPaths, and sets MergedGtfPath and FilteredMergedGtfPath.
        /// </summary>
        /// <param name="spritzDirectory">Passed through to the assembly and merge command builders.</param>
        /// <param name="analysisDirectory">Directory that receives the merged GTF and the generated script.</param>
        /// <param name="threads">Thread count passed to the assembly command builder.</param>
        /// <param name="geneModelGtfOrGff">Reference gene model used for assembly and for merging.</param>
        /// <param name="genome">Genome passed to the assembly command builder.</param>
        /// <param name="strandSpecific">When true, assembly is requested with Strandedness.Forward; otherwise Strandedness.None.</param>
        /// <param name="inferStrandSpecificity">Passed through to the assembly command builder.</param>
        /// <param name="sortedBamFiles">BAM files to assemble, one assembly per file.</param>
        /// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">Passed to FilterGtfEntriesWithoutStrand for the per-sample GTFs (the merged GTF is always filtered with false).</param>
        public void TranscriptReconstruction(string spritzDirectory, string analysisDirectory, int threads, string geneModelGtfOrGff, Genome genome,
                                             bool strandSpecific, bool inferStrandSpecificity, List<string> sortedBamFiles, bool filterEntriesWithZeroAbundanceStringtieEstimates)
        {
            // transcript reconstruction with stringtie (transcripts and quantities used for lncRNA discovery, etc.)
            List<string> reconstructionCommands = new List<string>();

            // The requested strandedness does not depend on the individual BAM file, so compute it once.
            Strandedness requestedStrandedness = strandSpecific ? Strandedness.Forward : Strandedness.None;

            foreach (string sortedBam in sortedBamFiles)
            {
                reconstructionCommands.AddRange(AssembleTranscripts(spritzDirectory, threads, sortedBam, geneModelGtfOrGff, genome, requestedStrandedness, inferStrandSpecificity, out string stringtieGtfTranscriptGtfPath));
                TranscriptGtfPaths.Add(stringtieGtfTranscriptGtfPath);
            }

            // merge the resultant gene models with the reference (used for sample specific databases)
            // The merged file name carries a suffix derived from every path currently in TranscriptGtfPaths.
            // NOTE(review): string.GetHashCode() is not guaranteed to be stable across processes or runtimes,
            // so this suffix may differ between runs for the same inputs -- confirm whether that matters here.
            int uniqueSuffix = 1;

            foreach (string f in TranscriptGtfPaths)
            {
                uniqueSuffix = uniqueSuffix ^ f.GetHashCode();
            }
            MergedGtfPath = Path.Combine(analysisDirectory, "MergedStringtieModel" + uniqueSuffix + ".gtf");
            reconstructionCommands.AddRange(MergeTranscriptPredictions(spritzDirectory, geneModelGtfOrGff, TranscriptGtfPaths, MergedGtfPath));
            WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "TranscriptReconstruction.bash"), reconstructionCommands).WaitForExit();

            // filter out the transcripts lacking strand information
            foreach (string gtf in TranscriptGtfPaths)
            {
                string filtered = Path.Combine(Path.GetDirectoryName(gtf), Path.GetFileNameWithoutExtension(gtf) + ".filtered.gtf");
                FilterGtfEntriesWithoutStrand(gtf, filtered, filterEntriesWithZeroAbundanceStringtieEstimates);
                FilteredTranscriptGtfPaths.Add(filtered);
            }
            FilteredMergedGtfPath = Path.Combine(Path.GetDirectoryName(MergedGtfPath), Path.GetFileNameWithoutExtension(MergedGtfPath) + ".filtered.gtf");
            FilterGtfEntriesWithoutStrand(MergedGtfPath, FilteredMergedGtfPath, false); // stringtie merged GTFs no longer have abundance values
        }
Example #2
0
 /// <summary>
 /// Aligns reads with hisat2 by generating and running the "Hisat2Align.bash" script.
 /// A single FASTQ path is aligned as unpaired reads (-U); otherwise the first two paths are
 /// aligned as a pair (-1 / -2) and any further paths are ignored.
 /// </summary>
 /// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
 /// <param name="analysisDirectory">Directory used to resolve the script path.</param>
 /// <param name="IndexPrefix">hisat2 index prefix passed to -x.</param>
 /// <param name="fastqPaths">One FASTQ path (unpaired) or two or more (the first two are used as a pair).</param>
 /// <param name="outputDirectory">
 /// Despite the name, receives the SAM file name passed to -S
 /// ("Hisat2OutUnpaired.sam" or "Hisat2OutPaired.sam"); it is not combined with any directory here.
 /// </param>
 /// <exception cref="System.ArgumentNullException"><paramref name="fastqPaths"/> is null.</exception>
 /// <exception cref="System.ArgumentException"><paramref name="fastqPaths"/> is empty.</exception>
 public static void Align(string spritzDirectory, string analysisDirectory, string IndexPrefix, string[] fastqPaths, out string outputDirectory)
 {
     // Fail with a descriptive error instead of a NullReference/IndexOutOfRange from the command construction below.
     if (fastqPaths == null)
     {
         throw new System.ArgumentNullException(nameof(fastqPaths));
     }
     if (fastqPaths.Length == 0)
     {
         throw new System.ArgumentException("At least one FASTQ path is required for alignment.", nameof(fastqPaths));
     }

     bool unpaired = fastqPaths.Length == 1;
     outputDirectory = unpaired ? "Hisat2OutUnpaired.sam" : "Hisat2OutPaired.sam";

     string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Hisat2Align.bash");
     string changeDirectory = WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory);
     string index = WrapperUtility.ConvertWindowsPath(IndexPrefix);

     // Only the read arguments differ between the unpaired and paired invocations.
     string readArguments = unpaired
         ? " -U " + string.Join(",", fastqPaths.Select(x => WrapperUtility.ConvertWindowsPath(x)))
         : " -1 " + WrapperUtility.ConvertWindowsPath(fastqPaths[0]) +
           " -2 " + WrapperUtility.ConvertWindowsPath(fastqPaths[1]);

     WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
     {
         changeDirectory,
         "hisat2-2.1.0/hisat2 -q -x" +
         " " + index +
         readArguments +
         " -S " + outputDirectory,
     }).WaitForExit();
 }
Example #3
0
        /// <summary>
        /// Prepares an Ensembl genome fasta for alignment and all following analysis. Ensembl orders chromosomes
        /// lexicographically rather than karyotypically, and some software (GATK, for example) expects karyotypic order.
        /// Sets EnsemblGenome and ReorderedFastaPath.
        /// </summary>
        /// <param name="analysisDirectory">Directory used to resolve the path of the decompression script.</param>
        /// <param name="genomeFasta">Path to the genome fasta. A ".gz" or ".tgz" extension triggers gunzip, after which the extension is dropped from the path.</param>
        public void PrepareEnsemblGenomeFasta(string analysisDirectory, string genomeFasta)
        {
            string fastaExtension = Path.GetExtension(genomeFasta);
            bool needsDecompression = fastaExtension == ".gz" || fastaExtension == ".tgz";

            if (needsDecompression)
            {
                string gunzipScript = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Gzippy.bash");
                List<string> gunzipCommands = new List<string>
                {
                    $"gunzip {WrapperUtility.ConvertWindowsPath(genomeFasta)}"
                };
                WrapperUtility.GenerateAndRunScript(gunzipScript, gunzipCommands).WaitForExit();
                genomeFasta = Path.ChangeExtension(genomeFasta, null);
            }

            // The same fasta file is used throughout, and all VCF and GTF chromosome reference IDs must match it.
            // Right now this is based on Ensembl references, so those chromosome IDs are used throughout.
            // TODO: try this with UCSC references to judge whether there's a difference in quality / yield / FDR etc in subsequent proteomics analysis
            // The file needs to be in karyotypic order so that it does not have to be reordered for GATK analysis.
            string fastaFolder = Path.GetDirectoryName(genomeFasta);
            string fastaName = Path.GetFileNameWithoutExtension(genomeFasta);
            ReorderedFastaPath = Path.Combine(fastaFolder, fastaName + ".karyotypic.fa");
            EnsemblGenome = new Genome(genomeFasta);

            if (EnsemblGenome.IsKaryotypic())
            {
                // Already in the required order: use the input file as-is.
                ReorderedFastaPath = genomeFasta;
                return;
            }

            EnsemblGenome.Chromosomes = EnsemblGenome.KaryotypicOrder();
            if (File.Exists(ReorderedFastaPath))
            {
                // A reordered fasta from an earlier run is reused rather than rewritten.
                return;
            }
            Genome.WriteFasta(EnsemblGenome.Chromosomes.Select(x => x.Sequence), ReorderedFastaPath);
        }
Example #4
0
        /// <summary>
        /// Builds the shell commands for cufflinks transcript assembly of one BAM file. Note that fragment bias
        /// estimation (--frag-bias-correct) and multi-read rescuing (--multi-read-correct) are not used.
        /// These take a lot of time, and they only provide better abundance estimates, which we use RSEM for.
        /// The commands check whether the BAM header reports coordinate sorting, sort the BAM if it does not,
        /// and run cufflinks only when the transcripts file is missing or empty. Nothing is executed here.
        /// </summary>
        /// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
        /// <param name="analysisDirectory">Directory used to resolve the analysis script path.</param>
        /// <param name="threads">Thread count passed to samtools sort and to cufflinks --num-threads.</param>
        /// <param name="bamPath">Alignment file to assemble; output paths are derived from its directory and name.</param>
        /// <param name="geneModelGtfOrGffPath">Gene model passed to cufflinks --GTF-guide and to the strandedness inference.</param>
        /// <param name="genome">Genome passed to the strandedness inference.</param>
        /// <param name="strandSpecific">Whether to treat the library as stranded when strandedness is not inferred.</param>
        /// <param name="inferStrandSpecificity">When true, strandedness is taken from BAMProperties instead of <paramref name="strandSpecific"/>.</param>
        /// <param name="outputDirectory">Receives the cufflinks output directory ("&lt;bam name&gt;.cufflinksOutput" next to the BAM file).</param>
        /// <returns>The list of shell commands to run.</returns>
        public static List<string> AssembleTranscripts(string spritzDirectory, string analysisDirectory, int threads, string bamPath, string geneModelGtfOrGffPath, Genome genome, bool strandSpecific, bool inferStrandSpecificity, out string outputDirectory)
        {
            bool isStranded = strandSpecific;

            if (inferStrandSpecificity)
            {
                BAMProperties bamProperties = new BAMProperties(bamPath, geneModelGtfOrGffPath, genome, 0.8);
                isStranded = bamProperties.Strandedness != Strandedness.None;
            }

            string bamFolder = Path.GetDirectoryName(bamPath);
            string bamName = Path.GetFileNameWithoutExtension(bamPath);
            string sortedCheckPath = Path.Combine(bamFolder, bamName + ".cufflinksSortCheck");

            outputDirectory = Path.Combine(bamFolder, bamName + ".cufflinksOutput");

            // NOTE(review): the result is unused, but the call is kept because GetAnalysisScriptPath
            // is not visible here and may have side effects -- confirm before removing.
            string script_name = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "CufflinksRun.bash");

            string sortedCheck = WrapperUtility.ConvertWindowsPath(sortedCheckPath);
            string transcripts = WrapperUtility.ConvertWindowsPath(Path.Combine(outputDirectory, TranscriptsFilename));
            string sortedBam = WrapperUtility.ConvertWindowsPath(Path.Combine(bamFolder, bamName + ".sorted.bam"));

            return new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

                // The check file is non-empty only when the BAM header declares SO:coordinate.
                "samtools view -H " + WrapperUtility.ConvertWindowsPath(bamPath) + " | grep SO:coordinate > " + sortedCheck,
                "if [ ! -s " + sortedCheck + " ]; then " + SamtoolsWrapper.SortBam(bamPath, threads) + "; fi",

                // $bam points at the original file, or at the ".sorted.bam" file when sorting was needed.
                "bam=" + WrapperUtility.ConvertWindowsPath(bamPath),
                "if [ ! -s " + sortedCheck + " ]; then bam=" + sortedBam + "; fi",

                "if [[ ! -f " + transcripts + " || ! -s " + transcripts + " ]]; then " +
                "cufflinks-2.2.1/cufflinks " +
                " --num-threads " + threads.ToString() +
                " --GTF-guide " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath) +
                " --output-dir " + WrapperUtility.ConvertWindowsPath(outputDirectory) +
                // Leading space is required; without it the flag is appended directly to the output directory path.
                (isStranded ? " --library-type fr-firststrand" : "") +
                " $bam" +
                "; fi",
            };
        }
Example #5
0
        /// <summary>
        /// Runs the SnpEff primary variant annotation commands as the "SnpEffGenerateProteinXml.bash" script
        /// and returns the protein XML path reported by the wrapper.
        /// </summary>
        /// <param name="spritzDirectory">Passed to the annotation command builder.</param>
        /// <param name="analysisDirectory">Directory used to resolve the script path.</param>
        /// <param name="reference">Reference name passed to the annotation command builder.</param>
        /// <param name="inputFilePathForFilePrefix">Input file path passed to the annotation command builder.</param>
        /// <returns>The wrapper's VariantProteinXmlPath after the script has exited.</returns>
        public static string GenerateXmlDatabaseFromReference(string spritzDirectory, string analysisDirectory, string reference, string inputFilePathForFilePrefix)
        {
            SnpEffWrapper annotator = new SnpEffWrapper(1);
            string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffGenerateProteinXml.bash");
            var annotationCommands = annotator.PrimaryVariantAnnotation(spritzDirectory, reference, inputFilePathForFilePrefix, true);

            WrapperUtility.GenerateAndRunScript(scriptPath, annotationCommands).WaitForExit();

            return annotator.VariantProteinXmlPath;
        }
Example #6
0
        /// <summary>
        /// Writes a copy of the gene model that keeps only lines starting with one of the genome's chromosome
        /// friendly names, or with "#", by running grep through the "FilterGeneModel.bash" script.
        /// </summary>
        /// <param name="analysisDirectory">Directory used to resolve the script path.</param>
        /// <param name="geneModelGtfOrGff">Gene model file to filter.</param>
        /// <param name="genome">Genome whose chromosomes supply the accepted line prefixes.</param>
        /// <param name="filteredGeneModel">Receives the output path: the input path with ".filtered" inserted before the extension.</param>
        public static void FilterGeneModel(string analysisDirectory, string geneModelGtfOrGff, Genome genome, out string filteredGeneModel)
        {
            // One "^prefix" alternative per chromosome name, plus "^#" for lines starting with '#'.
            var acceptedPrefixes = genome.Chromosomes.Select(c => c.FriendlyName).Concat(new[] { "#" });
            string grepQuery = "\"^" + string.Join(@"\|^", acceptedPrefixes) + "\"";

            string modelFolder = Path.GetDirectoryName(geneModelGtfOrGff);
            string modelName = Path.GetFileNameWithoutExtension(geneModelGtfOrGff);
            string modelExtension = Path.GetExtension(geneModelGtfOrGff);
            filteredGeneModel = Path.Combine(modelFolder, modelName) + ".filtered" + modelExtension;

            string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "FilterGeneModel.bash");
            List<string> filterCommands = new List<string>
            {
                "grep " + grepQuery + " " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGff) + " > " + WrapperUtility.ConvertWindowsPath(filteredGeneModel)
            };
            WrapperUtility.GenerateAndRunScript(scriptPath, filterCommands).WaitForExit();
        }
Example #7
0
        /// <summary>
        /// Downloads the list of SnpEff databases and appends configuration entries for the database matching
        /// <paramref name="reference"/> to snpEff.config. Returns early when both the database list file and a
        /// data folder whose name starts with the reference already exist.
        /// See here for how to generate databases from scratch: http://lab.loman.net/2012/11/16/how-to-get-snpeff-working-with-bacterial-genomes-from-ncbi/
        /// Sets DatabaseListPath.
        /// </summary>
        /// <param name="spritzDirectory">Root directory containing "snpEffDatabases.txt" and "Tools/SnpEff".</param>
        /// <param name="analysisDirectory">Directory used to resolve the script paths.</param>
        /// <param name="reference">Reference name; matched case-insensitively as a prefix of database names.</param>
        /// <exception cref="System.ArgumentException">No database in the downloaded list starts with <paramref name="reference"/>.</exception>
        public void DownloadSnpEffDatabase(string spritzDirectory, string analysisDirectory, string reference)
        {
            DatabaseListPath = Path.Combine(spritzDirectory, "snpEffDatabases.txt");

            // check for existing list and database
            bool databaseListExists = File.Exists(DatabaseListPath);
            string databaseDirectory = Path.Combine(spritzDirectory, "Tools", "SnpEff", "data");

            string[] existingDatabases = Directory.Exists(databaseDirectory) ? Directory.GetDirectories(databaseDirectory) : new string[0];

            // Uses the invariant culture, like the prefix match against the downloaded list below.
            bool databaseExists = existingDatabases.Any(d => Path.GetFileName(d).StartsWith(reference, true, CultureInfo.InvariantCulture));

            if (databaseListExists && databaseExists)
            {
                return;
            }

            // download database list
            string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseDownloadList.bash");

            WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                "echo \"Downloading list of SnpEff references\"",
                SnpEff(Workers) + " databases > " + WrapperUtility.ConvertWindowsPath(DatabaseListPath),
                WrapperUtility.EnsureClosedFileCommands(DatabaseListPath)
            }).WaitForExit();

            // The database name is the first tab-delimited column of each line in the list.
            string snpeffReference = File.ReadLines(DatabaseListPath)
                .Select(line => line.Split('\t')[0].TrimEnd())
                .FirstOrDefault(d => d.StartsWith(reference, true, CultureInfo.InvariantCulture));

            if (snpeffReference == null)
            {
                // Previously this surfaced as a NullReferenceException while building the commands below.
                throw new System.ArgumentException("No SnpEff database starting with \"" + reference + "\" was found in " + DatabaseListPath + ".", nameof(reference));
            }

            // download database (it downloads automatically now, with more feedback), but still need the mitochondrial references
            // Each command appends to snpEff.config; nothing here checks whether the entries are already present.
            string snpEffConfig = WrapperUtility.ConvertWindowsPath(Path.Combine(spritzDirectory, "Tools", "SnpEff", "snpEff.config"));

            scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseDownload.bash");
            WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                "echo \"\n# " + snpeffReference + "\" >> " + snpEffConfig,
                "echo \"" + snpeffReference + ".genome : Human genome " + snpeffReference.Split('.')[0] + " using RefSeq transcripts\" >> " + snpEffConfig,
                "echo \"" + snpeffReference + ".reference : ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/\" >> " + snpEffConfig,
                "echo \"\t" + snpeffReference + ".M.codonTable : Vertebrate_Mitochondrial\" >> " + snpEffConfig,
                "echo \"\t" + snpeffReference + ".MT.codonTable : Vertebrate_Mitochondrial\" >> " + snpEffConfig,
            }).WaitForExit();
        }
Example #8
0
 /// <summary>
 /// Generic method for subsetting a BAM file with GATK PrintReads, run through the "SubsetBam.bash" script.
 /// Useful for testing new methods.
 /// </summary>
 /// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
 /// <param name="analysisDirectory">Directory used to resolve the script path.</param>
 /// <param name="threads">Value passed to --num_threads.</param>
 /// <param name="bam">Input BAM file (-I).</param>
 /// <param name="genomeFasta">Reference fasta (-R).</param>
 /// <param name="genomeRegion">Region passed verbatim to -L.</param>
 /// <param name="outputBam">Output BAM file (-o).</param>
 public void SubsetBam(string spritzDirectory, string analysisDirectory, int threads, string bam, string genomeFasta, string genomeRegion, string outputBam)
 {
     string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SubsetBam.bash");
     string changeDirectory = WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory);

     // Assemble the PrintReads invocation in one piece.
     string printReads =
         $"{Gatk(Workers)} PrintReads --num_threads {threads}" +
         $" -R {WrapperUtility.ConvertWindowsPath(genomeFasta)}" +
         $" -I {WrapperUtility.ConvertWindowsPath(bam)}" +
         $" -o {WrapperUtility.ConvertWindowsPath(outputBam)}" +
         $" -L {genomeRegion}";

     List<string> subsetCommands = new List<string> { changeDirectory, printReads };
     WrapperUtility.GenerateAndRunScript(scriptPath, subsetCommands).WaitForExit();
 }
Example #9
0
 /// <summary>
 /// Builds a hisat2 index for the genome fasta via the "Hisat2Build.bash" script, unless IndexExists
 /// reports that one is already present.
 /// </summary>
 /// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
 /// <param name="analysisDirectory">Directory used to resolve the script path.</param>
 /// <param name="genomeFasta">Genome fasta to index.</param>
 /// <param name="IndexPrefix">Receives the index prefix: the fasta path without its extension. Set even when no build is run.</param>
 public static void GenerateIndex(string spritzDirectory, string analysisDirectory, string genomeFasta, out string IndexPrefix)
 {
     string fastaFolder = Path.GetDirectoryName(genomeFasta);
     string fastaName = Path.GetFileNameWithoutExtension(genomeFasta);
     IndexPrefix = Path.Combine(fastaFolder, fastaName);

     if (!IndexExists(genomeFasta))
     {
         string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Hisat2Build.bash");
         string changeDirectory = WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory);
         string buildIndex = "hisat2-2.1.0/hisat2-build" +
             " " + WrapperUtility.ConvertWindowsPath(genomeFasta) +
             " " + WrapperUtility.ConvertWindowsPath(IndexPrefix);

         WrapperUtility.GenerateAndRunScript(scriptPath, new List<string> { changeDirectory, buildIndex }).WaitForExit();
     }
 }
Example #10
0
 /// <summary>
 /// Obtains the UCSC known variant sites, converts their chromosome names to Ensembl style, and, unless this
 /// is a dry run, indexes both VCF files with GATK IndexFeatureFile when no ".idx" file exists yet.
 /// Sets EnsemblKnownSitesPath.
 /// </summary>
 /// <param name="spritzDirectory">Root directory; also used here as the directory for resolving the script path.</param>
 /// <param name="commonOnly">Passed through to DownloadUCSCKnownVariantSites.</param>
 /// <param name="reference">Reference name passed to the download and conversion steps.</param>
 /// <param name="dryRun">When true, no indexing script is generated or run.</param>
 /// <returns>The value of EnsemblKnownSitesPath.</returns>
 public string DownloadEnsemblKnownVariantSites(string spritzDirectory, bool commonOnly, string reference, bool dryRun)
 {
     DownloadUCSCKnownVariantSites(spritzDirectory, commonOnly, reference, dryRun);
     EnsemblKnownSitesPath = ConvertVCFChromosomesUCSC2Ensembl(spritzDirectory, UcscKnownSitesPath, reference, dryRun);

     if (dryRun)
     {
         return EnsemblKnownSitesPath;
     }

     // indexing is used for most GATK tools
     string scriptPath = WrapperUtility.GetAnalysisScriptPath(spritzDirectory, "IndexKnownVariantSites.bash");
     string changeDirectory = WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory);
     string ucscVcf = WrapperUtility.ConvertWindowsPath(UcscKnownSitesPath);
     string ensemblVcf = WrapperUtility.ConvertWindowsPath(EnsemblKnownSitesPath);

     List<string> indexCommands = new List<string>
     {
         changeDirectory,
         $"if [ ! -f {ucscVcf}.idx ]; then {Gatk(Workers)} IndexFeatureFile -F {ucscVcf}; fi",
         $"if [ ! -f {ensemblVcf}.idx ]; then {Gatk(Workers)} IndexFeatureFile -F {ensemblVcf}; fi",
     };
     WrapperUtility.GenerateAndRunScript(scriptPath, indexCommands).WaitForExit();

     return EnsemblKnownSitesPath;
 }
Example #11
0
        /// <summary>
        /// Trims reads with skewer via the "Skewered.bash" script. Output names are derived from the first read
        /// file: "-trimmed.fastq" for a single input, "-trimmed-pair1.fastq" and "-trimmed-pair2.fastq" when
        /// more than one input is given. Only the first two inputs are passed to skewer.
        /// Nothing is run when the expected outputs already exist or when <paramref name="dryRun"/> is true.
        /// </summary>
        /// <param name="spritzDirectory">Root directory; used for the tools directory and the adapter file path.</param>
        /// <param name="analysisDirectory">Directory used to resolve the script path.</param>
        /// <param name="threads">Value passed to skewer -t.</param>
        /// <param name="qualityFilter">Value passed to skewer -q.</param>
        /// <param name="readPaths">Input read files; a ".gz" extension on the first one marks all inputs as compressed.</param>
        /// <param name="dryRun">When true, output paths are computed but skewer is not run.</param>
        /// <param name="readTrimmedPaths">Receives an array as long as <paramref name="readPaths"/>; only the first two entries are filled.</param>
        /// <param name="log">Receives the "-trimmed.log" path, or an empty string when there are no inputs.</param>
        public static void Trim(string spritzDirectory, string analysisDirectory, int threads, int qualityFilter, string[] readPaths, bool dryRun, out string[] readTrimmedPaths, out string log)
        {
            log = "";
            readTrimmedPaths = new string[readPaths.Length];
            if (readPaths.Length <= 0)
            {
                return;
            }

            // For compressed input, strip the ".gz" so the output names derive from the uncompressed file name.
            bool compressed = Path.GetExtension(readPaths[0]) == ".gz";
            string[] uncompressedReadPaths = compressed
                ? readPaths.Select(x => Path.Combine(Path.GetDirectoryName(x), Path.GetFileNameWithoutExtension(x))).ToArray()
                : readPaths;

            string outputFolder = Path.GetDirectoryName(uncompressedReadPaths[0]);
            string outputName = Path.GetFileNameWithoutExtension(uncompressedReadPaths[0]);
            bool hasSecondRead = readPaths.Length > 1;

            // Only create the paired entry for paired input; inputs after the second are ignored.
            readTrimmedPaths[0] = Path.Combine(outputFolder, outputName + "-trimmed" + (hasSecondRead ? "-pair1" : "") + ".fastq");
            if (hasSecondRead)
            {
                readTrimmedPaths[1] = Path.Combine(outputFolder, outputName + "-trimmed-pair2.fastq");
            }
            log = Path.Combine(outputFolder, outputName + "-trimmed.log");

            bool alreadyTrimmed = File.Exists(readTrimmedPaths[0]) && (!hasSecondRead || File.Exists(readTrimmedPaths[1]));
            if (alreadyTrimmed || dryRun)
            {
                return;
            }

            string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Skewered.bash");
            string changeDirectory = WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory);
            string skewer = "skewer-0.2.2/skewer" +
                " -q " + qualityFilter.ToString() +
                " -o " + WrapperUtility.ConvertWindowsPath(Path.Combine(outputFolder, outputName)) +
                " -t " + threads.ToString() +
                " -x " + WrapperUtility.ConvertWindowsPath(Path.Combine(spritzDirectory, "Tools", "BBMap", "resources", "adapters.fa")) +
                " " + WrapperUtility.ConvertWindowsPath(readPaths[0]) +
                (hasSecondRead ? " " + WrapperUtility.ConvertWindowsPath(readPaths[1]) : "");

            WrapperUtility.GenerateAndRunScript(scriptPath, new List<string> { changeDirectory, skewer }).WaitForExit();
        }
Example #12
0
        /// <summary>
        /// Downloads dbSNP reference VCF file if it doesn't exist, by running the "DownloadUcscVariants.bash"
        /// script: fetch the archive with wget, decompress it, and move it to UcscKnownSitesPath.
        /// </summary>
        /// <param name="spritzDirectory">Directory the script changes into; also used to resolve the script path.</param>
        /// <param name="commonOnly">Passed to KnownVariantSitesFileExists.</param>
        /// <param name="reference">Reference name passed to KnownVariantSitesFileExists.</param>
        /// <param name="dryRun">When true, nothing is downloaded.</param>
        /// <returns>The value of UcscKnownSitesPath.</returns>
        public string DownloadUCSCKnownVariantSites(string spritzDirectory, bool commonOnly, string reference, bool dryRun)
        {
            // NOTE(review): TargetFileLocation, UcscKnownSitesDownloadPath and UcscKnownSitesPath are presumably
            // set by KnownVariantSitesFileExists -- confirm against that method.
            bool knownSitesExists = KnownVariantSitesFileExists(spritzDirectory, commonOnly, reference);

            if (knownSitesExists || dryRun)
            {
                return UcscKnownSitesPath;
            }

            string scriptPath = WrapperUtility.GetAnalysisScriptPath(spritzDirectory, "DownloadUcscVariants.bash");
            string downloadPath = WrapperUtility.ConvertWindowsPath(UcscKnownSitesDownloadPath);

            WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
            {
                "cd " + WrapperUtility.ConvertWindowsPath(spritzDirectory),
                "wget " + TargetFileLocation,
                "gunzip " + downloadPath + ".gz",
                // gunzip removes the .gz file on success, so a plain rm always failed here; -f makes the cleanup a no-op in that case.
                "rm -f " + downloadPath + ".gz",
                "mv " + downloadPath + " " + WrapperUtility.ConvertWindowsPath(UcscKnownSitesPath)
            }).WaitForExit();

            return UcscKnownSitesPath;
        }
Example #13
0
        /// <summary>
        /// Downloads Ensembl references for GRCh37 or GRCh38.
        ///
        /// Sets GenomeFastaPath, GtfGeneModelPath, Gff3GeneModelPath, and ProteinFastaPath properties; all four are
        /// set to empty strings when the reference is neither GRCh37 nor GRCh38 (compared case-insensitively).
        /// For GRCh37 the GFF3 path is the same as the GTF path.
        /// </summary>
        /// <param name="spritzDirectory">Not used by this method.</param>
        /// <param name="targetDirectory">Directory the files are downloaded into; also used to resolve the script path.</param>
        /// <param name="reference">"GRCh37" or "GRCh38".</param>
        /// <param name="dryRun">When true, the path properties are set but nothing is downloaded.</param>
        public void DownloadReferences(string spritzDirectory, string targetDirectory, string reference, bool dryRun)
        {
            bool downloadGrch37 = string.Equals(reference, "GRCh37", StringComparison.CurrentCultureIgnoreCase);
            bool downloadGrch38 = string.Equals(reference, "GRCh38", StringComparison.CurrentCultureIgnoreCase);

            if (downloadGrch37)
            {
                GenomeFastaPath = Path.Combine(targetDirectory, GRCh37PrimaryAssemblyFilename);
                GtfGeneModelPath = Path.Combine(targetDirectory, GRCh37GtfGeneModelFilename);
                Gff3GeneModelPath = GtfGeneModelPath; // GRCh37 reuses the GTF path for the GFF3 property
                ProteinFastaPath = Path.Combine(targetDirectory, GRCh37ProteinFastaFilename);
            }
            else if (downloadGrch38)
            {
                GenomeFastaPath = Path.Combine(targetDirectory, GRCh38PrimaryAssemblyFilename);
                GtfGeneModelPath = Path.Combine(targetDirectory, GRCh38GtfGeneModelFilename);
                Gff3GeneModelPath = Path.Combine(targetDirectory, GRCh38Gff3GeneModelFilename);
                ProteinFastaPath = Path.Combine(targetDirectory, GRCh38ProteinFastaFilename);
            }
            else
            {
                GenomeFastaPath = "";
                GtfGeneModelPath = "";
                Gff3GeneModelPath = "";
                ProteinFastaPath = "";
            }

            bool unknownReference = !downloadGrch37 && !downloadGrch38;
            if (unknownReference || dryRun)
            {
                return;
            }

            string scriptPath = WrapperUtility.GetAnalysisScriptPath(targetDirectory, "DownloadEnsemblReference.bash");

            // The script runs inside the target directory, so bare file names are used for the existence checks and outputs.
            string genomeFile = Path.GetFileName(GenomeFastaPath);
            string gtfFile = Path.GetFileName(GtfGeneModelPath);
            string gff3File = Path.GetFileName(Gff3GeneModelPath);
            string proteinFile = Path.GetFileName(ProteinFastaPath);

            var genomeUrl = downloadGrch38 ? GRCh38PrimaryAssemblyUrl : GRCh37PrimaryAssemblyUrl;
            var gtfUrl = downloadGrch38 ? GRCh38GtfGeneModelUrl : GRCh37GtfGeneModelUrl;
            var gff3Url = downloadGrch38 ? GRCh38Gff3GeneModelUrl : GRCh37GtfGeneModelUrl; // note GRCh37 calls the gtf url instead
            var proteinUrl = downloadGrch38 ? GRCh38ProteinFastaUrl : GRCh37ProteinFastaUrl;

            List<string> downloadCommands = new List<string>
            {
                $"cd {WrapperUtility.ConvertWindowsPath(targetDirectory)}",
                $"if [ ! -f {genomeFile} ]; then wget -O - {genomeUrl} | gunzip -c > {genomeFile}; fi",
                $"if [ ! -f {gtfFile} ]; then wget -O - {gtfUrl} | gunzip -c > {gtfFile}; fi",
                $"if [ ! -f {gff3File} ]; then wget -O - {gff3Url} | gunzip -c > {gff3File}; fi",
                $"if [ ! -f {proteinFile} ]; then wget -O - {proteinUrl} | gunzip -c > {proteinFile}; fi",
            };
            WrapperUtility.GenerateAndRunScript(scriptPath, downloadCommands).WaitForExit();

            // TODO: try writing the genome in karyotypic order here to order contigs before alignments; does the gtf then need to be reordered?
        }
Example #14
0
        /// <summary>
        /// Converts a gene model file (GTF or GFF2) to BED6, meaning the BED file has the minimum 6 columns.
        /// The conversion is skipped when a non-empty ".bed" file already exists next to the input.
        ///
        /// See https://www.biostars.org/p/206342/ for the awk fix.
        ///
        /// </summary>
        /// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
        /// <param name="analysisDirectory">Directory used to resolve the script path.</param>
        /// <param name="gtfOrGffPath">Gene model to convert; a ".gtf" extension selects gtf2bed, anything else gff2bed.</param>
        /// <returns>Path of the BED file: the input path with its extension replaced by ".bed".</returns>
        public static string GtfOrGff2Bed6(string spritzDirectory, string analysisDirectory, string gtfOrGffPath)
        {
            string extension = Path.GetExtension(gtfOrGffPath);
            string bedPath = Path.Combine(Path.GetDirectoryName(gtfOrGffPath), Path.GetFileNameWithoutExtension(gtfOrGffPath) + ".bed");

            bool bedIsReady = File.Exists(bedPath) && new FileInfo(bedPath).Length != 0;
            if (bedIsReady)
            {
                return bedPath;
            }

            bool isGtf = extension == ".gtf";
            string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Bed6Conversion.bash");
            string changeDirectory = WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory);

            // For GTF input, awk appends an empty transcript_id attribute to lines that lack one before piping to gtf2bed.
            string feeder = isGtf
                ? "awk '{ if ($0 ~ \"transcript_id\") print $0; else print $0\" transcript_id \\\"\\\";\"; }' "
                : "cat ";
            string converter = isGtf ? "gtf2bed" : "gff2bed";
            string conversion = feeder +
                WrapperUtility.ConvertWindowsPath(gtfOrGffPath) +
                " | " + converter +
                " - > " + WrapperUtility.ConvertWindowsPath(bedPath);

            WrapperUtility.GenerateAndRunScript(scriptPath, new List<string> { changeDirectory, conversion }).WaitForExit();
            return bedPath;
        }
Example #15
0
        /// <summary>
        /// Converts a GFF formatted gene model to GTF with gffread, run through the "GffToGtf.bash" script.
        /// gffread is invoked only when the output file is missing or empty.
        /// </summary>
        /// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
        /// <param name="analysisDirectory">Directory used to resolve the script path.</param>
        /// <param name="geneModelGffPath">Input gene model; its extension must start with ".gff".</param>
        /// <param name="geneModelGtfPath">Receives the output path: the input path with ".converted.gtf" appended.</param>
        /// <exception cref="ArgumentException">The input extension does not start with ".gff".</exception>
        public static void GffToGtf(string spritzDirectory, string analysisDirectory, string geneModelGffPath, out string geneModelGtfPath)
        {
            bool looksLikeGff = Path.GetExtension(geneModelGffPath).StartsWith(".gff");
            if (!looksLikeGff)
            {
                throw new ArgumentException("Input gene model must be gff formatted to convert to gtf.");
            }

            geneModelGtfPath = geneModelGffPath + ".converted.gtf";

            string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "GffToGtf.bash");
            string changeDirectory = WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory);
            string announce = $"echo \"Converting GFF to GTF: {geneModelGffPath} -> {geneModelGtfPath}\"";

            string gtfArgument = WrapperUtility.ConvertWindowsPath(geneModelGtfPath);
            string gffArgument = WrapperUtility.ConvertWindowsPath(geneModelGffPath);
            string convert =
                $"if [[ ! -f {gtfArgument} || ! -s {gtfArgument} ]]; then " +
                $"cufflinks-2.2.1/gffread {gffArgument} -T -o {gtfArgument}" +
                "; fi";

            WrapperUtility.GenerateAndRunScript(scriptPath, new List<string> { changeDirectory, announce, convert }).WaitForExit();
        }
Example #16
0
        /// <summary>
        /// Converts a gene model file to a sorted BED12 file with all 12 columns sometimes required of a BED file.
        /// GFF input (extension starting with ".gff") is first converted to GTF; the GTF is then converted to
        /// genePred, to BED12, and finally sorted with bedtools against the genome's ".fai" index.
        ///
        /// see https://gist.github.com/gireeshkbogu/f478ad8495dca56545746cd391615b93
        ///
        /// </summary>
        /// <param name="spritzDirectory">Passed to the GFF-to-GTF conversion.</param>
        /// <param name="analysisDirectory">Directory used to resolve the script path.</param>
        /// <param name="filteredGeneModelGtfGffPath">Gene model to convert (GTF or GFF).</param>
        /// <param name="genomeFastaPath">Genome fasta; ".fai" is appended to this path for bedtools sort -faidx.</param>
        /// <returns>Path of the sorted BED12 file (".sorted.bed12" next to the GTF).</returns>
        public static string Gtf2Bed12(string spritzDirectory, string analysisDirectory, string filteredGeneModelGtfGffPath, string genomeFastaPath)
        {
            string geneModelGtf = filteredGeneModelGtfGffPath;

            bool inputIsGff = Path.GetExtension(filteredGeneModelGtfGffPath).StartsWith(".gff");
            if (inputIsGff)
            {
                CufflinksWrapper.GffToGtf(spritzDirectory, analysisDirectory, filteredGeneModelGtfGffPath, out geneModelGtf);
            }

            // All intermediate and final files live next to the GTF and share its base name.
            string modelFolder = Path.GetDirectoryName(geneModelGtf);
            string modelName = Path.GetFileNameWithoutExtension(geneModelGtf);
            string genePredPath = Path.Combine(modelFolder, modelName + ".genePred");
            string bed12Path = Path.Combine(modelFolder, modelName + ".bed12");
            string sortedBed12Path = Path.Combine(modelFolder, modelName + ".sorted.bed12");

            string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Bed12FaidxSortConversion.bash");
            List<string> conversionCommands = new List<string>
            {
                $"gtfToGenePred {WrapperUtility.ConvertWindowsPath(geneModelGtf)} {WrapperUtility.ConvertWindowsPath(genePredPath)}",
                $"genePredToBed {WrapperUtility.ConvertWindowsPath(genePredPath)} {WrapperUtility.ConvertWindowsPath(bed12Path)}",
                $"bedtools sort -faidx {WrapperUtility.ConvertWindowsPath(genomeFastaPath + ".fai")} -i {WrapperUtility.ConvertWindowsPath(bed12Path)} > {WrapperUtility.ConvertWindowsPath(sortedBed12Path)}",
            };
            WrapperUtility.GenerateAndRunScript(scriptPath, conversionCommands).WaitForExit();

            return sortedBed12Path;
        }
Example #17
0
        /// <summary>
        /// Downloads the FASTQ files for an SRA accession with fasterq-dump, unless "&lt;accession&gt;_1.fastq" or
        /// "&lt;accession&gt;_2.fastq" already exists in the analysis directory.
        /// Sets LogPath and FastqPaths.
        /// </summary>
        /// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
        /// <param name="threads">Value passed to fasterq-dump --threads.</param>
        /// <param name="analysisDirectory">Output directory for the FASTQ files, the log and the generated script.</param>
        /// <param name="sraAccession">SRA accession to download.</param>
        public void Fetch(string spritzDirectory, int threads, string analysisDirectory, string sraAccession)
        {
            LogPath = Path.Combine(analysisDirectory, sraAccession + "download.log");

            FastqPaths = new[] { sraAccession + "_1.fastq", sraAccession + "_2.fastq" }
                .Select(f => Path.Combine(analysisDirectory, f))
                .Where(f => File.Exists(f))
                .ToArray();

            if (FastqPaths.Length > 0) // already downloaded
            {
                // Exclude trimmed files by file name only. Testing the full path would discard every
                // existing file whenever the analysis directory itself contains "trimmed".
                FastqPaths = FastqPaths.Where(x => !Path.GetFileName(x).Contains("trimmed")).ToArray();
                return;
            }

            string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Download" + sraAccession + ".bash");

            WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
            {
                $"echo \"Downloading {sraAccession}\"",
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                $"sratoolkit*/bin/fasterq-dump --progress --threads {threads} --split-files --outdir \"{WrapperUtility.ConvertWindowsPath(analysisDirectory)}\" {sraAccession} 2> {WrapperUtility.ConvertWindowsPath(LogPath)}",
            }).WaitForExit();

            // Pick up whatever FASTQ files the download produced for this accession.
            FastqPaths = Directory.GetFiles(analysisDirectory, sraAccession + "*.fastq");
        }
Example #18
0
        /// <summary>
        /// Builds a SnpEff database for a custom gene model, unless snpEffectPredictor.bin
        /// already exists in the data folder for the same reference name.
        /// </summary>
        /// <param name="spritzDirectory">Directory containing Tools/SnpEff</param>
        /// <param name="analysisDirectory">Passed to WrapperUtility.GetAnalysisScriptPath for the generated script</param>
        /// <param name="genomeFastaPath">Genome fasta; copied to data/genomes/{reference name}.fa</param>
        /// <param name="referenceProteinFastaPath">Protein fasta; copied to protein.fa in the reference's data folder</param>
        /// <param name="geneModelGtfOrGffPath">Gene model; copied to genes{extension} in the reference's data folder</param>
        /// <returns>Name of the snpEff reference that was generated</returns>
        public static string GenerateDatabase(string spritzDirectory, string analysisDirectory, string genomeFastaPath, string referenceProteinFastaPath, string geneModelGtfOrGffPath)
        {
            // reference name = upper-cased extension (without the dot) + hash code of the gene model path
            // NOTE(review): string.GetHashCode is not guaranteed to be stable across runtimes or processes;
            // confirm that a previously built database is still found on a later run.
            string geneModelExtension  = Path.GetExtension(geneModelGtfOrGffPath);
            string snpEffReferenceName = geneModelExtension.Substring(1).ToUpperInvariant() + geneModelGtfOrGffPath.GetHashCode().ToString();

            string snpEffFolder    = Path.Combine(spritzDirectory, "Tools", "SnpEff");
            string dataFolder      = Path.Combine(snpEffFolder, "data");
            string referenceFolder = Path.Combine(dataFolder, snpEffReferenceName);
            string genomesFolder   = Path.Combine(dataFolder, "genomes");
            string configPath      = Path.Combine(snpEffFolder, "snpEff.config");

            string scriptPath      = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseGeneration.bash");
            string geneModelOption = geneModelExtension.EndsWith("gtf") ? "-gtf22" : "-gff3";

            // nothing to do when the predictor file for this reference is already there
            if (File.Exists(Path.Combine(referenceFolder, "snpEffectPredictor.bin")))
            {
                return snpEffReferenceName;
            }

            // shell line that appends one quoted line of text to snpEff.config
            string AppendToConfig(string text) => "echo \"" + text + "\" >> " + WrapperUtility.ConvertWindowsPath(configPath);

            List<string> buildCommands = new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                "cd SnpEff",

                // data folder for this reference, holding the gene model and the protein fasta
                "mkdir data/" + snpEffReferenceName,
                "cp " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath) + " " + WrapperUtility.ConvertWindowsPath(Path.Combine(referenceFolder, "genes" + geneModelExtension)),
                "cp " + WrapperUtility.ConvertWindowsPath(referenceProteinFastaPath) + " " + WrapperUtility.ConvertWindowsPath(Path.Combine(referenceFolder, "protein.fa")),

                // the genome goes into the shared genomes folder, named after the reference
                "mkdir " + WrapperUtility.ConvertWindowsPath(genomesFolder),
                "cp " + WrapperUtility.ConvertWindowsPath(genomeFastaPath) + " " + WrapperUtility.ConvertWindowsPath(Path.Combine(genomesFolder, snpEffReferenceName + ".fa")),

                // register the reference in snpEff.config
                // note: the organism entries below are hard-coded to human; supporting other organisms would need more work here
                AppendToConfig("\n# " + snpEffReferenceName),
                AppendToConfig(snpEffReferenceName + ".genome : Homo_sapiens"),
                AppendToConfig(snpEffReferenceName + ".reference : ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/"),
                AppendToConfig("\t" + snpEffReferenceName + ".M.codonTable : Vertebrate_Mitochondrial"),
                AppendToConfig("\t" + snpEffReferenceName + ".MT.codonTable : Vertebrate_Mitochondrial"),

                // build the snpEff model from the tools directory
                "cd ..",
                SnpEff(1) + " build " + geneModelOption + " -v " + snpEffReferenceName,
            };
            WrapperUtility.GenerateAndRunScript(scriptPath, buildCommands).WaitForExit();
            return snpEffReferenceName;
        }
Example #19
0
        /// <summary>
        /// Runs RSeQC's inner_distance.py on a BAM file and averages the inner distances it reports.
        /// </summary>
        /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand and to the BED12 conversion</param>
        /// <param name="analysisDirectory">Passed to WrapperUtility.GetAnalysisScriptPath for the generated script</param>
        /// <param name="bamPath">BAM file to analyze; outputs are written beside it, prefixed with its name without extension</param>
        /// <param name="geneModelPath">Gene model; unless its extension is ".bed" it is first converted with BEDOPSWrapper.GffOrGtf2Bed12</param>
        /// <param name="outputFiles">Paths built from the R plot, frequency table, and distance table suffixes, in that order</param>
        /// <returns>Mean of all parsed distances strictly between -250 and 250, rounded to an integer</returns>
        /// <exception cref="InvalidOperationException">The distance table contains no distance within that range</exception>
        public static int InferInnerDistance(string spritzDirectory, string analysisDirectory, string bamPath, string geneModelPath, out string[] outputFiles)
        {
            if (Path.GetExtension(geneModelPath) != ".bed")
            {
                geneModelPath = BEDOPSWrapper.GffOrGtf2Bed12(spritzDirectory, analysisDirectory, geneModelPath);
            }

            // every output shares the same prefix: the BAM path without its extension
            string outputPrefix      = Path.Combine(Path.GetDirectoryName(bamPath), Path.GetFileNameWithoutExtension(bamPath));
            string distanceTablePath = outputPrefix + InnerDistanceDistanceTableSuffix;

            outputFiles = new string[]
            {
                outputPrefix + InnerDistanceRPlotSuffix,
                outputPrefix + InnerDistanceFrequencyTableSuffix,
                distanceTablePath
            };

            WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "InferInnerDistance.bash"), new List <string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                "python RSeQC-2.6.4/scripts/inner_distance.py" +
                " -i " + WrapperUtility.ConvertWindowsPath(bamPath) +       // input
                " -o " + WrapperUtility.ConvertWindowsPath(outputPrefix) +  // out prefix
                " -r " + WrapperUtility.ConvertWindowsPath(geneModelPath),  // gene model in BED format
                WrapperUtility.EnsureClosedFileCommands(outputFiles[0]),
                WrapperUtility.EnsureClosedFileCommands(outputFiles[1]),
                WrapperUtility.EnsureClosedFileCommands(outputFiles[2]),
            }).WaitForExit();

            // Stream the table and keep a running sum instead of materializing every line and distance.
            long distanceSum   = 0;
            int  distanceCount = 0;

            foreach (string line in File.ReadLines(distanceTablePath))
            {
                string[] columns = line.Split('\t');

                // The distance is read from the second column. Lines without one, or whose value
                // does not parse as an integer, are skipped rather than aborting the whole run.
                // The table is machine-written, so parse with the invariant culture.
                if (columns.Length > 1 &&
                    int.TryParse(columns[1], System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out int distance) &&
                    distance < 250 && distance > -250) // default settings for infer_distance
                {
                    distanceSum += distance;
                    distanceCount++;
                }
            }

            if (distanceCount == 0)
            {
                // same exception type an empty Average() would raise, but with an actionable message
                throw new InvalidOperationException("No inner distances between -250 and 250 were found in " + distanceTablePath);
            }

            return (int)Math.Round((double)distanceSum / distanceCount, 0);
        }
Example #20
0
        /// <summary>
        /// Aligns reads in fastq files using TopHat2.
        /// </summary>
        /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand</param>
        /// <param name="analysisDirectory">Passed to WrapperUtility.GetAnalysisScriptPath for the generated script</param>
        /// <param name="bowtieIndexPrefix">Bowtie index prefix handed to tophat2</param>
        /// <param name="threads">Value given to the --num-threads option</param>
        /// <param name="fastqPaths">Fastq files to align, passed to tophat2 as one comma-separated argument; the first one determines where output is written</param>
        /// <param name="strandSpecific">When true, --library-type fr-firststrand is added</param>
        /// <param name="outputDirectory">Receives the tophat output directory: the first fastq's name without extension plus "TophatOut", beside that fastq</param>
        public static void Align(string spritzDirectory, string analysisDirectory, string bowtieIndexPrefix, int threads, string[] fastqPaths,
                                 bool strandSpecific, out string outputDirectory)
        {
            string firstFastq     = fastqPaths[0];
            string fastqDirectory = Path.GetDirectoryName(firstFastq);
            string tempDir        = Path.Combine(fastqDirectory, "tmpDir");

            outputDirectory = Path.Combine(fastqDirectory, Path.GetFileNameWithoutExtension(firstFastq) + "TophatOut");
            Directory.CreateDirectory(tempDir);

            string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "TophatRun.bash");
            string toolsDirectoryCommand = WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory);

            // tophat2 command line, assembled as space-separated arguments
            // (--GTF is deliberately not passed: it triggers tophat to try building an index)
            List<string> tophatArguments = new List<string>
            {
                "tophat-2.1.1/tophat2",
                "--num-threads", threads.ToString(),
                "--output-dir", WrapperUtility.ConvertWindowsPath(outputDirectory),
                "--tmp-dir", WrapperUtility.ConvertWindowsPath(tempDir),
            };
            if (strandSpecific)
            {
                tophatArguments.Add("--library-type");
                tophatArguments.Add("fr-firststrand");
            }
            tophatArguments.Add(WrapperUtility.ConvertWindowsPath(bowtieIndexPrefix));
            tophatArguments.Add(string.Join(",", fastqPaths.Select(x => WrapperUtility.ConvertWindowsPath(x))));

            List<string> alignmentCommands = new List<string>
            {
                toolsDirectoryCommand,
                string.Join(" ", tophatArguments),

                // remove the temporary directory if it is still there after the run
                $"if [ -d {WrapperUtility.ConvertWindowsPath(tempDir)} ]; then rm -r {WrapperUtility.ConvertWindowsPath(tempDir)}; fi",
            };
            WrapperUtility.GenerateAndRunScript(scriptPath, alignmentCommands).WaitForExit();
        }
Example #21
0
        /// <summary>
        /// Converts a GTF or GFF gene model into a sorted BED12 file (a BED file with all 12 columns),
        /// going through the genePred format.
        ///
        /// see https://gist.github.com/gireeshkbogu/f478ad8495dca56545746cd391615b93
        ///
        /// </summary>
        /// <param name="spritzDirectory">Forwarded to CufflinksWrapper.GffToGtf when the input is a GFF</param>
        /// <param name="analysisDirectory">Passed to WrapperUtility.GetAnalysisScriptPath for the generated script, and to the GFF conversion</param>
        /// <param name="geneModelGtfOrGff">Gene model; an extension starting with ".gff" is converted to GTF first</param>
        /// <returns>Path of the sorted BED12 file, written beside the GTF with the extension ".sorted.bed12"</returns>
        public static string GffOrGtf2Bed12(string spritzDirectory, string analysisDirectory, string geneModelGtfOrGff)
        {
            string gtfPath = geneModelGtfOrGff;
            bool inputIsGff = Path.GetExtension(geneModelGtfOrGff).StartsWith(".gff");

            if (inputIsGff)
            {
                CufflinksWrapper.GffToGtf(spritzDirectory, analysisDirectory, geneModelGtfOrGff, out gtfPath);
            }

            // all intermediate and final files sit next to the GTF and share its base name
            string outputFolder    = Path.GetDirectoryName(gtfPath);
            string baseName        = Path.GetFileNameWithoutExtension(gtfPath);
            string genePredPath    = Path.Combine(outputFolder, baseName + ".genePred");
            string bed12Path       = Path.Combine(outputFolder, baseName + ".bed12");
            string sortedBed12Path = Path.Combine(outputFolder, baseName + ".sorted.bed12");

            string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Bed12conversion.bash");
            List<string> conversionCommands = new List<string>
            {
                // Note, there is a gff3ToGenePred program. Could test and replace that here for gff3 files.
                $"gtfToGenePred {WrapperUtility.ConvertWindowsPath(gtfPath)} {WrapperUtility.ConvertWindowsPath(genePredPath)}",
                $"genePredToBed {WrapperUtility.ConvertWindowsPath(genePredPath)} {WrapperUtility.ConvertWindowsPath(bed12Path)}",

                // sort on the first column, then numerically on the second
                $"sort -k1,1 -k2,2n {WrapperUtility.ConvertWindowsPath(bed12Path)} > {WrapperUtility.ConvertWindowsPath(sortedBed12Path)}",
            };
            WrapperUtility.GenerateAndRunScript(scriptPath, conversionCommands).WaitForExit();

            return sortedBed12Path;
        }
Example #22
0
        /// <summary>
        /// Uses seqtk to get a subset of reads from a (pair of) fastq file(s).
        /// Note: fastqs must have \n line endings, not \r\n.
        /// Nothing is sampled when the first output file already exists and is not empty.
        /// </summary>
        /// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand</param>
        /// <param name="analysisDirectory">Passed to WrapperUtility.GetAnalysisScriptPath for the generated script</param>
        /// <param name="fastqFiles">One fastq, or a pair; only the first two entries are used</param>
        /// <param name="numReads">Number of reads handed to seqtk sample</param>
        /// <param name="currentDirectory">Not used by this method</param>
        /// <param name="newFfiles">Receives one output path per sampled input: the input path with its extension replaced by ".segment.fastq"</param>
        /// <param name="useSeed">When true, the seed is passed to seqtk for a single fastq as well; for a pair it is always passed</param>
        /// <param name="seed">Value for the seqtk -s option</param>
        public static void SubsetFastqs(string spritzDirectory, string analysisDirectory, string[] fastqFiles, int numReads, string currentDirectory, out string[] newFfiles, bool useSeed = false, int seed = 0)
        {
            // output path for one input: same folder, ".segment.fastq" in place of the extension
            string SegmentPath(string fastq) => Path.Combine(Path.GetDirectoryName(fastq), Path.GetFileNameWithoutExtension(fastq) + ".segment.fastq");

            bool paired = fastqFiles.Length > 1;

            newFfiles = paired
                ? new string[] { SegmentPath(fastqFiles[0]), SegmentPath(fastqFiles[1]) }
                : new string[] { SegmentPath(fastqFiles[0]) };

            // Both files of a pair are always sampled with the same explicit seed
            // (presumably so the same reads are drawn from each file; confirm against the seqtk documentation).
            string seedOption = " -s" + seed.ToString();

            List<string> subsetCommands = new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                "if [ ! -s " + WrapperUtility.ConvertWindowsPath(newFfiles[0]) + " ]; then",
                "  echo \"Subsetting " + numReads.ToString() + " reads from " + string.Join(",", fastqFiles) + "\"",
                "  seqtk/seqtk sample" + (useSeed || paired ? seedOption : "") + " " + WrapperUtility.ConvertWindowsPath(fastqFiles[0]) + " " + numReads.ToString() + " > " + WrapperUtility.ConvertWindowsPath(newFfiles[0]),
            };

            // the second sampling command only exists for paired input; no empty line is emitted otherwise
            if (paired)
            {
                subsetCommands.Add("  seqtk/seqtk sample" + seedOption + " " + WrapperUtility.ConvertWindowsPath(fastqFiles[1]) + " " + numReads.ToString() + " > " + WrapperUtility.ConvertWindowsPath(newFfiles[1]));
            }
            subsetCommands.Add("fi");

            WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SubsetReads.bash"), subsetCommands).WaitForExit();
        }

        #endregion Public Methods
    }