/// <summary>
/// Perform transcript reconstruction using stringtie.
/// Builds assembly commands for every sorted BAM file, adds the commands that merge the resulting
/// gene models with the reference gene model, runs them as a single script, and then writes
/// strand-filtered copies of each individual GTF and of the merged GTF.
/// </summary>
/// <param name="spritzDirectory">Passed through to the assembly and merge command builders.</param>
/// <param name="analysisDirectory">Directory that receives the merged GTF and the generated script.</param>
/// <param name="threads">Thread count passed to the assembly command builder.</param>
/// <param name="geneModelGtfOrGff">Reference gene model used for assembly and merging.</param>
/// <param name="genome">Genome passed to the assembly command builder.</param>
/// <param name="strandSpecific">When true, assembly is requested with <c>Strandedness.Forward</c>; otherwise <c>Strandedness.None</c>.</param>
/// <param name="inferStrandSpecificity">Passed through to the assembly command builder.</param>
/// <param name="sortedBamFiles">BAM files to assemble; one transcript GTF path is recorded per file.</param>
/// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">Passed to the filter for the per-sample GTFs; the merged GTF is always filtered with <c>false</c>.</param>
public void TranscriptReconstruction(string spritzDirectory, string analysisDirectory, int threads, string geneModelGtfOrGff, Genome genome, bool strandSpecific, bool inferStrandSpecificity, List<string> sortedBamFiles, bool filterEntriesWithZeroAbundanceStringtieEstimates)
{
    // transcript reconstruction with stringtie (transcripts and quantities used for lncRNA discovery, etc.)
    List<string> reconstructionCommands = new List<string>();
    foreach (string sortedBam in sortedBamFiles)
    {
        reconstructionCommands.AddRange(AssembleTranscripts(spritzDirectory, threads, sortedBam, geneModelGtfOrGff, genome, strandSpecific ? Strandedness.Forward : Strandedness.None, inferStrandSpecificity, out string stringtieGtfTranscriptGtfPath));
        TranscriptGtfPaths.Add(stringtieGtfTranscriptGtfPath);
    }

    // merge the resultant gene models with the reference (used for sample specific databases)
    // The suffix is derived from the hash codes of all transcript GTF paths collected so far.
    // NOTE(review): string.GetHashCode is not guaranteed to be stable across runtimes/processes — confirm the file name does not need to be reproducible.
    int uniqueSuffix = 1;
    foreach (string f in TranscriptGtfPaths)
    {
        uniqueSuffix ^= f.GetHashCode();
    }
    MergedGtfPath = Path.Combine(analysisDirectory, "MergedStringtieModel" + uniqueSuffix + ".gtf");
    reconstructionCommands.AddRange(MergeTranscriptPredictions(spritzDirectory, geneModelGtfOrGff, TranscriptGtfPaths, MergedGtfPath));
    WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "TranscriptReconstruction.bash"), reconstructionCommands).WaitForExit();

    // filter out the transcripts lacking strand information
    foreach (string gtf in TranscriptGtfPaths)
    {
        string filtered = Path.Combine(Path.GetDirectoryName(gtf), Path.GetFileNameWithoutExtension(gtf) + ".filtered.gtf");
        FilterGtfEntriesWithoutStrand(gtf, filtered, filterEntriesWithZeroAbundanceStringtieEstimates);
        FilteredTranscriptGtfPaths.Add(filtered);
    }
    FilteredMergedGtfPath = Path.Combine(Path.GetDirectoryName(MergedGtfPath), Path.GetFileNameWithoutExtension(MergedGtfPath) + ".filtered.gtf");
    FilterGtfEntriesWithoutStrand(MergedGtfPath, FilteredMergedGtfPath, false); // stringtie merged GTFs no longer have abundance values
}
/// <summary>
/// Aligns reads with HISAT2 by generating and running a bash script.
/// A single fastq is aligned as unpaired reads (-U); otherwise the first two fastqs are aligned as a pair (-1 / -2).
/// </summary>
/// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
/// <param name="analysisDirectory">Directory used to resolve the script path.</param>
/// <param name="IndexPrefix">HISAT2 index prefix passed to -x.</param>
/// <param name="fastqPaths">One fastq (unpaired) or two fastqs (paired).</param>
/// <param name="outputDirectory">Receives the SAM file name passed to -S ("Hisat2OutUnpaired.sam" or "Hisat2OutPaired.sam").</param>
public static void Align(string spritzDirectory, string analysisDirectory, string IndexPrefix, string[] fastqPaths, out string outputDirectory)
{
    bool unpaired = fastqPaths.Length == 1;
    outputDirectory = unpaired ? "Hisat2OutUnpaired.sam" : "Hisat2OutPaired.sam";

    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Hisat2Align.bash");
    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));

    string hisatCommand = "hisat2-2.1.0/hisat2 -q -x " + WrapperUtility.ConvertWindowsPath(IndexPrefix);
    if (unpaired)
    {
        hisatCommand += " -U " + WrapperUtility.ConvertWindowsPath(fastqPaths[0]);
    }
    else
    {
        hisatCommand += " -1 " + WrapperUtility.ConvertWindowsPath(fastqPaths[0]);
        hisatCommand += " -2 " + WrapperUtility.ConvertWindowsPath(fastqPaths[1]);
    }
    hisatCommand += " -S " + outputDirectory;
    commands.Add(hisatCommand);

    WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();
}
/// <summary>
/// Prepares an Ensembl genome fasta for alignment and all following analysis.
/// The main issue is that Ensembl orders chromosomes lexicographically, not karyotypically, as some software such as GATK expects.
/// Sets <c>EnsemblGenome</c> and <c>ReorderedFastaPath</c>.
/// </summary>
/// <param name="analysisDirectory">Directory used to resolve the path of the gunzip script.</param>
/// <param name="genomeFasta">Genome fasta path; a ".gz" or ".tgz" file is first decompressed with gunzip.</param>
public void PrepareEnsemblGenomeFasta(string analysisDirectory, string genomeFasta)
{
    string extension = Path.GetExtension(genomeFasta);
    bool isCompressed = extension == ".gz" || extension == ".tgz";
    if (isCompressed)
    {
        string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Gzippy.bash");
        List<string> commands = new List<string>
        {
            $"gunzip {WrapperUtility.ConvertWindowsPath(genomeFasta)}"
        };
        WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();

        // continue with the path that has the compression extension stripped
        genomeFasta = Path.ChangeExtension(genomeFasta, null);
    }

    // We need to use the same fasta file throughout and have all the VCF and GTF chromosome reference IDs be the same as these.
    // Right now this is based on ensembl references, so those are the chromosome IDs I will be using throughout
    // TODO: try this with UCSC references to judge whether there's a difference in quality / yield / FDR etc in subsequent proteomics analysis
    // This file needs to be in karyotypic order; this allows us not to have to reorder it for GATK analysis
    string fastaDirectory = Path.GetDirectoryName(genomeFasta);
    string fastaName = Path.GetFileNameWithoutExtension(genomeFasta);
    ReorderedFastaPath = Path.Combine(fastaDirectory, fastaName + ".karyotypic.fa");
    EnsemblGenome = new Genome(genomeFasta);

    if (EnsemblGenome.IsKaryotypic())
    {
        // already in the right order: use the input fasta as-is
        ReorderedFastaPath = genomeFasta;
        return;
    }

    EnsemblGenome.Chromosomes = EnsemblGenome.KaryotypicOrder();
    if (File.Exists(ReorderedFastaPath))
    {
        // a reordered fasta was written previously; do not rewrite it
        return;
    }
    Genome.WriteFasta(EnsemblGenome.Chromosomes.Select(x => x.Sequence), ReorderedFastaPath);
}
/// <summary>
/// Transcript assembly. Note that fragment bias estimation (--frag-bias-correct) and multi-read rescuing (--multi-read-correct) are not used.
/// These take a lot of time, and they only provide better abundance estimates, which we use RSEM for.
/// Returns the bash commands; it does not run them.
/// </summary>
/// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
/// <param name="analysisDirectory">Not used when building the commands; kept for interface compatibility.</param>
/// <param name="threads">Thread count passed to samtools sort and to cufflinks --num-threads.</param>
/// <param name="bamPath">BAM file to assemble; it is sorted first if its header lacks "SO:coordinate".</param>
/// <param name="geneModelGtfOrGffPath">Gene model passed to cufflinks --GTF-guide (and used for strandedness inference).</param>
/// <param name="genome">Genome used for strandedness inference.</param>
/// <param name="strandSpecific">Whether the library is stranded; overridden when <paramref name="inferStrandSpecificity"/> is true.</param>
/// <param name="inferStrandSpecificity">When true, strandedness is taken from <c>BAMProperties</c> instead of <paramref name="strandSpecific"/>.</param>
/// <param name="outputDirectory">Receives the cufflinks output directory, located next to the BAM file.</param>
/// <returns>The list of bash commands that perform the sort check, optional sort, and cufflinks run.</returns>
public static List<string> AssembleTranscripts(string spritzDirectory, string analysisDirectory, int threads, string bamPath, string geneModelGtfOrGffPath, Genome genome, bool strandSpecific, bool inferStrandSpecificity, out string outputDirectory)
{
    bool isStranded = strandSpecific;
    if (inferStrandSpecificity)
    {
        BAMProperties bamProperties = new BAMProperties(bamPath, geneModelGtfOrGffPath, genome, 0.8);
        isStranded = bamProperties.Strandedness != Strandedness.None;
    }

    string bamDirectory = Path.GetDirectoryName(bamPath);
    string bamName = Path.GetFileNameWithoutExtension(bamPath);
    string sortedCheckPath = Path.Combine(bamDirectory, bamName + ".cufflinksSortCheck");
    outputDirectory = Path.Combine(bamDirectory, bamName + ".cufflinksOutput");

    string convertedSortedCheckPath = WrapperUtility.ConvertWindowsPath(sortedCheckPath);
    string convertedTranscriptsPath = WrapperUtility.ConvertWindowsPath(Path.Combine(outputDirectory, TranscriptsFilename));

    return new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),

        // the check file is non-empty only if the BAM header declares coordinate sorting
        "samtools view -H " + WrapperUtility.ConvertWindowsPath(bamPath) + " | grep SO:coordinate > " + convertedSortedCheckPath,
        "if [ ! -s " + convertedSortedCheckPath + " ]; then " + SamtoolsWrapper.SortBam(bamPath, threads) + "; fi",

        // use the original BAM if it was sorted already, otherwise the ".sorted.bam" next to it
        "bam=" + WrapperUtility.ConvertWindowsPath(bamPath),
        "if [ ! -s " + convertedSortedCheckPath + " ]; then bam=" + WrapperUtility.ConvertWindowsPath(Path.Combine(bamDirectory, bamName + ".sorted.bam")) + "; fi",

        // only run cufflinks if the transcripts file is missing or empty
        "if [[ ! -f " + convertedTranscriptsPath + " || ! -s " + convertedTranscriptsPath + " ]]; then " +
            "cufflinks-2.2.1/cufflinks " +
            " --num-threads " + threads.ToString() +
            " --GTF-guide " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath) +
            " --output-dir " + WrapperUtility.ConvertWindowsPath(outputDirectory) +
            // the leading space matters: without it the flag is glued onto the output directory path
            (isStranded ? " --library-type fr-firststrand" : "") +
            " $bam" +
            "; fi",
    };
}
/// <summary>
/// Runs the primary variant annotation commands of a new <c>SnpEffWrapper</c> as a script
/// and returns the wrapper's protein XML path.
/// </summary>
/// <param name="spritzDirectory">Passed to the annotation command builder.</param>
/// <param name="analysisDirectory">Directory used to resolve the script path.</param>
/// <param name="reference">Reference name passed to the annotation command builder.</param>
/// <param name="inputFilePathForFilePrefix">Input path passed to the annotation command builder.</param>
/// <returns>The value of <c>VariantProteinXmlPath</c> on the wrapper after the script has exited.</returns>
public static string GenerateXmlDatabaseFromReference(string spritzDirectory, string analysisDirectory, string reference, string inputFilePathForFilePrefix)
{
    SnpEffWrapper annotator = new SnpEffWrapper(1);
    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffGenerateProteinXml.bash");
    var annotationCommands = annotator.PrimaryVariantAnnotation(spritzDirectory, reference, inputFilePathForFilePrefix, true);

    WrapperUtility.GenerateAndRunScript(scriptPath, annotationCommands).WaitForExit();

    return annotator.VariantProteinXmlPath;
}
/// <summary>
/// Uses grep to keep only the gene model lines that start with one of the genome's chromosome
/// friendly names or with "#", writing the result next to the input file.
/// </summary>
/// <param name="analysisDirectory">Directory used to resolve the script path.</param>
/// <param name="geneModelGtfOrGff">Gene model file to filter.</param>
/// <param name="genome">Genome whose chromosome friendly names define the lines to keep.</param>
/// <param name="filteredGeneModel">Receives the output path: the input path with ".filtered" inserted before the extension.</param>
public static void FilterGeneModel(string analysisDirectory, string geneModelGtfOrGff, Genome genome, out string filteredGeneModel)
{
    // line prefixes to keep: every chromosome name, plus "#" for comment/header lines
    var linePrefixes = genome.Chromosomes
        .Select(c => c.FriendlyName)
        .Concat(new[] { "#" });
    string grepQuery = "\"^" + string.Join(@"\|^", linePrefixes) + "\"";

    string modelDirectory = Path.GetDirectoryName(geneModelGtfOrGff);
    string modelName = Path.GetFileNameWithoutExtension(geneModelGtfOrGff);
    string modelExtension = Path.GetExtension(geneModelGtfOrGff);
    filteredGeneModel = Path.Combine(modelDirectory, modelName) + ".filtered" + modelExtension;

    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "FilterGeneModel.bash");
    string grepCommand = "grep " + grepQuery + " " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGff) + " > " + WrapperUtility.ConvertWindowsPath(filteredGeneModel);
    WrapperUtility.GenerateAndRunScript(scriptPath, new List<string> { grepCommand }).WaitForExit();
}
// see here for how to generate them from scratch: http://lab.loman.net/2012/11/16/how-to-get-snpeff-working-with-bacterial-genomes-from-ncbi/
/// <summary>
/// Ensures the SnpEff database list exists and appends configuration entries for the requested reference to snpEff.config.
/// Returns immediately when both the database list file and a data folder starting with the reference name already exist.
/// </summary>
/// <param name="spritzDirectory">Root directory containing snpEffDatabases.txt and Tools/SnpEff.</param>
/// <param name="analysisDirectory">Directory used to resolve the script paths.</param>
/// <param name="reference">Reference name; matched case-insensitively as a prefix of the SnpEff database names.</param>
/// <exception cref="ArgumentException">No entry in the downloaded database list starts with <paramref name="reference"/>.</exception>
public void DownloadSnpEffDatabase(string spritzDirectory, string analysisDirectory, string reference)
{
    DatabaseListPath = Path.Combine(spritzDirectory, "snpEffDatabases.txt");

    // check for existing list and database
    bool databaseListExists = File.Exists(DatabaseListPath);
    string databaseDirectory = Path.Combine(spritzDirectory, "Tools", "SnpEff", "data");
    string[] existingDatabases = Directory.Exists(databaseDirectory) ? Directory.GetDirectories(databaseDirectory) : new string[0];
    bool databaseExists = existingDatabases.Any(d => Path.GetFileName(d).StartsWith(reference, true, null));
    if (databaseListExists && databaseExists)
    {
        return;
    }

    // download database list
    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseDownloadList.bash");
    WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "echo \"Downloading list of SnpEff references\"",
        SnpEff(Workers) + " databases > " + WrapperUtility.ConvertWindowsPath(DatabaseListPath),
        WrapperUtility.EnsureClosedFileCommands(DatabaseListPath)
    }).WaitForExit();

    // the database name is the first tab-separated column of each line
    List<string> databases = File.ReadLines(DatabaseListPath)
        .Select(line => line.Split('\t')[0].TrimEnd())
        .ToList();
    string snpeffReference = databases.FirstOrDefault(d => d.StartsWith(reference, true, CultureInfo.InvariantCulture));
    if (snpeffReference == null)
    {
        // previously this surfaced as a NullReferenceException while building the config commands
        throw new ArgumentException("No SnpEff database listed in " + DatabaseListPath + " starts with \"" + reference + "\".", nameof(reference));
    }

    // download database (it downloads automatically now, with more feedback), but still need the mitochondrial references
    scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseDownload.bash");
    string configPath = WrapperUtility.ConvertWindowsPath(Path.Combine(spritzDirectory, "Tools", "SnpEff", "snpEff.config"));
    WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "echo \"\n# " + snpeffReference + "\" >> " + configPath,
        "echo \"" + snpeffReference + ".genome : Human genome " + snpeffReference.Split('.')[0] + " using RefSeq transcripts\" >> " + configPath,
        "echo \"" + snpeffReference + ".reference : ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/\" >> " + configPath,
        "echo \"\t" + snpeffReference + ".M.codonTable : Vertebrate_Mitochondrial\" >> " + configPath,
        "echo \"\t" + snpeffReference + ".MT.codonTable : Vertebrate_Mitochondrial\" >> " + configPath,
    }).WaitForExit();
}
/// <summary>
/// Generic method for subsetting a BAM file. Useful for testing new methods.
/// Generates and runs a script that invokes GATK PrintReads restricted to a genome region.
/// </summary>
/// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
/// <param name="analysisDirectory">Directory used to resolve the script path.</param>
/// <param name="threads">Value passed to --num_threads.</param>
/// <param name="bam">Input BAM (-I).</param>
/// <param name="genomeFasta">Reference fasta (-R).</param>
/// <param name="genomeRegion">Region passed verbatim to -L.</param>
/// <param name="outputBam">Output BAM (-o).</param>
public void SubsetBam(string spritzDirectory, string analysisDirectory, int threads, string bam, string genomeFasta, string genomeRegion, string outputBam)
{
    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SubsetBam.bash");
    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));

    string printReads = Gatk(Workers) + " PrintReads";
    printReads += " --num_threads " + threads.ToString();
    printReads += " -R " + WrapperUtility.ConvertWindowsPath(genomeFasta);
    printReads += " -I " + WrapperUtility.ConvertWindowsPath(bam);
    printReads += " -o " + WrapperUtility.ConvertWindowsPath(outputBam);
    printReads += " -L " + genomeRegion;
    commands.Add(printReads);

    WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();
}
/// <summary>
/// Builds a HISAT2 index for a genome fasta unless <c>IndexExists</c> reports that one is already present.
/// </summary>
/// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
/// <param name="analysisDirectory">Directory used to resolve the script path.</param>
/// <param name="genomeFasta">Genome fasta to index.</param>
/// <param name="IndexPrefix">Receives the index prefix: the fasta path without its extension.</param>
public static void GenerateIndex(string spritzDirectory, string analysisDirectory, string genomeFasta, out string IndexPrefix)
{
    string fastaDirectory = Path.GetDirectoryName(genomeFasta);
    string fastaName = Path.GetFileNameWithoutExtension(genomeFasta);
    IndexPrefix = Path.Combine(fastaDirectory, fastaName);

    if (!IndexExists(genomeFasta))
    {
        string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Hisat2Build.bash");
        List<string> commands = new List<string>();
        commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
        commands.Add("hisat2-2.1.0/hisat2-build " + WrapperUtility.ConvertWindowsPath(genomeFasta) + " " + WrapperUtility.ConvertWindowsPath(IndexPrefix));
        WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();
    }
}
/// <summary>
/// Obtains the UCSC known variant sites, converts their chromosome names to Ensembl style,
/// and (unless this is a dry run) indexes both VCF files with GATK IndexFeatureFile when no ".idx" file exists.
/// Sets <c>EnsemblKnownSitesPath</c>.
/// </summary>
/// <param name="spritzDirectory">Passed to the download and conversion steps; also used to resolve the script path.</param>
/// <param name="commonOnly">Passed through to the UCSC download step.</param>
/// <param name="reference">Reference name passed to the download and conversion steps.</param>
/// <param name="dryRun">When true, no indexing script is run.</param>
/// <returns>The value of <c>EnsemblKnownSitesPath</c>.</returns>
public string DownloadEnsemblKnownVariantSites(string spritzDirectory, bool commonOnly, string reference, bool dryRun)
{
    DownloadUCSCKnownVariantSites(spritzDirectory, commonOnly, reference, dryRun);
    EnsemblKnownSitesPath = ConvertVCFChromosomesUCSC2Ensembl(spritzDirectory, UcscKnownSitesPath, reference, dryRun);

    if (dryRun)
    {
        return EnsemblKnownSitesPath;
    }

    // indexing is used for most GATK tools
    string scriptPath = WrapperUtility.GetAnalysisScriptPath(spritzDirectory, "IndexKnownVariantSites.bash");
    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    foreach (string vcfPath in new[] { UcscKnownSitesPath, EnsemblKnownSitesPath })
    {
        string convertedVcfPath = WrapperUtility.ConvertWindowsPath(vcfPath);
        commands.Add("if [ ! -f " + convertedVcfPath + ".idx ]; then " + Gatk(Workers) + " IndexFeatureFile -F " + convertedVcfPath + "; fi");
    }
    WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();

    return EnsemblKnownSitesPath;
}
/// <summary>
/// Trims reads with skewer by generating and running a bash script.
/// Output names are derived from the first read file (with a trailing ".gz" stripped first):
/// "-trimmed.fastq" for single-end input, "-trimmed-pair1.fastq" / "-trimmed-pair2.fastq" for paired input,
/// and "-trimmed.log" for the log. Nothing is run if the trimmed outputs already exist or on a dry run.
/// </summary>
/// <param name="spritzDirectory">Used to build the change-to-tools-directory command and to locate the adapter file.</param>
/// <param name="analysisDirectory">Directory used to resolve the script path.</param>
/// <param name="threads">Value passed to skewer -t.</param>
/// <param name="qualityFilter">Value passed to skewer -q.</param>
/// <param name="readPaths">One or two fastq files; entries after the second are ignored.</param>
/// <param name="dryRun">When true, output paths are computed but skewer is not run.</param>
/// <param name="readTrimmedPaths">Receives an array with the same length as <paramref name="readPaths"/>; only the first two entries are filled.</param>
/// <param name="log">Receives the log path, or "" when <paramref name="readPaths"/> is empty.</param>
public static void Trim(string spritzDirectory, string analysisDirectory, int threads, int qualityFilter, string[] readPaths, bool dryRun, out string[] readTrimmedPaths, out string log)
{
    log = "";
    readTrimmedPaths = new string[readPaths.Length];
    if (readPaths.Length <= 0)
    {
        return;
    }

    // Only create paired entry if paired input, and ignore inputs after second index
    bool compressed = Path.GetExtension(readPaths[0]) == ".gz";
    string[] uncompressedReadPaths = readPaths;
    if (compressed)
    {
        uncompressedReadPaths = readPaths
            .Select(p => Path.Combine(Path.GetDirectoryName(p), Path.GetFileNameWithoutExtension(p)))
            .ToArray();
    }

    // every output is named after the first read file
    string firstReadDirectory = Path.GetDirectoryName(uncompressedReadPaths[0]);
    string firstReadName = Path.GetFileNameWithoutExtension(uncompressedReadPaths[0]);
    bool paired = readPaths.Length > 1;

    readTrimmedPaths[0] = Path.Combine(firstReadDirectory, firstReadName + "-trimmed" + (paired ? "-pair1" : "") + ".fastq");
    if (paired)
    {
        readTrimmedPaths[1] = Path.Combine(firstReadDirectory, firstReadName + "-trimmed-pair2.fastq");
    }
    log = Path.Combine(firstReadDirectory, firstReadName + "-trimmed.log");

    bool alreadyTrimmed = File.Exists(readTrimmedPaths[0]) && (!paired || File.Exists(readTrimmedPaths[1]));
    if (alreadyTrimmed || dryRun)
    {
        return;
    }

    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Skewered.bash");
    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));

    string skewerCommand = "skewer-0.2.2/skewer";
    skewerCommand += " -q " + qualityFilter.ToString();
    skewerCommand += " -o " + WrapperUtility.ConvertWindowsPath(Path.Combine(firstReadDirectory, firstReadName));
    skewerCommand += " -t " + threads.ToString();
    skewerCommand += " -x " + WrapperUtility.ConvertWindowsPath(Path.Combine(spritzDirectory, "Tools", "BBMap", "resources", "adapters.fa"));
    skewerCommand += " " + WrapperUtility.ConvertWindowsPath(readPaths[0]);
    if (paired)
    {
        skewerCommand += " " + WrapperUtility.ConvertWindowsPath(readPaths[1]);
    }
    commands.Add(skewerCommand);

    WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();
}
/// <summary>
/// Downloads dbSNP reference VCF file if it doesn't exist.
/// The generated script downloads <c>TargetFileLocation</c> with wget, decompresses it, and moves it to <c>UcscKnownSitesPath</c>.
/// </summary>
/// <param name="spritzDirectory">Directory the script changes into before downloading; also used to resolve the script path.</param>
/// <param name="commonOnly">Passed to the existence check.</param>
/// <param name="reference">Reference name passed to the existence check.</param>
/// <param name="dryRun">When true, nothing is downloaded.</param>
/// <returns>The value of <c>UcscKnownSitesPath</c>.</returns>
public string DownloadUCSCKnownVariantSites(string spritzDirectory, bool commonOnly, string reference, bool dryRun)
{
    // NOTE(review): presumably this call also sets TargetFileLocation, UcscKnownSitesDownloadPath and UcscKnownSitesPath — confirm
    bool knownSitesExists = KnownVariantSitesFileExists(spritzDirectory, commonOnly, reference);
    if (!knownSitesExists && !dryRun)
    {
        string convertedDownloadPath = WrapperUtility.ConvertWindowsPath(UcscKnownSitesDownloadPath);
        WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(spritzDirectory, "DownloadUcscVariants.bash"), new List<string>
        {
            "cd " + WrapperUtility.ConvertWindowsPath(spritzDirectory),
            "wget " + TargetFileLocation,
            "gunzip " + convertedDownloadPath + ".gz",
            // gunzip already removes the .gz on success, so plain rm would always report an error; -f only cleans up a leftover
            "rm -f " + convertedDownloadPath + ".gz",
            "mv " + convertedDownloadPath + " " + WrapperUtility.ConvertWindowsPath(UcscKnownSitesPath)
        }).WaitForExit();
    }
    return UcscKnownSitesPath;
}
/// <summary>
/// Downloads Ensembl references for GRCh37 or GRCh38.
///
/// Sets GenomeFastaPath, GtfGeneModelPath, Gff3GeneModelPath, and ProteinFastaPath properties
/// (to empty strings when the reference is neither GRCh37 nor GRCh38).
/// Each file is downloaded only if it is not already present in the target directory.
/// </summary>
/// <param name="spritzDirectory">Not used by this method.</param>
/// <param name="targetDirectory">Directory that receives the downloaded files and the generated script.</param>
/// <param name="reference">"GRCh37" or "GRCh38" (case-insensitive); anything else downloads nothing.</param>
/// <param name="dryRun">When true, the path properties are set but nothing is downloaded.</param>
public void DownloadReferences(string spritzDirectory, string targetDirectory, string reference, bool dryRun)
{
    bool downloadGrch37 = string.Equals(reference, "GRCh37", StringComparison.CurrentCultureIgnoreCase);
    bool downloadGrch38 = string.Equals(reference, "GRCh38", StringComparison.CurrentCultureIgnoreCase);

    if (downloadGrch37)
    {
        GenomeFastaPath = Path.Combine(targetDirectory, GRCh37PrimaryAssemblyFilename);
        GtfGeneModelPath = Path.Combine(targetDirectory, GRCh37GtfGeneModelFilename);
        Gff3GeneModelPath = GtfGeneModelPath; // GRCh37 reuses the GTF gene model here
        ProteinFastaPath = Path.Combine(targetDirectory, GRCh37ProteinFastaFilename);
    }
    else if (downloadGrch38)
    {
        GenomeFastaPath = Path.Combine(targetDirectory, GRCh38PrimaryAssemblyFilename);
        GtfGeneModelPath = Path.Combine(targetDirectory, GRCh38GtfGeneModelFilename);
        Gff3GeneModelPath = Path.Combine(targetDirectory, GRCh38Gff3GeneModelFilename);
        ProteinFastaPath = Path.Combine(targetDirectory, GRCh38ProteinFastaFilename);
    }
    else
    {
        GenomeFastaPath = "";
        GtfGeneModelPath = "";
        Gff3GeneModelPath = "";
        ProteinFastaPath = "";
    }

    bool unknownReference = !downloadGrch37 && !downloadGrch38;
    if (unknownReference || dryRun)
    {
        return;
    }

    string genomeFastaName = Path.GetFileName(GenomeFastaPath);
    string gtfName = Path.GetFileName(GtfGeneModelPath);
    string gff3Name = Path.GetFileName(Gff3GeneModelPath);
    string proteinFastaName = Path.GetFileName(ProteinFastaPath);

    List<string> commands = new List<string>
    {
        "cd " + WrapperUtility.ConvertWindowsPath(targetDirectory),
        "if [ ! -f " + genomeFastaName + " ]; then wget -O - " + (downloadGrch38 ? GRCh38PrimaryAssemblyUrl : GRCh37PrimaryAssemblyUrl) + " | gunzip -c > " + genomeFastaName + "; fi",
        "if [ ! -f " + gtfName + " ]; then wget -O - " + (downloadGrch38 ? GRCh38GtfGeneModelUrl : GRCh37GtfGeneModelUrl) + " | gunzip -c > " + gtfName + "; fi",
        // note GRCh37 calls the gtf url instead
        "if [ ! -f " + gff3Name + " ]; then wget -O - " + (downloadGrch38 ? GRCh38Gff3GeneModelUrl : GRCh37GtfGeneModelUrl) + " | gunzip -c > " + gff3Name + "; fi",
        "if [ ! -f " + proteinFastaName + " ]; then wget -O - " + (downloadGrch38 ? GRCh38ProteinFastaUrl : GRCh37ProteinFastaUrl) + " | gunzip -c > " + proteinFastaName + "; fi",
    };
    WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(targetDirectory, "DownloadEnsemblReference.bash"), commands).WaitForExit();

    //Genome.WriteFasta(new Genome(genomeFastaPath).KaryotypicOrder(), genomeFastaPath); // todo: try this for ordering contigs before alignments; does gtf then need to be reordered?
}
/// <summary>
/// Converts a gene model file (GTF or GFF2) to BED6, meaning the BED file has the minimum 6 columns.
/// The conversion is skipped when a non-empty BED file already exists next to the input.
///
/// See https://www.biostars.org/p/206342/ for the awk fix.
/// </summary>
/// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
/// <param name="analysisDirectory">Directory used to resolve the script path.</param>
/// <param name="gtfOrGffPath">Gene model to convert; a ".gtf" extension selects gtf2bed, anything else gff2bed.</param>
/// <returns>The path of the BED file: the input path with its extension replaced by ".bed".</returns>
public static string GtfOrGff2Bed6(string spritzDirectory, string analysisDirectory, string gtfOrGffPath)
{
    bool isGtf = Path.GetExtension(gtfOrGffPath) == ".gtf";
    string bedPath = Path.Combine(Path.GetDirectoryName(gtfOrGffPath), Path.GetFileNameWithoutExtension(gtfOrGffPath) + ".bed");

    bool bedAlreadyWritten = File.Exists(bedPath) && new FileInfo(bedPath).Length != 0;
    if (bedAlreadyWritten)
    {
        return bedPath;
    }

    // for GTF input, awk appends an empty transcript_id attribute to lines lacking one before piping to gtf2bed
    string reader = isGtf
        ? "awk '{ if ($0 ~ \"transcript_id\") print $0; else print $0\" transcript_id \\\"\\\";\"; }' "
        : "cat ";
    string converter = isGtf ? "gtf2bed" : "gff2bed";

    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Bed6Conversion.bash");
    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add(reader + WrapperUtility.ConvertWindowsPath(gtfOrGffPath) + " | " + converter + " - > " + WrapperUtility.ConvertWindowsPath(bedPath));
    WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();

    return bedPath;
}
/// <summary>
/// Converts a GFF formatted gene model to GTF using gffread.
/// The conversion command only runs when the output file is missing or empty.
/// </summary>
/// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
/// <param name="analysisDirectory">Directory used to resolve the script path.</param>
/// <param name="geneModelGffPath">Input gene model; its extension must start with ".gff".</param>
/// <param name="geneModelGtfPath">Receives the output path: the input path with ".converted.gtf" appended.</param>
/// <exception cref="ArgumentException">The input extension does not start with ".gff".</exception>
public static void GffToGtf(string spritzDirectory, string analysisDirectory, string geneModelGffPath, out string geneModelGtfPath)
{
    bool isGff = Path.GetExtension(geneModelGffPath).StartsWith(".gff");
    if (!isGff)
    {
        throw new ArgumentException("Input gene model must be gff formatted to convert to gtf.");
    }

    geneModelGtfPath = geneModelGffPath + ".converted.gtf";

    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "GffToGtf.bash");
    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add("echo \"Converting GFF to GTF: " + geneModelGffPath + " -> " + geneModelGtfPath + "\"");

    string convertedGtfPath = WrapperUtility.ConvertWindowsPath(geneModelGtfPath);
    string gffreadCommand = "cufflinks-2.2.1/gffread " + WrapperUtility.ConvertWindowsPath(geneModelGffPath) + " -T -o " + convertedGtfPath;
    commands.Add("if [[ ! -f " + convertedGtfPath + " || ! -s " + convertedGtfPath + " ]]; then " + gffreadCommand + "; fi");

    WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();
}
/// <summary>
/// Converts a gene model file to a BED12 file with all 12 columns sometimes required of a BED file,
/// sorted by bedtools according to the genome's ".fai" index. GFF input is converted to GTF first.
///
/// see https://gist.github.com/gireeshkbogu/f478ad8495dca56545746cd391615b93
/// </summary>
/// <param name="spritzDirectory">Passed to the GFF-to-GTF conversion.</param>
/// <param name="analysisDirectory">Directory used to resolve the script path.</param>
/// <param name="filteredGeneModelGtfGffPath">Gene model to convert; an extension starting with ".gff" triggers GFF-to-GTF conversion.</param>
/// <param name="genomeFastaPath">Genome fasta; the file at this path plus ".fai" is passed to bedtools sort -faidx.</param>
/// <returns>The path of the sorted BED12 file (".sorted.bed12" next to the GTF).</returns>
public static string Gtf2Bed12(string spritzDirectory, string analysisDirectory, string filteredGeneModelGtfGffPath, string genomeFastaPath)
{
    string geneModelGtf = filteredGeneModelGtfGffPath;
    bool needsGtfConversion = Path.GetExtension(filteredGeneModelGtfGffPath).StartsWith(".gff");
    if (needsGtfConversion)
    {
        CufflinksWrapper.GffToGtf(spritzDirectory, analysisDirectory, filteredGeneModelGtfGffPath, out geneModelGtf);
    }

    // all intermediate and final files share the GTF's directory and base name
    string gtfDirectory = Path.GetDirectoryName(geneModelGtf);
    string gtfName = Path.GetFileNameWithoutExtension(geneModelGtf);
    string genePredPath = Path.Combine(gtfDirectory, gtfName + ".genePred");
    string bed12Path = Path.Combine(gtfDirectory, gtfName + ".bed12");
    string sortedBed12Path = Path.Combine(gtfDirectory, gtfName + ".sorted.bed12");

    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Bed12FaidxSortConversion.bash");
    List<string> commands = new List<string>();
    commands.Add("gtfToGenePred " + WrapperUtility.ConvertWindowsPath(geneModelGtf) + " " + WrapperUtility.ConvertWindowsPath(genePredPath));
    commands.Add("genePredToBed " + WrapperUtility.ConvertWindowsPath(genePredPath) + " " + WrapperUtility.ConvertWindowsPath(bed12Path));
    commands.Add("bedtools sort -faidx " + WrapperUtility.ConvertWindowsPath(genomeFastaPath + ".fai") + " -i " + WrapperUtility.ConvertWindowsPath(bed12Path) + " > " + WrapperUtility.ConvertWindowsPath(sortedBed12Path));
    WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();

    return sortedBed12Path;
}
/// <summary>
/// Downloads the fastq files of an SRA accession with fasterq-dump unless "_1.fastq" / "_2.fastq"
/// files for that accession already exist in the analysis directory.
/// Sets <c>LogPath</c> and <c>FastqPaths</c>.
/// </summary>
/// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
/// <param name="threads">Value passed to fasterq-dump --threads.</param>
/// <param name="analysisDirectory">Directory that receives the fastq files, the log, and the script.</param>
/// <param name="sraAccession">Accession to download; also used as the prefix of the file names.</param>
public void Fetch(string spritzDirectory, int threads, string analysisDirectory, string sraAccession)
{
    LogPath = Path.Combine(analysisDirectory, sraAccession + "download.log");

    string[] expectedFastqNames = { sraAccession + "_1.fastq", sraAccession + "_2.fastq" };
    FastqPaths = expectedFastqNames
        .Select(name => Path.Combine(analysisDirectory, name))
        .Where(path => File.Exists(path))
        .ToArray();

    bool alreadyDownloaded = FastqPaths.Length > 0;
    if (alreadyDownloaded)
    {
        FastqPaths = FastqPaths
            .Where(x => x != null && !x.Contains("trimmed") && x.EndsWith(".fastq"))
            .ToArray();
        return;
    }

    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Download" + sraAccession + ".bash");
    List<string> commands = new List<string>();
    commands.Add("echo \"Downloading " + sraAccession + "\"");
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add("sratoolkit*/bin/fasterq-dump --progress --threads " + threads.ToString() + " --split-files --outdir \"" + WrapperUtility.ConvertWindowsPath(analysisDirectory) + "\" " + sraAccession + " 2> " + WrapperUtility.ConvertWindowsPath(LogPath));
    WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();

    // pick up whatever fastq files the download produced for this accession
    FastqPaths = Directory.GetFiles(analysisDirectory, sraAccession + "*.fastq").ToArray();
}
/// <summary>
/// Creates a snpeff model for a custom gene model.
/// Copies the gene model, protein fasta and genome into the SnpEff data folders, appends the reference
/// to snpEff.config, and runs "snpEff build". Nothing is done if snpEffectPredictor.bin already exists for the reference.
/// </summary>
/// <param name="spritzDirectory">Root directory containing Tools/SnpEff.</param>
/// <param name="analysisDirectory">Directory used to resolve the script path.</param>
/// <param name="genomeFastaPath">Genome fasta copied to data/genomes/&lt;reference&gt;.fa.</param>
/// <param name="referenceProteinFastaPath">Protein fasta copied to the reference folder as protein.fa.</param>
/// <param name="geneModelGtfOrGffPath">Gene model copied to the reference folder as "genes" plus its extension; an extension ending in "gtf" selects -gtf22, otherwise -gff3.</param>
/// <returns>Name of the snpEff reference that was generated</returns>
public static string GenerateDatabase(string spritzDirectory, string analysisDirectory, string genomeFastaPath, string referenceProteinFastaPath, string geneModelGtfOrGffPath)
{
    string geneModelExtension = Path.GetExtension(geneModelGtfOrGffPath);

    // reference name = upper-cased extension (without the dot) followed by the hash code of the gene model path
    // NOTE(review): string.GetHashCode is not guaranteed to be stable across runtimes/processes — confirm the "already made" check below still hits on reruns
    string snpEffReferenceName = geneModelExtension.Substring(1).ToUpperInvariant() + geneModelGtfOrGffPath.GetHashCode().ToString();
    string snpEffReferenceFolderPath = Path.Combine(spritzDirectory, "Tools", "SnpEff", "data", snpEffReferenceName);
    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SnpEffDatabaseGeneration.bash");
    string geneModelOption = geneModelExtension.EndsWith("gtf") ? "-gtf22" : "-gff3";

    // if the database is already made, don't remake it
    if (File.Exists(Path.Combine(snpEffReferenceFolderPath, "snpEffectPredictor.bin")))
    {
        return snpEffReferenceName;
    }

    string genomesFolderPath = Path.Combine(spritzDirectory, "Tools", "SnpEff", "data", "genomes");
    string configPath = WrapperUtility.ConvertWindowsPath(Path.Combine(spritzDirectory, "Tools", "SnpEff", "snpEff.config"));

    WrapperUtility.GenerateAndRunScript(scriptPath, new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "cd SnpEff",

        // create data folder for this reference, and copy the custom gene model (can also copy regulatory annotations)
        // -p: do not fail when the folder is left over from an earlier, incomplete run
        "mkdir -p data/" + snpEffReferenceName,
        "cp " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath) + " " + WrapperUtility.ConvertWindowsPath(Path.Combine(snpEffReferenceFolderPath, "genes" + geneModelExtension)),
        "cp " + WrapperUtility.ConvertWindowsPath(referenceProteinFastaPath) + " " + WrapperUtility.ConvertWindowsPath(Path.Combine(snpEffReferenceFolderPath, "protein.fa")),

        // copy the genome to the genomes folder
        // -p: the genomes folder is shared by all custom references, so it usually exists already
        "mkdir -p " + WrapperUtility.ConvertWindowsPath(genomesFolderPath),
        "cp " + WrapperUtility.ConvertWindowsPath(genomeFastaPath) + " " + WrapperUtility.ConvertWindowsPath(Path.Combine(genomesFolderPath, snpEffReferenceName + ".fa")),

        // configure SnpEff for this custom reference
        // note: if different organism is used in the future, this becomes pretty complex... probably would list the organisms from snpEff.config in the GUI
        "echo \"\n# " + snpEffReferenceName + "\" >> " + configPath,
        "echo \"" + snpEffReferenceName + ".genome : Homo_sapiens\" >> " + configPath,
        "echo \"" + snpEffReferenceName + ".reference : ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/\" >> " + configPath,
        "echo \"\t" + snpEffReferenceName + ".M.codonTable : Vertebrate_Mitochondrial\" >> " + configPath,
        "echo \"\t" + snpEffReferenceName + ".MT.codonTable : Vertebrate_Mitochondrial\" >> " + configPath,

        // build snpEff model
        "cd ..",
        SnpEff(1) + " build " + geneModelOption + " -v " + snpEffReferenceName,
    }).WaitForExit();

    return snpEffReferenceName;
}
/// <summary>
/// Runs RSeQC's inner_distance.py on a BAM file and returns the rounded average of the
/// reported inner distances that lie strictly between -250 and 250.
/// A gene model whose extension is not ".bed" is converted to BED12 first.
/// </summary>
/// <param name="spritzDirectory">Used to build the change-to-tools-directory command and passed to the BED12 conversion.</param>
/// <param name="analysisDirectory">Directory used to resolve the script path.</param>
/// <param name="bamPath">Input BAM; output files are written next to it using its name as the prefix.</param>
/// <param name="geneModelPath">Gene model passed to -r (after conversion when needed).</param>
/// <param name="outputFiles">Receives the three output paths: R plot, frequency table, distance table.</param>
/// <returns>The average accepted inner distance, rounded to an integer.</returns>
public static int InferInnerDistance(string spritzDirectory, string analysisDirectory, string bamPath, string geneModelPath, out string[] outputFiles)
{
    if (Path.GetExtension(geneModelPath) != ".bed")
    {
        geneModelPath = BEDOPSWrapper.GffOrGtf2Bed12(spritzDirectory, analysisDirectory, geneModelPath);
    }

    string outputPrefix = Path.Combine(Path.GetDirectoryName(bamPath), Path.GetFileNameWithoutExtension(bamPath));
    string distanceTablePath = outputPrefix + InnerDistanceDistanceTableSuffix;
    outputFiles = new string[]
    {
        outputPrefix + InnerDistanceRPlotSuffix,
        outputPrefix + InnerDistanceFrequencyTableSuffix,
        distanceTablePath
    };

    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "InferInnerDistance.bash");
    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));
    commands.Add(
        "python RSeQC-2.6.4/scripts/inner_distance.py" +
        " -i " + WrapperUtility.ConvertWindowsPath(bamPath) + // input
        " -o " + WrapperUtility.ConvertWindowsPath(outputPrefix) + // out prefix
        " -r " + WrapperUtility.ConvertWindowsPath(geneModelPath)); // gene model in BED format
    foreach (string outputFile in outputFiles)
    {
        commands.Add(WrapperUtility.EnsureClosedFileCommands(outputFile));
    }
    WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();

    // the second tab-separated column of the distance table holds the inner distance
    List<int> distances = new List<int>();
    foreach (string tableLine in File.ReadAllLines(distanceTablePath))
    {
        bool parsed = int.TryParse(tableLine.Split('\t')[1], out int distance);
        if (!parsed || distance >= 250 || distance <= -250) // default settings for infer_distance
        {
            continue;
        }
        distances.Add(distance);
    }

    return (int)Math.Round(distances.Average(), 0);
}
/// <summary>
/// Aligns reads in fastq files using TopHat2.
/// A temporary directory next to the first fastq is created before the run and removed by the script afterwards.
/// </summary>
/// <param name="spritzDirectory">Used to build the change-to-tools-directory command.</param>
/// <param name="analysisDirectory">Directory used to resolve the script path.</param>
/// <param name="bowtieIndexPrefix">Bowtie index prefix passed to tophat2.</param>
/// <param name="threads">Value passed to --num-threads.</param>
/// <param name="fastqPaths">Fastq files, passed to tophat2 as a comma-separated list.</param>
/// <param name="strandSpecific">When true, "--library-type fr-firststrand" is added.</param>
/// <param name="outputDirectory">Receives the output directory: the first fastq's name (without extension) plus "TophatOut", next to that fastq.</param>
public static void Align(string spritzDirectory, string analysisDirectory, string bowtieIndexPrefix, int threads, string[] fastqPaths, bool strandSpecific, out string outputDirectory)
{
    string firstFastqDirectory = Path.GetDirectoryName(fastqPaths[0]);
    string tempDir = Path.Combine(firstFastqDirectory, "tmpDir");
    outputDirectory = Path.Combine(firstFastqDirectory, Path.GetFileNameWithoutExtension(fastqPaths[0]) + "TophatOut");
    Directory.CreateDirectory(tempDir);

    string scriptPath = WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "TophatRun.bash");
    List<string> commands = new List<string>();
    commands.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));

    string tophatCommand = "tophat-2.1.1/tophat2";
    tophatCommand += " --num-threads " + threads.ToString();
    tophatCommand += " --output-dir " + WrapperUtility.ConvertWindowsPath(outputDirectory);
    //" --GTF " + WrapperUtility.ConvertWindowsPath(geneModelGtfOrGffPath) + /// this triggers tophat to try building an index
    tophatCommand += " --tmp-dir " + WrapperUtility.ConvertWindowsPath(tempDir);
    if (strandSpecific)
    {
        tophatCommand += " --library-type fr-firststrand";
    }
    tophatCommand += " " + WrapperUtility.ConvertWindowsPath(bowtieIndexPrefix);
    tophatCommand += " " + string.Join(",", fastqPaths.Select(x => WrapperUtility.ConvertWindowsPath(x)));
    commands.Add(tophatCommand);

    // clean up the temporary directory once tophat has finished
    string convertedTempDir = WrapperUtility.ConvertWindowsPath(tempDir);
    commands.Add("if [ -d " + convertedTempDir + " ]; then rm -r " + convertedTempDir + "; fi");

    WrapperUtility.GenerateAndRunScript(scriptPath, commands).WaitForExit();
}
/// <summary>
/// Converts a gene model file (GTF, or GFF which is first converted with CufflinksWrapper.GffToGtf)
/// to a sorted BED12 file with all 12 columns sometimes required of a BED file.
///
/// see https://gist.github.com/gireeshkbogu/f478ad8495dca56545746cd391615b93
///
/// </summary>
/// <param name="spritzDirectory">Forwarded to CufflinksWrapper.GffToGtf when a GFF file is supplied.</param>
/// <param name="analysisDirectory">Used to resolve the path of the generated "Bed12conversion.bash" script.</param>
/// <param name="geneModelGtfOrGff">Gene model to convert; an extension starting with ".gff" selects the GFF route.</param>
/// <returns>Path of the ".sorted.bed12" file, located next to the GTF that was converted.</returns>
public static string GffOrGtf2Bed12(string spritzDirectory, string analysisDirectory, string geneModelGtfOrGff)
{
    string geneModelGtf = geneModelGtfOrGff;

    // File extensions are identifiers, not linguistic text, so compare them ordinally.
    if (Path.GetExtension(geneModelGtfOrGff).StartsWith(".gff", StringComparison.Ordinal))
    {
        CufflinksWrapper.GffToGtf(spritzDirectory, analysisDirectory, geneModelGtfOrGff, out geneModelGtf);
    }

    // Intermediate and final files are written beside the GTF, sharing its extension-less name.
    string directory = Path.GetDirectoryName(geneModelGtf);
    string baseName = Path.GetFileNameWithoutExtension(geneModelGtf);
    string genePredPath = Path.Combine(directory, baseName + ".genePred");
    string bed12Path = Path.Combine(directory, baseName + ".bed12");
    string sortedBed12Path = Path.Combine(directory, baseName + ".sorted.bed12");

    WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "Bed12conversion.bash"), new List<string>
    {
        // Note, there is a gff3ToGenePred program. Could test and replace that here for gff3 files.
        "gtfToGenePred " + WrapperUtility.ConvertWindowsPath(geneModelGtf) + " " + WrapperUtility.ConvertWindowsPath(genePredPath),
        "genePredToBed " + WrapperUtility.ConvertWindowsPath(genePredPath) + " " + WrapperUtility.ConvertWindowsPath(bed12Path),
        "sort -k1,1 -k2,2n " + WrapperUtility.ConvertWindowsPath(bed12Path) + " > " + WrapperUtility.ConvertWindowsPath(sortedBed12Path),
    }).WaitForExit();

    return sortedBed12Path;
}
/// <summary>
/// Uses seqtk to get a subset of reads from a (pair of) fastq file(s).
/// Note: fastqs must have \n line endings, not \r\n.
/// </summary>
/// <param name="spritzDirectory">Passed to WrapperUtility.ChangeToToolsDirectoryCommand.</param>
/// <param name="analysisDirectory">Used to resolve the path of the generated "SubsetReads.bash" script.</param>
/// <param name="fastqFiles">One fastq, or two for paired reads; only the first two entries are used.</param>
/// <param name="numReads">Number of reads requested from seqtk sample.</param>
/// <param name="currentDirectory">Not used by this method; kept so existing callers keep compiling.</param>
/// <param name="newFfiles">Paths of the ".segment.fastq" files, one per input used, each next to its input.</param>
/// <param name="useSeed">When true, the seed is passed for a single fastq; a pair is always seeded.</param>
/// <param name="seed">Value for the seqtk "-s" option.</param>
public static void SubsetFastqs(string spritzDirectory, string analysisDirectory, string[] fastqFiles, int numReads, string currentDirectory, out string[] newFfiles, bool useSeed = false, int seed = 0)
{
    string ToSegmentPath(string fastq)
    {
        return Path.Combine(Path.GetDirectoryName(fastq), Path.GetFileNameWithoutExtension(fastq) + ".segment.fastq");
    }

    bool paired = fastqFiles.Length > 1;
    string[] subsetPaths = paired
        ? new string[] { ToSegmentPath(fastqFiles[0]), ToSegmentPath(fastqFiles[1]) }
        : new string[] { ToSegmentPath(fastqFiles[0]) };
    newFfiles = subsetPaths;

    // Both mates must be sampled with the same seed, so a pair is always seeded.
    string seedOption = (useSeed || paired) ? " -s" + seed.ToString() : "";

    // Subset again when any expected output is missing or empty, not just the first one,
    // so that an absent second-mate file is regenerated together with its partner.
    string outputMissingTest = string.Join(" || ", subsetPaths.Select(path => "[ ! -s " + WrapperUtility.ConvertWindowsPath(path) + " ]"));

    List<string> commands = new List<string>
    {
        WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
        "if " + outputMissingTest + "; then",
        " echo \"Subsetting " + numReads.ToString() + " reads from " + string.Join(",", fastqFiles) + "\"",
    };
    for (int i = 0; i < subsetPaths.Length; i++)
    {
        commands.Add(
            " seqtk/seqtk sample" + seedOption +
            " " + WrapperUtility.ConvertWindowsPath(fastqFiles[i]) +
            " " + numReads.ToString() +
            " > " + WrapperUtility.ConvertWindowsPath(subsetPaths[i]));
    }
    commands.Add("fi");

    WrapperUtility.GenerateAndRunScript(WrapperUtility.GetAnalysisScriptPath(analysisDirectory, "SubsetReads.bash"), commands).WaitForExit();
}

#endregion Public Methods
}