Exemplo n.º 1
0
        /// <summary>
        /// Runs lncRNA discovery starting from fastq files: prepares the Ensembl genome fasta, aligns the reads,
        /// reconstructs transcripts via <c>StringtieWrapper</c>, and runs the <c>SlnckyWrapper.Annotate</c> script
        /// on every reconstructed transcript model.
        /// </summary>
        public void LncRNADiscoveryFromFastqs()
        {
            // Reference preparation
            var ensembl = new EnsemblDownloadsWrapper();
            ensembl.PrepareEnsemblGenomeFasta(Parameters.AnalysisDirectory, Parameters.GenomeFasta);

            // Alignment: everything is forwarded from this flow's parameters, except that the
            // experiment type is pinned to RNA sequencing and the fasta comes from the download step above.
            var aligner = new AlignmentFlow();
            aligner.Parameters = new AlignmentParameters
            {
                SpritzDirectory          = Parameters.SpritzDirectory,
                AnalysisDirectory        = Parameters.AnalysisDirectory,
                Reference                = Parameters.Reference,
                Threads                  = Parameters.Threads,
                Fastqs                   = Parameters.Fastqs,
                ExperimentType           = ExperimentType.RNASequencing,
                StrandSpecific           = Parameters.StrandSpecific,
                InferStrandSpecificity   = Parameters.InferStrandSpecificity,
                OverwriteStarAlignment   = Parameters.OverwriteStarAlignment,
                GenomeStarIndexDirectory = Parameters.GenomeStarIndexDirectory,
                ReorderedFastaPath       = ensembl.ReorderedFastaPath,
                GeneModelGtfOrGffPath    = Parameters.GeneModelGtfOrGff,
                UseReadSubset            = Parameters.UseReadSubset,
                ReadSubset               = Parameters.ReadSubset,
            };
            aligner.PerformAlignment();

            // Post-alignment bookkeeping on the reference annotation
            ensembl.GetImportantProteinAccessions(Parameters.SpritzDirectory, Parameters.ProteinFasta);
            EnsemblDownloadsWrapper.FilterGeneModel(
                Parameters.AnalysisDirectory,
                Parameters.GeneModelGtfOrGff,
                ensembl.EnsemblGenome,
                out string filteredGeneModel);

            // The returned BED12 path is never read in this method; the call itself is kept.
            BEDOPSWrapper.GffOrGtf2Bed12(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, filteredGeneModel);

            // Transcript reconstruction
            var reconstructor = new StringtieWrapper();
            reconstructor.TranscriptReconstruction(
                Parameters.SpritzDirectory,
                Parameters.AnalysisDirectory,
                Parameters.Threads,
                Parameters.GeneModelGtfOrGff,
                ensembl.EnsemblGenome,
                Parameters.StrandSpecific,
                Parameters.InferStrandSpecificity,
                aligner.SortedBamFiles,
                true);
            ReconstructedTranscriptModels = reconstructor.FilteredTranscriptGtfPaths;
            MergedTranscriptModel         = reconstructor.FilteredMergedGtfPath;

            // lncRNA annotation: one script run per reconstructed model, each awaited before the next starts.
            foreach (string transcriptGtf in ReconstructedTranscriptModels)
            {
                string annotationScript = WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "SlnckyAnnotation.bash");

                // Output prefix: <gtf directory>/<gtf name without extension>.slnckyOut/annotated
                string gtfDirectory  = Path.GetDirectoryName(transcriptGtf);
                string outFolderName = Path.GetFileNameWithoutExtension(transcriptGtf) + ".slnckyOut";
                SlnckyOutPrefix = Path.Combine(gtfDirectory, outFolderName, "annotated");

                var annotationCommands = SlnckyWrapper.Annotate(
                    Parameters.SpritzDirectory,
                    Parameters.AnalysisDirectory,
                    Parameters.Threads,
                    transcriptGtf,
                    Parameters.Reference,
                    SlnckyOutPrefix);
                WrapperUtility.GenerateAndRunScript(annotationScript, annotationCommands).WaitForExit();
            }

            // Placeholder kept from the original: quantification tables for differential expression
            // analysis (using stringtie TPM values) are not written by this method.
        }
Exemplo n.º 2
0
        /// <summary>
        /// Generate sample specific protein databases, starting with fastq files when they are provided.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// prepare the Ensembl genome fasta; align reads (only when <c>Parameters.Fastqs</c> is not null);
        /// optionally build a merged gene model with CDS records (transcript isoform analysis);
        /// generate or download the SnpEff database; optionally discover gene fusions;
        /// call variants (only when fastqs are present and variant analysis is not skipped);
        /// and finally transfer modifications from the UniProt XML, storing the result in
        /// <c>VariantAnnotatedProteinXmlDatabases</c>.
        /// </remarks>
        public void GenerateSampleSpecificProteinDatabases()
        {
            // Download references and align reads
            Downloads.PrepareEnsemblGenomeFasta(Parameters.AnalysisDirectory, Parameters.GenomeFasta);
            if (Parameters.Fastqs != null)
            {
                Alignment.Parameters = new AlignmentParameters();
                Alignment.Parameters.SpritzDirectory          = Parameters.SpritzDirectory;
                Alignment.Parameters.AnalysisDirectory        = Parameters.AnalysisDirectory;
                Alignment.Parameters.Reference                = Parameters.Reference;
                Alignment.Parameters.Threads                  = Parameters.Threads;
                Alignment.Parameters.Fastqs                   = Parameters.Fastqs;
                Alignment.Parameters.ExperimentType           = Parameters.ExperimentType;
                Alignment.Parameters.StrandSpecific           = Parameters.StrandSpecific;
                Alignment.Parameters.InferStrandSpecificity   = Parameters.InferStrandSpecificity;
                Alignment.Parameters.OverwriteStarAlignment   = Parameters.OverwriteStarAlignment;
                Alignment.Parameters.GenomeStarIndexDirectory = Parameters.GenomeStarIndexDirectory;
                Alignment.Parameters.ReorderedFastaPath       = Downloads.ReorderedFastaPath;
                Alignment.Parameters.GeneModelGtfOrGffPath    = Parameters.ReferenceGeneModelGtfOrGff;
                Alignment.Parameters.UseReadSubset            = Parameters.UseReadSubset;
                Alignment.Parameters.ReadSubset               = Parameters.ReadSubset;

                Alignment.PerformAlignment();
                Downloads.GetImportantProteinAccessions(Parameters.SpritzDirectory, Parameters.ProteinFastaPath);
            }

            EnsemblDownloadsWrapper.FilterGeneModel(Parameters.AnalysisDirectory, Parameters.ReferenceGeneModelGtfOrGff, Downloads.EnsemblGenome, out string filteredGeneModelForScalpel);
            string    sortedBed12Path    = BEDOPSWrapper.GffOrGtf2Bed12(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, filteredGeneModelForScalpel);
            GeneModel referenceGeneModel = new GeneModel(Downloads.EnsemblGenome, Parameters.ReferenceGeneModelGtfOrGff);

            // Default XML path next to the reference gene model: "<name>.protein.xml".
            // It is overwritten below when there is neither isoform analysis nor fastqs (used if no fastqs are provided).
            string referenceGeneModelProteinXml = Path.Combine(
                Path.GetDirectoryName(Parameters.ReferenceGeneModelGtfOrGff),
                Path.GetFileNameWithoutExtension(Parameters.ReferenceGeneModelGtfOrGff) + ".protein.xml");

            // Merge reference gene model and a new gene model (either specified or stringtie-generated)
            string newGeneModelPath           = Parameters.NewGeneModelGtfOrGff;
            string mergedGeneModelWithCdsPath = null;
            string mergedGeneModelProteinXml  = null;
            string reference                  = Parameters.Reference;

            if (Parameters.DoTranscriptIsoformAnalysis)
            {
                StringtieWrapper stringtie = new StringtieWrapper();
                if (newGeneModelPath == null)
                {
                    // No gene model was specified, so reconstruct one from the sorted alignments.
                    stringtie.TranscriptReconstruction(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Threads, Parameters.ReferenceGeneModelGtfOrGff, Downloads.EnsemblGenome, Parameters.StrandSpecific, Parameters.InferStrandSpecificity, Alignment.SortedBamFiles, true);
                    newGeneModelPath = stringtie.FilteredMergedGtfPath;
                }
                else
                {
                    // A gene model was specified: convert it (UCSC -> Ensembl first column, per the helper's name),
                    // then merge it with the reference gene model into "<name>.merged.gtf".
                    newGeneModelPath = EnsemblDownloadsWrapper.ConvertFirstColumnUCSC2Ensembl(Parameters.SpritzDirectory, Parameters.Reference, Parameters.NewGeneModelGtfOrGff);
                    string mergedGeneModelPath = Path.Combine(Path.GetDirectoryName(newGeneModelPath), Path.GetFileNameWithoutExtension(newGeneModelPath) + ".merged.gtf");
                    WrapperUtility.GenerateAndRunScript(
                        WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "MergeTranscriptModels.bash"),
                        StringtieWrapper.MergeTranscriptPredictions(Parameters.SpritzDirectory, Parameters.ReferenceGeneModelGtfOrGff, new List<string> { newGeneModelPath }, mergedGeneModelPath)).WaitForExit();
                    newGeneModelPath = mergedGeneModelPath;
                }

                // Determine CDS from start codons of reference gene model
                // In the future, we could also try ORF finding to expand this (e.g. https://github.com/TransDecoder/TransDecoder/wiki)
                GeneModel newGeneModel = new GeneModel(Downloads.EnsemblGenome, newGeneModelPath);
                newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);

                mergedGeneModelWithCdsPath = Path.Combine(Path.GetDirectoryName(newGeneModelPath), Path.GetFileNameWithoutExtension(newGeneModelPath) + ".withcds.gtf");
                newGeneModel.PrintToGTF(mergedGeneModelWithCdsPath);
            }

            // SnpEff databases or outputting protein XMLs from gene models
            if (Parameters.DoTranscriptIsoformAnalysis) // isoform analysis, so generate a new snpeff database
            {
                // From here on, "reference" is whatever GenerateDatabase returns for the merged gene model.
                reference = SnpEffWrapper.GenerateDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Downloads.ReorderedFastaPath, Parameters.ProteinFastaPath, mergedGeneModelWithCdsPath);

                if (Parameters.Fastqs == null || Parameters.SkipVariantAnalysis) // isoform analysis without variant analysis, so generate a protein database directly from merged gtf
                {
                    mergedGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, reference, mergedGeneModelWithCdsPath);
                }
            }
            else // no isoform analysis
            {
                new SnpEffWrapper(1).DownloadSnpEffDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference);
                if (Parameters.Fastqs == null) // no isoform analysis and no fastqs
                {
                    referenceGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference, Parameters.ReferenceGeneModelGtfOrGff);
                }
            }

            // Gene Fusion Discovery
            List<Protein> fusionProteins = new List<Protein>();

            if (Parameters.DoFusionAnalysis)
            {
                Fusion.Parameters.SpritzDirectory   = Parameters.SpritzDirectory;
                Fusion.Parameters.AnalysisDirectory = Parameters.AnalysisDirectory;
                Fusion.Parameters.Reference         = Parameters.Reference;
                Fusion.Parameters.Threads           = Parameters.Threads;
                Fusion.Parameters.Fastqs            = Parameters.Fastqs;
                Fusion.DiscoverGeneFusions();
                fusionProteins = Fusion.FusionProteins;
            }

            // Variant Calling
            if (Parameters.Fastqs != null && !Parameters.SkipVariantAnalysis)
            {
                VariantCalling.CallVariants(
                    Parameters.SpritzDirectory,
                    Parameters.AnalysisDirectory,
                    Parameters.ExperimentType,
                    reference,
                    Parameters.Threads,
                    sortedBed12Path,
                    Parameters.EnsemblKnownSitesPath,
                    Alignment.DedupedBamFiles,
                    Downloads.ReorderedFastaPath,
                    Downloads.EnsemblGenome,
                    Parameters.QuickSnpEffWithoutStats,
                    Parameters.IndelFinder,
                    Parameters.VariantCallingWorkers);
            }

            // Transfer features from UniProt
            // keep, since it might be useful for making a final database: .Concat(new[] { VariantCalling.CombinedAnnotatedProteinXmlPath }).ToList()
            List<string> xmlsToUse;

            if (VariantCalling.CombinedAnnotatedProteinXmlPaths.Count > 0)
            {
                xmlsToUse = VariantCalling.CombinedAnnotatedProteinXmlPaths;
            }
            else
            {
                // No variant-annotated XMLs, so fall back to the XML generated from the gene model.
                // NOTE(review): mergedGeneModelProteinXml is only assigned when fastqs are absent or variant
                // analysis is skipped; otherwise this list holds a single null entry. Confirm that is intended.
                xmlsToUse = new List<string>
                {
                    Parameters.DoTranscriptIsoformAnalysis ? mergedGeneModelProteinXml : referenceGeneModelProteinXml
                };
            }

            VariantAnnotatedProteinXmlDatabases = new TransferModificationsFlow().TransferModifications(Parameters.SpritzDirectory, Parameters.UniProtXmlPath, xmlsToUse, fusionProteins);
        }