/// <summary>
        /// Loads the sample GFF3 and the sample PacBio GTF from the test data directory,
        /// creates CDS annotations on the PacBio model from the start codons annotated in the
        /// GFF3 model, translates every gene, and checks the accession and sequence of the
        /// first two resulting proteins.
        /// </summary>
        public void GffAppliedToOther()
        {
            const string expectedForwardAccession = "PB2015.1.1";
            const string expectedForwardSequence =
                "MVTEFIFLGLSDSQELQTFLFMLFFVFYGGIVFGNLLIVITVVSDSHLHSPMYFLLANLSLIDLSLSSVTAPKMITDFFSQRKVISFKGCLVQIFLLHFFGGSEMVILIAMGFDRYIAICKPLHYTTIMCGNACVGIMAVTWGIGFLHSVSQLAFAVHLLFCGPNEVDSFYCDLPRVIKLACTDTYRLDIMVIANSGVLTVCSFVLLIISYTIILMTIQHRPLDKSSKALSTLTAHITVVLLFFGPCVFIYAWPFPIKSLDKFLAVFYSVITPLLNPIIYTLRNKDMKTAIRQLRKWDAHSSVKF";
            const string expectedReverseAccession = "PB2015.2.1";
            const string expectedReverseSequence =
                "TSLWTPQAKLPTFQQLLHTQLLPPSGLFRPSSCFTRAFPGPTFVSWQPSLARFLPVSQQP" +
                "RQAQVLPHTGLSTSSLCLTVASPRPTPVPGHHLRAQNLLKSDSLVPTAASWWPMKAQNLL" +
                "KLTCPGPAPASCQRLQAQPLPHGGFSRPTSSSWLGLQAQLLPHNSLFWPSSCPANGGQCR" +
                "PKTSSSQTLQAHLLLPGGINRPSFDLRTASAGPALASQGLFPGPALASWQLPQAKFLPAC" +
                "QQPQQAQLLPHSGPFRPNL";

            string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData");
            GeneModel referenceModel = new GeneModel(genome, Path.Combine(testDataDirectory, "sample_gff.gff3"));
            GeneModel pacBioModel = new GeneModel(genome, Path.Combine(testDataDirectory, "sample_pacbio.gtf"));

            pacBioModel.CreateCDSFromAnnotatedStartCodons(referenceModel);

            // Collect the translations gene by gene, in gene order.
            List<Protein> translated = new List<Protein>();
            foreach (var gene in pacBioModel.Genes)
            {
                translated.AddRange(gene.Translate(true));
            }

            // Forward strand, single coding region
            Assert.AreEqual(expectedForwardAccession, translated[0].Accession);
            Assert.AreEqual(expectedForwardSequence, translated[0].BaseSequence);

            // Reverse strand, single coding region
            Assert.AreEqual(expectedReverseAccession, translated[1].Accession);
            Assert.AreEqual(expectedReverseSequence, translated[1].BaseSequence);
        }
        /// <summary>
        /// Creates CDS annotations on the sample PacBio gene model from the start codons of the
        /// sample GFF3 model and writes the result to "sample_pacbio_merged.gtf" in the test
        /// data directory. The method only exercises the write; it makes no assertions.
        /// </summary>
        public void OutputGtfFromGeneModel()
        {
            string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData");
            GeneModel referenceModel = new GeneModel(genome, Path.Combine(testDataDirectory, "sample_gff.gff3"));
            GeneModel pacBioModel = new GeneModel(genome, Path.Combine(testDataDirectory, "sample_pacbio.gtf"));
            string mergedGtfPath = Path.Combine(testDataDirectory, "sample_pacbio_merged.gtf");

            pacBioModel.CreateCDSFromAnnotatedStartCodons(referenceModel);
            pacBioModel.PrintToGTF(mergedGtfPath);
        }
Exemple #3
0
        /// <summary>
        /// Filters GTF or GFF entries that lack strand information and, optionally, entries that
        /// belong to transcripts with zero-abundance stringtie estimates, then adds CDS
        /// annotations taken from the start codons of a reference gene model.
        /// Two files are written next to <paramref name="gtfPath"/>: "[name].filtered.gtf" with
        /// the retained features and "[name].filtered.withcds.gtf" with the CDS-annotated model.
        /// </summary>
        /// <param name="gtfPath">Path of the GTF or GFF file to filter.</param>
        /// <param name="referenceGenomePath">Path handed to the <c>Genome</c> constructor; the resulting genome is used to load both gene models.</param>
        /// <param name="referenceGeneModelPath">Path of the reference gene model whose annotated start codons are used to create CDS entries.</param>
        /// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">
        /// When true, a transcript is kept only if both its "FPKM" and "TPM" attributes are present,
        /// parse as numbers, and are greater than zero. When false, every transcript that has strand
        /// information is kept.
        /// </param>
        public static void FilterGtfEntriesWithoutStrand(string gtfPath, string referenceGenomePath, string referenceGeneModelPath, bool filterEntriesWithZeroAbundanceStringtieEstimates = false)
        {
            // Abundance values in a GTF are machine-formatted with '.' as the decimal separator,
            // so they must be parsed with the invariant culture. The styles match the defaults
            // of double.TryParse(string, out double).
            const System.Globalization.NumberStyles abundanceStyles =
                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
            var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

            var    chromFeatures   = GeneModel.SimplerParse(gtfPath);
            string filteredGtfPath = Path.Combine(Path.GetDirectoryName(gtfPath), Path.GetFileNameWithoutExtension(gtfPath) + ".filtered.gtf");

            using (var file = File.Create(filteredGtfPath))
            {
                var formatter = new GffFormatter();
                foreach (var chromISeq in chromFeatures)
                {
                    var filteredFeatures = new List<MetadataListItem<List<string>>>();

                    // The type pattern also guards against a "features" entry that is null or of
                    // an unexpected type; such a sequence ends up with an empty feature list.
                    if (chromISeq.Metadata.TryGetValue("features", out object featuresObj) &&
                        featuresObj is List<MetadataListItem<List<string>>> features)
                    {
                        // Decision made at the most recent "transcript" record; the records that
                        // follow it are kept or dropped together with it.
                        bool okayTranscript = false;
                        foreach (var feature in features)
                        {
                            if (!feature.SubItems.TryGetValue("strand", out List<string> strandish))
                            {
                                continue;
                            }

                            if (feature.Key == "transcript")
                            {
                                if (!filterEntriesWithZeroAbundanceStringtieEstimates)
                                {
                                    okayTranscript = true;
                                }
                                else
                                {
                                    // Attributes are only needed for the abundance check, so they
                                    // are split only for transcript records when filtering is on.
                                    var  attributes = GeneModel.SplitAttributes(feature.FreeText);
                                    bool okayFpkm   = attributes.TryGetValue("FPKM", out string fpkm) &&
                                                      double.TryParse(fpkm, abundanceStyles, invariantCulture, out double fpkmValue) &&
                                                      fpkmValue > 0;
                                    bool okayTpm    = attributes.TryGetValue("TPM", out string tpm) &&
                                                      double.TryParse(tpm, abundanceStyles, invariantCulture, out double tpmValue) &&
                                                      tpmValue > 0;
                                    okayTranscript = okayFpkm && okayTpm;
                                }
                            }

                            if (okayTranscript)
                            {
                                filteredFeatures.Add(feature);
                            }
                        }
                    }
                    chromISeq.Metadata["features"] = filteredFeatures;
                }
                formatter.Format(file, chromFeatures);
            }

            // Reload the filtered model and annotate CDS from the reference model's start codons.
            Genome    ensemblGenome      = new Genome(referenceGenomePath);
            GeneModel newGeneModel       = new GeneModel(ensemblGenome, filteredGtfPath);
            GeneModel referenceGeneModel = new GeneModel(ensemblGenome, referenceGeneModelPath);

            newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);
            string filteredGtfWithCdsPath = Path.Combine(Path.GetDirectoryName(filteredGtfPath), Path.GetFileNameWithoutExtension(filteredGtfPath) + ".withcds.gtf");

            newGeneModel.PrintToGTF(filteredGtfWithCdsPath);
        }
        /// <summary>
        /// Annotates PacBio transcript model for MCF7 with start codons in the reference model.
        /// The genome, the reference gene model and the PacBio gene model are read from
        /// hard-coded local paths, and the CDS-annotated model is written back out as GTF.
        /// Timing and transcript counts are then printed to the console, and the method waits
        /// for a key press before returning.
        /// </summary>
        private static void PacBioCds()
        {
            Stopwatch stopwatch = Stopwatch.StartNew();

            Genome    genome       = new Genome(@"E:\ProjectsActive\MCF7PacBio\Homo_sapiens.GRCh37.73.dna.primary_assembly.fa");
            string    referenceGff = @"E:\ProjectsActive\MCF7PacBio\Homo_sapiens.GRCh37.73.gtf";
            string    alternateGff = @"E:\ProjectsActive\MCF7PacBio\IsoSeq_MCF72015edition_polished.unimapped.ensembl.unimapped.gff";
            GeneModel r            = new GeneModel(genome, referenceGff);
            GeneModel a            = new GeneModel(genome, alternateGff);

            a.CreateCDSFromAnnotatedStartCodons(r);
            a.PrintToGTF(@"E:\ProjectsActive\MCF7PacBio\CDSAnnotated_IsoSeq_MCF7_2015edition_polished.unimapped.gff");

            stopwatch.Stop();
            TimeSpan elapsed = stopwatch.Elapsed;

            // TimeSpan.Minutes is only the 0-59 minutes component, so the whole number of
            // elapsed minutes is taken from TotalMinutes to stay correct for runs over an hour.
            int wholeMinutes = (int)elapsed.TotalMinutes;

            Console.WriteLine("Finished checking that all proteins are the same.");
            Console.WriteLine("Time elapsed: " + wholeMinutes.ToString() + " minutes and " + elapsed.Seconds.ToString() + " seconds.");
            Console.WriteLine("Result: there are " + a.Genes.Sum(g => g.Transcripts.Count) + " PacBio transcript isoforms");
            Console.WriteLine("Result: " + a.Genes.Sum(g => g.Transcripts.Count(t => t.IsProteinCoding())) + " PacBio transcript isoforms are new annotated as protein coding");
            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();
        }
        /// <summary>
        /// Check that annotations from ensembl make it in from a merged gene model.
        /// Uses the "202122" genome and gene model test files plus a merged, filtered stringtie
        /// model, all read from hard-coded local paths. The CDS-annotated stringtie model is
        /// written out as GTF, timing and transcript counts are printed to the console, and the
        /// method waits for a key press before returning.
        /// </summary>
        private static void StringtieAndEnsembl202122CDS()
        {
            Stopwatch stopwatch = Stopwatch.StartNew();

            Genome    genome       = new Genome(@"E:\source\repos\Spritz\Test\bin\Debug\TestData\202122.karyotypic.fa");
            string    referenceGff = @"E:\source\repos\Spritz\Test\bin\Debug\TestData\202122.gtf";
            string    alternateGff = @"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-806392539.filtered.gtf";
            GeneModel r            = new GeneModel(genome, referenceGff);
            GeneModel a            = new GeneModel(genome, alternateGff);

            a.CreateCDSFromAnnotatedStartCodons(r);
            a.PrintToGTF(@"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-806392539.filtered.withcds.gtf");

            stopwatch.Stop();
            TimeSpan elapsed = stopwatch.Elapsed;

            // TimeSpan.Minutes is only the 0-59 minutes component, so the whole number of
            // elapsed minutes is taken from TotalMinutes to stay correct for runs over an hour.
            int wholeMinutes = (int)elapsed.TotalMinutes;

            Console.WriteLine("Finished checking that all proteins are the same.");
            Console.WriteLine("Time elapsed: " + wholeMinutes.ToString() + " minutes and " + elapsed.Seconds.ToString() + " seconds.");
            Console.WriteLine("Result: there are " + a.Genes.Sum(g => g.Transcripts.Count) + " transcript isoforms in " + alternateGff);
            Console.WriteLine("Result: " + a.Genes.Sum(g => g.Transcripts.Count(t => t.IsProteinCoding())) + " of those transcript isoforms are new annotated as protein coding");
            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();
        }
        /// <summary>
        /// Check that annotations from ensembl make it in from a merged gene model.
        /// Uses the GRCh38 primary assembly, the release 81 gene model and a merged, filtered
        /// stringtie model, all read from hard-coded local paths. The CDS-annotated stringtie
        /// model is written out as GTF, timing and transcript counts are printed to the console,
        /// and the method waits for a key press before returning.
        /// </summary>
        private static void StringtieAndEnsemblFullCDS()
        {
            Stopwatch stopwatch = Stopwatch.StartNew();

            Genome    genome       = new Genome(@"E:\source\repos\Spritz\Test\bin\Debug\TestData\Homo_sapiens.GRCh38.dna.primary_assembly.fa");
            string    referenceGff = @"E:\source\repos\Spritz\Test\bin\Debug\TestData\Homo_sapiens.GRCh38.81.gtf";
            string    alternateGff = @"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-1914802334.filtered.gtf";
            GeneModel r            = new GeneModel(genome, referenceGff);
            GeneModel a            = new GeneModel(genome, alternateGff);

            a.CreateCDSFromAnnotatedStartCodons(r);
            a.PrintToGTF(@"E:\ProjectsActive\Spritz\customGtfCdsAnnotatedTest\MergedStringtieModel-1914802334.filtered.withcds.gtf");

            stopwatch.Stop();
            TimeSpan elapsed = stopwatch.Elapsed;

            // TimeSpan.Minutes is only the 0-59 minutes component, so the whole number of
            // elapsed minutes is taken from TotalMinutes to stay correct for runs over an hour.
            int wholeMinutes = (int)elapsed.TotalMinutes;

            Console.WriteLine("Finished checking that all proteins are the same.");
            Console.WriteLine("Time elapsed: " + wholeMinutes.ToString() + " minutes and " + elapsed.Seconds.ToString() + " seconds.");
            Console.WriteLine("Result: there are " + a.Genes.Sum(g => g.Transcripts.Count) + " transcript isoforms in " + alternateGff);
            Console.WriteLine("Result: " + a.Genes.Sum(g => g.Transcripts.Count(t => t.IsProteinCoding())) + " of those transcript isoforms are new annotated as protein coding");
            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();
        }
Exemple #7
0
        /// <summary>
        /// Generate sample specific protein database starting with fastq files.
        /// The steps, in order:
        /// 1. prepare the Ensembl genome fasta and, when fastqs are given, align the reads;
        /// 2. filter the reference gene model and convert it to a sorted BED12 file;
        /// 3. when transcript isoform analysis is requested, obtain a new gene model (stringtie
        ///    reconstruction, or the user-specified model merged with the reference) and create
        ///    its CDS annotations from the reference model's start codons;
        /// 4. generate or download the SnpEff database, producing a protein XML directly from
        ///    the gene model in the cases where no variant analysis will run;
        /// 5. optionally discover gene fusions;
        /// 6. call variants when fastqs are given and variant analysis is not skipped;
        /// 7. transfer modifications from the UniProt XML onto the chosen protein XMLs and store
        ///    the result in <c>VariantAnnotatedProteinXmlDatabases</c>.
        /// </summary>
        public void GenerateSampleSpecificProteinDatabases()
        {
            // Download references and align reads
            Downloads.PrepareEnsemblGenomeFasta(Parameters.AnalysisDirectory, Parameters.GenomeFasta);
            if (Parameters.Fastqs != null)
            {
                Alignment.Parameters = new AlignmentParameters();
                Alignment.Parameters.SpritzDirectory          = Parameters.SpritzDirectory;
                Alignment.Parameters.AnalysisDirectory        = Parameters.AnalysisDirectory;
                Alignment.Parameters.Reference                = Parameters.Reference;
                Alignment.Parameters.Threads                  = Parameters.Threads;
                Alignment.Parameters.Fastqs                   = Parameters.Fastqs;
                Alignment.Parameters.ExperimentType           = Parameters.ExperimentType;
                Alignment.Parameters.StrandSpecific           = Parameters.StrandSpecific;
                Alignment.Parameters.InferStrandSpecificity   = Parameters.InferStrandSpecificity;
                Alignment.Parameters.OverwriteStarAlignment   = Parameters.OverwriteStarAlignment;
                Alignment.Parameters.GenomeStarIndexDirectory = Parameters.GenomeStarIndexDirectory;
                Alignment.Parameters.ReorderedFastaPath       = Downloads.ReorderedFastaPath;
                Alignment.Parameters.GeneModelGtfOrGffPath    = Parameters.ReferenceGeneModelGtfOrGff;
                Alignment.Parameters.UseReadSubset            = Parameters.UseReadSubset;
                Alignment.Parameters.ReadSubset               = Parameters.ReadSubset;

                Alignment.PerformAlignment();
                Downloads.GetImportantProteinAccessions(Parameters.SpritzDirectory, Parameters.ProteinFastaPath);
            }
            EnsemblDownloadsWrapper.FilterGeneModel(Parameters.AnalysisDirectory, Parameters.ReferenceGeneModelGtfOrGff, Downloads.EnsemblGenome, out string filteredGeneModelForScalpel);
            string    sortedBed12Path              = BEDOPSWrapper.GffOrGtf2Bed12(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, filteredGeneModelForScalpel);
            GeneModel referenceGeneModel           = new GeneModel(Downloads.EnsemblGenome, Parameters.ReferenceGeneModelGtfOrGff);
            string    referenceGeneModelProteinXml = Path.Combine(Path.GetDirectoryName(Parameters.ReferenceGeneModelGtfOrGff), Path.GetFileNameWithoutExtension(Parameters.ReferenceGeneModelGtfOrGff) + ".protein.xml"); // used if no fastqs are provided

            // Merge reference gene model and a new gene model (either specified or stringtie-generated)
            string newGeneModelPath           = Parameters.NewGeneModelGtfOrGff;
            string mergedGeneModelWithCdsPath = null;
            string mergedGeneModelProteinXml  = null;
            string reference = Parameters.Reference; // replaced by the generated SnpEff database name when isoform analysis runs

            if (Parameters.DoTranscriptIsoformAnalysis)
            {
                StringtieWrapper stringtie = new StringtieWrapper();
                if (newGeneModelPath == null)
                {
                    // No gene model was specified, so reconstruct transcripts with stringtie.
                    stringtie.TranscriptReconstruction(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Threads, Parameters.ReferenceGeneModelGtfOrGff, Downloads.EnsemblGenome, Parameters.StrandSpecific, Parameters.InferStrandSpecificity, Alignment.SortedBamFiles, true);
                    newGeneModelPath = stringtie.FilteredMergedGtfPath;
                }
                else
                {
                    // A gene model was specified: convert its first column and merge it with the reference model.
                    newGeneModelPath = EnsemblDownloadsWrapper.ConvertFirstColumnUCSC2Ensembl(Parameters.SpritzDirectory, Parameters.Reference, Parameters.NewGeneModelGtfOrGff);
                    string mergedGeneModelPath = Path.Combine(Path.GetDirectoryName(newGeneModelPath), Path.GetFileNameWithoutExtension(newGeneModelPath) + ".merged.gtf");
                    WrapperUtility.GenerateAndRunScript(
                        WrapperUtility.GetAnalysisScriptPath(Parameters.AnalysisDirectory, "MergeTranscriptModels.bash"),
                        StringtieWrapper.MergeTranscriptPredictions(Parameters.SpritzDirectory, Parameters.ReferenceGeneModelGtfOrGff, new List<string> { newGeneModelPath }, mergedGeneModelPath)).WaitForExit();
                    newGeneModelPath = mergedGeneModelPath;
                }

                // Determine CDS from start codons of reference gene model
                // In the future, we could also try ORF finding to expand this (e.g. https://github.com/TransDecoder/TransDecoder/wiki)
                GeneModel newGeneModel = new GeneModel(Downloads.EnsemblGenome, newGeneModelPath);
                newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);

                mergedGeneModelWithCdsPath = Path.Combine(Path.GetDirectoryName(newGeneModelPath), Path.GetFileNameWithoutExtension(newGeneModelPath) + ".withcds.gtf");
                newGeneModel.PrintToGTF(mergedGeneModelWithCdsPath);
            }

            // SnpEff databases or outputing protein XMLs from gene models
            if (Parameters.DoTranscriptIsoformAnalysis) // isoform analysis, so generate a new snpeff database
            {
                reference = SnpEffWrapper.GenerateDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Downloads.ReorderedFastaPath, Parameters.ProteinFastaPath, mergedGeneModelWithCdsPath);

                if (Parameters.Fastqs == null || Parameters.SkipVariantAnalysis) // isoform analysis without variant analysis, so generate a protein database directly from merged gtf
                {
                    mergedGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, reference, mergedGeneModelWithCdsPath);
                }
            }
            else // no isoform analysis
            {
                new SnpEffWrapper(1).DownloadSnpEffDatabase(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference);

                // NOTE(review): unlike the isoform branch above, this condition does not include
                // Parameters.SkipVariantAnalysis, so with fastqs and skipped variant analysis no
                // protein XML is generated here - confirm whether that is intended.
                if (Parameters.Fastqs == null) // no isoform analysis and no fastqs
                {
                    referenceGeneModelProteinXml = SnpEffWrapper.GenerateXmlDatabaseFromReference(Parameters.SpritzDirectory, Parameters.AnalysisDirectory, Parameters.Reference, Parameters.ReferenceGeneModelGtfOrGff);
                }
            }

            // Gene Fusion Discovery
            List<Protein> fusionProteins = new List<Protein>();

            if (Parameters.DoFusionAnalysis)
            {
                Fusion.Parameters.SpritzDirectory   = Parameters.SpritzDirectory;
                Fusion.Parameters.AnalysisDirectory = Parameters.AnalysisDirectory;
                Fusion.Parameters.Reference         = Parameters.Reference;
                Fusion.Parameters.Threads           = Parameters.Threads;
                Fusion.Parameters.Fastqs            = Parameters.Fastqs;
                Fusion.DiscoverGeneFusions();
                fusionProteins = Fusion.FusionProteins;
            }

            // Variant Calling
            if (Parameters.Fastqs != null && !Parameters.SkipVariantAnalysis)
            {
                VariantCalling.CallVariants(
                    Parameters.SpritzDirectory,
                    Parameters.AnalysisDirectory,
                    Parameters.ExperimentType,
                    reference,
                    Parameters.Threads,
                    sortedBed12Path,
                    Parameters.EnsemblKnownSitesPath,
                    Alignment.DedupedBamFiles,
                    Downloads.ReorderedFastaPath,
                    Downloads.EnsemblGenome,
                    Parameters.QuickSnpEffWithoutStats,
                    Parameters.IndelFinder,
                    Parameters.VariantCallingWorkers);
            }

            // Transfer features from UniProt
            // keep, since it might be useful for making a final database: .Concat(new[] { VariantCalling.CombinedAnnotatedProteinXmlPath }).ToList()
            List<string> xmlsToUse;

            if (VariantCalling.CombinedAnnotatedProteinXmlPaths.Count > 0)
            {
                xmlsToUse = VariantCalling.CombinedAnnotatedProteinXmlPaths;
            }
            else
            {
                // Fall back to the protein XML made directly from a gene model.
                // NOTE(review): mergedGeneModelProteinXml is still null here when isoform analysis
                // ran with fastqs and variant analysis enabled but no annotated XMLs were produced.
                xmlsToUse = new List<string>
                {
                    Parameters.DoTranscriptIsoformAnalysis ? mergedGeneModelProteinXml : referenceGeneModelProteinXml
                };
            }
            VariantAnnotatedProteinXmlDatabases = new TransferModificationsFlow().TransferModifications(Parameters.SpritzDirectory, Parameters.UniProtXmlPath, xmlsToUse, fusionProteins);
        }