Esempio n. 1
0
        /// <summary>
        /// Filters GTF or GFF entries that lack strand information and writes the surviving
        /// features next to the input file as "[input name].filtered.gtf".
        /// Can also drop transcripts whose StringTie FPKM or TPM estimate is missing,
        /// unparsable, or not greater than zero.
        /// Afterwards a gene model is built from the filtered file, CDS entries are created from
        /// the start codons annotated in the reference gene model, and the result is written as
        /// "[input name].filtered.withcds.gtf".
        /// </summary>
        /// <remarks>
        /// A feature is kept only when it has a "strand" sub-item and the most recently seen
        /// "transcript" feature passed the filter, so features that precede the first
        /// transcript of a sequence are dropped.
        /// </remarks>
        /// <param name="gtfPath">Path of the GTF/GFF file to filter; output files are written to its directory.</param>
        /// <param name="referenceGenomePath">Path passed to the Genome constructor.</param>
        /// <param name="referenceGeneModelPath">Path of the reference gene model used as the source of annotated start codons.</param>
        /// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">
        /// When true, a transcript is kept only if both its FPKM and TPM attributes parse to a value greater than zero.
        /// </param>
        public static void FilterGtfEntriesWithoutStrand(string gtfPath, string referenceGenomePath, string referenceGeneModelPath, bool filterEntriesWithZeroAbundanceStringtieEstimates = false)
        {
            // Abundance values are machine-written, so parse them culture-invariantly; otherwise
            // the result depends on the current thread culture. The number styles are the same
            // ones double.TryParse(string, out double) applies by default.
            bool IsPositiveAbundance(string text)
            {
                return double.TryParse(
                           text,
                           System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                           System.Globalization.CultureInfo.InvariantCulture,
                           out double value) &&
                       value > 0;
            }

            var    chromFeatures   = GeneModel.SimplerParse(gtfPath);
            string filteredGtfPath = Path.Combine(Path.GetDirectoryName(gtfPath), Path.GetFileNameWithoutExtension(gtfPath) + ".filtered.gtf");

            using (var file = File.Create(filteredGtfPath))
            {
                var formatter = new GffFormatter();
                foreach (var chromISeq in chromFeatures)
                {
                    var filteredFeatures = new List<MetadataListItem<List<string>>>();

                    // The type pattern also covers a "features" entry of an unexpected type,
                    // which previously caused a NullReferenceException in the loop below.
                    if (chromISeq.Metadata.TryGetValue("features", out object featuresObj) &&
                        featuresObj is List<MetadataListItem<List<string>>> features)
                    {
                        bool okayTranscript = false;
                        foreach (var feature in features)
                        {
                            bool isTranscript = feature.Key == "transcript";

                            if (!feature.SubItems.TryGetValue("strand", out _))
                            {
                                // A strandless transcript is rejected; make sure the features that
                                // follow it do not inherit the verdict of the previous transcript.
                                if (isTranscript)
                                {
                                    okayTranscript = false;
                                }
                                continue;
                            }

                            if (isTranscript)
                            {
                                var attributes = GeneModel.SplitAttributes(feature.FreeText);
                                okayTranscript = !filterEntriesWithZeroAbundanceStringtieEstimates ||
                                                 (attributes.TryGetValue("FPKM", out string fpkm) && IsPositiveAbundance(fpkm) &&
                                                  attributes.TryGetValue("TPM", out string tpm) && IsPositiveAbundance(tpm));
                            }

                            if (okayTranscript)
                            {
                                filteredFeatures.Add(feature);
                            }
                        }
                    }

                    // Always assigned, so a sequence without usable features ends up with an empty list.
                    chromISeq.Metadata["features"] = filteredFeatures;
                }
                formatter.Format(file, chromFeatures);
            }

            Genome    ensemblGenome      = new Genome(referenceGenomePath);
            GeneModel newGeneModel       = new GeneModel(ensemblGenome, filteredGtfPath);
            GeneModel referenceGeneModel = new GeneModel(ensemblGenome, referenceGeneModelPath);

            newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);
            string filteredGtfWithCdsPath = Path.Combine(Path.GetDirectoryName(filteredGtfPath), Path.GetFileNameWithoutExtension(filteredGtfPath) + ".withcds.gtf");

            newGeneModel.PrintToGTF(filteredGtfWithCdsPath);
        }
 /// <summary>
 /// Writes the gene features found in the genome forest to a file, one chromosome at a time.
 /// Chromosomes that have no entry in the forest are skipped.
 /// </summary>
 /// <param name="outGffFilePath">Path of the output file; it is opened with FileMode.Create.</param>
 public void PrintToGTF(string outGffFilePath)
 {
     using (FileStream output = new FileStream(outGffFilePath, FileMode.Create))
     {
         GffFormatter formatter = new GffFormatter();
         foreach (Chromosome chromosome in Genome.Chromosomes)
         {
             if (!GenomeForest.Forest.TryGetValue(chromosome.FriendlyName, out var intervalTree))
             {
                 continue;
             }

             // Genes are ordered by start position before being flattened into their features.
             var orderedFeatures = intervalTree.Intervals
                                   .OfType<Gene>()
                                   .OrderBy(gene => gene.OneBasedStart)
                                   .SelectMany(gene => gene.GetFeatures())
                                   .ToList();

             chromosome.Sequence.Metadata["features"] = orderedFeatures;
             chromosome.Sequence.ID = chromosome.FriendlyName; // shortens to "1" from "1 dna:chromosome chromosome:GRCh37:1:1:249250621:1 REF"
             formatter.Format(output, chromosome.Sequence);
         }
     }
 }
Esempio n. 3
0
        /// <summary>
        /// Filters GTF or GFF entries that lack strand information and writes the surviving
        /// features to <paramref name="gtfOutPath"/>.
        /// Can also drop transcripts whose StringTie FPKM or TPM estimate is missing,
        /// unparsable, or not greater than zero.
        /// </summary>
        /// <remarks>
        /// A feature is kept only when it has a "strand" sub-item and the most recently seen
        /// "transcript" feature passed the filter, so features that precede the first
        /// transcript of a sequence are dropped.
        /// </remarks>
        /// <param name="gtfPath">Path of the GTF/GFF file to filter.</param>
        /// <param name="gtfOutPath">Path of the filtered file; it is created or overwritten.</param>
        /// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">
        /// When true, a transcript is kept only if both its FPKM and TPM attributes parse to a value greater than zero.
        /// </param>
        public void FilterGtfEntriesWithoutStrand(string gtfPath, string gtfOutPath, bool filterEntriesWithZeroAbundanceStringtieEstimates)
        {
            // Abundance values are machine-written, so parse them culture-invariantly; otherwise
            // the result depends on the current thread culture. The number styles are the same
            // ones double.TryParse(string, out double) applies by default.
            bool IsPositiveAbundance(string text)
            {
                return double.TryParse(
                           text,
                           System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                           System.Globalization.CultureInfo.InvariantCulture,
                           out double value) &&
                       value > 0;
            }

            var chromFeatures = GeneModel.SimplerParse(gtfPath);

            using (var file = File.Create(gtfOutPath))
            {
                var formatter = new GffFormatter();
                foreach (var chromISeq in chromFeatures)
                {
                    var filteredFeatures = new List<MetadataListItem<List<string>>>();

                    // The type pattern also covers a "features" entry of an unexpected type,
                    // which previously caused a NullReferenceException in the loop below.
                    if (chromISeq.Metadata.TryGetValue("features", out object featuresObj) &&
                        featuresObj is List<MetadataListItem<List<string>>> features)
                    {
                        bool okayTranscript = false;
                        foreach (var feature in features)
                        {
                            bool isTranscript = feature.Key == "transcript";

                            if (!feature.SubItems.TryGetValue("strand", out _))
                            {
                                // A strandless transcript is rejected; make sure the features that
                                // follow it do not inherit the verdict of the previous transcript.
                                if (isTranscript)
                                {
                                    okayTranscript = false;
                                }
                                continue;
                            }

                            if (isTranscript)
                            {
                                var attributes = GeneModel.SplitAttributes(feature.FreeText);
                                okayTranscript = !filterEntriesWithZeroAbundanceStringtieEstimates ||
                                                 (attributes.TryGetValue("FPKM", out string fpkm) && IsPositiveAbundance(fpkm) &&
                                                  attributes.TryGetValue("TPM", out string tpm) && IsPositiveAbundance(tpm));
                            }

                            if (okayTranscript)
                            {
                                filteredFeatures.Add(feature);
                            }
                        }
                    }

                    // Always assigned, so a sequence without usable features ends up with an empty list.
                    chromISeq.Metadata["features"] = filteredFeatures;
                }
                formatter.Format(file, chromFeatures);
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Round-trips a GFF file through GffFormatter.Format() and checks that the re-parsed
        /// output matches the original: features, sequence count, ID and sequence data.
        /// </summary>
        /// <param name="nodeName">Xml Node name to be read.</param>
        /// <param name="isFilePath">True to call the Format() overload that takes the temp file name.</param>
        /// <param name="isSequenceList">True to format the parsed sequence list; false to format only its first sequence.</param>
        void ValidateFormatGeneralTestCases(string nodeName,
                                            bool isFilePath, bool isSequenceList)
        {
            // Locate and parse the input file named in the Xml configuration.
            string inputPath = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.FilePathNode);
            Assert.IsTrue(File.Exists(inputPath));

            GffParser         sourceParser    = new GffParser();
            IList<ISequence>  parsedSequences = sourceParser.Parse(inputPath).ToList();
            Sequence          originalSequence = (Sequence)parsedSequences[0];

            // Write the parsed data back out with the overload selected by the two flags.
            ApplicationLog.WriteLine(string.Format("Gff Formatter BVT: Creating the Temp file '{0}'.",
                                                   Constants.GffTempFileName));

            GffFormatter formatter = new GffFormatter();
            formatter.ShouldWriteSequenceData = true;

            if (isFilePath && isSequenceList)
            {
                formatter.Format(parsedSequences, Constants.GffTempFileName);
            }
            else if (isFilePath)
            {
                formatter.Format(originalSequence, Constants.GffTempFileName);
            }
            else if (isSequenceList)
            {
                formatter.Format(parsedSequences);
            }
            else
            {
                formatter.Format(originalSequence);
            }

            // Parse the temp file and compare it with what was read originally.
            GffParser        roundTripParser = new GffParser();
            IList<ISequence> reparsed        = roundTripParser.Parse(Constants.GffTempFileName).ToList();

            Assert.IsNotNull(reparsed);
            ApplicationLog.WriteLine(string.Format("Gff Formatter BVT: New Sequence is '{0}'.",
                                                   reparsed[0].ToString()));

            bool featuresMatch = ValidateFeatures(reparsed[0], nodeName);
            Assert.IsTrue(featuresMatch);
            ApplicationLog.WriteLine(
                "GFF Formatter BVT : All the features validated successfully.");

            // Exactly one sequence is expected in the round-tripped file.
            int expectedCount = 1;
            int actualCount   = reparsed.Count;
            Assert.AreEqual(expectedCount, actualCount);
            ApplicationLog.WriteLine("The Number of sequences are matching.");

            Assert.AreEqual(originalSequence.ID, reparsed[0].ID);

            // Compare the sequence data symbol by symbol, rendered as text.
            char[]    originalSymbols = originalSequence.Select(symbol => (char)symbol).ToArray();
            ISequence firstReparsed   = reparsed.FirstOrDefault();
            char[]    reparsedSymbols = firstReparsed.Select(symbol => (char)symbol).ToArray();
            string    originalText    = new string(originalSymbols);
            string    reparsedText    = new string(reparsedSymbols);

            Assert.AreEqual(originalText, reparsedText);
            ApplicationLog.WriteLine(string.Format((IFormatProvider)null,
                                                   "Gff Formatter BVT: The Gff sequences '{0}' are matching with Format() method.",
                                                   reparsed[0].ToString()));

            // Passed all the tests, delete the tmp file. If we failed an Assert,
            // the tmp file will still be there in case we need it for debugging.
            bool tempFileExists = File.Exists(Constants.GffTempFileName);
            if (tempFileExists)
            {
                File.Delete(Constants.GffTempFileName);
            }
            ApplicationLog.WriteLine("Deleted the temp file created.");
        }
Esempio n. 5
0
        /// <summary>
        /// General method to invalidate the GffFormatter write methods: runs the formatter call
        /// selected by <paramref name="method"/> with invalid input (a null sequence, an empty
        /// collection, or a call that names no output file) and fails the test unless that
        /// call throws.
        /// </summary>
        /// <param name="method">Selects which formatter call and which invalid input to exercise.</param>
        void InvalidateGffWriteMethod(ArgumentNullExceptions method)
        {
            ISequence       sequence     = null;
            List<ISequence> collection   = new List<ISequence>();
            string          sequenceData = null;
            GffFormatter    gffFormatter = null;

            // Tracked outside the try block: the original called Assert.Fail() inside it, and the
            // catch (Exception) handler swallowed the resulting assertion exception, so the test
            // could never fail.
            bool exceptionThrown = false;

            try
            {
                switch (method)
                {
                    case ArgumentNullExceptions.writeWithEmptyFile:
                        sequenceData = utilityObj.xmlUtil.GetTextValue(
                            Constants.SimpleGffDnaNodeName, Constants.ExpectedSequenceNode);

                        gffFormatter = new GffFormatter();
                        gffFormatter.Format(new Sequence(DnaAlphabet.Instance, sequenceData));
                        break;

                    case ArgumentNullExceptions.writeWithEmptySequence:
                        gffFormatter = new GffFormatter();
                        gffFormatter.Format(sequence);
                        break;

                    case ArgumentNullExceptions.FormatString:
                        gffFormatter = new GffFormatter();
                        gffFormatter.FormatString(sequence);
                        break;

                    case ArgumentNullExceptions.writeCollectionWithEmptyFile:
                        sequenceData = utilityObj.xmlUtil.GetTextValue(
                            Constants.SimpleGffDnaNodeName, Constants.ExpectedSequenceNode);
                        collection.Add(new Sequence(DnaAlphabet.Instance, sequenceData));

                        gffFormatter = new GffFormatter();
                        gffFormatter.Format(collection);
                        break;

                    case ArgumentNullExceptions.writeCollectionWithEmptySequence:
                        gffFormatter = new GffFormatter();
                        gffFormatter.Format(collection);
                        break;

                    default:
                        // Unknown selector: nothing is invoked, so the check below fails the test.
                        break;
                }
            }
            catch (ArgumentNullException)
            {
                exceptionThrown = true;
                ApplicationLog.WriteLine("GFF P2 : Exception is validated successfully.");
            }
            catch (Exception)
            {
                // NOTE(review): any exception type is accepted here, as in the original;
                // consider narrowing once the expected type of every case is confirmed.
                exceptionThrown = true;
                ApplicationLog.WriteLine("GFF P2 : Exception is validated successfully.");
            }

            if (!exceptionThrown)
            {
                Assert.Fail("GFF P2 : Expected an exception, but the formatter call completed without one.");
            }
        }