/// <summary>
/// Filters GTF or GFF entries that lack strand information and writes the kept
/// entries next to the input file with a ".filtered.gtf" suffix.
/// Can also filter out transcripts with zero-abundance stringtie estimates.
/// Afterwards, CDS entries are created from the annotated start codons of the
/// reference gene model and the result is written with a ".withcds.gtf" suffix.
/// </summary>
/// <param name="gtfPath">Path of the GTF/GFF file to filter.</param>
/// <param name="referenceGenomePath">Path used to construct the genome for both gene models.</param>
/// <param name="referenceGeneModelPath">Path of the reference gene model that supplies annotated start codons.</param>
/// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">
/// When true, a transcript (and the features that follow it) is kept only if both its
/// FPKM and TPM attributes parse to a value greater than zero.
/// </param>
public static void FilterGtfEntriesWithoutStrand(string gtfPath, string referenceGenomePath, string referenceGeneModelPath, bool filterEntriesWithZeroAbundanceStringtieEstimates = false)
{
    // Abundance values are machine-written, so they must be parsed independently of the
    // current culture (a comma-decimal culture would otherwise misread "0.5").
    const System.Globalization.NumberStyles abundanceStyle =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

    var chromFeatures = GeneModel.SimplerParse(gtfPath);
    string filteredGtfPath = Path.Combine(Path.GetDirectoryName(gtfPath), Path.GetFileNameWithoutExtension(gtfPath) + ".filtered.gtf");
    using (var file = File.Create(filteredGtfPath))
    {
        var formatter = new GffFormatter();
        foreach (var chromISeq in chromFeatures)
        {
            var filteredFeatures = new List<MetadataListItem<List<string>>>();

            // The type pattern guards against "features" metadata of an unexpected type,
            // which previously surfaced as a NullReferenceException in the loop.
            if (chromISeq.Metadata.TryGetValue("features", out object featuresObj)
                && featuresObj is List<MetadataListItem<List<string>>> features)
            {
                // Verdict of the most recent transcript; it also applies to the features that follow it.
                bool okayTranscript = false;
                foreach (var feature in features)
                {
                    bool isTranscript = feature.Key == "transcript";

                    if (!feature.SubItems.TryGetValue("strand", out List<string> _))
                    {
                        // A strandless transcript is dropped, so the features that follow it
                        // must not inherit the verdict of the previous transcript.
                        if (isTranscript)
                        {
                            okayTranscript = false;
                        }
                        continue;
                    }

                    if (isTranscript)
                    {
                        var attributes = GeneModel.SplitAttributes(feature.FreeText);
                        bool okayFpkm = !filterEntriesWithZeroAbundanceStringtieEstimates ||
                            attributes.TryGetValue("FPKM", out string fpkm) &&
                            double.TryParse(fpkm, abundanceStyle, invariantCulture, out double fpkmValue) &&
                            fpkmValue > 0;
                        bool okayTpm = !filterEntriesWithZeroAbundanceStringtieEstimates ||
                            attributes.TryGetValue("TPM", out string tpm) &&
                            double.TryParse(tpm, abundanceStyle, invariantCulture, out double tpmValue) &&
                            tpmValue > 0;
                        okayTranscript = okayFpkm && okayTpm;
                    }

                    if (okayTranscript)
                    {
                        filteredFeatures.Add(feature);
                    }
                }
            }

            // Always replace the feature list, even when nothing was kept.
            chromISeq.Metadata["features"] = filteredFeatures;
        }
        formatter.Format(file, chromFeatures);
    }

    // Re-read the filtered file as a gene model and add CDS annotations from the reference model.
    Genome ensemblGenome = new Genome(referenceGenomePath);
    GeneModel newGeneModel = new GeneModel(ensemblGenome, filteredGtfPath);
    GeneModel referenceGeneModel = new GeneModel(ensemblGenome, referenceGeneModelPath);
    newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);
    string filteredGtfWithCdsPath = Path.Combine(Path.GetDirectoryName(filteredGtfPath), Path.GetFileNameWithoutExtension(filteredGtfPath) + ".withcds.gtf");
    newGeneModel.PrintToGTF(filteredGtfWithCdsPath);
}
/// <summary>
/// Writes this gene model to a GTF file: for every chromosome of the genome that has
/// an entry in the genome forest, the features of its genes (ordered by start position)
/// are attached to the chromosome sequence and formatted with a <c>GffFormatter</c>.
/// </summary>
/// <param name="outGffFilePath">Path of the output file; it is created or overwritten.</param>
public void PrintToGTF(string outGffFilePath)
{
    using (FileStream output = new FileStream(outGffFilePath, FileMode.Create))
    {
        GffFormatter formatter = new GffFormatter();
        foreach (Chromosome chromosome in Genome.Chromosomes)
        {
            // Chromosomes without an interval tree have nothing to print.
            if (!GenomeForest.Forest.TryGetValue(chromosome.FriendlyName, out var intervalTree))
            {
                continue;
            }

            chromosome.Sequence.Metadata["features"] = intervalTree.Intervals
                .OfType<Gene>()
                .OrderBy(gene => gene.OneBasedStart)
                .SelectMany(gene => gene.GetFeatures())
                .ToList();

            // shortens to "1" from "1 dna:chromosome chromosome:GRCh37:1:1:249250621:1 REF"
            chromosome.Sequence.ID = chromosome.FriendlyName;

            formatter.Format(output, chromosome.Sequence);
        }
    }
}
/// <summary>
/// Filters GTF or GFF entries that lack strand information and writes the kept
/// entries to <paramref name="gtfOutPath"/>. Can also filter out transcripts with
/// zero-abundance stringtie estimates.
/// </summary>
/// <param name="gtfPath">Path of the GTF/GFF file to filter.</param>
/// <param name="gtfOutPath">Path of the filtered output file; it is created or overwritten.</param>
/// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">
/// When true, a transcript (and the features that follow it) is kept only if both its
/// FPKM and TPM attributes parse to a value greater than zero.
/// </param>
public void FilterGtfEntriesWithoutStrand(string gtfPath, string gtfOutPath, bool filterEntriesWithZeroAbundanceStringtieEstimates)
{
    // Abundance values are machine-written, so they must be parsed independently of the
    // current culture (a comma-decimal culture would otherwise misread "0.5").
    const System.Globalization.NumberStyles abundanceStyle =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

    var chromFeatures = GeneModel.SimplerParse(gtfPath);
    using (var file = File.Create(gtfOutPath))
    {
        var formatter = new GffFormatter();
        foreach (var chromISeq in chromFeatures)
        {
            var filteredFeatures = new List<MetadataListItem<List<string>>>();

            // The type pattern guards against "features" metadata of an unexpected type,
            // which previously surfaced as a NullReferenceException in the loop.
            if (chromISeq.Metadata.TryGetValue("features", out object featuresObj)
                && featuresObj is List<MetadataListItem<List<string>>> features)
            {
                // Verdict of the most recent transcript; it also applies to the features that follow it.
                bool okayTranscript = false;
                foreach (var feature in features)
                {
                    bool isTranscript = feature.Key == "transcript";

                    if (!feature.SubItems.TryGetValue("strand", out List<string> _))
                    {
                        // A strandless transcript is dropped, so the features that follow it
                        // must not inherit the verdict of the previous transcript.
                        if (isTranscript)
                        {
                            okayTranscript = false;
                        }
                        continue;
                    }

                    if (isTranscript)
                    {
                        var attributes = GeneModel.SplitAttributes(feature.FreeText);
                        bool okayFpkm = !filterEntriesWithZeroAbundanceStringtieEstimates ||
                            attributes.TryGetValue("FPKM", out string fpkm) &&
                            double.TryParse(fpkm, abundanceStyle, invariantCulture, out double fpkmValue) &&
                            fpkmValue > 0;
                        bool okayTpm = !filterEntriesWithZeroAbundanceStringtieEstimates ||
                            attributes.TryGetValue("TPM", out string tpm) &&
                            double.TryParse(tpm, abundanceStyle, invariantCulture, out double tpmValue) &&
                            tpmValue > 0;
                        okayTranscript = okayFpkm && okayTpm;
                    }

                    if (okayTranscript)
                    {
                        filteredFeatures.Add(feature);
                    }
                }
            }

            // Always replace the feature list, even when nothing was kept.
            chromISeq.Metadata["features"] = filteredFeatures;
        }
        formatter.Format(file, chromFeatures);
    }
}
/// <summary>
/// Round-trips a Gff file through the Gff formatter: parses the file referenced by the
/// given Xml node, formats it with the requested Format() overload, parses the temp
/// file again and compares features, ID and sequence data with the original.
/// </summary>
/// <param name="nodeName">Xml node name to read the input file path from.</param>
/// <param name="isFilePath">True to call the Format() overload that takes the temp file name.</param>
/// <param name="isSequenceList">True to format the whole sequence list, false to format only the first sequence.</param>
void ValidateFormatGeneralTestCases(string nodeName, bool isFilePath, bool isSequenceList)
{
    // Locate and parse the input file named in the Xml configuration.
    string inputPath = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.FilePathNode);
    Assert.IsTrue(File.Exists(inputPath));

    GffParser inputParser = new GffParser();
    IList<ISequence> parsedSequences = inputParser.Parse(inputPath).ToList();
    Sequence firstOriginal = (Sequence)parsedSequences[0];

    // Write the original sequence(s) with the overload selected by the two flags.
    ApplicationLog.WriteLine(string.Format("Gff Formatter BVT: Creating the Temp file '{0}'.", Constants.GffTempFileName));

    GffFormatter gffFormatter = new GffFormatter { ShouldWriteSequenceData = true };
    if (isFilePath && isSequenceList)
    {
        gffFormatter.Format(parsedSequences, Constants.GffTempFileName);
    }
    else if (isFilePath)
    {
        gffFormatter.Format(firstOriginal, Constants.GffTempFileName);
    }
    else if (isSequenceList)
    {
        gffFormatter.Format(parsedSequences);
    }
    else
    {
        gffFormatter.Format(firstOriginal);
    }

    // Parse what was written and validate its features.
    GffParser roundTripParser = new GffParser();
    IList<ISequence> roundTripped = roundTripParser.Parse(Constants.GffTempFileName).ToList();
    Assert.IsNotNull(roundTripped);

    ApplicationLog.WriteLine(string.Format("Gff Formatter BVT: New Sequence is '{0}'.", roundTripped[0].ToString()));

    bool featuresValid = ValidateFeatures(roundTripped[0], nodeName);
    Assert.IsTrue(featuresValid);
    ApplicationLog.WriteLine(
        "GFF Formatter BVT : All the features validated successfully.");

    // Exactly one sequence is expected, matching the original in ID and content.
    Assert.AreEqual(1, roundTripped.Count);
    ApplicationLog.WriteLine("The Number of sequences are matching.");

    Assert.AreEqual(firstOriginal.ID, roundTripped[0].ID);

    char[] originalChars = firstOriginal.Select(x => (char)x).ToArray();
    string originalText = new string(originalChars);

    ISequence firstRoundTripped = roundTripped.FirstOrDefault();
    char[] roundTrippedChars = firstRoundTripped.Select(x => (char)x).ToArray();
    string roundTrippedText = new string(roundTrippedChars);

    Assert.AreEqual(originalText, roundTrippedText);
    ApplicationLog.WriteLine(string.Format((IFormatProvider)null, "Gff Formatter BVT: The Gff sequences '{0}' are matching with Format() method.", roundTripped[0].ToString()));

    // Everything passed, so remove the temp file. After a failed Assert the file is
    // left in place on purpose so that it can be inspected while debugging.
    if (File.Exists(Constants.GffTempFileName))
    {
        File.Delete(Constants.GffTempFileName);
    }

    ApplicationLog.WriteLine("Deleted the temp file created.");
}
/// <summary>
/// General method to validate that the Gff formatter write methods raise an exception
/// when called without a target file, with a null sequence, or with an empty collection.
/// </summary>
/// <param name="method">Gff formatter scenario to exercise.</param>
void InvalidateGffWriteMethod(ArgumentNullExceptions method)
{
    ISequence sequence = null;
    List<ISequence> collection = new List<ISequence>();
    string sequenceData = null;
    GffFormatter gffFormatter = null;

    // Tracked outside the try block: the previous Assert.Fail() lived inside the try and
    // its failure exception could be swallowed by the catch-all handler below, so a
    // scenario that raised nothing was not reliably reported as a failure.
    bool exceptionThrown = false;

    try
    {
        switch (method)
        {
            case ArgumentNullExceptions.writeWithEmptyFile:
                sequenceData = utilityObj.xmlUtil.GetTextValue(
                    Constants.SimpleGffDnaNodeName,
                    Constants.ExpectedSequenceNode);
                gffFormatter = new GffFormatter();
                gffFormatter.Format(new Sequence(DnaAlphabet.Instance, sequenceData));
                break;
            case ArgumentNullExceptions.writeWithEmptySequence:
                gffFormatter = new GffFormatter();
                gffFormatter.Format(sequence);
                break;
            case ArgumentNullExceptions.FormatString:
                gffFormatter = new GffFormatter();
                gffFormatter.FormatString(sequence);
                break;
            case ArgumentNullExceptions.writeCollectionWithEmptyFile:
                sequenceData = utilityObj.xmlUtil.GetTextValue(
                    Constants.SimpleGffDnaNodeName,
                    Constants.ExpectedSequenceNode);
                collection.Add(new Sequence(DnaAlphabet.Instance, sequenceData));
                gffFormatter = new GffFormatter();
                gffFormatter.Format(collection);
                break;
            case ArgumentNullExceptions.writeCollectionWithEmptySequence:
                gffFormatter = new GffFormatter();
                gffFormatter.Format(collection);
                break;
            default:
                break;
        }
    }
    catch (Exception)
    {
        // ArgumentNullException is the expected type, but, as before, any exception
        // raised by the formatter call is accepted as a successful validation.
        ApplicationLog.WriteLine("GFF P2 : Exception is validated successfully.");
        exceptionThrown = true;
    }

    if (!exceptionThrown)
    {
        Assert.Fail("GFF P2 : Expected the Gff formatter call to throw an exception, but none was thrown.");
    }
}