/// <summary>
/// Filters out GTF or GFF entries that lack strand information, optionally drops transcripts
/// (and the features listed after them) whose StringTie FPKM or TPM estimate is not positive,
/// and finally adds CDS annotations derived from a reference gene model.
/// The filtered entries are written next to the input as ".filtered.gtf", and the model with
/// CDS added is written next to that file as ".filtered.withcds.gtf".
/// </summary>
/// <param name="gtfPath">Path of the GTF or GFF file to filter.</param>
/// <param name="referenceGenomePath">Path handed to the <c>Genome</c> constructor.</param>
/// <param name="referenceGeneModelPath">Path of the reference gene model passed to <c>CreateCDSFromAnnotatedStartCodons</c>.</param>
/// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">
/// When true, a transcript is kept only if both its FPKM and TPM attributes parse to a value greater than zero.
/// </param>
public static void FilterGtfEntriesWithoutStrand(string gtfPath, string referenceGenomePath, string referenceGeneModelPath, bool filterEntriesWithZeroAbundanceStringtieEstimates = false)
{
    // Abundance values in a GTF are machine-written with '.' as the decimal separator, so they
    // must be parsed culture-invariantly; the styles match the default of double.TryParse(string, out double).
    const System.Globalization.NumberStyles abundanceStyles =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

    var chromFeatures = GeneModel.SimplerParse(gtfPath);
    string filteredGtfPath = Path.Combine(Path.GetDirectoryName(gtfPath), Path.GetFileNameWithoutExtension(gtfPath) + ".filtered.gtf");
    using (var file = File.Create(filteredGtfPath))
    {
        var formatter = new GffFormatter();
        foreach (var chromISeq in chromFeatures)
        {
            var filteredFeatures = new List<MetadataListItem<List<string>>>();

            // Pattern match instead of an unchecked 'as' cast, so metadata of an unexpected type
            // yields an empty feature list rather than a NullReferenceException.
            if (chromISeq.Metadata.TryGetValue("features", out object featuresObj)
                && featuresObj is List<MetadataListItem<List<string>>> features)
            {
                // Verdict of the most recent transcript; features that follow it share that verdict.
                bool okayTranscript = false;
                foreach (var feature in features)
                {
                    bool isTranscript = feature.Key == "transcript";

                    if (!feature.SubItems.ContainsKey("strand"))
                    {
                        if (isTranscript)
                        {
                            // A strandless transcript is rejected, so the features after it must
                            // not inherit the verdict of the previous transcript.
                            okayTranscript = false;
                        }
                        continue;
                    }

                    if (isTranscript)
                    {
                        if (filterEntriesWithZeroAbundanceStringtieEstimates)
                        {
                            var attributes = GeneModel.SplitAttributes(feature.FreeText);
                            bool okayFpkm = attributes.TryGetValue("FPKM", out string fpkm)
                                && double.TryParse(fpkm, abundanceStyles, invariantCulture, out double fpkmValue)
                                && fpkmValue > 0;
                            bool okayTpm = attributes.TryGetValue("TPM", out string tpm)
                                && double.TryParse(tpm, abundanceStyles, invariantCulture, out double tpmValue)
                                && tpmValue > 0;
                            okayTranscript = okayFpkm && okayTpm;
                        }
                        else
                        {
                            okayTranscript = true;
                        }
                    }

                    if (okayTranscript)
                    {
                        filteredFeatures.Add(feature);
                    }
                }
            }

            // Always replace the entry, even when the sequence had no "features" metadata.
            chromISeq.Metadata["features"] = filteredFeatures;
        }
        formatter.Format(file, chromFeatures);
    }

    // Reload the filtered annotation and add CDS using the reference gene model.
    Genome ensemblGenome = new Genome(referenceGenomePath);
    GeneModel newGeneModel = new GeneModel(ensemblGenome, filteredGtfPath);
    GeneModel referenceGeneModel = new GeneModel(ensemblGenome, referenceGeneModelPath);
    newGeneModel.CreateCDSFromAnnotatedStartCodons(referenceGeneModel);

    string filteredGtfWithCdsPath = Path.Combine(Path.GetDirectoryName(filteredGtfPath), Path.GetFileNameWithoutExtension(filteredGtfPath) + ".withcds.gtf");
    newGeneModel.PrintToGTF(filteredGtfWithCdsPath);
}
/// <summary>
/// Filters out GTF or GFF entries that lack strand information and writes the remaining
/// entries to <paramref name="gtfOutPath"/>. Optionally also drops transcripts (and the
/// features listed after them) whose StringTie FPKM or TPM estimate is not positive.
/// </summary>
/// <param name="gtfPath">Path of the GTF or GFF file to filter.</param>
/// <param name="gtfOutPath">Path of the filtered file to create; an existing file is overwritten.</param>
/// <param name="filterEntriesWithZeroAbundanceStringtieEstimates">
/// When true, a transcript is kept only if both its FPKM and TPM attributes parse to a value greater than zero.
/// </param>
public void FilterGtfEntriesWithoutStrand(string gtfPath, string gtfOutPath, bool filterEntriesWithZeroAbundanceStringtieEstimates)
{
    // Abundance values in a GTF are machine-written with '.' as the decimal separator, so they
    // must be parsed culture-invariantly; the styles match the default of double.TryParse(string, out double).
    const System.Globalization.NumberStyles abundanceStyles =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

    var chromFeatures = GeneModel.SimplerParse(gtfPath);
    using (var file = File.Create(gtfOutPath))
    {
        var formatter = new GffFormatter();
        foreach (var chromISeq in chromFeatures)
        {
            var filteredFeatures = new List<MetadataListItem<List<string>>>();

            // Pattern match instead of an unchecked 'as' cast, so metadata of an unexpected type
            // yields an empty feature list rather than a NullReferenceException.
            if (chromISeq.Metadata.TryGetValue("features", out object featuresObj)
                && featuresObj is List<MetadataListItem<List<string>>> features)
            {
                // Verdict of the most recent transcript; features that follow it share that verdict.
                bool okayTranscript = false;
                foreach (var feature in features)
                {
                    bool isTranscript = feature.Key == "transcript";

                    if (!feature.SubItems.ContainsKey("strand"))
                    {
                        if (isTranscript)
                        {
                            // A strandless transcript is rejected, so the features after it must
                            // not inherit the verdict of the previous transcript.
                            okayTranscript = false;
                        }
                        continue;
                    }

                    if (isTranscript)
                    {
                        if (filterEntriesWithZeroAbundanceStringtieEstimates)
                        {
                            var attributes = GeneModel.SplitAttributes(feature.FreeText);
                            bool okayFpkm = attributes.TryGetValue("FPKM", out string fpkm)
                                && double.TryParse(fpkm, abundanceStyles, invariantCulture, out double fpkmValue)
                                && fpkmValue > 0;
                            bool okayTpm = attributes.TryGetValue("TPM", out string tpm)
                                && double.TryParse(tpm, abundanceStyles, invariantCulture, out double tpmValue)
                                && tpmValue > 0;
                            okayTranscript = okayFpkm && okayTpm;
                        }
                        else
                        {
                            okayTranscript = true;
                        }
                    }

                    if (okayTranscript)
                    {
                        filteredFeatures.Add(feature);
                    }
                }
            }

            // Always replace the entry, even when the sequence had no "features" metadata.
            chromISeq.Metadata["features"] = filteredFeatures;
        }
        formatter.Format(file, chromFeatures);
    }
}