/// <summary>
/// Reads gene model features into data structures contained within this library.
/// </summary>
/// <remarks>
/// Every sequence yielded by SimplerParse is matched to a chromosome in Genome.Chromosomes by
/// comparing the chromosome's FriendlyName with the sequence ID. Sequences without a matching
/// chromosome, or without a "features" metadata list of the expected type, are skipped.
/// A feature whose free text contains '=' is processed as GFF3; any other feature is processed as GTF.
/// Features whose "start" or "end" value does not parse as an integer are skipped.
/// Once all features are read, the last open transcript is finalized, intergenic regions are
/// created and the genome forest is built.
/// </remarks>
/// <param name="geneModelFile">Gene model file passed on to SimplerParse.</param>
public void ReadGeneFeatures(string geneModelFile)
{
    foreach (ISequence chromFeatures in SimplerParse(geneModelFile))
    {
        Chromosome chrom = Genome.Chromosomes.FirstOrDefault(x => x.FriendlyName == chromFeatures.ID);
        if (chrom == null)
        {
            continue;
        }

        // The metadata entry may be missing or hold another type; in both cases there is nothing to read.
        if (!chromFeatures.Metadata.TryGetValue("features", out object f)
            || !(f is List<MetadataListItem<List<string>>> features))
        {
            continue;
        }

        foreach (MetadataListItem<List<string>> feature in features)
        {
            // Do not let an unparseable coordinate silently become 0 in one-based coordinate math.
            if (!long.TryParse(feature.SubItems["start"][0], out long start)
                || !long.TryParse(feature.SubItems["end"][0], out long end))
            {
                continue;
            }

            var attributes = SplitAttributes(feature.FreeText);

            // GFF3 attributes are written as key=value, which is how the two formats are told apart here.
            if (feature.FreeText.Contains('='))
            {
                ProcessGff3Feature(feature, start, end, chrom, attributes);
            }
            else
            {
                ProcessGtfFeature(feature, start, end, chrom, attributes);
            }
        }
    }

    // The per-feature processors only finalize a transcript when the next one starts,
    // so the last transcript read is still open at this point.
    if (currentTranscript != null)
    {
        Transcript.SetRegions(currentTranscript);
        currentTranscript.FrameCorrection();
    }

    CreateIntergenicRegions();

    // possibly check transcript sanity here with Parallel.ForEach(Genes.SelectMany(g => g.Transcripts).ToList(), t => t.SanityCheck());
    GenomeForest.Build();
}
/// <summary>
/// Processes a feature from a GTF gene model file.
/// </summary>
/// <remarks>
/// Only features whose Key is "transcript", "exon" or "CDS" have any effect. A feature without a
/// "strand" sub-item is ignored. A leading "gene:" on the gene ID and a leading "transcript:" on
/// the transcript and protein IDs are removed; the gene and transcript IDs are also rewritten in
/// the feature's free text. A new gene or transcript is started when none is open or when the
/// feature names a different ID than the open one; the previously open transcript is finalized
/// (SetRegions and FrameCorrection) before it is replaced.
/// </remarks>
/// <param name="feature">Parsed feature; its Key selects the handling and its SubItems supply "source", "strand" and "frame".</param>
/// <param name="oneBasedStart">One-based start coordinate of the feature.</param>
/// <param name="oneBasedEnd">One-based end coordinate of the feature (the exon length is taken as end - start + 1).</param>
/// <param name="chrom">Chromosome the feature lies on; its Sequence supplies exon DNA.</param>
/// <param name="attributes">Attribute key/value pairs of the feature ("gene_id", "transcript_id", "protein_id" are read).</param>
public void ProcessGtfFeature(MetadataListItem<List<string>> feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary<string, string> attributes)
{
    bool hasGeneId = attributes.TryGetValue("gene_id", out string geneId);
    bool hasTranscriptId = attributes.TryGetValue("transcript_id", out string transcriptId);
    bool hasProteinId = attributes.TryGetValue("protein_id", out string proteinId);

    bool hasSource = feature.SubItems.TryGetValue("source", out List<string> sourceish);
    bool hasStrand = feature.SubItems.TryGetValue("strand", out List<string> strandish);
    bool hasFrame = feature.SubItems.TryGetValue("frame", out List<string> framey);

    string source = hasSource ? sourceish[0] : "";

    // Strand is required to do anything in this program.
    if (!hasStrand)
    {
        return;
    }

    string strand = strandish[0];

    // Frame stays 0 when the column is absent or is not an integer.
    int frame = 0;
    if (hasFrame)
    {
        int.TryParse(framey[0], out frame);
    }

    // Trim prefixes from the IDs. Strings are immutable, so the result of Replace has to be
    // assigned back for the feature's free text to actually carry the trimmed ID.
    const string genePrefix = "gene:";
    const string transcriptPrefix = "transcript:";
    if (hasGeneId && geneId.StartsWith(genePrefix, System.StringComparison.Ordinal))
    {
        string trimmedGeneId = geneId.Substring(genePrefix.Length);
        feature.FreeText = feature.FreeText.Replace(geneId, trimmedGeneId);
        geneId = trimmedGeneId;
    }
    if (hasTranscriptId && transcriptId.StartsWith(transcriptPrefix, System.StringComparison.Ordinal))
    {
        string trimmedTranscriptId = transcriptId.Substring(transcriptPrefix.Length);
        feature.FreeText = feature.FreeText.Replace(transcriptId, trimmedTranscriptId);
        transcriptId = trimmedTranscriptId;
    }
    if (hasProteinId && proteinId.StartsWith(transcriptPrefix, System.StringComparison.Ordinal))
    {
        // The transcript ID is used for the protein ID sometimes.
        proteinId = proteinId.Substring(transcriptPrefix.Length);
    }

    string key = feature.Key;
    bool isTranscript = key == "transcript";
    bool isExon = key == "exon";
    bool isCds = key == "CDS";
    if (!isTranscript && !isExon && !isCds)
    {
        return;
    }

    bool startsNewTranscript = currentTranscript == null || hasTranscriptId && transcriptId != currentTranscript.ID;

    // Transcript features are caught before they go by, if available (i.e. if the file doesn't
    // just have exons); a transcript line repeating the open transcript changes nothing.
    if (isTranscript && !startsNewTranscript)
    {
        return;
    }

    if (currentGene == null || hasGeneId && geneId != currentGene.ID)
    {
        currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
        Genes.Add(currentGene);
        GenomeForest.Add(currentGene);
    }

    if (startsNewTranscript)
    {
        // Finalize the transcript being replaced, whether the switch was triggered by a
        // transcript line or by an exon/CDS line.
        if (currentTranscript != null)
        {
            Transcript.SetRegions(currentTranscript);
            currentTranscript.FrameCorrection();
        }

        currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
        currentGene.Transcripts.Add(currentTranscript);
        GenomeForest.Add(currentTranscript);
    }

    if (isExon)
    {
        // GetSubSequence is given a zero-based start, hence the -1 on the one-based coordinate.
        ISequence exon_dna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);
        Exon exon = new Exon(
            currentTranscript,
            currentTranscript.IsStrandPlus() ? exon_dna : exon_dna.GetReverseComplementedSequence(),
            source,
            oneBasedStart,
            oneBasedEnd,
            chrom.Sequence.ID,
            strand,
            null,
            feature);
        if (exon.Length() > 0)
        {
            currentTranscript.Exons.Add(exon);
        }
    }
    else if (isCds)
    {
        CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);
        if (hasProteinId)
        {
            currentTranscript.ProteinID = proteinId;
        }
        if (cds.Length() > 0)
        {
            currentTranscript.CodingDomainSequences.Add(cds);
        }
    }
}
/// <summary>
/// Processes a feature from a GFF3 gene model file.
/// </summary>
/// <remarks>
/// The kind of feature is inferred from its attributes: "gene_id" starts a new gene,
/// "transcript_id" starts a new transcript (finalizing the previously open one with SetRegions
/// and FrameCorrection), "exon_id" adds an exon and, failing that, "protein_id" adds a coding
/// domain sequence. A feature without a "strand" sub-item is ignored. Features that cannot be
/// attached are also ignored: a transcript seen before any gene, or an exon/CDS seen before
/// any transcript.
/// </remarks>
/// <param name="feature">Parsed feature; its SubItems supply "source", "strand" and "frame".</param>
/// <param name="oneBasedStart">One-based start coordinate of the feature.</param>
/// <param name="oneBasedEnd">One-based end coordinate of the feature (the exon length is taken as end - start + 1).</param>
/// <param name="chrom">Chromosome the feature lies on; its Sequence supplies exon DNA.</param>
/// <param name="attributes">Attribute key/value pairs of the feature ("gene_id", "transcript_id", "exon_id", "protein_id" are read).</param>
public void ProcessGff3Feature(MetadataListItem<List<string>> feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary<string, string> attributes)
{
    bool hasGeneId = attributes.TryGetValue("gene_id", out string geneId);
    bool hasTranscriptId = attributes.TryGetValue("transcript_id", out string transcriptId);
    bool hasExonId = attributes.ContainsKey("exon_id"); // only the presence of the key matters
    bool hasProteinId = attributes.TryGetValue("protein_id", out string proteinId);

    // Each of these is false if the column is empty ("." in GFF format).
    bool hasSource = feature.SubItems.TryGetValue("source", out List<string> sourceish);
    bool hasStrand = feature.SubItems.TryGetValue("strand", out List<string> strandish);
    bool hasFrame = feature.SubItems.TryGetValue("frame", out List<string> framey);

    string source = hasSource ? sourceish[0] : "";

    // Strand is required to do anything in this program.
    if (!hasStrand)
    {
        return;
    }

    string strand = strandish[0];

    // Frame stays 0 when the column is empty or is not an integer.
    int frame = 0;
    if (hasFrame)
    {
        int.TryParse(framey[0], out frame);
    }

    if (hasGeneId && (currentGene == null || geneId != currentGene.ID))
    {
        currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
        Genes.Add(currentGene);
        GenomeForest.Add(currentGene);
    }

    if (hasTranscriptId && (currentTranscript == null || transcriptId != currentTranscript.ID))
    {
        // A transcript that precedes every gene has nothing to be attached to.
        if (currentGene == null)
        {
            return;
        }

        // Finalize the transcript being replaced before opening the next one.
        if (currentTranscript != null)
        {
            Transcript.SetRegions(currentTranscript);
            currentTranscript.FrameCorrection();
        }

        currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
        currentGene.Transcripts.Add(currentTranscript);
        GenomeForest.Add(currentTranscript);
    }

    if (!hasExonId && !hasProteinId)
    {
        return; // nothing to do
    }

    // An exon or CDS that precedes every transcript has nothing to be attached to.
    if (currentTranscript == null)
    {
        return;
    }

    if (hasExonId)
    {
        // GetSubSequence is given a zero-based start, hence the -1 on the one-based coordinate.
        // chrom is dereferenced here, so no null fallback is needed for the chromosome ID below.
        ISequence exon_dna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);
        Exon exon = new Exon(
            currentTranscript,
            currentTranscript.IsStrandPlus() ? exon_dna : exon_dna.GetReverseComplementedSequence(),
            source,
            oneBasedStart,
            oneBasedEnd,
            chrom.ChromosomeID,
            strand,
            null,
            feature);
        if (exon.Length() > 0)
        {
            currentTranscript.Exons.Add(exon);
        }
    }
    else
    {
        CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);
        if (cds.Length() > 0)
        {
            currentTranscript.CodingDomainSequences.Add(cds);
            currentTranscript.ProteinID = proteinId;
        }
    }
}