/// <summary>
/// Builds the metadata item that describes a CDS feature. The key is the CDS feature type,
/// the free text is the exon's GTF attributes followed by a protein_id attribute, and the
/// sub-items hold source, start, end and (when the strand is not ".") strand.
/// </summary>
/// <param name="cds">Coding segment to describe; its parent is read as a <see cref="Transcript"/> to obtain the protein ID.</param>
/// <param name="exon">Exon whose GTF attributes form the start of the free text.</param>
/// <returns>The populated metadata item.</returns>
private static MetadataListItem<List<string>> CDSFeatureMetadata(CDS cds, Exon exon)
{
    // Free text = exon attributes + the protein ID taken from the parent transcript.
    var exonAttributes = exon.GetGtfAttributes();
    var owner = cds.Parent as Transcript;
    string freeText = exonAttributes + " protein_id \"" + owner.ProteinID + "\";";

    var item = new MetadataListItem<List<string>>(cds.FeatureType, freeText);
    var fields = item.SubItems;

    fields["source"] = new List<string> { cds.Source.ToString() };
    fields["start"] = new List<string> { cds.OneBasedStart.ToString() };
    fields["end"] = new List<string> { cds.OneBasedEnd.ToString() };

    // might take in features without strand later on
    if (cds.Strand != ".")
    {
        fields["strand"] = new List<string> { cds.Strand.ToString() };
    }

    return item;
}
/// <summary>
/// Processes a feature from a GTF gene model file.
/// Features with key "transcript", "exon" or "CDS" update the current gene and transcript;
/// any other key is ignored, as is any feature without a strand.
/// </summary>
/// <param name="feature">Parsed feature; its Key selects the handling, and its SubItems supply "source", "strand" and "frame".</param>
/// <param name="oneBasedStart">One-based start coordinate of the feature.</param>
/// <param name="oneBasedEnd">One-based end coordinate of the feature.</param>
/// <param name="chrom">Chromosome the feature lies on; its sequence supplies the exon bases.</param>
/// <param name="attributes">Attribute key/value pairs of the feature; "gene_id", "transcript_id" and "protein_id" are read.</param>
public void ProcessGtfFeature(MetadataListItem<List<string>> feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary<string, string> attributes)
{
    bool hasGeneId = attributes.TryGetValue("gene_id", out string geneId);
    bool hasTranscriptId = attributes.TryGetValue("transcript_id", out string transcriptId);
    bool hasProteinId = attributes.TryGetValue("protein_id", out string proteinId);

    bool hasSource = feature.SubItems.TryGetValue("source", out List<string> sourceish);
    bool hasStrand = feature.SubItems.TryGetValue("strand", out List<string> strandish);
    bool hasFrame = feature.SubItems.TryGetValue("frame", out List<string> framey);

    string source = hasSource ? sourceish[0] : "";

    // Strand is required to do anything in this program.
    if (!hasStrand)
    {
        return;
    }

    string strand = strandish[0];

    // frame stays 0 when the value is absent or is not an integer
    int frame = 0;
    if (hasFrame)
    {
        int.TryParse(framey[0], out frame);
    }

    // Trim prefixes from the IDs. Strings are immutable, so the result of Replace has to be
    // assigned back for the feature's free text to actually change.
    string genePrefix = "gene:";
    string transcriptPrefix = "transcript:";
    if (hasGeneId && geneId.StartsWith(genePrefix, System.StringComparison.Ordinal))
    {
        string newGeneId = geneId.Substring(genePrefix.Length);
        if (feature.FreeText != null)
        {
            feature.FreeText = feature.FreeText.Replace(geneId, newGeneId);
        }
        geneId = newGeneId;
    }
    if (hasTranscriptId && transcriptId.StartsWith(transcriptPrefix, System.StringComparison.Ordinal))
    {
        string newTranscriptId = transcriptId.Substring(transcriptPrefix.Length);
        if (feature.FreeText != null)
        {
            feature.FreeText = feature.FreeText.Replace(transcriptId, newTranscriptId);
        }
        transcriptId = newTranscriptId;
    }
    if (hasProteinId && proteinId.StartsWith(transcriptPrefix, System.StringComparison.Ordinal))
    {
        // transcript id is used for protein id sometimes
        proteinId = proteinId.Substring(transcriptPrefix.Length);
    }

    // Makes a new gene from this feature and registers it as the current gene.
    void StartGene()
    {
        currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
        Genes.Add(currentGene);
        GenomeForest.Add(currentGene);
    }

    // Makes a new transcript under the current gene and registers it as the current transcript.
    void StartTranscript()
    {
        currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
        currentGene.Transcripts.Add(currentTranscript);
        GenomeForest.Add(currentTranscript);
    }

    // Catch the transcript features before they go by if available,
    // i.e. if the file doesn't just have exons
    if (feature.Key == "transcript" && (currentTranscript == null || hasTranscriptId && transcriptId != currentTranscript.ID))
    {
        if (currentGene == null || hasGeneId && geneId != currentGene.ID)
        {
            StartGene();
        }

        // NOTE(review): unlike the exon/CDS path below, the previous transcript is not passed to
        // SetRegions/FrameCorrection here -- confirm it is finalized elsewhere.
        StartTranscript();
    }

    if (feature.Key == "exon" || feature.Key == "CDS")
    {
        if (currentGene == null || hasGeneId && geneId != currentGene.ID)
        {
            StartGene();
        }

        if (currentTranscript == null || hasTranscriptId && transcriptId != currentTranscript.ID)
        {
            // Finish the previous transcript before moving on to the new one.
            if (currentTranscript != null)
            {
                Transcript.SetRegions(currentTranscript);
                currentTranscript.FrameCorrection();
            }

            StartTranscript();
        }

        if (feature.Key == "exon")
        {
            // GetSubSequence takes a zero-based start, hence the -1.
            ISequence exon_dna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);
            Exon exon = new Exon(
                currentTranscript,
                currentTranscript.IsStrandPlus() ? exon_dna : exon_dna.GetReverseComplementedSequence(),
                source, oneBasedStart, oneBasedEnd, chrom.Sequence.ID, strand, null, feature);
            if (exon.Length() > 0)
            {
                currentTranscript.Exons.Add(exon);
            }
        }
        else if (feature.Key == "CDS")
        {
            CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);
            if (hasProteinId)
            {
                currentTranscript.ProteinID = proteinId;
            }
            if (cds.Length() > 0)
            {
                currentTranscript.CodingDomainSequences.Add(cds);
            }
        }
    }
}
/// <summary>
/// Processes a feature from a GFF3 gene model file.
/// A feature carrying "gene_id" may start a new gene and one carrying "transcript_id" may start
/// a new transcript. A feature carrying "exon_id" adds an exon to the current transcript;
/// otherwise one carrying "protein_id" adds a CDS. Features without a strand are ignored, as
/// are exon/CDS features seen before any transcript and transcripts seen before any gene.
/// </summary>
/// <param name="feature">Parsed feature; its SubItems supply "source", "strand" and "frame".</param>
/// <param name="oneBasedStart">One-based start coordinate of the feature.</param>
/// <param name="oneBasedEnd">One-based end coordinate of the feature.</param>
/// <param name="chrom">Chromosome the feature lies on; its sequence supplies the exon bases.</param>
/// <param name="attributes">Attribute key/value pairs of the feature; "gene_id", "transcript_id", "exon_id" and "protein_id" are read.</param>
public void ProcessGff3Feature(MetadataListItem<List<string>> feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary<string, string> attributes)
{
    bool hasGeneId = attributes.TryGetValue("gene_id", out string geneId);
    bool hasTranscriptId = attributes.TryGetValue("transcript_id", out string transcriptId);
    bool hasExonId = attributes.TryGetValue("exon_id", out string exonId);
    bool hasProteinId = attributes.TryGetValue("protein_id", out string proteinId);

    // Each of these is false if the column is empty ("." in GFF format).
    bool hasSource = feature.SubItems.TryGetValue("source", out List<string> sourceish);
    bool hasStrand = feature.SubItems.TryGetValue("strand", out List<string> strandish);
    bool hasFrame = feature.SubItems.TryGetValue("frame", out List<string> framey);

    string source = hasSource ? sourceish[0] : "";

    // Strand is required to do anything in this program.
    if (!hasStrand)
    {
        return;
    }

    string strand = strandish[0];

    // frame stays 0 when the value is absent or is not an integer
    int frame = 0;
    if (hasFrame)
    {
        int.TryParse(framey[0], out frame);
    }

    if (hasGeneId && (currentGene == null || geneId != currentGene.ID))
    {
        currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
        Genes.Add(currentGene);
        GenomeForest.Add(currentGene);
    }

    // A transcript needs a gene to attach to; without one it is skipped rather than
    // dereferencing a null currentGene.
    if (hasTranscriptId && currentGene != null && (currentTranscript == null || transcriptId != currentTranscript.ID))
    {
        // Finish the previous transcript before moving on to the new one.
        if (currentTranscript != null)
        {
            Transcript.SetRegions(currentTranscript);
            currentTranscript.FrameCorrection();
        }

        currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
        currentGene.Transcripts.Add(currentTranscript);
        GenomeForest.Add(currentTranscript);
    }

    // Exons and coding segments are attached to the current transcript; nothing to attach to
    // means nothing to do.
    if (currentTranscript == null)
    {
        return;
    }

    if (hasExonId)
    {
        // GetSubSequence takes a zero-based start, hence the -1.
        ISequence exon_dna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);
        Exon exon = new Exon(
            currentTranscript,
            currentTranscript.IsStrandPlus() ? exon_dna : exon_dna.GetReverseComplementedSequence(),
            source, oneBasedStart, oneBasedEnd, chrom.ChromosomeID, strand, null, feature);
        if (exon.Length() > 0)
        {
            currentTranscript.Exons.Add(exon);
        }
    }
    else if (hasProteinId)
    {
        CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);
        if (cds.Length() > 0)
        {
            currentTranscript.CodingDomainSequences.Add(cds);
            currentTranscript.ProteinID = proteinId;
        }
    }
}
/// <summary>
/// Creates coding domains based on another annotated transcript.
/// The coding start is taken from the first strand-sorted CDS of <paramref name="withCDS"/>.
/// This transcript's spliced RNA is translated from that position to find the stop codon, and
/// CDS segments covering start through stop (inclusive) are added to this transcript.
/// </summary>
/// <param name="withCDS">Transcript whose annotated coding start is reused; may be null.</param>
/// <returns>
/// true if this transcript was annotated; false if the transcript with CDS did not lead to an
/// annotation (null input, no CDS on the input, coding start outside this transcript's exons,
/// or no stop codon found).
/// </returns>
public bool CreateCDSFromAnnotatedStartCodons(Transcript withCDS)
{
    // Nothing to do if null input
    if (withCDS == null)
    {
        return false;
    }

    // A transcript without any CDS has no start codon to borrow.
    CDS firstCds = withCDS.CdsSortedStrand.FirstOrDefault();
    if (firstCds == null)
    {
        return false;
    }

    bool isPlusStrand = IsStrandPlus();

    // Figure out the start position
    long cdsStartInChrom = isPlusStrand ? firstCds.OneBasedStart : firstCds.OneBasedEnd;
    long cdsStartInMrna = BaseNumber2MRnaPos(cdsStartInChrom);
    if (cdsStartInMrna < 0)
    {
        // the coding start wasn't within any of the exons of this transcript
        return false;
    }

    // Figure out the stop codon from translation
    ISequence spliced = SplicedRNA();
    ISequence translateThis = spliced.GetSubSequence(cdsStartInMrna, spliced.Count - cdsStartInMrna);
    ISequence proteinSequence = Translation.OneFrameTranslation(translateThis, Gene.Chromosome.Mitochondrial);
    int stopIdx = proteinSequence.ToList().IndexOf(Alphabets.Protein.Ter);
    if (stopIdx < 0)
    {
        // no stop codon in sight
        return false;
    }

    // include the stop codon in CDS
    long endInMrna = cdsStartInMrna + (stopIdx + 1) * GeneModel.CODON_SIZE - 1;
    long lengthInMrna = endInMrna - cdsStartInMrna + 1;

    // Figure out the stop index on the chromosome: remove everything upstream of the coding
    // start from the exons, then walk the remaining pieces in strand order until the coding
    // length is used up.
    long utr5ishstart = isPlusStrand ? Exons.Min(x => x.OneBasedStart) : cdsStartInChrom + 1;
    long utr5ishend = isPlusStrand ? cdsStartInChrom - 1 : Exons.Max(x => x.OneBasedEnd);
    Interval utr5ish = new Interval(null, "", Source, Strand, utr5ishstart, utr5ishend);
    var intervals = SortedStrand(Exons.SelectMany(x => x.Minus(utr5ish)).ToList());

    long lengthSoFar = 0;
    foreach (Interval y in intervals)
    {
        long lengthSum = lengthSoFar + y.Length();
        if (lengthSum <= lengthInMrna)
        {
            // add this whole interval
            var toAdd = new CDS(this, ChromosomeID, Source, Strand, y.OneBasedStart, y.OneBasedEnd, 0);
            CodingDomainSequences.Add(toAdd);
            lengthSoFar += toAdd.Length();
        }
        else if (lengthSoFar < lengthInMrna)
        {
            // chop off part of this interval: the excess is removed from the high-coordinate
            // end on the plus strand and from the low-coordinate end otherwise
            long chopLength = lengthSum - lengthInMrna;
            long start = isPlusStrand ? y.OneBasedStart : y.OneBasedStart + chopLength;
            long end = isPlusStrand ? y.OneBasedEnd - chopLength : y.OneBasedEnd;
            var toAdd = new CDS(this, ChromosomeID, Source, Strand, start, end, 0);
            CodingDomainSequences.Add(toAdd);
            lengthSoFar += toAdd.Length();
        }
    }

    SetRegions(this);
    return true;
}
/// <summary>
/// Creates a CDS from an existing one by forwarding it to the base-class constructor;
/// what is copied is determined by that base constructor.
/// </summary>
/// <param name="cds">The existing CDS handed to the base constructor.</param>
public CDS(CDS cds)
    : base(cds)
{
}