예제 #1
0
        /// <summary>
        /// Builds the metadata item describing a CDS feature, using the GTF attributes of the given exon
        /// with a protein_id attribute appended.
        /// </summary>
        /// <param name="cds">Coding domain whose feature type, source, coordinates and strand are recorded.</param>
        /// <param name="exon">Exon whose GTF attribute string forms the start of the free text.</param>
        /// <returns>A metadata item with "source", "start" and "end" sub-items, plus "strand" unless the strand is ".".</returns>
        private static MetadataListItem<List<string>> CDSFeatureMetadata(CDS cds, Exon exon)
        {
            // The parent of the CDS is treated as a Transcript; its ProteinID goes into the attribute text.
            var exonAttributes = exon.GetGtfAttributes();
            var proteinId      = (cds.Parent as Transcript).ProteinID;
            string attributeText = exonAttributes + " protein_id \"" + proteinId + "\";";

            var item = new MetadataListItem<List<string>>(cds.FeatureType, attributeText);

            item.SubItems["source"] = new List<string> { cds.Source.ToString() };
            item.SubItems["start"]  = new List<string> { cds.OneBasedStart.ToString() };
            item.SubItems["end"]    = new List<string> { cds.OneBasedEnd.ToString() };

            // "." marks an unspecified strand; in that case no strand sub-item is written
            // (features without strand might be taken in later on).
            if (cds.Strand != ".")
            {
                item.SubItems["strand"] = new List<string> { cds.Strand.ToString() };
            }

            return item;
        }
예제 #2
0
        /// <summary>
        /// Processes a feature from a GTF gene model file.
        /// </summary>
        /// <remarks>
        /// Features without a "strand" sub-item are ignored. "transcript", "exon" and "CDS" features update
        /// <c>currentGene</c> / <c>currentTranscript</c> and register new genes and transcripts in
        /// <c>Genes</c> and <c>GenomeForest</c>; any other feature key is ignored.
        /// </remarks>
        /// <param name="feature">Parsed feature; its Key selects the handling and its SubItems supply "source", "strand" and "frame".</param>
        /// <param name="oneBasedStart">One-based start coordinate of the feature.</param>
        /// <param name="oneBasedEnd">One-based, inclusive end coordinate of the feature.</param>
        /// <param name="chrom">Chromosome the feature lies on; its sequence supplies exon DNA.</param>
        /// <param name="attributes">Attribute key/value pairs; gene_id, transcript_id and protein_id are consulted.</param>
        public void ProcessGtfFeature(MetadataListItem<List<string>> feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary<string, string> attributes)
        {
            bool hasGeneId       = attributes.TryGetValue("gene_id", out string geneId);
            bool hasTranscriptId = attributes.TryGetValue("transcript_id", out string transcriptId);
            bool hasProteinId    = attributes.TryGetValue("protein_id", out string proteinId);
            bool hasSource       = feature.SubItems.TryGetValue("source", out List<string> sourceish);
            bool hasStrand       = feature.SubItems.TryGetValue("strand", out List<string> strandish);
            bool hasFrame        = feature.SubItems.TryGetValue("frame", out List<string> framey);

            string source = hasSource ? sourceish[0] : "";

            if (!hasStrand)
            {
                return; // strand is required to do anything in this program
            }
            string strand = strandish[0];
            int    frame  = 0;

            if (hasFrame)
            {
                // An unparsable frame leaves frame at 0 (TryParse writes 0 on failure).
                int.TryParse(framey[0], out frame);
            }

            // Trim prefixes from the IDs. Ordinal comparison is used because Substring(prefix.Length)
            // below assumes an exact character-for-character prefix match.
            string genePrefix       = "gene:";
            string transcriptPrefix = "transcript:";

            if (hasGeneId && geneId.StartsWith(genePrefix, System.StringComparison.Ordinal))
            {
                string newGeneId = geneId.Substring(genePrefix.Length);
                // string.Replace returns a new string, so the result must be stored back
                // for the trimmed ID to appear in the feature's free text.
                feature.FreeText = feature.FreeText.Replace(geneId, newGeneId);
                geneId = newGeneId;
            }
            if (hasTranscriptId && transcriptId.StartsWith(transcriptPrefix, System.StringComparison.Ordinal))
            {
                string newTranscriptId = transcriptId.Substring(transcriptPrefix.Length);
                feature.FreeText = feature.FreeText.Replace(transcriptId, newTranscriptId);
                transcriptId = newTranscriptId;
            }
            if (hasProteinId && proteinId.StartsWith(transcriptPrefix, System.StringComparison.Ordinal))
            {
                proteinId = proteinId.Substring(transcriptPrefix.Length); // transcript id is used for protein id sometimes
            }

            // Catch the transcript features before they go by if available, i.e. if the file doesn't just have exons
            // NOTE(review): unlike the exon/CDS path below, this branch replaces currentTranscript without calling
            // SetRegions/FrameCorrection on the previous transcript -- confirm that it is finalized elsewhere.
            if (feature.Key == "transcript" && (currentTranscript == null || hasTranscriptId && transcriptId != currentTranscript.ID))
            {
                if (currentGene == null || hasGeneId && geneId != currentGene.ID)
                {
                    currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
                    Genes.Add(currentGene);
                    GenomeForest.Add(currentGene);
                }

                currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
                currentGene.Transcripts.Add(currentTranscript);
                GenomeForest.Add(currentTranscript);
            }

            if (feature.Key == "exon" || feature.Key == "CDS")
            {
                // Start a new gene when none is open or the gene_id changed.
                if (currentGene == null || hasGeneId && geneId != currentGene.ID)
                {
                    currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
                    Genes.Add(currentGene);
                    GenomeForest.Add(currentGene);
                }

                // Start a new transcript when none is open or the transcript_id changed,
                // finalizing the previous transcript first.
                if (currentTranscript == null || hasTranscriptId && transcriptId != currentTranscript.ID)
                {
                    if (currentTranscript != null)
                    {
                        Transcript.SetRegions(currentTranscript);
                        currentTranscript.FrameCorrection();
                    }
                    currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
                    currentGene.Transcripts.Add(currentTranscript);
                    GenomeForest.Add(currentTranscript);
                }

                if (feature.Key == "exon")
                {
                    // Zero-based offset and inclusive length into the chromosome sequence;
                    // the DNA is reverse-complemented unless the transcript is on the plus strand.
                    ISequence exon_dna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);
                    Exon      exon     = new Exon(currentTranscript, currentTranscript.IsStrandPlus() ? exon_dna : exon_dna.GetReverseComplementedSequence(),
                                                  source, oneBasedStart, oneBasedEnd, chrom.Sequence.ID, strand, null, feature);
                    if (exon.Length() > 0)
                    {
                        currentTranscript.Exons.Add(exon);
                    }
                }
                else if (feature.Key == "CDS")
                {
                    CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);
                    if (hasProteinId)
                    {
                        currentTranscript.ProteinID = proteinId;
                    }
                    if (cds.Length() > 0)
                    {
                        currentTranscript.CodingDomainSequences.Add(cds);
                    }
                }
            }
        }
예제 #3
0
        /// <summary>
        /// Handles one feature of a GFF3 gene model file, updating the gene and transcript currently being built.
        /// </summary>
        /// <param name="feature">Parsed feature; "source", "strand" and "frame" are read from its SubItems.</param>
        /// <param name="oneBasedStart">One-based start coordinate of the feature.</param>
        /// <param name="oneBasedEnd">One-based, inclusive end coordinate of the feature.</param>
        /// <param name="chrom">Chromosome the feature lies on; its sequence supplies exon DNA.</param>
        /// <param name="attributes">Attribute key/value pairs; gene_id, transcript_id, exon_id and protein_id are consulted.</param>
        public void ProcessGff3Feature(MetadataListItem<List<string>> feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary<string, string> attributes)
        {
            bool geneIdGiven       = attributes.TryGetValue("gene_id", out string geneId);
            bool transcriptIdGiven = attributes.TryGetValue("transcript_id", out string transcriptId);
            bool exonIdGiven       = attributes.TryGetValue("exon_id", out _);
            bool proteinIdGiven    = attributes.TryGetValue("protein_id", out string proteinId);

            // Each of these is false if the column is empty ("." in GFF format).
            bool sourceGiven = feature.SubItems.TryGetValue("source", out List<string> sourceValues);
            bool strandGiven = feature.SubItems.TryGetValue("strand", out List<string> strandValues);
            bool frameGiven  = feature.SubItems.TryGetValue("frame", out List<string> frameValues);

            string source = sourceGiven ? sourceValues[0] : "";

            // Strand is required to do anything in this program.
            if (!strandGiven)
            {
                return;
            }

            string strand = strandValues[0];

            // A missing or unparsable frame counts as 0.
            int frame = frameGiven && int.TryParse(frameValues[0], out int parsedFrame) ? parsedFrame : 0;

            // A gene_id that differs from the open gene (or no open gene) starts a new gene.
            bool startsNewGene = geneIdGiven && (currentGene == null || geneId != currentGene.ID);
            if (startsNewGene)
            {
                currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
                Genes.Add(currentGene);
                GenomeForest.Add(currentGene);
            }

            // A transcript_id that differs from the open transcript (or no open transcript) starts a new one;
            // the previous transcript, if any, is finalized first.
            bool startsNewTranscript = transcriptIdGiven && (currentTranscript == null || transcriptId != currentTranscript.ID);
            if (startsNewTranscript)
            {
                if (currentTranscript != null)
                {
                    Transcript.SetRegions(currentTranscript);
                    currentTranscript.FrameCorrection();
                }

                // NOTE(review): currentGene is dereferenced here without a null check -- presumably a
                // gene_id always precedes the first transcript_id; confirm against the input files.
                currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
                currentGene.Transcripts.Add(currentTranscript);
                GenomeForest.Add(currentTranscript);
            }

            // An exon_id marks an exon feature and takes precedence over protein_id.
            if (exonIdGiven)
            {
                // Zero-based offset and inclusive length into the chromosome sequence.
                ISequence forwardDna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);

                // Minus-strand transcripts store the reverse complement.
                ISequence strandedDna;
                if (currentTranscript.IsStrandPlus())
                {
                    strandedDna = forwardDna;
                }
                else
                {
                    strandedDna = forwardDna.GetReverseComplementedSequence();
                }

                Exon exon = new Exon(currentTranscript, strandedDna, source, oneBasedStart, oneBasedEnd,
                                     chrom == null ? "" : chrom.ChromosomeID, strand, null, feature);
                if (exon.Length() > 0)
                {
                    currentTranscript.Exons.Add(exon);
                }
                return;
            }

            // Without an exon_id, only features carrying a protein_id (coding domains) are of interest.
            if (!proteinIdGiven)
            {
                return;
            }

            CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);
            if (cds.Length() > 0)
            {
                currentTranscript.CodingDomainSequences.Add(cds);
                currentTranscript.ProteinID = proteinId;
            }
        }
예제 #4
0
        /// <summary>
        /// Creates coding domains for this transcript based on the coding start of another annotated transcript.
        /// </summary>
        /// <param name="withCDS">Transcript whose first coding domain (in strand order) supplies the coding start; may be null.</param>
        /// <returns>
        /// true if this transcript was annotated; false if the transcript with CDS did not lead to an annotation
        /// (null input, no coding domains on <paramref name="withCDS"/>, coding start outside this transcript's
        /// exons, or no stop codon found in the translation).
        /// </returns>
        public bool CreateCDSFromAnnotatedStartCodons(Transcript withCDS)
        {
            // Nothing to do if null input
            if (withCDS == null)
            {
                return(false);
            }

            // Figure out the start position
            // FirstOrDefault instead of First: a reference transcript without any coding domain
            // cannot annotate this one, so report false rather than throwing.
            CDS firstCds = withCDS.CdsSortedStrand.FirstOrDefault();

            if (firstCds == null)
            {
                return(false);
            }
            long cdsStartInChrom = IsStrandPlus() ? firstCds.OneBasedStart : firstCds.OneBasedEnd;
            long cdsStartInMrna  = BaseNumber2MRnaPos(cdsStartInChrom);

            if (cdsStartInMrna < 0)
            {
                return(false);
            }                                         // the coding start wasn't within any of the exons of this transcript

            // Figure out the stop codon from translation
            ISequence spliced         = SplicedRNA();
            ISequence translateThis   = spliced.GetSubSequence(cdsStartInMrna, spliced.Count - cdsStartInMrna);
            ISequence proteinSequence = Translation.OneFrameTranslation(translateThis, Gene.Chromosome.Mitochondrial);
            int       stopIdx         = proteinSequence.Select(x => x).ToList().IndexOf(Alphabets.Protein.Ter);

            if (stopIdx < 0)
            {
                return(false);
            }                                                                              // no stop codon in sight
            long endInMrna    = cdsStartInMrna + (stopIdx + 1) * GeneModel.CODON_SIZE - 1; // include the stop codon in CDS
            long lengthInMrna = endInMrna - cdsStartInMrna + 1;

            // Figure out the stop index on the chromosome
            // Build an interval covering everything before the coding start in strand order, subtract it
            // from every exon, and walk the remaining pieces in strand order.
            long     utr5ishstart = IsStrandPlus() ? Exons.Min(x => x.OneBasedStart) : cdsStartInChrom + 1;
            long     utr5ishend   = IsStrandPlus() ? cdsStartInChrom - 1 : Exons.Max(x => x.OneBasedEnd);
            Interval utr5ish      = new Interval(null, "", Source, Strand, utr5ishstart, utr5ishend);
            var      intervals    = SortedStrand(Exons.SelectMany(x => x.Minus(utr5ish)).ToList());
            long     lengthSoFar  = 0;

            foreach (Interval y in intervals)
            {
                long lengthSum = lengthSoFar + y.Length();
                if (lengthSum <= lengthInMrna) // add this whole interval
                {
                    var toAdd = new CDS(this, ChromosomeID, Source, Strand, y.OneBasedStart, y.OneBasedEnd, 0);
                    CodingDomainSequences.Add(toAdd);
                    lengthSoFar += toAdd.Length();
                }
                else if (lengthSoFar < lengthInMrna) // chop off part of this interval
                {
                    // The excess is trimmed from the downstream end in strand order:
                    // the chromosomal end on the plus strand, the chromosomal start otherwise.
                    long chopLength = lengthSum - lengthInMrna;
                    long start      = IsStrandPlus() ?
                                      y.OneBasedStart :
                                      y.OneBasedStart + chopLength;
                    long end = IsStrandPlus() ?
                               y.OneBasedEnd - chopLength :
                               y.OneBasedEnd;
                    var toAdd = new CDS(this, ChromosomeID, Source, Strand, start, end, 0);
                    CodingDomainSequences.Add(toAdd);
                    lengthSoFar += toAdd.Length();
                }
            }

            SetRegions(this);
            return(true);
        }
예제 #5
0
 /// <summary>
 /// Copy constructor; passes <paramref name="cds"/> straight to the base-class constructor.
 /// </summary>
 /// <param name="cds">The CDS to copy from.</param>
 public CDS(CDS cds) : base(cds) { }