コード例 #1
0
 /// <summary>
 /// Reads gene model features into data structures contained within this library
 /// </summary>
 /// <param name="geneModelFile"></param>
 public void ReadGeneFeatures(string geneModelFile)
 {
     foreach (ISequence chromFeatures in SimplerParse(geneModelFile))
     {
         Chromosome chrom = Genome.Chromosomes.FirstOrDefault(x => x.FriendlyName == chromFeatures.ID);
         if (chrom == null)
         {
             continue;
         }
         chromFeatures.Metadata.TryGetValue("features", out object f);
         List <MetadataListItem <List <string> > > features = f as List <MetadataListItem <List <string> > >;
         for (int i = 0; i < features.Count; i++)
         {
             MetadataListItem <List <string> > feature = features[i];
             long.TryParse(feature.SubItems["start"][0], out long start);
             long.TryParse(feature.SubItems["end"][0], out long end);
             var attributes = SplitAttributes(feature.FreeText);
             if (feature.FreeText.Contains('='))
             {
                 ProcessGff3Feature(feature, start, end, chrom, attributes);
             }
             else
             {
                 ProcessGtfFeature(feature, start, end, chrom, attributes);
             }
         }
     }
     if (currentTranscript != null)
     {
         Transcript.SetRegions(currentTranscript);
         currentTranscript.FrameCorrection();
     }
     CreateIntergenicRegions();
     // possibly check transcript sanity here with Parallel.ForEach(Genes.SelectMany(g => g.Transcripts).ToList(), t => t.SanityCheck());
     GenomeForest.Build();
 }
コード例 #2
0
        /// <summary>
        /// Processes a feature from a GTF gene model file.
        /// </summary>
        /// <param name="feature"></param>
        /// <param name="oneBasedStart"></param>
        /// <param name="oneBasedEnd"></param>
        /// <param name="chrom"></param>
        /// <param name="attributes"></param>
        public void ProcessGtfFeature(MetadataListItem <List <string> > feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary <string, string> attributes)
        {
            bool hasGeneId       = attributes.TryGetValue("gene_id", out string geneId);
            bool hasTranscriptId = attributes.TryGetValue("transcript_id", out string transcriptId);
            bool hasProteinId    = attributes.TryGetValue("protein_id", out string proteinId);
            bool hasExonId       = attributes.TryGetValue("exon_id", out string exonId);
            bool hasSource       = feature.SubItems.TryGetValue("source", out List <string> sourceish);
            bool hasStrand       = feature.SubItems.TryGetValue("strand", out List <string> strandish);
            bool hasFrame        = feature.SubItems.TryGetValue("frame", out List <string> framey);

            string source = hasSource ? sourceish[0] : "";

            if (!hasStrand)
            {
                return;
            }                           // strand is a required to do anything in this program
            string strand = strandish[0];
            int    frame  = 0;

            if (hasFrame)
            {
                int.TryParse(framey[0], out frame);
            }

            // Trim prefixes from the IDs
            string genePrefix       = "gene:";
            string transcriptPrefix = "transcript:";

            if (hasGeneId && geneId.StartsWith(genePrefix))
            {
                string newGeneId = geneId.Substring(genePrefix.Length);
                feature.FreeText.Replace(geneId, newGeneId);
                geneId = newGeneId;
            }
            if (hasTranscriptId && transcriptId.StartsWith(transcriptPrefix))
            {
                string newTranscriptId = transcriptId.Substring(transcriptPrefix.Length);
                feature.FreeText.Replace(transcriptId, newTranscriptId);
                transcriptId = newTranscriptId;
            }
            if (hasProteinId && proteinId.StartsWith(transcriptPrefix))
            {
                proteinId = proteinId.Substring(transcriptPrefix.Length); // transcript id is used for protein id sometimes
            }

            // Catch the transcript features before they go by if available, i.e. if the file doesn't just have exons
            if (feature.Key == "transcript" && (currentTranscript == null || hasTranscriptId && transcriptId != currentTranscript.ID))
            {
                if (currentGene == null || hasGeneId && geneId != currentGene.ID)
                {
                    currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
                    Genes.Add(currentGene);
                    GenomeForest.Add(currentGene);
                }

                currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
                currentGene.Transcripts.Add(currentTranscript);
                GenomeForest.Add(currentTranscript);
            }

            if (feature.Key == "exon" || feature.Key == "CDS")
            {
                if (currentGene == null || hasGeneId && geneId != currentGene.ID)
                {
                    currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
                    Genes.Add(currentGene);
                    GenomeForest.Add(currentGene);
                }

                if (currentTranscript == null || hasTranscriptId && transcriptId != currentTranscript.ID)
                {
                    if (currentTranscript != null)
                    {
                        Transcript.SetRegions(currentTranscript);
                        currentTranscript.FrameCorrection();
                    }
                    currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
                    currentGene.Transcripts.Add(currentTranscript);
                    GenomeForest.Add(currentTranscript);
                }

                if (feature.Key == "exon")
                {
                    ISequence exon_dna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);
                    Exon      exon     = new Exon(currentTranscript, currentTranscript.IsStrandPlus() ? exon_dna : exon_dna.GetReverseComplementedSequence(),
                                                  source, oneBasedStart, oneBasedEnd, chrom.Sequence.ID, strand, null, feature);
                    if (exon.Length() > 0)
                    {
                        currentTranscript.Exons.Add(exon);
                    }
                }
                else if (feature.Key == "CDS")
                {
                    CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);
                    if (hasProteinId)
                    {
                        currentTranscript.ProteinID = proteinId;
                    }
                    if (cds.Length() > 0)
                    {
                        currentTranscript.CodingDomainSequences.Add(cds);
                    }
                }
                else
                { // nothing to do
                }
            }
        }
コード例 #3
0
        /// <summary>
        /// Processes a feature from a GFF3 gene model file.
        /// </summary>
        /// <param name="feature"></param>
        /// <param name="oneBasedStart"></param>
        /// <param name="oneBasedEnd"></param>
        /// <param name="chrom"></param>
        /// <param name="attributes"></param>
        public void ProcessGff3Feature(MetadataListItem <List <string> > feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary <string, string> attributes)
        {
            bool hasGeneId       = attributes.TryGetValue("gene_id", out string geneId);
            bool hasTranscriptId = attributes.TryGetValue("transcript_id", out string transcriptId);
            bool hasExonId       = attributes.TryGetValue("exon_id", out string exonId);
            bool hasProteinId    = attributes.TryGetValue("protein_id", out string proteinId);
            bool hasSource       = feature.SubItems.TryGetValue("source", out List <string> sourceish); // false if empty ("." in GFF format)
            bool hasStrand       = feature.SubItems.TryGetValue("strand", out List <string> strandish); // false if empty ("." in GFF format)
            bool hasFrame        = feature.SubItems.TryGetValue("frame", out List <string> framey);     // false if empty ("." in GFF format)

            string source = hasSource ? sourceish[0] : "";

            if (!hasStrand)
            {
                return;
            }                           // strand is a required to do anything in this program
            string strand = strandish[0];
            int    frame  = 0;

            if (hasFrame)
            {
                int.TryParse(framey[0], out frame);
            }

            if (hasGeneId && (currentGene == null || hasGeneId && geneId != currentGene.ID))
            {
                currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
                Genes.Add(currentGene);
                GenomeForest.Add(currentGene);
            }

            if (hasTranscriptId && (currentTranscript == null || hasTranscriptId && transcriptId != currentTranscript.ID))
            {
                if (currentTranscript != null)
                {
                    Transcript.SetRegions(currentTranscript);
                    currentTranscript.FrameCorrection();
                }
                currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
                currentGene.Transcripts.Add(currentTranscript);
                GenomeForest.Add(currentTranscript);
            }

            if (hasExonId)
            {
                ISequence exon_dna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);
                Exon      exon     = new Exon(currentTranscript, currentTranscript.IsStrandPlus() ? exon_dna : exon_dna.GetReverseComplementedSequence(),
                                              source, oneBasedStart, oneBasedEnd, chrom == null ? "" : chrom.ChromosomeID, strand, null, feature);
                if (exon.Length() > 0)
                {
                    currentTranscript.Exons.Add(exon);
                }
            }
            else if (hasProteinId)
            {
                CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);
                if (cds.Length() > 0)
                {
                    currentTranscript.CodingDomainSequences.Add(cds);
                    currentTranscript.ProteinID = proteinId;
                }
            }
            else // nothing to do
            {
            }
        }