/// <summary>
/// Creates intergenic regions between consecutive genes on the same strand
/// and adds them to the genome forest. Intended to run after the gene model
/// has been read, since it relies on the genes already stored in the forest.
/// </summary>
/// <remarks>
/// Genes are visited per interval tree in order of start position. For each
/// strand the gene reaching furthest along the chromosome is remembered, so
/// that a gene nested inside (or ending before) an earlier gene does not
/// cause an intergenic region to be emitted that overlaps the earlier gene.
/// </remarks>
public void CreateIntergenicRegions()
{
    foreach (IntervalTree it in GenomeForest.Forest.Values)
    {
        // Gene with the largest end coordinate seen so far, tracked per strand.
        Gene furthestPositiveStrandGene = null;
        Gene furthestNegativeStrandGene = null;

        // OrderBy buffers the genes before iteration starts, so adding
        // intergenic regions to the forest inside the loop does not disturb it.
        foreach (Gene gene in it.Intervals.OfType<Gene>().OrderBy(g => g.OneBasedStart))
        {
            // Genes that are not on the plus strand are compared against the minus-strand tracker.
            Gene previous = gene.IsStrandPlus() ? furthestPositiveStrandGene : furthestNegativeStrandGene;

            if (previous != null)
            {
                // The gap between the end of the furthest-reaching earlier gene and the start of this one.
                Intergenic intergenic = new Intergenic(
                    gene.Chromosome,
                    gene.ChromosomeID,
                    gene.Source,
                    gene.Strand,
                    previous.OneBasedEnd + 1,
                    gene.OneBasedStart - 1,
                    null);

                // Overlapping or adjacent genes leave no gap; only keep non-empty regions.
                if (intergenic.Length() > 0)
                {
                    GenomeForest.Add(intergenic);
                }
            }

            // Only advance the tracker when this gene extends further than the one
            // already remembered; otherwise the next gap would start too early.
            if (gene.IsStrandPlus()
                && (furthestPositiveStrandGene == null || gene.OneBasedEnd > furthestPositiveStrandGene.OneBasedEnd))
            {
                furthestPositiveStrandGene = gene;
            }
            if (gene.IsStrandMinus()
                && (furthestNegativeStrandGene == null || gene.OneBasedEnd > furthestNegativeStrandGene.OneBasedEnd))
            {
                furthestNegativeStrandGene = gene;
            }
        }
    }
}
/// <summary>
/// Processes a feature from a GFF3 gene model file, updating the current gene
/// and transcript and attaching exons or coding sequences to the current transcript.
/// </summary>
/// <remarks>
/// Features without a strand are ignored. Features that cannot be attached to a
/// parent (a transcript seen before any gene, or an exon/CDS seen before any
/// transcript) are skipped rather than failing with a null reference.
/// When a new transcript starts, the previous one is finalized with
/// <c>Transcript.SetRegions</c> and <c>FrameCorrection</c>.
/// </remarks>
/// <param name="feature">The parsed feature; its sub-items supply source, strand and frame.</param>
/// <param name="oneBasedStart">One-based start coordinate of the feature.</param>
/// <param name="oneBasedEnd">One-based end coordinate of the feature.</param>
/// <param name="chrom">Chromosome the feature lies on; its sequence is used to extract exon DNA.</param>
/// <param name="attributes">Attribute key/value pairs of the feature (gene_id, transcript_id, exon_id, protein_id).</param>
public void ProcessGff3Feature(MetadataListItem<List<string>> feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary<string, string> attributes)
{
    bool hasGeneId = attributes.TryGetValue("gene_id", out string geneId);
    bool hasTranscriptId = attributes.TryGetValue("transcript_id", out string transcriptId);
    bool hasExonId = attributes.ContainsKey("exon_id");
    bool hasProteinId = attributes.TryGetValue("protein_id", out string proteinId);
    bool hasSource = feature.SubItems.TryGetValue("source", out List<string> sourceish); // false if empty ("." in GFF format)
    bool hasStrand = feature.SubItems.TryGetValue("strand", out List<string> strandish); // false if empty ("." in GFF format)
    bool hasFrame = feature.SubItems.TryGetValue("frame", out List<string> framey); // false if empty ("." in GFF format)

    // Strand is required to do anything in this program.
    if (!hasStrand)
    {
        return;
    }

    string source = hasSource ? sourceish[0] : "";
    string strand = strandish[0];

    // Frame stays 0 when absent or unparsable.
    int frame = 0;
    if (hasFrame)
    {
        int.TryParse(framey[0], out frame);
    }

    // A gene_id different from the current gene's starts a new gene.
    if (hasGeneId && (currentGene == null || geneId != currentGene.ID))
    {
        currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
        Genes.Add(currentGene);
        GenomeForest.Add(currentGene);
    }

    // A transcript_id different from the current transcript's starts a new transcript.
    if (hasTranscriptId && (currentTranscript == null || transcriptId != currentTranscript.ID))
    {
        // A transcript needs a gene to belong to; skip the feature if none has been seen.
        if (currentGene == null)
        {
            return;
        }

        // Finalize the transcript that is being replaced.
        if (currentTranscript != null)
        {
            Transcript.SetRegions(currentTranscript);
            currentTranscript.FrameCorrection();
        }

        currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
        currentGene.Transcripts.Add(currentTranscript);
        GenomeForest.Add(currentTranscript);
    }

    // Only exon and CDS features (identified by exon_id / protein_id) remain to be handled.
    if (!hasExonId && !hasProteinId)
    {
        return;
    }

    // Exons and coding sequences need both a transcript to attach to and a chromosome sequence.
    if (currentTranscript == null || chrom == null)
    {
        return;
    }

    if (hasExonId)
    {
        // GetSubSequence takes a zero-based start and a length.
        ISequence exon_dna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);
        Exon exon = new Exon(
            currentTranscript,
            currentTranscript.IsStrandPlus() ? exon_dna : exon_dna.GetReverseComplementedSequence(),
            source,
            oneBasedStart,
            oneBasedEnd,
            chrom.ChromosomeID,
            strand,
            null,
            feature);
        if (exon.Length() > 0)
        {
            currentTranscript.Exons.Add(exon);
        }
    }
    else
    {
        // exon_id takes precedence; a protein_id without exon_id marks a coding sequence.
        CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);
        if (cds.Length() > 0)
        {
            currentTranscript.CodingDomainSequences.Add(cds);
            currentTranscript.ProteinID = proteinId;
        }
    }
}
/// <summary>
/// Processes a feature from a GTF gene model file, updating the current gene
/// and transcript and attaching exons or coding sequences to the current transcript.
/// </summary>
/// <remarks>
/// Features without a strand are ignored. The "gene:" and "transcript:" prefixes
/// are trimmed from the gene, transcript and protein IDs, and the trimmed gene and
/// transcript IDs are also written back into the feature's free text.
/// Only "transcript", "exon" and "CDS" features are acted on. Whenever a new
/// transcript starts, the previous one is finalized with
/// <c>Transcript.SetRegions</c> and <c>FrameCorrection</c>.
/// </remarks>
/// <param name="feature">The parsed feature; its key selects the handling and its sub-items supply source, strand and frame.</param>
/// <param name="oneBasedStart">One-based start coordinate of the feature.</param>
/// <param name="oneBasedEnd">One-based end coordinate of the feature.</param>
/// <param name="chrom">Chromosome the feature lies on; its sequence is used to extract exon DNA.</param>
/// <param name="attributes">Attribute key/value pairs of the feature (gene_id, transcript_id, protein_id).</param>
public void ProcessGtfFeature(MetadataListItem<List<string>> feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary<string, string> attributes)
{
    bool hasGeneId = attributes.TryGetValue("gene_id", out string geneId);
    bool hasTranscriptId = attributes.TryGetValue("transcript_id", out string transcriptId);
    bool hasProteinId = attributes.TryGetValue("protein_id", out string proteinId);
    bool hasSource = feature.SubItems.TryGetValue("source", out List<string> sourceish);
    bool hasStrand = feature.SubItems.TryGetValue("strand", out List<string> strandish);
    bool hasFrame = feature.SubItems.TryGetValue("frame", out List<string> framey);

    // Strand is required to do anything in this program.
    if (!hasStrand)
    {
        return;
    }

    string source = hasSource ? sourceish[0] : "";
    string strand = strandish[0];

    // Frame stays 0 when absent or unparsable.
    int frame = 0;
    if (hasFrame)
    {
        int.TryParse(framey[0], out frame);
    }

    // Trim prefixes from the IDs. String.Replace returns a new string, so the
    // result has to be assigned back for the feature's free text to change.
    const string genePrefix = "gene:";
    const string transcriptPrefix = "transcript:";
    if (hasGeneId && geneId.StartsWith(genePrefix, System.StringComparison.Ordinal))
    {
        string newGeneId = geneId.Substring(genePrefix.Length);
        feature.FreeText = feature.FreeText.Replace(geneId, newGeneId);
        geneId = newGeneId;
    }
    if (hasTranscriptId && transcriptId.StartsWith(transcriptPrefix, System.StringComparison.Ordinal))
    {
        string newTranscriptId = transcriptId.Substring(transcriptPrefix.Length);
        feature.FreeText = feature.FreeText.Replace(transcriptId, newTranscriptId);
        transcriptId = newTranscriptId;
    }
    if (hasProteinId && proteinId.StartsWith(transcriptPrefix, System.StringComparison.Ordinal))
    {
        // The transcript ID is sometimes used as the protein ID.
        proteinId = proteinId.Substring(transcriptPrefix.Length);
    }

    bool isTranscriptFeature = feature.Key == "transcript";
    bool isExonFeature = feature.Key == "exon";
    bool isCdsFeature = feature.Key == "CDS";
    if (!isTranscriptFeature && !isExonFeature && !isCdsFeature)
    {
        return; // nothing to do for other feature types
    }

    bool startsNewTranscript = currentTranscript == null || hasTranscriptId && transcriptId != currentTranscript.ID;

    // A "transcript" feature only matters when it announces a transcript that is not
    // already current; files that list only exons never reach this case.
    if (isTranscriptFeature && !startsNewTranscript)
    {
        return;
    }

    // Start a new gene when none exists yet or the gene_id changed.
    if (currentGene == null || hasGeneId && geneId != currentGene.ID)
    {
        currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
        Genes.Add(currentGene);
        GenomeForest.Add(currentGene);
    }

    if (startsNewTranscript)
    {
        // Finalize the transcript that is being replaced, regardless of whether the
        // new one was announced by a "transcript" feature or by its first exon/CDS.
        if (currentTranscript != null)
        {
            Transcript.SetRegions(currentTranscript);
            currentTranscript.FrameCorrection();
        }

        currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
        currentGene.Transcripts.Add(currentTranscript);
        GenomeForest.Add(currentTranscript);
    }

    if (isExonFeature)
    {
        // GetSubSequence takes a zero-based start and a length.
        ISequence exon_dna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);
        Exon exon = new Exon(
            currentTranscript,
            currentTranscript.IsStrandPlus() ? exon_dna : exon_dna.GetReverseComplementedSequence(),
            source,
            oneBasedStart,
            oneBasedEnd,
            chrom.Sequence.ID,
            strand,
            null,
            feature);
        if (exon.Length() > 0)
        {
            currentTranscript.Exons.Add(exon);
        }
    }
    else if (isCdsFeature)
    {
        CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);

        // The protein ID is recorded even if the coding sequence itself turns out to be empty.
        if (hasProteinId)
        {
            currentTranscript.ProteinID = proteinId;
        }
        if (cds.Length() > 0)
        {
            currentTranscript.CodingDomainSequences.Add(cds);
        }
    }
}
/// <summary>
/// Formats this effect as a single tab-separated record.
/// </summary>
/// <remarks>
/// Columns, in order: errors and warnings, gene ID, gene name, biotype,
/// transcript ID, exon ID, exon rank, effect, amino acid change, codon change,
/// codon number (shown one-based), CDS length, codons around (old / new),
/// amino acids around (old / new), custom ID.
/// Gene name, biotype, exon ID, exon rank and custom ID are never filled in
/// by this method, so those columns are always empty.
/// </remarks>
/// <param name="useSeqOntology">Forwarded to <c>Effect</c> when formatting the effect column.</param>
/// <param name="useHgvs">Not used by this method.</param>
/// <returns>The tab-separated record.</returns>
public string ToString(bool useSeqOntology, bool useHgvs)
{
    // Placeholder columns: declared so the column layout stays fixed.
    string geneName = "";
    string bioType = null;
    string exonId = "";
    int exonRank = -1;
    string customId = "";

    string geneId = "";
    string transcriptId = "";
    if (Marker != null)
    {
        Gene gene = GetGene();
        Transcript tr = GetTranscript();
        if (gene != null)
        {
            geneId = gene.ID;
        }
        if (tr != null)
        {
            transcriptId = tr.ID;
        }

        // The exon is still looked up, but its result is not reported.
        GetExon();
    }

    long cdsSize = GetCdsLength();

    // Errors and warnings share the first column; a "|" follows the errors whenever there are any.
    string error = String.Join(",", Error);
    string warning = String.Join(",", Warning);
    string errWarn = error == "" ? warning : error + "|" + warning;

    string aaChange = "";
    if (ReferenceAA.Length + AlternateAA.Length > 0)
    {
        aaChange = ReferenceAA + "/" + AlternateAA;
    }

    // string.Join treats null entries (such as the biotype) as empty strings.
    return string.Join("\t",
        errWarn,
        geneId,
        geneName,
        bioType,
        transcriptId,
        exonId,
        exonRank >= 0 ? exonRank.ToString() : "",
        Effect(false, false, false, useSeqOntology, false),
        aaChange,
        (CodonsRef.Length + CodonsAlt.Length) > 0 ? CodonsRef + "/" + CodonsAlt : "",
        CodonNum >= 0 ? (CodonNum + 1).ToString() : "",
        cdsSize >= 0 ? cdsSize.ToString() : "",
        CodonsAroundOld.Length > 0 ? CodonsAroundOld + " / " + CodonsAroundNew : "",
        AroundOldAAs.Length > 0 ? AroundOldAAs + " / " + AroundNewAAs : "",
        customId);
}