/// <summary>
/// Captures the annotation inputs, resolves the HGVS start/end coordinates and
/// creates the notation object for this transcript/variant pair.
/// </summary>
public HgvsCodingNomenclature(TranscriptAnnotation ta, Transcript transcript, VariantFeature variant,
    ICompressedSequence compressedSequence, bool isGenomicDuplicate)
{
    _ta                 = ta;
    _transcript         = transcript;
    _variant            = variant;
    _compressedSequence = compressedSequence;
    _isGenomicDuplicate = isGenomicDuplicate;
    _sb                 = new StringBuilder();

    // strand of the gene that owns the transcript
    bool isReverseStrand = transcript.Gene.OnReverseStrand;

    // the allele stored on the annotation may differ from the input one for insertions/deletions
    var annotatedAllele = ta.AlternateAllele;

    // express the alternate bases on the transcript's strand
    string alleleBases = annotatedAllele.AlternateAllele;
    if (isReverseStrand)
    {
        alleleBases = SequenceUtilities.GetReverseComplement(alleleBases);
    }

    // fills in _hgvsStart and _hgvsEnd
    GetReferenceCoordinates(transcript, annotatedAllele, out _hgvsStart, out _hgvsEnd);

    string transcriptName = FormatUtilities.CombineIdAndVersion(transcript.Id, transcript.Version);
    bool hasTranslation   = _transcript.Translation != null;

    _hgvsNotation = new HgvsNotation(ta.TranscriptReferenceAllele, alleleBases, transcriptName, _hgvsStart,
        _hgvsEnd, hasTranslation);
}
/// <summary>
/// Returns the concatenated exon sequence of the transcript, reverse complemented for
/// reverse-strand transcripts and with RNA edits applied. The result is built on the
/// first call and served from the cached field afterwards.
/// </summary>
public string GetCdnaSequence()
{
    if (_sequence == null)
    {
        var builder = StringBuilderCache.Acquire();

        // only exon regions contribute bases
        foreach (var region in _regions)
        {
            if (region.Type == TranscriptRegionType.Exon)
            {
                int exonLength = region.End - region.Start + 1;
                builder.Append(_compressedSequence.Substring(region.Start - 1, exonLength));
            }
        }

        if (_onReverseStrand)
        {
            string flipped = SequenceUtilities.GetReverseComplement(builder.ToString());
            builder.Clear();
            builder.Append(flipped);
        }

        ApplyRnaEdits(builder);
        _sequence = StringBuilderCache.GetStringAndRelease(builder);
    }

    return _sequence;
}
/// <summary>
/// Concatenates the sequence of every exon region (5' UTR + cDNA + 3' UTR)
/// [Transcript.pm:862 spliced_seq]. Exons are filled with 'N' when no reference
/// sequence is supplied; the result is reverse complemented for reverse-strand transcripts.
/// </summary>
private static string GetSplicedSequence(ISequence refSequence, ITranscriptRegion[] regions, bool onReverseStrand)
{
    var builder = StringBuilderCache.Acquire();

    foreach (var region in regions)
    {
        if (region.Type == TranscriptRegionType.Exon)
        {
            int exonLength = region.End - region.Start + 1;

            if (refSequence != null)
            {
                builder.Append(refSequence.Substring(region.Start - 1, exonLength));
            }
            else
            {
                // no reference available: emit a placeholder of the same length
                builder.Append('N', exonLength);
            }
        }
    }

    string spliced = StringBuilderCache.GetStringAndRelease(builder);

    if (onReverseStrand)
    {
        return SequenceUtilities.GetReverseComplement(spliced);
    }

    return spliced;
}
/// <summary>
/// Picks the bases that get rotated: the alternate allele for insertions, the
/// reference allele otherwise. They are reverse complemented when on the reverse strand.
/// </summary>
private static string GetRotatingBases(ISimpleVariant simpleVariant, bool onReverseStrand)
{
    string bases;

    if (simpleVariant.Type == VariantType.insertion)
    {
        bases = simpleVariant.AltAllele;
    }
    else
    {
        bases = simpleVariant.RefAllele;
    }

    if (onReverseStrand)
    {
        return SequenceUtilities.GetReverseComplement(bases);
    }

    return bases;
}
/// <summary>
/// Builds the HGVS coding (c.) notation string for a variant on a transcript.
/// Returns null for reference variants, alternate alleles with non-canonical bases,
/// missing cDNA position offsets, or positions beyond the transcript length.
/// </summary>
public static string GetHgvscAnnotation(ITranscript transcript, ISimpleVariant variant, ISequence refSequence,
    int regionStart, int regionEnd)
{
    // sanity check: skip reference alleles and alternate alleles with odd characters
    if (variant.Type == VariantType.reference || SequenceUtilities.HasNonCanonicalBase(variant.AltAllele))
    {
        return null;
    }

    var onReverseStrand = transcript.Gene.OnReverseStrand;

    // express both alleles on the transcript's strand
    var refAllele = variant.RefAllele;
    var altAllele = variant.AltAllele;

    if (onReverseStrand)
    {
        refAllele = SequenceUtilities.GetReverseComplement(variant.RefAllele);
        altAllele = SequenceUtilities.GetReverseComplement(variant.AltAllele);
    }

    // decide event type from HGVS nomenclature
    var genomicChange = GetGenomicChange(transcript, onReverseStrand, refSequence, variant);

    var variantStart = variant.Start;
    var variantEnd   = variant.End;

    // duplications are re-anchored; this also replaces the reference allele and region indices
    if (genomicChange == GenomicChange.Duplication)
    {
        (variantStart, variantEnd, refAllele, regionStart, regionEnd) =
            transcript.TranscriptRegions.ShiftDuplication(variantStart, altAllele, onReverseStrand);
    }

    var firstOffset  = HgvsUtilities.GetCdnaPositionOffset(transcript, variantStart, regionStart);
    var secondOffset = firstOffset;

    if (variantStart != variantEnd)
    {
        secondOffset = HgvsUtilities.GetCdnaPositionOffset(transcript, variantEnd, regionEnd);
    }

    // on the reverse strand the genomic end maps to the HGVS start and vice versa
    var startPositionOffset = onReverseStrand ? secondOffset : firstOffset;
    var endPositionOffset   = onReverseStrand ? firstOffset : secondOffset;

    // sanity check: make sure we have coordinates
    if (startPositionOffset == null || endPositionOffset == null)
    {
        return null;
    }

    // no notation for positions that fall past the transcript
    var transcriptLen = transcript.End - transcript.Start + 1;

    if (startPositionOffset.Position > transcriptLen || endPositionOffset.Position > transcriptLen)
    {
        return null;
    }

    bool hasTranslation = transcript.Translation != null;
    var notation = new HgvscNotation(refAllele, altAllele, transcript.Id.WithVersion, genomicChange,
        startPositionOffset, endPositionOffset, hasTranslation);

    // generic formatting
    return notation.ToString();
}
/// <summary>
/// Classifies a variant as substitution, deletion, inversion, delins, insertion or
/// duplication from its allele lengths and contents. Returns Unknown when the reference
/// and alternate alleles are identical.
/// </summary>
public static GenomicChange GetGenomicChange(IInterval interval, bool onReverseStrand, ISequence refSequence,
    ISimpleVariant variant)
{
    // span of the reference allele; a negative span is treated as empty
    int span      = variant.End - variant.Start + 1;
    int refLength = span < 0 ? 0 : span;
    int altLength = variant.AltAllele.Length;

    // sanity check: make sure that the alleles are different
    if (variant.RefAllele == variant.AltAllele)
    {
        return GenomicChange.Unknown;
    }

    if (altLength == 0)
    {
        return GenomicChange.Deletion;
    }

    if (refLength == altLength)
    {
        if (refLength == 1)
        {
            return GenomicChange.Substitution;
        }

        // equal lengths: an inversion if alt is the reverse complement of ref, otherwise a delins
        bool isInversion = variant.AltAllele == SequenceUtilities.GetReverseComplement(variant.RefAllele);
        return isInversion ? GenomicChange.Inversion : GenomicChange.DelIns;
    }

    // differing, non-empty reference: deletion/insertion
    if (refLength != 0)
    {
        return GenomicChange.DelIns;
    }

    // pure insertion: report a duplication when the helper finds the inserted bases
    // duplicated within the interval
    if (HgvsUtilities.IsDuplicateWithinInterval(refSequence, variant, interval, onReverseStrand))
    {
        return GenomicChange.Duplication;
    }

    return GenomicChange.Insertion;
}
/// <summary>
/// Extracts the bases of all exon regions that overlap the coding region, trimmed to the
/// coding region boundaries and padded with 'N' for the start exon phase. The result is
/// reverse complemented for genes on the reverse strand.
/// </summary>
public string GetCodingSequence()
{
    var builder      = StringBuilderCache.Acquire(Length);
    bool needsPhasing = _startExonPhase > 0;

    // account for the exon phase (forward orientation)
    if (needsPhasing && !_geneOnReverseStrand)
    {
        builder.Append('N', _startExonPhase);
    }

    foreach (var region in _regions)
    {
        // skip non-exons and exons that lie entirely in the UTR
        if (region.Type == TranscriptRegionType.Exon
            && region.End >= _codingRegion.Start
            && region.Start <= _codingRegion.End)
        {
            // trim the first and last coding exons to the coding region
            int begin = _codingRegion.Start >= region.Start && _codingRegion.Start <= region.End
                ? _codingRegion.Start
                : region.Start;

            int end = _codingRegion.End >= begin && _codingRegion.End <= region.End
                ? _codingRegion.End
                : region.End;

            builder.Append(_compressedSequence.Substring(begin - 1, end - begin + 1));
        }
    }

    // account for the exon phase (reverse orientation)
    if (needsPhasing && _geneOnReverseStrand)
    {
        builder.Append('N', _startExonPhase);
    }

    string codingBases = StringBuilderCache.GetStringAndRelease(builder);

    if (_geneOnReverseStrand)
    {
        return SequenceUtilities.GetReverseComplement(codingBases);
    }

    return codingBases;
}
/// <summary>
/// Stores the transcript-oriented reference and alternate alleles on the annotation
/// (reverse complemented for reverse-strand genes) and then maps the allele's
/// start/end onto the transcript via CdnaMapper.
/// </summary>
private static void MapCdnaCoordinates(Transcript transcript, TranscriptAnnotation ta,
    VariantAlternateAllele altAllele)
{
    bool isReverseStrand = transcript.Gene.OnReverseStrand;

    ta.TranscriptReferenceAllele = isReverseStrand
        ? SequenceUtilities.GetReverseComplement(altAllele.ReferenceAllele)
        : altAllele.ReferenceAllele;

    ta.TranscriptAlternateAllele = isReverseStrand
        ? SequenceUtilities.GetReverseComplement(altAllele.AlternateAllele)
        : altAllele.AlternateAllele;

    CdnaMapper.MapCoordinates(altAllele.Start, altAllele.End, ta, transcript);
}
/// <summary>
/// Fetches the reference bases that follow the variant in transcript orientation.
/// The window length is the larger of the rotating-bases length and MaxDownstreamLength,
/// capped by the distance from the variant to the end of the rotate region.
/// </summary>
private static string GetDownstreamSeq(IInterval simpleVariant, IInterval rotateRegion, ISequence refSequence,
    bool onReverseStrand, string rotatingBases)
{
    if (onReverseStrand)
    {
        // downstream on the reverse strand means lower genomic coordinates
        int basesToRegionStart = simpleVariant.Start - rotateRegion.Start;

        // large rotating bases widen the window, but never beyond the region boundary
        int windowLength = Math.Min(basesToRegionStart, Math.Max(rotatingBases.Length, MaxDownstreamLength));

        string genomicBases = refSequence.Substring(simpleVariant.Start - 1 - windowLength, windowLength);
        return SequenceUtilities.GetReverseComplement(genomicBases);
    }

    int basesToRegionEnd = rotateRegion.End - simpleVariant.End;

    // large rotating bases widen the window, but never beyond the region boundary
    int forwardLength = Math.Min(basesToRegionEnd, Math.Max(rotatingBases.Length, MaxDownstreamLength));

    return refSequence.Substring(simpleVariant.End, forwardLength);
}
/// <summary>
/// Returns the coding sequence: the coding part of every exon overlapping the coding
/// region, padded with 'N' for the start exon phase, reverse complemented for
/// reverse-strand genes, with RNA edits applied. The result is cached after the first call.
/// </summary>
public string GetCodingSequence()
{
    if (_sequence == null)
    {
        var builder      = StringBuilderCache.Acquire(Length);
        bool needsPhasing = _startExonPhase > 0;

        // account for the exon phase (forward orientation)
        if (needsPhasing && !_geneOnReverseStrand)
        {
            builder.Append('N', _startExonPhase);
        }

        foreach (var region in _regions)
        {
            // only exons that overlap the coding region contribute (others are pure UTR)
            if (region.Type == TranscriptRegionType.Exon
                && region.End >= _codingRegion.Start
                && region.Start <= _codingRegion.End)
            {
                AddCodingRegion(region, builder);
            }
        }

        if (_geneOnReverseStrand)
        {
            // account for the exon phase (reverse orientation), then flip to transcript orientation
            if (needsPhasing)
            {
                builder.Append('N', _startExonPhase);
            }

            string flipped = SequenceUtilities.GetReverseComplement(builder.ToString());
            builder.Clear();
            builder.Append(flipped);
        }

        // RNA edits for transcripts on reverse strand come with reversed bases, so no
        // positional or base adjustment is necessary
        // ref: unit test with NM_031947.3, chr5:140682196-140683630
        ApplyRnaEdits(builder);

        _sequence = StringBuilderCache.GetStringAndRelease(builder);
    }

    return _sequence;
}
/// <summary>
/// Extracts the coding sequence corresponding to the listed exons: each cDNA map that
/// overlaps [_start, _end] is trimmed to that range, 'N' padding is added for the start
/// exon phase, and the result is reverse complemented for reverse-strand genes.
/// </summary>
public string Sequence()
{
    _sb.Clear();

    bool needsPhasing = _startExonPhase > 0;

    // account for the exon phase (forward orientation)
    if (needsPhasing && !_geneOnReverseStrand)
    {
        _sb.Append('N', _startExonPhase);
    }

    foreach (var map in _cdnaMaps)
    {
        // exons that are entirely in the UTR contribute nothing
        bool overlapsCodingRange = map.GenomicEnd >= _start && map.GenomicStart <= _end;

        if (overlapsCodingRange)
        {
            // trim the first and last exons to the coding range
            int begin = _start >= map.GenomicStart && _start <= map.GenomicEnd ? _start : map.GenomicStart;
            int end   = _end >= begin && _end <= map.GenomicEnd ? _end : map.GenomicEnd;

            _sb.Append(_sequence.Substring(begin - 1, end - begin + 1));
        }
    }

    // account for the exon phase (reverse orientation)
    if (needsPhasing && _geneOnReverseStrand)
    {
        _sb.Append('N', _startExonPhase);
    }

    if (_geneOnReverseStrand)
    {
        return SequenceUtilities.GetReverseComplement(_sb.ToString());
    }

    return _sb.ToString();
}
/// <summary>
/// Concatenates the sequence of every cDNA map entry (5' UTR + cDNA + 3' UTR)
/// [Transcript.pm:862 spliced_seq]. Entries are filled with 'N' when no compressed
/// sequence is supplied; the result is reverse complemented for reverse-strand transcripts.
/// </summary>
private static string GetSplicedSequence(ICompressedSequence compressedSequence, CdnaCoordinateMap[] cdnaMaps,
    bool onReverseStrand)
{
    var builder = new StringBuilder();

    foreach (var exon in cdnaMaps)
    {
        int exonLength = exon.GenomicEnd - exon.GenomicStart + 1;

        if (compressedSequence != null)
        {
            builder.Append(compressedSequence.Substring(exon.GenomicStart - 1, exonLength));
        }
        else
        {
            // no reference available: emit a placeholder of the same length
            builder.Append('N', exonLength);
        }
    }

    string spliced = builder.ToString();

    if (onReverseStrand)
    {
        return SequenceUtilities.GetReverseComplement(spliced);
    }

    return spliced;
}
/// <summary>
/// Adds the non_can_splice_surr filter when the two bases read next to the affected exon
/// are not "AG" (acceptor side) or "GT" (donor side). The affected exon is the first
/// number found in ta.Exons; nothing is checked when ta.Exons is null or the transcript
/// has at most one cDNA map.
/// </summary>
private void CheckNonCanonicalSpliceSurr(IAnnotatedTranscript ta, Transcript transcript,
    HashSet<LofteeFilter.Filter> filters, ICompressedSequence sequence)
{
    if (ta.Exons == null)
    {
        return;
    }

    // the text before the first '/' and before the first '-' holds the affected exon number
    string firstExonToken = ta.Exons.Split('/')[0].Split('-')[0];
    int affectedExonIndex = Convert.ToInt32(firstExonToken);

    var totalExons = transcript.CdnaMaps.Length;

    if (totalExons <= 1)
    {
        return;
    }

    var onReverseStrand  = transcript.Gene.OnReverseStrand;
    string surrAcceptor  = null;
    string surrDonor     = null;

    // acceptor side: only checked when the affected exon is not the first one
    if (affectedExonIndex > 1)
    {
        int intronIndex   = onReverseStrand ? totalExons - affectedExonIndex : affectedExonIndex - 2;
        var intron        = transcript.Introns[intronIndex];
        int acceptorStart = onReverseStrand ? intron.Start : intron.End - 1;

        surrAcceptor = sequence.Substring(acceptorStart - 1, 2);

        if (onReverseStrand)
        {
            surrAcceptor = SequenceUtilities.GetReverseComplement(surrAcceptor);
        }
    }

    // donor side: only checked when the affected exon is not the last one
    if (affectedExonIndex < totalExons)
    {
        int intronIndex = onReverseStrand ? totalExons - affectedExonIndex - 1 : affectedExonIndex - 1;
        var intron      = transcript.Introns[intronIndex];
        int donorStart  = onReverseStrand ? intron.End - 1 : intron.Start;

        surrDonor = sequence.Substring(donorStart - 1, 2);

        if (onReverseStrand)
        {
            surrDonor = SequenceUtilities.GetReverseComplement(surrDonor);
        }
    }

    bool acceptorIsNonCanonical = surrAcceptor != null && surrAcceptor != "AG";
    bool donorIsNonCanonical    = surrDonor != null && surrDonor != "GT";

    if (acceptorIsNonCanonical || donorIsNonCanonical)
    {
        filters.Add(LofteeFilter.Filter.non_can_splice_surr);
    }
}
/// <summary>
/// Adds the nagnag_site flag when the bases surrounding a single-position allele contain
/// an NAGNAG motif (any of A/T/C/G followed by AG, twice in a row).
/// </summary>
/// <param name="transcript">transcript whose gene strand decides the orientation of the window</param>
/// <param name="allele">allele to inspect; ignored unless its reference begin and end are set and equal</param>
/// <param name="flags">set that receives the nagnag_site flag</param>
/// <param name="sequence">sequence the surrounding bases are read from</param>
private void CheckNagnagSite(Transcript transcript, IAnnotatedAlternateAllele allele,
    HashSet<LofteeFilter.Flag> flags, ICompressedSequence sequence)
{
    // only alleles that occupy exactly one reference position are examined
    if (allele.ReferenceBegin == null || allele.ReferenceEnd == null ||
        allele.ReferenceBegin.Value != allele.ReferenceEnd.Value)
    {
        return;
    }

    int pos = allele.ReferenceBegin.Value;

    // 11-base window: 6 bases requested at offset pos - 6 followed by 5 bases at offset pos
    // NOTE(review): assumes Substring takes a zero-based offset and tolerates pos < 6 - confirm
    string upStreamSeq   = sequence.Substring(pos - 6, 6);
    string downStreamSeq = sequence.Substring(pos, 5);
    string combineSeq    = upStreamSeq + downStreamSeq;

    if (transcript.Gene.OnReverseStrand)
    {
        combineSeq = SequenceUtilities.GetReverseComplement(combineSeq);
    }

    // '|' inside a character class is a literal pipe, not alternation, so the class is
    // written as [ATCG]; the set of nucleotide strings that match is unchanged
    if (Regex.IsMatch(combineSeq, "[ATCG]AG[ATCG]AG"))
    {
        flags.Add(LofteeFilter.Flag.nagnag_site);
    }
}
/// <summary>
/// Verifies that SequenceUtilities.GetReverseComplement turns <paramref name="bases"/>
/// into <paramref name="expectedResult"/>.
/// </summary>
public void GetReverseComplement(string bases, string expectedResult)
{
    Assert.Equal(expectedResult, SequenceUtilities.GetReverseComplement(bases));
}
/// <summary>
/// Determines the genomic change described by the notation and stores it in hn.Type
/// [Sequence.pm:482 hgvs_variant_notation]. For insertions and duplications the HGVS
/// positions are rewritten; duplications also get their reference bases and allele
/// multiple updated. hn.Type stays Unknown when a position is missing or the alleles
/// are identical.
/// </summary>
private void GetGenomicChange(Transcript transcript, HgvsNotation hn, bool isGenomicDuplicate)
{
    hn.Type = GenomicChange.Unknown;

    // make sure our positions are defined
    if (hn.Start.Position == null || hn.End.Position == null)
    {
        return;
    }

    int displayStart = (int)hn.Start.Position;
    int displayEnd   = (int)hn.End.Position;

    // length of the reference allele; a negative span is treated as empty
    int span      = displayEnd - displayStart + 1;
    int refLength = span < 0 ? 0 : span;
    int altLength = hn.AlternateBases.Length;

    // sanity check: make sure that the alleles are different
    if (hn.ReferenceBases == hn.AlternateBases)
    {
        return;
    }

    if (altLength == 0)
    {
        hn.Type = GenomicChange.Deletion;
        return;
    }

    if (refLength == altLength)
    {
        if (refLength == 1)
        {
            hn.Type = GenomicChange.Substitution;
        }
        else
        {
            // equal lengths: an inversion if alt is the reverse complement of ref, otherwise an indel
            var rcRefAllele = SequenceUtilities.GetReverseComplement(hn.ReferenceBases);
            hn.Type = hn.AlternateBases == rcRefAllele ? GenomicChange.Inversion : GenomicChange.InDel;
        }

        return;
    }

    if (refLength == 0)
    {
        // For an insertion, check whether the preceding reference nucleotides match the
        // inserted bases. In that case it is annotated as a duplication.
        bool isDuplicate = isGenomicDuplicate;
        int prevPosition = displayEnd - altLength;

        if (!isDuplicate && _compressedSequence != null && prevPosition >= 0)
        {
            // fetch as many nucleotides preceding the insertion as the insertion is long
            var precedingBases = SequenceUtilities.GetSubSubstring(transcript.Start, transcript.End,
                transcript.Gene.OnReverseStrand, prevPosition, prevPosition + altLength - 1,
                _compressedSequence);

            isDuplicate = precedingBases == hn.AlternateBases;
        }

        if (!isDuplicate)
        {
            // just an insertion: start and end are swapped relative to the display coordinates
            hn.Type           = GenomicChange.Insertion;
            hn.Start.Position = displayEnd;
            hn.End.Position   = displayStart;
            return;
        }

        // for a duplication, the HGVS positions are decremented by the alt allele length
        hn.Type = GenomicChange.Duplication;

        int duplicatedStart = displayStart - altLength;
        hn.Start.Position   = duplicatedStart;
        hn.End.Position     = duplicatedStart + altLength - 1;
        hn.AlleleMultiple   = 2;
        hn.ReferenceBases   = hn.AlternateBases;
        return;
    }

    // Reference and alternate have different, non-zero lengths. By default this is an
    // indel, unless the alt allele is the reference repeated a whole number of times.
    if (altLength % refLength == 0)
    {
        hn.AlleleMultiple = altLength / refLength;

        string repeatedRef = string.Concat(Enumerable.Repeat(hn.ReferenceBases, hn.AlleleMultiple));

        if (hn.AlternateBases == repeatedRef)
        {
            hn.Type = hn.AlleleMultiple == 2 ? GenomicChange.Duplication : GenomicChange.Multiple;
            return;
        }
    }

    // deletion/insertion
    hn.Type = GenomicChange.InDel;
}
/// <summary>
/// Shifts an insertion or deletion as far as possible in the downstream (transcript
/// orientation) direction while its bases keep matching the sequence that follows it.
/// Returns the original variant when no reference sequence is available, when the
/// variant is not an insertion/deletion, when VariantStartOverlapsRegion reports true,
/// or when no shift is possible.
/// </summary>
public static ISimpleVariant Right(ISimpleVariant simpleVariant, IInterval rotateRegion, ISequence refSequence,
    bool onReverseStrand)
{
    if (refSequence == null)
    {
        return simpleVariant;
    }

    bool isInsertion = simpleVariant.Type == VariantType.insertion;
    bool isDeletion  = simpleVariant.Type == VariantType.deletion;

    if (!isInsertion && !isDeletion)
    {
        return simpleVariant;
    }

    // if variant is before the transcript start, do not perform 3 prime shift
    if (VariantStartOverlapsRegion(simpleVariant, rotateRegion, onReverseStrand))
    {
        return simpleVariant;
    }

    string rotatingBases = GetRotatingBases(simpleVariant, onReverseStrand);
    string downStreamSeq = GetDownstreamSeq(simpleVariant, rotateRegion, refSequence, onReverseStrand,
        rotatingBases);

    string combinedSequence = rotatingBases + downStreamSeq;
    int numBases            = rotatingBases.Length;

    // slide a window of numBases along the combined sequence for as long as the base
    // leaving the window equals the base entering it
    // (shift-detection kept as is: probably a VEP bug, just use it for consistency)
    int shift = 0;

    while (shift + numBases < combinedSequence.Length &&
           combinedSequence[shift] == combinedSequence[shift + numBases])
    {
        shift++;
    }

    if (shift == 0)
    {
        return simpleVariant;
    }

    // create a new allele from the shifted window
    string rotatedSequence = combinedSequence.Substring(shift, numBases);
    int rotatedStart;
    int rotatedEnd;

    if (onReverseStrand)
    {
        // back to genomic orientation; downstream means lower genomic coordinates
        rotatedSequence = SequenceUtilities.GetReverseComplement(rotatedSequence);
        rotatedStart    = simpleVariant.Start - shift;
        rotatedEnd      = simpleVariant.End - shift;
    }
    else
    {
        rotatedStart = simpleVariant.Start + shift;
        rotatedEnd   = simpleVariant.End + shift;
    }

    // insertions carry the rotated bases in the alt allele, deletions in the ref allele
    string rotatedRefAllele = isInsertion ? simpleVariant.RefAllele : rotatedSequence;
    string rotatedAltAllele = isInsertion ? rotatedSequence : simpleVariant.AltAllele;

    return new SimpleVariant(simpleVariant.Chromosome, rotatedStart, rotatedEnd, rotatedRefAllele,
        rotatedAltAllele, simpleVariant.Type);
}
/// <summary>
/// Builds the HGVS coding (c.) notation string for a variant on a transcript.
/// </summary>
/// <param name="transcript">transcript the notation is expressed against</param>
/// <param name="variant">variant in genomic orientation</param>
/// <param name="refSequence">reference sequence passed on to the genomic change classification</param>
/// <param name="regionStart">index into transcript.TranscriptRegions for the variant start (-1 when unavailable)</param>
/// <param name="regionEnd">index into transcript.TranscriptRegions for the variant end (-1 when unavailable)</param>
/// <param name="transcriptRef">reference allele in transcript orientation; when null or empty it is derived from the variant</param>
/// <param name="transcriptAlt">alternate allele in transcript orientation; when null or empty it is derived from the variant</param>
/// <returns>
/// the HGVS c. string, or null for reference variants, alternate alleles with non-canonical
/// bases, variants that lie inside a single gap region, or missing cDNA position offsets
/// </returns>
public static string GetHgvscAnnotation(ITranscript transcript, ISimpleVariant variant, ISequence refSequence,
    int regionStart, int regionEnd, string transcriptRef, string transcriptAlt)
{
    // sanity check: skip reference alleles and alternate alleles with odd characters
    if (variant.Type == VariantType.reference || SequenceUtilities.HasNonCanonicalBase(variant.AltAllele))
    {
        return null;
    }

    // do not report HGVSc notation when variant lands inside gap region
    if (regionStart > -1 && regionEnd > -1)
    {
        var startRegion = transcript.TranscriptRegions[regionStart];
        var endRegion   = transcript.TranscriptRegions[regionEnd];

        if (startRegion.Id == endRegion.Id &&
            startRegion.Type == TranscriptRegionType.Gap &&
            endRegion.Type == TranscriptRegionType.Gap)
        {
            return null;
        }
    }

    bool onReverseStrand = transcript.Gene.OnReverseStrand;

    // prefer the caller-supplied transcript alleles; otherwise orient the genomic alleles
    string refAllele = transcriptRef;
    if (string.IsNullOrEmpty(refAllele))
    {
        refAllele = onReverseStrand
            ? SequenceUtilities.GetReverseComplement(variant.RefAllele)
            : variant.RefAllele;
    }

    string altAllele = transcriptAlt;
    if (string.IsNullOrEmpty(altAllele))
    {
        altAllele = onReverseStrand
            ? SequenceUtilities.GetReverseComplement(variant.AltAllele)
            : variant.AltAllele;
    }

    // decide event type from HGVS nomenclature
    var genomicChange = GetGenomicChange(transcript, onReverseStrand, refSequence, variant);

    int variantStart = variant.Start;
    int variantEnd   = variant.End;

    // duplications are re-anchored; this also replaces the reference allele and region indices
    if (genomicChange == GenomicChange.Duplication)
    {
        (variantStart, variantEnd, refAllele, regionStart, regionEnd) =
            transcript.TranscriptRegions.ShiftDuplication(variantStart, altAllele, onReverseStrand);
    }

    var startPositionOffset = HgvsUtilities.GetCdnaPositionOffset(transcript, variantStart, regionStart, true);
    var endPositionOffset   = variantStart == variantEnd
        ? startPositionOffset
        : HgvsUtilities.GetCdnaPositionOffset(transcript, variantEnd, regionEnd, false);

    // on the reverse strand the genomic end maps to the HGVS start and vice versa
    if (onReverseStrand)
    {
        (startPositionOffset, endPositionOffset) = (endPositionOffset, startPositionOffset);
    }

    // For insertions with a missing start, derive it from the position right after the end.
    // The end offset can be missing as well (see the sanity check below), so it must be
    // checked before it is dereferenced; otherwise this would throw instead of returning null.
    if (startPositionOffset == null && endPositionOffset != null && variant.Type == VariantType.insertion)
    {
        startPositionOffset = new PositionOffset(endPositionOffset.Position + 1, endPositionOffset.Offset,
            $"{endPositionOffset.Position + 1}", endPositionOffset.HasStopCodonNotation);
    }

    // sanity check: make sure we have coordinates
    if (startPositionOffset == null || endPositionOffset == null)
    {
        return null;
    }

    var hgvsNotation = new HgvscNotation(refAllele, altAllele, transcript.Id.WithVersion, genomicChange,
        startPositionOffset, endPositionOffset, transcript.Translation != null);

    // generic formatting
    return hgvsNotation.ToString();
}