Ejemplo n.º 1
        /// <summary>
        /// Constructor. Stores the supplied inputs in the instance fields, creates the string
        /// builder, puts the alternate allele into transcript orientation, computes the HGVS
        /// start/end through GetReferenceCoordinates, and builds the HgvsNotation instance.
        /// </summary>
        /// <param name="ta">transcript annotation; supplies AlternateAllele and TranscriptReferenceAllele</param>
        /// <param name="transcript">transcript; supplies Gene.OnReverseStrand, Id, Version and Translation</param>
        /// <param name="variant">variant feature, stored in _variant</param>
        /// <param name="compressedSequence">sequence object, stored in _compressedSequence</param>
        /// <param name="isGenomicDuplicate">flag stored in _isGenomicDuplicate</param>
        public HgvsCodingNomenclature(TranscriptAnnotation ta, Transcript transcript, VariantFeature variant,
                                      ICompressedSequence compressedSequence, bool isGenomicDuplicate)
            // NOTE(review): the braces around this body are absent from this excerpt, and the
            // listing may stop before the constructor actually ends.
            _ta                 = ta;
            _transcript         = transcript;
            _variant            = variant;
            _compressedSequence = compressedSequence;
            _isGenomicDuplicate = isGenomicDuplicate;

            _sb = new StringBuilder();

            // get reference sequence strand
            var transcriptOnReverseStrand = transcript.Gene.OnReverseStrand;

            // this may be different to the input one for insertions/deletions
            var    altAllele = ta.AlternateAllele;
            string variationFeatureSequence = altAllele.AlternateAllele;

            // get the reverse complement of the variation feature sequence if needed,
            // so that the allele is expressed on the transcript's strand
            if (transcriptOnReverseStrand)
                variationFeatureSequence = SequenceUtilities.GetReverseComplement(variationFeatureSequence);

            // calculate the reference start and end (written to _hgvsStart / _hgvsEnd via out parameters)
            GetReferenceCoordinates(transcript, altAllele, out _hgvsStart, out _hgvsEnd);

            // decide event type from HGVS nomenclature; the last argument is true
            // when the transcript has a translation
            _hgvsNotation = new HgvsNotation(ta.TranscriptReferenceAllele, variationFeatureSequence,
                                             FormatUtilities.CombineIdAndVersion(transcript.Id, transcript.Version), _hgvsStart, _hgvsEnd,
                                             _transcript.Translation != null);
Ejemplo n.º 2
        /// <summary>
        /// Builds the cDNA sequence by appending the sequence substring of each exon region
        /// in _regions and stores the result in _sequence.
        /// </summary>
        // NOTE(review): braces and several single-statement lines (guard bodies, the final
        // return) are missing from this excerpt; each gap is flagged below.
        public string GetCdnaSequence()
            // _sequence is tested first; the guarded statement is not visible here
            // (presumably an early return of the cached value — confirm against the full source)
            if (_sequence != null)

            var sb = StringBuilderCache.Acquire();

            foreach (var region in _regions)
                // non-exon regions are tested for; the guarded statement is not visible
                // (presumably a skip to the next region — confirm)
                if (region.Type != TranscriptRegionType.Exon)
                // region coordinates are turned into (offset, length): Start - 1 and End - Start + 1
                sb.Append(_compressedSequence.Substring(region.Start - 1, region.End - region.Start + 1));

            if (_onReverseStrand)
                // NOTE(review): reverseComplement is computed, but the statements that use it
                // are not visible in this excerpt
                string reverseComplement = SequenceUtilities.GetReverseComplement(sb.ToString());


            // the builder's content becomes the cached sequence and the builder is released
            _sequence = StringBuilderCache.GetStringAndRelease(sb);
Ejemplo n.º 3
        /// <summary>
        /// Retrieves all Exon sequences and concats them together.
        /// This includes 5' UTR + cDNA + 3' UTR [Transcript.pm:862 spliced_seq]
        /// </summary>
        /// <param name="refSequence">sequence to read exon bases from; may be null (see the sanity check)</param>
        /// <param name="regions">transcript regions; only those of type Exon are tested for</param>
        /// <param name="onReverseStrand">when true, the concatenated result is reverse-complemented</param>
        /// <returns>the concatenated exon sequence, reverse-complemented if requested</returns>
        private static string GetSplicedSequence(ISequence refSequence, ITranscriptRegion[] regions, bool onReverseStrand)
            // NOTE(review): braces and some single-statement lines are missing from this excerpt.
            var sb = StringBuilderCache.Acquire();

            foreach (var region in regions)
                // guarded statement not visible (presumably a skip of non-exon regions — confirm)
                if (region.Type != TranscriptRegionType.Exon)
                var exonLength = region.End - region.Start + 1;

                // sanity check: handle the situation where no reference has been provided
                // by padding with exonLength 'N' characters
                // NOTE(review): as shown, execution would continue to the Substring call below
                // and dereference a null refSequence; a skip statement is presumably missing here.
                if (refSequence == null)
                    sb.Append(new string('N', exonLength));

                sb.Append(refSequence.Substring(region.Start - 1, exonLength));

            var results = StringBuilderCache.GetStringAndRelease(sb);

            return(onReverseStrand ? SequenceUtilities.GetReverseComplement(results) : results);
Ejemplo n.º 4
        /// <summary>
        /// Selects the bases to rotate: the alternate allele for insertions, the reference
        /// allele for every other variant type, reverse-complemented when onReverseStrand is true.
        /// </summary>
        // NOTE(review): the braces and the return statement are not visible in this excerpt.
        private static string GetRotatingBases(ISimpleVariant simpleVariant, bool onReverseStrand)
            string rotatingBases = simpleVariant.Type == VariantType.insertion ? simpleVariant.AltAllele : simpleVariant.RefAllele;

            // put the bases into transcript orientation
            rotatingBases = onReverseStrand ? SequenceUtilities.GetReverseComplement(rotatingBases) : rotatingBases;
Ejemplo n.º 5
        /// <summary>
        /// Builds the HgvscNotation for a variant on a transcript: orients the alleles to the
        /// transcript strand, classifies the change, shifts duplications, and resolves the
        /// start/end cDNA position offsets.
        /// </summary>
        // NOTE(review): braces, guard bodies and the tail of the method (formatting and return)
        // are missing from this excerpt.
        public static string GetHgvscAnnotation(ITranscript transcript, ISimpleVariant variant, ISequence refSequence,
                                                int regionStart, int regionEnd)
            // sanity check: don't try to handle odd characters, make sure this is not a reference allele,
            //               and make sure that we have protein coordinates
            // NOTE(review): only the reference-type and non-canonical-base checks are visible;
            // the guarded statement (presumably an early return) is missing.
            if (variant.Type == VariantType.reference || SequenceUtilities.HasNonCanonicalBase(variant.AltAllele))

            var onReverseStrand = transcript.Gene.OnReverseStrand;

            // alleles expressed on the transcript's strand
            var refAllele = onReverseStrand ? SequenceUtilities.GetReverseComplement(variant.RefAllele) : variant.RefAllele;
            var altAllele = onReverseStrand ? SequenceUtilities.GetReverseComplement(variant.AltAllele) : variant.AltAllele;

            // decide event type from HGVS nomenclature
            var genomicChange = GetGenomicChange(transcript, onReverseStrand, refSequence, variant);

            var variantStart = variant.Start;
            var variantEnd   = variant.End;

            // duplications get their coordinates, reference allele and region indices replaced
            // by the values returned from ShiftDuplication
            if (genomicChange == GenomicChange.Duplication)
                (variantStart, variantEnd, refAllele, regionStart, regionEnd) = transcript.TranscriptRegions.ShiftDuplication(variantStart, altAllele, onReverseStrand);

            // a single-position variant reuses the start offset for the end
            var startPositionOffset = HgvsUtilities.GetCdnaPositionOffset(transcript, variantStart, regionStart);
            var endPositionOffset   = variantStart == variantEnd
                ? startPositionOffset
                : HgvsUtilities.GetCdnaPositionOffset(transcript, variantEnd, regionEnd);

            // on the reverse strand the start and end offsets are swapped
            if (onReverseStrand)
                var tmp = startPositionOffset;
                startPositionOffset = endPositionOffset;
                endPositionOffset   = tmp;

            // sanity check: make sure we have coordinates
            // (guarded statement not visible — presumably an early return)
            if (startPositionOffset == null || endPositionOffset == null)

            // NOTE(review): this is the genomic span of the transcript (End - Start + 1)
            var transcriptLen = transcript.End - transcript.Start + 1;

            // HGVS notation past the transcript (guarded statement not visible)
            if (startPositionOffset.Position > transcriptLen || endPositionOffset.Position > transcriptLen)

            // the last argument is true when the transcript has a translation
            var hgvsNotation = new HgvscNotation(refAllele, altAllele, transcript.Id.WithVersion, genomicChange,
                                                 startPositionOffset, endPositionOffset, transcript.Translation != null);

            // generic formatting
Ejemplo n.º 6
        /// <summary>
        /// Classifies a variant as a GenomicChange by comparing the reference length
        /// (End - Start + 1, floored at 0) with the alternate allele length.
        /// </summary>
        /// <returns>
        /// In the visible lines: Inversion or DelIns for equal-length alleles longer than one
        /// base, and Duplication or Insertion for insertions.
        /// </returns>
        // NOTE(review): braces and most single-line return statements are missing from this excerpt.
        public static GenomicChange GetGenomicChange(IInterval interval, bool onReverseStrand, ISequence refSequence, ISimpleVariant variant)
            // length of the reference allele. Negative lengths make no sense
            int refLength = variant.End - variant.Start + 1;

            if (refLength < 0)
                refLength = 0;

            // length of alternative allele
            int altLength = variant.AltAllele.Length;

            // sanity check: make sure that the alleles are different
            // (guarded return not visible)
            if (variant.RefAllele == variant.AltAllele)

            // deletion (guarded return not visible)
            if (altLength == 0)

            if (refLength == altLength)
                // substitution (guarded return not visible)
                if (refLength == 1)

                // inversion: the alternate allele equals the reverse complement of the reference
                // allele; any other equal-length change is a delins
                string rcRefAllele = SequenceUtilities.GetReverseComplement(variant.RefAllele);
                return(variant.AltAllele == rcRefAllele ? GenomicChange.Inversion : GenomicChange.DelIns);

            // deletion/insertion (guarded return not visible)
            if (refLength != 0)

            // If this is an insertion, we should check if the preceding reference nucleotides
            // match the insertion. In that case it should be annotated as a duplication.
            bool isGenomicDuplicate = HgvsUtilities.IsDuplicateWithinInterval(refSequence, variant, interval, onReverseStrand);

            return(isGenomicDuplicate ? GenomicChange.Duplication : GenomicChange.Insertion);
Ejemplo n.º 7
        /// <summary>
        /// Builds the coding sequence: appends the part of each exon region that overlaps
        /// _codingRegion, pads with _startExonPhase 'N' characters, and reverse-complements
        /// the result for genes on the reverse strand.
        /// </summary>
        // NOTE(review): braces and the bodies of the two skip checks in the loop are missing
        // from this excerpt.
        public string GetCodingSequence()
            var sb = StringBuilderCache.Acquire(Length);

            // account for the exon phase (forward orientation): the N padding goes in front
            if (_startExonPhase > 0 && !_geneOnReverseStrand)
                sb.Append('N', _startExonPhase);

            foreach (var region in _regions)
                // guarded statement not visible (presumably a skip of non-exon regions — confirm)
                if (region.Type != TranscriptRegionType.Exon)

                // handle exons that are entirely in the UTR (guarded statement not visible)
                if (region.End < _codingRegion.Start || region.Start > _codingRegion.End)

                int tempBegin = region.Start;
                int tempEnd   = region.End;

                // trim the first and last exons to the coding region boundaries
                if (_codingRegion.Start >= tempBegin && _codingRegion.Start <= tempEnd)
                    tempBegin = _codingRegion.Start;
                if (_codingRegion.End >= tempBegin && _codingRegion.End <= tempEnd)
                    tempEnd = _codingRegion.End;

                sb.Append(_compressedSequence.Substring(tempBegin - 1, tempEnd - tempBegin + 1));

            // account for the exon phase (reverse orientation): the padding is appended at the
            // end here because the whole string is reverse-complemented below
            if (_startExonPhase > 0 && _geneOnReverseStrand)
                sb.Append('N', _startExonPhase);

            var s = StringBuilderCache.GetStringAndRelease(sb);

            return(_geneOnReverseStrand ? SequenceUtilities.GetReverseComplement(s) : s);
Ejemplo n.º 8
        /// <summary>
        /// Sets the transcript-oriented reference and alternate alleles on the annotation and
        /// then maps the allele's start/end through CdnaMapper.MapCoordinates.
        /// </summary>
        // NOTE(review): braces and apparently an `else` are missing from this excerpt. As shown,
        // the two plain assignments would overwrite the reverse-complemented ones; presumably
        // they belong to the forward-strand branch — confirm against the full source.
        private static void MapCdnaCoordinates(Transcript transcript, TranscriptAnnotation ta, VariantAlternateAllele altAllele)
            if (transcript.Gene.OnReverseStrand)
                // reverse strand: alleles are reverse-complemented
                ta.TranscriptReferenceAllele = SequenceUtilities.GetReverseComplement(altAllele.ReferenceAllele);
                ta.TranscriptAlternateAllele = SequenceUtilities.GetReverseComplement(altAllele.AlternateAllele);
                // alleles copied unchanged
                ta.TranscriptReferenceAllele = altAllele.ReferenceAllele;
                ta.TranscriptAlternateAllele = altAllele.AlternateAllele;

            CdnaMapper.MapCoordinates(altAllele.Start, altAllele.End, ta, transcript);
Ejemplo n.º 9
        /// <summary>
        /// Reads the bases downstream of the variant (in transcript orientation) from
        /// refSequence, over a length stored in downStreamLength.
        /// </summary>
        // NOTE(review): braces, the return statement and part of the downStreamLength
        // expression are missing from this excerpt.
        private static string GetDownstreamSeq(IInterval simpleVariant, IInterval rotateRegion,
                                               ISequence refSequence, bool onReverseStrand, string rotatingBases)
            // distance from the variant to the edge of rotateRegion in the direction of rotation:
            // towards rotateRegion.Start on the reverse strand, towards rotateRegion.End otherwise
            int basesToEnd       = onReverseStrand ? simpleVariant.Start - rotateRegion.Start : rotateRegion.End - simpleVariant.End;
            // NOTE(review): the expression below is truncated (at least one line is missing
            // between the two visible lines), so it cannot be documented from here
            int downStreamLength =
                                  MaxDownstreamLength)); // for large rotatingBases, we need to factor in its length but still make sure that we do not go past the end of transcript

            // reverse strand: take downStreamLength bases ending just before simpleVariant.Start
            // and reverse-complement them; forward strand: take the bases from offset simpleVariant.End
            string downStreamSeq = onReverseStrand
                ? SequenceUtilities.GetReverseComplement(
                refSequence.Substring(simpleVariant.Start - 1 - downStreamLength, downStreamLength))
                : refSequence.Substring(simpleVariant.End, downStreamLength);

Ejemplo n.º 10
        /// <summary>
        /// Builds the coding sequence and stores it in _sequence: adds every exon region that
        /// overlaps _codingRegion through AddCodingRegion and pads with _startExonPhase
        /// 'N' characters.
        /// </summary>
        // NOTE(review): braces, guard bodies, the statements that use revComp and the final
        // return are missing from this excerpt.
        public string GetCodingSequence()
            // guarded statement not visible (presumably returns the cached value — confirm)
            if (_sequence != null)

            var sb = StringBuilderCache.Acquire(Length);

            // account for the exon phase (forward orientation)
            if (_startExonPhase > 0 && !_geneOnReverseStrand)
                sb.Append('N', _startExonPhase);

            foreach (var region in _regions)
                // handle exons that are entirely in the UTR
                // (the same condition also matches non-exon regions; guarded statement not visible)
                if (region.Type != TranscriptRegionType.Exon || region.End < _codingRegion.Start || region.Start > _codingRegion.End)
                AddCodingRegion(region, sb);

            // account for the exon phase (reverse orientation)
            if (_startExonPhase > 0 && _geneOnReverseStrand)
                sb.Append('N', _startExonPhase);
            if (_geneOnReverseStrand)
                // NOTE(review): revComp is computed, but the statements that use it are not visible
                var revComp = SequenceUtilities.GetReverseComplement(sb.ToString());
            //RNA edits for transcripts on reverse strand come with reversed bases. So, no positional or base adjustment necessary
            // ref: unit test with NM_031947.3, chr5:140682196-140683630
            _sequence = StringBuilderCache.GetStringAndRelease(sb);

Ejemplo n.º 11
        /// <summary>
        /// extracts the coding sequence corresponding to the listed exons
        /// </summary>
        /// <returns>
        /// the content of _sb, reverse-complemented when _geneOnReverseStrand is true
        /// </returns>
        public string Sequence()
            // NOTE(review): braces are missing from this excerpt, and the blank line after the
            // signature may hide a dropped statement. _sb is an instance-level builder and no
            // reset is visible here — confirm it is cleared before reuse.

            // account for the exon phase (forward orientation)
            if (_startExonPhase > 0 && !_geneOnReverseStrand)
                _sb.Append('N', _startExonPhase);

            foreach (var map in _cdnaMaps)
                // handle exons that are entirely in the UTR, i.e. outside [_start, _end]
                // (guarded statement not visible — presumably a skip)
                if (map.GenomicEnd < _start || map.GenomicStart > _end)

                int tempBegin = map.GenomicStart;
                int tempEnd   = map.GenomicEnd;

                // trim the first and last exons to the _start / _end boundaries
                if (_start >= tempBegin && _start <= tempEnd)
                    tempBegin = _start;
                if (_end >= tempBegin && _end <= tempEnd)
                    tempEnd = _end;

                _sb.Append(_sequence.Substring(tempBegin - 1, tempEnd - tempBegin + 1));

            // account for the exon phase (reverse orientation): appended at the end because the
            // whole string is reverse-complemented below
            if (_startExonPhase > 0 && _geneOnReverseStrand)
                _sb.Append('N', _startExonPhase);

            return(_geneOnReverseStrand ? SequenceUtilities.GetReverseComplement(_sb.ToString()) : _sb.ToString());
Ejemplo n.º 12
        /// <summary>
        /// Retrieves all Exon sequences and concats them together.
        /// This includes 5' UTR + cDNA + 3' UTR [Transcript.pm:862 spliced_seq]
        /// </summary>
        /// <param name="compressedSequence">sequence to read exon bases from; may be null (see the sanity check)</param>
        /// <param name="cdnaMaps">coordinate maps whose GenomicStart/GenomicEnd delimit each exon</param>
        /// <param name="onReverseStrand">when true, the concatenated result is reverse-complemented</param>
        /// <returns>the concatenated exon sequence, reverse-complemented if requested</returns>
        private static string GetSplicedSequence(ICompressedSequence compressedSequence, CdnaCoordinateMap[] cdnaMaps, bool onReverseStrand)
            // NOTE(review): braces and some single-statement lines are missing from this excerpt.
            var sb = new StringBuilder();

            foreach (var exon in cdnaMaps)
                var exonLength = exon.GenomicEnd - exon.GenomicStart + 1;

                // sanity check: handle the situation where no reference has been provided
                // by padding with exonLength 'N' characters
                // NOTE(review): as shown, execution would continue to the Substring call below
                // and dereference a null compressedSequence; a skip statement is presumably missing.
                if (compressedSequence == null)
                    sb.Append(new string('N', exonLength));

                sb.Append(compressedSequence.Substring(exon.GenomicStart - 1, exonLength));

            return(onReverseStrand ? SequenceUtilities.GetReverseComplement(sb.ToString()) : sb.ToString());
Ejemplo n.º 13
        /// <summary>
        /// Reads the two-base acceptor and donor sequences from the introns adjacent to the
        /// affected exon and tests whether either differs from "AG" (acceptor) or "GT" (donor).
        /// </summary>
        // NOTE(review): braces, guard bodies and the statement guarded by the final condition
        // (presumably adding an entry to `filters`) are missing from this excerpt.
        private void CheckNonCanonicalSpliceSurr(IAnnotatedTranscript ta, Transcript transcript,
                                                 HashSet <LofteeFilter.Filter> filters, ICompressedSequence sequence)
            // guarded statement not visible (presumably an early return)
            if (ta.Exons == null)
            // the exon index is the leading number of ta.Exons: the text before the first '/'
            // and, within that, before the first '-'
            int affectedExonIndex = Convert.ToInt32(ta.Exons.Split('/').First().Split('-').First());
            var totalExons        = transcript.CdnaMaps.Length;

            string surrDonor    = null;
            string surrAcceptor = null;

            // guarded statement not visible (presumably an early return for single-exon transcripts)
            if (totalExons <= 1)

            var onReverseStrand = transcript.Gene.OnReverseStrand;

            // acceptor side: only when the affected exon is not the first one
            if (affectedExonIndex > 1)
                // the intron index is counted from the other end of Introns on the reverse strand
                var intron        = onReverseStrand ? transcript.Introns[totalExons - affectedExonIndex] : transcript.Introns[affectedExonIndex - 2];
                // two bases at the intron's start (reverse strand) or at its end (forward strand)
                int acceptorStart = onReverseStrand ? intron.Start : intron.End - 1;
                var acceptorSeq   = sequence.Substring(acceptorStart - 1, 2);
                surrAcceptor = onReverseStrand ? SequenceUtilities.GetReverseComplement(acceptorSeq) : acceptorSeq;

            // donor side: only when the affected exon is not the last one
            if (affectedExonIndex < totalExons)
                var intron     = onReverseStrand ? transcript.Introns[totalExons - affectedExonIndex - 1] : transcript.Introns[affectedExonIndex - 1];
                // two bases at the intron's end (reverse strand) or at its start (forward strand)
                int donorStart = onReverseStrand ? intron.End - 1 : intron.Start;
                var donorSeq   = sequence.Substring(donorStart - 1, 2);
                surrDonor = onReverseStrand ? SequenceUtilities.GetReverseComplement(donorSeq) : donorSeq;

            // true when an acceptor was read and is not "AG", or a donor was read and is not "GT"
            if (surrAcceptor != null && surrAcceptor != "AG" || surrDonor != null && surrDonor != "GT")
Ejemplo n.º 14
        /// <summary>
        /// For an allele whose ReferenceBegin equals its ReferenceEnd, reads the bases around
        /// that position and tests them against an "NAGNAG"-style pattern.
        /// </summary>
        // NOTE(review): braces, the guard body and the statement guarded by the regex match
        // (presumably adding an entry to `flags`) are missing from this excerpt.
        private void CheckNagnagSite(Transcript transcript, IAnnotatedAlternateAllele allele,
                                     HashSet <LofteeFilter.Flag> flags, ICompressedSequence sequence)
            // only alleles with a defined, single reference position are considered
            // (guarded statement not visible — presumably an early return)
            if (allele.ReferenceBegin == null || allele.ReferenceEnd == null ||
                allele.ReferenceBegin.Value != allele.ReferenceEnd.Value)

            int pos = allele.ReferenceBegin.Value;

            // 6 bases starting at offset pos - 6, and 5 bases starting at offset pos
            string upStreamSeq   = sequence.Substring(pos - 6, 6);
            string downStreamSeq = sequence.Substring(pos, 5);

            // the 11-base window is reverse-complemented for genes on the reverse strand
            var combineSeq = transcript.Gene.OnReverseStrand
                ? SequenceUtilities.GetReverseComplement(upStreamSeq + downStreamSeq)
                : upStreamSeq + downStreamSeq;

            // NOTE(review): inside a character class '|' is a literal, so [A|T|C|G] also matches
            // a '|' character; [ATCG] is presumably what was intended — confirm before changing.
            if (Regex.Match(combineSeq, "[A|T|C|G]AG[A|T|C|G]AG").Success)
Ejemplo n.º 15
        /// <summary>
        /// Test: SequenceUtilities.GetReverseComplement(bases) must equal expectedResult.
        /// </summary>
        // NOTE(review): the braces and any test attributes supplying the (bases, expectedResult)
        // pairs are not visible in this excerpt.
        public void GetReverseComplement(string bases, string expectedResult)
            var observedResult = SequenceUtilities.GetReverseComplement(bases);

            Assert.Equal(expectedResult, observedResult);
Ejemplo n.º 16
        /// <summary>
        /// get the genomic change that resulted from this variation [Sequence.pm:482 hgvs_variant_notation]
        /// </summary>
        /// <remarks>
        /// Writes the result to hn.Type. For insertions and duplications it also rewrites
        /// hn.Start.Position / hn.End.Position, and for duplications and multiples
        /// hn.AlleleMultiple (duplications additionally overwrite hn.ReferenceBases).
        /// </remarks>
        private void GetGenomicChange(Transcript transcript, HgvsNotation hn, bool isGenomicDuplicate)
            // NOTE(review): braces, `else` keywords and the early-return statements that separate
            // the cases below are missing from this excerpt. Read as-is, every case would fall
            // through to the final InDel assignment; presumably each case returns — confirm.
            hn.Type = GenomicChange.Unknown;

            // make sure our positions are defined (guarded return not visible)
            if (hn.Start.Position == null || hn.End.Position == null)

            int displayStart = (int)hn.Start.Position;
            int displayEnd   = (int)hn.End.Position;

            // length of the reference allele. Negative lengths make no sense
            int refLength = displayEnd - displayStart + 1;

            if (refLength < 0)
                refLength = 0;

            // length of alternative allele
            var altLength = hn.AlternateBases.Length;

            // sanity check: make sure that the alleles are different (guarded return not visible)
            if (hn.ReferenceBases == hn.AlternateBases)

            // deletion
            if (altLength == 0)
                hn.Type = GenomicChange.Deletion;

            if (refLength == altLength)
                // substitution
                if (refLength == 1)
                    hn.Type = GenomicChange.Substitution;

                // inversion: the alternate bases equal the reverse complement of the reference
                // bases; any other equal-length change is an indel
                var rcRefAllele = SequenceUtilities.GetReverseComplement(hn.ReferenceBases);
                hn.Type = hn.AlternateBases == rcRefAllele ? GenomicChange.Inversion : GenomicChange.InDel;

            // If this is an insertion, we should check if the preceding reference nucleotides
            // match the insertion. In that case it should be annotated as a duplication.
            if (refLength == 0)
                int prevPosition = displayEnd - altLength;

                // only checked when the caller has not already flagged a genomic duplicate
                if (!isGenomicDuplicate && _compressedSequence != null && prevPosition >= 0)
                    // Get the same number of nucleotides preceding the insertion as the length of
                    // the insertion
                    var precedingBases = SequenceUtilities.GetSubSubstring(transcript.Start, transcript.End,
                                                                           transcript.Gene.OnReverseStrand, prevPosition, prevPosition + altLength - 1, _compressedSequence);
                    if (precedingBases == hn.AlternateBases)
                        isGenomicDuplicate = true;

                if (isGenomicDuplicate)
                    hn.Type = GenomicChange.Duplication;

                    // for duplication, the hgvs positions are decremented by alt allele length
                    var incrementLength = altLength;
                    hn.Start.Position = displayStart - incrementLength;
                    hn.End.Position   = hn.Start.Position + incrementLength - 1;

                    hn.AlleleMultiple = 2;
                    hn.ReferenceBases = hn.AlternateBases;

                // otherwise just an insertion; start and end are swapped relative to the
                // display coordinates (Start = displayEnd, End = displayStart)
                hn.Type           = GenomicChange.Insertion;
                hn.Start.Position = displayEnd;
                hn.End.Position   = displayStart;

            // Otherwise, the reference and allele are of different lengths. By default, this is
            // a delins but we need to check if the alt allele is a multiplication of the reference.
            // Check if the length of the alt allele is a multiple of the reference allele
            // NOTE(review): refLength must be non-zero here; this relies on the refLength == 0
            // case above having returned (that return is not visible in this excerpt).
            if (altLength % refLength == 0)
                hn.AlleleMultiple = altLength / refLength;
                string multRefAllele = string.Concat(Enumerable.Repeat(hn.ReferenceBases, hn.AlleleMultiple));

                // exactly two copies is a duplication, more is a "multiple"
                if (hn.AlternateBases == multRefAllele)
                    hn.Type = hn.AlleleMultiple == 2 ? GenomicChange.Duplication : GenomicChange.Multiple;

            // deletion/insertion
            hn.Type = GenomicChange.InDel;
Ejemplo n.º 17
        /// <summary>
        /// Rotates an insertion or deletion along the transcript by comparing the rotating bases
        /// with the downstream sequence, and returns a new SimpleVariant at the shifted position.
        /// </summary>
        /// <returns>
        /// In the visible lines: a new SimpleVariant with the rotated start, end and alleles.
        /// The returns for the guard cases are not visible in this excerpt.
        /// </returns>
        // NOTE(review): braces, guard bodies, the loop-exit statement and an `else` are missing
        // from this excerpt.
        public static ISimpleVariant Right(ISimpleVariant simpleVariant, IInterval rotateRegion, ISequence refSequence, bool onReverseStrand)
            // guarded statement not visible (presumably an early return)
            if (refSequence == null)

            // only deletions and insertions are rotated (guarded statement not visible)
            if (simpleVariant.Type != VariantType.deletion && simpleVariant.Type != VariantType.insertion)

            // guarded statement not visible
            if (VariantStartOverlapsRegion(simpleVariant, rotateRegion, onReverseStrand))
            // if variant is before the transcript start, do not perform 3 prime shift

            string rotatingBases = GetRotatingBases(simpleVariant, onReverseStrand);

            string downStreamSeq = GetDownstreamSeq(simpleVariant, rotateRegion, refSequence, onReverseStrand, rotatingBases);

            string combinedSequence = rotatingBases + downStreamSeq;

            int shiftStart, shiftEnd;
            var hasShifted = false;

            // probably a VEP bug, just use it for consistency
            int numBases = rotatingBases.Length;

            // slide a window of numBases over combinedSequence, comparing the base leaving the
            // window with the base entering it
            for (shiftStart = 0, shiftEnd = numBases; shiftEnd < combinedSequence.Length; shiftStart++, shiftEnd++)
                // guarded statement not visible (presumably the loop exit on the first mismatch — confirm)
                if (combinedSequence[shiftStart] != combinedSequence[shiftEnd])
                hasShifted = true;

            // guarded statement not visible (presumably returns when nothing was shifted)
            if (!hasShifted)

            // create a new alternative allele
            string rotatedSequence = combinedSequence.Substring(shiftStart, numBases);
            int    rotatedStart    = simpleVariant.Start + shiftStart;
            int    rotatedEnd      = simpleVariant.End + shiftStart;

            // on the reverse strand the bases are put back into the original orientation and
            // the coordinates move in the opposite direction
            if (onReverseStrand)
                rotatedSequence = SequenceUtilities.GetReverseComplement(rotatedSequence);
                rotatedStart    = simpleVariant.Start - shiftStart;
                rotatedEnd      = simpleVariant.End - shiftStart;

            string rotatedRefAllele = simpleVariant.RefAllele;
            string rotatedAltAllele = simpleVariant.AltAllele;

            // NOTE(review): an `else` is presumably missing between the two assignments below
            // (insertion -> alt allele, otherwise -> ref allele) — confirm against the full source
            if (simpleVariant.Type == VariantType.insertion)
                rotatedAltAllele = rotatedSequence;
                rotatedRefAllele = rotatedSequence;

            return(new SimpleVariant(simpleVariant.Chromosome, rotatedStart, rotatedEnd, rotatedRefAllele,
                                     rotatedAltAllele, simpleVariant.Type));
Ejemplo n.º 18
        /// <summary>
        /// Builds the HgvscNotation for a variant on a transcript: checks for gap regions, picks
        /// the transcript alleles (explicit transcriptRef/transcriptAlt when supplied, otherwise
        /// the strand-oriented variant alleles), classifies the change, shifts duplications and
        /// resolves the start/end cDNA position offsets.
        /// </summary>
        // NOTE(review): braces, guard bodies and the tail of the method (formatting and return)
        // are missing from this excerpt.
        public static string GetHgvscAnnotation(ITranscript transcript, ISimpleVariant variant, ISequence refSequence,
                                                int regionStart, int regionEnd, string transcriptRef, string transcriptAlt)
            // sanity check: don't try to handle odd characters, make sure this is not a reference allele,
            //               and make sure that we have protein coordinates
            // NOTE(review): only the reference-type and non-canonical-base checks are visible;
            // the guarded statement (presumably an early return) is missing.
            if (variant.Type == VariantType.reference || SequenceUtilities.HasNonCanonicalBase(variant.AltAllele))

            // do not report HGVSc notation when variant lands inside gap region
            // (both region indices must be valid, refer to the same region id, and be of type Gap;
            // the guarded statement is not visible)
            if (regionStart > -1 && regionEnd > -1)
                var startRegion = transcript.TranscriptRegions[regionStart];
                var endRegion   = transcript.TranscriptRegions[regionEnd];
                if (startRegion.Id == endRegion.Id && startRegion.Type == TranscriptRegionType.Gap &&
                    endRegion.Type == TranscriptRegionType.Gap)

            bool onReverseStrand = transcript.Gene.OnReverseStrand;

            // explicit transcript alleles win; otherwise fall back to the variant alleles,
            // reverse-complemented on the reverse strand
            string refAllele = string.IsNullOrEmpty(transcriptRef)? onReverseStrand ? SequenceUtilities.GetReverseComplement(variant.RefAllele) : variant.RefAllele
                : transcriptRef;
            string altAllele = string.IsNullOrEmpty(transcriptAlt)
                ? onReverseStrand ? SequenceUtilities.GetReverseComplement(variant.AltAllele) : variant.AltAllele
                : transcriptAlt;

            // decide event type from HGVS nomenclature
            var genomicChange = GetGenomicChange(transcript, onReverseStrand, refSequence, variant);

            int variantStart = variant.Start;
            int variantEnd   = variant.End;

            // duplications get their coordinates, reference allele and region indices replaced
            // by the values returned from ShiftDuplication
            if (genomicChange == GenomicChange.Duplication)
                (variantStart, variantEnd, refAllele, regionStart, regionEnd) = transcript.TranscriptRegions.ShiftDuplication(variantStart, altAllele, onReverseStrand);

            // a single-position variant reuses the start offset for the end; the trailing bool
            // differs between the start (true) and end (false) lookups
            var startPositionOffset = HgvsUtilities.GetCdnaPositionOffset(transcript, variantStart, regionStart, true);
            var endPositionOffset   = variantStart == variantEnd
                ? startPositionOffset
                : HgvsUtilities.GetCdnaPositionOffset(transcript, variantEnd, regionEnd, false);

            // on the reverse strand the start and end offsets are swapped
            if (onReverseStrand)
                var tmp = startPositionOffset;
                startPositionOffset = endPositionOffset;
                endPositionOffset   = tmp;

            // insertion without a start offset: derive one from the end offset, one position later
            // NOTE(review): endPositionOffset is dereferenced here before the null check below —
            // confirm it cannot be null on this path.
            if (startPositionOffset == null && variant.Type == VariantType.insertion)
                startPositionOffset = new PositionOffset(endPositionOffset.Position + 1, endPositionOffset.Offset, $"{endPositionOffset.Position + 1}", endPositionOffset.HasStopCodonNotation);

            // sanity check: make sure we have coordinates
            // (guarded statement not visible — presumably an early return)
            if (startPositionOffset == null || endPositionOffset == null)

            // the last argument is true when the transcript has a translation
            var hgvsNotation = new HgvscNotation(refAllele, altAllele, transcript.Id.WithVersion, genomicChange,
                                                 startPositionOffset, endPositionOffset, transcript.Translation != null);

            // generic formatting