Ejemplo n.º 1
0
 /// <summary>
 /// Ensures that neither row of a pairwise alignment begins or ends with the gap symbol.
 /// </summary>
 /// <param name="refseq">Aligned reference bases; only the first and last elements are inspected.</param>
 /// <param name="query">Aligned query bases with quality values; only the first and last elements are inspected.</param>
 /// <exception cref="FormatException">
 /// Either sequence has the gap symbol at its first or last position.  The message includes both rows rendered as text.
 /// </exception>
 internal static void VerifyNoGapsOnEnds(byte[] refseq, BPandQV[] query) {
     var gapSymbol = DnaAlphabet.Instance.Gap;

     // Checked in the order: reference start, reference end, query start, query end.
     bool endsAreClean = refseq [0] != gapSymbol &&
                         refseq [refseq.Length - 1] != gapSymbol &&
                         query [0].BP != gapSymbol &&
                         query [query.Length - 1].BP != gapSymbol;
     if (endsAreClean) {
         return;
     }

     // Render both rows as text so the offending alignment is visible in the error.
     string referenceText = new string (Array.ConvertAll (refseq, b => (char)b));
     string queryText = new string (Array.ConvertAll (query, q => (char)q.BP));
     throw new FormatException ("Alignment query and/or reference started with a gap character. " +
         "Alignments must be hard-clipped to remove starting and trailing variants " +
         "before variants can be called.  Alignment was:\n" + referenceText + "\n" + queryText);
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds the reverse complement of a sequence of base / quality-value pairs.
        ///
        /// Every output element pairs the complement of an input base with that base's own QV, and the
        /// elements are emitted in the opposite order from the input.  Because the QV for homopolymer
        /// indel errors is typically only placed at the first base of a homopolymer (though this is not
        /// standardized), the caller can request that the result additionally be passed through
        /// <c>ReverseQVValuesForHomopolymers</c>.
        /// </summary>
        /// <returns>A newly allocated array holding the reverse complemented sequence.</returns>
        /// <param name="toFlip">The sequence to reverse complement; it is not modified.</param>
        /// <param name="flipHpQvValues">If set to <c>true</c>, run <c>ReverseQVValuesForHomopolymers</c> on the result before returning it.</param>
        /// <exception cref="NotSupportedException">A base has no complement in <c>DnaAlphabet</c>.</exception>
        internal static BPandQV[] GetReverseComplementedSequence(BPandQV[] toFlip, bool flipHpQvValues = false)
        {
            int count = toFlip.Length;
            var reversed = new BPandQV[count];

            // Read the input back to front while filling the output front to back.
            for (int source = count - 1, target = 0; source >= 0; source--, target++)
            {
                var original = toFlip[source];
                byte symbol = original.BP;
                byte complement;

                if (!DnaAlphabet.Instance.TryGetComplementSymbol(symbol, out complement))
                {
                    throw new NotSupportedException("Bad character in BPandQV array: " + symbol.ToString());
                }

                reversed[target] = new BPandQV(complement, original.QV);
            }

            if (flipHpQvValues)
            {
                ReverseQVValuesForHomopolymers(reversed);
            }

            return reversed;
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Extracts a run of consecutive bases from the sequence and returns them as text.
 /// </summary>
 /// <returns>A string of <paramref name="length"/> characters, one per base, each obtained by casting the base value to <c>char</c>.</returns>
 /// <param name="array">Sequence to read from.</param>
 /// <param name="position">Index of the first element to include.</param>
 /// <param name="length">Number of elements to include.</param>
 private static string getBases(BPandQV[] array, int position, int length)
 {
     var buffer = new char[length];
     int source = position;
     for (int target = 0; target < buffer.Length; target++, source++)
     {
         buffer[target] = (char)array[source].BP;
     }
     return new string(buffer);
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Walks a pairwise alignment column by column and reports every difference between the
        /// reference and the query as a variant.
        ///
        /// Should only be used internally as assumptions are made that the alignments are left-aligned and fulfill certain criteria.
        /// </summary>
        /// <remarks>
        /// A gap in <paramref name="refSeq"/> is reported as an insertion, a gap in <paramref name="querySeq"/>
        /// as a deletion, and a column whose two bases differ as a SNP.  When
        /// <paramref name="originallyReverseComplemented"/> is <c>true</c>, <paramref name="querySeq"/> is modified
        /// in place by <c>AlignmentUtils.ReverseQVValuesForHomopolymers</c> before any variant is called.
        /// The loop is bounded only by <c>refSeq.Length</c>; both arrays are indexed with the same column index.
        /// </remarks>
        /// <returns>The variants, in the order they are encountered along the alignment.</returns>
        /// <param name="refSeq">Reference seq.</param>
        /// <param name="querySeq">Query seq.</param>
        /// <param name="originallyReverseComplemented">If set to <c>true</c> the query sequence was originally reverse complemented. (this affects QV value scoring)</param>
        internal static List<Variant> CallVariants(byte[] refSeq, BPandQV[] querySeq, bool originallyReverseComplemented)
        {
            if (originallyReverseComplemented) {
                // Mutates the caller's array.
                AlignmentUtils.ReverseQVValuesForHomopolymers (querySeq);
            }
            List<Variant> variants = new List<Variant>();

            // Now call variants.
            var gap = DnaAlphabet.Instance.Gap;
            // i is the alignment column; refPos advances only on columns where the
            // reference has a base (it is not incremented in the insertion branch).
            int i = 0;
            int refPos = 0;
            while( i < refSeq.Length)
            {
                if (refSeq[i] == gap)
                {
                    // Gap in the reference: the query bases in columns [i, i + len) are an insertion.
                    int len = AlignmentUtils.GetGapLength(i, refSeq);
                    var nextBasePos = (i + len);
                    // Should always be true as we don't end in gaps
                    Debug.Assert (nextBasePos < refSeq.Length);
                    // Homopolymer context is measured at the first reference column after the gap.
                    // NOTE(review): Item1/Item2 are presumably the run length and its base - confirm
                    // against determineHomoPolymerLength.
                    var hplenAndChar = determineHomoPolymerLength (nextBasePos, refSeq);
                    var bases = getBases(querySeq, i, len);
                    // NOTE(review): the last argument is true when the variant starts in column 0 or
                    // gap + homopolymer reach the end of refSeq; presumably an "at alignment end"
                    // flag - confirm against the IndelVariant constructor.
                    var newVariant = new IndelVariant(refPos - 1, len, bases, IndelType.Insertion,
                                                      hplenAndChar.Item2, hplenAndChar.Item1,
                                                      (i == 0 || (i + len + hplenAndChar.Item1) >= refSeq.Length));
                    // The insertion takes the QV of its first inserted query base.
                    newVariant.QV = querySeq[i].QV;
                    variants.Add(newVariant);
                    i += len;
                }
                else if (querySeq[i].BP == gap)
                {
                    // Gap in the query: the reference bases in columns [i, i + len) are a deletion.
                    int len = AlignmentUtils.GetGapLength(i, querySeq);
                    var bases = getBases(refSeq, i, len);
                    // Homopolymer context is measured at the first deleted reference column.
                    var hplenAndChar = determineHomoPolymerLength (i, refSeq);
                    var newVariant = new IndelVariant(refPos - 1, len, bases,
                                                      IndelType.Deletion, hplenAndChar.Item2,
                                                      hplenAndChar.Item1, (i == 0 || (i + len + hplenAndChar.Item1) >= refSeq.Length));
                    /* An insertion mutation occurs BEFORE pos, so normally we get the next base
                     * or the last one if it's a reverse complemented alignment.  However, this is not true if
                     * it is a homopolymer because what would have been the previous position is the next position
                     * after left aligning and reversing the position of the QV value.
                     *
                     * Consider the following
                     * --*-       -*--
                     * A-TA   --> TA-T
                     * AGTA       TACT
                     *
                     * However,
                     * --*--         --*--
                     * A-TTA   ----> T-AAT
                     * ATTTA         TAAAT
                     *
                     * NOTE(review): this comment sits in the deletion branch (gap in the query)
                     * although it speaks of an "insertion mutation" - confirm the intended wording.
                     */
                    // The QV is only assigned when a query base exists after the gap.
                    if ((i + len ) < querySeq.Length) {

                        // Default: base after the gap, or base before it (i - 1) for a
                        // reverse complemented read; inside a homopolymer always the base after.
                        var qc_pos = originallyReverseComplemented ? i - 1 : i + len;
                        if (newVariant.InHomopolymer) {
                            qc_pos = i + len;
                        }
                        newVariant.QV = querySeq[qc_pos].QV;
                    }
                    variants.Add(newVariant);
                    i += len;
                    refPos += len;
                }
                else
                {
                    // Both rows have a base in this column; a mismatch is a SNP.
                    if (querySeq[i].BP != refSeq[i])
                    {
                        // The last argument is true for the first or last alignment column.
                        var newVariant = new SNPVariant(refPos, (char) querySeq[i].BP, (char)refSeq[i], (i ==0 || i == (refSeq.Length -1)));
                        newVariant.QV = querySeq [i].QV;
                        variants.Add(newVariant);
                    }
                    i++; refPos++;
                }
            }
            return variants;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Given a pairwise alignment (a reference byte array and a query array of base/QV pairs),
        /// shift the gaps so that all indels start as early as possible.  For example:
        /// TTTTAAAATTTT   -> Converts to -> TTTTAAAATTTT
        /// TTTTAA--TTTT                     TTTT--AATTTT
        ///
        /// This modifies both arrays in place.
        /// </summary>
        /// <remarks>
        /// A gap is only moved one column to the left when the base it displaces equals the base
        /// on the opposite row at the gap's last column, and when the opposite row has a real base
        /// (not a gap) in the column the gap is moving into.  The second condition keeps a gap from
        /// being shifted on top of a gap in the other sequence, e.g. CA-G / C-AG must not become
        /// C-AG / C-AG.
        /// </remarks>
        /// <param name="refseq">Reference sequence</param>
        /// <param name="query">Query sequence</param>
        /// <exception cref="ArgumentException">The two sequences have different lengths.</exception>
        /// <exception cref="Exception">
        /// The input already contains overlapping gaps (raised by ValidateNoOverlappingGaps), or more
        /// than MAX_LOOPS passes over the alignment were needed.
        /// </exception>
        public static void LeftAlignIndels(byte[] refseq, BPandQV[] query)
        {
            // Validation
            if (refseq.Length != query.Length)
            {
                throw new ArgumentException("Alignment passed to LeftAlignIndels had unequal length sequences");
            }

            ValidateNoOverlappingGaps(refseq, query);
            byte gap = DnaAlphabet.Instance.Gap;
            // Keep left aligning until we can't anymore, this is a
            // do while loop because some downstream left alignments open up
            // further ones upstream, even though this is rare.
            int change_count = 0;
            int loopsThrough = 0;

            do
            {
                loopsThrough++;
                change_count = 0;
                // Start at 1: a gap in column 0 has nothing to its left to shift into.
                for (int i = 1; i < refseq.Length; i++)
                {
                    if (refseq[i] == gap)
                    {
                        int len        = GetGapLength(i, refseq);
                        int left_side  = i - 1;        // column the gap would move into
                        int right_side = i - 1 + len;  // last column of the gap
                        while (left_side >= 0
                               && refseq[left_side] != gap
                               // Never slide a reference gap underneath a query gap.
                               && query[left_side].BP != gap
                               && (refseq[left_side] == query[right_side].BP))
                        {
                            // Move the gap left.
                            if (right_side < refseq.Length)
                            {
                                refseq[right_side] = refseq[left_side];
                            }
                            refseq[left_side] = gap;
                            left_side--;
                            right_side--;
                            change_count++;
                        }
                        if (loopsThrough > MAX_LOOPS)
                        {
                            throw new Exception(MAX_LOOPS_ERROR);
                        }
                    }
                    else if (query[i].BP == gap)
                    {
                        int len        = GetGapLength(i, query);
                        int left_side  = i - 1;        // column the gap would move into
                        int right_side = i - 1 + len;  // last column of the gap
                        while (left_side >= 0
                               && query[left_side].BP != gap
                               // Never slide a query gap underneath a reference gap.
                               && refseq[left_side] != gap
                               && (query[left_side].BP == refseq[right_side]))
                        {
                            // Move the gap left; the displaced base keeps its QV.
                            if (right_side < query.Length)
                            {
                                query[right_side] = query[left_side];
                            }
                            query[left_side] = new BPandQV(gap, 0);
                            left_side--;
                            right_side--;
                            change_count++;
                        }
                        if (loopsThrough > MAX_LOOPS)
                        {
                            throw new Exception(MAX_LOOPS_ERROR);
                        }
                    }
                }
            } while (change_count > 0);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Given a pairwise alignment (a reference byte array and a query array of base/QV pairs),
        /// shift the gaps so that all indels start as early as possible.  For example:
        /// TTTTAAAATTTT   -> Converts to -> TTTTAAAATTTT
        /// TTTTAA--TTTT                     TTTT--AATTTT
        ///
        /// This modifies both arrays in place.
        /// </summary>
        /// <remarks>
        /// A gap is only moved one column to the left when the base it displaces equals the base
        /// on the opposite row at the gap's last column, and when the opposite row has a real base
        /// (not a gap) in the column the gap is moving into.  The second condition keeps a gap from
        /// being shifted on top of a gap in the other sequence, e.g. CA-G / C-AG must not become
        /// C-AG / C-AG.
        /// </remarks>
        /// <param name="refseq">Reference sequence</param>
        /// <param name="query">Query sequence</param>
        /// <exception cref="ArgumentException">The two sequences have different lengths.</exception>
        /// <exception cref="Exception">
        /// The input already contains overlapping gaps (raised by ValidateNoOverlappingGaps), or more
        /// than MAX_LOOPS passes over the alignment were needed.
        /// </exception>
        public static void LeftAlignIndels(byte[] refseq, BPandQV[] query)
        {
            // Validation
            if (refseq.Length != query.Length) {
                throw new ArgumentException("Alignment passed to LeftAlignIndels had unequal length sequences");
            }

            ValidateNoOverlappingGaps (refseq, query);
            byte gap = DnaAlphabet.Instance.Gap;
            // Keep left aligning until we can't anymore, this is a
            // do while loop because some downstream left alignments open up
            // further ones upstream, even though this is rare.
            int change_count = 0;
            int loopsThrough = 0;
            do
            {
                loopsThrough++;
                change_count = 0;
                // Start at 1: a gap in column 0 has nothing to its left to shift into.
                for (int i = 1; i < refseq.Length; i++)
                {
                    if (refseq[i] == gap)
                    {
                        int len = GetGapLength(i, refseq);
                        int left_side = i - 1;        // column the gap would move into
                        int right_side = i - 1 + len; // last column of the gap
                        while (left_side >= 0
                               && refseq[left_side] != gap
                               // Never slide a reference gap underneath a query gap.
                               && query[left_side].BP != gap
                               && (refseq[left_side] == query[right_side].BP))
                        {
                            // Move the gap left.
                            if (right_side < refseq.Length) {
                                refseq[right_side] = refseq[left_side];
                            }
                            refseq[left_side] = gap;
                            left_side--;
                            right_side--;
                            change_count++;
                        }
                        if (loopsThrough > MAX_LOOPS) {
                            throw new Exception(MAX_LOOPS_ERROR);
                        }
                    }
                    else if (query[i].BP == gap)
                    {
                        int len = GetGapLength(i, query);
                        int left_side = i - 1;        // column the gap would move into
                        int right_side = i - 1 + len; // last column of the gap
                        while (left_side >= 0
                               && query[left_side].BP != gap
                               // Never slide a query gap underneath a reference gap.
                               && refseq[left_side] != gap
                               && (query[left_side].BP == refseq[right_side]))
                        {
                            // Move the gap left; the displaced base keeps its QV.
                            if (right_side < query.Length) {
                                query[right_side] = query[left_side];
                            }
                            query[left_side] = new BPandQV(gap, 0);
                            left_side--;
                            right_side--;
                            change_count++;
                        }
                        if (loopsThrough > MAX_LOOPS) {
                            throw new Exception(MAX_LOOPS_ERROR);
                        }
                    }
                }
            } while (change_count > 0);
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Reverses the QV values for homopolymers.
 ///
 /// The quality value for homopolymer indel errors (that is a deletion or insertion) is typically only placed at the
 /// first base in a homopolymer (though this is not standardized).  To account for this after reverse complementing,
 /// every run of two or more identical bases has its first and last elements swapped whenever the last element
 /// carries the lower QV, so the lower of the two values ends up on the first base of the run.
 /// </summary>
 /// <remarks>
 /// The same rule is applied to every run, including the one that ends at the last element of the array.
 /// Elements in the interior of a run are never touched.
 /// </remarks>
 /// <param name="toFlip">Sequence to adjust; it is modified in place.</param>
 internal static void ReverseQVValuesForHomopolymers(BPandQV[] toFlip) {
     // Basic idea is to assume it is A, C, G, T alphabet and flip HP values
     // Also assumes all low QV is due to HP deletion/insertion error.
     int runStart = 0;

     // i == toFlip.Length acts as a sentinel so that the trailing run is closed
     // by exactly the same code path as the interior runs.
     for (int i = 1; i <= toFlip.Length; i++) {
         bool runContinues = i < toFlip.Length && toFlip [i].BP == toFlip [runStart].BP;
         if (runContinues) {
             continue;
         }

         int runEnd = i - 1;
         if (runEnd > runStart) {
             var left = toFlip [runStart];
             var right = toFlip [runEnd];
             Debug.Assert (right.BP == left.BP);
             // Only swap when that moves the lower QV to the front of the run.
             if (right.QV < left.QV) {
                 toFlip [runEnd] = left;
                 toFlip [runStart] = right;
             }
         }
         runStart = i;
     }
 }
Ejemplo n.º 8
0
        /// <summary>
        /// Produces the reverse complement of a sequence of base / quality-value pairs.
        ///
        /// The output lists the input elements in the opposite order, with each base replaced by its
        /// complement and each QV carried over unchanged.  Since the quality value for homopolymer indel
        /// errors is typically only placed at the first base in a homopolymer (though this is not
        /// standardized), the result can optionally be post-processed by
        /// <c>ReverseQVValuesForHomopolymers</c>.
        /// </summary>
        /// <returns>A newly allocated array holding the reverse complemented sequence.</returns>
        /// <param name="toFlip">The sequence to reverse complement; it is not modified.</param>
        /// <param name="flipHpQvValues">If set to <c>true</c>, run <c>ReverseQVValuesForHomopolymers</c> on the result before returning it.</param>
        /// <exception cref="NotSupportedException">A base has no complement in <c>DnaAlphabet</c>.</exception>
        internal static BPandQV[] GetReverseComplementedSequence(BPandQV[] toFlip, bool flipHpQvValues = false)
        {
            var result = new BPandQV[toFlip.Length];
            int lastIndex = toFlip.Length - 1;

            // Output slot `offset` is filled from the input element `offset` places from the end.
            for (int offset = 0; offset <= lastIndex; offset++)
            {
                var element = toFlip[lastIndex - offset];
                byte symbol = element.BP;
                byte complement;

                bool hasComplement = DnaAlphabet.Instance.TryGetComplementSymbol(symbol, out complement);
                if (!hasComplement)
                {
                    throw new NotSupportedException("Bad character in BPandQV array: " + symbol.ToString());
                }

                result[offset] = new BPandQV(complement, element.QV);
            }

            if (!flipHpQvValues)
            {
                return result;
            }

            ReverseQVValuesForHomopolymers(result);
            return result;
        }
Ejemplo n.º 9
0
 /// <summary>
 /// Rejects alignments in which some column holds a gap in both sequences,
 /// a situation that violates several assumptions.
 /// </summary>
 /// <param name="seq1">First row of the alignment; its length determines how many columns are checked.</param>
 /// <param name="seq2">Second row of the alignment, read at the same column indices.</param>
 /// <exception cref="Exception">A column has the gap symbol in both rows.</exception>
 internal static void ValidateNoOverlappingGaps(byte[] seq1, BPandQV[] seq2)
 {
     var gapSymbol = DnaAlphabet.Instance.Gap;
     int column = 0;
     while (column < seq1.Length)
     {
         // seq2 is only consulted for columns where seq1 already holds a gap.
         bool bothGapped = seq1[column] == gapSymbol && seq2[column].BP == gapSymbol;
         if (bothGapped)
         {
             throw new Exception("You have an alignment with overlapping gaps.  Input problem!");
         }
         column++;
     }
 }
Ejemplo n.º 10
0
 /// <summary>
 /// Measures the length of the gap that begins at the given position.
 /// For example:
 ///
 /// AAAA---TTTT returns 3 when called with the position of the first '-'.
 /// </summary>
 /// <remarks>
 /// The element at <paramref name="pos"/> itself is not examined; it is counted as the first
 /// gap symbol, so the result is always at least 1.
 /// </remarks>
 /// <param name="pos">0 indexed position of the first gap symbol.</param>
 /// <param name="array">Sequence containing the gap.</param>
 /// <returns>One plus the number of consecutive gap symbols that immediately follow <paramref name="pos"/>.</returns>
 public static int GetGapLength(int pos, BPandQV[] array)
 {
     var gapSymbol = DnaAlphabet.Instance.Gap;

     // Advance to the first position after pos that is not a gap (or to the end of the array).
     int next = pos + 1;
     while (next < array.Length && array[next].BP == gapSymbol)
     {
         next++;
     }
     return next - pos;
 }