/// <summary>
/// Ensures that neither sequence of the alignment has a gap symbol in its first or last
/// position; a <see cref="FormatException"/> showing both sequences is raised otherwise.
/// </summary>
/// <param name="refseq">Reference row of the alignment.</param>
/// <param name="query">Query row of the alignment.</param>
/// <exception cref="FormatException">Either row has a gap at its first or last position.</exception>
internal static void VerifyNoGapsOnEnds(byte[] refseq, BPandQV[] query)
{
    var gapSymbol = DnaAlphabet.Instance.Gap;

    // The four end positions are inspected in the same order as before:
    // reference start, reference end, query start, query end.
    bool endsAreClean = refseq[0] != gapSymbol
        && refseq[refseq.Length - 1] != gapSymbol
        && query[0].BP != gapSymbol
        && query[query.Length - 1].BP != gapSymbol;
    if (endsAreClean)
    {
        return;
    }

    // Render both rows as text so the exception shows the alignment that was rejected.
    string referenceText = new string(Array.ConvertAll(refseq, b => (char)b));
    string queryText = new string(Array.ConvertAll(query, q => (char)q.BP));
    string message = "Alignment query and/or reference started with a gap character. "
        + "Alignments must be hard-clipped to remove starting and trailing variants "
        + "before variants can be called. Alignment was:\n"
        + referenceText + "\n" + queryText;
    throw new FormatException(message);
}
/// <summary>
/// Builds a new array holding the reverse complement of the input: element order is reversed,
/// every base is replaced by its complement and each quality value travels with its base.
///
/// Homopolymer indel quality is typically recorded only on the first base of a run (this is
/// not standardized), so a plain reversal leaves that value on the last base. Passing
/// <paramref name="flipHpQvValues"/> runs ReverseQVValuesForHomopolymers on the result to
/// compensate.
/// </summary>
/// <returns>A newly allocated reverse complemented sequence; the input is not modified.</returns>
/// <param name="toFlip">The bases and QV values to reverse complement.</param>
/// <param name="flipHpQvValues">If set to <c>true</c>, homopolymer QV values are flipped in the result.</param>
/// <exception cref="NotSupportedException">A base has no complement in the DNA alphabet.</exception>
internal static BPandQV[] GetReverseComplementedSequence(BPandQV[] toFlip, bool flipHpQvValues = false)
{
    int count = toFlip.Length;
    var reversed = new BPandQV[count];

    // Walk the input from its tail while filling the output from its head.
    for (int target = 0, source = count - 1; target < count; target++, source--)
    {
        byte symbol = toFlip[source].BP;
        byte complement;
        bool known = DnaAlphabet.Instance.TryGetComplementSymbol(symbol, out complement);
        if (!known)
        {
            throw new NotSupportedException("Bad character in BPandQV array: " + symbol.ToString());
        }

        reversed[target] = new BPandQV(complement, toFlip[source].QV);
    }

    if (flipHpQvValues)
    {
        ReverseQVValuesForHomopolymers(reversed);
    }

    return reversed;
}
/// <summary>
/// Returns the bases of a window of the sequence as text, one character per element.
/// </summary>
/// <returns>A string of <paramref name="length"/> characters.</returns>
/// <param name="array">Sequence to read the bases from.</param>
/// <param name="position">Index of the first element of the window.</param>
/// <param name="length">Number of elements in the window.</param>
private static string getBases(BPandQV[] array, int position, int length)
{
    var buffer = new char[length];
    int source = position;
    for (int target = 0; target < buffer.Length; target++, source++)
    {
        buffer[target] = (char)array[source].BP;
    }

    return new string(buffer);
}
/// <summary>
/// Calls the variants.
///
/// Walks the alignment column by column: a gap in the reference produces an insertion
/// <c>IndelVariant</c>, a gap in the query produces a deletion <c>IndelVariant</c>, and a
/// column where both hold different symbols produces an <c>SNPVariant</c>.
///
/// Should only be used internally as assumptions are made that the alignments are left-aligned and fulfill certain criteria.
/// </summary>
/// <remarks>
/// When <paramref name="originallyReverseComplemented"/> is <c>true</c>, <paramref name="querySeq"/>
/// is first passed to <c>AlignmentUtils.ReverseQVValuesForHomopolymers</c>, which reorders
/// elements of the caller's array in place.
/// </remarks>
/// <returns>The variants, in the order they occur along the alignment.</returns>
/// <param name="refSeq">Reference seq.</param>
/// <param name="querySeq">Query seq.</param>
/// <param name="originallyReverseComplemented">If set to <c>true</c> the query sequence was originally reverse complemented. (this affects QV value scoring)</param>
internal static List<Variant> CallVariants(byte[] refSeq, BPandQV[] querySeq, bool originallyReverseComplemented)
{
    if (originallyReverseComplemented) {
        AlignmentUtils.ReverseQVValuesForHomopolymers (querySeq);
    }
    List<Variant> variants = new List<Variant>();
    // Now call variants.
    var gap = DnaAlphabet.Instance.Gap;
    // i is the alignment column; refPos advances on deletions and on aligned columns,
    // but not on insertions (see the three branches below).
    int i = 0;
    int refPos = 0;
    while( i < refSeq.Length)
    {
        if (refSeq[i] == gap)
        {
            // Gap in the reference: the query bases in these columns are an insertion.
            int len = AlignmentUtils.GetGapLength(i, refSeq);
            var nextBasePos = (i + len);
            // Should always be true as we don't end in gaps
            Debug.Assert (nextBasePos < refSeq.Length);
            // presumably Item1 is the homopolymer length and Item2 its base (going by the
            // variable name) - confirm against determineHomoPolymerLength.
            var hplenAndChar = determineHomoPolymerLength (nextBasePos, refSeq);
            var bases = getBases(querySeq, i, len);
            // NOTE(review): the last argument is true when the indel starts at column 0 or when
            // indel + homopolymer reach the end of the alignment; its meaning to IndelVariant
            // is not visible here.
            var newVariant = new IndelVariant(refPos - 1, len, bases, IndelType.Insertion, hplenAndChar.Item2, hplenAndChar.Item1, (i == 0 || (i + len + hplenAndChar.Item1) >= refSeq.Length));
            // The QV is taken from the first inserted query element.
            newVariant.QV = querySeq[i].QV;
            variants.Add(newVariant);
            i += len;
        }
        else if (querySeq[i].BP == gap)
        {
            // Gap in the query: the reference bases in these columns are a deletion.
            int len = AlignmentUtils.GetGapLength(i, querySeq);
            var bases = getBases(refSeq, i, len);
            var hplenAndChar = determineHomoPolymerLength (i, refSeq);
            var newVariant = new IndelVariant(refPos - 1, len, bases, IndelType.Deletion, hplenAndChar.Item2, hplenAndChar.Item1, (i == 0 || (i + len + hplenAndChar.Item1) >= refSeq.Length));
            /* An insertion mutation occurs BEFORE pos, so normally we get the next base
             * or the last one if it's a reverse complemented alignment. However, this is not true if
             * it is a homopolymer because what would have been the previous position is the next position
             * after left aligning and reversing the position of the QV value.
             *
             * NOTE(review): the sentence above says "insertion" although this is the deletion
             * branch - confirm the intended wording.
             *
             * Consider the following
             *   --*-          -*--
             *   A-TA   -->    TA-T
             *   AGTA          TACT
             *
             * However,
             *   --*--         --*--
             *   A-TTA  ---->  T-AAT
             *   ATTTA         TAAAT
             *
             */
            // A QV is only assigned when at least one column follows the gap.
            if ((i + len ) < querySeq.Length)
            {
                // Column before the gap for reverse complemented input, otherwise the column
                // right after the gap; inside a homopolymer it is always the column after.
                // NOTE(review): i - 1 would be -1 for a gap at column 0; presumably ruled out
                // by VerifyNoGapsOnEnds - confirm at the call sites.
                var qc_pos = originallyReverseComplemented ? i - 1 : i + len;
                if (newVariant.InHomopolymer) {
                    qc_pos = i + len;
                }
                newVariant.QV = querySeq[qc_pos].QV;
            }
            variants.Add(newVariant);
            i += len;
            refPos += len;
        }
        else
        {
            // Neither sequence is gapped here: report a SNP if the symbols differ.
            if (querySeq[i].BP != refSeq[i]) {
                // The last argument is true when the column is the first or last of the alignment.
                var newVariant = new SNPVariant(refPos, (char) querySeq[i].BP, (char)refSeq[i], (i ==0 || i == (refSeq.Length -1)));
                newVariant.QV = querySeq [i].QV;
                variants.Add(newVariant);
            }
            i++;
            refPos++;
        }
    }
    return variants;
}
/// <summary>
/// Given a reference and a query representing a pairwise alignment, shift the gaps so
/// that all indels start as early (as far left) as possible. For example:
///     TTTTAAAATTTT  -> Converts to ->  TTTTAAAATTTT
///     TTTTAA--TTTT                     TTTT--AATTTT
///
/// This modifies both arrays in place.
/// </summary>
/// <remarks>
/// A gap is never shifted onto a column where the other sequence already holds a gap,
/// so the "no gap on top of a gap" invariant checked on entry also holds on exit.
/// </remarks>
/// <param name="refseq">Reference sequence of the alignment.</param>
/// <param name="query">Query sequence of the alignment; must have the same length as <paramref name="refseq"/>.</param>
/// <exception cref="ArgumentException">The two sequences have different lengths.</exception>
/// <exception cref="Exception">
/// The input has overlapping gaps (raised by ValidateNoOverlappingGaps), or more than
/// MAX_LOOPS passes were required.
/// </exception>
public static void LeftAlignIndels(byte[] refseq, BPandQV[] query)
{
    // Validation
    if (refseq.Length != query.Length)
    {
        throw new ArgumentException("Alignment passed to LeftAlignIndels had unequal length sequences");
    }
    ValidateNoOverlappingGaps(refseq, query);

    byte gap = DnaAlphabet.Instance.Gap;

    // Keep left aligning until we can't anymore, this is a
    // do while loop because some downstream left alignments open up
    // further ones upstream, even though this is rare.
    int change_count;
    int loopsThrough = 0;
    do
    {
        loopsThrough++;
        change_count = 0;
        for (int i = 1; i < refseq.Length; i++)
        {
            if (refseq[i] == gap)
            {
                int len = GetGapLength(i, refseq);
                int left_side = i - 1;
                int right_side = i - 1 + len;
                // The gap may move one column left while the reference base leaving on the
                // left equals the query base under the gap's right end. The query must not
                // be gapped at left_side, otherwise the move would stack a gap on a gap.
                while (left_side >= 0
                       && refseq[left_side] != gap
                       && query[left_side].BP != gap
                       && refseq[left_side] == query[right_side].BP)
                {
                    // Move the gap left.
                    refseq[right_side] = refseq[left_side];
                    refseq[left_side] = gap;
                    left_side--;
                    right_side--;
                    change_count++;
                }
                if (loopsThrough > MAX_LOOPS)
                {
                    throw new Exception(MAX_LOOPS_ERROR);
                }
            }
            else if (query[i].BP == gap)
            {
                int len = GetGapLength(i, query);
                int left_side = i - 1;
                int right_side = i - 1 + len;
                // Mirror image of the branch above, with the roles of the sequences swapped.
                while (left_side >= 0
                       && query[left_side].BP != gap
                       && refseq[left_side] != gap
                       && query[left_side].BP == refseq[right_side])
                {
                    // Move the gap left; the base keeps its QV, the new gap gets QV 0.
                    query[right_side] = query[left_side];
                    query[left_side] = new BPandQV(gap, 0);
                    left_side--;
                    right_side--;
                    change_count++;
                }
                if (loopsThrough > MAX_LOOPS)
                {
                    throw new Exception(MAX_LOOPS_ERROR);
                }
            }
        }
    } while (change_count > 0);
}
/// <summary>
/// Given a reference and a query representing a pairwise alignment, shift the gaps so
/// that all indels start as early (as far left) as possible. For example:
///     TTTTAAAATTTT  -> Converts to ->  TTTTAAAATTTT
///     TTTTAA--TTTT                     TTTT--AATTTT
///
/// This modifies both arrays in place.
/// </summary>
/// <remarks>
/// A gap is never shifted onto a column where the other sequence already holds a gap,
/// so the "no gap on top of a gap" invariant checked on entry also holds on exit.
/// </remarks>
/// <param name="refseq">Reference sequence of the alignment.</param>
/// <param name="query">Query sequence of the alignment; must have the same length as <paramref name="refseq"/>.</param>
/// <exception cref="ArgumentException">The two sequences have different lengths.</exception>
/// <exception cref="Exception">
/// The input has overlapping gaps (raised by ValidateNoOverlappingGaps), or more than
/// MAX_LOOPS passes were required.
/// </exception>
public static void LeftAlignIndels(byte[] refseq, BPandQV[] query)
{
    // Validation
    if (refseq.Length != query.Length)
    {
        throw new ArgumentException("Alignment passed to LeftAlignIndels had unequal length sequences");
    }
    ValidateNoOverlappingGaps (refseq, query);

    byte gap = DnaAlphabet.Instance.Gap;

    // Keep left aligning until we can't anymore, this is a
    // do while loop because some downstream left alignments open up
    // further ones upstream, even though this is rare.
    int change_count;
    int loopsThrough = 0;
    do
    {
        loopsThrough++;
        change_count = 0;
        for (int i = 1; i < refseq.Length; i++)
        {
            if (refseq[i] == gap)
            {
                int len = GetGapLength(i, refseq);
                int left_side = i - 1;
                int right_side = i - 1 + len;
                // The gap may move one column left while the reference base leaving on the
                // left equals the query base under the gap's right end. The query must not
                // be gapped at left_side, otherwise the move would stack a gap on a gap.
                while (left_side >= 0
                       && refseq[left_side] != gap
                       && query[left_side].BP != gap
                       && refseq[left_side] == query[right_side].BP)
                {
                    // Move the gap left.
                    refseq[right_side] = refseq[left_side];
                    refseq[left_side] = gap;
                    left_side--;
                    right_side--;
                    change_count++;
                }
                if (loopsThrough > MAX_LOOPS)
                {
                    throw new Exception(MAX_LOOPS_ERROR);
                }
            }
            else if (query[i].BP == gap)
            {
                int len = GetGapLength(i, query);
                int left_side = i - 1;
                int right_side = i - 1 + len;
                // Mirror image of the branch above, with the roles of the sequences swapped.
                while (left_side >= 0
                       && query[left_side].BP != gap
                       && refseq[left_side] != gap
                       && query[left_side].BP == refseq[right_side])
                {
                    // Move the gap left; the base keeps its QV, the new gap gets QV 0.
                    query[right_side] = query[left_side];
                    query[left_side] = new BPandQV(gap, 0);
                    left_side--;
                    right_side--;
                    change_count++;
                }
                if (loopsThrough > MAX_LOOPS)
                {
                    throw new Exception(MAX_LOOPS_ERROR);
                }
            }
        }
    } while (change_count > 0);
}
/// <summary>
/// Reverses the QV values for homopolymers, in place.
///
/// The quality value for homopolymer indel errors (that is a deletion or insertion) is
/// typically only placed at the first base in a homopolymer (though this is not standardized).
/// After a sequence has been reversed that value sits on the last base of the run instead.
/// To account for this, for every run of two or more identical symbols the first and last
/// elements are exchanged when the last one has the lower QV, so the lower value ends up at
/// the start of the run. The same rule is applied to every run, including the one that ends
/// the array, which makes a repeated call a no-op.
/// </summary>
/// <param name="toFlip">Sequence whose homopolymer QV values are adjusted in place.</param>
internal static void ReverseQVValuesForHomopolymers(BPandQV[] toFlip)
{
    // Basic idea is to assume it is A, C, G, T alphabet and flip HP values
    // Also assumes all low QV is due to HP deletion/insertion error.
    if (toFlip.Length <= 1)
    {
        return;
    }

    byte lastbp = toFlip[0].BP;
    int firstPos = 0;
    for (int i = 1; i < toFlip.Length; i++)
    {
        byte newbp = toFlip[i].BP;
        if (newbp == lastbp)
        {
            // Still inside the current run.
            continue;
        }

        // The run [firstPos, i - 1] just ended; a new one starts at i.
        swapRunEndsIfLastIsLower(toFlip, firstPos, i - 1);
        firstPos = i;
        lastbp = newbp;
    }

    // Finally handle the run that reaches the end of the array.
    swapRunEndsIfLastIsLower(toFlip, firstPos, toFlip.Length - 1);
}

/// <summary>
/// Exchanges the first and last element of the run [first, last] when the run has at
/// least two elements and the last one carries the lower QV.
/// </summary>
private static void swapRunEndsIfLastIsLower(BPandQV[] data, int first, int last)
{
    if (last <= first)
    {
        return;
    }

    var left = data[first];
    var right = data[last];
    Debug.Assert(right.BP == left.BP);
    if (right.QV < left.QV)
    {
        data[last] = left;
        data[first] = right;
    }
}
/// <summary>
/// Builds a new array holding the reverse complement of the input: element order is reversed,
/// every base is replaced by its complement and each quality value travels with its base.
///
/// Homopolymer indel quality is typically recorded only on the first base of a run (this is
/// not standardized), so a plain reversal leaves that value on the last base. Passing
/// <paramref name="flipHpQvValues"/> runs ReverseQVValuesForHomopolymers on the result to
/// compensate.
/// </summary>
/// <returns>A newly allocated reverse complemented sequence; the input is not modified.</returns>
/// <param name="toFlip">The bases and QV values to reverse complement.</param>
/// <param name="flipHpQvValues">If set to <c>true</c>, homopolymer QV values are flipped in the result.</param>
/// <exception cref="NotSupportedException">A base has no complement in the DNA alphabet.</exception>
internal static BPandQV[] GetReverseComplementedSequence(BPandQV[] toFlip, bool flipHpQvValues = false)
{
    int count = toFlip.Length;
    var reversed = new BPandQV[count];

    // Walk the input from its tail while filling the output from its head.
    for (int target = 0, source = count - 1; target < count; target++, source--)
    {
        byte symbol = toFlip[source].BP;
        byte complement;
        bool known = DnaAlphabet.Instance.TryGetComplementSymbol(symbol, out complement);
        if (!known)
        {
            throw new NotSupportedException("Bad character in BPandQV array: " + symbol.ToString());
        }

        reversed[target] = new BPandQV(complement, toFlip[source].QV);
    }

    if (flipHpQvValues)
    {
        ReverseQVValuesForHomopolymers(reversed);
    }

    return reversed;
}
/// <summary>
/// Checks every column of the alignment and throws as soon as one is found in which
/// both sequences hold the gap symbol, since a gap on top of a gap violates several
/// assumptions made elsewhere.
/// </summary>
/// <param name="seq1">First sequence of the alignment; its length bounds the scan.</param>
/// <param name="seq2">Second sequence of the alignment.</param>
/// <exception cref="Exception">A column holds a gap in both sequences.</exception>
internal static void ValidateNoOverlappingGaps(byte[] seq1, BPandQV[] seq2)
{
    var gapSymbol = DnaAlphabet.Instance.Gap;
    int column = 0;
    while (column < seq1.Length)
    {
        bool bothGapped = seq1[column] == gapSymbol && seq2[column].BP == gapSymbol;
        if (bothGapped)
        {
            throw new Exception("You have an alignment with overlapping gaps. Input problem!");
        }

        column++;
    }
}
/// <summary>
/// Given the start position of a gap, returns how many columns it spans.
/// For example, called with the index of the first '-' in
///
/// AAAA---TTTT
///
/// the result is 3. The element at <paramref name="pos"/> itself is not inspected, so
/// the result is always at least 1.
/// </summary>
/// <param name="pos">0 indexed position of the first gap element.</param>
/// <param name="array">Sequence containing the gap.</param>
/// <returns>One plus the number of consecutive gap elements directly after <paramref name="pos"/>.</returns>
public static int GetGapLength(int pos, BPandQV[] array)
{
    var gapSymbol = DnaAlphabet.Instance.Gap;

    // Advance to the first index after pos that is past the array or not a gap.
    int end = pos + 1;
    while (end < array.Length && array[end].BP == gapSymbol)
    {
        end++;
    }

    return end - pos;
}