/// <summary>
/// Adds one read's contribution to the per-position depth of coverage.
/// Walks the read's CIGAR and increments <c>depthCounts</c> once for every reference
/// position covered by a match/mismatch operation. Insertions and clipped bases are
/// ignored; deleted reference positions are stepped over without being counted.
/// </summary>
/// <param name="orgSeq">
/// The aligned read. Nothing happens when it is null, when its RName differs from
/// <c>StaticResources.MT_CHROMOSOME_NAME</c>, or when <c>CigarUtils.NoInformationCigar</c>
/// reports that its CIGAR carries no information.
/// </param>
/// <exception cref="Exception">The CIGAR contains a 'P' or 'N' operation.</exception>
/// <exception cref="FormatException">The CIGAR contains an unrecognized operation.</exception>
public void ProcessCountCoverageFromSequence(CompactSAMSequence orgSeq)
{
    if (orgSeq == null)
    {
        return;
    }

    if (orgSeq.RName != StaticResources.MT_CHROMOSOME_NAME)
    {
        return;
    }

    string cigarText = orgSeq.CIGAR;
    if (CigarUtils.NoInformationCigar(cigarText))
    {
        return;
    }

    // Pos - 1 is used directly as the index into depthCounts.
    int refIndex = orgSeq.Pos - 1;
    foreach (var element in CigarUtils.GetCigarElements(cigarText))
    {
        var opLength = element.Length;
        switch (element.Operation)
        {
            case 'M': // match or mismatch
            case '=': // match
            case 'X': // mismatch
                for (int step = 0; step < opLength; step++)
                {
                    if (refIndex < StaticResources.CRS_LENGTH)
                    {
                        depthCounts[refIndex] += 1.0;
                        refIndex++;
                        continue;
                    }

                    // Ran off the end of the reference: report it and abandon the rest
                    // of this element. Later elements of the same read are still visited.
                    Debug.WriteLine("Seq: " + orgSeq.ID + " is aligned past the MT DNA reference genome");
                    break;
                }
                break;

            case 'D': // deletion from the reference: advance without counting
                refIndex += opLength;
                break;

            case 'I': // insertion to the reference
            case 'S': // soft clipped
            case 'H': // hard clipped
                // None of these move the reference index.
                break;

            case 'P': // padding (silent deletion from padded reference)
            case 'N': // skipped region from reference
                throw new Exception("Not built to handle clipping yet");

            default:
                throw new FormatException("Unexpected SAM Cigar element found " + element.Operation.ToString());
        }
    }
}
/// <summary>
/// Checks, in one place, the assumptions this class makes about a sequence so that
/// individual methods do not need their own checks.
/// </summary>
/// <param name="seq">The aligned read to inspect.</param>
/// <returns>
/// <c>false</c> when the reference name is null or empty, when RefEndPos is not greater
/// than Pos, or when the CIGAR is null, empty or "*"; <c>true</c> otherwise.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="seq"/> is null.</exception>
private static bool validateSequence(CompactSAMSequence seq)
{
    if (seq == null)
    {
        throw new ArgumentNullException("seq");
    }

    // An unusable sequence is reported through the return value rather than an exception.
    // The checks run in this order and stop at the first failure.
    if (String.IsNullOrEmpty(seq.RName))
    {
        return false;
    }

    if (seq.RefEndPos <= seq.Pos)
    {
        return false;
    }

    string cigar = seq.CIGAR;
    if (String.IsNullOrEmpty(cigar))
    {
        return false;
    }

    return cigar != "*";
}
/// <summary>
/// Reads the next alignment block from the uncompressed data and decodes it into a
/// <see cref="CompactSAMSequence"/>.
/// </summary>
/// <remarks>
/// Decoded fields: reference name, position, flags, CIGAR, read name, bases and quality
/// values. Every other byte of the block (for example anything after the quality values)
/// is read but not interpreted.
/// </remarks>
/// <returns>
/// A sequence whose ID, Pos, CIGAR, RName and SAMFlags are populated from the block.
/// </returns>
/// <exception cref="FileFormatException">A CIGAR operation code is outside 0-8.</exception>
protected CompactSAMSequence GetAlignedSequence()
{
    // The block is preceded by its own length as a 4-byte integer.
    byte[] lengthBytes = new byte[4];
    ReadUnCompressedData(lengthBytes, 0, 4);
    int blockLen = Helper.GetInt32(lengthBytes, 0);
    byte[] alignmentBlock = new byte[blockLen];
    ReadUnCompressedData(alignmentBlock, 0, blockLen);

    // Bytes 0-4: index into refSeqNames; -1 means no reference and is reported as "*".
    int refSeqIndex = Helper.GetInt32(alignmentBlock, 0);
    string referenceName = refSeqIndex == -1 ? "*" : refSeqNames[refSeqIndex];

    // Bytes 4-8: position. One is added to the stored value
    // (the BAM specification stores it 0-based, SAM positions are 1-based).
    int position = Helper.GetInt32(alignmentBlock, 4) + 1;

    // Bytes 8-12: "bin<<16|mapQual<<8|read_name_len"; only the name length (low byte) is used.
    UInt32 packed = Helper.GetUInt32(alignmentBlock, 8);
    int queryNameLen = (int)(packed & 0x000000FF);

    // Bytes 12-16: flag in the upper 16 bits, number of CIGAR operations in the lower 16.
    // Shift while still unsigned: casting to int first would sign-extend (or overflow in a
    // checked build) whenever bit 15 of the flag is set.
    packed = Helper.GetUInt32(alignmentBlock, 12);
    int flagValue = (int)(packed >> 16);
    int cigarLen = (int)(packed & 0x0000FFFF);

    // Bytes 16-20: number of bases in the read.
    int readLen = Helper.GetInt32(alignmentBlock, 16);

    // Bytes 32-(32+queryNameLen): read name. The final byte of the name field is not decoded
    // (presumably the NUL terminator - confirm against the BAM specification).
    string name = System.Text.ASCIIEncoding.ASCII.GetString(alignmentBlock, 32, queryNameLen - 1);

    // CIGAR: cigarLen 4-byte values, each holding the operation length in the upper 28 bits
    // and the operation code in the lower 4 bits.
    const string cigarOperations = "MIDNSHP=X"; // operation codes 0..8, in order
    StringBuilder cigarBuilder = new StringBuilder();
    int startIndex = 32 + queryNameLen;
    for (int i = startIndex; i < (startIndex + cigarLen * 4); i += 4)
    {
        UInt32 cigarValue = Helper.GetUInt32(alignmentBlock, i);
        cigarBuilder.Append(((cigarValue & 0xFFFFFFF0) >> 4).ToString(CultureInfo.InvariantCulture));

        // Mask before casting so a large operation length cannot overflow the int cast.
        int operationCode = (int)(cigarValue & 0x0000000F);
        if (operationCode >= cigarOperations.Length)
        {
            throw new FileFormatException(Properties.Resource.BAM_InvalidCIGAR);
        }

        cigarBuilder.Append(cigarOperations[operationCode]);
    }

    string cigar = cigarBuilder.ToString();
    if (string.IsNullOrWhiteSpace(cigar))
    {
        cigar = "*";
    }

    // Bases: two per byte, high nibble first, in (readLen + 1) / 2 bytes after the CIGAR.
    startIndex += cigarLen * 4;
    byte[] seqData = new byte[readLen];
    int seqDataIndex = 0;
    int index = startIndex;
    int nibble;

    // Every byte except the last always carries two bases.
    for (; index < (startIndex + (readLen + 1) / 2) - 1; index++)
    {
        nibble = (alignmentBlock[index] & 0xF0) >> 4;
        seqData[seqDataIndex++] = GetSeqCharAsByte(nibble);

        nibble = alignmentBlock[index] & 0x0F;
        seqData[seqDataIndex++] = GetSeqCharAsByte(nibble);
    }

    // The last byte carries one base for an odd read length and two for an even one.
    // NOTE(review): when readLen is 0, seqData is empty and the store below throws
    // IndexOutOfRangeException. Left unchanged because it is not visible here whether the
    // code that consumes the result accepts an empty sequence.
    nibble = (alignmentBlock[index] & 0xF0) >> 4;
    seqData[seqDataIndex++] = GetSeqCharAsByte(nibble);
    if (readLen % 2 == 0)
    {
        nibble = alignmentBlock[index] & 0x0F;
        seqData[seqDataIndex++] = GetSeqCharAsByte(nibble);
    }

    startIndex = index + 1;

    // Quality values: readLen bytes, each offset by 33 before being handed on. A first byte
    // of 0xFF is treated as "no qualities stored" and the array is left zero-filled.
    // NOTE(review): those zero bytes are still passed to the CompactSAMSequence constructor;
    // confirm it tolerates them.
    byte[] qualValues = new byte[readLen];
    if (alignmentBlock[startIndex] != 0xFF)
    {
        for (int i = 0; i < readLen; i++)
        {
            qualValues[i] = (byte)(alignmentBlock[startIndex + i] + 33);
        }
    }

    var alphabet = Alphabets.AutoDetectAlphabet(seqData, 0, seqData.Length, null);

    // TODO (carried over from the original): possibly a bit unsafe here.
    var toReturn = new CompactSAMSequence(alphabet, FastQFormatType.GATK_Recalibrated, seqData, qualValues, false);
    toReturn.ID = name;
    toReturn.Pos = position;
    toReturn.CIGAR = cigar;
    toReturn.RName = referenceName;
    toReturn.SAMFlags = (SAMFlags)flagValue;
    return toReturn;
}
/// <summary>
/// Expands an aligned read into a list of <see cref="BaseAndQualityAndPosition"/> entries,
/// in CIGAR order, suitable for adding to a pile-up.
/// </summary>
/// <remarks>
/// Per CIGAR operation:
/// M, = and X emit one entry per base, built from (reference position, 0, base and quality),
/// advancing both the read and the reference;
/// I emits one entry per inserted base, built from (reference position, index within the
/// insertion, base and quality), advancing only the read;
/// D emits one '-' entry with quality <c>byte.MinValue</c> per deleted position, advancing
/// only the reference;
/// S skips read bases without emitting anything; H does nothing.
/// </remarks>
/// <param name="seq">The aligned read whose CIGAR, position, bases and phred scores are used.</param>
/// <returns>The emitted entries.</returns>
/// <exception cref="Exception">The CIGAR contains a 'P' or 'N' operation.</exception>
/// <exception cref="FormatException">
/// The CIGAR contains an unrecognized operation, or an operation has no parsable length.
/// </exception>
static List <BaseAndQualityAndPosition> getBasesForSequence(CompactSAMSequence seq)
{
    // Pre-size for the reference span of the read, plus a little slack.
    var pileupEntries = new List<BaseAndQualityAndPosition>(seq.RefEndPos - seq.Pos + 10);

    string cigarText = seq.CIGAR;
    int cigarChars = cigarText.Length;

    // Bases and qualities of the read, indexed by query position.
    var phredScores = seq.GetPhredQualityScores();
    var readBases = seq.ToArray();

    int refPos = seq.Pos;
    int queryPos = 0;

    // Index of the first digit of the length that belongs to the next operation character.
    int lengthStart = 0;
    for (int opIndex = 0; opIndex < cigarChars; opIndex++)
    {
        char op = cigarText[opIndex];
        if (Char.IsDigit(op))
        {
            continue;
        }

        // The digits between the previous operation and this one are its length.
        int opLength = int.Parse(cigarText.Substring(lengthStart, opIndex - lengthStart));
        lengthStart = opIndex + 1;

        switch (op)
        {
            case 'M': // match or mismatch
            case '=': // match
            case 'X': // mismatch
                for (int remaining = opLength; remaining > 0; remaining--)
                {
                    var aligned = new BaseAndQuality(readBases[queryPos], (byte)phredScores[queryPos]);
                    pileupEntries.Add(new BaseAndQualityAndPosition(refPos, 0, aligned));
                    queryPos++;
                    refPos++;
                }
                break;

            case 'I': // insertion to the reference
                for (int offset = 0; offset < opLength; offset++)
                {
                    var inserted = new BaseAndQuality(readBases[queryPos], (byte)phredScores[queryPos]);
                    pileupEntries.Add(new BaseAndQualityAndPosition(refPos, offset, inserted));
                    queryPos++;
                }
                break;

            case 'D': // deletion from the reference
                // NOTE(review): the second argument is the index within the deletion even
                // though the reference position also advances; M uses 0 here. Confirm this
                // is intended.
                for (int offset = 0; offset < opLength; offset++)
                {
                    var gap = new BaseAndQuality((byte)'-', byte.MinValue);
                    pileupEntries.Add(new BaseAndQualityAndPosition(refPos, offset, gap));
                    refPos++;
                }
                break;

            case 'S': // soft clipped: bases are present in the read but not emitted
                queryPos += opLength;
                break;

            case 'H': // hard clipped: nothing to skip or emit
                break;

            case 'P': // padding (silent deletion from padded reference)
            case 'N': // skipped region from reference
                throw new Exception("Pile up methods not built to handle reference clipping (Cigar P or N) yet.");

            default:
                throw new FormatException("Unexpected SAM Cigar element found " + op.ToString());
        }
    }

    return pileupEntries;
}