/// <summary>
/// Creates a copy of this aligned sequence.
/// The header values (CIGAR, Bin, Flag, ISize, MapQ, MPos, MRNM, Pos, QName and RName) are
/// assigned as-is, the query sequence is rebuilt as a new Bio.Sequence from its alphabet and
/// string form, and every optional field is copied into a new SAMOptionalField.
/// </summary>
/// <returns>A new SAMAlignedSequence carrying the same values as this instance.</returns>
public SAMAlignedSequence Clone()
{
    SAMAlignedSequence n = new SAMAlignedSequence();
    n.CIGAR = this.CIGAR;
    n.Bin = this.Bin;
    n.Flag = this.Flag;
    n.ISize = this.ISize;
    n.MapQ = this.MapQ;
    n.MPos = this.MPos;
    n.MRNM = this.MRNM;
    n.Pos = this.Pos;
    n.QName = this.QName;

    // Guard against an unset query sequence: dereferencing it would throw a
    // NullReferenceException. In that case the clone keeps whatever QuerySequence
    // the constructor assigned.
    if (this.QuerySequence != null)
    {
        // Only the alphabet and the symbols (as a string) are carried over to the copy.
        n.QuerySequence = new Bio.Sequence(this.QuerySequence.Alphabet, this.QuerySequence.ConvertToString());
    }

    n.RName = this.RName;

    // Copy each optional field into a fresh object so the clone does not share
    // SAMOptionalField instances with the original.
    foreach (SAMOptionalField Item in this.OptionalFields)
    {
        SAMOptionalField nItem = new SAMOptionalField();
        nItem.Tag = Item.Tag;
        nItem.Value = Item.Value;
        nItem.VType = Item.VType;
        n.OptionalFields.Add(nItem);
    }

    return n;
}
// Parses alignment records from the reader, one per line, until the reader has no more
// lines or the current line starts with "@", and appends each parsed record to
// seqAlignment.QuerySequences.
// Throws FormatException when the header fields, the text of an optional field, or the
// content of an optional field fail validation.
private void ParseSequences(SequenceAlignmentMap seqAlignment, BioTextReader bioReader, bool isReadOnly)
{
    // An optional field is written as TAG:TYPE:VALUE, i.e. exactly three parts.
    const int optionalFieldPartCount = 3;

    while (bioReader.HasLines && !bioReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
    {
        string[] tokens = bioReader.Line.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);

        SAMAlignedSequence alignedSeq = new SAMAlignedSequence();
        alignedSeq.QName = tokens[0];
        alignedSeq.Flag = SAMAlignedSequenceHeader.GetFlag(tokens[1]);
        alignedSeq.RName = tokens[2];
        alignedSeq.Pos = int.Parse(tokens[3], CultureInfo.InvariantCulture);
        alignedSeq.MapQ = int.Parse(tokens[4], CultureInfo.InvariantCulture);
        alignedSeq.CIGAR = tokens[5];

        // "=" in the mate reference column means "same as RName".
        alignedSeq.MRNM = tokens[6].Equals("=") ? alignedSeq.RName : tokens[6];
        alignedSeq.MPos = int.Parse(tokens[7], CultureInfo.InvariantCulture);
        alignedSeq.ISize = int.Parse(tokens[8], CultureInfo.InvariantCulture);

        string message = alignedSeq.IsValidHeader();
        if (!string.IsNullOrEmpty(message))
        {
            throw new FormatException(message);
        }

        // Look up the reference sequence whose ID matches RName (case-insensitive), if any
        // reference sequences are available.
        ISequence refSeq = null;
        if (RefSequences != null && RefSequences.Count > 0)
        {
            refSeq = RefSequences.FirstOrDefault(R => string.Compare(R.ID, alignedSeq.RName, StringComparison.OrdinalIgnoreCase) == 0);
        }

        ParseQualityNSequence(alignedSeq, Alphabet, Encoding, tokens[9], tokens[10], refSeq, isReadOnly);

        // Everything after the eleven fixed columns is an optional field.
        for (int i = 11; i < tokens.Length; i++)
        {
            if (!Helper.IsValidRegexValue(OptionalFieldLinePattern, tokens[i]))
            {
                message = string.Format(CultureInfo.CurrentCulture, Resource.InvalidOptionalField, tokens[i]);
                throw new FormatException(message);
            }

            // Split into at most three parts so that a value which itself contains ':'
            // is kept whole instead of being truncated at its first ':'.
            string[] opttokens = tokens[i].Split(colonDelim, optionalFieldPartCount, StringSplitOptions.None);

            SAMOptionalField optField = new SAMOptionalField();
            optField.Tag = opttokens[0];
            optField.VType = opttokens[1];
            optField.Value = opttokens[2];

            message = optField.IsValid();
            if (!string.IsNullOrEmpty(message))
            {
                throw new FormatException(message);
            }

            alignedSeq.OptionalFields.Add(optField);
        }

        seqAlignment.QuerySequences.Add(alignedSeq);
        bioReader.GoToNextLine();
    }
}
/// <summary>
/// Stores an optional field in the editable collection under its tag.
/// When the read-only collection is in use it is first switched to the editable one;
/// otherwise the editable dictionary is created on first use.
/// The field is assigned through the indexer, keyed by <c>sof.Tag</c>.
/// </summary>
/// <param name="sof">Optional field to store; its Tag is used as the key.</param>
public void AddOptionalField(SAMOptionalField sof)
{
    if (!UsingReadOnlyOptionFieldCollection)
    {
        // Already editable (or nothing stored yet): make sure the dictionary exists.
        if (editableOptionalFieldCollection == null)
        {
            editableOptionalFieldCollection = new Dictionary <string, SAMOptionalField>();
        }
    }
    else
    {
        // Presumably this populates editableOptionalFieldCollection from the
        // read-only data; verify against its definition.
        changeFromReadOnlyToEditableCollection();
    }

    editableOptionalFieldCollection[sof.Tag] = sof;
}
/// <summary>
/// Builds a dictionary of optional fields from this read-only collection.
/// One SAMOptionalField is created per entry of tagToDataTypeAndLocation, using the tag,
/// the string form of the value returned by this collection's indexer, and the stored
/// type character.
/// </summary>
/// <returns>A new dictionary mapping each tag to its SAMOptionalField.</returns>
public Dictionary <string, SAMOptionalField> ConvertToDictionary()
{
    var fieldsByTag = new Dictionary <string, SAMOptionalField>(tagToDataTypeAndLocation.Count);

    foreach (var entry in tagToDataTypeAndLocation)
    {
        // The key of the entry's value holds the type character for this tag.
        char valueType = entry.Value.Key;

        // The value is passed on in its string form, typed as object.
        object fieldValue = this[entry.Key].ToString();

        string fieldTag = entry.Key;
        fieldsByTag[fieldTag] = new SAMOptionalField(fieldTag, fieldValue, valueType);
    }

    return fieldsByTag;
}
/// <summary>
/// Parses a single aligned sequence from one tab-delimited line of alignment text.
/// </summary>
/// <param name="bioText">sequence alignment text.</param>
/// <param name="alphabet">Alphabet of the sequences.</param>
/// <param name="referenceSequences">
/// Reference sequences; the one whose ID matches the record's RName (case-insensitive),
/// if any, is passed on to ParseQualityNSequence. May be null or empty.
/// </param>
/// <returns>The aligned sequence built from the line, including its optional fields.</returns>
/// <exception cref="FormatException">
/// Thrown when a numeric column cannot be parsed or an optional field does not match
/// OptionalFieldRegex.
/// </exception>
private static SAMAlignedSequence ParseSequence(string bioText, IAlphabet alphabet, IList <ISequence> referenceSequences)
{
    // Index of the first column after the eleven fixed ones.
    const int optionalTokenStartingIndex = 11;

    // An optional field is written as TAG:TYPE:VALUE, i.e. exactly three parts.
    const int optionalFieldPartCount = 3;

    string[] tokens = bioText.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);

    SAMAlignedSequence alignedSeq = new SAMAlignedSequence();
    alignedSeq.QName = tokens[0];
    alignedSeq.Flag = SAMAlignedSequenceHeader.GetFlag(tokens[1]);
    alignedSeq.RName = tokens[2];
    alignedSeq.Pos = int.Parse(tokens[3], CultureInfo.InvariantCulture);
    alignedSeq.MapQ = int.Parse(tokens[4], CultureInfo.InvariantCulture);
    alignedSeq.CIGAR = tokens[5];

    // "=" in the mate reference column means "same as RName".
    alignedSeq.MRNM = tokens[6].Equals("=") ? alignedSeq.RName : tokens[6];
    alignedSeq.MPos = int.Parse(tokens[7], CultureInfo.InvariantCulture);
    alignedSeq.ISize = int.Parse(tokens[8], CultureInfo.InvariantCulture);

    ISequence refSeq = null;
    if (referenceSequences != null && referenceSequences.Count > 0)
    {
        refSeq = referenceSequences.FirstOrDefault(R => string.Compare(R.ID, alignedSeq.RName, StringComparison.OrdinalIgnoreCase) == 0);
    }

    ParseQualityNSequence(alignedSeq, alphabet, tokens[9], tokens[10], refSeq);

    for (int i = optionalTokenStartingIndex; i < tokens.Length; i++)
    {
        if (!Helper.IsValidRegexValue(OptionalFieldRegex, tokens[i]))
        {
            string message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.InvalidOptionalField, tokens[i]);
            throw new FormatException(message);
        }

        // Split into at most three parts so that a value which itself contains ':'
        // is kept whole instead of being truncated at its first ':'.
        string[] opttokens = tokens[i].Split(colonDelim, optionalFieldPartCount, StringSplitOptions.None);

        SAMOptionalField optField = new SAMOptionalField();
        optField.Tag = opttokens[0];
        optField.VType = opttokens[1];
        optField.Value = opttokens[2];
        alignedSeq.OptionalFields.Add(optField);
    }

    return alignedSeq;
}
/// <summary>
/// Parses a single aligned sequence from one tab-delimited line of alignment text.
/// </summary>
/// <param name="bioText">sequence alignment text.</param>
/// <param name="alphabet">Alphabet of the sequences.</param>
/// <returns>The aligned sequence built from the line, including its optional fields.</returns>
/// <exception cref="FormatException">
/// Thrown when a numeric column cannot be parsed or an optional field does not match
/// OptionalFieldRegex.
/// </exception>
public static SAMAlignedSequence ParseSequence(string bioText, IAlphabet alphabet)
{
    // Index of the first column after the eleven fixed ones.
    const int optionalTokenStartingIndex = 11;

    // An optional field is written as TAG:TYPE:VALUE, i.e. exactly three parts.
    const int optionalFieldPartCount = 3;

    string[] tokens = bioText.Split(TabDelim, StringSplitOptions.RemoveEmptyEntries);

    // The numeric columns are machine-written, so they are parsed with the invariant
    // culture; otherwise e.g. a negative ISize could fail to parse under a culture
    // whose negative sign is not '-'.
    SAMAlignedSequence alignedSeq = new SAMAlignedSequence
    {
        QName = tokens[0],
        Flag = SAMAlignedSequenceHeader.GetFlag(tokens[1]),
        RName = tokens[2],
        Pos = int.Parse(tokens[3], CultureInfo.InvariantCulture),
        MapQ = int.Parse(tokens[4], CultureInfo.InvariantCulture),
        CIGAR = tokens[5]
    };

    // "=" in the mate reference column means "same as RName".
    alignedSeq.MRNM = tokens[6].Equals("=") ? alignedSeq.RName : tokens[6];
    alignedSeq.MPos = int.Parse(tokens[7], CultureInfo.InvariantCulture);
    alignedSeq.ISize = int.Parse(tokens[8], CultureInfo.InvariantCulture);

    ParseQualityNSequence(alignedSeq, alphabet, tokens[9], tokens[10]);

    for (int i = optionalTokenStartingIndex; i < tokens.Length; i++)
    {
        if (!Helper.IsValidRegexValue(OptionalFieldRegex, tokens[i]))
        {
            throw new FormatException(string.Format(CultureInfo.CurrentCulture, Properties.Resource.InvalidOptionalField, tokens[i]));
        }

        // Split into at most three parts so that a value which itself contains ':'
        // is kept whole instead of being truncated at its first ':'.
        string[] opttokens = tokens[i].Split(ColonDelim, optionalFieldPartCount, StringSplitOptions.None);

        SAMOptionalField optField = new SAMOptionalField();
        optField.Tag = opttokens[0];
        optField.VType = opttokens[1];
        optField.Value = opttokens[2];
        alignedSeq.OptionalFields.Add(optField);
    }

    return alignedSeq;
}
// Serializes one optional field into a byte array.
// Layout: array[0..1] = the two tag characters, array[2] = value type character,
// array[3..] = the encoded value. For integer types the stored type character is replaced
// by the one matching the size reported by GetOptionalFieldValueSize.
// Throws FormatException when the reported size is 0, and Exception for an unknown type.
private static byte[] GetOptioanField(SAMOptionalField field)
{
    int valueSize = GetOptionalFieldValueSize(field);
    if (valueSize == 0)
    {
        throw new FormatException(string.Format(
            CultureInfo.InvariantCulture,
            Properties.Resource.BAM_InvalidIntValueInOptField,
            field.Value,
            field.Tag));
    }

    // The sign of valueSize selects between signed and unsigned integer encodings below;
    // the buffer needs its magnitude plus three bytes for tag and type.
    int payloadLength = valueSize < 0 ? -valueSize : valueSize;
    byte[] array = new byte[payloadLength + 3];

    array[0] = (byte)field.Tag[0];
    array[1] = (byte)field.Tag[1];
    array[2] = (byte)field.VType[0];

    byte[] encoded;
    switch (field.VType)
    {
        case "A": // Printable character
            array[3] = (byte)field.Value[0];
            break;

        case "c": // signed 8-bit integer
        case "C": // unsigned 8-bit integer
        case "s": // signed 16-bit integer
        case "S": // unsigned 16-bit integer
        case "i": // signed 32-bit integer
        case "I": // unsigned 32-bit integer
            char storedType;
            int byteCount;
            switch (valueSize)
            {
                case 1:
                    storedType = 'C';
                    byteCount = 1;
                    encoded = new byte[] { byte.Parse(field.Value, CultureInfo.InvariantCulture) };
                    break;

                case -1:
                    storedType = 'c';
                    byteCount = 1;
                    encoded = new byte[] { (byte)sbyte.Parse(field.Value, CultureInfo.InvariantCulture) };
                    break;

                case 2:
                    storedType = 'S';
                    byteCount = 2;
                    encoded = Helper.GetLittleEndianByteArray(UInt16.Parse(field.Value, CultureInfo.InvariantCulture));
                    break;

                case -2:
                    storedType = 's';
                    byteCount = 2;
                    encoded = Helper.GetLittleEndianByteArray(Int16.Parse(field.Value, CultureInfo.InvariantCulture));
                    break;

                case 4:
                    storedType = 'I';
                    byteCount = 4;
                    encoded = Helper.GetLittleEndianByteArray(uint.Parse(field.Value, CultureInfo.InvariantCulture));
                    break;

                default:
                    // Any other size is written as a signed 32-bit integer.
                    storedType = 'i';
                    byteCount = 4;
                    encoded = Helper.GetLittleEndianByteArray(int.Parse(field.Value, CultureInfo.InvariantCulture));
                    break;
            }

            array[2] = (byte)storedType;
            for (int offset = 0; offset < byteCount; offset++)
            {
                array[3 + offset] = encoded[offset];
            }

            break;

        case "f": // float
            encoded = Helper.GetLittleEndianByteArray(float.Parse(field.Value, CultureInfo.InvariantCulture));
            for (int offset = 0; offset < 4; offset++)
            {
                array[3 + offset] = encoded[offset];
            }

            break;

        case "Z": // printable string
        case "H": // HexString
            // Both are written as the UTF-8 bytes of the value followed by a NUL byte.
            // NOTE(review): the buffer is sized from Value.Length, so this presumably
            // relies on the value being single-byte characters - confirm.
            encoded = Encoding.UTF8.GetBytes(field.Value);
            encoded.CopyTo(array, 3);
            array[3 + encoded.Length] = (byte)'\0';
            break;

        case "B": // integer or numeric array.
            UpdateArrayType(array, field);
            break;

        default:
            throw new Exception(Properties.Resource.BAM_InvalidOptValType);
    }

    return array;
}
// Gets the size reported for an optional field's value, selected by its VType:
//   "A", "c"            -> -1
//   "C"                 ->  1
//   "s", "S", "i", "I"  -> whatever GetOptionalFieldIntValueSize returns for the value
//   "f"                 ->  4
//   "Z", "H"            -> value length + 1
//   "B"                 -> element size * element count + 1 (array type) + 4 (element count)
// Any other VType throws Exception.
private static int GetOptionalFieldValueSize(SAMOptionalField optionalField)
{
    string valueType = optionalField.VType;

    if (valueType == "A" || valueType == "c")
    {
        return -1;
    }

    if (valueType == "C")
    {
        return 1;
    }

    if (valueType == "s" || valueType == "S" || valueType == "i" || valueType == "I")
    {
        return GetOptionalFieldIntValueSize(optionalField.Value);
    }

    if (valueType == "f")
    {
        return 4;
    }

    if (valueType == "Z" || valueType == "H")
    {
        return optionalField.Value.Length + 1;
    }

    if (valueType != "B")
    {
        throw new Exception(Properties.Resource.BAM_InvalidOptValType);
    }

    // Array value: the first character is the element type, and the first
    // comma-separated item is not counted as an element.
    char elementType = optionalField.Value[0];
    int elementSize = GetSizeOfArrayType(elementType);
    int elementCount = optionalField.Value.Split(DelimComma, StringSplitOptions.RemoveEmptyEntries).Length - 1;

    // 1 byte to store the array type and 4 bytes to store the number of values.
    return (elementSize * elementCount) + 1 + 4;
}
// Writes an array-typed optional field value into the given buffer, starting at index 3:
// array[3] = element type character, array[4..7] = number of elements (from
// Helper.GetLittleEndianByteArray), followed by each element encoded according to the
// element type. Throws Exception for an element type the switch does not handle.
private static void UpdateArrayType(byte[] array, SAMOptionalField field)
{
    char elementType = field.Value[0];
    int elementSize = GetSizeOfArrayType(elementType);
    string[] parts = field.Value.Split(DelimComma, StringSplitOptions.RemoveEmptyEntries);

    array[3] = (byte)elementType;

    // parts[0] holds the array type, so it is not counted as an element.
    int writeIndex = 4;
    byte[] scratch = Helper.GetLittleEndianByteArray(parts.Length - 1);
    for (int countByte = 0; countByte < 4; countByte++)
    {
        array[writeIndex++] = scratch[countByte];
    }

    for (int partIndex = 1; partIndex < parts.Length; partIndex++)
    {
        string element = parts[partIndex];

        switch (elementType)
        {
            // Single-byte types overwrite the first byte of the current scratch buffer.
            case 'A': // Printable character
                scratch[0] = (byte)element[0];
                break;

            case 'c': // signed 8-bit integer
                scratch[0] = (byte)sbyte.Parse(element, CultureInfo.InvariantCulture);
                break;

            case 'C': // unsigned 8-bit integer
                scratch[0] = byte.Parse(element, CultureInfo.InvariantCulture);
                break;

            // Multi-byte types replace the scratch buffer with the helper's output.
            case 's': // signed 16-bit integer
                scratch = Helper.GetLittleEndianByteArray(Int16.Parse(element, CultureInfo.InvariantCulture));
                break;

            case 'S': // unsigned 16-bit integer
                scratch = Helper.GetLittleEndianByteArray(UInt16.Parse(element, CultureInfo.InvariantCulture));
                break;

            case 'i': // signed 32-bit integer
                scratch = Helper.GetLittleEndianByteArray(int.Parse(element, CultureInfo.InvariantCulture));
                break;

            case 'I': // unsigned 32-bit integer
                scratch = Helper.GetLittleEndianByteArray(uint.Parse(element, CultureInfo.InvariantCulture));
                break;

            case 'f': // float
                scratch = Helper.GetLittleEndianByteArray(float.Parse(element, CultureInfo.InvariantCulture));
                break;

            default:
                throw new Exception(string.Format(Properties.Resource.BAM_InvalidOptValType, elementType));
        }

        // Append exactly elementSize bytes of the encoded element.
        for (int b = 0; b < elementSize; b++)
        {
            array[writeIndex++] = scratch[b];
        }
    }
}
/// <summary>
/// Reads the next alignment block from the uncompressed BAM data and converts it into a
/// SAMAlignedSequence.
/// </summary>
/// <param name="start">
/// Lower bound of the region of interest: a record is skipped when its RefEndPos - 1 is
/// less than this value (unless its RName is the "no reference" indicator).
/// </param>
/// <param name="end">
/// Upper bound of the region of interest: a record is skipped when its Pos (stored
/// position + 1) is greater than this value.
/// </param>
/// <returns>
/// The parsed aligned sequence, or null when the record lies outside the requested range.
/// </returns>
/// <exception cref="FileFormatException">Thrown for an unknown CIGAR operation code.</exception>
/// <exception cref="FormatException">Thrown for an invalid optional field value type.</exception>
private SAMAlignedSequence GetAlignedSequence(int start, int end)
{
    // The block is prefixed by its length as a 32-bit integer.
    byte[] array = new byte[4];
    ReadUnCompressedData(array, 0, 4);
    int blockLen = Helper.GetInt32(array, 0);
    byte[] alignmentBlock = new byte[blockLen];
    ReadUnCompressedData(alignmentBlock, 0, blockLen);

    SAMAlignedSequence alignedSeq = new SAMAlignedSequence();
    int value;
    UInt32 unsignedValue;

    // Bytes 0-3: reference sequence index; -1 means no reference.
    int refSeqIndex = Helper.GetInt32(alignmentBlock, 0);
    if (refSeqIndex == -1)
    {
        alignedSeq.RName = "*";
    }
    else
    {
        alignedSeq.RName = refSeqNames[refSeqIndex];
    }

    // Bytes 4-7: position; one is added to the stored value.
    alignedSeq.Pos = Helper.GetInt32(alignmentBlock, 4) + 1;

    // if there is no overlap no need to parse further.
    //     BAMPos > closedEnd
    // => (alignedSeq.Pos - 1) > end - 1
    if (alignedSeq.Pos > end)
    {
        return null;
    }

    // Bytes 8-11: "bin<<16|mapQual<<8|read_name_len".
    unsignedValue = Helper.GetUInt32(alignmentBlock, 8);

    // Shift while the value is still unsigned and cast afterwards. Casting first would
    // make a bin >= 0x8000 negative and the arithmetic shift would then sign-extend it.
    alignedSeq.Bin = (int)((unsignedValue & 0xFFFF0000) >> 16);
    alignedSeq.MapQ = (int)((unsignedValue & 0x0000FF00) >> 8);
    int queryNameLen = (int)(unsignedValue & 0x000000FF);

    // Bytes 12-15: flag in the upper 16 bits, number of CIGAR operations in the lower 16.
    unsignedValue = Helper.GetUInt32(alignmentBlock, 12);
    int flagValue = (int)((unsignedValue & 0xFFFF0000) >> 16);
    alignedSeq.Flag = (SAMFlags)flagValue;
    int cigarLen = (int)(unsignedValue & 0x0000FFFF);

    // Bytes 16-19: read length.
    int readLen = Helper.GetInt32(alignmentBlock, 16);

    // Bytes 20-23: mate reference sequence index; -1 means no reference.
    int mateRefSeqIndex = Helper.GetInt32(alignmentBlock, 20);
    if (mateRefSeqIndex != -1)
    {
        alignedSeq.MRNM = refSeqNames[mateRefSeqIndex];
    }
    else
    {
        alignedSeq.MRNM = "*";
    }

    // Bytes 24-27: mate position; one is added to the stored value.
    alignedSeq.MPos = Helper.GetInt32(alignmentBlock, 24) + 1;

    // Bytes 28-31: insert size.
    alignedSeq.ISize = Helper.GetInt32(alignmentBlock, 28);

    // Bytes 32..(32 + queryNameLen): query name. The last byte of the name is not decoded
    // (presumably a trailing NUL terminator - confirm against the format).
    alignedSeq.QName = System.Text.ASCIIEncoding.ASCII.GetString(alignmentBlock, 32, queryNameLen - 1);

    StringBuilder strbuilder = new StringBuilder();
    int startIndex = 32 + queryNameLen;

    // CIGAR: cigarLen entries of 4 bytes each.
    for (int i = startIndex; i < (startIndex + cigarLen * 4); i += 4)
    {
        // Get the CIGAR operation length stored in first 28 bits.
        UInt32 cigarValue = Helper.GetUInt32(alignmentBlock, i);
        strbuilder.Append(((cigarValue & 0xFFFFFFF0) >> 4).ToString(CultureInfo.InvariantCulture));

        // Get the CIGAR operation stored in last 4 bits. Mask before casting so the
        // cast never sees a value above int.MaxValue.
        value = (int)(cigarValue & 0x0000000F);

        // MIDNSHP=X => 012345678
        switch (value)
        {
            case 0:
                strbuilder.Append("M");
                break;
            case 1:
                strbuilder.Append("I");
                break;
            case 2:
                strbuilder.Append("D");
                break;
            case 3:
                strbuilder.Append("N");
                break;
            case 4:
                strbuilder.Append("S");
                break;
            case 5:
                strbuilder.Append("H");
                break;
            case 6:
                strbuilder.Append("P");
                break;
            case 7:
                strbuilder.Append("=");
                break;
            case 8:
                strbuilder.Append("X");
                break;
            default:
                throw new FileFormatException(Properties.Resource.BAM_InvalidCIGAR);
        }
    }

    string cigar = strbuilder.ToString();
    if (string.IsNullOrWhiteSpace(cigar))
    {
        alignedSeq.CIGAR = "*";
    }
    else
    {
        alignedSeq.CIGAR = cigar;
    }

    // if there is no overlap no need to parse further.
    //     ZeroBasedRefEnd < start
    // => (alignedSeq.RefEndPos - 1) < start
    if (alignedSeq.RefEndPos - 1 < start && alignedSeq.RName != Properties.Resource.SAM_NO_REFERENCE_DEFINED_INDICATOR)
    {
        return null;
    }

    // Sequence: two symbols per byte, upper four bits first.
    // NOTE(review): when readLen is 0 the loop below does not run, but one byte is still
    // read and two symbols are appended - confirm zero-length reads cannot reach here.
    startIndex += cigarLen * 4;
    strbuilder = new StringBuilder();
    int index = startIndex;
    for (; index < (startIndex + (readLen + 1) / 2) - 1; index++)
    {
        // Get first 4 bit value
        value = (alignmentBlock[index] & 0xF0) >> 4;
        strbuilder.Append(GetSeqChar(value));

        // Get last 4 bit value
        value = alignmentBlock[index] & 0x0F;
        strbuilder.Append(GetSeqChar(value));
    }

    // Last byte: the lower four bits carry a symbol only when readLen is even.
    value = (alignmentBlock[index] & 0xF0) >> 4;
    strbuilder.Append(GetSeqChar(value));
    if (readLen % 2 == 0)
    {
        value = alignmentBlock[index] & 0x0F;
        strbuilder.Append(GetSeqChar(value));
    }

    startIndex = index + 1;
    string strSequence = strbuilder.ToString();

    // Quality values: readLen bytes. A first byte of 0xFF means no qualities ("*");
    // otherwise 33 is added to each byte to get its ASCII character.
    byte[] qualValues = new byte[readLen];
    string strQualValues = "*";

    if (alignmentBlock[startIndex] != 0xFF)
    {
        for (int i = startIndex; i < (startIndex + readLen); i++)
        {
            qualValues[i - startIndex] = (byte)(alignmentBlock[i] + 33);
        }

        strQualValues = System.Text.ASCIIEncoding.ASCII.GetString(qualValues);
    }

    SAMParser.ParseQualityNSequence(alignedSeq, Alphabet, strSequence, strQualValues);

    // Optional fields fill the remainder of the block.
    startIndex += readLen;
    if (alignmentBlock.Length > startIndex + 4 && alignmentBlock[startIndex] != 0x0 && alignmentBlock[startIndex + 1] != 0x0)
    {
        for (index = startIndex; index < alignmentBlock.Length; )
        {
            SAMOptionalField optionalField = new SAMOptionalField();
            optionalField.Tag = System.Text.ASCIIEncoding.ASCII.GetString(alignmentBlock, index, 2);
            index += 2;
            char vType = (char)alignmentBlock[index++];
            string valueType = vType.ToString();

            // SAM format supports [AifZH] for value type.
            // In BAM, an integer may be stored as a signed 8-bit integer (c), unsigned 8-bit integer (C), signed short (s), unsigned
            // short (S), signed 32-bit (i) or unsigned 32-bit integer (I), depending on the signed magnitude of the integer. However,
            // in SAM, all types of integers are presented as type 'i'.
            string message = Helper.IsValidPatternValue("VType", valueType, BAMOptionalFieldRegex);
            if (!string.IsNullOrEmpty(message))
            {
                throw new FormatException(message);
            }

            // GetOptionalValue advances index past the value it reads (index is passed by ref).
            optionalField.Value = GetOptionalValue(vType, alignmentBlock, ref index).ToString();

            // Convert to SAM format.
            if ("cCsSI".IndexOf(vType) >= 0)
            {
                valueType = "i";
            }

            optionalField.VType = valueType;

            alignedSeq.OptionalFields.Add(optionalField);
        }
    }

    return alignedSeq;
}