/// <summary>
/// Registers a read with this instance: increments <c>TotalReadCount</c>, forwards the read to
/// <c>ReadsByIndividuals</c>, and extends <c>GenomeLocation.End</c> when the read's reference
/// end position lies beyond it.
/// </summary>
/// <param name="seq">The aligned read to add.</param>
public void AddRead(SAMAlignedSequence seq)
{
    TotalReadCount++;
    ReadsByIndividuals.AddRead(seq);

    // The covered region only ever grows; a read ending inside it changes nothing.
    bool endsInsideRegion = seq.RefEndPos <= GenomeLocation.End;
    if (endsInsideRegion)
    {
        return;
    }

    GenomeLocation.End = seq.RefEndPos;
}
/// <summary>
/// Initializes a new instance of the <see cref="Bio.IO.PacBio.PacBioCCSRead"/> class
/// from an initially parsed BAM record.
/// </summary>
/// <remarks>
/// Recognised optional tags: <c>sn</c> (SNR values, assigned in A, C, G, T order), <c>zm</c>
/// (hole number), <c>pq</c> and <c>rq</c> (read quality), <c>za</c> (average z-score),
/// <c>rs</c> (status counts), <c>np</c> (number of passes), <c>RG</c> (read group) and
/// <c>zs</c> (z-scores). Any other tag is ignored. Numeric values are parsed with the
/// invariant culture so the result does not depend on the machine's regional settings.
/// </remarks>
/// <param name="s">The aligned record whose optional fields and query sequence populate this read.</param>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
public PacBioCCSRead(SAMAlignedSequence s)
{
    if (s == null)
    {
        throw new ArgumentNullException("s");
    }

    // Tag values are machine-written text ("0.93", not "0,93"); parsing them with the
    // current culture would misread them on locales that use a decimal comma.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    /* TODO: Converting from binary to string and back is beyond silly...
     * no performance hit worth worrying about at present, but in the future it might be worth
     * going directly from binary to the type rather than through string intermediates */
    foreach (var v in s.OptionalFields)
    {
        switch (v.Tag)
        {
            case "sn":
            {
                // The first comma-separated token is skipped (presumably the array
                // element-type code - confirm against the BAM-to-string conversion).
                var snrs = v.Value.Split(',').Skip(1).Select(x => Convert.ToSingle(x, invariant)).ToArray();
                SnrA = snrs[0];
                SnrC = snrs[1];
                SnrG = snrs[2];
                SnrT = snrs[3];
                break;
            }

            case "zm":
                HoleNumber = Convert.ToInt32(v.Value, invariant);
                break;

            case "pq":
                // This tag is now deprecated by the rq tag
                ReadQuality = Convert.ToSingle(v.Value, invariant);
                break;

            case "rq":
                // Only applied while ReadQuality is still negative.
                // NOTE(review): this relies on ReadQuality starting out negative - confirm its initializer.
                if (ReadQuality < 0)
                {
                    ReadQuality = Convert.ToSingle(v.Value, invariant);
                }

                break;

            case "za":
                AvgZscore = Convert.ToSingle(v.Value, invariant);
                break;

            case "rs":
                statusCounts = v.Value.Split(',').Skip(1).Select(x => Convert.ToInt32(x, invariant)).ToArray();
                break;

            case "np":
                NumPasses = Convert.ToInt32(v.Value, invariant);
                break;

            case "RG":
                ReadGroup = v.Value;
                break;

            case "zs":
                ZScores = v.Value.Split(',').Skip(1).Select(x => Convert.ToSingle(x, invariant)).ToArray();
                break;
        }
    }

    // TODO: We should use String.Intern here, but not available in PCL...
    // The movie name is the part of the read ID before the first '/'.
    Movie = s.QuerySequence.ID.Split('/')[0];

    // Left null when the query sequence is not a QualitativeSequence.
    Sequence = s.QuerySequence as QualitativeSequence;
}
// Validates the alignment and returns it as a SequenceAlignmentMap.
// An input that already is a SequenceAlignmentMap is returned as-is after header validation;
// any other ISequenceAlignment is converted using the SAM headers stored in its metadata.
// In both cases this.refSequences is refreshed from the header, and sorted only when a
// BAM file sorted by chromosome name and coordinates is being created.
private SequenceAlignmentMap ValidateAlignment(ISequenceAlignment sequenceAlignment)
{
    SequenceAlignmentMap map = sequenceAlignment as SequenceAlignmentMap;
    bool alreadyMap = map != null;

    if (alreadyMap)
    {
        ValidateAlignmentHeader(map.Header);
    }
    else
    {
        SAMAlignmentHeader header = sequenceAlignment.Metadata[Helper.SAMAlignmentHeaderKey] as SAMAlignmentHeader;
        if (header == null)
        {
            throw new ArgumentException(Properties.Resource.SAMAlignmentHeaderNotFound);
        }

        ValidateAlignmentHeader(header);
        map = new SequenceAlignmentMap(header);
    }

    // Reference ranges keep header order unless coordinate-sorted output was requested.
    if (!CreateSortedBAMFile || SortType != BAMSortByFields.ChromosomeNameAndCoordinates)
    {
        this.refSequences = map.Header.GetReferenceSequenceRanges();
    }
    else
    {
        this.refSequences = SortSequenceRanges(map.Header.GetReferenceSequenceRanges());
    }

    if (alreadyMap)
    {
        return(map);
    }

    // Rebuild each aligned sequence as a SAMAlignedSequence from its stored SAM header.
    foreach (IAlignedSequence source in sequenceAlignment.AlignedSequences)
    {
        SAMAlignedSequenceHeader sourceHeader = source.Metadata[Helper.SAMAlignedSequenceHeaderKey] as SAMAlignedSequenceHeader;
        if (sourceHeader == null)
        {
            throw new ArgumentException(Properties.Resource.SAMAlignedSequenceHeaderNotFound);
        }

        SAMAlignedSequence converted = new SAMAlignedSequence(sourceHeader);
        converted.QuerySequence = source.Sequences[0];
        map.QuerySequences.Add(converted);
    }

    return(map);
}
/// <summary>
/// Writes one aligned sequence in the configured output format.
/// </summary>
/// <param name="header">Alignment header, needed by the BAM formatter.</param>
/// <param name="alignedSequence">Aligned sequence to write.</param>
private void WriteAlignedSequence(SAMAlignmentHeader header, SAMAlignedSequence alignedSequence)
{
    bool writeAsSam = !UnCompressedBAM && !BAMOutput;
    if (writeAsSam)
    {
        SAMFormatter.WriteSAMAlignedSequence(alignedSequence, writer);
        return;
    }

    // For compressed BAM output the uncompressed data written here is compressed
    // before it is sent to the output stream.
    bamformatter.WriteAlignedSequence(header, alignedSequence, bamUncompressedOutStream);
}
// Validates the alignment and returns it as a SequenceAlignmentMap.
// An input that already is a SequenceAlignmentMap has its header and every query sequence
// validated and is returned as-is. Any other ISequenceAlignment is converted: the SAM headers
// stored in its metadata are validated and each aligned sequence is rebuilt as a
// SAMAlignedSequence and added to the new map.
// In both cases _refSequences is refreshed with the sorted reference ranges of the header.
// Throws ArgumentException when a required header is missing or a query sequence header is invalid.
private SequenceAlignmentMap ValidateAlignment(ISequenceAlignment sequenceAlignment)
{
    SequenceAlignmentMap seqAlignmentMap = sequenceAlignment as SequenceAlignmentMap;
    if (seqAlignmentMap != null)
    {
        ValidateAlignmentHeader(seqAlignmentMap.Header);
        _refSequences = SortSequenceRanges(seqAlignmentMap.Header.GetReferenceSequenceRanges());

        foreach (SAMAlignedSequence alignedSequence in seqAlignmentMap.QuerySequences)
        {
            // IsValidHeader returns an error message; empty means the header is fine.
            string message = alignedSequence.IsValidHeader();
            if (!string.IsNullOrEmpty(message))
            {
                throw new ArgumentException(message);
            }

            ValidateSQHeader(alignedSequence.RName);
        }

        return(seqAlignmentMap);
    }

    SAMAlignmentHeader header = sequenceAlignment.Metadata[Helper.SAMAlignmentHeaderKey] as SAMAlignmentHeader;
    if (header == null)
    {
        throw new ArgumentException(Resource.SAMAlignmentHeaderNotFound);
    }

    ValidateAlignmentHeader(header);

    seqAlignmentMap = new SequenceAlignmentMap(header);
    _refSequences = SortSequenceRanges(seqAlignmentMap.Header.GetReferenceSequenceRanges());

    foreach (IAlignedSequence alignedSeq in sequenceAlignment.AlignedSequences)
    {
        SAMAlignedSequenceHeader alignedHeader = alignedSeq.Metadata[Helper.SAMAlignedSequenceHeaderKey] as SAMAlignedSequenceHeader;
        if (alignedHeader == null)
        {
            throw new ArgumentException(Resource.SAMAlignedSequenceHeaderNotFound);
        }

        ValidateAlignedSequenceHeader(alignedHeader);
        ValidateSQHeader(alignedHeader.RName);

        SAMAlignedSequence samAlignedSeq = new SAMAlignedSequence(alignedHeader);
        samAlignedSeq.QuerySequence = alignedSeq.Sequences[0];

        // Without this the converted sequence is dropped and the returned map holds no reads.
        seqAlignmentMap.QuerySequences.Add(samAlignedSeq);
    }

    return(seqAlignmentMap);
}
/// <summary>
/// Central guard for the assumptions this class makes about a read, so that the individual
/// methods do not each have to re-check them.
/// A read is accepted only when it has a reference name, its reference end position is greater
/// than its start position, its CIGAR is neither empty nor "*", and its query sequence is a
/// <c>QualitativeSequence</c>.
/// </summary>
/// <param name="seq">The read to check.</param>
/// <exception cref="ArgumentNullException"><paramref name="seq"/> is null.</exception>
/// <exception cref="ArgumentException">The read violates one of the assumptions above.</exception>
private void validateSequence(SAMAlignedSequence seq)
{
    if (seq == null)
    {
        throw new ArgumentNullException("seq");
    }

    // Checks run in this order and stop at the first failure.
    bool usable = !String.IsNullOrEmpty(seq.RName)
        && seq.RefEndPos > seq.Pos
        && !String.IsNullOrEmpty(seq.CIGAR)
        && seq.CIGAR != "*"
        && seq.QuerySequence is QualitativeSequence;

    if (usable)
    {
        return;
    }

    throw new ArgumentException("Tried to build a pileup with an invalid sequence. Sequence was:\n"+ seq.ToString());
}
/// <summary>
/// Writes one aligned sequence to <c>_write</c> as a tab-separated record: the eleven fixed
/// columns followed by one <c>tag:type:value</c> column per optional field.
/// </summary>
/// <param name="alignedSequence">The aligned sequence to display.</param>
/// <param name="stream">Not used by this method.</param>
private void DisplaySeqAlignments(SAMAlignedSequence alignedSequence, FileStream stream = null)
{
    // Every record starts on a fresh line.
    _write.Write("\n");

    // "*" stands in for a missing sequence or missing quality values.
    string seq;
    if (alignedSequence.QuerySequence.Count > 0)
    {
        seq = alignedSequence.QuerySequence.ToString();
    }
    else
    {
        seq = "*";
    }

    string qualValues;
    QualitativeSequence qualSeq = alignedSequence.QuerySequence as QualitativeSequence;
    if (qualSeq == null)
    {
        qualValues = "*";
    }
    else
    {
        byte[] scores = qualSeq.Scores;
        qualValues = System.Text.ASCIIEncoding.ASCII.GetString(scores);
    }

    // Flag rendering: hexadecimal, descriptive text, or plain decimal (checked in that order).
    string flag;
    if (FlagInHex)
    {
        flag = String.Format("0x" + "{0:x2}", (int)alignedSequence.Flag);
    }
    else if (FlagAsString)
    {
        flag = GetFlagDesc(alignedSequence.Flag);
    }
    else
    {
        flag = ((int)alignedSequence.Flag).ToString();
    }

    // A mate on the same reference is abbreviated to "=".
    string mateReference = alignedSequence.MRNM.Equals(alignedSequence.RName) ? "=" : alignedSequence.MRNM;

    _write.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}",
        alignedSequence.QName,
        flag,
        alignedSequence.RName,
        alignedSequence.Pos,
        alignedSequence.MapQ,
        alignedSequence.CIGAR,
        mateReference,
        alignedSequence.MPos,
        alignedSequence.ISize,
        seq,
        qualValues);

    foreach (SAMOptionalField field in alignedSequence.OptionalFields)
    {
        _write.Write("\t{0}:{1}:{2}", field.Tag, field.VType, field.Value);
    }
}
/// <summary>
/// Lazily enumerates the aligned sequences of the BAM file being read by <c>bamparser</c>,
/// keeping only those accepted by <c>Filter</c> when a filter is applied.
/// </summary>
/// <param name="bamStream">BAM file stream. Not read directly here; records come from <c>bamparser</c>.</param>
/// <returns>The aligned sequences that pass the filter; never contains null entries.</returns>
private IEnumerable <SAMAlignedSequence> GetAlignedSequence(Stream bamStream)
{
    bool isFilterRequired = IsFilterApplied();

    while (!bamparser.IsEOF())
    {
        SAMAlignedSequence alignedSequence = bamparser.GetAlignedSequence(false);

        // The parser can hand back null; such a record must neither reach Filter
        // nor be yielded to callers that dereference every item.
        if (alignedSequence == null)
        {
            continue;
        }

        if (!isFilterRequired || Filter(alignedSequence))
        {
            yield return(alignedSequence);
        }
    }
}
// Gets the total length of the optional fields in a SAMAlignedSequence object.
// Each field contributes 3 fixed units (presumably the two-character tag plus the value-type
// byte - confirm against the writer) plus the magnitude of the size reported by
// GetOptionalFieldValueSize. A reported size of 0 is treated as an invalid value and raises
// a FormatException naming the value, the tag and the read.
private static int GetAuxiliaryDataLength(SAMAlignedSequence alignedSeq)
{
    int total = 0;

    foreach (SAMOptionalField field in alignedSeq.OptionalFields)
    {
        total += 3;

        int reported = GetOptionalFieldValueSize(field);
        if (reported == 0)
        {
            string message = string.Format(CultureInfo.InvariantCulture, Resource.BAM_InvalidIntValueInOptFieldOfAlignedSeq, field.Value, field.Tag, alignedSeq.QName);
            throw new FormatException(message);
        }

        // The helper may report a negative size; only its magnitude counts here.
        int magnitude = reported;
        if (magnitude < 0)
        {
            magnitude = -magnitude;
        }

        total += magnitude;
    }

    return(total);
}
/// <summary>
/// Parses a BAM file with data virtualization enforced, then re-parses a single aligned
/// sequence through a sequence pointer and checks that its query sequence matches the
/// expected value from the XML configuration.
/// </summary>
public void ValidateBAMParseAlignedSeqWithSeqPointer()
{
    // All test inputs come from the BAM-to-SAM conversion node of the XML configuration.
    string expectedSequence = _utilityObj._xmlUtil.GetTextValue(
        Constants.BAMToSAMConversionNode, Constants.ExpectedSeqWithPointersNode);
    string samFilePath = _utilityObj._xmlUtil.GetTextValue(
        Constants.BAMToSAMConversionNode, Constants.FilePathNode);
    string startingLineForPointer = _utilityObj._xmlUtil.GetTextValue(
        Constants.BAMToSAMConversionNode, Constants.LineNumberToPointNode);
    string startIndex = _utilityObj._xmlUtil.GetTextValue(
        Constants.BAMToSAMConversionNode, Constants.StartIndexNode);
    string endIndex = _utilityObj._xmlUtil.GetTextValue(
        Constants.BAMToSAMConversionNode, Constants.EndIndexNode);

    using (BAMParser parser = new BAMParser())
    {
        // Parse the whole file first.
        parser.EnforceDataVirtualization = true;
        SequenceAlignmentMap parsedMap = parser.Parse(samFilePath);
        Assert.IsNotNull(parsedMap);

        // Build the pointer from the configured line number and index range.
        int pointerLine = Int32.Parse(startingLineForPointer, (IFormatProvider)null);
        int pointerStart = Int32.Parse(startIndex, (IFormatProvider)null);
        int pointerEnd = Int32.Parse(endIndex, (IFormatProvider)null);
        SequencePointer pointer = GetBAMSequencePointer(pointerLine, pointerStart, pointerEnd);

        // Re-parse a single aligned sequence through the pointer and compare it.
        SAMAlignedSequence pointedSeq = (SAMAlignedSequence)parser.ParseAlignedSequence(pointer);
        Assert.AreEqual(expectedSequence, pointedSeq.QuerySequence.ToString());

        Console.WriteLine(string.Format((IFormatProvider)null,
            "BAM Parser BVT : Sequence alignment aligned seq {0} validate successfully",
            pointedSeq.Sequences[0].ToString()));
        ApplicationLog.WriteLine(string.Format((IFormatProvider)null,
            "BAM Parser BVT : Sequence alignment aligned seq validate successfully"));
    }
}
/// <summary>
/// Writes a single SAMAlignedSequence to the given stream.
/// The reference ranges of <paramref name="header"/> are sorted and stored in
/// <c>_refSequences</c> before the record itself is written.
/// </summary>
/// <param name="header">Header from the SAM object; supplies the reference sequence ranges.</param>
/// <param name="alignedSeq">SAMAlignedSequence object to write.</param>
/// <param name="writer">Stream to write to.</param>
/// <exception cref="ArgumentNullException">Any of the three arguments is null.</exception>
public void WriteAlignedSequence(SAMAlignmentHeader header, SAMAlignedSequence alignedSeq, Stream writer)
{
    if (header == null)
    {
        throw new ArgumentNullException("header");
    }

    if (alignedSeq == null)
    {
        throw new ArgumentNullException("alignedSeq");
    }

    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }

    var sortedRanges = SortSequenceRanges(header.GetReferenceSequenceRanges());
    _refSequences = sortedRanges;

    WriteAlignedSequence(alignedSeq, writer);
}
/// <summary>
/// Updates the linear index array for an aligned read: every 2^14-base window the read
/// touches gets <paramref name="offset"/> when the window has no offset yet or its stored
/// offset is greater than <paramref name="offset"/>. Also tracks the highest window seen.
/// </summary>
/// <param name="alignedSeq">The aligned read; its Pos and RefEndPos define the windows covered.</param>
/// <param name="offset">The file offset associated with the read.</param>
internal void UpdateLinearArrayIndex(SAMAlignedSequence alignedSeq, FileOffset offset)
{
    // Convert the 1-based coordinates to 0-based, clamping non-positive values to 0.
    int zeroBasedStart = alignedSeq.Pos > 0 ? alignedSeq.Pos - 1 : 0;
    int zeroBasedEnd = alignedSeq.RefEndPos > 0 ? alignedSeq.RefEndPos - 1 : 0;

    // Shifting right by 14 maps a coordinate to the index of its 16,384-base window.
    int firstWindow = zeroBasedStart >> 14;
    int lastWindow = zeroBasedEnd >> 14;

    if (lastWindow > largestBinSeen)
    {
        largestBinSeen = lastWindow;
    }

    for (int window = firstWindow; window <= lastWindow; window++)
    {
        var stored = offSetArray[window];
        bool slotEmpty = stored.BothDataElements == 0;

        //TODO: Is second check necessary? Seems to always be true as we are doing things in order
        if (slotEmpty || stored > offset)
        {
            offSetArray[window] = offset;
        }
    }
}
/// <summary>
/// Checks that SAMAlignedSequence.GetObjectData throws ArgumentNullException for a null
/// SerializationInfo, and then runs without throwing when given a valid one.
/// </summary>
public void ValidateSAMAlignedSequenceGetObjectData()
{
    StreamingContext context = new StreamingContext(StreamingContextStates.All);
    SAMAlignedSequence alignedSeq = new SAMAlignedSequence();
    SerializationInfo missingInfo = null;

    try
    {
        alignedSeq.GetObjectData(missingInfo, context);

        // Reaching this line means the null argument was not rejected.
        Assert.Fail();
    }
    catch (ArgumentNullException)
    {
        SerializationInfo validInfo = new SerializationInfo(typeof(SAMAlignedSequence), new FormatterConverter());
        alignedSeq.GetObjectData(validInfo, context);
    }

    ApplicationLog.WriteLine("SAMAlignedSequence P1 : Successfully validated GetObjectData() method");
}
/// <summary>
/// Sorts the indices of the SequenceAlignmentMap's query sequences by QName.
/// Indices are collected into a list sorted by QName (reads sharing a QName get a
/// comma-separated index list). Whenever the list reaches <c>SortedListMaxCount</c> entries
/// it is written to a file via <c>WriteToFile</c> and cleared; any remainder is written at the end.
/// </summary>
/// <returns>The values returned by <c>WriteToFile</c>, in the order the chunks were written.</returns>
private IList <string> SortByReadNames()
{
    IList <string> files = new List <string>();
    var sortedList = new System.Collections.Generic.SortedList <object, string>();

    for (int index = 0; index < sequenceAlignMap.QuerySequences.Count; index++)
    {
        SAMAlignedSequence alignedSeq = sequenceAlignMap.QuerySequences[index];
        string indexText = index.ToString(CultureInfo.InvariantCulture);

        string indices;
        if (sortedList.TryGetValue(alignedSeq.QName, out indices))
        {
            // Another read with the same name: append this index to its list.
            sortedList[alignedSeq.QName] = string.Format(CultureInfo.InvariantCulture, "{0},{1}", indices, indexText);
        }
        else
        {
            sortedList.Add(alignedSeq.QName, indexText);
        }

        // Flush a full chunk to disk to bound memory use.
        if (sortedList.Count >= SortedListMaxCount)
        {
            files.Add(WriteToFile(sortedList));
            sortedList.Clear();
        }
    }

    // Flush whatever is left after the last full chunk.
    if (sortedList.Count > 0)
    {
        files.Add(WriteToFile(sortedList));
        sortedList.Clear();
    }

    return(files);
}
// Consumes every read that is aligned to the current contig and starts exactly at
// current_position (Pos is 1-based, current_position is compared against Pos - 1).
// Each such read updates the aligned read/base-pair counters; reads with MapQ > 0 also
// update the "used" counters and are queued as Padded_Read. After each consumed read the
// parser is advanced to the next record that is non-null, has a reference name other than
// "*" and is not a dummy read (or to the last record read at EOF); next_alignment is set
// to null when the parser was already at EOF.
private void Search_Reads(BAMParser parser, ref SAMAlignedSequence next_alignment, string contig_name, ref long number_of_aligned_reads, ref long number_of_aligned_base_pairs, ref long number_of_used_reads, ref long number_of_used_base_pairs, Queue <Padded_Read> read_queue, long current_position)
{
    while (next_alignment != null
           && !next_alignment.IsDummyRead
           && next_alignment.RName == contig_name
           && (next_alignment.Pos - 1) == current_position)
    {
        // The next alignment starts at the current position, so account for it.
        var readLength = next_alignment.QuerySequence.Count;
        number_of_aligned_reads++;
        number_of_aligned_base_pairs += readLength;

        // Maybe we should let the minimum alignment quality be a parameter.
        // We currently leave it for the user to pre-filter the BAM file.
        if (next_alignment.MapQ > 0)
        {
            number_of_used_reads++;
            number_of_used_base_pairs += readLength;
            read_queue.Enqueue(new Padded_Read(next_alignment));
        }

        // Parse the BAM file until the next usable alignment is found.
        if (parser.IsEOF())
        {
            next_alignment = null;
            continue;
        }

        do
        {
            next_alignment = parser.GetAlignedSequence(true);
        }
        while ((next_alignment == null || next_alignment.RName == "*" || next_alignment.IsDummyRead)
               && !parser.IsEOF());
    }
}
/// <summary>
/// Renders an aligned sequence as one tab-separated text record: the eleven fixed columns
/// followed by the optional fields, each formatted as <c>tag:type:value</c> and joined by tabs.
/// </summary>
/// <param name="sam">The aligned sequence to render.</param>
/// <returns>The text record, or null when <paramref name="sam"/> is null.</returns>
public string SAMToString(SAMAlignedSequence sam)
{
    if (sam == null)
    {
        return(null);
    }

    // Deferred projection; it is only enumerated by Merge below.
    var optionalColumns = sam.OptionalFields.Select(
        of => string.Format("{0}:{1}:{2}", of.Tag, of.VType, of.Value));

    return(string.Format("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}",
        sam.QName,
        (int)sam.Flag,
        sam.RName,
        sam.Pos,
        sam.MapQ,
        sam.CIGAR,
        sam.MRNM,
        sam.MPos,
        sam.ISize,
        sam.GetQuerySequenceString(),
        sam.GetQualityScoresString(),
        optionalColumns.Merge("\t")));
}
// Validates the alignment and returns it as a SequenceAlignmentMap.
// An input that already is a SequenceAlignmentMap is returned as-is after header validation;
// any other ISequenceAlignment is converted using the SAM headers stored in its metadata,
// with every aligned sequence rebuilt as a SAMAlignedSequence and added to the new map.
// In both cases _refSequences is refreshed with the sorted reference ranges of the header.
private SequenceAlignmentMap ValidateAlignment(ISequenceAlignment sequenceAlignment)
{
    SequenceAlignmentMap map = sequenceAlignment as SequenceAlignmentMap;
    bool alreadyMap = map != null;

    if (alreadyMap)
    {
        ValidateAlignmentHeader(map.Header);
    }
    else
    {
        SAMAlignmentHeader header = sequenceAlignment.Metadata[Helper.SAMAlignmentHeaderKey] as SAMAlignmentHeader;
        if (header == null)
        {
            throw new ArgumentException(Resource.SAMAlignmentHeaderNotFound);
        }

        ValidateAlignmentHeader(header);
        map = new SequenceAlignmentMap(header);
    }

    _refSequences = SortSequenceRanges(map.Header.GetReferenceSequenceRanges());

    if (alreadyMap)
    {
        return(map);
    }

    foreach (IAlignedSequence source in sequenceAlignment.AlignedSequences)
    {
        SAMAlignedSequenceHeader sourceHeader = source.Metadata[Helper.SAMAlignedSequenceHeaderKey] as SAMAlignedSequenceHeader;
        if (sourceHeader == null)
        {
            throw new ArgumentException(Resource.SAMAlignedSequenceHeaderNotFound);
        }

        SAMAlignedSequence converted = new SAMAlignedSequence(sourceHeader);
        converted.QuerySequence = source.Sequences[0];
        map.QuerySequences.Add(converted);
    }

    return(map);
}
/// <summary>
/// Parses a SAM file with data virtualization enforced, then re-parses a single aligned
/// sequence through a sequence pointer with fixed index offsets and checks that its query
/// sequence matches the expected value from the XML configuration.
/// </summary>
public void ValidateSAMParseAlignedSeqWithSeqPointer()
{
    // All test inputs come from the "SAM file with all fields" node of the XML configuration.
    string expectedSequence = Utility._xmlUtil.GetTextValue(
        Constants.SAMFileWithAllFieldsNode, Constants.ExpectedSeqWithPointersNode);
    string samFilePath = Utility._xmlUtil.GetTextValue(
        Constants.SAMFileWithAllFieldsNode, Constants.FilePathNode);
    string lineNumberForPointer = Utility._xmlUtil.GetTextValue(
        Constants.SAMFileWithAllFieldsNode, Constants.LineNumberToPointNode);

    // Parse the whole file first.
    SAMParser parser = new SAMParser();
    parser.EnforceDataVirtualization = true;
    SequenceAlignmentMap parsedMap = parser.Parse(samFilePath);
    Assert.IsNotNull(parsedMap);

    // Build the pointer and pin the offsets of the record under test.
    int pointerLine = Int32.Parse(lineNumberForPointer);
    SequencePointer pointer = GetSequencePointer(pointerLine);
    pointer.IndexOffsets[0] = 156;
    pointer.IndexOffsets[1] = 304;

    // Re-parse a single aligned sequence through the pointer and compare it.
    SAMAlignedSequence pointedSeq = (SAMAlignedSequence)parser.ParseAlignedSequence(pointer);
    Assert.AreEqual(expectedSequence, pointedSeq.QuerySequence.ToString());

    Console.WriteLine(string.Format(null,
        "SAM Parser BVT : Sequence alignment aligned seq {0} validate successfully",
        pointedSeq.Sequences[0].ToString()));
    ApplicationLog.WriteLine(string.Format(null,
        "SAM Parser BVT : Sequence alignment aligned seq validate successfully"));
}
/// <summary>
/// Checks that VirtualAlignedSequenceList.CopyTo copies every aligned sequence: each array
/// element must have the same query sequence text as the list item at the same index.
/// </summary>
public void ValidateVirtualBAMAlignedSequenceListCopyTo()
{
    VirtualAlignedSequenceList <SAMAlignedSequence> virtualList =
        GetBAMAlignedSequence(Constants.BAMFileWithMultipleAlignedSeqsNode);

    // Copy the virtual aligned sequence list into a plain array of the same size.
    SAMAlignedSequence[] copied = new SAMAlignedSequence[virtualList.Count];
    virtualList.CopyTo(copied, 0);

    // Compare the copy with the source element by element.
    for (int position = 0; position < virtualList.Count; position++)
    {
        string copiedText = copied[position].QuerySequence.ToString();
        string sourceText = virtualList[position].QuerySequence.ToString();
        Assert.AreEqual(copiedText, sourceText);
    }

    ApplicationLog.WriteLine(string.Format((IFormatProvider)null,
        "Virtual AlignedSequenceList Bvt : Validated the VAS CopyTo"));
    Console.WriteLine(string.Format((IFormatProvider)null,
        "Virtual AlignedSequenceList Bvt : Validated the VAS CopyTo"));
}
/// <summary>
/// Lazily enumerates the aligned sequences in the specified SAM file, keeping only those
/// accepted by <c>Filter</c> when a filter is applied.
/// </summary>
/// <param name="textReader">SAM file reader, positioned on the first alignment line.</param>
/// <returns>The aligned sequences that pass the filter, in file order.</returns>
private IEnumerable <SAMAlignedSequence> GetAlignedSequence(MBFTextReader textReader)
{
    bool applyFilter = IsFilterApplied();

    while (textReader.HasLines)
    {
        SAMAlignedSequence parsed = SAMParser.ParseSequence(textReader, false);

        // Without a filter every record is passed through.
        if (!applyFilter || Filter(parsed))
        {
            yield return(parsed);
        }

        textReader.GoToNextLine();
    }
}
/// <summary>
/// Lazily enumerates the aligned sequences of the BAM file being read by <c>bamparser</c>,
/// skipping null records and keeping only those accepted by <c>Filter</c> when a filter is applied.
/// </summary>
/// <param name="bamStream">BAM file stream. Not read directly here; records come from <c>bamparser</c>.</param>
/// <returns>The non-null aligned sequences that pass the filter.</returns>
private IEnumerable <SAMAlignedSequence> GetAlignedSequence(Stream bamStream)
{
    bool applyFilter = IsFilterApplied();

    while (!bamparser.IsEOF())
    {
        SAMAlignedSequence parsed = bamparser.GetAlignedSequence(false);

        //TODO: The parser should probably never return a null sequence
        //this may be a band aid over a lurking problem, fix in future
        if (parsed == null)
        {
            continue;
        }

        if (applyFilter && !Filter(parsed))
        {
            continue;
        }

        yield return(parsed);
    }
}
/// <summary>
/// General helper for the invalid quality/sequence tests: calls
/// <c>SAMParser.ParseQualityNSequence</c> with the argument combination selected by
/// <paramref name="method"/> and expects an ArgumentException or FormatException.
/// The test fails when no exception is thrown.
/// </summary>
/// <param name="method">Enum value selecting which invalid argument combination to execute.</param>
private static void ValidateQualitySeqLength(ParseOrFormatQualLength method)
{
    SAMAlignedSequence alignedSeq = new SAMAlignedSequence();

    try
    {
        if (method == ParseOrFormatQualLength.AlignedSeq)
        {
            // Aligned sequence without a query name.
            SAMParser.ParseQualityNSequence(
                alignedSeq, Alphabets.DNA, null, String.Empty, null);
        }
        else if (method == ParseOrFormatQualLength.Sequencedata)
        {
            alignedSeq.QName = "Quality Value";
            SAMParser.ParseQualityNSequence(
                alignedSeq, Alphabets.DNA, null, String.Empty, null);
        }
        else if (method == ParseOrFormatQualLength.Qualitydata)
        {
            alignedSeq.QName = "Quality Value";
            SAMParser.ParseQualityNSequence(
                alignedSeq, Alphabets.DNA, null, Constants.QualitySequence, null);
        }
        else if (method == ParseOrFormatQualLength.QualityLength)
        {
            alignedSeq.QName = "Quality Value";
            SAMParser.ParseQualityNSequence(
                alignedSeq, Alphabets.DNA, null, Constants.QualitySequence,
                new Sequence(Alphabets.DNA, Constants.QualityLength));
        }

        // No exception was raised (or the enum value is not handled): the test fails.
        Assert.Fail();
    }
    catch (ArgumentException)
    {
        ApplicationLog.WriteLine(
            "SAM Parser P2 : Successfully validated the exception");
        Console.WriteLine(
            "SAM Parser P2 : Successfully validated the exception");
    }
    catch (FormatException)
    {
        ApplicationLog.WriteLine(
            "SAM Parser P2 : Successfully validated the exception");
        Console.WriteLine(
            "SAM Parser P2 : Successfully validated the exception");
    }
}
/// <summary>
/// Packs the query sequence into 4-bit codes, two bases per byte, with the earlier base in
/// the high-order nibble. Codes: '=' is 0, A is 1, C is 2, G is 4, T is 8, anything else is 15.
/// Positions listed in <c>DotSymbolIndexes</c> are encoded as 'N' and positions listed in
/// <c>EqualSymbolIndexes</c> as '='.
/// </summary>
/// <param name="alignedSeq">The aligned sequence whose query sequence is encoded.</param>
/// <returns>The packed bytes; an empty array when there is no query sequence.</returns>
/// <exception cref="ArgumentException">The query sequence does not use the DNA alphabet.</exception>
private static byte[] GetEncodedSequence(SAMAlignedSequence alignedSeq)
{
    ISequence seq = alignedSeq.QuerySequence;
    List <byte> packed = new List <byte>();

    if (seq == null)
    {
        return(packed.ToArray());
    }

    if (seq.Alphabet != Alphabets.DNA)
    {
        throw new ArgumentException(Resource.SAMFormatterSupportsDNAOnly);
    }

    for (int i = 0; i < seq.Count; i++)
    {
        char symbol = seq[i].Symbol;

        // NOTE(review): each consumed index is removed from the caller's index collections,
        // so encoding the same object a second time would no longer restore these symbols - confirm intended.
        if (alignedSeq.DotSymbolIndexes.Count > 0 && alignedSeq.DotSymbolIndexes.Contains(i))
        {
            symbol = 'N';
            alignedSeq.DotSymbolIndexes.Remove(i);
        }

        if (alignedSeq.EqualSymbolIndexes.Count > 0 && alignedSeq.EqualSymbolIndexes.Contains(i))
        {
            symbol = '=';
            alignedSeq.EqualSymbolIndexes.Remove(i);
        }

        // 4-bit encoded read: =ACGTN=>0,1,2,4,8,15
        byte nibble;
        switch (symbol)
        {
            case '=':
                nibble = 0;
                break;

            case 'A':
                nibble = 1;
                break;

            case 'C':
                nibble = 2;
                break;

            case 'G':
                nibble = 4;
                break;

            case 'T':
                nibble = 8;
                break;

            default:
                nibble = 15;
                break;
        }

        if (i % 2 == 0)
        {
            // First base of a pair: start a new byte with the code in the high nibble.
            packed.Add((byte)(nibble << 4));
        }
        else
        {
            // Second base of a pair: fill the low nibble of the byte just started.
            int last = packed.Count - 1;
            packed[last] = (byte)(packed[last] | nibble);
        }
    }

    return(packed.ToArray());
}
/// <summary>
/// Writes one SAMAlignedSequence to the stream as a BAM alignment record: block size,
/// reference index, position, packed bin/mapping-quality/name-length, packed flag/CIGAR-length,
/// read length, mate reference index, mate position, insert size, null-terminated read name,
/// encoded CIGAR, 4-bit encoded bases, quality values and optional fields - all little-endian.
/// </summary>
/// <param name="alignedSeq">SAMAlignedSequence object to write.</param>
/// <param name="writer">Stream to write to.</param>
private void WriteAlignedSequence(SAMAlignedSequence alignedSeq, Stream writer)
{
    // Get the total block size required.
    int blocksize = GetBlockSize(alignedSeq);

    // Get Reference sequence index.
    int rid = GetRefSeqID(alignedSeq.RName);

    // bin<<16|mapQual<<8|read_name_len (including NULL)
    uint bin_mq_nl = (uint)alignedSeq.Bin << 16;
    bin_mq_nl = bin_mq_nl | (uint)alignedSeq.MapQ << 8;
    bin_mq_nl = bin_mq_nl | (uint)(alignedSeq.QName.Length + 1);

    // flag<<16|cigar_len
    uint flag_nc = (uint)alignedSeq.Flag << 16;
    flag_nc = flag_nc | (uint)GetCIGARLength(alignedSeq.CIGAR);

    int readLen = alignedSeq.QuerySequence.Count;

    int mateRefId = GetRefSeqID(alignedSeq.MRNM);

    byte[] readName = System.Text.ASCIIEncoding.ASCII.GetBytes(alignedSeq.QName);

    // Cigar: op_len<<4|op. Op: MIDNSHP=>0123456
    IList <uint> encodedCIGAR = GetEncodedCIGAR(alignedSeq.CIGAR);

    // SAM positions are 1-based with 0 meaning "unavailable"; BAM stores them 0-based
    // with -1 meaning "unavailable". The same mapping applies to Pos and to the mate position.
    int bamPos = alignedSeq.Pos > 0 ? alignedSeq.Pos - 1 : -1;
    int bamMatePos = alignedSeq.MPos > 0 ? alignedSeq.MPos - 1 : -1;

    //block size
    writer.Write(Helper.GetLittleEndianByteArray(blocksize), 0, 4);

    // Reference sequence index.
    writer.Write(Helper.GetLittleEndianByteArray(rid), 0, 4);

    // Pos
    writer.Write(Helper.GetLittleEndianByteArray(bamPos), 0, 4);

    // bin<<16|mapQual<<8|read_name_len (including NULL)
    writer.Write(Helper.GetLittleEndianByteArray(bin_mq_nl), 0, 4);

    // flag<<16|cigar_len
    writer.Write(Helper.GetLittleEndianByteArray(flag_nc), 0, 4);

    // Length of the read
    writer.Write(Helper.GetLittleEndianByteArray(readLen), 0, 4);

    // Mate reference sequence index
    writer.Write(Helper.GetLittleEndianByteArray(mateRefId), 0, 4);

    // mate_pos - Leftmost coordinate of the mate
    writer.Write(Helper.GetLittleEndianByteArray(bamMatePos), 0, 4);

    // Insert size of the read pair (if paired). This is a signed value: a negative
    // insert size is legitimate and must be written unchanged, not clamped to 0.
    writer.Write(Helper.GetLittleEndianByteArray(alignedSeq.ISize), 0, 4);

    // Read name, null terminated
    writer.Write(readName, 0, readName.Length);
    writer.WriteByte((byte)'\0');

    // Cigar: op_len<<4|op. Op: MIDNSHP=>0123456
    for (int i = 0; i < encodedCIGAR.Count; i++)
    {
        writer.Write(Helper.GetLittleEndianByteArray(encodedCIGAR[i]), 0, 4);
    }

    // 4-bit encoded read: =ACGTN=>0,1,2,4,8,15; the earlier base is stored in the high-order 4 bits of the byte.
    byte[] encodedValues = GetEncodedSequence(alignedSeq);
    writer.Write(encodedValues, 0, encodedValues.Length);

    // Phred base quality (0xFF if absent)
    encodedValues = GetQualityValue(alignedSeq.QuerySequence);
    writer.Write(encodedValues, 0, encodedValues.Length);

    // Optional fields
    foreach (SAMOptionalField field in alignedSeq.OptionalFields)
    {
        byte[] optionalArray = GetOptioanField(field);
        writer.Write(optionalArray, 0, optionalArray.Length);
    }
}
/// <summary>
/// Lazily parses the BAM file named by <c>_fileName</c> and yields its aligned reads one at a time.
/// </summary>
/// <remarks>
/// When <c>ChromosomeToGet</c> is set, the reads come from
/// <c>ParseRangeAsEnumerableSequences</c> for that chromosome; otherwise the file is read
/// sequentially until end of file. Null reads are skipped in both modes.
/// Because this is an iterator, the file name check and the opening of the file happen when
/// enumeration starts, not when the method is called.
/// </remarks>
/// <returns>The non-null reads of the file (or of the requested chromosome).</returns>
/// <exception cref="ArgumentNullException"><c>_fileName</c> is null, empty or whitespace.</exception>
/// <exception cref="FileFormatException">The file is empty.</exception>
public IEnumerable <CompactSAMSequence> Parse()
{
    if (string.IsNullOrWhiteSpace(_fileName))
    {
        throw new ArgumentNullException("fileName");
    }

    using (readStream = new FileStream(_fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        if (readStream.Length == 0)
        {
            throw new FileFormatException(Properties.Resource.BAM_InvalidBAMFile);
        }

        if (!String.IsNullOrEmpty(ChromosomeToGet))
        {
            foreach (var s in ParseRangeAsEnumerableSequences(_fileName, ChromosomeToGet))
            {
                if (s != null)
                {
                    yield return(s);
                }
            }
        }
        else
        {
            ValidateReader();

            // The header is still read even though its value is not needed here
            // (presumably this positions the reader at the first alignment - confirm in GetHeader).
            GetHeader();

            while (!IsEOF())
            {
#if WANT_OLD_VERSION
                SAMAlignedSequence alignedSeq = GetAlignedSequence(0, int.MaxValue);
#else
                var alignedSeq = GetAlignedSequence();
#endif
                if (alignedSeq != null)
                {
#if WANT_OLD_VERSION
                    //make a new Sequence
                    ISequence strippedOfInfo = null;
                    try
                    {
                        var syms = alignedSeq.QuerySequence.ToArray();
                        var alpha = Alphabets.AutoDetectAlphabet(syms, 0, syms.Length, null);
                        strippedOfInfo = new Sequence(alpha, alignedSeq.QuerySequence.ToArray());
                        strippedOfInfo = alignedSeq;
                    }
                    catch (ArgumentOutOfRangeException exception)
                    {
                        Debug.Write("Could not convert sequence: " + exception.Message);
                    }

                    if (strippedOfInfo != null)
                    {
                        yield return(strippedOfInfo);
                    }
#else
                    yield return(alignedSeq);
#endif
                }
            }
        }
    }
}
/// <summary>
/// Reads the next alignment record from the uncompressed BAM data and decodes it
/// into a <see cref="SAMAlignedSequence"/>.
/// </summary>
/// <param name="start">Lower bound of the requested range: the record is skipped when
/// (RefEndPos - 1) is smaller than this value, unless its RName is the
/// "no reference defined" indicator.</param>
/// <param name="end">Upper bound of the requested range: the record is skipped when
/// its Pos (stored position + 1) is greater than this value.</param>
/// <returns>The decoded alignment, or null when the record lies outside the requested range.</returns>
/// <exception cref="FileFormatException">A CIGAR operation code outside 0-8 was found.</exception>
/// <exception cref="FormatException">An optional field carries an unsupported value type.</exception>
private SAMAlignedSequence GetAlignedSequence(int start, int end)
{
    // Operation letters indexed by the 4-bit CIGAR operation code.
    const string CigarOperations = "MIDNSHP=X";

    // Every record is prefixed with its own length (4 bytes).
    byte[] array = new byte[4];
    ReadUnCompressedData(array, 0, 4);
    int blockLen = Helper.GetInt32(array, 0);
    byte[] alignmentBlock = new byte[blockLen];
    ReadUnCompressedData(alignmentBlock, 0, blockLen);

    SAMAlignedSequence alignedSeq = new SAMAlignedSequence();
    int value;
    UInt32 UnsignedValue;

    // 0-4 bytes: reference sequence index (-1 means no reference).
    int refSeqIndex = Helper.GetInt32(alignmentBlock, 0);
    if (refSeqIndex == -1)
    {
        alignedSeq.RName = "*";
    }
    else
    {
        alignedSeq.RName = refSeqNames[refSeqIndex];
    }

    // 4-8 bytes: position.
    alignedSeq.Pos = Helper.GetInt32(alignmentBlock, 4) + 1;

    // If there is no overlap there is no need to parse further.
    // BAMPos > closedEnd
    // => (alignedSeq.Pos - 1) > end - 1
    if (alignedSeq.Pos > end)
    {
        return null;
    }

    // 8-12 bytes: "bin<<16|mapQual<<8|read_name_len".
    // Shift while the value is still unsigned and cast afterwards; casting first
    // sign-extends any value whose top bit is set (e.g. bin >= 32768) into a negative number.
    UnsignedValue = Helper.GetUInt32(alignmentBlock, 8);
    alignedSeq.Bin = (int)((UnsignedValue & 0xFFFF0000) >> 16);
    alignedSeq.MapQ = (int)((UnsignedValue & 0x0000FF00) >> 8);
    int queryNameLen = (int)(UnsignedValue & 0x000000FF);

    // 12-16 bytes: "flag<<16|cigar_len".
    UnsignedValue = Helper.GetUInt32(alignmentBlock, 12);
    int flagValue = (int)((UnsignedValue & 0xFFFF0000) >> 16);
    alignedSeq.Flag = (SAMFlags)flagValue;
    int cigarLen = (int)(UnsignedValue & 0x0000FFFF);

    // 16-20 bytes: read length.
    int readLen = Helper.GetInt32(alignmentBlock, 16);

    // 20-24 bytes: mate reference sequence index (-1 means no reference).
    int mateRefSeqIndex = Helper.GetInt32(alignmentBlock, 20);
    if (mateRefSeqIndex != -1)
    {
        alignedSeq.MRNM = refSeqNames[mateRefSeqIndex];
    }
    else
    {
        alignedSeq.MRNM = "*";
    }

    // 24-28 bytes: mate position.
    alignedSeq.MPos = Helper.GetInt32(alignmentBlock, 24) + 1;

    // 28-32 bytes: insert size.
    alignedSeq.ISize = Helper.GetInt32(alignmentBlock, 28);

    // 32-(32+queryNameLen) bytes: query name; the final byte of the name is not part of the text.
    alignedSeq.QName = System.Text.ASCIIEncoding.ASCII.GetString(alignmentBlock, 32, queryNameLen - 1);

    // CIGAR: cigarLen 32-bit values, operation length in the upper 28 bits and
    // operation code in the lower 4 bits.
    StringBuilder strbuilder = new StringBuilder();
    int startIndex = 32 + queryNameLen;
    for (int i = startIndex; i < (startIndex + cigarLen * 4); i += 4)
    {
        UInt32 cigarValue = Helper.GetUInt32(alignmentBlock, i);
        strbuilder.Append(((cigarValue & 0xFFFFFFF0) >> 4).ToString(CultureInfo.InvariantCulture));

        value = (int)(cigarValue & 0x0000000F);
        if (value >= CigarOperations.Length)
        {
            throw new FileFormatException(Properties.Resource.BAM_InvalidCIGAR);
        }

        strbuilder.Append(CigarOperations[value]);
    }

    string cigar = strbuilder.ToString();
    if (string.IsNullOrWhiteSpace(cigar))
    {
        alignedSeq.CIGAR = "*";
    }
    else
    {
        alignedSeq.CIGAR = cigar;
    }

    // If there is no overlap there is no need to parse further.
    // ZeroBasedRefEnd < start
    // => (alignedSeq.RefEndPos - 1) < start
    if (alignedSeq.RefEndPos - 1 < start && alignedSeq.RName != Properties.Resource.SAM_NO_REFERENCE_DEFINED_INDICATOR)
    {
        return null;
    }

    startIndex += cigarLen * 4;

    // Sequence: two bases per byte, the earlier base in the high nibble.
    // A record without bases (readLen == 0) occupies no sequence or quality bytes, so
    // nothing may be consumed for it; it is reported as "*".
    // NOTE(review): assumes SAMParser.ParseQualityNSequence accepts "*" as the sequence,
    // as it already has to for the quality string - confirm.
    string strSequence = "*";
    string strQualValues = "*";
    if (readLen > 0)
    {
        strbuilder = new StringBuilder(readLen);
        for (int baseIndex = 0; baseIndex < readLen; baseIndex++)
        {
            byte packed = alignmentBlock[startIndex + (baseIndex / 2)];
            value = (baseIndex % 2 == 0) ? (packed & 0xF0) >> 4 : packed & 0x0F;
            strbuilder.Append(GetSeqChar(value));
        }

        startIndex += (readLen + 1) / 2;
        strSequence = strbuilder.ToString();

        // Qualities: one byte per base; a leading 0xFF means no qualities are stored.
        if (alignmentBlock[startIndex] != 0xFF)
        {
            byte[] qualValues = new byte[readLen];
            for (int i = startIndex; i < (startIndex + readLen); i++)
            {
                // Shift the stored score by 33 to obtain the printable character used in SAM text.
                qualValues[i - startIndex] = (byte)(alignmentBlock[i] + 33);
            }

            strQualValues = System.Text.ASCIIEncoding.ASCII.GetString(qualValues);
        }
    }

    SAMParser.ParseQualityNSequence(alignedSeq, Alphabet, strSequence, strQualValues);

    startIndex += readLen;

    // Optional fields fill the remainder of the block.
    if (alignmentBlock.Length > startIndex + 4 && alignmentBlock[startIndex] != 0x0 && alignmentBlock[startIndex + 1] != 0x0)
    {
        for (int index = startIndex; index < alignmentBlock.Length;)
        {
            SAMOptionalField optionalField = new SAMOptionalField();
            optionalField.Tag = System.Text.ASCIIEncoding.ASCII.GetString(alignmentBlock, index, 2);
            index += 2;
            char vType = (char)alignmentBlock[index++];
            string valueType = vType.ToString();

            // SAM format supports [AifZH] for value type.
            // In BAM, an integer may be stored as a signed 8-bit integer (c), unsigned 8-bit integer (C),
            // signed short (s), unsigned short (S), signed 32-bit (i) or unsigned 32-bit integer (I),
            // depending on the signed magnitude of the integer. However, in SAM, all types of integers
            // are presented as type 'i'.
            string message = Helper.IsValidPatternValue("VType", valueType, BAMOptionalFieldRegex);
            if (!string.IsNullOrEmpty(message))
            {
                throw new FormatException(message);
            }

            // GetOptionalValue advances index past the value it reads.
            optionalField.Value = GetOptionalValue(vType, alignmentBlock, ref index).ToString();

            // Convert to SAM format.
            if ("cCsSI".IndexOf(vType) >= 0)
            {
                valueType = "i";
            }

            optionalField.VType = valueType;
            alignedSeq.OptionalFields.Add(optionalField);
        }
    }

    return alignedSeq;
}
/// <summary>
/// Decides whether an aligned sequence passes every filter the user has enabled
/// (required flags, excluded flags, minimum mapping quality, library, read group, region).
/// A filter whose option is 0 or empty is not applied.
/// </summary>
/// <param name="alignedSequence">Aligned Sequence.</param>
/// <returns>True when the aligned sequence matches all user defined options; otherwise false.</returns>
private bool Filter(SAMAlignedSequence alignedSequence)
{
    // Every bit of FlagRequired must be set on the read.
    if (FlagRequired != 0 && (((int)alignedSequence.Flag) & FlagRequired) != FlagRequired)
    {
        return false;
    }

    // No bit of FilteringFlag may be set on the read.
    if (FilteringFlag != 0 && (((int)alignedSequence.Flag) & FilteringFlag) != 0)
    {
        return false;
    }

    // The option is a minimum: keep reads at or above it, not only exact matches.
    if (QualityMinimumMapping != 0 && alignedSequence.MapQ < QualityMinimumMapping)
    {
        return false;
    }

    if (!string.IsNullOrEmpty(Library))
    {
        // Resolve read -> RG tag -> header read-group record with that ID -> its LB tag.
        // Any missing link means the read cannot belong to the requested library,
        // so it is filtered out instead of raising InvalidOperationException.
        var readGroupId = alignedSequence.OptionalFields
            .Where(c => c.Tag.Equals("RG"))
            .Select(c => c.Value)
            .FirstOrDefault();
        if (readGroupId == null)
        {
            return false;
        }

        var library = rgRecFields
            .Where(a => readGroupId.Equals(
                a.Tags.Where(b => b.Tag.Equals("ID")).Select(b => b.Value).FirstOrDefault()))
            .Select(a => a.Tags.Where(d => d.Tag.Equals("LB")).Select(d => d.Value).FirstOrDefault())
            .FirstOrDefault();
        if (!Library.Equals(library))
        {
            return false;
        }
    }

    if (!string.IsNullOrEmpty(ReadGroup))
    {
        // The tag name is matched case-insensitively, the read-group value exactly.
        bool inReadGroup = alignedSequence.OptionalFields.Any(
            o => o.Tag.Equals("RG", StringComparison.OrdinalIgnoreCase) && o.Value.Equals(ReadGroup));
        if (!inReadGroup)
        {
            return false;
        }
    }

    if (!string.IsNullOrEmpty(Region))
    {
        if (!alignedSequence.RName.Equals(region.Chromosome))
        {
            return false;
        }

        // A negative Start disables the positional check altogether (End is then ignored too);
        // a negative End leaves the range open-ended.
        if (region.Start > -1)
        {
            if (alignedSequence.Pos < region.Start)
            {
                return false;
            }

            if (region.End > -1 && alignedSequence.Pos > region.End)
            {
                return false;
            }
        }
    }

    return true;
}
/// <summary>
/// Merges the already sorted reads of several SAM objects and writes them, in order of
/// reference name and then position, to the temporary stream.
/// </summary>
/// <param name="sortedIndexes">Sorted Indexes of SAM object.</param>
/// <param name="fstemp">Temporary stream to write alignments.</param>
/// <param name="formatter">Format aligned sequences in BAM format.</param>
/// <param name="sequenceAlignmentMaps">List of SAM objects to be merged.</param>
private void WriteMergeFile(IList<IList<BAMSortedIndex>> sortedIndexes, FileStream fstemp, BAMFormatter formatter, IList<SequenceAlignmentMap> sequenceAlignmentMaps)
{
    // Next unwritten read of every input; null marks an input with nothing left.
    List<SAMAlignedSequence> heads = new List<SAMAlignedSequence>();

    // For every input, which of its sorted indexes is currently being consumed.
    int[] cursor = new int[sequenceAlignmentMaps.Count];

    for (int source = 0; source < sortedIndexes.Count; source++)
    {
        BAMSortedIndex first = sortedIndexes[source].ElementAt(cursor[source]);
        SAMAlignedSequence head = null;
        if (first != null && first.MoveNext())
        {
            head = sequenceAlignmentMaps[source].QuerySequences[first.Current];
        }

        heads.Add(head);
    }

    // The winner of the previous round is kept as the initial candidate of the next one
    // unless that input ran dry.
    int winner = -1;
    do
    {
        for (int candidate = 0; candidate < heads.Count; candidate++)
        {
            if (heads[candidate] == null)
            {
                continue;
            }

            if (winner == -1)
            {
                winner = candidate;
                continue;
            }

            int nameOrder = string.Compare(heads[winner].RName, heads[candidate].RName, StringComparison.OrdinalIgnoreCase);
            if (nameOrder > 0)
            {
                // The candidate's reference name sorts first.
                winner = candidate;
            }
            else if (heads[winner].RName.Equals(heads[candidate].RName) && heads[winner].Pos > heads[candidate].Pos)
            {
                // Same reference: the lower position goes first.
                winner = candidate;
            }
        }

        if (winner > -1)
        {
            SAMAlignedSequence pending = heads[winner];
            IList<BAMSortedIndex> chunks = sortedIndexes[winner];

            if (chunks.ElementAt(cursor[winner]).MoveNext())
            {
                int nextRead = chunks.ElementAt(cursor[winner]).Current;
                heads[winner] = sequenceAlignmentMaps[winner].QuerySequences[nextRead];
            }
            else
            {
                // Current sorted index is used up; try the following one of the same input.
                cursor[winner]++;
                bool refilled = cursor[winner] < chunks.Count && chunks.ElementAt(cursor[winner]).MoveNext();
                if (refilled)
                {
                    int nextRead = chunks.ElementAt(cursor[winner]).Current;
                    heads[winner] = sequenceAlignmentMaps[winner].QuerySequences[nextRead];
                }
                else
                {
                    heads[winner] = null;
                    winner = -1;
                }
            }

            formatter.WriteAlignedSequence(_header, pending, fstemp);
        }
    }
    while (heads.Any(head => head != null));
}
/// <summary>
/// Packs the query sequence of an alignment into the 4-bit-per-base form used by BAM.
/// </summary>
/// <param name="alignedSeq">Alignment whose QuerySequence is encoded.</param>
/// <returns>Two bases per byte, the earlier base in the high-order 4 bits; an empty array
/// when the alignment has no query sequence.</returns>
private static byte[] GetEncodedSequence(SAMAlignedSequence alignedSeq)
{
    // 4-bit encoded read: a symbol's position in this string (0-15) is its code.
    const string BamSymbols = "=ACMGRSVTWYHKDBN";

    // Note:
    // Every symbol the BAM specification does not support (anything outside the string above)
    // is written as 'N'; e.g. a '.' symbol, which the SAM specification allows, becomes 'N'.
    const int CodeForN = 15;

    ISequence seq = alignedSeq.QuerySequence;
    if (seq == null)
    {
        return new byte[0];
    }

    if (!(seq.Alphabet is DnaAlphabet))
    {
        throw new ArgumentException(Properties.Resource.BAMFormatterSupportsDNAOnly);
    }

    byte[] symbolMap = seq.Alphabet.GetSymbolValueMap();
    byte[] packed = new byte[(seq.Count + 1) / 2];

    for (int i = 0; i < seq.Count; i++)
    {
        int code = BamSymbols.IndexOf((char)symbolMap[seq[i]]);
        if (code < 0)
        {
            code = CodeForN;
        }

        if (i % 2 == 0)
        {
            // Even offsets open a new byte and occupy its high nibble.
            packed[i / 2] = (byte)(code << 4);
        }
        else
        {
            // Odd offsets complete the byte via its low nibble.
            packed[i / 2] = (byte)(packed[i / 2] | code);
        }
    }

    return packed;
}
/// <summary>
/// Walks the CIGAR string of an aligned read and emits one entry per aligned, inserted
/// or deleted base, each tagged with its reference position.
/// </summary>
/// <param name="seq">Aligned read; its QuerySequence must be a QualitativeSequence.</param>
/// <returns>The emitted bases in CIGAR order. Soft and hard clipped bases are not emitted.</returns>
/// <exception cref="ArgumentNullException"><paramref name="seq"/> is null.</exception>
/// <exception cref="ArgumentException">The query sequence is missing or carries no quality scores.</exception>
/// <exception cref="NotSupportedException">The CIGAR contains a P or N operation.</exception>
/// <exception cref="FormatException">The CIGAR contains an unknown operation or a missing length.</exception>
List<BaseAndQualityAndPosition> getBasesForSequence(SAMAlignedSequence seq)
{
    if (seq == null)
    {
        throw new ArgumentNullException("seq");
    }

    // Fail with a descriptive error instead of a NullReferenceException further down.
    var qseq = seq.QuerySequence as QualitativeSequence;
    if (qseq == null)
    {
        throw new ArgumentException("Pile up requires an aligned read whose QuerySequence is a QualitativeSequence.", "seq");
    }

    // Capacity is only a hint; clamp it so an inconsistent record cannot make it negative.
    List<BaseAndQualityAndPosition> toReturn = new List<BaseAndQualityAndPosition>(Math.Max(0, seq.RefEndPos - seq.Pos + 10));

    // Get sequence bases and error probabilities
    var seq_log10ErrorProb = qseq.GetPhredQualityScores().Select(Utils.GetLog10ErrorProbability).ToArray();
    var seq_bases = qseq.ToArray();

    // Use the cigar operations to emit bases.
    // TODO: This CIGAR decoding is duplicated in many places
    string CIGAR = seq.CIGAR;
    int curRef = seq.Pos;
    int curQuery = 0;
    int lengthStart = 0; // index of the first digit of the operation currently being read
    for (int i = 0; i < CIGAR.Length; i++)
    {
        char ch = CIGAR[i];
        if (Char.IsDigit(ch))
        {
            continue;
        }

        // The digits between the previous operation and this one are the operation length.
        int cig_len = int.Parse(CIGAR.Substring(lengthStart, i - lengthStart));
        lengthStart = i + 1;

        // Emit or advance based on cigar operation.
        switch (ch)
        {
            case 'P': // padding (silent deletions from padded reference)
            case 'N': // skipped region from reference
                throw new NotSupportedException("Pile up methods not built to handle reference clipping (Cigar P or N) yet.");
            case 'M': // match or mismatch
            case '=': // match
            case 'X': // mismatch
                for (int k = 0; k < cig_len; k++)
                {
                    var bqp = new BaseAndQualityAndPosition(curRef, 0, new BaseAndQuality(seq_bases[curQuery], seq_log10ErrorProb[curQuery]));
                    toReturn.Add(bqp);
                    curQuery++;
                    curRef++;
                }
                break;
            case 'I': // insertion to the reference: consumes query bases only
                for (int k = 0; k < cig_len; k++)
                {
                    var bqp = new BaseAndQualityAndPosition(curRef, k, new BaseAndQuality(seq_bases[curQuery], seq_log10ErrorProb[curQuery]));
                    toReturn.Add(bqp);
                    curQuery++;
                }
                break;
            case 'D': // deletion from the reference: consumes reference positions only
                // NOTE(review): the loop counter is passed as the second argument here just as
                // for insertions, although curRef also advances - confirm this is intended.
                for (int k = 0; k < cig_len; k++)
                {
                    var bqp = new BaseAndQualityAndPosition(curRef, k, new BaseAndQuality((byte)'-', Double.NaN));
                    toReturn.Add(bqp);
                    curRef++;
                }
                break;
            case 'S': // soft clipped: bases are present in the query but skipped
                curQuery += cig_len;
                break;
            case 'H': // hard clipped: nothing to skip
                break;
            default:
                throw new FormatException("Unexpected SAM Cigar element found " + ch.ToString());
        }
    }

    return toReturn;
}