/// <summary>
/// Serializes the given BAM index to the underlying index stream.
/// </summary>
/// <remarks>
/// Output order: the magic bytes 66, 65, 73, 1 ("BAI\1"), the number of references, and
/// then for every reference its bins (bin number, chunk count, start/end offset of each
/// chunk) followed by its linear offsets. Counts and bin numbers are written as 4 bytes,
/// file offsets as 8 bytes. The stream is flushed after each reference and once at the end.
/// </remarks>
/// <param name="bamIndex">BAMIndex instance to write.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="bamIndex"/> is null.</exception>
/// <exception cref="InvalidOperationException">Thrown when the underlying stream is null.</exception>
public void Write(BAMIndex bamIndex)
{
    if (bamIndex == null)
    {
        throw new ArgumentNullException("bamIndex");
    }

    if (sourceStream == null)
    {
        throw new InvalidOperationException(Properties.Resource.BAM_CantUseBAMIndexStreamDisposed);
    }

    // File header: magic number followed by the reference count.
    byte[] magicNumber = { 66, 65, 73, 1 };
    Write(magicNumber, 0, 4);
    Write(Helper.GetLittleEndianByteArray(bamIndex.RefIndexes.Count), 0, 4);

    foreach (BAMReferenceIndexes reference in bamIndex.RefIndexes)
    {
        // Binning index of this reference.
        Write(Helper.GetLittleEndianByteArray(reference.Bins.Count), 0, 4);
        foreach (Bin currentBin in reference.Bins)
        {
            Write(Helper.GetLittleEndianByteArray(currentBin.BinNumber), 0, 4);
            Write(Helper.GetLittleEndianByteArray(currentBin.Chunks.Count), 0, 4);

            foreach (Chunk currentChunk in currentBin.Chunks)
            {
                Write(GetBAMOffsetArray(currentChunk.ChunkStart), 0, 8);
                Write(GetBAMOffsetArray(currentChunk.ChunkEnd), 0, 8);
            }
        }

        // Linear index of this reference.
        Write(Helper.GetLittleEndianByteArray(reference.LinearOffsets.Count), 0, 4);
        foreach (FileOffset linearOffset in reference.LinearOffsets)
        {
            Write(GetBAMOffsetArray(linearOffset), 0, 8);
        }

        sourceStream.Flush();
    }

    sourceStream.Flush();
}
/// <summary>
/// Builds a BAMIndex from the given compressed BAM stream and writes it to the given index storage.
/// </summary>
/// <param name="compressedBAMStream">Stream holding the compressed BAM data the index is built from.</param>
/// <param name="indexStorage">Index storage that receives the generated BAMIndex.</param>
private static void CreateBAMIndexFile(Stream compressedBAMStream, BAMIndexStorage indexStorage)
{
    BAMParser bamParser = new BAMParser();
    indexStorage.Write(bamParser.GetIndexFromBAMStorage(compressedBAMStream));
}
/// <summary>
/// Lazily enumerates the sequences of a BAM file that belong to the given reference sequence
/// and lie inside the requested range, using the BAM index file that accompanies the BAM file.
/// </summary>
/// <remarks>
/// This is an iterator method: nothing (including argument validation and file access)
/// happens until the returned sequence is enumerated. The BAM file and its index file stay
/// open while the enumeration is in progress and are closed when it completes or is disposed.
/// </remarks>
/// <param name="fileName">Path of the BAM file. The index file name is derived from it.</param>
/// <param name="refSeqName">Name of the reference sequence to read alignments for.</param>
/// <param name="start">Range start; a sequence is kept when its Pos is at least <c>start - 1</c>.</param>
/// <param name="end">Range end; a sequence is kept when its RefEndPos is less than this value.</param>
/// <returns>
/// The non-null sequences found in the index chunks for the range whose RName is "*" or
/// whose position satisfies the range check.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="refSeqName"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when <paramref name="refSeqName"/> is not among the reference sequence names.</exception>
/// <exception cref="FileFormatException">Thrown when the BAM file is empty.</exception>
public IEnumerable<CompactSAMSequence> ParseRangeAsEnumerableSequences(string fileName, string refSeqName, int start = 0, int end = Int32.MaxValue)
{
    if (refSeqName == null)
    {
        throw new ArgumentNullException("refSeqName");
    }

    using (FileStream bamStream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        string bamIndexFileName = getBAMIndexFileName(fileName);
        using (BAMIndexFile bamIndexFile = new BAMIndexFile(bamIndexFileName, FileMode.Open, FileAccess.Read))
        {
            readStream = bamStream;
            try
            {
                if (readStream == null || readStream.Length == 0)
                {
                    throw new FileFormatException(Properties.Resource.BAM_InvalidBAMFile);
                }

                ValidateReader();

                // The returned header is not needed here; the call is kept because the
                // original code performed it before looking up refSeqNames.
                GetHeader();

                // verify whether there is any reads related to chromosome.
                int refSeqIndex = refSeqNames.IndexOf(refSeqName);
                if (refSeqIndex < 0)
                {
                    string message = string.Format(CultureInfo.InvariantCulture, Properties.Resource.BAM_RefSeqNotFound, refSeqName);
                    throw new ArgumentException(message, "refSeqName");
                }

                BAMIndex bamIndexInfo = bamIndexFile.Read();
                BAMReferenceIndexes refIndex = bamIndexInfo.RefIndexes[refSeqIndex];
                IList<Chunk> chunks = GetChunks(refIndex, start, end);

                foreach (var s in EnumerateAlignedSequences(chunks))
                {
                    if (s != null && (s.RName == "*" || (s.Pos >= (start - 1) && s.RefEndPos < end)))
                    {
                        yield return s;
                    }
                }
            }
            finally
            {
                // Always detach the stream: the enclosing using block disposes it, and this
                // also runs when the caller stops enumerating early or an exception escapes,
                // so the field never keeps pointing at a disposed stream.
                readStream = null;
            }
        }
    }
}
/// <summary>
/// Serializes the given BAM index, including the optional per-reference metadata bin,
/// to the underlying index stream.
/// </summary>
/// <remarks>
/// Output order: the magic bytes 66, 65, 73, 1 ("BAI\1"), the number of references, and
/// then for every reference its bins, an optional metadata pseudo-bin, and its linear index.
/// The stream is flushed after each reference and once more at the end.
/// </remarks>
/// <param name="bamIndex">BAMIndex instance to write.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="bamIndex"/> is null.</exception>
/// <exception cref="InvalidOperationException">Thrown when the underlying stream is null.</exception>
public void Write(BAMIndex bamIndex)
{
    if (bamIndex == null)
    {
        throw new ArgumentNullException("bamIndex");
    }

    if (sourceStream == null)
    {
        throw new InvalidOperationException(Properties.Resource.BAM_CantUseBAMIndexStreamDisposed);
    }

    byte[] magicNumber = { 66, 65, 73, 1 };
    Write(magicNumber, 0, 4);
    Write(Helper.GetLittleEndianByteArray(bamIndex.RefIndexes.Count), 0, 4);

    foreach (BAMReferenceIndexes reference in bamIndex.RefIndexes)
    {
        // TODO (from the original code): the metadata block assumes little endian and is
        // only added on little-endian hosts, because the read counts are serialized with
        // BitConverter.GetBytes.
        bool emitMetaData = reference.HasMetaData && BitConverter.IsLittleEndian;

        // The metadata pseudo-bin counts as one extra bin in the declared bin count.
        int declaredBinCount = emitMetaData ? reference.Bins.Count + 1 : reference.Bins.Count;
        Write(Helper.GetLittleEndianByteArray(declaredBinCount), 0, 4);

        // Regular bins with their chunks.
        foreach (Bin currentBin in reference.Bins)
        {
            Write(Helper.GetLittleEndianByteArray(currentBin.BinNumber), 0, 4);
            Write(Helper.GetLittleEndianByteArray(currentBin.Chunks.Count), 0, 4);

            foreach (Chunk currentChunk in currentBin.Chunks)
            {
                Write(GetBAMOffsetArray(currentChunk.ChunkStart), 0, 8);
                Write(GetBAMOffsetArray(currentChunk.ChunkEnd), 0, 8);
            }
        }

        // Metadata varies by implementation; .NET Bio writes the first/last offsets seen
        // and then the mapped/unmapped read counts.
        if (emitMetaData)
        {
            // Dummy bin number that marks the metadata block.
            Write(Helper.GetLittleEndianByteArray(BAMIndexFile.MAX_BINS), 0, 4);

            // Two chunks worth of metadata: first the file offsets, then the counts.
            Write(Helper.GetLittleEndianByteArray((int)2), 0, 4);
            Write(GetBAMOffsetArray(reference.FirstOffSetSeen), 0, 8);
            Write(GetBAMOffsetArray(reference.LastOffSetSeen), 0, 8);
            Write(BitConverter.GetBytes(reference.MappedReadsCount), 0, 8);
            Write(BitConverter.GetBytes(reference.UnMappedReadsCount), 0, 8);
        }

        // Linear index of this reference.
        Write(Helper.GetLittleEndianByteArray(reference.LinearIndex.Count), 0, 4);
        foreach (FileOffset linearOffset in reference.LinearIndex)
        {
            Write(GetBAMOffsetArray(linearOffset), 0, 8);
        }

        sourceStream.Flush();
    }

    sourceStream.Flush();
}
/// <summary>
/// Returns BAMIndex instance by parsing BAM index source.
/// </summary>
/// <remarks>
/// Expects the magic bytes 66, 65, 73, 1 ("BAI\1"), then the reference count, and for each
/// reference its bins followed by its linear index. A bin whose number equals MAX_BINS is
/// treated as metadata and is not added to the reference's bin list.
/// </remarks>
/// <returns>The parsed BAMIndex with one BAMReferenceIndexes entry per reference.</returns>
/// <exception cref="InvalidOperationException">Thrown when the underlying stream is null.</exception>
/// <exception cref="FormatException">Thrown when the magic bytes do not match.</exception>
/// <exception cref="FileFormatException">Thrown when a bin number is greater than MAX_BINS.</exception>
public BAMIndex Read()
{
    if (sourceStream == null)
    {
        throw new InvalidOperationException(Properties.Resource.BAM_CantUseBAMIndexStreamDisposed);
    }

    BAMIndex bamIndex = new BAMIndex();
    byte[] arrays = new byte[20];

    Read(arrays, 0, 4);
    if (arrays[0] != 66 || arrays[1] != 65 || arrays[2] != 73 || arrays[3] != 1)
    {
        throw new FormatException(Properties.Resource.BAM_InvalidIndexFile);
    }

    Read(arrays, 0, 4);
    int n_ref = Helper.GetInt32(arrays, 0);
    for (Int32 refindex = 0; refindex < n_ref; refindex++)
    {
        BAMReferenceIndexes bamindices = new BAMReferenceIndexes();
        bamIndex.RefIndexes.Add(bamindices);

        Read(arrays, 0, 4);
        int n_bin = Helper.GetInt32(arrays, 0);
        for (Int32 binIndex = 0; binIndex < n_bin; binIndex++)
        {
            Bin bin = new Bin();
            Read(arrays, 0, 4);
            bin.BinNumber = Helper.GetUInt32(arrays, 0);
            Read(arrays, 0, 4);
            int n_chunk = Helper.GetInt32(arrays, 0);

            if (bin.BinNumber == MAX_BINS) // some groups use this to place meta-data, such as the picard toolkit and now SAMTools
            {
                // Meta data was later added in to the SAMTools specification
                for (Int32 chunkIndex = 0; chunkIndex < n_chunk; chunkIndex++)
                {
                    bamindices.HasMetaData = true;
                    if (n_chunk == 2 && chunkIndex == 0)
                    {
                        // Write emits two metadata chunks: first the first/last file
                        // offsets, then the read counts. Decode the first chunk as offsets
                        // so a Read/Write round trip keeps them instead of discarding them.
                        Read(arrays, 0, 8);
                        bamindices.FirstOffSetSeen = GetBAMOffset(arrays, 0);
                        Read(arrays, 0, 8);
                        bamindices.LastOffSetSeen = GetBAMOffset(arrays, 0);
                    }
                    else
                    {
                        // Any other layout keeps the previous behavior: every chunk is
                        // read as (mapped, unmapped) counts and the last one wins.
                        Read(arrays, 0, 8);
                        bamindices.MappedReadsCount = Helper.GetUInt64(arrays, 0);
                        Read(arrays, 0, 8);
                        bamindices.UnMappedReadsCount = Helper.GetUInt64(arrays, 0);
                    }
                }
            }
            else if (bin.BinNumber > MAX_BINS)
            {
                throw new FileFormatException("BAM Index is incorrectly formatted. Bin number specified is higher than the maximum allowed.");
            }
            else
            {
                bamindices.Bins.Add(bin);
                for (Int32 chunkIndex = 0; chunkIndex < n_chunk; chunkIndex++)
                {
                    Chunk chunk = new Chunk();
                    bin.Chunks.Add(chunk);
                    Read(arrays, 0, 8);
                    chunk.ChunkStart = GetBAMOffset(arrays, 0);
                    Read(arrays, 0, 8);
                    chunk.ChunkEnd = GetBAMOffset(arrays, 0);
                }
            }
        }

        // Get number of linear bins
        Read(arrays, 0, 4);
        int n_intv = Helper.GetInt32(arrays, 0);
        for (Int32 offsetIndex = 0; offsetIndex < n_intv; offsetIndex++)
        {
            FileOffset value;
            Read(arrays, 0, 8);
            value = GetBAMOffset(arrays, 0);
            bamindices.LinearIndex.Add(value);
        }
    }

    return bamIndex;
}
/// <summary>
/// Serializes the given BAM index, including the optional per-reference metadata bin,
/// to the underlying index source.
/// </summary>
/// <remarks>
/// Output order: the magic bytes 66, 65, 73, 1 ("BAI\1"), the number of references, and
/// then for every reference its bins, an optional metadata pseudo-bin, and its linear index.
/// The source is flushed after each reference has been written.
/// </remarks>
/// <param name="bamIndex">BAMIndex instance to write.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="bamIndex"/> is null.</exception>
/// <exception cref="InvalidOperationException">Thrown when the underlying source is null.</exception>
public void Write(BAMIndex bamIndex)
{
    if (bamIndex == null)
    {
        throw new ArgumentNullException("bamIndex");
    }

    if (Source == null)
    {
        throw new InvalidOperationException(Properties.Resource.BAM_CantUseBAMIndexStreamDisposed);
    }

    byte[] magicNumber = { 66, 65, 73, 1 };
    Write(magicNumber, 0, 4);
    Write(Helper.GetLittleEndianByteArray(bamIndex.RefIndexes.Count), 0, 4);

    foreach (BAMReferenceIndexes reference in bamIndex.RefIndexes)
    {
        // TODO (from the original code): the metadata block assumes little endian and is
        // only added on little-endian hosts, because the read counts are serialized with
        // BitConverter.GetBytes.
        bool emitMetaData = reference.HasMetaData && BitConverter.IsLittleEndian;

        // The metadata pseudo-bin counts as one extra bin in the declared bin count.
        int declaredBinCount = emitMetaData ? reference.Bins.Count + 1 : reference.Bins.Count;
        Write(Helper.GetLittleEndianByteArray(declaredBinCount), 0, 4);

        // Regular bins with their chunks.
        foreach (Bin currentBin in reference.Bins)
        {
            Write(Helper.GetLittleEndianByteArray(currentBin.BinNumber), 0, 4);
            Write(Helper.GetLittleEndianByteArray(currentBin.Chunks.Count), 0, 4);

            foreach (Chunk currentChunk in currentBin.Chunks)
            {
                Write(GetBAMOffsetArray(currentChunk.ChunkStart), 0, 8);
                Write(GetBAMOffsetArray(currentChunk.ChunkEnd), 0, 8);
            }
        }

        // Metadata varies by implementation; .NET Bio writes the first/last offsets seen
        // and then the mapped/unmapped read counts.
        if (emitMetaData)
        {
            // Dummy bin number that marks the metadata block.
            Write(Helper.GetLittleEndianByteArray(BAMIndexStorage.MaxBins), 0, 4);

            // Two chunks worth of metadata: first the file offsets, then the counts.
            Write(Helper.GetLittleEndianByteArray((int)2), 0, 4);
            Write(GetBAMOffsetArray(reference.FirstOffSetSeen), 0, 8);
            Write(GetBAMOffsetArray(reference.LastOffSetSeen), 0, 8);
            Write(BitConverter.GetBytes(reference.MappedReadsCount), 0, 8);
            Write(BitConverter.GetBytes(reference.UnMappedReadsCount), 0, 8);
        }

        // Linear index of this reference.
        Write(Helper.GetLittleEndianByteArray(reference.LinearIndex.Count), 0, 4);
        foreach (FileOffset linearOffset in reference.LinearIndex)
        {
            Write(GetBAMOffsetArray(linearOffset), 0, 8);
        }

        Source.Flush();
    }
}
/// <summary>
/// Parses the BAM index source and returns its contents as a BAMIndex.
/// </summary>
/// <remarks>
/// Expects the magic bytes 66, 65, 73, 1 ("BAI\1"), then the reference count, and for each
/// reference its bins (bin number, chunk count, start/end offset of each chunk) followed
/// by its linear offsets.
/// </remarks>
/// <returns>The parsed BAMIndex with one BAMReferenceIndexes entry per reference.</returns>
/// <exception cref="InvalidOperationException">Thrown when the underlying stream is null.</exception>
/// <exception cref="FormatException">Thrown when the magic bytes do not match.</exception>
public BAMIndex Read()
{
    if (sourceStream == null)
    {
        throw new InvalidOperationException(Properties.Resource.BAM_CantUseBAMIndexStreamDisposed);
    }

    BAMIndex result = new BAMIndex();
    byte[] buffer = new byte[20];

    // Header: magic number.
    Read(buffer, 0, 4);
    bool magicMatches = buffer[0] == 66 && buffer[1] == 65 && buffer[2] == 73 && buffer[3] == 1;
    if (!magicMatches)
    {
        throw new FormatException(Properties.Resource.BAM_InvalidIndexFile);
    }

    // Header: number of references.
    Read(buffer, 0, 4);
    int referenceCount = Helper.GetInt32(buffer, 0);

    for (int remainingRefs = referenceCount; remainingRefs > 0; remainingRefs--)
    {
        BAMReferenceIndexes reference = new BAMReferenceIndexes();
        result.RefIndexes.Add(reference);

        // Binning index.
        Read(buffer, 0, 4);
        int binCount = Helper.GetInt32(buffer, 0);
        for (int remainingBins = binCount; remainingBins > 0; remainingBins--)
        {
            Bin currentBin = new Bin();
            reference.Bins.Add(currentBin);

            Read(buffer, 0, 4);
            currentBin.BinNumber = Helper.GetUInt32(buffer, 0);

            Read(buffer, 0, 4);
            int chunkCount = Helper.GetInt32(buffer, 0);
            for (int remainingChunks = chunkCount; remainingChunks > 0; remainingChunks--)
            {
                Chunk currentChunk = new Chunk();
                currentBin.Chunks.Add(currentChunk);

                Read(buffer, 0, 8);
                currentChunk.ChunkStart = GetBAMOffset(buffer, 0);

                Read(buffer, 0, 8);
                currentChunk.ChunkEnd = GetBAMOffset(buffer, 0);
            }
        }

        // Linear index.
        Read(buffer, 0, 4);
        int intervalCount = Helper.GetInt32(buffer, 0);
        for (int remainingIntervals = intervalCount; remainingIntervals > 0; remainingIntervals--)
        {
            Read(buffer, 0, 8);
            FileOffset linearOffset = GetBAMOffset(buffer, 0);
            reference.LinearOffsets.Add(linearOffset);
        }
    }

    return result;
}
/// <summary>
/// Returns BAMIndex instance by parsing BAM index source.
/// </summary>
/// <remarks>
/// Expects the magic bytes 66, 65, 73, 1 ("BAI\1"), then the reference count, and for each
/// reference its bins followed by its linear index. A bin whose number equals MaxBins is
/// treated as metadata and is not added to the reference's bin list.
/// </remarks>
/// <returns>The parsed BAMIndex with one BAMReferenceIndexes entry per reference.</returns>
/// <exception cref="InvalidOperationException">Thrown when the underlying source is null.</exception>
/// <exception cref="FormatException">Thrown when the magic bytes do not match.</exception>
/// <exception cref="Exception">Thrown when a bin number is greater than MaxBins.</exception>
public BAMIndex Read()
{
    if (Source == null)
    {
        throw new InvalidOperationException(Properties.Resource.BAM_CantUseBAMIndexStreamDisposed);
    }

    BAMIndex bamIndex = new BAMIndex();
    byte[] arrays = new byte[20];

    Read(arrays, 0, 4);
    if (arrays[0] != 66 || arrays[1] != 65 || arrays[2] != 73 || arrays[3] != 1)
    {
        throw new FormatException(Properties.Resource.BAM_InvalidIndexFile);
    }

    Read(arrays, 0, 4);
    int n_ref = Helper.GetInt32(arrays, 0);
    for (Int32 refindex = 0; refindex < n_ref; refindex++)
    {
        BAMReferenceIndexes bamindices = new BAMReferenceIndexes();
        bamIndex.RefIndexes.Add(bamindices);

        Read(arrays, 0, 4);
        int n_bin = Helper.GetInt32(arrays, 0);
        for (Int32 binIndex = 0; binIndex < n_bin; binIndex++)
        {
            Bin bin = new Bin();
            Read(arrays, 0, 4);
            bin.BinNumber = Helper.GetUInt32(arrays, 0);
            Read(arrays, 0, 4);
            int n_chunk = Helper.GetInt32(arrays, 0);

            if (bin.BinNumber == MaxBins) // some groups use this to place meta-data, such as the picard toolkit and now SAMTools
            {
                // Meta data was later added in to the SAMTools specification
                for (Int32 chunkIndex = 0; chunkIndex < n_chunk; chunkIndex++)
                {
                    bamindices.HasMetaData = true;
                    if (n_chunk == 2 && chunkIndex == 0)
                    {
                        // Write emits two metadata chunks: first the first/last file
                        // offsets, then the read counts. Decode the first chunk as offsets
                        // so a Read/Write round trip keeps them instead of discarding them.
                        Read(arrays, 0, 8);
                        bamindices.FirstOffSetSeen = GetBAMOffset(arrays, 0);
                        Read(arrays, 0, 8);
                        bamindices.LastOffSetSeen = GetBAMOffset(arrays, 0);
                    }
                    else
                    {
                        // Any other layout keeps the previous behavior: every chunk is
                        // read as (mapped, unmapped) counts and the last one wins.
                        Read(arrays, 0, 8);
                        bamindices.MappedReadsCount = Helper.GetUInt64(arrays, 0);
                        Read(arrays, 0, 8);
                        bamindices.UnMappedReadsCount = Helper.GetUInt64(arrays, 0);
                    }
                }
            }
            else if (bin.BinNumber > MaxBins)
            {
                throw new Exception("BAM Index is incorrectly formatted. Bin number specified is higher than the maximum allowed.");
            }
            else
            {
                bamindices.Bins.Add(bin);
                for (Int32 chunkIndex = 0; chunkIndex < n_chunk; chunkIndex++)
                {
                    Chunk chunk = new Chunk();
                    bin.Chunks.Add(chunk);
                    Read(arrays, 0, 8);
                    chunk.ChunkStart = GetBAMOffset(arrays, 0);
                    Read(arrays, 0, 8);
                    chunk.ChunkEnd = GetBAMOffset(arrays, 0);
                }
            }
        }

        // Get number of linear bins
        Read(arrays, 0, 4);
        int n_intv = Helper.GetInt32(arrays, 0);
        for (Int32 offsetIndex = 0; offsetIndex < n_intv; offsetIndex++)
        {
            FileOffset value;
            Read(arrays, 0, 8);
            value = GetBAMOffset(arrays, 0);
            bamindices.LinearIndex.Add(value);
        }
    }

    return bamIndex;
}
// Refactored to remove this block from GetAlignmentMap()
/// <summary>
/// Records the given aligned sequence in the BAM index being built (bin chunks and linear
/// offsets) when index creation is enabled, and returns the sequence unchanged.
/// </summary>
/// <param name="alignedSeq">Sequence that was just read; may be null, in which case nothing is indexed.</param>
/// <param name="refIndices">
/// Reference indexes to update. Replaced locally by <c>index.RefIndexes[n]</c> when the
/// sequence's reference index n is not 0; the caller's variable is not affected.
/// </param>
/// <param name="index">BAM index that owns the per-reference indexes.</param>
/// <param name="lastcOffset">Compressed block offset captured by the caller before the sequence was read.</param>
/// <param name="lastuOffset">Uncompressed block offset captured by the caller before the sequence was read.</param>
/// <param name="lastChunk">Most recently created chunk; its end is closed and it is replaced by the new chunk.</param>
/// <returns>The <paramref name="alignedSeq"/> instance that was passed in.</returns>
private SAMAlignedSequence BamIndexing(SAMAlignedSequence alignedSeq, BAMReferenceIndexes refIndices, BAMIndex index, ulong lastcOffset, ushort lastuOffset, ref Chunk lastChunk)
{
    // Nothing to do when indexing is off. A null sequence (callers check for it after this
    // call) cannot be indexed either and previously caused a NullReferenceException here.
    if (!createBamIndex || alignedSeq == null)
    {
        return alignedSeq;
    }

    // NOTE(review): lastBin and lastRefSeqIndex are locals that are re-initialised on every
    // call, so no bin/reference state survives between sequences and a new chunk is opened
    // for practically every sequence. Looks like a leftover of the refactoring; carrying
    // the state across calls would need fields or extra ref parameters. Confirm intent.
    int lastBin = int.MaxValue;
    int lastRefSeqIndex = 0;

    // NOTE(review): IndexOf yields -1 for names missing from refSeqNames (e.g. presumably
    // "*" for unmapped reads), which makes the RefIndexes lookup below throw. Verify how
    // such reads should be handled.
    int curRefSeqIndex = refSeqNames.IndexOf(alignedSeq.RName);
    if (lastRefSeqIndex != curRefSeqIndex)
    {
        refIndices = index.RefIndexes[curRefSeqIndex];
        lastBin = int.MaxValue;
        lastRefSeqIndex = curRefSeqIndex;
    }

    if (lastBin != alignedSeq.Bin)
    {
        // Reuse the bin when it already exists for this reference, otherwise create it.
        Bin bin = refIndices.Bins.FirstOrDefault(B => B.BinNumber == alignedSeq.Bin);
        if (bin == null)
        {
            bin = new Bin();
            bin.BinNumber = (uint)alignedSeq.Bin;
            refIndices.Bins.Add(bin);
        }

        // Close the previous chunk at the offsets where this sequence starts.
        if (lastChunk != null)
        {
            lastChunk.ChunkEnd.CompressedBlockOffset = lastcOffset;
            lastChunk.ChunkEnd.UncompressedBlockOffset = lastuOffset;
        }

        // Open a new chunk starting at this sequence; its end is filled in later.
        Chunk chunk = new Chunk();
        chunk.ChunkStart = new FileOffset();
        chunk.ChunkEnd = new FileOffset();
        chunk.ChunkStart.CompressedBlockOffset = lastcOffset;
        chunk.ChunkStart.UncompressedBlockOffset = lastuOffset;
        bin.Chunks.Add(chunk);

        lastChunk = chunk;
        lastBin = alignedSeq.Bin;
    }

    // store linear index other than 16k bins, that is bin number less than 4681.
    if (alignedSeq.Bin < 4681)
    {
        int pos = alignedSeq.Pos > 0 ? alignedSeq.Pos - 1 : 0;
        int end = alignedSeq.RefEndPos > 0 ? alignedSeq.RefEndPos - 1 : 0;

        // Convert zero-based positions to linear-index slots (position / 2^14).
        pos = pos >> 14;
        end = end >> 14;

        if (refIndices.LinearOffsets.Count == 0)
        {
            refIndices.LinearOffsets.Add(new FileOffset());
        }

        // Grow the linear index so that slot 'end' exists.
        if (refIndices.LinearOffsets.Count <= end)
        {
            for (int i = refIndices.LinearOffsets.Count; i <= end; i++)
            {
                refIndices.LinearOffsets.Add(new FileOffset());
            }
        }

        // Fill only slots that have not been assigned an offset yet.
        for (int i = pos + 1; i <= end; i++)
        {
            FileOffset offset = refIndices.LinearOffsets[i];
            if (offset.CompressedBlockOffset == 0 && offset.UncompressedBlockOffset == 0)
            {
                offset.CompressedBlockOffset = lastcOffset;
                offset.UncompressedBlockOffset = lastuOffset;
            }
        }
    }

    return alignedSeq;
}
/// <summary>
/// Lazily reads every aligned sequence from the current position to the end of the BAM
/// data, yielding each one, and builds the BAM index on the way when index creation is enabled.
/// </summary>
/// <remarks>
/// This is an iterator method: reading (and index building) only progresses while the
/// returned sequence is being enumerated, and the last chunk is only closed once the
/// enumeration has run to the end.
/// </remarks>
/// <param name="header">Alignment header; not used by this method.</param>
/// <returns>The sequences returned by GetAlignedSequence, in file order.</returns>
private IEnumerable<SAMAlignedSequence> GetAlignmentWithoutIndexYield(SAMAlignmentHeader header)
{
    Chunk lastChunk = null;
    ulong lastcOffset = 0;
    ushort lastuOffset = 0;
    BAMReferenceIndexes refIndices = null;

    if (createBamIndex)
    {
        // One (initially empty) index entry per reference sequence.
        bamIndex = new BAMIndex();
        for (int i = 0; i < refSeqNames.Count; i++)
        {
            bamIndex.RefIndexes.Add(new BAMReferenceIndexes());
        }

        // A file without reference names has no entry to start from; indexing RefIndexes[0]
        // unconditionally threw in that case.
        if (bamIndex.RefIndexes.Count > 0)
        {
            refIndices = bamIndex.RefIndexes[0];
        }
    }

    while (!IsEOF())
    {
        if (createBamIndex)
        {
            // Remember where this sequence starts before it is read.
            lastcOffset = (ulong)currentCompressedBlockStartPos;
            lastuOffset = (ushort)deCompressedStream.Position;
        }

        SAMAlignedSequence alignedSeq = GetAlignedSequence(0, int.MaxValue);
        alignedSeq = BamIndexing(alignedSeq, refIndices, bamIndex, lastcOffset, lastuOffset, ref lastChunk);
        yield return alignedSeq;
    }

    // Close the final chunk at the end of the stream. lastChunk stays null when no chunk
    // was ever opened (e.g. the file has no alignments), which used to raise a
    // NullReferenceException here.
    if (createBamIndex && lastChunk != null)
    {
        lastChunk.ChunkEnd.CompressedBlockOffset = (ulong)readStream.Position;
        if (deCompressedStream != null)
        {
            lastChunk.ChunkEnd.UncompressedBlockOffset = (ushort)deCompressedStream.Position;
        }
        else
        {
            lastChunk.ChunkEnd.UncompressedBlockOffset = 0;
        }
    }
}
/// <summary>
/// Reads every aligned sequence from the current position to the end of the BAM data.
/// When index creation is enabled the sequences are used to build the BAM index;
/// otherwise they are added to <paramref name="seqMap"/>.
/// </summary>
/// <param name="header">Alignment header used to create <paramref name="seqMap"/> when it is null and no index is being built.</param>
/// <param name="seqMap">
/// Alignment map that receives the non-null sequences when no index is being built. It is
/// created from <paramref name="header"/> if null in that mode and left untouched otherwise.
/// </param>
private void GetAlignmentWithoutIndex(SAMAlignmentHeader header, ref SequenceAlignmentMap seqMap)
{
    Chunk lastChunk = null;
    ulong lastcOffset = 0;
    ushort lastuOffset = 0;
    BAMReferenceIndexes refIndices = null;

    if (createBamIndex)
    {
        // One (initially empty) index entry per reference sequence.
        bamIndex = new BAMIndex();
        for (int i = 0; i < refSeqNames.Count; i++)
        {
            bamIndex.RefIndexes.Add(new BAMReferenceIndexes());
        }

        // A file without reference names has no entry to start from; indexing RefIndexes[0]
        // unconditionally threw in that case.
        if (bamIndex.RefIndexes.Count > 0)
        {
            refIndices = bamIndex.RefIndexes[0];
        }
    }

    if (!createBamIndex && seqMap == null)
    {
        seqMap = new SequenceAlignmentMap(header);
    }

    while (!IsEOF())
    {
        if (createBamIndex)
        {
            // Remember where this sequence starts before it is read.
            lastcOffset = (ulong)currentCompressedBlockStartPos;
            lastuOffset = (ushort)deCompressedStream.Position;
        }

        SAMAlignedSequence alignedSeq = GetAlignedSequence(0, int.MaxValue);
        alignedSeq = BamIndexing(alignedSeq, refIndices, bamIndex, lastcOffset, lastuOffset, ref lastChunk);

        if (!createBamIndex && alignedSeq != null)
        {
            seqMap.QuerySequences.Add(alignedSeq);
        }
    }

    // Close the final chunk at the end of the stream. lastChunk stays null when no chunk
    // was ever opened (e.g. the file has no alignments), which used to raise a
    // NullReferenceException here.
    if (createBamIndex && lastChunk != null)
    {
        lastChunk.ChunkEnd.CompressedBlockOffset = (ulong)readStream.Position;
        if (deCompressedStream != null)
        {
            lastChunk.ChunkEnd.UncompressedBlockOffset = (ushort)deCompressedStream.Position;
        }
        else
        {
            lastChunk.ChunkEnd.UncompressedBlockOffset = 0;
        }
    }
}