/// <summary>
/// Builds the index by reading through the alignments of a sorted BAM file.
/// </summary>
/// <param name="bamFilePath">Path to BAM file</param>
public BamIndex(string bamFilePath)
{
    _numUnalignedWithoutCoordinates = 0;

    using (var bamReader = new BamReader(bamFilePath))
    {
        // size the reference index from the number of references the reader reports,
        // together with the reader's current offset
        Initialize(bamReader.GetReferences().Count, bamReader.Tell());

        var record = new BamAlignment();

        // feed alignments into the index until the reader runs dry or
        // UpdateReferenceIndex signals that indexing should stop
        bool keepIndexing = true;
        while (keepIndexing && bamReader.GetNextAlignment(ref record, true))
        {
            keepIndexing = UpdateReferenceIndex(ref record, bamReader.Tell());
        }

        // finish the index using the offset where indexing stopped
        PostProcessing(bamReader.Tell());

        if (!_hasUnalignedReads)
        {
            return;
        }

        // count every alignment still left in the file
        while (bamReader.GetNextAlignment(ref record, true))
        {
            _numUnalignedWithoutCoordinates++;
        }
    }
}
/// <summary>
/// Step 2: Scan the tumor bam file for reads on our chromosome that may overlap the variants of
/// interest, handing each candidate read to ProcessReadBases so that ref and variant allele
/// frequencies can be gathered.
/// </summary>
/// <param name="bamPath">Path of the bam file to scan.</param>
protected void ProcessBamFile(string bamPath)
{
    Console.WriteLine("{0} Looping over bam records from {1}", DateTime.Now, bamPath);

    int recordsSeen = 0;
    int variantCursor = 0;

    using (var bam = new BamReader(bamPath))
    {
        int targetRefId = bam.GetReferenceIndex(this.Chromosome);
        if (targetRefId < 0)
        {
            string message = string.Format("Error: Chromosome name '{0}' does not match bam file at '{1}'", this.Chromosome, bamPath);
            throw new ArgumentException(message);
        }

        Console.WriteLine("Jump to refid {0} {1}", targetRefId, this.Chromosome);
        bam.Jump(targetRefId, 0);

        var alignment = new BamAlignment();
        while (bam.GetNextAlignment(ref alignment, false))
        {
            // A read with no position, or one on a later reference, means we're past our chromosome.
            if (!alignment.HasPosition())
            {
                break;
            }
            if (alignment.RefID > targetRefId)
            {
                break;
            }

            // Reads on an earlier reference: we haven't reached our chromosome yet.
            if (alignment.RefID < targetRefId)
            {
                continue;
            }

            recordsSeen++;
            if (recordsSeen % 1000000 == 0)
            {
                Console.WriteLine("Record {0} at {1}...", recordsSeen, alignment.Position);
            }

            // Only primary, mapped, non-duplicate reads above the mapping quality cutoff are counted.
            bool countWorthy = alignment.IsPrimaryAlignment()
                               && alignment.IsMapped()
                               && !alignment.IsDuplicate()
                               && alignment.MapQuality > MinimumMapQ;
            if (!countWorthy)
            {
                continue;
            }

            // Advance the variant cursor past every variant that lies before this read's start.
            while (variantCursor < this.Variants.Count
                   && this.Variants[variantCursor].ReferencePosition < alignment.Position)
            {
                variantCursor++;
            }

            // No variants left: nothing more to count.
            if (variantCursor >= this.Variants.Count)
            {
                break;
            }

            // A read starting more than 1000 bases before the next variant is treated as unable to
            // touch it; anything closer potentially overlaps it (and further variants). Count bases!
            if (alignment.Position + 1000 >= this.Variants[variantCursor].ReferencePosition)
            {
                ProcessReadBases(alignment, variantCursor);
            }
        }
    }

    Console.WriteLine("Looped over {0} bam records in all", recordsSeen);
}
/// <summary>
/// Reads the alignments of one chromosome from a bam file and records the position of every
/// alignment that passes the filters in <paramref name="observed"/>.
/// </summary>
/// <param name="bamFile">Bam file to read alignments from. An index must exist at this path plus ".bai".</param>
/// <param name="isPairedEnd">When true, alignments that are not part of a proper pair are skipped.</param>
/// <param name="chromosome">Name of the reference sequence whose alignments are loaded.</param>
/// <param name="coverageMode">
/// Selects how a kept alignment is recorded: Binary writes 1 into observed.Data at the alignment
/// position, every other mode calls observed.Set; GCContentWeighted additionally stores the
/// fragment length.
/// </param>
/// <param name="observed">Hit array that is updated at the position of each kept alignment.</param>
/// <param name="fragmentLengths">
/// Indexed by alignment position; receives the fragment length clamped to [0, Int16.MaxValue].
/// Only written when <paramref name="coverageMode"/> is GCContentWeighted.
/// </param>
/// <exception cref="Exception">The bam index file does not exist.</exception>
/// <exception cref="ApplicationException">The chromosome is not found among the bam file's references.</exception>
static void LoadObservedAlignmentsBAM(string bamFile, bool isPairedEnd, string chromosome, CanvasCoverageMode coverageMode, HitArray observed, Int16[] fragmentLengths)
{
    // Sanity check: The .bai file must exist, in order for us to seek to our target chromosome!
    string indexPath = bamFile + ".bai";
    if (!File.Exists(indexPath))
    {
        throw new Exception(string.Format("Fatal error: Bam index not found at {0}", indexPath));
    }

    using (BamReader reader = new BamReader(bamFile))
    {
        int desiredRefIndex = reader.GetReferenceIndex(chromosome);
        if (desiredRefIndex == -1)
        {
            throw new ApplicationException(
                string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", chromosome, bamFile));
        }

        if (!reader.Jump(desiredRefIndex, 0))
        {
            // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this
            // .bam file. That is not uncommon e.g. for truseq amplicon.
            return;
        }

        int readCount = 0;
        int keptReadCount = 0;
        BamAlignment alignment = new BamAlignment();
        while (reader.GetNextAlignment(ref alignment, true))
        {
            readCount++;

            // Flag check - Require reads to be aligned, passing filter, non-duplicate,
            // forward-strand main alignments:
            if (!alignment.IsMapped()) continue;
            if (alignment.IsFailedQC()) continue;
            if (alignment.IsDuplicate()) continue;
            if (alignment.IsReverseStrand()) continue;
            if (!alignment.IsMainAlignment()) continue;

            // Require the alignment to start with 35 bases of non-indel:
            // NOTE(review): assumes a mapped read always has at least one CIGAR operation - confirm.
            if (alignment.CigarData[0].Type != 'M' || alignment.CigarData[0].Length < 35) continue;

            if (isPairedEnd && !alignment.IsProperPair()) continue;

            // Quit once a read that passed the filters lies on a different reference. This check
            // runs after the filters, so filtered-out reads never end the scan. desiredRefIndex
            // cannot be -1 here, so a RefID of -1 ends the scan as well.
            if (alignment.RefID != desiredRefIndex) break;

            keptReadCount++;
            if (coverageMode == CanvasCoverageMode.Binary)
            {
                observed.Data[alignment.Position] = 1;
            }
            else
            {
                observed.Set(alignment.Position);
            }

            // store fragment size, make sure it's within Int16 range and is positive (simplification for now)
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                fragmentLengths[alignment.Position] = Convert.ToInt16(Math.Max(Math.Min(Int16.MaxValue, alignment.FragmentLength), 0));
            }
        }

        Console.WriteLine("Kept {0} of {1} total reads", keptReadCount, readCount);
    }
}
/// <summary>
/// Bins fragments: reads the alignments of <c>Chromosome</c> from the bam file and passes each
/// one to BinOneAlignment, which updates <c>Bins</c> and <c>usableFragmentCount</c>.
/// </summary>
/// <remarks>
/// Returns without binning, and without the paired-alignment check, when the reader cannot jump
/// to the chromosome.
/// </remarks>
/// <exception cref="Exception">The bam index file does not exist.</exception>
/// <exception cref="ApplicationException">
/// The chromosome is not found among the bam file's references, an alignment starts before the
/// previous one, or no paired alignment was seen.
/// </exception>
private void binFragments()
{
    // Sanity check: The BAM index file must exist, in order for us to seek to our target chromosome!
    if (!Bam.Index.Exists)
    {
        throw new Exception(string.Format("Fatal error: Bam index not found at {0}", Bam.Index.FullName));
    }

    long pairedAlignmentCount = 0; // keep track of paired alignments
    usableFragmentCount = 0;

    using (BamReader reader = new BamReader(Bam.BamFile.FullName))
    {
        int desiredRefIndex = reader.GetReferenceIndex(Chromosome);
        if (desiredRefIndex == -1)
        {
            throw new ApplicationException(
                string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", Chromosome, Bam.BamFile.FullName));
        }

        if (!reader.Jump(desiredRefIndex, 0))
        {
            // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this
            // .bam file. That is not uncommon e.g. for truseq amplicon.
            return;
        }

        Dictionary<string, int> readNameToBinIndex = new Dictionary<string, int>();
        HashSet<string> samePositionReadNames = new HashSet<string>();
        int binIndexStart = 0;
        int prevPosition = -1;
        BamAlignment alignment = new BamAlignment();
        while (reader.GetNextAlignment(ref alignment, true))
        {
            // Quit if the current reference index is different from the desired reference index.
            // desiredRefIndex cannot be -1 here, so a RefID of -1 ends the scan as well.
            if (alignment.RefID != desiredRefIndex) break;

            if (alignment.Position < prevPosition) // Make sure the BAM is properly sorted
            {
                throw new ApplicationException(
                    string.Format("The alignment on {0} are not properly sorted in {1}: {2}", Chromosome, Bam.BamFile.FullName, alignment.Name));
            }
            prevPosition = alignment.Position;

            if (alignment.IsPaired())
            {
                pairedAlignmentCount++;
            }

            BinOneAlignment(alignment, FragmentBinnerConstants.MappingQualityThreshold, readNameToBinIndex,
                samePositionReadNames, ref usableFragmentCount, Bins, ref binIndexStart);
        }
    }

    if (pairedAlignmentCount == 0)
    {
        throw new ApplicationException(string.Format("No paired alignments found for {0} in {1}", Chromosome, Bam.BamFile.FullName));
    }
}