/// <summary>
/// Decides whether a read should be skipped, recording a debug status count for the first reason that applies.
/// </summary>
/// <param name="alignment">The alignment to evaluate.</param>
/// <returns>
/// true when the read is unmapped, is a supplementary alignment, is not a proper pair (only when
/// proper-pair filtering is enabled), or has a map quality below the configured minimum; otherwise false.
/// </returns>
protected override bool ShouldSkipRead(BamAlignment alignment)
{
    // The checks run in a fixed order and only the first matching reason is counted.
    string skipReason = null;

    if (!alignment.IsMapped())
    {
        skipReason = "Skipped not mapped";
    }
    else if (alignment.IsSupplementaryAlignment())
    {
        skipReason = "Skipped supplementary";
    }
    else if (_filterForProperPairs && !alignment.IsProperPair())
    {
        skipReason = "Skipped improper pair";
    }
    else if (alignment.MapQuality < _minMapQuality)
    {
        skipReason = "Skipped low map quality";
    }

    if (skipReason == null)
    {
        return false;
    }

    _statusCounter.AddDebugStatusCount(skipReason);
    return true;
}
/// <summary>
/// Decides whether a read (and, by extension, its pair) should be blacklisted.
/// </summary>
/// <param name="alignment">The alignment to evaluate.</param>
/// <returns>
/// true when the read has a non-zero map quality below the minimum (if that filter is on), when the read
/// or its mate is unmapped (if that filter is on), or when the read is found to be a duplicate.
/// </returns>
protected override bool ShouldBlacklistReadIndexer(BamAlignment alignment)
{
    // Low-but-nonzero map quality; a map quality of exactly 0 is not caught by this filter.
    if (_filterPairLowMapQ && alignment.MapQuality > 0 && alignment.MapQuality < _minMapQuality)
    {
        return true;
    }

    if (_filterPairUnmapped)
    {
        string unmappedReason = null;
        if (!alignment.IsMapped())
        {
            unmappedReason = "Skipped not mapped";
        }
        else if (!alignment.IsMateMapped())
        {
            unmappedReason = "Skipped mate not mapped";
        }

        if (unmappedReason != null)
        {
            _statusCounter.AddDebugStatusCount(unmappedReason);
            return true;
        }
    }

    // The duplicate check must run only once per read: the de novo duplicate finder would otherwise
    // flag the read as a duplicate simply because it has already seen it.
    // We blacklist instead of merely skipping because a duplicate mate implies the other mate is one too.
    // Known gap: for a fusion read whose first-seen mate is not a duplicate but whose second mate is,
    // the first mate may already have been flushed to the bam (when fusions are not being mated).
    // That is considered a highly unlikely degenerate case.
    if (!ReadIsDuplicate(alignment))
    {
        return false;
    }

    _statusCounter.AddStatusCount("Blacklisted Duplicates");
    return true;
}
/// <summary>
/// Verifies that a Read built from a BamAlignment mirrors the alignment's fields and flags.
/// </summary>
public void FromBam()
{
    var alignment = new BamAlignment
    {
        Bases = "ATCTTA",
        Position = 100,
        MatePosition = 500,
        Name = "test",
        CigarData = new CigarAlignment("5M1S"),
        MapQuality = 10,
        Qualities = new[] { (byte)10, (byte)20, (byte)30 }
    };
    alignment.SetIsDuplicate(true);
    alignment.SetIsProperPair(true);
    alignment.SetIsSecondaryAlignment(true);
    alignment.SetIsUnmapped(true);

    var read = new Read("chr1", alignment);

    // Assert.Equal takes (expected, actual): the alignment-derived value is the expectation and the
    // Read property is the value under test, so a failure message labels the two sides correctly.
    Assert.Equal("chr1", read.Chromosome);
    Assert.Equal(alignment.Bases, read.Sequence);
    // Read positions are expected to be the alignment positions shifted by one.
    Assert.Equal(alignment.Position + 1, read.Position);
    Assert.Equal(alignment.MatePosition + 1, read.MatePosition);
    Assert.Equal(alignment.Name, read.Name);
    Assert.Equal(alignment.CigarData, read.CigarData);
    Assert.Equal(alignment.IsMapped(), read.IsMapped);
    Assert.Equal(alignment.IsProperPair(), read.IsProperPair);
    Assert.Equal(alignment.IsPrimaryAlignment(), read.IsPrimaryAlignment);
    Assert.Equal(alignment.IsDuplicate(), read.IsPcrDuplicate);

    foreach (var direction in read.SequencedBaseDirectionMap)
    {
        Assert.Equal(DirectionType.Forward, direction);
    }

    for (var i = 0; i < read.Qualities.Length; i++)
    {
        Assert.Equal(alignment.Qualities[i], read.Qualities[i]);
    }
}
/// <summary>
/// Cheap pre-check for whether a read could overlap its mate.
/// </summary>
/// <param name="alignment">The alignment to evaluate.</param>
/// <returns>
/// true when both the mate and the read are mapped, they share a reference ID, and their start
/// positions are no further apart than the configured maximum pair gap; otherwise false.
/// </returns>
private bool MayOverlapMate(BamAlignment alignment)
{
    var bothMapped = alignment.IsMateMapped() && alignment.IsMapped();
    if (!bothMapped || alignment.RefID != alignment.MateRefID)
    {
        return false;
    }

    var startDistance = Math.Abs(alignment.Position - alignment.MatePosition);
    var gapTooLarge = startDistance > _maxPairGap;
    return !gapTooLarge;
}
/// <summary>
/// Classifies a read using only the information carried on the read itself.
/// </summary>
/// <param name="alignment">The alignment to classify.</param>
/// <returns>
/// The first status that applies, checked in this order: SplitChromosomes, MateUnmapped, Duplicate,
/// LongFragment (only when insert size is considered); Unknown when none apply.
/// </returns>
private PairStatus SingleReadStatus(BamAlignment alignment)
{
    // Stitched reads also have differing ref ids, but they are not paired, so they are excluded here.
    var onDifferentReferences = alignment.RefID != alignment.MateRefID;
    if (onDifferentReferences && alignment.IsPaired())
    {
        return PairStatus.SplitChromosomes;
    }

    // Each flag test is combined with the *other* side's reference id being -1.
    var mateFlaggedUnmapped = !alignment.IsMateMapped() && alignment.RefID == -1;
    if (mateFlaggedUnmapped || (!alignment.IsMapped() && alignment.MateRefID == -1))
    {
        return PairStatus.MateUnmapped;
    }

    if (alignment.IsDuplicate())
    {
        return PairStatus.Duplicate;
    }

    if (_considerInsertSize && alignment.IsPaired() && !OverlapsMate(alignment))
    {
        return PairStatus.LongFragment;
    }

    return PairStatus.Unknown;
}
/// <summary>
/// Decides whether a read (and, by extension, its pair) should be blacklisted.
/// </summary>
/// <param name="alignment">The alignment to evaluate.</param>
/// <returns>
/// true when the read has a non-zero map quality below the minimum (if that filter is on), when the read
/// or its mate is unmapped with no reference id (if that filter is on), or when the read is a duplicate.
/// </returns>
protected override bool ShouldBlacklistReadIndexer(BamAlignment alignment)
{
    if (_filterPairLowMapQ)
    {
        var mapQ = alignment.MapQuality;
        if (mapQ > 0 && mapQ < _minMapQuality)
        {
            return true;
        }
    }

    if (_filterPairUnmapped)
    {
        // The unmapped flag alone is not enough: some pairs have one mate mapped and the other placed
        // right next to it with mapq 0 but still carrying mapping (chr:pos) information. Requiring a
        // reference id of -1 as well separates those from truly unmapped reads.
        var readTrulyUnmapped = !alignment.IsMapped() && alignment.RefID == -1;
        if (readTrulyUnmapped)
        {
            _statusCounter.AddDebugStatusCount("Skipped not mapped");
            return true;
        }

        var mateTrulyUnmapped = !alignment.IsMateMapped() && alignment.MateRefID == -1;
        if (mateTrulyUnmapped)
        {
            _statusCounter.AddDebugStatusCount("Skipped mate not mapped");
            return true;
        }
    }

    // Run the duplicate check exactly once per read; the de novo duplicate finder would otherwise
    // report a duplicate merely because it has seen this read before.
    // Blacklisting (not just skipping) is deliberate: if one mate is a duplicate the other is presumed to be.
    // Known gap: with a fusion read whose first-seen mate is not a duplicate but whose second mate is,
    // the first mate may already have been flushed to the bam (when fusions are not being mated).
    // That is considered a highly unlikely degenerate case.
    var duplicate = ReadIsDuplicate(alignment);
    if (!duplicate)
    {
        return false;
    }

    _statusCounter.AddStatusCount("Blacklisted Duplicates");
    return true;
}
/// <summary>
/// Step 2: Get the ref and variant allele frequencies for the variants of interest, in the tumor bam file.
/// </summary>
/// <param name="bamPath">Path of the bam file to scan.</param>
/// <exception cref="ArgumentException">The configured chromosome is not present in the bam file.</exception>
protected void ProcessBamFile(string bamPath)
{
    Console.WriteLine("{0} Looping over bam records from {1}", DateTime.Now, bamPath);
    int recordsSeen = 0;
    int variantCursor = 0;

    using (BamReader bam = new BamReader(bamPath))
    {
        BamAlignment read = new BamAlignment();
        int chromosomeRefId = bam.GetReferenceIndex(this.Chromosome);
        if (chromosomeRefId < 0)
        {
            throw new ArgumentException(string.Format("Error: Chromosome name '{0}' does not match bam file at '{1}'", this.Chromosome, bamPath));
        }

        Console.WriteLine("Jump to refid {0} {1}", chromosomeRefId, this.Chromosome);
        bam.Jump(chromosomeRefId, 0);

        while (bam.GetNextAlignment(ref read, false))
        {
            // Stop once we have moved past the chromosome of interest.
            if (!read.HasPosition() || read.RefID > chromosomeRefId)
            {
                break;
            }

            // Not yet on the chromosome of interest.
            if (read.RefID < chromosomeRefId)
            {
                continue;
            }

            recordsSeen++;
            if (recordsSeen % 1000000 == 0)
            {
                Console.WriteLine("Record {0} at {1}...", recordsSeen, read.Position);
            }

            // Skip unaligned or otherwise non-count-worthy reads.
            if (!read.IsPrimaryAlignment() || !read.IsMapped() || read.IsDuplicate() || read.MapQuality <= MinimumMapQ)
            {
                continue;
            }

            // Advance through the variant list so it keeps pace with the reads.
            while (variantCursor < this.Variants.Count && this.Variants[variantCursor].ReferencePosition < read.Position)
            {
                variantCursor++;
            }

            // No variants left to count.
            if (variantCursor >= this.Variants.Count)
            {
                break;
            }

            // A read starting more than 1000 bases before the next variant is not examined.
            var nextVariantPosition = this.Variants[variantCursor].ReferencePosition;
            if (read.Position + 1000 < nextVariantPosition)
            {
                continue;
            }

            // The read potentially overlaps the next variant (and later ones). Count bases!
            ProcessReadBases(read, variantCursor);
        }
    }

    Console.WriteLine("Looped over {0} bam records in all", recordsSeen);
}
/// <summary>
/// Step 2: Get the ref and variant allele frequencies for the variants of interest, in the tumor bam file.
/// </summary>
/// <param name="bamPath">Path of the bam file to scan.</param>
/// <exception cref="ArgumentException">The configured chromosome is not present in the bam file.</exception>
protected void ProcessBamFile(string bamPath)
{
    // A read is only examined when the next variant lies within this many bases of the read's start.
    const int maxReadReach = 1000;

    Console.WriteLine("{0} Looping over bam records from {1}", DateTime.Now, bamPath);
    int overallCount = 0;
    int nextVariantIndex = 0;
    using (BamReader reader = new BamReader(bamPath))
    {
        BamAlignment read = new BamAlignment();
        int refID = reader.GetReferenceIndex(this.Chromosome);
        if (refID < 0)
        {
            throw new ArgumentException(string.Format("Error: Chromosome name '{0}' does not match bam file at '{1}'", this.Chromosome, bamPath));
        }
        Console.WriteLine("Jump to refid {0} {1}", refID, this.Chromosome);
        reader.Jump(refID, 0);

        while (reader.GetNextAlignment(ref read, false))
        {
            bool pastChromosome = !read.HasPosition() || read.RefID > refID;
            if (pastChromosome)
            {
                break;
            }

            bool beforeChromosome = read.RefID < refID;
            if (beforeChromosome)
            {
                continue;
            }

            overallCount++;
            if (overallCount % 1000000 == 0)
            {
                Console.WriteLine("Record {0} at {1}...", overallCount, read.Position);
            }

            // Only primary, mapped, non-duplicate reads above the minimum map quality are counted.
            bool countWorthy = read.IsPrimaryAlignment()
                && read.IsMapped()
                && !read.IsDuplicate()
                && read.MapQuality > MinimumMapQ;
            if (!countWorthy)
            {
                continue;
            }

            // Move the variant index forward until it reaches a variant at or after the read start.
            for (; nextVariantIndex < this.Variants.Count; nextVariantIndex++)
            {
                if (this.Variants[nextVariantIndex].ReferencePosition >= read.Position)
                {
                    break;
                }
            }

            // Every variant has been passed; nothing more to count.
            if (nextVariantIndex >= this.Variants.Count)
            {
                break;
            }

            // The next variant is too far downstream for this read to be worth examining.
            if (read.Position + maxReadReach < this.Variants[nextVariantIndex].ReferencePosition)
            {
                continue;
            }

            // This read potentially overlaps the next variant (and further variants). Count bases!
            ProcessReadBases(read, nextVariantIndex);
        }
    }
    Console.WriteLine("Looped over {0} bam records in all", overallCount);
}
/// <summary>
/// Bins the fragment identified by alignment. Increases bin count if the first read of a pair passes all the filters.
/// Decreases bin count if the second read of a pair does not pass all the filters.
/// </summary>
/// <param name="alignment">The alignment to bin; ignored unless it is mapped, primary, and part of a proper pair with a mapped mate.</param>
/// <param name="qualityThreshold">minimum mapping quality</param>
/// <param name="readNameToBinIndex">Dictionary of read name to bin index; entries are added when a fragment is binned and removed when its mate is seen.</param>
/// <param name="samePositionReadNames">Names of reads whose position equals their mate's position, tracked so that only one read of such a pair is considered for binning.</param>
/// <param name="usableFragmentCount">number of usable fragments</param>
/// <param name="bins">predefined bins</param>
/// <param name="binIndexStart">bin index from which to start searching for the best bin</param>
public static void BinOneAlignment(BamAlignment alignment, uint qualityThreshold, Dictionary<string, int> readNameToBinIndex,
    HashSet<string> samePositionReadNames, ref long usableFragmentCount, List<SampleGenomicBin> bins, ref int binIndexStart)
{
    if (!alignment.IsMapped()) { return; }
    if (!alignment.IsMateMapped()) { return; }
    if (!alignment.IsPrimaryAlignment()) { return; }
    if (!(alignment.IsPaired() && alignment.IsProperPair())) { return; }

    bool duplicateFailedQCLowQuality = IsDuplicateFailedQCLowQuality(alignment, qualityThreshold);

    // Check whether we have binned the fragment using the mate.
    // TryGetValue fetches the bin index in the same lookup instead of ContainsKey followed by the indexer.
    int mateBinIndex;
    if (readNameToBinIndex.TryGetValue(alignment.Name, out mateBinIndex))
    {
        // Undo binning when one of the reads is a duplicate, fails QC or has low mapping quality
        if (duplicateFailedQCLowQuality)
        {
            usableFragmentCount--;
            bins[mateBinIndex].Count--;
        }
        readNameToBinIndex.Remove(alignment.Name); // clean up
        return;
    }

    if (duplicateFailedQCLowQuality) { return; }

    if (alignment.RefID != alignment.MateRefID) { return; } // does this ever happen?

    if (IsRightMostInPair(alignment)) { return; } // look at only one read of the pair

    // handle the case where alignment.Position == alignment.MatePosition
    if (alignment.Position == alignment.MatePosition)
    {
        // The second read seen at a shared position is dropped; the first one carries on to binning.
        if (samePositionReadNames.Contains(alignment.Name))
        {
            samePositionReadNames.Remove(alignment.Name);
            return;
        }
        samePositionReadNames.Add(alignment.Name);
    }

    if (alignment.FragmentLength == 0) { return; } // Janus-SRS-190: 0 when the information is unavailable

    // Try to bin the fragment
    int fragmentStart = alignment.Position; // 0-based, inclusive
    int fragmentStop = alignment.Position + alignment.FragmentLength; // 0-based, exclusive

    // Skip bins that lie entirely to the left of the fragment.
    while (binIndexStart < bins.Count && bins[binIndexStart].Stop <= fragmentStart)
    {
        binIndexStart++;
    }
    if (binIndexStart >= bins.Count) { return; } // all the remaining fragments are on the right of the last bin

    // now bins[binIndexStart].Stop > fragmentStart
    int bestBinIndex = FindBestBin(bins, binIndexStart, fragmentStart, fragmentStop);
    if (bestBinIndex >= 0) // Bin the fragment
    {
        usableFragmentCount++;
        bins[bestBinIndex].Count++;
        readNameToBinIndex[alignment.Name] = bestBinIndex;
    }
}
/// <summary>
/// Updates the index with respect to the current alignment
/// </summary>
/// <param name="alignment">The alignment that was just processed.</param>
/// <param name="offset">The offset recorded as the new last offset; it must be greater than the previous one.</param>
/// <returns>
/// false when the bin changes on an alignment whose reference ID is negative (unaligned reads are then
/// flagged as present); true otherwise.
/// </returns>
/// <exception cref="ApplicationException">
/// The alignment is positioned before the preceding one on the same reference, or the offset did not increase.
/// </exception>
public bool UpdateReferenceIndex(ref BamAlignment alignment, ulong offset)
{
    // record the number of unaligned reads
    if (alignment.RefID < 0)
    {
        ++_numUnalignedWithoutCoordinates;
    }

    // update the reference IDs and check that the alignment is sorted
    if (alignment.RefID != _lastRefID)
    {
        // New reference: remember it and reset the last bin to the sentinel value.
        _lastRefID = alignment.RefID;
        _lastBin = uint.MaxValue;
    }
    else if (alignment.Position < _lastPosition)
    {
        string notSortedMessage = string.Format(
            "ERROR: The BAM file is not sorted. An alignment ({0}) occurred before the preceding alignment ({1}).",
            alignment.Position, _lastPosition);
        throw new ApplicationException(notSortedMessage);
    }

    if (alignment.RefID >= 0)
    {
        AddOffset(ref _index[alignment.RefID].OffsetList, ref alignment, _lastOffset);
    }

    if (alignment.Bin != _lastBin)
    {
        // Close out the region of the bin that was being accumulated, if there was one.
        if (_saveBin != uint.MaxValue)
        {
            AddBamRegion(ref _index[_saveRefID].RegionsDictionary, _saveBin, _saveOffset, _lastOffset);
        }

        // The sentinel in _lastBin means the reference just changed: write the two BamMaxBin
        // entries (offset range, then aligned/unaligned counts) for the saved reference.
        bool finishedReference = (_lastBin == uint.MaxValue) && (_saveRefID != int.MinValue);
        if (finishedReference)
        {
            _endOffset = _lastOffset;
            AddBamRegion(ref _index[_saveRefID].RegionsDictionary, BamMaxBin, _beginOffset, _endOffset);
            AddBamRegion(ref _index[_saveRefID].RegionsDictionary, BamMaxBin, _numAligned, _numUnaligned);
            _numUnaligned = 0;
            _numAligned = 0;
            _beginOffset = _endOffset;
        }

        _saveOffset = _lastOffset;
        _lastBin = alignment.Bin;
        _saveBin = _lastBin;
        _saveRefID = alignment.RefID;

        if (_saveRefID < 0)
        {
            _hasUnalignedReads = true;
            return false;
        }
    }

    if (offset <= _lastOffset)
    {
        throw new ApplicationException(
            "ERROR: While updating the BAM index, the offset did not increase after processing the last alignment.");
    }

    if (alignment.IsMapped())
    {
        ++_numAligned;
    }
    else
    {
        ++_numUnaligned;
    }

    _lastOffset = offset;
    _lastPosition = alignment.Position;
    return true;
}
/// <summary>
/// Reads in a bam file and marks within the hit array which genomic mers are present.
/// </summary>
/// <param name="bamFile">bam file to read alignments from; a matching ".bai" index must exist next to it.</param>
/// <param name="isPairedEnd">When true, alignments that are not part of a proper pair are ignored.</param>
/// <param name="chromosome">Name of the chromosome whose alignments are loaded.</param>
/// <param name="coverageMode">Selects how a hit is recorded and whether fragment lengths are stored.</param>
/// <param name="observed">Hit array, indexed by alignment position, that receives the observations.</param>
/// <param name="fragmentLengths">Per-position fragment lengths; written only in GCContentWeighted mode.</param>
/// <exception cref="Exception">The bam index file does not exist.</exception>
/// <exception cref="ApplicationException">The chromosome is not found in the bam file.</exception>
static void LoadObservedAlignmentsBAM(string bamFile, bool isPairedEnd, string chromosome, CanvasCoverageMode coverageMode, HitArray observed, Int16[] fragmentLengths)
{
    // Sanity check: The .bai file must exist, in order for us to seek to our target chromosome!
    string indexPath = bamFile + ".bai";
    if (!File.Exists(indexPath))
    {
        throw new Exception(string.Format("Fatal error: Bam index not found at {0}", indexPath));
    }

    using (BamReader reader = new BamReader(bamFile))
    {
        int desiredRefIndex = reader.GetReferenceIndex(chromosome);
        if (desiredRefIndex == -1)
        {
            throw new ApplicationException(
                string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", chromosome, bamFile));
        }

        bool result = reader.Jump(desiredRefIndex, 0);
        if (!result)
        {
            // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this
            // .bam file. That is not uncommon e.g. for truseq amplicon.
            return;
        }

        int readCount = 0;
        int keptReadCount = 0;

        // NOTE(review): the returned header was never used; the call is kept in case it has side
        // effects on the reader — confirm and remove if it does not.
        reader.GetHeader();

        BamAlignment alignment = new BamAlignment();
        while (reader.GetNextAlignment(ref alignment, true))
        {
            readCount++;

            // Flag check - Require reads to be aligned, passing filter, non-duplicate:
            if (!alignment.IsMapped()) { continue; }
            if (alignment.IsFailedQC()) { continue; }
            if (alignment.IsDuplicate()) { continue; }
            if (alignment.IsReverseStrand()) { continue; }
            if (!alignment.IsMainAlignment()) { continue; }

            // Require the alignment to start with 35 bases of non-indel:
            if (alignment.CigarData[0].Type != 'M' || alignment.CigarData[0].Length < 35) { continue; }

            if (isPairedEnd && !alignment.IsProperPair()) { continue; }

            // quit if the current reference index is different from the desired reference index.
            // desiredRefIndex cannot be -1 at this point, so a read that gets past this check can
            // never have a reference index of -1 and no separate test for that is needed.
            if (alignment.RefID != desiredRefIndex) { break; }

            keptReadCount++;
            if (coverageMode == CanvasCoverageMode.Binary)
            {
                observed.Data[alignment.Position] = 1;
            }
            else
            {
                observed.Set(alignment.Position);
            }

            // store fragment size, make sure it's within Int16 range and is positive (simplification for now)
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                fragmentLengths[alignment.Position] = Convert.ToInt16(Math.Max(Math.Min(Int16.MaxValue, alignment.FragmentLength), 0));
            }
        }
        Console.WriteLine("Kept {0} of {1} total reads", keptReadCount, readCount);
    }
}
/// <summary>
/// Updates the index with respect to the current alignment
/// </summary>
/// <param name="alignment">The alignment that was just processed.</param>
/// <param name="offset">The offset recorded as the new last offset; it must be greater than the previous one.</param>
/// <returns>
/// false when the bin changes on an alignment whose reference ID is negative (unaligned reads are then
/// flagged as present); true otherwise.
/// </returns>
/// <exception cref="InvalidDataException">
/// The alignment is positioned before the preceding one on the same reference, or the offset did not increase.
/// </exception>
public bool UpdateReferenceIndex(ref BamAlignment alignment, ulong offset)
{
    // record the number of unaligned reads
    if (alignment.RefID < 0)
    {
        ++_numUnalignedWithoutCoordinates;
    }

    // update the reference IDs and check that the alignment is sorted
    if (alignment.RefID != _lastRefID)
    {
        _lastRefID = alignment.RefID;
        // The reset value must be the same sentinel the reference-change test below compares against
        // (uint.MaxValue). Resetting to int.MaxValue meant that test never matched, so the BamMaxBin
        // entries were never written and the aligned/unaligned counters were never reset.
        _lastBin = uint.MaxValue;
    }
    else if (alignment.Position < _lastPosition)
    {
        throw new InvalidDataException(
            string.Format(
                "ERROR: The BAM file is not sorted. An alignment ({0}:{1}) occurred before the preceding alignment ({2}:{3}).",
                alignment.RefID, alignment.Position, _lastRefID, _lastPosition));
    }

    if (alignment.RefID >= 0)
    {
        AddOffset(ref _index[alignment.RefID].OffsetList, ref alignment, _lastOffset);
    }

    if (alignment.Bin != _lastBin)
    {
        // Close out the region of the bin that was being accumulated, if there was one.
        if (_saveBin != uint.MaxValue)
        {
            AddBamRegion(ref _index[_saveRefID].RegionsDictionary, _saveBin, _saveOffset, _lastOffset);
        }

        // The sentinel in _lastBin means the reference just changed: write the two BamMaxBin
        // entries (offset range, then aligned/unaligned counts) for the saved reference.
        if ((_lastBin == uint.MaxValue) && (_saveRefID != int.MinValue))
        {
            _endOffset = _lastOffset;
            AddBamRegion(ref _index[_saveRefID].RegionsDictionary, BamMaxBin, _beginOffset, _endOffset);
            AddBamRegion(ref _index[_saveRefID].RegionsDictionary, BamMaxBin, _numAligned, _numUnaligned);
            _numAligned = _numUnaligned = 0;
            _beginOffset = _endOffset;
        }

        _saveOffset = _lastOffset;
        _saveBin = _lastBin = alignment.Bin;
        _saveRefID = alignment.RefID;

        if (_saveRefID < 0)
        {
            _hasUnalignedReads = true;
            return false;
        }
    }

    if (offset <= _lastOffset)
    {
        throw new InvalidDataException(
            "ERROR: While updating the BAM index, the offset did not increase after processing the last alignment.");
    }

    if (alignment.IsMapped())
    {
        ++_numAligned;
    }
    else
    {
        ++_numUnaligned;
    }

    _lastOffset = offset;
    _lastPosition = alignment.Position;
    return true;
}
/// <summary>
/// Reads in a bam file and marks within the hit array which genomic mers are present.
/// </summary>
/// <param name="bamFile">bam file to read alignments from; a matching ".bai" index must exist next to it.</param>
/// <param name="isPairedEnd">When true, alignments that are not part of a proper pair are ignored.</param>
/// <param name="chromosome">Name of the chromosome whose alignments are loaded.</param>
/// <param name="coverageMode">Selects how a hit is recorded and whether fragment lengths are stored.</param>
/// <param name="observed">Hit array, indexed by alignment position, that receives the observations.</param>
/// <param name="fragmentLengths">Per-position fragment lengths; written only in GCContentWeighted mode.</param>
/// <exception cref="Exception">The bam index file does not exist.</exception>
/// <exception cref="ApplicationException">The chromosome is not found in the bam file.</exception>
static void LoadObservedAlignmentsBAM(string bamFile, bool isPairedEnd, string chromosome, CanvasCoverageMode coverageMode, HitArray observed, Int16[] fragmentLengths)
{
    // Sanity check: The .bai file must exist, in order for us to seek to our target chromosome!
    string indexPath = bamFile + ".bai";
    if (!File.Exists(indexPath))
    {
        throw new Exception(string.Format("Fatal error: Bam index not found at {0}", indexPath));
    }

    using (BamReader reader = new BamReader(bamFile))
    {
        int desiredRefIndex = reader.GetReferenceIndex(chromosome);
        if (desiredRefIndex == -1)
        {
            throw new ApplicationException(
                string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", chromosome, bamFile));
        }

        bool result = reader.Jump(desiredRefIndex, 0);
        if (!result)
        {
            // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this
            // .bam file. That is not uncommon e.g. for truseq amplicon.
            return;
        }

        int readCount = 0;
        int keptReadCount = 0;

        // NOTE(review): the returned header was never used; the call is kept in case it has side
        // effects on the reader — confirm and remove if it does not.
        reader.GetHeader();

        BamAlignment alignment = new BamAlignment();
        while (reader.GetNextAlignment(ref alignment, true))
        {
            readCount++;

            // Flag check - Require reads to be aligned, passing filter, non-duplicate:
            if (!alignment.IsMapped())
            {
                continue;
            }
            if (alignment.IsFailedQC())
            {
                continue;
            }
            if (alignment.IsDuplicate())
            {
                continue;
            }
            if (alignment.IsReverseStrand())
            {
                continue;
            }
            if (!alignment.IsMainAlignment())
            {
                continue;
            }

            // Require the alignment to start with 35 bases of non-indel:
            if (alignment.CigarData[0].Type != 'M' || alignment.CigarData[0].Length < 35)
            {
                continue;
            }

            if (isPairedEnd && !alignment.IsProperPair())
            {
                continue;
            }

            // quit if the current reference index is different from the desired reference index.
            // desiredRefIndex cannot be -1 at this point, so a read that gets past this check can
            // never have a reference index of -1 and no separate test for that is needed.
            if (alignment.RefID != desiredRefIndex)
            {
                break;
            }

            keptReadCount++;
            if (coverageMode == CanvasCoverageMode.Binary)
            {
                observed.Data[alignment.Position] = 1;
            }
            else
            {
                observed.Set(alignment.Position);
            }

            // store fragment size, make sure it's within Int16 range and is positive (simplification for now)
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                fragmentLengths[alignment.Position] = Convert.ToInt16(Math.Max(Math.Min(Int16.MaxValue, alignment.FragmentLength), 0));
            }
        }
        Console.WriteLine("Kept {0} of {1} total reads", keptReadCount, readCount);
    }
}
/// <summary>
/// Bins the fragment identified by alignment. Increases bin count if the first read of a pair passes all the filters.
/// Decreases bin count if the second read of a pair does not pass all the filters.
/// </summary>
/// <param name="alignment">The alignment to bin; ignored unless it is mapped, primary, and part of a proper pair with a mapped mate.</param>
/// <param name="qualityThreshold">minimum mapping quality</param>
/// <param name="readNameToBinIndex">Dictionary of read name to bin index; entries are added when a fragment is binned and removed when its mate is seen.</param>
/// <param name="samePositionReadNames">Names of reads whose position equals their mate's position, tracked so that only one read of such a pair is considered for binning.</param>
/// <param name="usableFragmentCount">number of usable fragments</param>
/// <param name="bins">predefined bins</param>
/// <param name="binIndexStart">bin index from which to start searching for the best bin</param>
public static void BinOneAlignment(BamAlignment alignment, uint qualityThreshold, Dictionary<string, int> readNameToBinIndex,
    HashSet<string> samePositionReadNames, ref long usableFragmentCount, List<GenomicBin> bins, ref int binIndexStart)
{
    if (!alignment.IsMapped()) { return; }
    if (!alignment.IsMateMapped()) { return; }
    if (!alignment.IsPrimaryAlignment()) { return; }
    if (!(alignment.IsPaired() && alignment.IsProperPair())) { return; }

    bool duplicateFailedQCLowQuality = IsDuplicateFailedQCLowQuality(alignment, qualityThreshold);

    // Check whether we have binned the fragment using the mate.
    // TryGetValue fetches the bin index in the same lookup instead of ContainsKey followed by the indexer.
    int mateBinIndex;
    if (readNameToBinIndex.TryGetValue(alignment.Name, out mateBinIndex))
    {
        // Undo binning when one of the reads is a duplicate, fails QC or has low mapping quality
        if (duplicateFailedQCLowQuality)
        {
            usableFragmentCount--;
            bins[mateBinIndex].Count--;
        }
        readNameToBinIndex.Remove(alignment.Name); // clean up
        return;
    }

    if (duplicateFailedQCLowQuality) { return; }

    if (alignment.RefID != alignment.MateRefID) { return; } // does this ever happen?

    if (IsRightMostInPair(alignment)) { return; } // look at only one read of the pair

    // handle the case where alignment.Position == alignment.MatePosition
    if (alignment.Position == alignment.MatePosition)
    {
        // The second read seen at a shared position is dropped; the first one carries on to binning.
        if (samePositionReadNames.Contains(alignment.Name))
        {
            samePositionReadNames.Remove(alignment.Name);
            return;
        }
        samePositionReadNames.Add(alignment.Name);
    }

    if (alignment.FragmentLength == 0) { return; } // Janus-SRS-190: 0 when the information is unavailable

    // Try to bin the fragment
    int fragmentStart = alignment.Position; // 0-based, inclusive
    int fragmentStop = alignment.Position + alignment.FragmentLength; // 0-based, exclusive

    // Skip bins that lie entirely to the left of the fragment.
    while (binIndexStart < bins.Count && bins[binIndexStart].Stop <= fragmentStart)
    {
        binIndexStart++;
    }
    if (binIndexStart >= bins.Count) { return; } // all the remaining fragments are on the right of the last bin

    // now bins[binIndexStart].Stop > fragmentStart
    int bestBinIndex = FindBestBin(bins, binIndexStart, fragmentStart, fragmentStop);
    if (bestBinIndex >= 0) // Bin the fragment
    {
        usableFragmentCount++;
        bins[bestBinIndex].Count++;
        readNameToBinIndex[alignment.Name] = bestBinIndex;
    }
}