/// <summary>
/// Create an index from a sorted BAM file
/// </summary>
/// <param name="bamFilePath">Path to BAM file</param>
public BamIndex(string bamFilePath)
{
    _numUnalignedWithoutCoordinates = 0;

    using (var bamReader = new BamReader(bamFilePath))
    {
        // allocate space for the reference index, one slot per reference reported by the reader
        List<GenomeMetadata.SequenceMetadata> referenceList = bamReader.GetReferences();
        Initialize(referenceList.Count, bamReader.Tell());

        // feed alignments to the index until the reader runs dry or
        // UpdateReferenceIndex reports that indexing should stop
        BamAlignment current = new BamAlignment();
        bool keepIndexing = true;
        while (keepIndexing && bamReader.GetNextAlignment(ref current, true))
        {
            keepIndexing = UpdateReferenceIndex(ref current, bamReader.Tell());
        }

        // perform some post-processing on the index
        PostProcessing(bamReader.Tell());

        // count whatever alignments remain after the indexed portion
        if (_hasUnalignedReads)
        {
            while (bamReader.GetNextAlignment(ref current, true))
            {
                _numUnalignedWithoutCoordinates++;
            }
        }
    }
}
/// <summary>
/// Sets every base quality to zero on the reads selected by <paramref name="readsToSilence"/>.
/// A value of 1 selects read1, 2 selects read2 and 3 selects both; any other value selects neither.
/// A read whose "realigned" flag is true is never modified.
/// </summary>
/// <param name="read1">First read of the pair.</param>
/// <param name="read2">Second read of the pair.</param>
/// <param name="readsToSilence">Selector: 1 = read1, 2 = read2, 3 = both.</param>
/// <param name="realignedR1">When true, read1 is left untouched.</param>
/// <param name="realignedR2">When true, read2 is left untouched.</param>
private void SilenceReads(BamAlignment read1, BamAlignment read2, int readsToSilence, bool realignedR1, bool realignedR2)
{
    var bothSelected = readsToSilence == 3;
    var silenceFirst = !realignedR1 && (readsToSilence == 1 || bothSelected);
    var silenceSecond = !realignedR2 && (readsToSilence == 2 || bothSelected);

    if (silenceFirst)
    {
        var firstQualities = read1.Qualities;
        for (var idx = 0; idx < firstQualities.Length; idx++)
        {
            firstQualities[idx] = 0;
        }
    }

    if (silenceSecond)
    {
        var secondQualities = read2.Qualities;
        for (var idx = 0; idx < secondQualities.Length; idx++)
        {
            secondQualities[idx] = 0;
        }
    }
}
/// <summary>
/// Checks that <c>DebugSummaryStatusHandler.AddCombinedStatusStringTags</c> leaves the "HI" tag
/// already present on the output alignment unchanged.
/// </summary>
public void AddCombinedStatusStringTags()
{
    var statusCounter = new ReadStatusCounter();
    var statusHandler = new DebugSummaryStatusHandler(statusCounter);

    var readPair = TestHelpers.GetPair("10M", "10M");
    readPair.Read1.ReplaceOrAddStringTag("HI", "read1_hi");
    readPair.Read2.ReplaceOrAddStringTag("HI", "read2_hi");

    var combined = new BamAlignment(readPair.Read1);
    combined.ReplaceOrAddStringTag("HI", "nothing");

    // Should not update
    statusHandler.AddCombinedStatusStringTags("HI", readPair.Read1, readPair.Read2, combined);

    Assert.Equal("nothing", combined.GetStringTag("HI"));
}
/// <summary>
/// Checks that <c>Read.IsCollapsedRead()</c> is false before the read-count tag data is set on the
/// underlying alignment and true afterwards.
/// </summary>
public void IsReadCollapsed()
{
    var bam = new BamAlignment
    {
        Bases = "ACTC",
        Position = 5,
        MapQuality = 343,
        MatePosition = 12312,
        Qualities = new[] { (byte)20, (byte)21, (byte)30, (byte)40 },
        CigarData = new CigarAlignment("1S3M")
    };
    var wrapped = new Read("chr1", bam);

    // no tag data yet
    Assert.False(wrapped.IsCollapsedRead());

    // set XV and XW tags
    bam.TagData = ReadTestHelper.GetReadCountsTagData(1, 10);
    Assert.True(wrapped.IsCollapsedRead());
}
/// <summary>
/// Serialize alignment to a byte array, for later flushing to output file.
/// </summary>
/// <param name="al">Alignment to serialize; its name, bases, CIGAR, qualities and tag data are read.</param>
/// <returns>A new buffer holding the 4-byte block size followed by the alignment record.</returns>
public byte[] SerializeAlignment(ref BamAlignment al)
{
    // sizes of the variable-length sections
    uint nameLength = (uint)al.Name.Length + 1; // +1 for the terminator written by AddNullTerminatedString
    uint baseCount = (uint)al.Bases.Length;
    uint cigarOpCount = (uint)al.CigarData.Count;
    uint cigarBytes = cigarOpCount * 4;
    uint packedBaseBytes = (baseCount + 1) / 2; // half the base count, rounded up
    uint tagBytes = (uint)al.TagData.Length;

    uint variableBytes = nameLength + cigarBytes + packedBaseBytes + baseCount + tagBytes;
    uint recordBytes = BamConstants.CoreAlignmentDataLen + variableBytes;

    // the record is preceded by its own 4-byte length
    byte[] output = new byte[recordBytes + 4];
    int cursor = 0;

    // store the block size
    BinaryIO.AddUIntBytes(ref output, ref cursor, recordBytes);

    // store the BAM core data
    BinaryIO.AddIntBytes(ref output, ref cursor, al.RefID);
    BinaryIO.AddIntBytes(ref output, ref cursor, al.Position);
    BinaryIO.AddUIntBytes(ref output, ref cursor, (al.Bin << 16) | (al.MapQuality << 8) | nameLength);
    BinaryIO.AddUIntBytes(ref output, ref cursor, (al.AlignmentFlag << 16) | cigarOpCount);
    BinaryIO.AddUIntBytes(ref output, ref cursor, baseCount);
    BinaryIO.AddIntBytes(ref output, ref cursor, al.MateRefID);
    BinaryIO.AddIntBytes(ref output, ref cursor, al.MatePosition);
    BinaryIO.AddIntBytes(ref output, ref cursor, al.FragmentLength);

    // store the alignment name
    BinaryIO.AddNullTerminatedString(ref output, ref cursor, al.Name);

    // store the packed CIGAR string and packed bases
    PackCigar(ref cursor, ref output, al.CigarData);
    PackBases(ref cursor, ref output, packedBaseBytes, al.Bases);

    // store the base qualities
    int qualityBytes = al.Qualities.Length;
    Buffer.BlockCopy(al.Qualities, 0, output, cursor, qualityBytes);
    cursor += qualityBytes;

    // store the tag data
    int tagLength = al.TagData.Length;
    Buffer.BlockCopy(al.TagData, 0, output, cursor, tagLength);
    cursor += tagLength;

    return output;
}
/// <summary>
/// Builds a <c>BamAlignment</c> with the given name, position, CIGAR, pairing/mapping flags and
/// mapping quality. The qualities array is empty.
/// </summary>
/// <param name="name">Read name.</param>
/// <param name="isProperPair">Value passed to <c>SetIsProperPair</c>.</param>
/// <param name="position">Alignment position.</param>
/// <param name="cigarData">CIGAR string used to construct the <c>CigarAlignment</c>.</param>
/// <param name="isUnMapped">Value passed to <c>SetIsUnmapped</c>.</param>
/// <param name="mateIsUnMapped">Value passed to <c>SetIsMateUnmapped</c>.</param>
/// <param name="mapQ">Mapping quality.</param>
/// <returns>The newly created alignment.</returns>
public static BamAlignment CreateAlignment(string name, bool isProperPair = true, int position = 0, string cigarData = "3M", bool isUnMapped = false, bool mateIsUnMapped = false, uint mapQ = 30)
{
    var result = new BamAlignment
    {
        Name = name,
        Qualities = new byte[0],
        CigarData = new CigarAlignment(cigarData),
        Position = position
    };

    // flag bits first, then mapping quality, as in the original construction order
    result.SetIsProperPair(isProperPair);
    result.SetIsUnmapped(isUnMapped);
    result.SetIsMateUnmapped(mateIsUnMapped);
    result.MapQuality = mapQ;

    return result;
}
/// <summary>
/// Reads the next alignment from the underlying BAM reader into <paramref name="read"/>,
/// jumping between configured intervals where <c>GetIntervalsForChr</c> supplies them.
/// Disposes this instance and returns false when no further alignment is available,
/// when a jump fails, or when the alignment's reference differs from the index filter.
/// </summary>
/// <param name="read">Read object that is reset with the next alignment.</param>
/// <returns>true if <paramref name="read"/> was populated; otherwise false.</returns>
/// <exception cref="Exception">Thrown when the reader has already been disposed.</exception>
public bool GetNextAlignment(Read read)
{
    if (_bamReader == null)
    {
        throw new Exception("Already disposed.");
    }

    for (;;)
    {
        Region activeInterval = null;

        if (_rawAlignment == null)
        {
            _rawAlignment = new BamAlignment(); // first time pass
        }
        else
        {
            var chrIntervals = GetIntervalsForChr(_rawAlignment.RefID);

            // null signals not to apply interval jumping
            if (chrIntervals != null && !JumpIfNeeded(chrIntervals, out activeInterval))
            {
                Dispose();
                return false;
            }
        }

        var gotAlignment = _bamReader.GetNextAlignment(ref _rawAlignment, false);
        var rejectedByFilter = gotAlignment && (_bamIndexFilter > -1) && (_rawAlignment.RefID != _bamIndexFilter);
        if (!gotAlignment || rejectedByFilter)
        {
            Dispose();
            return false;
        }

        // read off the end of the interval - keep looping to jump to the next one or scan to the end
        var insideInterval = activeInterval == null || _rawAlignment.Position < activeInterval.EndPosition;
        if (!insideInterval)
        {
            continue;
        }

        var matchingReference = _references.FirstOrDefault(r => r.Index == _rawAlignment.RefID);
        read.Reset(matchingReference?.Name, _rawAlignment);
        return true;
    }
}
/// <summary>
/// Runs <c>ExecuteTest</c> on a soft-clipped read ("2S8M4S") against two VCF sites: an MNV the read
/// supports and a deletion it does not, and checks the found and matched variants.
/// </summary>
public void FindVariantMNVResults()
{
    var read = new BamAlignment();
    read.Bases = "AA" + "ACGTACGT" + "GGGG"; //vcf coords 12-345678910-11,12,13,14
    read.CigarData = new CigarAlignment("2S8M4S");
    read.Position = 3 - 1;
    read.Qualities = new byte[read.Bases.Length];

    var vs1 = new VariantSite();
    vs1.VcfReferencePosition = 4;
    vs1.VcfReferenceAllele = "TA";
    vs1.VcfAlternateAllele = "CG"; //read should match ALT for this test

    var vs2 = new VariantSite();
    vs2.VcfReferencePosition = 10;
    vs2.VcfReferenceAllele = "TTT";
    vs2.VcfAlternateAllele = "T";

    var vsFromVcf = new List<VariantSite>() { vs1, vs2 };

    //given a variant site, is it in the read?
    // Assertions use xUnit's (expected, actual) order so failure messages label the values correctly.
    ExecuteTest(read, 0, vsFromVcf, (foundVariants) =>
    {
        Assert.Equal(1, foundVariants[SubsequenceType.MatchOrMismatchSequence].Count);
        Assert.Equal(0, foundVariants[SubsequenceType.InsertionSquence].Count);
        Assert.Equal(0, foundVariants[SubsequenceType.DeletionSequence].Count);
    }, (matchedVariants) =>
    {
        Assert.Equal(4, matchedVariants[0].VcfReferencePosition);
        Assert.Equal("TA", matchedVariants[0].VcfReferenceAllele);
        Assert.Equal("CG", matchedVariants[0].VcfAlternateAllele);

        Assert.Equal(10, matchedVariants[1].VcfReferencePosition); //a deletion not supported by the reads
        Assert.Equal("T", matchedVariants[1].VcfReferenceAllele); //so we just return T>T, a reference call at this locus.
        Assert.Equal("T", matchedVariants[1].VcfAlternateAllele);
    });
}
/// <summary>
/// Checks if any of the conditions is true:
/// 1. The read is a duplicate,
/// 2. The read failed QC,
/// 3. The read is of low mapping quality (its mapping quality equals
///    <c>FragmentBinnerConstants.MappingQualityNotAvailable</c> or is below the threshold).
/// </summary>
/// <param name="alignment">Alignment to inspect.</param>
/// <param name="qualityThreshold">Mapping qualities strictly below this value count as low.</param>
/// <returns>true if at least one of the conditions holds; otherwise false.</returns>
public static bool IsDuplicateFailedQCLowQuality(BamAlignment alignment, uint qualityThreshold)
{
    if (alignment.IsDuplicate() || alignment.IsFailedQC())
    {
        return true;
    }

    return alignment.MapQuality == FragmentBinnerConstants.MappingQualityNotAvailable
           || alignment.MapQuality < qualityThreshold;
}
/// <summary>
/// Queues an alignment in the buffer selected by <paramref name="bufferNumber"/>. Once that buffer
/// holds at least <c>MAX_BUFFER_SIZE</c> alignments, all of them are written out while holding a
/// lock on <c>_alignmentBuffer</c>, and the buffer is then emptied.
/// </summary>
/// <param name="al">Alignment to queue.</param>
/// <param name="bufferNumber">Index of the buffer to append to.</param>
private void WriteAlignment(BamAlignment al, int bufferNumber)
{
    var pending = _alignmentBuffer[bufferNumber];
    pending.Add(al);

    if (pending.Count < MAX_BUFFER_SIZE)
    {
        return;
    }

    lock (_alignmentBuffer)
    {
        foreach (var queued in pending)
        {
            WriteAlignment(queued);
        }
    }

    // cleared outside the lock, exactly as before
    pending.Clear();
}
/// <summary>
/// Checks <c>BamAlignment.GetEndPosition()</c> for a CIGAR mixing matches, an insertion and a
/// deletion, and for a CIGAR consisting only of an insertion.
/// </summary>
public void GetEndPosition_Tests()
{
    // 5M + 19M + 3D starting at 500 is expected to end at 527
    var mixedOps = new BamAlignment()
    {
        Position = 500,
        CigarData = new CigarAlignment("5M7I19M3D")
    };
    Assert.Equal(527, mixedOps.GetEndPosition());

    // an insertion-only CIGAR is expected to end where it starts
    var insertionOnly = new BamAlignment()
    {
        Position = 500,
        CigarData = new CigarAlignment("3I")
    };
    Assert.Equal(500, insertionOnly.GetEndPosition());
}
/// <summary>
/// Builds a <c>Read</c> on the given chromosome from a freshly constructed <c>BamAlignment</c>.
/// </summary>
/// <param name="chr">Chromosome name passed to the <c>Read</c> constructor.</param>
/// <param name="sequence">Read bases.</param>
/// <param name="position">Alignment position.</param>
/// <param name="name">Read name.</param>
/// <param name="isMapped">When false the alignment is flagged unmapped.</param>
/// <param name="isPrimaryAlignment">When false the alignment is flagged secondary.</param>
/// <param name="isProperPair">Proper-pair flag.</param>
/// <param name="isDuplicate">Duplicate flag.</param>
/// <param name="mapQuality">Mapping quality (cast to uint).</param>
/// <param name="addCigarData">When true, a CIGAR of "&lt;sequence length&gt;M" is attached.</param>
/// <returns>The constructed read.</returns>
private Read CreateRead(string chr, string sequence, int position, string name, bool isMapped = true, bool isPrimaryAlignment = true, bool isProperPair = true, bool isDuplicate = false, int mapQuality = 10, bool addCigarData = true)
{
    var bam = new BamAlignment()
    {
        Bases = sequence,
        Position = position,
        Name = name,
        MapQuality = (uint)mapQuality
    };

    // the setters take the negated form of the "mapped"/"primary" arguments
    var unmapped = !isMapped;
    var secondary = !isPrimaryAlignment;
    bam.SetIsUnmapped(unmapped);
    bam.SetIsSecondaryAlignment(secondary);
    bam.SetIsDuplicate(isDuplicate);
    bam.SetIsProperPair(isProperPair);

    if (addCigarData)
    {
        var allMatchCigar = sequence.Length + "M";
        bam.CigarData = new CigarAlignment(allMatchCigar);
    }

    return new Read(chr, bam);
}
/// <summary>
/// Runs a stitcher processor on <paramref name="inBam"/> and compares the bases, position and
/// qualities of every alignment in <paramref name="expBam"/> with the alignment at the same
/// ordinal position in <paramref name="outBam"/>.
/// </summary>
/// <param name="inBam">Input BAM path.</param>
/// <param name="outBam">Path where the processor is expected to write its output; deleted first if present.</param>
/// <param name="expBam">BAM holding the expected alignments.</param>
/// <param name="outFolder">Output folder passed to the processor.</param>
/// <param name="threadbyChr">When true a <c>GenomeProcessor</c> is used, otherwise a <c>BamProcessor</c>.</param>
/// <param name="stitcherOptions">Options passed to the processor.</param>
private static void RunProcessorTest(string inBam, string outBam, string expBam, string outFolder, bool threadbyChr, StitcherOptions stitcherOptions)
{
    if (File.Exists(outBam))
    {
        File.Delete(outBam);
    }

    Logger.OpenLog(TestPaths.LocalScratchDirectory, "StitcherTestLog.txt", true);
    try
    {
        var processor = threadbyChr ? (IStitcherProcessor)new GenomeProcessor(inBam) : new BamProcessor();
        processor.Process(inBam, outFolder, stitcherOptions);
    }
    finally
    {
        // close the log even when processing throws, so a failing test does not leave it open
        Logger.CloseLog();
    }

    Assert.True(File.Exists(outBam));

    var observedAlignment = new BamAlignment();
    var expectedAlignment = new BamAlignment();

    using (var outReader = new BamReader(outBam))
    using (var expReader = new BamReader(expBam))
    {
        while (true)
        {
            var nextObservation = outReader.GetNextAlignment(ref observedAlignment, true);
            var nextExpected = expReader.GetNextAlignment(ref expectedAlignment, true);

            if ((nextExpected == false) || (expectedAlignment == null))
            {
                break;
            }

            // Without this check a short output file would be compared using whatever the
            // previous iteration left in observedAlignment.
            Assert.True(nextObservation, "Output BAM has fewer alignments than the expected BAM.");

            Assert.Equal(expectedAlignment.Bases, observedAlignment.Bases);
            Assert.Equal(expectedAlignment.Position, observedAlignment.Position);
            Assert.Equal(expectedAlignment.Qualities, observedAlignment.Qualities);
        }

        outReader.Close();
        expReader.Close();
    }
}
/// <summary>
/// Builds a synthetic <c>BamAlignment</c> from an abstract alignment. The bases are all 'A',
/// except that when the MNV length is positive a run of 'G' of that length is placed starting
/// at 1-based read offset <c>MNVdata.Item1</c>. Every quality is <paramref name="qualityForAll"/>.
/// </summary>
/// <param name="alignment">Source of the CIGAR, position and directions ("XD" tag).</param>
/// <param name="qualityForAll">Quality assigned to every base.</param>
/// <param name="MNVdata">Item1 = MNV position within the read, Item2 = MNV length.</param>
/// <returns>The constructed alignment, or null if anything throws while building it.</returns>
private static BamAlignment BuildRead(AbstractAlignment alignment, byte qualityForAll, Tuple<int, int> MNVdata)
{
    int mnvStart = MNVdata.Item1;
    int mnvLength = MNVdata.Item2;

    try
    {
        var cigar = new CigarAlignment(alignment.Cigar);
        int readLength = (int)cigar.GetReadSpan();

        string bases = new string('A', readLength);
        if (mnvLength > 0)
        {
            // leading A's, then the G run, then A's to pad out to the read length
            var leading = new string('A', mnvStart - 1);
            var mnvRun = new string('G', mnvLength);
            var trailing = new string('A', readLength - (leading.Length + mnvRun.Length));
            bases = leading + mnvRun + trailing;
        }

        var tags = new TagUtils();
        tags.AddStringTag("XD", alignment.Directions);

        return new BamAlignment()
        {
            RefID = 1,
            Position = alignment.Position - 1,
            CigarData = cigar,
            Bases = bases,
            TagData = tags.ToBytes(),
            Qualities = Enumerable.Repeat(qualityForAll, readLength).ToArray(),
            MapQuality = 50
        };
    }
    catch
    {
        // any failure while building the read is reported to the caller as null
        return null;
    }
}
/// <summary>
/// Checks <c>BamAlignment.GetStringTag</c> for a string tag, a char tag, an int tag (expected to
/// throw) and a tag that is absent (expected to yield null).
/// </summary>
public void GetTag_Tests()
{
    // create a tag
    TagUtils tagUtils = new TagUtils();
    tagUtils.AddIntTag("NM", 5);
    tagUtils.AddStringTag("XU", "ABCD");
    tagUtils.AddCharTag("XP", '?');
    byte[] tagData = tagUtils.ToBytes();

    var alignment = new BamAlignment() { TagData = tagData };

    // string tag scenarios
    Assert.Equal("ABCD", alignment.GetStringTag("XU"));
    Assert.Equal("?", alignment.GetStringTag("XP"));
    Assert.Throws<ApplicationException>(() => alignment.GetStringTag("NM"));

    // Assert.Null states the intent directly instead of comparing against a null literal
    Assert.Null(alignment.GetStringTag("AB"));
}
/// <summary>
/// Returns true when the two reads do not overlap. Reads that overlap according to
/// <c>OverlapsAlignment</c> always yield false. Otherwise, when <c>_treatHalfAnchoredAsUnanchored</c>
/// is set the reads are reported as non-overlapping; when it is not, they are non-overlapping only
/// if neither read's anchored region contains the other's unanchored ends.
/// </summary>
/// <param name="read1">First read.</param>
/// <param name="read2">Second read.</param>
/// <returns>true if the reads are considered non-overlapping.</returns>
private bool ReadsDoNotOverlap(BamAlignment read1, BamAlignment read2)
{
    if (read1.OverlapsAlignment(read2))
    {
        return false;
    }

    if (_treatHalfAnchoredAsUnanchored)
    {
        return true;
    }

    // Check for S/M overlap, if half-anchoring is allowed.
    // Both directions are always evaluated, as in the original.
    var firstCoversSecond = AnchoredRegionContainsUnanchoredEnds(read1, read2);
    var secondCoversFirst = AnchoredRegionContainsUnanchoredEnds(read2, read1);

    return !firstCoversSecond && !secondCoversFirst;
}
/// <summary>
/// Computes the sum of the mismatch count reported by <c>Helper.GetNumMismatches</c> (read bases
/// versus the genome snippet at the alignment's position) and the number of indel bases in the CIGAR.
/// </summary>
/// <param name="alignment">Alignment whose bases, position and CIGAR are used.</param>
/// <returns>Mismatches plus indel bases.</returns>
/// <exception cref="Exception">Thrown when the mismatch count comes back null.</exception>
public int GetNm(BamAlignment alignment)
{
    var readPositions = new PositionMap(alignment.Bases.Length);
    Read.UpdatePositionMap(alignment.Position + 1, alignment.CigarData, readPositions);

    var genomeSnippet = _genomeSnippetSource.GetGenomeSnippet(alignment.Position);
    var mismatchCount = Helper.GetNumMismatches(alignment.Bases, readPositions, genomeSnippet.Sequence, genomeSnippet.StartPosition);

    if (mismatchCount == null)
    {
        throw new Exception("Num mismatches is null");
    }

    var indelBaseCount = alignment.CigarData.NumIndelBases();
    return mismatchCount.Value + indelBaseCount;
}
/// <summary>
/// Runs <c>ExecuteTest</c> on a read carrying a 3-base insertion ("71M3I1M") against a single VCF
/// insertion site and checks the found and matched variants.
/// </summary>
public void ProcessInsertionReadTest()
{
    //chr12:121431782-121432182:COSM46441:TGC:TACCTA:chr12:121432185-121432585-1478/2_fwd 121432113 72M3S CGGGCCCCCCCCAGGGCCAGGCCCGGGACCTGCGCTGCCCGCTCACAGCTCCCCTGGCCTGCCTCCACCTACCTA
    //chr12:121431782-121432182:COSM46441:TGC:TACCTA:chr12:121432185-121432585-662/2_fwd 121432113 72M3S CGGGCCCCCCCCAGGGCCAGGCCCGGGACCTGCGCTGCCCGCTCACAGCTCCCCTGGCCTGCCTCCACCTACCTA
    //chr12:121431782-121432182:COSM46441:TGC:TACCTA:chr12:121432185-121432585-1308/2_rev 121432114 71M3I1M GGGCCCCCCCCAGGGCCAGGCCCGGGACCTGCGCTGCCCGCTCACAGCTCCCCTGGCCTGCCTCCACCTAC-CTA-C
    //chr12:121431782-121432182:COSM46441:TGC:TACCTA:chr12:121432185-121432585-64/2_rev 121432114 71M3I1M GGGCCCCCCCCAGGGCCAGGCCCGGGACCTGCGCTGCCCGCTCACAGCTCCCCTGGCCTGCCTCCACCTAC-TTA-C
    //chr12:121431782-121432182:COSM46441:TGC:TACCTA:chr12:121432185-121432585-1322/2_rev 121432114 75M GGGCCCCCCCCAGGGCCAGGCCCGGGACCTGCGCTGCCCGCTCACAGCTCCCCTGGCCTGCCTCCACCTGC-CCTC

    var read = new BamAlignment();
    read.Bases = "GGGCCCCCCCCAGGGCCAGGCCCGGGACCTGCGCTGCCCGCTCACAGCTCCCCTGGCCTGCCTCCACCTACCTAC";
    read.CigarData = new CigarAlignment("71M3I1M");
    read.Position = 121432114;
    read.Qualities = new byte[read.Bases.Length];

    var vs1 = new VariantSite();
    vs1.VcfReferencePosition = 121432185;
    vs1.VcfReferenceAllele = "C";
    vs1.VcfAlternateAllele = "CCTA"; //read should match ALT for this test

    var vsFromVcf = new List<VariantSite>() { vs1 };

    //given a variant site, is it in the read?
    // Assertions use xUnit's (expected, actual) order so failure messages label the values correctly.
    ExecuteTest(read, 0, vsFromVcf, (foundVariants) =>
    {
        Assert.Equal(2, foundVariants[SubsequenceType.MatchOrMismatchSequence].Count);
        Assert.Equal(1, foundVariants[SubsequenceType.InsertionSquence].Count);
        Assert.Equal(0, foundVariants[SubsequenceType.DeletionSequence].Count);
    }, (matchedVariants) =>
    {
        Assert.Equal(121432185, matchedVariants[0].VcfReferencePosition);
        Assert.Equal("C", matchedVariants[0].VcfReferenceAllele);
        Assert.Equal("CCTA", matchedVariants[0].VcfAlternateAllele);
    });
}
/// <summary>
/// Runs <c>ExecuteTest</c> on a read carrying an 18-base deletion ("68M18D7M") against a single VCF
/// deletion site and checks the found and matched variants.
/// </summary>
public void ProcessOneDeletionReadTest()
{
    //reads with deletions, S102
    // 16187-121416587:COSM21479:GCCAGCTGCAGACGGAGCTC:GT:chr12:121416607-121417007-1014/2_rev_121416520 121416520 75M AGGCGGCTAGCGTGGTGGACCCGGGCCGCGTGGCCCTGTGGCAGCCGAGCCATGGTTTCTAAACTGAGCCAGCTG
    //16187-121416587:COSM21479:GCCAGCTGCAGACGGAGCTC:GT:chr12:121416607-121417007-1484/2_fwd_121416520 121416520 68M18D7M AGGCGGCTAGCGTGGTGGACCCGGGCCGCGTGGCCCTGTGGCAGCCGAGCCATGGTTTCTAAACTGAGTCTGGCG
    //16187-121416587:COSM21479:GCCAGCTGCAGACGGAGCTC:GT:chr12:121416607-121417007-1320/2_rev_121416520 121416520 68M18D7M AGGCGGCTAGCGTGGTGGACCCGGGCCGCGTGGCCCTGTGGCAGCCGAGCCATGGTTTCTAAACTGAGTCTGGCG
    //16187-121416587:COSM21479:GCCAGCTGCAGACGGAGCTC:GT:chr12:121416607-121417007-1076/2_rev_121416520 121416520 68M18D7M AGGCGGCTAGCGTGGTGGACCCGGGCCGCGTGGCCCTGTGGCAGCCGAGCCATGGTTTCTAAACTGAGTCTGGCG
    //416187-121416587:COSM21479:GCCAGCTGCAGACGGAGCTC:GT:chr12:121416607-121417007-850/2_rev_121416520 121416520 75M AGGCGGCTAGCGTGGTGGACCCGGGCCGCGTGGCCCTGTGGCAGCCGAGCCATGGTTTCTAAACTGAGCCAGCTG

    var read = new BamAlignment();
    read.Bases = "AGGCGGCTAGCGTGGTGGACCCGGGCCGCGTGGCCCTGTGGCAGCCGAGCCATGGTTTCTAAACTGAGTCTGGCG";
    read.CigarData = new CigarAlignment("68M18D7M");
    read.Position = 121416520;
    read.Qualities = new byte[read.Bases.Length];

    var vs1 = new VariantSite();
    vs1.VcfReferencePosition = 121416588;
    vs1.VcfReferenceAllele = "GCCAGCTGCAGACGGAGCT";
    vs1.VcfAlternateAllele = "G"; //read should match ALT for this test

    var vsFromVcf = new List<VariantSite>() { vs1 };

    // Assertions use xUnit's (expected, actual) order so failure messages label the values correctly.
    ExecuteTest(read, 0, vsFromVcf, (foundVariants) =>
    {
        Assert.Equal(2, foundVariants[SubsequenceType.MatchOrMismatchSequence].Count);
        Assert.Equal(0, foundVariants[SubsequenceType.InsertionSquence].Count);
        Assert.Equal(1, foundVariants[SubsequenceType.DeletionSequence].Count);
    }, (matchedVariants) =>
    {
        Assert.Equal(121416588, matchedVariants[0].VcfReferencePosition);
        Assert.Equal("GCCAGCTGCAGACGGAGCT", matchedVariants[0].VcfReferenceAllele);
        Assert.Equal("G", matchedVariants[0].VcfAlternateAllele);
    });
}
/// <summary>
/// Records that a realignment was rejected and, when <c>_softclipUnknownIndels</c> is set and the
/// read has existing unsanctioned indels, soft-clips out each existing indel that is not among
/// <paramref name="existingMatches"/> and whose shorter anchor is at most 20.
/// </summary>
/// <param name="origBamAlignment">Alignment that is tagged and possibly soft-clipped.</param>
/// <param name="forcedSoftclip">Set to true when at least one indel is soft-clipped out.</param>
/// <param name="existingIndels">Indels already present in the read.</param>
/// <param name="realignResult">The rejected realignment result.</param>
/// <param name="hasExistingUnsanctionedIndels">Whether the read has unsanctioned indels.</param>
/// <param name="existingMatches">Existing indels that are excluded from soft-clipping.</param>
private void HandleFailedRealignment(BamAlignment origBamAlignment, ref bool forcedSoftclip, List<PreIndel> existingIndels, RealignmentResult realignResult, bool hasExistingUnsanctionedIndels, List<PreIndel> existingMatches)
{
    _statusCounter.AddStatusCount("INDEL STATUS\tRejected\t" + realignResult.Indels);
    _statusCounter.AppendStatusStringTag("RX", "Did not accept: " + realignResult.Indels, origBamAlignment);

    // TODO could this be happening because of a low-ranked indel? Maybe we should be allowing to realign against all indels...
    // TODO STILL should this actually be happening also to reads that had no indels to realign around (i.e. started with weak indel, and couldn't go anywhere), not just the ones that were changed?
    if (!_softclipUnknownIndels || !hasExistingUnsanctionedIndels)
    {
        return;
    }

    var clipCandidates = existingIndels
        .Where(indel => !existingMatches.Contains(indel))
        .OrderBy(indel => indel.ReferencePosition);

    foreach (var candidate in clipCandidates)
    {
        // clip from whichever side has the shorter anchor
        var leftIsShorter = candidate.LeftAnchor < candidate.RightAnchor;
        var shorterAnchor = leftIsShorter ? candidate.LeftAnchor : candidate.RightAnchor;

        // TODO arbitrary number here...
        // If it's pretty well-anchored, don't remove the indel
        if (shorterAnchor > 20)
        {
            continue;
        }

        forcedSoftclip = true;

        _statusCounter.AddStatusCount("Softclipped out bad indel");
        _statusCounter.AppendStatusStringTag("RX", $"Softclipped out bad indel({origBamAlignment.CigarData},{string.Join(",", existingIndels)}...{realignResult?.Indels}", origBamAlignment);
        _statusCounter.AddStatusCount("INDEL STATUS\tRemoved\t" + string.Join("|", existingIndels));

        OverlappingIndelHelpers.SoftclipAfterIndel(origBamAlignment, leftIsShorter, candidate.ReferencePosition);
    }
}
/// <summary>
/// Checks <c>Read.IsDuplex</c> for each combination of present, zero and missing read-count tags:
/// only the case where both counts are non-zero is expected to be duplex.
/// </summary>
public void ReadCollapsedCounts()
{
    var bam = new BamAlignment
    {
        Bases = "ACTC",
        Position = 5,
        MapQuality = 343,
        MatePosition = 12312,
        Qualities = new[] { (byte)20, (byte)21, (byte)30, (byte)40 },
        CigarData = new CigarAlignment("1S3M")
    };

    // both tags present and non-zero
    bam.TagData = DomainTestHelper.GetReadCountsTagData(5, 10);
    Assert.True(new Read("chr1", bam).IsDuplex);

    // first tag is 0
    bam.TagData = DomainTestHelper.GetReadCountsTagData(0, 5);
    Assert.False(new Read("chr1", bam).IsDuplex);

    // first tag is missing
    bam.TagData = DomainTestHelper.GetReadCountsTagData(null, 5);
    Assert.False(new Read("chr1", bam).IsDuplex);

    // second tag is 0
    bam.TagData = DomainTestHelper.GetReadCountsTagData(5, 0);
    Assert.False(new Read("chr1", bam).IsDuplex);

    // second tag is missing
    bam.TagData = DomainTestHelper.GetReadCountsTagData(5, null);
    Assert.False(new Read("chr1", bam).IsDuplex);

    // both tags 0
    bam.TagData = DomainTestHelper.GetReadCountsTagData(0, 0);
    Assert.False(new Read("chr1", bam).IsDuplex);

    // both tags missing
    bam.TagData = DomainTestHelper.GetReadCountsTagData(null, null);
    Assert.False(new Read("chr1", bam).IsDuplex);
}
/// <summary>
/// Checks that a <c>Read</c> constructed from a <c>BamAlignment</c> mirrors the alignment's
/// fields, with position and mate position shifted by one, and that every entry of the
/// direction map is <c>DirectionType.Forward</c>.
/// </summary>
public void FromBam()
{
    var alignment = new BamAlignment
    {
        Bases = "ATCTTA",
        Position = 100,
        MatePosition = 500,
        Name = "test",
        CigarData = new CigarAlignment("5M1S"),
        MapQuality = 10,
        Qualities = new[] { (byte)10, (byte)20, (byte)30 }
    };

    alignment.SetIsDuplicate(true);
    alignment.SetIsProperPair(true);
    alignment.SetIsSecondaryAlignment(true);
    alignment.SetIsUnmapped(true);

    var read = new Read("chr1", alignment);

    // Assertions use xUnit's (expected, actual) order: the alignment is the source of truth,
    // the read is the value under test.
    Assert.Equal("chr1", read.Chromosome);
    Assert.Equal(alignment.Bases, read.Sequence);
    Assert.Equal(alignment.Position + 1, read.Position);
    Assert.Equal(alignment.MatePosition + 1, read.MatePosition);
    Assert.Equal(alignment.Name, read.Name);
    Assert.Equal(alignment.CigarData, read.CigarData);
    Assert.Equal(alignment.IsMapped(), read.IsMapped);
    Assert.Equal(alignment.IsProperPair(), read.IsProperPair);
    Assert.Equal(alignment.IsPrimaryAlignment(), read.IsPrimaryAlignment);
    Assert.Equal(alignment.IsDuplicate(), read.IsPcrDuplicate);

    foreach (var direction in read.SequencedBaseDirectionMap)
    {
        Assert.Equal(DirectionType.Forward, direction);
    }

    for (var i = 0; i < read.Qualities.Length; i++)
    {
        Assert.Equal(alignment.Qualities[i], read.Qualities[i]);
    }
}
/// <summary>
/// Returns true when the alignment and its mate are both mapped, share the same reference ID and
/// their positions are no more than <c>_maxPairGap</c> apart.
/// </summary>
/// <param name="alignment">Alignment to inspect.</param>
/// <returns>true if the mate may overlap this alignment; otherwise false.</returns>
private bool MayOverlapMate(BamAlignment alignment)
{
    var bothMapped = alignment.IsMateMapped() && alignment.IsMapped();
    var sameReference = bothMapped && alignment.RefID == alignment.MateRefID;

    // the distance is only computed once the cheaper checks have passed
    return sameReference
           && !(Math.Abs(alignment.Position - alignment.MatePosition) > _maxPairGap);
}
/// <summary>
/// Decides whether an alignment should be skipped, recording a debug status count naming the
/// first reason that applies: non-zero mapping quality below <c>_minMapQuality</c> (only when
/// <c>_filterPairLowMapQ</c> is off), supplementary alignment, or improper pair (only when
/// <c>_filterForProperPairs</c> is on).
/// </summary>
/// <param name="alignment">Alignment to inspect.</param>
/// <returns>true if the alignment should be skipped; otherwise false.</returns>
protected override bool ShouldSkipRead(BamAlignment alignment)
{
    string skipReason = null;

    if (!_filterPairLowMapQ && alignment.MapQuality > 0 && alignment.MapQuality < _minMapQuality)
    {
        skipReason = "Skipped read below mapQ";
    }
    else if (alignment.IsSupplementaryAlignment())
    {
        skipReason = "Skipped supplementary";
    }
    else if (_filterForProperPairs && !alignment.IsProperPair())
    {
        skipReason = "Skipped improper pair";
    }

    if (skipReason == null)
    {
        return false;
    }

    _statusCounter.AddDebugStatusCount(skipReason);
    return true;
}
/// <summary>
/// Walks the read's CIGAR and returns one <c>IndelSite</c> per insertion ('I') or deletion ('D')
/// operation, in CIGAR order.
/// </summary>
/// <param name="read">Alignment whose position and CIGAR are scanned.</param>
/// <param name="totalIndelBases">Receives the summed length of all insertion and deletion operations.</param>
/// <returns>The indel sites found; empty when the CIGAR has no 'I' or 'D' operations.</returns>
public static List<IndelSite> GetIndelPositions(BamAlignment read, out int totalIndelBases)
{
    totalIndelBases = 0;

    // reference coordinate at which the current CIGAR operation starts
    int startIndexInReference = read.Position;
    var positions = new List<IndelSite>();
    var numOperations = read.CigarData.Count;

    for (var cigarOpIndex = 0; cigarOpIndex < numOperations; cigarOpIndex++)
    {
        var operation = read.CigarData[cigarOpIndex];
        var operationLength = (int)operation.Length;

        // true when the operation is the first or last one in the CIGAR
        var isTerminalOperation = cigarOpIndex == 0 || cigarOpIndex == numOperations - 1;

        switch (operation.Type)
        {
            case 'I':
                // an insertion sits between two adjacent reference positions
                positions.Add(new IndelSite(startIndexInReference - 1, startIndexInReference, operation.Type, operationLength, isTerminalOperation));
                totalIndelBases += operationLength;
                break;
            case 'D':
                // a deletion spans its length on the reference and is recorded with a negative length
                positions.Add(new IndelSite(startIndexInReference - 1, startIndexInReference + operationLength, operation.Type, -operationLength, isTerminalOperation));
                totalIndelBases += operationLength;
                break;
        }

        if (operation.IsReferenceSpan())
        {
            startIndexInReference += operationLength;
        }
    }

    return positions;
}
/// <summary>
/// Checks that <c>Read.Reset</c> picks up the new chromosome name and the alignment's current
/// mate position and duplicate flag, and that a stitched CIGAR supplied through XC tag data is
/// reflected in <c>StitchedCigar</c>.
/// </summary>
public void Reset()
{
    var bam = new BamAlignment
    {
        Bases = "ACTC",
        Position = 5,
        MapQuality = 343,
        MatePosition = 12312,
        Qualities = new[] { (byte)20, (byte)21, (byte)30, (byte)40 },
        CigarData = new CigarAlignment("1S3M")
    };
    bam.SetIsUnmapped(false);
    bam.SetIsSecondaryAlignment(false);
    bam.SetIsDuplicate(true);
    bam.SetIsProperPair(true);

    var target = new Read("chr1", bam);
    target.StitchedCigar = new CigarAlignment("7M");
    target.SequencedBaseDirectionMap = new[] { DirectionType.Forward, DirectionType.Reverse, DirectionType.Stitched, DirectionType.Reverse };

    // first reset: changed duplicate flag, mate position and chromosome
    bam.SetIsDuplicate(false);
    bam.MatePosition = 555;
    target.Reset("chr2", bam);

    Assert.Equal(556, target.MatePosition);
    Assert.False(target.IsPcrDuplicate);
    Assert.Equal("chr2", target.Chromosome);

    // second reset: alignment now carries a stitched CIGAR in its tag data
    var expectedStitchedCigar = "1S3M1S";
    bam.TagData = DomainTestHelper.GetXCTagData(expectedStitchedCigar);
    target.Reset("chr3", bam);

    Assert.Equal(556, target.MatePosition);
    Assert.False(target.IsPcrDuplicate);
    Assert.Equal("chr3", target.Chromosome);
    Assert.Equal(expectedStitchedCigar, target.StitchedCigar.ToString());
}
/// <summary>
/// Seek to the unaligned reads at the tail of the input file, and write every read whose
/// reference ID is -1 to the output file.
/// </summary>
/// <param name="writer">Writer that receives the unaligned reads.</param>
private void WriteUnalignedReads(BamWriter writer)
{
    Logger.WriteToLog("Writing unaligned reads");

    using (var bamReader = new BamReader(_inputFile))
    {
        // NOTE(review): the result of JumpToUnaligned is not checked; reads are scanned from
        // wherever the reader ends up - confirm this fallback is intended.
        bamReader.JumpToUnaligned();

        var current = new BamAlignment();
        while (bamReader.GetNextAlignment(ref current, false))
        {
            // skip over last reads that still carry a reference ID
            if (current.RefID == -1)
            {
                writer.WriteAlignment(current);
            }
        }
    }
}
/// <summary>
/// Builds a <c>BamAlignment</c> for tests. Positions are given 1-based and stored minus one.
/// When no CIGAR is supplied an all-match CIGAR of the sequence length is used; when no
/// qualities are supplied every base gets <paramref name="qualityForAll"/>. The mate is flagged
/// with the opposite strand.
/// </summary>
/// <param name="sequence">Read bases.</param>
/// <param name="position">Alignment position; stored as <c>position - 1</c>.</param>
/// <param name="matePosition">Mate position; stored as <c>matePosition - 1</c>.</param>
/// <param name="qualityForAll">Quality used for every base when <paramref name="qualities"/> is null.</param>
/// <param name="isReverseMapped">Reverse-strand flag for this read.</param>
/// <param name="mapQ">Mapping quality.</param>
/// <param name="qualities">Explicit qualities, or null.</param>
/// <param name="cigar">Explicit CIGAR, or null.</param>
/// <param name="name">Read name; "Alignment" when null.</param>
/// <param name="isFirstMate">First-mate flag.</param>
/// <returns>The constructed alignment.</returns>
public static BamAlignment CreateBamAlignment(string sequence, int position, int matePosition, byte qualityForAll, bool isReverseMapped, uint mapQ = 30, byte[] qualities = null, CigarAlignment cigar = null, string name = null, bool isFirstMate = true)
{
    var result = new BamAlignment
    {
        Bases = sequence,
        Position = position - 1,
        CigarData = cigar ?? new CigarAlignment(sequence.Length + "M"),
        Qualities = qualities ?? Enumerable.Repeat(qualityForAll, sequence.Length).ToArray(),
        MatePosition = matePosition - 1,
        TagData = new byte[0],
        RefID = 1,
        MateRefID = 1,
        Name = name ?? "Alignment"
    };

    result.SetIsFirstMate(isFirstMate);
    result.MapQuality = mapQ;

    // the mate is always placed on the opposite strand
    var mateIsReverse = !isReverseMapped;
    result.SetIsReverseStrand(isReverseMapped);
    result.SetIsMateReverseStrand(mateIsReverse);

    return result;
}
/// <summary>
/// Checks <c>BamAlignment.UpdateIntTagData</c>: a missing tag is not added by default, is added
/// when the third argument is true, and an existing tag is overwritten.
/// </summary>
public void UpdateIntTagData_Tests()
{
    TagUtils tagUtils = new TagUtils();
    byte[] tagData = tagUtils.ToBytes();

    var alignment = new BamAlignment() { TagData = tagData };

    // when there was not an NM tag to begin with
    // do not add if not found
    alignment.UpdateIntTagData("NM", 4);
    // Assert.Null states the intent directly instead of comparing against a null literal
    Assert.Null(alignment.GetIntTag("NM"));

    // add if not found
    alignment.UpdateIntTagData("NM", 4, true);
    Assert.Equal(4, alignment.GetIntTag("NM"));

    // when there was an NM tag to begin with
    alignment.UpdateIntTagData("NM", 3);
    Assert.Equal(3, alignment.GetIntTag("NM"));
}
/// <summary>
/// Loads a single read and checks its loading result and candidate direction against the
/// expectations of the given stitching scenario.
/// </summary>
/// <param name="read">Read to load.</param>
/// <param name="options">Application options passed to <c>LoadReads</c>.</param>
/// <param name="chrInfo">Chromosome reference passed to <c>LoadReads</c>.</param>
/// <param name="isVariant">Selects the scenario's variant expectations instead of the reference ones.</param>
/// <param name="scenario">Scenario supplying the expected loading and direction.</param>
/// <returns>
/// A two-element array (loading check result, direction check result), or a single-element
/// array holding a failure message when loading or either check returns null.
/// </returns>
public static string[] CheckReadLoading(BamAlignment read, PiscesApplicationOptions options, ChrReference chrInfo, bool isVariant, StitchingScenario scenario)
{
    // reference reads use the reference loading and a direction of "0"
    string expectedLoading = isVariant ? scenario.VarLoading : scenario.RefLoading;
    string expectedDirection = isVariant ? scenario.CandidateDirection : "0";

    var loaded = LoadReads(new List<BamAlignment>() { read }, options, chrInfo, isVariant, expectedLoading, expectedDirection);
    if (loaded == null)
    {
        return new string[] { "total fail to parse variant reads" };
    }

    //coverage check
    var loadingCheck = CheckLoading(scenario, 1, loaded.Item1, isVariant);
    var directionCheck = CheckCandidateDirection(isVariant, loaded.Item2, expectedDirection);

    if (loadingCheck == null)
    {
        return new string[] { "total fail to check loading" };
    }

    if (directionCheck == null)
    {
        return new string[] { "total fail to check direction" };
    }

    return new string[] { loadingCheck, directionCheck };
}
/// <summary>
/// Returns a very basic read based on the abstract alignment: the bases are all 'A' when
/// <c>Directions</c> ends with "F" and all 'T' otherwise, every quality and the mapping quality
/// are 30, and the direction map is derived from <c>Directions</c>.
/// </summary>
/// <returns>A read on "chr1" built from this alignment's CIGAR, position and directions.</returns>
public Read ToRead()
{
    const byte qualityForAll = 30;

    var parsedCigar = new CigarAlignment(Cigar);
    var spanLength = (int)parsedCigar.GetReadSpan();

    var bam = new BamAlignment
    {
        CigarData = parsedCigar,
        Position = Position - 1,
        RefID = 1,
        Bases = Directions.EndsWith("F") ? new string('A', spanLength) : new string('T', spanLength),
        Qualities = Enumerable.Repeat(qualityForAll, spanLength).ToArray()
    };
    bam.MapQuality = 30;

    var result = new Read("chr1", bam);
    result.SequencedBaseDirectionMap = new DirectionInfo(Directions).ToDirectionMap();
    return result;
}
/// <summary>
/// jumps to the specified position in the BAM file
/// </summary>
/// <param name="refID">Index of the reference sequence; values outside the reference list yield false.</param>
/// <param name="position">Position on the reference; values beyond the reference length yield false.</param>
/// <returns>true if we were successfully able to jump to the requested position</returns>
/// <exception cref="ApplicationException">
/// Thrown when the current offset does not line up with the end of the chunk just consumed.
/// </exception>
public bool Jump(int refID, int position)
{
    // sanity checks
    if (!_hasIndex) return false;
    // refID must be a valid index into the reference list (the previous check let refID == Count
    // and negative values through to the indexer below)
    if ((refID < 0) || (refID >= _referenceIndex.Count)) return false;
    if (position > _referenceIndex[refID].Length) return false;

    // calculate the candidate index regions
    BamIterator bamIterator;
    bool foundOffset = _index.GetOffsets(refID, position, out bamIterator);

    if (!foundOffset || (bamIterator.Offsets == null) || (bamIterator.Offsets.Length == 0)) return false;

    int currentOffsetIndex = -1;
    int lastOffsetIndex = bamIterator.Offsets.Length - 1;

    BamAlignment alignment = new BamAlignment();

    while (true)
    {
        // jump to the next chunk; before the first chunk is selected there is no
        // "current" chunk to compare against, so always advance
        if ((currentOffsetIndex < 0) ||
            (bamIterator.CurrentOffset == 0) ||
            (bamIterator.CurrentOffset >= bamIterator.Offsets[currentOffsetIndex].End))
        {
            // no more chunks
            if (currentOffsetIndex == lastOffsetIndex) return false;

            // sanity check
            if ((currentOffsetIndex >= 0) && (bamIterator.CurrentOffset != bamIterator.Offsets[currentOffsetIndex].End))
            {
                throw new ApplicationException(
                    string.Format(
                        "Found a potential bug in the BAM index routines. CurrentOffset ({0}) != Offsets[currentOffsetIndex].End ({1})",
                        bamIterator.CurrentOffset, bamIterator.Offsets[currentOffsetIndex].End));
            }

            // not adjacent chunks; then seek
            if ((currentOffsetIndex < 0) || (bamIterator.Offsets[currentOffsetIndex].End != bamIterator.Offsets[currentOffsetIndex + 1].Begin))
            {
                Seek(bamIterator.Offsets[currentOffsetIndex + 1].Begin);
                bamIterator.CurrentOffset = Tell();
            }

            currentOffsetIndex++;
        }

        // look for the desired alignment
        if (GetNextAlignment(ref alignment, false))
        {
            // no need to proceed
            if ((alignment.RefID != bamIterator.RefID) || (alignment.Position >= bamIterator.End))
            {
                return false;
            }

            if (IsOverlap(bamIterator.Begin, bamIterator.End, alignment))
            {
                // this is the read we're looking for
                break;
            }

            bamIterator.CurrentOffset = Tell();
        }
        else
        {
            // end of file or error
            return false;
        }
    }

    // reset the file position (since we already read past the good alignment)
    Seek(bamIterator.CurrentOffset);
    return true;
}
/// <summary>
/// Jump to unaligned reads (with no associated chromosome) at end of bam file.
/// </summary>
/// <returns>
/// true if the reader was positioned at the first alignment with reference ID -1; false when the
/// index reports no such reads or none is found before reading fails.
/// </returns>
public bool JumpToUnaligned()
{
    // sanity check: make sure we have unaligned reads
    if (_index.NumUnalignedWithoutCoordinates == 0) return false;

    // start from the last indexed BAM offset, or from the beginning when there is none
    ulong resumeOffset = _index.GetLargestBamOffset();
    if (resumeOffset == 0)
    {
        Rewind();
        resumeOffset = Tell();
    }
    else
    {
        Seek(resumeOffset);
    }

    // skip all of the alignments that are aligned, remembering the offset just before each read
    BamAlignment candidate = new BamAlignment();
    while (GetNextAlignment(ref candidate, false))
    {
        if (candidate.RefID == -1)
        {
            // reset the file position (since we already read past the good alignment)
            Seek(resumeOffset);
            return true;
        }

        resumeOffset = Tell();
    }

    // end of file or error
    return false;
}
/// <summary>
/// returns true if the alignment overlaps with the specified interval
/// </summary>
/// <param name="begin">Interval start; the alignment's end position must be at or after it.</param>
/// <param name="end">Interval end; the alignment's start position must be before it.</param>
/// <param name="alignment">Alignment to test.</param>
/// <returns>true when the alignment overlaps the interval; otherwise false.</returns>
private bool IsOverlap(int begin, int end, BamAlignment alignment)
{
    int start = alignment.Position;
    int stop = alignment.GetEndPosition();

    if (stop < begin)
    {
        return false;
    }

    return start < end;
}
/// <summary>
/// Serializes one alignment into the shared output buffer and hands it to Write().
/// The record is laid out as: block size, fixed core fields, null-terminated read name,
/// packed CIGAR, packed bases, base qualities, tag data.
/// </summary>
/// <param name="al">Alignment to write.</param>
/// <exception cref="ApplicationException">Thrown when the file has not been opened.</exception>
public void WriteAlignment(BamAlignment al)
{
    if (!IsOpen)
    {
        throw new ApplicationException("ERROR: Tried to write an alignment but the file has not been opened yet.");
    }

    // sizes of the variable-length sections
    uint nameLen = (uint)al.Name.Length + 1;            // +1 for the terminating null
    uint numBases = (uint)al.Bases.Length;
    uint numCigarOperations = (uint)al.CigarData.Count;
    uint packedCigarLen = numCigarOperations * 4;       // one 32-bit word per operation
    uint numEncodedBases = (numBases + 1) / 2;          // two bases per byte, rounded up
    uint tagDataLen = (uint)al.TagData.Length;

    uint dataBlockSize = nameLen + packedCigarLen + numEncodedBases + numBases + tagDataLen;
    uint alignBlockSize = BamConstants.CoreAlignmentDataLen + dataBlockSize;
    uint blockSize = alignBlockSize + 4;                // +4 for the leading block-size field

    // start a new block when this record would not fit into the current one
    if ((BlockOffset + blockSize) > MaxBlockSize)
    {
        FlushBlock();
    }

    // grow the scratch buffer when the record is larger than anything seen so far
    if (blockSize > _outputBuffer.Length)
    {
        _outputBuffer = new byte[blockSize + 1024];
    }

    int offset = 0;

    // block size
    BinaryIO.AddUIntBytes(ref _outputBuffer, ref offset, alignBlockSize);

    // fixed-size core fields
    uint binMapQualityNameLen = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
    uint flagCigarCount = (al.AlignmentFlag << 16) | numCigarOperations;

    BinaryIO.AddIntBytes(ref _outputBuffer, ref offset, al.RefID);
    BinaryIO.AddIntBytes(ref _outputBuffer, ref offset, al.Position);
    BinaryIO.AddUIntBytes(ref _outputBuffer, ref offset, binMapQualityNameLen);
    BinaryIO.AddUIntBytes(ref _outputBuffer, ref offset, flagCigarCount);
    BinaryIO.AddUIntBytes(ref _outputBuffer, ref offset, numBases);
    BinaryIO.AddIntBytes(ref _outputBuffer, ref offset, al.MateRefID);
    BinaryIO.AddIntBytes(ref _outputBuffer, ref offset, al.MatePosition);
    BinaryIO.AddIntBytes(ref _outputBuffer, ref offset, al.FragmentLength);

    // read name
    BinaryIO.AddNullTerminatedString(ref _outputBuffer, ref offset, al.Name);

    // packed CIGAR operations followed by the packed bases
    PackCigar(ref offset, ref _outputBuffer, al.CigarData);
    PackBases(ref offset, numEncodedBases, al.Bases);

    // base qualities
    int qualityCount = al.Qualities.Length;
    Buffer.BlockCopy(al.Qualities, 0, _outputBuffer, offset, qualityCount);
    offset += qualityCount;

    // tag data
    int tagCount = al.TagData.Length;
    Buffer.BlockCopy(al.TagData, 0, _outputBuffer, offset, tagCount);
    offset += tagCount;

    Write(_outputBuffer, blockSize);
}
/// <summary>
/// Use the CIGAR string to map bases to chromosome positions, and check whether we see the ref base or the
/// variant allele for our variants of interest.
/// </summary>
/// <param name="read">Read whose bases, qualities and CIGAR operations are walked.</param>
/// <param name="nextVariantIndex">
/// Index of the first variant that may still overlap the read. It is advanced locally while walking
/// the read; the caller's value is not modified.
/// </param>
private void ProcessReadBases(BamAlignment read, int nextVariantIndex)
{
    int position = read.Position;   // 0-based reference position of the next reference-consuming base
    int baseIndex = 0;              // index into read.Bases / read.Qualities
    int cigarCount = read.CigarData.Count;
    for (int opIndex = 0; opIndex < cigarCount; opIndex++)
    {
        CigarOp cigar = read.CigarData[opIndex];
        switch (cigar.Type)
        {
            case 'M':
            case '=':
            case 'X':
                // Alignment match, sequence match or sequence mismatch: consumes read and reference.
                for (int index = 0; index < cigar.Length; index++, position++, baseIndex++)
                {
                    for (int varIndex = nextVariantIndex; varIndex < this.Variants.Count; varIndex++)
                    {
                        VcfVariant variant = this.Variants[varIndex];

                        // Subtract 1: Vcf positions are 1-based, bam file positions are 0-based:
                        if (variant.ReferencePosition - 1 > position)
                        {
                            break;
                        }
                        if (variant.ReferencePosition - 1 < position)
                        {
                            // This variant lies behind us; never look at it again for this read.
                            nextVariantIndex++;
                            continue;
                        }

                        // Skip low-quality base calls.
                        if (read.Qualities[baseIndex] < MinimumBaseQScore)
                        {
                            continue;
                        }

                        char calledBase = read.Bases[baseIndex];
                        if (calledBase == variant.ReferenceAllele[0])
                        {
                            this.ReferenceCounts[varIndex]++;
                        }
                        if (calledBase == variant.VariantAlleles[0][0])
                        {
                            this.VariantCounts[varIndex]++;
                        }
                    }
                }
                break;

            case 'S':
            case 'I':
                // Soft clip or insertion: consumes read bases only.
                baseIndex += (int)cigar.Length;
                break;

            case 'D':
            case 'N':
                // Deletion or skipped reference region: consumes reference only.
                position += (int)cigar.Length;
                break;

            case 'H':
            case 'P':
                // Hard clip or padding: consumes neither read bases nor reference positions.
                break;

            default:
                // We don't know how to cope with this CIGAR operation; bail out!
                return;
        }
    }
}
/// <summary>
/// Records a file offset for every linear-index window that the alignment touches, unless that
/// window already holds a non-zero offset.
/// </summary>
/// <param name="offsets">Per-window offsets of one reference sequence; grown with zeros as needed.</param>
/// <param name="al">Alignment whose Position and GetEndPosition() determine the touched windows.</param>
/// <param name="offset">Offset to store in windows that are still zero.</param>
private static void AddOffset(ref List<ulong> offsets, ref BamAlignment al, ulong offset)
{
    int firstWindow = al.Position >> BamLidxShift;
    int lastWindow = (al.GetEndPosition() - 1) >> BamLidxShift;

    // make sure the list reaches up to and including the last touched window
    while (offsets.Count <= lastWindow)
    {
        offsets.Add(0);
    }

    // keep the earliest offset seen for each window; zero marks a window that is still unset
    for (int window = firstWindow; window <= lastWindow; window++)
    {
        if (offsets[window] == 0)
        {
            offsets[window] = offset;
        }
    }
}
/// <summary>
/// Reads the alignments of one chromosome from a bam file and marks the start position of every read
/// that passes the filters in <paramref name="observed"/>.
/// </summary>
/// <param name="bamFile">bam file to read alignments from; its index is expected at bamFile + ".bai".</param>
/// <param name="isPairedEnd">When true, reads that are not flagged as a proper pair are skipped.</param>
/// <param name="chromosome">Name of the reference sequence to load.</param>
/// <param name="coverageMode">
/// Binary writes 1 into observed.Data; every other mode calls observed.Set. GCContentWeighted
/// additionally stores the fragment length.
/// </param>
/// <param name="observed">Hit array, indexed by alignment start position, that receives the hits.</param>
/// <param name="fragmentLengths">
/// Receives, per alignment start position, the fragment length clamped to [0, Int16.MaxValue].
/// Only written when <paramref name="coverageMode"/> is GCContentWeighted.
/// </param>
static void LoadObservedAlignmentsBAM(string bamFile, bool isPairedEnd, string chromosome, CanvasCoverageMode coverageMode, HitArray observed, Int16[] fragmentLengths)
{
    // Sanity check: The .bai file must exist, in order for us to seek to our target chromosome!
    string indexPath = bamFile + ".bai";
    if (!File.Exists(indexPath))
    {
        throw new Exception(string.Format("Fatal error: Bam index not found at {0}", indexPath));
    }

    using (BamReader reader = new BamReader(bamFile))
    {
        int desiredRefIndex = reader.GetReferenceIndex(chromosome);
        if (desiredRefIndex == -1)
        {
            throw new ApplicationException(
                string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", chromosome, bamFile));
        }

        if (!reader.Jump(desiredRefIndex, 0))
        {
            // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this
            // .bam file. That is not uncommon e.g. for truseq amplicon.
            return;
        }

        int readCount = 0;
        int keptReadCount = 0;
        BamAlignment alignment = new BamAlignment();
        while (reader.GetNextAlignment(ref alignment, true))
        {
            // Quit as soon as we leave the desired reference sequence. This has to happen before the
            // filters below: otherwise filtered reads of the following chromosomes are read and counted
            // until one of them happens to pass every filter.
            if (alignment.RefID != desiredRefIndex)
            {
                break;
            }

            readCount++;

            // Flag check - Require reads to be aligned, passing filter, non-duplicate,
            // on the forward strand and the main alignment:
            if (!alignment.IsMapped()) continue;
            if (alignment.IsFailedQC()) continue;
            if (alignment.IsDuplicate()) continue;
            if (alignment.IsReverseStrand()) continue;
            if (!alignment.IsMainAlignment()) continue;

            // Require the alignment to start with 35 bases of non-indel
            // (a read without any CIGAR operation cannot satisfy this):
            if (alignment.CigarData.Count == 0) continue;
            if (alignment.CigarData[0].Type != 'M' || alignment.CigarData[0].Length < 35) continue;

            if (isPairedEnd && !alignment.IsProperPair()) continue;

            keptReadCount++;
            if (coverageMode == CanvasCoverageMode.Binary)
            {
                observed.Data[alignment.Position] = 1;
            }
            else
            {
                observed.Set(alignment.Position);
            }

            // store fragment size, make sure it's within Int16 range and is positive (simplification for now)
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                fragmentLengths[alignment.Position] =
                    Convert.ToInt16(Math.Max(Math.Min(Int16.MaxValue, alignment.FragmentLength), 0));
            }
        }

        Console.WriteLine("Kept {0} of {1} total reads", keptReadCount, readCount);
    }
}
/// <summary>
/// Reads the next alignment record from the file into <paramref name="alignment"/>.
/// </summary>
/// <param name="alignment">Alignment object that is overwritten with the decoded record.</param>
/// <param name="skipAdditionalParsing">
/// When true only the core fields, the read name and the CIGAR operations are decoded; Bases,
/// Qualities and TagData of <paramref name="alignment"/> are left untouched.
/// </param>
/// <returns>
/// true when a record was decoded; false when the file is not open, the record length is zero,
/// or fewer bytes than requested could be read.
/// </returns>
public bool GetNextAlignment(ref BamAlignment alignment, bool skipAdditionalParsing)
{
    if (!IsOpen)
    {
        return false;
    }

    // the record starts with its own length
    if (Read(ref _byteBuffer, 4) != 4)
    {
        return false;
    }

    uint alignmentDataLen = BitConverter.ToUInt32(_byteBuffer, 0);
    if (alignmentDataLen == 0)
    {
        return false;
    }

    // pull the whole record into the byte buffer
    if (Read(ref _byteBuffer, alignmentDataLen) != alignmentDataLen)
    {
        return false;
    }

    // packed core words
    uint binMapQualityNameLen = BitConverter.ToUInt32(_byteBuffer, 8);
    uint flagAndCigarCount = BitConverter.ToUInt32(_byteBuffer, 12);
    uint numBases = BitConverter.ToUInt32(_byteBuffer, 16);

    if (numBases > _sequenceBuffer.Length)
    {
        // For very long reads, re-allocate this buffer to twice the data length
        _sequenceBuffer = new char[numBases * 2];
    }

    uint readNameLen = binMapQualityNameLen & 0xff;
    uint numCigarOps = flagAndCigarCount & 0xffff;

    alignment.RefID = BitConverter.ToInt32(_byteBuffer, 0);
    alignment.Position = BitConverter.ToInt32(_byteBuffer, 4);
    alignment.Bin = (binMapQualityNameLen >> 16);
    alignment.MapQuality = ((binMapQualityNameLen >> 8) & 0xff);
    alignment.AlignmentFlag = flagAndCigarCount >> 16;
    alignment.MateRefID = BitConverter.ToInt32(_byteBuffer, 20);
    alignment.MatePosition = BitConverter.ToInt32(_byteBuffer, 24);
    alignment.FragmentLength = BitConverter.ToInt32(_byteBuffer, 28);

    // read name: stored with a trailing null that is not part of the name
    int offset = (int)BamConstants.CoreAlignmentDataLen;
    alignment.Name = Encoding.ASCII.GetString(_byteBuffer, offset, (int)(readNameLen - 1));
    offset += (int)readNameLen;

    // CIGAR operations: one 32-bit word each
    alignment.CigarData.Clear();
    for (uint remaining = numCigarOps; remaining > 0; remaining--)
    {
        uint cigarData = BitConverter.ToUInt32(_byteBuffer, offset);
        alignment.CigarData.Add(new CigarOp(BamConstants.CigarTypes[cigarData & BamConstants.CigarMask], cigarData >> BamConstants.CigarShift));
        offset += 4;
    }

    // callers that only need the core fields stop here
    if (skipAdditionalParsing)
    {
        return true;
    }

    // bases: two per byte, first base in the high nibble
    for (int baseIdx = 0; baseIdx < numBases; baseIdx++)
    {
        int packed = _byteBuffer[offset + (baseIdx >> 1)];
        int code = (baseIdx & 1) == 0 ? packed >> 4 : packed & 15;
        _sequenceBuffer[baseIdx] = _baseLookupTable[code];
    }

    offset += (int)((numBases + 1) / 2);
    alignment.Bases = new string(_sequenceBuffer, 0, (int)numBases);

    // qualities: one byte per base; the existing array is reused when its size already matches
    if ((alignment.Qualities == null) || (alignment.Qualities.Length != numBases))
    {
        alignment.Qualities = new byte[numBases];
    }

    Buffer.BlockCopy(_byteBuffer, offset, alignment.Qualities, 0, (int)numBases);
    offset += (int)numBases;

    // tags: whatever is left of the record
    int numTagBytes = (int)alignmentDataLen - offset;
    alignment.TagData = new byte[numTagBytes];
    Array.Copy(_byteBuffer, offset, alignment.TagData, 0, numTagBytes);

    return true;
}
/// <summary>
/// Assigns the fragment described by an alignment to a bin. The left-most read of a pair that passes
/// all filters increments a bin count; if its mate later turns out to be a duplicate, QC failure or
/// low mapping quality read, that increment is undone.
/// </summary>
/// <param name="alignment">Alignment to process.</param>
/// <param name="qualityThreshold">minimum mapping quality</param>
/// <param name="readNameToBinIndex">Read name to bin index, for fragments binned via their first read.</param>
/// <param name="samePositionReadNames">Names of reads seen so far whose position equals the mate position.</param>
/// <param name="usableFragmentCount">Running number of usable fragments.</param>
/// <param name="bins">Predefined bins.</param>
/// <param name="binIndexStart">Bin index from which to start searching for the best bin; advanced past bins that end at or before the fragment start.</param>
public static void BinOneAlignment(BamAlignment alignment, uint qualityThreshold, Dictionary<string, int> readNameToBinIndex,
    HashSet<string> samePositionReadNames, ref long usableFragmentCount, List<GenomicBin> bins, ref int binIndexStart)
{
    // only mapped, primary reads of properly paired fragments with a mapped mate are considered
    bool usablePair = alignment.IsMapped()
        && alignment.IsMateMapped()
        && alignment.IsPrimaryAlignment()
        && alignment.IsPaired()
        && alignment.IsProperPair();
    if (!usablePair)
    {
        return;
    }

    bool rejected = IsDuplicateFailedQCLowQuality(alignment, qualityThreshold);
    string readName = alignment.Name;

    // Was the fragment already binned through the mate?
    int mateBinIndex;
    if (readNameToBinIndex.TryGetValue(readName, out mateBinIndex))
    {
        // Undo the binning when this read is a duplicate, fails QC or has low mapping quality
        if (rejected)
        {
            usableFragmentCount--;
            bins[mateBinIndex].Count--;
        }

        readNameToBinIndex.Remove(readName); // clean up
        return;
    }

    if (rejected)
    {
        return;
    }

    // does this ever happen?
    if (alignment.RefID != alignment.MateRefID)
    {
        return;
    }

    // look at only one read of the pair
    if (IsRightMostInPair(alignment))
    {
        return;
    }

    // Both reads start at the same position: neither is "right-most", so use the name set to let
    // only the first one through.
    // NOTE(review): when the first read gets binned, its mate returns from the dictionary branch
    // above and the name stays in samePositionReadNames - confirm whether that is intended.
    if (alignment.Position == alignment.MatePosition)
    {
        if (samePositionReadNames.Remove(readName))
        {
            return;
        }

        samePositionReadNames.Add(readName);
    }

    // Janus-SRS-190: 0 when the information is unavailable
    if (alignment.FragmentLength == 0)
    {
        return;
    }

    int fragmentStart = alignment.Position;                            // 0-based, inclusive
    int fragmentStop = alignment.Position + alignment.FragmentLength;  // 0-based, exclusive

    // skip the bins that lie entirely to the left of the fragment
    while (binIndexStart < bins.Count && bins[binIndexStart].Stop <= fragmentStart)
    {
        binIndexStart++;
    }

    // all the remaining fragments are on the right of the last bin
    if (binIndexStart >= bins.Count)
    {
        return;
    }

    // now bins[binIndexStart].Stop > fragmentStart
    int bestBinIndex = FindBestBin(bins, binIndexStart, fragmentStart, fragmentStop);
    if (bestBinIndex < 0)
    {
        return;
    }

    // Bin the fragment
    usableFragmentCount++;
    bins[bestBinIndex].Count++;
    readNameToBinIndex[readName] = bestBinIndex;
}
/// <summary>
/// Tells whether a read must be rejected because at least one of the following holds:
/// 1. it is a duplicate,
/// 2. it failed QC,
/// 3. its mapping quality is unavailable or below the threshold.
/// </summary>
/// <param name="alignment">Read to check.</param>
/// <param name="qualityThreshold">Minimum mapping quality; lower values are rejected.</param>
/// <returns>true if the read is rejected for any of the reasons above; otherwise false.</returns>
public static bool IsDuplicateFailedQCLowQuality(BamAlignment alignment, uint qualityThreshold)
{
    return alignment.IsDuplicate()
        || alignment.IsFailedQC()
        || alignment.MapQuality == FragmentBinnerConstants.MappingQualityNotAvailable
        || alignment.MapQuality < qualityThreshold;
}
/// <summary>
/// Runs one read pair through BinOneAlignment under four mapping-quality scenarios and checks the
/// resulting count of the single bin. Assertions use the (expected, actual) argument order so that
/// failure messages report the values correctly.
/// </summary>
/// <param name="pos1">Position of the first read (and mate position of the second).</param>
/// <param name="pos2">Position of the second read (and mate position of the first).</param>
public void TestBinOneAlignment(int pos1, int pos2)
{
    uint qualityThreshold = 3;
    Dictionary<string, int> readNameToBinIndex = new Dictionary<string, int>();
    HashSet<string> samePositionReadNames = new HashSet<string>();
    long usableFragmentCount = 0;
    List<GenomicBin> bins = new List<GenomicBin>() { new GenomicBin("chr1", 100, 200, 50, 0) };
    int binIndexStart = 0;

    BamAlignment alignment1 = new BamAlignment();
    BamAlignment alignment2 = new BamAlignment();
    alignment1.Name = alignment2.Name = "ReadName";
    alignment1.AlignmentFlag = 0x1 | 0x2;
    alignment2.AlignmentFlag = 0x1 | 0x2;
    alignment1.Position = pos1;
    alignment1.MatePosition = pos2;
    alignment1.FragmentLength = 100;
    alignment2.Position = pos2;
    alignment2.MatePosition = pos1;
    alignment2.FragmentLength = -100;

    // Both reads pass filters
    alignment1.MapQuality = 10;
    alignment2.MapQuality = 10;
    FragmentBinner.BinTask.BinOneAlignment(alignment1, qualityThreshold, readNameToBinIndex,
        samePositionReadNames, ref usableFragmentCount, bins, ref binIndexStart);
    FragmentBinner.BinTask.BinOneAlignment(alignment2, qualityThreshold, readNameToBinIndex,
        samePositionReadNames, ref usableFragmentCount, bins, ref binIndexStart);
    Assert.AreEqual(1, bins[0].Count);

    // Only the first read passes filters
    bins[0].Count = 0; // reset bin count
    alignment2.MapQuality = 2; // below quality threshold of 3
    FragmentBinner.BinTask.BinOneAlignment(alignment1, qualityThreshold, readNameToBinIndex,
        samePositionReadNames, ref usableFragmentCount, bins, ref binIndexStart);
    FragmentBinner.BinTask.BinOneAlignment(alignment2, qualityThreshold, readNameToBinIndex,
        samePositionReadNames, ref usableFragmentCount, bins, ref binIndexStart);
    Assert.AreEqual(0, bins[0].Count);

    // Only the second read passes filters
    bins[0].Count = 0; // reset bin count
    alignment1.MapQuality = 2; // below quality threshold of 3
    alignment2.MapQuality = 10;
    FragmentBinner.BinTask.BinOneAlignment(alignment1, qualityThreshold, readNameToBinIndex,
        samePositionReadNames, ref usableFragmentCount, bins, ref binIndexStart);
    FragmentBinner.BinTask.BinOneAlignment(alignment2, qualityThreshold, readNameToBinIndex,
        samePositionReadNames, ref usableFragmentCount, bins, ref binIndexStart);
    Assert.AreEqual(0, bins[0].Count);

    // Both fail filters
    bins[0].Count = 0; // reset bin count
    alignment1.MapQuality = 2; // below quality threshold of 3
    alignment2.MapQuality = 2; // below quality threshold of 3
    FragmentBinner.BinTask.BinOneAlignment(alignment1, qualityThreshold, readNameToBinIndex,
        samePositionReadNames, ref usableFragmentCount, bins, ref binIndexStart);
    FragmentBinner.BinTask.BinOneAlignment(alignment2, qualityThreshold, readNameToBinIndex,
        samePositionReadNames, ref usableFragmentCount, bins, ref binIndexStart);
    Assert.AreEqual(0, bins[0].Count);
}
/// <summary>
/// Feeds one alignment, read in file order, into the index being built.
/// </summary>
/// <param name="alignment">Alignment that was just read.</param>
/// <param name="offset">File offset reported after the alignment was read; must exceed the previous one.</param>
/// <returns>
/// false when the alignment starts a new bin and has a negative reference ID (the reads without
/// coordinates have been reached); true otherwise.
/// </returns>
/// <exception cref="ApplicationException">
/// Thrown when positions decrease within a reference sequence, or when the offset did not increase.
/// </exception>
public bool UpdateReferenceIndex(ref BamAlignment alignment, ulong offset)
{
    // record the number of unaligned reads
    if (alignment.RefID < 0)
    {
        ++_numUnalignedWithoutCoordinates;
    }

    // a new reference sequence resets the bin tracking; within one, positions must not decrease
    bool sameReference = alignment.RefID == _lastRefID;
    if (!sameReference)
    {
        _lastRefID = alignment.RefID;
        _lastBin = uint.MaxValue;
    }
    else if (alignment.Position < _lastPosition)
    {
        throw new ApplicationException(
            string.Format(
                "ERROR: The BAM file is not sorted. An alignment ({0}) occurred before the preceding alignment ({1}).",
                alignment.Position, _lastPosition));
    }

    // offset list of the reference sequence: uses the offset at which this alignment starts
    if (alignment.RefID >= 0)
    {
        AddOffset(ref _index[alignment.RefID].OffsetList, ref alignment, _lastOffset);
    }

    if (alignment.Bin != _lastBin)
    {
        // close the region of the bin we were collecting
        if (_saveBin != uint.MaxValue)
        {
            AddBamRegion(ref _index[_saveRefID].RegionsDictionary, _saveBin, _saveOffset, _lastOffset);
        }

        // the reference sequence changed: store its offset range and its read counts under BamMaxBin
        if ((_lastBin == uint.MaxValue) && (_saveRefID != int.MinValue))
        {
            _endOffset = _lastOffset;
            AddBamRegion(ref _index[_saveRefID].RegionsDictionary, BamMaxBin, _beginOffset, _endOffset);
            AddBamRegion(ref _index[_saveRefID].RegionsDictionary, BamMaxBin, _numAligned, _numUnaligned);
            _numUnaligned = 0;
            _numAligned = 0;
            _beginOffset = _endOffset;
        }

        // start collecting the new bin
        _saveOffset = _lastOffset;
        _lastBin = alignment.Bin;
        _saveBin = _lastBin;
        _saveRefID = alignment.RefID;

        if (_saveRefID < 0)
        {
            _hasUnalignedReads = true;
            return false;
        }
    }

    if (offset <= _lastOffset)
    {
        throw new ApplicationException(
            "ERROR: While updating the BAM index, the offset did not increase after processing the last alignment.");
    }

    if (alignment.IsMapped())
    {
        ++_numAligned;
    }
    else
    {
        ++_numUnaligned;
    }

    _lastOffset = offset;
    _lastPosition = alignment.Position;
    return true;
}
/// <summary>
/// Step 2: Walk the reads of the chromosome of interest in a bam file and count reference and
/// variant alleles at the variants of interest.
/// </summary>
/// <param name="bamPath">Path of the bam file to scan.</param>
/// <exception cref="ArgumentException">Thrown when the chromosome name is not found in the bam file.</exception>
protected void ProcessBamFile(string bamPath)
{
    Console.WriteLine("{0} Looping over bam records from {1}", DateTime.Now, bamPath);

    int overallCount = 0;
    int nextVariantIndex = 0;
    using (BamReader reader = new BamReader(bamPath))
    {
        BamAlignment read = new BamAlignment();
        int refID = reader.GetReferenceIndex(this.Chromosome);
        if (refID < 0)
        {
            throw new ArgumentException(string.Format("Error: Chromosome name '{0}' does not match bam file at '{1}'", this.Chromosome, bamPath));
        }

        Console.WriteLine("Jump to refid {0} {1}", refID, this.Chromosome);
        reader.Jump(refID, 0);

        while (reader.GetNextAlignment(ref read, false))
        {
            // We're past our chromosome of interest.
            if (!read.HasPosition() || read.RefID > refID)
            {
                break;
            }

            // We're not yet on our chromosome of interest.
            if (read.RefID < refID)
            {
                continue;
            }

            overallCount++;
            if (overallCount % 1000000 == 0)
            {
                Console.WriteLine("Record {0} at {1}...", overallCount, read.Position);
            }

            // Skip over unaligned or other non-count-worthy reads:
            bool notCountWorthy = !read.IsPrimaryAlignment()
                || !read.IsMapped()
                || read.IsDuplicate()
                || read.MapQuality <= MinimumMapQ;
            if (notCountWorthy)
            {
                continue;
            }

            // Scan forward through the variants list, to keep up with our reads:
            for (; nextVariantIndex < this.Variants.Count; nextVariantIndex++)
            {
                if (this.Variants[nextVariantIndex].ReferencePosition >= read.Position)
                {
                    break;
                }
            }

            // No variants left: nothing more to count.
            if (nextVariantIndex >= this.Variants.Count)
            {
                break;
            }

            // If the read doesn't look like it has a reasonable chance of touching the next variant, continue:
            if (read.Position + 1000 < this.Variants[nextVariantIndex].ReferencePosition)
            {
                continue;
            }

            // This read potentially overlaps next variant (and further variants). Count bases!
            ProcessReadBases(read, nextVariantIndex);
        }
    }

    Console.WriteLine("Looped over {0} bam records in all", overallCount);
}
/// <summary>
/// Bins the fragments of the target chromosome: jumps to the chromosome in the BAM file and passes
/// every alignment found there to BinOneAlignment.
/// </summary>
/// <exception cref="Exception">Thrown when the BAM index file does not exist.</exception>
/// <exception cref="ApplicationException">
/// Thrown when the chromosome is unknown to the BAM file, when alignments are not sorted by position,
/// or when no paired alignment was seen.
/// </exception>
private void binFragments()
{
    // Sanity check: The BAM index file must exist, in order for us to seek to our target chromosome!
    if (!Bam.Index.Exists)
    {
        throw new Exception(string.Format("Fatal error: Bam index not found at {0}", Bam.Index.FullName));
    }

    long pairedAlignmentCount = 0; // keep track of paired alignments
    usableFragmentCount = 0;

    using (BamReader reader = new BamReader(Bam.BamFile.FullName))
    {
        int desiredRefIndex = reader.GetReferenceIndex(Chromosome);
        if (desiredRefIndex == -1)
        {
            throw new ApplicationException(
                string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", Chromosome, Bam.BamFile.FullName));
        }

        if (!reader.Jump(desiredRefIndex, 0))
        {
            // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this
            // .bam file. That is not uncommon e.g. for truseq amplicon.
            return;
        }

        var readNameToBinIndex = new Dictionary<string, int>();
        var samePositionReadNames = new HashSet<string>();
        int binIndexStart = 0;
        int prevPosition = -1;
        var alignment = new BamAlignment();

        while (reader.GetNextAlignment(ref alignment, true))
        {
            // quit if the current reference index is different from the desired reference index
            if (alignment.RefID != desiredRefIndex)
            {
                break;
            }

            // Make sure the BAM is properly sorted
            if (alignment.Position < prevPosition)
            {
                throw new ApplicationException(
                    string.Format("The alignment on {0} are not properly sorted in {1}: {2}", Chromosome, Bam.BamFile.FullName, alignment.Name));
            }

            prevPosition = alignment.Position;

            if (alignment.IsPaired())
            {
                pairedAlignmentCount++;
            }

            BinOneAlignment(alignment, FragmentBinnerConstants.MappingQualityThreshold, readNameToBinIndex,
                samePositionReadNames, ref usableFragmentCount, Bins, ref binIndexStart);
        }
    }

    if (pairedAlignmentCount == 0)
    {
        throw new ApplicationException(string.Format("No paired alignments found for {0} in {1}", Chromosome, Bam.BamFile.FullName));
    }
}
/// <summary>
/// Tells whether the read lies to the right of its mate, judged by start position only.
/// </summary>
/// <param name="alignment">Read to check.</param>
/// <returns>
/// true if the mate starts before this read; false if the mate starts at the same position or later.
/// </returns>
public static bool IsRightMostInPair(BamAlignment alignment)
{
    int matePosition = alignment.MatePosition;
    int ownPosition = alignment.Position;
    return matePosition < ownPosition;
}
/// <summary>
/// Builds the on-disk byte representation of one alignment in a freshly allocated array, so that it
/// can be flushed to the output file later. The record is laid out as: block size, fixed core fields,
/// null-terminated read name, packed CIGAR, packed bases, base qualities, tag data.
/// </summary>
/// <param name="al">Alignment to serialize.</param>
/// <returns>A new array holding the block-size field followed by the alignment record.</returns>
public static byte[] SerializeAlignment(ref BamAlignment al)
{
    // sizes of the variable-length sections
    uint nameLen = (uint)al.Name.Length + 1;            // +1 for the terminating null
    uint numBases = (uint)al.Bases.Length;
    uint numCigarOperations = (uint)al.CigarData.Count;
    uint packedCigarLen = numCigarOperations * 4;       // one 32-bit word per operation
    uint numEncodedBases = (numBases + 1) / 2;          // two bases per byte, rounded up
    uint tagDataLen = (uint)al.TagData.Length;

    uint dataBlockSize = nameLen + packedCigarLen + numEncodedBases + numBases + tagDataLen;
    uint alignBlockSize = BamConstants.CoreAlignmentDataLen + dataBlockSize;
    uint blockSize = alignBlockSize + 4;                // +4 for the leading block-size field

    byte[] buffer = new byte[blockSize];
    int offset = 0;

    // block size
    BinaryIO.AddUIntBytes(ref buffer, ref offset, alignBlockSize);

    // fixed-size core fields
    uint binMapQualityNameLen = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
    uint flagCigarCount = (al.AlignmentFlag << 16) | numCigarOperations;

    BinaryIO.AddIntBytes(ref buffer, ref offset, al.RefID);
    BinaryIO.AddIntBytes(ref buffer, ref offset, al.Position);
    BinaryIO.AddUIntBytes(ref buffer, ref offset, binMapQualityNameLen);
    BinaryIO.AddUIntBytes(ref buffer, ref offset, flagCigarCount);
    BinaryIO.AddUIntBytes(ref buffer, ref offset, numBases);
    BinaryIO.AddIntBytes(ref buffer, ref offset, al.MateRefID);
    BinaryIO.AddIntBytes(ref buffer, ref offset, al.MatePosition);
    BinaryIO.AddIntBytes(ref buffer, ref offset, al.FragmentLength);

    // read name
    BinaryIO.AddNullTerminatedString(ref buffer, ref offset, al.Name);

    // packed CIGAR operations followed by the packed bases
    PackCigar(ref offset, ref buffer, al.CigarData);
    PackBases(ref offset, ref buffer, numEncodedBases, al.Bases);

    // base qualities
    int qualityCount = al.Qualities.Length;
    Buffer.BlockCopy(al.Qualities, 0, buffer, offset, qualityCount);
    offset += qualityCount;

    // tag data
    int tagCount = al.TagData.Length;
    Buffer.BlockCopy(al.TagData, 0, buffer, offset, tagCount);
    offset += tagCount;

    return buffer;
}