/// <summary>
/// Allocates the per-chromosome arrays for one chromosome of a fasta file and registers them in the
/// three supplied dictionaries: possible alignments, observed alignments and fragment lengths.
/// </summary>
/// <param name="fastaFile">Fasta file containing uniquemer-marked reference genome.</param>
/// <param name="chromosome">Name of the chromosome to load; also used as the key in all three dictionaries.</param>
/// <param name="coverageMode">Coverage mode; fragment lengths get one slot per base only for GCContentWeighted.</param>
/// <param name="possibleAlignments">Receives a BitArray with one bit per base, set where the reference base is upper-case.</param>
/// <param name="observedAlignments">Receives a new HitArray sized to the chromosome length.</param>
/// <param name="fragmentLengths">Receives an Int16 array: chromosome-length in GCContentWeighted mode, empty otherwise.</param>
static void InitializeAlignmentArrays(string fastaFile, string chromosome, CanvasCoverageMode coverageMode,
    IDictionary<string, BitArray> possibleAlignments,
    IDictionary<string, HitArray> observedAlignments,
    IDictionary<string, Int16[]> fragmentLengths)
{
    string sequence = FastaLoader.LoadFastaSequence(fastaFile, chromosome);
    int sequenceLength = sequence.Length;

    BitArray uniquePositions = new BitArray(sequenceLength);
    possibleAlignments[chromosome] = uniquePositions;
    observedAlignments[chromosome] = new HitArray(sequenceLength);

    // Fragment lengths are only tracked per position in the GC-content-weighted mode.
    bool trackFragmentLengths = coverageMode == CanvasCoverageMode.GCContentWeighted;
    fragmentLengths[chromosome] = new Int16[trackFragmentLengths ? sequenceLength : 0];

    // Unique k-mers are flagged in the fasta file by upper-case letters.
    for (int position = 0; position < sequenceLength; position++)
    {
        if (!char.IsUpper(sequence[position]))
        {
            continue;
        }
        uniquePositions[position] = true;
    }
}
/// <summary>
/// Calculates how many possible alignments correspond to the desired number of observed alignments per bin.
/// </summary>
/// <param name="countsPerBin">Desired number of observed alignments per bin.</param>
/// <param name="possibleAlignments">BitArrays of possible alignments (unique mers), keyed by chromosome.</param>
/// <param name="observedAlignments">HitArrays storing the observed alignments, keyed by chromosome.</param>
/// <param name="manifest">
/// Optional manifest. When supplied, counting is restricted to the manifest regions of each chromosome
/// and chromosomes without any manifest region are skipped.
/// </param>
/// <returns>
/// <paramref name="countsPerBin"/> divided by the median observed/possible rate over the autosomes,
/// truncated to an integer.
/// </returns>
static int CalculateNumberOfPossibleAlignmentsPerBin(int countsPerBin, Dictionary<string, BitArray> possibleAlignments, Dictionary<string, HitArray> observedAlignments, NexteraManifest manifest = null)
{
    List<double> rates = new List<double>();

    Dictionary<string, List<NexteraManifest.ManifestRegion>> manifestRegionsByChrom = null;
    if (manifest != null)
    {
        manifestRegionsByChrom = manifest.GetManifestRegionsByChromosome();
    }

    List<ThreadStart> tasks = new List<ThreadStart>();
    foreach (string chr in possibleAlignments.Keys)
    {
        // We don't want to include the sex chromosomes because they may not be copy number 2
        if (!GenomeMetadata.SequenceMetadata.IsAutosome(chr))
        {
            continue;
        }

        HitArray observed = observedAlignments[chr];
        BitArray possible = possibleAlignments[chr];

        // With a manifest, only chromosomes that have manifest regions contribute a rate.
        List<NexteraManifest.ManifestRegion> regions = null;
        if (manifestRegionsByChrom != null && !manifestRegionsByChrom.TryGetValue(chr, out regions))
        {
            continue;
        }

        tasks.Add(new ThreadStart(() =>
        {
            int numberObserved = observed.CountSetBits(regions);
            int numberPossible = CountSetBits(possible, regions);

            // A chromosome without any possible alignment position would yield 0/0 = NaN (or infinity),
            // which would corrupt the median; it carries no rate information, so leave it out.
            if (numberPossible == 0)
            {
                return;
            }

            double rate = numberObserved / (double)numberPossible;

            // The tasks run on separate threads and share the rates list.
            lock (rates)
            {
                rates.Add(rate);
            }
        }));
    }

    Console.WriteLine("Launch CalculateNumberOfPossibleAlignmentsPerBin jobs...");
    Console.Out.WriteLine();
    // TODO: allow controlling the degree of parallelism
    Isas.Shared.Utilities.DoWorkParallelThreads(tasks);
    Console.WriteLine("CalculateNumberOfPossibleAlignmentsPerBin jobs complete.");
    Console.Out.WriteLine();

    double medianRate = CanvasCommon.Utilities.Median(rates);
    return (int)(countsPerBin / medianRate);
}
/// <summary>
/// Exposes the data held by this object as per-chromosome dictionaries.
/// </summary>
/// <param name="possibleAlignments">Receives the result of converting the stored possible-alignment data together with the stored bits-in-last-byte information.</param>
/// <param name="observedAlignments">Receives one new HitArray per stored observed-alignment entry, built from that entry.</param>
/// <param name="fragmentLengths">Receives the stored fragment-length values under the same keys (the values are assigned as-is, not cloned).</param>
public void GetData(out Dictionary<string, BitArray> possibleAlignments,
    out Dictionary<string, HitArray> observedAlignments,
    out Dictionary<string, Int16[]> fragmentLengths)
{
    possibleAlignments = Convert(PossibleAlignments, BitsInLastBytePossibleAlignments);

    // Wrap every stored observed-alignment entry in a fresh HitArray.
    observedAlignments = new Dictionary<string, HitArray>();
    foreach (string chromosome in ObservedAlignments.Keys)
    {
        var storedHits = ObservedAlignments[chromosome];
        observedAlignments[chromosome] = new HitArray(storedHits);
    }

    // Fragment lengths are handed over under the same keys without copying the values.
    fragmentLengths = new Dictionary<string, Int16[]>();
    foreach (string chromosome in FragmentLengths.Keys)
    {
        fragmentLengths[chromosome] = FragmentLengths[chromosome];
    }
}
/// <summary>
/// Zeroes every observed alignment count at a position that is not marked as a possible alignment.
/// Chromosomes present in <paramref name="possibleAlignments"/> but absent from
/// <paramref name="observedAlignments"/> are ignored.
/// </summary>
/// <param name="observedAlignments">Observed alignment counts per chromosome; modified in place.</param>
/// <param name="possibleAlignments">BitArrays of possible alignment positions per chromosome.</param>
static void ScreenObservedTags(IDictionary<string, HitArray> observedAlignments, IDictionary<string, BitArray> possibleAlignments)
{
    foreach (KeyValuePair<string, BitArray> entry in possibleAlignments)
    {
        HitArray hits;
        if (!observedAlignments.TryGetValue(entry.Key, out hits))
        {
            continue;
        }

        BitArray allowed = entry.Value;
        int positionCount = allowed.Length;
        for (int position = 0; position < positionCount; position++)
        {
            if (allowed[position])
            {
                continue;
            }
            // Not a possible alignment position: discard whatever was observed here.
            hits.Data[position] = 0;
        }
    }
}
/// <summary>
/// Populate the list of GenomicBin objects for this chromosome.
/// </summary>
/// <remarks>
/// When <c>arguments.Bins</c> already contains bins, their GC and Count values are filled in
/// (predefined-bin mode). Otherwise a new bin is appended each time <c>arguments.BinSize</c> possible
/// alignment positions have been seen. A trailing, incomplete bin at the end of the sequence is not emitted.
/// </remarks>
/// <param name="arguments">Chromosome name, reference bases, possible/observed alignments, coverage mode and bin list.</param>
static void BinCountsForChromosome(BinTaskArguments arguments)
{
    // Per-position hit counts are capped at this value in the truncated and GC-weighted modes.
    const int MaxHitsPerPosition = 10;

    List<GenomicBin> bins = arguments.Bins;
    bool usePredefinedBins = bins.Any();
    int predefinedBinIndex = 0;
    GenericRead fastaEntry = arguments.FastaEntry;
    BinState currentBin = new BinState();
    string chr = arguments.Chromosome;
    BitArray possibleAlignments = arguments.PossibleAlignments;
    HitArray observedAlignments = arguments.ObservedAlignments;
    CanvasCoverageMode coverageMode = arguments.CoverageMode;

    int pos = usePredefinedBins ? bins[predefinedBinIndex].Start : 0;

    // Skip past leading Ns (bounded, so an all-N sequence cannot run off the end).
    while (pos < fastaEntry.Bases.Length && fastaEntry.Bases[pos] == 'n')
    {
        pos++;
    }

    List<float> binPositions = new List<float>();
    List<int> binObservations = new List<int>();

    for (; pos < fastaEntry.Bases.Length; pos++)
    {
        // Sets the start of the bin
        if (currentBin.StartPosition == -1)
        {
            currentBin.StartPosition = pos;
        }

        var currentBase = fastaEntry.Bases[pos];

        // Compare against the character 'n'. The previous comparison against the string "n" was
        // never equal, so Ns were counted as nucleotides and diluted the GC percentage.
        if (currentBase != 'n')
        {
            currentBin.NucleotideCount++;
        }

        switch (currentBase)
        {
            case 'C':
            case 'c':
            case 'G':
            case 'g':
                currentBin.GCCount++;
                break;
        }

        if (possibleAlignments[pos])
        {
            currentBin.PossibleCount++;
            currentBin.ObservedCount += observedAlignments.Data[pos];
            binObservations.Add(observedAlignments.Data[pos]);
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                binPositions.Add(arguments.ObservedVsExpectedGC[arguments.ReadGCContent[pos]]);
            }
        }

        // A bin is complete once the desired number of possible alignment positions has been seen,
        // or, for predefined bins, once the last position of the current bin has been processed.
        bool binComplete = usePredefinedBins
            ? pos == bins[predefinedBinIndex].Stop - 1
            : currentBin.PossibleCount == arguments.BinSize;
        if (!binComplete)
        {
            continue;
        }

        if (coverageMode == CanvasCoverageMode.TruncatedDynamicRange)
        {
            // Truncated dynamic range: recompute the count with each position capped.
            currentBin.ObservedCount = 0;
            foreach (int hits in binObservations)
            {
                currentBin.ObservedCount += Math.Min(MaxHitsPerPosition, hits);
            }
        }
        else if (coverageMode == CanvasCoverageMode.GCContentWeighted)
        {
            // Read GC content weighted: divide each position's hits by its weight, then cap.
            float weightedCount = 0;
            for (int i = 0; i < binObservations.Count; i++)
            {
                weightedCount += Math.Min(MaxHitsPerPosition, (float)binObservations[i] / binPositions[i]);
            }
            currentBin.ObservedCount = (int)Math.Round(weightedCount);
        }

        // A bin made up entirely of Ns has no nucleotides; report 0% GC instead of dividing by zero.
        int gc = currentBin.NucleotideCount > 0
            ? (int)(100 * currentBin.GCCount / currentBin.NucleotideCount)
            : 0;

        if (usePredefinedBins)
        {
            bins[predefinedBinIndex].GC = gc;
            bins[predefinedBinIndex].Count = currentBin.ObservedCount;
            predefinedBinIndex++;
            if (predefinedBinIndex >= bins.Count)
            {
                break; // we have processed all the bins
            }
            pos = bins[predefinedBinIndex].Start - 1; // jump to right before the next predefined bin
        }
        else
        {
            // Note the pos + 1 to make the first three conform to bed specification
            GenomicBin bin = new GenomicBin(chr, currentBin.StartPosition, pos + 1, gc, currentBin.ObservedCount);
            bins.Add(bin);
        }

        // Reset all relevant variables
        currentBin.Reset();
        binObservations.Clear();
        binPositions.Clear();
    }
}
/// <summary>
/// Reads the alignments of one chromosome from a bam file and records the start position of every
/// read that passes the filters.
/// </summary>
/// <param name="bamFile">Bam file to read alignments from. An index file named bamFile + ".bai" must exist.</param>
/// <param name="isPairedEnd">When true, only properly paired reads are kept.</param>
/// <param name="chromosome">Name of the reference sequence whose reads are loaded.</param>
/// <param name="coverageMode">Binary mode stores 1 at a hit position; other modes call <c>observed.Set</c>. Fragment lengths are stored only for GCContentWeighted.</param>
/// <param name="observed">Hit array for this chromosome; updated in place.</param>
/// <param name="fragmentLengths">Per-position fragment lengths; written only in GCContentWeighted mode.</param>
/// <exception cref="FileNotFoundException">The bam index file does not exist.</exception>
/// <exception cref="ApplicationException">The chromosome is not a reference sequence of the bam file.</exception>
static void LoadObservedAlignmentsBAM(string bamFile, bool isPairedEnd, string chromosome, CanvasCoverageMode coverageMode, HitArray observed, Int16[] fragmentLengths)
{
    // Sanity check: The .bai file must exist, in order for us to seek to our target chromosome!
    string indexPath = bamFile + ".bai";
    if (!File.Exists(indexPath))
    {
        throw new FileNotFoundException(string.Format("Fatal error: Bam index not found at {0}", indexPath), indexPath);
    }

    using (BamReader reader = new BamReader(bamFile))
    {
        int desiredRefIndex = reader.GetReferenceIndex(chromosome);
        if (desiredRefIndex == -1)
        {
            throw new ApplicationException(
                string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", chromosome, bamFile));
        }

        if (!reader.Jump(desiredRefIndex, 0))
        {
            // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this
            // .bam file. That is not uncommon e.g. for truseq amplicon.
            return;
        }

        int readCount = 0;
        int keptReadCount = 0;

        // NOTE(review): the header text is not used here; the call is retained in case the reader
        // relies on it having been made - confirm and remove if it has no side effects.
        reader.GetHeader();

        BamAlignment alignment = new BamAlignment();
        while (reader.GetNextAlignment(ref alignment, true))
        {
            // Quit as soon as we leave the desired reference. Checking this before the flag filters
            // keeps filtered reads of other references (and unplaced reads, RefID -1) from being
            // scanned and counted.
            if (alignment.RefID != desiredRefIndex)
            {
                break;
            }

            readCount++;

            // Flag check - Require reads to be aligned, passing filter, non-duplicate,
            // forward-strand, main alignments:
            if (!alignment.IsMapped()) { continue; }
            if (alignment.IsFailedQC()) { continue; }
            if (alignment.IsDuplicate()) { continue; }
            if (alignment.IsReverseStrand()) { continue; }
            if (!alignment.IsMainAlignment()) { continue; }

            // Require the alignment to start with 35 bases of non-indel:
            if (alignment.CigarData[0].Type != 'M' || alignment.CigarData[0].Length < 35) { continue; }

            if (isPairedEnd && !alignment.IsProperPair()) { continue; }

            keptReadCount++;
            if (coverageMode == CanvasCoverageMode.Binary)
            {
                observed.Data[alignment.Position] = 1;
            }
            else
            {
                observed.Set(alignment.Position);
            }

            // store fragment size, make sure it's within Int16 range and is positive (simplification for now)
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                fragmentLengths[alignment.Position] = Convert.ToInt16(Math.Max(Math.Min(Int16.MaxValue, alignment.FragmentLength), 0));
            }
        }

        Console.WriteLine("Kept {0} of {1} total reads", keptReadCount, readCount);
    }
}
/// <summary>
/// Exposes the data held by this object as per-chromosome dictionaries.
/// </summary>
/// <param name="possibleAlignments">Receives the result of converting the stored possible-alignment data together with the stored bits-in-last-byte information.</param>
/// <param name="observedAlignments">Receives one new HitArray per stored observed-alignment entry, built from that entry.</param>
/// <param name="fragmentLengths">Receives the stored fragment-length values under the same keys (the values are assigned as-is, not cloned).</param>
public void GetData(out Dictionary<string, BitArray> possibleAlignments,
    out Dictionary<string, HitArray> observedAlignments,
    out Dictionary<string, Int16[]> fragmentLengths)
{
    possibleAlignments = Convert(PossibleAlignments, BitsInLastBytePossibleAlignments);

    // Wrap every stored observed-alignment entry in a fresh HitArray.
    observedAlignments = new Dictionary<string, HitArray>();
    foreach (string chromosome in ObservedAlignments.Keys)
    {
        var storedHits = ObservedAlignments[chromosome];
        observedAlignments[chromosome] = new HitArray(storedHits);
    }

    // Fragment lengths are handed over under the same keys without copying the values.
    fragmentLengths = new Dictionary<string, Int16[]>();
    foreach (string chromosome in FragmentLengths.Keys)
    {
        fragmentLengths[chromosome] = FragmentLengths[chromosome];
    }
}
/// <summary>
/// Reads the alignments of one chromosome from a bam file and records the start position of every
/// read that passes the filters.
/// </summary>
/// <param name="bamFile">Bam file to read alignments from. An index file named bamFile + ".bai" must exist.</param>
/// <param name="isPairedEnd">When true, only properly paired reads are kept.</param>
/// <param name="chromosome">Name of the reference sequence whose reads are loaded.</param>
/// <param name="coverageMode">Binary mode stores 1 at a hit position; other modes call <c>observed.Set</c>. Fragment lengths are stored only for GCContentWeighted.</param>
/// <param name="observed">Hit array for this chromosome; updated in place.</param>
/// <param name="fragmentLengths">Per-position fragment lengths; written only in GCContentWeighted mode.</param>
/// <exception cref="FileNotFoundException">The bam index file does not exist.</exception>
/// <exception cref="ApplicationException">The chromosome is not a reference sequence of the bam file.</exception>
static void LoadObservedAlignmentsBAM(string bamFile, bool isPairedEnd, string chromosome, CanvasCoverageMode coverageMode, HitArray observed, Int16[] fragmentLengths)
{
    // Sanity check: The .bai file must exist, in order for us to seek to our target chromosome!
    string indexPath = bamFile + ".bai";
    if (!File.Exists(indexPath))
    {
        throw new FileNotFoundException(string.Format("Fatal error: Bam index not found at {0}", indexPath), indexPath);
    }

    using (BamReader reader = new BamReader(bamFile))
    {
        int desiredRefIndex = reader.GetReferenceIndex(chromosome);
        if (desiredRefIndex == -1)
        {
            throw new ApplicationException(
                string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", chromosome, bamFile));
        }

        if (!reader.Jump(desiredRefIndex, 0))
        {
            // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this
            // .bam file. That is not uncommon e.g. for truseq amplicon.
            return;
        }

        int readCount = 0;
        int keptReadCount = 0;

        // NOTE(review): the header text is not used here; the call is retained in case the reader
        // relies on it having been made - confirm and remove if it has no side effects.
        reader.GetHeader();

        BamAlignment alignment = new BamAlignment();
        while (reader.GetNextAlignment(ref alignment, true))
        {
            // Quit as soon as we leave the desired reference. Checking this before the flag filters
            // keeps filtered reads of other references (and unplaced reads, RefID -1) from being
            // scanned and counted.
            if (alignment.RefID != desiredRefIndex)
            {
                break;
            }

            readCount++;

            // Flag check - Require reads to be aligned, passing filter, non-duplicate,
            // forward-strand, main alignments:
            if (!alignment.IsMapped()) { continue; }
            if (alignment.IsFailedQC()) { continue; }
            if (alignment.IsDuplicate()) { continue; }
            if (alignment.IsReverseStrand()) { continue; }
            if (!alignment.IsMainAlignment()) { continue; }

            // Require the alignment to start with 35 bases of non-indel:
            if (alignment.CigarData[0].Type != 'M' || alignment.CigarData[0].Length < 35) { continue; }

            if (isPairedEnd && !alignment.IsProperPair()) { continue; }

            keptReadCount++;
            if (coverageMode == CanvasCoverageMode.Binary)
            {
                observed.Data[alignment.Position] = 1;
            }
            else
            {
                observed.Set(alignment.Position);
            }

            // store fragment size, make sure it's within Int16 range and is positive (simplification for now)
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                fragmentLengths[alignment.Position] = Convert.ToInt16(Math.Max(Math.Min(Int16.MaxValue, alignment.FragmentLength), 0));
            }
        }

        Console.WriteLine("Kept {0} of {1} total reads", keptReadCount, readCount);
    }
}
/// <summary>
/// Allocates the per-chromosome arrays for one chromosome of a fasta file and registers them in the
/// three supplied dictionaries: possible alignments, observed alignments and fragment lengths.
/// </summary>
/// <param name="fastaFile">Fasta file containing uniquemer-marked reference genome.</param>
/// <param name="chromosome">Name of the chromosome to load; also used as the key in all three dictionaries.</param>
/// <param name="coverageMode">Coverage mode; fragment lengths get one slot per base only for GCContentWeighted.</param>
/// <param name="possibleAlignments">Receives a BitArray with one bit per base, set where the reference base is upper-case.</param>
/// <param name="observedAlignments">Receives a new HitArray sized to the chromosome length.</param>
/// <param name="fragmentLengths">Receives an Int16 array: chromosome-length in GCContentWeighted mode, empty otherwise.</param>
static void InitializeAlignmentArrays(string fastaFile, string chromosome, CanvasCoverageMode coverageMode,
    IDictionary<string, BitArray> possibleAlignments,
    IDictionary<string, HitArray> observedAlignments,
    IDictionary<string, Int16[]> fragmentLengths)
{
    string sequence = FastaLoader.LoadFastaSequence(fastaFile, chromosome);
    int sequenceLength = sequence.Length;

    BitArray uniquePositions = new BitArray(sequenceLength);
    possibleAlignments[chromosome] = uniquePositions;
    observedAlignments[chromosome] = new HitArray(sequenceLength);

    // Fragment lengths are only tracked per position in the GC-content-weighted mode.
    bool trackFragmentLengths = coverageMode == CanvasCoverageMode.GCContentWeighted;
    fragmentLengths[chromosome] = new Int16[trackFragmentLengths ? sequenceLength : 0];

    // Unique k-mers are flagged in the fasta file by upper-case letters.
    for (int position = 0; position < sequenceLength; position++)
    {
        if (!char.IsUpper(sequence[position]))
        {
            continue;
        }
        uniquePositions[position] = true;
    }
}