/// <summary>
/// Populate the list of GenomicBin objects for this chromosome.
/// When <c>arguments.Bins</c> already contains bins, those predefined bins are filled in place
/// (their GC and Count are overwritten). Otherwise a new bin is appended to <c>arguments.Bins</c>
/// each time <c>arguments.BinSize</c> possible alignment positions have been accumulated.
/// </summary>
/// <param name="arguments">
/// Per-chromosome inputs: reference sequence, possible/observed alignments, coverage mode,
/// bin size, the bin list to fill, and (for GC-content-weighted mode) the per-position read GC
/// content and observed-vs-expected GC factors.
/// </param>
static void BinCountsForChromosome(BinTaskArguments arguments)
{
    // Per-position observed counts are capped at this value in the truncated and GC-weighted modes.
    const int maxCountPerPosition = 10;

    List<GenomicBin> bins = arguments.Bins;
    bool usePredefinedBins = bins.Any();
    int predefinedBinIndex = 0;
    GenericRead fastaEntry = arguments.FastaEntry;
    BinState currentBin = new BinState();
    string chr = arguments.Chromosome;
    BitArray possibleAlignments = arguments.PossibleAlignments;
    HitArray observedAlignments = arguments.ObservedAlignments;
    CanvasCoverageMode coverageMode = arguments.CoverageMode;
    int pos = usePredefinedBins ? bins[predefinedBinIndex].Start : 0;

    // Skip past leading Ns. The bounds check keeps a sequence made up entirely of 'n'
    // (or a start position beyond the sequence) from indexing past the end.
    while (pos < fastaEntry.Bases.Length && fastaEntry.Bases[pos].Equals('n'))
    {
        pos++;
    }

    // Parallel lists: the observed count and (GC-weighted mode only) the correction factor
    // for every possible-alignment position seen in the current bin.
    List<float> binPositions = new List<float>();
    List<int> binObservations = new List<int>();

    for (; pos < fastaEntry.Bases.Length; pos++)
    {
        // Sets the start of the bin
        if (currentBin.StartPosition == -1)
        {
            currentBin.StartPosition = pos;
        }

        // Count only non-N bases. The comparison must be against the character 'n':
        // comparing a base with the string "n" is never equal, which made every position
        // (Ns included) count towards the GC-percentage denominator.
        if (!fastaEntry.Bases[pos].Equals('n'))
        {
            currentBin.NucleotideCount++;
        }

        switch (fastaEntry.Bases[pos])
        {
            case 'C':
            case 'c':
            case 'G':
            case 'g':
                currentBin.GCCount++;
                break;
        }

        if (possibleAlignments[pos])
        {
            currentBin.PossibleCount++;
            currentBin.ObservedCount += observedAlignments.Data[pos];
            binObservations.Add(observedAlignments.Data[pos]);
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                binPositions.Add(arguments.ObservedVsExpectedGC[arguments.ReadGCContent[pos]]);
            }
        }

        // A bin is complete when we've seen the desired number of possible alignment positions,
        // or, with predefined bins, when we reach the last position of the current bin.
        bool binComplete = usePredefinedBins
            ? pos == bins[predefinedBinIndex].Stop - 1
            : currentBin.PossibleCount == arguments.BinSize;
        if (!binComplete)
        {
            continue;
        }

        if (coverageMode == CanvasCoverageMode.TruncatedDynamicRange)
        {
            // Truncated dynamic range: re-sum the observations with each position capped.
            currentBin.ObservedCount = 0;
            foreach (int observation in binObservations)
            {
                currentBin.ObservedCount += Math.Min(maxCountPerPosition, observation);
            }
        }

        if (coverageMode == CanvasCoverageMode.GCContentWeighted)
        {
            // Read GC content weighted: divide each observation by its correction factor, then cap.
            float weightedCount = 0;
            for (int i = 0; i < binObservations.Count; i++)
            {
                weightedCount += Math.Min(maxCountPerPosition, (float)binObservations[i] / binPositions[i]);
            }
            currentBin.ObservedCount = (int)Math.Round(weightedCount);
        }

        // A bin consisting solely of N bases has no counted nucleotides; report 0% GC
        // instead of dividing by zero.
        int gc = currentBin.NucleotideCount > 0
            ? (int)(100 * currentBin.GCCount / currentBin.NucleotideCount)
            : 0;

        if (usePredefinedBins)
        {
            bins[predefinedBinIndex].GC = gc;
            bins[predefinedBinIndex].Count = currentBin.ObservedCount;
            predefinedBinIndex++;
            if (predefinedBinIndex >= bins.Count)
            {
                break; // we have processed all the bins
            }
            pos = bins[predefinedBinIndex].Start - 1; // jump to right before the next predefined bin
        }
        else
        {
            // Note the pos + 1 to make the first three conform to bed specification
            GenomicBin bin = new GenomicBin(chr, currentBin.StartPosition, pos + 1, gc, currentBin.ObservedCount);
            bins.Add(bin);
        }

        // Reset all relevant variables
        currentBin.Reset();
        binObservations.Clear();
        binPositions.Clear();
    }
}
/// <summary>
/// Populate the list of GenomicBin objects for this chromosome.
/// If <c>arguments.Bins</c> is non-empty, its bins are treated as predefined and only their
/// GC and Count are updated; if it is empty, bins are created and appended whenever
/// <c>arguments.BinSize</c> possible alignment positions have been collected.
/// </summary>
/// <param name="arguments">
/// Inputs for one chromosome: the reference entry, possible and observed alignments,
/// the coverage mode, the bin size, the bin list to populate, and the GC-correction
/// lookups that are read only in GC-content-weighted mode.
/// </param>
static void BinCountsForChromosome(BinTaskArguments arguments)
{
    // Cap applied to each position's observed count in the truncated and GC-weighted modes.
    const int maxCountPerPosition = 10;

    List<GenomicBin> bins = arguments.Bins;
    bool usePredefinedBins = bins.Any();
    int predefinedBinIndex = 0;
    GenericRead fastaEntry = arguments.FastaEntry;
    BinState currentBin = new BinState();
    string chr = arguments.Chromosome;
    BitArray possibleAlignments = arguments.PossibleAlignments;
    HitArray observedAlignments = arguments.ObservedAlignments;
    CanvasCoverageMode coverageMode = arguments.CoverageMode;
    int pos = usePredefinedBins ? bins[predefinedBinIndex].Start : 0;

    // Skip past leading Ns, stopping at the end of the sequence so that an all-'n'
    // entry does not run off the end of the base array.
    while (pos < fastaEntry.Bases.Length && fastaEntry.Bases[pos].Equals('n'))
    {
        pos++;
    }

    // Observed counts and, in GC-weighted mode, matching correction factors for the
    // possible-alignment positions of the bin currently being built.
    List<float> binPositions = new List<float>();
    List<int> binObservations = new List<int>();

    for (; pos < fastaEntry.Bases.Length; pos++)
    {
        // Sets the start of the bin
        if (currentBin.StartPosition == -1)
        {
            currentBin.StartPosition = pos;
        }

        // Only non-N bases contribute to the nucleotide count. This has to be a char
        // comparison; a base never equals the string "n", so the previous string
        // comparison counted every position, Ns included.
        if (!fastaEntry.Bases[pos].Equals('n'))
        {
            currentBin.NucleotideCount++;
        }

        switch (fastaEntry.Bases[pos])
        {
            case 'C':
            case 'c':
            case 'G':
            case 'g':
                currentBin.GCCount++;
                break;
        }

        if (possibleAlignments[pos])
        {
            currentBin.PossibleCount++;
            currentBin.ObservedCount += observedAlignments.Data[pos];
            binObservations.Add(observedAlignments.Data[pos]);
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                binPositions.Add(arguments.ObservedVsExpectedGC[arguments.ReadGCContent[pos]]);
            }
        }

        // We've seen the desired number of possible alignment positions
        // (or reached the final position of the current predefined bin).
        if ((!usePredefinedBins && currentBin.PossibleCount == arguments.BinSize)
            || (usePredefinedBins && pos == bins[predefinedBinIndex].Stop - 1))
        {
            if (coverageMode == CanvasCoverageMode.TruncatedDynamicRange)
            {
                // Truncated dynamic range: rebuild the total from capped per-position counts.
                currentBin.ObservedCount = 0;
                foreach (int observation in binObservations)
                {
                    currentBin.ObservedCount += Math.Min(maxCountPerPosition, observation);
                }
            }

            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                // Read GC content weighted: scale each count by its correction factor, then cap.
                float tmpObservedCount = 0;
                for (int i = 0; i < binObservations.Count; i++)
                {
                    tmpObservedCount += Math.Min(maxCountPerPosition, (float)binObservations[i] / binPositions[i]);
                }
                currentBin.ObservedCount = (int)Math.Round(tmpObservedCount);
            }

            // With Ns excluded the nucleotide count can be zero for an all-N bin;
            // use 0% GC in that case rather than dividing by zero.
            int gc = 0;
            if (currentBin.NucleotideCount > 0)
            {
                gc = (int)(100 * currentBin.GCCount / currentBin.NucleotideCount);
            }

            if (usePredefinedBins)
            {
                bins[predefinedBinIndex].GC = gc;
                bins[predefinedBinIndex].Count = currentBin.ObservedCount;
                predefinedBinIndex++;
                if (predefinedBinIndex >= bins.Count)
                {
                    break; // we have processed all the bins
                }
                pos = bins[predefinedBinIndex].Start - 1; // jump to right before the next predefined bin
            }
            else
            {
                // Note the pos + 1 to make the first three conform to bed specification
                GenomicBin bin = new GenomicBin(chr, currentBin.StartPosition, pos + 1, gc, currentBin.ObservedCount);
                bins.Add(bin);
            }

            // Reset all relevant variables
            currentBin.Reset();
            binObservations.Clear();
            binPositions.Clear();
        }
    }
}
/// <summary>
/// Bin alignments for every chromosome of the reference and return the bins in reference order.
/// </summary>
/// <param name="referenceFile">Reference fasta file.</param>
/// <param name="binSize">Desired number of alignments per bin.</param>
/// <param name="coverageMode">Coverage mode; GCContentWeighted additionally triggers the read GC content computation.</param>
/// <param name="manifest">Manifest passed through to ComputeObservedVsExpectedGC.</param>
/// <param name="possibleAlignments">Per-chromosome BitArrays of possible alignments.</param>
/// <param name="observedAlignments">Per-chromosome HitArrays of observed alignments.</param>
/// <param name="fragmentLengths">Per-chromosome fragment length at each position; only read in GCContentWeighted mode.</param>
/// <param name="predefinedBins">Pre-defined bins. null if not available.</param>
/// <param name="outFile">Output file name passed through to ComputeObservedVsExpectedGC.</param>
/// <returns>A list of bins.</returns>
static List<GenomicBin> BinCounts(string referenceFile, int binSize, CanvasCoverageMode coverageMode, NexteraManifest manifest,
    Dictionary<string, BitArray> possibleAlignments,
    Dictionary<string, HitArray> observedAlignments,
    Dictionary<string, Int16[]> fragmentLengths,
    Dictionary<string, List<GenomicBin>> predefinedBins,
    string outFile)
{
    bool debugGCCorrection = false; // write value of GC bins and correction factor
    bool gcWeighted = coverageMode == CanvasCoverageMode.GCContentWeighted;

    var referenceByName = new Dictionary<string, GenericRead>();
    var chromosomeOrder = new List<string>();
    Int16 meanFragmentSize = 0;
    Int16 meanFragmentCutoff = 3;
    if (gcWeighted)
    {
        meanFragmentSize = MeanFragmentSize(fragmentLengths);
    }

    // Load every chromosome of the reference, remembering the order of appearance.
    using (FastaReader reader = new FastaReader(referenceFile))
    {
        GenericRead entry = new GenericRead();
        while (reader.GetNextEntry(ref entry))
        {
            chromosomeOrder.Add(entry.Name);
            referenceByName[entry.Name] = entry;
            entry = new GenericRead();
        }
    }

    // GC content of the forward read at every position along the genome (GC-weighted mode only).
    var readGCContent = new Dictionary<string, byte[]>();
    if (gcWeighted)
    {
        byte gcCap = (byte)numberOfGCbins;
        var normalizationTasks = new List<ThreadStart>();
        foreach (KeyValuePair<string, Int16[]> fragmentLengthsKVP in fragmentLengths)
        {
            string chr = fragmentLengthsKVP.Key;
            GenericRead reference = referenceByName[chr];
            ThreadStart task = () =>
            {
                Int16[] lengths = fragmentLengthsKVP.Value;

                // One GC value per reference position for the current chromosome.
                byte[] gcContent = new byte[reference.Bases.Length];

                // The last positions are left at zero so a fragment never reads past the end.
                int scanLimit = reference.Bases.Length - meanFragmentSize * meanFragmentCutoff - 1;
                for (int pos = 0; pos < scanLimit; pos++)
                {
                    // A zero length falls back to the mean; otherwise cap at cutoff * mean.
                    Int16 observedLength = lengths[pos];
                    Int16 currentFragment = observedLength == 0
                        ? meanFragmentSize
                        : Convert.ToInt16(Math.Min(observedLength, meanFragmentSize * meanFragmentCutoff));

                    // Count G/C bases of the "read" starting at pos in the reference.
                    uint gcCounter = 0;
                    int fragmentEnd = pos + currentFragment;
                    for (int i = pos; i < fragmentEnd; i++)
                    {
                        var nucleotide = reference.Bases[i];
                        if (nucleotide == 'C' || nucleotide == 'c' || nucleotide == 'G' || nucleotide == 'g')
                        {
                            gcCounter++;
                        }
                    }

                    gcContent[pos] = (byte)Math.Min(100 * gcCounter / currentFragment, gcCap);
                }

                // The result dictionary is shared between the worker threads.
                lock (readGCContent)
                {
                    readGCContent[chr] = gcContent;
                }
            };
            normalizationTasks.Add(task);
        }

        Console.WriteLine("{0} Launching normalization tasks.", DateTime.Now);
        Console.Out.Flush();
        Isas.Shared.Utilities.DoWorkParallelThreads(normalizationTasks);
        Console.WriteLine("{0} Normalization tasks complete.", DateTime.Now);
        Console.Out.Flush();
    }

    // Observed vs. expected read GC bin vector; stays empty outside GC-weighted mode.
    float[] observedVsExpectedGC;
    if (gcWeighted)
    {
        observedVsExpectedGC = ComputeObservedVsExpectedGC(observedAlignments, readGCContent, manifest, debugGCCorrection, outFile);
    }
    else
    {
        observedVsExpectedGC = new float[0];
    }

    // Queue one binning job per chromosome that has possible alignments
    // (and predefined bins, when predefined bins are supplied).
    var perChromosomeBins = new Dictionary<string, List<GenomicBin>>();
    var binningTasks = new List<ThreadStart>();
    foreach (KeyValuePair<string, GenericRead> referenceEntry in referenceByName)
    {
        string chr = referenceEntry.Key;
        if (!possibleAlignments.ContainsKey(chr)
            || (predefinedBins != null && !predefinedBins.ContainsKey(chr)))
        {
            continue;
        }

        List<GenomicBin> chromosomeBins = predefinedBins == null ? new List<GenomicBin>() : predefinedBins[chr];
        BinTaskArguments args = new BinTaskArguments
        {
            FastaEntry = referenceEntry.Value,
            Chromosome = chr,
            PossibleAlignments = possibleAlignments[chr],
            ObservedAlignments = observedAlignments[chr],
            CoverageMode = coverageMode,
            Bins = chromosomeBins,
            BinSize = binSize,
            ReadGCContent = gcWeighted ? readGCContent[chr] : null,
            ObservedVsExpectedGC = observedVsExpectedGC
        };
        perChromosomeBins[chr] = chromosomeBins;
        binningTasks.Add(() => { BinCountsForChromosome(args); });
    }

    Console.WriteLine("{0} Launch BinCountsForChromosome jobs...", DateTime.Now);
    Console.Out.WriteLine();
    Isas.Shared.Utilities.DoWorkParallelThreads(binningTasks);
    Console.WriteLine("{0} Completed BinCountsForChromosome jobs.", DateTime.Now);
    Console.Out.WriteLine();

    // Concatenate the per-chromosome results in reference order.
    var finalBins = new List<GenomicBin>();
    foreach (string chr in chromosomeOrder)
    {
        List<GenomicBin> chromosomeBins;
        if (perChromosomeBins.TryGetValue(chr, out chromosomeBins))
        {
            finalBins.AddRange(chromosomeBins);
        }
    }
    return finalBins;
}
/// <summary>
/// Bin alignments chromosome by chromosome and return all bins in the order the
/// chromosomes appear in the reference.
/// </summary>
/// <param name="referenceFile">Reference fasta file.</param>
/// <param name="binSize">Desired number of alignments per bin.</param>
/// <param name="coverageMode">Coverage mode; the read GC content is only computed for GCContentWeighted.</param>
/// <param name="manifest">Manifest handed on to ComputeObservedVsExpectedGC.</param>
/// <param name="possibleAlignments">Per-chromosome BitArrays of possible alignments.</param>
/// <param name="observedAlignments">Per-chromosome HitArrays of observed alignments.</param>
/// <param name="fragmentLengths">Per-chromosome fragment length at each position; only read in GCContentWeighted mode.</param>
/// <param name="predefinedBins">Pre-defined bins. null if not available.</param>
/// <param name="outFile">Output file name handed on to ComputeObservedVsExpectedGC.</param>
/// <returns>A list of bins.</returns>
static List<GenomicBin> BinCounts(string referenceFile, int binSize, CanvasCoverageMode coverageMode, NexteraManifest manifest,
    Dictionary<string, BitArray> possibleAlignments,
    Dictionary<string, HitArray> observedAlignments,
    Dictionary<string, Int16[]> fragmentLengths,
    Dictionary<string, List<GenomicBin>> predefinedBins,
    string outFile)
{
    bool debugGCCorrection = false; // write value of GC bins and correction factor
    bool weightByReadGC = coverageMode == CanvasCoverageMode.GCContentWeighted;

    var entriesByChromosome = new Dictionary<string, GenericRead>();
    var referenceOrder = new List<string>();
    Int16 meanFragmentSize = 0;
    Int16 meanFragmentCutoff = 3;
    if (weightByReadGC)
    {
        meanFragmentSize = MeanFragmentSize(fragmentLengths);
    }

    // Read each chromosome of the reference, keeping track of the file order.
    using (FastaReader reader = new FastaReader(referenceFile))
    {
        GenericRead current = new GenericRead();
        while (reader.GetNextEntry(ref current))
        {
            referenceOrder.Add(current.Name);
            entriesByChromosome[current.Name] = current;
            current = new GenericRead();
        }
    }

    // Calculate GC content of the forward read at every position along the genome.
    var readGCContent = new Dictionary<string, byte[]>();
    if (weightByReadGC)
    {
        byte gcCap = (byte)numberOfGCbins;
        var normalizationTasks = new List<ThreadStart>();
        foreach (KeyValuePair<string, Int16[]> fragmentLengthsKVP in fragmentLengths)
        {
            string chr = fragmentLengthsKVP.Key;
            GenericRead sequence = entriesByChromosome[chr];
            normalizationTasks.Add(new ThreadStart(() =>
            {
                Int16[] lengthAtPosition = fragmentLengthsKVP.Value;

                // Holds the GC content of the forward read at every position of this chromosome.
                byte[] gcContent = new byte[sequence.Bases.Length];

                // Stop early enough that the longest allowed fragment stays inside the sequence.
                int stop = sequence.Bases.Length - meanFragmentSize * meanFragmentCutoff - 1;
                for (int pos = 0; pos < stop; pos++)
                {
                    // Use the mean size where no length was recorded; otherwise cap the recorded length.
                    Int16 currentFragment;
                    if (lengthAtPosition[pos] == 0)
                    {
                        currentFragment = meanFragmentSize;
                    }
                    else
                    {
                        currentFragment = Convert.ToInt16(Math.Min(lengthAtPosition[pos], meanFragmentSize * meanFragmentCutoff));
                    }

                    // Tally G/C bases over the fragment that starts at pos.
                    int gcCounter = 0;
                    int fragmentEnd = pos + currentFragment;
                    for (int i = pos; i < fragmentEnd; i++)
                    {
                        var nucleotide = sequence.Bases[i];
                        bool isGC = nucleotide == 'C' || nucleotide == 'c' || nucleotide == 'G' || nucleotide == 'g';
                        if (isGC)
                        {
                            gcCounter++;
                        }
                    }

                    if (gcCounter < 0)
                    {
                        gcCounter = 0;
                    }
                    gcContent[pos] = (byte)Math.Min(100 * gcCounter / currentFragment, gcCap);
                }

                // Worker threads share the result dictionary, so writes are serialized.
                lock (readGCContent)
                {
                    readGCContent[chr] = gcContent;
                }
            }));
        }

        Console.WriteLine("{0} Launching normalization tasks.", DateTime.Now);
        Console.Out.Flush();
        Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(normalizationTasks);
        Console.WriteLine("{0} Normalization tasks complete.", DateTime.Now);
        Console.Out.Flush();
    }

    // Populate observed and expected read GC bin vectors (empty unless GC-weighted).
    float[] observedVsExpectedGC;
    if (weightByReadGC)
    {
        observedVsExpectedGC = ComputeObservedVsExpectedGC(observedAlignments, readGCContent, manifest, debugGCCorrection, outFile);
    }
    else
    {
        observedVsExpectedGC = new float[0];
    }

    // Build one binning task per chromosome that has possible alignments and,
    // if predefined bins were supplied, an entry among them.
    var perChromosomeBins = new Dictionary<string, List<GenomicBin>>();
    var binningTasks = new List<ThreadStart>();
    foreach (KeyValuePair<string, GenericRead> pair in entriesByChromosome)
    {
        string chr = pair.Key;
        bool skip = !possibleAlignments.ContainsKey(chr)
            || (predefinedBins != null && !predefinedBins.ContainsKey(chr));
        if (skip)
        {
            continue;
        }

        List<GenomicBin> binsForChromosome = predefinedBins == null ? new List<GenomicBin>() : predefinedBins[chr];
        BinTaskArguments args = new BinTaskArguments
        {
            FastaEntry = pair.Value,
            Chromosome = chr,
            PossibleAlignments = possibleAlignments[chr],
            ObservedAlignments = observedAlignments[chr],
            CoverageMode = coverageMode,
            Bins = binsForChromosome,
            BinSize = binSize,
            ReadGCContent = weightByReadGC ? readGCContent[chr] : null,
            ObservedVsExpectedGC = observedVsExpectedGC
        };
        perChromosomeBins[chr] = binsForChromosome;
        binningTasks.Add(new ThreadStart(() => { BinCountsForChromosome(args); }));
    }

    Console.WriteLine("{0} Launch BinCountsForChromosome jobs...", DateTime.Now);
    Console.Out.WriteLine();
    Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(binningTasks);
    Console.WriteLine("{0} Completed BinCountsForChromosome jobs.", DateTime.Now);
    Console.Out.WriteLine();

    // Stitch the per-chromosome bins together in reference order.
    var finalBins = new List<GenomicBin>();
    foreach (string chr in referenceOrder)
    {
        List<GenomicBin> binsForChromosome;
        if (perChromosomeBins.TryGetValue(chr, out binsForChromosome))
        {
            finalBins.AddRange(binsForChromosome);
        }
    }
    return finalBins;
}