/// <summary>
/// Builds the parallel <c>gcs</c> / <c>counts</c> arrays from the bins (restricted to on-target bins
/// when a manifest is set) and records in <c>withoutChrY</c> the array indices of every retained bin
/// that is not on chromosome Y.
/// </summary>
private void initialize()
{
    IEnumerable<SampleGenomicBin> onTargetBins =
        manifest == null ? bins : EnrichmentUtilities.GetOnTargetBins(bins, manifest);

    List<double> x = new List<double>();
    List<double> y = new List<double>();
    withoutChrY = new List<int>();

    foreach (var bin in onTargetBins)
    {
        // Transformed count (described as "variance stabilization" by the original author).
        double count = countTransformer(bin.Count);

        // Bins whose transformed count is +/-Infinity are dropped entirely.
        // NOTE(review): NaN is not filtered by this check - confirm countTransformer cannot return NaN.
        if (double.IsInfinity(count))
        {
            continue;
        }

        // Ordinal, case-insensitive comparison: culture-independent and allocation-free,
        // unlike lower-casing the name with the current culture first.
        string chrom = bin.GenomicBin.Chromosome;
        bool isChrY = string.Equals(chrom, "chry", System.StringComparison.OrdinalIgnoreCase)
                      || string.Equals(chrom, "y", System.StringComparison.OrdinalIgnoreCase);

        if (!isChrY)
        {
            // x.Count is the index this bin is about to occupy in x / y (and so in gcs / counts).
            withoutChrY.Add(x.Count);
        }

        x.Add(bin.GenomicBin.GC);
        y.Add(count);
    }

    gcs = x.ToArray();
    counts = y.ToArray();
}
/// <summary>
/// Groups the counts of autosomal bins by GC content. The original author notes that the bins are
/// assumed to be sorted by genomic coordinates.
/// </summary>
/// <param name="bins">Bins whose counts are to be collected.</param>
/// <param name="manifest">When non-null, only the bins returned by
/// <c>EnrichmentUtilities.GetOnTargetBins</c> are considered; when null, every bin is considered.</param>
/// <param name="countsByGC">Receives an array of <c>numberOfGCbins</c> lists; the list at index g
/// holds the counts of the considered autosomal bins whose GC value is g.</param>
/// <param name="counts">Receives the counts of all considered autosomal bins, in iteration order.</param>
public static void GetCountsByGC(List<SampleGenomicBin> bins, NexteraManifest manifest, out List<float>[] countsByGC, out List<float> counts)
{
    countsByGC = new List<float>[numberOfGCbins];
    counts = new List<float>(bins.Count);

    // Every GC slot starts out as an empty list.
    int slot = 0;
    while (slot < countsByGC.Length)
    {
        countsByGC[slot] = new List<float>();
        slot++;
    }

    IEnumerable<SampleGenomicBin> candidates;
    if (manifest == null)
    {
        candidates = bins;
    }
    else
    {
        candidates = EnrichmentUtilities.GetOnTargetBins(bins, manifest);
    }

    foreach (SampleGenomicBin candidate in candidates)
    {
        if (GenomeMetadata.SequenceMetadata.IsAutosome(candidate.GenomicBin.Chromosome))
        {
            // File the count under its GC value, then append it to the genome-wide list.
            countsByGC[candidate.GenomicBin.GC].Add(candidate.Count);
            counts.Add(candidate.Count);
        }
    }
}
/// <summary>
/// Simple GC normalization: each bin's count is rescaled by (global median / median of its GC value).
/// </summary>
/// <param name="bins">Bins whose counts are normalized in place.</param>
/// <param name="manifest">Forwarded to <c>EnrichmentUtilities.GetCountsByGC</c> to select the bins
/// used for estimating the medians. The rescaling itself is applied to every bin in
/// <paramref name="bins"/>.</param>
static void NormalizeByGC(List<SampleGenomicBin> bins, NexteraManifest manifest = null)
{
    // countsByGC[g] holds the counts of the bins with GC value g; counts holds all of them together.
    List<float>[] countsByGC;
    List<float> counts;
    EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsByGC, out counts);

    double globalMedian = Utilities.Median(counts);
    double?[] medians = new double?[countsByGC.Length];

    // One median per GC value. GC values with too few bins use a weighted median instead.
    for (int gc = 0; gc < countsByGC.Length; gc++)
    {
        if (countsByGC[gc].Count < defaultMinNumberOfBinsPerGC)
        {
            List<Tuple<float, float>> weightedCounts = GetWeightedCounts(countsByGC, gc);
            medians[gc] = Utilities.WeightedMedian(weightedCounts);
            continue;
        }

        medians[gc] = Utilities.Median(countsByGC[gc]);
    }

    // Rescale every bin; bins whose GC median is missing or not positive are left untouched.
    foreach (SampleGenomicBin bin in bins)
    {
        double? gcMedian = medians[bin.GenomicBin.GC];
        if (gcMedian.HasValue && gcMedian.Value > 0)
        {
            bin.Count = (float)(globalMedian * (double)bin.Count / gcMedian.Value);
        }
    }
}
/// <summary>
/// Drops bins whose GC value is shared by too few autosomal bins.
/// </summary>
/// <remarks>
/// Rationale given by the original author: GC normalization uses a per-GC median, and when few bins
/// share a GC value that normalization constant is unstable, so those bins should not be used.
/// </remarks>
/// <param name="bins">Genomic bins to filter.</param>
/// <param name="threshold">Minimum number of bins with the same GC content required to keep a bin.
/// The value actually applied is capped at
/// max(<c>minNumberOfBinsPerGCForWeightedMedian</c>, average number of counted bins per GC value).</param>
/// <param name="manifest">When non-null, only on-target bins contribute to the per-GC tallies.</param>
/// <returns>A new list with the bins that passed the filter, in their original order.</returns>
static List<SampleGenomicBin> RemoveBinsWithExtremeGC(List<SampleGenomicBin> bins, int threshold, NexteraManifest manifest = null)
{
    // Tally of counted bins per GC value, plus the overall number of counted bins.
    int[] binsPerGC = new int[EnrichmentUtilities.numberOfGCbins];
    double tallied = 0;

    IEnumerable<SampleGenomicBin> tallySource;
    if (manifest == null)
    {
        tallySource = bins;
    }
    else
    {
        tallySource = EnrichmentUtilities.GetOnTargetBins(bins, manifest);
    }

    foreach (SampleGenomicBin candidate in tallySource)
    {
        // Only autosomal bins are tallied; the original comment says the normalization factors
        // were computed on these.
        if (GenomeMetadata.SequenceMetadata.IsAutosome(candidate.GenomicBin.Chromosome))
        {
            binsPerGC[candidate.GenomicBin.GC]++;
            tallied++;
        }
    }

    int averageCountPerGC = Math.Max(minNumberOfBinsPerGCForWeightedMedian, (int)(tallied / binsPerGC.Length));
    int effectiveThreshold = Math.Min(threshold, averageCountPerGC);

    // Keep every bin (tallied or not) whose GC value is populated well enough.
    List<SampleGenomicBin> kept = new List<SampleGenomicBin>();
    foreach (SampleGenomicBin bin in bins)
    {
        if (binsPerGC[bin.GenomicBin.GC] >= effectiveThreshold)
        {
            kept.Add(bin);
        }
    }

    return kept;
}
/// <summary>
/// Variance stabilization by GC value: for bins whose GC value shows a wide interquartile range (IQR)
/// relative to the genome-wide IQR, the count's deviation from the GC-specific median is shrunk.
/// </summary>
/// <param name="bins">Bins whose counts are adjusted in place.</param>
/// <param name="manifest">Forwarded to <c>EnrichmentUtilities.GetCountsByGC</c> to select the bins
/// used for estimating quartiles.</param>
/// <returns>
/// <c>false</c>, leaving all counts untouched, when no GC value in the range 10..89 has
/// 2 * local IQR above the genome-wide IQR; <c>true</c> after the adjustment pass has run.
/// </returns>
static bool NormalizeVarianceByGC(List<SampleGenomicBin> bins, NexteraManifest manifest = null)
{
    // countsByGC[g] holds the counts of the bins with GC value g; counts holds all of them together.
    List<float>[] countsByGC;
    List<float> counts;
    EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsByGC, out counts);

    // Genome-wide quartiles.
    var globalQuartiles = Utilities.Quartiles(counts);

    // Per-GC quartiles and IQRs, both indexed by GC value.
    List<float> localIQR = new List<float>(countsByGC.Length);
    List<Tuple<float, float, float>> localQuartiles = new List<Tuple<float, float, float>>(countsByGC.Length);

    for (int gc = 0; gc < countsByGC.Length; gc++)
    {
        Tuple<float, float, float> gcQuartiles;
        float gcIqr;

        if (countsByGC[gc].Count == 0)
        {
            // No data for this GC value: store -1 sentinels.
            gcQuartiles = new Tuple<float, float, float>(-1f, -1f, -1f);
            gcIqr = -1f;
        }
        else if (countsByGC[gc].Count >= defaultMinNumberOfBinsPerGC)
        {
            gcQuartiles = Utilities.Quartiles(countsByGC[gc]);
            gcIqr = gcQuartiles.Item3 - gcQuartiles.Item1;
        }
        else
        {
            // Too few bins at this GC value: use weighted quantiles instead.
            List<Tuple<float, float>> weightedCounts = GetWeightedCounts(countsByGC, gc);
            double[] quartiles = Utilities.WeightedQuantiles(weightedCounts, new List<float>() { 0.25f, 0.5f, 0.75f });
            gcQuartiles = new Tuple<float, float, float>((float)quartiles[0], (float)quartiles[1], (float)quartiles[2]);
            gcIqr = (float)(quartiles[2] - quartiles[0]);
        }

        localQuartiles.Add(gcQuartiles);
        localIQR.Add(gcIqr);
    }

    float globalIQR = globalQuartiles.Item3 - globalQuartiles.Item1;

    // Count the GC values in 10..89 whose doubled IQR exceeds the genome-wide IQR.
    int wideGcValues = 0;
    for (int gc = 10; gc < 90; gc++)
    {
        if (globalIQR < localIQR[gc] * 2f)
        {
            wideGcValues++;
        }
    }

    if (wideGcValues < 1)
    {
        return false;
    }

    // Pull each affected count towards the median of its GC value.
    foreach (SampleGenomicBin bin in bins)
    {
        float scaledLocalIqr = localIQR[bin.GenomicBin.GC] * 0.8f;

        // Bins whose GC value is not wide enough (including the -1 sentinel) are left untouched.
        if (globalIQR >= scaledLocalIqr)
        {
            continue;
        }

        // Ratio of the scaled GC-specific IQR to the genome-wide IQR.
        float iqrRatio = scaledLocalIqr / globalIQR;
        float gcMedian = localQuartiles[bin.GenomicBin.GC].Item2;
        bin.Count = gcMedian + (bin.Count - gcMedian) / iqrRatio;
    }

    return true;
}