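public static void GetCountsByGC(List<SampleGenomicBin> bins, NexteraManifest manifest, out List<float>[] countsByGC, out List<float> counts) |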
||
bins | List<SampleGenomicBin> |
Bins whose counts are to be normalized |
manifest | NexteraManifest | |
countsByGC | out List<float>[] |
An array of lists. Each array element (0-100) will hold a list of counts whose bins have the same GC content. |
counts | out List<float> |
Will hold all of the autosomal counts present in 'bins' |
return | void |
/// <summary>
/// Performs a simple GC normalization: every bin count is rescaled by the ratio of the
/// global median count to the median count of the bins sharing its GC content.
/// </summary>
/// <param name="bins">Bins whose counts are to be normalized. Counts are updated in place.</param>
/// <param name="manifest">Optional manifest; forwarded unchanged to EnrichmentUtilities.GetCountsByGC.</param>
static void NormalizeByGC(List<SampleGenomicBin> bins, NexteraManifest manifest = null)
{
    // DebugPrintCountsByGC(bins, "CountsByGC-Before.txt");

    // countsPerGC: one list per GC content value, holding the counts of the bins with that GC content.
    // allCounts:   all of the autosomal counts present in 'bins'.
    List<float>[] countsPerGC;
    List<float> allCounts;
    EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsPerGC, out allCounts);

    double globalMedian = Utilities.Median(allCounts);

    // Median count for every GC content value.
    int gcLevels = countsPerGC.Length;
    double?[] medianPerGC = new double?[gcLevels];
    for (int gc = 0; gc < gcLevels; gc++)
    {
        List<float> sameGCCounts = countsPerGC[gc];
        if (sameGCCounts.Count < defaultMinNumberOfBinsPerGC)
        {
            // Not enough bins at this GC content: use the weighted median of the
            // counts produced by GetWeightedCounts instead of the plain median.
            List<Tuple<float, float>> weighted = GetWeightedCounts(countsPerGC, gc);
            medianPerGC[gc] = Utilities.WeightedMedian(weighted);
        }
        else
        {
            medianPerGC[gc] = Utilities.Median(sameGCCounts);
        }
    }

    // Rescale each bin by globalMedian / (median of its GC content).
    foreach (SampleGenomicBin bin in bins)
    {
        double? gcMedian = medianPerGC[bin.GenomicBin.GC];

        // Leave the count untouched when there is no usable (strictly positive) median.
        if (!gcMedian.HasValue || !(gcMedian.Value > 0))
        {
            continue;
        }

        bin.Count = (float)(globalMedian * (double)bin.Count / gcMedian.Value);
    }

    // DebugPrintCountsByGC(bins, "CountsByGC-After.txt");
}
/// <summary>
/// Performs variance stabilization by GC content: counts of bins whose GC content shows a
/// wide interquartile range (IQR) relative to the genome-wide IQR are pulled towards the
/// median count of that GC content.
/// </summary>
/// <param name="bins">Bins whose counts are to be normalized. Counts are updated in place.</param>
/// <param name="manifest">Optional manifest; forwarded unchanged to EnrichmentUtilities.GetCountsByGC.</param>
/// <returns>
/// false when no GC content value in [10, 90) passes the IQR test, in which case no count is
/// modified; true once the adjustment pass over 'bins' has run.
/// </returns>
static bool NormalizeVarianceByGC(List<SampleGenomicBin> bins, NexteraManifest manifest = null)
{
    // DebugPrintCountsByGC(bins, "CountsByGCVariance-Before.txt");

    // countsPerGC: one list per GC content value, holding the counts of the bins with that GC content.
    // allCounts:   all of the autosomal counts present in 'bins'.
    List<float>[] countsPerGC;
    List<float> allCounts;
    EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsPerGC, out allCounts);

    // Quartiles over all bins genome-wide.
    var genomeQuartiles = Utilities.Quartiles(allCounts);

    // Per-GC quartiles and IQRs; index == GC content value.
    int gcLevels = countsPerGC.Length;
    List<float> iqrPerGC = new List<float>(gcLevels);
    List<Tuple<float, float, float>> quartilesPerGC = new List<Tuple<float, float, float>>(gcLevels);

    for (int level = 0; level < gcLevels; level++)
    {
        List<float> sameGCCounts = countsPerGC[level];
        Tuple<float, float, float> levelQuartiles;
        float levelIQR;

        if (sameGCCounts.Count == 0)
        {
            // No bins at this GC content: store -1 sentinels.
            levelQuartiles = new Tuple<float, float, float>(-1f, -1f, -1f);
            levelIQR = -1f;
        }
        else if (sameGCCounts.Count >= defaultMinNumberOfBinsPerGC)
        {
            levelQuartiles = Utilities.Quartiles(sameGCCounts);
            levelIQR = levelQuartiles.Item3 - levelQuartiles.Item1;
        }
        else
        {
            // Too few bins at this GC content: use weighted quantiles of the counts
            // produced by GetWeightedCounts.
            List<Tuple<float, float>> weighted = GetWeightedCounts(countsPerGC, level);
            double[] q = Utilities.WeightedQuantiles(weighted, new List<float>() { 0.25f, 0.5f, 0.75f });
            levelQuartiles = new Tuple<float, float, float>((float)q[0], (float)q[1], (float)q[2]);
            // Subtract in double precision first, then narrow to float.
            levelIQR = (float)(q[2] - q[0]);
        }

        quartilesPerGC.Add(levelQuartiles);
        iqrPerGC.Add(levelIQR);
    }

    float genomeIQR = genomeQuartiles.Item3 - genomeQuartiles.Item1;

    // Count the GC content values in [10, 90) whose doubled IQR exceeds the genome-wide IQR.
    // NOTE(review): the previous comment described this as "GC bins with IQR twice as large as
    // IQR genomewide", which is not what this comparison tests - confirm the intended threshold.
    int wideLevels = 0;
    for (int level = 10; level < 90; level++)
    {
        if (iqrPerGC[level] * 2f > genomeIQR)
        {
            wideLevels++;
        }
    }

    if (wideLevels == 0)
    {
        return false;
    }

    // Shrink each affected bin's deviation from the median of its GC content by the ratio
    // of the (0.8-scaled) local IQR to the genome-wide IQR.
    foreach (SampleGenomicBin bin in bins)
    {
        var binGC = bin.GenomicBin.GC;
        float scaledLocalIQR = iqrPerGC[binGC] * 0.8f;

        // Only GC content values whose scaled IQR exceeds the genome-wide IQR are adjusted.
        if (genomeIQR >= scaledLocalIQR)
        {
            continue;
        }

        float iqrRatio = scaledLocalIQR / genomeIQR;
        float gcMedianCount = quartilesPerGC[binGC].Item2;
        bin.Count = gcMedianCount + (bin.Count - gcMedianCount) / iqrRatio;
    }

    // DebugPrintCountsByGC(bins, "CountsByGCVariance-After.txt");
    return true;
}