GetCountsByGC() public static method

Assumes the bins are sorted by genomic coordinates
public static GetCountsByGC ( List<SampleGenomicBin> bins, NexteraManifest manifest, out List<float>[] countsByGC, out List<float> counts ) : void
bins List<SampleGenomicBin> Bins whose counts are to be normalized
manifest NexteraManifest
countsByGC List<float>[] An array of lists. Each array element (0-100) will hold a list of counts whose bins have the same GC content.
counts List<float> Will hold all of the autosomal counts present in 'bins'
return void
Esempio n. 1
0
        /// <summary>
        /// Median-based GC normalization: every bin count is rescaled by
        /// (median of all counts) / (median of the counts sharing the bin's GC value).
        /// Counts are modified in place.
        /// </summary>
        /// <param name="bins">Bins whose counts are to be normalized.</param>
        /// <param name="manifest">Forwarded unchanged to <c>EnrichmentUtilities.GetCountsByGC</c>; may be null.</param>
        static void NormalizeByGC(List <SampleGenomicBin> bins, NexteraManifest manifest = null)
        {
            // DebugPrintCountsByGC(bins, "CountsByGC-Before.txt");

            // countsPerGC[g] lists the counts of bins with GC value g; allCounts is the pooled list.
            List<float>[] countsPerGC;
            List<float> allCounts;
            EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsPerGC, out allCounts);

            double overallMedian = Utilities.Median(allCounts);

            // One median per GC value.
            double?[] medianPerGC = new double?[countsPerGC.Length];
            for (int gc = 0; gc < countsPerGC.Length; gc++)
            {
                if (countsPerGC[gc].Count < defaultMinNumberOfBinsPerGC)
                {
                    // Too few bins at this GC value: use the weighted median of the
                    // counts supplied by GetWeightedCounts instead of the plain median.
                    List<Tuple<float, float>> weighted = GetWeightedCounts(countsPerGC, gc);
                    medianPerGC[gc] = Utilities.WeightedMedian(weighted);
                }
                else
                {
                    medianPerGC[gc] = Utilities.Median(countsPerGC[gc]);
                }
            }

            // Rescale every bin; bins whose GC median is missing or not positive are left untouched.
            foreach (SampleGenomicBin bin in bins)
            {
                double? gcMedian = medianPerGC[bin.GenomicBin.GC];
                if (!gcMedian.HasValue || !(gcMedian.Value > 0))
                {
                    continue;
                }

                bin.Count = (float)(overallMedian * (double)bin.Count / gcMedian.Value);
            }

            // DebugPrintCountsByGC(bins, "CountsByGC-After.txt");
        }
Esempio n. 2
0
        /// <summary>
        /// Variance stabilization across GC values. For each GC value the interquartile
        /// range (IQR) of its counts is compared with the IQR of all counts; bins at GC
        /// values whose IQR is too wide are pulled towards that GC value's median.
        /// Counts are modified in place.
        /// </summary>
        /// <param name="bins">Bins whose counts are to be normalized.</param>
        /// <param name="manifest">Forwarded unchanged to <c>EnrichmentUtilities.GetCountsByGC</c>; may be null.</param>
        /// <returns>
        /// <c>false</c> when no GC value in the range 10..89 satisfies
        /// <c>globalIQR &lt; 2 * localIQR</c> (nothing is modified in that case);
        /// <c>true</c> otherwise.
        /// </returns>
        static bool NormalizeVarianceByGC(List <SampleGenomicBin> bins, NexteraManifest manifest = null)
        {
            // DebugPrintCountsByGC(bins, "CountsByGCVariance-Before.txt");

            // countsPerGC[g] lists the counts of bins with GC value g; allCounts is the pooled list.
            List<float>[] countsPerGC;
            List<float> allCounts;
            EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsPerGC, out allCounts);

            // Quartiles over every count, and the resulting global IQR.
            var genomeQuartiles = Utilities.Quartiles(allCounts);
            float globalIQR = genomeQuartiles.Item3 - genomeQuartiles.Item1;

            int gcLevels = countsPerGC.Length;
            // Per-GC quartiles and IQR, indexed by GC value.
            List<Tuple<float, float, float>> quartilesPerGC = new List<Tuple<float, float, float>>(gcLevels);
            List<float> iqrPerGC = new List<float>(gcLevels);

            for (int gc = 0; gc < gcLevels; gc++)
            {
                List<float> stratum = countsPerGC[gc];
                Tuple<float, float, float> stratumQuartiles;
                float stratumIqr;

                if (stratum.Count == 0)
                {
                    // No bins at this GC value: store -1 sentinels.
                    stratumQuartiles = new Tuple<float, float, float>(-1f, -1f, -1f);
                    stratumIqr = -1f;
                }
                else if (stratum.Count >= defaultMinNumberOfBinsPerGC)
                {
                    stratumQuartiles = Utilities.Quartiles(stratum);
                    stratumIqr = stratumQuartiles.Item3 - stratumQuartiles.Item1;
                }
                else
                {
                    // Too few bins at this GC value: use weighted quantiles of the
                    // counts supplied by GetWeightedCounts.
                    List<Tuple<float, float>> weighted = GetWeightedCounts(countsPerGC, gc);
                    double[] q = Utilities.WeightedQuantiles(weighted, new List<float>() { 0.25f, 0.5f, 0.75f });
                    stratumQuartiles = new Tuple<float, float, float>((float)q[0], (float)q[1], (float)q[2]);
                    // The difference is taken in double precision before narrowing.
                    stratumIqr = (float)(q[2] - q[0]);
                }

                quartilesPerGC.Add(stratumQuartiles);
                iqrPerGC.Add(stratumIqr);
            }

            // Count the GC values in 10..89 whose doubled IQR exceeds the global IQR.
            int wideStrata = 0;
            for (int gc = 10; gc < 90; gc++)
            {
                if (globalIQR < iqrPerGC[gc] * 2f)
                {
                    wideStrata++;
                }
            }

            if (wideStrata <= 0)
            {
                return false;
            }

            // Shrink counts towards the GC-specific median wherever 0.8 * local IQR
            // still exceeds the global IQR; the spread is divided by that ratio.
            foreach (SampleGenomicBin bin in bins)
            {
                var gcValue = bin.GenomicBin.GC;
                var scaledLocalIqr = iqrPerGC[gcValue] * 0.8f;
                if (globalIQR >= scaledLocalIqr)
                {
                    continue;
                }

                float iqrRatio = scaledLocalIqr / globalIQR;
                var gcMedian = quartilesPerGC[gcValue].Item2;
                bin.Count = gcMedian + (bin.Count - gcMedian) / iqrRatio;
            }

            // DebugPrintCountsByGC(bins, "CountsByGCVariance-After.txt");
            return true;
        }