Ejemplo n.º 1
0
        /// <summary>
        /// Populates <c>gcs</c>, <c>counts</c> and <c>withoutChrY</c> from the bins.
        /// When <c>manifest</c> is non-null only the bins returned by
        /// <c>EnrichmentUtilities.GetOnTargetBins</c> are used; otherwise every bin is used.
        /// Bins whose transformed count is infinite are dropped. <c>gcs[k]</c> and <c>counts[k]</c>
        /// describe the same retained bin, and <c>withoutChrY</c> lists the indices k of retained
        /// bins that are not on chromosome Y.
        /// </summary>
        private void initialize()
        {
            IEnumerable<SampleGenomicBin> onTargetBins = manifest == null ? bins : EnrichmentUtilities.GetOnTargetBins(bins, manifest);

            List<double> x = new List<double>();
            List<double> y = new List<double>();

            withoutChrY = new List<int>();

            foreach (var bin in onTargetBins)
            {
                double count = countTransformer(bin.Count); // Variance stabilization

                // Only infinities are filtered here; a NaN produced by the transformer is retained.
                if (double.IsInfinity(count))
                {
                    continue;
                }

                // Ordinal, case-insensitive comparison: chromosome names are identifiers, so the
                // match must not depend on the current culture (and no lowered copy is allocated).
                string chrom = bin.GenomicBin.Chromosome;
                bool isChrY = string.Equals(chrom, "chry", System.StringComparison.OrdinalIgnoreCase) ||
                              string.Equals(chrom, "y", System.StringComparison.OrdinalIgnoreCase);
                if (!isChrY)
                {
                    // x.Count is the index this bin is about to occupy in x and y.
                    withoutChrY.Add(x.Count);
                }

                x.Add(bin.GenomicBin.GC);
                y.Add(count);
            }

            gcs    = x.ToArray();
            counts = y.ToArray();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Gathers the counts of autosomal bins, both grouped by GC content and as a single flat list.
        /// Assumes the bins are sorted by genomic coordinates.
        /// </summary>
        /// <param name="bins">Bins whose counts are to be collected.</param>
        /// <param name="manifest">When null, every bin in 'bins' is examined; otherwise only the bins returned by EnrichmentUtilities.GetOnTargetBins.</param>
        /// <param name="countsByGC">An array of lists. Each array element (0-100) will hold a list of counts whose bins have the same GC content.</param>
        /// <param name="counts">Will hold all of the autosomal counts present in the examined bins.</param>
        public static void GetCountsByGC(List<SampleGenomicBin> bins, NexteraManifest manifest, out List<float>[] countsByGC, out List<float> counts)
        {
            countsByGC = new List<float>[numberOfGCbins];
            counts     = new List<float>(bins.Count);

            // Every GC slot starts out with its own empty list.
            for (int gc = 0; gc < countsByGC.Length; gc++)
            {
                countsByGC[gc] = new List<float>();
            }

            // Decide which bins are examined: all of them, or only the on-target ones.
            IEnumerable<SampleGenomicBin> candidates;
            if (manifest == null)
            {
                candidates = bins;
            }
            else
            {
                candidates = EnrichmentUtilities.GetOnTargetBins(bins, manifest);
            }

            foreach (SampleGenomicBin candidate in candidates)
            {
                bool autosomal = GenomeMetadata.SequenceMetadata.IsAutosome(candidate.GenomicBin.Chromosome);
                if (autosomal)
                {
                    // Record the observed count under its GC value ...
                    countsByGC[candidate.GenomicBin.GC].Add(candidate.Count);

                    // ... and in the global list of counts.
                    counts.Add(candidate.Count);
                }
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Perform a simple GC normalization: each bin count is multiplied by the global median
        /// count and divided by the median count of its GC value. Counts are modified in place.
        /// </summary>
        /// <param name="bins">Bins whose counts are to be normalized.</param>
        /// <param name="manifest">Passed through to EnrichmentUtilities.GetCountsByGC to select the bins the medians are computed from.</param>
        static void NormalizeByGC(List<SampleGenomicBin> bins, NexteraManifest manifest = null)
        {
            // Per-GC lists of counts (index = GC content) and the flat list of all collected counts.
            List<float>[] countsByGC;
            List<float> counts;
            EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsByGC, out counts);

            double globalMedian = Utilities.Median(counts);

            // One median per GC value.
            double?[] medians = new double?[countsByGC.Length];
            for (int gc = 0; gc < countsByGC.Length; gc++)
            {
                List<float> sameGC = countsByGC[gc];
                if (sameGC.Count < defaultMinNumberOfBinsPerGC)
                {
                    // Too few bins with this GC value: fall back to a weighted median.
                    List<Tuple<float, float>> weightedCounts = GetWeightedCounts(countsByGC, gc);
                    medians[gc] = Utilities.WeightedMedian(weightedCounts);
                }
                else
                {
                    medians[gc] = Utilities.Median(sameGC);
                }
            }

            // Rescale every bin (not only those the medians were computed from) by
            // globalMedian / median-of-its-GC-value; bins without a positive median are left untouched.
            foreach (SampleGenomicBin bin in bins)
            {
                double? gcMedian = medians[bin.GenomicBin.GC];
                if (gcMedian != null && gcMedian > 0)
                {
                    bin.Count = (float)(globalMedian * (double)bin.Count / gcMedian);
                }
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Remove bins with extreme GC content.
        /// </summary>
        /// <param name="bins">Genomic bins from which GC content outliers are filtered out.</param>
        /// <param name="threshold">Minimum number of bins with the same GC content required to keep a bin.</param>
        /// <param name="manifest">When null, every bin is tallied; otherwise only the bins returned by EnrichmentUtilities.GetOnTargetBins are tallied.</param>
        /// <returns>A new list holding the bins that were kept, in their original order.</returns>
        /// <remarks>
        /// The rationale of this function is that a GC normalization is performed by computing the median count
        /// for each possible GC value. If the number of bins behind that median is small, the corresponding
        /// normalization constant is unstable and we shouldn't use these data.
        /// </remarks>
        static List<SampleGenomicBin> RemoveBinsWithExtremeGC(List<SampleGenomicBin> bins, int threshold, NexteraManifest manifest = null)
        {
            // Number of tallied bins for each possible GC content (0-100), and their grand total.
            int[]  binsPerGC    = new int[EnrichmentUtilities.numberOfGCbins];
            double talliedTotal = 0;

            IEnumerable<SampleGenomicBin> tallied;
            if (manifest == null)
            {
                tallied = bins;
            }
            else
            {
                tallied = EnrichmentUtilities.GetOnTargetBins(bins, manifest);
            }

            foreach (SampleGenomicBin candidate in tallied)
            {
                // Only autosomal bins are tallied because the normalization factors are computed from them.
                if (GenomeMetadata.SequenceMetadata.IsAutosome(candidate.GenomicBin.Chromosome))
                {
                    binsPerGC[candidate.GenomicBin.GC]++;
                    talliedTotal++;
                }
            }

            // The requested threshold is capped by the average number of bins per GC value
            // (itself never lower than minNumberOfBinsPerGCForWeightedMedian).
            int averageCountPerGC  = Math.Max(minNumberOfBinsPerGCForWeightedMedian, (int)(talliedTotal / binsPerGC.Length));
            int effectiveThreshold = Math.Min(threshold, averageCountPerGC);

            // Keep every bin whose GC value is shared by enough tallied bins.
            List<SampleGenomicBin> kept = new List<SampleGenomicBin>();
            foreach (SampleGenomicBin bin in bins)
            {
                if (binsPerGC[bin.GenomicBin.GC] >= effectiveThreshold)
                {
                    kept.Add(bin);
                }
            }

            return kept;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Perform variance stabilization by GC bins. Counts of bins whose GC value shows a wide
        /// interquartile range (IQR) relative to the global IQR are pulled towards the median
        /// count of that GC value. Counts are modified in place.
        /// </summary>
        /// <param name="bins">Bins whose counts are to be normalized.</param>
        /// <param name="manifest">Passed through to EnrichmentUtilities.GetCountsByGC to select the bins the quartiles are computed from.</param>
        /// <returns>
        /// false when no GC value in the range 10-89 passes the IQR check (bins are left untouched);
        /// true when the adjustment pass over the bins was run.
        /// </returns>
        static bool NormalizeVarianceByGC(List<SampleGenomicBin> bins, NexteraManifest manifest = null)
        {
            // Per-GC lists of counts (index = GC content) and the flat list of all collected counts.
            List<float>[] countsByGC;
            List<float> counts;
            EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsByGC, out counts);

            // Quartiles over all collected counts.
            var globalQuartiles = Utilities.Quartiles(counts);

            // Per-GC interquartile range and per-GC quartiles, indexed by GC content.
            List<float> localIQR = new List<float>(countsByGC.Length);
            List<Tuple<float, float, float>> localQuartiles = new List<Tuple<float, float, float>>(countsByGC.Length);

            for (int gc = 0; gc < countsByGC.Length; gc++)
            {
                List<float> sameGC = countsByGC[gc];
                Tuple<float, float, float> gcQuartiles;
                float gcIQR;

                if (sameGC.Count == 0)
                {
                    // No data for this GC value: store -1 placeholders.
                    gcQuartiles = new Tuple<float, float, float>(-1f, -1f, -1f);
                    gcIQR       = -1f;
                }
                else if (sameGC.Count >= defaultMinNumberOfBinsPerGC)
                {
                    gcQuartiles = Utilities.Quartiles(sameGC);
                    gcIQR       = gcQuartiles.Item3 - gcQuartiles.Item1;
                }
                else
                {
                    // Too few bins with this GC value: fall back to weighted quantiles.
                    List<Tuple<float, float>> weightedCounts = GetWeightedCounts(countsByGC, gc);
                    double[] quartiles = Utilities.WeightedQuantiles(weightedCounts, new List<float>()
                    {
                        0.25f, 0.5f, 0.75f
                    });
                    gcQuartiles = new Tuple<float, float, float>((float)quartiles[0], (float)quartiles[1], (float)quartiles[2]);
                    gcIQR       = (float)(quartiles[2] - quartiles[0]);
                }

                localQuartiles.Add(gcQuartiles);
                localIQR.Add(gcIQR);
            }

            float globalIQR = globalQuartiles.Item3 - globalQuartiles.Item1;

            // Count the GC values in 10-89 for which the global IQR is smaller than twice the local IQR.
            int wideGCValues = 0;
            for (int gc = 10; gc < 90; gc++)
            {
                if (globalIQR < localIQR[gc] * 2f)
                {
                    wideGCValues++;
                }
            }

            if (wideGCValues <= 0)
            {
                return false;
            }

            // Shrink the spread of each bin whose GC value has 0.8 * local IQR above the global IQR.
            foreach (SampleGenomicBin bin in bins)
            {
                var scaledLocalIqr = localIQR[bin.GenomicBin.GC] * 0.8f;
                if (globalIQR >= scaledLocalIqr)
                {
                    continue;
                }

                // Distance from the GC-specific median is divided by (scaled local IQR / global IQR).
                float shrinkFactor  = scaledLocalIqr / globalIQR;
                var   gcMedianCount = localQuartiles[bin.GenomicBin.GC].Item2;
                bin.Count = gcMedianCount + (bin.Count - gcMedianCount) / shrinkFactor;
            }

            return true;
        }