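public static GetOnTargetBins ( IEnumerable<SampleGenomicBin> bins, NexteraManifest manifest ) : IEnumerable<SampleGenomicBin>

| Parameter | Type |
|-----------|------|
| bins | IEnumerable<SampleGenomicBin> |
| manifest | NexteraManifest |
| Returns | IEnumerable<SampleGenomicBin> |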
/// <summary>
/// Gathers the counts of autosomal bins twice over: once as a flat list and once grouped by the
/// GC content of the bin. Assumes the bins are sorted by genomic coordinates.
/// </summary>
/// <param name="bins">Bins whose counts are to be normalized.</param>
/// <param name="manifest">
/// When null, every bin in <paramref name="bins"/> is considered; otherwise only the bins returned by
/// <c>EnrichmentUtilities.GetOnTargetBins(bins, manifest)</c> are considered.
/// </param>
/// <param name="countsByGC">
/// An array of lists. Each array element (0-100) will hold the counts of the considered autosomal
/// bins that share that GC content.
/// </param>
/// <param name="counts">Will hold the counts of all considered autosomal bins.</param>
public static void GetCountsByGC(List<SampleGenomicBin> bins, NexteraManifest manifest, out List<float>[] countsByGC, out List<float> counts)
{
    countsByGC = new List<float>[numberOfGCbins];
    counts = new List<float>(bins.Count);

    // Give every GC slot its own empty list before anything is added.
    for (int gc = 0; gc < countsByGC.Length; gc++)
    {
        countsByGC[gc] = new List<float>();
    }

    // Without a manifest all bins are used; with one, only the on-target bins.
    IEnumerable<SampleGenomicBin> candidates;
    if (manifest == null)
    {
        candidates = bins;
    }
    else
    {
        candidates = EnrichmentUtilities.GetOnTargetBins(bins, manifest);
    }

    foreach (SampleGenomicBin candidate in candidates)
    {
        if (GenomeMetadata.SequenceMetadata.IsAutosome(candidate.GenomicBin.Chromosome))
        {
            // File the observed count under its GC content...
            countsByGC[candidate.GenomicBin.GC].Add(candidate.Count);

            // ...and in the overall list of counts.
            counts.Add(candidate.Count);
        }
    }
}
/// <summary>
/// Fills the <c>gcs</c> and <c>counts</c> arrays from the bins (only the on-target bins when
/// <c>manifest</c> is not null) and records in <c>withoutChrY</c> the positions, within those
/// arrays, of the entries that are not on chromosome Y.
/// </summary>
/// <remarks>
/// Each bin count is passed through <c>countTransformer</c>; bins whose transformed count is
/// infinite are left out of all three collections.
/// </remarks>
private void initialize()
{
    IEnumerable<SampleGenomicBin> onTargetBins = manifest == null
        ? bins
        : EnrichmentUtilities.GetOnTargetBins(bins, manifest);

    List<double> x = new List<double>();
    List<double> y = new List<double>();
    withoutChrY = new List<int>();

    foreach (var bin in onTargetBins)
    {
        double count = countTransformer(bin.Count); // Variance stabilization

        // NOTE(review): only +/- infinity is rejected here; a NaN would be kept.
        // Confirm countTransformer cannot produce NaN.
        if (double.IsInfinity(count))
        {
            continue;
        }

        x.Add(bin.GenomicBin.GC);
        y.Add(count);

        // Ordinal, case-insensitive comparison: chromosome names are identifiers, so the
        // result must not depend on the current culture, and no lowered copy is allocated.
        string chrom = bin.GenomicBin.Chromosome;
        bool isChrY = chrom.Equals("chry", System.StringComparison.OrdinalIgnoreCase)
                      || chrom.Equals("y", System.StringComparison.OrdinalIgnoreCase);
        if (!isChrY)
        {
            // Index of the entry just appended to x and y.
            withoutChrY.Add(x.Count - 1);
        }
    }

    gcs = x.ToArray();
    counts = y.ToArray();
}
/// <summary>
/// Remove bins with extreme GC content.
/// </summary>
/// <remarks>
/// The rationale of this function is that a GC normalization is performed by computing the median count
/// for each possible GC value. If the number of bins behind that median is small, the corresponding
/// normalization constant is unstable and we shouldn't use these data.
/// </remarks>
/// <param name="bins">Genomic bins from which we filter out GC content outliers.</param>
/// <param name="threshold">
/// Minimum number of bins with the same GC content required to keep a bin. The value actually applied
/// is capped at the larger of <c>minNumberOfBinsPerGCForWeightedMedian</c> and the average number of
/// counted bins per GC value.
/// </param>
/// <param name="manifest">
/// When not null, only the bins returned by <c>EnrichmentUtilities.GetOnTargetBins(bins, manifest)</c>
/// contribute to the per-GC tallies; the filter itself is still applied to every bin in
/// <paramref name="bins"/>.
/// </param>
/// <returns>A new list containing the bins that passed the filter, in their original order.</returns>
static List<SampleGenomicBin> RemoveBinsWithExtremeGC(List<SampleGenomicBin> bins, int threshold, NexteraManifest manifest = null)
{
    // Number of counted bins observed for each possible GC content.
    int[] binsPerGC = new int[EnrichmentUtilities.numberOfGCbins];
    double countedBins = 0;

    IEnumerable<SampleGenomicBin> tallySource;
    if (manifest == null)
    {
        tallySource = bins;
    }
    else
    {
        tallySource = EnrichmentUtilities.GetOnTargetBins(bins, manifest);
    }

    foreach (SampleGenomicBin candidate in tallySource)
    {
        // Only autosomal bins are tallied because the normalization factors were computed on them.
        if (GenomeMetadata.SequenceMetadata.IsAutosome(candidate.GenomicBin.Chromosome))
        {
            binsPerGC[candidate.GenomicBin.GC]++;
            countedBins++;
        }
    }

    // Never require more bins per GC value than the (floored) average, subject to the configured minimum.
    int meanBinsPerGC = (int)(countedBins / binsPerGC.Length);
    int thresholdCap = Math.Max(minNumberOfBinsPerGCForWeightedMedian, meanBinsPerGC);
    int effectiveThreshold = Math.Min(threshold, thresholdCap);

    // Keep a bin only when enough counted bins share its GC content.
    List<SampleGenomicBin> kept = new List<SampleGenomicBin>();
    foreach (SampleGenomicBin candidate in bins)
    {
        if (binsPerGC[candidate.GenomicBin.GC] >= effectiveThreshold)
        {
            kept.Add(candidate);
        }
    }

    return kept;
}