/// <summary>
/// Merges several binned count files into one by taking a weighted average of the bin counts.
/// Each sample is weighted by the reciprocal of its on-target median bin count and the weights
/// are normalized to sum to 1. When exactly one path is supplied, that file is copied to
/// <paramref name="mergedBinnedPath"/> instead (and nothing happens if it does not exist).
/// </summary>
/// <param name="binnedPaths">Paths of the binned files to merge.</param>
/// <param name="mergedBinnedPath">Path of the merged output file; an existing file is replaced in the copy case.</param>
/// <param name="manifest">Optional manifest handed to <c>BinCounts</c>.</param>
private static void GetWeightedAverageBinCount(IEnumerable<string> binnedPaths, string mergedBinnedPath, NexteraManifest manifest = null)
{
    // Materialize once: the sequence is counted, indexed and read from repeatedly below,
    // and each Count()/ElementAt()/First() on a lazy sequence would re-enumerate it.
    List<string> paths = binnedPaths.ToList();
    int sampleCount = paths.Count;

    if (sampleCount == 1) // copy file
    {
        string onlyPath = paths[0];
        if (File.Exists(onlyPath))
        {
            if (File.Exists(mergedBinnedPath)) { File.Delete(mergedBinnedPath); }
            File.Copy(onlyPath, mergedBinnedPath);
        }
        return;
    }

    // merge normal samples
    double[] weights = new double[sampleCount];
    List<double>[] binCountsBySample = new List<double>[sampleCount];
    for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
    {
        var binCounts = new BinCounts(paths[sampleIndex], manifest: manifest);
        List<double> counts = binCounts.AllCounts;

        // If a manifest is available, get the median of bins overlapping the targeted regions only.
        // For small panels, there could be a lot of bins with zero count and the median would be 0
        // if taken over all the bins, resulting in division by zero.
        double median = binCounts.OnTargetMedianBinCount;
        weights[sampleIndex] = median > 0 ? 1.0 / median : 0;
        binCountsBySample[sampleIndex] = counts;
    }

    double weightSum = weights.Sum();
    if (weightSum > 0)
    {
        // so weights sum to 1
        for (int i = 0; i < sampleCount; i++) { weights[i] /= weightSum; }
    }
    else
    {
        // Every sample had a non-positive median, so all weights are 0. Dividing by the zero sum
        // would turn every weight (and every merged count) into NaN; use equal weights instead.
        for (int i = 0; i < sampleCount; i++) { weights[i] = 1.0 / sampleCount; }
    }

    // Compute the weighted average of bin counts across samples. The first file serves as the
    // template: each of its lines is rewritten with the fourth tab-separated column replaced.
    using (GzipReader reader = new GzipReader(paths.First()))
    using (GzipWriter writer = new GzipWriter(mergedBinnedPath))
    {
        string line;
        int lineIdx = 0;
        while ((line = reader.ReadLine()) != null)
        {
            string[] toks = line.Split('\t');
            double weightedBinCount = 0;
            for (int i = 0; i < sampleCount; i++)
            {
                weightedBinCount += weights[i] * binCountsBySample[i][lineIdx];
            }
            toks[3] = String.Format("{0}", weightedBinCount);
            writer.WriteLine(String.Join("\t", toks));
            lineIdx++;
        }
    }
}
/// <summary>
/// Selects the control (normal) binned file whose median-normalized on-target bin counts give the
/// smallest mean squared log-ratio against the sample, and copies it to <paramref name="outputFile"/>.
/// With a single control, that control is copied without any comparison.
/// </summary>
/// <param name="outputFile">Destination of the selected control file; deleted first if it already exists.</param>
public void Run(IFileLocation outputFile)
{
    // Materialize once so the controls are not re-enumerated for every Count()/ElementAt() call.
    var controlFiles = _controlBinnedFiles.ToList();
    int normalSampleCount = controlFiles.Count;

    // Index 0 is the answer for a single control and also the fallback when no control produces
    // a usable score (e.g. every score is NaN). Previously the index was reset to -1 and the
    // lookup below threw ArgumentOutOfRangeException in that situation.
    int bestNormalSampleIndex = 0;

    if (normalSampleCount > 1) // find the best normal
    {
        var binCountsByNormalSample = new List<double[]>(normalSampleCount);
        foreach (var controlBinnedFile in controlFiles)
        {
            var controlBinCounts = new BinCounts(controlBinnedFile.FullName, manifest: _manifest);
            List<double> controlCounts = controlBinCounts.OnTargetCounts;

            // If a manifest is available, get the median of bins overlapping the targeted regions only.
            // For small panels, there could be a lot of bins with zero count and the median would be 0
            // if taken over all the bins, resulting in division by zero.
            double median = controlBinCounts.OnTargetMedianBinCount;
            double weight = median > 0 ? 1.0 / median : 0;
            binCountsByNormalSample.Add(controlCounts.Select(cnt => cnt * weight).ToArray());
        }

        // Normalize the sample the same way as the controls.
        var sampleBinCounts = new BinCounts(_sampleBinnedFile.FullName, manifest: _manifest);
        List<double> sampleCounts = sampleBinCounts.OnTargetCounts;
        double tumorMedian = sampleBinCounts.OnTargetMedianBinCount;
        double tumorWeight = tumorMedian > 0 ? 1.0 / tumorMedian : 0;
        double[] tumorBinCounts = sampleCounts.Select(cnt => cnt * tumorWeight).ToArray();

        // Find the best normal sample
        double minMeanSquaredLogRatios = double.PositiveInfinity;
        for (int normalSampleIndex = 0; normalSampleIndex < normalSampleCount; normalSampleIndex++)
        {
            // Item1 is the mean squared log ratio; Item2 is the number of ignored bins.
            // TODO: Skip a (bad) normal sample if too many bins were ignored.
            // Donavan's script skips a normal sample if more than 100 log ratios is NA.
            // The cut-off is likely panel-dependent.
            var result = GetMeanSquaredLogRatios(tumorBinCounts, binCountsByNormalSample[normalSampleIndex]);
            double meanSquaredLogRatios = result.Item1;

            if (meanSquaredLogRatios < minMeanSquaredLogRatios)
            {
                minMeanSquaredLogRatios = meanSquaredLogRatios;
                bestNormalSampleIndex = normalSampleIndex;
            }
        }
    }

    // copy file
    var srcBinnedFile = controlFiles[bestNormalSampleIndex];
    if (outputFile.Exists) { outputFile.Delete(); }
    srcBinnedFile.CopyTo(outputFile);
}
/// <summary>
/// Pick the best normal control that has the smallest mean squared log-ratios (LR2s)
/// against the tumor sample, and copy its binned file to <paramref name="bestBinnedPath"/>.
/// With a single normal, that normal is copied without any comparison.
/// </summary>
/// <param name="tumorBinnedPath">Path of the tumor sample's binned file.</param>
/// <param name="normalBinnedPaths">Paths of the candidate normal binned files.</param>
/// <param name="bestBinnedPath">Destination path; an existing file is replaced. Nothing is written if the chosen source file does not exist.</param>
/// <param name="manifest">Optional manifest handed to <c>BinCounts</c>.</param>
private static void GetBestLR2BinCount(string tumorBinnedPath, IEnumerable<string> normalBinnedPaths, string bestBinnedPath, NexteraManifest manifest = null)
{
    // Materialize once so the paths are not re-enumerated for every Count()/ElementAt() call.
    List<string> normalPaths = normalBinnedPaths.ToList();
    int normalSampleCount = normalPaths.Count;

    // Index 0 is the answer for a single normal and also the fallback when no normal produces
    // a usable score (e.g. every score is NaN). Previously the index was reset to -1 and the
    // lookup below threw ArgumentOutOfRangeException in that situation.
    int bestNormalSampleIndex = 0;

    if (normalSampleCount > 1) // find the best normal
    {
        var binCountsByNormalSample = new List<double[]>(normalSampleCount);
        foreach (string normalBinnedPath in normalPaths)
        {
            var normalBinCounts = new BinCounts(normalBinnedPath, manifest: manifest);
            List<double> normalCounts = normalBinCounts.OnTargetCounts;

            // If a manifest is available, get the median of bins overlapping the targeted regions only.
            // For small panels, there could be a lot of bins with zero count and the median would be 0
            // if taken over all the bins, resulting in division by zero.
            double median = normalBinCounts.OnTargetMedianBinCount;
            double weight = median > 0 ? 1.0 / median : 0;
            binCountsByNormalSample.Add(normalCounts.Select(cnt => cnt * weight).ToArray());
        }

        // Normalize the tumor sample the same way as the normals.
        var tumorCountsSource = new BinCounts(tumorBinnedPath, manifest: manifest);
        List<double> tumorCounts = tumorCountsSource.OnTargetCounts;
        double tumorMedian = tumorCountsSource.OnTargetMedianBinCount;
        double tumorWeight = tumorMedian > 0 ? 1.0 / tumorMedian : 0;
        double[] tumorBinCounts = tumorCounts.Select(cnt => cnt * tumorWeight).ToArray();

        // Find the best normal sample
        double minMeanSquaredLogRatios = double.PositiveInfinity;
        for (int normalSampleIndex = 0; normalSampleIndex < normalSampleCount; normalSampleIndex++)
        {
            // Item1 is the mean squared log ratio; Item2 is the number of ignored bins.
            // TODO: Skip a (bad) normal sample if too many bins were ignored.
            // Donavan's script skips a normal sample if more than 100 log ratios is NA.
            // The cut-off is likely panel-dependent.
            var result = GetMeanSquaredLogRatios(tumorBinCounts, binCountsByNormalSample[normalSampleIndex]);
            double meanSquaredLogRatios = result.Item1;

            if (meanSquaredLogRatios < minMeanSquaredLogRatios)
            {
                minMeanSquaredLogRatios = meanSquaredLogRatios;
                bestNormalSampleIndex = normalSampleIndex;
            }
        }
    }

    // copy file
    string srcBinnedPath = normalPaths[bestNormalSampleIndex];
    if (File.Exists(srcBinnedPath))
    {
        if (File.Exists(bestBinnedPath)) { File.Delete(bestBinnedPath); }
        File.Copy(srcBinnedPath, bestBinnedPath);
    }
}
/// <summary>
/// Merges several binned count files into one by taking a weighted average of the bin counts.
/// Each sample is weighted by the reciprocal of its on-target median bin count and the weights
/// are normalized to sum to 1. When exactly one path is supplied, that file is copied to
/// <paramref name="mergedBinnedPath"/> instead (and nothing happens if it does not exist).
/// </summary>
/// <param name="binnedPaths">Paths of the binned files to merge.</param>
/// <param name="mergedBinnedPath">Path of the merged output file; an existing file is replaced in the copy case.</param>
/// <param name="manifest">Optional manifest handed to <c>BinCounts</c>.</param>
private static void GetWeightedAverageBinCount(IEnumerable<string> binnedPaths, string mergedBinnedPath, NexteraManifest manifest = null)
{
    // Materialize once: the sequence is counted, indexed and read from repeatedly below,
    // and each Count()/ElementAt()/First() on a lazy sequence would re-enumerate it.
    List<string> paths = binnedPaths.ToList();
    int sampleCount = paths.Count;

    if (sampleCount == 1) // copy file
    {
        string onlyPath = paths[0];
        if (File.Exists(onlyPath))
        {
            if (File.Exists(mergedBinnedPath)) { File.Delete(mergedBinnedPath); }
            File.Copy(onlyPath, mergedBinnedPath);
        }
        return;
    }

    // merge normal samples
    double[] weights = new double[sampleCount];
    List<double>[] binCountsBySample = new List<double>[sampleCount];
    for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
    {
        var binCounts = new BinCounts(paths[sampleIndex], manifest: manifest);
        List<double> counts = binCounts.AllCounts;

        // If a manifest is available, get the median of bins overlapping the targeted regions only.
        // For small panels, there could be a lot of bins with zero count and the median would be 0
        // if taken over all the bins, resulting in division by zero.
        double median = binCounts.OnTargetMedianBinCount;
        weights[sampleIndex] = median > 0 ? 1.0 / median : 0;
        binCountsBySample[sampleIndex] = counts;
    }

    double weightSum = weights.Sum();
    if (weightSum > 0)
    {
        // so weights sum to 1
        for (int i = 0; i < sampleCount; i++) { weights[i] /= weightSum; }
    }
    else
    {
        // Every sample had a non-positive median, so all weights are 0. Dividing by the zero sum
        // would turn every weight (and every merged count) into NaN; use equal weights instead.
        for (int i = 0; i < sampleCount; i++) { weights[i] = 1.0 / sampleCount; }
    }

    // Compute the weighted average of bin counts across samples. The first file serves as the
    // template: each of its lines is rewritten with the fourth tab-separated column replaced.
    using (GzipReader reader = new GzipReader(paths.First()))
    using (GzipWriter writer = new GzipWriter(mergedBinnedPath))
    {
        string line;
        int lineIdx = 0;
        while ((line = reader.ReadLine()) != null)
        {
            string[] toks = line.Split('\t');
            double weightedBinCount = 0;
            for (int i = 0; i < sampleCount; i++)
            {
                weightedBinCount += weights[i] * binCountsBySample[i][lineIdx];
            }
            toks[3] = String.Format("{0}", weightedBinCount);
            writer.WriteLine(String.Join("\t", toks));
            lineIdx++;
        }
    }
}
/// <summary>
/// Builds a reference count file for the sample from the model: the sample's bin counts are
/// centered on the model mean, projected onto the model axes, un-centered, scaled by the
/// on-target median of the sample/reference ratios, and written to <paramref name="outputFile"/>.
/// </summary>
/// <param name="outputFile">Location the reference bin counts are written to.</param>
public void Run(IFileLocation outputFile)
{
    List<SampleGenomicBin> sampleBins = CanvasIO.ReadFromTextFile(_sampleBinnedFile.FullName);
    VerifyBinOrder(sampleBins);

    // set bin count to 1 if less than 1
    foreach (var bin in sampleBins) { bin.Count = Math.Max(1, bin.Count); }

    // center the sample
    var centeredSampleVector = Enumerable.Zip(sampleBins, _model.Mu, (bin, mu) => (double)bin.Count - mu.Count).ToArray();

    // project onto the axes
    var projectedSampleVector = CanvasCommon.Utilities.Project(centeredSampleVector, _model.Axes);

    // undo centering and set bin count to 1 if less than 1.
    // Materialized because the vector is consumed twice below; a deferred Zip would be recomputed each time.
    var referenceVector = Enumerable.Zip(_model.Mu, projectedSampleVector, (bin, count) => Math.Max(1, bin.Count + count)).ToArray();

    // write temporary reference count file
    var tempReferenceFile = new FileLocation(Path.GetTempFileName());
    double medianRatio;
    try
    {
        var tempReferenceBins = Enumerable.Zip(sampleBins, referenceVector,
            (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)count));
        CanvasIO.WriteToTextFile(tempReferenceFile.FullName, tempReferenceBins);

        // calculate median ratio
        var ratios = new BinCounts(_ratioCalculator.Run(_sampleBinnedFile, tempReferenceFile), _manifest);
        medianRatio = ratios.OnTargetMedianBinCount;
    }
    finally
    {
        // delete temporary reference count file, even when writing or the ratio calculation fails
        tempReferenceFile.Delete();
    }

    // multiply reference counts by the median ratio
    var referenceBins = Enumerable.Zip(sampleBins, referenceVector,
        (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)(count * medianRatio)));

    // write reference count file
    CanvasIO.WriteToTextFile(outputFile.FullName, referenceBins);
}
/// <summary>
/// Pick the best normal control that has the smallest mean squared log-ratios (LR2s)
/// against the tumor sample, and copy its binned file to <paramref name="bestBinnedPath"/>.
/// With a single normal, that normal is copied without any comparison.
/// </summary>
/// <param name="tumorBinnedPath">Path of the tumor sample's binned file.</param>
/// <param name="normalBinnedPaths">Paths of the candidate normal binned files.</param>
/// <param name="bestBinnedPath">Destination path; an existing file is replaced. Nothing is written if the chosen source file does not exist.</param>
/// <param name="manifest">Optional manifest handed to <c>BinCounts</c>.</param>
private static void GetBestLR2BinCount(string tumorBinnedPath, IEnumerable<string> normalBinnedPaths, string bestBinnedPath, NexteraManifest manifest = null)
{
    // Materialize once so the paths are not re-enumerated for every Count()/ElementAt() call.
    List<string> normalPaths = normalBinnedPaths.ToList();
    int normalSampleCount = normalPaths.Count;

    // Index 0 is the answer for a single normal and also the fallback when no normal produces
    // a usable score (e.g. every score is NaN). Previously the index was reset to -1 and the
    // lookup below threw ArgumentOutOfRangeException in that situation.
    int bestNormalSampleIndex = 0;

    if (normalSampleCount > 1) // find the best normal
    {
        var binCountsByNormalSample = new List<double[]>(normalSampleCount);
        foreach (string normalBinnedPath in normalPaths)
        {
            var normalBinCounts = new BinCounts(normalBinnedPath, manifest: manifest);
            List<double> normalCounts = normalBinCounts.OnTargetCounts;

            // If a manifest is available, get the median of bins overlapping the targeted regions only.
            // For small panels, there could be a lot of bins with zero count and the median would be 0
            // if taken over all the bins, resulting in division by zero.
            double median = normalBinCounts.OnTargetMedianBinCount;
            double weight = median > 0 ? 1.0 / median : 0;
            binCountsByNormalSample.Add(normalCounts.Select(cnt => cnt * weight).ToArray());
        }

        // Normalize the tumor sample the same way as the normals.
        var tumorCountsSource = new BinCounts(tumorBinnedPath, manifest: manifest);
        List<double> tumorCounts = tumorCountsSource.OnTargetCounts;
        double tumorMedian = tumorCountsSource.OnTargetMedianBinCount;
        double tumorWeight = tumorMedian > 0 ? 1.0 / tumorMedian : 0;
        double[] tumorBinCounts = tumorCounts.Select(cnt => cnt * tumorWeight).ToArray();

        // Find the best normal sample
        double minMeanSquaredLogRatios = double.PositiveInfinity;
        for (int normalSampleIndex = 0; normalSampleIndex < normalSampleCount; normalSampleIndex++)
        {
            // Item1 is the mean squared log ratio; Item2 is the number of ignored bins.
            // TODO: Skip a (bad) normal sample if too many bins were ignored.
            // Donavan's script skips a normal sample if more than 100 log ratios is NA.
            // The cut-off is likely panel-dependent.
            var result = GetMeanSquaredLogRatios(tumorBinCounts, binCountsByNormalSample[normalSampleIndex]);
            double meanSquaredLogRatios = result.Item1;

            if (meanSquaredLogRatios < minMeanSquaredLogRatios)
            {
                minMeanSquaredLogRatios = meanSquaredLogRatios;
                bestNormalSampleIndex = normalSampleIndex;
            }
        }
    }

    // copy file
    string srcBinnedPath = normalPaths[bestNormalSampleIndex];
    if (File.Exists(srcBinnedPath))
    {
        if (File.Exists(bestBinnedPath)) { File.Delete(bestBinnedPath); }
        File.Copy(srcBinnedPath, bestBinnedPath);
    }
}