/// <summary>
/// Estimate the genome distance between two purity models, i.e. the weighted absolute difference
/// between their copy number profiles.
/// </summary>
protected double CalculateModelDistance(CoveragePurityModel model1, CoveragePurityModel model2,
    List<SegmentInfo> usableSegments, long genomeLength)
{
    double genomeDistance = 0;
    // Every model should have the same number of segments:
    if (model1.CNs.Count != model2.CNs.Count)
    {
        Console.WriteLine("Models do not have the same number of usable CN segments");
        return 1;
    }
    for (int i = 0; i < model1.CNs.Count; i++)
    {
        genomeDistance += Math.Abs(model1.CNs[i] - model2.CNs[i]) *
            (usableSegments[i].Segment.End - usableSegments[i].Segment.Begin) / (double)genomeLength;
    }
    return genomeDistance;
}
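// Worked example (illustrative numbers): for two models whose CN profiles over three usable
// segments of lengths 50, 30 and 20 Mb (genomeLength = 100 Mb) are {2, 3, 2} and {2, 4, 1},
// the distance is |2-2|*0.5 + |3-4|*0.3 + |2-1|*0.2 = 0.5. Identical profiles score 0; a
// one-copy mismatch across the whole genome scores 1.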
/// <summary>
/// Fit a Gaussian mixture model:
/// Fix the means to the model MAF and Coverage, and run the EM algorithm until convergence.
/// Compute the empirical MAF and Coverage.
/// Fix the means to the empirical MAF and Coverage, and run the EM algorithm again until convergence.
/// Always estimate the full covariance matrix?
/// </summary>
protected double FitGaussians(CoveragePurityModel model, List<SegmentInfo> segments, string debugPath = null,
    double knearestNeighbourCutoff = Int32.MaxValue)
{
    List<ModelPoint> modelPoints = InitializeModelPoints(model);

    GaussianMixtureModel gmm = new GaussianMixtureModel(modelPoints, segments, this.MeanCoverage,
        this.CoverageWeightingFactor, knearestNeighbourCutoff);
    double likelihood = gmm.Fit();

    if (debugPath != null)
    {
        // Write the fitted Gaussian mixture model to debugPath:
        using (StreamWriter writer = new StreamWriter(debugPath))
        {
            writer.WriteLine("CN\tMajor Chr #\tMAF\tCoverage\tOmega\tMu0\tMu1\tSigma00\tSigma01\tSigma10\tSigma11");
            foreach (ModelPoint modelPoint in modelPoints)
            {
                writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}",
                    modelPoint.Ploidy.CopyNumber, modelPoint.Ploidy.MajorChromosomeCount,
                    modelPoint.Ploidy.MixedMinorAlleleFrequency, modelPoint.Ploidy.MixedCoverage,
                    modelPoint.Ploidy.Omega, modelPoint.Ploidy.Mu[0], modelPoint.Ploidy.Mu[1],
                    modelPoint.Ploidy.Sigma[0][0], modelPoint.Ploidy.Sigma[0][1],
                    modelPoint.Ploidy.Sigma[1][0], modelPoint.Ploidy.Sigma[1][1]);
            }

            writer.WriteLine("");
            writer.WriteLine("MAF\tCoverage\tPosterior Probabilities");
            StringBuilder sb = new StringBuilder();
            foreach (SegmentInfo segment in segments)
            {
                sb.Clear();
                sb.AppendFormat("{0}\t{1}", segment.MAF, segment.Coverage);
                foreach (ModelPoint modelPoint in modelPoints)
                {
                    sb.AppendFormat("\t{0}", segment.PosteriorProbs[modelPoint]);
                }
                writer.WriteLine(sb.ToString());
            }
        }
    }

    return likelihood;
}
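// Usage sketch (hypothetical call site; "GaussianFit.txt" is an illustrative file name):
// double likelihood = this.FitGaussians(model, usableSegments, Path.Combine(this.TempFolder, "GaussianFit.txt"));
// A larger returned likelihood means the fitted mixture explains the observed (MAF, Coverage) points
// better; the optional debug file records each component's mean vector and covariance matrix, plus the
// per-segment posterior probabilities.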
/// <summary>
/// Given the genome-wide copy number (CN) profile of the model, estimate the total number of rearrangements
/// that need to be applied to a diploid genome to transform it into the tumor genome under the given purity model.
/// The following logic is used:
/// 1) Assign a score of one rearrangement to a single CN state transition, i.e. the transition 2 -> 3 gets
///    a score of 1 while the transition 2 -> 4 gets a score of 2.
/// 2) A cumulative PercentCN of 80% or more for copy number bins > 2 indicates possible genome doubling.
///    Assign a score of 1 for the genome doubling event, use a copy number baseline of 4 instead of 2,
///    and count events as in step 1.
/// </summary>
protected double DiploidModelDistance(CoveragePurityModel model, List<SegmentInfo> usableSegments, long genomeLength)
{
    double totalCNevents = 0;
    int modelBaseline = 2;
    double amplificationPercentCN = 0;
    for (int copyNumber = 3; copyNumber < MaximumCopyNumber; copyNumber++)
        amplificationPercentCN += model.PercentCN[copyNumber];
    if (amplificationPercentCN > 0.8)
    {
        modelBaseline = 4;
        totalCNevents += 1;
    }
    for (int i = 0; i < model.CNs.Count; i++)
    {
        totalCNevents += Math.Abs(model.CNs[i] - modelBaseline) *
            (usableSegments[i].Segment.End - usableSegments[i].Segment.Begin) / (double)genomeLength;
    }
    model.DiploidDistance = (double)1.0 / Math.Max(0.001, totalCNevents);
    return totalCNevents;
}
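// Worked example (illustrative numbers): a model with 85% of the genome at CN 4 has
// amplificationPercentCN = 0.85 > 0.8, so modelBaseline becomes 4 and totalCNevents starts at 1.
// A segment covering 10% of the genome at CN 3 then adds |3 - 4| * 0.1 = 0.1, giving 1.1 events
// and DiploidDistance = 1 / 1.1 ~= 0.91. A near-diploid model with totalCNevents -> 0 instead gets
// DiploidDistance = 1 / 0.001 = 1000, the maximum.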
/// <summary>
/// Initialize model points given expected ploidy and purity values.
/// </summary>
protected List<ModelPoint> InitializeModelPoints(CoveragePurityModel model)
{
    List<ModelPoint> modelPoints = new List<ModelPoint>();

    double[] mu = GetProjectedMeanCoverage(model.DiploidCoverage);
    double diploidMAF = this.AllPloidies[3].MinorAlleleFrequency; // %%% Magic number!

    // Update the parameters in each SegmentPloidy object, and construct corresponding SegmentInfo objects:
    foreach (SegmentPloidy ploidy in this.AllPloidies)
    {
        ModelPoint point = new ModelPoint();
        double pureCoverage = mu[ploidy.CopyNumber];
        point.Coverage = (model.Purity * pureCoverage) + (1 - model.Purity) * model.DiploidCoverage;
        double pureMAF = ploidy.MinorAlleleFrequency;
        if (ploidy.MajorChromosomeCount * 2 == ploidy.CopyNumber)
        {
            // Balanced allele ratio: the diploid normal contributes the observed diploid MAF.
            point.MAF = (model.Purity * ploidy.CopyNumber * pureMAF) + ((1 - model.Purity) * 2 * diploidMAF);
            point.MAF /= model.Purity * ploidy.CopyNumber + (1 - model.Purity) * 2;
            if (double.IsNaN(point.MAF)) point.MAF = 0;
        }
        else
        {
            // Unbalanced allele ratio: the diploid normal contributes one minor-allele copy.
            point.MAF = (model.Purity * ploidy.CopyNumber * pureMAF) + ((1 - model.Purity) * 1);
            point.MAF /= model.Purity * ploidy.CopyNumber + (1 - model.Purity) * 2;
        }
        point.Ploidy = ploidy;
        modelPoints.Add(point);
        point.CN = ploidy.CopyNumber;
        ploidy.MixedMinorAlleleFrequency = point.MAF;
        ploidy.MixedCoverage = point.Coverage;
    }

    return modelPoints;
}
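// Mixture-math sketch (illustrative helper, not part of the caller): a tumor of purity p at copy
// number CN is diluted by (1 - p) diploid normal cells; coverage mixes per cell while MAF mixes per
// chromosome copy. Calling this with normalMAF = diploidMAF reproduces the balanced branch above,
// and with normalMAF = 0.5 the unbalanced branch (the normal contributes one minor copy out of two).
private static double MixMAF(double purity, int copyNumber, double pureMAF, double normalMAF)
{
    double tumorCopies = purity * copyNumber;   // chromosome copies contributed by tumor cells
    double normalCopies = (1 - purity) * 2;     // chromosome copies contributed by diploid normal cells
    return (tumorCopies * pureMAF + normalCopies * normalMAF) / (tumorCopies + normalCopies);
}
// For example, a clonal one-copy loss (CN 1, pure MAF 0) at purity 0.5 gives
// MixMAF(0.5, 1, 0.0, 0.5) = (0 + 0.5) / (0.5 + 1.0) = 0.33, i.e. well above the pure-tumor MAF of 0.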
/// <summary>
/// Initialize model points given the expected coverage and purity values, then select the numClusters
/// points that best fit the observed segments (used to seed segment clustering).
/// </summary>
protected List<ModelPoint> InitializeModelPoints(List<SegmentInfo> segments, double coverage, int percentPurity, int numClusters)
{
    List<ModelPoint> modelPoints = new List<ModelPoint>();
    CoveragePurityModel model = new CoveragePurityModel();
    model.DiploidCoverage = coverage;
    model.Purity = percentPurity / 100f;

    double[] mu = GetProjectedMeanCoverage(model.DiploidCoverage);
    double diploidMAF = this.AllPloidies[3].MinorAlleleFrequency; // %%% Magic number!

    // Update the parameters in each SegmentPloidy object, and construct corresponding SegmentInfo objects
    // (same mixture math as InitializeModelPoints(CoveragePurityModel)):
    foreach (SegmentPloidy ploidy in this.AllPloidies)
    {
        ModelPoint point = new ModelPoint();
        double pureCoverage = mu[ploidy.CopyNumber];
        point.Coverage = (model.Purity * pureCoverage) + (1 - model.Purity) * model.DiploidCoverage;
        double pureMAF = ploidy.MinorAlleleFrequency;
        if (ploidy.MajorChromosomeCount * 2 == ploidy.CopyNumber)
        {
            point.MAF = (model.Purity * ploidy.CopyNumber * pureMAF) + ((1 - model.Purity) * 2 * diploidMAF);
            point.MAF /= model.Purity * ploidy.CopyNumber + (1 - model.Purity) * 2;
            if (double.IsNaN(point.MAF)) point.MAF = 0;
        }
        else
        {
            point.MAF = (model.Purity * ploidy.CopyNumber * pureMAF) + ((1 - model.Purity) * 1);
            point.MAF /= model.Purity * ploidy.CopyNumber + (1 - model.Purity) * 2;
        }
        point.Ploidy = ploidy;
        modelPoints.Add(point);
        point.CN = ploidy.CopyNumber;
        ploidy.MixedMinorAlleleFrequency = point.MAF;
        ploidy.MixedCoverage = point.Coverage;
    }

    // Estimate the distance between each model point and the segments:
    List<double> modelPointsScore = new List<double>();
    foreach (ModelPoint modelPoint in modelPoints)
    {
        List<double> distanceList = new List<double>();
        foreach (SegmentInfo info in segments)
        {
            if (info.MAF >= 0)
                distanceList.Add(GetModelDistance(info.Coverage, modelPoint.Coverage, info.MAF, modelPoint.MAF));
        }
        distanceList.Sort();
        // Score each model point by its 15th-percentile distance, i.e. prefer model points
        // with a good fit to the observed values:
        double distance15thPercentile = distanceList[Convert.ToInt32(distanceList.Count * 0.15)];
        modelPointsScore.Add(distance15thPercentile);
    }

    // Sort the scores, keeping track of the original indices:
    var sortedScores = modelPointsScore.Select((x, i) => new KeyValuePair<double, int>(x, i)).OrderBy(x => x.Key).ToList();
    List<int> scoresIndex = sortedScores.Select(x => x.Value).ToList();

    // Return the numClusters best-fitting model points:
    List<ModelPoint> selectedModelPoints = new List<ModelPoint>();
    for (int i = 0; i < numClusters; i++)
    {
        modelPoints[scoresIndex[i]].Cluster = i + 1;
        selectedModelPoints.Add(modelPoints[scoresIndex[i]]);
    }
    return selectedModelPoints;
}
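// Selection sketch (worked numbers, illustrative only): with 200 segments that have a valid MAF,
// each candidate model point is scored by distanceList[(int)(200 * 0.15)], i.e. its 30th-smallest
// distance to a segment. Taking a low percentile rather than the mean rewards points sitting inside
// a dense cluster of segments while ignoring far-away mass; the numClusters lowest-scoring points
// become the cluster seeds.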
/// <summary>
/// Assign copy number calls to segments, and produce extra headers for the CNV vcf file giving the
/// overall estimated purity and ploidy.
/// </summary>
protected List<string> CallCNVUsingSNVFrequency(double? localSDmetric, string referenceFolder)
{
    List<string> headers = new List<string>();
    if (this.CNOracle != null)
    {
        this.DerivePurityEstimateFromVF();
    }

    // Get the genome length:
    GenomeMetadata genomeMetaData = new GenomeMetadata();
    genomeMetaData.Deserialize(Path.Combine(referenceFolder, "GenomeSize.xml"));

    // Derive a model of diploid coverage, and overall tumor purity:
    this.Model = ModelOverallCoverageAndPurity(genomeMetaData.Length);

    // Make preliminary ploidy calls for all segments. For those segments which fit their ploidy reasonably
    // well, accumulate information about the MAF by site and coverage by bin.
    this.HeterogeneousSegmentsSignature.Sort();
    if (AllPloidies.First().Sigma == null)
    {
        AssignPloidyCalls();
    }
    else
    {
        AssignPloidyCallsGaussianMixture();
    }

    // If the somatic SNV/indel file was provided, use it to derive another estimate of purity. And, if we
    // didn't make many CNV calls, report that estimate instead of the estimate derived from our overall model.
    if (!string.IsNullOrEmpty(SomaticVCFPath))
    {
        try
        {
            double SNVPurityEstimate = EstimatePurityFromSomaticSNVs();
            this.SelectPurityEstimate(SNVPurityEstimate, genomeMetaData.Length);
        }
        catch (Exception e)
        {
            Console.Error.WriteLine("* Error deriving purity estimate from somatic SNVs. Details:\n{0}", e.ToString());
        }
    }

    // Add some extra information to the vcf file header:
    headers.Add(string.Format("##EstimatedTumorPurity={0:F2}", this.Model.Purity));
    double totalPloidy = 0;
    double totalWeight = 0;
    foreach (CanvasSegment segment in this.Segments)
    {
        totalWeight += segment.End - segment.Begin;
        totalPloidy += segment.CopyNumber * (segment.End - segment.Begin);
    }
    headers.Add(string.Format("##OverallPloidy={0:F2}", totalPloidy / Math.Max(1, totalWeight)));
    headers.Add(string.Format("##PurityModelFit={0:F4}", this.Model.Deviation));
    headers.Add(string.Format("##InterModelDistance={0:F4}", this.Model.InterModelDistance));
    headers.Add(string.Format("##EstimatedChromosomeCount={0:F2}", this.EstimateChromosomeCount()));
    headers.Add(string.Format("##LocalSDmetric={0:F2}", localSDmetric));
    headers.Add(string.Format("##Heterogeneity={0:F2}", this.Model.HeterogeneityIndex));
    return headers;
}
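// Example of the emitted header lines (values are illustrative, not from a real sample):
// ##EstimatedTumorPurity=0.74
// ##OverallPloidy=2.13
// ##PurityModelFit=0.0291
// ##InterModelDistance=0.0388
// ##EstimatedChromosomeCount=46.21
// ##LocalSDmetric=5.12
// ##Heterogeneity=0.25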
/// <summary>
/// Identify the tuple (DiploidCoverage, OverallPurity) which best models our overall distribution
/// of (MAF, Coverage) data across all segments. Consider various tuples (first with a coarse-grained
/// and then a fine-grained search), and for each one, measure the distortion - the average distance
/// (weighted by segment length) between actual and modeled (MAF, Coverage) coordinates.
/// </summary>
protected CoveragePurityModel ModelOverallCoverageAndPurity(long genomeLength)
{
    List<SegmentInfo> usableSegments;
    // Identify usable segments using our MinimumVariantFrequenciesForInformativeSegment cutoff,
    // then (if we don't find enough) try again with progressively more permissive cutoffs.
    while (true)
    {
        usableSegments = GetUsableSegmentsForModeling(this.Segments);
        int validMAFCount = usableSegments.Count(x => x.MAF >= 0);
        if (validMAFCount > Math.Min(20, this.Segments.Count)) break; // We have enough usable segments with non-null MAF
        if (MinimumVariantFrequenciesForInformativeSegment <= 5) break; // Give up on modeling
        MinimumVariantFrequenciesForInformativeSegment -= 15;
    }
    Console.WriteLine("Modeling overall coverage/purity across {0} segments", usableSegments.Count);
    if (usableSegments.Count < 10)
        throw new UncallableDataException("Cannot model coverage/purity with fewer than 10 segments.");

    // When computing distances between model and actual points, we want to give roughly equal weight
    // to coverage (which covers a large range) and MAF, which falls in the range (0, 0.5).
    // If we already knew the diploid coverage, then we'd know just how to scale things (catch-22).
    // Let's assume that the median coverage is a sane thing to use for scaling:
    List<float> tempCoverageList = new List<float>();

    // Segment clustering using Gaussian expectation maximisation.
    // Step 0: Prepare model parameters
    foreach (SegmentInfo info in usableSegments) tempCoverageList.Add(Convert.ToSingle(info.Coverage));
    Tuple<float, float, float> coverageQuartiles = CanvasCommon.Utilities.Quartiles(tempCoverageList);
    int minCoverageLevel = Convert.ToInt32(coverageQuartiles.Item1);
    int maxCoverageLevel = Convert.ToInt32(coverageQuartiles.Item3);
    int medianCoverageLevel = Convert.ToInt32(coverageQuartiles.Item2);
    this.CoverageWeightingFactor = this.CoverageWeighting / medianCoverageLevel;

    int bestNumClusters = 0;
    // Cluster analysis needs a large number of segments:
    if (usableSegments.Count > 100)
    {
        List<float> tempMAFList = new List<float>();
        foreach (SegmentInfo info in usableSegments) tempMAFList.Add(Convert.ToSingle(info.MAF));
        Tuple<float, float, float> MAFQuartiles = CanvasCommon.Utilities.Quartiles(tempMAFList);
        double minMAF = Math.Max(Convert.ToDouble(MAFQuartiles.Item1) - 0.05, 0.01);
        double maxMAF = Math.Min(Convert.ToDouble(MAFQuartiles.Item3) + 0.05, 0.46);

        // Step 1: Find outliers
        double knearestNeighbourCutoff = KnearestNeighbourCutoff(usableSegments);
        // Step 2: Find the best CoverageWeightingFactor
        double bestCoverageWeightingFactor = BestCoverageWeightingFactor(usableSegments, maxCoverageLevel, medianCoverageLevel, knearestNeighbourCutoff);
        // Step 3: Find the optimal number of clusters
        List<ModelPoint> modelPoints = BestNumClusters(usableSegments, medianCoverageLevel, bestCoverageWeightingFactor, knearestNeighbourCutoff);
        bestNumClusters = modelPoints.Count;
        // Step 4: Find segment clusters using the final model
        GaussianMixtureModel gmm = new GaussianMixtureModel(modelPoints, usableSegments, medianCoverageLevel,
            bestCoverageWeightingFactor, knearestNeighbourCutoff);
        double likelihood = gmm.runExpectationMaximization();
        // Step 5: Write results
        string debugPathClusterModel = Path.Combine(this.TempFolder, "ClusteringModel.txt");
        if (!string.IsNullOrEmpty(debugPathClusterModel))
        {
            using (StreamWriter debugWriter = new StreamWriter(debugPathClusterModel))
            {
                debugWriter.WriteLine("#MAF\tCoverage\tClusterID");
                foreach (ModelPoint modelPoint in modelPoints)
                {
                    debugWriter.WriteLine("{0}\t{1}\t{2}", modelPoint.Ploidy.Mu[0], modelPoint.Ploidy.Mu[1], modelPoint.Cluster);
                }
                debugWriter.WriteLine();
                debugWriter.WriteLine("#MAF\tCoverage\tClusterID");
                foreach (SegmentInfo info in usableSegments)
                {
                    debugWriter.WriteLine("{0}\t{1}\t{2}", info.MAF, info.Coverage, info.Cluster);
                }
            }
        }
    }

    // Note: Don't consider purity below 20% (at this point), because that creates a model that is very
    // noise-sensitive. We tried using a "tumor" sample that is actually just the real normal: we could
    // overfit this data as very low purity (5%) and make lots of (bogus) calls which fit the noise in
    // coverage and MAF.
    double bestDeviation = double.MaxValue;
    List<CoveragePurityModel> allModels = new List<CoveragePurityModel>();

    // Set the best somatic model to pre-specified ploidy and purity values:
    if (this.userPloidy != null && this.userPurity != null)
    {
        CoveragePurityModel bestModel = new CoveragePurityModel();
        bestModel.DiploidCoverage = medianCoverageLevel * Convert.ToDouble(this.userPloidy) / 2.0;
        bestModel.Purity = Convert.ToDouble(this.userPurity);
        this.ModelDeviation(bestModel, usableSegments, bestNumClusters);
        this.DiploidModelDistance(bestModel, usableSegments, genomeLength);
        return bestModel;
    }
    else // Find the best somatic model
    {
        // Coarse search: Consider various (coverage, purity) tuples.
        int minCoverage = (int)Math.Max(10, medianCoverageLevel / 2.5);
        int maxCoverage = (int)Math.Max(10, medianCoverageLevel * 2.5);
        int coverageStep = Math.Max(1, (maxCoverage - minCoverage) / 80);
        Console.WriteLine(">>>DiploidCoverage: Consider {0}...{1} step {2}", minCoverage, maxCoverage, coverageStep);
        for (int coverage = minCoverage; coverage < maxCoverage; coverage += coverageStep)
        {
            // Iterate over the purity range:
            for (int percentPurity = 20; percentPurity <= 100; percentPurity += 5)
            {
                CoveragePurityModel model = new CoveragePurityModel();
                model.DiploidCoverage = coverage;
                model.Purity = percentPurity / 100f;
                this.ModelDeviation(model, usableSegments, bestNumClusters);
                this.DiploidModelDistance(model, usableSegments, genomeLength);
                if (model.Deviation < bestDeviation && model.Ploidy < this.MaxAllowedPloidy && model.Ploidy > this.MinAllowedPloidy)
                {
                    bestDeviation = model.Deviation;
                }
                // Exclude models with unrealistic genome ploidies:
                if (model.Ploidy < this.MaxAllowedPloidy && model.Ploidy > this.MinAllowedPloidy)
                    allModels.Add(model);
            }
        }

        // New logic for model selection:
        // - First, compute the best model deviation. This establishes a baseline for how large the deviation
        //   is allowed to get in an acceptable model. Allow somewhat higher deviation for targeted data,
        //   since we see extra noise there.
        // - Review models. Discard any with unacceptable deviation. Note the best attainable % copy number 2
        //   and % normal.
        // - For each model, scale PercentNormal to a range of 0..100 where 100 = the best number seen for any
        //   acceptable model. Similarly for PercentCN2, and similarly for DeviationScore:
        //   BestDeviation = 1, WorstAllowedDeviation = 0.
        // - Choose a model (with acceptable deviation) which maximizes a score of the form:
        //   PercentNormal + a * PercentCN2 + b * DeviationScore
        double worstAllowedDeviation = bestDeviation * this.DeviationFactor;
        double bestCN2 = 0;
        double bestCN2Normal = 0;
        double bestDiploidDistance = 0;

        // Derive max values for scaling:
        int counter = 0;
        List<double> deviations = new List<double>();
        foreach (CoveragePurityModel model in allModels)
        {
            if (model.Deviation < worstAllowedDeviation) counter++;
            deviations.Add(model.Deviation);
        }
        deviations.Sort();
        if (counter < this.DeviationIndexCutoff)
        {
            worstAllowedDeviation = deviations[Math.Min(this.DeviationIndexCutoff, deviations.Count - 1)];
        }

        double bestAccuracyDeviation = double.MaxValue;
        double bestPrecisionDeviation = double.MaxValue;
        // Derive max values for scaling:
        foreach (CoveragePurityModel model in allModels)
        {
            bestAccuracyDeviation = Math.Min(bestAccuracyDeviation, model.AccuracyDeviation);
            bestPrecisionDeviation = Math.Min(bestPrecisionDeviation, model.PrecisionDeviation);
            if (model.Deviation > worstAllowedDeviation) continue;
            if (model.PercentCN[2] > bestCN2) bestCN2 = model.PercentCN[2];
            if (model.PercentNormal > bestCN2Normal) bestCN2Normal = model.PercentNormal;
            if (model.DiploidDistance > bestDiploidDistance) bestDiploidDistance = model.DiploidDistance;
        }

        // Coarse search to find the best ploidy and purity model:
        List<CoveragePurityModel> bestModels = new List<CoveragePurityModel>();
        CoveragePurityModel bestModel = null;
        double bestScore = 0;
        // Holds scores for all models:
        List<double> scores = new List<double>();
        // Save all purity and ploidy models to a file:
        string debugPath = Path.Combine(this.TempFolder, "PurityModel.txt");
        using (StreamWriter debugWriter = new StreamWriter(debugPath))
        {
            debugWriter.Write("#Purity\tDiploidCoverage\t");
            debugWriter.Write("Deviation\tAccuracyDeviation\tPrecisionDeviation\tWorstAllowedDeviation\tAccDev/best\tPrecDev/best\t");
            debugWriter.Write("DeviationScore\tScore\tPloidy\t");
            debugWriter.Write("Normal\tNormal/best\tCN2\tCN2/Best\t");
            debugWriter.Write("DiploidDistance\tDiploidDistance/Best");
            debugWriter.WriteLine();
            foreach (CoveragePurityModel model in allModels)
            {
                // Filter models with unacceptable deviation:
                if (model.Deviation > worstAllowedDeviation) continue;
                // Transform purity into a weighting factor to penalize abnormal ploidies at low purity:
                // (1.5 - 0.5) = min-max range of the new weighting scale; (1.0 - 0.2) = min-max range of the
                // purity values. This transformation yields a maximal lowPurityWeightingFactor of 1.5 for the
                // lowest-purity model and a minimal value of 0.75 for the highest-purity model.
                double lowPurityWeightingFactor = 1.5 / ((1.5 - 0.5) / (1.0 - 0.2) * (model.Purity - 0.2) + 1.0);
                double score = this.PercentNormal2WeightingFactor * model.PercentNormal / Math.Max(0.01, bestCN2Normal);
                score += lowPurityWeightingFactor * this.CN2WeightingFactor * model.PercentCN[2] / Math.Max(0.01, bestCN2);
                score += this.DeviationScoreWeightingFactor * (worstAllowedDeviation - model.Deviation) / (worstAllowedDeviation - bestDeviation);
                score += this.DiploidDistanceScoreWeightingFactor * model.DiploidDistance / Math.Max(0.01, bestDiploidDistance);
                scores.Add(score);
                bestModels.Add(model);
                // Write to file:
                debugWriter.Write("{0}\t{1}\t", (int)Math.Round(100 * model.Purity), model.DiploidCoverage);
                debugWriter.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t", model.Deviation, model.AccuracyDeviation, model.PrecisionDeviation,
                    worstAllowedDeviation, model.AccuracyDeviation / bestAccuracyDeviation, model.PrecisionDeviation / bestPrecisionDeviation);
                debugWriter.Write("{0}\t{1}\t{2}\t", (worstAllowedDeviation - model.Deviation) / (worstAllowedDeviation - bestDeviation), score, model.Ploidy);
                debugWriter.Write("{0}\t{1}\t{2}\t{3}\t", model.PercentNormal, model.PercentNormal / Math.Max(0.01, bestCN2Normal),
                    model.PercentCN[2], model.PercentCN[2] / Math.Max(0.01, bestCN2));
                debugWriter.Write("{0}\t{1}\t", model.DiploidDistance, model.DiploidDistance / Math.Max(0.01, bestDiploidDistance));
                debugWriter.WriteLine();
                if (score > bestScore)
                {
                    bestModel = model;
                    bestScore = score;
                }
            }
        }

        // Sort the scores in descending order, keeping track of the original indices, so that
        // scoresIndex[0] refers to the highest-scoring model:
        var sortedScores = scores.Select((x, i) => new KeyValuePair<double, int>(x, i)).OrderByDescending(x => x.Key).ToList();
        List<int> scoresIndex = sortedScores.Select(x => x.Value).ToList();

        // interModelDistance shows the genome edit distance between the best model and the other top models
        // (defined by MaximumRelatedModels). The premise is that if the top models imply widely different
        // genome baselines (leading to a high interModelDistance), the overall modeling approach might be
        // more unstable.
        double interModelDistance = 0;
        // Start at one, since model #0 is the highest-scoring model to compare to:
        for (int i = 1; i < MaximumRelatedModels; i++)
        {
            interModelDistance += CalculateModelDistance(bestModels[scoresIndex[0]], bestModels[scoresIndex[i]], usableSegments, genomeLength);
        }
        interModelDistance /= (double)MaximumRelatedModels;

        Console.WriteLine(">>> Initial model: Deviation {0:F5}, coverage {1}, purity {2:F1}%, CN2 {3:F2}",
            bestModel.Deviation, bestModel.DiploidCoverage, 100 * bestModel.Purity, bestModel.PercentCN[2]);

        // Refine the search: Use smaller step sizes in the neighborhood of the initial model.
        minCoverage = (int)Math.Round(bestModel.DiploidCoverage) - 5;
        maxCoverage = (int)Math.Round(bestModel.DiploidCoverage) + 5;
        int minPurity = Math.Max(20, (int)Math.Round(bestModel.Purity * 100) - 10);
        int maxPurity = Math.Min(100, (int)Math.Round(bestModel.Purity * 100) + 10); // %%% magic numbers
        bestDeviation = double.MaxValue;
        bestModel = null;
        for (int coverage = minCoverage; coverage <= maxCoverage; coverage++)
        {
            for (int percentPurity = minPurity; percentPurity <= maxPurity; percentPurity++)
            {
                CoveragePurityModel model = new CoveragePurityModel();
                model.DiploidCoverage = coverage;
                model.Purity = percentPurity / 100f;
                this.ModelDeviation(model, usableSegments, bestNumClusters);
                if (bestModel == null || model.Deviation < bestModel.Deviation)
                {
                    bestModel = model;
                }
            }
        }

        string debugPathCNVModeling = Path.Combine(this.TempFolder, "CNVModeling.txt");
        ModelDeviation(bestModel, usableSegments, bestNumClusters, null, true, debugPathCNVModeling);
        Console.WriteLine();
        Console.WriteLine(">>> Refined model: Deviation {0:F5}, coverage {1}, purity {2:F1}%",
            bestModel.Deviation, bestModel.DiploidCoverage, bestModel.Purity * 100);
        Console.WriteLine();

        // Clear the per-ploidy Gaussian parameters before returning:
        foreach (SegmentPloidy ploidy in AllPloidies)
        {
            ploidy.Omega = 0;
            ploidy.Mu = null;
            ploidy.Sigma = null;
        }
        if (!bestModel.InterModelDistance.HasValue)
        {
            bestModel.InterModelDistance = interModelDistance;
        }
        return bestModel;
    }
}
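// Scaling check for lowPurityWeightingFactor in the scoring loop above (values follow directly from
// the formula; purity is only considered in the range 0.2..1.0):
//   purity = 0.20 -> 1.5 / ((1.0 / 0.8) * 0.00 + 1.0) = 1.50   (strongest pull toward CN2)
//   purity = 0.60 -> 1.5 / ((1.0 / 0.8) * 0.40 + 1.0) = 1.00
//   purity = 1.00 -> 1.5 / ((1.0 / 0.8) * 0.80 + 1.0) = 0.75   (weakest pull toward CN2)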
/// <summary>
/// Helper function for ModelOverallCoverageAndPurity. Measure the deviation (mismatch) between our
/// model of expected coverage + minor allele frequency, and the actual data.
/// Note that this method updates the parameters in this.AllPloidies to match this model.
/// TotalDeviation = PrecisionWeight * PrecisionDeviation + (1 - PrecisionWeight) * AccuracyDeviation
/// PrecisionDeviation is the weighted average of the distances between segments and their assigned ploidies.
/// AccuracyDeviation is the weighted average of the distances between segment centroids and the
/// corresponding ploidies.
/// </summary>
protected double ModelDeviation(CoveragePurityModel model, List<SegmentInfo> segments, int numClusters,
    string debugPathClusterInfo = null, bool bestModel = false, string debugPath = null)
{
    List<ModelPoint> modelPoints = InitializeModelPoints(model);
    double precisionDeviation = 0;
    this.RefineDiploidMAF(segments, modelPoints);

    // Cluster our segments:
    Array.Clear(model.PercentCN, 0, model.PercentCN.Length);
    model.CNs.Clear();
    double totalWeight = 0;
    double totalBasesNormal = 0;
    foreach (SegmentInfo info in segments)
    {
        double bestDistance = double.MaxValue;
        int bestCN = 0;
        ModelPoint bestModelPoint = null;
        foreach (ModelPoint modelPoint in modelPoints)
        {
            double distance = GetModelDistance(info.Coverage, modelPoint.Coverage, info.MAF, modelPoint.MAF);
            if (distance < bestDistance)
            {
                bestDistance = distance;
                bestCN = modelPoint.CN;
                info.Ploidy = modelPoint.Ploidy;
                bestModelPoint = modelPoint;
            }
        }
        bestDistance = Math.Sqrt(bestDistance);
        info.Distance = bestDistance;
        precisionDeviation += bestDistance * info.Weight;
        totalWeight += info.Weight;
        model.PercentCN[bestCN] += info.Weight;
        if (bestCN == 2 && info.Ploidy.MajorChromosomeCount == 1) totalBasesNormal += info.Weight;
        bestModelPoint.Weight += info.Weight;
        bestModelPoint.EmpiricalCoverage += info.Weight * info.Coverage;
        if (info.MAF >= 0)
        {
            bestModelPoint.EmpiricalMAF += info.Weight * info.MAF;
            bestModelPoint.MAFWeight += info.Weight;
        }
        // Add the CN variant of the segment to the model:
        if (bestCN == 2 && info.Ploidy.MajorChromosomeCount == 2)
            // Approximate LOH; we presume that LOH counts as one event, hence similar in effect to HET deletions:
            model.CNs.Add(1);
        else
            model.CNs.Add(bestCN);
    }
    precisionDeviation /= totalWeight;

    // Compute AccuracyDeviation:
    double accuracyDeviation = 0;
    foreach (ModelPoint modelPoint in modelPoints)
    {
        if (modelPoint.Weight == 0) continue;
        modelPoint.EmpiricalCoverage /= modelPoint.Weight;
        if (modelPoint.MAFWeight > 0) modelPoint.EmpiricalMAF /= modelPoint.MAFWeight;
        double distance = this.GetModelDistance(modelPoint.Coverage, modelPoint.EmpiricalCoverage, modelPoint.MAF, modelPoint.EmpiricalMAF);
        distance = Math.Sqrt(distance);
        accuracyDeviation += distance * modelPoint.Weight;
        if (!string.IsNullOrEmpty(debugPath))
        {
            Console.WriteLine("{0}\t{1}\t{2:F2}\t{3:F0}\t{4:F2}\t{5:F0}\t{6:F3},{7:F0}",
                modelPoint.CN, modelPoint.Ploidy.MajorChromosomeCount,
                modelPoint.MAF, modelPoint.Coverage,
                modelPoint.EmpiricalMAF, modelPoint.EmpiricalCoverage,
                distance, modelPoint.Weight);
        }
    }
    accuracyDeviation /= totalWeight;

    // Standard somatic model deviation:
    double tempDeviation = precisionDeviation * 0.5f + 0.5f * accuracyDeviation;

    // Compute cluster deviation:
    int heterogeneousClusters = 0;
    double heterogeneityIndex = 0;
    double clusterDeviation = ClusterDeviation(segments, numClusters, tempDeviation, out heterogeneousClusters,
        out heterogeneityIndex, bestModel, debugPathClusterInfo);

    // Compute total deviation:
    double totalDeviation;
    if (heterogeneousClusters > 0)
        totalDeviation = PrecisionWeightingFactor * precisionDeviation + PrecisionWeightingFactor * accuracyDeviation + PrecisionWeightingFactor * clusterDeviation;
    else
        totalDeviation = tempDeviation;

    // Estimate the abundance of each CN state:
    for (int index = 0; index < model.PercentCN.Length; index++)
    {
        model.PercentCN[index] /= totalWeight;
    }

    // Get the model ploidy:
    for (int index = 0; index < model.PercentCN.Length; index++)
    {
        model.Ploidy += index * model.PercentCN[index];
    }
    model.PercentNormal = totalBasesNormal / totalWeight;

    if (!string.IsNullOrEmpty(debugPath))
    {
        try
        {
            using (StreamWriter debugWriter = new StreamWriter(debugPath))
            {
                debugWriter.WriteLine("#MAF\tCoverage\tGT");
                foreach (ModelPoint modelPoint in modelPoints)
                {
                    string gt = modelPoint.Ploidy.MajorChromosomeCount.ToString() + "/" + modelPoint.CN.ToString();
                    debugWriter.WriteLine("{0}\t{1}\t{2}", modelPoint.MAF, modelPoint.Coverage, gt);
                }
                debugWriter.WriteLine();
                debugWriter.WriteLine("#MAF\tCoverage\tBestDistance\tChromosome\tBegin\tEnd\tLength\tTruthSetCN");
                foreach (SegmentInfo info in segments)
                {
                    // Find the best fit for this segment:
                    double bestDistance = double.MaxValue;
                    foreach (ModelPoint modelPoint in modelPoints)
                    {
                        double distance = GetModelDistance(info.Coverage, modelPoint.Coverage, info.MAF, modelPoint.MAF);
                        if (distance < bestDistance) bestDistance = distance;
                    }
                    bestDistance = Math.Sqrt(bestDistance);
                    debugWriter.Write("{0}\t{1}\t", info.MAF, info.Coverage);
                    debugWriter.Write("{0}\t{1}\t{2}\t{3}\t", bestDistance, info.Segment.Chr, info.Segment.Begin, info.Segment.End);
                    debugWriter.Write("{0}\t", info.Segment.End - info.Segment.Begin);
                    int CN = this.GetKnownCNForSegment(info.Segment);
                    debugWriter.Write("{0}\t", CN);
                    debugWriter.WriteLine();
                }
            }
        }
        catch (IOException ex)
        {
            // Whine, but continue - not outputting this file is not fatal.
            Console.Error.WriteLine(ex.ToString());
        }
    }

    // Make sure that the CN profile length equals the number of usable segments:
    if (model.CNs.Count != segments.Count)
    {
        throw new IndexOutOfRangeException(String.Concat("Canvas Somatic Caller error: index sizes do not match, ",
            model.CNs.Count, " != ", segments.Count));
    }
    model.PrecisionDeviation = precisionDeviation;
    model.AccuracyDeviation = accuracyDeviation;
    model.Deviation = totalDeviation;
    model.HeterogeneityIndex = heterogeneityIndex;
    return totalDeviation;
}
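// Deviation roll-up, worked example (illustrative numbers): with precisionDeviation = 0.04 and
// accuracyDeviation = 0.02, tempDeviation = 0.5 * 0.04 + 0.5 * 0.02 = 0.03. Only when
// ClusterDeviation reports heterogeneous clusters does the three-term form take over
// (PrecisionWeightingFactor applied to precision, accuracy and cluster deviations alike);
// otherwise clusterDeviation is discarded and totalDeviation = tempDeviation = 0.03.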