/// <summary>
/// Fit a Gaussian mixture model.
/// Fix the means to the model MAF and Coverage and run the EM algorithm until convergence.
/// Compute the empirical MAF and Coverage.
/// Fix the means to the empirical MAF and Coverage and run the EM algorithm again until convergence.
/// Always estimate the full covariance matrix?
/// </summary>
/// <param name="model"></param>
/// <param name="segments"></param>
/// <param name="debugPath"></param>
/// <returns></returns>
private double FitGaussians(CoverageModel model, List<SegmentInfo> segments, string debugPath = null)
{
    List<ModelPoint> modelPoints = InitializeModelPoints(model);

    GaussianMixtureModel gmm = new GaussianMixtureModel(modelPoints, segments, this.MeanCoverage, this.CoverageWeightingFactor, 0);
    double likelihood = gmm.Fit();

    if (debugPath != null)
    {
        // write Gaussian mixture model to debugPath
        using (StreamWriter writer = new StreamWriter(debugPath))
        {
            writer.WriteLine("CN\tMajor Chr #\tMAF\tCoverage\tOmega\tMu0\tMu1\tSigma00\tSigma01\tSigma10\tSigma11");
            foreach (ModelPoint modelPoint in modelPoints)
            {
                writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}",
                    modelPoint.Ploidy.CopyNumber, modelPoint.Ploidy.MajorChromosomeCount,
                    modelPoint.Ploidy.MixedMinorAlleleFrequency, modelPoint.Ploidy.MixedCoverage,
                    modelPoint.Ploidy.Omega, modelPoint.Ploidy.Mu[0], modelPoint.Ploidy.Mu[1],
                    modelPoint.Ploidy.Sigma[0][0], modelPoint.Ploidy.Sigma[0][1],
                    modelPoint.Ploidy.Sigma[1][0], modelPoint.Ploidy.Sigma[1][1]);
            }

            writer.WriteLine("");
            writer.WriteLine("MAF\tCoverage\tPosterior Probabilities");
            StringBuilder sb = new StringBuilder();
            foreach (SegmentInfo segment in segments)
            {
                sb.Clear();
                sb.AppendFormat("{0}\t{1}", segment.MAF, segment.Coverage);
                foreach (ModelPoint modelPoint in modelPoints)
                {
                    sb.AppendFormat("\t{0}", segment.PosteriorProbs[modelPoint]);
                }
                writer.WriteLine(sb.ToString());
            }
        }
    }
    return likelihood;
}
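// Illustrative sketch (not called by FitGaussians above): the quantity that the EM E-step of a Gaussian
// mixture model weighs for every (MAF, coverage) point against every model point is the bivariate normal
// density below, with Mu and Sigma as written to the debug file. The actual density/E-step code lives in
// GaussianMixtureModel; this helper only documents the math and assumes nothing about that class's internals.
private static double BivariateNormalDensitySketch(double[] x, double[] mu, double[][] sigma)
{
    // Density of N(mu, sigma) evaluated at x, for a 2x2 covariance matrix sigma.
    double det = sigma[0][0] * sigma[1][1] - sigma[0][1] * sigma[1][0];
    double dx0 = x[0] - mu[0];
    double dx1 = x[1] - mu[1];
    // Mahalanobis term (x - mu)' * inverse(sigma) * (x - mu), with the 2x2 inverse written out explicitly.
    double mahalanobis = (sigma[1][1] * dx0 * dx0 - (sigma[0][1] + sigma[1][0]) * dx0 * dx1 + sigma[0][0] * dx1 * dx1) / det;
    return Math.Exp(-0.5 * mahalanobis) / (2.0 * Math.PI * Math.Sqrt(det));
}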
/// <summary>
/// Find the coverage weighting factor that maximizes the likelihood of the Expectation-Maximization model,
/// scanning a grid of candidate factors.
/// </summary>
public double BestCoverageWeightingFactor(List<SegmentInfo> usableSegments, int maxCoverageLevel, int medianCoverageLevel, double knearestNeighbourCutoff)
{
    double bestLikelihood = double.MinValue;
    double bestCoverageWeightingFactor = 0;
    // Magic scaling for now - keep small to penalize coverage; tested on a 50+ ground-truth corpus
    double maxCoverageWeightingFactor = this.CoverageWeighting / medianCoverageLevel;
    double minCoverageWeightingFactor = 0.1 / maxCoverageLevel;
    double stepCoverageWeightingFactor = Math.Max(0.00001, (maxCoverageWeightingFactor - minCoverageWeightingFactor) / 10);

    for (double coverageWeighting = minCoverageWeightingFactor; coverageWeighting < maxCoverageWeightingFactor; coverageWeighting += stepCoverageWeightingFactor)
    {
        List<ModelPoint> tempModelPoints = InitializeModelPoints(usableSegments, medianCoverageLevel / 2.0, 90, 6);
        GaussianMixtureModel tempgmm = new GaussianMixtureModel(tempModelPoints, usableSegments, medianCoverageLevel, coverageWeighting, knearestNeighbourCutoff);
        double currentLikelihood = tempgmm.runExpectationMaximization();
        if (currentLikelihood > bestLikelihood)
        {
            bestLikelihood = currentLikelihood;
            bestCoverageWeightingFactor = coverageWeighting;
        }
    }
    return bestCoverageWeightingFactor;
}
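// Why the factor is on the order of 1 / medianCoverage: model-space distances combine a MAF axis bounded by
// (0, 0.5) with a coverage axis spanning hundreds of units, so coverage differences must be scaled down
// before being combined with MAF differences. The sketch below shows one such weighted Euclidean distance;
// it is only an illustration of the idea, not the project's GetModelDistance implementation, which may
// weight the axes differently.
private static double WeightedModelDistanceSketch(double coverageA, double coverageB, double mafA, double mafB, double coverageWeightingFactor)
{
    double coverageDelta = (coverageA - coverageB) * coverageWeightingFactor; // coverage rescaled toward the MAF range
    double mafDelta = mafA - mafB;
    return Math.Sqrt(coverageDelta * coverageDelta + mafDelta * mafDelta);
}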
/// <summary>
/// Identify the tuple (DiploidCoverage, OverallPurity) which best models our overall
/// distribution of (MAF, Coverage) data across all segments. Consider various tuples (first with a coarse-grained
/// and then a fine-grained search), and for each one, measure the distortion - the average distance (weighted
/// by segment length) between actual and modeled (MAF, Coverage) coordinates.
/// </summary>
protected CoveragePurityModel ModelOverallCoverageAndPurity(long genomeLength)
{
    List<SegmentInfo> usableSegments;
    // Identify usable segments using our MinimumVariantFrequenciesForInformativeSegment cutoff,
    // then (if we don't find enough) try again with progressively more permissive cutoffs.
    while (true)
    {
        usableSegments = GetUsableSegmentsForModeling(this.Segments);
        int validMAFCount = usableSegments.Count(x => x.MAF >= 0);
        if (validMAFCount > Math.Min(20, this.Segments.Count)) break; // We have enough usable segments with non-null MAF
        if (MinimumVariantFrequenciesForInformativeSegment <= 5) break; // Give up on modeling
        MinimumVariantFrequenciesForInformativeSegment -= 15;
    }
    Console.WriteLine("Modeling overall coverage/purity across {0} segments", usableSegments.Count);
    if (usableSegments.Count < 10)
        throw new UncallableDataException("Cannot model coverage/purity with fewer than 10 segments.");

    // When computing distances between model and actual points, we want to provide roughly equal weight
    // to coverage (which covers a large range) and MAF, which falls in the range (0, 0.5).
    // If we already knew the diploid coverage, then we'd know just how to scale things (catch-22).
    // Let's assume that the median coverage is a sane thing to use for scaling:
    List<float> tempCoverageList = new List<float>();
    List<double> knearestNeighbourList = new List<double>();

    // Segment clustering using Gaussian Expectation-Maximization
    // Step 0: Prepare model parameters
    foreach (SegmentInfo info in usableSegments) tempCoverageList.Add(Convert.ToSingle(info.Coverage));
    Tuple<float, float, float> coverageQuartiles = CanvasCommon.Utilities.Quartiles(tempCoverageList);
    int minCoverageLevel = Convert.ToInt32(coverageQuartiles.Item1);
    int maxCoverageLevel = Convert.ToInt32(coverageQuartiles.Item3);
    int medianCoverageLevel = Convert.ToInt32(coverageQuartiles.Item2);
    this.CoverageWeightingFactor = this.CoverageWeighting / medianCoverageLevel;
    int bestNumClusters = 0;

    // Need a large number of segments for cluster analysis
    if (usableSegments.Count > 100)
    {
        List<float> tempMAFList = new List<float>();
        foreach (SegmentInfo info in usableSegments) tempMAFList.Add(Convert.ToSingle(info.MAF));
        Tuple<float, float, float> MAFQuartiles = CanvasCommon.Utilities.Quartiles(tempMAFList);
        double minMAF = Math.Max(Convert.ToDouble(MAFQuartiles.Item1) - 0.05, 0.01);
        double maxMAF = Math.Min(Convert.ToDouble(MAFQuartiles.Item3) + 0.05, 0.46);

        // Step 1: Find outliers
        double knearestNeighbourCutoff = KnearestNeighbourCutoff(usableSegments);

        // Step 2: Find the best CoverageWeightingFactor
        double bestCoverageWeightingFactor = BestCoverageWeightingFactor(usableSegments, maxCoverageLevel, medianCoverageLevel, knearestNeighbourCutoff);

        // Step 3: Find the optimal number of clusters
        List<ModelPoint> modelPoints = BestNumClusters(usableSegments, medianCoverageLevel, bestCoverageWeightingFactor, knearestNeighbourCutoff);
        bestNumClusters = modelPoints.Count;

        // Step 4: Find segment clusters using the final model
        GaussianMixtureModel gmm = new GaussianMixtureModel(modelPoints, usableSegments, medianCoverageLevel,
            bestCoverageWeightingFactor, knearestNeighbourCutoff);
        double likelihood = gmm.runExpectationMaximization();

        // Step 5: Write results
        string debugPathClusterModel = Path.Combine(this.TempFolder, "ClusteringModel.txt");
        if (!string.IsNullOrEmpty(debugPathClusterModel))
        {
            using (StreamWriter debugWriter = new StreamWriter(debugPathClusterModel))
            {
                debugWriter.WriteLine("#MAF\tCoverage\tClusterID");
                foreach (ModelPoint modelPoint in modelPoints)
                {
                    debugWriter.WriteLine("{0}\t{1}\t{2}", modelPoint.Ploidy.Mu[0], modelPoint.Ploidy.Mu[1], modelPoint.Cluster);
                }
                debugWriter.WriteLine();
                debugWriter.WriteLine("#MAF\tCoverage\tBestDistance\tClusterID");
                foreach (SegmentInfo info in usableSegments)
                {
                    debugWriter.Write("{0}\t{1}\t{2}", info.MAF, info.Coverage, info.Cluster);
                    debugWriter.WriteLine();
                }
            }
        }
    }

    // Note: Don't consider purity below 20% (at this point), because that creates a model that is very noise-sensitive.
    // We tried using a "tumor" sample that is actually just the real normal: We could overfit this data as very low
    // purity (5%) and make lots of (bogus) calls which fit the noise in coverage and MAF.
    double bestDeviation = double.MaxValue;
    List<CoveragePurityModel> allModels = new List<CoveragePurityModel>();

    // Set the best somatic model to pre-specified ploidy and purity values
    if (this.userPloidy != null && this.userPurity != null)
    {
        CoveragePurityModel bestModel = new CoveragePurityModel();
        bestModel.DiploidCoverage = medianCoverageLevel * Convert.ToDouble(this.userPloidy) / 2.0;
        bestModel.Purity = Convert.ToDouble(this.userPurity);
        this.ModelDeviation(bestModel, usableSegments, bestNumClusters);
        this.DiploidModelDistance(bestModel, usableSegments, genomeLength);
        return bestModel;
    }
    // Find the best somatic model
    else
    {
        // Coarse search: Consider various (coverage, purity) tuples.
        int minCoverage = (int)Math.Max(10, medianCoverageLevel / 2.5);
        int maxCoverage = (int)Math.Max(10, medianCoverageLevel * 2.5);
        int coverageStep = Math.Max(1, (maxCoverage - minCoverage) / 80);
        Console.WriteLine(">>>DiploidCoverage: Consider {0}...{1} step {2}", minCoverage, maxCoverage, coverageStep);
        for (int coverage = minCoverage; coverage < maxCoverage; coverage += coverageStep)
        {
            // Iterate over the purity range
            for (int percentPurity = 20; percentPurity <= 100; percentPurity += 5)
            {
                CoveragePurityModel model = new CoveragePurityModel();
                model.DiploidCoverage = coverage;
                model.Purity = percentPurity / 100f;
                this.ModelDeviation(model, usableSegments, bestNumClusters);
                this.DiploidModelDistance(model, usableSegments, genomeLength);
                if (model.Deviation < bestDeviation && model.Ploidy < this.MaxAllowedPloidy && model.Ploidy > this.MinAllowedPloidy)
                {
                    bestDeviation = model.Deviation;
                }
                // Exclude models with unrealistic genome ploidies
                if (model.Ploidy < this.MaxAllowedPloidy && model.Ploidy > this.MinAllowedPloidy)
                    allModels.Add(model);
            }
        }

        // New logic for model selection:
        // - First, compute the best model deviation. This establishes a baseline for how large the deviation is allowed
        //   to get in an acceptable model. Allow somewhat higher deviation for targeted data, since we see extra noise there.
        // - Review models. Discard any with unacceptable deviation. Note the best attainable % copy number 2 and % normal.
        // - For each model, scale PercentNormal to a range of 0..100 where 100 = the best number seen for any acceptable model.
        //   Similarly for PercentCN2.
        //   And similarly for DeviationScore: BestDeviation = 1, WorstAllowedDeviation = 0.
        // - Choose a model (with acceptable deviation) which maximizes a score of the form:
        //   PercentNormal + a * PercentCN2 + b * DeviationScore
        double worstAllowedDeviation = bestDeviation * this.DeviationFactor;
        double bestCN2 = 0;
        double bestCN2Normal = 0;
        double bestDiploidDistance = 0;

        // Derive max values for scaling
        int counter = 0;
        List<double> deviations = new List<double>();
        foreach (CoveragePurityModel model in allModels)
        {
            if (model.Deviation < worstAllowedDeviation) counter++;
            deviations.Add(model.Deviation);
        }
        deviations.Sort();
        if (counter < this.DeviationIndexCutoff)
        {
            worstAllowedDeviation = deviations[Math.Min(this.DeviationIndexCutoff, deviations.Count - 1)];
        }
        double bestAccuracyDeviation = double.MaxValue;
        double bestPrecisionDeviation = double.MaxValue;
        // Derive max values for scaling
        foreach (CoveragePurityModel model in allModels)
        {
            bestAccuracyDeviation = Math.Min(bestAccuracyDeviation, model.AccuracyDeviation);
            bestPrecisionDeviation = Math.Min(bestPrecisionDeviation, model.PrecisionDeviation);
            if (model.Deviation > worstAllowedDeviation) continue;
            if (model.PercentCN[2] > bestCN2) bestCN2 = model.PercentCN[2];
            if (model.PercentNormal > bestCN2Normal) bestCN2Normal = model.PercentNormal;
            if (model.DiploidDistance > bestDiploidDistance) bestDiploidDistance = model.DiploidDistance;
        }

        // Coarse search to find the best ploidy and purity model
        List<CoveragePurityModel> bestModels = new List<CoveragePurityModel>();
        CoveragePurityModel bestModel = null;
        double bestScore = 0;
        // Holds scores for all models
        List<double> scores = new List<double>();
        // Save all purity and ploidy models to a file
        string debugPath = Path.Combine(this.TempFolder, "PurityModel.txt");
        using (StreamWriter debugWriter = new StreamWriter(debugPath))
        {
            debugWriter.Write("#Purity\tDiploidCoverage\t");
            debugWriter.Write("Deviation\tAccuracyDeviation\tPrecisionDeviation\tWorstAllowedDeviation\tAccDev/best\tPrecDev/best\t");
            debugWriter.Write("DeviationScore\tScore\tPloidy\t");
            debugWriter.Write("Normal\tNormal/best\tCN2\tCN2/Best\t");
            debugWriter.Write("DiploidDistance\tDiploidDistance/Best");
            debugWriter.WriteLine();
            foreach (CoveragePurityModel model in allModels)
            {
                // Filter models with unacceptable deviation:
                if (model.Deviation > worstAllowedDeviation) continue;
                // Transform purity into a weighting factor to penalize abnormal ploidies at low purity:
                // (1.5 - 0.5) = min/max range of the new weighting scale; (1.0 - 0.2) = min/max range of the purity values.
                // This transformation leads to a maximal lowPurityWeightingFactor value of 1.5 for the lowest-purity model
                // and a minimal value of 0.75 for the highest-purity model (a small worked example follows this method).
                double lowPurityWeightingFactor = 1.5 / ((1.5 - 0.5) / (1.0 - 0.2) * (model.Purity - 0.2) + 1.0);
                double score = this.PercentNormal2WeightingFactor * model.PercentNormal / Math.Max(0.01, bestCN2Normal);
                score += lowPurityWeightingFactor * this.CN2WeightingFactor * model.PercentCN[2] / Math.Max(0.01, bestCN2);
                score += this.DeviationScoreWeightingFactor * (worstAllowedDeviation - model.Deviation) / (worstAllowedDeviation - bestDeviation);
                score += this.DiploidDistanceScoreWeightingFactor * model.DiploidDistance / Math.Max(0.01, bestDiploidDistance);
                scores.Add(score);
                bestModels.Add(model);
                // Write to file
                debugWriter.Write("{0}\t{1}\t", (int)Math.Round(100 * model.Purity), model.DiploidCoverage);
                debugWriter.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t", model.Deviation, model.AccuracyDeviation, model.PrecisionDeviation,
                    worstAllowedDeviation, model.AccuracyDeviation / bestAccuracyDeviation, model.PrecisionDeviation / bestPrecisionDeviation);
                debugWriter.Write("{0}\t{1}\t{2}\t", (worstAllowedDeviation - model.Deviation) / (worstAllowedDeviation - bestDeviation), score, model.Ploidy);
                debugWriter.Write("{0}\t{1}\t{2}\t{3}\t", model.PercentNormal, model.PercentNormal / Math.Max(0.01, bestCN2Normal),
                    model.PercentCN[2], model.PercentCN[2] / Math.Max(0.01, bestCN2));
                debugWriter.Write("{0}\t{1}\t", model.DiploidDistance, model.DiploidDistance / Math.Max(0.01, bestDiploidDistance));
                debugWriter.WriteLine();

                if (score > bestScore)
                {
                    bestModel = model;
                    bestScore = score;
                }
            }
        }

        // Sort the score list and keep the indices
        var sortedScores = scores.Select((x, i) => new KeyValuePair<double, int>(x, i)).OrderBy(x => x.Key).ToList();
        List<double> scoresValue = sortedScores.Select(x => x.Key).ToList();
        List<int> scoresIndex = sortedScores.Select(x => x.Value).ToList();

        // interModelDistance shows the genome edit distance between the best model and the other top models (defined by MaximumRelatedModels).
        // The premise is that if the top models provide widely different genome baselines (leading to a high interModelDistance),
        // the overall modeling approach might be more unstable.
        double interModelDistance = 0;
        // Start at one since model #0 is the highest-scoring model to compare to
        for (int i = 1; i < MaximumRelatedModels; i++)
        {
            interModelDistance += CalculateModelDistance(bestModels[scoresIndex[0]], bestModels[scoresIndex[i]], usableSegments, genomeLength);
        }
        interModelDistance /= (double)MaximumRelatedModels;

        Console.WriteLine(">>> Initial model: Deviation {0:F5}, coverage {1}, purity {2:F1}%, CN2 {3:F2}", bestModel.Deviation,
            bestModel.DiploidCoverage, 100 * bestModel.Purity, bestModel.PercentCN[2]);

        // Refine search: Smaller step sizes in the neighborhood of the initial model.
        minCoverage = (int)Math.Round(bestModel.DiploidCoverage) - 5;
        maxCoverage = (int)Math.Round(bestModel.DiploidCoverage) + 5;
        int minPurity = Math.Max(20, (int)Math.Round(bestModel.Purity * 100) - 10);
        int maxPurity = Math.Min(100, (int)Math.Round(bestModel.Purity * 100) + 10); // %%% magic numbers
        bestDeviation = double.MaxValue;
        bestModel = null;
        for (int coverage = minCoverage; coverage <= maxCoverage; coverage++)
        {
            for (int percentPurity = minPurity; percentPurity <= maxPurity; percentPurity++)
            {
                CoveragePurityModel model = new CoveragePurityModel();
                model.DiploidCoverage = coverage;
                model.Purity = percentPurity / 100f;
                this.ModelDeviation(model, usableSegments, bestNumClusters);
                if (bestModel == null || model.Deviation < bestModel.Deviation)
                {
                    bestModel = model;
                }
            }
        }

        // string debugPathClusterModel = Path.Combine(this.TempFolder, "ClusterModel.txt");
        string debugPathCNVModeling = Path.Combine(this.TempFolder, "CNVModeling.txt");
        ModelDeviation(bestModel, usableSegments, bestNumClusters, null, true, debugPathCNVModeling);
        Console.WriteLine();
        Console.WriteLine(">>> Refined model: Deviation {0:F5}, coverage {1}, purity {2:F1}%", bestModel.Deviation,
            bestModel.DiploidCoverage, bestModel.Purity * 100);
        Console.WriteLine();

        {
            foreach (SegmentPloidy ploidy in AllPloidies)
            {
                ploidy.Omega = 0;
                ploidy.Mu = null;
                ploidy.Sigma = null;
            }
        }

        if (!bestModel.InterModelDistance.HasValue)
        {
            bestModel.InterModelDistance = interModelDistance;
        }

        return bestModel;
    }
}
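// Worked example of the purity-to-weight transformation described in the scoring loop of
// ModelOverallCoverageAndPurity above: the linear map (1.5 - 0.5) / (1.0 - 0.2) * (purity - 0.2) + 1.0
// sends purity 0.2 to 1.0 and purity 1.0 to 2.0, so dividing 1.5 by it yields weights of 1.5 and 0.75 at
// the two extremes. Sketch only; the production code computes this inline rather than through a helper.
private static double LowPurityWeightingFactorSketch(double purity)
{
    double denominator = (1.5 - 0.5) / (1.0 - 0.2) * (purity - 0.2) + 1.0; // 1.0 at purity 0.2, 2.0 at purity 1.0
    return 1.5 / denominator; // 1.5 at purity 0.2, 0.75 at purity 1.0
}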
/// <summary>
/// Estimate the optimal number of clusters for the Expectation-Maximization model using the silhouette coefficient.
/// </summary>
public List<ModelPoint> BestNumClusters(List<SegmentInfo> usableSegments, double medianCoverageLevel, double bestCoverageWeightingFactor, double knearestNeighbourCutoff)
{
    double bestSilhouette = double.MinValue;
    List<ModelPoint> bestModelPoints = new List<ModelPoint>();
    int bestNumClusters = 0;
    int maxNumClusters = 8;

    // Find distanceThreshold and use it in InitializeModelPoints
    List<double> tempModelDistanceList = new List<double>();
    for (int i = 0; i < usableSegments.Count; i++)
    {
        for (int j = 0; j < usableSegments.Count; j++)
        {
            if (i != j && usableSegments[i].Cluster != -1 && usableSegments[j].Cluster != -1 && usableSegments[i].MAF >= 0 && usableSegments[j].MAF >= 0)
            {
                tempModelDistanceList.Add(GetModelDistance(usableSegments[i].Coverage, usableSegments[j].Coverage, usableSegments[i].MAF, usableSegments[j].MAF));
            }
        }
    }
    tempModelDistanceList.Sort();
    int distanceThresholdIndex = Math.Min(Convert.ToInt32(tempModelDistanceList.Count * 0.8), tempModelDistanceList.Count - 1);
    double distanceThreshold = tempModelDistanceList[distanceThresholdIndex]; // enable capturing clusters with less than 30% abundance

    // Find the optimal number of clusters using the silhouette distance and return bestModelPoints
    // (a generic sketch of the silhouette computation follows this method)
    for (int numClusters = 4; numClusters < maxNumClusters; numClusters++)
    {
        for (int i = 0; i < 10; i++)
        {
            List<ModelPoint> tempModelPoints = InitializeModelPoints(usableSegments, numClusters, distanceThreshold);
            GaussianMixtureModel tempgmm = new GaussianMixtureModel(tempModelPoints, usableSegments, medianCoverageLevel, bestCoverageWeightingFactor, knearestNeighbourCutoff);
            double currentLikelihood = tempgmm.runExpectationMaximization(); // return BIC rather than raw likelihood
            double currentSilhouette = ComputeSilhouette(usableSegments, numClusters);
            if (bestSilhouette < currentSilhouette)
            {
                bestSilhouette = currentSilhouette;
                bestNumClusters = numClusters;
                if (bestModelPoints.Count > 0)
                    bestModelPoints.Clear();
                foreach (ModelPoint tempModelPoint in tempModelPoints)
                    bestModelPoints.Add(tempModelPoint);
            }
        }
    }
    return bestModelPoints;
}
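// Generic sketch of the silhouette coefficient that the cluster-number search above relies on. The project's
// ComputeSilhouette works directly on SegmentInfo and GetModelDistance and may differ in details (distance
// metric, handling of outlier clusters); to avoid guessing at those details, this version takes plain arrays:
// points[i] is a (MAF, scaled coverage) coordinate and clusterOf[i] is its 0-based cluster index.
// Definition: s(i) = (b(i) - a(i)) / max(a(i), b(i)), averaged over all points, where a(i) is the mean distance
// from point i to its own cluster and b(i) is the smallest mean distance to any other cluster.
private static double SilhouetteSketch(double[][] points, int[] clusterOf, int numClusters)
{
    double total = 0;
    int counted = 0;
    for (int i = 0; i < points.Length; i++)
    {
        double[] sumDistance = new double[numClusters];
        int[] count = new int[numClusters];
        for (int j = 0; j < points.Length; j++)
        {
            if (i == j) continue;
            double dx = points[i][0] - points[j][0];
            double dy = points[i][1] - points[j][1];
            double distance = Math.Sqrt(dx * dx + dy * dy);
            sumDistance[clusterOf[j]] += distance;
            count[clusterOf[j]]++;
        }
        int own = clusterOf[i];
        if (count[own] == 0) continue; // singleton cluster: silhouette undefined for this point
        double a = sumDistance[own] / count[own];
        double b = double.MaxValue;
        for (int k = 0; k < numClusters; k++)
        {
            if (k == own || count[k] == 0) continue;
            b = Math.Min(b, sumDistance[k] / count[k]);
        }
        if (b == double.MaxValue || Math.Max(a, b) <= 0) continue; // only one populated cluster, or all points identical
        total += (b - a) / Math.Max(a, b);
        counted++;
    }
    return counted > 0 ? total / counted : 0;
}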