/// <summary>
/// Build one <see cref="ModelPoint"/> for every entry of <c>AllPloidies</c>.
/// Each point's coverage is the projected mean coverage for the ploidy's copy number
/// (as returned by <c>GetProjectedMeanCoverage</c>), and its MAF is the ploidy's
/// minor allele frequency, with NaN replaced by 0.
/// </summary>
/// <remarks>
/// Side effect: every ploidy's <c>MixedMinorAlleleFrequency</c> and <c>MixedCoverage</c>
/// are overwritten with the values stored on its model point.
/// </remarks>
/// <param name="model">Coverage model; only its <c>DiploidCoverage</c> is read here.</param>
/// <returns>The model points, in the iteration order of <c>AllPloidies</c>.</returns>
protected List<ModelPoint> InitializeModelPoints(CoverageModel model)
{
    List<ModelPoint> modelPoints = new List<ModelPoint>();

    // Projected coverage, indexed by copy number.
    double[] mu = GetProjectedMeanCoverage(model.DiploidCoverage);

    // Update the parameters in each SegmentPloidy object and construct the corresponding model points.
    foreach (SegmentPloidy ploidy in this.AllPloidies)
    {
        ModelPoint point = new ModelPoint();
        point.Coverage = mu[ploidy.CopyNumber];

        // An undefined (NaN) minor allele frequency is represented as 0 on the model point.
        double pureMAF = ploidy.MinorAlleleFrequency;
        point.MAF = double.IsNaN(pureMAF) ? 0 : pureMAF;

        point.Ploidy = ploidy;
        modelPoints.Add(point);
        point.CN = ploidy.CopyNumber;

        // Mirror the point's coordinates back onto the ploidy so distance-based callers can read them.
        ploidy.MixedMinorAlleleFrequency = point.MAF;
        ploidy.MixedCoverage = point.Coverage;
    }

    return modelPoints;
}
/// <summary>
/// Load segments and variant frequencies, assign a copy number call to every segment,
/// merge and score the calls, and write the results.
/// </summary>
/// <param name="variantFrequencyFile">Passed to <c>CanvasIO.LoadVariantFrequencies</c> to attach variant frequencies to the segments.</param>
/// <param name="inFile">Segment file read by <c>CanvasSegment.ReadSegments</c>; its directory becomes <c>TempFolder</c>.</param>
/// <param name="outFile">Destination passed to <c>CanvasSegment.WriteSegments</c>; also used to derive the coverage plot data path.</param>
/// <param name="ploidyBedPath">Optional ploidy BED file; skipped when null or empty.</param>
/// <param name="referenceFolder">Forwarded to the segment and coverage-plot writers.</param>
/// <param name="sampleName">Forwarded to <c>CanvasSegment.WriteSegments</c>.</param>
/// <param name="truthDataPath">Optional known-copy-number file; when supplied, a report against it is generated.</param>
/// <returns>Always 0.</returns>
public int CallVariants(string variantFrequencyFile, string inFile, string outFile, string ploidyBedPath, string referenceFolder, string sampleName, string truthDataPath)
{
    // Optional truth data for reporting.
    if (!string.IsNullOrEmpty(truthDataPath))
    {
        this.CNOracle = new CopyNumberOracle();
        this.CNOracle.LoadKnownCN(truthDataPath);
    }

    this.Segments = CanvasSegment.ReadSegments(inFile);
    this.TempFolder = Path.GetDirectoryName(inFile);

    // Nothing to call: still emit an (empty) output file.
    if (this.Segments.Count == 0)
    {
        Console.WriteLine("CanvasDiploidCaller: No segments loaded; no CNV calls will be made.");
        CanvasSegment.WriteSegments(outFile, this.Segments, referenceFolder, sampleName, null, null);
        return 0;
    }

    PloidyInfo referencePloidy = null;
    if (!string.IsNullOrEmpty(ploidyBedPath))
    {
        referencePloidy = PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);
    }

    // Load MAF.
    this.MeanCoverage = CanvasIO.LoadVariantFrequencies(variantFrequencyFile, this.Segments);
    int medianVariantCoverage = AggregateVariantCoverage(ref this.Segments);

    // Create new models for different copy number states.
    this.InitializePloidies();

    // Compute statistics on the copy number two regions.
    float[] countsInDiploidRegions = AggregateCounts(ref this.Segments);
    DiploidCoverage = CanvasCommon.Utilities.Mean(countsInDiploidRegions);
    CoverageWeightingFactor = CoverageWeighting / DiploidCoverage;

    // New coverage model.
    this.Model = new CoverageModel { DiploidCoverage = this.DiploidCoverage };

    // Summarize each segment as (median folded MAF, median coverage, weight).
    // The weight is the segment length when there are more than 100 segments, otherwise the bin count.
    bool weightByLength = this.Segments.Count > 100;
    List<SegmentInfo> segmentSummaries = new List<SegmentInfo>();
    foreach (CanvasSegment current in this.Segments)
    {
        SegmentInfo summary = new SegmentInfo();
        summary.Segment = current;

        // Fold variant frequencies above 0.5 onto [0, 0.5].
        List<double> foldedFrequencies = new List<double>();
        foreach (float frequency in current.VariantFrequencies)
        {
            if (frequency > 0.5)
            {
                foldedFrequencies.Add(1 - frequency);
            }
            else
            {
                foldedFrequencies.Add(frequency);
            }
        }

        // -1 marks "no variant frequencies available".
        if (foldedFrequencies.Count == 0)
        {
            summary.MAF = -1;
        }
        else
        {
            summary.MAF = CanvasCommon.Utilities.Median(foldedFrequencies);
        }

        summary.Coverage = CanvasCommon.Utilities.Median(current.Counts);

        if (weightByLength)
        {
            summary.Weight = current.End - current.Begin;
        }
        else
        {
            summary.Weight = current.BinCount;
        }

        segmentSummaries.Add(summary);
    }

    // Assign copy number and major chromosome count for each segment.
    // For now, this is set false, since we saw weird performance on chrY (CANV-115):
    bool useGaussianMixtureModel = false;
    if (!useGaussianMixtureModel)
    {
        AssignPloidyCallsDistance(Model, segmentSummaries, medianVariantCoverage);
    }
    else
    {
        // Optimize model covariance (the returned likelihood is not used).
        FitGaussians(Model, segmentSummaries);
        AssignPloidyCallsGaussianMixture();
    }

    // Merge neighboring segments that got the same copy number call.
    CanvasSegment.MergeSegments(ref this.Segments);
    CanvasSegment.AssignQualityScores(this.Segments, CanvasSegment.QScoreMethod.LogisticGermline);

    string coverageOutputPath = CanvasCommon.Utilities.GetCoverageAndVariantFrequencyOutputPath(outFile);
    CanvasSegment.WriteCoveragePlotData(this.Segments, Model.DiploidCoverage, referencePloidy, coverageOutputPath, referenceFolder);

    if (this.CNOracle != null)
    {
        this.GenerateReportVersusKnownCN();
    }

    // Carry the ploidy header line (if any) through to the output.
    List<string> extraHeaders = new List<string>();
    if (referencePloidy != null && !string.IsNullOrEmpty(referencePloidy.HeaderLine))
    {
        extraHeaders.Add(referencePloidy.HeaderLine);
    }

    CanvasSegment.WriteSegments(outFile, this.Segments, referenceFolder, sampleName, extraHeaders, referencePloidy);
    return 0;
}
/// <summary>
/// Fit a Gaussian mixture model to the segments, with one component per model point
/// produced by <c>InitializeModelPoints</c>, and optionally dump the fitted model.
/// The fitting procedure itself lives in <c>GaussianMixtureModel.Fit</c>; earlier notes
/// on this method describe it as EM run to convergence with the means fixed first to the
/// model and then to the empirical (MAF, Coverage) values - confirm against that class.
/// </summary>
/// <param name="model">Coverage model used to initialize the model points.</param>
/// <param name="segments">Per-segment (MAF, Coverage) summaries handed to the mixture model.</param>
/// <param name="debugPath">
/// When non-null, a tab-separated report is written to this path: one row of parameters per
/// model point, a blank line, then one row per segment with its posterior probabilities.
/// </param>
/// <returns>The value returned by <c>GaussianMixtureModel.Fit</c>.</returns>
private double FitGaussians(CoverageModel model, List<SegmentInfo> segments, string debugPath = null)
{
    List<ModelPoint> points = InitializeModelPoints(model);
    GaussianMixtureModel mixture = new GaussianMixtureModel(points, segments, this.MeanCoverage, this.CoverageWeightingFactor, 0);
    double likelihood = mixture.Fit();

    // No debug output requested: we are done.
    if (debugPath == null)
    {
        return likelihood;
    }

    // Write Gaussian mixture model to debugPath.
    using (StreamWriter output = new StreamWriter(debugPath))
    {
        // Section 1: parameters of each mixture component.
        output.WriteLine("CN\tMajor Chr #\tMAF\tCoverage\tOmega\tMu0\tMu1\tSigma00\tSigma01\tSigma10\tSigma11");
        foreach (ModelPoint point in points)
        {
            var ploidy = point.Ploidy;
            object[] columns =
            {
                ploidy.CopyNumber,
                ploidy.MajorChromosomeCount,
                ploidy.MixedMinorAlleleFrequency,
                ploidy.MixedCoverage,
                ploidy.Omega,
                ploidy.Mu[0],
                ploidy.Mu[1],
                ploidy.Sigma[0][0],
                ploidy.Sigma[0][1],
                ploidy.Sigma[1][0],
                ploidy.Sigma[1][1]
            };
            output.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}", columns);
        }

        // Section 2: per-segment observations followed by one posterior per model point.
        output.WriteLine("");
        output.WriteLine("MAF\tCoverage\tPosterior Probabilities");
        foreach (SegmentInfo info in segments)
        {
            StringBuilder row = new StringBuilder();
            row.AppendFormat("{0}\t{1}", info.MAF, info.Coverage);
            foreach (ModelPoint point in points)
            {
                row.AppendFormat("\t{0}", info.PosteriorProbs[point]);
            }
            output.WriteLine(row.ToString());
        }
    }

    return likelihood;
}
/// <summary>
/// Assign a copy number to every segment in <c>this.Segments</c> by choosing the ploidy in
/// <c>AllPloidies</c> whose (coverage, MAF) model point is nearest to the segment's median
/// coverage and median folded variant frequency.
/// </summary>
/// <remarks>
/// The MAF term joins the distance only when the segment has at least
/// max(10, expected SNP density cutoff) variants. The major chromosome count is assigned
/// under exactly the same condition: a coverage-only fit cannot distinguish between ploidies
/// that share a copy number, so in that case the count is left null.
/// </remarks>
/// <param name="model">Coverage model used to refresh the ploidy model points.</param>
/// <param name="segments">Not used by this method; kept for signature compatibility.</param>
/// <param name="medianVariantCoverage">Not used by this method; kept for signature compatibility.</param>
private void AssignPloidyCallsDistance(CoverageModel model, List<SegmentInfo> segments, int medianVariantCoverage)
{
    // Minimum number of variants before allele frequencies are trusted at all.
    const int MinimumVariantCount = 10;

    // Called for its side effect: it refreshes MixedCoverage / MixedMinorAlleleFrequency on every ploidy.
    InitializeModelPoints(model);

    foreach (CanvasSegment segment in this.Segments)
    {
        // Compute (MAF, Coverage) for this segment; frequencies above 0.5 are folded onto [0, 0.5].
        List<double> MAF = new List<double>();
        foreach (float VF in segment.VariantFrequencies)
        {
            MAF.Add(VF > 0.5 ? 1 - VF : VF);
        }

        // Half the number of het SNPs expected for a segment of this length.
        int expectedSnpDensityCutoff = (segment.End - segment.Begin) / MedianHetSnpsDistance / 2;
        bool useMAF = MAF.Count >= Math.Max(MinimumVariantCount, expectedSnpDensityCutoff);

        double medianCoverage = CanvasCommon.Utilities.Median(segment.Counts);
        double medianMAF = -1;
        if (useMAF)
        {
            medianMAF = Utilities.Median(MAF);
        }

        // Track the nearest and second-nearest model points.
        // NOTE(review): bestPloidy stays null (and the assignments below throw) if AllPloidies is
        // empty or every distance is NaN.
        SegmentPloidy bestPloidy = null;
        double bestDistance = double.MaxValue;
        double secondBestDistance = double.MaxValue;
        foreach (SegmentPloidy ploidy in AllPloidies)
        {
            double diff = (ploidy.MixedCoverage - medianCoverage) * CoverageWeightingFactor;
            double distance = diff * diff;
            if (useMAF)
            {
                diff = ploidy.MixedMinorAlleleFrequency - medianMAF;
                distance += diff * diff;
            }

            if (distance < bestDistance)
            {
                secondBestDistance = bestDistance;
                bestDistance = distance;
                bestPloidy = ploidy;
            }
            else if (distance < secondBestDistance)
            {
                secondBestDistance = distance;
            }
        }

        segment.CopyNumber = bestPloidy.CopyNumber;
        segment.ModelDistance = bestDistance;
        segment.RunnerUpModelDistance = secondBestDistance;

        // Don't assign MCC unless variant allele frequencies actually contributed to the fit.
        if (useMAF)
        {
            segment.MajorChromosomeCount = bestPloidy.MajorChromosomeCount;
        }
        else
        {
            segment.MajorChromosomeCount = null;
        }
    }
}
/// <summary>
/// Assign a copy number (and, when enough variants are available, a major chromosome count)
/// to every segment in <c>this.Segments</c> by choosing the ploidy in <c>AllPloidies</c> whose
/// (coverage, MAF) model point is nearest to the segment's median coverage and
/// coverage-weighted median folded variant frequency.
/// </summary>
/// <remarks>
/// Variants whose total coverage is below <paramref name="medianVariantCoverage"/> are
/// down-weighted by the ratio coverage / median; all others get weight 1. The ratio is
/// computed in floating point: dividing before the conversion to float would truncate to 0
/// for every down-weighted variant if the coverage values are integral.
/// </remarks>
/// <param name="model">Coverage model used to refresh the ploidy model points.</param>
/// <param name="segments">Not used by this method; kept for signature compatibility.</param>
/// <param name="medianVariantCoverage">Median total coverage over variants; the reference for down-weighting.</param>
private void AssignPloidyCallsDistance(CoverageModel model, List<SegmentInfo> segments, int medianVariantCoverage)
{
    // Minimum number of variants before allele frequencies are used.
    const int MinimumVariantCount = 10;

    // Called for its side effect: it refreshes MixedCoverage / MixedMinorAlleleFrequency on every ploidy.
    InitializeModelPoints(model);

    foreach (CanvasSegment segment in this.Segments)
    {
        // Compute (MAF, Coverage) for this segment; frequencies above 0.5 are folded onto [0, 0.5].
        List<double> MAF = new List<double>();
        foreach (float VF in segment.VariantFrequencies)
        {
            MAF.Add(VF > 0.5 ? 1 - VF : VF);
        }

        bool useMAF = MAF.Count >= MinimumVariantCount;
        double medianCoverage = CanvasCommon.Utilities.Median(segment.Counts);
        double medianMAF = -1;

        if (useMAF)
        {
            List<Tuple<float, float>> weightedVariantFrequencies = new List<Tuple<float, float>>(MAF.Count);
            for (int i = 0; i < MAF.Count; i++)
            {
                // For now penalize only low-coverage regions.
                float variantWeight = 1;
                if (segment.VariantTotalCoverage[i] < medianVariantCoverage)
                {
                    // Floating-point division so the weight is a true fraction in [0, 1).
                    variantWeight = Convert.ToSingle((double)segment.VariantTotalCoverage[i] / medianVariantCoverage);
                }
                weightedVariantFrequencies.Add(Tuple.Create(Convert.ToSingle(MAF[i]), variantWeight));
            }
            medianMAF = Utilities.WeightedMedian(weightedVariantFrequencies);
        }

        // Track the nearest and second-nearest model points.
        // NOTE(review): bestPloidy stays null (and the assignments below throw) if AllPloidies is
        // empty or every distance is NaN.
        SegmentPloidy bestPloidy = null;
        double bestDistance = double.MaxValue;
        double secondBestDistance = double.MaxValue;
        foreach (SegmentPloidy ploidy in AllPloidies)
        {
            double diff = (ploidy.MixedCoverage - medianCoverage) * CoverageWeightingFactor;
            double distance = diff * diff;
            if (useMAF)
            {
                diff = ploidy.MixedMinorAlleleFrequency - medianMAF;
                distance += diff * diff;
            }

            if (distance < bestDistance)
            {
                secondBestDistance = bestDistance;
                bestDistance = distance;
                bestPloidy = ploidy;
            }
            else if (distance < secondBestDistance)
            {
                secondBestDistance = distance;
            }
        }

        segment.CopyNumber = bestPloidy.CopyNumber;
        segment.ModelDistance = bestDistance;
        segment.RunnerUpModelDistance = secondBestDistance;

        // Don't assign MCC if we don't have variant allele frequencies.
        if (useMAF)
        {
            segment.MajorChromosomeCount = bestPloidy.MajorChromosomeCount;
        }
        else
        {
            segment.MajorChromosomeCount = null;
        }
    }
}