/// <summary>
/// Scores one segment against every model point, stores the normalized scores in
/// <c>segment.PosteriorProbs</c>, and records the cluster of the highest-scoring
/// model point in <c>segment.Cluster</c>.
/// </summary>
/// <param name="modelPoints">Candidate model points; each contributes its Ploidy (Omega, Mu, Sigma) and its Cluster.</param>
/// <param name="segment">Segment whose MAF and Coverage are scored. Its PosteriorProbs dictionary is created if null and then filled; its Cluster is overwritten.</param>
private void EMComputePosteriorProbs(List<ModelPoint> modelPoints, SegmentInfo segment)
{
    // Unnormalized score per model point: Omega times the value returned by Sigma(...)
    // for this segment's (MAF, Coverage) under the point's Mu / Sigma.
    var weightedScores = new Dictionary<ModelPoint, double>();
    double scoreTotal = 0;
    foreach (ModelPoint candidate in modelPoints)
    {
        double score = candidate.Ploidy.Omega * Sigma(segment.MAF, segment.Coverage, candidate.Ploidy.Mu, candidate.Ploidy.Sigma);
        weightedScores[candidate] = score;
        scoreTotal += score;
    }

    if (segment.PosteriorProbs == null)
    {
        segment.PosteriorProbs = new Dictionary<ModelPoint, double>();
    }

    // Cluster stays 0 unless some normalized score is strictly greater than 0.
    int winningCluster = 0;
    double winningProb = 0;
    foreach (ModelPoint candidate in modelPoints)
    {
        double posterior = weightedScores[candidate] / scoreTotal;
        segment.PosteriorProbs[candidate] = posterior;

        // A NaN posterior never satisfies this comparison, so it cannot become the winner.
        if (posterior > winningProb)
        {
            winningCluster = candidate.Cluster.Value;
            winningProb = posterior;
        }

        // Division can yield NaN (e.g. 0 / 0 when every score is zero); store 0 instead.
        if (double.IsNaN(posterior))
        {
            segment.PosteriorProbs[candidate] = 0;
        }
    }

    segment.Cluster = winningCluster;
}
/// <summary>
/// Runs the calling pipeline visible in this method: reads segments, loads variant
/// frequencies, builds per-segment summaries (median folded MAF, median coverage, weight),
/// assigns ploidy calls, merges neighboring segments, assigns quality scores, and writes
/// the coverage plot data and the final segment output.
/// </summary>
/// <param name="variantFrequencyFile">Path handed to <c>CanvasIO.LoadVariantFrequencies</c>.</param>
/// <param name="inFile">Path the segments are read from; its directory is stored in <c>TempFolder</c>.</param>
/// <param name="outFile">Path handed to <c>CanvasSegment.WriteSegments</c>; also used to derive the coverage / variant-frequency output path.</param>
/// <param name="ploidyBedPath">Optional; when non-empty, ploidy information is loaded from it.</param>
/// <param name="referenceFolder">Forwarded to the segment and coverage-plot writers.</param>
/// <param name="sampleName">Forwarded to the segment writer.</param>
/// <param name="truthDataPath">Optional; when non-empty, known copy numbers are loaded and a report against them is generated.</param>
/// <returns>0 on every path visible in this method.</returns>
public int CallVariants(string variantFrequencyFile, string inFile, string outFile, string ploidyBedPath, string referenceFolder, string sampleName, string truthDataPath)
{
    // Optional truth data: enables the report generated near the end of this method.
    if (!string.IsNullOrEmpty(truthDataPath))
    {
        this.CNOracle = new CopyNumberOracle();
        this.CNOracle.LoadKnownCN(truthDataPath);
    }

    this.Segments = CanvasSegment.ReadSegments(inFile);
    this.TempFolder = Path.GetDirectoryName(inFile);

    // Nothing to call: still write an output file (with no extra headers or ploidy) and stop.
    if (this.Segments.Count == 0)
    {
        Console.WriteLine("CanvasDiploidCaller: No segments loaded; no CNV calls will be made.");
        CanvasSegment.WriteSegments(outFile, this.Segments, referenceFolder, sampleName, null, null);
        return 0;
    }

    PloidyInfo ploidy = string.IsNullOrEmpty(ploidyBedPath)
        ? null
        : PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);

    // Load minor allele frequencies onto the segments.
    this.MeanCoverage = CanvasIO.LoadVariantFrequencies(variantFrequencyFile, this.Segments);
    int medianVariantCoverage = AggregateVariantCoverage(ref this.Segments);

    // Create new models for the different copy number states.
    this.InitializePloidies();

    // Compute statistics on the copy-number-two regions.
    float[] diploidCounts = AggregateCounts(ref this.Segments);
    DiploidCoverage = CanvasCommon.Utilities.Mean(diploidCounts);
    CoverageWeightingFactor = CoverageWeighting / DiploidCoverage;

    // New coverage model.
    this.Model = new CoverageModel();
    Model.DiploidCoverage = DiploidCoverage;

    // Summarize every segment. The weighting scheme depends only on how many segments
    // there are, so decide it once up front.
    bool weightByLength = this.Segments.Count > 100;
    List<SegmentInfo> segmentInfos = new List<SegmentInfo>();
    foreach (CanvasSegment canvasSegment in this.Segments)
    {
        SegmentInfo info = new SegmentInfo();
        info.Segment = canvasSegment;

        // Fold each frequency above 0.5 to (1 - frequency).
        List<double> foldedFrequencies = new List<double>();
        foreach (float frequency in canvasSegment.VariantFrequencies)
        {
            if (frequency > 0.5)
            {
                foldedFrequencies.Add(1 - frequency);
            }
            else
            {
                foldedFrequencies.Add(frequency);
            }
        }

        // -1 marks "no frequency available" for this segment.
        info.MAF = foldedFrequencies.Count > 0 ? CanvasCommon.Utilities.Median(foldedFrequencies) : -1;
        info.Coverage = CanvasCommon.Utilities.Median(canvasSegment.Counts);

        if (weightByLength)
        {
            info.Weight = canvasSegment.End - canvasSegment.Begin;
        }
        else
        {
            info.Weight = canvasSegment.BinCount;
        }

        segmentInfos.Add(info);
    }

    // Assign copy number and major chromosome count for each segment.
    // For now, this is set false, since we saw weird performance on chrY (CANV-115):
    bool useGaussianMixtureModel = false;
    if (!useGaussianMixtureModel)
    {
        AssignPloidyCallsDistance(Model, segmentInfos, medianVariantCoverage);
    }
    else
    {
        // Optimize model covariance; the returned value is not used here.
        FitGaussians(Model, segmentInfos);
        AssignPloidyCallsGaussianMixture();
    }

    // Merge neighboring segments that got the same copy number call.
    CanvasSegment.MergeSegments(ref this.Segments);
    CanvasSegment.AssignQualityScores(this.Segments, CanvasSegment.QScoreMethod.LogisticGermline);

    string coverageOutputPath = CanvasCommon.Utilities.GetCoverageAndVariantFrequencyOutputPath(outFile);
    CanvasSegment.WriteCoveragePlotData(this.Segments, Model.DiploidCoverage, ploidy, coverageOutputPath, referenceFolder);

    if (this.CNOracle != null)
    {
        this.GenerateReportVersusKnownCN();
    }

    // Carry the ploidy header line (when there is one) into the output.
    List<string> extraHeaders = new List<string>();
    if (ploidy != null && !string.IsNullOrEmpty(ploidy.HeaderLine))
    {
        extraHeaders.Add(ploidy.HeaderLine);
    }

    CanvasSegment.WriteSegments(outFile, this.Segments, referenceFolder, sampleName, extraHeaders, ploidy);
    return 0;
}
/// <summary>
/// Builds the list of segments to use for model fitting. A segment is skipped when it is
/// shorter than 5000 (End - Begin) or when its median count exceeds twice the mean of all
/// counts across all segments. Each kept segment is wrapped in a <c>SegmentInfo</c> carrying
/// its median coverage, a folded minor allele frequency (or -1 when there are too few
/// variants), and a weight.
/// </summary>
/// <param name="segments">All candidate segments; the list itself is not modified.</param>
/// <returns>A new list with one <c>SegmentInfo</c> per usable segment, in input order.</returns>
public static List<SegmentInfo> GetUsableSegmentsForModeling(List<CanvasSegment> segments)
{
    const int minimumSegmentLength = 5000;
    const int variantCountForFullWeight = 10;

    // Get the average count everywhere. Exclude segments whose coverage is >2x this average.
    // The running total is kept in a double: a float total stops resolving individual
    // counts once it grows past ~2^24, which skews the mean on large inputs.
    double countTotal = 0;
    int overallCount = 0;
    foreach (CanvasSegment segment in segments)
    {
        foreach (float value in segment.Counts)
        {
            overallCount++;
            countTotal += value;
        }
    }
    double overallMean = countTotal / Math.Max(1, overallCount);
    double coverageCeiling = overallMean * 2;

    // The weighting scheme depends only on the number of input segments.
    bool weightByLength = segments.Count > 100;

    List<SegmentInfo> usableSegments = new List<SegmentInfo>();
    foreach (CanvasSegment segment in segments)
    {
        if (segment.End - segment.Begin < minimumSegmentLength)
        {
            continue;
        }

        SegmentInfo info = new SegmentInfo();
        info.Segment = segment;

        // Apply the coverage filter first so rejected segments skip the MAF sort below.
        info.Coverage = CanvasCommon.Utilities.Median(segment.Counts);
        if (info.Coverage > coverageCeiling)
        {
            continue;
        }

        // If the segment has few or no variants, then don't use the MAF for this segment - set to -1 (no frequency).
        // Typically a segment will have no variants if it's on chrX or chrY and starling knows not to call a
        // heterozygous variant there (other than in the PAR regions).
        // The explicit zero check keeps the indexing below safe whatever the configured minimum is.
        int variantCount = segment.VariantFrequencies.Count;
        if (variantCount == 0 || variantCount < MinimumVariantFrequenciesForInformativeSegment)
        {
            info.MAF = -1;
        }
        else
        {
            // Fold each frequency above 0.5 to (1 - frequency), then take the element at
            // index Count / 2 of the sorted values.
            List<double> foldedFrequencies = new List<double>(variantCount);
            foreach (float value in segment.VariantFrequencies)
            {
                foldedFrequencies.Add(value > 0.5 ? 1 - value : value);
            }
            foldedFrequencies.Sort();
            info.MAF = foldedFrequencies[foldedFrequencies.Count / 2];
        }

        if (weightByLength)
        {
            info.Weight = segment.End - segment.Begin;
        }
        else
        {
            info.Weight = segment.BinCount;
        }

        // Scale the weight down linearly for segments with fewer than 10 variants;
        // a segment with no variants therefore ends up with weight 0.
        if (variantCount < variantCountForFullWeight)
        {
            info.Weight *= (double)variantCount / variantCountForFullWeight;
        }

        usableSegments.Add(info);
    }

    return usableSegments;
}