Example 1
 /// <summary>
 /// Estimate genome distance between two purity models (weighted absolute difference between copy number profiles)
 /// </summary>
 protected double CalculateModelDistance(CoveragePurityModel model1, CoveragePurityModel model2, List<SegmentInfo> usableSegments, long genomeLength)
 {
     double genomeDistance = 0;
     // every model should have the same number of segments
     if (model1.CNs.Count != model2.CNs.Count)
     {
         Console.WriteLine("Models do not have the same number of usable CN segments");
         return 1;
     }
     for (int i = 0; i < model1.CNs.Count; i++)
     {
         genomeDistance += Math.Abs(model1.CNs[i] - model2.CNs[i]) * (usableSegments[i].Segment.End - usableSegments[i].Segment.Begin) / (double)genomeLength;
     }
     return genomeDistance;
 }
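To make the weighting concrete, here is a minimal standalone sketch of the same length-weighted profile distance, with toy copy-number profiles and invented segment lengths (none of these names come from the Canvas source):

using System;
using System.Linq;

static class ModelDistanceSketch
{
    // Same weighting as CalculateModelDistance: per-segment |CN1 - CN2|,
    // scaled by segment length as a fraction of the genome
    static double Distance(int[] cns1, int[] cns2, long[] lengths, long genomeLength)
    {
        double distance = 0;
        for (int i = 0; i < cns1.Length; i++)
            distance += Math.Abs(cns1[i] - cns2[i]) * lengths[i] / (double)genomeLength;
        return distance;
    }

    static void Main()
    {
        int[] model1 = { 2, 3, 2 };
        int[] model2 = { 2, 4, 1 };
        long[] lengths = { 50_000_000, 30_000_000, 20_000_000 };
        // |2-2|*0.5 + |3-4|*0.3 + |2-1|*0.2 = 0.5
        Console.WriteLine(Distance(model1, model2, lengths, lengths.Sum()));
    }
}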
Example 2
        /// <summary>
        /// Fit a Gaussian mixture model.
        /// Fix the means to the model MAF and Coverage and run the EM algorithm until convergence.
        /// Compute the empirical MAF and Coverage.
        /// Fix the means to the empirical MAF and Coverage and run the EM algorithm again until convergence.
        /// Always estimate the full covariance matrix?
        /// </summary>
        /// <param name="model"></param>
        /// <param name="segments"></param>
        /// <param name="debugPath"></param>
        /// <returns></returns>
        protected double FitGaussians(CoveragePurityModel model, List<SegmentInfo> segments, string debugPath = null, double knearestNeighbourCutoff = Int32.MaxValue)
        {
            List<ModelPoint> modelPoints = InitializeModelPoints(model);

            GaussianMixtureModel gmm = new GaussianMixtureModel(modelPoints, segments, this.MeanCoverage, this.CoverageWeightingFactor, knearestNeighbourCutoff);
            double likelihood = gmm.Fit();

            if (debugPath != null)
            {
                // write Gaussian mixture model to debugPath
                using (StreamWriter writer = new StreamWriter(debugPath))
                {
                    writer.WriteLine("CN\tMajor Chr #\tMAF\tCoverage\tOmega\tMu0\tMu1\tSigma00\tSigma01\tSigma10\tSigma11");
                    foreach (ModelPoint modelPoint in modelPoints)
                    {
                        writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}",
                            modelPoint.Ploidy.CopyNumber, modelPoint.Ploidy.MajorChromosomeCount,
                            modelPoint.Ploidy.MixedMinorAlleleFrequency, modelPoint.Ploidy.MixedCoverage,
                            modelPoint.Ploidy.Omega, modelPoint.Ploidy.Mu[0], modelPoint.Ploidy.Mu[1],
                            modelPoint.Ploidy.Sigma[0][0], modelPoint.Ploidy.Sigma[0][1],
                            modelPoint.Ploidy.Sigma[1][0], modelPoint.Ploidy.Sigma[1][1]);
                    }

                    writer.WriteLine("");
                    writer.WriteLine("MAF\tCoverage\tPosterior Probabilities");
                    StringBuilder sb = new StringBuilder();
                    foreach (SegmentInfo segment in segments)
                    {
                        sb.Clear();
                        sb.AppendFormat("{0}\t{1}", segment.MAF, segment.Coverage);
                        foreach (ModelPoint modelPoint in modelPoints)
                        {
                            sb.AppendFormat("\t{0}", segment.PosteriorProbs[modelPoint]);
                        }
                        writer.WriteLine(sb.ToString());
                    }
                }
            }

            return likelihood;
        }
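As background for the fitting loop described in the summary, here is a self-contained sketch of EM for a one-dimensional, two-component Gaussian mixture with the component means held fixed, which is the first pass described above. The data and component count are illustrative; this is not the Canvas GaussianMixtureModel implementation.

using System;
using System.Linq;

static class FixedMeanEmSketch
{
    static double Gaussian(double x, double mu, double variance) =>
        Math.Exp(-(x - mu) * (x - mu) / (2 * variance)) / Math.Sqrt(2 * Math.PI * variance);

    static void Main()
    {
        double[] data = { 0.10, 0.20, 0.15, 0.90, 1.10, 1.00, 0.95 };
        double[] mu = { 0.15, 1.00 };       // means held fixed, as in the first pass above
        double[] variance = { 0.05, 0.05 }; // variances to estimate
        double[] w = { 0.5, 0.5 };          // mixture weights to estimate
        double prevLogL = double.MinValue;

        for (int iter = 0; iter < 200; iter++)
        {
            // E-step: posterior responsibility of each component for each point
            double[][] resp = data.Select(x =>
            {
                double p0 = w[0] * Gaussian(x, mu[0], variance[0]);
                double p1 = w[1] * Gaussian(x, mu[1], variance[1]);
                return new[] { p0 / (p0 + p1), p1 / (p0 + p1) };
            }).ToArray();

            // M-step: update weights and variances; the means stay fixed
            for (int k = 0; k < 2; k++)
            {
                double nk = resp.Sum(r => r[k]);
                w[k] = nk / data.Length;
                variance[k] = Math.Max(1e-6,
                    data.Select((x, i) => resp[i][k] * (x - mu[k]) * (x - mu[k])).Sum() / nk);
            }

            // iterate until the log likelihood converges
            double logL = data.Sum(x => Math.Log(
                w[0] * Gaussian(x, mu[0], variance[0]) + w[1] * Gaussian(x, mu[1], variance[1])));
            if (Math.Abs(logL - prevLogL) < 1e-8) break;
            prevLogL = logL;
        }
        Console.WriteLine($"weights {w[0]:F3}/{w[1]:F3}, variances {variance[0]:F4}/{variance[1]:F4}");
    }
}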
Example 3
 /// <summary>
 /// Given the genome-wide copy number (CN) profile of the model, estimate the total number of rearrangements 
 /// needed to transform a diploid genome into the tumor genome under the given purity model. 
 /// The following logic is used:
 ///     1) Assign a rearrangement score of one to a single CN state transition, i.e. transition 2 -> 3 gets a score 
 ///     of one while transition 2 -> 4 gets a score of two. 
 ///     2) A cumulative PercentCN of 80% or more in copy number bins > 2 indicates possible genome doubling. 
 ///     Assign a score of 1 for the genome doubling event, use a copy number baseline of 4 instead of 2, and count events as in step 1.
 /// </summary>
 protected double DiploidModelDistance(CoveragePurityModel model, List<SegmentInfo> usableSegments, long genomeLength)
 {
     double totalCNevents = 0;
     int modelBaseline = 2;
     double amplificationPercentCN = 0;
     for (int copyNumber = 3; copyNumber < MaximumCopyNumber; copyNumber++)
         amplificationPercentCN += model.PercentCN[copyNumber];
     if (amplificationPercentCN > 0.8)
     {
         modelBaseline = 4;
         totalCNevents += 1;
     }
     for (int i = 0; i < model.CNs.Count; i++)
     {
         totalCNevents += Math.Abs(model.CNs[i] - modelBaseline) * (usableSegments[i].Segment.End - usableSegments[i].Segment.Begin) / (double)genomeLength;
     }
     model.DiploidDistance = (double)1.0 / Math.Max(0.001, totalCNevents);
     return totalCNevents;
 }
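A toy walk-through of the scoring rules above, with invented PercentCN fractions (0.90 of the genome sits above copy number 2, so the genome-doubling branch fires):

using System;

static class DiploidDistanceDemo
{
    static void Main()
    {
        // Fractions of the genome at copy numbers 0..5 (illustrative only)
        double[] percentCN = { 0.00, 0.05, 0.05, 0.10, 0.70, 0.10 };
        double amplified = 0;
        for (int cn = 3; cn < percentCN.Length; cn++) amplified += percentCN[cn];

        int baseline = 2;
        double events = 0;
        if (amplified > 0.8)   // 0.90 here: likely genome doubling
        {
            baseline = 4;      // count CN transitions against a tetraploid baseline
            events += 1;       // the doubling itself costs one event
        }
        // A 10 Mb segment at CN 3 on a 100 Mb genome then contributes
        // |3 - 4| * 10/100 = 0.1 events
        events += Math.Abs(3 - baseline) * 10_000_000 / 100_000_000.0;
        Console.WriteLine($"baseline={baseline}, events={events:F2}");
    }
}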
Example 4
        // Initialize model points given expected ploidy and purity values 
        protected List<ModelPoint> InitializeModelPoints(CoveragePurityModel model)
        {
            List<ModelPoint> modelPoints = new List<ModelPoint>();

            double[] mu = GetProjectedMeanCoverage(model.DiploidCoverage);
            double diploidMAF = this.AllPloidies[3].MinorAlleleFrequency; /// %%% Magic number!


            /////////////////////////////////////////////
            // Update the parameters in each SegmentPloidy object, and construct the corresponding ModelPoint objects
            foreach (SegmentPloidy ploidy in this.AllPloidies)
            {
                ModelPoint point = new ModelPoint();
                double pureCoverage = mu[ploidy.CopyNumber];
                point.Coverage = (model.Purity * pureCoverage) + (1 - model.Purity) * model.DiploidCoverage;
                double pureMAF = ploidy.MinorAlleleFrequency;
                if (ploidy.MajorChromosomeCount * 2 == ploidy.CopyNumber)
                {
                    point.MAF = (model.Purity * ploidy.CopyNumber * pureMAF) + ((1 - model.Purity) * 2 * diploidMAF);
                    point.MAF /= model.Purity * ploidy.CopyNumber + (1 - model.Purity) * 2;
                    if (double.IsNaN(point.MAF)) point.MAF = 0;
                }
                else
                {
                    point.MAF = (model.Purity * ploidy.CopyNumber * pureMAF) + ((1 - model.Purity) * 1);
                    point.MAF /= model.Purity * ploidy.CopyNumber + (1 - model.Purity) * 2;
                }
                point.Ploidy = ploidy;
                modelPoints.Add(point);
                point.CN = ploidy.CopyNumber;
                ploidy.MixedMinorAlleleFrequency = point.MAF;
                ploidy.MixedCoverage = point.Coverage;
            }

            return modelPoints;
        }
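The mixing arithmetic above can be checked by hand. The sketch below assumes a linear coverage projection (copy number / 2 times the diploid coverage) for GetProjectedMeanCoverage, which is an assumption for illustration, not code from the Canvas source; purity 0.6, diploid coverage 100, and the unbalanced CN 3 / major chromosome count 2 state give a mixed coverage of 130 and a mixed MAF of about 0.385:

using System;

static class MixedPointDemo
{
    static void Main()
    {
        double purity = 0.6, diploidCoverage = 100;
        int copyNumber = 3;
        // Assumed linear projection; stands in for GetProjectedMeanCoverage
        double pureCoverage = diploidCoverage * copyNumber / 2.0;   // 150
        double pureMAF = 1.0 / copyNumber;                          // minor allele: 1 of 3

        double coverage = purity * pureCoverage + (1 - purity) * diploidCoverage;
        // Unbalanced branch above: normal cells contribute one minor allele of two
        double maf = (purity * copyNumber * pureMAF + (1 - purity) * 1)
                   / (purity * copyNumber + (1 - purity) * 2);

        Console.WriteLine($"coverage={coverage:F1}, MAF={maf:F3}");  // 130.0, 0.385
    }
}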
Example 5
        /// <summary>
        ///  Initialize model points given a somatic purity model, then select the best-fitting points as cluster seeds.
        /// </summary>
        protected List<ModelPoint> InitializeModelPoints(List<SegmentInfo> segments, double coverage, int percentPurity, int numClusters)
        {
            List<ModelPoint> modelPoints = new List<ModelPoint>();
            CoveragePurityModel model = new CoveragePurityModel();
            model.DiploidCoverage = coverage;
            model.Purity = percentPurity / 100f;

            double[] mu = GetProjectedMeanCoverage(model.DiploidCoverage);
            double diploidMAF = this.AllPloidies[3].MinorAlleleFrequency; /// %%% Magic number!


            /////////////////////////////////////////////
            // Update the parameters in each SegmentPloidy object, and construct the corresponding ModelPoint objects
            foreach (SegmentPloidy ploidy in this.AllPloidies)
            {
                ModelPoint point = new ModelPoint();
                double pureCoverage = mu[ploidy.CopyNumber];
                point.Coverage = (model.Purity * pureCoverage) + (1 - model.Purity) * model.DiploidCoverage;
                double pureMAF = ploidy.MinorAlleleFrequency;
                if (ploidy.MajorChromosomeCount * 2 == ploidy.CopyNumber)
                {
                    point.MAF = (model.Purity * ploidy.CopyNumber * pureMAF) + ((1 - model.Purity) * 2 * diploidMAF);
                    point.MAF /= model.Purity * ploidy.CopyNumber + (1 - model.Purity) * 2;
                    if (double.IsNaN(point.MAF)) point.MAF = 0;
                }
                else
                {
                    point.MAF = (model.Purity * ploidy.CopyNumber * pureMAF) + ((1 - model.Purity) * 1);
                    point.MAF /= model.Purity * ploidy.CopyNumber + (1 - model.Purity) * 2;
                }
                point.Ploidy = ploidy;
                modelPoints.Add(point);
                point.CN = ploidy.CopyNumber;
                ploidy.MixedMinorAlleleFrequency = point.MAF;
                ploidy.MixedCoverage = point.Coverage;
            }

            // estimate distance between each model point and segments 
            List<double> modelPointsScore = new List<double>();
            foreach (ModelPoint modelPoint in modelPoints)
            {
                List<double> distanceList = new List<double>();
                foreach (SegmentInfo info in segments)
                {
                    if (info.MAF >= 0)
                        distanceList.Add(GetModelDistance(info.Coverage, modelPoint.Coverage, info.MAF, modelPoint.MAF));
                }
                distanceList.Sort();
                double v15th_percentile = distanceList[Convert.ToInt32(distanceList.Count * 0.15)];
                // use model points with good fit to observed values
                modelPointsScore.Add(v15th_percentile);
            }
            // sort list and return indices
            var sortedScores = modelPointsScore.Select((x, i) => new KeyValuePair<double, int>(x, i)).OrderBy(x => x.Key).ToList();
            List<double> scoresValue = sortedScores.Select(x => x.Key).ToList();
            List<int> scoresIndex = sortedScores.Select(x => x.Value).ToList();

            List<ModelPoint> selectedModelPoints = new List<ModelPoint>();

            for (int i = 0; i < numClusters; i++)
            {
                modelPoints[scoresIndex[i]].Cluster = i + 1;
                selectedModelPoints.Add(modelPoints[scoresIndex[i]]);
            }

            return selectedModelPoints;
        }
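The seed-selection step above (score each candidate model point by the 15th percentile of its distances to the segments, then keep the numClusters best) can be isolated into a small sketch; the distance values below are invented:

using System;
using System.Linq;

static class SeedSelectionDemo
{
    static void Main()
    {
        // Per-candidate distances to the observed segments (illustrative)
        double[][] distances =
        {
            new[] { 0.9, 0.2, 0.4, 0.8 },   // candidate 0
            new[] { 0.1, 0.3, 0.2, 0.7 },   // candidate 1
            new[] { 0.5, 0.6, 0.4, 0.9 },   // candidate 2
        };
        int numClusters = 2;

        // Score = 15th percentile of sorted distances; keep the lowest scores
        var selected = distances.Select((d, i) =>
        {
            var sorted = d.OrderBy(x => x).ToArray();
            return (Score: sorted[(int)(sorted.Length * 0.15)], Index: i);
        })
        .OrderBy(t => t.Score)
        .Take(numClusters);

        foreach (var (score, index) in selected)
            Console.WriteLine($"candidate {index}: 15th-percentile distance {score:F2}");
    }
}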
Example 6
        /// <summary>
        /// Assign copy number calls to segments, and produce extra headers for the CNV vcf file giving the
        /// overall estimated purity and ploidy.
        /// </summary>
        protected List<string> CallCNVUsingSNVFrequency(double? localSDmetric, string referenceFolder)
        {
            List<string> Headers = new List<string>();
            if (this.CNOracle != null)
            {
                this.DerivePurityEstimateFromVF();
            }

            // Get genome length.
            GenomeMetadata genomeMetaData = new GenomeMetadata();
            genomeMetaData.Deserialize(Path.Combine(referenceFolder, "GenomeSize.xml"));

            // Derive a model of diploid coverage, and overall tumor purity:
            this.Model = ModelOverallCoverageAndPurity(genomeMetaData.Length);

            // Make preliminary ploidy calls for all segments.  For those segments which fit their ploidy reasonably well,
            // accumulate information about the MAF by site and coverage by bin.  
            this.HeterogeneousSegmentsSignature.Sort();

            if (AllPloidies.First().Sigma == null)
            {
                AssignPloidyCalls();
            }
            else
            {
                AssignPloidyCallsGaussianMixture();
            }

            // If the somatic SNV/indel file was provided, then we use it to derive another estimate of purity.
            // And, if we didn't make many CNV calls, then we report this estimate, instead of the estimate derived from
            // our overall model.
            if (!string.IsNullOrEmpty(SomaticVCFPath))
            {
                try
                {
                    double SNVPurityEstimate = EstimatePurityFromSomaticSNVs();
                    this.SelectPurityEstimate(SNVPurityEstimate, genomeMetaData.Length);
                }
                catch (Exception e)
                {
                    Console.Error.WriteLine("* Error deriving purity estimate from somatic SNVs.  Details:\n{0}", e.ToString());
                }
            }

            // Add some extra information to the vcf file header:
            Headers.Add(string.Format("##EstimatedTumorPurity={0:F2}", this.Model.Purity));
            double totalPloidy = 0;
            double totalWeight = 0;
            foreach (CanvasSegment segment in this.Segments)
            {
                totalWeight += segment.End - segment.Begin;
                totalPloidy += segment.CopyNumber * (segment.End - segment.Begin);
            }
            Headers.Add(string.Format("##OverallPloidy={0:F2}", totalPloidy / Math.Max(1, totalWeight)));
            Headers.Add(string.Format("##PurityModelFit={0:F4}", this.Model.Deviation));
            Headers.Add(string.Format("##InterModelDistance={0:F4}", this.Model.InterModelDistance));
            Headers.Add(string.Format("##EstimatedChromosomeCount={0:F2}", this.EstimateChromosomeCount()));
            Headers.Add(string.Format("##LocalSDmetric={0:F2}", localSDmertic));
            Headers.Add(string.Format("##Heterogeneity={0:F2}", this.Model.HeterogeneityIndex));
            return Headers;
        }
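For reference, the ##OverallPloidy header computed above is just a length-weighted mean copy number; a quick standalone check with two invented segments:

using System;

static class OverallPloidyDemo
{
    static void Main()
    {
        (long Begin, long End, int CopyNumber)[] segments =
        {
            (0, 60_000_000, 2),
            (60_000_000, 100_000_000, 3),
        };
        double totalPloidy = 0, totalWeight = 0;
        foreach (var seg in segments)
        {
            totalWeight += seg.End - seg.Begin;
            totalPloidy += seg.CopyNumber * (seg.End - seg.Begin);
        }
        // (2 * 60 Mb + 3 * 40 Mb) / 100 Mb = 2.40
        Console.WriteLine($"##OverallPloidy={totalPloidy / Math.Max(1, totalWeight):F2}");
    }
}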
Example 7
        /// <summary>
        /// Identify the tuple (DiploidCoverage, OverallPurity) which best models our overall
        /// distribution of (MAF, Coverage) data across all segments.  Consider various tuples (first with a coarse-grained
        /// and then a fine-grained search), and for each one, measure the distortion - the average distance (weighted 
        /// by segment length) between actual and modeled (MAF, Coverage) coordinates.
        /// </summary>
        protected CoveragePurityModel ModelOverallCoverageAndPurity(long genomeLength)
        {
            List<SegmentInfo> usableSegments;
            // Identify usable segments using our MinimumVariantFrequenciesForInformativeSegment cutoff, 
            // then (if we don't find enough) we can try again with progressively more permissive cutoffs.
            while (true)
            {
                usableSegments = GetUsableSegmentsForModeling(this.Segments);
                int validMAFCount = usableSegments.Count(x => x.MAF >= 0);
                if (validMAFCount > Math.Min(20, this.Segments.Count)) break; // We have enough usable segments with non-null MAF
                if (MinimumVariantFrequenciesForInformativeSegment <= 5) break; // Give up on modeling
                MinimumVariantFrequenciesForInformativeSegment -= 15;
            }
            Console.WriteLine("Modeling overall coverage/purity across {0} segments", usableSegments.Count);
            if (usableSegments.Count < 10)
                throw new UncallableDataException("Cannot model coverage/purity with fewer than 10 segments.");

            // When computing distances between model and actual points, we want to provide roughly equal weight
            // to coverage (which covers a large range) and MAF, which falls in the range (0, 0.5).  
            // If we already knew the diploid coverage, then we'd know just how to scale things (catch-22).
            // Let's assume that the median coverage is a sane thing to use for scaling:
            List<float> tempCoverageList = new List<float>();
            List<double> knearestNeighbourList = new List<double>();
                        
            // Segments clustering using Gaussian Expectation Maximisation

            // Step0: Prepare model parameters
            foreach (SegmentInfo info in usableSegments) tempCoverageList.Add(Convert.ToSingle(info.Coverage));
            Tuple<float, float, float> coverageQuartiles = CanvasCommon.Utilities.Quartiles(tempCoverageList);
            int minCoverageLevel = Convert.ToInt32(coverageQuartiles.Item1);
            int maxCoverageLevel = Convert.ToInt32(coverageQuartiles.Item3);
            int medianCoverageLevel = Convert.ToInt32(coverageQuartiles.Item2);
            this.CoverageWeightingFactor = this.CoverageWeighting / medianCoverageLevel;
            int bestNumClusters = 0;

            // Need a large number of segments for cluster analysis
            if (usableSegments.Count > 100)
            {
                List<float> tempMAFList = new List<float>();
                foreach (SegmentInfo info in usableSegments) tempMAFList.Add(Convert.ToSingle(info.MAF));
                Tuple<float, float, float> MAFQuartiles = CanvasCommon.Utilities.Quartiles(tempMAFList);
                double minMAF = Math.Max(Convert.ToDouble(MAFQuartiles.Item1) - 0.05, 0.01);
                double maxMAF = Math.Min(Convert.ToDouble(MAFQuartiles.Item3) + 0.05, 0.46);

                // Step1: Find outliers
                double knearestNeighbourCutoff = KnearestNeighbourCutoff(usableSegments);

                // Step2: Find the best CoverageWeightingFactor 
                double bestCoverageWeightingFactor = BestCoverageWeightingFactor(usableSegments, maxCoverageLevel, medianCoverageLevel, knearestNeighbourCutoff);

                // Step3: Find the optimal number of clusters
                List<ModelPoint> modelPoints = BestNumClusters(usableSegments, medianCoverageLevel, bestCoverageWeightingFactor, knearestNeighbourCutoff);
                bestNumClusters = modelPoints.Count;

                // Step4: Find segment clusters using the final model
                GaussianMixtureModel gmm = new GaussianMixtureModel(modelPoints, usableSegments, medianCoverageLevel, bestCoverageWeightingFactor, knearestNeighbourCutoff);
                double likelihood = gmm.runExpectationMaximization();

                // Step5: Write results
                string debugPathClusterModel = Path.Combine(this.TempFolder, "ClusteringModel.txt");
                if (!string.IsNullOrEmpty(debugPathClusterModel))
                {
                    using (StreamWriter debugWriter = new StreamWriter(debugPathClusterModel))
                    {
                        debugWriter.WriteLine("#MAF\tCoverage\tClusterID");
                        foreach (ModelPoint modelPoint in modelPoints)
                        {
                            debugWriter.WriteLine("{0}\t{1}\t{2}", modelPoint.Ploidy.Mu[0], modelPoint.Ploidy.Mu[1], modelPoint.Cluster);
                        }
                        debugWriter.WriteLine();
                        debugWriter.WriteLine("#MAF\tCoverage\tBestDistance\tClusterID");
                        foreach (SegmentInfo info in usableSegments)
                        {
                            debugWriter.WriteLine("{0}\t{1}\t{2}", info.MAF, info.Coverage, info.Cluster);
                        }
                    }
                }
            }

            // Note: Don't consider purity below 20% (at this point), because that creates a model that is very noise-sensitive.
            // When we tried using a "tumor" sample that was actually just the real normal, we could overfit the data as very low
            // purity (5%) and make lots of (bogus) calls which fit the noise in coverage and MAF.

            double bestDeviation = double.MaxValue;
            List<CoveragePurityModel> allModels = new List<CoveragePurityModel>();
            // set best somatic model to pre-specified ploidy and purity values
            if (this.userPloidy != null && this.userPurity != null)
            {
                CoveragePurityModel bestModel = new CoveragePurityModel();
                bestModel.DiploidCoverage = medianCoverageLevel * Convert.ToDouble(this.userPloidy) / 2.0;
                bestModel.Purity = Convert.ToDouble(this.userPurity);

                this.ModelDeviation(bestModel, usableSegments, bestNumClusters);
                this.DiploidModelDistance(bestModel, usableSegments, genomeLength);
                return bestModel;
            }
            // find best somatic model
            else
            {
                // Coarse search: Consider various (coverage, purity) tuples.  
                int minCoverage = (int)Math.Max(10, medianCoverageLevel / 2.5);
                int maxCoverage = (int)Math.Max(10, medianCoverageLevel * 2.5);
                int coverageStep = Math.Max(1, (maxCoverage - minCoverage) / 80);
                Console.WriteLine(">>>DiploidCoverage: Consider {0}...{1} step {2}", minCoverage, maxCoverage, coverageStep);
                for (int coverage = minCoverage; coverage < maxCoverage; coverage += coverageStep)
                {
                    // iterate over purity range 
                    for (int percentPurity = 20; percentPurity <= 100; percentPurity += 5)
                    {
                        CoveragePurityModel model = new CoveragePurityModel();
                        model.DiploidCoverage = coverage;
                        model.Purity = percentPurity / 100f;
                        this.ModelDeviation(model, usableSegments, bestNumClusters);
                        this.DiploidModelDistance(model, usableSegments, genomeLength);
                        if (model.Deviation < bestDeviation && model.Ploidy < this.MaxAllowedPloidy && model.Ploidy > this.MinAllowedPloidy)
                        {
                            bestDeviation = model.Deviation;
                        }
                        // exclude models with unrealistic genome ploidies
                        if (model.Ploidy < this.MaxAllowedPloidy && model.Ploidy > this.MinAllowedPloidy)
                            allModels.Add(model);
                    }
                }

                // New logic for model selection:
                // - First, compute the best model deviation.  This establishes a baseline for how large the deviation is allowed to get in 
                //   an acceptable model.  Allow somewhat higher deviation for targeted data, since we see extra noise there.
                // - Review models.  Discard any with unacceptable deviation.  Note the best attainable % copy number 2 and % normal.
                // - For each model, scale PercentNormal to a range of 0..100 where 100 = the best number seen for any acceptable model.  Similarly
                //   for PercentCN2.  And similarly for DeviationScore: BestDeviation=1, WorstAllowedDeviation=0
                // - Choose a model (with acceptable deviation) which maximizes a score of the form:
                //   PercentNormal + a * PercentCN2 + b * DeviationScore
                double worstAllowedDeviation = bestDeviation * this.DeviationFactor;
                double bestCN2 = 0;
                double bestCN2Normal = 0;
                double bestDiploidDistance = 0;

                // derive max values for scaling
                int counter = 0;
                List<double> deviations = new List<double>();
                foreach (CoveragePurityModel model in allModels)
                {
                    if (model.Deviation < worstAllowedDeviation) counter++;
                    deviations.Add(model.Deviation);
                }
                deviations.Sort();
                if (counter < this.DeviationIndexCutoff)
                {
                    worstAllowedDeviation = deviations[Math.Min(this.DeviationIndexCutoff, deviations.Count - 1)];
                }

                double bestAccuracyDeviation = double.MaxValue;
                double bestPrecisionDeviation = double.MaxValue;
                // derive max values for scaling
                foreach (CoveragePurityModel model in allModels)
                {
                    bestAccuracyDeviation = Math.Min(bestAccuracyDeviation, model.AccuracyDeviation);
                    bestPrecisionDeviation = Math.Min(bestPrecisionDeviation, model.PrecisionDeviation);
                    if (model.Deviation > worstAllowedDeviation) continue;
                    if (model.PercentCN[2] > bestCN2) bestCN2 = model.PercentCN[2];
                    if (model.PercentNormal > bestCN2Normal) bestCN2Normal = model.PercentNormal;
                    if (model.DiploidDistance > bestDiploidDistance) bestDiploidDistance = model.DiploidDistance;
                }

                // coarse search to find best ploidy and purity model  
                List<CoveragePurityModel> bestModels = new List<CoveragePurityModel>();
                CoveragePurityModel bestModel = null;
                double bestScore = 0;
                // holds scores for all models
                List<double> scores = new List<double>();
                // save all purity and ploidy models to a file 
                string debugPath = Path.Combine(this.TempFolder, "PurityModel.txt");
                using (StreamWriter debugWriter = new StreamWriter(debugPath))
                {
                    debugWriter.Write("#Purity\tDiploidCoverage\t");
                    debugWriter.Write("Deviation\tAccuracyDeviation\tPrecisionDeviation\tWorstAllowedDeviation\tAccDev/best\tPrecDev/best\t");
                    debugWriter.Write("DeviationScore\tScore\tPloidy\t");
                    debugWriter.Write("Normal\tNormal/best\tCN2\tCN2/Best\t");
                    debugWriter.Write("DiploidDistance\tDiploidDistance/Best");
                    debugWriter.WriteLine();
                    foreach (CoveragePurityModel model in allModels)
                    {

                        // Filter models with unacceptable deviation:
                        if (model.Deviation > worstAllowedDeviation) continue;
                        // Transform purity into a weighting factor to penalize abnormal ploidies at low purity:
                        // (1.5 - 0.5) = min-max range of the new weighting scale; (1.0 - 0.2) = min-max range of the purity values.
                        // This transformation yields a maximal lowPurityWeightingFactor of 1.5 for the lowest-purity model and a minimal value of 0.75 for the highest-purity model.
                        double lowPurityWeightingFactor = 1.5 / ((1.5 - 0.5) / (1.0 - 0.2) * (model.Purity - 0.2) + 1.0);
                        double score = this.PercentNormal2WeightingFactor * model.PercentNormal / Math.Max(0.01, bestCN2Normal);
                        score += lowPurityWeightingFactor * this.CN2WeightingFactor * model.PercentCN[2] / Math.Max(0.01, bestCN2);
                        score += this.DeviationScoreWeightingFactor * (worstAllowedDeviation - model.Deviation) / (worstAllowedDeviation - bestDeviation);
                        score += this.DiploidDistanceScoreWeightingFactor * model.DiploidDistance / Math.Max(0.01, bestDiploidDistance);
                        scores.Add(score);
                        bestModels.Add(model);
                        // write to file
                        debugWriter.Write("{0}\t{1}\t", (int)Math.Round(100 * model.Purity), model.DiploidCoverage);
                        debugWriter.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t", model.Deviation, model.AccuracyDeviation, model.PrecisionDeviation,
                            worstAllowedDeviation, model.AccuracyDeviation / bestAccuracyDeviation, model.PrecisionDeviation / bestPrecisionDeviation);
                        debugWriter.Write("{0}\t{1}\t{2}\t", (worstAllowedDeviation - model.Deviation) / (worstAllowedDeviation - bestDeviation),
                            score, model.Ploidy);
                        debugWriter.Write("{0}\t{1}\t{2}\t{3}\t", model.PercentNormal, model.PercentNormal / Math.Max(0.01, bestCN2Normal),
                            model.PercentCN[2], model.PercentCN[2] / Math.Max(0.01, bestCN2));
                        debugWriter.Write("{0}\t{1}\t", model.DiploidDistance, model.DiploidDistance / Math.Max(0.01, bestDiploidDistance));
                        debugWriter.WriteLine();

                        if (score > bestScore)
                        {
                            bestModel = model;
                            bestScore = score;
                        }
                    }
                }
                // sort list and return indices
                var sortedScores = scores.Select((x, i) => new KeyValuePair<double, int>(x, i)).OrderBy(x => x.Key).ToList();
                List<double> scoresValue = sortedScores.Select(x => x.Key).ToList();
                List<int> scoresIndex = sortedScores.Select(x => x.Value).ToList();

                // interModelDistance shows the genome edit distance between the best model and the other top models (defined by MaximumRelatedModels).
                // The premise is that if the top models imply widely different genome baselines (leading to a high interModelDistance),
                // the overall modeling approach might be unstable.
                double interModelDistance = 0;
                // start at one since model #0 is the highest scoring model to compare to
                for (int i = 1; i < MaximumRelatedModels; i++)
                {
                    interModelDistance += CalculateModelDistance(bestModels[scoresIndex[0]], bestModels[scoresIndex[i]], usableSegments, genomeLength);
                }
                interModelDistance /= (double)MaximumRelatedModels;

                Console.WriteLine(">>> Initial model: Deviation {0:F5}, coverage {1}, purity {2:F1}%, CN2 {3:F2}", bestModel.Deviation,
                        bestModel.DiploidCoverage, 100 * bestModel.Purity, bestModel.PercentCN[2]);

                // Refine search: Smaller step sizes in the neighborhood of the initial model.
                minCoverage = (int)Math.Round(bestModel.DiploidCoverage) - 5;
                maxCoverage = (int)Math.Round(bestModel.DiploidCoverage) + 5;
                int minPurity = Math.Max(20, (int)Math.Round(bestModel.Purity * 100) - 10);
                int maxPurity = Math.Min(100, (int)Math.Round(bestModel.Purity * 100) + 10); // %%% magic numbers
                bestDeviation = double.MaxValue;

                bestModel = null;
                for (int coverage = minCoverage; coverage <= maxCoverage; coverage++)
                {
                    for (int percentPurity = minPurity; percentPurity <= maxPurity; percentPurity++)
                    {
                        CoveragePurityModel model = new CoveragePurityModel();
                        model.DiploidCoverage = coverage;
                        model.Purity = percentPurity / 100f;
                        this.ModelDeviation(model, usableSegments, bestNumClusters);
                        if (bestModel == null || model.Deviation < bestModel.Deviation)
                        {
                            bestModel = model;
                        }
                    }
                }
                // string debugPathClusterModel = Path.Combine(this.TempFolder, "ClusterModel.txt");
                string debugPathCNVModeling = Path.Combine(this.TempFolder, "CNVModeling.txt");

                ModelDeviation(bestModel, usableSegments, bestNumClusters, null, true, debugPathCNVModeling);
                Console.WriteLine();
                Console.WriteLine(">>> Refined model: Deviation {0:F5}, coverage {1}, purity {2:F1}%", bestModel.Deviation,
                    bestModel.DiploidCoverage, bestModel.Purity * 100);
                Console.WriteLine();

                // Reset per-ploidy Gaussian-mixture parameters; a null Sigma selects
                // the simple (non-GMM) ploidy-calling path downstream
                foreach (SegmentPloidy ploidy in AllPloidies)
                {
                    ploidy.Omega = 0;
                    ploidy.Mu = null;
                    ploidy.Sigma = null;
                }
                if (!bestModel.InterModelDistance.HasValue)
                {
                    bestModel.InterModelDistance = interModelDistance;
                }
                return bestModel;
            }
        }
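The purity re-weighting buried in the scoring loop above is easy to sanity-check at its endpoints; per the comment, it should evaluate to 1.5 at purity 0.20 and 0.75 at purity 1.00. This sketch just restates that formula:

using System;

static class PurityWeightDemo
{
    // Same expression as lowPurityWeightingFactor in the scoring loop
    static double LowPurityWeight(double purity) =>
        1.5 / ((1.5 - 0.5) / (1.0 - 0.2) * (purity - 0.2) + 1.0);

    static void Main()
    {
        Console.WriteLine(LowPurityWeight(0.20));  // 1.5  (lowest allowed purity)
        Console.WriteLine(LowPurityWeight(0.60));  // 1.0
        Console.WriteLine(LowPurityWeight(1.00));  // 0.75 (fully pure tumor)
    }
}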
Example 8
        /// <summary>
        /// Helper function for ModelOverallCoverageAndPurity.  Measure the deviation (mismatch) between our
        /// model of expected coverage + minor allele frequency, and the actual data.
        /// Note that this method updates the parameters in this.AllPloidies to match this model.
        /// TotalDeviation = PrecisionWeight * PrecisionDeviation + (1 - PrecisionWeight) * AccuracyDeviation
        /// PrecisionDeviation is the weighted average of the distance between segments and their assigned ploidy
        /// AccuracyDeviation is the weighted average of the distance between each segment centroid 
        /// and the corresponding ploidy.
        /// </summary>
        protected double ModelDeviation(CoveragePurityModel model, List<SegmentInfo> segments, int numClusters, string debugPathClusterInfo = null, bool bestModel = false, string debugPath = null)
        {
            List<ModelPoint> modelPoints = InitializeModelPoints(model);
            double precisionDeviation = 0;
            this.RefineDiploidMAF(segments, modelPoints);

            /////////////////////////////////////////////
            // Cluster our segments:
            Array.Clear(model.PercentCN, 0, model.PercentCN.Length);
            model.CNs.Clear();
            double totalWeight = 0;
            double totalBasesNormal = 0;
            foreach (SegmentInfo info in segments)
            {
                double bestDistance = double.MaxValue;
                int bestCN = 0;
                ModelPoint bestModelPoint = null;
                foreach (ModelPoint modelPoint in modelPoints)
                {
                    double distance = GetModelDistance(info.Coverage, modelPoint.Coverage, info.MAF, modelPoint.MAF);
                    if (distance < bestDistance)
                    {
                        bestDistance = distance;
                        bestCN = modelPoint.CN;
                        info.Ploidy = modelPoint.Ploidy;
                        bestModelPoint = modelPoint;
                    }
                }

                bestDistance = Math.Sqrt(bestDistance);
                info.Distance = bestDistance;            
                precisionDeviation += bestDistance * info.Weight;
                totalWeight += info.Weight;
                model.PercentCN[bestCN] += info.Weight;
                if (bestCN == 2 && info.Ploidy.MajorChromosomeCount == 1) totalBasesNormal += info.Weight;
                bestModelPoint.Weight += info.Weight;
                bestModelPoint.EmpiricalCoverage += info.Weight * info.Coverage;
                if (info.MAF >= 0)
                {
                    bestModelPoint.EmpiricalMAF += info.Weight * info.MAF;
                    bestModelPoint.MAFWeight += info.Weight;
                }
                // add the CN variant of the segment to the model
                if (bestCN == 2 && info.Ploidy.MajorChromosomeCount == 2)
                    // approximate LOH; we presume that LOH counts as one event, hence similar in effect to HET deletions
                    model.CNs.Add(1);
                else
                    model.CNs.Add(bestCN);
            }
            precisionDeviation /= totalWeight;

            // Compute AccuracyDeviation:
            double accuracyDeviation = 0;
            foreach (ModelPoint modelPoint in modelPoints)
            {
                if (modelPoint.Weight == 0) continue;
                modelPoint.EmpiricalCoverage /= modelPoint.Weight;
                if (modelPoint.MAFWeight > 0) modelPoint.EmpiricalMAF /= modelPoint.MAFWeight;
                double distance = this.GetModelDistance(modelPoint.Coverage, modelPoint.EmpiricalCoverage, modelPoint.MAF, modelPoint.EmpiricalMAF);
                distance = Math.Sqrt(distance);
                accuracyDeviation += distance * modelPoint.Weight;
                if (!string.IsNullOrEmpty(debugPath))
                {
                    Console.WriteLine("{0}\t{1}\t{2:F2}\t{3:F0}\t{4:F2}\t{5:F0}\t{6:F3},{7:F0}",
                        modelPoint.CN, modelPoint.Ploidy.MajorChromosomeCount,
                        modelPoint.MAF, modelPoint.Coverage,
                        modelPoint.EmpiricalMAF, modelPoint.EmpiricalCoverage,
                        distance, modelPoint.Weight);
                }
            }
            accuracyDeviation /= totalWeight;
            
            // standard somatic model deviation
            double tempDeviation = precisionDeviation * 0.5f + 0.5f * accuracyDeviation;

            // compute cluster deviation
            int heterogeneousClusters = 0;
            double heterogeneityIndex = 0;
            double clusterDeviation = ClusterDeviation(segments, numClusters, tempDeviation, out heterogeneousClusters, out heterogeneityIndex, bestModel, debugPathClusterInfo);

            // compute total deviation
            double totalDeviation;
            if (heterogeneousClusters > 0)
                totalDeviation = PrecisionWeightingFactor * precisionDeviation + PrecisionWeightingFactor * accuracyDeviation + PrecisionWeightingFactor * clusterDeviation;
            else
                totalDeviation = tempDeviation;


            // estimate abundance of each CN state
            for (int index = 0; index < model.PercentCN.Length; index++)
            {
                model.PercentCN[index] /= totalWeight;
            }
     
            // get model ploidy
            for (int index = 0; index < model.PercentCN.Length; index++)
            {
                model.Ploidy += index * model.PercentCN[index];
            }

            model.PercentNormal = totalBasesNormal / totalWeight;
            if (!string.IsNullOrEmpty(debugPath))
            {
                try
                {
                    using (StreamWriter debugWriter = new StreamWriter(debugPath))
                    {
                        debugWriter.WriteLine("#MAF\tCoverage\t");
                        foreach (ModelPoint modelPoint in modelPoints)
                        {
                            string gt = modelPoint.Ploidy.MajorChromosomeCount.ToString() + "/" + modelPoint.CN.ToString();
                            debugWriter.WriteLine("{0}\t{1}\t{2}\t", modelPoint.MAF, modelPoint.Coverage, gt);
                        }
                        debugWriter.WriteLine();
                        debugWriter.WriteLine("#MAF\tCoverage\tBestDistance\tChromosome\tBegin\tEnd\tLength\tTruthSetCN");
                        foreach (SegmentInfo info in segments)
                        {
                            // Find the best fit for this segment:
                            double bestDistance = double.MaxValue;
                            foreach (ModelPoint modelPoint in modelPoints)
                            {
                                double distance = GetModelDistance(info.Coverage, modelPoint.Coverage, info.MAF, modelPoint.MAF);
                                if (distance < bestDistance) bestDistance = distance;
                            }
                            bestDistance = Math.Sqrt(bestDistance);
                            debugWriter.Write("{0}\t{1}\t", info.MAF, info.Coverage);
                            debugWriter.Write("{0}\t{1}\t{2}\t{3}\t", bestDistance, info.Segment.Chr, info.Segment.Begin, info.Segment.End);
                            debugWriter.Write("{0}\t", info.Segment.End - info.Segment.Begin);
                            int CN = this.GetKnownCNForSegment(info.Segment);
                            debugWriter.Write("{0}\t", CN);
                            debugWriter.WriteLine();
                        }
                    }
                }
                catch (IOException ex)
                {
                    // Whine, but continue - failing to output this file is not fatal.
                    Console.Error.WriteLine(ex.ToString());
                }
            }

            // make sure that CN profile length is equal to the usable segments length
            if (model.CNs.Count != segments.Count)
            {
                throw new IndexOutOfRangeException(String.Concat("Canvas Somatic Caller error: index sizes do not match, ",
                    model.CNs.Count, " != ", segments.Count));
            }
            model.PrecisionDeviation = precisionDeviation;
            model.AccuracyDeviation = accuracyDeviation;
            model.Deviation = totalDeviation;
            model.HeterogeneityIndex = heterogeneityIndex;
            return totalDeviation;
        }
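Finally, the PrecisionDeviation term above is a weighted mean of each segment's distance to its nearest model point. Below is a minimal sketch with invented (MAF, coverage) coordinates and weights; note that the real code rescales the coverage axis (via CoverageWeightingFactor) before comparing distances, which this sketch omits:

using System;
using System.Linq;

static class PrecisionDeviationSketch
{
    // Squared Euclidean distance in (MAF, coverage) space, as compared in the loop above
    static double SquaredDistance((double MAF, double Coverage) a, (double MAF, double Coverage) b) =>
        (a.MAF - b.MAF) * (a.MAF - b.MAF) + (a.Coverage - b.Coverage) * (a.Coverage - b.Coverage);

    static void Main()
    {
        var modelPoints = new[] { (MAF: 0.50, Coverage: 1.00), (MAF: 0.33, Coverage: 1.50) };
        var segments = new[]
        {
            (MAF: 0.48, Coverage: 1.05, Weight: 2.0),
            (MAF: 0.35, Coverage: 1.40, Weight: 1.0),
        };

        double precisionDeviation = 0, totalWeight = 0;
        foreach (var seg in segments)
        {
            // assign the segment to its closest model point
            double best = modelPoints.Min(p => SquaredDistance((seg.MAF, seg.Coverage), p));
            precisionDeviation += Math.Sqrt(best) * seg.Weight;
            totalWeight += seg.Weight;
        }
        Console.WriteLine($"PrecisionDeviation = {precisionDeviation / totalWeight:F4}");
    }
}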