Example #1
        /// <summary>
        /// Fit a Gaussian mixture model.
        /// Fix the means to the model MAF and Coverage and run the EM algorithm until convergence.
        /// Compute the empirical MAF and Coverage.
        /// Fix the means to the empirical MAF and Coverage and run the EM algorithm again until convergence.
        /// Always estimate the full covariance matrix?
        /// </summary>
        /// <param name="model">Coverage model whose ploidy states provide the initial component means</param>
        /// <param name="segments">Segments whose (MAF, Coverage) points are fit by the mixture model</param>
        /// <param name="debugPath">Optional path for writing the fitted components and per-segment posterior probabilities</param>
        /// <returns>The likelihood returned by GaussianMixtureModel.Fit()</returns>
        private double FitGaussians(CoverageModel model, List<SegmentInfo> segments, string debugPath = null)
        {
            List<ModelPoint> modelPoints = InitializeModelPoints(model);

            GaussianMixtureModel gmm = new GaussianMixtureModel(modelPoints, segments, this.MeanCoverage, this.CoverageWeightingFactor, 0);
            double likelihood = gmm.Fit();

            if (debugPath != null)
            {
                // write Gaussian mixture model to debugPath
                using (StreamWriter writer = new StreamWriter(debugPath))
                {
                    writer.WriteLine("CN\tMajor Chr #\tMAF\tCoverage\tOmega\tMu0\tMu1\tSigma00\tSigma01\tSigma10\tSigma11");
                    foreach (ModelPoint modelPoint in modelPoints)
                    {
                        writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}",
                            modelPoint.Ploidy.CopyNumber, modelPoint.Ploidy.MajorChromosomeCount,
                            modelPoint.Ploidy.MixedMinorAlleleFrequency, modelPoint.Ploidy.MixedCoverage,
                            modelPoint.Ploidy.Omega, modelPoint.Ploidy.Mu[0], modelPoint.Ploidy.Mu[1],
                            modelPoint.Ploidy.Sigma[0][0], modelPoint.Ploidy.Sigma[0][1],
                            modelPoint.Ploidy.Sigma[1][0], modelPoint.Ploidy.Sigma[1][1]);
                    }

                    writer.WriteLine("");
                    writer.WriteLine("MAF\tCoverage\tPosterior Probabilities");
                    StringBuilder sb = new StringBuilder();
                    foreach (SegmentInfo segment in segments)
                    {
                        sb.Clear();
                        sb.AppendFormat("{0}\t{1}", segment.MAF, segment.Coverage);
                        foreach (ModelPoint modelPoint in modelPoints)
                        {
                            sb.AppendFormat("\t{0}", segment.PosteriorProbs[modelPoint]);
                        }
                        writer.WriteLine(sb.ToString());
                    }
                }
            }

            return likelihood;
        }
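        // Hypothetical call-site sketch (not part of the original source): FitGaussians is private, so it would be
        // invoked from within this class once a CoverageModel and its segments are available. The wrapper name and
        // the debug file name are assumptions for illustration; this.TempFolder is the field used elsewhere in the class.
        private void FitGaussiansWithDebugOutput(CoverageModel model, List<SegmentInfo> segments)
        {
            string debugPath = Path.Combine(this.TempFolder, "GaussianMixtureModel.txt"); // hypothetical file name
            double likelihood = FitGaussians(model, segments, debugPath);
            Console.WriteLine("Gaussian mixture fit complete, likelihood {0:F5}", likelihood);
        }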
Example #2
        /// <summary>
        /// Estimate the optimal coverage weighting factor for the Expectation Maximization model
        /// by maximizing the model likelihood over a grid of candidate weighting factors
        /// </summary>
        public double BestCoverageWeightingFactor(List<SegmentInfo> usableSegments, int maxCoverageLevel, int medianCoverageLevel, double knearestNeighbourCutoff)
        {
            double bestLikelihood = double.MinValue;
            double bestCoverageWeightingFactor = 0;
            // Magic scaling for now: keep the factor small to penalize coverage; tested on a 50+ sample ground-truth corpus
            double maxCoverageWeightingFactor = this.CoverageWeighting / medianCoverageLevel;
            double minCoverageWeightingFactor = 0.1 / maxCoverageLevel;
            double stepCoverageWeightingFactor =  Math.Max(0.00001, (maxCoverageWeightingFactor - minCoverageWeightingFactor) / 10);
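            // Illustrative numbers only (these values are assumptions, not from the original source): with
            // CoverageWeighting = 0.4, medianCoverageLevel = 100 and maxCoverageLevel = 150, the bounds become
            // maxCoverageWeightingFactor = 0.004 and minCoverageWeightingFactor ~= 0.00067, so
            // stepCoverageWeightingFactor ~= 0.00033 and roughly ten candidate factors are evaluated in the loop below.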

            for (double coverageWeighting = minCoverageWeightingFactor; coverageWeighting < maxCoverageWeightingFactor; coverageWeighting += stepCoverageWeightingFactor)
            {
                List<ModelPoint> tempModelPoints = InitializeModelPoints(usableSegments, medianCoverageLevel/2.0, 90, 6);
                GaussianMixtureModel tempgmm = new GaussianMixtureModel(tempModelPoints, usableSegments, medianCoverageLevel, coverageWeighting, knearestNeighbourCutoff);
                double currentLikelihood = tempgmm.runExpectationMaximization();
                if (currentLikelihood > bestLikelihood)
                {
                    bestLikelihood = currentLikelihood;
                    bestCoverageWeightingFactor = coverageWeighting;                    
                }
            }
            return bestCoverageWeightingFactor;
        } 
Example #3
        /// <summary>
        /// Identify the tuple (DiploidCoverage, OverallPurity) which best models our overall
        /// distribution of (MAF, Coverage) data across all segments.  Consider various tuples (first with a coarse-grained
        /// and then a fine-grained search), and for each one, measure the distortion: the average distance (weighted
        /// by segment length) between the actual and modeled (MAF, Coverage) coordinates.
        /// </summary>
        protected CoveragePurityModel ModelOverallCoverageAndPurity(long genomeLength)
        {
            List<SegmentInfo> usableSegments;
            // Identify usable segments using our MinimumVariantFrequenciesForInformativeSegment cutoff, 
            // then (if we don't find enough) we can try again with progressively more permissive cutoffs.
            while (true)
            {
                usableSegments = GetUsableSegmentsForModeling(this.Segments);
                int validMAFCount = usableSegments.Count(x => x.MAF >= 0);
                if (validMAFCount > Math.Min(20, this.Segments.Count)) break; // We have enough usable segments with nonnull MAF
                if (MinimumVariantFrequenciesForInformativeSegment <= 5) break; // Give up on modeling
                MinimumVariantFrequenciesForInformativeSegment -= 15;
            }
            Console.WriteLine("Modeling overall coverage/purity across {0} segments", usableSegments.Count);
            if (usableSegments.Count < 10)
                throw new UncallableDataException("Cannot model coverage/purity with less than 10 segments.");

            // When computing distances between model and actual points, we want to provide roughly equal weight
            // to coverage (which covers a large range) and MAF, which falls in the range (0, 0.5).  
            // If we already knew the diploid coverage, then we'd know just how to scale things (catch-22).
            // Let's assume that the median coverage is a sane thing to use for scaling:
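            // For illustration (assumed values, not from the original source): with CoverageWeighting = 0.4 and a
            // median coverage of 80, CoverageWeightingFactor = 0.005, so a coverage difference on the order of the
            // median (80) maps to 0.4 -- comparable to the maximum possible MAF difference of 0.5.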
            List<float> tempCoverageList = new List<float>();
            List<double> knearestNeighbourList = new List<double>();
                        
            // Segment clustering using Gaussian Expectation Maximization

            // Step0: Prepare model parameters
            foreach (SegmentInfo info in usableSegments) tempCoverageList.Add(Convert.ToSingle(info.Coverage));
            Tuple<float, float, float> coverageQuartiles = CanvasCommon.Utilities.Quartiles(tempCoverageList);
            int minCoverageLevel = Convert.ToInt32(coverageQuartiles.Item1);
            int maxCoverageLevel = Convert.ToInt32(coverageQuartiles.Item3);
            int medianCoverageLevel = Convert.ToInt32(coverageQuartiles.Item2);
            this.CoverageWeightingFactor = this.CoverageWeighting / medianCoverageLevel;
            int bestNumClusters = 0;

            // Need a large number of segments for cluster analysis
            if (usableSegments.Count > 100)
            {
                List<float> tempMAFList = new List<float>();
                foreach (SegmentInfo info in usableSegments) tempMAFList.Add(Convert.ToSingle(info.MAF));
                Tuple<float, float, float> MAFQuartiles = CanvasCommon.Utilities.Quartiles(tempMAFList);
                double minMAF = Math.Max(Convert.ToDouble(MAFQuartiles.Item1) - 0.05, 0.01);
                double maxMAF = Math.Min(Convert.ToDouble(MAFQuartiles.Item3) + 0.05, 0.46);

                // Step1: Find outliers
                double knearestNeighbourCutoff = KnearestNeighbourCutoff(usableSegments);

                // Step2: Find the best CoverageWeightingFactor 
                double bestCoverageWeightingFactor = BestCoverageWeightingFactor(usableSegments, maxCoverageLevel, medianCoverageLevel, knearestNeighbourCutoff);

                // Step3: Find the optimal number of clusters
                List<ModelPoint> modelPoints = BestNumClusters(usableSegments, medianCoverageLevel, bestCoverageWeightingFactor, knearestNeighbourCutoff);
                bestNumClusters = modelPoints.Count;

                // Step4: Find segment clusters using the final model
                GaussianMixtureModel gmm = new GaussianMixtureModel(modelPoints, usableSegments, medianCoverageLevel, bestCoverageWeightingFactor, knearestNeighbourCutoff);
                double likelihood = gmm.runExpectationMaximization();

                // Step5: Write results
                string debugPathClusterModel = Path.Combine(this.TempFolder, "ClusteringModel.txt");
                if (!string.IsNullOrEmpty(debugPathClusterModel))
                {
                    using (StreamWriter debugWriter = new StreamWriter(debugPathClusterModel))
                    {
                        debugWriter.WriteLine("#MAF\tCoverage\tClusterID");
                        foreach (ModelPoint modelPoint in modelPoints)
                        {
                            debugWriter.WriteLine("{0}\t{1}\t{2}", modelPoint.Ploidy.Mu[0], modelPoint.Ploidy.Mu[1], modelPoint.Cluster);
                        }
                        debugWriter.WriteLine();
                        debugWriter.WriteLine("#MAF\tCoverage\tBestDistance\tClusterID");
                        foreach (SegmentInfo info in usableSegments)
                        {
                            debugWriter.WriteLine("{0}\t{1}\t{2}", info.MAF, info.Coverage, info.Cluster);
                        }
                    }
                }
            }

            // Note: Don't consider purity below 20% (at this point), because that creates a model that is very noise-sensitive.
            // We tried using a "tumor" sample that was actually just the real normal: we could overfit this data as a very low
            // purity (5%) model and make lots of (bogus) calls which fit the noise in coverage and MAF.

            double bestDeviation = double.MaxValue;
            List<CoveragePurityModel> allModels = new List<CoveragePurityModel>();
            // set best somatic model to pre-specified  ploidy and purity values
            if (this.userPloidy != null && this.userPurity != null)
            {
                CoveragePurityModel bestModel = new CoveragePurityModel();
                bestModel.DiploidCoverage = medianCoverageLevel * Convert.ToDouble(this.userPloidy) / 2.0;
                bestModel.Purity = Convert.ToDouble(this.userPurity);

                this.ModelDeviation(bestModel, usableSegments, bestNumClusters);
                this.DiploidModelDistance(bestModel, usableSegments, genomeLength);
                return bestModel;
            }
            // find best somatic model
            else
            {
                // Coarse search: Consider various (coverage, purity) tuples.  
                int minCoverage = (int)Math.Max(10, medianCoverageLevel / 2.5);
                int maxCoverage = (int)Math.Max(10, medianCoverageLevel * 2.5);
                int coverageStep = Math.Max(1, (maxCoverage - minCoverage) / 80);
                Console.WriteLine(">>>DiploidCoverage: Consider {0}...{1} step {2}", minCoverage, maxCoverage, coverageStep);
                for (int coverage = minCoverage; coverage < maxCoverage; coverage += coverageStep)
                {
                    // iterate over purity range 
                    for (int percentPurity = 20; percentPurity <= 100; percentPurity += 5)
                    {
                        CoveragePurityModel model = new CoveragePurityModel();
                        model.DiploidCoverage = coverage;
                        model.Purity = percentPurity / 100f;
                        this.ModelDeviation(model, usableSegments, bestNumClusters);
                        this.DiploidModelDistance(model, usableSegments, genomeLength);
                        if (model.Deviation < bestDeviation && model.Ploidy < this.MaxAllowedPloidy && model.Ploidy > this.MinAllowedPloidy)
                        {
                            bestDeviation = model.Deviation;
                        }
                        // exclude models with unrealistic genome ploidies
                        if (model.Ploidy < this.MaxAllowedPloidy && model.Ploidy > this.MinAllowedPloidy)
                            allModels.Add(model);
                    }
                }

                // New logic for model selection:
                // - First, compute the best model deviation.  This establishes a baseline for how large the deviation is allowed to get in 
                //   an acceptable model.  Allow somewhat higher deviation for targeted data, since we see extra noise there.
                // - Review models.  Discard any with unacceptable deviation.  Note the best attainable % copy number 2 and % normal.
                // - For each model, scale PercentNormal to a range of 0..100 where 100 = the best number seen for any acceptable model.  Similarly
                //   for PercentCN2.  And similarly for DeviationScore: BestDeviation=1, WorstAllowedDeviation=0
                // - Choose a model (with acceptable deviation) which maximizes a score of the form:
                //   PercentNormal + a * PercentCN2 + b * DeviationScore
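                // As implemented in the loop below, the full score is:
                //   score = PercentNormal2WeightingFactor * (PercentNormal / bestCN2Normal)
                //         + lowPurityWeightingFactor * CN2WeightingFactor * (PercentCN[2] / bestCN2)
                //         + DeviationScoreWeightingFactor * (worstAllowedDeviation - Deviation) / (worstAllowedDeviation - bestDeviation)
                //         + DiploidDistanceScoreWeightingFactor * (DiploidDistance / bestDiploidDistance)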
                double worstAllowedDeviation = bestDeviation * this.DeviationFactor;
                double bestCN2 = 0;
                double bestCN2Normal = 0;
                double bestDiploidDistance = 0;

                // derive max values for scaling
                int counter = 0;
                List<double> deviations = new List<double>();
                foreach (CoveragePurityModel model in allModels)
                {
                    if (model.Deviation < worstAllowedDeviation) counter++;
                    deviations.Add(model.Deviation);
                }
                deviations.Sort();
                if (counter < this.DeviationIndexCutoff)
                {
                    worstAllowedDeviation = deviations[Math.Min(this.DeviationIndexCutoff, deviations.Count - 1)];
                }

                double bestAccuracyDeviation = double.MaxValue;
                double bestPrecisionDeviation = double.MaxValue;
                // derive max values for scaling
                foreach (CoveragePurityModel model in allModels)
                {
                    bestAccuracyDeviation = Math.Min(bestAccuracyDeviation, model.AccuracyDeviation);
                    bestPrecisionDeviation = Math.Min(bestPrecisionDeviation, model.PrecisionDeviation);
                    if (model.Deviation > worstAllowedDeviation) continue;
                    if (model.PercentCN[2] > bestCN2) bestCN2 = model.PercentCN[2];
                    if (model.PercentNormal > bestCN2Normal) bestCN2Normal = model.PercentNormal;
                    if (model.DiploidDistance > bestDiploidDistance) bestDiploidDistance = model.DiploidDistance;
                }

                // coarse search to find best ploidy and purity model  
                List<CoveragePurityModel> bestModels = new List<CoveragePurityModel>();
                CoveragePurityModel bestModel = null;
                double bestScore = 0;
                // holds scores for all models
                List<double> scores = new List<double>();
                // save all purity and ploidy models to a file 
                string debugPath = Path.Combine(this.TempFolder, "PurityModel.txt");
                using (StreamWriter debugWriter = new StreamWriter(debugPath))
                {
                    debugWriter.Write("#Purity\tDiploidCoverage\t");
                    debugWriter.Write("Deviation\tAccuracyDeviation\tPrecisionDeviation\tWorstAllowedDeviation\tAccDev/best\tPrecDev/best\t");
                    debugWriter.Write("DeviationScore\tScore\tPloidy\t");
                    debugWriter.Write("Normal\tNormal/best\tCN2\tCN2/Best\t");
                    debugWriter.Write("DiploidDistance\tDiploidDistance/Best");
                    debugWriter.WriteLine();
                    foreach (CoveragePurityModel model in allModels)
                    {

                        // Filter models with unacceptable deviation:
                        if (model.Deviation > worstAllowedDeviation) continue;
                        // Transform purity into a weighting factor that penalizes abnormal ploidies at low purity:
                        // (1.5 - 0.5) = min-max range of the new weighting scale; (1.0 - 0.2) = min-max range of the purity values.
                        // This transformation yields a maximum lowPurityWeightingFactor of 1.5 for the lowest-purity model and a minimum of 0.75 for the highest-purity model.
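                        // Worked values from the formula below: purity 0.2 gives 1.5 / 1.0 = 1.5,
                        // purity 0.6 gives 1.5 / 1.5 = 1.0, and purity 1.0 gives 1.5 / 2.0 = 0.75.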
                        double lowPurityWeightingFactor = 1.5 / ((1.5 - 0.5) / (1.0 - 0.2) * (model.Purity - 0.2) + 1.0);
                        double score = this.PercentNormal2WeightingFactor * model.PercentNormal / Math.Max(0.01, bestCN2Normal);
                        score += lowPurityWeightingFactor * this.CN2WeightingFactor * model.PercentCN[2] / Math.Max(0.01, bestCN2);
                        score += this.DeviationScoreWeightingFactor * (worstAllowedDeviation - model.Deviation) / (worstAllowedDeviation - bestDeviation);
                        score += this.DiploidDistanceScoreWeightingFactor * model.DiploidDistance / Math.Max(0.01, bestDiploidDistance);
                        scores.Add(score);
                        bestModels.Add(model);
                        // write to file
                        debugWriter.Write("{0}\t{1}\t", (int)Math.Round(100 * model.Purity), model.DiploidCoverage);
                        debugWriter.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t", model.Deviation, model.AccuracyDeviation, model.PrecisionDeviation,
                            worstAllowedDeviation, model.AccuracyDeviation / bestAccuracyDeviation, model.PrecisionDeviation / bestPrecisionDeviation);
                        debugWriter.Write("{0}\t{1}\t{2}\t", (worstAllowedDeviation - model.Deviation) / (worstAllowedDeviation - bestDeviation),
                            score, model.Ploidy);
                        debugWriter.Write("{0}\t{1}\t{2}\t{3}\t", model.PercentNormal, model.PercentNormal / Math.Max(0.01, bestCN2Normal),
                            model.PercentCN[2], model.PercentCN[2] / Math.Max(0.01, bestCN2));
                        debugWriter.Write("{0}\t{1}\t", model.DiploidDistance, model.DiploidDistance / Math.Max(0.01, bestDiploidDistance));
                        debugWriter.WriteLine();

                        if (score > bestScore)
                        {
                            bestModel = model;
                            bestScore = score;
                        }
                    }
                }
                // sort list and return indices
                var sortedScores = scores.Select((x, i) => new KeyValuePair<double, int>(x, i)).OrderBy(x => x.Key).ToList();
                List<double> scoresValue = sortedScores.Select(x => x.Key).ToList();
                List<int> scoresIndex = sortedScores.Select(x => x.Value).ToList();

                // interModelDistance shows genome edit distance between the best model and other top models (defined by MaximumRelatedModels). 
                // The premise is that if the top models provide widely different genome baseline (leading to high interModelDistance), 
                // the overall modeling approach might be more unstable.
                double interModelDistance = 0;
                // start at one since model #0 is the highest scoring model to compare to
                for (int i = 1; i < MaximumRelatedModels; i++)
                {
                    interModelDistance += CalculateModelDistance(bestModels[scoresIndex[0]], bestModels[scoresIndex[i]], usableSegments, genomeLength);
                }
                interModelDistance /= (double)MaximumRelatedModels;

                Console.WriteLine(">>> Initial model: Deviation {0:F5}, coverage {1}, purity {2:F1}%, CN2 {3:F2}", bestModel.Deviation,
                        bestModel.DiploidCoverage, 100 * bestModel.Purity, bestModel.PercentCN[2]);

                // Refine search: Smaller step sizes in the neighborhood of the initial model.
                minCoverage = (int)Math.Round(bestModel.DiploidCoverage) - 5;
                maxCoverage = (int)Math.Round(bestModel.DiploidCoverage) + 5;
                int minPurity = Math.Max(20, (int)Math.Round(bestModel.Purity * 100) - 10);
                int maxPurity = Math.Min(100, (int)Math.Round(bestModel.Purity * 100) + 10); // %%% magic numbers
                bestDeviation = double.MaxValue;

                bestModel = null;
                for (int coverage = minCoverage; coverage <= maxCoverage; coverage++)
                {
                    for (int percentPurity = minPurity; percentPurity <= maxPurity; percentPurity++)
                    {
                        CoveragePurityModel model = new CoveragePurityModel();
                        model.DiploidCoverage = coverage;
                        model.Purity = percentPurity / 100f;
                        this.ModelDeviation(model, usableSegments, bestNumClusters);
                        if (bestModel == null || model.Deviation < bestModel.Deviation)
                        {
                            bestModel = model;
                        }
                    }
                }
                // string debugPathClusterModel = Path.Combine(this.TempFolder, "ClusterModel.txt");
                string debugPathCNVModeling = Path.Combine(this.TempFolder, "CNVModeling.txt");

                ModelDeviation(bestModel, usableSegments, bestNumClusters, null, true, debugPathCNVModeling);
                Console.WriteLine();
                Console.WriteLine(">>> Refined model: Deviation {0:F5}, coverage {1}, purity {2:F1}%", bestModel.Deviation,
                    bestModel.DiploidCoverage, bestModel.Purity * 100);
                Console.WriteLine();


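                // Clear the per-ploidy Gaussian mixture parameters (Omega, Mu, Sigma) before returning the best model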
                {
                    foreach (SegmentPloidy ploidy in AllPloidies)
                    {
                        ploidy.Omega = 0;
                        ploidy.Mu = null;
                        ploidy.Sigma = null;
                    }
                }
                if (!bestModel.InterModelDistance.HasValue)
                {
                    bestModel.InterModelDistance = interModelDistance;
                }
                return bestModel;
            }
        }
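        /// <summary>
        /// A minimal sketch (not the original implementation) of the "distortion" described in the summary above:
        /// the segment-length-weighted average distance between each observed (MAF, Coverage) point and the nearest
        /// model point. The ModelPoint.MAF/ModelPoint.Coverage properties and SegmentInfo.Weight are assumptions
        /// for illustration; the real deviation is computed by ModelDeviation.
        /// </summary>
        private static double SketchDistortion(List<SegmentInfo> segments, List<ModelPoint> modelPoints,
            double coverageWeightingFactor)
        {
            double totalDistance = 0;
            double totalWeight = 0;
            foreach (SegmentInfo segment in segments)
            {
                if (segment.MAF < 0) continue; // skip segments without an informative MAF
                double bestDistance = double.MaxValue;
                foreach (ModelPoint modelPoint in modelPoints)
                {
                    // Scale coverage so that it carries roughly the same weight as MAF, which lives in (0, 0.5)
                    double diffCoverage = coverageWeightingFactor * (segment.Coverage - modelPoint.Coverage);
                    double diffMAF = segment.MAF - modelPoint.MAF;
                    bestDistance = Math.Min(bestDistance, Math.Sqrt(diffCoverage * diffCoverage + diffMAF * diffMAF));
                }
                totalDistance += bestDistance * segment.Weight; // weight by segment length
                totalWeight += segment.Weight;
            }
            return totalWeight > 0 ? totalDistance / totalWeight : double.MaxValue;
        }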
Example #4
        /// <summary>
        /// Estimate the optimal number of clusters for the Expectation Maximization model using the silhouette coefficient
        /// </summary>
        public List<ModelPoint> BestNumClusters(List<SegmentInfo> usableSegments, double medianCoverageLevel, double bestCoverageWeightingFactor, double knearestNeighbourCutoff)
        {
            double bestSilhouette = double.MinValue;
            List<ModelPoint> bestModelPoints = new List<ModelPoint>(); 
            int bestNumClusters = 0;
            int maxNumClusters = 8;

            // find distanceThreshold, use it in InitializeModelPoints
            List<double> tempModelDistanceList = new List<double>();
            for (int i = 0; i < usableSegments.Count; i++)
            {
                for (int j = 0; j < usableSegments.Count; j++)
                {
                    if (i != j && usableSegments[i].Cluster != -1 && usableSegments[j].Cluster != -1 &&  usableSegments[i].MAF >= 0 && usableSegments[j].MAF >= 0)
                    {
                        tempModelDistanceList.Add(GetModelDistance(usableSegments[i].Coverage, usableSegments[j].Coverage, usableSegments[i].MAF, usableSegments[j].MAF));
                    }
                }
            }
            tempModelDistanceList.Sort();
            int distanceThresholdIndex = Math.Min(Convert.ToInt32(tempModelDistanceList.Count * 0.8), tempModelDistanceList.Count - 1);
            double distanceThreshold = tempModelDistanceList[distanceThresholdIndex]; // enable capturing clusters with less than 30% abundance 

            // find optimal number of clusters using silhouette distance and return bestModelPoints
            for (int numClusters = 4; numClusters < maxNumClusters; numClusters++)
            {
                for (int i = 0; i < 10; i++)
                {
                    List<ModelPoint> tempModelPoints = InitializeModelPoints(usableSegments, numClusters, distanceThreshold);
                    GaussianMixtureModel tempgmm = new GaussianMixtureModel(tempModelPoints, usableSegments, medianCoverageLevel, bestCoverageWeightingFactor, knearestNeighbourCutoff);
                    double currentLikelihood = tempgmm.runExpectationMaximization(); // return BIC rather than raw likelihood
                    double currentSilhouette = ComputeSilhouette(usableSegments, numClusters);
                    if (bestSilhouette < currentSilhouette)
                    {
                        bestSilhouette = currentSilhouette;
                        bestNumClusters = numClusters;
                        if (bestModelPoints.Count > 0)
                            bestModelPoints.Clear();
                        foreach (ModelPoint tempModelPoint in tempModelPoints)
                            bestModelPoints.Add(tempModelPoint);
                    }
                }
            }
            return bestModelPoints;
        }
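        /// <summary>
        /// A minimal sketch of the silhouette coefficient that BestNumClusters uses to score a clustering; this is
        /// not the actual ComputeSilhouette implementation. For each segment, a = mean distance to members of its
        /// own cluster, b = smallest mean distance to any other cluster, and s = (b - a) / max(a, b); the score is
        /// the mean of s over all clustered segments. Cluster labels 1..numClusters with -1 marking outliers, and
        /// the int cast on Cluster, are assumptions for illustration; GetModelDistance is the same coverage/MAF
        /// distance used elsewhere in this class.
        /// </summary>
        private double SketchSilhouette(List<SegmentInfo> segments, int numClusters)
        {
            double totalSilhouette = 0;
            int counted = 0;
            foreach (SegmentInfo segment in segments)
            {
                if (segment.Cluster == -1 || segment.MAF < 0) continue;
                int ownCluster = (int)segment.Cluster;
                double[] distanceSum = new double[numClusters + 1];
                int[] clusterSize = new int[numClusters + 1];
                foreach (SegmentInfo other in segments)
                {
                    if (ReferenceEquals(other, segment) || other.Cluster == -1 || other.MAF < 0) continue;
                    int otherCluster = (int)other.Cluster;
                    distanceSum[otherCluster] += GetModelDistance(segment.Coverage, other.Coverage, segment.MAF, other.MAF);
                    clusterSize[otherCluster]++;
                }
                if (clusterSize[ownCluster] == 0) continue; // singleton cluster: silhouette undefined
                double a = distanceSum[ownCluster] / clusterSize[ownCluster];
                double b = double.MaxValue;
                for (int cluster = 1; cluster <= numClusters; cluster++)
                {
                    if (cluster == ownCluster || clusterSize[cluster] == 0) continue;
                    b = Math.Min(b, distanceSum[cluster] / clusterSize[cluster]);
                }
                if (b == double.MaxValue) continue; // no other populated cluster to compare against
                totalSilhouette += (b - a) / Math.Max(a, b);
                counted++;
            }
            return counted > 0 ? totalSilhouette / counted : double.MinValue;
        }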