Represents a point in (MAF, Coverage) space to be clustered:
Inheritance: ModelPoint
Beispiel #1
0
        /// <summary>
        /// Computes, for one segment, the normalized weight of every model point and stores the result
        /// in <c>segment.PosteriorProbs</c>; also records the cluster of the highest-weight model point
        /// in <c>segment.Cluster</c>.
        /// </summary>
        /// <param name="modelPoints">Candidate model points; each contributes Ploidy.Omega times the value returned by Sigma.</param>
        /// <param name="segment">Segment whose MAF and Coverage are evaluated; its PosteriorProbs and Cluster are overwritten.</param>
        private void EMComputePosteriorProbs(List<ModelPoint> modelPoints, SegmentInfo segment)
        {
            // Pass 1: unnormalized weight per model point, plus the running total used as the normalizer.
            var weightByPoint = new Dictionary<ModelPoint, double>();
            double weightTotal = 0;
            foreach (ModelPoint point in modelPoints)
            {
                double weight = point.Ploidy.Omega * Sigma(segment.MAF, segment.Coverage, point.Ploidy.Mu, point.Ploidy.Sigma);
                weightByPoint[point] = weight;
                weightTotal += weight;
            }

            if (segment.PosteriorProbs == null)
            {
                segment.PosteriorProbs = new Dictionary<ModelPoint, double>();
            }

            // Pass 2: normalize and track the best-scoring cluster (cluster 0 if nothing scores above 0).
            int winningCluster = 0;
            double winningProb = 0;
            foreach (ModelPoint point in modelPoints)
            {
                double posterior = weightByPoint[point] / weightTotal;

                // A NaN quotient (e.g. a zero total) is stored as 0. NaN never compares greater than
                // winningProb, so it can never be picked as the winner below.
                segment.PosteriorProbs[point] = Double.IsNaN(posterior) ? 0 : posterior;

                if (posterior > winningProb)
                {
                    winningCluster = point.Cluster.Value;
                    winningProb = posterior;
                }
            }
            segment.Cluster = winningCluster;
        }
Beispiel #2
0
        /// <summary>
        /// Runs the calling pipeline visible in this method: loads segments and variant frequencies,
        /// builds a per-segment (MAF, coverage, weight) summary, assigns ploidy calls, merges and scores
        /// the segments, and writes the coverage plot data and the called segments.
        /// </summary>
        /// <param name="variantFrequencyFile">Path passed to CanvasIO.LoadVariantFrequencies.</param>
        /// <param name="inFile">Segment file to read; its directory is stored in TempFolder.</param>
        /// <param name="outFile">Destination for the called segments; also used to derive the coverage/variant-frequency output path.</param>
        /// <param name="ploidyBedPath">Optional ploidy BED file; skipped when null or empty.</param>
        /// <param name="referenceFolder">Forwarded to the segment writer and the coverage plot writer.</param>
        /// <param name="sampleName">Forwarded to CanvasSegment.WriteSegments.</param>
        /// <param name="truthDataPath">Optional known-copy-number file; when non-empty, a report against it is generated.</param>
        /// <returns>0 on every return path visible in this method.</returns>
        public int CallVariants(string variantFrequencyFile, string inFile, string outFile, string ploidyBedPath, string referenceFolder, string sampleName,
            string truthDataPath)
        {
            // Truth data is optional; CNOracle stays unset unless a path was supplied.
            if (!string.IsNullOrEmpty(truthDataPath))
            {
                this.CNOracle = new CopyNumberOracle();
                this.CNOracle.LoadKnownCN(truthDataPath);
            }

            this.Segments = CanvasSegment.ReadSegments(inFile);
            this.TempFolder = Path.GetDirectoryName(inFile);
            // No segments: write the (empty) segment list to outFile, without extra headers or ploidy, and stop.
            if (this.Segments.Count == 0)
            {
                Console.WriteLine("CanvasDiploidCaller: No segments loaded; no CNV calls will be made.");
                CanvasSegment.WriteSegments(outFile, this.Segments, referenceFolder, sampleName, null, null);
                return 0;
            }
            // ploidy remains null when no BED path is given; the writers below receive it as-is.
            PloidyInfo ploidy = null;
            if (!string.IsNullOrEmpty(ploidyBedPath)) ploidy = PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);

            // load MAF
            this.MeanCoverage = CanvasIO.LoadVariantFrequencies(variantFrequencyFile, this.Segments);
            int medianVariantCoverage = AggregateVariantCoverage(ref this.Segments);

            // Create new models for different copy number states
            this.InitializePloidies();

            // Compute statistics on the copy number two regions
            float[] diploidCounts = AggregateCounts(ref this.Segments);
            DiploidCoverage = CanvasCommon.Utilities.Mean(diploidCounts);
            // NOTE(review): no guard here for DiploidCoverage == 0 - confirm AggregateCounts cannot yield that.
            CoverageWeightingFactor = CoverageWeighting / DiploidCoverage;

            // new coverage model
            this.Model = new CoverageModel();
            Model.DiploidCoverage = DiploidCoverage;
            // Build one SegmentInfo (MAF, coverage, weight) per loaded segment.
            List<SegmentInfo> segments = new List<SegmentInfo>();
            foreach (CanvasSegment segment in this.Segments)
            {
                SegmentInfo info = new SegmentInfo();
                info.Segment = segment;
                // Fold each variant frequency: values above 0.5 are replaced by 1 - value.
                List<double> MAF = new List<double>();
                foreach (float value in segment.VariantFrequencies) MAF.Add(value > 0.5 ? 1 - value : value);

                if (MAF.Count > 0)
                {
                    info.MAF = CanvasCommon.Utilities.Median(MAF);

                }
                else
                {
                    // -1 marks a segment that has no variant frequencies at all.
                    info.MAF = -1;
                }

                info.Coverage = CanvasCommon.Utilities.Median(segment.Counts);

                // Weight is the segment length (End - Begin) when more than 100 segments were loaded,
                // otherwise the segment's bin count.
                if (this.Segments.Count > 100)
                {
                    info.Weight = segment.End - segment.Begin;
                }
                else
                {
                    info.Weight = segment.BinCount;
                }
                segments.Add(info);
            }

            // Assign copy number and major chromosome count for each segment
            bool useGaussianMixtureModel = false; // For now, this is set false, since we saw weird performance on chrY (CANV-115):
            if (useGaussianMixtureModel)
            {
                // optimize model covariance
                // NOTE(review): likelihood is assigned but never read in this method.
                double likelihood = FitGaussians(Model, segments);
                AssignPloidyCallsGaussianMixture();
            }
            else
            {
                // This is the branch that runs, because the flag above is hard-coded to false.
                AssignPloidyCallsDistance(Model, segments, medianVariantCoverage);
            }

            // Merge neighboring segments that got the same copy number call.
            CanvasSegment.MergeSegments(ref this.Segments);
            CanvasSegment.AssignQualityScores(this.Segments, CanvasSegment.QScoreMethod.LogisticGermline);
            List<string> extraHeaders = new List<string>();
            string coverageOutputPath = CanvasCommon.Utilities.GetCoverageAndVariantFrequencyOutputPath(outFile);
            CanvasSegment.WriteCoveragePlotData(this.Segments, Model.DiploidCoverage, ploidy, coverageOutputPath, referenceFolder);

            // Only produced when truth data was loaded at the top of the method.
            if (this.CNOracle != null)
            {
                this.GenerateReportVersusKnownCN();
            }

            // The ploidy header line, when present, is passed to the segment writer as an extra header.
            if (ploidy != null && !string.IsNullOrEmpty(ploidy.HeaderLine)) extraHeaders.Add(ploidy.HeaderLine);
            CanvasSegment.WriteSegments(outFile, this.Segments, referenceFolder, sampleName, extraHeaders, ploidy);
            return 0;
        }
Beispiel #3
0
        /// <summary>
        /// Selects the segments to be used for model fitting and wraps each one in a
        /// <c>SegmentInfo</c> carrying its folded median MAF, median coverage and weight.
        /// Segments shorter than 5000, or whose median coverage exceeds twice the overall mean
        /// bin count, are left out.
        /// </summary>
        /// <param name="segments">All candidate segments.</param>
        /// <returns>A new list with one SegmentInfo per retained segment, in input order.</returns>
        static public List<SegmentInfo> GetUsableSegmentsForModeling(List<CanvasSegment> segments)
        {
            // Mean count over every bin of every segment (divisor clamped to 1 so empty input gives 0).
            float countSum = 0;
            int countTotal = 0;
            foreach (CanvasSegment candidate in segments)
            {
                foreach (float binValue in candidate.Counts)
                {
                    countSum += binValue;
                    countTotal++;
                }
            }
            float overallMean = countSum / Math.Max(1, countTotal);

            // Loop invariants: the coverage cut-off and the weighting scheme depend only on the input list.
            float coverageCeiling = overallMean * 2;
            bool weightByLength = segments.Count > 100;

            List<SegmentInfo> selected = new List<SegmentInfo>();
            foreach (CanvasSegment candidate in segments)
            {
                if (candidate.End - candidate.Begin < 5000)
                {
                    continue;
                }

                SegmentInfo summary = new SegmentInfo();
                summary.Segment = candidate;

                // A segment with too few variants gets MAF = -1 (no frequency). That is typical on chrX or
                // chrY, where starling knows not to call heterozygous variants outside the PAR regions.
                var variantCount = candidate.VariantFrequencies.Count;
                if (variantCount >= MinimumVariantFrequenciesForInformativeSegment)
                {
                    // Fold frequencies above 0.5 to 1 - value, then take the element at index Count / 2.
                    List<double> folded = new List<double>();
                    foreach (float frequency in candidate.VariantFrequencies)
                    {
                        folded.Add(frequency > 0.5 ? 1 - frequency : frequency);
                    }
                    folded.Sort();
                    summary.MAF = folded[folded.Count / 2];
                }
                else
                {
                    summary.MAF = -1;
                }

                summary.Coverage = CanvasCommon.Utilities.Median(candidate.Counts);
                if (summary.Coverage > coverageCeiling)
                {
                    continue;
                }

                // Weight by length when there are more than 100 segments, by bin count otherwise.
                if (weightByLength)
                {
                    summary.Weight = candidate.End - candidate.Begin;
                }
                else
                {
                    summary.Weight = candidate.BinCount;
                }

                // Scale the weight down proportionally for segments with fewer than 10 variants.
                if (variantCount < 10)
                {
                    summary.Weight *= (double)variantCount / 10;
                }

                selected.Add(summary);
            }
            return selected;
        }