예제 #1
0
        /// <summary>
        /// Scores one segmentation hypothesis of a region that carries alternative segmentations
        /// (SegmentsSet: SetA and SetB). Hypotheses typically come from partitioning results or from
        /// annotations of population (common) variants.
        /// Side effect: every sample's region in <paramref name="canvasSegmentsSet"/> is switched to
        /// <paramref name="segmentsSet"/> via SetSet and is left that way on return.
        /// </summary>
        /// <param name="canvasSegmentsSet">Per-sample region holding the alternative segment sets.</param>
        /// <param name="samplesInfo">Per-sample metrics forwarded to the copy number likelihood calculator.</param>
        /// <param name="copyNumberModel">Per-sample copy number model forwarded to the likelihood calculator.</param>
        /// <param name="segmentsSet">The hypothesis to select and score.</param>
        /// <returns>Sum, over all segments of the selected set, of the maximal log likelihood reported by GetCopyNumbersNoPedigreeInfo.</returns>
        private double GetSegmentSetLogLikelihood(ISampleMap <OverlappingSegmentsRegion> canvasSegmentsSet, ISampleMap <SampleMetrics> samplesInfo,
                                                  ISampleMap <ICopyNumberModel> copyNumberModel, SegmentsSet segmentsSet)
        {
            // Select the hypothesis under evaluation in every sample.
            foreach (var sampleId in canvasSegmentsSet.SampleIds)
            {
                canvasSegmentsSet[sampleId].SetSet(segmentsSet);
            }

            // The segment count is taken from the first sample; the same index is then read from every sample.
            int segmentCount = canvasSegmentsSet.First().Value.GetSet().Count;

            // Gathers the segment at the given index from every sample into one sample map.
            ISampleMap <CanvasSegment> SegmentAcrossSamples(int segmentIndex)
            {
                var acrossSamples = new SampleMap <CanvasSegment>();
                foreach (var sampleId in canvasSegmentsSet.SampleIds)
                {
                    acrossSamples.Add(sampleId, canvasSegmentsSet[sampleId].GetSet()[segmentIndex]);
                }
                return acrossSamples;
            }

            // Materialize all per-index maps before any scoring happens.
            var segmentsByIndex = Enumerable.Range(0, segmentCount).Select(SegmentAcrossSamples).ToList();

            double totalLogLikelihood = 0;
            foreach (var segmentAcrossSamples in segmentsByIndex)
            {
                var perSampleLikelihoods = _copyNumberLikelihoodCalculator.GetCopyNumbersLikelihoods(
                    segmentAcrossSamples, samplesInfo, copyNumberModel, _callerParameters.NumberOfTrimmedBins);
                var (_, jointLikelihoods) = GetCopyNumbersNoPedigreeInfo(segmentAcrossSamples, perSampleLikelihoods);
                totalLogLikelihood += jointLikelihoods.MaximalLogLikelihood;
            }

            return totalLogLikelihood;
        }
예제 #2
0
        /// <summary>
        /// Merges the segments of every sample using copy numbers and Q-scores pooled across samples.
        /// For each segment index, the copy numbers of all samples are collected and the Q-scores of all
        /// samples are averaged; these per-index values are then passed, unchanged, to
        /// CanvasSegment.MergeSegments for each sample in turn.
        /// </summary>
        /// <param name="segments">Per-sample segment lists. All lists must have the same length, because segments are paired by index.</param>
        /// <param name="minimumCallSize">Forwarded to CanvasSegment.MergeSegments.</param>
        /// <param name="qScoreThreshold">Forwarded to CanvasSegment.MergeSegments.</param>
        /// <returns>A new sample map with the merged segment list of every sample.</returns>
        /// <exception cref="ArgumentException">Samples do not all have the same number of segments.</exception>
        private static ISampleMap <List <CanvasSegment> > MergeSegments(ISampleMap <List <CanvasSegment> > segments, int minimumCallSize, int qScoreThreshold)
        {
            int nSegments = segments.First().Value.Count;

            // Segments are matched across samples by index, so a length mismatch would either fail with an
            // opaque index error or silently ignore trailing segments. Fail early with a clear message instead.
            // (The previous null/length checks on the locally built lists could never fire.)
            if (segments.Any(sampleSegments => sampleSegments.Value.Count != nSegments))
            {
                throw new ArgumentException("All samples must have the same number of segments.");
            }

            var copyNumbers = new List <List <int> >(nSegments);
            var qscores     = new List <double>(nSegments);

            for (var segmentIndex = 0; segmentIndex < nSegments; segmentIndex++)
            {
                // Both queries are evaluated eagerly (ToList / Average), so capturing the loop variable is safe.
                copyNumbers.Add(segments.Select(s => s.Value[segmentIndex].CopyNumber).ToList());
                qscores.Add(segments.Select(s => s.Value[segmentIndex].QScore).Average());
            }

            var mergedSegments = new SampleMap <List <CanvasSegment> >();

            foreach (var sampleSegments in segments)
            {
                // NOTE(review): the meaning of the literal 10000 (third argument) is defined by
                // CanvasSegment.MergeSegments, which is not visible here - confirm and name it.
                var mergedSegmentsThisSample = CanvasSegment.MergeSegments(sampleSegments.Value.ToList(),
                                                                           minimumCallSize, 10000, copyNumbers, qscores, qScoreThreshold);
                mergedSegments.Add(sampleSegments.Key, mergedSegmentsThisSample);
            }
            return mergedSegments;
        }
        /// <summary>
        /// Computes, for every sample, the likelihood of each total copy number genotype in
        /// [0, _maximumCopyNumber) for the sample's segment. Does not use SNV allele ratios and does not
        /// modify the segments: the only coverage input is a single point estimate, the truncated median
        /// count of the segment capped at three times the sample's mean coverage.
        /// </summary>
        /// <param name="canvasSegments">The segment to evaluate, one per sample.</param>
        /// <param name="samplesInfo">Per-sample metrics; MeanCoverage is used to cap the coverage estimate.</param>
        /// <param name="copyNumberModel">Per-sample model that supplies the likelihoods.</param>
        /// <returns>Per-sample dictionary from genotype to likelihood; NaN and infinite model outputs are stored as 0.</returns>
        public ISampleMap <Dictionary <Genotype, double> > GetCopyNumbersLikelihoods(ISampleMap <CanvasSegment> canvasSegments, ISampleMap <SampleMetrics> samplesInfo,
                                                                                     ISampleMap <ICopyNumberModel> copyNumberModel)
        {
            const int    bins2Remove             = 5;
            const double maxCoverageMultiplier   = 3.0;
            var          genotypes               = Enumerable.Range(0, _maximumCopyNumber).Select(Genotype.Create).ToList();
            var          singleSampleLikelihoods = new SampleMap <Dictionary <Genotype, double> >();

            foreach (var sampleId in canvasSegments.SampleIds)
            {
                // The coverage estimate does not depend on the genotype, so compute it once per sample
                // instead of once per genotype.
                // NOTE(review): assumes TruncatedMedianCount has no side effects - confirm.
                var cappedCoverage = Math.Min(canvasSegments[sampleId].TruncatedMedianCount(bins2Remove),
                                              samplesInfo[sampleId].MeanCoverage * maxCoverageMultiplier);
                var sampleModel = copyNumberModel[sampleId];
                var density     = new Dictionary <Genotype, double>();

                foreach (var genotypeCopyNumber in genotypes)
                {
                    double currentLikelihood = sampleModel.GetTotalCopyNumberLikelihoods(cappedCoverage, genotypeCopyNumber);
                    // Non-finite likelihoods are treated as "no support" for this genotype.
                    if (Double.IsNaN(currentLikelihood) || Double.IsInfinity(currentLikelihood))
                    {
                        currentLikelihood = 0;
                    }
                    density[genotypeCopyNumber] = currentLikelihood;
                }
                singleSampleLikelihoods.Add(sampleId, density);
            }
            return singleSampleLikelihoods;
        }
예제 #4
0
        /// <summary>
        /// Picks, independently for every sample, the genotype with the highest single-sample likelihood
        /// and accumulates the logarithm of those maxima into the returned joint likelihood object.
        /// </summary>
        /// <param name="segments">Only its sample ids are used, to decide which samples to process.</param>
        /// <param name="singleSampleLikelihoods">Per-sample likelihood of every candidate genotype.</param>
        /// <returns>The best genotype per sample, and a JointLikelihoods whose MaximalLogLikelihood has been
        /// incremented by the log of each sample's best likelihood.</returns>
        public static (SampleMap <Genotype> copyNumbersGenotypes, JointLikelihoods jointLikelihood) GetCopyNumbersNoPedigreeInfo(ISampleMap <CanvasSegment> segments,
                                                                                                                                 ISampleMap <Dictionary <Genotype, double> > singleSampleLikelihoods)
        {
            // Without a pedigree, only the maximum likelihood field of the joint object is populated.
            // NOTE(review): the += below relies on the initial value of MaximalLogLikelihood set by the
            // JointLikelihoods constructor (not visible here) - confirm it is a sensible starting point for a sum.
            var likelihoodSummary = new JointLikelihoods();
            var bestGenotypes     = new SampleMap <Genotype>();

            foreach (var sampleId in segments.SampleIds)
            {
                var bestEntry = singleSampleLikelihoods[sampleId].MaxBy(entry => entry.Value);
                likelihoodSummary.MaximalLogLikelihood += Math.Log(bestEntry.Value);
                bestGenotypes.Add(sampleId, bestEntry.Key);
            }

            return (bestGenotypes, likelihoodSummary);
        }
예제 #5
0
        /// <summary>
        /// Creates CanvasSegments from a common CNVs bed file and overlaps them with the CanvasPartition
        /// segments of every sample to build per-region alternative segment sets.
        /// </summary>
        /// <param name="variantFrequencyFiles">Per-sample path of the variant frequency file.</param>
        /// <param name="defaultAlleleCountThreshold">Forwarded to GetSegmentSets.</param>
        /// <param name="commonCNVsbedPath">Path of the common CNVs bed file; when null, every partition segment
        /// is wrapped in its own OverlappingSegmentsRegion and no common CNVs are considered.</param>
        /// <param name="sampleSegments">Per-sample partition segments. Chromosomes and genomic bins are taken from the first sample.</param>
        /// <returns>The result of GetOverlappingSegmentsRegionSampleLists for the regions built here.</returns>
        /// <exception cref="ArgumentException">Thrown when IsIdenticalChromosomeNames returns true for the bed file and the sample chromosomes.</exception>
        private IEnumerable <ISampleMap <OverlappingSegmentsRegion> > CreateSegmentSetsFromCommonCnvs(ISampleMap <string> variantFrequencyFiles,
                                                                                                      int defaultAlleleCountThreshold, string commonCNVsbedPath, ISampleMap <Segments> sampleSegments)
        {
            if (commonCNVsbedPath == null)
            {
                var defaultSampleRegions = sampleSegments
                                           .SelectValues(segments => segments.AllSegments.Select(segment => new OverlappingSegmentsRegion(segment)).ToList());
                return GetOverlappingSegmentsRegionSampleLists(defaultSampleRegions);
            }

            var commonRegions     = ReadCommonRegions(commonCNVsbedPath);
            var referenceSegments = sampleSegments.Values.First();
            var chromosomes       = referenceSegments.GetChromosomes();

            // NOTE(review): the helper name suggests "true == names match", yet a true result raises a
            // "does not match" error. The helper is not visible here; confirm whether it actually reports
            // a mismatch (and should be renamed) or whether this condition is inverted.
            if (IsIdenticalChromosomeNames(commonRegions, chromosomes))
            {
                throw new ArgumentException(
                          $"Chromosome names in a common CNVs bed file {commonCNVsbedPath} does not match the genome reference");
            }

            var segmentIntervalsByChromosome = new Dictionary <string, List <BedInterval> >();
            var genomicBinsByChromosome      = new Dictionary <string, IReadOnlyList <SampleGenomicBin> >();
            // Dictionary<TKey, TValue> does not support concurrent writers, so the parallel workers compute
            // their results without the lock and only take it to publish them.
            var dictionariesLock = new object();

            Parallel.ForEach(
                chromosomes,
                chr =>
            {
                IReadOnlyList <SampleGenomicBin> genomicBins = referenceSegments.GetGenomicBinsForChromosome(chr);
                List <BedInterval> segmentIntervals          = CanvasSegment.RemapGenomicToBinCoordinates(commonRegions[chr], genomicBins);

                lock (dictionariesLock)
                {
                    genomicBinsByChromosome[chr]      = genomicBins;
                    segmentIntervalsByChromosome[chr] = segmentIntervals;
                }
            });

            var sampleRegions = new SampleMap <List <OverlappingSegmentsRegion> >();

            foreach (var sampleId in sampleSegments.SampleIds)
            {
                // A fresh interval dictionary is built for every sample, as before, so the reader never sees
                // a structure shared with another sample.
                var commonIntervals = commonRegions.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Select(bedEntry => bedEntry.Interval).ToList());
                var allelesByChromosomeCommonSegs = CanvasIO.ReadFrequenciesWrapper(_logger,
                                                                                    new FileLocation(variantFrequencyFiles[sampleId]), commonIntervals);
                var segmentsSets = GetSegmentSets(defaultAlleleCountThreshold, commonRegions,
                                                  genomicBinsByChromosome, segmentIntervalsByChromosome, allelesByChromosomeCommonSegs, sampleSegments[sampleId]);
                sampleRegions.Add(sampleId, segmentsSets);
            }

            return GetOverlappingSegmentsRegionSampleLists(sampleRegions);
        }
        /// <summary>
        /// Computes, for every sample, the likelihood of each total copy number genotype in
        /// [0, _maximumCopyNumber) for the sample's segment. Does not use SNV allele ratios and does not
        /// modify the segments: the only coverage input is a single point estimate, the truncated median
        /// count of the segment capped at three times the sample's mean coverage.
        /// </summary>
        /// <param name="canvasSegments">The segment to evaluate, one per sample.</param>
        /// <param name="samplesInfo">Per-sample metrics; MeanCoverage is used to cap the coverage estimate.</param>
        /// <param name="copyNumberModel">Per-sample model that supplies the likelihoods and its supported bounds.</param>
        /// <param name="numberOfTrimmedBins">Forwarded to CanvasSegment.TruncatedMedianCount.</param>
        /// <returns>Per-sample dictionary from genotype to likelihood; NaN and infinite model outputs are stored as 0.</returns>
        /// <exception cref="ArgumentException">The integer-converted coverage reaches the model's coverage bound,
        /// or a genotype's total copy number exceeds the model's maximum copy number.</exception>
        public ISampleMap <Dictionary <Genotype, double> > GetCopyNumbersLikelihoods(ISampleMap <CanvasSegment> canvasSegments, ISampleMap <SampleMetrics> samplesInfo,
                                                                                     ISampleMap <ICopyNumberModel> copyNumberModel, int numberOfTrimmedBins)
        {
            var          genotypes               = Enumerable.Range(0, _maximumCopyNumber).Select(Genotype.Create).ToList();
            const double maxCoverageMultiplier   = 3.0;
            var          singleSampleLikelihoods = new SampleMap <Dictionary <Genotype, double> >();

            foreach (var sampleId in canvasSegments.SampleIds)
            {
                var segment     = canvasSegments[sampleId];
                var sampleModel = copyNumberModel[sampleId];

                // Everything below depends only on the sample, not on the genotype, so it is computed once
                // per sample (previously the truncated median was evaluated twice for every genotype).
                // NOTE(review): assumes TruncatedMedianCount, GetCoverageBound and GetMaxCopyNumber have no
                // side effects - confirm.
                double truncatedDepth = segment.TruncatedMedianCount(numberOfTrimmedBins);
                double meanTimesThree = samplesInfo[sampleId].MeanCoverage * maxCoverageMultiplier;
                double cvg            = Math.Min(truncatedDepth, meanTimesThree);
                int    intcvg         = Convert.ToInt32(cvg);
                int    coverageBound  = sampleModel.GetCoverageBound();
                int    maxAllowedCN   = sampleModel.GetMaxCopyNumber();

                var density = new Dictionary <Genotype, double>();

                foreach (var genotypeCopyNumber in genotypes)
                {
                    // In case we run into out-of-range trouble again (CANV-694), report the details.
                    if (intcvg >= coverageBound || genotypeCopyNumber.TotalCopyNumber > maxAllowedCN)
                    {
                        throw new ArgumentException(
                                  $"Tried to look up bad depth or CN for {sampleId}: depth {intcvg} CN {genotypeCopyNumber.TotalCopyNumber}" +
                                  $" where max handled values are {coverageBound} and {maxAllowedCN} respectively;" +
                                  $" original depth was {truncatedDepth}, mean * 3 was {meanTimesThree};" +
                                  $" segment {segment.Chr}:{segment.Begin}-{segment.End}");
                    }

                    double currentLikelihood = sampleModel.GetTotalCopyNumberLikelihoods(cvg, genotypeCopyNumber);
                    // Non-finite likelihoods are treated as "no support" for this genotype.
                    if (Double.IsNaN(currentLikelihood) || Double.IsInfinity(currentLikelihood))
                    {
                        currentLikelihood = 0;
                    }
                    density[genotypeCopyNumber] = currentLikelihood;
                }
                singleSampleLikelihoods.Add(sampleId, density);
            }
            return singleSampleLikelihoods;
        }
예제 #7
0
        /// <summary>
        /// End-to-end multi-sample calling pipeline: loads per-sample segments and allele frequencies, builds
        /// per-sample metrics and copy number models, chooses segments for calling, calls variants in
        /// parallel, merges segments and writes the multi-sample and per-sample outputs.
        /// </summary>
        /// <param name="variantFrequencyFiles">Variant frequency file paths, index-aligned with <paramref name="sampleNames"/>.</param>
        /// <param name="segmentFiles">Segment file paths, index-aligned with <paramref name="sampleNames"/>.</param>
        /// <param name="outVcfFile">Multi-sample VCF output; its directory is also used for all per-sample outputs.</param>
        /// <param name="ploidyBedPath">Forwarded to SampleMetrics.GetSampleInfo.</param>
        /// <param name="referenceFolder">Forwarded to the coverage plot and VCF writers.</param>
        /// <param name="sampleNames">Sample names; one SampleId is created per entry, in this order.</param>
        /// <param name="commonCnvsBedPath">Forwarded to CreateSegmentSetsFromCommonCnvs.</param>
        /// <param name="sampleTypes">Sample types (kinships), index-aligned with <paramref name="sampleNames"/>.</param>
        /// <returns>Always 0.</returns>
        internal int CallVariants(List <string> variantFrequencyFiles, List <string> segmentFiles,
                                  IFileLocation outVcfFile, string ploidyBedPath, string referenceFolder, List <string> sampleNames, string commonCnvsBedPath, List <SampleType> sampleTypes)
        {
            // Per-sample containers filled by the loading loop below.
            // fileCounter is the shared index into the parallel input lists.
            var fileCounter      = 0;
            var samplesInfo      = new SampleMap <SampleMetrics>();
            var sampleSegments   = new SampleMap <Segments>();
            var copyNumberModels = new SampleMap <ICopyNumberModel>();
            var variantFrequencyFilesSampleList = new SampleMap <string>();
            var kinships = new SampleMap <SampleType>();

            foreach (string sampleName in sampleNames)
            {
                var sampleId = new SampleId(sampleName);
                // Read this sample's segments, then attach alleles read from its frequency file.
                var segment  = Segments.ReadSegments(_logger, new FileLocation(segmentFiles[fileCounter]));
                segment.AddAlleles(CanvasIO.ReadFrequenciesWrapper(_logger, new FileLocation(variantFrequencyFiles[fileCounter]), segment.IntervalsByChromosome));
                sampleSegments.Add(sampleId, segment);
                // Sample metrics feed the copy number model (max/mean coverage and mean MAF coverage).
                var sampleInfo      = SampleMetrics.GetSampleInfo(segment.AllSegments, ploidyBedPath, _callerParameters.NumberOfTrimmedBins, sampleId);
                var copyNumberModel = _copyNumberModelFactory.CreateModel(_callerParameters.MaximumCopyNumber, sampleInfo.MaxCoverage, sampleInfo.MeanCoverage, sampleInfo.MeanMafCoverage);
                samplesInfo.Add(sampleId, sampleInfo);
                copyNumberModels.Add(sampleId, copyNumberModel);
                variantFrequencyFilesSampleList.Add(sampleId, variantFrequencyFiles[fileCounter]);
                kinships.Add(sampleId, sampleTypes[fileCounter]);
                fileCounter++;
            }
            // Build the candidate segment sets, then materialize the segments selected for calling.
            var segmentSetsFromCommonCnvs = CreateSegmentSetsFromCommonCnvs(variantFrequencyFilesSampleList,
                                                                            _callerParameters.MinAlleleCountsThreshold, commonCnvsBedPath, sampleSegments);

            var          segmentsForVariantCalling = GetHighestLikelihoodSegments(segmentSetsFromCommonCnvs, samplesInfo, copyNumberModels).ToList();
            PedigreeInfo pedigreeInfo = PedigreeInfo.GetPedigreeInfo(kinships, _callerParameters);

            // Call each cross-sample segment in parallel; parallelism is capped by both the machine's
            // processor count and the configured MaxCoreNumber.
            Parallel.ForEach(
                segmentsForVariantCalling,
                new ParallelOptions
            {
                MaxDegreeOfParallelism = Math.Min(Environment.ProcessorCount, _callerParameters.MaxCoreNumber)
            },
                segments => _variantCaller.CallVariant(segments, samplesInfo, copyNumberModels, pedigreeInfo)
                );
            // Regroup from "one sample map per segment" to "one segment list per sample".
            var variantCalledSegments = new SampleMap <List <CanvasSegment> >();

            foreach (var key in samplesInfo.SampleIds)
            {
                variantCalledSegments.Add(key, segmentsForVariantCalling.Select(segment => segment[key]).ToList());
            }

            var mergedVariantCalledSegments = MergeSegments(variantCalledSegments, _callerParameters.MinimumCallSize, _qualityFilterThreshold);

            FilterExcessivelyShortSegments(mergedVariantCalledSegments);

            // Every output below is written relative to the directory of the multi-sample VCF.
            var outputFolder = outVcfFile.Directory;

            // Per-sample coverage plot data.
            foreach (var sampleId in samplesInfo.SampleIds)
            {
                var coverageOutputPath = SingleSampleCallset.GetCoverageAndVariantFrequencyOutput(outputFolder,
                                                                                                  sampleId.ToString());
                CanvasSegment.WriteCoveragePlotData(mergedVariantCalledSegments[sampleId], samplesInfo[sampleId].MeanCoverage,
                                                    samplesInfo[sampleId].Ploidy, coverageOutputPath, referenceFolder);
            }
            // The de novo quality threshold is only passed on when a full pedigree is available.
            bool isPedigreeInfoSupplied = pedigreeInfo != null && pedigreeInfo.HasFullPedigree();
            var  denovoQualityThreshold = isPedigreeInfoSupplied ? (int?)_deNovoQualityFilterThreshold : null;
            var  ploidies        = samplesInfo.Select(info => info.Value.Ploidy).ToList();
            var  diploidCoverage = samplesInfo.Select(info => info.Value.MeanCoverage).ToList();
            var  names           = samplesInfo.SampleIds.Select(id => id.ToString()).ToList();

            // Multi-sample VCF.
            CanvasSegmentWriter.WriteMultiSampleSegments(outVcfFile.FullName, mergedVariantCalledSegments, diploidCoverage, referenceFolder, names,
                                                         null, ploidies, _qualityFilterThreshold, denovoQualityThreshold, CanvasFilter.SegmentSizeCutoff, isPedigreeInfoSupplied);

            // Per-sample VCF, coverage bigWig, copy number bedGraph and partition bedGraph.
            foreach (var sampleId in samplesInfo.SampleIds)
            {
                var outputVcfPath = SingleSampleCallset.GetVcfOutput(outputFolder, sampleId.ToString());
                var sampleMetrics = samplesInfo[sampleId];
                var segments      = mergedVariantCalledSegments[sampleId];
                CanvasSegmentWriter.WriteSegments(outputVcfPath.FullName, segments,
                                                  sampleMetrics.MeanCoverage, referenceFolder, sampleId.ToString(), null,
                                                  sampleMetrics.Ploidy, _qualityFilterThreshold, isPedigreeInfoSupplied, denovoQualityThreshold, null);

                // The bigWig is produced in a per-sample subdirectory and moved to its final location;
                // the writer may return null, in which case nothing is moved.
                var visualizationTemp   = outputFolder.CreateSubdirectory($"VisualizationTemp{sampleId}");
                var normalizationFactor = NormalizationCalculator.ComputeNormalizationFactor(segments);
                var bigWig = _coverageBigWigWriter.Write(segments, visualizationTemp, normalizationFactor);
                bigWig?.MoveTo(SingleSampleCallset.GetCoverageBigWig(outputFolder, sampleId.ToString()));
                var copyNumberBedGraph = SingleSampleCallset.GetCopyNumberBedGraph(outputFolder, sampleId.ToString());
                _copyNumberBedGraphWriter.Write(segments, sampleMetrics.Ploidy, copyNumberBedGraph);

                // The partition bedGraph is written from the segments as originally loaded (before calling
                // and merging), using the normalization factor computed from the merged segments.
                var partitionBedgraphHeader = "track type=bedGraph visibility=full autoScale=on graphType=points";
                var originalSegments        = sampleSegments[sampleId];
                _partitionCoverageBedGraphWriter.Write(originalSegments.AllSegments, SingleSampleCallset.GetPartitionBedGraph(outputFolder, sampleId.ToString()), normalizationFactor, partitionBedgraphHeader);
            }
            return(0);
        }
예제 #8
0
        /// <summary>
        /// Estimate joint likelihood and most likely CN assignment within a pedigree using total CN Genotype
        /// likelihoods and the transition matrix.
        /// </summary>
        /// <param name="pedigreeInfo">Pedigree description (parents, offspring, phased offspring genotype
        /// combinations, transition matrix). When null or when it does not describe a full pedigree, an
        /// empty assignment and an untouched JointLikelihoods are returned.</param>
        /// <param name="copyNumbersLikelihoods">Per-sample likelihood of each total copy number genotype. Only
        /// the highest-likelihood genotypes per sample are considered (3 when there are two or more
        /// offspring, otherwise MaximumCopyNumber).</param>
        /// <returns>The genotype assignment with the highest joint likelihood and the joint likelihood object
        /// holding every evaluated combination.</returns>
        /// <exception cref="IlluminaException">A full pedigree was supplied but no combination produced a log
        /// likelihood greater than the initial maximum.</exception>
        private (ISampleMap <Genotype> copyNumbersGenotypes, JointLikelihoods jointLikelihood) GetPedigreeCopyNumbers(PedigreeInfo pedigreeInfo, ISampleMap <Dictionary <Genotype, double> > copyNumbersLikelihoods)
        {
            var sampleCopyNumbersGenotypes = new SampleMap <Genotype>();
            var jointLikelihood            = new JointLikelihoods();

            // A missing pedigree is handled like an incomplete one. Previously null was tolerated when
            // choosing the genotype count but then dereferenced unconditionally.
            if (pedigreeInfo == null || !pedigreeInfo.HasFullPedigree())
            {
                return (sampleCopyNumbersGenotypes, jointLikelihood);
            }

            var offspringIds = pedigreeInfo.OffspringIds;
            int nHighestLikelihoodGenotypes = offspringIds.Count >= 2 ? 3 : _callerParameters.MaximumCopyNumber;

            // Keep only the most likely genotypes per sample to bound the number of combinations.
            copyNumbersLikelihoods = copyNumbersLikelihoods.SelectValues(l => l.OrderByDescending(kvp => kvp.Value).Take(nHighestLikelihoodGenotypes).ToDictionary());

            // Offspring total copy numbers are capped at the largest modelled value.
            int maximumTotalCopyNumber = _callerParameters.MaximumCopyNumber - 1;
            var parent1Id = pedigreeInfo.ParentsIds.First();
            var parent2Id = pedigreeInfo.ParentsIds.Last();

            // parent 1 total CNs and likelihoods
            foreach (var copyNumberParent1 in copyNumbersLikelihoods[parent1Id])
            {
                // parent 2 total CNs and likelihoods
                foreach (var copyNumberParent2 in copyNumbersLikelihoods[parent2Id])
                {
                    // For offspring, in addition to querying likelihoods using total CNs, iterate over all
                    // phased genotype combinations (CopyNumberA/B) and weight them with the transition matrix.
                    foreach (var offspringGtStates in pedigreeInfo.OffspringPhasedGenotypes)
                    {
                        // Derive each child's (capped) total CN genotype once; skip the combination as soon
                        // as one child has no likelihood for its total CN.
                        var totalCopyNumberGenotypes = new List <Genotype>(offspringIds.Count);
                        var allTotalsAvailable       = true;
                        for (var counter = 0; counter < offspringIds.Count; counter++)
                        {
                            var phasedGenotype = offspringGtStates[counter].PhasedGenotype;
                            var copyNumberGenotypeChild = Genotype.Create(Math.Min(phasedGenotype.CopyNumberA + phasedGenotype.CopyNumberB,
                                                                                   maximumTotalCopyNumber));
                            if (!copyNumbersLikelihoods[offspringIds[counter]].ContainsKey(copyNumberGenotypeChild))
                            {
                                // unavailable total CN
                                allTotalsAvailable = false;
                                break;
                            }
                            totalCopyNumberGenotypes.Add(copyNumberGenotypeChild);
                        }
                        if (!allTotalsAvailable)
                        {
                            continue;
                        }

                        // Joint likelihood: both parents' likelihoods times, per child, the two transition
                        // probabilities (one per parent/haplotype) and the child's total CN likelihood.
                        double currentLikelihood = copyNumberParent1.Value * copyNumberParent2.Value;
                        for (var counter = 0; counter < offspringIds.Count; counter++)
                        {
                            var phasedGenotype = offspringGtStates[counter].PhasedGenotype;
                            currentLikelihood *= pedigreeInfo.TransitionMatrix[copyNumberParent1.Key.TotalCopyNumber][phasedGenotype.CopyNumberA] *
                                                 pedigreeInfo.TransitionMatrix[copyNumberParent2.Key.TotalCopyNumber][phasedGenotype.CopyNumberB] *
                                                 copyNumbersLikelihoods[offspringIds[counter]][totalCopyNumberGenotypes[counter]];
                        }
                        currentLikelihood = Double.IsNaN(currentLikelihood) || Double.IsInfinity(currentLikelihood) ? 0 : currentLikelihood;

                        var genotypesInPedigree = new SampleMap <Genotype>
                        {
                            { parent1Id, copyNumberParent1.Key },
                            { parent2Id, copyNumberParent2.Key }
                        };
                        offspringIds.Zip(totalCopyNumberGenotypes).ForEach(sampleIdGenotypeKvp => genotypesInPedigree.Add(sampleIdGenotypeKvp.Item1, sampleIdGenotypeKvp.Item2));
                        genotypesInPedigree = genotypesInPedigree.OrderBy(pedigreeInfo.AllSampleIds);
                        jointLikelihood.AddJointLikelihood(genotypesInPedigree, currentLikelihood);

                        // Track the best combination seen so far.
                        double currentLogLikelihood = Math.Log(currentLikelihood);
                        if (currentLogLikelihood > jointLikelihood.MaximalLogLikelihood)
                        {
                            jointLikelihood.MaximalLogLikelihood = currentLogLikelihood;
                            sampleCopyNumbersGenotypes           = genotypesInPedigree;
                        }
                    }
                }
            }
            if (sampleCopyNumbersGenotypes.Empty())
            {
                throw new IlluminaException("Maximal likelihood was not found");
            }
            return (sampleCopyNumbersGenotypes, jointLikelihood);
        }