/// <summary>
/// Scores one segmentation hypothesis. Every sample's overlapping region is first switched to
/// <paramref name="segmentsSet"/>; the segments of that set are then grouped by index across samples and the
/// maximal log likelihood of each group (evaluated without pedigree information) is summed.
/// Segmentation hypotheses typically come from partitioning or from annotations of population (common) variants.
/// </summary>
/// <param name="canvasSegmentsSet">Per-sample regions holding the alternative segment sets. The selected set of
/// every region is changed by this call and stays changed afterwards.</param>
/// <param name="samplesInfo">Per-sample metrics forwarded to the likelihood calculator.</param>
/// <param name="copyNumberModel">Per-sample copy number models forwarded to the likelihood calculator.</param>
/// <param name="segmentsSet">The hypothesis to select and score.</param>
/// <returns>Sum over all segments of the maximal log likelihood reported for that segment.</returns>
private double GetSegmentSetLogLikelihood(ISampleMap<OverlappingSegmentsRegion> canvasSegmentsSet, ISampleMap<SampleMetrics> samplesInfo,
    ISampleMap<ICopyNumberModel> copyNumberModel, SegmentsSet segmentsSet)
{
    // Select the requested hypothesis in every sample before reading any segment.
    foreach (var regionSampleId in canvasSegmentsSet.SampleIds)
    {
        canvasSegmentsSet[regionSampleId].SetSet(segmentsSet);
    }

    // The first sample defines how many segments the selected set contains.
    int segmentCount = canvasSegmentsSet.First().Value.GetSet().Count;

    // Build all per-segment sample maps up front (ToList forces evaluation before any likelihood is computed).
    var segmentsAcrossSamples = Enumerable.Range(0, segmentCount)
        .Select(segmentIndex =>
        {
            var segmentBySample = new SampleMap<CanvasSegment>();
            foreach (var id in canvasSegmentsSet.SampleIds)
            {
                segmentBySample.Add(id, canvasSegmentsSet[id].GetSet()[segmentIndex]);
            }
            return segmentBySample;
        })
        .ToList();

    double totalLogLikelihood = 0;
    foreach (var segmentBySample in segmentsAcrossSamples)
    {
        var perSampleLikelihoods = _copyNumberLikelihoodCalculator.GetCopyNumbersLikelihoods(
            segmentBySample, samplesInfo, copyNumberModel, _callerParameters.NumberOfTrimmedBins);
        var (_, jointLikelihoods) = GetCopyNumbersNoPedigreeInfo(segmentBySample, perSampleLikelihoods);
        totalLogLikelihood += jointLikelihoods.MaximalLogLikelihood;
    }

    return totalLogLikelihood;
}
/// <summary>
/// Merges the segments of every sample via <c>CanvasSegment.MergeSegments</c>, supplying for each segment index
/// the copy numbers of all samples and the average QScore across samples.
/// </summary>
/// <param name="segments">Per-sample segment lists. The number of segments is taken from the first sample and
/// every sample is indexed up to that count, so all lists are expected to be at least that long.</param>
/// <param name="minimumCallSize">Forwarded unchanged to <c>CanvasSegment.MergeSegments</c>.</param>
/// <param name="qScoreThreshold">Forwarded unchanged to <c>CanvasSegment.MergeSegments</c>.</param>
/// <returns>A new sample map holding the merged segment list of every sample.</returns>
private static ISampleMap<List<CanvasSegment>> MergeSegments(ISampleMap<List<CanvasSegment>> segments, int minimumCallSize, int qScoreThreshold)
{
    int nSegments = segments.First().Value.Count;

    // Both lists are built here with exactly one entry per segment index, so they can never be null or have
    // a different length; the former argument-style validation of them was unreachable and has been removed.
    var copyNumbers = new List<List<int>>(nSegments);
    var qscores = new List<double>(nSegments);
    foreach (int segmentIndex in Enumerable.Range(0, nSegments))
    {
        copyNumbers.Add(segments.Select(s => s.Value[segmentIndex].CopyNumber).ToList());
        qscores.Add(segments.Select(s => s.Value[segmentIndex].QScore).Average());
    }

    var mergedSegments = new SampleMap<List<CanvasSegment>>();
    foreach (var sampleSegments in segments)
    {
        // A copy of the sample's list is handed over so the caller's list object itself is not the one passed in.
        // NOTE(review): 10000 is the third positional argument of CanvasSegment.MergeSegments; its meaning is
        // not visible from here - confirm and consider naming it.
        var mergedSegmentsThisSample = CanvasSegment.MergeSegments(sampleSegments.Value.ToList(),
            minimumCallSize, 10000, copyNumbers, qscores, qScoreThreshold);
        mergedSegments.Add(sampleSegments.Key, mergedSegmentsThisSample);
    }

    return mergedSegments;
}
/// <summary>
/// Computes, for every sample, the likelihood of each total copy number genotype created from
/// 0 up to (but not including) <c>_maximumCopyNumber</c>. The coverage used is the segment's truncated median
/// count (5 bins trimmed), capped at three times the sample's mean coverage. The method only reads its
/// inputs; it does not modify the segments.
/// </summary>
/// <param name="canvasSegments">One segment per sample.</param>
/// <param name="samplesInfo">Per-sample metrics; <c>MeanCoverage</c> is used for the coverage cap.</param>
/// <param name="copyNumberModel">Per-sample models queried through <c>GetTotalCopyNumberLikelihoods</c>.</param>
/// <returns>Per-sample dictionary from genotype to likelihood. NaN and infinite likelihoods are stored as 0.</returns>
public ISampleMap<Dictionary<Genotype, double>> GetCopyNumbersLikelihoods(ISampleMap<CanvasSegment> canvasSegments, ISampleMap<SampleMetrics> samplesInfo,
    ISampleMap<ICopyNumberModel> copyNumberModel)
{
    const int bins2Remove = 5;
    const double maxCoverageMultiplier = 3.0;
    var genotypes = Enumerable.Range(0, _maximumCopyNumber).Select(Genotype.Create).ToList();
    var singleSampleLikelihoods = new SampleMap<Dictionary<Genotype, double>>();

    foreach (var sampleId in canvasSegments.SampleIds)
    {
        // The capped coverage and the model depend only on the sample, so they are computed once per sample
        // instead of once per genotype.
        var coverage = Math.Min(canvasSegments[sampleId].TruncatedMedianCount(bins2Remove),
            samplesInfo[sampleId].MeanCoverage * maxCoverageMultiplier);
        var sampleModel = copyNumberModel[sampleId];

        var density = new Dictionary<Genotype, double>();
        foreach (var genotypeCopyNumber in genotypes)
        {
            double currentLikelihood = sampleModel.GetTotalCopyNumberLikelihoods(coverage, genotypeCopyNumber);
            // Non-finite model output is treated as "no support" for this genotype.
            currentLikelihood = Double.IsNaN(currentLikelihood) || Double.IsInfinity(currentLikelihood)
                ? 0
                : currentLikelihood;
            density[genotypeCopyNumber] = currentLikelihood;
        }

        singleSampleLikelihoods.Add(sampleId, density);
    }

    return singleSampleLikelihoods;
}
/// <summary>
/// Picks, independently for every sample, the genotype with the highest single-sample likelihood and
/// accumulates the logarithm of that likelihood into the returned object's <c>MaximalLogLikelihood</c>.
/// </summary>
/// <param name="segments">Used only for its sample ids.</param>
/// <param name="singleSampleLikelihoods">Per-sample genotype likelihoods (not log-scaled; the log is taken here).</param>
/// <returns>The most likely genotype per sample and a likelihood object that carries only the accumulated
/// maximal log likelihood.</returns>
public static (SampleMap<Genotype> copyNumbersGenotypes, JointLikelihoods jointLikelihood) GetCopyNumbersNoPedigreeInfo(ISampleMap<CanvasSegment> segments,
    ISampleMap<Dictionary<Genotype, double>> singleSampleLikelihoods)
{
    // For non-pedigree samples only the maximum likelihood information is filled in.
    // NOTE(review): the sum below starts from whatever value JointLikelihoods gives MaximalLogLikelihood on
    // construction - confirm that seed is suitable for accumulation.
    var likelihoodSummary = new JointLikelihoods();
    var bestGenotypeBySample = new SampleMap<Genotype>();

    foreach (var sampleId in segments.SampleIds)
    {
        var likelihoodsForSample = singleSampleLikelihoods[sampleId];
        var (bestGenotype, bestLikelihood) = likelihoodsForSample.MaxBy(entry => entry.Value);

        // Math.Log yields negative infinity when the best likelihood is 0.
        likelihoodSummary.MaximalLogLikelihood = likelihoodSummary.MaximalLogLikelihood + Math.Log(bestLikelihood);
        bestGenotypeBySample.Add(sampleId, bestGenotype);
    }

    return (copyNumbersGenotypes: bestGenotypeBySample, jointLikelihood: likelihoodSummary);
}
/// <summary>
/// Builds the per-sample overlapping segment regions. Without a common CNVs bed file every segment simply
/// becomes its own region. With a bed file, the common regions are read, remapped to bin coordinates per
/// chromosome and combined with each sample's segments and allele frequencies through <c>GetSegmentSets</c>.
/// </summary>
/// <param name="variantFrequencyFiles">Per-sample path of the variant frequency file.</param>
/// <param name="defaultAlleleCountThreshold">Forwarded unchanged to <c>GetSegmentSets</c>.</param>
/// <param name="commonCNVsbedPath">Path of the common CNVs bed file, or null to skip common CNV handling.</param>
/// <param name="sampleSegments">Per-sample segments; the first sample supplies the chromosome list and the
/// genomic bins used for remapping.</param>
/// <returns>The result of <c>GetOverlappingSegmentsRegionSampleLists</c> for the regions that were built.</returns>
/// <exception cref="ArgumentException">Thrown when the chromosome name check on the bed file fails.</exception>
private IEnumerable<ISampleMap<OverlappingSegmentsRegion>> CreateSegmentSetsFromCommonCnvs(ISampleMap<string> variantFrequencyFiles,
    int defaultAlleleCountThreshold, string commonCNVsbedPath, ISampleMap<Segments> sampleSegments)
{
    if (commonCNVsbedPath == null)
    {
        var defaultSampleRegions = sampleSegments
            .SelectValues(segments => segments.AllSegments.Select(segment => new OverlappingSegmentsRegion(segment)).ToList());
        return GetOverlappingSegmentsRegionSampleLists(defaultSampleRegions);
    }

    var commonRegions = ReadCommonRegions(commonCNVsbedPath);
    var firstSampleSegments = sampleSegments.Values.First();
    var chromosomes = firstSampleSegments.GetChromosomes();

    // NOTE(review): throwing when a helper called "IsIdentical..." returns true reads inverted; the helper is
    // not visible here, so the condition is left untouched - confirm what it actually returns.
    if (IsIdenticalChromosomeNames(commonRegions, chromosomes))
    {
        throw new ArgumentException(
            $"Chromosome names in a common CNVs bed file {commonCNVsbedPath} does not match the genome reference");
    }

    var segmentIntervalsByChromosome = new Dictionary<string, List<BedInterval>>();
    var genomicBinsByChromosome = new Dictionary<string, IReadOnlyList<SampleGenomicBin>>();

    // Dictionary is not safe for concurrent writers. The remapping still runs in parallel, but
    // the two dictionary updates are serialized through this lock.
    var dictionaryGate = new object();
    Parallel.ForEach(
        chromosomes,
        chr =>
        {
            IReadOnlyList<SampleGenomicBin> genomicBins = firstSampleSegments.GetGenomicBinsForChromosome(chr);
            List<BedInterval> segmentIntervals = CanvasSegment.RemapGenomicToBinCoordinates(commonRegions[chr], genomicBins);
            lock (dictionaryGate)
            {
                genomicBinsByChromosome[chr] = genomicBins;
                segmentIntervalsByChromosome[chr] = segmentIntervals;
            }
        });

    var sampleRegions = new SampleMap<List<OverlappingSegmentsRegion>>();
    foreach (var sampleId in sampleSegments.SampleIds)
    {
        // Rebuilt per sample so every ReadFrequenciesWrapper call receives its own interval lists.
        var commonIntervals = commonRegions.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Select(bedEntry => bedEntry.Interval).ToList());
        var allelesByChromosomeCommonSegs = CanvasIO.ReadFrequenciesWrapper(_logger,
            new FileLocation(variantFrequencyFiles[sampleId]), commonIntervals);
        var segmentsSets = GetSegmentSets(defaultAlleleCountThreshold, commonRegions, genomicBinsByChromosome, segmentIntervalsByChromosome,
            allelesByChromosomeCommonSegs, sampleSegments[sampleId]);
        sampleRegions.Add(sampleId, segmentsSets);
    }

    return GetOverlappingSegmentsRegionSampleLists(sampleRegions);
}
/// <summary>
/// Computes, for every sample, the likelihood of each total copy number genotype created from
/// 0 up to (but not including) <c>_maximumCopyNumber</c>. The coverage used is the segment's truncated median
/// count, capped at three times the sample's mean coverage. The method only reads its inputs; it does not
/// modify the segments.
/// </summary>
/// <param name="canvasSegments">One segment per sample.</param>
/// <param name="samplesInfo">Per-sample metrics; <c>MeanCoverage</c> is used for the coverage cap.</param>
/// <param name="copyNumberModel">Per-sample models queried through <c>GetTotalCopyNumberLikelihoods</c>.</param>
/// <param name="numberOfTrimmedBins">Passed to <c>TruncatedMedianCount</c> of each segment.</param>
/// <returns>Per-sample dictionary from genotype to likelihood. NaN and infinite likelihoods are stored as 0.</returns>
/// <exception cref="ArgumentException">Thrown when the integer-converted coverage reaches the model's coverage
/// bound or a genotype's total copy number exceeds the model's maximum copy number.</exception>
public ISampleMap<Dictionary<Genotype, double>> GetCopyNumbersLikelihoods(ISampleMap<CanvasSegment> canvasSegments, ISampleMap<SampleMetrics> samplesInfo,
    ISampleMap<ICopyNumberModel> copyNumberModel, int numberOfTrimmedBins)
{
    const double maxCoverageMultiplier = 3.0;
    var genotypes = Enumerable.Range(0, _maximumCopyNumber).Select(Genotype.Create).ToList();
    var singleSampleLikelihoods = new SampleMap<Dictionary<Genotype, double>>();

    foreach (var sampleId in canvasSegments.SampleIds)
    {
        // Everything below depends only on the sample, not on the genotype, so it is evaluated once per
        // sample. In particular the truncated median is no longer computed twice per genotype.
        var segment = canvasSegments[sampleId];
        var sampleModel = copyNumberModel[sampleId];
        double truncatedDepth = segment.TruncatedMedianCount(numberOfTrimmedBins);
        double meanTimesThree = samplesInfo[sampleId].MeanCoverage * maxCoverageMultiplier;
        double cvg = Math.Min(truncatedDepth, meanTimesThree);
        int intcvg = Convert.ToInt32(cvg);
        int coverageBound = sampleModel.GetCoverageBound();
        int maxAllowedCN = sampleModel.GetMaxCopyNumber();

        var density = new Dictionary<Genotype, double>();
        foreach (var genotypeCopyNumber in genotypes)
        {
            // In case we run into out-of-range trouble again (CANV-694), print details
            if (intcvg >= coverageBound || genotypeCopyNumber.TotalCopyNumber > maxAllowedCN)
            {
                throw new ArgumentException(
                    $"Tried to look up bad depth or CN for {sampleId}: depth {intcvg} CN {genotypeCopyNumber.TotalCopyNumber}" +
                    $" where max handled values are {coverageBound} and {maxAllowedCN} respectively;" +
                    $" original depth was {truncatedDepth}, mean * 3 was {meanTimesThree};" +
                    $" segment {segment.Chr}:{segment.Begin}-{segment.End}");
            }

            double currentLikelihood = sampleModel.GetTotalCopyNumberLikelihoods(cvg, genotypeCopyNumber);
            // Non-finite model output is treated as "no support" for this genotype.
            currentLikelihood = Double.IsNaN(currentLikelihood) || Double.IsInfinity(currentLikelihood)
                ? 0
                : currentLikelihood;
            density[genotypeCopyNumber] = currentLikelihood;
        }

        singleSampleLikelihoods.Add(sampleId, density);
    }

    return singleSampleLikelihoods;
}
/// <summary>
/// Runs the whole calling workflow: reads segments and allele frequencies for every sample, builds per-sample
/// metrics and copy number models, selects the segments to call, calls variants in parallel, merges and filters
/// the called segments and finally writes the multi-sample VCF plus the per-sample outputs.
/// </summary>
/// <param name="variantFrequencyFiles">Variant frequency file per sample, indexed like <paramref name="sampleNames"/>.</param>
/// <param name="segmentFiles">Segment file per sample, indexed like <paramref name="sampleNames"/>.</param>
/// <param name="outVcfFile">Location of the multi-sample VCF; its directory receives all other outputs.</param>
/// <param name="ploidyBedPath">Forwarded to <c>SampleMetrics.GetSampleInfo</c>.</param>
/// <param name="referenceFolder">Forwarded to the coverage plot and VCF writers.</param>
/// <param name="sampleNames">Sample names; one <c>SampleId</c> is created per entry.</param>
/// <param name="commonCnvsBedPath">Forwarded to <c>CreateSegmentSetsFromCommonCnvs</c>.</param>
/// <param name="sampleTypes">Sample type per sample, indexed like <paramref name="sampleNames"/>.</param>
/// <returns>Always 0.</returns>
internal int CallVariants(List<string> variantFrequencyFiles, List<string> segmentFiles, IFileLocation outVcfFile, string ploidyBedPath, string referenceFolder,
    List<string> sampleNames, string commonCnvsBedPath, List<SampleType> sampleTypes)
{
    const string partitionBedgraphHeader = "track type=bedGraph visibility=full autoScale=on graphType=points";

    // Per-sample state gathered while loading the input files.
    var samplesInfo = new SampleMap<SampleMetrics>();
    var sampleSegments = new SampleMap<Segments>();
    var copyNumberModels = new SampleMap<ICopyNumberModel>();
    var variantFrequencyFilesSampleList = new SampleMap<string>();
    var kinships = new SampleMap<SampleType>();

    for (var sampleIndex = 0; sampleIndex < sampleNames.Count; sampleIndex++)
    {
        var loadedSampleId = new SampleId(sampleNames[sampleIndex]);

        var loadedSegments = Segments.ReadSegments(_logger, new FileLocation(segmentFiles[sampleIndex]));
        var alleles = CanvasIO.ReadFrequenciesWrapper(_logger, new FileLocation(variantFrequencyFiles[sampleIndex]),
            loadedSegments.IntervalsByChromosome);
        loadedSegments.AddAlleles(alleles);
        sampleSegments.Add(loadedSampleId, loadedSegments);

        var metrics = SampleMetrics.GetSampleInfo(loadedSegments.AllSegments, ploidyBedPath,
            _callerParameters.NumberOfTrimmedBins, loadedSampleId);
        var model = _copyNumberModelFactory.CreateModel(_callerParameters.MaximumCopyNumber, metrics.MaxCoverage,
            metrics.MeanCoverage, metrics.MeanMafCoverage);

        samplesInfo.Add(loadedSampleId, metrics);
        copyNumberModels.Add(loadedSampleId, model);
        variantFrequencyFilesSampleList.Add(loadedSampleId, variantFrequencyFiles[sampleIndex]);
        kinships.Add(loadedSampleId, sampleTypes[sampleIndex]);
    }

    // Choose the segmentation to call on, then call every segment group in parallel.
    var segmentSetsFromCommonCnvs = CreateSegmentSetsFromCommonCnvs(variantFrequencyFilesSampleList,
        _callerParameters.MinAlleleCountsThreshold, commonCnvsBedPath, sampleSegments);
    var segmentsForVariantCalling = GetHighestLikelihoodSegments(segmentSetsFromCommonCnvs, samplesInfo, copyNumberModels).ToList();
    PedigreeInfo pedigreeInfo = PedigreeInfo.GetPedigreeInfo(kinships, _callerParameters);

    var parallelOptions = new ParallelOptions
    {
        MaxDegreeOfParallelism = Math.Min(Environment.ProcessorCount, _callerParameters.MaxCoreNumber)
    };
    Parallel.ForEach(
        segmentsForVariantCalling,
        parallelOptions,
        segmentGroup => _variantCaller.CallVariant(segmentGroup, samplesInfo, copyNumberModels, pedigreeInfo));

    // Regroup the called segments from "per segment, all samples" into "per sample, all segments".
    var variantCalledSegments = new SampleMap<List<CanvasSegment>>();
    foreach (var calledSampleId in samplesInfo.SampleIds)
    {
        var segmentsOfSample = segmentsForVariantCalling.Select(segmentGroup => segmentGroup[calledSampleId]).ToList();
        variantCalledSegments.Add(calledSampleId, segmentsOfSample);
    }

    var mergedVariantCalledSegments = MergeSegments(variantCalledSegments, _callerParameters.MinimumCallSize, _qualityFilterThreshold);
    FilterExcessivelyShortSegments(mergedVariantCalledSegments);

    // Coverage plot data, one file per sample.
    var outputFolder = outVcfFile.Directory;
    foreach (var plotSampleId in samplesInfo.SampleIds)
    {
        var plotMetrics = samplesInfo[plotSampleId];
        var coverageOutputPath = SingleSampleCallset.GetCoverageAndVariantFrequencyOutput(outputFolder, plotSampleId.ToString());
        CanvasSegment.WriteCoveragePlotData(mergedVariantCalledSegments[plotSampleId], plotMetrics.MeanCoverage,
            plotMetrics.Ploidy, coverageOutputPath, referenceFolder);
    }

    // The de novo quality threshold is only passed on when a full pedigree is available.
    bool isPedigreeInfoSupplied = pedigreeInfo != null && pedigreeInfo.HasFullPedigree();
    int? denovoQualityThreshold = null;
    if (isPedigreeInfoSupplied)
    {
        denovoQualityThreshold = (int?)_deNovoQualityFilterThreshold;
    }

    // Multi-sample VCF.
    var ploidies = samplesInfo.Select(info => info.Value.Ploidy).ToList();
    var diploidCoverage = samplesInfo.Select(info => info.Value.MeanCoverage).ToList();
    var names = samplesInfo.SampleIds.Select(id => id.ToString()).ToList();
    CanvasSegmentWriter.WriteMultiSampleSegments(outVcfFile.FullName, mergedVariantCalledSegments, diploidCoverage, referenceFolder,
        names, null, ploidies, _qualityFilterThreshold, denovoQualityThreshold, CanvasFilter.SegmentSizeCutoff, isPedigreeInfoSupplied);

    // Per-sample VCF, coverage bigWig, copy number bedGraph and partition bedGraph.
    foreach (var outputSampleId in samplesInfo.SampleIds)
    {
        string sampleLabel = outputSampleId.ToString();
        var sampleMetrics = samplesInfo[outputSampleId];
        var mergedSegmentsOfSample = mergedVariantCalledSegments[outputSampleId];

        var outputVcfPath = SingleSampleCallset.GetVcfOutput(outputFolder, sampleLabel);
        CanvasSegmentWriter.WriteSegments(outputVcfPath.FullName, mergedSegmentsOfSample, sampleMetrics.MeanCoverage, referenceFolder,
            sampleLabel, null, sampleMetrics.Ploidy, _qualityFilterThreshold, isPedigreeInfoSupplied, denovoQualityThreshold, null);

        var visualizationTemp = outputFolder.CreateSubdirectory($"VisualizationTemp{outputSampleId}");
        var normalizationFactor = NormalizationCalculator.ComputeNormalizationFactor(mergedSegmentsOfSample);
        var bigWig = _coverageBigWigWriter.Write(mergedSegmentsOfSample, visualizationTemp, normalizationFactor);
        if (bigWig != null)
        {
            // The writer may return nothing; only an existing file is moved to its final location.
            bigWig.MoveTo(SingleSampleCallset.GetCoverageBigWig(outputFolder, sampleLabel));
        }

        var copyNumberBedGraph = SingleSampleCallset.GetCopyNumberBedGraph(outputFolder, sampleLabel);
        _copyNumberBedGraphWriter.Write(mergedSegmentsOfSample, sampleMetrics.Ploidy, copyNumberBedGraph);

        // The partition track is written from the segments as originally read, not from the merged ones.
        var originalSegments = sampleSegments[outputSampleId];
        _partitionCoverageBedGraphWriter.Write(originalSegments.AllSegments,
            SingleSampleCallset.GetPartitionBedGraph(outputFolder, sampleLabel), normalizationFactor, partitionBedgraphHeader);
    }

    return 0;
}
/// <summary>
/// Estimate joint likelihood and most likely CN assignment within a pedigree using total CN Genotype
/// likelihoods and the pedigree's transition matrix.
/// </summary>
/// <param name="pedigreeInfo">Pedigree description. When it is null or does not describe a full pedigree,
/// an empty genotype map and a freshly constructed likelihood object are returned.</param>
/// <param name="copyNumbersLikelihoods">Per-sample likelihood of every total copy number genotype.</param>
/// <returns>The genotype assignment with the highest joint log likelihood together with the joint likelihoods
/// of all evaluated assignments.</returns>
/// <exception cref="IlluminaException">Thrown when no evaluated assignment raised the maximal log likelihood,
/// leaving the genotype map empty.</exception>
private (ISampleMap<Genotype> copyNumbersGenotypes, JointLikelihoods jointLikelihood) GetPedigreeCopyNumbers(PedigreeInfo pedigreeInfo,
    ISampleMap<Dictionary<Genotype, double>> copyNumbersLikelihoods)
{
    var sampleCopyNumbersGenotypes = new SampleMap<Genotype>();
    var jointLikelihood = new JointLikelihoods();

    // Guard against a missing pedigree as well as an incomplete one; previously a null pedigreeInfo was
    // tolerated by the first expression but dereferenced right afterwards.
    if (pedigreeInfo == null || !pedigreeInfo.HasFullPedigree())
    {
        return (sampleCopyNumbersGenotypes, jointLikelihood);
    }

    // With two or more offspring only the 3 most likely genotypes per sample are kept (presumably to bound
    // the number of combinations); otherwise up to MaximumCopyNumber genotypes are kept.
    int nHighestLikelihoodGenotypes = pedigreeInfo.OffspringIds.Count >= 2 ? 3 : _callerParameters.MaximumCopyNumber;
    var topLikelihoods = copyNumbersLikelihoods.SelectValues(l =>
        l.OrderByDescending(kvp => kvp.Value).Take(nHighestLikelihoodGenotypes).ToDictionary());

    var parent1Id = pedigreeInfo.ParentsIds.First();
    var parent2Id = pedigreeInfo.ParentsIds.Last();
    var offspringIds = pedigreeInfo.OffspringIds;
    // Offspring total copy numbers are capped at the largest copy number a genotype is created for.
    int maxTotalCopyNumber = _callerParameters.MaximumCopyNumber - 1;

    // parent 1 total CNs and likelihoods
    foreach (var copyNumberParent1 in topLikelihoods[parent1Id])
    {
        // parent 2 total CNs and likelihoods
        foreach (var copyNumberParent2 in topLikelihoods[parent2Id])
        {
            // for offspring in addition to querying likelihoods using total CNs, iterate over all possible
            // genotype combination (CopyNumberA/B) for a given CN and estimate likely transition
            // probabilities using TransitionMatrix
            foreach (var offspringGtStates in pedigreeInfo.OffspringPhasedGenotypes)
            {
                // Derive each offspring's total CN genotype once and stop at the first one whose
                // likelihood is not among the retained genotypes.
                var totalCopyNumberGenotypes = new List<Genotype>(offspringIds.Count);
                bool allOffspringGenotypesAvailable = true;
                for (var counter = 0; counter < offspringIds.Count; counter++)
                {
                    var phasedGenotype = offspringGtStates[counter].PhasedGenotype;
                    var copyNumberGenotypeChild = Genotype.Create(
                        Math.Min(phasedGenotype.CopyNumberA + phasedGenotype.CopyNumberB, maxTotalCopyNumber));
                    if (!topLikelihoods[offspringIds[counter]].ContainsKey(copyNumberGenotypeChild))
                    {
                        allOffspringGenotypesAvailable = false;
                        break;
                    }
                    totalCopyNumberGenotypes.Add(copyNumberGenotypeChild);
                }

                if (!allOffspringGenotypesAvailable)
                {
                    // unavailable total CN
                    continue;
                }

                // For a given combination of offspring copy numbers, only the genotypes that result in the
                // maximum likelihood contribute to the final result.
                double currentLikelihood = copyNumberParent1.Value * copyNumberParent2.Value;
                for (var counter = 0; counter < offspringIds.Count; counter++)
                {
                    var phasedGenotype = offspringGtStates[counter].PhasedGenotype;
                    currentLikelihood *=
                        pedigreeInfo.TransitionMatrix[copyNumberParent1.Key.TotalCopyNumber][phasedGenotype.CopyNumberA] *
                        pedigreeInfo.TransitionMatrix[copyNumberParent2.Key.TotalCopyNumber][phasedGenotype.CopyNumberB] *
                        topLikelihoods[offspringIds[counter]][totalCopyNumberGenotypes[counter]];
                }

                // Non-finite products are treated as zero likelihood.
                currentLikelihood = Double.IsNaN(currentLikelihood) || Double.IsInfinity(currentLikelihood)
                    ? 0
                    : currentLikelihood;

                var genotypesInPedigree = new SampleMap<Genotype>
                {
                    { parent1Id, copyNumberParent1.Key },
                    { parent2Id, copyNumberParent2.Key }
                };
                for (var counter = 0; counter < offspringIds.Count; counter++)
                {
                    genotypesInPedigree.Add(offspringIds[counter], totalCopyNumberGenotypes[counter]);
                }
                genotypesInPedigree = genotypesInPedigree.OrderBy(pedigreeInfo.AllSampleIds);

                jointLikelihood.AddJointLikelihood(genotypesInPedigree, currentLikelihood);

                // Math.Log(0) is negative infinity, which never exceeds the current maximum.
                double currentLogLikelihood = Math.Log(currentLikelihood);
                if (currentLogLikelihood > jointLikelihood.MaximalLogLikelihood)
                {
                    jointLikelihood.MaximalLogLikelihood = currentLogLikelihood;
                    sampleCopyNumbersGenotypes = genotypesInPedigree;
                }
            }
        }
    }

    if (sampleCopyNumbersGenotypes.Empty())
    {
        throw new IlluminaException("Maximal likelihood was not found");
    }

    return (sampleCopyNumbersGenotypes, jointLikelihood);
}