/// <summary>
/// Parse the outputs of CanvasSNV, and note these variant frequencies in the appropriate segment.
/// Bins with at least one frequency are recorded in <c>VafByChr</c> as (bin index, mean frequency).
/// </summary>
/// <param name="referenceFolder">Not read by this method body.</param>
/// <remarks>
/// Any exception raised while loading is reported on standard error and the process exits with code 1.
/// </remarks>
public void LoadVAFInput(string referenceFolder)
{
    try
    {
        var frequenciesPerBin = new Dictionary<string, List<List<double>>>();
        var binIntervals = new Dictionary<string, List<BedInterval>>();

        // One (initially empty) frequency list and one interval per coverage bin.
        foreach (string chromosome in CoverageInfo.StartByChr.Keys)
        {
            int binCount = CoverageInfo.StartByChr[chromosome].Length;
            var emptyBins = new List<List<double>>(binCount);
            var intervals = new List<BedInterval>();
            frequenciesPerBin[chromosome] = emptyBins;
            binIntervals[chromosome] = intervals;

            for (int bin = 0; bin < CoverageInfo.StartByChr[chromosome].Length; bin++)
            {
                int start = Convert.ToInt32(CoverageInfo.StartByChr[chromosome][bin]);
                int end = Convert.ToInt32(CoverageInfo.EndByChr[chromosome][bin]);
                emptyBins.Add(new List<double>());
                intervals.Add(new BedInterval(start, end));
            }
        }

        var inputLocation = new FileLocation(InputVafPath);
        var alleleCountsByChromosome = CanvasIO.ReadFrequenciesWrapper(_logger, inputLocation, binIntervals);

        // Replace the placeholder lists with the frequencies read for each interval.
        foreach (var chromosome in alleleCountsByChromosome.Keys)
        {
            for (int bin = 0; bin < alleleCountsByChromosome[chromosome].Count; bin++)
            {
                frequenciesPerBin[chromosome][bin] = alleleCountsByChromosome[chromosome][bin].MaxFrequencies;
            }
        }

        // Keep only the bins that actually contain frequencies, remembering their original index.
        foreach (var perChromosome in frequenciesPerBin)
        {
            var populatedBins = new List<VafContainingBins>();
            VafByChr[perChromosome.Key] = populatedBins;

            List<List<double>> bins = perChromosome.Value;
            for (int bin = 0; bin < bins.Count; bin++)
            {
                List<double> frequencies = bins[bin];
                if (frequencies.Count > 0)
                {
                    populatedBins.Add(new VafContainingBins(bin, frequencies.Average()));
                }
            }
        }

        _logger.Info("Done processing VAFs\n");
    }
    catch (Exception e)
    {
        Console.Error.WriteLine("File {0} could not be read:", InputVafPath);
        Console.Error.WriteLine(e.Message);
        Environment.Exit(1);
    }
}
/// <summary>
/// Create CanvasSegments from a common CNVs bed file and overlap them with CanvasPartition
/// segments to create SegmentHaplotypes.
/// </summary>
/// <param name="variantFrequencyFiles">Per-sample path of the variant frequency file to read.</param>
/// <param name="defaultAlleleCountThreshold">Allele count threshold forwarded to <c>GetSegmentSets</c>.</param>
/// <param name="commonCNVsbedPath">
/// Path of the common CNVs bed file. When null, every existing segment is wrapped in its own
/// <c>OverlappingSegmentsRegion</c> and no bed file is read.
/// </param>
/// <param name="sampleSegments">Per-sample segments; the first sample supplies chromosomes and genomic bins.</param>
/// <returns>The result of <c>GetOverlappingSegmentsRegionSampleLists</c> over the per-sample regions.</returns>
/// <exception cref="ArgumentException">
/// Thrown when <c>IsIdenticalChromosomeNames</c> returns true for the bed regions and the chromosomes.
/// </exception>
private IEnumerable<ISampleMap<OverlappingSegmentsRegion>> CreateSegmentSetsFromCommonCnvs(ISampleMap<string> variantFrequencyFiles,
    int defaultAlleleCountThreshold, string commonCNVsbedPath, ISampleMap<Segments> sampleSegments)
{
    if (commonCNVsbedPath == null)
    {
        var defaultSampleRegions = sampleSegments
            .SelectValues(segments => segments.AllSegments.Select(segment => new OverlappingSegmentsRegion(segment)).ToList());
        return GetOverlappingSegmentsRegionSampleLists(defaultSampleRegions);
    }

    var commonRegions = ReadCommonRegions(commonCNVsbedPath);
    var referenceSegments = sampleSegments.Values.First();
    var chromosomes = referenceSegments.GetChromosomes();

    // NOTE(review): the helper's name suggests "true" means the names match, yet the error is raised
    // on "true". The helper is not visible here - confirm its return semantics before touching this.
    if (IsIdenticalChromosomeNames(commonRegions, chromosomes))
    {
        throw new ArgumentException(
            $"Chromosome names in a common CNVs bed file {commonCNVsbedPath} does not match the genome reference");
    }

    var segmentIntervalsByChromosome = new Dictionary<string, List<BedInterval>>();
    var genomicBinsByChromosome = new Dictionary<string, IReadOnlyList<SampleGenomicBin>>();

    // Dictionary<TKey, TValue> does not support concurrent writers. The per-chromosome work runs
    // in parallel, but the stores into the shared dictionaries are serialised through this gate.
    var dictionaryGate = new object();
    Parallel.ForEach(
        chromosomes,
        chr =>
        {
            IReadOnlyList<SampleGenomicBin> genomicBins = referenceSegments.GetGenomicBinsForChromosome(chr);
            List<BedInterval> segmentIntervals =
                CanvasSegment.RemapGenomicToBinCoordinates(commonRegions[chr], genomicBins);

            lock (dictionaryGate)
            {
                genomicBinsByChromosome[chr] = genomicBins;
                segmentIntervalsByChromosome[chr] = segmentIntervals;
            }
        });

    var sampleRegions = new SampleMap<List<OverlappingSegmentsRegion>>();
    foreach (var sampleId in sampleSegments.SampleIds)
    {
        var commonIntervals = commonRegions.ToDictionary(kvp => kvp.Key,
            kvp => kvp.Value.Select(bedEntry => bedEntry.Interval).ToList());
        var allelesByChromosomeCommonSegs = CanvasIO.ReadFrequenciesWrapper(_logger,
            new FileLocation(variantFrequencyFiles[sampleId]), commonIntervals);
        var segmentsSets = GetSegmentSets(defaultAlleleCountThreshold, commonRegions, genomicBinsByChromosome,
            segmentIntervalsByChromosome, allelesByChromosomeCommonSegs, sampleSegments[sampleId]);
        sampleRegions.Add(sampleId, segmentsSets);
    }

    return GetOverlappingSegmentsRegionSampleLists(sampleRegions);
}
/// <summary>
/// Single-sample calling: reads segments from <paramref name="inFile"/>, attaches allele data from
/// <paramref name="variantFrequencyFile"/>, assigns ploidy calls and quality scores, merges
/// neighbouring segments, and writes the coverage plot data and segments for <paramref name="outFile"/>.
/// </summary>
/// <param name="variantFrequencyFile">Path of the variant frequency file to load.</param>
/// <param name="inFile">Path of the segment file; its directory becomes <c>TempFolder</c>.</param>
/// <param name="outFile">Output path handed to <c>CanvasSegmentWriter.WriteSegments</c>.</param>
/// <param name="ploidyVcfPath">Optional ploidy VCF; skipped when null or empty.</param>
/// <param name="referenceFolder">Reference folder forwarded to the writers.</param>
/// <param name="sampleName">Sample name forwarded to the segment writer.</param>
/// <param name="truthDataPath">Optional known copy number data; skipped when null or empty.</param>
/// <returns>Always 0.</returns>
public int CallVariants(string variantFrequencyFile, string inFile, string outFile, string ploidyVcfPath, string referenceFolder, string sampleName, string truthDataPath)
{
    bool hasTruthData = !string.IsNullOrEmpty(truthDataPath);
    if (hasTruthData)
    {
        _cnOracle = new CopyNumberOracle();
        _cnOracle.LoadKnownCN(truthDataPath);
    }

    _segments = Segments.ReadSegments(_logger, new FileLocation(inFile));
    _allSegments = _segments.AllSegments.ToList();
    TempFolder = Path.GetDirectoryName(inFile);

    // Nothing to call: still emit an (empty) output file and stop.
    if (_allSegments.Count == 0)
    {
        Console.WriteLine("CanvasDiploidCaller: No segments loaded; no CNV calls will be made.");
        CanvasSegmentWriter.WriteSegments(outFile, _allSegments, _model?.DiploidCoverage, referenceFolder,
            sampleName, null, null, QualityFilterThreshold, false, null, null);
        return 0;
    }

    PloidyInfo ploidy = string.IsNullOrEmpty(ploidyVcfPath)
        ? null
        : PloidyInfo.LoadPloidyFromVcfFileNoSampleId(ploidyVcfPath);

    // Load MAF data and attach it to the segments.
    var allelesByChromosome = CanvasIO.ReadFrequenciesWrapper(_logger, new FileLocation(variantFrequencyFile),
        _segments.IntervalsByChromosome);
    _segments.AddAlleles(allelesByChromosome);
    MeanCoverage = allelesByChromosome
        .SelectMany(perChromosome => perChromosome.Value)
        .SelectMany(alleles => alleles.TotalCoverage)
        .Average();
    AggregateVariantCoverage(ref _allSegments);

    // Create new models for different copy number states.
    InitializePloidies();

    // Compute statistics on the copy number two regions.
    float[] diploidCounts = AggregateCounts(ref _allSegments);
    _diploidCoverage = Utilities.Mean(diploidCounts);
    _coverageWeightingFactor = CoverageWeighting / _diploidCoverage;

    // New coverage model.
    _model = new CoverageModel { DiploidCoverage = _diploidCoverage };

    // NOTE(review): this list is filled below but not read again inside this method.
    var segmentInfos = new List<SegmentInfo>();
    foreach (CanvasSegment canvasSegment in _allSegments)
    {
        SegmentInfo info = new SegmentInfo { Segment = canvasSegment };

        // Fold each frequency into [0, 0.5] so it is a minor allele frequency.
        var foldedFrequencies = new List<double>();
        foreach (float frequency in canvasSegment.Balleles.Frequencies)
        {
            if (frequency > 0.5)
            {
                foldedFrequencies.Add(1 - frequency);
            }
            else
            {
                foldedFrequencies.Add(frequency);
            }
        }

        // -1 marks a segment without any frequencies.
        info.Maf = foldedFrequencies.Count > 0 ? Utilities.Median(foldedFrequencies) : -1;
        info.Coverage = Utilities.Median(canvasSegment.Counts);
        info.Weight = _allSegments.Count > 100 ? canvasSegment.Length : canvasSegment.BinCount;
        segmentInfos.Add(info);
    }

    AssignPloidyCallsDistance(_model);
    CanvasSegment.AssignQualityScores(_allSegments, CanvasSegment.QScoreMethod.LogisticGermline, _germlineScoreParameters);

    // Merge neighboring segments that got the same copy number call.
    // Merging requires quality scores, so it happens after they have been assigned.
    var mergedSegments = CanvasSegment.MergeSegments(_allSegments);

    // Recalculating qscores after merging segments improves performance!
    CanvasSegment.AssignQualityScores(mergedSegments, CanvasSegment.QScoreMethod.LogisticGermline, _germlineScoreParameters);
    CanvasSegment.SetFilterForSegments(QualityFilterThreshold, mergedSegments, CanvasFilter.SegmentSizeCutoff);

    var coverageOutputPath = SingleSampleCallset.GetCoverageAndVariantFrequencyOutputPath(outFile);
    CanvasSegment.WriteCoveragePlotData(mergedSegments, _model.DiploidCoverage, ploidy, coverageOutputPath, referenceFolder);

    if (_cnOracle != null)
    {
        GenerateReportVersusKnownCopyNumber();
    }

    var extraHeaders = new List<string>();
    string ploidyHeader = ploidy?.HeaderLine;
    if (!string.IsNullOrEmpty(ploidyHeader))
    {
        extraHeaders.Add(ploidyHeader);
    }

    CanvasSegmentWriter.WriteSegments(outFile, mergedSegments, _model.DiploidCoverage, referenceFolder, sampleName,
        extraHeaders, ploidy, QualityFilterThreshold, false, null, null);
    return 0;
}
/// <summary>
/// Multi-sample calling: loads segments and allele data per sample, builds candidate segment sets
/// (optionally using a common CNVs bed file), calls variants in parallel, merges the called
/// segments, and writes the multi-sample VCF plus per-sample outputs next to
/// <paramref name="outVcfFile"/>.
/// </summary>
/// <param name="variantFrequencyFiles">Variant frequency file per sample, indexed like <paramref name="sampleNames"/>.</param>
/// <param name="segmentFiles">Segment file per sample, indexed like <paramref name="sampleNames"/>.</param>
/// <param name="outVcfFile">Multi-sample VCF location; its directory receives the per-sample files.</param>
/// <param name="ploidyBedPath">Ploidy bed path forwarded to <c>SampleMetrics.GetSampleInfo</c>.</param>
/// <param name="referenceFolder">Reference folder forwarded to the writers.</param>
/// <param name="sampleNames">Sample names; one <c>SampleId</c> is created per entry.</param>
/// <param name="commonCnvsBedPath">Common CNVs bed path forwarded to <c>CreateSegmentSetsFromCommonCnvs</c>.</param>
/// <param name="sampleTypes">Sample type per sample, indexed like <paramref name="sampleNames"/>.</param>
/// <returns>Always 0.</returns>
internal int CallVariants(List<string> variantFrequencyFiles, List<string> segmentFiles, IFileLocation outVcfFile, string ploidyBedPath, string referenceFolder, List<string> sampleNames, string commonCnvsBedPath, List<SampleType> sampleTypes)
{
    const string partitionBedgraphHeader = "track type=bedGraph visibility=full autoScale=on graphType=points";

    // Per-sample state, all keyed by sample id.
    var samplesInfo = new SampleMap<SampleMetrics>();
    var sampleSegments = new SampleMap<Segments>();
    var copyNumberModels = new SampleMap<ICopyNumberModel>();
    var variantFrequencyFilesSampleList = new SampleMap<string>();
    var kinships = new SampleMap<SampleType>();

    // Load the input files; all input lists are indexed by the position of the sample name.
    for (int sampleIndex = 0; sampleIndex < sampleNames.Count; sampleIndex++)
    {
        var sampleId = new SampleId(sampleNames[sampleIndex]);

        var loadedSegments = Segments.ReadSegments(_logger, new FileLocation(segmentFiles[sampleIndex]));
        var alleles = CanvasIO.ReadFrequenciesWrapper(_logger, new FileLocation(variantFrequencyFiles[sampleIndex]),
            loadedSegments.IntervalsByChromosome);
        loadedSegments.AddAlleles(alleles);
        sampleSegments.Add(sampleId, loadedSegments);

        var metrics = SampleMetrics.GetSampleInfo(loadedSegments.AllSegments, ploidyBedPath,
            _callerParameters.NumberOfTrimmedBins, sampleId);
        var model = _copyNumberModelFactory.CreateModel(_callerParameters.MaximumCopyNumber, metrics.MaxCoverage,
            metrics.MeanCoverage, metrics.MeanMafCoverage);

        samplesInfo.Add(sampleId, metrics);
        copyNumberModels.Add(sampleId, model);
        variantFrequencyFilesSampleList.Add(sampleId, variantFrequencyFiles[sampleIndex]);
        kinships.Add(sampleId, sampleTypes[sampleIndex]);
    }

    var segmentSetsFromCommonCnvs = CreateSegmentSetsFromCommonCnvs(variantFrequencyFilesSampleList,
        _callerParameters.MinAlleleCountsThreshold, commonCnvsBedPath, sampleSegments);
    var segmentsForVariantCalling =
        GetHighestLikelihoodSegments(segmentSetsFromCommonCnvs, samplesInfo, copyNumberModels).ToList();
    PedigreeInfo pedigreeInfo = PedigreeInfo.GetPedigreeInfo(kinships, _callerParameters);

    // Call variants on each segment set, capped at the configured number of cores.
    var parallelOptions = new ParallelOptions
    {
        MaxDegreeOfParallelism = Math.Min(Environment.ProcessorCount, _callerParameters.MaxCoreNumber)
    };
    Parallel.ForEach(
        segmentsForVariantCalling,
        parallelOptions,
        segmentSet => _variantCaller.CallVariant(segmentSet, samplesInfo, copyNumberModels, pedigreeInfo));

    // Regroup the called segments per sample.
    var variantCalledSegments = new SampleMap<List<CanvasSegment>>();
    foreach (var sampleId in samplesInfo.SampleIds)
    {
        var calledForSample = segmentsForVariantCalling.Select(segmentSet => segmentSet[sampleId]).ToList();
        variantCalledSegments.Add(sampleId, calledForSample);
    }

    var mergedVariantCalledSegments = MergeSegments(variantCalledSegments, _callerParameters.MinimumCallSize,
        _qualityFilterThreshold);
    FilterExcessivelyShortSegments(mergedVariantCalledSegments);

    var outputFolder = outVcfFile.Directory;
    foreach (var sampleId in samplesInfo.SampleIds)
    {
        var metrics = samplesInfo[sampleId];
        var coverageOutputPath =
            SingleSampleCallset.GetCoverageAndVariantFrequencyOutput(outputFolder, sampleId.ToString());
        CanvasSegment.WriteCoveragePlotData(mergedVariantCalledSegments[sampleId], metrics.MeanCoverage,
            samplesInfo[sampleId].Ploidy, coverageOutputPath, referenceFolder);
    }

    // The de novo quality threshold only applies when a full pedigree is available.
    bool isPedigreeInfoSupplied = pedigreeInfo != null && pedigreeInfo.HasFullPedigree();
    int? denovoQualityThreshold = null;
    if (isPedigreeInfoSupplied)
    {
        denovoQualityThreshold = (int?)_deNovoQualityFilterThreshold;
    }

    var ploidies = samplesInfo.Select(entry => entry.Value.Ploidy).ToList();
    var diploidCoverage = samplesInfo.Select(entry => entry.Value.MeanCoverage).ToList();
    var names = samplesInfo.SampleIds.Select(id => id.ToString()).ToList();
    CanvasSegmentWriter.WriteMultiSampleSegments(outVcfFile.FullName, mergedVariantCalledSegments, diploidCoverage,
        referenceFolder, names, null, ploidies, _qualityFilterThreshold, denovoQualityThreshold,
        CanvasFilter.SegmentSizeCutoff, isPedigreeInfoSupplied);

    // Per-sample outputs: VCF, coverage bigWig, copy number bedGraph and partition bedGraph.
    foreach (var sampleId in samplesInfo.SampleIds)
    {
        var outputVcfPath = SingleSampleCallset.GetVcfOutput(outputFolder, sampleId.ToString());
        var sampleMetrics = samplesInfo[sampleId];
        var mergedForSample = mergedVariantCalledSegments[sampleId];
        CanvasSegmentWriter.WriteSegments(outputVcfPath.FullName, mergedForSample, sampleMetrics.MeanCoverage,
            referenceFolder, sampleId.ToString(), null, sampleMetrics.Ploidy, _qualityFilterThreshold,
            isPedigreeInfoSupplied, denovoQualityThreshold, null);

        var visualizationTemp = outputFolder.CreateSubdirectory($"VisualizationTemp{sampleId}");
        var normalizationFactor = NormalizationCalculator.ComputeNormalizationFactor(mergedForSample);
        var bigWig = _coverageBigWigWriter.Write(mergedForSample, visualizationTemp, normalizationFactor);
        // The writer may return null, in which case there is nothing to move.
        bigWig?.MoveTo(SingleSampleCallset.GetCoverageBigWig(outputFolder, sampleId.ToString()));

        var copyNumberBedGraph = SingleSampleCallset.GetCopyNumberBedGraph(outputFolder, sampleId.ToString());
        _copyNumberBedGraphWriter.Write(mergedForSample, sampleMetrics.Ploidy, copyNumberBedGraph);

        var originalSegments = sampleSegments[sampleId];
        _partitionCoverageBedGraphWriter.Write(originalSegments.AllSegments,
            SingleSampleCallset.GetPartitionBedGraph(outputFolder, sampleId.ToString()), normalizationFactor,
            partitionBedgraphHeader);
    }

    return 0;
}