/// <summary>
/// Builds the small-pedigree callset from the configured options: one
/// <see cref="PedigreeSample"/> per entry in <c>SmallPedigreeOptions.Samples</c>,
/// plus the analysis details shared by all samples.
/// </summary>
/// <returns>A <see cref="SmallPedigreeCallset"/> wrapping the per-sample callsets and the shared analysis details.</returns>
private SmallPedigreeCallset GetCallset()
{
    var pedigreeSamples = new List<PedigreeSample>();

    // Every sample is handed the same output VCF location inside the common output directory.
    var sharedOutputVcf = CommonOptions.OutputDirectory.GetFileLocation("CNV.vcf.gz");

    foreach (var sampleOptions in SmallPedigreeOptions.Samples)
    {
        var name = sampleOptions.SampleName;
        var sampleBam = new Bam(sampleOptions.Bam);
        var singleSampleCallset = new SingleSampleCallset(
            sampleBam,
            name,
            SmallPedigreeOptions.BAlleleSites,
            SmallPedigreeOptions.IsPopulationBAlleleSites,
            CommonOptions.OutputDirectory,
            sharedOutputVcf);

        // The per-sample output folder is created up front, before the sample is registered.
        singleSampleCallset.SampleOutputFolder.Create();
        pedigreeSamples.Add(new PedigreeSample(singleSampleCallset, sampleOptions.SampleType));
    }

    var details = new AnalysisDetails(
        CommonOptions.OutputDirectory,
        CommonOptions.WholeGenomeFasta,
        CommonOptions.KmerFasta,
        CommonOptions.FilterBed,
        SmallPedigreeOptions.MultiSamplePloidyVcf,
        SmallPedigreeOptions.CommonCnvsBed);

    return new SmallPedigreeCallset(pedigreeSamples, details);
}
/// <summary>
/// Moves each file of a sample's intermediate output to the final location that
/// <see cref="SingleSampleCallset"/> derives from the sample's output stub.
/// </summary>
/// <param name="info">Sample whose output stub is resolved via <c>GetSingleSampleOutputStub</c>.</param>
/// <param name="output">Current (intermediate) locations of the sample's files.</param>
/// <param name="fileMover">Performs each move; moves are issued in a fixed order.</param>
private void MoveIntermediateOutput(SampleInfo info, IntermediateOutput output, IFileMover fileMover)
{
    var outputStub = GetSingleSampleOutputStub(info);

    // Output:
    fileMover.Move(output.CnvVcf.VcfFile, SingleSampleCallset.GetVcfOutput(outputStub));

    // Files for visualization:
    fileMover.Move(output.CoverageBigwig, SingleSampleCallset.GetCoverageBigWig(outputStub));

    // Each bedgraph travels together with its tabix index.
    var bAlleleDestination = SingleSampleCallset.GetBAlleleBedGraph(outputStub);
    fileMover.Move(output.BAlleleBedgraph.FileLocation, bAlleleDestination.FileLocation);
    fileMover.Move(output.BAlleleBedgraph.TabixIndex, bAlleleDestination.TabixIndex);

    var copyNumberDestination = SingleSampleCallset.GetCopyNumberBedGraph(outputStub);
    fileMover.Move(output.CopyNumberBedgraph.FileLocation, copyNumberDestination.FileLocation);
    fileMover.Move(output.CopyNumberBedgraph.TabixIndex, copyNumberDestination.TabixIndex);

    // Deprecated files:
#pragma warning disable CS0618 // Type or member is obsolete
    // Used for (non-dynamic) plotting
    fileMover.Move(output.CoverageAndVariantFrequencies, SingleSampleCallset.GetCoverageAndVariantFrequencyOutput(outputStub));
    // used by BSVI
    fileMover.Move(output.Partitioned, SingleSampleCallset.GetPartitionedPath(outputStub));
    // used by BSVI
    fileMover.Move(output.VariantFrequencies, SingleSampleCallset.GetVfSummaryPath(outputStub));
    // used by BSVI
    fileMover.Move(output.VariantFrequenciesBaf, SingleSampleCallset.GetVfSummaryBafPath(outputStub));
#pragma warning restore CS0618 // Type or member is obsolete
}
/// <summary>
/// Moves a sample's intermediate output files to their final locations, derived
/// from the stub returned by <c>GetStub(info.Id)</c>.
/// </summary>
/// <param name="info">Sample whose id selects the output stub.</param>
/// <param name="output">Current (intermediate) locations of the sample's files.</param>
/// <param name="fileMover">Performs each move.</param>
/// <remarks>
/// The coverage-and-variant-frequency file is always moved. The partitioned file and
/// the two variant-frequency summaries are moved only when
/// <c>_canvasWorkerFactory.IncludeIntermediateResults()</c> returns true.
/// </remarks>
private void MoveIntermediateOutput(SampleInfo info, IntermediateOutput output, IFileMover fileMover)
{
    var stub = GetStub(info.Id);
    fileMover.Move(output.CoverageAndVariantFrequencies, SingleSampleCallset.GetCoverageAndVariantFrequencyOutput(stub));

    if (!_canvasWorkerFactory.IncludeIntermediateResults())
    {
        return;
    }

    fileMover.Move(output.Partitioned, SingleSampleCallset.GetPartitionedPath(stub));
    fileMover.Move(output.VariantFrequencies, SingleSampleCallset.GetVfSummaryPath(stub));

    // The BAF summary must go to its own destination. Sending it to GetVfSummaryPath
    // would target the same file as the variant-frequency summary moved just above.
    fileMover.Move(output.VariantFrequenciesBaf, SingleSampleCallset.GetVfSummaryBafPath(stub));
}
/// <summary>
/// Describes where the pedigree run places its results inside <paramref name="sampleSandbox"/>:
/// one <see cref="IntermediateOutput"/> per sample plus the pedigree-level <c>CNV.vcf.gz</c>.
/// </summary>
/// <param name="pedigreeBams">Per-sample BAMs; each is mapped through <c>GetReadGroupSample</c> to the value used to build file names.</param>
/// <param name="sampleSandbox">Directory in which all output locations are resolved.</param>
/// <returns>The expected output locations wrapped in a <see cref="CanvasSmallPedigreeOutput"/>.</returns>
private CanvasSmallPedigreeOutput GetCanvasOutput(SampleSet<Bam> pedigreeBams, IDirectoryLocation sampleSandbox)
{
    var perSampleOutputs = pedigreeBams
        .SelectData(GetReadGroupSample)
        .SelectData(sampleName => new IntermediateOutput(
            SingleSampleCallset.GetCoverageAndVariantFrequencyOutput(sampleSandbox, sampleName),
            SingleSampleCallset.GetVfSummaryPath(sampleSandbox, sampleName),
            SingleSampleCallset.GetVfSummaryBafPath(sampleSandbox, sampleName),
            SingleSampleCallset.GetPartitionedPath(sampleSandbox, sampleName)));

    var pedigreeVcf = new Vcf(sampleSandbox.GetFileLocation("CNV.vcf.gz"));
    return new CanvasSmallPedigreeOutput(pedigreeVcf, perSampleOutputs);
}
/// <summary>
/// Creates a callset from raw file locations, building the wrapped
/// <see cref="SingleSampleCallset"/> internally.
/// </summary>
/// <param name="bam">Sample BAM location; wrapped in a <see cref="Bam"/>.</param>
/// <param name="sampleName">Forwarded to the <see cref="SingleSampleCallset"/> constructor.</param>
/// <param name="normalVcfPath">Forwarded to the <see cref="SingleSampleCallset"/> constructor.</param>
/// <param name="isDbSnpVcf">Forwarded to the <see cref="SingleSampleCallset"/> constructor.</param>
/// <param name="normalBamPaths">Normal BAM locations; must not be null because it is projected with <c>Select</c>.</param>
/// <param name="manifest">Stored in <c>Manifest</c>.</param>
/// <param name="somaticVcfPath">Stored in <c>SomaticVcfPath</c>.</param>
/// <param name="outputVcfPath">Forwarded to the <see cref="SingleSampleCallset"/> constructor.</param>
/// <param name="analysisDetails">Stored in <c>AnalysisDetails</c>; its <c>OutputFolder</c> is passed to the single-sample callset.</param>
public CanvasCallset(
    IFileLocation bam,
    string sampleName,
    IFileLocation normalVcfPath,
    bool isDbSnpVcf,
    IEnumerable<IFileLocation> normalBamPaths,
    NexteraManifest manifest,
    IFileLocation somaticVcfPath,
    IFileLocation outputVcfPath,
    AnalysisDetails analysisDetails)
{
    var sampleBam = new Bam(bam);
    var outputFolder = analysisDetails.OutputFolder;
    SingleSampleCallset = new SingleSampleCallset(
        sampleBam,
        sampleName,
        normalVcfPath,
        isDbSnpVcf,
        outputFolder,
        outputVcfPath);

    AnalysisDetails = analysisDetails;
    SomaticVcfPath = somaticVcfPath;
    Manifest = manifest;

    // Stored as a projection over the caller's sequence, not as a materialized list.
    NormalBamPaths = normalBamPaths.Select(location => new Bam(location));
}
/// <summary>
/// Rebuilds the <see cref="CanvasSmallPedigreeOutput"/> for <paramref name="input"/> from the
/// final file locations, without running anything.
/// </summary>
/// <param name="input">Pedigree input; one <see cref="IntermediateOutput"/> is produced per entry in <c>input.Samples</c>.</param>
/// <returns>
/// The pedigree VCF (from <c>GetCnvVcf()</c>) together with the per-sample outputs. When
/// <c>_canvasWorkerFactory.IncludeIntermediateResults()</c> is false, only the
/// coverage-and-variant-frequency location is populated and the other three are null.
/// </returns>
private CanvasSmallPedigreeOutput Load(CanvasSmallPedigreeInput input)
{
    var intermediateOutputs = input.Samples.SelectData((info, sample) =>
    {
        var stub = GetStub(info.Id);
        var coverageAndVariantFrequency = SingleSampleCallset.GetCoverageAndVariantFrequencyOutput(stub);

        if (!_canvasWorkerFactory.IncludeIntermediateResults())
        {
            return new IntermediateOutput(coverageAndVariantFrequency, null, null, null);
        }

        var partitioned = SingleSampleCallset.GetPartitionedPath(stub);
        var variantFrequencies = SingleSampleCallset.GetVfSummaryPath(stub);

        // The BAF summary has its own path. Resolving it with GetVfSummaryPath would make
        // both locations point at the same file.
        var variantFrequenciesBaf = SingleSampleCallset.GetVfSummaryBafPath(stub);

        return new IntermediateOutput(coverageAndVariantFrequency, variantFrequencies, variantFrequenciesBaf, partitioned);
    });

    return new CanvasSmallPedigreeOutput(new Vcf(GetCnvVcf()), intermediateOutputs);
}
/// <summary>
/// Creates a callset around an already-constructed <see cref="SingleSampleCallset"/>.
/// </summary>
/// <param name="singleSampleCallset">Stored in <c>SingleSampleCallset</c>.</param>
/// <param name="analysisDetails">Stored in <c>AnalysisDetails</c>.</param>
/// <param name="normalBamPaths">Optional; when non-null each location is wrapped in a <see cref="Bam"/> and exposed through <c>NormalBamPaths</c>.</param>
/// <param name="manifest">Stored in <c>Manifest</c>.</param>
/// <param name="somaticVcfPath">Optional; assigned to <c>SomaticVcfPath</c> only when non-null.</param>
public CanvasCallset(
    SingleSampleCallset singleSampleCallset,
    AnalysisDetails analysisDetails,
    IEnumerable<IFileLocation> normalBamPaths,
    NexteraManifest manifest,
    IFileLocation somaticVcfPath)
{
    // Required state.
    AnalysisDetails = analysisDetails;
    Manifest = manifest;
    SingleSampleCallset = singleSampleCallset;

    // Optional state: each property is left untouched when its argument is null.
    if (somaticVcfPath != null)
    {
        SomaticVcfPath = somaticVcfPath;
    }

    if (normalBamPaths != null)
    {
        NormalBamPaths = normalBamPaths.Select(location => new Bam(location));
    }
}
/// <summary>
/// Describes where the pedigree run places its results inside <paramref name="sampleSandbox"/>:
/// one <see cref="IntermediateOutput"/> per sample (keyed by sample id) plus the pedigree-level
/// <c>CNV.vcf.gz</c>.
/// </summary>
/// <param name="pedigreeSamples">Samples of the pedigree; only each sample's <c>Id</c> is used here.</param>
/// <param name="sampleSandbox">Directory in which all output locations are resolved.</param>
/// <returns>The expected output locations wrapped in a <see cref="CanvasSmallPedigreeOutput"/>.</returns>
private CanvasSmallPedigreeOutput GetCanvasOutput(SampleSet<CanvasPedigreeSample> pedigreeSamples, IDirectoryLocation sampleSandbox)
{
    var perSampleOutputs = pedigreeSamples.SelectSamples(sample =>
    {
        var id = sample.Id;
        return new IntermediateOutput(
            new Vcf(SingleSampleCallset.GetVcfOutput(sampleSandbox, id)),
            SingleSampleCallset.GetCoverageAndVariantFrequencyOutput(sampleSandbox, id),
            SingleSampleCallset.GetVfSummaryPath(sampleSandbox, id),
            SingleSampleCallset.GetVfSummaryBafPath(sampleSandbox, id),
            SingleSampleCallset.GetPartitionedPath(sampleSandbox, id),
            SingleSampleCallset.GetCoverageBigWig(sampleSandbox, id),
            SingleSampleCallset.GetBAlleleBedGraph(sampleSandbox, id),
            SingleSampleCallset.GetCopyNumberBedGraph(sampleSandbox, id));
    });

    var pedigreeVcf = new Vcf(sampleSandbox.GetFileLocation("CNV.vcf.gz"));
    return new CanvasSmallPedigreeOutput(pedigreeVcf, perSampleOutputs);
}
/// <summary>
/// Rebuilds the <see cref="CanvasSmallPedigreeOutput"/> for <paramref name="input"/> from the
/// final file locations: every per-sample location is derived from the sample's output stub and
/// the pedigree VCF comes from <c>GetPedigreeVcf()</c>.
/// </summary>
/// <param name="input">Pedigree input; one <see cref="IntermediateOutput"/> is produced per entry in <c>input.Samples</c>.</param>
/// <returns>The pedigree VCF together with the per-sample outputs.</returns>
private CanvasSmallPedigreeOutput Load(CanvasSmallPedigreeInput input)
{
    var perSampleOutputs = input.Samples.SelectData((info, sample) =>
    {
        var outputStub = GetSingleSampleOutputStub(info);
        return new IntermediateOutput(
            new Vcf(SingleSampleCallset.GetVcfOutput(outputStub)),
            SingleSampleCallset.GetCoverageAndVariantFrequencyOutput(outputStub),
            SingleSampleCallset.GetVfSummaryPath(outputStub),
            SingleSampleCallset.GetVfSummaryBafPath(outputStub),
            SingleSampleCallset.GetPartitionedPath(outputStub),
            SingleSampleCallset.GetCoverageBigWig(outputStub),
            SingleSampleCallset.GetBAlleleBedGraph(outputStub),
            SingleSampleCallset.GetCopyNumberBedGraph(outputStub));
    });

    var pedigreeVcf = new Vcf(GetPedigreeVcf());
    return new CanvasSmallPedigreeOutput(pedigreeVcf, perSampleOutputs);
}
/// <summary>
/// Single-sample germline copy-number calling: reads segments from <paramref name="inFile"/>,
/// attaches allele frequencies from <paramref name="variantFrequencyFile"/>, assigns ploidy calls
/// and quality scores, merges neighbouring segments, rescores and filters them, then writes the
/// coverage plot data and the result to <paramref name="outFile"/>.
/// </summary>
/// <param name="variantFrequencyFile">Variant-frequency file read through <c>CanvasIO.ReadFrequenciesWrapper</c>.</param>
/// <param name="inFile">Segment file read through <c>Segments.ReadSegments</c>; its directory is stored in <c>TempFolder</c>.</param>
/// <param name="outFile">Destination passed to <c>CanvasSegmentWriter.WriteSegments</c>; also used to derive the coverage plot path via <c>SingleSampleCallset.GetCoverageAndVariantFrequencyOutputPath</c>.</param>
/// <param name="ploidyVcfPath">Optional. When non-empty, ploidy is loaded with <c>PloidyInfo.LoadPloidyFromVcfFileNoSampleId</c>; otherwise ploidy stays null.</param>
/// <param name="referenceFolder">Forwarded to the coverage-plot and segment writers.</param>
/// <param name="sampleName">Forwarded to <c>CanvasSegmentWriter.WriteSegments</c>.</param>
/// <param name="truthDataPath">Optional. When non-empty, a <c>CopyNumberOracle</c> loads known copy numbers and <c>GenerateReportVersusKnownCopyNumber</c> runs before the final write.</param>
/// <returns>Always 0.</returns>
/// <remarks>
/// If no segments are loaded, a message is printed to the console, the (empty) segment list is
/// written to <paramref name="outFile"/> and the method returns early. At that point this method
/// has not yet assigned <c>_model</c>, hence the null-conditional <c>_model?.DiploidCoverage</c>.
/// Quality scores are assigned twice on purpose: once before merging (merging needs them) and
/// once more on the merged segments.
/// NOTE(review): the local <c>segments</c> list of <c>SegmentInfo</c> is populated but not read
/// again inside this method — confirm whether it is still needed.
/// NOTE(review): <c>MeanCoverage</c> is computed with <c>Average()</c>; presumably this throws
/// when there are no allele coverage values — confirm against the element type.
/// </remarks>
public int CallVariants(string variantFrequencyFile, string inFile, string outFile, string ploidyVcfPath, string referenceFolder, string sampleName, string truthDataPath) { if (!string.IsNullOrEmpty(truthDataPath)) { _cnOracle = new CopyNumberOracle(); _cnOracle.LoadKnownCN(truthDataPath); } _segments = Segments.ReadSegments(_logger, new FileLocation(inFile)); _allSegments = _segments.AllSegments.ToList(); TempFolder = Path.GetDirectoryName(inFile); if (_allSegments.Count == 0) { Console.WriteLine("CanvasDiploidCaller: No segments loaded; no CNV calls will be made."); CanvasSegmentWriter.WriteSegments(outFile, _allSegments, _model?.DiploidCoverage, referenceFolder, sampleName, null, null, QualityFilterThreshold, false, null, null); return(0); } PloidyInfo ploidy = null; if (!string.IsNullOrEmpty(ploidyVcfPath)) { ploidy = PloidyInfo.LoadPloidyFromVcfFileNoSampleId(ploidyVcfPath); } // load MAF var allelesByChromosome = CanvasIO.ReadFrequenciesWrapper(_logger, new FileLocation(variantFrequencyFile), _segments.IntervalsByChromosome); _segments.AddAlleles(allelesByChromosome); MeanCoverage = allelesByChromosome.SelectMany(x => x.Value).SelectMany(y => y.TotalCoverage).Average(); AggregateVariantCoverage(ref _allSegments); // Create new models for different copy number states InitializePloidies(); // Compute statistics on the copy number two regions float[] diploidCounts = AggregateCounts(ref _allSegments); _diploidCoverage = Utilities.Mean(diploidCounts); _coverageWeightingFactor = CoverageWeighting / _diploidCoverage; // new coverage model _model = new CoverageModel { DiploidCoverage = _diploidCoverage }; List <SegmentInfo> segments = new List <SegmentInfo>(); foreach (CanvasSegment segment in _allSegments) { SegmentInfo info = new SegmentInfo { Segment = segment }; List <double> mafs = new List <double>(); foreach (float value in segment.Balleles.Frequencies) { mafs.Add(value > 0.5 ? 
1 - value : value); } if (mafs.Count > 0) { info.Maf = Utilities.Median(mafs); } else { info.Maf = -1; } info.Coverage = Utilities.Median(segment.Counts); info.Weight = _allSegments.Count > 100 ? segment.Length : segment.BinCount; segments.Add(info); } AssignPloidyCallsDistance(_model); CanvasSegment.AssignQualityScores(_allSegments, CanvasSegment.QScoreMethod.LogisticGermline, _germlineScoreParameters); // Merge neighboring segments that got the same copy number call. // merging segments requires quality scores so we do it after quality scores have been assigned var mergedSegments = CanvasSegment.MergeSegments(_allSegments); // recalculating qscores after merging segments improves performance! CanvasSegment.AssignQualityScores(mergedSegments, CanvasSegment.QScoreMethod.LogisticGermline, _germlineScoreParameters); CanvasSegment.SetFilterForSegments(QualityFilterThreshold, mergedSegments, CanvasFilter.SegmentSizeCutoff); List <string> extraHeaders = new List <string>(); var coverageOutputPath = SingleSampleCallset.GetCoverageAndVariantFrequencyOutputPath(outFile); CanvasSegment.WriteCoveragePlotData(mergedSegments, _model.DiploidCoverage, ploidy, coverageOutputPath, referenceFolder); if (_cnOracle != null) { GenerateReportVersusKnownCopyNumber(); } if (!string.IsNullOrEmpty(ploidy?.HeaderLine)) { extraHeaders.Add(ploidy.HeaderLine); } CanvasSegmentWriter.WriteSegments(outFile, mergedSegments, _model.DiploidCoverage, referenceFolder, sampleName, extraHeaders, ploidy, QualityFilterThreshold, false, null, null); return(0); }
/// <summary>
/// Pairs a single-sample callset with the sample's type within the pedigree.
/// </summary>
/// <param name="sample">Stored in <c>Sample</c>.</param>
/// <param name="sampleType">Stored in <c>SampleType</c>.</param>
public PedigreeSample(SingleSampleCallset sample, SampleType sampleType)
{
    this.SampleType = sampleType;
    this.Sample = sample;
}
/// <summary>
/// Multi-sample (pedigree) copy-number calling. For every sample name it loads that sample's
/// segments and variant frequencies, builds sample metrics and a copy-number model, derives
/// segment sets from the common-CNV BED, picks the highest-likelihood segments, calls variants on
/// them in parallel, merges and filters the calls, and finally writes all outputs.
/// </summary>
/// <param name="variantFrequencyFiles">Per-sample variant-frequency files, indexed in the same order as <paramref name="sampleNames"/>.</param>
/// <param name="segmentFiles">Per-sample segment files, indexed in the same order as <paramref name="sampleNames"/>.</param>
/// <param name="outVcfFile">Destination of the multi-sample VCF; its directory also receives every per-sample output.</param>
/// <param name="ploidyBedPath">Forwarded to <c>SampleMetrics.GetSampleInfo</c>.</param>
/// <param name="referenceFolder">Forwarded to the coverage-plot and segment writers.</param>
/// <param name="sampleNames">Sample names; each is wrapped in a <c>SampleId</c> and drives the loading loop.</param>
/// <param name="commonCnvsBedPath">Forwarded to <c>CreateSegmentSetsFromCommonCnvs</c>.</param>
/// <param name="sampleTypes">Per-sample types, indexed in the same order as <paramref name="sampleNames"/>; used to build the pedigree info.</param>
/// <returns>Always 0.</returns>
/// <remarks>
/// The three per-sample lists are read by position (<c>fileCounter</c>), so each needs at least as
/// many entries as <paramref name="sampleNames"/>; no length check is performed here.
/// Variant calling runs under <c>Parallel.ForEach</c> with the degree of parallelism capped at the
/// smaller of <c>Environment.ProcessorCount</c> and <c>_callerParameters.MaxCoreNumber</c>.
/// The de novo quality threshold is passed to the writers only when pedigree info is non-null and
/// <c>HasFullPedigree()</c> is true; otherwise null is passed.
/// Outputs written: per-sample coverage plot data, the multi-sample VCF, then per sample a VCF,
/// a coverage bigwig (moved into place only if the writer returned non-null), a copy-number
/// bedgraph and a partition bedgraph built from the originally loaded segments.
/// NOTE(review): the <c>VisualizationTemp{sampleId}</c> subdirectory is created and not removed
/// inside this method — confirm whether cleanup happens elsewhere.
/// </remarks>
internal int CallVariants(List <string> variantFrequencyFiles, List <string> segmentFiles, IFileLocation outVcfFile, string ploidyBedPath, string referenceFolder, List <string> sampleNames, string commonCnvsBedPath, List <SampleType> sampleTypes) { // load files // initialize data structures and classes var fileCounter = 0; var samplesInfo = new SampleMap <SampleMetrics>(); var sampleSegments = new SampleMap <Segments>(); var copyNumberModels = new SampleMap <ICopyNumberModel>(); var variantFrequencyFilesSampleList = new SampleMap <string>(); var kinships = new SampleMap <SampleType>(); foreach (string sampleName in sampleNames) { var sampleId = new SampleId(sampleName); var segment = Segments.ReadSegments(_logger, new FileLocation(segmentFiles[fileCounter])); segment.AddAlleles(CanvasIO.ReadFrequenciesWrapper(_logger, new FileLocation(variantFrequencyFiles[fileCounter]), segment.IntervalsByChromosome)); sampleSegments.Add(sampleId, segment); var sampleInfo = SampleMetrics.GetSampleInfo(segment.AllSegments, ploidyBedPath, _callerParameters.NumberOfTrimmedBins, sampleId); var copyNumberModel = _copyNumberModelFactory.CreateModel(_callerParameters.MaximumCopyNumber, sampleInfo.MaxCoverage, sampleInfo.MeanCoverage, sampleInfo.MeanMafCoverage); samplesInfo.Add(sampleId, sampleInfo); copyNumberModels.Add(sampleId, copyNumberModel); variantFrequencyFilesSampleList.Add(sampleId, variantFrequencyFiles[fileCounter]); kinships.Add(sampleId, sampleTypes[fileCounter]); fileCounter++; } var segmentSetsFromCommonCnvs = CreateSegmentSetsFromCommonCnvs(variantFrequencyFilesSampleList, _callerParameters.MinAlleleCountsThreshold, commonCnvsBedPath, sampleSegments); var segmentsForVariantCalling = GetHighestLikelihoodSegments(segmentSetsFromCommonCnvs, samplesInfo, copyNumberModels).ToList(); PedigreeInfo pedigreeInfo = PedigreeInfo.GetPedigreeInfo(kinships, _callerParameters); Parallel.ForEach( segmentsForVariantCalling, new ParallelOptions { MaxDegreeOfParallelism = 
Math.Min(Environment.ProcessorCount, _callerParameters.MaxCoreNumber) }, segments => _variantCaller.CallVariant(segments, samplesInfo, copyNumberModels, pedigreeInfo) ); var variantCalledSegments = new SampleMap <List <CanvasSegment> >(); foreach (var key in samplesInfo.SampleIds) { variantCalledSegments.Add(key, segmentsForVariantCalling.Select(segment => segment[key]).ToList()); } var mergedVariantCalledSegments = MergeSegments(variantCalledSegments, _callerParameters.MinimumCallSize, _qualityFilterThreshold); FilterExcessivelyShortSegments(mergedVariantCalledSegments); var outputFolder = outVcfFile.Directory; foreach (var sampleId in samplesInfo.SampleIds) { var coverageOutputPath = SingleSampleCallset.GetCoverageAndVariantFrequencyOutput(outputFolder, sampleId.ToString()); CanvasSegment.WriteCoveragePlotData(mergedVariantCalledSegments[sampleId], samplesInfo[sampleId].MeanCoverage, samplesInfo[sampleId].Ploidy, coverageOutputPath, referenceFolder); } bool isPedigreeInfoSupplied = pedigreeInfo != null && pedigreeInfo.HasFullPedigree(); var denovoQualityThreshold = isPedigreeInfoSupplied ? 
(int?)_deNovoQualityFilterThreshold : null; var ploidies = samplesInfo.Select(info => info.Value.Ploidy).ToList(); var diploidCoverage = samplesInfo.Select(info => info.Value.MeanCoverage).ToList(); var names = samplesInfo.SampleIds.Select(id => id.ToString()).ToList(); CanvasSegmentWriter.WriteMultiSampleSegments(outVcfFile.FullName, mergedVariantCalledSegments, diploidCoverage, referenceFolder, names, null, ploidies, _qualityFilterThreshold, denovoQualityThreshold, CanvasFilter.SegmentSizeCutoff, isPedigreeInfoSupplied); foreach (var sampleId in samplesInfo.SampleIds) { var outputVcfPath = SingleSampleCallset.GetVcfOutput(outputFolder, sampleId.ToString()); var sampleMetrics = samplesInfo[sampleId]; var segments = mergedVariantCalledSegments[sampleId]; CanvasSegmentWriter.WriteSegments(outputVcfPath.FullName, segments, sampleMetrics.MeanCoverage, referenceFolder, sampleId.ToString(), null, sampleMetrics.Ploidy, _qualityFilterThreshold, isPedigreeInfoSupplied, denovoQualityThreshold, null); var visualizationTemp = outputFolder.CreateSubdirectory($"VisualizationTemp{sampleId}"); var normalizationFactor = NormalizationCalculator.ComputeNormalizationFactor(segments); var bigWig = _coverageBigWigWriter.Write(segments, visualizationTemp, normalizationFactor); bigWig?.MoveTo(SingleSampleCallset.GetCoverageBigWig(outputFolder, sampleId.ToString())); var copyNumberBedGraph = SingleSampleCallset.GetCopyNumberBedGraph(outputFolder, sampleId.ToString()); _copyNumberBedGraphWriter.Write(segments, sampleMetrics.Ploidy, copyNumberBedGraph); var partitionBedgraphHeader = "track type=bedGraph visibility=full autoScale=on graphType=points"; var originalSegments = sampleSegments[sampleId]; _partitionCoverageBedGraphWriter.Write(originalSegments.AllSegments, SingleSampleCallset.GetPartitionBedGraph(outputFolder, sampleId.ToString()), normalizationFactor, partitionBedgraphHeader); } return(0); }
/// <summary>
/// Single-sample germline copy-number calling: reads segments from <paramref name="inFile"/>,
/// loads allele frequencies from <paramref name="variantFrequencyFile"/>, assigns copy-number
/// calls and quality scores, merges neighbouring segments in place, rescores and filters them,
/// then writes the coverage plot data and the result to <paramref name="outFile"/>.
/// </summary>
/// <param name="variantFrequencyFile">Variant-frequency file loaded through <c>CanvasIO.LoadFrequencies</c>, whose return value is stored in <c>MeanCoverage</c>.</param>
/// <param name="inFile">Segment file read through <c>CanvasSegment.ReadSegments</c>; its directory is stored in <c>TempFolder</c>.</param>
/// <param name="outFile">Destination passed to <c>CanvasSegmentWriter.WriteSegments</c>; also used to derive the coverage plot path via <c>SingleSampleCallset.GetCoverageAndVariantFrequencyOutputPath</c>.</param>
/// <param name="ploidyBedPath">Optional. When non-empty, ploidy is loaded with <c>PloidyInfo.LoadPloidyFromBedFile</c>; otherwise ploidy stays null.</param>
/// <param name="referenceFolder">Forwarded to the coverage-plot and segment writers.</param>
/// <param name="sampleName">Forwarded to <c>CanvasSegmentWriter.WriteSegments</c>.</param>
/// <param name="truthDataPath">Optional. When non-empty, a <c>CopyNumberOracle</c> loads known copy numbers and <c>GenerateReportVersusKnownCN</c> runs before the final write.</param>
/// <returns>Always 0.</returns>
/// <remarks>
/// If no segments are loaded, a message is printed to the console, the (empty) segment list is
/// written to <paramref name="outFile"/> and the method returns early. At that point this method
/// has not yet assigned <c>Model</c>, hence the null-conditional <c>Model?.DiploidCoverage</c>.
/// <c>useGaussianMixtureModel</c> is hard-coded to false, so <c>AssignPloidyCallsDistance</c>
/// always runs and the Gaussian-mixture branch (including its unused <c>likelihood</c> local) is
/// never executed.
/// Segment weight is the segment span (<c>End - Begin</c>) when more than 100 segments were
/// loaded, and the bin count otherwise.
/// Quality scores are assigned twice on purpose: once before merging (merging needs them) and
/// once more on the merged segments.
/// </remarks>
public int CallVariants(string variantFrequencyFile, string inFile, string outFile, string ploidyBedPath, string referenceFolder, string sampleName, string truthDataPath) { if (!string.IsNullOrEmpty(truthDataPath)) { this.CNOracle = new CopyNumberOracle(); this.CNOracle.LoadKnownCN(truthDataPath); } this.Segments = CanvasSegment.ReadSegments(inFile); this.TempFolder = Path.GetDirectoryName(inFile); if (this.Segments.Count == 0) { Console.WriteLine("CanvasDiploidCaller: No segments loaded; no CNV calls will be made."); CanvasSegmentWriter.WriteSegments(outFile, this.Segments, Model?.DiploidCoverage, referenceFolder, sampleName, null, null, QualityFilterThreshold, isPedigreeInfoSupplied: false); return(0); } PloidyInfo ploidy = null; if (!string.IsNullOrEmpty(ploidyBedPath)) { ploidy = PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath); } // load MAF this.MeanCoverage = CanvasIO.LoadFrequencies(variantFrequencyFile, this.Segments); int medianVariantCoverage = AggregateVariantCoverage(ref this.Segments); // Create new models for different copy number states this.InitializePloidies(); // Compute statistics on the copy number two regions float[] diploidCounts = AggregateCounts(ref this.Segments); DiploidCoverage = CanvasCommon.Utilities.Mean(diploidCounts); CoverageWeightingFactor = CoverageWeighting / DiploidCoverage; // new coverage model this.Model = new CoverageModel(); Model.DiploidCoverage = DiploidCoverage; List <SegmentInfo> segments = new List <SegmentInfo>(); foreach (CanvasSegment segment in this.Segments) { SegmentInfo info = new SegmentInfo(); info.Segment = segment; List <double> MAF = new List <double>(); foreach (float value in segment.Alleles.Frequencies) { MAF.Add(value > 0.5 ? 
1 - value : value); } if (MAF.Count > 0) { info.MAF = CanvasCommon.Utilities.Median(MAF); } else { info.MAF = -1; } info.Coverage = CanvasCommon.Utilities.Median(segment.Counts); if (this.Segments.Count > 100) { info.Weight = segment.End - segment.Begin; } else { info.Weight = segment.BinCount; } segments.Add(info); } // Assign copy number and major chromosome count for each segment bool useGaussianMixtureModel = false; // For now, this is set false, since we saw weird performance on chrY (CANV-115): if (useGaussianMixtureModel) { // optimize model covariance double likelihood = FitGaussians(Model, segments); AssignPloidyCallsGaussianMixture(); } else { AssignPloidyCallsDistance(Model, segments, medianVariantCoverage); } CanvasSegment.AssignQualityScores(this.Segments, CanvasSegment.QScoreMethod.LogisticGermline, germlineScoreParameters); // Merge neighboring segments that got the same copy number call. // merging segments requires quality scores so we do it after quality scores have been assigned CanvasSegment.MergeSegments(ref this.Segments); // recalculating qscores after merging segments improves performance! CanvasSegment.AssignQualityScores(this.Segments, CanvasSegment.QScoreMethod.LogisticGermline, germlineScoreParameters); CanvasSegment.FilterSegments(QualityFilterThreshold, Segments); List <string> extraHeaders = new List <string>(); string coverageOutputPath = SingleSampleCallset.GetCoverageAndVariantFrequencyOutputPath(outFile); CanvasSegment.WriteCoveragePlotData(this.Segments, Model.DiploidCoverage, ploidy, coverageOutputPath, referenceFolder); if (this.CNOracle != null) { this.GenerateReportVersusKnownCN(); } if (ploidy != null && !string.IsNullOrEmpty(ploidy.HeaderLine)) { extraHeaders.Add(ploidy.HeaderLine); } CanvasSegmentWriter.WriteSegments(outFile, this.Segments, Model.DiploidCoverage, referenceFolder, sampleName, extraHeaders, ploidy, QualityFilterThreshold, isPedigreeInfoSupplied: false); return(0); }