/// <summary>
/// Wavelets: segments coverage with unbalanced HAAR wavelets and returns the derived segments per chromosome.
/// </summary>
/// <param name="segmentationInput">Coverage, bin coordinates and VAF chromosomes to segment.</param>
/// <param name="windowSize">Window size passed to the coverage-variability and evenness calculations.</param>
/// <returns>Segments keyed by every chromosome found in <c>segmentationInput.VafByChr</c>.</returns>
public Dictionary<string, SegmentationInput.Segment[]> Run(SegmentationInput segmentationInput, int windowSize)
{
    double? coverageVariability = segmentationInput.GetCoverageVariability(windowSize);
    var variabilityFactors = segmentationInput.FactorOfThreeCoverageVariabilities();

    // The evenness score is only written out as a metric; any failure is reported and segmentation continues.
    try
    {
        double score = segmentationInput.GetEvennessScore(windowSize);
        bool hasMetricFile = !segmentationInput.EvennessMetricFile.IsNullOrEmpty();
        if (hasMetricFile)
        {
            CanvasIO.WriteEvennessMetricToTextFile(segmentationInput.EvennessMetricFile, score);
        }
    }
    catch (Exception)
    {
        Console.Error.WriteLine("Unable to calculate an evenness score, using coverage for segmentation");
    }

    var rawBreakpoints = LaunchWavelets(
        segmentationInput.CoverageInfo.CoverageByChr,
        segmentationInput.CoverageInfo.StartByChr,
        segmentationInput.CoverageInfo.EndByChr,
        coverageVariability,
        variabilityFactors);

    // No VAF-containing bins are supplied to the breakpoint adjustment.
    Dictionary<string, List<int>> finalBreakpoints = AdjustBreakpoints(
        segmentationInput.CoverageInfo.CoverageByChr,
        rawBreakpoints,
        vafContainingBinsByChr: null);

    var segmentsByChr = new Dictionary<string, SegmentationInput.Segment[]>();
    foreach (string chromosome in segmentationInput.VafByChr.Keys)
    {
        var derived = SegmentationInput.DeriveSegments(
            finalBreakpoints[chromosome],
            segmentationInput.CoverageInfo.CoverageByChr[chromosome].Length,
            segmentationInput.CoverageInfo.StartByChr[chromosome],
            segmentationInput.CoverageInfo.EndByChr[chromosome]);
        segmentsByChr[chromosome] = derived;
    }

    return segmentsByChr;
}
/// <summary>
/// Feeds three variant-count records for chr22 into CanvasIO.ReadFrequencies together with two intervals
/// and checks that one allele collection is returned per interval, holding two and one entries respectively.
/// </summary>
public void TestReadFrequencies()
{
    const string chr = "chr22";
    var intervals = new List<BedInterval>
    {
        new BedInterval(1, 50),
        new BedInterval(51, 150),
    };
    var intervalsByChromosome = new Dictionary<string, List<BedInterval>>
    {
        { chr, intervals }
    };

    // Tab-separated records at positions 10, 20 and 100.
    var variantCounts = "";
    variantCounts += "chr22\t10\tC\tT\t20\t10\n";
    variantCounts += "chr22\t20\tC\tT\t30\t20\n";
    variantCounts += "chr22\t100\tC\tT\t40\t30\n";

    var stringReader = new StringReader(variantCounts);
    using (var reader = new GzipOrTextReader(stringReader))
    {
        var allelesByChromosome = CanvasIO.ReadFrequencies(reader, intervalsByChromosome);

        // Assert.Equal takes (expected, actual); the interval count is the expectation here.
        Assert.Equal(intervals.Count, allelesByChromosome[chr].Count);
        Assert.Equal(2, allelesByChromosome[chr].First().Size());
        Assert.Equal(1, allelesByChromosome[chr].Last().Size());
    }
}
/// <summary>
/// Converts ratios to counts and writes them to <paramref name="outputPath"/>.
/// The reference ploidy is loaded from <paramref name="referencePloidyBedFile"/> only when that location
/// is non-null and exists; otherwise a null ploidy is passed to the conversion.
/// </summary>
public static void RatiosToCounts(IEnumerable<SampleGenomicBin> ratios, IFileLocation referencePloidyBedFile, IFileLocation outputPath)
{
    bool hasPloidyBed = referencePloidyBedFile != null && referencePloidyBedFile.Exists;
    PloidyInfo referencePloidy = hasPloidyBed
        ? PloidyInfo.LoadPloidyFromBedFile(referencePloidyBedFile.FullName)
        : null;

    string destination = outputPath.FullName;
    var counts = RatiosToCounts(ratios, referencePloidy);
    CanvasIO.WriteToTextFile(destination, counts);
}
/// <summary>
/// Parses the CanvasSNV output at InputVafPath and stores, per chromosome, the average variant frequency
/// of every coverage bin that received at least one frequency value.
/// On any exception the error is written to stderr and the process exits with code 1.
/// </summary>
/// <param name="referenceFolder">Not read by this method.</param>
public void LoadVAFInput(string referenceFolder)
{
    try
    {
        var frequenciesByChr = new Dictionary<string, List<List<double>>>();
        var binIntervalsByChr = new Dictionary<string, List<BedInterval>>();

        // One (initially empty) frequency list and one interval per coverage bin.
        foreach (string chromosome in CoverageInfo.StartByChr.Keys)
        {
            var starts = CoverageInfo.StartByChr[chromosome];
            var perBinFrequencies = new List<List<double>>(starts.Length);
            var binIntervals = new List<BedInterval>();
            frequenciesByChr[chromosome] = perBinFrequencies;
            binIntervalsByChr[chromosome] = binIntervals;

            for (int binIndex = 0; binIndex < starts.Length; binIndex++)
            {
                perBinFrequencies.Add(new List<double>());
                int binStart = Convert.ToInt32(starts[binIndex]);
                int binEnd = Convert.ToInt32(CoverageInfo.EndByChr[chromosome][binIndex]);
                binIntervals.Add(new BedInterval(binStart, binEnd));
            }
        }

        var alleleCounts = CanvasIO.ReadFrequenciesWrapper(_logger, new FileLocation(this.InputVafPath), binIntervalsByChr);

        // Replace each bin's placeholder with the frequencies read for that bin.
        foreach (var chromosome in alleleCounts.Keys)
        {
            var chromosomeCounts = alleleCounts[chromosome];
            for (int binIndex = 0; binIndex < chromosomeCounts.Count; binIndex++)
            {
                frequenciesByChr[chromosome][binIndex] = chromosomeCounts[binIndex].MaxFrequencies;
            }
        }

        // Keep only bins that actually carry frequencies, remembering their bin index.
        foreach (string chromosome in frequenciesByChr.Keys)
        {
            var binsWithVaf = new List<VafContainingBins>();
            VafByChr[chromosome] = binsWithVaf;

            var perBinFrequencies = frequenciesByChr[chromosome];
            for (int binIndex = 0; binIndex < perBinFrequencies.Count; binIndex++)
            {
                var frequencies = perBinFrequencies[binIndex];
                if (frequencies.Count > 0)
                {
                    binsWithVaf.Add(new VafContainingBins(binIndex, frequencies.Average()));
                }
            }
        }

        _logger.Info("Done processing VAFs\n");
    }
    catch (Exception e)
    {
        Console.Error.WriteLine("File {0} could not be read:", this.InputVafPath);
        Console.Error.WriteLine(e.Message);
        Environment.Exit(1);
    }
}
/// <summary>
/// Instantiates a new input switch under the left border and adds it to the input list.
/// Every input is then spaced evenly along the border's height, its IO position is set to the
/// border's right-hand side (converted with Camera.main.ScreenToWorldPoint) and its IO is flagged with input = false.
/// </summary>
public void AddInput()
{
    CanvasIO created = Instantiate(inputOutputPrefab, leftBorderTransform).GetComponent<CanvasIO>();
    inputs.Add(created);

    RectTransform border = leftBorderTransform.GetComponent<RectTransform>();
    int slot = 0;
    foreach (CanvasIO entry in inputs)
    {
        // Slots divide the border height into (count + 1) equal gaps, starting from the lower edge.
        float x = leftBorderTransform.transform.position.x;
        float y = leftBorderTransform.position.y - (border.rect.size.y / 2) + (border.rect.size.y / (inputs.Count + 1)) * (slot + 1);
        entry.transform.position = new Vector2(x, y);

        entry.io.pos = new Vector2(entry.transform.position.x + (border.rect.size.x / 2), entry.transform.position.y);
        entry.io.pos = Camera.main.ScreenToWorldPoint(entry.io.pos);
        entry.io.input = false;

        slot++;
    }
}
/// <summary>
/// Deletes the last output from the right border: its IO is flushed, it is removed from the output list
/// and its game object is destroyed. The remaining outputs are then spaced evenly along the border and
/// their IO positions are refreshed. Does nothing when there is no output to delete.
/// </summary>
public void DeleteOutput()
{
    // Nothing to delete; indexing an empty list would throw.
    if (outputs.Count == 0)
    {
        return;
    }

    int lastIndex = outputs.Count - 1;
    CanvasIO toDelete = outputs[lastIndex];
    toDelete.io.FlushIO();
    // Remove by index: no linear search, and it is always exactly the element fetched above.
    outputs.RemoveAt(lastIndex);
    Destroy(toDelete.gameObject);

    RectTransform border = rightBorderTransform.GetComponent<RectTransform>();
    int index = 0;
    foreach (CanvasIO c in outputs)
    {
        // Slots divide the border height into (count + 1) equal gaps, starting from the lower edge.
        c.transform.position = new Vector2(
            rightBorderTransform.transform.position.x,
            rightBorderTransform.position.y - (border.rect.size.y / 2) + (border.rect.size.y / (outputs.Count + 1)) * (index + 1));

        // The IO anchor sits on the border's left-hand side.
        c.io.pos = new Vector2(c.transform.position.x - (border.rect.size.x / 2), c.transform.position.y);
        c.io.pos = Camera.main.ScreenToWorldPoint(c.io.pos);
        index++;
    }
}
/// <summary>
/// Instantiates a new output switch under the right border and adds it to the output list.
/// Every output is then spaced evenly along the border's height, its IO position is set to the
/// border's left-hand side (converted with Camera.main.ScreenToWorldPoint), its IO is flagged with
/// input = true and ChangeState() is called on it.
/// </summary>
public void AddOutput()
{
    CanvasIO created = Instantiate(inputOutputPrefab, rightBorderTransform).GetComponent<CanvasIO>();
    outputs.Add(created);

    RectTransform border = rightBorderTransform.GetComponent<RectTransform>();
    int slot = 0;
    foreach (CanvasIO entry in outputs)
    {
        // Slots divide the border height into (count + 1) equal gaps, starting from the lower edge.
        float x = rightBorderTransform.transform.position.x;
        float y = rightBorderTransform.position.y - (border.rect.size.y / 2) + (border.rect.size.y / (outputs.Count + 1)) * (slot + 1);
        entry.transform.position = new Vector2(x, y);

        entry.io.pos = new Vector2(entry.transform.position.x - (border.rect.size.x / 2), entry.transform.position.y);
        entry.io.pos = Camera.main.ScreenToWorldPoint(entry.io.pos);
        entry.io.input = true;
        entry.ChangeState();

        slot++;
    }
}
/// <summary>
/// Builds, for every sample, the overlapping-segment regions used for calling.
/// Without a common CNVs bed file each sample segment becomes its own region. Otherwise the common
/// regions are read from the bed file, remapped to bin coordinates per chromosome, and combined with
/// each sample's variant frequencies and segments through GetSegmentSets.
/// </summary>
/// <param name="variantFrequencyFiles">Variant frequency file path per sample.</param>
/// <param name="defaultAlleleCountThreshold">Allele count threshold forwarded to GetSegmentSets.</param>
/// <param name="commonCNVsbedPath">Path to the common CNVs bed file, or null to skip common CNVs.</param>
/// <param name="sampleSegments">Segments per sample.</param>
/// <exception cref="ArgumentException">Thrown when the chromosome-name check against the bed file fails.</exception>
private IEnumerable<ISampleMap<OverlappingSegmentsRegion>> CreateSegmentSetsFromCommonCnvs(ISampleMap<string> variantFrequencyFiles, int defaultAlleleCountThreshold, string commonCNVsbedPath, ISampleMap<Segments> sampleSegments)
{
    if (commonCNVsbedPath == null)
    {
        var defaultSampleRegions = sampleSegments
            .SelectValues(segments => segments.AllSegments.Select(segment => new OverlappingSegmentsRegion(segment)).ToList());
        return GetOverlappingSegmentsRegionSampleLists(defaultSampleRegions);
    }

    var commonRegions = ReadCommonRegions(commonCNVsbedPath);
    // Chromosomes and genomic bins are taken from the first sample only.
    var firstSampleSegments = sampleSegments.Values.First();
    var chromosomes = firstSampleSegments.GetChromosomes();

    // NOTE(review): throwing when a method named "IsIdentical..." returns true reads as inverted;
    // the helper is not visible here, so confirm what it actually returns before changing this.
    if (IsIdenticalChromosomeNames(commonRegions, chromosomes))
    {
        throw new ArgumentException(
            $"Chromosome names in a common CNVs bed file {commonCNVsbedPath} does not match the genome reference");
    }

    var segmentIntervalsByChromosome = new Dictionary<string, List<BedInterval>>();
    var genomicBinsByChromosome = new Dictionary<string, IReadOnlyList<SampleGenomicBin>>();

    // Dictionary<TKey, TValue> does not support concurrent writers, so the per-chromosome work runs in
    // parallel but the results are stored under a lock.
    var resultsLock = new object();
    Parallel.ForEach(
        chromosomes,
        chr =>
        {
            IReadOnlyList<SampleGenomicBin> genomicBins = firstSampleSegments.GetGenomicBinsForChromosome(chr);
            List<BedInterval> segmentIntervals = CanvasSegment.RemapGenomicToBinCoordinates(commonRegions[chr], genomicBins);
            lock (resultsLock)
            {
                genomicBinsByChromosome[chr] = genomicBins;
                segmentIntervalsByChromosome[chr] = segmentIntervals;
            }
        });

    var sampleRegions = new SampleMap<List<OverlappingSegmentsRegion>>();
    foreach (var sampleId in sampleSegments.SampleIds)
    {
        // A fresh interval dictionary is built for every sample before reading its frequencies.
        var commonIntervals = commonRegions.ToDictionary(
            kvp => kvp.Key,
            kvp => kvp.Value.Select(bedEntry => bedEntry.Interval).ToList());
        var allelesByChromosomeCommonSegs = CanvasIO.ReadFrequenciesWrapper(_logger,
            new FileLocation(variantFrequencyFiles[sampleId]), commonIntervals);
        var segmentsSets = GetSegmentSets(defaultAlleleCountThreshold, commonRegions, genomicBinsByChromosome,
            segmentIntervalsByChromosome, allelesByChromosomeCommonSegs, sampleSegments[sampleId]);
        sampleRegions.Add(sampleId, segmentsSets);
    }

    return GetOverlappingSegmentsRegionSampleLists(sampleRegions);
}
/// <summary>
/// Writes a copy-number data (cnd) file: one CSV row per ratio bin holding the fragment count,
/// the reference count, the bin coordinates and the ratio bin's count.
/// </summary>
/// <param name="fragmentCountFile">File whose bins supply the "Fragment Count" column.</param>
/// <param name="referenceCountFile">File whose bins supply the "Reference Count" column.</param>
/// <param name="ratios">Bins supplying the "Unsmoothed Log Ratio" column; may omit bins present in the two files.</param>
/// <param name="outputFile">Destination file; created or overwritten.</param>
public static void WriteCndFile(IFileLocation fragmentCountFile, IFileLocation referenceCountFile, IEnumerable<SampleGenomicBin> ratios, IFileLocation outputFile)
{
    IEnumerable<SampleGenomicBin> fragmentBins = CanvasIO.IterateThroughTextFile(fragmentCountFile.FullName);
    IEnumerable<SampleGenomicBin> referenceBins = CanvasIO.IterateThroughTextFile(referenceCountFile.FullName);

    using (var fragmentCursor = fragmentBins.GetEnumerator())
    using (var referenceCursor = referenceBins.GetEnumerator())
    using (var ratioCursor = ratios.GetEnumerator())
    using (var stream = new FileStream(outputFile.FullName, FileMode.Create, FileAccess.Write))
    using (var writer = new StreamWriter(stream))
    {
        writer.WriteLine(CSVWriter.GetLine("Fragment Count", "Reference Count", "Chromosome", "Start", "End", "Unsmoothed Log Ratio"));

        while (fragmentCursor.MoveNext() && referenceCursor.MoveNext() && ratioCursor.MoveNext())
        {
            var ratioBin = ratioCursor.Current;

            // Some bins could have been skipped when calculating the ratios,
            // so advance the fragment cursor until it reaches the ratio bin.
            while (!fragmentCursor.Current.IsSameBin(ratioBin))
            {
                bool fragmentsExhausted = !fragmentCursor.MoveNext();
                if (fragmentsExhausted)
                {
                    throw new Illumina.Common.IlluminaException("Fragment bins and ratio bins are not in the same order.");
                }
            }

            // Same catch-up for the reference cursor.
            while (!referenceCursor.Current.IsSameBin(ratioBin))
            {
                bool referencesExhausted = !referenceCursor.MoveNext();
                if (referencesExhausted)
                {
                    throw new Illumina.Common.IlluminaException("Reference bins and ratio bins are not in the same order.");
                }
            }

            var fragmentBin = fragmentCursor.Current;
            var referenceBin = referenceCursor.Current;
            bool aligned = fragmentBin.IsSameBin(referenceBin) && fragmentBin.IsSameBin(ratioBin);
            if (!aligned)
            {
                throw new Illumina.Common.IlluminaException("Bins are not in the same order.");
            }

            string row = CSVWriter.GetLine(
                fragmentBin.Count.ToString(),
                referenceBin.Count.ToString(),
                fragmentBin.GenomicBin.Chromosome,
                fragmentBin.Start.ToString(),
                fragmentBin.Stop.ToString(),
                ratioBin.Count.ToString());
            writer.WriteLine(row);
        }
    }
}
/// <summary>
/// Builds a reference count file for the sample: the sample bin counts are centered on the model mean,
/// projected onto the model axes, un-centered, scaled by the on-target median sample/reference ratio
/// and written to <paramref name="outputFile"/>.
/// </summary>
/// <param name="outputFile">Destination of the reference counts.</param>
public void Run(IFileLocation outputFile)
{
    List<SampleGenomicBin> sampleBins = CanvasIO.ReadFromTextFile(_sampleBinnedFile.FullName);
    VerifyBinOrder(sampleBins);

    // set bin count to 1 if less than 1
    foreach (var bin in sampleBins)
    {
        bin.Count = Math.Max(1, bin.Count);
    }

    // center the sample
    var centeredSampleVector = Enumerable.Zip(sampleBins, _model.Mu, (bin, mu) => (double)bin.Count - mu.Count).ToArray();

    // project onto the axes
    var projectedSampleVector = CanvasCommon.Utilities.Project(centeredSampleVector, _model.Axes);

    // undo centering and set bin count to 1 if less than 1;
    // materialized once because the values are consumed twice below
    var referenceVector = Enumerable.Zip(_model.Mu, projectedSampleVector, (bin, count) => Math.Max(1, bin.Count + count)).ToList();

    // The temporary reference count file is only needed to compute the median ratio;
    // the finally block removes it even if writing or the ratio calculation throws.
    var tempReferenceFile = new FileLocation(Path.GetTempFileName());
    double medianRatio;
    try
    {
        var tempReferenceBins = Enumerable.Zip(sampleBins, referenceVector,
            (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)count));
        CanvasIO.WriteToTextFile(tempReferenceFile.FullName, tempReferenceBins);

        // calculate median ratio
        var ratios = new BinCounts(_ratioCalculator.Run(_sampleBinnedFile, tempReferenceFile), _manifest);
        medianRatio = ratios.OnTargetMedianBinCount;
    }
    finally
    {
        tempReferenceFile.Delete();
    }

    // multiply reference counts by the median ratio
    var referenceBins = Enumerable.Zip(sampleBins, referenceVector,
        (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)(count * medianRatio)));

    // write reference count file
    CanvasIO.WriteToTextFile(outputFile.FullName, referenceBins);
}
/// <summary>
/// Reads the bins of <paramref name="inputFile"/> grouped by chromosome, smooths each chromosome in
/// parallel with a RepeatedMedianSmoother and writes the smoothed bins to <paramref name="outputFile"/>
/// in the chromosome order of the input dictionary's keys.
/// </summary>
/// <returns>Always 0.</returns>
public int Run(IFileLocation inputFile, IFileLocation outputFile)
{
    // read input bins
    var inputBinsByChrom = CanvasIO.GetGenomicBinsByChrom(inputFile.FullName);

    // smooth bins on each chromosome
    var medianSmoother = new RepeatedMedianSmoother(MaxHalfWindowSize);
    var chromosomeNames = inputBinsByChrom.Keys;
    var resultsByChrom = new ConcurrentDictionary<string, List<SampleGenomicBin>>();

    Console.WriteLine("Launch smoothing jobs...");
    Parallel.ForEach(chromosomeNames, name =>
    {
        var smoothedBins = medianSmoother.Smooth(inputBinsByChrom[name]);
        resultsByChrom[name] = smoothedBins;
    });
    Console.WriteLine("Completed smoothing jobs.");

    // write smoothed bins
    string destination = outputFile.FullName;
    var orderedBins = chromosomeNames.SelectMany(name => resultsByChrom[name]);
    CanvasIO.WriteToTextFile(destination, orderedBins);

    return 0;
}
/// <summary>
/// Streams the sample and reference bin files side by side and yields one bin per pair whose count is
/// the sample count divided by the reference count. Pairs whose reference count lies outside
/// [_minReferenceCount, _maxReferenceCount] are skipped.
/// </summary>
/// <remarks>
/// This is an iterator method: the file-existence checks and all reading run only when the result is enumerated.
/// </remarks>
/// <exception cref="FileNotFoundException">Raised during enumeration when either input file does not exist.</exception>
public IEnumerable<SampleGenomicBin> Run(IFileLocation sampleBedFile, IFileLocation referenceBedFile)
{
    if (!sampleBedFile.Exists)
    {
        throw new FileNotFoundException(sampleBedFile.FullName + " does not exist.");
    }
    if (!referenceBedFile.Exists)
    {
        throw new FileNotFoundException(referenceBedFile.FullName + " does not exist.");
    }

    var sampleBins = CanvasIO.IterateThroughTextFile(sampleBedFile.FullName);
    var referenceBins = CanvasIO.IterateThroughTextFile(referenceBedFile.FullName);

    using (var eSampleBins = sampleBins.GetEnumerator())
    using (var eReferenceBins = referenceBins.GetEnumerator())
    {
        while (eSampleBins.MoveNext() && eReferenceBins.MoveNext())
        {
            var sampleBin = eSampleBins.Current;
            var referenceBin = eReferenceBins.Current;

            // Bins with extreme reference counts introduce large variance into the ratios.
            // It would be better to just drop these bins so we don't introduce too much noise into segmentation and CNV calling.
            if (referenceBin.Count < _minReferenceCount || referenceBin.Count > _maxReferenceCount)
            {
                continue; // skip the bin
            }

            double ratio = sampleBin.Count / referenceBin.Count;
            yield return new SampleGenomicBin(sampleBin.GenomicBin.Chromosome, sampleBin.Start, sampleBin.Stop, sampleBin.GenomicBin.GC, (float)ratio);
        }
    }
}
/// <summary>
/// Streams the sample and reference bin files side by side and yields one bin per pair whose count is
/// sample / reference scaled by a library size factor. The factor is referenceMedian / sampleMedian of
/// the on-target median bin counts when both medians are positive, otherwise 1.
/// Pairs whose reference count is below 1 are skipped.
/// </summary>
/// <remarks>
/// This is an iterator method: the file checks, the median computation and all reading run only when the result is enumerated.
/// </remarks>
/// <exception cref="FileNotFoundException">Raised during enumeration when either input file does not exist.</exception>
public IEnumerable<SampleGenomicBin> Run(IFileLocation sampleBedFile, IFileLocation referenceBedFile)
{
    if (!sampleBedFile.Exists)
    {
        throw new FileNotFoundException(sampleBedFile.FullName + " does not exist.");
    }
    if (!referenceBedFile.Exists)
    {
        throw new FileNotFoundException(referenceBedFile.FullName + " does not exist.");
    }

    var sampleSource = CanvasIO.IterateThroughTextFile(sampleBedFile.FullName);
    var referenceSource = CanvasIO.IterateThroughTextFile(referenceBedFile.FullName);

    var sampleCounts = new BinCounts(sampleSource, manifest: _manifest);
    double sampleMedian = sampleCounts.OnTargetMedianBinCount;
    var referenceCounts = new BinCounts(referenceSource, manifest: _manifest);
    double referenceMedian = referenceCounts.OnTargetMedianBinCount;

    double librarySizeFactor = 1;
    if (sampleMedian > 0 && referenceMedian > 0)
    {
        librarySizeFactor = referenceMedian / sampleMedian;
    }

    using (var sampleCursor = sampleSource.GetEnumerator())
    using (var referenceCursor = referenceSource.GetEnumerator())
    {
        while (sampleCursor.MoveNext() && referenceCursor.MoveNext())
        {
            var sample = sampleCursor.Current;
            var reference = referenceCursor.Current;

            // The weighted average count of a bin could be less than 1.
            // Using these small counts for coverage normalization creates large ratios,
            // so such bins are dropped to keep noise out of segmentation and CNV calling.
            if (reference.Count < 1)
            {
                continue;
            }

            double normalizedRatio = sample.Count / reference.Count * librarySizeFactor;
            yield return new SampleGenomicBin(sample.GenomicBin.Chromosome, sample.Start, sample.Stop, sample.GenomicBin.GC, (float)normalizedRatio);
        }
    }
}
/// <summary>
/// Runs the diploid caller on one sample: loads segments and variant frequencies, assigns ploidy calls
/// and quality scores, merges neighboring segments, applies filters and writes the coverage plot data
/// and the segment output. When no segments are loaded an output is still written and no calls are made.
/// </summary>
/// <param name="variantFrequencyFile">Variant frequency input.</param>
/// <param name="inFile">Segment input; its directory becomes TempFolder.</param>
/// <param name="outFile">Segment output path.</param>
/// <param name="ploidyVcfPath">Optional ploidy VCF; skipped when null or empty.</param>
/// <param name="referenceFolder">Reference folder forwarded to the writers.</param>
/// <param name="sampleName">Sample name forwarded to the segment writer.</param>
/// <param name="truthDataPath">Optional known copy-number data; when given, a report against it is generated.</param>
/// <returns>Always 0.</returns>
public int CallVariants(string variantFrequencyFile, string inFile, string outFile, string ploidyVcfPath, string referenceFolder, string sampleName, string truthDataPath)
{
    bool hasTruthData = !string.IsNullOrEmpty(truthDataPath);
    if (hasTruthData)
    {
        _cnOracle = new CopyNumberOracle();
        _cnOracle.LoadKnownCN(truthDataPath);
    }

    _segments = Segments.ReadSegments(_logger, new FileLocation(inFile));
    _allSegments = _segments.AllSegments.ToList();
    TempFolder = Path.GetDirectoryName(inFile);

    if (_allSegments.Count == 0)
    {
        Console.WriteLine("CanvasDiploidCaller: No segments loaded; no CNV calls will be made.");
        CanvasSegmentWriter.WriteSegments(outFile, _allSegments, _model?.DiploidCoverage, referenceFolder, sampleName, null,
            null, QualityFilterThreshold, false, null, null);
        return 0;
    }

    PloidyInfo ploidy = string.IsNullOrEmpty(ploidyVcfPath)
        ? null
        : PloidyInfo.LoadPloidyFromVcfFileNoSampleId(ploidyVcfPath);

    // load MAF
    var allelesByChromosome = CanvasIO.ReadFrequenciesWrapper(_logger, new FileLocation(variantFrequencyFile), _segments.IntervalsByChromosome);
    _segments.AddAlleles(allelesByChromosome);
    MeanCoverage = allelesByChromosome
        .SelectMany(x => x.Value)
        .SelectMany(y => y.TotalCoverage)
        .Average();
    AggregateVariantCoverage(ref _allSegments);

    // Create new models for different copy number states
    InitializePloidies();

    // Compute statistics on the copy number two regions
    float[] diploidCounts = AggregateCounts(ref _allSegments);
    _diploidCoverage = Utilities.Mean(diploidCounts);
    _coverageWeightingFactor = CoverageWeighting / _diploidCoverage;

    // new coverage model
    var coverageModel = new CoverageModel();
    coverageModel.DiploidCoverage = _diploidCoverage;
    _model = coverageModel;

    // Per-segment summary: folded allele frequency median, coverage median and weight.
    List<SegmentInfo> segmentInfos = new List<SegmentInfo>();
    foreach (CanvasSegment segment in _allSegments)
    {
        var info = new SegmentInfo();
        info.Segment = segment;

        // Fold frequencies above 0.5 onto the minor-allele side.
        var foldedFrequencies = new List<double>();
        foreach (float frequency in segment.Balleles.Frequencies)
        {
            foldedFrequencies.Add(frequency > 0.5 ? 1 - frequency : frequency);
        }

        // -1 marks a segment without any frequency.
        info.Maf = foldedFrequencies.Count > 0 ? Utilities.Median(foldedFrequencies) : -1;
        info.Coverage = Utilities.Median(segment.Counts);
        info.Weight = _allSegments.Count > 100 ? segment.Length : segment.BinCount;
        segmentInfos.Add(info);
    }

    AssignPloidyCallsDistance(_model);
    CanvasSegment.AssignQualityScores(_allSegments, CanvasSegment.QScoreMethod.LogisticGermline, _germlineScoreParameters);

    // Merge neighboring segments that got the same copy number call.
    // Merging segments requires quality scores, so it happens after they have been assigned.
    var mergedSegments = CanvasSegment.MergeSegments(_allSegments);

    // recalculating qscores after merging segments improves performance!
    CanvasSegment.AssignQualityScores(mergedSegments, CanvasSegment.QScoreMethod.LogisticGermline, _germlineScoreParameters);
    CanvasSegment.SetFilterForSegments(QualityFilterThreshold, mergedSegments, CanvasFilter.SegmentSizeCutoff);

    var extraHeaders = new List<string>();
    var coverageOutputPath = SingleSampleCallset.GetCoverageAndVariantFrequencyOutputPath(outFile);
    CanvasSegment.WriteCoveragePlotData(mergedSegments, _model.DiploidCoverage, ploidy, coverageOutputPath, referenceFolder);

    if (_cnOracle != null)
    {
        GenerateReportVersusKnownCopyNumber();
    }

    string ploidyHeader = ploidy?.HeaderLine;
    if (!string.IsNullOrEmpty(ploidyHeader))
    {
        extraHeaders.Add(ploidy.HeaderLine);
    }

    CanvasSegmentWriter.WriteSegments(outFile, mergedSegments, _model.DiploidCoverage, referenceFolder, sampleName,
        extraHeaders, ploidy, QualityFilterThreshold, false, null, null);
    return 0;
}
/// <summary>
/// Convenience overload: streams the bins of the text file at <paramref name="binnedPath"/> and
/// forwards them to the overload that takes the bins directly.
/// </summary>
private static void LoadBinCounts(string binnedPath, NexteraManifest manifest, out List<double> binCounts, out List<int> onTargetIndices)
{
    var bins = CanvasIO.IterateThroughTextFile(binnedPath);
    LoadBinCounts(bins, manifest, out binCounts, out onTargetIndices);
}
/// <summary>
/// Implements the Canvas binning algorithm.
/// Without intermediate files a single genomic interval is binned and the method returns. Otherwise the
/// intermediate files are deserialized, the bin size is derived if requested, and the binned counts are
/// written to the output file (or only the bin size, when binSizeOnly is set).
/// </summary>
/// <returns>Always 0.</returns>
public static int Run(CanvasBinParameters parameters)
{
    // One BitArray per chromosome, as long as the chromosome.
    // A position is 'true' if the mer starting at that position is unique in the genome.
    var possibleAlignments = new Dictionary<string, BitArray>();

    // One HitArray per chromosome, as long as the chromosome.
    // A position holds the number of times the mer starting there is observed in the SAM file.
    var observedAlignments = new Dictionary<string, HitArray>();

    // One array per chromosome, as long as the chromosome.
    // A value is the fragment length of the read starting at that index.
    var fragmentLengths = new Dictionary<string, Int16[]>();

    if (parameters.intermediatePaths.Count == 0)
    {
        BinOneGenomicInterval(parameters, possibleAlignments, observedAlignments, fragmentLengths);
        return 0;
    }

    // load our intermediate data files
    var pendingFiles = new List<string>(parameters.intermediatePaths);
    // controls access to possibleAlignments, observedAlignments, fragmentLengths
    var sharedDataLock = new object();

    // Deliberately a single deserialization thread (rather than one per processor)
    // to avoid a (rare) protobuf issue.
    int maxWorkerCount = 1;
    var workers = new List<Thread>();
    Console.WriteLine("Start deserialization:");
    Console.Out.Flush();

    while (workers.Count > 0 || pendingFiles.Count > 0)
    {
        // Remove defunct threads:
        workers.RemoveAll(worker => !worker.IsAlive);
        if (workers.Count == maxWorkerCount)
        {
            Thread.Sleep(1000);
            continue;
        }

        while (pendingFiles.Count > 0 && workers.Count < maxWorkerCount)
        {
            // Declared inside the loop so each thread captures its own file name.
            string inputFile = pendingFiles[0];
            var worker = new Thread(() => DeserializeCanvasData(inputFile, possibleAlignments, observedAlignments,
                fragmentLengths, sharedDataLock, parameters.coverageMode));
            workers.Add(worker);
            worker.Name = "CanvasBin " + inputFile;
            Console.WriteLine(worker.Name);
            worker.Start();
            pendingFiles.RemoveAt(0);
        }
    }

    Console.WriteLine("{0} Deserialization complete", DateTime.Now);
    Console.Out.Flush();

    NexteraManifest manifest = null;
    if (parameters.manifestFile != null)
    {
        manifest = new NexteraManifest(parameters.manifestFile, null, Console.WriteLine);
    }

    if (parameters.binSize == -1)
    {
        // Turn the desired # of alignments per bin into the number of possible alignments expected per bin.
        parameters.binSize = CalculateNumberOfPossibleAlignmentsPerBin(parameters.countsPerBin, possibleAlignments,
            observedAlignments, manifest: manifest);
    }

    if (parameters.binSizeOnly)
    {
        // Write bin size to file
        System.IO.File.WriteAllText(parameters.outFile + ".binsize", "" + parameters.binSize);
        return 0;
    }

    // Read predefined bins, when a file was supplied
    Dictionary<string, List<GenomicBin>> predefinedBins = parameters.predefinedBinsFile != null
        ? Utilities.LoadBedFile(parameters.predefinedBinsFile)
        : null;

    // Bin alignments.
    List<GenomicBin> bins = BinCounts(parameters.referenceFile, parameters.binSize, parameters.coverageMode, manifest,
        possibleAlignments, observedAlignments, fragmentLengths, predefinedBins, parameters.outFile);

    // Output!
    Console.WriteLine("{0} Output binned counts:", DateTime.Now);
    CanvasIO.WriteToTextFile(parameters.outFile, bins);
    Console.WriteLine("{0} Output complete", DateTime.Now);
    Console.Out.Flush();
    return 0;
}
/// <summary>
/// CanvasClean entry point: parses the command line, reads the input bins, applies the requested
/// filters and normalizations in order (outlier removal, size filter, GC normalization, FFPE local-SD
/// filter) and writes the cleaned bins to the output file.
/// </summary>
/// <returns>
/// 0 on success and also when help is shown (explicitly or because the input/output file is missing
/// from the arguments); 1 when the input file does not exist.
/// </returns>
static int Main(string[] args)
{
    Utilities.LogCommandLine(args);

    string inFile = null;
    string outFile = null;
    bool doGCnorm = false;
    bool doSizeFilter = false;
    bool doOutlierRemoval = false;
    string ffpeOutliersFile = null;
    string manifestFile = null;
    CanvasGCNormalizationMode gcNormalizationMode = CanvasGCNormalizationMode.MedianByGC;
    string availableModes = String.Join(", ", Enum.GetValues(typeof(CanvasGCNormalizationMode)).Cast<CanvasGCNormalizationMode>());
    string modeDescription = String.Format("gc normalization mode. Available modes: {0}. Default: {1}", availableModes, gcNormalizationMode);
    bool needHelp = false;

    OptionSet p = new OptionSet()
    {
        { "i|infile=", "input file - usually generated by CanvasBin", v => inFile = v },
        { "o|outfile=", "text file to output containing cleaned bins", v => outFile = v },
        { "g|gcnorm", "perform GC normalization", v => doGCnorm = v != null },
        { "s|filtsize", "filter out genomically large bins", v => doSizeFilter = v != null },
        { "r|outliers", "filter outlier points", v => doOutlierRemoval = v != null },
        { "f|ffpeoutliers=", "filter regions of FFPE biases", v => ffpeOutliersFile = v },
        { "t|manifest=", "Nextera manifest file", v => manifestFile = v },
        { "w|weightedmedian=", "Minimum number of bins per GC required to calculate weighted median", v => minNumberOfBinsPerGCForWeightedMedian = int.Parse(v) },
        { "m|mode=", modeDescription, v => gcNormalizationMode = Utilities.ParseCanvasGCNormalizationMode(v) },
        { "h|help", "show this message and exit", v => needHelp = v != null },
    };
    p.Parse(args);

    // Help is shown both on request and when a mandatory path is missing.
    if (needHelp || inFile == null || outFile == null)
    {
        ShowHelp(p);
        return 0;
    }

    // Does the input file exist?
    if (!File.Exists(inFile))
    {
        Console.WriteLine("CanvasClean.exe: File {0} does not exist! Exiting.", inFile);
        return 1;
    }

    List<SampleGenomicBin> bins = CanvasIO.ReadFromTextFile(inFile);
    if (doOutlierRemoval)
    {
        bins = RemoveOutliers(bins);
    }
    if (doSizeFilter)
    {
        bins = RemoveBigBins(bins);
    }

    // do not run FFPE outlier removal on targeted/low coverage data
    bool runFfpeFilter = ffpeOutliersFile != null && bins.Count >= 50000;

    // estimate localSD metric to use in the FFPE outlier removal later and write it to a text file
    double localSD = -1.0;
    if (runFfpeFilter)
    {
        localSD = getLocalStandardDeviation(bins);
        CanvasIO.WriteLocalSDToTextFile(ffpeOutliersFile, localSD);
    }

    if (doGCnorm)
    {
        NexteraManifest manifest = null;
        if (manifestFile != null)
        {
            manifest = new NexteraManifest(manifestFile, null, Console.WriteLine);
        }

        // Only the MedianByGC mode strips bins with extreme GC before normalizing.
        List<SampleGenomicBin> strippedBins = bins;
        if (gcNormalizationMode == CanvasGCNormalizationMode.MedianByGC)
        {
            strippedBins = RemoveBinsWithExtremeGC(bins, defaultMinNumberOfBinsPerGC, manifest: manifest);
        }

        if (strippedBins.Count == 0)
        {
            Console.Error.WriteLine("Warning in CanvasClean: Coverage too low to perform GC correction; proceeding without GC correction");
        }
        else
        {
            bins = strippedBins;
            NormalizeByGC(bins, manifest, gcNormalizationMode);

            // Use variance normalization only on large exome panels and whole genome sequencing.
            // The threshold is set to 10% of an average number of bins on CanvasClean data.
            // If normalization by variance was run, perform mean centering by running NormalizeByGC again.
            if (runFfpeFilter && bins.Count > 500000 && NormalizeVarianceByGC(bins, manifest: manifest))
            {
                NormalizeByGC(bins, manifest, gcNormalizationMode);
            }
        }
    }

    if (runFfpeFilter)
    {
        // threshold 20 is derived to separate FF and noisy FFPE samples (derived from a training set of approx. 40 samples)
        bins = RemoveBinsWithExtremeLocalSD(bins, localSD, 20, outFile);
    }

    CanvasIO.WriteToTextFile(outFile, bins);
    return 0;
}
/// <summary>
/// Runs the diploid caller on one sample: loads segments and variant frequencies, assigns copy number
/// calls by distance, merges neighboring segments, assigns quality scores and writes the coverage plot
/// data and the segment output. When no segments are loaded an output is still written and no calls are made.
/// </summary>
/// <param name="variantFrequencyFile">Variant frequency input.</param>
/// <param name="inFile">Segment input; its directory becomes TempFolder.</param>
/// <param name="outFile">Segment output path.</param>
/// <param name="ploidyBedPath">Optional ploidy bed file; skipped when null or empty.</param>
/// <param name="referenceFolder">Reference folder forwarded to the writers.</param>
/// <param name="sampleName">Sample name forwarded to the segment writer.</param>
/// <param name="truthDataPath">Optional known copy-number data; when given, a report against it is generated.</param>
/// <returns>Always 0.</returns>
public int CallVariants(string variantFrequencyFile, string inFile, string outFile, string ploidyBedPath, string referenceFolder, string sampleName, string truthDataPath)
{
    bool hasTruthData = !string.IsNullOrEmpty(truthDataPath);
    if (hasTruthData)
    {
        this.CNOracle = new CopyNumberOracle();
        this.CNOracle.LoadKnownCN(truthDataPath);
    }

    this.Segments = CanvasSegment.ReadSegments(inFile);
    this.TempFolder = Path.GetDirectoryName(inFile);
    if (this.Segments.Count == 0)
    {
        Console.WriteLine("CanvasDiploidCaller: No segments loaded; no CNV calls will be made.");
        CanvasSegment.WriteSegments(outFile, this.Segments, referenceFolder, sampleName, null, null);
        return 0;
    }

    PloidyInfo ploidy = string.IsNullOrEmpty(ploidyBedPath)
        ? null
        : PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);

    // load MAF
    this.MeanCoverage = CanvasIO.LoadVariantFrequencies(variantFrequencyFile, this.Segments);
    int medianVariantCoverage = AggregateVariantCoverage(ref this.Segments);

    // Create new models for different copy number states
    this.InitializePloidies();

    // Compute statistics on the copy number two regions
    float[] diploidCounts = AggregateCounts(ref this.Segments);
    DiploidCoverage = CanvasCommon.Utilities.Mean(diploidCounts);
    CoverageWeightingFactor = CoverageWeighting / DiploidCoverage;

    // new coverage model
    this.Model = new CoverageModel();
    Model.DiploidCoverage = DiploidCoverage;

    // Per-segment summary: folded allele frequency median, coverage median and weight.
    var segmentInfos = new List<SegmentInfo>();
    foreach (CanvasSegment segment in this.Segments)
    {
        var info = new SegmentInfo();
        info.Segment = segment;

        // Fold frequencies above 0.5 onto the minor-allele side.
        var foldedFrequencies = new List<double>();
        foreach (float frequency in segment.VariantFrequencies)
        {
            foldedFrequencies.Add(frequency > 0.5 ? 1 - frequency : frequency);
        }

        if (foldedFrequencies.Count == 0)
        {
            // -1 marks a segment without any frequency.
            info.MAF = -1;
        }
        else
        {
            info.MAF = CanvasCommon.Utilities.Median(foldedFrequencies);
        }

        info.Coverage = CanvasCommon.Utilities.Median(segment.Counts);

        // With more than 100 segments the weight is the segment span, otherwise its bin count.
        if (this.Segments.Count > 100)
        {
            info.Weight = segment.End - segment.Begin;
        }
        else
        {
            info.Weight = segment.BinCount;
        }

        segmentInfos.Add(info);
    }

    // Assign copy number and major chromosome count for each segment.
    // The Gaussian mixture path stays switched off, since we saw weird performance on chrY (CANV-115).
    bool useGaussianMixtureModel = false;
    if (!useGaussianMixtureModel)
    {
        AssignPloidyCallsDistance(Model, segmentInfos, medianVariantCoverage);
    }
    else
    {
        // optimize model covariance
        FitGaussians(Model, segmentInfos);
        AssignPloidyCallsGaussianMixture();
    }

    // Merge neighboring segments that got the same copy number call.
    CanvasSegment.MergeSegments(ref this.Segments);
    CanvasSegment.AssignQualityScores(this.Segments, CanvasSegment.QScoreMethod.LogisticGermline);

    var extraHeaders = new List<string>();
    string coverageOutputPath = CanvasCommon.Utilities.GetCoverageAndVariantFrequencyOutputPath(outFile);
    CanvasSegment.WriteCoveragePlotData(this.Segments, Model.DiploidCoverage, ploidy, coverageOutputPath, referenceFolder);

    if (this.CNOracle != null)
    {
        this.GenerateReportVersusKnownCN();
    }

    bool hasPloidyHeader = ploidy != null && !string.IsNullOrEmpty(ploidy.HeaderLine);
    if (hasPloidyHeader)
    {
        extraHeaders.Add(ploidy.HeaderLine);
    }

    CanvasSegment.WriteSegments(outFile, this.Segments, referenceFolder, sampleName, extraHeaders, ploidy);
    return 0;
}
/// <summary>
/// Multi-sample calling: loads segments and allele frequencies per sample, builds the segment sets
/// (optionally using common CNVs), calls variants in parallel, merges and filters the called segments
/// and writes the multi-sample VCF plus the per-sample VCF, coverage plot data, bigWig and bedGraph outputs.
/// </summary>
/// <remarks>
/// The list parameters are parallel: entry i of <paramref name="variantFrequencyFiles"/>,
/// <paramref name="segmentFiles"/> and <paramref name="sampleTypes"/> belongs to entry i of
/// <paramref name="sampleNames"/>.
/// </remarks>
/// <returns>Always 0.</returns>
internal int CallVariants(List<string> variantFrequencyFiles, List<string> segmentFiles, IFileLocation outVcfFile, string ploidyBedPath, string referenceFolder, List<string> sampleNames, string commonCnvsBedPath, List<SampleType> sampleTypes)
{
    // initialize data structures and classes
    var samplesInfo = new SampleMap<SampleMetrics>();
    var sampleSegments = new SampleMap<Segments>();
    var copyNumberModels = new SampleMap<ICopyNumberModel>();
    var variantFrequencyFilesSampleList = new SampleMap<string>();
    var kinships = new SampleMap<SampleType>();

    // load files
    for (int sampleIndex = 0; sampleIndex < sampleNames.Count; sampleIndex++)
    {
        var sampleId = new SampleId(sampleNames[sampleIndex]);

        var loadedSegments = Segments.ReadSegments(_logger, new FileLocation(segmentFiles[sampleIndex]));
        var alleles = CanvasIO.ReadFrequenciesWrapper(_logger, new FileLocation(variantFrequencyFiles[sampleIndex]),
            loadedSegments.IntervalsByChromosome);
        loadedSegments.AddAlleles(alleles);
        sampleSegments.Add(sampleId, loadedSegments);

        var sampleInfo = SampleMetrics.GetSampleInfo(loadedSegments.AllSegments, ploidyBedPath,
            _callerParameters.NumberOfTrimmedBins, sampleId);
        var copyNumberModel = _copyNumberModelFactory.CreateModel(_callerParameters.MaximumCopyNumber,
            sampleInfo.MaxCoverage, sampleInfo.MeanCoverage, sampleInfo.MeanMafCoverage);

        samplesInfo.Add(sampleId, sampleInfo);
        copyNumberModels.Add(sampleId, copyNumberModel);
        variantFrequencyFilesSampleList.Add(sampleId, variantFrequencyFiles[sampleIndex]);
        kinships.Add(sampleId, sampleTypes[sampleIndex]);
    }

    var segmentSetsFromCommonCnvs = CreateSegmentSetsFromCommonCnvs(variantFrequencyFilesSampleList,
        _callerParameters.MinAlleleCountsThreshold, commonCnvsBedPath, sampleSegments);
    var segmentsForVariantCalling = GetHighestLikelihoodSegments(segmentSetsFromCommonCnvs, samplesInfo, copyNumberModels).ToList();
    PedigreeInfo pedigreeInfo = PedigreeInfo.GetPedigreeInfo(kinships, _callerParameters);

    // Parallelism is capped by both the machine's processor count and the configured core limit.
    var parallelOptions = new ParallelOptions
    {
        MaxDegreeOfParallelism = Math.Min(Environment.ProcessorCount, _callerParameters.MaxCoreNumber)
    };
    Parallel.ForEach(
        segmentsForVariantCalling,
        parallelOptions,
        segmentSet => _variantCaller.CallVariant(segmentSet, samplesInfo, copyNumberModels, pedigreeInfo));

    // Regroup the called segments from per-region to per-sample lists.
    var variantCalledSegments = new SampleMap<List<CanvasSegment>>();
    foreach (var sampleId in samplesInfo.SampleIds)
    {
        var segmentsOfSample = segmentsForVariantCalling.Select(segmentSet => segmentSet[sampleId]).ToList();
        variantCalledSegments.Add(sampleId, segmentsOfSample);
    }

    var mergedVariantCalledSegments = MergeSegments(variantCalledSegments, _callerParameters.MinimumCallSize, _qualityFilterThreshold);
    FilterExcessivelyShortSegments(mergedVariantCalledSegments);

    var outputFolder = outVcfFile.Directory;
    foreach (var sampleId in samplesInfo.SampleIds)
    {
        var metrics = samplesInfo[sampleId];
        var coverageOutputPath = SingleSampleCallset.GetCoverageAndVariantFrequencyOutput(outputFolder, sampleId.ToString());
        CanvasSegment.WriteCoveragePlotData(mergedVariantCalledSegments[sampleId], metrics.MeanCoverage,
            metrics.Ploidy, coverageOutputPath, referenceFolder);
    }

    // The de novo quality threshold only applies when a full pedigree is available.
    bool isPedigreeInfoSupplied = pedigreeInfo != null && pedigreeInfo.HasFullPedigree();
    int? denovoQualityThreshold = null;
    if (isPedigreeInfoSupplied)
    {
        denovoQualityThreshold = (int?)_deNovoQualityFilterThreshold;
    }

    var ploidies = samplesInfo.Select(info => info.Value.Ploidy).ToList();
    var diploidCoverage = samplesInfo.Select(info => info.Value.MeanCoverage).ToList();
    var names = samplesInfo.SampleIds.Select(id => id.ToString()).ToList();
    CanvasSegmentWriter.WriteMultiSampleSegments(outVcfFile.FullName, mergedVariantCalledSegments, diploidCoverage,
        referenceFolder, names, null, ploidies, _qualityFilterThreshold, denovoQualityThreshold,
        CanvasFilter.SegmentSizeCutoff, isPedigreeInfoSupplied);

    // Per-sample outputs: VCF, coverage bigWig, copy number bedGraph and partition bedGraph.
    foreach (var sampleId in samplesInfo.SampleIds)
    {
        string sampleLabel = sampleId.ToString();
        var outputVcfPath = SingleSampleCallset.GetVcfOutput(outputFolder, sampleLabel);
        var sampleMetrics = samplesInfo[sampleId];
        var calledSegments = mergedVariantCalledSegments[sampleId];
        CanvasSegmentWriter.WriteSegments(outputVcfPath.FullName, calledSegments, sampleMetrics.MeanCoverage,
            referenceFolder, sampleId.ToString(), null, sampleMetrics.Ploidy, _qualityFilterThreshold,
            isPedigreeInfoSupplied, denovoQualityThreshold, null);

        var visualizationTemp = outputFolder.CreateSubdirectory($"VisualizationTemp{sampleId}");
        var normalizationFactor = NormalizationCalculator.ComputeNormalizationFactor(calledSegments);

        // The writer's result may be null; the move is skipped in that case.
        var bigWig = _coverageBigWigWriter.Write(calledSegments, visualizationTemp, normalizationFactor);
        bigWig?.MoveTo(SingleSampleCallset.GetCoverageBigWig(outputFolder, sampleId.ToString()));

        var copyNumberBedGraph = SingleSampleCallset.GetCopyNumberBedGraph(outputFolder, sampleId.ToString());
        _copyNumberBedGraphWriter.Write(calledSegments, sampleMetrics.Ploidy, copyNumberBedGraph);

        var partitionBedgraphHeader = "track type=bedGraph visibility=full autoScale=on graphType=points";
        var originalSegments = sampleSegments[sampleId];
        _partitionCoverageBedGraphWriter.Write(originalSegments.AllSegments,
            SingleSampleCallset.GetPartitionBedGraph(outputFolder, sampleId.ToString()), normalizationFactor,
            partitionBedgraphHeader);
    }

    return 0;
}
/// <summary>
/// Performs fragment binning: loads the predefined bins, runs one <c>BinTask</c> per
/// chromosome that has bins, and writes the combined bins to <c>parameters.outFile</c>.
/// </summary>
/// <returns>Always 0 when the method completes without throwing.</returns>
/// <exception cref="Illumina.Common.IlluminaException">
/// Thrown when no predefined bins file is configured, when the input is not paired-end,
/// when the bins file names a chromosome that is not reported for the BAM file, or when
/// the tasks report a total of zero usable fragments.
/// </exception>
public int Bin()
{
    if (parameters.predefinedBinsFile == null)
    {
        throw new Illumina.Common.IlluminaException("Predefined bins in BED is required for fragment binning.");
    }

    // Janus-SRS-189
    if (!parameters.isPairedEnd)
    {
        throw new Illumina.Common.IlluminaException("Paired-end reads are required for fragment binning.");
    }

    Dictionary<string, List<SampleGenomicBin>> binsByChromosome =
        Utilities.LoadBedFile(parameters.predefinedBinsFile, gcIndex: 3);

    // The BAM chromosome list fixes the order in which chromosomes are processed and emitted.
    List<string> bamChromosomes = GetChromosomesInBam();
    if (!Utilities.IsSubset(binsByChromosome.Keys, bamChromosomes))
    {
        throw new Illumina.Common.IlluminaException(
            String.Format("Not all chromosomes in {0} are found in {1}.",
                          parameters.predefinedBinsFile, parameters.bamFile));
    }

    // Set up one counting task for every BAM chromosome that has predefined bins.
    var binTasks = new List<BinTask>();
    foreach (string chromosome in bamChromosomes)
    {
        List<SampleGenomicBin> chromosomeBins;
        if (binsByChromosome.TryGetValue(chromosome, out chromosomeBins))
        {
            binTasks.Add(new BinTask(parameters.referenceFile, chromosome, parameters.bamFile, chromosomeBins));
        }
    }

    Console.WriteLine("Launch fragment binning jobs...");
    Console.Out.WriteLine();
    Parallel.ForEach(binTasks, binTask => { binTask.DoIt(); });
    Console.WriteLine("Completed fragment binning jobs.");
    Console.Out.WriteLine();

    long usableFragmentCount = binTasks.Sum(binTask => binTask.UsableFragmentCount);
    if (usableFragmentCount == 0)
    {
        throw new Illumina.Common.IlluminaException(
            String.Format("No passing-filter fragments overlapping bins are found in {0}", parameters.bamFile));
    }

    // Concatenate the per-chromosome bins, following the BAM chromosome order.
    List<SampleGenomicBin> finalBins = new List<SampleGenomicBin>();
    foreach (string chromosome in bamChromosomes)
    {
        List<SampleGenomicBin> chromosomeBins;
        if (binsByChromosome.TryGetValue(chromosome, out chromosomeBins))
        {
            finalBins.AddRange(chromosomeBins);
        }
    }

    // Output!
    CanvasIO.WriteToTextFile(parameters.outFile, finalBins);
    return 0;
}