/// <summary>
/// Post-processes the segmentation results for a single input and writes the
/// resulting partitioning to <paramref name="outPartitionedFile"/>.
/// </summary>
/// <param name="segmentationInput">Input whose bins are post-processed and written.</param>
/// <param name="outPartitionedFile">Path handed to WriteCanvasPartitionResults.</param>
/// <param name="referencePloidy">Reference ploidy forwarded to PostProcessSegments (may be null).</param>
/// <param name="segmentationResults">Segmentation to post-process.</param>
private static void PostProcessAndWriteResults(
    SegmentationInput segmentationInput,
    string outPartitionedFile,
    PloidyInfo referencePloidy,
    GenomeSegmentationResults segmentationResults)
{
    segmentationInput.WriteCanvasPartitionResults(
        outPartitionedFile,
        segmentationInput.PostProcessSegments(segmentationResults, referencePloidy));
}
/// <summary>
/// Stores the segmentation settings and immediately reads the input file.
/// </summary>
/// <param name="inputBinPath">Stored in InputBinPath.</param>
/// <param name="forbiddenBedPath">Stored in ForbiddenIntervalBedPath.</param>
/// <param name="dataType">"logratio" (aCGH, ROMA, etc.) or "binary" (LOH).</param>
/// <param name="maxInterBinDistInSegment">Stored in MaxInterBinDistInSegment.</param>
public Segmentation(
    string inputBinPath,
    string forbiddenBedPath,
    string dataType = "logratio",
    int maxInterBinDistInSegment = 1000000)
{
    InputBinPath = inputBinPath;
    DataType = dataType;
    SegmentationResults = null;
    ForbiddenIntervalBedPath = forbiddenBedPath;
    MaxInterBinDistInSegment = maxInterBinDistInSegment;

    // All settings are in place; load the input now.
    ReadBEDInput();
}
/// <summary>
/// Writes one tab-separated line per bin (chromosome, start, end, score, segment number)
/// to <paramref name="outPath"/> through a GzipWriter. The segment number is incremented
/// every time IsNewSegment reports that the current bin opens a new segment.
/// </summary>
/// <param name="outPath">Path of the output file.</param>
/// <param name="segmentationResults">Segments whose start positions mark segment boundaries.</param>
public void WriteCanvasPartitionResults(string outPath, GenomeSegmentationResults segmentationResults)
{
    // Index every segment start as "chr:start"; IsNewSegment is queried with the same key format.
    Dictionary<string, bool> starts = new Dictionary<string, bool>();
    foreach (string chr in segmentationResults.SegmentByChr.Keys)
    {
        foreach (Segmentation.Segment segment in segmentationResults.SegmentByChr[chr])
        {
            starts[chr + ":" + segment.start] = true;
        }
    }

    Dictionary<string, List<SampleGenomicBin>> excludedIntervals = new Dictionary<string, List<SampleGenomicBin>>();
    if (!string.IsNullOrEmpty(ForbiddenIntervalBedPath))
    {
        excludedIntervals = CanvasCommon.Utilities.LoadBedFile(ForbiddenIntervalBedPath);
    }

    using (GzipWriter writer = new GzipWriter(outPath))
    {
        // Segment numbering is global: it keeps counting across chromosomes.
        int segmentNum = -1;
        foreach (string chr in StartByChr.Keys)
        {
            // Stays null when no intervals are excluded on this chromosome.
            List<SampleGenomicBin> excludeIntervals;
            excludedIntervals.TryGetValue(chr, out excludeIntervals);

            int excludeIndex = 0; // Points to the first interval which *doesn't* end before our current position
            uint previousBinEnd = 0;
            var binStarts = StartByChr[chr];
            for (int pos = 0; pos < binStarts.Length; pos++)
            {
                uint start = binStarts[pos];
                uint end = EndByChr[chr][pos];
                string key = chr + ":" + start;
                if (IsNewSegment(starts, key, excludeIntervals, previousBinEnd, end, start, ref excludeIndex))
                {
                    segmentNum++;
                }

                // The interpolated string is already final. Passing it through string.Format
                // again would re-parse it and throw on any literal '{' or '}'.
                string line = $"{chr}\t{start}\t{end}\t{ScoreByChr[chr][pos]}\t{segmentNum}";
                writer.WriteLine(line);
                previousBinEnd = end;
            }
        }
    }
}
/// <summary>
/// Loads the forbidden intervals when ForbiddenIntervalBedPath is set (otherwise uses an
/// empty map) and delegates the post-processing to the segmentation results processor.
/// </summary>
/// <param name="segmentationResults">Segmentation to post-process.</param>
/// <param name="referencePloidy">Reference ploidy forwarded to the processor.</param>
/// <returns>The value returned by the processor's PostProcessSegments.</returns>
public Dictionary<string, List<SegmentWithBins>> PostProcessSegments(
    GenomeSegmentationResults segmentationResults,
    PloidyInfo referencePloidy)
{
    Dictionary<string, List<SampleGenomicBin>> forbiddenIntervals;
    if (string.IsNullOrEmpty(ForbiddenIntervalBedPath))
    {
        forbiddenIntervals = new Dictionary<string, List<SampleGenomicBin>>();
    }
    else
    {
        forbiddenIntervals = CanvasCommon.Utilities.LoadBedFile(ForbiddenIntervalBedPath);
    }

    return _processor.PostProcessSegments(segmentationResults, referencePloidy, forbiddenIntervals, CoverageInfo);
}
/// <summary>
/// Wavelets: unbalanced HAAR wavelets segmentation.
/// Stores the outcome in SegmentationResults.
/// </summary>
/// <param name="isGermline">Forwarded to WaveletSegmentation.HaarWavelets.</param>
/// <param name="thresholdLower">Lower wavelets coefficient threshold, forwarded to HaarWavelets.</param>
/// <param name="thresholdUpper">Upper wavelets coefficient threshold, forwarded to HaarWavelets.</param>
/// <param name="minSize">HaarWavelets is only run on chromosomes with more than this many bins.</param>
/// <param name="verbose">Not used by this method.</param>
private void Wavelets(bool isGermline, double thresholdLower = 5, double thresholdUpper = 80, int minSize = 10, int verbose = 1)
{
    // Phase 1: collect the finite scores (not NaN, -Inf, Inf) of every chromosome.
    // They are only needed for the sanity check below.
    Dictionary<string, double[]> finiteScoresByChr = new Dictionary<string, double[]>();
    List<ThreadStart> tasks = new List<ThreadStart>();
    foreach (KeyValuePair<string, double[]> scoreByChrKVP in ScoreByChr)
    {
        tasks.Add(new ThreadStart(() =>
        {
            int[] finiteIndices;
            Helper.GetFiniteIndices(scoreByChrKVP.Value, out finiteIndices);
            double[] scores;
            if (finiteIndices.Length == scoreByChrKVP.Value.Length)
            {
                scores = scoreByChrKVP.Value;
            }
            else
            {
                Helper.ExtractValues<double>(scoreByChrKVP.Value, finiteIndices, out scores);
            }
            lock (finiteScoresByChr)
            {
                finiteScoresByChr[scoreByChrKVP.Key] = scores;
            }
        }));
    }
    Isas.Shared.Utilities.DoWorkParallelThreads(tasks);

    // Quick sanity-check: if there is not a single finite score, return a dummy result.
    if (!finiteScoresByChr.Values.Any(scores => scores.Length > 0))
    {
        this.SegmentationResults = this.GetDummySegmentationResults();
        return;
    }

    // Phase 2: segment every chromosome independently.
    Dictionary<string, Segment[]> segmentByChr = new Dictionary<string, Segment[]>();
    tasks = new List<ThreadStart>();
    foreach (string chr in ScoreByChr.Keys)
    {
        tasks.Add(new ThreadStart(() =>
        {
            List<int> breakpoints = new List<int>();
            int sizeScoreByChr = this.ScoreByChr[chr].Length;
            if (sizeScoreByChr > minSize)
            {
                // HaarWavelets receives a copy of the scores and fills 'breakpoints'.
                WaveletSegmentation.HaarWavelets(this.ScoreByChr[chr].ToArray(), thresholdLower, thresholdUpper, breakpoints, isGermline);
            }

            List<int> startBreakpointsPos = new List<int>();
            List<int> endBreakpointPos = new List<int>();
            List<int> lengthSeg = new List<int>();

            // NOTE(review): the literal 10 equals the default minSize; confirm whether minSize was intended here.
            if (breakpoints.Count >= 2 && sizeScoreByChr > 10)
            {
                // First segment.
                startBreakpointsPos.Add(breakpoints[0]);
                endBreakpointPos.Add(breakpoints[1] - 1);
                lengthSeg.Add(breakpoints[1] - 1);

                // Interior segments: from one breakpoint up to the bin before the next one.
                for (int i = 1; i < breakpoints.Count - 1; i++)
                {
                    startBreakpointsPos.Add(breakpoints[i]);
                    endBreakpointPos.Add(breakpoints[i + 1] - 1);
                    lengthSeg.Add(breakpoints[i + 1] - 1 - breakpoints[i]);
                }

                // Last segment runs to the final bin of the chromosome.
                int lastBreakpoint = breakpoints[breakpoints.Count - 1];
                startBreakpointsPos.Add(lastBreakpoint);
                endBreakpointPos.Add(sizeScoreByChr - 1);
                lengthSeg.Add(sizeScoreByChr - lastBreakpoint - 1);
            }
            else
            {
                // No usable breakpoints: the whole chromosome is one segment.
                startBreakpointsPos.Add(0);
                endBreakpointPos.Add(sizeScoreByChr - 1);
                lengthSeg.Add(sizeScoreByChr - 1);
            }

            // Estimate segment means over consecutive index windows built from lengthSeg.
            // NOTE(review): lengthSeg holds (end - start) values, so the cumulative windows may
            // drift from the start/end positions above; verify against Helper.WeightedAverage.
            double[] segmentMeans = new double[lengthSeg.Count];
            int windowStart = 0, windowEnd = 0;
            for (int i = 0; i < lengthSeg.Count; i++)
            {
                windowEnd += lengthSeg[i];
                // Works even if weights == null
                segmentMeans[i] = Helper.WeightedAverage(this.ScoreByChr[chr], null, iStart: windowStart, iEnd: windowEnd);
                windowStart = windowEnd;
            }

            Segment[] segments = new Segment[startBreakpointsPos.Count];
            for (int i = 0; i < startBreakpointsPos.Count; i++)
            {
                int start = startBreakpointsPos[i];
                int end = endBreakpointPos[i];
                segments[i] = new Segment();
                segments[i].start = this.StartByChr[chr][start]; // Genomic start
                segments[i].end = this.EndByChr[chr][end]; // Genomic end
                segments[i].nMarkers = lengthSeg[i];
                segments[i].mean = segmentMeans[i];
            }

            lock (segmentByChr)
            {
                segmentByChr[chr] = segments;
            }
        }));
    }

    Console.WriteLine("{0} Launching wavelet tasks", DateTime.Now);
    Isas.Shared.Utilities.DoWorkParallelThreads(tasks);
    Console.WriteLine("{0} Completed wavelet tasks", DateTime.Now);
    this.SegmentationResults = new GenomeSegmentationResults(segmentByChr);
    Console.WriteLine("{0} Segmentation results complete", DateTime.Now);
}
/// <summary>
/// Builds a segmentation result that contains no chromosomes; used when there is nothing to segment.
/// </summary>
private GenomeSegmentationResults GetDummySegmentationResults() =>
    new GenomeSegmentationResults(new Dictionary<string, Segment[]>());
/// <summary>
/// CBS: circular binary segmentation porting the R function segment in DNAcopy.
/// Stores the outcome in SegmentationResults.
/// </summary>
/// <remarks>
/// The former "alpha" argument now lives in this.Alpha, and the former "undoSplit" argument
/// ("none" or "prune" or "sdundo") now lives in this.UndoMethod.
/// </remarks>
/// <param name="nPerm">Forwarded to GetBoundary.ComputeBoundary and ChangePoint.ChangePoints.</param>
/// <param name="pMethod">"hybrid" or "perm"</param>
/// <param name="minWidth">Minimum segment width; must be between 2 and 5.</param>
/// <param name="kMax">Forwarded to ChangePoints; nMin must be at least 4 * kMax.</param>
/// <param name="nMin">Forwarded to ChangePoints; must be at least 4 * kMax.</param>
/// <param name="eta">Forwarded to GetBoundary.ComputeBoundary.</param>
/// <param name="sbdry">Boundary array; computed with GetBoundary.ComputeBoundary when null.</param>
/// <param name="trim">Trim value forwarded to ChangePoint.TrimmedVariance.</param>
/// <param name="undoPrune">Forwarded to ChangePoints.</param>
/// <param name="undoSD">Forwarded to ChangePoints.</param>
/// <param name="verbose">Forwarded to ChangePoints.</param>
private void CBS(uint nPerm = 10000, string pMethod = "hybrid", int minWidth = 2, int kMax = 25, uint nMin = 200, double eta = 0.05, uint[] sbdry = null, double trim = 0.025, double undoPrune = 0.05, double undoSD = 3, int verbose = 1)
{
    if (minWidth < 2 || minWidth > 5)
    {
        Console.Error.WriteLine("Minimum segment width should be between 2 and 5");
        Environment.Exit(1);
    }
    if (nMin < 4 * kMax)
    {
        Console.Error.WriteLine("nMin should be >= 4 * kMax");
        Environment.Exit(1);
    }
    if (sbdry == null)
    {
        GetBoundary.ComputeBoundary(nPerm, this.Alpha, eta, out sbdry);
    }

    // Phase 1: per chromosome, record the indices of the finite scores (not NaN, -Inf, Inf)
    // and the finite scores themselves.
    Dictionary<string, int[]> inaByChr = new Dictionary<string, int[]>();
    Dictionary<string, double[]> finiteScoresByChr = new Dictionary<string, double[]>();
    List<ThreadStart> tasks = new List<ThreadStart>();
    foreach (KeyValuePair<string, double[]> scoreByChrKVP in ScoreByChr)
    {
        tasks.Add(new ThreadStart(() =>
        {
            string chr = scoreByChrKVP.Key;
            int[] ina;
            Helper.GetFiniteIndices(scoreByChrKVP.Value, out ina);
            double[] scores;
            if (ina.Length == scoreByChrKVP.Value.Length)
            {
                // Everything is finite: reuse the original array.
                scores = scoreByChrKVP.Value;
            }
            else
            {
                Helper.ExtractValues<double>(scoreByChrKVP.Value, ina, out scores);
            }
            lock (finiteScoresByChr)
            {
                finiteScoresByChr[chr] = scores;
                inaByChr[chr] = ina;
            }
        }));
    }
    Isas.Shared.Utilities.DoWorkParallelThreads(tasks);

    // Quick sanity-check: if there is not a single finite score, return a dummy result.
    if (!finiteScoresByChr.Values.Any(scores => scores.Length > 0))
    {
        this.SegmentationResults = this.GetDummySegmentationResults();
        return;
    }

    double trimmedSD = Math.Sqrt(ChangePoint.TrimmedVariance(finiteScoresByChr, trim: trim));
    Dictionary<string, Segment[]> segmentByChr = new Dictionary<string, Segment[]>();

    // when parallelizing we need an RNG for each chromosome to get deterministic results
    Random seedGenerator = new MersenneTwister(0);
    Dictionary<string, Random> perChromosomeRandom = new Dictionary<string, Random>();
    foreach (string chr in this.ScoreByChr.Keys)
    {
        perChromosomeRandom[chr] = new MersenneTwister(seedGenerator.NextFullRangeInt32(), true);
    }

    // Phase 2: segment every chromosome independently.
    tasks = new List<ThreadStart>();
    foreach (string chr in ScoreByChr.Keys)
    {
        tasks.Add(new ThreadStart(() =>
        {
            int[] ina = inaByChr[chr];
            int[] lengthSeg;
            double[] segmentMeans;

            // Segment the finite scores only. The segment lengths are mapped back to bin
            // indices through 'ina' below, so they must count finite values. This is the
            // same array instance as ScoreByChr[chr] when every score is finite.
            ChangePoint.ChangePoints(finiteScoresByChr[chr], sbdry, out lengthSeg, out segmentMeans, perChromosomeRandom[chr],
                dataType: this.DataType, alpha: this.Alpha, nPerm: nPerm, pMethod: pMethod,
                minWidth: minWidth, kMax: kMax, nMin: nMin, trimmedSD: trimmedSD,
                undoSplits: this.UndoMethod, undoPrune: undoPrune, undoSD: undoSD, verbose: verbose);

            Segment[] segments = new Segment[lengthSeg.Length];
            int cs1 = 0, cs2 = -1; // cumulative sums: first and last finite-score index of the segment
            for (int i = 0; i < lengthSeg.Length; i++)
            {
                cs2 += lengthSeg[i];
                int start = ina[cs1];
                int end = ina[cs2];
                segments[i] = new Segment();
                segments[i].start = this.StartByChr[chr][start]; // Genomic start
                segments[i].end = this.EndByChr[chr][end]; // Genomic end
                segments[i].nMarkers = lengthSeg[i];
                segments[i].mean = segmentMeans[i];
                cs1 += lengthSeg[i];
            }

            lock (segmentByChr)
            {
                segmentByChr[chr] = segments;
            }
        }));
    }
    Isas.Shared.Utilities.DoWorkParallelThreads(tasks);
    this.SegmentationResults = new GenomeSegmentationResults(segmentByChr);
}
/// <summary>
/// Groups the coverage bins of every chromosome into segments. A new segment is opened
/// whenever IsNewSegment says so for the current bin; otherwise the bin is appended to the
/// segment currently being built.
/// </summary>
/// <param name="segmentationResults">Segments whose start positions mark segment boundaries.</param>
/// <param name="referencePloidy">Forwarded to IsNewSegment.</param>
/// <param name="excludedIntervals">Per-chromosome intervals forwarded to IsNewSegment; chromosomes without an entry pass null.</param>
/// <param name="coverageInfo">Supplies bin starts, ends and coverage per chromosome.</param>
/// <returns>For every chromosome in <paramref name="coverageInfo"/>, its list of segments in bin order.</returns>
public Dictionary<string, List<SegmentWithBins>> PostProcessSegments(
    GenomeSegmentationResults segmentationResults,
    PloidyInfo referencePloidy,
    Dictionary<string, List<SampleGenomicBin>> excludedIntervals,
    CoverageInfo coverageInfo)
{
    // Index every segment start as "chr:start" for IsNewSegment.
    var starts = new Dictionary<string, bool>();
    foreach (string chr in segmentationResults.SegmentByChr.Keys)
    {
        foreach (var segment in segmentationResults.SegmentByChr[chr])
        {
            starts[chr + ":" + segment.start] = true;
        }
    }

    // Segment numbering is global: it keeps counting across chromosomes.
    int segmentNum = -1;
    var segmentsByChromosome = new Dictionary<string, List<SegmentWithBins>>();
    foreach (string chr in coverageInfo.StartByChr.Keys)
    {
        var segmentsForChr = new List<SegmentWithBins>();
        segmentsByChromosome.Add(chr, segmentsForChr);
        SegmentWithBins currentSegment = null;

        // Stays null when no intervals are excluded on this chromosome.
        List<SampleGenomicBin> excludeIntervals;
        excludedIntervals.TryGetValue(chr, out excludeIntervals);

        var excludeIndex = 0; // Points to the first interval which *doesn't* end before our current position
        uint previousBinEnd = 0;
        var binStarts = coverageInfo.StartByChr[chr];
        for (int binIndex = 0; binIndex < binStarts.Length; binIndex++)
        {
            uint start = binStarts[binIndex];
            uint end = coverageInfo.EndByChr[chr][binIndex];
            bool newSegment = IsNewSegment(starts, chr, excludeIntervals, previousBinEnd, end, start, ref excludeIndex, referencePloidy);
            var bin = new Bin(start, end, coverageInfo.CoverageByChr[chr][binIndex]);

            if (newSegment)
            {
                segmentNum++;
            }

            // Also open a segment when the chromosome's first bin was not reported as a
            // segment start; in that case the current segment number is reused unchanged.
            if (newSegment || currentSegment == null)
            {
                currentSegment = new SegmentWithBins(segmentNum, bin);
                segmentsForChr.Add(currentSegment);
            }
            else
            {
                currentSegment.AddBin(bin);
            }

            previousBinEnd = end;
        }
    }

    return segmentsByChromosome;
}
/// <summary>
/// Entry point of CanvasPartition: parses the command line, validates the referenced files,
/// runs the selected segmentation method and writes the partitioned results.
/// </summary>
/// <param name="args">Command-line arguments.</param>
/// <returns>
/// 0 after a successful run or after showing the help text (also when required arguments
/// are missing); 1 when a referenced file does not exist.
/// </returns>
static int Main(string[] args)
{
    CanvasCommon.Utilities.LogCommandLine(args);
    List<string> cleanedFiles = new List<string>();
    List<string> outPartitionedFiles = new List<string>();
    List<string> vafFiles = new List<string>();
    bool needHelp = false;
    bool isGermline = false;
    string filterBedFile = null;
    string referenceFolder = null;
    string commonCNVsbedPath = null;
    string evennessMetricFile = null;
    SegmentSplitUndo undoMethod = SegmentSplitUndo.None;
    SegmentationInput.SegmentationMethod partitionMethod = SegmentationInput.SegmentationMethod.Wavelets;
    string parameterconfigPath = Path.Combine(Isas.Framework.Utilities.Utilities.GetAssemblyFolder(typeof(CanvasPartition)), "CanvasPartitionParameters.json");
    string ploidyVcfPath = null;

    OptionSet p = new OptionSet()
    {
        { "i|infile=", "input file - usually generated by CanvasClean", v => cleanedFiles.Add(v) },
        { "v|vaffile=", "variant frequencyfiles - usually generated by CanvasSNV", v => vafFiles.Add(v) },
        { "o|outfile=", "text file to output", v => outPartitionedFiles.Add(v) },
        { "m|method=", "segmentation method (Wavelets/CBS). Default: " + partitionMethod, v => partitionMethod = (SegmentationInput.SegmentationMethod)Enum.Parse(typeof(SegmentationInput.SegmentationMethod), v) },
        { "r|reference=", "folder that contains both genome.fa and GenomeSize.xml", v => referenceFolder = v },
        { "s|split=", "CBS split method (None/Prune/SDUndo). Default: " + undoMethod, v => undoMethod = (SegmentSplitUndo)Enum.Parse(typeof(SegmentSplitUndo), v) },
        { "b|bedfile=", "bed file to exclude (don't span these intervals)", v => filterBedFile = v },
        { "c|commoncnvs=", "bed file with common CNVs (always include these intervals into segmentation results)", v => commonCNVsbedPath = v },
        { "g|germline", "flag indicating that input file represents germline genome", v => isGermline = v != null },
        { $"{CommandLineOptions.EvennessMetricFile}=", "output file for evenness metric (optional)", v => evennessMetricFile = v },
        { "p|ploidyVcfFile=", "vcf file specifying reference ploidy (e.g. for sex chromosomes) (optional)", v => ploidyVcfPath = v },
        // Interpolated so the help text shows the actual default path instead of the literal placeholder.
        { "config=", $"parameter configuration path (default {parameterconfigPath})", v => parameterconfigPath = v },
        { "h|help", "show this message and exit", v => needHelp = v != null }
    };

    List<string> extraArgs = p.Parse(args);
    if (extraArgs.Any())
    {
        throw new IlluminaException($"Unknown arguments: {string.Join(",", extraArgs)}");
    }

    if (needHelp)
    {
        ShowHelp(p);
        return 0;
    }

    if (!cleanedFiles.Any() || !outPartitionedFiles.Any() || referenceFolder == null)
    {
        ShowHelp(p);
        return 0;
    }

    // Report the specific input that is missing, not the list object.
    string missingInputFile = cleanedFiles.FirstOrDefault(inFile => !File.Exists(inFile));
    if (missingInputFile != null)
    {
        Console.WriteLine("CanvasPartition.exe: File {0} does not exist! Exiting.", missingInputFile);
        return 1;
    }

    if (!string.IsNullOrEmpty(filterBedFile) && !File.Exists(filterBedFile))
    {
        Console.WriteLine("CanvasPartition.exe: File {0} does not exist! Exiting.", filterBedFile);
        return 1;
    }

    if (!File.Exists(parameterconfigPath))
    {
        Console.WriteLine($"CanvasPartition.exe: File {parameterconfigPath} does not exist! Exiting.");
        return 1;
    }

    if (!string.IsNullOrEmpty(ploidyVcfPath) && !File.Exists(ploidyVcfPath))
    {
        Console.WriteLine("CanvasPartition.exe: File {0} does not exist! Exiting.", ploidyVcfPath);
        return 1;
    }

    var parameterconfigFile = new FileLocation(parameterconfigPath);
    var canvasPartitionParameters = Deserialize<CanvasPartitionParameters>(parameterconfigFile);
    ILogger logger = new Logger(Console.Out.ToEnumerable(), Console.Error.ToEnumerable());
    var processor = new SegmentationResultsProcessor(canvasPartitionParameters.MaxInterBinDistInSegment);

    // VAF files are only paired with inputs when exactly one was given per input file;
    // otherwise every input is created without a VAF file.
    var segmentationInputs = vafFiles.Count > 0 && vafFiles.Count == cleanedFiles.Count
        ? cleanedFiles.Zip(vafFiles, (inFile, vafFile) => new SegmentationInput(inFile, vafFile, filterBedFile, referenceFolder, evennessMetricFile, logger, processor)).ToList()
        : cleanedFiles.Select(inFile => new SegmentationInput(inFile, null, filterBedFile, referenceFolder, evennessMetricFile, logger, processor)).ToList();

    GenomeSegmentationResults segmentationResults;
    PloidyInfo referencePloidy = ploidyVcfPath != null ? PloidyInfo.LoadPloidyFromVcfFileNoSampleId(ploidyVcfPath) : null;

    switch (partitionMethod)
    {
        default: // use Wavelets if CBS is not selected
            Console.WriteLine("{0} Running Wavelet Partitioning", DateTime.Now);
            var waveletsRunner = new WaveletsRunner(new WaveletsRunner.WaveletsRunnerParams(isGermline, commonCNVsbedPath,
                madFactor: canvasPartitionParameters.MadFactor,
                thresholdLowerMaf: canvasPartitionParameters.ThresholdLowerMaf,
                evennessScoreThreshold: canvasPartitionParameters.EvennessScoreThreshold, verbose: 2));
            // Single() makes this path require exactly one input and one output file.
            segmentationResults = new GenomeSegmentationResults(waveletsRunner.Run(segmentationInputs.Single(), canvasPartitionParameters.EvennessScoreWindow));
            PostProcessAndWriteResults(segmentationInputs.Single(), outPartitionedFiles.Single(), referencePloidy, segmentationResults);
            break;

        case SegmentationInput.SegmentationMethod.CBS:
        {
            Console.WriteLine("{0} Running CBS Partitioning", DateTime.Now);
            var cbsRunner = new CBSRunner(canvasPartitionParameters.MaxInterBinDistInSegment, undoMethod, canvasPartitionParameters.CBSalpha);
            var sampleSegmentations = new List<GenomeSegmentationResults>();
            foreach (var input in segmentationInputs)
            {
                var segmentation = new GenomeSegmentationResults(cbsRunner.Run(input, verbose: 2));
                sampleSegmentations.Add(segmentation);
            }

            segmentationResults = GenomeSegmentationResults.SplitOverlappingSegments(sampleSegmentations);
            foreach (var (segmentationInput, outPartitionedFile) in segmentationInputs.Zip(outPartitionedFiles))
            {
                PostProcessAndWriteResults(segmentationInput, outPartitionedFile, referencePloidy, segmentationResults);
            }
            break;
        }

        case SegmentationInput.SegmentationMethod.HMM:
        {
            Console.WriteLine("{0} Running HMM Partitioning", DateTime.Now);
            var hiddenMarkovModelsRunner = new HiddenMarkovModelsRunner(cleanedFiles.Count);
            bool isPerSample = false;
            segmentationResults = new GenomeSegmentationResults(hiddenMarkovModelsRunner.Run(segmentationInputs, isPerSample));
            for (int i = 0; i < segmentationInputs.Count; i++)
            {
                PostProcessAndWriteResults(segmentationInputs[i], outPartitionedFiles[i], referencePloidy, segmentationResults);
            }
            break;
        }

        case SegmentationInput.SegmentationMethod.PerSampleHMM:
        {
            Console.WriteLine("{0} Running Per-sample HMM Partitioning", DateTime.Now);
            var hiddenMarkovModelsRunner = new HiddenMarkovModelsRunner(1);
            var sampleSegmentations = new List<GenomeSegmentationResults>();
            bool isPerSample = true;
            foreach (var input in segmentationInputs)
            {
                var segmentation = new GenomeSegmentationResults(
                    hiddenMarkovModelsRunner.Run(input.Yield().ToList(), isPerSample));
                sampleSegmentations.Add(segmentation);
            }

            segmentationResults = GenomeSegmentationResults.SplitOverlappingSegments(sampleSegmentations);
            foreach (var (segmentationInput, outPartitionedFile) in segmentationInputs.Zip(outPartitionedFiles))
            {
                PostProcessAndWriteResults(segmentationInput, outPartitionedFile, referencePloidy, segmentationResults);
            }
            break;
        }
    }

    Console.WriteLine("{0} CanvasPartition results written out", DateTime.Now);
    return 0;
}