/// <summary>
/// Records the segmentation inputs on this instance and then reads the input file.
/// </summary>
/// <param name="inputBinPath">Stored in <c>InputBinPath</c>.</param>
/// <param name="forbiddenBedPath">Stored in <c>ForbiddenIntervalBedPath</c>.</param>
/// <param name="dataType">"logratio" (aCGH, ROMA, etc.) or "binary" (LOH); stored in <c>DataType</c>.</param>
public Segmentation(string inputBinPath, string forbiddenBedPath, string dataType = "logratio")
{
    InputBinPath = inputBinPath;
    DataType = dataType;

    // No segmentation has been run yet.
    SegmentationResults = null;
    ForbiddenIntervalBedPath = forbiddenBedPath;

    // Every member above is assigned before the input file is read.
    ReadBEDInput();
}
/// <summary>
/// CBS: circular binary segmentation, porting the R function segment in DNAcopy.
/// The significance level is taken from this.Alpha and the undo method from this.UndoMethod.
/// On return, this.SegmentationResults holds the segments found for every chromosome in
/// this.ScoreByChr (or the dummy result when no chromosome has any finite score).
/// </summary>
/// <param name="nPerm">Forwarded to GetBoundary.ComputeBoundary (when sbdry is null) and to ChangePoint.ChangePoints.</param>
/// <param name="pMethod">"hybrid" or "perm"</param>
/// <param name="minWidth">Minimum segment width; must be between 2 and 5.</param>
/// <param name="kMax">Forwarded to ChangePoint.ChangePoints; nMin must be at least 4 * kMax.</param>
/// <param name="nMin">Forwarded to ChangePoint.ChangePoints; must be at least 4 * kMax.</param>
/// <param name="eta">Forwarded to GetBoundary.ComputeBoundary when sbdry is null.</param>
/// <param name="sbdry">Precomputed boundary; computed with GetBoundary.ComputeBoundary when null.</param>
/// <param name="trim">Forwarded to ChangePoint.TrimmedVariance.</param>
/// <param name="undoPrune">Forwarded to ChangePoint.ChangePoints.</param>
/// <param name="undoSD">Forwarded to ChangePoint.ChangePoints.</param>
/// <param name="verbose">Forwarded to ChangePoint.ChangePoints.</param>
private void CBS(uint nPerm = 10000, string pMethod = "hybrid", int minWidth = 2, int kMax = 25,
    uint nMin = 200, double eta = 0.05, uint[] sbdry = null, double trim = 0.025,
    double undoPrune = 0.05, double undoSD = 3, int verbose = 1)
{
    if (minWidth < 2 || minWidth > 5)
    {
        Console.Error.WriteLine("Minimum segment width should be between 2 and 5");
        Environment.Exit(1);
    }
    if (nMin < 4 * kMax)
    {
        Console.Error.WriteLine("nMin should be >= 4 * kMax");
        Environment.Exit(1);
    }
    if (sbdry == null)
    {
        GetBoundary.ComputeBoundary(nPerm, this.Alpha, eta, out sbdry);
    }

    // Pass 1: per chromosome, find the indices of the finite scores (not NaN, -Inf, Inf)
    // and the corresponding finite score values.
    Dictionary<string, int[]> inaByChr = new Dictionary<string, int[]>();
    Dictionary<string, double[]> finiteScoresByChr = new Dictionary<string, double[]>();
    List<ThreadStart> tasks = new List<ThreadStart>();
    foreach (KeyValuePair<string, double[]> scoreByChrKVP in ScoreByChr)
    {
        tasks.Add(new ThreadStart(() =>
        {
            string chr = scoreByChrKVP.Key;
            int[] ina;
            Helper.GetFiniteIndices(scoreByChrKVP.Value, out ina); // not NaN, -Inf, Inf

            double[] scores;
            if (ina.Length == scoreByChrKVP.Value.Length)
            {
                // Everything is finite: reuse the original array instead of copying it.
                scores = scoreByChrKVP.Value;
            }
            else
            {
                Helper.ExtractValues<double>(scoreByChrKVP.Value, ina, out scores);
            }

            // Both dictionaries are guarded by the same lock.
            lock (finiteScoresByChr)
            {
                finiteScoresByChr[chr] = scores;
                inaByChr[chr] = ina;
            }
        }));
    }
    //Parallel.ForEach(tasks, t => { t.Invoke(); });
    Isas.Shared.Utilities.DoWorkParallelThreads(tasks);

    // Quick sanity-check: If we don't have any segments, then return a dummy result.
    int n = 0;
    foreach (var list in finiteScoresByChr.Values)
    {
        n += list.Length;
    }
    if (n == 0)
    {
        this.SegmentationResults = this.GetDummySegmentationResults();
        return;
    }

    double trimmedSD = Math.Sqrt(ChangePoint.TrimmedVariance(finiteScoresByChr, trim: trim));

    Dictionary<string, Segment[]> segmentByChr = new Dictionary<string, Segment[]>();

    // when parallelizing we need an RNG for each chromosome to get deterministic results
    Random seedGenerator = new MersenneTwister(0);
    Dictionary<string, Random> perChromosomeRandom = new Dictionary<string, Random>();
    foreach (string chr in this.ScoreByChr.Keys)
    {
        perChromosomeRandom[chr] = new MersenneTwister(seedGenerator.NextFullRangeInt32(), true);
    }

    // Pass 2: segment each chromosome and translate the segments to genomic coordinates.
    tasks = new List<ThreadStart>();
    foreach (string chr in ScoreByChr.Keys)
    {
        tasks.Add(new ThreadStart(() =>
        {
            int[] ina = inaByChr[chr];
            int[] lengthSeg;
            double[] segmentMeans;

            // Segment the finite scores only. The segment lengths returned here are mapped
            // back to bins through ina below, so they must be expressed in finite-score
            // index space. When every score is finite this is the same array instance as
            // this.ScoreByChr[chr].
            ChangePoint.ChangePoints(finiteScoresByChr[chr], sbdry, out lengthSeg, out segmentMeans,
                perChromosomeRandom[chr], dataType: this.DataType, alpha: this.Alpha, nPerm: nPerm,
                pMethod: pMethod, minWidth: minWidth, kMax: kMax, nMin: nMin, trimmedSD: trimmedSD,
                undoSplits: this.UndoMethod, undoPrune: undoPrune, undoSD: undoSD, verbose: verbose);

            Segment[] segments = new Segment[lengthSeg.Length];
            int cs1 = 0, cs2 = -1; // cumulative sums: first / last finite-score index of the segment
            for (int i = 0; i < lengthSeg.Length; i++)
            {
                cs2 += lengthSeg[i];
                int start = ina[cs1]; // bin index of the segment's first finite score
                int end = ina[cs2];   // bin index of the segment's last finite score
                segments[i] = new Segment();
                segments[i].start = this.StartByChr[chr][start]; // Genomic start
                segments[i].end = this.EndByChr[chr][end]; // Genomic end
                segments[i].nMarkers = lengthSeg[i];
                segments[i].mean = segmentMeans[i];
                cs1 += lengthSeg[i];
            }

            lock (segmentByChr)
            {
                segmentByChr[chr] = segments;
            }
        }));
    }
    //Parallel.ForEach(tasks, t => { t.Invoke(); });
    Isas.Shared.Utilities.DoWorkParallelThreads(tasks);

    this.SegmentationResults = new GenomeSegmentationResults(segmentByChr);
}
/// <summary>
/// Wavelets: unbalanced HAAR wavelets segmentation.
/// On return, this.SegmentationResults holds the segments found for every chromosome in
/// this.ScoreByChr (or the dummy result when no chromosome has any finite score).
/// </summary>
/// <param name="isGermline">Forwarded to WaveletSegmentation.HaarWavelets.</param>
/// <param name="thresholdLower">Lower wavelets coefficient threshold, forwarded to WaveletSegmentation.HaarWavelets.</param>
/// <param name="thresholdUpper">Upper wavelets coefficient threshold, forwarded to WaveletSegmentation.HaarWavelets.</param>
/// <param name="minSize">Chromosomes with no more than this many scores are not passed to HaarWavelets and yield a single segment.</param>
/// <param name="verbose">Not used by this method.</param>
private void Wavelets(bool isGermline, double thresholdLower = 5, double thresholdUpper = 80, int minSize = 10, int verbose = 1)
{
    // Pass 1: collect the finite scores (not NaN, -Inf, Inf) per chromosome. They are only
    // used for the sanity check below; the segmentation itself runs on this.ScoreByChr.
    Dictionary<string, double[]> finiteScoresByChr = new Dictionary<string, double[]>();
    List<ThreadStart> tasks = new List<ThreadStart>();
    foreach (KeyValuePair<string, double[]> scoreByChrKVP in ScoreByChr)
    {
        tasks.Add(new ThreadStart(() =>
        {
            string chr = scoreByChrKVP.Key;
            int[] ina;
            Helper.GetFiniteIndices(scoreByChrKVP.Value, out ina); // not NaN, -Inf, Inf

            double[] scores;
            if (ina.Length == scoreByChrKVP.Value.Length)
            {
                scores = scoreByChrKVP.Value;
            }
            else
            {
                Helper.ExtractValues<double>(scoreByChrKVP.Value, ina, out scores);
            }

            lock (finiteScoresByChr)
            {
                finiteScoresByChr[chr] = scores;
            }
        }));
    }
    Isas.Shared.Utilities.DoWorkParallelThreads(tasks);

    // Quick sanity-check: If we don't have any segments, then return a dummy result.
    int n = 0;
    foreach (var list in finiteScoresByChr.Values)
    {
        n += list.Length;
    }
    if (n == 0)
    {
        this.SegmentationResults = this.GetDummySegmentationResults();
        return;
    }

    Dictionary<string, Segment[]> segmentByChr = new Dictionary<string, Segment[]>();

    // Pass 2: segment each chromosome independently.
    tasks = new List<ThreadStart>();
    foreach (string chr in ScoreByChr.Keys)
    {
        tasks.Add(new ThreadStart(() =>
        {
            double[] chrScores = this.ScoreByChr[chr];
            int sizeScoreByChr = chrScores.Length;

            // Breakpoints are only searched for on chromosomes larger than minSize;
            // HaarWavelets is handed a copy of the scores.
            List<int> breakpoints = new List<int>();
            if (sizeScoreByChr > minSize)
            {
                WaveletSegmentation.HaarWavelets(chrScores.ToArray(), thresholdLower, thresholdUpper, breakpoints, isGermline);
            }

            List<int> startBreakpointsPos = new List<int>();
            List<int> endBreakpointPos = new List<int>();
            List<int> lengthSeg = new List<int>();

            // NOTE(review): the lengths below are end - start (one less than the number of
            // bins spanned), and the first length ignores breakpoints[0]. Because the mean
            // windows further down are accumulated from these lengths, they can drift from
            // the start/end positions. Left unchanged because the inclusive/exclusive
            // meaning of Helper.WeightedAverage's iEnd is not visible here - confirm.
            if (breakpoints.Count >= 2)
            {
                // breakpoints can only be populated when sizeScoreByChr > minSize, so no
                // further size check is needed here.
                startBreakpointsPos.Add(breakpoints[0]);
                endBreakpointPos.Add(breakpoints[1] - 1);
                lengthSeg.Add(breakpoints[1] - 1);
                for (int i = 1; i < breakpoints.Count - 1; i++)
                {
                    startBreakpointsPos.Add(breakpoints[i]);
                    endBreakpointPos.Add(breakpoints[i + 1] - 1);
                    lengthSeg.Add(breakpoints[i + 1] - 1 - breakpoints[i]);
                }
                startBreakpointsPos.Add(breakpoints[breakpoints.Count - 1]);
                endBreakpointPos.Add(sizeScoreByChr - 1);
                lengthSeg.Add(sizeScoreByChr - breakpoints[breakpoints.Count - 1] - 1);
            }
            else
            {
                // Fewer than two breakpoints: the whole chromosome is one segment.
                startBreakpointsPos.Add(0);
                endBreakpointPos.Add(sizeScoreByChr - 1);
                lengthSeg.Add(sizeScoreByChr - 1);
            }

            // estimate segment means
            double[] segmentMeans = new double[lengthSeg.Count];
            int ss = 0, ee = 0;
            for (int i = 0; i < lengthSeg.Count; i++)
            {
                ee += lengthSeg[i];
                // Works even if weights == null
                segmentMeans[i] = Helper.WeightedAverage(chrScores, null, iStart: ss, iEnd: ee);
                ss = ee;
            }

            // Translate bin indices to genomic coordinates.
            Segment[] segments = new Segment[startBreakpointsPos.Count];
            for (int i = 0; i < startBreakpointsPos.Count; i++)
            {
                int start = startBreakpointsPos[i];
                int end = endBreakpointPos[i];
                segments[i] = new Segment();
                segments[i].start = this.StartByChr[chr][start]; // Genomic start
                segments[i].end = this.EndByChr[chr][end]; // Genomic end
                segments[i].nMarkers = lengthSeg[i];
                segments[i].mean = segmentMeans[i];
            }

            lock (segmentByChr)
            {
                segmentByChr[chr] = segments;
            }
        }));
    }

    Console.WriteLine("{0} Launching wavelet tasks", DateTime.Now);
    Isas.Shared.Utilities.DoWorkParallelThreads(tasks);
    Console.WriteLine("{0} Completed wavelet tasks", DateTime.Now);
    this.SegmentationResults = new GenomeSegmentationResults(segmentByChr);
    Console.WriteLine("{0} Segmentation results complete", DateTime.Now);
}
/// <summary>
/// Builds a segmentation result backed by an empty chromosome-to-segments dictionary.
/// </summary>
/// <returns>A new GenomeSegmentationResults containing no chromosomes.</returns>
private GenomeSegmentationResults GetDummySegmentationResults()
{
    var noSegments = new Dictionary<string, Segment[]>();
    return new GenomeSegmentationResults(noSegments);
}
public void PostProcessSegmentsTests()
{
    var processor = new SegmentationResultsProcessor(100);

    // Three input segments on chr1.
    var chr1Segments = new[]
    {
        new SegmentationInput.Segment { start = 1, end = 1000 },
        new SegmentationInput.Segment { start = 1100, end = 4500 },
        new SegmentationInput.Segment { start = 4600, end = 5000 },
    };
    var segmentsByChrom = new Dictionary<string, SegmentationInput.Segment[]>
    {
        { "chr1", chr1Segments },
    };
    var segmentationResults = new GenomeSegmentationResults(segmentsByChrom);

    var ploidyInfo = new PloidyInfo();
    var excludedIntervals = new Dictionary<string, List<SampleGenomicBin>>();

    // Six bins on chr1 with their coverage, start and end coordinates.
    var coverageInfo = new CoverageInfo();
    coverageInfo.CoverageByChr = new Dictionary<string, double[]>
    {
        { "chr1", new double[] { 10, 10, 50, 100, 25, 10 } },
    };
    coverageInfo.StartByChr = new Dictionary<string, uint[]>
    {
        { "chr1", new uint[] { 100, 600, 1200, 1300, 4001, 5000 } },
    };
    coverageInfo.EndByChr = new Dictionary<string, uint[]>
    {
        { "chr1", new uint[] { 500, 890, 1299, 4000, 4500, 5050 } },
    };

    // Case 1: no excluded intervals.
    var results = processor.PostProcessSegments(segmentationResults, ploidyInfo, excludedIntervals, coverageInfo);
    var chr1Results = results["chr1"];
    Assert.Equal(3, chr1Results.Count);
    // Output segments take the boundaries of the bins they contain
    // (in practice the two probably agree, but the test is deliberately theoretical).
    SegmentTestHelpers.CheckSegment(chr1Results[0], 100, 890, 10, 2);
    SegmentTestHelpers.CheckSegment(chr1Results[1], 1200, 4500, 50, 3);
    SegmentTestHelpers.CheckSegment(chr1Results[2], 5000, 5050, 10, 1); // Bin runs past the segment end - still kept (?)

    // Case 2: a forbidden zone between two bins of the same original segment splits that segment.
    excludedIntervals.Add("chr1", new List<SampleGenomicBin> { GetForbiddenZone("chr1", 525, 575) }); // Mid = 550, between the bins of the first segment
    results = processor.PostProcessSegments(segmentationResults, ploidyInfo, excludedIntervals, coverageInfo);
    chr1Results = results["chr1"];
    Assert.Equal(4, chr1Results.Count);
    SegmentTestHelpers.CheckSegment(chr1Results[0], 100, 500, 10, 1);
    SegmentTestHelpers.CheckSegment(chr1Results[1], 600, 890, 10, 1);
    SegmentTestHelpers.CheckSegment(chr1Results[2], 1200, 4500, 50, 3);
    SegmentTestHelpers.CheckSegment(chr1Results[3], 5000, 5050, 10, 1); // Bin runs past the segment end - still kept (?)

    // Case 3: the forbidden zone midpoint lies inside the second bin. Apparently this is presumed
    // never to happen because it would already have been handled upstream.
    // With the Debug Asserts enabled this fails the test; otherwise it is counted as a new bin.
    excludedIntervals.Clear();
    excludedIntervals["chr1"] = new List<SampleGenomicBin> { GetForbiddenZone("chr1", 585, 635) }; // Mid = 610, in second bin
    results = processor.PostProcessSegments(segmentationResults, ploidyInfo, excludedIntervals, coverageInfo);
    chr1Results = results["chr1"];
    Assert.Equal(4, chr1Results.Count);
    SegmentTestHelpers.CheckSegment(chr1Results[0], 100, 500, 10, 1);
    SegmentTestHelpers.CheckSegment(chr1Results[1], 600, 890, 10, 1);
    SegmentTestHelpers.CheckSegment(chr1Results[2], 1200, 4500, 50, 3);
    SegmentTestHelpers.CheckSegment(chr1Results[3], 5000, 5050, 10, 1); // Bin runs past the segment end - still kept (?)

    // Case 4: the forbidden zone midpoint lies inside the first bin although the zone ends between
    // bins. Apparently also presumed never to happen. Note the asymmetry with case 3.
    excludedIntervals.Clear();
    excludedIntervals["chr1"] = new List<SampleGenomicBin> { GetForbiddenZone("chr1", 465, 515) }; // Mid = 490, in first bin of first segment
    results = processor.PostProcessSegments(segmentationResults, ploidyInfo, excludedIntervals, coverageInfo);
    chr1Results = results["chr1"];
    // The assertions below would fail because of that asymmetry. What do we want?
    // Left disabled for now so as not to change the behavior in this (unrelated) feature addition.
    //Assert.Equal(4, chr1Results.Count);
    //SegmentTestHelpers.CheckSegment(chr1Results[0], 100, 500, 10, 1);
    //SegmentTestHelpers.CheckSegment(chr1Results[1], 600, 890, 10, 1);
    //SegmentTestHelpers.CheckSegment(chr1Results[2], 1200, 4500, 50, 3);
    //SegmentTestHelpers.CheckSegment(chr1Results[3], 5000, 5050, 10, 1); // Bin runs past the segment end - still kept (?)

    // TODO test where no segment covers bins?
    // TODO overlapping segments or bins?
    // TODO bin starts before segment
    // TODO test interbin dist
}