/// <summary>
/// Estimates the variance of the scores from their first differences, treating the
/// per-chromosome arrays as one concatenated sequence (in dictionary enumeration order).
/// The absolute differences are sorted, the smallest <c>round((1 - 2 * trim) * (n - 1))</c>
/// are kept, and the result is
/// <c>InflationFactor(trim) * sum(kept^2) / (2 * nKeep)</c>, mirroring the R expression
/// quoted in the body.
/// </summary>
/// <param name="scoresByChr">Scores keyed by chromosome; empty arrays are skipped.</param>
/// <param name="trim">Fraction used to derive how many of the sorted differences are kept.</param>
/// <returns>
/// The trimmed variance estimate, or <see cref="Double.NaN"/> when fewer than two scores
/// are supplied (there is no first difference to work with).
/// </returns>
public static double TrimmedVariance(IDictionary<string, double[]> scoresByChr, double trim = 0.025)
{
    int n = 0;
    foreach (double[] scores in scoresByChr.Values)
    {
        n += scores.Length;
    }

    // With fewer than two scores there are no differences; allocating new double[n - 1]
    // would throw for n == 0 and the final division would be 0 / 0 for n == 1.
    if (n < 2)
    {
        return Double.NaN;
    }

    double[] diff = new double[n - 1];
    int i = 0;
    bool hasPrevious = false; // true once at least one score has been consumed
    double last = Double.NaN;
    foreach (double[] scores in scoresByChr.Values)
    {
        if (scores.Length <= 0)
        {
            continue;
        }

        // Difference across the chromosome boundary. This must be keyed on "a previous score
        // exists" rather than on i > 0: a leading chromosome holding a single score
        // contributes no within-chromosome difference, so i would still be 0 here.
        if (hasPrevious)
        {
            diff[i] = scores[0] - last;
            i++;
        }

        Array.Copy(Helper.Diff(scores), 0, diff, i, scores.Length - 1);
        i += scores.Length - 1;
        last = scores[scores.Length - 1];
        hasPrevious = true;
    }

    int nKeep = Convert.ToInt32(Math.Round((1 - 2 * trim) * (n - 1)));

    // R code: inflfact(trim)*sum((sort(abs(diff(genomdat)))[1:n.keep])^2 / (2*n.keep))
    Helper.InplaceAbs(diff);
    Array.Sort(diff);
    return ChangePoint.InflationFactor(trim) * Helper.PartialSumOfPowers(diff, 2, 0, nKeep) / (2 * nKeep);
}
/// <summary>
/// CBS: circular binary segmentation, porting the R function <c>segment</c> in DNAcopy.
/// The significance level is read from <c>this._alpha</c> and the split-undo method from
/// <c>this._undoMethod</c>; neither is a parameter of this method.
/// Non-finite scores (NaN, -Inf, Inf) are dropped before segmentation, and segment
/// boundaries are mapped back to the original bin indices afterwards.
/// Writes two progress lines to standard output on completion. Invalid
/// <paramref name="minWidth"/> / <paramref name="nMin"/> values terminate the process
/// via <c>Environment.Exit(1)</c>.
/// </summary>
/// <param name="segmentation">Source of the per-chromosome scores (<c>ScoreByChr</c>) and bin coordinates (<c>StartByChr</c>, <c>EndByChr</c>).</param>
/// <param name="nPerm">Passed to the boundary computation and to <c>ChangePoint.ChangePoints</c>.</param>
/// <param name="pMethod">"hybrid" or "perm"</param>
/// <param name="minWidth">Must be between 2 and 5 inclusive.</param>
/// <param name="kMax">Passed to <c>ChangePoint.ChangePoints</c>; <paramref name="nMin"/> must be at least 4 * kMax.</param>
/// <param name="nMin">Passed to <c>ChangePoint.ChangePoints</c>.</param>
/// <param name="eta">Used only when <paramref name="sbdry"/> is null, to compute the boundary.</param>
/// <param name="sbdry">Precomputed boundary; computed with <c>GetBoundary.ComputeBoundary</c> when null.</param>
/// <param name="trim">Trim fraction for <c>ChangePoint.TrimmedVariance</c>.</param>
/// <param name="undoPrune">Passed to <c>ChangePoint.ChangePoints</c>.</param>
/// <param name="undoSD">Passed to <c>ChangePoint.ChangePoints</c>.</param>
/// <param name="verbose">Passed to <c>ChangePoint.ChangePoints</c>.</param>
/// <returns>
/// Segments keyed by chromosome. The dictionary is empty when no chromosome has any finite
/// score; a chromosome without finite scores maps to an empty array.
/// </returns>
public Dictionary<string, Segmentation.Segment[]> Run(Segmentation segmentation, uint nPerm = 10000, string pMethod = "hybrid", int minWidth = 2, int kMax = 25, uint nMin = 200, double eta = 0.05, uint[] sbdry = null, double trim = 0.025, double undoPrune = 0.05, double undoSD = 3, int verbose = 1)
{
    if (minWidth < 2 || minWidth > 5)
    {
        Console.Error.WriteLine("Minimum segment width should be between 2 and 5");
        Environment.Exit(1);
    }
    if (nMin < 4 * kMax)
    {
        Console.Error.WriteLine("nMin should be >= 4 * kMax");
        Environment.Exit(1);
    }
    if (sbdry == null)
    {
        GetBoundary.ComputeBoundary(nPerm, this._alpha, eta, out sbdry);
    }

    // Phase 1: per chromosome, find the indices of the finite scores and extract those scores.
    Dictionary<string, int[]> inaByChr = new Dictionary<string, int[]>();
    Dictionary<string, double[]> finiteScoresByChr = new Dictionary<string, double[]>();
    List<ThreadStart> tasks = new List<ThreadStart>();
    foreach (KeyValuePair<string, double[]> scoreByChrKVP in segmentation.ScoreByChr)
    {
        tasks.Add(new ThreadStart(() =>
        {
            string chr = scoreByChrKVP.Key;
            int[] ina;
            Helper.GetFiniteIndices(scoreByChrKVP.Value, out ina); // not NaN, -Inf, Inf

            double[] scores;
            if (ina.Length == scoreByChrKVP.Value.Length)
            {
                // Everything is finite: reuse the original array instead of copying.
                scores = scoreByChrKVP.Value;
            }
            else
            {
                Helper.ExtractValues<double>(scoreByChrKVP.Value, ina, out scores);
            }

            // Both dictionaries are guarded by the same lock while they are being filled.
            lock (finiteScoresByChr)
            {
                finiteScoresByChr[chr] = scores;
                inaByChr[chr] = ina;
            }
        }));
    }
    Parallel.ForEach(tasks, task => task.Invoke());

    // Quick sanity-check: If we don't have any segments, then return a dummy result.
    int n = 0;
    foreach (var list in finiteScoresByChr.Values)
    {
        n += list.Length;
    }
    if (n == 0)
    {
        return new Dictionary<string, Segmentation.Segment[]>();
    }

    double trimmedSD = Math.Sqrt(ChangePoint.TrimmedVariance(finiteScoresByChr, trim: trim));
    Dictionary<string, Segmentation.Segment[]> segmentByChr = new Dictionary<string, Segmentation.Segment[]>();

    // when parallelizing we need an RNG for each chromosome to get deterministic results
    Random seedGenerator = new MersenneTwister(0);
    Dictionary<string, Random> perChromosomeRandom = new Dictionary<string, Random>();
    foreach (string chr in segmentation.ScoreByChr.Keys)
    {
        perChromosomeRandom[chr] = new MersenneTwister(seedGenerator.NextFullRangeInt32(), true);
    }

    // Phase 2: segment each chromosome independently.
    tasks = new List<ThreadStart>();
    foreach (string chr in segmentation.ScoreByChr.Keys)
    {
        tasks.Add(new ThreadStart(() =>
        {
            int[] ina = inaByChr[chr];
            double[] finiteScores = finiteScoresByChr[chr];
            Segmentation.Segment[] segments;

            if (finiteScores.Length == 0)
            {
                // Nothing to segment; running the algorithm would yield one zero-length
                // segment whose end index (ina[-1]) does not exist.
                segments = new Segmentation.Segment[0];
            }
            else
            {
                int[] lengthSeg;
                double[] segmentMeans;

                // Segment the FINITE scores: the segment lengths returned below are translated
                // through ina, which only indexes finite entries. Feeding the raw scores would
                // let NaN/Inf into the statistics and make the cumulative lengths overrun ina.
                ChangePoint.ChangePoints(finiteScores, sbdry, out lengthSeg, out segmentMeans,
                    perChromosomeRandom[chr], dataType: "logratio", alpha: this._alpha,
                    nPerm: nPerm, pMethod: pMethod, minWidth: minWidth, kMax: kMax, nMin: nMin,
                    trimmedSD: trimmedSD, undoSplits: this._undoMethod, undoPrune: undoPrune,
                    undoSD: undoSD, verbose: verbose);

                segments = new Segmentation.Segment[lengthSeg.Length];
                int cs1 = 0, cs2 = -1; // cumulative sums: first / last finite index of the segment
                for (int i = 0; i < lengthSeg.Length; i++)
                {
                    cs2 += lengthSeg[i];
                    int start = ina[cs1];
                    int end = ina[cs2];
                    segments[i] = new Segmentation.Segment();
                    segments[i].start = segmentation.StartByChr[chr][start]; // Genomic start
                    segments[i].end = segmentation.EndByChr[chr][end]; // Genomic end
                    cs1 += lengthSeg[i];
                }
            }

            lock (segmentByChr)
            {
                segmentByChr[chr] = segments;
            }
        }));
    }
    Parallel.ForEach(tasks, task => task.Invoke());

    Console.WriteLine("{0} Completed CBS tasks", DateTime.Now);
    Console.WriteLine("{0} Segmentation results complete", DateTime.Now);
    return segmentByChr;
}
/// <summary>
/// Recursively splits <paramref name="genomeData"/> into segments by repeatedly calling
/// <c>ChangePoint.FindChangePoints</c> on the last unfinished segment.
/// Outputs:
///     lengthSeg
///     segmentMeans
/// </summary>
/// <param name="genomeData">The values to segment.</param>
/// <param name="sbdry">Passed through to <c>FindChangePoints</c>.</param>
/// <param name="lengthSeg">segment lengths</param>
/// <param name="segmentMeans">segment means</param>
/// <param name="rnd">Random number generator passed through to <c>FindChangePoints</c>.</param>
/// <param name="dataType">"logratio" or "binary"</param>
/// <param name="alpha">Passed through to <c>FindChangePoints</c>.</param>
/// <param name="nPerm">Passed through to <c>FindChangePoints</c>.</param>
/// <param name="pMethod">When "hybrid", the hybrid flag is set for segments longer than <paramref name="nMin"/>.</param>
/// <param name="minWidth">Segments shorter than 2 * minWidth are never split.</param>
/// <param name="kMax">Used for the hybrid delta, (kMax + 1) / segment length.</param>
/// <param name="nMin">Segment length above which the hybrid method is used.</param>
/// <param name="trimmedSD">When &lt;= 0, replaced by MAD(diff(genomeData)) / sqrt(2). Used by the SDUndo step.</param>
/// <param name="undoSplits">"none" or "prune" or "sdundo"</param>
/// <param name="undoPrune">Cutoff passed to <c>ChangePointsPrune</c>.</param>
/// <param name="undoSD">Threshold passed to <c>ChangePointsSDUndo</c>.</param>
/// <param name="verbose">Values &gt;= 3 print progress to standard output.</param>
/// <param name="nGrid">Passed through to <c>FindChangePoints</c>.</param>
/// <param name="tol">Passed through to <c>FindChangePoints</c>.</param>
/// <exception cref="InvalidOperationException">
/// <c>FindChangePoints</c> reported a number of change points other than 0, 1 or 2.
/// </exception>
public static void ChangePoints(double[] genomeData, uint[] sbdry, out int[] lengthSeg, out double[] segmentMeans, Random rnd, string dataType = "logratio", double alpha = 0.01, uint nPerm = 10000, string pMethod = "hybrid", int minWidth = 2, int kMax = 25, uint nMin = 200, double trimmedSD = -1, SegmentSplitUndo undoSplits = SegmentSplitUndo.None, double undoPrune = 0.05, double undoSD = 3, int verbose = 1, int nGrid = 100, double tol = 1E-6)
{
    if (trimmedSD <= 0)
    {
        trimmedSD = Helper.MedianAbsoluteDeviation(Helper.Diff(genomeData)) / Math.Sqrt(2);
    }

    // start with the whole
    // segEnd is a work list of boundaries; the last two entries delimit the segment under test.
    List<int> segEnd = new List<int>();
    segEnd.Add(0); // inclusive
    segEnd.Add(genomeData.Length); // exclusive
    int k = segEnd.Count;
    List<int> changeLocations = new List<int>();
    int nChangePoints = 0;
    int[] iChangePoint = null;

    while (k > 1)
    {
        int segmentStart = segEnd[k - 2];
        int segmentStop = segEnd[k - 1];
        int currentN = segmentStop - segmentStart;
        if (verbose >= 3)
        {
            Console.Write(".... current segment: {0} - {1} \n", segmentStart + 1, segmentStop);
        }

        if (currentN >= 2 * minWidth)
        {
            double[] currentGenomeData = new double[currentN];
            Array.Copy(genomeData, segmentStart, currentGenomeData, 0, currentN);

            // check whether hybrid method needs to be used
            bool hybrid = false;
            double delta = 0.0;
            if (pMethod.Equals("hybrid") && nMin < currentN)
            {
                hybrid = true;
                delta = (kMax + 1.0) / currentN;
            }

            // if all values of current.genomdat are the same don't segment
            if (currentGenomeData.Max() == currentGenomeData.Min())
            {
                nChangePoints = 0;
            }
            else
            {
                // centering the current data will save a lot of computations later
                double currentAverage = currentGenomeData.Average();
                Helper.InplaceSub(currentGenomeData, currentAverage);
                // need total sum of squares too
                double currentTSS = Helper.WeightedSumOfSquares(currentGenomeData, null);
                ChangePoint.FindChangePoints(currentGenomeData, currentTSS, nPerm, alpha,
                    out nChangePoints, out iChangePoint, dataType.Equals("binary"), hybrid,
                    minWidth, kMax, delta, nGrid, sbdry, tol, rnd);
            }
        }
        else
        {
            nChangePoints = 0;
        }

        // Save the change location
        // segEnd[k - 1] will be removed when nChangePoints == 0
        if (nChangePoints == 0)
        {
            changeLocations.Add(segmentStop);
        }

        // Offset iChangePoint by segEnd[k - 2]: FindChangePoints saw only the copied slice.
        for (int i = 0; i < nChangePoints; i++)
        {
            iChangePoint[i] += segmentStart;
        }

        switch (nChangePoints) // switch by the number of change points
        {
            case 0: // no change point
                segEnd.RemoveAt(k - 1); // Remove the last element
                break;
            case 1: // one change point
                segEnd.Insert(k - 1, iChangePoint[0]);
                break;
            case 2: // two change points
                segEnd.InsertRange(k - 1, iChangePoint);
                break;
            default:
                // Merely logging here would leave segEnd untouched, so the loop would retry
                // the very same segment, potentially forever. Fail loudly instead.
                throw new InvalidOperationException("There should be 0, 1, or 2 change points");
        }

        k = segEnd.Count;
        if (verbose >= 3)
        {
            Console.Write(".... segments to go: {0} \n", String.Join(" ", segEnd));
        }
    }

    // Finished segments were recorded from the right end first; put them in ascending order.
    changeLocations.Reverse(); // changeLocations is no longer needed
    List<int> segEnds = changeLocations;
    int nSeg = segEnds.Count;
    segEnds.Insert(0, 0);
    lengthSeg = Helper.Diff(segEnds.ToArray());

    if (nSeg > 1)
    {
        if (undoSplits == SegmentSplitUndo.Prune)
        {
            lengthSeg = ChangePointsPrune(genomeData, lengthSeg, changeCutoff: undoPrune);
        }
        if (undoSplits == SegmentSplitUndo.SDUndo)
        {
            lengthSeg = ChangePointsSDUndo(genomeData, lengthSeg, trimmedSD, changeSD: undoSD);
        }
    }

    segmentMeans = new double[lengthSeg.Length];
    int ll = 0, uu = 0;
    for (int i = 0; i < lengthSeg.Length; i++)
    {
        uu += lengthSeg[i];
        // Works even if weights == null
        segmentMeans[i] = Helper.WeightedAverage(genomeData, null, iStart: ll, iEnd: uu);
        ll = uu;
    }
}