예제 #1
0
 /// <summary>
 /// Stores the segmentation settings and then reads the input through ReadBEDInput.
 /// SegmentationResults starts out as null.
 /// </summary>
 /// <param name="inputBinPath">Stored in InputBinPath.</param>
 /// <param name="forbiddenBedPath">Stored in ForbiddenIntervalBedPath.</param>
 /// <param name="dataType">"logratio" (aCGH, ROMA, etc.) or "binary" (LOH)</param>
 public Segmentation(string inputBinPath, string forbiddenBedPath, string dataType = "logratio")
 {
     InputBinPath = inputBinPath;
     ForbiddenIntervalBedPath = forbiddenBedPath;
     DataType = dataType;
     SegmentationResults = null;

     // Read the input file last, once the fields above have been assigned.
     ReadBEDInput();
 }
예제 #2
0
        /// <summary>
        /// CBS: circular binary segmentation, a port of the R function "segment" in DNAcopy.
        /// The significance level and the undo method are read from this.Alpha and this.UndoMethod
        /// instead of being passed in. The outcome is written to this.SegmentationResults.
        /// An invalid minWidth or nMin writes a message to stderr and exits the process with code 1.
        /// </summary>
        /// <param name="nPerm">Forwarded to GetBoundary.ComputeBoundary and to ChangePoint.ChangePoints.</param>
        /// <param name="pMethod">"hybrid" or "perm"</param>
        /// <param name="minWidth">Minimum segment width; must be between 2 and 5.</param>
        /// <param name="kMax">Forwarded to ChangePoint.ChangePoints; nMin must be at least 4 * kMax.</param>
        /// <param name="nMin">Forwarded to ChangePoint.ChangePoints; must be at least 4 * kMax.</param>
        /// <param name="eta">Used only when sbdry has to be computed here.</param>
        /// <param name="sbdry">Boundary array; when null it is computed with GetBoundary.ComputeBoundary.</param>
        /// <param name="trim">Forwarded as the trim argument of ChangePoint.TrimmedVariance.</param>
        /// <param name="undoPrune">Forwarded to ChangePoint.ChangePoints.</param>
        /// <param name="undoSD">Forwarded to ChangePoint.ChangePoints.</param>
        /// <param name="verbose">Forwarded to ChangePoint.ChangePoints.</param>
        private void CBS(uint nPerm = 10000, string pMethod = "hybrid", int minWidth = 2, int kMax = 25,
            uint nMin = 200, double eta = 0.05, uint[] sbdry = null, double trim = 0.025,
            double undoPrune = 0.05, double undoSD = 3, int verbose = 1)
        {
            // --- Argument checks -------------------------------------------------------------
            bool widthInRange = minWidth >= 2 && minWidth <= 5;
            if (!widthInRange)
            {
                Console.Error.WriteLine("Minimum segment width should be between 2 and 5");
                Environment.Exit(1);
            }
            if (nMin < 4 * kMax)
            {
                Console.Error.WriteLine("nMin should be >= 4 * kMax");
                Environment.Exit(1);
            }
            if (sbdry == null)
            {
                GetBoundary.ComputeBoundary(nPerm, this.Alpha, eta, out sbdry);
            }

            // --- Phase 1: per chromosome, find the finite scores and their indices -------------
            var finiteIndicesByChr = new Dictionary<string, int[]>();
            var finiteScoresByChr = new Dictionary<string, double[]>();

            var filterJobs = new List<ThreadStart>();
            foreach (KeyValuePair<string, double[]> entry in ScoreByChr)
            {
                filterJobs.Add(() =>
                {
                    double[] rawScores = entry.Value;

                    int[] finiteIndices; // positions that are not NaN, -Inf, Inf
                    Helper.GetFiniteIndices(rawScores, out finiteIndices);

                    // Reuse the original array when nothing had to be filtered out.
                    double[] finiteScores = rawScores;
                    if (finiteIndices.Length != rawScores.Length)
                    {
                        Helper.ExtractValues<double>(rawScores, finiteIndices, out finiteScores);
                    }

                    // One lock guards both dictionaries; they are always written together.
                    lock (finiteScoresByChr)
                    {
                        finiteScoresByChr[entry.Key] = finiteScores;
                        finiteIndicesByChr[entry.Key] = finiteIndices;
                    }
                });
            }
            Isas.Shared.Utilities.DoWorkParallelThreads(filterJobs);

            // Without a single finite score there is nothing to segment: return a dummy result.
            int finiteCount = 0;
            foreach (double[] values in finiteScoresByChr.Values)
            {
                finiteCount += values.Length;
            }
            if (finiteCount == 0)
            {
                this.SegmentationResults = this.GetDummySegmentationResults();
                return;
            }

            double trimmedSD = Math.Sqrt(ChangePoint.TrimmedVariance(finiteScoresByChr, trim: trim));
            var segmentByChr = new Dictionary<string, Segment[]>();

            // Each chromosome gets its own generator, seeded here in key order from one master
            // generator, so that the parallel run below stays deterministic.
            Random seedSource = new MersenneTwister(0);
            var rngByChr = new Dictionary<string, Random>();
            foreach (string chr in this.ScoreByChr.Keys)
            {
                rngByChr[chr] = new MersenneTwister(seedSource.NextFullRangeInt32(), true);
            }

            // --- Phase 2: change-point detection per chromosome --------------------------------
            var segmentJobs = new List<ThreadStart>();
            foreach (string chr in ScoreByChr.Keys)
            {
                segmentJobs.Add(() =>
                {
                    int[] finiteIndices = finiteIndicesByChr[chr];

                    // NOTE(review): ChangePoints is given the unfiltered scores, while the segment
                    // boundaries below are looked up through the finite-index table. Confirm that
                    // the lengths it returns count finite values only.
                    int[] lengthSeg;
                    double[] segmentMeans;
                    ChangePoint.ChangePoints(this.ScoreByChr[chr], sbdry, out lengthSeg, out segmentMeans, rngByChr[chr],
                        dataType: this.DataType, alpha: this.Alpha, nPerm: nPerm,
                        pMethod: pMethod, minWidth: minWidth, kMax: kMax, nMin: nMin, trimmedSD: trimmedSD,
                        undoSplits: this.UndoMethod, undoPrune: undoPrune, undoSD: undoSD, verbose: verbose);

                    var segments = new Segment[lengthSeg.Length];
                    int offset = 0; // running sum of the segment lengths seen so far
                    for (int i = 0; i < segments.Length; i++)
                    {
                        int markerCount = lengthSeg[i];
                        int firstBin = finiteIndices[offset];
                        int lastBin = finiteIndices[offset + markerCount - 1];
                        segments[i] = new Segment
                        {
                            start = this.StartByChr[chr][firstBin], // Genomic start
                            end = this.EndByChr[chr][lastBin],      // Genomic end
                            nMarkers = markerCount,
                            mean = segmentMeans[i]
                        };
                        offset += markerCount;
                    }

                    lock (segmentByChr)
                    {
                        segmentByChr[chr] = segments;
                    }
                });
            }
            Isas.Shared.Utilities.DoWorkParallelThreads(segmentJobs);

            this.SegmentationResults = new GenomeSegmentationResults(segmentByChr);
        }
예제 #3
0
        /// <summary>
        /// Wavelets: unbalanced HAAR wavelets segmentation.
        /// The per-chromosome work is queued as ThreadStart tasks and run through
        /// Isas.Shared.Utilities.DoWorkParallelThreads; the outcome is written to this.SegmentationResults.
        /// When no chromosome has a finite score, a dummy (empty) result is stored instead.
        /// </summary>
        /// <param name="isGermline">Forwarded to WaveletSegmentation.HaarWavelets.</param>
        /// <param name="thresholdLower">Lower wavelets coefficient threshold, forwarded to HaarWavelets.</param>
        /// <param name="thresholdUpper">Upper wavelets coefficient threshold, forwarded to HaarWavelets.</param>
        /// <param name="minSize">Breakpoints are only searched on chromosomes with more than this many
        /// scores; any other chromosome becomes a single segment.</param>
        /// <param name="verbose">Not used by this method.</param>
        private void Wavelets(bool isGermline, double thresholdLower = 5, double thresholdUpper = 80, int minSize = 10, int verbose = 1)
        {
            // Phase 1: collect the finite scores per chromosome. They are only needed to decide
            // whether there is anything to segment at all.
            Dictionary<string, double[]> finiteScoresByChr = new Dictionary<string, double[]>();

            List<ThreadStart> tasks = new List<ThreadStart>();
            foreach (KeyValuePair<string, double[]> scoreByChrKVP in ScoreByChr)
            {
                tasks.Add(new ThreadStart(() =>
                {
                    string chr = scoreByChrKVP.Key;
                    int[] ina;
                    Helper.GetFiniteIndices(scoreByChrKVP.Value, out ina); // not NaN, -Inf, Inf

                    double[] scores;
                    if (ina.Length == scoreByChrKVP.Value.Length)
                    {
                        scores = scoreByChrKVP.Value;
                    }
                    else
                    {
                        Helper.ExtractValues<double>(scoreByChrKVP.Value, ina, out scores);
                    }

                    lock (finiteScoresByChr)
                    {
                        finiteScoresByChr[chr] = scores;
                    }

                }));
            }
            Isas.Shared.Utilities.DoWorkParallelThreads(tasks);
            // Quick sanity-check: If we don't have any finite scores, then return a dummy result.
            int n = 0;
            foreach (var list in finiteScoresByChr.Values)
            {
                n += list.Length;
            }
            if (n == 0)
            {
                this.SegmentationResults = this.GetDummySegmentationResults();
                return;
            }

            Dictionary<string, Segment[]> segmentByChr = new Dictionary<string, Segment[]>();

            // Phase 2: wavelet segmentation per chromosome. No random numbers are drawn here,
            // so no per-chromosome generators are created.
            tasks = new List<ThreadStart>();
            foreach (string chr in ScoreByChr.Keys)
            {
                tasks.Add(new ThreadStart(() =>
                {
                    double[] chrScores = this.ScoreByChr[chr];
                    int sizeScoreByChr = chrScores.Length;

                    List<int> breakpoints = new List<int>();
                    if (sizeScoreByChr > minSize)
                    {
                        // HaarWavelets is handed a copy of the scores, as before.
                        // NOTE(review): presumably so the stored scores cannot be modified - confirm.
                        WaveletSegmentation.HaarWavelets(chrScores.ToArray(), thresholdLower, thresholdUpper, breakpoints, isGermline);
                    }

                    List<int> startBreakpointsPos = new List<int>();
                    List<int> endBreakpointPos = new List<int>();
                    List<int> lengthSeg = new List<int>();

                    // breakpoints can only be non-empty when the chromosome passed the minSize test
                    // above, so no further size check is needed (the former hard-coded "> 10" check
                    // ignored a minSize below 10).
                    //
                    // NOTE(review): every length below is one less than the number of bins between
                    // the segment's start and end index (the first one only if breakpoints[0] is 0),
                    // and the windows used for the means are accumulated from these lengths. This
                    // looks like an off-by-one, but the right fix depends on whether iEnd of
                    // Helper.WeightedAverage is inclusive - confirm before changing.
                    if (breakpoints.Count >= 2)
                    {
                        startBreakpointsPos.Add(breakpoints[0]);
                        endBreakpointPos.Add(breakpoints[1] - 1);
                        lengthSeg.Add(breakpoints[1] - 1);

                        for (int i = 1; i < breakpoints.Count - 1; i++)
                        {
                            startBreakpointsPos.Add(breakpoints[i]);
                            endBreakpointPos.Add(breakpoints[i + 1] - 1);
                            lengthSeg.Add(breakpoints[i + 1] - 1 - breakpoints[i]);
                        }
                        startBreakpointsPos.Add(breakpoints[breakpoints.Count - 1]);
                        endBreakpointPos.Add(sizeScoreByChr - 1);
                        lengthSeg.Add(sizeScoreByChr - breakpoints[breakpoints.Count - 1] - 1);
                    }
                    else
                    {
                        // Fewer than two breakpoints: the whole chromosome is one segment.
                        startBreakpointsPos.Add(0);
                        endBreakpointPos.Add(sizeScoreByChr - 1);
                        lengthSeg.Add(sizeScoreByChr - 1);
                    }

                    // estimate segment means over consecutive windows [ss, ee] built from lengthSeg
                    double[] segmentMeans = new double[lengthSeg.Count];
                    int ss = 0, ee = 0;
                    for (int i = 0; i < lengthSeg.Count; i++)
                    {
                        ee += lengthSeg[i];
                        // Works even if weights == null
                        segmentMeans[i] = Helper.WeightedAverage(chrScores, null, iStart: ss, iEnd: ee);
                        ss = ee;
                    }

                    Segment[] segments = new Segment[startBreakpointsPos.Count];
                    for (int i = 0; i < startBreakpointsPos.Count; i++)
                    {
                        int start = startBreakpointsPos[i];
                        int end = endBreakpointPos[i];
                        segments[i] = new Segment();
                        segments[i].start = this.StartByChr[chr][start]; // Genomic start
                        segments[i].end = this.EndByChr[chr][end]; // Genomic end
                        segments[i].nMarkers = lengthSeg[i];
                        segments[i].mean = segmentMeans[i];
                    }

                    lock (segmentByChr)
                    {
                        segmentByChr[chr] = segments;
                    }
                }));

            }
            Console.WriteLine("{0} Launching wavelet tasks", DateTime.Now);
            Isas.Shared.Utilities.DoWorkParallelThreads(tasks);
            Console.WriteLine("{0} Completed wavelet tasks", DateTime.Now);
            this.SegmentationResults = new GenomeSegmentationResults(segmentByChr);
            Console.WriteLine("{0} Segmentation results complete", DateTime.Now);
        }
예제 #4
0
 /// <summary>
 /// Builds a placeholder result backed by an empty chromosome-to-segments dictionary.
 /// </summary>
 /// <returns>A GenomeSegmentationResults constructed from a dictionary with no entries.</returns>
 private GenomeSegmentationResults GetDummySegmentationResults()
 {
     var noSegments = new Dictionary<string, Segment[]>();
     return new GenomeSegmentationResults(noSegments);
 }
예제 #5
0
        /// <summary>
        /// Runs SegmentationResultsProcessor.PostProcessSegments on a single chromosome with three
        /// input segments and six coverage bins: once without excluded intervals and then with
        /// three different placements of one excluded interval.
        /// </summary>
        public void PostProcessSegmentsTests()
        {
            const string chrom = "chr1";
            var processor = new SegmentationResultsProcessor(100);

            // Three input segments on the chromosome.
            var inputSegments = new[]
            {
                new SegmentationInput.Segment { start = 1, end = 1000 },
                new SegmentationInput.Segment { start = 1100, end = 4500 },
                new SegmentationInput.Segment { start = 4600, end = 5000 }
            };
            var segmentationResults = new GenomeSegmentationResults(
                new Dictionary<string, SegmentationInput.Segment[]> { { chrom, inputSegments } });

            var ploidyInfo = new PloidyInfo();
            var excludedIntervals = new Dictionary<string, List<SampleGenomicBin>>();

            // Six bins, given as parallel coverage / start / end arrays.
            var coverageInfo = new CoverageInfo
            {
                CoverageByChr = new Dictionary<string, double[]>
                {
                    { chrom, new double[] { 10, 10, 50, 100, 25, 10 } }
                },
                EndByChr = new Dictionary<string, uint[]>
                {
                    { chrom, new uint[] { 500, 890, 1299, 4000, 4500, 5050 } }
                },
                StartByChr = new Dictionary<string, uint[]>
                {
                    { chrom, new uint[] { 100, 600, 1200, 1300, 4001, 5000 } }
                }
            };

            // --- Scenario 1: no excluded intervals ---------------------------------------------
            var results = processor.PostProcessSegments(segmentationResults, ploidyInfo, excludedIntervals, coverageInfo);
            var chr1Results = results[chrom];

            Assert.Equal(3, chr1Results.Count);

            // Final segments should reflect boundaries of actual bins within them
            //  (in practice, these probably shouldn't disagree? but let's go theoretical here)
            SegmentTestHelpers.CheckSegment(chr1Results[0], 100, 890, 10, 2);
            SegmentTestHelpers.CheckSegment(chr1Results[1], 1200, 4500, 50, 3);
            SegmentTestHelpers.CheckSegment(chr1Results[2], 5000, 5050, 10, 1); // Bin extends past segment - still keep it (?)

            // --- Scenario 2: forbidden zone between two bins of the same original segment ------
            // This should split up the affected segment. Mid = 550, in between the bins of the first segment.
            excludedIntervals[chrom] = new List<SampleGenomicBin> { GetForbiddenZone(chrom, 525, 575) };
            results = processor.PostProcessSegments(segmentationResults, ploidyInfo, excludedIntervals, coverageInfo);
            chr1Results = results[chrom];

            Assert.Equal(4, chr1Results.Count);
            SegmentTestHelpers.CheckSegment(chr1Results[0], 100, 500, 10, 1);
            SegmentTestHelpers.CheckSegment(chr1Results[1], 600, 890, 10, 1);
            SegmentTestHelpers.CheckSegment(chr1Results[2], 1200, 4500, 50, 3);
            SegmentTestHelpers.CheckSegment(chr1Results[3], 5000, 5050, 10, 1); // Bin extends past segment - still keep it (?)

            // --- Scenario 3: forbidden zone midpoint is in the second bin -----------------------
            // Apparently this is presumed to never happen because it would have already been taken care of.
            // This fails the test with the Debug Asserts in there. Otherwise it would be counted as a new bin.
            // Mid = 610, in second bin.
            excludedIntervals[chrom] = new List<SampleGenomicBin> { GetForbiddenZone(chrom, 585, 635) };
            results = processor.PostProcessSegments(segmentationResults, ploidyInfo, excludedIntervals, coverageInfo);
            chr1Results = results[chrom];

            Assert.Equal(4, chr1Results.Count);
            SegmentTestHelpers.CheckSegment(chr1Results[0], 100, 500, 10, 1);
            SegmentTestHelpers.CheckSegment(chr1Results[1], 600, 890, 10, 1);
            SegmentTestHelpers.CheckSegment(chr1Results[2], 1200, 4500, 50, 3);
            SegmentTestHelpers.CheckSegment(chr1Results[3], 5000, 5050, 10, 1); // Bin extends past segment - still keep it (?)

            // --- Scenario 4: midpoint in the first bin although the zone ends between bins ------
            // Apparently this is presumed to never happen because it would have already been taken care of.
            // Note the asymmetry compared to scenario 3. Mid = 490, in first bin of first segment.
            excludedIntervals[chrom] = new List<SampleGenomicBin> { GetForbiddenZone(chrom, 465, 515) };
            results = processor.PostProcessSegments(segmentationResults, ploidyInfo, excludedIntervals, coverageInfo);
            chr1Results = results[chrom];

            // The assertions below would fail because of that asymmetry. What do we want?
            // Left disabled for now so as not to change the behavior in this (unrelated) feature addition.
            //Assert.Equal(4, chr1Results.Count);
            //SegmentTestHelpers.CheckSegment(chr1Results[0], 100, 500, 10, 1);
            //SegmentTestHelpers.CheckSegment(chr1Results[1], 600, 890, 10, 1);
            //SegmentTestHelpers.CheckSegment(chr1Results[2], 1200, 4500, 50, 3);
            //SegmentTestHelpers.CheckSegment(chr1Results[3], 5000, 5050, 10, 1); // Bin extends past segment - still keep it (?)

            // TODO test where no segment covers bins?
            // TODO overlapping segments or bins?
            // TODO bin starts before segment
            // TODO test interbin dist
        }