/// <summary>
/// Segments each chromosome jointly across all samples: a <c>HiddenMarkovModel</c> is built
/// from the per-position scores of every sample, its Viterbi path is decoded, and a breakpoint
/// is placed wherever the decoded state differs from the previous position. When a common-CNV
/// BED file is configured (<c>_commonCnVs</c>), the breakpoints are additionally combined with
/// the common-CNV intervals of that chromosome.
/// </summary>
/// <param name="segmentation">
/// One entry per sample. The first entry supplies the chromosome list, the number of positions
/// per chromosome and the start/end coordinates; every entry supplies scores.
/// </param>
/// <returns>
/// Segments keyed by chromosome name. Chromosomes whose length does not exceed
/// <c>_minSize</c> are not segmented and are absent from the result.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="segmentation"/> is null.</exception>
/// <exception cref="InvalidOperationException"><paramref name="segmentation"/> is empty.</exception>
public Dictionary<string, Segmentation.Segment[]> Run(List<Segmentation> segmentation)
{
    if (segmentation == null)
    {
        throw new ArgumentNullException(nameof(segmentation));
    }

    Dictionary<string, List<SampleGenomicBin>> commonCNVintervals = null;
    if (_commonCnVs != null)
    {
        commonCNVintervals = CanvasCommon.Utilities.LoadBedFile(_commonCnVs);
        CanvasCommon.Utilities.SortAndOverlapCheck(commonCNVintervals, _commonCnVs);
    }

    // The first sample defines which chromosomes exist and their coordinates.
    var referenceSample = segmentation.First();
    var segmentByChr = new Dictionary<string, Segmentation.Segment[]>();

    // The token is never cancelled in this method; the source is still disposed once the
    // (synchronous) parallel loop has finished.
    using (var cts = new CancellationTokenSource())
    {
        var parallelOptions = new ParallelOptions
        {
            CancellationToken = cts.Token,
            MaxDegreeOfParallelism = Environment.ProcessorCount,
            TaskScheduler = TaskScheduler.Default
        };

        Parallel.ForEach(
            referenceSample.ScoreByChr.Keys,
            parallelOptions,
            chr =>
            {
                int length = referenceSample.ScoreByChr[chr].Length;
                if (length <= _minSize)
                {
                    // Too short to segment: skip before doing any per-position work.
                    return;
                }

                var startByChr = referenceSample.StartByChr[chr];
                var endByChr = referenceSample.EndByChr[chr];

                // multiSampleCoverage[position][sample]
                var multiSampleCoverage = new List<List<double>>(length);
                for (int i = 0; i < length; i++)
                {
                    multiSampleCoverage.Add(segmentation.Select(x => x.ScoreByChr[chr][i]).ToList());
                }

                // Created empty; presumably populated by InitializeNegativeBinomialEmission - confirm.
                var haploidMeans = new List<double>(_nHiddenStates);
                var negativeBinomialDistributions =
                    InitializeNegativeBinomialEmission(multiSampleCoverage, _nHiddenStates, haploidMeans);
                var hmm = new HiddenMarkovModel(multiSampleCoverage, negativeBinomialDistributions, haploidMeans);

                Console.WriteLine($"{DateTime.Now} Launching HMM task for chromosome {chr}");
                if (_nSamples <= 3)
                {
                    hmm.FindMaximalLikelihood(multiSampleCoverage);
                }
                var bestPathViterbi = hmm.BestPathViterbi(multiSampleCoverage, startByChr, haploidMeans);
                Console.WriteLine($"{DateTime.Now} Completed HMM task for chromosome {chr}");

                // Position 0 always opens a segment; every change of state opens a new one.
                var breakpoints = new List<int>();
                breakpoints.Add(0);
                for (int i = 1; i < length; i++)
                {
                    if (bestPathViterbi[i] - bestPathViterbi[i - 1] != 0)
                    {
                        breakpoints.Add(i);
                    }
                }

                List<SampleGenomicBin> chrCommonIntervals;
                if (commonCNVintervals != null && commonCNVintervals.TryGetValue(chr, out chrCommonIntervals))
                {
                    var remappedCommonCNVintervals =
                        Segmentation.RemapCommonRegions(chrCommonIntervals, startByChr, endByChr);
                    breakpoints = Segmentation.OverlapCommonRegions(breakpoints, remappedCommonCNVintervals);
                }

                var segments = Segmentation.DeriveSegments(breakpoints, length, startByChr, endByChr);

                // Dictionary<TKey, TValue> is not safe for concurrent writes.
                lock (segmentByChr)
                {
                    segmentByChr[chr] = segments;
                }
            });
    }

    Console.WriteLine("{0} Completed HMM tasks", DateTime.Now);
    Console.WriteLine("{0} Segmentation results complete", DateTime.Now);
    return segmentByChr;
}
/// <summary>
/// Segments each chromosome jointly across all samples: a <c>HiddenMarkovModel</c> is built
/// from the per-position coverage of every sample, its Viterbi path is decoded, and a
/// breakpoint is placed wherever the decoded state differs from the previous position.
/// </summary>
/// <param name="segmentation">
/// One entry per sample. The first entry supplies the chromosome list, the number of positions
/// per chromosome and the start/end coordinates; every entry supplies coverage.
/// </param>
/// <param name="isPerSample">
/// When true, the whole-genome per-sample medians and pseudo-variances computed here are passed
/// to <c>InitializeNegativeBinomialEmission</c>; when false, null is passed for both. The flag
/// is also forwarded to the <c>HiddenMarkovModel</c> constructor.
/// </param>
/// <returns>
/// Segments keyed by chromosome name. Chromosomes whose length does not exceed
/// <c>_minSize</c> are not segmented and are absent from the result.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="segmentation"/> is null.</exception>
/// <exception cref="InvalidOperationException"><paramref name="segmentation"/> is empty.</exception>
public Dictionary<string, SegmentationInput.Segment[]> Run(List<SegmentationInput> segmentation, bool isPerSample)
{
    if (segmentation == null)
    {
        throw new ArgumentNullException(nameof(segmentation));
    }

    var segmentByChr = new Dictionary<string, SegmentationInput.Segment[]>();

    // Compute whole-genome median and inter-quartile-range-based pseudo-variance for each sample;
    // it would be better to exclude regions that are not diploid, and we should really be
    // using a different variance for each copy number, but using these values is better than
    // using the per-chromosome mean and variance, which have the following problems:
    // - chromosomes with a lot of outliers can get a very high variance
    // - chromosomes that have a whole-chromosome CNV or a CNV that affects a lot of the chromosome
    //   can have problematic estimates
    var medians = new List<double>();
    var pseudoVariances = new List<double>();
    foreach (var singleSampleSegmentation in segmentation)
    {
        var coverageByChr = singleSampleSegmentation.CoverageInfo.CoverageByChr;
        var cvgVals = new List<float>();
        foreach (var chr in coverageByChr.Keys)
        {
            cvgVals.AddRange(coverageByChr[chr].Select(x => (float)x));
        }

        var quartiles = CanvasCommon.Utilities.Quartiles(cvgVals);
        medians.Add(quartiles.Item2);
        var iqr = quartiles.Item3 - quartiles.Item1;
        pseudoVariances.Add(iqr * iqr);
    }

    // The first sample defines which chromosomes exist and their coordinates.
    var referenceCoverage = segmentation.First().CoverageInfo;

    // The token is never cancelled in this method; the source is still disposed once the
    // (synchronous) parallel loop has finished.
    using (var cts = new CancellationTokenSource())
    {
        var parallelOptions = new ParallelOptions
        {
            CancellationToken = cts.Token,
            MaxDegreeOfParallelism = Environment.ProcessorCount,
            TaskScheduler = TaskScheduler.Default
        };

        Parallel.ForEach(
            referenceCoverage.CoverageByChr.Keys,
            parallelOptions,
            chr =>
            {
                int length = referenceCoverage.CoverageByChr[chr].Length;
                if (length <= _minSize)
                {
                    // Too short to segment: skip before doing any per-position work.
                    return;
                }

                var startByChr = referenceCoverage.StartByChr[chr];
                var endByChr = referenceCoverage.EndByChr[chr];

                // multiSampleCoverage[position][sample]
                var multiSampleCoverage = new List<List<double>>(length);
                for (int i = 0; i < length; i++)
                {
                    multiSampleCoverage.Add(segmentation.Select(x => x.CoverageInfo.CoverageByChr[chr][i]).ToList());
                }

                // Created empty; presumably populated by InitializeNegativeBinomialEmission - confirm.
                var haploidMeans = new List<double>(_nHiddenStates);
                var negativeBinomialDistributions = isPerSample
                    ? InitializeNegativeBinomialEmission(multiSampleCoverage, _nHiddenStates, haploidMeans, medians, pseudoVariances)
                    : InitializeNegativeBinomialEmission(multiSampleCoverage, _nHiddenStates, haploidMeans, null, null);
                var hmm = new HiddenMarkovModel(multiSampleCoverage, negativeBinomialDistributions, haploidMeans, isPerSample);

                Console.WriteLine($"{DateTime.Now} Launching HMM task for chromosome {chr}");
                // NOTE(review): hmm.FindMaximalLikelihood is not invoked on this code path; the
                // Viterbi path is decoded directly from the model as constructed.
                var bestPathViterbi = hmm.BestPathViterbi(multiSampleCoverage, startByChr, haploidMeans);
                Console.WriteLine($"{DateTime.Now} Completed HMM task for chromosome {chr}");

                // Position 0 always opens a segment; every change of state opens a new one.
                var breakpoints = new List<int>();
                breakpoints.Add(0);
                for (int i = 1; i < length; i++)
                {
                    if (bestPathViterbi[i] - bestPathViterbi[i - 1] != 0)
                    {
                        breakpoints.Add(i);
                    }
                }

                var segments = SegmentationInput.DeriveSegments(breakpoints, length, startByChr, endByChr);

                // Dictionary<TKey, TValue> is not safe for concurrent writes.
                lock (segmentByChr)
                {
                    segmentByChr[chr] = segments;
                }
            });
    }

    Console.WriteLine("{0} Completed HMM tasks", DateTime.Now);
    Console.WriteLine("{0} Segmentation results complete", DateTime.Now);
    return segmentByChr;
}