Example #1
0
        /// <summary>
        /// Segments coverage with unbalanced HAAR wavelets and returns the derived segments per chromosome.
        /// </summary>
        /// <param name="segmentationInput">Supplies coverage data, variability estimates and the evenness metric file.</param>
        /// <param name="windowSize">Window size forwarded to the coverage-variability and evenness calculations.</param>
        /// <returns>Segments keyed by chromosome, one entry for every key of <c>segmentationInput.VafByChr</c>.</returns>
        public Dictionary <string, SegmentationInput.Segment[]> Run(SegmentationInput segmentationInput, int windowSize)
        {
            double? coverageCV = segmentationInput.GetCoverageVariability(windowSize);
            var factorOfThreeCMADs = segmentationInput.FactorOfThreeCoverageVariabilities();

            // Best effort: the evenness score is only written out, it is not used by the segmentation below.
            try
            {
                double evennessScore = segmentationInput.GetEvennessScore(windowSize);
                bool hasMetricFile = !segmentationInput.EvennessMetricFile.IsNullOrEmpty();
                if (hasMetricFile)
                {
                    CanvasIO.WriteEvennessMetricToTextFile(segmentationInput.EvennessMetricFile, evennessScore);
                }
            }
            catch (Exception)
            {
                Console.Error.WriteLine("Unable to calculate an evenness score, using coverage for segmentation");
            }

            var coverage = segmentationInput.CoverageInfo;

            // Raw wavelet breakpoints first, then the adjustment pass (no VAF-containing bins are supplied).
            var rawBreakpoints = LaunchWavelets(coverage.CoverageByChr, coverage.StartByChr, coverage.EndByChr,
                                                coverageCV, factorOfThreeCMADs);
            Dictionary <string, List <int> > adjustedBreakpoints =
                AdjustBreakpoints(coverage.CoverageByChr, rawBreakpoints, vafContainingBinsByChr: null);

            var segmentsByChr = new Dictionary <string, SegmentationInput.Segment[]>();
            foreach (string chr in segmentationInput.VafByChr.Keys)
            {
                int binCount = coverage.CoverageByChr[chr].Length;
                segmentsByChr[chr] = SegmentationInput.DeriveSegments(adjustedBreakpoints[chr], binCount,
                                                                      coverage.StartByChr[chr], coverage.EndByChr[chr]);
            }

            return(segmentsByChr);
        }
Example #2
0
        /// <summary>
        /// Feeds three chr22 variant records through <c>CanvasIO.ReadFrequencies</c> with two intervals
        /// (1-50 and 51-150) and checks that one result is produced per interval, holding the two records
        /// at positions 10 and 20 in the first and the record at position 100 in the second.
        /// </summary>
        public void TestReadFrequencies()
        {
            var intervals = new List <BedInterval>
            {
                new BedInterval(1, 50),
                new BedInterval(51, 150),
            };
            const string chr = "chr22";
            var          intervalsByChromosome = new Dictionary <string, List <BedInterval> > {
                { chr, intervals }
            };
            var variantCounts = "";

            variantCounts += "chr22\t10\tC\tT\t20\t10\n";
            variantCounts += "chr22\t20\tC\tT\t30\t20\n";
            variantCounts += "chr22\t100\tC\tT\t40\t30\n";
            var stringReader = new StringReader(variantCounts);

            using (var reader = new GzipOrTextReader(stringReader))
            {
                var allelesByChromosome =
                    CanvasIO.ReadFrequencies(reader, intervalsByChromosome);
                // Expected value goes first so a failure message labels the two numbers correctly.
                Assert.Equal(intervals.Count, allelesByChromosome[chr].Count);
                Assert.Equal(2, allelesByChromosome[chr].First().Size());
                Assert.Equal(1, allelesByChromosome[chr].Last().Size());
            }
        }
        /// <summary>
        /// Converts ratios to counts through the <c>RatiosToCounts(ratios, referencePloidy)</c> overload and
        /// writes the result to <paramref name="outputPath"/>.
        /// </summary>
        /// <param name="ratios">Ratio bins to convert.</param>
        /// <param name="referencePloidyBedFile">Optional ploidy bed file; ignored when null or not existing.</param>
        /// <param name="outputPath">Destination text file.</param>
        public static void RatiosToCounts(IEnumerable <SampleGenomicBin> ratios, IFileLocation referencePloidyBedFile,
                                          IFileLocation outputPath)
        {
            bool hasPloidyBed = referencePloidyBedFile != null && referencePloidyBedFile.Exists;

            // Without a usable bed file the conversion runs with a null reference ploidy.
            PloidyInfo referencePloidy = hasPloidyBed
                ? PloidyInfo.LoadPloidyFromBedFile(referencePloidyBedFile.FullName)
                : null;

            var destination = outputPath.FullName;
            var counts      = RatiosToCounts(ratios, referencePloidy);

            CanvasIO.WriteToTextFile(destination, counts);
        }
Example #4
0
        /// <summary>
        /// Reads the CanvasSNV variant frequencies from <c>InputVafPath</c> and stores in <c>VafByChr</c>, for each
        /// chromosome, the average frequency of every coverage bin that received at least one value.
        /// Any exception is reported on standard error and ends the process with exit code 1.
        /// </summary>
        /// <param name="referenceFolder">Not used by this method.</param>
        public void LoadVAFInput(string referenceFolder)
        {
            try
            {
                var frequenciesByChr  = new Dictionary <string, List <List <double> > >();
                var binIntervalsByChr = new Dictionary <string, List <BedInterval> >();

                // One (initially empty) frequency list and one interval per coverage bin.
                foreach (string chr in CoverageInfo.StartByChr.Keys)
                {
                    var binFrequencies = new List <List <double> >(CoverageInfo.StartByChr[chr].Length);
                    var binIntervals   = new List <BedInterval>();
                    frequenciesByChr[chr]  = binFrequencies;
                    binIntervalsByChr[chr] = binIntervals;

                    for (int bin = 0; bin < CoverageInfo.StartByChr[chr].Length; bin++)
                    {
                        binFrequencies.Add(new List <double>());
                        int start = Convert.ToInt32(CoverageInfo.StartByChr[chr][bin]);
                        int end   = Convert.ToInt32(CoverageInfo.EndByChr[chr][bin]);
                        binIntervals.Add(new BedInterval(start, end));
                    }
                }

                // Replace each bin's placeholder with the frequencies read for its interval.
                var alleleCountsByChromosome = CanvasIO.ReadFrequenciesWrapper(_logger, new FileLocation(this.InputVafPath), binIntervalsByChr);
                foreach (var chr in alleleCountsByChromosome.Keys)
                {
                    var alleleCounts = alleleCountsByChromosome[chr];
                    for (int bin = 0; bin < alleleCounts.Count; bin++)
                    {
                        frequenciesByChr[chr][bin] = alleleCounts[bin].MaxFrequencies;
                    }
                }

                // Keep only bins that hold at least one frequency, remembering their bin index.
                foreach (string chr in frequenciesByChr.Keys)
                {
                    var populatedBins = new List <VafContainingBins>();
                    VafByChr[chr] = populatedBins;

                    var binFrequencies = frequenciesByChr[chr];
                    for (int bin = 0; bin < binFrequencies.Count; bin++)
                    {
                        var values = binFrequencies[bin];
                        if (values.Count > 0)
                        {
                            populatedBins.Add(new VafContainingBins(bin, values.Average()));
                        }
                    }
                }
                _logger.Info("Done processing VAFs\n");
            }
            catch (Exception e)
            {
                Console.Error.WriteLine("File {0} could not be read:", this.InputVafPath);
                Console.Error.WriteLine(e.Message);
                Environment.Exit(1);
            }
        }
Example #5
0
    /// <summary>
    /// Instantiates a new input switch under the left border, then spreads every input evenly along
    /// that border and refreshes each switch's IO position (converted with the main camera).
    /// </summary>
    public void AddInput()
    {
        CanvasIO created = Instantiate(inputOutputPrefab, leftBorderTransform).GetComponent <CanvasIO>();
        inputs.Add(created);

        // Border geometry is the same for every switch, so read it once.
        var   borderSize = leftBorderTransform.GetComponent <RectTransform>().rect.size;
        float borderX    = leftBorderTransform.transform.position.x;
        float lowestY    = leftBorderTransform.position.y - (borderSize.y / 2);
        float spacing    = borderSize.y / (inputs.Count + 1);
        float halfWidth  = borderSize.x / 2;

        int slot = 1;
        foreach (CanvasIO entry in inputs)
        {
            entry.transform.position = new Vector2(borderX, lowestY + spacing * slot);
            // The IO anchor sits on the inner (right-hand) edge of the left border.
            entry.io.pos   = new Vector2(entry.transform.position.x + halfWidth, entry.transform.position.y);
            entry.io.pos   = Camera.main.ScreenToWorldPoint(entry.io.pos);
            entry.io.input = false;
            slot++;
        }
    }
Example #6
0
    /// <summary>
    /// Deletes the last output from the right border: flushes its IO, removes it from the list and
    /// destroys its game object. The remaining outputs are then spread evenly along the border.
    /// Does nothing when there is no output to delete.
    /// </summary>
    public void DeleteOutput()
    {
        // Guard: indexing Count - 1 on an empty list would throw.
        if (outputs.Count == 0)
        {
            return;
        }

        int      lastIndex = outputs.Count - 1;
        CanvasIO toDelete  = outputs[lastIndex];

        toDelete.io.FlushIO();
        // Remove by index: no linear search, and always the element that was just selected.
        outputs.RemoveAt(lastIndex);
        Destroy(toDelete.gameObject);

        if (outputs.Count == 0)
        {
            return; // nothing left to lay out
        }

        // Border geometry is the same for every remaining switch, so read it once.
        var   borderSize = rightBorderTransform.GetComponent <RectTransform>().rect.size;
        float borderX    = rightBorderTransform.transform.position.x;
        float lowestY    = rightBorderTransform.position.y - (borderSize.y / 2);
        float spacing    = borderSize.y / (outputs.Count + 1);
        float halfWidth  = borderSize.x / 2;

        int slot = 1;
        foreach (CanvasIO c in outputs)
        {
            c.transform.position = new Vector2(borderX, lowestY + spacing * slot);
            // The IO anchor sits on the inner (left-hand) edge of the right border.
            c.io.pos = new Vector2(c.transform.position.x - halfWidth, c.transform.position.y);
            c.io.pos = Camera.main.ScreenToWorldPoint(c.io.pos);
            slot++;
        }
    }
Example #7
0
    /// <summary>
    /// Instantiates a new output switch under the right border, then spreads every output evenly along
    /// that border, refreshes each switch's IO position (converted with the main camera) and calls
    /// <c>ChangeState()</c> on each of them.
    /// </summary>
    public void AddOutput()
    {
        CanvasIO created = Instantiate(inputOutputPrefab, rightBorderTransform).GetComponent <CanvasIO>();
        outputs.Add(created);

        // Border geometry is the same for every switch, so read it once.
        var   borderSize = rightBorderTransform.GetComponent <RectTransform>().rect.size;
        float borderX    = rightBorderTransform.transform.position.x;
        float lowestY    = rightBorderTransform.position.y - (borderSize.y / 2);
        float spacing    = borderSize.y / (outputs.Count + 1);
        float halfWidth  = borderSize.x / 2;

        int slot = 1;
        foreach (CanvasIO entry in outputs)
        {
            entry.transform.position = new Vector2(borderX, lowestY + spacing * slot);
            // The IO anchor sits on the inner (left-hand) edge of the right border.
            entry.io.pos   = new Vector2(entry.transform.position.x - halfWidth, entry.transform.position.y);
            entry.io.pos   = Camera.main.ScreenToWorldPoint(entry.io.pos);
            entry.io.input = true;
            entry.ChangeState();
            slot++;
        }
    }
Example #8
0
        /// <summary>
        /// Builds per-sample lists of overlapping-segment regions. Without a common CNVs bed file every
        /// CanvasPartition segment becomes its own region; otherwise the common CNV regions are read,
        /// remapped to bin coordinates and combined with each sample's segments and variant frequencies.
        /// </summary>
        /// <param name="variantFrequencyFiles">Variant frequency file path for each sample.</param>
        /// <param name="defaultAlleleCountThreshold">Allele count threshold forwarded to <c>GetSegmentSets</c>.</param>
        /// <param name="commonCNVsbedPath">Path of the common CNVs bed file, or null to skip common CNV handling.</param>
        /// <param name="sampleSegments">CanvasPartition segments for each sample.</param>
        /// <returns>The result of <c>GetOverlappingSegmentsRegionSampleLists</c> for the assembled regions.</returns>
        /// <exception cref="ArgumentException">Thrown when <c>IsIdenticalChromosomeNames</c> returns true for the bed file.</exception>
        private IEnumerable <ISampleMap <OverlappingSegmentsRegion> > CreateSegmentSetsFromCommonCnvs(ISampleMap <string> variantFrequencyFiles,
                                                                                                      int defaultAlleleCountThreshold, string commonCNVsbedPath, ISampleMap <Segments> sampleSegments)
        {
            if (commonCNVsbedPath == null)
            {
                var defaultSampleRegions = sampleSegments
                                           .SelectValues(segments => segments.AllSegments.Select(segment => new OverlappingSegmentsRegion(segment)).ToList());
                return(GetOverlappingSegmentsRegionSampleLists(defaultSampleRegions));
            }

            var commonRegions     = ReadCommonRegions(commonCNVsbedPath);
            var referenceSegments = sampleSegments.Values.First();
            var chromosomes       = referenceSegments.GetChromosomes();

            // NOTE(review): the helper's name suggests "true" means the names match, yet a true result
            // raises the mismatch error. The helper is not visible here - confirm its actual semantics.
            if (IsIdenticalChromosomeNames(commonRegions, chromosomes))
            {
                throw new ArgumentException(
                          $"Chromosome names in a common CNVs bed file {commonCNVsbedPath} does not match the genome reference");
            }

            var segmentIntervalsByChromosome = new Dictionary <string, List <BedInterval> >();
            var genomicBinsByChromosome      = new Dictionary <string, IReadOnlyList <SampleGenomicBin> >();
            var dictionaryGate               = new object();

            Parallel.ForEach(
                chromosomes,
                chr =>
            {
                // The expensive work runs outside the lock ...
                IReadOnlyList <SampleGenomicBin> bins = referenceSegments.GetGenomicBinsForChromosome(chr);
                List <BedInterval> intervals          = CanvasSegment.RemapGenomicToBinCoordinates(commonRegions[chr], bins);

                // ... but Dictionary is not safe for concurrent writes, so publishing is serialized.
                lock (dictionaryGate)
                {
                    genomicBinsByChromosome[chr]      = bins;
                    segmentIntervalsByChromosome[chr] = intervals;
                }
            });

            var sampleRegions = new SampleMap <List <OverlappingSegmentsRegion> >();

            foreach (var sampleId in sampleSegments.SampleIds)
            {
                var commonIntervals = commonRegions.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Select(bedEntry => bedEntry.Interval).ToList());
                var allelesByChromosomeCommonSegs = CanvasIO.ReadFrequenciesWrapper(_logger,
                                                                                    new FileLocation(variantFrequencyFiles[sampleId]), commonIntervals);
                var segmentsSets = GetSegmentSets(defaultAlleleCountThreshold, commonRegions,
                                                  genomicBinsByChromosome, segmentIntervalsByChromosome, allelesByChromosomeCommonSegs, sampleSegments[sampleId]);
                sampleRegions.Add(sampleId, segmentsSets);
            }

            return(GetOverlappingSegmentsRegionSampleLists(sampleRegions));
        }
        /// <summary>
        /// Writes the copy-number data (cnd) file: one CSV row per ratio bin holding the fragment count,
        /// the reference count, the bin coordinates and the ratio bin's count.
        /// </summary>
        /// <param name="fragmentCountFile">Text file with the fragment count bins.</param>
        /// <param name="referenceCountFile">Text file with the reference count bins.</param>
        /// <param name="ratios">Ratio bins; fragment and reference bins without a matching ratio bin are skipped.</param>
        /// <param name="outputFile">Destination file; created or overwritten.</param>
        public static void WriteCndFile(IFileLocation fragmentCountFile, IFileLocation referenceCountFile,
                                        IEnumerable <SampleGenomicBin> ratios, IFileLocation outputFile)
        {
            IEnumerable <SampleGenomicBin> fragmentCounts  = CanvasIO.IterateThroughTextFile(fragmentCountFile.FullName);
            IEnumerable <SampleGenomicBin> referenceCounts = CanvasIO.IterateThroughTextFile(referenceCountFile.FullName);

            using (var fragmentBins = fragmentCounts.GetEnumerator())
            using (var referenceBins = referenceCounts.GetEnumerator())
            using (var ratioBins = ratios.GetEnumerator())
            using (var stream = new FileStream(outputFile.FullName, FileMode.Create, FileAccess.Write))
            using (var writer = new StreamWriter(stream))
            {
                writer.WriteLine(CSVWriter.GetLine("Fragment Count", "Reference Count", "Chromosome",
                                                   "Start", "End", "Unsmoothed Log Ratio"));

                // All three streams advance together; the ratio stream dictates which bins are written.
                while (fragmentBins.MoveNext() && referenceBins.MoveNext() && ratioBins.MoveNext())
                {
                    var ratioBin = ratioBins.Current;

                    // Some bins could have been skipped when calculating the ratios: catch up to the ratio bin.
                    while (!fragmentBins.Current.IsSameBin(ratioBin))
                    {
                        if (!fragmentBins.MoveNext()) // Ran out of fragment bins
                        {
                            throw new Illumina.Common.IlluminaException("Fragment bins and ratio bins are not in the same order.");
                        }
                    }
                    while (!referenceBins.Current.IsSameBin(ratioBin))
                    {
                        if (!referenceBins.MoveNext()) // Ran out of reference bins
                        {
                            throw new Illumina.Common.IlluminaException("Reference bins and ratio bins are not in the same order.");
                        }
                    }

                    var fragmentBin  = fragmentBins.Current;
                    var referenceBin = referenceBins.Current;

                    bool aligned = fragmentBin.IsSameBin(referenceBin) && fragmentBin.IsSameBin(ratioBin);
                    if (!aligned)
                    {
                        throw new Illumina.Common.IlluminaException("Bins are not in the same order.");
                    }

                    writer.WriteLine(CSVWriter.GetLine(fragmentBin.Count.ToString(),
                                                       referenceBin.Count.ToString(), fragmentBin.GenomicBin.Chromosome,
                                                       fragmentBin.Start.ToString(), fragmentBin.Stop.ToString(),
                                                       ratioBin.Count.ToString()));
                }
            }
        }
        /// <summary>
        /// Generates a reference count file for the sample: the sample bins are centered on the model mean,
        /// projected onto the model axes, un-centered again, scaled by the on-target median sample/reference
        /// ratio and written to <paramref name="outputFile"/>.
        /// </summary>
        /// <param name="outputFile">Destination of the reference count file.</param>
        public void Run(IFileLocation outputFile)
        {
            List <SampleGenomicBin> sampleBins = CanvasIO.ReadFromTextFile(_sampleBinnedFile.FullName);

            VerifyBinOrder(sampleBins);

            // set bin count to 1 if less than 1
            foreach (var bin in sampleBins)
            {
                bin.Count = Math.Max(1, bin.Count);
            }

            // center the sample
            var centeredSampleVector = Enumerable.Zip(sampleBins, _model.Mu, (bin, mu) => (double)bin.Count - mu.Count).ToArray();

            // project onto the axes
            var projectedSampleVector = CanvasCommon.Utilities.Project(centeredSampleVector, _model.Axes);

            // undo centering and set bin count to 1 if less than 1.
            // Materialized once: the vector is consumed twice below and must not be recomputed lazily.
            var referenceVector = Enumerable.Zip(_model.Mu, projectedSampleVector, (bin, count) => Math.Max(1, bin.Count + count)).ToArray();

            // The median ratio is computed against a temporary reference count file.
            double medianRatio;
            var    tempReferenceFile = new FileLocation(Path.GetTempFileName());

            try
            {
                var tempReferenceBins = Enumerable.Zip(sampleBins, referenceVector,
                                                       (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)count));

                CanvasIO.WriteToTextFile(tempReferenceFile.FullName, tempReferenceBins);

                // calculate median ratio
                var ratios = new BinCounts(_ratioCalculator.Run(_sampleBinnedFile, tempReferenceFile), _manifest);
                medianRatio = ratios.OnTargetMedianBinCount;
            }
            finally
            {
                // delete the temporary reference count file even when writing or the ratio calculation fails
                tempReferenceFile.Delete();
            }

            // multiply reference counts by the median ratio
            var referenceBins = Enumerable.Zip(sampleBins, referenceVector,
                                               (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)(count * medianRatio)));

            // write reference count file
            CanvasIO.WriteToTextFile(outputFile.FullName, referenceBins);
        }
Example #11
0
        /// <summary>
        /// Smooths the bins of every chromosome found in <paramref name="inputFile"/> with a
        /// <c>RepeatedMedianSmoother</c> (chromosomes are processed in parallel) and writes the smoothed
        /// bins to <paramref name="outputFile"/>.
        /// </summary>
        /// <param name="inputFile">Text file with the bins to smooth.</param>
        /// <param name="outputFile">Destination text file for the smoothed bins.</param>
        /// <returns>Always 0.</returns>
        public int Run(IFileLocation inputFile, IFileLocation outputFile)
        {
            var binsByChrom = CanvasIO.GetGenomicBinsByChrom(inputFile.FullName);

            var smoother            = new RepeatedMedianSmoother(MaxHalfWindowSize);
            var chromosomes         = binsByChrom.Keys;
            var smoothedBinsByChrom = new ConcurrentDictionary <string, List <SampleGenomicBin> >();

            // One smoothing job per chromosome; results land in a concurrent dictionary.
            Console.WriteLine("Launch smoothing jobs...");
            Parallel.ForEach(chromosomes, chrom =>
            {
                var smoothed = smoother.Smooth(binsByChrom[chrom]);
                smoothedBinsByChrom[chrom] = smoothed;
            });
            Console.WriteLine("Completed smoothing jobs.");

            // Emit the chromosomes in the key order of the input dictionary.
            var orderedSmoothedBins = chromosomes.SelectMany(chrom => smoothedBinsByChrom[chrom]);
            CanvasIO.WriteToTextFile(outputFile.FullName, orderedSmoothedBins);

            return(0);
        }
Example #12
0
        /// <summary>
        /// Streams the sample and reference bins side by side and yields one bin per pair whose count is the
        /// sample/reference ratio. Pairs whose reference count lies outside
        /// [<c>_minReferenceCount</c>, <c>_maxReferenceCount</c>] are skipped. Iteration stops as soon as
        /// either file runs out of bins.
        /// </summary>
        /// <remarks>
        /// This is an iterator method, so the file-existence checks run when the result is first enumerated,
        /// not when the method is called.
        /// </remarks>
        /// <param name="sampleBedFile">Text file with the sample bins.</param>
        /// <param name="referenceBedFile">Text file with the reference bins.</param>
        /// <returns>Ratio bins carrying the sample bin's chromosome, start, stop and GC.</returns>
        /// <exception cref="FileNotFoundException">Either input file does not exist.</exception>
        public IEnumerable <SampleGenomicBin> Run(IFileLocation sampleBedFile, IFileLocation referenceBedFile)
        {
            if (!sampleBedFile.Exists)
            {
                throw new FileNotFoundException(sampleBedFile.FullName + " does not exist.");
            }
            if (!referenceBedFile.Exists)
            {
                throw new FileNotFoundException(referenceBedFile.FullName + " does not exist.");
            }

            var sampleBins    = CanvasIO.IterateThroughTextFile(sampleBedFile.FullName);
            var referenceBins = CanvasIO.IterateThroughTextFile(referenceBedFile.FullName);

            using (var eSampleBins = sampleBins.GetEnumerator())
            using (var eReferenceBins = referenceBins.GetEnumerator())
            {
                while (eSampleBins.MoveNext() && eReferenceBins.MoveNext())
                {
                    var sampleBin    = eSampleBins.Current;
                    var referenceBin = eReferenceBins.Current;

                    // Bins with extreme reference counts introduce large variance into the ratios.
                    // It would be better to just drop these bins so we don't introduce too much noise into segmentation and CNV calling.
                    if (referenceBin.Count < _minReferenceCount || referenceBin.Count > _maxReferenceCount)
                    {
                        continue; // skip the bin
                    }

                    double ratio = sampleBin.Count / referenceBin.Count;
                    yield return(new SampleGenomicBin(sampleBin.GenomicBin.Chromosome, sampleBin.Start, sampleBin.Stop, sampleBin.GenomicBin.GC, (float)ratio));
                }
            }
        }
Example #13
0
        /// <summary>
        /// Streams the sample and reference bins side by side and yields one bin per pair whose count is the
        /// sample/reference ratio multiplied by a library size factor (reference on-target median divided by
        /// sample on-target median, or 1 when either median is not positive). Pairs whose reference count is
        /// below 1 are skipped; iteration stops as soon as either file runs out of bins.
        /// </summary>
        /// <param name="sampleBedFile">Text file with the sample bins.</param>
        /// <param name="referenceBedFile">Text file with the reference bins.</param>
        /// <returns>Ratio bins carrying the sample bin's chromosome, start, stop and GC.</returns>
        /// <exception cref="FileNotFoundException">Either input file does not exist.</exception>
        public IEnumerable <SampleGenomicBin> Run(IFileLocation sampleBedFile, IFileLocation referenceBedFile)
        {
            if (!sampleBedFile.Exists)
            {
                throw new FileNotFoundException(sampleBedFile.FullName + " does not exist.");
            }
            if (!referenceBedFile.Exists)
            {
                throw new FileNotFoundException(referenceBedFile.FullName + " does not exist.");
            }

            var sampleBins    = CanvasIO.IterateThroughTextFile(sampleBedFile.FullName);
            var referenceBins = CanvasIO.IterateThroughTextFile(referenceBedFile.FullName);

            // Both medians are taken over the full bin streams before the pairwise pass starts.
            double sampleMedian    = (new BinCounts(sampleBins, manifest: _manifest)).OnTargetMedianBinCount;
            double referenceMedian = (new BinCounts(referenceBins, manifest: _manifest)).OnTargetMedianBinCount;

            double librarySizeFactor;
            if (sampleMedian > 0 && referenceMedian > 0)
            {
                librarySizeFactor = referenceMedian / sampleMedian;
            }
            else
            {
                librarySizeFactor = 1;
            }

            using (var sampleCursor = sampleBins.GetEnumerator())
            using (var referenceCursor = referenceBins.GetEnumerator())
            {
                while (sampleCursor.MoveNext() && referenceCursor.MoveNext())
                {
                    var sampleBin    = sampleCursor.Current;
                    var referenceBin = referenceCursor.Current;

                    // The weighted average count of a bin could be less than 1.
                    // Using these small counts for coverage normalization creates large ratios.
                    // It would be better to just drop these bins so we don't introduce too much noise into segmentation and CNV calling.
                    if (referenceBin.Count < 1)
                    {
                        continue; // skip the bin
                    }

                    double ratio = sampleBin.Count / referenceBin.Count * librarySizeFactor;
                    yield return(new SampleGenomicBin(sampleBin.GenomicBin.Chromosome, sampleBin.Start, sampleBin.Stop, sampleBin.GenomicBin.GC, (float)ratio));
                }
            }
        }
Example #14
0
        /// <summary>
        /// Calls copy number variants for one sample: reads the segments from <paramref name="inFile"/>, attaches
        /// the allele frequencies from <paramref name="variantFrequencyFile"/>, assigns ploidy calls and quality
        /// scores, merges neighboring segments and writes the result to <paramref name="outFile"/>.
        /// </summary>
        /// <param name="variantFrequencyFile">Path of the variant frequency file.</param>
        /// <param name="inFile">Path of the segment file; its directory is stored in <c>TempFolder</c>.</param>
        /// <param name="outFile">Path of the output written by <c>CanvasSegmentWriter.WriteSegments</c>.</param>
        /// <param name="ploidyVcfPath">Optional ploidy VCF; when null or empty no ploidy information is loaded.</param>
        /// <param name="referenceFolder">Reference folder forwarded to the output writers.</param>
        /// <param name="sampleName">Sample name forwarded to the segment writer.</param>
        /// <param name="truthDataPath">Optional known copy number data; when given, a report against it is generated.</param>
        /// <returns>Always 0.</returns>
        public int CallVariants(string variantFrequencyFile, string inFile, string outFile, string ploidyVcfPath, string referenceFolder, string sampleName,
                                string truthDataPath)
        {
            // Optional truth data: only loaded when a path was supplied.
            if (!string.IsNullOrEmpty(truthDataPath))
            {
                _cnOracle = new CopyNumberOracle();
                _cnOracle.LoadKnownCN(truthDataPath);
            }

            _segments    = Segments.ReadSegments(_logger, new FileLocation(inFile));
            _allSegments = _segments.AllSegments.ToList();
            TempFolder   = Path.GetDirectoryName(inFile);
            // Nothing to call: still write an output for the (empty) segment list and stop.
            if (_allSegments.Count == 0)
            {
                Console.WriteLine("CanvasDiploidCaller: No segments loaded; no CNV calls will be made.");
                CanvasSegmentWriter.WriteSegments(outFile, _allSegments, _model?.DiploidCoverage, referenceFolder,
                                                  sampleName, null, null, QualityFilterThreshold, false, null, null);
                return(0);
            }
            PloidyInfo ploidy = null;

            if (!string.IsNullOrEmpty(ploidyVcfPath))
            {
                ploidy = PloidyInfo.LoadPloidyFromVcfFileNoSampleId(ploidyVcfPath);
            }

            // load MAF
            var allelesByChromosome = CanvasIO.ReadFrequenciesWrapper(_logger, new FileLocation(variantFrequencyFile), _segments.IntervalsByChromosome);

            _segments.AddAlleles(allelesByChromosome);
            // Mean of the TotalCoverage values over all chromosomes. Average() throws on an empty sequence.
            MeanCoverage = allelesByChromosome.SelectMany(x => x.Value).SelectMany(y => y.TotalCoverage).Average();
            AggregateVariantCoverage(ref _allSegments);

            // Create new models for different copy number states
            InitializePloidies();

            // Compute statistics on the copy number two regions
            float[] diploidCounts = AggregateCounts(ref _allSegments);
            _diploidCoverage         = Utilities.Mean(diploidCounts);
            _coverageWeightingFactor = CoverageWeighting / _diploidCoverage;
            // new coverage model
            _model = new CoverageModel {
                DiploidCoverage = _diploidCoverage
            };
            // NOTE(review): this list is filled below but never read again inside this method - confirm whether it is still needed.
            List <SegmentInfo> segments = new List <SegmentInfo>();

            foreach (CanvasSegment segment in _allSegments)
            {
                SegmentInfo info = new SegmentInfo {
                    Segment = segment
                };
                // Fold every frequency into [0, 0.5]: values above 0.5 are mirrored to 1 - value.
                List <double> mafs = new List <double>();
                foreach (float value in segment.Balleles.Frequencies)
                {
                    mafs.Add(value > 0.5 ? 1 - value : value);
                }

                // -1 marks a segment without any frequency.
                if (mafs.Count > 0)
                {
                    info.Maf = Utilities.Median(mafs);
                }
                else
                {
                    info.Maf = -1;
                }

                info.Coverage = Utilities.Median(segment.Counts);

                // Weight is the segment length when there are more than 100 segments, otherwise its bin count.
                info.Weight = _allSegments.Count > 100 ? segment.Length : segment.BinCount;
                segments.Add(info);
            }

            AssignPloidyCallsDistance(_model);

            CanvasSegment.AssignQualityScores(_allSegments, CanvasSegment.QScoreMethod.LogisticGermline, _germlineScoreParameters);

            // Merge neighboring segments that got the same copy number call.
            // merging segments requires quality scores so we do it after quality scores have been assigned
            var mergedSegments = CanvasSegment.MergeSegments(_allSegments);

            // recalculating qscores after merging segments improves performance!

            CanvasSegment.AssignQualityScores(mergedSegments, CanvasSegment.QScoreMethod.LogisticGermline, _germlineScoreParameters);
            CanvasSegment.SetFilterForSegments(QualityFilterThreshold, mergedSegments, CanvasFilter.SegmentSizeCutoff);

            List <string> extraHeaders       = new List <string>();
            var           coverageOutputPath = SingleSampleCallset.GetCoverageAndVariantFrequencyOutputPath(outFile);

            CanvasSegment.WriteCoveragePlotData(mergedSegments, _model.DiploidCoverage, ploidy, coverageOutputPath, referenceFolder);

            // The comparison report is only produced when truth data was loaded at the top of the method.
            if (_cnOracle != null)
            {
                GenerateReportVersusKnownCopyNumber();
            }

            // Carry the ploidy header line into the output when one is available.
            if (!string.IsNullOrEmpty(ploidy?.HeaderLine))
            {
                extraHeaders.Add(ploidy.HeaderLine);
            }

            CanvasSegmentWriter.WriteSegments(outFile, mergedSegments, _model.DiploidCoverage, referenceFolder, sampleName,
                                              extraHeaders, ploidy, QualityFilterThreshold, false, null, null);
            return(0);
        }
Example #15
0
 /// <summary>
 /// Convenience overload: streams the bins from the text file at <paramref name="binnedPath"/> and
 /// hands them to the bin-sequence overload of <c>LoadBinCounts</c>.
 /// </summary>
 /// <param name="binnedPath">Path of the binned text file.</param>
 /// <param name="manifest">Manifest forwarded unchanged.</param>
 /// <param name="binCounts">Receives the bin counts produced by the forwarded call.</param>
 /// <param name="onTargetIndices">Receives the on-target indices produced by the forwarded call.</param>
 private static void LoadBinCounts(string binnedPath, NexteraManifest manifest, out List <double> binCounts,
                                   out List <int> onTargetIndices)
 {
     var binsFromFile = CanvasIO.IterateThroughTextFile(binnedPath);

     LoadBinCounts(binsFromFile, manifest, out binCounts, out onTargetIndices);
 }
Example #16
0
        /// <summary>
        /// Implements the Canvas binning algorithm. Without intermediate files a single genomic interval is
        /// binned; otherwise the intermediate files are deserialized, the bin size is determined if requested,
        /// and the binned counts are written to <c>parameters.outFile</c>.
        /// </summary>
        /// <param name="parameters">Binning parameters; <c>binSize</c> is overwritten when it is passed in as -1.</param>
        /// <returns>Always 0.</returns>
        public static int Run(CanvasBinParameters parameters)
        {
            // Will hold a bunch of BitArrays, one for each chromosome.
            // Each one's length corresponds to the length of the chromosome it represents.
            // A position will be marked 'true' if the mer starting at that position is unique in the genome.
            Dictionary <string, BitArray> possibleAlignments = new Dictionary <string, BitArray>();

            // Will hold a bunch of HitArrays, one for each chromosome.
            // Each one's length corresponds to the length of the chromosome it represents.
            // A position will be marked with the number of times the mer starting at that position
            // is observed in the SAM file.
            Dictionary <string, HitArray> observedAlignments = new Dictionary <string, HitArray>();

            // Will hold a bunch of Int16 arrays, one for each chromosome.
            // Each one's length corresponds to the length of the chromosome it represents.
            // A value at a given index represents the fragment length of the read starting at that index.
            Dictionary <string, Int16[]> fragmentLengths = new Dictionary <string, Int16[]>();

            // No intermediate files: bin a single genomic interval and stop here.
            if (parameters.intermediatePaths.Count == 0)
            {
                BinOneGenomicInterval(parameters, possibleAlignments, observedAlignments, fragmentLengths);
                return(0);
            }

            //load our intermediate data files
            List <string> inputFiles = new List <string>(parameters.intermediatePaths);
            Object        semaphore  = new object(); // control access to possibleAlignments, observedAlignments, fragmentLengths
            // retrieve the number of processors
            //int processorCoreCount = Environment.ProcessorCount;
            int           processorCoreCount = 1; // Limit # of deserialization threads to avoid (rare) protobuf issue.
            List <Thread> threads            = new List <Thread>();

            Console.WriteLine("Start deserialization:");
            Console.Out.Flush();
            // Polling loop: runs until every input file has been handed to a thread and every thread has finished.
            while (threads.Count > 0 || inputFiles.Count > 0)
            {
                // Remove defunct threads:
                threads.RemoveAll(t => !t.IsAlive);
                // All slots busy: wait a second before polling again.
                if (threads.Count == processorCoreCount)
                {
                    Thread.Sleep(1000);
                    continue;
                }
                // Start a thread for the next pending files while slots are free.
                while (inputFiles.Count > 0 && threads.Count < processorCoreCount)
                {
                    // Declared inside the loop so each thread's lambda captures its own file name.
                    string      inputFile      = inputFiles.First();
                    ThreadStart threadDelegate = new ThreadStart(() => DeserializeCanvasData(inputFile, possibleAlignments, observedAlignments, fragmentLengths, semaphore, parameters.coverageMode));
                    Thread      newThread      = new Thread(threadDelegate);
                    threads.Add(newThread);
                    newThread.Name = "CanvasBin " + inputFiles[0];
                    Console.WriteLine(newThread.Name);
                    newThread.Start();
                    inputFiles.RemoveAt(0);
                }
            }
            Console.WriteLine("{0} Deserialization complete", DateTime.Now);
            Console.Out.Flush();

            // The manifest is optional; it is only built when a manifest file was given.
            NexteraManifest manifest = parameters.manifestFile == null ? null : new NexteraManifest(parameters.manifestFile, null, Console.WriteLine);

            // A bin size of -1 requests that it be calculated here.
            if (parameters.binSize == -1)
            {
                // Turn the desired # of alignments per bin into the number of possible alignments expected per bin.
                parameters.binSize = CalculateNumberOfPossibleAlignmentsPerBin(parameters.countsPerBin, possibleAlignments, observedAlignments,
                                                                               manifest: manifest);
            }

            // Bin-size-only mode: record the bin size next to the output file and skip the binning.
            if (parameters.binSizeOnly)
            {
                // Write bin size to file
                System.IO.File.WriteAllText(parameters.outFile + ".binsize", "" + parameters.binSize);
                return(0);
            }

            Dictionary <string, List <GenomicBin> > predefinedBins = null;

            if (parameters.predefinedBinsFile != null)
            {
                // Read predefined bins
                predefinedBins = Utilities.LoadBedFile(parameters.predefinedBinsFile);
            }

            // Bin alignments.
            List <GenomicBin> bins = BinCounts(parameters.referenceFile, parameters.binSize, parameters.coverageMode, manifest,
                                               possibleAlignments, observedAlignments, fragmentLengths, predefinedBins, parameters.outFile);

            // Output!
            Console.WriteLine("{0} Output binned counts:", DateTime.Now);
            CanvasIO.WriteToTextFile(parameters.outFile, bins);
            Console.WriteLine("{0} Output complete", DateTime.Now);
            Console.Out.Flush();
            return(0);
        }
Example #17
0
        /// <summary>
        /// Command-line entry point: reads binned counts from the input file, applies the requested
        /// cleaning steps (outlier removal, size filter, GC normalization, FFPE local-SD filter) in
        /// that order, and writes the surviving bins to the output file.
        /// </summary>
        /// <param name="args">Command-line arguments; the accepted switches are listed in the option table below.</param>
        /// <returns>1 when the input file does not exist; 0 otherwise (including when only the help text is shown).</returns>
        static int Main(string[] args)
        {
            Utilities.LogCommandLine(args);

            string inputPath               = null;
            string outputPath              = null;
            string ffpeOutliersPath        = null;
            string manifestPath            = null;
            bool   gcNormRequested         = false;
            bool   sizeFilterRequested     = false;
            bool   outlierRemovalRequested = false;
            bool   helpRequested           = false;
            CanvasGCNormalizationMode gcNormalizationMode = CanvasGCNormalizationMode.MedianByGC;

            // Built before parsing, so the help text reports the initial (default) mode.
            string availableModes  = String.Join(", ", Enum.GetValues(typeof(CanvasGCNormalizationMode)).Cast <CanvasGCNormalizationMode>());
            string modeDescription = String.Format("gc normalization mode. Available modes: {0}. Default: {1}",
                                                   availableModes,
                                                   gcNormalizationMode);

            OptionSet options = new OptionSet()
            {
                { "i|infile=", "input file - usually generated by CanvasBin", arg => inputPath = arg },
                { "o|outfile=", "text file to output containing cleaned bins", arg => outputPath = arg },
                { "g|gcnorm", "perform GC normalization", arg => gcNormRequested = arg != null },
                { "s|filtsize", "filter out genomically large bins", arg => sizeFilterRequested = arg != null },
                { "r|outliers", "filter outlier points", arg => outlierRemovalRequested = arg != null },
                { "f|ffpeoutliers=", "filter regions of FFPE biases", arg => ffpeOutliersPath = arg },
                { "t|manifest=", "Nextera manifest file", arg => manifestPath = arg },
                { "w|weightedmedian=", "Minimum number of bins per GC required to calculate weighted median", arg => minNumberOfBinsPerGCForWeightedMedian = int.Parse(arg) },
                { "m|mode=", modeDescription, arg => gcNormalizationMode = Utilities.ParseCanvasGCNormalizationMode(arg) },
                { "h|help", "show this message and exit", arg => helpRequested = arg != null },
            };

            // Unrecognised trailing arguments are returned by Parse and deliberately ignored.
            options.Parse(args);

            // Help is shown both on request and when a mandatory path is missing.
            if (helpRequested || inputPath == null || outputPath == null)
            {
                ShowHelp(options);
                return 0;
            }

            if (!File.Exists(inputPath))
            {
                Console.WriteLine("CanvasClean.exe: File {0} does not exist! Exiting.", inputPath);
                return 1;
            }

            List <SampleGenomicBin> bins = CanvasIO.ReadFromTextFile(inputPath);

            if (outlierRemovalRequested)
            {
                bins = RemoveOutliers(bins);
            }

            if (sizeFilterRequested)
            {
                bins = RemoveBigBins(bins);
            }

            // FFPE outlier removal is skipped on targeted/low coverage data (fewer than 50000 bins
            // at this point), even when an FFPE outliers file was supplied.
            bool runFfpeOutlierRemoval = ffpeOutliersPath != null && bins.Count >= 50000;

            // The local SD is estimated before GC normalization, written to a text file, and
            // reused by the FFPE outlier filter at the end.
            double localSD = -1.0;

            if (runFfpeOutlierRemoval)
            {
                localSD = getLocalStandardDeviation(bins);
                CanvasIO.WriteLocalSDToTextFile(ffpeOutliersPath, localSD);
            }

            if (gcNormRequested)
            {
                NexteraManifest nexteraManifest = null;
                if (manifestPath != null)
                {
                    nexteraManifest = new NexteraManifest(manifestPath, null, Console.WriteLine);
                }

                // Only the MedianByGC mode strips bins with extreme GC content first.
                List <SampleGenomicBin> gcFilteredBins = bins;
                if (gcNormalizationMode == CanvasGCNormalizationMode.MedianByGC)
                {
                    gcFilteredBins = RemoveBinsWithExtremeGC(bins, defaultMinNumberOfBinsPerGC, manifest: nexteraManifest);
                }

                if (gcFilteredBins.Count != 0)
                {
                    bins = gcFilteredBins;
                    NormalizeByGC(bins, nexteraManifest, gcNormalizationMode);

                    // Variance normalization is used only on large exome panels and whole genome sequencing.
                    // When it actually ran (it returns true), NormalizeByGC is applied again for mean centering.
                    if (runFfpeOutlierRemoval && bins.Count > 500000 && NormalizeVarianceByGC(bins, manifest: nexteraManifest))
                    {
                        NormalizeByGC(bins, nexteraManifest, gcNormalizationMode);
                    }
                }
                else
                {
                    Console.Error.WriteLine("Warning in CanvasClean: Coverage too low to perform GC correction; proceeding without GC correction");
                }
            }

            if (runFfpeOutlierRemoval)
            {
                // Threshold 20 was derived to separate FF and noisy FFPE samples (training set of approx. 40 samples).
                bins = RemoveBinsWithExtremeLocalSD(bins, localSD, 20, outputPath);
            }

            CanvasIO.WriteToTextFile(outputPath, bins);
            return 0;
        }
Example #18
0
        /// <summary>
        /// Reads the segments from <paramref name="inFile"/>, assigns a copy-number call to each of them,
        /// merges neighbouring segments with the same call, scores them, and writes the result to
        /// <paramref name="outFile"/> along with coverage plot data.
        /// </summary>
        /// <param name="variantFrequencyFile">Variant frequency file loaded onto the segments.</param>
        /// <param name="inFile">Segment file to read; its directory becomes the temp folder.</param>
        /// <param name="outFile">Path the called segments are written to.</param>
        /// <param name="ploidyBedPath">Optional ploidy BED file; ignored when null or empty.</param>
        /// <param name="referenceFolder">Reference folder passed through to the writers.</param>
        /// <param name="sampleName">Sample name passed through to the segment writer.</param>
        /// <param name="truthDataPath">Optional known-copy-number file; when given, a report versus the known CN is generated.</param>
        /// <returns>Always 0.</returns>
        public int CallVariants(string variantFrequencyFile, string inFile, string outFile, string ploidyBedPath, string referenceFolder, string sampleName,
                                string truthDataPath)
        {
            bool hasTruthData = !string.IsNullOrEmpty(truthDataPath);
            if (hasTruthData)
            {
                this.CNOracle = new CopyNumberOracle();
                this.CNOracle.LoadKnownCN(truthDataPath);
            }

            this.Segments   = CanvasSegment.ReadSegments(inFile);
            this.TempFolder = Path.GetDirectoryName(inFile);

            // Nothing to call: still emit an (empty) output file so downstream steps find it.
            if (this.Segments.Count == 0)
            {
                Console.WriteLine("CanvasDiploidCaller: No segments loaded; no CNV calls will be made.");
                CanvasSegment.WriteSegments(outFile, this.Segments, referenceFolder, sampleName, null, null);
                return 0;
            }

            PloidyInfo ploidy = string.IsNullOrEmpty(ploidyBedPath)
                ? null
                : PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);

            // Load variant frequencies onto the segments.
            this.MeanCoverage = CanvasIO.LoadVariantFrequencies(variantFrequencyFile, this.Segments);
            int medianVariantCoverage = AggregateVariantCoverage(ref this.Segments);

            // Create new models for the different copy number states.
            this.InitializePloidies();

            // Statistics on the copy-number-two regions.
            DiploidCoverage         = CanvasCommon.Utilities.Mean(AggregateCounts(ref this.Segments));
            CoverageWeightingFactor = CoverageWeighting / DiploidCoverage;

            // New coverage model.
            this.Model            = new CoverageModel();
            Model.DiploidCoverage = DiploidCoverage;

            List <SegmentInfo> segmentInfos = new List <SegmentInfo>();

            foreach (CanvasSegment segment in this.Segments)
            {
                SegmentInfo info = new SegmentInfo { Segment = segment };

                // Fold every frequency above 0.5 onto its complement before taking the median.
                List <double> foldedFrequencies = new List <double>();
                foreach (float frequency in segment.VariantFrequencies)
                {
                    if (frequency > 0.5)
                    {
                        foldedFrequencies.Add(1 - frequency);
                    }
                    else
                    {
                        foldedFrequencies.Add(frequency);
                    }
                }

                // -1 marks a segment without any variant frequencies.
                if (foldedFrequencies.Count == 0)
                {
                    info.MAF = -1;
                }
                else
                {
                    info.MAF = CanvasCommon.Utilities.Median(foldedFrequencies);
                }

                info.Coverage = CanvasCommon.Utilities.Median(segment.Counts);

                // With few segments the weight is the bin count; otherwise it is the genomic length.
                if (this.Segments.Count <= 100)
                {
                    info.Weight = segment.BinCount;
                }
                else
                {
                    info.Weight = segment.End - segment.Begin;
                }

                segmentInfos.Add(info);
            }

            // Assign copy number and major chromosome count for each segment.
            // The Gaussian mixture path stays disabled for now, since we saw weird performance on chrY (CANV-115).
            bool useGaussianMixtureModel = false;

            if (!useGaussianMixtureModel)
            {
                AssignPloidyCallsDistance(Model, segmentInfos, medianVariantCoverage);
            }
            else
            {
                // Optimize model covariance; the returned likelihood is not used.
                FitGaussians(Model, segmentInfos);
                AssignPloidyCallsGaussianMixture();
            }

            // Merge neighboring segments that got the same copy number call, then score them.
            CanvasSegment.MergeSegments(ref this.Segments);
            CanvasSegment.AssignQualityScores(this.Segments, CanvasSegment.QScoreMethod.LogisticGermline);

            string coverageOutputPath = CanvasCommon.Utilities.GetCoverageAndVariantFrequencyOutputPath(outFile);
            CanvasSegment.WriteCoveragePlotData(this.Segments, Model.DiploidCoverage, ploidy, coverageOutputPath, referenceFolder);

            if (this.CNOracle != null)
            {
                this.GenerateReportVersusKnownCN();
            }

            // The ploidy header line, when present, is forwarded as an extra header.
            List <string> extraHeaders = new List <string>();
            if (ploidy != null && !string.IsNullOrEmpty(ploidy.HeaderLine))
            {
                extraHeaders.Add(ploidy.HeaderLine);
            }

            CanvasSegment.WriteSegments(outFile, this.Segments, referenceFolder, sampleName, extraHeaders, ploidy);
            return 0;
        }
Example #19
0
        /// <summary>
        /// Multi-sample calling: loads segments and variant frequencies for every sample, calls variants
        /// on the selected segment sets in parallel, merges and filters the calls, and writes the
        /// multi-sample VCF plus per-sample VCF, coverage plot data, bigWig and bedGraph outputs.
        /// </summary>
        /// <param name="variantFrequencyFiles">Variant frequency file per sample, indexed like <paramref name="sampleNames"/>.</param>
        /// <param name="segmentFiles">Segment file per sample, indexed like <paramref name="sampleNames"/>.</param>
        /// <param name="outVcfFile">Multi-sample VCF location; its directory receives the per-sample outputs.</param>
        /// <param name="ploidyBedPath">Ploidy BED path forwarded to the sample metrics computation.</param>
        /// <param name="referenceFolder">Reference folder forwarded to the writers.</param>
        /// <param name="sampleNames">Names of the samples to process; drives the iteration.</param>
        /// <param name="commonCnvsBedPath">Common CNVs BED path used when building segment sets.</param>
        /// <param name="sampleTypes">Sample type per sample, indexed like <paramref name="sampleNames"/>.</param>
        /// <returns>Always 0.</returns>
        internal int CallVariants(List <string> variantFrequencyFiles, List <string> segmentFiles,
                                  IFileLocation outVcfFile, string ploidyBedPath, string referenceFolder, List <string> sampleNames, string commonCnvsBedPath, List <SampleType> sampleTypes)
        {
            // Per-sample state, keyed by sample id.
            var metricsBySample       = new SampleMap <SampleMetrics>();
            var segmentsBySample      = new SampleMap <Segments>();
            var modelBySample         = new SampleMap <ICopyNumberModel>();
            var frequencyFileBySample = new SampleMap <string>();
            var kinships              = new SampleMap <SampleType>();

            // The input lists are parallel: entry i of each one belongs to sampleNames[i].
            for (var i = 0; i < sampleNames.Count; i++)
            {
                var sampleId         = new SampleId(sampleNames[i]);
                var loadedSegments   = Segments.ReadSegments(_logger, new FileLocation(segmentFiles[i]));
                var frequencyFile    = variantFrequencyFiles[i];
                var alleleCounts     = CanvasIO.ReadFrequenciesWrapper(_logger, new FileLocation(frequencyFile), loadedSegments.IntervalsByChromosome);
                loadedSegments.AddAlleles(alleleCounts);
                segmentsBySample.Add(sampleId, loadedSegments);

                var metrics = SampleMetrics.GetSampleInfo(loadedSegments.AllSegments, ploidyBedPath, _callerParameters.NumberOfTrimmedBins, sampleId);
                var model   = _copyNumberModelFactory.CreateModel(_callerParameters.MaximumCopyNumber, metrics.MaxCoverage, metrics.MeanCoverage, metrics.MeanMafCoverage);
                metricsBySample.Add(sampleId, metrics);
                modelBySample.Add(sampleId, model);
                frequencyFileBySample.Add(sampleId, frequencyFile);
                kinships.Add(sampleId, sampleTypes[i]);
            }

            var commonCnvSegmentSets = CreateSegmentSetsFromCommonCnvs(frequencyFileBySample,
                                                                       _callerParameters.MinAlleleCountsThreshold, commonCnvsBedPath, segmentsBySample);
            var          segmentsToCall = GetHighestLikelihoodSegments(commonCnvSegmentSets, metricsBySample, modelBySample).ToList();
            PedigreeInfo pedigreeInfo   = PedigreeInfo.GetPedigreeInfo(kinships, _callerParameters);

            // Call variants in parallel, capped by both the machine and the configured core limit.
            var parallelOptions = new ParallelOptions
            {
                MaxDegreeOfParallelism = Math.Min(Environment.ProcessorCount, _callerParameters.MaxCoreNumber)
            };
            Parallel.ForEach(
                segmentsToCall,
                parallelOptions,
                segmentSet => _variantCaller.CallVariant(segmentSet, metricsBySample, modelBySample, pedigreeInfo));

            // Regroup the called segments from per-locus sets into one list per sample.
            var calledBySample = new SampleMap <List <CanvasSegment> >();
            foreach (var sampleId in metricsBySample.SampleIds)
            {
                var calledForSample = segmentsToCall.Select(segmentSet => segmentSet[sampleId]).ToList();
                calledBySample.Add(sampleId, calledForSample);
            }

            var mergedBySample = MergeSegments(calledBySample, _callerParameters.MinimumCallSize, _qualityFilterThreshold);
            FilterExcessivelyShortSegments(mergedBySample);

            var outputFolder = outVcfFile.Directory;

            foreach (var sampleId in metricsBySample.SampleIds)
            {
                var metrics      = metricsBySample[sampleId];
                var plotDataPath = SingleSampleCallset.GetCoverageAndVariantFrequencyOutput(outputFolder, sampleId.ToString());
                CanvasSegment.WriteCoveragePlotData(mergedBySample[sampleId], metrics.MeanCoverage,
                                                    metrics.Ploidy, plotDataPath, referenceFolder);
            }

            // The de novo quality threshold is only passed on when a full pedigree is available.
            bool isPedigreeInfoSupplied = pedigreeInfo != null && pedigreeInfo.HasFullPedigree();
            var  denovoQualityThreshold = isPedigreeInfoSupplied ? (int?)_deNovoQualityFilterThreshold : null;
            var  ploidies               = metricsBySample.Select(entry => entry.Value.Ploidy).ToList();
            var  diploidCoverage        = metricsBySample.Select(entry => entry.Value.MeanCoverage).ToList();
            var  names                  = metricsBySample.SampleIds.Select(id => id.ToString()).ToList();

            CanvasSegmentWriter.WriteMultiSampleSegments(outVcfFile.FullName, mergedBySample, diploidCoverage, referenceFolder, names,
                                                         null, ploidies, _qualityFilterThreshold, denovoQualityThreshold, CanvasFilter.SegmentSizeCutoff, isPedigreeInfoSupplied);

            const string partitionBedgraphHeader = "track type=bedGraph visibility=full autoScale=on graphType=points";

            // Per-sample outputs: VCF, coverage bigWig, copy-number bedGraph and partition bedGraph.
            foreach (var sampleId in metricsBySample.SampleIds)
            {
                var sampleLabel    = sampleId.ToString();
                var metrics        = metricsBySample[sampleId];
                var mergedSegments = mergedBySample[sampleId];
                var sampleVcf      = SingleSampleCallset.GetVcfOutput(outputFolder, sampleLabel);
                CanvasSegmentWriter.WriteSegments(sampleVcf.FullName, mergedSegments,
                                                  metrics.MeanCoverage, referenceFolder, sampleLabel, null,
                                                  metrics.Ploidy, _qualityFilterThreshold, isPedigreeInfoSupplied, denovoQualityThreshold, null);

                var visualizationTemp   = outputFolder.CreateSubdirectory($"VisualizationTemp{sampleId}");
                var normalizationFactor = NormalizationCalculator.ComputeNormalizationFactor(mergedSegments);
                var bigWig = _coverageBigWigWriter.Write(mergedSegments, visualizationTemp, normalizationFactor);
                // The writer's result may be null, in which case there is nothing to move.
                bigWig?.MoveTo(SingleSampleCallset.GetCoverageBigWig(outputFolder, sampleLabel));

                var copyNumberBedGraph = SingleSampleCallset.GetCopyNumberBedGraph(outputFolder, sampleLabel);
                _copyNumberBedGraphWriter.Write(mergedSegments, metrics.Ploidy, copyNumberBedGraph);

                // The partition track is drawn from the segments as originally read, not the merged calls.
                var partitionBedGraph = SingleSampleCallset.GetPartitionBedGraph(outputFolder, sampleLabel);
                _partitionCoverageBedGraphWriter.Write(segmentsBySample[sampleId].AllSegments, partitionBedGraph, normalizationFactor, partitionBedgraphHeader);
            }

            return 0;
        }
Example #20
0
        /// <summary>
        /// Performs fragment binning: runs one <c>BinTask</c> per chromosome that has predefined bins,
        /// in parallel, then writes the predefined bins to the configured output file in BAM chromosome order.
        /// </summary>
        /// <returns>Always 0; failures are reported by throwing.</returns>
        /// <exception cref="Illumina.Common.IlluminaException">
        /// Thrown when no predefined bins file is configured, the reads are not paired-end, the bins
        /// reference a chromosome missing from the BAM, or no usable fragment overlaps any bin.
        /// </exception>
        public int Bin()
        {
            if (parameters.predefinedBinsFile == null)
            {
                throw new Illumina.Common.IlluminaException("Predefined bins in BED is required for fragment binning.");
            }
            if (!parameters.isPairedEnd) // Janus-SRS-189
            {
                throw new Illumina.Common.IlluminaException("Paired-end reads are required for fragment binning.");
            }

            Dictionary <string, List <SampleGenomicBin> > predefinedBins = Utilities.LoadBedFile(parameters.predefinedBinsFile, gcIndex: 3);

            // The BAM's chromosome list fixes the order used for task creation and for the output.
            List <string> chromosomes = GetChromosomesInBam();

            bool allBinChromosomesInBam = Utilities.IsSubset(predefinedBins.Keys, chromosomes);
            if (!allBinChromosomesInBam)
            {
                throw new Illumina.Common.IlluminaException(
                          String.Format("Not all chromosomes in {0} are found in {1}.", parameters.predefinedBinsFile, parameters.bamFile));
            }

            // One counting task per chromosome that has predefined bins.
            List <BinTask> tasks = new List <BinTask>();

            foreach (string chrom in chromosomes)
            {
                List <SampleGenomicBin> chromBins;
                if (predefinedBins.TryGetValue(chrom, out chromBins))
                {
                    tasks.Add(new BinTask(parameters.referenceFile, chrom, parameters.bamFile, chromBins));
                }
            }

            Console.WriteLine("Launch fragment binning jobs...");
            Console.Out.WriteLine();
            Parallel.ForEach(tasks, task => { task.DoIt(); });
            Console.WriteLine("Completed fragment binning jobs.");
            Console.Out.WriteLine();

            long usableFragmentCount = tasks.Sum(task => task.UsableFragmentCount);

            if (usableFragmentCount == 0)
            {
                throw new Illumina.Common.IlluminaException(String.Format("No passing-filter fragments overlapping bins are found in {0}", parameters.bamFile));
            }

            // Aggregate the per-chromosome bins, again in BAM chromosome order.
            List <SampleGenomicBin> finalBins = new List <SampleGenomicBin>();

            foreach (string chrom in chromosomes)
            {
                List <SampleGenomicBin> chromBins;
                if (predefinedBins.TryGetValue(chrom, out chromBins))
                {
                    finalBins.AddRange(chromBins);
                }
            }

            // Output!
            CanvasIO.WriteToTextFile(parameters.outFile, finalBins);

            return 0;
        }