Beispiel #1
0
        /// <summary>
        /// Creates a callset from the sample BAM, the reference/annotation inputs and the normal controls,
        /// and loads the genome metadata from "GenomeSize.xml" inside <paramref name="wholeGenomeFastaFolder"/>.
        /// </summary>
        /// <param name="bam">Sample BAM file; wrapped in a <c>Bam</c>.</param>
        /// <param name="sampleName">Name of the sample.</param>
        /// <param name="wholeGenomeFastaFolder">Folder that must contain "GenomeSize.xml".</param>
        /// <param name="outputFolder">Stored in <c>OutputFolder</c>.</param>
        /// <param name="kmerFasta">Stored in <c>KmerFasta</c>.</param>
        /// <param name="filterBed">Stored in <c>FilterBed</c>.</param>
        /// <param name="ploidyBed">Stored in <c>PloidyBed</c>.</param>
        /// <param name="normalVcfPath">Stored in <c>NormalVcfPath</c>.</param>
        /// <param name="isDbSnpVcf">Stored in <c>IsDbSnpVcf</c>.</param>
        /// <param name="normalBamPaths">Normal BAM files; each one is wrapped in a <c>Bam</c>. Must not be null.</param>
        /// <param name="manifest">Stored in <c>Manifest</c>.</param>
        /// <param name="somaticVcfPath">Stored in <c>SomaticVcfPath</c>.</param>
        /// <param name="outputVcfPath">Stored in <c>OutputVcfPath</c>.</param>
        public CanvasCallset(
            IFileLocation bam,
            string sampleName,
            IDirectoryLocation wholeGenomeFastaFolder,
            IDirectoryLocation outputFolder,
            IFileLocation kmerFasta,
            IFileLocation filterBed,
            IFileLocation ploidyBed,
            IFileLocation normalVcfPath,
            bool isDbSnpVcf,
            IEnumerable<IFileLocation> normalBamPaths,
            NexteraManifest manifest,
            IFileLocation somaticVcfPath,
            IFileLocation outputVcfPath)
        {
            Bam = new Bam(bam);
            SampleName = sampleName;
            WholeGenomeFastaFolder = wholeGenomeFastaFolder;
            OutputFolder = outputFolder;
            KmerFasta = kmerFasta;
            FilterBed = filterBed;
            PloidyBed = ploidyBed;
            NormalVcfPath = normalVcfPath;
            IsDbSnpVcf = isDbSnpVcf;
            Manifest = manifest;
            SomaticVcfPath = somaticVcfPath;
            OutputVcfPath = outputVcfPath;

            // Materialize the projection once. A bare Select() is lazy, so every enumeration of
            // NormalBamPaths would otherwise construct a fresh set of Bam objects and re-read the
            // caller's sequence.
            NormalBamPaths = normalBamPaths.Select(file => new Bam(file)).ToList();

            var genomeSizeXml = WholeGenomeFastaFolder.GetFileLocation("GenomeSize.xml");
            GenomeMetadata = new GenomeMetadata();
            GenomeMetadata.Deserialize(genomeSizeXml.FullName);
        }
Beispiel #2
0
        /// <summary>
        /// Writes one tab-separated line per manifest region to <paramref name="writer"/>:
        /// reference name, target interval begin minus one, target interval end, and region name.
        /// </summary>
        /// <param name="manifest">Manifest whose regions are written.</param>
        /// <param name="writer">Destination writer.</param>
        /// <param name="genome">
        /// Optional. When not null, a copy of the regions is sorted using the order of the genome's
        /// sequences before writing; when null, the regions are written in manifest order.
        /// </param>
        public static void WriteTargetBed(NexteraManifest manifest, BgzipOrStreamWriter writer, GenomeMetadata genome)
        {
            List<NexteraManifest.ManifestRegion> regionsToWrite = manifest.Regions;

            if (genome != null)
            {
                // Look up each sequence's position in the genome by name; used as the sort key.
                var indexBySequenceName = new Dictionary<string, int>();
                for (int i = 0; i < genome.Sequences.Count; i++)
                {
                    indexBySequenceName[genome.Sequences[i].Name] = i;
                }

                // Sort a copy so the manifest's own region list is left untouched.
                regionsToWrite = new List<NexteraManifest.ManifestRegion>(manifest.Regions);
                regionsToWrite.Sort((left, right) => left.CompareTo(right, indexBySequenceName));
            }

            foreach (NexteraManifest.ManifestRegion region in regionsToWrite)
            {
                TargetInterval interval = region.GetTargetInterval();
                string[] columns =
                {
                    interval.ReferenceName,
                    (interval.Begin - 1).ToString(CultureInfo.InvariantCulture),
                    interval.End.ToString(CultureInfo.InvariantCulture),
                    region.Name // region name is needed for PUMA metrics outputs to generate .coverage.csv file
                };
                writer.WriteLine(string.Join("\t", columns));
            }
        }
Beispiel #3
0
 /// <summary>
 /// Creates a calculator that remembers the manifest and the reference count bounds.
 /// </summary>
 /// <param name="manifest">Stored in <c>_manifest</c>.</param>
 /// <param name="minReferenceCount">Lower reference count bound; defaults to 1.</param>
 /// <param name="maxReferecneCount">Upper reference count bound; defaults to positive infinity.</param>
 public RawRatioCalculator(
     NexteraManifest manifest,
     double minReferenceCount = 1,
     double maxReferecneCount = double.PositiveInfinity)
 {
     _minReferenceCount = minReferenceCount;
     _maxReferenceCount = maxReferecneCount;
     _manifest = manifest;
 }
        /// <summary>
        /// Collects the counts of autosomal bins, both as one flat list and grouped by GC value.
        /// Assumes the bins are sorted by genomic coordinates.
        /// </summary>
        /// <param name="bins">Bins whose counts are to be normalized</param>
        /// <param name="manifest">
        /// Optional. When not null, only the bins returned by
        /// <c>EnrichmentUtilities.GetOnTargetBins</c> are considered; when null, all bins are.
        /// </param>
        /// <param name="countsByGC">
        /// Receives an array with one list per GC bin (<c>numberOfGCbins</c> entries); each list holds
        /// the counts of the bins whose GC value equals the list's index.
        /// </param>
        /// <param name="counts">Receives the counts of all autosomal bins that were considered.</param>
        public static void GetCountsByGC(List<SampleGenomicBin> bins, NexteraManifest manifest, out List<float>[] countsByGC, out List<float> counts)
        {
            countsByGC = new List<float>[numberOfGCbins];
            counts = new List<float>(bins.Count);

            // Every GC slot starts with an empty list.
            for (int gc = 0; gc < countsByGC.Length; gc++)
            {
                countsByGC[gc] = new List<float>();
            }

            var candidateBins = manifest == null ? bins : EnrichmentUtilities.GetOnTargetBins(bins, manifest);
            foreach (SampleGenomicBin bin in candidateBins)
            {
                if (GenomeMetadata.SequenceMetadata.IsAutosome(bin.GenomicBin.Chromosome))
                {
                    // Record the count under the bin's GC value, then in the overall list.
                    countsByGC[bin.GenomicBin.GC].Add(bin.Count);
                    counts.Add(bin.Count);
                }
            }
        }
Beispiel #5
0
 /// <summary>
 /// Opens a writer on <paramref name="outputPath"/> and writes the manifest regions to it as a BED
 /// file. Each region spans both probes and the target interval.
 /// The BED format uses a 0-based inclusive start and a 1-based inclusive end, which is the same
 /// as a 0-based inclusive start and a 0-based exclusive end.
 /// </summary>
 /// <param name="manifest">Manifest whose regions are written.</param>
 /// <param name="outputPath">Path of the BED file to create.</param>
 /// <param name="genome">Passed through unchanged to the writer-based overload.</param>
 public static void WriteRegionBed(NexteraManifest manifest, string outputPath, GenomeMetadata genome)
 {
     // The writer is disposed when the block exits, including when the inner call throws.
     using (var bedWriter = new BgzipOrStreamWriter(outputPath))
     {
         WriteRegionBed(manifest, bedWriter, genome);
     }
 }
        /// <summary>
        /// Writes <paramref name="manifest"/> into <paramref name="sandbox"/> under the manifest's own name.
        /// </summary>
        /// <param name="manifest">Manifest to write; its <c>Name</c> is used as the file name.</param>
        /// <param name="sandbox">Directory that receives the file.</param>
        /// <returns>Location of the written manifest file.</returns>
        private IFileLocation WriteManifest(NexteraManifest manifest, IDirectoryLocation sandbox)
        {
            IFileLocation manifestFile = sandbox.GetFileLocation(manifest.Name);
            NexteraManifestUtils.WriteNexteraManifests(manifest, manifestFile.FullName);
            return manifestFile;
        }
        /// <summary>
        /// Intersects the full dbSNP VCF with the manifest and writes the result into the sandbox.
        /// The output file is named "{manifest name}_{dbSNP VCF file name}".
        /// </summary>
        /// <param name="fullDbSnpVcf">Input VCF.</param>
        /// <param name="manifest">Manifest to intersect with.</param>
        /// <param name="sandBox">Directory that receives the output VCF.</param>
        /// <returns>Location of the intersected VCF.</returns>
        private IFileLocation CreateDbSnpVcfForManifest(IFileLocation fullDbSnpVcf, NexteraManifest manifest, IDirectoryLocation sandBox)
        {
            string targetedName = $"{manifest.Name}_{fullDbSnpVcf.Name}";
            IFileLocation targetedDbSnpVcf = sandBox.GetFileLocation(targetedName);

            Isas.Manifests.NexteraManifest.VcfUtilities.IntersectVcfWithManifest(
                fullDbSnpVcf.FullName,
                targetedDbSnpVcf.FullName,
                manifest);

            return targetedDbSnpVcf;
        }
Beispiel #8
0
        /// <summary>
        /// Reads a tab-separated binned file (chromosome, start, stop, count per line) and returns
        /// every bin count together with the indices of the bins that overlap a manifest region.
        /// </summary>
        /// <param name="binnedPath">Path of the binned file, read through <c>GzipReader</c>.</param>
        /// <param name="manifest">Manifest that supplies the regions per chromosome.</param>
        /// <param name="binCounts">Receives the count (4th column) of every line, in file order.</param>
        /// <param name="onTargetIndices">Receives the 0-based line indices of bins overlapping a region.</param>
        private static void LoadBinCounts(string binnedPath, NexteraManifest manifest, out List<double> binCounts,
                                          out List<int> onTargetIndices)
        {
            binCounts = new List<double>();
            onTargetIndices = new List<int>();

            var regionsByChrom = manifest.GetManifestRegionsByChromosome();
            List<NexteraManifest.ManifestRegion> chromRegions = null; // 1-based regions
            string previousChrom = null;
            int cursor = -1; // index of the first region that may still overlap the current bin

            using (GzipReader reader = new GzipReader(binnedPath))
            {
                string line;
                for (int binIndex = 0; (line = reader.ReadLine()) != null; binIndex++)
                {
                    string[] fields = line.Split('\t');
                    string chrom = fields[0];
                    int start = int.Parse(fields[1]); // 0-based, inclusive
                    int stop = int.Parse(fields[2]);  // 0-based, exclusive

                    // On a chromosome change, switch to that chromosome's regions (if it has any).
                    if (chrom != previousChrom)
                    {
                        previousChrom = chrom;
                        if (regionsByChrom.ContainsKey(chrom))
                        {
                            chromRegions = regionsByChrom[chrom];
                            cursor = 0;
                        }
                        else
                        {
                            chromRegions = null;
                        }
                    }

                    bool overlapsRegion = false;
                    if (chromRegions != null)
                    {
                        // Skip regions that end before the bin's first (1-based) position.
                        while (cursor < chromRegions.Count && chromRegions[cursor].End < start + 1)
                        {
                            cursor++;
                        }
                        overlapsRegion = cursor < chromRegions.Count && chromRegions[cursor].Start <= stop;
                    }

                    if (overlapsRegion)
                    {
                        onTargetIndices.Add(binIndex);
                    }

                    binCounts.Add(double.Parse(fields[3]));
                }
            }
        }
Beispiel #9
0
        /// <summary>
        /// Creates a callset from the sample BAM, the reference/annotation inputs and the normal controls,
        /// and loads the genome metadata from "GenomeSize.xml" inside <paramref name="wholeGenomeFastaFolder"/>.
        /// </summary>
        /// <param name="bam">Sample BAM file; wrapped in a <c>Bam</c>.</param>
        /// <param name="sampleName">Name of the sample.</param>
        /// <param name="wholeGenomeFastaFolder">Folder that must contain "GenomeSize.xml".</param>
        /// <param name="outputFolder">Stored in <c>OutputFolder</c>.</param>
        /// <param name="kmerFasta">Stored in <c>KmerFasta</c>.</param>
        /// <param name="filterBed">Stored in <c>FilterBed</c>.</param>
        /// <param name="ploidyBed">Stored in <c>PloidyBed</c>.</param>
        /// <param name="normalVcfPath">Stored in <c>NormalVcfPath</c>.</param>
        /// <param name="isDbSnpVcf">Stored in <c>IsDbSnpVcf</c>.</param>
        /// <param name="normalBamPaths">Normal BAM files; each one is wrapped in a <c>Bam</c>. Must not be null.</param>
        /// <param name="manifest">Stored in <c>Manifest</c>.</param>
        /// <param name="somaticVcfPath">Stored in <c>SomaticVcfPath</c>.</param>
        /// <param name="outputVcfPath">Stored in <c>OutputVcfPath</c>.</param>
        public CanvasCallset(
            IFileLocation bam,
            string sampleName,
            IDirectoryLocation wholeGenomeFastaFolder,
            IDirectoryLocation outputFolder,
            IFileLocation kmerFasta,
            IFileLocation filterBed,
            IFileLocation ploidyBed,
            IFileLocation normalVcfPath,
            bool isDbSnpVcf,
            IEnumerable <IFileLocation> normalBamPaths,
            NexteraManifest manifest,
            IFileLocation somaticVcfPath,
            IFileLocation outputVcfPath)
        {
            Bam                    = new Bam(bam);
            SampleName             = sampleName;
            WholeGenomeFastaFolder = wholeGenomeFastaFolder;
            OutputFolder           = outputFolder;
            KmerFasta              = kmerFasta;
            FilterBed              = filterBed;
            PloidyBed              = ploidyBed;
            NormalVcfPath          = normalVcfPath;
            IsDbSnpVcf             = isDbSnpVcf;
            Manifest               = manifest;
            SomaticVcfPath         = somaticVcfPath;
            OutputVcfPath          = outputVcfPath;

            // Build the Bam wrappers exactly once. Without ToList() the Select() stays lazy and each
            // enumeration of NormalBamPaths would create new Bam objects from the caller's sequence.
            NormalBamPaths = normalBamPaths.Select(file => new Bam(file)).ToList();

            var genomeSizeXml = WholeGenomeFastaFolder.GetFileLocation("GenomeSize.xml");

            GenomeMetadata = new GenomeMetadata();
            GenomeMetadata.Deserialize(genomeSizeXml.FullName);
        }
Beispiel #10
0
        /// <summary>
        /// Produces the merged binned file at <paramref name="mergedBinnedPath"/>.
        /// With a single input the file is copied (if it exists). With several inputs, the lines of the
        /// first input are rewritten with their 4th column replaced by the weighted average of the
        /// per-sample bin counts; each sample is weighted by the reciprocal of its on-target median
        /// bin count, and the weights are normalized to sum to 1.
        /// </summary>
        /// <param name="binnedPaths">Paths of the per-sample binned files.</param>
        /// <param name="mergedBinnedPath">Path of the output file; an existing file is replaced in the single-input case.</param>
        /// <param name="manifest">Optional manifest passed to <c>BinCounts</c>.</param>
        private static void GetWeightedAverageBinCount(IEnumerable <string> binnedPaths, string mergedBinnedPath,
                                                       NexteraManifest manifest = null)
        {
            // Materialize once: binnedPaths may be a lazy sequence and it is read several times below.
            List <string> paths = binnedPaths.ToList();
            int sampleCount = paths.Count;

            if (sampleCount == 1) // copy file
            {
                string onlyPath = paths[0];
                if (File.Exists(onlyPath))
                {
                    if (File.Exists(mergedBinnedPath))
                    {
                        File.Delete(mergedBinnedPath);
                    }
                    File.Copy(onlyPath, mergedBinnedPath);
                }
            }
            else // merge normal samples
            {
                double[]        weights           = new double[sampleCount];
                List <double>[] binCountsBySample = new List <double> [sampleCount];
                for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
                {
                    var           binCounts = new BinCounts(paths[sampleIndex], manifest: manifest);
                    List <double> counts    = binCounts.AllCounts;
                    // If a manifest is available, get the median of bins overlapping the targeted regions only.
                    // For small panels, there could be a lot of bins with zero count and the median would be 0 if taken over all the bins, resulting in division by zero.
                    double median = binCounts.OnTargetMedianBinCount;
                    weights[sampleIndex]           = median > 0 ? 1.0 / median : 0;
                    binCountsBySample[sampleIndex] = counts;
                }

                // Normalize so the weights sum to 1. If every sample had a non-positive median, all raw
                // weights are 0 and dividing would give NaN for every bin; use equal weights instead.
                double weightSum = weights.Sum();
                for (int i = 0; i < sampleCount; i++)
                {
                    weights[i] = weightSum > 0 ? weights[i] / weightSum : 1.0 / sampleCount;
                }

                // Compute the weighted average of bin counts across samples.
                // First() keeps the "sequence contains no elements" failure for an empty input.
                using (GzipReader reader = new GzipReader(paths.First()))
                using (GzipWriter writer = new GzipWriter(mergedBinnedPath))
                {
                    string line;
                    int    lineIdx = 0;
                    while ((line = reader.ReadLine()) != null)
                    {
                        string[] toks = line.Split('\t');
                        double weightedBinCount = 0;
                        for (int i = 0; i < sampleCount; i++)
                        {
                            weightedBinCount += weights[i] * binCountsBySample[i][lineIdx];
                        }
                        toks[3] = String.Format("{0}", weightedBinCount);
                        writer.WriteLine(String.Join("\t", toks));
                        lineIdx++;
                    }
                }
            }
        }
Beispiel #11
0
        /// <summary>
        /// Pick the best normal control that has the smallest mean squared log-ratios (LR2s)
        /// against the tumor sample, and copy its binned file to <paramref name="bestBinnedPath"/>.
        /// With a single normal, that normal is copied without any comparison.
        /// </summary>
        /// <param name="tumorBinnedPath">Binned file of the tumor sample.</param>
        /// <param name="normalBinnedPaths">Binned files of the candidate normal samples.</param>
        /// <param name="bestBinnedPath">Destination of the copy; an existing file is replaced.</param>
        /// <param name="manifest">Optional manifest passed to <c>BinCounts</c>.</param>
        private static void GetBestLR2BinCount(string tumorBinnedPath, IEnumerable<string> normalBinnedPaths, string bestBinnedPath,
            NexteraManifest manifest = null)
        {
            // Materialize once: the sequence is indexed repeatedly below.
            List<string> normalPaths = normalBinnedPaths.ToList();
            int normalSampleCount = normalPaths.Count;

            // Starts at the first normal and is only replaced by a strictly smaller score. If no
            // score compares below +infinity (for example all are NaN), the first normal is used
            // rather than ending up with an invalid index.
            int bestNormalSampleIndex = 0;

            if (normalSampleCount > 1) // find the best normal
            {
                List<double[]> binCountsByNormalSample = new List<double[]>();
                for (int normalSampleIndex = 0; normalSampleIndex < normalSampleCount; normalSampleIndex++)
                {
                    var binCounts = new BinCounts(normalPaths[normalSampleIndex], manifest: manifest);
                    List<double> counts = binCounts.OnTargetCounts;
                    double median = binCounts.OnTargetMedianBinCount;
                    // If a manifest is available, get the median of bins overlapping the targeted regions only.
                    // For small panels, there could be a lot of bins with zero count and the median would be 0 if taken over all the bins, resulting in division by zero.
                    double weight = median > 0 ? 1.0 / median : 0;
                    binCountsByNormalSample.Add(counts.Select(cnt => cnt * weight).ToArray());
                }

                // Scale the tumor counts the same way as the normals.
                double[] tumorBinCounts;
                {
                    var binCounts = new BinCounts(tumorBinnedPath, manifest: manifest);
                    List<double> counts = binCounts.OnTargetCounts;
                    double tumorMedian = binCounts.OnTargetMedianBinCount;
                    double tumorWeight = tumorMedian > 0 ? 1.0 / tumorMedian : 0;
                    tumorBinCounts = counts.Select(cnt => cnt * tumorWeight).ToArray();
                }

                // Find the best normal sample
                double minMeanSquaredLogRatios = double.PositiveInfinity;
                for (int normalSampleIndex = 0; normalSampleIndex < normalSampleCount; normalSampleIndex++)
                {
                    // Item1 is the mean of squared log ratios; Item2 (the ignored bin count) is not used yet.
                    // TODO: Skip a (bad) normal sample if too many bins were ignored.
                    //       Donavan's script skips a normal sample if more than 100 log ratios is NA.
                    //       The cut-off is likely panel-dependent.
                    double meanSquaredLogRatios =
                        GetMeanSquaredLogRatios(tumorBinCounts, binCountsByNormalSample[normalSampleIndex]).Item1;
                    if (meanSquaredLogRatios < minMeanSquaredLogRatios)
                    {
                        minMeanSquaredLogRatios = meanSquaredLogRatios;
                        bestNormalSampleIndex = normalSampleIndex;
                    }
                }
            }

            // copy file
            string srcBinnedPath = normalPaths[bestNormalSampleIndex];
            if (File.Exists(srcBinnedPath))
            {
                if (File.Exists(bestBinnedPath)) { File.Delete(bestBinnedPath); }
                File.Copy(srcBinnedPath, bestBinnedPath);
            }
        }
 /// <summary>
 /// Creates a generator for the given sample binned file: loads the PCA model from
 /// <paramref name="pcaModelFile"/> and sets up a <c>RawRatioCalculator</c> with the manifest and
 /// the bin count bounds.
 /// </summary>
 /// <param name="sampleBinnedFile">Stored in <c>_sampleBinnedFile</c>.</param>
 /// <param name="pcaModelFile">File passed to the <c>PCAModel</c> constructor.</param>
 /// <param name="manifest">Stored and forwarded to the ratio calculator.</param>
 /// <param name="minBinCount">Lower bin count bound; defaults to 1.</param>
 /// <param name="maxBinCount">Upper bin count bound; defaults to positive infinity.</param>
 public PCAReferenceGenerator(
     IFileLocation sampleBinnedFile,
     IFileLocation pcaModelFile,
     NexteraManifest manifest,
     double minBinCount = 1,
     double maxBinCount = double.PositiveInfinity)
 {
     _sampleBinnedFile = sampleBinnedFile;
     _model = new PCAModel(pcaModelFile);
     _manifest = manifest;
     _minBinCount = minBinCount;
     _maxBinCount = maxBinCount;

     // Same values as the fields assigned just above.
     _ratioCalculator = new RawRatioCalculator(manifest, minBinCount, maxBinCount);
 }
Beispiel #13
0
 /// <summary>
 /// Fills <c>Counts</c> from the given bins. With a manifest, the manifest-aware overload also
 /// fills <c>OnTargetIndices</c>; without one, only <c>Counts</c> is set.
 /// </summary>
 /// <param name="bins">Bins whose counts are loaded.</param>
 /// <param name="manifest">Optional manifest; null selects the plain conversion.</param>
 private void LoadBinCounts(IEnumerable <SampleGenomicBin> bins, NexteraManifest manifest)
 {
     if (manifest != null)
     {
         LoadBinCounts(bins, manifest, out Counts, out OnTargetIndices);
         return;
     }

     Counts = bins.Select(bin => (double)bin.Count).ToList();
 }
Beispiel #14
0
 /// <summary>
 /// Fills <c>Counts</c> from the binned file. With a manifest, the manifest-aware overload also
 /// fills <c>OnTargetIndices</c>; without one, only <c>Counts</c> is loaded.
 /// </summary>
 /// <param name="binnedPath">Path of the binned file.</param>
 /// <param name="manifest">Optional manifest; null selects the overload without on-target indices.</param>
 private void LoadBinCounts(string binnedPath, NexteraManifest manifest)
 {
     if (manifest != null)
     {
         LoadBinCounts(binnedPath, manifest, out Counts, out OnTargetIndices);
         return;
     }

     LoadBinCounts(binnedPath, out Counts);
 }
Beispiel #15
0
 /// <summary>
 /// Creates a normalizer over the given bins and runs <c>initialize()</c>.
 /// </summary>
 /// <param name="bins">Bins to normalize.</param>
 /// <param name="manifest">Stored in <c>manifest</c>.</param>
 /// <param name="robustnessIter">Applied only when non-negative; otherwise the field keeps its current value.</param>
 /// <param name="countTransformer">Applied only when <paramref name="invCountTransformer"/> is also supplied.</param>
 /// <param name="invCountTransformer">Applied only when <paramref name="countTransformer"/> is also supplied.</param>
 public LoessGCNormalizer(IEnumerable<GenomicBin> bins, NexteraManifest manifest, int robustnessIter = 2,
     Func<float, double> countTransformer = null, Func<double, float> invCountTransformer = null)
 {
     this.bins = bins;
     this.manifest = manifest;

     if (robustnessIter >= 0)
     {
         this.robustnessIter = robustnessIter;
     }

     // The two transformers are taken as a pair; a lone one is ignored.
     bool hasBothTransformers = countTransformer != null && invCountTransformer != null;
     if (hasBothTransformers)
     {
         this.countTransformer = countTransformer;
         this.invCountTransformer = invCountTransformer;
     }

     initialize();
 }
        /// <summary>
        /// Creates a generator over the control binned files after checking that each one exists.
        /// </summary>
        /// <param name="controlBinnedFiles">Control binned files; stored after validation.</param>
        /// <param name="manifest">Stored in <c>_manifest</c>.</param>
        /// <exception cref="FileNotFoundException">A control binned file does not exist.</exception>
        public WeightedAverageReferenceGenerator(IEnumerable <IFileLocation> controlBinnedFiles, NexteraManifest manifest)
        {
            foreach (IFileLocation controlFile in controlBinnedFiles)
            {
                if (controlFile.Exists)
                {
                    continue;
                }

                throw new FileNotFoundException(controlFile.FullName + " does not exist.");
            }

            _manifest = manifest;
            _controlBinnedFiles = controlBinnedFiles;
        }
Beispiel #17
0
        /// <summary>
        /// Converts the bins to a list of counts and collects the indices of the bins that overlap a
        /// manifest region on the same chromosome.
        /// </summary>
        /// <param name="bins">Bins to read, in the order their indices are reported.</param>
        /// <param name="manifest">Manifest that supplies the regions per chromosome.</param>
        /// <param name="binCounts">Receives the count of every bin, in input order.</param>
        /// <param name="onTargetIndices">Receives the 0-based indices of bins overlapping a region.</param>
        private static void LoadBinCounts(IEnumerable <SampleGenomicBin> bins, NexteraManifest manifest,
                                          out List <double> binCounts, out List <int> onTargetIndices)
        {
            binCounts = new List <double>();
            onTargetIndices = new List <int>();

            var regionsByChrom = manifest.GetManifestRegionsByChromosome();
            List <NexteraManifest.ManifestRegion> chromRegions = null; // 1-based regions
            string previousChrom = null;
            int cursor = -1; // index of the first region that may still overlap the current bin
            int binIndex = 0;

            foreach (var bin in bins)
            {
                string chrom = bin.GenomicBin.Chromosome;

                // On a chromosome change, switch to that chromosome's regions (if it has any).
                if (chrom != previousChrom)
                {
                    previousChrom = chrom;
                    if (regionsByChrom.ContainsKey(chrom))
                    {
                        chromRegions = regionsByChrom[chrom];
                        cursor = 0;
                    }
                    else
                    {
                        chromRegions = null;
                    }
                }

                bool overlapsRegion = false;
                if (chromRegions != null)
                {
                    // Skip regions that end before the bin's first (1-based) position.
                    while (cursor < chromRegions.Count && chromRegions[cursor].End < bin.Start + 1)
                    {
                        cursor++;
                    }
                    overlapsRegion = cursor < chromRegions.Count && chromRegions[cursor].Start <= bin.Stop;
                }

                if (overlapsRegion)
                {
                    onTargetIndices.Add(binIndex);
                }

                binCounts.Add(bin.Count);
                binIndex++;
            }
        }
Beispiel #18
0
        /// <summary>
        /// Produces the merged binned file at <paramref name="mergedBinnedPath"/>.
        /// With a single input the file is copied (if it exists). With several inputs, the lines of the
        /// first input are rewritten with their 4th column replaced by the weighted average of the
        /// per-sample bin counts; each sample is weighted by the reciprocal of its on-target median
        /// bin count, and the weights are normalized to sum to 1.
        /// </summary>
        /// <param name="binnedPaths">Paths of the per-sample binned files.</param>
        /// <param name="mergedBinnedPath">Path of the output file; an existing file is replaced in the single-input case.</param>
        /// <param name="manifest">Optional manifest passed to <c>BinCounts</c>.</param>
        private static void GetWeightedAverageBinCount(IEnumerable<string> binnedPaths, string mergedBinnedPath,
            NexteraManifest manifest = null)
        {
            // Materialize once: binnedPaths may be a lazy sequence and it is read several times below.
            List<string> paths = binnedPaths.ToList();
            int sampleCount = paths.Count;
            if (sampleCount == 1) // copy file
            {
                string onlyPath = paths[0];
                if (File.Exists(onlyPath))
                {
                    if (File.Exists(mergedBinnedPath)) { File.Delete(mergedBinnedPath); }
                    File.Copy(onlyPath, mergedBinnedPath);
                }
            }
            else // merge normal samples
            {
                double[] weights = new double[sampleCount];
                List<double>[] binCountsBySample = new List<double>[sampleCount];
                for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
                {
                    var binCounts = new BinCounts(paths[sampleIndex], manifest: manifest);
                    List<double> counts = binCounts.AllCounts;
                    // If a manifest is available, get the median of bins overlapping the targeted regions only.
                    // For small panels, there could be a lot of bins with zero count and the median would be 0 if taken over all the bins, resulting in division by zero.
                    double median = binCounts.OnTargetMedianBinCount;
                    weights[sampleIndex] = median > 0 ? 1.0 / median : 0;
                    binCountsBySample[sampleIndex] = counts;
                }

                // Normalize so the weights sum to 1. If every sample had a non-positive median, all raw
                // weights are 0 and dividing would give NaN for every bin; use equal weights instead.
                double weightSum = weights.Sum();
                for (int i = 0; i < sampleCount; i++)
                {
                    weights[i] = weightSum > 0 ? weights[i] / weightSum : 1.0 / sampleCount;
                }

                // Compute the weighted average of bin counts across samples.
                // First() keeps the "sequence contains no elements" failure for an empty input.
                using (GzipReader reader = new GzipReader(paths.First()))
                using (GzipWriter writer = new GzipWriter(mergedBinnedPath))
                {
                    string line;
                    int lineIdx = 0;
                    while ((line = reader.ReadLine()) != null)
                    {
                        string[] toks = line.Split('\t');
                        double weightedBinCount = 0;
                        for (int i = 0; i < sampleCount; i++) { weightedBinCount += weights[i] * binCountsBySample[i][lineIdx]; }
                        toks[3] = String.Format("{0}", weightedBinCount);
                        writer.WriteLine(String.Join("\t", toks));
                        lineIdx++;
                    }
                }
            }
        }
Beispiel #19
0
 /// <summary>
 /// Bundles the inputs of a tumor-normal enrichment run; every argument is stored as given.
 /// </summary>
 /// <param name="tumorBam">Stored in <c>TumorBam</c>.</param>
 /// <param name="normalBam">Stored in <c>NormalBam</c>.</param>
 /// <param name="normalVcf">Stored in <c>NormalVcf</c>.</param>
 /// <param name="somaticVcf">Stored in <c>SomaticVcf</c>.</param>
 /// <param name="genomeMetadata">Stored in <c>GenomeMetadata</c>.</param>
 /// <param name="nexteraManifest">Stored in <c>NexteraManifest</c>.</param>
 public CanvasTumorNormalEnrichmentInput(
     Bam tumorBam,
     Bam normalBam,
     Vcf normalVcf,
     Vcf somaticVcf,
     GenomeMetadata genomeMetadata,
     NexteraManifest nexteraManifest)
 {
     // Alignments
     TumorBam = tumorBam;
     NormalBam = normalBam;

     // Variant calls
     NormalVcf = normalVcf;
     SomaticVcf = somaticVcf;

     // Reference data
     GenomeMetadata = genomeMetadata;
     NexteraManifest = nexteraManifest;
 }
Beispiel #20
0
 /// <summary>
 /// Creates a normalizer over the given bins and runs <c>initialize()</c>.
 /// </summary>
 /// <param name="bins">Bins to normalize.</param>
 /// <param name="manifest">Stored in <c>manifest</c>.</param>
 /// <param name="robustnessIter">Applied only when non-negative; otherwise the field keeps its current value.</param>
 /// <param name="countTransformer">Applied only when <paramref name="invCountTransformer"/> is also supplied.</param>
 /// <param name="invCountTransformer">Applied only when <paramref name="countTransformer"/> is also supplied.</param>
 public LoessGCNormalizer(IEnumerable <SampleGenomicBin> bins, NexteraManifest manifest, int robustnessIter = 2,
                          Func <float, double> countTransformer = null, Func <double, float> invCountTransformer = null)
 {
     this.bins = bins;
     this.manifest = manifest;

     if (robustnessIter >= 0)
     {
         this.robustnessIter = robustnessIter;
     }

     // The two transformers are taken as a pair; a lone one is ignored.
     bool hasBothTransformers = countTransformer != null && invCountTransformer != null;
     if (hasBothTransformers)
     {
         this.countTransformer = countTransformer;
         this.invCountTransformer = invCountTransformer;
     }

     initialize();
 }
 /// <summary>
 /// Bundles the inputs of an enrichment run. All arguments are stored as given, except the
 /// control BAMs, which are copied into a read-only collection.
 /// </summary>
 /// <param name="bam">Stored in <c>Bam</c>.</param>
 /// <param name="genomeMetadata">Stored in <c>GenomeMetadata</c>.</param>
 /// <param name="controlBamPaths">Control BAMs; snapshotted into <c>NormalBamPaths</c>. Must not be null.</param>
 /// <param name="nexteraManifest">Stored in <c>NexteraManifest</c>.</param>
 /// <param name="precomputedControl">Stored in <c>PrecomputedControl</c>.</param>
 /// <param name="ploidyInfo">Stored in <c>PloidyInfo</c>.</param>
 /// <param name="predefinedBinsFile">Stored in <c>PredefinedBinsFile</c>.</param>
 /// <param name="pcaModels">Stored in <c>PcaModels</c>.</param>
 public CanvasEnrichmentInput(
     Bam bam,
     GenomeMetadata genomeMetadata,
     IEnumerable <Bam> controlBamPaths,
     NexteraManifest nexteraManifest,
     CanvasEnrichmentPrecomputedControl precomputedControl,
     SamplePloidyInfo ploidyInfo,
     IFileLocation predefinedBinsFile,
     CanvasPcaModels pcaModels)
 {
     Bam = bam;
     GenomeMetadata = genomeMetadata;
     NexteraManifest = nexteraManifest;
     PrecomputedControl = precomputedControl;

     // Copy the controls so later changes to the caller's sequence are not observed.
     NormalBamPaths = controlBamPaths.ToList().AsReadOnly();

     PloidyInfo = ploidyInfo;
     PredefinedBinsFile = predefinedBinsFile;
     PcaModels = pcaModels;
 }
Beispiel #22
0
        /// <summary>
        /// Builds the <c>CanvasCallset</c> for this run from the command-line options, writing the
        /// result VCF to "CNV.vcf.gz" in the output directory.
        /// </summary>
        /// <param name="logger">Logger whose <c>Error</c> method is handed to the manifest loader.</param>
        /// <returns>The assembled callset.</returns>
        private CanvasCallset GetCallset(ILogger logger)
        {
            var analysisDetails = new AnalysisDetails(
                CommonOptions.OutputDirectory,
                CommonOptions.WholeGenomeFasta,
                CommonOptions.KmerFasta,
                CommonOptions.FilterBed,
                SingleSampleCommonOptions.PloidyVcf,
                null);
            IFileLocation outputVcfPath = CommonOptions.OutputDirectory.GetFileLocation("CNV.vcf.gz");
            NexteraManifest manifest = new NexteraManifest(_manifest.FullName, null, logger.Error);

            return new CanvasCallset(
                _tumorNormalOptions.TumorBam,
                SingleSampleCommonOptions.SampleName,
                SingleSampleCommonOptions.BAlleleSites,
                SingleSampleCommonOptions.IsDbSnpVcf,
                new[] { _normalBam },
                manifest,
                _tumorNormalOptions.SomaticVcf,
                outputVcfPath,
                analysisDetails);
        }
Beispiel #23
0
        /// <summary>
        /// Creates a callset by building the <c>SingleSampleCallset</c> from the sample inputs and
        /// storing the manifest, somatic VCF, analysis details and normal BAMs.
        /// </summary>
        /// <param name="bam">Sample BAM file; wrapped in a <c>Bam</c>.</param>
        /// <param name="sampleName">Name of the sample.</param>
        /// <param name="normalVcfPath">Passed to the <c>SingleSampleCallset</c>.</param>
        /// <param name="isDbSnpVcf">Passed to the <c>SingleSampleCallset</c>.</param>
        /// <param name="normalBamPaths">Normal BAM files; each one is wrapped in a <c>Bam</c>. Must not be null.</param>
        /// <param name="manifest">Stored in <c>Manifest</c>.</param>
        /// <param name="somaticVcfPath">Stored in <c>SomaticVcfPath</c>.</param>
        /// <param name="outputVcfPath">Passed to the <c>SingleSampleCallset</c>.</param>
        /// <param name="analysisDetails">Stored; its <c>OutputFolder</c> is passed to the <c>SingleSampleCallset</c>. Must not be null.</param>
        public CanvasCallset(
            IFileLocation bam,
            string sampleName,
            IFileLocation normalVcfPath,
            bool isDbSnpVcf,
            IEnumerable <IFileLocation> normalBamPaths,
            NexteraManifest manifest,
            IFileLocation somaticVcfPath,
            IFileLocation outputVcfPath,
            AnalysisDetails analysisDetails)

        {
            SingleSampleCallset = new SingleSampleCallset(new Bam(bam), sampleName, normalVcfPath, isDbSnpVcf, analysisDetails.OutputFolder, outputVcfPath);
            Manifest            = manifest;
            SomaticVcfPath      = somaticVcfPath;
            AnalysisDetails     = analysisDetails;

            // Build the Bam wrappers exactly once. Without ToList() the Select() stays lazy and each
            // enumeration of NormalBamPaths would create new Bam objects from the caller's sequence.
            NormalBamPaths = normalBamPaths.Select(file => new Bam(file)).ToList();
        }
Beispiel #24
0
        /// <summary>
        /// Writes the tumor/normal bin ratios to <paramref name="ratioBinnedPath"/>.
        /// The two input files are read line by line in parallel. Each output line is the normal
        /// line with its 4th column replaced by
        /// tumorCount / normalCount * ploidy factor * library size factor.
        /// Bins whose normal count is below 1 are not written.
        /// </summary>
        /// <param name="tumorBinnedPath">Binned file of the tumor sample.</param>
        /// <param name="normalBinnedPath">Binned file of the normal sample; drives the iteration.</param>
        /// <param name="ratioBinnedPath">Output file.</param>
        /// <param name="ploidyBedPath">Optional ploidy BED file; null or empty means no reference ploidy is loaded.</param>
        /// <param name="manifest">Optional manifest passed to <c>BinCounts</c> for the median computation.</param>
        /// <exception cref="System.IO.InvalidDataException">The tumor file has fewer lines than the normal file.</exception>
        private static void GetBinRatio(string tumorBinnedPath, string normalBinnedPath, string ratioBinnedPath,
                                        string ploidyBedPath, NexteraManifest manifest = null)
        {
            PloidyInfo referencePloidy = String.IsNullOrEmpty(ploidyBedPath) ? null : PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);
            double tumorMedian  = (new BinCounts(tumorBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
            double normalMedian = (new BinCounts(normalBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
            // Scale by the ratio of medians only when both are positive; otherwise apply no scaling.
            double librarySizeFactor = (tumorMedian > 0 && normalMedian > 0) ? normalMedian / tumorMedian : 1;

            using (GzipReader tumorReader = new GzipReader(tumorBinnedPath))
            using (GzipReader normalReader = new GzipReader(normalBinnedPath))
            using (GzipWriter writer = new GzipWriter(ratioBinnedPath))
            {
                string normalLine;
                while ((normalLine = normalReader.ReadLine()) != null)
                {
                    string tumorLine = tumorReader.ReadLine();
                    if (tumorLine == null)
                    {
                        // The files are paired line by line; a shorter tumor file would otherwise
                        // surface as a NullReferenceException with no hint about the cause.
                        throw new System.IO.InvalidDataException(
                            "Tumor binned file '" + tumorBinnedPath + "' has fewer lines than normal binned file '" + normalBinnedPath + "'.");
                    }

                    string[] normalToks  = normalLine.Split('\t');
                    string[] tumorToks   = tumorLine.Split('\t');
                    double   normalCount = double.Parse(normalToks[3]);
                    double   tumorCount  = double.Parse(tumorToks[3]);
                    // The weighted average count of a bin could be less than 1.
                    // Using these small counts for coverage normalization creates large ratios.
                    // It would be better to just drop these bins so we don't introduce too much noise into segmentation and CNV calling.
                    if (normalCount < 1)
                    {
                        continue; // skip the bin
                    }
                    string chrom = normalToks[0];
                    int    start = int.Parse(normalToks[1]);
                    int    end   = int.Parse(normalToks[2]);
                    // Scale by the reference ploidy of this interval relative to diploid.
                    double factor = CanvasDiploidBinRatioFactor * GetPloidy(referencePloidy, chrom, start, end) / 2.0;
                    double ratio  = tumorCount / normalCount * factor * librarySizeFactor;
                    normalToks[3] = String.Format("{0}", ratio);
                    writer.WriteLine(String.Join("\t", normalToks));
                }
            }
        }
Beispiel #25
0
        /// <summary>
        /// Creates a generator after checking that the sample binned file and every control binned
        /// file exist.
        /// </summary>
        /// <param name="sampleBinnedFile">Sample binned file; stored after validation.</param>
        /// <param name="controlBinnedFiles">Control binned files; stored after validation.</param>
        /// <param name="manifest">Stored in <c>_manifest</c>.</param>
        /// <exception cref="FileNotFoundException">The sample file or a control file does not exist.</exception>
        public BestLR2ReferenceGenerator(IFileLocation sampleBinnedFile, IEnumerable <IFileLocation> controlBinnedFiles,
                                         NexteraManifest manifest)
        {
            // The sample file is checked first, then the controls in order.
            if (!sampleBinnedFile.Exists)
            {
                throw new FileNotFoundException(sampleBinnedFile.FullName + " does not exist.");
            }

            foreach (IFileLocation controlFile in controlBinnedFiles)
            {
                if (controlFile.Exists)
                {
                    continue;
                }

                throw new FileNotFoundException(controlFile.FullName + " does not exist.");
            }

            _manifest = manifest;
            _controlBinnedFiles = controlBinnedFiles;
            _sampleBinnedFile = sampleBinnedFile;
        }
Beispiel #26
0
        /// <summary>
        /// Dispatches GC normalization to the implementation selected by <paramref name="mode"/>.
        /// </summary>
        /// <param name="bins">Bins whose counts are to be normalized</param>
        /// <param name="manifest">Manifest forwarded to the selected normalizer.</param>
        /// <param name="mode">GC normalization mode</param>
        /// <exception cref="Illumina.Common.IlluminaException">The mode is neither MedianByGC nor LOESS.</exception>
        static void NormalizeByGC(List <SampleGenomicBin> bins, NexteraManifest manifest, CanvasGCNormalizationMode mode)
        {
            if (mode == CanvasGCNormalizationMode.MedianByGC)
            {
                NormalizeByGC(bins, manifest: manifest);
                return;
            }

            if (mode == CanvasGCNormalizationMode.LOESS)
            {
                // Counts are log-transformed on the way in and exponentiated on the way out;
                // robustnessIter is fixed at 0.
                var loess = new LoessGCNormalizer(
                    bins,
                    manifest,
                    robustnessIter: 0,
                    countTransformer: count => Math.Log(count),
                    invCountTransformer: value => (float)Math.Exp(value));
                loess.Normalize();
                return;
            }

            throw new Illumina.Common.IlluminaException("Unsupported Canvas GC normalization mode: " + mode.ToString());
        }
Beispiel #27
0
        /// <summary>
        /// Builds a callset from a single-sample callset, the analysis details, optional normal BAMs,
        /// a manifest and an optional somatic VCF.
        /// </summary>
        /// <param name="singleSampleCallset">Stored in <c>SingleSampleCallset</c>.</param>
        /// <param name="analysisDetails">Stored in <c>AnalysisDetails</c>.</param>
        /// <param name="normalBamPaths">Optional; when not null each file is wrapped in a <c>Bam</c>.</param>
        /// <param name="manifest">Stored in <c>Manifest</c>.</param>
        /// <param name="somaticVcfPath">Optional; when null, <c>SomaticVcfPath</c> is not assigned.</param>
        public CanvasCallset(
            SingleSampleCallset singleSampleCallset,
            AnalysisDetails analysisDetails,
            IEnumerable<IFileLocation> normalBamPaths,
            NexteraManifest manifest,
            IFileLocation somaticVcfPath)
        {
            SingleSampleCallset = singleSampleCallset;
            Manifest = manifest;
            AnalysisDetails = analysisDetails;

            // Optional inputs: the corresponding property is left untouched when the argument is null.
            if (somaticVcfPath != null)
                SomaticVcfPath = somaticVcfPath;

            // NOTE(review): Select is presumably LINQ's deferred Select, in which case new Bam objects
            // are created on every enumeration of NormalBamPaths - confirm this is intended.
            if (normalBamPaths != null)
                NormalBamPaths = normalBamPaths.Select(bamFile => new Bam(bamFile));
        }
 /// <summary>
 /// Assembles the CanvasCallset for this run from the common options, the tumor-normal options,
 /// the single normal BAM and the manifest file.
 /// </summary>
 /// <param name="logger">Logger whose Error method is handed to the manifest constructor.</param>
 /// <returns>The newly constructed callset; its output VCF is "CNV.vcf.gz" in the output directory.</returns>
 private CanvasCallset GetCallset(ILogger logger)
 {
     IFileLocation cnvVcfLocation = CommonOptions.OutputDirectory.GetFileLocation("CNV.vcf.gz");
     var nexteraManifest = new NexteraManifest(_manifest.FullName, null, logger.Error);

     return new CanvasCallset(
         _tumorNormalOptions.TumorBam,
         CommonOptions.SampleName,
         CommonOptions.WholeGenomeFasta,
         CommonOptions.OutputDirectory,
         CommonOptions.KmerFasta,
         CommonOptions.FilterBed,
         CommonOptions.PloidyBed,
         CommonOptions.BAlleleSites,
         CommonOptions.IsDbSnpVcf,
         new[] { _normalBam },
         nexteraManifest,
         _tumorNormalOptions.SomaticVcf,
         cnvVcfLocation);
 }
 /// <summary>
 /// Assembles the CanvasCallset for this run from the common options and the somatic enrichment
 /// options (sample BAM, control BAMs and manifest). No somatic VCF is supplied (null is passed).
 /// </summary>
 /// <param name="logger">Logger whose Error method is handed to the manifest constructor.</param>
 /// <returns>The newly constructed callset; its output VCF is "CNV.vcf.gz" in the output directory.</returns>
 private CanvasCallset GetCallset(ILogger logger)
 {
     IFileLocation cnvVcfLocation = CommonOptions.OutputDirectory.GetFileLocation("CNV.vcf.gz");
     var enrichmentManifest = new NexteraManifest(_somaticEnrichmentOptions.Manifest.FullName, null, logger.Error);

     return new CanvasCallset(
         _somaticEnrichmentOptions.Bam,
         CommonOptions.SampleName,
         CommonOptions.WholeGenomeFasta,
         CommonOptions.OutputDirectory,
         CommonOptions.KmerFasta,
         CommonOptions.FilterBed,
         CommonOptions.PloidyBed,
         CommonOptions.BAlleleSites,
         CommonOptions.IsDbSnpVcf,
         _somaticEnrichmentOptions.ControlBams,
         enrichmentManifest,
         null,
         cnvVcfLocation);
 }
Beispiel #30
0
        /// <summary>
        /// Collects the counts of autosomal bins, both as one flat list and grouped by GC content.
        /// Assumes the bins are sorted by genomic coordinates
        /// </summary>
        /// <param name="bins">Bins whose counts are to be collected</param>
        /// <param name="manifest">When not null, only the bins returned by EnrichmentUtilities.GetOnTargetBins for this manifest are considered; when null, all bins are considered.</param>
        /// <param name="countsByGC">An array of lists (numberOfGCbins entries). Each array element will hold a list of counts whose bins have the same GC content.</param>
        /// <param name="counts">Will hold all of the autosomal counts present in the considered bins</param>
        public static void GetCountsByGC(List<GenomicBin> bins, NexteraManifest manifest, out List<float>[] countsByGC, out List<float> counts)
        {
            countsByGC = new List<float>[numberOfGCbins];
            counts = new List<float>(bins.Count);

            // Start every GC slot with an empty list so it can be appended to unconditionally.
            for (int gc = 0; gc < countsByGC.Length; gc++)
            {
                countsByGC[gc] = new List<float>();
            }

            // Restrict to on-target bins only when a manifest was supplied.
            IEnumerable<GenomicBin> candidateBins;
            if (manifest == null)
            {
                candidateBins = bins;
            }
            else
            {
                candidateBins = EnrichmentUtilities.GetOnTargetBins(bins, manifest);
            }

            foreach (GenomicBin bin in candidateBins)
            {
                if (GenomeMetadata.SequenceMetadata.IsAutosome(bin.Chromosome))
                {
                    // Record the count under its GC value and in the overall list.
                    countsByGC[bin.GC].Add(bin.Count);
                    counts.Add(bin.Count);
                }
            }
        }
Beispiel #31
0
        /// <summary>
        /// Lazily yields the bins that overlap a manifest region on the same chromosome; all other
        /// bins are dropped. Bins on chromosomes without any manifest region are never yielded.
        /// </summary>
        /// <param name="bins">Bins to filter (0-based coordinates). The region cursor only moves forward
        /// within a chromosome, so the bins are presumably expected in coordinate order per chromosome.</param>
        /// <param name="manifest">Manifest providing the target regions (1-based coordinates), grouped by chromosome.</param>
        /// <returns>The on-target bins, in input order.</returns>
        public static IEnumerable<GenomicBin> GetOnTargetBins(IEnumerable<GenomicBin> bins, NexteraManifest manifest)
        {
            var regionsByChrom = manifest.GetManifestRegionsByChromosome();
            string activeChrom = null;
            List<NexteraManifest.ManifestRegion> chromRegions = null; // 1-based regions
            int cursor = -1;

            foreach (GenomicBin bin in bins) // 0-based bins
            {
                if (activeChrom != bin.Chromosome)
                {
                    // New chromosome: fetch its regions (null when it has none) and rewind the cursor.
                    activeChrom = bin.Chromosome;
                    if (regionsByChrom.TryGetValue(activeChrom, out chromRegions))
                    {
                        cursor = 0;
                    }
                }

                if (chromRegions == null) { continue; } // no targets on this chromosome

                // Skip regions that end before this bin starts.
                while (cursor < chromRegions.Count && chromRegions[cursor].End < bin.Start + 1)
                {
                    cursor++;
                }

                // The current region is the first one not ending before the bin; it overlaps
                // if it also starts no later than the bin's stop.
                if (cursor < chromRegions.Count && chromRegions[cursor].Start <= bin.Stop)
                {
                    yield return bin;
                }
            }
        }
        /// <summary>
        /// Assembles the CanvasCallset for this run from the common options and the somatic enrichment
        /// options (sample BAM, control BAMs and manifest). No somatic VCF is supplied (null is passed).
        /// </summary>
        /// <param name="logger">Logger whose Error method is handed to the manifest constructor.</param>
        /// <returns>The newly constructed callset; its output VCF is "CNV.vcf.gz" in the output directory.</returns>
        private CanvasCallset GetCallset(ILogger logger)
        {
            IFileLocation cnvVcfLocation = CommonOptions.OutputDirectory.GetFileLocation("CNV.vcf.gz");
            var enrichmentManifest = new NexteraManifest(_somaticEnrichmentOptions.Manifest.FullName, null, logger.Error);

            return new CanvasCallset(
                _somaticEnrichmentOptions.Bam,
                CommonOptions.SampleName,
                CommonOptions.WholeGenomeFasta,
                CommonOptions.OutputDirectory,
                CommonOptions.KmerFasta,
                CommonOptions.FilterBed,
                CommonOptions.PloidyBed,
                CommonOptions.BAlleleSites,
                CommonOptions.IsDbSnpVcf,
                _somaticEnrichmentOptions.ControlBams,
                enrichmentManifest,
                null,
                cnvVcfLocation);
        }
Beispiel #33
0
        /// <summary>
        /// Writes every manifest region as one tab-separated line: chromosome, start - 1, end.
        /// </summary>
        /// <param name="manifest">Manifest whose Regions are written.</param>
        /// <param name="writer">Destination; receives one line per region.</param>
        /// <param name="genome">Optional. When not null, a copy of the regions is sorted with the
        /// position of each sequence in genome.Sequences as chromosome order before writing; when
        /// null, the regions are written in the manifest's own order.</param>
        public static void WriteRegionBed(NexteraManifest manifest, BgzipOrStreamWriter writer, GenomeMetadata genome)
        {
            List<NexteraManifest.ManifestRegion> regionsToWrite = manifest.Regions;

            if (genome != null)
            {
                // Sort a copy so the manifest's own region list keeps its order.
                regionsToWrite = new List<NexteraManifest.ManifestRegion>(manifest.Regions);

                // Map each sequence name to its index in the genome.
                var chromosomeOrder = new Dictionary<string, int>();
                for (int i = 0; i < genome.Sequences.Count; i++)
                {
                    GenomeMetadata.SequenceMetadata sequenceInfo = genome.Sequences[i];
                    chromosomeOrder[sequenceInfo.Name] = i;
                }

                regionsToWrite.Sort((left, right) => left.CompareTo(right, chromosomeOrder));
            }

            foreach (NexteraManifest.ManifestRegion targetRegion in regionsToWrite)
            {
                writer.WriteLine(string.Format("{0}\t{1}\t{2}", targetRegion.Chromosome, targetRegion.Start - 1, targetRegion.End));
            }
        }
Beispiel #34
0
        /// <summary>
        /// Perform a simple GC normalization: every bin count is rescaled by
        /// (global median count) / (median count of the bins sharing its GC content).
        /// </summary>
        /// <param name="bins">Bins whose counts are to be normalized; counts are updated in place.</param>
        /// <param name="manifest">Forwarded to EnrichmentUtilities.GetCountsByGC when collecting the counts the medians are computed from.</param>
        static void NormalizeByGC(List<SampleGenomicBin> bins, NexteraManifest manifest = null)
        {
            // DebugPrintCountsByGC(bins, "CountsByGC-Before.txt");
            List<float>[] countsPerGC; // one list of counts per GC value
            List<float> allCounts;     // every count that was collected
            EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsPerGC, out allCounts);

            double globalMedian = Utilities.Median(allCounts);

            // Median count per GC value; sparsely populated GC values use a weighted median instead.
            double?[] gcMedians = new double?[countsPerGC.Length];
            for (int gc = 0; gc < countsPerGC.Length; gc++)
            {
                if (countsPerGC[gc].Count < defaultMinNumberOfBinsPerGC)
                {
                    List<Tuple<float, float>> weighted = GetWeightedCounts(countsPerGC, gc);
                    gcMedians[gc] = Utilities.WeightedMedian(weighted);
                }
                else
                {
                    gcMedians[gc] = Utilities.Median(countsPerGC[gc]);
                }
            }

            // Rescale each bin; bins whose GC median is missing or not positive keep their count.
            foreach (SampleGenomicBin bin in bins)
            {
                double? gcMedian = gcMedians[bin.GenomicBin.GC];
                if (!gcMedian.HasValue || !(gcMedian.Value > 0))
                {
                    continue;
                }

                bin.Count = (float)(globalMedian * (double)bin.Count / gcMedian.Value);
            }
            // DebugPrintCountsByGC(bins, "CountsByGC-After.txt");
        }
Beispiel #35
0
        /// <summary>
        /// Assembles the CanvasCallset for this run from the common options, the single-sample options
        /// and the somatic enrichment options. The control binned path and bin size are copied onto
        /// the manifest before it is handed over. No somatic VCF is supplied (null is passed).
        /// </summary>
        /// <param name="logger">Logger whose Error method is handed to the manifest constructor.</param>
        /// <returns>The newly constructed callset; its output VCF is "CNV.vcf.gz" in the output directory.</returns>
        private CanvasCallset GetCallset(ILogger logger)
        {
            var details = new AnalysisDetails(
                CommonOptions.OutputDirectory,
                CommonOptions.WholeGenomeFasta,
                CommonOptions.KmerFasta,
                CommonOptions.FilterBed,
                SingleSampleCommonOptions.PloidyVcf,
                null);
            IFileLocation cnvVcfLocation = CommonOptions.OutputDirectory.GetFileLocation("CNV.vcf.gz");
            var enrichmentManifest = new NexteraManifest(_somaticEnrichmentOptions.Manifest.FullName, null, logger.Error);

            // TODO: refactor and remove the following two lines
            enrichmentManifest.CanvasControlBinnedPath = _somaticEnrichmentOptions.ControlBinned?.FullName;
            enrichmentManifest.CanvasBinSize = _somaticEnrichmentOptions.ControlBinSize;

            return new CanvasCallset(
                _somaticEnrichmentOptions.Bam,
                SingleSampleCommonOptions.SampleName,
                SingleSampleCommonOptions.BAlleleSites,
                SingleSampleCommonOptions.IsDbSnpVcf,
                _somaticEnrichmentOptions.ControlBams,
                enrichmentManifest,
                null,
                cnvVcfLocation,
                details);
        }
Beispiel #36
0
        /// <summary>
        /// Runs normalization: builds the reference bin counts with the selected mode, then writes
        /// the bin ratios of the tumor against that reference.
        /// </summary>
        /// <param name="parameters">Input/output paths, the optional manifest path and the normalization mode.</param>
        /// <returns>Always 0; failures surface as exceptions.</returns>
        /// <exception cref="Exception">Thrown when the normalization mode is neither BestLR2 nor WeightedAverage.</exception>
        public static int Run(CanvasNormalizeParameters parameters)
        {
            // A manifest is only loaded when a manifest path was given.
            NexteraManifest nexteraManifest = null;
            if (!string.IsNullOrEmpty(parameters.manifestPath))
            {
                nexteraManifest = new NexteraManifest(parameters.manifestPath, null, Console.WriteLine);
            }

            var mode = parameters.normalizationMode;
            if (mode == CanvasNormalizeMode.BestLR2)
            {
                GetBestLR2BinCount(parameters.tumorBedPath, parameters.normalBedPaths, parameters.weightedAverageNormalBedPath,
                                   manifest: nexteraManifest);
            }
            else if (mode == CanvasNormalizeMode.WeightedAverage)
            {
                GetWeightedAverageBinCount(parameters.normalBedPaths, parameters.weightedAverageNormalBedPath, manifest: nexteraManifest);
            }
            else
            {
                throw new Exception(string.Format("Invalid CanvasNormalize mode '{0}'", mode));
            }

            GetBinRatio(parameters.tumorBedPath, parameters.weightedAverageNormalBedPath, parameters.outBedPath,
                        parameters.ploidyBedPath, manifest: nexteraManifest);

            return 0;
        }
Beispiel #37
0
 /// <summary>
 /// Stores the binned file path and the optional manifest, then immediately calls LoadBinCounts().
 /// </summary>
 /// <param name="binnedPath">Path stored in BinnedPath.</param>
 /// <param name="manifest">Optional manifest stored in Manifest; defaults to null.</param>
 public BinCounts(string binnedPath, NexteraManifest manifest = null)
 {
     // Both properties are set before loading starts.
     Manifest = manifest;
     BinnedPath = binnedPath;

     LoadBinCounts();
 }
Beispiel #38
0
        /// <summary>
        /// Writes <paramref name="manifest"/> to <paramref name="writer"/> as text: three '#' lines
        /// (manifest type, target region count, date), a [Header] section with the reference genome
        /// when one is set, and a [Regions] section. The [Regions] section holds a tab-separated
        /// column header line (only when the manifest has column names) followed by one
        /// tab-separated line per region.
        /// </summary>
        /// <param name="manifest">Manifest to write. Its Regions list may be null, in which case a
        /// target region count of 0 is reported and no region lines are written.</param>
        /// <param name="writer">Destination writer; it is neither flushed nor closed here.</param>
        public static void WriteNexteraManifests(NexteraManifest manifest, TextWriter writer)
        {
            // Regions may be null (the region loop further down allows for it), so the count
            // header must not dereference the list unconditionally.
            int regionCount = manifest.Regions == null ? 0 : manifest.Regions.Count;

            writer.WriteLine("#{0}: {1}", "Manifest Type", "Regions");
            writer.WriteLine("#{0}: {1}", "Target Region Count", regionCount);
            writer.WriteLine("#{0}: {1}", "Date", DateTime.Now.ToShortDateString());
            writer.WriteLine("[Header]");
            //writer.WriteLine("Manifest Version\t1.0");
            if (!string.IsNullOrEmpty(manifest.GenomeName))
            {
                writer.WriteLine("ReferenceGenome\t" + manifest.GenomeName);
            }
            writer.WriteLine("[Regions]");

            // Column headers in ColumnNumbers order; column numbers outside ColumnNames are ignored.
            List<string> headers = new List<string>();
            if (manifest.ColumnNames != null && manifest.ColumnNames.Length > 0)
            {
                foreach (int columnNumber in manifest.ColumnNumbers)
                {
                    if (columnNumber >= 0 && columnNumber < manifest.ColumnNames.Length)
                    {
                        headers.Add(manifest.ColumnNames[columnNumber]);
                    }
                }
                writer.WriteLine(string.Join("\t", headers.ToArray()));
            }

            if (manifest.Regions == null || manifest.Regions.Count == 0)
            {
                return;
            }

            foreach (NexteraManifest.ManifestRegion region in manifest.Regions)
            {
                NexteraManifest.ManifestRegion tmpRegion = new NexteraManifest.ManifestRegion(region);

                // One cell per recognized header. A header matching no case below contributes
                // no cell, so such a row ends up shorter than the header line.
                List<string> line = new List<string>();
                foreach (string header in headers)
                {
                    switch (header.ToLowerInvariant())
                    {
                        case "name":
                            line.Add(tmpRegion.Name);
                            break;
                        case "chromosome":
                            line.Add(tmpRegion.Chromosome);
                            break;
                        case "start":
                        case "amplicon start":
                            line.Add(tmpRegion.Start.ToString());
                            break;
                        case "end":
                        case "amplicon end":
                            line.Add(tmpRegion.End.ToString());
                            break;
                        case "startprobelength":
                        case "upstream probe length":
                            line.Add(tmpRegion.StartProbeLength.ToString());
                            break;
                        case "endprobelength":
                        case "downstream probe length":
                            line.Add(tmpRegion.EndProbeLength.ToString());
                            break;
                        case "groupname":
                        case "group name":
                        case "group":
                        case "ip group":
                            line.Add(tmpRegion.GroupName);
                            break;
                    }
                }
                writer.WriteLine(string.Join("\t", line.ToArray()));
            }
        }
Beispiel #39
0
        /// <summary>
        /// Returns a copy of <paramref name="manifest"/> whose regions carry the start and end
        /// positions from <paramref name="regionStats"/>. Regions without a statistics entry of the
        /// same name are dropped from the copy. The input manifest is not modified here.
        /// </summary>
        /// <param name="manifest">Manifest to copy.</param>
        /// <param name="regionStats">Statistics keyed by RegionName; a duplicate name makes Dictionary.Add throw.</param>
        /// <returns>The copied manifest. When the copy has no regions (null or empty) it is returned as-is.</returns>
        public static NexteraManifest GetUpdatedNexteraManifestsWithNewRegions(NexteraManifest manifest, List<RegionStatistics> regionStats)
        {
            NexteraManifest updatedManifest = new NexteraManifest(manifest);

            // Index the statistics by region name.
            var statsByRegionName = new Dictionary<string, RegionStatistics>();
            foreach (RegionStatistics stat in regionStats)
            {
                statsByRegionName.Add(stat.RegionName, stat);
            }

            // Nothing to update when the manifest has no regions.
            if (updatedManifest.Regions == null || updatedManifest.Regions.Count <= 0)
            {
                return updatedManifest;
            }

            var retainedRegions = new List<NexteraManifest.ManifestRegion>();
            foreach (NexteraManifest.ManifestRegion region in updatedManifest.Regions)
            {
                NexteraManifest.ManifestRegion regionCopy = new NexteraManifest.ManifestRegion(region);

                RegionStatistics match;
                if (!statsByRegionName.TryGetValue(region.Name, out match))
                {
                    continue; // no statistics for this region: drop it
                }

                regionCopy.Start = match.StartPosition;
                regionCopy.End = match.EndPosition;
                retainedRegions.Add(regionCopy);
            }
            updatedManifest.Regions = retainedRegions;

            return updatedManifest;
        }
Beispiel #40
0
        /// <summary>
        /// Writes, for each bin, the tumor/normal count ratio scaled by the reference ploidy factor and
        /// by a library size factor (normal on-target median / tumor on-target median, or 1 when either
        /// median is not positive). The two input files are read in lockstep, line by line; each line is
        /// tab-separated with chromosome, start, end and count in columns 0-3. Output lines are the
        /// normal file's lines with column 3 replaced by the ratio.
        /// </summary>
        /// <param name="tumorBinnedPath">Binned counts of the tumor sample.</param>
        /// <param name="normalBinnedPath">Binned counts of the normal reference; bins with a count below 1 are skipped.</param>
        /// <param name="ratioBinnedPath">Output path for the ratios.</param>
        /// <param name="ploidyBedPath">Optional ploidy BED file; null or empty means no reference ploidy is loaded.</param>
        /// <param name="manifest">Optional manifest passed to BinCounts when computing the on-target medians.</param>
        /// <exception cref="System.IO.InvalidDataException">Thrown when the tumor file has fewer lines than the normal file.</exception>
        private static void GetBinRatio(string tumorBinnedPath, string normalBinnedPath, string ratioBinnedPath,
            string ploidyBedPath, NexteraManifest manifest = null)
        {
            PloidyInfo referencePloidy = String.IsNullOrEmpty(ploidyBedPath) ? null : PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);
            double tumorMedian = (new BinCounts(tumorBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
            double normalMedian = (new BinCounts(normalBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
            double librarySizeFactor = (tumorMedian > 0 && normalMedian > 0) ? normalMedian / tumorMedian : 1;

            using (GzipReader tumorReader = new GzipReader(tumorBinnedPath))
            using (GzipReader normalReader = new GzipReader(normalBinnedPath))
            using (GzipWriter writer = new GzipWriter(ratioBinnedPath))
            {
                string normalLine;
                while ((normalLine = normalReader.ReadLine()) != null)
                {
                    // The files are consumed in parallel; a shorter tumor file would otherwise
                    // surface as a NullReferenceException on the Split below.
                    string tumorLine = tumorReader.ReadLine();
                    if (tumorLine == null)
                    {
                        throw new System.IO.InvalidDataException(String.Format(
                            "Tumor binned file '{0}' has fewer lines than normal binned file '{1}'.",
                            tumorBinnedPath, normalBinnedPath));
                    }

                    string[] normalToks = normalLine.Split('\t');
                    string[] tumorToks = tumorLine.Split('\t');
                    double normalCount = double.Parse(normalToks[3]);
                    double tumorCount = double.Parse(tumorToks[3]);
                    // The weighted average count of a bin could be less than 1.
                    // Using these small counts for coverage normalization creates large ratios.
                    // It would be better to just drop these bins so we don't introduce too much noise into segmentation and CNV calling.
                    if (normalCount < 1) { continue; } // skip the bin
                    string chrom = normalToks[0];
                    int start = int.Parse(normalToks[1]);
                    int end = int.Parse(normalToks[2]);
                    // Scale by the reference ploidy of the bin relative to diploid (2).
                    double factor = CanvasDiploidBinRatioFactor * GetPloidy(referencePloidy, chrom, start, end) / 2.0;
                    double ratio = tumorCount / normalCount * factor * librarySizeFactor;
                    normalToks[3] = String.Format("{0}", ratio);
                    writer.WriteLine(String.Join("\t", normalToks));
                }
            }
        }
Beispiel #41
0
        /// <summary>
        /// Calculates how many possible alignments corresponds to the desired number of observed alignments per bin.
        /// For every autosome the rate (observed alignments / possible alignments) is computed, restricted
        /// to the manifest regions when a manifest is given; the result is derived from the median rate.
        /// </summary>
        /// <param name="countsPerBin">Desired number of observed alignments per bin.</param>
        /// <param name="possibleAlignments">BitArrays of possible alignments (unique mers).</param>
        /// <param name="observedAlignments">BitArrays storing the observed alignments.</param>
        /// <param name="manifest">Optional. When given, chromosomes without manifest regions are skipped and counting is limited to the chromosome's regions.</param>
        /// <returns><paramref name="countsPerBin"/> divided by the median per-chromosome rate, converted to int.</returns>
        static int CalculateNumberOfPossibleAlignmentsPerBin(int countsPerBin, Dictionary<string, BitArray> possibleAlignments,
            Dictionary<string, HitArray> observedAlignments, NexteraManifest manifest = null)
        {
            Dictionary<string, List<NexteraManifest.ManifestRegion>> regionsByChrom =
                manifest == null ? null : manifest.GetManifestRegionsByChromosome();

            List<double> chromosomeRates = new List<double>();
            List<ThreadStart> jobs = new List<ThreadStart>();

            foreach (string chr in possibleAlignments.Keys)
            {
                // We don't want to include the sex chromosomes because they may not be copy number 2
                if (!GenomeMetadata.SequenceMetadata.IsAutosome(chr)) { continue; }

                HitArray observed = observedAlignments[chr];
                BitArray possible = possibleAlignments[chr];

                // With a manifest, only chromosomes that have target regions take part.
                List<NexteraManifest.ManifestRegion> regions = null;
                if (regionsByChrom != null && !regionsByChrom.TryGetValue(chr, out regions)) { continue; }

                // One job per chromosome; the shared rate list is guarded by a lock.
                jobs.Add(() =>
                {
                    int numberObserved = observed.CountSetBits(regions);
                    int numberPossible = CountSetBits(possible, regions);
                    double rate = numberObserved / (double)numberPossible;

                    lock (chromosomeRates)
                    {
                        chromosomeRates.Add(rate);
                    }
                });
            }

            Console.WriteLine("Launch CalculateNumberOfPossibleAlignmentsPerBin jobs...");
            Console.Out.WriteLine();
            //Parallel.ForEach(tasks, t => { t.Invoke(); }); //todo allow controling degree of parallelism
            Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(jobs);
            Console.WriteLine("CalculateNumberOfPossibleAlignmentsPerBin jobs complete.");
            Console.Out.WriteLine();

            double medianRate = CanvasCommon.Utilities.Median(chromosomeRates);
            return (int)(countsPerBin / medianRate);
        }
Beispiel #42
0
        /// <summary>
        /// Perform variance stabilization by GC bins.
        /// </summary>
        /// <param name="bins">Bins whose counts are to be normalized.</param>
        static bool NormalizeVarianceByGC(List<GenomicBin> bins, NexteraManifest manifest = null)
        {
            // DebugPrintCountsByGC(bins, "CountsByGCVariance-Before.txt");
            // An array of lists. Each array element (0-100) will hold a list of counts whose bins have the same GC content.
            List<float>[] countsByGC;
            // Will hold all of the autosomal counts present in 'bins'
            List<float> counts;
            EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsByGC, out counts);

            // Estimate quartiles of all bins genomewide
            var globalQuartiles = Utilities.Quartiles(counts);
            // Will hold interquartile range (IQR) separately for each GC bin
            List<float> localIQR = new List<float>(countsByGC.Length);
            // Will hold quartiles separately for each GC bin
            List<Tuple<float, float, float>> localQuartiles = new List<Tuple<float, float, float>>(countsByGC.Length);

            // calculate interquartile range (IQR) for GC bins and populate localQuartiles list
            for (int i = 0; i < countsByGC.Length; i++)
            {
                if (countsByGC[i].Count == 0)
                {
                    localIQR.Add(-1f);
                    localQuartiles.Add(new Tuple<float, float, float>(-1f, -1f, -1f));
                }
                else if (countsByGC[i].Count >= defaultMinNumberOfBinsPerGC)
                {
                    localQuartiles.Add(Utilities.Quartiles(countsByGC[i]));
                    localIQR.Add(localQuartiles[i].Item3 - localQuartiles[i].Item1);
                }
                else
                {
                    List<Tuple<float, float>> weightedCounts = GetWeightedCounts(countsByGC, i);
                    double[] quartiles = Utilities.WeightedQuantiles(weightedCounts, new List<float>() { 0.25f, 0.5f, 0.75f });
                    localQuartiles.Add(new Tuple<float, float, float>((float)quartiles[0], (float)quartiles[1], (float)quartiles[2]));
                    localIQR.Add((float)(quartiles[2] - quartiles[0]));
                }
            }

            // Identify if particular GC bins have IQR twice as large as IQR genomewide
            float globalIQR = globalQuartiles.Item3 - globalQuartiles.Item1;
            // Holder for GC bins with large IQR (compared to genomewide IQR)
            int significantIQRcounter = 0;
            for (int i = 10; i < 90; i++)
            {
                if (globalIQR < localIQR[i] * 2f)
                    significantIQRcounter++;
            }

            if (significantIQRcounter <= 0)
                return false;

            // Divide each count by the median count of bins with the same GC content
            foreach (GenomicBin bin in bins)
            {
                var scaledLocalIqr = localIQR[bin.GC] * 0.8f;
                if (globalIQR >= scaledLocalIqr) continue;

                // ratio of GC bins and global IQRs
                float iqrRatio = scaledLocalIqr / globalIQR;
                var medianGCCount = localQuartiles[bin.GC].Item2;
                bin.Count = medianGCCount + (bin.Count - medianGCCount) / iqrRatio;
            }

            // DebugPrintCountsByGC(bins, "CountsByGCVariance-After.txt");
            return true;
        }
Beispiel #43
0
 /// <summary>
 /// Writes the manifest to the file at <paramref name="path"/> by delegating to the
 /// TextWriter overload; the file writer is disposed when writing finishes or fails.
 /// </summary>
 /// <param name="manifest">Manifest to write.</param>
 /// <param name="path">Path of the output file, opened with a StreamWriter.</param>
 public static void WriteNexteraManifests(NexteraManifest manifest, string path)
 {
     using (StreamWriter fileWriter = new StreamWriter(path))
     {
         WriteNexteraManifests(manifest, fileWriter);
     }
 }
        /// <summary>
        /// Lazily yields the bins that overlap a manifest region on the same chromosome; all other
        /// bins are dropped. Bins on chromosomes without any manifest region are never yielded.
        /// </summary>
        /// <param name="bins">Bins to filter (0-based coordinates). The region cursor only moves forward
        /// within a chromosome, so the bins are presumably expected in coordinate order per chromosome.</param>
        /// <param name="manifest">Manifest providing the target regions (1-based coordinates), grouped by chromosome.</param>
        /// <returns>The on-target bins, in input order.</returns>
        public static IEnumerable<SampleGenomicBin> GetOnTargetBins(IEnumerable<SampleGenomicBin> bins, NexteraManifest manifest)
        {
            var regionsByChrom = manifest.GetManifestRegionsByChromosome();
            string activeChrom = null;
            List<NexteraManifest.ManifestRegion> chromRegions = null; // 1-based regions
            int cursor = -1;

            foreach (SampleGenomicBin bin in bins) // 0-based bins
            {
                if (activeChrom != bin.GenomicBin.Chromosome)
                {
                    // New chromosome: fetch its regions (null when it has none) and rewind the cursor.
                    activeChrom = bin.GenomicBin.Chromosome;
                    if (regionsByChrom.TryGetValue(activeChrom, out chromRegions))
                    {
                        cursor = 0;
                    }
                }

                if (chromRegions == null) { continue; } // no targets on this chromosome

                // Skip regions that end before this bin starts.
                while (cursor < chromRegions.Count && chromRegions[cursor].End < bin.Start + 1)
                {
                    cursor++;
                }

                // The current region is the first one not ending before the bin; it overlaps
                // if it also starts no later than the bin's stop.
                if (cursor < chromRegions.Count && chromRegions[cursor].Start <= bin.Stop)
                {
                    yield return bin;
                }
            }
        }
Beispiel #45
0
        /// <summary>
        /// Perform a simple GC normalization: every bin count is rescaled by
        /// (global median count) / (median count of the bins sharing its GC content).
        /// </summary>
        /// <param name="bins">Bins whose counts are to be normalized; counts are updated in place.</param>
        /// <param name="manifest">Forwarded to EnrichmentUtilities.GetCountsByGC when collecting the counts the medians are computed from.</param>
        static void NormalizeByGC(List<GenomicBin> bins, NexteraManifest manifest = null)
        {
            // DebugPrintCountsByGC(bins, "CountsByGC-Before.txt");
            List<float>[] countsPerGC; // one list of counts per GC value
            List<float> allCounts;     // every count that was collected
            EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsPerGC, out allCounts);

            double globalMedian = Utilities.Median(allCounts);

            // Median count per GC value; sparsely populated GC values use a weighted median instead.
            double?[] gcMedians = new double?[countsPerGC.Length];
            for (int gc = 0; gc < countsPerGC.Length; gc++)
            {
                if (countsPerGC[gc].Count < defaultMinNumberOfBinsPerGC)
                {
                    List<Tuple<float, float>> weighted = GetWeightedCounts(countsPerGC, gc);
                    gcMedians[gc] = Utilities.WeightedMedian(weighted);
                }
                else
                {
                    gcMedians[gc] = Utilities.Median(countsPerGC[gc]);
                }
            }

            // Rescale each bin; bins whose GC median is missing or not positive keep their count.
            foreach (GenomicBin bin in bins)
            {
                double? gcMedian = gcMedians[bin.GC];
                if (!gcMedian.HasValue || !(gcMedian.Value > 0))
                {
                    continue;
                }

                bin.Count = (float)(globalMedian * (double)bin.Count / gcMedian.Value);
            }
            // DebugPrintCountsByGC(bins, "CountsByGC-After.txt");
        }
Beispiel #46
0
        /// <summary>
        /// Entry point of CanvasClean. Reads binned counts from the input file, optionally removes outlier
        /// and oversized bins, optionally applies GC normalization and FFPE-specific filtering, and writes
        /// the remaining bins to the output file.
        /// </summary>
        /// <param name="args">Command-line arguments; the supported switches are listed in the option table below.</param>
        /// <returns>
        /// 0 on success or when help was explicitly requested; 1 when a required argument is missing
        /// or the input file does not exist.
        /// </returns>
        static int Main(string[] args)
        {
            Utilities.LogCommandLine(args);
            string inFile           = null;
            string outFile          = null;
            bool   doGCnorm         = false;
            bool   doSizeFilter     = false;
            bool   doOutlierRemoval = false;
            string ffpeOutliersFile = null;
            string manifestFile     = null;
            CanvasGCNormalizationMode gcNormalizationMode = CanvasGCNormalizationMode.MedianByGC;
            string modeDescription = String.Format("gc normalization mode. Available modes: {0}. Default: {1}",
                                                   String.Join(", ", Enum.GetValues(typeof(CanvasGCNormalizationMode)).Cast<CanvasGCNormalizationMode>()),
                                                   gcNormalizationMode);
            bool needHelp = false;

            OptionSet p = new OptionSet()
            {
                { "i|infile=", "input file - usually generated by CanvasBin", v => inFile = v },
                { "o|outfile=", "text file to output containing cleaned bins", v => outFile = v },
                { "g|gcnorm", "perform GC normalization", v => doGCnorm = v != null },
                { "s|filtsize", "filter out genomically large bins", v => doSizeFilter = v != null },
                { "r|outliers", "filter outlier points", v => doOutlierRemoval = v != null },
                { "f|ffpeoutliers=", "filter regions of FFPE biases", v => ffpeOutliersFile = v },
                { "t|manifest=", "Nextera manifest file", v => manifestFile = v },
                { "w|weightedmedian=", "Minimum number of bins per GC required to calculate weighted median", v => minNumberOfBinsPerGCForWeightedMedian = int.Parse(v) },
                { "m|mode=", modeDescription, v => gcNormalizationMode = Utilities.ParseCanvasGCNormalizationMode(v) },
                { "h|help", "show this message and exit", v => needHelp = v != null },
            };

            // Unrecognized arguments returned by Parse are intentionally ignored.
            p.Parse(args);

            if (needHelp)
            {
                ShowHelp(p);
                return 0;
            }

            // Input and output paths are mandatory. A missing one is a usage error, so report failure
            // to the caller instead of exiting with a success code.
            if (inFile == null || outFile == null)
            {
                ShowHelp(p);
                return 1;
            }

            // Does the input file exist?
            if (!File.Exists(inFile))
            {
                Console.WriteLine("CanvasClean.exe: File {0} does not exist! Exiting.", inFile);
                return 1;
            }

            List<SampleGenomicBin> bins = CanvasIO.ReadFromTextFile(inFile);

            if (doOutlierRemoval)
            {
                bins = RemoveOutliers(bins);
            }

            if (doSizeFilter)
            {
                bins = RemoveBigBins(bins);
            }

            // Do not run FFPE outlier removal on targeted/low coverage data.
            if (ffpeOutliersFile != null && bins.Count < 50000)
            {
                ffpeOutliersFile = null;
            }

            // Estimate the local SD metric before any normalization; it is written to a text file here
            // and reused by the FFPE outlier removal at the end.
            double localSD = -1.0;

            if (ffpeOutliersFile != null)
            {
                localSD = getLocalStandardDeviation(bins);
                CanvasIO.WriteLocalSDToTextFile(ffpeOutliersFile, localSD);
            }

            if (doGCnorm)
            {
                NexteraManifest manifest = manifestFile == null ? null : new NexteraManifest(manifestFile, null, Console.WriteLine);
                // Only the median-by-GC mode strips bins with rare GC values first.
                List<SampleGenomicBin> strippedBins = gcNormalizationMode == CanvasGCNormalizationMode.MedianByGC
                    ? RemoveBinsWithExtremeGC(bins, defaultMinNumberOfBinsPerGC, manifest: manifest)
                    : bins;
                if (strippedBins.Count == 0)
                {
                    Console.Error.WriteLine("Warning in CanvasClean: Coverage too low to perform GC correction; proceeding without GC correction");
                }
                else
                {
                    bins = strippedBins;
                    NormalizeByGC(bins, manifest, gcNormalizationMode);
                    // Use variance normalization only on large exome panels and whole genome sequencing.
                    // The threshold is set to 10% of an average number of bins on CanvasClean data.
                    if (ffpeOutliersFile != null && bins.Count > 500000)
                    {
                        bool varianceWasNormalized = NormalizeVarianceByGC(bins, manifest: manifest);
                        // If variance normalization ran, re-center the counts by running NormalizeByGC again.
                        if (varianceWasNormalized)
                        {
                            NormalizeByGC(bins, manifest, gcNormalizationMode);
                        }
                    }
                }
            }

            if (ffpeOutliersFile != null)
            {
                // Threshold 20 is derived to separate FF and noisy FFPE samples (derived from a training set of approx. 40 samples).
                bins = RemoveBinsWithExtremeLocalSD(bins, localSD, 20, outFile);
            }

            CanvasIO.WriteToTextFile(outFile, bins);
            return 0;
        }
Beispiel #47
0
        /// <summary>
        /// Bin alignments for every chromosome of the reference and return the bins of all chromosomes
        /// concatenated in the order the chromosomes appear in the reference fasta.
        /// When <paramref name="coverageMode"/> is GCContentWeighted, the GC content of the forward read is
        /// first computed for every position and an observed-vs-expected GC correction is derived from it.
        /// </summary>
        /// <param name="referenceFile">Reference fasta file.</param>
        /// <param name="binSize">Desired number of alignments per bin.</param>
        /// <param name="coverageMode">Coverage mode; GCContentWeighted enables the per-position GC computation.</param>
        /// <param name="manifest">Manifest forwarded to ComputeObservedVsExpectedGC.</param>
        /// <param name="possibleAlignments">BitArrays of possible alignments, keyed by chromosome. Chromosomes without an entry are skipped.</param>
        /// <param name="observedAlignments">HitArrays of observed alignments, keyed by chromosome.</param>
        /// <param name="fragmentLengths">Per-chromosome fragment lengths indexed by position; only read in GCContentWeighted mode.</param>
        /// <param name="predefinedBins">Pre-defined bins. null if not available. When supplied, chromosomes without an entry are skipped.</param>
        /// <param name="outFile">Output path forwarded to ComputeObservedVsExpectedGC.</param>
        /// <returns>A list of bins.</returns>
        static List<GenomicBin> BinCounts(string referenceFile, int binSize, CanvasCoverageMode coverageMode, NexteraManifest manifest,
            Dictionary<string, BitArray> possibleAlignments,
            Dictionary<string, HitArray> observedAlignments,
            Dictionary<string, Int16[]> fragmentLengths,
            Dictionary<string, List<GenomicBin>> predefinedBins,
            string outFile)
        {
            bool debugGCCorrection = false; // forwarded to ComputeObservedVsExpectedGC as its debug flag
            Dictionary<string, GenericRead> fastaEntries = new Dictionary<string, GenericRead>();
            List<string> chromosomes = new List<string>();
            Int16 meanFragmentSize = 0;
            Int16 meanFragmentCutoff = 3;
            // The mean fragment size is only needed for GC-content-weighted coverage.
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                meanFragmentSize = MeanFragmentSize(fragmentLengths);

            using (FastaReader reader = new FastaReader(referenceFile))
            {
                GenericRead fastaEntry = new GenericRead();

                // Loop through each chromosome in the reference, remembering the order of appearance.
                while (reader.GetNextEntry(ref fastaEntry))
                {
                    chromosomes.Add(fastaEntry.Name);
                    fastaEntries[fastaEntry.Name] = fastaEntry;
                    // The entry just read is now held by fastaEntries; use a fresh object for the next read.
                    fastaEntry = new GenericRead();
                }
            }

            // Calculate GC content of the forward read at every position along the genome.
            Dictionary<string, byte[]> readGCContent = new Dictionary<string, byte[]>();
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                // Upper bound applied to every stored GC percentage.
                byte gcCap = (byte)numberOfGCbins;
                List<ThreadStart> normalizationTasks = new List<ThreadStart>();
                foreach (KeyValuePair<string, Int16[]> fragmentLengthsKVP in fragmentLengths)
                {
                    string chr = fragmentLengthsKVP.Key;
                    GenericRead fastaEntry = fastaEntries[chr];

                    // One task per chromosome; each task fills its own array and publishes it under a lock.
                    normalizationTasks.Add(new ThreadStart(() =>
                    {
                    // Holds the GC content of the forward read at every position of the current chromosome.
                    byte[] gcContent = new byte[fastaEntry.Bases.Length];

                        int gcCounter = 0;

                    // Iteratively calculate GC content of "reads" using the fasta genome reference.
                    // The last meanFragmentSize * meanFragmentCutoff + 1 positions are left at 0.
                    for (int pos = 0; pos < fastaEntry.Bases.Length - meanFragmentSize * meanFragmentCutoff - 1; pos++)
                        {
                            Int16 currentFragment = 0;

                            // Positions without a recorded fragment length fall back to the mean fragment size;
                            // recorded lengths are capped at meanFragmentSize * meanFragmentCutoff.
                            if (fragmentLengthsKVP.Value[pos] == 0)
                                currentFragment = meanFragmentSize;
                            else
                                currentFragment = Convert.ToInt16(Math.Min(fragmentLengthsKVP.Value[pos], meanFragmentSize * meanFragmentCutoff));
                            // Count G/C bases (either case) within the fragment starting at pos.
                            for (int i = pos; i < pos + currentFragment; i++)
                            {
                                switch (fastaEntry.Bases[i])
                                {
                                    case 'C':
                                    case 'c':
                                    case 'G':
                                    case 'g':
                                        gcCounter++;
                                        break;
                                    default:
                                        break;
                                }
                            }
                            // NOTE(review): gcCounter is only ever incremented and reset to 0, so this guard looks unreachable.
                            if (gcCounter < 0)
                                gcCounter = 0;
                            // NOTE(review): integer division; this throws if currentFragment is 0
                            // (e.g. when meanFragmentSize is 0) - confirm that cannot happen.
                            gcContent[pos] = (byte)Math.Min(100 * gcCounter / currentFragment, gcCap);
                            gcCounter = 0;
                        }
                        // readGCContent is shared by all tasks, so writes to it are serialized.
                        lock (readGCContent)
                        {
                            readGCContent[chr] = gcContent;
                        }
                    }));
                }

                Console.WriteLine("{0} Launching normalization tasks.", DateTime.Now);
                Console.Out.Flush();
                //Parallel.ForEach(normalizationTasks, t => { t.Invoke(); });
                Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(normalizationTasks);
                Console.WriteLine("{0} Normalization tasks complete.", DateTime.Now);
                Console.Out.Flush();
            }

            // Populate the observed vs expected read GC correction; stays empty in other coverage modes.
            float[] observedVsExpectedGC = new float[0];
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                observedVsExpectedGC = ComputeObservedVsExpectedGC(observedAlignments, readGCContent, manifest, debugGCCorrection, outFile);

            // Queue one binning task per chromosome that has possible alignments
            // (and predefined bins, when those are supplied).
            Dictionary<string, List<GenomicBin>> perChromosomeBins = new Dictionary<string, List<GenomicBin>>();
            List<ThreadStart> binningTasks = new List<ThreadStart>();
            foreach (KeyValuePair<string, GenericRead> fastaEntryKVP in fastaEntries)
            {
                string chr = fastaEntryKVP.Key;
                if (!possibleAlignments.ContainsKey(chr)) continue;
                if (predefinedBins != null && !predefinedBins.ContainsKey(chr)) continue;

                BinTaskArguments args = new BinTaskArguments();
                args.FastaEntry = fastaEntryKVP.Value;
                args.Chromosome = chr;
                args.PossibleAlignments = possibleAlignments[chr];
                args.ObservedAlignments = observedAlignments[chr];
                args.CoverageMode = coverageMode;
                // The per-chromosome list is registered here, before the tasks run, and handed to the task via args.Bins.
                perChromosomeBins[chr] = predefinedBins == null ? new List<GenomicBin>() : predefinedBins[chr];
                args.Bins = perChromosomeBins[chr];
                args.BinSize = binSize;
                if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                    args.ReadGCContent = readGCContent[chr];
                else
                    args.ReadGCContent = null;
                args.ObservedVsExpectedGC = observedVsExpectedGC;
                binningTasks.Add(new ThreadStart(() => { BinCountsForChromosome(args); }));
            }
            Console.WriteLine("{0} Launch BinCountsForChromosome jobs...", DateTime.Now);
            // NOTE(review): this writes an empty line, whereas the GC section above calls Flush() at the
            // equivalent point - confirm which is intended.
            Console.Out.WriteLine();
            //Parallel.ForEach(binningTasks, t => { t.Invoke(); });
            Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(binningTasks);
            Console.WriteLine("{0} Completed BinCountsForChromosome jobs.", DateTime.Now);
            Console.Out.WriteLine();

            // Concatenate the per-chromosome bins in reference order.
            List<GenomicBin> finalBins = new List<GenomicBin>();
            foreach (string chr in chromosomes)
            {
                if (!perChromosomeBins.ContainsKey(chr)) continue;
                finalBins.AddRange(perChromosomeBins[chr]);
            }
            return finalBins;
        }
Beispiel #48
0
        /// <summary>
        /// Read the counts of all bins from a tab-delimited binned file and record the indices of the bins
        /// that overlap a manifest region on the same chromosome.
        /// </summary>
        /// <param name="binnedPath">Path of the binned file; the first four columns are chromosome, start, stop and count.</param>
        /// <param name="manifest">Manifest supplying the targeted regions, grouped by chromosome.</param>
        /// <param name="binCounts">Receives the count of every bin, in file order.</param>
        /// <param name="onTargetIndices">Receives the zero-based indices (into <paramref name="binCounts"/>) of on-target bins.</param>
        private static void LoadBinCounts(string binnedPath, NexteraManifest manifest, out List<double> binCounts,
            out List<int> onTargetIndices)
        {
            binCounts = new List<double>();
            onTargetIndices = new List<int>();

            var targetsByChromosome = manifest.GetManifestRegionsByChromosome();
            string previousChromosome = null;
            List<NexteraManifest.ManifestRegion> targets = null; // manifest regions use 1-based coordinates
            int targetCursor = -1;

            using (GzipReader reader = new GzipReader(binnedPath))
            {
                int binIndex = 0;
                for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                    string[] fields = line.Split('\t');
                    string chromosome = fields[0];
                    int binStart = int.Parse(fields[1]); // 0-based, inclusive
                    int binStop = int.Parse(fields[2]);  // 0-based, exclusive

                    // On a chromosome change, switch to that chromosome's regions (null if it has none)
                    // and restart the region cursor.
                    if (previousChromosome != chromosome)
                    {
                        previousChromosome = chromosome;
                        if (targetsByChromosome.TryGetValue(chromosome, out targets))
                        {
                            targetCursor = 0;
                        }
                    }

                    bool overlapsTarget = false;
                    if (targets != null)
                    {
                        // Advance past regions that end before the bin's first base (bin start converted to 1-based).
                        while (targetCursor < targets.Count && targets[targetCursor].End < binStart + 1)
                        {
                            targetCursor++;
                        }

                        // The current region ends at or after the bin start; it overlaps if it also starts
                        // at or before the bin's last base.
                        overlapsTarget = targetCursor < targets.Count && targets[targetCursor].Start <= binStop;
                    }

                    if (overlapsTarget)
                    {
                        onTargetIndices.Add(binIndex);
                    }

                    binCounts.Add(double.Parse(fields[3]));
                    binIndex++;
                }
            }
        }
Beispiel #49
0
 /// <summary>
 /// Create a calculator bound to the given manifest.
 /// </summary>
 /// <param name="manifest">Manifest stored in the <c>_manifest</c> field for later use.</param>
 public LSNormRatioCalculator(NexteraManifest manifest)
 {
     this._manifest = manifest;
 }
Beispiel #50
0
        /// <summary>
        /// Pick the normal control that has the smallest mean squared log-ratio (LR2) against the tumor
        /// sample and copy its binned file to <paramref name="bestBinnedPath"/>.
        /// With a single normal sample that sample is used without any comparison. If no normal sample
        /// yields a score below positive infinity (e.g. all scores are NaN), the first one is used.
        /// </summary>
        /// <param name="tumorBinnedPath">Binned counts of the tumor sample.</param>
        /// <param name="normalBinnedPaths">Binned counts of the candidate normal samples; must contain at least one path.</param>
        /// <param name="bestBinnedPath">Destination of the copy; an existing file is replaced. Nothing is copied if the chosen source file does not exist.</param>
        /// <param name="manifest">Optional manifest forwarded to BinCounts when loading on-target counts.</param>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="normalBinnedPaths"/> is empty.</exception>
        private static void GetBestLR2BinCount(string tumorBinnedPath, IEnumerable<string> normalBinnedPaths, string bestBinnedPath,
                                               NexteraManifest manifest = null)
        {
            // Materialize once: the paths are indexed repeatedly and the sequence may be a lazy query.
            List<string> normalPaths = normalBinnedPaths.ToList();

            // Default to the first normal so that a usable index always exists, even when no score
            // compares below the initial minimum (NaN and +Infinity never do).
            int bestNormalSampleIndex = 0;

            if (normalPaths.Count > 1) // find the best normal
            {
                List<double[]> binCountsByNormalSample = new List<double[]>(normalPaths.Count);
                foreach (string normalBinnedPath in normalPaths)
                {
                    binCountsByNormalSample.Add(LoadMedianScaledOnTargetCounts(normalBinnedPath, manifest));
                }
                double[] tumorBinCounts = LoadMedianScaledOnTargetCounts(tumorBinnedPath, manifest);

                double minMeanSquaredLogRatios = double.PositiveInfinity;
                for (int normalSampleIndex = 0; normalSampleIndex < normalPaths.Count; normalSampleIndex++)
                {
                    // Item1 is the mean squared log ratio; Item2 (the number of ignored bins) is not used yet.
                    // TODO: Skip a (bad) normal sample if too many bins were ignored.
                    //       Donavan's script skips a normal sample if more than 100 log ratios is NA.
                    //       The cut-off is likely panel-dependent.
                    double meanSquaredLogRatios =
                        GetMeanSquaredLogRatios(tumorBinCounts, binCountsByNormalSample[normalSampleIndex]).Item1;
                    if (meanSquaredLogRatios < minMeanSquaredLogRatios)
                    {
                        minMeanSquaredLogRatios = meanSquaredLogRatios;
                        bestNormalSampleIndex = normalSampleIndex;
                    }
                }
            }

            // Copy the chosen file, replacing any previous result.
            string srcBinnedPath = normalPaths[bestNormalSampleIndex];

            if (File.Exists(srcBinnedPath))
            {
                if (File.Exists(bestBinnedPath))
                {
                    File.Delete(bestBinnedPath);
                }
                File.Copy(srcBinnedPath, bestBinnedPath);
            }
        }

        /// <summary>
        /// Load the on-target bin counts of a binned file and divide them by the on-target median count.
        /// </summary>
        /// <remarks>
        /// If a manifest is available, the median is taken over bins overlapping the targeted regions only.
        /// For small panels there could be many bins with zero count, so a median over all bins could be 0;
        /// a non-positive median yields a weight of 0 instead of a division by zero.
        /// </remarks>
        private static double[] LoadMedianScaledOnTargetCounts(string binnedPath, NexteraManifest manifest)
        {
            var binCounts = new BinCounts(binnedPath, manifest: manifest);
            List<double> counts = binCounts.OnTargetCounts;
            double median = binCounts.OnTargetMedianBinCount;
            double weight = median > 0 ? 1.0 / median : 0;
            return counts.Select(cnt => cnt * weight).ToArray();
        }
Beispiel #51
0
 /// <summary>
 /// Run the GC normalization implementation selected by <paramref name="mode"/>.
 /// </summary>
 /// <param name="bins">Bins whose counts are to be normalized.</param>
 /// <param name="manifest">Manifest forwarded to the selected normalizer.</param>
 /// <param name="mode">GC normalization mode.</param>
 /// <exception cref="ApplicationException">Thrown when <paramref name="mode"/> is not a supported mode.</exception>
 static void NormalizeByGC(List<GenomicBin> bins, NexteraManifest manifest, CanvasGCNormalizationMode mode)
 {
     if (mode == CanvasGCNormalizationMode.MedianByGC)
     {
         NormalizeByGC(bins, manifest: manifest);
         return;
     }

     if (mode == CanvasGCNormalizationMode.LOESS)
     {
         // Log and exp are supplied as the count transform and its inverse.
         var loessNormalizer = new LoessGCNormalizer(bins, manifest, robustnessIter: 0,
             countTransformer: x => (double)Math.Log(x),
             invCountTransformer: x => (float)Math.Exp(x));
         loessNormalizer.Normalize();
         return;
     }

     throw new ApplicationException("Unsupported Canvas GC normalization mode: " + mode.ToString());
 }
Beispiel #52
0
        /// <summary>
        /// Stabilize the spread of bin counts across GC values: counts in GC bins whose interquartile
        /// range (IQR) is large relative to the genome-wide IQR are shrunk towards that GC bin's median.
        /// </summary>
        /// <param name="bins">Bins whose counts are to be normalized (modified in place).</param>
        /// <param name="manifest">Optional manifest forwarded to EnrichmentUtilities.GetCountsByGC.</param>
        /// <returns>
        /// false, leaving the bins untouched, when no GC value in [10, 90) has twice its IQR above the
        /// genome-wide IQR; true otherwise.
        /// </returns>
        static bool NormalizeVarianceByGC(List<SampleGenomicBin> bins, NexteraManifest manifest = null)
        {
            // countsPerGC[gc] lists the counts of bins with that GC value; allCounts receives the pooled counts.
            List<float>[] countsPerGC;
            List<float> allCounts;
            EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsPerGC, out allCounts);

            // Quartiles over the pooled counts.
            var genomeQuartiles = Utilities.Quartiles(allCounts);

            int gcValueCount = countsPerGC.Length;
            List<float> iqrPerGC = new List<float>(gcValueCount);
            List<Tuple<float, float, float>> quartilesPerGC = new List<Tuple<float, float, float>>(gcValueCount);

            // Quartiles and IQR for every GC value.
            for (int gc = 0; gc < gcValueCount; gc++)
            {
                int binsWithThisGC = countsPerGC[gc].Count;
                Tuple<float, float, float> gcQuartiles;
                float gcIqr;

                if (binsWithThisGC == 0)
                {
                    // No data for this GC value: mark with -1 placeholders.
                    gcQuartiles = new Tuple<float, float, float>(-1f, -1f, -1f);
                    gcIqr = -1f;
                }
                else if (binsWithThisGC >= defaultMinNumberOfBinsPerGC)
                {
                    gcQuartiles = Utilities.Quartiles(countsPerGC[gc]);
                    gcIqr = gcQuartiles.Item3 - gcQuartiles.Item1;
                }
                else
                {
                    // Too few bins for plain quartiles: use weighted quantiles instead.
                    List<Tuple<float, float>> weightedCounts = GetWeightedCounts(countsPerGC, gc);
                    double[] weighted = Utilities.WeightedQuantiles(weightedCounts, new List<float>() { 0.25f, 0.5f, 0.75f });
                    gcQuartiles = new Tuple<float, float, float>((float)weighted[0], (float)weighted[1], (float)weighted[2]);
                    gcIqr = (float)(weighted[2] - weighted[0]);
                }

                quartilesPerGC.Add(gcQuartiles);
                iqrPerGC.Add(gcIqr);
            }

            float globalIQR = genomeQuartiles.Item3 - genomeQuartiles.Item1;

            // Check GC values 10..89 for an IQR that, doubled, exceeds the genome-wide IQR.
            bool anyWideGCBin = false;
            for (int gc = 10; gc < 90; gc++)
            {
                if (globalIQR < iqrPerGC[gc] * 2f)
                {
                    anyWideGCBin = true;
                }
            }

            if (!anyWideGCBin)
            {
                return false;
            }

            // Shrink counts towards the GC-specific median wherever 80% of the local IQR still exceeds the global IQR.
            foreach (SampleGenomicBin bin in bins)
            {
                var gcValue = bin.GenomicBin.GC;
                var scaledLocalIqr = iqrPerGC[gcValue] * 0.8f;
                if (globalIQR >= scaledLocalIqr)
                {
                    continue;
                }

                // Ratio of the scaled local IQR to the global IQR.
                float iqrRatio = scaledLocalIqr / globalIQR;
                var gcMedian = quartilesPerGC[gcValue].Item2;
                bin.Count = gcMedian + (bin.Count - gcMedian) / iqrRatio;
            }

            return true;
        }
Beispiel #53
0
        /// <summary>
        /// Drop bins whose GC value is shared by too few bins.
        /// </summary>
        /// <param name="bins">Genomic bins from which GC content outliers are filtered out.</param>
        /// <param name="threshold">
        /// Minimum number of bins with the same GC content required to keep a bin. The value actually
        /// applied is capped by the larger of minNumberOfBinsPerGCForWeightedMedian and the average
        /// number of counted bins per GC value.
        /// </param>
        /// <param name="manifest">Optional manifest; when given, only on-target bins contribute to the per-GC tallies.</param>
        /// <returns>A new list containing the retained bins in their original order.</returns>
        /// <remarks>
        /// GC normalization computes the median count for each possible GC value. If few bins share a GC
        /// value, the corresponding normalization constant is unstable and those bins should not be used.
        /// </remarks>
        static List<GenomicBin> RemoveBinsWithExtremeGC(List<GenomicBin> bins, int threshold, NexteraManifest manifest = null)
        {
            // One tally per possible GC value.
            int[] binsPerGC = new int[EnrichmentUtilities.numberOfGCbins];
            double countedBinTotal = 0;

            var binsToCount = manifest == null ? bins : EnrichmentUtilities.GetOnTargetBins(bins, manifest);
            foreach (GenomicBin candidate in binsToCount)
            {
                // Only autosomal bins are tallied because the normalization factor is computed on them.
                if (GenomeMetadata.SequenceMetadata.IsAutosome(candidate.Chromosome))
                {
                    binsPerGC[candidate.GC]++;
                    countedBinTotal++;
                }
            }

            int averageBinsPerGC = Math.Max(minNumberOfBinsPerGCForWeightedMedian, (int)(countedBinTotal / binsPerGC.Length));
            int effectiveThreshold = Math.Min(threshold, averageBinsPerGC);

            // Keep every bin (autosomal or not) whose GC value is common enough.
            return bins.FindAll(bin => binsPerGC[bin.GC] >= effectiveThreshold);
        }
Beispiel #54
0
        /// <summary>
        /// Computes the fragment-based GC normalization correction factor: for every read GC bin, the
        /// ratio of observed to expected read counts, rescaled by the global expected/observed ratio.
        /// </summary>
        /// <param name="observedAlignments">Observed hits per position, keyed by chromosome. Chromosomes missing here are skipped.</param>
        /// <param name="readGCContent">Read GC content per position, keyed by chromosome.</param>
        /// <param name="manifest">Optional manifest; when given, only positions inside its regions are counted.</param>
        /// <param name="debugGC">When true, the per-bin expected, observed and ratio values are written to <paramref name="outFile"/> + ".gcstat".</param>
        /// <param name="outFile">Base path of the debug output.</param>
        /// <returns>
        /// An array of observed vs expected GC ratios with one element per GC bin. Every element is 1
        /// (no correction) when no observed reads were counted at all.
        /// </returns>
        static float[] ComputeObservedVsExpectedGC(Dictionary<string, HitArray> observedAlignments,
            Dictionary<string, byte[]> readGCContent, NexteraManifest manifest,
            bool debugGC, string outFile)
        {
            Dictionary<string, List<NexteraManifest.ManifestRegion>> regionsByChrom = null;
            if (manifest != null)
            {
                regionsByChrom = manifest.GetManifestRegionsByChromosome();
            }

            long[] expectedReadCountsByGC = new long[numberOfGCbins];
            long[] observedReadCountsByGC = new long[numberOfGCbins];
            foreach (KeyValuePair<string, byte[]> chromosomeReadGCContent in readGCContent)
            {
                string chr = chromosomeReadGCContent.Key;
                byte[] gcByPosition = chromosomeReadGCContent.Value;

                // Look the chromosome up once instead of once per position inside the loops below.
                HitArray chromosomeHits;
                if (!observedAlignments.TryGetValue(chr, out chromosomeHits)) { continue; }
                var observedHits = chromosomeHits.Data;

                if (manifest == null) // look at the entire genome
                {
                    for (int i = 0; i < gcByPosition.Length; i++)
                    {
                        expectedReadCountsByGC[gcByPosition[i]]++;
                        observedReadCountsByGC[gcByPosition[i]] += observedHits[i];
                    }
                }
                else // look at only the targeted regions
                {
                    List<NexteraManifest.ManifestRegion> regions;
                    if (!regionsByChrom.TryGetValue(chr, out regions)) { continue; }

                    // i is the next position to count; it is never moved backwards, so positions
                    // covered by overlapping regions are counted once.
                    int i = -1;
                    foreach (var region in regions)
                    {
                        if (i < region.Start)
                        {
                            i = region.Start - 1; // i is 0-based; manifest coordinates are 1-based.
                        }
                        for (; i < gcByPosition.Length && i < region.End; i++)
                        {
                            expectedReadCountsByGC[gcByPosition[i]]++;
                            observedReadCountsByGC[gcByPosition[i]] += observedHits[i];
                        }
                    }
                }
            }

            // Totals are taken before empty bins are replaced by 1 below.
            long sumObserved = 0;
            long sumExpected = 0;
            foreach (long observedCount in observedReadCountsByGC)
            {
                sumObserved += observedCount;
            }
            foreach (long expectedCount in expectedReadCountsByGC)
            {
                sumExpected += expectedCount;
            }

            // Without any observed read the global ratio would be a division by zero (NaN/Infinity);
            // in that case every bin keeps the neutral factor 1.
            bool hasObservedReads = sumObserved > 0;
            float globalExpectedVsObserved = hasObservedReads ? (float)sumExpected / (float)sumObserved : 1f;

            // Calculate the ratio of observed to expected read counts for each read GC bin.
            float[] observedVsExpectedGC = new float[numberOfGCbins];
            for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
            {
                // Empty bins are replaced by 1 to avoid zero numerators/denominators; the replaced
                // values also appear in the debug output.
                if (expectedReadCountsByGC[binIndex] == 0)
                {
                    expectedReadCountsByGC[binIndex] = 1;
                }
                if (observedReadCountsByGC[binIndex] == 0)
                {
                    observedReadCountsByGC[binIndex] = 1;
                }

                observedVsExpectedGC[binIndex] = hasObservedReads
                    ? ((float)observedReadCountsByGC[binIndex] / (float)expectedReadCountsByGC[binIndex]) * globalExpectedVsObserved
                    : 1f;
            }

            if (debugGC)
            {
                using (GzipWriter writer = new GzipWriter(outFile + ".gcstat"))
                {
                    for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
                    {
                        writer.WriteLine(string.Format("{0}\t{1}\t{2}", expectedReadCountsByGC[binIndex], observedReadCountsByGC[binIndex], observedVsExpectedGC[binIndex]));
                    }
                }
            }
            return observedVsExpectedGC;
        }