/// <summary>
/// Creates a callset from individual input locations. The sample BAM and every normal
/// BAM location are wrapped in <c>Bam</c> objects, and the genome metadata is
/// deserialized from the "GenomeSize.xml" file inside the whole-genome FASTA folder.
/// </summary>
/// <remarks>
/// The normal BAMs are exposed as a lazy projection, so a new <c>Bam</c> is created
/// for each location every time <c>NormalBamPaths</c> is enumerated.
/// </remarks>
public CanvasCallset(
    IFileLocation bam,
    string sampleName,
    IDirectoryLocation wholeGenomeFastaFolder,
    IDirectoryLocation outputFolder,
    IFileLocation kmerFasta,
    IFileLocation filterBed,
    IFileLocation ploidyBed,
    IFileLocation normalVcfPath,
    bool isDbSnpVcf,
    IEnumerable<IFileLocation> normalBamPaths,
    NexteraManifest manifest,
    IFileLocation somaticVcfPath,
    IFileLocation outputVcfPath)
{
    // Sample inputs.
    this.Bam = new Bam(bam);
    this.SampleName = sampleName;

    // Reference data and output location.
    this.WholeGenomeFastaFolder = wholeGenomeFastaFolder;
    this.OutputFolder = outputFolder;
    this.KmerFasta = kmerFasta;
    this.FilterBed = filterBed;
    this.PloidyBed = ploidyBed;

    // Variant inputs, manifest and result path.
    this.NormalVcfPath = normalVcfPath;
    this.IsDbSnpVcf = isDbSnpVcf;
    this.Manifest = manifest;
    this.SomaticVcfPath = somaticVcfPath;
    this.OutputVcfPath = outputVcfPath;
    this.NormalBamPaths = normalBamPaths.Select(location => new Bam(location));

    // Genome metadata is read from the XML file that sits next to the FASTA files.
    var genomeSizeFile = this.WholeGenomeFastaFolder.GetFileLocation("GenomeSize.xml");
    this.GenomeMetadata = new GenomeMetadata();
    this.GenomeMetadata.Deserialize(genomeSizeFile.FullName);
}
/// <summary>
/// Writes one tab-separated line per manifest region: the target interval's reference
/// name, its begin minus one, its end, and the region name.
/// </summary>
/// <param name="manifest">Manifest whose regions are written.</param>
/// <param name="writer">Destination writer.</param>
/// <param name="genome">
/// Optional genome metadata. When supplied, a copy of the regions is sorted with
/// <c>ManifestRegion.CompareTo</c> using a lookup from sequence name to its index in
/// <c>genome.Sequences</c>; when null, the manifest's own region order is used.
/// </param>
public static void WriteTargetBed(NexteraManifest manifest, BgzipOrStreamWriter writer, GenomeMetadata genome)
{
    List<NexteraManifest.ManifestRegion> orderedRegions;
    if (genome == null)
    {
        orderedRegions = manifest.Regions;
    }
    else
    {
        // Sort a copy so the manifest's own list is left untouched.
        orderedRegions = new List<NexteraManifest.ManifestRegion>(manifest.Regions);

        // Map each sequence name to its position in the genome metadata.
        var chromosomeRank = new Dictionary<string, int>();
        for (int rank = 0; rank < genome.Sequences.Count; rank++)
        {
            chromosomeRank[genome.Sequences[rank].Name] = rank;
        }

        orderedRegions.Sort((left, right) => left.CompareTo(right, chromosomeRank));
    }

    foreach (NexteraManifest.ManifestRegion region in orderedRegions)
    {
        TargetInterval interval = region.GetTargetInterval();
        string[] fields =
        {
            interval.ReferenceName,
            (interval.Begin - 1).ToString(CultureInfo.InvariantCulture),
            interval.End.ToString(CultureInfo.InvariantCulture),
            region.Name // region name is needed for PUMA metrics outputs to generate .coverage.csv file
        };
        writer.WriteLine(string.Join("\t", fields));
    }
}
/// <summary>
/// Creates a calculator that keeps the given manifest and the lower and upper
/// reference-count bounds.
/// </summary>
/// <param name="manifest">Manifest stored for later use.</param>
/// <param name="minReferenceCount">Lower reference-count bound (default 1).</param>
/// <param name="maxReferecneCount">Upper reference-count bound (default positive infinity).</param>
public RawRatioCalculator(
    NexteraManifest manifest,
    double minReferenceCount = 1,
    double maxReferecneCount = double.PositiveInfinity)
{
    this._manifest = manifest;

    // Count bounds.
    this._minReferenceCount = minReferenceCount;
    this._maxReferenceCount = maxReferecneCount;
}
/// <summary>
/// Collects the counts of autosomal bins, both grouped by GC value and as one flat list.
/// Assumes the bins are sorted by genomic coordinates.
/// </summary>
/// <param name="bins">Bins whose counts are to be normalized</param>
/// <param name="manifest">When non-null, only the bins returned by <c>EnrichmentUtilities.GetOnTargetBins</c> are considered.</param>
/// <param name="countsByGC">An array of lists. Each array element (0-100) will hold a list of counts whose bins have the same GC content.</param>
/// <param name="counts">Will hold all of the autosomal counts present in 'bins'</param>
public static void GetCountsByGC(List<SampleGenomicBin> bins, NexteraManifest manifest, out List<float>[] countsByGC, out List<float> counts)
{
    countsByGC = new List<float>[numberOfGCbins];
    counts = new List<float>(bins.Count);

    // One (initially empty) list per GC slot.
    for (int gc = 0; gc < countsByGC.Length; gc++)
    {
        countsByGC[gc] = new List<float>();
    }

    var candidateBins = manifest == null ? bins : EnrichmentUtilities.GetOnTargetBins(bins, manifest);
    foreach (SampleGenomicBin sampleBin in candidateBins)
    {
        if (GenomeMetadata.SequenceMetadata.IsAutosome(sampleBin.GenomicBin.Chromosome))
        {
            // File the count under its GC value and in the overall list.
            countsByGC[sampleBin.GenomicBin.GC].Add(sampleBin.Count);
            counts.Add(sampleBin.Count);
        }
    }
}
/// <summary>
/// Output bed file of regions. Each region spans both probes and the target interval.
/// Opens a <c>BgzipOrStreamWriter</c> on <paramref name="outputPath"/>, delegates to the
/// writer-based overload, and disposes the writer afterwards.
/// Note that the BED format uses:
/// 0-based start position (inclusive) and 1-based end position (inclusive)
/// which is equivalent to saying:
/// 0-based start position (inclusive) and 0-based end position (exclusive)
/// </summary>
/// <param name="manifest">Manifest whose regions are written.</param>
/// <param name="outputPath">Path of the BED file to create.</param>
/// <param name="genome">Optional genome metadata forwarded to the writer-based overload.</param>
public static void WriteRegionBed(NexteraManifest manifest, string outputPath, GenomeMetadata genome)
{
    using (var bedWriter = new BgzipOrStreamWriter(outputPath))
    {
        WriteRegionBed(manifest, bedWriter, genome);
    }
}
/// <summary>
/// Writes the manifest into the sandbox directory under the manifest's own name.
/// </summary>
/// <param name="manifest">Manifest to write; its <c>Name</c> is used as the file name.</param>
/// <param name="sandbox">Directory that receives the file.</param>
/// <returns>The location of the written manifest file.</returns>
private IFileLocation WriteManifest(NexteraManifest manifest, IDirectoryLocation sandbox)
{
    var manifestFile = sandbox.GetFileLocation(manifest.Name);
    string manifestFilePath = manifestFile.FullName;
    NexteraManifestUtils.WriteNexteraManifests(manifest, manifestFilePath);
    return manifestFile;
}
/// <summary>
/// Intersects a dbSNP VCF with the manifest and stores the result in the sandbox as
/// "{manifest name}_{VCF file name}".
/// </summary>
/// <param name="fullDbSnpVcf">The VCF to intersect.</param>
/// <param name="manifest">Manifest passed to <c>IntersectVcfWithManifest</c>.</param>
/// <param name="sandBox">Directory that receives the intersected VCF.</param>
/// <returns>The location of the intersected VCF.</returns>
private IFileLocation CreateDbSnpVcfForManifest(IFileLocation fullDbSnpVcf, NexteraManifest manifest, IDirectoryLocation sandBox)
{
    string targetedFileName = $"{manifest.Name}_{fullDbSnpVcf.Name}";
    IFileLocation targetedDbSnpVcf = sandBox.GetFileLocation(targetedFileName);

    Isas.Manifests.NexteraManifest.VcfUtilities.IntersectVcfWithManifest(
        fullDbSnpVcf.FullName,
        targetedDbSnpVcf.FullName,
        manifest);

    return targetedDbSnpVcf;
}
/// <summary>
/// Reads a tab-separated binned file (chromosome, start, stop, count) and returns every
/// bin count together with the indices of the bins that overlap a manifest region.
/// </summary>
/// <remarks>
/// The region cursor only moves forward within a chromosome, so the scan relies on the
/// bins and the per-chromosome regions being in ascending coordinate order.
/// </remarks>
/// <param name="binnedPath">Path of the binned file, opened with <c>GzipReader</c>.</param>
/// <param name="manifest">Manifest supplying the regions, grouped by chromosome.</param>
/// <param name="binCounts">Receives the count column of every line, in file order.</param>
/// <param name="onTargetIndices">Receives the zero-based line indices of the overlapping bins.</param>
private static void LoadBinCounts(string binnedPath, NexteraManifest manifest, out List<double> binCounts, out List<int> onTargetIndices)
{
    binCounts = new List<double>();
    onTargetIndices = new List<int>();

    var regionsByChrom = manifest.GetManifestRegionsByChromosome();
    List<NexteraManifest.ManifestRegion> chromRegions = null; // 1-based regions
    string previousChrom = null;
    int cursor = -1;
    int lineIndex = 0;

    using (var reader = new GzipReader(binnedPath))
    {
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            string[] fields = line.Split('\t');
            string chrom = fields[0];
            int start = int.Parse(fields[1]); // 0-based, inclusive
            int stop = int.Parse(fields[2]); // 0-based, exclusive

            if (chrom != previousChrom)
            {
                // New chromosome: switch to its region list (if any) and rewind the cursor.
                previousChrom = chrom;
                if (regionsByChrom.ContainsKey(chrom))
                {
                    chromRegions = regionsByChrom[chrom];
                    cursor = 0;
                }
                else
                {
                    chromRegions = null;
                }
            }

            if (chromRegions != null)
            {
                // Skip regions that end before this bin begins (bin start converted to 1-based).
                while (cursor < chromRegions.Count && chromRegions[cursor].End < start + 1)
                {
                    cursor++;
                }

                // The first remaining region overlaps when it starts at or before the bin's stop.
                if (cursor < chromRegions.Count && chromRegions[cursor].Start <= stop)
                {
                    onTargetIndices.Add(lineIndex);
                }
            }

            binCounts.Add(double.Parse(fields[3]));
            lineIndex++;
        }
    }
}
/// <summary>
/// Creates a callset from individual input locations. The sample BAM and every normal
/// BAM location are wrapped in <c>Bam</c> objects, and the genome metadata is
/// deserialized from the "GenomeSize.xml" file inside the whole-genome FASTA folder.
/// </summary>
/// <remarks>
/// The normal BAMs are exposed as a lazy projection, so a new <c>Bam</c> is created
/// for each location every time <c>NormalBamPaths</c> is enumerated.
/// </remarks>
public CanvasCallset(
    IFileLocation bam,
    string sampleName,
    IDirectoryLocation wholeGenomeFastaFolder,
    IDirectoryLocation outputFolder,
    IFileLocation kmerFasta,
    IFileLocation filterBed,
    IFileLocation ploidyBed,
    IFileLocation normalVcfPath,
    bool isDbSnpVcf,
    IEnumerable<IFileLocation> normalBamPaths,
    NexteraManifest manifest,
    IFileLocation somaticVcfPath,
    IFileLocation outputVcfPath)
{
    // Sample inputs.
    this.Bam = new Bam(bam);
    this.SampleName = sampleName;

    // Reference data and output location.
    this.WholeGenomeFastaFolder = wholeGenomeFastaFolder;
    this.OutputFolder = outputFolder;
    this.KmerFasta = kmerFasta;
    this.FilterBed = filterBed;
    this.PloidyBed = ploidyBed;

    // Variant inputs, manifest and result path.
    this.NormalVcfPath = normalVcfPath;
    this.IsDbSnpVcf = isDbSnpVcf;
    this.Manifest = manifest;
    this.SomaticVcfPath = somaticVcfPath;
    this.OutputVcfPath = outputVcfPath;
    this.NormalBamPaths = normalBamPaths.Select(location => new Bam(location));

    // Genome metadata is read from the XML file that sits next to the FASTA files.
    var genomeSizeFile = this.WholeGenomeFastaFolder.GetFileLocation("GenomeSize.xml");
    this.GenomeMetadata = new GenomeMetadata();
    this.GenomeMetadata.Deserialize(genomeSizeFile.FullName);
}
/// <summary>
/// Merges the binned counts of one or more samples into <paramref name="mergedBinnedPath"/>.
/// A single input is copied as-is (if it exists). Several inputs are combined into a
/// per-bin weighted average whose weights are the inverse on-target median bin counts,
/// rescaled so that they sum to 1.
/// </summary>
/// <param name="binnedPaths">Paths of the binned files to merge. Enumerated exactly once.</param>
/// <param name="mergedBinnedPath">Destination path. In the single-input case an existing file is deleted before the copy.</param>
/// <param name="manifest">Optional manifest handed to <c>BinCounts</c> when loading each sample.</param>
private static void GetWeightedAverageBinCount(IEnumerable<string> binnedPaths, string mergedBinnedPath, NexteraManifest manifest = null)
{
    // Materialize once: the sequence used to be re-enumerated by Count(), ElementAt() and First(),
    // which is quadratic for non-list sources and unsafe for lazily produced ones.
    List<string> paths = binnedPaths.ToList();
    int sampleCount = paths.Count;

    if (sampleCount == 1) // copy file
    {
        string onlyPath = paths[0];
        if (File.Exists(onlyPath))
        {
            if (File.Exists(mergedBinnedPath)) { File.Delete(mergedBinnedPath); }
            File.Copy(onlyPath, mergedBinnedPath);
        }
        return;
    }

    // merge normal samples
    double[] weights = new double[sampleCount];
    List<double>[] binCountsBySample = new List<double>[sampleCount];
    for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
    {
        var binCounts = new BinCounts(paths[sampleIndex], manifest: manifest);
        List<double> counts = binCounts.AllCounts;
        // If a manifest is available, get the median of bins overlapping the targeted regions only.
        // For small panels, there could be a lot of bins with zero count and the median would be 0
        // if taken over all the bins, resulting in division by zero.
        double median = binCounts.OnTargetMedianBinCount;
        weights[sampleIndex] = median > 0 ? 1.0 / median : 0;
        binCountsBySample[sampleIndex] = counts;
    }

    double weightSum = weights.Sum();
    if (weightSum > 0)
    {
        for (int i = 0; i < sampleCount; i++) { weights[i] /= weightSum; } // so weights sum to 1
    }
    else
    {
        // No sample had a positive on-target median, so every weight is 0 and 0/0 would turn
        // all merged counts into NaN. Fall back to a plain (unweighted) average instead.
        for (int i = 0; i < sampleCount; i++) { weights[i] = 1.0 / sampleCount; }
    }

    // Compute the weighted average of bin counts across samples, reusing the first file's
    // columns and replacing only the count column. First() keeps the original
    // InvalidOperationException when no paths were supplied.
    using (GzipReader reader = new GzipReader(paths.First()))
    using (GzipWriter writer = new GzipWriter(mergedBinnedPath))
    {
        string line;
        int lineIdx = 0;
        while ((line = reader.ReadLine()) != null)
        {
            string[] toks = line.Split('\t');
            double weightedBinCount = 0;
            for (int i = 0; i < sampleCount; i++)
            {
                weightedBinCount += weights[i] * binCountsBySample[i][lineIdx];
            }
            // NOTE(review): formatted with the current culture, as before; confirm that
            // readers of this file parse with the same culture.
            toks[3] = String.Format("{0}", weightedBinCount);
            writer.WriteLine(String.Join("\t", toks));
            lineIdx++;
        }
    }
}
/// <summary>
/// Pick the best normal control that has the smallest mean squared log-ratios (LR2s)
/// against the tumor sample, and copy its binned file to <paramref name="bestBinnedPath"/>.
/// </summary>
/// <remarks>
/// Counts are scaled by the inverse on-target median bin count before comparison.
/// With a single normal no scoring is done and that normal is used. If no normal yields a
/// score below positive infinity, the first normal is used.
/// </remarks>
/// <param name="tumorBinnedPath">Binned file of the tumor sample.</param>
/// <param name="normalBinnedPaths">Binned files of the candidate normal samples. Enumerated exactly once.</param>
/// <param name="bestBinnedPath">Destination path; an existing file is deleted before the copy.</param>
/// <param name="manifest">Optional manifest handed to <c>BinCounts</c> when loading each sample.</param>
private static void GetBestLR2BinCount(string tumorBinnedPath, IEnumerable<string> normalBinnedPaths, string bestBinnedPath, NexteraManifest manifest = null)
{
    // Materialize once: the sequence used to be re-enumerated by Count() and ElementAt().
    List<string> normalPaths = normalBinnedPaths.ToList();
    int normalSampleCount = normalPaths.Count;

    // Index 0 is both the single-normal answer and the fallback when no candidate scores
    // below +Infinity (previously the index stayed at -1 and the lookup below threw).
    int bestNormalSampleIndex = 0;

    if (normalSampleCount > 1) // find the best normal
    {
        List<double[]> binCountsByNormalSample = new List<double[]>(normalSampleCount);
        foreach (string normalBinnedPath in normalPaths)
        {
            var binCounts = new BinCounts(normalBinnedPath, manifest: manifest);
            List<double> counts = binCounts.OnTargetCounts;
            // If a manifest is available, get the median of bins overlapping the targeted regions only.
            // For small panels, there could be a lot of bins with zero count and the median would be 0
            // if taken over all the bins, resulting in division by zero.
            double median = binCounts.OnTargetMedianBinCount;
            double weight = median > 0 ? 1.0 / median : 0;
            binCountsByNormalSample.Add(counts.Select(cnt => cnt * weight).ToArray());
        }

        double[] tumorBinCounts;
        {
            var binCounts = new BinCounts(tumorBinnedPath, manifest: manifest);
            List<double> counts = binCounts.OnTargetCounts;
            double tumorMedian = binCounts.OnTargetMedianBinCount;
            double tumorWeight = tumorMedian > 0 ? 1.0 / tumorMedian : 0;
            tumorBinCounts = counts.Select(cnt => cnt * tumorWeight).ToArray();
        }

        // Find the best normal sample
        double minMeanSquaredLogRatios = double.PositiveInfinity;
        for (int normalSampleIndex = 0; normalSampleIndex < normalSampleCount; normalSampleIndex++)
        {
            // Item1 is the mean squared log ratio; Item2 (ignored bin count) is not used yet.
            var result = GetMeanSquaredLogRatios(tumorBinCounts, binCountsByNormalSample[normalSampleIndex]);
            double meanSquaredLogRatios = result.Item1;
            // TODO: Skip a (bad) normal sample if too many bins were ignored.
            // Donavan's script skips a normal sample if more than 100 log ratios is NA.
            // The cut-off is likely panel-dependent.
            if (meanSquaredLogRatios < minMeanSquaredLogRatios)
            {
                minMeanSquaredLogRatios = meanSquaredLogRatios;
                bestNormalSampleIndex = normalSampleIndex;
            }
        }
    }

    // copy file (throws ArgumentOutOfRangeException when no normals were supplied, as before)
    string srcBinnedPath = normalPaths[bestNormalSampleIndex];
    if (File.Exists(srcBinnedPath))
    {
        if (File.Exists(bestBinnedPath)) { File.Delete(bestBinnedPath); }
        File.Copy(srcBinnedPath, bestBinnedPath);
    }
}
/// <summary>
/// Creates a reference generator for one sample's binned file: loads a <c>PCAModel</c>
/// from the model file and builds a <c>RawRatioCalculator</c> with the same manifest and
/// bin-count bounds.
/// </summary>
/// <param name="sampleBinnedFile">Binned file of the sample.</param>
/// <param name="pcaModelFile">File from which the <c>PCAModel</c> is constructed.</param>
/// <param name="manifest">Manifest stored and forwarded to the ratio calculator.</param>
/// <param name="minBinCount">Lower bin-count bound (default 1).</param>
/// <param name="maxBinCount">Upper bin-count bound (default positive infinity).</param>
public PCAReferenceGenerator(
    IFileLocation sampleBinnedFile,
    IFileLocation pcaModelFile,
    NexteraManifest manifest,
    double minBinCount = 1,
    double maxBinCount = double.PositiveInfinity)
{
    this._sampleBinnedFile = sampleBinnedFile;
    this._model = new PCAModel(pcaModelFile);
    this._manifest = manifest;

    // The ratio calculator is built from the stored bounds.
    this._minBinCount = minBinCount;
    this._maxBinCount = maxBinCount;
    this._ratioCalculator = new RawRatioCalculator(manifest, this._minBinCount, this._maxBinCount);
}
/// <summary>
/// Fills <c>Counts</c> from the given bins. With a manifest, the manifest-aware overload
/// is used and <c>OnTargetIndices</c> is filled as well; without one, only <c>Counts</c>
/// is assigned.
/// </summary>
/// <param name="bins">Bins whose counts are loaded.</param>
/// <param name="manifest">Optional manifest used to determine on-target bins.</param>
private void LoadBinCounts(IEnumerable<SampleGenomicBin> bins, NexteraManifest manifest)
{
    if (manifest != null)
    {
        LoadBinCounts(bins, manifest, out Counts, out OnTargetIndices);
        return;
    }

    // No manifest: take every bin's count as a double.
    Counts = bins.Select(sampleBin => (double)sampleBin.Count).ToList();
}
/// <summary>
/// Fills <c>Counts</c> from a binned file. With a manifest, the manifest-aware overload
/// is used and <c>OnTargetIndices</c> is filled as well; without one, only <c>Counts</c>
/// is loaded.
/// </summary>
/// <param name="binnedPath">Path of the binned file.</param>
/// <param name="manifest">Optional manifest used to determine on-target bins.</param>
private void LoadBinCounts(string binnedPath, NexteraManifest manifest)
{
    if (manifest != null)
    {
        LoadBinCounts(binnedPath, manifest, out Counts, out OnTargetIndices);
        return;
    }

    LoadBinCounts(binnedPath, out Counts);
}
/// <summary>
/// Creates a LOESS GC normalizer over the given bins and runs <c>initialize()</c>.
/// </summary>
/// <param name="bins">Bins to normalize.</param>
/// <param name="manifest">Manifest stored on the normalizer.</param>
/// <param name="robustnessIter">Applied only when non-negative; otherwise the field keeps its existing value.</param>
/// <param name="countTransformer">Count transform; applied only when <paramref name="invCountTransformer"/> is also supplied.</param>
/// <param name="invCountTransformer">Inverse transform; applied only when <paramref name="countTransformer"/> is also supplied.</param>
public LoessGCNormalizer(
    IEnumerable<GenomicBin> bins,
    NexteraManifest manifest,
    int robustnessIter = 2,
    Func<float, double> countTransformer = null,
    Func<double, float> invCountTransformer = null)
{
    this.manifest = manifest;
    this.bins = bins;

    bool iterationCountIsUsable = robustnessIter >= 0;
    if (iterationCountIsUsable)
    {
        this.robustnessIter = robustnessIter;
    }

    // The two transformers are only accepted as a pair.
    bool transformerMissing = countTransformer == null || invCountTransformer == null;
    if (!transformerMissing)
    {
        this.countTransformer = countTransformer;
        this.invCountTransformer = invCountTransformer;
    }

    initialize();
}
/// <summary>
/// Creates a reference generator over the given control binned files after checking
/// that each of them exists.
/// </summary>
/// <param name="controlBinnedFiles">Binned files of the control samples.</param>
/// <param name="manifest">Manifest stored on the generator.</param>
/// <exception cref="FileNotFoundException">Thrown for the first control file that does not exist.</exception>
public WeightedAverageReferenceGenerator(IEnumerable<IFileLocation> controlBinnedFiles, NexteraManifest manifest)
{
    foreach (var candidate in controlBinnedFiles)
    {
        if (candidate.Exists)
        {
            continue;
        }

        throw new FileNotFoundException(candidate.FullName + " does not exist.");
    }

    this._manifest = manifest;
    this._controlBinnedFiles = controlBinnedFiles;
}
/// <summary>
/// Returns the count of every bin together with the indices of the bins that overlap a
/// manifest region.
/// </summary>
/// <remarks>
/// The region cursor only moves forward within a chromosome, so the scan relies on the
/// bins and the per-chromosome regions being in ascending coordinate order.
/// </remarks>
/// <param name="bins">Bins to scan.</param>
/// <param name="manifest">Manifest supplying the regions, grouped by chromosome.</param>
/// <param name="binCounts">Receives the count of every bin, in input order.</param>
/// <param name="onTargetIndices">Receives the zero-based positions of the overlapping bins.</param>
private static void LoadBinCounts(IEnumerable<SampleGenomicBin> bins, NexteraManifest manifest, out List<double> binCounts, out List<int> onTargetIndices)
{
    binCounts = new List<double>();
    onTargetIndices = new List<int>();

    var regionsByChrom = manifest.GetManifestRegionsByChromosome();
    List<NexteraManifest.ManifestRegion> chromRegions = null; // 1-based regions
    string previousChrom = null;
    int cursor = -1;
    int position = 0;

    foreach (var sampleBin in bins)
    {
        if (previousChrom != sampleBin.GenomicBin.Chromosome)
        {
            // New chromosome: switch to its region list (if any) and rewind the cursor.
            previousChrom = sampleBin.GenomicBin.Chromosome;
            if (regionsByChrom.ContainsKey(previousChrom))
            {
                chromRegions = regionsByChrom[previousChrom];
                cursor = 0;
            }
            else
            {
                chromRegions = null;
            }
        }

        if (chromRegions != null)
        {
            // Skip regions that end before this bin begins (bin start converted to 1-based).
            while (cursor < chromRegions.Count && chromRegions[cursor].End < sampleBin.Start + 1)
            {
                cursor++;
            }

            // The first remaining region overlaps when it starts at or before the bin's stop.
            if (cursor < chromRegions.Count && chromRegions[cursor].Start <= sampleBin.Stop)
            {
                onTargetIndices.Add(position);
            }
        }

        binCounts.Add(sampleBin.Count);
        position++;
    }
}
/// <summary>
/// Merges the binned counts of one or more samples into <paramref name="mergedBinnedPath"/>.
/// A single input is copied as-is (if it exists). Several inputs are combined into a
/// per-bin weighted average whose weights are the inverse on-target median bin counts,
/// rescaled so that they sum to 1.
/// </summary>
/// <param name="binnedPaths">Paths of the binned files to merge. Enumerated exactly once.</param>
/// <param name="mergedBinnedPath">Destination path. In the single-input case an existing file is deleted before the copy.</param>
/// <param name="manifest">Optional manifest handed to <c>BinCounts</c> when loading each sample.</param>
private static void GetWeightedAverageBinCount(IEnumerable<string> binnedPaths, string mergedBinnedPath, NexteraManifest manifest = null)
{
    // Materialize once: the sequence used to be re-enumerated by Count(), ElementAt() and First(),
    // which is quadratic for non-list sources and unsafe for lazily produced ones.
    List<string> paths = binnedPaths.ToList();
    int sampleCount = paths.Count;

    if (sampleCount == 1) // copy file
    {
        string onlyPath = paths[0];
        if (File.Exists(onlyPath))
        {
            if (File.Exists(mergedBinnedPath)) { File.Delete(mergedBinnedPath); }
            File.Copy(onlyPath, mergedBinnedPath);
        }
        return;
    }

    // merge normal samples
    double[] weights = new double[sampleCount];
    List<double>[] binCountsBySample = new List<double>[sampleCount];
    for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
    {
        var binCounts = new BinCounts(paths[sampleIndex], manifest: manifest);
        List<double> counts = binCounts.AllCounts;
        // If a manifest is available, get the median of bins overlapping the targeted regions only.
        // For small panels, there could be a lot of bins with zero count and the median would be 0
        // if taken over all the bins, resulting in division by zero.
        double median = binCounts.OnTargetMedianBinCount;
        weights[sampleIndex] = median > 0 ? 1.0 / median : 0;
        binCountsBySample[sampleIndex] = counts;
    }

    double weightSum = weights.Sum();
    if (weightSum > 0)
    {
        for (int i = 0; i < sampleCount; i++) { weights[i] /= weightSum; } // so weights sum to 1
    }
    else
    {
        // No sample had a positive on-target median, so every weight is 0 and 0/0 would turn
        // all merged counts into NaN. Fall back to a plain (unweighted) average instead.
        for (int i = 0; i < sampleCount; i++) { weights[i] = 1.0 / sampleCount; }
    }

    // Compute the weighted average of bin counts across samples, reusing the first file's
    // columns and replacing only the count column. First() keeps the original
    // InvalidOperationException when no paths were supplied.
    using (GzipReader reader = new GzipReader(paths.First()))
    using (GzipWriter writer = new GzipWriter(mergedBinnedPath))
    {
        string line;
        int lineIdx = 0;
        while ((line = reader.ReadLine()) != null)
        {
            string[] toks = line.Split('\t');
            double weightedBinCount = 0;
            for (int i = 0; i < sampleCount; i++)
            {
                weightedBinCount += weights[i] * binCountsBySample[i][lineIdx];
            }
            // NOTE(review): formatted with the current culture, as before; confirm that
            // readers of this file parse with the same culture.
            toks[3] = String.Format("{0}", weightedBinCount);
            writer.WriteLine(String.Join("\t", toks));
            lineIdx++;
        }
    }
}
/// <summary>
/// Bundles the inputs of a tumor-normal enrichment run; each argument is stored in the
/// property of the same name.
/// </summary>
public CanvasTumorNormalEnrichmentInput(
    Bam tumorBam,
    Bam normalBam,
    Vcf normalVcf,
    Vcf somaticVcf,
    GenomeMetadata genomeMetadata,
    NexteraManifest nexteraManifest)
{
    // Alignments.
    this.TumorBam = tumorBam;
    this.NormalBam = normalBam;

    // Variant calls.
    this.NormalVcf = normalVcf;
    this.SomaticVcf = somaticVcf;

    // Reference description and target manifest.
    this.GenomeMetadata = genomeMetadata;
    this.NexteraManifest = nexteraManifest;
}
/// <summary>
/// Creates a LOESS GC normalizer over the given sample bins and runs <c>initialize()</c>.
/// </summary>
/// <param name="bins">Bins to normalize.</param>
/// <param name="manifest">Manifest stored on the normalizer.</param>
/// <param name="robustnessIter">Applied only when non-negative; otherwise the field keeps its existing value.</param>
/// <param name="countTransformer">Count transform; applied only when <paramref name="invCountTransformer"/> is also supplied.</param>
/// <param name="invCountTransformer">Inverse transform; applied only when <paramref name="countTransformer"/> is also supplied.</param>
public LoessGCNormalizer(
    IEnumerable<SampleGenomicBin> bins,
    NexteraManifest manifest,
    int robustnessIter = 2,
    Func<float, double> countTransformer = null,
    Func<double, float> invCountTransformer = null)
{
    this.manifest = manifest;
    this.bins = bins;

    bool iterationCountIsUsable = robustnessIter >= 0;
    if (iterationCountIsUsable)
    {
        this.robustnessIter = robustnessIter;
    }

    // The two transformers are only accepted as a pair.
    bool transformerMissing = countTransformer == null || invCountTransformer == null;
    if (!transformerMissing)
    {
        this.countTransformer = countTransformer;
        this.invCountTransformer = invCountTransformer;
    }

    initialize();
}
/// <summary>
/// Bundles the inputs of an enrichment run. The control BAMs are copied into a list and
/// exposed through a <c>ReadOnlyCollection</c>; every other argument is stored as given.
/// </summary>
public CanvasEnrichmentInput(
    Bam bam,
    GenomeMetadata genomeMetadata,
    IEnumerable<Bam> controlBamPaths,
    NexteraManifest nexteraManifest,
    CanvasEnrichmentPrecomputedControl precomputedControl,
    SamplePloidyInfo ploidyInfo,
    IFileLocation predefinedBinsFile,
    CanvasPcaModels pcaModels)
{
    // Snapshot the control BAMs so later changes to the caller's sequence are not seen.
    var controlBams = controlBamPaths.ToList();
    this.NormalBamPaths = new ReadOnlyCollection<Bam>(controlBams);

    this.Bam = bam;
    this.GenomeMetadata = genomeMetadata;
    this.NexteraManifest = nexteraManifest;
    this.PrecomputedControl = precomputedControl;
    this.PloidyInfo = ploidyInfo;
    this.PredefinedBinsFile = predefinedBinsFile;
    this.PcaModels = pcaModels;
}
/// <summary>
/// Builds the tumor-normal callset from the parsed command-line options. The result VCF
/// is "CNV.vcf.gz" in the output directory, and the manifest is loaded from the
/// manifest option with <c>logger.Error</c> as its callback.
/// </summary>
/// <param name="logger">Logger whose <c>Error</c> method is handed to the manifest.</param>
/// <returns>The assembled callset, with the single normal BAM as its only normal.</returns>
private CanvasCallset GetCallset(ILogger logger)
{
    var details = new AnalysisDetails(
        CommonOptions.OutputDirectory,
        CommonOptions.WholeGenomeFasta,
        CommonOptions.KmerFasta,
        CommonOptions.FilterBed,
        SingleSampleCommonOptions.PloidyVcf,
        null);
    IFileLocation cnvVcf = CommonOptions.OutputDirectory.GetFileLocation("CNV.vcf.gz");
    var loadedManifest = new NexteraManifest(_manifest.FullName, null, logger.Error);

    return new CanvasCallset(
        _tumorNormalOptions.TumorBam,
        SingleSampleCommonOptions.SampleName,
        SingleSampleCommonOptions.BAlleleSites,
        SingleSampleCommonOptions.IsDbSnpVcf,
        new[] { _normalBam },
        loadedManifest,
        _tumorNormalOptions.SomaticVcf,
        cnvVcf,
        details);
}
/// <summary>
/// Creates a callset whose single-sample part is built from the sample BAM, sample name,
/// normal VCF, dbSNP flag, the analysis output folder and the output VCF path.
/// </summary>
/// <remarks>
/// The normal BAMs are exposed as a lazy projection, so a new <c>Bam</c> is created
/// for each location every time <c>NormalBamPaths</c> is enumerated.
/// </remarks>
public CanvasCallset(
    IFileLocation bam,
    string sampleName,
    IFileLocation normalVcfPath,
    bool isDbSnpVcf,
    IEnumerable<IFileLocation> normalBamPaths,
    NexteraManifest manifest,
    IFileLocation somaticVcfPath,
    IFileLocation outputVcfPath,
    AnalysisDetails analysisDetails)
{
    var sampleBam = new Bam(bam);
    this.SingleSampleCallset = new SingleSampleCallset(
        sampleBam,
        sampleName,
        normalVcfPath,
        isDbSnpVcf,
        analysisDetails.OutputFolder,
        outputVcfPath);

    this.Manifest = manifest;
    this.SomaticVcfPath = somaticVcfPath;
    this.AnalysisDetails = analysisDetails;
    this.NormalBamPaths = normalBamPaths.Select(location => new Bam(location));
}
/// <summary>
/// Writes, for every bin of the normal file whose count is at least 1, the tumor/normal
/// count ratio scaled by the reference ploidy and a library-size factor.
/// </summary>
/// <remarks>
/// The two input files are read in lock-step, line by line. Each output line reuses the
/// normal file's columns with the count column replaced by the ratio. Tumor lines beyond
/// the end of the normal file are not read.
/// </remarks>
/// <param name="tumorBinnedPath">Binned file of the tumor sample.</param>
/// <param name="normalBinnedPath">Binned file of the normal (reference) sample.</param>
/// <param name="ratioBinnedPath">Output file receiving the ratios.</param>
/// <param name="ploidyBedPath">Optional ploidy BED file; null or empty means no reference ploidy is loaded.</param>
/// <param name="manifest">Optional manifest handed to <c>BinCounts</c> when computing the medians.</param>
/// <exception cref="System.IO.InvalidDataException">The tumor file has fewer lines than the normal file.</exception>
private static void GetBinRatio(string tumorBinnedPath, string normalBinnedPath, string ratioBinnedPath, string ploidyBedPath, NexteraManifest manifest = null)
{
    PloidyInfo referencePloidy = String.IsNullOrEmpty(ploidyBedPath) ? null : PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);
    double tumorMedian = (new BinCounts(tumorBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
    double normalMedian = (new BinCounts(normalBinnedPath, manifest: manifest)).OnTargetMedianBinCount;

    // Only rescale when both medians are positive; otherwise leave the ratio unscaled.
    double librarySizeFactor = (tumorMedian > 0 && normalMedian > 0) ? normalMedian / tumorMedian : 1;

    using (GzipReader tumorReader = new GzipReader(tumorBinnedPath))
    using (GzipReader normalReader = new GzipReader(normalBinnedPath))
    using (GzipWriter writer = new GzipWriter(ratioBinnedPath))
    {
        string normalLine;
        while ((normalLine = normalReader.ReadLine()) != null)
        {
            string tumorLine = tumorReader.ReadLine();
            if (tumorLine == null)
            {
                // Previously this surfaced as a NullReferenceException on Split().
                throw new System.IO.InvalidDataException(
                    String.Format("'{0}' has fewer bins than '{1}'.", tumorBinnedPath, normalBinnedPath));
            }

            string[] normalToks = normalLine.Split('\t');
            string[] tumorToks = tumorLine.Split('\t');
            double normalCount = double.Parse(normalToks[3]);
            double tumorCount = double.Parse(tumorToks[3]);

            // The weighted average count of a bin could be less than 1.
            // Using these small counts for coverage normalization creates large ratios.
            // It would be better to just drop these bins so we don't introduce too much noise into segmentation and CNV calling.
            if (normalCount < 1) { continue; } // skip the bin

            string chrom = normalToks[0];
            int start = int.Parse(normalToks[1]);
            int end = int.Parse(normalToks[2]);

            // get the normal ploidy from intervalsWithPloidyByChrom
            double factor = CanvasDiploidBinRatioFactor * GetPloidy(referencePloidy, chrom, start, end) / 2.0;
            double ratio = tumorCount / normalCount * factor * librarySizeFactor;
            normalToks[3] = String.Format("{0}", ratio);
            writer.WriteLine(String.Join("\t", normalToks));
        }
    }
}
/// <summary>
/// Creates a reference generator for one sample and a set of control binned files after
/// checking that the sample file and every control file exist.
/// </summary>
/// <param name="sampleBinnedFile">Binned file of the sample.</param>
/// <param name="controlBinnedFiles">Binned files of the control samples.</param>
/// <param name="manifest">Manifest stored on the generator.</param>
/// <exception cref="FileNotFoundException">Thrown for the sample file, or the first control file, that does not exist.</exception>
public BestLR2ReferenceGenerator(IFileLocation sampleBinnedFile, IEnumerable<IFileLocation> controlBinnedFiles, NexteraManifest manifest)
{
    // The sample file is validated first, then the controls in enumeration order.
    if (!sampleBinnedFile.Exists)
    {
        throw new FileNotFoundException(sampleBinnedFile.FullName + " does not exist.");
    }

    foreach (var candidate in controlBinnedFiles)
    {
        if (candidate.Exists)
        {
            continue;
        }

        throw new FileNotFoundException(candidate.FullName + " does not exist.");
    }

    this._manifest = manifest;
    this._sampleBinnedFile = sampleBinnedFile;
    this._controlBinnedFiles = controlBinnedFiles;
}
/// <summary>
/// Perform GC normalization depending on the mode: <c>MedianByGC</c> uses the
/// median-based overload, <c>LOESS</c> runs a <c>LoessGCNormalizer</c> with zero
/// robustness iterations on log-transformed counts.
/// </summary>
/// <param name="bins">Bins whose counts are to be normalized</param>
/// <param name="manifest">Manifest forwarded to the selected normalizer.</param>
/// <param name="mode">GC normalization mode</param>
/// <exception cref="Illumina.Common.IlluminaException">The mode is neither <c>MedianByGC</c> nor <c>LOESS</c>.</exception>
static void NormalizeByGC(List<SampleGenomicBin> bins, NexteraManifest manifest, CanvasGCNormalizationMode mode)
{
    if (mode == CanvasGCNormalizationMode.MedianByGC)
    {
        NormalizeByGC(bins, manifest: manifest);
        return;
    }

    if (mode == CanvasGCNormalizationMode.LOESS)
    {
        // Counts are fitted in log space and mapped back with exp.
        var loess = new LoessGCNormalizer(
            bins,
            manifest,
            robustnessIter: 0,
            countTransformer: x => (double)Math.Log(x),
            invCountTransformer: x => (float)Math.Exp(x));
        loess.Normalize();
        return;
    }

    throw new Illumina.Common.IlluminaException("Unsupported Canvas GC normalization mode: " + mode.ToString());
}
/// <summary>
/// Creates a callset around an existing single-sample callset.
/// </summary>
/// <remarks>
/// <c>SomaticVcfPath</c> and <c>NormalBamPaths</c> are only assigned when the matching
/// argument is non-null. The normal BAMs are exposed as a lazy projection, so a new
/// <c>Bam</c> is created for each location every time the sequence is enumerated.
/// </remarks>
public CanvasCallset(
    SingleSampleCallset singleSampleCallset,
    AnalysisDetails analysisDetails,
    IEnumerable<IFileLocation> normalBamPaths,
    NexteraManifest manifest,
    IFileLocation somaticVcfPath)
{
    this.SingleSampleCallset = singleSampleCallset;
    this.Manifest = manifest;

    bool hasSomaticVcf = somaticVcfPath != null;
    if (hasSomaticVcf)
    {
        this.SomaticVcfPath = somaticVcfPath;
    }

    this.AnalysisDetails = analysisDetails;

    bool hasNormalBams = normalBamPaths != null;
    if (hasNormalBams)
    {
        this.NormalBamPaths = normalBamPaths.Select(location => new Bam(location));
    }
}
/// <summary>
/// Builds the tumor-normal callset from the parsed command-line options. The result VCF
/// is "CNV.vcf.gz" in the output directory, and the manifest is loaded from the
/// manifest option with <c>logger.Error</c> as its callback.
/// </summary>
/// <param name="logger">Logger whose <c>Error</c> method is handed to the manifest.</param>
/// <returns>The assembled callset, with the single normal BAM as its only normal.</returns>
private CanvasCallset GetCallset(ILogger logger)
{
    IFileLocation cnvVcf = CommonOptions.OutputDirectory.GetFileLocation("CNV.vcf.gz");
    var loadedManifest = new NexteraManifest(_manifest.FullName, null, logger.Error);

    return new CanvasCallset(
        _tumorNormalOptions.TumorBam,
        CommonOptions.SampleName,
        CommonOptions.WholeGenomeFasta,
        CommonOptions.OutputDirectory,
        CommonOptions.KmerFasta,
        CommonOptions.FilterBed,
        CommonOptions.PloidyBed,
        CommonOptions.BAlleleSites,
        CommonOptions.IsDbSnpVcf,
        new[] { _normalBam },
        loadedManifest,
        _tumorNormalOptions.SomaticVcf,
        cnvVcf);
}
/// <summary>
/// Builds the somatic-enrichment callset from the parsed command-line options. The
/// result VCF is "CNV.vcf.gz" in the output directory, the manifest is loaded from the
/// manifest option with <c>logger.Error</c> as its callback, and no somatic VCF is passed.
/// </summary>
/// <param name="logger">Logger whose <c>Error</c> method is handed to the manifest.</param>
/// <returns>The assembled callset, using the control BAMs as normals.</returns>
private CanvasCallset GetCallset(ILogger logger)
{
    IFileLocation cnvVcf = CommonOptions.OutputDirectory.GetFileLocation("CNV.vcf.gz");
    var loadedManifest = new NexteraManifest(_somaticEnrichmentOptions.Manifest.FullName, null, logger.Error);

    return new CanvasCallset(
        _somaticEnrichmentOptions.Bam,
        CommonOptions.SampleName,
        CommonOptions.WholeGenomeFasta,
        CommonOptions.OutputDirectory,
        CommonOptions.KmerFasta,
        CommonOptions.FilterBed,
        CommonOptions.PloidyBed,
        CommonOptions.BAlleleSites,
        CommonOptions.IsDbSnpVcf,
        _somaticEnrichmentOptions.ControlBams,
        loadedManifest,
        null, // no somatic VCF in this mode
        cnvVcf);
}
/// <summary>
/// Collects the counts of autosomal bins, both grouped by GC value and as one flat list.
/// Assumes the bins are sorted by genomic coordinates.
/// </summary>
/// <param name="bins">Bins whose counts are to be normalized</param>
/// <param name="manifest">When non-null, only the bins returned by <c>EnrichmentUtilities.GetOnTargetBins</c> are considered.</param>
/// <param name="countsByGC">An array of lists. Each array element (0-100) will hold a list of counts whose bins have the same GC content.</param>
/// <param name="counts">Will hold all of the autosomal counts present in 'bins'</param>
public static void GetCountsByGC(List<GenomicBin> bins, NexteraManifest manifest, out List<float>[] countsByGC, out List<float> counts)
{
    countsByGC = new List<float>[numberOfGCbins];
    counts = new List<float>(bins.Count);

    // One (initially empty) list per GC slot.
    for (int gc = 0; gc < countsByGC.Length; gc++)
    {
        countsByGC[gc] = new List<float>();
    }

    var candidateBins = manifest == null ? bins : EnrichmentUtilities.GetOnTargetBins(bins, manifest);
    foreach (GenomicBin candidate in candidateBins)
    {
        if (GenomeMetadata.SequenceMetadata.IsAutosome(candidate.Chromosome))
        {
            // File the count under its GC value and in the overall list.
            countsByGC[candidate.GC].Add(candidate.Count);
            counts.Add(candidate.Count);
        }
    }
}
/// <summary>
/// Get the on-target bins by intersecting the manifest: lazily yields, in input order,
/// each bin that overlaps a manifest region on the same chromosome.
/// </summary>
/// <remarks>
/// The region cursor only moves forward within a chromosome, so the scan relies on the
/// bins and the per-chromosome regions being in ascending coordinate order.
/// </remarks>
/// <param name="bins">Bins to filter (0-based coordinates).</param>
/// <param name="manifest">Manifest supplying the regions (1-based coordinates), grouped by chromosome.</param>
/// <returns>The bins that overlap a manifest region.</returns>
public static IEnumerable<GenomicBin> GetOnTargetBins(IEnumerable<GenomicBin> bins, NexteraManifest manifest)
{
    var regionsByChrom = manifest.GetManifestRegionsByChromosome();
    List<NexteraManifest.ManifestRegion> chromRegions = null; // 1-based regions
    string previousChrom = null;
    int cursor = -1;

    foreach (GenomicBin candidate in bins) // 0-based bins
    {
        if (previousChrom != candidate.Chromosome)
        {
            // New chromosome: switch to its region list (if any) and rewind the cursor.
            previousChrom = candidate.Chromosome;
            if (regionsByChrom.ContainsKey(previousChrom))
            {
                chromRegions = regionsByChrom[previousChrom];
                cursor = 0;
            }
            else
            {
                chromRegions = null;
            }
        }

        if (chromRegions == null)
        {
            continue; // no regions on this chromosome: off-target
        }

        // Skip regions that end before this bin begins (bin start converted to 1-based).
        while (cursor < chromRegions.Count && chromRegions[cursor].End < candidate.Start + 1)
        {
            cursor++;
        }

        // The first remaining region overlaps when it starts at or before the bin's stop.
        bool overlapsRegion = cursor < chromRegions.Count && chromRegions[cursor].Start <= candidate.Stop;
        if (overlapsRegion)
        {
            yield return candidate;
        }
    }
}
/// <summary>
/// Builds the somatic-enrichment callset from the parsed command-line options. The
/// result VCF is "CNV.vcf.gz" in the output directory, the manifest is loaded from the
/// manifest option with <c>logger.Error</c> as its callback, and no somatic VCF is passed.
/// </summary>
/// <param name="logger">Logger whose <c>Error</c> method is handed to the manifest.</param>
/// <returns>The assembled callset, using the control BAMs as normals.</returns>
private CanvasCallset GetCallset(ILogger logger)
{
    IFileLocation cnvVcf = CommonOptions.OutputDirectory.GetFileLocation("CNV.vcf.gz");
    var loadedManifest = new NexteraManifest(_somaticEnrichmentOptions.Manifest.FullName, null, logger.Error);

    return new CanvasCallset(
        _somaticEnrichmentOptions.Bam,
        CommonOptions.SampleName,
        CommonOptions.WholeGenomeFasta,
        CommonOptions.OutputDirectory,
        CommonOptions.KmerFasta,
        CommonOptions.FilterBed,
        CommonOptions.PloidyBed,
        CommonOptions.BAlleleSites,
        CommonOptions.IsDbSnpVcf,
        _somaticEnrichmentOptions.ControlBams,
        loadedManifest,
        null, // no somatic VCF in this mode
        cnvVcf);
}
/// <summary>
/// Writes one tab-separated line per manifest region: chromosome, start minus one, end.
/// </summary>
/// <param name="manifest">Manifest whose regions are written.</param>
/// <param name="writer">Destination writer.</param>
/// <param name="genome">
/// Optional genome metadata. When supplied, a copy of the regions is sorted with
/// <c>ManifestRegion.CompareTo</c> using a lookup from sequence name to its index in
/// <c>genome.Sequences</c>; when null, the manifest's own region order is used.
/// </param>
public static void WriteRegionBed(NexteraManifest manifest, BgzipOrStreamWriter writer, GenomeMetadata genome)
{
    List<NexteraManifest.ManifestRegion> orderedRegions;
    if (genome == null)
    {
        orderedRegions = manifest.Regions;
    }
    else
    {
        // Sort a copy so the manifest's own list is left untouched.
        orderedRegions = new List<NexteraManifest.ManifestRegion>(manifest.Regions);

        // Map each sequence name to its position in the genome metadata.
        var chromosomeRank = new Dictionary<string, int>();
        for (int rank = 0; rank < genome.Sequences.Count; rank++)
        {
            chromosomeRank[genome.Sequences[rank].Name] = rank;
        }

        orderedRegions.Sort((left, right) => left.CompareTo(right, chromosomeRank));
    }

    foreach (NexteraManifest.ManifestRegion region in orderedRegions)
    {
        // Start is shifted down by one; End is written unchanged.
        string bedLine = string.Format("{0}\t{1}\t{2}", region.Chromosome, region.Start - 1, region.End);
        writer.WriteLine(bedLine);
    }
}
/// <summary>
/// Perform a simple GC normalization: each bin's count is multiplied by the global median
/// count and divided by the median count of its GC value.
/// </summary>
/// <remarks>
/// GC values with fewer than <c>defaultMinNumberOfBinsPerGC</c> counts get a weighted
/// median computed from <c>GetWeightedCounts</c>. Bins whose GC median is null or not
/// positive are left unchanged.
/// </remarks>
/// <param name="bins">Bins whose counts are to be normalized.</param>
/// <param name="manifest">Optional manifest forwarded to <c>EnrichmentUtilities.GetCountsByGC</c>.</param>
static void NormalizeByGC(List<SampleGenomicBin> bins, NexteraManifest manifest = null)
{
    // DebugPrintCountsByGC(bins, "CountsByGC-Before.txt");

    // countsByGC: one list of counts per GC value (0-100).
    // counts: all of the autosomal counts present in 'bins'.
    List<float>[] countsByGC;
    List<float> counts;
    EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsByGC, out counts);

    double globalMedian = Utilities.Median(counts);

    // Compute the median count for each GC value.
    double?[] medianByGC = new double?[countsByGC.Length];
    for (int gc = 0; gc < countsByGC.Length; gc++)
    {
        if (countsByGC[gc].Count < defaultMinNumberOfBinsPerGC)
        {
            // Too few bins at this GC value: use the weighted median instead.
            List<Tuple<float, float>> weightedCounts = GetWeightedCounts(countsByGC, gc);
            medianByGC[gc] = Utilities.WeightedMedian(weightedCounts);
        }
        else
        {
            medianByGC[gc] = Utilities.Median(countsByGC[gc]);
        }
    }

    // Rescale each count by globalMedian / (median of bins with the same GC content).
    for (int binIndex = 0; binIndex < bins.Count; binIndex++)
    {
        double? gcMedian = medianByGC[bins[binIndex].GenomicBin.GC];
        if (gcMedian.HasValue && gcMedian.Value > 0)
        {
            bins[binIndex].Count = (float)(globalMedian * (double)bins[binIndex].Count / gcMedian.Value);
        }
    }

    // DebugPrintCountsByGC(bins, "CountsByGC-After.txt");
}
/// <summary>
/// Builds the somatic-enrichment callset from the parsed command-line options. The
/// result VCF is "CNV.vcf.gz" in the output directory, the manifest is loaded from the
/// manifest option with <c>logger.Error</c> as its callback, and no somatic VCF is passed.
/// </summary>
/// <param name="logger">Logger whose <c>Error</c> method is handed to the manifest.</param>
/// <returns>The assembled callset, using the control BAMs as normals.</returns>
private CanvasCallset GetCallset(ILogger logger)
{
    var details = new AnalysisDetails(
        CommonOptions.OutputDirectory,
        CommonOptions.WholeGenomeFasta,
        CommonOptions.KmerFasta,
        CommonOptions.FilterBed,
        SingleSampleCommonOptions.PloidyVcf,
        null);
    IFileLocation cnvVcf = CommonOptions.OutputDirectory.GetFileLocation("CNV.vcf.gz");
    var loadedManifest = new NexteraManifest(_somaticEnrichmentOptions.Manifest.FullName, null, logger.Error);

    // TODO: refactor and remove the following two lines
    loadedManifest.CanvasControlBinnedPath = _somaticEnrichmentOptions.ControlBinned?.FullName;
    loadedManifest.CanvasBinSize = _somaticEnrichmentOptions.ControlBinSize;

    return new CanvasCallset(
        _somaticEnrichmentOptions.Bam,
        SingleSampleCommonOptions.SampleName,
        SingleSampleCommonOptions.BAlleleSites,
        SingleSampleCommonOptions.IsDbSnpVcf,
        _somaticEnrichmentOptions.ControlBams,
        loadedManifest,
        null, // no somatic VCF in this mode
        cnvVcf,
        details);
}
/// <summary>
/// Runs normalization: builds the reference bin counts with the selected mode, then
/// writes the tumor/reference bin ratios.
/// </summary>
/// <param name="parameters">
/// Parsed parameters. <c>manifestPath</c> is optional; when set, the manifest is loaded
/// with <c>Console.WriteLine</c> as its callback.
/// </param>
/// <returns>Always 0.</returns>
/// <exception cref="Exception">The normalization mode is neither <c>BestLR2</c> nor <c>WeightedAverage</c>.</exception>
public static int Run(CanvasNormalizeParameters parameters)
{
    NexteraManifest manifest = null;
    if (!string.IsNullOrEmpty(parameters.manifestPath))
    {
        manifest = new NexteraManifest(parameters.manifestPath, null, Console.WriteLine);
    }

    // Step 1: produce the reference ("normal") bin counts.
    if (parameters.normalizationMode == CanvasNormalizeMode.BestLR2)
    {
        GetBestLR2BinCount(
            parameters.tumorBedPath,
            parameters.normalBedPaths,
            parameters.weightedAverageNormalBedPath,
            manifest: manifest);
    }
    else if (parameters.normalizationMode == CanvasNormalizeMode.WeightedAverage)
    {
        GetWeightedAverageBinCount(
            parameters.normalBedPaths,
            parameters.weightedAverageNormalBedPath,
            manifest: manifest);
    }
    else
    {
        throw new Exception(string.Format("Invalid CanvasNormalize mode '{0}'", parameters.normalizationMode));
    }

    // Step 2: tumor / reference ratios.
    GetBinRatio(
        parameters.tumorBedPath,
        parameters.weightedAverageNormalBedPath,
        parameters.outBedPath,
        parameters.ploidyBedPath,
        manifest: manifest);

    return 0;
}
/// <summary>
/// Stores the binned file path and optional manifest, then loads the bin counts.
/// </summary>
/// <param name="binnedPath">Path of the binned file.</param>
/// <param name="manifest">Optional manifest (default null).</param>
public BinCounts(string binnedPath, NexteraManifest manifest = null)
{
    this.BinnedPath = binnedPath;
    this.Manifest = manifest;

    // Loading runs after both properties have been assigned.
    this.LoadBinCounts();
}
/// <summary>
/// Writes the manifest in its text form: three "#" comment lines (type, region count,
/// date), a [Header] section with the optional reference genome, and a [Regions] section
/// with a header row followed by one tab-separated row per region.
/// </summary>
/// <remarks>
/// The header row contains the column names selected by <c>manifest.ColumnNumbers</c>
/// (indices outside <c>ColumnNames</c> are skipped). Header names are matched
/// case-insensitively. A header this method does not recognize produces an empty cell,
/// so every row has as many columns as the header row.
/// </remarks>
/// <param name="manifest">Manifest to write. A null <c>Regions</c> list is treated as empty.</param>
/// <param name="writer">Destination writer.</param>
public static void WriteNexteraManifests(NexteraManifest manifest, TextWriter writer)
{
    // Regions may be null (the region loop below allows for it), so the header count
    // must not dereference it unconditionally.
    int regionCount = manifest.Regions == null ? 0 : manifest.Regions.Count;

    writer.WriteLine("#{0}: {1}", "Manifest Type", "Regions");
    writer.WriteLine("#{0}: {1}", "Target Region Count", regionCount);
    writer.WriteLine("#{0}: {1}", "Date", DateTime.Now.ToShortDateString());
    writer.WriteLine("[Header]");
    //writer.WriteLine("Manifest Version\t1.0");
    if (!string.IsNullOrEmpty(manifest.GenomeName))
    {
        writer.WriteLine("ReferenceGenome\t" + manifest.GenomeName);
    }
    writer.WriteLine("[Regions]");

    List<string> headers = new List<string>();
    if (manifest.ColumnNames != null && manifest.ColumnNames.Length > 0)
    {
        foreach (int columnNumber in manifest.ColumnNumbers)
        {
            if (columnNumber >= 0 && columnNumber < manifest.ColumnNames.Length)
            {
                headers.Add(manifest.ColumnNames[columnNumber]);
            }
        }
        writer.WriteLine(string.Join("\t", headers.ToArray()));
    }

    if (regionCount == 0)
    {
        return;
    }

    foreach (NexteraManifest.ManifestRegion region in manifest.Regions)
    {
        NexteraManifest.ManifestRegion tmpRegion = new NexteraManifest.ManifestRegion(region);
        List<string> line = new List<string>(headers.Count);
        foreach (string header in headers)
        {
            switch (header.ToLowerInvariant())
            {
                case "name":
                    line.Add(tmpRegion.Name);
                    break;
                case "chromosome":
                    line.Add(tmpRegion.Chromosome);
                    break;
                case "start":
                case "amplicon start":
                    line.Add(tmpRegion.Start.ToString());
                    break;
                case "end":
                case "amplicon end":
                    line.Add(tmpRegion.End.ToString());
                    break;
                case "startprobelength":
                case "upstream probe length":
                    line.Add(tmpRegion.StartProbeLength.ToString());
                    break;
                case "endprobelength":
                case "downstream probe length":
                    line.Add(tmpRegion.EndProbeLength.ToString());
                    break;
                case "groupname":
                case "group name":
                case "group":
                case "ip group":
                    line.Add(tmpRegion.GroupName);
                    break;
                default:
                    // Unknown column: emit an empty cell so later values stay under
                    // their own headers instead of shifting left.
                    line.Add(string.Empty);
                    break;
            }
        }
        writer.WriteLine(string.Join("\t", line.ToArray()));
    }
}
/// <summary>
/// Builds a new manifest from <paramref name="manifest"/> whose regions are limited to those that
/// have an entry in <paramref name="regionStats"/> (matched by region name), with Start and End
/// replaced by the StartPosition and EndPosition of the matching statistics.
/// </summary>
/// <param name="manifest">Source manifest; it is passed to the NexteraManifest copy constructor.</param>
/// <param name="regionStats">Per-region statistics; region names must be unique.</param>
/// <returns>
/// The new manifest. When it has no regions (null or empty) its region list is left untouched.
/// </returns>
/// <exception cref="ArgumentException">Two statistics share the same RegionName.</exception>
public static NexteraManifest GetUpdatedNexteraManifestsWithNewRegions(NexteraManifest manifest, List<RegionStatistics> regionStats)
{
    NexteraManifest updatedManifest = new NexteraManifest(manifest);

    // Index the statistics by region name; Add (not the indexer) so duplicates still fail loudly.
    var statsByRegionName = new Dictionary<string, RegionStatistics>(regionStats.Count);
    foreach (RegionStatistics regionStat in regionStats)
    {
        statsByRegionName.Add(regionStat.RegionName, regionStat);
    }

    if (updatedManifest.Regions == null || updatedManifest.Regions.Count == 0)
    {
        return updatedManifest;
    }

    var newRegions = new List<NexteraManifest.ManifestRegion>();
    foreach (NexteraManifest.ManifestRegion region in updatedManifest.Regions)
    {
        // Single lookup; regions without statistics are dropped before any copy is made.
        RegionStatistics stats;
        if (!statsByRegionName.TryGetValue(region.Name, out stats))
        {
            continue;
        }

        NexteraManifest.ManifestRegion updatedRegion = new NexteraManifest.ManifestRegion(region);
        updatedRegion.Start = stats.StartPosition;
        updatedRegion.End = stats.EndPosition;
        newRegions.Add(updatedRegion);
    }
    updatedManifest.Regions = newRegions;
    return updatedManifest;
}
/// <summary>
/// Calculates how many possible alignments correspond to the desired number of observed alignments per bin.
/// </summary>
/// <param name="countsPerBin">Desired number of observed alignments per bin.</param>
/// <param name="possibleAlignments">BitArrays of possible alignments (unique mers).</param>
/// <param name="observedAlignments">BitArrays storing the observed alignments.</param>
/// <param name="manifest">
/// Optional manifest. When supplied, its regions are passed to the bit-counting calls and
/// chromosomes without any manifest region are skipped.
/// </param>
/// <returns>
/// <paramref name="countsPerBin"/> divided by the median observed/possible rate over the
/// autosomes, truncated to an integer.
/// </returns>
/// <exception cref="ApplicationException">
/// No autosome yielded a usable rate, or the median rate is not positive, so no meaningful
/// bin size can be derived.
/// </exception>
static int CalculateNumberOfPossibleAlignmentsPerBin(int countsPerBin, Dictionary<string, BitArray> possibleAlignments, Dictionary<string, HitArray> observedAlignments, NexteraManifest manifest = null)
{
    List<double> rates = new List<double>();

    Dictionary<string, List<NexteraManifest.ManifestRegion>> manifestRegionsByChrom = null;
    if (manifest != null)
    {
        manifestRegionsByChrom = manifest.GetManifestRegionsByChromosome();
    }

    List<ThreadStart> tasks = new List<ThreadStart>();
    foreach (string chr in possibleAlignments.Keys)
    {
        // We don't want to include the sex chromosomes because they may not be copy number 2
        if (!GenomeMetadata.SequenceMetadata.IsAutosome(chr))
        {
            continue;
        }

        HitArray observed = observedAlignments[chr];
        BitArray possible = possibleAlignments[chr];

        // With a manifest, only chromosomes that carry at least one region are considered.
        List<NexteraManifest.ManifestRegion> regions = null;
        if (manifestRegionsByChrom != null && !manifestRegionsByChrom.TryGetValue(chr, out regions))
        {
            continue;
        }

        tasks.Add(new ThreadStart(() =>
        {
            int numberObserved = observed.CountSetBits(regions);
            int numberPossible = CountSetBits(possible, regions);

            // A chromosome without possible alignments would contribute NaN (0/0) or
            // Infinity (n/0) and corrupt the median, so it contributes nothing instead.
            if (numberPossible == 0)
            {
                return;
            }

            double rate = numberObserved / (double)numberPossible;
            lock (rates)
            {
                rates.Add(rate);
            }
        }));
    }

    Console.WriteLine("Launch CalculateNumberOfPossibleAlignmentsPerBin jobs...");
    Console.Out.WriteLine();
    // TODO: allow controlling the degree of parallelism
    Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(tasks);
    Console.WriteLine("CalculateNumberOfPossibleAlignmentsPerBin jobs complete.");
    Console.Out.WriteLine();

    if (rates.Count == 0)
    {
        throw new ApplicationException("Cannot calculate the number of possible alignments per bin: no autosome with possible alignments was found.");
    }

    double medianRate = CanvasCommon.Utilities.Median(rates);
    if (!(medianRate > 0))
    {
        // Dividing by a zero (or NaN) rate and casting to int would yield an unspecified value.
        throw new ApplicationException("Cannot calculate the number of possible alignments per bin: the median alignment rate is not positive.");
    }
    return (int)(countsPerBin / medianRate);
}
/// <summary>
/// Perform variance stabilization by GC bins: counts in GC bins whose interquartile range (IQR)
/// is large compared with the genome-wide IQR are pulled towards the median of their GC bin.
/// </summary>
/// <param name="bins">Bins whose counts are to be normalized; counts are updated in place.</param>
/// <param name="manifest">Optional manifest forwarded to EnrichmentUtilities.GetCountsByGC.</param>
/// <returns>
/// false (bins untouched) when no GC bin with index 10..89 satisfies the trigger condition;
/// true once the rescaling pass has been run.
/// </returns>
static bool NormalizeVarianceByGC(List<GenomicBin> bins, NexteraManifest manifest = null)
{
    // countsPerGC[g] receives the counts of the bins with GC content g; allCounts the pooled counts.
    List<float>[] countsPerGC;
    List<float> allCounts;
    EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsPerGC, out allCounts);

    // Quartiles over the pooled counts
    var pooledQuartiles = Utilities.Quartiles(allCounts);

    // Quartiles and IQR for every GC bin, indexed by GC content
    int gcBinCount = countsPerGC.Length;
    var iqrPerGC = new List<float>(gcBinCount);
    var quartilesPerGC = new List<Tuple<float, float, float>>(gcBinCount);
    for (int gc = 0; gc < gcBinCount; gc++)
    {
        List<float> gcCounts = countsPerGC[gc];
        Tuple<float, float, float> gcQuartiles;
        float gcIqr;

        if (gcCounts.Count == 0)
        {
            // Empty GC bin: -1 placeholders (a negative IQR never triggers rescaling below)
            gcQuartiles = new Tuple<float, float, float>(-1f, -1f, -1f);
            gcIqr = -1f;
        }
        else if (gcCounts.Count >= defaultMinNumberOfBinsPerGC)
        {
            gcQuartiles = Utilities.Quartiles(gcCounts);
            gcIqr = gcQuartiles.Item3 - gcQuartiles.Item1;
        }
        else
        {
            // Too few bins for this GC value: use weighted quantiles of the weighted counts instead
            List<Tuple<float, float>> weightedCounts = GetWeightedCounts(countsPerGC, gc);
            double[] weightedQuartiles = Utilities.WeightedQuantiles(weightedCounts, new List<float>() { 0.25f, 0.5f, 0.75f });
            gcQuartiles = new Tuple<float, float, float>((float)weightedQuartiles[0], (float)weightedQuartiles[1], (float)weightedQuartiles[2]);
            gcIqr = (float)(weightedQuartiles[2] - weightedQuartiles[0]);
        }

        quartilesPerGC.Add(gcQuartiles);
        iqrPerGC.Add(gcIqr);
    }

    float globalIQR = pooledQuartiles.Item3 - pooledQuartiles.Item1;

    // Count the GC bins (indices 10..89) for which the global IQR is below twice the local IQR
    int triggeringGCBins = 0;
    for (int gc = 10; gc < 90; gc++)
    {
        if (globalIQR < iqrPerGC[gc] * 2f)
        {
            triggeringGCBins++;
        }
    }
    if (triggeringGCBins == 0)
    {
        return false;
    }

    // Shrink each count's deviation from its GC-bin median by the ratio of the
    // (0.8-scaled) local IQR to the global IQR, wherever that ratio exceeds 1.
    foreach (GenomicBin bin in bins)
    {
        var gc = bin.GC;
        var scaledLocalIqr = iqrPerGC[gc] * 0.8f;
        if (globalIQR >= scaledLocalIqr)
        {
            continue;
        }

        float iqrRatio = scaledLocalIqr / globalIQR;
        var gcMedianCount = quartilesPerGC[gc].Item2;
        bin.Count = gcMedianCount + (bin.Count - gcMedianCount) / iqrRatio;
    }

    return true;
}
/// <summary>
/// Writes the manifest to the file at <paramref name="path"/> by delegating to the
/// TextWriter overload; the file writer is disposed before returning.
/// </summary>
/// <param name="manifest">Manifest to write.</param>
/// <param name="path">Path handed to the StreamWriter constructor.</param>
public static void WriteNexteraManifests(NexteraManifest manifest, string path)
{
    using (var fileWriter = new StreamWriter(path))
    {
        WriteNexteraManifests(manifest, fileWriter);
    }
}
/// <summary>
/// Get the on-target bins by intersecting the manifest: lazily yields, in input order, every
/// bin that overlaps a manifest region on the same chromosome.
/// </summary>
/// <param name="bins">
/// Bins to filter (0-based coordinates). The region cursor only moves forward within a
/// chromosome, so bins and regions are presumably expected in ascending order - confirm with callers.
/// </param>
/// <param name="manifest">Manifest providing the targeted regions (1-based coordinates).</param>
/// <returns>The bins that overlap a targeted region.</returns>
public static IEnumerable<SampleGenomicBin> GetOnTargetBins(IEnumerable<SampleGenomicBin> bins, NexteraManifest manifest)
{
    var regionsByChrom = manifest.GetManifestRegionsByChromosome();

    string activeChrom = null;
    List<NexteraManifest.ManifestRegion> chromRegions = null; // regions of activeChrom; null when it has none
    int cursor = -1;                                          // first region that may still overlap

    foreach (SampleGenomicBin bin in bins)
    {
        string binChrom = bin.GenomicBin.Chromosome;
        if (activeChrom != binChrom)
        {
            // New chromosome: switch region list and restart the cursor when regions exist
            activeChrom = binChrom;
            if (regionsByChrom.TryGetValue(activeChrom, out chromRegions))
            {
                cursor = 0;
            }
        }

        if (chromRegions == null)
        {
            continue; // nothing targeted on this chromosome
        }

        // Skip regions ending before the bin starts (bin.Start + 1 converts to 1-based)
        while (cursor < chromRegions.Count && chromRegions[cursor].End < bin.Start + 1)
        {
            cursor++;
        }

        // The first remaining region overlaps when it starts no later than the bin stops
        if (cursor < chromRegions.Count && chromRegions[cursor].Start <= bin.Stop)
        {
            yield return bin;
        }
    }
}
/// <summary>
/// Perform a simple GC normalization: every count is multiplied by the global median and
/// divided by the median count of its GC bin.
/// </summary>
/// <param name="bins">Bins whose counts are to be normalized; counts are updated in place.</param>
/// <param name="manifest">Optional manifest forwarded to EnrichmentUtilities.GetCountsByGC.</param>
static void NormalizeByGC(List<GenomicBin> bins, NexteraManifest manifest = null)
{
    // countsPerGC[g] receives the counts of the bins with GC content g; allCounts the pooled counts.
    List<float>[] countsPerGC;
    List<float> allCounts;
    EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsPerGC, out allCounts);

    double globalMedian = Utilities.Median(allCounts);

    // Median per GC bin; a weighted median is used when the GC bin holds too few counts.
    double?[] medianPerGC = new double?[countsPerGC.Length];
    for (int gc = 0; gc < countsPerGC.Length; gc++)
    {
        List<float> gcCounts = countsPerGC[gc];
        if (gcCounts.Count < defaultMinNumberOfBinsPerGC)
        {
            List<Tuple<float, float>> weightedCounts = GetWeightedCounts(countsPerGC, gc);
            medianPerGC[gc] = Utilities.WeightedMedian(weightedCounts);
        }
        else
        {
            medianPerGC[gc] = Utilities.Median(gcCounts);
        }
    }

    // Rescale each bin; bins whose GC median is missing or not positive keep their count.
    foreach (GenomicBin bin in bins)
    {
        double? gcMedian = medianPerGC[bin.GC];
        if (gcMedian != null && gcMedian > 0)
        {
            bin.Count = (float)(globalMedian * (double)bin.Count / gcMedian);
        }
    }
}
/// <summary>
/// CanvasClean entry point: parses the command line, reads the binned input, applies the
/// requested filters and GC normalization, and writes the cleaned bins.
/// </summary>
/// <param name="args">Command-line arguments.</param>
/// <returns>
/// 1 when the input file does not exist; 0 otherwise (including when help is shown because it
/// was requested or because the input/output file option is missing).
/// </returns>
static int Main(string[] args)
{
    Utilities.LogCommandLine(args);

    string inputPath = null;
    string outputPath = null;
    string ffpeOutliersPath = null;
    string manifestPath = null;
    bool gcNormRequested = false;
    bool sizeFilterRequested = false;
    bool outlierRemovalRequested = false;
    bool helpRequested = false;
    CanvasGCNormalizationMode gcNormalizationMode = CanvasGCNormalizationMode.MedianByGC;
    string modeDescription = String.Format("gc normalization mode. Available modes: {0}. Default: {1}",
        String.Join(", ", Enum.GetValues(typeof(CanvasGCNormalizationMode)).Cast<CanvasGCNormalizationMode>()),
        gcNormalizationMode);

    OptionSet options = new OptionSet()
    {
        { "i|infile=", "input file - usually generated by CanvasBin", v => inputPath = v },
        { "o|outfile=", "text file to output containing cleaned bins", v => outputPath = v },
        { "g|gcnorm", "perform GC normalization", v => gcNormRequested = v != null },
        { "s|filtsize", "filter out genomically large bins", v => sizeFilterRequested = v != null },
        { "r|outliers", "filter outlier points", v => outlierRemovalRequested = v != null },
        { "f|ffpeoutliers=", "filter regions of FFPE biases", v => ffpeOutliersPath = v },
        { "t|manifest=", "Nextera manifest file", v => manifestPath = v },
        { "w|weightedmedian=", "Minimum number of bins per GC required to calculate weighted median", v => minNumberOfBinsPerGCForWeightedMedian = int.Parse(v) },
        { "m|mode=", modeDescription, v => gcNormalizationMode = Utilities.ParseCanvasGCNormalizationMode(v) },
        { "h|help", "show this message and exit", v => helpRequested = v != null },
    };
    options.Parse(args);

    // Help is shown both on request and when a required path is missing; either way the exit code is 0.
    if (helpRequested || inputPath == null || outputPath == null)
    {
        ShowHelp(options);
        return 0;
    }

    if (!File.Exists(inputPath))
    {
        Console.WriteLine("CanvasClean.exe: File {0} does not exist! Exiting.", inputPath);
        return 1;
    }

    List<SampleGenomicBin> bins = CanvasIO.ReadFromTextFile(inputPath);

    if (outlierRemovalRequested)
    {
        bins = RemoveOutliers(bins);
    }
    if (sizeFilterRequested)
    {
        bins = RemoveBigBins(bins);
    }

    // FFPE outlier removal is disabled for targeted/low coverage data (fewer than 50000 bins)
    if (ffpeOutliersPath != null && bins.Count < 50000)
    {
        ffpeOutliersPath = null;
    }

    // The local SD is estimated before GC normalization, written to the FFPE outliers file,
    // and reused by the FFPE outlier removal at the end.
    double localSD = -1.0;
    if (ffpeOutliersPath != null)
    {
        localSD = getLocalStandardDeviation(bins);
        CanvasIO.WriteLocalSDToTextFile(ffpeOutliersPath, localSD);
    }

    if (gcNormRequested)
    {
        NexteraManifest manifest = null;
        if (manifestPath != null)
        {
            manifest = new NexteraManifest(manifestPath, null, Console.WriteLine);
        }

        // Only the MedianByGC mode strips bins with extreme GC content first
        List<SampleGenomicBin> candidateBins = bins;
        if (gcNormalizationMode == CanvasGCNormalizationMode.MedianByGC)
        {
            candidateBins = RemoveBinsWithExtremeGC(bins, defaultMinNumberOfBinsPerGC, manifest: manifest);
        }

        if (candidateBins.Count == 0)
        {
            Console.Error.WriteLine("Warning in CanvasClean: Coverage too low to perform GC correction; proceeding without GC correction");
        }
        else
        {
            bins = candidateBins;
            NormalizeByGC(bins, manifest, gcNormalizationMode);

            // Variance normalization is only used on large exome panels and whole genome
            // sequencing (more than 500000 bins). When it changed the counts, NormalizeByGC
            // is run again to re-center them.
            if (ffpeOutliersPath != null && bins.Count > 500000 && NormalizeVarianceByGC(bins, manifest: manifest))
            {
                NormalizeByGC(bins, manifest, gcNormalizationMode);
            }
        }
    }

    if (ffpeOutliersPath != null)
    {
        // threshold 20 is derived to separate FF and noisy FFPE samples
        // (derived from a training set of approx. 40 samples)
        bins = RemoveBinsWithExtremeLocalSD(bins, localSD, 20, outputPath);
    }

    CanvasIO.WriteToTextFile(outputPath, bins);
    return 0;
}
/// <summary>
/// Bin alignments: reads the reference, optionally derives per-position read GC content and the
/// observed-vs-expected GC correction, then bins every chromosome in parallel.
/// </summary>
/// <param name="referenceFile">Reference fasta file.</param>
/// <param name="binSize">Desired number of alignments per bin.</param>
/// <param name="coverageMode">Coverage mode; GC content is only computed for GCContentWeighted.</param>
/// <param name="manifest">Manifest forwarded to ComputeObservedVsExpectedGC (GCContentWeighted only).</param>
/// <param name="possibleAlignments">BitArrays of possible alignments.</param>
/// <param name="observedAlignments">BitArrays of observed alignments.</param>
/// <param name="fragmentLengths">Per-chromosome fragment length at each position; 0 is replaced by the mean fragment size.</param>
/// <param name="predefinedBins">Pre-defined bins. null if not available; otherwise chromosomes missing from it are skipped.</param>
/// <param name="outFile">Output path forwarded to ComputeObservedVsExpectedGC.</param>
/// <returns>A list of bins, concatenated per chromosome in reference order.</returns>
static List<GenomicBin> BinCounts(string referenceFile, int binSize, CanvasCoverageMode coverageMode, NexteraManifest manifest, Dictionary<string, BitArray> possibleAlignments, Dictionary<string, HitArray> observedAlignments, Dictionary<string, Int16[]> fragmentLengths, Dictionary<string, List<GenomicBin>> predefinedBins, string outFile)
{
    bool debugGCCorrection = false; // write value of GC bins and correction factor
    bool gcWeighted = coverageMode == CanvasCoverageMode.GCContentWeighted;

    Int16 meanFragmentSize = 0;
    Int16 meanFragmentCutoff = 3;
    if (gcWeighted)
    {
        meanFragmentSize = MeanFragmentSize(fragmentLengths);
    }

    // Load every reference sequence, remembering the order in which they appear.
    var referenceOrder = new List<string>();
    var sequenceByName = new Dictionary<string, GenericRead>();
    using (FastaReader fasta = new FastaReader(referenceFile))
    {
        GenericRead entry = new GenericRead();
        while (fasta.GetNextEntry(ref entry))
        {
            referenceOrder.Add(entry.Name);
            sequenceByName[entry.Name] = entry;
            entry = new GenericRead();
        }
    }

    // GC content of the forward read starting at every position along the genome
    var readGCContent = new Dictionary<string, byte[]>();
    if (gcWeighted)
    {
        byte gcCap = (byte)numberOfGCbins;
        var gcTasks = new List<ThreadStart>();
        foreach (KeyValuePair<string, Int16[]> chromosomeLengths in fragmentLengths)
        {
            string chr = chromosomeLengths.Key;
            Int16[] lengths = chromosomeLengths.Value;
            GenericRead sequence = sequenceByName[chr];
            gcTasks.Add(new ThreadStart(() =>
            {
                // GC content of the forward read at every position of this chromosome
                byte[] gcContent = new byte[sequence.Bases.Length];
                int maxFragment = meanFragmentSize * meanFragmentCutoff;
                int stop = sequence.Bases.Length - maxFragment - 1;
                for (int pos = 0; pos < stop; pos++)
                {
                    // Unknown (0) lengths fall back to the mean; long fragments are capped
                    Int16 fragment = lengths[pos] == 0
                        ? meanFragmentSize
                        : Convert.ToInt16(Math.Min(lengths[pos], maxFragment));

                    int gcBases = 0;
                    for (int offset = pos; offset < pos + fragment; offset++)
                    {
                        char nucleotide = sequence.Bases[offset];
                        if (nucleotide == 'C' || nucleotide == 'c' || nucleotide == 'G' || nucleotide == 'g')
                        {
                            gcBases++;
                        }
                    }
                    gcContent[pos] = (byte)Math.Min(100 * gcBases / fragment, gcCap);
                }

                lock (readGCContent)
                {
                    readGCContent[chr] = gcContent;
                }
            }));
        }

        Console.WriteLine("{0} Launching normalization tasks.", DateTime.Now);
        Console.Out.Flush();
        Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(gcTasks);
        Console.WriteLine("{0} Normalization tasks complete.", DateTime.Now);
        Console.Out.Flush();
    }

    // Observed vs expected read counts per GC bin (empty unless GC weighted)
    float[] observedVsExpectedGC = gcWeighted
        ? ComputeObservedVsExpectedGC(observedAlignments, readGCContent, manifest, debugGCCorrection, outFile)
        : new float[0];

    // One binning task per chromosome that has possible alignments (and predefined bins, when given)
    var binsByChromosome = new Dictionary<string, List<GenomicBin>>();
    var binningTasks = new List<ThreadStart>();
    foreach (KeyValuePair<string, GenericRead> sequenceEntry in sequenceByName)
    {
        string chr = sequenceEntry.Key;
        if (!possibleAlignments.ContainsKey(chr))
        {
            continue;
        }
        if (predefinedBins != null && !predefinedBins.ContainsKey(chr))
        {
            continue;
        }

        BinTaskArguments taskArguments = new BinTaskArguments();
        taskArguments.FastaEntry = sequenceEntry.Value;
        taskArguments.Chromosome = chr;
        taskArguments.PossibleAlignments = possibleAlignments[chr];
        taskArguments.ObservedAlignments = observedAlignments[chr];
        taskArguments.CoverageMode = coverageMode;

        List<GenomicBin> chromosomeBins = predefinedBins == null ? new List<GenomicBin>() : predefinedBins[chr];
        binsByChromosome[chr] = chromosomeBins;
        taskArguments.Bins = chromosomeBins;
        taskArguments.BinSize = binSize;
        taskArguments.ReadGCContent = gcWeighted ? readGCContent[chr] : null;
        taskArguments.ObservedVsExpectedGC = observedVsExpectedGC;

        binningTasks.Add(new ThreadStart(() => { BinCountsForChromosome(taskArguments); }));
    }

    Console.WriteLine("{0} Launch BinCountsForChromosome jobs...", DateTime.Now);
    Console.Out.WriteLine();
    Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(binningTasks);
    Console.WriteLine("{0} Completed BinCountsForChromosome jobs.", DateTime.Now);
    Console.Out.WriteLine();

    // Concatenate the per-chromosome bins in reference order
    var allBins = new List<GenomicBin>();
    foreach (string chr in referenceOrder)
    {
        List<GenomicBin> chromosomeBins;
        if (binsByChromosome.TryGetValue(chr, out chromosomeBins))
        {
            allBins.AddRange(chromosomeBins);
        }
    }
    return allBins;
}
/// <summary>
/// Reads a tab-delimited, gzip-compressed binned file (chromosome, start, stop, count) and
/// records which bins overlap a manifest region.
/// </summary>
/// <param name="binnedPath">Path of the binned file, opened with GzipReader.</param>
/// <param name="manifest">Manifest providing the targeted regions (1-based coordinates); must not be null.</param>
/// <param name="binCounts">Receives the count (4th column) of every bin, in file order.</param>
/// <param name="onTargetIndices">Receives the 0-based indices into <paramref name="binCounts"/> of the on-target bins.</param>
private static void LoadBinCounts(string binnedPath, NexteraManifest manifest, out List<double> binCounts, out List<int> onTargetIndices)
{
    binCounts = new List<double>();
    onTargetIndices = new List<int>();

    var regionsByChrom = manifest.GetManifestRegionsByChromosome();
    string activeChrom = null;
    List<NexteraManifest.ManifestRegion> chromRegions = null; // regions of activeChrom; null when it has none
    int cursor = -1;                                          // first region that may still overlap

    using (GzipReader reader = new GzipReader(binnedPath))
    {
        for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
        {
            string[] fields = row.Split('\t');
            string chrom = fields[0];
            int start = int.Parse(fields[1]); // 0-based, inclusive
            int stop = int.Parse(fields[2]);  // 0-based, exclusive

            if (activeChrom != chrom)
            {
                // New chromosome: switch region list and restart the cursor when regions exist
                activeChrom = chrom;
                if (regionsByChrom.TryGetValue(activeChrom, out chromRegions))
                {
                    cursor = 0;
                }
            }

            if (chromRegions != null)
            {
                // Skip regions ending before the bin starts (start + 1 converts to 1-based)
                while (cursor < chromRegions.Count && chromRegions[cursor].End < start + 1)
                {
                    cursor++;
                }

                // Overlap: the first remaining region starts no later than the bin stops.
                // binCounts.Count is the index this bin is about to receive.
                if (cursor < chromRegions.Count && chromRegions[cursor].Start <= stop)
                {
                    onTargetIndices.Add(binCounts.Count);
                }
            }

            binCounts.Add(double.Parse(fields[3]));
        }
    }
}
/// <summary>
/// Creates the calculator and stores the given manifest in the _manifest field.
/// </summary>
/// <param name="manifest">Manifest to store; no null check is performed here.</param>
public LSNormRatioCalculator(NexteraManifest manifest)
{
    _manifest = manifest;
}
/// <summary>
/// Pick the best normal control that has the smallest mean squared log-ratios (LR2s) against the
/// tumor sample, and copy its binned file to <paramref name="bestBinnedPath"/>.
/// </summary>
/// <param name="tumorBinnedPath">Binned file of the tumor sample.</param>
/// <param name="normalBinnedPaths">Binned files of the candidate normal samples; at least one is required.</param>
/// <param name="bestBinnedPath">Destination of the selected normal's binned file; overwritten when it exists.</param>
/// <param name="manifest">Optional manifest forwarded to the BinCounts constructor.</param>
/// <exception cref="ArgumentException"><paramref name="normalBinnedPaths"/> is empty.</exception>
private static void GetBestLR2BinCount(string tumorBinnedPath, IEnumerable<string> normalBinnedPaths, string bestBinnedPath, NexteraManifest manifest = null)
{
    // Materialize once: the sequence is indexed repeatedly below.
    List<string> normalPaths = normalBinnedPaths.ToList();
    if (normalPaths.Count == 0)
    {
        throw new ArgumentException("At least one normal binned path is required.", "normalBinnedPaths");
    }

    // Index 0 is both the single-normal answer and the fallback when no normal
    // produces a finite mean squared log-ratio (e.g. NaN for every sample).
    int bestNormalSampleIndex = 0;
    if (normalPaths.Count > 1)
    {
        List<double[]> scaledNormalCounts = new List<double[]>(normalPaths.Count);
        foreach (string normalPath in normalPaths)
        {
            scaledNormalCounts.Add(LoadMedianScaledOnTargetCounts(normalPath, manifest));
        }
        double[] scaledTumorCounts = LoadMedianScaledOnTargetCounts(tumorBinnedPath, manifest);

        // TODO: Skip a (bad) normal sample if too many bins were ignored (Item2 of the result).
        // Donavan's script skips a normal sample if more than 100 log ratios is NA.
        // The cut-off is likely panel-dependent.
        double minMeanSquaredLogRatios = double.PositiveInfinity;
        for (int normalSampleIndex = 0; normalSampleIndex < scaledNormalCounts.Count; normalSampleIndex++)
        {
            double meanSquaredLogRatios = GetMeanSquaredLogRatios(scaledTumorCounts, scaledNormalCounts[normalSampleIndex]).Item1;
            if (meanSquaredLogRatios < minMeanSquaredLogRatios)
            {
                minMeanSquaredLogRatios = meanSquaredLogRatios;
                bestNormalSampleIndex = normalSampleIndex;
            }
        }
    }

    // Copy the selected file. A missing source file is silently skipped, as before.
    string srcBinnedPath = normalPaths[bestNormalSampleIndex];
    if (File.Exists(srcBinnedPath))
    {
        File.Copy(srcBinnedPath, bestBinnedPath, true);
    }
}

/// <summary>
/// Loads the on-target counts of a binned file and divides them by the on-target median bin count.
/// </summary>
/// <remarks>
/// The on-target median is used because, for small panels, a median over all bins could be 0.
/// A non-positive median yields a weight of 0 rather than a division by zero.
/// </remarks>
private static double[] LoadMedianScaledOnTargetCounts(string binnedPath, NexteraManifest manifest)
{
    var binCounts = new BinCounts(binnedPath, manifest: manifest);
    List<double> counts = binCounts.OnTargetCounts;
    double median = binCounts.OnTargetMedianBinCount;
    double weight = median > 0 ? 1.0 / median : 0;
    return counts.Select(cnt => cnt * weight).ToArray();
}
/// <summary>
/// Perform GC normalization depending on the mode: MedianByGC delegates to the median-based
/// overload, LOESS runs a LoessGCNormalizer on log-transformed counts.
/// </summary>
/// <param name="bins">Bins whose counts are to be normalized</param>
/// <param name="manifest">Manifest forwarded to the selected normalizer.</param>
/// <param name="mode">GC normalization mode</param>
/// <exception cref="ApplicationException">The mode is neither MedianByGC nor LOESS.</exception>
static void NormalizeByGC(List<GenomicBin> bins, NexteraManifest manifest, CanvasGCNormalizationMode mode)
{
    if (mode == CanvasGCNormalizationMode.MedianByGC)
    {
        NormalizeByGC(bins, manifest: manifest);
        return;
    }

    if (mode == CanvasGCNormalizationMode.LOESS)
    {
        // Counts are fitted in log space and mapped back with Exp.
        var loessNormalizer = new LoessGCNormalizer(
            bins,
            manifest,
            robustnessIter: 0,
            countTransformer: x => (double)Math.Log(x),
            invCountTransformer: x => (float)Math.Exp(x));
        loessNormalizer.Normalize();
        return;
    }

    throw new ApplicationException("Unsupported Canvas GC normalization mode: " + mode.ToString());
}
/// <summary>
/// Perform variance stabilization by GC bins: counts in GC bins whose interquartile range (IQR)
/// is large compared with the genome-wide IQR are pulled towards the median of their GC bin.
/// </summary>
/// <param name="bins">Bins whose counts are to be normalized; counts are updated in place.</param>
/// <param name="manifest">Optional manifest forwarded to EnrichmentUtilities.GetCountsByGC.</param>
/// <returns>
/// false (bins untouched) when no GC bin with index 10..89 satisfies the trigger condition;
/// true once the rescaling pass has been run.
/// </returns>
static bool NormalizeVarianceByGC(List<SampleGenomicBin> bins, NexteraManifest manifest = null)
{
    // countsPerGC[g] receives the counts of the bins with GC content g; allCounts the pooled counts.
    List<float>[] countsPerGC;
    List<float> allCounts;
    EnrichmentUtilities.GetCountsByGC(bins, manifest, out countsPerGC, out allCounts);

    // Quartiles over the pooled counts
    var pooledQuartiles = Utilities.Quartiles(allCounts);

    // Quartiles and IQR for every GC bin, indexed by GC content
    int gcBinCount = countsPerGC.Length;
    var iqrPerGC = new List<float>(gcBinCount);
    var quartilesPerGC = new List<Tuple<float, float, float>>(gcBinCount);
    for (int gc = 0; gc < gcBinCount; gc++)
    {
        List<float> gcCounts = countsPerGC[gc];
        Tuple<float, float, float> gcQuartiles;
        float gcIqr;

        if (gcCounts.Count == 0)
        {
            // Empty GC bin: -1 placeholders (a negative IQR never triggers rescaling below)
            gcQuartiles = new Tuple<float, float, float>(-1f, -1f, -1f);
            gcIqr = -1f;
        }
        else if (gcCounts.Count >= defaultMinNumberOfBinsPerGC)
        {
            gcQuartiles = Utilities.Quartiles(gcCounts);
            gcIqr = gcQuartiles.Item3 - gcQuartiles.Item1;
        }
        else
        {
            // Too few bins for this GC value: use weighted quantiles of the weighted counts instead
            List<Tuple<float, float>> weightedCounts = GetWeightedCounts(countsPerGC, gc);
            double[] weightedQuartiles = Utilities.WeightedQuantiles(weightedCounts, new List<float>() { 0.25f, 0.5f, 0.75f });
            gcQuartiles = new Tuple<float, float, float>((float)weightedQuartiles[0], (float)weightedQuartiles[1], (float)weightedQuartiles[2]);
            gcIqr = (float)(weightedQuartiles[2] - weightedQuartiles[0]);
        }

        quartilesPerGC.Add(gcQuartiles);
        iqrPerGC.Add(gcIqr);
    }

    float globalIQR = pooledQuartiles.Item3 - pooledQuartiles.Item1;

    // Count the GC bins (indices 10..89) for which the global IQR is below twice the local IQR
    int triggeringGCBins = 0;
    for (int gc = 10; gc < 90; gc++)
    {
        if (globalIQR < iqrPerGC[gc] * 2f)
        {
            triggeringGCBins++;
        }
    }
    if (triggeringGCBins == 0)
    {
        return false;
    }

    // Shrink each count's deviation from its GC-bin median by the ratio of the
    // (0.8-scaled) local IQR to the global IQR, wherever that ratio exceeds 1.
    foreach (SampleGenomicBin bin in bins)
    {
        var gc = bin.GenomicBin.GC;
        var scaledLocalIqr = iqrPerGC[gc] * 0.8f;
        if (globalIQR >= scaledLocalIqr)
        {
            continue;
        }

        float iqrRatio = scaledLocalIqr / globalIQR;
        var gcMedianCount = quartilesPerGC[gc].Item2;
        bin.Count = gcMedianCount + (bin.Count - gcMedianCount) / iqrRatio;
    }

    return true;
}
/// <summary>
/// Remove bins with extreme GC content, i.e. bins whose GC value is shared by too few
/// (on-target, autosomal) bins.
/// </summary>
/// <remarks>
/// GC normalization computes a median count for each possible GC value. If only a few bins
/// have a given GC value, that normalization constant is unstable and those bins are dropped.
/// </remarks>
/// <param name="bins">Genomic bins from which GC content outliers are filtered out.</param>
/// <param name="threshold">
/// Minimum number of bins with the same GC content required to keep a bin. It is lowered to the
/// average number of counted bins per GC value (itself floored at
/// minNumberOfBinsPerGCForWeightedMedian) when that is smaller.
/// </param>
/// <param name="manifest">Optional manifest; when given, only on-target bins are counted.</param>
/// <returns>A new list holding the retained bins in their original order.</returns>
static List<GenomicBin> RemoveBinsWithExtremeGC(List<GenomicBin> bins, int threshold, NexteraManifest manifest = null)
{
    // Bins that take part in the tally: all of them, or only the on-target ones
    IEnumerable<GenomicBin> talliedBins;
    if (manifest == null)
    {
        talliedBins = bins;
    }
    else
    {
        talliedBins = EnrichmentUtilities.GetOnTargetBins(bins, manifest);
    }

    // Number of tallied autosomal bins for each possible GC content
    int[] binsPerGC = new int[EnrichmentUtilities.numberOfGCbins];
    double talliedTotal = 0;
    foreach (GenomicBin bin in talliedBins)
    {
        // Only autosomal bins are counted because the normalization factor is computed on them.
        if (GenomeMetadata.SequenceMetadata.IsAutosome(bin.Chromosome))
        {
            binsPerGC[bin.GC]++;
            talliedTotal++;
        }
    }

    int averagePerGC = (int)(talliedTotal / binsPerGC.Length);
    int minimumPerGC = Math.Min(threshold, Math.Max(minNumberOfBinsPerGCForWeightedMedian, averagePerGC));

    // Keep every bin (tallied or not) whose GC value is populated well enough
    return bins.FindAll(bin => binsPerGC[bin.GC] >= minimumPerGC);
}
/// <summary>
/// Computes fragment-based GC normalization correction factor
/// </summary>
/// <param name="observedAlignments">Per-chromosome observed alignment data; chromosomes missing here are skipped.</param>
/// <param name="readGCContent">Per-chromosome read GC content at every position, used as the GC bin index.</param>
/// <param name="manifest">Optional manifest; when given, only positions inside its regions are tallied.</param>
/// <param name="debugGC">When true, the per-GC-bin tallies and factors are written to outFile + ".gcstat".</param>
/// <param name="outFile">Base path of the debug output.</param>
/// <returns>
/// An array of observed vs expected GC counts: for each GC bin, (observed / expected) scaled by
/// (total expected / total observed). All factors are 1 when nothing was observed or expected,
/// because the ratio is undefined in that case.
/// </returns>
static float[] ComputeObservedVsExpectedGC(Dictionary<string, HitArray> observedAlignments, Dictionary<string, byte[]> readGCContent, NexteraManifest manifest, bool debugGC, string outFile)
{
    Dictionary<string, List<NexteraManifest.ManifestRegion>> regionsByChrom = null;
    if (manifest != null)
    {
        regionsByChrom = manifest.GetManifestRegionsByChromosome();
    }

    long[] expectedReadCountsByGC = new long[numberOfGCbins];
    long[] observedReadCountsByGC = new long[numberOfGCbins];

    foreach (KeyValuePair<string, byte[]> chromosomeReadGCContent in readGCContent)
    {
        string chr = chromosomeReadGCContent.Key;
        HitArray chromosomeHits;
        if (!observedAlignments.TryGetValue(chr, out chromosomeHits))
        {
            continue;
        }

        byte[] gcContent = chromosomeReadGCContent.Value;

        if (manifest == null) // look at the entire genome
        {
            // Hoisted: the per-base loop must not repeat the dictionary lookup.
            var observedData = chromosomeHits.Data;
            for (int i = 0; i < gcContent.Length; i++)
            {
                expectedReadCountsByGC[gcContent[i]]++;
                observedReadCountsByGC[gcContent[i]] += observedData[i];
            }
        }
        else // look at only the targeted regions
        {
            List<NexteraManifest.ManifestRegion> regions;
            if (!regionsByChrom.TryGetValue(chr, out regions))
            {
                continue;
            }

            var observedData = chromosomeHits.Data;
            int i = -1;
            foreach (var region in regions)
            {
                // Only jump forward; positions already covered by an overlapping region are not recounted.
                if (i < region.Start)
                {
                    i = region.Start - 1; // i is 0-based; manifest coordinates are 1-based.
                }
                for (; i < gcContent.Length && i < region.End; i++)
                {
                    expectedReadCountsByGC[gcContent[i]]++;
                    observedReadCountsByGC[gcContent[i]] += observedData[i];
                }
            }
        }
    }

    // Neutral correction (1) by default
    float[] observedVsExpectedGC = new float[numberOfGCbins];
    for (int i = 0; i < numberOfGCbins; i++)
    {
        observedVsExpectedGC[i] = 1;
    }

    long sumObserved = 0;
    long sumExpected = 0;
    foreach (long gcContent in observedReadCountsByGC)
    {
        sumObserved += gcContent;
    }
    foreach (long gcContent in expectedReadCountsByGC)
    {
        sumExpected += gcContent;
    }

    // Ratio of observed to expected read counts for each read GC bin. With an empty total the
    // ratio would be Infinity/NaN (or 0) for every bin, so the neutral factors are kept instead.
    if (sumObserved > 0 && sumExpected > 0)
    {
        for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
        {
            // Empty bins are treated as holding a single read to avoid division by zero
            if (expectedReadCountsByGC[binIndex] == 0)
            {
                expectedReadCountsByGC[binIndex] = 1;
            }
            if (observedReadCountsByGC[binIndex] == 0)
            {
                observedReadCountsByGC[binIndex] = 1;
            }
            observedVsExpectedGC[binIndex] = ((float)observedReadCountsByGC[binIndex] / (float)expectedReadCountsByGC[binIndex]) * ((float)sumExpected / (float)sumObserved);
        }
    }

    if (debugGC)
    {
        using (GzipWriter writer = new GzipWriter(outFile + ".gcstat"))
        {
            for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
            {
                writer.WriteLine(string.Format("{0}\t{1}\t{2}", expectedReadCountsByGC[binIndex], observedReadCountsByGC[binIndex], observedVsExpectedGC[binIndex]));
            }
        }
    }

    return observedVsExpectedGC;
}