/// <summary>
/// Merges the binned counts of one or more samples into a single binned file.
/// A single input is copied unchanged (when it exists). Several inputs are combined into a
/// per-bin weighted average in which each sample is weighted by the reciprocal of its
/// on-target median bin count, with the weights normalized to sum to 1.
/// </summary>
/// <param name="binnedPaths">Paths of the per-sample binned files. The first file supplies the
/// line layout of the output; count i of every sample is applied to line i of that file.</param>
/// <param name="mergedBinnedPath">Path of the merged output file.</param>
/// <param name="manifest">Optional manifest forwarded to <c>BinCounts</c>.</param>
private static void GetWeightedAverageBinCount(IEnumerable<string> binnedPaths, string mergedBinnedPath, NexteraManifest manifest = null)
{
    // Materialize once: the sequence is needed several times and may be lazily evaluated.
    List<string> paths = binnedPaths.ToList();
    int sampleCount = paths.Count;

    if (sampleCount == 1) // copy file
    {
        string onlyPath = paths[0];
        if (File.Exists(onlyPath))
        {
            if (File.Exists(mergedBinnedPath))
            {
                File.Delete(mergedBinnedPath);
            }
            File.Copy(onlyPath, mergedBinnedPath);
        }
        return;
    }

    // merge normal samples
    double[] weights = new double[sampleCount];
    List<double>[] binCountsBySample = new List<double>[sampleCount];
    for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
    {
        var binCounts = new BinCounts(paths[sampleIndex], manifest: manifest);
        List<double> counts = binCounts.AllCounts;

        // If a manifest is available, get the median of bins overlapping the targeted regions only.
        // For small panels, there could be a lot of bins with zero count and the median would be 0
        // if taken over all the bins, resulting in division by zero.
        double median = binCounts.OnTargetMedianBinCount;
        weights[sampleIndex] = median > 0 ? 1.0 / median : 0;
        binCountsBySample[sampleIndex] = counts;
    }

    double weightSum = weights.Sum();
    if (weightSum > 0)
    {
        for (int i = 0; i < sampleCount; i++)
        {
            weights[i] /= weightSum; // so weights sum to 1
        }
    }
    else
    {
        // No sample has a positive median, so every weight is 0 and normalizing would give 0/0 = NaN.
        // Fall back to an unweighted average instead of writing NaN counts.
        for (int i = 0; i < sampleCount; i++)
        {
            weights[i] = 1.0 / sampleCount;
        }
    }

    // Compute the weighted average of bin counts across samples.
    // First() keeps the InvalidOperationException raised for an empty input sequence.
    using (GzipReader reader = new GzipReader(paths.First()))
    using (GzipWriter writer = new GzipWriter(mergedBinnedPath))
    {
        string line;
        int lineIdx = 0;
        while ((line = reader.ReadLine()) != null)
        {
            string[] toks = line.Split('\t');
            double weightedBinCount = 0;
            for (int i = 0; i < sampleCount; i++)
            {
                weightedBinCount += weights[i] * binCountsBySample[i][lineIdx];
            }

            // Column 4 (index 3) holds the count; all other columns are passed through.
            toks[3] = String.Format("{0}", weightedBinCount);
            writer.WriteLine(String.Join("\t", toks));
            lineIdx++;
        }
    }
}
/// <summary>
/// Writes the given bins through a <c>GzipWriter</c> as tab-delimited text, one bin per line:
/// chromosome, start, stop, count (two decimal places) and GC.
/// </summary>
/// <param name="outfile">Path handed to the <c>GzipWriter</c>.</param>
/// <param name="bins">Bins to write, in output order.</param>
public static void WriteToTextFile(string outfile, List<GenomicBin> bins)
{
    using (var output = new GzipWriter(outfile))
    {
        foreach (var current in bins)
        {
            string row = string.Format(
                "{0}\t{1}\t{2}\t{3:F2}\t{4}",
                current.Chromosome,
                current.Start,
                current.Stop,
                current.Count,
                current.GC);
            output.WriteLine(row);
        }
    }
}
/// <summary>
/// Step 3: Summarize results to a simple tab-delimited file.
/// Writes a header line followed by one line per variant whose variant count exceeds 5,
/// then logs the output location to the console.
/// </summary>
/// <param name="outputPath">Path handed to the <c>GzipWriter</c>.</param>
protected void WriteResults(string outputPath)
{
    using (var output = new GzipWriter(outputPath))
    {
        output.WriteLine("#Chromosome\tPosition\tRef\tAlt\tCountRef\tCountAlt");

        for (int i = 0; i < Variants.Count; i++)
        {
            VcfVariant variant = Variants[i];

            // skip HOM REF positions
            bool hasEnoughVariantReads = VariantCounts[i] > 5;
            if (!hasEnoughVariantReads)
            {
                continue;
            }

            string row = string.Format(
                "{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                variant.ReferenceName,
                variant.ReferencePosition,
                variant.ReferenceAllele,
                variant.VariantAlleles[0],
                ReferenceCounts[i],
                VariantCounts[i]);
            output.WriteLine(row);
        }
    }

    Console.WriteLine("{0} Results written to {1}", DateTime.Now, outputPath);
}
/// <summary>
/// Writes one tab-delimited line per bin (chromosome, start, end, score, segment number)
/// through a <c>GzipWriter</c>. The segment number starts at -1 and is incremented each time
/// <c>IsNewSegment</c> reports that the current bin opens a new segment.
/// </summary>
/// <param name="outPath">Path handed to the <c>GzipWriter</c>.</param>
/// <param name="segmentationResults">Segmentation whose segment starts mark the breakpoints.</param>
public void WriteCanvasPartitionResults(string outPath, GenomeSegmentationResults segmentationResults)
{
    // "chr:start" key of every segment; consumed by IsNewSegment.
    Dictionary<string, bool> starts = new Dictionary<string, bool>();
    foreach (string chr in segmentationResults.SegmentByChr.Keys)
    {
        foreach (Segmentation.Segment segment in segmentationResults.SegmentByChr[chr])
        {
            starts[chr + ":" + segment.start] = true;
        }
    }

    Dictionary<string, List<SampleGenomicBin>> excludedIntervals = new Dictionary<string, List<SampleGenomicBin>>();
    if (!string.IsNullOrEmpty(ForbiddenIntervalBedPath))
    {
        excludedIntervals = CanvasCommon.Utilities.LoadBedFile(ForbiddenIntervalBedPath);
    }

    using (GzipWriter writer = new GzipWriter(outPath))
    {
        int segmentNum = -1;
        foreach (string chr in StartByChr.Keys)
        {
            // Stays null when the chromosome has no excluded intervals.
            List<SampleGenomicBin> excludeIntervals;
            excludedIntervals.TryGetValue(chr, out excludeIntervals);

            int excludeIndex = 0; // Points to the first interval which *doesn't* end before our current position
            uint previousBinEnd = 0;

            // Look the per-chromosome arrays up once instead of once per bin.
            var binStarts = StartByChr[chr];
            var binEnds = EndByChr[chr];
            var binScores = ScoreByChr[chr];

            for (int pos = 0; pos < binStarts.Length; pos++)
            {
                uint start = binStarts[pos];
                uint end = binEnds[pos];
                string key = chr + ":" + start;
                bool newSegment = IsNewSegment(starts, key, excludeIntervals, previousBinEnd, end, start, ref excludeIndex);
                if (newSegment)
                {
                    segmentNum++;
                }

                // The interpolated string is already final text; running it through string.Format
                // again would re-parse it as a format string and throw on any literal brace.
                writer.WriteLine($"{chr}\t{start}\t{end}\t{binScores[pos]}\t{segmentNum}");
                previousBinEnd = end;
            }
        }
    }
}
/// <summary>
/// Writes one tab-delimited line per bin of every segment through a <c>GzipWriter</c>:
/// chromosome, bin start, bin end, bin coverage and the identifier of the owning segment.
/// </summary>
/// <param name="outPath">Path handed to the <c>GzipWriter</c>.</param>
/// <param name="segmentsByChromosome">Segments (with their bins) keyed by chromosome name.</param>
public void WriteCanvasPartitionResults(string outPath, Dictionary<string, List<SegmentWithBins>> segmentsByChromosome)
{
    using (var writer = new GzipWriter(outPath))
    {
        // Enumerate key/value pairs directly rather than looking each key up again.
        foreach (KeyValuePair<string, List<SegmentWithBins>> chromosomeSegments in segmentsByChromosome)
        {
            string chr = chromosomeSegments.Key;
            foreach (var segment in chromosomeSegments.Value)
            {
                foreach (var bin in segment.Bins)
                {
                    // The interpolated string is already final text; running it through string.Format
                    // again would re-parse it as a format string and throw on any literal brace.
                    writer.WriteLine($"{chr}\t{bin.Start}\t{bin.End}\t{bin.Coverage}\t{segment.Identifier}");
                }
            }
        }
    }
}
/// <summary>
/// Writes, for every bin of the normal file whose count is at least 1, the tumor/normal count
/// ratio scaled by the reference ploidy, <c>CanvasDiploidBinRatioFactor</c> and a library size
/// factor (normal median / tumor median, or 1 when either median is not positive).
/// The tumor and normal files are read in lockstep, line by line.
/// </summary>
/// <param name="tumorBinnedPath">Binned counts of the tumor sample.</param>
/// <param name="normalBinnedPath">Binned counts of the normal sample; its lines drive the loop and supply the output columns.</param>
/// <param name="ratioBinnedPath">Output path for the ratio file.</param>
/// <param name="ploidyBedPath">Optional ploidy BED file; when null or empty, a null ploidy is passed to <c>GetPloidy</c>.</param>
/// <param name="manifest">Optional manifest forwarded to <c>BinCounts</c>.</param>
/// <exception cref="System.IO.InvalidDataException">The tumor file has fewer lines than the normal file.</exception>
private static void GetBinRatio(string tumorBinnedPath, string normalBinnedPath, string ratioBinnedPath, string ploidyBedPath, NexteraManifest manifest = null)
{
    PloidyInfo referencePloidy = String.IsNullOrEmpty(ploidyBedPath) ? null : PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);
    double tumorMedian = (new BinCounts(tumorBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
    double normalMedian = (new BinCounts(normalBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
    double librarySizeFactor = (tumorMedian > 0 && normalMedian > 0) ? normalMedian / tumorMedian : 1;

    using (GzipReader tumorReader = new GzipReader(tumorBinnedPath))
    using (GzipReader normalReader = new GzipReader(normalBinnedPath))
    using (GzipWriter writer = new GzipWriter(ratioBinnedPath))
    {
        string normalLine;
        int lineNumber = 0;
        while ((normalLine = normalReader.ReadLine()) != null)
        {
            lineNumber++;
            string tumorLine = tumorReader.ReadLine();
            if (tumorLine == null)
            {
                // Previously this surfaced as a NullReferenceException on tumorLine.Split.
                throw new System.IO.InvalidDataException(string.Format(
                    "Tumor bin file '{0}' ended before normal bin file '{1}' at line {2}; both files must have the same number of lines.",
                    tumorBinnedPath, normalBinnedPath, lineNumber));
            }

            string[] normalToks = normalLine.Split('\t');
            string[] tumorToks = tumorLine.Split('\t');
            double normalCount = double.Parse(normalToks[3]);
            double tumorCount = double.Parse(tumorToks[3]);

            // The weighted average count of a bin could be less than 1.
            // Using these small counts for coverage normalization creates large ratios.
            // It would be better to just drop these bins so we don't introduce too much noise
            // into segmentation and CNV calling.
            if (normalCount < 1)
            {
                continue; // skip the bin
            }

            string chrom = normalToks[0];
            int start = int.Parse(normalToks[1]);
            int end = int.Parse(normalToks[2]);

            // get the normal ploidy from intervalsWithPloidyByChrom
            double factor = CanvasDiploidBinRatioFactor * GetPloidy(referencePloidy, chrom, start, end) / 2.0;
            double ratio = tumorCount / normalCount * factor * librarySizeFactor;

            // Column 4 (index 3) is replaced by the ratio; the other normal columns pass through.
            normalToks[3] = String.Format("{0}", ratio);
            writer.WriteLine(String.Join("\t", normalToks));
        }
    }
}
/// <summary>
/// Writes one tab-delimited line per bin (chromosome, start, end, score, segment number)
/// through a <c>GzipWriter</c>. A bin opens a new segment when any of the following holds:
/// its start is a segment start in <c>SegmentationResults</c>; the midpoint of the current
/// forbidden interval lies after the previous bin's end and at or before this bin's end;
/// or the gap to the previous bin exceeds <c>MaxInterBinDistInSegment</c> (when that is non-negative).
/// The segment number starts at -1 and is incremented for each new segment.
/// </summary>
/// <param name="outPath">Path handed to the <c>GzipWriter</c>.</param>
private void WriteCanvasPartitionResults(string outPath)
{
    // "chr:start" key of every segment produced by segmentation.
    HashSet<string> segmentStarts = new HashSet<string>();
    foreach (string chr in SegmentationResults.SegmentByChr.Keys)
    {
        foreach (Segment segment in SegmentationResults.SegmentByChr[chr])
        {
            segmentStarts.Add(chr + ":" + segment.start);
        }
    }

    Dictionary<string, List<GenomicBin>> excludedIntervalsByChr = new Dictionary<string, List<GenomicBin>>();
    if (!string.IsNullOrEmpty(ForbiddenIntervalBedPath))
    {
        excludedIntervalsByChr = CanvasCommon.Utilities.LoadBedFile(ForbiddenIntervalBedPath);
    }

    using (GzipWriter writer = new GzipWriter(outPath))
    {
        int segmentNum = -1;
        foreach (string chr in StartByChr.Keys)
        {
            // Stays null when the chromosome has no excluded intervals.
            List<GenomicBin> excludeIntervals;
            excludedIntervalsByChr.TryGetValue(chr, out excludeIntervals);

            int excludeIndex = 0; // Points to the first interval which *doesn't* end before our current position
            uint previousBinEnd = 0;

            // Look the per-chromosome arrays up once instead of once per bin.
            var binStarts = StartByChr[chr];
            var binEnds = EndByChr[chr];
            var binScores = ScoreByChr[chr];

            for (int pos = 0; pos < binStarts.Length; pos++)
            {
                uint start = binStarts[pos];
                uint end = binEnds[pos];
                bool newSegment = segmentStarts.Contains(chr + ":" + start);

                if (excludeIntervals != null)
                {
                    while (excludeIndex < excludeIntervals.Count && excludeIntervals[excludeIndex].Stop < previousBinEnd)
                    {
                        excludeIndex++;
                    }
                    if (excludeIndex < excludeIntervals.Count)
                    {
                        // Note: forbiddenZoneMid should never fall inside a bin, because these intervals were
                        // already excluded from consideration during the call to CanvasBin.
                        int forbiddenZoneMid = (excludeIntervals[excludeIndex].Start + excludeIntervals[excludeIndex].Stop) / 2;
                        if (previousBinEnd < forbiddenZoneMid && end >= forbiddenZoneMid)
                        {
                            newSegment = true;
                        }
                    }
                }

                // Break the segment when the gap to the previous bin is too large.
                if (previousBinEnd > 0 && MaxInterBinDistInSegment >= 0 && previousBinEnd + MaxInterBinDistInSegment < start)
                {
                    newSegment = true;
                }

                if (newSegment)
                {
                    segmentNum++;
                }
                writer.WriteLine(string.Format("{0}\t{1}\t{2}\t{3}\t{4}", chr, start, end, binScores[pos], segmentNum));
                previousBinEnd = end;
            }
        }
    }
}
/// <summary>
/// Invoke CanvasSNV: builds one job per autosome/allosome, runs the jobs through the work
/// manager, then concatenates the per-chromosome outputs into <c>callset.VfSummaryPath</c>.
/// Only the first header line (a line starting with '#') is kept; a missing per-chromosome
/// output is reported on the console and skipped.
/// </summary>
/// <param name="callset">Callset providing the genome metadata, input paths, temp folder and summary path.</param>
protected void InvokeCanvasSnv(CanvasCallset callset)
{
    var jobs = new List<UnitOfWork>();
    var perChromosomeOutputs = new List<string>();
    GenomeMetadata genomeMetadata = callset.GenomeMetadata;
    string tumorBamPath = callset.Bam.BamFile.FullName;
    string normalVcfPath = callset.NormalVcfPath.FullName;

    foreach (GenomeMetadata.SequenceMetadata chromosome in genomeMetadata.Sequences)
    {
        // Only invoke for autosomes + allosomes;
        // don't invoke it for mitochondrial chromosome or extra contigs or decoys
        bool eligible = chromosome.Type == GenomeMetadata.SequenceType.Allosome || chromosome.IsAutosome();
        if (!eligible)
        {
            continue;
        }

        UnitOfWork job = new UnitOfWork();
        job.ExecutablePath = Path.Combine(_canvasFolder, "CanvasSNV.exe");
        if (CrossPlatform.IsThisMono())
        {
            // Under Mono the executable becomes the first command-line argument of the Mono runtime.
            job.CommandLine = job.ExecutablePath;
            job.ExecutablePath = Utilities.GetMonoPath();
        }

        string chromosomeOutput = Path.Combine(
            callset.TempFolder,
            string.Format("{0}-{1}.SNV.txt.gz", chromosome.Name, callset.Id));
        perChromosomeOutputs.Add(chromosomeOutput);

        job.CommandLine += $" {chromosome.Name} {normalVcfPath} {tumorBamPath} {chromosomeOutput}";
        if (_customParameters.ContainsKey("CanvasSNV"))
        {
            job.CommandLine = Utilities.MergeCommandLineOptions(job.CommandLine, _customParameters["CanvasSNV"], true);
        }

        job.LoggingFolder = _workManager.LoggingFolder.FullName;
        job.LoggingStub = string.Format("CanvasSNV-{0}-{1}", callset.Id, chromosome.Name);
        jobs.Add(job);
    }
    Console.WriteLine("Invoking {0} processor jobs...", jobs.Count);

    // Invoke CanvasSNV jobs:
    Console.WriteLine(">>>CanvasSNV start...");
    _workManager.DoWorkParallelThreads(jobs);
    Console.WriteLine(">>>CanvasSNV complete!");

    // Concatenate CanvasSNV results:
    using (GzipWriter summary = new GzipWriter(callset.VfSummaryPath))
    {
        bool headerWritten = false;
        foreach (string chromosomeOutput in perChromosomeOutputs)
        {
            if (!File.Exists(chromosomeOutput))
            {
                Console.WriteLine("Error: Expected output file not found at {0}", chromosomeOutput);
                continue;
            }

            using (GzipReader reader = new GzipReader(chromosomeOutput))
            {
                string fileLine;
                while ((fileLine = reader.ReadLine()) != null)
                {
                    bool isHeader = fileLine.Length > 0 && fileLine[0] == '#';
                    if (isHeader)
                    {
                        if (headerWritten)
                        {
                            continue; // header already emitted
                        }
                        headerWritten = true;
                    }
                    summary.WriteLine(fileLine);
                }
            }
        }
    }
}
/// <summary>
/// Intersect bins with the targeted regions defined in callset.Manifest.
/// The partitioned file is renamed with a ".raw" suffix and re-written at its original path
/// with one line per (bin, overlapping region) intersection; bins without overlap are dropped.
/// Assumes that the targeted regions don't intersect, the bins are sorted by genomic location and the bins don't intersect.
/// </summary>
/// <param name="callset">Callset whose manifest supplies the targeted regions.</param>
/// <param name="partitionedPath">Output of CanvasPartition. Bins are assumed to be sorted</param>
/// <returns><paramref name="partitionedPath"/>, unchanged when the file does not exist.</returns>
private IFileLocation IntersectBinsWithTargetedRegions(CanvasCallset callset, IFileLocation partitionedPath)
{
    if (!partitionedPath.Exists)
    {
        return partitionedPath;
    }

    // Resolve the manifest regions before renaming anything, so that a failure here
    // does not leave the partitioned file moved to its ".raw" name.
    Dictionary<string, List<NexteraManifest.ManifestRegion>> manifestRegionsByChrom = callset.Manifest.GetManifestRegionsByChromosome();

    var rawPartitionedPath = partitionedPath.AppendName(".raw");
    if (rawPartitionedPath.Exists)
    {
        rawPartitionedPath.Delete();
    }
    partitionedPath.MoveTo(rawPartitionedPath);

    // CanvasPartition output file is in the BED format
    //   start: 0-based, inclusive
    //   end: 0-based, exclusive
    // Manifest
    //   start: 1-based, inclusive
    //   end: 1-based, inclusive
    using (GzipReader reader = new GzipReader(rawPartitionedPath.FullName))
    using (GzipWriter writer = new GzipWriter(partitionedPath.FullName))
    {
        string currentChrom = null;
        List<NexteraManifest.ManifestRegion> regions = null; // regions of currentChrom, null when it has none
        int manifestRegionIdx = 0;
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            string[] toks = line.Split('\t');
            string chrom = toks[0];
            int start = int.Parse(toks[1]) + 1; // 1-based, inclusive
            int end = int.Parse(toks[2]); // 1-based, inclusive

            if (chrom != currentChrom)
            {
                currentChrom = chrom;
                manifestRegionIdx = 0;
                if (!manifestRegionsByChrom.TryGetValue(currentChrom, out regions))
                {
                    regions = null;
                }
            }
            if (regions == null)
            {
                continue;
            }

            // Skip regions that end before the bin:   |- manifest region -| |- bin -|
            while (manifestRegionIdx < regions.Count && regions[manifestRegionIdx].End < start)
            {
                manifestRegionIdx++;
            }

            // Emit one line per overlapping region. The loop body is not entered (bin skipped) when
            // no region is left or the next region starts after the bin:   |- bin -| |- manifest region -|
            while (manifestRegionIdx < regions.Count && end >= regions[manifestRegionIdx].Start)
            {
                var region = regions[manifestRegionIdx];

                // calculate intersection
                int intersectionStart = Math.Max(start, region.Start); // 1-based, inclusive
                int intersectionEnd = Math.Min(end, region.End); // 1-based, inclusive

                // start/end in BED format
                toks[1] = String.Format("{0}", intersectionStart - 1); // 0-based, inclusive
                toks[2] = String.Format("{0}", intersectionEnd); // 0-based, exclusive

                // write intersected bin
                writer.WriteLine(String.Join("\t", toks));

                if (region.End > end)
                {
                    // The region continues past this bin, so it may also overlap the following bins.
                    // Keep it as the current region instead of stepping over it. Since regions do not
                    // intersect, no further region can start inside this bin.
                    break;
                }
                manifestRegionIdx++;
            }
        }
    }
    return partitionedPath;
}
/// <summary>
/// Computes fragment-based GC normalization correction factor.
/// For every GC bin, positions are tallied (expected) together with the alignments observed at
/// those positions, over the whole genome or, when a manifest is given, over the targeted regions only.
/// Each factor is (observed / expected) for the bin, scaled by (total expected / total observed).
/// </summary>
/// <param name="observedAlignments">Per-chromosome alignment data, indexed by 0-based position.</param>
/// <param name="readGCContent">Per-chromosome GC bin index for each 0-based position.</param>
/// <param name="manifest">Optional manifest restricting the computation to targeted regions.</param>
/// <param name="debugGC">When true, the per-bin expected count, observed count and factor are written to <paramref name="outFile"/> + ".gcstat".</param>
/// <param name="outFile">Base path of the debug output.</param>
/// <returns>An array of observed vs expected GC counts, one entry per GC bin. All entries are 1
/// when no positions or no alignments were counted.</returns>
static float[] ComputeObservedVsExpectedGC(Dictionary<string, HitArray> observedAlignments, Dictionary<string, byte[]> readGCContent, NexteraManifest manifest, bool debugGC, string outFile)
{
    Dictionary<string, List<NexteraManifest.ManifestRegion>> regionsByChrom = null;
    if (manifest != null)
    {
        regionsByChrom = manifest.GetManifestRegionsByChromosome();
    }

    long[] expectedReadCountsByGC = new long[numberOfGCbins];
    long[] observedReadCountsByGC = new long[numberOfGCbins];

    foreach (KeyValuePair<string, byte[]> chromosomeReadGCContent in readGCContent)
    {
        string chr = chromosomeReadGCContent.Key;
        byte[] gcByPosition = chromosomeReadGCContent.Value;

        HitArray hits;
        if (!observedAlignments.TryGetValue(chr, out hits))
        {
            continue;
        }

        if (manifest == null) // look at the entire genome
        {
            for (int i = 0; i < gcByPosition.Length; i++)
            {
                expectedReadCountsByGC[gcByPosition[i]]++;
                observedReadCountsByGC[gcByPosition[i]] += hits.Data[i];
            }
        }
        else // look at only the targeted regions
        {
            List<NexteraManifest.ManifestRegion> regions;
            if (!regionsByChrom.TryGetValue(chr, out regions))
            {
                continue;
            }

            // i carries over between regions so that positions shared by overlapping
            // regions are counted only once.
            int i = -1;
            foreach (var region in regions)
            {
                if (i < region.Start) // avoid overlapping targeted regions
                {
                    i = region.Start - 1; // i is 0-based; manifest coordinates are 1-based.
                }
                for (; i < gcByPosition.Length && i < region.End; i++)
                {
                    expectedReadCountsByGC[gcByPosition[i]]++;
                    observedReadCountsByGC[gcByPosition[i]] += hits.Data[i];
                }
            }
        }
    }

    // calculate ratio of observed to expected read counts for each read GC bin
    float[] observedVsExpectedGC = new float[numberOfGCbins];
    for (int i = 0; i < numberOfGCbins; i++)
    {
        observedVsExpectedGC[i] = 1;
    }

    // Totals are taken before the empty bins are clamped to 1 below.
    long sumObserved = 0;
    long sumExpected = 0;
    foreach (long gcContent in observedReadCountsByGC)
    {
        sumObserved += gcContent;
    }
    foreach (long gcContent in expectedReadCountsByGC)
    {
        sumExpected += gcContent;
    }

    // With nothing counted, the scaling term would divide by zero and yield Infinity/NaN factors;
    // in that case the neutral factor of 1 set above is kept for every bin.
    bool hasCounts = sumObserved > 0 && sumExpected > 0;

    for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
    {
        // Clamp empty bins to 1 so the per-bin ratio is defined.
        if (expectedReadCountsByGC[binIndex] == 0)
        {
            expectedReadCountsByGC[binIndex] = 1;
        }
        if (observedReadCountsByGC[binIndex] == 0)
        {
            observedReadCountsByGC[binIndex] = 1;
        }
        if (hasCounts)
        {
            observedVsExpectedGC[binIndex] = ((float)observedReadCountsByGC[binIndex] / (float)expectedReadCountsByGC[binIndex]) * ((float)sumExpected / (float)sumObserved);
        }
    }

    if (debugGC)
    {
        using (GzipWriter writer = new GzipWriter(outFile + ".gcstat"))
        {
            for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
            {
                writer.WriteLine(string.Format("{0}\t{1}\t{2}", expectedReadCountsByGC[binIndex], observedReadCountsByGC[binIndex], observedVsExpectedGC[binIndex]));
            }
        }
    }
    return observedVsExpectedGC;
}