Exemple #1
0
        private static void GetWeightedAverageBinCount(IEnumerable <string> binnedPaths, string mergedBinnedPath,
                                                       NexteraManifest manifest = null)
        {
            int sampleCount = binnedPaths.Count();

            if (sampleCount == 1) // copy file
            {
                if (File.Exists(binnedPaths.First()))
                {
                    if (File.Exists(mergedBinnedPath))
                    {
                        File.Delete(mergedBinnedPath);
                    }
                    File.Copy(binnedPaths.First(), mergedBinnedPath);
                }
            }
            else // merge normal samples
            {
                double[]        weights           = new double[sampleCount];
                List <double>[] binCountsBySample = new List <double> [sampleCount];
                for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
                {
                    string        binnedPath = binnedPaths.ElementAt(sampleIndex);
                    var           binCounts  = new BinCounts(binnedPath, manifest: manifest);
                    List <double> counts     = binCounts.AllCounts;
                    // If a manifest is available, get the median of bins overlapping the targeted regions only.
                    // For small panels, there could be a lot of bins with zero count and the median would be 0 if taken over all the bins, resulting in division by zero.
                    double median = binCounts.OnTargetMedianBinCount;
                    weights[sampleIndex]           = median > 0 ? 1.0 / median : 0;
                    binCountsBySample[sampleIndex] = counts;
                }
                double weightSum = weights.Sum();
                for (int i = 0; i < sampleCount; i++)
                {
                    weights[i] /= weightSum;
                }                                                                  // so weights sum to 1

                // Computed weighted average of bin counts across samples
                using (GzipReader reader = new GzipReader(binnedPaths.First()))
                    using (GzipWriter writer = new GzipWriter(mergedBinnedPath))
                    {
                        string   line;
                        string[] toks;
                        int      lineIdx = 0;
                        while ((line = reader.ReadLine()) != null)
                        {
                            toks = line.Split('\t');
                            double weightedBinCount = 0;
                            for (int i = 0; i < sampleCount; i++)
                            {
                                weightedBinCount += weights[i] * binCountsBySample[i][lineIdx];
                            }
                            toks[3] = String.Format("{0}", weightedBinCount);
                            writer.WriteLine(String.Join("\t", toks));
                            lineIdx++;
                        }
                    }
            }
        }
Exemple #2
0
 public static void WriteToTextFile(string outfile, List <GenomicBin> bins)
 {
     using (GzipWriter writer = new GzipWriter(outfile))
     {
         foreach (GenomicBin bin in bins)
         {
             writer.WriteLine(string.Format("{0}\t{1}\t{2}\t{3:F2}\t{4}", bin.Chromosome, bin.Start, bin.Stop, bin.Count, bin.GC));
         }
     }
 }
Exemple #3
0
 /// <summary>
 /// Step 3: Summarize results to a simple tab-delimited file.
 /// </summary>
 protected void WriteResults(string outputPath)
 {
     using (GzipWriter writer = new GzipWriter(outputPath))
     {
         writer.WriteLine("#Chromosome\tPosition\tRef\tAlt\tCountRef\tCountAlt");
         for (int index = 0; index < this.Variants.Count; index++)
         {
             VcfVariant variant = this.Variants[index];
             // skip HOM REF positions
             if (this.VariantCounts[index] > 5)
             {
                 writer.WriteLine(string.Format("{0}\t{1}\t{2}\t{3}\t{4}\t{5}", variant.ReferenceName, variant.ReferencePosition,
                                                variant.ReferenceAllele, variant.VariantAlleles[0], this.ReferenceCounts[index],
                                                this.VariantCounts[index]));
             }
         }
     }
     Console.WriteLine("{0} Results written to {1}", DateTime.Now, outputPath);
 }
Exemple #4
0
        public void WriteCanvasPartitionResults(string outPath, GenomeSegmentationResults segmentationResults)
        {
            Dictionary <string, bool> starts = new Dictionary <string, bool>();
            Dictionary <string, bool> stops  = new Dictionary <string, bool>();

            foreach (string chr in segmentationResults.SegmentByChr.Keys)
            {
                for (int segmentIndex = 0; segmentIndex < segmentationResults.SegmentByChr[chr].Length; segmentIndex++)
                {
                    Segmentation.Segment segment = segmentationResults.SegmentByChr[chr][segmentIndex];
                    starts[chr + ":" + segment.start] = true;
                    stops[chr + ":" + segment.end]    = true;
                }
            }

            Dictionary <string, List <SampleGenomicBin> > excludedIntervals = new Dictionary <string, List <SampleGenomicBin> >();

            if (!string.IsNullOrEmpty(ForbiddenIntervalBedPath))
            {
                excludedIntervals = CanvasCommon.Utilities.LoadBedFile(ForbiddenIntervalBedPath);
            }

            using (GzipWriter writer = new GzipWriter(outPath))
            {
                int segmentNum = -1;

                foreach (string chr in StartByChr.Keys)
                {
                    List <SampleGenomicBin> excludeIntervals = null;
                    if (excludedIntervals.ContainsKey(chr))
                    {
                        excludeIntervals = excludedIntervals[chr];
                    }
                    int  excludeIndex   = 0; // Points to the first interval which *doesn't* end before our current position
                    uint previousBinEnd = 0;
                    for (int pos = 0; pos < StartByChr[chr].Length; pos++)
                    {
                        uint   start      = StartByChr[chr][pos];
                        uint   end        = EndByChr[chr][pos];
                        string key        = chr + ":" + start;
                        bool   newSegment = IsNewSegment(starts, key, excludeIntervals, previousBinEnd, end, start, ref excludeIndex);
                        if (newSegment)
                        {
                            segmentNum++;
                        }
                        writer.WriteLine(string.Format($"{chr}\t{start}\t{end}\t{ScoreByChr[chr][pos]}\t{segmentNum}"));
                        previousBinEnd = end;
                    }
                }
            }
        }
Exemple #5
0
        public void WriteCanvasPartitionResults(string outPath, Dictionary <string, List <SegmentWithBins> > segmentsByChromosome)
        {
            using (var writer = new GzipWriter(outPath))
            {
                foreach (var chr in segmentsByChromosome.Keys)
                {
                    var segments = segmentsByChromosome[chr];

                    foreach (var segment in segments)
                    {
                        foreach (var bin in segment.Bins)
                        {
                            writer.WriteLine(string.Format($"{chr}\t{bin.Start}\t{bin.End}\t{bin.Coverage}\t{segment.Identifier}"));
                        }
                    }
                }
            }
        }
Exemple #6
0
        private static void GetBinRatio(string tumorBinnedPath, string normalBinnedPath, string ratioBinnedPath,
                                        string ploidyBedPath, NexteraManifest manifest = null)
        {
            PloidyInfo referencePloidy   = String.IsNullOrEmpty(ploidyBedPath) ? null : PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);
            double     tumorMedian       = (new BinCounts(tumorBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
            double     normalMedian      = (new BinCounts(normalBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
            double     librarySizeFactor = (tumorMedian > 0 && normalMedian > 0) ? normalMedian / tumorMedian : 1;

            using (GzipReader tumorReader = new GzipReader(tumorBinnedPath))
                using (GzipReader normalReader = new GzipReader(normalBinnedPath))
                    using (GzipWriter writer = new GzipWriter(ratioBinnedPath))
                    {
                        string   normalLine;
                        string   tumorLine;
                        string[] normalToks;
                        string[] tumorToks;
                        double   normalCount;
                        double   tumorCount;
                        double   ratio;
                        while ((normalLine = normalReader.ReadLine()) != null)
                        {
                            tumorLine   = tumorReader.ReadLine();
                            normalToks  = normalLine.Split('\t');
                            tumorToks   = tumorLine.Split('\t');
                            normalCount = double.Parse(normalToks[3]);
                            tumorCount  = double.Parse(tumorToks[3]);
                            // The weighted average count of a bin could be less than 1.
                            // Using these small counts for coverage normalization creates large ratios.
                            // It would be better to just drop these bins so we don't introduce too much noise into segmentation and CNV calling.
                            if (normalCount < 1)
                            {
                                continue;
                            }                          // skip the bin
                            string chrom = normalToks[0];
                            int    start = int.Parse(normalToks[1]);
                            int    end   = int.Parse(normalToks[2]);
                            // get the normal ploidy from intervalsWithPloidyByChrom
                            double factor = CanvasDiploidBinRatioFactor * GetPloidy(referencePloidy, chrom, start, end) / 2.0;
                            ratio         = tumorCount / normalCount * factor * librarySizeFactor;
                            normalToks[3] = String.Format("{0}", ratio);
                            writer.WriteLine(String.Join("\t", normalToks));
                        }
                    }
        }
Exemple #7
0
        private void WriteCanvasPartitionResults(string outPath)
        {
            Dictionary <string, bool> starts = new Dictionary <string, bool>();
            Dictionary <string, bool> stops  = new Dictionary <string, bool>();

            foreach (string chr in SegmentationResults.SegmentByChr.Keys)
            {
                for (int segmentIndex = 0; segmentIndex < SegmentationResults.SegmentByChr[chr].Length; segmentIndex++)
                {
                    Segment segment = SegmentationResults.SegmentByChr[chr][segmentIndex];
                    starts[chr + ":" + segment.start] = true;
                    stops[chr + ":" + segment.end]    = true;
                }
            }

            Dictionary <string, List <GenomicBin> > ExcludedIntervals = new Dictionary <string, List <GenomicBin> >();

            if (!string.IsNullOrEmpty(ForbiddenIntervalBedPath))
            {
                ExcludedIntervals = CanvasCommon.Utilities.LoadBedFile(ForbiddenIntervalBedPath);
            }

            using (GzipWriter writer = new GzipWriter(outPath))
            {
                int segmentNum = -1;

                foreach (string chr in StartByChr.Keys)
                {
                    List <GenomicBin> excludeIntervals = null;
                    if (ExcludedIntervals.ContainsKey(chr))
                    {
                        excludeIntervals = ExcludedIntervals[chr];
                    }
                    int  excludeIndex   = 0; // Points to the first interval which *doesn't* end before our current position
                    uint previousBinEnd = 0;
                    for (int pos = 0; pos < StartByChr[chr].Length; pos++)
                    {
                        uint   start      = StartByChr[chr][pos];
                        uint   end        = EndByChr[chr][pos];
                        bool   newSegment = false;
                        string key        = chr + ":" + start;
                        if (starts.ContainsKey(key))
                        {
                            newSegment = true;
                        }

                        if (excludeIntervals != null)
                        {
                            while (excludeIndex < excludeIntervals.Count && excludeIntervals[excludeIndex].Stop < previousBinEnd)
                            {
                                excludeIndex++;
                            }
                            if (excludeIndex < excludeIntervals.Count)
                            {
                                // Note: forbiddenZoneMid should never fall inside a bin, becuase these intervals were already excluded
                                // from consideration during the call to CanvasBin.
                                int forbiddenZoneMid = (excludeIntervals[excludeIndex].Start + excludeIntervals[excludeIndex].Stop) / 2;
                                if (previousBinEnd < forbiddenZoneMid && end >= forbiddenZoneMid)
                                {
                                    newSegment = true;
                                }
                            }
                        }
                        if (previousBinEnd > 0 && MaxInterBinDistInSegment >= 0 && previousBinEnd + MaxInterBinDistInSegment < start &&
                            !newSegment)
                        {
                            newSegment = true;
                        }

                        if (newSegment)
                        {
                            segmentNum++;
                        }
                        writer.WriteLine(string.Format("{0}\t{1}\t{2}\t{3}\t{4}", chr, start, end, ScoreByChr[chr][pos], segmentNum));
                        previousBinEnd = end;
                    }
                }
            }
        }
Exemple #8
0
        /// <summary>
        /// Invoke CanvasSNV.  Return null if this fails and we need to abort CNV calling for this sample.
        /// </summary>
        protected void InvokeCanvasSnv(CanvasCallset callset)
        {
            List <UnitOfWork> jobList        = new List <UnitOfWork>();
            List <string>     outputPaths    = new List <string>();
            GenomeMetadata    genomeMetadata = callset.GenomeMetadata;

            string tumorBamPath  = callset.Bam.BamFile.FullName;
            string normalVcfPath = callset.NormalVcfPath.FullName;

            foreach (GenomeMetadata.SequenceMetadata chromosome in genomeMetadata.Sequences)
            {
                // Only invoke for autosomes + allosomes;
                // don't invoke it for mitochondrial chromosome or extra contigs or decoys
                if (chromosome.Type != GenomeMetadata.SequenceType.Allosome && !chromosome.IsAutosome())
                {
                    continue;
                }

                UnitOfWork job = new UnitOfWork();
                job.ExecutablePath = Path.Combine(_canvasFolder, "CanvasSNV.exe");
                if (CrossPlatform.IsThisMono())
                {
                    job.CommandLine    = job.ExecutablePath;
                    job.ExecutablePath = Utilities.GetMonoPath();
                }

                string outputPath = Path.Combine(callset.TempFolder, string.Format("{0}-{1}.SNV.txt.gz", chromosome.Name, callset.Id));
                outputPaths.Add(outputPath);
                job.CommandLine += $" {chromosome.Name} {normalVcfPath} {tumorBamPath} {outputPath}";
                if (_customParameters.ContainsKey("CanvasSNV"))
                {
                    job.CommandLine = Utilities.MergeCommandLineOptions(job.CommandLine, _customParameters["CanvasSNV"], true);
                }
                job.LoggingFolder = _workManager.LoggingFolder.FullName;
                job.LoggingStub   = string.Format("CanvasSNV-{0}-{1}", callset.Id, chromosome.Name);
                jobList.Add(job);
            }
            Console.WriteLine("Invoking {0} processor jobs...", jobList.Count);

            // Invoke CanvasSNV jobs:
            Console.WriteLine(">>>CanvasSNV start...");
            _workManager.DoWorkParallelThreads(jobList);
            Console.WriteLine(">>>CanvasSNV complete!");

            // Concatenate CanvasSNV results:
            using (GzipWriter writer = new GzipWriter(callset.VfSummaryPath))
            {
                bool headerWritten = false;
                foreach (string outputPath in outputPaths)
                {
                    if (!File.Exists(outputPath))
                    {
                        Console.WriteLine("Error: Expected output file not found at {0}", outputPath);
                        continue;
                    }
                    using (GzipReader reader = new GzipReader(outputPath))
                    {
                        while (true)
                        {
                            string fileLine = reader.ReadLine();
                            if (fileLine == null)
                            {
                                break;
                            }
                            if (fileLine.Length > 0 && fileLine[0] == '#')
                            {
                                if (headerWritten)
                                {
                                    continue;
                                }
                                headerWritten = true;
                            }
                            writer.WriteLine(fileLine);
                        }
                    }
                }
            }
        }
Exemple #9
0
        /// <summary>
        /// Intersect bins with the targeted regions defined in callset.Manifest.
        /// Assumes that the targeted regions don't intersect, the bins are sorted by genomic location and the bins don't intersect.
        /// </summary>
        /// <param name="callset"></param>
        /// <param name="partitionedPath">Output of CanvasPartition. Bins are assumed to be sorted</param>
        /// <returns></returns>
        private IFileLocation IntersectBinsWithTargetedRegions(CanvasCallset callset, IFileLocation partitionedPath)
        {
            if (!partitionedPath.Exists)
            {
                return(partitionedPath);
            }
            var rawPartitionedPath = partitionedPath.AppendName(".raw");

            if (rawPartitionedPath.Exists)
            {
                rawPartitionedPath.Delete();
            }
            partitionedPath.MoveTo(rawPartitionedPath);

            //callset.Manifest
            Dictionary <string, List <NexteraManifest.ManifestRegion> > manifestRegionsByChrom = callset.Manifest.GetManifestRegionsByChromosome();

            // CanvasPartition output file is in the BED format
            //   start: 0-based, inclusive
            //   end: 0-based, exclusive
            // Manifest
            //   start: 1-based, inclusive
            //   end: 1-based, inclusive
            using (GzipReader reader = new GzipReader(rawPartitionedPath.FullName))
                using (GzipWriter writer = new GzipWriter(partitionedPath.FullName))
                {
                    string   currentChrom      = null;
                    int      manifestRegionIdx = 0;
                    string   line;
                    string[] toks;
                    while ((line = reader.ReadLine()) != null)
                    {
                        toks = line.Split('\t');
                        string chrom = toks[0];
                        int    start = int.Parse(toks[1]) + 1; // 1-based, inclusive
                        int    end   = int.Parse(toks[2]);     // 1-based, inclusive
                        if (chrom != currentChrom)
                        {
                            currentChrom      = chrom;
                            manifestRegionIdx = 0;
                        }
                        if (!manifestRegionsByChrom.ContainsKey(currentChrom))
                        {
                            continue;
                        }
                        while (manifestRegionIdx < manifestRegionsByChrom[currentChrom].Count &&
                               manifestRegionsByChrom[currentChrom][manifestRegionIdx].End < start) // |- manifest region -| |- bin -|
                        {
                            manifestRegionIdx++;
                        }
                        if (manifestRegionIdx >= manifestRegionsByChrom[currentChrom].Count ||   // |- last manifest region -| |- bin -|
                            end < manifestRegionsByChrom[currentChrom][manifestRegionIdx].Start) // |- bin -| |- manifest region -|
                        {
                            continue;                                                            // skip bin
                        }

                        // |- bin -|
                        //       |- manifest region -|
                        while (manifestRegionIdx < manifestRegionsByChrom[currentChrom].Count &&
                               end >= manifestRegionsByChrom[currentChrom][manifestRegionIdx].Start)
                        {
                            // calculate intersection
                            int intersectionStart = Math.Max(start, manifestRegionsByChrom[currentChrom][manifestRegionIdx].Start); // 1-based, inclusive
                            int intersectionEnd   = Math.Min(end, manifestRegionsByChrom[currentChrom][manifestRegionIdx].End);     // 1-based, inclusive
                                                                                                                                    // start/end in BED format
                            toks[1] = String.Format("{0}", intersectionStart - 1);                                                  // 0-based, inclusive
                            toks[2] = String.Format("{0}", intersectionEnd);                                                        // 0-based, exclusive

                            // write intersected bin
                            writer.WriteLine(String.Join("\t", toks));

                            manifestRegionIdx++;
                        }
                    }
                }

            return(partitionedPath);
        }
Exemple #10
0
        /// <summary>
        /// Computes fragment-based GC normalization correction factor
        /// </summary>
        /// <returns>An array of observed vs expected GC counts.</returns>
        static float[] ComputeObservedVsExpectedGC(Dictionary <string, HitArray> observedAlignments,
                                                   Dictionary <string, byte[]> readGCContent, NexteraManifest manifest,
                                                   bool debugGC, string outFile)
        {
            Dictionary <string, List <NexteraManifest.ManifestRegion> > regionsByChrom = null;

            if (manifest != null)
            {
                regionsByChrom = manifest.GetManifestRegionsByChromosome();
            }

            long[] expectedReadCountsByGC = new long[numberOfGCbins];
            long[] observedReadCountsByGC = new long[numberOfGCbins];
            foreach (KeyValuePair <string, byte[]> chromosomeReadGCContent in readGCContent)
            {
                string chr = chromosomeReadGCContent.Key;
                if (!observedAlignments.ContainsKey(chr))
                {
                    continue;
                }

                if (manifest == null) // look at the entire genome
                {
                    for (int i = 0; i < chromosomeReadGCContent.Value.Length; i++)
                    {
                        expectedReadCountsByGC[chromosomeReadGCContent.Value[i]]++;
                        observedReadCountsByGC[chromosomeReadGCContent.Value[i]] += observedAlignments[chr].Data[i];
                    }
                }
                else // look at only the targeted regions
                {
                    if (!regionsByChrom.ContainsKey(chr))
                    {
                        continue;
                    }
                    int i = -1;
                    foreach (var region in regionsByChrom[chr])
                    {
                        if (i < region.Start)     // avoid overlapping targeted regions
                        {
                            i = region.Start - 1; // i is 0-based; manifest coordinates are 1-based.
                        }
                        for (; i < chromosomeReadGCContent.Value.Length && i < region.End; i++)
                        {
                            expectedReadCountsByGC[chromosomeReadGCContent.Value[i]]++;
                            observedReadCountsByGC[chromosomeReadGCContent.Value[i]] += observedAlignments[chr].Data[i];
                        }
                    }
                }
            }

            // calculate ratio of observed to expected read counts for each read GC bin
            float[] observedVsExpectedGC = new float[numberOfGCbins];
            for (int i = 0; i < numberOfGCbins; i++)
            {
                observedVsExpectedGC[i] = 1;
            }
            long sumObserved = 0;
            long sumExpected = 0;

            foreach (long gcContent in observedReadCountsByGC)
            {
                sumObserved += gcContent;
            }
            foreach (long gcContent in expectedReadCountsByGC)
            {
                sumExpected += gcContent;
            }
            for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
            {
                if (expectedReadCountsByGC[binIndex] == 0)
                {
                    expectedReadCountsByGC[binIndex] = 1;
                }
                if (observedReadCountsByGC[binIndex] == 0)
                {
                    observedReadCountsByGC[binIndex] = 1;
                }
                observedVsExpectedGC[binIndex] = ((float)observedReadCountsByGC[binIndex] / (float)expectedReadCountsByGC[binIndex]) * ((float)sumExpected / (float)sumObserved);
            }

            if (debugGC)
            {
                using (GzipWriter writer = new GzipWriter(outFile + ".gcstat"))
                {
                    for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
                    {
                        writer.WriteLine(string.Format("{0}\t{1}\t{2}", expectedReadCountsByGC[binIndex], observedReadCountsByGC[binIndex], observedVsExpectedGC[binIndex]));
                    }
                }
            }
            return(observedVsExpectedGC);
        }