Ejemplo n.º 1
0
Archivo: IO.cs Proyecto: abladon/canvas
 public static void WriteToTextFile(string outfile, List<GenomicBin> bins)
 {
     using (GzipWriter writer = new GzipWriter(outfile))
     {
         foreach (GenomicBin bin in bins)
         {
             writer.WriteLine(string.Format("{0}\t{1}\t{2}\t{3:F2}\t{4}", bin.Chromosome, bin.Start, bin.Stop, bin.Count, bin.GC));
         }
     }
 }
Ejemplo n.º 2
0
        private static void GetWeightedAverageBinCount(IEnumerable<string> binnedPaths, string mergedBinnedPath,
            NexteraManifest manifest = null)
        {
            int sampleCount = binnedPaths.Count();
            if (sampleCount == 1) // copy file
            {
                if (File.Exists(binnedPaths.First()))
                {
                    if (File.Exists(mergedBinnedPath)) { File.Delete(mergedBinnedPath); }
                    File.Copy(binnedPaths.First(), mergedBinnedPath);
                }
            }
            else // merge normal samples
            {
                double[] weights = new double[sampleCount];
                List<double>[] binCountsBySample = new List<double>[sampleCount];
                for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
                {
                    string binnedPath = binnedPaths.ElementAt(sampleIndex);
                    var binCounts = new BinCounts(binnedPath, manifest: manifest);
                    List<double> counts = binCounts.AllCounts;
                    // If a manifest is available, get the median of bins overlapping the targeted regions only.
                    // For small panels, there could be a lot of bins with zero count and the median would be 0 if taken over all the bins, resulting in division by zero.
                    double median = binCounts.OnTargetMedianBinCount;
                    weights[sampleIndex] = median > 0 ? 1.0 / median : 0;
                    binCountsBySample[sampleIndex] = counts;
                }
                double weightSum = weights.Sum();
                for (int i = 0; i < sampleCount; i++) { weights[i] /= weightSum; } // so weights sum to 1

                // Computed weighted average of bin counts across samples
                using (GzipReader reader = new GzipReader(binnedPaths.First()))
                using (GzipWriter writer = new GzipWriter(mergedBinnedPath))
                {
                    string line;
                    string[] toks;
                    int lineIdx = 0;
                    while ((line = reader.ReadLine()) != null)
                    {
                        toks = line.Split('\t');
                        double weightedBinCount = 0;
                        for (int i = 0; i < sampleCount; i++) { weightedBinCount += weights[i] * binCountsBySample[i][lineIdx]; }
                        toks[3] = String.Format("{0}", weightedBinCount);
                        writer.WriteLine(String.Join("\t", toks));
                        lineIdx++;
                    }
                }
            }
        }
Ejemplo n.º 3
0
        private static void GetBinRatio(string tumorBinnedPath, string normalBinnedPath, string ratioBinnedPath,
            string ploidyBedPath, NexteraManifest manifest = null)
        {
            PloidyInfo referencePloidy = String.IsNullOrEmpty(ploidyBedPath) ? null : PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);
            double tumorMedian = (new BinCounts(tumorBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
            double normalMedian = (new BinCounts(normalBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
            double librarySizeFactor = (tumorMedian > 0 && normalMedian > 0) ? normalMedian / tumorMedian : 1;

            using (GzipReader tumorReader = new GzipReader(tumorBinnedPath))
            using (GzipReader normalReader = new GzipReader(normalBinnedPath))
            using (GzipWriter writer = new GzipWriter(ratioBinnedPath))
            {
                string normalLine;
                string tumorLine;
                string[] normalToks;
                string[] tumorToks;
                double normalCount;
                double tumorCount;
                double ratio;
                while ((normalLine = normalReader.ReadLine()) != null)
                {
                    tumorLine = tumorReader.ReadLine();
                    normalToks = normalLine.Split('\t');
                    tumorToks = tumorLine.Split('\t');
                    normalCount = double.Parse(normalToks[3]);
                    tumorCount = double.Parse(tumorToks[3]);
                    // The weighted average count of a bin could be less than 1.
                    // Using these small counts for coverage normalization creates large ratios.
                    // It would be better to just drop these bins so we don't introduce too much noise into segmentation and CNV calling.
                    if (normalCount < 1) { continue; } // skip the bin
                    string chrom = normalToks[0];
                    int start = int.Parse(normalToks[1]);
                    int end = int.Parse(normalToks[2]);
                    // get the normal ploidy from intervalsWithPloidyByChrom
                    double factor = CanvasDiploidBinRatioFactor * GetPloidy(referencePloidy, chrom, start, end) / 2.0;
                    ratio = tumorCount / normalCount * factor * librarySizeFactor;
                    normalToks[3] = String.Format("{0}", ratio);
                    writer.WriteLine(String.Join("\t", normalToks));
                }
            }
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Step 3: Summarize results to a simple tab-delimited file.
 /// </summary>
 protected void WriteResults(string outputPath)
 {
     using (GzipWriter writer = new GzipWriter(outputPath))
     {
         writer.WriteLine("#Chromosome\tPosition\tRef\tAlt\tCountRef\tCountAlt");
         for (int index = 0; index < this.Variants.Count; index++)
         {
             VcfVariant variant = this.Variants[index];
             // skip HOM REF positions
             if (this.VariantCounts[index] > 5) {
                 writer.WriteLine(string.Format("{0}\t{1}\t{2}\t{3}\t{4}\t{5}", variant.ReferenceName, variant.ReferencePosition,
                 variant.ReferenceAllele, variant.VariantAlleles[0], this.ReferenceCounts[index],
                 this.VariantCounts[index]));
             }
         }
     }
     Console.WriteLine("{0} Results written to {1}", DateTime.Now, outputPath);
 }
Ejemplo n.º 5
0
        private void WriteCanvasPartitionResults(string outPath)
        {
            Dictionary<string, bool> starts = new Dictionary<string, bool>();
            Dictionary<string, bool> stops = new Dictionary<string, bool>();

            foreach (string chr in SegmentationResults.SegmentByChr.Keys)
            {
                for (int segmentIndex = 0; segmentIndex < SegmentationResults.SegmentByChr[chr].Length; segmentIndex++)
                {
                    Segment segment = SegmentationResults.SegmentByChr[chr][segmentIndex];
                    starts[chr + ":" + segment.start] = true;
                    stops[chr + ":" + segment.end] = true;
                }
            }

            Dictionary<string, List<GenomicBin>> ExcludedIntervals = new Dictionary<string, List<GenomicBin>>();
            if (!string.IsNullOrEmpty(ForbiddenIntervalBedPath))
            {
                ExcludedIntervals = CanvasCommon.Utilities.LoadBedFile(ForbiddenIntervalBedPath);
            }

            using (GzipWriter writer = new GzipWriter(outPath))
            {
                int segmentNum = -1;

                foreach (string chr in StartByChr.Keys)
                {
                    List<GenomicBin> excludeIntervals = null;
                    if (ExcludedIntervals.ContainsKey(chr)) excludeIntervals = ExcludedIntervals[chr];
                    int excludeIndex = 0; // Points to the first interval which *doesn't* end before our current position
                    uint previousBinEnd = 0;
                    for (int pos = 0; pos < StartByChr[chr].Length; pos++)
                    {
                        uint start = StartByChr[chr][pos];
                        uint end = EndByChr[chr][pos];
                        bool newSegment = false;
                        string key = chr + ":" + start;
                        if (starts.ContainsKey(key))
                        {
                            newSegment = true;
                        }

                        if (excludeIntervals != null)
                        {
                            while (excludeIndex < excludeIntervals.Count && excludeIntervals[excludeIndex].Stop < previousBinEnd) excludeIndex++;
                            if (excludeIndex < excludeIntervals.Count)
                            {
                                // Note: forbiddenZoneMid should never fall inside a bin, becuase these intervals were already excluded
                                // from consideration during the call to CanvasBin.
                                int forbiddenZoneMid = (excludeIntervals[excludeIndex].Start + excludeIntervals[excludeIndex].Stop) / 2;
                                if (previousBinEnd < forbiddenZoneMid && end >= forbiddenZoneMid) newSegment = true;
                            }
                        }
                        if (previousBinEnd > 0 && MaxInterBinDistInSegment >= 0 && previousBinEnd + MaxInterBinDistInSegment < start
                            && !newSegment)
                        {
                            newSegment = true;
                        }

                        if (newSegment) segmentNum++;
                        writer.WriteLine(string.Format("{0}\t{1}\t{2}\t{3}\t{4}", chr, start, end, ScoreByChr[chr][pos], segmentNum));
                        previousBinEnd = end;
                    }
                }
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Invoke CanvasSNV.  Return null if this fails and we need to abort CNV calling for this sample.
        /// </summary>
        protected void InvokeCanvasSnv(CanvasCallset callset)
        {
            List<UnitOfWork> jobList = new List<UnitOfWork>();
            List<string> outputPaths = new List<string>();
            GenomeMetadata genomeMetadata = callset.GenomeMetadata;

            string tumorBamPath = callset.Bam.BamFile.FullName;
            string normalVcfPath = callset.NormalVcfPath.FullName;
            foreach (GenomeMetadata.SequenceMetadata chromosome in genomeMetadata.Sequences)
            {
                // Only invoke for autosomes + allosomes;
                // don't invoke it for mitochondrial chromosome or extra contigs or decoys
                if (chromosome.Type != GenomeMetadata.SequenceType.Allosome && !chromosome.IsAutosome())
                    continue;

                UnitOfWork job = new UnitOfWork();
                job.ExecutablePath = Path.Combine(_canvasFolder, "CanvasSNV.exe");
                if (CrossPlatform.IsThisMono())
                {
                    job.CommandLine = job.ExecutablePath;
                    job.ExecutablePath = Utilities.GetMonoPath();
                }

                string outputPath = Path.Combine(callset.TempFolder, string.Format("{0}-{1}.SNV.txt.gz", chromosome.Name, callset.Id));
                outputPaths.Add(outputPath);
                job.CommandLine += $" {chromosome.Name} {normalVcfPath} {tumorBamPath} {outputPath}";
                if (_customParameters.ContainsKey("CanvasSNV"))
                {
                    job.CommandLine = Utilities.MergeCommandLineOptions(job.CommandLine, _customParameters["CanvasSNV"], true);
                }
                job.LoggingFolder = _workManager.LoggingFolder.FullName;
                job.LoggingStub = string.Format("CanvasSNV-{0}-{1}", callset.Id, chromosome.Name);
                jobList.Add(job);
            }
            Console.WriteLine("Invoking {0} processor jobs...", jobList.Count);

            // Invoke CanvasSNV jobs:
            Console.WriteLine(">>>CanvasSNV start...");
            _workManager.DoWorkParallelThreads(jobList);
            Console.WriteLine(">>>CanvasSNV complete!");

            // Concatenate CanvasSNV results:
            using (GzipWriter writer = new GzipWriter(callset.VfSummaryPath))
            {
                bool headerWritten = false;
                foreach (string outputPath in outputPaths)
                {
                    if (!File.Exists(outputPath))
                    {
                        Console.WriteLine("Error: Expected output file not found at {0}", outputPath);
                        continue;
                    }
                    using (GzipReader reader = new GzipReader(outputPath))
                    {
                        while (true)
                        {
                            string fileLine = reader.ReadLine();
                            if (fileLine == null) break;
                            if (fileLine.Length > 0 && fileLine[0] == '#')
                            {
                                if (headerWritten) continue;
                                headerWritten = true;
                            }
                            writer.WriteLine(fileLine);
                        }
                    }
                }
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Intersect bins with the targeted regions defined in callset.Manifest.
        /// Assumes that the targeted regions don't intersect, the bins are sorted by genomic location and the bins don't intersect.
        /// </summary>
        /// <param name="callset"></param>
        /// <param name="partitionedPath">Output of CanvasPartition. Bins are assumed to be sorted</param>
        /// <returns></returns>
        private IFileLocation IntersectBinsWithTargetedRegions(CanvasCallset callset, IFileLocation partitionedPath)
        {
            if (!partitionedPath.Exists) { return partitionedPath; }
            var rawPartitionedPath = partitionedPath.AppendName(".raw");
            if (rawPartitionedPath.Exists) { rawPartitionedPath.Delete(); }
            partitionedPath.MoveTo(rawPartitionedPath);

            //callset.Manifest
            Dictionary<string, List<NexteraManifest.ManifestRegion>> manifestRegionsByChrom = callset.Manifest.GetManifestRegionsByChromosome();

            // CanvasPartition output file is in the BED format
            //   start: 0-based, inclusive
            //   end: 0-based, exclusive
            // Manifest
            //   start: 1-based, inclusive
            //   end: 1-based, inclusive
            using (GzipReader reader = new GzipReader(rawPartitionedPath.FullName))
            using (GzipWriter writer = new GzipWriter(partitionedPath.FullName))
            {
                string currentChrom = null;
                int manifestRegionIdx = 0;
                string line;
                string[] toks;
                while ((line = reader.ReadLine()) != null)
                {
                    toks = line.Split('\t');
                    string chrom = toks[0];
                    int start = int.Parse(toks[1]) + 1; // 1-based, inclusive
                    int end = int.Parse(toks[2]); // 1-based, inclusive
                    if (chrom != currentChrom)
                    {
                        currentChrom = chrom;
                        manifestRegionIdx = 0;
                    }
                    if (!manifestRegionsByChrom.ContainsKey(currentChrom)) { continue; }
                    while (manifestRegionIdx < manifestRegionsByChrom[currentChrom].Count
                        && manifestRegionsByChrom[currentChrom][manifestRegionIdx].End < start) // |- manifest region -| |- bin -|
                    {
                        manifestRegionIdx++;
                    }
                    if (manifestRegionIdx >= manifestRegionsByChrom[currentChrom].Count || // |- last manifest region -| |- bin -|
                        end < manifestRegionsByChrom[currentChrom][manifestRegionIdx].Start) // |- bin -| |- manifest region -|
                    {
                        continue; // skip bin
                    }

                    // |- bin -|
                    //       |- manifest region -|
                    while (manifestRegionIdx < manifestRegionsByChrom[currentChrom].Count &&
                        end >= manifestRegionsByChrom[currentChrom][manifestRegionIdx].Start)
                    {
                        // calculate intersection
                        int intersectionStart = Math.Max(start, manifestRegionsByChrom[currentChrom][manifestRegionIdx].Start); // 1-based, inclusive
                        int intersectionEnd = Math.Min(end, manifestRegionsByChrom[currentChrom][manifestRegionIdx].End); // 1-based, inclusive
                                                                                                                          // start/end in BED format
                        toks[1] = String.Format("{0}", intersectionStart - 1); // 0-based, inclusive
                        toks[2] = String.Format("{0}", intersectionEnd); // 0-based, exclusive

                        // write intersected bin
                        writer.WriteLine(String.Join("\t", toks));

                        manifestRegionIdx++;
                    }
                }
            }

            return partitionedPath;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Computes fragment-based GC normalization correction factor 
        /// </summary>
        /// <returns>An array of observed vs expected GC counts.</returns>
        static float[] ComputeObservedVsExpectedGC(Dictionary<string, HitArray> observedAlignments,
            Dictionary<string, byte[]> readGCContent, NexteraManifest manifest,
            bool debugGC, string outFile)
        {

            Dictionary<string, List<NexteraManifest.ManifestRegion>> regionsByChrom = null;
            if (manifest != null)
            {
                regionsByChrom = manifest.GetManifestRegionsByChromosome();
            }

            long[] expectedReadCountsByGC = new long[numberOfGCbins];
            long[] observedReadCountsByGC = new long[numberOfGCbins];
            foreach (KeyValuePair<string, byte[]> chromosomeReadGCContent in readGCContent)
            {
                string chr = chromosomeReadGCContent.Key;
                if (!observedAlignments.ContainsKey(chr)) { continue; }

                if (manifest == null) // look at the entire genome
                {
                    for (int i = 0; i < chromosomeReadGCContent.Value.Length; i++)
                    {
                        expectedReadCountsByGC[chromosomeReadGCContent.Value[i]]++;
                        observedReadCountsByGC[chromosomeReadGCContent.Value[i]] += observedAlignments[chr].Data[i];
                    }
                }
                else // look at only the targeted regions
                {
                    if (!regionsByChrom.ContainsKey(chr)) { continue; }
                    int i = -1;
                    foreach (var region in regionsByChrom[chr])
                    {
                        if (i < region.Start) // avoid overlapping targeted regions
                        {
                            i = region.Start - 1; // i is 0-based; manifest coordinates are 1-based.
                        }
                        for (; i < chromosomeReadGCContent.Value.Length && i < region.End; i++)
                        {
                            expectedReadCountsByGC[chromosomeReadGCContent.Value[i]]++;
                            observedReadCountsByGC[chromosomeReadGCContent.Value[i]] += observedAlignments[chr].Data[i];
                        }
                    }
                }
            }

            // calculate ratio of observed to expected read counts for each read GC bin
            float[] observedVsExpectedGC = new float[numberOfGCbins];
            for (int i = 0; i < numberOfGCbins; i++)
                observedVsExpectedGC[i] = 1;
            long sumObserved = 0;
            long sumExpected = 0;
            foreach (long gcContent in observedReadCountsByGC)
                sumObserved += gcContent;
            foreach (long gcContent in expectedReadCountsByGC)
                sumExpected += gcContent;
            for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
            {
                if (expectedReadCountsByGC[binIndex] == 0)
                    expectedReadCountsByGC[binIndex] = 1;
                if (observedReadCountsByGC[binIndex] == 0)
                    observedReadCountsByGC[binIndex] = 1;
                observedVsExpectedGC[binIndex] = ((float)observedReadCountsByGC[binIndex] / (float)expectedReadCountsByGC[binIndex]) * ((float)sumExpected / (float)sumObserved);
            }

            if (debugGC)
            {
                using (GzipWriter writer = new GzipWriter(outFile + ".gcstat"))
                {
                    for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
                    {
                        writer.WriteLine(string.Format("{0}\t{1}\t{2}", expectedReadCountsByGC[binIndex], observedReadCountsByGC[binIndex], observedVsExpectedGC[binIndex]));
                    }
                }
            }
            return observedVsExpectedGC;
        }