public static List<GenomicBin> ReadFromTextFile(string infile)
{
    List<GenomicBin> bins = new List<GenomicBin>();
    using (GzipReader reader = new GzipReader(infile))
    {
        string row;
        while ((row = reader.ReadLine()) != null)
        {
            string[] fields = row.Split('\t');
            string chr = fields[0];
            int start = Convert.ToInt32(fields[1]);
            int stop = Convert.ToInt32(fields[2]);
            // Bin counts are parsed as floats (they may be weighted averages), not integers.
            float count = float.Parse(fields[3]);
            int gc = Convert.ToInt32(fields[4]);
            GenomicBin bin = new GenomicBin(chr, start, stop, gc, count);
            bins.Add(bin);
        }
    }
    return bins;
}
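// Illustrative note (not part of the original source): ReadFromTextFile expects a
// gzip-compressed, tab-delimited bin file with at least five columns, consumed above as
// chromosome, start, stop, count, GC. A hypothetical input line might look like:
//   chr1    10000    10100    27.5    43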
public static PloidyInfo LoadPloidyFromBedFile(string filePath)
{
    PloidyInfo ploidy = new PloidyInfo();
    int count = 0;
    using (GzipReader reader = new GzipReader(filePath))
    {
        while (true)
        {
            string fileLine = reader.ReadLine();
            if (fileLine == null) break;
            if (fileLine.StartsWith("##ExpectedSexChromosomeKaryotype"))
            {
                ploidy.HeaderLine = fileLine.Trim();
                continue;
            }
            if (fileLine.Length == 0 || fileLine[0] == '#') continue;
            string[] bits = fileLine.Split('\t');
            string chromosome = bits[0];
            if (!ploidy.PloidyByChromosome.ContainsKey(chromosome))
            {
                ploidy.PloidyByChromosome[chromosome] = new List<PloidyInterval>();
            }
            PloidyInterval interval = new PloidyInterval();
            interval.Start = int.Parse(bits[1]);
            interval.End = int.Parse(bits[2]);
            interval.Ploidy = int.Parse(bits[4]);
            ploidy.PloidyByChromosome[chromosome].Add(interval);
            count++;
        }
    }
    Console.WriteLine("Reference ploidy: Loaded {0} intervals across {1} chromosomes", count, ploidy.PloidyByChromosome.Keys.Count);
    return ploidy;
}
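// Illustrative note (not part of the original source): the loader above reads the ploidy from
// the fifth column (bits[4]) and ignores the fourth column entirely. A hypothetical record for
// chromosome X in a male sample could therefore look like:
//   chrX    0    155270560    .    1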
/// <summary>
/// Uncompress foo.gz to foo.
/// This will probably fail if there are no newlines in the file and the entire file cannot fit into memory.
/// TODO: implement GzipReader.ReadBytes so that we don't need to use ReadLine/WriteLine
/// </summary>
public static void UncompressFile(string sourcePath, string targetPath)
{
    using (StreamWriter writer = new StreamWriter(targetPath))
    using (GzipReader reader = new GzipReader(sourcePath))
    {
        writer.NewLine = "\n";
        while (true)
        {
            string fileLine = reader.ReadLine();
            if (fileLine == null) break;
            writer.WriteLine(fileLine);
        }
    }
}
private static void GetWeightedAverageBinCount(IEnumerable<string> binnedPaths, string mergedBinnedPath,
    NexteraManifest manifest = null)
{
    int sampleCount = binnedPaths.Count();
    if (sampleCount == 1) // copy file
    {
        if (File.Exists(binnedPaths.First()))
        {
            if (File.Exists(mergedBinnedPath))
            {
                File.Delete(mergedBinnedPath);
            }
            File.Copy(binnedPaths.First(), mergedBinnedPath);
        }
    }
    else // merge normal samples
    {
        double[] weights = new double[sampleCount];
        List<double>[] binCountsBySample = new List<double>[sampleCount];
        for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
        {
            string binnedPath = binnedPaths.ElementAt(sampleIndex);
            var binCounts = new BinCounts(binnedPath, manifest: manifest);
            List<double> counts = binCounts.AllCounts;
            // If a manifest is available, get the median of bins overlapping the targeted regions only.
            // For small panels, there could be a lot of bins with zero count and the median would be 0
            // if taken over all the bins, resulting in division by zero.
            double median = binCounts.OnTargetMedianBinCount;
            weights[sampleIndex] = median > 0 ? 1.0 / median : 0;
            binCountsBySample[sampleIndex] = counts;
        }
        // Normalize the weights so that they sum to 1
        double weightSum = weights.Sum();
        for (int i = 0; i < sampleCount; i++) { weights[i] /= weightSum; }

        // Compute the weighted average of bin counts across samples
        using (GzipReader reader = new GzipReader(binnedPaths.First()))
        using (GzipWriter writer = new GzipWriter(mergedBinnedPath))
        {
            string line;
            string[] toks;
            int lineIdx = 0;
            while ((line = reader.ReadLine()) != null)
            {
                toks = line.Split('\t');
                double weightedBinCount = 0;
                for (int i = 0; i < sampleCount; i++) { weightedBinCount += weights[i] * binCountsBySample[i][lineIdx]; }
                toks[3] = String.Format("{0}", weightedBinCount);
                writer.WriteLine(String.Join("\t", toks));
                lineIdx++;
            }
        }
    }
}
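// Illustrative sketch (not from the original source): how the inverse-median weighting above
// combines two normal samples; the medians and counts here are made-up numbers.
//   Sample A: on-target median = 50  -> raw weight 1/50  = 0.02
//   Sample B: on-target median = 200 -> raw weight 1/200 = 0.005
//   Normalized weights: wA = 0.02 / 0.025 = 0.8, wB = 0.005 / 0.025 = 0.2
//   A bin with counts 60 (A) and 180 (B) is merged to 0.8 * 60 + 0.2 * 180 = 84.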
public static void Uncompress(string tempPath, string outputPath)
{
    if (!File.Exists(tempPath)) { return; }
    using (GzipReader reader = new GzipReader(tempPath))
    using (StreamWriter writer = new StreamWriter(outputPath))
    {
        writer.NewLine = "\n";
        while (true)
        {
            string fileLine = reader.ReadLine();
            if (fileLine == null) break;
            writer.WriteLine(fileLine);
        }
    }
}
/// <summary>
/// Assumes that the rows are sorted by start position in ascending order.
/// </summary>
private void ReadBEDInput()
{
    try
    {
        Dictionary<string, List<uint>> startByChr = new Dictionary<string, List<uint>>(),
            endByChr = new Dictionary<string, List<uint>>();
        Dictionary<string, List<double>> scoreByChr = new Dictionary<string, List<double>>();
        // Create an instance of GzipReader to read from a file.
        // The using statement also closes the reader.
        using (GzipReader reader = new GzipReader(this.InputBinPath))
        {
            string line;
            string[] tokens;
            while ((line = reader.ReadLine()) != null)
            {
                tokens = line.Split('\t');
                string chr = tokens[Segmentation.idxChr].Trim();
                if (!startByChr.ContainsKey(chr))
                {
                    startByChr.Add(chr, new List<uint>());
                    endByChr.Add(chr, new List<uint>());
                    scoreByChr.Add(chr, new List<double>());
                }
                startByChr[chr].Add(Convert.ToUInt32(tokens[Segmentation.idxStart].Trim()));
                endByChr[chr].Add(Convert.ToUInt32(tokens[Segmentation.idxEnd].Trim()));
                scoreByChr[chr].Add(Convert.ToDouble(tokens[this.idxScore].Trim()));
            }
            foreach (string chr in startByChr.Keys)
            {
                this.StartByChr[chr] = startByChr[chr].ToArray();
                this.EndByChr[chr] = endByChr[chr].ToArray();
                this.ScoreByChr[chr] = scoreByChr[chr].ToArray();
            }
        }
    }
    catch (Exception e)
    {
        Console.Error.WriteLine("File {0} could not be read:", this.InputBinPath);
        Console.Error.WriteLine(e.Message);
        Environment.Exit(1);
    }
}
protected void LoadKnownCNVCF(string oracleVCFPath)
{
    bool stripChr = false;

    // Load our "oracle" of known copy numbers:
    this.KnownCN = new Dictionary<string, List<CNInterval>>();
    int count = 0;
    using (GzipReader reader = new GzipReader(oracleVCFPath))
    {
        while (true)
        {
            string fileLine = reader.ReadLine();
            if (fileLine == null) break;
            if (fileLine.Length == 0 || fileLine[0] == '#') continue;
            string[] bits = fileLine.Split('\t');
            string chromosome = bits[0];
            if (stripChr) chromosome = chromosome.Replace("chr", "");
            if (!KnownCN.ContainsKey(chromosome)) KnownCN[chromosome] = new List<CNInterval>();
            CNInterval interval = new CNInterval();
            interval.Start = int.Parse(bits[1]);
            interval.CN = -1;
            string[] infoBits = bits[7].Split(';');
            foreach (string subBit in infoBits)
            {
                if (subBit.StartsWith("CN="))
                {
                    float tempCN = float.Parse(subBit.Substring(3));
                    if (subBit.EndsWith(".5"))
                    {
                        interval.CN = (int)Math.Round(tempCN + 0.1); // round X.5 up to X+1
                    }
                    else
                    {
                        interval.CN = (int)Math.Round(tempCN); // Round off
                    }
                }
                if (subBit.StartsWith("END="))
                {
                    interval.End = int.Parse(subBit.Substring(4));
                }
            }
            // Parse CN from Canvas output:
            if (bits.Length > 8)
            {
                string[] subBits = bits[8].Split(':');
                string[] subBits2 = bits[9].Split(':');
                for (int subBitIndex = 0; subBitIndex < subBits.Length; subBitIndex++)
                {
                    if (subBits[subBitIndex] == "CN")
                    {
                        interval.CN = int.Parse(subBits2[subBitIndex]);
                    }
                }
            }
            if (interval.End == 0 || interval.CN < 0)
            {
                Console.WriteLine("Error - bogus record!");
                Console.WriteLine(fileLine);
            }
            else
            {
                KnownCN[chromosome].Add(interval);
                count++;
            }
        }
    }
    Console.WriteLine(">>>Loaded {0} known-CN intervals", count);
}
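// Illustrative note (not part of the original source): the parser above takes the copy number
// either from a CN= key in the INFO column (bits[7]) or, when FORMAT/sample columns are present
// (bits[8]/bits[9]), from the sample value paired with the CN FORMAT key; the interval end comes
// from END= in INFO. A hypothetical truth-set record could look like:
//   chr2    1000    .    N    <CNV>    .    PASS    END=5000;CN=3    GT:CN    1:3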
public static IGenomesReferencePath GetReferenceFromVcfHeader(string vcfPath)
{
    string referencePath = null;
    using (GzipReader reader = new GzipReader(vcfPath))
    {
        while (true)
        {
            string fileLine = reader.ReadLine();
            if (fileLine == null || !fileLine.StartsWith("#")) break;
            if (fileLine.StartsWith("##reference=")) referencePath = fileLine.Substring(12);
        }
    }
    return SafeGetReference(referencePath);
}
/// <summary>
/// Invoke CanvasSNV for each autosome and allosome, then concatenate the per-chromosome outputs
/// into the variant frequency summary file for this sample.
/// </summary>
protected void InvokeCanvasSnv(CanvasCallset callset)
{
    List<UnitOfWork> jobList = new List<UnitOfWork>();
    List<string> outputPaths = new List<string>();
    GenomeMetadata genomeMetadata = callset.GenomeMetadata;

    string tumorBamPath = callset.Bam.BamFile.FullName;
    string normalVcfPath = callset.NormalVcfPath.FullName;
    foreach (GenomeMetadata.SequenceMetadata chromosome in genomeMetadata.Sequences)
    {
        // Only invoke CanvasSNV for autosomes + allosomes;
        // don't invoke it for the mitochondrial chromosome, extra contigs, or decoys
        if (chromosome.Type != GenomeMetadata.SequenceType.Allosome && !chromosome.IsAutosome())
            continue;

        UnitOfWork job = new UnitOfWork();
        job.ExecutablePath = Path.Combine(_canvasFolder, "CanvasSNV.exe");
        if (CrossPlatform.IsThisMono())
        {
            job.CommandLine = job.ExecutablePath;
            job.ExecutablePath = Utilities.GetMonoPath();
        }

        string outputPath = Path.Combine(callset.TempFolder, string.Format("{0}-{1}.SNV.txt.gz", chromosome.Name, callset.Id));
        outputPaths.Add(outputPath);
        job.CommandLine += $" {chromosome.Name} {normalVcfPath} {tumorBamPath} {outputPath}";
        if (_customParameters.ContainsKey("CanvasSNV"))
        {
            job.CommandLine = Utilities.MergeCommandLineOptions(job.CommandLine, _customParameters["CanvasSNV"], true);
        }
        job.LoggingFolder = _workManager.LoggingFolder.FullName;
        job.LoggingStub = string.Format("CanvasSNV-{0}-{1}", callset.Id, chromosome.Name);
        jobList.Add(job);
    }
    Console.WriteLine("Invoking {0} processor jobs...", jobList.Count);

    // Invoke CanvasSNV jobs:
    Console.WriteLine(">>>CanvasSNV start...");
    _workManager.DoWorkParallelThreads(jobList);
    Console.WriteLine(">>>CanvasSNV complete!");

    // Concatenate CanvasSNV results (only the first header line encountered is kept):
    using (GzipWriter writer = new GzipWriter(callset.VfSummaryPath))
    {
        bool headerWritten = false;
        foreach (string outputPath in outputPaths)
        {
            if (!File.Exists(outputPath))
            {
                Console.WriteLine("Error: Expected output file not found at {0}", outputPath);
                continue;
            }
            using (GzipReader reader = new GzipReader(outputPath))
            {
                while (true)
                {
                    string fileLine = reader.ReadLine();
                    if (fileLine == null) break;
                    if (fileLine.Length > 0 && fileLine[0] == '#')
                    {
                        if (headerWritten) continue;
                        headerWritten = true;
                    }
                    writer.WriteLine(fileLine);
                }
            }
        }
    }
}
/// <summary>
/// Parse the outputs of CanvasSNV, and note these variant frequencies in the appropriate segment.
/// </summary>
public static float LoadVariantFrequencies(string variantFrequencyFile, List<CanvasSegment> segments)
{
    Console.WriteLine("{0} Load variant frequencies from {1}", DateTime.Now, variantFrequencyFile);
    int count = 0;
    Dictionary<string, List<CanvasSegment>> segmentsByChromosome = CanvasSegment.GetSegmentsByChromosome(segments);
    Dictionary<string, string> alternativeNames = GetChromosomeAlternativeNames(segmentsByChromosome.Keys);
    long totalCoverage = 0;
    int totalRecords = 0;
    using (GzipReader reader = new GzipReader(variantFrequencyFile))
    {
        while (true)
        {
            string fileLine = reader.ReadLine();
            if (fileLine == null) break;
            if (fileLine.Length == 0 || fileLine[0] == '#') continue; // Skip headers
            string[] bits = fileLine.Split('\t');
            if (bits.Length < 6)
            {
                Console.Error.WriteLine("* Bad line in {0}: '{1}'", variantFrequencyFile, fileLine);
                continue;
            }
            string chromosome = bits[0];
            if (!segmentsByChromosome.ContainsKey(chromosome))
            {
                if (alternativeNames.ContainsKey(chromosome))
                {
                    chromosome = alternativeNames[chromosome];
                }
                else continue;
            }
            int position = int.Parse(bits[1]); // 1-based (from the input VCF to Canvas SNV)
            int countRef = int.Parse(bits[4]);
            int countAlt = int.Parse(bits[5]);
            if (countRef + countAlt < 10) continue;
            float VF = countAlt / (float)(countRef + countAlt);
            // Binary search for the segment this variant hits:
            List<CanvasSegment> chrSegments = segmentsByChromosome[chromosome];
            int start = 0;
            int end = chrSegments.Count - 1;
            int mid = (start + end) / 2;
            while (start <= end)
            {
                if (chrSegments[mid].End < position) // CanvasSegment.End is already 1-based
                {
                    start = mid + 1;
                    mid = (start + end) / 2;
                    continue;
                }
                if (chrSegments[mid].Begin + 1 > position) // Convert CanvasSegment.Begin to 1-based by adding 1
                {
                    end = mid - 1;
                    mid = (start + end) / 2;
                    continue;
                }
                chrSegments[mid].VariantFrequencies.Add(VF);
                chrSegments[mid].VariantTotalCoverage.Add(countRef + countAlt);
                count++;
                totalCoverage += (countRef + countAlt); // use only coverage information in segments
                totalRecords++;
                break;
            }
        }
    }
    float meanCoverage = 0;
    if (totalRecords > 0) meanCoverage = totalCoverage / Math.Max(1f, totalRecords);
    Console.WriteLine("{0} Loaded a total of {1} usable variant frequencies", DateTime.Now, count);
    return meanCoverage;
}
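// Illustrative sketch (not from the original source): a variant record with countRef = 30 and
// countAlt = 10 passes the minimum-depth filter (30 + 10 >= 10) and contributes
// VF = 10 / 40 = 0.25. The binary search then assigns it to the segment whose 1-based interval
// [Begin + 1, End] contains the variant's 1-based position.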
/// <summary>
/// Loads in data produced by CanvasPartition.exe.
/// </summary>
/// <param name="infile">Input file.</param>
/// <returns>A list of segments.</returns>
public static List<CanvasSegment> ReadSegments(string infile)
{
    Console.WriteLine("{0} Read segments from {1}", DateTime.Now, infile);
    List<CanvasSegment> segments = new List<CanvasSegment>();
    string chr = null;
    int begin = -1;
    int end = -1;
    int bin = -1;
    List<float> counts = new List<float>();
    using (GzipReader reader = new GzipReader(infile))
    {
        string row = null;
        while ((row = reader.ReadLine()) != null)
        {
            string[] fields = row.Split('\t');
            int currentBin = Convert.ToInt32(fields[4]);

            // We've moved to a new segment
            if (currentBin != bin)
            {
                // Make a segment
                if (bin != -1)
                {
                    CanvasSegment segment = new CanvasSegment(chr, begin, end, counts);
                    segments.Add(segment);
                    counts.Clear();
                }
                chr = fields[0];
                begin = Convert.ToInt32(fields[1]);
                bin = currentBin;
            }
            end = Convert.ToInt32(fields[2]);
            counts.Add(float.Parse(fields[3]));
        }
        if (bin != -1)
        {
            // Add the last segment
            CanvasSegment segment = new CanvasSegment(chr, begin, end, counts);
            segments.Add(segment);
        }
    }
    Console.WriteLine("{0} Loaded {1} segments", DateTime.Now, segments.Count);
    return segments;
}
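// Illustrative note (not part of the original source): ReadSegments groups consecutive rows that
// share the same value in the fifth column (the segment identifier assigned by CanvasPartition).
// In the hypothetical rows below, the first two bins form one segment (chr1:1000-3000) and the
// third bin starts a new one:
//   chr1    1000    2000    25.0    7
//   chr1    2000    3000    26.5    7
//   chr1    3000    4000    55.0    8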
private static void LoadBinCounts(string binnedPath, out List<double> binCounts)
{
    binCounts = new List<double>();
    using (GzipReader reader = new GzipReader(binnedPath))
    {
        string line;
        string[] toks;
        while ((line = reader.ReadLine()) != null)
        {
            toks = line.Split('\t');
            // The bin count is the fourth column of the binned file
            binCounts.Add(double.Parse(toks[3]));
        }
    }
}
private static void LoadBinCounts(string binnedPath, NexteraManifest manifest, out List<double> binCounts,
    out List<int> onTargetIndices)
{
    binCounts = new List<double>();
    onTargetIndices = new List<int>();

    var regionsByChrom = manifest.GetManifestRegionsByChromosome();
    string currChrom = null;
    List<NexteraManifest.ManifestRegion> regions = null; // 1-based regions
    int regionIndex = -1;
    bool onTarget = false;
    using (GzipReader reader = new GzipReader(binnedPath))
    {
        string line;
        string[] toks;
        int binIdx = 0;
        while ((line = reader.ReadLine()) != null)
        {
            toks = line.Split('\t');
            string chrom = toks[0];
            int start = int.Parse(toks[1]); // 0-based, inclusive
            int stop = int.Parse(toks[2]); // 0-based, exclusive
            if (currChrom != chrom)
            {
                currChrom = chrom;
                onTarget = false;
                if (!regionsByChrom.ContainsKey(currChrom))
                {
                    regions = null;
                }
                else
                {
                    regions = regionsByChrom[currChrom];
                    regionIndex = 0;
                }
            }
            // Advance past manifest regions that end before this bin starts
            while (regions != null && regionIndex < regions.Count && regions[regionIndex].End < start + 1)
            {
                regionIndex++;
            }
            if (regions != null && regionIndex < regions.Count && regions[regionIndex].Start <= stop) // overlap
            {
                onTarget = true;
            }
            else
            {
                onTarget = false;
            }

            if (onTarget)
            {
                onTargetIndices.Add(binIdx);
            }

            binCounts.Add(double.Parse(toks[3]));
            binIdx++;
        }
    }
}
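// Illustrative sketch (not from the original source): the overlap test above effectively converts
// the 0-based, half-open bin [start, stop) to the 1-based, inclusive interval [start + 1, stop]
// before comparing it with the 1-based manifest regions. For a bin with start = 100, stop = 200 and
// a manifest region spanning 150-250, the check passes (region.End 250 >= 101 and
// region.Start 150 <= 200), so the bin is counted as on-target.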
/// <summary>
/// Opens the file
/// </summary>
public override void Open(string filename)
{
    // Construct the reader first so that IsOpen is only set once the file has actually been opened
    reader = new GzipReader(filename);
    IsOpen = true;
}
/// <summary>
/// Intersect bins with the targeted regions defined in callset.Manifest.
/// Assumes that the targeted regions do not overlap, that the bins are sorted by genomic location,
/// and that the bins do not overlap.
/// </summary>
/// <param name="callset"></param>
/// <param name="partitionedPath">Output of CanvasPartition. Bins are assumed to be sorted</param>
/// <returns></returns>
private IFileLocation IntersectBinsWithTargetedRegions(CanvasCallset callset, IFileLocation partitionedPath)
{
    if (!partitionedPath.Exists) { return partitionedPath; }
    var rawPartitionedPath = partitionedPath.AppendName(".raw");
    if (rawPartitionedPath.Exists) { rawPartitionedPath.Delete(); }
    partitionedPath.MoveTo(rawPartitionedPath);

    Dictionary<string, List<NexteraManifest.ManifestRegion>> manifestRegionsByChrom = callset.Manifest.GetManifestRegionsByChromosome();

    // CanvasPartition output file is in the BED format
    //   start: 0-based, inclusive
    //   end: 0-based, exclusive
    // Manifest
    //   start: 1-based, inclusive
    //   end: 1-based, inclusive
    using (GzipReader reader = new GzipReader(rawPartitionedPath.FullName))
    using (GzipWriter writer = new GzipWriter(partitionedPath.FullName))
    {
        string currentChrom = null;
        int manifestRegionIdx = 0;
        string line;
        string[] toks;
        while ((line = reader.ReadLine()) != null)
        {
            toks = line.Split('\t');
            string chrom = toks[0];
            int start = int.Parse(toks[1]) + 1; // 1-based, inclusive
            int end = int.Parse(toks[2]); // 1-based, inclusive
            if (chrom != currentChrom)
            {
                currentChrom = chrom;
                manifestRegionIdx = 0;
            }
            if (!manifestRegionsByChrom.ContainsKey(currentChrom)) { continue; }
            while (manifestRegionIdx < manifestRegionsByChrom[currentChrom].Count &&
                manifestRegionsByChrom[currentChrom][manifestRegionIdx].End < start) // |- manifest region -| |- bin -|
            {
                manifestRegionIdx++;
            }
            if (manifestRegionIdx >= manifestRegionsByChrom[currentChrom].Count || // |- last manifest region -|   |- bin -|
                end < manifestRegionsByChrom[currentChrom][manifestRegionIdx].Start) // |- bin -| |- manifest region -|
            {
                continue; // skip bin
            }
            // |-      bin      -|
            //      |- manifest region -|
            while (manifestRegionIdx < manifestRegionsByChrom[currentChrom].Count &&
                end >= manifestRegionsByChrom[currentChrom][manifestRegionIdx].Start)
            {
                // calculate intersection
                int intersectionStart = Math.Max(start, manifestRegionsByChrom[currentChrom][manifestRegionIdx].Start); // 1-based, inclusive
                int intersectionEnd = Math.Min(end, manifestRegionsByChrom[currentChrom][manifestRegionIdx].End); // 1-based, inclusive
                // start/end in BED format
                toks[1] = String.Format("{0}", intersectionStart - 1); // 0-based, inclusive
                toks[2] = String.Format("{0}", intersectionEnd); // 0-based, exclusive
                // write intersected bin
                writer.WriteLine(String.Join("\t", toks));
                manifestRegionIdx++;
            }
        }
    }

    return partitionedPath;
}
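// Illustrative sketch (not from the original source): coordinate handling for one intersected bin.
// A BED bin written as start = 100, end = 200 becomes the 1-based interval [101, 200]. Intersected
// with a manifest region spanning 150-250 (1-based, inclusive), the overlap is [150, 200], which is
// written back out in BED coordinates as start = 149, end = 200.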
private static void GetBinRatio(string tumorBinnedPath, string normalBinnedPath, string ratioBinnedPath,
    string ploidyBedPath, NexteraManifest manifest = null)
{
    PloidyInfo referencePloidy = String.IsNullOrEmpty(ploidyBedPath) ? null : PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);
    double tumorMedian = (new BinCounts(tumorBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
    double normalMedian = (new BinCounts(normalBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
    double librarySizeFactor = (tumorMedian > 0 && normalMedian > 0) ? normalMedian / tumorMedian : 1;

    using (GzipReader tumorReader = new GzipReader(tumorBinnedPath))
    using (GzipReader normalReader = new GzipReader(normalBinnedPath))
    using (GzipWriter writer = new GzipWriter(ratioBinnedPath))
    {
        string normalLine;
        string tumorLine;
        string[] normalToks;
        string[] tumorToks;
        double normalCount;
        double tumorCount;
        double ratio;
        while ((normalLine = normalReader.ReadLine()) != null)
        {
            tumorLine = tumorReader.ReadLine();
            normalToks = normalLine.Split('\t');
            tumorToks = tumorLine.Split('\t');
            normalCount = double.Parse(normalToks[3]);
            tumorCount = double.Parse(tumorToks[3]);
            // The weighted average count of a bin could be less than 1.
            // Using these small counts for coverage normalization creates large ratios.
            // It would be better to just drop these bins so we don't introduce too much noise
            // into segmentation and CNV calling.
            if (normalCount < 1) { continue; } // skip the bin
            string chrom = normalToks[0];
            int start = int.Parse(normalToks[1]);
            int end = int.Parse(normalToks[2]);
            // get the normal ploidy from intervalsWithPloidyByChrom
            double factor = CanvasDiploidBinRatioFactor * GetPloidy(referencePloidy, chrom, start, end) / 2.0;
            ratio = tumorCount / normalCount * factor * librarySizeFactor;
            normalToks[3] = String.Format("{0}", ratio);
            writer.WriteLine(String.Join("\t", normalToks));
        }
    }
}
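// Illustrative sketch (not from the original source): the value written above is
//   ratio = (tumorCount / normalCount) * (CanvasDiploidBinRatioFactor * ploidy / 2) * librarySizeFactor
// For a hypothetical bin with tumorCount = 90, normalCount = 60, reference ploidy 2 and
// librarySizeFactor = 1, the result is 1.5 * CanvasDiploidBinRatioFactor, i.e. 1.5 times the value
// expected for a diploid region.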
/// <summary>
/// Opens the VCF file and reads the header
/// </summary>
private void Open(string vcfPath, bool skipHeader)
{
    // sanity check: make sure the vcf file exists
    if (!File.Exists(vcfPath))
    {
        throw new FileNotFoundException(string.Format("The specified vcf file ({0}) does not exist.", vcfPath));
    }

    Reader = new GzipReader(vcfPath);
    IsOpen = true;
    if (skipHeader)
    {
        this.Samples.Add("Sample");
    }
    else
    {
        ParseHeader();
    }
}