public void Gzip_Flush()
{
    // Verify behavior of flushing a compression stream
    using (MemoryStream compressed = new MemoryStream())
    {
        using (GzipWriter stream = new GzipWriter(compressed, true))
        {
            stream.Write(s_sampledata, 0, s_sampledata.Length);

            // Get the unflushed length of the compressed stream and flush it
            long unflushed = compressed.Length;
            stream.Flush();

            // The expectation is that the output stream will be longer after the flush
            long flushedonce = compressed.Length;
            Assert.IsTrue(compressed.Length > unflushed);

            // Flushing the same data a second time should not have any impact at all
            stream.Flush();
            Assert.AreEqual(compressed.Length, flushedonce);

            // The stream should still be writable after a flush operation
            stream.Write(s_sampledata, 0, s_sampledata.Length / 10);
        }
    }

    // Verify behavior of flushing a decompression stream
    using (GzipReader stream = new GzipReader(new MemoryStream(s_sampledata)))
    {
        // Flush has no effect on decompression streams; just ensure it doesn't throw
        stream.Flush();
    }
}
public void Gzip_CompressDecompress()
{
    // Start with a MemoryStream created from the sample data
    using (MemoryStream source = new MemoryStream(s_sampledata))
    {
        using (MemoryStream dest = new MemoryStream())
        {
            // Compress the data into the destination memory stream instance
            using (GzipWriter compressor = new GzipWriter(dest, CompressionLevel.Optimal, true))
                source.CopyTo(compressor);

            // The compressed data should be smaller than the source data
            Assert.IsTrue(dest.Length < source.Length);

            source.SetLength(0);    // Clear the source stream
            dest.Position = 0;      // Reset the destination stream

            // Decompress the data back into the source memory stream
            using (GzipReader decompressor = new GzipReader(dest, true))
                decompressor.CopyTo(source);

            // Ensure that the original data has been restored
            Assert.AreEqual(source.Length, s_sampledata.Length);
            Assert.IsTrue(s_sampledata.SequenceEqual(source.ToArray()));
        }
    }
}
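// Illustrative sketch (not part of the test suite above): the same round trip written as
// a stand-alone helper. It uses only constructors shown in these examples --
// GzipWriter(Stream, CompressionLevel, bool leaveOpen) and GzipReader(Stream, bool
// leaveOpen); the RoundTrip method name itself is hypothetical.
private static byte[] RoundTrip(byte[] input)
{
    using (MemoryStream compressed = new MemoryStream())
    {
        // Compress, leaving the backing stream open so it can be rewound and re-read
        using (GzipWriter writer = new GzipWriter(compressed, CompressionLevel.Optimal, true))
            writer.Write(input, 0, input.Length);

        compressed.Position = 0;

        // Decompress into a fresh buffer and hand back the restored bytes
        using (GzipReader reader = new GzipReader(compressed, true))
        using (MemoryStream restored = new MemoryStream())
        {
            reader.CopyTo(restored);
            return restored.ToArray();
        }
    }
}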
public static List<GenomicBin> ReadFromTextFile(string infile)
{
    List<GenomicBin> bins = new List<GenomicBin>();
    using (GzipReader reader = new GzipReader(infile))
    {
        string row;
        while ((row = reader.ReadLine()) != null)
        {
            string[] fields = row.Split('\t');
            string chr = fields[0];
            int start = Convert.ToInt32(fields[1]);
            int stop = Convert.ToInt32(fields[2]);
            //int count = Convert.ToInt32(fields[3]);
            float count = float.Parse(fields[3]);
            int gc = Convert.ToInt32(fields[4]);
            GenomicBin bin = new GenomicBin(chr, start, stop, gc, count);
            bins.Add(bin);
        }
    }
    return bins;
}
protected static Dictionary<string, List<CNInterval>> LoadKnownCNVCF(string oracleVcfPath)
{
    var knownCn = new Dictionary<string, List<CNInterval>>();

    // Load our "oracle" of known copy numbers:
    int count = 0;
    using (GzipReader reader = new GzipReader(oracleVcfPath))
    {
        while (true)
        {
            string fileLine = reader.ReadLine();
            if (fileLine == null) { break; }
            if (fileLine.Length == 0 || fileLine[0] == '#') { continue; }
            var interval = ParseCnInterval(oracleVcfPath, fileLine);
            if (!knownCn.ContainsKey(interval.Chromosome))
            {
                knownCn[interval.Chromosome] = new List<CNInterval>();
            }
            knownCn[interval.Chromosome].Add(interval);
            count++;
        }
    }
    Console.WriteLine(">>>Loaded {0} known-CN intervals", count);
    return knownCn;
}
private static void LoadBinCounts(string binnedPath, NexteraManifest manifest,
    out List<double> binCounts, out List<int> onTargetIndices)
{
    binCounts = new List<double>();
    onTargetIndices = new List<int>();

    var regionsByChrom = manifest.GetManifestRegionsByChromosome();
    string currChrom = null;
    List<NexteraManifest.ManifestRegion> regions = null; // 1-based regions
    int regionIndex = -1;
    bool onTarget = false;
    using (GzipReader reader = new GzipReader(binnedPath))
    {
        string line;
        string[] toks;
        int binIdx = 0;
        while ((line = reader.ReadLine()) != null)
        {
            toks = line.Split('\t');
            string chrom = toks[0];
            int start = int.Parse(toks[1]); // 0-based, inclusive
            int stop = int.Parse(toks[2]);  // 0-based, exclusive
            if (currChrom != chrom)
            {
                currChrom = chrom;
                onTarget = false;
                if (!regionsByChrom.ContainsKey(currChrom))
                {
                    regions = null;
                }
                else
                {
                    regions = regionsByChrom[currChrom];
                    regionIndex = 0;
                }
            }
            while (regions != null && regionIndex < regions.Count && regions[regionIndex].End < start + 1)
            {
                regionIndex++;
            }
            if (regions != null && regionIndex < regions.Count && regions[regionIndex].Start <= stop) // overlap
            {
                onTarget = true;
            }
            else
            {
                onTarget = false;
            }
            if (onTarget)
            {
                onTargetIndices.Add(binIdx);
            }
            binCounts.Add(double.Parse(toks[3]));
            binIdx++;
        }
    }
}
public void Gzip_Seek()
{
    // Verify behavior of a compression stream
    using (GzipWriter stream = new GzipWriter(new MemoryStream()))
    {
        try { stream.Seek(50, SeekOrigin.Begin); Assert.Fail("Method call should have thrown an exception"); }
        catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(NotSupportedException)); }

        try { stream.Seek(0, SeekOrigin.Current); Assert.Fail("Method call should have thrown an exception"); }
        catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(NotSupportedException)); }

        try { stream.Seek(-50, SeekOrigin.End); Assert.Fail("Method call should have thrown an exception"); }
        catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(NotSupportedException)); }
    }

    // Verify behavior of a decompression stream
    using (GzipReader stream = new GzipReader(new MemoryStream(s_sampledata)))
    {
        try { stream.Seek(50, SeekOrigin.Begin); Assert.Fail("Method call should have thrown an exception"); }
        catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(NotSupportedException)); }

        try { stream.Seek(0, SeekOrigin.Current); Assert.Fail("Method call should have thrown an exception"); }
        catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(NotSupportedException)); }

        try { stream.Seek(-50, SeekOrigin.End); Assert.Fail("Method call should have thrown an exception"); }
        catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(NotSupportedException)); }
    }
}
private static void GetWeightedAverageBinCount(IEnumerable<string> binnedPaths, string mergedBinnedPath,
    NexteraManifest manifest = null)
{
    int sampleCount = binnedPaths.Count();
    if (sampleCount == 1) // copy file
    {
        if (File.Exists(binnedPaths.First()))
        {
            if (File.Exists(mergedBinnedPath)) { File.Delete(mergedBinnedPath); }
            File.Copy(binnedPaths.First(), mergedBinnedPath);
        }
    }
    else // merge normal samples
    {
        double[] weights = new double[sampleCount];
        List<double>[] binCountsBySample = new List<double>[sampleCount];
        for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
        {
            string binnedPath = binnedPaths.ElementAt(sampleIndex);
            var binCounts = new BinCounts(binnedPath, manifest: manifest);
            List<double> counts = binCounts.AllCounts;
            // If a manifest is available, get the median of bins overlapping the targeted regions only.
            // For small panels, there could be a lot of bins with zero count and the median would be 0
            // if taken over all the bins, resulting in division by zero.
            double median = binCounts.OnTargetMedianBinCount;
            weights[sampleIndex] = median > 0 ? 1.0 / median : 0;
            binCountsBySample[sampleIndex] = counts;
        }
        double weightSum = weights.Sum();
        for (int i = 0; i < sampleCount; i++) { weights[i] /= weightSum; } // so weights sum to 1

        // Compute the weighted average of bin counts across samples
        using (GzipReader reader = new GzipReader(binnedPaths.First()))
        using (GzipWriter writer = new GzipWriter(mergedBinnedPath))
        {
            string line;
            string[] toks;
            int lineIdx = 0;
            while ((line = reader.ReadLine()) != null)
            {
                toks = line.Split('\t');
                double weightedBinCount = 0;
                for (int i = 0; i < sampleCount; i++)
                {
                    weightedBinCount += weights[i] * binCountsBySample[i][lineIdx];
                }
                toks[3] = String.Format("{0}", weightedBinCount);
                writer.WriteLine(String.Join("\t", toks));
                lineIdx++;
            }
        }
    }
}
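// Worked illustration of the weighting above (hypothetical numbers): with on-target
// median bin counts of 100, 50 and 25 across three normal samples, the raw weights are
// 1/100, 1/50 and 1/25. Their sum is 0.07, so the normalized weights come out to roughly
// 0.143, 0.286 and 0.571, and a bin's merged value is
//
//     0.143 * count_A + 0.286 * count_B + 0.571 * count_C
//
// which is equivalent to averaging depth-normalized counts: each sample contributes in
// inverse proportion to its median coverage, so deeper samples do not dominate the merge.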
/// <summary>
/// Loads in data produced by CanvasPartition.exe.
/// </summary>
/// <param name="infile">Input file.</param>
/// <returns>A list of segments.</returns>
public static List<CanvasSegment> ReadSegments(string infile)
{
    Console.WriteLine("{0} Read segments from {1}", DateTime.Now, infile);
    List<CanvasSegment> segments = new List<CanvasSegment>();

    string chr = null;
    int begin = -1;
    int end = -1;
    int bin = -1;
    List<float> counts = new List<float>();
    using (GzipReader reader = new GzipReader(infile))
    {
        string row = null;
        while ((row = reader.ReadLine()) != null)
        {
            string[] fields = row.Split('\t');
            int currentBin = Convert.ToInt32(fields[4]);

            // We've moved to a new segment
            if (currentBin != bin)
            {
                // Make a segment
                if (bin != -1)
                {
                    CanvasSegment segment = new CanvasSegment(chr, begin, end, counts);
                    segments.Add(segment);
                    counts.Clear();
                }
                chr = fields[0];
                begin = Convert.ToInt32(fields[1]);
                bin = currentBin;
            }
            end = Convert.ToInt32(fields[2]);
            counts.Add(float.Parse(fields[3]));
        }
        if (bin != -1)
        {
            // Add the last segment
            CanvasSegment segment = new CanvasSegment(chr, begin, end, counts);
            segments.Add(segment);
        }
    }
    Console.WriteLine("{0} Loaded {1} segments", DateTime.Now, segments.Count);
    return segments;
}
public void Gzip_CanWrite()
{
    // Verify behavior of a compression stream
    using (GzipWriter stream = new GzipWriter(new MemoryStream()))
    {
        Assert.IsTrue(stream.CanWrite);
    }

    // Verify behavior of a decompression stream
    using (GzipReader stream = new GzipReader(new MemoryStream(s_sampledata)))
    {
        Assert.IsFalse(stream.CanWrite);
    }
}
public void Gzip_ReaderDispose()
{
    byte[] buffer = new byte[8192]; // 8KiB data buffer

    // Create a dummy stream and immediately dispose of it
    GzipReader stream = new GzipReader(new MemoryStream(s_sampledata));
    stream.Dispose();

    // Test double dispose
    stream.Dispose();

    // All properties and methods should throw an ObjectDisposedException
    try { var bs = stream.BaseStream; Assert.Fail("Property access should have thrown an exception"); }
    catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ObjectDisposedException)); }

    try { var b = stream.CanRead; Assert.Fail("Property access should have thrown an exception"); }
    catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ObjectDisposedException)); }

    try { var b = stream.CanSeek; Assert.Fail("Property access should have thrown an exception"); }
    catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ObjectDisposedException)); }

    try { var b = stream.CanWrite; Assert.Fail("Property access should have thrown an exception"); }
    catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ObjectDisposedException)); }

    try { stream.Flush(); Assert.Fail("Method call should have thrown an exception"); }
    catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ObjectDisposedException)); }

    try { var l = stream.Length; Assert.Fail("Property access should have thrown an exception"); }
    catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ObjectDisposedException)); }

    try { var l = stream.Position; Assert.Fail("Property access should have thrown an exception"); }
    catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ObjectDisposedException)); }

    try { stream.Position = 12345L; Assert.Fail("Property access should have thrown an exception"); }
    catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ObjectDisposedException)); }

    try { stream.Read(buffer, 0, 8192); Assert.Fail("Method call should have thrown an exception"); }
    catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ObjectDisposedException)); }

    try { stream.Seek(0, SeekOrigin.Current); Assert.Fail("Method call should have thrown an exception"); }
    catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ObjectDisposedException)); }

    try { stream.SetLength(12345L); Assert.Fail("Method call should have thrown an exception"); }
    catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ObjectDisposedException)); }

    try { stream.Write(buffer, 0, 8192); Assert.Fail("Method call should have thrown an exception"); }
    catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ObjectDisposedException)); }
}
public void Gzip_DecompressExternal()
{
    // Decompress a stream created externally to this library
    using (GzipReader reader = new GzipReader(Assembly.GetExecutingAssembly()
        .GetManifestResourceStream("zuki.io.compression.test.thethreemusketeers.gz")))
    {
        using (MemoryStream dest = new MemoryStream())
        {
            reader.CopyTo(dest);
            dest.Flush();

            // Verify that the output matches the sample data byte-for-byte
            Assert.IsTrue(Enumerable.SequenceEqual(s_sampledata, dest.ToArray()));
        }
    }
}
private static void LoadBinCounts(string binnedPath, out List<double> binCounts)
{
    binCounts = new List<double>();

    using (GzipReader reader = new GzipReader(binnedPath))
    {
        string line;
        string[] toks;
        while ((line = reader.ReadLine()) != null)
        {
            toks = line.Split('\t');
            binCounts.Add(double.Parse(toks[3]));
        }
    }
}
/// <summary>
/// Assumes that the rows are sorted by start position in ascending order
/// </summary>
private void ReadBEDInput()
{
    GenomicBinFilter binFilter = new GenomicBinFilter(ForbiddenIntervalBedPath);

    try
    {
        Dictionary<string, List<uint>> startByChr = new Dictionary<string, List<uint>>(),
            endByChr = new Dictionary<string, List<uint>>();
        Dictionary<string, List<double>> scoreByChr = new Dictionary<string, List<double>>();
        using (GzipReader reader = new GzipReader(this.InputBinPath))
        {
            string line;
            string[] tokens;
            while ((line = reader.ReadLine()) != null)
            {
                tokens = line.Split('\t');
                string chrom = tokens[Segmentation.idxChr].Trim();
                uint start = Convert.ToUInt32(tokens[Segmentation.idxStart].Trim());
                uint end = Convert.ToUInt32(tokens[Segmentation.idxEnd].Trim());
                if (binFilter.SkipBin(chrom, start, end)) { continue; }
                if (!startByChr.ContainsKey(chrom))
                {
                    startByChr.Add(chrom, new List<uint>());
                    endByChr.Add(chrom, new List<uint>());
                    scoreByChr.Add(chrom, new List<double>());
                }
                startByChr[chrom].Add(start);
                endByChr[chrom].Add(end);
                scoreByChr[chrom].Add(Convert.ToDouble(tokens[this.idxScore].Trim()));
            }
            foreach (string chr in startByChr.Keys)
            {
                this.StartByChr[chr] = startByChr[chr].ToArray();
                this.EndByChr[chr] = endByChr[chr].ToArray();
                this.ScoreByChr[chr] = scoreByChr[chr].ToArray();
            }
        }
    }
    catch (Exception e)
    {
        Console.Error.WriteLine("File {0} could not be read:", this.InputBinPath);
        Console.Error.WriteLine(e.Message);
        Environment.Exit(1);
    }
}
public void Gzip_SetLength()
{
    // Verify behavior of a compression stream
    using (GzipWriter stream = new GzipWriter(new MemoryStream()))
    {
        try { stream.SetLength(12345L); Assert.Fail("Method call should have thrown an exception"); }
        catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(NotSupportedException)); }
    }

    // Verify behavior of a decompression stream
    using (GzipReader stream = new GzipReader(new MemoryStream(s_sampledata)))
    {
        try { stream.SetLength(12345L); Assert.Fail("Method call should have thrown an exception"); }
        catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(NotSupportedException)); }
    }
}
public static PloidyInfo LoadPloidyFromBedFile(string filePath)
{
    PloidyInfo ploidy = new PloidyInfo();
    if (string.IsNullOrEmpty(filePath)) { return ploidy; }

    int count = 0;
    using (GzipReader reader = new GzipReader(filePath))
    {
        while (true)
        {
            string fileLine = reader.ReadLine();
            if (fileLine == null) { break; }

            // Save anything that looks like a vcf header line (we will add it to the output vcf)
            // TODO: support adding multiple header lines to the output vcf
            if (fileLine.StartsWith("##"))
            {
                ploidy.HeaderLine = fileLine.Trim();
                continue;
            }
            if (fileLine.Length == 0 || fileLine[0] == '#') { continue; }

            string[] bits = fileLine.Split('\t');
            string chromosome = bits[0];
            if (!ploidy.PloidyByChromosome.ContainsKey(chromosome))
            {
                ploidy.PloidyByChromosome[chromosome] = new List<PloidyInterval>();
            }
            PloidyInterval interval = new PloidyInterval(chromosome);
            interval.Start = int.Parse(bits[1]);
            interval.End = int.Parse(bits[2]);
            interval.Ploidy = int.Parse(bits[4]);
            ploidy.PloidyByChromosome[chromosome].Add(interval);
            count++;
        }
    }
    Console.WriteLine("Reference ploidy: Loaded {0} intervals across {1} chromosomes",
        count, ploidy.PloidyByChromosome.Keys.Count);
    return ploidy;
}
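// Illustrative only: the tab-delimited layout LoadPloidyFromBedFile expects, inferred
// from the parsing above (chromosome, start, end, an unused fourth column, ploidy);
// any leading "##" line is kept as the vcf header line. The file name and values here
// are hypothetical:
//
//   ##ExpectedSexChromosomeKaryotype=XY
//   chrX    0    155270560    .    1
//   chrY    0    59373566     .    1
//
//   PloidyInfo ploidy = LoadPloidyFromBedFile("ploidy.bed");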
public void Gzip_BaseStream()
{
    using (MemoryStream source = new MemoryStream())
    {
        using (GzipWriter stream = new GzipWriter(source, CompressionLevel.Optimal))
        {
            Assert.IsNotNull(stream.BaseStream);
            Assert.AreSame(source, stream.BaseStream);
        }

        using (GzipReader stream = new GzipReader(source, true))
        {
            Assert.IsNotNull(stream.BaseStream);
            Assert.AreSame(source, stream.BaseStream);
        }
    }
}
public void Gzip_GzipException()
{
    using (MemoryStream compressed = new MemoryStream())
    {
        // Start with a compressed MemoryStream created from the sample data
        using (GzipWriter compressor = new GzipWriter(compressed, CompressionLevel.Optimal, true))
        {
            compressor.Write(s_sampledata, 0, s_sampledata.Length);
            compressor.Flush();
        }

        byte[] buffer = new byte[8192];
        GzipException thrown = null;
        GzipException deserialized = null;

        // Create a decompressor to test exception cases
        using (GzipReader decompressor = new GzipReader(compressed, true))
        {
            // Attempting to read from the middle of the compressed stream should throw a GzipException
            compressed.Position = compressed.Length / 2;
            try { decompressor.Read(buffer, 0, 8192); Assert.Fail("Method call should have thrown an exception"); }
            catch (GzipException ex) { thrown = ex; }

            Assert.IsNotNull(thrown);
            Assert.IsInstanceOfType(thrown, typeof(GzipException));

            // Check the error code property
            Assert.AreEqual(-3, thrown.ErrorCode); // Z_DATA_ERROR (-3)

            // Serialize and deserialize the exception with a BinaryFormatter
            BinaryFormatter formatter = new BinaryFormatter();
            using (MemoryStream memstream = new MemoryStream())
            {
                formatter.Serialize(memstream, thrown);
                memstream.Seek(0, SeekOrigin.Begin);
                deserialized = (GzipException)formatter.Deserialize(memstream);
            }

            // Check that the exceptions are equivalent
            Assert.AreEqual(thrown.ErrorCode, deserialized.ErrorCode);
            Assert.AreEqual(thrown.StackTrace, deserialized.StackTrace);
            Assert.AreEqual(thrown.ToString(), deserialized.ToString());
        }
    }
}
private static void GetBinRatio(string tumorBinnedPath, string normalBinnedPath, string ratioBinnedPath,
    string ploidyBedPath, NexteraManifest manifest = null)
{
    PloidyInfo referencePloidy = String.IsNullOrEmpty(ploidyBedPath)
        ? null
        : PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);
    double tumorMedian = (new BinCounts(tumorBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
    double normalMedian = (new BinCounts(normalBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
    double librarySizeFactor = (tumorMedian > 0 && normalMedian > 0) ? normalMedian / tumorMedian : 1;

    using (GzipReader tumorReader = new GzipReader(tumorBinnedPath))
    using (GzipReader normalReader = new GzipReader(normalBinnedPath))
    using (GzipWriter writer = new GzipWriter(ratioBinnedPath))
    {
        string normalLine;
        string tumorLine;
        string[] normalToks;
        string[] tumorToks;
        double normalCount;
        double tumorCount;
        double ratio;
        while ((normalLine = normalReader.ReadLine()) != null)
        {
            tumorLine = tumorReader.ReadLine();
            normalToks = normalLine.Split('\t');
            tumorToks = tumorLine.Split('\t');
            normalCount = double.Parse(normalToks[3]);
            tumorCount = double.Parse(tumorToks[3]);
            // The weighted average count of a bin could be less than 1.
            // Using these small counts for coverage normalization creates large ratios.
            // It would be better to just drop these bins so we don't introduce too much noise
            // into segmentation and CNV calling.
            if (normalCount < 1) { continue; } // skip the bin

            string chrom = normalToks[0];
            int start = int.Parse(normalToks[1]);
            int end = int.Parse(normalToks[2]);
            // Get the normal ploidy from intervalsWithPloidyByChrom
            double factor = CanvasDiploidBinRatioFactor * GetPloidy(referencePloidy, chrom, start, end) / 2.0;
            ratio = tumorCount / normalCount * factor * librarySizeFactor;
            normalToks[3] = String.Format("{0}", ratio);
            writer.WriteLine(String.Join("\t", normalToks));
        }
    }
}
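// Worked illustration of the ratio computed above (hypothetical numbers): for a bin with
// tumorCount = 60 and normalCount = 30 in a region of reference ploidy 2 (so
// factor = CanvasDiploidBinRatioFactor * 2 / 2) and librarySizeFactor = 1, the value
// written is (60 / 30) * CanvasDiploidBinRatioFactor. A bin with equal tumor and normal
// counts would be written as CanvasDiploidBinRatioFactor itself, so twice that baseline
// corresponds to the tumor carrying twice the diploid copy number at this bin.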
public void Gzip_Write()
{
    byte[] buffer = new byte[8192]; // 8KiB data buffer

    // Compress the sample data using a call to Write directly
    using (MemoryStream compressed = new MemoryStream())
    {
        // Check the constructor for ArgumentNullException while we're here
        try
        {
            using (GzipWriter compressor = new GzipWriter(null)) { }
            Assert.Fail("Constructor should have thrown an exception");
        }
        catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ArgumentNullException)); }

        using (GzipWriter compressor = new GzipWriter(compressed, CompressionLevel.Optimal, true))
        {
            // Send in some bum arguments to Write() to check they are caught
            try { compressor.Write(null); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ArgumentNullException)); }

            try { compressor.Write(null, 0, 0); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ArgumentNullException)); }

            try { compressor.Write(s_sampledata, -1, 0); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ArgumentOutOfRangeException)); }

            try { compressor.Write(s_sampledata, 0, -1); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ArgumentOutOfRangeException)); }

            try { compressor.Write(s_sampledata, 0, s_sampledata.Length + 1024); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ArgumentException)); }

            // Not writing anything shouldn't throw an exception
            compressor.Write(s_sampledata, 0, 0);

            // Compress the data; there really isn't much that can go wrong with Write() itself
            compressor.Write(s_sampledata, 0, s_sampledata.Length);
            compressor.Flush();
        }

        using (GzipReader reader = new GzipReader(compressed, true))
        {
            try { reader.Write(buffer, 0, buffer.Length); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(NotSupportedException)); }
        }
    }
}
/// <summary>
/// Opens the vcf file and reads the header
/// </summary>
private void Open(string vcfPath, bool skipHeader)
{
    // Sanity check: make sure the vcf file exists
    if (!File.Exists(vcfPath))
    {
        throw new FileNotFoundException(string.Format("The specified vcf file ({0}) does not exist.", vcfPath));
    }

    Reader = new GzipReader(vcfPath);
    IsOpen = true;
    if (skipHeader)
    {
        this.Samples.Add("Sample");
    }
    else
    {
        ParseHeader();
    }
}
/// <summary>
/// Assumes that the rows are sorted by start position in ascending order
/// </summary>
private void ReadBEDInput()
{
    try
    {
        Dictionary<string, List<uint>> startByChr = new Dictionary<string, List<uint>>(),
            endByChr = new Dictionary<string, List<uint>>();
        Dictionary<string, List<double>> scoreByChr = new Dictionary<string, List<double>>();
        // Create a reader for the input file; the using statement also closes it.
        using (GzipReader reader = new GzipReader(this.InputBinPath))
        {
            string line;
            string[] tokens;
            while ((line = reader.ReadLine()) != null)
            {
                tokens = line.Split('\t');
                string chr = tokens[Segmentation.idxChr].Trim();
                if (!startByChr.ContainsKey(chr))
                {
                    startByChr.Add(chr, new List<uint>());
                    endByChr.Add(chr, new List<uint>());
                    scoreByChr.Add(chr, new List<double>());
                }
                startByChr[chr].Add(Convert.ToUInt32(tokens[Segmentation.idxStart].Trim()));
                endByChr[chr].Add(Convert.ToUInt32(tokens[Segmentation.idxEnd].Trim()));
                scoreByChr[chr].Add(Convert.ToDouble(tokens[this.idxScore].Trim()));
            }
            foreach (string chr in startByChr.Keys)
            {
                this.StartByChr[chr] = startByChr[chr].ToArray();
                this.EndByChr[chr] = endByChr[chr].ToArray();
                this.ScoreByChr[chr] = scoreByChr[chr].ToArray();
            }
        }
    }
    catch (Exception e)
    {
        Console.Error.WriteLine("File {0} could not be read:", this.InputBinPath);
        Console.Error.WriteLine(e.Message);
        Environment.Exit(1);
    }
}
public void Gzip_Position()
{
    // Start with a MemoryStream created from the sample data
    using (MemoryStream source = new MemoryStream(s_sampledata))
    {
        using (MemoryStream dest = new MemoryStream())
        {
            // Test a compression stream
            using (GzipWriter compressor = new GzipWriter(dest, CompressionLevel.Optimal, true))
            {
                // The stream should report position zero prior to compression
                Assert.AreEqual(0L, compressor.Position);
                source.CopyTo(compressor);

                // The stream should report a non-zero position after compression
                Assert.AreNotEqual(0L, compressor.Position);

                // Attempting to set the position on the stream should throw
                try { compressor.Position = 12345L; Assert.Fail("Property should have thrown an exception"); }
                catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(NotSupportedException)); }
            }

            source.SetLength(0);    // Clear the source stream
            dest.Position = 0;      // Reset the destination stream

            // Test a decompression stream
            using (GzipReader decompressor = new GzipReader(dest, true))
            {
                // The stream should report position zero prior to decompression
                Assert.AreEqual(0L, decompressor.Position);
                decompressor.CopyTo(source);

                // The stream should report a non-zero position after decompression
                Assert.AreNotEqual(0L, decompressor.Position);

                // Attempting to set the position on the stream should throw
                try { decompressor.Position = 12345L; Assert.Fail("Property should have thrown an exception"); }
                catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(NotSupportedException)); }
            }
        }
    }
}
private static void LoadModel(IFileLocation modelFile, out List<SampleGenomicBin> mu, out List<double[]> axes)
{
    mu = new List<SampleGenomicBin>();
    axes = new List<double[]>();
    List<List<double>> tempAxes = new List<List<double>>();
    using (GzipReader reader = new GzipReader(modelFile.FullName))
    {
        string line = reader.ReadLine();
        for (int i = 0; i < line.Split('\t').Length - 4; i++) // initialize axes
        {
            tempAxes.Add(new List<double>());
        }
        while (line != null)
        {
            string[] toks = line.Split('\t');
            string chrom = toks[0];
            int start = int.Parse(toks[1]);
            int stop = int.Parse(toks[2]);
            float mean = float.Parse(toks[3]);
            mu.Add(new SampleGenomicBin(chrom, start, stop, -1, mean));
            for (int i = 0; i < tempAxes.Count; i++)
            {
                tempAxes[i].Add(double.Parse(toks[i + 4]));
            }
            line = reader.ReadLine();
        }
    }
    foreach (var axis in tempAxes)
    {
        axes.Add(CanvasCommon.Utilities.NormalizeBy2Norm(axis.ToArray()));
    }
    if (!AreOrthogonal(axes))
    {
        throw new Illumina.Common.IlluminaException(
            String.Format("Axes are not orthogonal to each other in {0}.", modelFile.FullName));
    }
}
public static IEnumerable<SampleGenomicBin> IterateThroughTextFile(string infile)
{
    using (GzipReader reader = new GzipReader(infile))
    {
        string row;
        while ((row = reader.ReadLine()) != null)
        {
            string[] fields = row.Split('\t');
            string chr = fields[0];
            int start = Convert.ToInt32(fields[1]);
            int stop = Convert.ToInt32(fields[2]);
            float count = float.Parse(fields[3]);
            int gc = Convert.ToInt32(fields[4]);
            SampleGenomicBin bin = new SampleGenomicBin(chr, start, stop, gc, count);
            yield return bin;
        }
    }
}
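// Hedged usage sketch for the streaming reader above: because IterateThroughTextFile is
// lazy (yield return), bins can be consumed one at a time without materializing the whole
// file in memory. The CountBins helper and its file argument are hypothetical; only the
// iterator itself comes from the code above.
public static int CountBins(string infile)
{
    int total = 0;
    foreach (SampleGenomicBin bin in IterateThroughTextFile(infile))
    {
        total++; // each iteration parses exactly one tab-delimited row into a bin
    }
    return total;
}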
public static PloidyInfo LoadPloidyFromBedFile(string filePath)
{
    PloidyInfo ploidy = new PloidyInfo();

    int count = 0;
    using (GzipReader reader = new GzipReader(filePath))
    {
        while (true)
        {
            string fileLine = reader.ReadLine();
            if (fileLine == null) { break; }
            if (fileLine.StartsWith("##ExpectedSexChromosomeKaryotype"))
            {
                ploidy.HeaderLine = fileLine.Trim();
                continue;
            }
            if (fileLine.Length == 0 || fileLine[0] == '#') { continue; }

            string[] bits = fileLine.Split('\t');
            string chromosome = bits[0];
            if (!ploidy.PloidyByChromosome.ContainsKey(chromosome))
            {
                ploidy.PloidyByChromosome[chromosome] = new List<PloidyInterval>();
            }
            PloidyInterval interval = new PloidyInterval();
            interval.Start = int.Parse(bits[1]);
            interval.End = int.Parse(bits[2]);
            interval.Ploidy = int.Parse(bits[4]);
            ploidy.PloidyByChromosome[chromosome].Add(interval);
            count++;
        }
    }
    Console.WriteLine("Reference ploidy: Loaded {0} intervals across {1} chromosomes",
        count, ploidy.PloidyByChromosome.Keys.Count);
    return ploidy;
}
/// <summary>
/// Parse the outputs of CanvasSNV, and note these variant frequencies in the appropriate segment.
/// </summary>
public static float LoadVariantFrequencies(string variantFrequencyFile, List<CanvasSegment> segments)
{
    Console.WriteLine("{0} Load variant frequencies from {1}", DateTime.Now, variantFrequencyFile);
    int count = 0;
    Dictionary<string, List<CanvasSegment>> segmentsByChromosome = CanvasSegment.GetSegmentsByChromosome(segments);
    Dictionary<string, string> alternativeNames = GetChromosomeAlternativeNames(segmentsByChromosome.Keys);
    long totalCoverage = 0;
    int totalRecords = 0;
    using (GzipReader reader = new GzipReader(variantFrequencyFile))
    {
        while (true)
        {
            string fileLine = reader.ReadLine();
            if (fileLine == null) { break; }
            if (fileLine.Length == 0 || fileLine[0] == '#') { continue; } // Skip headers
            string[] bits = fileLine.Split('\t');
            if (bits.Length < 6)
            {
                Console.Error.WriteLine("* Bad line in {0}: '{1}'", variantFrequencyFile, fileLine);
                continue;
            }
            string chromosome = bits[0];
            if (!segmentsByChromosome.ContainsKey(chromosome))
            {
                if (alternativeNames.ContainsKey(chromosome))
                {
                    chromosome = alternativeNames[chromosome];
                }
                else
                {
                    continue;
                }
            }
            int position = int.Parse(bits[1]); // 1-based (from the input VCF to Canvas SNV)
            int countRef = int.Parse(bits[4]);
            int countAlt = int.Parse(bits[5]);
            if (countRef + countAlt < 10) { continue; }
            float VF = countAlt / (float)(countRef + countAlt);

            // Binary search for the segment this variant hits:
            List<CanvasSegment> chrSegments = segmentsByChromosome[chromosome];
            int start = 0;
            int end = chrSegments.Count - 1;
            int mid = (start + end) / 2;
            while (start <= end)
            {
                if (chrSegments[mid].End < position) // CanvasSegment.End is already 1-based
                {
                    start = mid + 1;
                    mid = (start + end) / 2;
                    continue;
                }
                if (chrSegments[mid].Begin + 1 > position) // Convert CanvasSegment.Begin to 1-based by adding 1
                {
                    end = mid - 1;
                    mid = (start + end) / 2;
                    continue;
                }
                chrSegments[mid].VariantFrequencies.Add(VF);
                chrSegments[mid].VariantTotalCoverage.Add(countRef + countAlt);
                count++;
                totalCoverage += (countRef + countAlt); // use only coverage information in segments
                totalRecords++;
                break;
            }
        }
    }
    float meanCoverage = 0;
    if (totalRecords > 0)
    {
        meanCoverage = totalCoverage / Math.Max(1f, totalRecords);
    }
    Console.WriteLine("{0} Loaded a total of {1} usable variant frequencies", DateTime.Now, count);
    return meanCoverage;
}
protected void LoadKnownCNVCF(string oracleVCFPath)
{
    bool stripChr = false;

    // Load our "oracle" of known copy numbers:
    this.KnownCN = new Dictionary<string, List<CNInterval>>();
    int count = 0;
    using (GzipReader reader = new GzipReader(oracleVCFPath))
    {
        while (true)
        {
            string fileLine = reader.ReadLine();
            if (fileLine == null) { break; }
            if (fileLine.Length == 0 || fileLine[0] == '#') { continue; }
            string[] bits = fileLine.Split('\t');
            if (bits.Length == 1 && bits[0].Trim().Length == 0) { continue; } // skip empty lines!
            string chromosome = bits[0];
            if (stripChr) { chromosome = chromosome.Replace("chr", ""); }
            if (!KnownCN.ContainsKey(chromosome)) { KnownCN[chromosome] = new List<CNInterval>(); }

            CNInterval interval = new CNInterval();
            interval.Start = int.Parse(bits[1]);
            interval.CN = -1;
            string[] infoBits = bits[7].Split(';');
            foreach (string subBit in infoBits)
            {
                if (subBit.StartsWith("CN="))
                {
                    float tempCN = float.Parse(subBit.Substring(3));
                    if (subBit.EndsWith(".5"))
                    {
                        interval.CN = (int)Math.Round(tempCN + 0.1); // round X.5 up to X+1
                    }
                    else
                    {
                        interval.CN = (int)Math.Round(tempCN); // Round off
                    }
                }
                if (subBit.StartsWith("END="))
                {
                    interval.End = int.Parse(subBit.Substring(4));
                }
            }
            // Parse CN from Canvas output:
            if (bits.Length > 8)
            {
                string[] subBits = bits[8].Split(':');
                string[] subBits2 = bits[9].Split(':');
                for (int subBitIndex = 0; subBitIndex < subBits.Length; subBitIndex++)
                {
                    if (subBits[subBitIndex] == "CN")
                    {
                        interval.CN = int.Parse(subBits2[subBitIndex]);
                    }
                }
            }
            if (interval.End == 0 || interval.CN < 0)
            {
                Console.WriteLine("Error - bogus record!");
                Console.WriteLine(fileLine);
            }
            else
            {
                KnownCN[chromosome].Add(interval);
                count++;
            }
        }
    }
    Console.WriteLine(">>>Loaded {0} known-CN intervals", count);
}
/// <summary>
/// Loads .cleaned bed files, merges bins from multiple samples and returns GenomicBin objects with MultiSampleCount
/// </summary>
public static Dictionary<string, List<MultiSampleGenomicBin>> MergeMultiSampleCleanedBedFile(
    List<IFileLocation> canvasCleanBedPaths)
{
    // Initialize variables to hold multi-sample bed files
    Dictionary<string, List<MultiSampleGenomicBin>> multiSampleGenomicBins =
        new Dictionary<string, List<MultiSampleGenomicBin>>();
    Dictionary<string, Dictionary<int, int>> start = new Dictionary<string, Dictionary<int, int>>();
    Dictionary<string, Dictionary<int, int>> stop = new Dictionary<string, Dictionary<int, int>>();
    Dictionary<string, Dictionary<int, List<float>>> binCounts =
        new Dictionary<string, Dictionary<int, List<float>>>();
    List<int> counts = new List<int>();
    HashSet<string> chromosomes = new HashSet<string>();

    Console.WriteLine("Merge and normalize CanvasClean bed files");
    foreach (IFileLocation bedPath in canvasCleanBedPaths)
    {
        int count = 0;
        using (GzipReader reader = new GzipReader(bedPath.FullName))
        {
            while (true)
            {
                string fileLine = reader.ReadLine();
                if (fileLine == null) { break; }
                string[] lineBedFile = fileLine.Split('\t');
                string chr = lineBedFile[0];
                if (!chromosomes.Contains(chr)) { chromosomes.Add(chr); }
                count++;
            }
        }
        counts.Add(count);
        Console.WriteLine($"count {count}");
    }

    foreach (string chr in chromosomes)
    {
        start[chr] = new Dictionary<int, int>();
        stop[chr] = new Dictionary<int, int>();
        binCounts[chr] = new Dictionary<int, List<float>>();
    }

    // Read counts and segmentIDs
    foreach (IFileLocation bedPath in canvasCleanBedPaths)
    {
        Console.WriteLine(bedPath);
        using (GzipReader reader = new GzipReader(bedPath.FullName))
        {
            while (true)
            {
                string fileLine = reader.ReadLine();
                if (fileLine == null) { break; }
                string[] lineBedFile = fileLine.Split('\t');
                string chr = lineBedFile[0];
                int pos = int.Parse(lineBedFile[1]);
                start[chr][pos] = pos;
                stop[chr][pos] = int.Parse(lineBedFile[2]);
                if (binCounts[chr].ContainsKey(pos))
                {
                    binCounts[chr][pos].Add(float.Parse(lineBedFile[3]));
                }
                else
                {
                    binCounts[chr][pos] = new List<float> { float.Parse(lineBedFile[3]) };
                }
            }
        }
    }

    Console.WriteLine("create GenomeBin intervals");
    // Create GenomeBin intervals
    foreach (string chr in chromosomes)
    {
        if (!multiSampleGenomicBins.ContainsKey(chr))
        {
            multiSampleGenomicBins[chr] = new List<MultiSampleGenomicBin>();
        }
        var binStartPositions = start[chr].Keys.ToList();
        foreach (var binStartPosition in binStartPositions)
        {
            // If an outlier is removed in one sample, remove it in all samples
            if (binCounts[chr][binStartPosition].Count < canvasCleanBedPaths.Count) { continue; }
            if (binStartPosition < 0)
            {
                throw new Illumina.Common.IlluminaException("Start must be non-negative");
            }
            if (binStartPosition >= stop[chr][binStartPosition]) // Do not allow empty intervals
            {
                throw new Illumina.Common.IlluminaException("Start must be less than Stop");
            }
            GenomicBin interval = new GenomicBin(chr, new BedInterval(binStartPosition, stop[chr][binStartPosition]));
            multiSampleGenomicBins[chr].Add(new MultiSampleGenomicBin(interval, binCounts[chr][binStartPosition]));
        }
    }
    return multiSampleGenomicBins;
}
/// <summary>
/// Loads in data produced by CanvasPartition.exe.
/// </summary>
/// <param name="infile">Input file.</param>
/// <returns>A list of segments.</returns>
public static List<CanvasSegment> ReadSegments(string infile)
{
    Console.WriteLine("{0} Read segments from {1}", DateTime.Now, infile);
    List<CanvasSegment> segments = new List<CanvasSegment>();

    string chr = null;
    int begin = -1;
    int previousSegmentIndex = -1;
    int previousBinStart = 0;
    int previousBinEnd = 0;
    List<float> counts = new List<float>();
    Tuple<int, int> segmentStartCI = null;
    using (GzipReader reader = new GzipReader(infile))
    {
        string row = null;
        while ((row = reader.ReadLine()) != null)
        {
            string[] fields = row.Split('\t');
            int currentSegmentIndex = Convert.ToInt32(fields[4]);
            int newBinStart = Convert.ToInt32(fields[1]);
            int newBinEnd = Convert.ToInt32(fields[2]);

            // We've moved to a new segment
            if (currentSegmentIndex != previousSegmentIndex)
            {
                // Make a segment
                if (previousSegmentIndex != -1)
                {
                    CanvasSegment segment = new CanvasSegment(chr, begin, previousBinEnd, counts);
                    // Prepare the confidence interval for the end of the segment that just ended,
                    // based on the size of its last bin (and, if the segments abut, based on the
                    // size of the next segment's first bin):
                    int CIEnd1 = -(previousBinEnd - previousBinStart) / 2;
                    int CIEnd2 = -CIEnd1;
                    if (previousBinEnd == newBinStart) { CIEnd2 = (newBinEnd - newBinStart) / 2; }
                    segment.EndConfidenceInterval = new Tuple<int, int>(CIEnd1, CIEnd2);
                    segment.StartConfidenceInterval = segmentStartCI;
                    segments.Add(segment);
                    counts.Clear();

                    // Prepare the confidence interval for the start of the segment that just started,
                    // based on the size of its first bin (and, if the segments abut, based on the
                    // size of the previous segment's last bin):
                    int CIStart2 = (newBinEnd - newBinStart) / 2;
                    int CIStart1 = -CIStart2;
                    if (previousBinEnd == newBinStart) { CIStart1 = -(previousBinEnd - previousBinStart) / 2; }
                    segmentStartCI = new Tuple<int, int>(CIStart1, CIStart2);
                }
                else
                {
                    int interval = (newBinEnd - newBinStart) / 2;
                    segmentStartCI = new Tuple<int, int>(-interval, interval);
                }
                chr = fields[0];
                begin = Convert.ToInt32(fields[1]);
                previousSegmentIndex = currentSegmentIndex;
            }
            previousBinStart = newBinStart;
            previousBinEnd = newBinEnd;
            counts.Add(float.Parse(fields[3]));
        }
        if (previousSegmentIndex != -1)
        {
            // Add the last segment
            CanvasSegment segment = new CanvasSegment(chr, begin, previousBinEnd, counts);
            segments.Add(segment);
            segment.StartConfidenceInterval = segmentStartCI;
        }
    }
    Console.WriteLine("{0} Loaded {1} segments", DateTime.Now, segments.Count);
    return segments;
}
public void Gzip_Read()
{
    byte[] buffer = new byte[8192]; // 8KiB data buffer

    using (MemoryStream compressed = new MemoryStream())
    {
        // Start with a compressed MemoryStream created from the sample data
        using (GzipWriter compressor = new GzipWriter(compressed, CompressionLevel.Optimal, true))
        {
            try { compressor.Read(buffer, 0, 8192); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(NotSupportedException)); }

            compressor.Write(s_sampledata, 0, s_sampledata.Length);
            compressor.Flush();
        }

        // Check the constructor for ArgumentNullException while we're here
        try
        {
            using (GzipReader decompressor = new GzipReader(null, false)) { }
            Assert.Fail("Constructor should have thrown an exception");
        }
        catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ArgumentNullException)); }

        // Create a decompressor to test some of the error cases
        using (GzipReader decompressor = new GzipReader(compressed, true))
        {
            // Send in some bum arguments to Read() to check they are caught
            try { decompressor.Read(null, 0, 0); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ArgumentNullException)); }

            try { decompressor.Read(buffer, -1, 0); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ArgumentOutOfRangeException)); }

            try { decompressor.Read(buffer, 0, -1); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ArgumentOutOfRangeException)); }

            try { decompressor.Read(buffer, 0, buffer.Length + 1024); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(ArgumentException)); }

            // Attempting to read from the end of the compressed stream should throw an InvalidDataException
            try { decompressor.Read(buffer, 0, 8192); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(InvalidDataException)); }

            // Attempting to read from the middle of the compressed stream should throw a GzipException
            compressed.Position = compressed.Position / 2;
            try { decompressor.Read(buffer, 0, 8192); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(GzipException)); }

            // The decompression stream is trashed at this point since the input buffer was filled
            // with data from the middle. Thought about a special case handler for that, but it's
            // a fringe case. Verify that the stream is indeed trashed ...
            compressed.Position = 0;
            try { decompressor.Read(buffer, 0, 8192); Assert.Fail("Method call should have thrown an exception"); }
            catch (Exception ex) { Assert.IsInstanceOfType(ex, typeof(GzipException)); }
        }

        // Create a new decompressor against the same stream and make sure it doesn't throw
        using (GzipReader decompressor = new GzipReader(compressed, true))
        {
            // Reading zero bytes should not throw an exception
            decompressor.Read(buffer, 0, 0);

            while (decompressor.Read(buffer, 0, 8192) != 0) { }
        }
    }
}