/// <summary>
/// Passes <paramref name="ratios"/> through the two-argument RatiosToCounts overload and
/// writes the result to <paramref name="outputPath"/> as a text file.
/// </summary>
/// <param name="ratios">Ratio bins to convert.</param>
/// <param name="referencePloidyBedFile">
/// Optional reference ploidy BED file. It is loaded only when it is non-null and exists;
/// otherwise the conversion is given a null ploidy.
/// </param>
/// <param name="outputPath">Location of the text file to write.</param>
public static void RatiosToCounts(IEnumerable<SampleGenomicBin> ratios, IFileLocation referencePloidyBedFile, IFileLocation outputPath)
{
    bool ploidyBedAvailable = referencePloidyBedFile != null && referencePloidyBedFile.Exists;
    PloidyInfo ploidy = ploidyBedAvailable
        ? PloidyInfo.LoadPloidyFromBedFile(referencePloidyBedFile.FullName)
        : null;

    string destination = outputPath.FullName;
    CanvasIO.WriteToTextFile(destination, RatiosToCounts(ratios, ploidy));
}
/// <summary>
/// Builds a reference count file for the sample held in <c>_sampleBinnedFile</c>.
/// The sample counts are centered on the model mean (<c>_model.Mu</c>), projected onto
/// <c>_model.Axes</c>, un-centered again, and finally scaled by the on-target median of
/// the sample/reference ratios before being written out.
/// </summary>
/// <param name="outputFile">Location of the reference count file to write.</param>
public void Run(IFileLocation outputFile)
{
    List<SampleGenomicBin> sampleBins = CanvasIO.ReadFromTextFile(_sampleBinnedFile.FullName);
    VerifyBinOrder(sampleBins);

    // set bin count to 1 if less than 1
    foreach (var bin in sampleBins)
    {
        bin.Count = Math.Max(1, bin.Count);
    }

    // center the sample
    var centeredSampleVector = Enumerable.Zip(sampleBins, _model.Mu, (bin, mu) => (double)bin.Count - mu.Count).ToArray();

    // project onto the axes
    var projectedSampleVector = CanvasCommon.Utilities.Project(centeredSampleVector, _model.Axes);

    // undo centering and set bin count to 1 if less than 1
    // Materialized once: the vector is consumed twice below (temporary file and final output),
    // and a deferred Zip would otherwise be re-evaluated on each pass.
    var referenceVector = Enumerable.Zip(_model.Mu, projectedSampleVector, (bin, count) => Math.Max(1, bin.Count + count)).ToArray();

    // Path.GetTempFileName() creates the file on disk, so it must be removed on every exit path.
    double medianRatio;
    var tempReferenceFile = new FileLocation(Path.GetTempFileName());
    try
    {
        // write temporary reference count file
        var tempReferenceBins = Enumerable.Zip(sampleBins, referenceVector,
            (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)count));
        CanvasIO.WriteToTextFile(tempReferenceFile.FullName, tempReferenceBins);

        // calculate median ratio (read while the temporary file still exists)
        var ratios = new BinCounts(_ratioCalculator.Run(_sampleBinnedFile, tempReferenceFile), _manifest);
        medianRatio = ratios.OnTargetMedianBinCount;
    }
    finally
    {
        // delete temporary reference count file, even if writing or the ratio calculation failed
        tempReferenceFile.Delete();
    }

    // multiply reference counts by the median ratio
    var referenceBins = Enumerable.Zip(sampleBins, referenceVector,
        (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)(count * medianRatio)));

    // write reference count file
    CanvasIO.WriteToTextFile(outputFile.FullName, referenceBins);
}
/// <summary>
/// Reads bins grouped by chromosome from <paramref name="inputFile"/>, smooths each
/// chromosome's bins in parallel with a <c>RepeatedMedianSmoother</c>, and writes the
/// smoothed bins to <paramref name="outputFile"/>.
/// </summary>
/// <param name="inputFile">Text file holding the input bins.</param>
/// <param name="outputFile">Text file that receives the smoothed bins.</param>
/// <returns>Always 0.</returns>
public int Run(IFileLocation inputFile, IFileLocation outputFile)
{
    var inputBins = CanvasIO.GetGenomicBinsByChrom(inputFile.FullName);
    var medianSmoother = new RepeatedMedianSmoother(MaxHalfWindowSize);
    var chromosomeNames = inputBins.Keys;

    // Each chromosome is smoothed independently; results are collected in a concurrent map
    // because the writes come from parallel workers.
    var smoothedByChromosome = new ConcurrentDictionary<string, List<SampleGenomicBin>>();

    Console.WriteLine("Launch smoothing jobs...");
    Parallel.ForEach(chromosomeNames, name => smoothedByChromosome[name] = medianSmoother.Smooth(inputBins[name]));
    Console.WriteLine("Completed smoothing jobs.");

    // Emit the smoothed bins in the key order of the input dictionary.
    var smoothedBins = from name in chromosomeNames
                       from bin in smoothedByChromosome[name]
                       select bin;
    CanvasIO.WriteToTextFile(outputFile.FullName, smoothedBins);
    return 0;
}
/// <summary>
/// Command-line entry point. Parses the options, reads the input bins, applies the
/// requested cleaning steps in a fixed order (outlier removal, size filter, GC
/// normalization, FFPE local-SD filter) and writes the resulting bins to the output file.
/// </summary>
/// <param name="args">Command-line arguments.</param>
/// <returns>
/// 0 on success, and also when help is shown (explicitly requested, or because the input
/// or output path is missing); 1 when the input file does not exist.
/// </returns>
static int Main(string[] args)
{
    Utilities.LogCommandLine(args);

    string inputPath = null;
    string outputPath = null;
    string ffpeOutliersPath = null;
    string manifestPath = null;
    bool gcNormRequested = false;
    bool sizeFilterRequested = false;
    bool outlierRemovalRequested = false;
    bool helpRequested = false;
    CanvasGCNormalizationMode normalizationMode = CanvasGCNormalizationMode.MedianByGC;

    // The help text is built before parsing, so it reports the built-in default mode.
    string availableModes = String.Join(", ", Enum.GetValues(typeof(CanvasGCNormalizationMode)).Cast<CanvasGCNormalizationMode>());
    string modeHelp = String.Format("gc normalization mode. Available modes: {0}. Default: {1}", availableModes, normalizationMode);

    OptionSet options = new OptionSet()
    {
        { "i|infile=", "input file - usually generated by CanvasBin", v => inputPath = v },
        { "o|outfile=", "text file to output containing cleaned bins", v => outputPath = v },
        { "g|gcnorm", "perform GC normalization", v => gcNormRequested = v != null },
        { "s|filtsize", "filter out genomically large bins", v => sizeFilterRequested = v != null },
        { "r|outliers", "filter outlier points", v => outlierRemovalRequested = v != null },
        { "f|ffpeoutliers=", "filter regions of FFPE biases", v => ffpeOutliersPath = v },
        { "t|manifest=", "Nextera manifest file", v => manifestPath = v },
        { "w|weightedmedian=", "Minimum number of bins per GC required to calculate weighted median", v => minNumberOfBinsPerGCForWeightedMedian = int.Parse(v) },
        { "m|mode=", modeHelp, v => normalizationMode = Utilities.ParseCanvasGCNormalizationMode(v) },
        { "h|help", "show this message and exit", v => helpRequested = v != null },
    };
    options.Parse(args);

    // Help is shown both on request and when a required path is missing; both exit with 0.
    bool requiredPathMissing = inputPath == null || outputPath == null;
    if (helpRequested || requiredPathMissing)
    {
        ShowHelp(options);
        return 0;
    }

    // Does the input file exist?
    if (!File.Exists(inputPath))
    {
        Console.WriteLine("CanvasClean.exe: File {0} does not exist! Exiting.", inputPath);
        return 1;
    }

    List<SampleGenomicBin> bins = CanvasIO.ReadFromTextFile(inputPath);

    if (outlierRemovalRequested)
    {
        bins = RemoveOutliers(bins);
    }

    if (sizeFilterRequested)
    {
        bins = RemoveBigBins(bins);
    }

    // do not run FFPE outlier removal on targeted/low coverage data
    if (bins.Count < 50000)
    {
        ffpeOutliersPath = null;
    }

    // estimate the local SD metric now (before GC normalization changes the bins);
    // it is written to a text file and reused by the FFPE filter at the end
    double localSD = -1.0;
    if (ffpeOutliersPath != null)
    {
        localSD = getLocalStandardDeviation(bins);
        CanvasIO.WriteLocalSDToTextFile(ffpeOutliersPath, localSD);
    }

    if (gcNormRequested)
    {
        NexteraManifest manifest = manifestPath == null ? null : new NexteraManifest(manifestPath, null, Console.WriteLine);

        // Only the MedianByGC mode strips bins with extreme GC before normalizing.
        List<SampleGenomicBin> gcFilteredBins = bins;
        if (normalizationMode == CanvasGCNormalizationMode.MedianByGC)
        {
            gcFilteredBins = RemoveBinsWithExtremeGC(bins, defaultMinNumberOfBinsPerGC, manifest: manifest);
        }

        if (gcFilteredBins.Count != 0)
        {
            bins = gcFilteredBins;
            NormalizeByGC(bins, manifest, normalizationMode);

            // Use variance normalization only on large exome panels and whole genome sequencing.
            // The threshold is set to 10% of an average number of bins on CanvasClean data.
            // If variance normalization actually ran, re-center the mean with NormalizeByGC.
            if (ffpeOutliersPath != null && bins.Count > 500000 && NormalizeVarianceByGC(bins, manifest: manifest))
            {
                NormalizeByGC(bins, manifest, normalizationMode);
            }
        }
        else
        {
            Console.Error.WriteLine("Warning in CanvasClean: Coverage too low to perform GC correction; proceeding without GC correction");
        }
    }

    if (ffpeOutliersPath != null)
    {
        // threshold 20 is derived to separate FF and noisy FFPE samples
        // (derived from a training set of approx. 40 samples)
        bins = RemoveBinsWithExtremeLocalSD(bins, localSD, 20, outputPath);
    }

    CanvasIO.WriteToTextFile(outputPath, bins);
    return 0;
}
/// <summary>
/// Performs fragment binning: runs one <c>BinTask</c> per chromosome that has predefined
/// bins, in parallel, then writes the predefined bins of all those chromosomes to
/// <c>parameters.outFile</c> in the chromosome order returned by <c>GetChromosomesInBam</c>.
/// </summary>
/// <returns>Always 0; failures are reported by throwing.</returns>
/// <exception cref="Illumina.Common.IlluminaException">
/// Thrown when no predefined bins file is given, the reads are not paired-end, the
/// predefined bins name chromosomes absent from the BAM, or no usable fragments are counted.
/// </exception>
public int Bin()
{
    if (parameters.predefinedBinsFile == null)
    {
        throw new Illumina.Common.IlluminaException("Predefined bins in BED is required for fragment binning.");
    }
    if (!parameters.isPairedEnd) // Janus-SRS-189
    {
        throw new Illumina.Common.IlluminaException("Paired-end reads are required for fragment binning.");
    }

    Dictionary<string, List<SampleGenomicBin>> binsByChromosome = Utilities.LoadBedFile(parameters.predefinedBinsFile, gcIndex: 3);
    List<string> bamChromosomes = GetChromosomesInBam(); // used to order chromosomes
    if (!Utilities.IsSubset(binsByChromosome.Keys, bamChromosomes))
    {
        string message = String.Format("Not all chromosomes in {0} are found in {1}.", parameters.predefinedBinsFile, parameters.bamFile);
        throw new Illumina.Common.IlluminaException(message);
    }

    // Count fragments by chromosome: only chromosomes with predefined bins get a task.
    List<string> binnedChromosomes = bamChromosomes.Where(chrom => binsByChromosome.ContainsKey(chrom)).ToList();
    List<BinTask> tasks = binnedChromosomes
        .Select(chrom => new BinTask(parameters.referenceFile, chrom, parameters.bamFile, binsByChromosome[chrom]))
        .ToList();

    Console.WriteLine("Launch fragment binning jobs...");
    Console.Out.WriteLine();
    Parallel.ForEach(tasks, task => { task.DoIt(); });
    Console.WriteLine("Completed fragment binning jobs.");
    Console.Out.WriteLine();

    long usableFragmentCount = tasks.Sum(task => task.UsableFragmentCount);
    if (usableFragmentCount == 0)
    {
        throw new Illumina.Common.IlluminaException(String.Format("No passing-filter fragments overlapping bins are found in {0}", parameters.bamFile));
    }

    // Aggregate bins in BAM chromosome order.
    List<SampleGenomicBin> finalBins = binnedChromosomes.SelectMany(chrom => binsByChromosome[chrom]).ToList();

    // Output!
    CanvasIO.WriteToTextFile(parameters.outFile, finalBins);
    return 0;
}
/// <summary>
/// Implements the Canvas binning algorithm.
/// With no intermediate paths, a single genomic interval is binned and the method returns.
/// Otherwise the intermediate files are deserialized, the bin size is derived if it was not
/// given, and the binned counts are written to <c>parameters.outFile</c>.
/// </summary>
/// <param name="parameters">Binning parameters; <c>binSize</c> is overwritten when it is -1.</param>
/// <returns>Always 0.</returns>
public static int Run(CanvasBinParameters parameters)
{
    // One BitArray per chromosome, as long as the chromosome it represents.
    // A position is 'true' if the mer starting at that position is unique in the genome.
    var possibleAlignments = new Dictionary<string, BitArray>();

    // One HitArray per chromosome, as long as the chromosome it represents.
    // A position holds the number of times the mer starting there is observed in the SAM file.
    var observedAlignments = new Dictionary<string, HitArray>();

    // One Int16 array per chromosome, as long as the chromosome it represents.
    // A value at a given index is the fragment length of the read starting at that index.
    var fragmentLengths = new Dictionary<string, Int16[]>();

    if (parameters.intermediatePaths.Count == 0)
    {
        BinOneGenomicInterval(parameters, possibleAlignments, observedAlignments, fragmentLengths);
        return 0;
    }

    // Load our intermediate data files.
    var pendingFiles = new List<string>(parameters.intermediatePaths);
    object semaphore = new object(); // control access to possibleAlignments, observedAlignments, fragmentLengths

    // Limit # of deserialization threads to avoid (rare) protobuf issue
    // (Environment.ProcessorCount was used previously).
    // NOTE(review): with a limit above 1 the loop below would spin without sleeping while the
    // last workers drain and no files remain; harmless at the current limit of 1.
    int maxWorkers = 1;
    var workers = new List<Thread>();

    Console.WriteLine("Start deserialization:");
    Console.Out.Flush();
    while (workers.Count > 0 || pendingFiles.Count > 0)
    {
        // Remove defunct threads:
        workers.RemoveAll(worker => !worker.IsAlive);

        if (workers.Count == maxWorkers)
        {
            Thread.Sleep(1000);
            continue;
        }

        while (pendingFiles.Count > 0 && workers.Count < maxWorkers)
        {
            string path = pendingFiles[0];
            var worker = new Thread(() => DeserializeCanvasData(path, possibleAlignments, observedAlignments, fragmentLengths, semaphore, parameters.coverageMode))
            {
                Name = "CanvasBin " + path
            };
            workers.Add(worker);
            Console.WriteLine(worker.Name);
            worker.Start();
            pendingFiles.RemoveAt(0);
        }
    }
    Console.WriteLine("{0} Deserialization complete", DateTime.Now);
    Console.Out.Flush();

    NexteraManifest manifest = null;
    if (parameters.manifestFile != null)
    {
        manifest = new NexteraManifest(parameters.manifestFile, null, Console.WriteLine);
    }

    if (parameters.binSize == -1)
    {
        // Turn the desired # of alignments per bin into the number of possible alignments expected per bin.
        parameters.binSize = CalculateNumberOfPossibleAlignmentsPerBin(parameters.countsPerBin, possibleAlignments, observedAlignments, manifest: manifest);
    }

    if (parameters.binSizeOnly)
    {
        // Write bin size to file and stop; no binning is performed in this mode.
        System.IO.File.WriteAllText(parameters.outFile + ".binsize", "" + parameters.binSize);
        return 0;
    }

    // Read predefined bins, if a file was supplied.
    Dictionary<string, List<GenomicBin>> predefinedBins = parameters.predefinedBinsFile != null
        ? Utilities.LoadBedFile(parameters.predefinedBinsFile)
        : null;

    // Bin alignments.
    List<GenomicBin> bins = BinCounts(parameters.referenceFile, parameters.binSize, parameters.coverageMode, manifest,
        possibleAlignments, observedAlignments, fragmentLengths, predefinedBins, parameters.outFile);

    // Output!
    Console.WriteLine("{0} Output binned counts:", DateTime.Now);
    CanvasIO.WriteToTextFile(parameters.outFile, bins);
    Console.WriteLine("{0} Output complete", DateTime.Now);
    Console.Out.Flush();
    return 0;
}