/// <summary>
/// Builds a reference bin-count file for the sample: the sample's bin counts are
/// centered on the model mean, projected onto the model axes, un-centered, and
/// finally scaled by the on-target median ratio reported by the ratio calculator.
/// </summary>
/// <param name="outputFile">Location the scaled reference bin counts are written to.</param>
public void Run(IFileLocation outputFile)
{
    List<SampleGenomicBin> sampleBins = CanvasIO.ReadFromTextFile(_sampleBinnedFile.FullName);
    VerifyBinOrder(sampleBins);

    // set bin count to 1 if less than 1
    foreach (var bin in sampleBins)
    {
        bin.Count = Math.Max(1, bin.Count);
    }

    // center the sample
    var centeredSampleVector = Enumerable.Zip(sampleBins, _model.Mu, (bin, mu) => (double)bin.Count - mu.Count).ToArray();

    // project onto the axes
    var projectedSampleVector = CanvasCommon.Utilities.Project(centeredSampleVector, _model.Axes);

    // undo centering and set bin count to 1 if less than 1.
    // Materialized once because the vector is consumed twice below (temporary file and final output).
    var referenceVector = Enumerable.Zip(_model.Mu, projectedSampleVector, (bin, count) => Math.Max(1, bin.Count + count)).ToArray();

    // write temporary reference count file and calculate the median ratio against it
    double medianRatio;
    var tempReferenceFile = new FileLocation(Path.GetTempFileName());
    try
    {
        var tempReferenceBins = Enumerable.Zip(sampleBins, referenceVector,
            (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)count));
        CanvasIO.WriteToTextFile(tempReferenceFile.FullName, tempReferenceBins);

        // calculate median ratio (read before the temporary file is removed)
        var ratios = new BinCounts(_ratioCalculator.Run(_sampleBinnedFile, tempReferenceFile), _manifest);
        medianRatio = ratios.OnTargetMedianBinCount;
    }
    finally
    {
        // delete temporary reference count file even when writing or the ratio calculation fails
        tempReferenceFile.Delete();
    }

    // multiply reference counts by the median ratio
    var referenceBins = Enumerable.Zip(sampleBins, referenceVector,
        (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)(count * medianRatio)));

    // write reference count file
    CanvasIO.WriteToTextFile(outputFile.FullName, referenceBins);
}
/// <summary>
/// Command-line entry point: reads binned counts, optionally removes outliers and
/// large bins, optionally performs GC normalization and FFPE outlier filtering,
/// and writes the resulting bins to the output file.
/// </summary>
/// <param name="args">Command-line arguments (see the option set below).</param>
/// <returns>
/// 0 on success or after showing help (including when the input or output file
/// argument is missing); 1 when the input file does not exist or the
/// -w/--weightedmedian value is not an integer.
/// </returns>
static int Main(string[] args)
{
    Utilities.LogCommandLine(args);
    string inFile = null;
    string outFile = null;
    bool doGCnorm = false;
    bool doSizeFilter = false;
    bool doOutlierRemoval = false;
    string ffpeOutliersFile = null;
    string manifestFile = null;
    string weightedMedianArg = null;
    CanvasGCNormalizationMode gcNormalizationMode = CanvasGCNormalizationMode.MedianByGC;
    string modeDescription = String.Format("gc normalization mode. Available modes: {0}. Default: {1}",
        String.Join(", ", Enum.GetValues(typeof(CanvasGCNormalizationMode)).Cast<CanvasGCNormalizationMode>()),
        gcNormalizationMode);
    bool needHelp = false;

    OptionSet p = new OptionSet()
    {
        { "i|infile=", "input file - usually generated by CanvasBin", v => inFile = v },
        { "o|outfile=", "text file to output containing cleaned bins", v => outFile = v },
        { "g|gcnorm", "perform GC normalization", v => doGCnorm = v != null },
        { "s|filtsize", "filter out genomically large bins", v => doSizeFilter = v != null },
        { "r|outliers", "filter outlier points", v => doOutlierRemoval = v != null },
        { "f|ffpeoutliers=", "filter regions of FFPE biases", v => ffpeOutliersFile = v },
        { "t|manifest=", "Nextera manifest file", v => manifestFile = v },
        // The raw value is captured here and validated after parsing so that a
        // malformed number yields a clean error instead of an exception thrown
        // from inside the option callback.
        { "w|weightedmedian=", "Minimum number of bins per GC required to calculate weighted median", v => weightedMedianArg = v },
        { "m|mode=", modeDescription, v => gcNormalizationMode = Utilities.ParseCanvasGCNormalizationMode(v) },
        { "h|help", "show this message and exit", v => needHelp = v != null },
    };

    p.Parse(args);

    if (needHelp)
    {
        ShowHelp(p);
        return 0;
    }

    if (weightedMedianArg != null)
    {
        int parsedMinBins;
        if (!int.TryParse(weightedMedianArg, System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture, out parsedMinBins))
        {
            Console.Error.WriteLine("CanvasClean.exe: Invalid value '{0}' for --weightedmedian; expected an integer. Exiting.", weightedMedianArg);
            return 1;
        }
        minNumberOfBinsPerGCForWeightedMedian = parsedMinBins;
    }

    // NOTE(review): missing required arguments exit with code 0, same as an explicit
    // help request; kept unchanged because callers may rely on it — confirm whether
    // a non-zero exit code is preferable here.
    if (inFile == null || outFile == null)
    {
        ShowHelp(p);
        return 0;
    }

    // Does the input file exist?
    if (!File.Exists(inFile))
    {
        Console.WriteLine("CanvasClean.exe: File {0} does not exist! Exiting.", inFile);
        return 1;
    }

    List<SampleGenomicBin> bins = CanvasIO.ReadFromTextFile(inFile);

    if (doOutlierRemoval)
    {
        bins = RemoveOutliers(bins);
    }

    if (doSizeFilter)
    {
        bins = RemoveBigBins(bins);
    }

    // do not run FFPE outlier removal on targeted/low coverage data
    if (ffpeOutliersFile != null && bins.Count < 50000)
    {
        ffpeOutliersFile = null;
    }

    // estimate localSD metric to use in doFFPEOutlierRemoval later and write to a text file
    double LocalSD = -1.0;
    if (ffpeOutliersFile != null)
    {
        LocalSD = getLocalStandardDeviation(bins);
        CanvasIO.WriteLocalSDToTextFile(ffpeOutliersFile, LocalSD);
    }

    if (doGCnorm)
    {
        NexteraManifest manifest = manifestFile == null ? null : new NexteraManifest(manifestFile, null, Console.WriteLine);
        // Bins with extreme GC are only stripped in MedianByGC mode; other modes keep all bins.
        List<SampleGenomicBin> strippedBins = gcNormalizationMode == CanvasGCNormalizationMode.MedianByGC
            ? RemoveBinsWithExtremeGC(bins, defaultMinNumberOfBinsPerGC, manifest: manifest)
            : bins;
        if (strippedBins.Count == 0)
        {
            Console.Error.WriteLine("Warning in CanvasClean: Coverage too low to perform GC correction; proceeding without GC correction");
        }
        else
        {
            bins = strippedBins;
            NormalizeByGC(bins, manifest, gcNormalizationMode);
            // Use variance normalization only on large exome panels and whole genome sequencing
            // The threshold is set to 10% of an average number of bins on CanvasClean data
            if (ffpeOutliersFile != null && bins.Count > 500000)
            {
                bool isNormalizeVarianceByGC = NormalizeVarianceByGC(bins, manifest: manifest);
                // If normalization by variance was run (isNormalizeVarianceByGC), perform mean centering by using NormalizeByGC
                if (isNormalizeVarianceByGC)
                {
                    NormalizeByGC(bins, manifest, gcNormalizationMode);
                }
            }
        }
    }

    if (ffpeOutliersFile != null)
    {
        // threshold 20 is derived to separate FF and noisy FFPE samples (derived from a training set of approx. 40 samples)
        List<SampleGenomicBin> LocalMadstrippedBins = RemoveBinsWithExtremeLocalSD(bins, LocalSD, 20, outFile);
        bins = LocalMadstrippedBins;
    }

    CanvasIO.WriteToTextFile(outFile, bins);
    return 0;
}