/// <summary>
/// Builds a reference bin-count file for the sample and writes it to <paramref name="outputFile"/>.
/// </summary>
/// <remarks>
/// Steps, in order: read the sample bins, clamp counts to at least 1, subtract the model mean
/// (<c>_model.Mu</c>), pass the centered vector through <c>CanvasCommon.Utilities.Project</c> with
/// <c>_model.Axes</c>, add the mean back (again clamped to at least 1), and finally scale the result by
/// the on-target median of the ratios computed against a temporary, unscaled copy of the reference.
/// The temporary file is removed even when an intermediate step throws.
/// </remarks>
/// <param name="outputFile">Location the scaled reference bin counts are written to.</param>
public void Run(IFileLocation outputFile)
        {
            List<SampleGenomicBin> sampleBins = CanvasIO.ReadFromTextFile(_sampleBinnedFile.FullName);

            VerifyBinOrder(sampleBins);

            // set bin count to 1 if less than 1
            foreach (var bin in sampleBins)
            {
                bin.Count = Math.Max(1, bin.Count);
            }

            // center the sample
            var centeredSampleVector = Enumerable.Zip(sampleBins, _model.Mu, (bin, mu) => (double)bin.Count - mu.Count).ToArray();

            // project onto the axes
            var projectedSampleVector = CanvasCommon.Utilities.Project(centeredSampleVector, _model.Axes);

            // undo centering and set bin count to 1 if less than 1.
            // Materialized once: the vector is consumed twice below (temporary file and final output),
            // and a deferred Zip query would redo the computation on every enumeration.
            var referenceVector = Enumerable.Zip(_model.Mu, projectedSampleVector, (bin, count) => Math.Max(1, bin.Count + count)).ToArray();

            // calculate median ratio against a temporary, unscaled reference count file
            double medianRatio;
            var tempReferenceFile = new FileLocation(Path.GetTempFileName());
            try
            {
                var tempReferenceBins = Enumerable.Zip(sampleBins, referenceVector,
                                                       (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)count));

                CanvasIO.WriteToTextFile(tempReferenceFile.FullName, tempReferenceBins);

                var ratios = new BinCounts(_ratioCalculator.Run(_sampleBinnedFile, tempReferenceFile), _manifest);
                medianRatio = ratios.OnTargetMedianBinCount;
            }
            finally
            {
                // Path.GetTempFileName() has already created the file on disk, so it must be
                // cleaned up on failure paths as well as on success.
                tempReferenceFile.Delete();
            }

            // multiply reference counts by the median ratio
            var referenceBins = Enumerable.Zip(sampleBins, referenceVector,
                                               (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)(count * medianRatio)));

            // write reference count file
            CanvasIO.WriteToTextFile(outputFile.FullName, referenceBins);
        }
Beispiel #2
0
        /// <summary>
        /// Command-line entry point: parses the options, reads the input bins, applies the requested
        /// filtering and GC-normalization steps in a fixed order, and writes the resulting bins to the
        /// output file.
        /// </summary>
        /// <param name="args">Raw command-line arguments.</param>
        /// <returns>
        /// 1 when the input file does not exist; 0 otherwise, including the cases where only the help
        /// text is shown (help requested, or the input or output file option is missing).
        /// </returns>
        static int Main(string[] args)
        {
            Utilities.LogCommandLine(args);

            // Values filled in by the option callbacks below.
            string inputPath = null;
            string outputPath = null;
            string ffpeOutliersPath = null;
            string manifestPath = null;
            bool gcNormRequested = false;
            bool sizeFilterRequested = false;
            bool outlierRemovalRequested = false;
            bool helpRequested = false;
            CanvasGCNormalizationMode gcNormalizationMode = CanvasGCNormalizationMode.MedianByGC;

            // The help text lists every enum member and the default captured before parsing.
            string availableModes = String.Join(", ", Enum.GetValues(typeof(CanvasGCNormalizationMode)).Cast<CanvasGCNormalizationMode>());
            string modeDescription = String.Format("gc normalization mode. Available modes: {0}. Default: {1}",
                                                   availableModes,
                                                   gcNormalizationMode);

            var options = new OptionSet
            {
                { "i|infile=", "input file - usually generated by CanvasBin", v => inputPath = v },
                { "o|outfile=", "text file to output containing cleaned bins", v => outputPath = v },
                { "g|gcnorm", "perform GC normalization", v => gcNormRequested = v != null },
                { "s|filtsize", "filter out genomically large bins", v => sizeFilterRequested = v != null },
                { "r|outliers", "filter outlier points", v => outlierRemovalRequested = v != null },
                { "f|ffpeoutliers=", "filter regions of FFPE biases", v => ffpeOutliersPath = v },
                { "t|manifest=", "Nextera manifest file", v => manifestPath = v },
                { "w|weightedmedian=", "Minimum number of bins per GC required to calculate weighted median", v => minNumberOfBinsPerGCForWeightedMedian = int.Parse(v) },
                { "m|mode=", modeDescription, v => gcNormalizationMode = Utilities.ParseCanvasGCNormalizationMode(v) },
                { "h|help", "show this message and exit", v => helpRequested = v != null },
            };

            options.Parse(args);

            // Show usage and stop when help was asked for or a required file option is absent.
            if (helpRequested || inputPath == null || outputPath == null)
            {
                ShowHelp(options);
                return 0;
            }

            // Does the input file exist?
            if (!File.Exists(inputPath))
            {
                Console.WriteLine("CanvasClean.exe: File {0} does not exist! Exiting.", inputPath);
                return 1;
            }

            List<SampleGenomicBin> bins = CanvasIO.ReadFromTextFile(inputPath);

            if (outlierRemovalRequested)
            {
                bins = RemoveOutliers(bins);
            }

            if (sizeFilterRequested)
            {
                bins = RemoveBigBins(bins);
            }

            // do not run FFPE outlier removal on targeted/low coverage data:
            // it stays enabled only when a file was given and at least 50000 bins remain here
            bool ffpeOutlierRemovalEnabled = ffpeOutliersPath != null && bins.Count >= 50000;

            // estimate the localSD metric used by the FFPE outlier removal at the end and write it to a text file
            double localSd = -1.0;
            if (ffpeOutlierRemovalEnabled)
            {
                localSd = getLocalStandardDeviation(bins);
                CanvasIO.WriteLocalSDToTextFile(ffpeOutliersPath, localSd);
            }

            if (gcNormRequested)
            {
                NexteraManifest manifest = null;
                if (manifestPath != null)
                {
                    manifest = new NexteraManifest(manifestPath, null, Console.WriteLine);
                }

                // Only the MedianByGC mode strips bins with extreme GC before normalizing.
                List<SampleGenomicBin> strippedBins = bins;
                if (gcNormalizationMode == CanvasGCNormalizationMode.MedianByGC)
                {
                    strippedBins = RemoveBinsWithExtremeGC(bins, defaultMinNumberOfBinsPerGC, manifest: manifest);
                }

                if (strippedBins.Count != 0)
                {
                    bins = strippedBins;
                    NormalizeByGC(bins, manifest, gcNormalizationMode);

                    // Use variance normalization only on large exome panels and whole genome sequencing.
                    // The threshold is set to 10% of an average number of bins on CanvasClean data.
                    // Short-circuiting keeps NormalizeVarianceByGC from running unless both checks pass.
                    bool varianceNormalized = ffpeOutlierRemovalEnabled
                                              && bins.Count > 500000
                                              && NormalizeVarianceByGC(bins, manifest: manifest);

                    // If variance normalization was run, re-center the means with another NormalizeByGC pass
                    if (varianceNormalized)
                    {
                        NormalizeByGC(bins, manifest, gcNormalizationMode);
                    }
                }
                else
                {
                    Console.Error.WriteLine("Warning in CanvasClean: Coverage too low to perform GC correction; proceeding without GC correction");
                }
            }

            if (ffpeOutlierRemovalEnabled)
            {
                // threshold 20 is derived to separate FF and noisy FFPE samples (derived from a training set of approx. 40 samples)
                bins = RemoveBinsWithExtremeLocalSD(bins, localSd, 20, outputPath);
            }

            CanvasIO.WriteToTextFile(outputPath, bins);
            return 0;
        }