/// <summary>
/// Runs the in-memory RatiosToCounts overload on <paramref name="ratios"/> and writes its result
/// to <paramref name="outputPath"/>, loading reference ploidy from a BED file when one is available.
/// </summary>
/// <param name="ratios">Bins passed through to the RatiosToCounts(ratios, ploidy) overload.</param>
/// <param name="referencePloidyBedFile">Optional ploidy BED file; skipped when null or not present on disk.</param>
/// <param name="outputPath">Location of the text file to write.</param>
public static void RatiosToCounts(IEnumerable <SampleGenomicBin> ratios, IFileLocation referencePloidyBedFile,
                                  IFileLocation outputPath)
        {
            // Ploidy stays null unless a BED file was supplied and actually exists.
            bool hasPloidyBed = referencePloidyBedFile != null && referencePloidyBedFile.Exists;
            PloidyInfo ploidy = hasPloidyBed
                ? PloidyInfo.LoadPloidyFromBedFile(referencePloidyBedFile.FullName)
                : null;

            // Resolve the destination before converting, matching the original evaluation order.
            string destination = outputPath.FullName;
            var    counts      = RatiosToCounts(ratios, ploidy);

            CanvasIO.WriteToTextFile(destination, counts);
        }
        /// <summary>
        /// Builds a reference count file for the sample: the sample bin counts are centered on the
        /// model mean, projected onto the model axes, un-centered, scaled by the on-target median
        /// ratio and written to <paramref name="outputFile"/>.
        /// </summary>
        /// <param name="outputFile">Destination of the reference count text file.</param>
        public void Run(IFileLocation outputFile)
        {
            List <SampleGenomicBin> sampleBins = CanvasIO.ReadFromTextFile(_sampleBinnedFile.FullName);

            VerifyBinOrder(sampleBins);

            // set bin count to 1 if less than 1
            foreach (var bin in sampleBins)
            {
                bin.Count = Math.Max(1, bin.Count);
            }

            // center the sample
            var centeredSampleVector = Enumerable.Zip(sampleBins, _model.Mu, (bin, mu) => (double)bin.Count - mu.Count).ToArray();

            // project onto the axes
            var projectedSampleVector = CanvasCommon.Utilities.Project(centeredSampleVector, _model.Axes);

            // undo centering and set bin count to 1 if less than 1.
            // Materialized once: the vector is consumed twice below, and a lazy Zip would redo the work each time.
            var referenceVector = Enumerable.Zip(_model.Mu, projectedSampleVector, (bin, count) => Math.Max(1, bin.Count + count)).ToArray();

            // calculate median ratio against a temporary reference count file
            double medianRatio;
            var    tempReferenceFile = new FileLocation(Path.GetTempFileName());

            try
            {
                // write temporary reference count file
                var tempReferenceBins = Enumerable.Zip(sampleBins, referenceVector,
                                                       (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)count));

                CanvasIO.WriteToTextFile(tempReferenceFile.FullName, tempReferenceBins);

                var ratios = new BinCounts(_ratioCalculator.Run(_sampleBinnedFile, tempReferenceFile), _manifest);
                medianRatio = ratios.OnTargetMedianBinCount;
            }
            finally
            {
                // delete temporary reference count file even when writing or the ratio calculation fails
                tempReferenceFile.Delete();
            }

            // multiply reference counts by the median ratio
            var referenceBins = Enumerable.Zip(sampleBins, referenceVector,
                                               (bin, count) => new SampleGenomicBin(bin.GenomicBin.Chromosome, bin.Start, bin.Stop, bin.GenomicBin.GC, (float)(count * medianRatio)));

            // write reference count file
            CanvasIO.WriteToTextFile(outputFile.FullName, referenceBins);
        }
Example #3
0
        /// <summary>
        /// Reads bins grouped by chromosome from <paramref name="inputFile"/>, smooths each chromosome
        /// in parallel with a <c>RepeatedMedianSmoother</c>, and writes the smoothed bins to
        /// <paramref name="outputFile"/>.
        /// </summary>
        /// <param name="inputFile">Text file holding the bins to smooth.</param>
        /// <param name="outputFile">Text file that receives the smoothed bins.</param>
        /// <returns>Always 0.</returns>
        public int Run(IFileLocation inputFile, IFileLocation outputFile)
        {
            var inputBinsByChromosome = CanvasIO.GetGenomicBinsByChrom(inputFile.FullName);
            var medianSmoother        = new RepeatedMedianSmoother(MaxHalfWindowSize);
            var chromosomeNames       = inputBinsByChromosome.Keys;

            // Filled concurrently by the smoothing jobs, one entry per chromosome.
            var smoothedByChromosome = new ConcurrentDictionary <string, List <SampleGenomicBin> >();

            Console.WriteLine("Launch smoothing jobs...");
            Parallel.ForEach(chromosomeNames,
                             name => smoothedByChromosome[name] = medianSmoother.Smooth(inputBinsByChromosome[name]));
            Console.WriteLine("Completed smoothing jobs.");

            // Emit chromosomes in the key order of the input dictionary, not in job-completion order.
            CanvasIO.WriteToTextFile(outputFile.FullName,
                                     chromosomeNames.SelectMany(name => smoothedByChromosome[name]));

            return 0;
        }
Example #4
0
        /// <summary>
        /// Command-line entry point: parses the options, reads the input bins, applies the requested
        /// cleaning steps (outlier removal, size filter, GC normalization, FFPE local-SD filtering)
        /// and writes the resulting bins to the output file.
        /// </summary>
        /// <param name="args">Raw command-line arguments; logged and then parsed with the option set below.</param>
        /// <returns>
        /// 0 after writing the output, and also after printing help (either requested or because the
        /// input or output path was not supplied); 1 when the input file does not exist.
        /// </returns>
        static int Main(string[] args)
        {
            Utilities.LogCommandLine(args);
            string inFile           = null;
            string outFile          = null;
            bool   doGCnorm         = false;
            bool   doSizeFilter     = false;
            bool   doOutlierRemoval = false;
            string ffpeOutliersFile = null;
            string manifestFile     = null;
            CanvasGCNormalizationMode gcNormalizationMode = CanvasGCNormalizationMode.MedianByGC;
            // Help text for -m lists every enum value plus the default chosen above.
            string modeDescription = String.Format("gc normalization mode. Available modes: {0}. Default: {1}",
                                                   String.Join(", ", Enum.GetValues(typeof(CanvasGCNormalizationMode)).Cast <CanvasGCNormalizationMode>()),
                                                   gcNormalizationMode);
            bool needHelp = false;

            // Note: -w assigns minNumberOfBinsPerGCForWeightedMedian, which is not a local of this
            // method (declared elsewhere) and is not read anywhere in this method.
            OptionSet p = new OptionSet()
            {
                { "i|infile=", "input file - usually generated by CanvasBin", v => inFile = v },
                { "o|outfile=", "text file to output containing cleaned bins", v => outFile = v },
                { "g|gcnorm", "perform GC normalization", v => doGCnorm = v != null },
                { "s|filtsize", "filter out genomically large bins", v => doSizeFilter = v != null },
                { "r|outliers", "filter outlier points", v => doOutlierRemoval = v != null },
                { "f|ffpeoutliers=", "filter regions of FFPE biases", v => ffpeOutliersFile = v },
                { "t|manifest=", "Nextera manifest file", v => manifestFile = v },
                { "w|weightedmedian=", "Minimum number of bins per GC required to calculate weighted median", v => minNumberOfBinsPerGCForWeightedMedian = int.Parse(v) },
                { "m|mode=", modeDescription, v => gcNormalizationMode = Utilities.ParseCanvasGCNormalizationMode(v) },
                { "h|help", "show this message and exit", v => needHelp = v != null },
            };

            // Leftover (unrecognized) arguments are collected here but not used below.
            List <string> extraArgs = p.Parse(args);

            if (needHelp)
            {
                ShowHelp(p);
                return(0);
            }

            // Both the input and the output path are required.
            // NOTE(review): this exits with 0, the same code as an explicit help request — confirm
            // that callers do not expect a non-zero exit code for missing arguments.
            if (inFile == null || outFile == null)
            {
                ShowHelp(p);
                return(0);
            }

            // Does the input file exist?
            if (!File.Exists(inFile))
            {
                Console.WriteLine("CanvasClean.exe: File {0} does not exist! Exiting.", inFile);
                return(1);
            }

            List <SampleGenomicBin> bins = CanvasIO.ReadFromTextFile(inFile);

            // Optional filters run in a fixed order: outliers first, then bin size.
            if (doOutlierRemoval)
            {
                bins = RemoveOutliers(bins);
            }

            if (doSizeFilter)
            {
                bins = RemoveBigBins(bins);
            }

            // do not run FFPE outlier removal on targeted/low coverage data
            // (fewer than 50000 bins remaining after the filters above)
            if (ffpeOutliersFile != null && bins.Count < 50000)
            {
                ffpeOutliersFile = null;
            }

            // estimate localSD metric to use in doFFPEOutlierRemoval later and write to a text file;
            // -1.0 is a placeholder that is only replaced (and only read) when FFPE filtering is enabled
            double LocalSD = -1.0;

            if (ffpeOutliersFile != null)
            {
                LocalSD = getLocalStandardDeviation(bins);
                CanvasIO.WriteLocalSDToTextFile(ffpeOutliersFile, LocalSD);
            }

            if (doGCnorm)
            {
                NexteraManifest         manifest     = manifestFile == null ? null : new NexteraManifest(manifestFile, null, Console.WriteLine);
                // Only MedianByGC mode strips bins via RemoveBinsWithExtremeGC; other modes keep every bin.
                List <SampleGenomicBin> strippedBins = gcNormalizationMode == CanvasGCNormalizationMode.MedianByGC
                    ? RemoveBinsWithExtremeGC(bins, defaultMinNumberOfBinsPerGC, manifest: manifest)
                    : bins;
                if (strippedBins.Count == 0)
                {
                    // Nothing left to normalize: keep the unstripped bins and skip GC correction.
                    Console.Error.WriteLine("Warning in CanvasClean: Coverage too low to perform GC correction; proceeding without GC correction");
                }
                else
                {
                    bins = strippedBins;
                    NormalizeByGC(bins, manifest, gcNormalizationMode);
                    // Use variance normalization only on large exome panels and whole genome sequencing
                    // The threshold is set to 10% of an average number of bins on CanvasClean data
                    if (ffpeOutliersFile != null && bins.Count > 500000)
                    {
                        bool isNormalizeVarianceByGC = NormalizeVarianceByGC(bins, manifest: manifest);
                        // If normalization by variance was run (isNormalizeVarianceByGC), perform mean centering by using NormalizeByGC
                        if (isNormalizeVarianceByGC)
                        {
                            NormalizeByGC(bins, manifest, gcNormalizationMode);
                        }
                    }
                }
            }

            if (ffpeOutliersFile != null)
            {
                // threshold 20 is derived to separate FF and noisy FFPE samples (derived from a training set of approx. 40 samples)
                List <SampleGenomicBin> LocalMadstrippedBins = RemoveBinsWithExtremeLocalSD(bins, LocalSD, 20, outFile);
                bins = LocalMadstrippedBins;
            }

            CanvasIO.WriteToTextFile(outFile, bins);
            return(0);
        }
Example #5
0
        /// <summary>
        /// Performs fragment binning: runs one <c>BinTask</c> per chromosome that has predefined bins,
        /// then writes the predefined bins, in BAM chromosome order, to the output file.
        /// </summary>
        /// <returns>Always 0; every failure is reported by throwing an <c>IlluminaException</c>.</returns>
        public int Bin()
        {
            // Preconditions: a predefined-bins BED file and paired-end reads.
            if (parameters.predefinedBinsFile == null)
            {
                throw new Illumina.Common.IlluminaException("Predefined bins in BED is required for fragment binning.");
            }
            if (!parameters.isPairedEnd) // Janus-SRS-189
            {
                throw new Illumina.Common.IlluminaException("Paired-end reads are required for fragment binning.");
            }

            Dictionary <string, List <SampleGenomicBin> > binsByChromosome = Utilities.LoadBedFile(parameters.predefinedBinsFile, gcIndex: 3);
            List <string> bamChromosomes = GetChromosomesInBam(); // defines the chromosome output order

            if (!Utilities.IsSubset(binsByChromosome.Keys, bamChromosomes))
            {
                string message = String.Format("Not all chromosomes in {0} are found in {1}.", parameters.predefinedBinsFile, parameters.bamFile);
                throw new Illumina.Common.IlluminaException(message);
            }

            // One counting task per chromosome that has predefined bins.
            List <BinTask> binTasks = new List <BinTask>();

            foreach (string chromosome in bamChromosomes)
            {
                List <SampleGenomicBin> chromosomeBins;
                if (binsByChromosome.TryGetValue(chromosome, out chromosomeBins))
                {
                    binTasks.Add(new BinTask(parameters.referenceFile, chromosome, parameters.bamFile, chromosomeBins));
                }
            }

            Console.WriteLine("Launch fragment binning jobs...");
            Console.Out.WriteLine();
            Parallel.ForEach(binTasks, binTask => binTask.DoIt());
            Console.WriteLine("Completed fragment binning jobs.");
            Console.Out.WriteLine();

            var  perTaskFragmentCounts = binTasks.Select(binTask => binTask.UsableFragmentCount);
            long totalUsableFragments  = perTaskFragmentCounts.Sum();

            if (totalUsableFragments == 0)
            {
                throw new Illumina.Common.IlluminaException(String.Format("No passing-filter fragments overlapping bins are found in {0}", parameters.bamFile));
            }

            // Concatenate the per-chromosome bin lists in BAM chromosome order.
            List <SampleGenomicBin> orderedBins = bamChromosomes
                                                  .Where(chromosome => binsByChromosome.ContainsKey(chromosome))
                                                  .SelectMany(chromosome => binsByChromosome[chromosome])
                                                  .ToList();

            CanvasIO.WriteToTextFile(parameters.outFile, orderedBins);

            return 0;
        }
Example #6
0
        /// <summary>
        /// Implements the Canvas binning algorithm: either bins a single genomic interval directly
        /// (no intermediate files given), or deserializes the intermediate files, determines the bin
        /// size, bins the alignments and writes the binned counts to the output file.
        /// </summary>
        /// <param name="parameters">
        /// Binning settings read by this method: intermediate paths, coverage mode, manifest file,
        /// bin size / counts per bin, bin-size-only flag, predefined bins file, reference file and
        /// output file. <c>binSize</c> is overwritten here when it is passed in as -1.
        /// </param>
        /// <returns>0 on every path visible in this method.</returns>
        public static int Run(CanvasBinParameters parameters)
        {
            // Will hold a bunch of BitArrays, one for each chromosome.
            // Each one's length corresponds to the length of the chromosome it represents.
            // A position will be marked 'true' if the mer starting at that position is unique in the genome.
            Dictionary <string, BitArray> possibleAlignments = new Dictionary <string, BitArray>();

            // Will hold a bunch of HitArrays, one for each chromosome.
            // Each one's length corresponds to the length of the chromosome it represents.
            // A position will be marked with the number of times the mer starting at that position
            // is observed in the SAM file.
            Dictionary <string, HitArray> observedAlignments = new Dictionary <string, HitArray>();

            // Will hold a bunch of Int16 arrays, one for each chromosome.
            // Each one's length corresponds to the length of the chromosome it represents.
            // A value at a given index represents the fragment length of the read starting at that index
            Dictionary <string, Int16[]> fragmentLengths = new Dictionary <string, Int16[]>();

            // No intermediate files: bin one genomic interval directly and stop.
            if (parameters.intermediatePaths.Count == 0)
            {
                BinOneGenomicInterval(parameters, possibleAlignments, observedAlignments, fragmentLengths);
                return(0);
            }

            //load our intermediate data files
            List <string> inputFiles = new List <string>(parameters.intermediatePaths);
            Object        semaphore  = new object(); // control access to possibleAlignments, observedAlignments, fragmentLengths
            // retrieve the number of processors
            //int processorCoreCount = Environment.ProcessorCount;
            int           processorCoreCount = 1; // Limit # of deserialization threads to avoid (rare) protobuf issue.
            List <Thread> threads            = new List <Thread>();

            Console.WriteLine("Start deserialization:");
            Console.Out.Flush();
            // Polling scheduler: runs until every input file has been handed to a thread and every
            // thread has finished. With processorCoreCount == 1 the files are processed one at a time.
            // NOTE(review): if processorCoreCount is raised above 1, this loop spins without sleeping
            // once inputFiles is empty while fewer than processorCoreCount threads are still alive.
            while (threads.Count > 0 || inputFiles.Count > 0)
            {
                // Remove defunct threads:
                threads.RemoveAll(t => !t.IsAlive);
                // At the thread limit: wait a second before checking again.
                if (threads.Count == processorCoreCount)
                {
                    Thread.Sleep(1000);
                    continue;
                }
                // Start threads for queued files until the limit is reached or the queue is empty.
                while (inputFiles.Count > 0 && threads.Count < processorCoreCount)
                {
                    // inputFile is declared per iteration, so each thread's lambda captures its own path.
                    string      inputFile      = inputFiles.First();
                    ThreadStart threadDelegate = new ThreadStart(() => DeserializeCanvasData(inputFile, possibleAlignments, observedAlignments, fragmentLengths, semaphore, parameters.coverageMode));
                    Thread      newThread      = new Thread(threadDelegate);
                    threads.Add(newThread);
                    newThread.Name = "CanvasBin " + inputFiles[0];
                    Console.WriteLine(newThread.Name);
                    newThread.Start();
                    inputFiles.RemoveAt(0);
                }
            }
            Console.WriteLine("{0} Deserialization complete", DateTime.Now);
            Console.Out.Flush();

            // The manifest is optional; null when no manifest file was given.
            NexteraManifest manifest = parameters.manifestFile == null ? null : new NexteraManifest(parameters.manifestFile, null, Console.WriteLine);

            // A bin size of -1 means "derive it from countsPerBin".
            if (parameters.binSize == -1)
            {
                // Turn the desired # of alignments per bin into the number of possible alignments expected per bin.
                parameters.binSize = CalculateNumberOfPossibleAlignmentsPerBin(parameters.countsPerBin, possibleAlignments, observedAlignments,
                                                                               manifest: manifest);
            }

            if (parameters.binSizeOnly)
            {
                // Write bin size to file (<outFile>.binsize) and stop without binning
                System.IO.File.WriteAllText(parameters.outFile + ".binsize", "" + parameters.binSize);
                return(0);
            }

            // Stays null when no predefined bins file was given.
            Dictionary <string, List <GenomicBin> > predefinedBins = null;

            if (parameters.predefinedBinsFile != null)
            {
                // Read predefined bins
                predefinedBins = Utilities.LoadBedFile(parameters.predefinedBinsFile);
            }

            // Bin alignments.
            List <GenomicBin> bins = BinCounts(parameters.referenceFile, parameters.binSize, parameters.coverageMode, manifest,
                                               possibleAlignments, observedAlignments, fragmentLengths, predefinedBins, parameters.outFile);

            // Output!
            Console.WriteLine("{0} Output binned counts:", DateTime.Now);
            CanvasIO.WriteToTextFile(parameters.outFile, bins);
            Console.WriteLine("{0} Output complete", DateTime.Now);
            Console.Out.Flush();
            return(0);
        }