Esempio n. 1
0
        /// <summary>
        /// Computes callability metrics over the calls that pass filters and writes them, tab-separated, to
        /// "{BaseFileName}_callability.txt" inside <paramref name="output"/>. A one-line summary is also logged.
        /// </summary>
        /// <param name="logger">Receives the callability summary (or the note that the step was skipped).</param>
        /// <param name="callsByContig">CNV calls keyed by contig; only calls with PassFilter set are counted.</param>
        /// <param name="options">Supplies the k-mer fasta path, the sex ploidy information and the base file name.</param>
        /// <param name="output">Directory that receives the callability file.</param>
        /// <exception cref="ArgumentException">
        /// Thrown when "filter13.bed" is not found next to the k-mer fasta.
        /// </exception>
        private static void ComputeCallability(ILogger logger, Dictionary <string, List <CnvCall> > callsByContig,
                                               EvaluateCnvOptions options, IDirectoryLocation output)
        {
            // Both inputs are optional for the rest of the evaluation (the k-mer fasta and the sex ploidy are
            // null-checked by the accuracy code path), so skip this step rather than fail on a null dereference.
            var sexPloidyInfo = options.PloidyInfo.SexPloidyInfo;
            if (options.KmerFa == null || sexPloidyInfo == null)
            {
                logger.Info("Skipping callability: it requires both a k-mer fasta and sex ploidy information.");
                return;
            }

            // The filter bed is expected in the same directory as the k-mer fasta.
            var kmerFasta           = new FileLocation(options.KmerFa);
            var canvasAnnotationDir = kmerFasta.Directory;
            var filterBed           = canvasAnnotationDir.GetFileLocation("filter13.bed");

            if (!filterBed.Exists)
            {
                throw new ArgumentException($"Missing file at {filterBed}");
            }

            // The genome build directory is taken to be two levels above the directory holding the k-mer fasta.
            var annotationDir = canvasAnnotationDir.Parent;
            var buildDir      = annotationDir.Parent;
            var genome        = new ReferenceGenome(buildDir).GenomeMetadata;

            // NOTE(review): the last argument is true when the Y ploidy is 0 - presumably a "no Y chromosome"
            // flag; confirm against CallabilityMetricsComputer.Create.
            bool hasNoY      = sexPloidyInfo.PloidyY == 0;
            var  computer    = CallabilityMetricsComputer.Create(logger, genome, filterBed, hasNoY);
            var  passCalls   = callsByContig.SelectValues(calls => calls.Where(call => call.PassFilter).ToList());
            var  callability = computer.CalculateMetric(passCalls);

            // Metrics are produced as CSV; commas are swapped for tabs to get a tab-separated file.
            var callabilityFile = output.GetFileLocation($"{options.BaseFileName}_callability.txt");
            File.WriteAllLines(callabilityFile.FullName, callability.GetMetrics().Select(metric => metric.ToCsv().Replace(",", "\t")));
            logger.Info($"Callability: {callability.Callability}. Called bases: {callability.CalledBases}. Total bases: {callability.TotalBases}.");
        }
Esempio n. 2
0
        /// <summary>
        /// Scores <paramref name="calls"/> against the truth intervals in <paramref name="knownCN"/> and writes one
        /// results file per size bin into <paramref name="outputPath"/>.
        /// </summary>
        /// <param name="knownCN">Truth-set intervals keyed by chromosome name.</param>
        /// <param name="cnvCallsPath">Path of the evaluated call set; forwarded to the results writer.</param>
        /// <param name="outputPath">Directory that receives the results files (Create() is called on it).</param>
        /// <param name="includePassingOnly">
        /// Forwarded to the metric calculation and the results writer. When true each results file is opened with
        /// FileMode.Create; when false it is opened with FileMode.Append.
        /// </param>
        /// <param name="options">Evaluation options: size binning, k-mer fasta, diploid skipping, file naming.</param>
        /// <param name="calls">CNV calls keyed by chromosome name.</param>
        public void ComputeAccuracy(Dictionary <string, List <CNInterval> > knownCN, string cnvCallsPath, string outputPath, bool includePassingOnly, EvaluateCnvOptions options, Dictionary <string, List <CnvCall> > calls)
        {
            bool hasRegionsOfInterest = !_cnvChecker.RegionsOfInterest.Empty();

            // The first counter spans every event size; the optional size bins follow it.
            var counters = new List <BaseCounter>();
            counters.Add(new BaseCounter(MaxCn, 0, int.MaxValue, hasRegionsOfInterest));
            if (options.SplitBySize)
            {
                counters.AddRange(new[]
                {
                    new BaseCounter(MaxCn, 0, 4999, hasRegionsOfInterest),
                    new BaseCounter(MaxCn, 5000, 9999, hasRegionsOfInterest),
                    new BaseCounter(MaxCn, 10000, 99999, hasRegionsOfInterest),
                    new BaseCounter(MaxCn, 100000, 499999, hasRegionsOfInterest),
                    new BaseCounter(MaxCn, 500000, int.MaxValue, hasRegionsOfInterest)
                });
            }

            // Record how many truth-set bases are not really "known" bases according to the "cnaqc" exclusion set.
            // Deliberately sequential: parallelism is obtained at the level of the regression workflow.
            _cnvChecker.CountExcludedBasesInTruthSetIntervals(knownCN);

            // Stays null when no k-mer fasta was supplied.
            Dictionary <string, BitArray> uniqueKmerMasks = null;
            if (options.KmerFa != null)
            {
                uniqueKmerMasks = new Dictionary <string, BitArray>();
                foreach (string chromosome in knownCN.Keys)
                {
                    string sequence = FastaLoader.LoadFastaSequence(options.KmerFa, chromosome);

                    // Upper-case letters in the k-mer fasta mark unique k-mers; every other position stays false.
                    var mask     = new BitArray(sequence.Length);
                    int position = 0;
                    foreach (char nucleotide in sequence)
                    {
                        mask[position] = char.IsUpper(nucleotide);
                        position++;
                    }
                    uniqueKmerMasks[chromosome] = mask;
                }
            }

            foreach (var counter in counters)
            {
                _cnvChecker.InitializeIntervalMetrics(knownCN);
                var metrics = CalculateMetrics(knownCN, calls, counter, options.SkipDiploid, includePassingOnly, uniqueKmerMasks);

                // File name = base name + optional "_denovo" tag + optional size-bin tag + ".txt".
                string denovoTag      = options.DQscoreThreshold.HasValue ? "_denovo" : "";
                string sizeTag        = "";
                bool   coversAllSizes = counter.MinSize == 0 && counter.MaxSize == int.MaxValue;
                if (!coversAllSizes)
                {
                    // Open-ended bins end in "+", bounded bins in "_<max>kb".
                    string upperBound = counter.MaxSize == int.MaxValue
                        ? "+"
                        : $"_{Math.Round(counter.MaxSize / 1000.0)}kb";
                    sizeTag = $"_{Math.Round(counter.MinSize / 1000.0)}kb" + upperBound;
                }
                string fileName = $"{options.BaseFileName}{denovoTag}{sizeTag}.txt";

                var resultsDir = new DirectoryLocation(outputPath);
                resultsDir.Create();
                string resultsPath = resultsDir.GetFileLocation(fileName).FullName;

                // NOTE(review): in append mode, content already present in the file is kept - confirm that
                // stale files from earlier runs cannot end up mixed into the results.
                FileMode mode = includePassingOnly ? FileMode.Create : FileMode.Append;

                using (var stream = new FileStream(resultsPath, mode, FileAccess.Write))
                using (var writer = new StreamWriter(stream) { NewLine = "\n" })
                {
                    WriteResults(cnvCallsPath, writer, counter, includePassingOnly, metrics);
                }
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Evaluates a CNV call set against a truth set: loads and size-filters both, maps the optional exclusion
        /// intervals onto the chromosome names used by the calls, optionally adjusts for reference ploidy, then
        /// computes accuracy and callability and writes the results under <paramref name="outputPath"/>.
        /// </summary>
        /// <param name="truthSetPath">Path of the truth set holding the known copy numbers.</param>
        /// <param name="cnvCallsPath">
        /// Path of the CNV calls. When its file name contains "vcf" (any casing), accuracy is computed twice:
        /// once with the passing-only flag set and once without.
        /// </param>
        /// <param name="excludedBed">Optional bed file of intervals to exclude; null or empty disables it.</param>
        /// <param name="outputPath">Directory that receives the results.</param>
        /// <param name="options">Evaluation options (minimum entry size, DQ score threshold, ploidy information, ...).</param>
        /// <exception cref="ArgumentException">
        /// Thrown when a DQ score threshold is set but the calls file name does not contain "vcf".
        /// </exception>
        public static void Evaluate(string truthSetPath, string cnvCallsPath, string excludedBed, string outputPath, EvaluateCnvOptions options)
        {
            double heterogeneityFraction = options.HeterogeneityFraction;
            var    knownCn = LoadKnownCn(truthSetPath, heterogeneityFraction);

            // Drop truth entries and calls that are shorter than the configured minimum size.
            knownCn = knownCn.SelectValues(
                truthEntries => truthEntries.Where(truthEntry => truthEntry.Length >= options.MinEntrySize).ToList());
            var calls = GetCnvCallsFromVcf(cnvCallsPath, options.DQscoreThreshold);

            calls = calls.SelectValues(
                chromosomeCalls => chromosomeCalls.Where(call => call.Length >= options.MinEntrySize).ToList());

            // LoadRegionsOfInterest(options.RoiBed?.FullName);
            var excludeIntervals = new Dictionary <string, List <CNInterval> >();

            if (!string.IsNullOrEmpty(excludedBed))
            {
                var excludeIntervalsTmp = LoadIntervalsFromBed(excludedBed, false, 1.0);
                foreach (string key in excludeIntervalsTmp.Keys)
                {
                    // Match the bed chromosome name to the naming used by the calls: try it verbatim, then
                    // without "chr", then with a "chr" prefix.
                    string chr = key;
                    if (!calls.ContainsKey(chr))
                    {
                        chr = key.Replace("chr", "");
                    }
                    if (!calls.ContainsKey(chr))
                    {
                        chr = "chr" + key;
                    }
                    if (!calls.ContainsKey(chr))
                    {
                        Console.WriteLine($"Error: Skipping exclude intervals for chromosome {key} with no truth data. " +
                                          $"Check that chromosome names are spelled correctly for exclude intervals");
                        continue;
                    }
                    excludeIntervals[chr] = excludeIntervalsTmp[key];
                }
            }
            Console.WriteLine("TruthSet\t{0}", truthSetPath);
            Console.WriteLine("CNVCalls\t{0}", cnvCallsPath);

            // A calls file whose name contains "vcf" is evaluated on passing calls first. The check is ordinal and
            // case-insensitive so that it does not depend on the current culture.
            bool includePassingOnly = Path.GetFileName(cnvCallsPath).IndexOf("vcf", StringComparison.OrdinalIgnoreCase) >= 0;
            var  logger             = new Logger(new[] { Console.Out }, new[] { Console.Error });
            var  settings           = IsasConfigurationSettings.GetConfigSettings();
            var  output             = new DirectoryLocation(outputPath);
            var  workerDirectory    = new DirectoryLocation(Isas.Framework.Utilities.Utilities.GetAssemblyFolder(typeof(CNVChecker)));
            var  commandManager     = new CommandManager(new ExecutableProcessor(settings, logger, workerDirectory));

            WorkDoerFactory.RunWithWorkDoer(logger, settings, output, workDoer =>
            {
                var tabixWrapper    = TabixWrapperFactory.GetTabixWrapper(logger, workDoer, commandManager);
                var ploidyCorrector = new PloidyCorrector(logger, workDoer,
                                                          new PloidyEstimator(logger, workDoer, null, false, commandManager), tabixWrapper, false);
                var checker = new CNVChecker(options.DQscoreThreshold, excludeIntervals, ploidyCorrector);
                if (options.PloidyInfo.SexPloidyInfo != null)
                {
                    Console.WriteLine($">>>Getting reference ploidy from provided ploidy information and PAR bed file '{options.PloidyInfo.ParBed}'");

                    // Re-express both the truth set and the calls relative to the reference ploidy.
                    var ploidy          = checker.GetPloidy(options.PloidyInfo, output);
                    var referencePloidy = LoadReferencePloidy(options.PloidyInfo.SexPloidyInfo, options.PloidyInfo.ParBed);
                    knownCn             = GetKnownCopyNumberWithReferencePloidy(referencePloidy, knownCn);
                    calls = GetCallsWithRefPloidy(calls, ploidy);
                }
                var cnvEvaluator = new CnvEvaluator(checker);

                // includePassingOnly doubles as the "calls file is a vcf" flag computed above.
                if (checker.DQscoreThreshold.HasValue && !includePassingOnly)
                {
                    throw new ArgumentException("CNV.vcf must be in a vcf format when --dqscore option is used");
                }
                cnvEvaluator.ComputeAccuracy(knownCn, cnvCallsPath, outputPath, includePassingOnly, options, calls);
                if (includePassingOnly)
                {
                    // Second pass with the passing-only flag cleared.
                    cnvEvaluator.ComputeAccuracy(knownCn, cnvCallsPath, outputPath, false, options, calls);
                }
                ComputeCallability(logger, calls, options, output);
                Console.WriteLine(">>>Done - results written to {0}", outputPath);
            });
        }