/// <summary>
/// Computes callability metrics over the calls that pass filter and writes them, tab-separated,
/// to "{BaseFileName}_callability.txt" in <paramref name="output"/>.
/// </summary>
/// <param name="logger">Receives the skip notices and the final callability summary line.</param>
/// <param name="callsByContig">CNV calls keyed by contig; only calls with PassFilter set are used.</param>
/// <param name="options">
/// Supplies the k-mer fasta path (used to locate "filter13.bed" and the genome build directory),
/// the sex ploidy information and the output base file name.
/// </param>
/// <param name="output">Directory that receives the callability file.</param>
/// <exception cref="ArgumentException">"filter13.bed" is not present next to the k-mer fasta.</exception>
private static void ComputeCallability(ILogger logger, Dictionary<string, List<CnvCall>> callsByContig, EvaluateCnvOptions options, IDirectoryLocation output)
{
    // The k-mer fasta and the sex ploidy information are both treated as optional by the other
    // evaluation steps, so skip the metric with a notice instead of failing on a null dereference
    // after the rest of the evaluation has already completed.
    if (options.KmerFa == null)
    {
        logger.Info("Skipping callability computation: no k-mer fasta was provided.");
        return;
    }

    var sexPloidyInfo = options.PloidyInfo.SexPloidyInfo;
    if (sexPloidyInfo == null)
    {
        logger.Info("Skipping callability computation: no sex ploidy information was provided.");
        return;
    }

    // The filter bed is expected to sit in the same directory as the k-mer fasta.
    var kmerFasta = new FileLocation(options.KmerFa);
    var canvasAnnotationDir = kmerFasta.Directory;
    var filterBed = canvasAnnotationDir.GetFileLocation("filter13.bed");
    if (!filterBed.Exists)
    {
        throw new ArgumentException($"Missing file at {filterBed}");
    }

    // The genome build directory is two levels above the directory holding the k-mer fasta.
    var annotationDir = canvasAnnotationDir.Parent;
    var buildDir = annotationDir.Parent;
    var genome = new ReferenceGenome(buildDir).GenomeMetadata;

    // NOTE(review): the last argument is true when the sample has no Y chromosome copies;
    // presumably it excludes chrY from the callable genome - confirm against CallabilityMetricsComputer.
    bool hasNoChromosomeY = sexPloidyInfo.PloidyY == 0;
    var computer = CallabilityMetricsComputer.Create(logger, genome, filterBed, hasNoChromosomeY);

    var passingCallsByContig = callsByContig.SelectValues(calls => calls.Where(call => call.PassFilter).ToList());
    var callability = computer.CalculateMetric(passingCallsByContig);

    // Metrics are produced as CSV; the output file is tab-separated.
    var callabilityFile = output.GetFileLocation($"{options.BaseFileName}_callability.txt");
    File.WriteAllLines(callabilityFile.FullName, callability.GetMetrics().Select(metric => metric.ToCsv().Replace(",", "\t")));

    logger.Info($"Callability: {callability.Callability}. Called bases: {callability.CalledBases}. Total bases: {callability.TotalBases}.");
}
/// <summary>
/// Calculates accuracy metrics of the CNV calls against the truth set and writes one results
/// file per size bin into <paramref name="outputPath"/>.
/// </summary>
/// <param name="knownCN">Truth-set copy number intervals keyed by chromosome.</param>
/// <param name="cnvCallsPath">Path of the CNV calls file; forwarded to the results writer.</param>
/// <param name="outputPath">Output directory; created if needed.</param>
/// <param name="includePassingOnly">
/// Forwarded to the metrics calculation and the results writer. When true each results file is
/// created (overwritten); when false results are appended to the existing file.
/// </param>
/// <param name="options">Evaluation options (size splitting, k-mer fasta, base file name, ...).</param>
/// <param name="calls">CNV calls keyed by chromosome.</param>
public void ComputeAccuracy(Dictionary<string, List<CNInterval>> knownCN, string cnvCallsPath, string outputPath, bool includePassingOnly, EvaluateCnvOptions options, Dictionary<string, List<CnvCall>> calls)
{
    bool hasRegionsOfInterest = !_cnvChecker.RegionsOfInterest.Empty();

    // The first counter covers every size; the size-binned counters are optional.
    var counters = new List<BaseCounter>();
    counters.Add(new BaseCounter(MaxCn, 0, Int32.MaxValue, hasRegionsOfInterest));
    if (options.SplitBySize)
    {
        counters.AddRange(new[]
        {
            new BaseCounter(MaxCn, 0, 4999, hasRegionsOfInterest),
            new BaseCounter(MaxCn, 5000, 9999, hasRegionsOfInterest),
            new BaseCounter(MaxCn, 10000, 99999, hasRegionsOfInterest),
            new BaseCounter(MaxCn, 100000, 499999, hasRegionsOfInterest),
            new BaseCounter(MaxCn, 500000, int.MaxValue, hasRegionsOfInterest)
        });
    }

    // Make a note of how many bases in the truth set are not *actually* considered to be known
    // bases, using the "cnaqc" exclusion set.
    // Not parallel here as parallelism will be attained at the level of regression workflow.
    _cnvChecker.CountExcludedBasesInTruthSetIntervals(knownCN);

    // Per-chromosome masks of unique k-mer positions; stays null when no k-mer fasta was given.
    Dictionary<string, BitArray> uniqueKmerMasks = null;
    if (options.KmerFa != null)
    {
        uniqueKmerMasks = new Dictionary<string, BitArray>();
        foreach (string chromosome in knownCN.Keys)
        {
            string sequence = FastaLoader.LoadFastaSequence(options.KmerFa, chromosome);
            uniqueKmerMasks[chromosome] = MarkUpperCaseBases(sequence);
        }
    }

    foreach (BaseCounter counter in counters)
    {
        _cnvChecker.InitializeIntervalMetrics(knownCN);
        var metrics = CalculateMetrics(knownCN, calls, counter, options.SkipDiploid, includePassingOnly, uniqueKmerMasks);

        string resultsFileName = GetAccuracyFileName(options, counter);
        var resultsDir = new DirectoryLocation(outputPath);
        resultsDir.Create();
        var resultsFile = resultsDir.GetFileLocation(resultsFileName);

        FileMode mode;
        if (includePassingOnly)
        {
            mode = FileMode.Create;
        }
        else
        {
            mode = FileMode.Append;
        }

        using (var stream = new FileStream(resultsFile.FullName, mode, FileAccess.Write))
        using (var writer = new StreamWriter(stream))
        {
            writer.NewLine = "\n";
            WriteResults(cnvCallsPath, writer, counter, includePassingOnly, metrics);
        }
    }
}

/// <summary>
/// Builds a bit mask with one bit per base, set where the base is an upper-case letter.
/// In the k-mer fasta, upper-case letters indicate unique k-mers.
/// </summary>
private static BitArray MarkUpperCaseBases(string sequence)
{
    var mask = new BitArray(sequence.Length);
    for (int position = 0; position < sequence.Length; position++)
    {
        mask[position] = char.IsUpper(sequence[position]);
    }
    return mask;
}

/// <summary>
/// Builds the results file name: base name, "_denovo" when a DQ score threshold is set, a size
/// suffix ("_5kb_10kb", "_500kb+", ...) when the counter is restricted to a size bin, then ".txt".
/// </summary>
private static string GetAccuracyFileName(EvaluateCnvOptions options, BaseCounter counter)
{
    string prefix = $"{options.BaseFileName}";
    string denovoTag = options.DQscoreThreshold.HasValue ? "_denovo" : "";

    string sizeTag = "";
    bool coversAllSizes = counter.MinSize == 0 && counter.MaxSize == int.MaxValue;
    if (!coversAllSizes)
    {
        string lowerBound = $"_{Math.Round(counter.MinSize / 1000.0)}kb";
        string upperBound;
        if (counter.MaxSize == int.MaxValue)
        {
            upperBound = "+";
        }
        else
        {
            upperBound = $"_{ Math.Round(counter.MaxSize / 1000.0)}kb";
        }
        sizeTag = lowerBound + upperBound;
    }

    return prefix + denovoTag + sizeTag + ".txt";
}
/// <summary>
/// Evaluates a set of CNV calls against a truth set: loads both, drops entries shorter than
/// the configured minimum size, applies the optional exclusion intervals and reference ploidy,
/// then writes accuracy and callability results to <paramref name="outputPath"/>.
/// </summary>
/// <param name="truthSetPath">Path of the truth set with the known copy numbers.</param>
/// <param name="cnvCallsPath">
/// Path of the CNV calls. When the file name contains "vcf" (case-insensitive), accuracy is
/// computed twice: first with includePassingOnly set, then again without it.
/// </param>
/// <param name="excludedBed">Optional bed file of intervals to exclude; may be null or empty.</param>
/// <param name="outputPath">Directory that receives the results.</param>
/// <param name="options">Evaluation options.</param>
/// <exception cref="ArgumentException">
/// A DQ score threshold is set but the calls file name does not contain "vcf".
/// </exception>
public static void Evaluate(string truthSetPath, string cnvCallsPath, string excludedBed, string outputPath, EvaluateCnvOptions options)
{
    double heterogeneityFraction = options.HeterogeneityFraction;
    var knownCn = LoadKnownCn(truthSetPath, heterogeneityFraction);
    knownCn = knownCn.SelectValues(
        truthEntries => truthEntries.Where(truthEntry => truthEntry.Length >= options.MinEntrySize).ToList());
    var calls = GetCnvCallsFromVcf(cnvCallsPath, options.DQscoreThreshold);
    calls = calls.SelectValues(
        chromosomeCalls => chromosomeCalls.Where(call => call.Length >= options.MinEntrySize).ToList());

    // LoadRegionsOfInterest(options.RoiBed?.FullName);
    var excludeIntervals = new Dictionary<string, List<CNInterval>>();
    if (!string.IsNullOrEmpty(excludedBed))
    {
        var excludeIntervalsTmp = LoadIntervalsFromBed(excludedBed, false, 1.0);
        foreach (string key in excludeIntervalsTmp.Keys)
        {
            // The bed file and the calls may disagree on the "chr" prefix. Try the name as given,
            // then without "chr", then with a "chr" prefix, and keep the first that has calls.
            string chr = new[] { key, key.Replace("chr", ""), "chr" + key }
                .FirstOrDefault(candidate => calls.ContainsKey(candidate));
            if (chr == null)
            {
                Console.WriteLine($"Error: Skipping exclude intervals for chromosome {key} with no truth data. "
                                  + "Check that chromosome names are spelled correctly for exclude intervals");
                continue;
            }
            excludeIntervals[chr] = excludeIntervalsTmp[key];
        }
    }

    Console.WriteLine("TruthSet\t{0}", truthSetPath);
    Console.WriteLine("CNVCalls\t{0}", cnvCallsPath);

    // Computed once with an ordinal, case-insensitive match; it drives both the passing-only
    // pass and the --dqscore argument validation below.
    bool includePassingOnly = Path.GetFileName(cnvCallsPath).IndexOf("vcf", StringComparison.OrdinalIgnoreCase) >= 0;

    var logger = new Logger(new[] { Console.Out }, new[] { Console.Error });
    var settings = IsasConfigurationSettings.GetConfigSettings();
    var output = new DirectoryLocation(outputPath);
    var workerDirectory = new DirectoryLocation(Isas.Framework.Utilities.Utilities.GetAssemblyFolder(typeof(CNVChecker)));
    var commandManager = new CommandManager(new ExecutableProcessor(settings, logger, workerDirectory));
    WorkDoerFactory.RunWithWorkDoer(logger, settings, output, workDoer =>
    {
        var tabixWrapper = TabixWrapperFactory.GetTabixWrapper(logger, workDoer, commandManager);
        var ploidyCorrector = new PloidyCorrector(logger, workDoer, new PloidyEstimator(logger, workDoer, null, false, commandManager), tabixWrapper, false);
        var checker = new CNVChecker(options.DQscoreThreshold, excludeIntervals, ploidyCorrector);

        // Reject the invalid argument combination before doing the ploidy work.
        if (checker.DQscoreThreshold.HasValue && !includePassingOnly)
        {
            throw new ArgumentException("CNV.vcf must be in a vcf format when --dqscore option is used");
        }

        if (options.PloidyInfo.SexPloidyInfo != null)
        {
            Console.WriteLine($">>>Getting reference ploidy from provided ploidy information and PAR bed file '{options.PloidyInfo.ParBed}'");
            var ploidy = checker.GetPloidy(options.PloidyInfo, output);
            var referencePloidy = LoadReferencePloidy(options.PloidyInfo.SexPloidyInfo, options.PloidyInfo.ParBed);
            knownCn = GetKnownCopyNumberWithReferencePloidy(referencePloidy, knownCn);
            calls = GetCallsWithRefPloidy(calls, ploidy);
        }

        var cnvEvaluator = new CnvEvaluator(checker);

        // For vcf input the first pass creates the results files, and the second pass
        // (includePassingOnly = false) appends its results to the same files.
        cnvEvaluator.ComputeAccuracy(knownCn, cnvCallsPath, outputPath, includePassingOnly, options, calls);
        if (includePassingOnly)
        {
            cnvEvaluator.ComputeAccuracy(knownCn, cnvCallsPath, outputPath, false, options, calls);
        }

        ComputeCallability(logger, calls, options, output);
        Console.WriteLine(">>>Done - results written to {0}", outputPath);
    });
}