Esempio n. 1
0
 private void WriteResults(string cnvCallsPath, StreamWriter outputWriter, BaseCounter baseCounter, bool includePassingOnly, MetricsCalculator metrics)
 {
     // load and append VCF header information
     _cnvChecker.HandleVcfHeaderInfo(outputWriter, new FileLocation(cnvCallsPath));
     // Report stats:
     outputWriter.WriteLine(includePassingOnly ? "Results for PASSing variants" : "Results for all variants");
     outputWriter.WriteLine("Accuracy\t{0:F4}", metrics.Accuracy);
     // SK: I felt the direction based performance metrices make more sense
     outputWriter.WriteLine("DirectionAccuracy\t{0:F4}", metrics.DirectionAccuracy);
     outputWriter.WriteLine("F-score\t{0:F4}", metrics.F1Score);
     outputWriter.WriteLine("Recall\t{0:F4}", metrics.Recall);
     outputWriter.WriteLine("DirectionRecall\t{0:F4}", metrics.DirectionRecall);
     outputWriter.WriteLine("Precision\t{0:F4}", metrics.Precision);
     outputWriter.WriteLine("DirectionPrecision\t{0:F4}", metrics.DirectionPrecision);
     outputWriter.WriteLine("GainRecall\t{0:F4}", metrics.GainRecall);
     outputWriter.WriteLine("GainDirectionRecall\t{0:F4}", metrics.GainDirectionRecall);
     outputWriter.WriteLine("GainPrecision\t{0:F4}", metrics.GainPrecision);
     outputWriter.WriteLine("GainDirectionPrecision\t{0:F4}", metrics.GainDirectionPrecision);
     outputWriter.WriteLine("LossRecall\t{0:F4}", metrics.LossRecall);
     outputWriter.WriteLine("LossDirectionRecall\t{0:F4}", metrics.LossRecall);
     outputWriter.WriteLine("LossPrecision\t{0:F4}", metrics.LossPrecision);
     outputWriter.WriteLine("LossDirectionPrecision\t{0:F4}", metrics.LossDirectionPrecision);
     outputWriter.WriteLine("MeanEventAccuracy\t{0:F4}", 100 * baseCounter.MeanAccuracy);
     outputWriter.WriteLine("MedianEventAccuracy\t{0:F4}", 100 * baseCounter.MedianAccuracy);
     outputWriter.WriteLine("VariantEventsCalled\t{0}", baseCounter.TotalVariants);
     outputWriter.WriteLine("VariantBasesCalled\t{0}", baseCounter.TotalVariantBases);
     if (baseCounter.RoiBaseCount != null && metrics.RoiBases > 0)
     {
         outputWriter.WriteLine("ROIAccuracy\t{0:F4}", metrics.ROIAccuracy);
         outputWriter.WriteLine("ROIDirectionAccuracy\t{0:F4}", metrics.ROIDirectionAccuracy);
     }
     // to separate passing and all variant results
     outputWriter.WriteLine();
 }
Esempio n. 2
0
        public MetricsCalculator CalculateMetrics(Dictionary <string, List <CNInterval> > knownCN, Dictionary <string, List <CnvCall> > calls,
                                                  BaseCounter baseCounter, bool optionsSkipDiploid, bool includePassingOnly, Dictionary <string, BitArray> kmerfa = null)
        {
            // string referenceBases = string.Empty;
            calls.Values.SelectMany(x => x).ForEach(call =>
            {
                if (!(call.IsAltVariant && call.Length >= baseCounter.MinSize && call.Length <= baseCounter.MaxSize))
                {
                    return;
                }
                if (includePassingOnly && !call.PassFilter)
                {
                    return;
                }
                baseCounter.TotalVariantBases += call.Length;
                baseCounter.TotalVariants++;
            });

            // skip truth interval that have >= 80% of unmapable bases
            // code is not parallel as this will be done by Regression workflow
            const double fractionUnmappableBases = 0.8;
            var          filteredknownCn         = new Dictionary <string, List <CNInterval> >();

            if (kmerfa != null)
            {
                foreach (var chromosome in knownCN.Keys)
                {
                    filteredknownCn[chromosome] = new List <CNInterval>();
                    foreach (var interval in knownCN[chromosome])
                    {
                        // always include REF intervals even if they are in unmappable regions
                        if (interval.Cn == interval.ReferenceCopyNumber)
                        {
                            filteredknownCn[chromosome].Add(interval);
                            continue;
                        }
                        var flaggedBasesCounter = 0;
                        for (var bp = interval.Start; bp < interval.End; bp++)
                        {
                            if (!kmerfa[chromosome][bp])
                            {
                                flaggedBasesCounter++;
                            }
                        }

                        if (flaggedBasesCounter / (double)interval.Length < fractionUnmappableBases)
                        {
                            filteredknownCn[chromosome].Add(interval);
                        }
                        else
                        {
                            Console.Error.WriteLine($"skipping truth interval {interval} with >= {fractionUnmappableBases} fraction of unmappable positions");
                        }
                    }
                }
            }
            else
            {
                filteredknownCn = knownCN;
            }

            foreach (CNInterval interval in filteredknownCn.Values.SelectMany(x => x))
            {
                if (!(interval.Length >= baseCounter.MinSize && interval.Length <= baseCounter.MaxSize))
                {
                    continue;
                }
                int nonOverlapBases    = interval.Length;
                int nonOverlapRoiBases = 0;
                if (!_cnvChecker.RegionsOfInterest.Empty() &&
                    _cnvChecker.RegionsOfInterest.ContainsKey(interval.Chromosome))
                {
                    foreach (CNInterval roiInterval in _cnvChecker.RegionsOfInterest[interval.Chromosome])
                    {
                        int roiOverlapStart = Math.Max(roiInterval.Start, interval.Start);
                        int roiOverlapEnd   = Math.Min(roiInterval.End, interval.End);
                        if (roiOverlapStart >= roiOverlapEnd)
                        {
                            continue;
                        }
                        int roiOverlapBases = roiOverlapEnd - roiOverlapStart;
                        nonOverlapRoiBases -= roiOverlapBases;
                    }
                }
                int    totalOverlapBases      = 0;
                int    totalRoiOverlapBases   = 0;
                int    excludeIntervalBases   = 0;
                var    totalIntervalRefPloidy = new List <(int ploidy, int length)>();
                string chromosome             = interval.Chromosome;


                if (!calls.ContainsKey(chromosome))
                {
                    chromosome = chromosome.Replace("chr", "");
                }
                if (!calls.ContainsKey(chromosome))
                {
                    chromosome = "chr" + chromosome;
                }


                IEnumerable <CnvCall> callsThisChromosome;
                if (calls.ContainsKey(chromosome))
                {
                    callsThisChromosome = calls[chromosome];
                }
                else
                {
                    Console.Error.WriteLine($"Error: no Canvas calls for chromosome {interval.Chromosome} in truth file");
                    callsThisChromosome = Enumerable.Empty <CnvCall>();
                }
                int knownCn = interval.Cn;
                if (knownCn > MaxCn)
                {
                    knownCn = MaxCn;
                }

                int thisIntervalBasesTruePositive  = 0;
                int thisIntervalBasesTrueNegative  = 0;
                int thisIntervalBasesFalsePositive = 0;
                int thisIntervalBasesFalseNegative = 0;
                int thisIntervalBasesNoCall        = interval.Length;
                int thisIntervalBasesExcluded      = 0;

                foreach (CnvCall call in callsThisChromosome)
                {
                    if (!call.RefPloidy.HasValue)
                    {
                        throw new IlluminaException($"Could not determine reference ploidy for call '{call}'. Please provide ploidy information via command line option.");
                    }
                    int refPloidy = interval.ReferenceCopyNumber ?? call.RefPloidy.Value;
                    int CN        = call.CN;
                    if (call.AltAllele == "." && optionsSkipDiploid)
                    {
                        continue;
                    }

                    if (CN > MaxCn)
                    {
                        CN = MaxCn;
                    }
                    string chr          = call.Chr;
                    int    overlapStart = Math.Max(call.Start, interval.Start);
                    int    overlapEnd   = Math.Min(call.End, interval.End);
                    if (overlapStart >= overlapEnd)
                    {
                        continue;
                    }
                    int overlapBases          = overlapEnd - overlapStart;
                    int thisCallBasesExcluded = 0;
                    // We've got an overlap interval.  Kill off some bases from this interval, if it happens
                    // to overlap with an excluded interval:
                    if (_cnvChecker.ExcludeIntervals.ContainsKey(chr))
                    {
                        foreach (CNInterval excludeInterval in _cnvChecker.ExcludeIntervals[chr])
                        {
                            int excludeOverlapStart = Math.Max(excludeInterval.Start, overlapStart);
                            int excludeOverlapEnd   = Math.Min(excludeInterval.End, overlapEnd);
                            if (excludeOverlapStart >= excludeOverlapEnd)
                            {
                                continue;
                            }
                            excludeIntervalBases  += excludeOverlapEnd - excludeOverlapStart;
                            thisCallBasesExcluded += excludeOverlapEnd - excludeOverlapStart;
                            overlapBases          -= excludeOverlapEnd - excludeOverlapStart;
                            // if majority of the region is in exclude intervals, don't consider any overlap
                            // N.B.: the denominator here looks dubious -- why compare overlap bases to the excluded interval overlap size,
                            // rather than, say, the size of the original truth interval?  Or the length of the overlap of the current CNV?
                            if (overlapBases / Math.Max(excludeOverlapEnd - excludeOverlapStart, 1) < 0.1)
                            {
                                thisCallBasesExcluded += overlapBases;
                                excludeIntervalBases  += overlapBases;
                                overlapBases           = 0;
                                break;
                            }
                        }
                    }

                    totalIntervalRefPloidy.Add((refPloidy, overlapBases));

                    if (call.PassFilter || !includePassingOnly)
                    {
                        totalOverlapBases += overlapBases;
                        baseCounter.BaseCount[knownCn, CN, refPloidy] += overlapBases;

                        if (knownCn == CN)
                        {
                            if (CN == refPloidy)
                            {
                                thisIntervalBasesTrueNegative += overlapBases;
                            }
                            else
                            {
                                thisIntervalBasesTruePositive += overlapBases;
                            }
                        }
                        else
                        {
                            if (knownCn == refPloidy)
                            {
                                thisIntervalBasesFalsePositive += overlapBases;
                            }
                            else
                            {
                                thisIntervalBasesFalseNegative += overlapBases;
                            }
                        }
                        thisIntervalBasesNoCall   -= overlapBases;
                        thisIntervalBasesNoCall   -= thisCallBasesExcluded;
                        thisIntervalBasesExcluded += thisCallBasesExcluded;
                    }

                    interval.BasesCovered += overlapBases;

                    if (knownCn == CN)
                    {
                        interval.BasesCalledCorrectly += overlapBases;
                    }
                    else
                    {
                        interval.BasesCalledIncorrectly += overlapBases;
                    }

                    if (_cnvChecker.RegionsOfInterest.Empty() ||
                        !_cnvChecker.RegionsOfInterest.ContainsKey(chr))
                    {
                        continue;
                    }

                    foreach (CNInterval roiInterval in _cnvChecker.RegionsOfInterest[chr])
                    {
                        int roiOverlapStart = Math.Max(roiInterval.Start, overlapStart);
                        int roiOverlapEnd   = Math.Min(roiInterval.End, overlapEnd);
                        if (roiOverlapStart >= roiOverlapEnd)
                        {
                            continue;
                        }
                        int roiOverlapBases = roiOverlapEnd - roiOverlapStart;
                        if (call.PassFilter || !includePassingOnly)
                        {
                            totalRoiOverlapBases += roiOverlapBases;
                            baseCounter.RoiBaseCount[knownCn, CN, refPloidy] += roiOverlapBases;
                        }
                    }
                }

                if (baseCounter.MinSize == 0 && baseCounter.MaxSize > 100000)
                {
                    Console.WriteLine($"Truth {chromosome}:{interval.Start}-{interval.End} CN={knownCn} base counts TP/TN/FP/FN/NC/EXCL {thisIntervalBasesTruePositive} {thisIntervalBasesTrueNegative} {thisIntervalBasesFalsePositive} {thisIntervalBasesFalseNegative} {thisIntervalBasesNoCall} {thisIntervalBasesExcluded}");
                }

                nonOverlapBases -= (totalOverlapBases + excludeIntervalBases);

                if (!interval.ReferenceCopyNumber.HasValue)
                {
                    if (totalIntervalRefPloidy.Empty())
                    {
                        throw new ArgumentException(
                                  $"Error: Truth variant {interval.Chromosome}:{interval.Start}-{interval.End} with no overlapping " +
                                  $"Canvas calls. Reference ploidy cannot be determined! Please provide reference ploidy via command line options");
                    }

                    interval.ReferenceCopyNumber = Convert.ToInt32(Math.Round(Utilities.WeightedMean(
                                                                                  totalIntervalRefPloidy.Select(x => (double)x.ploidy).ToList(),
                                                                                  totalIntervalRefPloidy.Select(x => (double)Math.Max(x.length, 1)).ToList())));
                }
                if (nonOverlapBases < 0)
                {
                    throw new InvalidDataException($"Truth variant {interval.Chromosome}:{interval.Start}-{interval.End} has negative non-overlap bases");
                }
                baseCounter.NoCalls[knownCn, interval.ReferenceCopyNumber.Value] += nonOverlapBases;
            }

            CalculateMedianAndMeanAccuracies(baseCounter, knownCN);
            var allIntervals = knownCN.SelectMany(kvp => kvp.Value).ToList();

            // find truth interval with highest number of false negatives (hurts recall)
            var variantIntervals = allIntervals.Where(interval => interval.Cn != interval.ReferenceCopyNumber).ToList();

            if (variantIntervals.Any())
            {
                var intervalMaxFalseNegatives = variantIntervals.MaxBy(interval => interval.BasesNotCalled + interval.BasesCalledIncorrectly);
                Console.WriteLine($"Truth interval with most false negatives (hurts recall): {intervalMaxFalseNegatives}");
            }
            // find truth interval with highest number of false positive (hurts precision)
            var refIntervals = allIntervals.Where(interval => interval.Cn == interval.ReferenceCopyNumber).ToList();

            if (refIntervals.Any())
            {
                var intervalMaxFalsePositives = refIntervals.MaxBy(interval => interval.BasesCalledIncorrectly);
                Console.WriteLine($"Truth interval with most false positives (hurts precision): {intervalMaxFalsePositives}");
            }
            return(MetricsCalculator.CalculateMetrics(baseCounter, MaxCn, 2));
        }