Beispiel #1
0
        // The idea is to keep track of the disparity between two pools as a measure of FFPE degradation,
        // or of overall oxidation affecting the tissue sample.


        //possible SNP changes:
        //
        //
        // *    A   C   G   T
        //  A   *   1   2   3
        //  C   4   *   5   6
        //  G   7   8   *   9
        //  T   10  11  12  *
        //

        /// <summary>
        /// Streams every allele of the vcf at <c>options.VcfPath</c> into two tallies (an overall
        /// count and an amplicon-edge count) and writes the counts files that the options enable.
        /// </summary>
        /// <param name="options">
        /// Supplies the vcf path, the output directory, the extent of the edge region, an optional
        /// loci count override, and the flags selecting which counts files get written.
        /// </param>
        /// <returns>
        /// The basic counts, edge counts and edge variants paths. The paths are returned even when
        /// the corresponding check is disabled and no counts file was written by this method.
        /// </returns>
        public static SignatureSorterResultFiles StrainVcf(VQROptions options)
        {
            List<CalledAllele> alleleBatch;
            var overallTally = new CountData();
            var edgeTally    = new EdgeIssueCountData(options.ExtentofEdgeRegion);

            // One output path per artifact, each derived from the vcf path and the output directory.
            // NOTE(review): CleanUpOldFiles presumably deletes stale copies first - confirm in its definition.
            string overallCountsFile = CleanUpOldFiles(options.VcfPath, options.OutputDirectory, ".counts");
            string edgeCountsFile    = CleanUpOldFiles(options.VcfPath, options.OutputDirectory, ".edgecounts");
            string edgeVariantsFile  = CleanUpOldFiles(options.VcfPath, options.OutputDirectory, ".edgevariants");

            using (AlleleReader vcfReader = new AlleleReader(options.VcfPath))
            {
                while (vcfReader.GetNextVariants(out alleleBatch))
                {
                    foreach (var allele in alleleBatch)
                    {
                        try
                        {
                            overallTally.Add(allele);
                            edgeTally.Add(allele, edgeVariantsFile);
                        }
                        catch (Exception ex)
                        {
                            // Record where in the vcf the failure happened, then let it propagate.
                            Logger.WriteToLog(string.Format("Fatal error processing vcf; Check {0}, position {1}.  Exception: {2}",
                                                            allele.Chromosome, allele.ReferencePosition, ex));
                            throw;
                        }
                    }
                }

                // The edge issue filter trails N variants behind the reader, so push N nulls
                // through it to drain whatever is still sitting in its buffer.
                for (int flushed = 0; flushed < options.ExtentofEdgeRegion; flushed++)
                {
                    edgeTally.Add(null, edgeVariantsFile);
                }

                // A positive loci count overrides the total possible mutations on both tallies.
                if (options.LociCount > 0)
                {
                    overallTally.ForceTotalPossibleMutations(options.LociCount);
                    edgeTally.ForceTotalPossibleMutations(options.LociCount);
                }

                if (options.DoBasicChecks)
                {
                    CountsFileWriter.WriteCountsFile(overallCountsFile, overallTally);
                }

                if (options.DoAmpliconPositionChecks)
                {
                    CountsFileWriter.WriteCountsFile(edgeCountsFile, edgeTally);
                }
            }

            return new SignatureSorterResultFiles(overallCountsFile, edgeCountsFile, edgeVariantsFile);
        }
        /// <summary>
        /// Writes a counts file: a "CountsByCategory" section with one tab-separated
        /// category/count line per entry, followed by the summary totals.
        /// </summary>
        /// <param name="outFile">Path of the file to create; an existing file is overwritten.</param>
        /// <param name="counts">The tallies to serialize.</param>
        public static void WriteCountsFile(string outFile, CountData counts)
        {
            using (var output = new StreamWriter(new FileStream(outFile, FileMode.Create)))
            {
                // Section header, preceded by a blank line.
                output.WriteLine();
                output.WriteLine("CountsByCategory");

                foreach (var entry in counts.CountsByCategory)
                {
                    output.WriteLine(entry.Key + "\t" + entry.Value);
                }

                // Summary block, separated from the categories by a blank line.
                output.WriteLine();
                output.WriteLine("AllPossibleVariants\t" + counts.NumPossibleVariants);
                output.WriteLine("VariantsCountedTowardEstimate\t" + counts.TotalMutations);

                // The observed rate is reported as a percentage with four decimals.
                var mismatchPercent = counts.ObservedMutationRate * 100;
                output.WriteLine("MismatchEstimate(%)\t{0:N4}", mismatchPercent);
            }
        }
        /// <summary>
        /// Reads a counts file back into a <c>CountData</c>.
        /// This is a pretty naive reader: nothing is parsed until a line containing "CountsByCategory"
        /// has been seen, so the mutation categories must be listed first and the rates last, or the
        /// parsing fails. There are some hard coded strings. Yes, maybe it should be in json.
        /// </summary>
        /// <param name="file">Path of the counts file. It is opened for reading only.</param>
        /// <returns>
        /// A <c>CountData</c> holding the per-category counts and the "AllPossibleVariants" total
        /// accumulated from the file. The other summary lines are recognised and ignored.
        /// </returns>
        /// <exception cref="IOException">
        /// A line after the section header has at least two tokens but the second is not a number.
        /// </exception>
        public static CountData ReadCountsFile(string file)
        {
            CountData variantCounts = new CountData();
            bool      inRateSection = false;

            // Ask for read access only. FileStream(path, mode) requests read/write access, which
            // makes opening a read-only counts file fail even though nothing is ever written.
            using (StreamReader sr = new StreamReader(new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.Read)))
            {
                string line;

                while ((line = sr.ReadLine()) != null)
                {
                    if (line == "")
                    {
                        continue;
                    }

                    if (!inRateSection)
                    {
                        // Everything before the section header is skipped; the header itself carries no data.
                        if (line.Contains("CountsByCategory"))
                        {
                            inRateSection = true;
                        }

                        continue;
                    }

                    // Data lines are "<name><whitespace><number>"; lines with fewer tokens are skipped.
                    string[] splat = line.Split();

                    if (splat.Length < 2)
                    {
                        continue;
                    }

                    double result;
                    if (!double.TryParse(splat[1], out result))
                    {
                        throw new IOException("Unable to parse counts from noise file " + file);
                    }

                    string firstWord = splat[0];
                    switch (firstWord)
                    {
                    case "AllPossibleVariants":
                        variantCounts.NumPossibleVariants += result;
                        break;

                    // Summary lines that are recognised but deliberately not read back.
                    case "FalsePosVariantsFound":
                    case "ErrorRate(%)":
                    case "VariantsCountedTowardEstimate":
                    case "ErrorRateEstimate(%)":
                    case "MismatchEstimate(%)":
                        break;

                    default:

                        //if its a mutation category - do something. Else do nothing
                        if (MutationCategoryUtil.IsValidCategory(firstWord))
                        {
                            MutationCategory category = MutationCategoryUtil.GetMutationCategory(firstWord);

                            //this category should always exist. this is just defensive
                            if (!variantCounts.CountsByCategory.ContainsKey(category))
                            {
                                variantCounts.CountsByCategory.Add(category, 0);
                                Logger.WriteWarningToLog("This counts file found a mutation category listed that this version of VQR is not aware of, and cannot process. Please check " + firstWord);
                            }

                            variantCounts.CountsByCategory[category] += result;
                        }
                        break;
                    }
                }
            }

            return variantCounts;
        }
        /// <summary>
        /// Builds the recalibration lookup tables from the counts files produced earlier.
        /// Each enabled check reads its counts file and derives a table from it; a missing counts
        /// file is logged and that check is skipped. The edge-risk table needs both sets of counts
        /// and is only computed when both were actually loaded.
        /// </summary>
        /// <param name="resultsFilePaths">Paths of the basic counts, edge counts and edge suspect-list files.</param>
        /// <param name="options">Selects which checks run and supplies the baseline quality, z-factor and warning threshold.</param>
        /// <returns>The recalibration data, with only the tables that could be computed filled in.</returns>
        private static QualityRecalibrationData GetRecalibrationTables(SignatureSorterResultFiles resultsFilePaths,
                                                                       VQROptions options)
        {
            CountData basicCounts = null;
            CountData edgeCounts  = null;

            QualityRecalibrationData recalibrationData = new QualityRecalibrationData();

            if (options.DoBasicChecks)
            {
                if (!File.Exists(resultsFilePaths.BasicCountsFilePath))
                {
                    Logger.WriteToLog("Cannot do basic recalibration. Cannot find {0} ", resultsFilePaths.BasicCountsFilePath);
                }
                else
                {
                    Logger.WriteToLog("Found counts file: {0} ", resultsFilePaths.BasicCountsFilePath);

                    basicCounts = CountsFileReader.ReadCountsFile(resultsFilePaths.BasicCountsFilePath);
                    recalibrationData.BasicLookupTable = GetPhredScaledCalibratedRates(options.BamFilterParams.MinimumBaseCallQuality, options.ZFactor, basicCounts);

                    //if no work to do here...
                    if ((recalibrationData.BasicLookupTable == null) || (recalibrationData.BasicLookupTable.Count == 0))
                    {
                        Logger.WriteToLog("No general recalibration needed.");
                    }
                    else
                    {
                        Logger.WriteToLog("General mutation bias detected. This sample may have sample-specific prep issues such as FFPE or oxidation damage.");
                    }
                }
            }

            if (options.DoAmpliconPositionChecks)
            {
                if (!File.Exists(resultsFilePaths.AmpliconEdgeCountsFilePath))
                {
                    Logger.WriteToLog("Cannot do amplicon-position based recalibration. Cannot find {0} ", resultsFilePaths.AmpliconEdgeCountsFilePath);
                }
                else
                {
                    Logger.WriteToLog("Found counts file: {0} ", resultsFilePaths.AmpliconEdgeCountsFilePath);

                    edgeCounts = CountsFileReader.ReadCountsFile(resultsFilePaths.AmpliconEdgeCountsFilePath);
                    recalibrationData.AmpliconEdgeVariantsLookupTable = GetPhredScaledCalibratedRates(options.BamFilterParams.MinimumBaseCallQuality, options.ZFactor, edgeCounts);
                    recalibrationData.AmpliconEdgeVariantsList        = VariantListReader.ReadVariantListFile(resultsFilePaths.AmpliconEdgeSuspectListFilePath);

                    if ((recalibrationData.AmpliconEdgeVariantsLookupTable == null) || (recalibrationData.AmpliconEdgeVariantsLookupTable.Count == 0))
                    {
                        Logger.WriteToLog("No position-in-amplicon recalibration needed.");
                    }
                }
            }

            //compare edge-issues with FFPE-like issues.
            //Did the bulk of variants appear to come from the edge of amplicons..?
            //Look at the diff in percents.
            //If a variant is X more likely to be called when its by an edge - thats an estimate of the error.
            if (options.DoBasicChecks && options.DoAmpliconPositionChecks)
            {
                // Either set of counts is still null when its file was missing above. Skip the
                // comparison in that case instead of dereferencing null in the edge-risk calculation.
                if ((basicCounts != null) && (edgeCounts != null))
                {
                    recalibrationData.EdgeRiskLookupTable = GetPhredScaledCalibratedRatesForEdges(options.BamFilterParams.MinimumBaseCallQuality, options.AlignmentWarningThreshold, basicCounts, edgeCounts);
                }
                else
                {
                    Logger.WriteToLog("Cannot estimate edge risk. One or both counts files could not be found.");
                }
            }

            return recalibrationData;
        }
        /// <summary>
        /// Finds the mutation categories whose counts stand out from the rest and converts their
        /// observed rate (plus the baseline noise rate) to a Phred-scaled integer.
        /// A category stands out when its count exceeds mean + zFactor * sigma, where mean and sigma
        /// come from the 12 remaining category counts after discarding the two lowest and two highest.
        /// </summary>
        /// <param name="baselineQNoise">Baseline noise level as a Q score; converted to a probability with QtoP.</param>
        /// <param name="zFactor">Number of standard deviations above the trimmed mean used as the threshold.</param>
        /// <param name="counts">
        /// Counts to analyse. NOTE: the Deletion, Insertion, Other and Reference entries are removed
        /// from <c>counts.CountsByCategory</c> itself (unless that property hands out a copy), so the
        /// caller's data is modified.
        /// </param>
        /// <returns>
        /// Phred-scaled rates for the categories above the threshold (possibly empty), or null when
        /// the number of remaining categories is not exactly 12.
        /// </returns>
        Dictionary <MutationCategory, int> GetPhredScaledCalibratedRates(int baselineQNoise, double zFactor, CountData counts)
        {
            double backgroundRate = MathOperations.QtoP(baselineQNoise);
            var    tally          = counts.CountsByCategory;

            // Only the remaining categories take part in the analysis.
            var excluded = new[]
            {
                MutationCategory.Deletion,
                MutationCategory.Insertion,
                MutationCategory.Other,
                MutationCategory.Reference
            };

            foreach (var category in excluded)
            {
                tally.Remove(category);
            }

            if (tally.Keys.Count != 12)
            {
                return null;
            }

            //take the average value, throwing out the top two and bottom two outliers.
            double[] ascending  = tally.Values.OrderBy(v => v).ToArray();
            double   sampleSize = 8; //12 - 4;

            double mean = 0;
            foreach (double value in ascending.Skip(2).Take(8))
            {
                mean += (value / sampleSize);
            }

            //variance over the same trimmed window
            double spread = 0;
            foreach (double value in ascending.Skip(2).Take(8))
            {
                double deviation = mean - value;
                spread += (deviation * deviation / sampleSize);
            }

            //threshold = avg + z * sigma
            double cutoff = mean + zFactor * Math.Sqrt(spread);

            var phredByCategory = new Dictionary <MutationCategory, int>();

            foreach (var entry in tally)
            {
                if (!(entry.Value > cutoff))
                {
                    continue;
                }

                //baseline noise level is 'b' .
                //the observed transition-rate is how frequently we observe a vf >b.
                //so our expected freq due to noise =
                // (prob of observation f_i <= b) + (prob of observation f_i > b)
                double observedRate = (counts.NumPossibleVariants > 0)
                    ? entry.Value / counts.NumPossibleVariants
                    : 0;

                //deliberately taking floor instead of rounding.
                int phred = (int)(MathOperations.PtoQ(observedRate + backgroundRate));
                phredByCategory.Add(entry.Key, phred);
            }

            return phredByCategory;
        }
        /// <summary>
        /// Estimates, per mutation category, how much of the excess mismatch rate near amplicon
        /// edges that category accounts for, expressed as a Phred-scaled integer.
        /// </summary>
        /// <param name="baselineQNoise">Not used by this method.</param>
        /// <param name="warningThreshold">A warning is logged when the edge mutation rate exceeds the overall rate by more than this factor.</param>
        /// <param name="basicCountsData">Counts over all loci.</param>
        /// <param name="edgeIssueCountsData">Counts restricted to loci near edges.</param>
        /// <returns>One entry per category present in <c>edgeIssueCountsData.CountsByCategory</c>.</returns>
        /// <remarks>
        /// NOTE(review): no denominator is checked for zero (overall rate, loci outside the edge,
        /// edge mutation total), so degenerate inputs feed infinities or NaN into PtoQ - confirm
        /// how the callers and PtoQ deal with that.
        /// </remarks>
        Dictionary <MutationCategory, int> GetPhredScaledCalibratedRatesForEdges(int baselineQNoise, double warningThreshold, CountData basicCountsData, CountData edgeIssueCountsData)
        {
            var riskByCategory = new Dictionary <MutationCategory, int>();

            double edgeVersusOverall = edgeIssueCountsData.ObservedMutationRate / basicCountsData.ObservedMutationRate;

            if (edgeVersusOverall > warningThreshold)
            {
                Logger.WriteToLog("Warning, high levels of mismatches detected at loci near edges, relative to all other loci, by a factor of " + edgeVersusOverall);
            }

            double edgeMutations = edgeIssueCountsData.TotalMutations;
            double edgeLoci      = edgeIssueCountsData.NumPossibleVariants;

            // Mutation rate over the loci that are not near an edge.
            double interiorMutations = basicCountsData.TotalMutations - edgeMutations;
            double interiorLoci      = basicCountsData.NumPossibleVariants - edgeLoci;
            double interiorRate      = interiorMutations / interiorLoci;

            //if the error rate at the edges was the same as the error rate in the middle,
            // we would expect this many variants at the edges:
            double expectedAtEdge = interiorRate * edgeLoci;

            // Whatever we saw beyond that expectation is probably wrong.
            double excessAtEdge = edgeMutations - expectedAtEdge;

            //error rate in edge region, Given You Called a Variant.
            double edgeErrorRate = excessAtEdge / edgeMutations;

            foreach (var entry in edgeIssueCountsData.CountsByCategory)
            {
                // Share of the edge mutations that belong to this category.
                double share = entry.Value / edgeMutations;

                //how much this particular variant category contributed to the error rate increase
                double categoryErrorRate = share * edgeErrorRate;

                riskByCategory.Add(entry.Key, (int)MathOperations.PtoQ(categoryErrorRate));
            }

            return riskByCategory;
        }