// Tracks the disparity between pools as a measure of FFPE degradation,
// or overall oxidation affecting the tissue sample.
//
// Possible SNP changes:
//
//    *   A   C   G   T
//    A   *   1   2   3
//    C   4   *   5   6
//    G   7   8   *   9
//    T  10  11  12   *
//
/// <summary>
/// Makes a single pass over the vcf, accumulating overall mutation-category
/// counts and amplicon-edge counts, then writes whichever count files the
/// options request. Returns the paths of the (possibly unwritten) output files.
/// </summary>
public static SignatureSorterResultFiles StrainVcf(VQROptions options)
{
    var basicCountsData = new CountData();
    var edgeVariantsCountData = new EdgeIssueCountData(options.ExtentofEdgeRegion);

    var basicCountsPath = CleanUpOldFiles(options.VcfPath, options.OutputDirectory, ".counts");
    var edgeCountsPath = CleanUpOldFiles(options.VcfPath, options.OutputDirectory, ".edgecounts");
    var edgeVariantsPath = CleanUpOldFiles(options.VcfPath, options.OutputDirectory, ".edgevariants");

    using (var reader = new AlleleReader(options.VcfPath))
    {
        var alleles = new List<CalledAllele>();
        while (reader.GetNextVariants(out alleles))
        {
            foreach (var allele in alleles)
            {
                try
                {
                    basicCountsData.Add(allele);
                    edgeVariantsCountData.Add(allele, edgeVariantsPath);
                }
                catch (Exception ex)
                {
                    Logger.WriteToLog(string.Format("Fatal error processing vcf; Check {0}, position {1}. Exception: {2}",
                        allele.Chromosome, allele.ReferencePosition, ex));
                    throw; // bare rethrow preserves the original stack trace
                }
            }
        }

        // The edge-issue filter trails N variants behind; push nulls through
        // to flush anything still sitting in its buffer.
        for (var flushed = 0; flushed < options.ExtentofEdgeRegion; flushed++)
        {
            edgeVariantsCountData.Add(null, edgeVariantsPath);
        }

        if (options.LociCount > 0)
        {
            basicCountsData.ForceTotalPossibleMutations(options.LociCount);
            edgeVariantsCountData.ForceTotalPossibleMutations(options.LociCount);
        }

        if (options.DoBasicChecks)
        {
            CountsFileWriter.WriteCountsFile(basicCountsPath, basicCountsData);
        }

        if (options.DoAmpliconPositionChecks)
        {
            CountsFileWriter.WriteCountsFile(edgeCountsPath, edgeVariantsCountData);
        }
    }

    return (new SignatureSorterResultFiles(basicCountsPath, edgeCountsPath, edgeVariantsPath));
}
/// <summary>
/// Writes a counts file: a "CountsByCategory" section listing each mutation
/// category with its count, followed by the totals and the observed mismatch
/// rate expressed as a percentage.
/// </summary>
public static void WriteCountsFile(string outFile, CountData counts)
{
    using (var writer = new StreamWriter(new FileStream(outFile, FileMode.Create)))
    {
        writer.WriteLine();
        writer.WriteLine("CountsByCategory");

        foreach (var categoryAndCount in counts.CountsByCategory)
        {
            writer.WriteLine(categoryAndCount.Key + "\t" + categoryAndCount.Value);
        }

        writer.WriteLine();
        writer.WriteLine("AllPossibleVariants\t" + counts.NumPossibleVariants);
        writer.WriteLine("VariantsCountedTowardEstimate\t" + counts.TotalMutations);
        writer.WriteLine("MismatchEstimate(%)\t{0:N4}", counts.ObservedMutationRate * 100);
    }
}
/// <summary>
/// This is a pretty naive reader. The mutation categories must be listed first, and the rates must be listed last, or the parsing fails.
/// There are some hard coded strings. Yes, maybe it should be in json.
/// Parsing only begins after a line containing "CountsByCategory" is seen;
/// summary/rate lines are recognized by their first word and skipped, and any
/// other recognized mutation-category line has its count accumulated.
/// </summary>
/// <param name="file">Path to a counts file previously written by WriteCountsFile.</param>
/// <returns>A CountData populated from the file's category counts and totals.</returns>
public static CountData ReadCountsFile(string file)
{
    CountData variantCounts = new CountData();

    // Stays false until the "CountsByCategory" header line is seen; note the
    // flag is flipped AFTER the data-parsing branch below, so the header line
    // itself is never parsed as data.
    bool inRateSection = false;

    using (StreamReader sr = new StreamReader(new FileStream(file, FileMode.Open)))
    {
        string line;
        while (true)
        {
            line = sr.ReadLine();

            // Blank lines are separators; skip them. (The null check must come
            // second only in source order — "" never equals null, so end-of-file
            // still breaks correctly.)
            if (line == "")
            {
                continue;
            }
            if (line == null)
            {
                break;
            }

            if (inRateSection)
            {
                // Whitespace-delimited: first token is the label, second the count.
                string[] splat = line.Split();
                if (splat.Length < 2)
                {
                    continue;
                }

                double result = -1;
                // NOTE(review): TryParse uses the current culture — confirm counts
                // files are always written and read under the same culture settings.
                if (!(double.TryParse(splat[1], out result)))
                {
                    throw new IOException("Unable to parse counts from noise file " + file);
                }

                string firstWord = splat[0];

                // The "continue" cases below continue the enclosing while loop,
                // which also skips the "CountsByCategory" re-check at the bottom.
                switch (firstWord)
                {
                    case "AllPossibleVariants":
                        variantCounts.NumPossibleVariants += result;
                        break;

                    // Derived/summary lines — recomputed by CountData, so ignored here.
                    case "FalsePosVariantsFound":
                    case "ErrorRate(%)":
                    case "VariantsCountedTowardEstimate":
                    case "ErrorRateEstimate(%)":
                    case "MismatchEstimate(%)":
                        continue;

                    default:
                        //if its a mutation category - do something. Else do nothing
                        if (MutationCategoryUtil.IsValidCategory(firstWord))
                        {
                            MutationCategory category = MutationCategoryUtil.GetMutationCategory(firstWord);

                            //this category should always exist. this is just defensive
                            if (!variantCounts.CountsByCategory.ContainsKey(category))
                            {
                                variantCounts.CountsByCategory.Add(category, 0);
                                Logger.WriteWarningToLog("This counts file found a mutation category listed that this version of VQR is not aware of, and cannot process. Please check " + firstWord);
                            }
                            variantCounts.CountsByCategory[category] += result;
                        }
                        break;
                }
            }

            // Header marker: everything after this line is treated as data.
            if (line.Contains("CountsByCategory"))
            {
                inRateSection = true;
            }
        }
    }

    return(variantCounts);
}
/// <summary>
/// Loads the counts files produced by the signature-sorting pass and converts
/// them into phred-scaled recalibration lookup tables: a general (basic) table,
/// an amplicon-edge table, and — when both checks ran — an edge-risk table
/// comparing the two.
/// </summary>
private static QualityRecalibrationData GetRecalibrationTables(SignatureSorterResultFiles resultsFilePaths, VQROptions options)
{
    CountData basicCounts = null;
    CountData edgeCounts = null;
    var recalibrationData = new QualityRecalibrationData();

    if (options.DoBasicChecks)
    {
        if (File.Exists(resultsFilePaths.BasicCountsFilePath))
        {
            Logger.WriteToLog("Found counts file: {0} ", resultsFilePaths.BasicCountsFilePath);

            basicCounts = CountsFileReader.ReadCountsFile(resultsFilePaths.BasicCountsFilePath);
            recalibrationData.BasicLookupTable = GetPhredScaledCalibratedRates(options.BamFilterParams.MinimumBaseCallQuality, options.ZFactor, basicCounts);

            // A null or empty table means no category rose above the noise threshold.
            if ((recalibrationData.BasicLookupTable == null) || (recalibrationData.BasicLookupTable.Count == 0))
            {
                Logger.WriteToLog("No general recalibration needed.");
            }
            else
            {
                Logger.WriteToLog("General mutation bias detected. This sample may have sample-specific prep issues such as FFPE or oxidation damage.");
            }
        }
        else
        {
            Logger.WriteToLog("Cannot do basic recalibration. Cannot find {0} ", resultsFilePaths.BasicCountsFilePath);
        }
    }

    if (options.DoAmpliconPositionChecks)
    {
        if (File.Exists(resultsFilePaths.AmpliconEdgeCountsFilePath))
        {
            Logger.WriteToLog("Found counts file: {0} ", resultsFilePaths.AmpliconEdgeCountsFilePath);

            edgeCounts = CountsFileReader.ReadCountsFile(resultsFilePaths.AmpliconEdgeCountsFilePath);
            recalibrationData.AmpliconEdgeVariantsLookupTable = GetPhredScaledCalibratedRates(options.BamFilterParams.MinimumBaseCallQuality, options.ZFactor, edgeCounts);
            recalibrationData.AmpliconEdgeVariantsList = VariantListReader.ReadVariantListFile(resultsFilePaths.AmpliconEdgeSuspectListFilePath);

            if ((recalibrationData.AmpliconEdgeVariantsLookupTable == null) || (recalibrationData.AmpliconEdgeVariantsLookupTable.Count == 0))
            {
                Logger.WriteToLog("No position-in-amplicon recalibration needed.");
            }
        }
        else
        {
            Logger.WriteToLog("Cannot do amplicon-position based recalibration. Cannot find {0} ", resultsFilePaths.AmpliconEdgeCountsFilePath);
        }
    }

    // Compare edge issues with FFPE-like issues: did the bulk of variants come
    // from amplicon edges? If a variant is X times more likely to be called
    // near an edge, that excess is an estimate of the edge-induced error.
    if (options.DoBasicChecks && options.DoAmpliconPositionChecks)
    {
        recalibrationData.EdgeRiskLookupTable = GetPhredScaledCalibratedRatesForEdges(options.BamFilterParams.MinimumBaseCallQuality, options.AlignmentWarningThreshold, basicCounts, edgeCounts);
    }

    return (recalibrationData);
}
/// <summary>
/// Computes a phred-scaled noise rate for each SNV category whose count is an
/// outlier: above avg + z * sigma, where avg and sigma are taken over the
/// 8 middle values of the 12 sorted category counts (top two and bottom two
/// dropped). Returns null unless exactly the 12 SNV categories remain after
/// indel/other/reference entries are removed.
/// </summary>
Dictionary<MutationCategory, int> GetPhredScaledCalibratedRates(int baselineQNoise, double zFactor, CountData counts)
{
    double baseNoiseRate = MathOperations.QtoP(baselineQNoise);
    var phredScaledRatesByCategory = new Dictionary<MutationCategory, int>();

    // NOTE(review): countsByCategory aliases the caller's dictionary, so these
    // Remove calls strip indel/other/reference entries from counts itself —
    // later consumers see only the 12 SNV categories. Confirm this is intended.
    var countsByCategory = counts.CountsByCategory;
    countsByCategory.Remove(MutationCategory.Deletion);
    countsByCategory.Remove(MutationCategory.Insertion);
    countsByCategory.Remove(MutationCategory.Other);
    countsByCategory.Remove(MutationCategory.Reference);

    double[] sortedFinalCounts = countsByCategory.Values.OrderBy(d => d).ToArray();

    if (countsByCategory.Keys.Count != 12)
    {
        return (null);
    }

    // Average the middle eight values, throwing out the two highest and two
    // lowest outliers. (Per-element division kept to match original FP order.)
    double numDataPoints = 8; //12 - 4;
    double avg = 0;
    for (int idx = 2; idx < 10; idx++)
    {
        avg += (sortedFinalCounts[idx] / numDataPoints);
    }

    // Variance over the same trimmed window.
    double variance = 0;
    for (int idx = 2; idx < 10; idx++)
    {
        variance += ((avg - sortedFinalCounts[idx]) * (avg - sortedFinalCounts[idx]) / numDataPoints);
    }

    // threshold = avg + z * sigma: anything above is treated as a biased category.
    double threshold = avg + zFactor * Math.Sqrt(variance);

    foreach (var category in countsByCategory.Keys)
    {
        double mutationCount = countsByCategory[category];
        if (mutationCount <= threshold)
        {
            continue;
        }

        // Baseline noise level is 'b'. The observed transition-rate is how
        // frequently we observe a vf > b, so our expected frequency due to
        // noise = (prob of observation f_i <= b) + (prob of observation f_i > b).
        double observedNoiseRate = 0;
        if (counts.NumPossibleVariants > 0)
        {
            observedNoiseRate = mutationCount / counts.NumPossibleVariants;
        }

        // Deliberately taking the floor instead of rounding.
        phredScaledRatesByCategory.Add(category, (int)(MathOperations.PtoQ(observedNoiseRate + baseNoiseRate)));
    }

    return (phredScaledRatesByCategory);
}
/// <summary>
/// Estimates, per mutation category, the extra phred-scaled error risk for
/// variants called near amplicon edges. Compares the edge mismatch rate to the
/// rate everywhere else; the excess over the null-hypothesis expectation is
/// treated as edge-induced error and apportioned to categories by their share
/// of edge mismatches.
/// </summary>
/// <param name="baselineQNoise">Minimum base-call quality. NOTE(review): currently unread in this method — kept for signature parity with GetPhredScaledCalibratedRates.</param>
/// <param name="warningThreshold">Edge/non-edge rate ratio above which a warning is logged.</param>
/// <param name="basicCountsData">Counts over all loci.</param>
/// <param name="edgeIssueCountsData">Counts over edge-region loci only (a subset of the basic counts).</param>
/// <returns>Phred-scaled edge-risk estimate per category present in the edge counts.</returns>
Dictionary <MutationCategory, int> GetPhredScaledCalibratedRatesForEdges(int baselineQNoise, double warningThreshold, CountData basicCountsData, CountData edgeIssueCountsData)
{
    Dictionary <MutationCategory, int> AdjustedErrorRates = new Dictionary <MutationCategory, int>();

    // NOTE(review): if basicCountsData.ObservedMutationRate is 0 this is
    // Infinity (doubles do not throw), which will trip the warning below;
    // similarly TotalMutations == 0 yields NaN rates further down — confirm
    // upstream guarantees non-empty counts.
    double GeneralEdgeRiskIncrease = edgeIssueCountsData.ObservedMutationRate / basicCountsData.ObservedMutationRate;

    if (GeneralEdgeRiskIncrease > warningThreshold)
    {
        Logger.WriteToLog("Warning, high levels of mismatches detected at loci near edges, relative to all other loci, by a factor of " + GeneralEdgeRiskIncrease);
    }

    // Mismatch rate over the loci that are NOT in an edge region.
    double MuationsCalledNotInEdge = basicCountsData.TotalMutations - edgeIssueCountsData.TotalMutations;
    double TotalLociNotInEdge = basicCountsData.NumPossibleVariants - edgeIssueCountsData.NumPossibleVariants;
    double MutationRateNotInEdge = MuationsCalledNotInEdge / TotalLociNotInEdge;

    // If the error rate at the edges were the same as the error rate in the
    // middle, we would expect this many variants at the edges:
    double NullHypothesisExpectedMismatches = MutationRateNotInEdge * edgeIssueCountsData.NumPossibleVariants;

    // The excess over that expectation is our estimate of how many edge calls are wrong.
    double HowManyAreProbablyWrong = edgeIssueCountsData.TotalMutations - NullHypothesisExpectedMismatches;

    // Error rate in edge regions, given that a variant was called there.
    double EstimatedErrorRateInEdgeRegions = HowManyAreProbablyWrong / edgeIssueCountsData.TotalMutations;

    foreach (var cat in edgeIssueCountsData.CountsByCategory.Keys)
    {
        double countsAtEdge = edgeIssueCountsData.CountsByCategory[cat];

        // How much this particular category contributed to the error-rate increase.
        double proportion = countsAtEdge / edgeIssueCountsData.TotalMutations;
        double estimatedErrorRateByCategory = proportion * EstimatedErrorRateInEdgeRegions;

        int riskAsQRate = (int)MathOperations.PtoQ(estimatedErrorRateByCategory);
        AdjustedErrorRates.Add(cat, riskAsQRate);
    }

    return (AdjustedErrorRates);
}