/// <summary>
/// Builds a <see cref="CalledAllele"/> from the given reference/alternate pair
/// and classifies it with <c>SetType()</c> before returning it.
/// </summary>
/// <param name="reference">Value assigned to <c>ReferenceAllele</c>.</param>
/// <param name="alt">Value assigned to <c>AlternateAllele</c>.</param>
/// <returns>The newly created allele, with its type already set.</returns>
private CalledAllele MakeDummyAllele(string reference, string alt)
{
    var dummy = new CalledAllele();
    dummy.ReferenceAllele = reference;
    dummy.AlternateAllele = alt;

    dummy.SetType();

    return dummy;
}
/// <summary>
/// Verifies that <c>SetType()</c> assigns the expected <c>AlleleCategory</c>
/// for each reference/alternate pair in the table below.
/// </summary>
public void CandidateAllele_CheckType()
{
    // Cases are evaluated in order; the first mismatch fails the test.
    var expectations = new[]
    {
        new { Reference = "A",      Alternate = ".",   Expected = AlleleCategory.Reference },
        new { Reference = "A",      Alternate = "A",   Expected = AlleleCategory.Reference },
        new { Reference = "A",      Alternate = "C",   Expected = AlleleCategory.Snv },
        new { Reference = "AC",     Alternate = "CG",  Expected = AlleleCategory.Mnv },
        new { Reference = "AAA",    Alternate = "A",   Expected = AlleleCategory.Deletion },
        new { Reference = "A",      Alternate = "ACG", Expected = AlleleCategory.Insertion },
        new { Reference = "AFGGGG", Alternate = "ACG", Expected = AlleleCategory.Unsupported },
    };

    foreach (var expectation in expectations)
    {
        var allele = new CalledAllele()
        {
            ReferenceAllele = expectation.Reference,
            AlternateAllele = expectation.Alternate
        };

        allele.SetType();

        Assert.Equal(expectation.Expected, allele.Type);
    }
}
/// <summary>
/// Pisces "AlleleCategory" handles simple small variants, but not complex variants such as ACGT -> ACGTGG,
/// which is neither a SNP, indel or MNV, but a combination of the above.
/// However (as in the case of the example ACGT -> ACGTGG) bases shared by the reference and alternate
/// can be trimmed off the back and then the front of the allele, which may reduce the input to a
/// simpler small variant, i.e. (ACGT -> ACGTGG) -> (T -> TGG).
/// This will not always work (e.g. CCGT -> ACGTGG cannot be simplified), but it is worth trying.
/// </summary>
/// <param name="allele">
/// Allele to simplify in place. Its <c>ReferenceAllele</c>, <c>AlternateAllele</c> and
/// <c>ReferencePosition</c> are rewritten, and <c>SetType()</c> is called again afterwards.
/// </param>
public static void TrimUnsupportedAlleleType(CalledAllele allele)
{
    // Number of agreeing bases that must survive trimming on each allele (the prepended
    // reference base used for indels).
    const int anchorBases = 1;

    var refBases = allele.ReferenceAllele;
    var altBases = allele.AlternateAllele;

    // Pass 1: drop the shared suffix, never shrinking either allele below the anchor length.
    var sharedSuffix = GetNumTrailingAgreement(refBases, altBases);
    var suffixTrim = Math.Min(sharedSuffix, Math.Min(refBases.Length, altBases.Length) - anchorBases);
    if (suffixTrim < 0)
    {
        suffixTrim = 0;
    }

    refBases = refBases.Substring(0, refBases.Length - suffixTrim);
    altBases = altBases.Substring(0, altBases.Length - suffixTrim);

    // Pass 2: drop the shared prefix, except for the anchor base that has to be kept.
    var sharedPrefix = GetNumPrecedingAgreement(refBases, altBases);
    var prefixTrim = Math.Min(sharedPrefix - anchorBases, Math.Min(refBases.Length, altBases.Length) - anchorBases);
    if (prefixTrim < 0)
    {
        prefixTrim = 0;
    }

    refBases = refBases.Substring(prefixTrim);
    altBases = altBases.Substring(prefixTrim);

    // Write the trimmed alleles back; the position moves right by the number of leading bases removed.
    allele.ReferenceAllele = refBases;
    allele.AlternateAllele = altBases;
    allele.ReferencePosition += prefixTrim;

    allele.SetType();
}
/// <summary>
/// Populates a called allele object given an array of vcf columns.
/// Only the first sample's genotype data is consulted (single sample is presumed).
/// </summary>
/// <param name="shouldTrimComplexAlleles">
/// When true, an allele classified as <c>AlleleCategory.Unsupported</c> is passed to
/// <c>VcfVariantUtilities.TrimUnsupportedAlleleType</c> in an attempt to simplify it.
/// </param>
/// <param name="cols">The vcf line split into columns. When null or empty the method returns immediately.</param>
/// <param name="allele">
/// The allele to fill in, in place. It is left untouched when <paramref name="cols"/> is null or empty
/// (the parameter is passed by value, so it cannot be nulled out for the caller).
/// </param>
/// <param name="alleleIndex">Zero-based index of the alternate allele to read from the comma-separated ALT column.</param>
protected static void ConvertColumnsToVariant(bool shouldTrimComplexAlleles, string[] cols, CalledAllele allele, int alleleIndex)
{
    bool shouldOutputRcCounts = true;
    bool shouldOutputTsCounts = true;

    if ((cols == null) || (cols.Length == 0))
    {
        return;
    }

    // VCF is a machine-readable format: numbers always use '.' as the decimal separator,
    // so parsing must not depend on the culture of the machine running the code.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    //set defaults.
    var genotypeQscore = 0;
    var referenceSupport = 0;
    var altSupport = 0;
    var genotypeString = "";
    var totalCoverage = 0;
    var variantQuality = 0.0;
    var noiseLevel = 0;
    var fractionNocalls = 0f;
    var strandBiasInGATKScaleCoords = -100f;
    var tsCounts = new List<string>();

    //read in simple data
    allele.Chromosome = cols[VcfCommon.ChromIndex];
    allele.ReferencePosition = int.Parse(cols[VcfCommon.PosIndex], invariant);
    allele.ReferenceAllele = cols[VcfCommon.RefIndex];
    allele.Filters = VcfVariantUtilities.MapFilterString(cols[VcfCommon.FilterIndex]);

    // CFTR uses a ".", which is not actually legal... (actually, vcf 4.1 does allow the missing value "." here. Strelka uses it)
    // The number styles below are the ones double.TryParse(string, out double) applies by default.
    bool gotQual = double.TryParse(
        cols[VcfCommon.QualIndex],
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
        invariant,
        out variantQuality);

    if (gotQual)
    {
        allele.VariantQscore = (int)variantQuality;
    }

    // parse the variant alleles
    var variantAlleles = cols[VcfCommon.AltIndex].Split(',');
    allele.AlternateAllele = variantAlleles[alleleIndex];

    var isRef = (allele.AlternateAllele == ".");
    var numAlts = isRef ? 0 : variantAlleles.Length;

    // parse the info field data (presume, single sample)
    Dictionary<string, string> infoFields = ParseInfoFields(cols);

    // parse the genotype data (presume, single sample)
    List<Dictionary<string, string>> genotypes = ParseGenotypeData(cols);

    //get more complex allele data...
    string fieldValue;

    if (infoFields.TryGetValue("DP", out fieldValue))
    {
        totalCoverage = int.Parse(fieldValue, invariant);
    }

    if ((genotypes.Count > 0) && (genotypes[0] != null))
    {
        var sample = genotypes[0];

        // GQ takes precedence; GQX is only consulted when GQ is absent.
        if (sample.TryGetValue("GQ", out fieldValue) || sample.TryGetValue("GQX", out fieldValue))
        {
            genotypeQscore = int.Parse(fieldValue, invariant);
        }

        if (sample.TryGetValue("GT", out fieldValue))
        {
            genotypeString = fieldValue;
        }

        if (sample.TryGetValue("NL", out fieldValue))
        {
            noiseLevel = int.Parse(fieldValue, invariant);
        }

        if (sample.TryGetValue("NC", out fieldValue))
        {
            fractionNocalls = float.Parse(fieldValue, invariant);
        }

        if (sample.TryGetValue("SB", out fieldValue))
        {
            strandBiasInGATKScaleCoords = float.Parse(fieldValue, invariant);
        }

        var adStrings = new string[] { "0", "0" };
        var hasAD = sample.TryGetValue("AD", out fieldValue);
        if (hasAD)
        {
            adStrings = fieldValue.Split(',');
        }

        referenceSupport = int.Parse(adStrings[0], invariant);

        // By default alt support is 0.
        // When AD carries more entries than there are alts, its first entry is the reference depth
        // and the depth of this particular alt sits at alleleIndex + 1 (not always at index 1).
        var altSupportIndex = (hasAD && (adStrings.Length > numAlts)) ? alleleIndex + 1 : 1;
        if ((!isRef) && (adStrings.Length > altSupportIndex))
        {
            altSupport = int.Parse(adStrings[altSupportIndex], invariant);
        }

        if (shouldOutputRcCounts && sample.TryGetValue("US", out fieldValue))
        {
            tsCounts = fieldValue.Split(',').ToList();
        }

        allele.Genotype = VcfVariantUtilities.MapGTString(genotypeString, numAlts);

        //note this awkward vcf line (pisces)
        //"chr4\t10\t.\tAA\tGA,G\t0\tPASS\tDP=5394\tGT:GQ:AD:DP:VF:NL:SB:NC\t1/2:0:2387,2000:5394:0.8133:23:0.0000:0.0000";
        //and this one
        //chr2 19946216.ATGTGTG ATG,ATGTG,A 0 PASS metal = platinum; cgi =.; bwa_freebayes = HD:0,LOOHD: 0; bwa_platypus =.; bwa_gatk3 = HD:2,LOOHD: 2; cortex =.; isaac2 = HD:1,LOOHD: 1; dist2closest = 192 GT 1 | 2
        if ((numAlts >= 2) && hasAD)
        {
            //in this case we never expressly gave the ref support, so we have to derive it:
            //every AD entry is an alt depth, and the ref depth is whatever coverage is left over.
            if (adStrings.Length <= numAlts)
            {
                int totalAltCount = 0;

                for (int altIndex = 0; altIndex < numAlts; altIndex++)
                {
                    var altSupportAtIndex = int.Parse(adStrings[altIndex], invariant);
                    totalAltCount += altSupportAtIndex;

                    if (altIndex == alleleIndex)
                    {
                        altSupport = altSupportAtIndex;
                    }
                }

                referenceSupport = Math.Max(0, totalCoverage - totalAltCount);
            }
        }
    }

    var strandBiasResults = new BiasResults();
    strandBiasResults.GATKBiasScore = strandBiasInGATKScaleCoords;

    //set the remaining data
    allele.TotalCoverage = totalCoverage;
    allele.AlleleSupport = isRef ? referenceSupport : altSupport;
    allele.ReferenceSupport = referenceSupport;
    allele.GenotypeQscore = genotypeQscore;
    allele.NoiseLevelApplied = noiseLevel;
    allele.StrandBiasResults = strandBiasResults;
    allele.IsForcedToReport = allele.Filters.Contains(FilterType.ForcedReport);

    //set the derived values
    allele.SetType();
    allele.ForceFractionNoCalls(fractionNocalls);

    //rescue attempt for complex types, ie ACGT -> ACGTGG.
    //Get the simplest form of the allele
    if ((allele.Type == AlleleCategory.Unsupported) && shouldTrimComplexAlleles)
    {
        VcfVariantUtilities.TrimUnsupportedAlleleType(allele);
    }

    if (tsCounts.Count != 0)
    {
        VcfVariantUtilities.FillInCollapsedReadsCount(shouldOutputRcCounts, shouldOutputTsCounts, allele, tsCounts);
    }
}
/// <summary>
/// Converts a single unpacked (at most one ALT) <c>VcfVariant</c> into a <c>CalledAllele</c>.
/// Only the first sample's genotype data is consulted.
/// </summary>
/// <param name="v">The variant to convert. May be null.</param>
/// <param name="shouldOutputRcCounts">
/// When true, the "US" genotype field (if present) is read and forwarded to <c>FillInCollapsedReadsCount</c>.
/// </param>
/// <param name="shouldOutputTsCounts">Forwarded unchanged to <c>FillInCollapsedReadsCount</c>.</param>
/// <param name="shouldTrimComplexAlleles">
/// When true, an allele classified as <c>AlleleCategory.Unsupported</c> is passed to
/// <c>TrimUnsupportedAlleleType</c> in an attempt to simplify it.
/// </param>
/// <returns>The converted allele, or null when <paramref name="v"/> is null.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="v"/> carries more than one variant allele.</exception>
public static CalledAllele ConvertUnpackedVariant(VcfVariant v, bool shouldOutputRcCounts = false, bool shouldOutputTsCounts = false, bool shouldTrimComplexAlleles = true)
{
    if (v == null)
    {
        return null;
    }

    if (v.VariantAlleles.Count() > 1)
    {
        throw new ArgumentException("This method does not handle crushed vcf format. Use Convert(IEnumerable<VcfVariant> vcfVariants)");
    }

    // VCF numbers always use '.' as the decimal separator, so parsing must not depend on the
    // culture of the machine running the code.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var genotypeQscore = 0;
    var referenceSupport = 0;
    var altSupport = 0;
    var genotypeString = "";
    var totalCoverage = 0;
    var isRef = ((v.VariantAlleles.Count() == 1) && v.VariantAlleles[0] == ".");
    var variantQuality = v.Quality;
    var numAlts = 1;
    // NOTE(review): ConvertColumnsToVariant defaults the noise level to 0 rather than 1 - confirm which is intended.
    var noiseLevel = 1;
    var fractionNocalls = 0f;
    var strandBiasInGATKScaleCoords = -100f;
    var tsCounts = new List<string>();

    if (v.InfoFields.ContainsKey("DP"))
    {
        totalCoverage = int.Parse(v.InfoFields["DP"], invariant);
    }

    if (v.Genotypes.Count > 0)
    {
        var sample = v.Genotypes[0];

        // GQ takes precedence; GQX is only consulted when GQ is absent.
        if (sample.ContainsKey("GQ"))
        {
            genotypeQscore = int.Parse(sample["GQ"], invariant);
        }
        else if (sample.ContainsKey("GQX"))
        {
            genotypeQscore = int.Parse(sample["GQX"], invariant);
        }

        // A record without GT keeps the empty genotype string instead of throwing KeyNotFoundException,
        // matching what ConvertColumnsToVariant does.
        if (sample.ContainsKey("GT"))
        {
            genotypeString = sample["GT"];
        }

        if (sample.ContainsKey("NL"))
        {
            noiseLevel = int.Parse(sample["NL"], invariant);
        }

        if (sample.ContainsKey("NC"))
        {
            fractionNocalls = float.Parse(sample["NC"], invariant);
        }

        if (sample.ContainsKey("SB"))
        {
            strandBiasInGATKScaleCoords = float.Parse(sample["SB"], invariant);
        }

        var adStrings = new string[] { "0", "0" };
        if (sample.ContainsKey("AD"))
        {
            adStrings = sample["AD"].Split(',');
        }

        referenceSupport = int.Parse(adStrings[0], invariant);

        // A single-valued AD carries no alt depth; leave alt support at 0 instead of indexing past the end.
        altSupport = ((!isRef) && (adStrings.Length > 1)) ? int.Parse(adStrings[1], invariant) : 0;

        if (shouldOutputRcCounts && sample.ContainsKey("US"))
        {
            tsCounts = sample["US"].Split(',').ToList();
        }

        //note: this method should never get a value here >1; these should be UNPACKED variants.
        numAlts = isRef ? 0 : 1;
    }

    var strandBiasResults = new BiasResults();
    strandBiasResults.GATKBiasScore = strandBiasInGATKScaleCoords;

    var filters = MapFilterString(v.Filters);

    var allele = new CalledAllele()
    {
        Chromosome = v.ReferenceName,
        ReferencePosition = v.ReferencePosition,
        ReferenceAllele = v.ReferenceAllele,
        AlternateAllele = v.VariantAlleles[0],
        TotalCoverage = totalCoverage,
        AlleleSupport = isRef ? referenceSupport : altSupport,
        ReferenceSupport = referenceSupport,
        VariantQscore = (int)variantQuality,
        GenotypeQscore = genotypeQscore,
        Genotype = MapGTString(genotypeString, numAlts),
        Filters = filters,
        NoiseLevelApplied = noiseLevel,
        StrandBiasResults = strandBiasResults,
        IsForcedToReport = filters.Contains(FilterType.ForcedReport)
    };

    allele.SetType();
    allele.ForceFractionNoCalls(fractionNocalls);

    //rescue attempt for complex types, ie ACGT -> ACGTGG
    if ((allele.Type == AlleleCategory.Unsupported) && shouldTrimComplexAlleles)
    {
        TrimUnsupportedAlleleType(allele);
    }

    FillInCollapsedReadsCount(shouldOutputRcCounts, shouldOutputTsCounts, allele, tsCounts);

    return allele;
}