/// <summary>
/// Reads every variant from <paramref name="reader"/>, converts each one into a candidate allele,
/// and groups the candidates that survive the filters by their chromosome.
/// </summary>
/// <param name="reader">The VCF reader whose variants are converted.</param>
/// <param name="variantsOnly">When true, candidates of type <c>AlleleCategory.Reference</c> are dropped.</param>
/// <param name="flagIsKnown">When true, every kept candidate has <c>IsKnown</c> set to true.</param>
/// <param name="typeFilter">Optional list of accepted types; a candidate whose type is not in the list is dropped.</param>
/// <param name="doSkipCandidate">Optional predicate; a candidate for which it returns true is dropped.</param>
/// <returns>
/// A dictionary keyed by <c>candidate.Chromosome</c>; each value holds the kept candidates
/// for that chromosome in the order they were encountered.
/// </returns>
public static Dictionary<string, List<CandidateAllele>> GetVariantsByChromosome(this VcfReader reader, bool variantsOnly = false, bool flagIsKnown = false, List<AlleleCategory> typeFilter = null, Func<CandidateAllele, bool> doSkipCandidate = null)
{
    var candidatesByChromosome = new Dictionary<string, List<CandidateAllele>>();

    foreach (var called in VcfVariantUtilities.Convert(reader.GetVariants()))
    {
        var candidate = BackToCandiate(called);

        // The checks short-circuit in a fixed order: unsupported type, reference-only exclusion,
        // type whitelist, and finally the caller's predicate (which therefore only ever sees
        // candidates that passed the cheaper checks).
        bool isRejected =
            candidate.Type == AlleleCategory.Unsupported
            || (variantsOnly && candidate.Type == AlleleCategory.Reference)
            || (typeFilter != null && !typeFilter.Contains(candidate.Type))
            || (doSkipCandidate != null && doSkipCandidate(candidate));

        if (isRejected)
        {
            continue;
        }

        if (flagIsKnown)
        {
            candidate.IsKnown = true;
        }

        // Fetch the bucket for this chromosome, creating it on first use.
        var chromosome = candidate.Chromosome;
        List<CandidateAllele> bucket;
        if (!candidatesByChromosome.TryGetValue(chromosome, out bucket))
        {
            bucket = new List<CandidateAllele>();
            candidatesByChromosome[chromosome] = bucket;
        }

        bucket.Add(candidate);
    }

    return candidatesByChromosome;
}
/// <summary>
/// Populates a called allele object given an array of vcf columns.
/// Only the first sample's genotype data is consulted (single-sample vcf is presumed).
/// </summary>
/// <param name="shouldTrimComplexAlleles">
/// When true, an allele whose type comes out as <c>AlleleCategory.Unsupported</c> is passed to
/// <c>VcfVariantUtilities.TrimUnsupportedAlleleType</c> as a rescue attempt.
/// </param>
/// <param name="cols">
/// The columns of one vcf record. When null or empty the method returns immediately and
/// <paramref name="allele"/> is left exactly as the caller passed it.
/// </param>
/// <param name="allele">The allele object to fill in. It is mutated in place.</param>
/// <param name="alleleIndex">Zero-based index into the comma-separated ALT column selecting which alternate allele to report.</param>
protected static void ConvertColumnsToVariant(bool shouldTrimComplexAlleles, string[] cols, CalledAllele allele, int alleleIndex)
{
    bool shouldOutputRcCounts = true;
    bool shouldOutputTsCounts = true;

    if ((cols == null) || (cols.Length == 0))
    {
        // Nothing to parse. 'allele' is passed by value, so assigning null to it here
        // (as the code used to do) could never reach the caller; just leave it untouched.
        return;
    }

    // vcf is a machine-readable format: numbers always use '.' as the decimal separator,
    // so parsing must not depend on the current thread culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    //set defaults.
    var genotypeQscore = 0;
    var referenceSupport = 0;
    var altSupport = 0;
    var genotypeString = "";
    var totalCoverage = 0;
    var variantQuality = 0.0;
    var noiseLevel = 0;
    var fractionNocalls = 0f;
    var strandBiasInGATKScaleCoords = -100f;
    var tsCounts = new List<string>();
    string rawValue;

    //read in simple data
    allele.Chromosome = cols[VcfCommon.ChromIndex];
    allele.ReferencePosition = int.Parse(cols[VcfCommon.PosIndex], invariant);
    allele.ReferenceAllele = cols[VcfCommon.RefIndex];
    allele.Filters = VcfVariantUtilities.MapFilterString(cols[VcfCommon.FilterIndex]);

    // QUAL may legitimately be the missing value "." (vcf 4.1 allows it; Strelka uses it),
    // in which case the allele's VariantQscore is simply not set here.
    bool gotQual = double.TryParse(
        cols[VcfCommon.QualIndex],
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
        invariant,
        out variantQuality);
    if (gotQual)
    {
        allele.VariantQscore = (int)variantQuality;
    }

    // parse the variant alleles
    var variantAlleles = cols[VcfCommon.AltIndex].Split(',');
    allele.AlternateAllele = variantAlleles[alleleIndex];
    var isRef = (allele.AlternateAllele == ".");
    var numAlts = isRef ? 0 : variantAlleles.Length;

    // parse the info field data (presume, single sample)
    Dictionary<string, string> infoFields = ParseInfoFields(cols);

    // parse the genotype data (presume, single sample)
    List<Dictionary<string, string>> genotypes = ParseGenotypeData(cols);

    //get more complex allele data...
    if (infoFields.TryGetValue("DP", out rawValue))
    {
        totalCoverage = int.Parse(rawValue, invariant);
    }

    var sample = (genotypes.Count > 0) ? genotypes[0] : null;
    if (sample != null)
    {
        // GQ takes precedence; GQX is only consulted when GQ is absent.
        if (sample.TryGetValue("GQ", out rawValue) || sample.TryGetValue("GQX", out rawValue))
        {
            genotypeQscore = int.Parse(rawValue, invariant);
        }

        if (sample.TryGetValue("GT", out rawValue))
        {
            genotypeString = rawValue;
        }

        if (sample.TryGetValue("NL", out rawValue))
        {
            noiseLevel = int.Parse(rawValue, invariant);
        }

        if (sample.TryGetValue("NC", out rawValue))
        {
            fractionNocalls = float.Parse(rawValue, invariant);
        }

        if (sample.TryGetValue("SB", out rawValue))
        {
            strandBiasInGATKScaleCoords = float.Parse(rawValue, invariant);
        }

        // Without an AD entry, both reference and alt support default to 0.
        var hasAlleleDepths = sample.TryGetValue("AD", out rawValue);
        var adStrings = hasAlleleDepths ? rawValue.Split(',') : new[] { "0", "0" };

        referenceSupport = int.Parse(adStrings[0], invariant);

        //by default alt support is 0.
        if ((!isRef) && (adStrings.Length > 1))
        {
            altSupport = int.Parse(adStrings[1], invariant);
        }

        if (shouldOutputRcCounts && sample.TryGetValue("US", out rawValue))
        {
            tsCounts = rawValue.Split(',').ToList();
        }

        allele.Genotype = VcfVariantUtilities.MapGTString(genotypeString, numAlts);

        //note this awkward vcf line (pisces), where AD lists the two alt depths and no ref depth:
        //"chr4\t10\t.\tAA\tGA,G\t0\tPASS\tDP=5394\tGT:GQ:AD:DP:VF:NL:SB:NC\t1/2:0:2387,2000:5394:0.8133:23:0.0000:0.0000";
        //and this one, which has three alts and no AD at all:
        //chr2 19946216 . ATGTGTG ATG,ATGTG,A 0 PASS metal=platinum;cgi=.;bwa_freebayes=HD:0,LOOHD:0;bwa_platypus=.;bwa_gatk3=HD:2,LOOHD:2;cortex=.;isaac2=HD:1,LOOHD:1;dist2closest=192 GT 1|2
        if ((numAlts >= 2) && hasAlleleDepths)
        {
            if (adStrings.Length <= numAlts)
            {
                //in this case we never expressly gave the ref support, so we have to derive it.
                //AD entry i is the depth of alt allele i.
                int totalAltCount = 0;

                // Bound by the entries actually present: a truncated AD must not index past the array.
                int entryCount = Math.Min(numAlts, adStrings.Length);
                for (int altIndex = 0; altIndex < entryCount; altIndex++)
                {
                    var altSupportAtIndex = int.Parse(adStrings[altIndex], invariant);
                    totalAltCount += altSupportAtIndex;

                    if (altIndex == alleleIndex)
                    {
                        altSupport = altSupportAtIndex;
                    }
                }

                referenceSupport = Math.Max(0, totalCoverage - totalAltCount);
            }
            else
            {
                // AD has more entries than there are alts, i.e. the ref-first layout
                // (ref, alt1, alt2, ...). The depth for the requested alt therefore sits one
                // slot after its ALT index, not always in slot 1.
                // alleleIndex < numAlts < adStrings.Length here, so the index is in range.
                altSupport = int.Parse(adStrings[alleleIndex + 1], invariant);
            }
        }
    }

    var strandBiasResults = new BiasResults();
    strandBiasResults.GATKBiasScore = strandBiasInGATKScaleCoords;

    //set the remaining data
    allele.TotalCoverage = totalCoverage;
    allele.AlleleSupport = isRef ? referenceSupport : altSupport;
    allele.ReferenceSupport = referenceSupport;
    allele.GenotypeQscore = genotypeQscore;
    allele.NoiseLevelApplied = noiseLevel;
    allele.StrandBiasResults = strandBiasResults;
    allele.IsForcedToReport = allele.Filters.Contains(FilterType.ForcedReport);

    //set the derived values
    allele.SetType();
    allele.ForceFractionNoCalls(fractionNocalls);

    //rescue attempt for complex types, ie ACGT -> ACGTGG.
    //Get the simplest form of the allele
    if ((allele.Type == AlleleCategory.Unsupported) && shouldTrimComplexAlleles)
    {
        VcfVariantUtilities.TrimUnsupportedAlleleType(allele);
    }

    if (tsCounts.Count != 0)
    {
        VcfVariantUtilities.FillInCollapsedReadsCount(shouldOutputRcCounts, shouldOutputTsCounts, allele, tsCounts);
    }
}