/// <summary>
/// Lazily streams the remaining records of the VCF file, one freshly allocated variant per line.
/// Typical usage: foreach (VcfVariant variant in reader.GetVariants())
/// </summary>
/// <returns>
/// A deferred sequence of variants; the sequence is empty when the file is not open.
/// </returns>
/// <exception cref="InvalidDataException">
/// Raised during enumeration when genotypes are required but a record carries none.
/// </exception>
public IEnumerable<VcfVariant> GetVariants()
{
    // nothing to enumerate unless the file is open
    if (!IsOpen)
    {
        yield break;
    }

    string record;
    while ((record = Reader.ReadLine()) != null)
    {
        // every record is parsed into its own variant object
        VcfVariant parsed = new VcfVariant();
        ConvertColumnsToVariant(record.Split('\t'), parsed);

        if (RequireGenotypes)
        {
            bool hasGenotypes = parsed.Genotypes != null && parsed.Genotypes.Count != 0;
            if (!hasGenotypes)
            {
                throw new InvalidDataException("Missing genotype columns in VCF file");
            }
        }

        yield return parsed;
    }
}
/// <summary>
/// Reads the next VCF line into the supplied variant object.
/// </summary>
/// <param name="variant">Caller-owned variant that is overwritten with the parsed line.</param>
/// <returns>
/// True when a line was read and parsed; false when the file is not open or no lines remain.
/// </returns>
/// <exception cref="InvalidDataException">
/// Thrown when genotypes are required but the parsed variant has none.
/// </exception>
public bool GetNextVariant(VcfVariant variant)
{
    // sanity check: make sure the file is open
    if (!IsOpen)
    {
        return false;
    }

    // grab the next vcf line
    string line = Reader.ReadLine();
    if (line == null)
    {
        return false;
    }

    // split the columns and convert them to a variant
    string[] cols = line.Split('\t');
    ConvertColumnsToVariant(cols, variant);

    // ConvertColumnsToVariant only assigns Genotypes when genotype columns are present,
    // so the list may still be null here; treat null the same as empty (as GetVariants does)
    // instead of failing with a NullReferenceException.
    if (RequireGenotypes && (variant.Genotypes == null || variant.Genotypes.Count == 0))
    {
        throw new InvalidDataException("Missing genotype columns in VCF file");
    }

    return true;
}
/// <summary>
/// Determines the variant type of one haplotype of a variant. Haplotype 0 is the reference
/// allele and haplotype -1 is a missing call; any other value selects the alternate allele at
/// position (haplotype - 1), which is then classified against the reference allele by the
/// (reference, alternate) overload. The classification rules are:
///  - If ref==alt, type is reference.
///  - Otherwise, trim off any common prefix and any common suffix. Let |ref| denote the length of
///    the reference allele after trimming, and |alt| the length of the alt allele after trimming.
///  - If |ref|=0, it's an insertion
///  - If |alt|=0, it's a deletion
///  - If |ref|=|alt|=1, it's a SNV
///  - If |ref| = |alt| > 1, it's a MNP
///  - If |ref|>0 and |alt|>0 and |ref| != |alt|, it's a complex event
/// </summary>
/// <param name="variant">Variant supplying the reference allele and the alternate alleles.</param>
/// <param name="haplotype">Allele number from the genotype call: 0, -1, or a 1-based alt index.</param>
/// <returns>The variant type of the selected allele.</returns>
private static VariantType GetAlleleVariantType(VcfVariant variant, int haplotype)
{
    switch (haplotype)
    {
        case 0:
            return VariantType.Reference;

        case -1:
            return VariantType.Missing;

        default:
            // allele numbers are 1-based into the alternate allele list
            string alternate = variant.VariantAlleles[haplotype - 1];
            return GetAlleleVariantType(variant.ReferenceAllele, alternate);
    }
}
/// <summary>
/// Populates a VCF variant object from the tab-separated columns of a single VCF data line.
/// </summary>
/// <param name="cols">The columns of one VCF line, in file order.</param>
/// <param name="variant">
/// The variant to fill in. The object may be reused across lines, so every per-line field
/// written here is overwritten rather than assumed to hold its default.
/// </param>
/// <exception cref="InvalidDataException">
/// Thrown when genotype columns are present but there are fewer of them than samples.
/// </exception>
protected void ConvertColumnsToVariant(string[] cols, VcfVariant variant)
{
    // VCF is a machine-readable format: always parse numbers culture-invariantly so that
    // "30.5" is not misread on machines whose culture uses ',' as the decimal separator.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    variant.ReferenceName = cols[VcfCommon.ChromIndex];
    variant.ReferencePosition = int.Parse(cols[VcfCommon.PosIndex], invariant);
    variant.Identifier = cols[VcfCommon.IDIndex];
    variant.ReferenceAllele = cols[VcfCommon.RefIndex];
    variant.Filters = cols[VcfCommon.FilterIndex];

    // "." is the VCF missing value for QUAL (e.g. Strelka emits it). Assign the flag in both
    // directions so a reused variant does not keep a stale "false" from an earlier line.
    string qualityText = cols[VcfCommon.QualIndex];
    variant.HasQuality = qualityText != ".";
    double.TryParse(
        qualityText,
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
        invariant,
        out variant.Quality);

    // parse the variant alleles
    variant.VariantAlleles = cols[VcfCommon.AltIndex].Split(',');

    // parse the info fields
    variant.InfoFields = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
    string infoData = cols[VcfCommon.InfoIndex];
    if (infoData == ".")
    {
        // Special case: a "." in the INFO field should be treated like an empty string.
        infoData = "";
    }

    string[] infoCols = infoData.Split(InfoSplitChars, StringSplitOptions.RemoveEmptyEntries);
    int numInfoCols = infoCols.Length;
    if ((variant.InfoTagOrder == null) || (numInfoCols != variant.InfoTagOrder.Length))
    {
        variant.InfoTagOrder = new string[numInfoCols];
    }

    // Split on the first '=' only, so values that themselves contain '=' are kept intact.
    char[] keyValueSeparator = { '=' };
    for (int infoColIndex = 0; infoColIndex < numInfoCols; infoColIndex++)
    {
        string[] infoFieldKvp = infoCols[infoColIndex].Split(keyValueSeparator, 2);
        variant.InfoTagOrder[infoColIndex] = infoFieldKvp[0];

        // flag-style entries (no '=') are stored with a null value
        variant.InfoFields[infoFieldKvp[0]] = (infoFieldKvp.Length == 1 ? null : infoFieldKvp[1]);
    }

    if (cols.Length > VcfCommon.GenotypeIndex) // Genotype columns present
    {
        // parse the genotype format field; the split is cached while the FORMAT string repeats
        if (cols[VcfCommon.FormatIndex] != GenotypeTagString)
        {
            GenotypeTagString = cols[VcfCommon.FormatIndex];
            GenotypeTagOrder = GenotypeTagString.Split(':');
        }
        variant.GenotypeTagOrder = GenotypeTagOrder;

        // a truncated row would otherwise surface as an opaque IndexOutOfRangeException
        int sampleCount = this.Samples.Count;
        int genotypeColumnCount = cols.Length - VcfCommon.GenotypeIndex;
        if (genotypeColumnCount < sampleCount)
        {
            throw new InvalidDataException(
                "VCF line has " + genotypeColumnCount + " genotype column(s) but " +
                sampleCount + " sample(s) are expected");
        }

        // parse the genotype data for each sample
        variant.Genotypes = new List<Dictionary<string, string>>(sampleCount);
        for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
        {
            string genotypeColumn = cols[VcfCommon.GenotypeIndex + sampleIndex];
            if (genotypeColumn == ".")
            {
                // a lone "." means the sample has no genotype data at all
                variant.Genotypes.Add(null);
            }
            else
            {
                string[] genotypeCols = genotypeColumn.Split(':');
                variant.Genotypes.Add(ParseGenotype(variant.GenotypeTagOrder, genotypeCols));
            }
        }

        // specify the variant type:
        AssignVariantType(variant);
    }
}
/// <summary>
/// Sets VarType1, VarType2 and the combined VarType of a variant from the GT entry of its
/// first sample. The first two allele numbers of the genotype are used; '/' and '|' are both
/// accepted as separators and allele numbers may have more than one digit.
/// </summary>
/// <param name="variant">
/// Variant whose Genotypes list has already been populated; its type fields are overwritten.
/// </param>
/// <remarks>
/// When no allele number can be read at all (no first sample, no GT entry, ".", "./.", ".|.",
/// an empty string) all three types are set to Missing. A half-missing call such as "./1" or
/// "0/." is treated as homozygous for the allele that is present.
/// </remarks>
private static void AssignVariantType(VcfVariant variant)
{
    string genotype = null;
    if (variant.Genotypes.Count > 0 && variant.Genotypes[0] != null)
    {
        // leaves genotype null when the sample has no GT entry
        variant.Genotypes[0].TryGetValue("GT", out genotype);
    }

    // Handle usual cases like 0/0, 0/1, 1|0, 1/1, 10/11 as well as
    // special cases like ., ./., ./1, 1/. and haploid calls such as "1":
    int haplotypeA = -1;
    int haplotypeB = -1;
    if (!string.IsNullOrEmpty(genotype))
    {
        string[] alleleTokens = genotype.Split('/', '|');
        haplotypeA = ParseAlleleIndex(alleleTokens[0]);
        if (alleleTokens.Length > 1)
        {
            haplotypeB = ParseAlleleIndex(alleleTokens[1]);
        }
    }

    // sanity check: support missing genotypes
    if (haplotypeA == -1 && haplotypeB == -1)
    {
        variant.VarType1 = VariantType.Missing;
        variant.VarType2 = VariantType.Missing;
        variant.VarType = VariantType.Missing;
        return;
    }

    // Treat things like ./1 or 0/. as homozygous:
    if (haplotypeA == -1)
    {
        haplotypeA = haplotypeB;
    }
    if (haplotypeB == -1)
    {
        haplotypeB = haplotypeA;
    }

    variant.VarType1 = GetAlleleVariantType(variant, haplotypeA);
    variant.VarType2 = GetAlleleVariantType(variant, haplotypeB);
    variant.VarType = CombineAlleleVariantTypes(variant.VarType1, variant.VarType2);
}

/// <summary>
/// Parses one allele number of a GT call; returns -1 for "." or anything that is not a
/// plain, non-negative decimal integer.
/// </summary>
private static int ParseAlleleIndex(string alleleText)
{
    int alleleIndex;
    bool parsed = int.TryParse(
        alleleText,
        System.Globalization.NumberStyles.None, // digits only: no sign, no whitespace
        System.Globalization.CultureInfo.InvariantCulture,
        out alleleIndex);
    return parsed ? alleleIndex : -1;
}

/// <summary>
/// Combines the variant types of the two alleles of a call into a single variant type.
/// </summary>
private static VariantType CombineAlleleVariantTypes(VariantType first, VariantType second)
{
    switch (first)
    {
        case VariantType.Reference:
            // the non-reference allele (if any) decides the type
            return second;

        case VariantType.SNV:
            switch (second)
            {
                case VariantType.Reference:
                case VariantType.SNV:
                    return VariantType.SNV;
                case VariantType.Insertion:
                    return VariantType.SNVInsertion;
                case VariantType.Deletion:
                    return VariantType.SNVDeletion;
                default:
                    return VariantType.Complex;
            }

        case VariantType.MNP:
            switch (second)
            {
                case VariantType.Reference:
                case VariantType.MNP:
                    return VariantType.MNP;
                default:
                    return VariantType.Complex;
            }

        case VariantType.Insertion:
            switch (second)
            {
                case VariantType.Reference:
                case VariantType.Insertion:
                    return VariantType.Insertion;
                case VariantType.SNV:
                    return VariantType.SNVInsertion;
                case VariantType.Deletion:
                    return VariantType.InsertionDeletion;
                default:
                    return VariantType.Complex;
            }

        case VariantType.Deletion:
            switch (second)
            {
                case VariantType.Reference:
                case VariantType.Deletion:
                    return VariantType.Deletion;
                case VariantType.SNV:
                    return VariantType.SNVDeletion;
                case VariantType.Insertion:
                    return VariantType.InsertionDeletion;
                default:
                    return VariantType.Complex;
            }

        default:
            return VariantType.Complex;
    }
}