/// <summary>
/// Works out the genotype type by scanning the alleles of this genotype.
/// </summary>
/// <remarks>
/// Intended to be called only once; it should never be called again after the type has been calculated.
/// </remarks>
/// <returns>
/// UNAVAILABLE when there are no alleles, NO_CALL when every allele is a no-call, MIXED when
/// no-calls and called alleles coexist, HET when at least two different called alleles are
/// present, otherwise HOM_REF or HOM_VAR depending on whether the single called allele is reference.
/// </returns>
protected internal virtual GenotypeType determineType()
{
    // TODO -- this code is slow and could be optimized for the diploid case
    IList<Allele> alleles = Alleles;
    if (alleles.Count == 0)
    {
        return GenotypeType.UNAVAILABLE;
    }

    bool hasNoCall = false;
    bool hasDistinctAlleles = false;
    Allele firstCalled = null;

    foreach (Allele current in alleles)
    {
        if (current.NoCall)
        {
            hasNoCall = true;
            continue;
        }

        if (firstCalled == null)
        {
            // Remember the first called allele; every later one is compared against it.
            firstCalled = current;
            continue;
        }

        if (!current.Equals(firstCalled))
        {
            hasDistinctAlleles = true;
        }
    }

    if (hasNoCall)
    {
        // Only no-calls => NO_CALL; no-calls alongside called alleles => MIXED.
        return firstCalled == null ? GenotypeType.NO_CALL : GenotypeType.MIXED;
    }

    if (firstCalled == null)
    {
        throw new Exception("BUG: there are no alleles present in this genotype but the alleles list is not null");
    }

    if (hasDistinctAlleles)
    {
        return GenotypeType.HET;
    }

    return firstCalled.Reference ? GenotypeType.HOM_REF : GenotypeType.HOM_VAR;
}
/// <summary>
/// Classifies one alternate allele by comparing it pairwise against the reference allele.
/// </summary>
/// <param name="reference"> the reference allele; a symbolic reference is rejected with an exception </param>
/// <param name="allele"> the alternate allele to classify </param>
/// <returns> SYMBOLIC, SNP, MNP or INDEL; this method never returns MIXED </returns>
private static VariantType typeOfBiallelicVariant(Allele reference, Allele allele)
{
    if (reference.Symbolic)
    {
        throw new Exception("Unexpected error: encountered a record with a symbolic reference allele");
    }

    if (allele.Symbolic)
    {
        return VariantType.SYMBOLIC;
    }

    if (reference.Length != allele.Length)
    {
        // Important note: an earlier version checked whether one allele is a prefix of the other.
        // That check is not appropriate, e.g. REF = CTTA and ALT = C,CT,CA should be typed INDEL
        // but was being marked MIXED because of the prefix check. MIXED is reserved for records with
        // several alternate alleles of different types, whereas this method only compares a single
        // alternate allele with the reference. Having ruled out symbolic alleles, and with the
        // lengths differing (so neither SNP nor MNP), the only remaining answer is INDEL.
        return VariantType.INDEL;
    }

    // Equal lengths: a single base is a SNP, anything longer is an MNP.
    return allele.Length == 1 ? VariantType.SNP : VariantType.MNP;
}
/// <summary>
/// Compute the end position for this VariantContext from the alleles themselves.
///
/// The returned end is inclusive: for a one-base reference allele, end == start.
/// In general the end is start + ref length - 1, with a reference length of 0 also yielding end == start.
/// However, if alleles contains a symbolic allele then endForSymbolicAlleles is returned in all cases.
/// </summary>
/// <param name="alleles"> the list of alleles to consider. The reference allele must be the first one </param>
/// <param name="start"> the known start position of this event </param>
/// <param name="endForSymbolicAlleles"> the end position to use if any of the alleles is symbolic. Can be -1
/// if no symbolic allele is expected, but an exception is thrown if one is found </param>
/// <returns> the computed end position </returns>
public static int ComputeEndFromAlleles(IList<Allele> alleles, int start, int endForSymbolicAlleles)
{
    Allele reference = alleles[0];
    if (reference.NonReference)
    {
        throw new Exception("computeEndFromAlleles requires first allele to be reference");
    }

    if (VariantContext.HasSymbolicAlleles(alleles))
    {
        // -1 is the "not provided" sentinel; a symbolic allele cannot be sized from its bases,
        // so the caller has to supply the end explicitly.
        if (endForSymbolicAlleles == -1)
        {
            throw new Exception("computeEndFromAlleles found a symbolic allele but endForSymbolicAlleles was not provided");
        }
        return endForSymbolicAlleles;
    }

    // Math.Max guards the zero-length reference case so the end never precedes the start.
    return start + Math.Max(reference.Length - 1, 0);
}
/// <summary>
/// Validates a raw allele string read from a VCF record.
/// </summary>
/// <remarks>
/// Failures are reported through generateException (defined elsewhere; presumably it throws -- confirm),
/// except for the size limit, which throws VCFParsingError directly.
/// </remarks>
/// <param name="allele"> the allele to check </param>
/// <param name="isRef"> true when the allele is the reference allele </param>
/// <param name="lineNo"> the line number for this record </param>
private static void checkAllele(string allele, bool isRef, int lineNo)
{
    bool isMissing = allele == null || allele.Length == 0;
    if (isMissing)
    {
        generateException("Empty alleles are not permitted in VCF records", lineNo);
    }

    // A limit of -1 switches the size check off.
    bool sizeLimitEnabled = MAX_ALLELE_SIZE_BEFORE_WARNING != -1;
    if (sizeLimitEnabled && allele.Length > MAX_ALLELE_SIZE_BEFORE_WARNING)
    {
        throw new VCFParsingError(string.Format("Allele detected with length {0:D} exceeding max size {1:D} at approximately line {2:D}, likely resulting in degraded VCF processing performance", allele.Length, MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo));
    }

    if (isSymbolicAllele(allele))
    {
        // Symbolic alleles are only rejected when they appear as the reference.
        if (isRef)
        {
            generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo);
        }
        return;
    }

    // check for VCF3 insertions or deletions
    char leadingChar = allele[0];
    if (leadingChar == VCFConstants.DELETION_ALLELE_v3 || leadingChar == VCFConstants.INSERTION_ALLELE_v3)
    {
        generateException("Insertions/Deletions are not supported when reading 3.x VCF's. Please" + " convert your file to VCF4 using VCFTools, available at http://vcftools.sourceforge.net/index.html", lineNo);
    }

    if (!Allele.AcceptableAlleleBases(allele))
    {
        generateException("Unparsable vcf record with allele " + allele, lineNo);
    }

    if (isRef && allele.Equals(VCFConstants.EMPTY_ALLELE))
    {
        generateException("The reference allele cannot be missing", lineNo);
    }
}
/// <summary>
/// Parse the reference and alternate allele strings of a record into Allele objects.
/// </summary>
/// <param name="reference"> the reference base(s) </param>
/// <param name="alts"> a comma-separated string of alternates to break into alleles </param>
/// <param name="lineNo"> the line number for this record </param>
/// <returns> the list of alleles, with the reference allele first </returns>
protected internal static IList<Allele> parseAlleles(string reference, string alts, int lineNo)
{
    IList<Allele> alleles = new List<Allele>(2); // we are almost always biallelic

    // ref
    checkAllele(reference, true, lineNo);
    Allele refAllele = Allele.Create(reference, true);
    alleles.Add(refAllele);

    // Ordinal char search: the delimiter is a literal ',' so a culture-sensitive
    // string comparison is neither needed nor wanted here.
    if (alts.IndexOf(',') < 0)
    {
        // only 1 alternative, don't call string split
        parseSingleAltAllele(alleles, alts, lineNo);
    }
    else
    {
        // Empty entries (e.g. from consecutive commas) are dropped by the split.
        foreach (string alt in alts.Split(VCFConstants.COMMA_AS_CHAR_ARRAY, StringSplitOptions.RemoveEmptyEntries))
        {
            parseSingleAltAllele(alleles, alt, lineNo);
        }
    }

    return alleles;
}
/// <summary>
/// Builds the text line for a single record: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO and,
/// when present, FORMAT plus the genotype data. Nothing is written here; the line is returned.
/// </summary>
/// <param name="vc">The Variant Context object </param>
/// <returns> the formatted record, terminated by "\n" </returns>
protected string getVariantLinetoWrite(VariantContext vc)
{
    if (doNotWriteGenotypes)
    {
        // Work on a copy without genotypes so no FORMAT/sample columns are produced.
        vc = (new VariantContextBuilder(vc)).noGenotypes().make();
    }

    try
    {
        //Convert alleles to 1,2,3,etc. numbering
        IDictionary<Allele, string> alleleMap = buildAlleleMap(vc);

        StringBuilder lineToWrite = new StringBuilder();

        // CHROM, POS, ID, REF
        lineToWrite.Append(String.Join(VCFConstants.FIELD_SEPARATOR, vc.Chr, vc.Start.ToString(), vc.ID, vc.Reference.DisplayString));
        // String.Join puts separators only BETWEEN its items, so the separator that ends the
        // REF column must be added explicitly; without it REF and ALT run together.
        lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);

        // ALT
        if (vc.Variant)
        {
            Allele altAllele = vc.GetAlternateAllele(0);
            string alt = altAllele.DisplayString;
            lineToWrite.Append(alt);

            for (int i = 1; i < vc.AlternateAlleles.Count; i++)
            {
                altAllele = vc.GetAlternateAllele(i);
                alt = altAllele.DisplayString;
                lineToWrite.Append(",");
                lineToWrite.Append(alt);
            }
        }
        else
        {
            lineToWrite.Append(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
        }
        lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);

        // QUAL
        if (!vc.HasLog10PError)
        {
            lineToWrite.Append(VCFConstants.MISSING_VALUE_v4);
        }
        else
        {
            lineToWrite.Append(formatQualValue(vc.PhredScaledQual));
        }
        lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);

        // FILTER
        string filters = getFilterString(vc);
        lineToWrite.Append(filters);
        lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);

        // INFO -- collected into a sorted dictionary, so keys are handed on in sorted order
        IDictionary<string, string> infoFields = new SortedDictionary<string, string>();
        foreach (KeyValuePair<string, object> field in vc.Attributes)
        {
            string key = field.Key;
            if (!mHeader.hasInfoLine(key))
            {
                fieldIsMissingFromHeaderError(vc, key, "INFO");
            }

            // Attributes whose formatted value is null are left out of the INFO column.
            string outputValue = formatVCFField(field.Value);
            if (outputValue != null)
            {
                infoFields[key] = outputValue;
            }
        }
        lineToWrite.Append(getInfoString(infoFields));

        // FORMAT
        GenotypesContext gc = vc.Genotypes;
        if (gc.LazyWithData && ((LazyGenotypesContext)gc).UnparsedGenotypeData is string)
        {
            // Genotypes were never parsed: pass the raw genotype text through unchanged.
            lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);
            lineToWrite.Append(((LazyGenotypesContext)gc).UnparsedGenotypeData.ToString());
        }
        else
        {
            IList<string> genotypeAttributeKeys = calcVCFGenotypeKeys(vc);
            if (genotypeAttributeKeys.Count > 0)
            {
                foreach (String format in genotypeAttributeKeys)
                {
                    if (!mHeader.hasFormatLine(format))
                    {
                        fieldIsMissingFromHeaderError(vc, format, "FORMAT");
                    }
                }

                string genotypeFormatString = String.Join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
                lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);
                lineToWrite.Append(genotypeFormatString);
                lineToWrite.Append(getGenotypeDataText(vc, alleleMap, genotypeAttributeKeys));
            }
        }

        lineToWrite.Append("\n");
        return lineToWrite.ToString();
    }
    catch (IOException e)
    {
        // Wrap with the offending record so the failure can be traced back to its input.
        throw new Exception("Unable to write the VCF object:\n " + vc.ToString() + "\n", e);
    }
}
/// <summary>
/// Counts the occurrences of the given allele among the alleles of this genotype.
/// </summary>
/// <param name="allele"> the allele to look for; each genotype allele is compared to it via Equals </param>
/// <returns> a value >= 0 indicating how many times the allele occurred in this sample's genotype </returns>
public virtual int CountAlleles(Allele allele)
{
    var matching = Alleles.Where(candidate => candidate.Equals(allele));
    return matching.Count();
}