/// <summary>
/// Update the attributes map so that its chromosome-count VCF tags (allele number, allele count and
/// allele frequency) reflect the genotypes of the given VariantContext.
/// </summary>
/// <param name="vc"> the VariantContext </param>
/// <param name="attributes"> the attributes map to populate; must not be null; may contain old values </param>
/// <param name="removeStaleValues"> when true and no chromosome is called, the existing allele count,
/// allele frequency and allele number entries are removed and nothing else is done </param>
/// <param name="founderIds"> set of founder ids handed to <c>GetCalledChrCount</c>; allele frequencies are
/// computed from the founder counts it returns. How an empty or null set is treated is decided by
/// <c>GetCalledChrCount</c> (the original documentation states that all samples are then counted). </param>
/// <returns> the attributes map provided as input, returned for programming convenience </returns>
public static IDictionary<string, object> CalculateChromosomeCounts (VariantContext vc, IDictionary<string, object> attributes, bool removeStaleValues, ISet<string> founderIds)
{
    int AN = vc.CalledChrCount;

    // if everyone is a no-call, remove the old attributes if requested
    if (AN == 0 && removeStaleValues)
    {
        // IDictionary.Remove simply returns false for an absent key, so no ContainsKey probe is needed
        attributes.Remove (VCFConstants.ALLELE_COUNT_KEY);
        attributes.Remove (VCFConstants.ALLELE_FREQUENCY_KEY);
        attributes.Remove (VCFConstants.ALLELE_NUMBER_KEY);
        return attributes;
    }

    if (!vc.HasGenotypes)
    {
        return attributes;
    }

    attributes [VCFConstants.ALLELE_NUMBER_KEY] = AN;

    if (vc.AlternateAlleles.Count == 0)
    {
        // if there's no alt AC and AF shouldn't be present
        attributes.Remove (VCFConstants.ALLELE_COUNT_KEY);
        attributes.Remove (VCFConstants.ALLELE_FREQUENCY_KEY);
        return attributes;
    }

    // if there are alternate alleles, record the relevant tags
    List<double> alleleFreqs = new List<double> ();
    List<int> alleleCounts = new List<int> ();
    double totalFoundersChromosomes = (double)vc.GetCalledChrCount (founderIds);

    foreach (Allele allele in vc.AlternateAlleles)
    {
        int foundersAltChromosomes = vc.GetCalledChrCount (allele, founderIds);
        alleleCounts.Add (vc.GetCalledChrCount (allele));

        // Guard the denominator as well as AN: when the founders have no called chromosome the
        // division would otherwise store NaN as the allele frequency.
        if (AN == 0 || totalFoundersChromosomes <= 0.0)
        {
            alleleFreqs.Add (0.0);
        }
        else
        {
            alleleFreqs.Add ((double)foundersAltChromosomes / totalFoundersChromosomes);
        }
    }

    // a single alternate allele is stored as a scalar, several as a list
    if (alleleCounts.Count == 1)
    {
        attributes [VCFConstants.ALLELE_COUNT_KEY] = alleleCounts [0];
    }
    else
    {
        attributes [VCFConstants.ALLELE_COUNT_KEY] = alleleCounts;
    }

    if (alleleFreqs.Count == 1)
    {
        attributes [VCFConstants.ALLELE_FREQUENCY_KEY] = alleleFreqs [0];
    }
    else
    {
        attributes [VCFConstants.ALLELE_FREQUENCY_KEY] = alleleFreqs;
    }

    return attributes;
}
/// <summary>
/// Builds a lookup from every allele of <paramref name="vc"/> to the text of its position in
/// <c>vc.Alleles</c>; the no-call allele is mapped to <c>VCFConstants.EMPTY_ALLELE</c>.
/// </summary>
/// <param name="vc"> the variant context whose alleles are indexed </param>
/// <returns> the allele-to-text map </returns>
private static IDictionary<Allele, string> buildAlleleMap(VariantContext vc)
{
    var textByAllele = new Dictionary<Allele, string>(vc.Alleles.Count + 1);

    // convenience for lookup; registered before the real alleles are numbered
    textByAllele[Allele.NO_CALL] = VCFConstants.EMPTY_ALLELE;

    int position = 0;
    foreach (Allele allele in vc.Alleles)
    {
        textByAllele[allele] = position.ToString();
        position++;
    }

    return textByAllele;
}
/// <summary>
/// Builds a lookup from every allele of <paramref name="vc"/> to the text of its position in
/// <c>vc.Alleles</c>; the no-call allele is mapped to <c>VCFConstants.EMPTY_ALLELE</c>.
/// </summary>
/// <param name="vc"> the variant context whose alleles are indexed </param>
/// <returns> the allele-to-text map </returns>
private static IDictionary <Allele, string> buildAlleleMap(VariantContext vc)
{
    var textByAllele = new Dictionary <Allele, string>(vc.Alleles.Count + 1);

    // convenience for lookup; registered before the real alleles are numbered
    textByAllele[Allele.NO_CALL] = VCFConstants.EMPTY_ALLELE;

    int position = 0;
    foreach (Allele allele in vc.Alleles)
    {
        textByAllele[allele] = position.ToString();
        position++;
    }

    return textByAllele;
}
/// <summary>
/// Get the number of values expected for this header field, given the properties of VariantContext vc.
///
/// INTEGER: the fixed <c>count</c> of this line.
/// UNBOUNDED: -1.
/// A: <c>vc.NAlleles - 1</c>.
/// G: <c>GenotypeLikelihoods.numLikelihoods</c> for the number of alleles in vc and the ploidy returned by
/// <c>vc.GetMaxPloidy(2)</c> (the 2 is presumably the diploid fallback used when there is no GT
/// information - confirm against GetMaxPloidy).
/// </summary>
/// <param name="vc"> the variant context the count is evaluated for </param>
/// <returns> the expected number of values, or -1 when unbounded </returns>
public virtual int getCount(VariantContext vc)
{
    if (countType == Bio.VCF.VCFHeaderLineCount.INTEGER)
    {
        return count;
    }

    if (countType == Bio.VCF.VCFHeaderLineCount.UNBOUNDED)
    {
        return -1;
    }

    if (countType == Bio.VCF.VCFHeaderLineCount.A)
    {
        return vc.NAlleles - 1;
    }

    if (countType == Bio.VCF.VCFHeaderLineCount.G)
    {
        int maxPloidy = vc.GetMaxPloidy(2);
        return GenotypeLikelihoods.numLikelihoods(vc.NAlleles, maxPloidy);
    }

    throw new VCFParsingError("Unknown count type: " + countType);
}
/// <summary>
/// Creates a builder whose fields are all initialized from the corresponding values of
/// <paramref name="parent"/>. This is the best way to create a derived VariantContext.
/// </summary>
/// <param name="parent"> the context to copy from; cannot be null </param>
/// <exception cref="System.ArgumentException"> when <paramref name="parent"/> is null </exception>
public VariantContextBuilder(VariantContext parent) : this()
{
    if (parent == null)
    {
        throw new System.ArgumentException("BUG: VariantContextBuilder parent argument cannot be null in VariantContextBuilder");
    }

    alleles_Renamed = parent.alleles;

    // The parent's attribute map is referenced, not copied, and the flag is cleared
    // (presumably so the builder does not mutate the parent's map in place - confirm).
    attributes_Renamed = (IDictionary <string, object>)parent.Attributes;
    attributesCanBeModified = false;

    Contig = parent.contig;
    filters_Renamed = (ISet <string>)parent.FiltersMaybeNull;
    genotypes_Renamed = parent.genotypes;
    ID = parent.ID;
    Log10PError = parent.Log10PError;
    Source = parent.Source;
    start_Renamed = parent.Start;
    stop_Renamed = parent.End;
    FullyDecoded = parent.FullyDecoded;
}
/// <summary>
/// Compute the end position for this VariantContext from the alleles themselves.
///
/// Without symbolic alleles the end is start + reference length - 1, clamped so that a reference of
/// length 0 or 1 yields end = start (the end position is inclusive).
/// If the alleles contain a symbolic allele, <paramref name="endForSymbolicAlleles"/> is returned instead.
/// </summary>
/// <param name="alleles"> the list of alleles to consider. The reference allele must be the first one </param>
/// <param name="start"> the known start position of this event </param>
/// <param name="endForSymbolicAlleles"> the end position to use if any of the alleles is symbolic. Can be -1
/// if none is expected, but an error is thrown if one is found </param>
/// <returns> the computed end position </returns>
/// <exception cref="Exception"> when the first allele is not the reference, or when a symbolic allele is
/// present and <paramref name="endForSymbolicAlleles"/> is -1 </exception>
public static int ComputeEndFromAlleles(IList <Allele> alleles, int start, int endForSymbolicAlleles)
{
    Allele reference = alleles [0];
    if (reference.NonReference)
    {
        throw new Exception("computeEndFromAlleles requires first allele to be reference");
    }

    if (VariantContext.HasSymbolicAlleles(alleles))
    {
        if (endForSymbolicAlleles == -1)
        {
            // -1 is the "no end supplied" sentinel, so the message must say the value is missing
            throw new Exception("computeEndFromAlleles found a symbolic allele but endForSymbolicAlleles was not provided");
        }
        return endForSymbolicAlleles;
    }

    return start + Math.Max(reference.Length - 1, 0);
}
/// <summary>
/// Get the number of values expected for this header field, given the properties of VariantContext vc.
///
/// INTEGER: the fixed <c>count</c> of this line.
/// UNBOUNDED: -1.
/// A: <c>vc.NAlleles - 1</c>.
/// G: <c>GenotypeLikelihoods.numLikelihoods</c> for the number of alleles in vc and the ploidy returned by
/// <c>vc.GetMaxPloidy(2)</c> (the 2 is presumably the diploid fallback used when there is no GT
/// information - confirm against GetMaxPloidy).
/// </summary>
/// <param name="vc"> the variant context the count is evaluated for </param>
/// <returns> the expected number of values, or -1 when unbounded </returns>
public virtual int getCount(VariantContext vc)
{
    if (countType == Bio.VCF.VCFHeaderLineCount.INTEGER)
    {
        return count;
    }

    if (countType == Bio.VCF.VCFHeaderLineCount.UNBOUNDED)
    {
        return -1;
    }

    if (countType == Bio.VCF.VCFHeaderLineCount.A)
    {
        return vc.NAlleles - 1;
    }

    if (countType == Bio.VCF.VCFHeaderLineCount.G)
    {
        int maxPloidy = vc.GetMaxPloidy(2);
        return GenotypeLikelihoods.numLikelihoods(vc.NAlleles, maxPloidy);
    }

    throw new VCFParsingError("Unknown count type: " + countType);
}
// --------------------------------------------------------------------------------
//
// implementation functions
//
// --------------------------------------------------------------------------------

/// <summary>
/// Produces the FILTER column text for <paramref name="vc"/>: the sorted filter names joined with ";"
/// when the record is filtered, otherwise the "passes" or "unfiltered" constant depending on
/// <c>vc.FiltersWereApplied</c>.
/// </summary>
/// <param name="vc"> the variant context being written </param>
/// <returns> the text for the FILTER column </returns>
private string getFilterString(VariantContext vc)
{
    if (!vc.Filtered)
    {
        return vc.FiltersWereApplied ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED;
    }

    // every filter that is written has to be declared in the header
    foreach (String filterName in vc.Filters)
    {
        if (!mHeader.hasFilterLine(filterName))
        {
            fieldIsMissingFromHeaderError(vc, filterName, "FILTER");
        }
    }

    var orderedFilters = ParsingUtils.SortList(vc.Filters.ToList());
    return String.Join(";", orderedFilters.ToArray());
}
/// <summary>
/// Creates a builder whose fields are all initialized from the corresponding values of
/// <paramref name="parent"/>. This is the best way to create a derived VariantContext.
/// </summary>
/// <param name="parent"> the context to copy from; cannot be null </param>
/// <exception cref="System.ArgumentException"> when <paramref name="parent"/> is null </exception>
public VariantContextBuilder (VariantContext parent) : this ()
{
    if (parent == null)
    {
        throw new System.ArgumentException ("BUG: VariantContextBuilder parent argument cannot be null in VariantContextBuilder");
    }

    alleles_Renamed = parent.alleles;

    // The parent's attribute map is referenced, not copied, and the flag is cleared
    // (presumably so the builder does not mutate the parent's map in place - confirm).
    attributes_Renamed = (IDictionary<string,object>)parent.Attributes;
    attributesCanBeModified = false;

    Contig = parent.contig;
    filters_Renamed = (ISet<string>)parent.FiltersMaybeNull;
    genotypes_Renamed = parent.genotypes;
    ID = parent.ID;
    Log10PError = parent.Log10PError;
    Source = parent.Source;
    start_Renamed = parent.Start;
    stop_Renamed = parent.End;
    FullyDecoded = parent.FullyDecoded;
}
/// <summary>
/// Tells whether this VariantContext and <paramref name="other"/> carry the same alleles.
/// </summary>
/// <param name="other"> VariantContext whose alleles to compare against </param>
/// <returns> true if <c>HasSameAlternateAllelesAs(other)</c> holds and the other reference allele equals
/// this one's (compared via <c>Equals(Reference, false)</c>). Otherwise returns false. </returns>
public bool HasSameAllelesAs (VariantContext other)
{
    // the alternate alleles are compared first; the reference is only looked at when they agree
    if (!HasSameAlternateAllelesAs (other))
    {
        return false;
    }

    return other.Reference.Equals (Reference, false);
}
/// <summary>
/// Determine which genotype fields are in use in the genotypes in VC.
/// </summary>
/// <param name="vc"> the variant context whose genotypes are inspected </param>
/// <returns> the sorted list of genotype fields in use in VC. GT is placed first when at least one
/// genotype is available; when no field is in use at all the list contains only GT. </returns>
private static IList<string> calcVCFGenotypeKeys(VariantContext vc)
{
    HashSet<string> keys = new HashSet<string>();
    bool sawGoodGT = false;

    foreach (Genotype g in vc.Genotypes)
    {
        //todo, make this a string later
        foreach (string s in g.ExtendedAttributes.Keys.Select(x => x.ToString()))
        {
            keys.Add(s);
        }

        if (g.Available)
        {
            sawGoodGT = true;
        }

        // The set ignores duplicates, so the inline fields can be registered directly.
        if (g.HasGQ)
        {
            // Previously this case was detected but no key was added, so GQ never reached the
            // FORMAT column. NOTE(review): assumes VCFConstants declares GENOTYPE_QUALITY_KEY.
            keys.Add(VCFConstants.GENOTYPE_QUALITY_KEY);
        }
        if (g.HasDP)
        {
            keys.Add(VCFConstants.DEPTH_KEY);
        }
        if (g.HasAD)
        {
            keys.Add(VCFConstants.GENOTYPE_ALLELE_DEPTHS);
        }
        if (g.HasPL)
        {
            keys.Add(VCFConstants.GENOTYPE_PL_KEY);
        }
        if (g.Filtered)
        {
            keys.Add(VCFConstants.GENOTYPE_FILTER_KEY);
        }
    }

    IList<string> sortedList = ParsingUtils.SortList(new List<string>(keys));

    // make sure the GT is first
    if (sawGoodGT)
    {
        IList<string> newList = new List<string>(sortedList.Count + 1);
        newList.Add(VCFConstants.GENOTYPE_KEY);
        foreach (string s in sortedList)
        {
            newList.Add(s);
        }
        sortedList = newList;
    }

    if (sortedList.Count == 0)
    {
        // this needs to be done in case all samples are no-calls
        return new List<string>() { VCFConstants.GENOTYPE_KEY };
    }

    return sortedList;
}
/// <summary>
/// Computes the length of the interval covered by the variant, counting both ends
/// (end - start + 1).
/// </summary>
/// <returns>The size.</returns>
/// <param name="vc">The variant context to measure.</param>
public static int GetSize (VariantContext vc)
{
    int lastPosition = vc.End;
    int firstPosition = vc.Start;
    return lastPosition - firstPosition + 1;
}
/// <summary>
/// Update the attributes map so that its chromosome-count VCF tags (allele number, allele count and
/// allele frequency) reflect the genotypes of the given VariantContext.
/// </summary>
/// <param name="vc"> the VariantContext </param>
/// <param name="attributes"> the attributes map to populate; must not be null; may contain old values </param>
/// <param name="removeStaleValues"> when true and no chromosome is called, the existing allele count,
/// allele frequency and allele number entries are removed and nothing else is done </param>
/// <param name="founderIds"> set of founder ids handed to <c>GetCalledChrCount</c>; allele frequencies are
/// computed from the founder counts it returns. How an empty or null set is treated is decided by
/// <c>GetCalledChrCount</c> (the original documentation states that all samples are then counted). </param>
/// <returns> the attributes map provided as input, returned for programming convenience </returns>
public static IDictionary <string, object> CalculateChromosomeCounts(VariantContext vc, IDictionary <string, object> attributes, bool removeStaleValues, ISet <string> founderIds)
{
    int AN = vc.CalledChrCount;

    // if everyone is a no-call, remove the old attributes if requested
    if (AN == 0 && removeStaleValues)
    {
        // IDictionary.Remove simply returns false for an absent key, so no ContainsKey probe is needed
        attributes.Remove(VCFConstants.ALLELE_COUNT_KEY);
        attributes.Remove(VCFConstants.ALLELE_FREQUENCY_KEY);
        attributes.Remove(VCFConstants.ALLELE_NUMBER_KEY);
        return attributes;
    }

    if (!vc.HasGenotypes)
    {
        return attributes;
    }

    attributes [VCFConstants.ALLELE_NUMBER_KEY] = AN;

    if (vc.AlternateAlleles.Count == 0)
    {
        // if there's no alt AC and AF shouldn't be present
        attributes.Remove(VCFConstants.ALLELE_COUNT_KEY);
        attributes.Remove(VCFConstants.ALLELE_FREQUENCY_KEY);
        return attributes;
    }

    // if there are alternate alleles, record the relevant tags
    List <double> alleleFreqs = new List <double> ();
    List <int> alleleCounts = new List <int> ();
    double totalFoundersChromosomes = (double)vc.GetCalledChrCount(founderIds);

    foreach (Allele allele in vc.AlternateAlleles)
    {
        int foundersAltChromosomes = vc.GetCalledChrCount(allele, founderIds);
        alleleCounts.Add(vc.GetCalledChrCount(allele));

        // Guard the denominator as well as AN: when the founders have no called chromosome the
        // division would otherwise store NaN as the allele frequency.
        if (AN == 0 || totalFoundersChromosomes <= 0.0)
        {
            alleleFreqs.Add(0.0);
        }
        else
        {
            alleleFreqs.Add((double)foundersAltChromosomes / totalFoundersChromosomes);
        }
    }

    // a single alternate allele is stored as a scalar, several as a list
    if (alleleCounts.Count == 1)
    {
        attributes [VCFConstants.ALLELE_COUNT_KEY] = alleleCounts [0];
    }
    else
    {
        attributes [VCFConstants.ALLELE_COUNT_KEY] = alleleCounts;
    }

    if (alleleFreqs.Count == 1)
    {
        attributes [VCFConstants.ALLELE_FREQUENCY_KEY] = alleleFreqs [0];
    }
    else
    {
        attributes [VCFConstants.ALLELE_FREQUENCY_KEY] = alleleFreqs;
    }

    return attributes;
}
/// <summary>
/// Builds a new VC from <paramref name="vc"/> with the genotypes dropped.
/// </summary>
/// <param name="vc"> variant context </param>
/// <returns> the context made by a builder seeded from vc after <c>noGenotypes()</c> </returns>
public static VariantContext SitesOnlyVariantContext(VariantContext vc)
{
    VariantContextBuilder sitesOnlyBuilder = new VariantContextBuilder(vc);
    return sitesOnlyBuilder.noGenotypes().make();
}
/// <summary>
/// Convenience overload: forwards to the <c>getLog10GQ</c> overload that takes the allele list,
/// using the alleles of <paramref name="context"/>.
/// </summary>
/// <param name="genotype"> the genotype passed through unchanged </param>
/// <param name="context"> the variant context supplying the alleles </param>
/// <returns> whatever the allele-list overload returns </returns>
public double getLog10GQ(Genotype genotype, VariantContext context)
{
    var contextAlleles = context.Alleles;
    return getLog10GQ(genotype, contextAlleles);
}
/// <summary>
/// Copies the display strings of the first genotype's alleles into <paramref name="variant"/>:
/// value1 gets the first allele, value2 the second one or, when there is only one allele, value1 again.
/// Nothing is changed when the context has no genotypes.
/// </summary>
/// <param name="variant"> the record to fill </param>
/// <param name="context"> the variant context read from </param>
private void SetVariantValues(patient_variants variant, VariantContext context)
{
    if (context.Genotypes.Count < 1)
    {
        return;
    }

    // only the first genotype is consulted
    var firstGenotypeAlleles = context.Genotypes[0].Alleles;
    variant.value1 = firstGenotypeAlleles[0].DisplayString;
    variant.value2 = firstGenotypeAlleles.Count > 1
        ? firstGenotypeAlleles[1].DisplayString
        : variant.value1;
}
/// <summary>
/// Parses a line from a VCF File
/// </summary>
/// <param name="parts">The fields of the line: at least the 8 fixed columns (CHROM, POS, ID, REF, ALT,
/// QUAL, FILTER, INFO); a 9th element, when present, holds the unsplit genotype data</param>
/// <param name="includeGenotypes"> Whether or not to also parse the genotype data </param>
/// <returns>The VariantContext made by the builder; null only if <c>generateException</c> returns
/// instead of throwing after a failed <c>make()</c> (it presumably always throws - confirm)</returns>
private VariantContext parseVCFLine(string[] parts, bool includeGenotypes)
{
    VariantContextBuilder builder = new VariantContextBuilder();
    builder.Source = Name;

    // increment the line count
    lineNo++;

    // parse out the required fields
    string chr = GetCachedString(parts [0]);
    builder.Contig = chr;

    // TryParse with the invariant culture reports malformed AND out-of-range positions through
    // generateException (previously an overflow escaped as a raw OverflowException).
    int pos;
    if (!int.TryParse(parts [1], System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out pos))
    {
        pos = -1;
        generateException(parts [1] + " is not a valid start position in the VCF format");
    }
    builder.Start = pos;

    if (parts [2].Length == 0)
    {
        generateException("The VCF specification requires a valid ID field");
    }
    else if (parts [2].Equals(VCFConstants.EMPTY_ID_FIELD))
    {
        builder.ID = VCFConstants.EMPTY_ID_FIELD;
    }
    else
    {
        builder.ID = parts [2];
    }

    // Allele text is machine data: upper-case it culture-independently so the result does not
    // depend on the thread culture (e.g. the Turkish dotted/dotless i).
    string refe = GetCachedString(parts [3].ToUpperInvariant());
    string alts = GetCachedString(parts [4].ToUpperInvariant());
    builder.Log10PError = parseQual(parts [5]);

    string filterStr = GetCachedString(parts [6]);
    var filters = filterHash [filterStr];
    if (filters != null) //means filter data present
    {
        builder.SetFilters(filters.Hash);
    }

    IDictionary <string, object> attrs = parseInfo(parts [7]);
    builder.Attributes = attrs;

    if (attrs.ContainsKey(VCFConstants.END_KEY))
    {
        // update stop with the end key if provided
        try
        {
            builder.Stop = Convert.ToInt32(attrs [VCFConstants.END_KEY].ToString(), System.Globalization.CultureInfo.InvariantCulture);
        }
        catch (Exception)
        {
            generateException("the END value in the INFO field is not valid");
        }
    }
    else
    {
        builder.Stop = (pos + refe.Length - 1);
    }

    // get our alleles, filters, and setup an attribute map
    IList <Allele> alleles = parseAlleles(refe, alts, lineNo);
    builder.SetAlleles(alleles);

    // do we have genotyping data
    if (parts.Length > NUM_STANDARD_FIELDS && includeGenotypes)
    {
        int nGenotypes = header.NGenotypeSamples;
        LazyGenotypesContext lazy = new LazyGenotypesContext(this, alleles, chr, pos, parts [8], nGenotypes);

        // did we resort the sample names? If so, we need to load the genotype data
        if (!header.SamplesWereAlreadySorted)
        {
            lazy.Decode();
        }
        builder.SetGenotypes(lazy, false);
    }

    VariantContext vc = null;
    try
    {
        vc = builder.make();
    }
    catch (Exception e)
    {
        generateException(e.Message);
    }
    return vc;
}
/// <summary>
/// Formats a variant context as one VCF record line (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO and,
/// when genotype data is present, FORMAT plus the per-sample columns), terminated by "\n".
/// </summary>
/// <param name="vc">The Variant Context object; its genotypes are dropped first when
/// <c>doNotWriteGenotypes</c> is set</param>
/// <returns>The text of the record line</returns>
protected string getVariantLinetoWrite(VariantContext vc)
{
    if (doNotWriteGenotypes)
    {
        vc = (new VariantContextBuilder(vc)).noGenotypes().make();
    }

    try
    {
        //Convert alleles to 1,2,3,etc. numbering
        IDictionary<Allele, string> alleleMap = buildAlleleMap(vc);

        StringBuilder lineToWrite = new StringBuilder();

        // CHROM, POS, ID, REF
        lineToWrite.Append(String.Join(VCFConstants.FIELD_SEPARATOR, vc.Chr, vc.Start.ToString(), vc.ID, vc.Reference.DisplayString));
        // String.Join puts no separator after the last item, so REF needs its own terminator;
        // without it the ALT text is glued onto the REF column.
        lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);

        // ALT
        if (vc.Variant)
        {
            Allele altAllele = vc.GetAlternateAllele(0);
            string alt = altAllele.DisplayString;
            lineToWrite.Append(alt);

            for (int i = 1; i < vc.AlternateAlleles.Count; i++)
            {
                altAllele = vc.GetAlternateAllele(i);
                alt = altAllele.DisplayString;
                lineToWrite.Append(",");
                lineToWrite.Append(alt);
            }
        }
        else
        {
            lineToWrite.Append(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
        }
        lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);

        // QUAL
        if (!vc.HasLog10PError)
        {
            lineToWrite.Append(VCFConstants.MISSING_VALUE_v4);
        }
        else
        {
            lineToWrite.Append(formatQualValue(vc.PhredScaledQual));
        }
        lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);

        // FILTER
        string filters = getFilterString(vc);
        lineToWrite.Append(filters);
        lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);

        // INFO (a sorted dictionary, so the keys come out in key order)
        IDictionary<string, string> infoFields = new SortedDictionary<string, string>();
        foreach (KeyValuePair<string, object> field in vc.Attributes)
        {
            string key = field.Key;
            if (!mHeader.hasInfoLine(key))
            {
                fieldIsMissingFromHeaderError(vc, key, "INFO");
            }

            string outputValue = formatVCFField(field.Value);
            if (outputValue != null)
            {
                infoFields[key] = outputValue;
            }
        }
        lineToWrite.Append(getInfoString(infoFields));

        // FORMAT
        GenotypesContext gc = vc.Genotypes;
        if (gc.LazyWithData && ((LazyGenotypesContext)gc).UnparsedGenotypeData is string)
        {
            // still-unparsed genotype text is copied through verbatim
            lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);
            lineToWrite.Append(((LazyGenotypesContext)gc).UnparsedGenotypeData.ToString());
        }
        else
        {
            IList<string> genotypeAttributeKeys = calcVCFGenotypeKeys(vc);
            if (genotypeAttributeKeys.Count > 0)
            {
                foreach (String format in genotypeAttributeKeys)
                {
                    if (!mHeader.hasFormatLine(format))
                    {
                        fieldIsMissingFromHeaderError(vc, format, "FORMAT");
                    }
                }

                string genotypeFormatString = String.Join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
                lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);
                lineToWrite.Append(genotypeFormatString);
                lineToWrite.Append(getGenotypeDataText(vc, alleleMap, genotypeAttributeKeys));
            }
        }

        lineToWrite.Append("\n");
        return lineToWrite.ToString();
    }
    catch (IOException e)
    {
        throw new Exception("Unable to write the VCF object:\n " + vc.ToString() + "\n", e);
    }
}
/// <summary>
/// Convenience overload: forwards to the <c>getLog10GQ</c> overload that takes the allele list,
/// using the alleles of <paramref name="context"/>.
/// </summary>
/// <param name="genotype"> the genotype passed through unchanged </param>
/// <param name="context"> the variant context supplying the alleles </param>
/// <returns> whatever the allele-list overload returns </returns>
public double getLog10GQ(Genotype genotype, VariantContext context)
{
    var contextAlleles = context.Alleles;
    return getLog10GQ(genotype, contextAlleles);
}
/// <summary>
/// Appends one <c>result_entities</c> row per allele of the first genotype of
/// <paramref name="context"/> to <paramref name="variantEntities"/>. Nothing is added when the
/// context has no genotypes.
/// </summary>
/// <param name="context"> the variant context read from </param>
/// <param name="patientId"> stored in <c>patient_id</c> of every row </param>
/// <param name="fileId"> stored in <c>result_file_id</c> of every row </param>
/// <param name="parent"> stored in <c>parent</c> of every row </param>
/// <param name="variantEntities"> the list the new rows are appended to </param>
private void SetVariantValues(VariantContext context, int patientId, int fileId, result_entities parent, List<result_entities> variantEntities)
{
    if (context.Genotypes.Count <= 0)
    {
        return;
    }

    // only the first genotype is consulted
    var firstGenotypeAlleles = context.Genotypes[0].Alleles;
    foreach (var currentAllele in firstGenotypeAlleles)
    {
        // the "SNP allele" attribute is looked up anew for every row
        var row = new result_entities()
        {
            patient_id = patientId,
            result_file_id = fileId,
            attribute_id = EntityRepository.GetAttribute(null, null, "SNP allele", null).id,
            parent = parent,
            value_short_text = currentAllele.DisplayString
        };
        variantEntities.Add(row);
    }
}
/// <summary>
/// Builds the per-sample genotype columns of a record, one column (preceded by a field separator)
/// for every sample name of the header.
/// </summary>
/// <param name="vc"> the variant context </param>
/// <param name="alleleMap"> alleles for this context, mapped to the text written for them </param>
/// <param name="genotypeFormatKeys"> the FORMAT keys, in the order their values are written </param>
/// <returns> the text of all sample columns </returns>
/// <exception cref="Exception"> when GT is requested but a sample's genotype is not available </exception>
private string getGenotypeDataText(VariantContext vc, IDictionary <Allele, string> alleleMap, IList <string> genotypeFormatKeys)
{
    StringBuilder text = new StringBuilder();
    int maxPloidy = vc.GetMaxPloidy(2);

    foreach (string sampleName in mHeader.GenotypeSampleNames)
    {
        text.Append(VCFConstants.FIELD_SEPARATOR);

        // samples without a genotype in this record get a missing one
        Genotype genotype = vc.GetGenotype(sampleName);
        if (genotype == null)
        {
            genotype = GenotypeBuilder.CreateMissing(sampleName, maxPloidy);
        }

        IList <string> values = new List <string>(genotypeFormatKeys.Count);
        foreach (string key in genotypeFormatKeys)
        {
            if (key.Equals(VCFConstants.GENOTYPE_KEY))
            {
                // GT goes straight into the output; it is not collected with the other values
                if (!genotype.Available)
                {
                    throw new Exception("GTs cannot be missing for some samples if they are available for others in the record");
                }

                text.Append(getAlleleText(genotype.getAllele(0), alleleMap));
                for (int alleleIndex = 1; alleleIndex < genotype.Ploidy; alleleIndex++)
                {
                    text.Append(genotype.Phased ? VCFConstants.PHASED : VCFConstants.UNPHASED);
                    text.Append(getAlleleText(genotype.getAllele(alleleIndex), alleleMap));
                }
                continue;
            }

            string formatted;
            if (key.Equals(VCFConstants.GENOTYPE_FILTER_KEY))
            {
                formatted = genotype.Filtered ? genotype.Filters : VCFConstants.PASSES_FILTERS_v4;
            }
            else
            {
                IntGenotypeFieldAccessors.Accessor intAccessor = intGenotypeFieldAccessors.GetAccessor(key);
                if (intAccessor == null)
                {
                    // no integer accessor for this key: take the extended attribute, or the missing marker
                    object rawValue = genotype.HasExtendedAttribute(key) ? genotype.GetExtendedAttribute(key) : VCFConstants.MISSING_VALUE_v4;

                    VCFFormatHeaderLine headerLine = mHeader.getFormatHeaderLine(key);
                    if (headerLine != null)
                    {
                        int expectedCount = headerLine.getCount(vc);
                        if (expectedCount > 1 && rawValue.Equals(VCFConstants.MISSING_VALUE_v4))
                        {
                            // If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
                            // For example, if Number=2, the string has to be ".,."
                            StringBuilder missingList = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
                            int written = 1;
                            while (written < expectedCount)
                            {
                                missingList.Append(",");
                                missingList.Append(VCFConstants.MISSING_VALUE_v4);
                                written++;
                            }
                            rawValue = missingList.ToString();
                        }
                    }

                    // assume that if key is absent, then the given string encoding suffices
                    formatted = formatVCFField(rawValue);
                }
                else
                {
                    int[] numbers = intAccessor.getValues(genotype);
                    if (numbers == null)
                    {
                        formatted = VCFConstants.MISSING_VALUE_v4;
                    }
                    else if (numbers.Length == 1) // fast path
                    {
                        formatted = Convert.ToString(numbers[0]);
                    }
                    else
                    {
                        StringBuilder joined = new StringBuilder();
                        joined.Append(numbers[0]);
                        for (int n = 1; n < numbers.Length; n++)
                        {
                            joined.Append(",");
                            joined.Append(numbers[n]);
                        }
                        formatted = joined.ToString();
                    }
                }
            }

            if (formatted != null)
            {
                values.Add(formatted);
            }
        }

        // strip off trailing missing values
        while (values.Count > 0 && isMissingValue(values[values.Count - 1]))
        {
            values.RemoveAt(values.Count - 1);
        }

        // the first value only needs a leading separator when GT was written before it
        for (int v = 0; v < values.Count; v++)
        {
            if (v > 0 || genotypeFormatKeys.Contains(VCFConstants.GENOTYPE_KEY))
            {
                text.Append(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
            }
            text.Append(values[v]);
        }
    }

    return text.ToString();
}
/// <summary>
/// Writes one variant context as a VCF record: the fixed columns CHROM to INFO, then FORMAT and the
/// sample columns when genotype data is present, then a newline.
/// </summary>
/// <param name="vc"> the Variant Context object; its genotypes are dropped first when
/// <c>doNotWriteGenotypes</c> is set </param>
public override void add(VariantContext vc)
{
    if (mHeader == null)
    {
        throw new IllegalStateException("The VCF Header must be written before records can be added: " + StreamName);
    }

    if (doNotWriteGenotypes)
    {
        vc = (new VariantContextBuilder(vc)).noGenotypes().make();
    }

    try
    {
        base.add(vc);

        IDictionary<Allele, string> alleleMap = buildAlleleMap(vc);

        // CHROM
        write(vc.Chr);
        write(VCFConstants.FIELD_SEPARATOR);

        // POS
        write(Convert.ToString(vc.Start));
        write(VCFConstants.FIELD_SEPARATOR);

        // ID
        write(vc.ID);
        write(VCFConstants.FIELD_SEPARATOR);

        // REF
        write(vc.Reference.DisplayString);
        write(VCFConstants.FIELD_SEPARATOR);

        // ALT
        if (!vc.Variant)
        {
            write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
        }
        else
        {
            write(vc.getAlternateAllele(0).DisplayString);

            // further alternate alleles are comma separated
            for (int altIndex = 1; altIndex < vc.AlternateAlleles.Count; altIndex++)
            {
                Allele nextAlt = vc.getAlternateAllele(altIndex);
                string nextAltText = nextAlt.DisplayString;
                write(",");
                write(nextAltText);
            }
        }
        write(VCFConstants.FIELD_SEPARATOR);

        // QUAL
        if (vc.hasLog10PError())
        {
            write(formatQualValue(vc.PhredScaledQual));
        }
        else
        {
            write(VCFConstants.MISSING_VALUE_v4);
        }
        write(VCFConstants.FIELD_SEPARATOR);

        // FILTER
        write(getFilterString(vc));
        write(VCFConstants.FIELD_SEPARATOR);

        // INFO (a sorted dictionary, so the keys come out in key order)
        IDictionary<string, string> infoFields = new SortedDictionary<string, string>();
        foreach (KeyValuePair<string, object> attribute in vc.Attributes)
        {
            string infoKey = attribute.Key;
            if (!mHeader.hasInfoLine(infoKey))
            {
                fieldIsMissingFromHeaderError(vc, infoKey, "INFO");
            }

            string infoText = formatVCFField(attribute.Value);
            if (infoText != null)
            {
                infoFields[infoKey] = infoText;
            }
        }
        writeInfoString(infoFields);

        // FORMAT
        GenotypesContext gc = vc.Genotypes;
        if (gc.LazyWithData && ((LazyGenotypesContext)gc).UnparsedGenotypeData is string)
        {
            // still-unparsed genotype text is copied through verbatim
            write(VCFConstants.FIELD_SEPARATOR);
            write(((LazyGenotypesContext) gc).UnparsedGenotypeData.ToString());
        }
        else
        {
            IList<string> formatKeys = calcVCFGenotypeKeys(vc, mHeader);
            if (formatKeys.Count > 0)
            {
                foreach (String formatKey in formatKeys)
                {
                    if (!mHeader.hasFormatLine(formatKey))
                    {
                        fieldIsMissingFromHeaderError(vc, formatKey, "FORMAT");
                    }
                }

                string formatColumn = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, formatKeys);

                write(VCFConstants.FIELD_SEPARATOR);
                write(formatColumn);

                addGenotypeData(vc, alleleMap, formatKeys);
            }
        }

        write("\n");

        // note that we cannot call flush here if we want block gzipping to work properly
        // calling flush results in all gzipped blocks for each variant
        flushBuffer();
    }
    catch (IOException e)
    {
        throw new Exception("Unable to write the VCF object to " + StreamName, e);
    }
}
/// <summary>
/// Recomputes the chromosome-based VCF tags for the context currently produced by
/// <c>builder.make()</c> and stores the resulting attribute map back into the builder.
/// The computation works on a copy of the made context's attributes.
/// </summary>
/// <param name="builder"> the VariantContextBuilder we are updating </param>
/// <param name="removeStaleValues"> should we remove stale values from the mapping? </param>
/// <param name="founderIds"> set of founders, passed through unchanged to the overload that takes a
/// VariantContext and an attribute map </param>
public static void CalculateChromosomeCounts(VariantContextBuilder builder, bool removeStaleValues, ISet <string> founderIds)
{
    VariantContext current = builder.make();
    var attributesCopy = new Dictionary <string, object> (current.Attributes);
    builder.Attributes = CalculateChromosomeCounts(current, attributesCopy, removeStaleValues, founderIds);
}
/// <summary>
/// Builds a lookup from every allele of <paramref name="vc"/> to the text of its position in
/// <c>vc.Alleles</c>; the no-call allele is mapped to <c>VCFConstants.EMPTY_ALLELE</c>.
/// </summary>
/// <param name="vc"> the variant context whose alleles are indexed </param>
/// <returns> the allele-to-text map </returns>
private static IDictionary<Allele, string> buildAlleleMap(VariantContext vc)
{
    var textByAllele = new Dictionary<Allele, string>(vc.Alleles.Count + 1);

    // convenience for lookup; registered before the real alleles are numbered
    textByAllele[Allele.NO_CALL] = VCFConstants.EMPTY_ALLELE;

    int position = 0;
    foreach (Allele allele in vc.Alleles)
    {
        textByAllele[allele] = position.ToString();
        position++;
    }

    return textByAllele;
}
/// <summary>
/// Computes the length of the interval covered by the variant, counting both ends
/// (end - start + 1).
/// </summary>
/// <returns>The size.</returns>
/// <param name="vc">The variant context to measure.</param>
public static int GetSize(VariantContext vc)
{
    int lastPosition = vc.End;
    int firstPosition = vc.Start;
    return lastPosition - firstPosition + 1;
}
// --------------------------------------------------------------------------------
//
// implementation functions
//
// --------------------------------------------------------------------------------

/// <summary>
/// Produces the FILTER column text for <paramref name="vc"/>: the sorted filter names joined with ";"
/// when the record is filtered, otherwise the "passes" or "unfiltered" constant depending on
/// <c>vc.filtersWereApplied()</c>.
/// </summary>
/// <param name="vc"> the variant context being written </param>
/// <returns> the text for the FILTER column </returns>
private string getFilterString(VariantContext vc)
{
    if (!vc.Filtered)
    {
        return vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED;
    }

    // every filter that is written has to be declared in the header
    foreach (String filterName in vc.Filters)
    {
        if (!mHeader.hasFilterLine(filterName))
        {
            fieldIsMissingFromHeaderError(vc, filterName, "FILTER");
        }
    }

    var orderedFilters = ParsingUtils.sortList(vc.Filters);
    return ParsingUtils.join(";", orderedFilters);
}
/// <summary>
/// Convenience overload of the chromosome-count update that supplies an empty founder set and
/// otherwise forwards its arguments unchanged to the four-argument overload.
/// </summary>
/// <param name="vc"> the VariantContext </param>
/// <param name="attributes"> the attributes map to populate; must not be null; may contain old values </param>
/// <param name="removeStaleValues"> should we remove stale values from the mapping? </param>
/// <returns> the value returned by the four-argument overload </returns>
public static IDictionary <string, object> CalculateChromosomeCounts(VariantContext vc, IDictionary <string, object> attributes, bool removeStaleValues)
{
    var noFounders = new HashSet <string> ();
    return CalculateChromosomeCounts(vc, attributes, removeStaleValues, noFounders);
}
/// <summary>
/// Writes the per-sample genotype columns of a record. For every sample named in
/// the header a field separator is written, followed by the values of each
/// FORMAT key; trailing missing values are dropped.
/// </summary>
/// <param name="vc"> the variant context </param>
/// <param name="alleleMap"> alleles for this context </param>
/// <param name="genotypeFormatKeys"> the FORMAT keys, in output order </param>
/// <exception cref="IOException"> for writer </exception>
private void addGenotypeData(VariantContext vc, IDictionary<Allele, string> alleleMap, IList<string> genotypeFormatKeys)
{
    int maxPloidy = vc.getMaxPloidy(2);

    foreach (string sampleName in mHeader.GenotypeSampleNames)
    {
        write(VCFConstants.FIELD_SEPARATOR);

        // samples absent from this record are written as missing genotypes
        Genotype genotype = vc.getGenotype(sampleName);
        if (g_isNull(genotype))
        {
            genotype = GenotypeBuilder.createMissing(sampleName, maxPloidy);
        }

        IList<string> values = new List<string>(genotypeFormatKeys.Count);

        foreach (string key in genotypeFormatKeys)
        {
            if (key.Equals(VCFConstants.GENOTYPE_KEY))
            {
                // GT is written straight to the output instead of being collected
                if (!genotype.Available)
                {
                    throw new IllegalStateException("GTs cannot be missing for some samples if they are available for others in the record");
                }

                writeAllele(genotype.getAllele(0), alleleMap);
                for (int alleleIndex = 1; alleleIndex < genotype.Ploidy; alleleIndex++)
                {
                    write(genotype.Phased ? VCFConstants.PHASED : VCFConstants.UNPHASED);
                    writeAllele(genotype.getAllele(alleleIndex), alleleMap);
                }

                continue;
            }

            string text;
            if (key.Equals(VCFConstants.GENOTYPE_FILTER_KEY))
            {
                if (genotype.Filtered)
                {
                    text = genotype.Filters;
                }
                else
                {
                    text = VCFConstants.PASSES_FILTERS_v4;
                }
            }
            else
            {
                IntGenotypeFieldAccessors.Accessor intAccessor = intGenotypeFieldAccessors.getAccessor(key);
                if (intAccessor == null)
                {
                    object raw;
                    if (genotype.hasExtendedAttribute(key))
                    {
                        raw = genotype.getExtendedAttribute(key);
                    }
                    else
                    {
                        raw = VCFConstants.MISSING_VALUE_v4;
                    }

                    VCFFormatHeaderLine formatLine = mHeader.getFormatHeaderLine(key);
                    if (formatLine != null)
                    {
                        int expectedCount = formatLine.getCount(vc);
                        if (expectedCount > 1 && raw.Equals(VCFConstants.MISSING_VALUE_v4))
                        {
                            // If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
                            // For example, if Number=2, the string has to be ".,."
                            StringBuilder padded = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
                            for (int filled = 1; filled < expectedCount; filled++)
                            {
                                padded.Append(",");
                                padded.Append(VCFConstants.MISSING_VALUE_v4);
                            }
                            raw = padded.ToString();
                        }
                    }

                    // assume that if key is absent, then the given string encoding suffices
                    text = formatVCFField(raw);
                }
                else
                {
                    int[] numbers = intAccessor.getValues(genotype);
                    if (numbers == null)
                    {
                        text = VCFConstants.MISSING_VALUE_v4;
                    }
                    else if (numbers.Length == 1)
                    {
                        // fast path
                        text = Convert.ToString(numbers[0]);
                    }
                    else
                    {
                        StringBuilder joined = new StringBuilder();
                        joined.Append(numbers[0]);
                        for (int n = 1; n < numbers.Length; n++)
                        {
                            joined.Append(",");
                            joined.Append(numbers[n]);
                        }
                        text = joined.ToString();
                    }
                }
            }

            if (text != null)
            {
                values.Add(text);
            }
        }

        // strip off trailing missing values
        int tail = values.Count - 1;
        while (tail >= 0 && isMissingValue(values[tail]))
        {
            values.RemoveAt(tail);
            tail--;
        }

        for (int slot = 0; slot < values.Count; slot++)
        {
            // the first collected value needs a leading separator only when GT was written before it
            if (slot > 0 || genotypeFormatKeys.Contains(VCFConstants.GENOTYPE_KEY))
            {
                write(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
            }
            write(values[slot]);
        }
    }
}

/// <summary> Null test for a genotype, kept separate so the main loop reads as a sequence of guards. </summary>
private static bool g_isNull(Genotype candidate)
{
    return candidate == null;
}
/// <summary>
/// Creates a copy of the given variant context with its genotypes removed.
/// The copy is produced by a VariantContextBuilder seeded from the input.
/// </summary>
/// <param name="vc"> variant context </param>
/// <returns> new VC without genotypes </returns>
public static VariantContext SitesOnlyVariantContext (VariantContext vc)
{
    VariantContextBuilder siteBuilder = new VariantContextBuilder (vc);
    return siteBuilder.noGenotypes ().make ();
}
/// <summary>
/// Determine which genotype fields are in use in the genotypes in VC
/// </summary>
/// <param name="vc"> the variant context whose genotypes are inspected </param>
/// <param name="header"> the header; consulted only to decide whether a lone GT key must be emitted when no keys were found </param>
/// <returns>
/// a sorted list of the genotype fields in use in VC, with GT placed first when at
/// least one genotype is available. When nothing is in use but the header declares
/// genotyping data, a single-element list holding GT is returned.
/// </returns>
public static IList<string> calcVCFGenotypeKeys(VariantContext vc, VCFHeader header)
{
    // HashSet replaces the Java Set left behind by the converter
    HashSet<string> keys = new HashSet<string>();

    bool sawGoodGT = false;
    bool sawGoodQual = false;
    bool sawGenotypeFilter = false;
    bool sawDP = false;
    bool sawAD = false;
    bool sawPL = false;

    foreach (Genotype g in vc.Genotypes)
    {
        // explicit element type so this works whatever the declared key type of the attribute map is
        foreach (string attributeKey in g.ExtendedAttributes.Keys)
        {
            keys.Add(attributeKey);
        }

        if (g.Available)
        {
            sawGoodGT = true;
        }
        if (g.hasGQ())
        {
            sawGoodQual = true;
        }
        if (g.hasDP())
        {
            sawDP = true;
        }
        if (g.hasAD())
        {
            sawAD = true;
        }
        if (g.hasPL())
        {
            sawPL = true;
        }
        if (g.Filtered)
        {
            sawGenotypeFilter = true;
        }
    }

    if (sawGoodQual)
    {
        keys.Add(VCFConstants.GENOTYPE_QUALITY_KEY);
    }
    if (sawDP)
    {
        keys.Add(VCFConstants.DEPTH_KEY);
    }
    if (sawAD)
    {
        keys.Add(VCFConstants.GENOTYPE_ALLELE_DEPTHS);
    }
    if (sawPL)
    {
        keys.Add(VCFConstants.GENOTYPE_PL_KEY);
    }
    if (sawGenotypeFilter)
    {
        keys.Add(VCFConstants.GENOTYPE_FILTER_KEY);
    }

    IList<string> sortedList = ParsingUtils.sortList(new List<string>(keys));

    // make sure the GT is first
    if (sawGoodGT)
    {
        // declared as List<string> because AddRange is not part of IList<string>
        List<string> newList = new List<string>(sortedList.Count + 1);
        newList.Add(VCFConstants.GENOTYPE_KEY);
        newList.AddRange(sortedList);
        sortedList = newList;
    }

    if (sortedList.Count == 0 && header.hasGenotypingData())
    {
        // this needs to be done in case all samples are no-calls
        return new List<string> { VCFConstants.GENOTYPE_KEY };
    }

    return sortedList;
}
/// <summary>
/// Refreshes the chromosome-based VCF tags in the attributes map for the given
/// VariantContext. This overload supplies an empty founder-id set and delegates
/// to the overload that accepts founder ids.
/// </summary>
/// <param name="vc"> the VariantContext </param>
/// <param name="attributes"> the attributes map to populate; must not be null; may contain old values </param>
/// <param name="removeStaleValues"> should we remove stale values from the mapping? </param>
/// <returns> the attributes map provided as input, returned for programming convenience </returns>
public static IDictionary<string, object> CalculateChromosomeCounts (VariantContext vc, IDictionary<string, object> attributes, bool removeStaleValues)
{
    HashSet<string> emptyFounderIds = new HashSet<string> ();
    IDictionary<string, object> updated = CalculateChromosomeCounts (vc, attributes, removeStaleValues, emptyFounderIds);
    return updated;
}
/// <summary>
/// Reports a key that is used by a record but not declared in the header.
/// Does nothing when allowMissingFieldsInHeader is set; otherwise throws.
/// </summary>
/// <param name="vc"> the variant context in which the key was found; its Chr and Start go into the message </param>
/// <param name="id"> the undeclared key </param>
/// <param name="field"> the name of the record field in which the key was found </param>
private void fieldIsMissingFromHeaderError(VariantContext vc, string id, string field)
{
    if (allowMissingFieldsInHeader)
    {
        return;
    }

    string location = vc.Chr + ":" + vc.Start;
    string message = "Key " + id + " found in VariantContext field " + field + " at " + location
        + " but this key isn't defined in the VCFHeader. We require all VCFs to have"
        + " complete VCF headers by default.";

    throw new IllegalStateException(message);
}
/// <summary>
/// Copy constructor: initialises this instance from the corresponding properties
/// of another VariantContext by delegating to the full constructor, passing
/// NO_VALIDATION as the final argument.
/// For the other ways of constructing instances see VariantContextBuilder.
/// </summary>
/// <param name="other"> the VariantContext to copy </param>
protected internal VariantContext (VariantContext other)
    : this (
        other.Source,
        other.ID,
        other.Chr,
        other.Start,
        other.End,
        other.Alleles,
        other.Genotypes,
        other.Log10PError,
        other.FiltersMaybeNull,
        other.Attributes,
        other.FullyDecoded,
        NO_VALIDATION)
{
}
/// <summary>
/// Determine which genotype fields are in use in the genotypes in VC
/// </summary>
/// <param name="vc"> the variant context whose genotypes are inspected </param>
/// <returns>
/// a sorted list of the genotype fields in use in VC, with GT placed first when at
/// least one genotype is available. When no field is in use at all, a
/// single-element list holding GT is returned.
/// </returns>
private static IList <string> calcVCFGenotypeKeys(VariantContext vc)
{
    //TODO: not sure who wrote this, these boolean flags should be removed though
    HashSet <string> keys = new HashSet <string>();

    bool sawGoodGT         = false;
    bool sawGoodQual       = false;
    bool sawGenotypeFilter = false;
    bool sawDP             = false;
    bool sawAD             = false;
    bool sawPL             = false;

    foreach (Genotype g in vc.Genotypes)
    {
        //todo, make this a string later
        foreach (string s in g.ExtendedAttributes.Keys.Select(x => x.ToString()))
        {
            keys.Add(s);
        }

        if (g.Available)
        {
            sawGoodGT = true;
        }
        if (g.HasGQ)
        {
            sawGoodQual = true;
        }
        if (g.HasDP)
        {
            sawDP = true;
        }
        if (g.HasAD)
        {
            sawAD = true;
        }
        if (g.HasPL)
        {
            sawPL = true;
        }
        if (g.Filtered)
        {
            sawGenotypeFilter = true;
        }
    }

    if (sawGoodQual)
    {
        // this branch used to be empty, so genotype quality was detected but never emitted as a FORMAT key
        keys.Add(VCFConstants.GENOTYPE_QUALITY_KEY);
    }
    if (sawDP)
    {
        keys.Add(VCFConstants.DEPTH_KEY);
    }
    if (sawAD)
    {
        keys.Add(VCFConstants.GENOTYPE_ALLELE_DEPTHS);
    }
    if (sawPL)
    {
        keys.Add(VCFConstants.GENOTYPE_PL_KEY);
    }
    if (sawGenotypeFilter)
    {
        keys.Add(VCFConstants.GENOTYPE_FILTER_KEY);
    }

    IList <string> sortedList = ParsingUtils.SortList(new List <string>(keys));

    // make sure the GT is first
    if (sawGoodGT)
    {
        List <string> newList = new List <string>(sortedList.Count + 1);
        newList.Add(VCFConstants.GENOTYPE_KEY);
        newList.AddRange(sortedList);
        sortedList = newList;
    }

    if (sortedList.Count == 0)
    {
        // this needs to be done in case all samples are no-calls
        return(new List <string>() { VCFConstants.GENOTYPE_KEY });
    }

    return(sortedList);
}
/// <summary>
/// Compares the alternate alleles of this VariantContext with those of another one.
/// The two lists must have the same length and every alternate allele of this
/// context must be contained in the other's list.
/// </summary>
/// <param name="other"> VariantContext whose alternate alleles to compare against </param>
/// <returns> true if this VariantContext has the same alternate alleles as other,
/// regardless of ordering. Otherwise returns false. </returns>
public bool HasSameAlternateAllelesAs (VariantContext other)
{
    IList<Allele> mine = AlternateAlleles;
    IList<Allele> theirs = other.AlternateAlleles;

    bool sameLength = mine.Count == theirs.Count;
    if (!sameLength) {
        return false;
    }

    for (int index = 0; index < mine.Count; index++) {
        bool present = theirs.Contains (mine [index]);
        if (!present) {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Builds the text line for one record: CHROM, POS, ID, REF, ALT, QUAL, FILTER
/// and INFO, followed by the FORMAT and genotype columns when there are any.
/// The line is returned, terminated by "\n"; nothing is written by this method itself.
/// </summary>
/// <param name="vc">The Variant Context object </param>
/// <returns> the complete line for the record, including the trailing newline </returns>
/// <exception cref="Exception"> wraps an IOException raised while the line is being assembled </exception>
protected string getVariantLinetoWrite(VariantContext vc)
{
    if (doNotWriteGenotypes)
    {
        vc = (new VariantContextBuilder(vc)).noGenotypes().make();
    }

    try
    {
        //Convert alleles to 1,2,3,etc. numbering
        IDictionary <Allele, string> alleleMap = buildAlleleMap(vc);

        StringBuilder lineToWrite = new StringBuilder();

        // CHROM, POS, ID, REF
        lineToWrite.Append(String.Join(VCFConstants.FIELD_SEPARATOR, vc.Chr, vc.Start.ToString(), vc.ID, vc.Reference.DisplayString));
        // String.Join only separates its own arguments, so REF must be terminated
        // explicitly; without this REF and ALT run together in a single column.
        lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);

        // ALT
        if (vc.Variant)
        {
            Allele altAllele = vc.GetAlternateAllele(0);
            string alt       = altAllele.DisplayString;
            lineToWrite.Append(alt);

            for (int i = 1; i < vc.AlternateAlleles.Count; i++)
            {
                altAllele = vc.GetAlternateAllele(i);
                alt       = altAllele.DisplayString;
                lineToWrite.Append(",");
                lineToWrite.Append(alt);
            }
        }
        else
        {
            lineToWrite.Append(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
        }
        lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);

        // QUAL
        if (!vc.HasLog10PError)
        {
            lineToWrite.Append(VCFConstants.MISSING_VALUE_v4);
        }
        else
        {
            lineToWrite.Append(formatQualValue(vc.PhredScaledQual));
        }
        lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);

        // FILTER
        string filters = getFilterString(vc);
        lineToWrite.Append(filters);
        lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);

        // INFO (a sorted dictionary, so keys come out in sorted order)
        IDictionary <string, string> infoFields = new SortedDictionary <string, string>();
        foreach (KeyValuePair <string, object> field in vc.Attributes)
        {
            string key = field.Key;
            if (!mHeader.hasInfoLine(key))
            {
                fieldIsMissingFromHeaderError(vc, key, "INFO");
            }

            string outputValue = formatVCFField(field.Value);
            if (outputValue != null)
            {
                infoFields[key] = outputValue;
            }
        }
        lineToWrite.Append(getInfoString(infoFields));

        // FORMAT
        GenotypesContext gc = vc.Genotypes;
        if (gc.LazyWithData && ((LazyGenotypesContext)gc).UnparsedGenotypeData is string)
        {
            // genotypes were never parsed: pass the stored text through unchanged
            lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);
            lineToWrite.Append(((LazyGenotypesContext)gc).UnparsedGenotypeData.ToString());
        }
        else
        {
            IList <string> genotypeAttributeKeys = calcVCFGenotypeKeys(vc);
            if (genotypeAttributeKeys.Count > 0)
            {
                foreach (String format in genotypeAttributeKeys)
                {
                    if (!mHeader.hasFormatLine(format))
                    {
                        fieldIsMissingFromHeaderError(vc, format, "FORMAT");
                    }
                }

                string genotypeFormatString = String.Join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
                lineToWrite.Append(VCFConstants.FIELD_SEPARATOR);
                lineToWrite.Append(genotypeFormatString);
                lineToWrite.Append(getGenotypeDataText(vc, alleleMap, genotypeAttributeKeys));
            }
        }

        lineToWrite.Append("\n");
        return(lineToWrite.ToString());
    }
    catch (IOException e)
    {
        throw new Exception("Unable to write the VCF object:\n " + vc.ToString() + "\n", e);
    }
}