/// <summary>
/// Builds a genotype for the named sample from its alleles and an array of likelihood values.
/// </summary>
/// <param name="sampleName"> name of the sample, handed to the <see cref="GenotypeBuilder"/> constructor </param>
/// <param name="alleles"> alleles handed to the <see cref="GenotypeBuilder"/> constructor </param>
/// <param name="gls"> values forwarded to <c>GenotypeBuilder.setPL</c>.
/// NOTE(review): the parameter name suggests genotype likelihoods while the setter is named PL;
/// confirm what conversion, if any, the builder applies. </param>
/// <returns> the result of <c>GenotypeBuilder.Make</c> </returns>
protected internal static Genotype Create(string sampleName, List<Allele> alleles, double[] gls)
{
    GenotypeBuilder builder = new GenotypeBuilder(sampleName, alleles);
    builder.setPL(gls);
    return builder.Make();
}
/// <summary>
/// Builds a genotype for the named sample from its alleles and a set of attributes.
/// </summary>
/// <param name="sampleName"> name of the sample, handed to the <see cref="GenotypeBuilder"/> constructor </param>
/// <param name="alleles"> alleles handed to the <see cref="GenotypeBuilder"/> constructor </param>
/// <param name="attributes"> key/value pairs forwarded to <c>GenotypeBuilder.AddAttributes</c> </param>
/// <returns> the result of <c>GenotypeBuilder.Make</c> </returns>
public static Genotype Create(string sampleName, List<Allele> alleles, Dictionary<string, object> attributes)
{
    GenotypeBuilder builder = new GenotypeBuilder(sampleName, alleles);
    builder.AddAttributes(attributes);
    return builder.Make();
}
/// <summary>
/// Returns a copy of a genotype with the decoded form of its extended attributes added to it.
/// </summary>
/// <param name="g"> genotype whose <c>ExtendedAttributes</c> are decoded and which seeds the new builder </param>
/// <param name="header"> header passed through to <c>fullyDecodeAttributes</c> </param>
/// <returns> the genotype made by a builder seeded from <paramref name="g"/> after the decoded attributes were added </returns>
private Genotype fullyDecodeGenotypes(Genotype g, VCFHeader header)
{
    // Decode first, then build: same evaluation order as before.
    IDictionary<string, object> decodedAttributes = fullyDecodeAttributes(g.ExtendedAttributes, header, true);

    GenotypeBuilder builder = new GenotypeBuilder(g);
    builder.AddAttributes(decodedAttributes);
    return builder.Make();
}
/// <summary>
/// Builds the text of the genotype columns of one record: for every sample listed in
/// <c>mHeader.GenotypeSampleNames</c>, a field separator followed by that sample's values for the
/// requested format keys.
/// </summary>
/// <remarks>
/// Samples with no genotype in <paramref name="vc"/> are written as a missing genotype whose ploidy is
/// <c>vc.GetMaxPloidy(2)</c>. The GT text is appended as soon as the GT key is encountered; all other
/// values are collected, trailing missing values are dropped, and the remainder is appended separated
/// by <c>VCFConstants.GENOTYPE_FIELD_SEPARATOR</c>. Integer values are formatted with the invariant
/// culture so the output does not depend on the current thread culture.
/// </remarks>
/// <param name="vc"> the variant context </param>
/// <param name="alleleMap"> alleles for this context </param>
/// <param name="genotypeFormatKeys"> Genotype formatting string </param>
/// <returns> the concatenated genotype columns, each one preceded by <c>VCFConstants.FIELD_SEPARATOR</c> </returns>
/// <exception cref="Exception"> if the GT key is requested but a sample's genotype is not available </exception>
private string getGenotypeDataText(VariantContext vc, IDictionary<Allele, string> alleleMap, IList<string> genotypeFormatKeys)
{
    // VCF is machine-readable text; never let the current culture decide how integers are rendered.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    StringBuilder sbn = new StringBuilder();
    int ploidy = vc.GetMaxPloidy(2);

    foreach (string sample in mHeader.GenotypeSampleNames)
    {
        sbn.Append(VCFConstants.FIELD_SEPARATOR);

        Genotype g = vc.GetGenotype(sample);
        if (g == null)
        {
            g = GenotypeBuilder.CreateMissing(sample, ploidy);
        }

        IList<string> attrs = new List<string>(genotypeFormatKeys.Count);
        foreach (string field in genotypeFormatKeys)
        {
            if (field.Equals(VCFConstants.GENOTYPE_KEY))
            {
                if (!g.Available)
                {
                    throw new Exception("GTs cannot be missing for some samples if they are available for others in the record");
                }

                // GT goes straight into the output; it is not part of attrs.
                sbn.Append(getAlleleText(g.getAllele(0), alleleMap));
                for (int i = 1; i < g.Ploidy; i++)
                {
                    sbn.Append(g.Phased ? VCFConstants.PHASED : VCFConstants.UNPHASED);
                    sbn.Append(getAlleleText(g.getAllele(i), alleleMap));
                }
                continue;
            }
            else
            {
                string outputValue;
                if (field.Equals(VCFConstants.GENOTYPE_FILTER_KEY))
                {
                    outputValue = g.Filtered ? g.Filters : VCFConstants.PASSES_FILTERS_v4;
                }
                else
                {
                    IntGenotypeFieldAccessors.Accessor accessor = intGenotypeFieldAccessors.GetAccessor(field);
                    if (accessor != null)
                    {
                        // Field with a dedicated integer accessor.
                        int[] intValues = accessor.getValues(g);
                        if (intValues == null)
                        {
                            outputValue = VCFConstants.MISSING_VALUE_v4;
                        }
                        else if (intValues.Length == 1) // fast path
                        {
                            outputValue = intValues[0].ToString(invariant);
                        }
                        else
                        {
                            StringBuilder sb = new StringBuilder();
                            sb.Append(intValues[0].ToString(invariant));
                            for (int i = 1; i < intValues.Length; i++)
                            {
                                sb.Append(",");
                                sb.Append(intValues[i].ToString(invariant));
                            }
                            outputValue = sb.ToString();
                        }
                    }
                    else
                    {
                        // Generic field stored as an extended attribute.
                        object val = g.HasExtendedAttribute(field) ? g.GetExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4;

                        VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field);
                        if (metaData != null)
                        {
                            int numInFormatField = metaData.getCount(vc);
                            if (numInFormatField > 1 && val.Equals(VCFConstants.MISSING_VALUE_v4))
                            {
                                // If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
                                // For example, if Number=2, the string has to be ".,."
                                StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
                                for (int i = 1; i < numInFormatField; i++)
                                {
                                    sb.Append(",");
                                    sb.Append(VCFConstants.MISSING_VALUE_v4);
                                }
                                val = sb.ToString();
                            }
                        }

                        // assume that if key is absent, then the given string encoding suffices
                        outputValue = formatVCFField(val);
                    }
                }

                if (outputValue != null)
                {
                    attrs.Add(outputValue);
                }
            }
        }

        // strip off trailing missing values
        for (int i = attrs.Count - 1; i >= 0; i--)
        {
            if (isMissingValue(attrs[i]))
            {
                attrs.RemoveAt(i);
            }
            else
            {
                break;
            }
        }

        for (int i = 0; i < attrs.Count; i++)
        {
            // The first attribute only needs a leading separator when GT text was written before it.
            if (i > 0 || genotypeFormatKeys.Contains(VCFConstants.GENOTYPE_KEY))
            {
                sbn.Append(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
            }
            sbn.Append(attrs[i]);
        }
    }

    return sbn.ToString();
}
/// <summary>
/// Builds a genotype for the named sample from its alleles and an array of likelihood values.
/// </summary>
/// <param name="sampleName"> name of the sample, handed to the <see cref="GenotypeBuilder"/> constructor </param>
/// <param name="alleles"> alleles handed to the <see cref="GenotypeBuilder"/> constructor </param>
/// <param name="gls"> values forwarded to <c>GenotypeBuilder.setPL</c>.
/// NOTE(review): the parameter name suggests genotype likelihoods while the setter is named PL;
/// confirm what conversion, if any, the builder applies. </param>
/// <returns> the result of <c>GenotypeBuilder.Make</c> </returns>
protected internal static Genotype Create(string sampleName, List<Allele> alleles, double[] gls)
{
    GenotypeBuilder builder = new GenotypeBuilder(sampleName, alleles);
    builder.setPL(gls);
    return builder.Make();
}
/// <summary>
/// Builds a genotype for the named sample from its alleles and a set of attributes.
/// </summary>
/// <param name="sampleName"> name of the sample, handed to the <see cref="GenotypeBuilder"/> constructor </param>
/// <param name="alleles"> alleles handed to the <see cref="GenotypeBuilder"/> constructor </param>
/// <param name="attributes"> key/value pairs forwarded to <c>GenotypeBuilder.AddAttributes</c> </param>
/// <returns> the result of <c>GenotypeBuilder.Make</c> </returns>
public static Genotype Create(string sampleName, List<Allele> alleles, Dictionary<string, object> attributes)
{
    GenotypeBuilder builder = new GenotypeBuilder(sampleName, alleles);
    builder.AddAttributes(attributes);
    return builder.Make();
}
/// <summary>
/// Create a genotype map
/// </summary>
/// <remarks>
/// Splits <paramref name="str"/> on <c>VCFConstants.FIELD_SEPARATOR_CHAR</c> into the instance buffer
/// <c>genotypeParts</c> (allocated on first use): element 0 holds the format keys, every following
/// element holds the values of one sample. Also clears the instance field <c>alleleMap</c> before parsing.
/// Numeric values (GQ, DP) are parsed with the invariant culture so the result does not depend on the
/// current thread culture.
/// </remarks>
/// <param name="str"> the string holding the format keys followed by one genotype field per sample </param>
/// <param name="alleles"> the list of alleles </param>
/// <param name="chr"> contig name; used only in the error message when a genotype cannot be built </param>
/// <param name="pos"> position; used only in that same error message </param>
/// <returns> lazy data built from the parsed genotypes, <c>header.SampleNamesInOrder</c> and <c>header.SampleNameToOffset</c> </returns>
/// <exception cref="VCFParsingError"> if <paramref name="str"/> cannot be split into the expected columns,
/// or if building a genotype fails (the original exception is attached as the inner exception) </exception>
public LazyGenotypesContext.LazyData CreateGenotypeMap(string str, IList<Allele> alleles, string chr, int pos)
{
    // VCF is machine-readable text whose numbers use '.' as the decimal separator;
    // parsing with the current culture would misread e.g. "99.5" under comma-decimal cultures.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    if (genotypeParts == null)
    {
        genotypeParts = new String[header.ColumnCount - NUM_STANDARD_FIELDS];
    }

    try
    {
        FastStringUtils.Split(str, VCFConstants.FIELD_SEPARATOR_CHAR, genotypeParts);
    }
    catch (Exception e)
    {
        throw new VCFParsingError("Could not parse genotypes, was expecting " + (genotypeParts.Length - 1).ToString() + " but found " + str.Split(VCFConstants.FIELD_SEPARATOR_CHAR).Length.ToString(), e);
    }

    List<Genotype> genotypes = new List<Genotype>(genotypeParts.Length);

    // get the format keys
    string[] genotypeKeyArray = genotypeParts[0].Split(VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
    int genotypeAlleleLocation = Array.IndexOf(genotypeKeyArray, VCFConstants.GENOTYPE_KEY);
    if (version != VCFHeaderVersion.VCF4_1 && genotypeAlleleLocation == -1)
    {
        generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
    }

    // clear out our allele mapping
    alleleMap.Clear();

    // One builder is reused (and reset) for every sample.
    GenotypeBuilder gb = new GenotypeBuilder();

    // cycle through the genotype strings
    for (int genotypeOffset = 1; genotypeOffset < genotypeParts.Length; genotypeOffset++)
    {
        Genotype curGenotype;
        string sampleName = header.GenotypeSampleNames[genotypeOffset - 1];
        var currentGeno = genotypeParts[genotypeOffset];

        // shortcut for null alleles
        if (currentGeno == "./.")
        {
            curGenotype = GenotypeBuilder.CreateMissing(sampleName, 2);
        }
        else if (currentGeno == ".")
        {
            curGenotype = GenotypeBuilder.CreateMissing(sampleName, 1);
        }
        else
        {
            gb.Reset(false);
            gb.SampleName = sampleName;
            string[] GTValueArray = FastStringUtils.Split(currentGeno, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR, int.MaxValue, StringSplitOptions.None);

            // check to see if the value list is longer than the key list, which is a problem
            if (genotypeKeyArray.Length < GTValueArray.Length)
            {
                generateException("There are too many keys for the sample " + sampleName + ", line is: keys = " + genotypeParts[0] + ", values = " + genotypeParts[genotypeOffset]);
            }
            if (genotypeAlleleLocation > 0)
            {
                generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
            }

            // TODO: this decoding loop should be refactored.
            // Loops over all format keys and decodes the matching value into the genotype builder.
            if (genotypeKeyArray.Length > 0)
            {
                gb.MaxAttributes(genotypeKeyArray.Length - 1);
                for (int i = 0; i < genotypeKeyArray.Length; i++)
                {
                    string gtKey = genotypeKeyArray[i];
                    if (i >= GTValueArray.Length)
                    {
                        // Fewer values than keys: the remaining keys have no value for this sample.
                        break;
                    }

                    // todo -- all of these on the fly parsing of the missing value should be static constants
                    if (gtKey == VCFConstants.GENOTYPE_FILTER_KEY)
                    {
                        IList<string> filters = parseFilters(GetCachedString(GTValueArray[i]));
                        if (filters != null)
                        {
                            gb.SetFilters(filters.ToList());
                        }
                    }
                    else if (GTValueArray[i] == VCFConstants.MISSING_VALUE_v4)
                    {
                        // don't add missing values to the map
                    }
                    else
                    {
                        if (gtKey == VCFConstants.GENOTYPE_QUALITY_KEY)
                        {
                            if (GTValueArray[i] == VCFConstants.MISSING_GENOTYPE_QUALITY_v3)
                            {
                                gb.noGQ();
                            }
                            else
                            {
                                gb.GQ = (int)Math.Round(Convert.ToDouble(GTValueArray[i], invariant));
                            }
                        }
                        else if (gtKey == VCFConstants.GENOTYPE_ALLELE_DEPTHS)
                        {
                            gb.AD = decodeInts(GTValueArray[i]);
                        }
                        else if (gtKey == VCFConstants.GENOTYPE_PL_KEY)
                        {
                            gb.PL = decodeInts(GTValueArray[i]);
                        }
                        else if (gtKey == VCFConstants.GENOTYPE_LIKELIHOODS_KEY)
                        {
                            gb.PL = GenotypeLikelihoods.fromGLField(GTValueArray[i]).AsPLs;
                        }
                        else if (gtKey.Equals(VCFConstants.DEPTH_KEY))
                        {
                            gb.DP = Convert.ToInt32(GTValueArray[i], invariant);
                        }
                        else
                        {
                            gb.AddAttribute(gtKey, GTValueArray[i]);
                        }
                    }
                }
            }

            List<Allele> GTalleles;
            if (genotypeAlleleLocation == -1)
            {
                GTalleles = new List<Allele>(0);
            }
            else
            {
                GTalleles = parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap);
            }
            gb.Alleles = GTalleles;
            gb.Phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].IndexOf(VCFConstants.PHASED_AS_CHAR) != -1;

            // add it to the list
            try
            {
                curGenotype = gb.Make();
            }
            catch (Exception e)
            {
                // Attach the original exception so its type and stack trace are not lost.
                throw new VCFParsingError(e.Message + ", at position " + chr + ":" + pos, e);
            }
        }

        genotypes.Add(curGenotype);
    }

    return new LazyGenotypesContext.LazyData(genotypes, header.SampleNamesInOrder, header.SampleNameToOffset);
}
/// <summary>
/// Create a genotype map
/// </summary>
/// <remarks>
/// Splits <paramref name="str"/> on <c>VCFConstants.FIELD_SEPARATOR_CHAR</c> into the instance buffer
/// <c>genotypeParts</c> (allocated on first use): element 0 holds the format keys, every following
/// element holds the values of one sample. Also clears the instance field <c>alleleMap</c> before parsing.
/// Numeric values (GQ, DP) are parsed with the invariant culture so the result does not depend on the
/// current thread culture.
/// </remarks>
/// <param name="str"> the string holding the format keys followed by one genotype field per sample </param>
/// <param name="alleles"> the list of alleles </param>
/// <param name="chr"> contig name; used only in the error message when a genotype cannot be built </param>
/// <param name="pos"> position; used only in that same error message </param>
/// <returns> lazy data built from the parsed genotypes, <c>header.SampleNamesInOrder</c> and <c>header.SampleNameToOffset</c> </returns>
/// <exception cref="VCFParsingError"> if <paramref name="str"/> cannot be split into the expected columns,
/// or if building a genotype fails (the original exception is attached as the inner exception) </exception>
public LazyGenotypesContext.LazyData CreateGenotypeMap(string str, IList<Allele> alleles, string chr, int pos)
{
    // VCF is machine-readable text whose numbers use '.' as the decimal separator;
    // parsing with the current culture would misread e.g. "99.5" under comma-decimal cultures.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    if (genotypeParts == null)
    {
        genotypeParts = new String[header.ColumnCount - NUM_STANDARD_FIELDS];
    }

    try
    {
        FastStringUtils.Split(str, VCFConstants.FIELD_SEPARATOR_CHAR, genotypeParts);
    }
    catch (Exception e)
    {
        throw new VCFParsingError("Could not parse genotypes, was expecting " + (genotypeParts.Length - 1).ToString() + " but found " + str.Split(VCFConstants.FIELD_SEPARATOR_CHAR).Length.ToString(), e);
    }

    List<Genotype> genotypes = new List<Genotype>(genotypeParts.Length);

    // get the format keys
    string[] genotypeKeyArray = genotypeParts[0].Split(VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
    int genotypeAlleleLocation = Array.IndexOf(genotypeKeyArray, VCFConstants.GENOTYPE_KEY);
    if (version != VCFHeaderVersion.VCF4_1 && genotypeAlleleLocation == -1)
    {
        generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
    }

    // clear out our allele mapping
    alleleMap.Clear();

    // One builder is reused (and reset) for every sample.
    GenotypeBuilder gb = new GenotypeBuilder();

    // cycle through the genotype strings
    for (int genotypeOffset = 1; genotypeOffset < genotypeParts.Length; genotypeOffset++)
    {
        Genotype curGenotype;
        string sampleName = header.GenotypeSampleNames[genotypeOffset - 1];
        var currentGeno = genotypeParts[genotypeOffset];

        // shortcut for null alleles
        if (currentGeno == "./.")
        {
            curGenotype = GenotypeBuilder.CreateMissing(sampleName, 2);
        }
        else if (currentGeno == ".")
        {
            curGenotype = GenotypeBuilder.CreateMissing(sampleName, 1);
        }
        else
        {
            gb.Reset(false);
            gb.SampleName = sampleName;
            string[] GTValueArray = FastStringUtils.Split(currentGeno, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR, int.MaxValue, StringSplitOptions.None);

            // check to see if the value list is longer than the key list, which is a problem
            if (genotypeKeyArray.Length < GTValueArray.Length)
            {
                generateException("There are too many keys for the sample " + sampleName + ", line is: keys = " + genotypeParts[0] + ", values = " + genotypeParts[genotypeOffset]);
            }
            if (genotypeAlleleLocation > 0)
            {
                generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
            }

            // TODO: this decoding loop should be refactored.
            // Loops over all format keys and decodes the matching value into the genotype builder.
            if (genotypeKeyArray.Length > 0)
            {
                gb.MaxAttributes(genotypeKeyArray.Length - 1);
                for (int i = 0; i < genotypeKeyArray.Length; i++)
                {
                    string gtKey = genotypeKeyArray[i];
                    if (i >= GTValueArray.Length)
                    {
                        // Fewer values than keys: the remaining keys have no value for this sample.
                        break;
                    }

                    // todo -- all of these on the fly parsing of the missing value should be static constants
                    if (gtKey == VCFConstants.GENOTYPE_FILTER_KEY)
                    {
                        IList<string> filters = parseFilters(GetCachedString(GTValueArray[i]));
                        if (filters != null)
                        {
                            gb.SetFilters(filters.ToList());
                        }
                    }
                    else if (GTValueArray[i] == VCFConstants.MISSING_VALUE_v4)
                    {
                        // don't add missing values to the map
                    }
                    else
                    {
                        if (gtKey == VCFConstants.GENOTYPE_QUALITY_KEY)
                        {
                            if (GTValueArray[i] == VCFConstants.MISSING_GENOTYPE_QUALITY_v3)
                            {
                                gb.noGQ();
                            }
                            else
                            {
                                gb.GQ = (int)Math.Round(Convert.ToDouble(GTValueArray[i], invariant));
                            }
                        }
                        else if (gtKey == VCFConstants.GENOTYPE_ALLELE_DEPTHS)
                        {
                            gb.AD = decodeInts(GTValueArray[i]);
                        }
                        else if (gtKey == VCFConstants.GENOTYPE_PL_KEY)
                        {
                            gb.PL = decodeInts(GTValueArray[i]);
                        }
                        else if (gtKey == VCFConstants.GENOTYPE_LIKELIHOODS_KEY)
                        {
                            gb.PL = GenotypeLikelihoods.fromGLField(GTValueArray[i]).AsPLs;
                        }
                        else if (gtKey.Equals(VCFConstants.DEPTH_KEY))
                        {
                            gb.DP = Convert.ToInt32(GTValueArray[i], invariant);
                        }
                        else
                        {
                            gb.AddAttribute(gtKey, GTValueArray[i]);
                        }
                    }
                }
            }

            List<Allele> GTalleles;
            if (genotypeAlleleLocation == -1)
            {
                GTalleles = new List<Allele>(0);
            }
            else
            {
                GTalleles = parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap);
            }
            gb.Alleles = GTalleles;
            gb.Phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].IndexOf(VCFConstants.PHASED_AS_CHAR) != -1;

            // add it to the list
            try
            {
                curGenotype = gb.Make();
            }
            catch (Exception e)
            {
                // Attach the original exception so its type and stack trace are not lost.
                throw new VCFParsingError(e.Message + ", at position " + chr + ":" + pos, e);
            }
        }

        genotypes.Add(curGenotype);
    }

    return new LazyGenotypesContext.LazyData(genotypes, header.SampleNamesInOrder, header.SampleNameToOffset);
}