예제 #1
0
        /// <summary>
        /// Builds a genotype for the given sample from its alleles and a set of likelihood values.
        /// </summary>
        /// <param name="sampleName"> sample name handed to the <c>GenotypeBuilder</c> constructor </param>
        /// <param name="alleles"> alleles handed to the <c>GenotypeBuilder</c> constructor </param>
        /// <param name="gls"> values passed to <c>setPL</c>; NOTE(review): presumably genotype likelihoods, confirm how <c>setPL</c> interprets a double[] </param>
        /// <returns> the genotype produced by the builder's <c>Make()</c> </returns>
        protected internal static Genotype Create(string sampleName, List <Allele> alleles, double[] gls)
        {
            GenotypeBuilder builder = new GenotypeBuilder(sampleName, alleles);
            builder.setPL(gls);

            Genotype built = builder.Make();
            return built;
        }
예제 #2
0
        /// <summary>
        /// Builds a genotype for the given sample from its alleles and a set of attributes.
        /// </summary>
        /// <param name="sampleName"> sample name handed to the <c>GenotypeBuilder</c> constructor </param>
        /// <param name="alleles"> alleles handed to the <c>GenotypeBuilder</c> constructor </param>
        /// <param name="attributes"> key/value pairs passed to the builder's <c>AddAttributes</c> </param>
        /// <returns> the genotype produced by the builder's <c>Make()</c> </returns>
        public static Genotype Create(string sampleName, List <Allele> alleles, Dictionary <string, object> attributes)
        {
            GenotypeBuilder builder = new GenotypeBuilder(sampleName, alleles);
            builder.AddAttributes(attributes);

            Genotype built = builder.Make();
            return built;
        }
예제 #3
0
		/// <summary>
		/// Returns a copy of a genotype with its extended attributes run through <c>fullyDecodeAttributes</c>.
		/// </summary>
		/// <param name="g"> the genotype to copy; its <c>ExtendedAttributes</c> are the ones decoded </param>
		/// <param name="header"> header passed through to <c>fullyDecodeAttributes</c> </param>
		/// <returns> a genotype built from <paramref name="g"/> with the decoded attributes added </returns>
		private Genotype fullyDecodeGenotypes (Genotype g, VCFHeader header)
		{
			// Decode first, then seed the builder from the original genotype and layer the decoded values on top.
			IDictionary<string, object> decodedAttributes = fullyDecodeAttributes (g.ExtendedAttributes, header, true);

			GenotypeBuilder builder = new GenotypeBuilder (g);
			builder.AddAttributes (decodedAttributes);

			Genotype decoded = builder.Make ();
			return decoded;
		}
예제 #4
0
        /// <summary>
        /// Builds the text of the per-sample genotype columns for a record.
        /// </summary>
        /// <remarks>
        /// For every sample name in <c>mHeader.GenotypeSampleNames</c> a field separator is appended, followed by the
        /// sample's values for each key in <paramref name="genotypeFormatKeys"/>. Samples for which the variant
        /// context has no genotype are written as a missing genotype with the context's maximum ploidy.
        /// Trailing missing values are dropped from each sample. Integer values are formatted with the invariant
        /// culture so that the produced text does not depend on the current thread culture.
        /// </remarks>
        /// <param name="vc">                     the variant context </param>
        /// <param name="alleleMap">              alleles for this context </param>
        /// <param name="genotypeFormatKeys">  Genotype formatting string </param>
        /// <returns> the concatenated genotype columns, each preceded by a field separator </returns>
        /// <exception cref="Exception"> if the GT key is requested but a sample's genotype is not available </exception>
        private string getGenotypeDataText(VariantContext vc, IDictionary <Allele, string> alleleMap, IList <string> genotypeFormatKeys)
        {
            StringBuilder sbn    = new StringBuilder();
            int           ploidy = vc.GetMaxPloidy(2);

            // The output is machine-readable text, so numbers must never be formatted with the user's locale.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            // Loop-invariant: when GT is one of the keys its text is written straight into sbn, so even the
            // first remaining attribute has to be preceded by a genotype field separator.
            bool hasGenotypeKey = genotypeFormatKeys.Contains(VCFConstants.GENOTYPE_KEY);

            foreach (string sample in mHeader.GenotypeSampleNames)
            {
                sbn.Append(VCFConstants.FIELD_SEPARATOR);

                Genotype g = vc.GetGenotype(sample);
                if (g == null)
                {
                    g = GenotypeBuilder.CreateMissing(sample, ploidy);
                }
                IList <string> attrs = new List <string>(genotypeFormatKeys.Count);
                foreach (string field in genotypeFormatKeys)
                {
                    if (field.Equals(VCFConstants.GENOTYPE_KEY))
                    {
                        if (!g.Available)
                        {
                            throw new Exception("GTs cannot be missing for some samples if they are available for others in the record");
                        }

                        // Alleles are joined by the phased or unphased separator depending on the genotype.
                        sbn.Append(getAlleleText(g.getAllele(0), alleleMap));
                        for (int i = 1; i < g.Ploidy; i++)
                        {
                            sbn.Append(g.Phased ? VCFConstants.PHASED : VCFConstants.UNPHASED);
                            sbn.Append(getAlleleText(g.getAllele(i), alleleMap));
                        }
                        continue;
                    }

                    string outputValue;
                    if (field.Equals(VCFConstants.GENOTYPE_FILTER_KEY))
                    {
                        outputValue = g.Filtered ? g.Filters : VCFConstants.PASSES_FILTERS_v4;
                    }
                    else
                    {
                        IntGenotypeFieldAccessors.Accessor accessor = intGenotypeFieldAccessors.GetAccessor(field);
                        if (accessor != null)
                        {
                            // Keys with a dedicated integer accessor are read directly from the genotype.
                            int[] intValues = accessor.getValues(g);
                            if (intValues == null)
                            {
                                outputValue = VCFConstants.MISSING_VALUE_v4;
                            }
                            else if (intValues.Length == 1)                                 // fast path
                            {
                                outputValue = intValues[0].ToString(invariant);
                            }
                            else
                            {
                                StringBuilder sb = new StringBuilder();
                                sb.Append(intValues[0].ToString(invariant));
                                for (int i = 1; i < intValues.Length; i++)
                                {
                                    sb.Append(",");
                                    sb.Append(intValues[i].ToString(invariant));
                                }
                                outputValue = sb.ToString();
                            }
                        }
                        else
                        {
                            object val = g.HasExtendedAttribute(field) ? g.GetExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4;

                            VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field);
                            if (metaData != null)
                            {
                                int numInFormatField = metaData.getCount(vc);
                                // object.Equals is null-safe: an attribute stored with a null value must not
                                // crash the comparison against the missing-value marker.
                                if (numInFormatField > 1 && object.Equals(val, VCFConstants.MISSING_VALUE_v4))
                                {
                                    // If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
                                    // For example, if Number=2, the string has to be ".,."
                                    StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
                                    for (int i = 1; i < numInFormatField; i++)
                                    {
                                        sb.Append(",");
                                        sb.Append(VCFConstants.MISSING_VALUE_v4);
                                    }
                                    val = sb.ToString();
                                }
                            }

                            // assume that if key is absent, then the given string encoding suffices
                            outputValue = formatVCFField(val);
                        }
                    }

                    if (outputValue != null)
                    {
                        attrs.Add(outputValue);
                    }
                }

                // strip off trailing missing values
                for (int i = attrs.Count - 1; i >= 0; i--)
                {
                    if (isMissingValue(attrs[i]))
                    {
                        attrs.RemoveAt(i);
                    }
                    else
                    {
                        break;
                    }
                }

                for (int i = 0; i < attrs.Count; i++)
                {
                    if (i > 0 || hasGenotypeKey)
                    {
                        sbn.Append(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
                    }
                    sbn.Append(attrs[i]);
                }
            }
            return(sbn.ToString());
        }
예제 #5
0
 /// <summary>
 /// Creates a genotype for a sample from its alleles and an array of likelihood values.
 /// </summary>
 /// <param name="sampleName"> sample name given to the <c>GenotypeBuilder</c> constructor </param>
 /// <param name="alleles"> alleles given to the <c>GenotypeBuilder</c> constructor </param>
 /// <param name="gls"> values forwarded to <c>setPL</c>; NOTE(review): presumably genotype likelihoods, confirm how <c>setPL</c> treats a double[] </param>
 /// <returns> the result of the builder's <c>Make()</c> </returns>
 protected internal static Genotype Create(string sampleName, List<Allele> alleles, double[] gls)
 {
     GenotypeBuilder genotypeBuilder = new GenotypeBuilder(sampleName, alleles);

     genotypeBuilder.setPL(gls);

     Genotype genotype = genotypeBuilder.Make();
     return genotype;
 }
예제 #6
0
 /// <summary>
 /// Creates a genotype for a sample from its alleles and a dictionary of attributes.
 /// </summary>
 /// <param name="sampleName"> sample name given to the <c>GenotypeBuilder</c> constructor </param>
 /// <param name="alleles"> alleles given to the <c>GenotypeBuilder</c> constructor </param>
 /// <param name="attributes"> key/value pairs forwarded to the builder's <c>AddAttributes</c> </param>
 /// <returns> the result of the builder's <c>Make()</c> </returns>
 public static Genotype Create(string sampleName, List<Allele> alleles, Dictionary<string, object> attributes)
 {
     GenotypeBuilder genotypeBuilder = new GenotypeBuilder(sampleName, alleles);

     genotypeBuilder.AddAttributes(attributes);

     Genotype genotype = genotypeBuilder.Make();
     return genotype;
 }
예제 #7
0
        /// <summary>
        /// Parses the format keys and the per-sample genotype fields of a record into genotype objects.
        /// </summary>
        /// <remarks>
        /// The first field of <paramref name="str"/> holds the format keys; each following field holds the values
        /// of one sample, matched to <c>header.GenotypeSampleNames</c> by position. Numeric fields (GQ, DP) are
        /// parsed with the invariant culture so the result does not depend on the current thread culture.
        /// </remarks>
        /// <param name="str"> the string </param>
        /// <param name="alleles"> the list of alleles </param>
        /// <param name="chr"> contig name; only used in the error message when a genotype cannot be built </param>
        /// <param name="pos"> position; only used in the error message when a genotype cannot be built </param>
        /// <returns> lazy data wrapping the parsed genotypes together with the header's sample order and offsets </returns>
        /// <exception cref="VCFParsingError"> if <paramref name="str"/> cannot be split into the expected fields or a genotype cannot be built </exception>
        public LazyGenotypesContext.LazyData CreateGenotypeMap(string str, IList <Allele> alleles, string chr, int pos)
        {
            // The split buffer is allocated once and reused across calls.
            if (genotypeParts == null)
            {
                genotypeParts = new String[header.ColumnCount - NUM_STANDARD_FIELDS];
            }
            try {
                FastStringUtils.Split(str, VCFConstants.FIELD_SEPARATOR_CHAR, genotypeParts);
            } catch (Exception e) {
                throw new VCFParsingError("Could not parse genotypes, was expecting " + (genotypeParts.Length - 1).ToString() + " but found " + str.Split(VCFConstants.FIELD_SEPARATOR_CHAR).Length.ToString(), e);
            }
            List <Genotype> genotypes = new List <Genotype> (genotypeParts.Length);

            // get the format keys
            string[] genotypeKeyArray       = genotypeParts [0].Split(VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
            int      genotypeAlleleLocation = Array.IndexOf(genotypeKeyArray, VCFConstants.GENOTYPE_KEY);

            if (version != VCFHeaderVersion.VCF4_1 && genotypeAlleleLocation == -1)
            {
                generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
            }
            // clear out our allele mapping
            alleleMap.Clear();
            GenotypeBuilder gb = new GenotypeBuilder();

            // VCF is machine-readable text: numbers must be parsed independently of the user's locale.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            // cycle through the genotype strings (index 0 is the format-key field, so samples start at 1)
            for (int genotypeOffset = 1; genotypeOffset < genotypeParts.Length; genotypeOffset++)
            {
                Genotype curGenotype;
                string   sampleName  = header.GenotypeSampleNames [genotypeOffset - 1];
                var      currentGeno = genotypeParts [genotypeOffset];
                //shortcut for null alleles
                if (currentGeno == "./.")
                {
                    curGenotype = GenotypeBuilder.CreateMissing(sampleName, 2);
                }
                else if (currentGeno == ".")
                {
                    curGenotype = GenotypeBuilder.CreateMissing(sampleName, 1);
                }
                else
                {
                    gb.Reset(false);
                    gb.SampleName = sampleName;
                    string[] GTValueArray = FastStringUtils.Split(currentGeno, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR, int.MaxValue, StringSplitOptions.None);
                    // check to see if the value list is longer than the key list, which is a problem
                    if (genotypeKeyArray.Length < GTValueArray.Length)
                    {
                        generateException("There are too many keys for the sample " + sampleName + ", line is: keys = " + genotypeParts [0] + ", values = " + genotypeParts [genotypeOffset]);
                    }
                    if (genotypeAlleleLocation > 0)
                    {
                        generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
                    }

                    // Loop over all keys and decode the matching value into the genotype builder.
                    if (genotypeKeyArray.Length > 0)
                    {
                        gb.MaxAttributes(genotypeKeyArray.Length - 1);
                        for (int i = 0; i < genotypeKeyArray.Length; i++)
                        {
                            string gtKey = genotypeKeyArray [i];
                            // A sample may carry fewer values than there are keys; the rest are simply absent.
                            if (i >= GTValueArray.Length)
                            {
                                break;
                            }
                            // todo -- all of these on the fly parsing of the missing value should be static constants
                            if (gtKey == VCFConstants.GENOTYPE_FILTER_KEY)
                            {
                                IList <string> filters = parseFilters(GetCachedString(GTValueArray [i]));
                                if (filters != null)
                                {
                                    gb.SetFilters(filters.ToList());
                                }
                            }
                            else if (GTValueArray [i] == VCFConstants.MISSING_VALUE_v4)
                            {
                                // don't add missing values to the map
                            }
                            else
                            {
                                if (gtKey == VCFConstants.GENOTYPE_QUALITY_KEY)
                                {
                                    if (GTValueArray [i] == VCFConstants.MISSING_GENOTYPE_QUALITY_v3)
                                    {
                                        gb.noGQ();
                                    }
                                    else
                                    {
                                        gb.GQ = ((int)Math.Round(Convert.ToDouble(GTValueArray [i], invariant)));
                                    }
                                }
                                else if (gtKey == VCFConstants.GENOTYPE_ALLELE_DEPTHS)
                                {
                                    gb.AD = (decodeInts(GTValueArray [i]));
                                }
                                else if (gtKey == VCFConstants.GENOTYPE_PL_KEY)
                                {
                                    gb.PL = (decodeInts(GTValueArray [i]));
                                }
                                else if (gtKey == VCFConstants.GENOTYPE_LIKELIHOODS_KEY)
                                {
                                    gb.PL = (GenotypeLikelihoods.fromGLField(GTValueArray [i]).AsPLs);
                                }
                                else if (gtKey.Equals(VCFConstants.DEPTH_KEY))
                                {
                                    gb.DP = (Convert.ToInt32(GTValueArray [i], invariant));
                                }
                                else
                                {
                                    gb.AddAttribute(gtKey, GTValueArray [i]);
                                }
                            }
                        }
                    }

                    List <Allele> GTalleles;
                    if (genotypeAlleleLocation == -1)
                    {
                        GTalleles = new List <Allele> (0);
                    }
                    else
                    {
                        GTalleles = parseGenotypeAlleles(GTValueArray [genotypeAlleleLocation], alleles, alleleMap);
                    }
                    gb.Alleles = GTalleles;
                    gb.Phased  = genotypeAlleleLocation != -1 && GTValueArray [genotypeAlleleLocation].IndexOf(VCFConstants.PHASED_AS_CHAR) != -1;

                    // add it to the list
                    try {
                        curGenotype = gb.Make();
                    } catch (Exception e) {
                        // Keep the original exception as the inner exception so its stack trace is not lost.
                        throw new VCFParsingError(e.Message + ", at position " + chr + ":" + pos, e);
                    }
                }
                genotypes.Add(curGenotype);
            }
            return(new LazyGenotypesContext.LazyData(genotypes, header.SampleNamesInOrder, header.SampleNameToOffset));
        }
예제 #8
0
		/// <summary>
		/// Parses the format keys and the per-sample genotype fields of a record into genotype objects.
		/// </summary>
		/// <remarks>
		/// The first field of <paramref name="str"/> holds the format keys; each following field holds the values
		/// of one sample, matched to <c>header.GenotypeSampleNames</c> by position. Numeric fields (GQ, DP) are
		/// parsed with the invariant culture so the result does not depend on the current thread culture.
		/// </remarks>
		/// <param name="str"> the string </param>
		/// <param name="alleles"> the list of alleles </param>
		/// <param name="chr"> contig name; only used in the error message when a genotype cannot be built </param>
		/// <param name="pos"> position; only used in the error message when a genotype cannot be built </param>
		/// <returns> lazy data wrapping the parsed genotypes together with the header's sample order and offsets </returns>
		/// <exception cref="VCFParsingError"> if <paramref name="str"/> cannot be split into the expected fields or a genotype cannot be built </exception>
		public LazyGenotypesContext.LazyData CreateGenotypeMap (string str, IList<Allele> alleles, string chr, int pos)
		{
			// The split buffer is allocated once and reused across calls.
			if (genotypeParts == null) {
				genotypeParts = new String[header.ColumnCount - NUM_STANDARD_FIELDS];
			}
			try {
				FastStringUtils.Split (str, VCFConstants.FIELD_SEPARATOR_CHAR, genotypeParts);
			} catch (Exception e) {
				throw new VCFParsingError ("Could not parse genotypes, was expecting " + (genotypeParts.Length - 1).ToString () + " but found " + str.Split (VCFConstants.FIELD_SEPARATOR_CHAR).Length.ToString (), e);
			}
			List<Genotype> genotypes = new List<Genotype> (genotypeParts.Length);
			// get the format keys
			string[] genotypeKeyArray = genotypeParts [0].Split (VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
			int genotypeAlleleLocation = Array.IndexOf (genotypeKeyArray, VCFConstants.GENOTYPE_KEY);
			if (version != VCFHeaderVersion.VCF4_1 && genotypeAlleleLocation == -1) {
				generateException ("Unable to find the GT field for the record; the GT field is required in VCF4.0");
			}
			// clear out our allele mapping
			alleleMap.Clear ();
			GenotypeBuilder gb = new GenotypeBuilder ();

			// VCF is machine-readable text: numbers must be parsed independently of the user's locale.
			System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

			// cycle through the genotype strings (index 0 is the format-key field, so samples start at 1)
			for (int genotypeOffset = 1; genotypeOffset < genotypeParts.Length; genotypeOffset++) {
				Genotype curGenotype;
				string sampleName = header.GenotypeSampleNames [genotypeOffset - 1];
				var currentGeno = genotypeParts [genotypeOffset];
				//shortcut for null alleles
				if (currentGeno == "./.") {
					curGenotype = GenotypeBuilder.CreateMissing (sampleName, 2);
				} else if (currentGeno == ".") {
					curGenotype = GenotypeBuilder.CreateMissing (sampleName, 1);
				} else {
					gb.Reset (false);
					gb.SampleName = sampleName;
					string[] GTValueArray = FastStringUtils.Split (currentGeno, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR, int.MaxValue, StringSplitOptions.None);
					// check to see if the value list is longer than the key list, which is a problem
					if (genotypeKeyArray.Length < GTValueArray.Length) {
						generateException ("There are too many keys for the sample " + sampleName + ", line is: keys = " + genotypeParts [0] + ", values = " + genotypeParts [genotypeOffset]);
					}
					if (genotypeAlleleLocation > 0) {
						generateException ("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
					}

					// Loop over all keys and decode the matching value into the genotype builder.
					if (genotypeKeyArray.Length > 0) {
						gb.MaxAttributes (genotypeKeyArray.Length - 1);
						for (int i = 0; i < genotypeKeyArray.Length; i++) {
							string gtKey = genotypeKeyArray [i];
							// A sample may carry fewer values than there are keys; the rest are simply absent.
							if (i >= GTValueArray.Length) {
								break;
							}
							// todo -- all of these on the fly parsing of the missing value should be static constants
							if (gtKey == VCFConstants.GENOTYPE_FILTER_KEY) {
								IList<string> filters = parseFilters (GetCachedString (GTValueArray [i]));
								if (filters != null) {
									gb.SetFilters (filters.ToList ());
								}
							} else if (GTValueArray [i] == VCFConstants.MISSING_VALUE_v4) {
								// don't add missing values to the map
							} else {
								if (gtKey == VCFConstants.GENOTYPE_QUALITY_KEY) {
									if (GTValueArray [i] == VCFConstants.MISSING_GENOTYPE_QUALITY_v3) {
										gb.noGQ ();
									} else {
										gb.GQ = ((int)Math.Round (Convert.ToDouble (GTValueArray [i], invariant)));
									}
								} else if (gtKey == VCFConstants.GENOTYPE_ALLELE_DEPTHS) {
									gb.AD = (decodeInts (GTValueArray [i]));
								} else if (gtKey == VCFConstants.GENOTYPE_PL_KEY) {
									gb.PL = (decodeInts (GTValueArray [i]));
								} else if (gtKey == VCFConstants.GENOTYPE_LIKELIHOODS_KEY) {
									gb.PL = (GenotypeLikelihoods.fromGLField (GTValueArray [i]).AsPLs);
								} else if (gtKey.Equals (VCFConstants.DEPTH_KEY)) {
									gb.DP = (Convert.ToInt32 (GTValueArray [i], invariant));
								} else {
									gb.AddAttribute (gtKey, GTValueArray [i]);
								}
							}
						}
					}

					List<Allele> GTalleles;
					if (genotypeAlleleLocation == -1) {
						GTalleles = new List<Allele> (0);
					} else {
						GTalleles = parseGenotypeAlleles (GTValueArray [genotypeAlleleLocation], alleles, alleleMap);
					}
					gb.Alleles = GTalleles;
					gb.Phased = genotypeAlleleLocation != -1 && GTValueArray [genotypeAlleleLocation].IndexOf (VCFConstants.PHASED_AS_CHAR) != -1;

					// add it to the list
					try {
						curGenotype = gb.Make ();
					} catch (Exception e) {
						// Keep the original exception as the inner exception so its stack trace is not lost.
						throw new VCFParsingError (e.Message + ", at position " + chr + ":" + pos, e);
					}
				}
				genotypes.Add (curGenotype);
			}
			return new LazyGenotypesContext.LazyData (genotypes, header.SampleNamesInOrder, header.SampleNameToOffset);
		}