/// <summary>
/// Decodes one VCF record line into a <see cref="VariantContext"/>.
/// </summary>
/// <param name="line">The raw text of a single record line.</param>
/// <param name="includeGenotypes">Forwarded unchanged to <c>parseVCFLine</c>.</param>
/// <returns>Whatever <c>parseVCFLine</c> produces for the split columns.</returns>
/// <exception cref="VCFParsingError">
/// Thrown when the line starts with the header indicator, when no header has been set,
/// or when the number of columns does not match what the header requires.
/// </exception>
private VariantContext decodeLine(string line, bool includeGenotypes)
{
    // The same line reader is not used for parsing the header and parsing records,
    // so a line starting with the header indicator here means a header line leaked
    // into the record stream.
    if (line.StartsWith(VCFHeader.HEADER_INDICATOR))
    {
        throw new VCFParsingError("While decoding genotype lines came across a commented header line. Problem is with line:\n " + line);
    }

    // Our header cannot be null: we need the genotype sample names and counts.
    if (header == null)
    {
        throw new VCFParsingError("VCF Header cannot be null when decoding a record");
    }

    // Split into at most the standard columns plus one extra column.
    // TODO: the original code could do lazy genotype initialization, splitting off the
    // first 8 columns and leaving any genotype data lumped together in the 9th column (if present).
    int parseSize = Math.Min(header.ColumnCount, NUM_STANDARD_FIELDS + 1);
    string[] parts = FastStringUtils.Split(line, VCFConstants.FIELD_SEPARATOR_CHAR, parseSize, StringSplitOptions.None);

    // A header without genotyping data requires exactly the standard columns; a header
    // with genotyping data requires one more. The same value feeds the check and the
    // error message so the reported expectation always matches the test that failed.
    int expectedColumns = header.hasGenotypingData() ? NUM_STANDARD_FIELDS + 1 : NUM_STANDARD_FIELDS;
    if (parts.Length != expectedColumns)
    {
        throw new VCFParsingError("Line " + lineNo + ": there aren't enough columns for line " + line + " (we expected " + expectedColumns + " tokens, and saw " + parts.Length + " )");
    }

    return parseVCFLine(parts, includeGenotypes);
}
/// <summary>
/// Create a genotype map
/// </summary>
/// <param name="str"> the field-separated genotype text: the format-key column followed by one column per sample </param>
/// <param name="alleles"> the list of alleles </param>
/// <param name="chr"> chromosome name, used only when reporting an error </param>
/// <param name="pos"> position, used only when reporting an error </param>
/// <returns> a mapping of sample name to genotype object </returns>
/// <exception cref="VCFParsingError">
/// Thrown when the genotype columns cannot be split into the expected number of parts,
/// or when building a genotype fails (the original exception is kept as the inner exception).
/// </exception>
public LazyGenotypesContext.LazyData CreateGenotypeMap(string str, IList<Allele> alleles, string chr, int pos)
{
    // The split buffer is allocated once and reused on later calls.
    if (genotypeParts == null)
    {
        genotypeParts = new String[header.ColumnCount - NUM_STANDARD_FIELDS];
    }

    try
    {
        FastStringUtils.Split(str, VCFConstants.FIELD_SEPARATOR_CHAR, genotypeParts);
    }
    catch (Exception e)
    {
        // Both counts exclude the leading format-key column so that they are comparable.
        int expectedSamples = genotypeParts.Length - 1;
        int foundSamples = str.Split(VCFConstants.FIELD_SEPARATOR_CHAR).Length - 1;
        throw new VCFParsingError("Could not parse genotypes, was expecting " + expectedSamples.ToString() + " but found " + foundSamples.ToString(), e);
    }

    List<Genotype> genotypes = new List<Genotype>(genotypeParts.Length);

    // The first part holds the format keys; the remaining parts hold one sample each.
    string[] genotypeKeyArray = genotypeParts[0].Split(VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
    int genotypeAlleleLocation = Array.IndexOf(genotypeKeyArray, VCFConstants.GENOTYPE_KEY);
    if (version != VCFHeaderVersion.VCF4_1 && genotypeAlleleLocation == -1)
    {
        generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
    }

    // clear out our allele mapping
    alleleMap.Clear();

    // Numeric fields in the file always use '.' as the decimal separator, so they must be
    // parsed independently of the current thread culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    GenotypeBuilder gb = new GenotypeBuilder();

    // cycle through the genotype strings (offset 0 is the format-key column)
    for (int genotypeOffset = 1; genotypeOffset < genotypeParts.Length; genotypeOffset++)
    {
        Genotype curGenotype;
        string sampleName = header.GenotypeSampleNames[genotypeOffset - 1];
        var currentGeno = genotypeParts[genotypeOffset];

        // shortcut for null alleles
        if (currentGeno == "./.")
        {
            curGenotype = GenotypeBuilder.CreateMissing(sampleName, 2);
        }
        else if (currentGeno == ".")
        {
            curGenotype = GenotypeBuilder.CreateMissing(sampleName, 1);
        }
        else
        {
            gb.Reset(false);
            gb.SampleName = sampleName;
            string[] GTValueArray = FastStringUtils.Split(currentGeno, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR, int.MaxValue, StringSplitOptions.None);

            // check to see if the value list is longer than the key list, which is a problem
            if (genotypeKeyArray.Length < GTValueArray.Length)
            {
                generateException("There are too many keys for the sample " + sampleName + ", line is: keys = " + genotypeParts[0] + ", values = " + genotypeParts[genotypeOffset]);
            }
            if (genotypeAlleleLocation > 0)
            {
                generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
            }

            // Loop over all format keys and decode the matching value into the genotype builder.
            if (genotypeKeyArray.Length > 0)
            {
                gb.MaxAttributes(genotypeKeyArray.Length - 1);
                for (int i = 0; i < genotypeKeyArray.Length; i++)
                {
                    string gtKey = genotypeKeyArray[i];

                    // Fewer values than keys: the remaining keys have no value for this sample.
                    if (i >= GTValueArray.Length)
                    {
                        break;
                    }

                    // todo -- all of these on the fly parsing of the missing value should be static constants
                    if (gtKey == VCFConstants.GENOTYPE_FILTER_KEY)
                    {
                        IList<string> filters = parseFilters(GetCachedString(GTValueArray[i]));
                        if (filters != null)
                        {
                            gb.SetFilters(filters.ToList());
                        }
                    }
                    else if (GTValueArray[i] == VCFConstants.MISSING_VALUE_v4)
                    {
                        // don't add missing values to the map
                    }
                    else
                    {
                        if (gtKey == VCFConstants.GENOTYPE_QUALITY_KEY)
                        {
                            if (GTValueArray[i] == VCFConstants.MISSING_GENOTYPE_QUALITY_v3)
                            {
                                gb.noGQ();
                            }
                            else
                            {
                                gb.GQ = ((int)Math.Round(Convert.ToDouble(GTValueArray[i], invariant)));
                            }
                        }
                        else if (gtKey == VCFConstants.GENOTYPE_ALLELE_DEPTHS)
                        {
                            gb.AD = (decodeInts(GTValueArray[i]));
                        }
                        else if (gtKey == VCFConstants.GENOTYPE_PL_KEY)
                        {
                            gb.PL = (decodeInts(GTValueArray[i]));
                        }
                        else if (gtKey == VCFConstants.GENOTYPE_LIKELIHOODS_KEY)
                        {
                            // GL values are stored on the builder in PL form.
                            gb.PL = (GenotypeLikelihoods.fromGLField(GTValueArray[i]).AsPLs);
                        }
                        else if (gtKey.Equals(VCFConstants.DEPTH_KEY))
                        {
                            gb.DP = (Convert.ToInt32(GTValueArray[i], invariant));
                        }
                        else
                        {
                            // NOTE(review): the GT key is not special-cased in this loop, so a
                            // non-missing GT value also reaches this fallthrough -- confirm intended.
                            gb.AddAttribute(gtKey, GTValueArray[i]);
                        }
                    }
                }
            }

            List<Allele> GTalleles;
            if (genotypeAlleleLocation == -1)
            {
                GTalleles = new List<Allele>(0);
            }
            else
            {
                GTalleles = parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap);
            }
            gb.Alleles = GTalleles;
            gb.Phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].IndexOf(VCFConstants.PHASED_AS_CHAR) != -1;

            // add it to the list
            try
            {
                curGenotype = gb.Make();
            }
            catch (Exception e)
            {
                // Keep the original exception as the inner exception so its stack trace survives.
                throw new VCFParsingError(e.Message + ", at position " + chr + ":" + pos, e);
            }
        }
        genotypes.Add(curGenotype);
    }

    return new LazyGenotypesContext.LazyData(genotypes, header.SampleNamesInOrder, header.SampleNameToOffset);
}