Exemplo n.º 1
0
        /// <summary>
        /// Splits a single record line on VCFConstants.FIELD_SEPARATOR_CHAR, checks the number of
        /// columns against what the header requires, and passes the columns to parseVCFLine.
        /// </summary>
        /// <param name="line"> the raw record line; must not start with VCFHeader.HEADER_INDICATOR </param>
        /// <param name="includeGenotypes"> forwarded unchanged to parseVCFLine </param>
        /// <returns> whatever parseVCFLine returns for the split columns </returns>
        /// <exception cref="VCFParsingError">
        /// The line is a header line, the header has not been set, or the column count does not match the header.
        /// </exception>
        private VariantContext decodeLine(string line, bool includeGenotypes)
        {
            // The same line reader is not used for parsing the header and parsing records,
            // so a header indicator here means a header line leaked into the record stream.
            if (line.StartsWith(VCFHeader.HEADER_INDICATOR))
            {
                throw new VCFParsingError("While decoding genotype lines came across a commented header line.  Problem is with line:\n " + line);
            }

            // Our header cannot be null: we need the genotype sample names and counts.
            if (header == null)
            {
                throw new VCFParsingError("VCF Header cannot be null when decoding a record");
            }

            // Never split into more than the standard columns plus one; any genotype data stays
            // lumped together in the final column so it can be decoded separately.
            // (Original author's note: the Math.Min may not be necessary.)
            int parseSize = Math.Min(header.ColumnCount, NUM_STANDARD_FIELDS + 1);
            string[] parts = FastStringUtils.Split(line, VCFConstants.FIELD_SEPARATOR_CHAR, parseSize, StringSplitOptions.None);

            // A header without genotyping data requires exactly the standard columns; a header with
            // genotyping data requires one more. Compute the expectation once so the check and the
            // error message can never disagree (the header is known to be non-null at this point).
            int expectedColumns = header.hasGenotypingData() ? NUM_STANDARD_FIELDS + 1 : NUM_STANDARD_FIELDS;
            if (parts.Length != expectedColumns)
            {
                throw new VCFParsingError("Line " + lineNo + ": there aren't enough columns for line " + line + " (we expected " + expectedColumns + " tokens, and saw " + parts.Length + " )");
            }

            return parseVCFLine(parts, includeGenotypes);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Decodes the genotype portion of a record into one Genotype per sample.
        /// </summary>
        /// <param name="str">
        /// the genotype columns of the record: the format key list first, followed by one field per
        /// sample, all separated by VCFConstants.FIELD_SEPARATOR_CHAR
        /// </param>
        /// <param name="alleles"> the list of alleles, passed to parseGenotypeAlleles to resolve each sample's GT value </param>
        /// <param name="chr"> chromosome/contig name; only used in error messages </param>
        /// <param name="pos"> position of the record; only used in error messages </param>
        /// <returns> lazy genotype data holding the decoded genotypes plus the header's sample ordering and offsets </returns>
        /// <exception cref="VCFParsingError">
        /// The genotype columns cannot be split into the expected number of fields, or a genotype cannot be built.
        /// </exception>
        public LazyGenotypesContext.LazyData CreateGenotypeMap(string str, IList <Allele> alleles, string chr, int pos)
        {
            // genotypeParts is a reusable buffer: slot 0 is the format key list, slots 1..n are the samples.
            // NOTE(review): genotypeParts and alleleMap look like shared instance state reused across calls - confirm
            // before calling this concurrently.
            if (genotypeParts == null)
            {
                genotypeParts = new String[header.ColumnCount - NUM_STANDARD_FIELDS];
            }
            try {
                FastStringUtils.Split(str, VCFConstants.FIELD_SEPARATOR_CHAR, genotypeParts);
            } catch (Exception e) {
                // Both figures are sample counts: each total is reduced by one for the leading format column.
                int expectedSamples = genotypeParts.Length - 1;
                int foundSamples    = str.Split(VCFConstants.FIELD_SEPARATOR_CHAR).Length - 1;
                throw new VCFParsingError("Could not parse genotypes, was expecting " + expectedSamples + " but found " + foundSamples, e);
            }
            List <Genotype> genotypes = new List <Genotype> (genotypeParts.Length);

            // get the format keys and locate the GT key among them (-1 when absent)
            string[] genotypeKeyArray       = genotypeParts [0].Split(VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
            int      genotypeAlleleLocation = Array.IndexOf(genotypeKeyArray, VCFConstants.GENOTYPE_KEY);

            if (version != VCFHeaderVersion.VCF4_1 && genotypeAlleleLocation == -1)
            {
                generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
            }
            // clear out our allele mapping
            alleleMap.Clear();
            GenotypeBuilder gb = new GenotypeBuilder();

            // VCF numbers are always written with '.' as the decimal separator, so numeric fields must be
            // parsed culture-independently rather than with the current thread culture.
            IFormatProvider invariant = System.Globalization.CultureInfo.InvariantCulture;

            // cycle through the genotype strings (slot 0 is the key list, so samples start at 1)
            for (int genotypeOffset = 1; genotypeOffset < genotypeParts.Length; genotypeOffset++)
            {
                Genotype curGenotype;
                string   sampleName  = header.GenotypeSampleNames [genotypeOffset - 1];
                var      currentGeno = genotypeParts [genotypeOffset];
                // shortcuts for entirely missing genotypes: skip the builder and create them directly
                if (currentGeno == "./.")
                {
                    curGenotype = GenotypeBuilder.CreateMissing(sampleName, 2);
                }
                else if (currentGeno == ".")
                {
                    curGenotype = GenotypeBuilder.CreateMissing(sampleName, 1);
                }
                else
                {
                    gb.Reset(false);
                    gb.SampleName = sampleName;
                    string[] GTValueArray = FastStringUtils.Split(currentGeno, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR, int.MaxValue, StringSplitOptions.None);

                    // check to see if the value list is longer than the key list, which is a problem
                    if (genotypeKeyArray.Length < GTValueArray.Length)
                    {
                        generateException("There are too many keys for the sample " + sampleName + ", line is: keys = " + genotypeParts [0] + ", values = " + genotypeParts [genotypeOffset]);
                    }
                    if (genotypeAlleleLocation > 0)
                    {
                        generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
                    }

                    // Walk the keys and values in lock step, feeding each decoded value to the builder.
                    // A sample may supply fewer values than there are keys; trailing keys are then skipped.
                    if (genotypeKeyArray.Length > 0)
                    {
                        gb.MaxAttributes(genotypeKeyArray.Length - 1);
                        int fieldCount = Math.Min(genotypeKeyArray.Length, GTValueArray.Length);
                        for (int i = 0; i < fieldCount; i++)
                        {
                            string gtKey   = genotypeKeyArray [i];
                            string gtValue = GTValueArray [i];
                            // todo -- all of these on the fly parsing of the missing value should be static constants
                            if (gtKey == VCFConstants.GENOTYPE_FILTER_KEY)
                            {
                                // filters are handled before the missing-value check, as in the original ordering
                                IList <string> filters = parseFilters(GetCachedString(gtValue));
                                if (filters != null)
                                {
                                    gb.SetFilters(filters.ToList());
                                }
                            }
                            else if (gtValue == VCFConstants.MISSING_VALUE_v4)
                            {
                                // don't add missing values to the map
                            }
                            else if (gtKey == VCFConstants.GENOTYPE_QUALITY_KEY)
                            {
                                if (gtValue == VCFConstants.MISSING_GENOTYPE_QUALITY_v3)
                                {
                                    gb.noGQ();
                                }
                                else
                                {
                                    // GQ may be written as a decimal; it is stored rounded to the nearest int
                                    gb.GQ = ((int)Math.Round(Convert.ToDouble(gtValue, invariant)));
                                }
                            }
                            else if (gtKey == VCFConstants.GENOTYPE_ALLELE_DEPTHS)
                            {
                                gb.AD = (decodeInts(gtValue));
                            }
                            else if (gtKey == VCFConstants.GENOTYPE_PL_KEY)
                            {
                                gb.PL = (decodeInts(gtValue));
                            }
                            else if (gtKey == VCFConstants.GENOTYPE_LIKELIHOODS_KEY)
                            {
                                // GL values are converted and stored through the same PL property
                                gb.PL = (GenotypeLikelihoods.fromGLField(gtValue).AsPLs);
                            }
                            else if (gtKey == VCFConstants.DEPTH_KEY)
                            {
                                gb.DP = (Convert.ToInt32(gtValue, invariant));
                            }
                            else
                            {
                                // anything unrecognised is kept verbatim as a string attribute
                                gb.AddAttribute(gtKey, gtValue);
                            }
                        }
                    }

                    // Resolve the alleles from the GT value when the record has a GT key; otherwise the
                    // genotype carries no alleles and cannot be phased.
                    List <Allele> GTalleles;
                    if (genotypeAlleleLocation == -1)
                    {
                        GTalleles = new List <Allele> (0);
                    }
                    else
                    {
                        GTalleles = parseGenotypeAlleles(GTValueArray [genotypeAlleleLocation], alleles, alleleMap);
                    }
                    gb.Alleles = GTalleles;
                    gb.Phased  = genotypeAlleleLocation != -1 && GTValueArray [genotypeAlleleLocation].IndexOf(VCFConstants.PHASED_AS_CHAR) != -1;

                    // build the genotype, adding the record location to any failure and keeping the
                    // original exception as the inner exception so its stack trace is not lost
                    try {
                        curGenotype = gb.Make();
                    } catch (Exception e) {
                        throw new VCFParsingError(e.Message + ", at position " + chr + ":" + pos, e);
                    }
                }
                genotypes.Add(curGenotype);
            }
            return(new LazyGenotypesContext.LazyData(genotypes, header.SampleNamesInOrder, header.SampleNameToOffset));
        }