Esempio n. 1
0
        /// <summary>
        ///     Populates a called allele object given an array of vcf columns.
        /// </summary>
        /// <param name="shouldTrimComplexAlleles">
        ///     When true, an allele whose type is <c>AlleleCategory.Unsupported</c> after <c>SetType()</c>
        ///     is passed to <c>VcfVariantUtilities.TrimUnsupportedAlleleType</c>.
        /// </param>
        /// <param name="cols">
        ///     The columns of one vcf line, indexed by the <c>VcfCommon.*Index</c> constants.
        ///     If null or empty, the method returns without touching <paramref name="allele"/>.
        /// </param>
        /// <param name="allele">The allele to populate in place. Must not be null when <paramref name="cols"/> has data.</param>
        /// <param name="alleleIndex">Zero-based index into the comma-separated ALT column selecting which alt allele to load.</param>
        /// <exception cref="ArgumentNullException"><paramref name="allele"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="alleleIndex"/> does not address an entry of the ALT column.</exception>
        protected static void ConvertColumnsToVariant(bool shouldTrimComplexAlleles, string[] cols, CalledAllele allele, int alleleIndex)
        {
            bool shouldOutputRcCounts = true;
            bool shouldOutputTsCounts = true;

            // Nothing to parse. (The parameter is passed by value, so assigning null to it
            // here would never be visible to the caller; the allele is simply left as-is.)
            if ((cols == null) || (cols.Length == 0))
            {
                return;
            }

            if (allele == null)
            {
                throw new ArgumentNullException(nameof(allele));
            }

            // vcf is a machine-readable format: numbers always use '.' as the decimal separator,
            // so parsing must not depend on the culture of the current thread.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            //set defaults.
            var genotypeQscore   = 0;
            var referenceSupport = 0;
            var altSupport       = 0;
            var genotypeString   = "";
            var totalCoverage    = 0;

            var variantQuality              = 0.0;
            var numAlts                     = 1;
            var noiseLevel                  = 0;
            var fractionNocalls             = 0f;
            var strandBiasInGATKScaleCoords = -100f;
            var tsCounts                    = new List <string>();

            //read in simple data
            allele.Chromosome        = cols[VcfCommon.ChromIndex];
            allele.ReferencePosition = int.Parse(cols[VcfCommon.PosIndex], invariant);
            allele.ReferenceAllele   = cols[VcfCommon.RefIndex];
            allele.Filters           = VcfVariantUtilities.MapFilterString(cols[VcfCommon.FilterIndex]);

            // CFTR uses a ".", which is not actually legal... (actually, vcf 4.1 does allow the missing value "." here. Strelka uses it)
            // When the QUAL column does not parse, the allele's VariantQscore is left untouched.
            bool gotQual = double.TryParse(
                cols[VcfCommon.QualIndex],
                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                invariant,
                out variantQuality);

            if (gotQual)
            {
                allele.VariantQscore = (int)variantQuality;
            }

            // parse the variant alleles
            var variantAlleles = cols[VcfCommon.AltIndex].Split(',');

            if ((alleleIndex < 0) || (alleleIndex >= variantAlleles.Length))
            {
                throw new ArgumentOutOfRangeException(nameof(alleleIndex), alleleIndex,
                    "Allele index does not address an entry of the ALT column.");
            }

            allele.AlternateAllele = variantAlleles[alleleIndex];
            var isRef = (allele.AlternateAllele == ".");

            // a reference call has no alt alleles at all
            numAlts = isRef ? 0 : variantAlleles.Length;

            // parse the info field data (presume, single  sample)
            Dictionary <string, string> infoFields = ParseInfoFields(cols);

            // parse the genotype data (presume, single  sample)
            List <Dictionary <string, string> > genotypes = ParseGenotypeData(cols);

            //get more complex allele data...

            if (infoFields.ContainsKey("DP"))
            {
                totalCoverage = int.Parse(infoFields["DP"], invariant);
            }

            if ((genotypes.Count > 0) && (genotypes[0] != null))
            {
                // only the first sample is considered
                var sample = genotypes[0];

                // GQ takes precedence over GQX when both are present
                if (sample.ContainsKey("GQ"))
                {
                    genotypeQscore = int.Parse(sample["GQ"], invariant);
                }
                else if (sample.ContainsKey("GQX"))
                {
                    genotypeQscore = int.Parse(sample["GQX"], invariant);
                }

                if (sample.ContainsKey("GT"))
                {
                    genotypeString = sample["GT"];
                }

                if (sample.ContainsKey("NL"))
                {
                    noiseLevel = int.Parse(sample["NL"], invariant);
                }

                if (sample.ContainsKey("NC"))
                {
                    fractionNocalls = float.Parse(sample["NC"], invariant);
                }

                if (sample.ContainsKey("SB"))
                {
                    strandBiasInGATKScaleCoords = float.Parse(sample["SB"], invariant);
                }

                var hasAlleleDepths = sample.ContainsKey("AD");
                var adStrings       = hasAlleleDepths ? sample["AD"].Split(',') : new string[] { "0", "0" };

                referenceSupport = int.Parse(adStrings[0], invariant);

                //by default alt support is 0.
                if ((!isRef) && (adStrings.Length > 1))
                {
                    altSupport = int.Parse(adStrings[1], invariant);
                }

                if (shouldOutputRcCounts)
                {
                    if (sample.ContainsKey("US"))
                    {
                        tsCounts = sample["US"].Split(',').ToList();
                    }
                }

                allele.Genotype = VcfVariantUtilities.MapGTString(genotypeString, numAlts);

                //note this awkward vcf line (pisces)
                //"chr4\t10\t.\tAA\tGA,G\t0\tPASS\tDP=5394\tGT:GQ:AD:DP:VF:NL:SB:NC\t1/2:0:2387,2000:5394:0.8133:23:0.0000:0.0000";
                //and this one
                //chr2    19946216.ATGTGTG ATG,ATGTG,A 0   PASS metal = platinum; cgi =.; bwa_freebayes = HD:0,LOOHD: 0; bwa_platypus =.; bwa_gatk3 = HD:2,LOOHD: 2; cortex =.; isaac2 = HD:1,LOOHD: 1; dist2closest = 192 GT  1 | 2

                if ((numAlts >= 2) && hasAlleleDepths)
                {
                    if (adStrings.Length <= numAlts)
                    {
                        //in this case we never expressly gave the ref support, so we have to derive it.
                        //AD holds alt depths only: entry i belongs to alt allele i.
                        int totalAltCount = 0;

                        //never read past the AD entries that are actually present
                        int altDepthCount = Math.Min(numAlts, adStrings.Length);

                        for (int altIndex = 0; altIndex < altDepthCount; altIndex++)
                        {
                            var altSupportAtIndex = int.Parse(adStrings[altIndex], invariant);
                            totalAltCount += altSupportAtIndex;

                            if (altIndex == alleleIndex)
                            {
                                altSupport = altSupportAtIndex;
                            }
                        }

                        referenceSupport = Math.Max(0, totalCoverage - totalAltCount);
                    }
                    else
                    {
                        //AD holds the ref depth followed by one depth per alt allele,
                        //so the depth for this allele sits one past its alt index.
                        //(alleleIndex < numAlts < adStrings.Length, so this is in range.)
                        altSupport = int.Parse(adStrings[alleleIndex + 1], invariant);
                    }
                }
            }

            var strandBiasResults = new BiasResults();

            strandBiasResults.GATKBiasScore = strandBiasInGATKScaleCoords;

            //set the remaining data
            allele.TotalCoverage     = totalCoverage;
            allele.AlleleSupport     = isRef ? referenceSupport : altSupport;
            allele.ReferenceSupport  = referenceSupport;
            allele.GenotypeQscore    = genotypeQscore;
            allele.NoiseLevelApplied = noiseLevel;
            allele.StrandBiasResults = strandBiasResults;
            allele.IsForcedToReport  = allele.Filters.Contains(FilterType.ForcedReport);

            //set the derived values
            allele.SetType();
            allele.ForceFractionNoCalls(fractionNocalls);

            //rescue attempt for complex types, ie ACGT -> ACGTGG.
            //Get the simplest form of the allele
            if ((allele.Type == AlleleCategory.Unsupported) && shouldTrimComplexAlleles)
            {
                VcfVariantUtilities.TrimUnsupportedAlleleType(allele);
            }

            //collapsed read counts are only filled in when the sample carried a "US" field
            if (tsCounts.Count != 0)
            {
                VcfVariantUtilities.FillInCollapsedReadsCount(shouldOutputRcCounts, shouldOutputTsCounts, allele, tsCounts);
            }
        }
        /// <summary>
        ///     Builds a called allele from an unpacked vcf variant, i.e. one that carries at most one alt allele.
        ///     Only the first sample's genotype fields are read.
        /// </summary>
        /// <param name="v">The variant to convert. A null variant yields a null result.</param>
        /// <param name="shouldOutputRcCounts">
        ///     When true, the "US" genotype field (if present) is split on ',' and passed on to
        ///     <c>FillInCollapsedReadsCount</c>.
        /// </param>
        /// <param name="shouldOutputTsCounts">Passed through unchanged to <c>FillInCollapsedReadsCount</c>.</param>
        /// <param name="shouldTrimComplexAlleles">
        ///     When true, an allele whose type is <c>AlleleCategory.Unsupported</c> after <c>SetType()</c>
        ///     is passed to <c>TrimUnsupportedAlleleType</c>.
        /// </param>
        /// <returns>The populated allele, or null when <paramref name="v"/> is null.</returns>
        /// <exception cref="ArgumentException">The variant carries more than one alt allele (crushed format).</exception>
        public static CalledAllele ConvertUnpackedVariant(VcfVariant v, bool shouldOutputRcCounts = false,
                                                          bool shouldOutputTsCounts = false, bool shouldTrimComplexAlleles = true)
        {
            if (v == null)
            {
                return(null);
            }

            if (v.VariantAlleles.Count() > 1)
            {
                throw new ArgumentException("This method does not handle crushed vcf format. Use Convert(IEnumerable<VcfVariant> vcfVariants)");
            }

            // vcf is a machine-readable format: numbers always use '.' as the decimal separator,
            // so parsing must not depend on the culture of the current thread.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            var genotypeQscore              = 0;
            var referenceSupport            = 0;
            var altSupport                  = 0;
            var genotypeString              = "";
            var totalCoverage               = 0;
            var isRef                       = ((v.VariantAlleles.Count() == 1) && v.VariantAlleles[0] == ".");
            var variantQuality              = v.Quality;
            var numAlts                     = 1;
            // NOTE(review): ConvertColumnsToVariant defaults the noise level to 0 - confirm which default is intended.
            var noiseLevel                  = 1;
            var fractionNocalls             = 0f;
            var strandBiasInGATKScaleCoords = -100f;
            var tsCounts                    = new List <string>();

            if (v.InfoFields.ContainsKey("DP"))
            {
                totalCoverage = int.Parse(v.InfoFields["DP"], invariant);
            }

            if ((v.Genotypes.Count > 0) && (v.Genotypes[0] != null))
            {
                // only the first sample is considered
                var sample = v.Genotypes[0];

                // GQ takes precedence over GQX when both are present
                if (sample.ContainsKey("GQ"))
                {
                    genotypeQscore = int.Parse(sample["GQ"], invariant);
                }
                else if (sample.ContainsKey("GQX"))
                {
                    genotypeQscore = int.Parse(sample["GQX"], invariant);
                }

                // a sample without a GT field keeps the empty genotype string instead of throwing
                if (sample.ContainsKey("GT"))
                {
                    genotypeString = sample["GT"];
                }

                if (sample.ContainsKey("NL"))
                {
                    noiseLevel = int.Parse(sample["NL"], invariant);
                }

                if (sample.ContainsKey("NC"))
                {
                    fractionNocalls = float.Parse(sample["NC"], invariant);
                }

                if (sample.ContainsKey("SB"))
                {
                    strandBiasInGATKScaleCoords = float.Parse(sample["SB"], invariant);
                }

                var adStrings = sample.ContainsKey("AD") ? sample["AD"].Split(',') : new string[] { "0", "0" };

                referenceSupport = int.Parse(adStrings[0], invariant);

                // alt support stays 0 for reference calls and when AD carries a single value only
                if ((!isRef) && (adStrings.Length > 1))
                {
                    altSupport = int.Parse(adStrings[1], invariant);
                }

                if (shouldOutputRcCounts)
                {
                    if (sample.ContainsKey("US"))
                    {
                        tsCounts = sample["US"].Split(',').ToList();
                    }
                }

                //note this, method should never get a value here >1. these should be UNPACKED variants
                numAlts = isRef ? 0 : 1;
            }

            var strandBiasResults = new BiasResults();

            strandBiasResults.GATKBiasScore = strandBiasInGATKScaleCoords;

            var filters = MapFilterString(v.Filters);
            var allele  = new CalledAllele()
            {
                Chromosome        = v.ReferenceName,
                ReferencePosition = v.ReferencePosition,
                ReferenceAllele   = v.ReferenceAllele,
                AlternateAllele   = v.VariantAlleles[0],
                TotalCoverage     = totalCoverage,
                AlleleSupport     = isRef ? referenceSupport : altSupport,
                ReferenceSupport  = referenceSupport,
                VariantQscore     = (int)variantQuality,
                GenotypeQscore    = genotypeQscore,
                Genotype          = MapGTString(genotypeString, numAlts),
                Filters           = filters,
                NoiseLevelApplied = noiseLevel,
                StrandBiasResults = strandBiasResults,
                IsForcedToReport  = filters.Contains(FilterType.ForcedReport)
            };

            //set the derived values
            allele.SetType();
            allele.ForceFractionNoCalls(fractionNocalls);

            //rescue attempt for complex types, ie ACGT -> ACGTGG
            if ((allele.Type == AlleleCategory.Unsupported) && shouldTrimComplexAlleles)
            {
                TrimUnsupportedAlleleType(allele);
            }

            FillInCollapsedReadsCount(shouldOutputRcCounts, shouldOutputTsCounts, allele, tsCounts);

            return(allele);
        }