Ejemplo n.º 1
0
        public void VarCallsBecomeRefsAndNulls()
        {
            // Arrange: two adjacent A>T SNVs on chr1 (positions 123 and 124), each built
            // with a total depth of 1000 and 156 alt-supporting reads.
            var snvAt123  = TestHelper.CreateDummyAllele("chr1", 123, "A", "T", 1000, 156);
            var snvAt124  = TestHelper.CreateDummyAllele("chr1", 124, "A", "T", 1000, 156);
            var siteAt123 = new VariantSite(snvAt123);
            var siteAt124 = new VariantSite(snvAt124);

            var callingParameters = new VariantCallingParameters();
            callingParameters.Validate();

            var variantCaller = new VariantCaller(callingParameters, new BamFilterParameters());
            var neighborhood  = new VcfNeighborhood(callingParameters, 0, "chr1", siteAt123, siteAt124, "");

            neighborhood.SetRangeOfInterest();

            // Only the SNV at 123 is staged as an accepted phased variant, so position 124
            // is expected to come back through the reference calls instead.
            var phasedSnvAt123 = new CalledAllele(AlleleCategory.Snv)
            {
                Chromosome        = "chr1",
                ReferencePosition = 123,
                ReferenceAllele   = "A",
                AlternateAllele   = "T",
                VariantQscore     = 100,
                TotalCoverage     = 1000,
                AlleleSupport     = 500
            };

            neighborhood.AddAcceptedPhasedVariant(phasedSnvAt123);

            // ---- Scenario 1: no reference counts have been claimed at all. ----
            neighborhood.UsedRefCountsLookup = new Dictionary <int, SuckedUpRefRecord>();

            variantCaller.CallMNVs(neighborhood);
            variantCaller.CallRefs(neighborhood);

            var calledVariants = neighborhood.CalledVariants;
            var calledRefs     = neighborhood.CalledRefs;

            Assert.Equal(1, calledVariants.Count);
            Assert.Equal(1, calledVariants[123].Count);
            Assert.Equal(2, calledRefs.Count);

            // 156 alt calls out of 1000 leaves 844 reference calls at position 124.
            var untouchedRefGenotype = new Dictionary <string, string>()
            {
                { "GT", "0/." }, { "DP", "1000" }, { "AD", "844" }
            };
            var expectedUntouchedRef = new VcfVariant()
            {
                ReferenceName     = "chr1",
                ReferencePosition = 124,
                ReferenceAllele   = "A",
                VariantAlleles    = new[] { "." },
                Genotypes         = new List <Dictionary <string, string> >()
                {
                    untouchedRefGenotype
                },
            };

            VcfMergerTests.CheckVariantsMatch(snvAt123, calledVariants[123][0]);
            VcfMergerTests.CheckVariantsMatch(expectedUntouchedRef, calledRefs[124]);

            // ---- Scenario 2: 100 of the reference calls at 124 were sucked up. ----
            // 844 - 100 = 744 reference calls remain, which is still enough for a
            // confident reference call, so the site is reported as a ref.
            var partialClaim = new SuckedUpRefRecord()
            {
                Counts = 100, AlleleThatClaimedIt = new CalledAllele()
            };

            neighborhood.UsedRefCountsLookup = new Dictionary <int, SuckedUpRefRecord>()
            {
                { 124, partialClaim }
            };

            variantCaller.CallMNVs(neighborhood);
            variantCaller.CallRefs(neighborhood);

            calledVariants = neighborhood.CalledVariants;
            calledRefs     = neighborhood.CalledRefs;

            Assert.Equal(1, calledVariants.Count);
            Assert.Equal(1, calledVariants[123].Count);
            Assert.Equal(2, calledRefs.Count);

            var reducedRefGenotype = new Dictionary <string, string>()
            {
                { "GT", "0/." }, { "DP", "1000" }, { "AD", "744" }
            };
            var expectedReducedRef = new VcfVariant()
            {
                ReferenceName     = "chr1",
                ReferencePosition = 124,
                ReferenceAllele   = "A",
                VariantAlleles    = new[] { "." },
                Genotypes         = new List <Dictionary <string, string> >()
                {
                    reducedRefGenotype
                },
            };

            VcfMergerTests.CheckVariantsMatch(snvAt123, calledVariants[123][0]);
            VcfMergerTests.CheckVariantsMatch(expectedReducedRef, calledRefs[124]);

            // ---- Scenario 3: every call at 124 (1000) was sucked up. ----
            // Nothing is left to support a reference call, so the site is expected
            // to be written as a null ("./.") with zero allele depth.
            var totalClaim = new SuckedUpRefRecord()
            {
                Counts = 1000, AlleleThatClaimedIt = new CalledAllele()
            };

            neighborhood.UsedRefCountsLookup = new Dictionary <int, SuckedUpRefRecord>()
            {
                { 124, totalClaim }
            };

            variantCaller.CallMNVs(neighborhood);
            variantCaller.CallRefs(neighborhood);

            calledVariants = neighborhood.CalledVariants;
            calledRefs     = neighborhood.CalledRefs;

            Assert.Equal(1, calledVariants.Count);
            Assert.Equal(1, calledVariants[123].Count);
            Assert.Equal(2, calledRefs.Count);

            var nullGenotype = new Dictionary <string, string>()
            {
                { "GT", "./." }, { "DP", "1000" }, { "AD", "0" }
            };
            var expectedNullCall = new VcfVariant()
            {
                ReferenceName     = "chr1",
                ReferencePosition = 124,
                ReferenceAllele   = "A",
                VariantAlleles    = new[] { "." },
                Genotypes         = new List <Dictionary <string, string> >()
                {
                    nullGenotype
                },
            };

            VcfMergerTests.CheckVariantsMatch(snvAt123, calledVariants[123][0]);
            VcfMergerTests.CheckVariantsMatch(expectedNullCall, calledRefs[124]);
        }
Ejemplo n.º 2
0
        public void CallAVariantInANewLocation()
        {
            // Arrange: four dummy A>T SNVs on chr1. The last two share position 234.
            var snvAt123       = TestHelper.CreateDummyAllele("chr1", 123, "A", "T", 1000, 156);
            var snvAt124       = TestHelper.CreateDummyAllele("chr1", 124, "A", "T", 1000, 156);
            var snvAt234       = TestHelper.CreateDummyAllele("chr1", 234, "A", "T", 1000, 156);
            var unusedSnvAt234 = TestHelper.CreateDummyAllele("chr1", 234, "A", "T", 1000, 156);

            var siteAt123       = new VariantSite(snvAt123);
            var siteAt124       = new VariantSite(snvAt124);
            var siteAt234       = new VariantSite(snvAt234);
            var unusedSiteAt234 = new VariantSite(unusedSnvAt234);

            var callingParameters = new VariantCallingParameters();
            callingParameters.Validate();

            var variantCaller = new VariantCaller(callingParameters, new BamFilterParameters());
            var neighborhood  = new VcfNeighborhood(callingParameters, 0, "chr1", siteAt123, siteAt124, "");

            // unusedSiteAt234 is deliberately never added: it stands in for a variant
            // that failed filters and therefore takes no part in phasing.
            neighborhood.AddVariantSite(siteAt234, "RRRRR");
            neighborhood.SetRangeOfInterest();

            // Stage a single candidate at position 129, where none of the input variants sit.
            var stagedMnv = new CalledAllele(AlleleCategory.Snv)
            {
                Chromosome        = "chr1",
                ReferencePosition = 129,
                ReferenceAllele   = "A",
                AlternateAllele   = "TT",
                VariantQscore     = 100,
                TotalCoverage     = 1000,
                AlleleSupport     = 500
            };

            neighborhood.AddAcceptedPhasedVariant(stagedMnv);

            // All 1000 calls at position 124 are marked as already claimed.
            var totalClaim = new SuckedUpRefRecord()
            {
                Counts = 1000, AlleleThatClaimedIt = new CalledAllele()
            };

            neighborhood.UsedRefCountsLookup = new Dictionary <int, SuckedUpRefRecord>()
            {
                { 124, totalClaim }
            };

            // Expected output records: 123 and 234 as reference calls, 124 as a null call.
            var refGenotypeAt123 = new Dictionary <string, string>()
            {
                { "GT", "0/." }
            };
            var expectedRefAt123 = new VcfVariant()
            {
                ReferenceName     = "chr1",
                ReferencePosition = 123,
                ReferenceAllele   = "A",
                VariantAlleles    = new[] { "." },
                Genotypes         = new List <Dictionary <string, string> >()
                {
                    refGenotypeAt123
                },
            };

            var refGenotypeAt234 = new Dictionary <string, string>()
            {
                { "GT", "0/." }
            };
            var expectedRefAt234 = new VcfVariant()
            {
                ReferenceName     = "chr1",
                ReferencePosition = 234,
                ReferenceAllele   = "A",
                VariantAlleles    = new[] { "." },
                Genotypes         = new List <Dictionary <string, string> >()
                {
                    refGenotypeAt234
                },
            };

            var nullGenotypeAt124 = new Dictionary <string, string>()
            {
                { "GT", "./." }
            };
            var expectedNullAt124 = new VcfVariant()
            {
                ReferenceName     = "chr1",
                ReferencePosition = 124,
                ReferenceAllele   = "A",
                VariantAlleles    = new[] { "." },
                Genotypes         = new List <Dictionary <string, string> >()
                {
                    nullGenotypeAt124
                },
            };

            // Act
            variantCaller.CallMNVs(neighborhood);
            variantCaller.CallRefs(neighborhood);

            var calledVariants = neighborhood.CalledVariants;
            var calledRefs     = neighborhood.CalledRefs;

            // Assert: one variant, reported at its new location, plus three reference-type records.
            Assert.Equal(1, calledVariants.Count);
            Assert.Equal(1, calledVariants[129].Count);

            Assert.Equal(3, calledRefs.Count);

            VcfMergerTests.CheckVariantsMatch(expectedRefAt123, calledRefs[123]);
            VcfMergerTests.CheckVariantsMatch(expectedNullAt124, calledRefs[124]);
            VcfMergerTests.CheckVariantsMatch(stagedMnv, calledVariants[129][0]);
            VcfMergerTests.CheckVariantsMatch(expectedRefAt234, calledRefs[234]);
        }
Ejemplo n.º 3
0
        // Regression test written after bug ScyllaLoosingRefCalls_PICS-723.
        // Symptom: a genotype of 1/. was reported where 1/0 was expected, because all
        // of the reference calls (the "0"s) had been incorrectly sucked up.
        // Example: the MNV ACG -> AG claimed 50 refs, and those 50 refs were then
        // (incorrectly) subtracted from that very same MNV.
        // Reference counts must never be subtracted from the allele that claimed them;
        // that was not the intent of the algorithm.
        //
        // Method under test: CreateMnvsFromClusters in VcfNbhd.
        public void CreateMnvsFromClusters_TakeUpRefCount()
        {
            // Arrange: three dummy alleles that all start at chr1:123.
            // Only the first two are wrapped in variant sites; the third is not used further.
            var dummyDeletion  = TestHelper.CreateDummyAllele("chr1", 123, "ACG", "AT", 1000, 156);
            var dummyInsertion = TestHelper.CreateDummyAllele("chr1", 123, "A", "TTTTTT", 1000, 200);
            var dummyMnv       = TestHelper.CreateDummyAllele("chr1", 123, "AC", "TT", 1000, 100);

            var firstSite  = new VariantSite(dummyDeletion);
            var secondSite = new VariantSite(dummyInsertion);

            var variantCaller = new VariantCaller(new VariantCallingParameters(), new BamFilterParameters());
            var neighborhood  = new VcfNeighborhood(new VariantCallingParameters(), 0, "chr1", firstSite, secondSite, "");

            neighborhood.SetRangeOfInterest();

            // Stage three accepted phased variants at position 123 with allele support 200, 300 and 250.
            var stagedSnv = new CalledAllele(AlleleCategory.Snv)
            {
                Chromosome        = "chr1",
                ReferencePosition = 123,
                ReferenceAllele   = "A",
                AlternateAllele   = "T",
                VariantQscore     = 100,
                TotalCoverage     = 1000,
                AlleleSupport     = 200
            };
            neighborhood.AddAcceptedPhasedVariant(stagedSnv);

            var stagedMnv = new CalledAllele(AlleleCategory.Mnv)
            {
                Chromosome        = "chr1",
                ReferencePosition = 123,
                ReferenceAllele   = "ACG",
                AlternateAllele   = "AT",
                VariantQscore     = 100,
                TotalCoverage     = 1000,
                AlleleSupport     = 300
            };
            neighborhood.AddAcceptedPhasedVariant(stagedMnv);

            var stagedInsertion = new CalledAllele(AlleleCategory.Insertion)
            {
                Chromosome        = "chr1",
                ReferencePosition = 123,
                ReferenceAllele   = "A",
                AlternateAllele   = "AAAAA",
                VariantQscore     = 100,
                TotalCoverage     = 1000,
                AlleleSupport     = 250
            };
            neighborhood.AddAcceptedPhasedVariant(stagedInsertion);

            // ---- Pass 1: default behaviour, nothing has been sucked up. ----
            neighborhood.UsedRefCountsLookup = new Dictionary <int, SuckedUpRefRecord>();

            // Build a one-read, one-group cluster around the first site.
            firstSite.VcfReferencePosition = 123;
            var singleVead      = new Vead("dummy", new VariantSite[] { firstSite });
            var singleVeadGroup = new VeadGroup(singleVead);
            var testCluster     = new Cluster("test", new List <VeadGroup>() { singleVeadGroup });

            testCluster.ResetConsensus();

            neighborhood.CreateMnvsFromClusters(new List <Cluster> { testCluster }, 20, 100);
            variantCaller.CallMNVs(neighborhood);
            variantCaller.CallRefs(neighborhood);

            var calledVariants = neighborhood.CalledVariants;
            var calledRefs     = neighborhood.CalledRefs;

            Assert.Equal(2, calledVariants.Count);
            Assert.Equal(3, calledVariants[123].Count);
            Assert.Equal(1, calledRefs.Count);

            // With nothing sucked up, reference support is simply total depth minus allele support.
            Assert.Equal(1000 - 200, calledVariants[123][0].ReferenceSupport);
            Assert.Equal(1000 - 300, calledVariants[123][1].ReferenceSupport);
            Assert.Equal(1000 - 250, calledVariants[123][2].ReferenceSupport);

            // ---- Pass 2: the first candidate variant claims 100 reference calls at 123. ----
            var claimByFirstCandidate = new SuckedUpRefRecord()
            {
                Counts = 100, AlleleThatClaimedIt = neighborhood.CandidateVariants[0]
            };

            neighborhood.UsedRefCountsLookup = new Dictionary <int, SuckedUpRefRecord>()
            {
                { 123, claimByFirstCandidate }
            };

            neighborhood.CreateMnvsFromClusters(new List <Cluster> { testCluster }, 20, 100);
            variantCaller.CallMNVs(neighborhood);
            variantCaller.CallRefs(neighborhood);

            calledVariants = neighborhood.CalledVariants;
            calledRefs     = neighborhood.CalledRefs;

            // The claiming allele keeps its full reference support
            // (total depth minus allele support; nothing is subtracted from itself).
            Assert.Equal(1000 - 200, calledVariants[123][0].ReferenceSupport);

            // The other two alleles each lose the 100 claimed reference calls.
            // (The earlier, buggy expectations for these two were 1000 - 300 and 1000 - 250.)
            Assert.Equal(1000 - 300 - 100, calledVariants[123][1].ReferenceSupport);
            Assert.Equal(1000 - 250 - 100, calledVariants[123][2].ReferenceSupport);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Converts each cluster's consensus sites into a phased variant using
        /// <c>PhasedVariantExtractor.Extract</c>, files the result as accepted or rejected,
        /// accumulates the reference counts claimed by accepted variants in
        /// <c>UsedRefCountsLookup</c>, and finally recomputes <c>ReferenceSupport</c> for every
        /// entry in <c>CandidateVariants</c>.
        /// </summary>
        /// <param name="clusters">Clusters to convert. A null or empty sequence makes this a no-op.
        /// The sequence is enumerated more than once.</param>
        /// <param name="qNoiselevel">Noise level forwarded unchanged to the extractor.</param>
        /// <param name="maxQscore">Maximum Q score forwarded unchanged to the extractor.</param>
        /// <param name="crushNbhd">When true, or when the neighborhood genotype calculator is in
        /// diploid mode, every phased allele is anchored at <c>FirstPositionOfInterest</c>;
        /// otherwise no anchor (-1) is passed and variants are reported where they are found.</param>
        public void CreateMnvsFromClusters(IEnumerable <ICluster> clusters, int qNoiselevel, int maxQscore, bool crushNbhd = false)
        {
            // Any() stops at the first element; Count() would walk the whole sequence just to test for emptiness.
            if (clusters == null || !clusters.Any())
            {
                return;
            }

            // Both arrays are assigned by DepthAtSites through its out parameters,
            // so there is no point allocating placeholders first.
            int[] depthAtSites;
            int[] nocallsAtSites;

            DepthAtSites(clusters, out depthAtSites, out nocallsAtSites);

            Logger.WriteToLog("Creating MNVs from clusters.");

            // If we are crushing the vcf, or in diploid mode, always report all phased alleles through the
            // neighborhood starting at the first position of interest (the first position we started phasing on).
            // In somatic or uncrushed mode, variants are reported at the loci we find them on (normal Pisces).
            int anchorPosition = -1;

            if (crushNbhd || _nbhdGTcalculator.PloidyModel == Pisces.Domain.Types.PloidyModel.Diploid)
            {
                anchorPosition = FirstPositionOfInterest;
            }

            foreach (var cluster in clusters)
            {
                CalledAllele mnv;

                var clusterConsensus = cluster.GetConsensusSites();

                Logger.WriteToLog(cluster.Name + "\tVariantSites\t" + VariantSite.ArrayToString(clusterConsensus));
                Logger.WriteToLog(cluster.Name + "\tVariantPositions\t" + VariantSite.ArrayToPositions(clusterConsensus));

                var referenceRemoval = PhasedVariantExtractor.Extract(out mnv, clusterConsensus,
                                                                      ReferenceSequence, depthAtSites, nocallsAtSites, cluster.CountsAtSites, ReferenceName, qNoiselevel, maxQscore, anchorPosition);

                if ((mnv.Type != Pisces.Domain.Types.AlleleCategory.Reference) && mnv.AlleleSupport != 0)
                {
                    Logger.WriteToLog(cluster.Name + "mnv accepted:\t" + mnv.ToString());
                    AddAcceptedPhasedVariant(mnv);

                    // Keep track of reference calls sucked into MNVs.
                    // These are subtracted from the ref counts when the final vcf is written out.
                    foreach (var removal in referenceRemoval)
                    {
                        SuckedUpRefRecord usedRecord;

                        // The first allele to claim a position is remembered as its owner;
                        // later claims on the same position only add to the running count.
                        if (!UsedRefCountsLookup.TryGetValue(removal.Key, out usedRecord))
                        {
                            usedRecord = new SuckedUpRefRecord()
                            {
                                Counts = 0, AlleleThatClaimedIt = mnv
                            };
                            UsedRefCountsLookup.Add(removal.Key, usedRecord);
                        }

                        usedRecord.Counts += removal.Value.Counts;
                    }
                }
                else if (mnv.TotalCoverage != 0) // don't add empty stuff
                {
                    Logger.WriteToLog("mnv rejected:\t" + mnv.ToString());
                    AddRejectedPhasedVariant(mnv);
                }
            }

            foreach (var phasedVariant in CandidateVariants)
            {
                var calledPhasedVariant = phasedVariant as CalledAllele;
                if (calledPhasedVariant == null)
                {
                    continue;
                }

                // Baseline: everything that is not allele support counts as reference support.
                calledPhasedVariant.ReferenceSupport = phasedVariant.TotalCoverage - phasedVariant.AlleleSupport;

                // Subtract reference calls claimed at this position, but never from the
                // allele that claimed them in the first place.
                SuckedUpRefRecord claimedRecord;
                if (UsedRefCountsLookup.TryGetValue(phasedVariant.ReferencePosition, out claimedRecord) && (claimedRecord.AlleleThatClaimedIt != phasedVariant))
                {
                    calledPhasedVariant.ReferenceSupport = calledPhasedVariant.ReferenceSupport - claimedRecord.Counts;
                }

                // Reference support can never be negative.
                calledPhasedVariant.ReferenceSupport = Math.Max(0, calledPhasedVariant.ReferenceSupport);
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Builds a single phased allele from the consensus variant sites of a cluster and reports
        /// which reference positions were absorbed ("sucked up") into it.
        /// Warning #1. This algorithm has an inherent assumption:
        /// the variant sites must be in order of their true position (first base of difference).
        /// That is not always how they appeared in the vcf.
        /// Warning #2. Variants are typically reported in the VCF on their first base of difference
        /// from the reference genome (or, in the case of indels, one base before).
        /// However, in germline (crushed) formatting, Scylla reports all variants in a neighborhood
        /// at the same (anchored) position. This is because there can only ever be two alleles
        /// given the diploid assumption, so you cannot report 5 different alleles at 5 spots within the neighborhood.
        /// I.e., somatic genotyping/reporting is loci-specific,
        /// but diploid genotyping/reporting is forced to be consistent through the whole neighborhood.
        /// </summary>
        /// <param name="allele">The allele created from the cluster. It is always assigned; when nothing
        /// could be phased, or the averaged variant count is zero, it is created with the Reference category.</param>
        /// <param name="clusterVariantSites">The variant site results for the cluster. Element 0 is read
        /// unconditionally, so the array is expected to be non-empty.</param>
        /// <param name="referenceSequence">The reference sequence, used to fill the gaps between variant sites
        /// and to look up the base that is prepended to insertions and deletions.</param>
        /// <param name="neighborhoodDepthAtSites">Per-site depths; the values for the sites inside the allele are averaged into its total coverage.</param>
        /// <param name="neighborhoodNoCallsAtSites">Per-site no-call counts; the values for the sites inside the allele are averaged into its no-call count.</param>
        /// <param name="clusterRefSupport">Passed through unchanged to every created allele.</param>
        /// <param name="clusterCountsAtSites">Per-site call counts; the values for the sites inside the allele are averaged into its variant count.</param>
        /// <param name="chromosome">Chromosome name used to populate the new allele.</param>
        /// <param name="qNoiselevel">Noise level used to populate the new allele.</param>
        /// <param name="maxQscore">Maximum Q, used to determine the Q score of the new allele.</param>
        /// <param name="anchorPosition">If not -1, forces the allele to be reported at this position instead of the position it would naturally have in the VCF file.</param>
        /// <returns>A dictionary keyed by reference position, with one record per absorbed reference position.
        /// Each record carries the averaged variant count and the created allele. The dictionary is empty
        /// when no allele could be phased.</returns>
        /// <exception cref="InvalidDataException">Thrown when the variant site, depth and count arrays differ in length.</exception>
        public static Dictionary <int, SuckedUpRefRecord> Extract(out CalledAllele allele,
                                                                  VariantSite[] clusterVariantSites, string referenceSequence, int[] neighborhoodDepthAtSites, int[] neighborhoodNoCallsAtSites, int clusterRefSupport,
                                                                  int[] clusterCountsAtSites, string chromosome, int qNoiselevel, int maxQscore, int anchorPosition = -1)
        {
            // NOTE(review): neighborhoodNoCallsAtSites is indexed by site below but its length is not checked here.
            if (clusterVariantSites.Length != neighborhoodDepthAtSites.Length || neighborhoodDepthAtSites.Length != clusterCountsAtSites.Length)
            {
                throw new InvalidDataException("Variant sites, depths, and counts arrays are different lengths.");
            }

            var referenceRemoval = new Dictionary <int, SuckedUpRefRecord>();

            // Initialize items we'll eventually use to build a variant.
            var alleleReference = "";
            var alleleAlternate = "";
            var totalCoverage   = 0;
            var varCount        = 0;
            var noCallCount     = 0;

            // Initialize trackers
            var referenceCallsSuckedIntoMnv = new List <int>();
            var nocallsInsideMnv            = new List <int>();
            var depthsInsideMnv             = new List <int>();
            var countsInsideMnv             = new List <int>();

            // NOTE(review): reads element 0 without an emptiness check; an empty site array would throw here.
            var lastRefBaseSitePosition  = clusterVariantSites[0].VcfReferencePosition;
            var firstVariantSitePosition = clusterVariantSites[0].VcfReferencePosition;
            var differenceStarted        = false;

            bool usingAnchor = (anchorPosition != -1);

            // When anchored, pretend the last reference base consumed is the one just before the anchor,
            // so that gap filling starts at the anchor position itself.
            if (usingAnchor)
            {
                lastRefBaseSitePosition = anchorPosition - 1;
            }

            // Walk through the cluster's variant sites and build up ref/alt strings, average support, and average coverage
            for (var siteIndex = 0; siteIndex < clusterVariantSites.Length; siteIndex++)
            {
                var consensusSite = clusterVariantSites[siteIndex];

                var refAlleleToAdd  = consensusSite.TrueRefAllele;
                var altAlleleToAdd  = consensusSite.TrueAltAllele;
                var currentPosition = consensusSite.TrueFirstBaseOfDiff;
                var diff            = lastRefBaseSitePosition - currentPosition;

                // no variant here...
                if (refAlleleToAdd == altAlleleToAdd)
                {
                    continue;
                }

                if (differenceStarted && (diff >= 0))
                {
                    // We have a problem: the last site we added overlaps with the current site we want to add.
                    // They are probably not in conflict, but we have to take a substring to get this right.

                    var lengthToTrimFromStart = diff + 1;

                    if ((lengthToTrimFromStart < consensusSite.TrueAltAllele.Length) &&
                        (lengthToTrimFromStart < consensusSite.TrueRefAllele.Length))
                    {
                        refAlleleToAdd  = consensusSite.TrueRefAllele.Substring(lengthToTrimFromStart);
                        altAlleleToAdd  = consensusSite.TrueAltAllele.Substring(lengthToTrimFromStart);
                        currentPosition = consensusSite.TrueFirstBaseOfDiff + lengthToTrimFromStart;
                    }
                    else
                    {
                        continue; // if the last variant site entirely covered this one, just don't worry about it.
                    }
                }



                // Nima: In diploid mode (usingAnchor == 1), any ref after the anchor gets used up. I'm not sure if this is intended.
                // TJD - yes, intended.  For germline we phase a whole anchored block at a time, including reference
                if (differenceStarted || usingAnchor)
                {
                    // Every position strictly between the last consumed reference base and the current
                    // site is a reference position that gets absorbed into the allele.
                    var gapLength            = currentPosition - lastRefBaseSitePosition - 1;
                    var suckedUpRefPositions = new List <int>();
                    for (var i = 0; i < gapLength; i++)
                    {
                        var refPosition = lastRefBaseSitePosition + i + 1;
                        suckedUpRefPositions.Add(refPosition);
                    }
                    referenceCallsSuckedIntoMnv.AddRange(suckedUpRefPositions);

                    var gapFiller = FillGapWithReferenceData(referenceSequence,
                                                             clusterVariantSites[0], suckedUpRefPositions);

                    // The gap is identical in ref and alt, so the same filler goes onto both strings.
                    alleleReference += gapFiller;
                    alleleAlternate += gapFiller;
                }

                if (!differenceStarted)
                {
                    firstVariantSitePosition = currentPosition;
                }

                differenceStarted = true;
                depthsInsideMnv.Add(neighborhoodDepthAtSites[siteIndex]);
                countsInsideMnv.Add(clusterCountsAtSites[siteIndex]);
                nocallsInsideMnv.Add(neighborhoodNoCallsAtSites[siteIndex]);

                // this takes into account taking deletions out of the ref allele.
                lastRefBaseSitePosition = currentPosition + refAlleleToAdd.Length - 1;

                alleleReference += refAlleleToAdd;
                alleleAlternate += altAlleleToAdd;
            }



            if (differenceStarted)
            {
                // remove any trailing bases of agreement.
                var numTrailingBasesOfAgreement = VcfVariantUtilities.GetNumTrailingAgreement(alleleReference, alleleAlternate);

                // remove trailing bases
                alleleReference = alleleReference.Substring(0, alleleReference.Length - numTrailingBasesOfAgreement);
                alleleAlternate = alleleAlternate.Substring(0, alleleAlternate.Length - numTrailingBasesOfAgreement);
            }
            // if we are not anchored, we trim off preceding bases of agreement, and move up the coordinate to
            // the first base of difference.
            var numPrecedingBasesOfAgreement = usingAnchor ? 0 : VcfVariantUtilities.GetNumPrecedingAgreement(alleleReference, alleleAlternate);

            alleleReference = alleleReference.Substring(numPrecedingBasesOfAgreement,
                                                        alleleReference.Length - numPrecedingBasesOfAgreement);
            alleleAlternate = alleleAlternate.Substring(numPrecedingBasesOfAgreement,
                                                        alleleAlternate.Length - numPrecedingBasesOfAgreement);



            // && binds tighter than ||: this is "no difference found, OR both strings are empty".
            if (!differenceStarted || (alleleReference.Length == 0) && (alleleAlternate.Length == 0))
            {
                // taking out the preceding bases, the phased variant compacted to nothing!
                // Report a Reference allele at position -1 and an empty removal dictionary.
                allele = Create(chromosome, -1, alleleReference, alleleAlternate, varCount, noCallCount, totalCoverage, clusterRefSupport, AlleleCategory.Reference, qNoiselevel, maxQscore);
                return(referenceRemoval);
            }

            // take average counts and depth through MNV
            // the only "holes" that lower these counts are Ns
            // (each average is truncated to an int by the cast)
            totalCoverage = depthsInsideMnv.Any() ? (int)depthsInsideMnv.Average() : 0;
            varCount      = countsInsideMnv.Any() ? (int)countsInsideMnv.Average() : 0;
            noCallCount   = nocallsInsideMnv.Any() ? (int)nocallsInsideMnv.Average() : 0;

            var trueStartPosition = usingAnchor ? anchorPosition : firstVariantSitePosition + numPrecedingBasesOfAgreement;

            // Look up the base just before the start position, as an offset from the first site's
            // VcfReferencePosition. "R" is the fallback when that offset lies outside referenceSequence.
            var indexIntoRef    = (trueStartPosition - 1) - clusterVariantSites[0].VcfReferencePosition;
            var prependableBase = "R";

            if ((indexIntoRef >= 0) && (indexIntoRef < referenceSequence.Length))
            {
                prependableBase = referenceSequence[indexIntoRef].ToString();
            }

            // compacted to an insertion: reported one base earlier, with the preceding base prepended
            if ((alleleReference.Length == 0) && (alleleAlternate.Length != 0))
            {
                allele = Create(chromosome, trueStartPosition - 1, prependableBase + alleleReference, prependableBase + alleleAlternate,
                                varCount, noCallCount, totalCoverage, clusterRefSupport, AlleleCategory.Insertion, qNoiselevel, maxQscore);
            }
            // compacted to a deletion: reported one base earlier, with the preceding base prepended
            else if ((alleleReference.Length != 0) && (alleleAlternate.Length == 0))
            {
                allele = Create(chromosome, trueStartPosition - 1, prependableBase + alleleReference, prependableBase + alleleAlternate,
                                varCount, noCallCount, totalCoverage, clusterRefSupport, AlleleCategory.Deletion, qNoiselevel, maxQscore);
            }
            else  // MNV, pretty much what we were expecting (and every time we are using an anchor)
            {
                allele = Create(chromosome, trueStartPosition, alleleReference, alleleAlternate,
                                varCount, noCallCount, totalCoverage, clusterRefSupport, AlleleCategory.Mnv, qNoiselevel, maxQscore);
            }


            // No variant support after averaging: replace whatever was built above with a
            // Reference allele whose alternate is ".".
            if (varCount == 0)
            {
                allele = Create(chromosome, trueStartPosition, alleleReference, ".",
                                varCount, noCallCount, totalCoverage, clusterRefSupport, AlleleCategory.Reference, qNoiselevel, maxQscore);
            }

            // Report the absorbed reference positions. When not anchored, only positions strictly
            // after the allele's start position are reported.
            foreach (var suckedupRefPos in referenceCallsSuckedIntoMnv)
            {
                if ((usingAnchor) || (suckedupRefPos > trueStartPosition))
                {
                    var suckedUpRefRecord = new SuckedUpRefRecord()
                    {
                        Counts = varCount, AlleleThatClaimedIt = allele
                    };
                    referenceRemoval.Add(suckedupRefPos, suckedUpRefRecord);
                }
            }

            return(referenceRemoval);
        }