private void LogVeadGroupInfo(IEnumerable <VeadGroup> collapsedReads)
        {
            // Purpose: log how many variant-collapsed read groups were found and, in debug mode,
            // a per-group table before the count and a depth-at-sites table after it.
            //
            // collapsedReads: the read groups to report on.

            // Snapshot once: the sequence is read several times below and may be lazily produced.
            var groups = collapsedReads as IList <VeadGroup> ?? collapsedReads.ToList();

            // Both debug tables use the first group for their position header, so they are
            // skipped (instead of throwing from First()) when there are no groups at all.
            var logDetails = _debugMode && groups.Count > 0;

            if (logDetails)
            {
                Logger.WriteToLog("variant-compressed read groups as follows:  ");

                // The header is concatenated: passing the positions as a second argument treats
                // "count\t" as a format string with no placeholder, which drops the positions.
                Logger.WriteToLog("count" + "\t" + groups[0].ToPositions());
                foreach (var vG in groups)
                {
                    Logger.WriteToLog("\t" + vG.NumVeads + "\t" + vG);
                }
            }

            Logger.WriteToLog("Found " + groups.Count + " variant-collapsed read groups.");

            if (logDetails)
            {
                int[] depths;
                int[] nocalls;   // required by the out signature; not reported here
                VeadGroup.DepthAtSites(groups, out depths, out nocalls);
                Logger.WriteToLog("depth at sites:  ");
                Logger.WriteToLog(groups[0].ToPositions());

                // One tab-separated row so the depths line up under the position header.
                Logger.WriteToLog(string.Join("\t", depths));
            }
        }
        /// <summary>
        /// Builds one <see cref="VeadGroup"/> from a list of veads: the first vead seeds the
        /// group and every following vead is added to it as support.
        /// </summary>
        /// <param name="veads">Veads to combine; must contain at least one element.</param>
        /// <returns>The group created from the first vead with the remaining veads added.</returns>
        public static VeadGroup CreateVeadGroup(List <Vead> veads)
        {
            // First() throws when the list is empty, so an empty input never yields a group.
            var group = new VeadGroup(veads.First());

            for (var index = 1; index < veads.Count; index++)
            {
                group.AddSupport(veads[index]);
            }

            return group;
        }
        /// <summary>
        /// Checks whether two clusters, optionally together with one extra vead group, may be
        /// merged: the vead groups of both clusters (plus the extra group, if given) are pooled
        /// and the worst agreement reported by <c>VeadGroup.GetWorstAgreement</c> must not
        /// exceed the allowed number of disagreements.
        /// </summary>
        /// <param name="clusterA">First cluster.</param>
        /// <param name="clusterB">Second cluster.</param>
        /// <param name="maxNumDisagreements">Largest tolerated disagreement count.</param>
        /// <param name="veadGroupC">Optional additional vead group to include in the check.</param>
        /// <returns><c>true</c> when the disagreement count is within the limit.</returns>
        private static bool TestCanBeMerged(ICluster clusterA, ICluster clusterB, int maxNumDisagreements, VeadGroup veadGroupC = null)
        {
            // Pool every vead group that would end up in the merged cluster.
            var pooledGroups = new List <VeadGroup>(clusterA.GetVeadGroups());
            pooledGroups.AddRange(clusterB.GetVeadGroups());

            if (veadGroupC != null)
            {
                pooledGroups.Add(veadGroupC);
            }

            var worst = VeadGroup.GetWorstAgreement(pooledGroups);

            // Only disagreements are checked here: the number of agreements is already known to
            // be acceptable, because vC had an acceptable number of agreements with each cluster
            // before this method was reached.
            if (worst.NumDisagreement > maxNumDisagreements)
            {
                return false;
            }

            return true;
        }
        /// <summary>
        /// Walks every pair of candidate clusters. A pair that passes <c>TestCanBeMerged</c>
        /// (together with <paramref name="testVeadGroup"/>) is removed from
        /// <paramref name="clusters"/> and replaced by its merge, which becomes the current
        /// winner. For a pair that cannot be merged, the second cluster becomes the winner only
        /// if it has more veads than the first.
        /// </summary>
        /// <param name="clusters">Cluster set updated in place when a pair is merged.</param>
        /// <param name="maxNumDisagreements">Disagreement limit forwarded to the merge test.</param>
        /// <param name="bestCandidates">Candidates to pair up; must contain at least one cluster.</param>
        /// <param name="testVeadGroup">Vead group included in every merge test.</param>
        /// <returns>The winner left after the last pair was examined.</returns>
        /// <remarks>
        /// <paramref name="bestCandidates"/> is not modified here, so after a merge the later
        /// pairs are still formed from the original candidate objects.
        /// NOTE(review): confirm that re-testing (and possibly re-merging) already merged
        /// candidates is intended.
        /// </remarks>
        public static Cluster MergeAllBestCandidates(SetOfClusters clusters, int maxNumDisagreements,
                                                     List <Cluster> bestCandidates, VeadGroup testVeadGroup)
        {
            var candidateCount = bestCandidates.Count;
            var winner         = bestCandidates[0];

            for (var first = 0; first < candidateCount; first++)
            {
                for (var second = first + 1; second < candidateCount; second++)
                {
                    var left  = bestCandidates[first];
                    var right = bestCandidates[second];

                    if (!TestCanBeMerged(left, right, maxNumDisagreements, testVeadGroup))
                    {
                        // Not mergeable: prefer the second cluster only when it is strictly
                        // larger; otherwise the current winner is kept.
                        if (right.NumVeads > left.NumVeads)
                        {
                            winner = right;
                        }

                        continue;
                    }

                    // Mergeable: swap the two clusters for their merge in the cluster set.
                    clusters.RemoveCluster(left.Name);
                    clusters.RemoveCluster(right.Name);

                    var merged = MergeClusters(left, right);
                    clusters.AddCluster(merged);
                    winner = merged;
                }
            }

            return winner;
        }
        //This unit test was made after we found bug ScyllaLoosingRefCalls_PICS-723.
        //We had a 1/. GT reported when it should have been 1/0.
        //The reason for this is that all the refs (the "0"s) got incorrectly sucked up.
        //I.e., MNV ACG -> AG claimed 50 refs, so we (incorrectly) subtracted 50 refs from it.
        //The bug is that the ref counts got subtracted from the exact same MNV that claimed them.
        //This should never happen, and was not the intent of the algorithm.
        //
        //The affected method is: CreateMnvsFromClusters in VcfNbhd
        public void CreateMnvsFromClusters_TakeUpRefCount()
        {
            // Checks ReferenceSupport on the called variants at position 123 in two passes:
            // first with an empty UsedRefCountsLookup, then with 100 ref counts at position 123
            // recorded as claimed by the first candidate variant.

            // --- Arrange: input alleles, variant sites, caller and neighborhood ---
            var originalVcfVariant1 = TestHelper.CreateDummyAllele("chr1", 123, "ACG", "AT", 1000, 156);
            var originalVcfVariant2 = TestHelper.CreateDummyAllele("chr1", 123, "A", "TTTTTT", 1000, 200);
            // NOTE(review): originalVcfVariant3 is never read again in this test - confirm whether it is still needed.
            var originalVcfVariant3 = TestHelper.CreateDummyAllele("chr1", 123, "AC", "TT", 1000, 100);

            var vs1 = new VariantSite(originalVcfVariant1);
            var vs2 = new VariantSite(originalVcfVariant2);

            var caller = new VariantCaller(new VariantCallingParameters(), new BamFilterParameters());
            var nbhd   = new VcfNeighborhood(new VariantCallingParameters(), 0, "chr1", vs1, vs2, "");


            nbhd.SetRangeOfInterest();

            // Three accepted phased variants at chr1:123, all with total coverage 1000:
            // an SNV (support 200), an MNV (support 300) and an insertion (support 250).
            nbhd.AddAcceptedPhasedVariant(
                new CalledAllele(AlleleCategory.Snv)
            {
                Chromosome        = "chr1",
                ReferencePosition = 123,
                ReferenceAllele   = "A",
                AlternateAllele   = "T",
                VariantQscore     = 100,
                TotalCoverage     = 1000,
                AlleleSupport     = 200
            });


            nbhd.AddAcceptedPhasedVariant(
                new CalledAllele(AlleleCategory.Mnv)
            {
                Chromosome        = "chr1",
                ReferencePosition = 123,
                ReferenceAllele   = "ACG",
                AlternateAllele   = "AT",
                VariantQscore     = 100,
                TotalCoverage     = 1000,
                AlleleSupport     = 300
            });

            nbhd.AddAcceptedPhasedVariant(
                new CalledAllele(AlleleCategory.Insertion)
            {
                Chromosome        = "chr1",
                ReferencePosition = 123,
                ReferenceAllele   = "A",
                AlternateAllele   = "AAAAA",
                VariantQscore     = 100,
                TotalCoverage     = 1000,
                AlleleSupport     = 250
            });


            // --- Pass 1: default behavior, nothing gets sucked up (empty lookup) ---
            nbhd.UsedRefCountsLookup = new Dictionary <int, SuckedUpRefRecord>()
            {
            };
            vs1.VcfReferencePosition = 123;

            // A single-vead, single-group cluster is enough to drive CreateMnvsFromClusters.
            var vead        = new Vead("dummy", new VariantSite[] { vs1 });
            var vg          = new VeadGroup(vead);
            var fakeCluster = new Cluster("test", new List <VeadGroup>()
            {
                vg
            });

            fakeCluster.ResetConsensus();
            nbhd.CreateMnvsFromClusters(new List <Cluster> {
                fakeCluster
            },
                                        20, 100);
            caller.CallMNVs(nbhd);
            caller.CallRefs(nbhd);

            var acceptedMNVs = nbhd.CalledVariants;
            var acceptedRefs = nbhd.CalledRefs;

            Assert.Equal(2, acceptedMNVs.Count);
            Assert.Equal(3, acceptedMNVs[123].Count);
            Assert.Equal(1, acceptedRefs.Count);

            //check the ref counts on all the MNVs. Nothing should be sucked up.
            Assert.Equal(1000 - 200, acceptedMNVs[123][0].ReferenceSupport);  //total depth - allele support (overly simple for now)
            Assert.Equal(1000 - 300, acceptedMNVs[123][1].ReferenceSupport);  //total depth - allele support (overly simple for now)
            Assert.Equal(1000 - 250, acceptedMNVs[123][2].ReferenceSupport);  //total depth - allele support (overly simple for now)

            // --- Pass 2: candidate variant 0 is recorded as having claimed 100 ref calls at position 123 ---
            var suckedUpRefRecord100 = new SuckedUpRefRecord()
            {
                Counts = 100, AlleleThatClaimedIt = nbhd.CandidateVariants[0]
            };

            nbhd.UsedRefCountsLookup = new Dictionary <int, SuckedUpRefRecord>()
            {
                { 123, suckedUpRefRecord100 }
            };
            nbhd.CreateMnvsFromClusters(new List <Cluster> {
                fakeCluster
            },
                                        20, 100);

            caller.CallMNVs(nbhd);
            caller.CallRefs(nbhd);

            acceptedMNVs = nbhd.CalledVariants;
            acceptedRefs = nbhd.CalledRefs;


            //check the ref counts on all the MNVs. The first variant keeps its full reference
            //support; presumably it is the allele that claimed the refs (CandidateVariants[0]) - confirm.
            Assert.Equal(1000 - 200, acceptedMNVs[123][0].ReferenceSupport);  //total depth - allele support (unchanged from pass 1)

            //old result - has bug
            //Assert.Equal(1000 - 300, acceptedMNVs[123][1].ReferenceSupport);  //total depth - allele support
            //Assert.Equal(1000 - 250, acceptedMNVs[123][2].ReferenceSupport);  //total depth - allele support

            //new result, fixed: the other two variants lose the 100 claimed ref counts
            Assert.Equal(1000 - 300 - 100, acceptedMNVs[123][1].ReferenceSupport);  //total depth - allele support - sucked up ref
            Assert.Equal(1000 - 250 - 100, acceptedMNVs[123][2].ReferenceSupport);  //total depth - allele support - sucked up ref
        }
Exemple #6
0
        /// <summary>
        /// Runs the whole phasing pass: builds the neighborhoods, queues one processing job per
        /// (non-filtered) neighborhood, runs the jobs, then calls MNVs and refs for each
        /// neighborhood and writes the phased vcf.
        /// </summary>
        /// <param name="maxThreads">Passed to the <c>JobManager</c> that runs the neighborhood jobs.</param>
        public override void InternalExecute(int maxThreads)
        {
            var startTime = DateTime.UtcNow;

            Logger.WriteToLog("Start processing.");

            // Package proximal variants into neighborhoods
            Logger.WriteToLog("Building neighborhoods.");

            // Snapshot the neighborhoods once. They are walked three times below (count, job
            // creation, writing), and the write pass must see the very same instances that the
            // jobs processed, which a lazily produced sequence would not guarantee.
            var neighborhoods = _factory.CreateNeighborhoodBuilder().GetNeighborhoods().ToList();

            Logger.WriteToLog(string.Format("Neighborhood building complete. {0} neighborhoods created.",
                                            neighborhoods.Count));

            //Then process each neighborhood separately

            var jobManager = new JobManager(maxThreads);

            var jobs = new List <IJob>();

            foreach (var vcfNeighborhood in neighborhoods)
            {
                // When a neighborhood filter is set, only the matching neighborhood gets a job.
                if (!string.IsNullOrEmpty(_factory.FilteredNbhd) && (_factory.FilteredNbhd != vcfNeighborhood.Id))
                {
                    continue;
                }

                Logger.WriteToLog("Creating Neighborhood: {0}", vcfNeighborhood.Id);


                var clusterer       = _factory.CreateNeighborhoodClusterer();
                var veadGroupSource = _factory.CreateVeadGroupSource();
                var collapsedReads  = veadGroupSource.GetVeadGroups(vcfNeighborhood);

                // The debug tables take their position header from the first read group. Using
                // FirstOrDefault keeps an empty group set from throwing; the tables are simply
                // skipped in that case (and whenever debug mode is off).
                var firstGroup = _factory.DebugMode ? collapsedReads.FirstOrDefault() : null;
                var logDetails = firstGroup != null;

                if (logDetails)
                {
                    Logger.WriteToLog("variant-compressed read groups as follows:  ");

                    // The header is concatenated: passing the positions as a second argument
                    // treats "count\t" as a format string with no placeholder, which drops them.
                    Logger.WriteToLog("count" + "\t" + firstGroup.ToPositions());
                    foreach (var vG in collapsedReads)
                    {
                        Logger.WriteToLog("\t" + vG.NumVeads + "\t" + vG);
                    }
                }

                Logger.WriteToLog("Found " + collapsedReads.Count() + " variant-collapsed read groups.");

                if (logDetails)
                {
                    int[] depths = VeadGroup.DepthAtSites(collapsedReads);
                    Logger.WriteToLog("depth at sites:  ");
                    Logger.WriteToLog(firstGroup.ToPositions());
                    Logger.WriteToLog(string.Join("\t", depths));
                }

                jobs.Add(new GenericJob(() => ProcessNeighborhood(vcfNeighborhood, clusterer, collapsedReads)));
            }

            jobManager.Process(jobs);

            // Finally, come back and combine the information for results
            //  Add back everything that wasn't sucked up into MNVs
            //  Some adjustment for refs in MNVs or something
            //  Write VCF


            //(3) write results

            var variantCaller = _factory.CreateVariantCaller();
            var variantMerger = _factory.CreateVariantMerger();

            using (var writer = _factory.CreatePhasedVcfWriter())
            {
                Logger.WriteToLog("Writing phased vcf.");
                writer.WriteHeader();

                //do this in chunks to avoid having all variants in memory at all times.
                // The list carries the original alleles that each merger call hands back, so the
                // next call can continue from them.
                var originalAllelesTrailingNeighbhood = new List <CalledAllele>();
                foreach (var nbhd in neighborhoods)
                {
                    Logger.WriteToLog("Writing original variants up to neighborhood " + nbhd.Id);
                    originalAllelesTrailingNeighbhood = variantMerger.WriteVariantsUptoChr(writer, originalAllelesTrailingNeighbhood, nbhd.ReferenceName);

                    Logger.WriteToLog("Writing phased variants inside neighborhood " + nbhd.Id);
                    variantCaller.CallMNVs(nbhd);
                    variantCaller.CallRefs(nbhd);
                    originalAllelesTrailingNeighbhood = variantMerger.WriteVariantsUptoIncludingNbhd(nbhd,
                                                                                                     writer, originalAllelesTrailingNeighbhood);
                }

                Logger.WriteToLog("Writing variants past last neighborhood");

                variantMerger.WriteRemainingVariants(writer, originalAllelesTrailingNeighbhood);
            }

            Logger.WriteToLog("Completed processing in {0}s.",
                              DateTime.UtcNow.Subtract(startTime).TotalSeconds);
        }
        /// <summary>
        /// Runs <c>ExecuteClusteringTest</c> against three fixture sets: four identical reads,
        /// six reads from "Sample 129", and ten reads in four groups (the last set is run four
        /// times, three of them with an explicit <c>ploidyConstraint</c>).
        /// </summary>
        /// <remarks>
        /// Each call passes the input vead groups, the expected grouping into clusters, a list
        /// of consensus strings and one or two integers.
        /// NOTE(review): the meaning of the trailing integer arguments is not visible here -
        /// confirm against <c>ExecuteClusteringTest</c>.
        /// </remarks>
        public void ClusterVeadGroups()
        {
            // ----------------------------------------------------
            // Four Ns
            //  - This is from original "FourNs Test"
            // ----------------------------------------------------

            // Four identical reads, collapsed into a single vead group below.
            var veads = new List <Vead>()
            {
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r1", new string[2, 2] {
                    { "C", "C" }, { "G", "N" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r2", new string[2, 2] {
                    { "C", "C" }, { "G", "N" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r3", new string[2, 2] {
                    { "C", "C" }, { "G", "N" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r4", new string[2, 2] {
                    { "C", "C" }, { "G", "N" }
                }),
            };

            var veadgroup = PhasedVariantTestUtilities.CreateVeadGroup(veads);

            // One input group, expected to come back as one cluster containing that group.
            ExecuteClusteringTest(new List <VeadGroup>()
            {
                veadgroup
            },
                                  new List <List <VeadGroup> >
            {
                new List <VeadGroup> {
                    veadgroup
                }
            }, new List <string>()
            {
                "C>C,G>N"
            }
                                  , 1);


            // ----------------------------------------------------
            // Real Data
            //  - This data is from Sample 129 (original "Sample129Test")
            // ----------------------------------------------------

            // NOTE(review): this reassigned list is not read again in this method; the groups
            // below are built from freshly created veads - confirm whether it is still needed.
            veads = new List <Vead>()
            {
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r1", new string[2, 2] {
                    { "A", "G" }, { "N", "N" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r2", new string[2, 2] {
                    { "A", "G" }, { "C", "C" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r3", new string[2, 2] {
                    { "A", "A" }, { "C", "C" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r4", new string[2, 2] {
                    { "A", "G" }, { "C", "A" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r5", new string[2, 2] {
                    { "N", "N" }, { "C", "C" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r6", new string[2, 2] {
                    { "N", "N" }, { "C", "A" }
                }),
            };

            // One single-vead group per read. They are declared in the order of the expected
            // clusters below: (group1, group4, group6), then group2, then (group3, group5).
            var group1 = new VeadGroup(PhasedVariantTestUtilities.CreateVeadFromStringArray("r1", new string[2, 2] {
                { "A", "G" }, { "N", "N" }
            }));
            var group4 = new VeadGroup(PhasedVariantTestUtilities.CreateVeadFromStringArray("r4", new string[2, 2] {
                { "A", "G" }, { "C", "A" }
            }));
            var group6 = new VeadGroup(PhasedVariantTestUtilities.CreateVeadFromStringArray("r6", new string[2, 2] {
                { "N", "N" }, { "C", "A" }
            }));

            var group2 = new VeadGroup(PhasedVariantTestUtilities.CreateVeadFromStringArray("r2", new string[2, 2] {
                { "A", "G" }, { "C", "C" }
            }));

            var group3 = new VeadGroup(PhasedVariantTestUtilities.CreateVeadFromStringArray("r3", new string[2, 2] {
                { "A", "A" }, { "C", "C" }
            }));
            var group5 = new VeadGroup(PhasedVariantTestUtilities.CreateVeadFromStringArray("r5", new string[2, 2] {
                { "N", "N" }, { "C", "C" }
            }));

            // Six input groups, expected to form three clusters.
            ExecuteClusteringTest(new List <VeadGroup>()
            {
                group1, group2, group3, group4, group5, group6
            },
                                  new List <List <VeadGroup> >
            {
                new List <VeadGroup> {
                    group4, group6, group1
                },
                new List <VeadGroup> {
                    group3, group5
                },
                new List <VeadGroup> {
                    group2
                },
            },
                                  new List <string>()
            {
                "A>G,C>A", "A>G,C>C", "A>A,C>C"
            }
                                  , 1, 0);

            // ----------------------------------------------------
            // Ten grouped reads
            //  - This is from original "10 ReadsTest"
            // ----------------------------------------------------

            // The group variables are reused for this fixture set:
            // group1 = 3 reads (r1, r2, r5), group2 = 5 reads (r3, r4, r7, r8, r9),
            // group3 = 1 read (r10), group4 = 1 read (r6).
            group1 = PhasedVariantTestUtilities.CreateVeadGroup(new List <Vead>
            {
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r1", new string[6, 2] {
                    { "N", "N" }, { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "C", "A" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r2", new string[6, 2] {
                    { "N", "N" }, { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "C", "A" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r5", new string[6, 2] {
                    { "N", "N" }, { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "C", "A" }
                }),
            });
            group2 = PhasedVariantTestUtilities.CreateVeadGroup(new List <Vead>
            {
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r3", new string[6, 2] {
                    { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "N", "N" }, { "C", "A" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r4", new string[6, 2] {
                    { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "N", "N" }, { "C", "A" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r7", new string[6, 2] {
                    { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "N", "N" }, { "C", "A" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r8", new string[6, 2] {
                    { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "N", "N" }, { "C", "A" }
                }),
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r9", new string[6, 2] {
                    { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "N", "N" }, { "C", "A" }
                }),
            });
            group3 = PhasedVariantTestUtilities.CreateVeadGroup(new List <Vead>
            {
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r10", new string[6, 2] {
                    { "C", "A" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "N", "N" }, { "C", "A" }
                }),
            });
            group4 = PhasedVariantTestUtilities.CreateVeadGroup(new List <Vead>
            {
                PhasedVariantTestUtilities.CreateVeadFromStringArray("r6", new string[6, 2] {
                    { "C", "C" }, { "C", "C" }, { "C", "C" }, { "C", "C" }, { "C", "C" }, { "C", "C" }
                }),
            });

            // No ploidy constraint passed: three clusters expected.
            ExecuteClusteringTest(new List <VeadGroup>()
            {
                group1, group2, group3, group4
            },
                                  new List <List <VeadGroup> >
            {
                new List <VeadGroup> {
                    group1
                },
                new List <VeadGroup> {
                    group2, group3
                },
                new List <VeadGroup> {
                    group4
                },
            }
                                  , new List <string>()
            {
                "N>N,N>N,C>A,C>A,C>A,C>A", "C>A,C>A,C>A,C>A,N>N,C>A", "C>C,C>C,C>C,C>C,C>C,C>C"
            }, 4, 0);


            // ploidyConstraint: 3 - the same three clusters are expected.
            ExecuteClusteringTest(new List <VeadGroup>()
            {
                group1, group2, group3, group4
            },
                                  new List <List <VeadGroup> >
            {
                new List <VeadGroup> {
                    group1
                },
                new List <VeadGroup> {
                    group2, group3
                },
                new List <VeadGroup> {
                    group4
                },
            }
                                  , new List <string>()
            {
                "N>N,N>N,C>A,C>A,C>A,C>A", "C>A,C>A,C>A,C>A,N>N,C>A", "C>C,C>C,C>C,C>C,C>C,C>C"
            }, 4, 0,
                                  ploidyConstraint: 3);


            // ploidyConstraint: 2 - only two clusters are expected; the group4 cluster is dropped.
            // NOTE(review): by the fixtures above group1 holds 3 reads and group2+group3 hold 6,
            // so the read counts in the comments below look swapped - confirm.
            // NOTE(review): three consensus strings are still passed although only two clusters
            // are expected - confirm this is what ExecuteClusteringTest wants.
            ExecuteClusteringTest(new List <VeadGroup>()
            {
                group1, group2, group3, group4
            },
                                  new List <List <VeadGroup> >
            {
                new List <VeadGroup> {
                    group1
                },                    // 6 reads
                new List <VeadGroup> {
                    group2, group3
                },                           //3 reads
                //new List<VeadGroup>{group4}, //1 read -> the loser
            }
                                  , new List <string>()
            {
                "N>N,N>N,C>A,C>A,C>A,C>A", "C>A,C>A,C>A,C>A,N>N,C>A", "C>C,C>C,C>C,C>C,C>C,C>C"
            }, 4, 0,
                                  ploidyConstraint: 2);

            // ploidyConstraint: 1 - a single cluster (group1) is expected.
            // NOTE(review): group1 is built from 3 reads above, not 6, and group2+group3 together
            // hold more reads - confirm that group1 is really the expected winner.
            ExecuteClusteringTest(new List <VeadGroup>()
            {
                group1, group2, group3, group4
            },
                                  new List <List <VeadGroup> >
            {
                new List <VeadGroup> {
                    group1
                },                    // 6 reads -> the winner
            }
                                  , new List <string>()
            {
                "N>N,N>N,C>A,C>A,C>A,C>A", "C>A,C>A,C>A,C>A,N>N,C>A", "C>C,C>C,C>C,C>C,C>C,C>C"
            }, 4, 0,
                                  ploidyConstraint: 1);
        }