/// <summary>
/// Logs a summary of the variant-collapsed read groups. The group count is always
/// logged; when <c>_debugMode</c> is set, every group and the depth at each site
/// are logged as well.
/// </summary>
/// <param name="collapsedReads">The collapsed read groups to describe.</param>
private void LogVeadGroupInfo(IEnumerable<VeadGroup> collapsedReads)
{
    // The sequence is walked several times below (First, foreach, Count, DepthAtSites),
    // so materialize it once unless the caller already handed us a collection.
    var groups = collapsedReads as ICollection<VeadGroup> ?? collapsedReads.ToList();

    // First() throws on an empty sequence; a logging helper should not take the
    // caller down, so the detailed output is skipped when there is nothing to show.
    var logDetails = _debugMode && groups.Count > 0;

    if (logDetails)
    {
        Logger.WriteToLog("variant-compressed read groups as follows: ");

        // NOTE(review): the positions are passed as a format argument but the format
        // string has no placeholder - confirm whether they are meant to be printed.
        Logger.WriteToLog("count" + "\t", groups.First().ToPositions());

        foreach (var vG in groups)
        {
            Logger.WriteToLog("\t" + vG.NumVeads + "\t" + vG);
        }
    }

    Logger.WriteToLog("Found " + groups.Count + " variant-collapsed read groups.");

    if (logDetails)
    {
        int[] depths;
        int[] nocalls; // required by the out signature; not logged here
        VeadGroup.DepthAtSites(groups, out depths, out nocalls);

        Logger.WriteToLog("depth at sites: ");
        Logger.WriteToLog(groups.First().ToPositions());

        // One log entry per site depth.
        for (int i = 0; i < depths.Length; i++)
        {
            Logger.WriteToLog(string.Join("\t", depths[i]));
        }
    }
}
/// <summary>
/// Builds one <see cref="VeadGroup"/> from a list of veads: the group is constructed
/// from the first vead and every following vead is added to it as support.
/// </summary>
/// <param name="veads">Veads to fold into a single group.</param>
/// <returns>The group containing all supplied veads.</returns>
public static VeadGroup CreateVeadGroup(List<Vead> veads)
{
    // First() is used for the seed so an empty list fails exactly as before.
    var group = new VeadGroup(veads.First());

    // Index 0 is already represented by the seed above.
    for (var index = 1; index < veads.Count; index++)
    {
        group.AddSupport(veads[index]);
    }

    return group;
}
/// <summary>
/// Decides whether two clusters (optionally together with one extra vead group)
/// can be merged, based on the worst pairwise agreement among all their vead groups.
/// </summary>
/// <param name="clusterA">First cluster.</param>
/// <param name="clusterB">Second cluster.</param>
/// <param name="maxNumDisagreements">Largest number of disagreements still tolerated.</param>
/// <param name="veadGroupC">Optional additional vead group to include in the check.</param>
/// <returns><c>true</c> unless the worst agreement exceeds <paramref name="maxNumDisagreements"/>.</returns>
private static bool TestCanBeMerged(ICluster clusterA, ICluster clusterB, int maxNumDisagreements, VeadGroup veadGroupC = null)
{
    // Pool every vead group that would end up in the merged cluster.
    var pooledGroups = new List<VeadGroup>(clusterA.GetVeadGroups());
    pooledGroups.AddRange(clusterB.GetVeadGroups());

    if (veadGroupC != null)
    {
        pooledGroups.Add(veadGroupC);
    }

    var worst = VeadGroup.GetWorstAgreement(pooledGroups);

    // Only disagreements are checked here: per the original author, the number of
    // agreements was already found acceptable before this method is called.
    if (worst.NumDisagreement > maxNumDisagreements)
    {
        return false;
    }

    return true;
}
/// <summary>
/// Walks every pair of candidate clusters. A pair that passes
/// <c>TestCanBeMerged</c> is removed from <paramref name="clusters"/> and replaced by
/// the merged cluster; for a pair that fails, the second cluster becomes the current
/// best when it has more veads than the first.
/// </summary>
/// <param name="clusters">Cluster set that is updated in place when a merge happens.</param>
/// <param name="maxNumDisagreements">Disagreement limit forwarded to the merge test.</param>
/// <param name="bestCandidates">Candidate clusters; must contain at least one entry.</param>
/// <param name="testVeadGroup">Vead group included in every merge test.</param>
/// <returns>The cluster selected by the last pair that changed the current best.</returns>
public static Cluster MergeAllBestCandidates(SetOfClusters clusters, int maxNumDisagreements, List<Cluster> bestCandidates, VeadGroup testVeadGroup)
{
    var candidateCount = bestCandidates.Count;
    var currentBest = bestCandidates[0];

    // NOTE(review): bestCandidates is not updated after a merge, so later pairs can
    // still reference clusters that were already removed from the set - confirm
    // whether that is intended.
    for (var first = 0; first < candidateCount; first++)
    {
        for (var second = first + 1; second < candidateCount; second++)
        {
            var clusterA = bestCandidates[first];
            var clusterB = bestCandidates[second];

            if (!TestCanBeMerged(clusterA, clusterB, maxNumDisagreements, testVeadGroup))
            {
                // Not mergeable: only switch when B is strictly larger than A;
                // otherwise the current best is left untouched.
                if (clusterB.NumVeads > clusterA.NumVeads)
                {
                    currentBest = clusterB;
                }

                continue;
            }

            // Mergeable: swap the two originals for their union.
            clusters.RemoveCluster(clusterA.Name);
            clusters.RemoveCluster(clusterB.Name);

            var merged = MergeClusters(clusterA, clusterB);
            clusters.AddCluster(merged);
            currentBest = merged;
        }
    }

    return currentBest;
}
// This unit test was made after we found bug ScyllaLoosingRefCalls_PICS-723.
// We had a 1/. GT reported when it should be 1/0.
// The reason for this is that all the refs (the "0"s) got incorrectly sucked up.
// I.e., MNV ACG -> AG claimed 50 refs, so we (incorrectly) subtracted 50 refs from it.
// The bug is that the ref counts got subtracted from the exact same MNV that claimed them.
// This should never happen, and was not the intent of the algorithm.
//
// The affected method is: CreateMnvsFromClusters in VcfNbhd
public void CreateMnvsFromClusters_TakeUpRefCount()
{
    var originalVcfVariant1 = TestHelper.CreateDummyAllele("chr1", 123, "ACG", "AT", 1000, 156);
    var originalVcfVariant2 = TestHelper.CreateDummyAllele("chr1", 123, "A", "TTTTTT", 1000, 200);

    var vs1 = new VariantSite(originalVcfVariant1);
    var vs2 = new VariantSite(originalVcfVariant2);

    var caller = new VariantCaller(new VariantCallingParameters(), new BamFilterParameters());
    var nbhd = new VcfNeighborhood(new VariantCallingParameters(), 0, "chr1", vs1, vs2, "");
    nbhd.SetRangeOfInterest();

    // Three accepted phased variants, all at position 123 with total coverage 1000.
    nbhd.AddAcceptedPhasedVariant(
        new CalledAllele(AlleleCategory.Snv)
        {
            Chromosome = "chr1",
            ReferencePosition = 123,
            ReferenceAllele = "A",
            AlternateAllele = "T",
            VariantQscore = 100,
            TotalCoverage = 1000,
            AlleleSupport = 200
        });

    nbhd.AddAcceptedPhasedVariant(
        new CalledAllele(AlleleCategory.Mnv)
        {
            Chromosome = "chr1",
            ReferencePosition = 123,
            ReferenceAllele = "ACG",
            AlternateAllele = "AT",
            VariantQscore = 100,
            TotalCoverage = 1000,
            AlleleSupport = 300
        });

    nbhd.AddAcceptedPhasedVariant(
        new CalledAllele(AlleleCategory.Insertion)
        {
            Chromosome = "chr1",
            ReferencePosition = 123,
            ReferenceAllele = "A",
            AlternateAllele = "AAAAA",
            VariantQscore = 100,
            TotalCoverage = 1000,
            AlleleSupport = 250
        });

    // Default behavior, nothing gets sucked up.
    nbhd.UsedRefCountsLookup = new Dictionary<int, SuckedUpRefRecord>() { };

    vs1.VcfReferencePosition = 123;
    var vead = new Vead("dummy", new VariantSite[] { vs1 });
    var vg = new VeadGroup(vead);
    var fakeCluster = new Cluster("test", new List<VeadGroup>() { vg });
    fakeCluster.ResetConsensus();

    nbhd.CreateMnvsFromClusters(new List<Cluster> { fakeCluster }, 20, 100);
    caller.CallMNVs(nbhd);
    caller.CallRefs(nbhd);

    var acceptedMNVs = nbhd.CalledVariants;
    var acceptedRefs = nbhd.CalledRefs;

    Assert.Equal(2, acceptedMNVs.Count);
    Assert.Equal(3, acceptedMNVs[123].Count);
    Assert.Equal(1, acceptedRefs.Count);

    // Check the ref counts on all the MNVs. Nothing should be sucked up.
    // Expected value is total depth - allele support (overly simple for now).
    Assert.Equal(1000 - 200, acceptedMNVs[123][0].ReferenceSupport);
    Assert.Equal(1000 - 300, acceptedMNVs[123][1].ReferenceSupport);
    Assert.Equal(1000 - 250, acceptedMNVs[123][2].ReferenceSupport);

    // Now variant 0 will suck up 100 ref calls:
    var suckedUpRefRecord100 = new SuckedUpRefRecord() { Counts = 100, AlleleThatClaimedIt = nbhd.CandidateVariants[0] };
    nbhd.UsedRefCountsLookup = new Dictionary<int, SuckedUpRefRecord>() { { 123, suckedUpRefRecord100 } };

    nbhd.CreateMnvsFromClusters(new List<Cluster> { fakeCluster }, 20, 100);
    caller.CallMNVs(nbhd);
    caller.CallRefs(nbhd);

    acceptedMNVs = nbhd.CalledVariants;
    acceptedRefs = nbhd.CalledRefs;

    // Check the ref counts on all the MNVs. Refs should only be taken up by the first one,
    // so its reference support is still total depth - allele support.
    Assert.Equal(1000 - 200, acceptedMNVs[123][0].ReferenceSupport);

    // Old result (had the bug): the other two were expected to stay at
    //   1000 - 300 and 1000 - 250.
    // New result (fixed): total depth - allele support - sucked up ref.
    Assert.Equal(1000 - 300 - 100, acceptedMNVs[123][1].ReferenceSupport);
    Assert.Equal(1000 - 250 - 100, acceptedMNVs[123][2].ReferenceSupport);
}
/// <summary>
/// Runs the whole phasing pass: builds neighborhoods, queues one processing job per
/// neighborhood, runs the jobs, then calls MNVs/refs per neighborhood and writes the
/// phased VCF interleaved with the original variants.
/// </summary>
/// <param name="maxThreads">Value handed to the <c>JobManager</c> that runs the neighborhood jobs.</param>
public override void InternalExecute(int maxThreads)
{
    var startTime = DateTime.UtcNow;
    Logger.WriteToLog("Start processing.");

    // Package proximal variants into neighborhoods
    Logger.WriteToLog("Building neighborhoods.");
    var neighborhoods = _factory.CreateNeighborhoodBuilder().GetNeighborhoods();

    // NOTE(review): neighborhoods is enumerated three times in this method (Count,
    // job creation, writing). That is only safe if GetNeighborhoods() returns a
    // materialized collection - confirm, or snapshot it with ToArray().
    Logger.WriteToLog(string.Format("Neighborhood building complete. {0} neighborhoods created.", neighborhoods.Count()));

    // Then process each neighborhood separately
    var jobManager = new JobManager(maxThreads);
    var jobs = new List<IJob>();

    foreach (var vcfNeighborhood in neighborhoods)
    {
        // When a neighborhood filter is configured, only the matching neighborhood is processed.
        if (!string.IsNullOrEmpty(_factory.FilteredNbhd) && (_factory.FilteredNbhd != vcfNeighborhood.Id))
        {
            continue;
        }

        Logger.WriteToLog("Creating Neighborhood: {0}", vcfNeighborhood.Id);

        var clusterer = _factory.CreateNeighborhoodClusterer();
        var veadGroupSource = _factory.CreateVeadGroupSource();
        var collapsedReads = veadGroupSource.GetVeadGroups(vcfNeighborhood);

        if (_factory.DebugMode)
        {
            Logger.WriteToLog("variant-compressed read groups as follows: ");

            // NOTE(review): the positions are passed as a format argument but the format
            // string has no placeholder - confirm whether they are meant to be printed.
            Logger.WriteToLog("count" + "\t", collapsedReads.First().ToPositions());

            foreach (var vG in collapsedReads)
            {
                Logger.WriteToLog("\t" + vG.NumVeads + "\t" + vG);
            }
        }

        Logger.WriteToLog("Found " + collapsedReads.Count() + " variant-collapsed read groups.");

        if (_factory.DebugMode)
        {
            int[] depths = VeadGroup.DepthAtSites(collapsedReads);
            Logger.WriteToLog("depth at sites: ");
            Logger.WriteToLog(collapsedReads.First().ToPositions());
            Logger.WriteToLog(string.Join("\t", depths));
        }

        // The job captures this iteration's neighborhood, clusterer and read groups.
        jobs.Add(new GenericJob(() => ProcessNeighborhood(vcfNeighborhood, clusterer, collapsedReads)));
    }

    jobManager.Process(jobs);

    // Finally, come back and combine the information for results:
    //  - add back everything that wasn't sucked up into MNVs
    //  - adjust refs for MNVs
    //  - write the VCF
    var variantCaller = _factory.CreateVariantCaller();
    var variantMerger = _factory.CreateVariantMerger();

    using (var writer = _factory.CreatePhasedVcfWriter())
    {
        Logger.WriteToLog("Writing phased vcf.");
        writer.WriteHeader();

        // Do this in chunks to avoid having all variants in memory at all times.
        // The list carries the original alleles not yet written from one step to the next.
        var originalAllelesTrailingNeighbhood = new List<CalledAllele>();

        foreach (var nbhd in neighborhoods)
        {
            Logger.WriteToLog("Writing original variants up to neighborhood " + nbhd.Id);
            originalAllelesTrailingNeighbhood = variantMerger.WriteVariantsUptoChr(writer, originalAllelesTrailingNeighbhood, nbhd.ReferenceName);

            Logger.WriteToLog("Writing phased variants inside neighborhood " + nbhd.Id);
            variantCaller.CallMNVs(nbhd);
            variantCaller.CallRefs(nbhd);
            originalAllelesTrailingNeighbhood = variantMerger.WriteVariantsUptoIncludingNbhd(nbhd, writer, originalAllelesTrailingNeighbhood);
        }

        Logger.WriteToLog("Writing variants past last neighborhood");
        variantMerger.WriteRemainingVariants(writer, originalAllelesTrailingNeighbhood);
    }

    Logger.WriteToLog("Completed processing in {0}s.", DateTime.UtcNow.Subtract(startTime).TotalSeconds);
}
// Exercises vead-group clustering through ExecuteClusteringTest with three data sets:
// a single "four Ns" group, a small real-data sample, and ten grouped reads that are
// re-run under different ploidy constraints.
public void ClusterVeadGroups()
{
    // ----------------------------------------------------
    // Four Ns
    //  - This is from original "FourNs Test"
    // ----------------------------------------------------
    var veads = new List<Vead>()
    {
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r1", new string[2, 2] { { "C", "C" }, { "G", "N" } }),
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r2", new string[2, 2] { { "C", "C" }, { "G", "N" } }),
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r3", new string[2, 2] { { "C", "C" }, { "G", "N" } }),
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r4", new string[2, 2] { { "C", "C" }, { "G", "N" } }),
    };

    var veadgroup = PhasedVariantTestUtilities.CreateVeadGroup(veads);

    ExecuteClusteringTest(
        new List<VeadGroup>() { veadgroup },
        new List<List<VeadGroup>>
        {
            new List<VeadGroup> { veadgroup }
        },
        new List<string>() { "C>C,G>N" },
        1);

    // ----------------------------------------------------
    // Real Data
    //  - This data is from Sample 129 (original "Sample129Test")
    //  - Each read r1..r6 forms its own single-vead group.
    // ----------------------------------------------------
    var group1 = new VeadGroup(PhasedVariantTestUtilities.CreateVeadFromStringArray("r1", new string[2, 2] { { "A", "G" }, { "N", "N" } }));
    var group4 = new VeadGroup(PhasedVariantTestUtilities.CreateVeadFromStringArray("r4", new string[2, 2] { { "A", "G" }, { "C", "A" } }));
    var group6 = new VeadGroup(PhasedVariantTestUtilities.CreateVeadFromStringArray("r6", new string[2, 2] { { "N", "N" }, { "C", "A" } }));
    var group2 = new VeadGroup(PhasedVariantTestUtilities.CreateVeadFromStringArray("r2", new string[2, 2] { { "A", "G" }, { "C", "C" } }));
    var group3 = new VeadGroup(PhasedVariantTestUtilities.CreateVeadFromStringArray("r3", new string[2, 2] { { "A", "A" }, { "C", "C" } }));
    var group5 = new VeadGroup(PhasedVariantTestUtilities.CreateVeadFromStringArray("r5", new string[2, 2] { { "N", "N" }, { "C", "C" } }));

    ExecuteClusteringTest(
        new List<VeadGroup>() { group1, group2, group3, group4, group5, group6 },
        new List<List<VeadGroup>>
        {
            new List<VeadGroup> { group4, group6, group1 },
            new List<VeadGroup> { group3, group5 },
            new List<VeadGroup> { group2 },
        },
        new List<string>() { "A>G,C>A", "A>G,C>C", "A>A,C>C" },
        1, 0);

    // ----------------------------------------------------
    // Ten grouped reads
    //  - This is from original "10 ReadsTest"
    // ----------------------------------------------------
    group1 = PhasedVariantTestUtilities.CreateVeadGroup(new List<Vead>
    {
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r1", new string[6, 2] { { "N", "N" }, { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "C", "A" } }),
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r2", new string[6, 2] { { "N", "N" }, { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "C", "A" } }),
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r5", new string[6, 2] { { "N", "N" }, { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "C", "A" } }),
    });

    group2 = PhasedVariantTestUtilities.CreateVeadGroup(new List<Vead>
    {
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r3", new string[6, 2] { { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "N", "N" }, { "C", "A" } }),
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r4", new string[6, 2] { { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "N", "N" }, { "C", "A" } }),
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r7", new string[6, 2] { { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "N", "N" }, { "C", "A" } }),
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r8", new string[6, 2] { { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "N", "N" }, { "C", "A" } }),
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r9", new string[6, 2] { { "N", "N" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "N", "N" }, { "C", "A" } }),
    });

    group3 = PhasedVariantTestUtilities.CreateVeadGroup(new List<Vead>
    {
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r10", new string[6, 2] { { "C", "A" }, { "C", "A" }, { "C", "A" }, { "C", "A" }, { "N", "N" }, { "C", "A" } }),
    });

    group4 = PhasedVariantTestUtilities.CreateVeadGroup(new List<Vead>
    {
        PhasedVariantTestUtilities.CreateVeadFromStringArray("r6", new string[6, 2] { { "C", "C" }, { "C", "C" }, { "C", "C" }, { "C", "C" }, { "C", "C" }, { "C", "C" } }),
    });

    // No ploidy constraint: three clusters expected.
    ExecuteClusteringTest(
        new List<VeadGroup>() { group1, group2, group3, group4 },
        new List<List<VeadGroup>>
        {
            new List<VeadGroup> { group1 },
            new List<VeadGroup> { group2, group3 },
            new List<VeadGroup> { group4 },
        },
        new List<string>() { "N>N,N>N,C>A,C>A,C>A,C>A", "C>A,C>A,C>A,C>A,N>N,C>A", "C>C,C>C,C>C,C>C,C>C,C>C" },
        4, 0);

    // Ploidy constraint of 3: the same three clusters are expected.
    ExecuteClusteringTest(
        new List<VeadGroup>() { group1, group2, group3, group4 },
        new List<List<VeadGroup>>
        {
            new List<VeadGroup> { group1 },
            new List<VeadGroup> { group2, group3 },
            new List<VeadGroup> { group4 },
        },
        new List<string>() { "N>N,N>N,C>A,C>A,C>A,C>A", "C>A,C>A,C>A,C>A,N>N,C>A", "C>C,C>C,C>C,C>C,C>C,C>C" },
        4, 0, ploidyConstraint: 3);

    // Ploidy constraint of 2: the single-read cluster { group4 } is the loser and is
    // no longer expected.
    // NOTE(review): the original comments labelled { group1 } as "6 reads" and
    // { group2, group3 } as "3 reads", but as built above group1 holds 3 veads and
    // group2 + group3 hold 6 - confirm which ranking the clusterer applies.
    ExecuteClusteringTest(
        new List<VeadGroup>() { group1, group2, group3, group4 },
        new List<List<VeadGroup>>
        {
            new List<VeadGroup> { group1 },
            new List<VeadGroup> { group2, group3 },
        },
        new List<string>() { "N>N,N>N,C>A,C>A,C>A,C>A", "C>A,C>A,C>A,C>A,N>N,C>A", "C>C,C>C,C>C,C>C,C>C,C>C" },
        4, 0, ploidyConstraint: 2);

    // Ploidy constraint of 1: only { group1 } is expected to remain ("the winner" in
    // the original comment; see the read-count note above).
    ExecuteClusteringTest(
        new List<VeadGroup>() { group1, group2, group3, group4 },
        new List<List<VeadGroup>>
        {
            new List<VeadGroup> { group1 },
        },
        new List<string>() { "N>N,N>N,C>A,C>A,C>A,C>A", "C>A,C>A,C>A,C>A,N>N,C>A", "C>C,C>C,C>C,C>C,C>C,C>C" },
        4, 0, ploidyConstraint: 1);
}