/// <summary>
/// Merges pairs of clusters in <paramref name="clusterSet"/> until no remaining pair
/// passes <c>TestCanBeMerged</c>.
/// </summary>
/// <param name="clusterSet">
/// Set that is updated in place: for every merge the two source clusters are removed by name
/// and the merged cluster is added.
/// </param>
/// <param name="maxNumDisagreements">Tolerance forwarded unchanged to <c>TestCanBeMerged</c>.</param>
public static void TryMergeAllClusters(SetOfClusters clusterSet, int maxNumDisagreements)
{
    // Work on a local list that is kept in step with every merge. The previous version kept
    // indexing the list it had read before the first merge, so a cluster that had already been
    // removed from the set could be tested - and merged - a second time.
    var remaining = clusterSet.Clusters.ToList();

    var mergedSomething = true;
    while (mergedSomething)
    {
        mergedSomething = false;

        for (var i = 0; i < remaining.Count && !mergedSomething; i++)
        {
            for (var j = i + 1; j < remaining.Count; j++)
            {
                var clusterA = remaining[i];
                var clusterB = remaining[j];

                if (!TestCanBeMerged(clusterA, clusterB, maxNumDisagreements))
                {
                    continue;
                }

                clusterSet.RemoveCluster(clusterA.Name);
                clusterSet.RemoveCluster(clusterB.Name);
                var mergedCluster = MergeClusters(clusterA, clusterB);
                clusterSet.AddCluster(mergedCluster);

                // Replace the pair with the merged cluster and rescan from the start: the merged
                // cluster has not been compared with anything yet. The local list shrinks by one
                // on every merge, so the outer loop always terminates.
                remaining.RemoveAt(j);
                remaining[i] = mergedCluster;
                mergedSomething = true;
                break;
            }
        }
    }
}
/// <summary>
/// Builds a <c>SetOfClusters</c> from parameters with zero allowed disagreements and zero
/// required agreements, asserting that the new set starts out with no clusters.
/// </summary>
/// <returns>The freshly constructed, empty set.</returns>
private SetOfClusters CreateDefaultSetOfClusters()
{
    var parameters = new ClusteringParameters
    {
        MaxNumberDisagreements = 0,
        MinNumberAgreements = 0
    };

    var emptySet = new SetOfClusters(parameters);

    // Sanity check: nothing has been added yet.
    Assert.Equal(0, emptySet.Clusters.Count());

    return emptySet;
}
/// <summary>
/// Checks that <c>VariantPhasingResult.GetPhasingProbabilities</c> returns exactly one entry
/// per input variant site, keyed by the sites in their input order, when the cluster set is empty.
/// </summary>
public void GetPhasingProbabilities()
{
    var sites = new List<VariantSite>
    {
        new VariantSite(1),
        new VariantSite(2),
        new VariantSite(45)
    };
    var emptyClusterSet = new SetOfClusters(new ClusteringParameters());

    var resultsBySite = VariantPhasingResult.GetPhasingProbabilities(sites, emptyClusterSet);

    // There should be a PhasingResult for each variant in the input list.
    Assert.Equal(sites.Count, resultsBySite.Count);
    Assert.Equal(sites.Select(x => x), resultsBySite.Keys.ToList());
}
/// <summary>
/// Merges every pair of candidate clusters that passes <c>TestCanBeMerged</c> for
/// <paramref name="testVeadGroup"/>, then returns the surviving candidate with the most veads.
/// </summary>
/// <param name="clusters">
/// Set that is updated in place: for every merge the two source clusters are removed by name
/// and the merged cluster is added.
/// </param>
/// <param name="maxNumDisagreements">Tolerance forwarded unchanged to <c>TestCanBeMerged</c>.</param>
/// <param name="bestCandidates">
/// Candidate clusters to consider. The list itself is not modified. Must contain at least one
/// element; indexing an empty list throws <see cref="System.ArgumentOutOfRangeException"/>.
/// </param>
/// <param name="testVeadGroup">Vead group forwarded unchanged to <c>TestCanBeMerged</c>.</param>
/// <returns>
/// With a single candidate, that candidate. With two candidates, the merged cluster if they can
/// be merged, otherwise the second candidate only when it has strictly more veads than the first.
/// With more candidates, the surviving (possibly merged) cluster with the highest
/// <c>NumVeads</c>, the earliest one winning ties.
/// </returns>
public static Cluster MergeAllBestCandidates(SetOfClusters clusters, int maxNumDisagreements, List<Cluster> bestCandidates, VeadGroup testVeadGroup)
{
    // Work on a copy so the caller's list is left untouched. Keeping this list in step with the
    // merges fixes two problems in the previous version when there were three or more candidates:
    //  * clusters already merged away (and removed from the set) were still compared, and could
    //    be merged again or returned to the caller;
    //  * with no merge possible, the result was decided by the last pair compared rather than by
    //    the largest candidate overall.
    var survivors = new List<Cluster>(bestCandidates);

    var mergedSomething = true;
    while (mergedSomething)
    {
        mergedSomething = false;

        for (var i = 0; i < survivors.Count && !mergedSomething; i++)
        {
            for (var j = i + 1; j < survivors.Count; j++)
            {
                var clusterA = survivors[i];
                var clusterB = survivors[j];

                if (!TestCanBeMerged(clusterA, clusterB, maxNumDisagreements, testVeadGroup))
                {
                    continue;
                }

                clusters.RemoveCluster(clusterA.Name);
                clusters.RemoveCluster(clusterB.Name);
                var mergedCluster = MergeClusters(clusterA, clusterB);
                clusters.AddCluster(mergedCluster);

                // Replace the pair with the merged cluster and rescan. The list shrinks by one
                // on every merge, so the outer loop always terminates.
                survivors.RemoveAt(j);
                survivors[i] = mergedCluster;
                mergedSomething = true;
                break;
            }
        }
    }

    // Pick the best of whatever could not be merged; strict '>' keeps the earliest on a tie.
    var bestCluster = survivors[0];
    foreach (var candidate in survivors)
    {
        if (candidate.NumVeads > bestCluster.NumVeads)
        {
            bestCluster = candidate;
        }
    }

    return bestCluster;
}
/// <summary>
/// Prunes clusters while the set holds more than <c>_options.ClusterConstraint</c> clusters,
/// logging progress along the way.
/// </summary>
/// <param name="clusters">Set to prune in place through <c>RemoveWorstClusters</c>.</param>
/// <remarks>
/// NOTE(review): <c>RemoveWorstClusters</c> is not visible here. The log messages suggest it
/// returns how many lowest-ranked clusters it found and only removes them when that number
/// fits within the allowance - confirm against its implementation.
/// </remarks>
private void MeetPloidyConstraints(SetOfClusters clusters)
{
    while (clusters.NumClusters > _options.ClusterConstraint)
    {
        Logger.WriteToLog("Num clusters: " + clusters.Clusters.Length);
        Logger.WriteToLog("Num cluster constraint " + _options.ClusterConstraint + " is violated. Pruning clusters...");

        int excess = clusters.NumClusters - _options.ClusterConstraint;
        int worstCount = clusters.RemoveWorstClusters(excess);

        if (worstCount > excess)
        {
            // Tie situation: more equally low-ranked clusters than we are allowed to drop.
            // We do not know how to proceed, so give up and leave the constraint violated.
            Logger.WriteToLog(worstCount + " low ranked clusters found. This is not resolveable with our cluster constraints.");
            break;
        }

        Logger.WriteToLog(worstCount + " clusters pruned.");
    }

    Logger.WriteToLog("Clusters finalized: " + clusters.Clusters.Length);
}
/// <summary>
/// Adds two clusters built from sample vead groups to a set, runs
/// <c>ClusterMerger.MergeAllBestCandidates</c> on them with zero allowed disagreements, and
/// expects the set to end up with a single cluster.
/// </summary>
public void MergeAllBestCandidates()
{
    var clusterSet = new SetOfClusters(new ClusteringParameters());

    var firstCluster = new Cluster("test", ClusterTestHelpers.GetSampleVeadGroups(prefix: "Original"));
    clusterSet.AddCluster(firstCluster);

    var secondCluster = new Cluster("test2", ClusterTestHelpers.GetSampleVeadGroups(prefix: "Second"));
    clusterSet.AddCluster(secondCluster);

    Assert.Equal(2, clusterSet.NumClusters);

    var testerGroups = ClusterTestHelpers.GetSampleVeadGroups(1, 1, prefix: "Tester");
    var candidates = new List<Cluster> { firstCluster, secondCluster };

    // If the two can be merged, we should be left with 1 cluster.
    ClusterMerger.MergeAllBestCandidates(clusterSet, 0, candidates, testerGroups.First());

    Assert.Equal(1, clusterSet.NumClusters);
}
/// <summary>
/// Computes a <c>VariantPhasingResult</c> for every site in <paramref name="variantSites"/>.
/// </summary>
/// <param name="variantSites">
/// Sites to evaluate. Each site is evaluated against a fresh copy of this whole list.
/// </param>
/// <param name="clusters">Cluster set passed through to the per-variant calculation.</param>
/// <returns>
/// A dictionary with one entry per site, added in input order. A repeated site makes
/// <c>Dictionary.Add</c> throw.
/// </returns>
public static Dictionary<VariantSite, VariantPhasingResult> GetPhasingProbabilities(List<VariantSite> variantSites, SetOfClusters clusters)
{
    var phasingBySite = new Dictionary<VariantSite, VariantPhasingResult>();

    foreach (var site in variantSites)
    {
        var siteGroup = variantSites.ToList();
        var phasing = GetPhasingProbabilitiesForVariant(siteGroup, clusters, site);
        phasingBySite.Add(site, phasing);
    }

    //TODO debug Clusters: (VariantGroup, Chr, Pos, Ref, Alt); Neighbors (VariantGroup, Chr, Pos, Ref, Alt, ProbOfAGivenB, ProbOfB, ProbOfAAndB)
    return phasingBySite;
}
/// <summary>
/// Accumulates, for one variant site A, the weighted cluster support for every other site B
/// in the group and for A and B occurring together.
/// </summary>
/// <param name="variantGroup">All sites in the group, including <paramref name="variantSiteA"/>.</param>
/// <param name="clusters">Clusters to scan; their relative weights are used as the support added.</param>
/// <param name="variantSiteA">The site the result is built for.</param>
/// <returns>The populated <c>VariantPhasingResult</c> for <paramref name="variantSiteA"/>.</returns>
private static VariantPhasingResult GetPhasingProbabilitiesForVariant(List<VariantSite> variantGroup, SetOfClusters clusters, VariantSite variantSiteA)
{
    var neighbours = variantGroup.Where(site => site != variantSiteA).ToList();
    var result = new VariantPhasingResult(variantSiteA, neighbours, clusters.NumClusters);
    var weightByClusterName = clusters.GetRelativeWeights();

    // For each cluster: which neighbours B does it contain, and does it contain A as well?
    foreach (var cluster in clusters.Clusters)
    {
        var veadCounts = cluster.GetVeadCountsInCluster(variantGroup);
        var clusterWeight = weightByClusterName[cluster.Name];

        foreach (var variantSiteB in neighbours)
        {
            // No support for B in this cluster: nothing to add.
            if (veadCounts[variantSiteB] <= 0)
            {
                continue;
            }

            result.AddSupportForB(variantSiteB, clusterWeight);

            if (veadCounts[variantSiteA] > 0)
            {
                result.AddSupportForAandB(variantSiteB, clusterWeight);
            }
        }
    }

    return result;
}
/// <summary>
/// Groups the given vead groups into clusters: repeatedly seeds a new cluster from the first
/// unassigned vead group, then tries to allocate the remaining groups to existing clusters.
/// </summary>
/// <param name="veadGroups">
/// Vead groups to cluster. The list is sorted in place, and the first seeding step removes its
/// first element; later steps work on new lists returned by the allocation step.
/// </param>
/// <param name="nbhdID">Neighbourhood identifier. Not used by the current implementation.</param>
/// <returns>
/// The resulting set of clusters. It is empty when no vead groups were supplied. When
/// <c>_options.ClusterConstraint</c> is positive the set has been passed through
/// <c>MeetPloidyConstraints</c> first.
/// </returns>
/// <remarks>Any exception is logged with the message "Clustering issue." and rethrown.</remarks>
public SetOfClusters ClusterVeadGroups(List<VeadGroup> veadGroups, string nbhdID)
{
    // Upper bound on allocation passes attempted after each newly seeded cluster.
    const int maxNumReallocationIterations = 10;

    try
    {
        // Make the meatiest clusters first.
        veadGroups.Sort();

        var clusters = new SetOfClusters(_options);
        if (veadGroups.Count == 0)
        {
            Logger.WriteToLog("No vead groups given to clustering algorithm.");
            return clusters;
        }

        // The first vead group supplies the site count and position range for the neighbourhood.
        var leadGroup = veadGroups[0];
        var siteResults = leadGroup.SiteResults;
        var maxNumNewClusters = siteResults.Length * _options.MaxNumNewClustersPerSite;
        var nbhdStart = siteResults[0].VcfReferencePosition;
        var nbhdEnd = siteResults[siteResults.Length - 1].VcfReferencePosition;

        Logger.WriteToLog("Maximum num new clusters for this nbhd is " + maxNumNewClusters);
        Logger.WriteToLog("There are {0} variant sites in this nbhd, from position {1} to {2}", siteResults.Length, nbhdStart, nbhdEnd);

        if (_debug)
        {
            Logger.WriteToLog("variant-compressed read groups as follows: ");
            Logger.WriteToLog("count" + "\t", leadGroup.ToPositions());
            foreach (var group in veadGroups)
            {
                Logger.WriteToLog("\t" + group.NumVeads + "\t" + group);
            }
        }

        while (veadGroups.Count > 0)
        {
            CreateNewCluster(veadGroups, clusters);

            // Is there any poor-fitting read that should go somewhere else?
            if (_options.AllowWorstFitRemoval)
            {
                clusters.ReAssignWorstFit();
            }

            //TODO log the contents of clusterSet (previously "PrintContents()").

            // Keep trying to allocate free agents to clusters, up to the pass limit.
            // If a pass allocates nobody, give up and go break off a new cluster.
            for (var pass = 1; pass <= maxNumReallocationIterations && veadGroups.Count > 0; pass++)
            {
                var unassignedBefore = veadGroups.Count;

                // Find the best existing cluster for each free agent and add it there if one
                // exists. Whoever is left free comes back as the new work list.
                veadGroups = AllocateReadsToClusters(veadGroups, clusters, _options.MaxNumberDisagreements);

                //TODO log the contents of clusterSet (previously "PrintContents()").

                if (veadGroups.Count == unassignedBefore)
                {
                    break; // Reallocation didn't do any good. Give up.
                }
            }

            // Stop seeding once the cluster budget for this neighbourhood has been exceeded.
            if (clusters.NumClusters > maxNumNewClusters)
            {
                break;
            }
        }

        if (_options.ClusterConstraint > 0)
        {
            MeetPloidyConstraints(clusters);
        }

        return clusters;
    }
    catch (Exception ex)
    {
        Logger.WriteToLog("Clustering issue.", ex);
        throw;
    }
}
/// <summary>
/// Tries to place each vead group into its best-fitting existing cluster.
/// </summary>
/// <param name="veadGroups">Vead groups that are currently unassigned.</param>
/// <param name="clusters">Cluster set that is queried for fits and may be changed by merging.</param>
/// <param name="maxNumDisagreements">
/// Tolerance passed to <c>ClusterMerger.MergeAllBestCandidates</c> when
/// <c>_options.AllowClusterMerging</c> is set.
/// </param>
/// <returns>A new list holding the vead groups for which no cluster fit was found.</returns>
private List<VeadGroup> AllocateReadsToClusters(List<VeadGroup> veadGroups, SetOfClusters clusters, int maxNumDisagreements)
{
    var unassigned = new List<VeadGroup>();

    foreach (var veadGroup in veadGroups)
    {
        var fits = clusters.GetClusterFits(veadGroup);
        if (fits.Count == 0)
        {
            unassigned.Add(veadGroup);
            continue;
        }

        // The last entry of the fit collection holds the candidates used as "best".
        var topCandidates = fits.Last().Value;
        var target = topCandidates[0];

        if (_options.AllowClusterMerging)
        {
            target = ClusterMerger.MergeAllBestCandidates(clusters, maxNumDisagreements, topCandidates, veadGroup);
        }

        target.Add(veadGroup);
    }

    return unassigned;
}
/// <summary>
/// Seeds a new cluster from the first vead group in the list and takes that group off the list.
/// </summary>
/// <param name="vgs">Unassigned vead groups; must not be empty. Modified in place.</param>
/// <param name="clusters">Set in which the new cluster is created.</param>
private void CreateNewCluster(List<VeadGroup> vgs, SetOfClusters clusters)
{
    var seed = vgs[0];
    clusters.CreateAndAddCluster(seed);

    // The seed sits at index 0, which is also the first element equal to itself,
    // so removing by position takes out the same element as removing by value.
    vgs.RemoveAt(0);
}