/// <summary>
/// Merges the supplied nodes pairwise into <see cref="ClusterNode"/> instances until at most one node is left.
/// Each pass joins the two nodes linked by the smallest neighbor <c>DistanceSquared</c>. When no exposed leaf
/// has a neighbor left, fully isolated nodes are set aside and the smallest remaining node is joined to another
/// node at infinite distance. Nodes that were set aside are chained together and appended after the main loop.
/// </summary>
/// <param name="nodes">
/// The top-level nodes to cluster. The collection is modified in place, and the parameter may be re-pointed at a
/// new list once isolated nodes have been split off.
/// </param>
/// <param name="distanceMetric">Handed to every new <see cref="ClusterNode"/> and to the neighbor recalculation.</param>
/// <param name="progressData">Receives the status messages and one increment per merge done by the main loop.</param>
/// <returns>The <see cref="ClusterNode"/> instances left in <paramref name="nodes"/> once clustering has finished.</returns>
private static async Task<List<ClusterNode>> BuildClustersAsync(ICollection<Node> nodes, IDistanceMetric distanceMetric, ProgressData progressData)
{
    // Number of distinct leaf indexes seen either as a node's own first leaf or as one of its neighbors.
    var neighborIndexes = nodes.SelectMany(node => node.NeighborsByDistance.Select(neighbor => neighbor.Node.FirstLeaf.Index));
    var selfIndexes = nodes.Select(node => node.FirstLeaf.Index);
    var nodeCount = neighborIndexes.Union(selfIndexes).Count();

    progressData.Reset($"Building clusters for {nodeCount} matches...", nodes.Count - 1);

    await Task.Run(async () =>
    {
        // Isolated nodes are parked here as they are discovered and re-attached once the main loop is over.
        var parkedIsolatedNodes = new List<Node>();

        while (nodes.Count > 1)
        {
            // One linear pass over the nodes, looking only at the first neighbor of each exposed leaf.
            Node ownerOfClosest = null;
            var closestNeighbor = new Neighbor { DistanceSquared = double.MaxValue };
            foreach (var candidate in nodes)
            {
                var headNeighbors = candidate.FirstLeaf.NeighborsByDistance;
                if (headNeighbors.Count > 0 && headNeighbors.First().DistanceSquared < closestNeighbor.DistanceSquared)
                {
                    ownerOfClosest = candidate;
                    closestNeighbor = headNeighbors.First();
                }

                // A node with a single leaf has the same leaf at both ends; do not look at it twice.
                if (candidate.FirstLeaf != candidate.SecondLeaf)
                {
                    var tailNeighbors = candidate.SecondLeaf.NeighborsByDistance;
                    if (tailNeighbors.Count > 0 && tailNeighbors.First().DistanceSquared < closestNeighbor.DistanceSquared)
                    {
                        ownerOfClosest = candidate;
                        closestNeighbor = tailNeighbors.First();
                    }
                }
            }

            ClusterNode merged;
            if (ownerOfClosest != null)
            {
                // Join the top-level ancestors of the two leaves that are closest to each other.
                var nearestLeaf = closestNeighbor.Node;
                var nearestRoot = nearestLeaf.GetHighestParent();
                var ownerRoot = ownerOfClosest.GetHighestParent();
                merged = new ClusterNode(nearestRoot, ownerRoot, closestNeighbor.DistanceSquared, distanceMetric);
            }
            else
            {
                // Nobody has a neighbor left. A node is fully isolated when the Coords keys of every one of its
                // leaves stay inside the node's own set of leaf indexes, i.e. no leaf shares a match outside it.
                // That can happen for a very distant cluster, or for a maternal/paternal split.
                var nodesByIsolation = nodes.ToLookup(node =>
                {
                    var memberIndexes = node.GetOrderedLeafNodesIndexes();
                    return node.GetOrderedLeafNodes().All(leaf => memberIndexes.IsSupersetOf(leaf.Coords.Keys));
                });

                var newlyIsolated = nodesByIsolation[true].ToList();
                if (newlyIsolated.Count > 0)
                {
                    // Nothing can make these un-isolated later, so take them out of play now.
                    parkedIsolatedNodes.AddRange(newlyIsolated);
                    nodes = nodesByIsolation[false].ToList();

                    // With fewer than two nodes left there is nothing more to join.
                    if (nodes.Count < 2)
                    {
                        break;
                    }
                }

                // Grow the smallest node by attaching it to the best-ranked other node.
                var smallest = nodes.OrderBy(node => node.NumChildren).First();
                var smallestLeafIndexes = new HashSet<int>(smallest.GetOrderedLeafNodesIndexes());

                // NOTE(review): this ranks by the overlap between the leaf indexes of two different top-level nodes.
                // If top-level nodes never share leaves, that overlap is always zero and only the NumChildren
                // tie-breaker matters -- confirm whether the overlap was meant to use the leaves' Coords keys.
                var otherNodes = nodes.Where(node => node != smallest);
                var rankedPartners = otherNodes
                    .OrderByDescending(node => smallestLeafIndexes.Intersect(node.GetOrderedLeafNodesIndexes()).Count())
                    .ThenBy(node => node.NumChildren);
                var partner = rankedPartners.First();

                merged = new ClusterNode(partner, smallest, double.PositiveInfinity, distanceMetric);
            }

            var obsoleteNodes = GetNodesToRemove(merged);
            var leavesReturnedByRemoval = await RemoveNodesAsync(nodes, obsoleteNodes.ToList());
            var touchedLeaves = new HashSet<LeafNode>(leavesReturnedByRemoval);

            nodes.Remove(merged.First);
            nodes.Remove(merged.Second);

            // The two end leaves of the new cluster must not list each other as neighbors.
            var headLeaf = merged.FirstLeaf;
            var tailLeaf = merged.SecondLeaf;
            if (headLeaf.NeighborsByDistance.RemoveAll(neighbor => tailLeaf == neighbor.Node) > 0)
            {
                touchedLeaves.Add(headLeaf);
            }

            if (tailLeaf.NeighborsByDistance.RemoveAll(neighbor => headLeaf == neighbor.Node) > 0)
            {
                touchedLeaves.Add(tailLeaf);
            }

            await RecalculateNeighborsAsync(nodes, touchedLeaves, distanceMetric);

            nodes.Add(merged);
            progressData.Increment();
        }

        if (parkedIsolatedNodes.Count == 0)
        {
            return;
        }

        // Chain the isolated nodes together, largest first, and hang the chain after whatever is left.
        var isolatedLargestFirst = parkedIsolatedNodes.OrderByDescending(n => n.NumChildren).ToList();
        var isolatedChain = isolatedLargestFirst[0];
        for (var i = 1; i < isolatedLargestFirst.Count; i++)
        {
            isolatedChain = new ClusterNode(isolatedChain, isolatedLargestFirst[i], double.PositiveInfinity, distanceMetric);
        }

        if (nodes.Count > 0)
        {
            var lastRemaining = nodes.Last();
            isolatedChain = new ClusterNode(lastRemaining, isolatedChain, double.PositiveInfinity, distanceMetric);
            nodes.Remove(lastRemaining);
        }

        nodes.Add(isolatedChain);
    });

    progressData.Reset("Done");
    return nodes.OfType<ClusterNode>().ToList();
}
/// <summary>
/// Merges the supplied nodes pairwise into <see cref="ClusterNode"/> instances until at most one node is left.
/// Each pass joins the two nodes linked by the smallest neighbor <c>DistanceSquared</c>; when no exposed leaf has
/// a neighbor left, the two nodes with the most leaf nodes are joined at infinite distance instead.
/// </summary>
/// <param name="nodes">The top-level nodes to cluster. The collection is modified in place.</param>
/// <param name="distanceMetric">Handed to every new <see cref="ClusterNode"/> and to the neighbor recalculation.</param>
/// <param name="progressData">Receives the status messages and one increment per merge.</param>
/// <returns>The <see cref="ClusterNode"/> instances left in <paramref name="nodes"/> once clustering has finished.</returns>
private static async Task<List<ClusterNode>> BuildClustersAsync(ICollection<Node> nodes, IDistanceMetric distanceMetric, ProgressData progressData)
{
    // Number of distinct leaf indexes seen either as a node's own first leaf or as one of its neighbors.
    var neighborIndexes = nodes.SelectMany(node => node.NeighborsByDistance.Select(neighbor => neighbor.Node.FirstLeaf.Index));
    var selfIndexes = nodes.Select(node => node.FirstLeaf.Index);
    var nodeCount = neighborIndexes.Union(selfIndexes).Count();

    progressData.Reset($"Building clusters for {nodeCount} matches...", nodes.Count - 1);

    await Task.Run(async () =>
    {
        while (nodes.Count > 1)
        {
            // One linear pass over the nodes, looking only at the first neighbor of each exposed leaf.
            Node ownerOfClosest = null;
            var closestNeighbor = new Neighbor { DistanceSquared = double.MaxValue };
            foreach (var candidate in nodes)
            {
                var headNeighbors = candidate.FirstLeaf.NeighborsByDistance;
                if (headNeighbors.Count > 0 && headNeighbors.First().DistanceSquared < closestNeighbor.DistanceSquared)
                {
                    ownerOfClosest = candidate;
                    closestNeighbor = headNeighbors.First();
                }

                // A node with a single leaf has the same leaf at both ends; do not look at it twice.
                if (candidate.FirstLeaf != candidate.SecondLeaf)
                {
                    var tailNeighbors = candidate.SecondLeaf.NeighborsByDistance;
                    if (tailNeighbors.Count > 0 && tailNeighbors.First().DistanceSquared < closestNeighbor.DistanceSquared)
                    {
                        ownerOfClosest = candidate;
                        closestNeighbor = tailNeighbors.First();
                    }
                }
            }

            ClusterNode merged;
            if (ownerOfClosest != null)
            {
                // Join the top-level ancestors of the two leaves that are closest to each other.
                var nearestLeaf = closestNeighbor.Node;
                var nearestRoot = nearestLeaf.GetHighestParent();
                var ownerRoot = ownerOfClosest.GetHighestParent();
                merged = new ClusterNode(nearestRoot, ownerRoot, closestNeighbor.DistanceSquared, distanceMetric);
            }
            else
            {
                // No neighbors anywhere: fall back to joining the two nodes that hold the most leaf nodes.
                var twoLargest = nodes
                    .OrderByDescending(node => node.GetOrderedLeafNodes().Count())
                    .Take(2)
                    .ToList();
                merged = new ClusterNode(twoLargest[0], twoLargest[1], double.PositiveInfinity, distanceMetric);
            }

            var touchedLeaves = new HashSet<LeafNode>();
            var buriedLeaves = new List<LeafNode>();

            // When a multi-leaf node is joined, the leaf at the seam ends up in the interior of the new cluster
            // and can no longer take part in further clustering.
            var leftChild = merged.First;
            if (leftChild.FirstLeaf != leftChild.SecondLeaf)
            {
                buriedLeaves.Add(leftChild.SecondLeaf);
            }

            var rightChild = merged.Second;
            if (rightChild.FirstLeaf != rightChild.SecondLeaf)
            {
                buriedLeaves.Add(rightChild.FirstLeaf);
            }

            // Strip the buried leaves out of the neighbor lists of every exposed leaf that could reference them.
            if (buriedLeaves.Count > 0)
            {
                var exposedHeads = nodes.Select(node => node.FirstLeaf);
                var exposedTails = nodes.Where(node => node.SecondLeaf != node.FirstLeaf).Select(node => node.SecondLeaf);

                // Only leaves with a higher index than some buried leaf are visited.
                // NOTE(review): presumably a leaf only lists neighbors with a lower index than its own -- confirm.
                var pruningTasks = exposedHeads
                    .Concat(exposedTails)
                    .Where(leaf => leaf.NeighborsByDistance?.Count > 0
                                   && buriedLeaves.Any(buried => buried.Index < leaf.Index))
                    .Select(leaf => Task.Run(() =>
                    {
                        var removedCount = leaf.NeighborsByDistance.RemoveAll(neighbor => buriedLeaves.Contains(neighbor.Node));
                        return removedCount > 0 ? leaf : null;
                    }));

                foreach (var affected in await Task.WhenAll(pruningTasks))
                {
                    if (affected != null)
                    {
                        touchedLeaves.Add(affected);
                    }
                }
            }

            nodes.Remove(merged.First);
            nodes.Remove(merged.Second);

            // The two end leaves of the new cluster must not list each other as neighbors.
            var headLeaf = merged.FirstLeaf;
            var tailLeaf = merged.SecondLeaf;
            if (headLeaf.NeighborsByDistance.RemoveAll(neighbor => tailLeaf == neighbor.Node) > 0)
            {
                touchedLeaves.Add(headLeaf);
            }

            if (tailLeaf.NeighborsByDistance.RemoveAll(neighbor => headLeaf == neighbor.Node) > 0)
            {
                touchedLeaves.Add(tailLeaf);
            }

            // Leaves that just lost their last neighbor get a fresh neighbor list, built from the exposed leaves
            // of the remaining nodes (the merged pair is already removed, the new cluster is not added yet).
            var orphanedLeaves = touchedLeaves.Where(leaf => leaf.NeighborsByDistance.Count == 0).ToList();
            if (orphanedLeaves.Count > 0)
            {
                var refillTasks = orphanedLeaves.Select(orphan => Task.Run(() =>
                {
                    var owner = orphan.GetHighestParent();
                    var firstLeaves = nodes.Select(node => node.FirstLeaf);
                    var secondLeaves = nodes.Where(node => node.SecondLeaf != node.FirstLeaf).Select(node => node.SecondLeaf);

                    // Skip the end leaves of the orphan's own top-level ancestor.
                    var otherLeaves = firstLeaves
                        .Concat(secondLeaves)
                        .Where(other => other != owner.FirstLeaf && other != owner.SecondLeaf);

                    orphan.NeighborsByDistance = GetNeighborsByDistance(otherLeaves, orphan, distanceMetric);
                }));

                await Task.WhenAll(refillTasks);
            }

            nodes.Add(merged);
            progressData.Increment();
        }
    });

    progressData.Reset("Done");
    return nodes.OfType<ClusterNode>().ToList();
}