Beispiel #1
0
        /// <summary>
        /// Repeatedly joins pairs of nodes into <see cref="ClusterNode"/> instances until at most one node
        /// remains in the working set. Nodes found to be fully isolated are set aside during the loop and
        /// appended afterwards.
        /// </summary>
        /// <param name="nodes">
        /// The working set of nodes. It is modified in place while clustering, and the local reference is
        /// replaced by a filtered copy once isolated nodes have been segregated.
        /// </param>
        /// <param name="distanceMetric">Handed to every created <see cref="ClusterNode"/> and to the neighbor recalculation.</param>
        /// <param name="progressData">Reset before and after the run, and incremented once per completed loop iteration.</param>
        /// <returns>The <see cref="ClusterNode"/> instances left in the working set when clustering has finished.</returns>
        private static async Task <List <ClusterNode> > BuildClustersAsync(ICollection <Node> nodes, IDistanceMetric distanceMetric, ProgressData progressData)
        {
            // Count the distinct leaf indexes that appear either as a node's own first leaf or as one of its neighbors.
            var neighborLeafIndexes = nodes.SelectMany(node => node.NeighborsByDistance.Select(neighbor => neighbor.Node.FirstLeaf.Index));
            var ownLeafIndexes = nodes.Select(node => node.FirstLeaf.Index);
            var nodeCount = neighborLeafIndexes.Concat(ownLeafIndexes).Distinct().Count();

            progressData.Reset($"Building clusters for {nodeCount} matches...", nodes.Count - 1);

            await Task.Run(async () =>
            {
                // Isolated nodes are parked here as they are discovered and stitched on after the main loop.
                var isolatedNodes = new List<Node>();

                while (nodes.Count > 1)
                {
                    // One linear pass over the exposed leaves of every node to find the smallest
                    // nearest-neighbor distance. Only the head of each neighbor list is inspected.
                    Node closestOwner = null;
                    var closestNeighbor = new Neighbor { DistanceSquared = double.MaxValue };
                    foreach (var candidate in nodes)
                    {
                        var firstLeaf = candidate.FirstLeaf;
                        if (firstLeaf.NeighborsByDistance.Count > 0
                            && firstLeaf.NeighborsByDistance.First().DistanceSquared < closestNeighbor.DistanceSquared)
                        {
                            closestOwner = candidate;
                            closestNeighbor = firstLeaf.NeighborsByDistance.First();
                        }

                        var secondLeaf = candidate.SecondLeaf;
                        if (candidate.FirstLeaf != secondLeaf
                            && secondLeaf.NeighborsByDistance.Count > 0
                            && secondLeaf.NeighborsByDistance.First().DistanceSquared < closestNeighbor.DistanceSquared)
                        {
                            closestOwner = candidate;
                            closestNeighbor = secondLeaf.NeighborsByDistance.First();
                        }
                    }

                    ClusterNode merged;
                    if (closestOwner != null)
                    {
                        // Normal case: join the top-level parents of the closest pair at their distance.
                        var first = closestNeighbor.Node.GetHighestParent();
                        var second = closestOwner.GetHighestParent();
                        merged = new ClusterNode(first, second, closestNeighbor.DistanceSquared, distanceMetric);
                    }
                    else
                    {
                        // No exposed leaf has a neighbor left. Partition the nodes by whether every one of
                        // their leaves only has coordinates that refer to leaves inside the same node.
                        var byIsolation = nodes.ToLookup(node =>
                        {
                            var ownIndexes = node.GetOrderedLeafNodesIndexes();
                            return node.GetOrderedLeafNodes().All(leaf => ownIndexes.IsSupersetOf(leaf.Coords.Keys));
                        });

                        var newlyIsolated = byIsolation[true].ToList();
                        if (newlyIsolated.Count > 0)
                        {
                            // Move the isolated nodes out of the working set; they are re-attached after the loop.
                            isolatedNodes.AddRange(newlyIsolated);
                            nodes = byIsolation[false].ToList();

                            // Nothing left to pair up once fewer than two nodes remain.
                            if (nodes.Count <= 1)
                            {
                                break;
                            }
                        }

                        // Join the node with the fewest children to whichever other node shares the most
                        // leaf indexes with it, preferring the smaller node on ties.
                        var smallest = nodes.OrderBy(node => node.NumChildren).First();
                        var smallestLeafIndexes = new HashSet<int>(smallest.GetOrderedLeafNodesIndexes());
                        var others = nodes.Where(node => node != smallest);
                        var ranked = others
                            .OrderByDescending(node => smallestLeafIndexes.Intersect(node.GetOrderedLeafNodesIndexes()).Count())
                            .ThenBy(node => node.NumChildren);
                        var partner = ranked.First();
                        merged = new ClusterNode(partner, smallest, double.PositiveInfinity, distanceMetric);
                    }

                    var leavesToRemove = GetNodesToRemove(merged).ToList();
                    var leavesWithRemovals = await RemoveNodesAsync(nodes, leavesToRemove);
                    var leavesNeedingRecalc = new HashSet<LeafNode>(leavesWithRemovals);

                    nodes.Remove(merged.First);
                    nodes.Remove(merged.Second);

                    // The two outer leaves of the new cluster must not list each other as neighbors.
                    var headLeaf = merged.FirstLeaf;
                    var tailLeaf = merged.SecondLeaf;
                    var removedFromHead = headLeaf.NeighborsByDistance.RemoveAll(neighbor => tailLeaf == neighbor.Node);
                    if (removedFromHead > 0)
                    {
                        leavesNeedingRecalc.Add(headLeaf);
                    }
                    var removedFromTail = tailLeaf.NeighborsByDistance.RemoveAll(neighbor => headLeaf == neighbor.Node);
                    if (removedFromTail > 0)
                    {
                        leavesNeedingRecalc.Add(tailLeaf);
                    }

                    await RecalculateNeighborsAsync(nodes, leavesNeedingRecalc, distanceMetric);

                    nodes.Add(merged);

                    progressData.Increment();
                }

                // Chain any isolated nodes together, largest first, and hang the chain off the last remaining node.
                if (isolatedNodes.Count > 0)
                {
                    var largestFirst = isolatedNodes.OrderByDescending(n => n.NumChildren).ToList();
                    Node chain = largestFirst[0];
                    for (var i = 1; i < largestFirst.Count; i++)
                    {
                        chain = new ClusterNode(chain, largestFirst[i], double.PositiveInfinity, distanceMetric);
                    }

                    if (nodes.Count > 0)
                    {
                        var lastRemaining = nodes.Last();
                        chain = new ClusterNode(lastRemaining, chain, double.PositiveInfinity, distanceMetric);
                        nodes.Remove(lastRemaining);
                    }
                    nodes.Add(chain);
                }
            });

            progressData.Reset("Done");

            return nodes.OfType<ClusterNode>().ToList();
        }
Beispiel #2
0
        /// <summary>
        /// Repeatedly joins pairs of nodes into <see cref="ClusterNode"/> instances until at most one node
        /// remains in the working set. When no exposed leaf has a neighbor left, the two nodes with the
        /// most leaf nodes are joined at infinite distance instead.
        /// </summary>
        /// <param name="nodes">The working set of nodes. It is modified in place while clustering.</param>
        /// <param name="distanceMetric">Handed to every created <see cref="ClusterNode"/> and to the neighbor recalculation.</param>
        /// <param name="progressData">Reset before and after the run, and incremented once per merge.</param>
        /// <returns>The <see cref="ClusterNode"/> instances left in the working set when clustering has finished.</returns>
        private static async Task <List <ClusterNode> > BuildClustersAsync(ICollection <Node> nodes, IDistanceMetric distanceMetric, ProgressData progressData)
        {
            // Count the distinct leaf indexes that appear either as a node's own first leaf or as one of its neighbors.
            var neighborLeafIndexes = nodes.SelectMany(node => node.NeighborsByDistance.Select(neighbor => neighbor.Node.FirstLeaf.Index));
            var ownLeafIndexes = nodes.Select(node => node.FirstLeaf.Index);
            var nodeCount = neighborLeafIndexes.Concat(ownLeafIndexes).Distinct().Count();

            progressData.Reset($"Building clusters for {nodeCount} matches...", nodes.Count - 1);

            await Task.Run(async () =>
            {
                while (nodes.Count > 1)
                {
                    // One linear pass over the exposed leaves of every node to find the smallest
                    // nearest-neighbor distance. Only the head of each neighbor list is inspected.
                    Node closestOwner = null;
                    var closestNeighbor = new Neighbor { DistanceSquared = double.MaxValue };
                    foreach (var candidate in nodes)
                    {
                        var firstLeaf = candidate.FirstLeaf;
                        if (firstLeaf.NeighborsByDistance.Count > 0
                            && firstLeaf.NeighborsByDistance.First().DistanceSquared < closestNeighbor.DistanceSquared)
                        {
                            closestOwner = candidate;
                            closestNeighbor = firstLeaf.NeighborsByDistance.First();
                        }

                        var secondLeaf = candidate.SecondLeaf;
                        if (candidate.FirstLeaf != secondLeaf
                            && secondLeaf.NeighborsByDistance.Count > 0
                            && secondLeaf.NeighborsByDistance.First().DistanceSquared < closestNeighbor.DistanceSquared)
                        {
                            closestOwner = candidate;
                            closestNeighbor = secondLeaf.NeighborsByDistance.First();
                        }
                    }

                    ClusterNode merged;
                    if (closestOwner != null)
                    {
                        // Normal case: join the top-level parents of the closest pair at their distance.
                        var first = closestNeighbor.Node.GetHighestParent();
                        var second = closestOwner.GetHighestParent();
                        merged = new ClusterNode(first, second, closestNeighbor.DistanceSquared, distanceMetric);
                    }
                    else
                    {
                        // Nobody has a neighbor left: fall back to joining the two nodes with the most leaf nodes.
                        var twoLargest = nodes
                            .OrderByDescending(node => node.GetOrderedLeafNodes().Count())
                            .Take(2)
                            .ToList();
                        merged = new ClusterNode(twoLargest[0], twoLargest[1], double.PositiveInfinity, distanceMetric);
                    }

                    var leavesNeedingRecalc = new HashSet<LeafNode>();
                    var interiorLeaves = new List<LeafNode>();

                    // When a multi-leaf child is joined, the leaf on its inner side is collected for removal:
                    // the second leaf of the first child and the first leaf of the second child.
                    if (merged.First.FirstLeaf != merged.First.SecondLeaf)
                    {
                        interiorLeaves.Add(merged.First.SecondLeaf);
                    }
                    if (merged.Second.FirstLeaf != merged.Second.SecondLeaf)
                    {
                        interiorLeaves.Add(merged.Second.FirstLeaf);
                    }

                    // Strip the collected interior leaves from the neighbor lists of the exposed leaves.
                    if (interiorLeaves.Count > 0)
                    {
                        // Exposed leaves (first and, when distinct, second leaf of each node) that still have neighbors.
                        var exposedFirstLeaves = nodes.Select(node => node.FirstLeaf);
                        var exposedSecondLeaves = nodes
                            .Where(node => node.SecondLeaf != node.FirstLeaf)
                            .Select(node => node.SecondLeaf);
                        var exposedWithNeighbors = exposedFirstLeaves
                            .Concat(exposedSecondLeaves)
                            .Where(leaf => leaf.NeighborsByDistance?.Count > 0);

                        // Only leaves whose index is above that of at least one interior leaf are visited.
                        // Each visited leaf is processed in its own task and reported back only if it lost a neighbor.
                        var removalTasks = exposedWithNeighbors
                            .Where(leaf => interiorLeaves.Any(interior => interior.Index < leaf.Index))
                            .Select(leaf => Task.Run(() =>
                            {
                                var removedCount = leaf.NeighborsByDistance.RemoveAll(neighbor => interiorLeaves.Contains(neighbor.Node));
                                return removedCount > 0 ? leaf : null;
                            }));

                        var removalResults = await Task.WhenAll(removalTasks);
                        leavesNeedingRecalc.UnionWith(removalResults.Where(leaf => leaf != null));
                    }

                    nodes.Remove(merged.First);
                    nodes.Remove(merged.Second);

                    // The two outer leaves of the new cluster must not list each other as neighbors.
                    var headLeaf = merged.FirstLeaf;
                    var tailLeaf = merged.SecondLeaf;
                    var removedFromHead = headLeaf.NeighborsByDistance.RemoveAll(neighbor => tailLeaf == neighbor.Node);
                    if (removedFromHead > 0)
                    {
                        leavesNeedingRecalc.Add(headLeaf);
                    }
                    var removedFromTail = tailLeaf.NeighborsByDistance.RemoveAll(neighbor => headLeaf == neighbor.Node);
                    if (removedFromTail > 0)
                    {
                        leavesNeedingRecalc.Add(tailLeaf);
                    }

                    // Leaves that just lost their last neighbor get a fresh neighbor list, computed against
                    // every exposed leaf that does not belong to their own top-level parent.
                    var leavesLeftEmpty = leavesNeedingRecalc.Where(leaf => leaf.NeighborsByDistance.Count == 0).ToList();
                    if (leavesLeftEmpty.Count > 0)
                    {
                        var recalculationTasks = leavesLeftEmpty.Select(emptyLeaf => Task.Run(() =>
                        {
                            var ownParent = emptyLeaf.GetHighestParent();
                            var candidateFirstLeaves = nodes.Select(node => node.FirstLeaf);
                            var candidateSecondLeaves = nodes
                                .Where(node => node.SecondLeaf != node.FirstLeaf)
                                .Select(node => node.SecondLeaf);
                            var candidateLeaves = candidateFirstLeaves
                                .Concat(candidateSecondLeaves)
                                .Where(other => other != ownParent.FirstLeaf && other != ownParent.SecondLeaf);
                            emptyLeaf.NeighborsByDistance = GetNeighborsByDistance(candidateLeaves, emptyLeaf, distanceMetric);
                        }));
                        await Task.WhenAll(recalculationTasks);
                    }

                    nodes.Add(merged);

                    progressData.Increment();
                }
            });

            progressData.Reset("Done");

            return nodes.OfType<ClusterNode>().ToList();
        }