/// <summary>
/// Folds small outlier clusters into nearby larger clusters.
/// </summary>
/// <returns>The count of merges that <c>Clusters.Merge</c> reported as performed.</returns>
/// <param name="maxOutlierMergeDistance">Upper bound, as a square distance, on how far an
/// outlier may be from its nearest cluster and still be merged into it.</param>
public int MergeOutliers(long maxOutlierMergeDistance)
{
    var finder = new ClosestCluster<string>(Clusters);
    var candidates = finder.FindClosestOutliers(
        MaxNeighborsToCompare,
        maxOutlierMergeDistance,
        OutlierSize
    );

    var mergedCount = 0;
    foreach (var candidate in candidates)
    {
        // Earlier merges in this loop may have changed cluster labels; presumably Relabel
        // re-syncs the pair with the current state of Clusters (confirm against its definition).
        candidate.Relabel(Clusters);

        // Merge only while exactly one side of the pair still counts as an outlier.
        // An outlier must not become the bridge that joins two large clusters which are
        // each near the outlier but not near each other, so once it has been absorbed by
        // its nearer neighbor it is excluded from further merges.
        var exactlyOneOutlier = candidate.CountOutliers(Clusters, OutlierSize) == 1;
        if (exactlyOneOutlier && Clusters.Merge(candidate.Color1, candidate.Color2))
        {
            mergedCount++;
        }
    }
    return mergedCount;
}
/// <summary>
/// Merges every outlier with its nearest neighboring large cluster.
/// Here an outlier is any cluster smaller than UnmergeableSize, and no distance
/// limit is applied (the search is given <c>long.MaxValue</c> as its maximum distance).
/// </summary>
private void MergeOutliers()
{
    var finder = new ClosestCluster<string>(Clusters);
    var outlierPairs = finder.FindClosestOutliers(
        Clusters.NumPartitions,
        long.MaxValue,
        UnmergeableSize
    );

    foreach (var outlierPair in outlierPairs)
    {
        Merge(outlierPair.Point1, outlierPair.Point2);
    }
}
/// <summary>
/// Performs unassisted classification of the points: orders them along a Hilbert curve,
/// merges neighbors along that ordering, merges close large clusters, and finally
/// merges outliers into nearby clusters.
/// </summary>
/// <returns>The <c>Clusters</c> classification after all merge phases have run.</returns>
public Classification<UnsignedPoint, string> Classify()
{
    // Steps 3-5: create Hilbert indexes, pick the ordering that predicts the lowest number
    // of clusters K, and take the characteristic merge distance S (MergeSquareDistance) from it.
    //TODO: Support formation and use of more than one HilbertIndex, to respect IndexBudget.IndexCount.
    var useOptimalPermutation = true;
    UnsignedPoint[] orderedPoints;

    Timer.Start("Find optimum Hilbert ordering");
    if (useOptimalPermutation)
    {
        var best = OptimalPermutation.Search(
            Clusters.Points().ToList(),
            BitsPerDimension,
            IndexConfig.OutlierSize,
            IndexConfig.NoiseSkipBy,
            IndexConfig.ReducedNoiseSkipBy,
            IndexConfig.MaxTrials,
            IndexConfig.MaxIterationsWithoutImprovement,
            IndexConfig.UseSample,
            true
        );
        orderedPoints = best.SortedPoints.ToArray();
        MergeSquareDistance = best.MergeSquareDistance;
    }
    else
    {
        var best = OptimalIndex.Search(
            HilbertPoints,
            IndexConfig.OutlierSize,
            IndexConfig.NoiseSkipBy,
            IndexConfig.ReducedNoiseSkipBy,
            IndexConfig.MaxTrials,
            IndexConfig.MaxIterationsWithoutImprovement,
            IndexConfig.UseSample,
            true
        );
        orderedPoints = HilbertOrderedPoints(best.SortedPointIndices.ToList());
        MergeSquareDistance = best.MergeSquareDistance;
    }
    Timer.Stop("Find optimum Hilbert ordering");

    // Step 6: walk the points in Hilbert order; every consecutive pair closer than S
    // ends up in the same cluster.
    Timer.Start("Merge by Hilbert index");
    MergeByHilbertIndex(orderedPoints);
    Timer.Stop("Merge by Hilbert index");

    // Steps 7-9: measure centroid-to-centroid distances between the non-outlier clusters,
    // probe the closest neighbors for their nearest pair of points (square Cartesian
    // separation s), and transitively merge any pair of clusters with s <= S.
    Timer.Start("Merge neighboring large clusters");
    var finder = new ClosestCluster<string>(Clusters);
    var candidatePairs = finder.FindClosestClusters(
        MaxNeighborsToCompare,
        MergeSquareDistance,
        OutlierSize,
        UseExactClusterDistance
    );
    // Deferred filter: evaluated pair by pair as the loop below enumerates it.
    var mergeablePairs = candidatePairs.Where(p => p.SquareDistance <= MergeSquareDistance);
    var clusterMerges = 0;
    foreach (var closePair in mergeablePairs)
    {
        closePair.Relabel(Clusters);
        if (Clusters.Merge(closePair.Color1, closePair.Color2))
        {
            clusterMerges++;
        }
    }
    Timer.Stop("Merge neighboring large clusters");

    // Step 10: merge each remaining outlier (small cluster) with its nearest large cluster,
    // unless it lies farther away than MergeSquareDistance * OutlierDistanceMultiplier.
    // This phase must not cause two large clusters to be joined to each other.
    Timer.Start("Merge outliers");
    var outlierMergeLimit = (long)(MergeSquareDistance * OutlierDistanceMultiplier);
    var outlierMerges = MergeOutliers(outlierMergeLimit);
    Timer.Stop("Merge outliers");

    Logger.Info($" {clusterMerges} Cluster merges, {outlierMerges} Outlier merges");
    return Clusters;
}