/// <summary>
/// Finds exactly the cluster closest to the cluster whose label matches color1 and the points
/// in each cluster that are closest, along with the square distance between them.
/// Every point labeled color1 is measured against every point that carries a different label.
/// </summary>
/// <param name="color1">Label of the cluster whose nearest neighbor is being sought.</param>
/// <returns>A point in the cluster corresponding to color1, the closest point to it
/// from another cluster, the square distance between the points, and the label of the other cluster.
/// NOTE: When a pair is found, ClosestPair.Color1 will equal color1.
/// If no pair is found (for example, there is only one cluster, so no point has a different label),
/// the newly constructed ClosestPair is returned without any of its fields having been assigned here.
/// </returns>
public ClosestPair FindClusterExhaustively(TLabel color1)
{
    // The set of labeled points outside of color1 does not depend on p1, so build it once
    // instead of re-querying the label of every point for each point in the color1 cluster.
    var otherPoints = Clusters.Points()
        .Select(p => new { Point = p, Color = Clusters.GetClassLabel(p) })
        .Where(pc => !color1.Equals(pc.Color))
        .ToList();

    var shortest = new ClosestPair();
    var foundPair = false;
    foreach (var p1 in Clusters.PointsInClass(color1))
    {
        foreach (var pc in otherPoints)
        {
            var d = p1.Measure(pc.Point);
            // Strict comparison: on ties the first pair encountered is kept.
            if (d < shortest.SquareDistance)
            {
                shortest.SquareDistance = d;
                shortest.Color1 = color1;
                shortest.Point1 = p1;
                shortest.Color2 = pc.Color;
                shortest.Point2 = pc.Point;
                foundPair = true;
            }
        }
    }

    // Without a pair the colors and points of shortest were never assigned, and calling Swap
    // on such a value could dereference null, so hand the unpopulated result back as-is.
    if (!foundPair)
    {
        return shortest;
    }
    return shortest.Swap(color1);
}
/// <summary>
/// Sorts the points according to their position in a Hilbert curve.
/// Each point in Clusters is converted to a HilbertPoint, the HilbertPoints are sorted,
/// and SortedPoints receives the original points in that sorted order.
/// </summary>
private void SortPoints()
{
    // The number of bits per dimension is derived from the largest coordinate of any point.
    var largestCoordinate = Clusters.Points().Select(point => point.MaxCoordinate).Max();
    var bitsPerDimension = ((int)largestCoordinate + 1).SmallestPowerOfTwo();

    // Remember which original point each HilbertPoint came from so that the
    // sorted HilbertPoints can be translated back afterwards.
    var originalByHilbertPoint = new Dictionary<HilbertPoint, UnsignedPoint>();
    var hilbertPoints = new List<HilbertPoint>();
    foreach (UnsignedPoint original in Clusters.Points())
    {
        var converted = new HilbertPoint(original.Coordinates, bitsPerDimension);
        originalByHilbertPoint[converted] = original;
        hilbertPoints.Add(converted);
    }

    hilbertPoints.Sort();

    var ordered = new List<UnsignedPoint>();
    foreach (var hilbertPoint in hilbertPoints)
    {
        ordered.Add(originalByHilbertPoint[hilbertPoint]);
    }
    SortedPoints = ordered;
}
/// <summary>
/// Check if the ids in the points in the Clusters match the ids in the SortedPoints.
/// This is a cheap check: only the number of points and the lowest and highest UniqueId
/// of the two collections are compared, not every individual id.
/// </summary>
/// <exception cref="InvalidOperationException">If the two collections differ in their number of points,
/// their lowest UniqueId or their highest UniqueId.</exception>
private void ValidateIds()
{
    // The counts may differ in either direction, so report both instead of guessing which is larger.
    if (SortedPoints.Count != Clusters.NumPoints)
    {
        throw new InvalidOperationException(
            $"Clusters holds {Clusters.NumPoints} points but SortedPoints holds {SortedPoints.Count}");
    }

    // Single pass over each collection to find its smallest and largest id.
    var idRangeClusters = Clusters.Points().Aggregate(
        new { Min = int.MaxValue, Max = int.MinValue },
        (accumulator, o) => new
        {
            Min = Math.Min(o.UniqueId, accumulator.Min),
            Max = Math.Max(o.UniqueId, accumulator.Max)
        }
    );
    var idRangeSortedPoints = SortedPoints.Aggregate(
        new { Min = int.MaxValue, Max = int.MinValue },
        (accumulator, o) => new
        {
            Min = Math.Min(o.UniqueId, accumulator.Min),
            Max = Math.Max(o.UniqueId, accumulator.Max)
        }
    );

    if (idRangeClusters.Min != idRangeSortedPoints.Min)
    {
        throw new InvalidOperationException("The lowest Id among the points in SortedPoints and Clusters is not the same");
    }
    if (idRangeClusters.Max != idRangeSortedPoints.Max)
    {
        throw new InvalidOperationException("The highest Id among the points in SortedPoints and Clusters is not the same");
    }

    // NOTE: An exhaustive comparison (collect every UniqueId from Clusters in a HashSet and verify
    // that each point in SortedPoints is present) would detect more mismatches, but is more costly.
}
/// <summary>
/// Perform unassisted classification of points.
/// The points are put in a Hilbert ordering, neighbors in that ordering are merged,
/// then nearby clusters are merged and finally outliers are merged.
/// </summary>
/// <returns>The Clusters object, after all merge phases have been applied to it.</returns>
public Classification<UnsignedPoint, string> Classify()
{
    // 3) Create multiple HilbertIndexes.
    // 4) Find best HilbertIndex and find the one that predicts the lowest number of clusters K (OptimalIndex).
    // 5) Set the characteristic merge distance S (MergeSquareDistance).
    //TODO: Support formation and use of more than one HilbertIndex, to respect IndexBudget.IndexCount.
    var useOptimalPermutation = true;
    UnsignedPoint[] hilbertOrderedPoints;

    Timer.Start("Find optimum Hilbert ordering");
    if (useOptimalPermutation)
    {
        // Search over permutations, starting from the points currently held in Clusters.
        var bestPermutation = OptimalPermutation.Search(
            Clusters.Points().ToList(),
            BitsPerDimension,
            IndexConfig.OutlierSize,
            IndexConfig.NoiseSkipBy,
            IndexConfig.ReducedNoiseSkipBy,
            IndexConfig.MaxTrials,
            IndexConfig.MaxIterationsWithoutImprovement,
            IndexConfig.UseSample,
            true
        );
        hilbertOrderedPoints = bestPermutation.SortedPoints.ToArray();
        MergeSquareDistance = bestPermutation.MergeSquareDistance;
    }
    else
    {
        // Search using the already prepared HilbertPoints; the result identifies points by index,
        // so translate it back into points.
        var bestIndex = OptimalIndex.Search(
            HilbertPoints,
            IndexConfig.OutlierSize,
            IndexConfig.NoiseSkipBy,
            IndexConfig.ReducedNoiseSkipBy,
            IndexConfig.MaxTrials,
            IndexConfig.MaxIterationsWithoutImprovement,
            IndexConfig.UseSample,
            true
        );
        hilbertOrderedPoints = HilbertOrderedPoints(bestIndex.SortedPointIndices.ToList());
        MergeSquareDistance = bestIndex.MergeSquareDistance;
    }
    Timer.Stop("Find optimum Hilbert ordering");

    // 6) Pass over the points in Hilbert order. Every consecutive pair closer than the distance S is merged into the
    //    same cluster.
    Timer.Start("Merge by Hilbert index");
    MergeByHilbertIndex(hilbertOrderedPoints);
    Timer.Stop("Merge by Hilbert index");

    // 7) Find the distance from the Centroid of each non-outlier cluster to every other large cluster (ClosestCluster).
    // 8) For the closest neighboring large clusters, probe deeper and find the pair of points,
    //    one drawn from each of two clusters, that is closest and their separation s (square Cartesian distance).
    // 9) If a pair of clusters is closer than S (s ≤ S), merge them, transitively.
    Timer.Start("Merge neighboring large clusters");
    var neighborFinder = new ClosestCluster<string>(Clusters);
    var candidatePairs = neighborFinder.FindClosestClusters(
        MaxNeighborsToCompare,
        MergeSquareDistance,
        OutlierSize,
        UseExactClusterDistance
    );
    var clusterMerges = 0;
    foreach (var candidate in candidatePairs)
    {
        if (candidate.SquareDistance <= MergeSquareDistance)
        {
            // Relabel is applied before merging; Merge reports whether a merge took place.
            candidate.Relabel(Clusters);
            if (Clusters.Merge(candidate.Color1, candidate.Color2))
            {
                clusterMerges++;
            }
        }
    }
    Timer.Stop("Merge neighboring large clusters");

    // 10) Merge outliers with neighboring clusters.
    //     For all the remaining outliers (small clusters), merge them with the nearest large cluster
    //     unless their distance is too great (MergeSquareDistance * OutlierDistanceMultiplier).
    //     Do not permit this phase to cause two large clusters to be joined to each other.
    Timer.Start("Merge outliers");
    var maxOutlierMergeDistance = (long)(MergeSquareDistance * OutlierDistanceMultiplier);
    var outlierMerges = MergeOutliers(maxOutlierMergeDistance);
    Timer.Stop("Merge outliers");

    Logger.Info($" {clusterMerges} Cluster merges, {outlierMerges} Outlier merges");
    return Clusters;
}
/// <summary>
/// Arranges the points held in Clusters in the order given by a list of ids,
/// using a KeySorter that matches each id to the UniqueId of a point.
/// </summary>
/// <param name="hilbertSortedIds">Point ids in the desired (Hilbert sorted) order.</param>
/// <returns>The array of points produced by the KeySorter.</returns>
private UnsignedPoint[] HilbertOrderedPoints(IList<int> hilbertSortedIds)
{
    var sorter = new KeySorter<int, UnsignedPoint>(id => id, point => point.UniqueId);
    var unorderedPoints = Clusters.Points().ToList();
    // NOTE(review): the meaning of the final argument (0) is defined by KeySorter.Sort, which is not visible here.
    return sorter.Sort(unorderedPoints, hilbertSortedIds, 0);
}