/// <summary>
/// Compare the exact coarseness with an estimate for all numbers of bits.
///
/// This takes an assemblage of many clusters and finds the most concentrated
/// cluster according to a single bit Hilbert curve.
/// Then it composes a GridCoarseness for the points in that cluster.
/// </summary>
/// <param name="numPoints">Number of points</param>
/// <param name="dimensions">Number of dimensions</param>
/// <param name="clusterCount">Number of clusters</param>
/// <param name="maxCoordinate">Largest value any coordinate of any dimension can hold</param>
/// <param name="minStdDeviation">Minimum standard deviation among coordinate values relative to the center of each Gaussian cluster generated.</param>
/// <param name="maxStdDeviation">Maximum standard deviation among coordinate values relative to the center of each Gaussian cluster generated.</param>
/// <returns>The GridCoarseness.</returns>
GridCoarseness MakeTestGrid(int numPoints, int dimensions, int clusterCount, int maxCoordinate, int minStdDeviation = 10, int maxStdDeviation = 30)
{
    var avgClusterSize = numPoints / clusterCount;
    var data = new GaussianClustering
    {
        ClusterCount = clusterCount,
        Dimensions = dimensions,
        MaxCoordinate = maxCoordinate,
        MinClusterSize = avgClusterSize - 100,
        MaxClusterSize = avgClusterSize + 100,
        MaxDistanceStdDev = maxStdDeviation,
        MinDistanceStdDev = minStdDeviation
    };
    var clusters = data.MakeClusters();
    var points = clusters.Points().ToList();
    PointBalancer balancer = null;
    var bitsRequired = (maxCoordinate + 1).SmallestPowerOfTwo();

    // Sort with a one-bit-per-dimension Hilbert curve; points tied at that
    // resolution are grouped into buckets.
    var lowresSort = HilbertSort.SortWithTies(points, 1, ref balancer);

    // The most concentrated cluster is the largest bucket of ties.
    // First() (not FirstOrDefault) because MakeClusters always yields points,
    // and the original dereferenced the result unconditionally anyway.
    var largestBucket = lowresSort.OrderByDescending(bucket => bucket.Length).First();
    var grid = new GridCoarseness(largestBucket, bitsRequired);
    return grid;
}
/// <summary>
/// For random clustered data, discover how unique shortened versions of the Hilbert index are.
/// </summary>
/// <param name="numPoints">Number of points.</param>
/// <param name="dimensions">Dimensions per point.</param>
/// <param name="clusterCount">Number of clusters.</param>
/// <param name="smallBucketSize">Count of items that constitutes a small bucket.</param>
/// <param name="maxCoordinate">Highest permitted coordinate value.</param>
public void ClusteredUniquenessByBits(int numPoints, int dimensions, int clusterCount, int smallBucketSize, int maxCoordinate)
{
    const int clusterSizeVariation = 100;
    var averageClusterSize = numPoints / clusterCount;
    var data = new GaussianClustering
    {
        ClusterCount = clusterCount,
        Dimensions = dimensions,
        MaxCoordinate = maxCoordinate,
        MinClusterSize = averageClusterSize - clusterSizeVariation,
        MaxClusterSize = averageClusterSize + clusterSizeVariation
    };
    var points = data.MakeClusters().Points().ToList();

    PointBalancer balancer = null;
    var bitsRequired = (maxCoordinate + 1).SmallestPowerOfTwo();

    // Sweep every curve resolution from one bit up to full precision and
    // report how well each resolution separates the points into buckets.
    for (var bits = 1; bits <= bitsRequired; bits++)
    {
        var maxBucketSize = MaxBucketSizePerBits(points, bits, smallBucketSize, ref balancer, out int pointsInSmallBuckets);
        var pctInSmallBuckets = 100.0 * pointsInSmallBuckets / points.Count;
        Console.WriteLine($"Bits: {bits} Max Bucket: {maxBucketSize} # in Small Buckets: {pointsInSmallBuckets} - {pctInSmallBuckets} %");
    }
}
/// <summary>
/// Time sorting via HilbertIndex against HilbertSort.BalancedSort and assert the
/// balanced sort is faster. The PointBalancer construction is timed separately
/// so its share of the balanced-sort cost is visible in the report.
/// </summary>
public void CompareSpeedOfSorting_Balanced_vs_HilbertIndex()
{
    var points = TestData(20000, 50, 20, 1000000, 100, 500, out int bitsPerDimension);

    // 1. HilbertIndex
    var indexTimer = Stopwatch.StartNew();
    var hIndex = new HilbertIndex(points.Select(p => new HilbertPoint(p.Coordinates, bitsPerDimension)));
    var sortedPointsFromIndex = hIndex.SortedPoints;
    indexTimer.Stop();
    var hilbertIndexTime = indexTimer.ElapsedMilliseconds;

    // 2. HilbertSort.BalancedSort (ctorTimer covers only the balancer construction)
    var sortTimer = Stopwatch.StartNew();
    var ctorTimer = Stopwatch.StartNew();
    PointBalancer balancer = new PointBalancer(points);
    ctorTimer.Stop();
    HilbertSort.BalancedSort(points.ToList(), ref balancer);
    sortTimer.Stop();
    var balancedSortTime = sortTimer.ElapsedMilliseconds;
    var balancerTime = ctorTimer.ElapsedMilliseconds;

    var message = $"HilbertIndex required {hilbertIndexTime / 1000.0} sec. Balanced Sort required {balancedSortTime / 1000.0} sec, of which {balancerTime / 1000.0} sec is Balancer ctor. Relative Cost = {HilbertSort.RelativeSortCost}";
    Console.WriteLine(message);
    Assert.Greater(hilbertIndexTime, balancedSortTime, message);
}
/// <summary>
/// Verify that SmallBalancedSort puts tightly clustered points in the same order
/// that BalancedSort produces.
/// </summary>
public void InPlaceSort_NarrowClusters()
{
    var points = TestData(20000, 50, 20, 1000000, 10, 30, out int bitsPerDimension);
    var balancer = new PointBalancer(points);

    // Capture the reference ordering from a copy so the in-place sort below
    // starts from the original arrangement.
    var expectedOrder = HilbertSort.BalancedSort(points.ToList(), ref balancer).ToArray();
    HilbertSort.SmallBalancedSort(points, ref balancer);

    CollectionAssert.AreEqual(expectedOrder, points, "Not in same order");
}
/// <summary>
/// For uniformly random data, discover how unique shortened versions of the Hilbert index are.
/// </summary>
/// <remarks>
/// The results from this test show that for truly random data, every point (or almost every point)
/// ends up in its own bucket even if only one bit per dimension is used.
/// </remarks>
/// <param name="numPoints">Number of points.</param>
/// <param name="dimensions">Dimensions per point.</param>
/// <param name="smallBucketSize">Count of items that constitutes a small bucket.</param>
/// <param name="maxCoordinate">Highest permitted coordinate value.</param>
public void UniformUniquenessByBits(int numPoints, int dimensions, int smallBucketSize, int maxCoordinate)
{
    var points = TestDataHelper.UniformRandomPoints(numPoints, dimensions, maxCoordinate);
    PointBalancer balancer = null;
    var bitsRequired = (maxCoordinate + 1).SmallestPowerOfTwo();

    // Record the largest bucket size observed at each curve resolution.
    var maxBucketSizes = new int[bitsRequired];
    for (var bits = 1; bits <= bitsRequired; bits++)
    {
        maxBucketSizes[bits - 1] = MaxBucketSizePerBits(points, bits, smallBucketSize, ref balancer, out int pointsInSmallBuckets);
        var pctInSmallBuckets = 100.0 * pointsInSmallBuckets / points.Count;
        Console.WriteLine($"Bits: {bits} Max Bucket: {maxBucketSizes[bits-1]} # in Small Buckets: {pointsInSmallBuckets} - {pctInSmallBuckets} %");
    }

    Assert.LessOrEqual(maxBucketSizes[0], 2, $"Even a one-bit Hilbert curve should be enough to distinguish random points, but maxBucketSize is {maxBucketSizes[0]}");
}
/// <summary>
/// Measure HilbertSort.RelativeSortCost for SmallBalancedSort over a matrix of
/// cluster counts and cluster standard deviations, and print a CSV report.
/// </summary>
public void InPlaceSortRelativeCost()
{
    var clusters = new[] { 10, 20, 50, 100 };
    var stdDeviations = new[] { 20, 100, 200, 1000, 2000 };
    var dimensions = 50;
    var numPoints = 20000;

    // Accumulate the CSV report in a StringBuilder instead of repeated string
    // concatenation (the original allocated a new string on every iteration).
    var report = new System.Text.StringBuilder("Clusters,Standard Deviation,Relative Cost\n");
    foreach (var k in clusters)
    {
        foreach (var sd in stdDeviations)
        {
            var points = TestData(numPoints, dimensions, k, 1000000, sd, sd, out int bitsPerDimension);
            PointBalancer balancer = new PointBalancer(points);
            HilbertSort.SmallBalancedSort(points, ref balancer);
            var cost = HilbertSort.RelativeSortCost;
            report.Append($"{k},{sd},{cost}\n");
        }
    }
    Console.WriteLine($"\n\nFinal report:\n\n{report}");
}
/// <summary>
/// Verify that a low-resolution Hilbert sort orders points consistently with the
/// full-resolution sort: every pair of points is either in the same relative order
/// in both sorts, or tied for position in the lowres ordering.
/// </summary>
/// <param name="numPoints">Number of points.</param>
/// <param name="dimensions">Dimensions per point.</param>
/// <param name="clusterCount">Number of clusters.</param>
/// <param name="lowresBits">Bits per dimension for the low-resolution sort.</param>
public void LowresVersusHiresCase(int numPoints, int dimensions, int clusterCount, int lowresBits)
{
    var maxCoordinate = 1000;
    var clusterSizeVariation = 100;
    var minClusterSize = (numPoints / clusterCount) - clusterSizeVariation;
    var maxClusterSize = (numPoints / clusterCount) + clusterSizeVariation;
    var data = new GaussianClustering
    {
        ClusterCount = clusterCount,
        Dimensions = dimensions,
        MaxCoordinate = maxCoordinate,
        MinClusterSize = minClusterSize,
        MaxClusterSize = maxClusterSize
    };
    var clusters = data.MakeClusters();
    var points = clusters.Points().ToList();
    PointBalancer balancer = null;
    var hiresSort = HilbertSort.BalancedSort(points, ref balancer);
    var lowresSort = HilbertSort.SortWithTies(points, lowresBits, ref balancer);

    // Record each point's position in the hires ordering with a plain loop.
    // (The original iterated a side-effecting LINQ Select inside an empty
    // foreach, which relied on deferred execution and obscured intent.)
    var hiresPositions = new Dictionary<UnsignedPoint, int>();
    var hiresIndex = 0;
    foreach (var p in hiresSort)
    {
        hiresPositions[p] = hiresIndex++;
    }

    // Points tied in the lowres ordering share a bucket; every member of a
    // bucket is assigned that bucket's position.
    var lowresPositions = new Dictionary<UnsignedPoint, int>();
    var bucketPosition = 0;
    foreach (var bucket in lowresSort)
    {
        foreach (var point in bucket)
        {
            lowresPositions[point] = bucketPosition;
        }
        bucketPosition++;
    }

    var actualNumPoints = points.Count;
    var largestBucket = lowresSort.Select(bucket => bucket.Length).Max();
    var caseDescription = $"N = {actualNumPoints} D = {dimensions} K = {clusterCount} B = {lowresBits}";
    Console.WriteLine(caseDescription);
    Console.WriteLine($"Lowres buckets = {lowresSort.Count} Largest bucket = {largestBucket}");

    // Compare the positions of all pairs of points in the two orderings to see that
    // they are either in the same relative order
    // or tied for position in the lowres ordering.
    int outOfPlaceCount = 0;
    for (var i = 0; i < actualNumPoints - 1; i++)
    {
        var p1 = points[i];
        for (var j = i + 1; j < actualNumPoints; j++)
        {
            var p2 = points[j];
            var lowresPosition1 = lowresPositions[p1];
            var lowresPosition2 = lowresPositions[p2];
            var hiresPosition1 = hiresPositions[p1];
            var hiresPosition2 = hiresPositions[p2];
            if (lowresPosition1 != lowresPosition2)
            {
                if (lowresPosition1 < lowresPosition2 != hiresPosition1 < hiresPosition2)
                {
                    outOfPlaceCount++;
                }
            }
        }
    }
    var msg = $"Out of place count = {outOfPlaceCount}";
    Console.WriteLine(msg);
    Assert.AreEqual(0, outOfPlaceCount, msg);
}
/// <summary>
/// Sort points with a reduced-precision Hilbert curve and measure how the points
/// distribute among buckets of ties.
/// </summary>
/// <param name="points">Points to sort.</param>
/// <param name="lowresBits">Bits per dimension to use for the low-resolution Hilbert sort.</param>
/// <param name="smallBucketSize">A bucket holding this many points or fewer counts as small.</param>
/// <param name="balancer">Balancer to reuse; if null, a new one is created and passed back to the caller.</param>
/// <param name="pointsInSmallBuckets">Total number of points that fall into small buckets.</param>
/// <returns>The size of the largest bucket of tied points.</returns>
public int MaxBucketSizePerBits(List<UnsignedPoint> points, int lowresBits, int smallBucketSize, ref PointBalancer balancer, out int pointsInSmallBuckets)
{
    balancer = balancer ?? new PointBalancer(points);
    var lowresSort = HilbertSort.SortWithTies(points, lowresBits, ref balancer);

    // (Removed dead work: the original also built a point-to-position dictionary
    // and a case-description string that fed only commented-out debug output.)
    var largestBucket = lowresSort.Select(bucket => bucket.Length).Max();

    // Count only the points in buckets at or below the small-bucket threshold.
    pointsInSmallBuckets = lowresSort.Select(bucket => bucket.Length > smallBucketSize ? 0 : bucket.Length).Sum();
    return largestBucket;
}