/// <summary>
/// Times the construction of a HilbertIndex (including reading its SortedPoints) against
/// HilbertSort.Sort over the same test points, prints both durations, and asserts via
/// Assert.Greater that the HilbertIndex time exceeds the sort time.
/// </summary>
public void CompareSpeedOfSorting_Unbalanced_vs_HilbertIndex()
{
    var points = TestData(20000, 50, 20, 1000000, 100, 500, out int bitsPerDimension);

    // 1. HilbertIndex
    var indexWatch = Stopwatch.StartNew();
    var index = new HilbertIndex(points.Select(p => new HilbertPoint(p.Coordinates, bitsPerDimension)));
    // The property is read inside the timed region; its value is not needed afterwards.
    _ = index.SortedPoints;
    indexWatch.Stop();
    long indexMillis = indexWatch.ElapsedMilliseconds;

    // 2. HilbertSort.Sort
    var sortWatch = Stopwatch.StartNew();
    HilbertSort.Sort(points.ToList(), bitsPerDimension);
    sortWatch.Stop();
    long sortMillis = sortWatch.ElapsedMilliseconds;

    var message = $"HilbertIndex required {indexMillis / 1000.0} sec. Unbalanced Sort required {sortMillis / 1000.0} sec.";
    Console.WriteLine(message);
    Assert.Greater(indexMillis, sortMillis, message);
}
/// <summary>
/// Builds a HilbertIndex over generated Gaussian clusters and prints, for every point in
/// the index's SortedPoints, the value of DensityMeter.ExactNeighbors alongside the value of
/// DensityMeter.EstimatedDensity as comma-separated text: a header line followed by one
/// "exact,estimate" record per line.
/// </summary>
public void DensityCompared()
{
    var bitsPerDimension = 10;
    var data = new GaussianClustering
    {
        ClusterCount = 50,
        Dimensions = 100,
        MaxCoordinate = (1 << bitsPerDimension) - 1,
        MinClusterSize = 100,
        MaxClusterSize = 500
    };
    var expectedClusters = data.MakeClusters();
    var hIndex = new HilbertIndex(expectedClusters, bitsPerDimension);
    var cc = new ClusterCounter { NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1 };
    var count = cc.Count(hIndex.SortedPoints);

    // The neighborhood distance is two fifths of the counter's MaximumSquareDistance.
    var neighborhoodDistance = count.MaximumSquareDistance * 2 / 5;
    var numPoints = hIndex.SortedPoints.Count;
    // Integer division happens first; the square root is then truncated to an int.
    var windowRadius = (int)Math.Sqrt(numPoints / 2);
    var dMeter = new DensityMeter(hIndex, neighborhoodDistance, windowRadius);

    Console.WriteLine($"Window Radius = {windowRadius}. {hIndex.SortedPoints.Count} points");

    // Each record is terminated with a newline. Without it the header and all rows would be
    // concatenated into a single line, leaving the values of adjacent rows indistinguishable.
    Console.WriteLine("Exact,Estimated");
    for (var i = 0; i < numPoints; i++)
    {
        var p = hIndex.SortedPoints[i];
        var exact = dMeter.ExactNeighbors(p);
        var estimate = dMeter.EstimatedDensity(p, windowRadius);
        Console.WriteLine($"{exact},{estimate}");
    }
}
/// <summary>
/// For the same test data, create a single HilbertIndex many times and average the execution time across all indices.
///
/// The goal is to identify how the time depends on number of points N, number of dimensions D, and bits per coordinate B.
/// (It should be insensitive to cluster count K.)
/// </summary>
/// <param name="N">Number of points to index. Every cluster is given N / K points (integer division),
/// and each index is asserted to hold exactly N points.</param>
/// <param name="K">Number of clusters of points to create.</param>
/// <param name="D">Number dimensions.</param>
/// <param name="B">Number bits; the maximum coordinate is (1 &lt;&lt; B) - 1.</param>
/// <param name="repeats">Number of times to repeat.</param>
/// <returns>Average number of seconds to create the index, averaged over <paramref name="repeats"/> tries.
/// The time excludes the time to create the test data.
/// </returns>
private double SingleIndexCreationPerformanceCase(int N, int K, int D, int B, int repeats)
{
    var data = new GaussianClustering
    {
        ClusterCount = K,
        Dimensions = D,
        MaxCoordinate = (1 << B) - 1,
        MinClusterSize = N / K,
        MaxClusterSize = N / K
    };
    var clusters = data.MakeClusters();

    // A single Stopwatch accumulates elapsed time across Start/Stop pairs. Reading the total once
    // as fractional seconds avoids truncating every repeat to whole milliseconds, which would
    // bias the average low whenever one index creation takes only a few milliseconds.
    var timer = new Stopwatch();
    for (var i = 0; i < repeats; i++)
    {
        timer.Start();
        var hIndex = new HilbertIndex(clusters, B);
        Assert.AreEqual(N, hIndex.Count, "Index has wrong number of points");
        timer.Stop();
    }
    return timer.Elapsed.TotalSeconds / repeats;
}
/// <summary>
/// Times the construction of a HilbertIndex (including reading its SortedPoints) against
/// HilbertSort.BalancedSort over the same test points. The balanced-sort time includes the
/// PointBalancer constructor, which is also timed separately. Prints the durations together
/// with HilbertSort.RelativeSortCost and asserts via Assert.Greater that the HilbertIndex
/// time exceeds the balanced-sort time.
/// </summary>
public void CompareSpeedOfSorting_Balanced_vs_HilbertIndex()
{
    var points = TestData(20000, 50, 20, 1000000, 100, 500, out int bitsPerDimension);

    // 1. HilbertIndex
    var indexWatch = Stopwatch.StartNew();
    var index = new HilbertIndex(points.Select(p => new HilbertPoint(p.Coordinates, bitsPerDimension)));
    // The property is read inside the timed region; its value is not needed afterwards.
    _ = index.SortedPoints;
    indexWatch.Stop();
    long indexMillis = indexWatch.ElapsedMilliseconds;

    // 2. HilbertSort.BalancedSort
    // The outer watch spans balancer construction plus the sort; the inner one only the constructor.
    var sortWatch = Stopwatch.StartNew();
    var balancerWatch = Stopwatch.StartNew();
    var balancer = new PointBalancer(points);
    balancerWatch.Stop();
    HilbertSort.BalancedSort(points.ToList(), ref balancer);
    sortWatch.Stop();
    long sortMillis = sortWatch.ElapsedMilliseconds;
    long balancerMillis = balancerWatch.ElapsedMilliseconds;

    var message = $"HilbertIndex required {indexMillis / 1000.0} sec. Balanced Sort required {sortMillis / 1000.0} sec, of which {balancerMillis / 1000.0} sec is Balancer ctor. Relative Cost = {HilbertSort.RelativeSortCost}";
    Console.WriteLine(message);
    Assert.Greater(indexMillis, sortMillis, message);
}
/// <summary>
/// Compare the HilbertIndex values of the two points, but use the UniqueId as a tie-breaker.
///
/// This permits sorting by HilbertIndex.
/// </summary>
/// <param name="other">Second point in comparison. A null reference is treated as sorting before this point.</param>
/// <returns>A negative number if this point sorts before <paramref name="other"/>, zero if both the
/// HilbertIndex and the UniqueId match, and a positive number if this point sorts after it.
/// The magnitude is whatever the underlying CompareTo calls return; only the sign is meaningful.</returns>
public int CompareTo(HilbertPoint other)
{
    // By the usual IComparable convention any instance compares greater than null.
    // Without this guard a null argument would surface as a NullReferenceException.
    if (ReferenceEquals(other, null))
    {
        return 1;
    }

    var cmp = HilbertIndex.CompareTo(other.HilbertIndex);
    if (cmp == 0)
    {
        // Equal Hilbert indices: fall back to the id so that the ordering is still decided.
        cmp = UniqueId.CompareTo(other.UniqueId);
    }
    return cmp;
}
/// <summary>
/// Runs DensityCorrelationCase for every combination of point count and window radius, the whole
/// sweep being repeated <paramref name="repeats"/> times, and accumulates the results per label.
/// </summary>
/// <param name="varyWindowRadius">Window radii to try; they are visited in descending order.</param>
/// <param name="varyNumPoints">Point counts to try; each is divided evenly (integer division) among the clusters.</param>
/// <param name="dimensions">Number of dimensions of the generated points.</param>
/// <param name="clusterCount">Number of clusters to generate.</param>
/// <param name="repeats">How many times the whole sweep is repeated.</param>
/// <returns>A dictionary from the label produced by MakeLabel to the CorrelationStats accumulated under that label.</returns>
private Dictionary<string, CorrelationStats> DensityCorrelationCases(int[] varyWindowRadius, int[] varyNumPoints, int dimensions, int clusterCount, int repeats = 1)
{
    int bitsPerDimension = 10;
    var statsByLabel = new Dictionary<string, CorrelationStats>();

    for (var pass = 0; pass < repeats; pass++)
    {
        foreach (var numPoints in varyNumPoints)
        {
            var pointsPerCluster = numPoints / clusterCount;
            var generator = new GaussianClustering
            {
                ClusterCount = clusterCount,
                Dimensions = dimensions,
                MaxCoordinate = (1 << bitsPerDimension) - 1,
                MinClusterSize = pointsPerCluster,
                MaxClusterSize = pointsPerCluster
            };
            var expectedClusters = generator.MakeClusters();
            var index = new HilbertIndex(expectedClusters, bitsPerDimension);
            var counter = new ClusterCounter { NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1 };
            var tally = counter.Count(index.SortedPoints);
            var neighborhoodDistance = tally.MaximumSquareDistance * 2 / 5;

            // NOTE(review): the meter is constructed with the first radius as supplied, which is not
            // necessarily the largest one visited below - confirm this is intended.
            var meter = new DensityMeter(index, neighborhoodDistance, varyWindowRadius[0]);

            // Radii are processed from largest to smallest because that is more efficient: the
            // DistanceMemo can reuse more work, since a larger window, once processed, already
            // includes every shorter window.
            foreach (var windowRadius in varyWindowRadius.OrderByDescending(r => r))
            {
                var label = MakeLabel(numPoints, windowRadius, dimensions, clusterCount);
                if (!statsByLabel.TryGetValue(label, out var labelStats))
                {
                    labelStats = new CorrelationStats(label);
                    statsByLabel[label] = labelStats;
                }
                labelStats.Add(DensityCorrelationCase(meter, windowRadius));
                Console.Write(labelStats);
            }
        }
    }

    return statsByLabel;
}
/// <summary>
/// Builds a HilbertIndex over generated Gaussian clusters, then computes the Kendall TauB
/// correlation between DensityMeter.ExactNeighbors and DensityMeter.EstimatedDensity over the
/// first 1000 sorted points, prints it, and asserts that it is at least 0.90.
/// </summary>
public void DensityCorrelation()
{
    int bitsPerDimension = 10;
    var generator = new GaussianClustering
    {
        ClusterCount = 50,
        Dimensions = 100,
        MaxCoordinate = (1 << bitsPerDimension) - 1,
        MinClusterSize = 100,
        MaxClusterSize = 500
    };
    var expectedClusters = generator.MakeClusters();
    var index = new HilbertIndex(expectedClusters, bitsPerDimension);
    var counter = new ClusterCounter { NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1 };
    var tally = counter.Count(index.SortedPoints);

    // Picking neighborhoodDistance well is crucial.
    //   - Too large: a huge number of neighbors is swept up, and estimating that number through a
    //     window onto the Hilbert curve gives poor results. With 200 neighbors and a window size of
    //     100, many points see their neighbor count saturate near 100 and no meaningful variation
    //     in density shows up.
    //   - Too small: too few neighbors (or none!) are found, so the density is not meaningful.
    //   - Almost every point has two neighbors within MaximumSquareDistance, so the value should be
    //     smaller than MaximumSquareDistance.
    var neighborhoodDistance = tally.MaximumSquareDistance * 2 / 5;

    var pointCount = index.SortedPoints.Count;
    // Integer division happens first; the square root is then truncated to an int.
    var windowRadius = (int)Math.Sqrt(pointCount / 2);
    var meter = new DensityMeter(index, neighborhoodDistance, windowRadius);

    Func<HilbertPoint, long> exactNeighbors = point => (long)meter.ExactNeighbors(point);
    Func<HilbertPoint, long> estimatedDensity = point => (long)meter.EstimatedDensity(point, windowRadius);
    var correlator = new KendallTauCorrelation<HilbertPoint, long>(exactNeighbors, estimatedDensity);

    // Only the first 1000 points of the sorted order are fed to the correlator.
    var correlation = correlator.TauB(index.SortedPoints.Take(1000));

    Console.WriteLine($"Correlation between exact and estimated density is: {correlation}");
    Assert.GreaterOrEqual(correlation, 0.90, $"Correlation {correlation} is not high enough");
}
/// <summary>
/// Classify two generated clusters that sit close enough to partially overlap, then grade how
/// well the classifier separated them (the SplitQuality).
/// </summary>
/// <returns>A Tuple with these parts:
///   1) comparison of actual to expected (with its BCubed),
///   2) the expected number of clusters
///   3) the actual number of clusters
///   4) a qualitative assessment of the results.
/// </returns>
/// <param name="numPoints">Number of points.</param>
/// <param name="dimensions">Number of Dimensions.</param>
/// <param name="overlapPercent">Overlap percent, passed to GaussianClustering.TwoClusters.</param>
/// <param name="clusterSizeVariation">Amount subtracted from and added to numPoints / 2 to form the minimum and maximum cluster size.</param>
/// <param name="maxCoordinate">Max value of any coordinate.</param>
/// <param name="acceptablePrecision">Precision at or above which a split is graded as fair or good rather than bad.</param>
/// <param name="useDensityClassifier">If set to <c>true</c> use DensityClassifier, otherwise HilbertClassifier.</param>
private Tuple<ClusterMetric<UnsignedPoint, string>, int, int, SplitQuality> ClassifyTwoClustersHelper(int numPoints, int dimensions, double overlapPercent, int clusterSizeVariation = 0, int maxCoordinate = 1000, double acceptablePrecision = 0.98, bool useDensityClassifier = true)
{
    Logger.SetupForTests();
    var bitsPerDimension = maxCoordinate.SmallestPowerOfTwo();
    var clusterCount = 2;
    var nominalClusterSize = numPoints / clusterCount;
    var outlierSize = 5;
    var radiusShrinkage = 0.6; // 0.7 merges too many that belong apart!

    var generator = new GaussianClustering
    {
        ClusterCount = clusterCount,
        Dimensions = dimensions,
        MaxCoordinate = maxCoordinate,
        MinClusterSize = nominalClusterSize - clusterSizeVariation,
        MaxClusterSize = nominalClusterSize + clusterSizeVariation
    };
    var expectedClusters = generator.TwoClusters(overlapPercent);

    Classification<UnsignedPoint, string> actualClusters;
    if (!useDensityClassifier)
    {
        // NOTE(review): this path passes the literal 10 rather than bitsPerDimension - confirm that is intended.
        var classifier = new HilbertClassifier(expectedClusters.Points(), 10) { OutlierSize = outlierSize };
        //classifier.IndexConfig.NoiseSkipBy = 0;
        classifier.IndexConfig.UseSample = false;
        actualClusters = classifier.Classify();
    }
    else
    {
        var index = new HilbertIndex(expectedClusters, bitsPerDimension);
        var counter = new ClusterCounter { NoiseSkipBy = 10, OutlierSize = outlierSize, ReducedNoiseSkipBy = 1 };
        var tally = counter.Count(index.SortedPoints);
        var unmergeableSize = expectedClusters.NumPoints / 6;
        var densityClassifier = new DensityClassifier(index, tally.MaximumSquareDistance, unmergeableSize)
        {
            MergeableShrinkage = radiusShrinkage
        };
        actualClusters = densityClassifier.Classify();
    }

    var comparison = expectedClusters.Compare(actualClusters);
    var expectedCount = expectedClusters.NumPartitions;
    var actualCount = actualClusters.NumPartitions;

    // Grade the outcome. The order of the checks matters: a perfect BCubed wins outright, then a
    // single resulting cluster counts as unsplit, and only then are the partition counts compared.
    SplitQuality quality;
    if (comparison.BCubed >= 1.0)
    {
        quality = SplitQuality.PerfectSplit;
    }
    else if (actualCount == 1)
    {
        quality = SplitQuality.Unsplit;
    }
    else if (actualCount > expectedCount)
    {
        // More clusters than expected: grade by precision.
        if (comparison.Precision >= 1.0)
        {
            quality = SplitQuality.GoodOverSplit;
        }
        else if (comparison.Precision >= acceptablePrecision)
        {
            quality = SplitQuality.FairOverSplit;
        }
        else if (comparison.Precision < 1.0)
        {
            quality = SplitQuality.BadOverSplit;
        }
        else
        {
            // Reached only when none of the precision comparisons holds (e.g. a NaN precision).
            quality = SplitQuality.BadSplit;
        }
    }
    else if (actualCount == expectedCount && comparison.Precision >= acceptablePrecision)
    {
        quality = SplitQuality.GoodSplit;
    }
    else
    {
        // Assume correct number of clusters.
        quality = SplitQuality.BadSplit;
    }

    Logger.Info($" Quality: {quality} Comparison: {comparison}");
    return new Tuple<ClusterMetric<UnsignedPoint, string>, int, int, SplitQuality>(
        comparison,
        expectedCount,
        actualCount,
        quality
    );
}
/// <summary>
/// UnsignedPoint.SquareDistanceCompare has an optimization; this measures how often that optimization
/// can be exploited on realistic data. Random pairs of points are compared against an estimated
/// characteristic distance (assumed close enough to cause two points to be merged into one cluster)
/// and against a distance five times as long.
/// </summary>
/// <param name="totalComparisons">Number of random point pairs to sample.</param>
/// <param name="useExtendedOptimization">If true, test with IsExtendedDistanceOptimizationUsable;
/// otherwise with IsDistanceOptimizationUsable.</param>
/// <returns>Percentage of sampled pairs for which the optimization was usable at the short (merge) distance.</returns>
private double SquareDistanceCompareOptimizableCase(int totalComparisons, bool useExtendedOptimization = false)
{
    // 1. Make test data.
    var bitsPerDimension = 10;
    var generator = new GaussianClustering
    {
        ClusterCount = 100,
        Dimensions = 100,
        MaxCoordinate = (1 << bitsPerDimension) - 1,
        MinClusterSize = 50,
        MaxClusterSize = 150
    };
    var clusters = generator.MakeClusters();

    // 2. Create HilbertIndex for points.
    var index = new HilbertIndex(clusters, bitsPerDimension);

    // 3. Deduce the characteristic distance.
    var counter = new ClusterCounter { OutlierSize = 5, NoiseSkipBy = 10 };
    var tally = counter.Count(index.SortedPoints);
    var mergeDistance = tally.MaximumSquareDistance;
    var longDistance = 5 * mergeDistance;

    // 4. Select random pairs of points and see how many distance comparisons can exploit the optimization.
    var rng = new FastRandom();
    var points = clusters.Points().ToList();
    var usableAtShortCount = 0;
    var usableAtLongCount = 0;
    for (var trial = 0; trial < totalComparisons; trial++)
    {
        var first = points[rng.Next(points.Count)];
        var second = points[rng.Next(points.Count)];

        // The short distance is always tested before the long one.
        var usableAtShort = useExtendedOptimization
            ? IsExtendedDistanceOptimizationUsable(first, second, mergeDistance, bitsPerDimension)
            : IsDistanceOptimizationUsable(first, second, mergeDistance);
        if (usableAtShort)
        {
            usableAtShortCount++;
        }

        var usableAtLong = useExtendedOptimization
            ? IsExtendedDistanceOptimizationUsable(first, second, longDistance, bitsPerDimension)
            : IsDistanceOptimizationUsable(first, second, longDistance);
        if (usableAtLong)
        {
            usableAtLongCount++;
        }
    }

    var percentOptimizable = 100.0 * usableAtShortCount / totalComparisons;
    var percentOptimizableLongDistance = 100.0 * usableAtLongCount / totalComparisons;
    var message = $"Comparisons were {percentOptimizable} % Optimizable at short distance, {percentOptimizableLongDistance} % at long distance";
    Console.WriteLine(message);
    return percentOptimizable;
}
/// <summary>
/// Checks that SquareDistanceCompare agrees with comparing the measured distance directly, for
/// 10000 random pairs of points drawn from a HilbertIndex with triangulation enabled, at both a
/// short (merge) distance and a distance five times as long. Also reports how often
/// Triangulatable is true at half the merge distance, the merge distance and the long distance.
/// </summary>
/// <param name="numTriangulationPoints">Value passed to HilbertIndex.SetTriangulation.</param>
/// <returns>Percentage of sampled pairs for which Triangulatable was true at the merge distance.</returns>
public double SquareDistanceCompareValidationCase(int numTriangulationPoints)
{
    var correctResult = 0;
    var wrongResult = 0;
    var totalComparisons = 10000;
    var extraShortTriangulatable = 0;
    var shortTriangulatable = 0;
    var longTriangulatable = 0;

    // 1. Make test data.
    var bitsPerDimension = 10;
    var data = new GaussianClustering
    {
        ClusterCount = 100,
        Dimensions = 100,
        MaxCoordinate = (1 << bitsPerDimension) - 1,
        MinClusterSize = 50,
        MaxClusterSize = 150
    };
    var clusters = data.MakeClusters();

    // 2. Create HilbertIndex for points.
    var hIndex = new HilbertIndex(clusters, bitsPerDimension);
    hIndex.SetTriangulation(numTriangulationPoints);

    // 3. Deduce the characteristic distance.
    var counter = new ClusterCounter { OutlierSize = 5, NoiseSkipBy = 10 };
    var count = counter.Count(hIndex.SortedPoints);
    var mergeDistance = count.MaximumSquareDistance;
    var longDistance = 5 * mergeDistance;

    // 4. Select random pairs of the HilbertPoints points and see how many distance comparisons yield the correct result.
    var rng = new FastRandom();
    var points = hIndex.SortedPoints.ToList();
    for (var i = 0; i < totalComparisons; i++)
    {
        var p1 = points[rng.Next(points.Count)];
        var p2 = points[rng.Next(points.Count)];
        var d = p1.Measure(p2);

        // The reference answer is a plain comparison of the measured distance.
        if (d.CompareTo(mergeDistance) == p1.SquareDistanceCompare(p2, mergeDistance))
        {
            correctResult++;
        }
        else
        {
            wrongResult++;
        }

        if (d.CompareTo(longDistance) == p1.SquareDistanceCompare(p2, longDistance))
        {
            correctResult++;
        }
        else
        {
            wrongResult++;
        }

        if (p1.Triangulatable(p2, mergeDistance / 2))
        {
            extraShortTriangulatable++;
        }
        if (p1.Triangulatable(p2, mergeDistance))
        {
            shortTriangulatable++;
        }
        if (p1.Triangulatable(p2, longDistance))
        {
            longTriangulatable++;
        }
    }

    // Every pair is either triangulatable or not, so the "No" tallies follow from the totals.
    var shortNotTriangulatable = totalComparisons - shortTriangulatable;
    var longNotTriangulatable = totalComparisons - longTriangulatable;
    var extraShortPct = 100.0 * extraShortTriangulatable / totalComparisons;
    var shortPct = 100.0 * shortTriangulatable / totalComparisons;
    var longPct = 100.0 * longTriangulatable / totalComparisons;
    Console.WriteLine($"Triangulatable? \n XS: {extraShortPct} % \n Short: {shortPct} % Yes {shortTriangulatable}, No {shortNotTriangulatable}\n Long: {longPct} % Yes {longTriangulatable}, No {longNotTriangulatable}");

    // Expected value first, actual value second, so that a failure reports "expected 0" correctly.
    Assert.AreEqual(0, wrongResult, $"{correctResult} correct, {wrongResult} wrong");
    return shortPct;
}
/// <summary>
/// Stores the classification, fills SortedPoints by handing the classification's points and the
/// index's SortedPoints to a KeySorter keyed on UniqueId for both point types, and then calls
/// ValidateIds.
/// </summary>
/// <param name="clusters">Classification whose points are to be sorted.</param>
/// <param name="index">Index whose SortedPoints are passed to the sorter alongside the classification's points.</param>
public PolyChromaticClosestPoint(Classification<UnsignedPoint, TLabel> clusters, HilbertIndex index)
{
    Clusters = clusters;

    // Both key selectors read UniqueId: one for HilbertPoint, the other for UnsignedPoint.
    var idSorter = new KeySorter<HilbertPoint, UnsignedPoint>(hilbertPoint => hilbertPoint.UniqueId, point => point.UniqueId);

    // NOTE(review): the meaning of the third argument (10) is not visible here - confirm against KeySorter.Sort.
    SortedPoints = idSorter.Sort(Clusters.Points().ToList(), index.SortedPoints, 10).ToList();

    ValidateIds();
}