/// <summary>
/// Print, as CSV, the exact versus the window-estimated neighborhood density for every
/// point of a synthetic Gaussian clustering, so the two measures can be compared offline.
/// </summary>
public void DensityCompared()
{
    var bitsPerDimension = 10;
    var data = new GaussianClustering
    {
        ClusterCount = 50,
        Dimensions = 100,
        MaxCoordinate = (1 << bitsPerDimension) - 1,
        MinClusterSize = 100,
        MaxClusterSize = 500
    };
    var expectedClusters = data.MakeClusters();
    var hIndex = new HilbertIndex(expectedClusters, bitsPerDimension);
    var cc = new ClusterCounter { NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1 };
    var count = cc.Count(hIndex.SortedPoints);
    // Use a fraction (2/5) of the largest merge distance as the neighborhood radius.
    var neighborhoodDistance = count.MaximumSquareDistance * 2 / 5;
    var numPoints = hIndex.SortedPoints.Count;
    var windowRadius = (int)Math.Sqrt(numPoints / 2);
    var dMeter = new DensityMeter(hIndex, neighborhoodDistance, windowRadius);
    Console.WriteLine($"Window Radius = {windowRadius}. {hIndex.SortedPoints.Count} points");
    // FIX: was Console.Write, which ran the CSV header and every data row together
    // on a single output line; WriteLine keeps each record on its own line.
    Console.WriteLine("Exact,Estimated");
    for (var i = 0; i < numPoints; i++)
    {
        var p = hIndex.SortedPoints[i];
        var exact = dMeter.ExactNeighbors(p);
        var estimate = dMeter.EstimatedDensity(p, windowRadius);
        Console.WriteLine($"{exact},{estimate}");
    }
}
/// <summary>
/// Accumulate correlation statistics between exact and estimated density across a grid of
/// test cases: every combination of window radius and point count, optionally repeated.
/// </summary>
/// <param name="varyWindowRadius">Window radii to try.</param>
/// <param name="varyNumPoints">Point counts to try.</param>
/// <param name="dimensions">Number of dimensions per point.</param>
/// <param name="clusterCount">Number of clusters to generate.</param>
/// <param name="repeats">How many times to repeat the whole grid.</param>
/// <returns>Correlation statistics keyed by a label describing each case.</returns>
private Dictionary<string, CorrelationStats> DensityCorrelationCases(int[] varyWindowRadius, int[] varyNumPoints, int dimensions, int clusterCount, int repeats = 1)
{
    var stats = new Dictionary<string, CorrelationStats>();
    for (var repeat = 0; repeat < repeats; repeat++)
    {
        foreach (var numPoints in varyNumPoints)
        {
            var bitsPerDimension = 10;
            var clusterSize = numPoints / clusterCount;
            var data = new GaussianClustering
            {
                ClusterCount = clusterCount,
                Dimensions = dimensions,
                MaxCoordinate = (1 << bitsPerDimension) - 1,
                MinClusterSize = clusterSize,
                MaxClusterSize = clusterSize
            };
            var expectedClusters = data.MakeClusters();
            var hIndex = new HilbertIndex(expectedClusters, bitsPerDimension);
            var cc = new ClusterCounter { NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1 };
            var count = cc.Count(hIndex.SortedPoints);
            var neighborhoodDistance = count.MaximumSquareDistance * 2 / 5;
            var dMeter = new DensityMeter(hIndex, neighborhoodDistance, varyWindowRadius[0]);
            // Descending radius order lets the DistanceMemo reuse the most work:
            // once the largest window has been processed, every smaller window is a subset of it.
            foreach (var windowRadius in varyWindowRadius.OrderByDescending(r => r))
            {
                var label = MakeLabel(numPoints, windowRadius, dimensions, clusterCount);
                CorrelationStats corStats;
                if (!stats.TryGetValue(label, out corStats))
                {
                    corStats = new CorrelationStats(label);
                    stats[label] = corStats;
                }
                corStats.Add(DensityCorrelationCase(dMeter, windowRadius));
                Console.Write(corStats);
            }
        }
    }
    return stats;
}
/// <summary>
/// Verify that density estimated from a window over the Hilbert curve correlates strongly
/// (Kendall Tau-B >= 0.90) with the exact neighbor-count density.
/// </summary>
public void DensityCorrelation()
{
    var bitsPerDimension = 10;
    var data = new GaussianClustering
    {
        ClusterCount = 50,
        Dimensions = 100,
        MaxCoordinate = (1 << bitsPerDimension) - 1,
        MinClusterSize = 100,
        MaxClusterSize = 500
    };
    var expectedClusters = data.MakeClusters();
    var hIndex = new HilbertIndex(expectedClusters, bitsPerDimension);
    var counter = new ClusterCounter { NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1 };
    var count = counter.Count(hIndex.SortedPoints);
    // Choice of neighborhoodDistance is crucial.
    //  - Too large: a huge number of neighbors is caught in the dragnet and a Hilbert-curve
    //    window estimate performs poorly (counts saturate near the window size, so no
    //    meaningful variation in density is found).
    //  - Too small: too few neighbors (or none!) are found, yielding no meaningful density.
    //  - Almost every point has two neighbors within MaximumSquareDistance, so choose
    //    something smaller than that.
    var neighborhoodDistance = count.MaximumSquareDistance * 2 / 5;
    var numPoints = hIndex.SortedPoints.Count;
    var windowRadius = (int)Math.Sqrt(numPoints / 2);
    var meter = new DensityMeter(hIndex, neighborhoodDistance, windowRadius);
    Func<HilbertPoint, long> exactMetric = p => (long)meter.ExactNeighbors(p);
    Func<HilbertPoint, long> estimatedMetric = p => (long)meter.EstimatedDensity(p, windowRadius);
    var correlator = new KendallTauCorrelation<HilbertPoint, long>(exactMetric, estimatedMetric);
    var correlation = correlator.TauB(hIndex.SortedPoints.Take(1000));
    Console.WriteLine($"Correlation between exact and estimated density is: {correlation}");
    Assert.GreaterOrEqual(correlation, 0.90, $"Correlation {correlation} is not high enough");
}
/// <summary>
/// Perform a classification of two clusters that are near enough to each other to partially
/// overlap, causing problems.
///
/// From this we can deduce which of six cases obtain (the SplitQuality).
/// </summary>
/// <returns>A Tuple with these parts:
///   1) comparison of actual to expected (with its BCubed),
///   2) the expected number of clusters,
///   3) the actual number of clusters,
///   4) a qualitative assessment of the results.
/// </returns>
/// <param name="numPoints">Number of points.</param>
/// <param name="dimensions">Number of Dimensions.</param>
/// <param name="overlapPercent">Overlap percent.</param>
/// <param name="clusterSizeVariation">Cluster size variation.</param>
/// <param name="maxCoordinate">Max value of any coordinate.</param>
/// <param name="acceptablePrecision">Acceptable precision.</param>
/// <param name="useDensityClassifier">If set to <c>true</c> use density classifier.</param>
private Tuple<ClusterMetric<UnsignedPoint, string>, int, int, SplitQuality> ClassifyTwoClustersHelper(int numPoints, int dimensions, double overlapPercent, int clusterSizeVariation = 0, int maxCoordinate = 1000, double acceptablePrecision = 0.98, bool useDensityClassifier = true)
{
    Logger.SetupForTests();
    var bitsPerDimension = maxCoordinate.SmallestPowerOfTwo();
    var clusterCount = 2;
    var minClusterSize = (numPoints / clusterCount) - clusterSizeVariation;
    var maxClusterSize = (numPoints / clusterCount) + clusterSizeVariation;
    var outlierSize = 5;
    var radiusShrinkage = 0.6; // 0.7 merges too many that belong apart!
    var data = new GaussianClustering
    {
        ClusterCount = clusterCount,
        Dimensions = dimensions,
        MaxCoordinate = maxCoordinate,
        MinClusterSize = minClusterSize,
        MaxClusterSize = maxClusterSize
    };
    var expectedClusters = data.TwoClusters(overlapPercent);

    Classification<UnsignedPoint, string> actualClusters;
    if (useDensityClassifier)
    {
        var hIndex = new HilbertIndex(expectedClusters, bitsPerDimension);
        var cc = new ClusterCounter { NoiseSkipBy = 10, OutlierSize = outlierSize, ReducedNoiseSkipBy = 1 };
        var count = cc.Count(hIndex.SortedPoints);
        var unmergeableSize = expectedClusters.NumPoints / 6;
        var densityClassifier = new DensityClassifier(hIndex, count.MaximumSquareDistance, unmergeableSize)
        {
            MergeableShrinkage = radiusShrinkage
        };
        actualClusters = densityClassifier.Classify();
    }
    else
    {
        var classifier = new HilbertClassifier(expectedClusters.Points(), 10) { OutlierSize = outlierSize };
        //classifier.IndexConfig.NoiseSkipBy = 0;
        classifier.IndexConfig.UseSample = false;
        actualClusters = classifier.Classify();
    }
    var comparison = expectedClusters.Compare(actualClusters);

    // NOTE: branch order matters. FairOverSplit must be tested before BadOverSplit because
    // its precision interval [acceptablePrecision, 1.0) is a subset of precision < 1.0.
    SplitQuality qualitativeResult;
    if (comparison.BCubed >= 1.0)
        qualitativeResult = SplitQuality.PerfectSplit;
    else if (actualClusters.NumPartitions == 1)
        qualitativeResult = SplitQuality.Unsplit;
    else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision >= 1.0)
        qualitativeResult = SplitQuality.GoodOverSplit;
    else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision >= acceptablePrecision)
        qualitativeResult = SplitQuality.FairOverSplit;
    else if (actualClusters.NumPartitions == expectedClusters.NumPartitions && comparison.Precision >= acceptablePrecision)
        qualitativeResult = SplitQuality.GoodSplit;
    else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision < 1.0)
        qualitativeResult = SplitQuality.BadOverSplit;
    else
        qualitativeResult = SplitQuality.BadSplit; // Assume correct number of clusters.

    Logger.Info($"  Quality: {qualitativeResult}  Comparison: {comparison}");
    return new Tuple<ClusterMetric<UnsignedPoint, string>, int, int, SplitQuality>(
        comparison,
        expectedClusters.NumPartitions,
        actualClusters.NumPartitions,
        qualitativeResult
    );
}
/// <summary>
/// UnsignedPoint.SquareDistanceCompare has an optimization. This tests how often this optimization
/// can be exploited in a realistic test. The comparison will be against an estimated characteristic
/// distance between points. This distance is assumed to be close enough to trigger two points to be
/// merged into a single cluster.
/// </summary>
/// <param name="totalComparisons">How many random point pairs to sample.</param>
/// <param name="useExtendedOptimization">If true, test the extended optimization variant.</param>
/// <returns>Percentage of comparisons optimizable at the short (merge) distance.</returns>
private double SquareDistanceCompareOptimizableCase(int totalComparisons, bool useExtendedOptimization = false)
{
    // 1. Make test data.
    var bitsPerDimension = 10;
    var data = new GaussianClustering
    {
        ClusterCount = 100,
        Dimensions = 100,
        MaxCoordinate = (1 << bitsPerDimension) - 1,
        MinClusterSize = 50,
        MaxClusterSize = 150
    };
    var clusters = data.MakeClusters();

    // 2. Create HilbertIndex for points.
    var hIndex = new HilbertIndex(clusters, bitsPerDimension);

    // 3. Deduce the characteristic distance.
    var counter = new ClusterCounter { OutlierSize = 5, NoiseSkipBy = 10 };
    var count = counter.Count(hIndex.SortedPoints);
    var mergeDistance = count.MaximumSquareDistance;
    var longDistance = 5 * mergeDistance;

    // 4. Select random pairs of points and see how many distance comparisons can exploit the optimization.
    var rng = new FastRandom();
    var points = clusters.Points().ToList();
    var shortHits = 0;
    var longHits = 0;
    for (var trial = 0; trial < totalComparisons; trial++)
    {
        var a = points[rng.Next(points.Count)];
        var b = points[rng.Next(points.Count)];
        bool shortUsable, longUsable;
        if (useExtendedOptimization)
        {
            shortUsable = IsExtendedDistanceOptimizationUsable(a, b, mergeDistance, bitsPerDimension);
            longUsable = IsExtendedDistanceOptimizationUsable(a, b, longDistance, bitsPerDimension);
        }
        else
        {
            shortUsable = IsDistanceOptimizationUsable(a, b, mergeDistance);
            longUsable = IsDistanceOptimizationUsable(a, b, longDistance);
        }
        if (shortUsable)
            shortHits++;
        if (longUsable)
            longHits++;
    }
    var percentOptimizable = 100.0 * shortHits / totalComparisons;
    var percentOptimizableLongDistance = 100.0 * longHits / totalComparisons;
    var message = $"Comparisons were {percentOptimizable} % Optimizable at short distance, {percentOptimizableLongDistance} % at long distance";
    Console.WriteLine(message);
    return percentOptimizable;
}
/// <summary>
/// Validate that UnsignedPoint.SquareDistanceCompare agrees with an exact distance measurement
/// for random point pairs, and report how often pairs are triangulatable at three characteristic
/// distances (half the merge distance, the merge distance, and five times the merge distance).
/// </summary>
/// <param name="numTriangulationPoints">Number of reference points used for triangulation.</param>
/// <returns>Percentage of pairs triangulatable at the short (merge) distance.</returns>
public double SquareDistanceCompareValidationCase(int numTriangulationPoints)
{
    var correctResult = 0;
    var wrongResult = 0;
    var totalComparisons = 10000;
    // FIX: corrected the "Trianagulatable" misspelling in these local counter names.
    var extraShortTriangulatable = 0;
    var extraShortNotTriangulatable = 0;
    var shortTriangulatable = 0;
    var shortNotTriangulatable = 0;
    var longTriangulatable = 0;
    var longNotTriangulatable = 0;

    // 1. Make test data.
    var bitsPerDimension = 10;
    var data = new GaussianClustering
    {
        ClusterCount = 100,
        Dimensions = 100,
        MaxCoordinate = (1 << bitsPerDimension) - 1,
        MinClusterSize = 50,
        MaxClusterSize = 150
    };
    var clusters = data.MakeClusters();

    // 2. Create HilbertIndex for points.
    var hIndex = new HilbertIndex(clusters, bitsPerDimension);
    hIndex.SetTriangulation(numTriangulationPoints);

    // 3. Deduce the characteristic distance.
    var counter = new ClusterCounter { OutlierSize = 5, NoiseSkipBy = 10 };
    var count = counter.Count(hIndex.SortedPoints);
    var mergeDistance = count.MaximumSquareDistance;
    var longDistance = 5 * mergeDistance;

    // 4. Select random pairs of the HilbertPoints points and see how many distance comparisons yield the correct result.
    var rng = new FastRandom();
    var points = hIndex.SortedPoints.ToList();
    for (var i = 0; i < totalComparisons; i++)
    {
        var p1 = points[rng.Next(points.Count)];
        var p2 = points[rng.Next(points.Count)];
        var d = p1.Measure(p2);
        if (d.CompareTo(mergeDistance) == p1.SquareDistanceCompare(p2, mergeDistance))
        {
            correctResult++;
        }
        else
        {
            wrongResult++;
        }
        if (d.CompareTo(longDistance) == p1.SquareDistanceCompare(p2, longDistance))
        {
            correctResult++;
        }
        else
        {
            wrongResult++;
        }
        if (p1.Triangulatable(p2, mergeDistance / 2))
        {
            extraShortTriangulatable++;
        }
        else
        {
            extraShortNotTriangulatable++;
        }
        if (p1.Triangulatable(p2, mergeDistance))
        {
            shortTriangulatable++;
        }
        else
        {
            shortNotTriangulatable++;
        }
        if (p1.Triangulatable(p2, longDistance))
        {
            longTriangulatable++;
        }
        else
        {
            longNotTriangulatable++;
        }
    }
    var extraShortPct = 100.0 * extraShortTriangulatable / (extraShortTriangulatable + extraShortNotTriangulatable);
    var shortPct = 100.0 * shortTriangulatable / (shortTriangulatable + shortNotTriangulatable);
    var longPct = 100.0 * longTriangulatable / (longTriangulatable + longNotTriangulatable);
    Console.WriteLine($"Triangulatable? \n  XS: {extraShortPct} % \n  Short: {shortPct} % Yes {shortTriangulatable}, No {shortNotTriangulatable}\n  Long: {longPct} % Yes {longTriangulatable}, No {longNotTriangulatable}");
    // FIX: NUnit's Assert.AreEqual signature is (expected, actual); the arguments were reversed,
    // which makes a failure report "Expected: N, But was: 0" instead of the intended direction.
    Assert.AreEqual(0, wrongResult, $"{correctResult} correct, {wrongResult} wrong");
    return shortPct;
}