public void DensityCompared()
        {
            var bitsPerDimension = 10;
            var data             = new GaussianClustering
            {
                ClusterCount   = 50,
                Dimensions     = 100,
                MaxCoordinate  = (1 << bitsPerDimension) - 1,
                MinClusterSize = 100,
                MaxClusterSize = 500
            };
            var expectedClusters = data.MakeClusters();
            var hIndex           = new HilbertIndex(expectedClusters, bitsPerDimension);
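            // ClusterCounter estimates the cluster count and MaximumSquareDistance, the characteristic
            // square distance within which nearly every point finds neighbors from its own cluster.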
            var cc = new ClusterCounter {
                NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1
            };
            var count = cc.Count(hIndex.SortedPoints);
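            // Use 2/5 of the characteristic merge distance as the neighborhood; see the comment
            // in DensityCorrelation below for why it must be smaller than MaximumSquareDistance.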
            var neighborhoodDistance = count.MaximumSquareDistance * 2 / 5;
            var numPoints            = hIndex.SortedPoints.Count;
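            // Heuristic: let the Hilbert-curve window grow as the square root of the point count.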
            var windowRadius         = (int)Math.Sqrt(numPoints / 2);
            var dMeter = new DensityMeter(hIndex, neighborhoodDistance, windowRadius);

            Console.WriteLine($"Window Radius = {windowRadius}. {hIndex.SortedPoints.Count} points");
            Console.WriteLine("Exact,Estimated");
            for (var i = 0; i < numPoints; i++)
            {
                var p        = hIndex.SortedPoints[i];
                var exact    = dMeter.ExactNeighbors(p);
                var estimate = dMeter.EstimatedDensity(p, windowRadius);
                Console.WriteLine($"{exact},{estimate}");
            }
        }
        private Dictionary <string, CorrelationStats> DensityCorrelationCases(int[] varyWindowRadius, int[] varyNumPoints, int dimensions, int clusterCount, int repeats = 1)
        {
            var stats = new Dictionary <string, CorrelationStats>();

            for (var iRepeat = 0; iRepeat < repeats; iRepeat++)
            {
                foreach (var numPoints in varyNumPoints)
                {
                    var bitsPerDimension = 10;
                    var clusterSize      = numPoints / clusterCount;
                    var data             = new GaussianClustering
                    {
                        ClusterCount   = clusterCount,
                        Dimensions     = dimensions,
                        MaxCoordinate  = (1 << bitsPerDimension) - 1,
                        MinClusterSize = clusterSize,
                        MaxClusterSize = clusterSize
                    };
                    var expectedClusters = data.MakeClusters();
                    var hIndex           = new HilbertIndex(expectedClusters, bitsPerDimension);
                    var cc = new ClusterCounter {
                        NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1
                    };
                    var count = cc.Count(hIndex.SortedPoints);
                    var neighborhoodDistance = count.MaximumSquareDistance * 2 / 5;
                    var dMeter = new DensityMeter(hIndex, neighborhoodDistance, varyWindowRadius[0]);

                    // It is more efficient to process windowRadius in descending order,
                    // because the DistanceMemo can reuse more work that way. Once a larger window has been processed,
                    // it includes all shorter windows as well.
                    foreach (var windowRadius in varyWindowRadius.OrderByDescending(r => r))
                    {
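                        // One label per (numPoints, windowRadius, dimensions, clusterCount) case,
                        // so repeats of the same case accumulate into shared CorrelationStats.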
                        var label = MakeLabel(numPoints, windowRadius, dimensions, clusterCount);
                        CorrelationStats corStats;
                        if (!stats.TryGetValue(label, out corStats))
                        {
                            corStats     = new CorrelationStats(label);
                            stats[label] = corStats;
                        }
                        corStats.Add(DensityCorrelationCase(dMeter, windowRadius));
                        Console.WriteLine(corStats);
                    }
                }
            }
            return(stats);
        }
        public void DensityCorrelation()
        {
            var bitsPerDimension = 10;
            var data             = new GaussianClustering
            {
                ClusterCount   = 50,
                Dimensions     = 100,
                MaxCoordinate  = (1 << bitsPerDimension) - 1,
                MinClusterSize = 100,
                MaxClusterSize = 500
            };
            var expectedClusters = data.MakeClusters();
            var hIndex           = new HilbertIndex(expectedClusters, bitsPerDimension);
            var cc = new ClusterCounter {
                NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1
            };
            var count = cc.Count(hIndex.SortedPoints);
            // Choice of neighborhoodDistance is crucial.
            //   - If it is too large, a huge number of neighbors is caught up in the dragnet, and estimating
            //     that count with a window into the Hilbert curve yields poor results. Why? If a point has 200 neighbors
            //     but the window holds only 100 points, many points saturate near a count of 100 and
            //     no meaningful variation in density is found.
            //   - If it is too small, too few neighbors (or none!) are found, and we get no meaningful density.
            //   - We know that almost every point has two neighbors within MaximumSquareDistance, so we should
            //     make neighborhoodDistance smaller than MaximumSquareDistance.
            var neighborhoodDistance = count.MaximumSquareDistance * 2 / 5;
            var numPoints            = hIndex.SortedPoints.Count;

            var windowRadius = (int)Math.Sqrt(numPoints / 2);
            var dMeter       = new DensityMeter(hIndex, neighborhoodDistance, windowRadius);

            Func <HilbertPoint, long> exactMetric     = p => (long)dMeter.ExactNeighbors(p);
            Func <HilbertPoint, long> estimatedMetric = p => (long)dMeter.EstimatedDensity(p, windowRadius);
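            // Kendall tau-b is a rank correlation: it scores how well the estimate preserves the
            // ordering of the exact densities, not whether the magnitudes match.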
            var correlator  = new KendallTauCorrelation <HilbertPoint, long>(exactMetric, estimatedMetric);
            var correlation = correlator.TauB(hIndex.SortedPoints.Take(1000));

            Console.WriteLine($"Correlation between exact and estimated density is: {correlation}");
            Assert.GreaterOrEqual(correlation, 0.90, $"Correlation {correlation} is not high enough");
        }
        /// <summary>
        /// Perform a classification of two clusters that are near enough to each other to partially overlap, which makes them hard to tell apart.
        ///
        /// From this we can deduce which of six cases obtains (the SplitQuality).
        /// </summary>
        /// <returns>A Tuple with these parts:
        ///   1) comparison of actual to expected (with its BCubed),
        ///   2) the expected number of clusters,
        ///   3) the actual number of clusters,
        ///   4) a qualitative assessment of the results.
        /// </returns>
        /// <param name="numPoints">Number of points.</param>
        /// <param name="dimensions">Number of Dimensions.</param>
        /// <param name="overlapPercent">Overlap percent.</param>
        /// <param name="clusterSizeVariation">Cluster size variation.</param>
        /// <param name="maxCoordinate">Max value of any coordinate.</param>
        /// <param name="acceptablePrecision">Acceptable precision</param>
        /// <param name="useDensityClassifier">If set to <c>true</c> use density classifier.</param>
        private Tuple <ClusterMetric <UnsignedPoint, string>, int, int, SplitQuality> ClassifyTwoClustersHelper(int numPoints, int dimensions, double overlapPercent,
                                                                                                                int clusterSizeVariation = 0, int maxCoordinate = 1000, double acceptablePrecision = 0.98, bool useDensityClassifier = true)
        {
            Logger.SetupForTests();
            var bitsPerDimension = maxCoordinate.SmallestPowerOfTwo();
            var clusterCount     = 2;
            var minClusterSize   = (numPoints / clusterCount) - clusterSizeVariation;
            var maxClusterSize   = (numPoints / clusterCount) + clusterSizeVariation;
            var outlierSize      = 5;
            var radiusShrinkage  = 0.6;            // 0.7 merges too many clusters that should remain apart!
            var data             = new GaussianClustering
            {
                ClusterCount   = clusterCount,
                Dimensions     = dimensions,
                MaxCoordinate  = maxCoordinate,
                MinClusterSize = minClusterSize,
                MaxClusterSize = maxClusterSize
            };
            var expectedClusters = data.TwoClusters(overlapPercent);

            Classification <UnsignedPoint, string> actualClusters;

            if (useDensityClassifier)
            {
                var hIndex = new HilbertIndex(expectedClusters, bitsPerDimension);
                var cc     = new ClusterCounter {
                    NoiseSkipBy = 10, OutlierSize = outlierSize, ReducedNoiseSkipBy = 1
                };
                var count = cc.Count(hIndex.SortedPoints);

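                // Per the parameter name, a cluster at least this large (a sixth of all points) is never merged away.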
                var unmergeableSize   = expectedClusters.NumPoints / 6;
                var densityClassifier = new DensityClassifier(hIndex, count.MaximumSquareDistance, unmergeableSize)
                {
                    MergeableShrinkage = radiusShrinkage
                };

                actualClusters = densityClassifier.Classify();
            }
            else
            {
                var classifier = new HilbertClassifier(expectedClusters.Points(), 10)
                {
                    OutlierSize = outlierSize
                };
                //classifier.IndexConfig.NoiseSkipBy = 0;
                classifier.IndexConfig.UseSample = false;
                actualClusters = classifier.Classify();
            }

            var          comparison        = expectedClusters.Compare(actualClusters);
            SplitQuality qualitativeResult = SplitQuality.Unsplit;

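            // Map the comparison onto the six qualitative outcomes:
            //   PerfectSplit  - BCubed is perfect: every point classified correctly.
            //   Unsplit       - everything landed in a single cluster.
            //   GoodOverSplit - too many clusters, but precision is still perfect.
            //   FairOverSplit - too many clusters, precision merely acceptable.
            //   GoodSplit     - right number of clusters with acceptable precision.
            //   BadOverSplit / BadSplit - over-split or split with poor precision.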
            if (comparison.BCubed >= 1.0)
            {
                qualitativeResult = SplitQuality.PerfectSplit;
            }
            else if (actualClusters.NumPartitions == 1)
            {
                qualitativeResult = SplitQuality.Unsplit;
            }
            else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision >= 1.0)
            {
                qualitativeResult = SplitQuality.GoodOverSplit;
            }
            else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision >= acceptablePrecision)
            {
                qualitativeResult = SplitQuality.FairOverSplit;
            }
            else if (actualClusters.NumPartitions == expectedClusters.NumPartitions && comparison.Precision >= acceptablePrecision)
            {
                qualitativeResult = SplitQuality.GoodSplit;
            }
            else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision < 1.0)
            {
                qualitativeResult = SplitQuality.BadOverSplit;
            }
            else             // Assume correct number of clusters.
            {
                qualitativeResult = SplitQuality.BadSplit;
            }

            Logger.Info($"  Quality: {qualitativeResult}  Comparison: {comparison}");

            return(new Tuple <ClusterMetric <UnsignedPoint, string>, int, int, SplitQuality>(
                       comparison,
                       expectedClusters.NumPartitions,
                       actualClusters.NumPartitions,
                       qualitativeResult
                       ));
        }
        /// <summary>
        /// UnsignedPoint.SquareDistanceCompare has an optimization. This measures how often that optimization
        /// can be exploited in a realistic scenario. The comparison is made against an estimated characteristic distance
        /// between points, assumed close enough that two points within it would be merged into a single cluster.
        /// </summary>
        private double SquareDistanceCompareOptimizableCase(int totalComparisons, bool useExtendedOptimization = false)
        {
            // 1. Make test data.
            var bitsPerDimension = 10;
            var data             = new GaussianClustering
            {
                ClusterCount   = 100,
                Dimensions     = 100,
                MaxCoordinate  = (1 << bitsPerDimension) - 1,
                MinClusterSize = 50,
                MaxClusterSize = 150
            };
            var clusters = data.MakeClusters();

            // 2. Create HilbertIndex for points.
            var hIndex = new HilbertIndex(clusters, bitsPerDimension);

            // 3. Deduce the characteristic distance.
            var counter = new ClusterCounter
            {
                OutlierSize = 5,
                NoiseSkipBy = 10
            };
            var count         = counter.Count(hIndex.SortedPoints);
            var mergeDistance = count.MaximumSquareDistance;
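            // Also probe at five times the merge distance to see how the optimization fares
            // when points are clearly far apart.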
            var longDistance  = 5 * mergeDistance;

            // 4. Select random pairs of points and see how many distance comparisons can exploit the optimization.
            var rng    = new FastRandom();
            var points = clusters.Points().ToList();
            var ableToUseOptimizationsAtShortDistance = 0;
            var ableToUseOptimizationsAtLongDistance  = 0;

            for (var i = 0; i < totalComparisons; i++)
            {
                var p1 = points[rng.Next(points.Count)];
                var p2 = points[rng.Next(points.Count)];
                if (useExtendedOptimization)
                {
                    if (IsExtendedDistanceOptimizationUsable(p1, p2, mergeDistance, bitsPerDimension))
                    {
                        ableToUseOptimizationsAtShortDistance++;
                    }
                    if (IsExtendedDistanceOptimizationUsable(p1, p2, longDistance, bitsPerDimension))
                    {
                        ableToUseOptimizationsAtLongDistance++;
                    }
                }
                else
                {
                    if (IsDistanceOptimizationUsable(p1, p2, mergeDistance))
                    {
                        ableToUseOptimizationsAtShortDistance++;
                    }
                    if (IsDistanceOptimizationUsable(p1, p2, longDistance))
                    {
                        ableToUseOptimizationsAtLongDistance++;
                    }
                }
            }
            var percentOptimizable             = 100.0 * ableToUseOptimizationsAtShortDistance / totalComparisons;
            var percentOptimizableLongDistance = 100.0 * ableToUseOptimizationsAtLongDistance / totalComparisons;
            var message = $"Comparisons were {percentOptimizable}% optimizable at short distance, {percentOptimizableLongDistance}% at long distance";

            Console.WriteLine(message);
            return(percentOptimizable);
        }
        public double SquareDistanceCompareValidationCase(int numTriangulationPoints)
        {
            var correctResult               = 0;
            var wrongResult                 = 0;
            var totalComparisons            = 10000;
            var extraShortTriangulatable    = 0;
            var extraShortNotTriangulatable = 0;
            var shortTriangulatable         = 0;
            var shortNotTriangulatable      = 0;
            var longTriangulatable          = 0;
            var longNotTriangulatable       = 0;

            // 1. Make test data.
            var bitsPerDimension = 10;
            var data             = new GaussianClustering
            {
                ClusterCount   = 100,
                Dimensions     = 100,
                MaxCoordinate  = (1 << bitsPerDimension) - 1,
                MinClusterSize = 50,
                MaxClusterSize = 150
            };
            var clusters = data.MakeClusters();

            // 2. Create HilbertIndex for points.
            var hIndex = new HilbertIndex(clusters, bitsPerDimension);

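            // Select the reference points used for triangulation (consulted by Triangulatable below).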
            hIndex.SetTriangulation(numTriangulationPoints);

            // 3. Deduce the characteristic distance.
            var counter = new ClusterCounter
            {
                OutlierSize = 5,
                NoiseSkipBy = 10
            };
            var count         = counter.Count(hIndex.SortedPoints);
            var mergeDistance = count.MaximumSquareDistance;
            var longDistance  = 5 * mergeDistance;

            // 4. Select random pairs of HilbertPoints and see how many distance comparisons yield the correct result.
            var rng    = new FastRandom();
            var points = hIndex.SortedPoints.ToList();

            for (var i = 0; i < totalComparisons; i++)
            {
                var p1 = points[rng.Next(points.Count)];
                var p2 = points[rng.Next(points.Count)];
                var d  = p1.Measure(p2);
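                // The optimized comparison must agree with the full-precision comparison at both thresholds.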
                if (d.CompareTo(mergeDistance) == p1.SquareDistanceCompare(p2, mergeDistance))
                {
                    correctResult++;
                }
                else
                {
                    wrongResult++;
                }

                if (d.CompareTo(longDistance) == p1.SquareDistanceCompare(p2, longDistance))
                {
                    correctResult++;
                }
                else
                {
                    wrongResult++;
                }

                if (p1.Triangulatable(p2, mergeDistance / 2))
                {
                    extraShortTriangulatable++;
                }
                else
                {
                    extraShortNotTriangulatable++;
                }

                if (p1.Triangulatable(p2, mergeDistance))
                {
                    shortTriangulatable++;
                }
                else
                {
                    shortNotTriangulatable++;
                }

                if (p1.Triangulatable(p2, longDistance))
                {
                    longTriangulatable++;
                }
                else
                {
                    longNotTriangulatable++;
                }
            }
            var extraShortPct = 100.0 * extraShortTriangulatable / (extraShortTriangulatable + extraShortNotTriangulatable);
            var shortPct      = 100.0 * shortTriangulatable / (shortTriangulatable + shortNotTriangulatable);
            var longPct       = 100.0 * longTriangulatable / (longTriangulatable + longNotTriangulatable);

            Console.WriteLine($"Triangulatable? \n    XS: {extraShortPct}% \n    Short: {shortPct}% Yes {shortTriangulatable}, No {shortNotTriangulatable}\n    Long: {longPct}% Yes {longTriangulatable}, No {longNotTriangulatable}");
            Assert.AreEqual(0, wrongResult, $"{correctResult} correct, {wrongResult} wrong");

            return(shortPct);
        }