        public void CompareSpeedOfSorting_Unbalanced_vs_HilbertIndex()
        {
            var points = TestData(20000, 50, 20, 1000000, 100, 500, out int bitsPerDimension);

            var timer1 = new Stopwatch();
            var timer2 = new Stopwatch();

            // 1. HilbertIndex
            timer1.Start();
            var hIndex = new HilbertIndex(points.Select(p => new HilbertPoint(p.Coordinates, bitsPerDimension)));
            var sortedPointsFromIndex = hIndex.SortedPoints;

            timer1.Stop();
            var hilbertIndexTime = timer1.ElapsedMilliseconds;

            // 2. HilbertSort.Sort
            timer2.Start();

            HilbertSort.Sort(points.ToList(), bitsPerDimension);
            timer2.Stop();
            var unbalancedSortTime = timer2.ElapsedMilliseconds;

            var message = $"HilbertIndex required {hilbertIndexTime / 1000.0} sec.  Unbalanced Sort required {unbalancedSortTime / 1000.0} sec.";

            Console.WriteLine(message);
            Assert.Greater(hilbertIndexTime, unbalancedSortTime, message);
        }
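        // TestData is a fixture helper not shown in this listing. A minimal sketch of the
        // shape it likely has, judging from its call sites in the two sorting tests; the
        // parameter meanings are inferred and not authoritative:
        private List<UnsignedPoint> TestData(int numPoints, int clusterCount, int dimensions,
                                             int maxCoordinate, int minClusterSize, int maxClusterSize,
                                             out int bitsPerDimension)
        {
            bitsPerDimension = maxCoordinate.SmallestPowerOfTwo();
            var data = new GaussianClustering
            {
                ClusterCount   = clusterCount,
                Dimensions     = dimensions,
                MaxCoordinate  = maxCoordinate,
                MinClusterSize = minClusterSize,
                MaxClusterSize = maxClusterSize
            };
            // numPoints is implied here by clusterCount and the cluster size range.
            return data.MakeClusters().Points().ToList();
        }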
        public void DensityCompared()
        {
            var bitsPerDimension = 10;
            var data             = new GaussianClustering
            {
                ClusterCount   = 50,
                Dimensions     = 100,
                MaxCoordinate  = (1 << bitsPerDimension) - 1,
                MinClusterSize = 100,
                MaxClusterSize = 500
            };
            var expectedClusters = data.MakeClusters();
            var hIndex           = new HilbertIndex(expectedClusters, bitsPerDimension);
            var cc = new ClusterCounter {
                NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1
            };
            var count = cc.Count(hIndex.SortedPoints);
            var neighborhoodDistance = count.MaximumSquareDistance * 2 / 5;
            var numPoints            = hIndex.SortedPoints.Count;
            var windowRadius         = (int)Math.Sqrt(numPoints / 2);
            var dMeter = new DensityMeter(hIndex, neighborhoodDistance, windowRadius);

            Console.WriteLine($"Window Radius = {windowRadius}. {hIndex.SortedPoints.Count} points");
            Console.Write("Exact,Estimated");
            for (var i = 0; i < numPoints; i++)
            {
                var p        = hIndex.SortedPoints[i];
                var exact    = dMeter.ExactNeighbors(p);
                var estimate = dMeter.EstimatedDensity(p, windowRadius);
                Console.Write($"{exact},{estimate}");
            }
        }
        /// <summary>
        /// For the same test data, create a HilbertIndex repeatedly and average the execution time across all repetitions.
        ///
        /// The goal is to identify how the time depends on number of points N, number of dimensions D, and bits per coordinate B.
        /// (It should be insensitive to cluster count K.)
        /// </summary>
        /// <param name="N">Number of points to index.</param>
        /// <param name="K">Number of clusters of points to create.</param>
        /// <param name="D">Number dimensions.</param>
        /// <param name="B">Number bits.</param>
        /// <param name="repeats">Number of times to repeat.</param>
        /// <returns>Average number of seconds to create the index, averaged over several tries.
        /// The time excludes the time to create the test data.
        /// </returns>
        private double SingleIndexCreationPerformanceCase(int N, int K, int D, int B, int repeats)
        {
            var data = new GaussianClustering
            {
                ClusterCount   = K,
                Dimensions     = D,
                MaxCoordinate  = (1 << B) - 1,
                MinClusterSize = N / K,
                MaxClusterSize = N / K
            };
            var clusters = data.MakeClusters();
            var timer    = new Stopwatch();
            var totalTimeMilliseconds = 0L;

            for (var i = 0; i < repeats; i++)
            {
                timer.Restart();  // Reset + Start in one call
                var hIndex = new HilbertIndex(clusters, B);
                timer.Stop();
                // Verify outside the timed region so the assertion does not skew the measurement.
                Assert.AreEqual(N, hIndex.Count, "Index has wrong number of points");
                totalTimeMilliseconds += timer.ElapsedMilliseconds;
            }
            return((double)totalTimeMilliseconds / (1000.0 * repeats));
        }
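        // A hypothetical driver (not part of the original fixture) showing how the helper
        // above might be used to probe how index creation time scales with N; the argument
        // values are illustrative only.
        public void IndexCreationTimeVersusN()
        {
            foreach (var n in new[] { 10000, 20000, 40000 })
            {
                var seconds = SingleIndexCreationPerformanceCase(n, K: 50, D: 100, B: 10, repeats: 3);
                Console.WriteLine($"N = {n}: {seconds} sec per index");
            }
        }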
        public void CompareSpeedOfSorting_Balanced_vs_HilbertIndex()
        {
            var points = TestData(20000, 50, 20, 1000000, 100, 500, out int bitsPerDimension);

            var timer1 = new Stopwatch();
            var timer2 = new Stopwatch();
            var timer3 = new Stopwatch();

            // 1. HilbertIndex
            timer1.Start();
            var hIndex = new HilbertIndex(points.Select(p => new HilbertPoint(p.Coordinates, bitsPerDimension)));
            var sortedPointsFromIndex = hIndex.SortedPoints;

            timer1.Stop();
            var hilbertIndexTime = timer1.ElapsedMilliseconds;

            // 2. HilbertSort.BalancedSort
            timer2.Start();
            timer3.Start();
            PointBalancer balancer = new PointBalancer(points);

            timer3.Stop();
            HilbertSort.BalancedSort(points.ToList(), ref balancer);
            timer2.Stop();
            var balancedSortTime = timer2.ElapsedMilliseconds;
            var balancerTime     = timer3.ElapsedMilliseconds;

            var message = $"HilbertIndex required {hilbertIndexTime / 1000.0} sec.  Balanced Sort required {balancedSortTime / 1000.0} sec, of which {balancerTime / 1000.0} sec is Balancer ctor.  Relative Cost = {HilbertSort.RelativeSortCost}";

            Console.WriteLine(message);
            Assert.Greater(hilbertIndexTime, balancedSortTime, message);
        }
        /// <summary>
        /// Compare the HilbertIndex values of the two points, but use the UniqueId as a tie-breaker.
        ///
        /// This permits sorting by HilbertIndex.
        /// </summary>
        /// <param name="other">Second point in comparison.</param>
        /// <returns>-1 if this has a lower index, 0 if they match, and +1 if this has a higher index.</returns>
        public int CompareTo(HilbertPoint other)
        {
            var cmp = HilbertIndex.CompareTo(other.HilbertIndex);

            if (cmp == 0)
            {
                cmp = UniqueId.CompareTo(other.UniqueId);
            }
            return(cmp);
        }
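        // Because HilbertPoint exposes this CompareTo (and presumably declares
        // IComparable<HilbertPoint>), an ordinary Sort arranges points along the Hilbert
        // curve with UniqueId as a deterministic tie-breaker. A minimal usage sketch:
        private static void SortAlongCurve(List<HilbertPoint> points)
        {
            points.Sort();   // delegates to CompareTo: HilbertIndex first, then UniqueId
        }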
        private Dictionary <string, CorrelationStats> DensityCorrelationCases(int[] varyWindowRadius, int[] varyNumPoints, int dimensions, int clusterCount, int repeats = 1)
        {
            var stats = new Dictionary <string, CorrelationStats>();

            for (var iRepeat = 0; iRepeat < repeats; iRepeat++)
            {
                foreach (var numPoints in varyNumPoints)
                {
                    var bitsPerDimension = 10;
                    var clusterSize      = numPoints / clusterCount;
                    var data             = new GaussianClustering
                    {
                        ClusterCount   = clusterCount,
                        Dimensions     = dimensions,
                        MaxCoordinate  = (1 << bitsPerDimension) - 1,
                        MinClusterSize = clusterSize,
                        MaxClusterSize = clusterSize
                    };
                    var expectedClusters = data.MakeClusters();
                    var hIndex           = new HilbertIndex(expectedClusters, bitsPerDimension);
                    var cc = new ClusterCounter {
                        NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1
                    };
                    var count = cc.Count(hIndex.SortedPoints);
                    var neighborhoodDistance = count.MaximumSquareDistance * 2 / 5;
                    var dMeter = new DensityMeter(hIndex, neighborhoodDistance, varyWindowRadius[0]);

                    // It is more efficient to process windowRadius in descending order,
                    // because the DistanceMemo can reuse more work that way. Once a larger window has been processed,
                    // it includes all shorter windows as well.
                    foreach (var windowRadius in varyWindowRadius.OrderByDescending(r => r))
                    {
                        var label = MakeLabel(numPoints, windowRadius, dimensions, clusterCount);
                        CorrelationStats corStats;
                        if (!stats.TryGetValue(label, out corStats))
                        {
                            corStats     = new CorrelationStats(label);
                            stats[label] = corStats;
                        }
                        corStats.Add(DensityCorrelationCase(dMeter, windowRadius));
                        Console.WriteLine(corStats);
                    }
                }
            }
            return(stats);
        }
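        // A hypothetical invocation (not from the original source) showing the intended call
        // shape: several window radii and point counts, aggregated over multiple repeats.
        public void DensityCorrelationSweep()
        {
            var stats = DensityCorrelationCases(
                varyWindowRadius: new[] { 200, 100, 50 },
                varyNumPoints: new[] { 10000, 20000 },
                dimensions: 100,
                clusterCount: 50,
                repeats: 3);
            foreach (var pair in stats)
            {
                Console.WriteLine($"{pair.Key}: {pair.Value}");
            }
        }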
        public void DensityCorrelation()
        {
            var bitsPerDimension = 10;
            var data             = new GaussianClustering
            {
                ClusterCount   = 50,
                Dimensions     = 100,
                MaxCoordinate  = (1 << bitsPerDimension) - 1,
                MinClusterSize = 100,
                MaxClusterSize = 500
            };
            var expectedClusters = data.MakeClusters();
            var hIndex           = new HilbertIndex(expectedClusters, bitsPerDimension);
            var cc = new ClusterCounter {
                NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1
            };
            var count = cc.Count(hIndex.SortedPoints);
            // Choice of neighborhoodDistance is crucial.
            //   - If it is too large, then a huge number of neighbors will be caught up in the dragnet, and estimating
            //     that value with a window into the Hilbert curve will yield poor results. Why? If there are 200 neighbors
            //     and your window size is 100, then many points will have their neighbor count saturate near 100 and
            //     no meaningful variation in density will be found.
            //   - If it is too small, then too few neighbors (or none!) will be found, and we get no meaningful density.
            //   - We know that almost every point has two neighbors within MaximumSquareDistance, so we should
            //     make it smaller than MaximumSquareDistance.
            var neighborhoodDistance = count.MaximumSquareDistance * 2 / 5;
            var numPoints            = hIndex.SortedPoints.Count;

            var windowRadius = (int)Math.Sqrt(numPoints / 2);
            var dMeter       = new DensityMeter(hIndex, neighborhoodDistance, windowRadius);

            Func <HilbertPoint, long> exactMetric     = p => (long)dMeter.ExactNeighbors(p);
            Func <HilbertPoint, long> estimatedMetric = p => (long)dMeter.EstimatedDensity(p, windowRadius);
            var correlator  = new KendallTauCorrelation <HilbertPoint, long>(exactMetric, estimatedMetric);
            var correlation = correlator.TauB(hIndex.SortedPoints.Take(1000));

            Console.WriteLine($"Correlation between exact and estimated density is: {correlation}");
            Assert.GreaterOrEqual(correlation, 0.90, $"Correlation {correlation} is not high enough");
        }
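        // For intuition, the tau-b statistic that KendallTauCorrelation is assumed to compute
        // compares every pair of points: pairs that both metrics rank the same way are
        // concordant, pairs ranked oppositely are discordant, and ties shrink the denominator.
        // A minimal O(n^2) sketch under that assumption (the real class is not shown here):
        private static double TauB(long[] x, long[] y)
        {
            long concordant = 0, discordant = 0, tiedXOnly = 0, tiedYOnly = 0;
            for (var i = 0; i < x.Length; i++)
            {
                for (var j = i + 1; j < x.Length; j++)
                {
                    var dx = Math.Sign(x[i].CompareTo(x[j]));
                    var dy = Math.Sign(y[i].CompareTo(y[j]));
                    if (dx == 0 && dy == 0)
                    {
                        continue;            // tied on both metrics: excluded from tau-b
                    }
                    if (dx == 0)
                    {
                        tiedXOnly++;
                    }
                    else if (dy == 0)
                    {
                        tiedYOnly++;
                    }
                    else if (dx == dy)
                    {
                        concordant++;
                    }
                    else
                    {
                        discordant++;
                    }
                }
            }
            // tau-b = (C - D) / sqrt((C + D + ties in x only) * (C + D + ties in y only))
            var denominator = Math.Sqrt((double)(concordant + discordant + tiedXOnly)
                                      * (concordant + discordant + tiedYOnly));
            return denominator == 0 ? 0 : (concordant - discordant) / denominator;
        }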
        /// <summary>
        /// Perform a classification of two clusters that are near enough to each other to partially overlap, causing problems.
        ///
        /// From this we can deduce which of the six cases obtains (the SplitQuality).
        /// </summary>
        /// <returns>A Tuple with these parts:
        ///   1) comparison of actual to expected (with its BCubed),
        ///   2) the expected number of clusters
        ///   3) the actual number of clusters
        ///   4) a qualitative assessment of the results.
        /// </returns>
        /// <param name="numPoints">Number of points.</param>
        /// <param name="dimensions">Number of Dimensions.</param>
        /// <param name="overlapPercent">Overlap percent.</param>
        /// <param name="clusterSizeVariation">Cluster size variation.</param>
        /// <param name="maxCoordinate">Max value of any coordinate.</param>
        /// <param name="acceptablePrecision">Acceptable precision</param>
        /// <param name="useDensityClassifier">If set to <c>true</c> use density classifier.</param>
        private Tuple <ClusterMetric <UnsignedPoint, string>, int, int, SplitQuality> ClassifyTwoClustersHelper(int numPoints, int dimensions, double overlapPercent,
                                                                                                                int clusterSizeVariation = 0, int maxCoordinate = 1000, double acceptablePrecision = 0.98, bool useDensityClassifier = true)
        {
            Logger.SetupForTests();
            var bitsPerDimension = maxCoordinate.SmallestPowerOfTwo();
            var clusterCount     = 2;
            var minClusterSize   = (numPoints / clusterCount) - clusterSizeVariation;
            var maxClusterSize   = (numPoints / clusterCount) + clusterSizeVariation;
            var outlierSize      = 5;
            var radiusShrinkage  = 0.6;            // 0.7 merges too many that belong apart!
            var data             = new GaussianClustering
            {
                ClusterCount   = clusterCount,
                Dimensions     = dimensions,
                MaxCoordinate  = maxCoordinate,
                MinClusterSize = minClusterSize,
                MaxClusterSize = maxClusterSize
            };
            var expectedClusters = data.TwoClusters(overlapPercent);

            Classification <UnsignedPoint, string> actualClusters;

            if (useDensityClassifier)
            {
                var hIndex = new HilbertIndex(expectedClusters, bitsPerDimension);
                var cc     = new ClusterCounter {
                    NoiseSkipBy = 10, OutlierSize = outlierSize, ReducedNoiseSkipBy = 1
                };
                var count = cc.Count(hIndex.SortedPoints);

                var unmergeableSize   = expectedClusters.NumPoints / 6;
                var densityClassifier = new DensityClassifier(hIndex, count.MaximumSquareDistance, unmergeableSize)
                {
                    MergeableShrinkage = radiusShrinkage
                };

                actualClusters = densityClassifier.Classify();
            }
            else
            {
                var classifier = new HilbertClassifier(expectedClusters.Points(), 10)
                {
                    OutlierSize = outlierSize
                };
                //classifier.IndexConfig.NoiseSkipBy = 0;
                classifier.IndexConfig.UseSample = false;
                actualClusters = classifier.Classify();
            }

            var          comparison        = expectedClusters.Compare(actualClusters);
            SplitQuality qualitativeResult = SplitQuality.Unsplit;

            if (comparison.BCubed >= 1.0)
            {
                qualitativeResult = SplitQuality.PerfectSplit;
            }
            else if (actualClusters.NumPartitions == 1)
            {
                qualitativeResult = SplitQuality.Unsplit;
            }
            else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision >= 1.0)
            {
                qualitativeResult = SplitQuality.GoodOverSplit;
            }
            else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision >= acceptablePrecision)
            {
                qualitativeResult = SplitQuality.FairOverSplit;
            }
            else if (actualClusters.NumPartitions == expectedClusters.NumPartitions && comparison.Precision >= acceptablePrecision)
            {
                qualitativeResult = SplitQuality.GoodSplit;
            }
            else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision < 1.0)
            {
                qualitativeResult = SplitQuality.BadOverSplit;
            }
            else             // Assume correct number of clusters.
            {
                qualitativeResult = SplitQuality.BadSplit;
            }

            Logger.Info($"  Quality: {qualitativeResult}  Comparison: {comparison}");

            return(new Tuple <ClusterMetric <UnsignedPoint, string>, int, int, SplitQuality>(
                       comparison,
                       expectedClusters.NumPartitions,
                       actualClusters.NumPartitions,
                       qualitativeResult
                       ));
        }
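        // A hypothetical caller (not part of the original fixture) showing how the helper's
        // tuple is typically consumed in a test; the chosen arguments are illustrative only.
        public void ClassifyTwoClusters_ModestOverlap()
        {
            var result  = ClassifyTwoClustersHelper(numPoints: 1000, dimensions: 50, overlapPercent: 20);
            var quality = result.Item4;
            Assert.IsTrue(quality == SplitQuality.PerfectSplit || quality == SplitQuality.GoodSplit,
                          $"Split quality was {quality}");
        }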
        /// <summary>
        /// UnsignedPoint.SquareDistanceCompare has an optimization. This tests how often this optimization
        /// can be exploited in a realistic test. The comparison will be against an estimated characteristic distance
        /// between points. This distance is assumed to be close enough to trigger two points to be merged into a single cluster.
        /// </summary>
        private double SquareDistanceCompareOptimizableCase(int totalComparisons, bool useExtendedOptimization = false)
        {
            // 1. Make test data.
            var bitsPerDimension = 10;
            var data             = new GaussianClustering
            {
                ClusterCount   = 100,
                Dimensions     = 100,
                MaxCoordinate  = (1 << bitsPerDimension) - 1,
                MinClusterSize = 50,
                MaxClusterSize = 150
            };
            var clusters = data.MakeClusters();

            // 2. Create HilbertIndex for points.
            var hIndex = new HilbertIndex(clusters, bitsPerDimension);

            // 3. Deduce the characteristic distance.
            var counter = new ClusterCounter
            {
                OutlierSize = 5,
                NoiseSkipBy = 10
            };
            var count         = counter.Count(hIndex.SortedPoints);
            var mergeDistance = count.MaximumSquareDistance;
            var longDistance  = 5 * mergeDistance;

            // 4. Select random pairs of points and see how many distance comparisons can exploit the optimization.
            var rng    = new FastRandom();
            var points = clusters.Points().ToList();
            var ableToUseOptimizationsAtShortDistance = 0;
            var ableToUseOptimizationsAtLongDistance  = 0;

            for (var i = 0; i < totalComparisons; i++)
            {
                var p1 = points[rng.Next(points.Count)];
                var p2 = points[rng.Next(points.Count)];
                if (useExtendedOptimization)
                {
                    if (IsExtendedDistanceOptimizationUsable(p1, p2, mergeDistance, bitsPerDimension))
                    {
                        ableToUseOptimizationsAtShortDistance++;
                    }
                    if (IsExtendedDistanceOptimizationUsable(p1, p2, longDistance, bitsPerDimension))
                    {
                        ableToUseOptimizationsAtLongDistance++;
                    }
                }
                else
                {
                    if (IsDistanceOptimizationUsable(p1, p2, mergeDistance))
                    {
                        ableToUseOptimizationsAtShortDistance++;
                    }
                    if (IsDistanceOptimizationUsable(p1, p2, longDistance))
                    {
                        ableToUseOptimizationsAtLongDistance++;
                    }
                }
            }
            var percentOptimizable             = 100.0 * ableToUseOptimizationsAtShortDistance / totalComparisons;
            var percentOptimizableLongDistance = 100.0 * ableToUseOptimizationsAtLongDistance / totalComparisons;
            var message = $"Comparisons were {percentOptimizable} % Optimizable at short distance, {percentOptimizableLongDistance} % at long distance";

            Console.WriteLine(message);
            return(percentOptimizable);
        }
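        // The short-circuit counted above is presumably a triangle-inequality test. This is a
        // sketch of the assumed mechanics; the helper and parameter names are illustrative,
        // not the library's real API. For any reference point r,
        // |d(p1,r) - d(p2,r)| <= d(p1,p2), so when the squared lower bound already exceeds
        // the threshold, SquareDistanceCompare can answer "greater" without visiting
        // all D coordinates of both points.
        private static bool CanShortCircuitAsGreater(double d1ToReference, double d2ToReference, long squareDistanceThreshold)
        {
            var lowerBound = d1ToReference - d2ToReference;
            return lowerBound * lowerBound > squareDistanceThreshold;
        }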
        public double SquareDistanceCompareValidationCase(int numTriangulationPoints)
        {
            var correctResult                = 0;
            var wrongResult                  = 0;
            var totalComparisons             = 10000;
            var extraShortTriangulatable    = 0;
            var extraShortNotTriangulatable = 0;
            var shortTriangulatable         = 0;
            var shortNotTriangulatable      = 0;
            var longTriangulatable          = 0;
            var longNotTriangulatable       = 0;

            // 1. Make test data.
            var bitsPerDimension = 10;
            var data             = new GaussianClustering
            {
                ClusterCount   = 100,
                Dimensions     = 100,
                MaxCoordinate  = (1 << bitsPerDimension) - 1,
                MinClusterSize = 50,
                MaxClusterSize = 150
            };
            var clusters = data.MakeClusters();

            // 2. Create HilbertIndex for points.
            var hIndex = new HilbertIndex(clusters, bitsPerDimension);

            hIndex.SetTriangulation(numTriangulationPoints);

            // 3. Deduce the characteristic distance.
            var counter = new ClusterCounter
            {
                OutlierSize = 5,
                NoiseSkipBy = 10
            };
            var count         = counter.Count(hIndex.SortedPoints);
            var mergeDistance = count.MaximumSquareDistance;
            var longDistance  = 5 * mergeDistance;

            // 4. Select random pairs of HilbertPoints and see how many distance comparisons yield the correct result.
            var rng    = new FastRandom();
            var points = hIndex.SortedPoints.ToList();

            for (var i = 0; i < totalComparisons; i++)
            {
                var p1 = points[rng.Next(points.Count)];
                var p2 = points[rng.Next(points.Count)];
                var d  = p1.Measure(p2);
                if (d.CompareTo(mergeDistance) == p1.SquareDistanceCompare(p2, mergeDistance))
                {
                    correctResult++;
                }
                else
                {
                    wrongResult++;
                }

                if (d.CompareTo(longDistance) == p1.SquareDistanceCompare(p2, longDistance))
                {
                    correctResult++;
                }
                else
                {
                    wrongResult++;
                }

                if (p1.Triangulatable(p2, mergeDistance / 2))
                {
                    extraShortTriangulatable++;
                }
                else
                {
                    extraShortNotTriangulatable++;
                }

                if (p1.Triangulatable(p2, mergeDistance))
                {
                    shortTriangulatable++;
                }
                else
                {
                    shortNotTriangulatable++;
                }

                if (p1.Triangulatable(p2, longDistance))
                {
                    longTriangulatable++;
                }
                else
                {
                    longNotTriangulatable++;
                }
            }
            var extraShortPct = 100.0 * extraShortTriangulatable / (extraShortTriangulatable + extraShortNotTriangulatable);
            var shortPct      = 100.0 * shortTriangulatable / (shortTriangulatable + shortNotTriangulatable);
            var longPct       = 100.0 * longTriangulatable / (longTriangulatable + longNotTriangulatable);

            Console.WriteLine($"Triangulatable? \n    XS: {extraShortPct} % \n    Short: {shortPct} % Yes {shortTriangulatable}, No {shortNotTriangulatable}\n    Long: {longPct} % Yes {longTriangulatable}, No {longNotTriangulatable}");
            Assert.AreEqual(0, wrongResult, $"{correctResult} correct, {wrongResult} wrong");

            return(shortPct);
        }
        public PolyChromaticClosestPoint(Classification <UnsignedPoint, TLabel> clusters, HilbertIndex index)
        {
            Clusters = clusters;

            var sorter         = new KeySorter <HilbertPoint, UnsignedPoint>(p => p.UniqueId, p => p.UniqueId);
            var unsortedPoints = Clusters.Points().ToList();
            var sortedPoints   = index.SortedPoints;

            SortedPoints = sorter.Sort(unsortedPoints, sortedPoints, 10).ToList();
            ValidateIds();
        }
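        // KeySorter pairs the two lists by UniqueId, putting the cluster's points into the
        // Hilbert-curve order of index.SortedPoints. A rough sketch of that matching,
        // assuming the semantics implied by the key selectors above (the real KeySorter,
        // including its extra numeric argument, is not shown in this listing):
        private static List<UnsignedPoint> SortByReferenceOrder(IEnumerable<UnsignedPoint> unsorted, IEnumerable<HilbertPoint> reference)
        {
            var byId = unsorted.ToDictionary(p => p.UniqueId);
            return reference.Select(hp => byId[hp.UniqueId]).ToList();
        }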