/// <summary>
        /// Build a GridCoarseness from the most heavily populated bucket of a one-bit Hilbert sort.
        ///
        /// Gaussian clusters are generated, all of their points are sorted by
        /// HilbertSort.SortWithTies using a single bit, and the bucket holding the most points
        /// is handed to the GridCoarseness constructor.
        /// </summary>
        /// <param name="numPoints">Target number of points. The generator's cluster size bounds are numPoints / clusterCount ± 100.</param>
        /// <param name="dimensions">Number of dimensions</param>
        /// <param name="clusterCount">Number of clusters</param>
        /// <param name="maxCoordinate">Largest value any coordinate of any dimension can hold</param>
        /// <param name="minStdDeviation">Minimum standard deviation among coordinate values relative to the center of each Gaussian cluster generated.</param>
        /// <param name="maxStdDeviation">Maximum standard deviation among coordinate values relative to the center of each Gaussian cluster generated.</param>
        /// <returns>The GridCoarseness composed for the points of the largest bucket.</returns>
        /// <exception cref="System.InvalidOperationException">The low resolution sort produced no buckets.</exception>
        GridCoarseness MakeTestGrid(int numPoints, int dimensions, int clusterCount, int maxCoordinate, int minStdDeviation = 10, int maxStdDeviation = 30)
        {
            var avgClusterSize = numPoints / clusterCount;
            var data = new GaussianClustering
            {
                ClusterCount      = clusterCount,
                Dimensions        = dimensions,
                MaxCoordinate     = maxCoordinate,
                MinClusterSize    = avgClusterSize - 100,
                MaxClusterSize    = avgClusterSize + 100,
                MaxDistanceStdDev = maxStdDeviation,
                MinDistanceStdDev = minStdDeviation
            };
            var clusters = data.MakeClusters();
            var points   = clusters.Points().ToList();

            // No balancer is supplied; it is passed by reference, so SortWithTies may replace it.
            PointBalancer balancer = null;

            // Presumably the number of bits needed to hold maxCoordinate - TODO confirm against SmallestPowerOfTwo.
            var bitsRequired = (maxCoordinate + 1).SmallestPowerOfTwo();

            // Sort using a single bit, then keep the bucket with the most points.
            // First() (rather than FirstOrDefault()) fails with a descriptive exception when there are
            // no buckets, instead of a NullReferenceException further along.
            var lowresSort    = HilbertSort.SortWithTies(points, 1, ref balancer);
            var largestBucket = lowresSort.OrderByDescending(bucket => bucket.Length).First();

            return new GridCoarseness(largestBucket, bitsRequired);
        }
        /// <summary>
        /// Print, for every point of a generated clustered data set, the exact neighbor count
        /// alongside the density estimated through a window into the Hilbert curve.
        ///
        /// The output is comma-separated: a header row followed by one row per point.
        /// Nothing is asserted; the output is meant for manual inspection.
        /// </summary>
        public void DensityCompared()
        {
            var bitsPerDimension = 10;
            var data             = new GaussianClustering
            {
                ClusterCount   = 50,
                Dimensions     = 100,
                MaxCoordinate  = (1 << bitsPerDimension) - 1,
                MinClusterSize = 100,
                MaxClusterSize = 500
            };
            var expectedClusters = data.MakeClusters();
            var hIndex           = new HilbertIndex(expectedClusters, bitsPerDimension);
            var cc = new ClusterCounter {
                NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1
            };
            var count = cc.Count(hIndex.SortedPoints);

            // The neighborhood is two fifths of the maximum square distance reported by the cluster counter.
            var neighborhoodDistance = count.MaximumSquareDistance * 2 / 5;
            var numPoints            = hIndex.SortedPoints.Count;

            // Window radius is the truncated square root of half the point count.
            var windowRadius = (int)Math.Sqrt(numPoints / 2);
            var dMeter       = new DensityMeter(hIndex, neighborhoodDistance, windowRadius);

            Console.WriteLine($"Window Radius = {windowRadius}. {numPoints} points");

            // Header and data rows each end with a newline; written with Console.Write
            // they would all run together on a single unparseable line.
            Console.WriteLine("Exact,Estimated");
            foreach (var p in hIndex.SortedPoints)
            {
                var exact    = dMeter.ExactNeighbors(p);
                var estimate = dMeter.EstimatedDensity(p, windowRadius);
                Console.WriteLine($"{exact},{estimate}");
            }
        }
Beispiel #3
0
        /// <summary>
        /// Generate Gaussian clusters, search for an optimal Hilbert index over their points and
        /// fail if the estimated cluster count is greater than the acceptable count.
        /// </summary>
        /// <param name="hilbertTries">Forwarded to OptimalIndex.Search as its fourth argument (labelled maxTrials at the call).</param>
        /// <param name="minClusterSize">Lower bound on cluster size given to the generator.</param>
        /// <param name="maxClusterSize">Upper bound on cluster size given to the generator.</param>
        /// <param name="dimensions">Dimensions per point.</param>
        /// <param name="clusterCount">Number of clusters to generate.</param>
        /// <param name="acceptableClusterCount">Highest estimated cluster count that still passes.</param>
        /// <param name="bitsPerDimension">Bits per dimension used when converting points to HilbertPoints.</param>
        /// <param name="outlierSize">Forwarded to OptimalIndex.Search.</param>
        /// <param name="noiseSkipBy">Forwarded to OptimalIndex.Search.</param>
        private void OptimalIndexTestCase(
            int hilbertTries, int minClusterSize, int maxClusterSize, int dimensions, int clusterCount, int acceptableClusterCount,
            int bitsPerDimension, int outlierSize, int noiseSkipBy)
        {
            const int maxIterationsWithoutImprovement = 4;

            var generator = new GaussianClustering
            {
                ClusterCount = clusterCount,
                Dimensions = dimensions,
                MaxCoordinate = 1000,
                MinClusterSize = minClusterSize,
                MaxClusterSize = maxClusterSize
            };

            // Convert every generated point into a HilbertPoint before searching.
            var hilbertPoints = generator
                .MakeClusters()
                .Points()
                .Select(point => HilbertPoint.CastOrConvert(point, bitsPerDimension, true))
                .ToList();

            var searchResult = OptimalIndex.Search(
                hilbertPoints,
                outlierSize,
                noiseSkipBy,
                hilbertTries,
                maxIterationsWithoutImprovement);

            var summary = $"Estimated cluster count = {searchResult.EstimatedClusterCount}, actual = {clusterCount}, acceptable = {acceptableClusterCount}";
            Console.WriteLine(summary);

            Assert.LessOrEqual(
                searchResult.EstimatedClusterCount,
                acceptableClusterCount,
                $"HilbertIndex fragmented by more than 50%: {summary}");
        }
Beispiel #4
0
        /// <summary>
        /// For random clustered data, report how unique shortened versions of the Hilbert index are.
        /// One console line is written per bit count, from one bit up to the bit count derived
        /// from maxCoordinate, giving the largest bucket and the share of points in small buckets.
        /// </summary>
        /// <param name="numPoints">Number of points.</param>
        /// <param name="dimensions">Dimensions per point.</param>
        /// <param name="clusterCount">Number of clusters.</param>
        /// <param name="smallBucketSize">Count of items that constitutes a small bucket.</param>
        /// <param name="maxCoordinate">Highest permitted coordinate value.</param>
        public void ClusteredUniquenessByBits(int numPoints, int dimensions, int clusterCount, int smallBucketSize, int maxCoordinate)
        {
            const int sizeVariation = 100;
            var averageClusterSize = numPoints / clusterCount;

            var generator = new GaussianClustering
            {
                ClusterCount = clusterCount,
                Dimensions = dimensions,
                MaxCoordinate = maxCoordinate,
                MinClusterSize = averageClusterSize - sizeVariation,
                MaxClusterSize = averageClusterSize + sizeVariation
            };
            var allPoints = generator.MakeClusters().Points().ToList();

            // The same balancer reference is handed to every call, so it can be reused across bit counts.
            PointBalancer balancer = null;
            var highestBits = (maxCoordinate + 1).SmallestPowerOfTwo();

            for (var bits = 1; bits <= highestBits; bits++)
            {
                var largestBucket = MaxBucketSizePerBits(allPoints, bits, smallBucketSize, ref balancer, out int smallBucketPoints);
                var smallBucketPercent = 100.0 * smallBucketPoints / allPoints.Count;
                Console.WriteLine($"Bits: {bits}  Max Bucket: {largestBucket}  # in Small Buckets: {smallBucketPoints} - {smallBucketPercent} %");
            }
        }
Beispiel #5
0
        /// <summary>
        /// Generate test data in known clusters, run unattended clustering over it and time the clustering.
        /// The result is compared with the expected clusters only to report whether it was perfect;
        /// an imperfect result is logged to the console and does not fail anything.
        /// Test data creation is not part of the measured time.
        /// </summary>
        /// <param name="numPoints">Number of points to cluster.</param>
        /// <param name="clusterCount">Cluster count.</param>
        /// <param name="dimensions">Dimensions per point.</param>
        /// <param name="clusterSizeVariation">Cluster size variation.
        ///  The average number of points per cluster is numPoints/clusterCount.
        ///  The actual size of a given cluster will be permitted to vary by as much as ± clusterSizeVariation.
        /// </param>
        /// <param name="maxCoordinate">All points will have coordinate values in the range 0 to maxCoordinate.</param>
        /// <returns>Time in seconds and a Boolean which is false if the clustering did not produce perfect results.</returns>
        private Tuple <double, bool> ClassifyPerformance(int numPoints, int clusterCount, int dimensions,
                                                         int clusterSizeVariation = 0, int maxCoordinate = 1000)
        {
            var averageClusterSize = numPoints / clusterCount;
            var generator = new GaussianClustering
            {
                ClusterCount = clusterCount,
                Dimensions = dimensions,
                MaxCoordinate = maxCoordinate,
                MinClusterSize = averageClusterSize - clusterSizeVariation,
                MaxClusterSize = averageClusterSize + clusterSizeVariation
            };
            var expected = generator.MakeClusters();

            // Timed region: building the classifier and classifying.
            var stopwatch = Stopwatch.StartNew();
            var classifier = new HilbertClassifier(expected.Points(), 10);
            classifier.IndexConfig.UseSample = true;
            var actual = classifier.Classify();
            stopwatch.Stop();

            var isPerfect = expected.IsSimilarTo(actual);
            if (!isPerfect)
            {
                Console.WriteLine($"Clustering was not perfect. # of Clusters actual/expected: {actual.NumPartitions}/{expected.NumPartitions}");
            }

            var elapsedSeconds = stopwatch.ElapsedMilliseconds / 1000.0;
            return new Tuple <double, bool>(elapsedSeconds, isPerfect);
        }
Beispiel #6
0
        /// <summary>
        /// Run a Cluster SlashCommand on generated Gaussian clusters that are loaded directly
        /// into the command (no input or output file) and require the command to report an
        /// acceptable classification.
        /// </summary>
        public void ClusterWithoutFiles()
        {
            const int bitsPerDimension = 10;

            var expectedClassification = new GaussianClustering
            {
                ClusterCount = 20,
                Dimensions = 50,
                MaxCoordinate = (1 << bitsPerDimension) - 1,
                MinClusterSize = 200,
                MaxClusterSize = 600
            }.MakeClusters();

            var config = new SlashConfig() { AcceptableBCubed = 0.98 };
            config.Index.BitsPerDimension = bitsPerDimension;
            config.UseNoFiles();

            var command = new SlashCommand(SlashCommand.CommandType.Cluster, config)
            {
                InputFile = null,
                OutputFile = null
            };
            command.Configuration.DensityClassifier.SkipDensityClassification = true;

            // The logger must be set up after the command is constructed,
            // because the command initializes the logger differently.
            Logger.SetupForTests(null);

            command.LoadData(expectedClassification);
            command.Execute();

            Assert.IsTrue(
                command.IsClassificationAcceptable,
                $"The BCubed value of {command.MeasuredChange.BCubed} was not good enough.");
        }
Beispiel #7
0
        /// <summary>
        /// For the same test data, create a single HilbertIndex many times and average the execution time across all indices.
        ///
        /// The goal is to identify how the time depends on number of points N, number of dimensions D, and bits per coordinate B.
        /// (It should be insensitive to cluster count K.)
        /// </summary>
        /// <param name="N">Number of points to index. Every cluster gets exactly N / K points (integer division),
        /// so the point count check only holds when N is a multiple of K.</param>
        /// <param name="K">Number of clusters of points to create.</param>
        /// <param name="D">Number dimensions.</param>
        /// <param name="B">Number bits.</param>
        /// <param name="repeats">Number of times to repeat. Expected to be positive.</param>
        /// <returns>Average number of seconds to create the index, averaged over several tries.
        /// The time excludes the time to create the test data and the time spent in the assertion.
        /// </returns>
        private double SingleIndexCreationPerformanceCase(int N, int K, int D, int B, int repeats)
        {
            var data = new GaussianClustering
            {
                ClusterCount   = K,
                Dimensions     = D,
                MaxCoordinate  = (1 << B) - 1,
                MinClusterSize = N / K,
                MaxClusterSize = N / K
            };
            var clusters = data.MakeClusters();

            // The stopwatch is never reset: successive Start/Stop intervals accumulate, so the total
            // keeps full timer resolution instead of truncating each repeat to whole milliseconds.
            var timer = new Stopwatch();

            for (var i = 0; i < repeats; i++)
            {
                timer.Start();
                var hIndex = new HilbertIndex(clusters, B);
                // Count is still read inside the timed region, as before, in case it triggers work in the index.
                var indexedCount = hIndex.Count;
                timer.Stop();

                // The assertion itself runs outside the timed region so its overhead is not measured.
                Assert.AreEqual(N, indexedCount, "Index has wrong number of points");
            }
            return timer.Elapsed.TotalSeconds / repeats;
        }
Beispiel #8
0
        /// <summary>
        /// Cluster fifty generated clusters whose sizes grow in steps of 100 from 100 to 5000 points
        /// and require the result to reach the acceptable BCubed value.
        /// </summary>
        public void Classify_DensitySpread()
        {
            const int clusterCount = 50;
            const int dimensions = 100;
            const int maxCoordinate = 1000;
            const double acceptableBCubed = 0.99;

            // Cluster i (zero-based) holds 100 * (i + 1) points.
            var clusterSizes = new int[clusterCount];
            for (var i = 0; i < clusterSizes.Length; i++)
            {
                clusterSizes[i] = 100 * (i + 1);
            }

            var generator = new GaussianClustering
            {
                ClusterCount = clusterCount,
                Dimensions = dimensions,
                MaxCoordinate = maxCoordinate,
                MinClusterSize = clusterSizes.Min(),
                MaxClusterSize = clusterSizes.Max(),
                ClusterSizes = clusterSizes
            };

            ClusterCore(generator, acceptableBCubed);
        }
Beispiel #9
0
        /// <summary>
        /// A test case for PolyChromaticClosestPoint.FindPairByCentroids where clusters conform to a Gaussian distribution.
        /// The cluster labelled "0" is paired with each of the other clusters in turn; the approximate
        /// result counts as a success when its square distance does not exceed the exhaustive one.
        /// The test passes only if every pairing succeeds.
        /// </summary>
        /// <param name="nPoints">Number of points in each cluster.</param>
        /// <param name="dimensions">Number of Dimensions in each point.</param>
        /// <param name="numClusters">Number of clusters to create.</param>
        public void FindPairByCentroidsTestCase(int nPoints, int dimensions, int numClusters)
        {
            const string referenceColor = "0";
            var pairingsTried = numClusters - 1;
            var matches = 0;
            var worstRatio = 1.0;

            var clusters = new GaussianClustering
            {
                ClusterCount = numClusters,
                Dimensions = dimensions,
                MaxCoordinate = 1000,
                MinClusterSize = nPoints,
                MaxClusterSize = nPoints
            }.MakeClusters();
            var pccp = new PolyChromaticClosestPoint <string>(clusters);

            for (var otherIndex = 1; otherIndex < numClusters; otherIndex++)
            {
                var otherColor = otherIndex.ToString();
                var exact = pccp.FindPairExhaustively(referenceColor, otherColor);
                var approximate = pccp.FindPairByCentroids(referenceColor, otherColor);

                if (approximate.SquareDistance <= exact.SquareDistance)
                {
                    matches++;
                    Console.WriteLine("FindPairByCentroids CORRECT.   Exact {0}. Approx {1}", exact, approximate);
                }
                else
                {
                    // Track how far above the exact distance the worst approximation landed.
                    var ratio = approximate.SquareDistance / (double)exact.SquareDistance;
                    worstRatio = Math.Max(worstRatio, ratio);
                    Console.WriteLine("FindPairByCentroids INCORRECT. Exact {0}. Approx {1}. Too high by {2:N3}%",
                                      exact, approximate, 100.0 * (ratio - 1.0));
                }
            }

            var failureMessage = string.Format(
                "Did not succeed every time. Failed {0} of {1} times. Worst distance ratio is {2:N4}. {3} points of {4} dimensions.",
                pairingsTried - matches,
                pairingsTried,
                worstRatio,
                nPoints,
                dimensions);
            Assert.AreEqual(pairingsTried, matches, failureMessage);
        }
Beispiel #10
0
        /// <summary>
        /// For every cluster (color), find its closest neighboring cluster exhaustively and see where
        /// that neighbor ranks among the approximate results of FindAllClustersApproximately.
        /// The test passes if, for every color, the true closest cluster ranks no worse than third.
        /// Distance overage statistics and the rank histogram go to the debug output.
        /// </summary>
        /// <param name="nPoints">Number of points in each cluster.</param>
        /// <param name="dimensions">Number of dimensions in each point.</param>
        /// <param name="numClusters">Number of clusters to create.</param>
        /// <param name="numCurvesToTry">Forwarded to OptimalIndex.Search as its last argument.</param>
        public void AllColorPairsClosestClusterTest(int nPoints, int dimensions, int numClusters, int numCurvesToTry)
        {
            // Slot zero is never used, which lets a rank serve directly as a one-based index.
            var rankCounts = new int[numClusters + 1];
            var generator = new GaussianClustering
            {
                ClusterCount = numClusters,
                Dimensions = dimensions,
                MaxCoordinate = 1000,
                MinClusterSize = nPoints,
                MaxClusterSize = nPoints
            };
            var worstRatio = 1.0;
            var ratioTotal = 0.0;
            var ratioSamples = 0;
            var clusters = generator.MakeClusters();

            var bitsPerDimension = (1 + generator.MaxCoordinate).SmallestPowerOfTwo();
            var hilbertPoints = clusters
                .Points()
                .Select(up => HilbertPoint.CastOrConvert(up, bitsPerDimension, true))
                .ToList();
            var searchResult = OptimalIndex.Search(
                hilbertPoints,
                5,  // outlier size
                10, // NoiseSkipBy
                1,  // ReducedNoiseSkipBy
                numCurvesToTry);

            var pccp = new PolyChromaticClosestPoint <string>(clusters, searchResult.Index);
            var approximatePairs = pccp.FindAllClustersApproximately();

            foreach (var color in clusters.ClassLabels())
            {
                var exact = pccp.FindClusterExhaustively(color).Swap(color);

                // All approximate pairs that involve this color, swapped relative to it.
                var pairsForColor = approximatePairs
                    .Where(cp => cp.Color1.Equals(color) || cp.Color2.Equals(color))
                    .Select(cp => cp.Swap(color))
                    .ToList();

                var approximateDistance = pairsForColor.First(cp => cp.Color2.Equals(exact.Color2)).SquareDistance;
                var rank = 1 + pairsForColor.Count(cp => cp.SquareDistance < approximateDistance);
                rankCounts[rank]++;

                double ratio;
#pragma warning disable RECS0018 // Comparison of floating point numbers with equality operator
                if (exact.SquareDistance == 0.0)
#pragma warning restore RECS0018 // Comparison of floating point numbers with equality operator
                {
                    ratio = 0;
                }
                else
                {
                    ratio = approximateDistance / (double)exact.SquareDistance;
                }

                ratioTotal += ratio;
                ratioSamples++;
                worstRatio = Math.Max(worstRatio, ratio);
            }

            Debug.WriteLine(string.Format("Worst distance overage   = {0:N3}%", (worstRatio - 1.0) * 100.0));
            Debug.WriteLine(string.Format("Average distance overage = {0:N3}%", ((ratioTotal / ratioSamples) - 1.0) * 100.0));

            // The first three ranks are always reported; later ranks only when they occurred.
            for (var rank = 1; rank <= numClusters; rank++)
            {
                if (rankCounts[rank] <= 0 && rank >= 4)
                {
                    continue;
                }
                Debug.WriteLine(string.Format("For {0} Clusters the closest cluster found was Ranked #{1}.", rankCounts[rank], rank));
            }

            // Accept a win, place or show: the true closest cluster shows up as no worse than the 3rd ranked cluster according to the approximate measure.
            var topThree = rankCounts[1] + rankCounts[2] + rankCounts[3];
            Assert.IsTrue(topThree == numClusters,
                          string.Format("Found the closest cluster for {0} colors", rankCounts[1]));
        }
Beispiel #11
0
        /// <summary>
        /// Describe test data in known Gaussian clusters and hand it to ClusterCore, which performs
        /// the unattended clustering and the comparison against the known clusters.
        /// The test passes if the BCubed value is high enough.
        /// </summary>
        /// <param name="numPoints">Number of points to cluster.</param>
        /// <param name="clusterCount">Cluster count.</param>
        /// <param name="dimensions">Dimensions per point.</param>
        /// <param name="clusterSizeVariation">Cluster size variation.
        ///  The average number of points per cluster is numPoints/clusterCount.
        ///  The actual size of a given cluster will be permitted to vary by as much as ± clusterSizeVariation.
        /// </param>
        /// <param name="maxCoordinate">All points will have coordinate values in the range 0 to maxCoordinate.</param>
        /// <param name="acceptableBCubed">The comparison of the actual and expected clusters must yield a BCubed value
        /// that is this high or higher. A value of 1.0 means a perfect clustering, with no points out of place.</param>
        private void ClassifyCase(int numPoints, int clusterCount, int dimensions,
                                  int clusterSizeVariation = 0, int maxCoordinate = 1000, double acceptableBCubed = 0.99)
        {
            var averageClusterSize = numPoints / clusterCount;
            var generator = new GaussianClustering
            {
                ClusterCount = clusterCount,
                Dimensions = dimensions,
                MaxCoordinate = maxCoordinate,
                MinClusterSize = averageClusterSize - clusterSizeVariation,
                MaxClusterSize = averageClusterSize + clusterSizeVariation
            };

            ClusterCore(generator, acceptableBCubed);
        }
Beispiel #12
0
        /// <summary>
        /// Generate clusters from the given description, classify their points with a HilbertClassifier
        /// (sampling enabled) and assert that the comparison with the generated clusters reaches the
        /// acceptable BCubed value. A summary of the comparison is written to the console.
        /// </summary>
        /// <param name="data">Description of the clusters to generate.</param>
        /// <param name="acceptableBCubed">Lowest BCubed value that still passes.</param>
        private void ClusterCore(GaussianClustering data, double acceptableBCubed)
        {
            var expected = data.MakeClusters();

            var classifier = new HilbertClassifier(expected.Points(), 10);
            classifier.IndexConfig.UseSample = true;
            var actual = classifier.Classify();

            var comparison = expected.Compare(actual);

            Console.WriteLine($"   Comparison of clusters: {comparison}.\n   Clusters expected/actual: {expected.NumPartitions}/{actual.NumPartitions}.");
            Console.WriteLine($"   Large clusters: {actual.NumLargePartitions(classifier.OutlierSize)}");

            Assert.GreaterOrEqual(
                comparison.BCubed,
                acceptableBCubed,
                $"Clustering was not good enough. BCubed = {comparison.BCubed}");
        }
        /// <summary>
        /// Run DensityCorrelationCase for every combination of point count and window radius,
        /// optionally several times, and accumulate the results per label.
        /// The running statistics for the affected label are written to the console after each case.
        /// </summary>
        /// <param name="varyWindowRadius">Window radii to try. The first element is also passed to the DensityMeter constructor.</param>
        /// <param name="varyNumPoints">Total point counts to try; each cluster gets numPoints / clusterCount points.</param>
        /// <param name="dimensions">Dimensions per point.</param>
        /// <param name="clusterCount">Number of clusters to generate.</param>
        /// <param name="repeats">How many times to run the whole set of combinations.</param>
        /// <returns>Accumulated statistics keyed by the label produced by MakeLabel.</returns>
        private Dictionary <string, CorrelationStats> DensityCorrelationCases(int[] varyWindowRadius, int[] varyNumPoints, int dimensions, int clusterCount, int repeats = 1)
        {
            const int bitsPerDimension = 10;
            var statsByLabel = new Dictionary <string, CorrelationStats>();

            for (var pass = 0; pass < repeats; pass++)
            {
                foreach (var numPoints in varyNumPoints)
                {
                    var pointsPerCluster = numPoints / clusterCount;
                    var generator = new GaussianClustering
                    {
                        ClusterCount = clusterCount,
                        Dimensions = dimensions,
                        MaxCoordinate = (1 << bitsPerDimension) - 1,
                        MinClusterSize = pointsPerCluster,
                        MaxClusterSize = pointsPerCluster
                    };
                    var hIndex = new HilbertIndex(generator.MakeClusters(), bitsPerDimension);
                    var counter = new ClusterCounter
                    {
                        NoiseSkipBy = 10,
                        OutlierSize = 5,
                        ReducedNoiseSkipBy = 1
                    };
                    var neighborhoodDistance = counter.Count(hIndex.SortedPoints).MaximumSquareDistance * 2 / 5;

                    // NOTE(review): the meter is built with the first radius of the array as supplied,
                    // which is not necessarily the largest one processed below - confirm this is intended.
                    var dMeter = new DensityMeter(hIndex, neighborhoodDistance, varyWindowRadius[0]);

                    // It is more efficient to process windowRadius in descending order,
                    // because the DistanceMemo can reuse more work that way. Once a larger window has been processed,
                    // it includes all shorter windows as well.
                    foreach (var windowRadius in varyWindowRadius.OrderByDescending(r => r))
                    {
                        var label = MakeLabel(numPoints, windowRadius, dimensions, clusterCount);
                        if (!statsByLabel.TryGetValue(label, out CorrelationStats accumulated))
                        {
                            accumulated = new CorrelationStats(label);
                            statsByLabel[label] = accumulated;
                        }
                        accumulated.Add(DensityCorrelationCase(dMeter, windowRadius));
                        Console.Write(accumulated);
                    }
                }
            }
            return statsByLabel;
        }
        /// <summary>
        /// Generate Gaussian clusters with explicitly specified sizes.
        /// </summary>
        /// <param name="clusterSizes">Size of each cluster; its length determines the cluster count.</param>
        /// <param name="dimensions">Dimensions per point.</param>
        /// <param name="maxCoordinate">Highest permitted coordinate value.</param>
        /// <returns>The generated classification of points into clusters.</returns>
        private Classification <UnsignedPoint, string> TestData(int[] clusterSizes, int dimensions, int maxCoordinate)
        {
            var generator = new GaussianClustering
            {
                ClusterCount = clusterSizes.Length,
                Dimensions = dimensions,
                MaxCoordinate = maxCoordinate,
                MinClusterSize = clusterSizes.Min(),
                MaxClusterSize = clusterSizes.Max(),
                ClusterSizes = clusterSizes
            };

            return generator.MakeClusters();
        }
Beispiel #15
0
        /// <summary>
        /// Describe test data in known chained clusters and hand it to ClusterChainCore, which performs
        /// the unattended clustering and the comparison against the known clusters.
        /// The test passes if the BCubed value is high enough.
        /// </summary>
        /// <param name="numPoints">Number of points to cluster.</param>
        /// <param name="clusterCount">Cluster count.</param>
        /// <param name="chainLength">Number of segments in each chain.</param>
        /// <param name="dimensions">Dimensions per point.</param>
        /// <param name="clusterSizeVariation">Cluster size variation.
        ///  The average number of points per cluster is numPoints/clusterCount.
        ///  The actual size of a given cluster will be permitted to vary by as much as ± clusterSizeVariation.
        /// </param>
        /// <param name="maxCoordinate">All points will have coordinate values in the range 0 to maxCoordinate.</param>
        /// <param name="acceptableBCubed">The comparison of the actual and expected clusters must yield a BCubed value
        /// that is this high or higher. A value of 1.0 means a perfect clustering, with no points out of place.</param>
        private void ClassifyChainCase(int numPoints, int clusterCount, int chainLength, int dimensions,
                                       int clusterSizeVariation = 0, int maxCoordinate = 1000, double acceptableBCubed = 0.99)
        {
            var averageClusterSize = numPoints / clusterCount;
            var generator = new GaussianClustering
            {
                ClusterCount = clusterCount,
                Dimensions = dimensions,
                MaxCoordinate = maxCoordinate,
                MinClusterSize = averageClusterSize - clusterSizeVariation,
                MaxClusterSize = averageClusterSize + clusterSizeVariation,
                MaxDistanceStdDev = 300,
                MinDistanceStdDev = 150
            };

            ClusterChainCore(generator, acceptableBCubed, chainLength);
        }
        /// <summary>
        /// Generate the points of a set of Gaussian clusters as a flat array.
        /// </summary>
        /// <param name="numPoints">Target number of points. The generator's cluster size bounds are numPoints / clusterCount ± 100.</param>
        /// <param name="dimensions">Dimensions per point.</param>
        /// <param name="clusterCount">Number of clusters.</param>
        /// <param name="maxCoordinate">Highest permitted coordinate value.</param>
        /// <param name="minStdDeviation">Passed to the generator as MinDistanceStdDev.</param>
        /// <param name="maxStdDeviation">Passed to the generator as MaxDistanceStdDev.</param>
        /// <param name="bitsPerDimension">Receives (maxCoordinate + 1).SmallestPowerOfTwo().</param>
        /// <returns>All generated points.</returns>
        UnsignedPoint[] TestData(int numPoints, int dimensions, int clusterCount, int maxCoordinate, int minStdDeviation, int maxStdDeviation, out int bitsPerDimension)
        {
            var typicalClusterSize = numPoints / clusterCount;
            var generator = new GaussianClustering
            {
                ClusterCount = clusterCount,
                Dimensions = dimensions,
                MaxCoordinate = maxCoordinate,
                MinClusterSize = typicalClusterSize - 100,
                MaxClusterSize = typicalClusterSize + 100,
                MaxDistanceStdDev = maxStdDeviation,
                MinDistanceStdDev = minStdDeviation
            };
            var generatedPoints = generator.MakeClusters().Points().ToArray();

            bitsPerDimension = (maxCoordinate + 1).SmallestPowerOfTwo();
            return generatedPoints;
        }
        /// <summary>
        /// Measure the Kendall Tau-B correlation between the exact neighbor count and the estimated
        /// density over the first 1000 sorted points of a generated data set, and require it to be at least 0.90.
        /// </summary>
        public void DensityCorrelation()
        {
            const int bitsPerDimension = 10;
            const int sampleSize = 1000;
            const double minimumCorrelation = 0.90;

            var generator = new GaussianClustering
            {
                ClusterCount = 50,
                Dimensions = 100,
                MaxCoordinate = (1 << bitsPerDimension) - 1,
                MinClusterSize = 100,
                MaxClusterSize = 500
            };
            var hIndex = new HilbertIndex(generator.MakeClusters(), bitsPerDimension);
            var counter = new ClusterCounter
            {
                NoiseSkipBy = 10,
                OutlierSize = 5,
                ReducedNoiseSkipBy = 1
            };
            var count = counter.Count(hIndex.SortedPoints);

            // The choice of neighborhoodDistance is crucial.
            //   - Too large: a huge number of neighbors is swept up, and estimating that number through a
            //     window into the Hilbert curve gives poor results. With 200 neighbors and a window size
            //     of 100, many points saturate near 100 and no meaningful variation in density is seen.
            //   - Too small: too few neighbors (or none!) are found, and there is no meaningful density.
            //   - Almost every point has two neighbors within MaximumSquareDistance, so the value
            //     should be smaller than MaximumSquareDistance.
            var neighborhoodDistance = count.MaximumSquareDistance * 2 / 5;

            var totalPoints = hIndex.SortedPoints.Count;
            var windowRadius = (int)Math.Sqrt(totalPoints / 2);
            var dMeter = new DensityMeter(hIndex, neighborhoodDistance, windowRadius);

            Func <HilbertPoint, long> exactNeighbors = point => (long)dMeter.ExactNeighbors(point);
            Func <HilbertPoint, long> estimatedDensity = point => (long)dMeter.EstimatedDensity(point, windowRadius);
            var correlator = new KendallTauCorrelation <HilbertPoint, long>(exactNeighbors, estimatedDensity);
            var correlation = correlator.TauB(hIndex.SortedPoints.Take(sampleSize));

            Console.WriteLine($"Correlation between exact and estimated density is: {correlation}");
            Assert.GreaterOrEqual(correlation, minimumCorrelation, $"Correlation {correlation} is not high enough");
        }
Beispiel #18
0
        /// <summary>
        /// Generate chained clusters from the given description, classify their points with a
        /// HilbertClassifier (sampling enabled) and assert that the comparison with the generated
        /// chains reaches the acceptable BCubed value. Summary lines are sent to the Logger.
        /// </summary>
        /// <param name="data">Description of the clusters to generate.</param>
        /// <param name="acceptableBCubed">Lowest BCubed value that still passes.</param>
        /// <param name="chainLength">Passed to MakeChains when generating the expected clusters.</param>
        private void ClusterChainCore(GaussianClustering data, double acceptableBCubed, int chainLength)
        {
            var expected = data.MakeChains(chainLength);

            var classifier = new HilbertClassifier(expected.Points(), 10);
            classifier.IndexConfig.UseSample = true;
            var actual = classifier.Classify();

            var comparison = expected.Compare(actual);

            Logger.Info($"   Comparison of clusters: {comparison}.\n   Clusters expected/actual: {expected.NumPartitions}/{actual.NumPartitions}.");
            Logger.Info($"   Large clusters: {actual.NumLargePartitions(classifier.OutlierSize)}");

            // Total the points sitting in clusters smaller than the classifier's outlier size.
            var pointsInOutliers = actual.LabelToPoints.Values
                .Select(members => members.Count())
                .Where(size => size < classifier.OutlierSize)
                .Sum();
            Logger.Info($"   Points in Outliers/Total Point: {pointsInOutliers} / {actual.NumPoints}");

            Assert.GreaterOrEqual(
                comparison.BCubed,
                acceptableBCubed,
                $"Clustering was not good enough. BCubed = {comparison.BCubed}");
        }
        /// <summary>
        /// Measures how often the optimization inside UnsignedPoint.SquareDistanceCompare is usable on
        /// realistic data. Random pairs of points are tested against an estimated characteristic distance
        /// (assumed to be close enough to cause two points to be merged into one cluster) and against
        /// five times that distance.
        /// </summary>
        /// <param name="totalComparisons">How many random pairs of points to test.</param>
        /// <param name="useExtendedOptimization">If true, test with IsExtendedDistanceOptimizationUsable
        /// instead of IsDistanceOptimizationUsable.</param>
        /// <returns>Percentage of pairs for which the optimization was usable at the short (merge) distance.</returns>
        private double SquareDistanceCompareOptimizableCase(int totalComparisons, bool useExtendedOptimization = false)
        {
            // Step 1: generate the clustered test data.
            const int bitsPerDimension = 10;
            var generator = new GaussianClustering
            {
                ClusterCount   = 100,
                Dimensions     = 100,
                MaxCoordinate  = (1 << bitsPerDimension) - 1,
                MinClusterSize = 50,
                MaxClusterSize = 150
            };
            var clusters = generator.MakeClusters();

            // Step 2: index the points along a Hilbert curve.
            var hIndex = new HilbertIndex(clusters, bitsPerDimension);

            // Step 3: derive the characteristic (merge) distance and a longer distance to contrast it with.
            var clusterCounter = new ClusterCounter
            {
                OutlierSize = 5,
                NoiseSkipBy = 10
            };
            var tally         = clusterCounter.Count(hIndex.SortedPoints);
            var mergeDistance = tally.MaximumSquareDistance;
            var longDistance  = 5 * mergeDistance;

            // Step 4: draw random pairs and count those for which the optimization applies.
            var random     = new FastRandom();
            var candidates = clusters.Points().ToList();
            var shortHits  = 0;
            var longHits   = 0;

            for (var trial = 0; trial < totalComparisons; trial++)
            {
                var first  = candidates[random.Next(candidates.Count)];
                var second = candidates[random.Next(candidates.Count)];

                bool usableAtShort;
                bool usableAtLong;
                if (useExtendedOptimization)
                {
                    usableAtShort = IsExtendedDistanceOptimizationUsable(first, second, mergeDistance, bitsPerDimension);
                    usableAtLong  = IsExtendedDistanceOptimizationUsable(first, second, longDistance, bitsPerDimension);
                }
                else
                {
                    usableAtShort = IsDistanceOptimizationUsable(first, second, mergeDistance);
                    usableAtLong  = IsDistanceOptimizationUsable(first, second, longDistance);
                }

                if (usableAtShort)
                {
                    shortHits++;
                }
                if (usableAtLong)
                {
                    longHits++;
                }
            }

            var percentOptimizable             = 100.0 * shortHits / totalComparisons;
            var percentOptimizableLongDistance = 100.0 * longHits / totalComparisons;

            Console.WriteLine($"Comparisons were {percentOptimizable} % Optimizable at short distance, {percentOptimizableLongDistance} % at long distance");
            return percentOptimizable;
        }
Beispiel #20
0
        /// <summary>
        /// Sorts the same generated points twice, once with HilbertSort.BalancedSort and once with
        /// HilbertSort.SortWithTies at <paramref name="lowresBits"/> bits, and asserts that no pair of
        /// points occupying different low-resolution buckets appears in the opposite relative order
        /// in the high-resolution sort.
        /// </summary>
        /// <param name="numPoints">Approximate total number of points; divided by the cluster count to size the clusters.</param>
        /// <param name="dimensions">Number of dimensions per point.</param>
        /// <param name="clusterCount">Number of clusters to generate.</param>
        /// <param name="lowresBits">Bits value passed to the low-resolution sort.</param>
        public void LowresVersusHiresCase(int numPoints, int dimensions, int clusterCount, int lowresBits)
        {
            const int sizeSpread = 100;
            var meanClusterSize  = numPoints / clusterCount;
            var generator        = new GaussianClustering
            {
                ClusterCount   = clusterCount,
                Dimensions     = dimensions,
                MaxCoordinate  = 1000,
                MinClusterSize = meanClusterSize - sizeSpread,
                MaxClusterSize = meanClusterSize + sizeSpread
            };
            var           allPoints = generator.MakeClusters().Points().ToList();
            PointBalancer balancer  = null;

            var fineOrder   = HilbertSort.BalancedSort(allPoints, ref balancer);
            var coarseOrder = HilbertSort.SortWithTies(allPoints, lowresBits, ref balancer);

            // Rank of every point in the high-resolution ordering.
            var fineRank = new Dictionary<UnsignedPoint, int>();
            var nextRank = 0;
            foreach (var p in fineOrder)
            {
                fineRank[p] = nextRank++;
            }

            // Every point in a low-resolution bucket shares the index of that bucket.
            var coarseRank  = new Dictionary<UnsignedPoint, int>();
            var bucketIndex = 0;
            foreach (var bucket in coarseOrder)
            {
                foreach (var member in bucket)
                {
                    coarseRank[member] = bucketIndex;
                }
                bucketIndex++;
            }

            var pointTotal    = allPoints.Count;
            var largestBucket = coarseOrder.Max(bucket => bucket.Length);
            string caseDescription = $"N = {pointTotal}  D = {dimensions}  K = {clusterCount}  B = {lowresBits}";

            Console.WriteLine(caseDescription);
            Console.WriteLine($"Lowres buckets = {coarseOrder.Count}  Largest bucket = {largestBucket}");

            // Examine every pair of points: unless the two are tied in the low-resolution
            // ordering, both orderings must agree on which of them comes first.
            var outOfPlaceCount = 0;
            for (var first = 0; first < pointTotal - 1; first++)
            {
                var anchor  = allPoints[first];
                var coarseA = coarseRank[anchor];
                var fineA   = fineRank[anchor];

                for (var second = first + 1; second < pointTotal; second++)
                {
                    var other   = allPoints[second];
                    var coarseB = coarseRank[other];
                    var fineB   = fineRank[other];

                    if (coarseA != coarseB && (coarseA < coarseB) != (fineA < fineB))
                    {
                        outOfPlaceCount++;
                    }
                }
            }

            string msg = $"Out of place count = {outOfPlaceCount}";
            Console.WriteLine(msg);
            Assert.AreEqual(0, outOfPlaceCount, msg);
        }
Beispiel #21
0
        /// <summary>
        /// Prints a percentile table (CSV, one row per percentile) comparing two distance distributions
        /// over the same generated points: the distance between points adjacent in the Hilbert index
        /// found by OptimalIndex.Search, and the distance between randomly chosen pairs of points.
        /// The method only writes to the console; it makes no assertions.
        /// </summary>
        public void DistanceDistribution()
        {
            /*
             *      Sample output from a previous run:
             *
             *      Percentile,By Index,By Random
             *      -----------------------------
             *      0%,111.35,146.55
             *      1%,142.06,255.96
             *      2%,147.21,2163.43
             *      3%,151.2,2214.15
             *      4%,154.06,2245.2
             *      5%,156.24,2271.37
             *      6%,158.38,2292.29
             *      7%,160.42,2313.55
             *      8%,162.29,2327.14
             *      9%,164.07,2345.25
             *      10%,165.41,2359.95
             *      11%,166.72,2372.83
             *      12%,167.99,2386.15
             *      13%,169.29,2398.47
             *      14%,170.43,2410.01
             *      15%,171.53,2422.34
             *      16%,172.48,2432.43
             *      17%,173.58,2443.08
             *      18%,174.73,2454.27
             *      19%,175.56,2463.71
             *      20%,176.35,2472.97
             *      21%,177.35,2483.24
             *      22%,178.3,2491.9
             *      23%,179.1,2501.44
             *      24%,179.82,2510.26
             *      25%,180.64,2517.73
             *      26%,181.55,2524.97
             *      27%,182.33,2531.58
             *      28%,182.98,2538.08
             *      29%,183.67,2543.83
             *      30%,184.33,2550.93
             *      31%,185.09,2556.59
             *      32%,185.7,2563.37
             *      33%,186.41,2570.29
             *      34%,187.09,2577.29
             *      35%,187.7,2583.56
             *      36%,188.43,2589.95
             *      37%,189.07,2596.13
             *      38%,189.71,2602.24
             *      39%,190.46,2608.28
             *      40%,191.08,2615.25
             *      41%,191.79,2620.81
             *      42%,192.46,2626.02
             *      43%,193.09,2632.7
             *      44%,193.71,2638.18
             *      45%,194.31,2643.35
             *      46%,194.98,2648.69
             *      47%,195.65,2655.47
             *      48%,196.3,2660.26
             *      49%,196.96,2666.37
             *      50%,197.66,2670.94
             *      51%,198.34,2677.09
             *      52%,199.07,2681.9
             *      53%,199.72,2687.11
             *      54%,200.3,2692.42
             *      55%,201.06,2697.92
             *      56%,201.71,2703.76
             *      57%,202.4,2710.17
             *      58%,203.16,2715.06
             *      59%,203.82,2720.25
             *      60%,204.51,2725.99
             *      61%,205.32,2731.6
             *      62%,206.08,2736.59
             *      63%,206.79,2741.72
             *      64%,207.58,2746.59
             *      65%,208.29,2754.03
             *      66%,209.07,2760.81
             *      67%,209.8,2766.65
             *      68%,210.68,2771.98
             *      69%,211.71,2778.27
             *      70%,212.38,2784.23
             *      71%,213.19,2790.71
             *      72%,213.92,2796.42
             *      73%,214.82,2802.84
             *      74%,215.68,2809.36
             *      75%,216.54,2814.55
             *      76%,217.48,2821.32
             *      77%,218.43,2827.56
             *      78%,219.35,2833.35
             *      79%,220.28,2840.72
             *      80%,221.33,2848.87
             *      81%,222.31,2856.89
             *      82%,223.42,2864
             *      83%,224.46,2872.51
             *      84%,225.83,2881.09
             *      85%,227.06,2891.57
             *      86%,228.27,2900.46
             *      87%,229.63,2910.46
             *      88%,231.55,2919.5
             *      89%,233.59,2933.76
             *      90%,235.6,2944.88
             *      91%,237.25,2959.45
             *      92%,239.83,2976.08
             *      93%,241.88,2990.4
             *      94%,244.97,3010.08
             *      95%,248.23,3029.15
             *      96%,252.34,3052.37
             *      97%,260.68,3074.84
             *      98%,282.76,3112.43      *** Note the jump from 282 to 2550, which shows that the characteristic distance is about 282.
             *      99%,2550.87,3170.93
             *      100%,3114.89,3412.57
             */
            var data = new GaussianClustering
            {
                ClusterCount   = 100,
                Dimensions     = 50,
                MaxCoordinate  = 1000,
                MinClusterSize = 50,
                MaxClusterSize = 150
            };
            var clusters         = data.MakeClusters();
            var bitsPerDimension = 10;
            var points           = clusters.Points().Select(p => HilbertPoint.CastOrConvert(p, bitsPerDimension, true)).ToList();
            var results          = OptimalIndex.Search(
                points,
                5,                     // outlierSize
                10,                    // noiseSkipBy
                1000,                  // maxTrials
                4                      // maxIterationsWithoutImprovement
                );
            var pointsFromIndex  = results.Index.SortedPoints;
            var distancesRandom  = new List<long>();
            var distancesHilbert = new List<long>();
            var n   = pointsFromIndex.Count;
            var rng = new FastRandom();

            // Collect n - 1 measurements of each kind: consecutive points along the
            // Hilbert ordering, and an equal number of randomly drawn pairs.
            for (var i = 0; i < n - 1; i++)
            {
                var p1 = pointsFromIndex[i];
                var p2 = pointsFromIndex[i + 1];
                distancesHilbert.Add(p1.Measure(p2));

                var p3 = pointsFromIndex[rng.Next(n)];
                var p4 = pointsFromIndex[rng.Next(n)];
                distancesRandom.Add(p3.Measure(p4));
            }
            distancesHilbert.Sort();
            distancesRandom.Sort();

            Console.WriteLine("Percentile,By Index,By Random");
            for (var percentile = 0; percentile <= 100; percentile++)
            {
                // Each list holds n - 1 values, so the last valid position is n - 2.
                var i           = Math.Min(n - 2, (n - 1) * percentile / 100);
                var distHilbert = Math.Round(Math.Sqrt(distancesHilbert[i]), 2);
                var distRandom  = Math.Round(Math.Sqrt(distancesRandom[i]), 2);

                // One row per line; Console.Write would run every row together on a single line.
                Console.WriteLine($"{percentile}%,{distHilbert},{distRandom}");
            }
        }
Beispiel #22
0
        /// <summary>
        /// Test case for PolyChromaticClosestPoint.FindPairApproximately using Gaussian clusters.
        /// The cluster labelled "0" is paired with each of the other clusters in turn, and the
        /// approximate closest pair must never be farther apart than the exhaustively found pair.
        /// </summary>
        /// <param name="nPoints">Number of points in each cluster.</param>
        /// <param name="dimensions">Number of dimensions in each point.</param>
        /// <param name="numClusters">Number of clusters to create.</param>
        /// <param name="hilbertsToTry">Number of randomly generated Hilbert curves to try. When greater than one,
        /// the index is obtained from OptimalIndex.Search.</param>
        public void GaussianPolyChromaticPairTestCase(int nPoints, int dimensions, int numClusters, int hilbertsToTry = 1)
        {
            const string firstColor = "0";
            var exactMatches     = 0;
            var largestOvershoot = 1.0;

            var generator = new GaussianClustering
            {
                ClusterCount   = numClusters,
                Dimensions     = dimensions,
                MaxCoordinate  = 1000,
                MinClusterSize = nPoints,
                MaxClusterSize = nPoints
            };
            var clusters = generator.MakeClusters();

            PolyChromaticClosestPoint<string> finder;
            if (hilbertsToTry > 1)
            {
                // Several curves requested: search for an index and hand it to the finder.
                var bits          = (1 + generator.MaxCoordinate).SmallestPowerOfTwo();
                var hilbertPoints = clusters.Points().Select(up => HilbertPoint.CastOrConvert(up, bits, true)).ToList();
                var search        = OptimalIndex.Search(
                    hilbertPoints,
                    5 /*outlier size */, 10 /* NoiseSkipBy */, 1 /* ReducedNoiseSkipBy */, hilbertsToTry
                    );
                finder = new PolyChromaticClosestPoint<string>(clusters, search.Index);
            }
            else
            {
                finder = new PolyChromaticClosestPoint<string>(clusters);
            }

            for (var colorNumber = 1; colorNumber < numClusters; colorNumber++)
            {
                var secondColor = colorNumber.ToString();
                var exact       = finder.FindPairExhaustively(firstColor, secondColor);
                var approximate = finder.FindPairApproximately(firstColor, secondColor);

                if (approximate.SquareDistance <= exact.SquareDistance)
                {
                    exactMatches++;
                    Console.WriteLine("FindPairApproximately CORRECT.   Exact {0}. Approx {1}", exact, approximate);
                }
                else
                {
                    // Track the worst overshoot seen so it can be reported if the assertion fails.
                    var ratio = approximate.SquareDistance / (double)exact.SquareDistance;
                    largestOvershoot = Math.Max(largestOvershoot, ratio);
                    Console.WriteLine("FindPairApproximately INCORRECT. Exact {0}. Approx {1}. Too high by {2:N3}%",
                                      exact, approximate, 100.0 * (ratio - 1.0));
                }
            }

            var pairsTested    = numClusters - 1;
            var failureMessage = string.Format(
                "Did not succeed every time. Failed {0} of {1} times. Worst distance ratio is {2:N4}. {3} points of {4} dimensions.",
                numClusters - exactMatches - 1,
                pairsTested,
                largestOvershoot,
                nPoints,
                dimensions);

            Assert.AreEqual(pairsTested, exactMatches, failureMessage);
        }
Beispiel #23
0
        /// <summary>
        /// For every cluster, compares the exhaustively found closest cluster with the best approximate
        /// answer taken across several Hilbert indices, and asserts that both the neighboring cluster
        /// and the distance were found correctly for all clusters.
        /// </summary>
        /// <param name="nPoints">Number of points in each cluster.</param>
        /// <param name="dimensions">Number of dimensions in each point.</param>
        /// <param name="numClusters">Number of clusters to create.</param>
        /// <param name="numCurvesToTry">Passed as the last argument to OptimalIndex.SearchMany.</param>
        /// <param name="numCurvesToKeep">Number of indices requested from OptimalIndex.SearchMany.</param>
        public void ClosestClusterTest(int nPoints, int dimensions, int numClusters, int numCurvesToTry, int numCurvesToKeep)
        {
            var colorMatches    = 0;
            var distanceMatches = 0;
            var generator       = new GaussianClustering
            {
                ClusterCount   = numClusters,
                Dimensions     = dimensions,
                MaxCoordinate  = 1000,
                MinClusterSize = nPoints,
                MaxClusterSize = nPoints
            };

            // Running minima over all colors, for the exact and the approximate searches.
            var overallExact       = new PolyChromaticClosestPoint<string>.ClosestPair();
            var overallApproximate = new PolyChromaticClosestPoint<string>.ClosestPair();

            var clusters = generator.MakeClusters();
            var bits     = (1 + generator.MaxCoordinate).SmallestPowerOfTwo();

            var hilbertPoints = clusters.Points().Select(up => HilbertPoint.CastOrConvert(up, bits, true)).ToList();
            var bestIndices   = OptimalIndex.SearchMany(
                hilbertPoints,
                numCurvesToKeep,
                5 /*outlier size */, 10 /* NoiseSkipBy */, 1 /* ReducedNoiseSkipBy */, numCurvesToTry
                );

            // One finder per retained index.
            var finders = bestIndices
                          .Select(result => result.Index)
                          .ToList()
                          .Select(index => new PolyChromaticClosestPoint<string>(clusters, index))
                          .ToList();

            var primaryFinder = finders[0];

            foreach (var color in primaryFinder.Clusters.ClassLabels())
            {
                var exact       = primaryFinder.FindClusterExhaustively(color);
                var approximate = finders
                                  .Select(finder => finder.FindClusterApproximately(color))
                                  .OrderBy(candidate => candidate)
                                  .First();

                if (approximate.SquareDistance <= exact.SquareDistance)
                {
                    distanceMatches++;
                }
                if (exact.Color2.Equals(approximate.Color2))
                {
                    colorMatches++;
                }
                if (exact.SquareDistance < overallExact.SquareDistance)
                {
                    overallExact = exact;
                }
                if (approximate.SquareDistance < overallApproximate.SquareDistance)
                {
                    overallApproximate = approximate;
                }

                var overshoot = approximate.SquareDistance / (double)exact.SquareDistance;
                Console.WriteLine(string.Format("Exact {0} vs Approx. {1}. Over by {2:N3}%", exact, approximate, (overshoot - 1.0) * 100.0));
            }

            if (overallApproximate.SquareDistance <= overallExact.SquareDistance)
            {
                Console.WriteLine("DID FIND the closest pair of points overall. Exact {0}. Approx {1}", overallExact, overallApproximate);
            }
            else
            {
                Console.WriteLine("DID NOT FIND the closest pair of points overall. Exact {0}. Approx {1}", overallExact, overallApproximate);
            }

            var everySearchCorrect = colorMatches == numClusters && distanceMatches == numClusters;
            var failureMessage     = string.Format(
                "Of {0} clusters, only {1} searches found the closest cluster and {2} found the shortest distance.",
                numClusters,
                colorMatches,
                distanceMatches);

            Assert.IsTrue(everySearchCorrect, failureMessage);
        }
Beispiel #24
0
        /// <summary>
        /// Perform a classification of two clusters that are near enough to each other to partially overlap, causing problems.
        ///
        /// From this we can deduce which of the SplitQuality cases obtains.
        /// </summary>
        /// <returns>A Tuple with these parts:
        ///   1) comparison of actual to expected (with its BCubed),
        ///   2) the expected number of clusters
        ///   3) the actual number of clusters
        ///   4) a qualitative assessment of the results.
        /// </returns>
        /// <param name="numPoints">Number of points.</param>
        /// <param name="dimensions">Number of Dimensions.</param>
        /// <param name="overlapPercent">Overlap percent, passed to GaussianClustering.TwoClusters.</param>
        /// <param name="clusterSizeVariation">Amount subtracted from / added to half of numPoints to get the minimum / maximum cluster size.</param>
        /// <param name="maxCoordinate">Max value of any coordinate.</param>
        /// <param name="acceptablePrecision">Precision at or above which an imperfect result is still rated as a fair over-split or a good split.</param>
        /// <param name="useDensityClassifier">If set to <c>true</c> use a DensityClassifier, otherwise a HilbertClassifier.</param>
        private Tuple<ClusterMetric<UnsignedPoint, string>, int, int, SplitQuality> ClassifyTwoClustersHelper(int numPoints, int dimensions, double overlapPercent,
                                                                                                              int clusterSizeVariation = 0, int maxCoordinate = 1000, double acceptablePrecision = 0.98, bool useDensityClassifier = true)
        {
            Logger.SetupForTests();

            // Coordinates range over 0..maxCoordinate, i.e. maxCoordinate + 1 distinct values. The other
            // tests in this file derive the bit count from (maxCoordinate + 1) as well; omitting the
            // + 1 gives one bit too few when maxCoordinate is an exact power of two.
            var bitsPerDimension = (maxCoordinate + 1).SmallestPowerOfTwo();
            var clusterCount     = 2;
            var minClusterSize   = (numPoints / clusterCount) - clusterSizeVariation;
            var maxClusterSize   = (numPoints / clusterCount) + clusterSizeVariation;
            var outlierSize      = 5;
            var radiusShrinkage  = 0.6;            // 0.7 merges too many that belong apart!
            var data             = new GaussianClustering
            {
                ClusterCount   = clusterCount,
                Dimensions     = dimensions,
                MaxCoordinate  = maxCoordinate,
                MinClusterSize = minClusterSize,
                MaxClusterSize = maxClusterSize
            };
            var expectedClusters = data.TwoClusters(overlapPercent);

            Classification<UnsignedPoint, string> actualClusters;

            if (useDensityClassifier)
            {
                // Density route: the merge distance comes from a ClusterCounter run over the Hilbert-sorted points.
                var hIndex = new HilbertIndex(expectedClusters, bitsPerDimension);
                var cc     = new ClusterCounter {
                    NoiseSkipBy = 10, OutlierSize = outlierSize, ReducedNoiseSkipBy = 1
                };
                var count = cc.Count(hIndex.SortedPoints);

                var unmergeableSize   = expectedClusters.NumPoints / 6;
                var densityClassifier = new DensityClassifier(hIndex, count.MaximumSquareDistance, unmergeableSize)
                {
                    MergeableShrinkage = radiusShrinkage
                };

                actualClusters = densityClassifier.Classify();
            }
            else
            {
                var classifier = new HilbertClassifier(expectedClusters.Points(), 10)
                {
                    OutlierSize = outlierSize
                };
                //classifier.IndexConfig.NoiseSkipBy = 0;
                classifier.IndexConfig.UseSample = false;
                actualClusters = classifier.Classify();
            }

            var          comparison = expectedClusters.Compare(actualClusters);
            SplitQuality qualitativeResult;

            // The order of these checks matters: each branch assumes all earlier ones failed.
            if (comparison.BCubed >= 1.0)
            {
                qualitativeResult = SplitQuality.PerfectSplit;
            }
            else if (actualClusters.NumPartitions == 1)
            {
                qualitativeResult = SplitQuality.Unsplit;
            }
            else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision >= 1.0)
            {
                qualitativeResult = SplitQuality.GoodOverSplit;
            }
            else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision >= acceptablePrecision)
            {
                qualitativeResult = SplitQuality.FairOverSplit;
            }
            else if (actualClusters.NumPartitions == expectedClusters.NumPartitions && comparison.Precision >= acceptablePrecision)
            {
                qualitativeResult = SplitQuality.GoodSplit;
            }
            else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision < 1.0)
            {
                qualitativeResult = SplitQuality.BadOverSplit;
            }
            else             // Assume correct number of clusters.
            {
                qualitativeResult = SplitQuality.BadSplit;
            }

            Logger.Info($"  Quality: {qualitativeResult}  Comparison: {comparison}");

            return new Tuple<ClusterMetric<UnsignedPoint, string>, int, int, SplitQuality>(
                       comparison,
                       expectedClusters.NumPartitions,
                       actualClusters.NumPartitions,
                       qualitativeResult
                       );
        }
Beispiel #25
0
        /// <summary>
        /// Generates fifty Gaussian clusters of one hundred 100-dimensional points each and, for every
        /// cluster, compares the exhaustive closest-cluster search with the approximate and iterative
        /// searches. Asserts that the approximate search found the right neighbor and a distance no
        /// greater than the exact one for all clusters; the iterative (crosscheck) tally is only reported.
        /// </summary>
        public void ClosestOfFiftyClusters()
        {
            int hilbertTries     = 1000;
            var nPoints          = 100;
            var dimensions       = 100;
            var clusterCount     = 50;
            var colorMatches     = 0;
            var crosscheckHits   = 0;
            var distanceMatches  = 0;

            var generator = new GaussianClustering
            {
                ClusterCount   = clusterCount,
                Dimensions     = dimensions,
                MaxCoordinate  = 1000,
                MinClusterSize = nPoints,
                MaxClusterSize = nPoints
            };

            // Running minima over all colors, for the exact and the approximate searches.
            var overallExact       = new PolyChromaticClosestPoint<string>.ClosestPair();
            var overallApproximate = new PolyChromaticClosestPoint<string>.ClosestPair();

            var bits     = (1 + generator.MaxCoordinate).SmallestPowerOfTwo();
            var clusters = generator.MakeClusters();

            Assert.AreEqual(clusterCount, clusters.NumPartitions, "Test data are grouped into fewer clusters than requested.");

            PolyChromaticClosestPoint<string> finder;
            if (hilbertTries > 1)
            {
                var reducedNoiseSkipBy = 1;
                var hilbertPoints      = clusters.Points().Select(up => HilbertPoint.CastOrConvert(up, bits, true)).ToList();
                var search             = OptimalIndex.Search(
                    hilbertPoints,
                    5 /*outlier size */, 10 /* NoiseSkipBy */, reducedNoiseSkipBy, hilbertTries
                    );
                finder = new PolyChromaticClosestPoint<string>(clusters, search.Index);
            }
            else
            {
                finder = new PolyChromaticClosestPoint<string>(clusters);
            }

            foreach (var color in finder.Clusters.ClassLabels())
            {
                var exact       = finder.FindClusterExhaustively(color);
                var approximate = finder.FindClusterApproximately(color);
                var crosscheck  = finder.FindClusterIteratively(color);

                if (approximate.SquareDistance <= exact.SquareDistance)
                {
                    distanceMatches++;
                }
                if (exact.Color2.Equals(approximate.Color2))
                {
                    colorMatches++;
                }
                if (exact.Color2.Equals(crosscheck.Color2))
                {
                    crosscheckHits++;
                }
                if (exact.SquareDistance < overallExact.SquareDistance)
                {
                    overallExact = exact;
                }
                if (approximate.SquareDistance < overallApproximate.SquareDistance)
                {
                    overallApproximate = approximate;
                }

                var overshoot = approximate.SquareDistance / (double)exact.SquareDistance;
                Console.WriteLine(string.Format("Exact {0} vs Approx. {1} vs Cross {2}. Over by {3:N3}%", exact, approximate, crosscheck, (overshoot - 1.0) * 100.0));
            }

            if (overallApproximate.SquareDistance <= overallExact.SquareDistance)
            {
                Console.WriteLine("DID FIND the closest pair of points overall. Exact {0}. Approx {1}", overallExact, overallApproximate);
            }
            else
            {
                Console.WriteLine("DID NOT FIND the closest pair of points overall. Exact {0}. Approx {1}", overallExact, overallApproximate);
            }

            var everySearchCorrect = colorMatches == clusterCount && distanceMatches == clusterCount;
            var failureMessage     = string.Format(
                "Of {0} clusters, only {1} searches found the closest cluster and {2} found the shortest distance. Crosscheck = {3}",
                clusterCount,
                colorMatches,
                distanceMatches,
                crosscheckHits);

            Assert.IsTrue(everySearchCorrect, failureMessage);
        }
        /// <summary>
        /// Validate that SquareDistanceCompare agrees with a direct comparison of the measured
        /// distance between randomly chosen pairs of points, and report how often pairs
        /// are flagged as Triangulatable at several distance thresholds.
        ///
        /// Test data is a Gaussian clustering of 100 clusters in 100 dimensions, indexed by a HilbertIndex
        /// that is configured with the requested number of triangulation points.
        /// The method asserts that none of the comparisons disagreed.
        /// </summary>
        /// <param name="numTriangulationPoints">Value passed to HilbertIndex.SetTriangulation.</param>
        /// <returns>Percentage (0 to 100) of the sampled pairs for which Triangulatable returned true
        /// when given the merge distance (the MaximumSquareDistance from the ClusterCounter).</returns>
        public double SquareDistanceCompareValidationCase(int numTriangulationPoints)
        {
            const int totalComparisons = 10000;

            // Each sampled pair is checked against two thresholds, so these two counters
            // sum to 2 * totalComparisons when the loop finishes.
            var correctResult = 0;
            var wrongResult   = 0;

            // Tallies of Triangulatable outcomes at half the merge distance (extra short),
            // the merge distance (short) and five times the merge distance (long).
            var extraShortTriangulatable    = 0;
            var extraShortNotTriangulatable = 0;
            var shortTriangulatable         = 0;
            var shortNotTriangulatable      = 0;
            var longTriangulatable          = 0;
            var longNotTriangulatable       = 0;

            // 1. Make test data.
            var bitsPerDimension = 10;
            var data             = new GaussianClustering
            {
                ClusterCount   = 100,
                Dimensions     = 100,
                MaxCoordinate  = (1 << bitsPerDimension) - 1,
                MinClusterSize = 50,
                MaxClusterSize = 150
            };
            var clusters = data.MakeClusters();

            // 2. Create HilbertIndex for points.
            var hIndex = new HilbertIndex(clusters, bitsPerDimension);

            hIndex.SetTriangulation(numTriangulationPoints);

            // 3. Deduce the characteristic distance.
            var counter = new ClusterCounter
            {
                OutlierSize = 5,
                NoiseSkipBy = 10
            };
            var count         = counter.Count(hIndex.SortedPoints);
            var mergeDistance = count.MaximumSquareDistance;
            var longDistance  = 5 * mergeDistance;

            // 4. Select random pairs of the HilbertPoints points and see how many distance comparisons yield the correct result.
            var rng    = new FastRandom();
            var points = hIndex.SortedPoints.ToList();

            for (var i = 0; i < totalComparisons; i++)
            {
                var p1 = points[rng.Next(points.Count)];
                var p2 = points[rng.Next(points.Count)];

                // Reference value: the measured distance, compared directly against each threshold.
                var d = p1.Measure(p2);

                if (d.CompareTo(mergeDistance) == p1.SquareDistanceCompare(p2, mergeDistance))
                {
                    correctResult++;
                }
                else
                {
                    wrongResult++;
                }

                if (d.CompareTo(longDistance) == p1.SquareDistanceCompare(p2, longDistance))
                {
                    correctResult++;
                }
                else
                {
                    wrongResult++;
                }

                if (p1.Triangulatable(p2, mergeDistance / 2))
                {
                    extraShortTriangulatable++;
                }
                else
                {
                    extraShortNotTriangulatable++;
                }

                if (p1.Triangulatable(p2, mergeDistance))
                {
                    shortTriangulatable++;
                }
                else
                {
                    shortNotTriangulatable++;
                }

                if (p1.Triangulatable(p2, longDistance))
                {
                    longTriangulatable++;
                }
                else
                {
                    longNotTriangulatable++;
                }
            }

            // 5. Summarize how often each threshold was reported as triangulatable.
            var extraShortPct = 100.0 * extraShortTriangulatable / (extraShortTriangulatable + extraShortNotTriangulatable);
            var shortPct      = 100.0 * shortTriangulatable / (shortTriangulatable + shortNotTriangulatable);
            var longPct       = 100.0 * longTriangulatable / (longTriangulatable + longNotTriangulatable);

            Console.WriteLine($"Triangulatable? \n    XS: {extraShortPct} % \n    Short: {shortPct} % Yes {shortTriangulatable}, No {shortNotTriangulatable}\n    Long: {longPct} % Yes {longTriangulatable}, No {longNotTriangulatable}");

            // Expected value goes first so that a failure reports "expected 0 but was <wrongResult>".
            Assert.AreEqual(0, wrongResult, $"{correctResult} correct, {wrongResult} wrong");

            return shortPct;
        }