Esempio n. 1
0
        /// <summary>
        /// Generate test data in known clusters, run unattended clustering, and time the process.
        /// No attempt is made to verify correctness beyond a similarity check.
        /// Only the classification is timed; generating the test data is excluded.
        /// </summary>
        /// <param name="numPoints">Number of points to cluster.</param>
        /// <param name="clusterCount">Number of clusters to create.</param>
        /// <param name="dimensions">Dimensions per point.</param>
        /// <param name="clusterSizeVariation">Cluster size variation.
        ///  The average number of points per cluster is numPoints/clusterCount.
        ///  The actual size of a given cluster will be permitted to vary by as much as ± clusterSizeVariation.
        /// </param>
        /// <param name="maxCoordinate">All points will have coordinate values in the range 0 to maxCoordinate.</param>
        /// <returns>Time in seconds and a Boolean which is false if the clustering did not produce perfect results.</returns>
        private Tuple <double, bool> ClassifyPerformance(int numPoints, int clusterCount, int dimensions,
                                                         int clusterSizeVariation = 0, int maxCoordinate = 1000)
        {
            // Cluster sizes are centered on the average and allowed to vary by ±clusterSizeVariation.
            var averageClusterSize = numPoints / clusterCount;
            var data = new GaussianClustering
            {
                ClusterCount   = clusterCount,
                Dimensions     = dimensions,
                MaxCoordinate  = maxCoordinate,
                MinClusterSize = averageClusterSize - clusterSizeVariation,
                MaxClusterSize = averageClusterSize + clusterSizeVariation
            };
            var expectedClusters = data.MakeClusters();

            // Time only the classification, not the test-data generation above.
            var timer = Stopwatch.StartNew();
            var classifier = new HilbertClassifier(expectedClusters.Points(), 10);
            classifier.IndexConfig.UseSample = true;
            var actualClusters = classifier.Classify();
            timer.Stop();

            var success = expectedClusters.IsSimilarTo(actualClusters);
            if (!success)
            {
                Console.WriteLine($"Clustering was not perfect. # of Clusters actual/expected: {actualClusters.NumPartitions}/{expectedClusters.NumPartitions}");
            }
            var seconds = timer.ElapsedMilliseconds / 1000.0;

            return new Tuple <double, bool>(seconds, success);
        }
Esempio n. 2
0
        /// <summary>
        /// Cluster Gaussian test data and assert that the clustering quality (BCubed) meets a threshold.
        /// Logs the cluster comparison and the number of large (non-outlier) clusters found.
        /// </summary>
        /// <param name="data">Test data generator whose clusters define the expected result.</param>
        /// <param name="acceptableBCubed">Minimum BCubed score for the assertion to pass.</param>
        private void ClusterCore(GaussianClustering data, double acceptableBCubed)
        {
            var expected = data.MakeClusters();

            var classifier = new HilbertClassifier(expected.Points(), 10);
            classifier.IndexConfig.UseSample = true;

            var actual     = classifier.Classify();
            var comparison = expected.Compare(actual);

            Console.WriteLine($"   Comparison of clusters: {comparison}.\n   Clusters expected/actual: {expected.NumPartitions}/{actual.NumPartitions}.");
            Console.WriteLine($"   Large clusters: {actual.NumLargePartitions(classifier.OutlierSize)}");
            Assert.GreaterOrEqual(comparison.BCubed, acceptableBCubed, $"Clustering was not good enough. BCubed = {comparison.BCubed}");
        }
        /// <summary>
        /// Placeholder test for clustering the Netflix data set.
        /// Currently disabled: it throws immediately because no measure of success has been designed yet.
        /// </summary>
        public void ClusterNetflixData()
        {
            // NOTE(review): everything after this throw is unreachable (compiler warning CS0162);
            // the code below is retained as a sketch of the intended test.
            throw new Exception("Test not fully written");
            var classifier = new HilbertClassifier(AllNetflixData.Points, 3);

            // Use a single Hilbert index rather than sampling several curves.
            classifier.IndexConfig.IndexCount = 1;
            Timer.Start("Clustering Netflix data");
            var actualClusters = classifier.Classify();

            Timer.Stop("Clustering Netflix data");

            //TODO: Design a measure of success.

            LogCLusterStats(actualClusters);
            Timer.Log();
        }
Esempio n. 4
0
        /// <summary>
        /// Classify the points of an expected clustering and assert that the BCubed quality meets a threshold.
        /// The Hilbert curve resolution (bits per dimension) is derived from the largest coordinate present.
        /// </summary>
        /// <param name="expectedClusters">The clustering the classifier is expected to reproduce.</param>
        /// <param name="acceptableBCubed">Minimum BCubed score for the assertion to pass.</param>
        private void ClassifyCase(Classification <UnsignedPoint, string> expectedClusters, double acceptableBCubed = 0.99)
        {
            // Size the index to the data: enough bits per dimension to span the largest coordinate.
            var largestCoordinate = expectedClusters.Points().Select(p => p.MaxCoordinate).Max();
            var bitsPerDimension  = ((int)largestCoordinate).SmallestPowerOfTwo();

            var classifier = new HilbertClassifier(expectedClusters.Points(), bitsPerDimension);
            classifier.IndexConfig.UseSample = true;

            var actualClusters = classifier.Classify();
            var comparison     = expectedClusters.Compare(actualClusters);

            Console.WriteLine($"   Comparison of clusters: {comparison}.\n   Clusters expected/actual: {expectedClusters.NumPartitions}/{actualClusters.NumPartitions}.");
            Console.WriteLine($"   Large clusters: {actualClusters.NumLargePartitions(classifier.OutlierSize)}");
            Assert.GreaterOrEqual(comparison.BCubed, acceptableBCubed, $"Clustering was not good enough. BCubed = {comparison.BCubed}");
        }
Esempio n. 5
0
        /// <summary>
        /// Cluster chain-shaped Gaussian test data and assert that the BCubed quality meets a threshold.
        /// Also logs how many points ended up in outlier (smaller than OutlierSize) clusters.
        /// </summary>
        /// <param name="data">Test data generator whose chained clusters define the expected result.</param>
        /// <param name="acceptableBCubed">Minimum BCubed score for the assertion to pass.</param>
        /// <param name="chainLength">Chain length passed to MakeChains — presumably the number of clusters linked per chain; confirm against GaussianClustering.</param>
        private void ClusterChainCore(GaussianClustering data, double acceptableBCubed, int chainLength)
        {
            var expectedClusters = data.MakeChains(chainLength);
            var classifier       = new HilbertClassifier(expectedClusters.Points(), 10);

            classifier.IndexConfig.UseSample = true;
            var actualClusters = classifier.Classify();
            var comparison     = expectedClusters.Compare(actualClusters);

            Logger.Info($"   Comparison of clusters: {comparison}.\n   Clusters expected/actual: {expectedClusters.NumPartitions}/{actualClusters.NumPartitions}.");
            Logger.Info($"   Large clusters: {actualClusters.NumLargePartitions(classifier.OutlierSize)}");

            // Any cluster smaller than OutlierSize counts as an outlier; sum the points stranded in them.
            var pointsInOutliers = actualClusters.LabelToPoints.Values
                                   .Select(values => values.Count())
                                   .Where(count => count < classifier.OutlierSize)
                                   .Sum();

            // Fixed log-message typo: "Total Point" -> "Total Points".
            Logger.Info($"   Points in Outliers/Total Points: {pointsInOutliers} / {actualClusters.NumPoints}");
            Assert.GreaterOrEqual(comparison.BCubed, acceptableBCubed, $"Clustering was not good enough. BCubed = {comparison.BCubed}");
        }
Esempio n. 6
0
        /// <summary>
        /// Perform a classification of two clusters that are near enough to each other to partially overlap, causing problems.
        ///
        /// From this we can deduce which of six cases obtain (the SplitQuality).
        /// </summary>
        /// <returns>A Tuple with these parts:
        ///   1) comparison of actual to expected (with its BCubed),
        ///   2) the expected number of clusters
        ///   3) the actual number of clusters
        ///   4) a qualitative assessment of the results.
        /// </returns>
        /// <param name="numPoints">Number of points.</param>
        /// <param name="dimensions">Number of Dimensions.</param>
        /// <param name="overlapPercent">Overlap percent.</param>
        /// <param name="clusterSizeVariation">Cluster size variation.</param>
        /// <param name="maxCoordinate">Max value of any coordinate.</param>
        /// <param name="acceptablePrecision">Acceptable precision</param>
        /// <param name="useDensityClassifier">If set to <c>true</c> use density classifier.</param>
        private Tuple <ClusterMetric <UnsignedPoint, string>, int, int, SplitQuality> ClassifyTwoClustersHelper(int numPoints, int dimensions, double overlapPercent,
                                                                                                                int clusterSizeVariation = 0, int maxCoordinate = 1000, double acceptablePrecision = 0.98, bool useDensityClassifier = true)
        {
            Logger.SetupForTests();

            const int clusterCount = 2;
            const int outlierSize  = 5;
            const double radiusShrinkage = 0.6;            // 0.7 merges too many that belong apart!

            var bitsPerDimension   = maxCoordinate.SmallestPowerOfTwo();
            var averageClusterSize = numPoints / clusterCount;
            var data = new GaussianClustering
            {
                ClusterCount   = clusterCount,
                Dimensions     = dimensions,
                MaxCoordinate  = maxCoordinate,
                MinClusterSize = averageClusterSize - clusterSizeVariation,
                MaxClusterSize = averageClusterSize + clusterSizeVariation
            };
            var expectedClusters = data.TwoClusters(overlapPercent);

            Classification <UnsignedPoint, string> actualClusters;

            if (useDensityClassifier)
            {
                // Count clusters on a Hilbert index first, then merge by density using the
                // measured maximum square distance as the merge criterion.
                var hIndex  = new HilbertIndex(expectedClusters, bitsPerDimension);
                var counter = new ClusterCounter
                {
                    NoiseSkipBy = 10, OutlierSize = outlierSize, ReducedNoiseSkipBy = 1
                };
                var count = counter.Count(hIndex.SortedPoints);

                var unmergeableSize   = expectedClusters.NumPoints / 6;
                var densityClassifier = new DensityClassifier(hIndex, count.MaximumSquareDistance, unmergeableSize)
                {
                    MergeableShrinkage = radiusShrinkage
                };
                actualClusters = densityClassifier.Classify();
            }
            else
            {
                var classifier = new HilbertClassifier(expectedClusters.Points(), 10)
                {
                    OutlierSize = outlierSize
                };
                //classifier.IndexConfig.NoiseSkipBy = 0;
                classifier.IndexConfig.UseSample = false;
                actualClusters = classifier.Classify();
            }

            var comparison    = expectedClusters.Compare(actualClusters);
            var actualCount   = actualClusters.NumPartitions;
            var expectedCount = expectedClusters.NumPartitions;
            var overSplit     = actualCount > expectedCount;

            // Classify the outcome. Branch order matters: perfect result first, then unsplit,
            // then the over-split cases in descending precision, then the remainder.
            SplitQuality qualitativeResult;

            if (comparison.BCubed >= 1.0)
            {
                qualitativeResult = SplitQuality.PerfectSplit;
            }
            else if (actualCount == 1)
            {
                qualitativeResult = SplitQuality.Unsplit;
            }
            else if (overSplit && comparison.Precision >= 1.0)
            {
                qualitativeResult = SplitQuality.GoodOverSplit;
            }
            else if (overSplit && comparison.Precision >= acceptablePrecision)
            {
                qualitativeResult = SplitQuality.FairOverSplit;
            }
            else if (actualCount == expectedCount && comparison.Precision >= acceptablePrecision)
            {
                qualitativeResult = SplitQuality.GoodSplit;
            }
            else if (overSplit && comparison.Precision < 1.0)
            {
                qualitativeResult = SplitQuality.BadOverSplit;
            }
            else             // Assume correct number of clusters.
            {
                qualitativeResult = SplitQuality.BadSplit;
            }

            Logger.Info($"  Quality: {qualitativeResult}  Comparison: {comparison}");

            return new Tuple <ClusterMetric <UnsignedPoint, string>, int, int, SplitQuality>(
                       comparison,
                       expectedCount,
                       actualCount,
                       qualitativeResult
                       );
        }