/// <summary>
/// Create test data in known clusters, perform unattended clustering, time the process.
/// Make no attempt to verify the correctness of the result.
/// The timing does not include the creation of the test data, just the clustering.
/// </summary>
/// <param name="numPoints">Number of points to cluster.</param>
/// <param name="clusterCount">Cluster count.</param>
/// <param name="dimensions">Dimensions per point.</param>
/// <param name="clusterSizeVariation">Cluster size variation.
/// The average number of points per cluster is numPoints/clusterCount.
/// The actual size of a given cluster will be permitted to vary by as much as ± clusterSizeVariation.
/// </param>
/// <param name="maxCoordinate">All points will have coordinate values in the range 0 to maxCoordinate.</param>
/// <returns>Time in seconds and a Boolean which is false if the clustering did not produce perfect results.</returns>
private Tuple<double, bool> ClassifyPerformance(int numPoints, int clusterCount, int dimensions,
	int clusterSizeVariation = 0, int maxCoordinate = 1000)
{
	var minClusterSize = (numPoints / clusterCount) - clusterSizeVariation;
	var maxClusterSize = (numPoints / clusterCount) + clusterSizeVariation;
	var data = new GaussianClustering
	{
		ClusterCount = clusterCount,
		Dimensions = dimensions,
		MaxCoordinate = maxCoordinate,
		MinClusterSize = minClusterSize,
		MaxClusterSize = maxClusterSize
	};
	var expectedClusters = data.MakeClusters();

	// Start timing here so data generation above is excluded from the measurement.
	var timer = Stopwatch.StartNew();
	var classifier = new HilbertClassifier(expectedClusters.Points(), 10);
	classifier.IndexConfig.UseSample = true;
	var actualClusters = classifier.Classify();
	timer.Stop();

	var success = expectedClusters.IsSimilarTo(actualClusters);
	if (!success)
	{
		Console.WriteLine($"Clustering was not perfect. # of Clusters actual/expected: {actualClusters.NumPartitions}/{expectedClusters.NumPartitions}");
	}
	// Elapsed.TotalSeconds keeps sub-millisecond precision; the previous
	// ElapsedMilliseconds / 1000.0 truncated to whole milliseconds first.
	var seconds = timer.Elapsed.TotalSeconds;
	return new Tuple<double, bool>(seconds, success);
}
/// <summary>
/// Perform an unattended clustering of Gaussian test data and assert that the
/// result matches the expected clusters with at least the given BCubed score.
/// </summary>
/// <param name="data">Generator configured with the expected cluster layout.</param>
/// <param name="acceptableBCubed">Minimum acceptable BCubed similarity score.</param>
private void ClusterCore(GaussianClustering data, double acceptableBCubed)
{
	var expected = data.MakeClusters();

	var classifier = new HilbertClassifier(expected.Points(), 10);
	classifier.IndexConfig.UseSample = true;
	var actual = classifier.Classify();

	var comparison = expected.Compare(actual);
	Console.WriteLine($" Comparison of clusters: {comparison}.\n Clusters expected/actual: {expected.NumPartitions}/{actual.NumPartitions}.");
	Console.WriteLine($" Large clusters: {actual.NumLargePartitions(classifier.OutlierSize)}");
	Assert.GreaterOrEqual(comparison.BCubed, acceptableBCubed, $"Clustering was not good enough. BCubed = {comparison.BCubed}");
}
/// <summary>
/// Cluster the Netflix data set and log timing statistics.
/// NOTE(review): this test is unfinished and fails immediately on the throw below;
/// the code after it is unreachable scaffolding kept for when the test is completed.
/// </summary>
public void ClusterNetflixData()
{
	// NotImplementedException is the standard exception for unfinished code;
	// never throw the base Exception type directly.
	throw new NotImplementedException("Test not fully written");
	var classifier = new HilbertClassifier(AllNetflixData.Points, 3);
	classifier.IndexConfig.IndexCount = 1;
	Timer.Start("Clustering Netflix data");
	var actualClusters = classifier.Classify();
	Timer.Stop("Clustering Netflix data");
	//TODO: Design a measure of success.
	LogCLusterStats(actualClusters);
	Timer.Log();
}
/// <summary>
/// Classify a pre-built set of expected clusters and assert the result is close
/// enough, as measured by BCubed.
/// </summary>
/// <param name="expectedClusters">The clustering the classifier should reproduce.</param>
/// <param name="acceptableBCubed">Minimum acceptable BCubed similarity score.</param>
private void ClassifyCase(Classification <UnsignedPoint, string> expectedClusters, double acceptableBCubed = 0.99)
{
	// Size the Hilbert index from the largest coordinate actually present.
	var largestCoordinate = expectedClusters.Points().Max(p => p.MaxCoordinate);
	var bitsPerDimension = ((int)largestCoordinate).SmallestPowerOfTwo();

	var classifier = new HilbertClassifier(expectedClusters.Points(), bitsPerDimension);
	classifier.IndexConfig.UseSample = true;
	var actualClusters = classifier.Classify();

	var comparison = expectedClusters.Compare(actualClusters);
	Console.WriteLine($" Comparison of clusters: {comparison}.\n Clusters expected/actual: {expectedClusters.NumPartitions}/{actualClusters.NumPartitions}.");
	Console.WriteLine($" Large clusters: {actualClusters.NumLargePartitions(classifier.OutlierSize)}");
	Assert.GreaterOrEqual(comparison.BCubed, acceptableBCubed, $"Clustering was not good enough. BCubed = {comparison.BCubed}");
}
/// <summary>
/// Cluster chain-shaped Gaussian test data, log quality and outlier statistics,
/// and assert the BCubed score meets the acceptable threshold.
/// </summary>
/// <param name="data">Generator configured with the expected cluster layout.</param>
/// <param name="acceptableBCubed">Minimum acceptable BCubed similarity score.</param>
/// <param name="chainLength">Number of clusters chained together per chain.</param>
private void ClusterChainCore(GaussianClustering data, double acceptableBCubed, int chainLength)
{
	var expectedClusters = data.MakeChains(chainLength);

	var classifier = new HilbertClassifier(expectedClusters.Points(), 10);
	classifier.IndexConfig.UseSample = true;
	var actualClusters = classifier.Classify();
	var comparison = expectedClusters.Compare(actualClusters);

	Logger.Info($" Comparison of clusters: {comparison}.\n Clusters expected/actual: {expectedClusters.NumPartitions}/{actualClusters.NumPartitions}.");
	Logger.Info($" Large clusters: {actualClusters.NumLargePartitions(classifier.OutlierSize)}");

	// Total the points sitting in partitions too small to count as real clusters.
	var pointsInOutliers = actualClusters.LabelToPoints.Values
		.Select(members => members.Count())
		.Where(size => size < classifier.OutlierSize)
		.Sum();
	Logger.Info($" Points in Outliers/Total Point: {pointsInOutliers} / {actualClusters.NumPoints}");

	Assert.GreaterOrEqual(comparison.BCubed, acceptableBCubed, $"Clustering was not good enough. BCubed = {comparison.BCubed}");
}
/// <summary>
/// Perform a classification of two clusters that are near enough to each other to partially overlap, causing problems.
///
/// From this we can deduce which of six cases obtain (the SplitQuality).
/// </summary>
/// <returns>A Tuple with these parts:
///   1) comparison of actual to expected (with its BCubed),
///   2) the expected number of clusters
///   3) the actual number of clusters
///   4) a qualitative assessment of the results.
/// </returns>
/// <param name="numPoints">Number of points.</param>
/// <param name="dimensions">Number of Dimensions.</param>
/// <param name="overlapPercent">Overlap percent.</param>
/// <param name="clusterSizeVariation">Cluster size variation.</param>
/// <param name="maxCoordinate">Max value of any coordinate.</param>
/// <param name="acceptablePrecision">Acceptable precision</param>
/// <param name="useDensityClassifier">If set to <c>true</c> use density classifier.</param>
private Tuple <ClusterMetric <UnsignedPoint, string>, int, int, SplitQuality> ClassifyTwoClustersHelper(int numPoints, int dimensions, double overlapPercent, int clusterSizeVariation = 0, int maxCoordinate = 1000, double acceptablePrecision = 0.98, bool useDensityClassifier = true)
{
	Logger.SetupForTests();
	// Index resolution derived from the coordinate range [0, maxCoordinate].
	var bitsPerDimension = maxCoordinate.SmallestPowerOfTwo();
	// This helper always generates exactly two overlapping clusters.
	var clusterCount = 2;
	var minClusterSize = (numPoints / clusterCount) - clusterSizeVariation;
	var maxClusterSize = (numPoints / clusterCount) + clusterSizeVariation;
	var outlierSize = 5;
	var radiusShrinkage = 0.6; // 0.7 merges too many that belong apart!
	var data = new GaussianClustering
	{
		ClusterCount = clusterCount,
		Dimensions = dimensions,
		MaxCoordinate = maxCoordinate,
		MinClusterSize = minClusterSize,
		MaxClusterSize = maxClusterSize
	};
	var expectedClusters = data.TwoClusters(overlapPercent);
	Classification <UnsignedPoint, string> actualClusters;
	if (useDensityClassifier)
	{
		// Density path: index the points, estimate a characteristic distance,
		// then classify by density using that estimate.
		var hIndex = new HilbertIndex(expectedClusters, bitsPerDimension);
		var cc = new ClusterCounter
		{
			NoiseSkipBy = 10,
			OutlierSize = outlierSize,
			ReducedNoiseSkipBy = 1
		};
		var count = cc.Count(hIndex.SortedPoints);
		// Partitions holding more than 1/6 of all points are never merged away.
		var unmergeableSize = expectedClusters.NumPoints / 6;
		var densityClassifier = new DensityClassifier(hIndex, count.MaximumSquareDistance, unmergeableSize)
		{
			MergeableShrinkage = radiusShrinkage
		};
		actualClusters = densityClassifier.Classify();
	}
	else
	{
		var classifier = new HilbertClassifier(expectedClusters.Points(), 10) { OutlierSize = outlierSize };
		//classifier.IndexConfig.NoiseSkipBy = 0;
		classifier.IndexConfig.UseSample = false;
		actualClusters = classifier.Classify();
	}
	var comparison = expectedClusters.Compare(actualClusters);
	// Map the numeric comparison onto one of the six SplitQuality outcomes.
	// NOTE(review): branch order matters — FairOverSplit must be tested before
	// BadOverSplit because their precision ranges overlap ([acceptablePrecision, 1.0)
	// is a subset of (< 1.0)); do not reorder.
	SplitQuality qualitativeResult = SplitQuality.Unsplit;
	if (comparison.BCubed >= 1.0)
	{
		qualitativeResult = SplitQuality.PerfectSplit;
	}
	else if (actualClusters.NumPartitions == 1)
	{
		// Everything ended up in one cluster: no split happened at all.
		qualitativeResult = SplitQuality.Unsplit;
	}
	else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision >= 1.0)
	{
		qualitativeResult = SplitQuality.GoodOverSplit;
	}
	else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision >= acceptablePrecision)
	{
		qualitativeResult = SplitQuality.FairOverSplit;
	}
	else if (actualClusters.NumPartitions == expectedClusters.NumPartitions && comparison.Precision >= acceptablePrecision)
	{
		qualitativeResult = SplitQuality.GoodSplit;
	}
	else if (actualClusters.NumPartitions > expectedClusters.NumPartitions && comparison.Precision < 1.0)
	{
		qualitativeResult = SplitQuality.BadOverSplit;
	}
	else // Assume correct number of clusters.
	{
		qualitativeResult = SplitQuality.BadSplit;
	}
	Logger.Info($" Quality: {qualitativeResult} Comparison: {comparison}");
	return(new Tuple <ClusterMetric <UnsignedPoint, string>, int, int, SplitQuality>(
		comparison,
		expectedClusters.NumPartitions,
		actualClusters.NumPartitions,
		qualitativeResult
	));
}