/// <summary>
/// Makes sure the classification named by <c>_classificationName</c> is present in the database
/// at <c>_databasePath</c>, adding it when the existence check reports that it is missing.
/// </summary>
private void VerifyExists()
{
    bool alreadyPresent = Classification.Exists(_databasePath, _classificationName, out _errOut);
    if (alreadyPresent)
    {
        return;
    }

    // The return value of Add is deliberately not inspected here; _errOut is overwritten by the call.
    Classification.Add(_databasePath, _classificationName, out _errOut);
}
/// <summary>
/// Calls VerifyDoesntExist, adds the classification, and checks that the add call returned true.
/// </summary>
public void AddTest()
{
    VerifyDoesntExist();

    bool added = Classification.Add(_databasePath, _classificationName, out _errOut);

    // _errOut is handed to the assertion helper together with the result.
    General.HasTrueValue(added, _errOut);
}
/// <summary>
/// Adds synthetic noise points positioned between neighboring clusters and labels each one
/// with one of the two clusters it was interpolated from.
/// </summary>
/// <remarks>
/// Each noise point is a weighted average of one randomly chosen point from each of two paired clusters.
/// It is assigned to whichever of those two clusters holds the member point with the smaller
/// Measure value to it (the second cluster of the pair wins ties) and is also recorded in <c>Noise</c>.
/// Points are drawn from a snapshot of the clusters taken before any noise is added.
/// </remarks>
/// <param name="noisePointsToAdd">Number of noise points to add. Nothing is done when this is zero or negative.</param>
/// <param name="clusterCenters">Cluster centers keyed by cluster id. Not read by this method.</param>
/// <param name="clusters">Classification that receives the noise points.</param>
private void AddNoise(int noisePointsToAdd, Dictionary<string, UnsignedPoint> clusterCenters, Classification<UnsignedPoint, string> clusters)
{
    if (noisePointsToAdd <= 0)
    {
        return;
    }

    // Pair every cluster with a nearby cluster as reported by the approximate closest-point search.
    // Noise will be placed in the space between the two clusters of a pair.
    var neighborFinder = new PolyChromaticClosestPoint<string>(clusters);
    var neighborPairs = new List<Tuple<string, string>>();
    foreach (var label in clusters.ClassLabels())
    {
        var nearest = neighborFinder.FindClusterApproximately(label).Swap(label);
        neighborPairs.Add(Tuple.Create<string, string>(nearest.Color1, nearest.Color2));
    }

    // Random access by index is needed below, so copy each cluster's points into a list once.
    var pointsByLabel = new Dictionary<string, List<UnsignedPoint>>();
    foreach (var entry in clusters.LabelToPoints)
    {
        pointsByLabel[entry.Key] = entry.Value.ToList();
    }

    // Blending a point from each cluster of a neighboring pair tends to build a filament between them.
    // Such filaments are the likeliest cause of two distinct clusters being merged by mistake,
    // which is the error a clustering algorithm under test has to cope with.
    for (var remaining = noisePointsToAdd; remaining > 0; remaining--)
    {
        var pairIndex = r.Next(neighborPairs.Count);
        var chosenPair = neighborPairs[pairIndex];

        // Scaling by 0.64 and offsetting by 0.18 keeps the weight away from 0 and 1, so most noise
        // points fall between the clusters rather than inside one (NextDouble presumably yields [0, 1)).
        var firstWeight = r.NextDouble() * 0.64 + 0.18;
        var secondWeight = 1.0 - firstWeight;

        var firstCluster = pointsByLabel[chosenPair.Item1];
        var secondCluster = pointsByLabel[chosenPair.Item2];
        var firstPoint = firstCluster[r.Next(firstCluster.Count)];
        var secondPoint = secondCluster[r.Next(secondCluster.Count)];

        var blended = new int[Dimensions];
        for (var d = 0; d < blended.Length; d++)
        {
            blended[d] = (int)(firstWeight * firstPoint.Coordinates[d] + secondWeight * secondPoint.Coordinates[d]);
        }
        var noisePoint = new UnsignedPoint(blended);

        // Label the noise point with the cluster that has the closest member point.
        var distanceToFirst = firstCluster.Select(member => noisePoint.Measure(member)).Min();
        var distanceToSecond = secondCluster.Select(member => noisePoint.Measure(member)).Min();
        string owner;
        if (distanceToFirst < distanceToSecond)
        {
            owner = chosenPair.Item1;
        }
        else
        {
            owner = chosenPair.Item2;
        }

        clusters.Add(noisePoint, owner);
        Noise.Add(noisePoint);
    }
}
/// <summary>
/// Generates random points grouped into separated clusters, one EllipsoidalGenerator per cluster,
/// then adds noise points via AddNoise in proportion to <c>NoisePercentage</c>.
/// </summary>
/// <remarks>
/// Replaces the random number generator <c>r</c> with a new FastRandom on every call.
/// Writes a summary line with the farthest point-to-center distance to the debug output.
/// </remarks>
/// <returns>Points that are grouped into clusters and stored in a Classification.</returns>
public Classification<UnsignedPoint, string> MakeClusters()
{
    var result = new Classification<UnsignedPoint, string>();
    r = new FastRandom();

    var farthestFromCenter = 0.0;
    var minDistance = EllipsoidalGenerator.MinimumSeparation(MaxDistanceStdDev, Dimensions);

    // Keep the cluster centers away from the edges, so that points do not go out of bounds
    // and have their coordinates truncated.
    var centerGenerator = new DiffuseGenerator(Dimensions, minDistance);
    centerGenerator.Minimum = MaxDistanceStdDev;
    centerGenerator.Maximum = MaxCoordinate - MaxDistanceStdDev;

    var clusterIndex = 0;
    var centersById = new Dictionary<string, UnsignedPoint>();
    var usableCenters = centerGenerator.Take(ClusterCount).Where(candidate => candidate != null);
    foreach (var centerCoordinates in usableCenters)
    {
        var center = new UnsignedPoint(centerCoordinates);

        // Sizes listed in ClusterSizes are reused cyclically; otherwise the size is drawn at random.
        var clusterSize = ClusterSizes.Length > 0
            ? ClusterSizes[clusterIndex % ClusterSizes.Length]
            : r.Next(MinClusterSize, MaxClusterSize);

        var spreads = RandomDoubles(Dimensions, MinDistanceStdDev, MaxDistanceStdDev, r);
        var pointGenerator = new EllipsoidalGenerator(centerCoordinates, spreads, Dimensions);
        var clusterId = clusterIndex.ToString();

        foreach (var unused in Enumerable.Range(1, clusterSize))
        {
            var member = new UnsignedPoint(pointGenerator.Generate(new int[Dimensions]));
            result.Add(member, clusterId);

            // Track the largest point-to-center distance seen, for the diagnostic message below.
            var distance = Math.Sqrt(center.Measure(member));
            farthestFromCenter = Math.Max(farthestFromCenter, distance);
        }

        centersById[clusterId] = center;
        clusterIndex++;
    }

    var noiseCount = (int)Math.Floor(result.NumPoints * NoisePercentage / 100);
    AddNoise(noiseCount, centersById, result);

    Debug.WriteLine(
        "Test data: Farthest Distance from center = {0:N2}. Minimum Distance Permitted between Clusters = {1:N2}. Max Standard Deviation = {2}",
        farthestFromCenter,
        minDistance,
        MaxDistanceStdDev
    );
    return result;
}
/// <summary>
/// Make a Classification of N-dimensional data where each input is an array of integers
/// whose final element is the number of its category and whose other elements are coordinates.
/// </summary>
/// <remarks>
/// The number of coordinates is taken from the first array; every array is read using that same length.
/// An empty list yields an empty Classification.
/// </remarks>
/// <param name="pointsPlusClass">Data to classify.</param>
/// <returns>A Classification of the points.</returns>
public static Classification<UnsignedPoint, string> MakeClassification(IList<int[]> pointsPlusClass)
{
    var c = new Classification<UnsignedPoint, string>();

    // With no rows there is no first element to take the dimensionality from, and nothing to classify.
    if (pointsPlusClass.Count == 0)
    {
        return c;
    }

    // The last number for each point is its category.
    var dimensions = pointsPlusClass[0].Length - 1;
    foreach (var pointPlusClass in pointsPlusClass)
    {
        var point = new UnsignedPoint(pointPlusClass.Take(dimensions).ToArray());

        // Invariant culture keeps the category label independent of the current culture.
        c.Add(point, pointPlusClass[dimensions].ToString(CultureInfo.InvariantCulture));
    }
    return c;
}
/// <summary>
/// Generates random points grouped into chains, where each chain is a sequence of segment centers
/// and every segment contributes points from its own EllipsoidalGenerator. All points of one chain
/// share a single cluster id.
/// </summary>
/// <remarks>
/// Replaces the random number generator <c>r</c> with a new FastRandom on every call.
/// Each segment receives <c>clusterSize / chainLength</c> points (integer division), so a chain
/// can hold fewer points than the chosen cluster size.
/// </remarks>
/// <param name="chainLength">Passed to the chain generator and used to divide the cluster size among segments.</param>
/// <returns>Points that are grouped into clusters and stored in a Classification.</returns>
public Classification<UnsignedPoint, string> MakeChains(int chainLength)
{
    var result = new Classification<UnsignedPoint, string>();
    r = new FastRandom();

    var minDistance = EllipsoidalGenerator.MinimumSeparation(MaxDistanceStdDev, Dimensions);

    // Keep the centers away from the edges, so that points do not go out of bounds
    // and have their coordinates truncated.
    var chainGenerator = new ChainGenerator(Dimensions, minDistance);
    chainGenerator.Minimum = MaxDistanceStdDev;
    chainGenerator.Maximum = MaxCoordinate - MaxDistanceStdDev;

    var segmentLength = (int)(MinDistanceStdDev * Math.Sqrt(Dimensions) / 3);
    var nonEmptyChains = chainGenerator
        .Chains(chainLength, segmentLength)
        .Take(ClusterCount)
        .Where(candidate => candidate.Any());

    var chainIndex = 0;
    foreach (var chain in nonEmptyChains)
    {
        // Not read again in this method.
        // NOTE(review): confirm that constructing these points has no needed side effect before removing.
        var segmentCenterPoints = chain.Select(center => new UnsignedPoint(center)).ToList();

        // Sizes listed in ClusterSizes are reused cyclically; otherwise the size is drawn at random.
        var clusterSize = ClusterSizes.Length > 0
            ? ClusterSizes[chainIndex % ClusterSizes.Length]
            : r.Next(MinClusterSize, MaxClusterSize);

        // Having decided on an overall cluster size, each segment gets an equal share of the points.
        var pointsPerSegment = clusterSize / chainLength;
        var clusterId = chainIndex.ToString();

        // One point generator per segment of the chain, created just before its points are drawn.
        foreach (var segmentCenter in chain)
        {
            var spreads = RandomDoubles(Dimensions, MinDistanceStdDev, MaxDistanceStdDev, r);
            var pointGenerator = new EllipsoidalGenerator(segmentCenter, spreads, Dimensions);

            foreach (var unused in Enumerable.Range(1, pointsPerSegment))
            {
                var member = new UnsignedPoint(pointGenerator.Generate(new int[Dimensions]));
                result.Add(member, clusterId);
            }
        }

        chainIndex++;
    }

    return result;
}
/// <summary>
/// Creates two random clusters whose centers differ only in the first coordinate, so that they
/// are fully separated, partly overlapping, or fully overlapping.
///
/// NOTE: This type of setup is to test divisive clustering, which divides two partly mixed clusters.
/// </summary>
/// <remarks>
/// Replaces the random number generator <c>r</c> with a new FastRandom on every call.
/// The second center is the first center shifted along coordinate zero by
/// <c>minDistance * (100 - overlapPercent) / 100</c>, truncated to an integer.
/// </remarks>
/// <param name="overlapPercent">A number from zero to 100 (the value is not range-checked).
/// If zero, the second center is shifted by the full minimum separation.
/// If fifty, it is shifted by half of it, so the clusters partly overlap.
/// If 100, the clusters have the same center, so are indistinguishable.</param>
/// <returns>The two clusters, labeled "0" and "1".</returns>
public Classification<UnsignedPoint, string> TwoClusters(double overlapPercent)
{
    var result = new Classification<UnsignedPoint, string>();
    r = new FastRandom();

    var farthestFromCenter = 0.0;
    var minDistance = EllipsoidalGenerator.MinimumSeparation(MaxDistanceStdDev, Dimensions);

    // Keep the centers away from the edges, so that points do not go out of bounds and have their
    // coordinates truncated. The maximum is pulled in further because the second center is obtained
    // by shifting one coordinate in the higher direction.
    var centerGenerator = new DiffuseGenerator(Dimensions, minDistance);
    centerGenerator.Minimum = MaxDistanceStdDev;
    centerGenerator.Maximum = MaxCoordinate - MaxDistanceStdDev - (int)minDistance;

    var firstCenter = centerGenerator.Take(1).FirstOrDefault();
    var secondCenter = (int[])firstCenter.Clone();
    secondCenter[0] += (int)(minDistance * (100.0 - overlapPercent) / 100.0);
    var centers = new[] { firstCenter, secondCenter };

    for (var clusterIndex = 0; clusterIndex < centers.Length; clusterIndex++)
    {
        var centerCoordinates = centers[clusterIndex];
        var center = new UnsignedPoint(centerCoordinates);
        var clusterSize = r.Next(MinClusterSize, MaxClusterSize);
        var spreads = RandomDoubles(Dimensions, MinDistanceStdDev, MaxDistanceStdDev, r);
        var pointGenerator = new EllipsoidalGenerator(centerCoordinates, spreads, Dimensions);
        var clusterId = clusterIndex.ToString();

        foreach (var unused in Enumerable.Range(1, clusterSize))
        {
            var member = new UnsignedPoint(pointGenerator.Generate(new int[Dimensions]));
            result.Add(member, clusterId);

            // The farthest distance is tracked but not reported or returned by this method.
            var distance = Math.Sqrt(center.Measure(member));
            farthestFromCenter = Math.Max(farthestFromCenter, distance);
        }
    }

    //TODO: Go back and recluster the points. Put each point into the cluster whose centroid
    //      it is nearest. Thus, if two clusters partly overlap, the points from one will be pushed into the other.
    return result;
}