/// <summary>
/// Builds the starting state of a clustering run: an initial cluster index for every row
/// and a zero-filled mean-value matrix with one row per cluster.
/// </summary>
/// <param name="input">Data set; <c>RowCount</c> and <c>RawData</c> are read.</param>
/// <param name="options">Supplies <c>NumberOfClusters</c> and <c>RandomSeed</c>.</param>
/// <returns>
/// A new <see cref="ClusterResult"/> whose <c>ClusterAssignment</c> and
/// <c>ClusterMeanValues</c> are populated.
/// </returns>
/// <remarks>
/// If more clusters are requested than there are rows, only the first <c>RowCount</c>
/// clusters receive a row and the remaining clusters start empty. If the data set has
/// no rows, the mean-value rows are created with zero columns.
/// </remarks>
internal static ClusterResult InitializeClusters(ClusterData input, ClusterAlgorithmOptions options)
{
    ClusterResult result = new ClusterResult();
    Random random = new Random(options.RandomSeed);

    int rowCount = input.RowCount;
    int clusterCount = options.NumberOfClusters;
    result.ClusterAssignment = new int[rowCount];

    // Ensure each cluster gets one data point. The bound is clamped to the row count so
    // that requesting more clusters than rows no longer indexes past the assignment array.
    int seededRows = Math.Min(clusterCount, rowCount);
    for (int row = 0; row < seededRows; row++)
    {
        result.ClusterAssignment[row] = row;
    }

    // Assign the rest of the data points randomly (upper bound of Next is exclusive).
    for (int row = seededRows; row < rowCount; row++)
    {
        result.ClusterAssignment[row] = random.Next(0, clusterCount);
    }

    // Initialize the mean value matrix (without data). The column count is taken from the
    // first raw row; with no rows there is nothing to probe, so fall back to zero columns.
    int colCount = rowCount > 0 ? input.RawData[0].Length : 0;
    result.ClusterMeanValues = new double[clusterCount][];
    for (int cluster = 0; cluster < clusterCount; cluster++)
    {
        result.ClusterMeanValues[cluster] = new double[colCount];
    }

    return result;
}
/// <summary>
/// Runs <c>KmeansClusterAlgorithm.Analyze</c> <paramref name="repetitions"/> times for every
/// cluster count from <paramref name="minClusterCount"/> to <paramref name="maxClusterCount"/>
/// (inclusive), each time with a newly generated seed, and summarizes every cluster count
/// in one <see cref="ClusterEvaluation"/>.
/// </summary>
/// <param name="data">Data set handed unchanged to the analysis.</param>
/// <param name="minClusterCount">Smallest cluster count to evaluate.</param>
/// <param name="maxClusterCount">Largest cluster count to evaluate.</param>
/// <param name="repetitions">Number of analysis runs per cluster count; must be positive.</param>
/// <param name="normalizeData">Forwarded to <see cref="ClusterAlgorithmOptions"/>.</param>
/// <returns>
/// One evaluation per cluster count, in ascending order. Each is constructed from: the
/// cluster count, the number of runs that did not end with <c>Status.EmptyClusters</c>,
/// the rounded iteration average, the rounded smallest <c>TotalClusterToPointDistance</c>
/// among those runs (-1 when there were none), and the seed that produced it.
/// </returns>
/// <exception cref="ArgumentException">
/// <paramref name="repetitions"/> is not positive, or <paramref name="maxClusterCount"/>
/// is smaller than <paramref name="minClusterCount"/>.
/// </exception>
public static ClusterEvaluation[] Cluster(ClusterData data, int minClusterCount, int maxClusterCount, int repetitions = 20, bool normalizeData = true)
{
    if (repetitions <= 0)
    {
        throw new ArgumentException("repetitions > 0!");
    }

    // Compare directly instead of testing the sign of the difference: the subtraction can
    // wrap around for extreme arguments in unchecked arithmetic and let a bad range through.
    if (maxClusterCount < minClusterCount)
    {
        throw new ArgumentException("invalid cluster counts");
    }

    ClusterEvaluation[] ce = new ClusterEvaluation[maxClusterCount - minClusterCount + 1];

    for (int c = minClusterCount; c <= maxClusterCount; c++)
    {
        double inner = double.MaxValue; // smallest objective seen among successful runs
        int success = 0;                // runs that did not terminate with EmptyClusters
        double iter = 0;                // summed iterations of successful runs
        int bestSeed = 0;               // seed of the run that produced `inner`

        for (int i = 0; i < repetitions; i++)
        {
            int seed = GetSeed(RandomSeedGenerator.Generate());
            ClusterAlgorithmOptions options = new ClusterAlgorithmOptions(c, seed, normalizeData);
            var result = KmeansClusterAlgorithm.Analyze(data, options);

            if (result.TerminationStatus != Status.EmptyClusters)
            {
                success++;
                if (inner > result.TotalClusterToPointDistance)
                {
                    inner = result.TotalClusterToPointDistance;
                    bestSeed = seed;
                }
                iter += result.Iterations;
            }
        }

        if (success < 1)
        {
            // No usable run for this cluster count: report -1 instead of double.MaxValue.
            inner = -1;
        }

        // NOTE(review): only successful runs add to `iter`, but the average divides by
        // `repetitions`, not by `success` - confirm this is the intended definition.
        ce[c - minClusterCount] = new ClusterEvaluation(
            c,
            success,
            Math.Round(iter / (double)repetitions, 2),
            Math.Round(inner, 2),
            bestSeed);
    }

    return ce;
}
/// <summary>
/// Performs one clustering run: initializes the assignment, then alternates between
/// recomputing the cluster means and reassigning rows until a stop condition is met.
/// </summary>
/// <param name="input">Data set to cluster.</param>
/// <param name="options">
/// Run settings; <c>NormalizeData</c> and <c>MaximumIterationCount</c> are read here, and the
/// object is passed on to the helper methods.
/// </param>
/// <returns>
/// The run's <see cref="ClusterResult"/>. <c>TerminationStatus</c> is
/// <c>Status.EmptyClusters</c> when a mean update reported failure,
/// <c>Status.MaxIterationsReached</c> when the iteration count reached the limit, and
/// <c>Status.Convergence</c> when a reassignment pass changed nothing. The objective is
/// calculated for every outcome except <c>EmptyClusters</c>.
/// </returns>
public static ClusterResult Analyze(ClusterData input, ClusterAlgorithmOptions options)
{
    ClusterResult result = InitializeClusters(input, options);
    if (options.NormalizeData)
    {
        result.NormalizedData = Normalize(input);
    }
    else
    {
        result.NormalizedData = input.RawData;
    }

    bool iterationLimitHit = false;

    while (true)
    {
        bool meansUpdated = UpdateClusterMeanValues(result, input, options);
        if (!meansUpdated)
        {
            // The mean update failed, so the run is abandoned without an objective value.
            result.TerminationStatus = Status.EmptyClusters;
            return result;
        }

        bool assignmentChanged = UpdateClusterAssignment(result, input, options);
        result.Iterations++;
        iterationLimitHit = result.Iterations >= options.MaximumIterationCount;

        if (!assignmentChanged || iterationLimitHit)
        {
            break;
        }
    }

    // The iteration limit takes precedence over convergence when both hold at once.
    if (iterationLimitHit)
    {
        result.TerminationStatus = Status.MaxIterationsReached;
    }
    else
    {
        result.TerminationStatus = Status.Convergence;
    }

    CalculateObjective(result, input, options);
    return result;
}
/// <summary>
/// Reassigns every row to the cluster picked by <c>FindMinimumIndex</c> from the row's
/// distances to all current cluster means.
/// </summary>
/// <param name="result">
/// Run state; <c>ClusterMeanValues</c> and <c>NormalizedData</c> are read and
/// <c>ClusterAssignment</c> is updated in place.
/// </param>
/// <param name="data">Supplies <c>RowCount</c> and the <c>AttributeWeights</c> passed to <c>GetDistance</c>.</param>
/// <param name="options">Supplies <c>NumberOfClusters</c>.</param>
/// <returns><c>true</c> if at least one row changed its cluster; otherwise <c>false</c>.</returns>
internal static bool UpdateClusterAssignment(ClusterResult result, ClusterData data, ClusterAlgorithmOptions options)
{
    bool anyRowMoved = false;

    for (int point = 0; point < data.RowCount; point++)
    {
        // Distance from this row to every cluster mean.
        int clusterCount = options.NumberOfClusters;
        double[] distanceTo = new double[clusterCount];
        for (int k = 0; k < clusterCount; k++)
        {
            distanceTo[k] = GetDistance(
                result.ClusterMeanValues[k],
                result.NormalizedData[point],
                data.AttributeWeights);
        }

        int nearest = FindMinimumIndex(distanceTo);
        if (nearest == result.ClusterAssignment[point])
        {
            continue; // row stays where it is
        }

        anyRowMoved = true;
        result.ClusterAssignment[point] = nearest;
    }

    return anyRowMoved;
}
/// <summary>
/// Computes the objective value of the current clustering and stores it in
/// <c>result.TotalClusterToPointDistance</c>.
/// </summary>
/// <remarks>
/// For each cluster, the <c>GetDistance</c> values between the cluster mean and every row
/// assigned to it are summed; that sum is multiplied by the number of rows in the cluster,
/// and the products are added up over all clusters.
/// </remarks>
/// <param name="result">Run state; assignment, means and normalized data are read, the objective is written.</param>
/// <param name="input">Supplies <c>RowCount</c> and <c>AttributeWeights</c>.</param>
/// <param name="options">Supplies <c>NumberOfClusters</c>.</param>
private static void CalculateObjective(ClusterResult result, ClusterData input, ClusterAlgorithmOptions options)
{
    int clusterCount = options.NumberOfClusters;
    double[] distanceSums = new double[clusterCount];
    int[] memberCounts = new int[clusterCount];

    // Accumulate, per cluster, the mean-to-row distances and the number of member rows.
    for (int row = 0; row < input.RowCount; row++)
    {
        int owner = result.ClusterAssignment[row];
        double distance = GetDistance(
            result.ClusterMeanValues[owner],
            result.NormalizedData[row],
            input.AttributeWeights);

        distanceSums[owner] += distance;
        memberCounts[owner]++;
    }

    // Weight each cluster's distance sum by its size and total the products.
    double objective = 0;
    for (int cluster = 0; cluster < clusterCount; cluster++)
    {
        objective += memberCounts[cluster] * distanceSums[cluster];
    }

    result.TotalClusterToPointDistance = objective;
}
/// <summary>
/// Recomputes <c>result.ClusterMeanValues</c> from the current assignment: for each cluster
/// and column, the sum of the weighted normalized values of its rows divided by the number
/// of rows assigned to the cluster.
/// </summary>
/// <param name="result">Run state; assignment and normalized data are read, means are overwritten.</param>
/// <param name="data">Supplies <c>ColumnCount</c> and <c>AttributeWeights</c>.</param>
/// <param name="options">Supplies <c>NumberOfClusters</c>.</param>
/// <returns>
/// <c>false</c> as soon as a cluster without any assigned row is encountered (means of
/// clusters with a lower index have already been written at that point); otherwise <c>true</c>.
/// </returns>
internal static bool UpdateClusterMeanValues(ClusterResult result, ClusterData data, ClusterAlgorithmOptions options)
{
    int clusterCount = options.NumberOfClusters;
    int columnCount = data.ColumnCount;

    // Per-cluster column sums, plus a separate tally of how many rows each cluster owns.
    double[][] weightedSums = new double[clusterCount][];
    int[] memberCounts = new int[clusterCount];
    for (int k = 0; k < clusterCount; k++)
    {
        weightedSums[k] = new double[columnCount];
    }

    // NOTE(review): values are multiplied by AttributeWeights here, and the same weights
    // are also handed to GetDistance elsewhere in this file - confirm that the weighting
    // is not applied twice (GetDistance is not visible from this method).
    for (int row = 0; row < result.ClusterAssignment.Length; row++)
    {
        int owner = result.ClusterAssignment[row];
        memberCounts[owner]++;

        double[] sums = weightedSums[owner];
        for (int col = 0; col < columnCount; col++)
        {
            sums[col] += data.AttributeWeights[col] * result.NormalizedData[row][col];
        }
    }

    // Turn the sums into means; give up at the first cluster that owns no rows.
    for (int k = 0; k < clusterCount; k++)
    {
        int members = memberCounts[k];
        if (members == 0)
        {
            return false;
        }

        for (int col = 0; col < columnCount; col++)
        {
            result.ClusterMeanValues[k][col] = weightedSums[k][col] / members;
        }
    }

    return true;
}