/// <summary>
/// Performs KMeans partitioning (clustering) on the data. This is done on all
/// parameters within the DataLines except for the dates, since they are not
/// easily normalized (without losing the meaning).
/// </summary>
/// <param name="k">The number of partitions to use.</param>
/// <param name="data">The dataset that needs partitioning.</param>
/// <returns>A list of KMeanCluster together containing all the entries from
/// the original dataset.</returns>
public static List <KMeanCluster> KMeansPartition(int k, List <DataLine> data)
{
    // Seed the k clusters with the first k entries of a deterministic shuffle.
    data.Shuffle(1337);
    KMeanCluster[] partitions = new KMeanCluster[k];
    for (int i = 0; i < k; i++)
    {
        partitions[i] = new KMeanCluster(data[i]);
    }

    int round = 0;
    while (round < 10000) // upper cutoff so the loop is guaranteed to terminate
    {
        // Assign every entry to the cluster with the least dissimilar centroid.
        // Ties go to the lowest-index cluster (same as a stable OrderBy+First).
        foreach (DataLine line in data)
        {
            KMeanCluster nearest = partitions[0];
            double best = Dissimilarity(line, nearest);
            for (int c = 1; c < k; c++)
            {
                double dist = Dissimilarity(line, partitions[c]);
                if (dist < best)
                {
                    best = dist;
                    nearest = partitions[c];
                }
            }
            nearest.AddMember(line);
        }

        // Recompute each cluster's centroid from its current members.
        KMeanCluster[] recomputed = new KMeanCluster[k];
        for (int c = 0; c < k; c++)
        {
            recomputed[c] = partitions[c].CalcCentroid();
        }

        // Stop as soon as the centroids no longer move; otherwise iterate on
        // the recomputed clusters.
        if (!Changed(partitions, recomputed))
        {
            break;
        }
        partitions = recomputed;

        round++;
        Console.WriteLine("Iteration " + round + " done.");
    }

    return(partitions.ToList());
}
/// <summary>
/// Computes the dissimilarity (Manhattan / L1 distance) between a data entry
/// and a cluster's centroid, summed over all keys of the centroid.
/// </summary>
/// <param name="a">The data entry to compare.</param>
/// <param name="c">The cluster whose centroid is compared against.</param>
/// <returns>The sum of absolute differences between the entry's values and
/// the centroid's values.</returns>
private static double Dissimilarity(DataLine a, KMeanCluster c)
{
    Dictionary <string, double> centroid = c.Centroid;
    // Bug fix: the original "x ?? 0.0 - kv.Value" parsed as "x ?? (0.0 - kv.Value)"
    // because ?? binds looser than "-", so every NON-null entry ignored the
    // centroid value entirely and contributed |x| instead of |x - centroid|.
    // Parenthesize so a missing value defaults to 0.0 BEFORE subtracting.
    // NOTE(review): treating a missing value as 0.0 may still skew distances —
    // this was already flagged by the original TODO; confirm against the data.
    return(centroid.Sum(kv => Math.Abs((a.hashDoubles[kv.Key] ?? 0.0) - kv.Value)));
}