/// <summary>
/// calculateStatistics computes the statistics and properties of a single cluster that can be derived from
/// the cluster alone, i.e. without consulting any other cluster. The results are stored on the cluster object:
/// distance of every sample to the centroid, the farthest sample (index, values and distance), the
/// per-attribute mean and the per-attribute standard deviation (variance is divided by the number of samples).
/// For an empty cluster the mean, centroid and farthest sample are set to zero vectors, the maximum distance
/// is set to 0 and the nearest cluster is marked as -1.
/// </summary>
/// <param name="cls">the cluster object whose statistics are calculated and updated in place</param>
private static void calculateStatistics(Cluster cls)
{
    string errorPrefix = "Function <calculateStatistics>: ";

    try
    {
        int sampleCount = cls.ClusterData.Length;
        int attributeCount = cls.Centroid.Length;

        cls.ClusterDataDistanceToCentroid = new double[sampleCount];
        cls.Mean = new double[attributeCount];
        cls.StandardDeviation = new double[attributeCount];

        // -1 guarantees that the first sample (distance >= 0) becomes the farthest one
        cls.InClusterMaxDistance = -1;

        // empty cluster: store neutral values and stop early
        if (sampleCount == 0)
        {
            cls.InClusterFarthestSampleIndex = 0;
            cls.InClusterMaxDistance = 0;
            cls.InClusterFarthestSample = new double[attributeCount];

            for (int a = 0; a < attributeCount; a++)
            {
                cls.Mean[a] = 0;
                cls.Centroid[a] = 0;
                cls.InClusterFarthestSample[a] = 0;
            }

            cls.NearestCluster = -1;
            return;
        }

        for (int s = 0; s < sampleCount; s++)
        {
            double[] sample = cls.ClusterData[s];

            // distance of this sample to the centroid
            cls.ClusterDataDistanceToCentroid[s] = KMeans.calculateDistance(sample, cls.Centroid);
            double distance = cls.ClusterDataDistanceToCentroid[s];

            // remember the farthest sample seen so far
            if (distance > cls.InClusterMaxDistance)
            {
                cls.InClusterFarthestSampleIndex = s;
                cls.InClusterFarthestSample = sample;
                cls.InClusterMaxDistance = distance;
            }

            // accumulate this sample's contribution to the mean
            for (int a = 0; a < attributeCount; a++)
            {
                cls.Mean[a] += sample[a] / sampleCount;
            }
        }

        // variance is accumulated per attribute (samples in ascending order), then turned into the deviation
        double[] variance = new double[attributeCount];
        for (int a = 0; a < attributeCount; a++)
        {
            for (int s = 0; s < sampleCount; s++)
            {
                variance[a] += Math.Pow((cls.ClusterData[s][a] - cls.Mean[a]), 2) / sampleCount;
            }

            cls.StandardDeviation[a] = Math.Sqrt(variance[a]);
        }
    }
    catch (Exception Ex)
    {
        // any failure is reported as a KMeansException with the generic code 400
        throw new KMeansException(400, errorPrefix + "Unhandled exception:\t" + Ex.ToString());
    }
}
/// <summary>
/// calculateMoreStatistics computes the statistics of each cluster that depend on the other clusters:
/// for every cluster centroid it finds the nearest sample that does not belong to that cluster, and the
/// nearest sample that belongs to the cluster's nearest cluster. Clusters whose entry in
/// <paramref name="NearestCluster"/> is -1 are treated as empty: their samples stay null and their
/// distances and foreign cluster number are set to -1.
/// </summary>
/// <param name="RawData">data to be clustered</param>
/// <param name="DataToClusterMapping">contains the assigned cluster number for each sample of the RawData</param>
/// <param name="Centroids">the centroids of the clusters</param>
/// <param name="NearestCluster">nearest cluster number for each cluster (-1 marks an empty cluster)</param>
/// <param name="NearestForeignSampleInNearestCluster">for each cluster, the sample of its nearest cluster that is closest to this cluster's centroid</param>
/// <param name="DistanceToNearestForeignSampleInNearestCluster">for each cluster, the distance between that sample and this cluster's centroid</param>
/// <param name="NearestForeignSample">for each cluster, the sample not belonging to the cluster that is closest to this cluster's centroid</param>
/// <param name="DistanceToNearestForeignSample">for each cluster, the distance between the nearest foreign sample and this cluster's centroid</param>
/// <param name="ClusterOfNearestForeignSample">for each cluster, the cluster to which the nearest foreign sample belongs</param>
private static void calculateMoreStatistics(double[][] RawData, int[] DataToClusterMapping, double[][] Centroids, int[] NearestCluster, out double[][] NearestForeignSampleInNearestCluster, out double[] DistanceToNearestForeignSampleInNearestCluster, out double[][] NearestForeignSample, out double[] DistanceToNearestForeignSample, out int[] ClusterOfNearestForeignSample)
{
    string errorPrefix = "Function <calculateMoreStatistics>: ";

    try
    {
        int clusterCount = Centroids.Length;

        NearestForeignSampleInNearestCluster = new double[clusterCount][];
        DistanceToNearestForeignSampleInNearestCluster = new double[clusterCount];
        NearestForeignSample = new double[clusterCount][];
        DistanceToNearestForeignSample = new double[clusterCount];
        ClusterOfNearestForeignSample = new int[clusterCount];

        // initialise the per-cluster results
        for (int c = 0; c < clusterCount; c++)
        {
            bool emptyCluster = NearestCluster[c] == -1;

            // empty clusters get the marker -1, all others start at "infinitely far"
            double initialDistance = emptyCluster ? -1 : double.MaxValue;
            DistanceToNearestForeignSampleInNearestCluster[c] = initialDistance;
            DistanceToNearestForeignSample[c] = initialDistance;

            if (emptyCluster)
            {
                NearestForeignSampleInNearestCluster[c] = null;
                NearestForeignSample[c] = null;
                ClusterOfNearestForeignSample[c] = -1;
            }
        }

        // compare every sample against every centroid it does not belong to
        for (int s = 0; s < RawData.Length; s++)
        {
            for (int c = 0; c < clusterCount; c++)
            {
                int owner = DataToClusterMapping[s];
                if (owner == c)
                {
                    // the sample belongs to this cluster, so it is not foreign
                    continue;
                }

                int nearest = NearestCluster[c];
                if (nearest == -1)
                {
                    // empty cluster, nothing to compute
                    continue;
                }

                double distance = KMeans.calculateDistance(RawData[s], Centroids[c]);

                // nearest foreign sample over all other clusters
                if (distance < DistanceToNearestForeignSample[c])
                {
                    DistanceToNearestForeignSample[c] = distance;
                    NearestForeignSample[c] = RawData[s];
                    ClusterOfNearestForeignSample[c] = owner;
                }

                // nearest foreign sample restricted to the nearest cluster
                if (owner == nearest && distance < DistanceToNearestForeignSampleInNearestCluster[c])
                {
                    DistanceToNearestForeignSampleInNearestCluster[c] = distance;
                    NearestForeignSampleInNearestCluster[c] = RawData[s];
                }
            }
        }
    }
    catch (Exception Ex)
    {
        // any failure is reported as a KMeansException with the generic code 400
        throw new KMeansException(400, errorPrefix + "Unhandled exception:\t" + Ex.ToString());
    }
}
/// <summary>
/// RecommendedNumberOfClusters is a function that gives a recommended number of clusters for the given samples based on some provided methods.
/// The data is clustered with every number of clusters from <paramref name="minNumberOfClusters"/> up to
/// <paramref name="maxNumberOfClusters"/> (capped at the number of samples). For methods 0, 1 and 2 the first number of clusters
/// that satisfies the method's check is returned; for method 3 the number of clusters with the smallest variance of cluster sizes is returned.
/// </summary>
/// <param name="rawData">The samples to be clustered</param>
/// <param name="kmeansMaxIterations">Maximum allowed number of Kmeans iterations for clustering (must be at least 1)</param>
/// <param name="numberOfAttributes">Number of attributes for each sample (must be at least 1)</param>
/// <param name="maxNumberOfClusters">Maximum desired number of clusters (must be at least 2)</param>
/// <param name="minNumberOfClusters">Minimum desired number of clusters (values below 2 are raised to 2)</param>
/// <param name="method">Integer 0, 1, 2 or 3 representing the method to be used
/// <ul style="list-style-type:none">
/// <li> - Method 0: Radial method in which the farthest sample of each cluster must be closer to the cluster centroid than the nearest foreign sample of the other clusters</li>
/// <li> - Method 1: Standard Deviation method in which the standard deviation in each cluster must be less than the desired standard deviation</li>
/// <li> - Method 2: Both. Uses radial and standard deviation methods at the same time</li>
/// <li> - Method 3: Balanced clusters method in which the clusters contain the closest number of samples</li>
/// </ul>
/// </param>
/// <param name="standardDeviation">The desired standard deviation upper limit in each cluster (required for methods 1 and 2)</param>
/// <param name="kmeansAlgorithm">The desired Kmeans clustering algorithm (1 or 2); any value other than 2 is treated as 1
/// <ul style="list-style-type:none">
/// <li> - 1: Centroids are the nearest samples to the means</li>
/// <li> - 2: Centroids are the means</li>
/// </ul></param>
/// <param name="centroids">Initial Centroids</param>
/// <returns>The recommended number of clusters for the given samples based on the specified method,
/// or 0 when no number of clusters in the requested range satisfies the constraints (including the case where the range is empty).</returns>
/// <exception cref="KMeansException">Thrown with a specific code when an argument is invalid
/// (104, 122, 123, 108, 100, 107), with whatever code a called KMeans routine reports, or with code 400 for any other failure.</exception>
public int RecommendedNumberOfClusters(double[][] rawData, int kmeansMaxIterations, int numberOfAttributes, int maxNumberOfClusters, int minNumberOfClusters, int method, double[] standardDeviation, int kmeansAlgorithm = 1, double[][] centroids = null)
{
    string Message = "Function <RecommendedNumberOfClusters>: ";

    try
    {
        //some checks
        if (maxNumberOfClusters < 2)
        {
            throw new KMeansException(104, Message + "Maximum number of clusters must be at least 2");
        }

        if (minNumberOfClusters < 2)
        {
            minNumberOfClusters = 2;
        }

        if (method > 3 || method < 0)
        {
            throw new KMeansException(122, Message + "Method must be either 0,1,2 or 3");
        }

        if ((method == 1 || method == 2) && standardDeviation == null)
        {
            throw new KMeansException(123, Message + "Parameter StdDev is needed");
        }

        if (kmeansMaxIterations < 1)
        {
            throw new KMeansException(108, Message + "Unacceptable number of maximum iterations");
        }

        if (rawData == null)
        {
            throw new KMeansException(100, Message + "RawData is null");
        }

        if (numberOfAttributes < 1)
        {
            throw new KMeansException(107, Message + "Unacceptable number of attributes. Must be at least 1");
        }

        if (kmeansAlgorithm != 2)
        {
            kmeansAlgorithm = 1;
        }

        //checks that all the samples have same number of attributes
        KMeans.verifyRawDataConsistency(rawData, numberOfAttributes);

        // rawData is dereferenced only after the null check above, so a null input yields code 100
        int MaxClusters = Math.Min(rawData.Length, maxNumberOfClusters);

        // nothing can be evaluated when the requested minimum exceeds the usable maximum
        int candidateCount = MaxClusters - minNumberOfClusters + 1;
        if (candidateCount <= 0)
        {
            return 0;
        }

        double[][] Centroids;
        int IterationReached;
        double[] balancedError = new double[candidateCount];

        for (int i = minNumberOfClusters; i <= MaxClusters; i++)
        {
            //cluster the data with number of clusters equals to i
            int[] kMeansResponse = KMeans.runKMeansAlgorithm(rawData, i, numberOfAttributes, kmeansMaxIterations, kmeansAlgorithm, centroids, out Centroids, out IterationReached);

            Cluster[] cluster = ClusteringResults.CreateClusteringResult(rawData, kMeansResponse, Centroids, i);

            bool isRadial = true;
            bool isStandardDeviation = true;

            if (method == 0 || method == 2)
            {
                //radial method check
                isRadial = radialClustersCheck(cluster);
            }

            if (method == 1 || method == 2)
            {
                //standard deviation check
                isStandardDeviation = stdDeviationClustersCheck(cluster, standardDeviation);
            }

            if (method == 3)
            {
                //balanced check: variance of the cluster sizes around their average
                double[] countSamples = new double[i];
                double average = 0;

                for (int c = 0; c < i; c++)
                {
                    countSamples[c] = cluster[c].ClusterData.Length;
                    average = average + countSamples[c] / i;
                }

                double error = 0;
                for (int c = 0; c < i; c++)
                {
                    //error calculation
                    error = error + Math.Pow(countSamples[c] - average, 2) / i;
                }

                balancedError[i - minNumberOfClusters] = error;
            }
            else if (isRadial && isStandardDeviation)
            {
                // first number of clusters that satisfies the requested constraints
                return i;
            }
        }

        if (method == 3)
        {
            // get minimum value (most balanced solution); ties keep the smallest number of clusters
            int minIndex = 0;
            for (int l = 1; l < balancedError.Length; l++)
            {
                if (balancedError[l] < balancedError[minIndex])
                {
                    minIndex = l;
                }
            }

            return minIndex + minNumberOfClusters;
        }

        // could not find a recommended number of clusters based on the desired constraints
        return 0;
    }
    catch (KMeansException)
    {
        // keep the specific error code and message instead of re-wrapping them as code 400
        throw;
    }
    catch (Exception Ex)
    {
        throw new KMeansException(400, Message + "Unhandled exception:\t" + Ex.ToString());
    }
}