/// <summary>
/// Performs hierarchical clustering based on a matrix of distances.
/// </summary>
/// <param name="distMatrix">The matrix of distances. It is lower triangular, excluding the diagonal.</param>
/// <param name="linkage">Specifies the linkage for the clustering.</param>
/// <param name="preserveOrder">If true, the order-preserving ("linear") variant of the chosen linkage is used.</param>
/// <param name="periodic">Forwarded to the order-preserving variants; not used otherwise.</param>
/// <param name="nthreads">Forwarded to the non-order-preserving variants; not used otherwise.</param>
/// <param name="progress">Not used by this overload.</param>
/// <returns>An array of cluster nodes defining the resulting tree.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="linkage"/> is none of Average, Maximum or Single.
/// (This type derives from <see cref="ArgumentException"/>.)
/// </exception>
public HierarchicalClusterNode[] TreeCluster(MatrixIndexer distMatrix, HierarchicalClusterLinkage linkage,
    bool preserveOrder, bool periodic, int nthreads, Action<int> progress)
{
    // The average distance is only consumed by the non-order-preserving algorithms,
    // so it is computed lazily in those branches instead of unconditionally up front.
    switch (linkage)
    {
        case HierarchicalClusterLinkage.Average:
            return preserveOrder
                ? AverageLinkageClusterLinear(distMatrix, periodic)
                : AverageLinkageCluster(distMatrix, nthreads, CalcAverageDistance(distMatrix));
        case HierarchicalClusterLinkage.Maximum:
            return preserveOrder
                ? MaximumLinkageClusterLinear(distMatrix, periodic)
                : MaximumLinkageCluster(distMatrix, nthreads, CalcAverageDistance(distMatrix));
        case HierarchicalClusterLinkage.Single:
            return preserveOrder
                ? SingleLinkageClusterLinear(distMatrix, periodic)
                : SingleLinkageCluster(distMatrix, nthreads, CalcAverageDistance(distMatrix));
        default:
            throw new ArgumentOutOfRangeException(nameof(linkage), linkage, "Unsupported linkage type.");
    }
}
/// <summary>
/// Agglomerative clustering that only merges pairs accepted by <c>FindClosestPairLinear</c>
/// for the current element positions. The distance matrix is updated in place.
/// </summary>
/// <param name="matrix">Lower-triangular distance matrix; it is overwritten while clustering.</param>
/// <param name="periodic">Forwarded to <c>FindClosestPairLinear</c>.</param>
/// <param name="linkage">
/// Combines two distances into the distance of the merged cluster. It is called as
/// (distance to first cluster, size of first cluster, distance to second cluster, size of second cluster).
/// </param>
/// <returns>One node per merge step, in merge order (RowCount - 1 nodes).</returns>
private static HierarchicalClusterNode[] GenericLinkageClusterLinear(MatrixIndexer matrix, bool periodic,
    Func<double, int, double, int, double> linkage)
{
    int count = matrix.RowCount;
    int[] clusterIds = new int[count];
    int[] sizes = new int[count];
    int[] positions = new int[count];
    HierarchicalClusterNode[] tree = ArrayUtils.FillArray(unused => new HierarchicalClusterNode(), count - 1);
    for (int k = 0; k < count; k++)
    {
        clusterIds[k] = k;
        sizes[k] = 1;
        positions[k] = k;
    }
    for (int active = count; active > 1; active--)
    {
        int step = count - active;
        int last = active - 1;
        tree[step].distance = FindClosestPairLinear(active, matrix, out int hi, out int lo, positions, periodic,
            out bool reverse, out bool carryOver);
        if (reverse)
        {
            tree[step].left = clusterIds[lo];
            tree[step].right = clusterIds[hi];
        }
        else
        {
            tree[step].left = clusterIds[hi];
            tree[step].right = clusterIds[lo];
        }
        // Fold cluster 'hi' into slot 'lo': recompute all distances to the merged cluster.
        for (int k = 0; k < lo; k++)
        {
            matrix[lo, k] = linkage(matrix[hi, k], sizes[hi], matrix[lo, k], sizes[lo]);
        }
        for (int k = lo + 1; k < hi; k++)
        {
            matrix[k, lo] = linkage(matrix[hi, k], sizes[hi], matrix[k, lo], sizes[lo]);
        }
        for (int k = hi + 1; k < active; k++)
        {
            matrix[k, lo] = linkage(matrix[k, hi], sizes[hi], matrix[k, lo], sizes[lo]);
        }
        // Move the last active cluster into the freed slot 'hi'.
        for (int k = 0; k < hi; k++)
        {
            matrix[hi, k] = matrix[last, k];
        }
        for (int k = hi + 1; k < last; k++)
        {
            matrix[k, hi] = matrix[last, k];
        }
        sizes[lo] += sizes[hi];
        sizes[hi] = sizes[last];
        // Merged clusters receive negative ids: -1 for the first merge, -2 for the second, ...
        clusterIds[lo] = active - count - 1;
        clusterIds[hi] = clusterIds[last];
        positions[lo] = Math.Min(positions[lo], positions[hi]);
        positions[hi] = positions[last];
        if (!carryOver)
        {
            // Close the gap in the ordering: everything located after the merged cluster shifts down by one.
            int mergedPosition = positions[lo];
            for (int p = 0; p < positions.Length; p++)
            {
                if (positions[p] > mergedPosition)
                {
                    positions[p]--;
                }
            }
        }
    }
    return tree;
}
/// <summary>
/// Runs k-means clustering. Identical rows are collapsed via <c>ExtractUniqueRows</c> before
/// clustering, and the assignments are mapped back to every original row afterwards.
/// </summary>
/// <param name="data">Data matrix; one entry of <paramref name="clusterIndices"/> is produced per row.</param>
/// <param name="k">Number of clusters.</param>
/// <param name="maxIter">Maximal number of iterations, if not converging.</param>
/// <param name="restarts">Forwarded to <c>GenerateClustersImpl</c>.</param>
/// <param name="progress">Forwarded to <c>GenerateClustersImpl</c>.</param>
/// <param name="clusterCenters">Cluster centers as produced by <c>GenerateClustersImpl</c>.</param>
/// <param name="clusterIndices">Array of length <c>data.RowCount</c> filled by <c>RestoreRowIndices</c>.</param>
public static void GenerateClusters(MatrixIndexer data, int k, int maxIter, int restarts, Action<int> progress,
    out float[,] clusterCenters, out int[] clusterIndices)
{
    // Step 1: cluster only the distinct rows.
    ExtractUniqueRows(data, out var rowsByContent, out var distinctRows);
    GenerateClustersImpl(distinctRows, k, maxIter, restarts, progress, out clusterCenters,
        out var distinctAssignments);

    // Step 2: expand the per-distinct-row assignments back to all original rows.
    int[] expanded = new int[data.RowCount];
    RestoreRowIndices(rowsByContent, distinctAssignments, expanded);
    clusterIndices = expanded;
}
/// <summary>
/// Agglomerative clustering that repeatedly merges the closest pair of clusters and
/// updates the distance matrix in place using the supplied linkage rule.
/// </summary>
/// <param name="distMatrix">Lower-triangular distance matrix; it is overwritten while clustering.</param>
/// <param name="nthreads">Forwarded to <c>FindClosestPairDistance</c>.</param>
/// <param name="defaultDist">Forwarded to <c>FindClosestPairDistance</c>.</param>
/// <param name="linkage">
/// Combines two distances into the distance of the merged cluster. It is called as
/// (distance to first cluster, size of first cluster, distance to second cluster, size of second cluster).
/// </param>
/// <returns>One node per merge step, in merge order (RowCount - 1 nodes).</returns>
private static HierarchicalClusterNode[] GenericLinkageCluster(MatrixIndexer distMatrix, int nthreads,
    double defaultDist, Func<double, int, double, int, double> linkage)
{
    int count = distMatrix.RowCount;
    int[] clusterIds = new int[count];
    int[] sizes = new int[count];
    HierarchicalClusterNode[] tree = ArrayUtils.FillArray(unused => new HierarchicalClusterNode(), count - 1);
    for (int k = 0; k < count; k++)
    {
        clusterIds[k] = k;
        sizes[k] = 1;
    }
    for (int active = count; active > 1; active--)
    {
        int step = count - active;
        int last = active - 1;
        double dist = FindClosestPairDistance(active, distMatrix, out int hi, out int lo, nthreads, defaultDist);
        if (hi == -1)
        {
            // No pair was found: join slots 1 and 0 slightly above the previous merge height
            // (or at height 1 when this is the very first merge).
            hi = 1;
            lo = 0;
            dist = step > 0 ? tree[step - 1].distance * 1.01 : 1;
        }
        tree[step].distance = dist;
        tree[step].left = clusterIds[hi];
        tree[step].right = clusterIds[lo];
        // Fold cluster 'hi' into slot 'lo': recompute all distances to the merged cluster.
        for (int k = 0; k < lo; k++)
        {
            distMatrix[lo, k] = linkage(distMatrix[hi, k], sizes[hi], distMatrix[lo, k], sizes[lo]);
        }
        for (int k = lo + 1; k < hi; k++)
        {
            distMatrix[k, lo] = linkage(distMatrix[hi, k], sizes[hi], distMatrix[k, lo], sizes[lo]);
        }
        for (int k = hi + 1; k < active; k++)
        {
            distMatrix[k, lo] = linkage(distMatrix[k, hi], sizes[hi], distMatrix[k, lo], sizes[lo]);
        }
        // Move the last active cluster into the freed slot 'hi'.
        for (int k = 0; k < hi; k++)
        {
            distMatrix[hi, k] = distMatrix[last, k];
        }
        for (int k = hi + 1; k < last; k++)
        {
            distMatrix[k, hi] = distMatrix[last, k];
        }
        sizes[lo] += sizes[hi];
        sizes[hi] = sizes[last];
        // Merged clusters receive negative ids: -1 for the first merge, -2 for the second, ...
        clusterIds[lo] = active - count - 1;
        clusterIds[hi] = clusterIds[last];
    }
    return tree;
}
/// <summary>
/// Reports whether <c>IsNanOrInfColumn</c> is true for at least one column of the matrix.
/// </summary>
/// <param name="m">Matrix whose columns are inspected in ascending order.</param>
/// <returns>True as soon as one column is flagged; false if none is.</returns>
public static bool HasNaNOrInfColumns(MatrixIndexer m)
{
    int column = 0;
    while (column < m.ColumnCount)
    {
        if (m.IsNanOrInfColumn(column))
        {
            return true;
        }
        column++;
    }
    return false;
}
/// <summary>
/// Performs a hierarchical clustering on the given data matrix.
/// </summary>
/// <param name="data">Data matrix that is going to be clustered.</param>
/// <param name="access">Specifies whether rows or columns are to be clustered.</param>
/// <param name="distance">Defines the distance between two elements.</param>
/// <param name="linkage">Specifies the linkage for the clustering.</param>
/// <param name="preserveOrder">Forwarded to the distance-matrix overload of TreeCluster.</param>
/// <param name="periodic">Forwarded to the distance-matrix overload of TreeCluster.</param>
/// <param name="nthreads">Forwarded to the distance-matrix overload of TreeCluster.</param>
/// <param name="progress">Forwarded to the distance-matrix overload of TreeCluster.</param>
/// <returns>
/// An array of cluster nodes defining the resulting tree; an empty array when fewer than
/// two elements are available.
/// </returns>
public HierarchicalClusterNode[] TreeCluster(MatrixIndexer data, MatrixAccess access, IDistance distance,
    HierarchicalClusterLinkage linkage, bool preserveOrder, bool periodic, int nthreads, Action<int> progress)
{
    bool clusterRows = access == MatrixAccess.Rows;
    int elementCount = clusterRows ? data.RowCount : data.ColumnCount;
    if (elementCount >= 2)
    {
        float[,] pairwise = DistanceMatrix(data, distance, access);
        return TreeCluster(pairwise, linkage, preserveOrder, periodic, nthreads, progress);
    }
    // Nothing to merge with fewer than two elements.
    return new HierarchicalClusterNode[0];
}
/// <summary>
/// Runs k-means clustering. Identical rows are collapsed via <c>ExtractUniqueRows</c> before
/// clustering, and the assignments are mapped back to every original row afterwards.
/// </summary>
/// <param name="data">Data matrix; one entry of <paramref name="clusterIndices"/> is produced per row.</param>
/// <param name="k">Number of clusters.</param>
/// <param name="maxIter">Maximal number of iterations, if not converging.</param>
/// <param name="restarts">Forwarded to <c>GenerateClustersImpl</c>.</param>
/// <param name="progress">Forwarded to <c>GenerateClustersImpl</c>.</param>
/// <param name="clusterCenters">Cluster centers as produced by <c>GenerateClustersImpl</c>.</param>
/// <param name="clusterIndices">Array of length <c>data.RowCount</c> filled by <c>RestoreRowIndices</c>.</param>
public static void GenerateClusters(MatrixIndexer data, int k, int maxIter, int restarts, Action<int> progress,
    out float[,] clusterCenters, out int[] clusterIndices)
{
    // Cluster only the distinct rows ...
    ExtractUniqueRows(data, out Dictionary<EquatableArray<double>, List<int>> rowsByContent,
        out double[][] distinctRows);
    GenerateClustersImpl(distinctRows, k, maxIter, restarts, progress, out clusterCenters,
        out int[] distinctAssignments);

    // ... then expand the assignments back to all original rows.
    int[] expanded = new int[data.RowCount];
    RestoreRowIndices(rowsByContent, distinctAssignments, expanded);
    clusterIndices = expanded;
}
/// <summary>
/// Dispatches to the column-wise or row-wise NaN/infinity check depending on <paramref name="access"/>.
/// </summary>
/// <param name="m">Matrix to inspect.</param>
/// <param name="access">Selects <c>HasNaNOrInfColumns</c> (Columns) or <c>HasNaNOrInfRows</c> (Rows).</param>
/// <returns>The result of the selected check.</returns>
/// <exception cref="NotImplementedException">For any other value of <paramref name="access"/>.</exception>
public static bool HasNanOrInf(MatrixIndexer m, MatrixAccess access)
{
    if (access == MatrixAccess.Columns)
    {
        return HasNaNOrInfColumns(m);
    }
    if (access == MatrixAccess.Rows)
    {
        return HasNaNOrInfRows(m);
    }
    throw new NotImplementedException($"Not implemented for access {access}");
}
/// <summary>
/// Finds the smallest entry in the first <paramref name="n"/> rows of the lower triangle of
/// <paramref name="distMatrix"/>, optionally splitting the row range across several threads.
/// </summary>
/// <param name="n">Number of rows (active clusters) to search.</param>
/// <param name="distMatrix">Lower-triangular distance matrix; it is only read here.</param>
/// <param name="ip">Row index of the closest pair, or -1 if no pair was found.</param>
/// <param name="jp">Column index of the closest pair, or -1 if no pair was found.</param>
/// <param name="nthreads">Number of worker threads; the search is single-threaded when this is at most 1 or n is at most 1000.</param>
/// <param name="defaultDist">Value returned when no pair was found.</param>
/// <returns>The smallest distance found, or <paramref name="defaultDist"/> if none.</returns>
private static double FindClosestPairDistance(int n, MatrixIndexer distMatrix, out int ip, out int jp,
    int nthreads, double defaultDist)
{
    if (nthreads <= 1 || n <= 1000)
    {
        return FindClosestPairDistance(0, n, distMatrix, out ip, out jp, defaultDist);
    }
    // Row boundaries chosen so that every thread scans roughly the same number of matrix cells.
    // The pair count is computed in double precision: n * (n - 1) * k overflows int
    // already for moderately large n and thread counts.
    double pairCount = (double)n * (n - 1);
    int[] nk = new int[nthreads + 1];
    for (int k = 0; k < nthreads + 1; k++)
    {
        nk[k] = (int)Math.Round(0.5 + Math.Sqrt(0.25 + pairCount * k / nthreads));
    }
    int[] ips = new int[nthreads];
    int[] jps = new int[nthreads];
    double[] dists = new double[nthreads];
    Thread[] workers = new Thread[nthreads];
    for (int i = 0; i < nthreads; i++)
    {
        int index0 = i; // per-iteration copy so that each closure sees its own index
        workers[i] = new Thread(
            new ThreadStart(
                delegate
                {
                    dists[index0] = FindClosestPairDistance(nk[index0], nk[index0 + 1], distMatrix,
                        out ips[index0], out jps[index0], defaultDist);
                }));
        workers[i].Start();
    }
    for (int i = 0; i < nthreads; i++)
    {
        workers[i].Join();
    }
    ip = -1;
    jp = -1;
    double distance = double.MaxValue;
    for (int i = 0; i < nthreads; i++)
    {
        // A partition without any valid pair reports defaultDist together with index -1.
        // Such a result must not win against a real minimum found by another partition.
        if (ips[i] != -1 && dists[i] < distance)
        {
            distance = dists[i];
            ip = ips[i];
            jp = jps[i];
        }
    }
    if (distance == double.MaxValue)
    {
        return defaultDist;
    }
    return distance;
}
/// <summary>
/// Create distance matrix from <see cref="IDistance"/>.
/// The distances of all row pairs (i, j) with i &lt; j are stored consecutively,
/// ordered by i and then by j.
/// </summary>
/// <param name="data">Data matrix; each row is one element.</param>
/// <param name="distance">Distance evaluated for every pair of rows.</param>
/// <exception cref="OverflowException">The number of row pairs does not fit into an <c>int</c>.</exception>
public GenericDistanceMatrix(MatrixIndexer data, IDistance distance)
{
    N = data.RowCount;
    // N * (N - 1) overflows int for N > 46341, so the pair count is computed in 64 bits
    // and narrowed with an explicit overflow check.
    int pairCount = checked((int)((long)N * (N - 1) / 2));
    distances = new double[pairCount];
    int k = 0;
    for (int i = 0; i < N; i++)
    {
        BaseVector xi = data.GetRow(i);
        for (int j = i + 1; j < N; j++)
        {
            distances[k++] = distance.Get(xi, data.GetRow(j));
        }
    }
}
/// <summary>
/// Computes all pairwise distances between the rows or columns of <paramref name="data"/>.
/// </summary>
/// <param name="data">Data matrix.</param>
/// <param name="distance">Distance evaluated for every pair of elements.</param>
/// <param name="access">Selects whether rows or columns are the elements.</param>
/// <returns>
/// A square array in which only the entries below the diagonal (row index greater than
/// column index) are filled; all other entries keep their default value 0.
/// </returns>
private static float[,] DistanceMatrix(MatrixIndexer data, IDistance distance, MatrixAccess access)
{
    int nelements = (access == MatrixAccess.Rows) ? data.RowCount : data.ColumnCount;
    float[,] result = new float[nelements, nelements];
    // Row 0 has no entries below the diagonal, so the outer loop starts at 1.
    for (int i = 1; i < nelements; i++)
    {
        // Fetch the i-th vector once per row instead of once per pair.
        BaseVector vi = GetVector(data, i, access);
        for (int j = 0; j < i; j++)
        {
            result[i, j] = (float)distance.Get(vi, GetVector(data, j, access));
        }
    }
    return result;
}
/// <summary>
/// Collapses identical rows of <paramref name="data"/>.
/// </summary>
/// <param name="data">Data matrix whose rows are compared through <c>EquatableArray</c> keys.</param>
/// <param name="rowIndexMap">For each distinct row content, the indices of all rows having it, in ascending order.</param>
/// <param name="reducedData">The distinct rows, in order of first occurrence.</param>
private static void ExtractUniqueRows(MatrixIndexer data,
    out Dictionary<EquatableArray<double>, List<int>> rowIndexMap, out double[][] reducedData)
{
    var uniqueRows = new List<double[]>();
    rowIndexMap = new Dictionary<EquatableArray<double>, List<int>>();
    for (int row = 0; row < data.RowCount; row++)
    {
        var rowArray = data.GetRow(row).ToArray();
        var rowEqArray = new EquatableArray<double>(rowArray);
        // Single lookup instead of ContainsKey + Add + indexer.
        if (!rowIndexMap.TryGetValue(rowEqArray, out List<int> indices))
        {
            indices = new List<int>();
            rowIndexMap.Add(rowEqArray, indices);
            uniqueRows.Add(rowArray);
        }
        indices.Add(row);
    }
    reducedData = uniqueRows.ToArray();
}
/// <summary>
/// Averages all finite entries below the diagonal of <paramref name="distMatrix"/>.
/// NaN and infinite entries are skipped.
/// </summary>
/// <param name="distMatrix">Distance matrix; only entries with column index smaller than row index are read.</param>
/// <returns>Sum of the finite entries divided by their number (NaN when there is none, as 0 / 0).</returns>
private static double CalcAverageDistance(MatrixIndexer distMatrix)
{
    double sum = 0;
    double finiteCount = 0;
    for (int row = 0; row < distMatrix.RowCount; row++)
    {
        for (int col = 0; col < row; col++)
        {
            double value = distMatrix[row, col];
            if (double.IsNaN(value) || double.IsInfinity(value))
            {
                continue;
            }
            sum += value;
            finiteCount++;
        }
    }
    return sum / finiteCount;
}
/// <summary>
/// Run K-medoid clustering. Medoid selection and cluster assignment are alternated
/// until the cost changes by less than 1E-10 between two iterations.
/// </summary>
/// <param name="data">data matrix with n rows</param>
/// <param name="distance">Pairwise distances used for medoid selection, assignment and cost.</param>
/// <param name="k">number of clusters k &lt; n</param>
/// <returns>Array of length n. <code>assignment[i]</code> returns the index of the cluster medoid in the data matrix.</returns>
public static int[] GenerateClusters(MatrixIndexer data, IDistanceMatrix distance, int k)
{
    var n = data.RowCount;
    var medoids = SelectInitialMedoids(distance, k, n);
    var assignments = AssignClusters(distance, medoids, n);
    var cost = CalculateCost(distance, assignments);
    while (true)
    {
        medoids = SelectMedoids(distance, assignments);
        assignments = AssignClusters(distance, medoids, n);
        var newCost = CalculateCost(distance, assignments);
        bool converged = Math.Abs(newCost - cost) < 1E-10;
        // A NaN cost can never satisfy the convergence test, and two equal infinite costs
        // give Inf - Inf = NaN. Stop in these cases instead of iterating forever.
        bool degenerate = double.IsNaN(newCost) || (double.IsInfinity(newCost) && newCost == cost);
        if (converged || degenerate)
        {
            break;
        }
        cost = newCost;
    }
    return assignments;
}
/// <summary>
/// Finds the smallest entry in the first <paramref name="n"/> rows of the lower triangle among
/// the pairs accepted by <c>ValidPositions</c>. Afterwards every entry in that range that is
/// smaller than the found value is raised to it.
/// </summary>
/// <param name="n">Number of rows to search.</param>
/// <param name="matrix">Lower-triangular distance matrix; entries may be overwritten (see summary).</param>
/// <param name="ip">Row index of the chosen pair, or -1 if none was accepted.</param>
/// <param name="jp">Column index of the chosen pair, or -1 if none was accepted.</param>
/// <param name="position">Current position of each element, passed to <c>ValidPositions</c>.</param>
/// <param name="periodic">Passed to <c>ValidPositions</c>.</param>
/// <param name="reverse">First flag reported by <c>ValidPositions</c> for the chosen pair.</param>
/// <param name="carryOver">Second flag reported by <c>ValidPositions</c> for the chosen pair.</param>
/// <returns>The distance of the chosen pair, or <c>double.MaxValue</c> if none was accepted.</returns>
private static double FindClosestPairLinear(int n, MatrixIndexer matrix, out int ip, out int jp,
    IList<int> position, bool periodic, out bool reverse, out bool carryOver)
{
    ip = -1;
    jp = -1;
    reverse = false;
    carryOver = false;
    double best = double.MaxValue;
    for (int row = 0; row < n; row++)
    {
        for (int col = 0; col < row; col++)
        {
            if (!ValidPositions(position[row], position[col], n, periodic, out bool swapped, out bool wraps))
            {
                continue;
            }
            if (matrix[row, col] < best)
            {
                best = matrix[row, col];
                ip = row;
                jp = col;
                reverse = swapped;
                carryOver = wraps;
            }
        }
    }
    // Raise every smaller entry to the chosen distance.
    for (int row = 0; row < n; row++)
    {
        for (int col = 0; col < row; col++)
        {
            if (matrix[row, col] < best)
            {
                matrix[row, col] = (float)best;
            }
        }
    }
    return best;
}
/// <summary>
/// Scans rows <paramref name="nmin"/> (inclusive) to <paramref name="nmax"/> (exclusive) of the
/// lower triangle of <paramref name="matrix"/> for the smallest entry.
/// </summary>
/// <param name="nmin">First row to scan.</param>
/// <param name="nmax">Row at which scanning stops (exclusive).</param>
/// <param name="matrix">Lower-triangular distance matrix; it is only read here.</param>
/// <param name="ip">Row index of the smallest entry, or -1 if no entry below <c>double.MaxValue</c> exists.</param>
/// <param name="jp">Column index of the smallest entry, or -1 if no entry below <c>double.MaxValue</c> exists.</param>
/// <param name="defaultDist">Value returned when no entry was found.</param>
/// <returns>The smallest entry found, or <paramref name="defaultDist"/> if none.</returns>
private static double FindClosestPairDistance(int nmin, int nmax, MatrixIndexer matrix, out int ip, out int jp,
    double defaultDist)
{
    ip = -1;
    jp = -1;
    double best = double.MaxValue;
    for (int row = nmin; row < nmax; row++)
    {
        for (int col = 0; col < row; col++)
        {
            double candidate = matrix[row, col];
            if (candidate < best)
            {
                best = candidate;
                ip = row;
                jp = col;
            }
        }
    }
    return best == double.MaxValue ? defaultDist : best;
}
/// <summary>
/// Searches the matrix row by row for the first cell whose value is 0.
/// </summary>
/// <returns>
/// The coordinates of the first free cell in row-major order, or (0, 0) if the matrix
/// contains no cell with value 0.
/// </returns>
private MatrixIndexer FindFreeCell()
{
    for (int row = 0; row < this.DimensionLength; row++)
    {
        for (int col = 0; col < this.DimensionLength; col++)
        {
            if (this.matrix[row, col] == 0)
            {
                // Return immediately: a plain 'break' would only leave the inner loop and
                // let later rows overwrite the cell that was already found.
                return new MatrixIndexer(row, col);
            }
        }
    }

    return new MatrixIndexer(0, 0);
}
/// <summary>
/// Run K-medoid clustering. This overload is a placeholder that always throws.
/// </summary>
/// <param name="data">data matrix with n rows</param>
/// <param name="distance">Not used; see the exception message.</param>
/// <param name="k">number of clusters k &lt; n</param>
/// <returns>Never returns normally.</returns>
/// <exception cref="NotImplementedException">Always.</exception>
// TODO allow calling GenericDistanceMatrix without circular dependency; the intended body is
//   return GenerateClusters(data, new GenericDistanceMatrix(data, distance), k);
public static int[] GenerateClusters(MatrixIndexer data, IDistance distance, int k) =>
    throw new NotImplementedException("Use GenericDistanceMatrix to convert IDistance to IDistanceMatrix");
/// <summary>
/// Hierarchical clustering with a k-means pre-clustering step. The elements are first grouped
/// into <paramref name="nmeans"/> k-means clusters, the cluster centers are clustered
/// hierarchically, and each k-means cluster is then attached as a terminal branch.
/// Falls back to plain <c>TreeCluster</c> when there are at most <paramref name="nmeans"/> elements.
/// </summary>
/// <param name="data">Data matrix that is going to be clustered.</param>
/// <param name="access">Specifies whether rows or columns are to be clustered.</param>
/// <param name="distance">Defines the distance between two elements.</param>
/// <param name="linkage">Specifies the linkage for the clustering.</param>
/// <param name="preserveOrder">Forwarded to <c>TreeCluster</c>.</param>
/// <param name="periodic">Forwarded to <c>TreeCluster</c>.</param>
/// <param name="nthreads">Forwarded to <c>TreeCluster</c>.</param>
/// <param name="nmeans">Number of k-means clusters.</param>
/// <param name="restarts">Forwarded to <c>KmeansClustering.GenerateClusters</c>.</param>
/// <param name="maxIter">Forwarded to <c>KmeansClustering.GenerateClusters</c>.</param>
/// <param name="progress">Forwarded to k-means and to <c>TreeCluster</c>.</param>
/// <returns>An array of (number of elements - 1) cluster nodes defining the resulting tree.</returns>
public HierarchicalClusterNode[] TreeClusterKmeans(MatrixIndexer data, MatrixAccess access, IDistance distance,
    HierarchicalClusterLinkage linkage, bool preserveOrder, bool periodic, int nthreads, int nmeans, int restarts,
    int maxIter, Action<int> progress)
{
    int nelements = (access == MatrixAccess.Rows) ? data.RowCount : data.ColumnCount;
    if (nelements <= nmeans)
    {
        return TreeCluster(data, access, distance, linkage, preserveOrder, periodic, nthreads, progress);
    }
    float[,] c;
    int[] inds;
    if (access == MatrixAccess.Rows)
    {
        KmeansClustering.GenerateClusters(data, nmeans, maxIter, restarts, progress, out c, out inds);
    }
    else
    {
        // k-means works on rows, so columns are clustered through the transposed matrix.
        KmeansClustering.GenerateClusters(data.Transpose(), nmeans, maxIter, restarts, progress, out c, out inds);
    }
    // Hierarchical tree over the k-means cluster centers.
    float[,] distMatrix = DistanceMatrix(new FloatMatrixIndexer(c), distance, MatrixAccess.Rows);
    HierarchicalClusterNode[] nodes = TreeCluster(distMatrix, linkage, preserveOrder, periodic, nthreads, progress);
    Dictionary<int, int[]> clusters;
    Dictionary<int, int> singletons;
    RearrangeClusters(inds, c.GetLength(0), out clusters, out singletons);
    // The center tree occupies the tail of the result; the leading 'fill' slots receive
    // the terminal branches of the k-means clusters.
    HierarchicalClusterNode[] newNodes = new HierarchicalClusterNode[nelements - 1];
    int fill = nelements - nmeans;
    Array.Copy(nodes, 0, newNodes, fill, nodes.Length);
    int pos = 0;
    for (int i = fill; i < newNodes.Length; i++)
    {
        HierarchicalClusterNode node = newNodes[i];
        node.left = RemapKmeansChild(node.left, fill, clusters, singletons, newNodes, ref pos);
        node.right = RemapKmeansChild(node.right, fill, clusters, singletons, newNodes, ref pos);
    }
    return newNodes;
}

/// <summary>
/// Translates one child reference of the center tree into a reference that is valid in the full tree.
/// </summary>
/// <param name="child">Child reference taken from a node of the center tree.</param>
/// <param name="fill">Number of leading slots in <paramref name="newNodes"/> reserved for terminal branches.</param>
/// <param name="clusters">Members of each k-means cluster that needs a terminal branch, keyed by center index.</param>
/// <param name="singletons">The single member of each one-element k-means cluster, keyed by center index.</param>
/// <param name="newNodes">Full tree; terminal branches are copied into it starting at <paramref name="pos"/>.</param>
/// <param name="pos">Next free slot for terminal branches; advanced by the length of a copied branch.</param>
/// <returns>The translated child reference; <paramref name="child"/> itself if no rule applies.</returns>
private int RemapKmeansChild(int child, int fill, Dictionary<int, int[]> clusters,
    Dictionary<int, int> singletons, HierarchicalClusterNode[] newNodes, ref int pos)
{
    if (child < 0)
    {
        // Reference to an inner node of the center tree: shift by the number of inserted slots.
        return child - fill;
    }
    if (singletons.TryGetValue(child, out int single))
    {
        return single;
    }
    if (clusters.TryGetValue(child, out int[] members))
    {
        HierarchicalClusterNode[] branch = FillTerminalBranch(members, pos);
        Array.Copy(branch, 0, newNodes, pos, branch.Length);
        pos += branch.Length;
        // The last node of the copied branch is its root; inner nodes are referenced as -(index + 1).
        return -pos;
    }
    return child;
}
/// <summary>
/// Fetches one row or one column of <paramref name="data"/>, depending on <paramref name="access"/>.
/// </summary>
/// <param name="data">Source matrix.</param>
/// <param name="index">Index of the row or column to fetch.</param>
/// <param name="access">Rows selects <c>GetRow</c>; any other value selects <c>GetColumn</c>.</param>
/// <returns>The requested vector.</returns>
private static BaseVector GetVector(MatrixIndexer data, int index, MatrixAccess access)
{
    if (access == MatrixAccess.Rows)
    {
        return data.GetRow(index);
    }
    return data.GetColumn(index);
}
/// <summary>
/// Runs <c>GenericLinkageCluster</c> with <c>AverageLinkage</c> as the rule for combining distances.
/// </summary>
/// <param name="distMatrix">Forwarded unchanged.</param>
/// <param name="nthreads">Forwarded unchanged.</param>
/// <param name="defaultDist">Forwarded unchanged.</param>
/// <returns>The nodes produced by <c>GenericLinkageCluster</c>.</returns>
private static HierarchicalClusterNode[] AverageLinkageCluster(MatrixIndexer distMatrix, int nthreads,
    double defaultDist) =>
    GenericLinkageCluster(distMatrix, nthreads, defaultDist, AverageLinkage);
/// <summary>
/// Runs <c>GenericLinkageClusterLinear</c> with <c>MaximumLinkage</c> as the rule for combining distances.
/// </summary>
/// <param name="matrix">Forwarded unchanged.</param>
/// <param name="periodic">Forwarded unchanged.</param>
/// <returns>The nodes produced by <c>GenericLinkageClusterLinear</c>.</returns>
private static HierarchicalClusterNode[] MaximumLinkageClusterLinear(MatrixIndexer matrix, bool periodic) =>
    GenericLinkageClusterLinear(matrix, periodic, MaximumLinkage);