/// <summary>
/// Builds one <see cref="SilhouetteStats"/> entry per row of <paramref name="cluster"/>.
/// For every row index the <c>a</c> field is set to the value returned by
/// <c>AvgDistance</c> for that index over <paramref name="distMatrix"/>, and the
/// <c>b</c> field is initialised to <see cref="Double.MaxValue"/>.
/// </summary>
/// <param name="cluster">Cluster whose row count determines the number of entries produced.</param>
/// <param name="distMatrix">Distance matrix handed to <c>AvgDistance</c> for each row index.</param>
/// <returns>An array holding one entry per row of the cluster.</returns>
private SilhouetteStats[] CoumputeAvgIntraClusterDistances(VectorMatrix cluster, double[][] distMatrix)
{
    int rowCount = cluster.RowCount;
    SilhouetteStats[] stats = new SilhouetteStats[rowCount];

    int row = 0;
    while (row < rowCount)
    {
        // a = average distance for this row; b starts at the largest possible value.
        stats[row].a = AvgDistance(row, distMatrix);
        stats[row].b = Double.MaxValue;
        row++;
    }

    return stats;
}
/// <summary>
/// Averages <c>VectorMatrix.Distance</c> between <paramref name="vector"/> and every row
/// of the cluster stored at <c>Clusters[clusterId]</c>.
/// </summary>
/// <param name="vector">Vector to measure from.</param>
/// <param name="clusterId">Index into <c>Clusters</c> of the cluster to measure against.</param>
/// <returns>
/// Sum of the distances divided by the cluster's row count. A cluster with zero rows
/// yields <see cref="Double.NaN"/> (0.0 / 0).
/// </returns>
private double AvgDistanceToCluster(DenseVector vector, int clusterId)
{
    var members = Clusters[clusterId];
    int memberCount = members.RowCount;

    // Accumulate in row order so the floating-point sum is reproducible.
    double total = 0.0D;
    int idx = 0;
    while (idx < memberCount)
    {
        total += VectorMatrix.Distance(vector, members[idx]);
        idx++;
    }

    return total / memberCount;
}
private const int OneGBytesAvailable = 1073741824; // 1 GiB (2^30 bytes)

/// <summary>
/// Partitions the vector file at <paramref name="filePath"/> by cluster, registers one
/// (optionally sampled) <see cref="VectorMatrix"/> per cluster, loads cluster 0 and uses it
/// to estimate how many additional vectors fit in the memory budget.
/// </summary>
/// <param name="filePath">
/// Path of the User-ClusterId-Vector file. The per-cluster files are read from the same
/// folder as "{clusterIndex}.csv".
/// </param>
/// <param name="sampleRate">
/// Fraction of each cluster's rows to load, e.g. 0.01 = 1%. Values outside (0, 1] load
/// every row. Regardless of the rate, at least 500 rows (or all rows when the cluster
/// has fewer) are requested per cluster.
/// </param>
/// <param name="maxMemory">
/// Number of gigabytes of RAM to budget for clusters cached in memory.
/// Values that are not greater than zero (including NaN) fall back to 3.0.
/// </param>
/// <exception cref="InvalidDataException">The partitioning step produced no clusters.</exception>
public Silhouette(string filePath, float sampleRate, float maxMemory)
{
    const int minRowsToLoad = 500;
    const float defaultMaxMemoryGBytes = 3.0F;

    VectorMatrix userClusters = new VectorMatrix(filePath, false);

    // Split the big User-ClusterId-Vector stream into smaller files, one per cluster
    var clusterCounts = userClusters.PartitionOnColunmIndex(2);
    string rootFolderPath = Directory.GetParent(filePath).FullName;

    int clusterCount = clusterCounts.Count;
    if (clusterCount < 1)
    {
        // Without this guard the code below fails on Clusters[0] with an
        // IndexOutOfRangeException that says nothing about the cause.
        throw new InvalidDataException(string.Format("No clusters were found in '{0}'", filePath));
    }

    Clusters = new VectorMatrix[clusterCount];
    bool useSampling = (sampleRate > 0.0F) && (sampleRate <= 1.0F);

    for (int ci = 0; ci < clusterCount; ci++)
    {
        int rowsOnDisk = clusterCounts[ci];
        int maxRowsToLoad = useSampling ? (int)(sampleRate * rowsOnDisk) : rowsOnDisk;

        // Make sure we load at least minRowsToLoad rows (or the actual number of rows on disk)
        if (maxRowsToLoad < minRowsToLoad)
        {
            maxRowsToLoad = Math.Min(rowsOnDisk, minRowsToLoad);
        }

        // NOTE(review): hard-coded Windows separator. Presumably this matches the file names
        // produced by the partitioning step - confirm before switching to Path.Combine.
        Clusters[ci] = new VectorMatrix(rootFolderPath + "\\" + ci + ".csv", rowsOnDisk, maxRowsToLoad);
        Clusters[ci].Verbose = false;
    }

    // Load first partition to estimate the size of vectors
    Clusters[0].Load();

    // Only a rough estimate: payload of floats plus a fixed 20 + 4 + 4 bytes per vector.
    int bytesPerVector = Clusters[0][0].Length * sizeof(float) + 20 + 4 + 4;

    // Negated comparison so that NaN also falls back to the default budget.
    if (!(maxMemory > 0.0F))
    {
        maxMemory = defaultMaxMemoryGBytes;
    }

    // How many vectors fit in maxMemory x 1 GByte?
    this.VectorsThatFit = (int)(maxMemory * (OneGBytesAvailable / bytesPerVector));

    // Subtract the ones already in memory (cluster 0)
    this.VectorsThatFit -= Clusters[0].RowCount;
}
/// <summary>
/// Writes a similarity/distance matrix to <paramref name="stream"/> as tab-separated text.
/// </summary>
/// <remarks>
/// When <paramref name="vectorMatrix2"/> is null the matrix is treated as the upper triangle
/// of an N x N comparison of this instance's vectors: entry [row][col] is labelled with
/// vectors <c>row</c> and <c>row + col + 1</c>. Otherwise entry [row][col] is labelled with
/// this instance's vector <c>row</c> and <paramref name="vectorMatrix2"/>'s vector <c>col</c>.
/// A null or empty matrix produces no output.
/// </remarks>
/// <param name="matrix">Jagged matrix of values to print.</param>
/// <param name="printPadding">
/// True: one output line per matrix row holding only the values (in the N x N case the line
/// starts with the row index followed by zero padding for the lower-left triangle).
/// False: one output line per matrix entry in the form "rowId TAB colId TAB value".
/// </param>
/// <param name="stream">Writer that receives the output lines.</param>
/// <param name="vectorMatrix2">Second set of vectors for an N x M comparison, or null for N x N.</param>
/// <exception cref="InvalidDataException">A matrix row is null or empty.</exception>
public void PrintMatrix(double[][] matrix, bool printPadding, StreamWriter stream, VectorMatrix vectorMatrix2)
{
    // Nothing to print; the Length check also protects the matrix[0] access below.
    if ((matrix == null) || (matrix.Length == 0) || (matrix[0] == null))
    {
        return;
    }

    // Make room for each line of output: One tab char + 6 digits of precision after "0."
    int maxRowCharLength = (matrix[0].Length + 1) * 9;
    StringBuilder rowValues = new StringBuilder(maxRowCharLength);

    // If we are not called with a second set of vectors then we are simply comparing this[] in N x N fashion.
    bool isNxN = (vectorMatrix2 == null);

    for (int curRow = 0; curRow < matrix.Length; curRow++)
    {
        double[] row = matrix[curRow];
        if ((row == null) || (row.Length < 1))
        {
            throw new InvalidDataException(string.Format("Similarity/Distance matrix is missing data at row {0}", curRow));
        }

        if (printPadding)
        {
            rowValues.Clear();

            if (isNxN)
            {
                // Output the row index only once...
                // NOTE(review): this prints the index, not the vector id - confirm that is intended.
                rowValues.Append(curRow);

                // ...followed by 0 padding of bottom left corner of the triangular matrix
                for (int i = 0; i <= curRow; i++)
                {
                    rowValues.AppendFormat("\t{0:F1}", 0.0D);
                }
            }

            for (int curCol = 0; curCol < row.Length; curCol++)
            {
                rowValues.AppendFormat("\t{0:F6}", row[curCol]);
            }

            stream.WriteLine(rowValues.ToString());
            continue;
        }

        // Not using the diagonal matrix format: emit row and col ids with every entry, one
        // entry per line, so a value never runs into the next entry's row id.
        // The row id always comes from this instance's vector at curRow.
        var rowId = this.Vectors[curRow].Item1;

        for (int curCol = 0; curCol < row.Length; curCol++)
        {
            string colId = isNxN
                ? this.Vectors[curCol + curRow + 1].Item1
                : vectorMatrix2.Vectors[curCol].Item1;

            rowValues.Clear();
            rowValues.AppendFormat("{0}\t{1}", rowId, colId);
            rowValues.AppendFormat("\t{0:F6}", row[curCol]);
            stream.WriteLine(rowValues.ToString());
        }
    }
}