Beispiel #1
0
        /// <summary>
        /// Builds one <c>SilhouetteStats</c> entry per row of the given cluster.
        /// For every row index, <c>a</c> is set to <c>AvgDistance(rowIndex, distMatrix)</c>
        /// and <c>b</c> is initialised to <c>Double.MaxValue</c>.
        /// </summary>
        /// <param name="cluster">Cluster whose <c>RowCount</c> determines how many entries are produced.</param>
        /// <param name="distMatrix">Distance matrix handed through unchanged to <c>AvgDistance</c>.</param>
        /// <returns>An array with <c>cluster.RowCount</c> entries, indexed by row.</returns>
        private SilhouetteStats[] CoumputeAvgIntraClusterDistances(VectorMatrix cluster, double[][] distMatrix)
        {
            SilhouetteStats[] stats = new SilhouetteStats[cluster.RowCount];

            int row = 0;
            while (row < cluster.RowCount)
            {
                // a: average distance for this row, as computed by AvgDistance
                stats[row].a = AvgDistance(row, distMatrix);
                // b: starts at the largest double value
                stats[row].b = Double.MaxValue;
                row++;
            }

            return stats;
        }
Beispiel #2
0
        /// <summary>
        /// Averages <c>VectorMatrix.Distance</c> between a vector and every row of one cluster.
        /// </summary>
        /// <param name="vector">Vector the distances are measured from.</param>
        /// <param name="clusterId">Index of the target cluster in <c>Clusters</c>.</param>
        /// <returns>
        /// Sum of the distances divided by the cluster's <c>RowCount</c>.
        /// There is no guard for a cluster with zero rows; the division then yields NaN.
        /// </returns>
        private double AvgDistanceToCluster(DenseVector vector, int clusterId)
        {
            var members     = Clusters[clusterId];
            int memberCount = members.RowCount;

            // Accumulate in row order so the floating-point sum is reproducible
            double total = 0.0D;
            int    row   = 0;
            while (row < memberCount)
            {
                total += VectorMatrix.Distance(vector, members[row]);
                row++;
            }

            return total / memberCount;
        }
Beispiel #3
0
        // Number of bytes in one gigabyte (2^30); converts the maxMemory budget into a vector count.
        private const int OneGBytesAvailable = 1073741824;  // 1 Gig

        /// <summary>
        /// Opens the vector file, partitions it on column index 2 into one file per cluster,
        /// creates a <c>VectorMatrix</c> for each partition, loads cluster 0 and uses it to
        /// estimate how many more vectors fit in the memory budget (<c>VectorsThatFit</c>).
        /// </summary>
        /// <param name="filePath">
        /// Path of the input vector file. The per-cluster files are read from its parent folder
        /// as "&lt;clusterIndex&gt;.csv".
        /// </param>
        /// <param name="sampleRate">
        /// Compute silhouette on a sample of vectors per cluster of size sampleRate, e.g. 0.01 = 1%.
        /// Values outside (0, 1] disable sampling: all rows of each cluster are used.
        /// If the sample rate is so small that fewer than 500 rows would be selected,
        /// 500 rows are used instead (or every row, when the cluster has fewer than 500).
        /// NOTE(review): earlier documentation mentioned a default of 0.01, but this constructor
        /// applies no such default; confirm against the callers.
        /// </param>
        /// <param name="maxMemory">
        /// Number of gigabytes of RAM to reserve for clusters cached in memory.
        /// Go to disk for those that do not fit.
        /// Values less than or equal to 0 select the default of 3.0 Gigs.
        /// </param>
        /// <exception cref="InvalidDataException">Thrown when partitioning the file yields no clusters.</exception>
        public Silhouette(string filePath, float sampleRate, float maxMemory)
        {
            // Smallest per-cluster sample we are willing to work with
            const int minRowsPerCluster = 500;

            VectorMatrix userClusters = new VectorMatrix(filePath, false);

            // Split the big User-ClusterId-Vector stream into smaller files, one per cluster
            var    clusterCounts  = userClusters.PartitionOnColunmIndex(2);
            string rootFolderPath = Directory.GetParent(filePath).FullName;

            int clusterCount = clusterCounts.Count;
            if (clusterCount < 1)
            {
                // Cluster 0 is needed below to estimate the vector size; fail with a clear message
                // instead of an IndexOutOfRangeException.
                throw new InvalidDataException(string.Format("No clusters were found in '{0}'", filePath));
            }

            Clusters = new VectorMatrix[clusterCount];

            bool useSampling = (sampleRate > 0.0F) && (sampleRate <= 1.0F);

            for (int ci = 0; ci < clusterCount; ci++)
            {
                int rowsOnDisk    = clusterCounts[ci];
                int maxRowsToLoad = useSampling ? (int)(sampleRate * rowsOnDisk) : rowsOnDisk;

                // Make sure we at least load 500 rows  (or the actual number of rows on disk)
                if (maxRowsToLoad < minRowsPerCluster)
                {
                    maxRowsToLoad = Math.Min(rowsOnDisk, minRowsPerCluster);
                }

                // NOTE(review): the separator is hard-coded to '\'. Path.Combine would be the portable
                // form, but the code that writes the partition files is not visible here, so the
                // naming is left untouched to stay in sync with it.
                Clusters[ci]         = new VectorMatrix(rootFolderPath + "\\" + ci + ".csv", rowsOnDisk, maxRowsToLoad);
                Clusters[ci].Verbose = false;
            }

            // Load first partition to estimate the size of vectors
            Clusters[0].Load();
            // Only need a rough estimate :)
            int bytesPerVector = Clusters[0][0].Length * sizeof(float) + 20 + 4 + 4;

            // How many vectors can we fit in maxMemory x 1GBytes ?  (default: 3 x 1GBytes)
            if (maxMemory <= 0.0F)
            {
                maxMemory = 3.0F;
            }

            // A float that does not fit in an int converts to an unspecified value in unchecked code,
            // so clamp large budgets to int.MaxValue before casting.
            float vectorBudget = maxMemory * (OneGBytesAvailable / bytesPerVector);
            this.VectorsThatFit = (vectorBudget >= int.MaxValue) ? int.MaxValue : (int)vectorBudget;

            // Subtract the ones already in memory (cluster 0)
            this.VectorsThatFit -= Clusters[0].RowCount;
        }
Beispiel #4
0
        /// <summary>
        /// Writes a similarity/distance matrix to <paramref name="stream"/> as tab-separated text.
        /// </summary>
        /// <remarks>
        /// Two layouts are produced:
        /// with <paramref name="printPadding"/> set, one output line per matrix row holding the values
        /// (for the N x N case prefixed by the row index and zero padding for the lower-left triangle);
        /// without it, one output line per cell in the form "rowId TAB colId TAB value".
        /// Nothing is written when <paramref name="matrix"/> is null, empty, or its first row is null.
        /// </remarks>
        /// <param name="matrix">
        /// Jagged matrix of values. In the N x N case row r is expected to hold the values for
        /// the vectors r+1, r+2, ... of this instance (upper triangle without the diagonal).
        /// </param>
        /// <param name="printPadding">Selects the matrix layout (true) or the one-triplet-per-line layout (false).</param>
        /// <param name="stream">Writer that receives the output lines.</param>
        /// <param name="vectorMatrix2">
        /// Second set of vectors supplying the column ids, or null when this instance is compared
        /// with itself (N x N).
        /// </param>
        /// <exception cref="InvalidDataException">Thrown when a row of <paramref name="matrix"/> is null or empty.</exception>
        public void PrintMatrix(double[][] matrix, bool printPadding, StreamWriter stream, VectorMatrix vectorMatrix2)
        {
            // matrix.Length is checked before touching matrix[0] so an empty matrix is a no-op
            if ((matrix == null) || (matrix.Length == 0) || (matrix[0] == null))
            {
                return;
            }

            // Make room for each line of output: One tab char + 6 digits of precision after "0."
            int           maxRowCharLength = (matrix[0].Length + 1) * 9;
            StringBuilder rowValues        = new StringBuilder(maxRowCharLength);

            // If we are not called with a second set of vectors then we are simply comparing this[] in  N x  N fashion.
            bool isNxN = (vectorMatrix2 == null);

            for (int curRow = 0; curRow < matrix.Length; curRow++)
            {
                rowValues.Clear();
                if ((matrix[curRow] == null) || (matrix[curRow].Length < 1))
                {
                    throw new InvalidDataException(string.Format("Similarity/Distance matrix is missing data at row {0}", curRow));
                }

                // The row always refers to a vector of this instance, in both the N x N and N x M case
                var rowId = this.Vectors[curRow].Item1;

                if (printPadding)
                {
                    if (isNxN)
                    {
                        // Output the row label only once
                        // NOTE(review): the label is the row index, not the vector id - confirm this is intended
                        rowValues.Append(curRow);
                        // ...followed by 0 padding of bottom left corner of the triangular matrix
                        for (int i = 0; i <= curRow; i++)
                        {
                            rowValues.AppendFormat("\t{0:F1}", 0.0D);
                        }
                    }

                    for (int curCol = 0; curCol < matrix[curRow].Length; curCol++)
                    {
                        rowValues.AppendFormat("\t{0:F6}", matrix[curRow][curCol]);
                    }

                    stream.WriteLine(rowValues.ToString());
                    continue;
                }

                // Output row and col Ids with every line  (i.e. not using diagonal matrix format)
                for (int curCol = 0; curCol < matrix[curRow].Length; curCol++)
                {
                    // N x N: row r only stores the columns to the right of the diagonal, hence the offset.
                    // N x M: the column id comes from the second set of vectors.
                    string colId = isNxN
                        ? this.Vectors[curCol + curRow + 1].Item1
                        : vectorMatrix2.Vectors[curCol].Item1;

                    stream.WriteLine("{0}\t{1}\t{2:F6}", rowId, colId, matrix[curRow][curCol]);
                }
            }
        }