Example #1
        private const int OneGBytesAvailable = 1073741824;  // 1 GiB = 2^30 bytes

        /// <summary>
        /// Partitions a User-ClusterId-Vector file into one file per cluster and decides how many
        /// vectors from each cluster to keep in memory for the silhouette computation.
        /// </summary>
        /// <param name="filePath">Path to the User-ClusterId-Vector file to partition.</param>
        /// <param name="sampleRate">
        /// Compute the silhouette on a sample of vectors per cluster of size sampleRate%, e.g. 0.01 = 1%.
        /// Default = 0.01.
        /// If the sample rate yields fewer than 500 rows for a cluster, at least 500 rows
        /// (or all rows on disk, whichever is smaller) are loaded.
        /// </param>
        /// <param name="maxMemory">
        /// Number of gigabytes of RAM to reserve for clusters cached in memory.
        /// Go to disk for those that do not fit.
        /// Default = 2.0 Gigs.
        /// </param>
        public Silhouette(string filePath, float sampleRate, float maxMemory)
        {
            VectorMatrix userClusters = new VectorMatrix(filePath, false);

            // Split the big User-ClusterId-Vector stream into smaller files, one per cluster
            var    clusterCounts  = userClusters.PartitionOnColunmIndex(2);
            string rootFolderPath = Directory.GetParent(filePath).FullName;

            int clusterCount = clusterCounts.Count;

            Clusters = new VectorMatrix[clusterCount];
            int totalVectorsInMemory = 0;

            for (int ci = 0; ci < clusterCount; ci++)
            {
                int rowsOnDisk    = clusterCounts[ci];
                int maxRowsToLoad = ((sampleRate > 0.0F) && (sampleRate <= 1.0F)) ? (int)(sampleRate * rowsOnDisk) : rowsOnDisk;

                // Make sure we at least load 500 rows  (or the actual number of rows on disk)
                if (maxRowsToLoad < 500)
                {
                    maxRowsToLoad = Math.Min(rowsOnDisk, 500);
                }
                totalVectorsInMemory += maxRowsToLoad;
                Clusters[ci]          = new VectorMatrix(rootFolderPath + "\\" + ci + ".csv", rowsOnDisk, maxRowsToLoad);
                Clusters[ci].Verbose  = false;
            }
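
            // Illustration (hypothetical numbers): with sampleRate = 0.01, a 200,000-row cluster
            // loads 2,000 rows; a 20,000-row cluster would get 200 and is bumped up to the
            // 500-row floor; a 300-row cluster simply loads all 300 of its rows.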

            // Load first partition to estimate the size of vectors
            Clusters[0].Load();
            // Only need a rough estimate :)
            int bytesPerVector = Clusters[0][0].Length * sizeof(float) + 20 + 4 + 4;
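            // Illustration (hypothetical dimension): for 128-dimensional float vectors this is
            // 128 * 4 + 20 + 4 + 4 = 540 bytes per vector.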

            // How many vectors can we fit in maxMemory gigabytes?  Fall back to 3 GB if no budget was given.
            if (maxMemory <= 0.0F)
            {
                maxMemory = 3.0F;
            }
            this.VectorsThatFit = (int)(maxMemory * (OneGBytesAvailable / bytesPerVector));
            // Subtract the ones already in memory (cluster 0)
            this.VectorsThatFit -= Clusters[0].RowCount;
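            // Illustration (hypothetical numbers): with maxMemory = 2.0 and 540 bytes per vector,
            // 1073741824 / 540 = 1,988,410 vectors fit per GB, so about 3.98 million vectors fit
            // in the budget before subtracting the rows already loaded for cluster 0.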
        }
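
A minimal usage sketch (the file path and parameter values below are hypothetical; only the constructor shown above is assumed):

        // Sample 1% of each cluster and budget about 2 GB of RAM for cached vectors;
        // clusters that do not fit in the budget are read from disk.
        var silhouette = new Silhouette(@"C:\data\user_cluster_vectors.csv", 0.01F, 2.0F);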