예제 #1
0
 /// <summary>
 /// Creates a density meter over the given index, using the given neighborhood radius.
 /// </summary>
 /// <param name="index">Index stored in <c>Index</c> and handed to the distance memo.</param>
 /// <param name="neighborhoodRadius">Radius stored in <c>NeighborhoodRadius</c> and forwarded to the distance memo.</param>
 /// <param name="memoWindowRadius">Window radius forwarded to the distance memo. Defaults to zero.</param>
 public DensityMeter(HilbertIndex index, long neighborhoodRadius, int memoWindowRadius = 0)
 {
     this.Index              = index;
     this.NeighborhoodRadius = neighborhoodRadius;

     // No estimator is installed at construction time.
     // NOTE(review): an earlier draft apparently counted the distances that are
     // less than or equal to NeighborhoodRadius - confirm the intended default.
     this.Estimator = null;

     // The memo is built from the Index property, so Index must already be assigned.
     this.Distances = new DistanceMemo(this.Index, neighborhoodRadius, memoWindowRadius);
 }
예제 #2
0
 /// <summary>
 /// Stores an index together with a permutation and the two measurements taken for that index.
 /// </summary>
 /// <param name="permutation">Value stored in <c>PermutationUsed</c>.</param>
 /// <param name="index">Value stored in <c>Index</c>.</param>
 /// <param name="estimatedClusterCount">Value stored in <c>EstimatedClusterCount</c>.</param>
 /// <param name="mergeSquareDistance">Value stored in <c>MergeSquareDistance</c>.</param>
 public IndexFound(Permutation <uint> permutation, HilbertIndex index, int estimatedClusterCount, long mergeSquareDistance)
 {
     // What was indexed, and how its dimensions were ordered.
     this.PermutationUsed = permutation;
     this.Index           = index;

     // How that index scored.
     this.EstimatedClusterCount = estimatedClusterCount;
     this.MergeSquareDistance   = mergeSquareDistance;
 }
        /// <summary>
        /// Creates a classifier over the given index and seeds <c>Clusters</c> from the index's sorted points.
        /// </summary>
        /// <param name="index">Value stored in <c>Index</c>; its <c>SortedPoints</c> seed the initial classification.</param>
        /// <param name="mergeSquareDistance">Value stored in <c>MergeSquareDistance</c>.</param>
        /// <param name="unmergeableSize">Value stored in <c>UnmergeableSize</c>.</param>
        public DensityClassifier(HilbertIndex index, long mergeSquareDistance, int unmergeableSize)
        {
            this.Index               = index;
            this.MergeSquareDistance = mergeSquareDistance;
            this.UnmergeableSize     = unmergeableSize;

            // The labeler hands out "1", "2", "3", ... on successive calls.
            // NOTE(review): presumably Classification calls it once per point, so that every point
            // starts out in a cluster of its own - confirm against Classification.
            int nextLabelNumber = 1;
            this.Clusters = new Classification <UnsignedPoint, string>(
                this.Index.SortedPoints,
                p =>
                {
                    var label = nextLabelNumber.ToString();
                    nextLabelNumber++;
                    return label;
                });
        }
예제 #4
0
        /// <summary>
        /// Rebuild and re-measure a HilbertIndex over every point rather than a sample,
        /// reusing the permutation recorded in an earlier result.
        /// </summary>
        /// <param name="allPoints">The complete set of points to index.</param>
        /// <param name="sampled">Earlier result whose <c>PermutationUsed</c> is reused for the new index.</param>
        /// <returns>A new IndexFound holding the full index, the reused permutation and the fresh metric values.
        /// It has been compacted if <c>ShouldCompact</c> is set.</returns>
        private IndexFound Unsample(IList <HilbertPoint> allPoints, IndexFound sampled)
        {
            var permutation = sampled.PermutationUsed;
            var fullIndex   = new HilbertIndex(allPoints, permutation);

            // Item1 and Item2 of the metric feed the cluster count and merge distance of the result.
            var measured = Metric(fullIndex);
            var rebuilt  = new IndexFound(permutation, fullIndex, measured.Item1, measured.Item2);

            if (ShouldCompact)
            {
                rebuilt.Compact();
            }

            return rebuilt;
        }
예제 #5
0
 /// <summary>
 /// Initializes a new instance of the <see cref="T:Clustering.DistanceMemo"/> class.
 /// </summary>
 /// <param name="index">Index of points whose distances will be measured and remembered.</param>
 /// <param name="neighborhoodRadius">Only distances less than or equal to this value are memoized.
 /// Larger distances will need to be recomputed.</param>
 /// <param name="windowRadius">Value stored in <c>WindowRadius</c>. When zero or negative, the truncated
 /// square root of <c>Count / 2</c> is stored instead.</param>
 public DistanceMemo(HilbertIndex index, long neighborhoodRadius, int windowRadius = 0)
 {
     this.Index              = index;
     this.NeighborhoodRadius = neighborhoodRadius;

     // One slot per counted item; the dictionary slots start out null.
     this.Distances   = new Dictionary <int, long> [Count];
     this.AllMeasured = new bool[Count];

     // A non-positive radius means "pick a default that depends on Count".
     this.WindowRadius = windowRadius > 0
         ? windowRadius
         : (int)Math.Sqrt(Count / 2);
 }
        /// <summary>
        /// Run density-based reclassification over the clusters of FinalClassification.
        /// A cluster may be split into several smaller clusters, whose labels are the old label
        /// followed by a dash and the density-based label. Existing clusters are never merged.
        /// Nothing happens when the configuration asks to skip density classification.
        /// </summary>
        void ReclassifyByDensity()
        {
            // Honor the configuration switch that turns this phase off entirely.
            if (Configuration.DensityClassifier.SkipDensityClassification)
            {
                return;
            }

            Timer.Start("Reclassify by density");
            var splitCount = 0;

            // FinalClassification is modified inside the loop, so iterate over a snapshot of its labels.
            foreach (var label in FinalClassification.ClassLabels().ToList())
            {
                if (!NeedsReclustering(label))
                {
                    continue;
                }

                // Collect the members of the cluster and make them retrievable by UniqueId.
                var members    = FinalClassification.PointsInClass(label);
                var pointsById = new Dictionary <int, UnsignedPoint>();
                foreach (var member in members)
                {
                    pointsById[member.UniqueId] = member;
                }

                // Index the members along the Hilbert curve; the labeler hands out "1", "2", "3", ...
                int nextLabelNumber   = 1;
                var subClassification = new Classification <UnsignedPoint, string>(members, p => (nextLabelNumber++).ToString());
                var hilbertIndex      = new HilbertIndex(subClassification, Configuration.Index.BitsPerDimension);

                // Configure a classifier for just this cluster.
                var unmergeableSize = (int)(members.Count * Configuration.DensityClassifier.UnmergeableSizeFraction);
                var classifier      = new DensityClassifier(hilbertIndex, MergeSquareDistance, unmergeableSize);
                classifier.NeighborhoodRadiusMultiplier = Configuration.DensityClassifier.NeighborhoodRadiusMultiplier;
                classifier.OutlierSize                  = Configuration.DensityClassifier.OutlierSize;
                classifier.MergeableShrinkage           = Configuration.DensityClassifier.MergeableShrinkage;

                // The result is expressed in the classifier's own points, which are mapped back
                // to the original UnsignedPoints through their UniqueIds below.
                var reclassified = classifier.Classify();
                if (reclassified.NumPartitions <= 1)
                {
                    // The cluster stayed in one piece, so there is nothing to relabel.
                    continue;
                }
                splitCount++;

                foreach (var hPoint in reclassified.Points())
                {
                    var uPoint = pointsById[hPoint.UniqueId];

                    // New label = old label + "-" + density label.
                    var oldLabel     = FinalClassification.GetClassLabel(uPoint);
                    var densityLabel = reclassified.GetClassLabel(hPoint);
                    var splitLabel   = $"{oldLabel}-{densityLabel}";

                    // Move the point from its current cluster into the new one.
                    FinalClassification.Remove(uPoint);
                    FinalClassification.Add(uPoint, splitLabel);
                }
            }

            Timer.Stop("Reclassify by density");
            Logger.Info($"Clusters split due to density-based reclassification: {splitCount}");
        }
 /// <summary>
 /// Create a new index from an existing one: every unsorted point of the original is permuted
 /// with the given permutation, and the new index is then initialized from those points.
 ///
 /// All the points in the new index are meant to share the same UniqueIds as their corresponding
 /// points in the original. To map from a point in one index to the similar point in the other:
 ///
 ///   var p2 = hilbertIndex2.Equivalent(p1);
 /// </summary>
 /// <param name="original">Index whose <c>UnsortedPoints</c> are permuted.</param>
 /// <param name="permutation">Permutation passed to <c>Permute</c> for every point.</param>
 public HilbertIndex(HilbertIndex original, Permutation <uint> permutation)
 {
     var permutedPoints = (from point in original.UnsortedPoints
                           select point.Permute(permutation)).ToList();

     this.UnsortedPoints = permutedPoints;
     InitIndexing();
 }
예제 #8
0
        /// <summary>
        /// Search many HilbertIndex objects, each based on a different permutation of the dimensions, and
        /// keep the ones yielding the best Metrics, likely those that estimate the lowest values
        /// for the number of clusters.
        ///
        /// Trials run in parallel. When <c>UseSample</c> is set, the trials index a sample of the points;
        /// the sample is enlarged whenever a trial reports a suspiciously low cluster count, and the
        /// surviving indices are rebuilt over all the points before being returned.
        /// </summary>
        /// <param name="points">Points to index. Must contain at least ten points.</param>
        /// <param name="indexCount">Number of the best indices to return.
        /// For example, if this is 10, then the 10 indices with the lowest scores will be kept.</param>
        /// <param name="startingPermutation">Permutation used to build the first index. If null,
        /// <c>new Permutation&lt;uint&gt;(dimensions)</c> is used, where dimensions is taken from the first point.</param>
        /// <returns>The best indices found and the permutations that generated them.
        /// The first item in the returned list is the best of the best, and the last is the worst of the best.
        /// When the search ran on a sample, the entries are re-measured over all points but keep the order
        /// that was established on the sample.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="points"/> is null.</exception>
        /// <exception cref="ArgumentException">If <paramref name="points"/> has fewer than ten elements.</exception>
        public IList <IndexFound> SearchMany(IList <HilbertPoint> points, int indexCount, Permutation <uint> startingPermutation = null)
        {
            if (points == null)
            {
                throw new ArgumentNullException(nameof(points));
            }
            if (points.Count < 10)
            {
                throw new ArgumentException("List has too few elements", nameof(points));
            }
            var queue      = new BinaryHeap <IndexFound>(BinaryHeapType.MaxHeap, indexCount);
            int dimensions = points[0].Dimensions;

            if (startingPermutation == null)
            {
                startingPermutation = new Permutation <uint>(dimensions);
            }
            var firstIndex = new HilbertIndex(points, startingPermutation);

            // Measure our first index, then loop through permutations
            // looking for a better one, always accumulating the best in the queue.
            var firstMetric = Metric(firstIndex);
            var bestResults = new IndexFound(startingPermutation, firstIndex, firstMetric.Item1, firstMetric.Item2);

            if (ShouldCompact)
            {
                bestResults.Compact();
            }
            LowestCountSeen = Math.Min(LowestCountSeen, bestResults.EstimatedClusterCount);
            Logger.Info($"Cluster count Starts at: {bestResults}");
            var startingCount = bestResults.EstimatedClusterCount;

            queue.AddRemove(bestResults);

            // Decide if we are to sample points or use them all
            var sampledPoints = points;
            var sampleSize    = points.Count;

            if (UseSample)
            {
                sampleSize    = SampleSize(points, bestResults.EstimatedClusterCount);
                sampledPoints = Sample(points, sampleSize);
                Logger.Info($"    Sample is {sampleSize} of {points.Count} points");
            }
            var rejectedSampleSizes = new HashSet <int>();

            var iterationsWithoutImprovement = 0;
            var parallelOpts = new ParallelOptions {
                MaxDegreeOfParallelism = EstimateMaxDegreesOfParallelism(sampledPoints)
            };

            // Private gates: never lock on objects that callers can also reach
            // (the points list, or a permutation that is handed back inside the results).
            var permutationGate = new object();
            var sampleGate      = new object();

            List <Permutation <uint> > allPermutations = null;

            // If the number of dimensions is small, we might waste time trying the same randomly chosen permutations multiple times.
            // Instead, we will try all or many of them in order.
            if (dimensions <= 7)
            {
                allPermutations = Permutation <uint> .AllPermutations(dimensions).ToList();
            }

            for (var iteration = 0; iteration < MaxIterations; iteration++)
            {
                var improvedCount        = 0;
                var startFromPermutation = bestResults.PermutationUsed;
                Parallel.For(0, ParallelTrials, parallelOpts,
                             i =>
                {
                    Permutation <uint> permutationToTry;
                    if (dimensions > 7)
                    {
                        // This locking is needed because a static random number generator is used to create a new permutation.
                        // It is more expensive to make the random number generator threadsafe than to make this loop threadsafe.
                        lock (permutationGate)
                        {
                            permutationToTry = PermutationStrategy(startFromPermutation, dimensions, iteration);
                        }
                    }
                    else
                    {
                        // Take the next untried permutation off the end of the shared list.
                        lock (allPermutations)
                        {
                            if (allPermutations.Count == 0)
                            {
                                return;
                            }
                            permutationToTry = allPermutations[allPermutations.Count - 1];
                            allPermutations.RemoveAt(allPermutations.Count - 1);
                        }
                    }

                    // Another trial may swap in a larger sample at any time, so read the current one under the gate.
                    IList <HilbertPoint> sampledPointsToUse;
                    lock (sampleGate)
                    {
                        sampledPointsToUse = sampledPoints;
                    }
                    var indexToTry = new HilbertIndex(sampledPointsToUse, permutationToTry);

                    // Keep the metric in a per-trial local. Writing it to a variable shared by all
                    // trials would let one trial pair its index with another trial's measurements.
                    var trialMetric  = Metric(indexToTry);
                    var resultsToTry = new IndexFound(permutationToTry, indexToTry, trialMetric.Item1, trialMetric.Item2);
                    if (ShouldCompact)
                    {
                        resultsToTry.Compact();
                    }
                    lock (queue)
                    {
                        if (resultsToTry.EstimatedClusterCount < startingCount / 4 &&
                            UseSample && sampleSize != points.Count)
                        {
                            // If the cluster count has improved too much and we are sampled,
                            // reject it and increase the sample size.
                            // Why? If the clusters are irregular, sampling can break
                            // them into so many small pieces that most points end up in outliers.
                            // This leads to a false low count.
                            if (!rejectedSampleSizes.Contains(indexToTry.Count))
                            {
                                sampleSize = Math.Min(points.Count, 3 * indexToTry.Count / 2);
                                Logger.Info($"Increasing sample size to {sampleSize} because estimated K = {resultsToTry.EstimatedClusterCount} (not trusted)");
                                var newSampledPoints = Sample(points, sampleSize);
                                lock (sampleGate)
                                {
                                    sampledPoints = newSampledPoints;
                                }
                                rejectedSampleSizes.Add(indexToTry.Count);
                            }
                        }
                        else
                        {
                            queue.AddRemove(resultsToTry);
                            if (resultsToTry.IsBetterThan(bestResults))
                            {
                                bestResults = resultsToTry;
                                Interlocked.Increment(ref improvedCount);
                                LowestCountSeen = Math.Min(LowestCountSeen, bestResults.EstimatedClusterCount);
                                Logger.Info($"Cluster count Improved to: {bestResults}");
                            }
                        }
                    }
                });
                if (improvedCount > 0)
                {
                    iterationsWithoutImprovement = 0;
                }
                else
                {
                    iterationsWithoutImprovement++;
                }
                if (iterationsWithoutImprovement >= MaxIterationsWithoutImprovement)
                {
                    break;
                }
                if (bestResults.EstimatedClusterCount <= 2)
                {
                    break;                     // No point in continuing!
                }
            }

            // The heap is emptied in the opposite of the order we want, hence the Reverse.
            var indicesFound = queue.RemoveAll().Reverse().ToList();

            if (sampledPoints.Count < points.Count)
            {
                // Results are based on a sampled set of points, so recreate these indices using the
                // full set of points.
                var unsampledIndices = indicesFound.Select(found => Unsample(points, found)).ToList();
                Logger.Info($"Final, unsampled Cluster count: {unsampledIndices[0]}");
                return unsampledIndices;
            }

            return indicesFound;
        }
예제 #9
0
 /// <summary>
 /// Create a HilbertOrderedIndex from a HilbertIndex: both point lists of the index are copied,
 /// in the same order, with every point replaced by the entry of the dictionary that has the same UniqueId.
 /// </summary>
 /// <param name="index">Index whose <c>UnsortedPoints</c> and <c>SortedPoints</c> are translated.</param>
 /// <param name="idToPoints">The key is the UniqueId and the value is the corresponding point.
 /// These points must be the UnsignedPoint analogs of the points in the HilbertIndex and share the same Ids;
 /// the dictionary lookup fails for any id that is missing.</param>
 public HilbertOrderedIndex(HilbertIndex index, Dictionary <int, UnsignedPoint> idToPoints)
 {
     this.UnsortedPoints = (from hilbertPoint in index.UnsortedPoints
                            select idToPoints[hilbertPoint.UniqueId]).ToList();

     this.SortedPoints = (from hilbertPoint in index.SortedPoints
                          select idToPoints[hilbertPoint.UniqueId]).ToList();

     InitIndexing();
 }