Пример #1
0
        /// <summary>
        /// Compares this instance with another object for equality.
        /// </summary>
        /// <param name="obj">Object to compare against; anything that is not a Data never matches.</param>
        /// <returns>True only if obj is a Data whose IndexFound and Hash both equal this instance's.</returns>
        public override bool Equals(object obj)
        {
            var candidate = obj as Data;

            if (candidate == null)
            {
                return false;
            }

            // Compare IndexFound first; Hash is only consulted when the indices already match.
            if (!IndexFound.Equals(candidate.IndexFound))
            {
                return false;
            }
            return Hash.Equals(candidate.Hash);
        }
Пример #2
0
        /// <summary>
        /// Rebuild an index over the complete set of points, reusing the permutation of a result
        /// that was found on a sample, then measure the rebuilt index with Metric.
        /// </summary>
        /// <param name="allPoints">The full list of points handed to the new HilbertIndex.</param>
        /// <param name="sampled">Previously found result whose PermutationUsed is reused.</param>
        /// <returns>A new IndexFound holding the rebuilt index and its two metric values;
        /// it is compacted first when ShouldCompact is set.</returns>
        private IndexFound Unsample(IList<HilbertPoint> allPoints, IndexFound sampled)
        {
            var fullIndex = new HilbertIndex(allPoints, sampled.PermutationUsed);
            var measured  = Metric(fullIndex);
            var unsampled = new IndexFound(
                sampled.PermutationUsed,
                fullIndex,
                measured.Item1,
                measured.Item2);

            if (ShouldCompact)
            {
                unsampled.Compact();
            }

            return unsampled;
        }
Пример #3
0
        /// <summary>
        /// Search many HilbertIndex objects, each based on a different permutation of the dimensions, and
        /// keep the ones yielding the best Metrics, likely those that estimate the lowest values
        /// for the number of clusters.
        /// </summary>
        /// <remarks>
        /// Each iteration runs up to ParallelTrials trials in parallel. The heap of best results,
        /// the current best result, the sample size and the set of rejected sample sizes are only
        /// modified while holding the lock on the heap. The search stops after MaxIterations iterations,
        /// after MaxIterationsWithoutImprovement consecutive iterations without improvement, when the
        /// best estimated cluster count is 2 or less, or (for low dimension counts) when every
        /// permutation has been tried.
        /// </remarks>
        /// <param name="points">Points to index. Must contain at least 10 points.</param>
        /// <param name="indexCount">Number of the best indices to return.
        /// For example, if this is 10, then the 10 indices with the lowest scores will be kept.</param>
        /// <param name="startingPermutation">Permutation used to build the first index measured.
        /// If null, a new Permutation is constructed from the number of dimensions of the first point.</param>
        /// <returns>The best indices found and the permutations that generated them.
        /// The first item in the returned list is the best of the best, and the last is the worst of the best.
        /// If the search ran on a sample, the returned indices are rebuilt from all the points.</returns>
        /// <exception cref="ArgumentNullException">If points is null.</exception>
        /// <exception cref="ArgumentException">If points has fewer than 10 elements.</exception>
        public IList<IndexFound> SearchMany(IList<HilbertPoint> points, int indexCount, Permutation<uint> startingPermutation = null)
        {
            // At or below this many dimensions, permutations are enumerated instead of generated by PermutationStrategy.
            const int MaxDimensionsForExhaustiveSearch = 7;

            if (points == null)
            {
                throw new ArgumentNullException(nameof(points));
            }
            if (points.Count < 10)
            {
                throw new ArgumentException("List has too few elements", nameof(points));
            }
            var queue      = new BinaryHeap<IndexFound>(BinaryHeapType.MaxHeap, indexCount);
            int dimensions = points[0].Dimensions;

            if (startingPermutation == null)
            {
                startingPermutation = new Permutation<uint>(dimensions);
            }
            var firstIndex = new HilbertIndex(points, startingPermutation);
            // Measure our first index, then loop through other permutations
            // looking for a better one, always accumulating the best in results.
            var firstMetric = Metric(firstIndex);
            var bestResults = new IndexFound(startingPermutation, firstIndex, firstMetric.Item1, firstMetric.Item2);

            if (ShouldCompact)
            {
                bestResults.Compact();
            }
            LowestCountSeen = Math.Min(LowestCountSeen, bestResults.EstimatedClusterCount);
            Logger.Info($"Cluster count Starts at: {bestResults}");
            var startingCount = bestResults.EstimatedClusterCount;

            queue.AddRemove(bestResults);

            // Decide if we are to sample points or use them all
            var sampledPoints = points;
            var sampleSize    = points.Count;

            if (UseSample)
            {
                sampleSize    = SampleSize(points, bestResults.EstimatedClusterCount);
                sampledPoints = Sample(points, sampleSize);
                Logger.Info($"    Sample is {sampleSize} of {points.Count} points");
            }
            var rejectedSampleSizes = new HashSet<int>();

            var iterationsWithoutImprovement = 0;
            var parallelOpts = new ParallelOptions {
                MaxDegreeOfParallelism = EstimateMaxDegreesOfParallelism(sampledPoints)
            };

            // If the number of dimensions is small, we might waste time trying the same randomly chosen permutations multiple times.
            // Instead, we will try all or many of them in order.
            var tryAllPermutations = dimensions <= MaxDimensionsForExhaustiveSearch;
            List<Permutation<uint>> allPermutations = null;

            if (tryAllPermutations)
            {
                allPermutations = Permutation<uint>.AllPermutations(dimensions).ToList();
            }

            for (var iteration = 0; iteration < MaxIterations; iteration++)
            {
                var improvedCount        = 0;
                var startFromPermutation = bestResults.PermutationUsed;
                Parallel.For(0, ParallelTrials, parallelOpts,
                             i =>
                {
                    Permutation<uint> permutationToTry;
                    if (tryAllPermutations)
                    {
                        // Take the next untried permutation off the end of the shared list.
                        lock (allPermutations)
                        {
                            if (allPermutations.Count == 0)
                            {
                                return;
                            }
                            permutationToTry = allPermutations[allPermutations.Count - 1];
                            allPermutations.RemoveAt(allPermutations.Count - 1);
                        }
                    }
                    else
                    {
                        // This locking is needed because we use a static random number generator to create a new permutation.
                        // It is more expensive to make the random number generator threadsafe than to make this loop threadsafe.
                        lock (startFromPermutation)
                        {
                            permutationToTry = PermutationStrategy(startFromPermutation, dimensions, iteration);
                        }
                    }
                    // Another trial may swap in a larger sample at any time, so read the current one under the same lock.
                    IList<HilbertPoint> sampledPointsToUse;
                    lock (points)
                    {
                        sampledPointsToUse = sampledPoints;
                    }
                    var indexToTry = new HilbertIndex(sampledPointsToUse, permutationToTry);
                    // Keep the measurement in a variable local to this trial. Storing it in a variable
                    // shared by all parallel trials would let one trial read another trial's metric.
                    var trialMetric  = Metric(indexToTry);
                    var resultsToTry = new IndexFound(permutationToTry, indexToTry, trialMetric.Item1, trialMetric.Item2);
                    if (ShouldCompact)
                    {
                        resultsToTry.Compact();
                    }
                    lock (queue)
                    {
                        if (resultsToTry.EstimatedClusterCount < startingCount / 4 &&
                            UseSample && sampleSize != points.Count)
                        {
                            // If the cluster count has improved too much and we are sampled,
                            // reject it and increase the sample size.
                            // Why? If the clusters are irregular, sampling can break
                            // them into so many small pieces that most points end up in outliers.
                            // This leads to a false low count.
                            if (!rejectedSampleSizes.Contains(indexToTry.Count))
                            {
                                sampleSize = Math.Min(points.Count, 3 * indexToTry.Count / 2);
                                Logger.Info($"Increasing sample size to {sampleSize} because estimated K = {resultsToTry.EstimatedClusterCount} (not trusted)");
                                var newSampledPoints = Sample(points, sampleSize);
                                lock (points)
                                {
                                    sampledPoints = newSampledPoints;
                                }
                                rejectedSampleSizes.Add(indexToTry.Count);
                            }
                        }
                        else
                        {
                            queue.AddRemove(resultsToTry);
                            var improved = resultsToTry.IsBetterThan(bestResults);
                            if (improved)
                            {
                                bestResults = resultsToTry;
                                Interlocked.Add(ref improvedCount, 1);
                                LowestCountSeen = Math.Min(LowestCountSeen, bestResults.EstimatedClusterCount);
                                Logger.Info($"Cluster count Improved to: {bestResults}");
                            }
                        }
                    }
                });
                if (improvedCount > 0)
                {
                    iterationsWithoutImprovement = 0;
                }
                else
                {
                    iterationsWithoutImprovement++;
                }
                if (iterationsWithoutImprovement >= MaxIterationsWithoutImprovement)
                {
                    break;
                }
                if (bestResults.EstimatedClusterCount <= 2)
                {
                    break;                     // No point in continuing!
                }
                if (tryAllPermutations && allPermutations.Count == 0)
                {
                    break;                     // Every permutation has been tried; later iterations would do no work.
                }
            }
            var indicesFound = queue.RemoveAll().Reverse().ToList();

            if (sampledPoints.Count < points.Count)
            {
                // Results are based on a sampled set of points. Recreate these indices using the
                // full set of points, keeping each result's permutation.
                var unsampledIndices = indicesFound.Select(i => Unsample(points, i)).ToList();
                Logger.Info($"Final, unsampled Cluster count: {unsampledIndices[0]}");
                return(unsampledIndices);
            }
            else
            {
                return(indicesFound);
            }
        }
Пример #4
0
 /// <summary>
 /// Computes a hash code by XOR-ing the hash codes of IndexFound and Hash.
 /// </summary>
 /// <returns>The combined hash code.</returns>
 public override int GetHashCode()
 {
     var indexPart = IndexFound.GetHashCode();
     var hashPart  = Hash.GetHashCode();

     return indexPart ^ hashPart;
 }