Example #1
0
        /// <summary>
        /// Rebuild and re-measure a curve over the complete set of points, reusing the
        /// permutation that was found while evaluating a sample.
        /// </summary>
        /// <param name="allPoints">Every point to be sorted, not just the sampled subset.</param>
        /// <param name="sampled">Results from evaluating a sample of points.</param>
        /// <returns>A new result that pairs the sample's permutation with the curve sorted from
        /// <paramref name="allPoints"/> and the two values Metric reports for that curve.</returns>
        private PermutationFound Unsample(IReadOnlyList <UnsignedPoint> allPoints, PermutationFound sampled)
        {
            var permutation = sampled.PermutationUsed;
            var fullCurve   = HilbertSort.Sort(allPoints, BitsPerDimension, permutation);
            var measured    = Metric(fullCurve);

            return new PermutationFound(permutation, fullCurve, measured.Item1, measured.Item2);
        }
Example #2
0
        /// <summary>
        /// Search many Hilbert orderings of the points, each based on a different permutation of the dimensions, and
        /// keep the ones yielding the best Metrics, likely those that estimate the lowest values
        /// for the number of clusters.
        /// </summary>
        /// <param name="points">Points to index. Must contain at least ten points.</param>
        /// <param name="indexCount">Number of the best indices to return.
        /// For example, if this is 10, then the 10 indices with the lowest scores will be kept.</param>
        /// <param name="startingPermutation">Starting permutation. If null, a new permutation over
        /// the dimensions of the first point is created.</param>
        /// <returns>The best indices found and the permutations that generated them.
        /// The first item in the returned list is the best of the best, and the last is the worst of the best.
        /// If MaxIterations is one or less, the list holds only the result for the starting permutation.
        /// If the search ran on a sample, the returned indices are rebuilt over all points.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="points"/> is null.</exception>
        /// <exception cref="ArgumentException">If <paramref name="points"/> has fewer than ten elements.</exception>
        public IList <PermutationFound> SearchMany(IReadOnlyList <UnsignedPoint> points, int indexCount, Permutation <uint> startingPermutation = null)
        {
            if (points == null)
            {
                throw new ArgumentNullException(nameof(points));
            }
            if (points.Count < 10)
            {
                throw new ArgumentException("List has too few elements", nameof(points));
            }
            var queue      = new BinaryHeap <PermutationFound>(BinaryHeapType.MaxHeap, indexCount);
            int dimensions = points[0].Dimensions;

            if (startingPermutation == null)
            {
                startingPermutation = new Permutation <uint>(dimensions);
            }
            List <UnsignedPoint> firstCurve;

            if (ProblemSize(points.Count, dimensions, BitsPerDimension) < 1500000000L)
            {
                firstCurve = HilbertSort.Sort(points, BitsPerDimension, startingPermutation);
            }
            else
            {
                // Used for larger problems.
                // NOTE(review): this path does not pass startingPermutation to the sort, yet the result
                // below is still recorded against startingPermutation -- confirm that is intended.
                firstCurve = SmallBucketSort <UnsignedPoint> .Sort(points, point => point.Coordinates.HilbertIndex(BitsPerDimension));
            }

            // Measure our first index, then loop through random permutations
            // looking for a better one, always accumulating the best in results.
            var metricResults = Metric(firstCurve);
            var bestResults   = new PermutationFound(startingPermutation, firstCurve, metricResults.Item1, metricResults.Item2);

            LowestCountSeen = Min(LowestCountSeen, bestResults.EstimatedClusterCount);
            Logger.Info($"Cluster count Starts at: {bestResults}");
            var startingCount = bestResults.EstimatedClusterCount;

            if (MaxIterations <= 1)
            {
                return new List <PermutationFound> { bestResults };
            }
            queue.AddRemove(bestResults);

            // Decide if we are to sample points or use them all
            var sampledPoints = points;
            var sampleSize    = points.Count;

            if (UseSample)
            {
                sampleSize = SampleSize(points, bestResults.EstimatedClusterCount);

                sampledPoints = Sample(points, sampleSize);
                Logger.Info($"    Sample is {sampleSize} of {points.Count} points");
            }
            var rejectedSampleSizes = new HashSet <int>();

            var iterationsWithoutImprovement = 0;
            var parallelOpts = new ParallelOptions {
                MaxDegreeOfParallelism = EstimateMaxDegreesOfParallelism(sampledPoints)
            };

            // If the number of dimensions is small, we might waste time trying the same randomly chosen permutations multiple times.
            // Instead, we will try all or many of them in order.
            var tryAllPermutations = dimensions <= 7;
            List <Permutation <uint> > allPermutations = null;

            if (tryAllPermutations)
            {
                allPermutations = Permutation <uint> .AllPermutations(dimensions).ToList();
            }

            // Private gates, so that we never lock on objects that the caller also holds references to.
            var permutationGate = new object();
            var sampleGate      = new object();

            for (var iteration = 0; iteration < MaxIterations; iteration++)
            {
                if (tryAllPermutations && allPermutations.Count == 0)
                {
                    // Every permutation has been tried; further trials would all return immediately.
                    break;
                }
                var improvedCount        = 0;
                var startFromPermutation = bestResults.PermutationUsed;
                Parallel.For(0, ParallelTrials, parallelOpts,
                             i =>
                {
                    Permutation <uint> permutationToTry;
                    if (!tryAllPermutations)
                    {
                        // This locking is needed because we use a static random number generator to create a new permutation.
                        // It is more expensive to make the random number generator threadsafe than to make this loop threadsafe.
                        lock (permutationGate)
                        {
                            permutationToTry = PermutationStrategy(startFromPermutation, dimensions, iteration);
                        }
                    }
                    else
                    {
                        lock (allPermutations)
                        {
                            if (allPermutations.Count == 0)
                            {
                                return;
                            }
                            var lastPosition = allPermutations.Count - 1;
                            permutationToTry = allPermutations[lastPosition];
                            allPermutations.RemoveAt(lastPosition);
                        }
                    }
                    IReadOnlyList <UnsignedPoint> sampledPointsToUse;
                    lock (sampleGate)
                    {
                        sampledPointsToUse = sampledPoints;
                    }
                    var curveToTry = HilbertSort.Sort(sampledPointsToUse, BitsPerDimension, permutationToTry);

                    // Keep the metric in a per-trial local. Writing it to a variable shared by all trials
                    // would let one trial build its result from another trial's measurements.
                    var trialMetric  = Metric(curveToTry);
                    var resultsToTry = new PermutationFound(permutationToTry, curveToTry, trialMetric.Item1, trialMetric.Item2);
                    lock (queue)
                    {
                        if (resultsToTry.EstimatedClusterCount < startingCount / 4 &&
                            UseSample && sampleSize != points.Count)
                        {
                            // If the cluster count has improved too much and we are sampled,
                            // reject it and increase the sample size.
                            // Why? If the clusters are irregular, sampling can break
                            // them into so many small pieces that most points end up in outliers.
                            // This leads to a false low count.
                            if (!rejectedSampleSizes.Contains(curveToTry.Count))
                            {
                                sampleSize = Math.Min(points.Count, 3 * curveToTry.Count / 2);
                                Logger.Info($"Increasing sample size to {sampleSize} because estimated K = {resultsToTry.EstimatedClusterCount} (not trusted)");
                                var newSampledPoints = Sample(points, sampleSize);
                                lock (sampleGate)
                                {
                                    sampledPoints = newSampledPoints;
                                }
                                rejectedSampleSizes.Add(curveToTry.Count);
                            }
                        }
                        else
                        {
                            queue.AddRemove(resultsToTry);
                            var improved = resultsToTry.IsBetterThan(bestResults);
                            if (improved)
                            {
                                bestResults = resultsToTry;
                                Interlocked.Add(ref improvedCount, 1);
                                LowestCountSeen = Math.Min(LowestCountSeen, bestResults.EstimatedClusterCount);
                                Logger.Info($"Cluster count Improved to: {bestResults}");
                            }
                        }
                    }
                });
                if (improvedCount > 0)
                {
                    iterationsWithoutImprovement = 0;
                }
                else
                {
                    iterationsWithoutImprovement++;
                }
                if (iterationsWithoutImprovement >= MaxIterationsWithoutImprovement)
                {
                    break;
                }
                if (bestResults.EstimatedClusterCount <= 2)
                {
                    break; // No point in continuing!
                }
            }
            var indicesFound = queue.RemoveAll().Reverse().ToList();

            if (sampledPoints.Count < points.Count)
            {
                // Results are based on Sampled set of points. Recreate these indices using the
                // full set of points.
                var unsampledIndices = indicesFound.Select(i => Unsample(points, i)).ToList();
                Logger.Info($"Final, unsampled Cluster count: {unsampledIndices[0]}");
                return(unsampledIndices);
            }
            else
            {
                return(indicesFound);
            }
        }