Esempio n. 1
0
        /// <summary>
        /// Pick the Top N or Bottom N items of a collection on the calling thread (no parallelism) by
        /// streaming every item through a <c>BinaryHeap</c> and then draining that heap.
        ///
        /// The collection is never fully sorted; only the items retained by the heap are ordered.
        /// Because this is an iterator, no work happens until the result is enumerated.
        /// </summary>
        /// <typeparam name="TElement">Type of element in the collection.</typeparam>
        /// <param name="items">Collection of items to select from.</param>
        /// <param name="topN">True to look for the Top N items (a Min Heap is used), false to look for the
        /// Bottom N items (a Max Heap is used).</param>
        /// <param name="k">Number of items to select; handed to the heap constructor.</param>
        /// <param name="comparisonDelegate">Comparer handed to the heap. May be null, in which case the heap
        /// is presumably expected to fall back to the natural ordering of the items (verify against BinaryHeap).</param>
        /// <returns>The items left in the heap, yielded in the order successive <c>Remove</c> calls release them.
        /// NOTE(review): the intended order is Top N descending / Bottom N ascending; confirm against BinaryHeap.Remove.</returns>
        public static IEnumerable <TElement> SelectSerial <TElement>(this IEnumerable <TElement> items, bool topN, int k,
                                                                     IComparer <TElement> comparisonDelegate = null)
        {
            // The heap kind is deliberately the "opposite" of the request: a Min Heap is used while
            // hunting for the Top N, and a Max Heap while hunting for the Bottom N.
            BinaryHeapType heapKind;
            if (topN)
            {
                heapKind = BinaryHeapType.MinHeap;
            }
            else
            {
                heapKind = BinaryHeapType.MaxHeap;
            }
            var boundedHeap = new BinaryHeap <TElement>(heapKind, k, comparisonDelegate);

            // Offer every candidate to the heap.
            foreach (var candidate in items)
            {
                boundedHeap.AddRemove(candidate);
            }

            // Drain exactly as many items as the heap held once the scan finished.
            for (var remaining = boundedHeap.Count; remaining > 0; remaining--)
            {
                yield return boundedHeap.Remove();
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Search many Hilbert orderings of the points, each based on a different permutation of the dimensions, and
        /// keep the ones yielding the best Metrics, likely those that estimate the lowest values
        /// for the number of clusters.
        ///
        /// Trials run in parallel batches of <c>ParallelTrials</c>, for at most <c>MaxIterations</c> batches.
        /// The search stops early after <c>MaxIterationsWithoutImprovement</c> consecutive batches without
        /// an improvement, or once the best estimated cluster count is 2 or less.
        /// </summary>
        /// <param name="points">Points to index. Must contain at least 10 points.</param>
        /// <param name="indexCount">Number of the best indices to return.
        /// For example, if this is 10, then the 10 indices with the lowest scores will be kept.</param>
        /// <param name="startingPermutation">Starting permutation. If null, a new permutation over all
        /// dimensions of the first point is created.</param>
        /// <returns>The best indices found and the permutations that generated them.
        /// The first item in the returned list is the best of the best, and the last is the worst of the best.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="points"/> is null.</exception>
        /// <exception cref="ArgumentException">If <paramref name="points"/> holds fewer than 10 points.</exception>
        public IList <PermutationFound> SearchMany(IReadOnlyList <UnsignedPoint> points, int indexCount, Permutation <uint> startingPermutation = null)
        {
            if (points == null)
            {
                throw new ArgumentNullException(nameof(points));
            }
            if (points.Count < 10)
            {
                throw new ArgumentException("List has too few elements", nameof(points));
            }
            var queue      = new BinaryHeap <PermutationFound>(BinaryHeapType.MaxHeap, indexCount);
            int dimensions = points[0].Dimensions;

            if (startingPermutation == null)
            {
                startingPermutation = new Permutation <uint>(dimensions);
            }
            List <UnsignedPoint> firstCurve;

            if (ProblemSize(points.Count, dimensions, BitsPerDimension) < 1500000000L)
            {
                firstCurve = HilbertSort.Sort(points, BitsPerDimension, startingPermutation);
            }
            else
            {
                // Used for larger problems.
                firstCurve = SmallBucketSort <UnsignedPoint> .Sort(points, point => point.Coordinates.HilbertIndex(BitsPerDimension));
            }

            // Measure our first index, then loop through random permutations
            // looking for a better one, always accumulating the best in results.
            var metricResults = Metric(firstCurve);
            var bestResults   = new PermutationFound(startingPermutation, firstCurve, metricResults.Item1, metricResults.Item2);

            LowestCountSeen = Min(LowestCountSeen, bestResults.EstimatedClusterCount);
            Logger.Info($"Cluster count Starts at: {bestResults}");
            var startingCount = bestResults.EstimatedClusterCount;

            // With a single iteration allowed there is nothing to search: the first curve is the answer.
            if (MaxIterations <= 1)
            {
                return new List <PermutationFound> { bestResults };
            }
            queue.AddRemove(bestResults);

            // Decide if we are to sample points or use them all
            var sampledPoints = points;
            var sampleSize    = points.Count;

            if (UseSample)
            {
                sampleSize = SampleSize(points, bestResults.EstimatedClusterCount);

                sampledPoints = Sample(points, sampleSize);
                Logger.Info($"    Sample is {sampleSize} of {points.Count} points");
            }
            var rejectedSampleSizes = new HashSet <int>();

            var iterationsWithoutImprovement = 0;
            var parallelOpts = new ParallelOptions {
                MaxDegreeOfParallelism = EstimateMaxDegreesOfParallelism(sampledPoints)
            };

            List <Permutation <uint> > allPermutations = null;

            // If the number of dimensions is small, we might waste time trying the same randomly chosen permutations multiple times.
            // Instead, we will try all or many of them in order.
            if (dimensions <= 7)
            {
                allPermutations = Permutation <uint> .AllPermutations(dimensions).ToList();
            }

            for (var iteration = 0; iteration < MaxIterations; iteration++)
            {
                var improvedCount        = 0;
                var startFromPermutation = bestResults.PermutationUsed;
                Parallel.For(0, ParallelTrials, parallelOpts,
                             i =>
                {
                    Permutation <uint> permutationToTry;
                    // This locking is needed because we use a static random number generator to create a new permutation.
                    // It is more expensive to make the random number generator threadsafe than to make this loop threadsafe.
                    if (dimensions > 7)
                    {
                        lock (startFromPermutation)
                        {
                            permutationToTry = PermutationStrategy(startFromPermutation, dimensions, iteration);
                        }
                    }
                    else
                    {
                        lock (allPermutations)
                        {
                            // Every permutation has already been handed out: this trial has nothing to do.
                            if (!allPermutations.Any())
                            {
                                return;
                            }
                            permutationToTry = allPermutations.Last();
                            allPermutations.RemoveAt(allPermutations.Count - 1);
                        }
                    }
                    // Take a stable snapshot of the current sample; another trial may swap in a larger one.
                    IReadOnlyList <UnsignedPoint> sampledPointsToUse;
                    lock (points)
                    {
                        sampledPointsToUse = sampledPoints;
                    }
                    var curveToTry   = HilbertSort.Sort(sampledPointsToUse, BitsPerDimension, permutationToTry);
                    // Keep this trial's metrics in a local. Writing them to a variable shared by all
                    // trials would let a concurrent trial overwrite them before they are read back,
                    // pairing this permutation with another permutation's metrics.
                    var trialMetrics = Metric(curveToTry);
                    var resultsToTry = new PermutationFound(permutationToTry, curveToTry, trialMetrics.Item1, trialMetrics.Item2);
                    // The queue lock also guards bestResults, sampleSize and rejectedSampleSizes.
                    lock (queue)
                    {
                        if (resultsToTry.EstimatedClusterCount < startingCount / 4 &&
                            UseSample && sampleSize != points.Count)
                        {
                            // If the cluster count has improved too much and we are sampled,
                            // reject it and increase the sample size.
                            // Why? If the clusters are irregular, sampling can break
                            // them into so many small pieces that most points end up in outliers.
                            // This leads to a false low count.
                            if (!rejectedSampleSizes.Contains(curveToTry.Count))
                            {
                                sampleSize = Math.Min(points.Count, 3 * curveToTry.Count / 2);
                                Logger.Info($"Increasing sample size to {sampleSize} because estimated K = {resultsToTry.EstimatedClusterCount} (not trusted)");
                                var newSampledPoints = Sample(points, sampleSize);
                                lock (points)
                                {
                                    sampledPoints = newSampledPoints;
                                }
                                rejectedSampleSizes.Add(curveToTry.Count);
                            }
                        }
                        else
                        {
                            queue.AddRemove(resultsToTry);
                            var improved = resultsToTry.IsBetterThan(bestResults);
                            if (improved)
                            {
                                bestResults = resultsToTry;
                                Interlocked.Add(ref improvedCount, 1);
                                LowestCountSeen = Math.Min(LowestCountSeen, bestResults.EstimatedClusterCount);
                                Logger.Info($"Cluster count Improved to: {bestResults}");
                            }
                        }
                    }
                });
                if (improvedCount > 0)
                {
                    iterationsWithoutImprovement = 0;
                }
                else
                {
                    iterationsWithoutImprovement++;
                }
                if (iterationsWithoutImprovement >= MaxIterationsWithoutImprovement)
                {
                    break;
                }
                if (bestResults.EstimatedClusterCount <= 2)
                {
                    break; // No point in continuing!
                }
            }
            // Reverse the drained queue so that the best result comes first.
            var indicesFound = queue.RemoveAll().Reverse().ToList();

            if (sampledPoints.Count < points.Count)
            {
                // Results are based on Sampled set of points. Now we need to recreate these indices using the
                // full set of points.
                var unsampledIndices = indicesFound.Select(i => Unsample(points, i)).ToList();
                Logger.Info($"Final, unsampled Cluster count: {unsampledIndices[0]}");
                return(unsampledIndices);
            }
            else
            {
                return(indicesFound);
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Select either the Top N or Bottom N items in sorted order from the given collection, in parallel.
        ///
        /// This only performs a partial sort. Several tasks share a single enumerator over the items: each task
        /// pulls a batch of items while holding a lock, filters the batch through its own heap, and finally
        /// contributes its heap contents to a shared list. That list is then reduced serially to the final k items.
        /// </summary>
        /// <typeparam name="TElement">Type of element in the collection.</typeparam>
        /// <param name="items">Collection of items to sort and select.</param>
        /// <param name="topN">If true, find the Top N items in descending order, otherwise the Bottom N items in ascending order.</param>
        /// <param name="k">Number of items to select.</param>
        /// <param name="comparisonDelegate">If null, assume the items are IComparable and sort them according to their natural ordering.
        /// If not null, use this in the comparisons to establish the ordering.</param>
        /// <param name="options">If null, use the default values, otherwise use these options to control the parallelism.
        /// A TaskCount of one or less selects the serial implementation; a BatchSize below one is treated as one.</param>
        /// <returns>The Top N or Bottom N items, as requested, sorted appropriately</returns>
        static IEnumerable <TElement> SelectParallel <TElement>(IEnumerable <TElement> items, bool topN, int k,
                                                                IComparer <TElement> comparisonDelegate = null, SelectParallelOptions options = null)
        {
            options = options ?? new SelectParallelOptions();

            // If we are only dedicating a single task to the operation, do it serially to save on Task overhead.
            // A zero or negative task count is handled the same way: with no tasks the parallel path would
            // silently produce no results (or fail while allocating the task array).
            if (options.TaskCount <= 1)
            {
                return(SelectSerial(items, topN, k, comparisonDelegate));
            }

            // A batch size below one would never read an item and never detect the end of the input,
            // so the read loop would spin forever. Always read at least one item per batch.
            var batchSize    = options.BatchSize > 0 ? options.BatchSize : 1;
            var tasks        = new Task[options.TaskCount];
            var extremeItems = new List <TElement>();

            // The enumerator is shared by every task and is disposed only after all of them have finished.
            using (var enumerator = items.GetEnumerator())
            {
                for (var i = 0; i < options.TaskCount; i++)
                {
                    var iTask = i;
                    // Each task owns its private batch buffer.
                    var batch = new TElement[batchSize];
                    tasks[iTask] = Task.Factory.StartNew(() =>
                    {
                        var heap      = new BinaryHeap <TElement>(topN ? BinaryHeapType.MinHeap : BinaryHeapType.MaxHeap, k + 1, comparisonDelegate);
                        var moreItems = true;
                        while (moreItems)
                        {
                            // Reading from the shared enumerator must be serialized.
                            var iReadCount = 0;
                            lock (enumerator)
                            {
                                for (var iBatch = 0; iBatch < batchSize && moreItems; iBatch++)
                                {
                                    if (enumerator.MoveNext())
                                    {
                                        batch[iReadCount++] = enumerator.Current;
                                    }
                                    else
                                    {
                                        moreItems = false;
                                    }
                                }
                            }
                            // Process the batch outside the lock so other tasks can read in the meantime.
                            for (var iBatch = 0; iBatch < iReadCount; iBatch++)
                            {
                                var item = batch[iBatch];
                                if (k + 1 > heap.Count)
                                {
                                    heap.Add(item);
                                }
                                else if (heap.IsLessExtreme(item))
                                {
                                    heap.Remove();
                                    heap.Add(item);
                                }
                            }
                        }
                        lock (extremeItems)
                        {
                            extremeItems.AddRange(heap.RemoveAll());
                        }
                    });
                }
                Task.WaitAll(tasks);
            }
            //  At this point we have as many as (k + 1)*TaskCount items left. Take the k most extreme.
            return(SelectSerial(extremeItems, topN, k, comparisonDelegate));
        }