/// <summary>
/// Serially (no parallelism) pick either the Top N or the Bottom N items from a collection
/// by streaming every item through a heap constructed with <paramref name="k"/>.
///
/// Only a partial sort is performed: the whole input is never fully sorted.
/// </summary>
/// <typeparam name="TElement">Type of element in the collection.</typeparam>
/// <param name="items">Collection of items to select from. It is enumerated exactly once,
/// and only when the returned sequence is enumerated (this method is an iterator).</param>
/// <param name="topN">If true, select the Top N items (a Min Heap is used);
/// otherwise select the Bottom N items (a Max Heap is used).</param>
/// <param name="k">Number of items to select; passed to the heap's constructor.</param>
/// <param name="comparisonDelegate">If null, assume the items are IComparable and use their natural ordering.
/// If not null, it is handed to the heap to establish the ordering.</param>
/// <returns>The selected items, yielded in the order in which successive <c>Remove()</c> calls
/// return them from the heap.
/// NOTE(review): for Top N with a Min Heap that is presumably least-extreme first; confirm against BinaryHeap.</returns>
public static IEnumerable<TElement> SelectSerial<TElement>(this IEnumerable<TElement> items, bool topN, int k, IComparer<TElement> comparisonDelegate = null)
{
    // It looks backwards, but Top N is tracked with a Min Heap and Bottom N with a Max Heap.
    var heapKind = topN ? BinaryHeapType.MinHeap : BinaryHeapType.MaxHeap;
    var survivors = new BinaryHeap<TElement>(heapKind, k, comparisonDelegate);

    // Offer every candidate to the heap; the heap decides what it retains.
    foreach (var candidate in items)
    {
        survivors.AddRemove(candidate);
    }

    // Snapshot the count once, then drain exactly that many items.
    var remaining = survivors.Count;
    while (remaining > 0)
    {
        yield return survivors.Remove();
        remaining--;
    }
}
/// <summary>
/// Search many Hilbert orderings of the points, each based on a different permutation of the dimensions, and
/// keep the ones yielding the best Metrics, likely those that estimate the lowest values
/// for the number of clusters.
/// </summary>
/// <param name="points">Points to index. At least ten points are required.</param>
/// <param name="indexCount">Number of the best indices to return.
/// For example, if this is 10, then the 10 indices with the lowest scores will be kept.</param>
/// <param name="startingPermutation">Starting permutation. If null, a new permutation is constructed
/// from the number of dimensions of the first point.</param>
/// <returns>The best indices found and the permutations that generated them.
/// The first item in the returned list is the best of the best, and the last is the worst of the best.
/// If MaxIterations is one or less, only the index for the starting permutation is returned.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="points"/> is null.</exception>
/// <exception cref="ArgumentException">If <paramref name="points"/> holds fewer than ten points.</exception>
public IList<PermutationFound> SearchMany(IReadOnlyList<UnsignedPoint> points, int indexCount, Permutation<uint> startingPermutation = null)
{
    if (points == null)
    {
        throw new ArgumentNullException(nameof(points));
    }
    if (points.Count < 10)
    {
        throw new ArgumentException("List has too few elements", nameof(points));
    }

    var queue = new BinaryHeap<PermutationFound>(BinaryHeapType.MaxHeap, indexCount);
    int dimensions = points[0].Dimensions;

    if (startingPermutation == null)
    {
        startingPermutation = new Permutation<uint>(dimensions);
    }

    List<UnsignedPoint> firstCurve;
    if (ProblemSize(points.Count, dimensions, BitsPerDimension) < 1500000000L)
    {
        firstCurve = HilbertSort.Sort(points, BitsPerDimension, startingPermutation);
    }
    else
    {
        // Used for larger problems.
        firstCurve = SmallBucketSort<UnsignedPoint>.Sort(points, point => point.Coordinates.HilbertIndex(BitsPerDimension));
    }

    // Measure our first index, then loop through permutations
    // looking for a better one, always accumulating the best in results.
    var metricResults = Metric(firstCurve);
    var bestResults = new PermutationFound(startingPermutation, firstCurve, metricResults.Item1, metricResults.Item2);
    LowestCountSeen = Min(LowestCountSeen, bestResults.EstimatedClusterCount);
    Logger.Info($"Cluster count Starts at: {bestResults}");
    var startingCount = bestResults.EstimatedClusterCount;
    if (MaxIterations <= 1)
    {
        return new List<PermutationFound> { bestResults };
    }
    queue.AddRemove(bestResults);

    // Decide if we are to sample points or use them all
    var sampledPoints = points;
    var sampleSize = points.Count;
    if (UseSample)
    {
        sampleSize = SampleSize(points, bestResults.EstimatedClusterCount);
        sampledPoints = Sample(points, sampleSize);
        Logger.Info($" Sample is {sampleSize} of {points.Count} points");
    }

    var rejectedSampleSizes = new HashSet<int>();
    var iterationsWithoutImprovement = 0;
    var parallelOpts = new ParallelOptions
    {
        MaxDegreeOfParallelism = EstimateMaxDegreesOfParallelism(sampledPoints)
    };

    // If the number of dimensions is small, we might waste time trying the same randomly chosen permutations multiple times.
    // Instead, we will try all or many of them in order.
    List<Permutation<uint>> allPermutations = null;
    if (dimensions <= 7)
    {
        allPermutations = Permutation<uint>.AllPermutations(dimensions).ToList();
    }

    for (var iteration = 0; iteration < MaxIterations; iteration++)
    {
        var improvedCount = 0;
        var startFromPermutation = bestResults.PermutationUsed;
        Parallel.For(0, ParallelTrials, parallelOpts, i =>
        {
            Permutation<uint> permutationToTry;
            // This locking is needed because we use a static random number generator to create a new permutation.
            // It is more expensive to make the random number generator threadsafe than to make this loop threadsafe.
            if (dimensions > 7)
            {
                lock (startFromPermutation)
                {
                    permutationToTry = PermutationStrategy(startFromPermutation, dimensions, iteration);
                }
            }
            else
            {
                // Take the next untried permutation off the end of the shared list.
                lock (allPermutations)
                {
                    if (allPermutations.Count == 0)
                    {
                        return;
                    }
                    permutationToTry = allPermutations[allPermutations.Count - 1];
                    allPermutations.RemoveAt(allPermutations.Count - 1);
                }
            }

            // The sample may be replaced by another trial (see below), so read it under the same lock.
            IReadOnlyList<UnsignedPoint> sampledPointsToUse;
            lock (points)
            {
                sampledPointsToUse = sampledPoints;
            }

            var curveToTry = HilbertSort.Sort(sampledPointsToUse, BitsPerDimension, permutationToTry);

            // Keep this trial's metric in a local. The outer metricResults variable is captured by this
            // lambda and shared by all trials, so assigning to it here would let concurrent trials overwrite
            // one another's result between the Metric call and the reads of Item1 and Item2.
            var trialMetrics = Metric(curveToTry);
            var resultsToTry = new PermutationFound(permutationToTry, curveToTry, trialMetrics.Item1, trialMetrics.Item2);

            lock (queue)
            {
                if (resultsToTry.EstimatedClusterCount < startingCount / 4 && UseSample && sampleSize != points.Count)
                {
                    // If the cluster count has improved too much and we are sampled,
                    // reject it and increase the sample size.
                    // Why? If the clusters are irregular, sampling can break
                    // them into so many small pieces that most points end up in outliers.
                    // This leads to a false low count.
                    if (!rejectedSampleSizes.Contains(curveToTry.Count))
                    {
                        sampleSize = Math.Min(points.Count, 3 * curveToTry.Count / 2);
                        Logger.Info($"Increasing sample size to {sampleSize} because estimated K = {resultsToTry.EstimatedClusterCount} (not trusted)");
                        var newSampledPoints = Sample(points, sampleSize);
                        lock (points)
                        {
                            sampledPoints = newSampledPoints;
                        }
                        rejectedSampleSizes.Add(curveToTry.Count);
                    }
                }
                else
                {
                    queue.AddRemove(resultsToTry);
                    var improved = resultsToTry.IsBetterThan(bestResults);
                    if (improved)
                    {
                        bestResults = resultsToTry;
                        Interlocked.Increment(ref improvedCount);
                        LowestCountSeen = Math.Min(LowestCountSeen, bestResults.EstimatedClusterCount);
                        Logger.Info($"Cluster count Improved to: {bestResults}");
                    }
                }
            }
        });

        if (improvedCount > 0)
        {
            iterationsWithoutImprovement = 0;
        }
        else
        {
            iterationsWithoutImprovement++;
        }

        if (iterationsWithoutImprovement >= MaxIterationsWithoutImprovement)
        {
            break;
        }
        if (bestResults.EstimatedClusterCount <= 2)
        {
            break; // No point in continuing!
        }
    }

    // The heap is drained and the sequence reversed so that the best index comes first.
    var indicesFound = queue.RemoveAll().Reverse().ToList();
    if (sampledPoints.Count < points.Count)
    {
        // Results are based on Sampled set of points. Now we need to recreate these indices using the
        // full set of points.
        var unsampledIndices = indicesFound.Select(i => Unsample(points, i)).ToList();
        Logger.Info($"Final, unsampled Cluster count: {unsampledIndices[0]}");
        return unsampledIndices;
    }

    return indicesFound;
}
/// <summary>
/// Select either the Top N or Bottom N items from the given collection, in parallel.
///
/// Several tasks pull batches of items from one shared enumerator, each task keeps its own heap of
/// candidates, and the candidates from all tasks are merged with a final serial selection.
/// This only performs a partial sort.
/// </summary>
/// <typeparam name="TElement">Type of element in the collection.</typeparam>
/// <param name="items">Collection of items to sort and select. It is enumerated once.</param>
/// <param name="topN">If true, find the Top N items (Min Heap), otherwise the Bottom N items (Max Heap).</param>
/// <param name="k">Number of items to select.</param>
/// <param name="comparisonDelegate">If null, assume the items are IComparable and sort them according to their natural ordering.
/// If not null, use this in the comparisons to establish the ordering.</param>
/// <param name="options">If null, use the default values, otherwise use these options to control the parallelism.
/// A TaskCount of one or less selects the serial implementation; a BatchSize below one is treated as one.</param>
/// <returns>The Top N or Bottom N items, as requested, in the order produced by SelectSerial.</returns>
static IEnumerable<TElement> SelectParallel<TElement>(IEnumerable<TElement> items, bool topN, int k, IComparer<TElement> comparisonDelegate = null, SelectParallelOptions options = null)
{
    options = options ?? new SelectParallelOptions();

    // If we are only dedicating a single task to the operation, do it serially to save on Task overhead.
    // A non-positive TaskCount also goes this way: with no tasks started nothing would ever be read
    // and the caller would silently get an empty result.
    if (options.TaskCount <= 1)
    {
        return SelectSerial(items, topN, k, comparisonDelegate);
    }

    // A batch size of zero would never call MoveNext, so the read loop below could never observe the
    // end of the input and would spin forever. Always read at least one item per batch.
    var batchSize = options.BatchSize > 0 ? options.BatchSize : 1;

    var tasks = new Task[options.TaskCount];
    var extremeItems = new List<TElement>();

    // Dedicated lock object guarding the shared enumerator, rather than locking on the enumerator itself.
    var enumeratorGate = new object();

    // The enumerator is shared by all tasks and must be disposed once they are done with it,
    // including when Task.WaitAll throws because a task faulted.
    using (var enumerator = items.GetEnumerator())
    {
        for (var i = 0; i < tasks.Length; i++)
        {
            // Each task owns a private buffer so that items can be processed outside the lock.
            var batch = new TElement[batchSize];
            tasks[i] = Task.Factory.StartNew(() =>
            {
                // Each task retains up to k + 1 candidates in its own heap.
                var heap = new BinaryHeap<TElement>(topN ? BinaryHeapType.MinHeap : BinaryHeapType.MaxHeap, k + 1, comparisonDelegate);
                var moreItems = true;
                while (moreItems)
                {
                    // Fill the private buffer while holding the lock...
                    var readCount = 0;
                    lock (enumeratorGate)
                    {
                        for (var iBatch = 0; iBatch < batchSize && moreItems; iBatch++)
                        {
                            if (enumerator.MoveNext())
                            {
                                batch[readCount++] = enumerator.Current;
                            }
                            else
                            {
                                moreItems = false;
                            }
                        }
                    }

                    // ...then do the heap work without blocking the other tasks.
                    for (var iBatch = 0; iBatch < readCount; iBatch++)
                    {
                        var item = batch[iBatch];
                        if (k + 1 > heap.Count)
                        {
                            heap.Add(item);
                        }
                        else if (heap.IsLessExtreme(item))
                        {
                            // Heap is full: swap one retained item for this one.
                            // NOTE(review): presumably IsLessExtreme means the heap's top is less extreme
                            // than item - confirm against BinaryHeap.
                            heap.Remove();
                            heap.Add(item);
                        }
                    }
                }

                // Merge this task's candidates into the shared list.
                lock (extremeItems)
                {
                    extremeItems.AddRange(heap.RemoveAll());
                }
            });
        }

        Task.WaitAll(tasks);
    }

    // At this point we have as many as (k + 1) * TaskCount items left. Take the k most extreme.
    return SelectSerial(extremeItems, topN, k, comparisonDelegate);
}