/// <summary>
/// Sort the items by a sort key derived by calling the supplied delegate on each item.
///
/// The list is copied during the sort operation, so items are not sorted in place.
/// </summary>
/// <typeparam name="T">Type of item to be sorted.</typeparam>
/// <param name="items">Items to be sorted.</param>
/// <param name="ordering">Delegate that extracts the sort key from an item.</param>
/// <returns>A new List of the original items, now sorted.</returns>
public static List<T> Sort<T>(IReadOnlyList<T> items, Func<T, IComparable> ordering)
{
    // Copy first so the caller's list is never mutated.
    var unsortedItems = items.ToList();
    var sorter = new SmallBucketSort<T>(unsortedItems, ordering);
    return sorter.Sort();
}
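// A minimal usage sketch (the word list below is hypothetical, not part of this library):
// sort strings by length via the key-extraction delegate, leaving the original list untouched.
//
//     var words = new List<string> { "pear", "fig", "banana" };
//     var byLength = SmallBucketSort<string>.Sort(words, w => w.Length);
//     // byLength is a new list { "fig", "pear", "banana" }; words keeps its original order.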
/// <summary>
/// Search many Hilbert orderings of the points, each based on a different permutation of the dimensions, and
/// keep the ones yielding the best Metrics, likely those that estimate the lowest values
/// for the number of clusters.
/// </summary>
/// <param name="points">Points to index.</param>
/// <param name="indexCount">Number of the best indices to return.
/// For example, if this is 10, then the 10 indices with the lowest scores will be kept.</param>
/// <param name="startingPermutation">Starting permutation.</param>
/// <returns>The best indices found and the permutations that generated them.
/// The first item in the returned list is the best of the best, and the last is the worst of the best.</returns>
public IList<PermutationFound> SearchMany(IReadOnlyList<UnsignedPoint> points, int indexCount, Permutation<uint> startingPermutation = null)
{
    if (points.Count < 10)
        throw new ArgumentException("List has too few elements", nameof(points));
    var queue = new BinaryHeap<PermutationFound>(BinaryHeapType.MaxHeap, indexCount);
    int dimensions = points[0].Dimensions;
    if (startingPermutation == null)
        startingPermutation = new Permutation<uint>(dimensions);

    List<UnsignedPoint> firstCurve;
    if (ProblemSize(points.Count, dimensions, BitsPerDimension) < 1500000000L)
        firstCurve = HilbertSort.Sort(points, BitsPerDimension, startingPermutation);
    else
    {
        // Used for larger problems.
        firstCurve = SmallBucketSort<UnsignedPoint>.Sort(points,
            point => point.Coordinates.HilbertIndex(BitsPerDimension));
    }

    // Measure our first index, then loop through random permutations
    // looking for a better one, always accumulating the best in results.
    var metricResults = Metric(firstCurve);
    var bestResults = new PermutationFound(startingPermutation, firstCurve, metricResults.Item1, metricResults.Item2);
    LowestCountSeen = Math.Min(LowestCountSeen, bestResults.EstimatedClusterCount);
    Logger.Info($"Cluster count Starts at: {bestResults}");
    var startingCount = bestResults.EstimatedClusterCount;
    if (MaxIterations <= 1)
        return new List<PermutationFound> { bestResults };
    queue.AddRemove(bestResults);

    // Decide if we are to sample points or use them all.
    var sampledPoints = points;
    var sampleSize = points.Count;
    if (UseSample)
    {
        sampleSize = SampleSize(points, bestResults.EstimatedClusterCount);
        sampledPoints = Sample(points, sampleSize);
        Logger.Info($" Sample is {sampleSize} of {points.Count} points");
    }
    var rejectedSampleSizes = new HashSet<int>();
    var iterationsWithoutImprovement = 0;
    var parallelOpts = new ParallelOptions
    {
        MaxDegreeOfParallelism = EstimateMaxDegreesOfParallelism(sampledPoints)
    };
    List<Permutation<uint>> allPermutations = null;
    // If the number of dimensions is small, we might waste time trying the same randomly
    // chosen permutations multiple times. Instead, we will try all or many of them in order.
    if (dimensions <= 7)
        allPermutations = Permutation<uint>.AllPermutations(dimensions).ToList();

    for (var iteration = 0; iteration < MaxIterations; iteration++)
    {
        var improvedCount = 0;
        var startFromPermutation = bestResults.PermutationUsed;
        Parallel.For(0, ParallelTrials, parallelOpts, i =>
        {
            Permutation<uint> permutationToTry;
            // This locking is needed because we use a static random number generator to create a new permutation.
            // It is more expensive to make the random number generator threadsafe than to make this loop threadsafe.
            if (dimensions > 7)
            {
                lock (startFromPermutation)
                {
                    permutationToTry = PermutationStrategy(startFromPermutation, dimensions, iteration);
                }
            }
            else
            {
                lock (allPermutations)
                {
                    if (!allPermutations.Any())
                        return;
                    // Draw the next untried permutation from the end of the list.
                    permutationToTry = allPermutations.Last();
                    allPermutations.RemoveAt(allPermutations.Count - 1);
                }
            }
            IReadOnlyList<UnsignedPoint> sampledPointsToUse;
            lock (points)
            {
                sampledPointsToUse = sampledPoints;
            }
            var curveToTry = HilbertSort.Sort(sampledPointsToUse, BitsPerDimension, permutationToTry);
            // Use a local for this trial's metrics; writing to the shared metricResults
            // variable here would race with other parallel trials.
            var trialMetrics = Metric(curveToTry);
            var resultsToTry = new PermutationFound(permutationToTry, curveToTry, trialMetrics.Item1, trialMetrics.Item2);
            lock (queue)
            {
                if (resultsToTry.EstimatedClusterCount < startingCount / 4 && UseSample && sampleSize != points.Count)
                {
                    // If the cluster count has improved too much and we are sampled,
                    // reject it and increase the sample size.
                    // Why? If the clusters are irregular, sampling can break
                    // them into so many small pieces that most points end up in outliers.
                    // This leads to a false low count.
                    if (!rejectedSampleSizes.Contains(curveToTry.Count))
                    {
                        sampleSize = Math.Min(points.Count, 3 * curveToTry.Count / 2);
                        Logger.Info($"Increasing sample size to {sampleSize} because estimated K = {resultsToTry.EstimatedClusterCount} (not trusted)");
                        var newSampledPoints = Sample(points, sampleSize);
                        lock (points)
                        {
                            sampledPoints = newSampledPoints;
                        }
                        rejectedSampleSizes.Add(curveToTry.Count);
                    }
                }
                else
                {
                    queue.AddRemove(resultsToTry);
                    var improved = resultsToTry.IsBetterThan(bestResults);
                    if (improved)
                    {
                        bestResults = resultsToTry;
                        Interlocked.Add(ref improvedCount, 1);
                        LowestCountSeen = Math.Min(LowestCountSeen, bestResults.EstimatedClusterCount);
                        Logger.Info($"Cluster count Improved to: {bestResults}");
                    }
                }
            }
        });
        if (improvedCount > 0)
            iterationsWithoutImprovement = 0;
        else
            iterationsWithoutImprovement++;
        if (iterationsWithoutImprovement >= MaxIterationsWithoutImprovement)
            break;
        if (bestResults.EstimatedClusterCount <= 2)
            break; // No point in continuing!
    }

    var indicesFound = queue.RemoveAll().Reverse().ToList();
    if (sampledPoints.Count < points.Count)
    {
        // Results are based on a sampled set of points. Now we need to recreate
        // these indices using the full set of points.
        //TODO: "Unsample" the indices.
        var unsampledIndices = indicesFound.Select(i => Unsample(points, i)).ToList();
        Logger.Info($"Final, unsampled Cluster count: {unsampledIndices[0]}");
        return unsampledIndices;
    }
    else
        return indicesFound;
}
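// A minimal usage sketch for SearchMany (the instance construction and the point-loading
// helper are assumptions, not shown in this file): keep the five best Hilbert orderings.
//
//     IReadOnlyList<UnsignedPoint> points = LoadPoints();     // hypothetical helper
//     var best = searcher.SearchMany(points, indexCount: 5);  // 'searcher' is an instance of this class
//     var bestOfBest = best[0];  // the first result has the lowest estimated cluster count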