/// <summary>
/// Value equality: true iff <paramref name="obj"/> is a non-null Data whose
/// IndexFound and Hash both equal this instance's.
/// </summary>
/// <param name="obj">Object to compare against; non-Data values compare unequal.</param>
/// <returns>True if equal, false otherwise.</returns>
public override bool Equals(object obj)
{
    // A failed as-cast (including a null argument) yields null,
    // which short-circuits the whole expression to false.
    var other = obj as Data;
    return other != null
        && IndexFound.Equals(other.IndexFound)
        && Hash.Equals(other.Hash);
}
/// <summary>
/// Create and measure a new HilbertIndex using all the points, not just a sample of them,
/// but use the same permutation.
/// </summary>
/// <param name="allPoints">The complete, unsampled point set.</param>
/// <param name="sampled">Result previously obtained from a sampled index; supplies the permutation to reuse.</param>
/// <returns>A freshly measured IndexFound built over the full point set.</returns>
private IndexFound Unsample(IList<HilbertPoint> allPoints, IndexFound sampled)
{
    // Rebuild the index over every point, reusing the permutation discovered on the sample.
    var fullIndex = new HilbertIndex(allPoints, sampled.PermutationUsed);
    var measurement = Metric(fullIndex);
    var unsampledResult = new IndexFound(sampled.PermutationUsed, fullIndex, measurement.Item1, measurement.Item2);
    if (ShouldCompact)
    {
        unsampledResult.Compact();
    }
    return unsampledResult;
}
/// <summary>
/// Search many HilbertIndex objects, each based on a different permutation of the dimensions, and
/// keep the ones yielding the best Metrics, likely those that estimate the lowest values
/// for the number of clusters.
/// </summary>
/// <param name="points">Points to index.</param>
/// <param name="indexCount">Number of the best indices to return.
/// For example, if this is 10, then the 10 indices with the lowest scores will be kept.</param>
/// <param name="startingPermutation">Starting permutation.</param>
/// <returns>The best indices found and the permutations that generated them.
/// The first item in the returned list is the best of the best, and the last is the worst of the best.</returns>
public IList<IndexFound> SearchMany(IList<HilbertPoint> points, int indexCount, Permutation<uint> startingPermutation = null)
{
    if (points.Count < 10)
        throw new ArgumentException("List has too few elements", nameof(points));
    var queue = new BinaryHeap<IndexFound>(BinaryHeapType.MaxHeap, indexCount);
    int dimensions = points[0].Dimensions;
    var bitsPerDimension = points[0].BitsPerDimension;
    if (startingPermutation == null)
        startingPermutation = new Permutation<uint>(dimensions);
    var firstIndex = new HilbertIndex(points, startingPermutation);

    // Measure our first index, then loop through random permutations
    // looking for a better one, always accumulating the best in results.
    var metricResults = Metric(firstIndex);
    var bestResults = new IndexFound(startingPermutation, firstIndex, metricResults.Item1, metricResults.Item2);
    if (ShouldCompact)
        bestResults.Compact();
    LowestCountSeen = Math.Min(LowestCountSeen, bestResults.EstimatedClusterCount);
    Logger.Info($"Cluster count Starts at: {bestResults}");
    var startingCount = bestResults.EstimatedClusterCount;
    queue.AddRemove(bestResults);

    // Decide if we are to sample points or use them all.
    var sampledPoints = points;
    var sampleSize = points.Count;
    if (UseSample)
    {
        sampleSize = SampleSize(points, bestResults.EstimatedClusterCount);
        sampledPoints = Sample(points, sampleSize);
        Logger.Info($"  Sample is {sampleSize} of {points.Count} points");
    }
    var rejectedSampleSizes = new HashSet<int>();
    var iterationsWithoutImprovement = 0;
    var parallelOpts = new ParallelOptions
    {
        MaxDegreeOfParallelism = EstimateMaxDegreesOfParallelism(sampledPoints)
    };

    // If the number of dimensions is small, we might waste time trying the same randomly
    // chosen permutations multiple times. Instead, we will try all or many of them in order.
    List<Permutation<uint>> allPermutations = null;
    if (dimensions <= 7)
        allPermutations = Permutation<uint>.AllPermutations(dimensions).ToList();

    for (var iteration = 0; iteration < MaxIterations; iteration++)
    {
        var improvedCount = 0;
        var startFromPermutation = bestResults.PermutationUsed;
        Parallel.For(0, ParallelTrials, parallelOpts, i =>
        {
            Permutation<uint> permutationToTry;
            if (dimensions > 7)
            {
                // This locking is needed because we use a static random number generator
                // to create a new permutation. It is more expensive to make the random
                // number generator threadsafe than to make this loop threadsafe.
                lock (startFromPermutation)
                {
                    permutationToTry = PermutationStrategy(startFromPermutation, dimensions, iteration);
                }
            }
            else
            {
                // Low-dimension path: pop the next untried permutation from the shared list.
                lock (allPermutations)
                {
                    if (!allPermutations.Any())
                        return;
                    permutationToTry = allPermutations.Last();
                    allPermutations.RemoveAt(allPermutations.Count - 1);
                }
            }
            // sampledPoints may be swapped out by another thread when a sample is rejected,
            // so take a consistent snapshot under the same lock used for the swap.
            IList<HilbertPoint> sampledPointsToUse;
            lock (points)
            {
                sampledPointsToUse = sampledPoints;
            }
            var indexToTry = new HilbertIndex(sampledPointsToUse, permutationToTry);
            // FIX: measure into a lambda-local variable. The original assigned to the
            // captured outer 'metricResults', a data race when ParallelTrials > 1.
            var trialMetric = Metric(indexToTry);
            var resultsToTry = new IndexFound(permutationToTry, indexToTry, trialMetric.Item1, trialMetric.Item2);
            if (ShouldCompact)
                resultsToTry.Compact();
            lock (queue)
            {
                if (resultsToTry.EstimatedClusterCount < startingCount / 4
                    && UseSample && sampleSize != points.Count)
                {
                    // If the cluster count has improved too much and we are sampled,
                    // reject it and increase the sample size.
                    // Why? If the clusters are irregular, sampling can break
                    // them into so many small pieces that most points end up in outliers.
                    // This leads to a false low count.
                    if (!rejectedSampleSizes.Contains(indexToTry.Count))
                    {
                        sampleSize = Math.Min(points.Count, 3 * indexToTry.Count / 2);
                        Logger.Info($"Increasing sample size to {sampleSize} because estimated K = {resultsToTry.EstimatedClusterCount} (not trusted)");
                        var newSampledPoints = Sample(points, sampleSize);
                        lock (points)
                        {
                            sampledPoints = newSampledPoints;
                        }
                        rejectedSampleSizes.Add(indexToTry.Count);
                    }
                }
                else
                {
                    queue.AddRemove(resultsToTry);
                    var improved = resultsToTry.IsBetterThan(bestResults);
                    if (improved)
                    {
                        bestResults = resultsToTry;
                        Interlocked.Add(ref improvedCount, 1);
                        LowestCountSeen = Math.Min(LowestCountSeen, bestResults.EstimatedClusterCount);
                        Logger.Info($"Cluster count Improved to: {bestResults}");
                    }
                }
            }
        });
        if (improvedCount > 0)
            iterationsWithoutImprovement = 0;
        else
            iterationsWithoutImprovement++;
        if (iterationsWithoutImprovement >= MaxIterationsWithoutImprovement)
            break;
        if (bestResults.EstimatedClusterCount <= 2)
            break; // No point in continuing!
    }
    var indicesFound = queue.RemoveAll().Reverse().ToList();
    if (sampledPoints.Count < points.Count)
    {
        // Results are based on a sampled set of points. Recreate these indices
        // using the full set of points before returning them.
        var unsampledIndices = indicesFound.Select(i => Unsample(points, i)).ToList();
        Logger.Info($"Final, unsampled Cluster count: {unsampledIndices[0]}");
        return unsampledIndices;
    }
    return indicesFound;
}
/// <summary>
/// Hash code consistent with Equals: XOR of the IndexFound and Hash component hashes.
/// </summary>
/// <returns>The combined hash code.</returns>
public override int GetHashCode() => IndexFound.GetHashCode() ^ Hash.GetHashCode();