/// <summary>
/// Initializes a new instance of the <see cref="T:Clustering.DensityMeter"/> class.
/// </summary>
/// <param name="index">Index over the points whose density will be measured.</param>
/// <param name="neighborhoodRadius">Distance bound defining a point's neighborhood;
/// also passed to the DistanceMemo so it knows which distances to memoize.</param>
/// <param name="memoWindowRadius">Window radius forwarded to the DistanceMemo;
/// zero (the default) lets the memo pick its own window size.</param>
public DensityMeter(HilbertIndex index, long neighborhoodRadius, int memoWindowRadius = 0)
{
	Index = index;
	NeighborhoodRadius = neighborhoodRadius;
	// No estimator by default. A typical one would count near neighbors:
	// distances => distances.Count(d => d <= NeighborhoodRadius);
	Estimator = null;
	Distances = new DistanceMemo(index, neighborhoodRadius, memoWindowRadius);
}
/// <summary>
/// Initializes a new instance of the <see cref="T:Clustering.IndexFound"/> class,
/// bundling an index with the permutation that produced it and the metrics measured for it.
/// </summary>
/// <param name="permutation">Permutation of the dimensions used to build the index.</param>
/// <param name="index">The Hilbert index that was measured.</param>
/// <param name="estimatedClusterCount">Estimated number of clusters found via this index.</param>
/// <param name="mergeSquareDistance">Square distance threshold associated with the estimate.</param>
public IndexFound(Permutation<uint> permutation, HilbertIndex index, int estimatedClusterCount, long mergeSquareDistance)
{
	PermutationUsed = permutation;
	Index = index;
	EstimatedClusterCount = estimatedClusterCount;
	MergeSquareDistance = mergeSquareDistance;
}
/// <summary>
/// Initializes a new instance of the <see cref="T:Clustering.DensityClassifier"/> class.
/// </summary>
/// <param name="index">Index of the points to classify.</param>
/// <param name="mergeSquareDistance">Square distance used when deciding whether clusters merge.</param>
/// <param name="unmergeableSize">Size threshold beyond which a cluster is considered unmergeable.</param>
public DensityClassifier(HilbertIndex index, long mergeSquareDistance, int unmergeableSize)
{
	Index = index;
	MergeSquareDistance = mergeSquareDistance;
	UnmergeableSize = unmergeableSize;
	// Seed the classification with sequential numeric labels: the labeler hands out
	// a fresh label ("1", "2", ...) each time it is invoked.
	int nextLabel = 1;
	Clusters = new Classification<UnsignedPoint, string>(Index.SortedPoints, p => (nextLabel++).ToString());
}
/// <summary>
/// Create and measure a new HilbertIndex using all the points, not just a sample of them,
/// but use the same permutation that was discovered while sampling.
/// </summary>
/// <param name="allPoints">The full, unsampled point set.</param>
/// <param name="sampled">Result obtained from the sampled point set; supplies the permutation.</param>
/// <returns>A new result measured against the full point set.</returns>
private IndexFound Unsample(IList<HilbertPoint> allPoints, IndexFound sampled)
{
	var fullIndex = new HilbertIndex(allPoints, sampled.PermutationUsed);
	var metrics = Metric(fullIndex);
	var unsampled = new IndexFound(sampled.PermutationUsed, fullIndex, metrics.Item1, metrics.Item2);
	if (ShouldCompact)
		unsampled.Compact();
	return unsampled;
}
/// <summary>
/// Initializes a new instance of the <see cref="T:Clustering.DistanceMemo"/> class.
/// </summary>
/// <param name="index">Index of points whose distances will be measured and remembered.</param>
/// <param name="neighborhoodRadius">We will only memoize distances that are less than or equal to this value.
/// Larger distances will need to be recomputed.</param>
/// <param name="windowRadius">Window radius for memoization; a non-positive value selects
/// a default of sqrt(Count / 2).</param>
public DistanceMemo(HilbertIndex index, long neighborhoodRadius, int windowRadius = 0)
{
	Index = index;
	NeighborhoodRadius = neighborhoodRadius;
	// One memo dictionary slot and one "fully measured" flag per point.
	Distances = new Dictionary<int, long>[Count];
	AllMeasured = new bool[Count];
	WindowRadius = windowRadius > 0 ? windowRadius : (int)Math.Sqrt(Count / 2);
}
/// <summary>
/// Apply Density-based reclassification to the FinalClassification.
/// This may cause some clusters to be split into smaller clusters.
/// It will not cause any existing clusters to be merged.
/// </summary>
void ReclassifyByDensity()
{
	// 0. Decide if we will be doing this or not, based on the configuration.
	if (!Configuration.DensityClassifier.SkipDensityClassification)
	{
		Timer.Start("Reclassify by density");
		var numberOfClustersSplit = 0;
		// 1. Loop through all clusters in FinalClassification
		//    We will be modifying FinalClassification while iterating over it,
		//    so we need to copy the list of labels up front.
		var classLabels = FinalClassification.ClassLabels().ToList();
		foreach (var clusterId in classLabels)
		{
			// 2. Decide if the cluster needs reclustering.
			if (NeedsReclustering(clusterId))
			{
				// 3. Obtain the members of the cluster and index them by the Hilbert curve.
				//    lookupPointById lets us translate each HilbertPoint back to its
				//    original UnsignedPoint via the shared UniqueId.
				var pointsToClassify = FinalClassification.PointsInClass(clusterId);
				var lookupPointById = new Dictionary <int, UnsignedPoint>();
				foreach (var p in pointsToClassify)
				{
					lookupPointById[p.UniqueId] = p;
				}
				// Start the sub-classification with one sequential numeric label per call.
				int labelCounter = 1;
				var subClassification = new Classification <UnsignedPoint, string>(pointsToClassify, p => (labelCounter++).ToString());
				var hIndex = new HilbertIndex(subClassification, Configuration.Index.BitsPerDimension);
				// 4. Create a DensityClassifier, properly configured.
				//    unmergeableSize scales with the cluster size being split.
				var unmergeableSize = (int)(pointsToClassify.Count * Configuration.DensityClassifier.UnmergeableSizeFraction);
				var densityClassifier = new DensityClassifier(hIndex, MergeSquareDistance, unmergeableSize)
				{
					NeighborhoodRadiusMultiplier = Configuration.DensityClassifier.NeighborhoodRadiusMultiplier,
					OutlierSize = Configuration.DensityClassifier.OutlierSize,
					MergeableShrinkage = Configuration.DensityClassifier.MergeableShrinkage
				};
				// 5. Reclassify.
				//    This classification is in terms of HilbertPoints, so afterwards we will need to map them to
				//    their non-HilbertPoint, original UnsignedPoints.
				var densityClassification = densityClassifier.Classify();
				// 6. If the number of clusters made from the points is more than one...
				if (densityClassification.NumPartitions > 1)
				{
					numberOfClustersSplit++;
					// 7. ... loop through all HilbertPoints from cluster and find corresponding UnsignedPoints.
					foreach (var hPoint in densityClassification.Points())
					{
						var uPoint = lookupPointById[hPoint.UniqueId];
						// Form the new class label by appending the previous label and the density-based label.
						var previousClassLabel = FinalClassification.GetClassLabel(uPoint);
						var densityClassLabel = densityClassification.GetClassLabel(hPoint);
						var newClassLabel = $"{previousClassLabel}-{densityClassLabel}";
						// 8. Pull point from its current cluster and add it to a new cluster.
						FinalClassification.Remove(uPoint);
						FinalClassification.Add(uPoint, newClassLabel);
					}
				}
			}
		}
		Timer.Stop("Reclassify by density");
		Logger.Info($"Clusters split due to density-based reclassification: {numberOfClustersSplit}");
	}
}
/// <summary>
/// Create a new index based on an existing one, having all the same points, but with their coordinates permuted.
///
/// All the points in the new index will share the same UniqueIds as their corresponding points in the original.
/// To map from a point in one index to the similar point in the other:
///
///     var p2 = hilbertIndex2.Equivalent(p1);
/// </summary>
/// <param name="original">Index whose points are copied and permuted.</param>
/// <param name="permutation">Permutation applied to each point's coordinates.</param>
public HilbertIndex(HilbertIndex original, Permutation<uint> permutation)
{
	var permutedPoints = original.UnsortedPoints.Select(p => p.Permute(permutation)).ToList();
	UnsortedPoints = permutedPoints;
	InitIndexing();
}
/// <summary>
/// Search many HilbertIndex objects, each based on a different permutation of the dimensions, and
/// keep the ones yielding the best Metrics, likely those that estimate the lowest values
/// for the number of clusters.
/// </summary>
/// <param name="points">Points to index.</param>
/// <param name="indexCount">Number of the best indices to return.
/// For example, if this is 10, then the 10 indices with the lowest scores will be kept.</param>
/// <param name="startingPermutation">Starting permutation. If null, a default permutation
/// over the points' dimensions is used.</param>
/// <returns>The best indices found and the permutations that generated them.
/// The first item in the returned list is the best of the best, and the last is the worst of the best.</returns>
public IList<IndexFound> SearchMany(IList<HilbertPoint> points, int indexCount, Permutation<uint> startingPermutation = null)
{
	if (points.Count < 10)
		throw new ArgumentException("List has too few elements", nameof(points));
	var queue = new BinaryHeap<IndexFound>(BinaryHeapType.MaxHeap, indexCount);
	int dimensions = points[0].Dimensions;
	if (startingPermutation == null)
		startingPermutation = new Permutation<uint>(dimensions);
	var firstIndex = new HilbertIndex(points, startingPermutation);

	// Measure our first index, then loop through random permutations
	// looking for a better one, always accumulating the best in results.
	var metricResults = Metric(firstIndex);
	var bestResults = new IndexFound(startingPermutation, firstIndex, metricResults.Item1, metricResults.Item2);
	if (ShouldCompact)
		bestResults.Compact();
	LowestCountSeen = Math.Min(LowestCountSeen, bestResults.EstimatedClusterCount);
	Logger.Info($"Cluster count Starts at: {bestResults}");
	var startingCount = bestResults.EstimatedClusterCount;
	queue.AddRemove(bestResults);

	// Decide if we are to sample points or use them all.
	var sampledPoints = points;
	var sampleSize = points.Count;
	if (UseSample)
	{
		sampleSize = SampleSize(points, bestResults.EstimatedClusterCount);
		sampledPoints = Sample(points, sampleSize);
		Logger.Info($" Sample is {sampleSize} of {points.Count} points");
	}

	var rejectedSampleSizes = new HashSet<int>();
	var iterationsWithoutImprovement = 0;
	var parallelOpts = new ParallelOptions { MaxDegreeOfParallelism = EstimateMaxDegreesOfParallelism(sampledPoints) };

	// Dedicated lock objects. The previous code locked on the "points" parameter and
	// on "startFromPermutation" (a reference that is reassigned every outer iteration);
	// locking on shared or reassigned objects is fragile, so we use private gates.
	var permutationLock = new object(); // Guards the static RNG in PermutationStrategy and the allPermutations list.
	var sampleLock = new object();      // Guards reads and writes of sampledPoints.

	// If the number of dimensions is small, we might waste time trying the same randomly
	// chosen permutations multiple times. Instead, we will try all or many of them in order.
	List<Permutation<uint>> allPermutations = null;
	if (dimensions <= 7)
		allPermutations = Permutation<uint>.AllPermutations(dimensions).ToList();

	for (var iteration = 0; iteration < MaxIterations; iteration++)
	{
		var improvedCount = 0;
		var startFromPermutation = bestResults.PermutationUsed;
		Parallel.For(0, ParallelTrials, parallelOpts, i =>
		{
			Permutation<uint> permutationToTry;
			if (dimensions > 7)
			{
				// This locking is needed because we use a static random number generator to create a new permutation.
				// It is more expensive to make the random number generator threadsafe than to make this loop threadsafe.
				lock (permutationLock)
				{
					permutationToTry = PermutationStrategy(startFromPermutation, dimensions, iteration);
				}
			}
			else
			{
				// Low-dimension case: draw the next untried permutation from the shared list.
				lock (permutationLock)
				{
					if (!allPermutations.Any())
						return;
					permutationToTry = allPermutations.Last();
					allPermutations.RemoveAt(allPermutations.Count - 1);
				}
			}
			IList<HilbertPoint> sampledPointsToUse;
			lock (sampleLock)
			{
				sampledPointsToUse = sampledPoints;
			}
			var indexToTry = new HilbertIndex(sampledPointsToUse, permutationToTry);
			// BUG FIX: the previous code reassigned the captured outer variable "metricResults"
			// here, racing with other parallel trials (one trial could read another trial's
			// tuple). Each trial now uses its own local result.
			var trialMetrics = Metric(indexToTry);
			var resultsToTry = new IndexFound(permutationToTry, indexToTry, trialMetrics.Item1, trialMetrics.Item2);
			if (ShouldCompact)
				resultsToTry.Compact();
			lock (queue)
			{
				if (resultsToTry.EstimatedClusterCount < startingCount / 4 && UseSample && sampleSize != points.Count)
				{
					// If the cluster count has improved too much and we are sampled,
					// reject it and increase the sample size.
					// Why? If the clusters are irregular, sampling can break
					// them into so many small pieces that most points end up in outliers.
					// This leads to a false low count.
					if (!rejectedSampleSizes.Contains(indexToTry.Count))
					{
						sampleSize = Math.Min(points.Count, 3 * indexToTry.Count / 2);
						Logger.Info($"Increasing sample size to {sampleSize} because estimated K = {resultsToTry.EstimatedClusterCount} (not trusted)");
						var newSampledPoints = Sample(points, sampleSize);
						lock (sampleLock)
						{
							sampledPoints = newSampledPoints;
						}
						rejectedSampleSizes.Add(indexToTry.Count);
					}
				}
				else
				{
					queue.AddRemove(resultsToTry);
					var improved = resultsToTry.IsBetterThan(bestResults);
					if (improved)
					{
						bestResults = resultsToTry;
						Interlocked.Add(ref improvedCount, 1);
						LowestCountSeen = Math.Min(LowestCountSeen, bestResults.EstimatedClusterCount);
						Logger.Info($"Cluster count Improved to: {bestResults}");
					}
				}
			}
		});
		if (improvedCount > 0)
			iterationsWithoutImprovement = 0;
		else
			iterationsWithoutImprovement++;
		if (iterationsWithoutImprovement >= MaxIterationsWithoutImprovement)
			break;
		if (bestResults.EstimatedClusterCount <= 2)
			break; // No point in continuing!
	}
	var indicesFound = queue.RemoveAll().Reverse().ToList();
	if (sampledPoints.Count < points.Count)
	{
		// Results are based on a sampled set of points. Recreate these indices
		// using the full set of points before returning them.
		var unsampledIndices = indicesFound.Select(i => Unsample(points, i)).ToList();
		Logger.Info($"Final, unsampled Cluster count: {unsampledIndices[0]}");
		return unsampledIndices;
	}
	else
	{
		return indicesFound;
	}
}
/// <summary>
/// Create a Compact from a HilbertIndex.
/// </summary>
/// <param name="index">Index to compact.</param>
/// <param name="idToPoints">The key is the UniqueId and the value is the corresponding point.
/// These points must be the UnsignedPoint analogs of the HilbertPoints in the HilbertIndex and share the same Ids.</param>
public HilbertOrderedIndex(HilbertIndex index, Dictionary<int, UnsignedPoint> idToPoints)
{
	// Translate both orderings from HilbertPoints to their UnsignedPoint analogs,
	// preserving the order of each source list.
	var unsorted = index.UnsortedPoints.Select(hp => idToPoints[hp.UniqueId]).ToList();
	var sorted = index.SortedPoints.Select(hp => idToPoints[hp.UniqueId]).ToList();
	UnsortedPoints = unsorted;
	SortedPoints = sorted;
	InitIndexing();
}