/// <summary>
/// Compute the Kendall Tau-B correlation between exact neighbor counts and
/// window-based density estimates for a sample of Hilbert-sorted points.
/// </summary>
/// <param name="dMeter">Density meter whose index supplies the points; its window radius is overwritten.</param>
/// <param name="newWindowRadius">Window radius (along the Hilbert curve) used for the density estimate.</param>
/// <param name="numPointsToCorrelate">How many of the sorted points to include in the correlation.</param>
/// <returns>Tau-B correlation in the range [-1, 1].</returns>
private double DensityCorrelationCase(DensityMeter dMeter, int newWindowRadius, int numPointsToCorrelate = 1000)
{
    dMeter.Distances.WindowRadius = newWindowRadius;

    // Rank every sampled point two ways: by its true neighbor count and by its estimated density.
    Func<HilbertPoint, long> exact = point => (long)dMeter.ExactNeighbors(point);
    Func<HilbertPoint, long> estimated = point => (long)dMeter.EstimatedDensity(point, newWindowRadius);

    var kendall = new KendallTauCorrelation<HilbertPoint, long>(exact, estimated);
    var sample = dMeter.Index.SortedPoints.Take(numPointsToCorrelate);
    return kendall.TauB(sample);
}
/// <summary>
/// Two rankings that sort in exactly opposite order must yield a Tau-B of -1.
/// </summary>
public void TauB_ReverseOrder()
{
    // Negating (and scaling) each value reverses the ranking without introducing ties.
    var correlator = new KendallTauCorrelation<int, int>(
        value => value,
        value => value * -10
    );

    Assert.AreEqual(
        -1.0,
        correlator.TauB(OneToTen),
        "Numbers that sort in reverse order should be perfectly anti-correlated."
    );
}
/// <summary>
/// Two rankings that sort in the same order must yield a Tau-B of +1.
/// </summary>
public void TauB_SameOrder()
{
    // Scaling each value preserves the ranking exactly, with no ties.
    var correlator = new KendallTauCorrelation<int, int>(
        value => value,
        value => value * 10
    );

    Assert.AreEqual(
        1.0,
        correlator.TauB(OneToTen),
        "Numbers that sort in the same order should be perfectly correlated."
    );
}
/// <summary>
/// A single adjacent swap (4 and 5 exchanged) should leave Tau-B just below +1:
/// one discordant pair out of 45, giving (44 - 1) / 45.
/// </summary>
public void TauB_OneSwap_NoTies()
{
    var swapped = new[] { 1, 2, 3, 5, 4, 6, 7, 8, 9, 10 };
    var correlator = new KendallTauCorrelation<int, int>(
        value => value,
        value => swapped[value - 1]
    );

    Assert.AreEqual(
        43.0 / 45.0,
        correlator.TauB(OneToTen),
        0.00001,
        "If a single number is out of place the sequences should be almost perfectly correlated."
    );
}
/// <summary>
/// Introducing a three-way tie (1, 1, 1) should only slightly reduce Tau-B.
/// The tie correction shrinks the denominator to Sqrt(42 * 45).
/// </summary>
public void TauB_Ties()
{
    var tied = new[] { 1, 1, 1, 4, 5, 6, 7, 8, 9, 10 };
    var correlator = new KendallTauCorrelation<int, int>(
        value => value,
        value => tied[value - 1]
    );

    Assert.AreEqual(
        42.0 / Sqrt(42.0 * 45.0),
        correlator.TauB(OneToTen),
        0.00001,
        "Adding a few ties should be almost perfectly correlated."
    );
}
/// <summary>
/// Verify that the randomized Cartesian distance ranks document pairs at least as well as the
/// traditional (unrandomized) Cartesian distance, when both are correlated against Jaccard
/// similarity via Kendall Tau-B. Also reports the correlation against cosine similarity.
/// </summary>
public void BA_CompareJaccardToCartesian()
{
    int dimensions = 100;
    int n = 400;
    var maxPairsToCorrelate = 30000;
    var points = GenerateRandomDocumentChains(dimensions, 25, n);

    // Collect all distinct unordered pairs, capped at maxPairsToCorrelate.
    // NOTE: the loop guards use '<' (not '<=') so the cap is honored exactly;
    // '<=' would admit one pair too many (30001).
    var pairs = new List<Tuple<int[], int[]>>();
    for (var i = 0; i < points.Count - 1 && pairs.Count < maxPairsToCorrelate; i++)
    {
        for (var j = i + 1; j < points.Count && pairs.Count < maxPairsToCorrelate; j++)
        {
            pairs.Add(new Tuple<int[], int[]>(points[i], points[j]));
        }
    }

    // Four ranking measures over a pair of documents. Jaccard and cosine are scaled to
    // integers (percent) because KendallTauCorrelation ranks by an integer key.
    Func<Tuple<int[], int[]>, int> traditionalCartesianMeasure = pair => UnrandomizedDistance(pair.Item1, pair.Item2);
    Func<Tuple<int[], int[]>, int> randomizedCartesianMeasure = pair => Distance(pair.Item1, pair.Item2);
    Func<Tuple<int[], int[]>, int> jaccardMeasure = pair => (int)(Jaccard(pair.Item1, pair.Item2) * 100);
    Func<Tuple<int[], int[]>, int> cosineSimilarityMeasure = pair => (int)(CosineSimilarity(pair.Item1, pair.Item2) * 100);

    // Spot-check a few pairs by eye in the test output.
    foreach (var pair in pairs.Take(30))
    {
        Console.WriteLine($"Cartesian = {randomizedCartesianMeasure(pair)} Jaccard = {jaccardMeasure(pair)}");
    }

    var control = new KendallTauCorrelation<Tuple<int[], int[]>, int>(traditionalCartesianMeasure, jaccardMeasure);
    var correlator = new KendallTauCorrelation<Tuple<int[], int[]>, int>(randomizedCartesianMeasure, jaccardMeasure);
    var cosineCorrelator = new KendallTauCorrelation<Tuple<int[], int[]>, int>(randomizedCartesianMeasure, cosineSimilarityMeasure);
    var traditionalCorrelation = control.TauB(pairs);
    var correlation = correlator.TauB(pairs);
    var cosineCorrelation = cosineCorrelator.TauB(pairs);

    var oldVersusNewMsg = $"The randomized approach had a correlation of {correlation}, while the traditional measure yielded {traditionalCorrelation}";
    var newVersusCosineMsg = $"The randomized approach had a correlation of {correlation}, while the cosine similarity yielded {cosineCorrelation}";
    Console.WriteLine(oldVersusNewMsg);
    Console.WriteLine(newVersusCosineMsg);

    Assert.GreaterOrEqual(correlation, traditionalCorrelation, oldVersusNewMsg);
    Assert.GreaterOrEqual(correlation, 0.9, $"Cartesian and Jaccard distances only have a correlation of {correlation}");
}
/// <summary>
/// Verify that window-based density estimation along the Hilbert curve correlates strongly
/// (Tau-B >= 0.90) with exact neighbor counts, using synthetic Gaussian clusters.
/// </summary>
public void DensityCorrelation()
{
    var bitsPerDimension = 10;
    var data = new GaussianClustering
    {
        ClusterCount = 50,
        Dimensions = 100,
        MaxCoordinate = (1 << bitsPerDimension) - 1,
        MinClusterSize = 100,
        MaxClusterSize = 500
    };
    var expectedClusters = data.MakeClusters();
    var hIndex = new HilbertIndex(expectedClusters, bitsPerDimension);
    var cc = new ClusterCounter { NoiseSkipBy = 10, OutlierSize = 5, ReducedNoiseSkipBy = 1 };
    var count = cc.Count(hIndex.SortedPoints);

    // Choice of neighborhoodDistance is crucial.
    //   - If it is too large, then a huge number of neighbors will be caught up in the dragnet, and estimating
    //     that value with a window into the Hilbert curve will yield poor results. Why? If there are 200 neighbors
    //     and your window size is 100 then many points will have their neighbor count saturate near 100 and
    //     no meaningful variation in density will be found.
    //   - If it is too small, then too few neighbors (or none!) will be found, and we get no meaningful density.
    //   - We know that almost every point has two neighbors within MaximumSquareDistance, so we should
    //     make it smaller than MaximumSquareDistance.
    var neighborhoodDistance = count.MaximumSquareDistance * 2 / 5;
    var numPoints = hIndex.SortedPoints.Count;
    var windowRadius = (int)Math.Sqrt(numPoints / 2);
    var dMeter = new DensityMeter(hIndex, neighborhoodDistance, windowRadius);

    // Delegate the exact-vs-estimated Tau-B computation to the shared helper
    // (DensityCorrelationCase) instead of duplicating its logic inline; the helper's
    // default samples the same 1000 points this test used previously.
    var correlation = DensityCorrelationCase(dMeter, windowRadius);
    Console.WriteLine($"Correlation between exact and estimated density is: {correlation}");
    Assert.GreaterOrEqual(correlation, 0.90, $"Correlation {correlation} is not high enough");
}