/// <summary>
/// Computes the Kendall Tau-B correlation between the exact neighbor count and the
/// estimated density reported by <paramref name="dMeter"/> for a given window radius.
/// </summary>
/// <param name="dMeter">Density meter to query; its <c>Distances.WindowRadius</c> is overwritten.</param>
/// <param name="newWindowRadius">Window radius assigned to the meter and passed to the density estimate.</param>
/// <param name="numPointsToCorrelate">How many of the leading sorted index points take part in the correlation.</param>
/// <returns>The Tau-B value produced by the correlator.</returns>
private double DensityCorrelationCase(DensityMeter dMeter, int newWindowRadius, int numPointsToCorrelate = 1000)
        {
            // Side effect: the caller's meter keeps this radius after the call returns.
            dMeter.Distances.WindowRadius = newWindowRadius;

            var kendall = new KendallTauCorrelation <HilbertPoint, long>(
                (HilbertPoint point) => (long)dMeter.ExactNeighbors(point),
                (HilbertPoint point) => (long)dMeter.EstimatedDensity(point, newWindowRadius)
                );

            // Only the first numPointsToCorrelate points (in sorted order) are sampled.
            var sample = dMeter.Index.SortedPoints.Take(numPointsToCorrelate);

            return kendall.TauB(sample);
        }
Пример #2
0
        /// <summary>
        /// Verifies that Tau-B is -1 when the second measure orders every value
        /// in exactly the opposite direction to the first measure.
        /// </summary>
        public void TauB_ReverseOrder()
        {
            // Multiplying by a negative constant reverses the ordering without creating ties.
            var kendall = new KendallTauCorrelation <int, int>(
                (int value) => value,
                (int value) => value * -10
                );

            // Compare with a tolerance, matching the other TauB tests, rather than
            // relying on exact floating-point equality.
            Assert.AreEqual(
                -1.0,
                kendall.TauB(OneToTen),
                0.00001,
                "Numbers that sort in reverse order should be perfectly anti-correlated."
                );
        }
Пример #3
0
        /// <summary>
        /// Verifies that Tau-B is +1 when both measures order every value identically.
        /// </summary>
        public void TauB_SameOrder()
        {
            // Multiplying by a positive constant preserves the ordering without creating ties.
            var kendall = new KendallTauCorrelation <int, int>(
                (int value) => value,
                (int value) => value * 10
                );

            // Compare with a tolerance, matching the other TauB tests, rather than
            // relying on exact floating-point equality.
            Assert.AreEqual(
                1.0,
                kendall.TauB(OneToTen),
                0.00001,
                "Numbers that sort in the same order should be perfectly correlated."
                );
        }
Пример #4
0
        /// <summary>
        /// Verifies Tau-B when the second ranking differs from the first by a single
        /// swap of adjacent values (4 and 5) and contains no ties.
        /// </summary>
        public void TauB_OneSwap_NoTies()
        {
            // Lookup table indexed by (value - 1); positions 4 and 5 are exchanged.
            // Assumes OneToTen yields the integers 1..10 — defined outside this snippet.
            int[] swapped = { 1, 2, 3, 5, 4, 6, 7, 8, 9, 10 };

            var correlator = new KendallTauCorrelation <int, int>(
                (int original) => original,
                (int original) => swapped[original - 1]
                );
            var tau = correlator.TauB(OneToTen);

            // Expected value used by the test: 43/45.
            Assert.AreEqual(
                43.0 / 45.0,
                tau,
                0.00001,
                "If a single number is out of place the sequences should be almost perfectly correlated."
                );
        }
Пример #5
0
        /// <summary>
        /// Verifies Tau-B when the second ranking maps the first three values to the
        /// same number, introducing ties in one of the two measures.
        /// </summary>
        public void TauB_Ties()
        {
            // Lookup table indexed by (value - 1); values 1, 2 and 3 all map to 1.
            // Assumes OneToTen yields the integers 1..10 — defined outside this snippet.
            int[] withTies = { 1, 1, 1, 4, 5, 6, 7, 8, 9, 10 };

            var correlator = new KendallTauCorrelation <int, int>(
                (int original) => original,
                (int original) => withTies[original - 1]
                );
            var tau = correlator.TauB(OneToTen);

            // Expected value used by the test: 42 / sqrt(42 * 45).
            var expected = 42.0 / Sqrt(42.0 * 45.0);

            Assert.AreEqual(
                expected,
                tau,
                0.00001,
                "Adding a few ties should be almost perfectly correlated."
                );
        }
        /// <summary>
        /// Builds pairs of generated points and uses Kendall Tau-B to compare how well
        /// several measures (Distance, UnrandomizedDistance, CosineSimilarity) agree
        /// with the Jaccard measure over those pairs.
        /// </summary>
        /// <remarks>
        /// The test asserts that the correlation of Distance with Jaccard is at least
        /// as high as that of UnrandomizedDistance with Jaccard, and at least 0.9.
        /// The cosine-similarity correlation is only printed, not asserted.
        /// </remarks>
        public void BA_CompareJaccardToCartesian()
        {
            int dimensions          = 100;
            int n                   = 400;
            var maxPairsToCorrelate = 30000;
            var points              = GenerateRandomDocumentChains(dimensions, 25, n);
            var pairs               = new List <Tuple <int[], int[]> >();

            // Collect distinct pairs (i < j) and stop once maxPairsToCorrelate pairs exist.
            // The guard is a strict "<" so the cap is honored exactly rather than
            // letting one extra pair slip in.
            for (var i = 0; i < points.Count - 1 && pairs.Count < maxPairsToCorrelate; i++)
            {
                for (var j = i + 1; j < points.Count && pairs.Count < maxPairsToCorrelate; j++)
                {
                    pairs.Add(new Tuple <int[], int[]>(points[i], points[j]));
                }
            }

            // Every measure is reduced to an int so it can be ranked by the correlator;
            // Jaccard and cosine values are scaled by 100 before truncation.
            Func <Tuple <int[], int[]>, int> traditionalCartesianMeasure = pair => UnrandomizedDistance(pair.Item1, pair.Item2);
            Func <Tuple <int[], int[]>, int> randomizedCartesianMeasure  = pair => Distance(pair.Item1, pair.Item2);
            Func <Tuple <int[], int[]>, int> jaccardMeasure          = pair => (int)(Jaccard(pair.Item1, pair.Item2) * 100);
            Func <Tuple <int[], int[]>, int> cosineSimilarityMeasure = pair => (int)(CosineSimilarity(pair.Item1, pair.Item2) * 100);

            // Print a small sample of raw values for eyeballing.
            foreach (var pair in pairs.Take(30))
            {
                Console.WriteLine($"Cartesian = {randomizedCartesianMeasure(pair)}  Jaccard = {jaccardMeasure(pair)}");
            }
            var control          = new KendallTauCorrelation <Tuple <int[], int[]>, int>(traditionalCartesianMeasure, jaccardMeasure);
            var correlator       = new KendallTauCorrelation <Tuple <int[], int[]>, int>(randomizedCartesianMeasure, jaccardMeasure);
            var cosineCorrelator = new KendallTauCorrelation <Tuple <int[], int[]>, int>(randomizedCartesianMeasure, cosineSimilarityMeasure);

            var traditionalCorrelation = control.TauB(pairs);
            var correlation            = correlator.TauB(pairs);
            var cosineCorrelation      = cosineCorrelator.TauB(pairs);
            var oldVersusNewMsg        = $"The randomized approach had a correlation of {correlation}, while the traditional measure yielded {traditionalCorrelation}";
            var newVersusCosineMsg     = $"The randomized approach had a correlation of {correlation}, while the cosine similarity yielded {cosineCorrelation}";

            Console.WriteLine(oldVersusNewMsg);
            Console.WriteLine(newVersusCosineMsg);

            Assert.GreaterOrEqual(correlation, traditionalCorrelation, oldVersusNewMsg);
            Assert.GreaterOrEqual(correlation, 0.9, $"Cartesian and Jaccard distances only have a correlation of {correlation}");
        }
        /// <summary>
        /// Generates clustered data, indexes it, and checks that the density estimated
        /// by a <c>DensityMeter</c> correlates (Kendall Tau-B of at least 0.90) with the
        /// exact neighbor count over the first 1000 sorted points.
        /// </summary>
        public void DensityCorrelation()
        {
            const int bitsPerDimension = 10;

            var generator = new GaussianClustering
            {
                ClusterCount   = 50,
                Dimensions     = 100,
                MaxCoordinate  = (1 << bitsPerDimension) - 1,
                MinClusterSize = 100,
                MaxClusterSize = 500
            };
            var hilbertIndex = new HilbertIndex(generator.MakeClusters(), bitsPerDimension);

            var counter = new ClusterCounter
            {
                NoiseSkipBy        = 10,
                OutlierSize        = 5,
                ReducedNoiseSkipBy = 1
            };
            var tally = counter.Count(hilbertIndex.SortedPoints);

            // Picking the neighborhood distance matters a great deal:
            //   - Too large: so many neighbors fall inside it that a window along the Hilbert
            //     curve cannot estimate the count well. With 200 true neighbors and a window
            //     of 100, many points saturate near 100 and density variation disappears.
            //   - Too small: few or no neighbors are found, so density carries no information.
            //   - Nearly every point has two neighbors within MaximumSquareDistance, so the
            //     chosen distance should be below MaximumSquareDistance.
            var neighborhoodDistance = tally.MaximumSquareDistance * 2 / 5;

            // Window radius is the truncated square root of half the point count.
            var windowRadius = (int)Math.Sqrt(hilbertIndex.SortedPoints.Count / 2);
            var meter        = new DensityMeter(hilbertIndex, neighborhoodDistance, windowRadius);

            var kendall = new KendallTauCorrelation <HilbertPoint, long>(
                (HilbertPoint point) => (long)meter.ExactNeighbors(point),
                (HilbertPoint point) => (long)meter.EstimatedDensity(point, windowRadius)
                );
            var correlation = kendall.TauB(hilbertIndex.SortedPoints.Take(1000));

            Console.WriteLine($"Correlation between exact and estimated density is: {correlation}");
            Assert.GreaterOrEqual(correlation, 0.90, $"Correlation {correlation} is not high enough");
        }