Beispiel #1
0
        /// <summary>
        /// Measures the supplied bag of words against every document held in <c>vectorTable</c>
        /// and returns the results ordered by the measured value, highest first.
        /// </summary>
        /// <param name="bag">Bag of words whose word texts are encoded into the query vector.</param>
        /// <returns>
        /// A deferred sequence of <see cref="SimilarityResult"/> items, one per document for
        /// which a value was measured.
        /// </returns>
        public IEnumerable<SimilarityResult> FindSimilar(IBagOfWords bag)
        {
            logger.LogDebug("Searching for similar documents");
            var vector = encoder.GetFullVector(bag.Words.Select(item => item.Text).ToArray());

            // Snapshot the keys so the parallel loop iterates over a fixed set of documents.
            var documents = vectorTable.Keys.ToArray();

            // One result slot per document. Each parallel iteration writes only to its own
            // index, so no shared Dictionary is mutated from several threads at once
            // (Dictionary<,> does not support concurrent writers).
            var distances = new double?[documents.Length];

            Parallel.For(0, documents.Length, index =>
            {
                var document = documents[index];
                var existing = vectorTable[document];
                if (existing == null)
                {
                    // The vector for this document has not been computed yet: encode and cache it.
                    // NOTE(review): this write happens on worker threads; it is only safe if
                    // vectorTable is a concurrent collection - confirm against its declaration.
                    existing = encoder.GetFullVector(document.Words.Select(item => item.Text).ToArray());
                    vectorTable[document] = existing;
                }

                distances[index] = distanceMeasurer.Measure(vector, existing);
            });

            // Filter before sorting so only documents with a measured value are ordered.
            return documents
                .Select((document, index) => new { Document = document, Distance = distances[index] })
                .Where(item => item.Distance.HasValue)
                .OrderByDescending(item => item.Distance)
                .Select(item => new SimilarityResult(item.Document, item.Distance.Value));
        }
Beispiel #2
0
        // Verifies that the value measured between the two given strings is greater than
        // or equal to zero.
        //   s1 - first string handed to the measure
        //   s2 - second string handed to the measure
        public void NotNegativeTest(string s1, string s2)
        {
            const int lowerBound = 0;
            double measured = d.Measure(s1, s2);

            Assert.GreaterOrEqual(measured, lowerBound);
        }
        /// <summary>
        /// Produces one <see cref="RecordResult"/> per item, carrying its index, whether it is
        /// a medoid, its cluster number, its distance to its own medoid and its silhouette value.
        /// </summary>
        /// <remarks>
        /// This is an iterator method: the pairwise distance computation below runs when the
        /// returned sequence is enumerated, and runs again on every new enumeration.
        /// </remarks>
        /// <returns>A deferred sequence with one result per entry of <c>_clusterNums</c>.</returns>
        private IEnumerable<RecordResult> EnumerateRecordResults()
        {
            int[] clusterQty = new int[_k];
            double[] clusterDistance = new double[_n];

            // First holds distance sums per (item, cluster); converted to averages below.
            double[,] avgClusterDistance = new double[_n, _k];

            for (int i = 0; i < _clusterNums.Length; i++)
            {
                // count total number of items in each cluster
                clusterQty[_clusterNums[i]] += 1;

                // calculate distance to own medoid
                clusterDistance[i] = _distance.Measure(_items[i], _items[_medoidIndices[_clusterNums[i]]]);

                // accumulate pairwise distances; each unordered pair (i, j) is measured once
                // and credited to both items
                for (int j = i + 1; j < _clusterNums.Length; j++)
                {
                    var dist = _distance.Measure(_items[i], _items[j]);
                    avgClusterDistance[i, _clusterNums[j]] += dist;
                    avgClusterDistance[j, _clusterNums[i]] += dist;
                }
            }

            // turn the accumulated sums into averages
            for (int i = 0; i < _clusterNums.Length; i++)
            {
                for (int j = 0; j < _k; j++)
                {
                    if (j == _clusterNums[i])
                    {
                        // own cluster: the item itself is excluded from the divisor;
                        // a cluster containing only this item gets 0
                        avgClusterDistance[i, j] = clusterQty[j] == 1
                            ? 0
                            : avgClusterDistance[i, j] / (clusterQty[j] - 1);
                    }
                    else
                    {
                        // other cluster: an empty cluster is treated as infinitely far away
                        avgClusterDistance[i, j] = clusterQty[j] == 0
                            ? double.MaxValue
                            : avgClusterDistance[i, j] / clusterQty[j];
                    }
                }
            }

            // Silhouette of item i: (b - a) / max(a, b), where a is the average distance to
            // the item's own cluster and b the smallest average distance to any other cluster.
            double CalculateSilhouette(int i)
            {
                double a = double.MaxValue;
                double b = double.MaxValue;

                for (int j = 0; j < _k; j++)
                {
                    double d = avgClusterDistance[i, j];
                    if (j == _clusterNums[i])
                    {
                        a = d;
                    }
                    else if (b > d)
                    {
                        b = d;
                    }
                }

                // When a equals b the silhouette is 0 by definition. Returning it directly
                // also avoids 0 / 0 = NaN when both averages are zero (e.g. duplicate items
                // spread over two clusters). The exact comparison is intentional.
                // NOTE(review): an item alone in its cluster has a = 0 and therefore scores 1
                // here; the usual convention assigns 0 to such items - confirm intent.
                if (a == b)
                {
                    return 0;
                }

                return (b - a) / Math.Max(b, a);
            }

            for (int i = 0; i < _clusterNums.Length; i++)
            {
                yield return new RecordResult(
                    index: i,
                    isMedoid: _medoidIndices.Any(m => m == i),
                    clusterNo: _clusterNums[i],
                    clusterDistance: clusterDistance[i],
                    silhuetteIndex: CalculateSilhouette(i));
            }
        }