/// <summary>
/// Measures the distance between the supplied bag of words and every document
/// currently held in <c>vectorTable</c>, and returns the results ordered by
/// descending distance value.
/// </summary>
/// <param name="bag">The bag of words to compare against the stored documents.</param>
/// <returns>
/// One <see cref="SimilarityResult"/> per stored document for which a distance was
/// produced, highest value first.
/// </returns>
public IEnumerable<SimilarityResult> FindSimilar(IBagOfWords bag)
{
    logger.LogDebug("Searching for similar documents");
    var vector = encoder.GetFullVector(bag.Words.Select(item => item.Text).ToArray());

    // Take a single snapshot of the keys. Each worker writes only to its own slot
    // of the results array, so no shared Dictionary is mutated from several threads
    // (Dictionary<TKey, TValue> does not support concurrent writers).
    var documents = vectorTable.Keys.ToArray();
    var distances = new double?[documents.Length];

    Parallel.For(0, documents.Length, index =>
    {
        var existingDocument = documents[index];
        var existing = vectorTable[existingDocument];
        if (existing == null)
        {
            // Vector has not been computed for this document yet: build it and store it.
            // NOTE(review): this write-back happens on worker threads and is only safe
            // if vectorTable is a concurrent collection - confirm against its declaration.
            existing = encoder.GetFullVector(existingDocument.Words.Select(item => item.Text).ToArray());
            vectorTable[existingDocument] = existing;
        }

        distances[index] = distanceMeasurer.Measure(vector, existing);
    });

    // Filter before sorting so entries without a value are never compared;
    // the sort is stable, so ties keep the snapshot order.
    return documents
        .Select((document, index) => new { Document = document, Distance = distances[index] })
        .Where(item => item.Distance.HasValue)
        .OrderByDescending(item => item.Distance.Value)
        .Select(item => new SimilarityResult(item.Document, item.Distance.Value));
}
/// <summary>
/// Verifies that the distance measured between the two given strings is
/// greater than or equal to zero.
/// </summary>
/// <param name="s1">First string passed to the measurer.</param>
/// <param name="s2">Second string passed to the measurer.</param>
public void NotNegativeTest(string s1, string s2)
{
    // Measure once, then assert on the captured value.
    double measured = d.Measure(s1, s2);

    Assert.GreaterOrEqual(measured, 0);
}
/// <summary>
/// Lazily produces one <see cref="RecordResult"/> per item, containing its cluster
/// number, whether it is a medoid, its distance to the medoid of its own cluster and
/// its silhouette index.
/// </summary>
/// <remarks>
/// This is an iterator: all distance computations (including the pairwise pass over
/// every item pair) run when the sequence is first enumerated, and run again on each
/// new enumeration.
/// </remarks>
/// <returns>The per-item results, in item index order.</returns>
private IEnumerable<RecordResult> EnumerateRecordResults()
{
    int[] clusterQty = new int[_k];
    double[] clusterDistance = new double[_n];
    double[,] avgClusterDistance = new double[_n, _k];

    for (int i = 0; i < _clusterNums.Length; i++)
    {
        // count total number of items in each cluster
        clusterQty[_clusterNums[i]] += 1;

        // calculate distance to own medoid
        clusterDistance[i] = _distance.Measure(_items[i], _items[_medoidIndices[_clusterNums[i]]]);

        // accumulate the distance sums towards each cluster; every unordered pair
        // (i, j) is measured once and credited to both items
        for (int j = i + 1; j < _clusterNums.Length; j++)
        {
            var dist = _distance.Measure(_items[i], _items[j]);
            avgClusterDistance[i, _clusterNums[j]] += dist;
            avgClusterDistance[j, _clusterNums[i]] += dist;
        }
    }

    // turn the accumulated sums into averages
    for (int i = 0; i < _clusterNums.Length; i++)
    {
        for (int j = 0; j < _k; j++)
        {
            if (j == _clusterNums[i])
            {
                // own cluster: the item itself is excluded from the average
                avgClusterDistance[i, j] = clusterQty[j] == 1
                    ? 0
                    : avgClusterDistance[i, j] / (clusterQty[j] - 1);
            }
            else
            {
                // other cluster: an empty cluster is treated as infinitely far away
                avgClusterDistance[i, j] = clusterQty[j] == 0
                    ? double.MaxValue
                    : avgClusterDistance[i, j] / clusterQty[j];
            }
        }
    }

    // Silhouette of item i: (b - a) / max(a, b), where a is the average distance to
    // the item's own cluster and b the smallest average distance to any other cluster.
    double CalculateSilhouette(int i)
    {
        double a = double.MaxValue;
        double b = double.MaxValue;
        for (int j = 0; j < _k; j++)
        {
            double d = avgClusterDistance[i, j];
            if (j == _clusterNums[i])
            {
                a = d;
            }
            else if (b > d)
            {
                b = d;
            }
        }

        // Equal averages mean a silhouette of exactly zero. Returning it directly
        // avoids 0 / 0 = NaN when both averages are zero (e.g. duplicate items).
        if (a == b)
        {
            return 0;
        }

        // NOTE(review): for a single-item cluster a is 0, so the result is 1 whenever
        // b > 0; the conventional silhouette definition assigns 0 there - confirm intent.
        return (b - a) / Math.Max(b, a);
    }

    for (int i = 0; i < _clusterNums.Length; i++)
    {
        yield return new RecordResult(
            index: i,
            isMedoid: _medoidIndices.Any(m => m == i),
            clusterNo: _clusterNums[i],
            clusterDistance: clusterDistance[i],
            silhuetteIndex: CalculateSilhouette(i));
    }
}