public void CorrectlyCalculateCosineSimilarity(double[] vectorA, double[] vectorB, double expectedSimilarity)
{
    // Arrange: obtain the cosine implementation through the resolver rather than
    // calling it directly, so the resolver mapping is exercised as well.
    var cosine = DistanceFunctionResolver.ResolveDistanceFunction(DistanceFunctionType.Cosine);

    // Act
    var actual = cosine.Invoke(vectorA, vectorB);

    // Assert: compare using a precision argument of 8.
    Assert.Equal(expectedSimilarity, actual, 8);
}
/// <summary>
/// Builds a label-keyed lookup holding the distance between every ordered pair of the supplied embeddings.
/// </summary>
/// <param name="embeddings">Embeddings to compare; each embedding's label is used as a dictionary key.</param>
/// <param name="distanceFunctionType">Selects the distance function obtained from <see cref="DistanceFunctionResolver"/>.</param>
/// <returns>
/// A dictionary whose outer key is an embedding's label and whose inner dictionary maps every
/// embedding's label (including its own) to the distance function result for that pair.
/// </returns>
/// <exception cref="ArgumentException">Two embeddings share the same label.</exception>
private static Dictionary<string, Dictionary<string, double>> CalculateDistanceMatrix(
    IReadOnlyCollection<IEmbedding> embeddings,
    DistanceFunctionType distanceFunctionType)
{
    var measure = DistanceFunctionResolver.ResolveDistanceFunction(distanceFunctionType);
    var result = new Dictionary<string, Dictionary<string, double>>();

    foreach (var rowEmbedding in embeddings)
    {
        // One row per embedding: its distance to every embedding in the collection.
        var row = new Dictionary<string, double>();

        foreach (var columnEmbedding in embeddings)
        {
            row.Add(
                columnEmbedding.Label,
                measure.Invoke(rowEmbedding.Vector, columnEmbedding.Vector));
        }

        result.Add(rowEmbedding.Label, row);
    }

    return result;
}
/// <summary>
/// Assigns each embedding label to a cluster using a DBSCAN-style neighbourhood expansion
/// whose input is split into slices that are processed in parallel.
/// </summary>
/// <param name="embeddings">The embeddings to cluster. The sequence is materialised once up front.</param>
/// <param name="epsilon">
/// Forwarded unchanged to <c>GetNeighborsAndWeight</c>; presumably the neighbourhood radius (confirm against that helper).
/// </param>
/// <param name="minimumSamples">
/// Minimum neighbour count for an embedding to start or extend a cluster; embeddings with fewer
/// neighbours are labelled -1 unless a cluster later claims them.
/// </param>
/// <param name="distanceFunctionType">Selects the distance function obtained from <see cref="DistanceFunctionResolver"/>.</param>
/// <param name="concurrentThreads">
/// Number of contiguous slices the input is divided into; one <see cref="Parallel.For(int, int, Action{int})"/>
/// iteration handles each slice.
/// </param>
/// <returns>
/// A dictionary from embedding label to the value <c>GetClusterIndexMap</c> associates with the
/// provisional cluster index recorded for that label.
/// NOTE(review): labels left at -1 are looked up in that map as well, so it must contain an entry for -1 - confirm.
/// </returns>
public static Dictionary<string, int> GetLabelClusterMap(
    IEnumerable<IEmbedding> embeddings,
    double epsilon = 0.5,
    int minimumSamples = 5,
    DistanceFunctionType distanceFunctionType = DistanceFunctionType.Euclidean,
    int concurrentThreads = 4)
{
    // Label stored for embeddings that do not have enough neighbours.
    const int noiseCluster = -1;

    var embeddingsList = embeddings.ToList();
    var distanceFunction = DistanceFunctionResolver.ResolveDistanceFunction(distanceFunctionType);
    var clusterLabels = new ConcurrentDictionary<string, int>();

    // Each inner bag collects provisional cluster indexes that were found to touch each other.
    var clusterRelationships = new ConcurrentBag<ConcurrentBag<int>>();
    var clusterIndex = 0;
    var sampleSize = (int)Math.Ceiling((double)embeddingsList.Count / concurrentThreads);

    // Starts a new relationship group containing only the given provisional index.
    void RegisterCluster(int newCluster)
    {
        clusterRelationships.Add(new ConcurrentBag<int> { newCluster });
    }

    // Records that newCluster touches existingCluster. Noise (-1) never owns a group and a
    // cluster needs no link to itself, so both cases are skipped instead of letting First() throw.
    void LinkClusters(int existingCluster, int newCluster)
    {
        if (existingCluster == noiseCluster || existingCluster == newCluster)
        {
            return;
        }

        clusterRelationships.First(r => r.Contains(existingCluster)).Add(newCluster);
    }

    Parallel.For(0, concurrentThreads, threadIndex =>
    {
        foreach (var embedding in embeddingsList.Skip(threadIndex * sampleSize).Take(sampleSize))
        {
            // Already labelled, possibly by another worker that reached it as a neighbour.
            if (clusterLabels.ContainsKey(embedding.Label))
            {
                continue;
            }

            var neighbors = GetNeighborsAndWeight(
                embedding,
                embeddingsList,
                distanceFunction,
                epsilon);

            if (neighbors.Count < minimumSamples)
            {
                // Mark as noise, but keep any label assigned concurrently by another worker.
                clusterLabels.AddOrUpdate(
                    embedding.Label,
                    noiseCluster,
                    (key, existingClusterIndex) => existingClusterIndex);
                continue;
            }

            // The counter is shared by all workers, so it must be advanced atomically;
            // a plain ++ could hand the same index to two workers and merge unrelated clusters.
            var localClusterIndex = System.Threading.Interlocked.Increment(ref clusterIndex) - 1;

            // NOTE(review): ConcurrentDictionary may invoke these callbacks more than once under
            // contention, which can register or link the same index repeatedly.
            clusterLabels.AddOrUpdate(
                embedding.Label,
                key =>
                {
                    RegisterCluster(localClusterIndex);
                    return localClusterIndex;
                },
                (key, existingClusterIndex) =>
                {
                    if (existingClusterIndex == noiseCluster)
                    {
                        // Nothing to link to, but the new index still needs its own group.
                        RegisterCluster(localClusterIndex);
                    }
                    else
                    {
                        LinkClusters(existingClusterIndex, localClusterIndex);
                    }

                    return localClusterIndex;
                });

            // neighbors may grow while iterating, hence the index-based loop.
            for (var i = 0; i < neighbors.Count; i++)
            {
                var currentNeighbor = neighbors[i];

                if (clusterLabels.TryGetValue(currentNeighbor.Label, out var existingClusterId))
                {
                    // Already labelled: link its cluster (if any) to ours and take it over
                    // without expanding from it again.
                    LinkClusters(existingClusterId, localClusterIndex);
                    clusterLabels[currentNeighbor.Label] = localClusterIndex;
                    continue;
                }

                // Another worker may label this neighbour between the lookup above and this
                // call; the update callback then links the clusters (noise is skipped).
                clusterLabels.AddOrUpdate(
                    currentNeighbor.Label,
                    localClusterIndex,
                    (key, existingClusterIndex) =>
                    {
                        LinkClusters(existingClusterIndex, localClusterIndex);
                        return localClusterIndex;
                    });

                var currentNeighborsNeighbors = GetNeighborsAndWeight(
                    currentNeighbor,
                    embeddingsList,
                    distanceFunction,
                    epsilon);

                // Only neighbours that themselves meet the threshold extend the frontier.
                if (currentNeighborsNeighbors.Count >= minimumSamples)
                {
                    neighbors = neighbors.Union(currentNeighborsNeighbors).ToList();
                }
            }
        }
    });

    var clusterIndexMap = GetClusterIndexMap(clusterRelationships);

    return clusterLabels.ToDictionary(
        x => x.Key,
        x => clusterIndexMap[x.Value]);
}