public void CorrectlyCalculateCosineSimilarity(double[] vectorA, double[] vectorB, double expectedSimilarity)
    {
        // Arrange: obtain the function registered for the cosine distance type.
        var cosine = DistanceFunctionResolver.ResolveDistanceFunction(DistanceFunctionType.Cosine);

        // Act: evaluate it on the two vectors supplied by the test case.
        var actual = cosine.Invoke(vectorA, vectorB);

        // Assert: the third argument (8) is the comparison precision, so tiny
        // floating-point rounding differences do not fail the test.
        Assert.Equal(expectedSimilarity, actual, 8);
    }
Beispiel #2
0
    /// <summary>
    /// Builds a label-indexed matrix holding the distance between every pair of embeddings.
    /// </summary>
    /// <param name="embeddings">The embeddings to compare; each one is compared with every embedding, itself included.</param>
    /// <param name="distanceFunctionType">Selects the distance function obtained from <c>DistanceFunctionResolver</c>.</param>
    /// <returns>
    /// A dictionary keyed by an embedding's label whose value is a second dictionary, keyed by the
    /// other embedding's label, containing the distance computed from the first vector to the second.
    /// </returns>
    /// <exception cref="ArgumentException">Two embeddings share the same label.</exception>
    private static Dictionary<string, Dictionary<string, double>> CalculateDistanceMatrix(
        IReadOnlyCollection<IEmbedding> embeddings,
        DistanceFunctionType distanceFunctionType)
    {
        var measure = DistanceFunctionResolver.ResolveDistanceFunction(distanceFunctionType);
        var result  = new Dictionary<string, Dictionary<string, double>>();

        foreach (var source in embeddings)
        {
            // One row per source embedding: distance from the source to every embedding.
            var row = new Dictionary<string, double>();

            foreach (var target in embeddings)
            {
                row.Add(target.Label, measure.Invoke(source.Vector, target.Vector));
            }

            result.Add(source.Label, row);
        }

        return result;
    }
Beispiel #3
0
    /// <summary>
    /// Groups embeddings into clusters based on how many neighbors each one has within
    /// <paramref name="epsilon"/>, and returns the cluster index assigned to every label.
    /// </summary>
    /// <remarks>
    /// The parameters mirror a DBSCAN-style density clustering (epsilon / minimum samples).
    /// The input is split into contiguous slices that are processed in parallel; each worker
    /// starts new clusters from embeddings with enough neighbors and records in
    /// <c>clusterRelationships</c> which provisional cluster indices touched the same embedding.
    /// <c>GetClusterIndexMap</c> (defined elsewhere) then translates provisional indices into the
    /// final ones; presumably it merges related indices. TODO confirm against its implementation,
    /// including how it treats the provisional value -1 used for embeddings with too few neighbors.
    /// </remarks>
    /// <param name="embeddings">The embeddings to cluster. Enumerated once and materialised into a list.</param>
    /// <param name="epsilon">Neighborhood threshold forwarded to <c>GetNeighborsAndWeight</c>.</param>
    /// <param name="minimumSamples">Minimum neighbor count for an embedding to start or extend a cluster.</param>
    /// <param name="distanceFunctionType">Selects the distance function obtained from <c>DistanceFunctionResolver</c>.</param>
    /// <param name="concurrentThreads">
    /// Number of slices processed in parallel. Values below 1 are treated as 1 so that the
    /// input is always processed.
    /// </param>
    /// <returns>A dictionary mapping each labelled embedding to its final cluster index.</returns>
    public static Dictionary<string, int> GetLabelClusterMap(
        IEnumerable<IEmbedding> embeddings,
        double epsilon     = 0.5,
        int minimumSamples = 5,
        DistanceFunctionType distanceFunctionType = DistanceFunctionType.Euclidean,
        int concurrentThreads = 4)
    {
        var embeddingsList = embeddings.ToList();

        var distanceFunction = DistanceFunctionResolver.ResolveDistanceFunction(distanceFunctionType);

        // A non-positive thread count would yield a zero/negative slice size and an empty
        // Parallel.For range, silently returning an empty map. Fall back to a single worker.
        var workerCount = Math.Max(1, concurrentThreads);

        var clusterLabels        = new ConcurrentDictionary<string, int>();
        var clusterRelationships = new ConcurrentBag<ConcurrentBag<int>>();
        var clusterIndex         = 0;
        var sampleSize           = (int)Math.Ceiling((double)embeddingsList.Count / workerCount);

        Parallel.For(0, workerCount, threadIndex =>
        {
            // Each worker handles its own contiguous slice of the input list.
            foreach (var embedding in embeddingsList.Skip(threadIndex * sampleSize).Take(sampleSize))
            {
                // Already labelled (by this or another worker): nothing to do.
                if (clusterLabels.ContainsKey(embedding.Label))
                {
                    continue;
                }

                var neighbors = GetNeighborsAndWeight(
                    embedding,
                    embeddingsList,
                    distanceFunction,
                    epsilon);

                if (neighbors.Count < minimumSamples)
                {
                    // Too few neighbors: mark with -1 unless another worker labelled it meanwhile.
                    clusterLabels.TryAdd(embedding.Label, -1);
                    continue;
                }

                // The counter is shared by all workers, so it must be advanced atomically;
                // a plain ++ could hand the same index to two unrelated clusters.
                // Increment returns the new value, hence the -1 to keep post-increment numbering.
                var localClusterIndex = System.Threading.Interlocked.Increment(ref clusterIndex) - 1;

                clusterLabels.AddOrUpdate(
                    embedding.Label,
                    key =>
                    {
                        // First label for this embedding: start a new relationship group.
                        clusterRelationships.Add(new ConcurrentBag<int> { localClusterIndex });
                        return localClusterIndex;
                    },
                    (key, existingClusterIndex) =>
                    {
                        // Another worker labelled it in the meantime: link both cluster indices.
                        clusterRelationships.First(r => r.Contains(existingClusterIndex)).Add(localClusterIndex);
                        return localClusterIndex;
                    });

                // The neighbor list may grow while iterating (see the Union below), so an index
                // loop re-reading neighbors.Count is used instead of foreach.
                for (var i = 0; i < neighbors.Count; i++)
                {
                    var currentNeighbor = neighbors[i];
                    if (clusterLabels.TryGetValue(currentNeighbor.Label, out var existingClusterId))
                    {
                        // Neighbor already belongs to a different real cluster: record the link.
                        if (existingClusterId != -1 && existingClusterId != localClusterIndex)
                        {
                            clusterRelationships.First(r => r.Contains(existingClusterId)).Add(localClusterIndex);
                        }

                        clusterLabels[currentNeighbor.Label] = localClusterIndex;
                        continue;
                    }

                    clusterLabels.AddOrUpdate(
                        currentNeighbor.Label,
                        localClusterIndex,
                        (key, existingClusterIndex) =>
                        {
                            clusterRelationships.First(r => r.Contains(existingClusterIndex)).Add(localClusterIndex);
                            return localClusterIndex;
                        });

                    var currentNeighborsNeighbors = GetNeighborsAndWeight(
                        currentNeighbor,
                        embeddingsList,
                        distanceFunction,
                        epsilon);

                    // A neighbor with enough neighbors of its own extends the cluster frontier.
                    if (currentNeighborsNeighbors.Count >= minimumSamples)
                    {
                        neighbors = neighbors.Union(currentNeighborsNeighbors).ToList();
                    }
                }
            }
        });

        var clusterIndexMap = GetClusterIndexMap(clusterRelationships);

        // Translate every provisional cluster index into its final index.
        return clusterLabels.ToDictionary(
            x => x.Key,
            x => clusterIndexMap[x.Value]);
    }