Ejemplo n.º 1
0
        /// <summary>
        /// Draws distinct integers by repeatedly calling <c>random.Next(0, poolSize)</c> and redrawing whenever the value was already picked (rejection sampling).
        /// </summary>
        /// <param name="nSamples">Requested number of distinct values; capped at <paramref name="poolSize"/> when the pool is smaller.</param>
        /// <param name="poolSize">Second argument passed to <c>random.Next</c> (presumably an exclusive upper bound - confirm against the generator).</param>
        /// <param name="random">Source of random integers.</param>
        /// <returns>An array of pairwise distinct values, in the order they were accepted.</returns>
        public static int[] RejectionSample(int nSamples, int poolSize, IProvideRandomValues random)
        {
            // Never ask for more distinct values than the pool size
            var count  = poolSize < nSamples ? poolSize : nSamples;
            var picked = new int[count];

            for (var slot = 0; slot < count; slot++)
            {
                int  candidate;
                bool alreadyPicked;
                do
                {
                    candidate     = random.Next(0, poolSize);
                    alreadyPicked = false;

                    // Compare against the values accepted so far (slots 0 .. slot - 1)
                    for (var prior = 0; prior < slot && !alreadyPicked; prior++)
                    {
                        alreadyPicked = picked[prior] == candidate;
                    }
                }
                while (alreadyPicked);

                picked[slot] = candidate;
            }
            return picked;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Fills <paramref name="data"/> via <c>random.NextFloats</c>, then passes the buffer to <c>SIMD.Multiply</c> with <c>2 * a</c> and to <c>SIMD.Add</c> with <c>-a</c>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): if NextFloats produces values in [0, 1) and the SIMD helpers work element-wise, the result is spread over [-a, a) - confirm against those helpers.
        /// </remarks>
        /// <param name="data">Buffer that is overwritten in place.</param>
        /// <param name="a">Half-width used for the scale (<c>2 * a</c>) and the shift (<c>-a</c>).</param>
        /// <param name="random">Source of random floats.</param>
        public static void Uniform(ref float[] data, float a, IProvideRandomValues random)
        {
            random.NextFloats(data);

            // Scale first, then shift
            SIMD.Multiply(ref data, 2 * a);
            SIMD.Add(ref data, -a);
        }
Ejemplo n.º 3
0
            /// <summary>
            /// Appends <paramref name="items"/> to <c>Items</c>, creates one node per new item and returns the ids given to those nodes.
            /// </summary>
            /// <param name="items">The items to append.</param>
            /// <param name="generator">Random source handed to <c>RandomLayer</c> to pick each node's layer.</param>
            /// <param name="cancellationToken">Checked after each node is added; cancelling mid-way leaves the already added items and nodes in place.</param>
            /// <returns>The ids of the newly created nodes, in insertion order.</returns>
            internal IReadOnlyList <int> AddItems(IReadOnlyList <TItem> items, IProvideRandomValues generator, CancellationToken cancellationToken)
            {
                int addedCount  = items.Count;
                var assignedIds = new List <int>();

                Items.AddRange(items);
                DistanceCache?.Resize(addedCount, false);

                // New node ids continue from the current number of nodes
                int firstId = Nodes.Count;
                int endId   = firstId + addedCount;

                for (int nodeId = firstId; nodeId < endId; ++nodeId)
                {
                    var layer = RandomLayer(generator, Parameters.LevelLambda);
                    Nodes.Add(Algorithm.NewNode(nodeId, layer));
                    assignedIds.Add(nodeId);
                    cancellationToken.ThrowIfCancellationRequested();
                }
                return assignedIds;
            }
Ejemplo n.º 4
0
        /// <summary>
        /// Walks a flattened rp-tree from node 0 and returns the index set stored for the node where the walk ends.
        /// </summary>
        /// <remarks>
        /// A node whose first child entry is positive is treated as internal: <c>SelectSide</c> decides whether to follow child 0 or child 1.
        /// The walk stops at the first node whose first child entry is not positive; the negated entry is used as the position in <c>tree.Indices</c>.
        /// </remarks>
        /// <param name="point">The query point.</param>
        /// <param name="tree">The flattened tree to search.</param>
        /// <param name="random">Random source forwarded to <c>SelectSide</c>.</param>
        public static int[] SearchFlatTree(float[] point, FlatTree tree, IProvideRandomValues random)
        {
            var current = 0;

            for (;;)
            {
                var firstChild = tree.Children[current][0];
                if (firstChild <= 0)
                {
                    // End of the walk: the entry holds the negated position of the index set
                    return tree.Indices[-firstChild];
                }

                var side = SelectSide(tree.Hyperplanes[current], tree.Offsets[current], point, random);
                current = side == 0 ? firstChild : tree.Children[current][1];
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Creates a new instance and stores its configuration.
        /// </summary>
        /// <param name="distance">Distance function; <c>DistanceFunctions.Cosine</c> is used when null.</param>
        /// <param name="random">Random source; <c>DefaultRandomGenerator.Instance</c> is used when null.</param>
        /// <param name="dimensions">Stored as <c>Dim</c> of the optimization state.</param>
        /// <param name="numberOfNeighbors">Number of neighbors to use.</param>
        /// <param name="customNumberOfEpochs">Optional epoch count; must be positive when supplied.</param>
        /// <param name="progressReporter">Optional progress callback.</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="customNumberOfEpochs"/> is non-null and not positive.</exception>
        public Umap(
            DistanceCalculation distance = null,
            IProvideRandomValues random  = null,
            int dimensions                    = 2,
            int numberOfNeighbors             = 15,
            int?customNumberOfEpochs          = null,
            ProgressReporter progressReporter = null)
        {
            // Validate before touching any state
            if (customNumberOfEpochs.HasValue && customNumberOfEpochs.Value <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(customNumberOfEpochs), "if non-null then must be a positive value");
            }

            // Collaborators, falling back to the defaults when omitted
            _distanceFn       = distance ?? DistanceFunctions.Cosine;
            _random           = random ?? DefaultRandomGenerator.Instance;
            _progressReporter = progressReporter;

            // Numeric settings
            _nNeighbors           = numberOfNeighbors;
            _customNumberOfEpochs = customNumberOfEpochs;
            _optimizationState    = new OptimizationState { Dim = dimensions };
        }
Ejemplo n.º 6
0
            /// <summary>
            /// Picks a layer as <c>floor(-ln(u) * lambda)</c>, where <c>u</c> comes from <c>generator.NextFloat()</c>.
            /// </summary>
            /// <param name="generator">Source of the random sample.</param>
            /// <param name="lambda">Scale applied to the negated logarithm of the sample.</param>
            /// <returns>The layer obtained by truncating the scaled value to an integer.</returns>
            private static int RandomLayer(IProvideRandomValues generator, double lambda)
            {
                var sample = generator.NextFloat();

                // A sample of exactly 0 would give -Math.Log(0) = +Infinity, and converting an infinite double to int is unspecified.
                // Clamp to the smallest positive float so the result stays finite (NOTE(review): whether NextFloat can return 0 is not visible here).
                if (sample <= 0)
                {
                    sample = float.Epsilon;
                }

                var r = -Math.Log(sample) * lambda;

                return((int)r);
            }
Ejemplo n.º 7
0
        /// <summary>
        /// Splits the points referenced by <paramref name="indices"/> with a random hyperplane and returns the indices that fall on either side, together with the hyperplane itself.
        /// The hyperplane is placed between two randomly chosen points of the set (euclidean case). This is the basic step of a random projection tree, which applies the split recursively.
        /// </summary>
        /// <param name="data">The data points; every row is expected to have the length of <c>data[0]</c>.</param>
        /// <param name="indices">Row indices of the points to split.</param>
        /// <param name="random">Random source used to choose the two anchor points and to break ties.</param>
        private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector, float hyperplaneOffset) EuclideanRandomProjectionSplit(float[][] data, int[] indices, IProvideRandomValues random)
        {
            var dim   = data[0].Length;
            var count = indices.Length;

            // Choose two anchor positions; if they coincide, move the second one to the next position (wrapping around)
            var leftIndex  = random.Next(0, count);
            var rightIndex = random.Next(0, count);
            if (leftIndex == rightIndex)
            {
                rightIndex++;
            }
            rightIndex %= count;

            var left  = indices[leftIndex];
            var right = indices[rightIndex];

            // The normal is the vector between the anchors; the offset is accumulated from the normal and the anchors' midpoint
            var offset = 0f;
            var normal = new float[dim];
            for (var axis = 0; axis < normal.Length; axis++)
            {
                normal[axis] = data[left][axis] - data[right][axis];
                offset      -= (normal[axis] * (data[left][axis] + data[right][axis])) / 2;
            }

            // Project every point onto the normal: positive margin goes left, anything else goes right, an exact zero is decided by a coin flip
            var goesLeft  = new bool[count];
            var leftCount = 0;
            for (var i = 0; i < count; i++)
            {
                var pointIndex = indices[i];
                var margin     = offset;
                for (var axis = 0; axis < dim; axis++)
                {
                    margin += normal[axis] * data[pointIndex][axis];
                }

                bool toLeft;
                if (margin == 0)
                {
                    toLeft = random.Next(0, 2) == 0;
                }
                else
                {
                    toLeft = margin > 0;
                }

                goesLeft[i] = toLeft;
                if (toLeft)
                {
                    leftCount++;
                }
            }

            // Sizes are known now, so the result arrays can be allocated exactly
            var leftIndices  = new int[leftCount];
            var rightIndices = new int[count - leftCount];

            // Distribute the indices, keeping their original relative order on each side
            var leftCursor  = 0;
            var rightCursor = 0;
            for (var i = 0; i < goesLeft.Length; i++)
            {
                if (goesLeft[i])
                {
                    leftIndices[leftCursor++] = indices[i];
                }
                else
                {
                    rightIndices[rightCursor++] = indices[i];
                }
            }

            return (leftIndices, rightIndices, normal, offset);
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Decides which side of a hyperplane a point lies on during flat tree search.
        /// </summary>
        /// <param name="hyperplane">Normal vector of the hyperplane.</param>
        /// <param name="offset">Offset added to the projection of the point onto the normal.</param>
        /// <param name="point">The point to classify.</param>
        /// <param name="random">Used to pick a side when the margin is exactly zero.</param>
        /// <returns>0 for a positive margin, the value of <c>random.Next(0, 2)</c> for a zero margin, 1 otherwise.</returns>
        private static int SelectSide(float[] hyperplane, float offset, float[] point, IProvideRandomValues random)
        {
            var projection = offset;
            var axis       = 0;

            while (axis < point.Length)
            {
                projection += hyperplane[axis] * point[axis];
                axis++;
            }

            // Exactly on the hyperplane: let chance decide
            if (projection == 0)
            {
                return random.Next(0, 2);
            }

            return projection > 0 ? 0 : 1;
        }
Ejemplo n.º 9
0
 /// <summary>
 /// Recursively builds a random projection tree over <paramref name="indices"/>: sets larger than <paramref name="leafSize"/> are split with
 /// <c>EuclideanRandomProjectionSplit</c>, anything else becomes a leaf.
 /// </summary>
 /// <param name="data">The data points.</param>
 /// <param name="indices">Row indices covered by this subtree.</param>
 /// <param name="leafSize">Largest index count that is still stored as a leaf.</param>
 /// <param name="q">Counter that is incremented on each recursion level; it is only passed along.</param>
 /// <param name="random">Random source forwarded to the split.</param>
 private static RandomProjectionTreeNode MakeEuclideanTree(float[][] data, int[] indices, int leafSize, int q, IProvideRandomValues random)
 {
     // Small enough: stop splitting and store the indices in a leaf
     if (indices.Length <= leafSize)
     {
         return new RandomProjectionTreeNode
         {
             Indices    = indices,
             LeftChild  = null,
             RightChild = null,
             IsLeaf     = true,
             Hyperplane = Array.Empty <float>(),
             Offset     = 0
         };
     }

     var split      = EuclideanRandomProjectionSplit(data, indices, random);
     var nextLevel  = q + 1;
     var leftChild  = MakeEuclideanTree(data, split.indicesLeft, leafSize, nextLevel, random);
     var rightChild = MakeEuclideanTree(data, split.indicesRight, leafSize, nextLevel, random);

     return new RandomProjectionTreeNode
     {
         Indices    = indices,
         LeftChild  = leftChild,
         RightChild = rightChild,
         IsLeaf     = false,
         Hyperplane = split.hyperplaneVector,
         Offset     = split.hyperplaneOffset
     };
 }
Ejemplo n.º 10
0
        /// <summary>
        /// Builds a random projection tree over every row of <paramref name="data"/>, with leaves holding at most <paramref name="leafSize"/> indices.
        /// </summary>
        /// <param name="data">The data points.</param>
        /// <param name="leafSize">Largest index count that is still stored as a leaf.</param>
        /// <param name="n">Starting value of the counter handed to <c>MakeEuclideanTree</c>.</param>
        /// <param name="random">Random source used for the splits.</param>
        public static RandomProjectionTreeNode MakeTree(float[][] data, int leafSize, int n, IProvideRandomValues random)
        {
            // Start from the identity index set 0 .. data.Length - 1
            var allRows = new int[data.Length];
            for (var row = 0; row < allRows.Length; row++)
            {
                allRows[row] = row;
            }

            return MakeEuclideanTree(data, allRows, leafSize, n, random);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Builds a heap of candidate neighbors for nearest neighbor descent. Every valid entry (i, idx) of the current graph is pushed in both directions, so the candidates of a vertex
        /// are its current neighbors plus the vertices that list it as a neighbor. Each pair is pushed with a random value from <c>random.NextFloat()</c> as its weight.
        /// </summary>
        /// <param name="currentGraph">The current neighbor graph; the flag entry (<c>[2]</c>) of every visited slot is reset to 0.</param>
        /// <param name="nVertices">Number of vertices to scan.</param>
        /// <param name="nNeighbors">Number of neighbor slots scanned per vertex.</param>
        /// <param name="maxCandidates">Size passed to <c>MakeHeap</c> for the candidate heap.</param>
        /// <param name="random">Source of the random weights.</param>
        public static Heap BuildCandidates(Heap currentGraph, int nVertices, int nNeighbors, int maxCandidates, IProvideRandomValues random)
        {
            var candidates = MakeHeap(nVertices, maxCandidates);

            for (var vertex = 0; vertex < nVertices; vertex++)
            {
                for (var slot = 0; slot < nNeighbors; slot++)
                {
                    // Negative index entries mark unused slots
                    var storedIndex = currentGraph[0][vertex][slot];
                    if (storedIndex < 0)
                    {
                        continue;
                    }

                    var neighbor = (int)storedIndex;                      // TODO: Should Heap be int values instead of float?
                    var flag     = (int)currentGraph[2][vertex][slot];    // TODO: Should Heap be int values instead of float?
                    var weight   = random.NextFloat();

                    HeapPush(candidates, vertex, weight, neighbor, flag);
                    HeapPush(candidates, neighbor, weight, vertex, flag);

                    // The slot has been handed over to the candidates; clear its flag
                    currentGraph[2][vertex][slot] = 0;
                }
            }
            return candidates;
        }
Ejemplo n.º 12
0
 /// <summary>
 /// Initializes a new instance of the <see cref="SmallWorld{TItem, TDistance}"/> class with an empty graph.
 /// </summary>
 /// <param name="distance">The function that measures the distance between two items.</param>
 /// <param name="generator">The random number generator kept for building the graph.</param>
 /// <param name="parameters">The algorithm parameters passed on to the graph.</param>
 public SmallWorld(Func <TItem, TItem, TDistance> distance, IProvideRandomValues generator, Parameters parameters)
 {
     Generator = generator;
     Distance  = distance;

     // The graph is created from the stored distance function and the supplied parameters
     Graph = new Graph <TItem, TDistance>(Distance, parameters);
 }
Ejemplo n.º 13
0
        /// <summary>
        /// Deserializes a graph from a stream: reads and checks the header, reads the parameters, then lets the graph read its own data.
        /// </summary>
        /// <param name="items">The items to assign to the graph's vertices.</param>
        /// <param name="distance">The distance function to use in the small world.</param>
        /// <param name="generator">The random number generator stored in the returned small world.</param>
        /// <param name="stream">The stream holding the serialized header, parameters and edges. If the header is invalid and the stream is seekable, it is rewound to where it was on entry.</param>
        /// <returns>A small world whose graph was loaded from <paramref name="stream"/>.</returns>
        /// <exception cref="InvalidDataException">The header cannot be read or does not match the expected header.</exception>
        public static SmallWorld <TItem, TDistance> DeserializeGraph(IReadOnlyList <TItem> items, Func <TItem, TItem, TDistance> distance, IProvideRandomValues generator, Stream stream)
        {
            // Stream.Position throws NotSupportedException on non-seekable streams, so only read it when a rewind is possible at all
            long   p0 = stream.CanSeek ? stream.Position : 0L;
            string hnswHeader;

            try
            {
                hnswHeader = MessagePackBinary.ReadString(stream);
            }
            catch (Exception E)
            {
                if (stream.CanSeek)
                {
                    stream.Position = p0;                    // Resets the stream to original position
                }
                throw new InvalidDataException("Invalid header found in stream, data is corrupted or invalid", E);
            }

            if (hnswHeader != SERIALIZATION_HEADER)
            {
                if (stream.CanSeek)
                {
                    stream.Position = p0;                    // Resets the stream to original position
                }
                throw new InvalidDataException("Invalid header found in stream, data is corrupted or invalid");
            }

            // readStrict: true -> removed, as not available anymore on MessagePack 2.0 - also probably not necessary anymore
            //                     see https://github.com/neuecc/MessagePack-CSharp/pull/663

            var parameters = MessagePackSerializer.Deserialize <Parameters>(stream);

            // Overwrite previous InitialDistanceCacheSize parameter, so we don't waste time/memory allocating a distance cache for an already existing graph
            parameters.InitialDistanceCacheSize = 0;

            var world = new SmallWorld <TItem, TDistance>(distance, generator, parameters);

            world.Graph.Deserialize(items, stream);
            return(world);
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Creates a nearest neighbor descent function bound to the given distance function and random source.
        /// </summary>
        /// <remarks>
        /// The returned function seeds a neighbor heap with randomly sampled neighbors, optionally adds all pairs found in each rp-tree leaf, then runs up to
        /// <c>nIters</c> refinement rounds over candidates built by <c>BuildCandidates</c>. It stops early once a round's summed <c>HeapPush</c> results do not
        /// exceed <c>delta * nNeighbors * data.Length</c>, and returns the heap after <c>DeHeapSort</c>.
        /// </remarks>
        /// <param name="distanceFn">Distance between two data points.</param>
        /// <param name="random">Random source used for sampling, candidate building and the <c>rho</c> test.</param>
        public static NNDescentFn MakeNNDescent(DistanceCalculation distanceFn, IProvideRandomValues random)
        {
            return (data, leafArray, nNeighbors, nIters, maxCandidates, delta, rho, rpTreeInit, startingIteration) =>
            {
                var nVertices = data.Length;
                var graph = MakeHeap(nVertices, nNeighbors);

                // Seed: connect every vertex to a random sample of vertices, in both directions
                for (var vertex = 0; vertex < nVertices; vertex++)
                {
                    foreach (var sampled in Utils.RejectionSample(nNeighbors, nVertices, random))
                    {
                        var dist = distanceFn(data[vertex], data[sampled]);
                        HeapPush(graph, vertex, dist, sampled, 1);
                        HeapPush(graph, sampled, dist, vertex, 1);
                    }
                }

                // Optional initialisation from rp-tree leaves: push every pair inside a leaf; a negative entry ends the leaf
                if (rpTreeInit)
                {
                    for (var leafNo = 0; leafNo < leafArray.Length; leafNo++)
                    {
                        var leaf = leafArray[leafNo];
                        for (var a = 0; a < leaf.Length && leaf[a] >= 0; a++)
                        {
                            for (var b = a + 1; b < leaf.Length && leaf[b] >= 0; b++)
                            {
                                var dist = distanceFn(data[leaf[a]], data[leaf[b]]);
                                HeapPush(graph, leaf[a], dist, leaf[b], 1);
                                HeapPush(graph, leaf[b], dist, leaf[a], 1);
                            }
                        }
                    }
                }

                // Refinement rounds
                for (var round = 0; round < nIters; round++)
                {
                    startingIteration?.Invoke(round, nIters);

                    var candidates = BuildCandidates(graph, nVertices, nNeighbors, maxCandidates, random);
                    var updates = 0;

                    for (var vertex = 0; vertex < nVertices; vertex++)
                    {
                        for (var j = 0; j < maxCandidates; j++)
                        {
                            var p = (int)Math.Floor(candidates[0][vertex][j]);

                            // Skip unused slots, and skip a slot whenever the random draw falls below rho
                            if ((p < 0) || (random.NextFloat() < rho))
                            {
                                continue;
                            }

                            var flagJ = candidates[2][vertex][j];

                            for (var k = 0; k < maxCandidates; k++)
                            {
                                var q = (int)Math.Floor(candidates[0][vertex][k]);
                                var flagK = candidates[2][vertex][k];

                                // A pair is only examined when the second slot is used and at least one of the two flags is non-zero
                                var bothFlagsZero = (flagJ == 0) && (flagK == 0);
                                if (q >= 0 && !bothFlagsZero)
                                {
                                    var dist = distanceFn(data[p], data[q]);
                                    updates += HeapPush(graph, p, dist, q, 1);
                                    updates += HeapPush(graph, q, dist, p, 1);
                                }
                            }
                        }
                    }

                    // Too few changes in this round: stop early
                    if (updates <= delta * nNeighbors * data.Length)
                    {
                        break;
                    }
                }

                return DeHeapSort(graph);
            };
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Adds the given items to the graph and connects the new nodes to it.
        /// Contains implementation of INSERT(hnsw, q, M, Mmax, efConstruction, mL) algorithm.
        /// Article: Section 4. Algorithm 1.
        /// </summary>
        /// <param name="items">The items to insert.</param>
        /// <param name="generator">The random number generator to distribute nodes across layers.</param>
        /// <param name="progressReporter">Interface to report progress </param>
        /// <param name="cancellationToken">Token to cancel adding items to the graph. The graph state will be corrupt if you cancel, and will need to be rebuilt from scratch.</param>
        /// <returns>The ids returned by <c>GraphCore.AddItems</c> for the new items, or an empty array when <paramref name="items"/> is null or empty.</returns>
        internal IReadOnlyList <int> AddItems(IReadOnlyList <TItem> items, IProvideRandomValues generator, IProgressReporter progressReporter, CancellationToken cancellationToken)
        {
            // Nothing to insert
            if (items is null || !items.Any())
            {
                return(Array.Empty <int>());
            }

            // The core is created on first use
            GraphCore = GraphCore ?? new Core(Distance, Parameters);

            // Item count before the insert; the loop below treats nodes from this index onward as the new ones
            // (assumes Items and Nodes are index-aligned - confirm against Core)
            int startIndex = GraphCore.Items.Count;

            var newIDs = GraphCore.AddItems(items, generator, cancellationToken);

            // When no entry point has been stored yet, start from the first node
            var entryPoint = EntryPoint ?? GraphCore.Nodes[0];

            var searcher = new Searcher(GraphCore);
            Func <int, int, TDistance> nodeDistance = GraphCore.GetDistance;
            // Buffer for search results, reused for every search and cleared after each use
            var neighboursIdsBuffer = new List <int>(GraphCore.Algorithm.GetM(0) + 1);

            for (int nodeId = startIndex; nodeId < GraphCore.Nodes.Count; ++nodeId)
            {
                cancellationToken.ThrowIfCancellationRequested();
                using (new ScopeLatencyTracker(GraphBuildEventSource.Instance?.GraphInsertNodeLatencyReporter))
                {
                    /*
                     * W ← ∅ // list for the currently found nearest elements
                     * ep ← get enter point for hnsw
                     * L ← level of ep // top layer for hnsw
                     * l ← ⌊-ln(unif(0..1))∙mL⌋ // new element’s level
                     * for lc ← L … l+1
                     *   W ← SEARCH-LAYER(q, ep, ef=1, lc)
                     *   ep ← get the nearest element from W to q
                     * for lc ← min(L, l) … 0
                     *   W ← SEARCH-LAYER(q, ep, efConstruction, lc)
                     *   neighbors ← SELECT-NEIGHBORS(q, W, M, lc) // alg. 3 or alg. 4
                     *     for each e ∈ neighbors // shrink connections if needed
                     *       eConn ← neighbourhood(e) at layer lc
                     *       if │eConn│ > Mmax // shrink connections of e if lc = 0 then Mmax = Mmax0
                     *         eNewConn ← SELECT-NEIGHBORS(e, eConn, Mmax, lc) // alg. 3 or alg. 4
                     *         set neighbourhood(e) at layer lc to eNewConn
                     *   ep ← W
                     * if l > L
                     *   set enter point for hnsw to q
                     */

                    // zoom in and find the best peer on the same level as newNode
                    var bestPeer    = entryPoint;
                    var currentNode = GraphCore.Nodes[nodeId];
                    var currentNodeTravelingCosts = new TravelingCosts <int, TDistance>(nodeDistance, nodeId);
                    // layers above the new node's top layer: single-result search, only used to move bestPeer
                    for (int layer = bestPeer.MaxLayer; layer > currentNode.MaxLayer; --layer)
                    {
                        searcher.RunKnnAtLayer(bestPeer.Id, currentNodeTravelingCosts, neighboursIdsBuffer, layer, 1);
                        bestPeer = GraphCore.Nodes[neighboursIdsBuffer[0]];
                        neighboursIdsBuffer.Clear();
                    }

                    // connecting new node to the small world
                    for (int layer = Math.Min(currentNode.MaxLayer, entryPoint.MaxLayer); layer >= 0; --layer)
                    {
                        searcher.RunKnnAtLayer(bestPeer.Id, currentNodeTravelingCosts, neighboursIdsBuffer, layer, Parameters.ConstructionPruning);
                        var bestNeighboursIds = GraphCore.Algorithm.SelectBestForConnecting(neighboursIdsBuffer, currentNodeTravelingCosts, layer);

                        for (int i = 0; i < bestNeighboursIds.Count; ++i)
                        {
                            int newNeighbourId = bestNeighboursIds[i];
                            // connect in both directions
                            GraphCore.Algorithm.Connect(currentNode, GraphCore.Nodes[newNeighbourId], layer);
                            GraphCore.Algorithm.Connect(GraphCore.Nodes[newNeighbourId], currentNode, layer);

                            // if distance from newNode to newNeighbour is better than to bestPeer => update bestPeer
                            if (DistanceUtils.LowerThan(currentNodeTravelingCosts.From(newNeighbourId), currentNodeTravelingCosts.From(bestPeer.Id)))
                            {
                                bestPeer = GraphCore.Nodes[newNeighbourId];
                            }
                        }

                        neighboursIdsBuffer.Clear();
                    }

                    // zoom out to the highest level
                    if (currentNode.MaxLayer > entryPoint.MaxLayer)
                    {
                        entryPoint = currentNode;
                    }

                    // report distance cache hit rate
                    GraphBuildEventSource.Instance?.CoreGetDistanceCacheHitRateReporter?.Invoke(GraphCore.DistanceCacheHitRate);
                }
                // reports the zero-based position of the node just processed and the number of new nodes
                progressReporter?.Progress(nodeId - startIndex, GraphCore.Nodes.Count - startIndex);
            }

            // construction is done
            EntryPoint = entryPoint;
            return(newIDs);
        }