Ejemplo n.º 1
0
        /// <summary>
        /// Calculates the distance between two strings, treating <c>null</c> the same as an empty string.
        /// </summary>
        /// <param name="source">The first string; may be <c>null</c>.</param>
        /// <param name="target">The second string; may be <c>null</c>.</param>
        /// <param name="calculationOptions">Options passed through unchanged to <c>CalculateDistance</c>.</param>
        /// <returns>
        /// The length of the other string when one side is null or empty (0 when both are);
        /// otherwise the value produced by <c>CalculateDistance</c>.
        /// </returns>
        public static unsafe int GetDistance(string source, string target, CalculationOptions calculationOptions)
        {
            var sourceIsEmpty = string.IsNullOrEmpty(source);
            var targetIsEmpty = string.IsNullOrEmpty(target);

            // With nothing on one side, the answer is simply the length of the other side
            if (sourceIsEmpty)
            {
                return targetIsEmpty ? 0 : target.Length;
            }

            if (targetIsEmpty)
            {
                return source.Length;
            }

            // Pin both strings so the pointer-based implementation can read their characters directly
            fixed (char* sourcePtr = source)
            fixed (char* targetPtr = target)
            {
                var distance = CalculateDistance(sourcePtr, targetPtr, source.Length, target.Length, calculationOptions);

                return distance;
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Core distance calculation over two pinned character buffers. Trims the shared prefix and
        /// suffix, then dispatches to the multi-threaded, SSE4.1 diagonal, or row-by-row implementation.
        /// </summary>
        /// <param name="sourcePtr">Pointer to the first character of the source text.</param>
        /// <param name="targetPtr">Pointer to the first character of the target text.</param>
        /// <param name="sourceLength">Number of characters readable from <paramref name="sourcePtr"/>.</param>
        /// <param name="targetLength">Number of characters readable from <paramref name="targetPtr"/>.</param>
        /// <param name="calculationOptions">
        /// Supplies <c>EnableThreadingAfterXCharacters</c>, the length threshold for the multi-threaded path,
        /// and is forwarded to that path.
        /// </param>
        /// <returns>The calculated distance between the two inputs.</returns>
        /// <remarks>
        /// Every buffer rented from <see cref="ArrayPool{T}"/> is returned in a <c>finally</c> block so that
        /// it goes back to the pool even when one of the calculation helpers throws.
        /// </remarks>
        private static unsafe int CalculateDistance(char *sourcePtr, char *targetPtr, int sourceLength, int targetLength, CalculationOptions calculationOptions)
        {
            //Identify and trim any common prefix or suffix between the strings
            var offset = DataHelper.GetIndexOfFirstNonMatchingCharacter(sourcePtr, targetPtr, sourceLength, targetLength);

            sourcePtr    += offset;
            targetPtr    += offset;
            sourceLength -= offset;
            targetLength -= offset;
            DataHelper.TrimLengthOfMatchingCharacters(sourcePtr, targetPtr, ref sourceLength, ref targetLength);

            //Check the trimmed values are not empty
            if (sourceLength == 0)
            {
                return targetLength;
            }
            if (targetLength == 0)
            {
                return sourceLength;
            }

            //Switch around variables so outer loop has fewer iterations
            //(from here on sourceLength <= targetLength)
            if (targetLength < sourceLength)
            {
                var tempSourcePtr = sourcePtr;
                sourcePtr = targetPtr;
                targetPtr = tempSourcePtr;

                (sourceLength, targetLength) = (targetLength, sourceLength);
            }

            if (targetLength >= calculationOptions.EnableThreadingAfterXCharacters)
            {
                return CalculateDistance_MultiThreaded(sourcePtr, targetPtr, sourceLength, targetLength, calculationOptions);
            }

#if NETCOREAPP
            if (Sse41.IsSupported)
            {
                //Levenshtein Distance diagonal calculation inspired by Anna Henningsen's C implementation
                //https://github.com/addaleax/levenshtein-sse
                if (sourceLength > 16 && sourceLength < ushort.MaxValue && targetLength < ushort.MaxValue)
                {
                    //Both lengths are below ushort.MaxValue, so the narrower ushort buffers are used
                    var diag1Array = ArrayPool<ushort>.Shared.Rent(sourceLength + 1);
                    var diag2Array = ArrayPool<ushort>.Shared.Rent(sourceLength + 1);

                    try
                    {
                        fixed (void *diag1Ptr = diag1Array)
                        fixed (void *diag2Ptr = diag2Array)
                        {
                            return CalculateDiagonal_MinSse41<ushort>(diag1Ptr, diag2Ptr, sourcePtr, sourceLength, targetPtr, targetLength);
                        }
                    }
                    finally
                    {
                        //Runs after the buffers are unpinned, on both the success and the exception path
                        ArrayPool<ushort>.Shared.Return(diag1Array);
                        ArrayPool<ushort>.Shared.Return(diag2Array);
                    }
                }
                else
                {
                    var diag1Array = ArrayPool<int>.Shared.Rent(sourceLength + 1);
                    var diag2Array = ArrayPool<int>.Shared.Rent(sourceLength + 1);

                    try
                    {
                        fixed (void *diag1Ptr = diag1Array)
                        fixed (void *diag2Ptr = diag2Array)
                        {
                            return CalculateDiagonal_MinSse41<int>(diag1Ptr, diag2Ptr, sourcePtr, sourceLength, targetPtr, targetLength);
                        }
                    }
                    finally
                    {
                        ArrayPool<int>.Shared.Return(diag1Array);
                        ArrayPool<int>.Shared.Return(diag2Array);
                    }
                }
            }
#endif

            //Fallback: row-by-row calculation reusing a single pooled row of targetLength entries
            var pooledArray = ArrayPool<int>.Shared.Rent(targetLength);

            try
            {
                fixed (int *previousRowPtr = pooledArray)
                {
                    //NOTE(review): presumably seeds the row with increasing values - confirm against DataHelper
                    DataHelper.SequentialFill(previousRowPtr, targetLength);

                    var rowIndex = 0;

                    //Levenshtein Distance outer loop unrolling inspired by Gustaf Andersson's JS implementation
                    //https://github.com/gustf/js-levenshtein/blob/55ca1bf22bd55aa81cb5836c63582da6e9fb5fb0/index.js#L71-L90
                    CalculateRows_4Rows(previousRowPtr, sourcePtr, sourceLength, ref rowIndex, targetPtr, targetLength);

                    //Calculate Single Rows (whatever rowIndex range the 4-row pass left unprocessed)
                    for (; rowIndex < sourceLength; rowIndex++)
                    {
                        var lastSubstitutionCost = rowIndex;
                        var lastInsertionCost    = rowIndex + 1;

                        var sourcePrevChar = sourcePtr[rowIndex];

                        CalculateRow(previousRowPtr, targetPtr, targetLength, sourcePrevChar, lastInsertionCost, lastSubstitutionCost);
                    }

                    //The last entry of the final row is the result; it is read before the finally block runs
                    return previousRowPtr[targetLength - 1];
                }
            }
            finally
            {
                ArrayPool<int>.Shared.Return(pooledArray);
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Calculates the distance between two character spans.
        /// </summary>
        /// <param name="source">The first span of characters.</param>
        /// <param name="target">The second span of characters.</param>
        /// <param name="calculationOptions">Options passed through unchanged to <c>CalculateDistance</c>.</param>
        /// <returns>
        /// The length of the other span when one side is empty; otherwise the value produced by
        /// <c>CalculateDistance</c>.
        /// </returns>
        public static unsafe int GetDistance(ReadOnlySpan<char> source, ReadOnlySpan<char> target, CalculationOptions calculationOptions)
        {
            // An empty side means the result is just the length of the other side
            if (source.IsEmpty)
            {
                return target.Length;
            }

            if (target.IsEmpty)
            {
                return source.Length;
            }

            // Pin both spans so the pointer-based implementation can read their characters directly
            fixed (char* sourcePtr = source)
            fixed (char* targetPtr = target)
            {
                var distance = CalculateDistance(sourcePtr, targetPtr, source.Length, target.Length, calculationOptions);

                return distance;
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Calculates the distance by splitting the target into column segments, one per worker.
        /// All segments except the last are queued on the thread pool; the last runs on the calling thread.
        /// </summary>
        /// <param name="sourcePtr">Pointer to the first character of the source text.</param>
        /// <param name="targetPtr">Pointer to the first character of the target text.</param>
        /// <param name="sourceLength">Number of characters readable from <paramref name="sourcePtr"/>.</param>
        /// <param name="targetLength">Number of characters readable from <paramref name="targetPtr"/>.</param>
        /// <param name="options">
        /// Supplies <c>MinimumCharactersPerThread</c>, which determines how many workers are used
        /// (capped at <see cref="Environment.ProcessorCount"/>, never fewer than one).
        /// </param>
        /// <returns>The entry at index <paramref name="sourceLength"/> of the last column boundary.</returns>
        private static unsafe int CalculateDistance_MultiThreaded(char *sourcePtr, char *targetPtr, int sourceLength, int targetLength, CalculationOptions options)
        {
            //Read the processor count once so the worker cap and the row-count buffer size always agree
            var maximumNumberOfWorkers = Environment.ProcessorCount;
            var numberOfWorkers        = targetLength / options.MinimumCharactersPerThread;

            //Always run at least one worker; also guards against a negative count reaching Rent below
            if (numberOfWorkers < 1)
            {
                numberOfWorkers = 1;
            }
            else if (numberOfWorkers > maximumNumberOfWorkers)
            {
                numberOfWorkers = maximumNumberOfWorkers;
            }

            var numberOfColumnsPerWorker = targetLength / numberOfWorkers;
            var remainderColumns         = targetLength % numberOfWorkers;

            //One counter slot per possible worker (numberOfWorkers never exceeds maximumNumberOfWorkers)
            var rowCountPtr          = stackalloc int[maximumNumberOfWorkers];
            var columnBoundariesPool = ArrayPool<int[]>.Shared.Rent(numberOfWorkers + 1);

            //Initialise shared task boundaries
            //Slot 0 of boundary i holds the column index at which segment i starts
            for (var i = 0; i < numberOfWorkers + 1; i++)
            {
                columnBoundariesPool[i] = ArrayPool<int>.Shared.Rent(sourceLength + 1);

                columnBoundariesPool[i][0] = i * numberOfColumnsPerWorker;
            }
            //The last segment absorbs the remainder, so the final boundary starts at targetLength
            columnBoundariesPool[numberOfWorkers][0] += remainderColumns;

            //Fill first column boundary (ColumnIndex = 0) with incrementing numbers
            fixed (int *startBoundaryPtr = columnBoundariesPool[0])
            {
                DataHelper.SequentialFill(startBoundaryPtr, 0, sourceLength + 1);
            }

            var lastWorkerIndex = numberOfWorkers - 1;

            //Queue every segment except the last on the thread pool
            for (var workerIndex = 0; workerIndex < lastWorkerIndex; workerIndex++)
            {
                var columnIndex = workerIndex * numberOfColumnsPerWorker;

                ThreadPool.QueueUserWorkItem(WorkerTask, new WorkerState
                {
                    RowCountPtr           = rowCountPtr,
                    WorkerIndex           = workerIndex,
                    ColumnIndex           = columnIndex,
                    SourcePtr             = sourcePtr,
                    SourceLength          = sourceLength,
                    TargetSegmentPtr      = targetPtr + columnIndex,
                    TargetSegmentLength   = numberOfColumnsPerWorker,
                    BackColumnBoundary    = columnBoundariesPool[workerIndex],
                    ForwardColumnBoundary = columnBoundariesPool[workerIndex + 1]
                });
            }

            //Run last segment synchronously (ie. in the current thread)
            //NOTE(review): the queued workers are never awaited explicitly; presumably this call cannot
            //finish before they have published all their rows via RowCountPtr - confirm, because the
            //stack-allocated counters and the pooled boundaries are released right after it returns
            var lastWorkerColumnIndex = lastWorkerIndex * numberOfColumnsPerWorker;

            WorkerTask_CalculateSegment(new WorkerState
            {
                RowCountPtr           = rowCountPtr,
                WorkerIndex           = lastWorkerIndex,
                ColumnIndex           = lastWorkerColumnIndex,
                SourcePtr             = sourcePtr,
                SourceLength          = sourceLength,
                TargetSegmentPtr      = targetPtr + lastWorkerColumnIndex,
                TargetSegmentLength   = numberOfColumnsPerWorker + remainderColumns,
                BackColumnBoundary    = columnBoundariesPool[lastWorkerIndex],
                ForwardColumnBoundary = columnBoundariesPool[lastWorkerIndex + 1]
            });

            //Extract last value in forward column boundary of last task (the actual distance)
            var result = columnBoundariesPool[numberOfWorkers][sourceLength];

            //Cleanup
            //Return all column boundaries then the container of boundaries
            for (var i = 0; i < numberOfWorkers + 1; i++)
            {
                ArrayPool<int>.Shared.Return(columnBoundariesPool[i]);
            }
            ArrayPool<int[]>.Shared.Return(columnBoundariesPool);

            return result;
        }