/// <summary>
/// Computes the distance between two strings, treating <c>null</c> the same as an empty string.
/// </summary>
/// <param name="source">First string; may be <c>null</c> or empty.</param>
/// <param name="target">Second string; may be <c>null</c> or empty.</param>
/// <param name="calculationOptions">Options forwarded unchanged to <c>CalculateDistance</c>.</param>
/// <returns>
/// The length of the other string when one input is null or empty (0 when both are);
/// otherwise the value produced by <c>CalculateDistance</c>.
/// </returns>
public static unsafe int GetDistance(string source, string target, CalculationOptions calculationOptions)
{
    // An empty side needs no calculation: the answer is simply the other side's length.
    if (string.IsNullOrEmpty(source))
    {
        return target == null ? 0 : target.Length;
    }

    if (string.IsNullOrEmpty(target))
    {
        return source.Length;
    }

    // Pin both strings so the raw character pointers stay valid for the whole call.
    fixed (char* sourceChars = source)
    fixed (char* targetChars = target)
    {
        return CalculateDistance(sourceChars, targetChars, source.Length, target.Length, calculationOptions);
    }
}
/// <summary>
/// Core distance routine operating on raw character pointers. Trims the characters both inputs
/// share at their start and end, then dispatches to the multi-threaded, SSE4.1 diagonal, or
/// row-by-row implementation.
/// </summary>
/// <param name="sourcePtr">Pointer to the first character of the source text.</param>
/// <param name="targetPtr">Pointer to the first character of the target text.</param>
/// <param name="sourceLength">Number of characters available at <paramref name="sourcePtr"/>.</param>
/// <param name="targetLength">Number of characters available at <paramref name="targetPtr"/>.</param>
/// <param name="calculationOptions">
/// Supplies <c>EnableThreadingAfterXCharacters</c>, the length at or above which the
/// multi-threaded implementation is used; also forwarded to that implementation.
/// </param>
/// <returns>The calculated distance between the two inputs.</returns>
private static unsafe int CalculateDistance(char* sourcePtr, char* targetPtr, int sourceLength, int targetLength, CalculationOptions calculationOptions)
{
    // Step past the common prefix, then let the helper shrink both lengths for the common suffix.
    var sharedPrefixLength = DataHelper.GetIndexOfFirstNonMatchingCharacter(sourcePtr, targetPtr, sourceLength, targetLength);
    sourcePtr += sharedPrefixLength;
    targetPtr += sharedPrefixLength;
    sourceLength -= sharedPrefixLength;
    targetLength -= sharedPrefixLength;
    DataHelper.TrimLengthOfMatchingCharacters(sourcePtr, targetPtr, ref sourceLength, ref targetLength);

    // If trimming consumed one side entirely, what is left of the other side is the answer.
    if (sourceLength == 0)
    {
        return targetLength;
    }

    if (targetLength == 0)
    {
        return sourceLength;
    }

    // Keep the shorter input as "source" so the outer loop has fewer iterations.
    if (sourceLength > targetLength)
    {
        var swappedPtr = sourcePtr;
        sourcePtr = targetPtr;
        targetPtr = swappedPtr;

        var swappedLength = sourceLength;
        sourceLength = targetLength;
        targetLength = swappedLength;
    }

    // Inputs at or beyond the configured threshold go to the multi-threaded implementation.
    if (targetLength >= calculationOptions.EnableThreadingAfterXCharacters)
    {
        return CalculateDistance_MultiThreaded(sourcePtr, targetPtr, sourceLength, targetLength, calculationOptions);
    }

#if NETCOREAPP
    if (Sse41.IsSupported)
    {
        // Levenshtein Distance diagonal calculation inspired by Anna Henningsen's C implementation
        // https://github.com/addaleax/levenshtein-sse
        var diagonalSize = sourceLength + 1;
        var useUShortDiagonals = sourceLength > 16
            && sourceLength < ushort.MaxValue
            && targetLength < ushort.MaxValue;

        if (!useUShortDiagonals)
        {
            // 32-bit diagonals: used when the 16-bit conditions above are not met.
            var wideDiagonal1 = ArrayPool<int>.Shared.Rent(diagonalSize);
            var wideDiagonal2 = ArrayPool<int>.Shared.Rent(diagonalSize);
            fixed (void* wideDiagonal1Ptr = wideDiagonal1)
            fixed (void* wideDiagonal2Ptr = wideDiagonal2)
            {
                var distance = CalculateDiagonal_MinSse41<int>(wideDiagonal1Ptr, wideDiagonal2Ptr, sourcePtr, sourceLength, targetPtr, targetLength);
                ArrayPool<int>.Shared.Return(wideDiagonal1);
                ArrayPool<int>.Shared.Return(wideDiagonal2);
                return distance;
            }
        }

        // 16-bit diagonals: both lengths are below ushort.MaxValue and source exceeds 16 characters.
        var narrowDiagonal1 = ArrayPool<ushort>.Shared.Rent(diagonalSize);
        var narrowDiagonal2 = ArrayPool<ushort>.Shared.Rent(diagonalSize);
        fixed (void* narrowDiagonal1Ptr = narrowDiagonal1)
        fixed (void* narrowDiagonal2Ptr = narrowDiagonal2)
        {
            var distance = CalculateDiagonal_MinSse41<ushort>(narrowDiagonal1Ptr, narrowDiagonal2Ptr, sourcePtr, sourceLength, targetPtr, targetLength);
            ArrayPool<ushort>.Shared.Return(narrowDiagonal1);
            ArrayPool<ushort>.Shared.Return(narrowDiagonal2);
            return distance;
        }
    }
#endif

    // Fallback: row-by-row calculation over a single pooled row buffer.
    var rowBuffer = ArrayPool<int>.Shared.Rent(targetLength);
    fixed (int* previousRowPtr = rowBuffer)
    {
        DataHelper.SequentialFill(previousRowPtr, targetLength);

        // Levenshtein Distance outer loop unrolling inspired by Gustaf Andersson's JS implementation
        // https://github.com/gustf/js-levenshtein/blob/55ca1bf22bd55aa81cb5836c63582da6e9fb5fb0/index.js#L71-L90
        var rowIndex = 0;
        CalculateRows_4Rows(previousRowPtr, sourcePtr, sourceLength, ref rowIndex, targetPtr, targetLength);

        // Process whichever rows the 4-row pass left over, one at a time.
        while (rowIndex < sourceLength)
        {
            var substitutionCost = rowIndex;
            var insertionCost = rowIndex + 1;
            var sourceChar = sourcePtr[rowIndex];
            CalculateRow(previousRowPtr, targetPtr, targetLength, sourceChar, insertionCost, substitutionCost);
            rowIndex++;
        }

        // The final cell of the row holds the result; read it before handing the buffer back.
        var distance = previousRowPtr[targetLength - 1];
        ArrayPool<int>.Shared.Return(rowBuffer);
        return distance;
    }
}
/// <summary>
/// Computes the distance between two character spans.
/// </summary>
/// <param name="source">First sequence of characters.</param>
/// <param name="target">Second sequence of characters.</param>
/// <param name="calculationOptions">Options forwarded unchanged to <c>CalculateDistance</c>.</param>
/// <returns>
/// The length of the other span when one span is empty; otherwise the value produced by
/// <c>CalculateDistance</c>.
/// </returns>
public static unsafe int GetDistance(ReadOnlySpan<char> source, ReadOnlySpan<char> target, CalculationOptions calculationOptions)
{
    // An empty side needs no calculation: the answer is simply the other side's length.
    if (source.IsEmpty)
    {
        return target.Length;
    }

    if (target.IsEmpty)
    {
        return source.Length;
    }

    // Pin both spans so the raw character pointers stay valid for the whole call.
    fixed (char* sourceChars = source)
    fixed (char* targetChars = target)
    {
        return CalculateDistance(sourceChars, targetChars, source.Length, target.Length, calculationOptions);
    }
}
/// <summary>
/// Calculates the distance by splitting the target into contiguous column segments, one per
/// worker. Every segment except the last is queued on the thread pool; the last segment is
/// processed on the calling thread. Neighbouring segments share a pooled "column boundary"
/// buffer of <c>sourceLength + 1</c> entries.
/// </summary>
/// <param name="sourcePtr">Pointer to the first character of the source text.</param>
/// <param name="targetPtr">Pointer to the first character of the target text.</param>
/// <param name="sourceLength">Number of characters available at <paramref name="sourcePtr"/>.</param>
/// <param name="targetLength">Number of characters available at <paramref name="targetPtr"/>.</param>
/// <param name="options">
/// Supplies <c>MinimumCharactersPerThread</c>, which bounds how many workers are used.
/// Non-positive values are treated as 1.
/// </param>
/// <returns>The last entry of the final column boundary, i.e. the calculated distance.</returns>
private static unsafe int CalculateDistance_MultiThreaded(char* sourcePtr, char* targetPtr, int sourceLength, int targetLength, CalculationOptions options)
{
    // Read the processor count once so the worker cap and the counter buffer size always agree.
    var maximumNumberOfWorkers = Environment.ProcessorCount;

    // CalculationOptions is declared elsewhere; guard against a zero or negative setting, which
    // would otherwise divide by zero or produce a negative worker count.
    var minimumCharactersPerThread = Math.Max(1, options.MinimumCharactersPerThread);

    var numberOfWorkers = targetLength / minimumCharactersPerThread;
    if (numberOfWorkers < 1)
    {
        numberOfWorkers = 1;
    }
    else if (numberOfWorkers > maximumNumberOfWorkers)
    {
        numberOfWorkers = maximumNumberOfWorkers;
    }

    var numberOfColumnsPerWorker = targetLength / numberOfWorkers;
    var remainderColumns = targetLength % numberOfWorkers;

    // One counter slot per potential worker, handed to every worker via RowCountPtr.
    // stackalloc memory has undefined contents, so zero it before any worker can read it.
    var rowCountPtr = stackalloc int[maximumNumberOfWorkers];
    for (var i = 0; i < maximumNumberOfWorkers; i++)
    {
        rowCountPtr[i] = 0;
    }

    var columnBoundariesPool = ArrayPool<int[]>.Shared.Rent(numberOfWorkers + 1);

    //Initialise shared task boundaries
    for (var i = 0; i < numberOfWorkers + 1; i++)
    {
        columnBoundariesPool[i] = ArrayPool<int>.Shared.Rent(sourceLength + 1);
        columnBoundariesPool[i][0] = i * numberOfColumnsPerWorker;
    }

    // The final boundary also covers the columns left over by the integer division.
    columnBoundariesPool[numberOfWorkers][0] += remainderColumns;

    //Fill first column boundary (ColumnIndex = 0) with incrementing numbers
    fixed (int* startBoundaryPtr = columnBoundariesPool[0])
    {
        DataHelper.SequentialFill(startBoundaryPtr, 0, sourceLength + 1);
    }

    // Queue every segment except the last on the thread pool.
    var lastWorkerIndex = numberOfWorkers - 1;
    for (var workerIndex = 0; workerIndex < lastWorkerIndex; workerIndex++)
    {
        var columnIndex = workerIndex * numberOfColumnsPerWorker;
        ThreadPool.QueueUserWorkItem(WorkerTask, new WorkerState
        {
            RowCountPtr = rowCountPtr,
            WorkerIndex = workerIndex,
            ColumnIndex = columnIndex,
            SourcePtr = sourcePtr,
            SourceLength = sourceLength,
            TargetSegmentPtr = targetPtr + columnIndex,
            TargetSegmentLength = numberOfColumnsPerWorker,
            BackColumnBoundary = columnBoundariesPool[workerIndex],
            ForwardColumnBoundary = columnBoundariesPool[workerIndex + 1]
        });
    }

    //Run last segment synchronously (ie. in the current thread)
    // The last segment additionally takes the remainder columns.
    var lastWorkerColumnIndex = lastWorkerIndex * numberOfColumnsPerWorker;
    WorkerTask_CalculateSegment(new WorkerState
    {
        RowCountPtr = rowCountPtr,
        WorkerIndex = lastWorkerIndex,
        ColumnIndex = lastWorkerColumnIndex,
        SourcePtr = sourcePtr,
        SourceLength = sourceLength,
        TargetSegmentPtr = targetPtr + lastWorkerColumnIndex,
        TargetSegmentLength = numberOfColumnsPerWorker + remainderColumns,
        BackColumnBoundary = columnBoundariesPool[lastWorkerIndex],
        ForwardColumnBoundary = columnBoundariesPool[lastWorkerIndex + 1]
    });

    //Extract last value in forward column boundary of last task (the actual distance)
    var result = columnBoundariesPool[numberOfWorkers][sourceLength];

    //Cleanup
    //Return all column boundaries then the container of boundaries
    // NOTE(review): deliberately not wrapped in try/finally - if the synchronous segment threw,
    // queued workers might still hold these buffers, so returning them early would be unsafe.
    // Confirm against WorkerTask before changing.
    for (var i = 0; i < numberOfWorkers + 1; i++)
    {
        ArrayPool<int>.Shared.Return(columnBoundariesPool[i]);
    }

    // Clear the container so the pool does not keep references to buffers that now belong
    // to ArrayPool<int>.
    ArrayPool<int[]>.Shared.Return(columnBoundariesPool, clearArray: true);

    return result;
}