/// <summary>
/// Returns the Jaro-Winkler similarity of the specified strings as an
/// integer: the floating-point weight is multiplied by
/// <c>TreeSearchOptions.Equility</c> and truncated. Computed results are
/// stored in the <c>_Similarity</c> cache, keyed by the string pair.
/// </summary>
/// <param name="S1">First string.</param>
/// <param name="S2">Second string.</param>
/// <returns>
/// <c>TreeSearchOptions.Equility</c> when both strings are empty, 0 when
/// exactly one is empty or no characters match, otherwise the scaled weight.
/// </returns>
public static int Similarity(string S1, string S2)
{
    int lLen1 = S1.Length;
    int lLen2 = S2.Length;
    if (lLen1 == 0)
    {
        return lLen2 == 0 ? TreeSearchOptions.Equility : 0;
    }

    // Single cache lookup instead of ContainsKey followed by the indexer.
    StringPairKey K = new StringPairKey(S1, S2);
    int lCached;
    if (_Similarity.TryGetValue(K, out lCached))
    {
        return lCached;
    }

    // Characters only count as matching when their positions differ by at
    // most lSearchRange.
    int lSearchRange = Math.Max(0, Math.Max(lLen1, lLen2) / 2 - 1);

    // Both arrays are default-initialized to false.
    bool[] lMatched1 = new bool[lLen1];
    bool[] lMatched2 = new bool[lLen2];

    int lNumCommon = 0;
    for (int i = 0; i < lLen1; ++i)
    {
        int lStart = Math.Max(0, i - lSearchRange);
        int lEnd = Math.Min(i + lSearchRange + 1, lLen2);
        for (int j = lStart; j < lEnd; ++j)
        {
            if (lMatched2[j] || S1[i] != S2[j])
            {
                continue;
            }

            lMatched1[i] = true;
            lMatched2[j] = true;
            ++lNumCommon;
            break;
        }
    }

    if (lNumCommon == 0)
    {
        _Similarity.Add(K, 0);
        return 0;
    }

    // Walk the matched characters of both strings in order; every pair that
    // differs is one half of a transposition.
    int lNumHalfTransposed = 0;
    int k = 0;
    for (int i = 0; i < lLen1; ++i)
    {
        if (!lMatched1[i])
        {
            continue;
        }

        while (!lMatched2[k])
        {
            ++k;
        }

        if (S1[i] != S2[k])
        {
            ++lNumHalfTransposed;
        }

        ++k;
    }

    int lNumTransposed = lNumHalfTransposed / 2;

    double lNumCommonD = lNumCommon;
    double lWeight = (lNumCommonD / lLen1
                      + lNumCommonD / lLen2
                      + (lNumCommon - lNumTransposed) / lNumCommonD) / 3.0;

    // The common-prefix bonus is applied only above the weight threshold.
    if (lWeight > mWeightThreshold)
    {
        int lMax = Math.Min(mNumChars, Math.Min(lLen1, lLen2));
        int lPos = 0;
        while (lPos < lMax && S1[lPos] == S2[lPos])
        {
            ++lPos;
        }

        // With lPos == 0 the bonus is zero, so no special case is needed.
        // The previous special case returned (int)lWeight * Equility, which
        // truncated the weight to 0 before scaling and skipped the cache.
        lWeight += 0.1 * lPos * (1.0 - lWeight);
    }

    int Sim = (int)(lWeight * TreeSearchOptions.Equility);
    _Similarity.Add(K, Sim);
    return Sim;
}
/// <summary>
/// Returns the degree of similarity between two strings, derived from a
/// weighted edit distance (insert/remove and replace costs come from
/// <c>TreeSearchOptions</c>) and normalized against the maximum distance
/// for strings of these lengths. Results computed past the early exits are
/// stored in the <c>_Similarity</c> cache.
/// </summary>
/// <param name="S1">First string.</param>
/// <param name="S2">Second string.</param>
/// <returns>
/// <c>TreeSearchOptions.Equility</c> for equal strings, 0 when exactly one
/// string is empty, otherwise
/// <c>(MaxDist - Dist) * TreeSearchOptions.Equility / MaxDist</c>.
/// </returns>
public static int Similarity(string S1, string S2)
{
    if (S1 == S2)
    {
        return TreeSearchOptions.Equility;
    }

    int MinLen = Math.Min(S1.Length, S2.Length);
    int MaxLen = Math.Max(S1.Length, S2.Length);

    // The strings differ, so if one is empty the other is not.
    if (MinLen == 0)
    {
        return 0;
    }

    // Return the stored result if this pair has been seen before. A single
    // TryGetValue replaces ContainsKey followed by the indexer.
    StringPairKey K = new StringPairKey(S1, S2);
    int cached;
    if (_Similarity.TryGetValue(K, out cached))
    {
        return cached;
    }

    // Find the lengths of the common prefix and the common suffix.
    int prefix = 0;
    int suffix = 0;
    while (prefix < MinLen && S1[prefix] == S2[prefix])
    {
        ++prefix;
    }

    while (suffix < MinLen && S1[S1.Length - 1 - suffix] == S2[S2.Length - 1 - suffix])
    {
        ++suffix;
    }

    int Dist;
    if (prefix == MinLen || suffix == MinLen || (prefix + suffix >= MinLen))
    {
        // The common prefix and suffix together cover the shorter string,
        // so only the extra characters have to be inserted or removed.
        Dist = (MaxLen - MinLen) * TreeSearchOptions.CostInsertRemoveChar;
    }
    else
    {
        // Only the middle parts, between the common prefix and suffix,
        // take part in the distance matrix.
        int S1Len = S1.Length - prefix - suffix;
        int S2Len = S2.Length - prefix - suffix;

        int[,] D = new int[S1Len + 1, S2Len + 1];
        D[0, 0] = 0;
        for (int j = 1; j <= S2Len; ++j)
        {
            D[0, j] = D[0, j - 1] + TreeSearchOptions.CostInsertRemoveChar;
        }

        for (int i = 1; i <= S1Len; ++i)
        {
            D[i, 0] = D[i - 1, 0] + TreeSearchOptions.CostInsertRemoveChar;
            for (int j = 1; j <= S2Len; ++j)
            {
                int removeCost = D[i - 1, j] + TreeSearchOptions.CostInsertRemoveChar;
                int insertCost = D[i, j - 1] + TreeSearchOptions.CostInsertRemoveChar;
                int replaceCost = D[i - 1, j - 1]
                    + (S1[i - 1 + prefix] == S2[j - 1 + prefix] ? 0 : TreeSearchOptions.CostReplaceChar);
                D[i, j] = Math.Min(removeCost, Math.Min(insertCost, replaceCost));
            }
        }

        Dist = D[S1Len, S2Len];
    }

    // Maximum distance for a pair of strings with these lengths: every
    // character of the shorter string is replaced (or removed and
    // re-inserted, whichever is cheaper) and the surplus is inserted.
    int MaxDist = MinLen * Math.Min(TreeSearchOptions.CostReplaceChar, TreeSearchOptions.CostInsertRemoveChar * 2);
    MaxDist += (MaxLen - MinLen) * TreeSearchOptions.CostInsertRemoveChar;

    // Normalized degree of similarity.
    int Sim = (MaxDist - Dist) * TreeSearchOptions.Equility / MaxDist;

    // If another caller stored a value for this key first, keep that value.
    _Similarity.AddOrUpdate(K, Sim, (key, existing) => existing);
    return Sim;
}