/// <summary>
/// Computes the edit distance between two strings using the selected algorithm.
/// </summary>
/// <param name="s">First string.</param>
/// <param name="t">Second string.</param>
/// <param name="algorithm">
/// Algorithm to use. Any value other than Damerau-Levenshtein or Hamming
/// (including the default) is handled by the Levenshtein implementation.
/// </param>
/// <returns>The distance reported by the chosen implementation.</returns>
public static int GetDistances(string s, string t, FuzzyAlgorithm algorithm = FuzzyAlgorithm.LevenshteinDistance)
{
    switch (algorithm)
    {
        case FuzzyAlgorithm.DamerauLevenshteinDistance:
            return DamerauLevenshteinDistance.GetDamerauLevenshteinDistance(s, t);

        case FuzzyAlgorithm.HammingDistance:
            return HammingDistance.GetHammingDistance(s, t);

        default:
            // Covers FuzzyAlgorithm.LevenshteinDistance as well as any unrecognised value.
            return LevenshteinDistance.GetLevenshteinDistance(s, t);
    }
}
/// <summary>
/// Checks that the Damerau-Levenshtein distance between the two sample words is 3.
/// </summary>
public void DamerauLevenshteinTest()
{
    // Arrange
    var source = "fкул";
    var target = "кит";

    // Act
    var distance = DamerauLevenshteinDistance.GetDistance(source, target);

    // Assert
    Assert.AreEqual(3, distance);
}
/// <summary>
/// Runs a fuzzy search over the indexes of <paramref name="indexModel"/> and loads the matching documents.
/// </summary>
/// <remarks>
/// The search term is tokenized by the analyzer. An index key matches when its
/// Damerau-Levenshtein distance to at least one token is below 2. An index entry contributes
/// its document ids when it contains at least <c>matchedKeys.Count - 2</c> of the matched keys
/// (so with two or fewer matched keys every entry qualifies).
/// </remarks>
/// <param name="indexModel">Index to search.</param>
/// <param name="searchModel">Supplies the search term (<c>Term</c>) and the index key (<c>Key</c>).</param>
/// <returns>The documents for the distinct matching ids, in the order the ids were discovered.</returns>
public async Task<List<DocumentModel>> ExecuteSearch(IndexModel indexModel, SearchModel searchModel)
{
    var ids = new List<Guid>();
    var tokens = await _analyzer.Anal(searchModel.Term);
    var idxs = await DatabaseService.GetIndexes(indexModel, searchModel.Key);

    // The list keeps first-seen order (which drives the order of the result);
    // the set gives O(1) membership checks instead of scanning the list each time.
    var matchedKeys = new List<string>();
    var matchedKeySet = new HashSet<string>();

    foreach (var dict in idxs)
    {
        foreach (var candidate in dict.Keys)
        {
            if (matchedKeySet.Contains(candidate))
            {
                continue;
            }

            if (tokens.Any(token => DamerauLevenshteinDistance.GetDistance(candidate, token) < 2))
            {
                matchedKeySet.Add(candidate);
                matchedKeys.Add(candidate);
            }
        }
    }

    // Up to two of the matched keys may be missing from an index entry.
    var requiredHits = matchedKeys.Count - 2;

    foreach (var dict in idxs)
    {
        if (dict.Keys.Count(k => matchedKeySet.Contains(k)) < requiredHits)
        {
            continue;
        }

        foreach (var key in matchedKeys)
        {
            Guid id;
            if (dict.TryGetValue(key, out id))
            {
                ids.Add(id);
            }
        }
    }

    var result = new List<DocumentModel>();
    foreach (var id in ids.Distinct())
    {
        result.Add(await DatabaseService.FindById(indexModel, id));
    }

    return result;
}
/// <summary>
/// Compares two city names after normalizing them and stripping a district/region part.
/// </summary>
/// <remarks>
/// If a normalized name contains "/" (checked first) or "-", it is replaced by the result of
/// <c>GetLeftPart</c> for that separator. Whenever either name was reduced this way, the
/// final score is multiplied by 0.9 so that a partial city name can never reach 100 %.
/// NOTE(review): the earlier comment gave "Mainz-Bingen/Bingen" -> "Bingen" as an example,
/// which does not obviously agree with a helper named GetLeftPart - confirm which part is kept.
/// </remarks>
/// <param name="str1">First city name.</param>
/// <param name="str2">Second city name.</param>
/// <returns>The Damerau-Levenshtein similarity of the reduced names, scaled by 0.9 if any name was reduced.</returns>
public override float Compare(string str1, string str2)
{
    string firstCity = this.Normalize(str1);
    string secondCity = this.Normalize(str2);
    bool anyNameReduced = false;

    if (firstCity.Contains("/"))
    {
        firstCity = this.GetLeftPart(firstCity, "/");
        anyNameReduced = true;
    }
    else if (firstCity.Contains("-"))
    {
        // e.g. "Wiesbaden-Dotzheim", "Mainz-Bingen"
        firstCity = this.GetLeftPart(firstCity, "-");
        anyNameReduced = true;
    }

    if (secondCity.Contains("/"))
    {
        secondCity = this.GetLeftPart(secondCity, "/");
        anyNameReduced = true;
    }
    else if (secondCity.Contains("-"))
    {
        secondCity = this.GetLeftPart(secondCity, "-");
        anyNameReduced = true;
    }

    StringFuzzyComparer comparer = new DamerauLevenshteinDistance();
    float score = comparer.Compare(firstCity, secondCity);

    // A name that is only part of the other must not score a full match.
    float mainCityFactor = anyNameReduced ? 0.9f : 1f;
    return score * mainCityFactor;
}
/// <summary>
/// Compares two names, treating a name that ends with "." as an abbreviation (e.g. "M." for "Müller").
/// </summary>
/// <remarks>
/// Both names are always normalized before comparison. When either raw input ends with ".",
/// both normalized names are passed through <c>TrySubstring</c> with the shorter normalized
/// length (presumably truncating them to that length - confirm against the extension method),
/// and the resulting score is multiplied by 0.8 because an abbreviated name cannot be a full match.
/// No penalty is applied for differing first letters.
/// </remarks>
/// <param name="str1">First name.</param>
/// <param name="str2">Second name.</param>
/// <returns>The Damerau-Levenshtein similarity, scaled by 0.8 in the abbreviated case.</returns>
public override float Compare(string str1, string str2)
{
    // The abbreviation check looks at the raw inputs, before normalization.
    bool abbreviated = str1.EndsWith(".") || str2.EndsWith(".");

    string left = this.Normalize(str1);
    string right = this.Normalize(str2);

    if (abbreviated)
    {
        int sharedLength = Math.Min(left.Length, right.Length);
        left = left.TrySubstring(sharedLength);
        right = right.TrySubstring(sharedLength);
    }

    StringFuzzyComparer comparer = new DamerauLevenshteinDistance();
    float score = comparer.Compare(left, right);

    if (abbreviated)
    {
        score = score * 0.8f;
    }

    return score;
}
/// <summary>
/// Compares two company names, also considering their initials
/// (e.g. "International Business Machines" versus "IBM").
/// </summary>
/// <param name="str1">First company name.</param>
/// <param name="str2">Second company name.</param>
/// <returns>
/// 0.9 when the initials of both normalized names are identical; otherwise the larger of the
/// Damerau-Levenshtein similarity of the initials and that of the full normalized names.
/// </returns>
public override float Compare(string str1, string str2)
{
    string left = this.Normalize(str1);
    string right = this.Normalize(str2);

    string leftInitials = this.GetFirstCharsFromWords(left).Join("");
    string rightInitials = this.GetFirstCharsFromWords(right).Join("");

    if (leftInitials != rightInitials)
    {
        StringFuzzyComparer comparer = new DamerauLevenshteinDistance();
        float initialsScore = comparer.Compare(leftInitials, rightInitials);
        float fullNameScore = comparer.Compare(left, right);

        // Whichever view of the names matches better wins.
        return Math.Max(initialsScore, fullNameScore);
    }

    // Identical initials (e.g. "IBM" == "IBM") count as a near, but not perfect, match.
    return 0.9f;
}
/// <summary>
/// Writes, for every supported comparer, the score of <paramref name="input"/> against each
/// entry of <paramref name="testCases"/> to the debug output.
/// </summary>
/// <remarks>
/// Section headers are formatted with <c>string.Format</c> before being written. Passing the
/// format string and a <c>string</c> argument directly to <c>Debug.WriteLine</c> would bind to
/// the (message, category) overload and print the unformatted "{0}" placeholder.
/// </remarks>
/// <param name="input">The reference string.</param>
/// <param name="testCases">Strings to compare against <paramref name="input"/>.</param>
private void StringCompareTest(string input, string[] testCases)
{
    Debug.WriteLine(string.Format("Dice Coefficient for {0}:", input));
    foreach (var name in testCases)
    {
        StringFuzzyComparer dice = new DiceCoefficent();
        double diceValue = dice.Compare(input, name);
        Debug.WriteLine("\t{0} against {1}", diceValue.ToString("###,###.00000"), name);
    }
    Debug.WriteLine("");

    Debug.WriteLine(string.Format("Jaccard Coefficient for {0}:", input));
    foreach (var name in testCases)
    {
        StringFuzzyComparer jaccard = new Jaccard();
        double jaccardValue = jaccard.Compare(input, name);
        Debug.WriteLine("\t{0} against {1}", jaccardValue.ToString("###,###.00000"), name);
    }
    Debug.WriteLine("");

    Debug.WriteLine(string.Format("ExtendedJaccard Coefficient for {0}:", input));
    foreach (var name in testCases)
    {
        StringFuzzyComparer exjaccard = new ExtendedJaccard();
        double exjaccardValue = exjaccard.Compare(input, name);
        Debug.WriteLine("\t{0} against {1}", exjaccardValue.ToString("###,###.00000"), name);
    }
    Debug.WriteLine("");

    Debug.WriteLine(string.Format("DamerauLevenshteinDistance for {0}:", input));
    foreach (var name in testCases)
    {
        StringFuzzyComparer lev = new DamerauLevenshteinDistance();
        var levenStein = lev.Compare(input, name);
        Debug.WriteLine("\t{0}, against {1}", levenStein, name);
    }
    Debug.WriteLine("");

    Debug.WriteLine(string.Format("JaroWinkler for {0}:", input));
    foreach (var name in testCases)
    {
        StringFuzzyComparer jw = new JaroWinkler();
        var jwValue = jw.Compare(input, name);
        Debug.WriteLine("\t{0}, against {1}", jwValue, name);
    }
    Debug.WriteLine("");

    Debug.WriteLine(string.Format("Monge-Elkan for {0}:", input));
    foreach (var name in testCases)
    {
        StringFuzzyComparer me = new MongeElkan();
        var meValue = me.Compare(input, name);
        Debug.WriteLine("\t{0}, against {1}", meValue, name);
    }
    Debug.WriteLine("");

    Debug.WriteLine(string.Format("NGramDistance(2) for {0}:", input));
    foreach (var name in testCases)
    {
        // Configure the n-gram length at construction instead of through an unchecked 'as' cast.
        StringFuzzyComparer ngram2 = new NGramDistance { NGramLength = 2 };
        var ngramValue2 = ngram2.Compare(input, name);
        Debug.WriteLine("\t{0}, against {1}", ngramValue2, name);
    }
    Debug.WriteLine("");

    Debug.WriteLine(string.Format("SmithWaterman for {0}:", input));
    foreach (var name in testCases)
    {
        StringFuzzyComparer sw = new SmithWaterman();
        var swValue = sw.Compare(input, name);
        Debug.WriteLine("\t{0}, against {1}", swValue, name);
    }
    Debug.WriteLine("");

    Debug.WriteLine(string.Format("Extended Editex for {0}:", input));
    foreach (var name in testCases)
    {
        StringFuzzyComparer edx = new ExtendedEditex();
        var edxValue = edx.Compare(input, name);
        Debug.WriteLine("\t{0}, against {1}", edxValue, name);
    }
    Debug.WriteLine("");

    Debug.WriteLine(string.Format("Longest Common Subsequence for {0}:", input));
    foreach (var name in testCases)
    {
        StringFuzzyComparer lcs = new LongestCommonSubsequence();
        var lcsValue = lcs.Compare(input, name);
        Debug.WriteLine("\t{0}, against {1}", lcsValue.ToString("###,###.00000"), name);
    }
    Debug.WriteLine("");
}
/// <summary>
/// Comparing a string with itself must yield a distance of 0.
/// </summary>
public void ShouldReturnDamerauLevenshteinDistanceOf0()
{
    // Act
    var distance = DamerauLevenshteinDistance.DamLev(InputArm, InputArm);

    // Assert
    Assert.AreEqual(0, distance);
}
/// <summary>
/// <c>Calculate</c> must report a distance of 2 between the "Ram" and "Mom" fixtures.
/// </summary>
public void ShouldReturnDamerauLevenshteinDistanceOf2()
{
    // Act
    var distance = DamerauLevenshteinDistance.Calculate(InputRam, InputMom);

    // Assert
    Assert.AreEqual(2, distance);
}
/// <summary>
/// The second constructor argument must be exposed through <c>Input2</c>.
/// </summary>
public void ShouldSetDamerauLevenshteinDistanceInput2()
{
    // Arrange / Act
    Algorithm = new DamerauLevenshteinDistance(InputRam, InputArm);

    // Assert
    Assert.AreEqual(InputArm, Algorithm.Input2);
}
/// <summary>
/// Constructing the algorithm with two inputs must produce a non-null instance.
/// </summary>
public void ShouldReturnDamerauLevenshteinDistanceInstance()
{
    // Arrange / Act
    Algorithm = new DamerauLevenshteinDistance(InputArm, InputRam);

    // Assert
    Assert.IsNotNull(Algorithm);
}
/// <summary>
/// <c>DamLev</c> called with a third argument of 1 must return -1 for the "Ram"/"Mom" fixtures.
/// </summary>
/// <remarks>
/// NOTE(review): the third argument is presumably a maximum allowed distance, with -1
/// signalling that it was exceeded - confirm against the DamLev implementation.
/// </remarks>
public void ShouldReturnDamerauLevenshteinDistanceOfNegative1()
{
    // Act
    var distance = DamerauLevenshteinDistance.DamLev(InputRam, InputMom, 1);

    // Assert
    Assert.AreEqual(-1, distance);
}
/// <summary>
/// <c>DamLev</c> must report a distance of 1 between the "Ram" and "Ream" fixtures.
/// </summary>
public void ShouldReturnDamerauLevenshteinDistanceOf1()
{
    // Act
    var distance = DamerauLevenshteinDistance.DamLev(InputRam, InputReam);

    // Assert
    Assert.AreEqual(1, distance);
}
/// <summary>
/// Pairs each comparison's PK file with the candidate file whose name is closest to it.
/// </summary>
/// <remarks>
/// Names are compared by Damerau-Levenshtein distance on the lower-cased file names without
/// extension. Matching is greedy: the pair with the smallest distance overall is fixed first,
/// then both its comparison and its file are removed from further consideration.
/// </remarks>
/// <param name="comparisons">Comparisons whose <c>PkFile.Path</c> provides the name to match.</param>
/// <param name="files">Candidate file paths; each one is assigned to at most one comparison.</param>
/// <returns>
/// One entry per comparison, in the order of <paramref name="comparisons"/>: the paired file
/// path, or <c>null</c> when no candidate file was left for that comparison.
/// </returns>
private List<string> pairWithPk(IEnumerable<TreatmentComparison> comparisons, List<string> files)
{
    // Materialize once: the sequence is both counted and projected below.
    var comparisonList = comparisons.ToList();
    var pairedFiles = Enumerable.Repeat<string>(null, comparisonList.Count).ToList();
    var distance = new DamerauLevenshteinDistance();

    // scoresArray[comparison index][file index] = name distance
    var scoresArray = comparisonList.Select((c, i) => new { c, i })
        .ToDictionary(row => row.i, row =>
        {
            var pkFilename = Path.GetFileNameWithoutExtension(row.c.PkFile.Path).ToLower();
            return files.Select((f, j) => new { f, j })
                .ToDictionary(col => col.j, col => distance.Calculate(
                    Path.GetFileNameWithoutExtension(col.f).ToLower(),
                    pkFilename));
        });

    // Iteratively extract the best matches while more than one cell is left.
    while (scoresArray.Any() && scoresArray.Count * scoresArray.First().Value.Count > 1)
    {
        // Find the current best match.
        int currentScore = int.MaxValue, pkId = -1, fId = -1;
        foreach (var row in scoresArray)
        {
            int rowScore = int.MaxValue, rowfId = -1;
            foreach (var col in row.Value)
            {
                if (col.Value < rowScore)
                {
                    rowfId = col.Key;
                    rowScore = col.Value;
                }
            }

            if (rowScore < currentScore)
            {
                pkId = row.Key;
                fId = rowfId;
                currentScore = rowScore;
            }
        }

        // Save the selected match.
        pairedFiles[pkId] = files[fId];

        // Remove the matched comparison and file from the remaining candidates.
        scoresArray.Remove(pkId);
        foreach (var row in scoresArray)
        {
            row.Value.Remove(fId);
        }
    }

    // A single remaining cell is paired directly. When there were fewer files than comparisons,
    // rows remain but have no candidate left; those comparisons stay unpaired (null).
    if (scoresArray.Any())
    {
        var lastRow = scoresArray.First();
        if (lastRow.Value.Any())
        {
            pairedFiles[lastRow.Key] = files[lastRow.Value.First().Key];
        }
    }

    return pairedFiles;
}