/// <summary>
/// Checks the state of a <see cref="BagOfWordsSimilarity"/> created with the parameterless
/// constructor: the internal token similarity is Levenshtein and the instance is not symmetric.
/// </summary>
public void DefaultConstructor()
{
    var similarity = new BagOfWordsSimilarity();

    var tokenSimilarity = similarity.InternalTokenSimilarity;
    tokenSimilarity.Should().Be(TokenSimilarity.Levenshtein);

    var isSymmetric = similarity.IsSymmetric;
    isSymmetric.Should().Be(false);
}
/// <summary>
/// Scores two arrays of <see cref="NormalizedString"/> tokens with a default-constructed
/// <see cref="BagOfWordsSimilarity"/>; the pattern carries one token ("miss") that the target lacks.
/// </summary>
public void GetSimilarityNormalizedStringNotSymmetric()
{
    NormalizedString[] pattern =
    {
        new NormalizedString("miss"),
        new NormalizedString("anna"),
        new NormalizedString("kurnikovova")
    };
    NormalizedString[] target =
    {
        new NormalizedString("kurnikovova"),
        new NormalizedString("anna")
    };
    var similarity = new BagOfWordsSimilarity();

    similarity.GetSimilarity(pattern, target)
        .Should().BeApproximately(0.696, ErrorTolerance);
}
/// <summary>
/// Scores two <see cref="Tokenizer"/> inputs built from raw strings with a default-constructed
/// <see cref="BagOfWordsSimilarity"/>; the pattern carries one word ("miss") that the target lacks.
/// </summary>
public void GetSimilarityTokenizerNotSymmetric()
{
    var patternSource = new Tokenizer("miss anna kurnikovova");
    var targetSource = new Tokenizer("kurnikovova anna");
    var similarity = new BagOfWordsSimilarity();

    var score = similarity.GetSimilarity(patternSource, targetSource);

    score.Should().BeApproximately(0.696, ErrorTolerance);
}
/// <summary>
/// Scores plain string token arrays with a default-constructed <see cref="BagOfWordsSimilarity"/>:
/// first the same tokens in a different order, then a pair where one token differs slightly.
/// </summary>
public void GetSimilarityStringsDefault()
{
    var bagOfTokensSim = new BagOfWordsSimilarity();

    // Same tokens, different order: the score is expected to be exactly 1.0.
    var patternTokens = new[] { "anna", "kurnikovova" };
    var targetTokens = new[] { "kurnikovova", "anna" };
    var scoreRes = bagOfTokensSim.GetSimilarity(patternTokens, targetTokens);
    scoreRes.Should().Be(1.0);

    // "jagr" vs "jager": the score is a computed fraction, so compare with a tolerance
    // instead of exact floating-point equality (same check as GetSimilarityStringsLevenshtein).
    var patternTokens2 = new[] { "jaromir", "jagr" };
    var targetTokens2 = new[] { "jaromir", "jager" };
    var scoreRes2 = bagOfTokensSim.GetSimilarity(patternTokens2, targetTokens2);
    scoreRes2.Should().BeApproximately(0.9, ErrorTolerance);
}
/// <summary>
/// Scores plain string token arrays with a <see cref="BagOfWordsSimilarity"/> that is explicitly
/// configured with <see cref="TokenSimilarity.Levenshtein"/>.
/// </summary>
public void GetSimilarityStringsLevenshtein()
{
    var similarity = new BagOfWordsSimilarity(TokenSimilarity.Levenshtein);
    similarity.InternalTokenSimilarity.Should().Be(TokenSimilarity.Levenshtein);

    // Same tokens in a different order.
    string[] reorderedPattern = { "anna", "kurnikovova" };
    string[] reorderedTarget = { "kurnikovova", "anna" };
    similarity.GetSimilarity(reorderedPattern, reorderedTarget)
        .Should().Be(1.0);

    // One token differs slightly ("jagr" vs "jager").
    string[] misspelledPattern = { "jaromir", "jagr" };
    string[] misspelledTarget = { "jaromir", "jager" };
    similarity.GetSimilarity(misspelledPattern, misspelledTarget)
        .Should().BeApproximately(0.9, ErrorTolerance);
}
/// <summary>
/// Compares the non-symmetric and symmetric modes of <see cref="BagOfWordsSimilarity"/> on the same
/// pattern/target pair, where the pattern carries one token ("miss") that the target lacks.
/// </summary>
public void GetSimilaritySymmetric()
{
    // non-symmetric instance of BagOfWordsSimilarity
    var bagOfTokensNotSymmetric = new BagOfWordsSimilarity(TokenSimilarity.Levenshtein, false);
    bagOfTokensNotSymmetric.IsSymmetric.Should().Be(false);

    // symmetric instance of BagOfWordsSimilarity
    var bagOfTokensSymmetric = new BagOfWordsSimilarity(TokenSimilarity.Levenshtein, true);
    bagOfTokensSymmetric.IsSymmetric.Should().Be(true);

    var patternTokens = new[] { "miss", "anna", "kurnikovova" };
    var targetTokens = new[] { "kurnikovova", "anna" };

    // Each result is named after the instance that produced it.
    var scoreNotSymmetricRes = bagOfTokensNotSymmetric.GetSimilarity(patternTokens, targetTokens);
    scoreNotSymmetricRes.Should().BeApproximately(0.696, ErrorTolerance);

    var scoreSymmetricRes = bagOfTokensSymmetric.GetSimilarity(patternTokens, targetTokens);
    scoreSymmetricRes.Should().Be(1.0);
}
/// <summary>
/// Sample program that walks through the similarity and distance APIs (bag of words, Levenshtein,
/// Damerau-Levenshtein, Jaro / Jaro-Winkler and q-gram coefficients), then waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
private static void Main(string[] args)
{
    /****************************************************/
    /*              BagOfWordsSimilarity                */
    /****************************************************/

    // the recommended method for complex similarity over multiple words
    var bagOfTokens = new BagOfWordsSimilarity();

    const string pattern = "John Smith";
    const string targetText = "Mr. John Smith, Jr.";

    // using normalized string and tokenizer returns score 1.0
    var resultingSim = bagOfTokens.GetSimilarity(
        new Tokenizer(new NormalizedString(pattern)),
        new Tokenizer(new NormalizedString(targetText)));

    /****************************************************/
    /*                   Levenshtein                    */
    /****************************************************/

    const string nameCorrect = "martha";
    const string nameError = "marhta";

    // Levenshtein distance (implements interface IDistance)
    // & similarity (implements interface ISimilarity)
    var lev = new Levenshtein();

    // returns edit distance 2
    var distLev = lev.GetDistance(nameCorrect, nameError);

    // normalized string removes special symbols and diacritics and makes the comparison case-insensitive
    // returns score 0.67
    var simLev = lev.GetSimilarity(new NormalizedString(nameCorrect), new NormalizedString(nameError));

    /****************************************************/
    /*               Damerau-Levenshtein                */
    /****************************************************/

    // DamerauLevenshtein implements IDistance and ISimilarity
    var damLev = new DamerauLevenshtein();

    // returns edit distance 1
    var distDamLev = damLev.GetDistance(nameCorrect, nameError);

    // returns score 0.83
    var simDamLev = damLev.GetSimilarity(nameCorrect, nameError);

    /****************************************************/
    /*               Jaro, Jaro-Winkler                 */
    /****************************************************/

    // Jaro and Jaro-Winkler implement only ISimilarity
    var nameFirst = new Token("dwayne");
    var nameSecond = new Token("duane");

    var jaro = new Jaro();
    var jaroWinkler = new JaroWinkler();
    jaroWinkler.GetSimilarity(nameFirst, nameSecond);

    /****************************************************/
    /*               Q-grams coefficient                */
    /****************************************************/

    // q-gram similarity coefficients - Dice, Jaccard, Overlap
    // the q-gram type is a generic parameter; Bigram is used for all three here
    var diceBigrams = new DiceCoefficient<Bigram>();
    var jaccardBigrams = new JaccardCoefficient<Bigram>();
    var overlapBigrams = new OverlapCoefficient<Bigram>();

    // returns score 0.5
    var jaccardSim = jaccardBigrams.GetSimilarity(pattern, targetText);

    // returns score 0.67
    var diceSim = diceBigrams.GetSimilarity(pattern, targetText);

    // returns score 1.0
    var overlapSim = overlapBigrams.GetSimilarity(pattern, targetText);

    // keep the console window open until a key is pressed
    Console.ReadKey();
}