예제 #1
0
        /// <summary>
        /// Checks the state of an instance built with the parameterless constructor:
        /// the internal token similarity must be Levenshtein and the symmetric flag off.
        /// </summary>
        public void DefaultConstructor()
        {
            var similarity = new BagOfWordsSimilarity();

            // Token-level measure reported by the fresh instance.
            similarity.InternalTokenSimilarity.Should().Be(TokenSimilarity.Levenshtein);

            // Symmetry flag reported by the fresh instance.
            similarity.IsSymmetric.Should().Be(false);
        }
예제 #2
0
        /// <summary>
        /// Scores a three-token pattern against a two-token target, both supplied as
        /// arrays of <c>NormalizedString</c>, using a default-constructed similarity,
        /// and expects roughly 0.696.
        /// </summary>
        public void GetSimilarityNormalizedStringNotSymmetric()
        {
            var similarity = new BagOfWordsSimilarity();

            // The pattern carries one extra token ("miss") that the target lacks.
            var pattern = new[]
            {
                new NormalizedString("miss"),
                new NormalizedString("anna"),
                new NormalizedString("kurnikovova"),
            };
            var target = new[]
            {
                new NormalizedString("kurnikovova"),
                new NormalizedString("anna"),
            };

            var score = similarity.GetSimilarity(pattern, target);

            score.Should().BeApproximately(0.696, ErrorTolerance);
        }
예제 #3
0
        /// <summary>
        /// Scores two raw strings wrapped in <c>Tokenizer</c> instances with a
        /// default-constructed similarity and expects roughly 0.696.
        /// </summary>
        public void GetSimilarityTokenizerNotSymmetric()
        {
            const string patternText = "miss anna kurnikovova";
            const string targetText  = "kurnikovova anna";

            var similarity = new BagOfWordsSimilarity();

            // Tokenizers are created in pattern-then-target order, directly as arguments.
            var score = similarity.GetSimilarity(new Tokenizer(patternText),
                                                 new Tokenizer(targetText));

            score.Should().BeApproximately(0.696, ErrorTolerance);
        }
예제 #4
0
        /// <summary>
        /// Exercises a default-constructed <c>BagOfWordsSimilarity</c> on plain string
        /// token arrays: the same tokens in a different order are expected to score 1.0,
        /// and a pair with one differing token ("jagr" vs "jager") to score about 0.9.
        /// </summary>
        public void GetSimilarityStringsDefault()
        {
            var bagOfTokensSim = new BagOfWordsSimilarity();

            // Same two tokens, swapped order.
            var patternTokens = new[] { "anna", "kurnikovova" };
            var targetTokens  = new[] { "kurnikovova", "anna" };
            var scoreRes      = bagOfTokensSim.GetSimilarity(patternTokens, targetTokens);

            scoreRes.Should().Be(1.0);

            // First token identical, second token differs.
            var patternTokens2 = new[] { "jaromir", "jagr" };
            var targetTokens2  = new[] { "jaromir", "jager" };
            var scoreRes2      = bagOfTokensSim.GetSimilarity(patternTokens2, targetTokens2);

            // 0.9 has no exact binary floating-point representation, so compare within
            // ErrorTolerance instead of demanding bit-for-bit equality.
            scoreRes2.Should().BeApproximately(0.9, ErrorTolerance);
        }
예제 #5
0
        /// <summary>
        /// Builds the similarity with an explicit Levenshtein token measure, confirms the
        /// instance reports that measure, then scores two pairs of string token arrays.
        /// </summary>
        public void GetSimilarityStringsLevenshtein()
        {
            var similarity = new BagOfWordsSimilarity(TokenSimilarity.Levenshtein);

            similarity.InternalTokenSimilarity.Should().Be(TokenSimilarity.Levenshtein);

            // Same two tokens, swapped order.
            var reorderedScore = similarity.GetSimilarity(
                new[] { "anna", "kurnikovova" },
                new[] { "kurnikovova", "anna" });

            reorderedScore.Should().Be(1.0);

            // First token identical, second token differs ("jagr" vs "jager").
            var misspelledScore = similarity.GetSimilarity(
                new[] { "jaromir", "jagr" },
                new[] { "jaromir", "jager" });

            misspelledScore.Should().BeApproximately(0.9, ErrorTolerance);
        }
예제 #6
0
        /// <summary>
        /// Compares a non-symmetric and a symmetric <c>BagOfWordsSimilarity</c> on the
        /// same input, where the pattern has one token ("miss") that the target lacks.
        /// The non-symmetric instance is expected to score about 0.696 and the symmetric
        /// instance exactly 1.0.
        /// </summary>
        public void GetSimilaritySymmetric()
        {
            // non-symmetric instance of BagOfWordsSimilarity
            var bagOfTokensNotSymmetric = new BagOfWordsSimilarity(TokenSimilarity.Levenshtein, false);

            bagOfTokensNotSymmetric.IsSymmetric.Should().Be(false);

            // symmetric instance of BagOfWordsSimilarity
            var bagOfTokensSymmetric = new BagOfWordsSimilarity(TokenSimilarity.Levenshtein, true);

            bagOfTokensSymmetric.IsSymmetric.Should().Be(true);

            var patternTokens = new[] { "miss", "anna", "kurnikovova" };
            var targetTokens  = new[] { "kurnikovova", "anna" };

            // Each result variable is named after the instance that produced it.
            var scoreNotSymmetricRes = bagOfTokensNotSymmetric.GetSimilarity(patternTokens, targetTokens);

            scoreNotSymmetricRes.Should().BeApproximately(0.696, ErrorTolerance);

            var scoreSymmetricRes = bagOfTokensSymmetric.GetSimilarity(patternTokens, targetTokens);

            scoreSymmetricRes.Should().Be(1.0);
        }
예제 #7
0
        /// <summary>
        /// Demo entry point that walks through the similarity and distance measures used
        /// below (bag of words, Levenshtein, Damerau-Levenshtein, Jaro-Winkler and
        /// q-gram coefficients). Results are only stored in locals, nothing is printed;
        /// the method then blocks on a key press.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        private static void Main(string[] args)
        {
            /****************************************************/
            /*              BagOfWordsSimilarity                */
            /****************************************************/

            // the recommended method for computing similarity of multi-word strings
            var          bagOfTokens = new BagOfWordsSimilarity();
            const string pattern     = "John Smith";
            const string targetText  = "Mr. John Smith, Jr.";

            // using normalized string and tokenizer returns score 1.0
            var resultingSim = bagOfTokens.GetSimilarity(new Tokenizer(new NormalizedString(pattern)),
                                                         new Tokenizer(new NormalizedString(targetText)));


            /****************************************************/
            /*              Levenshtein                         */
            /****************************************************/
            const string nameCorrect = "martha";
            const string nameError   = "marhta";

            // Levenshtein distance (implements interface IDistance)
            // & similarity (implements interface ISimilarity)
            var lev = new Levenshtein();

            // returns edit distance 2
            var distLev = lev.GetDistance(nameCorrect, nameError);

            // a normalized string strips special symbols and diacritics and makes the
            // comparison case-insensitive
            // returns score 0.67
            var simLev = lev.GetSimilarity(new NormalizedString(nameCorrect), new NormalizedString(nameError));

            /****************************************************/
            /*              Damerau-Levenshtein                 */
            /****************************************************/
            // DamerauLevenshtein implements IDistance and ISimilarity
            var damLev = new DamerauLevenshtein();

            // returns edit distance 1
            var distDamLev = damLev.GetDistance(nameCorrect, nameError);

            // returns score 0.83
            var simDamLev = damLev.GetSimilarity(nameCorrect, nameError);

            /****************************************************/
            /*              Jaro, Jaro-Winkler                  */
            /****************************************************/

            // Jaro and Jaro-Winkler implement only ISimilarity
            var nameFirst   = new Token("dwayne");
            var nameSecond  = new Token("duane");
            var jaro        = new Jaro(); // constructed only; the call below uses Jaro-Winkler
            var jaroWinkler = new JaroWinkler();

            // keep the score in a local, like every other measure in this demo
            var simJaroWinkler = jaroWinkler.GetSimilarity(nameFirst, nameSecond);


            /****************************************************/
            /*              Q-grams coefficient                 */
            /****************************************************/
            // q-grams similarity coefficients - Dice, Jaccard, Overlap;
            // all three are instantiated with the Bigram q-gram type here
            var diceBigrams    = new DiceCoefficient <Bigram>();
            var jaccardBigrams = new JaccardCoefficient <Bigram>();
            var overlapBigrams = new OverlapCoefficient <Bigram>();

            // returns score 0.5
            var jaccardSim = jaccardBigrams.GetSimilarity(pattern, targetText);

            // returns score 0.67
            var diceSim = diceBigrams.GetSimilarity(pattern, targetText);

            // returns score 1.0
            var overlapSim = overlapBigrams.GetSimilarity(pattern, targetText);

            // wait for a key press before the method returns
            Console.ReadKey();
        }