예제 #1
0
        // Checks Damerau.Distance for "ABCDEF" against three fixed variants of it.
        public void TestDistance()
        {
            const string baseline = "ABCDEF";
            var metric = new Damerau();

            // "CD" swapped to "DC": expected distance 1.
            var singleSwap = metric.Distance(baseline, "ABDCEF");
            Assert.Equal(1.0, singleSwap);

            // "AB" swapped to "BA" and "EF" swapped to "FE": expected distance 2.
            var doubleSwap = metric.Distance(baseline, "BACDFE");
            Assert.Equal(2.0, doubleSwap);

            // Trailing "F" missing: expected distance 1.
            var shortened = metric.Distance(baseline, "ABCDE");
            Assert.Equal(1.0, shortened);
        }
예제 #2
0
        // Checks Damerau.Distance for "abc" against four other strings.
        public void Damerau()
        {
            const string source = "abc";
            var metric = new Damerau();

            // No character of "123" matches: expected distance 3.
            Assert.AreEqual(3, metric.Distance(source, "123"));

            // Only the final "c" matches: expected distance 2.
            Assert.AreEqual(2, metric.Distance(source, "12c"));

            // Target is the source with "123" in front of it: expected distance 3.
            Assert.AreEqual(3, metric.Distance(source, "123abc"));

            // "bc" swapped to "cb": expected distance 1.
            Assert.AreEqual(1, metric.Distance(source, "acb"));
        }
예제 #3
0
        // Checks Damerau.Distance on three fixed pairs, then hands the same
        // instance to the shared NullEmptyTests.TestDistance checks.
        public void TestDistance()
        {
            var damerau = new Damerau();

            // One pair of neighbouring characters swapped.
            var oneTransposition = damerau.Distance("ABCDEF", "ABDCEF");
            Assert.Equal(expected: 1.0, actual: oneTransposition);

            // Two separate pairs of neighbouring characters swapped.
            var twoTranspositions = damerau.Distance("ABCDEF", "BACDFE");
            Assert.Equal(expected: 2.0, actual: twoTranspositions);

            // Last character removed.
            var oneDeletion = damerau.Distance("ABCDEF", "ABCDE");
            Assert.Equal(expected: 1.0, actual: oneDeletion);

            NullEmptyTests.TestDistance(damerau);
        }
        // TODO - look at this and how it returns similar request
        /// <summary>
        /// Scores how alike two strings are, based on their Damerau distance
        /// relative to the length of the longer string.
        /// </summary>
        /// <param name="source">First string to compare; may be null.</param>
        /// <param name="target">Second string to compare; may be null.</param>
        /// <returns>
        /// 0.0 when either argument is null or empty (this includes two empty strings),
        /// 1.0 when both strings are equal, otherwise
        /// <c>1.0 - distance / Math.Max(source.Length, target.Length)</c>.
        /// </returns>
        private double CalculateSimilarity(string source, string target)
        {
            // Null and empty inputs are scored as "nothing in common"; checked first so
            // no distance calculator is allocated for them.
            if (string.IsNullOrEmpty(source) || string.IsNullOrEmpty(target))
            {
                return 0.0;
            }

            if (source == target)
            {
                return 1.0;
            }

            // Explicitly typed as double so the division below is always a
            // floating-point division, whatever numeric type Distance returns.
            double stepsToSame = new Damerau().Distance(source, target);
            int longestLength = Math.Max(source.Length, target.Length);

            return 1.0 - stepsToSame / longestLength;
        }
예제 #5
0
        /// <summary>
        /// Builds an alias row for every distinct commit author (name + e-mail pair),
        /// assigning authors that share a normalized name or e-mail the same
        /// <c>NormalizedName</c>, and saves the rows through the database context.
        /// </summary>
        /// <remarks>
        /// Two passes are made:
        /// 1. Authors are grouped through a lookup keyed by both normalized name and
        ///    normalized e-mail, so that a shared name or a shared e-mail links them.
        /// 2. The rows are sorted by normalized name and each row whose name has a
        ///    Damerau distance of exactly 1 from the row before it takes over that
        ///    row's name.
        /// </remarks>
        /// <returns>A task that completes once the alias rows have been saved.</returns>
        public async Task Execute()
        {
            using (var dbContext = new GitRepositoryDbContext(false))
            {
                var normalizedDevelopers = new List<AliasedDeveloperName>();

                // Maps every normalized name AND every normalized e-mail seen so far
                // to the unique id of the developer it belongs to.
                var authorsPlace = new Dictionary<string, string>();

                var authors = dbContext.Commits
                              .Select(m => new { m.AuthorEmail, m.AuthorName })
                              .Distinct()
                              .ToArray();

                _logger.LogInformation("{datetime}: there are {count} authors submitted all the commits.", DateTime.Now, authors.Length);

                foreach (var author in authors)
                {
                    var normalizedEmail = NormalizeAuthorKey(author.AuthorEmail);
                    var normalizedName  = NormalizeAuthorKey(author.AuthorName);

                    string uniqueId;
                    string emailId;
                    var emailKnown = authorsPlace.TryGetValue(normalizedEmail, out emailId);

                    if (authorsPlace.TryGetValue(normalizedName, out uniqueId))
                    {
                        if (emailKnown && emailId != uniqueId)
                        {
                            /* Name and e-mail are both known but belong to different ids,
                             * so the two developers turn out to be the same person:
                             *   occurrence 1: ehsan, first e-mail
                             *   occurrence 2: ali,   second e-mail
                             *   occurrence 3: ehsan, second e-mail
                             * The id found through the e-mail is retired in favour of the
                             * id found through the name.
                             */
                            MergeDeveloperIds(normalizedDevelopers, authorsPlace, emailId, uniqueId);
                        }

                        authorsPlace[normalizedEmail] = uniqueId;
                    }
                    else if (emailKnown)
                    {
                        // New name for an e-mail that is already known.
                        uniqueId = emailId;
                        authorsPlace[normalizedName] = uniqueId;
                    }
                    else
                    {
                        // Completely new developer: the normalized name becomes the id.
                        uniqueId = normalizedName;
                        authorsPlace[normalizedName]  = uniqueId;
                        authorsPlace[normalizedEmail] = uniqueId;
                    }

                    normalizedDevelopers.Add(new AliasedDeveloperName()
                    {
                        Email          = author.AuthorEmail,
                        Name           = author.AuthorName,
                        NormalizedName = uniqueId
                    });
                }

                var damerauDistanceAlgorithm = new Damerau();
                normalizedDevelopers = normalizedDevelopers.OrderBy(q => q.NormalizedName)
                                       .ToList();

                // Compare each row with its successor in sort order. A renamed successor
                // is itself the "first" row of the next comparison, so a rename carries
                // forward through a run of neighbouring names.
                for (var i = 0; i < normalizedDevelopers.Count - 1; i++)
                {
                    var firstDev  = normalizedDevelopers[i];
                    var secondDev = normalizedDevelopers[i + 1];
                    var distance  = damerauDistanceAlgorithm.Distance(firstDev.NormalizedName, secondDev.NormalizedName);

                    if (distance == 1)
                    {
                        secondDev.NormalizedName = firstDev.NormalizedName;
                    }
                }

                _logger.LogInformation("{datetime}: after normalization, there are {count} unique authors have been found.",
                                       DateTime.Now, normalizedDevelopers.Select(q => q.NormalizedName).Distinct().Count());

                dbContext.AddRange(normalizedDevelopers);
                await dbContext.SaveChangesAsync().ConfigureAwait(false);

                _logger.LogInformation("{datetime}: aliased results have been saves successfully.", DateTime.Now);
            }
        }

        /// <summary>
        /// Turns an author name or e-mail into a lookup key: removes spaces, dots,
        /// square brackets, underscores, hyphens and parentheses, trims, lower-cases
        /// and finally strips diacritics.
        /// </summary>
        /// <param name="value">Raw author name or e-mail.</param>
        /// <returns>The normalized key.</returns>
        private static string NormalizeAuthorKey(string value)
        {
            return value
                   .Replace(" ", string.Empty)
                   .Replace(".", string.Empty)
                   .Replace("[", string.Empty)
                   .Replace("]", string.Empty)
                   .Replace("_", string.Empty)
                   .Replace("-", string.Empty)
                   .Replace("(", string.Empty)
                   .Replace(")", string.Empty)
                   .Trim()
                   .ToLower()
                   .RemoveDiacritics();
        }

        /// <summary>
        /// Replaces <paramref name="obsoleteId"/> with <paramref name="survivingId"/>
        /// both in the rows collected so far and in the alias lookup.
        /// </summary>
        /// <param name="developers">Alias rows collected so far.</param>
        /// <param name="aliases">Lookup from normalized name/e-mail to developer id.</param>
        /// <param name="obsoleteId">Id that is being retired.</param>
        /// <param name="survivingId">Id that takes its place.</param>
        private static void MergeDeveloperIds(List<AliasedDeveloperName> developers,
                                              Dictionary<string, string> aliases,
                                              string obsoleteId,
                                              string survivingId)
        {
            foreach (var dev in developers.Where(q => q.NormalizedName == obsoleteId))
            {
                dev.NormalizedName = survivingId;
            }

            // Repoint every key that still resolves to the retired id; otherwise a
            // later author matching one of those keys would be given the retired id
            // again. Keys are copied first because the dictionary is changed in the loop.
            var staleKeys = aliases.Where(pair => pair.Value == obsoleteId)
                            .Select(pair => pair.Key)
                            .ToList();

            foreach (var key in staleKeys)
            {
                aliases[key] = survivingId;
            }
        }