Esempio n. 1
0
        /// <summary>
        /// Builds clusters of words for <paramref name="meaning"/> across all variety pairs of the project.
        /// Words that take part in at least one word pair flagged <c>AreCognatePredicted</c> are clustered with a
        /// <c>FlatUpgmaClusterer</c>; words that only occur in non-flagged pairs are returned together
        /// in one extra cluster appended at the end.
        /// </summary>
        /// <param name="project">Project whose variety pairs are scanned.</param>
        /// <param name="meaning">Meaning used to look up the word pair in each variety pair.</param>
        /// <returns>The generated clusters followed by the cluster holding the leftover words.</returns>
        public static IEnumerable <Cluster <Word> > GenerateCognateSets(this CogProject project, Meaning meaning)
        {
            var cognateWords = new HashSet<Word>();
            var leftoverWords = new HashSet<Word>();

            foreach (VarietyPair varietyPair in project.VarietyPairs)
            {
                WordPair pair;
                if (!varietyPair.WordPairs.TryGetValue(meaning, out pair))
                {
                    continue;
                }

                if (pair.AreCognatePredicted)
                {
                    // A word seen in a cognate pair is promoted out of the leftover set.
                    cognateWords.Add(pair.Word1);
                    cognateWords.Add(pair.Word2);
                    leftoverWords.Remove(pair.Word1);
                    leftoverWords.Remove(pair.Word2);
                    continue;
                }

                // Non-cognate pair: only words not already known as cognates are kept as leftovers.
                if (!cognateWords.Contains(pair.Word1))
                {
                    leftoverWords.Add(pair.Word1);
                }
                if (!cognateWords.Contains(pair.Word2))
                {
                    leftoverWords.Add(pair.Word2);
                }
            }

            var distances = new Dictionary<UnorderedTuple<Word, Word>, double>();
            double smallest = double.MaxValue;
            double largest = double.MinValue;

            Word[] candidates = cognateWords.ToArray();
            for (int first = 0; first < candidates.Length; first++)
            {
                Word w1 = candidates[first];
                for (int second = first + 1; second < candidates.Length; second++)
                {
                    Word w2 = candidates[second];

                    // The score stays 0 (distance 1) unless the two words form the flagged pair
                    // stored for their varieties.
                    double similarity = 0;
                    if (w1.Variety != w2.Variety)
                    {
                        WordPair stored;
                        bool found = w1.Variety.VarietyPairs[w2.Variety].WordPairs.TryGetValue(meaning, out stored);
                        if (found && stored.AreCognatePredicted &&
                            stored.GetWord(w1.Variety) == w1 && stored.GetWord(w2.Variety) == w2)
                        {
                            similarity = stored.CognicityScore;
                        }
                    }

                    double distance = 1.0 - similarity;
                    smallest = Math.Min(smallest, distance);
                    largest = Math.Max(largest, distance);
                    distances[UnorderedTuple.Create(w1, w2)] = distance;
                }
            }

            // The clustering threshold is the midpoint of the observed distance range.
            double threshold = (largest + smallest) / 2;
            var clusterer = new FlatUpgmaClusterer<Word>((a, b) => distances[UnorderedTuple.Create(a, b)], threshold);

            // The second constructor argument is passed as true for the leftover cluster
            // (presumably marks it as noise - confirm against Cluster<T>).
            var leftoverCluster = new Cluster<Word>(leftoverWords, true);
            return clusterer.GenerateClusters(cognateWords).Concat(leftoverCluster);
        }
Esempio n. 2
0
        // Clusters five items (A..E) with FlatUpgmaClusterer at a threshold of 0.5 and
        // checks that the result is the two groups {B, C} and {A, E, D}.
        public void Cluster()
        {
            // Pairwise distances; row/column 0 corresponds to 'A', 1 to 'B', and so on.
            double[,] distances =
                {
                    {0.00, 0.50, 0.67, 0.80, 0.20},
                    {0.50, 0.00, 0.40, 0.70, 0.60},
                    {0.67, 0.40, 0.00, 0.80, 0.80},
                    {0.80, 0.70, 0.80, 0.00, 0.30},
                    {0.20, 0.60, 0.80, 0.30, 0.00}
                };
            char[] items = {'A', 'B', 'C', 'D', 'E'};

            var clusterer = new FlatUpgmaClusterer<char>((x, y) => distances[x - 'A', y - 'A'], 0.5);
            Cluster<char>[] actual = clusterer.GenerateClusters(items).ToArray();

            Cluster<char>[] expected =
                {
                    new Cluster<char>('B', 'C'),
                    new Cluster<char>('A', 'E', 'D')
                };

            var comparer = new ClusterEqualityComparer<char>();
            Assert.That(actual, Is.EquivalentTo(expected).Using(comparer));
        }
        // Verifies FlatUpgmaClusterer on a fixed 5x5 distance table: with a threshold of 0.5
        // the items A..E are expected to come out as the clusters {B, C} and {A, E, D}.
        public void Cluster()
        {
            const double threshold = 0.5;

            // Distance table indexed by (item - 'A') on both axes.
            var table = new[, ]
            {
                { 0.00, 0.50, 0.67, 0.80, 0.20 },
                { 0.50, 0.00, 0.40, 0.70, 0.60 },
                { 0.67, 0.40, 0.00, 0.80, 0.80 },
                { 0.80, 0.70, 0.80, 0.00, 0.30 },
                { 0.20, 0.60, 0.80, 0.30, 0.00 }
            };
            Func<char, char, double> lookup = (left, right) => table[left - 'A', right - 'A'];

            var upgma = new FlatUpgmaClusterer<char>(lookup, threshold);
            var input = new[] { 'A', 'B', 'C', 'D', 'E' };
            Cluster<char>[] produced = upgma.GenerateClusters(input).ToArray();

            var wanted = new[]
            {
                new Cluster<char>('B', 'C'),
                new Cluster<char>('A', 'E', 'D')
            };

            Assert.That(produced, Is.EquivalentTo(wanted).Using(new ClusterEqualityComparer<char>()));
        }
Esempio n. 4
0
        /// <summary>
        /// Recomputes <c>_currentClusters</c> from the current varieties and assigns each variety
        /// view model the index of the cluster containing it (or -1 when it has no regions).
        /// Does nothing unless the project service reports that all varieties are compared.
        /// </summary>
        private void ClusterVarieties()
        {
            if (!_projectService.AreAllVarietiesCompared)
            {
                return;
            }

            // Distance is 1 - similarity for the selected metric; stays null for any other metric value.
            Func<Variety, Variety, double> getDistance = null;
            if (_similarityMetric == SimilarityMetric.Lexical)
            {
                getDistance = (a, b) => 1.0 - a.VarietyPairs[b].LexicalSimilarityScore;
            }
            else if (_similarityMetric == SimilarityMetric.Phonetic)
            {
                getDistance = (a, b) => 1.0 - a.VarietyPairs[b].PhoneticSimilarityScore;
            }

            double distanceThreshold = 1.0 - _similarityScoreThreshold;
            var clusterer = new FlatUpgmaClusterer<Variety>(getDistance, distanceThreshold);

            _currentClusters.Clear();

            // Keep only clusters with at least one variety that has regions, largest cluster first.
            _currentClusters.AddRange(
                clusterer.GenerateClusters(_varieties.Select(vm => vm.DomainVariety))
                    .Where(cluster => cluster.DataObjects.Any(member => member.Regions.Count > 0))
                    .OrderByDescending(cluster => cluster.DataObjects.Count));

            foreach (GeographicalVarietyViewModel variety in _varieties)
            {
                variety.ClusterIndex = variety.Regions.Count > 0
                    ? _currentClusters.FindIndex(cluster => cluster.DataObjects.Contains(variety.DomainVariety))
                    : -1;
            }
        }
Esempio n. 5
0
/* Commented out for now, because the CommandLine library doesn't like finding two Usage attributes and we already have one on VerbBase
 *              [Usage(ApplicationAlias = "cog-cmdline")]
 *              public new static IEnumerable<Example> Examples
 *              {
 *                      get
 *                      {
 *                              yield return new Example("UPGMA clustering (specify a threshhold value)", new ClusterVerb { Method = "upgma", Threshhold = 0.2 });
 *                              yield return new Example("DBSCAN clustering (specify epsilon and min-words values)", new ClusterVerb { Method = "dbscan", Epsilon = 0.2, MinWords = 2 });
 *                              yield return new Example("LSDBC clustering (specify alpha and K values)", new ClusterVerb { Method = "lsdbc", Alpha = 0.2, K = 3 });
 *                      }
 *              }
 */

        /// <summary>
        /// Reads word-pair similarity scores from <paramref name="inputReader"/>, converts each score to a
        /// distance (1 - score), clusters the words with the algorithm named by <c>Method</c>
        /// ("upgma", "dbscan" or "lsdbc", case-insensitive) and prints the clusters.
        /// </summary>
        /// <param name="inputReader">Source of input lines in the form <c>word1 word2 score</c>.</param>
        /// <param name="outputWriter">Writer the resulting clusters are printed to.</param>
        /// <param name="errorWriter">Not used directly by this method.</param>
        /// <returns>
        /// <c>ReturnCode.InputError</c> when <c>Method</c> is not a recognized clustering method;
        /// otherwise <c>ReturnCode.Okay</c>. Malformed lines are recorded in <c>Errors</c> and skipped.
        /// </returns>
        protected override ReturnCode DoWork(TextReader inputReader, TextWriter outputWriter, TextWriter errorWriter)
        {
            SetupProject();

            string lowerMethod = Method.ToLowerInvariant();

            // Validate the method up front so that no input is consumed for an invalid request.
            switch (lowerMethod)
            {
            case "dbscan":
            case "lsdbc":
            case "upgma":
                break;

            default:
                Errors.Add($"Invalid clustering method {Method}. Valid values are \"upgma\", \"dbscan\", and \"lsdbc\" (not case-sensitive, e.g. \"Upgma\" also works.)");
                return ReturnCode.InputError;
            }

            foreach (string line in ReadLines(inputReader))
            {
                // Format: word1 word2 score (where score is a floating-point number with 1.0 = 100% similarity)
                // Empty entries are dropped so that repeated or leading spaces do not shift the columns.
                string[] words = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                if (words.Length < 3)
                {
                    Errors.Add(line, "Each line should contain two words and one score, separated by spaces.");
                    continue;
                }
                double score;
                if (!double.TryParse(words[2], NumberStyles.Float, CultureInfo.InvariantCulture, out score))
                {
                    Errors.Add(line, $"Could not parse score \"{words[2]}\". Scores should be a number between 0 and 1.");
                    continue;
                }
                if (double.IsNaN(score))
                {
                    // "NaN" parses successfully but fails neither range comparison below,
                    // so it has to be rejected explicitly.
                    Errors.Add(line, $"Invalid score \"{words[2]}\". Scores should be a number between 0 and 1.");
                    continue;
                }
                if (score < 0.0)
                {
                    Errors.Add(line, $"Invalid score \"{words[2]}\". Scores should not be negative, but should be a number between 0 and 1.");
                    continue;
                }
                if (score > 1.0)
                {
                    Errors.Add(line, $"Invalid score \"{words[2]}\". Scores should not be greater than 1, but should be a number between 0 and 1.");
                    continue;
                }

                double distance = 1.0 - score;
                KeepScoreForUpgmaClusterer(words[0], words[1], distance);                 // TODO: Since we always call this, maybe we should rename it
            }

            IFlatClusterer<string> clusterer;
            switch (lowerMethod)
            {
            case "dbscan":
                // public DbscanClusterer(Func<T, IEnumerable<T>> getNeighbors, double minPoints)
                KeepScoreForDbscanClusterer();
                // NOTE(review): TakeWhile only yields all neighbors within Epsilon if each
                // _distanceGraph entry is ordered by ascending distance - confirm.
                clusterer = new DbscanClusterer<string>(word => _distanceGraph[word]
                                                        .TakeWhile(scoreWordTuple => scoreWordTuple.Item1 <= Epsilon)
                                                        .Select(scoreWordTuple => scoreWordTuple.Item2), MinWords);
                break;

            case "lsdbc":
                // public LsdbcClusterer(double alpha, Func<T, IEnumerable<Tuple<T, double>>> getKNearestNeighbors)
                KeepScoreForLsdbcClusterer();
                // The graph stores (distance, word); the clusterer expects (word, distance).
                clusterer = new LsdbcClusterer<string>(Alpha, word => _distanceGraph[word].Take(K)
                                                       .Select(tuple => new Tuple<string, double>(tuple.Item2, tuple.Item1)));
                break;

            default:
                // Only "upgma" can reach this branch because the method was validated above.
                clusterer = new FlatUpgmaClusterer<string>((w1, w2) =>
                                                           _distanceDict[new UnorderedTuple<string, string>(w1, w2)], Threshhold);
                break;
            }

            IEnumerable<Cluster<string>> clusters = clusterer.GenerateClusters(_allWords);

            PrintResults(outputWriter, clusters);
            return ReturnCode.Okay;
        }
Esempio n. 6
0
        /// <summary>
        /// Rebuilds <c>_currentClusters</c> by clustering the domain varieties with a
        /// <c>FlatUpgmaClusterer</c> and then stores, on every variety view model, the index of the
        /// cluster that contains it. Varieties without regions get index -1.
        /// Returns immediately when not all varieties have been compared.
        /// </summary>
        private void ClusterVarieties()
        {
            bool ready = _projectService.AreAllVarietiesCompared;
            if (!ready)
            {
                return;
            }

            // Choose the distance function (1 - similarity) matching the active metric.
            Func<Variety, Variety, double> getDistance = null;
            switch (_similarityMetric)
            {
                case SimilarityMetric.Lexical:
                    getDistance = (left, right) => 1.0 - left.VarietyPairs[right].LexicalSimilarityScore;
                    break;
                case SimilarityMetric.Phonetic:
                    getDistance = (left, right) => 1.0 - left.VarietyPairs[right].PhoneticSimilarityScore;
                    break;
            }

            var clusterer = new FlatUpgmaClusterer<Variety>(getDistance, 1.0 - _similarityScoreThreshold);
            _currentClusters.Clear();

            // Clusters that contain no variety with regions are dropped; the rest are sorted by size, descending.
            _currentClusters.AddRange(
                from cluster in clusterer.GenerateClusters(_varieties.Select(item => item.DomainVariety))
                where cluster.DataObjects.Any(member => member.Regions.Count > 0)
                orderby cluster.DataObjects.Count descending
                select cluster);

            foreach (GeographicalVarietyViewModel variety in _varieties)
            {
                if (variety.Regions.Count <= 0)
                {
                    variety.ClusterIndex = -1;
                    continue;
                }

                variety.ClusterIndex = _currentClusters.FindIndex(cluster => cluster.DataObjects.Contains(variety.DomainVariety));
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Produces clusters of words for <paramref name="meaning"/>. Words appearing in at least one
        /// word pair whose <c>Cognacy</c> is true are clustered with a <c>FlatUpgmaClusterer</c> using
        /// 1 - <c>PredictedCognacyScore</c> as the distance; all remaining words are returned as one
        /// additional cluster appended to the result.
        /// </summary>
        /// <param name="project">Project providing the variety pairs to scan.</param>
        /// <param name="meaning">Meaning whose word pairs are examined.</param>
        /// <returns>The generated clusters followed by the cluster of remaining words.</returns>
        public static IEnumerable<Cluster<Word>> GenerateCognateSets(this CogProject project, Meaning meaning)
        {
            var clustered = new HashSet<Word>();
            var remaining = new HashSet<Word>();

            foreach (VarietyPair varietyPair in project.VarietyPairs)
            {
                WordPair current;
                bool hasPair = varietyPair.WordPairs.TryGetValue(meaning, out current);
                if (!hasPair)
                {
                    continue;
                }

                if (!current.Cognacy)
                {
                    // Words already known to be cognate with something never go back to the remaining set.
                    if (!clustered.Contains(current.Word1))
                    {
                        remaining.Add(current.Word1);
                    }
                    if (!clustered.Contains(current.Word2))
                    {
                        remaining.Add(current.Word2);
                    }
                }
                else
                {
                    clustered.Add(current.Word1);
                    clustered.Add(current.Word2);
                    remaining.Remove(current.Word1);
                    remaining.Remove(current.Word2);
                }
            }

            var pairDistances = new Dictionary<UnorderedTuple<Word, Word>, double>();
            double minDistance = double.MaxValue;
            Word[] items = clustered.ToArray();
            int count = items.Length;

            for (int outer = 0; outer < count; outer++)
            {
                for (int inner = outer + 1; inner < count; inner++)
                {
                    Word w1 = items[outer];
                    Word w2 = items[inner];

                    // A non-zero score is used only when the stored pair for the two varieties is a
                    // cognate pair made of exactly these two words.
                    WordPair stored;
                    double score =
                        w1.Variety != w2.Variety
                        && w1.Variety.VarietyPairs[w2.Variety].WordPairs.TryGetValue(meaning, out stored)
                        && stored.Cognacy
                        && stored.GetWord(w1.Variety) == w1
                        && stored.GetWord(w2.Variety) == w2
                            ? stored.PredictedCognacyScore
                            : 0;

                    double distance = 1.0 - score;
                    minDistance = Math.Min(minDistance, distance);
                    pairDistances[UnorderedTuple.Create(w1, w2)] = distance;
                }
            }

            // Threshold is halfway between the smallest observed distance and 1.0.
            double threshold = (1.0 + minDistance) / 2;
            Func<Word, Word, double> lookup = (a, b) => pairDistances[UnorderedTuple.Create(a, b)];
            var clusterer = new FlatUpgmaClusterer<Word>(lookup, threshold);

            // The trailing cluster is built with its second argument set to true
            // (presumably a noise marker - confirm against Cluster<T>).
            return clusterer.GenerateClusters(clustered).Concat(new Cluster<Word>(remaining, true));
        }