/// <summary>
/// Groups the words recorded for <paramref name="meaning"/> into cognate sets.
/// Words that take part in at least one predicted-cognate word pair are clustered with a
/// <c>FlatUpgmaClusterer</c>; every other word seen is returned together in one final cluster.
/// </summary>
/// <param name="project">Project whose variety pairs are scanned.</param>
/// <param name="meaning">Meaning used to look up the word pair in each variety pair.</param>
/// <returns>The generated clusters followed by the cluster holding the leftover words.</returns>
public static IEnumerable<Cluster<Word>> GenerateCognateSets(this CogProject project, Meaning meaning)
{
    var cognateWords = new HashSet<Word>();
    var leftoverWords = new HashSet<Word>();

    foreach (VarietyPair varietyPair in project.VarietyPairs)
    {
        WordPair pair;
        if (!varietyPair.WordPairs.TryGetValue(meaning, out pair))
        {
            continue;
        }

        Word[] members = { pair.Word1, pair.Word2 };
        if (pair.AreCognatePredicted)
        {
            // A word that is cognate with anything no longer counts as a leftover.
            foreach (Word member in members)
            {
                cognateWords.Add(member);
                leftoverWords.Remove(member);
            }
        }
        else
        {
            foreach (Word member in members)
            {
                if (!cognateWords.Contains(member))
                {
                    leftoverWords.Add(member);
                }
            }
        }
    }

    // Pairwise distances (1 - cognicity score) between all cognate words, plus the observed range.
    var distances = new Dictionary<UnorderedTuple<Word, Word>, double>();
    double smallest = double.MaxValue, largest = double.MinValue;
    Word[] ordered = cognateWords.ToArray();
    for (int i = 0; i < ordered.Length; i++)
    {
        Word first = ordered[i];
        for (int j = i + 1; j < ordered.Length; j++)
        {
            Word second = ordered[j];

            // The score stays 0 (distance 1) unless the two words are exactly the members of a
            // predicted-cognate word pair between two different varieties.
            double similarity = 0;
            WordPair candidate;
            if (first.Variety != second.Variety
                && first.Variety.VarietyPairs[second.Variety].WordPairs.TryGetValue(meaning, out candidate)
                && candidate.AreCognatePredicted
                && candidate.GetWord(first.Variety) == first
                && candidate.GetWord(second.Variety) == second)
            {
                similarity = candidate.CognicityScore;
            }

            double distance = 1.0 - similarity;
            smallest = Math.Min(smallest, distance);
            largest = Math.Max(largest, distance);
            distances[UnorderedTuple.Create(first, second)] = distance;
        }
    }

    // The clustering threshold is the midpoint of the observed distance range.
    double threshold = (largest + smallest) / 2;
    var upgma = new FlatUpgmaClusterer<Word>((a, b) => distances[UnorderedTuple.Create(a, b)], threshold);

    // NOTE(review): the `true` argument presumably flags the leftover cluster as noise - confirm against Cluster<T>.
    var leftoverCluster = new Cluster<Word>(leftoverWords, true);
    return upgma.GenerateClusters(cognateWords).Concat(leftoverCluster);
}
/// <summary>
/// Checks that flat UPGMA clustering of the items 'A'..'E' with a threshold of 0.5
/// yields the two clusters {B, C} and {A, E, D}.
/// </summary>
public void Cluster()
{
    // Pairwise distances; row and column are the item's offset from 'A'.
    double[,] distances =
    {
        {0.00, 0.50, 0.67, 0.80, 0.20},
        {0.50, 0.00, 0.40, 0.70, 0.60},
        {0.67, 0.40, 0.00, 0.80, 0.80},
        {0.80, 0.70, 0.80, 0.00, 0.30},
        {0.20, 0.60, 0.80, 0.30, 0.00}
    };
    char[] items = {'A', 'B', 'C', 'D', 'E'};

    var clusterer = new FlatUpgmaClusterer<char>((x, y) => distances[x - 'A', y - 'A'], 0.5);
    Cluster<char>[] actual = clusterer.GenerateClusters(items).ToArray();

    Cluster<char>[] expected =
    {
        new Cluster<char>('B', 'C'),
        new Cluster<char>('A', 'E', 'D')
    };
    // Cluster order is irrelevant; membership is compared with ClusterEqualityComparer.
    Assert.That(actual, Is.EquivalentTo(expected).Using(new ClusterEqualityComparer<char>()));
}
/// <summary>
/// Runs <c>FlatUpgmaClusterer</c> over five labelled items using a fixed distance table and a
/// threshold of 0.5, and expects the clusters {B, C} and {A, E, D} in any order.
/// </summary>
public void Cluster()
{
    // Distance table indexed by (item - 'A') in both dimensions.
    var table = new[,]
    {
        { 0.00, 0.50, 0.67, 0.80, 0.20 },
        { 0.50, 0.00, 0.40, 0.70, 0.60 },
        { 0.67, 0.40, 0.00, 0.80, 0.80 },
        { 0.80, 0.70, 0.80, 0.00, 0.30 },
        { 0.20, 0.60, 0.80, 0.30, 0.00 }
    };

    var upgma = new FlatUpgmaClusterer<char>((left, right) => table[left - 'A', right - 'A'], 0.5);

    var labels = new[] { 'A', 'B', 'C', 'D', 'E' };
    Cluster<char>[] produced = upgma.GenerateClusters(labels).ToArray();

    var wanted = new[]
    {
        new Cluster<char>('B', 'C'),
        new Cluster<char>('A', 'E', 'D')
    };
    var comparer = new ClusterEqualityComparer<char>();
    Assert.That(produced, Is.EquivalentTo(wanted).Using(comparer));
}
/// <summary>
/// Rebuilds <c>_currentClusters</c> from the selected similarity metric and threshold, then
/// gives every variety view model the index of the cluster that contains it.
/// Does nothing until the project service reports that all varieties have been compared.
/// </summary>
private void ClusterVarieties()
{
    if (!_projectService.AreAllVarietiesCompared)
    {
        return;
    }

    // Distance is 1 - similarity. The delegate stays null for any metric not handled here.
    Func<Variety, Variety, double> distanceOf = null;
    SimilarityMetric metric = _similarityMetric;
    if (metric == SimilarityMetric.Lexical)
    {
        distanceOf = (a, b) => 1.0 - a.VarietyPairs[b].LexicalSimilarityScore;
    }
    else if (metric == SimilarityMetric.Phonetic)
    {
        distanceOf = (a, b) => 1.0 - a.VarietyPairs[b].PhoneticSimilarityScore;
    }

    var upgma = new FlatUpgmaClusterer<Variety>(distanceOf, 1.0 - _similarityScoreThreshold);

    _currentClusters.Clear();

    // Keep only clusters with at least one variety that has regions, largest cluster first.
    var domainVarieties = _varieties.Select(vm => vm.DomainVariety);
    var clustersWithRegions = upgma.GenerateClusters(domainVarieties)
        .Where(cluster => cluster.DataObjects.Any(member => member.Regions.Count > 0));
    _currentClusters.AddRange(clustersWithRegions.OrderByDescending(cluster => cluster.DataObjects.Count));

    // Varieties without regions get index -1.
    foreach (GeographicalVarietyViewModel variety in _varieties)
    {
        variety.ClusterIndex = variety.Regions.Count > 0
            ? _currentClusters.FindIndex(cluster => cluster.DataObjects.Contains(variety.DomainVariety))
            : -1;
    }
}
/* Commented out for now, because the CommandLine library doesn't like finding two Usage attributes and we already have one on VerbBase
 * [Usage(ApplicationAlias = "cog-cmdline")]
 * public new static IEnumerable<Example> Examples
 * {
 *     get
 *     {
 *         yield return new Example("UPGMA clustering (specify a threshhold value)", new ClusterVerb { Method = "upgma", Threshhold = 0.2 });
 *         yield return new Example("DBSCAN clustering (specify epsilon and min-words values)", new ClusterVerb { Method = "dbscan", Epsilon = 0.2, MinWords = 2 });
 *         yield return new Example("LSDBC clustering (specify alpha and K values)", new ClusterVerb { Method = "lsdbc", Alpha = 0.2, K = 3 });
 *     }
 * }
 */

/// <summary>
/// Reads "word1 word2 score" lines, converts each score into a distance (1 - score), and prints
/// the clusters produced by the clusterer selected through <c>Method</c>.
/// </summary>
/// <param name="inputReader">Source of the input lines (consumed through <c>ReadLines</c>).</param>
/// <param name="outputWriter">Writer handed to <c>PrintResults</c> for the resulting clusters.</param>
/// <param name="errorWriter">Not used directly by this method.</param>
/// <returns>
/// <c>ReturnCode.InputError</c> when <c>Method</c> is not one of "upgma", "dbscan" or "lsdbc"
/// (case-insensitive); otherwise <c>ReturnCode.Okay</c>. Rejected input lines are recorded in
/// <c>Errors</c> and skipped without changing the return code.
/// </returns>
protected override ReturnCode DoWork(TextReader inputReader, TextWriter outputWriter, TextWriter errorWriter)
{
    ReturnCode retcode = ReturnCode.Okay;
    SetupProject();

    // Validate the method name up front so no input is consumed for an unknown method.
    string lowerMethod = Method.ToLowerInvariant();
    switch (lowerMethod)
    {
        case "dbscan":
        case "lsdbc":
        case "upgma":
            break;

        default:
            Errors.Add($"Invalid clustering method {Method}. Valid values are \"upgma\", \"dbscan\", and \"lsdbc\" (not case-sensitive, e.g. \"Upgma\" also works.)");
            return ReturnCode.InputError;
    }

    foreach (string line in ReadLines(inputReader))
    {
        // Format: word1 word2 score (where score is a floating-point number with 1.0 = 100% similarity)
        string[] words = line.Split(' ');
        if (words.Length < 3)
        {
            Errors.Add(line, "Each line should contain two words and one score, separated by spaces.");
            continue;
        }

        double score;
        if (!double.TryParse(words[2], NumberStyles.Float, CultureInfo.InvariantCulture, out score))
        {
            Errors.Add(line, $"Could not parse score \"{words[2]}\". Scores should be a number between 0 and 1.");
            continue;
        }
        if (score < 0.0)
        {
            Errors.Add(line, $"Invalid score \"{words[2]}\". Scores should not be negative, but should be a number between 0 and 1.");
            continue;
        }
        if (score > 1.0)
        {
            // This literal must stay on a single line: a regular string cannot contain a raw newline.
            Errors.Add(line, $"Invalid score \"{words[2]}\". Scores should not be greater than 1, but should be a number between 0 and 1.");
            continue;
        }

        double distance = 1.0 - score;
        KeepScoreForUpgmaClusterer(words[0], words[1], distance); // TODO: Since we always call this, maybe we should rename it
    }

    // Reuse the already-lowered method name instead of lowering it a second time.
    IFlatClusterer<string> clusterer;
    switch (lowerMethod)
    {
        case "dbscan":
            // public DbscanClusterer(Func<T, IEnumerable<T>> getNeighbors, double minPoints)
            KeepScoreForDbscanClusterer();
            // NOTE(review): TakeWhile presumably relies on _distanceGraph[word] being ordered by
            // ascending distance (Item1) - confirm in KeepScoreForDbscanClusterer.
            clusterer = new DbscanClusterer<string>(
                word => _distanceGraph[word]
                    .TakeWhile(scoreWordTuple => scoreWordTuple.Item1 <= Epsilon)
                    .Select(scoreWordTuple => scoreWordTuple.Item2),
                MinWords);
            break;

        case "lsdbc":
            // public LsdbcClusterer(double alpha, Func<T, IEnumerable<Tuple<T, double>>> getKNearestNeighbors)
            KeepScoreForLsdbcClusterer();
            // The graph stores (distance, word); the clusterer expects (word, distance).
            clusterer = new LsdbcClusterer<string>(
                Alpha,
                word => _distanceGraph[word]
                    .Take(K)
                    .Select(tuple => new Tuple<string, double>(tuple.Item2, tuple.Item1)));
            break;

        default:
            // Only "upgma" can reach this branch because the method name was validated above.
            clusterer = new FlatUpgmaClusterer<string>(
                (w1, w2) => _distanceDict[new UnorderedTuple<string, string>(w1, w2)],
                Threshhold);
            break;
    }

    IEnumerable<Cluster<string>> clusters = clusterer.GenerateClusters(_allWords);
    PrintResults(outputWriter, clusters);
    return retcode;
}
/// <summary>
/// Recomputes the variety clusters for the current similarity metric and threshold, stores them
/// in <c>_currentClusters</c>, and updates <c>ClusterIndex</c> on every variety view model.
/// Returns immediately if not all varieties have been compared yet.
/// </summary>
private void ClusterVarieties()
{
    if (!_projectService.AreAllVarietiesCompared)
    {
        return;
    }

    // Choose how similarity is read from a variety pair; distance is its complement.
    // No default case: the delegate remains null for a metric that is not listed.
    Func<Variety, Variety, double> distance = null;
    switch (_similarityMetric)
    {
        case SimilarityMetric.Lexical:
            distance = (left, right) => 1.0 - left.VarietyPairs[right].LexicalSimilarityScore;
            break;

        case SimilarityMetric.Phonetic:
            distance = (left, right) => 1.0 - left.VarietyPairs[right].PhoneticSimilarityScore;
            break;
    }

    double distanceThreshold = 1.0 - _similarityScoreThreshold;
    var clusterer = new FlatUpgmaClusterer<Variety>(distance, distanceThreshold);

    _currentClusters.Clear();

    // Only clusters containing a variety with at least one region are kept, biggest first.
    var rankedClusters =
        from cluster in clusterer.GenerateClusters(_varieties.Select(vm => vm.DomainVariety))
        where cluster.DataObjects.Any(member => member.Regions.Count > 0)
        orderby cluster.DataObjects.Count descending
        select cluster;
    _currentClusters.AddRange(rankedClusters);

    foreach (GeographicalVarietyViewModel variety in _varieties)
    {
        // A variety without regions belongs to no displayed cluster.
        if (variety.Regions.Count <= 0)
        {
            variety.ClusterIndex = -1;
            continue;
        }

        variety.ClusterIndex = _currentClusters.FindIndex(
            cluster => cluster.DataObjects.Contains(variety.DomainVariety));
    }
}
/// <summary>
/// Builds the cognate sets for <paramref name="meaning"/>. Words belonging to at least one word
/// pair whose <c>Cognacy</c> is true are clustered with a <c>FlatUpgmaClusterer</c>; all other
/// words encountered are appended as one extra cluster.
/// </summary>
/// <param name="project">Project whose variety pairs supply the word pairs.</param>
/// <param name="meaning">Meaning whose word pairs are examined.</param>
/// <returns>The generated clusters followed by the cluster of remaining words.</returns>
public static IEnumerable<Cluster<Word>> GenerateCognateSets(this CogProject project, Meaning meaning)
{
    var clustered = new HashSet<Word>();
    var unclustered = new HashSet<Word>();

    foreach (VarietyPair varietyPair in project.VarietyPairs)
    {
        WordPair wordPair;
        if (!varietyPair.WordPairs.TryGetValue(meaning, out wordPair))
        {
            continue;
        }

        if (!wordPair.Cognacy)
        {
            // Not cognate here: remember each word unless it is already known to be cognate elsewhere.
            if (!clustered.Contains(wordPair.Word1))
            {
                unclustered.Add(wordPair.Word1);
            }
            if (!clustered.Contains(wordPair.Word2))
            {
                unclustered.Add(wordPair.Word2);
            }
            continue;
        }

        clustered.Add(wordPair.Word1);
        clustered.Add(wordPair.Word2);
        unclustered.Remove(wordPair.Word1);
        unclustered.Remove(wordPair.Word2);
    }

    // Distance between two words is 1 - predicted cognacy score; the minimum distance is tracked.
    double minDistance = double.MaxValue;
    var distanceByPair = new Dictionary<UnorderedTuple<Word, Word>, double>();
    Word[] candidates = clustered.ToArray();
    for (int outer = 0; outer < candidates.Length; outer++)
    {
        Word a = candidates[outer];
        for (int inner = outer + 1; inner < candidates.Length; inner++)
        {
            Word b = candidates[inner];

            // The score is only taken when the pair for this meaning is cognate and consists of
            // exactly these two words from two different varieties; otherwise it stays 0.
            double score = 0;
            WordPair shared;
            bool usable = a.Variety != b.Variety
                && a.Variety.VarietyPairs[b.Variety].WordPairs.TryGetValue(meaning, out shared)
                && shared.Cognacy
                && shared.GetWord(a.Variety) == a
                && shared.GetWord(b.Variety) == b
                && AssignScore(shared, out score);

            double distance = 1.0 - score;
            minDistance = Math.Min(minDistance, distance);
            distanceByPair[UnorderedTuple.Create(a, b)] = distance;
        }
    }

    // The threshold is halfway between the smallest observed distance and 1.0.
    double threshold = (1.0 + minDistance) / 2;
    var upgma = new FlatUpgmaClusterer<Word>(
        (x, y) => distanceByPair[UnorderedTuple.Create(x, y)],
        threshold);

    // NOTE(review): the `true` argument presumably marks this cluster as noise - confirm against Cluster<T>.
    return upgma.GenerateClusters(clustered).Concat(new Cluster<Word>(unclustered, true));
}

/// <summary>
/// Copies the predicted cognacy score of <paramref name="pair"/> into <paramref name="score"/>;
/// always returns true so it can terminate a short-circuit condition chain.
/// </summary>
private static bool AssignScore(WordPair pair, out double score)
{
    score = pair.PredictedCognacyScore;
    return true;
}