/// <summary>
/// Clusters three objects where A and B are within 0.2 of each other while C is farther than
/// 0.2 from both, and expects an {A, B} cluster plus a second cluster holding only C that is
/// constructed with the <c>true</c> flag (presumably the noise marker - confirm against Cluster).
/// </summary>
public void Cluster_NoiseNotEmpty()
{
    const double epsilon = 0.2;
    const int minPoints = 2;
    char[] points = {'A', 'B', 'C'};

    // Symmetric pairwise distances; index 0, 1, 2 corresponds to 'A', 'B', 'C'.
    double[,] distances =
    {
        {0.00, 0.09, 0.22},
        {0.09, 0.00, 0.30},
        {0.22, 0.30, 0.00}
    };

    // The neighbors of an object are all objects (itself included) no farther away than epsilon.
    var dbscan = new DbscanClusterer<char>(
        center => points.Where(candidate => distances[center - 'A', candidate - 'A'] <= epsilon),
        minPoints);
    Cluster<char>[] actual = dbscan.GenerateClusters(points).ToArray();

    var expected = new[]
    {
        new Cluster<char>('A', 'B'),
        new Cluster<char>(new[] {'C'}, true)
    };
    Assert.That(actual, Is.EquivalentTo(expected).Using(new ClusterEqualityComparer<char>()));
}
/// <summary>
/// Clusters three objects where A is within 0.2 of both B and C (B and C are 0.21 apart), and
/// expects a single {A, B, C} cluster plus an empty cluster constructed with the <c>true</c>
/// flag (presumably the noise marker - confirm against Cluster).
/// </summary>
public void Cluster_AllObjectsInSameCluster()
{
    const double epsilon = 0.2;
    const int minPoints = 2;
    char[] points = {'A', 'B', 'C'};

    // Symmetric pairwise distances; index 0, 1, 2 corresponds to 'A', 'B', 'C'.
    double[,] distances =
    {
        {0.00, 0.09, 0.12},
        {0.09, 0.00, 0.21},
        {0.12, 0.21, 0.00}
    };

    // The neighbors of an object are all objects (itself included) no farther away than epsilon.
    var dbscan = new DbscanClusterer<char>(
        center => points.Where(candidate => distances[center - 'A', candidate - 'A'] <= epsilon),
        minPoints);
    Cluster<char>[] actual = dbscan.GenerateClusters(points).ToArray();

    var expected = new[]
    {
        new Cluster<char>('A', 'B', 'C'),
        new Cluster<char>(Enumerable.Empty<char>(), true)
    };
    Assert.That(actual, Is.EquivalentTo(expected).Using(new ClusterEqualityComparer<char>()));
}
/// <summary>
/// DBSCAN over three objects with a neighborhood radius of 0.2: A and B are 0.09 apart, whereas
/// C is 0.22 from A and 0.30 from B. The expected result is the cluster {A, B} together with a
/// cluster containing only C that is built with the <c>true</c> flag (presumably marking noise;
/// verify against Cluster).
/// </summary>
public void Cluster_NoiseNotEmpty()
{
    char[] items = { 'A', 'B', 'C' };
    // Distance table indexed by (item - 'A') on both axes.
    double[,] distanceTable =
    {
        { 0.00, 0.09, 0.22 },
        { 0.09, 0.00, 0.30 },
        { 0.22, 0.30, 0.00 }
    };
    var comparer = new ClusterEqualityComparer<char>();
    var expectedClusters = new[]
    {
        new Cluster<char>('A', 'B'),
        new Cluster<char>(new[] { 'C' }, true)
    };

    var sut = new DbscanClusterer<char>(
        item => items.Where(other => distanceTable[item - 'A', other - 'A'] <= 0.2),
        2);
    Cluster<char>[] actualClusters = sut.GenerateClusters(items).ToArray();

    Assert.That(actualClusters, Is.EquivalentTo(expectedClusters).Using(comparer));
}
/// <summary>
/// DBSCAN over three objects with a neighborhood radius of 0.2: A is 0.09 from B and 0.12 from
/// C, while B and C are 0.21 apart. The expected result is the single cluster {A, B, C} together
/// with an empty cluster built with the <c>true</c> flag (presumably marking noise; verify
/// against Cluster).
/// </summary>
public void Cluster_AllObjectsInSameCluster()
{
    char[] items = { 'A', 'B', 'C' };
    // Distance table indexed by (item - 'A') on both axes.
    double[,] distanceTable =
    {
        { 0.00, 0.09, 0.12 },
        { 0.09, 0.00, 0.21 },
        { 0.12, 0.21, 0.00 }
    };
    var comparer = new ClusterEqualityComparer<char>();
    var expectedClusters = new[]
    {
        new Cluster<char>('A', 'B', 'C'),
        new Cluster<char>(Enumerable.Empty<char>(), true)
    };

    var sut = new DbscanClusterer<char>(
        item => items.Where(other => distanceTable[item - 'A', other - 'A'] <= 0.2),
        2);
    Cluster<char>[] actualClusters = sut.GenerateClusters(items).ToArray();

    Assert.That(actualClusters, Is.EquivalentTo(expectedClusters).Using(comparer));
}
/* Commented out for now, because the CommandLine library doesn't like finding two Usage attributes and we already have one on VerbBase
 * [Usage(ApplicationAlias = "cog-cmdline")]
 * public new static IEnumerable<Example> Examples
 * {
 *     get
 *     {
 *         yield return new Example("UPGMA clustering (specify a threshold value)", new ClusterVerb { Method = "upgma", Threshhold = 0.2 });
 *         yield return new Example("DBSCAN clustering (specify epsilon and min-words values)", new ClusterVerb { Method = "dbscan", Epsilon = 0.2, MinWords = 2 });
 *         yield return new Example("LSDBC clustering (specify alpha and K values)", new ClusterVerb { Method = "lsdbc", Alpha = 0.2, K = 3 });
 *     }
 * }
 */

/// <summary>
/// Reads word-pair similarity scores from <paramref name="inputReader"/>, clusters the words with
/// the algorithm selected by <c>Method</c> ("upgma", "dbscan" or "lsdbc", case-insensitive), and
/// passes the resulting clusters to <c>PrintResults</c>.
/// </summary>
/// <param name="inputReader">
/// Source of input lines of the form <c>word1 word2 score</c>, separated by spaces, where score is a
/// similarity between 0 and 1 (1.0 = 100% similarity) written with the invariant culture.
/// </param>
/// <param name="outputWriter">Writer handed to <c>PrintResults</c> together with the clusters.</param>
/// <param name="errorWriter">Not used directly by this method; problems are recorded in <c>Errors</c>.</param>
/// <returns>
/// <c>ReturnCode.InputError</c> when <c>Method</c> is not a recognised clustering method; otherwise
/// <c>ReturnCode.Okay</c>, even when individual input lines were rejected and recorded in <c>Errors</c>.
/// </returns>
protected override ReturnCode DoWork(TextReader inputReader, TextWriter outputWriter, TextWriter errorWriter)
{
    SetupProject();

    // Validate the method name up front so that no input is consumed for an unusable request.
    string lowerMethod = Method.ToLowerInvariant();
    switch (lowerMethod)
    {
        case "dbscan":
        case "lsdbc":
        case "upgma":
            break;
        default:
            Errors.Add($"Invalid clustering method {Method}. Valid values are \"upgma\", \"dbscan\", and \"lsdbc\" (not case-sensitive, e.g. \"Upgma\" also works.)");
            return ReturnCode.InputError;
    }

    char[] separators = { ' ' };
    foreach (string line in ReadLines(inputReader))
    {
        // Format: word1 word2 score (where score is a floating-point number with 1.0 = 100% similarity)
        // Empty entries are dropped so that repeated, leading or trailing spaces cannot yield an
        // empty-string "word" or shift the score out of the third position unnoticed.
        string[] words = line.Split(separators, StringSplitOptions.RemoveEmptyEntries);
        if (words.Length < 3)
        {
            Errors.Add(line, "Each line should contain two words and one score, separated by spaces.");
            continue;
        }

        // "NaN" parses successfully under NumberStyles.Float, but every comparison with NaN is
        // false, so it would slip past both range checks below; reject it here instead.
        double score;
        if (!double.TryParse(words[2], NumberStyles.Float, CultureInfo.InvariantCulture, out score) || double.IsNaN(score))
        {
            Errors.Add(line, $"Could not parse score \"{words[2]}\". Scores should be a number between 0 and 1.");
            continue;
        }
        if (score < 0.0)
        {
            Errors.Add(line, $"Invalid score \"{words[2]}\". Scores should not be negative, but should be a number between 0 and 1.");
            continue;
        }
        if (score > 1.0)
        {
            Errors.Add(line, $"Invalid score \"{words[2]}\". Scores should not be greater than 1, but should be a number between 0 and 1.");
            continue;
        }

        // The clusterers work on distances, so convert the similarity score.
        double distance = 1.0 - score;
        KeepScoreForUpgmaClusterer(words[0], words[1], distance); // TODO: Since we always call this, maybe we should rename it
    }

    IFlatClusterer<string> clusterer;
    switch (lowerMethod)
    {
        case "dbscan":
            // public DbscanClusterer(Func<T, IEnumerable<T>> getNeighbors, double minPoints)
            KeepScoreForDbscanClusterer();
            // NOTE(review): TakeWhile only yields every neighbor within Epsilon if each word's list
            // is ordered by ascending distance - presumably KeepScoreForDbscanClusterer ensures that.
            clusterer = new DbscanClusterer<string>(
                word => _distanceGraph[word]
                    .TakeWhile(scoreWordTuple => scoreWordTuple.Item1 <= Epsilon)
                    .Select(scoreWordTuple => scoreWordTuple.Item2),
                MinWords);
            break;
        case "lsdbc":
            // public LsdbcClusterer(double alpha, Func<T, IEnumerable<Tuple<T, double>>> getKNearestNeighbors)
            KeepScoreForLsdbcClusterer();
            // The graph stores (distance, word) tuples; the clusterer expects (word, distance).
            clusterer = new LsdbcClusterer<string>(
                Alpha,
                word => _distanceGraph[word]
                    .Take(K)
                    .Select(tuple => new Tuple<string, double>(tuple.Item2, tuple.Item1)));
            break;
        default:
            // Only "upgma" can reach this branch because the method name was validated above.
            clusterer = new FlatUpgmaClusterer<string>(
                (w1, w2) => _distanceDict[new UnorderedTuple<string, string>(w1, w2)],
                Threshhold);
            break;
    }

    IEnumerable<Cluster<string>> clusters = clusterer.GenerateClusters(_allWords);
    PrintResults(outputWriter, clusters);
    return ReturnCode.Okay;
}