Esempio n. 1
0
 public void Cluster_NoiseNotEmpty()
 {
     // Symmetric pairwise distances; row/column i corresponds to the character 'A' + i.
     double[,] distances =
         {
             {0.00, 0.09, 0.22},
             {0.09, 0.00, 0.30},
             {0.22, 0.30, 0.00}
         };
     char[] items = {'A', 'B', 'C'};

     // Neighbors of an item are all items (itself included, since the diagonal is 0) at distance 0.2 or less.
     // The second constructor argument is 2.
     var dbscan = new DbscanClusterer<char>(
         item => items.Where(other => distances[item - 'A', other - 'A'] <= 0.2),
         2);

     Cluster<char>[] actual = dbscan.GenerateClusters(items).ToArray();

     // 'C' is farther than 0.2 from both 'A' and 'B', so it is expected in the cluster built with the
     // `true` flag (presumably the noise cluster, going by the test name).
     var expected = new[]
         {
             new Cluster<char>('A', 'B'),
             new Cluster<char>(new[] {'C'}, true)
         };
     Assert.That(actual, Is.EquivalentTo(expected).Using(new ClusterEqualityComparer<char>()));
 }
Esempio n. 2
0
 public void Cluster_AllObjectsInSameCluster()
 {
     // Symmetric pairwise distances; row/column i corresponds to the character 'A' + i.
     double[,] distances =
         {
             {0.00, 0.09, 0.12},
             {0.09, 0.00, 0.21},
             {0.12, 0.21, 0.00}
         };
     char[] items = {'A', 'B', 'C'};

     // Neighbors of an item are all items (itself included, since the diagonal is 0) at distance 0.2 or less.
     // The second constructor argument is 2.
     var dbscan = new DbscanClusterer<char>(
         item => items.Where(other => distances[item - 'A', other - 'A'] <= 0.2),
         2);

     Cluster<char>[] actual = dbscan.GenerateClusters(items).ToArray();

     // 'B' and 'C' are 0.21 apart, but both are within 0.2 of 'A'; the expectation is a single
     // three-member cluster plus an empty cluster built with the `true` flag
     // (presumably the noise cluster, going by the sibling test's name).
     var expected = new[]
         {
             new Cluster<char>('A', 'B', 'C'),
             new Cluster<char>(Enumerable.Empty<char>(), true)
         };
     Assert.That(actual, Is.EquivalentTo(expected).Using(new ClusterEqualityComparer<char>()));
 }
        public void Cluster_NoiseNotEmpty()
        {
            // Symmetric pairwise distances; row/column i corresponds to the character 'A' + i.
            double[,] distances =
            {
                { 0.00, 0.09, 0.22 },
                { 0.09, 0.00, 0.30 },
                { 0.22, 0.30, 0.00 }
            };
            char[] items = { 'A', 'B', 'C' };

            // Neighbors of an item are all items (itself included, since the diagonal is 0) at distance 0.2 or less.
            // The second constructor argument is 2.
            var dbscan = new DbscanClusterer<char>(
                item => items.Where(other => distances[item - 'A', other - 'A'] <= 0.2),
                2);

            Cluster<char>[] actual = dbscan.GenerateClusters(items).ToArray();

            // 'C' is farther than 0.2 from both 'A' and 'B', so it is expected in the cluster built with the
            // `true` flag (presumably the noise cluster, going by the test name).
            var expected = new[]
            {
                new Cluster<char>('A', 'B'),
                new Cluster<char>(new[] { 'C' }, true)
            };
            Assert.That(actual, Is.EquivalentTo(expected).Using(new ClusterEqualityComparer<char>()));
        }
        public void Cluster_AllObjectsInSameCluster()
        {
            // Symmetric pairwise distances; row/column i corresponds to the character 'A' + i.
            double[,] distances =
            {
                { 0.00, 0.09, 0.12 },
                { 0.09, 0.00, 0.21 },
                { 0.12, 0.21, 0.00 }
            };
            char[] items = { 'A', 'B', 'C' };

            // Neighbors of an item are all items (itself included, since the diagonal is 0) at distance 0.2 or less.
            // The second constructor argument is 2.
            var dbscan = new DbscanClusterer<char>(
                item => items.Where(other => distances[item - 'A', other - 'A'] <= 0.2),
                2);

            Cluster<char>[] actual = dbscan.GenerateClusters(items).ToArray();

            // 'B' and 'C' are 0.21 apart, but both are within 0.2 of 'A'; the expectation is a single
            // three-member cluster plus an empty cluster built with the `true` flag
            // (presumably the noise cluster, going by the sibling test's name).
            var expected = new[]
            {
                new Cluster<char>('A', 'B', 'C'),
                new Cluster<char>(Enumerable.Empty<char>(), true)
            };
            Assert.That(actual, Is.EquivalentTo(expected).Using(new ClusterEqualityComparer<char>()));
        }
Esempio n. 5
0
/* Commented out for now, because the CommandLine library doesn't like finding two Usage attributes and we already have one on VerbBase
 *              [Usage(ApplicationAlias = "cog-cmdline")]
 *              public new static IEnumerable<Example> Examples
 *              {
 *                      get
 *                      {
 *                              yield return new Example("UPGMA clustering (specify a threshold value)", new ClusterVerb { Method = "upgma", Threshhold = 0.2 });
 *                              yield return new Example("DBSCAN clustering (specify epsilon and min-words values)", new ClusterVerb { Method = "dbscan", Epsilon = 0.2, MinWords = 2 });
 *                              yield return new Example("LSDBC clustering (specify alpha and K values)", new ClusterVerb { Method = "lsdbc", Alpha = 0.2, K = 3 });
 *                      }
 *              }
 */

        /// <summary>
        /// Reads word-pair similarity scores from <paramref name="inputReader"/>, clusters the words with the
        /// algorithm selected by <c>Method</c> ("upgma", "dbscan" or "lsdbc", compared case-insensitively) and
        /// prints the resulting clusters to <paramref name="outputWriter"/>.
        /// </summary>
        /// <param name="inputReader">Source of input lines in the form <c>word1 word2 score</c>, separated by single spaces.</param>
        /// <param name="outputWriter">Writer the clusters are printed to.</param>
        /// <param name="errorWriter">Not used directly by this method; problems are recorded in <c>Errors</c>.</param>
        /// <returns>
        /// <c>ReturnCode.InputError</c> when <c>Method</c> is not a recognized clustering method; otherwise
        /// <c>ReturnCode.Okay</c>, even when individual input lines were rejected and recorded in <c>Errors</c>.
        /// </returns>
        protected override ReturnCode DoWork(TextReader inputReader, TextWriter outputWriter, TextWriter errorWriter)
        {
            SetupProject();

            // Normalize once; both the validation below and the clusterer selection further down use this value.
            string lowerMethod = Method.ToLowerInvariant();

            switch (lowerMethod)
            {
                case "dbscan":
                case "lsdbc":
                case "upgma":
                    break;

                default:
                    Errors.Add($"Invalid clustering method {Method}. Valid values are \"upgma\", \"dbscan\", and \"lsdbc\" (not case-sensitive, e.g. \"Upgma\" also works.)");
                    return ReturnCode.InputError;
            }

            foreach (string line in ReadLines(inputReader))
            {
                // Format: word1 word2 score (where score is a floating-point number with 1.0 = 100% similarity)
                string[] words = line.Split(' ');
                if (words.Length < 3)
                {
                    Errors.Add(line, "Each line should contain two words and one score, separated by spaces.");
                    continue;
                }

                double score;
                // double.TryParse accepts the literal "NaN", and NaN compares false against both range bounds
                // below, so it has to be rejected explicitly or it would flow into the distance data.
                if (!double.TryParse(words[2], NumberStyles.Float, CultureInfo.InvariantCulture, out score) || double.IsNaN(score))
                {
                    Errors.Add(line, $"Could not parse score \"{words[2]}\". Scores should be a number between 0 and 1.");
                    continue;
                }
                if (score < 0.0)
                {
                    Errors.Add(line, $"Invalid score \"{words[2]}\". Scores should not be negative, but should be a number between 0 and 1.");
                    continue;
                }
                if (score > 1.0)
                {
                    Errors.Add(line, $"Invalid score \"{words[2]}\". Scores should not be greater than 1, but should be a number between 0 and 1.");
                    continue;
                }

                // The clusterers work on distances, so convert similarity (1.0 = identical) to distance (0.0 = identical).
                double distance = 1.0 - score;
                KeepScoreForUpgmaClusterer(words[0], words[1], distance);                 // TODO: Since we always call this, maybe we should rename it
            }

            IFlatClusterer<string> clusterer;

            switch (lowerMethod)
            {
                case "dbscan":
                    // public DbscanClusterer(Func<T, IEnumerable<T>> getNeighbors, double minPoints)
                    KeepScoreForDbscanClusterer();
                    // TakeWhile stops at the first entry whose distance exceeds Epsilon.
                    // NOTE(review): this assumes _distanceGraph[word] is ordered by ascending distance,
                    // presumably arranged by KeepScoreForDbscanClusterer - confirm.
                    clusterer = new DbscanClusterer<string>(
                        word => _distanceGraph[word]
                            .TakeWhile(scoreWordTuple => scoreWordTuple.Item1 <= Epsilon)
                            .Select(scoreWordTuple => scoreWordTuple.Item2),
                        MinWords);
                    break;

                case "lsdbc":
                    // public LsdbcClusterer(double alpha, Func<T, IEnumerable<Tuple<T, double>>> getKNearestNeighbors)
                    KeepScoreForLsdbcClusterer();
                    // The graph stores (distance, word) tuples; the clusterer expects (word, distance), so swap the items.
                    clusterer = new LsdbcClusterer<string>(
                        Alpha,
                        word => _distanceGraph[word]
                            .Take(K)
                            .Select(tuple => new Tuple<string, double>(tuple.Item2, tuple.Item1)));
                    break;

                default:
                    // Only "upgma" can reach this branch: every other value was rejected by the validation switch above.
                    clusterer = new FlatUpgmaClusterer<string>(
                        (w1, w2) => _distanceDict[new UnorderedTuple<string, string>(w1, w2)],
                        Threshhold);
                    break;
            }

            IEnumerable<Cluster<string>> clusters = clusterer.GenerateClusters(_allWords);

            PrintResults(outputWriter, clusters);
            return ReturnCode.Okay;
        }