Esempio n. 1
0
        /// <summary>
        /// Runs HDBSCAN over the rows loaded from "iris.csv" and prints the resulting labels,
        /// one output line per consecutive group of 50 rows (three groups in total).
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // The code below indexes labels 0..149, i.e. it presumes the file holds
            // three consecutive groups of 50 rows each.
            const int groupCount   = 3;
            const int rowsPerGroup = 50;

            // NOTE(review): the meaning of the second argument (5) is defined by LoadCsv,
            // which is not visible here.
            var rows = LoadCsv("iris.csv", 5);

            var parameters = new HdbscanParameters
            {
                DataSet          = rows,
                MinPoints        = 25,
                MinClusterSize   = 25,
                DistanceFunction = new CosineSimilarity()
            };
            var clustering = HdbscanRunner.Run(parameters);

            for (var groupIndex = 0; groupIndex < groupCount; groupIndex++)
            {
                var firstRow = groupIndex * rowsPerGroup;
                var endRow   = firstRow + rowsPerGroup;

                // Groups are numbered from 1 in the output.
                Console.Write("Specie #" + (groupIndex + 1) + " ");

                // Labels are written back to back, without any separator.
                for (var row = firstRow; row < endRow; row++)
                {
                    Console.Write(clustering.Labels[row]);
                }

                Console.WriteLine();
            }

            Console.WriteLine();

            // Keep the console window open until the user presses Enter.
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
Esempio n. 2
0
        /// <summary>
        /// Runs HDBSCAN with <c>PearsonCorrelation</c> over six hand-written vectors and checks
        /// that no reported outlier score is below 0 or above 1.
        /// </summary>
        public void TestValidateOutlierScoreBetweenZeroAndOne()
        {
            // NOTE(review): the two "Cluster 1" vectors have 15 elements while the other four
            // have 16 - confirm that the mismatch is intentional.
            var dataset = new[]
            {
                // Cluster 1
                new double[] { 21.33, 21.33, 21.33, 21.33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
                new double[] { 19.99, 19.99, 19.99, 19.990000000000002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },

                // Cluster 2
                new double[] { 1, 2, 3, 4, 5, 6, 6, 7, 7, 9, 3, 4, 3, 2, 2, 1 },
                new double[] { 1, 3, 3, 5, 5, 6, 6, 8, 8, 9, 3, 4, 3, 2, 1, 2 },

                // Outliers
                new double[] { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 },
                new double[] { 1, 2, 3, 0, 5, 6, 0, 0, 7, 9, 0, 0, 3, 0, 2, 0 }
            };

            var result = HdbscanRunner.Run(new HdbscanParameters <double[]>
            {
                DataSet          = dataset,
                MinPoints        = 2,
                MinClusterSize   = 2,
                DistanceFunction = new PearsonCorrelation()
            });

            // NOTE(review): if Score is a floating-point value, a NaN score satisfies neither
            // comparison and is therefore not counted as invalid here.
            var numInvalidScore = result.OutliersScore.Count(m => m.Score < 0 || m.Score > 1);

            // Expected value goes first so that a failure message reports the values correctly.
            Assert.AreEqual(0, numInvalidScore, "Every outlier score must lie within [0, 1].");
        }
Esempio n. 3
0
        /// <summary>
        /// Document-clustering sample. The steps are:
        /// 1. enumerate the sample text files and extract a dictionary of expressions from them;
        /// 2. convert every file into a numeric vector over that dictionary;
        /// 3. drop vectors whose element sum is below <c>minVectorElements</c>;
        /// 4. optionally reduce the vectors with PCA;
        /// 5. cluster the vectors with HDBSCAN;
        /// 6. print, for every non-empty cluster, how many files of each category it contains.
        /// The category of a file is the part of its name before the first '-'.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Specify which files to use.
            // The project directory is taken to be three levels above the current directory.
            var projectDir = Directory.GetParent(Directory.GetCurrentDirectory()).Parent.Parent.FullName;

            // Path.Combine inserts the platform's directory separator instead of a hard-coded '\'.
            var exampleDir = Path.Combine(projectDir, "DocumentClusteringExample");
            var pathFiles  = Directory.EnumerateFiles(Path.Combine(exampleDir, "Samples")).ToList();

            // Hyper parameters.

            // Per the original author's note, this option prevents overfitting on missing words.
            var replaceMissingValueWithRandomValue = false;

            var usePCA            = false;
            var numberOfOutputPCA = 100;
            var distanceFunction  = new PearsonCorrelation();

            var strategy              = ValueStrategy.Freq;
            var minVectorElements     = 2;
            var freqMin               = 2;
            var minWordCount          = 1;
            var maxWordCount          = 3;
            var minGroupOfWordsLength = 3;
            var minWordLength         = 1;
            var firstWordMinLength    = 1;
            var lastWordMinLength     = 1;
            var maxComposition        = int.MaxValue;
            var badWords              = File.ReadLines(Path.Combine(exampleDir, "stop-words-english.txt"))
                                        .Where(m => !string.IsNullOrWhiteSpace(m))
                                        .ToArray();
            var badPatternList = new string[]
            {
            };

            // Files -> List of expressions (Our dictionary based on files)
            var expressions = ExtractExpressionFromTextFiles.ExtractExpressions(
                pathFiles,
                new ExtractExpressionFromTextFilesOption
            {
                BadPatternList           = badPatternList,
                BadWords                 = badWords,
                FirstWordMinLength       = firstWordMinLength,
                LastWordMinLength        = lastWordMinLength,
                MaxExpressionComposition = maxComposition,
                MaxWordCount             = maxWordCount,
                MinGroupOfWordsLength    = minGroupOfWordsLength,
                MinWordCount             = minWordCount,
                MinWordFrequency         = freqMin,
                MinWordLength            = minWordLength
            });

            Console.WriteLine("Expressions: " + expressions.Count);

            // Files -> Vectors
            var expressionVectorOption = new TextFileToExpressionVectorOption
            {
                MinVectorElements = minVectorElements,
                BadPatternList    = badPatternList,
                MaxWordCount      = maxWordCount,
                MinWordCount      = minWordCount,
                Strategy          = strategy,
                ReplaceMissingValueWithRandomValue = replaceMissingValueWithRandomValue
            };

            // vectors[i] always describes pathFiles[i]; both lists are kept aligned below.
            var vectors = new List <double[]>(pathFiles.Count);

            foreach (var pathFile in pathFiles)
            {
                vectors.Add(
                    TextFileToExpressionVector.GenerateExpressionVector(
                        expressions,
                        pathFile,
                        expressionVectorOption));
            }

            Console.WriteLine("vectors count: " + vectors.Count);

            // Remove non-representative vectors.
            // Walking backwards lets us remove entries without adjusting the index afterwards.
            for (int i = vectors.Count - 1; i >= 0; i--)
            {
                if (vectors[i].Sum() < minVectorElements)
                {
                    vectors.RemoveAt(i);
                    pathFiles.RemoveAt(i);
                }
            }
            Console.WriteLine("vectors count (after removing non-representative vectors): " + vectors.Count);

            // Reduce the vector size with PCA.
            if (usePCA)
            {
                Console.WriteLine("Reducing vector size with PCA");
                Stopwatch stopwatch = new Stopwatch();
                stopwatch.Start();
                PrincipalComponentAnalysis pca = new PrincipalComponentAnalysis();
                pca.NumberOfOutputs = numberOfOutputPCA;

                // Learn the projection on a shuffled sample of at most 600 vectors,
                // then apply it to all of them.
                var trainingVector = vectors.ToArray();
                Shuffle(trainingVector);
                trainingVector = trainingVector.Take(600).ToArray();
                var pcaResult             = pca.Learn(trainingVector);
                var reducedVectorsWithPCA = pcaResult.Transform(vectors.ToArray());
                stopwatch.Stop();
                Console.WriteLine("PCA duration: " + stopwatch.Elapsed.ToString());

                vectors = reducedVectorsWithPCA.ToList();
            }


            // Run HDBSCAN algo.
            Console.WriteLine("HDBSCAN starting...");

            var constraints = new List <HdbscanConstraint>();

            if (usePCA)
            {
                // One CannotLink constraint per pair of consecutive indices below numberOfOutputPCA.
                for (int i = 1; i < numberOfOutputPCA; i++)
                {
                    constraints.Add(new HdbscanConstraint(i - 1, i, HdbscanConstraintType.CannotLink));
                }
            }

            var watch  = Stopwatch.StartNew();
            var result = HdbscanRunner.Run(new HdbscanParameters
            {
                DataSet           = vectors.ToArray(),
                MinPoints         = 5,
                MinClusterSize    = 5,
                DistanceFunction  = distanceFunction,
                Constraints       = constraints,
                UseMultipleThread = true
            });

            watch.Stop();
            Console.WriteLine("HDBSCAN done " + watch.Elapsed);

            // Read results.
            var labels = result.Labels;

            // Max() throws on an empty sequence; with no labels there is simply nothing to report.
            int n = labels.Length > 0 ? labels.Max() : 0;

            Console.WriteLine("\n\n");

            // Clusters are renumbered consecutively in the output, skipping unused labels.
            int clusterId = 0;

            // Labels below 1 are never reported by this loop.
            for (int iCluster = 1; iCluster <= n; iCluster++)
            {
                // Number of files per category inside the current cluster.
                Dictionary <string, int> categories = new Dictionary <string, int>();

                for (int i = 0; i < labels.Length; i++)
                {
                    if (labels[i] != iCluster)
                    {
                        continue;
                    }

                    var fileName = Path.GetFileNameWithoutExtension(pathFiles[i]);
                    var category = fileName.Split('-')[0].Trim();

                    // TryGetValue leaves count at 0 for a category not seen yet.
                    int count;
                    categories.TryGetValue(category, out count);
                    categories[category] = count + 1;
                }

                if (categories.Count > 0)
                {
                    clusterId++;
                    Console.WriteLine("Cluster #" + clusterId);

                    Console.WriteLine();
                    foreach (var category in categories)
                    {
                        Console.WriteLine(category.Key + ": " + category.Value);
                    }

                    // Pause until the user presses Enter before showing the next cluster.
                    Console.ReadLine();
                }
            }

            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }