/// <summary>
/// Loads "iris.csv", clusters its rows with HDBSCAN using cosine similarity, and prints
/// the labels of three consecutive groups of 50 rows, one output line per group.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const int specieCount = 3;
    const int rowsPerSpecie = 50;

    // NOTE(review): the meaning of the second LoadCsv argument (5) is not visible here — confirm.
    var dataset = LoadCsv("iris.csv", 5);

    var parameters = new HdbscanParameters
    {
        DataSet = dataset,
        MinPoints = 25,
        MinClusterSize = 25,
        DistanceFunction = new CosineSimilarity()
    };
    var result = HdbscanRunner.Run(parameters);

    for (var specie = 1; specie <= specieCount; specie++)
    {
        // Rows of one group are contiguous: group k covers rows [(k - 1) * 50, k * 50).
        var firstRow = (specie - 1) * rowsPerSpecie;
        var endRow = firstRow + rowsPerSpecie;

        Console.Write("Specie #" + specie + " ");

        // Labels are written back to back, without any separator.
        for (var row = firstRow; row < endRow; row++)
        {
            Console.Write(result.Labels[row]);
        }

        Console.WriteLine();
    }

    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
/// <summary>
/// Runs HDBSCAN with the Pearson correlation distance on six hand-written vectors
/// and checks that no returned outlier score is below 0 or above 1.
/// </summary>
public void TestValidateOutlierScoreBetweenZeroAndOne()
{
    // NOTE(review): vectors a and b have 15 elements while c, d, e and f have 16.
    // The data is left untouched here; confirm the length mismatch is intentional.

    // Cluster 1
    var a = new double[] { 21.33, 21.33, 21.33, 21.33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    var b = new double[] { 19.99, 19.99, 19.99, 19.990000000000002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

    // Cluster 2
    var c = new double[] { 1, 2, 3, 4, 5, 6, 6, 7, 7, 9, 3, 4, 3, 2, 2, 1 };
    var d = new double[] { 1, 3, 3, 5, 5, 6, 6, 8, 8, 9, 3, 4, 3, 2, 1, 2 };

    // Outliers
    var e = new double[] { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 };
    var f = new double[] { 1, 2, 3, 0, 5, 6, 0, 0, 7, 9, 0, 0, 3, 0, 2, 0 };

    var dataset = new List<double[]> { a, b, c, d, e, f };

    var result = HdbscanRunner.Run(new HdbscanParameters<double[]>
    {
        DataSet = dataset.ToArray(),
        MinPoints = 2,
        MinClusterSize = 2,
        DistanceFunction = new PearsonCorrelation()
    });

    var numInvalidScore = result.OutliersScore.Count(m => m.Score < 0 || m.Score > 1);

    // Expected value first, actual second, so a failure reads "expected 0, actual N".
    Assert.AreEqual(0, numInvalidScore);
}
/// <summary>
/// Document clustering example.
/// Extracts expressions from the sample text files, converts every file into a vector,
/// optionally reduces the vectors with PCA, clusters them with HDBSCAN and prints, for
/// each non-empty cluster, how many files of each category it contains. A file's category
/// is the part of its name (without extension) before the first '-'.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Specify which files to use.
    // Path.Combine is used instead of hard-coded '\' separators so the example is not Windows-only.
    var projectDir = Directory.GetParent(Directory.GetCurrentDirectory()).Parent.Parent.FullName;
    var exampleDir = Path.Combine(projectDir, "DocumentClusteringExample");
    var pathFiles = Directory.EnumerateFiles(Path.Combine(exampleDir, "Samples")).ToList();

    // Hyper parameters.
    // This option prevents overfitting on missing words.
    var replaceMissingValueWithRandomValue = false;
    var usePCA = false;
    var numberOfOutputPCA = 100;
    var distanceFunction = new PearsonCorrelation();
    var strategy = ValueStrategy.Freq;
    var minVectorElements = 2;
    var freqMin = 2;
    var minWordCount = 1;
    var maxWordCount = 3;
    var minGroupOfWordsLength = 3;
    var minWordLength = 1;
    var firstWordMinLength = 1;
    var lastWordMinLength = 1;
    var maxComposition = int.MaxValue;
    var badWords = File.ReadLines(Path.Combine(exampleDir, "stop-words-english.txt"))
        .Where(m => !string.IsNullOrWhiteSpace(m))
        .ToArray();
    var badPatternList = new string[] { };

    // Files -> List of expressions (Our dictionary based on files)
    var expressions = ExtractExpressionFromTextFiles.ExtractExpressions(
        pathFiles,
        new ExtractExpressionFromTextFilesOption
        {
            BadPatternList = badPatternList,
            BadWords = badWords,
            FirstWordMinLength = firstWordMinLength,
            LastWordMinLength = lastWordMinLength,
            MaxExpressionComposition = maxComposition,
            MaxWordCount = maxWordCount,
            MinGroupOfWordsLength = minGroupOfWordsLength,
            MinWordCount = minWordCount,
            MinWordFrequency = freqMin,
            MinWordLength = minWordLength
        });
    Console.WriteLine("Expressions: " + expressions.Count);

    // Files -> Vectors
    var expressionVectorOption = new TextFileToExpressionVectorOption
    {
        MinVectorElements = minVectorElements,
        BadPatternList = badPatternList,
        MaxWordCount = maxWordCount,
        MinWordCount = minWordCount,
        Strategy = strategy,
        ReplaceMissingValueWithRandomValue = replaceMissingValueWithRandomValue
    };

    // vectors[i] always corresponds to pathFiles[i]; both lists are kept in sync below.
    List<double[]> vectors = new List<double[]>(pathFiles.Count);
    foreach (var pathFile in pathFiles)
    {
        vectors.Add(
            TextFileToExpressionVector.GenerateExpressionVector(
                expressions,
                pathFile,
                expressionVectorOption));
    }
    Console.WriteLine("vectors count: " + vectors.Count);

    // Remove non-representative vectors.
    // Iterating backwards lets us remove in place without adjusting the index.
    for (int i = vectors.Count - 1; i >= 0; i--)
    {
        if (vectors[i].Sum() < minVectorElements)
        {
            vectors.RemoveAt(i);
            pathFiles.RemoveAt(i);
        }
    }
    Console.WriteLine("vectors count (after removing non-representative vectors): " + vectors.Count);

    // Reduce the vector size with PCA.
    if (usePCA)
    {
        Console.WriteLine("Reducing vector size with PCA");
        Stopwatch stopwatch = new Stopwatch();
        stopwatch.Start();

        PrincipalComponentAnalysis pca = new PrincipalComponentAnalysis();
        pca.NumberOfOutputs = numberOfOutputPCA;

        // Learn the projection on a shuffled sample of at most 600 vectors,
        // then apply it to every vector.
        var trainingVector = vectors.ToArray();
        Shuffle(trainingVector);
        trainingVector = trainingVector.Take(600).ToArray();
        var pcaResult = pca.Learn(trainingVector);
        var reducedVectorsWithPCA = pcaResult.Transform(vectors.ToArray());

        stopwatch.Stop();
        Console.WriteLine("PCA duration: " + stopwatch.Elapsed.ToString());

        vectors = reducedVectorsWithPCA.ToList();
    }

    // Run HDBSCAN algo.
    Console.WriteLine("HDBSCAN starting...");

    var contraintsList = new List<HdbscanConstraint>();
    if (usePCA)
    {
        // NOTE(review): adds a CannotLink constraint for every pair of consecutive indices
        // below numberOfOutputPCA; what these indices refer to is not visible here — confirm.
        for (int i = 1; i < numberOfOutputPCA; i++)
        {
            contraintsList.Add(new HdbscanConstraint(i - 1, i, HdbscanConstraintType.CannotLink));
        }
    }

    var watch = Stopwatch.StartNew();
    var result = HdbscanRunner.Run(new HdbscanParameters
    {
        DataSet = vectors.ToArray(),
        MinPoints = 5,
        MinClusterSize = 5,
        DistanceFunction = distanceFunction,
        Constraints = contraintsList,
        UseMultipleThread = true
    });
    watch.Stop();
    Console.WriteLine("HDBSCAN done " + watch.Elapsed);

    // Read results.
    var labels = result.Labels;

    // Max() throws on an empty sequence; with no labels there is simply nothing to print.
    int n = labels.Length > 0 ? labels.Max() : 0;

    Console.WriteLine("\n\n");

    // Displayed cluster numbers are consecutive: labels that match no file are skipped.
    int clusterId = 0;
    for (int iCluster = 1; iCluster <= n; iCluster++)
    {
        // Count the files of this cluster per category.
        Dictionary<string, int> categories = new Dictionary<string, int>();
        for (int i = 0; i < labels.Length; i++)
        {
            if (labels[i] != iCluster)
            {
                continue;
            }

            var fileName = Path.GetFileNameWithoutExtension(pathFiles[i]);
            var category = fileName.Split('-')[0].Trim();

            // A missing key leaves count at 0, so a single lookup handles both cases.
            int count;
            categories.TryGetValue(category, out count);
            categories[category] = count + 1;
        }

        if (categories.Count > 0)
        {
            clusterId++;
            Console.WriteLine("Cluster #" + clusterId);
            Console.WriteLine();
            foreach (var category in categories)
            {
                Console.WriteLine(category.Key + ": " + category.Value);
            }

            // Pause after each cluster until the user presses Enter.
            Console.ReadLine();
        }
    }

    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}