Ejemplo n.º 1
0
        /// <summary>
        /// Builds an expression vector for every sample document, then, for each document,
        /// prints the five other documents with the highest score from
        /// <c>CustomDistance.ComputeDistance</c> and how many of them share its category.
        /// The category of a document is the part of its (truncated) file name before the
        /// first '-'.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // The project directory is expected three levels above the working directory.
            var workingDirParent = Directory.GetParent(Directory.GetCurrentDirectory());
            if (workingDirParent == null ||
                workingDirParent.Parent == null ||
                workingDirParent.Parent.Parent == null)
            {
                throw new InvalidOperationException(
                    "Cannot locate the project directory: the working directory has fewer than three parent directories.");
            }
            var projectDir = workingDirParent.Parent.Parent.FullName;

            // Specify which files to use.
            var pathFiles = Directory
                            .EnumerateFiles(Path.Combine(projectDir, "OrderByMostSimilarDocumentExample", "Samples"))
                            .ToList();

            // Hyper parameters.

            // This option prevents overfitting on missing words.
            var replaceMissingValueWithRandomValue = false;

            var strategy              = ValueStrategy.Presence;
            var minVectorElements     = 25;
            var freqMin               = 5;
            var minWordCount          = 1;
            var maxWordCount          = 3;
            var minGroupOfWordsLength = 1;
            var minWordLength         = 1;
            var firstWordMinLength    = 1;
            var lastWordMinLength     = 1;
            var maxComposition        = 50;

            // NOTE(review): the stop-word list is read from the DocumentClusteringExample folder,
            // not from this example's own folder — presumably shared on purpose; confirm.
            var badWords = File.ReadLines(Path.Combine(projectDir, "DocumentClusteringExample", "stop-words-english.txt"))
                           .Where(m => !string.IsNullOrWhiteSpace(m))
                           .ToArray();
            var badPatternList = new string[0];

            // Files -> List of expressions (our dictionary based on files).
            var expressions = ExtractExpressionFromTextFiles.ExtractExpressions(
                pathFiles,
                new ExtractExpressionFromTextFilesOption
                {
                    BadPatternList           = badPatternList,
                    BadWords                 = badWords,
                    FirstWordMinLength       = firstWordMinLength,
                    LastWordMinLength        = lastWordMinLength,
                    MaxExpressionComposition = maxComposition,
                    MaxWordCount             = maxWordCount,
                    MinGroupOfWordsLength    = minGroupOfWordsLength,
                    MinWordCount             = minWordCount,
                    MinWordFrequency         = freqMin,
                    MinWordLength            = minWordLength
                });

            Console.WriteLine("Expressions: " + expressions.Count);

            // Files -> Vectors
            var expressionVectorOption = new TextFileToExpressionVectorOption
            {
                MinVectorElements = minVectorElements,
                BadPatternList    = badPatternList,
                MaxWordCount      = maxWordCount,
                MinWordCount      = minWordCount,
                Strategy          = strategy,
                ReplaceMissingValueWithRandomValue = replaceMissingValueWithRandomValue
            };

            var vectors = new List<double[]>(pathFiles.Count);
            foreach (var pathFile in pathFiles)
            {
                vectors.Add(TextFileToExpressionVector.GenerateExpressionVector(
                                expressions,
                                pathFile,
                                expressionVectorOption));
            }

            Console.WriteLine("vectors count: " + vectors.Count);

            // Keep only representative vectors. Path and vector are paired immediately so the
            // two can never drift out of sync.
            var listFileAndVector = new List<FileAndVector>();
            for (int i = 0; i < vectors.Count; i++)
            {
                if (vectors[i].Sum() < minVectorElements)
                {
                    continue;
                }

                listFileAndVector.Add(new FileAndVector {
                    Path = pathFiles[i], Vector = vectors[i]
                });
            }
            Console.WriteLine("vectors count (after removing non-representative vectors): " + listFileAndVector.Count);

            var distanceFunc = new CustomDistance();

            Shuffle(listFileAndVector);

            const int maxDisplayLength = 70;
            const int neighbourCount   = 5;

            // File name without extension, cut to at most maxDisplayLength characters.
            Func<string, string> toDisplayName = filePath =>
            {
                var name = Path.GetFileNameWithoutExtension(filePath);
                return name.Length > maxDisplayLength ? name.Substring(0, maxDisplayLength) : name;
            };

            // Category = text before the first '-' of the display name, trimmed.
            Func<string, string> toCategory = displayName => displayName.Split('-')[0].Trim();

            for (int i = 0; i < listFileAndVector.Count; i++)
            {
                var element   = listFileAndVector[i];
                var selfIndex = i;

                // Exclude the current document by position rather than assuming it sorts first:
                // with a stable sort, a tied document earlier in the list would otherwise be
                // dropped and the document would show up among its own neighbours.
                // Each score is computed once and reused for both ordering and display.
                var neighbours = listFileAndVector
                                 .Select((candidate, index) => new { Candidate = candidate, Index = index })
                                 .Where(entry => entry.Index != selfIndex)
                                 .Select(entry => new
                                 {
                                     FilePath = entry.Candidate.Path,
                                     Score    = (double)distanceFunc.ComputeDistance(element.Vector, entry.Candidate.Vector)
                                 })
                                 .OrderByDescending(entry => entry.Score)
                                 .Take(neighbourCount)
                                 .ToList();

                var pathA = toDisplayName(element.Path);
                var catA  = toCategory(pathA);

                int countSameCat = 0;

                Console.WriteLine("\n\n\n# " + pathA + "\n");
                foreach (var neighbour in neighbours)
                {
                    var pathB = toDisplayName(neighbour.FilePath);
                    var catB  = toCategory(pathB);

                    if (catA == catB)
                    {
                        countSameCat++;
                    }

                    // "0.##" keeps a leading zero; "#.##" rendered a zero score as an empty string.
                    Console.WriteLine(" - " + pathB + " " + neighbour.Score.ToString("0.##"));
                }
                Console.WriteLine("\nSame category: " + countSameCat);

                // Pause so each document's result can be read before moving on.
                Console.ReadLine();
            }

            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds an expression vector for every sample document, optionally reduces the
        /// vectors with PCA, clusters them with HDBSCAN and prints, for each cluster, the
        /// number of documents per category. The category of a document is the part of its
        /// file name before the first '-'.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // The project directory is expected three levels above the working directory.
            var workingDirParent = Directory.GetParent(Directory.GetCurrentDirectory());
            if (workingDirParent == null ||
                workingDirParent.Parent == null ||
                workingDirParent.Parent.Parent == null)
            {
                throw new InvalidOperationException(
                    "Cannot locate the project directory: the working directory has fewer than three parent directories.");
            }
            var projectDir = workingDirParent.Parent.Parent.FullName;

            // Specify which files to use.
            var pathFiles = Directory
                            .EnumerateFiles(Path.Combine(projectDir, "DocumentClusteringExample", "Samples"))
                            .ToList();

            // Hyper parameters.

            // This option prevents overfitting on missing words.
            var replaceMissingValueWithRandomValue = false;

            var usePCA            = false;
            var numberOfOutputPCA = 100;
            var distanceFunction  = new PearsonCorrelation();

            var strategy              = ValueStrategy.Freq;
            var minVectorElements     = 2;
            var freqMin               = 2;
            var minWordCount          = 1;
            var maxWordCount          = 3;
            var minGroupOfWordsLength = 3;
            var minWordLength         = 1;
            var firstWordMinLength    = 1;
            var lastWordMinLength     = 1;
            var maxComposition        = int.MaxValue;
            var badWords              = File.ReadLines(Path.Combine(projectDir, "DocumentClusteringExample", "stop-words-english.txt"))
                                        .Where(m => !string.IsNullOrWhiteSpace(m))
                                        .ToArray();
            var badPatternList = new string[0];

            // Files -> List of expressions (our dictionary based on files).
            var expressions = ExtractExpressionFromTextFiles.ExtractExpressions(
                pathFiles,
                new ExtractExpressionFromTextFilesOption
                {
                    BadPatternList           = badPatternList,
                    BadWords                 = badWords,
                    FirstWordMinLength       = firstWordMinLength,
                    LastWordMinLength        = lastWordMinLength,
                    MaxExpressionComposition = maxComposition,
                    MaxWordCount             = maxWordCount,
                    MinGroupOfWordsLength    = minGroupOfWordsLength,
                    MinWordCount             = minWordCount,
                    MinWordFrequency         = freqMin,
                    MinWordLength            = minWordLength
                });

            Console.WriteLine("Expressions: " + expressions.Count);

            // Files -> Vectors
            var expressionVectorOption = new TextFileToExpressionVectorOption
            {
                MinVectorElements = minVectorElements,
                BadPatternList    = badPatternList,
                MaxWordCount      = maxWordCount,
                MinWordCount      = minWordCount,
                Strategy          = strategy,
                ReplaceMissingValueWithRandomValue = replaceMissingValueWithRandomValue
            };

            var allVectors = new List<double[]>(pathFiles.Count);
            foreach (var pathFile in pathFiles)
            {
                allVectors.Add(TextFileToExpressionVector.GenerateExpressionVector(
                                   expressions,
                                   pathFile,
                                   expressionVectorOption));
            }

            Console.WriteLine("vectors count: " + allVectors.Count);

            // Keep only representative vectors. keptPaths[i] always belongs to vectors[i];
            // building both lists in one pass avoids removing from two lists in lockstep.
            var keptPaths = new List<string>();
            var vectors   = new List<double[]>();
            for (int i = 0; i < allVectors.Count; i++)
            {
                if (allVectors[i].Sum() < minVectorElements)
                {
                    continue;
                }

                keptPaths.Add(pathFiles[i]);
                vectors.Add(allVectors[i]);
            }
            Console.WriteLine("vectors count (after removing non-representative vectors): " + vectors.Count);

            // Reduce the vector size with PCA.
            if (usePCA)
            {
                Console.WriteLine("Reducing vector size with PCA");
                var stopwatch = Stopwatch.StartNew();
                var pca       = new PrincipalComponentAnalysis();
                pca.NumberOfOutputs = numberOfOutputPCA;

                // Learn the projection on a shuffled sample of at most 600 vectors,
                // then apply it to every vector.
                var trainingVector = vectors.ToArray();
                Shuffle(trainingVector);
                trainingVector = trainingVector.Take(600).ToArray();
                var pcaResult             = pca.Learn(trainingVector);
                var reducedVectorsWithPCA = pcaResult.Transform(vectors.ToArray());
                stopwatch.Stop();
                Console.WriteLine("PCA duration: " + stopwatch.Elapsed.ToString());

                vectors = reducedVectorsWithPCA.ToList();
            }

            // Run HDBSCAN algo.
            Console.WriteLine("HDBSCAN starting...");

            var constraints = new List<HdbscanConstraint>();

            if (usePCA)
            {
                // NOTE(review): this adds a CannotLink constraint between every pair of
                // consecutive indices below numberOfOutputPCA. The indices look like data-point
                // indices while the bound is the PCA output count — confirm this is intended
                // and that it is safe when there are fewer vectors than numberOfOutputPCA.
                for (int i = 1; i < numberOfOutputPCA; i++)
                {
                    constraints.Add(new HdbscanConstraint(i - 1, i, HdbscanConstraintType.CannotLink));
                }
            }

            var watch  = Stopwatch.StartNew();
            var result = HdbscanRunner.Run(new HdbscanParameters
            {
                DataSet           = vectors.ToArray(),
                MinPoints         = 5,
                MinClusterSize    = 5,
                DistanceFunction  = distanceFunction,
                Constraints       = constraints,
                UseMultipleThread = true
            });

            watch.Stop();
            Console.WriteLine("HDBSCAN done " + watch.Elapsed);

            // Read results.
            var labels = result.Labels;

            Console.WriteLine("\n\n");

            // Tally categories per cluster label in a single pass over the labels.
            // Labels below 1 are not reported (presumably noise/unclustered points — confirm
            // against the HDBSCAN implementation). The sorted dictionary yields clusters in
            // ascending label order, and an empty label array simply produces no output.
            var categoriesByCluster = new SortedDictionary<int, Dictionary<string, int>>();
            for (int i = 0; i < labels.Length; i++)
            {
                var label = labels[i];
                if (label < 1)
                {
                    continue;
                }

                Dictionary<string, int> counts;
                if (!categoriesByCluster.TryGetValue(label, out counts))
                {
                    counts = new Dictionary<string, int>();
                    categoriesByCluster.Add(label, counts);
                }

                var fileName     = Path.GetFileNameWithoutExtension(keptPaths[i]);
                var categoryName = fileName.Split('-')[0].Trim();

                // TryGetValue leaves 'seen' at 0 when the category is new.
                int seen;
                counts.TryGetValue(categoryName, out seen);
                counts[categoryName] = seen + 1;
            }

            // Clusters are numbered sequentially in the output, independent of the raw label.
            int clusterId = 0;
            foreach (var cluster in categoriesByCluster)
            {
                clusterId++;
                Console.WriteLine("Cluster #" + clusterId);

                Console.WriteLine();
                foreach (var categoryCount in cluster.Value)
                {
                    Console.WriteLine(categoryCount.Key + ": " + categoryCount.Value);
                }

                // Pause so each cluster can be read before moving on.
                Console.ReadLine();
            }

            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }