コード例 #1
0
ファイル: CustomDistance.cs プロジェクト: Anshul-Bansal/gsoc
 internal static HandleRef getCPtr(CustomDistance obj)
 {
     return((obj == null) ? new HandleRef(null, IntPtr.Zero) : obj.swigCPtr);
 }
コード例 #2
0
ファイル: Program.cs プロジェクト: erisonliang/HdbscanSharp
        static void Main(string[] args)
        {
            // Specify which files to use.
            var projectDir = Directory.GetParent(Directory.GetCurrentDirectory()).Parent.Parent.FullName;
            var pathFiles  = Directory.EnumerateFiles(projectDir + @"\OrderByMostSimilarDocumentExample\Samples").ToList();

            // Hyper parameters.

            // This option prevent overfitting on missing words.
            var replaceMissingValueWithRandomValue = false;

            var strategy              = ValueStrategy.Presence;
            var minVectorElements     = 25;
            var freqMin               = 5;
            var minWordCount          = 1;
            var maxWordCount          = 3;
            var minGroupOfWordsLength = 1;
            var minWordLength         = 1;
            var firstWordMinLength    = 1;
            var lastWordMinLength     = 1;
            var maxComposition        = 50;
            var badWords              = File.ReadLines(projectDir + @"\DocumentClusteringExample\stop-words-english.txt")
                                        .Where(m => !string.IsNullOrWhiteSpace(m))
                                        .ToArray();
            var badPatternList = new string[]
            {
            };

            // Files -> List of expressions (Our dictionary based on files)
            var expressions = ExtractExpressionFromTextFiles.ExtractExpressions(
                pathFiles,
                new ExtractExpressionFromTextFilesOption
            {
                BadPatternList           = badPatternList,
                BadWords                 = badWords,
                FirstWordMinLength       = firstWordMinLength,
                LastWordMinLength        = lastWordMinLength,
                MaxExpressionComposition = maxComposition,
                MaxWordCount             = maxWordCount,
                MinGroupOfWordsLength    = minGroupOfWordsLength,
                MinWordCount             = minWordCount,
                MinWordFrequency         = freqMin,
                MinWordLength            = minWordLength
            });

            Console.WriteLine("Expressions: " + expressions.Count);

            // Files -­> Vectors
            var expressionVectorOption = new TextFileToExpressionVectorOption
            {
                MinVectorElements = minVectorElements,
                BadPatternList    = badPatternList,
                MaxWordCount      = maxWordCount,
                MinWordCount      = minWordCount,
                Strategy          = strategy,
                ReplaceMissingValueWithRandomValue = replaceMissingValueWithRandomValue
            };
            List <Tuple <string, double[]> > filesToVector = new List <Tuple <string, double[]> >();

            foreach (var pathFile in pathFiles)
            {
                filesToVector.Add(
                    new Tuple <string, double[]>(
                        pathFile,
                        TextFileToExpressionVector.GenerateExpressionVector(
                            expressions,
                            pathFile,
                            expressionVectorOption)
                        )
                    );
            }
            var vectors = filesToVector
                          .Select(m => m.Item2)
                          .ToList();

            Console.WriteLine("vectors count: " + vectors.Count);

            // Remove non-representative vectors
            for (int i = 0; i < vectors.Count; i++)
            {
                var vector = vectors[i];
                if (vector.Sum() < minVectorElements)
                {
                    vectors.RemoveAt(i);
                    pathFiles.RemoveAt(i);
                    i--;
                }
            }
            Console.WriteLine("vectors count (after removing non-representative vectors): " + vectors.Count);

            var listFileAndVector = new List <FileAndVector>();

            for (int i = 0; i < vectors.Count; i++)
            {
                var path   = pathFiles[i];
                var vector = vectors[i];
                listFileAndVector.Add(new FileAndVector {
                    Path = path, Vector = vector
                });
            }

            var distanceFunc = new CustomDistance();

            Shuffle(listFileAndVector);

            for (int i = 0; i < listFileAndVector.Count; i++)
            {
                var element     = listFileAndVector[i];
                var orderedList = listFileAndVector.OrderByDescending(m => distanceFunc.ComputeDistance(element.Vector, m.Vector));

                var pathA = Path.GetFileNameWithoutExtension(element.Path);
                pathA = string.Join("", pathA.Take(70));
                var catA = pathA.Split('-')[0].Trim();

                int countSameCat = 0;

                Console.WriteLine("\n\n\n# " + pathA + "\n");
                foreach (var item in orderedList.Skip(1).Take(5))
                {
                    var pathB = Path.GetFileNameWithoutExtension(item.Path);
                    pathB = string.Join("", pathB.Take(70));
                    var catB = pathB.Split('-')[0].Trim();

                    double score = distanceFunc.ComputeDistance(element.Vector, item.Vector);

                    if (catA == catB)
                    {
                        countSameCat++;
                    }

                    Console.WriteLine(" - " + pathB + " " + string.Format("{0:#.##}", score));
                }
                Console.WriteLine("\nSame category: " + countSameCat);
                Console.ReadLine();
            }

            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
コード例 #3
0
ファイル: CustomDistance.cs プロジェクト: Anshul-Bansal/gsoc
 internal static HandleRef getCPtr(CustomDistance obj) {
   return (obj == null) ? new HandleRef(null, IntPtr.Zero) : obj.swigCPtr;
 }