示例#1
0
        //public static void ClassifyAndValidateOne(string trainFile, string sampleClassifiedFile, string classificationFile, string validationFile)
        //1412520
        /// <summary>
        /// Runs one classify-and-validate pass over the fixed files in
        /// <c>../../validation</c>.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. Read <c>testTarget.txt</c>, pass its lines through
        ///    <c>RemoveLabelOfList</c> and write the result to <c>test.txt</c>.
        /// 2. Build the TF-IDF artefacts (<c>processed.txt</c>, <c>features.txt</c>,
        ///    <c>tf_idf.txt</c>) from <c>train.txt</c>.
        /// 3. Call <c>Model.classifyForValidate()</c>.
        /// 4. Call the three-argument <c>Validate</c> overload with
        ///    <c>testTarget.txt</c>, <c>testResult.txt</c> and <c>result.txt</c>.
        /// NOTE(review): presumably <c>Model.classifyForValidate()</c> is what
        /// produces <c>testResult.txt</c>; that is not visible here, confirm.
        /// </remarks>
        public static void ClassifyAndValidateOne()
        {
            const string labelledTestPath   = "../../validation/testTarget.txt";
            const string unlabelledTestPath = "../../validation/test.txt";
            const string trainPath          = "../../validation/train.txt";
            const string processedPath      = "../../validation/processed.txt";
            const string featuresPath       = "../../validation/features.txt";
            const string tfIdfPath          = "../../validation/tf_idf.txt";
            const string predictionPath     = "../../validation/testResult.txt";
            const string scorePath          = "../../validation/result.txt";

            // Derive the test input for the classifier from the labelled test file.
            FileIO.WriteFile(RemoveLabelOfList(FileIO.ReadFile(labelledTestPath)), unlabelledTestPath);

            // Build the TF-IDF representation of the training data.
            Bow_tfidf.GenerateTFIDFMatrix(trainPath, processedPath, featuresPath, tfIdfPath);

            // Classify, then validate against the labelled test file.
            Model.classifyForValidate();
            Validate(labelledTestPath, predictionPath, scorePath);
        }
示例#2
0
        // 1412542
        // Date: 08/04/2018
        // Cross validation
        /// <summary>
        /// Runs k-fold cross validation over the pre-split subsets found in
        /// <c>../../splited_data</c> and writes the average macro and micro
        /// F-scores to <c>../../validation/result.txt</c>.
        /// </summary>
        /// <param name="kFoldsFile">
        /// Path of a file whose first line is parsed as the number of folds.
        /// </param>
        /// <param name="inputFile">
        /// Currently unused: the step that read this file and split it into
        /// subsets is disabled, so the subsets must already exist in
        /// <c>../../splited_data</c>.
        /// </param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// The fold count read from <paramref name="kFoldsFile"/> is larger than
        /// the number of subset files available.
        /// </exception>
        /// <remarks>
        /// For each fold <c>i</c>, subset <c>i</c> is the test set and the other
        /// subsets (up to the fold count) are concatenated as the training set.
        /// The working files in <c>../../validation</c> are overwritten on every
        /// fold. The result file holds two lines: average F-macro first, then
        /// average F-micro.
        /// </remarks>
        public static void Validate(string kFoldsFile, string inputFile)
        {
            List <string> fileNameList   = FileUtils.getAllFileNames("../../splited_data");
            String        trainFile      = "../../validation/train.txt";
            String        processedFile  = "../../validation/processed.txt";
            String        featuresFile   = "../../validation/features.txt";
            String        tfIdfFile      = "../../validation/tf_idf.txt";
            String        testFile       = "../../validation/test.txt";
            String        testTargetFile = "../../validation/testTarget.txt";
            String        testResultFile = "../../validation/testResult.txt";

            // Read the number of subsets (folds) from the first line of kFoldsFile.
            int numberOfFolds = int.Parse(FileIO.ReadFile(kFoldsFile)[0]);

            // The data is expected to be split already; splitting inputFile via
            // SplitTrainTest(data, numberOfFolds) is intentionally not done here.

            // Read subsets, one per file found in the split-data directory.
            List <List <string> > subset = new List <List <string> >();

            foreach (string fileName in fileNameList)
            {
                subset.Add(FileIO.ReadFile(fileName));
            }

            // Every fold indexes subset[0 .. numberOfFolds - 1]; fail early with a
            // clear message instead of an opaque indexer error inside the loop.
            if (subset.Count < numberOfFolds)
            {
                throw new ArgumentOutOfRangeException(
                    "kFoldsFile",
                    string.Format(
                        "Requested {0} folds but only {1} subset files were found in ../../splited_data.",
                        numberOfFolds,
                        subset.Count));
            }

            // Running totals used to compute the average F-micro and F-macro.
            double sum_Fmicro = 0;
            double sum_Fmacro = 0;

            // Build the model and test it by k-fold cross validation.
            for (int i = 0; i < numberOfFolds; i++)
            {
                var watch = System.Diagnostics.Stopwatch.StartNew();
                Console.WriteLine("Fold {0}: ", i);

                // subset[i] is the test set, the others are training sets.
                List <string> trainingSet = new List <string>();
                for (int subIndex = 0; subIndex < numberOfFolds; subIndex++)
                {
                    if (subIndex != i)
                    {
                        trainingSet.AddRange(subset[subIndex]);
                    }
                }
                List <string> testTarget = subset[i].ToList();
                Console.WriteLine("The time to create training and test sets: {0} ", watch.ElapsedMilliseconds);

                // Persist the labelled test set, then read it back to obtain the
                // target vectors (out) and the returned test set.
                // NOTE(review): the returned test set is presumably label-free
                // (per the original comment); ReadFileIntoVector is not visible.
                FileIO.WriteFile(testTarget, testTargetFile);
                List <Vector> targetVector;
                var           testSet = FileIO.ReadFileIntoVector(testTargetFile, out targetVector, true);

                // Write the test set and training set to the working files.
                FileIO.WriteFile(trainingSet, trainFile);
                FileIO.WriteFile(testSet, testFile);
                Console.WriteLine("The time to write training and test sets: {0} ", watch.ElapsedMilliseconds);

                Bow_tfidf.GenerateTFIDFMatrix(trainFile, processedFile, featuresFile, tfIdfFile);
                Console.WriteLine("The time to calculate tf_idf: {0} ", watch.ElapsedMilliseconds);

                Model.classifyForValidate();

                // Load the vectors from the result file to score against the targets.
                List <Vector> sourceVector;
                FileIO.ReadFileIntoVector(testResultFile, out sourceVector, true);

                Double Rmacro = calculateRmacro(sourceVector, targetVector);
                Double Pmacro = CalculatePmacro(sourceVector, targetVector);

                sum_Fmacro += calculateFmacroOrFscore(Rmacro, Pmacro);
                sum_Fmicro += calculateFmicro(sourceVector, targetVector);

                watch.Stop();
                var elapsedMs = watch.ElapsedMilliseconds;
                Console.WriteLine("Validating the fold {0} takes: {1} ", i, elapsedMs);
            }

            // Each average is taken from its own accumulator. Earlier code crossed
            // them, so the two lines in the result file were swapped.
            List <string> F_array    = new List <string>();
            double        avg_Fmacro = sum_Fmacro / numberOfFolds;
            double        avg_Fmicro = sum_Fmicro / numberOfFolds;

            F_array.Add(avg_Fmacro.ToString());
            F_array.Add(avg_Fmicro.ToString());

            // Write avg F-macro (line 1) and avg F-micro (line 2) into the file.
            FileIO.WriteFile(F_array, "../../validation/result.txt");
        }