// Earlier parameterized signature, kept for reference:
// public static void ClassifyAndValidateOne(string trainFile, string sampleClassifiedFile, string classificationFile, string validationFile)
// 1412520
/// <summary>
/// Runs a single classify-and-validate pass over the fixed files under ../../validation:
/// strips the labels from the labelled test file, regenerates the TF-IDF files from the
/// training file, runs the classifier, and then validates the classifier output against
/// the labelled test file.
/// </summary>
public static void ClassifyAndValidateOne()
{
    const string labelledTestPath = "../../validation/testTarget.txt";
    const string unlabelledTestPath = "../../validation/test.txt";
    const string trainPath = "../../validation/train.txt";
    const string processedPath = "../../validation/processed.txt";
    const string featuresPath = "../../validation/features.txt";
    const string tfIdfPath = "../../validation/tf_idf.txt";
    const string predictedPath = "../../validation/testResult.txt";
    const string reportPath = "../../validation/result.txt";

    // Produce the unlabelled copy of the test records that the classifier consumes.
    FileIO.WriteFile(RemoveLabelOfList(FileIO.ReadFile(labelledTestPath)), unlabelledTestPath);

    // Rebuild the processed/features/TF-IDF files from the training data.
    Bow_tfidf.GenerateTFIDFMatrix(trainPath, processedPath, featuresPath, tfIdfPath);

    // NOTE(review): presumably reads the files written above and writes testResult.txt;
    // its inputs and outputs are not visible from here - confirm against Model.
    Model.classifyForValidate();

    // Compare the labelled test file with the classifier output and write the report.
    Validate(labelledTestPath, predictedPath, reportPath);
}
// 1412542
// Date: 08/04/2018
// Cross validation
/// <summary>
/// Runs k-fold cross validation over the pre-split subset files found in
/// ../../splited_data and writes the average F-macro and average F-micro scores
/// (in that order, one entry each) to ../../validation/result.txt.
/// </summary>
/// <param name="kFoldsFile">
/// Path of a file whose first entry, as returned by FileIO.ReadFile, is parsed as the number of folds.
/// </param>
/// <param name="inputFile">
/// Currently unused. The step that read this file and split it into subsets is disabled;
/// the subsets are read from the already-split files instead.
/// </param>
/// <exception cref="ArgumentOutOfRangeException">
/// The fold count is less than 1 or greater than the number of subset files available.
/// </exception>
public static void Validate(string kFoldsFile, string inputFile)
{
    const string subsetDirectory = "../../splited_data";
    const string trainFile = "../../validation/train.txt";
    const string processedFile = "../../validation/processed.txt";
    const string featuresFile = "../../validation/features.txt";
    const string tfIdfFile = "../../validation/tf_idf.txt";
    const string testFile = "../../validation/test.txt";
    const string testTargetFile = "../../validation/testTarget.txt";
    const string testResultFile = "../../validation/testResult.txt";
    const string resultFile = "../../validation/result.txt";

    List<string> fileNameList = FileUtils.getAllFileNames(subsetDirectory);

    // Read the number of subsets (folds).
    int numberOfFolds = int.Parse(FileIO.ReadFile(kFoldsFile)[0]);

    // Fail early with a clear message instead of an index error part-way through a run
    // (too many folds) or NaN averages from a division by zero (no folds).
    if (numberOfFolds < 1 || numberOfFolds > fileNameList.Count)
    {
        throw new ArgumentOutOfRangeException(
            "kFoldsFile",
            numberOfFolds,
            "The number of folds must be between 1 and the number of subset files (" + fileNameList.Count + ").");
    }

    // Read every subset file; subsets[k] holds the records of the k-th file.
    List<List<string>> subsets = new List<List<string>>();
    foreach (string fileName in fileNameList)
    {
        subsets.Add(FileIO.ReadFile(fileName));
    }

    // Running totals used to compute the average F-micro and F-macro over all folds.
    double sumFmicro = 0;
    double sumFmacro = 0;

    // Build a model and test it once per fold.
    for (int fold = 0; fold < numberOfFolds; fold++)
    {
        var watch = System.Diagnostics.Stopwatch.StartNew();
        Console.WriteLine("Fold {0}: ", fold);

        // subsets[fold] is the test set; every other subset goes into the training set.
        List<string> trainingSet = new List<string>();
        for (int subIndex = 0; subIndex < numberOfFolds; subIndex++)
        {
            if (subIndex != fold)
            {
                trainingSet.AddRange(subsets[subIndex]);
            }
        }
        List<string> testTarget = new List<string>(subsets[fold]);
        Console.WriteLine("The time to create training and test sets: {0} ", watch.ElapsedMilliseconds);

        FileIO.WriteFile(testTarget, testTargetFile);

        // Re-read the labelled test file: the returned test set has the labels removed,
        // and the expected labels come back through targetVector.
        List<Vector> targetVector;
        var testSet = FileIO.ReadFileIntoVector(testTargetFile, out targetVector, true);

        // Write the training set and the unlabelled test set to their files.
        FileIO.WriteFile(trainingSet, trainFile);
        FileIO.WriteFile(testSet, testFile);
        Console.WriteLine("The time to write training and test sets: {0} ", watch.ElapsedMilliseconds);

        Bow_tfidf.GenerateTFIDFMatrix(trainFile, processedFile, featuresFile, tfIdfFile);
        Console.WriteLine("The time to calculate tf_idf: {0} ", watch.ElapsedMilliseconds);

        // NOTE(review): presumably consumes the files written above and produces
        // testResultFile - not visible from here; confirm against Model.
        Model.classifyForValidate();

        // Load the predicted labels and score them against the expected ones.
        List<Vector> sourceVector;
        FileIO.ReadFileIntoVector(testResultFile, out sourceVector, true);

        double rMacro = calculateRmacro(sourceVector, targetVector);
        double pMacro = CalculatePmacro(sourceVector, targetVector);
        sumFmacro += calculateFmacroOrFscore(rMacro, pMacro);
        sumFmicro += calculateFmicro(sourceVector, targetVector);

        watch.Stop();
        Console.WriteLine("Validating the fold {0} takes: {1} ", fold, watch.ElapsedMilliseconds);
    }

    // Each average is taken from its own running total. Previously the two sums were
    // crossed, so the values were written in the opposite order to their names.
    double avgFmacro = sumFmacro / numberOfFolds;
    double avgFmicro = sumFmicro / numberOfFolds;

    // Write the average F-macro, then the average F-micro, to the result file.
    List<string> averages = new List<string>();
    averages.Add(avgFmacro.ToString());
    averages.Add(avgFmicro.ToString());
    FileIO.WriteFile(averages, resultFile);
}