/*private static void TrainSentenceDetector(int iterations, int cut)
{
    // train sentence detector
    var sdTrainingFileDirectory = CurrentDirectory + "Input/SentenceDetect";
    var allTrainingFiles = Directory.GetFiles(sdTrainingFileDirectory);

    Console.WriteLine("Starting training...");
    var model = MaximumEntropySentenceDetector.TrainModel(allTrainingFiles, iterations, cut, new CharactersSpecificEndOfSentenceScanner('.', '?', '!', '"'));
    var tokenizer = new MaximumEntropySentenceDetector(model);

    // test data
    var trainingLines = new List<string>();
    foreach (var trainingFile in allTrainingFiles)
    {
        trainingLines.AddRange(File.ReadAllLines(trainingFile));
    }
    var testData = new List<TokenizerTestData>();
    foreach (var trainingLine in trainingLines)
    {
        var testDataPoint = new TokenizerTestData(trainingLine, splitMarker.ToString());
        testData.Add(testDataPoint);
    }
    var results = tokenizer.RunAgainstTestData(testData);
    Console.WriteLine("Accuracy of model iteration={0}, cut={1}: {2}", iterations, cut, results.GetAccuracy());
}*/

private static void TrainTokenizer(int iterations, int cut)
{
    // train tokenizer
    var tokenizeTrainingFileDirectory = CurrentDirectory + "Input/Tokenize/";
    const char splitMarker = '|';
    var allTrainingFiles = Directory.GetFiles(tokenizeTrainingFileDirectory);

    Console.WriteLine("Starting training...");
    var model = MaximumEntropyTokenizer.Train(allTrainingFiles, iterations, cut);
    var tokenizer = new MaximumEntropyTokenizer(model);

    // test data
    var trainingLines = new List<string>();
    foreach (var trainingFile in allTrainingFiles)
    {
        trainingLines.AddRange(File.ReadAllLines(trainingFile));
    }
    var testData = new List<TokenizerTestData>();
    foreach (var trainingLine in trainingLines)
    {
        var testDataPoint = new TokenizerTestData(trainingLine, splitMarker.ToString());
        testData.Add(testDataPoint);
    }
    var results = tokenizer.RunAgainstTestData(testData);
    Console.WriteLine("Accuracy of model iteration={0}, cut={1}: {2}", iterations, cut, results.GetAccuracy());
}
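
// TrainTokenizer above evaluates accuracy against the training lines but does not write
// the trained model to disk. The sketch below shows how such a model could be persisted,
// reusing the BinaryGisModelWriter pattern from OptimizeTokenizerTraining further down.
// The method name and output path are illustrative assumptions, not part of the original file.
private static void TrainAndPersistTokenizer(int iterations, int cut)
{
    // train on the same Tokenize input directory used by TrainTokenizer
    var trainingFiles = Directory.GetFiles(CurrentDirectory + "Input/Tokenize/");
    var model = MaximumEntropyTokenizer.Train(trainingFiles, iterations, cut);

    // persist the trained model (hypothetical output file name)
    var outputFilePath = CurrentDirectory + "Output/Tokenize.nbin";
    new BinaryGisModelWriter().Persist(model, outputFilePath);
    Console.WriteLine("Model written to '{0}'.", outputFilePath);
}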
private static void OptimizeTokenizerTraining(bool includeAllCapsExamples = false)
{
    // all directories in Input folder
    var inputFolderPath = CurrentDirectory + "Input/";
    var allDirectories = Directory.GetDirectories(inputFolderPath);
    Console.WriteLine("Pick the model to train:");
    for (var i = 0; i < allDirectories.Length; i++)
    {
        Console.WriteLine("{0} - {1}", i, Path.GetFileName(allDirectories[i]));
    }

    // read directory chosen by user
    int directoryIndexPicked = LoopUntilValidUserInput(
        input => int.Parse(input),
        i => i < allDirectories.Length,
        string.Format("Please enter a number in [0..{0}]", allDirectories.Length - 1));

    // read user parameters
    Console.WriteLine("Enter the iteration values to test, separated by a comma (ex: 10,100,200)");
    var iterations = LoopUntilValidUserInput(
        input => input.Split(',').Select(s => int.Parse(s.Trim())).ToList(),
        li => li != null && li.Any(),
        "At least one iteration value is required");
    Console.WriteLine("Enter the cut values to test, separated by a comma (ex: 1,2,5)");
    var cuts = LoopUntilValidUserInput(
        input => input.Split(',').Select(s => int.Parse(s.Trim())).ToList(),
        li => li != null && li.Any(),
        "At least one cut value is required");

    // train model file
    var directory = allDirectories[directoryIndexPicked];
    var allTrainFiles = Directory.GetFiles(directory, "*.train") /*.Where(f => f.Contains("wsj"))*/;
    Console.WriteLine("Training model with files: {0}", string.Join(", ", allTrainFiles.Select(f => Path.GetFileNameWithoutExtension(f))));

    // load training data
    var trainingLines = new List<string>();
    foreach (var file in allTrainFiles /*.Where(f => f.Contains("wsj"))*/)
    {
        trainingLines.AddRange(File.ReadAllLines(file));
    }

    // grid-search over all (iteration, cut) pairs and keep the best-performing combination
    var bestIterationValue = iterations.First();
    var bestCutValue = cuts.First();
    var bestAccuracy = 0.0d;
    foreach (var iteration in iterations)
    {
        foreach (var cut in cuts)
        {
            var model = MaximumEntropyTokenizer.Train(allTrainFiles, iteration, cut, includeAllCapsExamples: includeAllCapsExamples);

            // compute accuracy
            var tokenizer = new MaximumEntropyTokenizer(model);

            // test data
            var testData = new List<TokenizerTestData>();
            foreach (var trainingLine in trainingLines)
            {
                var testDataPoint = new TokenizerTestData(trainingLine, "|");
                testData.Add(testDataPoint);
            }
            var results = tokenizer.RunAgainstTestData(testData);
            Console.WriteLine("Accuracy of model iteration={0}, cut={1}: {2}", iteration, cut, results.GetAccuracy());
            if (results.GetAccuracy() > bestAccuracy)
            {
                bestAccuracy = results.GetAccuracy();
                bestIterationValue = iteration;
                bestCutValue = cut;
            }
        }
    }

    // persist model
    var outputFilePath = CurrentDirectory + "Output/" + Path.GetFileName(directory) + ".nbin";
    Console.WriteLine("Persisting model for iteration={0} and cut={1} to file '{2}'...", bestIterationValue, bestCutValue, outputFilePath);
    var bestModel = MaximumEntropyTokenizer.Train(allTrainFiles, bestIterationValue, bestCutValue);
    new BinaryGisModelWriter().Persist(bestModel, outputFilePath);
    Console.WriteLine("Output file written.");
}
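
// OptimizeTokenizerTraining relies on a LoopUntilValidUserInput helper that is not shown in
// this section. The following is a minimal sketch of what such a helper could look like,
// assuming it reads a console line, converts it with the supplied function, and re-prompts
// until the converted value passes the supplied check. The parameter names, the recursive
// retry, and the FormatException handling are illustrative assumptions, not the original code.
private static T LoopUntilValidUserInput<T>(Func<string, T> extractValue, Func<T, bool> isValid, string invalidInputMessage)
{
    var input = Console.ReadLine();
    try
    {
        var value = extractValue(input);
        if (isValid(value))
        {
            return value;
        }
    }
    catch (FormatException)
    {
        // fall through to the retry below when parsing fails (ex: non-numeric input)
    }
    Console.WriteLine(invalidInputMessage);
    return LoopUntilValidUserInput(extractValue, isValid, invalidInputMessage);
}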