/// <summary>
/// Interactive console loop: reads a phrase, predicts its accented form via the loaded
/// n-gram models, prints the prediction with elapsed seconds, and appends both the query
/// and the prediction to <c>g_LogFile</c>. Exits when input contains "quit" or the input
/// stream ends.
/// </summary>
/// <param name="models">Loaded language models; passed through to <c>Predict</c>.</param>
/// <param name="converter">Accent converter; passed through to <c>Predict</c>.</param>
public static void TestUserQuery(List<ILanguageModel> models, AccentConverter converter)
{
    while (true)
    {
        Console.Write("Enter a phrase: ");
        string data = Console.ReadLine();

        // Fix: ReadLine returns null at end-of-stream (e.g. redirected stdin);
        // the original dereferenced it unconditionally and threw NullReferenceException.
        // NOTE(review): Contains("quit") also exits on phrases such as "mosquito";
        // kept as-is for backward compatibility — consider an exact match.
        if (data == null || data.Contains("quit"))
        {
            break;
        }

        File.AppendAllText(g_LogFile, data + "\r\n");

        Clocker.Tick();
        // Predict returns word candidates; wc[0] is the top candidate for each word.
        string predictedWords = String.Join(" ", Predict(data, models, converter).Select(wc => wc[0]));
        string predicted = String.Format("Predicted: {0} - {1} seconds", predictedWords, Clocker.Seconds());

        Console.WriteLine(predicted);
        Console.WriteLine();
        File.AppendAllText(g_LogFile, predicted + "\r\n\r\n");
    }
}
/// <summary>
/// Loads the 1- to 3-gram models for the converter's language (Vietnamese or French
/// model file pattern) and starts the interactive query loop.
/// </summary>
/// <param name="converter">Selects the model file pattern; Vietnamese if <c>VietConverter</c>.</param>
/// <param name="modelVersion">Model format version forwarded to <c>ModelFactory.LoadModel</c>.</param>
public static void Test(AccentConverter converter, int modelVersion)
{
    string modelFile = (converter is VietConverter) ? g_ModelFile : g_FrenchModelFile;

    // Slot n-1 holds the n-gram model; slots for missing model files remain null.
    List<ILanguageModel> models = new ILanguageModel[5].ToList();

    for (int n = 1; n <= 3; n++)
    {
        string fileName = String.Format(modelFile, n);
        if (!File.Exists(fileName))
        {
            continue;
        }

        Console.WriteLine("Loading {0}-gram model...", n);
        Clocker.Tick();
        // Fix: load the file that was just verified to exist. The original indexed into
        // the Directory.GetFiles(...) wildcard results (modelFiles[n - 1]), which throws
        // IndexOutOfRangeException or loads the wrong gram order whenever some model
        // files are missing or enumeration order differs from the gram order.
        models[n - 1] = ModelFactory.LoadModel(fileName, modelVersion);
        Clocker.Tock();
    }

    TestUserQuery(models, converter);
}
/// <summary>
/// Trains n-gram accent models for each gram order in [minGram, maxGram] whose output
/// file does not already exist, and writes each model to
/// <c>String.Format(outModelFilePattern, n)</c> in binary form.
/// For every window of n consecutive words, the model maps the space-joined
/// accent-stripped words to the accent pattern extracted from the accented words.
/// </summary>
/// <param name="outModelFilePattern">Format pattern with a {0} placeholder for the gram order.</param>
/// <param name="inputTrainingFiles">Training corpus file pairs, parsed by <c>TextParser.ParseData</c>.</param>
/// <param name="modelVersion">Model implementation version for <c>ModelFactory.CreateModelByVersion</c>.</param>
/// <param name="minGram">Smallest gram order to train (inclusive).</param>
/// <param name="maxGram">Largest gram order to train (inclusive).</param>
/// <param name="converter">Converts accented words to their raw (accent-stripped) form.</param>
/// <param name="learnKnownWordsOnly">When true, windows containing any word not in the
/// dictionary are skipped entirely.</param>
public static void Train(string outModelFilePattern, TupleList <string, string> inputTrainingFiles, int modelVersion, int minGram, int maxGram, AccentConverter converter, bool learnKnownWordsOnly = true) {
    // Collect only the gram orders whose model file is missing; existing files are
    // treated as already trained and are never overwritten.
    List <int> grams = new List <int>();
    for (int n = minGram; n <= maxGram; n++) {
        if (!File.Exists(String.Format(outModelFilePattern, n))) {
            grams.Add(n);
        }
    }
    if (grams.Count == 0) {
        return;
    }

    // Load dictionary of raw words (null when all words are learned unconditionally).
    Dictionary <string, int> dictionary = learnKnownWordsOnly ? LazyTrainer.ReadDictionary(DataManager.DictionaryFile) : null;

    // Load segments from training data; each segment is a word list.
    List <List <string> > segments = TextParser.ParseData(inputTrainingFiles);

    // Reused across all windows to avoid per-window allocations; cleared each iteration.
    StringBuilder sbRaw = new StringBuilder();
    StringBuilder sbAcc = new StringBuilder();

    foreach (int n in grams) {
        // iG is the window size minus one (index of the last word in the window).
        int iG = n - 1;
        Console.WriteLine("Building {0}-gram ...", iG + 1);
        Clocker.Tick();
        using (BinaryWriter bwModel = new BinaryWriter(File.Create(String.Format(outModelFilePattern, iG + 1)))) {
            ILanguageModel igGram = ModelFactory.CreateModelByVersion(modelVersion);
            for (int iS = 0; iS < segments.Count; iS++) {
                List <string> words = segments[iS];
                // Slide an n-word window over the segment.
                for (int i = 0; i < words.Count - iG; i++) {
                    sbRaw.Clear();
                    sbAcc.Clear();
                    bool shouldProceed = true;
                    if (learnKnownWordsOnly) {
                        for (int g = 0; g <= iG; g++) {
                            string accWord = words[i + g];
                            string rawWord = converter.Convert(accWord);
                            // Skip the entire window if any raw word is out-of-dictionary.
                            if (!dictionary.ContainsKey(rawWord)) {
                                shouldProceed = false;
                                break;
                            }
                            // Accented words are concatenated WITHOUT spaces; raw words
                            // are space-separated — this asymmetry is what ExtractAccents
                            // and the model key format expect.
                            sbAcc.Append(accWord);
                            sbRaw.Append(rawWord);
                            if (g < iG) {
                                sbRaw.Append(" ");
                            }
                        }
                    } else {
                        for (int g = 0; g <= iG; g++) {
                            // Same accumulation as above, without the dictionary filter.
                            sbAcc.Append(words[i + g]);
                            sbRaw.Append(converter.Convert(words[i + g]));
                            if (g < iG) {
                                sbRaw.Append(" ");
                            }
                        }
                    }
                    if (shouldProceed) {
                        string accents = ExtractAccents(sbAcc.ToString(), converter);
                        igGram.Add(sbRaw.ToString(), accents);
                    }
                }
            }
            igGram.WriteToBinary(bwModel);
        }
        Clocker.Tock();
    }
}
/// <summary>
/// Evaluates prediction accuracy over a set of accented segments in parallel:
/// strips accents from each segment, re-predicts them, and compares word-by-word.
/// Mismatched segments are written to <paramref name="logFile"/> with the differing
/// words upper-cased for visibility; segment- and word-level accuracy percentages are
/// printed to the console and appended to the log.
/// </summary>
/// <param name="segments">Ground-truth accented segments.</param>
/// <param name="models">Loaded language models forwarded to <c>Predict</c>.</param>
/// <param name="converter">Vietnamese accent converter used to produce the raw input.</param>
/// <param name="logFile">Output log path; created/overwritten.</param>
private static void TestDataSet(List <string> segments, List <ILanguageModel> models, VietConverter converter, string logFile) {
    Clocker.Tick();
    Console.WriteLine("{0} expected total segments", segments.Count);
    using (StreamWriter sw = new StreamWriter(File.Create(logFile))) {
        // Shared counters; updated via Interlocked from the parallel body below.
        int nCorrectSegments = 0;
        int nTotalSegments = 0;
        long nCorrectWords = 0;
        long nTotalWords = 0;
        // Thread-safe sink for mismatch reports; written to the log after the loop
        // completes (so ordering of entries is nondeterministic).
        var logs = new ConcurrentBag <string>();
        Parallel.ForEach(
            segments,
            // NOTE(review): 2x ProcessorCount oversubscribes CPU-bound work — presumably
            // intentional to hide Predict latency; confirm against profiling.
            new ParallelOptions { MaxDegreeOfParallelism = 2 * Environment.ProcessorCount },
            actualData => {
                // Strip accents, then predict them back; the round trip should
                // reproduce the original segment when the model is correct.
                string rawData = converter.Convert(actualData);
                string predicted = String.Join(" ", Predict(rawData, models, converter).Select(wc => wc[0]));
                // Split(new char[0]) splits on whitespace.
                string[] wQuery = actualData.Split(new char[0]);
                string[] wPredicted = predicted.Split(new char[0]);
                bool match = true;
                if (wPredicted.Length != wQuery.Length) {
                    match = false;
                } else {
                    for (int i = 0; i < wQuery.Length; i++) {
                        if (wQuery[i] != wPredicted[i]) {
                            match = false;
                            // Upper-case the differing pair so it stands out in the log;
                            // these arrays are per-iteration locals, so mutation is safe.
                            wQuery[i] = wQuery[i].ToUpper();
                            wPredicted[i] = wPredicted[i].ToUpper();
                        } else {
                            Interlocked.Increment(ref nCorrectWords);
                        }
                    }
                }
                if (!match) {
                    logs.Add(String.Format("{0}\r\n{1}\r\n--------------------", String.Join(" ", wQuery), String.Join(" ", wPredicted)));
                } else {
                    Interlocked.Increment(ref nCorrectSegments);
                }
                Interlocked.Increment(ref nTotalSegments);
                // Racy (non-interlocked) read for progress reporting only; a skipped or
                // duplicated progress line is harmless.
                if (nTotalSegments % 10000 == 0) {
                    Console.WriteLine(nTotalSegments);
                }
                Interlocked.Add(ref nTotalWords, wQuery.Length);
            }
        );
        // All parallel work is complete here, so plain reads/writes are safe.
        foreach (string log in logs) {
            sw.WriteLine(log);
        }
        Console.WriteLine("{0} total segments, {1}% segment accuracy", nTotalSegments, ((double)nCorrectSegments / nTotalSegments) * 100.0);
        Console.WriteLine("{0} total words, {1}% word accuracy", nTotalWords, ((double)nCorrectWords / nTotalWords) * 100.0);
        sw.WriteLine("{0} total segments, {1}% segment accuracy", nTotalSegments, ((double)nCorrectSegments / nTotalSegments) * 100.0);
        sw.WriteLine("{0} total words, {1}% word accuracy", nTotalWords, ((double)nCorrectWords / nTotalWords) * 100.0);
    }
    Clocker.Tock();
}