/// <summary>
/// Rebuilds a sentence by copying characters from <paramref name="accents"/> into
/// <paramref name="rawSentence"/> at every position whose character is a key of
/// <c>converter.RawCharMap</c>. Characters that are not keys of that map are left as-is.
/// </summary>
/// <param name="rawSentence">The sentence whose mapped characters are to be replaced.</param>
/// <param name="accents">
/// Replacement characters, consumed left to right, one per mapped character of
/// <paramref name="rawSentence"/>.
/// </param>
/// <param name="converter">Supplies <c>Convert</c> and <c>RawCharMap</c>.</param>
/// <returns>
/// The rebuilt sentence, or <c>null</c> when any argument is <c>null</c>, when the converted
/// accents run out before all mapped characters are handled, or when a mapped character
/// differs from the converted accent at the same position.
/// </returns>
public static string Accentify(string rawSentence, string accents, AccentConverter converter)
{
    if (rawSentence == null || accents == null || converter == null)
    {
        return null;
    }

    char[] result = rawSentence.ToCharArray();
    char[] convertedAccents = converter.Convert(accents).ToArray();
    int next = 0; // index of the next accent character to consume

    for (int pos = 0; pos < result.Length; pos++)
    {
        char current = result[pos];
        if (!converter.RawCharMap.ContainsKey(current))
        {
            continue;
        }

        // The converted form of the next accent must be exactly the character being replaced.
        bool exhausted = next >= convertedAccents.Length;
        if (exhausted || current != convertedAccents[next])
        {
            return null;
        }

        result[pos] = accents[next];
        next++;
    }

    return new string(result);
}
/// <summary>
/// Collects, in order, the characters of <paramref name="sentence"/> located at the positions
/// where the converted sentence (<c>converter.Convert(sentence)</c>) holds a character that is
/// a key of <c>converter.RawCharMap</c>.
/// </summary>
/// <param name="sentence">The sentence to extract characters from.</param>
/// <param name="converter">Supplies <c>Convert</c> and <c>RawCharMap</c>.</param>
/// <returns>The collected characters as a string (empty when nothing matched).</returns>
/// <remarks>
/// NOTE(review): positions in the converted sentence are used to index the original sentence,
/// so this presumably relies on <c>Convert</c> preserving length - confirm against AccentConverter.
/// </remarks>
public static string ExtractAccents(string sentence, AccentConverter converter)
{
    string converted = converter.Convert(sentence);
    StringBuilder collected = new StringBuilder();

    int index = 0;
    while (index < converted.Length)
    {
        if (converter.RawCharMap.ContainsKey(converted[index]))
        {
            collected.Append(sentence[index]);
        }

        index++;
    }

    return collected.ToString();
}
/// <summary>
/// Builds one n-gram model for every order in [<paramref name="minGram"/>,
/// <paramref name="maxGram"/>] whose output file does not exist yet, and writes each model to
/// <c>String.Format(outModelFilePattern, order)</c>.
/// </summary>
/// <param name="outModelFilePattern">Format string for the model path; <c>{0}</c> receives the n-gram order.</param>
/// <param name="inputTrainingFiles">Training inputs handed to <c>TextParser.ParseData</c>.</param>
/// <param name="modelVersion">Version passed to <c>ModelFactory.CreateModelByVersion</c>.</param>
/// <param name="minGram">Smallest n-gram order to build (inclusive).</param>
/// <param name="maxGram">Largest n-gram order to build (inclusive).</param>
/// <param name="converter">Converter used to produce the raw form of each word.</param>
/// <param name="learnKnownWordsOnly">
/// When <c>true</c>, an n-gram is skipped if the converted form of any of its words is not a
/// key of the dictionary read from <c>DataManager.DictionaryFile</c>.
/// </param>
public static void Train(string outModelFilePattern, TupleList<string, string> inputTrainingFiles, int modelVersion, int minGram, int maxGram, AccentConverter converter, bool learnKnownWordsOnly = true)
{
    // Only orders whose model file is missing need to be built.
    List<int> pendingOrders = new List<int>();
    for (int order = minGram; order <= maxGram; order++)
    {
        string existingPath = String.Format(outModelFilePattern, order);
        if (!File.Exists(existingPath))
        {
            pendingOrders.Add(order);
        }
    }

    if (pendingOrders.Count == 0)
    {
        return;
    }

    // Load dictionary of raw words
    Dictionary<string, int> dictionary = null;
    if (learnKnownWordsOnly)
    {
        dictionary = LazyTrainer.ReadDictionary(DataManager.DictionaryFile);
    }

    // Load segments from training data
    List<List<string>> segments = TextParser.ParseData(inputTrainingFiles);

    StringBuilder rawBuilder = new StringBuilder();
    StringBuilder accBuilder = new StringBuilder();

    foreach (int order in pendingOrders)
    {
        Console.WriteLine("Building {0}-gram ...", order);
        Clocker.Tick();

        using (BinaryWriter modelWriter = new BinaryWriter(File.Create(String.Format(outModelFilePattern, order))))
        {
            ILanguageModel model = ModelFactory.CreateModelByVersion(modelVersion);

            foreach (List<string> words in segments)
            {
                // Number of windows of `order` consecutive words in this segment.
                int windowCount = words.Count - order + 1;

                for (int start = 0; start < windowCount; start++)
                {
                    rawBuilder.Clear();
                    accBuilder.Clear();
                    bool keepWindow = true;

                    for (int offset = 0; offset < order; offset++)
                    {
                        string accWord = words[start + offset];
                        string rawWord = converter.Convert(accWord);

                        if (learnKnownWordsOnly && !dictionary.ContainsKey(rawWord))
                        {
                            keepWindow = false;
                            break;
                        }

                        accBuilder.Append(accWord);
                        rawBuilder.Append(rawWord);

                        // Raw words are space-separated; the accented words are concatenated directly.
                        if (offset < order - 1)
                        {
                            rawBuilder.Append(" ");
                        }
                    }

                    if (!keepWindow)
                    {
                        continue;
                    }

                    string accents = ExtractAccents(accBuilder.ToString(), converter);
                    model.Add(rawBuilder.ToString(), accents);
                }
            }

            model.WriteToBinary(modelWriter);
        }

        Clocker.Tock();
    }
}