public static Dictionary<int, TupleList<string, short>> LoadModel(string segmentModelFile, string wordModelFile)
{
    Dictionary<int, TupleList<long, short>> wordModel = LazyTrainer.ReadWordModel(wordModelFile);

    // Read every segment string, keyed by its byte offset in the segment file
    Dictionary<long, string> offsetToSegmentMap = new Dictionary<long, string>();
    using (BinaryReader brSegment = new BinaryReader(File.OpenRead(segmentModelFile)))
    {
        while (brSegment.BaseStream.Position != brSegment.BaseStream.Length)
        {
            offsetToSegmentMap.Add(brSegment.BaseStream.Position, brSegment.ReadString());
        }
    }

    // Resolve each word's (offset, position) locations into (segment text, position)
    Dictionary<int, TupleList<string, short>> model = new Dictionary<int, TupleList<string, short>>();
    foreach (int wordIdx in wordModel.Keys)
    {
        model.Add(wordIdx, new TupleList<string, short>());
        foreach (Tuple<long, short> segmentLocation in wordModel[wordIdx])
        {
            model[wordIdx].Add(offsetToSegmentMap[segmentLocation.Item1], segmentLocation.Item2);
        }
    }
    return model;
}
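// ReadWordModel (and its WriteWordModel counterpart) is called above but not shown in
// this file. The sketch below is a minimal, assumed binary layout consistent with how
// LoadModel consumes the result: per word, an Int32 key, an Int32 entry count, then
// count pairs of (Int64 segment offset, Int16 word position). The project's actual
// format may differ; treat this as an illustration, not the real implementation.
private static Dictionary<int, TupleList<long, short>> ReadWordModel(string wordModelFile)
{
    var model = new Dictionary<int, TupleList<long, short>>();
    using (BinaryReader br = new BinaryReader(File.OpenRead(wordModelFile)))
    {
        while (br.BaseStream.Position != br.BaseStream.Length)
        {
            int wordKey = br.ReadInt32();
            int count = br.ReadInt32();
            var locations = new TupleList<long, short>();
            for (int i = 0; i < count; i++)
            {
                // Offset of the containing segment, and the word's index within it
                locations.Add(br.ReadInt64(), br.ReadInt16());
            }
            model.Add(wordKey, locations);
        }
    }
    return model;
}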
private static void Train(string dictionaryFile, string segmentModelFile, string wordModelFile)
{
    System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
    sw.Start();

    List<List<string>> accentSegmentWordList = TextParser.ParseData(new TupleList<string, string>
    {
        { @"..\..\..\..\Data", "batches_train.txt" },
        { @"..\..\..\..\Data\downloaded", "*.txt" }
    });

    Console.Write("Generating segments... ");
    // List of segment offsets in the binary file
    List<long> offsets = LazyTrainer.WriteSegmentModel(accentSegmentWordList, segmentModelFile);
    Console.WriteLine(sw.ElapsedMilliseconds);
    sw.Restart();

    Console.Write("Loading dictionary of raw words... ");
    Dictionary<string, int> dictionary = LazyTrainer.ReadDictionary(dictionaryFile);
    Console.WriteLine(sw.ElapsedMilliseconds);
    sw.Restart();

    Console.Write("Mapping words to segments... ");
    VietConverter tc = new VietConverter();
    Dictionary<int, TupleList<long, short>> model = new Dictionary<int, TupleList<long, short>>();

    // Pass through all segments, recording where each word appears in the segment
    for (int iSegment = 0; iSegment < accentSegmentWordList.Count; iSegment++)
    {
        List<string> wordSegment = accentSegmentWordList[iSegment];
        for (short iWord = 0; iWord < wordSegment.Count; iWord++)
        {
            string rawWord = tc.Convert(wordSegment[iWord]);
            // If the word is in the dictionary, record (segment offset, position in segment)
            if (dictionary.ContainsKey(rawWord))
            {
                int wordKey = dictionary[rawWord];
                if (!model.ContainsKey(wordKey))
                {
                    model.Add(wordKey, new TupleList<long, short>());
                }
                model[wordKey].Add(offsets[iSegment], iWord);
            }
        }
    }

    LazyTrainer.WriteWordModel(model, wordModelFile);
    Console.WriteLine(sw.ElapsedMilliseconds);
}
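// WriteSegmentModel is called above but not defined in this excerpt. A minimal sketch
// consistent with how the segment file is read back elsewhere in this class:
// BinaryWriter.Write(string) length-prefixes each segment, and the stream position
// captured before each write becomes the offset key used by LoadModel and Test.
// The real implementation may differ; this is an assumption for illustration.
private static List<long> WriteSegmentModel(List<List<string>> segments, string segmentModelFile)
{
    var offsets = new List<long>();
    using (BinaryWriter bw = new BinaryWriter(File.Create(segmentModelFile)))
    {
        foreach (List<string> segment in segments)
        {
            // Remember where this segment starts so it can be sought later
            offsets.Add(bw.BaseStream.Position);
            bw.Write(String.Join(" ", segment));
        }
    }
    return offsets;
}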
public static void Run(bool conserveMemory)
{
    // Train only if either model file is missing, then run the interactive test loop
    if (!File.Exists(g_SegmentModelFile) || !File.Exists(g_WordModelFile))
    {
        Directory.CreateDirectory(Path.GetDirectoryName(g_SegmentModelFile));
        LazyTrainer.Train(g_DictionaryFile, g_SegmentModelFile, g_WordModelFile);
    }
    LazyTrainer.Test(g_DictionaryFile, g_SegmentModelFile, g_WordModelFile, conserveMemory);
}
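// The g_* fields referenced by Run and Test are not declared in this excerpt. A
// plausible shape is shown below; the field names come from the code above, but
// every path value here is purely hypothetical.
private static readonly string g_DictionaryFile   = @"..\..\..\..\Data\dictionary.txt"; // hypothetical path
private static readonly string g_SegmentModelFile = @"..\..\..\..\Models\segments.bin"; // hypothetical path
private static readonly string g_WordModelFile    = @"..\..\..\..\Models\words.bin";    // hypothetical path
private static readonly string g_LogFile          = @"..\..\..\..\Models\lazy.log";     // hypothetical path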
private static void Test(string dictionaryFile, string segmentModelFile, string wordModelFile, bool conserveMemory)
{
    Console.OutputEncoding = Encoding.UTF8;
    System.Diagnostics.Stopwatch watch = new System.Diagnostics.Stopwatch();
    watch.Start();
    Console.Write("Starting up...");

    Dictionary<string, int> dictionary = LazyTrainer.ReadDictionary(dictionaryFile);
    Dictionary<int, TupleList<long, short>> model = LazyTrainer.ReadWordModel(wordModelFile);
    VietConverter tc = new VietConverter();

    using (BinaryReader brSegment = new BinaryReader(File.OpenRead(segmentModelFile)))
    {
        // If not conserving memory, preload every segment into a map keyed by offset
        Dictionary<long, string> offsetToSegmentMap = new Dictionary<long, string>();
        if (!conserveMemory)
        {
            while (brSegment.BaseStream.Position != brSegment.BaseStream.Length)
            {
                offsetToSegmentMap.Add(brSegment.BaseStream.Position, brSegment.ReadString());
            }
        }

        Console.WriteLine(" - {0} seconds", watch.ElapsedMilliseconds / 1000.0);
        Console.WriteLine();

        while (true)
        {
            Console.Write("Enter a phrase: ");
            string data = Console.ReadLine();
            // Guard against a null line (end of input) as well as the quit command
            if (data == null || data.Contains("quit"))
            {
                break;
            }
            File.AppendAllText(g_LogFile, data + "\r\n");
            watch.Restart();

            List<string> prediction = new List<string>();
            string[] words = data.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            for (short iw = 0; iw < words.Length; iw++)
            {
                string w = words[iw];
                if (!dictionary.ContainsKey(w))
                {
                    prediction.Add(w);
                    File.AppendAllText(g_LogFile, String.Format("{0} is not a known word, leaving as is\r\n", w));
                }
                else
                {
                    int wKey = dictionary[w];
                    if (!model.ContainsKey(wKey))
                    {
                        // Known word with no recorded segments: keep it unchanged
                        // (previously the word was silently dropped from the prediction)
                        prediction.Add(w);
                        continue;
                    }

                    var locations = model[wKey];
                    short maxMatchCount = -1;
                    List<string> mostLikelySequence = new List<string>();
                    string mostLikelyWord = String.Empty;

                    // Fetch the segments containing this word, either by seeking the
                    // file (conserveMemory) or from the preloaded map
                    List<string> segments = new List<string>();
                    if (conserveMemory)
                    {
                        foreach (var loc in locations)
                        {
                            brSegment.BaseStream.Position = loc.Item1;
                            segments.Add(brSegment.ReadString());
                        }
                    }
                    else
                    {
                        foreach (var loc in locations)
                        {
                            segments.Add(offsetToSegmentMap[loc.Item1]);
                        }
                    }

                    // Score every candidate segment in parallel
                    ConcurrentBag<Tuple<short, List<string>, string>> results = new ConcurrentBag<Tuple<short, List<string>, string>>();
                    Parallel.For(0, locations.Count, new ParallelOptions() { MaxDegreeOfParallelism = 8 }, (int i) =>
                    {
                        short iWord = locations[i].Item2;

                        // The accented word list
                        List<string> actual = segments[i].Split(new char[0], StringSplitOptions.RemoveEmptyEntries).ToList();

                        // The converted raw word list
                        List<string> rawActual = new List<string>();
                        foreach (string aw in actual)
                        {
                            rawActual.Add(tc.Convert(aw));
                        }

                        short matchCount = 0;
                        LazyTrainer.MatchSequence(words.ToList(), iw, rawActual, iWord, out matchCount);
                        results.Add(new Tuple<short, List<string>, string>(matchCount, actual, actual[iWord]));
                    });

                    // Keep the candidate with the longest n-gram match
                    foreach (var item in results)
                    {
                        if (item.Item1 >= maxMatchCount)
                        {
                            maxMatchCount = item.Item1;
                            mostLikelySequence = item.Item2;
                            mostLikelyWord = item.Item3;
                        }
                    }

                    prediction.Add(mostLikelyWord);
                    File.AppendAllText(g_LogFile, String.Format("{0} has a {1}-gram match in: {2}\r\n", w, maxMatchCount, String.Join(" ", mostLikelySequence)));
                }
            }

            string predicted = String.Format("Predicted: {0} - {1} seconds", String.Join(" ", prediction), watch.ElapsedMilliseconds / 1000.0);
            Console.WriteLine(predicted);
            Console.WriteLine();
            File.AppendAllText(g_LogFile, predicted + "\r\n\r\n");
        }
    }
}
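// MatchSequence is called above but not defined in this excerpt. One plausible
// implementation, sketched under the following assumption: the query word at iQuery
// and the candidate raw word at iActual already match (which holds by construction of
// the word model), so the score is the size of the contiguous window of agreement
// grown left and right from that anchor. The project's actual scoring may differ.
private static void MatchSequence(List<string> query, short iQuery, List<string> rawActual, short iActual, out short matchCount)
{
    matchCount = 1; // the anchor word itself matches by construction

    // Grow the match to the left while surrounding raw words agree
    int q = iQuery - 1, a = iActual - 1;
    while (q >= 0 && a >= 0 && query[q] == rawActual[a]) { matchCount++; q--; a--; }

    // Grow the match to the right
    q = iQuery + 1; a = iActual + 1;
    while (q < query.Count && a < rawActual.Count && query[q] == rawActual[a]) { matchCount++; q++; a++; }
}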
public static void Train(string outModelFilePattern, TupleList<string, string> inputTrainingFiles, int modelVersion,
                         int minGram, int maxGram, AccentConverter converter, bool learnKnownWordsOnly = true)
{
    // Only build the n-gram sizes whose model files do not exist yet
    List<int> grams = new List<int>();
    for (int n = minGram; n <= maxGram; n++)
    {
        if (!File.Exists(String.Format(outModelFilePattern, n)))
        {
            grams.Add(n);
        }
    }
    if (grams.Count == 0)
    {
        return;
    }

    // Load dictionary of raw words
    Dictionary<string, int> dictionary = learnKnownWordsOnly ? LazyTrainer.ReadDictionary(DataManager.DictionaryFile) : null;

    // Load segments from training data
    List<List<string>> segments = TextParser.ParseData(inputTrainingFiles);

    StringBuilder sbRaw = new StringBuilder();
    StringBuilder sbAcc = new StringBuilder();
    foreach (int n in grams)
    {
        int iG = n - 1;
        Console.WriteLine("Building {0}-gram ...", n);
        Clocker.Tick();
        using (BinaryWriter bwModel = new BinaryWriter(File.Create(String.Format(outModelFilePattern, n))))
        {
            ILanguageModel igGram = ModelFactory.CreateModelByVersion(modelVersion);
            for (int iS = 0; iS < segments.Count; iS++)
            {
                List<string> words = segments[iS];
                // Slide an n-word window over the segment
                for (int i = 0; i < words.Count - iG; i++)
                {
                    sbRaw.Clear();
                    sbAcc.Clear();
                    bool shouldProceed = true;
                    if (learnKnownWordsOnly)
                    {
                        // Skip any n-gram containing an out-of-dictionary word
                        for (int g = 0; g <= iG; g++)
                        {
                            string accWord = words[i + g];
                            string rawWord = converter.Convert(accWord);
                            if (!dictionary.ContainsKey(rawWord))
                            {
                                shouldProceed = false;
                                break;
                            }
                            sbAcc.Append(accWord);
                            sbRaw.Append(rawWord);
                            if (g < iG)
                            {
                                sbRaw.Append(" ");
                            }
                        }
                    }
                    else
                    {
                        for (int g = 0; g <= iG; g++)
                        {
                            sbAcc.Append(words[i + g]);
                            sbRaw.Append(converter.Convert(words[i + g]));
                            if (g < iG)
                            {
                                sbRaw.Append(" ");
                            }
                        }
                    }
                    if (shouldProceed)
                    {
                        string accents = ExtractAccents(sbAcc.ToString(), converter);
                        igGram.Add(sbRaw.ToString(), accents);
                    }
                }
            }
            igGram.WriteToBinary(bwModel);
        }
        Clocker.Tock();
    }
}
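// ExtractAccents is used above but not shown. A minimal sketch of one plausible
// reading: the model stores the raw n-gram as the key and only the accent-carrying
// information as the value, so this version keeps just the characters whose raw
// conversion differs from the accented form. Both the representation and the helper
// body are assumptions for illustration; only converter.Convert is taken from the
// surrounding code.
private static string ExtractAccents(string accented, AccentConverter converter)
{
    StringBuilder accents = new StringBuilder();
    foreach (char c in accented)
    {
        string raw = converter.Convert(c.ToString());
        // Record only characters that carry a diacritic (their raw form differs)
        if (raw.Length != 1 || raw[0] != c)
        {
            accents.Append(c);
        }
    }
    return accents.ToString();
}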