Пример #1
0
        /// <summary>
        /// Loads the word model and resolves every segment offset it contains into the
        /// segment text stored at that offset in the segment model file.
        /// </summary>
        /// <param name="segmentModelFile">Binary file read as a sequence of <see cref="BinaryReader.ReadString"/> strings.</param>
        /// <param name="wordModelFile">File handed to <c>LazyTrainer.ReadWordModel</c>.</param>
        /// <returns>
        /// For each word index of the word model, the list of (segment text, short) pairs, where the
        /// short is copied unchanged from the word model entry.
        /// </returns>
        public static Dictionary <int, TupleList <string, short> > LoadModel(string segmentModelFile, string wordModelFile)
        {
            Dictionary<int, TupleList<long, short>> locationsByWord = LazyTrainer.ReadWordModel(wordModelFile);

            // Index every segment by the stream position at which its string record starts.
            var segmentsByOffset = new Dictionary<long, string>();
            using (BinaryReader segmentReader = new BinaryReader(File.OpenRead(segmentModelFile)))
            {
                Stream segmentStream = segmentReader.BaseStream;
                while (segmentStream.Position != segmentStream.Length)
                {
                    long recordOffset = segmentStream.Position;
                    segmentsByOffset.Add(recordOffset, segmentReader.ReadString());
                }
            }

            // Swap each stored offset for the segment text found at that offset.
            var resolved = new Dictionary<int, TupleList<string, short>>();
            foreach (KeyValuePair<int, TupleList<long, short>> entry in locationsByWord)
            {
                TupleList<string, short> occurrences = new TupleList<string, short>();
                resolved.Add(entry.Key, occurrences);

                foreach (Tuple<long, short> location in entry.Value)
                {
                    occurrences.Add(segmentsByOffset[location.Item1], location.Item2);
                }
            }

            return resolved;
        }
Пример #2
0
        /// <summary>
        /// Parses the training data, writes the segment model, then records for every dictionary
        /// word the segment offsets and in-segment positions where it occurs and writes that
        /// mapping as the word model. Elapsed milliseconds of each phase are printed to the console.
        /// </summary>
        /// <param name="dictionaryFile">File handed to <c>LazyTrainer.ReadDictionary</c>.</param>
        /// <param name="segmentModelFile">Destination passed to <c>LazyTrainer.WriteSegmentModel</c>.</param>
        /// <param name="wordModelFile">Destination passed to <c>LazyTrainer.WriteWordModel</c>.</param>
        private static void Train(string dictionaryFile, string segmentModelFile, string wordModelFile)
        {
            System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

            TupleList<string, string> trainingSources = new TupleList<string, string>
            {
                { @"..\..\..\..\Data", "batches_train.txt" },
                { @"..\..\..\..\Data\downloaded", "*.txt" }
            };
            List<List<string>> accentedSegments = TextParser.ParseData(trainingSources);

            Console.Write("Generating segments... ");

            // One offset per segment, indexed in step with accentedSegments below.
            List<long> segmentOffsets = LazyTrainer.WriteSegmentModel(accentedSegments, segmentModelFile);

            Console.WriteLine(timer.ElapsedMilliseconds);
            timer.Restart();

            Console.Write("Loading dictionary of raw words... ");

            Dictionary<string, int> rawWordIds = LazyTrainer.ReadDictionary(dictionaryFile);

            Console.WriteLine(timer.ElapsedMilliseconds);
            timer.Restart();

            Console.Write("Mapping words to segments... ");

            VietConverter converter = new VietConverter();
            Dictionary<int, TupleList<long, short>> occurrencesByWord = new Dictionary<int, TupleList<long, short>>();

            // Walk every word of every segment and note (segment offset, word position) for known words.
            for (int segmentIndex = 0; segmentIndex < accentedSegments.Count; segmentIndex++)
            {
                List<string> segmentWords = accentedSegments[segmentIndex];
                for (short position = 0; position < segmentWords.Count; position++)
                {
                    string rawWord = converter.Convert(segmentWords[position]);

                    int wordId;
                    if (!rawWordIds.TryGetValue(rawWord, out wordId))
                    {
                        // Words outside the dictionary are not part of the word model.
                        continue;
                    }

                    TupleList<long, short> occurrences;
                    if (!occurrencesByWord.TryGetValue(wordId, out occurrences))
                    {
                        occurrences = new TupleList<long, short>();
                        occurrencesByWord.Add(wordId, occurrences);
                    }

                    occurrences.Add(segmentOffsets[segmentIndex], position);
                }
            }

            LazyTrainer.WriteWordModel(occurrencesByWord, wordModelFile);

            Console.WriteLine(timer.ElapsedMilliseconds);
        }
Пример #3
0
 /// <summary>
 /// Trains the models if either model file is missing, then starts the interactive test loop.
 /// </summary>
 /// <param name="conserveMemory">Forwarded unchanged to <c>LazyTrainer.Test</c>.</param>
 public static void Run(bool conserveMemory)
 {
     if (!File.Exists(g_SegmentModelFile) || !File.Exists(g_WordModelFile))
     {
         // Training writes both model files, so make sure each target directory exists.
         // Path.GetDirectoryName yields null/"" for a root or a bare file name, and
         // Directory.CreateDirectory rejects those values, so skip them.
         foreach (string modelFile in new[] { g_SegmentModelFile, g_WordModelFile })
         {
             string modelDirectory = Path.GetDirectoryName(modelFile);
             if (!String.IsNullOrEmpty(modelDirectory))
             {
                 Directory.CreateDirectory(modelDirectory);
             }
         }

         LazyTrainer.Train(g_DictionaryFile, g_SegmentModelFile, g_WordModelFile);
     }
     LazyTrainer.Test(g_DictionaryFile, g_SegmentModelFile, g_WordModelFile, conserveMemory);
 }
Пример #4
0
        /// <summary>
        /// Interactive loop: reads phrases from the console, and for every word picks the replacement
        /// taken from the training segment that yields the highest match count according to
        /// <c>LazyTrainer.MatchSequence</c>. Input and results are appended to <c>g_LogFile</c>.
        /// The loop ends when a line containing "quit" is entered or when console input is exhausted.
        /// </summary>
        /// <param name="dictionaryFile">File handed to <c>LazyTrainer.ReadDictionary</c>.</param>
        /// <param name="segmentModelFile">Binary file holding the segment strings, addressed by stream offset.</param>
        /// <param name="wordModelFile">File handed to <c>LazyTrainer.ReadWordModel</c>.</param>
        /// <param name="conserveMemory">
        /// When true, segments are read from <paramref name="segmentModelFile"/> on demand by seeking;
        /// when false, all segments are loaded into a dictionary up front.
        /// </param>
        private static void Test(string dictionaryFile, string segmentModelFile, string wordModelFile, bool conserveMemory)
        {
            Console.OutputEncoding = Encoding.UTF8;

            System.Diagnostics.Stopwatch watch = new System.Diagnostics.Stopwatch();
            watch.Start();

            Console.Write("Starting up...");

            Dictionary <string, int> dictionary = LazyTrainer.ReadDictionary(dictionaryFile);
            Dictionary <int, TupleList <long, short> > model = LazyTrainer.ReadWordModel(wordModelFile);
            VietConverter tc = new VietConverter();

            using (BinaryReader brSegment = new BinaryReader(File.OpenRead(segmentModelFile)))
            {
                // If not conserving memory then preload everything
                Dictionary <long, string> offsetToSegmentMap = new Dictionary <long, string>();
                if (!conserveMemory)
                {
                    while (brSegment.BaseStream.Position != brSegment.BaseStream.Length)
                    {
                        offsetToSegmentMap.Add(brSegment.BaseStream.Position, brSegment.ReadString());
                    }
                }

                Console.WriteLine(" - {0} seconds", watch.ElapsedMilliseconds / 1000.0);
                Console.WriteLine();

                while (true)
                {
                    Console.Write("Enter a phrase: ");

                    // Console.ReadLine returns null once input is exhausted (EOF on redirected
                    // stdin, Ctrl+Z/Ctrl+D); treat that the same as an explicit quit.
                    string data = Console.ReadLine();
                    if (data == null || data.Contains("quit"))
                    {
                        break;
                    }

                    File.AppendAllText(g_LogFile, data + "\r\n");

                    watch.Restart();

                    List <string> prediction = new List <string>();
                    string[]      words      = data.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                    for (short iw = 0; iw < words.Length; iw++)
                    {
                        string w = words[iw];

                        int wKey;
                        if (!dictionary.TryGetValue(w, out wKey))
                        {
                            prediction.Add(w);

                            File.AppendAllText(g_LogFile, String.Format("{0} is not a known word, leaving as is\r\n", w));
                            continue;
                        }

                        TupleList <long, short> locations;
                        if (!model.TryGetValue(wKey, out locations))
                        {
                            // The word is in the dictionary but the word model has no entry for it.
                            // Keep it unchanged so the predicted phrase does not lose a word.
                            prediction.Add(w);

                            File.AppendAllText(g_LogFile, String.Format("{0} has no recorded occurrences, leaving as is\r\n", w));
                            continue;
                        }

                        short         maxMatchCount      = -1;
                        List <string> mostLikelySequence = new List <string>();
                        string        mostLikelyWord     = String.Empty;

                        // Fetch the text of every segment this word occurs in, in the order of `locations`.
                        List <string> segments = new List <string>();
                        if (conserveMemory)
                        {
                            foreach (var loc in locations)
                            {
                                brSegment.BaseStream.Position = loc.Item1;
                                segments.Add(brSegment.ReadString());
                            }
                        }
                        else
                        {
                            foreach (var loc in locations)
                            {
                                segments.Add(offsetToSegmentMap[loc.Item1]);
                            }
                        }

                        // One slot per location. Each parallel iteration writes only its own slot, so
                        // no synchronization is needed and the results keep the order of `locations`,
                        // which makes the tie-break below deterministic.
                        Tuple <short, List <string>, string>[] results = new Tuple <short, List <string>, string>[locations.Count];

                        Parallel.For(0, locations.Count, new ParallelOptions()
                        {
                            MaxDegreeOfParallelism = 8
                        }, (int i) =>
                        {
                            short iWord = locations[i].Item2;

                            // The accented word list
                            List <string> actual = segments[i].Split(new char[0], StringSplitOptions.RemoveEmptyEntries).ToList();

                            // The converted raw word list
                            List <string> rawActual = new List <string>();

                            foreach (string aw in actual)
                            {
                                rawActual.Add(tc.Convert(aw));
                            }

                            // NOTE(review): a fresh copy of the input words is passed per iteration, as in
                            // the original; whether MatchSequence mutates its list arguments is not visible here.
                            short matchCount = 0;
                            LazyTrainer.MatchSequence(words.ToList(), iw, rawActual, iWord, out matchCount);

                            results[i] = new Tuple <short, List <string>, string>(matchCount, actual, actual[iWord]);
                        });

                        // Highest match count wins; on ties the latest location in the word model wins.
                        foreach (var item in results)
                        {
                            if (item.Item1 >= maxMatchCount)
                            {
                                maxMatchCount      = item.Item1;
                                mostLikelySequence = item.Item2;
                                mostLikelyWord     = item.Item3;
                            }
                        }

                        prediction.Add(mostLikelyWord);

                        File.AppendAllText(g_LogFile,
                                           String.Format("{0} has a {1}-gram match in: {2}\r\n", w, maxMatchCount, String.Join(" ", mostLikelySequence)));
                    }
                    string predicted = String.Format("Predicted: {0} - {1} seconds", String.Join(" ", prediction), watch.ElapsedMilliseconds / 1000.0);
                    Console.WriteLine(predicted);
                    Console.WriteLine();

                    File.AppendAllText(g_LogFile, predicted + "\r\n\r\n");
                }
            }
        }
Пример #5
0
        /// <summary>
        /// Builds one n-gram model file for every gram size between <paramref name="minGram"/> and
        /// <paramref name="maxGram"/> (inclusive) whose output file does not exist yet.
        /// </summary>
        /// <param name="outModelFilePattern">Format string taking the gram size as argument {0}; yields the output file path.</param>
        /// <param name="inputTrainingFiles">Training inputs handed to <c>TextParser.ParseData</c>.</param>
        /// <param name="modelVersion">Version passed to <c>ModelFactory.CreateModelByVersion</c>.</param>
        /// <param name="minGram">Smallest gram size to build.</param>
        /// <param name="maxGram">Largest gram size to build.</param>
        /// <param name="converter">Converter applied to each word to obtain its raw form.</param>
        /// <param name="learnKnownWordsOnly">
        /// When true, a gram is skipped as soon as one of its raw words is missing from the dictionary
        /// read from <c>DataManager.DictionaryFile</c>.
        /// </param>
        public static void Train(string outModelFilePattern, TupleList <string, string> inputTrainingFiles, int modelVersion, int minGram, int maxGram, AccentConverter converter, bool learnKnownWordsOnly = true)
        {
            // Collect the gram sizes that still need a model file.
            List<int> pendingGrams = new List<int>();
            for (int size = minGram; size <= maxGram; size++)
            {
                if (File.Exists(String.Format(outModelFilePattern, size)))
                {
                    continue;
                }
                pendingGrams.Add(size);
            }

            if (pendingGrams.Count == 0)
            {
                return;
            }

            // The dictionary is only needed when filtering to known words.
            Dictionary<string, int> knownWords = learnKnownWordsOnly ? LazyTrainer.ReadDictionary(DataManager.DictionaryFile) : null;

            List<List<string>> segments = TextParser.ParseData(inputTrainingFiles);

            // Reused across all grams; cleared before each one.
            StringBuilder rawGram      = new StringBuilder();
            StringBuilder accentedGram = new StringBuilder();

            foreach (int gramSize in pendingGrams)
            {
                // Offset of the last word inside a gram of this size.
                int lastOffset = gramSize - 1;

                Console.WriteLine("Building {0}-gram ...", gramSize);

                Clocker.Tick();

                using (BinaryWriter modelWriter = new BinaryWriter(File.Create(String.Format(outModelFilePattern, gramSize))))
                {
                    ILanguageModel gramModel = ModelFactory.CreateModelByVersion(modelVersion);

                    foreach (List<string> words in segments)
                    {
                        for (int start = 0; start < words.Count - lastOffset; start++)
                        {
                            rawGram.Clear();
                            accentedGram.Clear();

                            bool allWordsAccepted = true;
                            for (int k = 0; k <= lastOffset; k++)
                            {
                                string accentedWord = words[start + k];
                                string rawWord      = converter.Convert(accentedWord);

                                if (learnKnownWordsOnly && !knownWords.ContainsKey(rawWord))
                                {
                                    allWordsAccepted = false;
                                    break;
                                }

                                // Accented words are concatenated directly; raw words are space-separated.
                                accentedGram.Append(accentedWord);
                                rawGram.Append(rawWord);
                                if (k < lastOffset)
                                {
                                    rawGram.Append(" ");
                                }
                            }

                            if (!allWordsAccepted)
                            {
                                continue;
                            }

                            string accents = ExtractAccents(accentedGram.ToString(), converter);
                            gramModel.Add(rawGram.ToString(), accents);
                        }
                    }

                    gramModel.WriteToBinary(modelWriter);
                }

                Clocker.Tock();
            }
        }