Beispiel #1
0
        /// <summary>
        /// Loads the 1- to 3-gram models that exist on disk for the given converter and
        /// hands them to <c>TestUserQuery</c>.
        /// </summary>
        /// <param name="converter">
        /// Accent converter under test. A <c>VietConverter</c> selects <c>g_ModelFile</c>;
        /// any other converter selects <c>g_FrenchModelFile</c>.
        /// </param>
        /// <param name="modelVersion">Model version forwarded to <c>ModelFactory.LoadModel</c>.</param>
        public static void Test(AccentConverter converter, int modelVersion)
        {
            string modelFile = (converter is VietConverter) ? g_ModelFile : g_FrenchModelFile;

            // Five slots, all initially null; slot (n - 1) receives the n-gram model when its file exists.
            List<ILanguageModel> models = new ILanguageModel[5].ToList();

            for (int n = 1; n <= 3; n++)
            {
                string fileName = String.Format(modelFile, n);
                if (!File.Exists(fileName))
                {
                    continue;
                }

                Console.WriteLine("Loading {0}-gram model...", n);

                Clocker.Tick();

                // Load exactly the file whose existence was just verified. Indexing into a
                // directory listing by (n - 1) is unsafe: the listing has no guaranteed order
                // and has fewer entries than n whenever a lower-order model file is missing.
                models[n - 1] = ModelFactory.LoadModel(fileName, modelVersion);

                Clocker.Tock();
            }

            TestUserQuery(models, converter);
        }
Beispiel #2
0
        /// <summary>
        /// Builds one n-gram model per order in [<paramref name="minGram"/>, <paramref name="maxGram"/>]
        /// whose output file does not exist yet, and writes each model to
        /// <c>String.Format(outModelFilePattern, n)</c>.
        /// </summary>
        /// <param name="outModelFilePattern">Format string for the output path; <c>{0}</c> is replaced by the n-gram order.</param>
        /// <param name="inputTrainingFiles">Training inputs, forwarded unchanged to <c>TextParser.ParseData</c>.</param>
        /// <param name="modelVersion">Model version forwarded to <c>ModelFactory.CreateModelByVersion</c>.</param>
        /// <param name="minGram">Lowest n-gram order to build (inclusive).</param>
        /// <param name="maxGram">Highest n-gram order to build (inclusive).</param>
        /// <param name="converter">Converter used to derive the raw form of each word and passed to <c>ExtractAccents</c>.</param>
        /// <param name="learnKnownWordsOnly">
        /// When true, an n-gram is learned only if the raw form of every one of its words is a key
        /// of the dictionary read from <c>DataManager.DictionaryFile</c>.
        /// </param>
        public static void Train(string outModelFilePattern, TupleList<string, string> inputTrainingFiles, int modelVersion, int minGram, int maxGram, AccentConverter converter, bool learnKnownWordsOnly = true)
        {
            // Only orders whose model file is missing are built; existing files are never overwritten.
            List<int> grams = new List<int>();
            for (int n = minGram; n <= maxGram; n++)
            {
                if (!File.Exists(String.Format(outModelFilePattern, n)))
                {
                    grams.Add(n);
                }
            }
            if (grams.Count == 0)
            {
                return;
            }

            // Load dictionary of raw words (only needed when filtering by known words)
            Dictionary<string, int> dictionary = learnKnownWordsOnly ? LazyTrainer.ReadDictionary(DataManager.DictionaryFile) : null;

            // Load segments from training data
            List<List<string>> segments = TextParser.ParseData(inputTrainingFiles);

            // Reused across all n-grams to avoid reallocating the buffers.
            StringBuilder sbRaw = new StringBuilder();
            StringBuilder sbAcc = new StringBuilder();

            foreach (int n in grams)
            {
                int lastOffset = n - 1;
                Console.WriteLine("Building {0}-gram ...", n);

                Clocker.Tick();

                // Build the complete model in memory BEFORE creating the output file. If building
                // throws, no empty file is left behind that would make the File.Exists check above
                // skip this order on every later run.
                ILanguageModel model = ModelFactory.CreateModelByVersion(modelVersion);
                foreach (List<string> words in segments)
                {
                    for (int start = 0; start < words.Count - lastOffset; start++)
                    {
                        sbRaw.Clear();
                        sbAcc.Clear();

                        bool shouldProceed = true;
                        for (int offset = 0; offset <= lastOffset; offset++)
                        {
                            string accWord = words[start + offset];
                            string rawWord = converter.Convert(accWord);

                            // A single unknown word disqualifies the whole n-gram.
                            if (learnKnownWordsOnly && !dictionary.ContainsKey(rawWord))
                            {
                                shouldProceed = false;
                                break;
                            }

                            // Accented words are concatenated without a separator; raw words are space-separated.
                            sbAcc.Append(accWord);
                            sbRaw.Append(rawWord);
                            if (offset < lastOffset)
                            {
                                sbRaw.Append(" ");
                            }
                        }

                        if (shouldProceed)
                        {
                            string accents = ExtractAccents(sbAcc.ToString(), converter);

                            model.Add(sbRaw.ToString(), accents);
                        }
                    }
                }

                using (BinaryWriter bwModel = new BinaryWriter(File.Create(String.Format(outModelFilePattern, n))))
                {
                    model.WriteToBinary(bwModel);
                }

                Clocker.Tock();
            }
        }
Beispiel #3
0
        /// <summary>
        /// Evaluates prediction accuracy over a set of segments in parallel. Each segment is
        /// converted with <paramref name="converter"/>, re-predicted with <c>Predict</c>, and compared
        /// word by word with the original. Mismatching segments and the final statistics are written
        /// to <paramref name="logFile"/>; the statistics are also printed to the console.
        /// </summary>
        /// <param name="segments">Expected (original) segments.</param>
        /// <param name="models">Language models forwarded to <c>Predict</c>.</param>
        /// <param name="converter">Converter used to produce the input for prediction.</param>
        /// <param name="logFile">Path of the log file; created or overwritten.</param>
        private static void TestDataSet(List<string> segments, List<ILanguageModel> models, VietConverter converter, string logFile)
        {
            Clocker.Tick();

            Console.WriteLine("{0} expected total segments", segments.Count);

            using (StreamWriter sw = new StreamWriter(File.Create(logFile)))
            {
                // Counters are shared between worker threads and only touched through Interlocked.
                int nCorrectSegments = 0;
                int nTotalSegments   = 0;

                long nCorrectWords = 0;
                long nTotalWords   = 0;

                // Mismatch reports are collected here and written after the parallel loop,
                // so the StreamWriter is only ever used from this thread.
                var logs = new ConcurrentBag<string>();

                Parallel.ForEach(
                    segments,
                    new ParallelOptions { MaxDegreeOfParallelism = 2 * Environment.ProcessorCount },
                    actualData =>
                    {
                        string rawData   = converter.Convert(actualData);
                        string predicted = String.Join(" ", Predict(rawData, models, converter).Select(wc => wc[0]));

                        // An empty separator array splits on white-space characters.
                        string[] wQuery     = actualData.Split(new char[0]);
                        string[] wPredicted = predicted.Split(new char[0]);

                        bool match = true;

                        if (wPredicted.Length != wQuery.Length)
                        {
                            // Different word counts: the segment is wrong and no word is credited.
                            match = false;
                        }
                        else
                        {
                            for (int i = 0; i < wQuery.Length; i++)
                            {
                                if (wQuery[i] != wPredicted[i])
                                {
                                    // Upper-case differing words so they stand out in the log.
                                    match         = false;
                                    wQuery[i]     = wQuery[i].ToUpper();
                                    wPredicted[i] = wPredicted[i].ToUpper();
                                }
                                else
                                {
                                    Interlocked.Increment(ref nCorrectWords);
                                }
                            }
                        }

                        if (!match)
                        {
                            logs.Add(String.Format("{0}\r\n{1}\r\n--------------------",
                                                   String.Join(" ", wQuery), String.Join(" ", wPredicted)));
                        }
                        else
                        {
                            Interlocked.Increment(ref nCorrectSegments);
                        }

                        // Use the value returned by the atomic increment. Re-reading the shared counter
                        // here would race with other workers and could print a milestone twice or skip it.
                        int processed = Interlocked.Increment(ref nTotalSegments);
                        if (processed % 10000 == 0)
                        {
                            Console.WriteLine(processed);
                        }

                        Interlocked.Add(ref nTotalWords, wQuery.Length);
                    });

                foreach (string log in logs)
                {
                    sw.WriteLine(log);
                }

                // Computed once and reported to both the console and the log file.
                double segmentAccuracy = ((double)nCorrectSegments / nTotalSegments) * 100.0;
                double wordAccuracy    = ((double)nCorrectWords / nTotalWords) * 100.0;

                Console.WriteLine("{0} total segments, {1}% segment accuracy", nTotalSegments, segmentAccuracy);
                Console.WriteLine("{0} total words, {1}% word accuracy", nTotalWords, wordAccuracy);

                sw.WriteLine("{0} total segments, {1}% segment accuracy", nTotalSegments, segmentAccuracy);
                sw.WriteLine("{0} total words, {1}% word accuracy", nTotalWords, wordAccuracy);
            }

            Clocker.Tock();
        }