示例#1
0
        /// <summary>
        /// Produces a copy of <paramref name="rawSentence"/> in which every character that is a key of
        /// <c>converter.RawCharMap</c> is replaced by the next unused character of <paramref name="accents"/>.
        /// </summary>
        /// <param name="rawSentence">Sentence to rewrite; characters that are not keys of the map are kept as they are.</param>
        /// <param name="accents">Replacement characters, consumed in order, one per mapped character.</param>
        /// <param name="converter">Provides <c>Convert</c> (applied to <paramref name="accents"/>) and <c>RawCharMap</c>.</param>
        /// <returns>
        /// The rewritten sentence, or <c>null</c> when any argument is <c>null</c>, when the converted
        /// accents are exhausted before the sentence ends, or when a mapped character differs from the
        /// converted form of the accent that would replace it.
        /// </returns>
        public static string Accentify(string rawSentence, string accents, AccentConverter converter)
        {
            if (rawSentence == null)
            {
                return null;
            }
            if (accents == null)
            {
                return null;
            }
            if (converter == null)
            {
                return null;
            }

            char[] output = rawSentence.ToArray();
            char[] strippedAccents = converter.Convert(accents).ToArray();
            int next = 0;

            for (int pos = 0; pos < output.Length; pos++)
            {
                char current = output[pos];
                if (!converter.RawCharMap.ContainsKey(current))
                {
                    // Not a mapped character: leave it untouched and do not consume an accent.
                    continue;
                }

                bool exhausted = next >= strippedAccents.Length;
                if (exhausted || current != strippedAccents[next])
                {
                    return null;
                }

                // The replacement is read from the original accents at the same position as the
                // converted character that was just compared.
                output[pos] = accents[next];
                next++;
            }

            return new string(output);
        }
示例#2
0
        /// <summary>
        /// Collects, in order, the characters of <paramref name="sentence"/> found at the positions where
        /// the converted sentence holds a character that is a key of <c>converter.RawCharMap</c>.
        /// </summary>
        /// <param name="sentence">Sentence to read the characters from.</param>
        /// <param name="converter">Provides <c>Convert</c> and <c>RawCharMap</c>.</param>
        /// <returns>The collected characters as a string; empty when no position matches.</returns>
        public static string ExtractAccents(string sentence, AccentConverter converter)
        {
            string stripped = converter.Convert(sentence);
            StringBuilder collected = new StringBuilder();

            // Positions come from the converted string but are used to index the original sentence.
            int index = 0;
            while (index < stripped.Length)
            {
                if (converter.RawCharMap.ContainsKey(stripped[index]))
                {
                    collected.Append(sentence[index]);
                }
                index++;
            }

            return collected.ToString();
        }
示例#3
0
        /// <summary>
        /// Builds one model per gram size in [<paramref name="minGram"/>, <paramref name="maxGram"/>] and
        /// writes each to the file named by <paramref name="outModelFilePattern"/>. Gram sizes whose
        /// output file already exists are skipped; if none remain, nothing is loaded or written.
        /// </summary>
        /// <param name="outModelFilePattern">Format string for the output path; <c>{0}</c> receives the gram size.</param>
        /// <param name="inputTrainingFiles">Training inputs, handed to <c>TextParser.ParseData</c>.</param>
        /// <param name="modelVersion">Version handed to <c>ModelFactory.CreateModelByVersion</c>.</param>
        /// <param name="minGram">Smallest gram size to build.</param>
        /// <param name="maxGram">Largest gram size to build.</param>
        /// <param name="converter">Used to convert each word and to extract the accents of each n-gram.</param>
        /// <param name="learnKnownWordsOnly">
        /// When <c>true</c>, an n-gram is skipped if any of its converted words is not a key of the
        /// dictionary read from <c>DataManager.DictionaryFile</c>.
        /// </param>
        public static void Train(string outModelFilePattern, TupleList<string, string> inputTrainingFiles, int modelVersion, int minGram, int maxGram, AccentConverter converter, bool learnKnownWordsOnly = true)
        {
            // Only gram sizes without an existing model file need to be built.
            List<int> pendingSizes = new List<int>();
            for (int size = minGram; size <= maxGram; size++)
            {
                if (File.Exists(string.Format(outModelFilePattern, size)))
                {
                    continue;
                }
                pendingSizes.Add(size);
            }
            if (pendingSizes.Count == 0)
            {
                return;
            }

            // The dictionary of raw words is only needed when filtering on known words.
            Dictionary<string, int> knownWords = null;
            if (learnKnownWordsOnly)
            {
                knownWords = LazyTrainer.ReadDictionary(DataManager.DictionaryFile);
            }

            // Load segments from training data
            List<List<string>> segments = TextParser.ParseData(inputTrainingFiles);

            StringBuilder rawText = new StringBuilder();
            StringBuilder accentedText = new StringBuilder();

            foreach (int size in pendingSizes)
            {
                int lastOffset = size - 1;
                Console.WriteLine("Building {0}-gram ...", size);

                Clocker.Tick();

                using (BinaryWriter modelWriter = new BinaryWriter(File.Create(string.Format(outModelFilePattern, size))))
                {
                    ILanguageModel model = ModelFactory.CreateModelByVersion(modelVersion);

                    foreach (List<string> words in segments)
                    {
                        // Slide a window of `size` consecutive words over the segment.
                        for (int start = 0; start < words.Count - lastOffset; start++)
                        {
                            rawText.Clear();
                            accentedText.Clear();

                            bool allWordsAccepted = true;
                            for (int offset = 0; offset <= lastOffset; offset++)
                            {
                                string accentedWord = words[start + offset];
                                string strippedWord = converter.Convert(accentedWord);

                                if (learnKnownWordsOnly && !knownWords.ContainsKey(strippedWord))
                                {
                                    allWordsAccepted = false;
                                    break;
                                }

                                // Accented words are concatenated directly; raw words are space-separated.
                                accentedText.Append(accentedWord);
                                rawText.Append(strippedWord);
                                if (offset < lastOffset)
                                {
                                    rawText.Append(" ");
                                }
                            }

                            if (!allWordsAccepted)
                            {
                                continue;
                            }

                            string accents = ExtractAccents(accentedText.ToString(), converter);
                            model.Add(rawText.ToString(), accents);
                        }
                    }

                    model.WriteToBinary(modelWriter);
                }

                Clocker.Tock();
            }
        }