/// <summary>
/// Counting pass over records that carry an explicit alignment.
/// For every input position, the input word is paired with the output word
/// selected by the record's alignment array; the pair count and the output-word
/// count in <c>model.TranslationMap</c> are each incremented by one, and the
/// probability for that pair is then refreshed.
/// </summary>
/// <param name="model">Model whose <c>TranslationMap</c> receives the updated counts.</param>
/// <param name="training_corpus">Records exposing <c>InputLang</c>, <c>OutputLang</c> and <c>Alignment</c>.</param>
public static void Train(IBMModel1 model, IEnumerable<SimpleTrainingRecord> training_corpus)
{
    foreach (SimpleTrainingRecord record in training_corpus)
    {
        string[] sourceWords = record.InputLang;
        string[] targetWords = record.OutputLang;
        int[] alignedTo = record.Alignment;

        int sourceLength = sourceWords.Length;
        // Not used further down; the read is retained so that a null OutputLang
        // still fails here, before any counts are touched.
        int targetLength = targetWords.Length;

        // One alignment entry is expected per input word (checked in debug builds only).
        Debug.Assert(sourceLength == alignedTo.Length);

        for (int pos = 0; pos < sourceLength; pos++)
        {
            int targetIndex = alignedTo[pos];
            string sourceWord = sourceWords[pos];
            string targetWord = targetWords[targetIndex];

            // Read both current counts before writing either of them.
            double pairCount = model.TranslationMap.GetCountOfInputWordAndOutputWord(sourceWord, targetWord);
            double targetCount = model.TranslationMap.GetCountOfOutputWord(targetWord);

            model.TranslationMap.SetCountOfInputWordAndOutputWord(sourceWord, targetWord, pairCount + 1);
            model.TranslationMap.SetCountOfOutputWord(targetWord, targetCount + 1);

            // Refresh the stored probability for this pair now that its counts changed.
            model.TranslationMap.UpdateProbabilityInputWordGivenOutputWord(sourceWord, targetWord);
        }
    }
}
/// <summary>
/// Seeds the translation table: for every (input word, output word) pair that
/// co-occurs in a training record, the probability of the input word given the
/// output word is set to a fresh value drawn from <c>mRandom.NextDouble()</c>.
/// A pair that occurs more than once is simply overwritten with a new draw.
/// </summary>
/// <param name="model">Model whose <c>TranslationMap</c> is initialised.</param>
/// <param name="training_corpus">Sentence pairs exposing <c>InputLang</c> and <c>OutputLang</c>.</param>
public static void Initialize(IBMModel1 model, IEnumerable<EMTrainingRecord> training_corpus)
{
    foreach (EMTrainingRecord pair in training_corpus)
    {
        string[] sourceWords = pair.InputLang;
        string[] targetWords = pair.OutputLang;
        int sourceLength = sourceWords.Length;
        int targetLength = targetWords.Length;

        for (int s = 0; s < sourceLength; s++)
        {
            string sourceWord = sourceWords[s];

            for (int t = 0; t < targetLength; t++)
            {
                string targetWord = targetWords[t];

                // The results of these two lookups are not used. The calls are kept
                // in case the lookups are not pure.
                // NOTE(review): confirm they have no side effects, then remove them.
                model.TranslationMap.GetCountOfInputWordAndOutputWord(sourceWord, targetWord);
                model.TranslationMap.GetCountOfOutputWord(targetWord);

                double initialProbability = mRandom.NextDouble();
                model.TranslationMap.SetProbabilityInputWordGivenOutputWord(sourceWord, targetWord, initialProbability);
            }
        }
    }
}
/// <summary>
/// Iterative (EM-style) training on sentence pairs that have no alignment.
/// For each sentence pair, every input word distributes one unit of expected
/// count over the output words of that pair, in proportion to the current
/// probability of the input word given each output word. The fractional counts
/// are added to <c>model.TranslationMap</c> and the affected probabilities are
/// refreshed.
/// </summary>
/// <remarks>
/// <para>
/// The weights are normalised separately for each input word (over the output
/// positions of the pair), which is the alignment posterior used by IBM Model 1.
/// Normalising by a single sum over the whole sentence pair would make the
/// contribution of a word shrink with sentence length.
/// </para>
/// <para>
/// An input word whose probabilities sum to zero (or to a non-finite value) in
/// this pair contributes nothing, so NaN is never written into the counts.
/// </para>
/// <para>
/// Counts are not reset between iterations; they keep accumulating across the
/// <paramref name="maxIterations"/> passes. NOTE(review): confirm this is the
/// intended update scheme. <paramref name="training_corpus"/> is enumerated
/// once per iteration.
/// </para>
/// </remarks>
/// <param name="model">Model whose <c>TranslationMap</c> is read and updated.</param>
/// <param name="training_corpus">Sentence pairs exposing <c>InputLang</c> and <c>OutputLang</c>.</param>
/// <param name="maxIterations">Number of passes over the corpus; zero or less performs no work.</param>
public static void Train(IBMModel1 model, IEnumerable<EMTrainingRecord> training_corpus, int maxIterations)
{
    for (int iteration = 0; iteration < maxIterations; ++iteration)
    {
        foreach (EMTrainingRecord rec in training_corpus)
        {
            string[] input_lang = rec.InputLang;
            string[] output_lang = rec.OutputLang;
            int m_input_len = input_lang.Length;
            int l_output_len = output_lang.Length;

            // Phase 1: compute all alignment weights for this pair from the
            // probabilities as they stand, before any count is modified.
            // delta[j] stays null for an input word that has no usable mass.
            double[][] delta = new double[m_input_len][];
            for (int j = 0; j < m_input_len; ++j)
            {
                string word_input = input_lang[j];
                double[] row = new double[l_output_len];
                double rowSum = 0;

                for (int a_j = 0; a_j < l_output_len; ++a_j)
                {
                    double t = model.TranslationMap.GetProbabilityInputWordGivenOutputWord(word_input, output_lang[a_j]);
                    row[a_j] = t;
                    rowSum += t;
                }

                // !(x > 0) is also true for NaN, so zero, negative and NaN sums are all skipped.
                if (!(rowSum > 0) || double.IsInfinity(rowSum))
                {
                    continue;
                }

                for (int a_j = 0; a_j < l_output_len; ++a_j)
                {
                    row[a_j] /= rowSum;
                }

                delta[j] = row;
            }

            // Phase 2: add the fractional counts and refresh the probabilities.
            for (int j = 0; j < m_input_len; ++j)
            {
                double[] row = delta[j];
                if (row == null)
                {
                    continue;
                }

                string word_input = input_lang[j];
                for (int a_j = 0; a_j < l_output_len; ++a_j)
                {
                    double deltaVal = row[a_j];
                    string word_output = output_lang[a_j];

                    double count_input_output = model.TranslationMap.GetCountOfInputWordAndOutputWord(word_input, word_output);
                    double count_output = model.TranslationMap.GetCountOfOutputWord(word_output);

                    model.TranslationMap.SetCountOfInputWordAndOutputWord(word_input, word_output, count_input_output + deltaVal);
                    model.TranslationMap.SetCountOfOutputWord(word_output, count_output + deltaVal);
                    model.TranslationMap.UpdateProbabilityInputWordGivenOutputWord(word_input, word_output);
                }
            }
        }
    }
}
/// <summary>
/// Console demo: builds a model, runs the simple training routine, asks the
/// model to align a fixed French sentence with a fixed English sentence, and
/// prints the aligned English words in sentence order. Blocks on
/// <c>Console.ReadLine()</c> before returning.
/// </summary>
/// <remarks>
/// The training corpus is created empty and nothing is added to it before it is
/// handed to <c>SimpleTrainingMethod.Train</c>.
/// Each alignment entry is used directly as an index into the tokenised output
/// sentence; entries outside that range are ignored instead of being shifted by
/// a per-position offset. NOTE(review): this assumes <c>GetAlignment</c> returns
/// zero-based output positions — confirm against its implementation.
/// </remarks>
public void RunSimpleTraining()
{
    IBMModel1.Helpers.IBMModel1 model = new IBMModel1.Helpers.IBMModel1();
    List<SimpleTrainingRecord> training_corpus = new List<SimpleTrainingRecord>();

    EnglishTokenizer tokenizer_output = new EnglishTokenizer();
    FrenchTokenizer tokenizer_input = new FrenchTokenizer();

    SimpleTrainingMethod.Train(model, training_corpus);

    string sentence_input = "la maison bleue";
    string sentence_output = "the blue house";

    string[] input_lang = tokenizer_input.Tokenize(sentence_input);
    string[] output_lang = tokenizer_output.Tokenize(sentence_output);

    int[] alignment = model.GetAlignment(input_lang, output_lang);

    // Output position -> output word, for every position some input word aligns to.
    Dictionary<int, string> output_mapping = new Dictionary<int, string>();

    // Never read past the end of either array, even if their lengths disagree.
    int aligned_positions = Math.Min(input_lang.Length, alignment.Length);
    for (int j = 0; j < aligned_positions; ++j)
    {
        int a_j = alignment[j];

        // Skip entries that do not name a valid output position.
        if (a_j < 0 || a_j >= output_lang.Length)
        {
            continue;
        }

        output_mapping[a_j] = output_lang[a_j];
    }

    // Emit the selected words in the order they appear in the output sentence.
    List<int> output_sentence_index_list = output_mapping.Keys.ToList();
    output_sentence_index_list.Sort();

    string[] predicted_output_lang = new string[output_sentence_index_list.Count];
    for (int i = 0; i < predicted_output_lang.Length; ++i)
    {
        predicted_output_lang[i] = output_mapping[output_sentence_index_list[i]];
    }

    Console.WriteLine("Original French Sentence: {0}", sentence_input);
    Console.WriteLine("Predicted English Translation: {0}", string.Join(" ", predicted_output_lang));

    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}