/// <summary>
/// Returns the intersection points of the two directional alignments as a value
/// instead of through an <c>out</c> argument.
/// </summary>
/// <param name="model_f_to_e">Model passed to <c>GetAlignmentMatrix</c> for the f-to-e direction.</param>
/// <param name="model_e_to_f">Model passed to <c>GetAlignmentMatrix</c> for the e-to-f direction.</param>
/// <param name="ws_f">Tokens of the f-side sentence.</param>
/// <param name="ws_e">Tokens of the e-side sentence.</param>
/// <returns>The dictionary produced by <c>GetAlignmentMatrix</c> through its <c>out</c> parameter.</returns>
public Dictionary<int, int> GetIntersectionPoints(
    IBMModel2 model_f_to_e,
    IBMModel2 model_e_to_f,
    string[] ws_f,
    string[] ws_e)
{
    // All the work happens in GetAlignmentMatrix; this method only adapts the calling shape.
    Dictionary<int, int> points;
    GetAlignmentMatrix(model_f_to_e, model_e_to_f, ws_f, ws_e, out points);
    return points;
}
/// <summary>
/// Seeds <paramref name="model"/> with random parameters for every
/// (input position, output position) pair of every record in the corpus.
/// </summary>
/// <remarks>
/// For each pair, the translation probability of the input word given the output word
/// and the distortion parameter of the position pair are each set to a fresh
/// <c>mRandom.NextDouble()</c> value. No normalization is performed here.
/// </remarks>
/// <param name="model">Model whose translation and distortion maps are written.</param>
/// <param name="training_corpus">Sentence pairs whose words and lengths drive the seeding.</param>
public static void Initialize(IBMModel2 model, IEnumerable<EMTrainingRecord> training_corpus)
{
    foreach (EMTrainingRecord record in training_corpus)
    {
        string[] inputWords = record.InputLang;
        string[] outputWords = record.OutputLang;
        int inputLen = inputWords.Length;
        int outputLen = outputWords.Length;

        for (int inputPos = 0; inputPos < inputLen; ++inputPos)
        {
            string inputWord = inputWords[inputPos];

            for (int outputPos = 0; outputPos < outputLen; ++outputPos)
            {
                string outputWord = outputWords[outputPos];

                // NOTE(review): the results of the count lookups below are never used.
                // The calls are retained in case the maps create entries on lookup - confirm
                // before removing them.
                model.TranslationMap.GetCountOfInputWordAndOutputWord(inputWord, outputWord);
                model.TranslationMap.GetCountOfOutputWord(outputWord);

                // First random draw: translation probability.
                model.TranslationMap.SetProbabilityInputWordGivenOutputWord(inputWord, outputWord, mRandom.NextDouble());

                model.DistortionMap.GetCountOfAlignmentIndexGivenAlignmentValueAndInputLenAndOutputLen(inputPos, outputPos, outputLen, inputLen);
                model.DistortionMap.GetCountOfAlignmentValueAndInputLenAndOutputLen(outputPos, outputLen, inputLen);

                // Second random draw: distortion parameter.
                model.DistortionMap.SetDistortionParameter(outputPos, inputPos, inputLen, outputLen, mRandom.NextDouble());
            }
        }
    }
}
/// <summary>
/// Trains two models on the same parallel corpus, one per translation direction.
/// </summary>
/// <remarks>
/// <paramref name="model_f_to_e"/> is trained on the corpus as given.
/// <paramref name="model_e_to_f"/> is trained on a copy of the corpus in which every
/// record has its <c>InputLang</c> and <c>OutputLang</c> swapped.
/// </remarks>
/// <param name="model_f_to_e">Model trained on the corpus in its original direction.</param>
/// <param name="model_e_to_f">Model trained on the direction-reversed corpus.</param>
/// <param name="training_corpus_from_f_to_e">Sentence pairs in the f-to-e direction; enumerated more than once.</param>
/// <param name="maxIterations">Iteration count forwarded to the single-model trainer.</param>
public static void Train(
    IBMModel2 model_f_to_e,
    IBMModel2 model_e_to_f,
    IEnumerable<EMTrainingRecord> training_corpus_from_f_to_e,
    int maxIterations)
{
    Train(model_f_to_e, training_corpus_from_f_to_e, maxIterations);

    // Build the reversed corpus: what was output becomes input and vice versa.
    List<EMTrainingRecord> training_corpus_from_e_to_f = new List<EMTrainingRecord>();
    foreach (EMTrainingRecord rec in training_corpus_from_f_to_e)
    {
        EMTrainingRecord reversed = new EMTrainingRecord()
        {
            InputLang = rec.OutputLang,
            OutputLang = rec.InputLang
        };
        training_corpus_from_e_to_f.Add(reversed);
    }

    // Fix: train the reverse model on the reversed corpus. Previously the f-to-e corpus
    // was passed here, so the reversed list was built and then ignored.
    Train(model_e_to_f, training_corpus_from_e_to_f, maxIterations);
}
/// <summary>
/// Walks consecutive pairs of intersection points (ordered by f-side index) between
/// the two directional alignments of a sentence pair.
/// </summary>
/// <remarks>
/// NOTE(review): the method currently only reads the end points of each consecutive
/// pair; <paramref name="phrases"/> is never written. Phrase extraction appears to be
/// unfinished - confirm the intended behavior.
/// </remarks>
/// <param name="model_f_to_e">Model forwarded to <c>GetIntersectionPoints</c>.</param>
/// <param name="model_e_to_f">Model forwarded to <c>GetIntersectionPoints</c>.</param>
/// <param name="ws_f">Tokens of the f-side sentence.</param>
/// <param name="ws_e">Tokens of the e-side sentence.</param>
/// <param name="phrases">Not read or modified by the current implementation.</param>
public void GetPhrases(
    IBMModel2 model_f_to_e,
    IBMModel2 model_e_to_f,
    string[] ws_f,
    string[] ws_e,
    Dictionary<string, string> phrases)
{
    Dictionary<int, int> points = GetIntersectionPoints(model_f_to_e, model_e_to_f, ws_f, ws_e);

    List<int> orderedF = points.Keys.ToList();
    orderedF.Sort();

    // Visit each adjacent pair (previous, current) of f-side indices in ascending order.
    for (int next = 1; next < orderedF.Count; ++next)
    {
        int j_f1 = orderedF[next - 1];
        int j_f2 = orderedF[next];
        int i_e1 = points[j_f1];
        int i_e2 = points[j_f2];
    }
}
/// <summary>
/// Demo routine: trains a model with EM, aligns one placeholder sentence pair and
/// prints the output-side words that the alignment selects, in output-sentence order.
/// </summary>
/// <remarks>
/// NOTE(review): the training corpus list is created empty and nothing is added to it
/// before training, so the trainer is invoked on an empty corpus.
/// </remarks>
public void RunEMTraining()
{
    IBMModel2 model = new IBMModel2();
    List<EMTrainingRecord> corpus = new List<EMTrainingRecord>();

    EnglishTokenizer outputTokenizer = new EnglishTokenizer();
    FrenchTokenizer inputTokenizer = new FrenchTokenizer();

    EMTrainingMethod.Train(model, corpus, 20);

    string sentence_input = "[Some French Sentence]";
    string sentence_output = "[Some English Sentence]";

    string[] inputWords = inputTokenizer.Tokenize(sentence_input);
    string[] outputWords = outputTokenizer.Tokenize(sentence_output);

    int[] alignment = model.GetAlignment(inputWords, outputWords);

    // Collect each output position chosen by the alignment, keyed by position so that
    // a position picked by several input words appears only once.
    Dictionary<int, string> chosen = new Dictionary<int, string>();
    for (int j = 0; j < inputWords.Length; ++j)
    {
        int outputIndex = alignment[j];
        chosen[outputIndex] = outputWords[outputIndex];
    }

    // Emit the selected words in ascending output position.
    string[] predicted = chosen.Keys
        .OrderBy(index => index)
        .Select(index => chosen[index])
        .ToArray();

    Console.WriteLine("Original French Sentence: {0}", sentence_input);
    Console.WriteLine("Predicted English Translation: {0}", string.Join(" ", predicted));
}
/// <summary>
/// Computes the points on which the two directional alignments of a sentence pair agree.
/// </summary>
/// <remarks>
/// A pair (j, i) is an intersection point when the f-to-e alignment maps f-position j
/// to e-position i and the e-to-f alignment maps e-position i back to f-position j.
/// The previous implementation also filled a local 0/1 matrix that was never read or
/// returned, and scanned every i for each j. Only <c>i = alignment_f_to_e[j]</c> can
/// satisfy the condition, so it is now checked directly; the resulting dictionary is
/// the same.
/// </remarks>
/// <param name="model_f_to_e">Model queried with <c>GetAlignment(ws_f, ws_e)</c>.</param>
/// <param name="model_e_to_f">Model queried with <c>GetAlignment(ws_e, ws_f)</c>.</param>
/// <param name="ws_f">Tokens of the f-side sentence.</param>
/// <param name="ws_e">Tokens of the e-side sentence.</param>
/// <param name="intersection_points">
/// Receives a map from f-position j to e-position i for every agreeing pair.
/// </param>
public void GetAlignmentMatrix(
    IBMModel2 model_f_to_e,
    IBMModel2 model_e_to_f,
    string[] ws_f,
    string[] ws_e,
    out Dictionary<int, int> intersection_points)
{
    int[] alignment_f_to_e = model_f_to_e.GetAlignment(ws_f, ws_e);
    int[] alignment_e_to_f = model_e_to_f.GetAlignment(ws_e, ws_f);

    int m_f_len = alignment_f_to_e.Length;
    int l_e_len = alignment_e_to_f.Length;

    intersection_points = new Dictionary<int, int>();

    for (int j = 0; j < m_f_len; ++j)
    {
        int i = alignment_f_to_e[j];

        // The range check mirrors the original loop bounds (0 <= i < l_e_len); values
        // outside that range could never match there either.
        if (i >= 0 && i < l_e_len && alignment_e_to_f[i] == j)
        {
            intersection_points[j] = i;
        }
    }
}
/// <summary>
/// Updates <paramref name="model"/> from records that carry an explicit alignment,
/// adding one to the relevant counts for every aligned word pair.
/// </summary>
/// <remarks>
/// For each input position j with aligned output position <c>Alignment[j]</c>, the
/// translation counts and the distortion counts are incremented by 1 and the model is
/// asked to refresh the corresponding translation probability and distortion parameter.
/// </remarks>
/// <param name="model">Model whose translation and distortion maps are updated.</param>
/// <param name="training_corpus">Records providing input words, output words and an alignment array.</param>
public static void Train(IBMModel2 model, IEnumerable<SimpleTrainingRecord> training_corpus)
{
    foreach (SimpleTrainingRecord record in training_corpus)
    {
        string[] inputWords = record.InputLang;
        string[] outputWords = record.OutputLang;
        int[] alignment = record.Alignment;

        int inputLen = inputWords.Length;
        int outputLen = outputWords.Length;

        // One alignment entry is expected per input word.
        Debug.Assert(inputLen == alignment.Length);

        for (int inputPos = 0; inputPos < inputLen; ++inputPos)
        {
            int outputPos = alignment[inputPos];
            string inputWord = inputWords[inputPos];
            string outputWord = outputWords[outputPos];

            // Translation statistics: read both counts, bump both, then refresh the probability.
            double pairCount = model.TranslationMap.GetCountOfInputWordAndOutputWord(inputWord, outputWord);
            double outputWordCount = model.TranslationMap.GetCountOfOutputWord(outputWord);
            model.TranslationMap.SetCountOfInputWordAndOutputWord(inputWord, outputWord, pairCount + 1);
            model.TranslationMap.SetCountOfOutputWord(outputWord, outputWordCount + 1);
            model.TranslationMap.UpdateProbabilityInputWordGivenOutputWord(inputWord, outputWord);

            // Distortion statistics: same read / bump / refresh sequence.
            double positionPairCount = model.DistortionMap.GetCountOfAlignmentIndexGivenAlignmentValueAndInputLenAndOutputLen(inputPos, outputPos, outputLen, inputLen);
            double outputPosCount = model.DistortionMap.GetCountOfAlignmentValueAndInputLenAndOutputLen(outputPos, outputLen, inputLen);
            model.DistortionMap.SetCountOfAlignmentIndexGivenAlignmentValueAndInputLenAndOutputLen(inputPos, outputPos, outputLen, inputLen, positionPairCount + 1);
            model.DistortionMap.SetCountOfAlignmentValueAndInputLenAndOutputLen(outputPos, outputLen, inputLen, outputPosCount + 1);
            model.DistortionMap.UpdateDistortionParameter(outputPos, inputPos, inputLen, outputLen);
        }
    }
}
/// <summary>
/// Trains <paramref name="model"/> with expectation-maximization over an unaligned
/// parallel corpus.
/// </summary>
/// <remarks>
/// <para>
/// For every record, the posterior weight of aligning input position j to output
/// position a_j is <c>t * q</c> (translation probability times distortion parameter),
/// normalized over all output positions for that same j. The weights are then
/// added to the translation and distortion counts, and the model is asked to refresh
/// the affected parameters.
/// </para>
/// <para>
/// Fix: the weights used to be divided by a single sum taken over every (j, a_j) pair
/// of the sentence, which made the weights of one input word depend on all the others
/// and shrank each word's total expected count to 1/m. Normalization is now per input
/// position. A row whose sum is zero is left as all zeros instead of producing NaN.
/// </para>
/// <para>
/// NOTE(review): counts are not reset between iterations and parameters are refreshed
/// after every pair rather than once per iteration - confirm this is intended.
/// </para>
/// </remarks>
/// <param name="model">Model whose translation and distortion maps are read and updated.</param>
/// <param name="training_corpus">Sentence pairs; enumerated once per iteration.</param>
/// <param name="maxIterations">Number of passes over the corpus.</param>
public static void Train(IBMModel2 model, IEnumerable<EMTrainingRecord> training_corpus, int maxIterations)
{
    for (int iteration = 0; iteration < maxIterations; ++iteration)
    {
        foreach (EMTrainingRecord rec in training_corpus)
        {
            string[] input_lang = rec.InputLang;
            string[] output_lang = rec.OutputLang;
            int m_input_len = input_lang.Length;
            int l_output_len = output_lang.Length;

            // Phase 1: compute all posterior weights from the current parameters before
            // any of them is modified by this record.
            double[][] delta = new double[m_input_len][];
            for (int j = 0; j < m_input_len; ++j)
            {
                double[] row = new double[l_output_len];
                double rowSum = 0;
                string word_input = input_lang[j];

                for (int a_j = 0; a_j < l_output_len; ++a_j)
                {
                    string word_output = output_lang[a_j];
                    double t = model.TranslationMap.GetProbabilityInputWordGivenOutputWord(word_input, word_output);
                    double q = model.DistortionMap.GetDistortionParameter(a_j, j, m_input_len, l_output_len);
                    double deltaVal = t * q;
                    row[a_j] = deltaVal;
                    rowSum += deltaVal;
                }

                // Normalize over the output positions of this input word only.
                if (rowSum > 0)
                {
                    for (int a_j = 0; a_j < l_output_len; ++a_j)
                    {
                        row[a_j] /= rowSum;
                    }
                }

                delta[j] = row;
            }

            // Phase 2: add the expected counts and refresh the affected parameters.
            for (int j = 0; j < m_input_len; ++j)
            {
                string word_input = input_lang[j];

                for (int a_j = 0; a_j < l_output_len; ++a_j)
                {
                    double deltaVal = delta[j][a_j];
                    string word_output = output_lang[a_j];

                    double count_input_output = model.TranslationMap.GetCountOfInputWordAndOutputWord(word_input, word_output);
                    double count_output = model.TranslationMap.GetCountOfOutputWord(word_output);
                    model.TranslationMap.SetCountOfInputWordAndOutputWord(word_input, word_output, count_input_output + deltaVal);
                    model.TranslationMap.SetCountOfOutputWord(word_output, count_output + deltaVal);
                    model.TranslationMap.UpdateProbabilityInputWordGivenOutputWord(word_input, word_output);

                    double count_j_given_aj_l_m = model.DistortionMap.GetCountOfAlignmentIndexGivenAlignmentValueAndInputLenAndOutputLen(j, a_j, l_output_len, m_input_len);
                    double count_aj_l_m = model.DistortionMap.GetCountOfAlignmentValueAndInputLenAndOutputLen(a_j, l_output_len, m_input_len);
                    model.DistortionMap.SetCountOfAlignmentIndexGivenAlignmentValueAndInputLenAndOutputLen(j, a_j, l_output_len, m_input_len, count_j_given_aj_l_m + deltaVal);
                    model.DistortionMap.SetCountOfAlignmentValueAndInputLenAndOutputLen(a_j, l_output_len, m_input_len, count_aj_l_m + deltaVal);
                    model.DistortionMap.UpdateDistortionParameter(a_j, j, m_input_len, l_output_len);
                }
            }
        }
    }
}
/// <summary>
/// Prepares both directional models by delegating to the two-model overload of
/// <c>EMTrainingMethod.Train</c>.
/// </summary>
/// <remarks>
/// Despite its name, this method performs no separate initialization step of its own;
/// it only forwards its arguments unchanged.
/// </remarks>
/// <param name="model_f_to_e">First model argument forwarded to the trainer.</param>
/// <param name="model_e_to_f">Second model argument forwarded to the trainer.</param>
/// <param name="training_corpus_f_to_e">Corpus forwarded to the trainer.</param>
/// <param name="maxIterations">Iteration count forwarded to the trainer.</param>
public void InitializeIBMModels(
    IBMModel2 model_f_to_e,
    IBMModel2 model_e_to_f,
    IEnumerable<EMTrainingRecord> training_corpus_f_to_e,
    int maxIterations)
{
    EMTrainingMethod.Train(
        model_f_to_e,
        model_e_to_f,
        training_corpus_f_to_e,
        maxIterations);
}