Пример #1
0
        /// <summary>
        /// Returns the intersection points produced by <c>GetAlignmentMatrix</c> as a
        /// return value rather than through an <c>out</c> argument.
        /// </summary>
        /// <param name="model_f_to_e">Model passed through as the f-to-e model.</param>
        /// <param name="model_e_to_f">Model passed through as the e-to-f model.</param>
        /// <param name="ws_f">Words of the f-side sentence.</param>
        /// <param name="ws_e">Words of the e-side sentence.</param>
        /// <returns>The dictionary filled in by <c>GetAlignmentMatrix</c>.</returns>
        public Dictionary <int, int> GetIntersectionPoints(IBMModel2 model_f_to_e, IBMModel2 model_e_to_f, string[] ws_f, string[] ws_e)
        {
            Dictionary <int, int> result;
            GetAlignmentMatrix(model_f_to_e, model_e_to_f, ws_f, ws_e, out result);

            return result;
        }
        /// <summary>
        /// Seeds <paramref name="model"/> with random parameters: for every record, each
        /// (input word, output word) pair receives a translation probability and each
        /// (output position, input position, input length, output length) tuple receives a
        /// distortion parameter, both drawn from <c>mRandom.NextDouble()</c>.
        /// </summary>
        /// <param name="model">Model whose translation and distortion maps are written.</param>
        /// <param name="training_corpus">Records supplying the sentence pairs to cover.</param>
        public static void Initialize(IBMModel2 model, IEnumerable <EMTrainingRecord> training_corpus)
        {
            foreach (EMTrainingRecord record in training_corpus)
            {
                string[] source_words = record.InputLang;
                string[] target_words = record.OutputLang;

                int source_count = source_words.Length;
                int target_count = target_words.Length;

                for (int src = 0; src < source_count; ++src)
                {
                    string source_word = source_words[src];

                    for (int tgt = 0; tgt < target_count; ++tgt)
                    {
                        string target_word = target_words[tgt];

                        // NOTE(review): the results of these count lookups are not used here.
                        // The calls are retained in their original order because any side
                        // effects of the getters are not visible from this method.
                        model.TranslationMap.GetCountOfInputWordAndOutputWord(source_word, target_word);
                        model.TranslationMap.GetCountOfOutputWord(target_word);

                        double random_translation = mRandom.NextDouble();
                        model.TranslationMap.SetProbabilityInputWordGivenOutputWord(source_word, target_word, random_translation);

                        // Same as above: lookups kept, results unused.
                        model.DistortionMap.GetCountOfAlignmentIndexGivenAlignmentValueAndInputLenAndOutputLen(src, tgt, target_count, source_count);
                        model.DistortionMap.GetCountOfAlignmentValueAndInputLenAndOutputLen(tgt, target_count, source_count);

                        double random_distortion = mRandom.NextDouble();
                        model.DistortionMap.SetDistortionParameter(tgt, src, source_count, target_count, random_distortion);
                    }
                }
            }
        }
        /// <summary>
        /// Trains two models on the same parallel corpus: <paramref name="model_f_to_e"/> on the
        /// corpus as given, and <paramref name="model_e_to_f"/> on a copy of the corpus in which
        /// every record has its input and output sides swapped.
        /// </summary>
        /// <param name="model_f_to_e">Model trained on the records as supplied.</param>
        /// <param name="model_e_to_f">Model trained on the swapped records.</param>
        /// <param name="training_corpus_from_f_to_e">Parallel corpus; enumerated more than once.</param>
        /// <param name="maxIterations">Iteration count forwarded to the single-model trainer.</param>
        public static void Train(IBMModel2 model_f_to_e, IBMModel2 model_e_to_f, IEnumerable <EMTrainingRecord> training_corpus_from_f_to_e, int maxIterations)
        {
            Train(model_f_to_e, training_corpus_from_f_to_e, maxIterations);

            // Build the reverse-direction corpus by swapping the two sides of every record.
            List <EMTrainingRecord> training_corpus_from_e_to_f = new List <EMTrainingRecord>();

            foreach (EMTrainingRecord rec in training_corpus_from_f_to_e)
            {
                EMTrainingRecord rec2 = new EMTrainingRecord()
                {
                    InputLang  = rec.OutputLang,
                    OutputLang = rec.InputLang
                };
                training_corpus_from_e_to_f.Add(rec2);
            }

            // The reverse model must be trained on the swapped corpus. Previously the original
            // f-to-e corpus was passed here, so both models learned the same direction and the
            // swapped list was never used.
            Train(model_e_to_f, training_corpus_from_e_to_f, maxIterations);
        }
Пример #4
0
        /// <summary>
        /// Walks consecutive pairs of intersection points (ordered by f-side position) obtained
        /// from <c>GetIntersectionPoints</c>.
        /// </summary>
        /// <param name="model_f_to_e">Model forwarded as the f-to-e model.</param>
        /// <param name="model_e_to_f">Model forwarded as the e-to-f model.</param>
        /// <param name="ws_f">Words of the f-side sentence.</param>
        /// <param name="ws_e">Words of the e-side sentence.</param>
        /// <param name="phrases">
        /// NOTE(review): this dictionary is never read or written by the code below; the phrase
        /// extraction step appears to be unfinished.
        /// </param>
        public void GetPhrases(IBMModel2 model_f_to_e, IBMModel2 model_e_to_f, string[] ws_f, string[] ws_e, Dictionary <string, string> phrases)
        {
            Dictionary <int, int> points = GetIntersectionPoints(model_f_to_e, model_e_to_f, ws_f, ws_e);

            // Ascending list of the f-side positions that have an intersection point.
            List <int> sorted_f_positions = new List <int>(points.Keys);
            sorted_f_positions.Sort();

            for (int next = 1; next < sorted_f_positions.Count; ++next)
            {
                int f_start = sorted_f_positions[next - 1];
                int f_end   = sorted_f_positions[next];

                // The e-side positions matching the two f-side boundaries; computed but not yet used.
                int e_start = points[f_start];
                int e_end   = points[f_end];
            }
        }
Пример #5
0
        /// <summary>
        /// Demo driver: trains a fresh <c>IBMModel2</c> for 20 iterations, aligns one placeholder
        /// French/English sentence pair, and prints the aligned English words in ascending
        /// order of their position in the English sentence.
        /// </summary>
        /// <remarks>
        /// The training corpus list is created empty and nothing is added to it in this method.
        /// </remarks>
        public void RunEMTraining()
        {
            IBMModel2 model = new IBMModel2();
            List <EMTrainingRecord> training_corpus = new List <EMTrainingRecord>();

            EnglishTokenizer tokenizer_output = new EnglishTokenizer();
            FrenchTokenizer  tokenizer_input  = new FrenchTokenizer();

            EMTrainingMethod.Train(model, training_corpus, 20);

            string sentence_input  = "[Some French Sentence]";
            string sentence_output = "[Some English Sentence]";

            string[] french_tokens  = tokenizer_input.Tokenize(sentence_input);
            string[] english_tokens = tokenizer_output.Tokenize(sentence_output);
            int[]    alignment      = model.GetAlignment(french_tokens, english_tokens);

            // Collect each aligned English word once, keyed by its position in the sentence.
            Dictionary <int, string> aligned_words = new Dictionary <int, string>();
            for (int pos = 0; pos < french_tokens.Length; ++pos)
            {
                int english_pos = alignment[pos];
                aligned_words[english_pos] = english_tokens[english_pos];
            }

            // Emit the collected words in ascending English position.
            string[] predicted_output_lang = aligned_words
                .OrderBy(pair => pair.Key)
                .Select(pair => pair.Value)
                .ToArray();

            Console.WriteLine("Original French Sentence: {0}", sentence_input);
            Console.WriteLine("Predicted English Translation: {0}", string.Join(" ", predicted_output_lang));
        }
Пример #6
0
        /// <summary>
        /// Aligns the sentence pair in both directions and reports the positions on which the
        /// two alignments agree.
        /// </summary>
        /// <param name="model_f_to_e">Model queried with (<paramref name="ws_f"/>, <paramref name="ws_e"/>).</param>
        /// <param name="model_e_to_f">Model queried with (<paramref name="ws_e"/>, <paramref name="ws_f"/>).</param>
        /// <param name="ws_f">Words of the f-side sentence.</param>
        /// <param name="ws_e">Words of the e-side sentence.</param>
        /// <param name="intersection_points">
        /// Receives a new dictionary mapping f-side index <c>j</c> to e-side index <c>i</c> for
        /// every <c>j</c> where the f-to-e alignment of <c>j</c> is <c>i</c> and the e-to-f
        /// alignment of <c>i</c> is <c>j</c>.
        /// </param>
        public void GetAlignmentMatrix(IBMModel2 model_f_to_e, IBMModel2 model_e_to_f, string[] ws_f, string[] ws_e, out Dictionary <int, int> intersection_points)
        {
            int[] alignment_f_to_e = model_f_to_e.GetAlignment(ws_f, ws_e);
            int[] alignment_e_to_f = model_e_to_f.GetAlignment(ws_e, ws_f);
            int   m_f_len          = alignment_f_to_e.Length;
            int   l_e_len          = alignment_e_to_f.Length;

            intersection_points = new Dictionary <int, int>();

            // Each f-side position has exactly one candidate e-side index, so it can be checked
            // directly. The former m-by-l scratch matrix was filled in but never read or
            // returned, so it is no longer allocated.
            for (int j = 0; j < m_f_len; ++j)
            {
                int i = alignment_f_to_e[j];

                // The bounds check mirrors the old nested loop, which only considered 0 <= i < l_e_len.
                if (i >= 0 && i < l_e_len && alignment_e_to_f[i] == j)
                {
                    intersection_points[j] = i;
                }
            }
        }
        /// <summary>
        /// Trains <paramref name="model"/> from records that carry an explicit alignment: for
        /// every input position the matching translation and distortion counts are incremented
        /// by one and the corresponding model parameters are refreshed.
        /// </summary>
        /// <param name="model">Model whose translation and distortion maps are updated.</param>
        /// <param name="training_corpus">
        /// Records providing input words, output words and, per input position, the index of
        /// the aligned output word.
        /// </param>
        public static void Train(IBMModel2 model, IEnumerable <SimpleTrainingRecord> training_corpus)
        {
            foreach (SimpleTrainingRecord record in training_corpus)
            {
                string[] source_words = record.InputLang;
                string[] target_words = record.OutputLang;
                int[]    links        = record.Alignment;

                int source_count = source_words.Length;
                int target_count = target_words.Length;

                // One alignment entry is expected per input word.
                Debug.Assert(source_count == links.Length);

                for (int src = 0; src < source_count; ++src)
                {
                    int    tgt         = links[src];
                    string source_word = source_words[src];
                    string target_word = target_words[tgt];

                    // Translation counts: read both, then write both incremented by one.
                    double pair_count   = model.TranslationMap.GetCountOfInputWordAndOutputWord(source_word, target_word);
                    double target_total = model.TranslationMap.GetCountOfOutputWord(target_word);

                    model.TranslationMap.SetCountOfInputWordAndOutputWord(source_word, target_word, pair_count + 1);
                    model.TranslationMap.SetCountOfOutputWord(target_word, target_total + 1);
                    model.TranslationMap.UpdateProbabilityInputWordGivenOutputWord(source_word, target_word);

                    // Distortion counts: same read-then-increment pattern.
                    double position_count = model.DistortionMap.GetCountOfAlignmentIndexGivenAlignmentValueAndInputLenAndOutputLen(src, tgt, target_count, source_count);
                    double position_total = model.DistortionMap.GetCountOfAlignmentValueAndInputLenAndOutputLen(tgt, target_count, source_count);

                    model.DistortionMap.SetCountOfAlignmentIndexGivenAlignmentValueAndInputLenAndOutputLen(src, tgt, target_count, source_count, position_count + 1);
                    model.DistortionMap.SetCountOfAlignmentValueAndInputLenAndOutputLen(tgt, target_count, source_count, position_total + 1);
                    model.DistortionMap.UpdateDistortionParameter(tgt, src, source_count, target_count);
                }
            }
        }
        /// <summary>
        /// Runs <paramref name="maxIterations"/> passes of EM-style training over
        /// <paramref name="training_corpus"/>. For every record it computes, per input position,
        /// a posterior over output positions from the current translation probability times the
        /// current distortion parameter, then adds those posteriors to the translation and
        /// distortion counts and refreshes the affected parameters.
        /// </summary>
        /// <param name="model">Model whose translation and distortion maps are read and updated.</param>
        /// <param name="training_corpus">Sentence pairs; enumerated once per iteration.</param>
        /// <param name="maxIterations">Number of passes over the corpus.</param>
        /// <remarks>
        /// Counts are not reset by this method, so they accumulate across records and iterations.
        /// </remarks>
        public static void Train(IBMModel2 model, IEnumerable <EMTrainingRecord> training_corpus, int maxIterations)
        {
            for (int iteration = 0; iteration < maxIterations; ++iteration)
            {
                foreach (EMTrainingRecord rec in training_corpus)
                {
                    string[] input_lang  = rec.InputLang;
                    string[] output_lang = rec.OutputLang;

                    int m_input_len  = input_lang.Length;
                    int l_output_len = output_lang.Length;

                    // Phase 1: compute all posteriors from the parameters as they stand before
                    // this record is applied.
                    double[][] delta = new double[m_input_len][];
                    for (int j = 0; j < m_input_len; ++j)
                    {
                        double[] row        = new double[l_output_len];
                        string   word_input = input_lang[j];
                        double   row_sum    = 0;

                        for (int a_j = 0; a_j < l_output_len; ++a_j)
                        {
                            string word_output = output_lang[a_j];
                            double t           = model.TranslationMap.GetProbabilityInputWordGivenOutputWord(word_input, word_output);
                            double q           = model.DistortionMap.GetDistortionParameter(a_j, j, m_input_len, l_output_len);
                            double deltaVal    = t * q;
                            row[a_j] = deltaVal;
                            row_sum += deltaVal;
                        }

                        // Normalise per input position so each word's alignment posterior sums
                        // to 1. Previously one sum over the whole sentence was used, which
                        // scaled every word's expected counts by the other words' scores.
                        // A zero sum leaves the row at zero rather than producing NaN via 0/0.
                        if (row_sum > 0)
                        {
                            for (int a_j = 0; a_j < l_output_len; ++a_j)
                            {
                                row[a_j] /= row_sum;
                            }
                        }

                        delta[j] = row;
                    }

                    // Phase 2: fold the posteriors into the counts and refresh the parameters.
                    for (int j = 0; j < m_input_len; ++j)
                    {
                        string word_input = input_lang[j];

                        for (int a_j = 0; a_j < l_output_len; ++a_j)
                        {
                            double deltaVal    = delta[j][a_j];
                            string word_output = output_lang[a_j];

                            double count_input_output = model.TranslationMap.GetCountOfInputWordAndOutputWord(word_input, word_output);
                            double count_output       = model.TranslationMap.GetCountOfOutputWord(word_output);

                            model.TranslationMap.SetCountOfInputWordAndOutputWord(word_input, word_output, count_input_output + deltaVal);
                            model.TranslationMap.SetCountOfOutputWord(word_output, count_output + deltaVal);

                            model.TranslationMap.UpdateProbabilityInputWordGivenOutputWord(word_input, word_output);

                            double count_j_given_aj_l_m = model.DistortionMap.GetCountOfAlignmentIndexGivenAlignmentValueAndInputLenAndOutputLen(j, a_j, l_output_len, m_input_len);
                            double count_aj_l_m         = model.DistortionMap.GetCountOfAlignmentValueAndInputLenAndOutputLen(a_j, l_output_len, m_input_len);

                            model.DistortionMap.SetCountOfAlignmentIndexGivenAlignmentValueAndInputLenAndOutputLen(j, a_j, l_output_len, m_input_len, count_j_given_aj_l_m + deltaVal);
                            model.DistortionMap.SetCountOfAlignmentValueAndInputLenAndOutputLen(a_j, l_output_len, m_input_len, count_aj_l_m + deltaVal);

                            model.DistortionMap.UpdateDistortionParameter(a_j, j, m_input_len, l_output_len);
                        }
                    }
                }
            }
        }
Пример #9
0
 /// <summary>
 /// Trains both directional models by delegating to the four-argument
 /// <c>EMTrainingMethod.Train</c> overload.
 /// </summary>
 /// <param name="model_f_to_e">Model forwarded as the f-to-e model.</param>
 /// <param name="model_e_to_f">Model forwarded as the e-to-f model.</param>
 /// <param name="training_corpus_f_to_e">Corpus forwarded unchanged.</param>
 /// <param name="maxIterations">Iteration count forwarded unchanged.</param>
 public void InitializeIBMModels(IBMModel2 model_f_to_e, IBMModel2 model_e_to_f, IEnumerable <EMTrainingRecord> training_corpus_f_to_e, int maxIterations)
 {
     EMTrainingMethod.Train(
         model_f_to_e,
         model_e_to_f,
         training_corpus_f_to_e,
         maxIterations);
 }