Пример #1
0
        /// <summary>
        /// Updates the model's translation counts and probabilities from a corpus
        /// in which every record carries an explicit word alignment.
        /// </summary>
        /// <param name="model">Model whose <c>TranslationMap</c> is updated in place.</param>
        /// <param name="training_corpus">
        /// Records providing the input words, the output words and, for each input
        /// position <c>j</c>, the index <c>Alignment[j]</c> of the aligned output word.
        /// </param>
        public static void Train(IBMModel1 model, IEnumerable <SimpleTrainingRecord> training_corpus)
        {
            foreach (SimpleTrainingRecord rec in training_corpus)
            {
                string[] input_lang  = rec.InputLang;
                string[] output_lang = rec.OutputLang;
                int[]    a           = rec.Alignment;

                int m_input_len  = input_lang.Length;
                int l_output_len = output_lang.Length;

                // One alignment entry is expected per input word.
                Debug.Assert(m_input_len == a.Length);

                for (int j = 0; j < m_input_len; ++j)
                {
                    int a_j = a[j];

                    // The alignment is used as an index into the output sentence below,
                    // so surface a bad record in debug builds before the array access fails.
                    Debug.Assert(a_j >= 0 && a_j < l_output_len, "Alignment index is outside the output sentence.");

                    string word_input  = input_lang[j];
                    string word_output = output_lang[a_j];

                    double count_input_output = model.TranslationMap.GetCountOfInputWordAndOutputWord(word_input, word_output);
                    double count_output       = model.TranslationMap.GetCountOfOutputWord(word_output);

                    // Each aligned pair contributes exactly one observation to both counts.
                    model.TranslationMap.SetCountOfInputWordAndOutputWord(word_input, word_output, count_input_output + 1);
                    model.TranslationMap.SetCountOfOutputWord(word_output, count_output + 1);

                    // Refresh the stored probability for this pair after the counts changed.
                    model.TranslationMap.UpdateProbabilityInputWordGivenOutputWord(word_input, word_output);
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Assigns a random value from <c>mRandom.NextDouble()</c> as the translation
        /// probability of every (input word, output word) pair that co-occurs in a
        /// training record.
        /// </summary>
        /// <param name="model">Model whose <c>TranslationMap</c> receives the probabilities.</param>
        /// <param name="training_corpus">Sentence pairs whose word combinations are visited.</param>
        public static void Initialize(IBMModel1 model, IEnumerable <EMTrainingRecord> training_corpus)
        {
            foreach (EMTrainingRecord record in training_corpus)
            {
                string[] sourceWords = record.InputLang;
                string[] targetWords = record.OutputLang;

                int sourceCount = sourceWords.Length;
                int targetCount = targetWords.Length;

                for (int srcIdx = 0; srcIdx < sourceCount; ++srcIdx)
                {
                    string sourceWord = sourceWords[srcIdx];

                    for (int tgtIdx = 0; tgtIdx < targetCount; ++tgtIdx)
                    {
                        string targetWord = targetWords[tgtIdx];

                        // NOTE(review): the results of these two lookups are not used; the calls
                        // are kept in case the getters have side effects - confirm and remove.
                        model.TranslationMap.GetCountOfInputWordAndOutputWord(sourceWord, targetWord);
                        model.TranslationMap.GetCountOfOutputWord(targetWord);

                        double randomProbability = mRandom.NextDouble();
                        model.TranslationMap.SetProbabilityInputWordGivenOutputWord(sourceWord, targetWord, randomProbability);
                    }
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Runs <paramref name="maxIterations"/> passes over the corpus, distributing
        /// fractional counts for every (input word, output word) pair of each record
        /// according to the current translation probabilities and refreshing the
        /// probabilities from the updated counts.
        /// </summary>
        /// <param name="model">Model whose <c>TranslationMap</c> is updated in place.</param>
        /// <param name="training_corpus">Sentence pairs; enumerated once per iteration.</param>
        /// <param name="maxIterations">Number of passes over the corpus.</param>
        public static void Train(IBMModel1 model, IEnumerable <EMTrainingRecord> training_corpus, int maxIterations)
        {
            for (int iteration = 0; iteration < maxIterations; ++iteration)
            {
                foreach (EMTrainingRecord rec in training_corpus)
                {
                    string[] input_lang  = rec.InputLang;
                    string[] output_lang = rec.OutputLang;

                    int m_input_len  = input_lang.Length;
                    int l_output_len = output_lang.Length;

                    // Phase 1: compute all deltas for this record from the current probabilities,
                    // before any count/probability of this record is modified.
                    double[][] delta = new double[m_input_len][];
                    for (int j = 0; j < m_input_len; ++j)
                    {
                        double[] row        = new double[l_output_len];
                        string   word_input = input_lang[j];

                        // delta[j][a_j] = t(input_j | output_a_j) / sum over a' of t(input_j | output_a'):
                        // the normaliser is per input word, not one sum over the whole sentence pair.
                        double row_sum = 0;
                        for (int a_j = 0; a_j < l_output_len; ++a_j)
                        {
                            double t = model.TranslationMap.GetProbabilityInputWordGivenOutputWord(word_input, output_lang[a_j]);

                            row[a_j] = t;
                            row_sum += t;
                        }

                        // Skip the division when the row carries no probability mass;
                        // dividing by zero would put NaN into the counts.
                        if (row_sum > 0)
                        {
                            for (int a_j = 0; a_j < l_output_len; ++a_j)
                            {
                                row[a_j] /= row_sum;
                            }
                        }

                        delta[j] = row;
                    }

                    // Phase 2: add the fractional counts and refresh the affected probabilities.
                    for (int j = 0; j < m_input_len; ++j)
                    {
                        string word_input = input_lang[j];

                        for (int a_j = 0; a_j < l_output_len; ++a_j)
                        {
                            double deltaVal    = delta[j][a_j];
                            string word_output = output_lang[a_j];

                            double count_input_output = model.TranslationMap.GetCountOfInputWordAndOutputWord(word_input, word_output);
                            double count_output       = model.TranslationMap.GetCountOfOutputWord(word_output);

                            model.TranslationMap.SetCountOfInputWordAndOutputWord(word_input, word_output, count_input_output + deltaVal);
                            model.TranslationMap.SetCountOfOutputWord(word_output, count_output + deltaVal);

                            model.TranslationMap.UpdateProbabilityInputWordGivenOutputWord(word_input, word_output);
                        }
                    }
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Demo: trains a model with <c>SimpleTrainingMethod.Train</c>, aligns a fixed
        /// French sentence against a fixed English sentence, and prints the aligned
        /// English words in output-sentence order. Waits for a line of console input
        /// before returning.
        /// </summary>
        public void RunSimpleTraining()
        {
            IBMModel1.Helpers.IBMModel1 model = new IBMModel1.Helpers.IBMModel1();

            // NOTE(review): nothing is added to this list, so Train is given an empty corpus.
            List <SimpleTrainingRecord> training_corpus = new List <SimpleTrainingRecord>();

            EnglishTokenizer tokenizer_output = new EnglishTokenizer();
            FrenchTokenizer  tokenizer_input  = new FrenchTokenizer();

            SimpleTrainingMethod.Train(model, training_corpus);

            string sentence_input  = "la maison bleue";
            string sentence_output = "the blue house";

            string[] input_lang  = tokenizer_input.Tokenize(sentence_input);
            string[] output_lang = tokenizer_output.Tokenize(sentence_output);
            int[]    alignment   = model.GetAlignment(input_lang, output_lang);

            // Output position -> output word, for every output word some input word aligns to.
            Dictionary <int, string> output_mapping = new Dictionary <int, string>();
            int m_input_len = input_lang.Length;

            for (int j = 0; j < m_input_len; ++j)
            {
                // The alignment value is itself the index into the output sentence;
                // adding a growing offset would point at the wrong word and overrun the array.
                int a_j = alignment[j];
                if (a_j < 0 || a_j >= output_lang.Length)
                {
                    continue;
                }

                output_mapping[a_j] = output_lang[a_j];
            }

            // Emit the selected words in the order they appear in the output sentence.
            List <int> output_sentence_index_list = output_mapping.Keys.ToList();
            output_sentence_index_list.Sort();

            string[] predicted_output_lang = new string[output_sentence_index_list.Count];
            for (int i = 0; i < predicted_output_lang.Length; ++i)
            {
                predicted_output_lang[i] = output_mapping[output_sentence_index_list[i]];
            }

            Console.WriteLine("Original French Sentence: {0}", sentence_input);
            Console.WriteLine("Predicted English Translation: {0}", string.Join(" ", predicted_output_lang));
            Console.ReadLine();
        }