Beispiel #1
0
        /// <summary>
        /// Tunes the model weights on the tuning corpus and stores the tuned, frozen result in
        /// <c>Parameters</c>.
        /// </summary>
        /// <param name="tuneTMPrefix">Translation model file name prefix used while tuning.</param>
        /// <param name="tuneLMPrefix">Language model file name prefix used while tuning.</param>
        /// <param name="tuneSourceCorpus">Source side of the tuning corpus.</param>
        /// <param name="tuneTargetCorpus">Target side of the tuning corpus.</param>
        /// <param name="reporter">Progress reporter; a step is reported before any work is done.</param>
        private void TuneTranslationModel(string tuneTMPrefix, string tuneLMPrefix,
            IReadOnlyList<IReadOnlyList<string>> tuneSourceCorpus,
            IReadOnlyList<IReadOnlyList<string>> tuneTargetCorpus, ThotTrainProgressReporter reporter)
        {
            reporter.Step("Tuning translation model");

            // Without tuning data the current parameters are left untouched.
            if (tuneSourceCorpus.Count < 1)
            {
                return;
            }

            // Filter the tuning phrase table before handing it to the tuner.
            string ttablePath = $"{tuneTMPrefix}.ttable";
            FilterPhraseTableUsingCorpus(ttablePath, tuneSourceCorpus);
            FilterPhraseTableNBest(ttablePath, 20);

            ThotSmtParameters previous = Parameters;

            // Starting point: the current settings redirected to the tuning models, with seven
            // unit weights followed by a zero weight.
            ThotSmtParameters start = previous.Clone();
            start.ModelWeights = new[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0f };
            start.LanguageModelFileNamePrefix = tuneLMPrefix;
            start.TranslationModelFileNamePrefix = tuneTMPrefix;
            start.Freeze();

            ThotSmtParameters tuned = _modelWeightTuner.Tune(start, tuneSourceCorpus, tuneTargetCorpus,
                reporter, Stats);

            // Keep the tuned values, but point back at the model files configured before tuning.
            ThotSmtParameters result = tuned.Clone();
            result.TranslationModelFileNamePrefix = previous.TranslationModelFileNamePrefix;
            result.LanguageModelFileNamePrefix = previous.LanguageModelFileNamePrefix;
            result.Freeze();
            Parameters = result;
        }
Beispiel #2
0
        /// <summary>
        /// Creates a batch trainer for the given parallel corpus and prepares a fresh temporary
        /// working directory for the training files.
        /// </summary>
        /// <param name="parameters">Model parameters; frozen by this constructor.</param>
        /// <param name="sourcePreprocessor">Function applied to source text.</param>
        /// <param name="targetPreprocessor">Function applied to target text.</param>
        /// <param name="corpus">Parallel corpus to train on.</param>
        /// <param name="maxCorpusCount">Upper bound stored for the corpus size; unlimited by default.</param>
        public ThotSmtBatchTrainer(ThotSmtParameters parameters, Func<string, string> sourcePreprocessor,
            Func<string, string> targetPreprocessor, ParallelTextCorpus corpus, int maxCorpusCount = int.MaxValue)
        {
            Parameters = parameters;
            Parameters.Freeze();

            _sourcePreprocessor = sourcePreprocessor;
            _targetPreprocessor = targetPreprocessor;
            _parallelCorpus = corpus;
            _maxCorpusCount = maxCorpusCount;

            // The simplex tuner is the active choice; a MiraModelWeightTuner alternative was left disabled.
            var simplexTuner = new SimplexModelWeightTuner();
            simplexTuner.ProgressIncrementInterval = 10;
            _modelWeightTuner = simplexTuner;

            // Runs after the corpus fields above have been assigned.
            _tuneCorpusIndices = CreateTuneCorpus();

            // Keep drawing GUID-based names until one does not exist yet, then create it.
            while (true)
            {
                _tempDir = Path.Combine(Path.GetTempPath(), "thot-train-" + Guid.NewGuid());
                if (!Directory.Exists(_tempDir))
                {
                    break;
                }
            }
            Directory.CreateDirectory(_tempDir);

            // Only the file-name part of each configured prefix is kept; training output goes
            // into sub-directories of the temporary directory.
            _trainLMDir = Path.Combine(_tempDir, "lm");
            _trainTMDir = Path.Combine(_tempDir, "tm_train");
            _lmFilePrefix = Path.GetFileName(Parameters.LanguageModelFileNamePrefix);
            _tmFilePrefix = Path.GetFileName(Parameters.TranslationModelFileNamePrefix);
        }
Beispiel #3
0
        /// <summary>
        /// Loads an SMT model for the given parameters and wraps its single word alignment
        /// models (direct and inverse).
        /// </summary>
        /// <param name="parameters">Model parameters; frozen by this constructor.</param>
        public ThotSmtModel(ThotSmtParameters parameters)
        {
            Parameters = parameters;
            Parameters.Freeze();
            Handle = Thot.LoadSmtModel(Parameters);

            // Both alignment models are obtained from the freshly loaded model handle.
            var directHandle = Thot.smtModel_getSingleWordAlignmentModel(Handle);
            _directWordAlignmentModel = new ThotWordAlignmentModel(directHandle);

            var inverseHandle = Thot.smtModel_getInverseSingleWordAlignmentModel(Handle);
            _inverseWordAlignmentModel = new ThotWordAlignmentModel(inverseHandle);
        }
Beispiel #4
0
        /// <summary>
        /// Searches for model weights with a Nelder-Mead simplex, using the value returned by
        /// <c>CalculateBleu</c> as the function to minimize.
        /// </summary>
        /// <remarks>
        /// Only the first seven weights are optimized; the weight at index 7 is carried over unchanged.
        /// </remarks>
        /// <param name="parameters">Starting parameters; they are cloned, never modified.</param>
        /// <param name="tuneSourceCorpus">Source side of the tuning corpus.</param>
        /// <param name="tuneTargetCorpus">Target side of the tuning corpus.</param>
        /// <param name="reporter">Receives a step or a cancellation check after every evaluation.</param>
        /// <param name="stats">Receives <c>1.0 - ErrorValue</c> of the search result as the BLEU score.</param>
        /// <returns>A frozen clone of <paramref name="parameters"/> carrying the minimizing weights.</returns>
        public ThotSmtParameters Tune(ThotSmtParameters parameters,
            IReadOnlyList<IReadOnlyList<string>> tuneSourceCorpus,
            IReadOnlyList<IReadOnlyList<string>> tuneTargetCorpus, ThotTrainProgressReporter reporter,
            SmtBatchTrainStats stats)
        {
            float fixedLastWeight = parameters.ModelWeights[7];
            int evaluations = 0;

            // Objective function handed to the simplex search.
            double Evaluate(Vector weights)
            {
                ThotSmtParameters candidate = parameters.Clone();
                candidate.ModelWeights = weights.Select(w => (float)w).Concat(fixedLastWeight).ToArray();
                candidate.Freeze();

                double score = CalculateBleu(candidate, tuneSourceCorpus, tuneTargetCorpus);

                evaluations++;

                // A step is reported every ProgressIncrementInterval evaluations, but not once the
                // evaluation budget has been reached; the interval test comes last so that a
                // non-positive interval never reaches the modulo.
                bool reportStep = evaluations < MaxFunctionEvaluations
                    && ProgressIncrementInterval > 0
                    && evaluations % ProgressIncrementInterval == 0;
                if (!reportStep)
                {
                    reporter.CheckCanceled();
                }
                else
                {
                    reporter.Step();
                }

                return score;
            }

            var optimizer = new NelderMeadSimplex(ConvergenceTolerance, MaxFunctionEvaluations, 1.0);
            MinimizationResult outcome = optimizer.FindMinimum(Evaluate,
                parameters.ModelWeights.Take(7).Select(w => (double)w));

            stats.TranslationModelBleu = 1.0 - outcome.ErrorValue;

            ThotSmtParameters tuned = parameters.Clone();
            tuned.ModelWeights = outcome.MinimizingPoint.Select(w => (float)w).Concat(fixedLastWeight).ToArray();
            tuned.Freeze();
            return tuned;
        }
Beispiel #5
0
        /// <summary>
        /// Searches for model weights with a Nelder-Mead simplex, using the value returned by
        /// <c>CalculateBleu</c> as the function to minimize, and reports progress through
        /// <paramref name="progress"/>.
        /// </summary>
        /// <remarks>
        /// Only the first seven weights are optimized; the weight at index 7 is carried over unchanged.
        /// All progress reports use <c>MaxProgressFunctionEvaluations</c> as the total step count.
        /// </remarks>
        /// <param name="parameters">Starting parameters; they are cloned, never modified.</param>
        /// <param name="tuneSourceCorpus">Source side of the tuning corpus.</param>
        /// <param name="tuneTargetCorpus">Target side of the tuning corpus.</param>
        /// <param name="stats">Receives <c>1.0 - ErrorValue</c> of the search result as the BLEU score.</param>
        /// <param name="progress">Receives a progress status after every function evaluation.</param>
        /// <returns>A frozen clone of <paramref name="parameters"/> carrying the minimizing weights.</returns>
        public ThotSmtParameters Tune(ThotSmtParameters parameters,
            IReadOnlyList<IReadOnlyList<string>> tuneSourceCorpus,
            IReadOnlyList<IReadOnlyList<string>> tuneTargetCorpus, SmtBatchTrainStats stats,
            IProgress<ProgressStatus> progress)
        {
            float sentLenWeight = parameters.ModelWeights[7];
            int numFuncEvals = 0;

            // Objective function handed to the simplex search.
            double Evaluate(Vector weights)
            {
                ThotSmtParameters newParameters = parameters.Clone();
                newParameters.ModelWeights = weights.Select(w => (float)w).Concat(sentLenWeight).ToArray();
                newParameters.Freeze();
                double quality = CalculateBleu(newParameters, tuneSourceCorpus, tuneTargetCorpus);

                numFuncEvals++;

                // Clamp so the reported step never exceeds the reported total.
                int currentStep = Math.Min(numFuncEvals, MaxProgressFunctionEvaluations);
                progress.Report(new ProgressStatus(currentStep, MaxProgressFunctionEvaluations));
                return quality;
            }

            // Use the same total as every later report; the original passed MaxFunctionEvaluations
            // here, so the reported step count changed after the first evaluation.
            progress.Report(new ProgressStatus(0, MaxProgressFunctionEvaluations));

            var simplex = new NelderMeadSimplex(ConvergenceTolerance, MaxFunctionEvaluations, 1.0);
            MinimizationResult result = simplex.FindMinimum(Evaluate,
                parameters.ModelWeights.Select(w => (double)w).Take(7));

            stats.TranslationModelBleu = 1.0 - result.ErrorValue;

            ThotSmtParameters bestParameters = parameters.Clone();
            bestParameters.ModelWeights = result.MinimizingPoint.Select(w => (float)w).Concat(sentLenWeight).ToArray();
            bestParameters.Freeze();

            // If the search stopped before the progress budget was used up, report completion explicitly.
            if (result.FunctionEvaluationCount < MaxProgressFunctionEvaluations)
            {
                progress.Report(new ProgressStatus(1.0));
            }
            return bestParameters;
        }
Beispiel #6
0
        /// <summary>
        /// Iteratively tunes the model weights: each iteration decodes n-best lists with the current
        /// weights, scores the top translations with BLEU, and then lets the native weight updater
        /// adjust the weights from the accumulated n-best lists.
        /// </summary>
        /// <param name="parameters">Starting parameters; they are cloned, never modified.</param>
        /// <param name="tuneSourceCorpus">Source side of the tuning corpus.</param>
        /// <param name="tuneTargetCorpus">Target side of the tuning corpus.</param>
        /// <param name="stats">Receives the best BLEU score seen over all iterations.</param>
        /// <param name="progress">Receives a progress status at the start of every iteration.</param>
        /// <returns>The frozen parameters of the iteration that achieved the best BLEU score.</returns>
        public ThotSmtParameters Tune(ThotSmtParameters parameters,
            IReadOnlyList<IReadOnlyList<string>> tuneSourceCorpus,
            IReadOnlyList<IReadOnlyList<string>> tuneTargetCorpus, SmtBatchTrainStats stats,
            IProgress<ProgressStatus> progress)
        {
            IntPtr weightUpdaterHandle = Thot.llWeightUpdater_create();

            try
            {
                var iterQualities = new List<double>();
                double bestQuality = double.MinValue;
                ThotSmtParameters bestParameters = null;
                int iter = 0;
                HashSet<TranslationInfo>[] curNBestLists = null;

                // Working buffer that is handed to UpdateWeights on every iteration.
                float[] curWeights = parameters.ModelWeights.ToArray();

                while (true)
                {
                    progress.Report(new ProgressStatus(iter, MaxIterations));

                    ThotSmtParameters newParameters = parameters.Clone();

                    // Give the snapshot its own copy of the weights. curWeights is passed to
                    // UpdateWeights below (presumably updated in place, since nothing is returned);
                    // sharing the array could let a later iteration overwrite the weights of the
                    // snapshot kept as bestParameters.
                    newParameters.ModelWeights = curWeights.ToArray();
                    newParameters.Freeze();

                    IList<TranslationInfo>[] nbestLists = GetNBestLists(newParameters, tuneSourceCorpus).ToArray();

                    // Score only the top entry of each n-best list.
                    double quality = Evaluation.CalculateBleu(nbestLists.Select(nbl => nbl.First().Translation),
                        tuneTargetCorpus);
                    iterQualities.Add(quality);
                    if (quality > bestQuality)
                    {
                        bestQuality = quality;
                        bestParameters = newParameters;
                    }

                    iter++;
                    if (iter >= MaxIterations || IsTuningConverged(iterQualities))
                    {
                        break;
                    }

                    // Accumulate the distinct translations seen so far for each segment.
                    if (curNBestLists == null)
                    {
                        curNBestLists = nbestLists.Select(nbl => new HashSet<TranslationInfo>(nbl)).ToArray();
                    }
                    else
                    {
                        for (int i = 0; i < nbestLists.Length; i++)
                        {
                            curNBestLists[i].UnionWith(nbestLists[i]);
                        }
                    }

                    UpdateWeights(weightUpdaterHandle, tuneTargetCorpus, curNBestLists, curWeights);
                }

                // If tuning stopped before the iteration limit, report completion explicitly.
                if (iter < MaxIterations)
                {
                    progress.Report(new ProgressStatus(1.0));
                }
                stats.TranslationModelBleu = bestQuality;
                return bestParameters;
            }
            finally
            {
                // Always release the native weight updater, even when tuning throws.
                Thot.llWeightUpdater_close(weightUpdaterHandle);
            }
        }