Exemplo n.º 1
0
        /// <summary>
        /// Produces the translation model files whose paths start with <paramref name="tmPrefix"/>.
        /// </summary>
        /// <remarks>
        /// The work happens in this order:
        /// <list type="number">
        /// <item>A word alignment model is generated under the "_invswm" prefix from the parallel corpus as-is.</item>
        /// <item>A second one is generated under the "_swm" prefix from the inverted corpus, with the two
        /// preprocessors swapped.</item>
        /// <item>The two ".bestal" files are handed to <c>Thot.giza_symmetr1</c>, which writes ".A3.final".</item>
        /// <item><c>Thot.phraseModel_generate</c> turns ".A3.final" into ".ttable".</item>
        /// <item>The ".ttable" file is passed through <c>FilterPhraseTableNBest</c>.</item>
        /// <item>Four small parameter files (".lambda", ".srcsegmlentable", ".trgcutstable",
        /// ".trgsegmlentable") are written with fixed contents.</item>
        /// </list>
        /// Steps 3 to 5 each run inside their own reporter phase.
        /// </remarks>
        /// <param name="tmPrefix">Path prefix shared by every file this method reads or writes.</param>
        /// <param name="reporter">Progress reporter; it is forwarded to the alignment model generation and
        /// advanced by one phase for each of steps 3 to 5.</param>
        private void TrainTranslationModel(string tmPrefix, ThotTrainProgressReporter reporter)
        {
            // All paths derived from the prefix are plain string concatenations, so build them up front.
            string invSwmBase = tmPrefix + "_invswm";
            string swmBase = tmPrefix + "_swm";
            string alignmentFile = tmPrefix + ".A3.final";
            string phraseTableFile = tmPrefix + ".ttable";

            GenerateWordAlignmentModel(invSwmBase, _sourcePreprocessor, _targetPreprocessor, _parallelCorpus,
                reporter, inverted: false);

            // The corpus is inverted only after the first model has been generated, exactly as before.
            GenerateWordAlignmentModel(swmBase, _targetPreprocessor, _sourcePreprocessor, _parallelCorpus.Invert(),
                reporter, inverted: true);

            // The phase objects below are only needed for their disposal, so they are not bound to a name.
            using (reporter.StartNextPhase())
            {
                Thot.giza_symmetr1(swmBase + ".bestal", invSwmBase + ".bestal", alignmentFile, true);
            }

            using (reporter.StartNextPhase())
            {
                Thot.phraseModel_generate(alignmentFile, 10, phraseTableFile);
            }

            using (reporter.StartNextPhase())
            {
                // NOTE(review): 20 is presumably the number of entries kept per phrase - confirm against
                // FilterPhraseTableNBest.
                FilterPhraseTableNBest(phraseTableFile, 20);
            }

            // Fixed parameter files that accompany the phrase table.
            File.WriteAllText(tmPrefix + ".lambda", "0.7 0.7");
            File.WriteAllText(tmPrefix + ".srcsegmlentable", "Uniform");
            File.WriteAllText(tmPrefix + ".trgcutstable", "0.999");
            File.WriteAllText(tmPrefix + ".trgsegmlentable", "Geometric");
        }
Exemplo n.º 2
0
        /// <summary>
        /// Trains a word alignment model under <paramref name="swmPrefix"/>, prunes its ".hmm_lexnd" table and
        /// writes the best alignments to the ".bestal" file with the same prefix.
        /// </summary>
        /// <remarks>
        /// Training and best-alignment generation each run inside their own reporter phase. Cancellation is
        /// checked once, between training and pruning; the pruning step is not wrapped in a phase.
        /// </remarks>
        /// <param name="swmPrefix">Path prefix of the word alignment model files.</param>
        /// <param name="sourcePreprocessor">Preprocessor forwarded for the source side.</param>
        /// <param name="targetPreprocessor">Preprocessor forwarded for the target side.</param>
        /// <param name="corpus">Parallel corpus forwarded to training and alignment generation.</param>
        /// <param name="reporter">Progress reporter that supplies the phases and the cancellation check.</param>
        /// <param name="inverted">Not read by this method's body; retained so existing callers keep compiling.</param>
        private void GenerateWordAlignmentModel(string swmPrefix, Func<string, string> sourcePreprocessor,
            Func<string, string> targetPreprocessor, ParallelTextCorpus corpus, ThotTrainProgressReporter reporter,
            bool inverted)
        {
            string lexTableFile = swmPrefix + ".hmm_lexnd";
            string bestAlignmentsFile = swmPrefix + ".bestal";

            using (PhaseProgress trainingPhase = reporter.StartNextPhase())
            {
                TrainWordAlignmentModel(swmPrefix, sourcePreprocessor, targetPreprocessor, corpus, trainingPhase);
            }

            // Give the caller a chance to abort before the remaining work.
            reporter.CheckCanceled();

            PruneLexTable(lexTableFile, 0.00001);

            using (PhaseProgress alignmentPhase = reporter.StartNextPhase())
            {
                GenerateBestAlignments(swmPrefix, bestAlignmentsFile, sourcePreprocessor, targetPreprocessor, corpus,
                    alignmentPhase);
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Runs the whole training sequence: language model training, translation model training, and then
        /// tuning of both models on the tuning segments.
        /// </summary>
        /// <remarks>
        /// The sequence is:
        /// <list type="number">
        /// <item>Create the language model and translation model training directories.</item>
        /// <item>Train the language model (one reporter phase), then the translation model.</item>
        /// <item>Check for cancellation.</item>
        /// <item>Create a "tm_tune" directory under the temp directory and copy the trained translation model
        /// files into it.</item>
        /// <item>Preprocess the source and target side of every tuning segment.</item>
        /// <item>Tune the language model, tune the translation model, and finally call
        /// <c>TrainTuneCorpus</c>, each in its own reporter phase.</item>
        /// </list>
        /// </remarks>
        /// <param name="progress">Optional progress sink handed to the progress reporter.</param>
        /// <param name="checkCanceled">Optional cancellation callback handed to the progress reporter.</param>
        public virtual void Train(IProgress<ProgressStatus> progress = null, Action checkCanceled = null)
        {
            var reporter = new ThotTrainProgressReporter(progress, checkCanceled);

            // Output locations for the trained models.
            Directory.CreateDirectory(_trainLMDir);
            string lmPrefix = Path.Combine(_trainLMDir, _lmFilePrefix);

            Directory.CreateDirectory(_trainTMDir);
            string tmPrefix = Path.Combine(_trainTMDir, _tmFilePrefix);

            // This phase object is only needed for its disposal, so it is not bound to a name.
            using (reporter.StartNextPhase())
            {
                TrainLanguageModel(lmPrefix, 3);
            }

            TrainTranslationModel(tmPrefix, reporter);

            reporter.CheckCanceled();

            // Tuning works on a copy of the trained translation model kept under the temp directory.
            string tuningDir = Path.Combine(_tempDir, "tm_tune");
            Directory.CreateDirectory(tuningDir);
            string tuningPrefix = Path.Combine(tuningDir, _tmFilePrefix);

            CopyFiles(_trainTMDir, tuningDir, _tmFilePrefix);

            // Collect the preprocessed tuning segments, source and target side in parallel lists.
            int tuneCount = _tuneCorpusIndices.Count;
            var tuneSources = new List<IReadOnlyList<string>>(tuneCount);
            var tuneTargets = new List<IReadOnlyList<string>>(tuneCount);

            foreach (ParallelTextSegment tuningSegment in GetTuningSegments(_parallelCorpus))
            {
                tuneSources.Add(tuningSegment.SourceSegment.Preprocess(_sourcePreprocessor));
                tuneTargets.Add(tuningSegment.TargetSegment.Preprocess(_targetPreprocessor));
            }

            using (reporter.StartNextPhase())
            {
                TuneLanguageModel(lmPrefix, tuneTargets, 3);
            }

            using (PhaseProgress tmTuningPhase = reporter.StartNextPhase())
            {
                TuneTranslationModel(tuningPrefix, lmPrefix, tuneSources, tuneTargets, tmTuningPhase);
            }

            using (PhaseProgress finalPhase = reporter.StartNextPhase())
            {
                TrainTuneCorpus(tmPrefix, lmPrefix, tuneSources, tuneTargets, finalPhase);
            }
        }