/// <summary>
/// Produces the translation model files that share the <paramref name="tmPrefix"/> path prefix.
/// </summary>
/// <remarks>
/// Steps, in order: two word alignment models are generated (prefix + "_invswm" on the corpus as
/// given, prefix + "_swm" on the inverted corpus with the preprocessors swapped); their ".bestal"
/// outputs are combined by <c>Thot.giza_symmetr1</c> into prefix + ".A3.final"; a phrase table is
/// generated from that file into prefix + ".ttable" and then filtered; finally four small fixed-content
/// parameter files are written next to the table.
/// </remarks>
/// <param name="tmPrefix">Path prefix (directory plus file-name stem) for every file written here.</param>
/// <param name="reporter">Progress reporter; one phase is started on it for each major step.</param>
private void TrainTranslationModel(string tmPrefix, ThotTrainProgressReporter reporter)
{
    // Model trained on the corpus exactly as supplied.
    string invSwmBase = tmPrefix + "_invswm";
    GenerateWordAlignmentModel(invSwmBase, _sourcePreprocessor, _targetPreprocessor, _parallelCorpus, reporter,
        false);

    // Model trained on the inverted corpus; the preprocessors are swapped to match.
    string swmBase = tmPrefix + "_swm";
    GenerateWordAlignmentModel(swmBase, _targetPreprocessor, _sourcePreprocessor, _parallelCorpus.Invert(),
        reporter, true);

    string alignmentsPath = tmPrefix + ".A3.final";
    string phraseTablePath = tmPrefix + ".ttable";

    // The phase objects below are only needed for their start/dispose bracketing, so no local is declared.
    using (reporter.StartNextPhase())
    {
        Thot.giza_symmetr1(swmBase + ".bestal", invSwmBase + ".bestal", alignmentsPath, true);
    }

    using (reporter.StartNextPhase())
    {
        // NOTE(review): 10 is presumably a maximum phrase length - confirm against the Thot API.
        Thot.phraseModel_generate(alignmentsPath, 10, phraseTablePath);
    }

    using (reporter.StartNextPhase())
    {
        FilterPhraseTableNBest(phraseTablePath, 20);
    }

    // Fixed-content companion files written alongside the phrase table.
    File.WriteAllText(tmPrefix + ".lambda", "0.7 0.7");
    File.WriteAllText(tmPrefix + ".srcsegmlentable", "Uniform");
    File.WriteAllText(tmPrefix + ".trgcutstable", "0.999");
    File.WriteAllText(tmPrefix + ".trgsegmlentable", "Geometric");
}
/// <summary>
/// Trains a word alignment model under <paramref name="swmPrefix"/>, prunes its lexical table file,
/// and then writes the best alignments for <paramref name="corpus"/> to prefix + ".bestal".
/// </summary>
/// <param name="swmPrefix">Path prefix for the word alignment model files.</param>
/// <param name="sourcePreprocessor">Preprocessor forwarded for the source side of each segment.</param>
/// <param name="targetPreprocessor">Preprocessor forwarded for the target side of each segment.</param>
/// <param name="corpus">Parallel corpus used for both training and alignment generation.</param>
/// <param name="reporter">Progress reporter; two phases are started on it and cancellation is checked once.</param>
/// <param name="inverted">Not read by this method body; kept as part of the existing signature.</param>
private void GenerateWordAlignmentModel(string swmPrefix, Func<string, string> sourcePreprocessor,
    Func<string, string> targetPreprocessor, ParallelTextCorpus corpus, ThotTrainProgressReporter reporter,
    bool inverted)
{
    string lexTablePath = swmPrefix + ".hmm_lexnd";
    string bestAlignmentsPath = swmPrefix + ".bestal";

    // Phase 1: train the model.
    using (PhaseProgress trainingPhase = reporter.StartNextPhase())
    {
        TrainWordAlignmentModel(swmPrefix, sourcePreprocessor, targetPreprocessor, corpus, trainingPhase);
    }

    // Give the caller a chance to abort before the remaining steps run.
    reporter.CheckCanceled();

    // Pruning happens between the two phases and is not reported as a phase of its own.
    PruneLexTable(lexTablePath, 0.00001);

    // Phase 2: write the best alignments for the same corpus.
    using (PhaseProgress alignmentPhase = reporter.StartNextPhase())
    {
        GenerateBestAlignments(swmPrefix, bestAlignmentsPath, sourcePreprocessor, targetPreprocessor, corpus,
            alignmentPhase);
    }
}
/// <summary>
/// Runs the full training pipeline: trains the language model and the translation model, builds the
/// tuning corpus, tunes both models, and finally trains on the tuning corpus.
/// </summary>
/// <param name="progress">Optional progress sink handed to the <c>ThotTrainProgressReporter</c>.</param>
/// <param name="checkCanceled">Optional callback handed to the <c>ThotTrainProgressReporter</c>.</param>
public virtual void Train(IProgress<ProgressStatus> progress = null, Action checkCanceled = null)
{
    var reporter = new ThotTrainProgressReporter(progress, checkCanceled);

    // Make sure both output directories exist and derive the file prefixes inside them.
    Directory.CreateDirectory(_trainLMDir);
    string lmPrefix = Path.Combine(_trainLMDir, _lmFilePrefix);
    Directory.CreateDirectory(_trainTMDir);
    string tmPrefix = Path.Combine(_trainTMDir, _tmFilePrefix);

    // The phase object is only needed for its start/dispose bracketing here.
    using (reporter.StartNextPhase())
    {
        TrainLanguageModel(lmPrefix, 3);
    }

    TrainTranslationModel(tmPrefix, reporter);

    reporter.CheckCanceled();

    // Tuning works on a copy of the trained translation model files in a temp directory.
    string tuneDir = Path.Combine(_tempDir, "tm_tune");
    Directory.CreateDirectory(tuneDir);
    string tunePrefix = Path.Combine(tuneDir, _tmFilePrefix);
    CopyFiles(_trainTMDir, tuneDir, _tmFilePrefix);

    // Collect the preprocessed source/target sides of every tuning segment.
    int tuneCount = _tuneCorpusIndices.Count;
    var tuneSources = new List<IReadOnlyList<string>>(tuneCount);
    var tuneTargets = new List<IReadOnlyList<string>>(tuneCount);
    foreach (ParallelTextSegment tuneSegment in GetTuningSegments(_parallelCorpus))
    {
        tuneSources.Add(tuneSegment.SourceSegment.Preprocess(_sourcePreprocessor));
        tuneTargets.Add(tuneSegment.TargetSegment.Preprocess(_targetPreprocessor));
    }

    using (reporter.StartNextPhase())
    {
        TuneLanguageModel(lmPrefix, tuneTargets, 3);
    }

    using (PhaseProgress tunePhase = reporter.StartNextPhase())
    {
        TuneTranslationModel(tunePrefix, lmPrefix, tuneSources, tuneTargets, tunePhase);
    }

    using (PhaseProgress finalPhase = reporter.StartNextPhase())
    {
        TrainTuneCorpus(tmPrefix, lmPrefix, tuneSources, tuneTargets, finalPhase);
    }
}