/// <summary>
/// Preprocesses the customization (fine-tuning) corpus and the validation corpus
/// with the segmentation models found in <c>customDir</c>, storing the results in
/// <c>spSource</c>, <c>spTarget</c>, <c>spValidSource</c> and <c>spValidTarget</c>.
/// Also records the extension of the source segmentation model file
/// (".spm" or ".bpe") in <c>segmentationMethod</c>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown by <c>Single</c> when <c>customDir</c> does not contain exactly one
/// source and exactly one target segmentation model file.
/// </exception>
private void PreprocessInput()
{
    // List the directory once; both lookups work on the same snapshot.
    FileInfo[] customDirFiles = this.customDir.GetFiles();

    // The patterns are anchored and the dot is escaped so that only the exact names
    // source.spm / source.bpe (resp. target.*) match. An unanchored pattern would also
    // accept names such as "source.spm.vocab", which would either make Single() throw
    // or yield a wrong Extension for segmentationMethod below.
    FileInfo sourceSegModel = customDirFiles.Single(
        x => Regex.IsMatch(x.Name, @"^source\.(spm|bpe)$"));
    FileInfo targetSegModel = customDirFiles.Single(
        x => Regex.IsMatch(x.Name, @"^target\.(spm|bpe)$"));

    this.segmentationMethod = sourceSegModel.Extension;

    // A target prefix is only passed on the source side, and only when the model has
    // more than one target language.
    // NOTE(review): presumably PreprocessLanguage uses it as a target-language token
    // for multilingual models - confirm against MarianHelper.
    var targetPrefix = this.model.TargetLanguages.Count > 1
        ? this.targetLanguage.OriginalCode
        : null;

    this.spSource = MarianHelper.PreprocessLanguage(
        this.customSource,
        this.customDir,
        this.sourceLanguage.OriginalCode,
        sourceSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs,
        targetPrefix);
    this.spTarget = MarianHelper.PreprocessLanguage(
        this.customTarget,
        this.customDir,
        this.targetLanguage.OriginalCode,
        targetSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs);

    // Concatenate the out-of-domain validation set with the in-domain validation set.
    ParallelFilePair tatoebaValidFileInfos = HelperFunctions.GetTatoebaFileInfos(
        this.sourceLanguage.ShortestIsoCode,
        this.targetLanguage.ShortestIsoCode);
    if (tatoebaValidFileInfos == null)
    {
        // No Tatoeba files for this language pair: use a generated stand-in set instead.
        tatoebaValidFileInfos = HelperFunctions.GenerateDummyOODValidSet(this.customDir);
    }

    ParallelFilePair combinedValid = new ParallelFilePair(
        tatoebaValidFileInfos,
        new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget),
        this.customDir.FullName,
        OpusCatMTEngineSettings.Default.OODValidSetSize);

    this.spValidSource = MarianHelper.PreprocessLanguage(
        combinedValid.Source,
        this.customDir,
        this.sourceLanguage.OriginalCode,
        sourceSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs,
        targetPrefix);
    this.spValidTarget = MarianHelper.PreprocessLanguage(
        combinedValid.Target,
        this.customDir,
        this.targetLanguage.OriginalCode,
        targetSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs);
}
/// <summary>
/// Preprocesses the customization (fine-tuning) corpus and the validation corpus
/// with the "source.spm" and "target.spm" files found in <c>customDir</c>, storing
/// the results in <c>spSource</c>, <c>spTarget</c>, <c>spValidSource</c> and
/// <c>spValidTarget</c>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown by <c>Single</c> when <c>customDir</c> does not contain exactly one match
/// for "source.spm" and exactly one match for "target.spm".
/// </exception>
private void PreprocessInput()
{
    var sourceSpm = this.customDir.GetFiles("source.spm").Single();
    var targetSpm = this.customDir.GetFiles("target.spm").Single();

    this.spSource = MarianHelper.PreprocessLanguage(
        this.customSource,
        this.customDir,
        this.sourceCode,
        sourceSpm,
        this.includePlaceholderTags,
        this.includeTagPairs);
    this.spTarget = MarianHelper.PreprocessLanguage(
        this.customTarget,
        this.customDir,
        this.targetCode,
        targetSpm,
        this.includePlaceholderTags,
        this.includeTagPairs);

    // Concatenate the out-of-domain validation set with the in-domain validation set.
    ParallelFilePair tatoebaValidFileInfos = HelperFunctions.GetTatoebaFileInfos(this.sourceCode, this.targetCode);
    if (tatoebaValidFileInfos == null)
    {
        // No Tatoeba files for this language pair: use a generated stand-in set so the
        // combined validation set can still be built instead of passing null along.
        tatoebaValidFileInfos = HelperFunctions.GenerateDummyOODValidSet(this.customDir);
    }

    ParallelFilePair combinedValid = new ParallelFilePair(
        tatoebaValidFileInfos,
        new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget),
        this.customDir.FullName,
        OpusCatMTEngineSettings.Default.OODValidSetSize);

    this.spValidSource = MarianHelper.PreprocessLanguage(
        combinedValid.Source,
        this.customDir,
        this.sourceCode,
        sourceSpm,
        this.includePlaceholderTags,
        this.includeTagPairs);
    this.spValidTarget = MarianHelper.PreprocessLanguage(
        combinedValid.Target,
        this.customDir,
        this.targetCode,
        targetSpm,
        this.includePlaceholderTags,
        this.includeTagPairs);
}