Exemplo n.º 1
0
        /// <summary>
        /// Runs <c>MarianHelper.PreprocessLanguage</c> over the custom source/target files and
        /// over a combined validation set, storing the results in <c>spSource</c>,
        /// <c>spTarget</c>, <c>spValidSource</c> and <c>spValidTarget</c>. Also records the
        /// extension of the source segmentation model in <c>segmentationMethod</c>.
        /// </summary>
        /// <exception cref="System.InvalidOperationException">
        /// Thrown by <c>Single</c> when <c>customDir</c> does not contain exactly one
        /// <c>source.spm</c>/<c>source.bpe</c> file and exactly one
        /// <c>target.spm</c>/<c>target.bpe</c> file.
        /// </exception>
        private void PreprocessInput()
        {
            // List the directory once; both lookups work on the same snapshot.
            FileInfo[] customDirFiles = this.customDir.GetFiles();

            // The patterns are anchored and the dot is escaped so that only the exact model
            // file names match. Unanchored, a leftover such as "source.spm.bak" would also
            // match and make Single() throw (or, if it were the only match, be picked up and
            // yield a wrong segmentationMethod below).
            FileInfo sourceSegModel =
                customDirFiles.Single(x => Regex.IsMatch(x.Name, @"^source\.(spm|bpe)$"));
            FileInfo targetSegModel =
                customDirFiles.Single(x => Regex.IsMatch(x.Name, @"^target\.(spm|bpe)$"));

            // Extension includes the leading dot, i.e. ".spm" or ".bpe".
            this.segmentationMethod = sourceSegModel.Extension;

            // Only supplied when the model has more than one target language, and only on the
            // source side. NOTE(review): presumably used as a target-language prefix token for
            // multilingual models - confirm in MarianHelper.PreprocessLanguage.
            var targetPrefix = this.model.TargetLanguages.Count > 1 ? this.targetLanguage.OriginalCode : null;

            this.spSource = MarianHelper.PreprocessLanguage(
                this.customSource,
                this.customDir,
                this.sourceLanguage.OriginalCode,
                sourceSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs,
                targetPrefix);
            this.spTarget = MarianHelper.PreprocessLanguage(
                this.customTarget,
                this.customDir,
                this.targetLanguage.OriginalCode,
                targetSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs);

            // Concatenate the out-of-domain validation set with the in-domain validation set.
            // When no Tatoeba files are available for this language pair, fall back to a
            // generated dummy out-of-domain set.
            ParallelFilePair tatoebaValidFileInfos =
                HelperFunctions.GetTatoebaFileInfos(this.sourceLanguage.ShortestIsoCode, this.targetLanguage.ShortestIsoCode)
                ?? HelperFunctions.GenerateDummyOODValidSet(this.customDir);

            ParallelFilePair combinedValid = new ParallelFilePair(
                tatoebaValidFileInfos,
                new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget),
                this.customDir.FullName,
                OpusCatMTEngineSettings.Default.OODValidSetSize);

            // The validation files go through the same preprocessing as the training files.
            this.spValidSource = MarianHelper.PreprocessLanguage(
                combinedValid.Source,
                this.customDir,
                this.sourceLanguage.OriginalCode,
                sourceSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs,
                targetPrefix);
            this.spValidTarget = MarianHelper.PreprocessLanguage(
                combinedValid.Target,
                this.customDir,
                this.targetLanguage.OriginalCode,
                targetSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Runs <c>MarianHelper.PreprocessLanguage</c> over the custom source/target files and
        /// over a combined validation set, storing the results in <c>spSource</c>,
        /// <c>spTarget</c>, <c>spValidSource</c> and <c>spValidTarget</c>.
        /// </summary>
        private void PreprocessInput()
        {
            // Single() throws unless customDir holds exactly one file matching each name.
            var sourceModelFile = this.customDir.GetFiles("source.spm").Single();
            var targetModelFile = this.customDir.GetFiles("target.spm").Single();

            // Custom source/target files.
            this.spSource = MarianHelper.PreprocessLanguage(
                this.customSource,
                this.customDir,
                this.sourceCode,
                sourceModelFile,
                this.includePlaceholderTags,
                this.includeTagPairs);
            this.spTarget = MarianHelper.PreprocessLanguage(
                this.customTarget,
                this.customDir,
                this.targetCode,
                targetModelFile,
                this.includePlaceholderTags,
                this.includeTagPairs);

            // Validation data: the out-of-domain (Tatoeba) pair is combined with the
            // in-domain pair into a single validation set.
            ParallelFilePair outOfDomainValid = HelperFunctions.GetTatoebaFileInfos(this.sourceCode, this.targetCode);
            ParallelFilePair inDomainValid = new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget);
            ParallelFilePair mergedValid = new ParallelFilePair(
                outOfDomainValid,
                inDomainValid,
                this.customDir.FullName,
                OpusCatMTEngineSettings.Default.OODValidSetSize);

            // The merged validation files go through the same preprocessing as the custom files.
            this.spValidSource = MarianHelper.PreprocessLanguage(
                mergedValid.Source,
                this.customDir,
                this.sourceCode,
                sourceModelFile,
                this.includePlaceholderTags,
                this.includeTagPairs);
            this.spValidTarget = MarianHelper.PreprocessLanguage(
                mergedValid.Target,
                this.customDir,
                this.targetCode,
                targetModelFile,
                this.includePlaceholderTags,
                this.includeTagPairs);
        }