internal FileInfo PreprocessInput(IEnumerable <string> input, Boolean preprocessedInput = false)
        {
            var fileGuid = Guid.NewGuid();
            var srcFile  = new FileInfo(Path.Combine(Path.GetTempPath(), $"{fileGuid}.{this.SourceCode}"));

            using (var srcStream = new StreamWriter(srcFile.FullName, true, Encoding.UTF8))
            {
                foreach (var line in input)
                {
                    srcStream.WriteLine(line);
                }
            }

            FileInfo spSrcFile;

            if (!preprocessedInput)
            {
                var spmModel = this.modelDir.GetFiles("source.spm").Single();
                spSrcFile = MarianHelper.PreprocessLanguage(srcFile, new DirectoryInfo(Path.GetTempPath()), this.SourceCode, spmModel, this.includePlaceholderTags, this.includeTagPairs);
            }
            else
            {
                spSrcFile = srcFile;
            }
            return(spSrcFile);
        }
        private void PreprocessInput()
        {
            FileInfo sourceSegModel =
                this.customDir.GetFiles().Where(x => Regex.IsMatch(x.Name, "source.(spm|bpe)")).Single();
            FileInfo targetSegModel =
                this.customDir.GetFiles().Where(x => Regex.IsMatch(x.Name, "target.(spm|bpe)")).Single();

            this.segmentationMethod = sourceSegModel.Extension;

            var targetPrefix = this.model.TargetLanguages.Count > 1 ? this.targetLanguage.OriginalCode : null;

            this.spSource = MarianHelper.PreprocessLanguage(
                this.customSource,
                this.customDir,
                this.sourceLanguage.OriginalCode,
                sourceSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs,
                targetPrefix);
            this.spTarget = MarianHelper.PreprocessLanguage(
                this.customTarget,
                this.customDir,
                this.targetLanguage.OriginalCode,
                targetSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs);

            //concatenate the out-of-domain validation set with the in-domain validation set
            ParallelFilePair tatoebaValidFileInfos =
                HelperFunctions.GetTatoebaFileInfos(this.sourceLanguage.ShortestIsoCode, this.targetLanguage.ShortestIsoCode);

            if (tatoebaValidFileInfos == null)
            {
                tatoebaValidFileInfos = HelperFunctions.GenerateDummyOODValidSet(this.customDir);
            }

            ParallelFilePair combinedValid = new ParallelFilePair(
                tatoebaValidFileInfos,
                new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget),
                this.customDir.FullName,
                OpusCatMTEngineSettings.Default.OODValidSetSize);

            this.spValidSource = MarianHelper.PreprocessLanguage(
                combinedValid.Source,
                this.customDir,
                this.sourceLanguage.OriginalCode,
                sourceSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs,
                targetPrefix);
            this.spValidTarget = MarianHelper.PreprocessLanguage(
                combinedValid.Target,
                this.customDir,
                this.targetLanguage.OriginalCode,
                targetSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs);
        }
        internal Process BatchTranslate(
            IEnumerable <string> input,
            FileInfo spOutput,
            Boolean preprocessedInput = false,
            Boolean storeTranslations = false)
        {
            Log.Information($"Starting batch translator for model {this.SystemName}.");

            var      srcFile = MarianHelper.LinesToFile(input, this.SourceCode);
            FileInfo spInput;

            if (!preprocessedInput)
            {
                FileInfo sourceSegModel =
                    this.modelDir.GetFiles().Where(x => Regex.IsMatch(x.Name, "source.(spm|bpe)")).Single();

                spInput = MarianHelper.PreprocessLanguage(
                    srcFile,
                    new DirectoryInfo(Path.GetTempPath()),
                    this.TargetCode,
                    sourceSegModel,
                    this.includePlaceholderTags,
                    this.includeTagPairs);
            }
            else
            {
                spInput = srcFile;
            }

            //TODO: check the translation cache for translations beforehand, and only translate new
            //segments (also change translation cache to account for different decoder configs for
            //same systems, i.e. keep track of decoder settings)

            FileInfo transAndAlign = new FileInfo($"{spOutput.FullName}.transandalign");
            var      args          = $"\"{this.modelDir.FullName}\" \"{spInput.FullName}\" \"{transAndAlign.FullName}\" --log-level=info --quiet";

            if (storeTranslations)
            {
                this.OutputReady += (x, y) => this.WriteToTranslationDb(x, y, input, spInput, transAndAlign);
            }

            EventHandler exitHandler = (x, y) => BatchProcess_Exited(transAndAlign, spOutput, x, y);

            var cmd          = "TranslateBatchSentencePiece.bat";
            var batchProcess = MarianHelper.StartProcessInBackgroundWithRedirects(cmd, args, exitHandler);


            return(batchProcess);
        }
Beispiel #4
0
        private void PreprocessInput()
        {
            var sourceSpm = this.customDir.GetFiles("source.spm").Single();
            var targetSpm = this.customDir.GetFiles("target.spm").Single();

            this.spSource = MarianHelper.PreprocessLanguage(this.customSource, this.customDir, this.sourceCode, sourceSpm, this.includePlaceholderTags, this.includeTagPairs);
            this.spTarget = MarianHelper.PreprocessLanguage(this.customTarget, this.customDir, this.targetCode, targetSpm, this.includePlaceholderTags, this.includeTagPairs);

            //concatenate the out-of-domain validation set with the in-domain validation set
            ParallelFilePair tatoebaValidFileInfos = HelperFunctions.GetTatoebaFileInfos(this.sourceCode, this.targetCode);
            ParallelFilePair combinedValid         = new ParallelFilePair(
                tatoebaValidFileInfos,
                new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget),
                this.customDir.FullName,
                OpusCatMTEngineSettings.Default.OODValidSetSize);

            this.spValidSource = MarianHelper.PreprocessLanguage(combinedValid.Source, this.customDir, this.sourceCode, sourceSpm, this.includePlaceholderTags, this.includeTagPairs);
            this.spValidTarget = MarianHelper.PreprocessLanguage(combinedValid.Target, this.customDir, this.targetCode, targetSpm, this.includePlaceholderTags, this.includeTagPairs);
        }