/// <summary>
/// Writes <paramref name="input"/> to a uniquely named file in the temp directory and,
/// unless the input is already preprocessed, passes that file through
/// MarianHelper.PreprocessLanguage using the model directory's "source.spm" file.
/// </summary>
/// <param name="input">Lines to write; each element becomes one line of the temp file.</param>
/// <param name="preprocessedInput">
/// When true, the freshly written file is returned without further processing.
/// </param>
/// <returns>
/// The raw temp file when <paramref name="preprocessedInput"/> is true, otherwise the file
/// returned by MarianHelper.PreprocessLanguage.
/// </returns>
internal FileInfo PreprocessInput(IEnumerable<string> input, Boolean preprocessedInput = false)
{
    // File name is "<new guid>.<SourceCode>", so every call gets its own file name.
    string rawFileName = $"{Guid.NewGuid()}.{this.SourceCode}";
    FileInfo rawFile = new FileInfo(Path.Combine(Path.GetTempPath(), rawFileName));

    using (StreamWriter writer = new StreamWriter(rawFile.FullName, append: true, encoding: Encoding.UTF8))
    {
        foreach (string sentence in input)
        {
            writer.WriteLine(sentence);
        }
    }

    if (preprocessedInput)
    {
        return rawFile;
    }

    // Single() throws unless the model directory holds exactly one "source.spm" match.
    FileInfo sourceSpmModel = this.modelDir.GetFiles("source.spm").Single();
    DirectoryInfo tempDir = new DirectoryInfo(Path.GetTempPath());

    return MarianHelper.PreprocessLanguage(
        rawFile,
        tempDir,
        this.SourceCode,
        sourceSpmModel,
        this.includePlaceholderTags,
        this.includeTagPairs);
}
/// <summary>
/// Preprocesses the customization corpus and the combined validation set with the
/// segmentation models found in <c>customDir</c>.
/// Sets <c>segmentationMethod</c>, <c>spSource</c>, <c>spTarget</c>, <c>spValidSource</c>
/// and <c>spValidTarget</c>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown by <c>Single</c> when <c>customDir</c> does not contain exactly one file matching
/// the source pattern or exactly one matching the target pattern.
/// </exception>
private void PreprocessInput()
{
    // List the directory once and reuse the listing for both lookups.
    FileInfo[] customDirFiles = this.customDir.GetFiles();

    // The dot is escaped so that only a literal "source.spm" / "source.bpe" (and the target
    // equivalents) match; an unescaped '.' would also accept names such as "source_spm",
    // whose Extension would not be a valid segmentation method.
    FileInfo sourceSegModel = customDirFiles.Single(x => Regex.IsMatch(x.Name, @"source\.(spm|bpe)"));
    FileInfo targetSegModel = customDirFiles.Single(x => Regex.IsMatch(x.Name, @"target\.(spm|bpe)"));

    // The model file's extension (including the leading dot) records the segmentation method.
    this.segmentationMethod = sourceSegModel.Extension;

    // Only set when the model has more than one target language; it is handed to
    // PreprocessLanguage for source-side files only.
    // NOTE(review): presumably a target-language prefix for multilingual models; confirm in MarianHelper.
    var targetPrefix = this.model.TargetLanguages.Count > 1 ? this.targetLanguage.OriginalCode : null;

    this.spSource = MarianHelper.PreprocessLanguage(
        this.customSource,
        this.customDir,
        this.sourceLanguage.OriginalCode,
        sourceSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs,
        targetPrefix);
    this.spTarget = MarianHelper.PreprocessLanguage(
        this.customTarget,
        this.customDir,
        this.targetLanguage.OriginalCode,
        targetSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs);

    //concatenate the out-of-domain validation set with the in-domain validation set
    ParallelFilePair tatoebaValidFileInfos = HelperFunctions.GetTatoebaFileInfos(
        this.sourceLanguage.ShortestIsoCode,
        this.targetLanguage.ShortestIsoCode);
    if (tatoebaValidFileInfos == null)
    {
        // No Tatoeba files were returned for this language pair: use a generated dummy set instead.
        tatoebaValidFileInfos = HelperFunctions.GenerateDummyOODValidSet(this.customDir);
    }

    ParallelFilePair combinedValid = new ParallelFilePair(
        tatoebaValidFileInfos,
        new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget),
        this.customDir.FullName,
        OpusCatMTEngineSettings.Default.OODValidSetSize);

    this.spValidSource = MarianHelper.PreprocessLanguage(
        combinedValid.Source,
        this.customDir,
        this.sourceLanguage.OriginalCode,
        sourceSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs,
        targetPrefix);
    this.spValidTarget = MarianHelper.PreprocessLanguage(
        combinedValid.Target,
        this.customDir,
        this.targetLanguage.OriginalCode,
        targetSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs);
}
/// <summary>
/// Writes <paramref name="input"/> to a file, optionally preprocesses it with the model's
/// source segmentation model, and starts "TranslateBatchSentencePiece.bat" in the background
/// to translate it.
/// </summary>
/// <param name="input">Source lines to translate.</param>
/// <param name="spOutput">
/// Output file; the batch script is given "<c>spOutput.FullName</c>.transandalign" as its
/// third argument, and both files are passed to <c>BatchProcess_Exited</c> when the process exits.
/// </param>
/// <param name="preprocessedInput">When true, the input file is passed to the script as is.</param>
/// <param name="storeTranslations">
/// When true, a handler calling <c>WriteToTranslationDb</c> is attached to <c>OutputReady</c>.
/// </param>
/// <returns>The process returned by MarianHelper.StartProcessInBackgroundWithRedirects.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown by <c>Single</c> when the model directory does not contain exactly one source
/// segmentation model (only looked up when <paramref name="preprocessedInput"/> is false).
/// </exception>
internal Process BatchTranslate(
    IEnumerable<string> input,
    FileInfo spOutput,
    Boolean preprocessedInput = false,
    Boolean storeTranslations = false)
{
    Log.Information($"Starting batch translator for model {this.SystemName}.");

    var srcFile = MarianHelper.LinesToFile(input, this.SourceCode);

    FileInfo spInput;
    if (!preprocessedInput)
    {
        // The dot is escaped so that only a literal "source.spm" / "source.bpe" matches;
        // an unescaped '.' would accept any character in that position.
        FileInfo sourceSegModel =
            this.modelDir.GetFiles().Single(x => Regex.IsMatch(x.Name, @"source\.(spm|bpe)"));

        // NOTE(review): the source file is preprocessed with this.TargetCode, while the
        // file itself was written with this.SourceCode above. This looks like it may be
        // unintended; confirm how MarianHelper.PreprocessLanguage uses the language code.
        spInput = MarianHelper.PreprocessLanguage(
            srcFile,
            new DirectoryInfo(Path.GetTempPath()),
            this.TargetCode,
            sourceSegModel,
            this.includePlaceholderTags,
            this.includeTagPairs);
    }
    else
    {
        spInput = srcFile;
    }

    //TODO: check the translation cache for translations beforehand, and only translate new
    //segments (also change translation cache to account for different decoder configs for
    //same systems, i.e. keep track of decoder settings)

    FileInfo transAndAlign = new FileInfo($"{spOutput.FullName}.transandalign");
    var args = $"\"{this.modelDir.FullName}\" \"{spInput.FullName}\" \"{transAndAlign.FullName}\" --log-level=info --quiet";

    if (storeTranslations)
    {
        // NOTE(review): a new handler is added on every call and no unsubscribe is visible
        // in this method; verify that repeated calls on the same instance do not accumulate handlers.
        this.OutputReady += (x, y) => this.WriteToTranslationDb(x, y, input, spInput, transAndAlign);
    }

    EventHandler exitHandler = (x, y) => BatchProcess_Exited(transAndAlign, spOutput, x, y);

    var cmd = "TranslateBatchSentencePiece.bat";
    var batchProcess = MarianHelper.StartProcessInBackgroundWithRedirects(cmd, args, exitHandler);

    return batchProcess;
}
/// <summary>
/// Preprocesses the customization corpus and the combined validation set with the
/// "source.spm" and "target.spm" models found in <c>customDir</c>.
/// Sets <c>spSource</c>, <c>spTarget</c>, <c>spValidSource</c> and <c>spValidTarget</c>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown by <c>Single</c> when <c>customDir</c> does not contain exactly one match for
/// "source.spm" or for "target.spm".
/// </exception>
private void PreprocessInput()
{
    var sourceSpm = this.customDir.GetFiles("source.spm").Single();
    var targetSpm = this.customDir.GetFiles("target.spm").Single();

    this.spSource = MarianHelper.PreprocessLanguage(
        this.customSource,
        this.customDir,
        this.sourceCode,
        sourceSpm,
        this.includePlaceholderTags,
        this.includeTagPairs);
    this.spTarget = MarianHelper.PreprocessLanguage(
        this.customTarget,
        this.customDir,
        this.targetCode,
        targetSpm,
        this.includePlaceholderTags,
        this.includeTagPairs);

    //concatenate the out-of-domain validation set with the in-domain validation set
    ParallelFilePair tatoebaValidFileInfos = HelperFunctions.GetTatoebaFileInfos(this.sourceCode, this.targetCode);
    if (tatoebaValidFileInfos == null)
    {
        // No Tatoeba files were returned for this language pair: fall back to a generated
        // dummy out-of-domain set rather than passing null into ParallelFilePair.
        tatoebaValidFileInfos = HelperFunctions.GenerateDummyOODValidSet(this.customDir);
    }

    ParallelFilePair combinedValid = new ParallelFilePair(
        tatoebaValidFileInfos,
        new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget),
        this.customDir.FullName,
        OpusCatMTEngineSettings.Default.OODValidSetSize);

    this.spValidSource = MarianHelper.PreprocessLanguage(
        combinedValid.Source,
        this.customDir,
        this.sourceCode,
        sourceSpm,
        this.includePlaceholderTags,
        this.includeTagPairs);
    this.spValidTarget = MarianHelper.PreprocessLanguage(
        combinedValid.Target,
        this.customDir,
        this.targetCode,
        targetSpm,
        this.includePlaceholderTags,
        this.includeTagPairs);
}