/// <summary>
/// Passes every line of <paramref name="languageFile"/> through
/// <c>MarianHelper.PreprocessLine</c>, writes the result to a "preprocessed_" file in
/// <paramref name="directory"/>, and then runs <c>spm_encode.exe</c> on that file with
/// <paramref name="spmModel"/>. The call waits for the encoder process to exit.
/// </summary>
/// <param name="languageFile">Raw text file, read line by line.</param>
/// <param name="directory">Directory that receives both generated files.</param>
/// <param name="languageCode">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
/// <param name="spmModel">Model file passed to the encoder via <c>--model</c>.</param>
/// <param name="includePlaceholderTags">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
/// <param name="includeTagPairs">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
/// <returns>The "sp_" file that was given to the encoder as its <c>--output</c> target.</returns>
internal static FileInfo PreprocessLanguage(
    FileInfo languageFile,
    DirectoryInfo directory,
    string languageCode,
    FileInfo spmModel,
    bool includePlaceholderTags,
    bool includeTagPairs)
{
    var preprocessedPath = Path.Combine(directory.FullName, $"preprocessed_{languageFile.Name}");
    var preprocessedFile = new FileInfo(preprocessedPath);

    //Marian doesn't like spaces in names
    var encodedName = "sp_" + languageFile.Name.Replace(" ", "_");
    var encodedFile = new FileInfo(Path.Combine(directory.FullName, encodedName));

    using (var reader = languageFile.OpenText())
    using (var writer = new StreamWriter(preprocessedFile.FullName))
    {
        for (var rawLine = reader.ReadLine(); rawLine != null; rawLine = reader.ReadLine())
        {
            writer.WriteLine(
                MarianHelper.PreprocessLine(rawLine, languageCode, includePlaceholderTags, includeTagPairs));
        }
    }

    // Every path is wrapped in double quotes on the encoder command line.
    var encoderArgs = string.Format(
        "\"{0}\" --model \"{1}\" --output \"{2}\"",
        preprocessedFile.FullName,
        spmModel.FullName,
        encodedFile.FullName);

    MarianHelper
        .StartProcessInBackgroundWithRedirects("Preprocessing\\spm_encode.exe", encoderArgs)
        .WaitForExit();

    return encodedFile;
}
/// <summary>
/// Starts <c>TranslateBatchSentencePiece.bat</c> as a background process for
/// <paramref name="input"/> and returns the started process without waiting for it.
/// </summary>
/// <param name="input">Source segments; handed to <c>PreprocessInput</c>.</param>
/// <param name="spOutput">
/// Output file. The batch script is told to write to this path with ".transandalign"
/// appended; both files are passed to <c>BatchProcess_Exited</c> when the process exits.
/// </param>
/// <param name="preprocessedInput">Forwarded unchanged to <c>PreprocessInput</c>.</param>
/// <param name="storeTranslations">
/// When true, a handler calling <c>WriteToTranslationDb</c> is attached to <c>OutputReady</c>.
/// </param>
/// <returns>The process returned by <c>MarianHelper.StartProcessInBackgroundWithRedirects</c>.</returns>
internal Process BatchTranslate(
    IEnumerable<string> input,
    FileInfo spOutput,
    Boolean preprocessedInput = false,
    Boolean storeTranslations = false)
{
    if (storeTranslations)
    {
        // NOTE(review): this handler is never detached here, so repeated calls with
        // storeTranslations=true add one handler each - confirm OutputReady is reset elsewhere.
        this.OutputReady += (x, y) => this.WriteToTranslationDb(x, y, input, spOutput);
    }

    Log.Information($"Starting batch translator for model {this.SystemName}.");

    var cmd = "TranslateBatchSentencePiece.bat";

    FileInfo spInput = this.PreprocessInput(input, preprocessedInput);

    //TODO: check the translation cache for translations beforehand, and only translate new
    //segments (also change translation cache to account for different decoder configs for
    //same systems, i.e. keep track of decoder settings)

    FileInfo transAndAlign = new FileInfo($"{spOutput.FullName}.transandalign");

    // Each path is quoted so that a directory or file name containing spaces is still
    // passed to the batch script as a single argument.
    var args = $"\"{this.modelDir.FullName}\" \"{spInput.FullName}\" \"{transAndAlign.FullName}\" --log-level=info --quiet";

    EventHandler exitHandler = (x, y) => BatchProcess_Exited(transAndAlign, spOutput, x, y);

    var batchProcess = MarianHelper.StartProcessInBackgroundWithRedirects(cmd, args, exitHandler);
    return batchProcess;
}
/// <summary>
/// Writes <paramref name="input"/> to a file, optionally segments it with the model's
/// source segmentation model, and starts <c>TranslateBatchSentencePiece.bat</c> as a
/// background process. The started process is returned without waiting for it.
/// </summary>
/// <param name="input">Source segments; written to a file via <c>MarianHelper.LinesToFile</c>.</param>
/// <param name="spOutput">
/// Output file. The batch script is told to write to this path with ".transandalign"
/// appended; both files are passed to <c>BatchProcess_Exited</c> when the process exits.
/// </param>
/// <param name="preprocessedInput">
/// When true the written source file is used as-is; otherwise it is run through
/// <c>MarianHelper.PreprocessLanguage</c> into the system temp directory first.
/// </param>
/// <param name="storeTranslations">
/// When true, a handler calling <c>WriteToTranslationDb</c> is attached to <c>OutputReady</c>.
/// </param>
/// <returns>The process returned by <c>MarianHelper.StartProcessInBackgroundWithRedirects</c>.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown by <c>Single</c> when the model directory does not contain exactly one
/// source.spm / source.bpe file and preprocessing is requested.
/// </exception>
internal Process BatchTranslate(
    IEnumerable<string> input,
    FileInfo spOutput,
    Boolean preprocessedInput = false,
    Boolean storeTranslations = false)
{
    Log.Information($"Starting batch translator for model {this.SystemName}.");

    var srcFile = MarianHelper.LinesToFile(input, this.SourceCode);

    FileInfo spInput;
    if (!preprocessedInput)
    {
        // The dot is escaped and the pattern anchored at the end so that only files whose
        // name really ends in "source.spm" or "source.bpe" qualify. Stray files such as
        // "source.spm.bak" no longer count as a second match or get picked by mistake.
        FileInfo sourceSegModel = this.modelDir
            .GetFiles()
            .Single(x => Regex.IsMatch(x.Name, @"source\.(spm|bpe)$"));

        // NOTE(review): the source file is preprocessed with this.TargetCode as the
        // language code, although it was written with this.SourceCode - confirm intended.
        spInput = MarianHelper.PreprocessLanguage(
            srcFile,
            new DirectoryInfo(Path.GetTempPath()),
            this.TargetCode,
            sourceSegModel,
            this.includePlaceholderTags,
            this.includeTagPairs);
    }
    else
    {
        spInput = srcFile;
    }

    //TODO: check the translation cache for translations beforehand, and only translate new
    //segments (also change translation cache to account for different decoder configs for
    //same systems, i.e. keep track of decoder settings)

    FileInfo transAndAlign = new FileInfo($"{spOutput.FullName}.transandalign");

    var args = $"\"{this.modelDir.FullName}\" \"{spInput.FullName}\" \"{transAndAlign.FullName}\" --log-level=info --quiet";

    if (storeTranslations)
    {
        // NOTE(review): this handler is never detached here, so repeated calls with
        // storeTranslations=true add one handler each - confirm OutputReady is reset elsewhere.
        this.OutputReady += (x, y) => this.WriteToTranslationDb(x, y, input, spInput, transAndAlign);
    }

    EventHandler exitHandler = (x, y) => BatchProcess_Exited(transAndAlign, spOutput, x, y);

    var cmd = "TranslateBatchSentencePiece.bat";
    var batchProcess = MarianHelper.StartProcessInBackgroundWithRedirects(cmd, args, exitHandler);
    return batchProcess;
}
/// <summary>
/// Runs two external steps one after the other, waiting for each to exit:
/// first <c>python Alignment\align.py</c>, which is asked to write "&lt;alignmentFile&gt;.fwd"
/// and "&lt;alignmentFile&gt;.rev", then <c>Alignment\atools.exe</c> with
/// <c>-c grow-diag-final</c> over those two files, with its output redirected to
/// <paramref name="alignmentFile"/>.
/// </summary>
/// <param name="spSource">Source-side file passed to align.py with <c>-s</c>.</param>
/// <param name="spTarget">Target-side file passed to align.py with <c>-t</c>.</param>
/// <param name="alignmentFile">Destination of the symmetrised alignment; also the base name of the .fwd/.rev files.</param>
/// <param name="priorsFile">Not used by this method body.</param>
internal static void GenerateAlignments(FileInfo spSource, FileInfo spTarget, FileInfo alignmentFile, FileInfo priorsFile)
{
    // Step 1: forward and reverse alignments.
    var directionalArgs = string.Format(
        "-s \"{0}\" -t \"{1}\" -f \"{2}.fwd\" -r \"{2}.rev\"",
        spSource.FullName,
        spTarget.FullName,
        alignmentFile.FullName);
    Log.Information("Aligning fine-tuning corpus with args " + directionalArgs);
    MarianHelper
        .StartProcessInBackgroundWithRedirects("python Alignment\\align.py", directionalArgs)
        .WaitForExit();

    // Step 2: combine both directions. The ">" redirect is part of the argument string,
    // exactly as before.
    var combineArgs = string.Format(
        "-c grow-diag-final -i \"{0}.fwd\" -j \"{0}.rev\" > \"{0}\"",
        alignmentFile.FullName);
    Log.Information("Symmetrisizing alignment with args " + combineArgs);
    MarianHelper
        .StartProcessInBackgroundWithRedirects("Alignment\\atools.exe", combineArgs)
        .WaitForExit();
}
/// <summary>
/// Loads the customization base config from <c>customDir</c>, stores the deserialized
/// config on <c>trainingLog.TrainingConfig</c>, and launches <c>marian.exe</c> from the
/// configured Marian directory with that config file.
/// </summary>
/// <returns>
/// The process returned by <c>MarianHelper.StartProcessInBackgroundWithRedirects</c>,
/// started with <c>MarianExitHandler</c> and <c>MarianProgressHandler</c>.
/// </returns>
private Process StartTraining()
{
    var settings = OpusCatMTEngineSettings.Default;
    var baseConfigPath = Path.Combine(this.customDir.FullName, settings.CustomizationBaseConfig);

    var configDeserializer = new Deserializer();
    MarianTrainerConfig loadedConfig;
    using (var configReader = new StreamReader(baseConfigPath))
    {
        loadedConfig = configDeserializer.Deserialize<MarianTrainerConfig>(configReader);
    }

    this.trainingLog.TrainingConfig = loadedConfig;

    // Runs at info log level; earlier variants used --log-level=warn or added --quiet.
    var marianArgs = "--config \"" + baseConfigPath + "\" --log-level=info";
    var marianExecutable = Path.Combine(OpusCatMTEngineSettings.Default.MarianDir, "marian.exe");

    return MarianHelper.StartProcessInBackgroundWithRedirects(
        marianExecutable,
        marianArgs,
        this.MarianExitHandler,
        this.MarianProgressHandler);
}
/// <summary>
/// Passes every line of <paramref name="languageFile"/> through
/// <c>MarianHelper.PreprocessLine</c>, writes the result to a "preprocessed_" file in
/// <paramref name="directory"/>, segments that file with the tool matching the extension of
/// <paramref name="segmentationModel"/> (".spm" or ".bpe"), and optionally prepends a
/// target-language token to every segmented line. Waits for the external tool to exit.
/// </summary>
/// <param name="languageFile">Raw text file, read line by line.</param>
/// <param name="directory">Directory that receives all generated files.</param>
/// <param name="languageCode">
/// Forwarded to <c>MarianHelper.PreprocessLine</c> and, for ".bpe" models, to the Moses/BPE
/// batch script.
/// </param>
/// <param name="segmentationModel">Segmentation model; its extension selects the tool.</param>
/// <param name="includePlaceholderTags">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
/// <param name="includeTagPairs">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
/// <param name="targetLanguageToPrefix">
/// When not null, a "prefix_" copy of the segmented file is written in which each line
/// starts with the target-language token, and that copy is returned instead.
/// </param>
/// <returns>The "prefix_" file when a prefix was requested, otherwise the "seg_" file.</returns>
/// <exception cref="Exception">
/// Thrown when the segmentation model extension is neither ".spm" nor ".bpe".
/// </exception>
internal static FileInfo PreprocessLanguage(
    FileInfo languageFile,
    DirectoryInfo directory,
    string languageCode,
    FileInfo segmentationModel,
    bool includePlaceholderTags,
    bool includeTagPairs,
    string targetLanguageToPrefix = null)
{
    var preprocessedFile = new FileInfo(Path.Combine(directory.FullName, $"preprocessed_{languageFile.Name}"));

    using (var rawFile = languageFile.OpenText())
    using (var preprocessedWriter = new StreamWriter(preprocessedFile.FullName))
    {
        String line;
        while ((line = rawFile.ReadLine()) != null)
        {
            var preprocessedLine = MarianHelper.PreprocessLine(line, languageCode, includePlaceholderTags, includeTagPairs);
            preprocessedWriter.WriteLine(preprocessedLine);
        }
    }

    //Marian doesn't like spaces in names
    var segmentedFile = new FileInfo(Path.Combine(directory.FullName, $"seg_{languageFile.Name.Replace(" ", "_")}"));

    switch (segmentationModel.Extension)
    {
        case ".spm":
        {
            var spArgs = $"\"{preprocessedFile.FullName}\" --model \"{segmentationModel.FullName}\" --output \"{segmentedFile.FullName}\"";
            var segmentationProcess = MarianHelper.StartProcessInBackgroundWithRedirects("Preprocessing\\spm_encode.exe", spArgs);
            segmentationProcess.WaitForExit();
            break;
        }
        case ".bpe":
        {
            //Truecasing is not used in any models, so this is a dummy tc model (empty). So it does not
            //matter if source.tcmodel is used for target language.
            var tcModelPath = $@"{directory.FullName}\source.tcmodel";

            // The input of "type" and the ">" redirect target are quoted like the other
            // paths. Only the file name has its spaces replaced, and the preprocessed file
            // keeps its spaces, so either full path may contain them.
            var mosesProcess = MarianHelper.StartProcessInBackgroundWithRedirects(
                $"type \"{preprocessedFile.FullName}\" | Preprocessing\\StartMosesBpePreprocessPipe.bat {languageCode} \"{tcModelPath}\" \"{segmentationModel.FullName}\" > \"{segmentedFile.FullName}\"");
            mosesProcess.WaitForExit();
            break;
        }
        default:
            throw new Exception("No segmentation model found");
    }

    if (targetLanguageToPrefix == null)
    {
        return segmentedFile;
    }

    var segmentedWithTargetPrefix = new FileInfo(Path.Combine(directory.FullName, $"prefix_{languageFile.Name.Replace(" ", "_")}"));

    using (var segFile = segmentedFile.OpenText())
    using (var prefixWriter = new StreamWriter(segmentedWithTargetPrefix.FullName))
    {
        String line;
        while ((line = segFile.ReadLine()) != null)
        {
            // Each segmented line gets the target-language token (">>code<< ") in front.
            var prefixedLine = $">>{targetLanguageToPrefix}<< {line}";
            prefixWriter.WriteLine(prefixedLine);
        }
    }

    return segmentedWithTargetPrefix;
}