/// <summary>
/// Runs every line of a text file through <c>MarianHelper.PreprocessLine</c> and then
/// segments the result with <c>spm_encode.exe</c>.
/// </summary>
/// <param name="languageFile">Raw text file to process.</param>
/// <param name="directory">Directory that receives the intermediate and the segmented file.</param>
/// <param name="languageCode">Language code forwarded to <c>MarianHelper.PreprocessLine</c>.</param>
/// <param name="spmModel">Model file passed to the encoder with <c>--model</c>.</param>
/// <param name="includePlaceholderTags">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
/// <param name="includeTagPairs">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
/// <returns>
/// The <c>sp_</c>-prefixed file that was given to the encoder as its output path. The encoder's
/// exit status is not inspected here.
/// </returns>
internal static FileInfo PreprocessLanguage(
    FileInfo languageFile,
    DirectoryInfo directory,
    string languageCode,
    FileInfo spmModel,
    bool includePlaceholderTags,
    bool includeTagPairs)
{
    var cleanedFile = new FileInfo(
        Path.Combine(directory.FullName, $"preprocessed_{languageFile.Name}"));

    //Spaces in the output file name are replaced with underscores (Marian doesn't like them)
    string segmentedName = $"sp_{languageFile.Name.Replace(" ", "_")}";
    var segmentedFile = new FileInfo(Path.Combine(directory.FullName, segmentedName));

    //Step 1: line-by-line preprocessing into the intermediate file
    using (var reader = languageFile.OpenText())
    {
        using (var writer = new StreamWriter(cleanedFile.FullName))
        {
            for (String rawLine = reader.ReadLine(); rawLine != null; rawLine = reader.ReadLine())
            {
                writer.WriteLine(
                    MarianHelper.PreprocessLine(rawLine, languageCode, includePlaceholderTags, includeTagPairs));
            }
        }
    }

    //Step 2: segment the intermediate file and block until the encoder has exited
    var encoderArgs = $"\"{cleanedFile.FullName}\" --model \"{spmModel.FullName}\" --output \"{segmentedFile.FullName}\"";
    var encoder = MarianHelper.StartProcessInBackgroundWithRedirects("Preprocessing\\spm_encode.exe", encoderArgs);
    encoder.WaitForExit();

    return segmentedFile;
}
/// <summary>
/// Writes the input lines to a uniquely named file in the temp directory and, unless the
/// input is already preprocessed, runs that file through <c>MarianHelper.PreprocessLanguage</c>.
/// </summary>
/// <param name="input">Lines to write, one per output line.</param>
/// <param name="preprocessedInput">
/// When <c>true</c> the written file is returned as is; when <c>false</c> it is preprocessed
/// with the single <c>source.spm</c> file found in the model directory.
/// </param>
/// <returns>The file to hand to the translator.</returns>
internal FileInfo PreprocessInput(IEnumerable<string> input, Boolean preprocessedInput = false)
{
    //The file name is a fresh GUID with the source language code as extension
    Guid uniqueId = Guid.NewGuid();
    string rawPath = Path.Combine(Path.GetTempPath(), $"{uniqueId}.{this.SourceCode}");
    var rawFile = new FileInfo(rawPath);

    using (var writer = new StreamWriter(rawFile.FullName, true, Encoding.UTF8))
    {
        foreach (var sentence in input)
        {
            writer.WriteLine(sentence);
        }
    }

    //Already preprocessed input needs no further work
    if (preprocessedInput)
    {
        return rawFile;
    }

    FileInfo sentencePieceModel = this.modelDir.GetFiles("source.spm").Single();
    var tempDir = new DirectoryInfo(Path.GetTempPath());
    return MarianHelper.PreprocessLanguage(
        rawFile,
        tempDir,
        this.SourceCode,
        sentencePieceModel,
        this.includePlaceholderTags,
        this.includeTagPairs);
}
/// <summary>
/// Prepares the input with <c>PreprocessInput</c> and launches
/// <c>TranslateBatchSentencePiece.bat</c> on it.
/// </summary>
/// <remarks>
/// Callback can be used to do different things with translation output/input
/// (default is to save in translation cache).
/// </remarks>
/// <param name="input">Source lines to translate.</param>
/// <param name="spOutput">
/// Output file; the batch script is pointed at this path with <c>.transandalign</c> appended,
/// and both files are handed to <c>BatchProcess_Exited</c> when the process exits.
/// </param>
/// <param name="preprocessedInput">Forwarded to <c>PreprocessInput</c>.</param>
/// <param name="storeTranslations">
/// When <c>true</c>, a handler calling <c>WriteToTranslationDb</c> is added to <c>OutputReady</c>.
/// </param>
/// <returns>The process object returned by <c>MarianHelper.StartProcessInBackgroundWithRedirects</c>.</returns>
internal Process BatchTranslate(
    IEnumerable<string> input,
    FileInfo spOutput,
    Boolean preprocessedInput = false,
    Boolean storeTranslations = false)
{
    if (storeTranslations)
    {
        //NOTE(review): a new handler is added on every call and is not removed in this method
        this.OutputReady += (x, y) => this.WriteToTranslationDb(x, y, input, spOutput);
    }

    Log.Information($"Starting batch translator for model {this.SystemName}.");

    var cmd = "TranslateBatchSentencePiece.bat";

    FileInfo spInput = this.PreprocessInput(input, preprocessedInput);

    //TODO: check the translation cache for translations beforehand, and only translate new
    //segments (also change translation cache to account for different decoder configs for
    //same systems, i.e. keep track of decoder settings)

    FileInfo transAndAlign = new FileInfo($"{spOutput.FullName}.transandalign");

    //Every path is quoted so that a space in the model dir, temp dir or output path
    //does not split the path into several command-line arguments.
    var args = $"\"{this.modelDir.FullName}\" \"{spInput.FullName}\" \"{transAndAlign.FullName}\" --log-level=info --quiet";

    EventHandler exitHandler = (x, y) => BatchProcess_Exited(transAndAlign, spOutput, x, y);

    var batchProcess = MarianHelper.StartProcessInBackgroundWithRedirects(cmd, args, exitHandler);
    return batchProcess;
}
/// <summary>
/// Preprocesses and segments the fine-tuning corpus and the combined validation set,
/// storing the results in <c>spSource</c>, <c>spTarget</c>, <c>spValidSource</c> and
/// <c>spValidTarget</c>. Also records the segmentation model's extension in
/// <c>segmentationMethod</c>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown by <c>Single</c> when the customization directory does not contain exactly one
/// source and exactly one target segmentation model.
/// </exception>
private void PreprocessInput()
{
    //The file name must be exactly source/target + .spm or .bpe. The dot is escaped and the
    //pattern anchored, otherwise names such as "source_spm" or "source.spm.bak" would also
    //match and make Single() throw.
    FileInfo[] customDirFiles = this.customDir.GetFiles();
    FileInfo sourceSegModel = customDirFiles.Single(x => Regex.IsMatch(x.Name, @"^source\.(spm|bpe)$"));
    FileInfo targetSegModel = customDirFiles.Single(x => Regex.IsMatch(x.Name, @"^target\.(spm|bpe)$"));

    this.segmentationMethod = sourceSegModel.Extension;

    //A target language prefix is only passed on when the model has more than one target language
    var targetPrefix = this.model.TargetLanguages.Count > 1 ? this.targetLanguage.OriginalCode : null;

    this.spSource = MarianHelper.PreprocessLanguage(
        this.customSource,
        this.customDir,
        this.sourceLanguage.OriginalCode,
        sourceSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs,
        targetPrefix);
    this.spTarget = MarianHelper.PreprocessLanguage(
        this.customTarget,
        this.customDir,
        this.targetLanguage.OriginalCode,
        targetSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs);

    //concatenate the out-of-domain validation set with the in-domain validation set
    ParallelFilePair tatoebaValidFileInfos = HelperFunctions.GetTatoebaFileInfos(
        this.sourceLanguage.ShortestIsoCode,
        this.targetLanguage.ShortestIsoCode);
    if (tatoebaValidFileInfos == null)
    {
        //No out-of-domain set was returned, fall back to a generated dummy set
        tatoebaValidFileInfos = HelperFunctions.GenerateDummyOODValidSet(this.customDir);
    }

    ParallelFilePair combinedValid = new ParallelFilePair(
        tatoebaValidFileInfos,
        new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget),
        this.customDir.FullName,
        OpusCatMTEngineSettings.Default.OODValidSetSize);

    this.spValidSource = MarianHelper.PreprocessLanguage(
        combinedValid.Source,
        this.customDir,
        this.sourceLanguage.OriginalCode,
        sourceSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs,
        targetPrefix);
    this.spValidTarget = MarianHelper.PreprocessLanguage(
        combinedValid.Target,
        this.customDir,
        this.targetLanguage.OriginalCode,
        targetSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs);
}
/// <summary>
/// Writes the input lines to a file, segments them unless they are already preprocessed,
/// and launches <c>TranslateBatchSentencePiece.bat</c> on the result.
/// </summary>
/// <param name="input">Source lines to translate.</param>
/// <param name="spOutput">
/// Output file; the batch script is pointed at this path with <c>.transandalign</c> appended,
/// and both files are handed to <c>BatchProcess_Exited</c> when the process exits.
/// </param>
/// <param name="preprocessedInput">
/// When <c>true</c> the written file is translated as is; otherwise it is first passed
/// through <c>MarianHelper.PreprocessLanguage</c> with the model's source segmentation model.
/// </param>
/// <param name="storeTranslations">
/// When <c>true</c>, a handler calling <c>WriteToTranslationDb</c> is added to <c>OutputReady</c>.
/// </param>
/// <returns>The process object returned by <c>MarianHelper.StartProcessInBackgroundWithRedirects</c>.</returns>
internal Process BatchTranslate(
    IEnumerable<string> input,
    FileInfo spOutput,
    Boolean preprocessedInput = false,
    Boolean storeTranslations = false)
{
    Log.Information($"Starting batch translator for model {this.SystemName}.");

    var srcFile = MarianHelper.LinesToFile(input, this.SourceCode);

    FileInfo spInput;
    if (!preprocessedInput)
    {
        //Exact file name match: the dot is escaped and the pattern anchored so that names
        //merely resembling "source.spm"/"source.bpe" cannot make Single() throw.
        FileInfo sourceSegModel = this.modelDir.GetFiles()
            .Single(x => Regex.IsMatch(x.Name, @"^source\.(spm|bpe)$"));

        //The input is source-language text segmented with the source model, so it is
        //preprocessed with the source language code (the target code was passed here before).
        spInput = MarianHelper.PreprocessLanguage(
            srcFile,
            new DirectoryInfo(Path.GetTempPath()),
            this.SourceCode,
            sourceSegModel,
            this.includePlaceholderTags,
            this.includeTagPairs);
    }
    else
    {
        spInput = srcFile;
    }

    //TODO: check the translation cache for translations beforehand, and only translate new
    //segments (also change translation cache to account for different decoder configs for
    //same systems, i.e. keep track of decoder settings)

    FileInfo transAndAlign = new FileInfo($"{spOutput.FullName}.transandalign");
    var args = $"\"{this.modelDir.FullName}\" \"{spInput.FullName}\" \"{transAndAlign.FullName}\" --log-level=info --quiet";

    if (storeTranslations)
    {
        //NOTE(review): a new handler is added on every call and is not removed in this method
        this.OutputReady += (x, y) => this.WriteToTranslationDb(x, y, input, spInput, transAndAlign);
    }

    EventHandler exitHandler = (x, y) => BatchProcess_Exited(transAndAlign, spOutput, x, y);

    var cmd = "TranslateBatchSentencePiece.bat";
    var batchProcess = MarianHelper.StartProcessInBackgroundWithRedirects(cmd, args, exitHandler);
    return batchProcess;
}
/// <summary>
/// Aligns a segmented parallel corpus in two steps: <c>Alignment\align.py</c> is run to
/// produce <c>.fwd</c> and <c>.rev</c> files next to <paramref name="alignmentFile"/>, and
/// <c>Alignment\atools.exe</c> is then run with <c>-c grow-diag-final</c> on those two files,
/// with its output redirected to <paramref name="alignmentFile"/>. Each step blocks until
/// its process has exited.
/// </summary>
/// <param name="spSource">Segmented source side of the corpus.</param>
/// <param name="spTarget">Segmented target side of the corpus.</param>
/// <param name="alignmentFile">Path of the final alignment file.</param>
/// <param name="priorsFile">Not used by this method.</param>
internal static void GenerateAlignments(FileInfo spSource, FileInfo spTarget, FileInfo alignmentFile, FileInfo priorsFile)
{
    string forwardPath = $"{alignmentFile.FullName}.fwd";
    string reversePath = $"{alignmentFile.FullName}.rev";

    //Step 1: forward and reverse alignments
    string alignArgs = $"-s \"{spSource.FullName}\" -t \"{spTarget.FullName}\" -f \"{forwardPath}\" -r \"{reversePath}\"";
    Log.Information($"Aligning fine-tuning corpus with args {alignArgs}");
    Process aligner = MarianHelper.StartProcessInBackgroundWithRedirects("python Alignment\\align.py", alignArgs);
    aligner.WaitForExit();

    //Step 2: combine both directions into the final alignment file
    string symmetryArgs = $"-c grow-diag-final -i \"{forwardPath}\" -j \"{reversePath}\" > \"{alignmentFile.FullName}\"";
    Log.Information($"Symmetrisizing alignment with args {symmetryArgs}");
    Process symmetrizer = MarianHelper.StartProcessInBackgroundWithRedirects("Alignment\\atools.exe", symmetryArgs);
    symmetrizer.WaitForExit();
}
/// <summary>
/// Preprocesses the fine-tuning corpus and the combined validation set with the
/// <c>source.spm</c> / <c>target.spm</c> models from the customization directory, storing
/// the results in <c>spSource</c>, <c>spTarget</c>, <c>spValidSource</c> and <c>spValidTarget</c>.
/// </summary>
private void PreprocessInput()
{
    FileInfo sourceModel = this.customDir.GetFiles("source.spm").Single();
    FileInfo targetModel = this.customDir.GetFiles("target.spm").Single();

    //All four files go through the same call; only file, language code and model differ
    Func<FileInfo, string, FileInfo, FileInfo> segment =
        (file, code, model) => MarianHelper.PreprocessLanguage(
            file,
            this.customDir,
            code,
            model,
            this.includePlaceholderTags,
            this.includeTagPairs);

    this.spSource = segment(this.customSource, this.sourceCode, sourceModel);
    this.spTarget = segment(this.customTarget, this.targetCode, targetModel);

    //The out-of-domain validation set is concatenated with the in-domain validation set
    ParallelFilePair outOfDomainValid = HelperFunctions.GetTatoebaFileInfos(this.sourceCode, this.targetCode);
    var inDomainValid = new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget);
    var mergedValid = new ParallelFilePair(
        outOfDomainValid,
        inDomainValid,
        this.customDir.FullName,
        OpusCatMTEngineSettings.Default.OODValidSetSize);

    this.spValidSource = segment(mergedValid.Source, this.sourceCode, sourceModel);
    this.spValidTarget = segment(mergedValid.Target, this.targetCode, targetModel);
}
/// <summary>
/// Loads the training config stored in the customization directory into the training log
/// and launches <c>marian.exe</c> with that config file.
/// </summary>
/// <returns>The process object returned by <c>MarianHelper.StartProcessInBackgroundWithRedirects</c>.</returns>
private Process StartTraining()
{
    string trainerConfigPath = Path.Combine(
        this.customDir.FullName,
        OpusCatMTEngineSettings.Default.CustomizationBaseConfig);

    //Read the config back so that the training log carries the settings used for this run
    var yamlDeserializer = new Deserializer();
    MarianTrainerConfig loadedConfig;
    using (var configReader = new StreamReader(trainerConfigPath))
    {
        loadedConfig = yamlDeserializer.Deserialize<MarianTrainerConfig>(configReader);
    }
    this.trainingLog.TrainingConfig = loadedConfig;

    //Logging runs at info level; --log-level=warn and --quiet were earlier alternatives
    string marianArgs = $"--config \"{trainerConfigPath}\" --log-level=info";
    string marianExe = Path.Combine(OpusCatMTEngineSettings.Default.MarianDir, "marian.exe");

    return MarianHelper.StartProcessInBackgroundWithRedirects(
        marianExe,
        marianArgs,
        this.MarianExitHandler,
        this.MarianProgressHandler);
}
/// <summary>
/// Runs the fine-tuning workflow: copies the base model into a new directory, stores the
/// optional post-customization batch, copies and preprocesses the training files, optionally
/// generates alignments, runs an initial evaluation on the validation set, writes the Marian
/// training config and finally starts training. Progress is reported through
/// <c>OnProgressChanged</c> before each stage.
/// </summary>
/// <returns>
/// The process returned by <c>StartTraining</c>, or <c>null</c> when copying the model
/// directory or saving the model config threw (the failure is logged).
/// </returns>
public Process Customize()
{
    this.OnProgressChanged(new ProgressChangedEventArgs(1, new MarianCustomizationStatus(CustomizationStep.Copying_model, null)));

    //First copy the model to new dir
    try
    {
        this.CopyModelDir(this.modelDir, this.customLabel);
        //Save model config as soon as the model dir exists
        this.customModel.SaveModelConfig();
    }
    catch (Exception ex)
    {
        Log.Information($"Customization failed: {ex.Message}");
        return null;
    }

    //Save the batch to translate after customization to a file (to be batch translated after successful exit)
    if (this.postCustomizationBatch != null && this.postCustomizationBatch.Count > 0)
    {
        FileInfo postCustomizationBatchFile = new FileInfo(
            Path.Combine(this.customDir.FullName, OpusCatMTEngineSettings.Default.PostFinetuneBatchName));
        using (var writer = postCustomizationBatchFile.CreateText())
        {
            foreach (var sourceString in this.postCustomizationBatch)
            {
                writer.WriteLine(sourceString);
            }
        }
    }

    this.OnProgressChanged(new ProgressChangedEventArgs(2, new MarianCustomizationStatus(CustomizationStep.Copying_training_files, null)));

    //Copy raw files to model dir
    this.customSource = this.customSource.CopyTo(Path.Combine(this.customDir.FullName, "custom.source"));
    this.customTarget = this.customTarget.CopyTo(Path.Combine(this.customDir.FullName, "custom.target"));

    this.OnProgressChanged(new ProgressChangedEventArgs(3, new MarianCustomizationStatus(CustomizationStep.Preprocessing_training_files, null)));

    //Preprocess input files
    this.PreprocessInput();

    var decoderYaml = this.customDir.GetFiles("decoder.yml").Single();
    var deserializer = new Deserializer();

    //The reader is disposed right after deserialization so that decoder.yml is not left open
    MarianDecoderConfig decoderSettings;
    using (var decoderReader = decoderYaml.OpenText())
    {
        decoderSettings = deserializer.Deserialize<MarianDecoderConfig>(decoderReader);
    }

    if (this.guidedAlignment)
    {
        //Generate alignments for fine-tuning corpus
        this.alignmentFile = new FileInfo(Path.Combine(this.customDir.FullName, "custom.alignments"));
        MarianHelper.GenerateAlignments(this.spSource, this.spTarget, this.alignmentFile, this.model.AlignmentPriorsFile);

        //Generate alignments for validation set (for evaluating fine-tuning effect on alignment)
        this.validAlignmentFile = new FileInfo(Path.Combine(this.customDir.FullName, "combined.alignments"));
        MarianHelper.GenerateAlignments(this.spValidSource, this.spValidTarget, this.validAlignmentFile, this.model.AlignmentPriorsFile);
    }

    this.OnProgressChanged(new ProgressChangedEventArgs(4, new MarianCustomizationStatus(CustomizationStep.Initial_evaluation, null)));

    //Do the initial evaluation
    var initialValidProcess = this.model.TranslateAndEvaluate(
        this.spValidSource,
        new FileInfo(Path.Combine(this.customDir.FullName, "valid.0.txt")),
        this.spValidTarget,
        OpusCatMTEngineSettings.Default.OODValidSetSize,
        this.sourceLanguage,
        this.targetLanguage,
        true);

    //Wait for the initial valid to finish before starting customization
    //(TODO: make sure this is not done on UI thread)
    initialValidProcess.WaitForExit();

    this.OnProgressChanged(new ProgressChangedEventArgs(6, new MarianCustomizationStatus(CustomizationStep.Finetuning, null)));

    //Use the initial translation time as basis for estimating the duration of validation file
    //translation
    this.trainingLog.EstimatedTranslationDuration =
        Convert.ToInt32((initialValidProcess.ExitTime - initialValidProcess.StartTime).TotalSeconds);

    MarianTrainerConfig trainingConfig;

    var baseCustomizeYmlPath = HelperFunctions.GetLocalAppDataPath(
        OpusCatMTEngineSettings.Default.CustomizationBaseConfig);
    var processDir = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);

    //Make sure there's a customization file: copy the one next to the executable if missing
    if (!File.Exists(baseCustomizeYmlPath))
    {
        File.Copy(
            Path.Combine(processDir, OpusCatMTEngineSettings.Default.CustomizationBaseConfig),
            baseCustomizeYmlPath);
    }

    //deserialize yaml file
    using (var reader = new StreamReader(baseCustomizeYmlPath))
    {
        trainingConfig = deserializer.Deserialize<MarianTrainerConfig>(reader);
    }

    trainingConfig.trainSets = new List<string>
    {
        this.spSource.FullName,
        this.spTarget.FullName
    };

    trainingConfig.ValidSets = new List<string>
    {
        this.spValidSource.FullName,
        this.spValidTarget.FullName
    };

    //NOTE(review): the first vocab entry is used for both sides; presumably the models
    //share one vocabulary - confirm that vocabs[1] is not intended for the second entry.
    trainingConfig.vocabs = new List<string>
    {
        Path.Combine(this.customDir.FullName, decoderSettings.vocabs[0]),
        Path.Combine(this.customDir.FullName, decoderSettings.vocabs[0])
    };

    //Pick the validation script matching the segmentation method; other methods get none
    string validScriptName = null;
    switch (this.segmentationMethod)
    {
        case ".bpe":
            validScriptName = "ValidateBpe.bat";
            break;
        case ".spm":
            validScriptName = "ValidateSp.bat";
            break;
        default:
            break;
    }

    if (validScriptName != null)
    {
        string validScriptPath = Path.Combine(this.customDir.FullName, validScriptName);
        trainingConfig.validScriptPath = $"\"{validScriptPath}\"";
        File.Copy(Path.Combine(processDir, validScriptName), validScriptPath);
    }

    trainingConfig.validScriptArgs = new List<string>
    {
        $"{this.spValidTarget.FullName}",
        $"OOD{OpusCatMTEngineSettings.Default.OODValidSetSize.ToString()}"
    };
    trainingConfig.validTranslationOutput = Path.Combine(this.customDir.FullName, "valid.{U}.txt");

    if (this.guidedAlignment)
    {
        trainingConfig.guidedAlignment = this.alignmentFile.FullName;
    }

    trainingConfig.validLog = Path.Combine(this.customDir.FullName, "valid.log");
    trainingConfig.log = Path.Combine(this.customDir.FullName, "train.log");
    trainingConfig.model = Path.Combine(this.customDir.FullName, decoderSettings.models.Single());

    //Null-valued settings are omitted from the written config
    var builder = new SerializerBuilder();
    builder.ConfigureDefaultValuesHandling(DefaultValuesHandling.OmitNull);
    var serializer = builder.Build();

    var configPath = Path.Combine(this.customDir.FullName, OpusCatMTEngineSettings.Default.CustomizationBaseConfig);
    using (var writer = File.CreateText(configPath))
    {
        serializer.Serialize(writer, trainingConfig, typeof(MarianTrainerConfig));
    }

    Process trainProcess = this.StartTraining();
    return trainProcess;
}
/// <summary>
/// Runs every line of a text file through <c>MarianHelper.PreprocessLine</c>, segments the
/// result with the tool matching the segmentation model's extension, and optionally
/// prepends a <c>&gt;&gt;lang&lt;&lt;</c> target language token to every segmented line.
/// </summary>
/// <param name="languageFile">Raw text file to process.</param>
/// <param name="directory">Directory that receives all intermediate and output files.</param>
/// <param name="languageCode">
/// Language code forwarded to <c>MarianHelper.PreprocessLine</c> and, for <c>.bpe</c> models,
/// to <c>StartMosesBpePreprocessPipe.bat</c>.
/// </param>
/// <param name="segmentationModel">Segmentation model; its extension must be <c>.spm</c> or <c>.bpe</c>.</param>
/// <param name="includePlaceholderTags">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
/// <param name="includeTagPairs">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
/// <param name="targetLanguageToPrefix">
/// When not <c>null</c>, each segmented line is written to a <c>prefix_</c> file with
/// <c>&gt;&gt;{targetLanguageToPrefix}&lt;&lt; </c> prepended, and that file is returned.
/// </param>
/// <returns>The <c>seg_</c> file, or the <c>prefix_</c> file when a target prefix was requested.</returns>
/// <exception cref="NotSupportedException">
/// The segmentation model has an extension other than <c>.spm</c> or <c>.bpe</c>.
/// </exception>
internal static FileInfo PreprocessLanguage(
    FileInfo languageFile,
    DirectoryInfo directory,
    string languageCode,
    FileInfo segmentationModel,
    bool includePlaceholderTags,
    bool includeTagPairs,
    string targetLanguageToPrefix = null)
{
    var preprocessedFile = new FileInfo(Path.Combine(directory.FullName, $"preprocessed_{languageFile.Name}"));

    using (var rawFile = languageFile.OpenText())
    using (var preprocessedWriter = new StreamWriter(preprocessedFile.FullName))
    {
        String line;
        while ((line = rawFile.ReadLine()) != null)
        {
            var preprocessedLine = MarianHelper.PreprocessLine(line, languageCode, includePlaceholderTags, includeTagPairs);
            preprocessedWriter.WriteLine(preprocessedLine);
        }
    }

    //Marian doesn't like spaces in names
    var segmentedFile = new FileInfo(Path.Combine(directory.FullName, $"seg_{languageFile.Name.Replace(" ", "_")}"));

    switch (segmentationModel.Extension)
    {
        case ".spm":
            var spArgs = $"\"{preprocessedFile.FullName}\" --model \"{segmentationModel.FullName}\" --output \"{segmentedFile.FullName}\"";
            var segmentationProcess = MarianHelper.StartProcessInBackgroundWithRedirects("Preprocessing\\spm_encode.exe", spArgs);
            segmentationProcess.WaitForExit();
            break;
        case ".bpe":
            //Truecasing is not used in any models, so this is a dummy tc model (empty). So it does not
            //matter if source.tcmodel is used for target language.
            var tcModelPath = Path.Combine(directory.FullName, "source.tcmodel");

            //The input and output paths are quoted like the other paths on this command line:
            //the preprocessed file keeps any spaces of the original file name, and the
            //directory itself may contain spaces as well.
            var mosesProcess = MarianHelper.StartProcessInBackgroundWithRedirects(
                $"type \"{preprocessedFile.FullName}\" | Preprocessing\\StartMosesBpePreprocessPipe.bat {languageCode} \"{tcModelPath}\" \"{segmentationModel.FullName}\" > \"{segmentedFile.FullName}\"");
            mosesProcess.WaitForExit();
            break;
        default:
            //Derived from Exception, so existing catch (Exception) handlers still apply
            throw new NotSupportedException("No segmentation model found");
    }

    if (targetLanguageToPrefix == null)
    {
        return segmentedFile;
    }

    var segmentedWithTargetPrefix = new FileInfo(Path.Combine(directory.FullName, $"prefix_{languageFile.Name.Replace(" ", "_")}"));

    using (var segFile = segmentedFile.OpenText())
    using (var prefixWriter = new StreamWriter(segmentedWithTargetPrefix.FullName))
    {
        String line;
        while ((line = segFile.ReadLine()) != null)
        {
            var prefixedLine = $">>{targetLanguageToPrefix}<< {line}";
            prefixWriter.WriteLine(prefixedLine);
        }
    }

    return segmentedWithTargetPrefix;
}