/// <summary>
/// Preprocesses a text file line by line with <c>MarianHelper.PreprocessLine</c> and then
/// runs the preprocessed file through <c>spm_encode.exe</c> with the given model file.
/// </summary>
/// <param name="languageFile">Text file to preprocess; it is read one line at a time.</param>
/// <param name="directory">Directory in which both output files are created.</param>
/// <param name="languageCode">Passed unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
/// <param name="spmModel">File handed to <c>spm_encode.exe</c> via <c>--model</c>.</param>
/// <param name="includePlaceholderTags">Passed unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
/// <param name="includeTagPairs">Passed unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
/// <returns>
/// The <c>sp_</c>-prefixed file that <c>spm_encode.exe</c> was asked to write.
/// Neither the exit code nor the existence of the file is checked here.
/// </returns>
internal static FileInfo PreprocessLanguage(
    FileInfo languageFile,
    DirectoryInfo directory,
    string languageCode,
    FileInfo spmModel,
    bool includePlaceholderTags,
    bool includeTagPairs)
{
    // Both outputs live in the target directory and are named after the input file.
    var preprocessedFile = new FileInfo(Path.Combine(directory.FullName, $"preprocessed_{languageFile.Name}"));
    var spFile = new FileInfo(Path.Combine(directory.FullName, $"sp_{languageFile.Name}"));

    using (var rawFile = languageFile.OpenText())
    using (var preprocessedWriter = new StreamWriter(preprocessedFile.FullName))
    {
        string line;
        while ((line = rawFile.ReadLine()) != null)
        {
            var preprocessedLine = MarianHelper.PreprocessLine(line, languageCode, includePlaceholderTags, includeTagPairs);
            preprocessedWriter.WriteLine(preprocessedLine);
        }
    }

    // NOTE(review): the paths are interpolated without quoting, so a path containing
    // spaces may be split into several arguments - confirm how the helper builds its
    // command line before changing this.
    var spArgs = $"{preprocessedFile.FullName} --model {spmModel.FullName} --output {spFile.FullName}";

    // Block until encoding has finished, then release the process handle
    // (the original never disposed the Process object).
    using (var spmProcess = MarianHelper.StartProcessInBackgroundWithRedirects("spm_encode.exe", spArgs))
    {
        spmProcess.WaitForExit();
    }

    return spFile;
}
/// <summary>
/// Runs the customization training and validation files through
/// <c>MarianHelper.PreprocessLanguage</c> and stores the four resulting files in
/// <c>spSource</c>, <c>spTarget</c>, <c>spValidSource</c> and <c>spValidTarget</c>.
/// </summary>
/// <remarks>
/// Exactly one <c>source.spm</c> and one <c>target.spm</c> must be found in
/// <c>customDir</c>; <c>Single()</c> throws otherwise.
/// </remarks>
private void PreprocessInput()
{
    var sourceModel = this.customDir.GetFiles("source.spm").Single();
    var targetModel = this.customDir.GetFiles("target.spm").Single();

    // All four calls share the output directory and the tag options;
    // only the input file, language code and model differ.
    Func<FileInfo, string, FileInfo, FileInfo> encode = (file, code, model) =>
        MarianHelper.PreprocessLanguage(
            file,
            this.customDir,
            code,
            model,
            this.includePlaceholderTags,
            this.includeTagPairs);

    // Training pair.
    this.spSource = encode(this.customSource, this.sourceCode, sourceModel);
    this.spTarget = encode(this.customTarget, this.targetCode, targetModel);

    // Validation pair.
    this.spValidSource = encode(this.validationSource, this.sourceCode, sourceModel);
    this.spValidTarget = encode(this.validationTarget, this.targetCode, targetModel);
}
/// <summary>
/// Writes the input sentences to a preprocessed temporary file and starts
/// <c>TranslateBatchSentencePiece.bat</c> on it in the background.
/// </summary>
/// <param name="input">Sentences to translate, one per list element.</param>
/// <returns>
/// The process returned by <c>MarianHelper.StartProcessInBackgroundWithRedirects</c>;
/// this method does not wait for it to finish. <c>BatchProcess_Exited</c> is
/// subscribed to its <c>Exited</c> event.
/// </returns>
internal Process BatchTranslate(List<string> input)
{
    Log.Information($"Starting batch translator for model {this.SystemName}.");

    var cmd = "TranslateBatchSentencePiece.bat";

    FileInfo spInput = this.PreprocessInput(input);

    // Derive the output path by swapping only the trailing ".<source>" suffix for
    // ".<target>". A plain string.Replace over the full path would also rewrite any
    // directory component that happens to contain ".<source>" (for example a user
    // profile folder named "john.english" when the source code is "en").
    var sourceSuffix = $".{SourceCode}";
    var targetSuffix = $".{TargetCode}";
    string inputPath = spInput.FullName;
    string outputPath = inputPath.EndsWith(sourceSuffix, StringComparison.Ordinal)
        ? inputPath.Substring(0, inputPath.Length - sourceSuffix.Length) + targetSuffix
        : inputPath.Replace(sourceSuffix, targetSuffix); // fallback: previous behaviour
    FileInfo spOutput = new FileInfo(outputPath);

    var args = $"{this.modelDir.FullName} {spInput.FullName} {spOutput.FullName} --log-level=info --quiet";

    var batchProcess = MarianHelper.StartProcessInBackgroundWithRedirects(cmd, args);

    // NOTE(review): the handler is attached after the helper returns; if the helper has
    // already started the process, a very fast exit could precede this subscription - confirm.
    batchProcess.Exited += (x, y) => BatchProcess_Exited(input, spOutput, x, y);

    return batchProcess;
}
/// <summary>
/// Writes the given sentences to a uniquely named file in the system temp directory
/// and runs that file through <c>MarianHelper.PreprocessLanguage</c> using the
/// model directory's <c>source.spm</c>.
/// </summary>
/// <param name="input">Sentences to write, one per line.</param>
/// <returns>The file returned by <c>MarianHelper.PreprocessLanguage</c>.</returns>
private FileInfo PreprocessInput(List<string> input)
{
    // A fresh GUID keeps file names from colliding; the extension is the source language code.
    var uniqueName = Guid.NewGuid();
    var tempDirectory = Path.GetTempPath();
    var rawInputFile = new FileInfo(Path.Combine(tempDirectory, $"{uniqueName}.{this.SourceCode}"));

    using (var writer = new StreamWriter(rawInputFile.FullName, true, Encoding.UTF8))
    {
        for (var index = 0; index < input.Count; index++)
        {
            writer.WriteLine(input[index]);
        }
    }

    var sourceModel = this.modelDir.GetFiles("source.spm").Single();

    // The preprocessed output is placed in the temp directory as well.
    return MarianHelper.PreprocessLanguage(
        rawInputFile,
        new DirectoryInfo(tempDirectory),
        this.SourceCode,
        sourceModel,
        this.includePlaceholderTags,
        this.includeTagPairs);
}
/// <summary>
/// Copies the model directory, preprocesses the customization input files, writes a
/// <c>customize.yml</c> training configuration into <c>customDir</c> and starts
/// <c>marian.exe</c> with it in the background.
/// </summary>
/// <param name="exitHandler">
/// Handler subscribed to the training process's <c>Exited</c> event; may be <c>null</c>,
/// in which case nothing is subscribed.
/// </param>
/// <remarks>
/// If copying the model directory throws, the failure is logged and the method returns
/// without starting training. The method does not wait for the training process.
/// </remarks>
public void Customize(EventHandler exitHandler)
{
    //First copy the model to new dir
    try
    {
        this.CopyModelDir(this.modelDir, this.customLabel);
    }
    catch (Exception ex)
    {
        Log.Information($"Customization failed: {ex.Message}");
        return;
    }

    //Preprocess input files
    this.PreprocessInput();

    var decoderYaml = this.customDir.GetFiles("decoder.yml").Single();
    var deserializer = new Deserializer();

    // Dispose the reader so decoder.yml is not left open (the original leaked it).
    MarianDecoderConfig decoderSettings;
    using (var decoderReader = decoderYaml.OpenText())
    {
        decoderSettings = deserializer.Deserialize<MarianDecoderConfig>(decoderReader);
    }

    MarianTrainerConfig trainingConfig;
    using (var reader = new StreamReader(FiskmoMTEngineSettings.Default.CustomizationBaseConfig))
    {
        trainingConfig = deserializer.Deserialize<MarianTrainerConfig>(reader);
    }

    // Point the base configuration at the preprocessed files and the copied model directory.
    trainingConfig.TrainSets = new List<string>
    {
        spSource.FullName,
        spTarget.FullName
    };

    trainingConfig.ValidSets = new List<string>
    {
        spValidSource.FullName,
        spValidTarget.FullName
    };

    // NOTE(review): both entries use vocabs[0]; confirm that the same vocabulary file
    // is intended for both sides rather than vocabs[0] and vocabs[1].
    trainingConfig.vocabs = new List<string>
    {
        Path.Combine(this.customDir.FullName, decoderSettings.vocabs[0]),
        Path.Combine(this.customDir.FullName, decoderSettings.vocabs[0])
    };

    trainingConfig.validLog = Path.Combine(this.customDir.FullName, "valid.log");
    trainingConfig.log = Path.Combine(this.customDir.FullName, "train.log");

    // Single() throws unless the decoder config lists exactly one model.
    trainingConfig.model = Path.Combine(this.customDir.FullName, decoderSettings.models.Single());

    var serializer = new Serializer();
    var configPath = Path.Combine(this.customDir.FullName, "customize.yml");
    using (var writer = File.CreateText(configPath))
    {
        serializer.Serialize(writer, trainingConfig, typeof(MarianTrainerConfig));
    }

    var trainingArgs = $"--config {configPath} --log-level=info --quiet";

    var trainProcess = MarianHelper.StartProcessInBackgroundWithRedirects("marian.exe", trainingArgs);

    if (exitHandler != null)
    {
        trainProcess.Exited += exitHandler;
    }
}